Exemplo n.º 1
0
async def test_mars_with_xgboost_sklearn_reg(ray_large_cluster,
                                             create_cluster):
    """Fit and query a ``RayXGBRegressor`` on a Ray dataset built from Mars.

    A synthetic regression problem is loaded into Mars dataframes, converted
    into a Ray dataset with four shards, and used to fit the regressor.
    Predictions are then requested both from the Ray dataset and from a
    plain pandas dataframe; the test only checks that these calls succeed.
    """
    import gc

    from xgboost_ray import RayDMatrix, RayParams, RayXGBRegressor
    from sklearn.datasets import make_regression

    assert create_cluster.session
    mars_session = new_session(address=create_cluster.address,
                               backend="oscar",
                               default=True)
    with mars_session:
        features, targets = make_regression(n_samples=10_000, n_features=10)
        X = md.DataFrame(features)
        y = md.DataFrame({"target": targets})
        df: md.DataFrame = md.concat(
            [md.DataFrame(X), md.DataFrame(y)], axis=1)
        df.execute()

        shard_count = 4
        ds = md.to_ray_dataset(df, shard_count)
        assert isinstance(ds, ray_dataset.Dataset)

        # Run a collection pass before training. Per the original note this
        # is meant to confirm the dataset keeps the Mars dataframe alive.
        gc.collect()

        actor_config = RayParams(num_actors=2, cpus_per_actor=1)
        reg = RayXGBRegressor(ray_params=actor_config, random_state=42)
        # Train on the Ray dataset; the label is the "target" column.
        reg.fit(RayDMatrix(ds, "target"), y=None, ray_params=actor_config)
        # Predict from the distributed dataset and from local pandas data.
        reg.predict(RayDMatrix(ds, "target"))
        reg.predict(pd.DataFrame(features))
Exemplo n.º 2
0
    def testTrainPredict(self, init=True, remote=None, **ray_param_dict):
        """Train with an evaluation set, then predict on the training data.

        Args:
            init: When true, start a local Ray instance with two CPUs first.
            remote: Forwarded as ``_remote`` to ``train`` and ``predict``.
            **ray_param_dict: Extra keyword arguments for ``RayParams``.
        """
        if init:
            ray.init(num_cpus=2, num_gpus=0)

        boost_rounds = 38
        train_matrix = RayDMatrix(self.x, self.y)
        eval_history = {}

        booster = train(
            self.params,
            train_matrix,
            num_boost_round=boost_rounds,
            ray_params=RayParams(num_actors=2, **ray_param_dict),
            evals=[(train_matrix, "dtrain")],
            evals_result=eval_history,
            _remote=remote)

        # One tree per boosting round is expected.
        self.assertEqual(get_num_trees(booster), boost_rounds)

        # The evaluation set must have been recorded under its name.
        self.assertTrue("dtrain" in eval_history)

        features_only = RayDMatrix(self.x)
        predictions = predict(
            booster,
            features_only,
            ray_params=RayParams(num_actors=2, **ray_param_dict),
            _remote=remote)
        self.assertSequenceEqual(list(self.y), list(predictions))
Exemplo n.º 3
0
def train_breast_cancer(config, ray_params):
    """Train on the breast cancer dataset and save the model as tuned.xgb.

    Args:
        config: XGBoost parameters passed to ``train`` as ``params``.
        ray_params: Passed through to ``train`` as ``ray_params``.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Hold out a quarter of the samples for evaluation.
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.25)

    dtrain = RayDMatrix(train_x, train_y)
    dtest = RayDMatrix(test_x, test_y)

    history = {}
    booster = train(
        params=config,
        dtrain=dtrain,
        evals=[(dtest, "eval")],
        evals_result=history,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10)

    booster.save_model("tuned.xgb")

    # Report the "error" metric from the last boosting round.
    final_error = history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))
Exemplo n.º 4
0
def train_breast_cancer(config, cpus_per_actor=1, num_actors=1):
    """Train on the breast cancer dataset and save the model as simple.xgb.

    Args:
        config: XGBoost parameters passed to ``train`` as ``params``.
        cpus_per_actor: Forwarded to ``train`` under the same name.
        num_actors: Forwarded to ``train`` under the same name.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Hold out a quarter of the samples for evaluation.
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.25)

    dtrain = RayDMatrix(train_x, train_y)
    dtest = RayDMatrix(test_x, test_y)

    history = {}
    booster = train(
        params=config,
        dtrain=dtrain,
        evals=[(dtest, "eval")],
        evals_result=history,
        max_actor_restarts=1,
        checkpoint_path="/tmp/checkpoint/",
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
        verbose_eval=False,
        num_boost_round=10)

    booster.save_model("simple.xgb")

    # Report the "error" metric from the last boosting round.
    final_error = history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))
Exemplo n.º 5
0
def main():
    """Train XGBoost on HIGGS data loaded through Dask on Ray.

    Connects to Ray via the ``anyscale://`` client address, reads the CSV at
    ``FILE_URL`` into a dask dataframe, splits it on ``feature-01`` into a
    training and a validation partition, and runs 100 boosting rounds on four
    actors. With ``args.smoke_test`` set, only the first 1000 rows are used.
    """
    ray.client("anyscale://").connect()

    print("Loading HIGGS data.")

    dask.config.set(scheduler=ray_dask_get)
    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]
    data = dd.read_csv(FILE_URL, names=colnames)
    if args.smoke_test:
        # head() computes to a pandas DataFrame by default, which has no
        # persist() method; compute=False keeps the result a dask dataframe
        # so the persist() calls below also work in smoke-test mode.
        data = data.head(n=1000, compute=False)

    print("Loaded HIGGS data.")

    # partition on a column
    df_train = data[(data["feature-01"] < 0.4)]
    df_train = df_train.persist()
    df_validation = data[(data["feature-01"] >= 0.4)
                         & (data["feature-01"] < 0.8)]
    df_validation = df_validation.persist()

    dtrain = RayDMatrix(df_train, label="label", columns=colnames)
    dvalidation = RayDMatrix(df_validation, label="label")

    evallist = [(dvalidation, "eval")]
    evals_result = {}
    config = {"tree_method": "hist", "eval_metric": ["logloss", "error"]}
    train(params=config,
          dtrain=dtrain,
          evals_result=evals_result,
          ray_params=RayParams(max_actor_restarts=1,
                               num_actors=4,
                               cpus_per_actor=2),
          num_boost_round=100,
          evals=evallist)
Exemplo n.º 6
0
def main():
    """Train a binary classifier on breast cancer data and save it.

    The model is written to ``simple.xgb`` and the validation "error" of the
    last boosting round is printed.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Hold out a quarter of the samples for evaluation.
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.25)

    dtrain = RayDMatrix(train_x, train_y)
    dtest = RayDMatrix(test_x, test_y)

    xgb_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": 3,
    }

    history = {}

    booster = train(
        xgb_params,
        dtrain,
        evals=[(dtest, "eval")],
        evals_result=history,
        max_actor_restarts=1,
        verbose_eval=False)

    booster.save_model("simple.xgb")

    final_error = history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))
Exemplo n.º 7
0
    def testCheckpointContinuationValidity(self):
        """Check that checkpoints round-trip and can seed a second run.

        The first run trains two rounds and collects the checkpoints returned
        by the callback. The second run starts from the last of these and
        trains four more rounds, collecting checkpoints both before and after
        each iteration.
        """
        # --- First run: collect checkpoints through the callback returns.
        first_results = {}
        first_booster = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[
                _checkpoint_callback(frequency=1, before_iteration_=False)
            ],
            num_boost_round=2,
            ray_params=RayParams(num_actors=2),
            additional_results=first_results)
        first_returns = first_results["callback_returns"]
        final_ckpt_rank0 = first_returns[0][-1]
        final_ckpt_rank1 = first_returns[1][-1]

        # Both ranks must report the same checkpoint, and loading it has to
        # reproduce the trained booster exactly.
        restored = xgb.Booster()
        restored.load_model(final_ckpt_rank0)
        self.assertEqual(final_ckpt_rank0, final_ckpt_rank1)
        self.assertEqual(final_ckpt_rank0, restored.save_raw())
        self.assertEqual(first_booster.save_raw(), restored.save_raw())

        # --- Second run: continue from the restored model.
        second_results = {}
        second_booster = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[
                _checkpoint_callback(frequency=1, before_iteration_=True),
                _checkpoint_callback(frequency=1, before_iteration_=False)
            ],
            num_boost_round=4,
            ray_params=RayParams(num_actors=2),
            additional_results=second_results,
            xgb_model=restored)
        second_returns = second_results["callback_returns"]
        initial_ckpt_rank0 = second_returns[0][0]
        initial_ckpt_rank1 = second_returns[1][0]
        closing_ckpt_rank0 = second_returns[0][-1]
        closing_ckpt_rank1 = second_returns[1][-1]

        initial_booster = xgb.Booster()
        initial_booster.load_model(initial_ckpt_rank0)

        closing_booster = xgb.Booster()
        closing_booster.load_model(closing_ckpt_rank0)

        # The ranks agree with each other and with the returned booster.
        self.assertEqual(initial_ckpt_rank0, initial_ckpt_rank1)
        self.assertEqual(closing_ckpt_rank0, closing_ckpt_rank1)
        self.assertEqual(second_booster.save_raw(),
                         closing_booster.save_raw())

        # No training has happened yet at the first checkpoint, so it must
        # equal the model the run started from.
        self.assertEqual(final_ckpt_rank0, initial_booster.save_raw())

        # Training has progressed by the last checkpoint, so the trees must
        # differ from the starting point.
        self.assertNotEqual(initial_booster.save_raw(),
                            closing_booster.save_raw())
Exemplo n.º 8
0
def main():
    """Train XGBoost on HIGGS data read with pandas.

    Reads the reduced file when ``args.smoke_test`` is set and the full one
    otherwise, splits the rows on ``feature-01`` and runs 100 boosting rounds
    on four actors with the validation partition as evaluation set.
    """
    print("Loading HIGGS data.")

    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]

    source_uri = SIMPLE_HIGGS_S3_URI if args.smoke_test else HIGGS_S3_URI
    data = pd.read_csv(source_uri, names=colnames)

    print("Loaded HIGGS data.")

    # Rows below 0.4 are used for training, rows in [0.4, 0.8) to validate.
    split_column = data["feature-01"]
    df_train = data[split_column < 0.4]
    df_validation = data[(split_column >= 0.4) & (split_column < 0.8)]

    dtrain = RayDMatrix(df_train, label="label", columns=colnames)
    dvalidation = RayDMatrix(df_validation, label="label")

    history = {}
    xgb_params = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    actor_config = RayParams(
        max_actor_restarts=1, num_actors=4, cpus_per_actor=2)
    train(
        params=xgb_params,
        dtrain=dtrain,
        evals_result=history,
        ray_params=actor_config,
        num_boost_round=100,
        evals=[(dvalidation, "eval")])
Exemplo n.º 9
0
def train_xgboost(config, train_df, test_df, target_column, ray_params):
    """Train for 100 rounds, save the model to model.xgb and report results.

    Args:
        config: XGBoost parameters passed to ``train`` as ``params``.
        train_df: Training data; ``target_column`` is used as its label.
        test_df: Evaluation data; ``target_column`` is used as its label.
        target_column: Label passed to both ``RayDMatrix`` constructors.
        ray_params: Passed through to ``train`` as ``ray_params``.

    Returns:
        A ``(booster, evals_result)`` tuple.
    """
    dtrain = RayDMatrix(train_df, target_column)
    dtest = RayDMatrix(test_df, target_column)

    history = {}

    started_at = time.time()
    booster = train(
        params=config,
        dtrain=dtrain,
        evals=[(dtest, "eval")],
        evals_result=history,
        verbose_eval=False,
        num_boost_round=100,
        ray_params=ray_params,
    )
    train_duration = time.time() - started_at
    print(f"Total time taken: {train_duration} seconds.")

    booster.save_model("model.xgb")

    final_error = history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))

    return booster, history
Exemplo n.º 10
0
    def testDetectDistributed(self):
        """Check which inputs ``RayDMatrix`` reports as distributed.

        A single parquet file and lists of files are expected to be
        distributed, a single CSV file is not. The MLDataset case only runs
        when ``ray.util.data`` can be imported.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            parquet_file = os.path.join(tmp_dir, "file.parquet")
            csv_file = os.path.join(tmp_dir, "file.csv")

            frame = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
            frame["label"] = pd.Series(self.y)

            frame.to_parquet(parquet_file)
            frame.to_csv(csv_file)

            single_parquet = RayDMatrix(parquet_file, lazy=True)
            self.assertTrue(single_parquet.distributed)

            # Single CSV files should not be distributed
            single_csv = RayDMatrix(csv_file, lazy=True)
            self.assertFalse(single_csv.distributed)

            many_parquet = RayDMatrix([parquet_file] * 3, lazy=True)
            self.assertTrue(many_parquet.distributed)

            many_csv = RayDMatrix([csv_file] * 3, lazy=True)
            self.assertTrue(many_csv.distributed)

            try:
                from ray.util import data as ml_data
                shards = ml_data.read_parquet(parquet_file, num_shards=1)
                from_dataset = RayDMatrix(shards, lazy=True)
                self.assertTrue(from_dataset.distributed)
            except ImportError:
                print("MLDataset not available in current Ray version. "
                      "Skipping part of test.")
Exemplo n.º 11
0
    def testColumnOrdering(self):
        """Dropping the label column must keep the other columns in order."""
        column_names = list(map(str, range(50)))
        label_name = column_names[-1]

        # One row of random values is enough to inspect the column layout.
        values = np.random.randn(1, len(column_names))
        frame = pd.DataFrame(values, columns=column_names)

        matrix = RayDMatrix(frame, label=label_name, num_actors=1)
        loaded = matrix.get_data(0)["data"]

        assert loaded.columns.tolist() == column_names[:-1]
Exemplo n.º 12
0
    def _testMatrixCreation(self, in_x, in_y, **kwargs):
        """Check that a ``RayDMatrix`` hands back the reference data.

        The matrix is built from ``in_x``/``in_y`` (with batch sharding
        unless the caller picked a mode) and read back once for one actor
        and once for two actors. In both cases the result has to be close
        to ``self.x``/``self.y``.
        """
        kwargs.setdefault("sharding", RayShardingMode.BATCH)

        def _unpack(shard):
            """Return ``(data, label)``, merging lists of frames into one."""
            features, labels = shard["data"], shard["label"]
            if isinstance(features, list):
                features = concat_dataframes(features)
            if isinstance(labels, list):
                labels = concat_dataframes(labels)
            return features, labels

        # Single actor: rank 0 is expected to see the complete data.
        matrix = RayDMatrix(in_x, in_y, **kwargs)
        x, y = _unpack(matrix.get_data(rank=0, num_actors=1))

        self.assertTrue(np.allclose(self.x, x))
        self.assertTrue(np.allclose(self.y, y))

        # Two actors: the two ranks together must yield the complete data.
        matrix = RayDMatrix(in_x, in_y, **kwargs)

        x1, y1 = _unpack(matrix.get_data(rank=0, num_actors=2))

        # Drop the loaded data before requesting the second rank.
        matrix.unload_data()

        x2, y2 = _unpack(matrix.get_data(rank=1, num_actors=2))

        combined_x = concat_dataframes([x1, x2])
        combined_y = concat_dataframes([y1, y2])
        self.assertTrue(np.allclose(self.x, combined_x))
        self.assertTrue(np.allclose(self.y, combined_y))
Exemplo n.º 13
0
    def _testMatrixCreation(self, in_x, in_y, **kwargs):
        """Check that a single-actor ``RayDMatrix`` returns the input data.

        The data and label loaded for rank 0 are compared against
        ``self.x`` and ``self.y``; lists of frames are concatenated first.
        """
        matrix = RayDMatrix(in_x, in_y, **kwargs)
        shard = matrix.get_data(rank=0, num_actors=1)

        features, labels = shard["data"], shard["label"]
        if isinstance(features, list):
            features = concat_dataframes(features)
        if isinstance(labels, list):
            labels = concat_dataframes(labels)

        self.assertTrue(np.allclose(self.x, features))
        self.assertTrue(np.allclose(self.y, labels))
Exemplo n.º 14
0
    def testCustomObjectiveFunction(self):
        """Check that a custom objective behaves the same on Ray.

        The same custom objective is trained once with plain XGBoost and once
        with XGBoost on Ray; the rounded predictions of both models have to
        match each other and the labels.
        """
        self._init_ray()

        # The built-in objective is removed so only the custom one applies.
        custom_params = self.params.copy()
        custom_params.pop("objective", None)

        local_booster = xgb.train(
            custom_params, xgb.DMatrix(self.x, self.y), obj=squared_log)

        ray_booster = train(
            custom_params,
            RayDMatrix(self.x, self.y),
            ray_params=RayParams(num_actors=2),
            obj=squared_log,
            **self.kwargs)

        features = xgb.DMatrix(self.x)
        local_pred = np.round(local_booster.predict(features))
        ray_pred = np.round(ray_booster.predict(features))

        self.assertSequenceEqual(list(local_pred), list(ray_pred))
        self.assertSequenceEqual(list(self.y), list(ray_pred))
Exemplo n.º 15
0
def main(fname, num_actors=2):
    """Train ten rounds on the data file ``fname`` and save test_data.xgb.

    Args:
        fname: Path to the data file; it is made absolute before loading.
        num_actors: Number of actors requested through ``RayParams``.
    """
    data_path = os.path.abspath(fname)
    dtrain = RayDMatrix(data_path, label="labels", ignore=["partition"])

    xgb_params = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    history = {}

    start = time.time()
    booster = train(
        xgb_params,
        dtrain,
        evals_result=history,
        ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors),
        num_boost_round=10,
        evals=[(dtrain, "train")])
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    booster.save_model("test_data.xgb")

    final_error = history["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))
Exemplo n.º 16
0
def main():
    """Train, tune and predict on HIGGS data loaded through Dask on Ray.

    The CSV at ``FILE_URL`` is split on ``feature-01`` into a training and a
    validation partition. These are handed to ``train_xgboost`` and
    ``tune_xgboost``, and the training partition is finally used for
    distributed prediction.
    """
    print("Loading HIGGS data.")

    dask.config.set(scheduler=ray_dask_get)
    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]
    data = dd.read_csv(FILE_URL, names=colnames)

    print("Loaded HIGGS data.")

    # Rows below 0.4 are used for training, rows in [0.4, 0.8) to validate.
    is_train = data["feature-01"] < 0.4
    is_validation = (data["feature-01"] >= 0.4) & (data["feature-01"] < 0.8)
    df_train = data[is_train]
    df_validation = data[is_validation]

    xgb_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    training_resources = RayParams(
        max_actor_restarts=1, cpus_per_actor=4, num_actors=4)

    # NOTE(review): whatever train_xgboost returns is passed to predict()
    # unchanged; confirm it is a booster and not a tuple.
    bst = train_xgboost(
        xgb_params, df_train, df_validation, "label", training_resources)
    tune_xgboost(df_train, df_validation, "label")

    # Predict on the training rows with the columns sorted by name.
    ordered_columns = sorted(df_train.columns)
    inference_df = RayDMatrix(df_train[ordered_columns],
                              ignore=["label", "partition"])
    inference_resources = RayParams(cpus_per_actor=2, num_actors=16)
    predict(bst, inference_df, ray_params=inference_resources)
Exemplo n.º 17
0
def main():
    """Train 100 rounds on a local HIGGS.csv file and save higgs.xgb.

    Example adapted from this blog post:
    https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7

    This uses the HIGGS dataset. Download here:
    https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
    """
    colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)]
    data_path = os.path.abspath("HIGGS.csv")

    dtrain = RayDMatrix(data_path, label="label", names=colnames)

    xgb_params = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    history = {}

    start = time.time()
    booster = train(
        xgb_params,
        dtrain,
        evals_result=history,
        max_actor_restarts=1,
        num_boost_round=100,
        evals=[(dtrain, "train")])
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    booster.save_model("higgs.xgb")

    final_error = history["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))
Exemplo n.º 18
0
def main():
    """Train 100 rounds on parted.parquet and save test_data.xgb.

    Run `create_test_data.py` first to create fake data.
    """
    data_path = os.path.abspath("parted.parquet")

    dtrain = RayDMatrix(data_path, label="labels", ignore=["partition"])

    xgb_params = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }
    history = {}

    start = time.time()
    booster = train(
        xgb_params,
        dtrain,
        evals_result=history,
        max_actor_restarts=1,
        num_boost_round=100,
        evals=[(dtrain, "train")])
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    booster.save_model("test_data.xgb")

    final_error = history["train"]["error"][-1]
    print("Final training error: {:.4f}".format(final_error))
Exemplo n.º 19
0
    def testTrainingContinuationElasticMultiKilled(self):
        """Elastic training must still yield 20 trees after two kills.

        Kill callbacks are registered for actor rank 0 at iteration 6 and
        for actor rank 1 at iteration 14.
        """
        logging.getLogger().setLevel(logging.DEBUG)

        additional_results = {}

        kill_callbacks = [
            _kill_callback(self.die_lock_file,
                           fail_iteration=6,
                           actor_rank=0),
            _kill_callback(self.die_lock_file_2,
                           fail_iteration=14,
                           actor_rank=1),
        ]
        elastic_params = RayParams(max_actor_restarts=2,
                                   num_actors=2,
                                   elastic_training=True,
                                   max_failed_actors=2)

        booster = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=elastic_params,
            additional_results=additional_results)

        self.assertEqual(20, get_num_trees(booster))

        features = xgb.DMatrix(self.x)
        pred_y = booster.predict(features)
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")
Exemplo n.º 20
0
    def test_communication_colocation(self):
        """Checks that Queue and Event actors are colocated with the driver."""
        with self.ray_start_cluster() as cluster:
            for _ in range(2):
                cluster.add_node(num_cpus=3)
            cluster.wait_for_nodes()
            ray.init(address=cluster.address)

            driver_node = ray.state.current_node_id()

            # Note that these will have the same IP in the test cluster
            assert len(ray.state.node_ids()) == 2
            assert driver_node in ray.state.node_ids()

            def _mock_train(*args, _training_state, **kwargs):
                """Verify actor placement, then defer to the real ``_train``."""
                queue_actor = _training_state.queue.actor
                queue_node = ray.get(queue_actor.get_node_id.remote())
                assert queue_node == ray.state.current_node_id()

                event_actor = _training_state.stop_event.actor
                event_node = ray.get(event_actor.get_node_id.remote())
                assert event_node == ray.state.current_node_id()

                return _train(*args, _training_state=_training_state, **kwargs)

            # The mock that replaces _train routes every call through the
            # checking wrapper above.
            with patch("xgboost_ray.main._train", side_effect=_mock_train):
                train(
                    self.params,
                    RayDMatrix(self.x, self.y),
                    num_boost_round=2,
                    ray_params=RayParams(max_actor_restarts=1, num_actors=6))
Exemplo n.º 21
0
    def test_no_tune_spread(self):
        """Check that the two actors land on different nodes without Tune."""
        with self.ray_start_cluster() as cluster:
            for _ in range(2):
                cluster.add_node(num_cpus=2)
            cluster.wait_for_nodes()
            ray.init(address=cluster.address)

            ray_params = RayParams(
                max_actor_restarts=1, num_actors=2, cpus_per_actor=2)

            def _mock_train(*args, _training_state, **kwargs):
                """Run the real ``_train`` and inspect placement afterwards."""
                try:
                    return _train(
                        *args, _training_state=_training_state, **kwargs)
                finally:
                    # Runs whether or not _train raised.
                    actors = _training_state.actors
                    assert len(actors) == 2
                    if all(a is not None for a in actors):
                        actor_infos = ray.state.actors()
                        actor_nodes = [
                            actor_infos.get(
                                a._actor_id.hex())["Address"]["NodeID"]
                            for a in actors
                        ]
                        assert actor_nodes[0] != actor_nodes[1]

            with patch("xgboost_ray.main._train", _mock_train):
                train(
                    self.params,
                    RayDMatrix(self.x, self.y),
                    num_boost_round=4,
                    ray_params=ray_params)
Exemplo n.º 22
0
 def inner_func(config):
     """Run four boosting rounds with ``_train`` swapped for the mock.

     ``config`` is accepted but not used here.
     """
     with patch("xgboost_ray.main._train", _mock_train):
         dtrain = RayDMatrix(x, y)
         train(params, dtrain, num_boost_round=4, ray_params=ray_params)
Exemplo n.º 23
0
 def train_func(config):
     """Train on a single one-CPU actor using the settings in ``config``.

     ``config["xgb"]`` holds the XGBoost parameters and
     ``config["num_boost_round"]`` the number of boosting rounds.
     """
     xgb_params = config["xgb"]
     dtrain = RayDMatrix(x, y)
     train(
         xgb_params,
         dtrain=dtrain,
         cpus_per_actor=1,
         num_actors=1,
         num_boost_round=config["num_boost_round"])
Exemplo n.º 24
0
    def testTrainingContinuationKilled(self):
        """Training must complete all 20 rounds although one actor dies."""
        additional_results = {}
        captured = {}

        def keep(actors, *args, **kwargs):
            """Record the actors passed to ``_shutdown`` for inspection."""
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown") as mocked:
            mocked.side_effect = keep
            booster = train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[_kill_callback(self.die_lock_file)],
                num_boost_round=20,
                ray_params=RayParams(max_actor_restarts=1, num_actors=2),
                additional_results=additional_results)

        self.assertEqual(20, get_num_trees(booster))

        features = xgb.DMatrix(self.x)
        pred_y = booster.predict(features)
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        # Both actor slots must hold a working actor at shutdown time.
        final_actors = captured["actors"]
        self.assertTrue(final_actors[0])
        self.assertTrue(final_actors[1])

        # Two workers finished, so N=32
        self.assertEqual(additional_results["total_n"], 32)
Exemplo n.º 25
0
    def testFailPrintErrors(self):
        """Test that XGBoost training errors are propagated.

        Training with an invalid configuration has to raise a
        ``RuntimeError`` whose cause chain leads through ``RayActorError``
        and ``RayTaskError`` to the ``RayXGBoostTrainingError``.
        """
        x = np.random.uniform(0, 1, size=(100, 4))
        y = np.random.randint(0, 2, size=100)

        train_set = RayDMatrix(x, y)

        # assertRaises makes the test fail when no error is raised at all;
        # a plain try/except would pass silently in that case.
        with self.assertRaises(RuntimeError) as raised:
            train(
                {
                    "objective": "multi:softmax",
                    "num_class": 2,
                    "eval_metric": ["logloss", "error"]
                },  # This will error
                train_set,
                evals=[(train_set, "train")],
                ray_params=RayParams(num_actors=1, max_actor_restarts=0))

        exc = raised.exception

        actor_error = exc.__cause__
        self.assertTrue(actor_error)
        self.assertIsInstance(actor_error, RayActorError)

        task_error = actor_error.__cause__
        self.assertTrue(task_error)
        self.assertIsInstance(task_error, RayTaskError)

        # The task error carries the original exception in `.cause`.
        self.assertTrue(task_error.cause)
        self.assertIsInstance(task_error.cause, RayXGBoostTrainingError)

        self.assertIn("label and prediction size not match",
                      str(task_error))
Exemplo n.º 26
0
    def testRanking(self):
        """Train a pairwise ranking model and check its metrics never drop.

        Twenty rows in four query groups of five are trained for ten rounds
        on two actors; both "auc" and "aucpr" on the training set have to be
        non-decreasing from round to round.
        """
        Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
        Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
        X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)),
                       shape=(20, 4)).toarray()
        y = np.array([
            0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0,
            0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0
        ])

        qid = np.array([0] * 5 + [1] * 5 + [2] * 5 + [3] * 5)
        dtrain = RayDMatrix(X, label=y, qid=qid)

        params = {
            "eta": 1,
            "objective": "rank:pairwise",
            "eval_metric": ["auc", "aucpr"],
            "max_depth": 1
        }
        evals_result = {}
        train(params,
              dtrain,
              10,
              evals=[(dtrain, "train")],
              evals_result=evals_result,
              ray_params=RayParams(num_actors=2, max_actor_restarts=0))
        auc_rec = evals_result["train"]["auc"]
        self.assertTrue(all(p <= q for p, q in zip(auc_rec, auc_rec[1:])))
        auc_rec = evals_result["train"]["aucpr"]
        # all() is required here: a bare generator expression is always
        # truthy, so the assertion would pass whatever the metric values.
        self.assertTrue(all(p <= q for p, q in zip(auc_rec, auc_rec[1:])))
Exemplo n.º 27
0
    def testTrainingStopElastic(self):
        """Elastic training must abort with a ``RuntimeError``.

        Kills are scheduled for rank 0 at boost round 3 and for rank 1 at
        boost round 6, while the return of rank 0 is delayed for rounds
        4 to 5. ``max_failed_actors`` is set to 1.
        """
        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0"

        ft_manager = FaultToleranceManager.remote()

        ft_manager.schedule_kill.remote(rank=0, boost_round=3)
        ft_manager.schedule_kill.remote(rank=1, boost_round=6)
        ft_manager.delay_return.remote(
            rank=0, start_boost_round=4, end_boost_round=5)

        delay_callback = DelayedLoadingCallback(
            ft_manager, reload_data=True, sleep_time=0.1)
        die_callback = DieCallback(ft_manager, training_delay=0.25)

        elastic_params = RayParams(
            elastic_training=True,
            max_failed_actors=1,
            max_actor_restarts=1,
            num_actors=2,
            distributed_callbacks=[delay_callback])

        # The `train()` function raises a RuntimeError
        with self.assertRaises(RuntimeError):
            train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[die_callback],
                num_boost_round=20,
                ray_params=elastic_params)
Exemplo n.º 28
0
    def testSameObject(self):
        """A matrix passed twice to a remote task must compare equal there."""

        @ray.remote
        def _compare(first, second):
            return first == second

        matrix = RayDMatrix(self.x, self.y)
        outcome = ray.get(_compare.remote(matrix, matrix))
        self.assertTrue(outcome)
Exemplo n.º 29
0
def train_ray(num_workers, num_boost_rounds, num_files=0, use_gpu=False):
    """Benchmark distributed training on the parted.parquet data set.

    Args:
        num_workers: Number of actors used for loading and training.
        num_boost_rounds: Number of boosting rounds.
        num_files: When non-zero, train on this many parquet files found
            under the data path; the file list is repeated if it is shorter.
            When zero, the data path itself is passed to the matrix.
        use_gpu: Request one GPU per actor and the ``gpu_hist`` tree method.
            A ``RayDeviceQuantileDMatrix`` is used if ``cupy`` is importable.

    Returns:
        The wall-clock training time in seconds.

    Raises:
        FileNotFoundError: If ``num_files`` is set but no parquet file is
            found under the data path.
    """
    path = "/data/parted.parquet"

    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        if not files:
            # Doubling an empty list never reaches num_files, so the loop
            # below would spin forever; fail loudly instead.
            raise FileNotFoundError(
                f"No parquet files found under {path!r}; cannot select "
                f"{num_files} file(s).")
        # Repeat the file list until it is long enough to slice from.
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(path,
                                          num_actors=num_workers,
                                          label="labels",
                                          ignore=["partition"],
                                          filetype=RayFileType.PARQUET)
    else:
        dtrain = RayDMatrix(path,
                            num_actors=num_workers,
                            label="labels",
                            ignore=["partition"],
                            filetype=RayFileType.PARQUET)

    config = {
        "tree_method": "hist" if not use_gpu else "gpu_hist",
        "eval_metric": ["logloss", "error"],
    }

    start = time.time()
    evals_result = {}
    bst = train(config,
                dtrain,
                evals_result=evals_result,
                max_actor_restarts=2,
                num_boost_round=num_boost_rounds,
                num_actors=num_workers,
                cpus_per_actor=4,
                checkpoint_path="/tmp/checkpoint/",
                gpus_per_actor=0 if not use_gpu else 1,
                resources_per_actor={
                    "actor_cpus": 4,
                    "actor_gpus": 0 if not use_gpu else 1
                },
                evals=[(dtrain, "train")])
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
    return taken
Exemplo n.º 30
0
    def testSameResultWithAndWithoutError(self):
        """The final model must not depend on failures or restarts.

        Three ways of reaching ten boosting rounds are compared: one
        uninterrupted run, two runs of five rounds where the second starts
        from the first, and one run with a failure callback set for
        iteration 7.
        """
        # Reference: ten rounds without any failure.
        reference = train(
            self.params,
            RayDMatrix(self.x, self.y),
            num_boost_round=10,
            ray_params=RayParams(max_actor_restarts=0, num_actors=2))

        # Two halves of five rounds; the second continues the first.
        first_half = train(
            self.params,
            RayDMatrix(self.x, self.y),
            num_boost_round=5,
            ray_params=RayParams(max_actor_restarts=0, num_actors=2))

        second_half = train(
            self.params,
            RayDMatrix(self.x, self.y),
            num_boost_round=5,
            ray_params=RayParams(max_actor_restarts=0, num_actors=2),
            xgb_model=first_half)

        # Ten rounds with a failure injected and one restart allowed.
        res_error = {}
        recovered = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[_fail_callback(self.die_lock_file, fail_iteration=7)],
            num_boost_round=10,
            ray_params=RayParams(max_actor_restarts=1,
                                 num_actors=2,
                                 checkpoint_frequency=5),
            additional_results=res_error)

        flat_reference = flatten_obj({"tree": tree_obj(reference)})
        flat_recovered = flatten_obj({"tree": tree_obj(recovered)})
        flat_continued = flatten_obj({"tree": tree_obj(second_half)})

        for key, expected in flat_reference.items():
            self.assertAlmostEqual(expected, flat_recovered[key])
            self.assertAlmostEqual(expected, flat_continued[key])

        # We fail at iteration 7, but checkpoints are saved at iteration 5
        # Thus we have two additional returns here.
        rank0_returns = res_error["callback_returns"][0]
        print("Callback returns:", rank0_returns)
        self.assertEqual(len(rank0_returns), 10 + 2)