示例#1
0
async def test_mars_with_xgboost(ray_large_cluster, create_cluster):
    """Train and predict with xgboost_ray on a Mars DataFrame.

    The breast-cancer dataset is loaded into a Mars DataFrame, exported as a
    Ray ``MLDataset`` with four shards, and used to train a binary classifier
    with one xgboost_ray actor per shard.  The trained booster is saved to a
    temporary directory to check that serialization works, and is finally
    used for a distributed prediction over the training data.

    Args:
        ray_large_cluster: Fixture requested for its setup only; it is not
            referenced in the body.
        create_cluster: Fixture providing the Mars cluster (``session`` and
            ``address`` attributes are read here).
    """
    import gc
    import tempfile

    from xgboost_ray import RayDMatrix, RayParams, train, predict
    from sklearn.datasets import load_breast_cancer

    assert create_cluster.session
    session = new_session(address=create_cluster.address,
                          backend="oscar",
                          default=True)
    with session:
        train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True)
        df: md.DataFrame = md.concat(
            [md.DataFrame(train_x),
             md.DataFrame(train_y)], axis=1)
        df.execute()

        num_shards = 4
        ds = mdd.to_ray_mldataset(df, num_shards)
        assert isinstance(ds, ml_dataset.MLDataset)

        # Force a collection before training: the MLDataset is expected to
        # keep the Mars dataframe alive, so the steps below must still work.
        gc.collect()

        # train
        train_set = RayDMatrix(ds, "target")
        evals_result = {}
        bst = train(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            },
            train_set,
            evals_result=evals_result,
            evals=[(train_set, "train")],
            verbose_eval=False,
            ray_params=RayParams(
                num_actors=num_shards,  # Number of remote actors
                cpus_per_actor=1,
            ),
        )

        # Save into a throwaway directory so the test never overwrites or
        # deletes a "model.xgb" that already exists in the working directory,
        # and so the file is cleaned up even if an assertion fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, "model.xgb")
            bst.save_model(model_path)
            assert os.path.exists(model_path)

        print("Final training error: {:.4f}".format(
            evals_result["train"]["error"][-1]))
        predict(bst, train_set, ray_params=RayParams(num_actors=2))
示例#2
0
def readme_predict():
    """Print distributed predictions from the booster stored in ``model.xgb``.

    The breast-cancer dataset is wrapped in a ``RayDMatrix`` and scored with
    ``xgboost_ray.predict`` using two actors; the result is printed.
    """
    from xgboost_ray import RayDMatrix, RayParams, predict
    from sklearn.datasets import load_breast_cancer
    import xgboost as xgb

    features, targets = load_breast_cancer(return_X_y=True)
    prediction_matrix = RayDMatrix(features, targets)

    booster = xgb.Booster(model_file="model.xgb")
    two_actors = RayParams(num_actors=2)

    print(predict(booster, prediction_matrix, ray_params=two_actors))
示例#3
0
    def _testJointTraining(self,
                           sharding=RayShardingMode.INTERLEAVED,
                           softprob=False):
        """Train on sharded data with two actors and expect exact labels back.

        Although the data is split across actors, the resulting trees should
        be combined into a single model that reproduces ``self.y``.  The
        model is checked through plain XGBoost prediction, through
        distributed prediction, and once more after retraining on an odd
        number of rows.

        Args:
            sharding: Sharding mode passed to every ``RayDMatrix``.
            softprob: When true, train with the ``multi:softprob`` objective
                and reduce each prediction row with ``argmax``.
        """
        params = self.params.copy()
        if softprob:
            params["objective"] = "multi:softprob"

        def fit(features, labels):
            # Distributed training over two actors.
            return train(params,
                         RayDMatrix(features, labels, sharding=sharding),
                         ray_params=RayParams(num_actors=2))

        def ray_predict(model, features):
            # Distributed prediction over two actors.
            return predict(model,
                           RayDMatrix(features, sharding=sharding),
                           ray_params=RayParams(num_actors=2))

        def as_labels(raw):
            # With softprob the output is reduced along axis 1 first.
            if softprob:
                raw = np.argmax(raw, axis=1)
            return raw.astype(int)

        booster = fit(self.x, self.y)

        local_labels = as_labels(booster.predict(xgb.DMatrix(self.x)))
        self.assertSequenceEqual(list(self.y), list(local_labels))

        ray_labels = as_labels(ray_predict(booster, self.x))
        self.assertSequenceEqual(list(self.y), list(ray_labels))

        # try on an odd number of rows
        booster = fit(self.x[:-1], self.y[:-1])

        odd_labels = as_labels(ray_predict(booster, self.x[:-1]))
        self.assertSequenceEqual(list(self.y[:-1]), list(odd_labels))
示例#4
0
def main():
    """Check that distributed prediction equals plain XGBoost prediction.

    Loads the booster from ``simple.xgb``, predicts on the breast-cancer
    dataset with both XGBoost and xgboost_ray, asserts that the two result
    arrays are equal, and prints the distributed result.
    """
    # Load dataset
    features, targets = datasets.load_breast_cancer(return_X_y=True)

    local_matrix = xgb.DMatrix(features, targets)
    ray_matrix = RayDMatrix(features, targets)

    booster = xgb.Booster(model_file="simple.xgb")

    local_pred = booster.predict(local_matrix)
    ray_pred = predict(booster, ray_matrix)

    np.testing.assert_array_equal(local_pred, ray_pred)
    print(ray_pred)
示例#5
0
def main():
    """Check that distributed prediction equals plain XGBoost prediction.

    Loads the booster from ``simple.xgb`` in the current working directory,
    predicts on the breast-cancer dataset with both XGBoost and xgboost_ray,
    asserts that the two result arrays are equal, and prints the distributed
    result.

    Raises:
        ValueError: If ``simple.xgb`` does not exist.
    """
    model_path = "simple.xgb"
    if not os.path.exists(model_path):
        raise ValueError(
            f"Model file not found: `{model_path}`"
            "\nFIX THIS by running `python simple.py` first to "
            "train the model.")

    # Load dataset
    data, labels = datasets.load_breast_cancer(return_X_y=True)

    dmat_xgb = xgb.DMatrix(data, labels)
    dmat_ray = RayDMatrix(data, labels)

    bst = xgb.Booster(model_file=model_path)

    pred_xgb = bst.predict(dmat_xgb)
    pred_ray = predict(bst, dmat_ray)

    np.testing.assert_array_equal(pred_xgb, pred_ray)
    print(pred_ray)
示例#6
0
    def testTrainPredict(self,
                         init=True,
                         remote=None,
                         softprob=False,
                         **ray_param_dict):
        """Train with evaluation and predict.

        Args:
            init: When true, call ``ray.init(num_cpus=2, num_gpus=0)`` first.
            remote: Forwarded as ``_remote`` to both ``train`` and
                ``predict``.
            softprob: When true, train a copy of ``self.params`` with the
                ``multi:softprob`` objective and reduce the predictions with
                ``argmax`` before comparing them to ``self.y``.
            **ray_param_dict: Extra keyword arguments for the ``RayParams``
                used in training and prediction (``num_actors`` is fixed
                to 2).
        """
        if init:
            ray.init(num_cpus=2, num_gpus=0)

        dtrain = RayDMatrix(self.x, self.y)

        params = self.params
        if softprob:
            # Copy so the shared parameter dict is not modified.
            params = params.copy()
            params["objective"] = "multi:softprob"

        num_boost_round = 38
        evals_result = {}
        bst = train(params,
                    dtrain,
                    num_boost_round=num_boost_round,
                    ray_params=RayParams(num_actors=2, **ray_param_dict),
                    evals=[(dtrain, "dtrain")],
                    evals_result=evals_result,
                    _remote=remote)

        self.assertEqual(get_num_trees(bst), num_boost_round)

        # assertIn reports the missing key and the dict contents on failure.
        self.assertIn("dtrain", evals_result)

        x_mat = RayDMatrix(self.x)
        pred_y = predict(bst,
                         x_mat,
                         ray_params=RayParams(num_actors=2, **ray_param_dict),
                         _remote=remote)

        if softprob:
            # Expect one output column per distinct label in self.y.
            self.assertEqual(pred_y.shape[1], len(np.unique(self.y)))
            pred_y = np.argmax(pred_y, axis=1)

        self.assertSequenceEqual(list(self.y), list(pred_y))
示例#7
0
# Call the training helper (train_xgboost is defined outside this excerpt)
# with the training and validation frames, the label column, and the
# per-actor CPU count and actor count chosen for training, then print the
# returned evaluation results.
bst, evals_result = train_xgboost(
    config,
    df_train,
    df_validation,
    LABEL_COLUMN,
    RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),
)
print(f"Results: {evals_result}")
# -

# ## Prediction
#
# With the model trained, we can now predict on unseen data. For the
# purposes of this example, we will use the same dataset for prediction as
# for training.
#
# Since prediction is naively parallelizable, distributing it over multiple
# actors can measurably reduce the amount of time needed.

# +
# Wrap the full frame for inference. The label column and "partition" are
# passed in `ignore` (presumably so they are not used as features; confirm
# against the RayDMatrix documentation).
inference_df = RayDMatrix(df, ignore=[LABEL_COLUMN, "partition"])
# Prediction uses its own actor and CPU settings, separate from the ones
# used for training above.
results = predict(
    bst,
    inference_df,
    ray_params=RayParams(cpus_per_actor=cpus_per_actor_inference,
                         num_actors=num_actors_inference),
)

print(results)