async def test_mars_with_xgboost(ray_large_cluster, create_cluster):
    """Train and predict with xgboost_ray on a Mars dataframe exported to Ray.

    The breast-cancer dataset is loaded into a Mars dataframe, converted to a
    Ray ``MLDataset`` with four shards, used to train a binary classifier,
    and finally fed back into ``predict``. The saved model file is removed
    again once its existence has been verified.
    """
    from xgboost_ray import RayDMatrix, RayParams, train, predict
    from sklearn.datasets import load_breast_cancer

    assert create_cluster.session
    mars_session = new_session(
        address=create_cluster.address, backend="oscar", default=True)
    with mars_session:
        features, target = load_breast_cancer(return_X_y=True, as_frame=True)
        frames = [md.DataFrame(features), md.DataFrame(target)]
        combined: md.DataFrame = md.concat(frames, axis=1)
        combined.execute()

        num_shards = 4
        dataset = mdd.to_ray_mldataset(combined, num_shards)
        assert isinstance(dataset, ml_dataset.MLDataset)

        # Force a collection pass: the MLDataset has to keep the Mars
        # dataframe alive on its own so that it is not garbage collected.
        import gc
        gc.collect()

        # Training: one remote actor per shard, one CPU per actor.
        train_matrix = RayDMatrix(dataset, "target")
        xgb_params = {
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        }
        evals_result = {}
        training_ray_params = RayParams(
            num_actors=num_shards,  # Number of remote actors
            cpus_per_actor=1,
        )
        booster = train(
            xgb_params,
            train_matrix,
            evals_result=evals_result,
            evals=[(train_matrix, "train")],
            verbose_eval=False,
            ray_params=training_ray_params,
        )

        # Round-trip the model through the filesystem, then clean up.
        booster.save_model("model.xgb")
        assert os.path.exists("model.xgb")
        os.remove("model.xgb")

        final_error = evals_result["train"]["error"][-1]
        print("Final training error: {:.4f}".format(final_error))

        predict(booster, train_matrix, ray_params=RayParams(num_actors=2))
def readme_predict():
    """Load the saved ``model.xgb`` booster and print distributed predictions.

    Predictions for the breast-cancer dataset are computed with two
    xgboost_ray actors.
    """
    from xgboost_ray import RayDMatrix, RayParams, predict
    from sklearn.datasets import load_breast_cancer
    import xgboost as xgb

    features, targets = load_breast_cancer(return_X_y=True)
    prediction_matrix = RayDMatrix(features, targets)

    booster = xgb.Booster(model_file="model.xgb")
    two_actor_params = RayParams(num_actors=2)
    predictions = predict(
        booster, prediction_matrix, ray_params=two_actor_params)

    print(predictions)
def _testJointTraining(self,
                       sharding=RayShardingMode.INTERLEAVED,
                       softprob=False):
    """Train on data sharded over two Ray actors and verify the joint model.

    Although every actor only sees part of the data, the combined trees are
    expected to reproduce the labels exactly. This is checked with a local
    xgboost prediction, with a distributed prediction, and once more after
    retraining on an odd number of rows.
    """

    def as_labels(raw_pred):
        # In softprob mode the predictions are reduced to class ids by
        # taking the argmax over axis 1 before casting to int.
        if softprob:
            raw_pred = np.argmax(raw_pred, axis=1)
        return raw_pred.astype(int)

    train_params = self.params.copy()
    if softprob:
        train_params["objective"] = "multi:softprob"

    full_train = RayDMatrix(self.x, self.y, sharding=sharding)
    bst = train(train_params, full_train, ray_params=RayParams(num_actors=2))

    # Local prediction through plain xgboost.
    local_matrix = xgb.DMatrix(self.x)
    local_labels = as_labels(bst.predict(local_matrix))
    self.assertSequenceEqual(list(self.y), list(local_labels))

    # Distributed prediction through xgboost_ray.
    ray_matrix = RayDMatrix(self.x, sharding=sharding)
    ray_labels = as_labels(
        predict(bst, ray_matrix, ray_params=RayParams(num_actors=2)))
    self.assertSequenceEqual(list(self.y), list(ray_labels))

    # try on an odd number of rows
    odd_train = RayDMatrix(self.x[:-1], self.y[:-1], sharding=sharding)
    bst = train(train_params, odd_train, ray_params=RayParams(num_actors=2))

    odd_matrix = RayDMatrix(self.x[:-1], sharding=sharding)
    odd_labels = as_labels(
        predict(bst, odd_matrix, ray_params=RayParams(num_actors=2)))
    self.assertSequenceEqual(list(self.y[:-1]), list(odd_labels))
def main():
    """Check that distributed prediction matches plain xgboost prediction.

    The booster stored in ``simple.xgb`` predicts on the breast-cancer
    dataset once through xgboost and once through xgboost_ray; both results
    must be identical. The Ray predictions are printed afterwards.
    """
    # Load dataset
    features, targets = datasets.load_breast_cancer(return_X_y=True)

    local_matrix = xgb.DMatrix(features, targets)
    ray_matrix = RayDMatrix(features, targets)

    booster = xgb.Booster(model_file="simple.xgb")

    expected = booster.predict(local_matrix)
    actual = predict(booster, ray_matrix)

    np.testing.assert_array_equal(expected, actual)
    print(actual)
def main():
    """Check that distributed prediction matches plain xgboost prediction.

    The booster stored in ``simple.xgb`` (looked up relative to the current
    working directory) predicts on the breast-cancer dataset once through
    xgboost and once through xgboost_ray. Both prediction arrays must be
    identical; the Ray predictions are printed afterwards.

    Raises:
        ValueError: If ``simple.xgb`` does not exist.
        AssertionError: If the two prediction arrays differ.
    """
    # Fail early with an actionable hint instead of letting the model
    # loader fail on a missing file.
    if not os.path.exists("simple.xgb"):
        raise ValueError(
            "Model file not found: `simple.xgb`"
            "\nFIX THIS by running `python simple.py` first to "
            "train the model.")

    # Load dataset
    data, labels = datasets.load_breast_cancer(return_X_y=True)

    dmat_xgb = xgb.DMatrix(data, labels)
    dmat_ray = RayDMatrix(data, labels)

    bst = xgb.Booster(model_file="simple.xgb")

    pred_xgb = bst.predict(dmat_xgb)
    pred_ray = predict(bst, dmat_ray)

    np.testing.assert_array_equal(pred_xgb, pred_ray)
    print(pred_ray)
def testTrainPredict(self,
                     init=True,
                     remote=None,
                     softprob=False,
                     **ray_param_dict):
    """Train with evaluation and predict.

    Args:
        init: When true, start Ray with two CPUs and no GPUs before training.
        remote: Forwarded unchanged as ``_remote`` to ``train`` and
            ``predict``.
        softprob: When true, train with the ``multi:softprob`` objective and
            reduce the predicted matrix to class ids before comparing.
        **ray_param_dict: Extra keyword arguments forwarded to every
            ``RayParams`` created here.
    """
    if init:
        ray.init(num_cpus=2, num_gpus=0)

    dtrain = RayDMatrix(self.x, self.y)

    params = self.params
    if softprob:
        # Copy first so the shared fixture parameters stay untouched.
        params = params.copy()
        params["objective"] = "multi:softprob"

    evals_result = {}
    bst = train(
        params,
        dtrain,
        num_boost_round=38,
        ray_params=RayParams(num_actors=2, **ray_param_dict),
        evals=[(dtrain, "dtrain")],
        evals_result=evals_result,
        _remote=remote)

    self.assertEqual(get_num_trees(bst), 38)
    # assertIn reports the actual dict contents when the key is missing.
    self.assertIn("dtrain", evals_result)

    x_mat = RayDMatrix(self.x)
    pred_y = predict(
        bst,
        x_mat,
        ray_params=RayParams(num_actors=2, **ray_param_dict),
        _remote=remote)

    if softprob:
        # One column per distinct label is expected; pick the most likely.
        self.assertEqual(pred_y.shape[1], len(np.unique(self.y)))
        pred_y = np.argmax(pred_y, axis=1)

    self.assertSequenceEqual(list(self.y), list(pred_y))
# Run the training helper with the configured actor resources and print the
# evaluation results it returns alongside the booster.
bst, evals_result = train_xgboost(
    config,
    df_train,
    df_validation,
    LABEL_COLUMN,
    RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),
)
print(f"Results: {evals_result}")
# -

# ## Prediction
#
# With the model trained, we can now predict on unseen data. For the
# purposes of this example, we will use the same dataset for prediction as
# for training.
#
# Since prediction is naively parallelizable, distributing it over multiple
# actors can measurably reduce the amount of time needed.

# +
# NOTE(review): `ignore` presumably excludes the label and "partition"
# columns from the feature set - confirm against the RayDMatrix docs.
inference_df = RayDMatrix(df, ignore=[LABEL_COLUMN, "partition"])
# Inference uses its own actor settings (the `*_inference` variables),
# separate from the ones passed to training above.
results = predict(
    bst,
    inference_df,
    ray_params=RayParams(cpus_per_actor=cpus_per_actor_inference,
                         num_actors=num_actors_inference),
)

print(results)