def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
    """Exercise GPU training and prediction on a cudf-backed dask DataFrame.

    A ``gpu_hist`` booster is trained for four rounds on a matrix built with
    ``DMatrixT``.  Predictions made from that matrix, in place from the
    frame, and directly from the frame are then compared with a single-node
    prediction produced by the same booster.

    Parameters
    ----------
    DMatrixT :
        Callable invoked as ``DMatrixT(client, X, y)`` to build the training
        matrix.
    client :
        Dask client handed to every ``dxgb`` call.
    """
    import cupy as cp

    cp.cuda.runtime.setDevice(0)

    array_X, array_y, _ = generate_array()
    # Wrap the dask arrays as dask DataFrames, then convert every partition
    # with ``cudf.from_pandas``.
    features = dd.from_dask_array(array_X)
    labels = dd.from_dask_array(array_y)
    features = features.map_partitions(cudf.from_pandas)
    labels = labels.map_partitions(cudf.from_pandas)

    train_matrix = DMatrixT(client, features, labels)
    params = {'tree_method': 'gpu_hist', 'debug_synchronize': True}
    result = dxgb.train(
        client,
        params,
        dtrain=train_matrix,
        evals=[(train_matrix, 'X')],
        num_boost_round=4,
    )

    booster = result['booster']
    assert isinstance(booster, dxgb.Booster)
    # One evaluation record per boosting round.
    assert len(result['history']['X']['rmse']) == 4

    # Predicting from the training matrix: computing it must yield a NumPy
    # array.
    matrix_predt = dxgb.predict(client, result, train_matrix)
    assert isinstance(matrix_predt.compute(), np.ndarray)

    inplace_predt = dxgb.inplace_predict(client, result, features)
    assert isinstance(inplace_predt, dd.Series)

    # Reference values: the same booster run on the fully computed frame.
    reference = booster.predict(xgboost.DMatrix(features.compute()))
    cp.testing.assert_allclose(reference, matrix_predt.compute())
    np.testing.assert_allclose(
        reference, inplace_predt.compute().to_array())

    frame_predt = dxgb.predict(client, result, features)
    assert isinstance(frame_predt, dd.Series)

    T = TypeVar('T')

    def is_df(part: T) -> T:
        assert isinstance(part, cudf.DataFrame), part
        return part

    # NOTE(review): the mapped collection is discarded without being
    # computed, so ``is_df`` may never actually run on a partition --
    # confirm that this is intended.
    frame_predt.map_partitions(
        is_df, meta=dd.utils.make_meta({'prediction': 'f4'}))

    cp.testing.assert_allclose(frame_predt.values.compute(), reference)

    # The predictions must slot back into the source frame as new columns
    # without leaving any null entries behind.
    features["predict"] = matrix_predt
    features["inplace_predict"] = inplace_predt

    any_null = features.isnull().values.any().compute()
    assert not bool(any_null)
def test_dask_dataframe(self):
    """Check GPU training and prediction on a cudf-backed dask DataFrame.

    Inside a local CUDA cluster a ``gpu_hist`` booster is trained for four
    rounds.  Predictions from the ``DaskDMatrix``, from in-place prediction
    and from the frame itself are each compared against a single-node
    prediction made by the same booster.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        import cupy as cp

        cp.cuda.runtime.setDevice(0)

        array_X, array_y = generate_array()
        # Wrap the dask arrays as dask DataFrames, then convert every
        # partition with ``cudf.from_pandas``.
        features = dd.from_dask_array(array_X)
        labels = dd.from_dask_array(array_y)
        features = features.map_partitions(cudf.from_pandas)
        labels = labels.map_partitions(cudf.from_pandas)

        train_matrix = dxgb.DaskDMatrix(client, features, labels)
        params = {
            'tree_method': 'gpu_hist',
            'debug_synchronize': True
        }
        result = dxgb.train(
            client,
            params,
            dtrain=train_matrix,
            evals=[(train_matrix, 'X')],
            num_boost_round=4,
        )

        booster = result['booster']
        assert isinstance(booster, dxgb.Booster)
        # One evaluation record per boosting round.
        assert len(result['history']['X']['rmse']) == 4

        matrix_predt = dxgb.predict(client, result, train_matrix).compute()
        assert isinstance(matrix_predt, np.ndarray)

        # In-place prediction on the frame yields a dask Series before it
        # is computed.
        lazy_inplace = dxgb.inplace_predict(client, result, features)
        assert isinstance(lazy_inplace, dd.Series)
        inplace_predt = lazy_inplace.compute()

        # Reference values: the same booster run on the computed frame.
        reference = booster.predict(xgboost.DMatrix(features.compute()))

        cp.testing.assert_allclose(reference, matrix_predt)
        np.testing.assert_allclose(reference, inplace_predt.to_array())

        frame_predt = dxgb.predict(client, result, features)
        assert isinstance(frame_predt, dd.Series)

        def is_df(part):
            assert isinstance(part, cudf.DataFrame), part
            return part

        # NOTE(review): the mapped collection is discarded without being
        # computed, so ``is_df`` may never actually run on a partition --
        # confirm that this is intended.
        frame_predt.map_partitions(
            is_df, meta=dd.utils.make_meta({'prediction': 'f4'}))

        cp.testing.assert_allclose(frame_predt.values.compute(), reference)
def run_with_dask_array(DMatrixT, client):
    """Exercise GPU training and prediction on a cupy-backed dask array.

    A ``gpu_hist`` booster is trained for two rounds on a matrix built with
    ``DMatrixT(client, X, y)``.  Predictions from that matrix and from
    in-place prediction are compared with a single-node prediction, and the
    in-place result is checked to live on the current CUDA device.
    """
    import cupy as cp

    cp.cuda.runtime.setDevice(0)

    features, labels = generate_array()
    # Convert every block of both arrays with ``cp.asarray``.
    features = features.map_blocks(cp.asarray)
    labels = labels.map_blocks(cp.asarray)

    train_matrix = DMatrixT(client, features, labels)
    params = {'tree_method': 'gpu_hist', 'debug_synchronize': True}
    result = dxgb.train(
        client,
        params,
        dtrain=train_matrix,
        evals=[(train_matrix, 'X')],
        num_boost_round=2,
    )

    matrix_predt = dxgb.predict(client, result, train_matrix).compute()
    inplace_predt = dxgb.inplace_predict(client, result, features).compute()

    # Reference values: the same booster run on the fully computed input.
    reference = result['booster'].predict(
        xgboost.DMatrix(features.compute()))
    np.testing.assert_allclose(reference, matrix_predt)

    current_device = cp.cuda.runtime.getDevice()
    assert current_device == inplace_predt.device.id

    # Copy the reference into a cupy array so both operands of the final
    # comparison are cupy arrays on the same device.
    reference_gpu = cp.array(reference)
    assert current_device == reference_gpu.device.id
    cp.testing.assert_allclose(reference_gpu, inplace_predt)
def test_dask_array(self):
    """Check GPU training and prediction on a cupy-backed dask array.

    Inside a local CUDA cluster a ``gpu_hist`` booster is trained for two
    rounds.  Predictions from the ``DaskDMatrix`` and from in-place
    prediction are compared with a single-node prediction, and the in-place
    result is checked to live on the current CUDA device.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        import cupy as cp

        cp.cuda.runtime.setDevice(0)

        features, labels = generate_array()
        # Convert every block of both arrays with ``cp.asarray``.
        features = features.map_blocks(cp.asarray)
        labels = labels.map_blocks(cp.asarray)

        train_matrix = dxgb.DaskDMatrix(client, features, labels)
        result = dxgb.train(
            client,
            {'tree_method': 'gpu_hist'},
            dtrain=train_matrix,
            evals=[(train_matrix, 'X')],
            num_boost_round=2,
        )

        matrix_predt = dxgb.predict(client, result, train_matrix).compute()
        inplace_predt = dxgb.inplace_predict(
            client, result, features).compute()

        # Reference values: the same booster run on the computed input.
        reference = result['booster'].predict(
            xgboost.DMatrix(features.compute()))
        np.testing.assert_allclose(reference, matrix_predt)

        current_device = cp.cuda.runtime.getDevice()
        assert current_device == inplace_predt.device.id

        # Copy the reference into a cupy array so both operands of the
        # final comparison are cupy arrays on the same device.
        reference_gpu = cp.array(reference)
        assert current_device == reference_gpu.device.id
        cp.testing.assert_allclose(reference_gpu, inplace_predt)
def test_dask_dataframe(self):
    """Check GPU training and prediction on a cudf-backed dask DataFrame.

    Inside a local CUDA cluster a ``gpu_hist`` booster is trained for two
    rounds.  Predictions from the ``DaskDMatrix`` and from in-place
    prediction are each compared against a single-node prediction made by
    the same booster.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        import cupy

        array_X, array_y = generate_array()
        # Wrap the dask arrays as dask DataFrames, then convert every
        # partition with ``cudf.from_pandas``.
        features = dd.from_dask_array(array_X)
        labels = dd.from_dask_array(array_y)
        features = features.map_partitions(cudf.from_pandas)
        labels = labels.map_partitions(cudf.from_pandas)

        train_matrix = dxgb.DaskDMatrix(client, features, labels)
        result = dxgb.train(
            client,
            {'tree_method': 'gpu_hist'},
            dtrain=train_matrix,
            evals=[(train_matrix, 'X')],
            num_boost_round=2,
        )

        booster = result['booster']
        assert isinstance(booster, dxgb.Booster)
        # One evaluation record per boosting round.
        assert len(result['history']['X']['rmse']) == 2

        matrix_predt = dxgb.predict(client, result, train_matrix).compute()
        assert isinstance(matrix_predt, np.ndarray)

        # In-place prediction on the frame yields a dask Series before it
        # is computed.
        lazy_inplace = dxgb.inplace_predict(client, result, features)
        assert isinstance(lazy_inplace, dd.Series)
        inplace_predt = lazy_inplace.compute()

        # Reference values: the same booster run on the computed frame.
        reference = booster.predict(xgboost.DMatrix(features.compute()))

        cupy.testing.assert_allclose(reference, matrix_predt)
        cupy.testing.assert_allclose(reference, inplace_predt)
def test_empty_dmatrix(self):
    """Train and predict with a one-row ``DaskDMatrix`` on a CUDA cluster.

    The first pass trains, evaluates and predicts on the same one-row
    matrix.  The second pass trains on a two-row matrix while evaluating and
    predicting on the one-row matrix.  Both passes must return a booster,
    two recorded ``rmse`` values and a single NumPy prediction.
    """
    params = {
        'tree_method': 'gpu_hist',
        'verbosity': 3,
        'debug_synchronize': True
    }
    n_cols = 97

    def _check_outputs(out, predictions):
        assert isinstance(out['booster'], dxgb.Booster)
        assert len(out['history']['validation']['rmse']) == 2
        assert isinstance(predictions, np.ndarray)
        assert predictions.shape[0] == 1

    with LocalCUDACluster() as cluster, Client(cluster) as client:

        def _random_dmatrix(n_rows):
            # Features are drawn before labels so the random stream is
            # consumed in a fixed order.
            X = dd.from_array(np.random.randn(n_rows, n_cols))
            y = dd.from_array(np.random.rand(n_rows))
            return dxgb.DaskDMatrix(client, X, y)

        def _fit_and_check(train, valid):
            # Train on ``train``, evaluate and predict on ``valid``.
            out = dxgb.train(
                client,
                params,
                dtrain=train,
                evals=[(valid, 'validation')],
                num_boost_round=2,
            )
            predictions = dxgb.predict(
                client=client, model=out, data=valid).compute()
            _check_outputs(out, predictions)

        single_row = _random_dmatrix(1)
        _fit_and_check(single_row, single_row)

        # train has more rows than evals
        two_rows = _random_dmatrix(2)
        _fit_and_check(two_rows, single_row)
def test_empty_partition(self, local_cuda_cluster: LocalCUDACluster) -> None:
    """Check that an extra zero-row input piece does not change predictions.

    Two models are trained with identical parameters: one on a frame whose
    first piece is an empty slice of the data, one on the same frame without
    that piece.  Each model predicts on its own training features and the
    two results are compared.
    """
    import dask_cudf
    import cudf
    import cupy

    with Client(local_cuda_cluster) as client:
        mult = 100
        df = cudf.DataFrame({
            "a": [1, 2, 3, 4, 5.1] * mult,
            "b": [10, 15, 29.3, 30, 31] * mult,
            "y": [10, 20, 30, 40.0, 50] * mult,
        })
        parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}

        def _train_and_predict(pieces):
            # Concatenate the pieces, split off the label column, train and
            # return the predictions on the training features.
            ddf = dask_cudf.concat(pieces)
            X = ddf[ddf.columns.difference(["y"])]
            y = ddf[["y"]]
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
            bst = xgb.dask.train(
                client, parameters, dtrain, evals=[(dtrain, "train")])
            return dxgb.predict(client, bst, X).compute().values

        empty = df.iloc[:0]
        predt_empty = _train_and_predict([
            dask_cudf.from_cudf(empty, npartitions=1),
            dask_cudf.from_cudf(df, npartitions=3),
            dask_cudf.from_cudf(df, npartitions=3),
        ])

        predt = _train_and_predict([
            dask_cudf.from_cudf(df, npartitions=3),
            dask_cudf.from_cudf(df, npartitions=3),
        ])

        cupy.testing.assert_allclose(predt, predt_empty)
def test_dask_dataframe(client):
    """Train a ``gpu_hist`` booster on a cudf-backed dask DataFrame and predict.

    The booster is trained for two rounds with the training matrix as its
    own evaluation set, the training output is checked, and a prediction on
    the training matrix is computed.

    Parameters
    ----------
    client :
        Dask client passed to ``DaskDMatrix``, ``train`` and ``predict``.
    """
    X, y = generate_array()

    # Wrap the dask arrays as dask DataFrames, then convert every partition
    # with ``cudf.from_pandas``.
    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    X = X.map_partitions(cudf.from_pandas)
    y = y.map_partitions(cudf.from_pandas)

    dtrain = dxgb.DaskDMatrix(client, X, y)
    out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                     dtrain=dtrain,
                     evals=[(dtrain, 'X')],
                     num_boost_round=2)

    assert isinstance(out['booster'], dxgb.Booster)
    # One evaluation record per boosting round.
    assert len(out['history']['X']['rmse']) == 2

    # ``predict`` takes the client as its first argument, like ``train``
    # above; without it the training output would be bound to the client
    # parameter.
    predictions = dxgb.predict(client, out, dtrain)
    # Computing forces the prediction to run, so a failure surfaces here.
    predictions = predictions.compute()