def run_with_dask_array(DMatrixT, client): import cupy as cp cp.cuda.runtime.setDevice(0) X, y = generate_array() X = X.map_blocks(cp.asarray) y = y.map_blocks(cp.asarray) dtrain = DMatrixT(client, X, y) out = dxgb.train(client, {'tree_method': 'gpu_hist', 'debug_synchronize': True}, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=2) from_dmatrix = dxgb.predict(client, out, dtrain).compute() inplace_predictions = dxgb.inplace_predict( client, out, X).compute() single_node = out['booster'].predict( xgboost.DMatrix(X.compute())) np.testing.assert_allclose(single_node, from_dmatrix) device = cp.cuda.runtime.getDevice() assert device == inplace_predictions.device.id single_node = cp.array(single_node) assert device == single_node.device.id cp.testing.assert_allclose( single_node, inplace_predictions)
def test_data_initialization(self, local_cuda_cluster: LocalCUDACluster) -> None: with Client(local_cuda_cluster) as client: X, y, _ = generate_array() fw = da.random.random((random_cols, )) fw = fw - fw.min() m = dxgb.DaskDMatrix(client, X, y, feature_weights=fw) workers = _get_client_workers(client) rabit_args = client.sync(dxgb._get_rabit_args, len(workers), client) def worker_fn(worker_addr: str, data_ref: Dict) -> None: with dxgb.RabitContext(rabit_args): local_dtrain = dxgb._dmatrix_from_list_of_parts(**data_ref) fw_rows = local_dtrain.get_float_info("feature_weights").shape[0] assert fw_rows == local_dtrain.num_col() futures = [] for i in range(len(workers)): futures.append( client.submit( worker_fn, workers[i], m._create_fn_args(workers[i]), pure=False, workers=[workers[i]] ) ) client.gather(futures)
async def run_from_dask_array_asyncio(scheduler_address): async with Client(scheduler_address, asynchronous=True) as client: import cupy as cp X, y = generate_array() X = X.map_blocks(cp.array) y = y.map_blocks(cp.array) m = await xgboost.dask.DaskDeviceQuantileDMatrix(client, X, y) output = await xgboost.dask.train(client, {'tree_method': 'gpu_hist'}, dtrain=m) with_m = await xgboost.dask.predict(client, output, m) with_X = await xgboost.dask.predict(client, output, X) inplace = await xgboost.dask.inplace_predict(client, output, X) assert isinstance(with_m, da.Array) assert isinstance(with_X, da.Array) assert isinstance(inplace, da.Array) cp.testing.assert_allclose(await client.compute(with_m), await client.compute(with_X)) cp.testing.assert_allclose(await client.compute(with_m), await client.compute(inplace)) client.shutdown() return output
def test_dask_array(self): with LocalCUDACluster() as cluster: with Client(cluster) as client: import cupy as cp cp.cuda.runtime.setDevice(0) X, y = generate_array() X = X.map_blocks(cp.asarray) y = y.map_blocks(cp.asarray) dtrain = dxgb.DaskDMatrix(client, X, y) out = dxgb.train(client, {'tree_method': 'gpu_hist'}, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=2) from_dmatrix = dxgb.predict(client, out, dtrain).compute() inplace_predictions = dxgb.inplace_predict(client, out, X).compute() single_node = out['booster'].predict( xgboost.DMatrix(X.compute())) np.testing.assert_allclose(single_node, from_dmatrix) device = cp.cuda.runtime.getDevice() assert device == inplace_predictions.device.id single_node = cp.array(single_node) assert device == single_node.device.id cp.testing.assert_allclose(single_node, inplace_predictions)
def test_dask_dataframe(self): with LocalCUDACluster() as cluster: with Client(cluster) as client: import cupy X, y = generate_array() X = dd.from_dask_array(X) y = dd.from_dask_array(y) X = X.map_partitions(cudf.from_pandas) y = y.map_partitions(cudf.from_pandas) dtrain = dxgb.DaskDMatrix(client, X, y) out = dxgb.train(client, {'tree_method': 'gpu_hist'}, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=2) assert isinstance(out['booster'], dxgb.Booster) assert len(out['history']['X']['rmse']) == 2 predictions = dxgb.predict(client, out, dtrain).compute() assert isinstance(predictions, np.ndarray) series_predictions = dxgb.inplace_predict(client, out, X) assert isinstance(series_predictions, dd.Series) series_predictions = series_predictions.compute() single_node = out['booster'].predict( xgboost.DMatrix(X.compute())) cupy.testing.assert_allclose(single_node, predictions) cupy.testing.assert_allclose(single_node, series_predictions)
def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None: import dask_cudf with Client(local_cuda_cluster) as client: X_, y_, w_ = generate_array(with_weights=True) y_ = (y_ * 10).astype(np.int32) X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_)) y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_)) w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_)) run_dask_classifier(X, y, w, model, client)
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None: import cupy as cp cp.cuda.runtime.setDevice(0) X, y, _ = generate_array() X = dd.from_dask_array(X) y = dd.from_dask_array(y) X = X.map_partitions(cudf.from_pandas) y = y.map_partitions(cudf.from_pandas) dtrain = DMatrixT(client, X, y) out = dxgb.train(client, {'tree_method': 'gpu_hist', 'debug_synchronize': True}, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=4) assert isinstance(out['booster'], dxgb.Booster) assert len(out['history']['X']['rmse']) == 4 predictions = dxgb.predict(client, out, dtrain) assert isinstance(predictions.compute(), np.ndarray) series_predictions = dxgb.inplace_predict(client, out, X) assert isinstance(series_predictions, dd.Series) single_node = out['booster'].predict(xgboost.DMatrix(X.compute())) cp.testing.assert_allclose(single_node, predictions.compute()) np.testing.assert_allclose(single_node, series_predictions.compute().to_array()) predt = dxgb.predict(client, out, X) assert isinstance(predt, dd.Series) T = TypeVar('T') def is_df(part: T) -> T: assert isinstance(part, cudf.DataFrame), part return part predt.map_partitions( is_df, meta=dd.utils.make_meta({'prediction': 'f4'})) cp.testing.assert_allclose( predt.values.compute(), single_node) # Make sure the output can be integrated back to original dataframe X["predict"] = predictions X["inplace_predict"] = series_predictions has_null = X.isnull().values.any().compute() assert bool(has_null) is False
def test_dask_dataframe(self): with LocalCUDACluster() as cluster: with Client(cluster) as client: import cupy as cp cp.cuda.runtime.setDevice(0) X, y = generate_array() X = dd.from_dask_array(X) y = dd.from_dask_array(y) X = X.map_partitions(cudf.from_pandas) y = y.map_partitions(cudf.from_pandas) dtrain = dxgb.DaskDMatrix(client, X, y) out = dxgb.train(client, { 'tree_method': 'gpu_hist', 'debug_synchronize': True }, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=4) assert isinstance(out['booster'], dxgb.Booster) assert len(out['history']['X']['rmse']) == 4 predictions = dxgb.predict(client, out, dtrain).compute() assert isinstance(predictions, np.ndarray) series_predictions = dxgb.inplace_predict(client, out, X) assert isinstance(series_predictions, dd.Series) series_predictions = series_predictions.compute() single_node = out['booster'].predict( xgboost.DMatrix(X.compute())) cp.testing.assert_allclose(single_node, predictions) np.testing.assert_allclose(single_node, series_predictions.to_array()) predt = dxgb.predict(client, out, X) assert isinstance(predt, dd.Series) def is_df(part): assert isinstance(part, cudf.DataFrame), part return part predt.map_partitions(is_df, meta=dd.utils.make_meta( {'prediction': 'f4'})) cp.testing.assert_allclose(predt.values.compute(), single_node)
def test_dask_dataframe(client): X, y = generate_array() X = dd.from_dask_array(X) y = dd.from_dask_array(y) X = X.map_partitions(cudf.from_pandas) y = y.map_partitions(cudf.from_pandas) dtrain = dxgb.DaskDMatrix(client, X, y) out = dxgb.train(client, {'tree_method': 'gpu_hist'}, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=2) assert isinstance(out['booster'], dxgb.Booster) assert len(out['history']['X']['rmse']) == 2
def test_dask_dataframe(self): with LocalCUDACluster() as cluster: with Client(cluster) as client: X, y = generate_array() X = dd.from_dask_array(X) y = dd.from_dask_array(y) X = X.map_partitions(cudf.from_pandas) y = y.map_partitions(cudf.from_pandas) dtrain = dxgb.DaskDMatrix(client, X, y) out = dxgb.train(client, {'tree_method': 'gpu_hist'}, dtrain=dtrain, evals=[(dtrain, 'X')], num_boost_round=2) assert isinstance(out['booster'], dxgb.Booster) assert len(out['history']['X']['rmse']) == 2 predictions = dxgb.predict(client, out, dtrain).compute() assert isinstance(predictions, np.ndarray)