def test_early_stopping(self, local_cuda_cluster: LocalCUDACluster) -> None:
    """Exercise early stopping through ``dxgb.train`` and ``DaskXGBClassifier``.

    The breast cancer dataset serves as both the training and the validation
    set.  For each API, the number of dumped trees minus ``best_iteration``
    must equal ``early_stopping_rounds + 1``.
    """
    from sklearn.datasets import load_breast_cancer

    patience = 5
    train_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'error',
        'tree_method': 'gpu_hist',
    }

    with Client(local_cuda_cluster) as client:
        features, labels = load_breast_cancer(return_X_y=True)
        X = da.from_array(features)
        y = da.from_array(labels)

        dtrain = dxgb.DaskDMatrix(client, X, y)
        dvalid = dxgb.DaskDMatrix(client, X, y)

        # Functional interface: up to 1000 rounds are requested, early
        # stopping is driven by the 'Valid' evaluation set.
        output = dxgb.train(
            client,
            train_params,
            dtrain,
            evals=[(dvalid, 'Valid')],
            num_boost_round=1000,
            early_stopping_rounds=patience,
        )
        booster = output['booster']
        assert hasattr(booster, 'best_score')
        trees = booster.get_dump(dump_format='json')
        assert len(trees) - booster.best_iteration == patience + 1

        # Scikit-learn style interface, validated on the training data itself.
        classifier = dxgb.DaskXGBClassifier(
            objective='binary:logistic',
            tree_method='gpu_hist',
            n_estimators=100,
        )
        classifier.client = client
        classifier.fit(
            X,
            y,
            early_stopping_rounds=patience,
            eval_set=[(X, y)],
        )
        booster = classifier.get_booster()
        trees = booster.get_dump(dump_format='json')
        assert len(trees) - booster.best_iteration == patience + 1
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    """Compare training on categorical data with training on its one-hot form.

    Two models are trained with ``gpu_hist`` for the same number of rounds:
    one on the output of ``make_categorical(..., True)`` (presumably the
    one-hot encoded variant -- confirm against the helper) and one on the
    plain ``make_categorical`` output.  Their training RMSE histories must
    agree within ``rtol=1e-3``, the second history must be non-increasing,
    and the saved JSON model must report non-empty ``categories_sizes`` that
    are all close to 1 for the last tree.
    """
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        n_rounds = 10
        params = {"tree_method": "gpu_hist"}

        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)

        def _train(features):
            # Build the DMatrix and evaluate on the training data itself.
            dmat = dxgb.DaskDMatrix(client, features, y, enable_categorical=True)
            return dxgb.train(
                client,
                params,
                dmat,
                num_boost_round=n_rounds,
                evals=[(dmat, "Train")],
            )

        by_etl_results = _train(X_onehot)["history"]

        output = _train(X)
        by_builtin_results = output["history"]

        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        # Round-trip the model through JSON to inspect the stored tree data.
        booster = output["booster"]
        with tempfile.TemporaryDirectory() as tempdir:
            model_path = os.path.join(tempdir, "model.json")
            booster.save_model(model_path)
            with open(model_path, "r") as fd:
                model_json = json.load(fd)

            trees = model_json["learner"]["gradient_booster"]["model"]["trees"]
            categories_sizes = np.array(trees[-1]["categories_sizes"])
            assert categories_sizes.shape[0] != 0
            np.testing.assert_allclose(categories_sizes, 1)
def test_data_initialization(self, local_cuda_cluster: LocalCUDACluster) -> None:
    """Check that feature weights reach the DMatrix built on every worker.

    A ``DaskDMatrix`` is created with ``feature_weights`` and, on each
    worker, the local DMatrix is rebuilt from that worker's parts inside a
    ``RabitContext``.  The length of the ``feature_weights`` float info must
    equal the local number of columns.
    """
    with Client(local_cuda_cluster) as client:
        X, y, _ = generate_array()

        # Shift the random weights so that the smallest one is zero.
        weights = da.random.random((random_cols, ))
        weights = weights - weights.min()
        dmatrix = dxgb.DaskDMatrix(client, X, y, feature_weights=weights)

        workers = _get_client_workers(client)
        rabit_args = client.sync(dxgb._get_rabit_args, len(workers), client)

        def worker_fn(worker_addr: str, data_ref: Dict) -> None:
            # ``worker_addr`` is not read in the body; only ``data_ref`` is
            # used to rebuild the local DMatrix.
            with dxgb.RabitContext(rabit_args):
                local_dtrain = dxgb._dmatrix_from_list_of_parts(**data_ref)
                weight_info = local_dtrain.get_float_info("feature_weights")
                fw_rows = weight_info.shape[0]
                assert fw_rows == local_dtrain.num_col()

        # One task per worker, each restricted to run on that worker.
        futures = [
            client.submit(
                worker_fn,
                addr,
                dmatrix._create_fn_args(addr),
                pure=False,
                workers=[addr],
            )
            for addr in workers
        ]
        client.gather(futures)
def test_dask_array(self):
    """Train on cupy-backed dask arrays and cross-check prediction paths.

    Predictions from ``dxgb.predict`` on the DaskDMatrix and from
    ``dxgb.inplace_predict`` on the raw array are compared against the
    booster's single-node prediction.  The in-place result must also live on
    the current CUDA device.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        import cupy as cp

        cp.cuda.runtime.setDevice(0)

        X, y = generate_array()
        X = X.map_blocks(cp.asarray)
        y = y.map_blocks(cp.asarray)

        dtrain = dxgb.DaskDMatrix(client, X, y)
        out = dxgb.train(
            client,
            {'tree_method': 'gpu_hist'},
            dtrain=dtrain,
            evals=[(dtrain, 'X')],
            num_boost_round=2,
        )

        from_dmatrix = dxgb.predict(client, out, dtrain).compute()
        inplace_predictions = dxgb.inplace_predict(client, out, X).compute()

        local_dmatrix = xgboost.DMatrix(X.compute())
        single_node = out['booster'].predict(local_dmatrix)
        np.testing.assert_allclose(single_node, from_dmatrix)

        current_device = cp.cuda.runtime.getDevice()
        assert current_device == inplace_predictions.device.id

        # Move the reference prediction to the GPU before comparing with the
        # in-place result.
        single_node = cp.array(single_node)
        assert current_device == single_node.device.id
        cp.testing.assert_allclose(single_node, inplace_predictions)
def test_gpu_hist(self, params, num_rounds, dataset):
    """Train with ``gpu_hist`` on a two-worker cluster and check the metric.

    The dataset is split into dask chunks of 128 rows and trained for
    ``num_rounds`` rounds; the training history of ``dataset.metric`` must be
    non-increasing.  The test returns early for the ``multi:softmax``
    objective and for datasets without rows.
    """
    with LocalCUDACluster(n_workers=2) as cluster, Client(cluster) as client:
        params['tree_method'] = 'gpu_hist'
        params = dataset.set_params(params)

        # multi class doesn't handle empty dataset well (empty
        # means at least 1 worker has data).
        if params['objective'] == "multi:softmax":
            return

        # It doesn't make sense to distribute a completely
        # empty dataset.
        if dataset.X.shape[0] == 0:
            return

        rows_per_chunk = 128
        n_features = dataset.X.shape[1]
        X = da.from_array(dataset.X, chunks=(rows_per_chunk, n_features))
        y = da.from_array(dataset.y, chunks=(rows_per_chunk, ))
        # Sample weights are optional on the dataset.
        w = (
            da.from_array(dataset.w, chunks=(rows_per_chunk, ))
            if dataset.w is not None
            else None
        )

        dtrain = dxgb.DaskDMatrix(client, data=X, label=y, weight=w)
        result = dxgb.train(
            client,
            params=params,
            dtrain=dtrain,
            num_boost_round=num_rounds,
            evals=[(dtrain, 'train')],
        )
        history = result['history']
        note(history)
        assert tm.non_increasing(history['train'][dataset.metric])
def test_dask_dataframe(self):
    """Train on cudf-backed dask dataframes and compare prediction paths.

    Checks the booster type and history length after two rounds, the types
    returned by ``dxgb.predict`` and ``dxgb.inplace_predict``, and that both
    agree with the booster's single-node prediction.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        import cupy

        X, y = generate_array()

        # dask array -> dask dataframe -> cudf-backed partitions
        X = dd.from_dask_array(X).map_partitions(cudf.from_pandas)
        y = dd.from_dask_array(y).map_partitions(cudf.from_pandas)

        dtrain = dxgb.DaskDMatrix(client, X, y)
        out = dxgb.train(
            client,
            {'tree_method': 'gpu_hist'},
            dtrain=dtrain,
            evals=[(dtrain, 'X')],
            num_boost_round=2,
        )

        assert isinstance(out['booster'], dxgb.Booster)
        assert len(out['history']['X']['rmse']) == 2

        predictions = dxgb.predict(client, out, dtrain).compute()
        assert isinstance(predictions, np.ndarray)

        lazy_series = dxgb.inplace_predict(client, out, X)
        assert isinstance(lazy_series, dd.Series)
        series_predictions = lazy_series.compute()

        local_dmatrix = xgboost.DMatrix(X.compute())
        single_node = out['booster'].predict(local_dmatrix)

        cupy.testing.assert_allclose(single_node, predictions)
        cupy.testing.assert_allclose(single_node, series_predictions)
def test_empty_dmatrix(self):
    """Train and predict with tiny datasets (one and two rows).

    First a single-row DMatrix is used for training, evaluation and
    prediction.  Then a two-row DMatrix is trained while the single-row one
    is kept as the evaluation and prediction set.  Presumably this leaves
    some workers without data, as the test name suggests -- confirm against
    the cluster size in use.
    """

    def _check_outputs(out, predictions):
        assert isinstance(out['booster'], dxgb.Booster)
        assert len(out['history']['validation']['rmse']) == 2
        assert isinstance(predictions, np.ndarray)
        assert predictions.shape[0] == 1

    parameters = {
        'tree_method': 'gpu_hist',
        'verbosity': 3,
        'debug_synchronize': True,
    }

    with LocalCUDACluster() as cluster, Client(cluster) as client:
        kRows, kCols = 1, 97

        def _random_dmatrix(n_rows):
            # Features are drawn before labels, matching the order of the
            # random number generator calls in both scenarios.
            X = dd.from_array(np.random.randn(n_rows, kCols))
            y = dd.from_array(np.random.rand(n_rows))
            return dxgb.DaskDMatrix(client, X, y)

        def _train_predict_check(train_set, valid_set):
            out = dxgb.train(
                client,
                parameters,
                dtrain=train_set,
                evals=[(valid_set, 'validation')],
                num_boost_round=2,
            )
            predictions = dxgb.predict(
                client=client, model=out, data=valid_set
            ).compute()
            _check_outputs(out, predictions)

        # Same single-row matrix for training and validation.
        dtrain = _random_dmatrix(kRows)
        _train_predict_check(dtrain, dtrain)

        # train has more rows than evals
        valid = dtrain
        kRows += 1
        dtrain = _random_dmatrix(kRows)
        _train_predict_check(dtrain, valid)
def test_dask_dataframe(self):
    """Train on cudf-backed dask dataframes and compare prediction paths.

    After four boosting rounds with ``debug_synchronize`` enabled, the test
    checks the booster type and history length, the result types of
    ``dxgb.predict`` (on a DaskDMatrix and on the dataframe) and
    ``dxgb.inplace_predict``, and that every prediction agrees with the
    booster's single-node prediction.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        import cupy as cp

        cp.cuda.runtime.setDevice(0)

        X, y = generate_array()

        # dask array -> dask dataframe -> cudf-backed partitions
        X = dd.from_dask_array(X).map_partitions(cudf.from_pandas)
        y = dd.from_dask_array(y).map_partitions(cudf.from_pandas)

        dtrain = dxgb.DaskDMatrix(client, X, y)
        train_params = {
            'tree_method': 'gpu_hist',
            'debug_synchronize': True,
        }
        out = dxgb.train(
            client,
            train_params,
            dtrain=dtrain,
            evals=[(dtrain, 'X')],
            num_boost_round=4,
        )

        assert isinstance(out['booster'], dxgb.Booster)
        assert len(out['history']['X']['rmse']) == 4

        predictions = dxgb.predict(client, out, dtrain).compute()
        assert isinstance(predictions, np.ndarray)

        lazy_series = dxgb.inplace_predict(client, out, X)
        assert isinstance(lazy_series, dd.Series)
        series_predictions = lazy_series.compute()

        local_dmatrix = xgboost.DMatrix(X.compute())
        single_node = out['booster'].predict(local_dmatrix)

        cp.testing.assert_allclose(single_node, predictions)
        np.testing.assert_allclose(single_node, series_predictions.to_array())

        predt = dxgb.predict(client, out, X)
        assert isinstance(predt, dd.Series)

        def is_df(part):
            assert isinstance(part, cudf.DataFrame), part
            return part

        # NOTE(review): the collection returned by map_partitions is
        # discarded without being computed, so it is unclear whether
        # ``is_df`` ever runs -- confirm whether a ``.compute()`` is intended.
        predt.map_partitions(
            is_df,
            meta=dd.utils.make_meta({'prediction': 'f4'}),
        )

        cp.testing.assert_allclose(predt.values.compute(), single_node)
def test_dask_dataframe(client):
    """Train for two rounds on cudf-backed dask dataframes.

    Verifies that the result holds a ``dxgb.Booster`` and that the RMSE
    history recorded for the ``'X'`` evaluation set has one entry per round.
    """
    X, y = generate_array()

    # dask array -> dask dataframe -> cudf-backed partitions
    X = dd.from_dask_array(X).map_partitions(cudf.from_pandas)
    y = dd.from_dask_array(y).map_partitions(cudf.from_pandas)

    dtrain = dxgb.DaskDMatrix(client, X, y)
    out = dxgb.train(
        client,
        {'tree_method': 'gpu_hist'},
        dtrain=dtrain,
        evals=[(dtrain, 'X')],
        num_boost_round=2,
    )

    booster = out['booster']
    rmse_history = out['history']['X']['rmse']
    assert isinstance(booster, dxgb.Booster)
    assert len(rmse_history) == 2
def test_dask_dataframe(self):
    """Train for two rounds on cudf-backed dask dataframes and predict.

    Verifies the booster type, that the RMSE history for the ``'X'``
    evaluation set has one entry per round, and that ``dxgb.predict`` on the
    DaskDMatrix computes to a ``numpy.ndarray``.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        X, y = generate_array()

        # dask array -> dask dataframe -> cudf-backed partitions
        X = dd.from_dask_array(X).map_partitions(cudf.from_pandas)
        y = dd.from_dask_array(y).map_partitions(cudf.from_pandas)

        dtrain = dxgb.DaskDMatrix(client, X, y)
        out = dxgb.train(
            client,
            {'tree_method': 'gpu_hist'},
            dtrain=dtrain,
            evals=[(dtrain, 'X')],
            num_boost_round=2,
        )

        booster = out['booster']
        rmse_history = out['history']['X']['rmse']
        assert isinstance(booster, dxgb.Booster)
        assert len(rmse_history) == 2

        predictions = dxgb.predict(client, out, dtrain).compute()
        assert isinstance(predictions, np.ndarray)