def test_custom_objective(self, client: "Client") -> None:
    from sklearn.datasets import load_boston
    X, y = load_boston(return_X_y=True)
    X, y = da.from_array(X), da.from_array(y)
    rounds = 20

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'log')

        def sqr(
            labels: np.ndarray, predts: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray]:
            with open(path, 'a') as fd:
                print('Running sqr', file=fd)
            grad = predts - labels
            hess = np.ones(shape=labels.shape[0])
            return grad, hess

        reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds, objective=sqr,
                                        tree_method='hist')
        reg.fit(X, y, eval_set=[(X, y)])

        # Check the objective is evaluated once per boosting round.
        with open(path, 'r') as fd:
            out = fd.readlines()
            assert len(out) == rounds

        results_custom = reg.evals_result()

        reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds,
                                        tree_method='hist')
        reg.fit(X, y, eval_set=[(X, y)])
        results_native = reg.evals_result()

        np.testing.assert_allclose(results_custom['validation_0']['rmse'],
                                   results_native['validation_0']['rmse'])
        assert tm.non_increasing(results_native['validation_0']['rmse'])

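# The assertions throughout this file go through `tm.non_increasing`. A
# minimal sketch of that helper, consistent with how it is called here (a
# sequence of scores plus an optional tolerance); the actual implementation
# lives in the project's testing utilities and may differ:
def non_increasing(L, tolerance=1e-4):
    # True when no score rises by more than `tolerance` over its predecessor.
    return all((y - x) < tolerance for x, y in zip(L, L[1:]))
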
def test_categorical_ames_housing(
    self,
    hist_parameters: Dict[str, Any],
    cat_parameters: Dict[str, Any],
    tree_method: str,
) -> None:
    cat_parameters.update(hist_parameters)
    dataset = tm.TestDataset(
        "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
    )
    cat_parameters["tree_method"] = tree_method

    results = train_result(cat_parameters, dataset.get_dmat(), 16)
    assert tm.non_increasing(results["train"]["rmse"])

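# `train_result`, used by many tests below, is defined elsewhere in the test
# module. A plausible sketch, assuming it trains for the requested number of
# rounds on a single DMatrix and returns the recorded evaluation history:
def train_result(param, dmat, num_rounds):
    result = {}
    xgb.train(param, dmat, num_boost_round=num_rounds,
              evals=[(dmat, 'train')], verbose_eval=False,
              evals_result=result)
    return result
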
def test_sparse(self, dataset):
    param = {"tree_method": "hist", "max_bin": 64}
    hist_result = train_result(param, dataset.get_dmat(), 16)
    note(hist_result)
    assert tm.non_increasing(hist_result['train'][dataset.metric])

    param = {"tree_method": "approx", "max_bin": 64}
    approx_result = train_result(param, dataset.get_dmat(), 16)
    note(approx_result)
    assert tm.non_increasing(approx_result['train'][dataset.metric])

    np.testing.assert_allclose(
        hist_result["train"]["rmse"], approx_result["train"]["rmse"]
    )

def test_categorical(
    self,
    dataset: tm.TestDataset,
    exact_parameters: Dict[str, Any],
    hist_parameters: Dict[str, Any],
    cat_parameters: Dict[str, Any],
    n_rounds: int,
    tree_method: str,
) -> None:
    cat_parameters.update(exact_parameters)
    cat_parameters.update(hist_parameters)
    cat_parameters["tree_method"] = tree_method

    results = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
    assert tm.non_increasing(results["train"]["rmse"])

def test_changed_parameter(self):
    from sklearn.datasets import load_breast_cancer
    X, y = load_breast_cancer(return_X_y=True)
    clf = xgb.XGBClassifier(n_estimators=2)
    clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
    assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])

    with tempfile.TemporaryDirectory() as tmpdir:
        clf.save_model(os.path.join(tmpdir, "clf.json"))
        loaded = xgb.XGBClassifier()
        loaded.load_model(os.path.join(tmpdir, "clf.json"))

        clf = xgb.XGBClassifier(n_estimators=2)
        # change metric to error
        clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
        assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])

def run_updater_test(self, client, params, num_rounds, dataset, tree_method):
    params['tree_method'] = tree_method
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = da.from_array(dataset.X, chunks=(chunk, dataset.X.shape[1]))
    y = da.from_array(dataset.y, chunks=(chunk,))
    if dataset.w is not None:
        w = da.from_array(dataset.w, chunks=(chunk,))
    else:
        w = None

    m = xgb.dask.DaskDMatrix(client, data=X, label=y, weight=w)
    history = xgb.dask.train(client, params=params, dtrain=m,
                             num_boost_round=num_rounds,
                             evals=[(m, 'train')])['history']
    note(history)
    history = history['train'][dataset.metric]
    assert tm.non_increasing(history)
    # Make sure that it's decreasing overall, not merely flat.
    assert history[-1] < history[0]

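# The `note`/`assume` calls suggest these runners are driven by hypothesis.
# One way the wiring might look; `hist_parameter_strategy` and
# `tm.dataset_strategy` are assumed names, not confirmed by this file:
from hypothesis import given, settings, strategies

@given(params=hist_parameter_strategy,
       num_rounds=strategies.integers(10, 20),
       dataset=tm.dataset_strategy)
@settings(deadline=None)
def test_hist(self, params, num_rounds, dataset, client):
    self.run_updater_test(client, params, num_rounds, dataset, 'hist')
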
def run_categorical_basic(self, cat, onehot, label, rounds):
    by_etl_results = {}
    by_builtin_results = {}
    parameters = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'enable_experimental_json_serialization': True
    }

    m = xgb.DMatrix(onehot, label, enable_categorical=True)
    xgb.train(parameters, m, num_boost_round=rounds,
              evals=[(m, 'Train')], evals_result=by_etl_results)

    m = xgb.DMatrix(cat, label, enable_categorical=True)
    xgb.train(parameters, m, num_boost_round=rounds,
              evals=[(m, 'Train')], evals_result=by_builtin_results)

    np.testing.assert_allclose(
        np.array(by_etl_results['Train']['rmse']),
        np.array(by_builtin_results['Train']['rmse']),
        rtol=1e-3)
    assert tm.non_increasing(by_builtin_results['Train']['rmse'])

def test_approx(self, param, hist_param, num_rounds, dataset):
    param["tree_method"] = "approx"
    param = dataset.set_params(param)
    param.update(hist_param)
    result = train_result(param, dataset.get_dmat(), num_rounds)
    note(result)
    assert tm.non_increasing(result["train"][dataset.metric])

def test_external_memory(self, param, num_rounds, dataset):
    # We cannot handle empty dataset yet
    assume(len(dataset.y) > 0)
    param['tree_method'] = 'gpu_hist'
    param = dataset.set_params(param)
    external_result = train_result(param, dataset.get_external_dmat(),
                                   num_rounds)
    assert tm.non_increasing(external_result['train'][dataset.metric])

def test_coordinate(self, param, num_rounds, dataset, coord_param):
    param['updater'] = 'coord_descent'
    param.update(coord_param)
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    assert tm.non_increasing(result, 5e-4)

def run_gpu_hist(params, num_rounds, dataset, DMatrixT, client):
    params['tree_method'] = 'gpu_hist'
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = to_cp(dataset.X, DMatrixT)
    X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
    y = to_cp(dataset.y, DMatrixT)
    y = da.from_array(y, chunks=(chunk,))
    if dataset.w is not None:
        w = to_cp(dataset.w, DMatrixT)
        w = da.from_array(w, chunks=(chunk,))
    else:
        w = None

    if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        m = DMatrixT(client, data=X, label=y, weight=w,
                     max_bin=params.get('max_bin', 256))
    else:
        m = DMatrixT(client, data=X, label=y, weight=w)

    history = dxgb.train(client, params=params, dtrain=m,
                         num_boost_round=num_rounds,
                         evals=[(m, 'train')])['history']
    note(history)
    assert tm.non_increasing(history['train'][dataset.metric])

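# `to_cp` is assumed to be a small helper that copies numpy data to the GPU
# only when the device-backed DMatrix type is under test; a sketch:
def to_cp(x, DMatrixT):
    import cupy
    # Only DaskDeviceQuantileDMatrix requires device (cupy) inputs; host
    # arrays pass through unchanged for every other DMatrix type.
    if isinstance(x, np.ndarray) and \
            DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        return cupy.array(x)
    return x
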
def test_gpu_hist(self, params, num_rounds, dataset):
    with LocalCUDACluster(n_workers=2) as cluster:
        with Client(cluster) as client:
            params['tree_method'] = 'gpu_hist'
            params = dataset.set_params(params)
            # multi:softmax doesn't cope well with workers that receive an
            # empty partition, so skip it here.
            if params['objective'] == "multi:softmax":
                return
            # It doesn't make sense to distribute a completely empty dataset.
            if dataset.X.shape[0] == 0:
                return

            chunk = 128
            X = da.from_array(dataset.X, chunks=(chunk, dataset.X.shape[1]))
            y = da.from_array(dataset.y, chunks=(chunk,))
            if dataset.w is not None:
                w = da.from_array(dataset.w, chunks=(chunk,))
            else:
                w = None

            m = dxgb.DaskDMatrix(client, data=X, label=y, weight=w)
            history = dxgb.train(client, params=params, dtrain=m,
                                 num_boost_round=num_rounds,
                                 evals=[(m, 'train')])['history']
            note(history)
            assert tm.non_increasing(history['train'][dataset.metric])

def test_exact(self, param, num_rounds, dataset):
    if dataset.name.endswith("-l1"):
        return
    param['tree_method'] = 'exact'
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(), num_rounds)
    assert tm.non_increasing(result['train'][dataset.metric])

def test_hist(self, param, hist_param, num_rounds, dataset):
    param['tree_method'] = 'hist'
    param = dataset.set_params(param)
    param.update(hist_param)
    result = train_result(param, dataset.get_dmat(), num_rounds)
    note(result)
    assert tm.non_increasing(result['train'][dataset.metric])

def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha,
                                    lambd):
    assume(len(dataset.y) > 0)
    param['updater'] = 'gpu_coord_descent'
    param['alpha'] = alpha
    param['lambda'] = lambd
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    assert tm.non_increasing([result[0], result[-1]])

def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
    # We cannot handle empty dataset yet
    assume(len(dataset.y) > 0)
    param['tree_method'] = 'gpu_hist'
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_device_dmat(), num_rounds)
    note(result)
    assert tm.non_increasing(result['train'][dataset.metric])

def test_gpu_coordinate(self, param, num_rounds, dataset):
    assume(len(dataset.y) > 0)
    param['updater'] = 'gpu_coord_descent'
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    note(result)
    assert tm.non_increasing(result)

def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
    param['updater'] = 'shotgun'
    param['alpha'] = alpha
    param['lambda'] = lambd
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    assert tm.non_increasing([result[0], result[-1]])

def run_gpu_hist(
    params: Dict,
    num_rounds: int,
    dataset: tm.TestDataset,
    DMatrixT: Type,
    client: Client,
) -> None:
    params["tree_method"] = "gpu_hist"
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = to_cp(dataset.X, DMatrixT)
    X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
    y = to_cp(dataset.y, DMatrixT)
    y_chunk = chunk if len(dataset.y.shape) == 1 else (chunk, dataset.y.shape[1])
    y = da.from_array(y, chunks=y_chunk)

    if dataset.w is not None:
        w = to_cp(dataset.w, DMatrixT)
        w = da.from_array(w, chunks=(chunk,))
    else:
        w = None

    if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        m = DMatrixT(client, data=X, label=y, weight=w,
                     max_bin=params.get("max_bin", 256))
    else:
        m = DMatrixT(client, data=X, label=y, weight=w)

    history = dxgb.train(
        client,
        params=params,
        dtrain=m,
        num_boost_round=num_rounds,
        evals=[(m, "train")],
    )["history"]["train"][dataset.metric]
    note(history)

    # See note on `ObjFunction::UpdateTreeLeaf`.
    update_leaf = dataset.name.endswith("-l1")
    if update_leaf and len(history) == 2:
        assert history[0] + 1e-2 >= history[-1]
    elif update_leaf and len(history) > 2:
        assert history[0] >= history[-1]
    else:
        assert tm.non_increasing(history)

def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param,
                                alpha, lambd):
    param['updater'] = 'coord_descent'
    param['alpha'] = alpha
    param['lambda'] = lambd
    param.update(coord_param)
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    note(result)
    assert tm.non_increasing([result[0], result[-1]])

def test_external_memory(self, param, num_rounds, dataset):
    pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
    # We cannot handle empty dataset yet
    assume(len(dataset.y) > 0)
    param['tree_method'] = 'gpu_hist'
    param = dataset.set_params(param)
    m = dataset.get_external_dmat()
    external_result = train_result(param, m, num_rounds)
    del m
    gc.collect()
    assert tm.non_increasing(external_result['train'][dataset.metric])

def test_external_memory(self, param, num_rounds, dataset):
    if dataset.name.endswith("-l1"):
        return
    # We cannot handle empty dataset yet
    assume(len(dataset.y) > 0)
    param['tree_method'] = 'gpu_hist'
    param = dataset.set_params(param)
    m = dataset.get_external_dmat()
    external_result = train_result(param, m, num_rounds)
    del m
    gc.collect()
    assert tm.non_increasing(external_result['train'][dataset.metric])

def test_shotgun(self, param, num_rounds, dataset):
    param['updater'] = 'shotgun'
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    # Shotgun is non-deterministic, so we relax the test by only using the
    # first and last iterations.
    if len(result) > 2:
        sampled_result = (result[0], result[-1])
    else:
        sampled_result = result
    assert tm.non_increasing(sampled_result)

def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        rounds = 10
        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)

        parameters = {"tree_method": "gpu_hist"}

        m = dxgb.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
        by_etl_results = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )["history"]

        m = dxgb.DaskDMatrix(client, X, y, enable_categorical=True)
        output = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )
        by_builtin_results = output["history"]

        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        model = output["booster"]
        with tempfile.TemporaryDirectory() as tempdir:
            path = os.path.join(tempdir, "model.json")
            model.save_model(path)
            with open(path, "r") as fd:
                categorical = json.load(fd)

            categories_sizes = np.array(
                categorical["learner"]["gradient_booster"]["model"]["trees"]
                [-1]["categories_sizes"]
            )
            assert categories_sizes.shape[0] != 0
            np.testing.assert_allclose(categories_sizes, 1)

def test_early_stopping(self, client: "Client") -> None:
    from sklearn.datasets import load_breast_cancer
    X, y = load_breast_cancer(return_X_y=True)
    X, y = da.from_array(X), da.from_array(y)
    m = xgb.dask.DaskDMatrix(client, X, y)

    valid = xgb.dask.DaskDMatrix(client, X, y)
    early_stopping_rounds = 5
    booster = xgb.dask.train(
        client,
        {'objective': 'binary:logistic',
         'eval_metric': 'error',
         'tree_method': 'hist'},
        m,
        evals=[(valid, 'Valid')],
        num_boost_round=1000,
        early_stopping_rounds=early_stopping_rounds)['booster']
    assert hasattr(booster, 'best_score')
    dump = booster.get_dump(dump_format='json')
    assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    valid_X, valid_y = load_breast_cancer(return_X_y=True)
    valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
    cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                     tree_method='hist',
                                     n_estimators=1000)
    cls.client = client
    cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
            eval_set=[(valid_X, valid_y)])
    booster = cls.get_booster()
    dump = booster.get_dump(dump_format='json')
    assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    # Specify the metric
    cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                     tree_method='hist',
                                     n_estimators=1000)
    cls.client = client
    cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
            eval_set=[(valid_X, valid_y)], eval_metric='error')
    assert tm.non_increasing(cls.evals_result()['validation_0']['error'])
    booster = cls.get_booster()
    dump = booster.get_dump(dump_format='json')
    assert len(cls.evals_result()['validation_0']['error']) < 20
    assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

def test_shotgun(self, param, num_rounds, dataset):
    param['updater'] = 'shotgun'
    param = dataset.set_params(param)
    result = train_result(param, dataset.get_dmat(),
                          num_rounds)['train'][dataset.metric]
    # Shotgun is non-deterministic, so we relax the test by sampling the
    # result.
    if len(result) > 2:
        sampled_result = [
            score for i, score in enumerate(result) if i % 2 == 0
        ]
        sampled_result[-1] = result[-1]  # make sure the last score is used
    else:
        sampled_result = result
    assert tm.non_increasing(sampled_result, 1e-3)

def run(max_cat_to_onehot: int):
    # Test with one-hot splits
    parameters["max_cat_to_onehot"] = max_cat_to_onehot

    evals_result: Dict[str, Dict] = {}
    booster = xgb.train(
        parameters,
        Xy,
        num_boost_round=16,
        evals=[(Xy, "Train")],
        evals_result=evals_result,
    )
    assert tm.non_increasing(evals_result["Train"]["rmse"])
    y_predt = booster.predict(Xy)

    rmse = tm.root_mean_square(label, y_predt)
    np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])

def run_categorical_basic(self, rows, cols, rounds, cats):
    import pandas as pd

    rng = np.random.RandomState(1994)

    pd_dict = {}
    for i in range(cols):
        c = rng.randint(low=0, high=cats + 1, size=rows)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
    label = df.iloc[:, 0]
    for i in range(0, cols - 1):
        label += df.iloc[:, i]
    label += 1
    df = df.astype('category')

    onehot = pd.get_dummies(df)
    cat = df

    by_etl_results = {}
    by_builtin_results = {}
    parameters = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'enable_experimental_json_serialization': True
    }

    m = xgb.DMatrix(onehot, label, enable_categorical=True)
    xgb.train(parameters, m, num_boost_round=rounds,
              evals=[(m, 'Train')], evals_result=by_etl_results)

    m = xgb.DMatrix(cat, label, enable_categorical=True)
    xgb.train(parameters, m, num_boost_round=rounds,
              evals=[(m, 'Train')], evals_result=by_builtin_results)

    np.testing.assert_allclose(
        np.array(by_etl_results['Train']['rmse']),
        np.array(by_builtin_results['Train']['rmse']),
        rtol=1e-3)
    assert tm.non_increasing(by_builtin_results['Train']['rmse'])

def run_max_cat(self, tree_method: str) -> None:
    """Test data with size smaller than number of categories."""
    import pandas as pd

    n_cat = 100
    n = 5
    X = pd.Series(
        ["".join(choice(ascii_lowercase) for i in range(3))
         for i in range(n_cat)],
        dtype="category",
    )[:n].to_frame()

    reg = xgb.XGBRegressor(
        enable_categorical=True,
        tree_method=tree_method,
        n_estimators=10,
    )
    y = pd.Series(range(n))
    reg.fit(X=X, y=y, eval_set=[(X, y)])
    assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])

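# `run_max_cat` takes the tree method as an argument, so it is presumably
# invoked from a parametrized test; one plausible wrapper (the parametrize
# values are an assumption):
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_max_cat(self, tree_method: str) -> None:
    self.run_max_cat(tree_method)
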
def run_gpu_hist(
    params: Dict,
    num_rounds: int,
    dataset: tm.TestDataset,
    DMatrixT: Type,
    client: Client,
) -> None:
    params["tree_method"] = "gpu_hist"
    params = dataset.set_params(params)
    # It doesn't make sense to distribute a completely empty dataset.
    if dataset.X.shape[0] == 0:
        return

    chunk = 128
    X = to_cp(dataset.X, DMatrixT)
    X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
    y = to_cp(dataset.y, DMatrixT)
    y = da.from_array(y, chunks=(chunk,))
    if dataset.w is not None:
        w = to_cp(dataset.w, DMatrixT)
        w = da.from_array(w, chunks=(chunk,))
    else:
        w = None

    if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
        m = DMatrixT(client, data=X, label=y, weight=w,
                     max_bin=params.get("max_bin", 256))
    else:
        m = DMatrixT(client, data=X, label=y, weight=w)

    history = dxgb.train(
        client,
        params=params,
        dtrain=m,
        num_boost_round=num_rounds,
        evals=[(m, "train")],
    )["history"]
    note(history)
    assert tm.non_increasing(history["train"][dataset.metric])