class TestLinear(unittest.TestCase):
    datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
                "Boston External Memory"]

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_coordinate(self):
        variable_param = {'booster': ['gblinear'],
                          'updater': ['coord_descent'],
                          'eta': [0.5], 'top_k': [10],
                          'tolerance': [1e-5], 'nthread': [2],
                          'alpha': [.005, .1], 'lambda': [.005],
                          'feature_selector': ['cyclic', 'shuffle',
                                               'greedy', 'thrifty']}
        # TODO: implement features required by regression_test_utilities.py
        """
        for param in parameter_combinations(variable_param):
            results = run_suite(param, 150, self.datasets, scale_features=True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)
        """

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_shotgun(self):
        variable_param = {'booster': ['gblinear'],
                          'updater': ['shotgun'],
                          'eta': [0.5], 'top_k': [10],
                          'tolerance': [1e-5], 'nthread': [2],
                          'alpha': [.005, .1], 'lambda': [.005],
                          'feature_selector': ['cyclic', 'shuffle']}
        # TODO: implement features required by regression_test_utilities.py
        """
        for param in parameter_combinations(variable_param):
            results = run_suite(param, 150, self.datasets, scale_features=True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)
        """
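# The commented-out suites above rely on a `parameter_combinations` helper
# from regression_test_utilities.py. A minimal sketch of what such a helper
# can look like (an assumption about its shape, not the utility's actual
# implementation): expand a dict of parameter lists into the cross product
# of concrete parameter dicts.
import itertools


def parameter_combinations(variable_param):
    # Expand e.g. {'eta': [0.5], 'alpha': [.005, .1]} into a list of
    # concrete parameter dicts, one per element of the cross product.
    names = sorted(variable_param)
    products = itertools.product(*(variable_param[name] for name in names))
    return [dict(zip(names, combo)) for combo in products]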
class TestGPULinear(unittest.TestCase):
    datasets = ["Boston", "Digits", "Cancer", "Sparse regression"]
    common_param = {
        'booster': ['gblinear'],
        'updater': ['gpu_coord_descent'],
        'eta': [0.5],
        'top_k': [10],
        'tolerance': [1e-5],
        'alpha': [.005, .1],
        'lambda': [0.005],
        'coordinate_selection': ['cyclic', 'random', 'greedy']}

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_gpu_coordinate(self):
        parameters = self.common_param.copy()
        parameters['n_gpus'] = [1]
        for param in test_linear.parameter_combinations(parameters):
            results = test_linear.run_suite(
                param, 150, self.datasets, scale_features=True)
            test_linear.assert_regression_result(results, 1e-2)
            test_linear.assert_classification_result(results)

    @pytest.mark.mgpu
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_gpu_coordinate_mgpu(self):
        parameters = self.common_param.copy()
        parameters['n_gpus'] = [-1]
        parameters['gpu_id'] = [1]
        for param in test_linear.parameter_combinations(parameters):
            results = test_linear.run_suite(
                param, 150, self.datasets, scale_features=True)
            test_linear.assert_regression_result(results, 1e-2)
            test_linear.assert_classification_result(results)
class TestBoostFromPrediction(unittest.TestCase):
    def run_boost_from_prediction(self, tree_method):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        model_0 = xgb.XGBClassifier(
            learning_rate=0.3, random_state=0, n_estimators=4,
            tree_method=tree_method)
        model_0.fit(X=X, y=y)
        margin = model_0.predict(X, output_margin=True)

        model_1 = xgb.XGBClassifier(
            learning_rate=0.3, random_state=0, n_estimators=4,
            tree_method=tree_method)
        model_1.fit(X=X, y=y, base_margin=margin)
        predictions_1 = model_1.predict(X, base_margin=margin)

        cls_2 = xgb.XGBClassifier(
            learning_rate=0.3, random_state=0, n_estimators=8,
            tree_method=tree_method)
        cls_2.fit(X=X, y=y)
        predictions_2 = cls_2.predict(X)
        assert np.all(predictions_1 == predictions_2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_boost_from_prediction_hist(self):
        self.run_boost_from_prediction('hist')

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_boost_from_prediction_approx(self):
        self.run_boost_from_prediction('approx')

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_boost_from_prediction_exact(self):
        self.run_boost_from_prediction('exact')
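# The same continuation property can also be exercised through the native
# API; a minimal sketch (an assumed equivalent, not part of the original
# suite): feed the first model's margin back via DMatrix.set_base_margin,
# so the second booster continues from the first model's predictions.
def boost_from_prediction_native(tree_method='hist'):
    from sklearn.datasets import load_breast_cancer
    X, y = load_breast_cancer(return_X_y=True)
    dtrain = xgb.DMatrix(X, label=y)
    params = {'objective': 'binary:logistic', 'tree_method': tree_method}
    bst_0 = xgb.train(params, dtrain, num_boost_round=4)
    margin = bst_0.predict(dtrain, output_margin=True)
    dtrain.set_base_margin(margin)  # continue boosting from the margin
    bst_1 = xgb.train(params, dtrain, num_boost_round=4)
    return bst_0, bst_1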
class TestMonotonicConstraints(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_exact(self):
        assert_constraint(1, 'exact')
        assert_constraint(-1, 'exact')

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_gpu_hist(self):
        assert_constraint(1, 'gpu_hist')
        assert_constraint(-1, 'gpu_hist')
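# `assert_constraint` is defined elsewhere in this test module. A plausible
# sketch (an assumption, not the module's actual helper): train on a single
# feature with a monotone constraint and check that predictions over sorted
# inputs are ordered accordingly.
def assert_constraint(constraint, tree_method):
    from sklearn.datasets import make_regression
    n = 1000
    X, y = make_regression(n, random_state=94, n_features=1, n_informative=1)
    dtrain = xgb.DMatrix(X, y)
    param = {
        'tree_method': tree_method,
        'monotone_constraints': '(' + str(constraint) + ')',
    }
    bst = xgb.train(param, dtrain, num_boost_round=32)
    dpredict = xgb.DMatrix(X[X[:, 0].argsort()])
    pred = bst.predict(dpredict)
    if constraint > 0:
        assert np.all(np.diff(pred) >= 0)
    else:
        assert np.all(np.diff(pred) <= 0)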
class TestUpdaters(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_histmaker(self):
        variable_param = {'updater': ['grow_histmaker'], 'max_depth': [2, 8]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_colmaker(self):
        variable_param = {'updater': ['grow_colmaker'], 'max_depth': [2, 8]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_fast_histmaker(self):
        variable_param = {
            'tree_method': ['hist'],
            'max_depth': [2, 8],
            'max_bin': [2, 256],
            'grow_policy': ['depthwise', 'lossguide'],
            'max_leaves': [64, 0],
            'silent': [1]
        }
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

        # hist must be same as exact on all-categorical data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {
            'max_depth': 2,
            'tree_method': 'hist',
            'eta': 1,
            'silent': 1,
            'objective': 'binary:logistic',
            'eval_metric': 'auc'
        }
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']
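# Sketch of the `assert_results_non_increasing` check used above. The
# return shape of `run_suite` is not shown in this snippet, so this is an
# assumption, not the utility's actual code: every tracked eval series
# should drop monotonically within the given tolerance.
def assert_results_non_increasing(results, tolerance=1e-4):
    for name, metrics in results.items():
        for metric, series in metrics.items():
            assert all(b <= a + tolerance
                       for a, b in zip(series, series[1:])), (name, metric)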
class TestGPULinear(unittest.TestCase):
    datasets = [
        "Boston", "Digits", "Cancer", "Sparse regression",
        "Boston External Memory"
    ]

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_gpu_coordinate(self):
        variable_param = {
            'booster': ['gblinear'],
            'updater': ['coord_descent'],
            'eta': [0.5],
            'top_k': [10],
            'tolerance': [1e-5],
            'nthread': [2],
            'alpha': [.005, .1],
            'lambda': [0.005],
            'coordinate_selection': ['cyclic', 'random', 'greedy'],
            'n_gpus': [-1]
        }
        for param in test_linear.parameter_combinations(variable_param):
            results = test_linear.run_suite(param, 200, self.datasets,
                                            scale_features=True)
            test_linear.assert_regression_result(results, 1e-2)
            test_linear.assert_classification_result(results)
class TestMonotoneConstraints:
    def test_monotone_constraints_for_exact_tree_method(self):
        # first check monotonicity for the 'exact' tree method
        params_for_constrained_exact_method = {
            'tree_method': 'exact', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_exact_method = xgb.train(
            params_for_constrained_exact_method, training_dset
        )
        assert is_correctly_constrained(constrained_exact_method)

    def test_monotone_constraints_for_depthwise_hist_tree_method(self):
        # next check monotonicity for the 'hist' tree method
        params_for_constrained_hist_method = {
            'tree_method': 'hist', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_hist_method = xgb.train(
            params_for_constrained_hist_method, training_dset
        )
        assert is_correctly_constrained(constrained_hist_method)

    def test_monotone_constraints_for_lossguide_hist_tree_method(self):
        # next check monotonicity for 'hist' with the lossguide grow policy
        params_for_constrained_hist_method = {
            'tree_method': 'hist', 'verbosity': 1,
            'grow_policy': 'lossguide',
            'monotone_constraints': '(1, -1)'
        }
        constrained_hist_method = xgb.train(
            params_for_constrained_hist_method, training_dset
        )
        assert is_correctly_constrained(constrained_hist_method)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_accuracy(self):
        from sklearn.metrics import accuracy_score
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
        params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
                  'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
        num_boost_round = 5

        params['grow_policy'] = 'lossguide'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

        params['grow_policy'] = 'depthwise'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1
class TestGPUBasicModels:
    cpu_test_cb = test_cb.TestCallbacks()
    cpu_test_bm = test_bm.TestModels()

    def run_cls(self, X, y):
        cls = xgb.XGBClassifier(tree_method='gpu_hist',
                                single_precision_histogram=True)
        cls.fit(X, y)
        cls.get_booster().save_model('test_deterministic_gpu_hist-0.json')

        cls = xgb.XGBClassifier(tree_method='gpu_hist',
                                single_precision_histogram=True)
        cls.fit(X, y)
        cls.get_booster().save_model('test_deterministic_gpu_hist-1.json')

        with open('test_deterministic_gpu_hist-0.json', 'r') as fd:
            model_0 = fd.read()
        with open('test_deterministic_gpu_hist-1.json', 'r') as fd:
            model_1 = fd.read()

        os.remove('test_deterministic_gpu_hist-0.json')
        os.remove('test_deterministic_gpu_hist-1.json')

        return hash(model_0), hash(model_1)

    def test_custom_objective(self):
        self.cpu_test_bm.run_custom_objective("gpu_hist")

    def test_eta_decay_gpu_hist(self):
        self.cpu_test_cb.run_eta_decay('gpu_hist')

    def test_deterministic_gpu_hist(self):
        kRows = 1000
        kCols = 64
        kClasses = 4
        # Create large values to force rounding.
        X = np.random.randn(kRows, kCols) * 1e4
        y = np.random.randint(0, kClasses, size=kRows)

        model_0, model_1 = self.run_cls(X, y)
        assert model_0 == model_1

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_invalid_gpu_id(self):
        from sklearn.datasets import load_digits
        X, y = load_digits(return_X_y=True)
        # should pass with invalid gpu id
        cls1 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999)
        cls1.fit(X, y)
        # should throw error with fail_on_invalid_gpu_id enabled
        cls2 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999,
                                 fail_on_invalid_gpu_id=True)
        try:
            cls2.fit(X, y)
            assert False, "Should have failed with fail_on_invalid_gpu_id enabled"
        except xgb.core.XGBoostError as err:
            assert "gpu_id 9999 is invalid" in str(err)
class TestMonotoneConstraints(unittest.TestCase):
    def test_monotone_constraints_for_exact_tree_method(self):
        # first check monotonicity for the 'exact' tree method
        params_for_constrained_exact_method = {
            'tree_method': 'exact', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_exact_method = xgb.train(
            params_for_constrained_exact_method, training_dset)
        assert is_correctly_constrained(constrained_exact_method)

    def test_monotone_constraints_for_depthwise_hist_tree_method(self):
        # next check monotonicity for the 'hist' tree method
        params_for_constrained_hist_method = {
            'tree_method': 'hist', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_hist_method = xgb.train(params_for_constrained_hist_method,
                                            training_dset)
        assert is_correctly_constrained(constrained_hist_method)

    def test_monotone_constraints_for_lossguide_hist_tree_method(self):
        # next check monotonicity for 'hist' with the lossguide grow policy
        params_for_constrained_hist_method = {
            'tree_method': 'hist', 'verbosity': 1,
            'grow_policy': 'lossguide',
            'monotone_constraints': '(1, -1)'
        }
        constrained_hist_method = xgb.train(params_for_constrained_hist_method,
                                            training_dset)
        assert is_correctly_constrained(constrained_hist_method)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_accuracy(self):
        from sklearn.metrics import accuracy_score
        # TODO: implement support for ?indexing_mode=1
        """
        dtrain = xgb.DMatrix({username: dpath + 'agaricus.txt.train.enc?indexing_mode=1'})
        dtest = xgb.DMatrix({username: dpath + 'agaricus.txt.test.enc?indexing_mode=1'})
        params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
                  'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
        num_boost_round = 5
        """
        # TODO(rishabh): implement get_label()
        """
        params['grow_policy'] = 'lossguide'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

        params['grow_policy'] = 'depthwise'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1
        """
class TestDaskCallbacks:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping(self, client):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(client, {'objective': 'binary:logistic',
                                          'eval_metric': 'error',
                                          'tree_method': 'hist'}, m,
                                 evals=[(m, 'Train')],
                                 num_boost_round=1000,
                                 early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        assert booster.best_iteration == 10
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_custom_eval(self, client):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(
            client, {'objective': 'binary:logistic',
                     'eval_metric': 'error',
                     'tree_method': 'hist'}, m,
            evals=[(m, 'Train')],
            feval=tm.eval_error_metric,
            num_boost_round=1000,
            early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
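# `tm.eval_error_metric` comes from the shared testing module. A minimal
# sketch of a compatible custom metric (an assumption about its shape, not
# the module's actual code): an `feval` receives raw predictions plus the
# DMatrix and returns a (name, value) pair.
def eval_error_metric(predt, dtrain):
    label = dtrain.get_label()
    r = np.zeros(predt.shape)
    gt = predt > 0.5
    r[gt] = 1 - label[gt]          # predicted positive, count the negatives
    le = predt <= 0.5
    r[le] = label[le]              # predicted negative, count the positives
    return 'CustomErr', np.sum(r)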
class TestTrainingContinuation(unittest.TestCase):
    num_parallel_tree = 3

    xgb_params_01 = {
        'silent': 1,
        'nthread': 1,
    }

    xgb_params_02 = {
        'silent': 1,
        'nthread': 1,
        'num_parallel_tree': num_parallel_tree
    }

    xgb_params_03 = {
        'silent': 1,
        'nthread': 1,
        'num_class': 5,
        'num_parallel_tree': num_parallel_tree
    }

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_continuation(self):
        from sklearn.datasets import load_digits
        from sklearn.metrics import mean_squared_error

        digits_2class = load_digits(n_class=2)
        digits_5class = load_digits(n_class=5)

        X_2class = digits_2class['data']
        y_2class = digits_2class['target']

        X_5class = digits_5class['data']
        y_5class = digits_5class['target']

        dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
        dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)

        gbdt_01 = xgb.train(self.xgb_params_01, dtrain_2class,
                            num_boost_round=10)
        ntrees_01 = len(gbdt_01.get_dump())
        assert ntrees_01 == 10

        gbdt_02 = xgb.train(self.xgb_params_01, dtrain_2class,
                            num_boost_round=0)
        gbdt_02.save_model('xgb_tc.model')

        gbdt_02a = xgb.train(self.xgb_params_01, dtrain_2class,
                             num_boost_round=10, xgb_model=gbdt_02)
        gbdt_02b = xgb.train(self.xgb_params_01, dtrain_2class,
                             num_boost_round=10, xgb_model="xgb_tc.model")
        ntrees_02a = len(gbdt_02a.get_dump())
        ntrees_02b = len(gbdt_02b.get_dump())
        assert ntrees_02a == 10
        assert ntrees_02b == 10

        res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
        assert res1 == res2

        res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
        assert res1 == res2

        gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class,
                            num_boost_round=3)
        gbdt_03.save_model('xgb_tc.model')

        gbdt_03a = xgb.train(self.xgb_params_01, dtrain_2class,
                             num_boost_round=7, xgb_model=gbdt_03)
        gbdt_03b = xgb.train(self.xgb_params_01, dtrain_2class,
                             num_boost_round=7, xgb_model="xgb_tc.model")
        ntrees_03a = len(gbdt_03a.get_dump())
        ntrees_03b = len(gbdt_03b.get_dump())
        assert ntrees_03a == 10
        assert ntrees_03b == 10

        res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
        assert res1 == res2

        gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class,
                            num_boost_round=3)
        assert gbdt_04.best_ntree_limit == (
            gbdt_04.best_iteration + 1) * self.num_parallel_tree
        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
        res2 = mean_squared_error(
            y_2class,
            gbdt_04.predict(dtrain_2class,
                            ntree_limit=gbdt_04.best_ntree_limit))
        assert res1 == res2

        gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class,
                            num_boost_round=7, xgb_model=gbdt_04)
        assert gbdt_04.best_ntree_limit == (
            gbdt_04.best_iteration + 1) * self.num_parallel_tree
        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
        res2 = mean_squared_error(
            y_2class,
            gbdt_04.predict(dtrain_2class,
                            ntree_limit=gbdt_04.best_ntree_limit))
        assert res1 == res2

        gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class,
                            num_boost_round=7)
        assert gbdt_05.best_ntree_limit == (
            gbdt_05.best_iteration + 1) * self.num_parallel_tree
        gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class,
                            num_boost_round=3, xgb_model=gbdt_05)
        assert gbdt_05.best_ntree_limit == (
            gbdt_05.best_iteration + 1) * self.num_parallel_tree

        res1 = gbdt_05.predict(dtrain_5class)
        res2 = gbdt_05.predict(dtrain_5class,
                               ntree_limit=gbdt_05.best_ntree_limit)
        np.testing.assert_almost_equal(res1, res2)
class TestEarlyStopping:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_nonparallel(self):
        from sklearn.datasets import load_digits
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split

        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=0)
        clf1 = xgb.XGBClassifier(learning_rate=0.1)
        clf1.fit(X_train, y_train, early_stopping_rounds=5,
                 eval_metric="auc", eval_set=[(X_test, y_test)])
        clf2 = xgb.XGBClassifier(learning_rate=0.1)
        clf2.fit(X_train, y_train, early_stopping_rounds=4,
                 eval_metric="auc", eval_set=[(X_test, y_test)])
        # should be the same
        assert clf1.best_score == clf2.best_score
        assert clf1.best_score != 1
        # check overfit
        clf3 = xgb.XGBClassifier(learning_rate=0.1)
        clf3.fit(X_train, y_train, early_stopping_rounds=10,
                 eval_metric="auc", eval_set=[(X_test, y_test)])
        assert clf3.best_score == 1

    def evalerror(self, preds, dtrain):
        from sklearn.metrics import mean_squared_error
        labels = dtrain.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))
        return 'rmse', mean_squared_error(labels, preds)

    @staticmethod
    def assert_metrics_length(cv, expected_length):
        for key, value in cv.items():
            assert len(value) == expected_length

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_cv_early_stopping(self):
        from sklearn.datasets import load_digits

        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
        dm = xgb.DMatrix(X, label=y)
        params = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'error'
        }

        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=10)
        self.assert_metrics_length(cv, 10)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=5)
        self.assert_metrics_length(cv, 3)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 1)

        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, early_stopping_rounds=10)
        self.assert_metrics_length(cv, 10)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, early_stopping_rounds=1)
        self.assert_metrics_length(cv, 5)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, maximize=True,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 1)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        dm = xgb.DMatrix(X, label=y)
        params = {'objective': 'binary:logistic'}

        metrics = [['auc'], ['error'], ['logloss'],
                   ['logloss', 'auc'], ['logloss', 'error'],
                   ['error', 'logloss']]

        num_iteration_history = []

        # If more than one metric is given, early stopping should use the
        # last one in the list.
        for i, m in enumerate(metrics):
            result = xgb.cv(params, dm, num_boost_round=1000, nfold=5,
                            stratified=True, metrics=m,
                            early_stopping_rounds=20, seed=42)
            num_iteration_history.append(len(result))
            df = result['test-{}-mean'.format(m[-1])]
            # When early stopping is invoked, the last metric should be as
            # good as it can be.
            if m[-1] == 'auc':
                assert np.all(df <= df.iloc[-1])
            else:
                assert np.all(df >= df.iloc[-1])
        assert num_iteration_history[:3] == num_iteration_history[3:]
class TestGPUPredict:
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact,
        # but for `hist` and `approx` the floating point error accumulates
        # faster and fails even when tol is set to 1e-4. For `hist`, the
        # mismatching rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations,
                                    evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        import cupy as cp
        dtrain.set_info(base_margin=base_margin)
        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        cp.testing.assert_allclose(from_inplace, from_dmatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        missing = 11            # set to integer for testing
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        missing_idx = [i for i in range(0, cols, 4)]
        X[:, missing_idx] = missing  # set to be missing
        y = cp.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...], missing=missing)
        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   missing=missing)
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows')
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        base_margin = cp_rng.randn(rows)
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

        # Create a wide dataset
        X = cp_rng.randn(100, 10000)
        y = cp_rng.randn(100)

        missing_idx = [i for i in range(0, X.shape[1], 16)]
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=8,
                               missing=missing)
        reg.fit(X, y)

        gpu_predt = reg.predict(X)
        reg.set_params(predictor="cpu_predictor")
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # column major array
            inplace_predt = booster.inplace_predict(x.values)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            assert cp.all(copied_predt == inplace_predt)

            inplace_predt = booster.inplace_predict(x)
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

        base_margin = cudf.Series(rng.randn(rows))
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, print_blob=True)
    def test_shap(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_shap_interactions(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_shap_categorical(self):
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"}, Xy,
                            num_boost_round=10)

        booster.set_param({"predictor": "gpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3)

        booster.set_param({"predictor": "cpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3)

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ], index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None, print_blob=True)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor',
            'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5,
                        evals=[(dtrain, 'train')], verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(
            rmse, eval_history['train']['rmse'][-1], decimal=5)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        from sklearn.datasets import make_classification
        import cupy as cp
        n_samples = 1000
        X_, y_ = make_classification(n_samples=n_samples, n_informative=5,
                                     n_classes=n_classes)
        X, y = cp.array(X_), cp.array(y_)

        Xy = xgb.DMatrix(X, y)
        if n_classes == 2:
            params = {
                "tree_method": "gpu_hist", "booster": "dart",
                "rate_drop": 0.5, "objective": "binary:logistic"
            }
        else:
            params = {
                "tree_method": "gpu_hist", "booster": "dart",
                "rate_drop": 0.5, "objective": "multi:softprob",
                "num_class": n_classes
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        import cupy as cp
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127,
                           size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)

        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
            cp.signedinteger,
            cp.byte,
            cp.short,
            cp.intc,
            cp.int_,
            cp.longlong,
            cp.unsignedinteger,
            cp.ubyte,
            cp.ushort,
            cp.uintc,
            cp.uint,
            cp.ulonglong,
            cp.floating,
            cp.half,
            cp.single,
            cp.double,
        ]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        for dtype in [cp.bool8, cp.bool_]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
            cp.complex64,
            cp.complex128,
        ]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
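# `run_threaded_predict` is a helper from the shared testing module; the
# sketch below is an assumption about its behavior, not the actual code.
# It slices the matrix into row batches and runs the predict function from
# several threads concurrently, to surface race conditions in prediction.
from concurrent.futures import ThreadPoolExecutor


def run_threaded_predict(X, rows, predict_func):
    results = []
    per_thread = 20
    with ThreadPoolExecutor(max_workers=10) as e:
        for i in range(0, rows, int(rows / per_thread)):
            batch = X[i:i + int(rows / per_thread), ...]
            f = e.submit(predict_func, batch)
            results.append(f)
    for f in results:
        assert f.result()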
class TestGPUPredict(unittest.TestCase):
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact,
        # but for `hist` and `approx` the floating point error accumulates
        # faster and fails even when tol is set to 1e-4. For `hist`, the
        # mismatching rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations,
                                    evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"
        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {'tree_method': 'gpu_hist',
                  'predictor': 'cpu_predictor',
                  'n_jobs': -1,
                  'seed': 123}
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)
class TestWithDask:
    def test_global_config(self, client: "Client") -> None:
        X, y, _ = generate_array()
        xgb.config.set_config(verbosity=0)
        dtrain = DaskDMatrix(client, X, y)
        before_fname = './before_training-test_global_config'
        after_fname = './after_training-test_global_config'

        class TestCallback(xgb.callback.TrainingCallback):
            def write_file(self, fname: str) -> None:
                with open(fname, 'w') as fd:
                    fd.write(str(xgb.config.get_config()['verbosity']))

            def before_training(self, model: xgb.Booster) -> xgb.Booster:
                self.write_file(before_fname)
                assert xgb.config.get_config()['verbosity'] == 0
                return model

            def after_training(self, model: xgb.Booster) -> xgb.Booster:
                assert xgb.config.get_config()['verbosity'] == 0
                return model

            def before_iteration(self, model: xgb.Booster, epoch: int,
                                 evals_log: Dict) -> bool:
                assert xgb.config.get_config()['verbosity'] == 0
                return False

            def after_iteration(self, model: xgb.Booster, epoch: int,
                                evals_log: Dict) -> bool:
                self.write_file(after_fname)
                assert xgb.config.get_config()['verbosity'] == 0
                return False

        xgb.dask.train(client, {}, dtrain, num_boost_round=4,
                       callbacks=[TestCallback()])['booster']

        with open(before_fname, 'r') as before, open(after_fname, 'r') as after:
            assert before.read() == '0'
            assert after.read() == '0'

        os.remove(before_fname)
        os.remove(after_fname)

    def run_updater_test(self, client: "Client", params: Dict,
                         num_rounds: int, dataset: tm.TestDataset,
                         tree_method: str) -> None:
        params['tree_method'] = tree_method
        params = dataset.set_params(params)
        # It doesn't make sense to distribute a completely empty dataset.
        if dataset.X.shape[0] == 0:
            return

        chunk = 128
        X = da.from_array(dataset.X, chunks=(chunk, dataset.X.shape[1]))
        y = da.from_array(dataset.y, chunks=(chunk,))
        if dataset.w is not None:
            w = da.from_array(dataset.w, chunks=(chunk,))
        else:
            w = None

        m = xgb.dask.DaskDMatrix(client, data=X, label=y, weight=w)
        history = xgb.dask.train(client, params=params, dtrain=m,
                                 num_boost_round=num_rounds,
                                 evals=[(m, 'train')])['history']
        note(history)
        history = history['train'][dataset.metric]
        assert tm.non_increasing(history)
        # Make sure that it's decreasing
        assert history[-1] < history[0]

    @given(params=hist_parameter_strategy, dataset=tm.dataset_strategy)
    @settings(deadline=None, suppress_health_check=suppress)
    def test_hist(self, params: Dict, dataset: tm.TestDataset,
                  client: "Client") -> None:
        num_rounds = 30
        self.run_updater_test(client, params, num_rounds, dataset, 'hist')

    @given(params=exact_parameter_strategy, dataset=tm.dataset_strategy)
    @settings(deadline=None, suppress_health_check=suppress)
    def test_approx(self, client: "Client", params: Dict,
                    dataset: tm.TestDataset) -> None:
        num_rounds = 30
        self.run_updater_test(client, params, num_rounds, dataset, 'approx')

    def run_quantile(self, name: str) -> None:
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")

        exe: Optional[str] = None
        for possible_path in {'./testxgboost', './build/testxgboost',
                              '../build/testxgboost',
                              '../cpu-build/testxgboost'}:
            if os.path.exists(possible_path):
                exe = possible_path
        if exe is None:
            return

        test = "--gtest_filter=Quantile." + name

        def runit(worker_addr: str,
                  rabit_args: List[bytes]) -> subprocess.CompletedProcess:
            port_env = ''
            # set up the environment for running the c++ part.
            for arg in rabit_args:
                if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
                    port_env = arg.decode('utf-8')
            port = port_env.split('=')
            env = os.environ.copy()
            env[port[0]] = port[1]
            return subprocess.run([str(exe), test], env=env,
                                  capture_output=True)

        with LocalCluster(n_workers=4) as cluster:
            with Client(cluster) as client:
                workers = list(_get_client_workers(client).keys())
                rabit_args = client.sync(xgb.dask._get_rabit_args,
                                         len(workers), client)
                futures = client.map(runit, workers, pure=False,
                                     workers=workers, rabit_args=rabit_args)
                results = client.gather(futures)
                for ret in results:
                    msg = ret.stdout.decode('utf-8')
                    assert msg.find('1 test from Quantile') != -1, msg
                    assert ret.returncode == 0, msg

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.gtest
    def test_quantile_basic(self) -> None:
        self.run_quantile('DistributedBasic')

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.gtest
    def test_quantile(self) -> None:
        self.run_quantile('Distributed')

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.gtest
    def test_quantile_same_on_all_workers(self) -> None:
        self.run_quantile('SameOnAllWorkers')

    def test_n_workers(self) -> None:
        with LocalCluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                workers = list(_get_client_workers(client).keys())
                from sklearn.datasets import load_breast_cancer
                X, y = load_breast_cancer(return_X_y=True)
                dX = client.submit(da.from_array, X,
                                   workers=[workers[0]]).result()
                dy = client.submit(da.from_array, y,
                                   workers=[workers[0]]).result()
                train = xgb.dask.DaskDMatrix(client, dX, dy)

                dX = dd.from_array(X)
                dX = client.persist(dX, workers={dX: workers[1]})
                dy = dd.from_array(y)
                dy = client.persist(dy, workers={dy: workers[1]})
                valid = xgb.dask.DaskDMatrix(client, dX, dy)

                merged = xgb.dask._get_workers_from_data(
                    train, evals=[(valid, 'Valid')])
                assert len(merged) == 2

    @pytest.mark.skipif(**tm.no_dask())
    def test_feature_weights(self, client: "Client") -> None:
        kRows = 1024
        kCols = 64

        X = da.random.random((kRows, kCols), chunks=(32, -1))
        y = da.random.random(kRows, chunks=32)

        fw = np.ones(shape=(kCols,))
        for i in range(kCols):
            fw[i] *= float(i)
        fw = da.from_array(fw)
        poly_increasing = run_feature_weights(
            X, y, fw, model=xgb.dask.DaskXGBRegressor)

        fw = np.ones(shape=(kCols,))
        for i in range(kCols):
            fw[i] *= float(kCols - i)
        fw = da.from_array(fw)
        poly_decreasing = run_feature_weights(
            X, y, fw, model=xgb.dask.DaskXGBRegressor)

        # Approximate test; this depends on the random number generator
        # implementation in the standard library.
        assert poly_increasing[0] > 0.08
        assert poly_decreasing[0] < -0.08

    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_custom_objective(self, client: "Client") -> None:
        from sklearn.datasets import load_boston
        X, y = load_boston(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        rounds = 20

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'log')

            def sqr(labels: np.ndarray,
                    predts: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
                with open(path, 'a') as fd:
                    print('Running sqr', file=fd)
                grad = predts - labels
                hess = np.ones(shape=labels.shape[0])
                return grad, hess

            reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds,
                                            objective=sqr,
                                            tree_method='hist')
            reg.fit(X, y, eval_set=[(X, y)])

            # Check the custom objective is run once per boosting round.
            with open(path, 'r') as fd:
                out = fd.readlines()
                assert len(out) == rounds

            results_custom = reg.evals_result()

            reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds,
                                            tree_method='hist')
            reg.fit(X, y, eval_set=[(X, y)])
            results_native = reg.evals_result()

            np.testing.assert_allclose(results_custom['validation_0']['rmse'],
                                       results_native['validation_0']['rmse'])
            tm.non_increasing(results_native['validation_0']['rmse'])

    def test_data_initialization(self) -> None:
        '''Assert each worker has the correct amount of data, and DMatrix
        initialization doesn't generate unnecessary copies of data.

        '''
        with LocalCluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                X, y, _ = generate_array()
                n_partitions = X.npartitions
                m = xgb.dask.DaskDMatrix(client, X, y)
                workers = list(_get_client_workers(client).keys())
                rabit_args = client.sync(xgb.dask._get_rabit_args,
                                         len(workers), client)
                n_workers = len(workers)

                def worker_fn(worker_addr: str, data_ref: Dict) -> None:
                    with xgb.dask.RabitContext(rabit_args):
                        local_dtrain = xgb.dask._dmatrix_from_list_of_parts(
                            **data_ref)
                        total = np.array([local_dtrain.num_row()])
                        total = xgb.rabit.allreduce(total, xgb.rabit.Op.SUM)
                        assert total[0] == kRows

                futures = []
                for i in range(len(workers)):
                    futures.append(
                        client.submit(worker_fn, workers[i],
                                      m.create_fn_args(workers[i]),
                                      pure=False, workers=[workers[i]]))
                client.gather(futures)

                has_what = client.has_what()
                cnt = 0
                data = set()
                for k, v in has_what.items():
                    for d in v:
                        cnt += 1
                        data.add(d)

                assert len(data) == cnt
                # Subtract the on disk resource from each worker
                assert cnt - n_workers == n_partitions

    def run_shap(self, X: Any, y: Any, params: Dict[str, Any],
                 client: "Client") -> None:
        X, y = da.from_array(X), da.from_array(y)
        Xy = xgb.dask.DaskDMatrix(client, X, y)
        booster = xgb.dask.train(client, params, Xy,
                                 num_boost_round=10)['booster']

        test_Xy = xgb.dask.DaskDMatrix(client, X, y)
        shap = xgb.dask.predict(client, booster, test_Xy,
                                pred_contribs=True).compute()
        margin = xgb.dask.predict(client, booster, test_Xy,
                                  output_margin=True).compute()
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-5, 1e-5)

    def run_shap_cls_sklearn(self, X: Any, y: Any, client: "Client") -> None:
        X, y = da.from_array(X), da.from_array(y)
        cls = xgb.dask.DaskXGBClassifier()
        cls.client = client
        cls.fit(X, y)
        booster = cls.get_booster()

        test_Xy = xgb.dask.DaskDMatrix(client, X, y)
        shap = xgb.dask.predict(client, booster, test_Xy,
                                pred_contribs=True).compute()
        margin = xgb.dask.predict(client, booster, test_Xy,
                                  output_margin=True).compute()
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-5, 1e-5)

    def test_shap(self, client: "Client") -> None:
        from sklearn.datasets import load_boston, load_digits
        X, y = load_boston(return_X_y=True)
        params: Dict[str, Any] = {'objective': 'reg:squarederror'}
        self.run_shap(X, y, params, client)

        X, y = load_digits(return_X_y=True)
        params = {'objective': 'multi:softmax', 'num_class': 10}
        self.run_shap(X, y, params, client)
        params = {'objective': 'multi:softprob', 'num_class': 10}
        self.run_shap(X, y, params, client)

        self.run_shap_cls_sklearn(X, y, client)

    def run_shap_interactions(self, X: Any, y: Any, params: Dict[str, Any],
                              client: "Client") -> None:
        X, y = da.from_array(X), da.from_array(y)
        Xy = xgb.dask.DaskDMatrix(client, X, y)
        booster = xgb.dask.train(client, params, Xy,
                                 num_boost_round=10)['booster']

        test_Xy = xgb.dask.DaskDMatrix(client, X, y)
        shap = xgb.dask.predict(client, booster, test_Xy,
                                pred_interactions=True).compute()
        margin = xgb.dask.predict(client, booster, test_Xy,
                                  output_margin=True).compute()
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-5, 1e-5)

    def test_shap_interactions(self, client: "Client") -> None:
        from sklearn.datasets import load_boston
        X, y = load_boston(return_X_y=True)
        params = {'objective': 'reg:squarederror'}
        self.run_shap_interactions(X, y, params, client)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn_io(self, client: 'Client') -> None:
        from sklearn.datasets import load_digits
        X_, y_ = load_digits(return_X_y=True)
        X, y = da.from_array(X_), da.from_array(y_)
        cls = xgb.dask.DaskXGBClassifier(n_estimators=10)
        cls.client = client
        cls.fit(X, y)
        predt_0 = cls.predict(X)

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'cls.json')
            cls.save_model(path)

            cls = xgb.dask.DaskXGBClassifier()
            cls.load_model(path)
            assert cls.n_classes_ == 10
            predt_1 = cls.predict(X)

            np.testing.assert_allclose(predt_0.compute(), predt_1.compute())

            # Use single node to load
            cls = xgb.XGBClassifier()
            cls.load_model(path)
            assert cls.n_classes_ == 10
            predt_2 = cls.predict(X_)

            np.testing.assert_allclose(predt_0.compute(), predt_2)
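# `generate_array` is a module-level helper used throughout these dask
# tests. A minimal sketch under the assumption that it returns dask arrays
# for data, labels, and optional weights (the real helper may differ):
def generate_array(with_weights=False):
    kRows = 1000
    kCols = 10
    chunk = 20
    X = da.random.random((kRows, kCols), chunks=(chunk, kCols))
    y = da.random.random(kRows, chunks=chunk)
    if with_weights:
        w = da.random.random(kRows, chunks=chunk)
        return X, y, w
    return X, y, None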
class TestTreeMethod:
    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_exact(self, param, num_rounds, dataset):
        param['tree_method'] = 'exact'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None)
    def test_approx(self, param, hist_param, num_rounds, dataset):
        param["tree_method"] = "approx"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        import sklearn
        params = {'tree_method': 'exact'}
        cancer = sklearn.datasets.load_breast_cancer()
        X = cancer['data']
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        params = {'updater': 'prune', 'process_type': 'update',
                  'gamma': '0.2'}
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(exact_parameter_strategy, hist_parameter_strategy,
           strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        param['tree_method'] = 'hist'
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    def test_hist_categorical(self):
        # hist must be same as exact on all-categorical data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {
            'max_depth': 2,
            'tree_method': 'hist',
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'auc'
        }
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        # Test a degenerate case where the quantile sketcher won't return
        # any quantile points for a particular feature (the second feature
        # in this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {'missing': nan, 'tree_method': 'hist'}
        model = xgb.XGBRegressor(**param)
        X = np.array([[6.18827160e+05, 1.73000000e+02],
                      [6.37345679e+05, nan],
                      [6.38888889e+05, nan],
                      [6.28086420e+05, nan]])
        y = [1000000., 0., 0., 500000.]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)

    def run_invalid_category(self, tree_method: str) -> None:
        rng = np.random.default_rng()
        # too large
        X = rng.integers(low=0, high=4, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)
        X[13, 7] = np.iinfo(np.int32).max + 1

        # Check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        X[13, 7] = 16777216
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # mixed positive and negative values
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        if tree_method == "gpu_hist":
            import cupy as cp
            X, y = cp.array(X), cp.array(y)
            with pytest.raises(ValueError):
                Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    def test_invalid_category(self) -> None:
        self.run_invalid_category("approx")

    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        # Use one-hot exclusively
        parameters = {
            "tree_method": tree_method,
            "predictor": predictor,
            "max_cat_to_onehot": 9999
        }

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance based on
        # considering output as random variables. But in here the tree
        # construction is extremely sensitive to floating point errors. A
        # 1e-5 error in a histogram bin can lead to an entirely different
        # tree. So even though the test is quite lenient, hypothesis can
        # still pick up falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
        parameters["max_cat_to_onehot"] = 1
        parameters["reg_lambda"] = 0
        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        rmse_oh = by_builtin_results["Train"]["rmse"]
        rmse_group = by_grouping["Train"]["rmse"]
        # always better than or equal to one-hot when there's no
        # regularization.
        for a, b in zip(rmse_oh, rmse_group):
            assert a >= b

        parameters["reg_lambda"] = 1.0
        by_grouping = {}
        xgb.train(
            parameters,
            m,
            num_boost_round=32,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        self.run_categorical_basic(rows, cols, rounds, cats, "approx")
        self.run_categorical_basic(rows, cols, rounds, cats, "hist")
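# `train_result` is a small module-level helper used by the property-based
# tests above. A sketch consistent with how it is called (assumed, not
# necessarily the file's actual definition): train on the dataset's DMatrix
# and hand back the evals_result history.
def train_result(param, dmat, num_rounds):
    result = {}
    xgb.train(param, dmat, num_rounds, evals=[(dmat, 'train')],
              verbose_eval=False, evals_result=result)
    return result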
import numpy as np
import xgboost as xgb
import testing as tm
import tempfile
import os
import shutil
import pytest

rng = np.random.RandomState(1994)

pytestmark = pytest.mark.skipif(**tm.no_sklearn())


class TemporaryDirectory(object):
    """Context manager for tempfile.mkdtemp()"""
    def __enter__(self):
        self.name = tempfile.mkdtemp()
        return self.name

    def __exit__(self, exc_type, exc_value, traceback):
        shutil.rmtree(self.name)


def test_binary_classification():
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1
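# The TemporaryDirectory helper above mirrors the standard library's
# tempfile.TemporaryDirectory (available since Python 3.2); a usage sketch
# with a hypothetical file name, just to illustrate the lifecycle:
def _demo_temporary_directory():
    with TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'model.json')  # hypothetical artifact
        with open(path, 'w') as fd:
            fd.write('{}')
        assert os.path.exists(path)
    # the directory and its contents are removed on exit
    assert not os.path.exists(tmpdir)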
class TestDaskCallbacks:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping(self, client: "Client") -> None:
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)

        valid = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(
            client, {'objective': 'binary:logistic',
                     'eval_metric': 'error',
                     'tree_method': 'hist'}, m,
            evals=[(valid, 'Valid')],
            num_boost_round=1000,
            early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

        valid_X, valid_y = load_breast_cancer(return_X_y=True)
        valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
        cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='hist',
                                         n_estimators=1000)
        cls.client = client
        cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
                eval_set=[(valid_X, valid_y)])
        booster = cls.get_booster()
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

        # Specify the metric
        cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='hist',
                                         n_estimators=1000)
        cls.client = client
        cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
                eval_set=[(valid_X, valid_y)], eval_metric='error')
        assert tm.non_increasing(cls.evals_result()['validation_0']['error'])
        booster = cls.get_booster()
        dump = booster.get_dump(dump_format='json')
        assert len(cls.evals_result()['validation_0']['error']) < 20
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_custom_eval(self, client: "Client") -> None:
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)

        valid = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(
            client, {'objective': 'binary:logistic',
                     'eval_metric': 'error',
                     'tree_method': 'hist'}, m,
            evals=[(m, 'Train'), (valid, 'Valid')],
            feval=tm.eval_error_metric,
            num_boost_round=1000,
            early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

        valid_X, valid_y = load_breast_cancer(return_X_y=True)
        valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
        cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='hist',
                                         n_estimators=1000)
        cls.client = client
        cls.fit(X, y, early_stopping_rounds=early_stopping_rounds,
                eval_set=[(valid_X, valid_y)],
                eval_metric=tm.eval_error_metric)
        booster = cls.get_booster()
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_callback(self, client: "Client") -> None:
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic',
                                         tree_method='hist',
                                         n_estimators=10)
        cls.client = client
        with tempfile.TemporaryDirectory() as tmpdir:
            cls.fit(X, y, callbacks=[xgb.callback.TrainingCheckPoint(
                directory=Path(tmpdir), iterations=1, name='model')])
            for i in range(1, 10):
                assert os.path.exists(
                    os.path.join(tmpdir, 'model_' + str(i) + '.json'))
class TestDaskCallbacks:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping(self, client):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(
            client, {'objective': 'binary:logistic',
                     'eval_metric': 'error',
                     'tree_method': 'hist'},
            m,
            evals=[(m, 'Train')],
            num_boost_round=1000,
            early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        assert booster.best_iteration == 10
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_custom_eval(self, client):
        from sklearn.datasets import load_breast_cancer
        X, y = load_breast_cancer(return_X_y=True)
        X, y = da.from_array(X), da.from_array(y)
        m = xgb.dask.DaskDMatrix(client, X, y)
        early_stopping_rounds = 5
        booster = xgb.dask.train(
            client, {'objective': 'binary:logistic',
                     'eval_metric': 'error',
                     'tree_method': 'hist'},
            m,
            evals=[(m, 'Train')],
            feval=tm.eval_error_metric,
            num_boost_round=1000,
            early_stopping_rounds=early_stopping_rounds)['booster']
        assert hasattr(booster, 'best_score')
        dump = booster.get_dump(dump_format='json')
        assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

    def test_data_initialization(self):
        '''Assert each worker has the correct amount of data, and DMatrix
        initialization doesn't generate unnecessary copies of data.
        '''
        with LocalCluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                X, y = generate_array()
                n_partitions = X.npartitions
                m = xgb.dask.DaskDMatrix(client, X, y)
                workers = list(xgb.dask._get_client_workers(client).keys())
                rabit_args = client.sync(xgb.dask._get_rabit_args, workers,
                                         client)
                n_workers = len(workers)

                def worker_fn(worker_addr, data_ref):
                    with xgb.dask.RabitContext(rabit_args):
                        local_dtrain = xgb.dask._dmatrix_from_worker_map(
                            **data_ref)
                        total = np.array([local_dtrain.num_row()])
                        total = xgb.rabit.allreduce(total, xgb.rabit.Op.SUM)
                        assert total[0] == kRows

                futures = client.map(worker_fn, workers,
                                     [m.create_fn_args()] * len(workers),
                                     pure=False, workers=workers)
                client.gather(futures)

                has_what = client.has_what()
                cnt = 0
                data = set()
                for k, v in has_what.items():
                    for d in v:
                        cnt += 1
                        data.add(d)

                assert len(data) == cnt
                # Subtract the on-disk resource from each worker.
                assert cnt - n_workers == n_partitions
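# ``generate_array`` and ``kRows`` above come from the surrounding Dask test
# module and are not shown here. A plausible minimal sketch that matches how
# they are used (hypothetical, not the project's exact helper):
#
#     kRows = 1000
#     kCols = 10
#
#     def generate_array():
#         partition_size = 20
#         X = da.random.random((kRows, kCols), partition_size)
#         y = da.random.random(kRows, partition_size)
#         return X, y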
class TestMonotoneConstraints:
    def test_monotone_constraints_for_exact_tree_method(self):
        # first check monotonicity for the 'exact' tree method
        params_for_constrained_exact_method = {
            'tree_method': 'exact', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_exact_method = xgb.train(
            params_for_constrained_exact_method, training_dset)
        assert is_correctly_constrained(constrained_exact_method)

    def test_monotone_constraints_for_depthwise_hist_tree_method(self):
        # next check monotonicity for the 'hist' tree method (depthwise)
        params_for_constrained_hist_method = {
            'tree_method': 'hist', 'verbosity': 1,
            'monotone_constraints': '(1, -1)'
        }
        constrained_hist_method = xgb.train(
            params_for_constrained_hist_method, training_dset)
        assert is_correctly_constrained(constrained_hist_method)

    def test_monotone_constraints_for_lossguide_hist_tree_method(self):
        # then check monotonicity for the 'hist' tree method with the
        # lossguide grow policy
        params_for_constrained_hist_method = {
            'tree_method': 'hist', 'verbosity': 1,
            'grow_policy': 'lossguide',
            'monotone_constraints': '(1, -1)'
        }
        constrained_hist_method = xgb.train(
            params_for_constrained_hist_method, training_dset)
        assert is_correctly_constrained(constrained_hist_method)

    @pytest.mark.parametrize('format', [dict, list])
    def test_monotone_constraints_feature_names(self, format):
        # next check monotonicity when initializing monotone_constraints
        # by feature names
        params = {
            'tree_method': 'hist', 'verbosity': 1,
            'grow_policy': 'lossguide',
            'monotone_constraints': {'feature_0': 1, 'feature_1': -1}
        }
        if format == list:
            params = list(params.items())

        # Without feature names on the DMatrix, named constraints must fail.
        with pytest.raises(ValueError):
            xgb.train(params, training_dset)

        # A mismatched feature name must also fail.
        feature_names = ['feature_0', 'feature_2']
        training_dset_w_feature_names = xgb.DMatrix(
            x, label=y, feature_names=feature_names)
        with pytest.raises(ValueError):
            xgb.train(params, training_dset_w_feature_names)

        feature_names = ['feature_0', 'feature_1']
        training_dset_w_feature_names = xgb.DMatrix(
            x, label=y, feature_names=feature_names)
        constrained_learner = xgb.train(params,
                                        training_dset_w_feature_names)
        assert is_correctly_constrained(constrained_learner, feature_names)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_accuracy(self):
        from sklearn.metrics import accuracy_score
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
        params = {
            'eta': 1, 'max_depth': 6,
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'monotone_constraints': '(1, 0)'
        }
        num_boost_round = 5

        params['grow_policy'] = 'lossguide'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

        params['grow_policy'] = 'depthwise'
        bst = xgb.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1
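# The helpers ``x``, ``y``, ``training_dset``, ``dpath`` and
# ``is_correctly_constrained`` used above live elsewhere in the test module.
# A minimal sketch of what the constraint check could look like
# (hypothetical, for illustration only): sweep one feature over a grid while
# holding the other fixed, and require predictions to be non-decreasing in
# feature 0 and non-increasing in feature 1.
def is_correctly_constrained_sketch(learner, feature_names=None):
    n = 100
    variable_x = np.linspace(0, 1, n).reshape((n, 1))
    fixed_xs_values = np.linspace(0, 1, n)

    for i in range(n):
        fixed_x = fixed_xs_values[i] * np.ones((n, 1))
        # Feature 0 varies, feature 1 is fixed: predictions must not decrease.
        increasing_dset = xgb.DMatrix(
            np.column_stack((variable_x, fixed_x)),
            feature_names=feature_names)
        increasing_y = learner.predict(increasing_dset)

        # Feature 1 varies, feature 0 is fixed: predictions must not increase.
        decreasing_dset = xgb.DMatrix(
            np.column_stack((fixed_x, variable_x)),
            feature_names=feature_names)
        decreasing_y = learner.predict(decreasing_dset)

        if not (np.all(np.diff(increasing_y) >= 0)
                and np.all(np.diff(decreasing_y) <= 0)):
            return False
    return True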
# This fragment refers to the package as ``xgboost`` rather than ``xgb``.
import xgboost


class TestInteractionConstraints(unittest.TestCase):
    def run_interaction_constraints(self, tree_method):
        x1 = np.random.normal(loc=1.0, scale=1.0, size=1000)
        x2 = np.random.normal(loc=1.0, scale=1.0, size=1000)
        x3 = np.random.choice([1, 2, 3], size=1000, replace=True)
        y = (x1 + x2 + x3 + x1 * x2 * x3
             + np.random.normal(loc=0.001, scale=1.0, size=1000)
             + 3 * np.sin(x1))
        X = np.column_stack((x1, x2, x3))
        dtrain = xgboost.DMatrix(X, label=y)

        params = {
            'max_depth': 3,
            'eta': 0.1,
            'nthread': 2,
            'interaction_constraints': '[[0, 1]]',
            'tree_method': tree_method
        }
        num_boost_round = 12
        # Fit a model that only allows interaction between x1 and x2
        bst = xgboost.train(params, dtrain, num_boost_round,
                            evals=[(dtrain, 'train')])

        # Set all observations to have the same x3 values then increment
        # by the same amount
        def f(x):
            tmat = xgboost.DMatrix(
                np.column_stack((x1, x2, np.repeat(x, 1000))))
            return bst.predict(tmat)

        preds = [f(x) for x in [1, 2, 3]]

        # Check incrementing x3 has the same effect on all observations
        # since x3 is constrained to be independent of x1 and x2
        # and all observations start off from the same x3 value
        diff1 = preds[1] - preds[0]
        assert np.all(np.abs(diff1 - diff1[0]) < 1e-4)
        diff2 = preds[2] - preds[1]
        assert np.all(np.abs(diff2 - diff2[0]) < 1e-4)

    def test_exact_interaction_constraints(self):
        self.run_interaction_constraints(tree_method='exact')

    def test_hist_interaction_constraints(self):
        self.run_interaction_constraints(tree_method='hist')

    def test_approx_interaction_constraints(self):
        self.run_interaction_constraints(tree_method='approx')

    @pytest.mark.skipif(**tm.no_sklearn())
    def training_accuracy(self, tree_method):
        # Helper (deliberately not prefixed with test_, so pytest does not
        # collect it directly): interaction constraints should not hurt
        # accuracy on the mushroom data.
        from sklearn.metrics import accuracy_score
        dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
        dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
        params = {
            'eta': 1,
            'max_depth': 6,
            'objective': 'binary:logistic',
            'tree_method': tree_method,
            'interaction_constraints': '[[1,2], [2,3,4]]'
        }
        num_boost_round = 5

        params['grow_policy'] = 'lossguide'
        bst = xgboost.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

        params['grow_policy'] = 'depthwise'
        bst = xgboost.train(params, dtrain, num_boost_round)
        pred_dtest = (bst.predict(dtest) < 0.5)
        assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

    def test_hist_training_accuracy(self):
        self.training_accuracy(tree_method='hist')

    def test_exact_training_accuracy(self):
        self.training_accuracy(tree_method='exact')

    def test_approx_training_accuracy(self):
        self.training_accuracy(tree_method='approx')
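# For reference, ``interaction_constraints`` can also be passed through the
# sklearn wrapper in recent releases. A minimal sketch (hypothetical usage,
# same feature-index semantics as the JSON string above):
#
#     reg = xgboost.XGBRegressor(
#         tree_method='hist',
#         # x1 and x2 may interact with each other, but not with x3.
#         interaction_constraints=[[0, 1]],
#     )
#     reg.fit(X, y)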
# This variant targets the secure (encrypted) build: ``temp_name``,
# ``temp_enc_name``, ``sym_key_file``, ``username`` and ``HOME_DIR`` come
# from that test setup (not shown here); ``dump_svmlight_file`` is
# sklearn.datasets.dump_svmlight_file.
class TestUpdaters(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_histmaker(self):
        variable_param = {'updater': ['grow_histmaker'], 'max_depth': [2, 8]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_colmaker(self):
        variable_param = {'updater': ['grow_colmaker'], 'max_depth': [2, 8]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        import sklearn
        params = {'tree_method': 'exact'}
        cancer = sklearn.datasets.load_breast_cancer()
        X = cancer['data']
        y = cancer["target"]

        dump_svmlight_file(X, y, temp_name)
        xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file)
        dtrain = xgb.DMatrix({username: temp_enc_name})
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        params = {'updater': 'prune', 'process_type': 'update',
                  'gamma': '0.2'}
        # TODO(rishabh): add support for xgb_model
        """
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune
        """

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_fast_histmaker(self):
        variable_param = {'tree_method': ['hist'],
                          'max_depth': [2, 8],
                          'max_bin': [2, 256],
                          'grow_policy': ['depthwise', 'lossguide'],
                          'max_leaves': [64, 0],
                          'verbosity': [0]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

        # hist must be same as exact on all-categorial data
        dpath = HOME_DIR + 'demo/data/'
        ag_dtrain = xgb.DMatrix({username: dpath + 'agaricus.txt.train.enc'})
        ag_dtest = xgb.DMatrix({username: dpath + 'agaricus.txt.test.enc'})
        ag_param = {'max_depth': 2,
                    'tree_method': 'hist',
                    'eta': 1,
                    'verbosity': 0,
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc'}
        hist_res = {}
        exact_res = {}
        # TODO(rishabh): support for evals_result
        """
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']
        """
class TestEvalMetrics(unittest.TestCase):
    xgb_params_01 = {
        'verbosity': 0,
        'nthread': 1,
        'eval_metric': 'error'
    }

    xgb_params_02 = {
        'verbosity': 0,
        'nthread': 1,
        'eval_metric': ['error']
    }

    xgb_params_03 = {
        'verbosity': 0,
        'nthread': 1,
        'eval_metric': ['rmse', 'error']
    }

    xgb_params_04 = {
        'verbosity': 0,
        'nthread': 1,
        'eval_metric': ['error', 'rmse']
    }

    def evalerror_01(self, preds, dtrain):
        labels = dtrain.get_label()
        return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

    def evalerror_02(self, preds, dtrain):
        labels = dtrain.get_label()
        return [('error',
                 float(sum(labels != (preds > 0.0))) / len(labels))]

    @pytest.mark.skipif(**tm.no_sklearn())
    def evalerror_03(self, preds, dtrain):
        from sklearn.metrics import mean_squared_error

        labels = dtrain.get_label()
        return [('rmse', mean_squared_error(labels, preds)),
                ('error',
                 float(sum(labels != (preds > 0.0))) / len(labels))]

    @pytest.mark.skipif(**tm.no_sklearn())
    def evalerror_04(self, preds, dtrain):
        from sklearn.metrics import mean_squared_error

        labels = dtrain.get_label()
        return [('error',
                 float(sum(labels != (preds > 0.0))) / len(labels)),
                ('rmse', mean_squared_error(labels, preds))]

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_eval_metrics(self):
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split
        from sklearn.datasets import load_digits

        digits = load_digits(2)
        X = digits['data']
        y = digits['target']

        Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2,
                                          random_state=0)

        dtrain = xgb.DMatrix(Xt, label=yt)
        dvalid = xgb.DMatrix(Xv, label=yv)

        watchlist = [(dtrain, 'train'), (dvalid, 'val')]

        gbdt_01 = xgb.train(self.xgb_params_01, dtrain, num_boost_round=10)
        gbdt_02 = xgb.train(self.xgb_params_02, dtrain, num_boost_round=10)
        gbdt_03 = xgb.train(self.xgb_params_03, dtrain, num_boost_round=10)
        assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
        assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]

        gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
                            early_stopping_rounds=2)
        gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
                            early_stopping_rounds=2)
        gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
                            early_stopping_rounds=2)
        gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
                            early_stopping_rounds=2)
        assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
        assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
        assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]

        gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
                            early_stopping_rounds=2,
                            feval=self.evalerror_01)
        gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
                            early_stopping_rounds=2,
                            feval=self.evalerror_02)
        gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
                            early_stopping_rounds=2,
                            feval=self.evalerror_03)
        gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
                            early_stopping_rounds=2,
                            feval=self.evalerror_04)
        assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
        assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
        assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
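# Note on the assertions above: when ``eval_metric`` contains several
# metrics, xgboost performs early stopping on the *last* metric in the list,
# which is why reordering ['rmse', 'error'] to ['error', 'rmse'] (params_03
# vs params_04) can stop at a different iteration and therefore change the
# predictions. A minimal sketch:
#
#     params = {'objective': 'binary:logistic',
#               'eval_metric': ['rmse', 'error']}   # stops on 'error'
#     xgb.train(params, dtrain, 10, watchlist, early_stopping_rounds=2)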
# Hypothesis-based tests below assume these imports from the test module.
from hypothesis import assume, given, note, settings, strategies
from hypothesis.extra.pandas import column, data_frames, range_indexes


class TestGPUPredict:
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact,
        # but for `hist` and `approx` the floating-point error accumulates
        # faster and the test fails even with tol set to 1e-4. For `hist`,
        # the mismatch rate with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations,
                                    evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results.
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"
        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)
        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not '
                'working on Windows')
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1),
                           margin, 1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ], index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor',
            'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5,
                        evals=[(dtrain, 'train')], verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(
            rmse, eval_history['train']['rmse'][-1], decimal=5)
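# ``run_threaded_predict`` is defined elsewhere in the test suite. A
# plausible sketch of its shape (hypothetical, for illustration): slice the
# input into row blocks and run the supplied predict function from several
# threads, asserting every call agrees with the DMatrix code path.
def run_threaded_predict_sketch(X, rows, predict_func):
    from concurrent.futures import ThreadPoolExecutor

    per_thread = 20
    futures = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(0, rows, per_thread):
            if hasattr(X, "iloc"):          # DataFrame-like input
                block = X.iloc[i:i + per_thread, :]
            else:                           # array-like input
                block = X[i:i + per_thread, ...]
            futures.append(executor.submit(predict_func, block))
    for f in futures:
        # predict_func returns a truthy value when in-place prediction
        # matches the DMatrix-based prediction.
        assert f.result()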
# Earlier unittest-style revision of TestGPUPredict: only its test_shap
# variant differs from the class above; it additionally cross-checks the
# GPU SHAP values against the CPU predictor.
class TestGPUPredict(unittest.TestCase):
    @given(strategies.integers(1, 200), tm.dataset_strategy,
           shap_parameter_strategy, strategies.booleans())
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param, all_rows):
        if param['max_depth'] == 0 and param['max_leaves'] == 0:
            return

        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        if all_rows:
            test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                    dataset.margin)
        else:
            test_dmat = xgb.DMatrix(dataset.X[0:1, :])
        shap = bst.predict(test_dmat, pred_contribs=True)
        bst.set_param({"predictor": "cpu_predictor"})
        cpu_shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assert np.allclose(shap, cpu_shap, 1e-3, 1e-3)
        # feature contributions should add up to predictions
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1),
                           margin, 1e-3, 1e-3)
class TestEarlyStopping(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_nonparallel(self):
        from sklearn.datasets import load_digits
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split

        digits = load_digits(2)
        X = digits['data']
        y = digits['target']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=0)
        clf1 = xgb.XGBClassifier()
        clf1.fit(X_train, y_train, early_stopping_rounds=5,
                 eval_metric="auc", eval_set=[(X_test, y_test)])
        clf2 = xgb.XGBClassifier()
        clf2.fit(X_train, y_train, early_stopping_rounds=4,
                 eval_metric="auc", eval_set=[(X_test, y_test)])
        # should be the same
        assert clf1.best_score == clf2.best_score
        assert clf1.best_score != 1
        # check overfit
        clf3 = xgb.XGBClassifier()
        clf3.fit(X_train, y_train, early_stopping_rounds=10,
                 eval_metric="auc", eval_set=[(X_test, y_test)])
        assert clf3.best_score == 1

    @pytest.mark.skipif(**tm.no_sklearn())
    def evalerror(self, preds, dtrain):
        from sklearn.metrics import mean_squared_error

        labels = dtrain.get_label()
        return 'rmse', mean_squared_error(labels, preds)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_cv_early_stopping(self):
        from sklearn.datasets import load_digits

        digits = load_digits(2)
        X = digits['data']
        y = digits['target']
        dm = xgb.DMatrix(X, label=y)
        params = {
            'max_depth': 2, 'eta': 1, 'silent': 1,
            'objective': 'binary:logistic'
        }

        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=10)
        assert cv.shape[0] == 10
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=5)
        assert cv.shape[0] == 3
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=1)
        assert cv.shape[0] == 1

        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, early_stopping_rounds=10)
        assert cv.shape[0] == 10
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, early_stopping_rounds=1)
        assert cv.shape[0] == 5
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, maximize=True,
                    early_stopping_rounds=1)
        assert cv.shape[0] == 1
class TestModels:
    def test_glm(self):
        param = {
            'verbosity': 0, 'objective': 'binary:logistic',
            'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
            'nthread': 1
        }
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 4
        bst = xgb.train(param, dtrain, num_round, watchlist)
        assert isinstance(bst, xgb.core.Booster)
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.2

    def test_dart(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {
            'max_depth': 5, 'objective': 'binary:logistic',
            'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1
        }
        # specify validation set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # this is prediction
        preds = bst.predict(dtest, ntree_limit=num_round)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
            model_path = os.path.join(tmpdir, 'xgboost.model.dart')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            bst.save_model(model_path)
            # load model and data back in
            bst2 = xgb.Booster(params=param, model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)

            preds2 = bst2.predict(dtest2, ntree_limit=num_round)
            # assert they are the same
            assert np.sum(np.abs(preds2 - preds)) == 0

        def my_logloss(preds, dtrain):
            labels = dtrain.get_label()
            return 'logloss', np.sum(
                np.log(np.where(labels, preds, 1 - preds)))

        # check whether custom evaluation metrics work
        bst = xgb.train(param, dtrain, num_round, watchlist,
                        feval=my_logloss)
        preds3 = bst.predict(dtest, ntree_limit=num_round)
        assert all(preds3 == preds)

        # check whether sample_type and normalize_type work
        num_round = 50
        param['verbosity'] = 0
        param['learning_rate'] = 0.1
        param['rate_drop'] = 0.1
        preds_list = []
        for p in [[p0, p1] for p0 in ['uniform', 'weighted']
                  for p1 in ['tree', 'forest']]:
            param['sample_type'] = p[0]
            param['normalize_type'] = p[1]
            bst = xgb.train(param, dtrain, num_round, watchlist)
            preds = bst.predict(dtest, ntree_limit=num_round)
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
            preds_list.append(preds)

        for ii in range(len(preds_list)):
            for jj in range(ii + 1, len(preds_list)):
                assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0

    def test_boost_from_prediction(self):
        # Re-construct dtrain here to avoid modification
        margined = xgb.DMatrix(dpath + 'agaricus.txt.train')
        bst = xgb.train({'tree_method': 'hist'}, margined, 1)
        predt_0 = bst.predict(margined, output_margin=True)
        margined.set_base_margin(predt_0)
        bst = xgb.train({'tree_method': 'hist'}, margined, 1)
        predt_1 = bst.predict(margined)

        assert np.any(np.abs(predt_1 - predt_0) > 1e-6)

        bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
        predt_2 = bst.predict(dtrain)
        assert np.all(np.abs(predt_2 - predt_1) < 1e-6)

    def test_boost_from_existing_model(self):
        X = xgb.DMatrix(dpath + 'agaricus.txt.train')
        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
        assert booster.num_boosted_rounds() == 4
        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
                            xgb_model=booster)
        assert booster.num_boosted_rounds() == 8
        booster = xgb.train({'updater': 'prune', 'process_type': 'update'},
                            X, num_boost_round=4, xgb_model=booster)
        # Trees are moved for update, so the number of rounds is reduced.
        # This test is written for being compatible with current code
        # (1.0.0). If the behaviour is considered sub-optimal, feel free
        # to change.
        assert booster.num_boosted_rounds() == 4

    def run_custom_objective(self, tree_method=None):
        param = {
            'max_depth': 2, 'eta': 1,
            'objective': 'reg:logistic',
            'tree_method': tree_method
        }
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 10

        def logregobj(preds, dtrain):
            labels = dtrain.get_label()
            preds = 1.0 / (1.0 + np.exp(-preds))
            grad = preds - labels
            hess = preds * (1.0 - preds)
            return grad, hess

        def evalerror(preds, dtrain):
            labels = dtrain.get_label()
            preds = 1.0 / (1.0 + np.exp(-preds))
            return 'error', float(sum(labels != (preds > 0.5))) / len(labels)

        # test custom_objective in training
        bst = xgb.train(param, dtrain, num_round, watchlist,
                        obj=logregobj, feval=evalerror)
        assert isinstance(bst, xgb.core.Booster)
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1

        # test custom_objective in cross-validation
        xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
               obj=logregobj, feval=evalerror)

        # test maximize parameter
        def neg_evalerror(preds, dtrain):
            labels = dtrain.get_label()
            return 'error', float(sum(labels == (preds > 0.0))) / len(labels)

        bst2 = xgb.train(param, dtrain, num_round, watchlist,
                         logregobj, neg_evalerror, maximize=True)
        preds2 = bst2.predict(dtest)
        err2 = sum(1 for i in range(len(preds2))
                   if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2))
        assert err == err2

    def test_custom_objective(self):
        self.run_custom_objective()

    def test_multi_eval_metric(self):
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        param = {
            'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
            'objective': 'binary:logistic'
        }
        param['eval_metric'] = ["auc", "logloss", 'error']
        evals_result = {}
        bst = xgb.train(param, dtrain, 4, watchlist,
                        evals_result=evals_result)
        assert isinstance(bst, xgb.core.Booster)
        assert len(evals_result['eval']) == 3
        assert set(evals_result['eval'].keys()) == {'auc', 'error',
                                                    'logloss'}

    def test_fpreproc(self):
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic'}
        num_round = 2

        def fpreproc(dtrain, dtest, param):
            label = dtrain.get_label()
            ratio = float(np.sum(label == 0)) / np.sum(label == 1)
            param['scale_pos_weight'] = ratio
            return (dtrain, dtest, param)

        xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'},
               seed=0, fpreproc=fpreproc)

    def test_show_stdv(self):
        param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                 'objective': 'binary:logistic'}
        num_round = 2
        xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'},
               seed=0, show_stdv=False)

    def test_feature_names_validation(self):
        X = np.random.random((10, 3))
        y = np.random.randint(2, size=(10,))

        dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
        dm2 = xgb.DMatrix(X, y)

        bst = xgb.train([], dm1)
        bst.predict(dm1)  # success
        # Prediction feature names must match the training feature names.
        with pytest.raises(ValueError):
            bst.predict(dm2)
        bst.predict(dm1)  # success

        bst = xgb.train([], dm2)
        bst.predict(dm2)  # success

    def test_model_binary_io(self):
        model_path = 'test_model_binary_io.bin'
        parameters = {'tree_method': 'hist', 'booster': 'gbtree',
                      'scale_pos_weight': '0.5'}
        X = np.random.random((10, 3))
        y = np.random.random((10,))
        dtrain = xgb.DMatrix(X, y)
        bst = xgb.train(parameters, dtrain, num_boost_round=2)
        bst.save_model(model_path)
        bst = xgb.Booster(model_file=model_path)
        os.remove(model_path)
        config = json.loads(bst.save_config())
        assert float(config['learner']['objective']['reg_loss_param']
                     ['scale_pos_weight']) == 0.5

        buf = bst.save_raw()
        from_raw = xgb.Booster()
        from_raw.load_model(buf)

        buf_from_raw = from_raw.save_raw()
        assert buf == buf_from_raw

    def test_model_json_io(self):
        loc = locale.getpreferredencoding(False)
        model_path = 'test_model_json_io.json'
        parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
        j_model = json_model(model_path, parameters)
        assert isinstance(j_model['learner'], dict)

        bst = xgb.Booster(model_file=model_path)

        bst.save_model(fname=model_path)
        with open(model_path, 'r') as fd:
            j_model = json.load(fd)
        assert isinstance(j_model['learner'], dict)

        os.remove(model_path)
        assert locale.getpreferredencoding(False) == loc

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_io_schema(self):
        import jsonschema
        model_path = 'test_json_schema.json'
        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        doc = os.path.join(path, 'doc', 'model.schema')
        with open(doc, 'r') as fd:
            schema = json.load(fd)
        parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
        jsonschema.validate(instance=json_model(model_path, parameters),
                            schema=schema)
        os.remove(model_path)
        parameters = {'tree_method': 'hist', 'booster': 'dart'}
        jsonschema.validate(instance=json_model(model_path, parameters),
                            schema=schema)
        os.remove(model_path)

        try:
            xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
        except ValueError as e:
            # Extract the list of objective candidates from the error
            # message and make sure it agrees with the JSON schema.
            e_str = str(e)
            beg = e_str.find('Objective candidate')
            end = e_str.find('Stack trace')
            e_str = e_str[beg:end]
            e_str = e_str.strip()
            lines = e_str.splitlines()
            objectives = [s.split(': ')[1] for s in lines]
            j_objectives = schema['properties']['learner']['properties'][
                'objective']['oneOf']
            objectives_from_schema = set()
            for j_obj in j_objectives:
                objectives_from_schema.add(
                    j_obj['properties']['name']['const'])
            objectives = set(objectives)
            assert objectives == objectives_from_schema

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_dump_schema(self):
        import jsonschema

        def validate_model(parameters):
            X = np.random.random((100, 30))
            y = np.random.randint(0, 4, size=(100,))

            parameters['num_class'] = 4
            m = xgb.DMatrix(X, y)

            booster = xgb.train(parameters, m)
            dump = booster.get_dump(dump_format='json')

            for i in range(len(dump)):
                jsonschema.validate(instance=json.loads(dump[i]),
                                    schema=schema)

        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        doc = os.path.join(path, 'doc', 'dump.schema')
        with open(doc, 'r') as fd:
            schema = json.load(fd)

        parameters = {'tree_method': 'hist', 'booster': 'gbtree',
                      'objective': 'multi:softmax'}
        validate_model(parameters)

        parameters = {'tree_method': 'hist', 'booster': 'dart',
                      'objective': 'multi:softmax'}
        validate_model(parameters)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_attributes(self):
        from sklearn.datasets import load_iris
        X, y = load_iris(return_X_y=True)
        cls = xgb.XGBClassifier(n_estimators=2)
        cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
        assert cls.get_booster().best_ntree_limit == 2
        assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cls.json")
            cls.save_model(path)

            cls = xgb.XGBClassifier(n_estimators=2)
            cls.load_model(path)
            assert cls.get_booster().best_ntree_limit == 2
            assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
    def test_slice(self, booster):
        from sklearn.datasets import make_classification
        num_classes = 3
        X, y = make_classification(n_samples=1000, n_informative=5,
                                   n_classes=num_classes)
        dtrain = xgb.DMatrix(data=X, label=y)
        num_parallel_tree = 4
        num_boost_round = 16
        total_trees = num_parallel_tree * num_classes * num_boost_round
        booster = xgb.train({
            'num_parallel_tree': num_parallel_tree,
            'subsample': 0.5,
            'num_class': num_classes,
            'booster': booster,
            'objective': 'multi:softprob'},
            num_boost_round=num_boost_round,
            dtrain=dtrain)

        assert len(booster.get_dump()) == total_trees

        beg = 3
        end = 7
        sliced: xgb.Booster = booster[beg:end]
        sliced_trees = (end - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced_trees = sliced_trees // 2
        sliced: xgb.Booster = booster[beg:end:2]
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[beg:...]
        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[beg:]
        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced: xgb.Booster = booster[...:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        with pytest.raises(ValueError, match=r'>= 0'):
            booster[-1:0]

        # we do not accept an empty slice.
        with pytest.raises(ValueError):
            booster[1:1]
        # stop can not be smaller than begin
        with pytest.raises(ValueError, match=r'Invalid.*'):
            booster[3:0]
        with pytest.raises(ValueError, match=r'Invalid.*'):
            booster[3:-1]
        # negative step is not supported.
        with pytest.raises(ValueError, match=r'.*>= 1.*'):
            booster[0:2:-1]
        # step can not be 0.
        with pytest.raises(ValueError, match=r'.*>= 1.*'):
            booster[0:2:0]

        trees = [_ for _ in booster]
        assert len(trees) == num_boost_round

        with pytest.raises(TypeError):
            booster["wrong type"]
        with pytest.raises(IndexError):
            booster[:num_boost_round + 1]
        with pytest.raises(ValueError):
            booster[1, 2]            # too many dims
        # setitem is not implemented as the model is immutable during
        # slicing.
        with pytest.raises(TypeError):
            booster[...:end] = booster

        sliced_0 = booster[1:3]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(1, 3)),
            sliced_0.predict(dtrain))
        sliced_1 = booster[3:7]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(3, 7)),
            sliced_1.predict(dtrain))

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5  # base score.
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

        sliced_0 = booster[1:7:2]  # 1,3,5
        sliced_1 = booster[2:8:2]  # 2,4,6

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_feature_info(self):
        import pandas as pd
        rows = 100
        cols = 10
        X = rng.randn(rows, cols)
        y = rng.randn(rows)
        feature_names = ["test_feature_" + str(i) for i in range(cols)]
        X_pd = pd.DataFrame(X, columns=feature_names)
        # ``np.int`` is a deprecated (and now removed) alias for the
        # built-in ``int``; use the built-in directly so the column gets
        # an integer dtype.
        X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(int)

        Xy = xgb.DMatrix(X_pd, y)
        assert Xy.feature_types[3] == "int"

        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)

        assert booster.feature_names == Xy.feature_names
        assert booster.feature_names == feature_names
        assert booster.feature_types == Xy.feature_types

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "model.json")
            booster.save_model(path)
            booster = xgb.Booster()
            booster.load_model(path)

            assert booster.feature_names == Xy.feature_names
            assert booster.feature_types == Xy.feature_types
# (fragment: the opening of this Dask classifier test is not included here)
    np.testing.assert_allclose(single_node_proba, probas.compute())

    # Test with dataframe.
    X_d = dd.from_dask_array(X)
    y_d = dd.from_dask_array(y)

    classifier.fit(X_d, y_d)
    assert classifier.n_classes_ == 10
    prediction = classifier.predict(X_d)

    assert prediction.ndim == 1
    assert prediction.shape[0] == kRows


@pytest.mark.skipif(**tm.no_sklearn())
def test_sklearn_grid_search():
    from sklearn.model_selection import GridSearchCV
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()
            reg = xgb.dask.DaskXGBRegressor(learning_rate=0.1,
                                            tree_method='hist')
            reg.client = client
            model = GridSearchCV(reg, {'max_depth': [2, 4],
                                       'n_estimators': [5, 10]},
                                 cv=2, verbose=1)
            model.fit(X, y)
class TestUpdaters(unittest.TestCase):
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_histmaker(self):
        variable_param = {'updater': ['grow_histmaker'], 'max_depth': [2, 8]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_colmaker(self):
        variable_param = {'updater': ['grow_colmaker'], 'max_depth': [2, 8]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_fast_histmaker(self):
        variable_param = {'tree_method': ['hist'],
                          'max_depth': [2, 8],
                          'max_bin': [2, 256],
                          'grow_policy': ['depthwise', 'lossguide'],
                          'max_leaves': [64, 0],
                          'verbosity': [0]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

        # hist must be same as exact on all-categorial data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {'max_depth': 2,
                    'tree_method': 'hist',
                    'eta': 1,
                    'verbosity': 0,
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc'}
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_fast_histmaker_degenerate_case(self):
        # Test a degenerate case where the quantile sketcher won't return
        # any quantile points for a particular feature (the second feature
        # in this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {'missing': nan, 'tree_method': 'hist'}
        model = xgb.XGBRegressor(**param)
        X = [[6.18827160e+05, 1.73000000e+02],
             [6.37345679e+05, nan],
             [6.38888889e+05, nan],
             [6.28086420e+05, nan]]
        y = [1000000., 0., 0., 500000.]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)
class TestTrainingContinuation(unittest.TestCase):
    num_parallel_tree = 3

    def generate_parameters(self, use_json):
        xgb_params_01_binary = {
            'nthread': 1,
        }

        xgb_params_02_binary = {
            'nthread': 1,
            'num_parallel_tree': self.num_parallel_tree
        }

        xgb_params_03_binary = {
            'nthread': 1,
            'num_class': 5,
            'num_parallel_tree': self.num_parallel_tree
        }
        if use_json:
            xgb_params_01_binary[
                'enable_experimental_json_serialization'] = True
            xgb_params_02_binary[
                'enable_experimental_json_serialization'] = True
            xgb_params_03_binary[
                'enable_experimental_json_serialization'] = True

        return [
            xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary
        ]

    def run_training_continuation(self, xgb_params_01, xgb_params_02,
                                  xgb_params_03):
        from sklearn.datasets import load_digits
        from sklearn.metrics import mean_squared_error

        digits_2class = load_digits(2)
        digits_5class = load_digits(5)

        X_2class = digits_2class['data']
        y_2class = digits_2class['target']

        X_5class = digits_5class['data']
        y_5class = digits_5class['target']

        dump_svmlight_file(X_2class, y_2class, temp_name)
        xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file)
        dtrain_2class = xgb.DMatrix({username: temp_enc_name})

        dump_svmlight_file(X_5class, y_5class, temp_name)
        xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file)
        dtrain_5class = xgb.DMatrix({username: temp_enc_name})

        gbdt_01 = xgb.train(xgb_params_01, dtrain_2class,
                            num_boost_round=10)
        ntrees_01 = len(gbdt_01.get_dump())
        assert ntrees_01 == 10

        gbdt_02 = xgb.train(xgb_params_01, dtrain_2class,
                            num_boost_round=0)
        gbdt_02.save_model(HOME_DIR + 'xgb_tc.model')

        # TODO(rishabh): add support for xgb_model
        """
        gbdt_02a = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=10, xgb_model=gbdt_02)
        gbdt_02b = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=10, xgb_model="xgb_tc.model")
        ntrees_02a = len(gbdt_02a.get_dump())
        ntrees_02b = len(gbdt_02b.get_dump())
        assert ntrees_02a == 10
        assert ntrees_02b == 10

        res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
        assert res1 == res2

        res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
        assert res1 == res2

        gbdt_03 = xgb.train(xgb_params_01, dtrain_2class,
                            num_boost_round=3)
        gbdt_03.save_model('xgb_tc.model')

        gbdt_03a = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=7, xgb_model=gbdt_03)
        gbdt_03b = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=7, xgb_model="xgb_tc.model")
        ntrees_03a = len(gbdt_03a.get_dump())
        ntrees_03b = len(gbdt_03b.get_dump())
        assert ntrees_03a == 10
        assert ntrees_03b == 10

        res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
        assert res1 == res2

        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
                            num_boost_round=3)
        assert gbdt_04.best_ntree_limit == (
            gbdt_04.best_iteration + 1) * self.num_parallel_tree
        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_04.predict(
            dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
        assert res1 == res2

        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
                            num_boost_round=7, xgb_model=gbdt_04)
        assert gbdt_04.best_ntree_limit == (
            gbdt_04.best_iteration + 1) * self.num_parallel_tree
        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_04.predict(
            dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
        assert res1 == res2

        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
                            num_boost_round=7)
        assert gbdt_05.best_ntree_limit == (
            gbdt_05.best_iteration + 1) * self.num_parallel_tree
        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
                            num_boost_round=3, xgb_model=gbdt_05)
        assert gbdt_05.best_ntree_limit == (
            gbdt_05.best_iteration + 1) * self.num_parallel_tree

        res1 = gbdt_05.predict(dtrain_5class)
        res2 = gbdt_05.predict(dtrain_5class,
                               ntree_limit=gbdt_05.best_ntree_limit)
        np.testing.assert_almost_equal(res1, res2)
        """

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_continuation_binary(self):
        params = self.generate_parameters(False)
        self.run_training_continuation(params[0], params[1], params[2])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_continuation_json(self):
        params = self.generate_parameters(True)
        for p in params:
            p['enable_experimental_json_serialization'] = True
        self.run_training_continuation(params[0], params[1], params[2])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_continuation_updaters_binary(self):
        updaters = 'grow_colmaker,prune,refresh'
        params = self.generate_parameters(False)
        for p in params:
            p['updater'] = updaters
        self.run_training_continuation(params[0], params[1], params[2])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_training_continuation_updaters_json(self):
        # Picked up from R tests.
        updaters = 'grow_colmaker,prune,refresh'
        params = self.generate_parameters(True)
        for p in params:
            p['updater'] = updaters
        self.run_training_continuation(params[0], params[1], params[2])
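# Worked example of the ``best_ntree_limit`` bookkeeping asserted above
# (values follow directly from the parameters used in the test):
#
#     num_parallel_tree = 3
#     num_boost_round = 7       # no early stopping triggered
#     best_iteration = 6        # zero-based index of the last round
#     best_ntree_limit = (best_iteration + 1) * num_parallel_tree  # == 21
#
# For multi-class boosters the count is additionally multiplied by
# ``num_class``, since each round grows one forest per class.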
class TestTreeMethod:
    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_exact(self, param, num_rounds, dataset):
        param['tree_method'] = 'exact'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(exact_parameter_strategy, strategies.integers(1, 20),
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_approx(self, param, num_rounds, dataset):
        param['tree_method'] = 'approx'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result['train'][dataset.metric], 1e-3)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        import sklearn
        params = {'tree_method': 'exact'}
        cancer = sklearn.datasets.load_breast_cancer()
        X = cancer['data']
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        params = {'updater': 'prune', 'process_type': 'update',
                  'gamma': '0.2'}
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
                            xgb_model=booster)
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(exact_parameter_strategy, hist_parameter_strategy,
           strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        param['tree_method'] = 'hist'
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    def test_hist_categorical(self):
        # hist must be same as exact on all-categorial data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {'max_depth': 2,
                    'tree_method': 'hist',
                    'eta': 1,
                    'verbosity': 0,
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc'}
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        # Test a degenerate case where the quantile sketcher won't return
        # any quantile points for a particular feature (the second feature
        # in this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {'missing': nan, 'tree_method': 'hist'}
        model = xgb.XGBRegressor(**param)
        X = np.array([[6.18827160e+05, 1.73000000e+02],
                      [6.37345679e+05, nan],
                      [6.38888889e+05, nan],
                      [6.28086420e+05, nan]])
        y = [1000000., 0., 0., 500000.]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)