class TestPlotting:
    """Plotting smoke tests for a model trained on categorical features."""

    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self):
        """Train a categorical ``gpu_hist`` regressor and plot its last tree.

        Every dumped tree must have a root that is either a leaf or a split
        whose ``split_condition`` is a list.  The last tree is then rendered
        through both ``to_graphviz`` and ``plot_tree``.
        """
        X, y = tm.make_categorical(1000, 31, 19, onehot=False)
        reg = xgb.XGBRegressor(
            enable_categorical=True, n_estimators=10, tree_method="gpu_hist"
        )
        reg.fit(X, y)

        trees = reg.get_booster().get_dump(dump_format="json")
        for tree in trees:
            j_tree = json.loads(tree)
            assert "leaf" in j_tree.keys() or isinstance(
                j_tree["split_condition"], list
            )

        # ``num_trees`` is the index of the tree to draw.  Derive it from the
        # number of dumped trees; the previous ``len(j_tree) - 1`` counted the
        # keys of the last tree's root node and only hit a valid index by
        # accident.
        last_tree = len(trees) - 1

        graph = xgb.to_graphviz(reg, num_trees=last_tree)
        assert isinstance(graph, Source)
        ax = xgb.plot_tree(reg, num_trees=last_tree)
        assert isinstance(ax, Axes)
class TestGPUUpdaters:
    """Tests for training with the ``gpu_hist`` tree method."""

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        """The training metric passes ``tm.non_increasing`` with ``gpu_hist``."""
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        """Compare one-hot encoded input against built-in categorical support.

        Trains once on the one-hot data and once on the categorical data with
        identical parameters, then checks that the per-round training RMSE of
        both runs agree within ``rtol=1e-3``.
        """
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance based on considering
        # output as random variables. But in here the tree construction is
        # extremely sensitive to floating point errors. A 1e-5 error in a
        # histogram bin can lead to an entirely different tree. So even though
        # the test is quite lenient, hypothesis can still pick up falsifying
        # examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

    @given(
        strategies.integers(10, 400),
        strategies.integers(3, 8),
        strategies.integers(1, 2),
        strategies.integers(4, 7),
    )
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        """Property-based run of :meth:`run_categorical_basic`."""
        self.run_categorical_basic(rows, cols, rounds, cats)

    # Same guard as ``test_categorical``: both go through the same
    # ``tm.make_categorical`` based helper.
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical_32_cat(self):
        '''32 hits the bound of integer bitset, so special test'''
        rows = 1000
        cols = 10
        cats = 32
        rounds = 4
        self.run_categorical_basic(rows, cols, rounds, cats)

    # cupy is imported in the body, so skip rather than fail when it is absent.
    @pytest.mark.skipif(**tm.no_cupy())
    def test_invalid_categorical(self):
        """Real-valued data declared as categorical must raise ``ValueError``."""
        import cupy as cp

        rng = np.random.default_rng()
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        # Check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": "gpu_hist"}, Xy)

        X, y = cp.array(X), cp.array(y)
        with pytest.raises(ValueError):
            xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        """Same check as ``test_gpu_hist`` but on the dataset's device DMatrix."""
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result['train'][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        """Same check as ``test_gpu_hist`` but on the external-memory DMatrix."""
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
        param = dataset.set_params(param)
        m = dataset.get_external_dmat()
        external_result = train_result(param, m, num_rounds)
        # Drop the DMatrix and force a collection before asserting.
        del m
        gc.collect()
        assert tm.non_increasing(external_result['train'][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        """A model trained on zero rows predicts 0.5 for every test row."""
        # FIXME(trivialfis): This should be done with all updaters
        kRows = 0
        kCols = 100

        X = np.empty((kRows, kCols))
        y = np.empty((kRows))

        dtrain = xgb.DMatrix(X, y)

        bst = xgb.train(
            {'verbosity': 2, 'tree_method': 'gpu_hist', 'gpu_id': 0},
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, 'Train')],
        )

        kRows = 100
        X = np.random.randn(kRows, kCols)

        dtest = xgb.DMatrix(X)
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, 1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        """Train for 10 rounds with an explicitly chosen ``gpu_id``."""
        param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(result['train'][dataset.metric])
class TestDMatrix:
    """Tests for constructing, slicing and serialising ``xgb.DMatrix``."""

    def test_warn_missing(self):
        """Check when the unused-``missing`` ``UserWarning`` is emitted."""
        # ``pytest.warns(None)`` is deprecated in pytest 7 and rejected by
        # pytest 8; record warnings with the standard library instead.
        import warnings

        from xgboost import data

        with pytest.warns(UserWarning):
            data._warn_unused_missing('uri', 4)

        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            data._warn_unused_missing('uri', None)
            data._warn_unused_missing('uri', np.nan)

        assert len(record) == 0

        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            x = rng.randn(10, 10)
            y = rng.randn(10)

            xgb.DMatrix(x, y, missing=4)

        assert len(record) == 0

        with pytest.warns(UserWarning):
            csr = csr_matrix(x)
            xgb.DMatrix(csr.tocsc(), y, missing=4)

    def test_dmatrix_numpy_init(self):
        """2-d numeric arrays are accepted; other ranks and strings raise."""
        data = np.random.randn(5, 5)
        dm = xgb.DMatrix(data)
        assert dm.num_row() == 5
        assert dm.num_col() == 5

        data = np.array([[1, 2], [3, 4]])
        dm = xgb.DMatrix(data)
        assert dm.num_row() == 2
        assert dm.num_col() == 2

        # 0d array
        with pytest.raises(ValueError):
            xgb.DMatrix(np.array(1))
        # 1d array
        with pytest.raises(ValueError):
            xgb.DMatrix(np.array([1, 2, 3]))
        # 3d array
        data = np.random.randn(5, 5, 5)
        with pytest.raises(ValueError):
            xgb.DMatrix(data)
        # object dtype
        data = np.array([['a', 'b'], ['c', 'd']])
        with pytest.raises(ValueError):
            xgb.DMatrix(data)

    def test_csr(self):
        """Construct from a 3x3 scipy CSR matrix."""
        indptr = np.array([0, 2, 3, 6])
        indices = np.array([0, 2, 2, 0, 1, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        X = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 3))
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_row() == 3
        assert dtrain.num_col() == 3

    def test_csc(self):
        """Construct from a 3x3 scipy CSC matrix."""
        row = np.array([0, 2, 2, 0, 1, 2])
        col = np.array([0, 0, 1, 2, 2, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        X = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3))
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_row() == 3
        assert dtrain.num_col() == 3

    def test_coo(self):
        """Construct from a 3x3 scipy COO matrix."""
        row = np.array([0, 2, 2, 0, 1, 2])
        col = np.array([0, 0, 1, 2, 2, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        X = scipy.sparse.coo_matrix((data, (row, col)), shape=(3, 3))
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_row() == 3
        assert dtrain.num_col() == 3

    def test_np_view(self):
        """Strided array views give the same meta info as contiguous copies."""
        # Sliced Float32 array
        y = np.array([12, 34, 56], np.float32)[::2]
        from_view = xgb.DMatrix(np.array([[]]), label=y).get_label()
        # ``y + 0`` materialises a fresh array with the same values.
        from_array = xgb.DMatrix(np.array([[]]), label=y + 0).get_label()
        assert (from_view.shape == from_array.shape)
        assert (from_view == from_array).all()

        # Sliced UInt array
        z = np.array([12, 34, 56], np.uint32)[::2]
        dmat = xgb.DMatrix(np.array([[]]))
        dmat.set_uint_info('group', z)
        from_view = dmat.get_uint_info('group_ptr')
        dmat = xgb.DMatrix(np.array([[]]))
        dmat.set_uint_info('group', z + 0)
        from_array = dmat.get_uint_info('group_ptr')
        assert (from_view.shape == from_array.shape)
        assert (from_view == from_array).all()

    def test_slice(self):
        """``DMatrix.slice`` keeps labels, base margin and feature weights."""
        X = rng.randn(100, 100)
        y = rng.randint(low=0, high=3, size=100).astype(np.float32)
        d = xgb.DMatrix(X, y)
        np.testing.assert_equal(d.get_label(), y)

        fw = rng.uniform(size=100).astype(np.float32)
        d.set_info(feature_weights=fw)

        # base margin is per-class in multi-class classifier
        base_margin = rng.randn(100, 3).astype(np.float32)
        d.set_base_margin(base_margin.flatten())

        ridxs = [1, 2, 3, 4, 5, 6]
        sliced = d.slice(ridxs)

        # Slicing works with label and other meta info fields
        np.testing.assert_equal(sliced.get_label(), y[1:7])
        np.testing.assert_equal(sliced.get_float_info('feature_weights'), fw)
        np.testing.assert_equal(sliced.get_base_margin(),
                                base_margin[1:7, :].flatten())
        np.testing.assert_equal(sliced.get_base_margin(),
                                sliced.get_float_info('base_margin'))

        # Slicing a DMatrix results into a DMatrix that's equivalent to a
        # DMatrix that's constructed from the corresponding NumPy slice
        d2 = xgb.DMatrix(X[1:7, :], y[1:7])
        d2.set_base_margin(base_margin[1:7, :].flatten())
        eval_res = {}
        _ = xgb.train(
            {'num_class': 3, 'objective': 'multi:softprob',
             'eval_metric': 'mlogloss'},
            d,
            num_boost_round=2,
            evals=[(d2, 'd2'), (sliced, 'sliced')],
            evals_result=eval_res)
        np.testing.assert_equal(eval_res['d2']['mlogloss'],
                                eval_res['sliced']['mlogloss'])

        ridxs_arr = np.array(ridxs)[1:]  # handles numpy slice correctly
        sliced = d.slice(ridxs_arr)
        np.testing.assert_equal(sliced.get_label(), y[2:7])

    def test_feature_names_slice(self):
        """Validate feature names/types, and that slicing keeps the names."""
        data = np.random.randn(5, 5)

        # different length
        with pytest.raises(ValueError):
            xgb.DMatrix(data, feature_names=list('abcdef'))
        # contains duplicates
        with pytest.raises(ValueError):
            xgb.DMatrix(data, feature_names=['a', 'b', 'c', 'd', 'd'])
        # contains symbol
        with pytest.raises(ValueError):
            xgb.DMatrix(data, feature_names=['a', 'b', 'c', 'd', 'e<1'])

        dm = xgb.DMatrix(data)
        dm.feature_names = list('abcde')
        assert dm.feature_names == list('abcde')

        assert dm.slice([0, 1]).num_col() == dm.num_col()
        assert dm.slice([0, 1]).feature_names == dm.feature_names

        dm.feature_types = 'q'
        assert dm.feature_types == list('qqqqq')

        dm.feature_types = list('qiqiq')
        assert dm.feature_types == list('qiqiq')

        with pytest.raises(ValueError):
            dm.feature_types = list('abcde')

        # reset
        dm.feature_names = None
        assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4']
        assert dm.feature_types is None

    def test_feature_names(self):
        """Feature names (ASCII and non-ASCII) are kept and checked on predict."""
        data = np.random.randn(100, 5)
        target = np.array([0, 1] * 50)

        cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
                 [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']]

        for features in cases:
            dm = xgb.DMatrix(data, label=target,
                             feature_names=features)
            assert dm.feature_names == features
            assert dm.num_row() == 100
            assert dm.num_col() == 5

            params = {
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss',
                'eta': 0.3,
                'num_class': 3
            }

            bst = xgb.train(params, dm, num_boost_round=10)
            scores = bst.get_fscore()
            assert list(sorted(k for k in scores)) == features

            dummy = np.random.randn(5, 5)
            dm = xgb.DMatrix(dummy, feature_names=features)
            bst.predict(dm)

            # different feature name must raises error
            dm = xgb.DMatrix(dummy, feature_names=list('abcde'))
            with pytest.raises(ValueError):
                bst.predict(dm)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_save_binary(self):
        """Feature names and types survive a ``save_binary`` round trip."""
        import pandas as pd

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'm.dmatrix')
            data = pd.DataFrame({"a": [0, 1], "b": [2, 3], "c": [4, 5]})
            m0 = xgb.DMatrix(data.loc[:, ["a", "b"]], data["c"])
            assert m0.feature_names == ['a', 'b']
            m0.save_binary(path)
            m1 = xgb.DMatrix(path)
            assert m0.feature_names == m1.feature_names
            assert m0.feature_types == m1.feature_types

    def test_get_info(self):
        """The meta-info getters can be called on a DMatrix loaded from file."""
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtrain.get_float_info('label')
        dtrain.get_float_info('weight')
        dtrain.get_float_info('base_margin')
        dtrain.get_uint_info('group_ptr')

    def test_qid(self):
        """Setting sorted ``qid`` yields a ``group_ptr`` spanning all rows."""
        rows = 100
        cols = 10
        X, y = rng.randn(rows, cols), rng.randn(rows)
        qid = rng.randint(low=0, high=10, size=rows, dtype=np.uint32)
        qid = np.sort(qid)

        Xy = xgb.DMatrix(X, y)
        Xy.set_info(qid=qid)
        group_ptr = Xy.get_uint_info('group_ptr')
        assert group_ptr[0] == 0
        assert group_ptr[-1] == rows

    def test_feature_weights(self):
        """Feature weights round-trip, can be cleared, and reject negatives."""
        kRows = 10
        kCols = 50
        rng = np.random.RandomState(1994)
        fw = rng.uniform(size=kCols)
        X = rng.randn(kRows, kCols)
        m = xgb.DMatrix(X)
        m.set_info(feature_weights=fw)
        np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
        # Handle empty
        m.set_info(feature_weights=np.empty((0, 0)))

        assert m.get_float_info('feature_weights').shape[0] == 0

        # Uniform samples lie in [0, 1), so every weight is now negative.
        fw -= 1

        with pytest.raises(ValueError):
            m.set_info(feature_weights=fw)

    def test_sparse_dmatrix_csr(self):
        """Train on sparse CSR input and compare saved binaries across dtypes."""
        nrow = 100
        ncol = 1000
        x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
        assert x.indices.max() < ncol - 1
        x.data[:] = 1
        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
        watchlist = [(dtrain, 'train')]
        param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
        bst = xgb.train(param, dtrain, 5, watchlist)
        bst.predict(dtrain)

        i32 = csr_matrix((x.data.astype(np.int32), x.indices, x.indptr),
                         shape=x.shape)
        f32 = csr_matrix((i32.data.astype(np.float32), x.indices, x.indptr),
                         shape=x.shape)
        di32 = xgb.DMatrix(i32)
        df32 = xgb.DMatrix(f32)
        dense = xgb.DMatrix(f32.toarray(), missing=0)
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "f32.dmatrix")
            df32.save_binary(path)
            with open(path, "rb") as fd:
                df32_buffer = np.array(fd.read())

            # Use a distinct file for the int32 matrix.  Reusing the float32
            # path meant a save that wrote nothing would be masked by the
            # stale float32 bytes already on disk.
            path = os.path.join(tmpdir, "i32.dmatrix")
            di32.save_binary(path)
            with open(path, "rb") as fd:
                di32_buffer = np.array(fd.read())

            path = os.path.join(tmpdir, "dense.dmatrix")
            dense.save_binary(path)
            with open(path, "rb") as fd:
                dense_buffer = np.array(fd.read())

            np.testing.assert_equal(df32_buffer, di32_buffer)
            np.testing.assert_equal(df32_buffer, dense_buffer)

    def test_sparse_dmatrix_csc(self):
        """Train and predict on sparse CSC input."""
        nrow = 1000
        ncol = 100
        x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
        assert x.indices.max() < nrow - 1
        x.data[:] = 1
        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
        watchlist = [(dtrain, 'train')]
        param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
        bst = xgb.train(param, dtrain, 5, watchlist)
        bst.predict(dtrain)

    def test_unknown_data(self):
        """An arbitrary object as data warns and then raises ``TypeError``."""
        class Data:
            pass

        with pytest.raises(TypeError):
            with pytest.warns(UserWarning):
                d = Data()
                xgb.DMatrix(d)
class TestInplacePredict:
    '''Tests for running inplace prediction'''

    @classmethod
    def setup_class(cls):
        """Train one shared ``hist`` booster on data with injected missing values."""
        cls.rows = 1000
        cls.cols = 10

        cls.missing = 11            # set to integer for testing

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        # Every fourth column is overwritten with the missing marker.
        missing_idx = list(range(0, cls.cols, 4))
        cls.X[:, missing_idx] = cls.missing  # set to be missing

        cls.y = cls.rng.randn(cls.rows)

        dtrain = xgb.DMatrix(cls.X, cls.y)
        cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)

        cls.num_boost_round = 10
        cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)

    def test_predict(self):
        """In-place prediction agrees with DMatrix prediction.

        Also covers object-dtype input, ``iteration_range`` / ``ntree_limit``
        bounds, and repeated runs through ``run_threaded_predict`` for dense
        and CSR input.
        """
        booster = self.booster
        X = self.X
        test = self.test

        predt_from_array = booster.inplace_predict(X[:10, ...], missing=self.missing)
        predt_from_dmatrix = booster.predict(test)

        X_obj = X.copy().astype(object)

        assert X_obj.dtype.hasobject is True
        assert X.dtype.hasobject is False
        np.testing.assert_allclose(
            booster.inplace_predict(X_obj), booster.inplace_predict(X)
        )

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        predt_from_array = booster.inplace_predict(
            X[:10, ...], iteration_range=(0, 4), missing=self.missing
        )
        predt_from_dmatrix = booster.predict(test, ntree_limit=4)

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        # Requesting more trees/iterations than the model has must fail.
        with pytest.raises(ValueError):
            booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
        with pytest.raises(ValueError):
            booster.predict(test, iteration_range=(0, booster.best_iteration + 2))

        default = booster.predict(test)

        range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
        ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        range_full = booster.predict(
            test, iteration_range=(0, booster.best_iteration + 1)
        )
        ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        def predict_csr(x):
            inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        """DataFrame and Fortran-ordered input match plain ndarray input."""
        X = self.X
        # construct it in column major style
        df = pd.DataFrame({str(i): X[:, i] for i in range(X.shape[1])})
        booster = self.booster
        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        X = df.values
        X = np.asfortranarray(X)
        fort_predt = booster.inplace_predict(X)

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)
        np.testing.assert_allclose(fort_predt, arr_predt)

    def test_base_margin(self):
        """``base_margin`` gives the same result in-place and via DMatrix."""
        booster = self.booster

        base_margin = self.rng.randn(self.rows)
        from_inplace = booster.inplace_predict(data=self.X, base_margin=base_margin)

        dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        np.testing.assert_allclose(from_dmatrix, from_inplace)

    def test_dtypes(self):
        """In-place prediction accepts numeric/boolean dtypes, rejects others."""
        orig = self.rng.randint(low=0, high=127, size=self.rows * self.cols).reshape(
            self.rows, self.cols
        )
        predt_orig = self.booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
            np.signedinteger,
            np.byte,
            np.short,
            np.intc,
            np.int_,
            np.longlong,
            np.unsignedinteger,
            np.ubyte,
            np.ushort,
            np.uintc,
            np.uint,
            np.ulonglong,
            np.floating,
            np.half,
            np.single,
            np.double,
        ]:
            X = np.array(orig, dtype=dtype)
            predt = self.booster.inplace_predict(X)
            np.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = self.rng.binomial(1, 0.5, size=self.rows * self.cols).reshape(
            self.rows, self.cols
        )
        predt_orig = self.booster.inplace_predict(orig)
        # ``np.bool8`` was only an alias of ``np.bool_`` and no longer exists
        # in NumPy 2.0, so test the canonical name only.
        for dtype in [np.bool_]:
            X = np.array(orig, dtype=dtype)
            predt = self.booster.inplace_predict(X)
            np.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
            # ``np.bytes_`` is the same type that ``np.string_`` aliased; the
            # latter was removed in NumPy 2.0.
            np.bytes_,
            np.complex64,
            np.complex128,
        ]:
            X = np.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                self.booster.inplace_predict(X)
class TestGPUPredict:
    """Tests comparing ``gpu_predictor`` results against ``cpu_predictor``."""

    def test_predict(self):
        """GPU and CPU predictors give matching margins over several shapes."""
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4. For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        """Return True if no element exceeds its predecessor by 0.001 or more."""
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Repeated GPU predictions agree with each other and with the CPU."""
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """``XGBRegressor.score`` is the same under both predictors."""
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {'tree_method': 'gpu_hist',
                  'predictor': 'cpu_predictor',
                  'n_jobs': -1,
                  'seed': 123}
        # Named ``reg`` so the model does not shadow the row count ``m``.
        reg = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = reg.score(X_train, y_train)
        cpu_test_score = reg.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        reg = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = reg.score(X_train, y_train)
        gpu_test_score = reg.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        """Check in-place and DMatrix predictions agree given a base margin.

        Sets ``base_margin`` on ``dtrain`` as a side effect.
        """
        import cupy as cp

        dtrain.set_info(base_margin=base_margin)
        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        cp.testing.assert_allclose(from_inplace, from_dmatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """In-place prediction on cupy arrays matches DMatrix prediction."""
        import cupy as cp

        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        missing = 11            # set to integer for testing

        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)

        X = cp.random.randn(rows, cols)
        missing_idx = list(range(0, cols, 4))
        X[:, missing_idx] = missing  # set to be missing
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain, num_boost_round=10)

        test = xgb.DMatrix(X[:10, ...], missing=missing)
        predt_from_array = booster.inplace_predict(X[:10, ...], missing=missing)
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        base_margin = cp_rng.randn(rows)
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

        # Create a wide dataset
        X = cp_rng.randn(100, 10000)
        y = cp_rng.randn(100)

        missing_idx = list(range(0, X.shape[1], 16))
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=8, missing=missing)
        reg.fit(X, y)

        gpu_predt = reg.predict(X)
        reg.set_params(predictor="cpu_predictor")
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """In-place prediction on cudf DataFrames matches DMatrix prediction."""
        import cupy as cp
        import cudf
        import pandas as pd

        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # column major array
            inplace_predt = booster.inplace_predict(x.values)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            assert cp.all(copied_predt == inplace_predt)

            inplace_predt = booster.inplace_predict(x)
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

        base_margin = cudf.Series(rng.randn(rows))
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

    @given(strategies.integers(1, 10),
           tm.dataset_strategy, shap_parameter_strategy)
    @settings(deadline=None, print_blob=True)
    def test_shap(self, num_rounds, dataset, param):
        """Per-row SHAP contributions sum to the predicted margin."""
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # Empty datasets still go through train/predict above; only the
        # numerical comparison is skipped for them.
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-3, 1e-3)

    @given(strategies.integers(1, 10),
           tm.dataset_strategy, shap_parameter_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """SHAP interaction values summed over the last two axes equal the margin."""
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin,
            1e-3,
            1e-3,
        )

    # Other ``tm.make_categorical`` based tests in this file carry this guard.
    @pytest.mark.skipif(**tm.no_pandas())
    def test_shap_categorical(self):
        """SHAP sums match the margin on categorical data for both predictors."""
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)

        booster.set_param({"predictor": "gpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3
        )

        booster.set_param({"predictor": "cpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3
        )

    def test_predict_leaf_basic(self):
        """``run_predict_leaf`` gives equal results for GPU and CPU predictors."""
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train once, then compare ``pred_leaf`` output of both predictors."""
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
        """Leaf prediction parity with the ``gbtree`` booster."""
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_dart(self, param, dataset):
        """Leaf prediction parity with the ``dart`` booster."""
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([column('x0', elements=strategies.integers(min_value=0, max_value=3)),
                           column('x1', elements=strategies.integers(min_value=0, max_value=5))],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None, print_blob=True)
    def test_predict_categorical_split(self, df):
        """RMSE recomputed from predictions equals the last logged train RMSE."""
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor',
            'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5, evals=[(dtrain, 'train')],
                        verbose_eval=False, evals_result=eval_history)

        pred = bst.predict(dtrain)
        # ``squared=False`` is deprecated and later removed in scikit-learn;
        # taking the square root of the MSE works on every version.
        rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=pred))
        np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)

    # sklearn is imported in the body, so guard it like cupy.
    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        """With ``dart``, in-place and copied predictions agree on CPU and GPU."""
        from sklearn.datasets import make_classification
        import cupy as cp

        n_samples = 1000
        X_, y_ = make_classification(
            n_samples=n_samples, n_informative=5, n_classes=n_classes
        )
        X, y = cp.array(X_), cp.array(y_)

        Xy = xgb.DMatrix(X, y)
        if n_classes == 2:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "binary:logistic"
            }
        else:
            params = {
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "multi:softprob",
                "num_class": n_classes
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)
        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        """In-place prediction accepts cupy numeric/boolean dtypes only."""
        import cupy as cp

        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)

        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
            cp.signedinteger,
            cp.byte,
            cp.short,
            cp.intc,
            cp.int_,
            cp.longlong,
            cp.unsignedinteger,
            cp.ubyte,
            cp.ushort,
            cp.uintc,
            cp.uint,
            cp.ulonglong,
            cp.floating,
            cp.half,
            cp.single,
            cp.double,
        ]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        # ``bool8`` was only an alias of ``bool_`` (removed from NumPy 2.0),
        # so test the canonical name only.
        for dtype in [cp.bool_]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
            cp.complex64,
            cp.complex128,
        ]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
class TestTreeMethod:
    """Training tests for the ``exact``, ``approx`` and ``hist`` tree methods."""

    @given(exact_parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_exact(self, param, num_rounds, dataset):
        """Train with ``exact`` and require a non-increasing training metric."""
        param["tree_method"] = "exact"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None)
    def test_approx(self, param, hist_param, num_rounds, dataset):
        """Train with ``approx`` and require a non-increasing training metric."""
        param["tree_method"] = "approx"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        """Pruning must change a grown model once and then be a no-op."""
        # Import the loader explicitly: a bare ``import sklearn`` does not
        # guarantee that the ``sklearn.datasets`` submodule has been loaded.
        from sklearn.datasets import load_breast_cancer

        params = {"tree_method": "exact"}
        cancer = load_breast_cancer()
        X = cancer["data"]
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        # Re-run training in "update" mode with only the prune updater.
        params = {"updater": "prune", "process_type": "update", "gamma": "0.2"}
        booster = xgb.train(
            params, dtrain=dtrain, num_boost_round=10, xgb_model=booster
        )
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(
            params, dtrain=dtrain, num_boost_round=10, xgb_model=booster
        )
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        """Train with ``hist`` and require a non-increasing training metric."""
        param["tree_method"] = "hist"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    def test_hist_categorical(self):
        """Compare ``hist`` and ``exact`` AUC histories on the agaricus data."""
        # hist must be same as exact on all-categorial data
        dpath = "demo/data/"
        ag_dtrain = xgb.DMatrix(dpath + "agaricus.txt.train")
        ag_dtest = xgb.DMatrix(dpath + "agaricus.txt.test")
        ag_param = {
            "max_depth": 2,
            "tree_method": "hist",
            "eta": 1,
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "auc",
        }
        hist_res = {}
        exact_res = {}

        xgb.train(
            ag_param,
            ag_dtrain,
            10,
            [(ag_dtrain, "train"), (ag_dtest, "test")],
            evals_result=hist_res,
        )
        # Same parameters, only the tree method differs for the second run.
        ag_param["tree_method"] = "exact"
        xgb.train(
            ag_param,
            ag_dtrain,
            10,
            [(ag_dtrain, "train"), (ag_dtest, "test")],
            evals_result=exact_res,
        )
        assert hist_res["train"]["auc"] == exact_res["train"]["auc"]
        assert hist_res["test"]["auc"] == exact_res["test"]["auc"]

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        """Fit must complete on the degenerate input described below."""
        # Test a degenerate case where the quantile sketcher won't return any
        # quantile points for a particular feature (the second feature in
        # this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {"missing": nan, "tree_method": "hist"}
        model = xgb.XGBRegressor(**param)
        X = np.array(
            [
                [6.18827160e05, 1.73000000e02],
                [6.37345679e05, nan],
                [6.38888889e05, nan],
                [6.28086420e05, nan],
            ]
        )
        y = [1000000.0, 0.0, 0.0, 500000.0]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)

    def run_invalid_category(self, tree_method: str) -> None:
        """Check that invalid categorical values raise ``ValueError``.

        Parameters
        ----------
        tree_method :
            Tree method passed to ``xgb.train``.  For ``"gpu_hist"`` the
            ``DeviceQuantileDMatrix`` constructor is checked as well.
        """
        rng = np.random.default_rng()
        # too large
        X = rng.integers(low=0, high=4, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)
        X[13, 7] = np.iinfo(np.int32).max + 1

        # Check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # 16777216 == 2 ** 24
        X[13, 7] = 16777216
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # mixed positive and negative values
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        if tree_method == "gpu_hist":
            import cupy as cp

            X, y = cp.array(X), cp.array(y)
            with pytest.raises(ValueError):
                # Construction itself is expected to raise, so the result is
                # not bound to a name.
                xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    def test_invalid_category(self) -> None:
        """Run the invalid-category checks for ``approx``."""
        self.run_invalid_category("approx")

    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
        """Compare one-hot encoded input with built-in categorical support.

        Parameters
        ----------
        rows, cols, cats :
            Forwarded to ``tm.make_categorical`` to generate the data.
        rounds :
            Number of boosting rounds for the comparison runs.
        tree_method :
            Tree method under test.
        """
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        # Use one-hot exclusively
        parameters = {
            "tree_method": tree_method,
            "predictor": predictor,
            "max_cat_to_onehot": 9999,
        }

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance based on considering output as
        # random variables. But in here the tree construction is extremely sensitive to
        # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely
        # different tree. So even though the test is quite lenient, hypothesis can still
        # pick up falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
        parameters["max_cat_to_onehot"] = 1
        parameters["reg_lambda"] = 0
        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        rmse_oh = by_builtin_results["Train"]["rmse"]
        rmse_group = by_grouping["Train"]["rmse"]
        # always better or equal to onehot when there's no regularization.
        for a, b in zip(rmse_oh, rmse_group):
            assert a >= b

        parameters["reg_lambda"] = 1.0
        by_grouping = {}
        xgb.train(
            parameters,
            m,
            num_boost_round=32,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping

    @given(
        strategies.integers(10, 400),
        strategies.integers(3, 8),
        strategies.integers(1, 2),
        strategies.integers(4, 7),
    )
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        """Run the categorical comparison for ``approx`` and ``hist``."""
        self.run_categorical_basic(rows, cols, rounds, cats, "approx")
        self.run_categorical_basic(rows, cols, rounds, cats, "hist")
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import testing as tm
import unittest
import pytest

try:
    import pandas as pd
except ImportError:
    pass


pytestmark = pytest.mark.skipif(**tm.no_pandas())


dpath = 'demo/data/'
rng = np.random.RandomState(1994)


class TestPandas(unittest.TestCase):
    """DMatrix construction from pandas objects."""

    def test_pandas(self):
        """A small mixed-dtype frame keeps its names, types and shape."""
        records = [[1, 2., True], [2, 3., False]]
        frame = pd.DataFrame(records, columns=['a', 'b', 'c'])
        labels = pd.Series([1, 2])

        dm = xgb.DMatrix(frame, label=labels)

        assert dm.feature_names == ['a', 'b', 'c']
        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
class TestGPUUpdaters:
    """Training tests for the ``gpu_hist`` tree method."""

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        """The training metric must not increase under ``gpu_hist``."""
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        history = train_result(param, dataset.get_dmat(), num_rounds)
        note(history)
        assert tm.non_increasing(history["train"][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        """Compare one-hot encoded columns against category-typed columns."""
        import pandas as pd

        rng = np.random.RandomState(1994)

        # One integer column per feature, drawn in column order.
        columns = {
            str(idx): pd.Series(
                rng.randint(low=0, high=cats + 1, size=rows), dtype=np.int64
            )
            for idx in range(cols)
        }
        df = pd.DataFrame(columns)

        # The label is accumulated in place, starting from the first column.
        label = df.iloc[:, 0]
        for idx in range(0, cols - 1):
            label += df.iloc[:, idx]
        label += 1

        df = df.astype("category")
        onehot = pd.get_dummies(df)
        cat = df

        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
        by_etl_results = {}
        by_builtin_results = {}

        etl_dmatrix = xgb.DMatrix(onehot, label, enable_categorical=True)
        xgb.train(
            parameters,
            etl_dmatrix,
            num_boost_round=rounds,
            evals=[(etl_dmatrix, "Train")],
            evals_result=by_etl_results,
        )

        builtin_dmatrix = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            builtin_dmatrix,
            num_boost_round=rounds,
            evals=[(builtin_dmatrix, "Train")],
            evals_result=by_builtin_results,
        )

        etl_rmse = np.array(by_etl_results["Train"]["rmse"])
        builtin_rmse = np.array(by_builtin_results["Train"]["rmse"])
        np.testing.assert_allclose(etl_rmse, builtin_rmse, rtol=1e-3)
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

    @given(
        strategies.integers(10, 400),
        strategies.integers(3, 8),
        strategies.integers(1, 5),
        strategies.integers(4, 7),
    )
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        """Hypothesis-driven categorical comparison (marked xfail up front)."""
        pytest.xfail(reason="TestGPUUpdaters::test_categorical is flaky")
        self.run_categorical_basic(rows, cols, rounds, cats)

    def test_categorical_32_cat(self):
        """32 hits the bound of integer bitset, so special test"""
        self.run_categorical_basic(rows=1000, cols=10, rounds=4, cats=32)

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        """Same as ``test_gpu_hist`` but training from the device DMatrix."""
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        history = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(history)
        assert tm.non_increasing(history["train"][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        """Train from the external-memory DMatrix (marked xfail up front)."""
        pytest.xfail(reason="TestGPUUpdaters::test_external_memory is flaky")
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        external = dataset.get_external_dmat()
        external_result = train_result(param, external, num_rounds)
        # Drop the DMatrix and collect before checking the metric history.
        del external
        gc.collect()
        assert tm.non_increasing(external_result["train"][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        """A model trained on zero rows must predict 0.5 for every row."""
        # FIXME(trivialfis): This should be done with all updaters
        n_features = 100
        train_rows = 0
        X = np.empty((train_rows, n_features))
        y = np.empty(train_rows)

        dtrain = xgb.DMatrix(X, y)
        train_params = {"verbosity": 2, "tree_method": "gpu_hist", "gpu_id": 0}
        bst = xgb.train(
            train_params,
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, "Train")],
        )

        test_rows = 100
        dtest = xgb.DMatrix(np.random.randn(test_rows, n_features))
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, rtol=1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        """Train for 10 rounds with an explicitly selected ``gpu_id``."""
        param = dataset.set_params({"tree_method": "gpu_hist", "gpu_id": gpu_id})
        history = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(history["train"][dataset.metric])
class TestTreeMethod:
    """Training tests for the ``exact``, ``approx`` and ``hist`` tree methods."""

    # Values assigned to ``max_cat_to_onehot`` by the categorical tests below.
    USE_ONEHOT = np.iinfo(np.int32).max
    USE_PART = 1

    @given(exact_parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_exact(self, param, num_rounds, dataset):
        """Train with ``exact`` and require a non-increasing training metric.

        Datasets whose name ends with ``"-l1"`` are not exercised here.
        """
        if dataset.name.endswith("-l1"):
            return
        param["tree_method"] = "exact"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None, print_blob=True)
    def test_approx(self, param, hist_param, num_rounds, dataset):
        """Train with ``approx`` and require a non-increasing training metric."""
        param["tree_method"] = "approx"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_pruner(self):
        """Pruning must change a grown model once and then be a no-op."""
        # Import the loader explicitly: a bare ``import sklearn`` does not
        # guarantee that the ``sklearn.datasets`` submodule has been loaded.
        from sklearn.datasets import load_breast_cancer

        params = {"tree_method": "exact"}
        cancer = load_breast_cancer()
        X = cancer["data"]
        y = cancer["target"]

        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
        grown = str(booster.get_dump())

        # Re-run training in "update" mode with only the prune updater.
        params = {"updater": "prune", "process_type": "update", "gamma": "0.2"}
        booster = xgb.train(
            params, dtrain=dtrain, num_boost_round=10, xgb_model=booster
        )
        after_prune = str(booster.get_dump())
        assert grown != after_prune

        booster = xgb.train(
            params, dtrain=dtrain, num_boost_round=10, xgb_model=booster
        )
        second_prune = str(booster.get_dump())
        # Second prune should not change the tree
        assert after_prune == second_prune

    @given(
        exact_parameter_strategy,
        hist_parameter_strategy,
        strategies.integers(1, 20),
        tm.dataset_strategy,
    )
    @settings(deadline=None, print_blob=True)
    def test_hist(self, param, hist_param, num_rounds, dataset):
        """Train with ``hist`` and require a non-increasing training metric."""
        param["tree_method"] = "hist"
        param = dataset.set_params(param)
        param.update(hist_param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

    @given(tm.sparse_datasets_strategy)
    @settings(deadline=None, print_blob=True)
    def test_sparse(self, dataset):
        """``hist`` and ``approx`` must give matching RMSE on sparse data."""
        param = {"tree_method": "hist", "max_bin": 64}
        hist_result = train_result(param, dataset.get_dmat(), 16)
        note(hist_result)
        assert tm.non_increasing(hist_result["train"][dataset.metric])

        param = {"tree_method": "approx", "max_bin": 64}
        approx_result = train_result(param, dataset.get_dmat(), 16)
        note(approx_result)
        assert tm.non_increasing(approx_result["train"][dataset.metric])

        np.testing.assert_allclose(
            hist_result["train"]["rmse"], approx_result["train"]["rmse"]
        )

    def test_hist_categorical(self):
        """Compare ``hist`` and ``exact`` AUC histories on the agaricus data."""
        # hist must be same as exact on all-categorial data
        dpath = "demo/data/"
        ag_dtrain = xgb.DMatrix(dpath + "agaricus.txt.train")
        ag_dtest = xgb.DMatrix(dpath + "agaricus.txt.test")
        ag_param = {
            "max_depth": 2,
            "tree_method": "hist",
            "eta": 1,
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "auc",
        }
        hist_res = {}
        exact_res = {}

        xgb.train(
            ag_param,
            ag_dtrain,
            10,
            [(ag_dtrain, "train"), (ag_dtest, "test")],
            evals_result=hist_res,
        )
        # Same parameters, only the tree method differs for the second run.
        ag_param["tree_method"] = "exact"
        xgb.train(
            ag_param,
            ag_dtrain,
            10,
            [(ag_dtrain, "train"), (ag_dtest, "test")],
            evals_result=exact_res,
        )
        assert hist_res["train"]["auc"] == exact_res["train"]["auc"]
        assert hist_res["test"]["auc"] == exact_res["test"]["auc"]

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_hist_degenerate_case(self):
        """Fit must complete on the degenerate input described below."""
        # Test a degenerate case where the quantile sketcher won't return any
        # quantile points for a particular feature (the second feature in
        # this example). Source: https://github.com/dmlc/xgboost/issues/2943
        nan = np.nan
        param = {"missing": nan, "tree_method": "hist"}
        model = xgb.XGBRegressor(**param)
        X = np.array(
            [
                [6.18827160e05, 1.73000000e02],
                [6.37345679e05, nan],
                [6.38888889e05, nan],
                [6.28086420e05, nan],
            ]
        )
        y = [1000000.0, 0.0, 0.0, 500000.0]
        w = [0, 0, 1, 0]
        model.fit(X, y, sample_weight=w)

    def run_invalid_category(self, tree_method: str) -> None:
        """Check that invalid categorical values raise ``ValueError``.

        Parameters
        ----------
        tree_method :
            Tree method passed to ``xgb.train``.  For ``"gpu_hist"`` the
            ``DeviceQuantileDMatrix`` constructor is checked as well.
        """
        rng = np.random.default_rng()
        # too large
        X = rng.integers(low=0, high=4, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)
        X[13, 7] = np.iinfo(np.int32).max + 1

        # Check is performed during sketching.
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # 16777216 == 2 ** 24
        X[13, 7] = 16777216
        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        # mixed positive and negative values
        X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
        y = rng.normal(loc=0, scale=1, size=100)

        Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
        with pytest.raises(ValueError):
            xgb.train({"tree_method": tree_method}, Xy)

        if tree_method == "gpu_hist":
            import cupy as cp

            X, y = cp.array(X), cp.array(y)
            with pytest.raises(ValueError):
                # Construction itself is expected to raise, so the result is
                # not bound to a name.
                xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

    def test_invalid_category(self) -> None:
        """Run the invalid-category checks for ``approx`` and ``hist``."""
        self.run_invalid_category("approx")
        self.run_invalid_category("hist")

    def run_max_cat(self, tree_method: str) -> None:
        """Test data with size smaller than number of categories."""
        import pandas as pd

        rng = np.random.default_rng(0)
        n_cat = 100
        n = 5
        # Build ``n_cat`` random 3-letter categories, then keep only the first
        # ``n`` rows so the data has fewer rows than declared categories.
        X = pd.Series(
            [
                "".join(rng.choice(list(ascii_lowercase), size=3))
                for _ in range(n_cat)
            ],
            dtype="category",
        )[:n].to_frame()

        reg = xgb.XGBRegressor(
            enable_categorical=True,
            tree_method=tree_method,
            n_estimators=10,
        )
        y = pd.Series(range(n))
        reg.fit(X=X, y=y, eval_set=[(X, y)])
        assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])

    @pytest.mark.parametrize("tree_method", ["hist", "approx"])
    @pytest.mark.skipif(**tm.no_pandas())
    def test_max_cat(self, tree_method) -> None:
        """Run ``run_max_cat`` for each CPU tree method."""
        self.run_max_cat(tree_method)

    def run_categorical_missing(
        self, rows: int, cols: int, cats: int, tree_method: str
    ) -> None:
        """Train on categorical data generated with ``sparsity=0.5``.

        For each split strategy the training RMSE must be non-increasing and
        the last reported RMSE must match the one recomputed from predictions.

        Parameters
        ----------
        rows, cols, cats :
            Accepted for interface compatibility.
        tree_method :
            Tree method under test.  Partition-based splits are skipped for
            ``"gpu_hist"``.
        """
        parameters: Dict[str, Any] = {"tree_method": tree_method}
        # NOTE(review): ``rows``/``cols``/``cats`` are not forwarded; the data
        # shape is fixed.  TODO: confirm the exact-tolerance comparison below
        # still holds for other shapes before wiring the arguments through.
        cat, label = tm.make_categorical(
            n_samples=256, n_features=4, n_categories=8, onehot=False, sparsity=0.5
        )
        Xy = xgb.DMatrix(cat, label, enable_categorical=True)

        def run(max_cat_to_onehot: int):
            # Train with the requested split strategy and cross-check the
            # reported metric against one computed from the predictions.
            parameters["max_cat_to_onehot"] = max_cat_to_onehot

            evals_result: Dict[str, Dict] = {}
            booster = xgb.train(
                parameters,
                Xy,
                num_boost_round=16,
                evals=[(Xy, "Train")],
                evals_result=evals_result,
            )
            assert tm.non_increasing(evals_result["Train"]["rmse"])
            y_predt = booster.predict(Xy)

            rmse = tm.root_mean_square(label, y_predt)
            np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])

        # Test with OHE split
        run(self.USE_ONEHOT)

        if tree_method == "gpu_hist":  # fixme: Test with GPU.
            return

        # Test with partition-based split
        run(self.USE_PART)

    def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
        """Compare one-hot encoded input with built-in categorical support.

        Parameters
        ----------
        rows, cols, cats :
            Forwarded to ``tm.make_categorical`` to generate the data.
        rounds :
            Number of boosting rounds for the comparison runs.
        tree_method :
            Tree method under test.
        """
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

        by_etl_results = {}
        by_builtin_results = {}

        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
        parameters = {"tree_method": tree_method, "predictor": predictor}
        # Use one-hot exclusively
        parameters["max_cat_to_onehot"] = self.USE_ONEHOT

        m = xgb.DMatrix(onehot, label, enable_categorical=False)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_etl_results,
        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_builtin_results,
        )

        # There are guidelines on how to specify tolerance based on considering output as
        # random variables. But in here the tree construction is extremely sensitive to
        # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely
        # different tree. So even though the test is quite lenient, hypothesis can still
        # pick up falsifying examples from time to time.
        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
        # switch to partition-based splits
        parameters["max_cat_to_onehot"] = self.USE_PART
        parameters["reg_lambda"] = 0
        m = xgb.DMatrix(cat, label, enable_categorical=True)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        rmse_oh = by_builtin_results["Train"]["rmse"]
        rmse_group = by_grouping["Train"]["rmse"]
        # always better or equal to onehot when there's no regularization.
        for a, b in zip(rmse_oh, rmse_group):
            assert a >= b

        parameters["reg_lambda"] = 1.0
        by_grouping = {}
        xgb.train(
            parameters,
            m,
            num_boost_round=32,
            evals=[(m, "Train")],
            evals_result=by_grouping,
        )
        assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping

    @given(
        strategies.integers(10, 400),
        strategies.integers(3, 8),
        strategies.integers(1, 2),
        strategies.integers(4, 7),
    )
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical_ohe(self, rows, cols, rounds, cats):
        """Run the one-hot comparison for ``approx`` and ``hist``."""
        self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
        self.run_categorical_ohe(rows, cols, rounds, cats, "hist")

    @given(
        tm.categorical_dataset_strategy,
        exact_parameter_strategy,
        hist_parameter_strategy,
        cat_parameter_strategy,
        strategies.integers(4, 32),
        strategies.sampled_from(["hist", "approx"]),
    )
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(
        self,
        dataset: tm.TestDataset,
        exact_parameters: Dict[str, Any],
        hist_parameters: Dict[str, Any],
        cat_parameters: Dict[str, Any],
        n_rounds: int,
        tree_method: str,
    ) -> None:
        """Train on a categorical dataset with merged parameter sets."""
        cat_parameters.update(exact_parameters)
        cat_parameters.update(hist_parameters)
        cat_parameters["tree_method"] = tree_method

        results = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
        # The result of the check must be asserted, otherwise it is discarded
        # and the test can never fail on it.
        assert tm.non_increasing(results["train"]["rmse"])

    @given(
        hist_parameter_strategy,
        cat_parameter_strategy,
        strategies.sampled_from(["hist", "approx"]),
    )
    @settings(deadline=None, print_blob=True)
    def test_categorical_ames_housing(
        self,
        hist_parameters: Dict[str, Any],
        cat_parameters: Dict[str, Any],
        tree_method: str,
    ) -> None:
        """Train for 16 rounds on the ``ames_housing`` dataset."""
        cat_parameters.update(hist_parameters)
        dataset = tm.TestDataset(
            "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
        )
        cat_parameters["tree_method"] = tree_method
        results = train_result(cat_parameters, dataset.get_dmat(), 16)
        # See ``test_categorical``: the check is only effective when asserted.
        assert tm.non_increasing(results["train"]["rmse"])

    @given(
        strategies.integers(10, 400),
        strategies.integers(3, 8),
        strategies.integers(4, 7),
    )
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical_missing(self, rows, cols, cats):
        """Run the missing-value categorical test for ``approx`` and ``hist``."""
        self.run_categorical_missing(rows, cols, cats, "approx")
        self.run_categorical_missing(rows, cols, cats, "hist")
# -*- coding: utf-8 -*-
import unittest
import pytest
import testing as tm
import xgboost as xgb

try:
    import datatable as dt
    import pandas as pd
except ImportError:
    pass

pytestmark = pytest.mark.skipif(
    tm.no_dt()['condition'] or tm.no_pandas()['condition'],
    reason=tm.no_dt()['reason'] + ' or ' + tm.no_pandas()['reason'])


class TestDataTable(unittest.TestCase):
    """DMatrix construction from datatable frames."""

    def test_dt(self):
        """A frame converted from pandas keeps its names, types and shape."""
        records = [[1, 2., True], [2, 3., False]]
        frame = pd.DataFrame(records, columns=['a', 'b', 'c'])
        dtable = dt.Frame(frame)
        labels = dt.Frame([1, 2])

        dm = xgb.DMatrix(dtable, label=labels)

        assert dm.feature_names == ['a', 'b', 'c']
        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3

        # overwrite feature_names
class TestGPUUpdaters:
    """Training tests for ``gpu_hist``; shared runners come from ``cputest``."""

    cputest = test_up.TestTreeMethod()

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
        """The training metric must not increase under ``gpu_hist``."""
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        history = train_result(param, dataset.get_dmat(), num_rounds)
        note(history)
        assert tm.non_increasing(history["train"][dataset.metric])

    @given(
        strategies.integers(10, 400),
        strategies.integers(3, 8),
        strategies.integers(1, 2),
        strategies.integers(4, 7),
    )
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
        """Run the shared categorical comparison with ``gpu_hist``."""
        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")

    def test_categorical_32_cat(self):
        """32 hits the bound of integer bitset, so special test"""
        n_rows, n_cols, n_rounds, n_cats = 1000, 10, 4, 32
        self.cputest.run_categorical_basic(
            n_rows, n_cols, n_rounds, n_cats, "gpu_hist"
        )

    @pytest.mark.skipif(**tm.no_cupy())
    def test_invalid_categorical(self):
        """Run the shared invalid-category checks with ``gpu_hist``."""
        self.cputest.run_invalid_category("gpu_hist")

    @pytest.mark.skipif(**tm.no_cupy())
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
        """Same as ``test_gpu_hist`` but training from the device DMatrix."""
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        history = train_result(param, dataset.get_device_dmat(), num_rounds)
        note(history)
        assert tm.non_increasing(history["train"][dataset.metric])

    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
        """Train from the external-memory DMatrix."""
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        external = dataset.get_external_dmat()
        external_result = train_result(param, external, num_rounds)
        # Drop the DMatrix and collect before checking the metric history.
        del external
        gc.collect()
        assert tm.non_increasing(external_result["train"][dataset.metric])

    def test_empty_dmatrix_prediction(self):
        """A model trained on zero rows must predict 0.5 for every row."""
        # FIXME(trivialfis): This should be done with all updaters
        n_features = 100
        train_rows = 0
        X = np.empty((train_rows, n_features))
        y = np.empty(train_rows)

        dtrain = xgb.DMatrix(X, y)
        train_params = {"verbosity": 2, "tree_method": "gpu_hist", "gpu_id": 0}
        bst = xgb.train(
            train_params,
            dtrain,
            verbose_eval=True,
            num_boost_round=6,
            evals=[(dtrain, "Train")],
        )

        test_rows = 100
        dtest = xgb.DMatrix(np.random.randn(test_rows, n_features))
        predictions = bst.predict(dtest)
        np.testing.assert_allclose(predictions, 0.5, rtol=1e-6)

    @pytest.mark.mgpu
    @given(tm.dataset_strategy, strategies.integers(0, 10))
    @settings(deadline=None, max_examples=10)
    def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
        """Train for 10 rounds with an explicitly selected ``gpu_id``."""
        param = dataset.set_params({"tree_method": "gpu_hist", "gpu_id": gpu_id})
        history = train_result(param, dataset.get_dmat(), 10)
        assert tm.non_increasing(history["train"][dataset.metric])
class TestInplacePredict:
    """Checks that in-place prediction agrees with DMatrix prediction."""

    @classmethod
    def setup_class(cls):
        """Create random data and train the booster shared by all tests."""
        cls.rows = 100
        cls.cols = 10

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        cls.y = cls.rng.randn(cls.rows)

        train_dmatrix = xgb.DMatrix(cls.X, cls.y)
        cls.booster = xgb.train(
            {"tree_method": "hist"}, train_dmatrix, num_boost_round=10
        )

        cls.test = xgb.DMatrix(cls.X[:10, ...])

    def test_predict(self):
        """Compare array, DMatrix, threaded dense and threaded CSR prediction."""
        booster = self.booster
        X = self.X
        test = self.test
        head = X[:10, ...]

        predt_from_array = booster.inplace_predict(head)
        predt_from_dmatrix = booster.predict(test)
        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        # Same comparison restricted to the first four boosting rounds.
        predt_from_array = booster.inplace_predict(head, iteration_range=(0, 4))
        predt_from_dmatrix = booster.predict(test, ntree_limit=4)
        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        def make_checker(convert):
            # Build a predicate comparing in-place prediction on
            # ``convert(x)`` with DMatrix prediction on ``x``.
            def check(x):
                inplace_predt = booster.inplace_predict(convert(x))
                copied_predt = booster.predict(xgb.DMatrix(x))
                return np.all(copied_predt == inplace_predt)

            return check

        predict_dense = make_checker(lambda x: x)
        for _ in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        predict_csr = make_checker(sparse.csr_matrix)
        for _ in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        """DataFrame input must give the same predictions as the array."""
        X = self.X
        booster = self.booster

        # construct it in column major style
        columns = {str(col): X[:, col] for col in range(X.shape[1])}
        df = pd.DataFrame(columns)

        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)

    def test_base_margin(self):
        """A base margin must affect both prediction paths identically."""
        booster = self.booster
        margin = self.rng.randn(self.rows)

        from_inplace = booster.inplace_predict(data=self.X, base_margin=margin)

        with_margin = xgb.DMatrix(self.X, self.y, base_margin=margin)
        from_dmatrix = booster.predict(with_margin)

        np.testing.assert_allclose(from_dmatrix, from_inplace)
class TestInplacePredict:
    """Checks that in-place prediction agrees with DMatrix prediction."""

    @classmethod
    def setup_class(cls):
        """Create data with marked-missing columns and train the booster."""
        cls.rows = 1000
        cls.cols = 10

        cls.missing = 11  # set to integer for testing

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        # Every fourth column is filled with the missing marker.
        missing_idx = list(range(0, cls.cols, 4))
        cls.X[:, missing_idx] = cls.missing  # set to be missing

        cls.y = cls.rng.randn(cls.rows)
        train_dmatrix = xgb.DMatrix(cls.X, cls.y)

        cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)

        cls.num_boost_round = 10
        cls.booster = xgb.train(
            {"tree_method": "hist"}, train_dmatrix, num_boost_round=10
        )

    def test_predict(self):
        """Compare prediction paths, iteration limits and threaded prediction."""
        booster = self.booster
        X = self.X
        test = self.test
        head = X[:10, ...]

        predt_from_array = booster.inplace_predict(head, missing=self.missing)
        predt_from_dmatrix = booster.predict(test)

        # An object-dtype copy must predict the same as the float array.
        X_obj = X.copy().astype(object)

        assert X_obj.dtype.hasobject is True
        assert X.dtype.hasobject is False
        np.testing.assert_allclose(
            booster.inplace_predict(X_obj), booster.inplace_predict(X)
        )

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        # Same comparison restricted to the first four boosting rounds.
        predt_from_array = booster.inplace_predict(
            head, iteration_range=(0, 4), missing=self.missing
        )
        predt_from_dmatrix = booster.predict(test, ntree_limit=4)
        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        # Limits beyond the trained rounds must be rejected.
        with pytest.raises(ValueError):
            booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
        with pytest.raises(ValueError):
            booster.predict(test, iteration_range=(0, booster.best_iteration + 2))

        default = booster.predict(test)

        range_full = booster.predict(
            test, iteration_range=(0, self.num_boost_round)
        )
        ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        range_full = booster.predict(
            test, iteration_range=(0, booster.best_iteration + 1)
        )
        ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
        np.testing.assert_allclose(range_full, default)
        np.testing.assert_allclose(ntree_full, default)

        def make_checker(convert):
            # Build a predicate comparing in-place prediction on
            # ``convert(x)`` with DMatrix prediction on ``x``.
            def check(x):
                inplace_predt = booster.inplace_predict(convert(x))
                copied_predt = booster.predict(xgb.DMatrix(x))
                return np.all(copied_predt == inplace_predt)

            return check

        predict_dense = make_checker(lambda x: x)
        for _ in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        predict_csr = make_checker(sparse.csr_matrix)
        for _ in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        """DataFrame and Fortran-ordered input must match the plain array."""
        X = self.X
        booster = self.booster

        # construct it in column major style
        columns = {str(col): X[:, col] for col in range(X.shape[1])}
        df = pd.DataFrame(columns)

        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        fortran_X = np.asfortranarray(df.values)
        fort_predt = booster.inplace_predict(fortran_X)

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)
        np.testing.assert_allclose(fort_predt, arr_predt)

    def test_base_margin(self):
        """A base margin must affect both prediction paths identically."""
        booster = self.booster
        margin = self.rng.randn(self.rows)

        from_inplace = booster.inplace_predict(data=self.X, base_margin=margin)

        with_margin = xgb.DMatrix(self.X, self.y, base_margin=margin)
        from_dmatrix = booster.predict(with_margin)

        np.testing.assert_allclose(from_dmatrix, from_inplace)
import unittest
import pytest
import numpy as np
import testing as tm
import xgboost as xgb
import os

try:
    import pyarrow as pa
    import pyarrow.csv as pc
    import pandas as pd
except ImportError:
    pass

pytestmark = pytest.mark.skipif(
    tm.no_arrow()["condition"] or tm.no_pandas()["condition"],
    reason=tm.no_arrow()["reason"] + " or " + tm.no_pandas()["reason"],
)

dpath = "demo/data/"


class TestArrowTable(unittest.TestCase):
    """DMatrix construction from pyarrow tables."""

    def test_arrow_table(self):
        """A 2x4 table converted from pandas keeps its shape."""
        records = [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]]
        frame = pd.DataFrame(records, columns=["a", "b", "c", "d"])
        table = pa.Table.from_pandas(frame)

        dm = xgb.DMatrix(table)

        assert dm.num_row() == 2
        assert dm.num_col() == 4
class TestEarlyStopping:
    """Early-stopping behaviour of the scikit-learn wrapper and of ``xgb.cv``."""

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_nonparallel(self):
        """Fit classifiers with different patience and compare their best AUC."""
        from sklearn.datasets import load_digits
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            # Fallback location used by very old scikit-learn releases.
            from sklearn.cross_validation import train_test_split

        # ``n_class`` is passed by keyword: recent scikit-learn releases no
        # longer accept it positionally.
        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=0)
        clf1 = xgb.XGBClassifier(learning_rate=0.1)
        clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
                 eval_set=[(X_test, y_test)])
        clf2 = xgb.XGBClassifier(learning_rate=0.1)
        clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric="auc",
                 eval_set=[(X_test, y_test)])
        # should be the same
        assert clf1.best_score == clf2.best_score
        assert clf1.best_score != 1
        # check overfit
        clf3 = xgb.XGBClassifier(learning_rate=0.1)
        clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
                 eval_set=[(X_test, y_test)])
        assert clf3.best_score == 1

    def evalerror(self, preds, dtrain):
        """Custom metric: squared error between sigmoid(preds) and the labels.

        Returns a ``(name, value)`` pair as expected by ``feval``.

        NOTE(review): the metric is reported under the name 'rmse' although
        ``mean_squared_error`` is called without taking a square root; the
        name is kept as-is.
        """
        from sklearn.metrics import mean_squared_error

        labels = dtrain.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))
        return 'rmse', mean_squared_error(labels, preds)

    @staticmethod
    def assert_metrics_length(cv, expected_length):
        """Assert that every column of the ``xgb.cv`` result has the given length."""
        for _, value in cv.items():
            assert len(value) == expected_length

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_cv_early_stopping(self):
        """Check the number of rounds kept by ``xgb.cv`` for several patience values."""
        from sklearn.datasets import load_digits

        # Keyword form required by recent scikit-learn releases.
        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
        dm = xgb.DMatrix(X, label=y)
        params = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'error'
        }

        # Built-in 'error' metric.
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=10)
        self.assert_metrics_length(cv, 10)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=5)
        self.assert_metrics_length(cv, 3)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 1)

        # Custom metric supplied through ``feval``.
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, early_stopping_rounds=10)
        self.assert_metrics_length(cv, 10)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, early_stopping_rounds=1)
        self.assert_metrics_length(cv, 5)
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
                    feval=self.evalerror, maximize=True,
                    early_stopping_rounds=1)
        self.assert_metrics_length(cv, 1)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self):
        """With several metrics, early stopping must follow the last one."""
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True)
        dm = xgb.DMatrix(X, label=y)
        params = {'objective': 'binary:logistic'}

        metrics = [['auc'], ['error'], ['logloss'],
                   ['logloss', 'auc'], ['logloss', 'error'],
                   ['error', 'logloss']]

        num_iteration_history = []

        # If more than one metrics is given, early stopping should use the last metric
        for m in metrics:
            result = xgb.cv(params, dm, num_boost_round=1000, nfold=5,
                            stratified=True, metrics=m,
                            early_stopping_rounds=20, seed=42)
            num_iteration_history.append(len(result))
            df = result['test-{}-mean'.format(m[-1])]
            # When early stopping is invoked, the last metric should be as best it can be.
            if m[-1] == 'auc':
                assert np.all(df <= df.iloc[-1])
            else:
                assert np.all(df >= df.iloc[-1])
        # The single-metric runs (first three) and the runs ending in the same
        # metric (last three) must stop after the same number of rounds.
        assert num_iteration_history[:3] == num_iteration_history[3:]
class TestModels:
    """Training, custom objective/metric, model I/O and slicing tests.

    Several tests use the module-level ``dtrain`` / ``dtest`` matrices and the
    ``dpath`` data directory defined elsewhere in the file.
    """

    @staticmethod
    def _error_rate(preds, labels):
        """Return the fraction of predictions whose 0.5-thresholded class
        differs from the corresponding label."""
        wrong = sum(1 for pred, label in zip(preds, labels)
                    if int(pred > 0.5) != label)
        return wrong / float(len(preds))

    def test_glm(self):
        """Train a ``gblinear`` booster and require an error rate below 20%."""
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'booster': 'gblinear',
            'alpha': 0.0001,
            'lambda': 1,
            'nthread': 1
        }
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 4
        bst = xgb.train(param, dtrain, num_round, watchlist)
        assert isinstance(bst, xgb.core.Booster)
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = self._error_rate(preds, labels)
        assert err < 0.2

    def test_dart(self):
        """Train DART boosters; check accuracy, model round-trip, a custom
        metric, and that sample/normalize types change the predictions."""
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {
            'max_depth': 5,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'booster': 'dart',
            'verbosity': 1
        }
        # specify validations set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # this is prediction
        preds = bst.predict(dtest, ntree_limit=num_round)
        labels = dtest.get_label()
        err = self._error_rate(preds, labels)
        # error must be smaller than 10%
        assert err < 0.1

        with tempfile.TemporaryDirectory() as tmpdir:
            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
            model_path = os.path.join(tmpdir, 'xgboost.model.dart')
            # save dmatrix into binary buffer
            dtest.save_binary(dtest_path)
            # save model
            bst.save_model(model_path)
            # load model and data in
            bst2 = xgb.Booster(params=param, model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)
            preds2 = bst2.predict(dtest2, ntree_limit=num_round)

        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0

        def my_logloss(preds, dtrain):
            labels = dtrain.get_label()
            return 'logloss', np.sum(
                np.log(np.where(labels, preds, 1 - preds)))

        # check whether custom evaluation metrics work
        bst = xgb.train(param, dtrain, num_round, watchlist,
                        feval=my_logloss)
        preds3 = bst.predict(dtest, ntree_limit=num_round)
        assert all(preds3 == preds)

        # check whether sample_type and normalize_type work
        num_round = 50
        param['verbosity'] = 0
        param['learning_rate'] = 0.1
        param['rate_drop'] = 0.1
        preds_list = []
        for sample_type in ['uniform', 'weighted']:
            for normalize_type in ['tree', 'forest']:
                param['sample_type'] = sample_type
                param['normalize_type'] = normalize_type
                bst = xgb.train(param, dtrain, num_round, watchlist)
                preds = bst.predict(dtest, ntree_limit=num_round)
                err = self._error_rate(preds, labels)
                assert err < 0.1
                preds_list.append(preds)

        # Every pair of configurations must give different predictions.
        for ii in range(len(preds_list)):
            for jj in range(ii + 1, len(preds_list)):
                assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0

    def test_boost_from_prediction(self):
        """One round on top of a one-round margin must match two plain rounds."""
        # Re-construct dtrain here to avoid modification
        margined = xgb.DMatrix(dpath + 'agaricus.txt.train')
        bst = xgb.train({'tree_method': 'hist'}, margined, 1)
        predt_0 = bst.predict(margined, output_margin=True)
        margined.set_base_margin(predt_0)
        bst = xgb.train({'tree_method': 'hist'}, margined, 1)
        predt_1 = bst.predict(margined)

        assert np.any(np.abs(predt_1 - predt_0) > 1e-6)

        bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
        predt_2 = bst.predict(dtrain)
        assert np.all(np.abs(predt_2 - predt_1) < 1e-6)

    def test_boost_from_existing_model(self):
        """Continue training from an existing booster and check round counts."""
        X = xgb.DMatrix(dpath + 'agaricus.txt.train')
        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
        assert booster.num_boosted_rounds() == 4
        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
                            xgb_model=booster)
        assert booster.num_boosted_rounds() == 8
        booster = xgb.train({
            'updater': 'prune',
            'process_type': 'update'
        }, X, num_boost_round=4, xgb_model=booster)
        # Trees are moved for update, the rounds is reduced.  This test is
        # written for being compatible with current code (1.0.0).  If the
        # behaviour is considered sub-optimal, feel free to change.
        assert booster.num_boosted_rounds() == 4

    def run_custom_objective(self, tree_method=None):
        """Train with a custom logistic objective and custom metrics.

        Parameters
        ----------
        tree_method :
            Value stored under ``"tree_method"`` in the training parameters.
        """
        param = {
            'max_depth': 2,
            'eta': 1,
            'objective': 'reg:logistic',
            "tree_method": tree_method
        }
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 10

        def logregobj(preds, dtrain):
            labels = dtrain.get_label()
            preds = 1.0 / (1.0 + np.exp(-preds))
            grad = preds - labels
            hess = preds * (1.0 - preds)
            return grad, hess

        def evalerror(preds, dtrain):
            labels = dtrain.get_label()
            preds = 1.0 / (1.0 + np.exp(-preds))
            return 'error', float(sum(labels != (preds > 0.5))) / len(labels)

        # test custom_objective in training
        bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj,
                        feval=evalerror)
        assert isinstance(bst, xgb.core.Booster)
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = self._error_rate(preds, labels)
        assert err < 0.1

        # test custom_objective in cross-validation
        xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
               obj=logregobj, feval=evalerror)

        # test maximize parameter
        def neg_evalerror(preds, dtrain):
            labels = dtrain.get_label()
            return 'error', float(sum(labels == (preds > 0.0))) / len(labels)

        bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj,
                         neg_evalerror, maximize=True)
        preds2 = bst2.predict(dtest)
        err2 = self._error_rate(preds2, labels)
        assert err == err2

    def test_custom_objective(self):
        """Run the custom-objective scenario with the default tree method."""
        self.run_custom_objective()

    def test_multi_eval_metric(self):
        """Three eval metrics must all appear in ``evals_result``."""
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        param = {
            'max_depth': 2,
            'eta': 0.2,
            'verbosity': 1,
            'objective': 'binary:logistic'
        }
        param['eval_metric'] = ["auc", "logloss", 'error']
        evals_result = {}
        bst = xgb.train(param, dtrain, 4, watchlist,
                        evals_result=evals_result)
        assert isinstance(bst, xgb.core.Booster)
        assert len(evals_result['eval']) == 3
        assert set(evals_result['eval'].keys()) == {'auc', 'error', 'logloss'}

    def test_fpreproc(self):
        """``xgb.cv`` must accept a per-fold preprocessing callback."""
        param = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic'
        }
        num_round = 2

        def fpreproc(dtrain, dtest, param):
            label = dtrain.get_label()
            ratio = float(np.sum(label == 0)) / np.sum(label == 1)
            param['scale_pos_weight'] = ratio
            return (dtrain, dtest, param)

        xgb.cv(param, dtrain, num_round, nfold=5,
               metrics={'auc'}, seed=0, fpreproc=fpreproc)

    def test_show_stdv(self):
        """``xgb.cv`` must run with ``show_stdv=False``."""
        param = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'binary:logistic'
        }
        num_round = 2
        xgb.cv(param, dtrain, num_round, nfold=5,
               metrics={'error'}, seed=0, show_stdv=False)

    def test_feature_names_validation(self):
        """Predicting with mismatching feature names must raise ValueError."""
        X = np.random.random((10, 3))
        y = np.random.randint(2, size=(10, ))

        dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c"))
        dm2 = xgb.DMatrix(X, y)

        bst = xgb.train([], dm1)
        bst.predict(dm1)  # success
        with pytest.raises(ValueError):
            bst.predict(dm2)
        bst.predict(dm1)  # success

        bst = xgb.train([], dm2)
        bst.predict(dm2)  # success

    def test_model_binary_io(self):
        """Round-trip a model through a binary file and through a raw buffer."""
        model_path = 'test_model_binary_io.bin'
        parameters = {
            'tree_method': 'hist',
            'booster': 'gbtree',
            'scale_pos_weight': '0.5'
        }
        X = np.random.random((10, 3))
        y = np.random.random((10, ))
        dtrain = xgb.DMatrix(X, y)
        bst = xgb.train(parameters, dtrain, num_boost_round=2)
        bst.save_model(model_path)
        bst = xgb.Booster(model_file=model_path)
        os.remove(model_path)
        config = json.loads(bst.save_config())
        assert float(config['learner']['objective']['reg_loss_param']
                     ['scale_pos_weight']) == 0.5

        buf = bst.save_raw()
        from_raw = xgb.Booster()
        from_raw.load_model(buf)

        buf_from_raw = from_raw.save_raw()
        assert buf == buf_from_raw

    def test_model_json_io(self):
        """Round-trip a JSON model; the preferred encoding must be untouched."""
        loc = locale.getpreferredencoding(False)
        model_path = 'test_model_json_io.json'
        parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
        j_model = json_model(model_path, parameters)
        assert isinstance(j_model['learner'], dict)

        bst = xgb.Booster(model_file=model_path)

        bst.save_model(fname=model_path)
        with open(model_path, 'r') as fd:
            j_model = json.load(fd)
        assert isinstance(j_model['learner'], dict)

        os.remove(model_path)
        assert locale.getpreferredencoding(False) == loc

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_io_schema(self):
        """Validate saved JSON models against ``doc/model.schema`` and compare
        the objectives listed in the schema with those reported by the error
        message for an unknown objective."""
        import jsonschema
        model_path = 'test_json_schema.json'
        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        doc = os.path.join(path, 'doc', 'model.schema')
        with open(doc, 'r') as fd:
            schema = json.load(fd)
        parameters = {'tree_method': 'hist', 'booster': 'gbtree'}
        jsonschema.validate(instance=json_model(model_path, parameters),
                            schema=schema)
        os.remove(model_path)

        parameters = {'tree_method': 'hist', 'booster': 'dart'}
        jsonschema.validate(instance=json_model(model_path, parameters),
                            schema=schema)
        os.remove(model_path)

        # An unknown objective must raise; requiring the error explicitly
        # keeps the comparison below from being skipped silently.
        with pytest.raises(ValueError) as exc_info:
            xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
        e_str = str(exc_info.value)
        beg = e_str.find('Objective candidate')
        end = e_str.find('Stack trace')
        e_str = e_str[beg:end]
        e_str = e_str.strip()
        splited = e_str.splitlines()
        objectives = {s.split(': ')[1] for s in splited}
        j_objectives = schema['properties']['learner']['properties'][
            'objective']['oneOf']
        objectives_from_schema = {
            j_obj['properties']['name']['const'] for j_obj in j_objectives
        }
        assert objectives == objectives_from_schema

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_dump_schema(self):
        """Validate every JSON tree dump against ``doc/dump.schema``."""
        import jsonschema

        def validate_model(parameters):
            X = np.random.random((100, 30))
            y = np.random.randint(0, 4, size=(100, ))

            parameters['num_class'] = 4
            m = xgb.DMatrix(X, y)

            booster = xgb.train(parameters, m)
            dump = booster.get_dump(dump_format='json')

            for tree in dump:
                jsonschema.validate(instance=json.loads(tree),
                                    schema=schema)

        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        doc = os.path.join(path, 'doc', 'dump.schema')
        with open(doc, 'r') as fd:
            schema = json.load(fd)

        parameters = {
            'tree_method': 'hist',
            'booster': 'gbtree',
            'objective': 'multi:softmax'
        }
        validate_model(parameters)

        parameters = {
            'tree_method': 'hist',
            'booster': 'dart',
            'objective': 'multi:softmax'
        }
        validate_model(parameters)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_attributes(self):
        """``best_ntree_limit`` must survive a save/load of the classifier."""
        from sklearn.datasets import load_iris
        X, y = load_iris(return_X_y=True)
        cls = xgb.XGBClassifier(n_estimators=2)
        cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
        assert cls.get_booster().best_ntree_limit == 2
        assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cls.json")
            cls.save_model(path)

            cls = xgb.XGBClassifier(n_estimators=2)
            cls.load_model(path)
            assert cls.get_booster().best_ntree_limit == 2
            assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.parametrize('booster', ['gbtree', 'dart'])
    def test_slice(self, booster):
        """Exercise ``Booster.__getitem__`` slicing, its error cases, and the
        agreement between sliced models and ``iteration_range`` prediction."""
        from sklearn.datasets import make_classification
        num_classes = 3
        X, y = make_classification(n_samples=1000, n_informative=5,
                                   n_classes=num_classes)
        dtrain = xgb.DMatrix(data=X, label=y)
        num_parallel_tree = 4
        num_boost_round = 16
        total_trees = num_parallel_tree * num_classes * num_boost_round
        booster = xgb.train({
            'num_parallel_tree': num_parallel_tree,
            'subsample': 0.5,
            'num_class': num_classes,
            'booster': booster,
            'objective': 'multi:softprob'
        }, num_boost_round=num_boost_round, dtrain=dtrain)
        assert len(booster.get_dump()) == total_trees
        beg = 3
        end = 7
        sliced: xgb.Booster = booster[beg:end]

        sliced_trees = (end - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced_trees = sliced_trees // 2
        sliced = booster[beg:end:2]
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[beg:...]
        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[beg:]
        sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[...:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        with pytest.raises(ValueError, match=r'>= 0'):
            booster[-1:0]

        # we do not accept empty slice.
        with pytest.raises(ValueError):
            booster[1:1]
        # stop can not be smaller than begin
        with pytest.raises(ValueError, match=r'Invalid.*'):
            booster[3:0]
        with pytest.raises(ValueError, match=r'Invalid.*'):
            booster[3:-1]
        # negative step is not supported.
        with pytest.raises(ValueError, match=r'.*>= 1.*'):
            booster[0:2:-1]
        # step can not be 0.
        with pytest.raises(ValueError, match=r'.*>= 1.*'):
            booster[0:2:0]

        trees = [tree for tree in booster]
        assert len(trees) == num_boost_round

        with pytest.raises(TypeError):
            booster["wrong type"]
        with pytest.raises(IndexError):
            booster[:num_boost_round + 1]
        with pytest.raises(ValueError):
            booster[1, 2]  # too many dims
        # setitem is not implemented as model is immutable during slicing.
        with pytest.raises(TypeError):
            booster[...:end] = booster

        sliced_0 = booster[1:3]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(1, 3)),
            sliced_0.predict(dtrain))
        sliced_1 = booster[3:7]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(3, 7)),
            sliced_1.predict(dtrain))

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5  # base score.
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

        sliced_0 = booster[1:7:2]  # 1,3,5
        sliced_1 = booster[2:8:2]  # 2,4,6

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_feature_info(self):
        """Feature names and types must survive training and a save/load."""
        import pandas as pd
        rows = 100
        cols = 10
        X = rng.randn(rows, cols)
        y = rng.randn(rows)
        feature_names = ["test_feature_" + str(i) for i in range(cols)]
        X_pd = pd.DataFrame(X, columns=feature_names)
        # Replace the whole column (rather than assigning through ``iloc``)
        # so its dtype really becomes integer on every pandas version, and
        # use the builtin ``int`` because the ``np.int`` alias no longer
        # exists in current NumPy.
        X_pd[feature_names[3]] = X_pd.iloc[:, 3].astype(int)

        Xy = xgb.DMatrix(X_pd, y)
        assert Xy.feature_types[3] == "int"
        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)

        assert booster.feature_names == Xy.feature_names
        assert booster.feature_names == feature_names
        assert booster.feature_types == Xy.feature_types

        with tempfile.TemporaryDirectory() as tmpdir:
            # Join with a separator so the file lands inside ``tmpdir`` and
            # is removed together with it.
            path = os.path.join(tmpdir, "model.json")
            booster.save_model(path)
            booster = xgb.Booster()
            booster.load_model(path)

            assert booster.feature_names == Xy.feature_names
            assert booster.feature_types == Xy.feature_types
class TestGPUPredict:
    """Compare GPU predictor output with the CPU predictor and with DMatrix
    based prediction."""

    def test_predict(self):
        """GPU and CPU predictors must agree on margins for several shapes."""
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4.  For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * (num_rows // 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * (num_rows // 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * (num_rows // 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        """Return True when no element exceeds its predecessor by 0.001 or more."""
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Repeated GPU predictions must agree with each other and with CPU."""
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """``XGBRegressor`` scores must match between CPU and GPU predictors."""
        n_rows, n_cols = 15000, 14
        tr_size = 2500
        X = np.random.rand(n_rows, n_cols)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n_cols))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        reg = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = reg.score(X_train, y_train)
        cpu_test_score = reg.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        reg = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = reg.score(X_train, y_train)
        gpu_test_score = reg.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """In-place prediction on a cupy array must match DMatrix prediction."""
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for _ in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """In-place prediction on a cudf frame must match DMatrix prediction."""
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for _ in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10),
           tm.dataset_strategy, shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        """Per-row SHAP contributions must sum to the output margin."""
        # Reject empty datasets before doing any training work.
        assume(len(dataset.y) > 0)
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10),
           tm.dataset_strategy, shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """SHAP interaction values summed over the last two axes must equal
        the output margin."""
        # Reject empty datasets before doing any training work.
        assume(len(dataset.y) > 0)
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_predict_leaf_basic(self):
        """Leaf indices from the GPU and CPU predictors must be equal."""
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train on ``dataset`` and compare CPU/GPU leaf predictions.

        Parameters
        ----------
        param :
            Training parameters; completed by ``dataset.set_params``.
        num_rounds :
            Number of boosting rounds.
        dataset :
            Object providing ``set_params`` and ``get_dmat``.
        """
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        """Leaf-prediction comparison for the ``gbtree`` booster."""
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        """Leaf-prediction comparison for the ``dart`` booster."""
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ], index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        """RMSE recomputed from predictions must match the last logged RMSE."""
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5,
                        evals=[(dtrain, 'train')], verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        # Take the square root explicitly instead of passing
        # ``squared=False``, which newer scikit-learn releases no longer
        # accept.
        rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=pred))
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)
class TestPlotting:
    """Tests for ``plot_importance``, ``to_graphviz`` and ``plot_tree``."""

    def test_plotting(self):
        """Check labels, bar counts and colours of the importance plot, and
        the return types of the tree plotting helpers."""
        m = xgb.DMatrix(dpath)
        booster = xgb.train({
            'max_depth': 2,
            'eta': 1,
            'objective': 'binary:logistic'
        }, m, num_boost_round=2)

        # Default labels.
        ax = xgb.plot_importance(booster)
        assert isinstance(ax, Axes)
        assert ax.get_title() == 'Feature importance'
        assert ax.get_xlabel() == 'F score'
        assert ax.get_ylabel() == 'Features'
        assert len(ax.patches) == 4

        # Custom labels and a single colour.
        ax = xgb.plot_importance(booster, color='r',
                                 title='t', xlabel='x', ylabel='y')
        assert isinstance(ax, Axes)
        assert ax.get_title() == 't'
        assert ax.get_xlabel() == 'x'
        assert ax.get_ylabel() == 'y'
        assert len(ax.patches) == 4
        for p in ax.patches:
            assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red

        # No labels and one colour per bar.
        ax = xgb.plot_importance(booster, color=['r', 'r', 'b', 'b'],
                                 title=None, xlabel=None, ylabel=None)
        assert isinstance(ax, Axes)
        assert ax.get_title() == ''
        assert ax.get_xlabel() == ''
        assert ax.get_ylabel() == ''
        assert len(ax.patches) == 4
        assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
        assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
        assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
        assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue

        g = xgb.to_graphviz(booster, num_trees=0)
        assert isinstance(g, Source)

        ax = xgb.plot_tree(booster, num_trees=0)
        assert isinstance(ax, Axes)

    def test_importance_plot_lim(self):
        """Check the default and user-supplied axis limits of the plot."""
        np.random.seed(1)
        dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1] * 50)
        bst = xgb.train({}, dm)
        assert len(bst.get_fscore()) == 71
        ax = xgb.plot_importance(bst)
        assert ax.get_xlim() == (0., 11.)
        assert ax.get_ylim() == (-1., 71.)

        ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71))
        assert ax.get_xlim() == (0., 5.)
        assert ax.get_ylim() == (10., 71.)

    def run_categorical(self, tree_method: str) -> None:
        """Fit a regressor on categorical data and plot its last tree.

        Parameters
        ----------
        tree_method :
            Passed through to ``XGBRegressor(tree_method=...)``.
        """
        X, y = tm.make_categorical(1000, 31, 19, onehot=False)
        reg = xgb.XGBRegressor(
            enable_categorical=True, n_estimators=10, tree_method=tree_method
        )
        reg.fit(X, y)
        trees = reg.get_booster().get_dump(dump_format="json")
        for tree in trees:
            j_tree = json.loads(tree)
            # The root is either a leaf or a split whose condition is a list.
            assert "leaf" in j_tree.keys() or isinstance(
                j_tree["split_condition"], list
            )

        # Index the last tree by the number of dumped trees; the key count of
        # the last parsed JSON node is unrelated to the tree index.
        last_tree = len(trees) - 1

        graph = xgb.to_graphviz(reg, num_trees=last_tree)
        assert isinstance(graph, Source)
        ax = xgb.plot_tree(reg, num_trees=last_tree)
        assert isinstance(ax, Axes)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self) -> None:
        """Run the categorical plotting scenario with the ``approx`` method."""
        self.run_categorical("approx")
def test_external_memory_demo():
    """Run the external-memory demo script in a subprocess."""
    demo = os.path.join(PYTHON_DEMO_DIR, 'external_memory.py')
    subprocess.check_call(['python', demo])


def test_evals_result_demo():
    """Run the evals-result demo script in a subprocess."""
    demo = os.path.join(PYTHON_DEMO_DIR, 'evals_result.py')
    subprocess.check_call(['python', demo])


@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.skipif(**tm.no_pandas())
def test_aft_demo():
    """Run the AFT survival demo, then check for and remove its model file."""
    demo = os.path.join(DEMO_DIR, 'aft_survival', 'aft_survival_demo.py')
    subprocess.check_call(['python', demo])
    produced_model = 'aft_model.json'
    assert os.path.exists(produced_model)
    os.remove(produced_model)


def test_callbacks_demo():
    """Run the callbacks demo script with plotting switched off."""
    demo = os.path.join(PYTHON_DEMO_DIR, 'callbacks.py')
    subprocess.check_call(['python', demo, '--plot=0'])


# gpu_acceleration is not tested because the covertype dataset is too large.
class TestPlotting:
    """GPU variant of the plotting tests; reuses the CPU test helper."""

    # Shared instance of the CPU test class whose helper is run below.
    cputest = tp.TestPlotting()

    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self):
        """Run the categorical plotting scenario with ``gpu_hist``."""
        cpu_suite = self.cputest
        cpu_suite.run_categorical("gpu_hist")