def test_categorical():
    """Compare native categorical training against one-hot encoded training.

    A regressor is fitted on the raw categorical codes (features flagged as
    ``"c"`` through ``feature_types``), its feature types are checked before
    and after a JSON save/load round trip, and then a second regressor is
    fitted on the one-hot version of the data.  The evaluation RMSE history
    and the predictions of both models must agree.
    """
    shape = {"n_samples": 32, "n_features": 2, "n_categories": 3}

    # --- native categorical input -------------------------------------
    X, y = tm.make_categorical(**shape, onehot=False)
    ft = ["c"] * X.shape[1]
    dense = X.values
    reg = xgb.XGBRegressor(
        tree_method="hist",
        feature_types=ft,
        max_cat_to_onehot=1,
        enable_categorical=True,
    )
    reg.fit(dense, y, eval_set=[(dense, y)])
    from_cat = reg.evals_result()["validation_0"]["rmse"]
    predt_cat = reg.predict(dense)
    assert reg.get_booster().feature_types == ft

    # Feature types must survive serialisation into a fresh estimator.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "model.json")
        reg.save_model(path)
        reg = xgb.XGBRegressor()
        reg.load_model(path)
        assert reg.feature_types == ft

    # --- one-hot encoded input ----------------------------------------
    onehot, y = tm.make_categorical(**shape, onehot=True)
    reg = xgb.XGBRegressor(tree_method="hist")
    reg.fit(onehot, y, eval_set=[(onehot, y)])
    from_enc = reg.evals_result()["validation_0"]["rmse"]
    predt_enc = reg.predict(onehot)

    np.testing.assert_allclose(from_cat, from_enc)
    np.testing.assert_allclose(predt_cat, predt_enc)
def __init__(self, categorical):
    """Generate some random data for demonstration.

    Actual data can be anything that is currently supported by XGBoost.

    When ``categorical`` is truthy, one categorical batch is generated per
    iteration via ``tm.make_categorical`` and moved to cuDF.  Otherwise a
    single random two-column frame (and a single label vector) is created
    and the same objects are repeated for every batch.
    """
    import cudf

    self.rows = self.ROWS_PER_BATCH

    if not categorical:
        rng = np.random.RandomState(1994)
        frame = cudf.DataFrame(
            {
                "a": rng.randn(self.ROWS_PER_BATCH),
                "b": rng.randn(self.ROWS_PER_BATCH),
            }
        )
        # List multiplication repeats references: all batches share one frame
        # and one label array.
        self._data = [frame] * self.BATCHES
        self._labels = [rng.randn(self.rows)] * self.BATCHES
    else:
        frames = []
        targets = []
        for _ in range(self.BATCHES):
            X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False)
            frames.append(cudf.from_pandas(X))
            targets.append(y)
        self._data = frames
        self._labels = targets

    self.it = 0  # set iterator to 0
    super().__init__()
def test_scipy_categorical(self):
    """Check that ``feature_types`` is kept for scipy sparse inputs.

    A small categorical data set is converted to a float32 array, two
    entries are set to NaN, and the matrix is then passed to ``xgb.DMatrix``
    in CSR, CSC and COO layouts in turn.  For each layout the feature types
    reported by the DMatrix must equal the ones passed in.
    """
    from scipy import sparse

    n_features = 10
    X, y = tm.make_categorical(10, n_features, n_categories=4, onehot=False)
    X = X.values.astype(np.float32)
    feature_types = ["c"] * n_features

    # ``np.NAN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
    X[1, 3] = np.nan
    X[2, 4] = np.nan

    # Each conversion starts from the previous sparse matrix
    # (dense -> csr -> csc -> coo), as the hand-unrolled version did.
    for to_sparse in (sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix):
        X = to_sparse(X)
        Xy = xgb.DMatrix(X, y, feature_types=feature_types)
        np.testing.assert_equal(
            np.array(Xy.feature_types), np.array(feature_types)
        )
def run_categorical_missing(
    self, rows: int, cols: int, cats: int, tree_method: str
) -> None:
    """Train on sparse categorical data and validate the reported RMSE.

    The data set is generated with ``sparsity=0.5``.  For each split
    strategy the training RMSE must be non-increasing and the last recorded
    value must match the RMSE recomputed from the booster's predictions.

    NOTE(review): ``rows``, ``cols`` and ``cats`` are accepted but not used;
    the data shape is fixed at 256 x 4 with 8 categories.  Confirm whether
    that is intentional.
    """
    cat, label = tm.make_categorical(
        n_samples=256, n_features=4, n_categories=8, onehot=False, sparsity=0.5
    )
    Xy = xgb.DMatrix(cat, label, enable_categorical=True)
    parameters: Dict[str, Any] = {"tree_method": tree_method}

    def run(max_cat_to_onehot: int):
        # The shared parameter dict is updated in place before each training.
        parameters["max_cat_to_onehot"] = max_cat_to_onehot

        history: Dict[str, Dict] = {}
        booster = xgb.train(
            parameters,
            Xy,
            num_boost_round=16,
            evals=[(Xy, "Train")],
            evals_result=history,
        )
        assert tm.non_increasing(history["Train"]["rmse"])

        rmse = tm.root_mean_square(label, booster.predict(Xy))
        np.testing.assert_allclose(rmse, history["Train"]["rmse"][-1])

    # Test with OHE split
    run(self.USE_ONEHOT)

    # fixme: Test with GPU.
    if tree_method != "gpu_hist":
        # Test with partition-based split
        run(self.USE_PART)
def run_split_value_histograms(self, tree_method) -> None:
    """Expect ``get_split_value_histogram`` to reject a categorical model.

    A regressor is fitted on categorical data with ``enable_categorical``;
    requesting the split value histogram for feature ``"3"`` must raise a
    ``ValueError`` whose message matches ``"doesn't"``.
    """
    features, target = tm.make_categorical(1000, 10, 13, False)

    model = xgb.XGBRegressor(tree_method=tree_method, enable_categorical=True)
    model.fit(features, target)

    with pytest.raises(ValueError, match="doesn't"):
        model.get_booster().get_split_value_histogram("3", bins=5)
def run_tree_to_df_categorical(self, tree_method: str) -> None:
    """Check the ``Category`` column of ``trees_to_dataframe`` output.

    After training on categorical data, every row of the tree dataframe
    whose ``Feature`` is not ``"Leaf"`` must have a ``Category`` entry with
    at least one element.
    """
    features, target = tm.make_categorical(100, 10, 31, False)
    dtrain = xgb.DMatrix(features, target, enable_categorical=True)
    booster = xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=10)

    nodes = booster.trees_to_dataframe()
    for _, node in nodes.iterrows():
        if node["Feature"] == "Leaf":
            continue
        assert len(node["Category"]) >= 1
def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
    """Compare training on one-hot data with native categorical training.

    The same parameters (with ``max_cat_to_onehot`` set to 9999) are used to
    train once on the externally one-hot encoded data and once on the raw
    categorical data.  The two training RMSE histories must agree within
    ``rtol=1e-3`` and the native history must be non-increasing.
    """
    onehot, label = tm.make_categorical(rows, cols, cats, True)
    cat, _ = tm.make_categorical(rows, cols, cats, False)

    # Use one-hot exclusively
    parameters = {
        "tree_method": tree_method,
        "predictor": "gpu_predictor" if tree_method == "gpu_hist" else None,
        "max_cat_to_onehot": 9999,
    }

    by_etl_results = {}
    by_builtin_results = {}
    experiments = (
        (onehot, False, by_etl_results),
        (cat, True, by_builtin_results),
    )
    for data, is_categorical, history in experiments:
        m = xgb.DMatrix(data, label, enable_categorical=is_categorical)
        xgb.train(
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
            evals_result=history,
        )

    # Tolerances are normally chosen by treating the output as a random
    # variable, but tree construction here is extremely sensitive to floating
    # point error: a 1e-5 difference in one histogram bin can yield a
    # completely different tree.  Even with this lenient tolerance, hypothesis
    # can still find falsifying examples from time to time.
    np.testing.assert_allclose(
        np.array(by_etl_results["Train"]["rmse"]),
        np.array(by_builtin_results["Train"]["rmse"]),
        rtol=1e-3,
    )
    assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
def test_np_categorical(self):
    """Check that ``feature_types`` is kept for a NumPy array input.

    Categorical data is converted to a float32 ``np.ndarray`` and handed to
    ``xgb.DMatrix`` together with all-``"c"`` feature types, which the
    DMatrix must report back unchanged.
    """
    n_features = 10
    frame, y = tm.make_categorical(10, n_features, n_categories=4, onehot=False)
    X = frame.values.astype(np.float32)
    assert isinstance(X, np.ndarray)

    feature_types = ["c"] * n_features
    Xy = xgb.DMatrix(X, y, feature_types=feature_types)

    expected = np.array(feature_types)
    np.testing.assert_equal(np.array(Xy.feature_types), expected)
def test_categorical(self):
    """Check feature types inferred from a categorical cuDF frame.

    For both ``xgb.DMatrix`` and ``xgb.DeviceQuantileDMatrix`` there must be
    one feature type per column and each must equal ``"categorical"``.
    """
    import cudf

    pd_X, pd_y = tm.make_categorical(100, 30, 17, False)
    X = cudf.from_pandas(pd_X)
    y = cudf.from_pandas(pd_y)

    for matrix_cls in (xgb.DMatrix, xgb.DeviceQuantileDMatrix):
        Xy = matrix_cls(X, y, enable_categorical=True)
        types = Xy.feature_types
        assert len(types) == X.shape[1]
        assert all(t == "categorical" for t in types)
def test_cupy_categorical(self):
    """Check that ``feature_types`` is kept for a CuPy array input.

    Categorical data is cast to float32 and wrapped as CuPy arrays before
    being given to ``xgb.DMatrix`` with all-``"c"`` feature types, which the
    DMatrix must report back unchanged.
    """
    import cupy as cp

    n_features = 10
    frame, labels = tm.make_categorical(
        10, n_features, n_categories=4, onehot=False
    )
    host_values = frame.values.astype(cp.float32)
    X = cp.asarray(host_values)
    y = cp.array(labels)
    assert isinstance(X, cp.ndarray)

    feature_types = ["c"] * n_features
    Xy = xgb.DMatrix(X, y, feature_types=feature_types)

    expected = np.array(feature_types)
    np.testing.assert_equal(np.array(Xy.feature_types), expected)
def run_categorical(self, tree_method: str) -> None:
    """Check JSON dumps and plotting for a model trained on categorical data.

    Every dumped tree must either be a bare leaf or have a list-valued
    ``split_condition`` at its root.  The last tree of the model is then
    rendered with ``xgb.to_graphviz`` and ``xgb.plot_tree``.
    """
    X, y = tm.make_categorical(1000, 31, 19, onehot=False)
    reg = xgb.XGBRegressor(
        enable_categorical=True, n_estimators=10, tree_method=tree_method
    )
    reg.fit(X, y)

    trees = reg.get_booster().get_dump(dump_format="json")
    for tree in trees:
        j_tree = json.loads(tree)
        assert "leaf" in j_tree or isinstance(j_tree["split_condition"], list)

    # The tree index used to be ``len(j_tree) - 1``, i.e. the number of keys
    # in the last parsed JSON node minus one, which only happened to be a
    # valid index.  Use the index of the last dumped tree instead.
    last_tree = len(trees) - 1

    graph = xgb.to_graphviz(reg, num_trees=last_tree)
    assert isinstance(graph, Source)

    ax = xgb.plot_tree(reg, num_trees=last_tree)
    assert isinstance(ax, Axes)
def test_cudf_categorical(self):
    """Exercise categorical cuDF inputs, including missing values.

    First, a categorical cuDF frame must yield one ``"c"`` feature type per
    column for both ``xgb.DMatrix`` and ``xgb.DeviceQuantileDMatrix``.  Then
    a single category column containing a missing value is checked: the
    transformed category codes must report nulls, constructing a matrix
    without ``enable_categorical`` must raise ``ValueError``, and with the
    flag the matrix must have 3 rows and 1 column.  The last check repeats
    this for ``xgb.DMatrix`` with the column selected as ``X["f0"]``.
    """
    import cudf

    _X, _y = tm.make_categorical(100, 30, 17, False)
    X = cudf.from_pandas(_X)
    y = cudf.from_pandas(_y)

    for matrix_cls in (xgb.DMatrix, xgb.DeviceQuantileDMatrix):
        Xy = matrix_cls(X, y, enable_categorical=True)
        assert len(Xy.feature_types) == X.shape[1]
        assert all(t == "c" for t in Xy.feature_types)

    # test missing value
    # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
    X = cudf.DataFrame({"f0": ["a", "b", np.nan]})
    X["f0"] = X["f0"].astype("category")
    _df, cat_codes, _, _ = xgb.data._transform_cudf_df(
        X, None, None, enable_categorical=True
    )
    for col in cat_codes:
        assert col.has_nulls

    y = [0, 1, 2]

    def check_requires_flag(matrix_cls, data):
        # Without the flag construction must fail; with it the shape is 3x1.
        with pytest.raises(ValueError):
            matrix_cls(data, y)
        Xy = matrix_cls(data, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1

    check_requires_flag(xgb.DMatrix, X)
    check_requires_flag(xgb.DeviceQuantileDMatrix, X)

    # Same check with the single column selected from the frame.
    check_requires_flag(xgb.DMatrix, X["f0"])
def test_shap_categorical(self):
    """Check that SHAP contributions sum to the margin on categorical data.

    A booster is trained with ``gpu_hist``.  With the predictor set first to
    ``gpu_predictor`` and then to ``cpu_predictor``, the contributions
    summed over the last axis must match the output margin within
    ``rtol=1e-3``.
    """
    X, y = tm.make_categorical(100, 20, 7, False)
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)

    for predictor in ("gpu_predictor", "cpu_predictor"):
        booster.set_param({"predictor": predictor})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        last_axis = len(shap.shape) - 1
        np.testing.assert_allclose(
            np.sum(shap, axis=last_axis), margin, rtol=1e-3
        )
def test_categorical_model_io(self):
    """Check model serialisation for a booster trained on categorical data.

    Saving to ``model.binary`` must raise a ``ValueError`` mentioning
    ``JSON/UBJSON``.  Saving to ``model.json`` and then ``model.ubj`` and
    loading the file back must reproduce the original predictions; the
    booster reloaded from JSON is the one written out as UBJSON.
    """
    X, y = tm.make_categorical(256, 16, 71, False)
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16)
    predt_0 = booster.predict(Xy)

    with tempfile.TemporaryDirectory() as tempdir:
        binary_path = os.path.join(tempdir, "model.binary")
        with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"):
            booster.save_model(binary_path)

        for fname in ("model.json", "model.ubj"):
            path = os.path.join(tempdir, fname)
            booster.save_model(path)
            # Rebinding on purpose: the next format is saved from the
            # reloaded booster.
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)
def pack(**kwargs: Any) -> dd.DataFrame:
    """Generate categorical data and attach the target as a ``"label"`` column.

    All keyword arguments are forwarded to ``tm.make_categorical``; the
    returned feature frame is modified in place and returned.
    """
    features, target = tm.make_categorical(**kwargs)
    features["label"] = target
    return features
def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
    """Compare one-hot and partition-based categorical training.

    Steps, all using the training RMSE history:

    1. With ``max_cat_to_onehot = self.USE_ONEHOT``, training on externally
       one-hot encoded data and on native categorical data must agree
       within ``rtol=1e-3``, and the native history must be non-increasing.
    2. With ``max_cat_to_onehot = self.USE_PART`` and ``reg_lambda = 0``,
       each round's RMSE must be no larger than the one from step 1's
       native run.
    3. With ``reg_lambda = 1.0`` and 32 rounds, the history must be
       non-increasing.
    """
    onehot, label = tm.make_categorical(rows, cols, cats, True)
    cat, _ = tm.make_categorical(rows, cols, cats, False)

    parameters = {
        "tree_method": tree_method,
        "predictor": "gpu_predictor" if tree_method == "gpu_hist" else None,
        # Use one-hot exclusively
        "max_cat_to_onehot": self.USE_ONEHOT,
    }

    def fit(dmatrix, history, n_rounds):
        # Reads the shared ``parameters`` dict at call time, so later updates
        # to it take effect.
        xgb.train(
            parameters,
            dmatrix,
            num_boost_round=n_rounds,
            evals=[(dmatrix, "Train")],
            evals_result=history,
        )

    by_etl_results = {}
    by_builtin_results = {}

    m = xgb.DMatrix(onehot, label, enable_categorical=False)
    fit(m, by_etl_results, rounds)

    m = xgb.DMatrix(cat, label, enable_categorical=True)
    fit(m, by_builtin_results, rounds)

    # Tolerances are normally chosen by treating the output as a random
    # variable, but tree construction here is extremely sensitive to floating
    # point error: a 1e-5 difference in one histogram bin can yield a
    # completely different tree.  Even with this lenient tolerance, hypothesis
    # can still find falsifying examples from time to time.
    np.testing.assert_allclose(
        np.array(by_etl_results["Train"]["rmse"]),
        np.array(by_builtin_results["Train"]["rmse"]),
        rtol=1e-3,
    )
    assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

    # switch to partition-based splits
    by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
    parameters["max_cat_to_onehot"] = self.USE_PART
    parameters["reg_lambda"] = 0
    m = xgb.DMatrix(cat, label, enable_categorical=True)
    fit(m, by_grouping, rounds)

    rmse_oh = by_builtin_results["Train"]["rmse"]
    rmse_group = by_grouping["Train"]["rmse"]
    # always better or equal to onehot when there's no regularization.
    for onehot_rmse, group_rmse in zip(rmse_oh, rmse_group):
        assert onehot_rmse >= group_rmse

    # Turn regularization back on and train for a fixed 32 rounds.
    parameters["reg_lambda"] = 1.0
    by_grouping = {}
    fit(m, by_grouping, 32)
    assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping