def test_pandas_input(self):
    self._init_ray()

    import pandas as pd
    from sklearn.calibration import CalibratedClassifierCV

    rng = np.random.RandomState(self.seed)

    kRows = 100
    kCols = 6

    X = rng.randint(low=0, high=2, size=kRows * kCols)
    X = X.reshape(kRows, kCols)

    df = pd.DataFrame(X)
    feature_names = []
    for i in range(1, kCols):
        feature_names += ["k" + str(i)]
    df.columns = ["status"] + feature_names

    target = df["status"]
    train = df.drop(columns=["status"])

    model = RayXGBClassifier()
    model.fit(train, target)

    clf_isotonic = CalibratedClassifierCV(
        model, cv="prefit", method="isotonic")
    clf_isotonic.fit(train, target)
    assert isinstance(
        clf_isotonic.calibrated_classifiers_[0].base_estimator,
        RayXGBClassifier,
    )
    self.assertTrue(
        np.allclose(np.array(clf_isotonic.classes_), np.array([0, 1])))
def test_sklearn_clone(self):
    self._init_ray()

    from sklearn.base import clone

    clf = RayXGBClassifier(n_jobs=2)
    clf.n_jobs = -1
    clone(clf)
def test_select_feature(self):
    self._init_ray()

    from sklearn.datasets import load_digits
    from sklearn.feature_selection import SelectFromModel

    digits = load_digits(n_class=2)
    y = digits["target"]
    X = digits["data"]
    cls = RayXGBClassifier()
    cls.fit(X, y)
    selector = SelectFromModel(cls, prefit=True, max_features=1)
    X_selected = selector.transform(X)
    assert X_selected.shape[1] == 1
def test_sklearn_random_state(self):
    self._init_ray()

    clf = RayXGBClassifier(random_state=402)
    assert clf.get_xgb_params()["random_state"] == 402

    clf = RayXGBClassifier(random_state=401)
    assert clf.get_xgb_params()["random_state"] == 401

    random_state = np.random.RandomState(seed=403)
    clf = RayXGBClassifier(random_state=random_state)
    assert isinstance(clf.get_xgb_params()["random_state"], int)
def test_kwargs_error(self):
    self._init_ray()

    params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
    with self.assertRaises(TypeError):
        # "n_jobs" is passed both explicitly and via **params, so the
        # constructor receives a duplicate keyword argument and raises
        # TypeError; the assert below is never reached.
        clf = RayXGBClassifier(n_jobs=1000, **params)
        assert isinstance(clf, RayXGBClassifier)
def test_stacking_classification(self):
    self._init_ray()

    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_iris
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import StackingClassifier

    X, y = load_iris(return_X_y=True)
    estimators = [
        ("gbm", RayXGBClassifier()),
        (
            "svr",
            make_pipeline(StandardScaler(), LinearSVC(random_state=42)),
        ),
    ]
    clf = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression())

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    clf.fit(X_train, y_train).score(X_test, y_test)
# Nested helper: ``X`` and ``y`` come from the enclosing test's scope,
# which is not part of this excerpt.
def train(booster, forest):
    rounds = 4
    cls = RayXGBClassifier(
        n_estimators=rounds, num_parallel_tree=forest,
        booster=booster).fit(
            X, y, eval_set=[(X, y)], early_stopping_rounds=3)

    if forest:
        assert cls.best_ntree_limit == rounds * forest
    else:
        assert cls.best_ntree_limit == 0

    # best_ntree_limit is used by default,
    # assert that under gblinear it's
    # automatically ignored due to being 0.
    cls.predict(X)
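# The driver that defines ``X``/``y`` and exercises ``train`` is not shown
# in this excerpt. A minimal sketch of such a driver, consistent with the
# assertions above (an assumption, not the original code), would be:
#
#     X, y = load_iris(return_X_y=True)
#     num_parallel_tree = 4
#     train("gbtree", num_parallel_tree)  # forest: limit == rounds * forest
#     train("dart", num_parallel_tree)
#     train("gblinear", None)             # no trees: limit stays 0, ignored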
def test_validation_weights_xgbclassifier(self):
    self._init_ray()

    from sklearn.datasets import make_hastie_10_2

    # prepare training and test data
    X, y = make_hastie_10_2(n_samples=2000, random_state=42)
    labels, y = np.unique(y, return_inverse=True)
    X_train, X_test = X[:1600], X[1600:]
    y_train, y_test = y[:1600], y[1600:]

    # instantiate model
    param_dist = {
        "objective": "binary:logistic",
        "n_estimators": 2,
        "random_state": 123,
    }
    clf = RayXGBClassifier(**param_dist)

    # train it using instance weights only in the training set
    weights_train = np.random.choice([1, 2], len(X_train))
    clf.fit(
        X_train,
        y_train,
        sample_weight=weights_train,
        eval_set=[(X_test, y_test)],
        eval_metric="logloss",
        verbose=False,
    )

    # evaluate logloss metric on test set *without* using weights
    evals_result_without_weights = clf.evals_result()
    logloss_without_weights = evals_result_without_weights["validation_0"][
        "logloss"]

    # now use weights for the test set
    np.random.seed(0)
    weights_test = np.random.choice([1, 2], len(X_test))
    clf.fit(
        X_train,
        y_train,
        sample_weight=weights_train,
        eval_set=[(X_test, y_test)],
        sample_weight_eval_set=[weights_test],
        eval_metric="logloss",
        verbose=False,
    )
    evals_result_with_weights = clf.evals_result()
    logloss_with_weights = evals_result_with_weights["validation_0"][
        "logloss"]

    # check that the logloss in the test set is actually different
    # when using weights than when not using them
    assert all(logloss_with_weights[i] != logloss_without_weights[i]
               for i in [0, 1])
def test_sklearn_api_gblinear(self):
    self._init_ray()

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    tr_d, te_d, tr_l, te_l = train_test_split(
        iris.data, iris.target, train_size=120)

    classifier = RayXGBClassifier(
        booster="gblinear", n_estimators=100, random_state=self.seed)
    classifier.fit(tr_d, tr_l)

    preds = classifier.predict(te_d)
    labels = te_l
    err = (sum(1 for p, l in zip(preds, labels) if p != l) * 1.0 /
           len(te_l))
    assert err < 0.5
def testClassifierNoLabelEncoder(self, n_class=2):
    self._init_ray()

    from sklearn.datasets import load_digits

    digits = load_digits(n_class=n_class)
    y = digits["target"]
    X = digits["data"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5)

    train_matrix = RayDMatrix(X_train, y_train)
    test_matrix = RayDMatrix(X_test, y_test)

    with self.assertRaisesRegex(Exception, "num_class"):
        RayXGBClassifier(**self.params).fit(train_matrix, None)

    with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
        RayXGBClassifier(**self.params).fit(
            train_matrix, None, eval_set=[(X_test, y_test)])

    with self.assertRaisesRegex(Exception,
                                r"must be \(array_like, array_like\)"):
        RayXGBClassifier(**self.params).fit(
            X_train, y_train, eval_set=[(test_matrix, "eval")])

    RayXGBClassifier(num_class=n_class, **self.params).fit(
        train_matrix, None)

    clf = RayXGBClassifier(num_class=n_class, **self.params).fit(
        train_matrix, None, eval_set=[(test_matrix, "eval")])
    clf.predict(test_matrix)
    clf.predict_proba(test_matrix)
def test_kwargs_grid_search(self):
    self._init_ray()

    from sklearn.model_selection import GridSearchCV
    from sklearn import datasets

    params = {"tree_method": "hist"}
    clf = RayXGBClassifier(n_estimators=1, learning_rate=1.0, **params)
    assert clf.get_params()["tree_method"] == "hist"

    # "max_leaves" is not a default argument of XGBClassifier.
    # Check we can still do grid search over this parameter.
    search_params = {"max_leaves": range(2, 5)}
    grid_cv = GridSearchCV(clf, search_params, cv=5)
    iris = datasets.load_iris()
    grid_cv.fit(iris.data, iris.target)

    # Expect unique results for each parameter value.
    # This confirms sklearn is able to successfully update the parameter.
    means = grid_cv.cv_results_["mean_test_score"]
    assert len(means) == len(set(means))
def test_sklearn_n_jobs(self):
    self._init_ray()

    clf = RayXGBClassifier(n_jobs=1)
    assert clf.get_xgb_params()["n_jobs"] == 1

    clf = RayXGBClassifier(n_jobs=2)
    assert clf.get_xgb_params()["n_jobs"] == 2
def test_save_load_model(self):
    self._init_ray()

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model")
        self.save_load_model(model_path)

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model.json")
        self.save_load_model(model_path)

    from sklearn.datasets import load_digits

    with TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model.json")
        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        booster = xgb.train(
            {
                "tree_method": "hist",
                "objective": "binary:logistic"
            },
            dtrain=xgb.DMatrix(X, y),
            num_boost_round=4,
        )
        predt_0 = booster.predict(xgb.DMatrix(X))
        booster.save_model(model_path)

        cls = RayXGBClassifier()
        cls.load_model(model_path)

        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == 2  # binary

        predt_1 = cls.predict_proba(X)[:, 1]
        assert np.allclose(predt_0, predt_1)

        cls = xgb.XGBModel()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)
def test_sklearn_get_default_params(self):
    self._init_ray()

    from sklearn.datasets import load_digits

    digits_2class = load_digits(n_class=2)
    X = digits_2class["data"]
    y = digits_2class["target"]
    cls = RayXGBClassifier()
    assert cls.get_params()["base_score"] is None
    cls.fit(X[:4, ...], y[:4, ...])
    assert cls.get_params()["base_score"] is not None
def test_classification_with_custom_objective(self):
    self._init_ray()

    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    def logregobj(y_true, y_pred):
        # Binary logistic objective: gradient and hessian of the log loss
        # with respect to the raw margin.
        y_pred = 1.0 / (1.0 + np.exp(-y_pred))
        grad = y_pred - y_true
        hess = y_pred * (1.0 - y_pred)
        return grad, hess

    digits = load_digits(n_class=2)
    y = digits["target"]
    X = digits["data"]
    kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = RayXGBClassifier(objective=logregobj)
        xgb_model.fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_preds):
        raise XGBCustomObjectiveException()

    xgb_model = RayXGBClassifier(objective=dummy_objective)
    # TODO figure out how to assertRaises XGBCustomObjectiveException
    with self.assertRaises(RuntimeError):
        xgb_model.fit(X, y)
def test_estimator_type(self):
    self._init_ray()

    assert RayXGBClassifier._estimator_type == "classifier"
    assert RayXGBRFClassifier._estimator_type == "classifier"
    assert RayXGBRegressor._estimator_type == "regressor"
    assert RayXGBRFRegressor._estimator_type == "regressor"
    assert RayXGBRanker._estimator_type == "ranker"

    from sklearn.datasets import load_digits

    X, y = load_digits(n_class=2, return_X_y=True)
    cls = RayXGBClassifier(n_estimators=2).fit(X, y)
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "cls.json")
        cls.save_model(path)

        reg = RayXGBRegressor()
        with self.assertRaises(TypeError):
            reg.load_model(path)

        cls = RayXGBClassifier()
        cls.load_model(path)  # no error
def test_XGBClassifier_resume(self):
    self._init_ray()

    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import log_loss

    with TemporaryDirectory() as tempdir:
        model1_path = os.path.join(tempdir, "test_XGBClassifier.model")
        model1_booster_path = os.path.join(tempdir,
                                           "test_XGBClassifier.booster")

        X, Y = load_breast_cancer(return_X_y=True)

        model1 = RayXGBClassifier(
            learning_rate=0.3, random_state=0, n_estimators=8)
        model1.fit(X, Y)

        pred1 = model1.predict(X)
        # sklearn's log_loss expects (y_true, y_pred) in that order
        log_loss1 = log_loss(Y, pred1)

        # file name of stored xgb model
        model1.save_model(model1_path)
        model2 = RayXGBClassifier(
            learning_rate=0.3, random_state=0, n_estimators=8)
        model2.fit(X, Y, xgb_model=model1_path)

        pred2 = model2.predict(X)
        log_loss2 = log_loss(Y, pred2)

        assert np.any(pred1 != pred2)
        assert log_loss1 > log_loss2

        # file name of 'Booster' instance Xgb model
        model1.get_booster().save_model(model1_booster_path)
        model2 = RayXGBClassifier(
            learning_rate=0.3, random_state=0, n_estimators=8)
        model2.fit(X, Y, xgb_model=model1_booster_path)

        pred2 = model2.predict(X)
        log_loss2 = log_loss(Y, pred2)

        assert np.any(pred1 != pred2)
        assert log_loss1 > log_loss2
def save_load_model(self, model_path):
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(n_class=2)
    y = digits["target"]
    X = digits["data"]
    kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = RayXGBClassifier(use_label_encoder=False).fit(
            X[train_index], y[train_index])
        xgb_model.save_model(model_path)

        xgb_model = RayXGBClassifier()
        xgb_model.load_model(model_path)

        assert xgb_model.use_label_encoder is False
        assert isinstance(xgb_model.classes_, np.ndarray)
        assert isinstance(xgb_model._Booster, xgb.Booster)

        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1
        assert xgb_model.get_booster().attr("scikit_learn") is None

        # test native booster
        preds = xgb_model.predict(X[test_index], output_margin=True)
        booster = xgb.Booster(model_file=model_path)
        predt_1 = booster.predict(
            xgb.DMatrix(X[test_index]), output_margin=True)
        assert np.allclose(preds, predt_1)

        with self.assertRaises(TypeError):
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)
def test_multiclass_classification(self):
    self._init_ray()

    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        if output_margin:
            # Margin output is an (n_samples, n_classes) matrix of raw
            # scores; the predicted class is the argmax per row.
            err = sum(1 for i in range(len(preds))
                      if preds[i].argmax() != labels[i]) / float(len(preds))
        else:
            err = sum(1 for i in range(len(preds))
                      if preds[i] != labels[i]) / float(len(preds))
        assert err < 0.4

    iris = load_iris()
    y = iris["target"]
    X = iris["data"]
    kf = KFold(n_splits=2, shuffle=True, random_state=self.rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = RayXGBClassifier().fit(X[train_index], y[train_index])
        if hasattr(xgb_model.get_booster(), "num_boosted_rounds"):
            assert (xgb_model.get_booster().num_boosted_rounds() ==
                    xgb_model.n_estimators)
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(
            X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(
            X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(
            X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
        check_pred(preds2, labels, output_margin=True)
        check_pred(preds3, labels, output_margin=True)
        check_pred(preds4, labels, output_margin=False)

    cls = RayXGBClassifier(n_estimators=4).fit(X, y)
    assert cls.n_classes_ == 3
    proba = cls.predict_proba(X)
    assert proba.shape[0] == X.shape[0]
    assert proba.shape[1] == cls.n_classes_

    # custom objective, the default is multi:softprob
    # so no transformation is required.
    cls = RayXGBClassifier(
        n_estimators=4, objective=softprob_obj(3)).fit(X, y)
    proba = cls.predict_proba(X)
    assert proba.shape[0] == X.shape[0]
    assert proba.shape[1] == cls.n_classes_
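# ``softprob_obj`` above is a helper defined elsewhere in this test suite.
# For reference, a minimal sketch of such a factory (an assumption, not the
# original helper) would return a custom softmax cross-entropy objective:
#
#     def softprob_obj(classes):
#         def objective(labels, predt):
#             rows = labels.shape[0]
#             grad = np.zeros((rows, classes), dtype=float)
#             hess = np.zeros((rows, classes), dtype=float)
#             for r in range(rows):
#                 # softmax over the raw scores for one row
#                 p = np.exp(predt[r]) / np.sum(np.exp(predt[r]))
#                 for c in range(classes):
#                     # gradient: p - 1 for the true class, p otherwise
#                     grad[r, c] = p[c] - 1.0 if c == labels[r] else p[c]
#                     hess[r, c] = max(2.0 * p[c] * (1.0 - p[c]), 1e-6)
#             return grad.flatten(), hess.flatten()
#         return objective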
def test_parameters_access(self):
    self._init_ray()

    from sklearn import datasets

    params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
    clf = RayXGBClassifier(n_estimators=1000, **params)
    assert clf.get_params()["updater"] == "grow_gpu_hist"
    assert clf.get_params()["subsample"] == 0.5
    assert clf.get_params()["n_estimators"] == 1000

    clf = RayXGBClassifier(n_estimators=1, nthread=4)
    X, y = datasets.load_iris(return_X_y=True)
    clf.fit(X, y)

    config = json.loads(clf.get_booster().save_config())
    assert int(config["learner"]["generic_param"]["nthread"]) == 4

    clf.set_params(nthread=16)
    config = json.loads(clf.get_booster().save_config())
    assert int(config["learner"]["generic_param"]["nthread"]) == 16

    clf.predict(X)
    config = json.loads(clf.get_booster().save_config())
    assert int(config["learner"]["generic_param"]["nthread"]) == 16
def run_boost_from_prediction(self, tree_method):
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)
    model_0 = RayXGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
    )
    model_0.fit(X=X, y=y)
    margin = model_0.predict(X, output_margin=True)

    model_1 = RayXGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = model_1.predict(X, base_margin=margin)

    cls_2 = RayXGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=8,
        tree_method=tree_method,
    )
    cls_2.fit(X=X, y=y)
    predictions_2 = cls_2.predict(X)

    # Boosting 4 more rounds on top of model_0's margin should match
    # training a single 8-round model from scratch.
    assert np.all(predictions_1 == predictions_2)
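# The test method that drives this helper is not part of this excerpt;
# presumably it invokes the helper once per tree method under test, e.g.
# (an assumption, not the original code):
#
#     def test_boost_from_prediction(self):
#         self._init_ray()
#         self.run_boost_from_prediction("hist")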