예제 #1
0
    def test_parameters_access(self):
        """Check parameter round-tripping on ``RayXGBClassifier``.

        Constructor keyword arguments must be reported by ``get_params()``,
        and the ``nthread`` value given at construction or via
        ``set_params()`` must show up in the booster's saved configuration,
        including after a ``predict`` call.
        """
        self._init_ray()

        from sklearn import datasets

        def booster_nthread(model):
            # Read ``nthread`` back from the booster's JSON configuration.
            saved = json.loads(model.get_booster().save_config())
            return int(saved["learner"]["generic_param"]["nthread"])

        extra = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
        clf = RayXGBClassifier(n_estimators=1000, **extra)
        expected_params = (
            ("updater", "grow_gpu_hist"),
            ("subsample", 0.5),
            ("n_estimators", 1000),
        )
        for key, expected in expected_params:
            assert clf.get_params()[key] == expected

        clf = RayXGBClassifier(n_estimators=1, nthread=4)
        X, y = datasets.load_iris(return_X_y=True)
        clf.fit(X, y)
        assert booster_nthread(clf) == 4

        # Changing the parameter after fitting must reach the booster ...
        clf.set_params(nthread=16)
        assert booster_nthread(clf) == 16

        # ... and must still be in place after running a prediction.
        clf.predict(X)
        assert booster_nthread(clf) == 16
예제 #2
0
    def test_pandas_input(self):
        """Fit on a pandas DataFrame and wrap the model in a calibrator.

        A random 0/1 integer frame is built; its first column ("status") is
        the target and the remaining columns are the features. The prefit
        classifier is then passed to ``CalibratedClassifierCV`` and the
        calibrated wrapper is checked for the wrapped estimator type and
        the class labels ``[0, 1]``.
        """
        self._init_ray()

        import pandas as pd
        from sklearn.calibration import CalibratedClassifierCV

        n_rows, n_cols = 100, 6
        rng = np.random.RandomState(self.seed)
        raw = rng.randint(low=0, high=2, size=n_rows * n_cols)

        frame = pd.DataFrame(raw.reshape(n_rows, n_cols))
        frame.columns = ["status"] + ["k" + str(i) for i in range(1, n_cols)]

        labels = frame["status"]
        features = frame.drop(columns=["status"])

        model = RayXGBClassifier()
        model.fit(features, labels)

        calibrated = CalibratedClassifierCV(
            model, cv="prefit", method="isotonic")
        calibrated.fit(features, labels)

        wrapped = calibrated.calibrated_classifiers_[0].base_estimator
        assert isinstance(wrapped, RayXGBClassifier)

        found_classes = np.array(calibrated.classes_)
        self.assertTrue(np.allclose(found_classes, np.array([0, 1])))
예제 #3
0
    def test_validation_weights_xgbclassifier(self):
        """Check that weighting the eval set changes the reported logloss.

        The same classifier is fitted twice on identical, weighted training
        data: first with an unweighted evaluation set, then with
        ``sample_weight_eval_set`` supplied. The per-round ``logloss``
        recorded for ``validation_0`` must differ between the two runs.
        """
        self._init_ray()

        from sklearn.datasets import make_hastie_10_2

        # Draw all weights from a private, seeded generator so the test is
        # reproducible and does not reseed NumPy's global RNG as a side
        # effect on other tests.
        rng = np.random.RandomState(self.seed)

        # prepare training and test data; np.unique re-encodes the labels
        # as indices into their sorted unique values
        X, y = make_hastie_10_2(n_samples=2000, random_state=42)
        _, y = np.unique(y, return_inverse=True)
        X_train, X_test = X[:1600], X[1600:]
        y_train, y_test = y[:1600], y[1600:]

        # instantiate model
        param_dist = {
            "objective": "binary:logistic",
            "n_estimators": 2,
            "random_state": 123,
        }
        clf = RayXGBClassifier(**param_dist)

        # train it using instance weights only in the training set
        weights_train = rng.choice([1, 2], len(X_train))
        clf.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            eval_metric="logloss",
            verbose=False,
        )

        # logloss on the test set evaluated *without* weights
        logloss_without_weights = clf.evals_result()["validation_0"][
            "logloss"]

        # now use weights for the test set as well
        weights_test = rng.choice([1, 2], len(X_test))
        clf.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            sample_weight_eval_set=[weights_test],
            eval_metric="logloss",
            verbose=False,
        )
        logloss_with_weights = clf.evals_result()["validation_0"]["logloss"]

        # check that the logloss in the test set is actually different
        # when using weights than when not using them, for every round
        assert all(logloss_with_weights[i] != logloss_without_weights[i]
                   for i in range(param_dist["n_estimators"]))
예제 #4
0
    def test_sklearn_get_default_params(self):
        """``base_score`` is ``None`` before fitting and set afterwards."""
        self._init_ray()

        from sklearn.datasets import load_digits

        bunch = load_digits(n_class=2)
        # Four samples are enough; only the parameter state is inspected.
        few_X = bunch["data"][:4]
        few_y = bunch["target"][:4]

        clf = RayXGBClassifier()
        before_fit = clf.get_params()["base_score"]
        assert before_fit is None

        clf.fit(few_X, few_y)
        after_fit = clf.get_params()["base_score"]
        assert after_fit is not None
예제 #5
0
    def test_select_feature(self):
        """A fitted classifier can drive ``SelectFromModel`` selection.

        With ``max_features=1`` the transformed matrix must keep exactly
        one column.
        """
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.feature_selection import SelectFromModel

        bunch = load_digits(n_class=2)
        features, targets = bunch["data"], bunch["target"]

        fitted = RayXGBClassifier()
        fitted.fit(features, targets)

        # prefit=True: the selector reuses the already trained model.
        reduced = SelectFromModel(
            fitted, prefit=True, max_features=1).transform(features)
        assert reduced.shape[1] == 1
예제 #6
0
    def test_sklearn_api_gblinear(self):
        """Train a ``gblinear`` classifier on iris and check its error rate.

        The iris data is split into 120 training samples and the remainder
        for testing; the misclassification rate on the held-out part must
        stay below 0.5.
        """
        self._init_ray()

        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split

        iris = load_iris()
        # Seed the split with the same seed as the model so that a failure
        # can be reproduced with the exact same train/test partition.
        tr_d, te_d, tr_l, te_l = train_test_split(
            iris.data,
            iris.target,
            train_size=120,
            random_state=self.seed,
        )

        classifier = RayXGBClassifier(
            booster="gblinear", n_estimators=100, random_state=self.seed)
        classifier.fit(tr_d, tr_l)

        preds = classifier.predict(te_d)
        n_wrong = sum(1 for pred, label in zip(preds, te_l) if pred != label)
        err = n_wrong / float(len(te_l))
        assert err < 0.5
예제 #7
0
    def run_boost_from_prediction(self, tree_method):
        """Compare boosting from a base margin with plain training.

        A 4-round model produces margins; a second 4-round model is trained
        and evaluated on top of those margins. Its predictions must equal
        those of a single model trained for 8 rounds.

        Args:
            tree_method: Value forwarded as ``tree_method`` to every
                ``RayXGBClassifier`` built here.
        """
        from sklearn.datasets import load_breast_cancer

        def build(n_rounds):
            # All models share every setting except the number of rounds.
            return RayXGBClassifier(
                learning_rate=0.3,
                random_state=0,
                n_estimators=n_rounds,
                tree_method=tree_method,
            )

        X, y = load_breast_cancer(return_X_y=True)

        first_stage = build(4)
        first_stage.fit(X=X, y=y)
        margin = first_stage.predict(X, output_margin=True)

        second_stage = build(4)
        second_stage.fit(X=X, y=y, base_margin=margin)
        staged_predictions = second_stage.predict(X, base_margin=margin)

        single_run = build(8)
        single_run.fit(X=X, y=y)
        direct_predictions = single_run.predict(X)

        assert np.all(staged_predictions == direct_predictions)
예제 #8
0
    def test_classification_with_custom_objective(self):
        """Train with a user-supplied objective and verify it is called.

        First a hand-written logistic objective is used in a 2-fold split
        and the error rate on each held-out fold must stay below 0.1. Then
        an objective that always raises is used to show that fitting
        actually invokes the custom callable.
        """
        self._init_ray()

        from sklearn.datasets import load_digits
        from sklearn.model_selection import KFold

        def logregobj(y_true, y_pred):
            # Gradient and hessian of the logistic loss w.r.t. the margin.
            prob = 1.0 / (1.0 + np.exp(-y_pred))
            return prob - y_true, prob * (1.0 - prob)

        bunch = load_digits(n_class=2)
        X, y = bunch["data"], bunch["target"]

        folds = KFold(n_splits=2, shuffle=True, random_state=self.rng)
        for train_index, test_index in folds.split(X, y):
            xgb_model = RayXGBClassifier(objective=logregobj)
            xgb_model.fit(X[train_index], y[train_index])
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            n_wrong = sum(1 for pred, label in zip(preds, labels)
                          if int(pred > 0.5) != label)
            err = n_wrong / float(len(preds))
            assert err < 0.1

        # Test that the custom objective function is actually used
        class XGBCustomObjectiveException(Exception):
            pass

        def dummy_objective(y_true, y_preds):
            raise XGBCustomObjectiveException()

        failing_model = RayXGBClassifier(objective=dummy_objective)
        # TODO figure out how to assertRaises XGBCustomObjectiveException
        with self.assertRaises(RuntimeError):
            failing_model.fit(X, y)
예제 #9
0
    def test_XGBClassifier_resume(self):
        """Resume training from a saved model file and expect improvement.

        A model is trained for 8 rounds and saved twice: once through the
        sklearn wrapper's ``save_model`` and once through the underlying
        booster's ``save_model``. For each saved file a fresh classifier is
        fitted with ``xgb_model`` pointing at it; its predictions must
        differ from the original ones and its log loss must be lower.
        """
        self._init_ray()

        from sklearn.datasets import load_breast_cancer
        from sklearn.metrics import log_loss

        def new_model():
            # Every model in this test shares the same hyper-parameters.
            return RayXGBClassifier(
                learning_rate=0.3, random_state=0, n_estimators=8)

        with TemporaryDirectory() as tempdir:
            model1_path = os.path.join(tempdir, "test_XGBClassifier.model")
            model1_booster_path = os.path.join(tempdir,
                                               "test_XGBClassifier.booster")

            X, Y = load_breast_cancer(return_X_y=True)

            model1 = new_model()
            model1.fit(X, Y)

            pred1 = model1.predict(X)
            # log_loss expects (y_true, y_pred); passing the ground truth
            # first makes the label set come from Y, not from predictions.
            log_loss1 = log_loss(Y, pred1)

            # Exercise both ways of persisting the first model: via the
            # sklearn wrapper and via the raw 'Booster' instance.
            checkpoints = (
                (model1.save_model, model1_path),
                (model1.get_booster().save_model, model1_booster_path),
            )
            for save, path in checkpoints:
                save(path)

                model2 = new_model()
                model2.fit(X, Y, xgb_model=path)

                pred2 = model2.predict(X)
                log_loss2 = log_loss(Y, pred2)

                assert np.any(pred1 != pred2)
                assert log_loss1 > log_loss2