예제 #1
0
def test_multiclass_classifier(loop):  # noqa
    """Compare dask-xgboost against plain xgboost on the iris dataset.

    Two pairs of classifiers are trained: one pair on array inputs
    (numpy vs. dask array) and one pair on frame inputs (pandas vs. dask
    dataframe).  For each pair, ``predict`` and ``predict_proba`` of the
    plain and the dask model are compared with ``da.utils.assert_eq``.
    """
    dataset = load_iris()
    features, target = dataset.data, dataset.target

    # array-backed inputs
    dask_features = da.from_array(features, 5)
    dask_target = da.from_array(target, 5)

    # frame-backed inputs
    frame = pd.DataFrame(features, columns=dataset.feature_names)
    target_series = pd.Series(target, name="target")
    dask_frame = dd.from_pandas(frame, 2)
    dask_target_series = dd.from_pandas(target_series, 2)

    # one plain and one dask model per input flavour
    array_local = xgb.XGBClassifier()
    array_dask = dxgb.XGBClassifier()
    frame_local = xgb.XGBClassifier()
    frame_dask = dxgb.XGBClassifier()

    with cluster() as (s, [_, _]), Client(s["address"], loop=loop):
        # fit
        array_local.fit(features, target)
        array_dask.fit(dask_features, dask_target, classes=[0, 1, 2])
        frame_local.fit(frame, target_series)
        frame_dask.fit(dask_frame, dask_target_series, classes=[0, 1, 2])

        # check: (plain model, dask model, plain input, dask input)
        pairs = (
            (array_local, array_dask, features, dask_features),
            (frame_local, frame_dask, frame, dask_frame),
        )
        for local_clf, dask_clf, local_in, dask_in in pairs:
            for method in ("predict", "predict_proba"):
                da.utils.assert_eq(
                    getattr(local_clf, method)(local_in),
                    getattr(dask_clf, method)(dask_in),
                )
예제 #2
0
async def test_predict_proba(c, s, a, b):
    """Check ``predict_proba`` on in-memory data against the raw booster.

    A classifier is fitted on dask data, then ``predict_proba`` is called
    on the locally computed copy of that data and must equal what the
    awaited booster predicts for an ``xgb.DMatrix`` built from the same
    local data.  The check runs once for a dask array and once for a dask
    dataframe.
    """

    async def check(train_X, train_y, local_X):
        # Fit on the dask collection, predict on the concrete local data.
        model = dxgb.XGBClassifier()
        model.fit(train_X, train_y, classes=[0, 1])
        trained = await model._Booster

        actual = model.predict_proba(local_X)
        wanted = trained.predict(xgb.DMatrix(local_X))
        np.testing.assert_array_equal(actual, wanted)

    X = da.random.random((50, 2), chunks=25)
    y = da.random.randint(0, 2, size=50, chunks=25)

    # array
    local_array = await c.compute(X)
    await check(X, y, local_array)

    # dataframe
    XX = dd.from_dask_array(X, columns=['A', 'B'])
    yy = dd.from_dask_array(y)
    local_frame = await c.compute(XX)
    await check(XX, yy, local_frame)
예제 #3
0
def test_classifier_multi(kind, loop):  # noqa: F811
    """Check output shapes of a 3-class ``multi:softprob`` classifier.

    ``kind`` selects the input flavour: ``"array"`` builds dask arrays
    from the outer-scope ``X`` and a fixed label vector; any other value
    builds dask dataframes from the outer-scope ``df`` and ``labels``.
    Both ``predict`` and ``predict_proba`` must return dask collections
    whose computed shapes are ``(10,)`` and ``(10, 3)`` respectively.
    """
    use_arrays = kind == "array"

    if use_arrays:
        X2 = da.from_array(X, 5)
        y2 = da.from_array(np.array([0, 1, 2, 0, 1, 2, 0, 0, 0, 1]), chunks=5)
    else:
        X2 = dd.from_pandas(df, npartitions=2)
        y2 = dd.from_pandas(labels, npartitions=2)

    with cluster() as (s, [_, _]), Client(s["address"], loop=loop):
        clf = dxgb.XGBClassifier(
            num_class=3,
            n_estimators=10,
            objective="multi:softprob",
        )
        clf.fit(X2, y2)

        # predicted classes
        predicted = clf.predict(X2)
        assert dask.is_dask_collection(predicted)
        if use_arrays:
            # the lazy shape is only asserted for dask arrays
            assert predicted.shape == (10, )
        assert predicted.compute().shape == (10, )

        # proba
        probabilities = clf.predict_proba(X2)
        assert dask.is_dask_collection(probabilities)
        if use_arrays:
            assert probabilities.shape == (10, 3)
        assert probabilities.compute().shape == (10, 3)
예제 #4
0
def test_validation_weights_xgbclassifier(loop):  # noqa
    """Check that eval-set sample weights change the reported logloss.

    The classifier is fitted twice on the same weighted training data:
    first with an unweighted evaluation set, then with
    ``sample_weight_eval_set`` supplied.  The ``logloss`` recorded for
    ``validation_0`` must differ between the two runs for both boosting
    rounds (``n_estimators`` is 2).
    """
    from sklearn.datasets import make_hastie_10_2

    # prepare training and test data; y is replaced by integer indices
    # into its sorted unique values
    X, y = make_hastie_10_2(n_samples=2000, random_state=42)
    _, y = np.unique(y, return_inverse=True)

    param_dist = {
        "objective": "binary:logistic",
        "n_estimators": 2,
        "random_state": 123,
    }

    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop):
            X_train, X_test = X[:1600], X[1600:]
            y_train, y_test = y[:1600], y[1600:]

            dX_train = da.from_array(X_train)
            dy_train = da.from_array(y_train)

            # instantiate model
            clf = dxgb.XGBClassifier(**param_dist)

            # train it using instance weights only in the training set.
            # The weights come from a dedicated seeded generator so the
            # test is reproducible; this does not touch the global NumPy
            # random state that is seeded for the test weights below.
            train_rng = np.random.RandomState(0)
            weights_train = train_rng.choice([1, 2], len(X_train))
            weights_train = da.from_array(weights_train)
            clf.fit(
                dX_train,
                dy_train,
                sample_weight=weights_train,
                eval_set=[(X_test, y_test)],
                eval_metric="logloss",
            )

            # evaluate logloss metric on test set *without* using weights
            evals_result_without_weights = clf.evals_result()
            logloss_without_weights = evals_result_without_weights[
                "validation_0"]["logloss"]

            # now use weights for the test set
            np.random.seed(0)
            weights_test = np.random.choice([1, 2], len(X_test))
            clf.fit(
                dX_train,
                dy_train,
                sample_weight=weights_train,
                eval_set=[(X_test, y_test)],
                sample_weight_eval_set=[weights_test],
                eval_metric="logloss",
            )
            evals_result_with_weights = clf.evals_result()
            logloss_with_weights = evals_result_with_weights["validation_0"][
                "logloss"]

    # check that the logloss in the test set is actually different
    # when using weights than when not using them
    assert all(logloss_with_weights[i] != logloss_without_weights[i]
               for i in range(2))
예제 #5
0
    def __init__(self, client, random_seed=42, n_jobs=20, verbose=True):
        """Set up the model wrapper around a dask-xgboost classifier.

        ``random_seed``, ``n_jobs`` and ``verbose`` are forwarded
        positionally to the parent initializer; ``client`` is stored on
        the instance as ``self.client``.
        """
        super(DaskModel, self).__init__(random_seed, n_jobs, verbose)

        # Handle supplied by the caller, kept for later use
        self.client = client

        # Estimator plus the helpers used to split data and score it
        self.model = dask_xgboost.XGBClassifier()
        self.tts = dask_ml.model_selection.train_test_split
        self.scoring = DaskModel._acc_score
예제 #6
0
def test_classifier_different_chunks(loop):  # noqa
    """Fitting must raise ``ValueError`` when X and y are chunked differently.

    The outer-scope ``X`` is wrapped with chunk size 5 and ``y`` with
    chunk size 4 before being passed to ``fit``.
    """
    with cluster() as (s, [_, _]), Client(s["address"], loop=loop):
        clf = dxgb.XGBClassifier()
        features = da.from_array(X, 5)
        target = da.from_array(y, 4)

        with pytest.raises(ValueError):
            clf.fit(features, target)
예제 #7
0
def test_classifier_early_stopping(loop):  # noqa
    """Check ``best_score`` of classifiers fitted with early stopping.

    Three classifiers are fitted on the two-class digits data with AUC as
    the evaluation metric on a held-out split.  Stopping after 5 or 4
    stale rounds must give the same, non-perfect ``best_score``; stopping
    after 10 stale rounds must reach a ``best_score`` of exactly 1.
    """
    # data
    # n_class is passed by keyword: load_digits' parameters are
    # keyword-only in current scikit-learn releases, so load_digits(2)
    # raises a TypeError there.
    digits = load_digits(n_class=2)
    X = digits["data"]
    y = digits["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dX_train = da.from_array(X_train)
    dy_train = da.from_array(y_train)

    def fit_with_early_stopping(clf, rounds):
        # Every fit differs only in the early-stopping patience.
        clf.fit(
            dX_train,
            dy_train,
            early_stopping_rounds=rounds,
            eval_metric="auc",
            eval_set=[(X_test, y_test)],
        )

    clf1 = dxgb.XGBClassifier()
    clf2 = dxgb.XGBClassifier()
    clf3 = dxgb.XGBClassifier()
    with cluster() as (s, [_, _]):
        with Client(s["address"], loop=loop):
            fit_with_early_stopping(clf1, 5)
            fit_with_early_stopping(clf2, 4)

            # should be the same
            assert clf1.best_score == clf2.best_score
            assert clf1.best_score != 1

            # check overfit
            fit_with_early_stopping(clf3, 10)
            assert clf3.best_score == 1
예제 #8
0
def test_classifier_evals_result(loop):  # noqa
    """Compare ``evals_result`` of the dask and the plain classifier.

    Both classifiers are fitted on the outer-scope ``X`` and ``y`` with
    ``rmse`` as the evaluation metric and the training data itself as the
    evaluation set; their recorded evaluation histories must be equal.
    """
    with cluster() as (s, [_, _]), Client(s["address"], loop=loop):
        dask_clf = dxgb.XGBClassifier()
        dask_X = da.from_array(X, 5)
        dask_y = da.from_array(y, 5)
        dask_clf.fit(dask_X, dask_y, eval_metric="rmse", eval_set=[(X, y)])
        dask_history = dask_clf.evals_result()

    # reference run with plain xgboost, outside the cluster
    local_clf = xgb.XGBClassifier()
    local_clf.fit(X, y, eval_metric="rmse", eval_set=[(X, y)])
    local_history = local_clf.evals_result()

    assert_eq(dask_history, local_history)
예제 #9
0
def test_classifier(loop):  # noqa
    """Compare a dask-fitted classifier with a plain xgboost classifier.

    Both are fitted on the outer-scope ``X`` and ``y``; their feature
    importances must be almost equal and their predictions equal.
    """
    with cluster() as (s, [_, _]), Client(s['address'], loop=loop):
        dask_clf = dxgb.XGBClassifier()
        dask_X = da.from_array(X, 5)
        dask_y = da.from_array(y, 5)
        dask_clf.fit(dask_X, dask_y)
        dask_pred = dask_clf.predict(dask_X)

    # reference model trained on the in-memory data
    local_clf = xgb.XGBClassifier()
    local_clf.fit(X, y)

    np.testing.assert_array_almost_equal(
        dask_clf.feature_importances_,
        local_clf.feature_importances_,
    )
    assert_eq(dask_pred, local_clf.predict(X))
예제 #10
0
def test_classifier(loop):  # noqa
    """Compare a dask-fitted classifier with plain xgboost on digits data.

    Both classifiers are fitted on the two-class digits dataset; their
    feature importances must be almost equal and their predictions equal.
    """
    # n_class is passed by keyword: load_digits' parameters are
    # keyword-only in current scikit-learn releases, so load_digits(2)
    # raises a TypeError there.
    digits = load_digits(n_class=2)
    X = digits["data"]
    y = digits["target"]

    with cluster() as (s, [_, _]):
        with Client(s["address"], loop=loop):
            dask_clf = dxgb.XGBClassifier()
            X2 = da.from_array(X)
            y2 = da.from_array(y)
            dask_clf.fit(X2, y2)
            p1 = dask_clf.predict(X2)

    # reference model trained on the in-memory data
    local_clf = xgb.XGBClassifier()
    local_clf.fit(X, y)
    np.testing.assert_array_almost_equal(dask_clf.feature_importances_,
                                         local_clf.feature_importances_)
    assert_eq(p1, local_clf.predict(X))