Example #1
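All of the snippets on this page refer to module-level fixtures (X, y_class, y_regr, T, y_t_class, iris) and assertion helpers defined elsewhere in the test module and not shown here. The sketch below is one plausible reconstruction, modeled on scikit-learn's own test_weight_boosting.py; the exact fixture values and the maatpy import path are assumptions, not the original definitions.

import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# These helpers shipped in older scikit-learn releases (pre-0.22);
# newer code would use plain asserts and pytest.raises instead.
from sklearn.utils.testing import (assert_array_equal, assert_array_less,
                                   assert_equal, assert_greater,
                                   assert_raises, assert_raises_regexp)
from maatpy.classifiers import SMOTEBoost  # assumed import path

# Toy training set: two separable clusters with several samples per class,
# so that SMOTE's k_neighbors=3 has enough minority neighbours to work with.
X = [[-2, -1], [-1, -1], [-1, -2], [-2, -2], [-1, -3], [-3, -1],
     [1, 1], [1, 2], [2, 1], [2, 2], [1, 3], [3, 1]]
y_class = ["foo"] * 6 + [1] * 6  # mixed label types exercise label handling
y_regr = [-1] * 6 + [1] * 6
T = [[-1, -1], [2, 2], [3, 2]]   # held-out query points
y_t_class = ["foo", 1, 1]        # expected predictions for T

iris = datasets.load_iris()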
def test_base_estimator():
    # Test different base estimators.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = SMOTEBoost(base_estimator=RandomForestClassifier(), k_neighbors=3, random_state=0)
    clf.fit(X, y_regr)

    clf = SMOTEBoost(base_estimator=SVC(), algorithm="SAMME", k_neighbors=3, random_state=0)
    clf.fit(X, y_class)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1]] * 32
    y_fail = ["foo"] * 8 + ["bar"] * 8 + [1] * 8 + [2] * 8
    clf = SMOTEBoost(base_estimator=SVC(), k_neighbors=3, algorithm="SAMME", random_state=0)
    assert_raises_regexp(ValueError, "worse than random",
                         clf.fit, X_fail, y_fail)
Example #2
def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = SMOTEBoost().fit(X, y_t)
    assert_array_equal(clf.predict_proba(X), np.ones((len(X), 1)))
Example #3
def test_error():
    # Test that it gives proper exception on deficient input.
    assert_raises(ValueError,
                  SMOTEBoost(learning_rate=-1).fit,
                  X, y_class)

    assert_raises(ValueError,
                  SMOTEBoost(algorithm="foo").fit,
                  X, y_class)

    assert_raises(TypeError,
                  SMOTEBoost().fit,
                  X, y_class, sample_weight=np.asarray([-1]))
Example #4
def test_pickle():
    # Check picklability.
    import pickle

    # AdaBoost-style classifier
    for alg in ['SAMME', 'SAMME.R']:
        obj = SMOTEBoost(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)

        obj2 = pickle.loads(s)
        assert_equal(type(obj2), obj.__class__)
        score2 = obj2.score(iris.data, iris.target)
        assert_equal(score, score2)
Example #5
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = SMOTEBoost(algorithm=alg, random_state=0)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

        # Check we used multiple estimators
        assert_greater(len(clf.estimators_), 1)
        # Check for distinct random states (see issue #7408)
        assert_equal(len(set(est.random_state for est in clf.estimators_)),
                     len(clf.estimators_))

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
Example #6
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = SMOTEBoost(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)
Example #7
def test_imb_performance():
    from maatpy.dataset import simulate_dataset
    from sklearn.metrics import cohen_kappa_score
    from sklearn.model_selection import StratifiedShuffleSplit
    imb = simulate_dataset(n_samples=100, n_features=2, n_classes=2, weights=[0.9, 0.1], random_state=0)
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    sss.get_n_splits(imb.data, imb.target)
    # Only the last split's indices survive this loop, so the test is run
    # on a single stratified 70/30 train/test partition.
    for train_index, test_index in sss.split(imb.data, imb.target):
        X_train, X_test = imb.data[train_index], imb.data[test_index]
        y_train, y_test = imb.target[train_index], imb.target[test_index]
    adaboost = AdaBoostClassifier(random_state=0)
    adaboost.fit(X_train, y_train)
    adaboost_score = cohen_kappa_score(adaboost.predict(X_test), y_test)

    clf = SMOTEBoost(random_state=0)
    clf.fit(X_train, y_train)
    score = cohen_kappa_score(clf.predict(X_test), y_test)
    assert score >= adaboost_score, \
        "Failed with score = %f; AdaBoostClassifier score = %f" % (score, adaboost_score)
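simulate_dataset above is maatpy's own generator. Assuming it is a thin wrapper around sklearn.datasets.make_classification that returns a Bunch-style object with data and target attributes, an equivalent scikit-learn-only setup would look like this:

from sklearn.datasets import make_classification
from sklearn.utils import Bunch

# Hypothetical stand-in for maatpy's simulate_dataset: a 90/10 imbalanced
# two-feature binary problem, packaged with .data / .target attributes.
data, target = make_classification(n_samples=100, n_features=2,
                                   n_informative=2, n_redundant=0,
                                   n_classes=2, weights=[0.9, 0.1],
                                   random_state=0)
imb = Bunch(data=data, target=target)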
Example #8
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = SMOTEBoost(algorithm=alg, random_state=0)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
Example #9
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = SMOTEBoost(algorithm=alg, k_neighbors=3, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))