Пример #1
0
def test_use_clones():
    """Check how ``use_clones`` affects the estimators handed to the stacker.

    After fitting with ``use_clones=True`` the test expects ``clf1.predict`` to
    raise ``NotFittedError``; after fitting with ``use_clones=False`` it
    expects ``clf1.predict`` to run without error.
    """
    np.random.seed(123)
    X, y = iris_data()

    meta_clf = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()

    cloning_stack = StackingCVClassifier(classifiers=[forest, bayes],
                                         use_clones=True,
                                         meta_classifier=meta_clf,
                                         shuffle=False)
    cloning_stack.fit(X, y)

    # The original forest object must still be unfitted at this point.
    not_fitted_msg = ("This RandomForestClassifier instance is not fitted yet."
                      " Call 'fit' with appropriate arguments"
                      " before using this estimator.")
    assert_raises(exceptions.NotFittedError,
                  not_fitted_msg,
                  forest.predict,
                  X)

    in_place_stack = StackingCVClassifier(classifiers=[forest, bayes],
                                          use_probas=True,
                                          use_clones=False,
                                          meta_classifier=meta_clf,
                                          shuffle=False)
    in_place_stack.fit(X, y)

    # Without cloning, predicting with the original forest must not raise.
    forest.predict(X)
Пример #2
0
    def function_set(self):
        """Set up the parameter grid, base models, stacker and grid search.

        Everything is stored on the instance; nothing is fitted or returned.
        The same LogisticRegression object is used both as a base classifier
        and as the meta classifier.
        """
        # Parameter grid. Note: the keys must be spelled exactly like this.
        search_space = {}
        search_space["logisticregression__C"] = list(
            np.linspace(start=0.1, stop=10, num=5))
        search_space["gradientboostingclassifier__learning_rate"] = list(
            np.linspace(start=0.1, stop=1, num=10))
        search_space["randomforestclassifier__n_estimators"] = list(
            range(5, 16))
        search_space["meta-logisticregression__C"] = list(
            np.linspace(start=0.1, stop=10, num=5))
        self.__params = search_space

        # Models.
        logistic = LogisticRegression()
        boosting = GradientBoostingClassifier()
        forest = RandomForestClassifier()
        self.__lr, self.__gb, self.__rf = logistic, boosting, forest

        stacker = StackingCVClassifier(
            classifiers=[logistic, boosting, forest],
            meta_classifier=logistic,
            use_probas=True,
            cv=5,
            use_features_in_secondary=True,
            verbose=1)
        self.__sclf = stacker

        # Grid search over the stacked model with 5-fold CV and refit enabled.
        self.__grid = GridSearchCV(estimator=stacker,
                                   param_grid=search_space,
                                   cv=5,
                                   refit=True)
Пример #3
0
def test_get_params():
    """Compare the top-level names from ``get_params()`` with a fixed list.

    Only the part of each key before the first ``__`` is compared.
    """
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=1)
    bayes = GaussianNB()
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stack = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                 meta_classifier=meta)

    prefixes = {key.split('__')[0] for key in stack.get_params().keys()}
    got = sorted(prefixes)

    expect = [
        'classifiers',
        'cv',
        'drop_last_proba',
        'gaussiannb',
        'kneighborsclassifier',
        'meta_classifier',
        'n_jobs',
        'pre_dispatch',
        'random_state',
        'randomforestclassifier',
        'shuffle',
        'store_train_meta_features',
        'stratify',
        'use_clones',
        'use_features_in_secondary',
        'use_probas',
        'verbose',
    ]
    assert got == expect, got
Пример #4
0
def test_sparse_inputs_with_features_in_secondary():
    """Fit on dense and on CSR input with ``use_features_in_secondary=True``.

    In both cases the rounded training score must equal a value that depends
    on the installed scikit-learn version.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stack = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    # The expected score is the same for the dense and the sparse run.
    if Version(sklearn_version) < Version("0.21"):
        expected_value = 1.0
    else:
        expected_value = 0.99

    # dense
    stack.fit(X_train, y_train)
    assert round(stack.score(X_train, y_train), 2) == expected_value, \
        round(stack.score(X_train, y_train), 2)

    # sparse
    stack.fit(sparse.csr_matrix(X_train), y_train)
    assert round(stack.score(X_train, y_train), 2) == expected_value, \
        round(stack.score(X_train, y_train), 2)
Пример #5
0
def test_train_meta_features_():
    """Check the shape of ``train_meta_features_`` after fitting.

    With two base classifiers the test expects one row per training sample
    and two columns.
    """
    neighbors = KNeighborsClassifier()
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    bayes = GaussianNB()
    stack = StackingCVClassifier(classifiers=[neighbors, bayes],
                                 meta_classifier=meta,
                                 store_train_meta_features=True)
    X_train, _, y_train, _ = train_test_split(X_iris, y_iris, test_size=0.3)
    stack.fit(X_train, y_train)

    n_samples = X_train.shape[0]
    assert stack.train_meta_features_.shape == (n_samples, 2)
def test_verbose():
    """Smoke test: fit a stacker on the iris data with ``verbose=3``."""
    np.random.seed(123)
    meta_clf = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    stack_options = dict(use_probas=True, shuffle=False, verbose=3)
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 meta_classifier=meta_clf,
                                 **stack_options)
    stack.fit(iris.data, iris.target)
Пример #7
0
def test_verbose():
    """Smoke test: fitting with ``verbose=3`` on the iris arrays must not fail."""
    np.random.seed(123)
    meta_clf = LogisticRegression(multi_class='ovr', solver='liblinear')
    base_forest = RandomForestClassifier(n_estimators=10)
    base_bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[base_forest, base_bayes],
        use_probas=True,
        meta_classifier=meta_clf,
        shuffle=False,
        verbose=3,
    )
    stack.fit(X_iris, y_iris)
Пример #8
0
def test_no_weight_support():
    """Expect ``TypeError`` when fitting with ``sample_weight``.

    Presumably this is triggered by KNeighborsClassifier among the base
    classifiers not accepting sample weights -- confirm against the library.
    """
    weights = np.array([random.random() for _ in range(len(y_iris))])
    meta_clf = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    neighbors = KNeighborsClassifier()
    stack = StackingCVClassifier(
        classifiers=[forest, bayes, neighbors],
        meta_classifier=meta_clf,
        shuffle=False,
    )
    with pytest.raises(TypeError):
        stack.fit(X_iris, y_iris, sample_weight=weights)
def test_train_meta_features_():
    """Check that ``train_meta_features_`` has one row per sample and 2 columns."""
    neighbors = KNeighborsClassifier()
    meta = LogisticRegression()
    bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[neighbors, bayes],
        meta_classifier=meta,
        store_train_meta_features=True,
    )
    # Only the training part of the split is used below.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stack.fit(X_train, y_train)

    expected_shape = (X_train.shape[0], 2)
    assert stack.train_meta_features_.shape == expected_shape
Пример #10
0
def test_verbose():
    """Smoke test: fit with verbosity level 3 on ``iris.data`` / ``iris.target``."""
    np.random.seed(123)
    meta_model = LogisticRegression()
    first_base = RandomForestClassifier()
    second_base = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[first_base, second_base],
        use_probas=True,
        meta_classifier=meta_model,
        shuffle=False,
        verbose=3,
    )
    stack.fit(iris.data, iris.target)
Пример #11
0
def test_no_weight_support_meta():
    """Expect ``TypeError`` when fitting with ``sample_weight`` and a KNN meta model.

    Presumably the KNeighborsClassifier meta classifier is what rejects the
    weights -- confirm against the library.
    """
    weights = np.array([random.random() for _ in range(len(y_iris))])
    knn_meta = KNeighborsClassifier()
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[forest, bayes],
        meta_classifier=knn_meta,
        shuffle=False,
    )

    with pytest.raises(TypeError):
        stack.fit(X_iris, y_iris, sample_weight=weights)
def test_train_meta_features_():
    """Fit with ``store_train_meta_features=True`` and check the stored shape.

    The stored array must have as many rows as the training split and two
    columns.
    """
    knn_clf = KNeighborsClassifier()
    meta_clf = LogisticRegression()
    nb_clf = GaussianNB()
    stack = StackingCVClassifier(classifiers=[knn_clf, nb_clf],
                                 meta_classifier=meta_clf,
                                 store_train_meta_features=True)
    split = train_test_split(X, y, test_size=0.3)
    X_train, y_train = split[0], split[2]
    stack.fit(X_train, y_train)

    stored = stack.train_meta_features_
    assert stored.shape == (X_train.shape[0], 2)
 def _build_model(self, X_train, y_train):
     """Build a stacking ensemble, fit it on the training data and return it.

     Five base classifiers (KNN, random forest, logistic regression, SVC and
     XGBoost) are stacked with a GaussianNB meta classifier.
     """
     nearest = KNeighborsClassifier(n_neighbors=1)
     forest = RandomForestClassifier(max_depth=3,
                                     max_features=6,
                                     n_estimators=50,
                                     random_state=0)
     support_vec = svm.SVC(C=1.0, kernel='poly', degree=5)
     boosted = XGBClassifier(alpha=15,
                             colsample_bytree=0.1,
                             learning_rate=1,
                             max_depth=5,
                             reg_lambda=10.0)
     bayes_meta = GaussianNB()
     logistic = LogisticRegression(C=10.0,
                                   dual=False,
                                   max_iter=100,
                                   solver='lbfgs')

     base_learners = [nearest, forest, logistic, support_vec, boosted]
     stack = StackingCVClassifier(classifiers=base_learners,
                                  meta_classifier=bayes_meta,
                                  random_state=42)
     stack.fit(X_train, y_train)
     return stack
Пример #14
0
def test_predict_meta_features():
    """Fit on a training split and check the shape of ``predict`` on the test split.

    The result is expected to be one-dimensional with one entry per test
    sample.
    """
    neighbors = KNeighborsClassifier()
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    bayes = GaussianNB()
    X_train, X_test, y_train, _ = train_test_split(X_iris, y_iris,
                                                   test_size=0.3)
    # test default (class labels)
    stack = StackingCVClassifier(
        classifiers=[neighbors, bayes],
        meta_classifier=meta,
        store_train_meta_features=True,
    )
    stack.fit(X_train, y_train)

    predicted = stack.predict(X_test)
    n_test = X_test.shape[0]
    assert predicted.shape == (n_test,)
 def model_fit(self):
     """Create the base models and the stacker, then fit on the training data.

     The stacker combines extra-trees, random forest, gradient boosting and
     XGBoost with a logistic-regression meta classifier, and is fitted on the
     ``.values`` of the stored training features and labels.
     """
     extra_trees = ExtraTreesClassifier(n_jobs=-1)
     forest = RandomForestClassifier(n_jobs=-1)
     logistic = LogisticRegression()
     boosting = GradientBoostingClassifier()
     xgboost_clf = XGBClassifier(n_jobs=-1, missing=-999.0)

     self.__ef = extra_trees
     self.__rf = forest
     self.__lr = logistic
     self.__gb = boosting
     self.__xgb = xgboost_clf

     self.__sclf = StackingCVClassifier(
         classifiers=[extra_trees, forest, boosting, xgboost_clf],
         meta_classifier=logistic,
         use_probas=True,
         cv=3)

     features = self.__application_train_feature.values
     labels = self.__application_train_label.values
     self.__sclf.fit(features, labels)
def test_pandas():
    """Fit on a pandas DataFrame; tolerate a specific ``KeyError`` message.

    The test passes if ``fit`` succeeds. If ``fit`` raises ``KeyError``, the
    message must contain the expected hint text.
    """
    frame = pd.DataFrame(X)
    meta_clf = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[forest, bayes],
        use_probas=True,
        meta_classifier=meta_clf,
        shuffle=False,
        verbose=0,
    )
    expected_hint = 'are NumPy arrays. If X and y are pandas DataFrames'
    try:
        stack.fit(frame, iris.target)
    except KeyError as err:
        assert expected_hint in str(err)
def test_pandas():
    """Fit on ``X_iris`` wrapped in a DataFrame.

    A ``KeyError`` is accepted only if its message contains the expected
    hint; a successful fit also passes.
    """
    features_df = pd.DataFrame(X_iris)
    meta_clf = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack_options = dict(use_probas=True, shuffle=False, verbose=0)
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 meta_classifier=meta_clf,
                                 **stack_options)
    try:
        stack.fit(features_df, y_iris)
    except KeyError as err:
        message = str(err)
        assert 'are NumPy arrays. If X and y are pandas DataFrames' in message
def test_pandas():
    """Fit the stacker on a DataFrame built from ``X``.

    If a ``KeyError`` is raised, its text must mention the NumPy/pandas hint;
    no exception at all is also a pass.
    """
    data_frame = pd.DataFrame(X)
    meta_model = LogisticRegression()
    base_models = [RandomForestClassifier(), GaussianNB()]
    stack = StackingCVClassifier(classifiers=base_models,
                                 use_probas=True,
                                 meta_classifier=meta_model,
                                 shuffle=False,
                                 verbose=0)
    try:
        stack.fit(data_frame, iris.target)
    except KeyError as exc:
        hint = 'are NumPy arrays. If X and y are pandas DataFrames'
        assert hint in str(exc)
Пример #19
0
def test_meta_feat_reordering():
    """Check the ROC AUC of the second stored meta-feature column.

    The stacker is fitted with ``shuffle=True`` and the AUC of column 1 of
    ``train_meta_features_`` against ``y_train`` must round to 0.88.

    NOTE(review): neither the split nor the stacker receives a
    ``random_state`` here, so the exact value presumably relies on seeding
    done elsewhere -- confirm.
    """
    neighbors = KNeighborsClassifier()
    meta = LogisticRegression()
    bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[neighbors, bayes],
        meta_classifier=meta,
        shuffle=True,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)
    stack.fit(X_train, y_train)

    second_column = stack.train_meta_features_[:, 1]
    auc = roc_auc_score(y_train, second_column)
    assert round(auc, 2) == 0.88
def test_list_of_lists():
    """Fit on a plain Python list of the rows of ``X``.

    The test passes if ``fit`` succeeds. If ``fit`` raises ``TypeError``, the
    message must contain the expected hint text.
    """
    rows = list(X)
    meta_clf = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[forest, bayes],
        use_probas=True,
        meta_classifier=meta_clf,
        shuffle=False,
        verbose=0,
    )

    expected_hint = 'are NumPy arrays. If X and y are lists'
    try:
        stack.fit(rows, iris.target)
    except TypeError as err:
        assert expected_hint in str(err)
def test_list_of_lists():
    """Pass the samples as a list instead of an array.

    A ``TypeError`` is accepted only when its message carries the expected
    hint; a successful fit also passes.
    """
    samples = list(X)
    meta_model = LogisticRegression()
    base_models = [RandomForestClassifier(), GaussianNB()]
    stack = StackingCVClassifier(classifiers=base_models,
                                 use_probas=True,
                                 meta_classifier=meta_model,
                                 shuffle=False,
                                 verbose=0)

    try:
        stack.fit(samples, iris.target)
    except TypeError as exc:
        hint = 'are NumPy arrays. If X and y are lists'
        assert hint in str(exc)
Пример #22
0
 def pick_the_best_function(self):
     """Fit a stacker with fixed hyper-parameters and print its test ROC AUC.

     The AUC is computed from column 1 of ``predict_proba`` on the stored test
     set. The same LogisticRegression object serves as a base classifier and
     as the meta classifier.
     """
     logistic = LogisticRegression(C=0.1)
     boosting = GradientBoostingClassifier(learning_rate=0.1)
     forest = RandomForestClassifier(n_estimators=5)
     self.__lr, self.__gb, self.__rf = logistic, boosting, forest

     self.__sclf = StackingCVClassifier(
         classifiers=[logistic, boosting, forest],
         meta_classifier=logistic,
         use_probas=True,
         cv=5,
         use_features_in_secondary=True,
         verbose=1)
     self.__sclf.fit(self.__train, self.__train_label)

     second_column_proba = self.__sclf.predict_proba(self.__test)[:, 1]
     print(roc_auc_score(self.__test_label, second_column_proba))
def test_list_of_lists():
    """Fit on the rows of ``X_iris`` collected into a plain list.

    The only failure tolerated is a ``TypeError`` whose message contains the
    expected hint; a successful fit also passes.
    """
    row_list = list(X_iris)
    meta_clf = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    stack_options = dict(use_probas=True, shuffle=False, verbose=0)
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 meta_classifier=meta_clf,
                                 **stack_options)

    try:
        stack.fit(row_list, y_iris)
    except TypeError as err:
        message = str(err)
        assert 'are NumPy arrays. If X and y are lists' in message
Пример #24
0
def test_not_fitted():
    """Expect ``NotFittedError`` from every prediction method before ``fit``.

    ``predict``, ``predict_proba`` and ``predict_meta_features`` are each
    called on an unfitted stacker with the same expected message.
    """
    np.random.seed(123)
    meta_clf = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 use_probas=True,
                                 meta_classifier=meta_clf, shuffle=False)

    expected_message = ("This StackingCVClassifier instance is not fitted yet."
                        " Call 'fit' with appropriate arguments"
                        " before using this method.")

    # Same expectation for each method, checked in the original order.
    for method_name in ('predict', 'predict_proba', 'predict_meta_features'):
        assert_raises(NotFittedError,
                      expected_message,
                      getattr(stack, method_name),
                      iris.data)
def get_stacked_classifiers():
    """Return the base classifiers plus a stacker built on top of them.

    Returns:
        A ``(classifiers, names)`` pair of parallel lists. The entries come
        from ``get_classifiers()``; the final entry of each list is the
        stacked model and ``CLF_TYPES.StackingCVClassifier``.
    """
    dict_clfs = get_classifiers()

    meta_lr = LogisticRegression(C=100.0,
                                 dual=False,
                                 fit_intercept=True,
                                 multi_class='multinomial',
                                 penalty='l2',
                                 solver='saga')

    names = list(dict_clfs.keys())
    classifiers = [dict_clfs[name] for name in names]

    # The stacker receives the list before it is extended below, so the
    # same list object ends up containing the stacker itself.
    stacked = StackingCVClassifier(classifiers=classifiers,
                                   use_probas=True,
                                   use_features_in_secondary=True,
                                   meta_classifier=meta_lr)
    names.append(CLF_TYPES.StackingCVClassifier)
    classifiers.append(stacked)

    return classifiers, names
Пример #26
0
def stacking_clf_and_gridsearch_and_same_algo_multi_times(X, y):
    """Grid-search a stacker that contains the same KNN classifier twice.

    Prints mean score, score std and parameters for every grid point, then
    the best parameters and the best score. Nothing is returned.
    """
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=RANDOM_SEED)
    meta_lr = LogisticRegression()

    # The same classifier object appears twice in the stack.
    stack = StackingCVClassifier(classifiers=[knn, knn, forest],
                                 meta_classifier=meta_lr,
                                 random_state=RANDOM_SEED)

    # Repeated estimators are addressed with a numeric suffix (-1, -2).
    param_grid = {
        'kneighborsclassifier-1__n_neighbors': [1, 5],
        'kneighborsclassifier-2__n_neighbors': [1, 5],
        'randomforestclassifier__n_estimators': [10, 50],
        'meta_classifier__C': [1, 10],
    }
    grid = model_selection.GridSearchCV(estimator=stack,
                                        param_grid=param_grid,
                                        cv=5,
                                        refit=True)
    grid.fit(X, y)

    results = grid.cv_results_
    rows = zip(results['mean_test_score'],
               results['std_test_score'],
               results['params'])
    for mean_score, std_score, combo in rows:
        print('%0.3f +/- %0.2f %r' % (mean_score, std_score, combo))
    print('Best paras: %s' % grid.best_params_)
    print('Best scores: %0.3f' % grid.best_score_)
Пример #27
0
def simple_stacking_cv_classification_and_gridsearch(X, y):
    """Grid-search a three-classifier stacker and print the results.

    For every grid point the mean test score, half of the score std and the
    parameters are printed, followed by the best parameters and best score.
    Nothing is returned.
    """
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=RANDOM_SEED)
    bayes = GaussianNB()
    meta_lr = LogisticRegression()

    stack = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                 meta_classifier=meta_lr,
                                 random_state=RANDOM_SEED)
    param_grid = {
        'kneighborsclassifier__n_neighbors': [1, 5],
        'randomforestclassifier__n_estimators': [10, 50],
        'meta_classifier__C': [1, 10],
    }
    grid = model_selection.GridSearchCV(estimator=stack,
                                        param_grid=param_grid,
                                        cv=5,
                                        refit=True)
    grid.fit(X, y)

    results = grid.cv_results_
    rows = zip(results['mean_test_score'],
               results['std_test_score'],
               results['params'])
    for mean_score, std_score, combo in rows:
        # The reported spread is half of the stored standard deviation.
        print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, combo))

    print('Best parameters: %s' % grid.best_params_)
    print('Accuracy: %0.2f' % grid.best_score_)
Пример #28
0
def aimfuc(k):
    """Return the negated mean CV accuracy of a stacker for each row of ``k``.

    Column 0 of each row is used as ``C`` of the logistic-regression meta
    classifier. The stacker is scored with 6-fold cross-validation on the
    module-level ``X_train`` / ``Y_train``.

    Returns:
        A 1-D array with one entry per row of ``k`` holding minus the mean
        accuracy.
    """
    n_candidates = k.shape[0]
    obj = np.ones(n_candidates)

    for row in range(n_candidates):
        meta_lr = LogisticRegression(C=k[row, 0],
                                     solver='liblinear',
                                     random_state=RANDOM_SEED,
                                     penalty='l2')
        stack = StackingCVClassifier(
            classifiers=[clf_SVM, clf_RF, clf_MLP, clf_Ad],
            meta_classifier=meta_lr,
            use_probas=True)

        fold_scores = cross_val_score(stack,
                                      X_train,
                                      Y_train,
                                      cv=6,
                                      scoring='accuracy',
                                      n_jobs=-1)
        # Store the negative of the mean accuracy.
        obj[row] = -fold_scores.mean()
    return obj
def test_get_params():
    """Compare the top-level ``get_params()`` names with a fixed sorted list.

    Keys are cut at the first ``__`` before being de-duplicated and sorted.
    """
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=1)
    bayes = GaussianNB()
    meta = LogisticRegression()
    stack = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                 meta_classifier=meta)

    prefixes = set()
    for key in stack.get_params().keys():
        prefixes.add(key.split('__')[0])
    got = sorted(prefixes)

    expect = [
        'classifiers',
        'cv',
        'gaussiannb',
        'kneighborsclassifier',
        'meta-logisticregression',
        'meta_classifier',
        'randomforestclassifier',
        'refit',
        'shuffle',
        'store_train_meta_features',
        'stratify',
        'use_features_in_secondary',
        'use_probas',
        'verbose',
    ]
    assert got == expect, got
Пример #30
0
def fun2():
    """Print 3-fold CV accuracy for three classifiers and a stacker.

    Evaluates KNN, random forest, Gaussian naive Bayes and a
    ``StackingCVClassifier`` built from them. Uses ``X`` and ``y`` from the
    enclosing module scope. Nothing is returned.
    """
    # Function-scope imports: duplicates and names never used here
    # (StackingClassifier, numpy, warnings) have been dropped.
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import StackingCVClassifier

    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()

    sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                                use_probas=True,
                                meta_classifier=lr,
                                random_state=42)

    print('3-fold cross validation:\n')

    for clf, label in zip([clf1, clf2, clf3, sclf],
                          ['KNN',
                           'Random Forest',
                           'Naive Bayes',
                           'StackingClassifier']):
        scores = model_selection.cross_val_score(clf, X, y,
                                                 cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))
Пример #31
0
def test_works_with_df_if_fold_indexes_missing():
    """Regression test: fitting must work when row labels differ from positions.

    The cross-validation splitter yields positional indexes, while training
    data may carry labels that cannot be looked up with them, for example:

    + labels that are not consecutive ([341, 345, 543, ...] rather than
      [0, 1, ... n]);
    + labels that start above the number of rows (the case built below).

    Row ids often carry other information, so selecting rows for each fold
    must not depend on them. Per the original note, the implementation
    handles this with `safe_indexing`.
    """

    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stack = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)

    # Shift every row label by 1000 so that no label equals a position.
    shifted_X = pd.DataFrame(X_breast,
                             index=np.arange(X_breast.shape[0]) + 1000)
    shifted_y = pd.Series(y_breast,
                          index=np.arange(y_breast.shape[0]) + 1000)

    X_train, _, y_train, _ = train_test_split(shifted_X,
                                              shifted_y,
                                              test_size=0.3)

    # dense
    stack.fit(X_train, y_train)
    assert round(stack.score(X_train, y_train), 2) == 0.99, \
        round(stack.score(X_train, y_train), 2)
Пример #32
0
def test_meta_feat_reordering():
    """Check the ROC AUC of the second stored meta-feature column with shuffling.

    The stacker uses ``shuffle=True`` and ``random_state=42``, and the split
    uses ``random_state=0``. The rounded AUC of column 1 of
    ``train_meta_features_`` against ``y_train`` must match a value that
    depends on the scikit-learn version.
    """
    neighbors = KNeighborsClassifier()
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    bayes = GaussianNB()
    stack = StackingCVClassifier(
        classifiers=[neighbors, bayes],
        meta_classifier=meta,
        shuffle=True,
        random_state=42,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              random_state=0,
                                              test_size=0.3)
    stack.fit(X_train, y_train)

    old_sklearn = Version(sklearn_version) < Version("0.21")
    expected_value = 0.86 if old_sklearn else 0.87

    assert round(roc_auc_score(y_train,
                 stack.train_meta_features_[:, 1]), 2) == expected_value, \
        round(roc_auc_score(y_train,
              stack.train_meta_features_[:, 1]), 2)
def test_get_params():
    """Check which top-level parameter names the stacker exposes.

    The part of each ``get_params()`` key before the first ``__`` is
    collected, de-duplicated, sorted and compared with a fixed list.
    """
    base_knn = KNeighborsClassifier(n_neighbors=1)
    base_forest = RandomForestClassifier(random_state=1)
    base_bayes = GaussianNB()
    meta_lr = LogisticRegression()
    stack = StackingCVClassifier(
        classifiers=[base_knn, base_forest, base_bayes],
        meta_classifier=meta_lr,
    )

    got = sorted({name.split('__')[0] for name in stack.get_params().keys()})
    expect = [
        'classifiers', 'cv', 'gaussiannb', 'kneighborsclassifier',
        'meta-logisticregression', 'meta_classifier',
        'randomforestclassifier', 'refit', 'shuffle',
        'store_train_meta_features', 'stratify',
        'use_features_in_secondary', 'use_probas', 'verbose',
    ]
    assert got == expect, got
Пример #34
0
def test_sparse_inputs():
    """Fit on dense input, then on CSR input, checking the training score.

    After each fit the score on the dense training data must round to 0.99.
    """
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stack = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    # dense
    stack.fit(X_train, y_train)
    dense_score = stack.score(X_train, y_train)
    assert round(dense_score, 2) == 0.99

    # sparse
    stack.fit(sparse.csr_matrix(X_train), y_train)
    sparse_score = stack.score(X_train, y_train)
    assert round(sparse_score, 2) == 0.99
Пример #35
0
def test_no_weight_support_with_no_weight():
    """Smoke test: fit without ``sample_weight`` in two configurations.

    KNN is the meta classifier in the first stacker and a base classifier in
    the second; both fits must complete.
    """
    logistic = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    neighbors = KNeighborsClassifier()

    # KNN as the meta classifier.
    knn_on_top = StackingCVClassifier(classifiers=[logistic, forest, bayes],
                                      meta_classifier=neighbors,
                                      shuffle=False)
    knn_on_top.fit(X_iris, y_iris)

    # KNN as one of the base classifiers.
    knn_in_base = StackingCVClassifier(classifiers=[logistic, neighbors, bayes],
                                       meta_classifier=forest,
                                       shuffle=False)
    knn_in_base.fit(X_iris, y_iris)
Пример #36
0
def test_sample_weight():
    """Compare predicted probabilities for no, unit and random sample weights.

    Fitting without weights and with all-ones weights must agree to within
    1e-3; all-ones versus random weights must differ by more than 1e-3.
    """
    def fitted_probas(**fit_kwargs):
        # Re-seed NumPy and rebuild every estimator for each run.
        np.random.seed(123)
        meta = LogisticRegression(multi_class='ovr', solver='liblinear')
        forest = RandomForestClassifier(n_estimators=10)
        bayes = GaussianNB()
        stack = StackingCVClassifier(classifiers=[forest, bayes],
                                     meta_classifier=meta,
                                     shuffle=False)
        return stack.fit(X_iris, y_iris, **fit_kwargs).predict_proba(X_iris)

    # with no weight given
    prob1 = fitted_probas()

    # with weight = 1
    prob2 = fitted_probas(sample_weight=np.ones(len(y_iris)))

    # with random weight
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y_iris))])
    prob3 = fitted_probas(sample_weight=w)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
Пример #37
0
def test_StackingClassifier_drop_last_proba():
    """Check the meta-feature width with and without ``drop_last_proba``.

    Two logistic-regression base classifiers are used with
    ``use_probas=True``. The expected widths are 6 without dropping, 4 with
    dropping on the full data, and 2 with dropping when fitted on the first
    100 samples only.
    """
    np.random.seed(123)
    logistic = LogisticRegression(solver='liblinear',
                                  multi_class='ovr')

    def meta_features_of_first_two(drop_last, X_fit, y_fit):
        # Fit a new stacker and return meta-features for the first two rows.
        stack = StackingCVClassifier(classifiers=[logistic, logistic],
                                     use_probas=True,
                                     drop_last_proba=drop_last,
                                     meta_classifier=logistic)
        stack.fit(X_fit, y_fit)
        return stack.predict_meta_features(X_iris[:2])

    r1 = meta_features_of_first_two(False, X_iris, y_iris)
    assert r1.shape == (2, 6)

    r2 = meta_features_of_first_two(True, X_iris, y_iris)
    assert r2.shape == (2, 4), r2.shape

    # only 2 classes
    r3 = meta_features_of_first_two(True, X_iris[0:100], y_iris[0:100])
    assert r3.shape == (2, 2), r3.shape