def test_use_clones():
    """Check how ``use_clones`` affects the estimators handed to the stack.

    After fitting with ``use_clones=True`` the test asserts that calling
    ``predict`` on the original random forest raises ``NotFittedError``;
    after fitting with ``use_clones=False`` the same call must not raise.
    """
    np.random.seed(123)
    X, y = iris_data()

    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')

    cloning_stack = StackingCVClassifier(classifiers=[forest, bayes],
                                         use_clones=True,
                                         meta_classifier=meta,
                                         shuffle=False)
    cloning_stack.fit(X, y)

    not_fitted_msg = ("This RandomForestClassifier instance is not fitted yet."
                      " Call 'fit' with appropriate arguments"
                      " before using this estimator.")
    assert_raises(exceptions.NotFittedError,
                  not_fitted_msg,
                  forest.predict,
                  X)

    in_place_stack = StackingCVClassifier(classifiers=[forest, bayes],
                                          use_probas=True,
                                          use_clones=False,
                                          meta_classifier=meta,
                                          shuffle=False)
    in_place_stack.fit(X, y)

    # Must not raise now that the stack was fitted without cloning.
    forest.predict(X)
def function_set(self):
    """Create the parameter grid, the stacked model and its grid search.

    Populates ``self.__params``, the three base estimators (``self.__lr``,
    ``self.__gb``, ``self.__rf``), the stacking classifier ``self.__sclf``
    and the ``GridSearchCV`` wrapper ``self.__grid``. Nothing is fitted here.
    """
    # Search space. NOTE: the key names must be spelled exactly like this.
    search_space = {}
    search_space["logisticregression__C"] = list(
        np.linspace(start=0.1, stop=10, num=5))
    search_space["gradientboostingclassifier__learning_rate"] = list(
        np.linspace(start=0.1, stop=1, num=10))
    search_space["randomforestclassifier__n_estimators"] = list(range(5, 16))
    search_space["meta-logisticregression__C"] = list(
        np.linspace(start=0.1, stop=10, num=5))
    self.__params = search_space

    # Estimators; the logistic regression instance is passed both as a base
    # learner and as the meta classifier.
    self.__lr = LogisticRegression()
    self.__gb = GradientBoostingClassifier()
    self.__rf = RandomForestClassifier()
    base_learners = [self.__lr, self.__gb, self.__rf]
    self.__sclf = StackingCVClassifier(
        classifiers=base_learners,
        meta_classifier=self.__lr,
        use_probas=True,
        cv=5,
        use_features_in_secondary=True,
        verbose=1)

    self.__grid = GridSearchCV(
        estimator=self.__sclf,
        param_grid=self.__params,
        cv=5,
        refit=True)
def test_get_params():
    """Compare the top-level names reported by ``get_params`` with a fixed list."""
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=1)
    bayes = GaussianNB()
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    sclf = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                meta_classifier=meta)

    # Keep only the part before the first '__' of every parameter key.
    prefixes = set()
    for key in sclf.get_params():
        prefixes.add(key.split('__')[0])
    got = sorted(prefixes)

    expect = ['classifiers',
              'cv',
              'drop_last_proba',
              'gaussiannb',
              'kneighborsclassifier',
              'meta_classifier',
              'n_jobs',
              'pre_dispatch',
              'random_state',
              'randomforestclassifier',
              'shuffle',
              'store_train_meta_features',
              'stratify',
              'use_clones',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
def test_sparse_inputs_with_features_in_secondary():
    """Fit on dense and on CSR input with ``use_features_in_secondary=True``.

    Both fits are scored on the dense training data and the rounded score
    is compared with a value that depends on the scikit-learn version.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    # The expected score is the same for both fits; it only depends on
    # the scikit-learn version.
    expected_value = 1.0 if Version(sklearn_version) < Version("0.21") else 0.99

    # dense
    stclf.fit(X_train, y_train)
    dense_score = round(stclf.score(X_train, y_train), 2)
    assert dense_score == expected_value, dense_score

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_score = round(stclf.score(X_train, y_train), 2)
    assert sparse_score == expected_value, sparse_score
def test_train_meta_features_():
    """``train_meta_features_`` must have one row per sample and two columns."""
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=base_learners,
                                 meta_classifier=meta,
                                 store_train_meta_features=True)

    X_train, _, y_train, _ = train_test_split(X_iris, y_iris, test_size=0.3)
    stclf.fit(X_train, y_train)

    expected_shape = (X_train.shape[0], 2)
    assert stclf.train_meta_features_.shape == expected_shape
def test_verbose():
    """Smoke test: fitting with ``verbose=3`` must complete without raising."""
    np.random.seed(123)
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    meta = LogisticRegression()
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 use_probas=True,
                                 meta_classifier=meta,
                                 shuffle=False,
                                 verbose=3)
    stack.fit(iris.data, iris.target)
def test_verbose():
    """Smoke test: fitting with ``verbose=3`` must complete without raising."""
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stack = StackingCVClassifier(classifiers=[forest, bayes],
                                 use_probas=True,
                                 meta_classifier=meta,
                                 shuffle=False,
                                 verbose=3)
    stack.fit(X_iris, y_iris)
def test_no_weight_support():
    """Passing ``sample_weight`` to this stack is expected to raise ``TypeError``.

    The base learners include a ``KNeighborsClassifier``; presumably that is
    the estimator without weight support (confirm against its ``fit`` API).
    """
    weights = np.array([random.random() for _ in range(len(y_iris))])

    base_learners = [RandomForestClassifier(n_estimators=10),
                     GaussianNB(),
                     KNeighborsClassifier()]
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    sclf = StackingCVClassifier(classifiers=base_learners,
                                meta_classifier=meta,
                                shuffle=False)

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=weights)
def test_train_meta_features_():
    """``train_meta_features_`` must have one row per sample and two columns."""
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    meta = LogisticRegression()
    stclf = StackingCVClassifier(classifiers=base_learners,
                                 meta_classifier=meta,
                                 store_train_meta_features=True)

    # Only the training part of the split is used.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)

    expected_shape = (X_train.shape[0], 2)
    assert stclf.train_meta_features_.shape == expected_shape
def test_no_weight_support_meta():
    """``sample_weight`` with a ``KNeighborsClassifier`` meta model must raise.

    The test expects ``TypeError`` from ``fit`` when weights are supplied.
    """
    weights = np.array([random.random() for _ in range(len(y_iris))])

    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    sclf = StackingCVClassifier(classifiers=base_learners,
                                meta_classifier=KNeighborsClassifier(),
                                shuffle=False)

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=weights)
def _build_model(self, X_train, y_train):
    """Fit and return a stacking classifier on the given training data.

    Five base learners (KNN, random forest, logistic regression, SVC and
    XGBoost) feed a Gaussian naive Bayes meta classifier.

    Returns:
        The fitted ``StackingCVClassifier``.
    """
    nearest = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(max_depth=3,
                                    max_features=6,
                                    n_estimators=50,
                                    random_state=0)
    logit = LogisticRegression(C=10.0,
                               dual=False,
                               max_iter=100,
                               solver='lbfgs')
    poly_svc = svm.SVC(C=1.0, kernel='poly', degree=5)
    booster = XGBClassifier(alpha=15,
                            colsample_bytree=0.1,
                            learning_rate=1,
                            max_depth=5,
                            reg_lambda=10.0)
    meta = GaussianNB()

    stack = StackingCVClassifier(
        classifiers=[nearest, forest, logit, poly_svc, booster],
        meta_classifier=meta,
        random_state=42)
    stack.fit(X_train, y_train)
    return stack
def test_predict_meta_features():
    """``predict`` on the test split must return one value per test sample."""
    X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris,
                                                        test_size=0.3)

    # Default configuration (class labels rather than probabilities).
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        store_train_meta_features=True)
    stclf.fit(X_train, y_train)

    predictions = stclf.predict(X_test)
    assert predictions.shape == (X_test.shape[0],)
def model_fit(self):
    """Build the base/meta estimators and fit the stacked model.

    Four tree-based learners are stacked under a logistic-regression meta
    classifier (probability outputs, 3-fold CV). The stack is fitted on the
    ``.values`` of ``self.__application_train_feature`` and
    ``self.__application_train_label``.
    """
    self.__ef = ExtraTreesClassifier(n_jobs=-1)
    self.__rf = RandomForestClassifier(n_jobs=-1)
    self.__lr = LogisticRegression()
    self.__gb = GradientBoostingClassifier()
    self.__xgb = XGBClassifier(n_jobs=-1, missing=-999.0)

    base_learners = [self.__ef, self.__rf, self.__gb, self.__xgb]
    self.__sclf = StackingCVClassifier(
        classifiers=base_learners,
        meta_classifier=self.__lr,
        use_probas=True,
        cv=3)

    features = self.__application_train_feature.values
    labels = self.__application_train_label.values
    self.__sclf.fit(features, labels)
def test_pandas():
    """Fit on a ``DataFrame``; if a ``KeyError`` occurs, check its hint text.

    The test tolerates a successful fit: the assertion only runs when
    ``fit`` raises ``KeyError``.
    """
    frame = pd.DataFrame(X)
    stack = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0)
    try:
        stack.fit(frame, iris.target)
    except KeyError as err:
        hint = 'are NumPy arrays. If X and y are pandas DataFrames'
        assert hint in str(err)
def test_pandas():
    """Fit on a ``DataFrame``; if a ``KeyError`` occurs, check its hint text.

    The test tolerates a successful fit: the assertion only runs when
    ``fit`` raises ``KeyError``.
    """
    frame = pd.DataFrame(X_iris)
    stack = StackingCVClassifier(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
        verbose=0)
    try:
        stack.fit(frame, y_iris)
    except KeyError as err:
        hint = 'are NumPy arrays. If X and y are pandas DataFrames'
        assert hint in str(err)
def test_meta_feat_reordering():
    """With ``shuffle=True`` the stored meta features must still align with ``y``.

    Alignment is checked through the ROC AUC of the second meta-feature
    column against the training labels.
    """
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=LogisticRegression(),
        shuffle=True,
        store_train_meta_features=True)

    X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
                                                        test_size=0.3)
    stclf.fit(X_train, y_train)

    second_column = stclf.train_meta_features_[:, 1]
    auc = roc_auc_score(y_train, second_column)
    assert round(auc, 2) == 0.88
def test_list_of_lists():
    """Fit on a plain list of rows; if ``TypeError`` occurs, check its hint.

    The assertion only runs when ``fit`` raises ``TypeError``.
    """
    rows = list(X)
    stack = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0)
    try:
        stack.fit(rows, iris.target)
    except TypeError as err:
        hint = 'are NumPy arrays. If X and y are lists'
        assert hint in str(err)
def pick_the_best_function(self):
    """Fit the stack with fixed hyper-parameters and print the test ROC AUC.

    The model is trained on ``self.__train`` / ``self.__train_label`` and
    evaluated on ``self.__test`` / ``self.__test_label`` using the
    second column of ``predict_proba``.
    """
    self.__lr = LogisticRegression(C=0.1)
    self.__gb = GradientBoostingClassifier(learning_rate=0.1)
    self.__rf = RandomForestClassifier(n_estimators=5)

    base_learners = [self.__lr, self.__gb, self.__rf]
    self.__sclf = StackingCVClassifier(
        classifiers=base_learners,
        meta_classifier=self.__lr,
        use_probas=True,
        cv=5,
        use_features_in_secondary=True,
        verbose=1)
    self.__sclf.fit(self.__train, self.__train_label)

    positive_scores = self.__sclf.predict_proba(self.__test)[:, 1]
    print(roc_auc_score(self.__test_label, positive_scores))
def test_list_of_lists():
    """Fit on a plain list of rows; if ``TypeError`` occurs, check its hint.

    The assertion only runs when ``fit`` raises ``TypeError``.
    """
    rows = list(X_iris)
    stack = StackingCVClassifier(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
        verbose=0)
    try:
        stack.fit(rows, y_iris)
    except TypeError as err:
        hint = 'are NumPy arrays. If X and y are lists'
        assert hint in str(err)
def test_not_fitted():
    """An unfitted stack must raise ``NotFittedError`` from its predict methods.

    Checked in order for ``predict``, ``predict_proba`` and
    ``predict_meta_features``, each with the same expected message.
    """
    np.random.seed(123)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False)

    expected_msg = ("This StackingCVClassifier instance is not fitted yet."
                    " Call 'fit' with appropriate arguments"
                    " before using this method.")

    for method_name in ('predict', 'predict_proba', 'predict_meta_features'):
        assert_raises(NotFittedError,
                      expected_msg,
                      getattr(sclf, method_name),
                      iris.data)
def get_stacked_classifiers():
    """Return all base classifiers plus a stack built on top of them.

    The base classifiers come from ``get_classifiers()``. A
    ``StackingCVClassifier`` over all of them (logistic-regression meta
    model) is appended as the last entry.

    Returns:
        A ``(classifiers, names)`` pair of equally long lists.
    """
    base_by_name = get_classifiers()
    names = list(base_by_name.keys())
    classifiers = [base_by_name[name] for name in names]

    meta = LogisticRegression(C=100.0,
                              dual=False,
                              fit_intercept=True,
                              multi_class='multinomial',
                              penalty='l2',
                              solver='saga')
    stacked = StackingCVClassifier(classifiers=classifiers,
                                   use_probas=True,
                                   use_features_in_secondary=True,
                                   meta_classifier=meta)

    # The stack itself goes last, after it was built from the base list.
    names.append(CLF_TYPES.StackingCVClassifier)
    classifiers.append(stacked)
    return classifiers, names
def stacking_clf_and_gridsearch_and_same_algo_multi_times(X, y):
    """Grid-search a stack in which the same KNN estimator appears twice.

    Prints mean/std test score and parameters for every grid point,
    followed by the best parameters and the best score.
    """
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=RANDOM_SEED)
    meta = LogisticRegression()

    # The same KNN instance is listed twice on purpose.
    sclf = StackingCVClassifier(classifiers=[knn, knn, forest],
                                meta_classifier=meta,
                                random_state=RANDOM_SEED)

    # Repeated estimators are addressed with a numeric suffix (-1, -2).
    params = {
        'kneighborsclassifier-1__n_neighbors': [1, 5],
        'kneighborsclassifier-2__n_neighbors': [1, 5],
        'randomforestclassifier__n_estimators': [10, 50],
        'meta_classifier__C': [1, 10]
    }

    grid = model_selection.GridSearchCV(estimator=sclf,
                                        param_grid=params,
                                        cv=5,
                                        refit=True)
    grid.fit(X, y)

    results = grid.cv_results_
    per_point = zip(results['mean_test_score'],
                    results['std_test_score'],
                    results['params'])
    for mean_score, std_score, point in per_point:
        print('%0.3f +/- %0.2f %r' % (mean_score, std_score, point))

    print('Best paras: %s' % grid.best_params_)
    print('Best scores: %0.3f' % grid.best_score_)
def simple_stacking_cv_classification_and_gridsearch(X, y):
    """Grid-search a three-learner stack and print the results.

    For every grid point the mean test score, half of the score standard
    deviation and the parameters are printed, followed by the best
    parameters and best score.
    """
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=RANDOM_SEED)
    bayes = GaussianNB()
    meta = LogisticRegression()

    sclf = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                meta_classifier=meta,
                                random_state=RANDOM_SEED)
    params = {
        'kneighborsclassifier__n_neighbors': [1, 5],
        'randomforestclassifier__n_estimators': [10, 50],
        'meta_classifier__C': [1, 10]
    }

    grid = model_selection.GridSearchCV(estimator=sclf,
                                        param_grid=params,
                                        cv=5,
                                        refit=True)
    grid.fit(X, y)

    results = grid.cv_results_
    per_point = zip(results['mean_test_score'],
                    results['std_test_score'],
                    results['params'])
    for mean_score, std_score, point in per_point:
        # The printed spread is half the standard deviation.
        print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, point))

    print('Best parameters: %s' % grid.best_params_)
    print('Accuracy: %0.2f' % grid.best_score_)
def aimfuc(k):
    """Objective function: negated mean CV accuracy for each candidate row.

    Column 0 of every row of ``k`` is used as the ``C`` of the
    logistic-regression meta classifier. The stack is scored with 6-fold
    cross-validation on the module-level ``X_train`` / ``Y_train``.

    Returns:
        A 1-D array with one value per row of ``k`` (``-mean_accuracy``).
    """
    n_candidates = k.shape[0]
    obj = np.ones(n_candidates)

    for i in range(n_candidates):
        meta = LogisticRegression(C=k[i, 0],
                                  solver='liblinear',
                                  random_state=RANDOM_SEED,
                                  penalty='l2')
        stack = StackingCVClassifier(
            classifiers=[clf_SVM, clf_RF, clf_MLP, clf_Ad],
            meta_classifier=meta,
            use_probas=True)
        fold_scores = cross_val_score(stack,
                                      X_train,
                                      Y_train,
                                      cv=6,
                                      scoring='accuracy',
                                      n_jobs=-1)
        # Negated so that a minimiser maximises accuracy.
        obj[i] = -fold_scores.mean()

    return obj
def test_get_params():
    """Compare the top-level names reported by ``get_params`` with a fixed list."""
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=1)
    bayes = GaussianNB()
    meta = LogisticRegression()
    sclf = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                meta_classifier=meta)

    # Keep only the part before the first '__' of every parameter key.
    prefixes = set()
    for key in sclf.get_params():
        prefixes.add(key.split('__')[0])
    got = sorted(prefixes)

    expect = [
        'classifiers',
        'cv',
        'gaussiannb',
        'kneighborsclassifier',
        'meta-logisticregression',
        'meta_classifier',
        'randomforestclassifier',
        'refit',
        'shuffle',
        'store_train_meta_features',
        'stratify',
        'use_features_in_secondary',
        'use_probas',
        'verbose',
    ]
    assert got == expect, got
def fun2():
    """Print 3-fold CV accuracy for three base classifiers and their stack.

    Evaluates KNN, random forest, Gaussian naive Bayes and a
    ``StackingCVClassifier`` built from them (logistic-regression meta
    model, probability outputs) on the module-level ``X`` / ``y``.
    """
    # Function-local imports, de-duplicated; names that were never used
    # (StackingClassifier, numpy, warnings) are no longer imported.
    from sklearn import model_selection
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from mlxtend.classifier import StackingCVClassifier

    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                                use_probas=True,
                                meta_classifier=lr,
                                random_state=42)

    print('3-fold cross validation:\n')

    estimators = [clf1, clf2, clf3, sclf]
    labels = ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']
    for clf, label in zip(estimators, labels):
        scores = model_selection.cross_val_score(clf, X, y,
                                                 cv=3,
                                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))
def test_works_with_df_if_fold_indexes_missing():
    """Regression test: fitting must work when CV indexes are not valid labels.

    The training data may carry an index that cannot be addressed with the
    positional indexes produced by the cv splitter (e.g. skf), for example:

    + fold outputs that are not neatly consecutive
      (i.e. [341, 345, 543, ...] instead of [0, 1, ... n]);
    + index labels that start from a number greater than the size of the
      input (the case exercised here, labels start at 1000).

    Training data sometimes has ids that carry other information, and
    row selection based on cv should not break. This is fixed in the code
    using `safe_indexing`.
    """
    np.random.seed(123)

    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)

    # Shift both indexes by 1000 so labels never coincide with positions.
    shifted_X_index = np.arange(X_breast.shape[0]) + 1000
    shifted_y_index = np.arange(y_breast.shape[0]) + 1000
    X_modded = pd.DataFrame(X_breast, index=shifted_X_index)
    y_modded = pd.Series(y_breast, index=shifted_y_index)

    X_train, X_test, y_train, y_test = train_test_split(X_modded, y_modded,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    train_score = round(stclf.score(X_train, y_train), 2)
    assert train_score == 0.99, train_score
def test_meta_feat_reordering():
    """With ``shuffle=True`` the stored meta features must still align with ``y``.

    Alignment is checked through the ROC AUC of the second meta-feature
    column; the expected value depends on the scikit-learn version.
    """
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=True,
        random_state=42,
        store_train_meta_features=True)

    X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
                                                        random_state=0,
                                                        test_size=0.3)
    stclf.fit(X_train, y_train)

    expected_value = 0.86 if Version(sklearn_version) < Version("0.21") else 0.87

    auc = round(roc_auc_score(y_train, stclf.train_meta_features_[:, 1]), 2)
    assert auc == expected_value, auc
def test_get_params():
    """Compare the top-level names reported by ``get_params`` with a fixed list."""
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=1)
    bayes = GaussianNB()
    meta = LogisticRegression()
    sclf = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                meta_classifier=meta)

    # Keep only the part before the first '__' of every parameter key.
    got = sorted({name.split('__')[0] for name in sclf.get_params()})

    expect = [
        'classifiers',
        'cv',
        'gaussiannb',
        'kneighborsclassifier',
        'meta-logisticregression',
        'meta_classifier',
        'randomforestclassifier',
        'refit',
        'shuffle',
        'store_train_meta_features',
        'stratify',
        'use_features_in_secondary',
        'use_probas',
        'verbose',
    ]
    assert got == expect, got
def test_sparse_inputs():
    """Fitting on dense and on CSR input must give the same rounded score.

    Both fits are scored on the dense training data and compared to 0.99.
    """
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
                                                        test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    dense_score = round(stclf.score(X_train, y_train), 2)
    assert dense_score == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_score = round(stclf.score(X_train, y_train), 2)
    assert sparse_score == 0.99
def test_no_weight_support_with_no_weight():
    """Stacks containing a KNN must fit when no ``sample_weight`` is given.

    Covered twice: KNN as meta classifier, then KNN as a base classifier.
    """
    logit = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    knn = KNeighborsClassifier()

    # KNN as the meta classifier.
    knn_as_meta = StackingCVClassifier(classifiers=[logit, forest, bayes],
                                       meta_classifier=knn,
                                       shuffle=False)
    knn_as_meta.fit(X_iris, y_iris)

    # KNN among the base classifiers.
    knn_as_base = StackingCVClassifier(classifiers=[logit, knn, bayes],
                                       meta_classifier=forest,
                                       shuffle=False)
    knn_as_base.fit(X_iris, y_iris)
def test_sample_weight():
    """Unit weights must match the unweighted fit; random weights must not.

    Three identically seeded stacks are fitted: without weights, with all
    weights equal to one, and with random weights. Predicted probabilities
    of the first two must agree within 1e-3, while the second and third
    must differ by more than 1e-3.
    """

    def _fitted_probas(**fit_kwargs):
        # Re-seed and rebuild everything so the runs differ only in fit_kwargs.
        np.random.seed(123)
        meta = LogisticRegression(multi_class='ovr', solver='liblinear')
        forest = RandomForestClassifier(n_estimators=10)
        bayes = GaussianNB()
        stack = StackingCVClassifier(classifiers=[forest, bayes],
                                     meta_classifier=meta,
                                     shuffle=False)
        fitted = stack.fit(X_iris, y_iris, **fit_kwargs)
        return fitted.predict_proba(X_iris)

    # with no weight given
    prob1 = _fitted_probas()

    # with weight = 1
    prob2 = _fitted_probas(sample_weight=np.ones(len(y_iris)))

    # with random weight
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y_iris))])
    prob3 = _fitted_probas(sample_weight=w)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
def test_StackingClassifier_drop_last_proba():
    """Check the meta-feature width with and without ``drop_last_proba``.

    Two logistic-regression base learners are used. Asserted widths: 6
    without dropping, 4 with dropping, and 2 with dropping when the stack
    is fitted on the first 100 samples only.
    """
    np.random.seed(123)
    logit = LogisticRegression(solver='liblinear', multi_class='ovr')

    def _meta_shape(drop_last, X_fit, y_fit):
        # Fit a fresh stack and return the meta-feature shape for 2 samples.
        stack = StackingCVClassifier(classifiers=[logit, logit],
                                     use_probas=True,
                                     drop_last_proba=drop_last,
                                     meta_classifier=logit)
        stack.fit(X_fit, y_fit)
        return stack.predict_meta_features(X_iris[:2]).shape

    full_shape = _meta_shape(False, X_iris, y_iris)
    assert full_shape == (2, 6)

    dropped_shape = _meta_shape(True, X_iris, y_iris)
    assert dropped_shape == (2, 4), dropped_shape

    # only 2 classes
    two_class_shape = _meta_shape(True, X_iris[0:100], y_iris[0:100])
    assert two_class_shape == (2, 2), two_class_shape