def stacking_classifier(best_logistic_regression, best_knn_classifier,
                        best_gaussian_nb, best_decision_tree_classifier,
                        best_random_forest_classifier,
                        x_train, x_test, y_train, y_test):
    """Fit a stacking ensemble over the pre-tuned base classifiers.

    Prints train/test accuracy, then draws a confusion matrix and ROC curve
    via the module-level ``confusion_matrix_graph`` / ``roc_curve_graph``
    helpers.

    Note: ``best_random_forest_classifier`` is currently excluded from the
    ensemble (see the commented-out entry below) but is kept in the
    signature for backward compatibility with existing callers.
    """
    from sklearn.ensemble import StackingClassifier

    estimators = [
        # ('random_forest_cv', best_random_forest_classifier),
        ('knn_classifier_cv', best_knn_classifier),
        ('dct_cv', best_decision_tree_classifier),
        ('gaussian_nb_cv', best_gaussian_nb),
    ]

    # BUG FIX: `shuffle` and `use_probas` are mlxtend StackingClassifier
    # arguments; sklearn's StackingClassifier rejects them with TypeError.
    # sklearn already stacks predict_proba output when available
    # (stack_method='auto'), which is exactly what use_probas=True asked for.
    final_stacking_classifier = StackingClassifier(
        estimators=estimators,
        final_estimator=best_logistic_regression)
    final_stacking_classifier.fit(x_train, y_train)

    print("Stacking Classifier Training Score {}".format(
        final_stacking_classifier.score(x_train, y_train)))
    print("Stacking Classifier Testing Score {}\n".format(
        final_stacking_classifier.score(x_test, y_test)))

    y_predict = final_stacking_classifier.predict(x_test)
    classification_model = 'Stacking Classifier'
    confusion_matrix_graph(y_test, y_predict, classification_model)
    roc_curve_graph(y_test, y_predict, classification_model)
def test_stacking_classifier_iris(cv, final_estimator):
    """Smoke-test StackingClassifier on pre-scaled iris data.

    Scaling is done up front (rather than in a pipeline) to avoid
    convergence warnings and to keep the transform column-count
    assertions simple.
    """
    tr_X, te_X, tr_y, te_y = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42)

    base = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(
        estimators=base, final_estimator=final_estimator, cv=cv)
    clf.fit(tr_X, tr_y)

    # The prediction API must work end-to-end and clear a loose accuracy bar.
    clf.predict(te_X)
    clf.predict_proba(te_X)
    assert clf.score(te_X, te_y) > 0.8

    # Two 3-class estimators stack to 6 probability columns.
    assert clf.transform(te_X).shape[1] == 6

    # Dropping one estimator halves the stacked representation.
    clf.set_params(lr='drop')
    clf.fit(tr_X, tr_y)
    clf.predict(te_X)
    clf.predict_proba(te_X)
    if final_estimator is None:
        # The default final estimator is LogisticRegression, which
        # exposes decision_function.
        clf.decision_function(te_X)
    assert clf.transform(te_X).shape[1] == 3
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """Smoke-test StackingClassifier on pre-scaled iris data, including
    the ``passthrough`` option that appends raw features to the stack.

    Scaling is done up front (rather than in a pipeline) to avoid
    convergence warnings and keep the transform assertions simple.
    """
    tr_X, te_X, tr_y, te_y = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42)

    clf = StackingClassifier(
        estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())],
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough)
    clf.fit(tr_X, tr_y)
    clf.predict(te_X)
    clf.predict_proba(te_X)
    assert clf.score(te_X, te_y) > 0.8

    stacked = clf.transform(te_X)
    # 6 probability columns, plus the 4 raw iris features when passed through.
    assert stacked.shape[1] == (10 if passthrough else 6)
    if passthrough:
        assert_allclose(te_X, stacked[:, -4:])

    # Drop one estimator; the stacked representation shrinks accordingly.
    clf.set_params(lr='drop')
    clf.fit(tr_X, tr_y)
    clf.predict(te_X)
    clf.predict_proba(te_X)
    if final_estimator is None:
        # Default final estimator (LogisticRegression) has decision_function.
        clf.decision_function(te_X)

    stacked = clf.transform(te_X)
    assert stacked.shape[1] == (7 if passthrough else 3)
    if passthrough:
        assert_allclose(te_X, stacked[:, -4:])
def week10(C, random_state, criterion, min_samples_leaf, max_leaf_samples,
           n_estimators, solver, cv, clazz, images):
    """Stack bagged trees, a linear SVM and a random forest behind a
    logistic-regression meta-learner on the cats-vs-dogs training data.

    Returns a dict with the ensemble's training accuracy and, for every
    image name in ``images``, the predicted probability of class ``clazz``.
    """
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier

    features, labels = catsvsdogs.train
    # Invert the binary labels (0 <-> 1).
    labels = [(lbl + 1) % 2 for lbl in labels]

    base_tree = DecisionTreeClassifier(
        criterion=criterion,                 # split criterion
        min_samples_leaf=min_samples_leaf,   # minimum objects per leaf
        max_leaf_nodes=max_leaf_samples,     # maximum number of leaves
        random_state=random_state)
    bagging = BaggingClassifier(
        base_tree,                           # base estimator
        n_estimators=n_estimators,           # number of trees
        random_state=random_state)
    bagging.fit(features, labels)

    from sklearn.svm import LinearSVC
    svm = LinearSVC(random_state=random_state, C=C)
    svm.fit(features, labels)

    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(
        n_estimators=n_estimators,           # number of trees
        criterion=criterion,                 # split criterion
        min_samples_leaf=min_samples_leaf,   # minimum objects per leaf
        max_leaf_nodes=max_leaf_samples,     # maximum number of leaves
        random_state=random_state)
    forest.fit(features, labels)

    from sklearn.linear_model import LogisticRegression
    meta = LogisticRegression(solver=solver, random_state=random_state)

    from sklearn.ensemble import StackingClassifier
    stack = StackingClassifier(
        estimators=[('SVM', svm), ('Bagging DT', bagging),
                    ('DecisionForest', forest)],
        final_estimator=meta,
        cv=cv)
    stack.fit(features, labels)
    accuracy = stack.score(features, labels)

    # Probability of the requested class for each named test image.
    probas = []
    for img in images:
        hist = catsvsdogs.test[img].reshape(1, -1)
        probas.append((img, stack.predict_proba(hist)[0][clazz]))
    return {'accuracy': accuracy, 'probas': probas}
def test_stacking():
    """Stacking IREP/RIPPER with sklearn estimators must change the score
    relative to a lone decision tree (i.e. the rule learners contribute).
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    # One-hot encode categorical columns; keep numerics and the label.
    df = DF.copy()
    numeric_cols = df.select_dtypes("number").columns
    categorical_cols = [
        col for col in df.columns
        if (col not in numeric_cols and not col == CLASS_FEAT)
    ]
    dum_df = pd.get_dummies(df[categorical_cols])
    for col in numeric_cols:
        dum_df[col] = df[col]
    dum_df[CLASS_FEAT] = df[CLASS_FEAT]

    sktrain, sktest = df_shuffled_split(dum_df, random_state=42)
    # BUG FIX: the labels were read from undefined globals `train`/`test`;
    # they must come from the split frames `sktrain`/`sktest`.
    sktrain_x, sktrain_y = sktrain.drop(CLASS_FEAT, axis=1), sktrain[CLASS_FEAT]
    sktest_x, sktest_y = sktest.drop(CLASS_FEAT, axis=1), sktest[CLASS_FEAT]

    # Baseline: a single decision tree.
    lone_tree = DecisionTreeClassifier(random_state=42)
    lone_tree.fit(sktrain_x, sktrain_y)
    lone_tree_score = lone_tree.score(sktest_x, sktest_y)
    # print('lone_tree_score', lone_tree_score)

    # NOTE(review): despite the name, `irep_tree` is an SVC — presumably
    # intentional to diversify the stack; confirm with the author.
    irep_tree = SVC(random_state=42)
    irep_stack_estimators = [("irep", irep), ("tree", irep_tree)]
    irep_stack = StackingClassifier(estimators=irep_stack_estimators,
                                    final_estimator=LogisticRegression())
    irep_stack.fit(sktrain_x, sktrain_y)
    irep_stack_score = irep_stack.score(sktest_x, sktest_y)
    # print('irep_stack_score', irep_stack_score)
    assert irep_stack_score != lone_tree_score

    rip_tree = DecisionTreeClassifier(random_state=42)
    rip_stack_estimators = [("rip", rip), ("tree", rip_tree)]
    rip_stack = StackingClassifier(estimators=rip_stack_estimators,
                                   final_estimator=LogisticRegression())
    rip_stack.fit(sktrain_x, sktrain_y)
    rip_stack_score = rip_stack.score(sktest_x, sktest_y)
    # print('rip_stack_score', rip_stack_score)
    assert rip_stack_score != lone_tree_score
def perform_stacking(self):
    """Stack all configured classifiers behind a logistic regression,
    report the test-set accuracy, plot the confusion matrix, and
    return the fitted stacking model.
    """
    eclfs = [(k, v) for k, v in self.__classifiers.items()]
    clf = StackingClassifier(estimators=eclfs,
                             final_estimator=LogisticRegression(),
                             cv=5,
                             verbose=1,
                             n_jobs=-1)
    clf.fit(self.__train_x, self.__train_y)
    # BUG FIX: `score()` returns a single float, not an array of fold
    # scores, so the previous `scores.mean()` / `scores.std()` calls
    # raised AttributeError.  Report the single accuracy instead (use
    # cross_val_score if a mean/std over folds is actually wanted).
    accuracy = clf.score(self.__test_x, self.__test_y)
    print("Accuracy: %0.2f [%s]" % (accuracy, 'StackingClassifier'))
    self.plot_conf_mat(clf, 'StackingClassifier')
    return clf
def ensemble_(feat, tar, split):
    """Tune KNN and a random forest via grid search, compare them with a
    plain logistic regression, then evaluate voting, bagging and stacking
    ensembles on a held-out split.  Features are min-max scaled using
    statistics from the training portion only.
    """
    x_tr, x_te, y_tr, y_te = train_test_split(
        feat, tar, test_size=split, shuffle=True)

    scaler = MinMaxScaler()
    scaler.fit(x_tr)
    x_tr = scaler.transform(x_tr)
    x_te = scaler.transform(x_te)

    # Grid-search the KNN neighbourhood size.
    knn_gs = GridSearchCV(KNeighborsClassifier(),
                          {'n_neighbors': np.arange(1, 25)}, cv=5)
    knn_gs.fit(x_tr, y_tr)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)

    # Grid-search the forest size.
    rf_gs = GridSearchCV(RandomForestClassifier(),
                         {'n_estimators': [50, 100, 200, 300, 400]}, cv=5)
    rf_gs.fit(x_tr, y_tr)
    rf_best = rf_gs.best_estimator_
    print(rf_gs.best_params_)

    log_reg = LogisticRegression()
    log_reg.fit(x_tr, y_tr)

    # Individual model accuracies on the held-out split.
    print('knn: {}'.format(knn_best.score(x_te, y_te)))
    print('rf: {}'.format(rf_best.score(x_te, y_te)))
    print('log_reg: {}'.format(log_reg.score(x_te, y_te)))

    estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]

    # Hard-voting ensemble over the three tuned models.
    voter = VotingClassifier(estimators, voting='hard')
    voter.fit(x_tr, y_tr)
    print("ensemble voting score: ", str(voter.score(x_te, y_te)))

    # Bagged random forests.
    bagger = BaggingClassifier(base_estimator=RandomForestClassifier(),
                               n_estimators=10)
    bagger.fit(x_tr, y_tr)
    print("ensemble bagging score: ", str(bagger.score(x_te, y_te)))

    # Stacking with a logistic-regression blender.
    stacker = StackingClassifier(estimators, LogisticRegression())
    stacker.fit(x_tr, y_tr)
    print("ensemble stacking score: ", str(stacker.score(x_te, y_te)))
sex = lineX[4] # ageGroup = math.floor(lineX[2] / 20) # if ageGroup not in dictAgeSurvived[sex]: # dictAgeSurvived[sex][ageGroup] = 0 # dictAgeAll[sex][ageGroup] = 0 # dictAgeSurvived[sex][ageGroup] += result # dictAgeAll[sex][ageGroup] = dictAgeAll[sex][ageGroup] + 1 # x = np.arange(len(dictAgeAll[0].keys())) # width = 0.35 # resultSex = [] # for sex in range(2): # resultSex.append([]) # for i in range(len(dictAgeAll[0].keys())): # resultSex[sex].append(dictAgeSurvived[sex][i]/dictAgeAll[sex][i]) # fig, ax = plt.subplots() # ax.bar(x - width/2, resultSex[0], width=width, label='female') # ax.bar(x + width/2, resultSex[1], width=width, label='male') # ax.set_xticks(x) # ax.set_xticklabels(['0-20', '20-40', '40-60', '60-80']) x = np.linspace(0, 80, 4) plt.plot(x, resultSex[0], 'red') plt.plot(x, resultSex[1], 'blue') plt.show() print(clf.score(X, y))
# step 2: hold out 20% of the data, stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40, stratify=y)

# step 3: stack a tuned KNN and a ridge classifier behind a balanced
# logistic-regression meta-learner (5-fold stacking CV, all cores).
base_models = [
    ('knn', KNeighborsClassifier(n_neighbors=7, weights='distance',
                                 leaf_size=1, metric='manhattan')),
    ('ridge', RidgeClassifier(random_state=40, class_weight='balanced',
                              alpha=0.00001)),
]
clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(random_state=40,
                                       class_weight='balanced',
                                       max_iter=10000),
    cv=5,
    n_jobs=-1)
clf.fit(X_train, y_train)

# Report accuracy on both splits.
score_train = clf.score(X_train, y_train)
score_test = clf.score(X_test, y_test)
print(
    'scores for StackingClassifier(score on training set/testing set):{:.2f}/{:.2f}'
    .format(score_train, score_test))
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris


def load_data():
    """Load the iris dataset and return a 70/30 train/test split."""
    bunch = load_iris()
    return train_test_split(bunch.data, bunch.target, test_size=0.3)


if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data()
    # Stack logistic regression and 3-NN; a decision tree blends them.
    base_learners = [('logist', LogisticRegression(max_iter=500)),
                     ('knn', KNeighborsClassifier(n_neighbors=3))]
    stacking = StackingClassifier(estimators=base_learners,
                                  final_estimator=DecisionTreeClassifier())
    stacking.fit(x_train, y_train)
    acc = stacking.score(x_test, y_test)
    print("模型在测试集上的准确率为:", acc)
# Stacking ensemble over the previously defined `estimators`, blended by a
# regularised, class-balanced logistic regression with 5-fold stacking CV.
stk = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression(
                             C=0.1, class_weight='balanced',
                             random_state=1234),
                         cv=5)

from sklearn.model_selection import train_test_split

# 75/25 stratified split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1234)

# NOTE(review): the bare `.score(...)` expressions below compute accuracies
# but discard the results — they only display in a notebook/REPL cell.  In
# a plain script, wrap them in print() or store them.
svc.fit(X_train, y_train)
svc.score(X_train, y_train)
svc.score(X_test, y_test)

rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)

dt.fit(X_train, y_train)
dt.score(X_train, y_train)
dt.score(X_test, y_test)

vot_c.fit(X_train, y_train)
vot_c.score(X_train, y_train)
vot_c.score(X_test, y_test)

stk.fit(X_train, y_train)
stk.score(X_train, y_train)
stk.score(X_test, y_test)

# Test-set accuracies of all five models again, for side-by-side comparison.
svc.score(X_test, y_test)
rf.score(X_test, y_test)
dt.score(X_test, y_test)
vot_c.score(X_test, y_test)
stk.score(X_test, y_test)
# NOTE(review): fragment — the line below is the tail of a `base_learners`
# list definition whose beginning lies outside this view.
GradientBoostingClassifier(n_estimators=100,
                           learning_rate=1.0,
                           max_depth=1,
                           random_state=0))]

# Second stacking layer: tree + forest blended by logistic regression.
# NOTE(review): `layer_two` is built but not used below — dead unless
# referenced later in the file; confirm.
layer_two_estimators = [
    ('dt_2', DecisionTreeClassifier()),
    ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
]
layer_two = StackingClassifier(estimators=layer_two_estimators,
                               final_estimator=LogisticRegression())

# clf = StackingClassifier(estimators=base_learners, final_estimator=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1))
# Stack the base learners with 10-fold stacking CV; the last column of
# `train`/`test` is treated as the label, the rest as features.
stack_clf = StackingClassifier(estimators=base_learners,
                               final_estimator=LogisticRegression(),
                               cv=10)
stack_clf.fit(train[:, 0:(n2 - 1)], train[:, (n2 - 1)])
stack_acc = stack_clf.score(test[:, 0:(n2 - 1)], test[:, (n2 - 1)])

#voting
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Soft-voting ensemble, mean accuracy over 5-fold CV on the full data.
v_clf = VotingClassifier(estimators=base_learners, voting='soft')
v_clf.fit(data[:, 0:(n2 - 1)], data[:, (n2 - 1)])
v_scores = cross_val_score(v_clf,
                           data[:, 0:(n2 - 1)],
                           data[:, (n2 - 1)],
                           scoring='accuracy',
                           cv=5).mean()

df = pd.read_csv('test.csv', delimiter=',')
# NOTE(review): fragment — the line below is the tail of a KNN constructor
# (clf3) whose beginning lies outside this view.
algorithm=bestalgo, leaf_size=bestleaf, p=2, metric='minkowski')
clf4 = svm.SVC(C=bestC,
               kernel=ker,
               gamma=bestgamma,
               decision_function_shape='ovo',
               random_state=0)

# Stack NB / RF / KNN / SVM behind a cross-validated logistic regression.
estimator = [('NB', clf1), ('RF', clf2), ('KNN', clf3), ('SVM', clf4)]
clf5 = StackingClassifier(estimators=estimator,
                          final_estimator=LogisticRegressionCV(
                              cv=5, random_state=0),
                          stack_method='auto',
                          n_jobs=-1).fit(X_train, Y_train)
Stackscore = clf5.score(X_test, Y_test)
Stackscores.append(Stackscore)
ALLmodscores.append((Stackscore, Stackmodelname))
# Persist the fitted stack together with its data split for later reuse.
# NOTE(review): "Stackmodelresutls" is misspelled but the name may be used
# elsewhere in the file, so it is left unchanged here.
Stackmodelresutls = {
    'Stack': clf5,
    'X_train': X_train,
    'X_test': X_test,
    'Y_train': Y_train,
    'Y_test': Y_test
}
ALLmod[Stackmodelname] = Stackmodelresutls
print(Stackmodelname + 'done.')

# Rank all models by score; summarise per-family mean scores.
ALLmodscores_sorted = sorted(ALLmodscores, reverse=True)
LRmeanscore = (round(np.mean(LRscores), 4), 'LRmeanscore')
SVMmeansore = (round(np.mean(SVMscores), 4), 'SVMmeanscore')
KNNmeanscore = (round(np.mean(KNNscores), 4), 'KNNmeanscore')
# NOTE(review): fragment — the line below closes a final-estimator
# constructor (a class_weight dict plus tol) that begins outside this view.
}, tol=1e-6)
# 10-fold stacking over `estimator` with the tuned final estimator.
model = StackingClassifier(estimators=estimator,
                           final_estimator=final_est,
                           cv=10)
# NOTE(review): fit uses the sampled training set while the later score()
# calls use the full standardised sets — confirm this mix is intended.
model.fit(xtrain_sample, ytrain_sample)
yhat = model.predict(xtest_std)
auc = roc_auc_score(ytest, yhat)
print(classification_report(ytest, yhat))
print("AUC: ", round(auc, 2))
model.score(xtrain_std, ytrain)  # value only displayed in a REPL/notebook
model.score(xtest_std, ytest)    # likewise discarded in a plain script

# Logistic regression with best fitting tuning parameter
logreg = LogisticRegression(C=0.7,
                            class_weight={
                                True: 1.32,
                                False: 1.08
                            },
                            tol=1e-6)
logreg.fit(xtrain_sample, ytrain_sample)
yhat = logreg.predict(xtest_std)
auc = roc_auc_score(ytest, yhat)
# Pick the top-ranked model for the first key and re-evaluate it on the
# held-out test data, refitting from stored params if scoring fails.
modname = selectmodrocs[0][3]
selectmodels = selectmodrocresults[KEYS[0]]
selectmodel = selectmodels[modname]
selectmodelname = list(selectmodel.keys())[0]
selectmod = selectmodel[selectmodelname]
try:
    testacc = selectmod.score(select_U_data_test_mod, WHO_test)
except Exception as e:
    # Scoring failed: rebuild the model from its own hyper-parameters and
    # refit it on the stored training data, then score again.
    params = selectmod.get_params()
    # NOTE(review): training data/labels are pulled out by key *position*
    # (indices 1 and 3) — fragile if the dict layout changes; verify.
    X_train = selectmodel[list(selectmodel.keys())[1]]
    Y_train = selectmodel[list(selectmodel.keys())[3]]
    try:
        # SVC models expose C/kernel/gamma ...
        selectmod = svm.SVC(C=params['C'],
                            kernel=params['kernel'],
                            gamma=params['gamma'],
                            decision_function_shape='ovo',
                            random_state=0).fit(X_train, Y_train)
    except Exception as e:
        # ... otherwise assume a StackingClassifier parameter set.
        selectmod = StackingClassifier(
            estimators=params['estimators'],
            final_estimator=params['final_estimator'],
            cv=params['cv'],
            stack_method=params['stack_method'],
            n_jobs=-1).fit(X_train, Y_train)
    testacc = selectmod.score(select_U_data_test_mod, WHO_test)

# Record the result globally ...
allgroups_testaccs2.append((testacc, featurename, modname))
allgroups_testresults2[key] = {'allmodrocresults': allmodrocresults,
                               'allmodmeanaucs': allmodmeanaucs_sorted,
                               'select_U_features_mod': select_features_mod,
                               'select_U_data_mod': select_U_data_mod,
                               'select_U_data_test_mod': select_U_data_test_mod,
                               'trainCVacc': trainCVacc,
                               'testacc': testacc,
                               'selectmodel': selectmod}

# ... and also bucket it by the number of selected features (2-5).
featurenum = len(select_U_features_mod)
if featurenum == 2:
    groups2_testaccs2.append((testacc, featurename, modname))
    groups2_testresults2[key] = {'allmodrocresults': allmodrocresults,
                                 'allmodmeanaucs': allmodmeanaucs_sorted,
                                 'select_U_features_mod': select_features_mod,
                                 'select_U_data_mod': select_U_data_mod,
                                 'select_U_data_test_mod': select_U_data_test_mod,
                                 'trainCVacc': trainCVacc,
                                 'testacc': testacc,
                                 'selectmodel': selectmod}
elif featurenum == 3:
    groups3_testaccs2.append((testacc, featurename, modname))
    groups3_testresults2[key] = {'allmodrocresults': allmodrocresults,
                                 'allmodmeanaucs': allmodmeanaucs_sorted,
                                 'select_U_features_mod': select_features_mod,
                                 'select_U_data_mod': select_U_data_mod,
                                 'select_U_data_test_mod': select_U_data_test_mod,
                                 'trainCVacc': trainCVacc,
                                 'testacc': testacc,
                                 'selectmodel': selectmod}
elif featurenum == 4:
    groups4_testaccs2.append((testacc, featurename, modname))
    groups4_testresults2[key] = {'allmodrocresults': allmodrocresults,
                                 'allmodmeanaucs': allmodmeanaucs_sorted,
                                 'select_U_features_mod': select_features_mod,
                                 'select_U_data_mod': select_U_data_mod,
                                 'select_U_data_test_mod': select_U_data_test_mod,
                                 'trainCVacc': trainCVacc,
                                 'testacc': testacc,
                                 'selectmodel': selectmod}
elif featurenum == 5:
    groups5_testaccs2.append((testacc, featurename, modname))
    groups5_testresults2[key] = {'allmodrocresults': allmodrocresults,
                                 'allmodmeanaucs': allmodmeanaucs_sorted,
                                 'select_U_features_mod': select_features_mod,
                                 'select_U_data_mod': select_U_data_mod,
                                 'select_U_data_test_mod': select_U_data_test_mod,
                                 'trainCVacc': trainCVacc,
                                 'testacc': testacc,
                                 'selectmodel': selectmod}
# Third forest = RandomForestClassifier(n_estimators=13, criterion='entropy', min_samples_leaf=10, max_leaf_nodes=20, random_state=80) forest.fit(trainData, Y) # Logistic regression lr = LogisticRegression(solver='lbfgs', random_state=80) base_estimators = [('SVM', svm), ('Bagging DT', bagging), ('DecisionForest', forest)] sclf = StackingClassifier(estimators=base_estimators, final_estimator=lr, cv=2) sclf.fit(trainData, Y) score1 = sclf.score(trainData, Y) print('Logistic regression score:', score1) # Predictions names = ['dog.1049.jpg', 'dog.1028.jpg', 'dog.1011.jpg', 'cat.1016.jpg'] for name in names: singleImage = cv2.imread('data/test/' + name) histt = extract_histogram(singleImage) histt2 = histt.reshape(1, -1) prediction = sclf.predict(histt2) proba = sclf.predict_proba(histt2) print(f'Predictions for {name}:', proba)
# NOTE(review): fragment of a data-loading / evaluation section; `root`,
# `files`, `nca`, `get_600_data` are defined earlier in the file.
print(root, dir, files)  # NOTE(review): `dir` shadows the builtin here
x_train, y_train = get_600_data(os.path.join(root + '/' + files[0]))
x_test, y_test = get_600_data(os.path.join(root + '/' + files[1]))
# Learn the NCA transform on the training data, then stack GBM + RF on the
# transformed features with a logistic-regression blender.
nca.fit(x_train, y_train)
# clf3 = svm.SVC(kernel='rbf', decision_function_shape='ovo', degree=1, gamma='scale', coef0=1.0, shrinking=True, probability=True)
clf1 = GradientBoostingClassifier(n_estimators=100, random_state=123)
clf2 = RandomForestClassifier(n_estimators=1000, random_state=11)
model = StackingClassifier(
    estimators=[('GBM', clf1), ('RF', clf2)],
    final_estimator=LogisticRegression(solver='liblinear'))
# model.fit(x_train,y_train)
model.fit(nca.transform(x_train), y_train)
acc = model.score(nca.transform(x_test), y_test)
print(acc)
# predict = model.predict(x_test)
#
# import sklearn.metrics as metrics
# print ("Accuracy: {}%".format(metrics.accuracy_score(y_test, predict)))
# print ("Precision: {}%".format(100*metrics.precision_score(y_test, predict, average="weighted")))
# print ("Recall: {}%".format(100*metrics.recall_score(y_test, predict, average="weighted")))
# print ("f1_score: {}%".format(100*metrics.f1_score(y_test, predict, average="weighted")))
# print (metrics.confusion_matrix(y_test, predict))
# accuracy += acc
# precision_0 += metrics.precision_score(y_test, predict, pos_label=0, average="binary")
print("RFmodels done.")
# Re-evaluate every stored stacking model: accuracy plus ROC/AUC on its own
# test split, and an accuracy at the Youden-optimal probability threshold.
for k in Stackmodels.keys():
    modname = k
    modelresults = Stackmodels[k]
    Stack = modelresults['Stack']
    X_test = modelresults['X_test']
    Y_test = modelresults['Y_test']
    X_train = modelresults['X_train']
    Y_train = modelresults['Y_train']
    try:
        # Positive-class probabilities for the ROC curve.
        Y_pred = Stack.predict_proba(X_test)[:, 1]
    except Exception as e:
        # predict_proba failed (e.g. stale estimator) — rebuild the stack
        # from its own params and refit on the stored training split.
        params = Stack.get_params()
        Stack = StackingClassifier(estimators=params['estimators'],
                                   final_estimator=params['final_estimator'],
                                   cv=params['cv'],
                                   stack_method=params['stack_method'],
                                   n_jobs=-1).fit(X_train, Y_train)
        Y_pred = Stack.predict_proba(X_test)[:, 1]
    Stackscore = Stack.score(X_test, Y_test)
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
    roc_auc = auc(fpr, tpr)
    if roc_auc < 0.5:
        # Column orientation appears flipped — use the other class column.
        Y_pred = Stack.predict_proba(X_test)[:, 0]
        fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
        roc_auc = auc(fpr, tpr)
    # Re-threshold at the point maximising Youden's J (tpr - fpr).
    Y_pred2 = []
    optimal_idxf = np.argmax(tpr - fpr)
    optimal_thresholdsf = thresholds[optimal_idxf]
    for prob in Y_pred:
        if prob >= optimal_thresholdsf:
            Y_pred2.append(1)
        else:
            Y_pred2.append(0)
    Stackscore_th = accuracy_score(Y_test, Y_pred2)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

# Estimator combinations previously tried for the stack (kept for reference;
# clf_res/gridF/grid_search/gs_knn are fitted searches defined earlier):
#SVM KNN: ('s2', clf_res.best_estimator_), ('s', gs_knn.best_estimator_)
#SVM XGB: ('s2', clf_res.best_estimator_), ('s4',grid_search.best_estimator_)
#SVM Random: ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_)
#KNN XGB: ('s', gs_knn.best_estimator_), ('s4',grid_search.best_estimator_)
#KNN Random: ('s', gs_knn.best_estimator_), ('s3', gridF.best_estimator_)
#XGB Random: ('s4',grid_search.best_estimator_), ('s3', gridF.best_estimator_)
#SVM KNN XGB: ('s', gs_knn.best_estimator_), ('s2', clf_res.best_estimator_), ('s4',grid_search.best_estimator_)
#SVM KNN Random: ('s', gs_knn.best_estimator_), ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_)
#SVM XGB Random: ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_), ('s4',grid_search.best_estimator_)
#KNN XGB Random: ('s3', gridF.best_estimator_), ('s4',grid_search.best_estimator_), ('s', gs_knn.best_estimator_)
#SVM KNN XGB Random: ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_), ('s4',grid_search.best_estimator_), ('s', gs_knn.best_estimator_)

# Final choice: SVM + random forest + xgboost (`xb`) + KNN, blended by a
# logistic regression.
estimators_res = [('s2', clf_res.best_estimator_),
                  ('s3', gridF.best_estimator_), ('s4', xb),
                  ('s', gs_knn.best_estimator_)]
stacked = StackingClassifier(estimators=estimators_res,
                             final_estimator=LogisticRegression())
stacked.fit(feat_train_res, y_train)
# Accuracy on the train / validation / test feature sets.
print(stacked.score(feat_train_res, y_train))
print(stacked.score(feat_val_res, y_val))
print(stacked.score(feat_test_res, y_test))
names = []
scores = []
# Fit each candidate model on the scaled training data and record its
# test-set accuracy for a side-by-side comparison table.
for name, model in models:
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)
tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

from sklearn.ensemble import StackingClassifier

# Reuse the (name, estimator) pairs as the stack's base learners.
estimators = list(models)
lr = LogisticRegression()
sclf = StackingClassifier(estimators=estimators, final_estimator=lr)
# NOTE(review): the stack is fit on the *unscaled* features while the
# individual models above used the scaled ones — confirm this is intended.
sclf.fit(x_train, y_train)
sclf.score(x_test, y_test)

#saving the chosen model
# BUG FIX: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23 — import joblib directly (it ships with sklearn).
import joblib

rf_gscv = RandomForestClassifier(criterion='entropy',
                                 min_samples_leaf=3,
                                 min_samples_split=7,
                                 n_estimators=30,
                                 n_jobs=-1,
                                 random_state=123).fit(x_train_scaled, y_train)
joblib.dump(rf_gscv, 'q2d_rf_GSCV.pkl')

#Make Predictions using the saved model
rf_gscv = joblib.load("q2d_rf_GSCV.pkl")
# Record soft-voting ensemble scores in the shared results table.
ensembles_results['ensemble'].append('VotingClassifier_soft')
ensembles_results['train_score'].append(soft_voting_ensemble.score(X_train, y_train))
ensembles_results['validation_score'].append(soft_voting_ensemble.score(X_validation, y_validation))

# In[148]:

# Secondly, I will use StackingClassifier with the same DecisionTreeClassifier + KNeighborsClassifier
stacking_ensemble = StackingClassifier(
    estimators=[('knn_estimator', knn_estimator),
                ('dtc_estimator', dtc_estimator)])
stacking_ensemble.fit(X_train, y_train)
ensembles_results['ensemble'].append('StackingClassifier')
ensembles_results['train_score'].append(stacking_ensemble.score(X_train, y_train))
ensembles_results['validation_score'].append(stacking_ensemble.score(X_validation, y_validation))

# In[149]:

# Compare StackingClassifier with final AdaBoostClassifier
# NOTE(review): despite the comment above, `final_estimator=None` makes
# sklearn fall back to its default LogisticRegression final estimator —
# pass an AdaBoostClassifier explicitly if that comparison is intended.
ada_stacking_ensemble = StackingClassifier(
    estimators=[('knn_estimator', knn_estimator),
                ('dtc_estimator', dtc_estimator)],
    final_estimator=None,
    stack_method='auto',
    n_jobs=-1)
ada_stacking_ensemble.fit(X_train, y_train)
class Classifier(object):
    """Loads or trains a county-level party classifier for a model code.

    On construction, the feature frame is pulled from the database view.
    If no persisted model exists for the model code, a stacking ensemble
    (random forest + scaled linear SVC + KNN + naive Bayes, blended by
    logistic regression) is trained, scored and pickled back to the DB;
    otherwise the stored model is unpickled and reused.
    """

    def __init__(self, in_model_code, db, y_col="party", label_col="county_fips", where_clauses=None, data_view="master_data", year_col="year", year_test=2020):
        # NOTE(review): `label_col`, `where_clauses`, `year_col` and
        # `year_test` are currently unused in this method — confirm whether
        # they are vestigial or intended for future use.
        self.db = db
        self.mc = in_model_code
        # Columns configured to be dropped for this model code.
        self.drop_cols = db.query(ModelDropCol).filter_by(
            model_code_id=self.mc.id).all()
        # Optional SQL where-clauses stored against the model code.
        where = self.db.query(ModelWhereClause).filter_by(
            model_code=self.mc).all()
        if where:
            self.where = " where " + (" and ".join([wc.sql for wc in where]))
        else:
            self.where = ""
        self.engine_string = database_string
        self.query = f"select * from {data_view}{self.where}"
        # NOTE(review): this passes the module-level `database_string`
        # rather than self.engine_string — same value today, but confirm.
        self.df = pandas.read_sql_query(
            self.query,
            database_string).drop(columns=[dc.column for dc in self.drop_cols])
        self.y = self.df[y_col].to_numpy()
        self.x = self.df.drop(columns=y_col).to_numpy()
        self.model_obj = self.db.query(Model).filter_by(
            model_code=self.mc).first()
        if not self.model_obj:
            # No persisted model: build the stacking ensemble, train it,
            # and persist it via train()/save().
            rf = RandomForestClassifier(n_estimators=10, random_state=42)
            svr = make_pipeline(
                StandardScaler(),
                LinearSVC(random_state=42, dual=False, max_iter=1000))
            knn = KNeighborsClassifier(n_neighbors=3)
            nb = GaussianNB()
            classifiers = [("rf", rf), ("svr", svr), ("knn", knn), ("nb", nb)]
            self.model = StackingClassifier(
                estimators=classifiers, final_estimator=LogisticRegression())
            self.accuracy = None
            self.model_obj = Model(model_code=self.mc, accuracy=self.accuracy)
            self.db.add(self.model_obj)
            self.train()
            self.save()
        else:
            # Reuse the stored model.
            # NOTE(review): unpickling DB content — only safe if the DB
            # rows are fully trusted.
            self.model = pickle.loads(self.model_obj.model_object)
            self.accuracy = self.model_obj.accuracy

    def train(self):
        """Fit the model on a 67/33 split and record hold-out accuracy."""
        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.33)
        self.model.fit(x_train, y_train)
        self.accuracy = self.model.score(x_test, y_test)

    def save(self):
        """Pickle the fitted model and its accuracy back to the database."""
        self.model_obj.model_object = pickle.dumps(self.model)
        self.model_obj.accuracy = self.accuracy
        self.db.commit()

    def predict(self, fips, in_file_path=None):
        """Predict the 2020 party outcome per county.

        ``fips`` selects a single county, or "ALL"/"*" for every county.
        Predictions use the most recent data year minus 4 (the latest
        election whose data exists but was not trained on).  If
        ``in_file_path`` is given, results are also written there as CSV.
        Returns the list of per-county prediction dicts.
        """
        if "2020" in self.mc.id:
            raise IOError(
                "Must be a non-2020 model code to predict 2020 results.")
        year = 2020  # NOTE(review): assigned but never used below
        logging.info(f"Selecting {self.mc.id} model ({self.mc.description})")
        if fips in ["ALL", "*"]:
            and_clause = ""
            logging.info("Predicting all counties...")
            all_counties = True  # NOTE(review): never read afterwards
        else:
            and_clause = f" and county_fips = {fips}"
            all_counties = False
        max_year = self.db.execute(
            f"select max(year) from ({self.query})").scalar()
        search_year = max_year - 4
        data = pandas.read_sql_query(
            f"select * from ({self.query}) where year = '{search_year}'{and_clause}",
            self.engine_string).drop(
                columns=[dc.column for dc in self.drop_cols])
        fields = list(data.columns)
        county_fips_idx = None
        for i, f in enumerate(fields):
            if f == "county_fips":
                # -1 presumably compensates for the "party" column being
                # dropped from `x` below — verify column ordering.
                county_fips_idx = i - 1
                break
        y = data["party"].to_numpy()
        x = data.drop(columns=["party"]).to_numpy()
        predictions = self.model.predict(x)
        out_predictions = []
        fips_to_county = {}  # cache of county rows keyed by zero-padded fips
        logging.info("Predictions:")
        i = 0
        for val in x:
            pred = predictions[i]
            county_id = str(int(val[county_fips_idx])).zfill(6)
            if county_id in fips_to_county:
                county = fips_to_county[county_id]
            else:
                county = self.db.query(County).filter_by(id=county_id).first()
                fips_to_county[county_id] = county
            logging.info(f"{county.name} ({county.id}): {pred}")
            out_predictions.append({
                "party_prediction": pred,
                "county_fips": county_id,
                "county_name": county.name,
                "state_fips": county.state.id,
                "state_code": county.state.code
            })
            i += 1
        if in_file_path:
            logging.info(f"Writing output to {in_file_path}")
            out_cols = [
                "party_prediction", "county_fips", "county_name",
                "state_fips", "state_code"
            ]
            with open(in_file_path, "w") as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=out_cols)
                writer.writeheader()
                writer.writerows(out_predictions)
        return out_predictions
# NOTE(review): fragment — the line below is the tail of a
# StackingClassifier(...) constructor whose beginning is outside this view.
estimators=estimators, final_estimator=gdbt_clf, cv=5)
# clf = clf.fit(X, t)
# score = clf.score(X, t)
# print(score)

# 10-fold CV: fit on each training fold, record per-fold validation and
# training scores, then keep the fold with the best validation score.
kf = KFold(n_splits=10)
index = []
scores = []
train_scores2 = []
clfs = []
for train_index, validate_index in kf.split(X):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = t[train_index], t[validate_index]
    clf = clf.fit(X_train, y_train)
    # NOTE(review): `fit` returns the same estimator object, so `clfs`
    # ends up holding 10 references to one model — the "best fold" pick
    # below actually yields the last-fitted state.  Clone per fold if a
    # true best-fold model is wanted.
    clfs.append(clf)
    score = clf.score(X_validate, y_validate)
    scores.append(score)
    print(score)
    score = clf.score(X_train, y_train)
    train_scores2.append(score)
    index.append([train_index, validate_index])
    print(score)
max_index = scores.index(max(scores))
clf = clfs[max_index]

# Predict probabilities for the test set and prepend each row's listing_id.
result = clf.predict_proba(values_test1)
p = features2.index('listing_id')
list_id = values2[:, p].reshape((values2.shape[0], 1))
result = np.append(list_id, result, axis=1)
# %% Fit preprocessing on the training split only, then apply everywhere.
scaler = StandardScaler().fit(X_train)
encoder = LabelEncoder().fit(y_train)
X_train = scaler.transform(X_train)
y_train = encoder.transform(y_train)
X_dev = scaler.transform(X_dev)
y_dev = encoder.transform(y_dev)

# %% Stack a linear SVM and a logistic regression; a gradient-boosting
# model blends their outputs.
estimators = [
    ('svm', LinearSVC(C=0.0001)),
    ('log', LogisticRegression(penalty='l2', C=0.001, max_iter=1000)),
]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=GradientBoostingClassifier())
clf.fit(X_train, y_train)

# Accuracy and unweighted average recall (macro) on train and dev splits.
pred_train = clf.predict(X_train)
pred_dev = clf.predict(X_dev)
train_acc = clf.score(X_train, y_train)
dev_acc = clf.score(X_dev, y_dev)
train_uar = recall_score(y_train, pred_train, average='macro')
dev_uar = recall_score(y_dev, pred_dev, average='macro')
print(f"train_acc = {train_acc:.2f}, dev_acc = {dev_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, dev_uar = {dev_uar:.2f}")
"""
train_acc = 0.83, dev_acc = 0.47
train_uar = 0.83, dev_uar = 0.47
"""
# NOTE(review): fragment — the arguments below complete a Keras
# `compile(...)` call (inside `build_neural_net`) that begins outside
# this view.
        optimizer='rmsprop',
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=['accuracy'])
    return _neural_net


neural_net = build_neural_net()
print('\nFitting Neural Net...')
neural_net.fit(X_train, y_train, epochs=10)

# add a predictions layer
# Rounds the network's outputs to hard 0/1 predictions.
neural_net.add(layers.Lambda(lambda x: tf.math.round(x)))
neural_net.predict(X_test)

print('\nEvaluating model...')
neural_net.evaluate(X_test, y_test)

# Stack the previously defined `estimators` behind a high-iteration
# logistic regression and evaluate on the held-out test set.
stacking_clf = StackingClassifier(estimators=estimators,
                                  final_estimator=LogisticRegression(
                                      max_iter=10000, random_state=seed),
                                  n_jobs=-1,
                                  verbose=1)
# evaluate our final model
stacking_clf.fit(X_train, y_train)
score = stacking_clf.score(X_test, y_test)
print('\nStacking Classifier Score on the test set: {:.4f}'.format(score))