Example No. 1
def stacking_classifier(best_logistic_regression, best_knn_classifier,
                        best_gaussian_nb, best_decision_tree_classifier,
                        best_random_forest_classifier, x_train, x_test,
                        y_train, y_test):
    from sklearn.ensemble import StackingClassifier

    estimators = [
        # ('random_forest_cv', best_random_forest_classifier),
        ('knn_classifier_cv', best_knn_classifier),
        ('dct_cv', best_decision_tree_classifier),
        ('gaussian_nb_cv', best_gaussian_nb)
    ]

    final_stacking_classifier = StackingClassifier(
        estimators=estimators,
        # 'shuffle' and 'use_probas' belong to mlxtend's StackingClassifier;
        # sklearn's equivalent of use_probas=True is stack_method='predict_proba'
        stack_method='predict_proba',
        final_estimator=best_logistic_regression)

    final_stacking_classifier.fit(x_train, y_train)

    print("Stacking Classifier Training Score {}".format(
        final_stacking_classifier.score(x_train, y_train)))
    print("Stacking Classifier Testing Score {}\n".format(
        final_stacking_classifier.score(x_test, y_test)))

    y_predict = final_stacking_classifier.predict(x_test)
    classification_model = 'Stacking Classifier'

    confusion_matrix_graph(y_test, y_predict, classification_model)
    roc_curve_graph(y_test, y_predict, classification_model)
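The confusion_matrix_graph and roc_curve_graph helpers are not defined in this snippet. A minimal sketch of what they might look like, assuming they wrap sklearn's display utilities (the helper names and signatures come from the calls above; everything else is an assumption):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay


def confusion_matrix_graph(y_true, y_pred, title):
    # hypothetical helper: plot a confusion matrix from hard predictions
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
    plt.title(title)
    plt.show()


def roc_curve_graph(y_true, y_pred, title):
    # hypothetical helper: plot a ROC curve (hard labels give a single-point
    # curve; predict_proba scores would be the usual input)
    RocCurveDisplay.from_predictions(y_true, y_pred)
    plt.title(title)
    plt.show()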
Example No. 2
def test_stacking_classifier_iris(cv, final_estimator):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(scale(X_iris),
                                                        y_iris,
                                                        stratify=y_iris,
                                                        random_state=42)
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 6

    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 3
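The cv and final_estimator arguments are supplied from outside the test; in a pytest suite they would typically come from parametrize decorators. A plausible sketch (the actual values are not shown here):

import pytest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold


@pytest.mark.parametrize(
    "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)])
@pytest.mark.parametrize(
    "final_estimator", [None, RandomForestClassifier(random_state=42)])
def test_stacking_classifier_iris(cv, final_estimator):
    ...

On the column-count asserts: with the default stack_method='auto', LogisticRegression contributes 3 predict_proba columns and LinearSVC 3 decision_function columns for the 3-class iris problem, hence 6 meta-features; after 'lr' is dropped, 3 remain.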
Example No. 3
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(
        estimators=estimators, final_estimator=final_estimator, cv=cv,
        passthrough=passthrough
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
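    # 6 meta-features (3 predict_proba columns from lr + 3 decision_function
    # columns from svc); passthrough appends the 4 original iris features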
    expected_column_count = 10 if passthrough else 6
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])

    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    expected_column_count_drop = 7 if passthrough else 3
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])
Example No. 4
def week10(C, random_state, criterion, min_samples_leaf, max_leaf_samples,
           n_estimators, solver, cv, clazz, images):
    trainData, Y = catsvsdogs.train
    Y = [(y + 1) % 2 for y in Y]

    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier(
        criterion=criterion,  # split criterion
        min_samples_leaf=min_samples_leaf,  # minimum number of samples per leaf
        max_leaf_nodes=max_leaf_samples,  # maximum number of leaf nodes
        random_state=random_state)
    bagging = BaggingClassifier(
        tree,  # base estimator
        n_estimators=n_estimators,  # number of trees
        random_state=random_state)
    bagging.fit(trainData, Y)

    from sklearn.svm import LinearSVC

    svm = LinearSVC(random_state=random_state, C=C)
    svm.fit(trainData, Y)

    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(
        n_estimators=n_estimators,  # number of trees
        criterion=criterion,  # split criterion
        min_samples_leaf=min_samples_leaf,  # minimum number of samples per leaf
        max_leaf_nodes=max_leaf_samples,  # maximum number of leaf nodes
        random_state=random_state)
    forest.fit(trainData, Y)

    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression(solver=solver, random_state=random_state)

    from sklearn.ensemble import StackingClassifier

    base_estimators = [('SVM', svm), ('Bagging DT', bagging),
                       ('DecisionForest', forest)]
    sclf = StackingClassifier(estimators=base_estimators,
                              final_estimator=lr,
                              cv=cv)
    sclf.fit(trainData, Y)

    accuracy = sclf.score(trainData, Y)

    probas = []
    for img in images:
        histt = catsvsdogs.test[img].reshape(1, -1)
        probas += [(img, sclf.predict_proba(histt)[0][clazz])]

    return {'accuracy': accuracy, 'probas': probas}
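A hypothetical invocation of week10, with values mirroring Example No. 16 below (the catsvsdogs module is assumed, and C=1.0 is an arbitrary choice):

result = week10(C=1.0, random_state=80, criterion='entropy',
                min_samples_leaf=10, max_leaf_samples=20, n_estimators=13,
                solver='lbfgs', cv=2, clazz=1,
                images=['dog.1049.jpg', 'cat.1016.jpg'])
print(result['accuracy'], result['probas'])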
Example No. 5
def test_stacking():
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)
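    # IREP and RIPPER are rule-induction classifiers, presumably from the
    # wittgenstein package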

    df = DF.copy()
    numeric_cols = df.select_dtypes("number").columns
    categorical_cols = [
        col for col in df.columns
        if (col not in numeric_cols and not col == CLASS_FEAT)
    ]
    dum_df = pd.get_dummies(df[categorical_cols])
    for col in numeric_cols:
        dum_df[col] = df[col]
    dum_df[CLASS_FEAT] = df[CLASS_FEAT]
    sktrain, sktest = df_shuffled_split(dum_df, random_state=42)
    sktrain_x, sktrain_y = sktrain.drop(CLASS_FEAT, axis=1), sktrain[CLASS_FEAT]
    sktest_x, sktest_y = sktest.drop(CLASS_FEAT, axis=1), sktest[CLASS_FEAT]

    lone_tree = DecisionTreeClassifier(random_state=42)
    lone_tree.fit(sktrain_x, sktrain_y)
    lone_tree_score = lone_tree.score(sktest_x, sktest_y)
    # print('lone_tree_score',lone_tree_score)

    irep_tree = SVC(random_state=42)
    irep_stack_estimators = [("irep", irep), ("tree", irep_tree)]
    irep_stack = StackingClassifier(estimators=irep_stack_estimators,
                                    final_estimator=LogisticRegression())
    irep_stack.fit(sktrain_x, sktrain_y)
    irep_stack_score = irep_stack.score(sktest_x, sktest_y)
    # print('irep_stack_score', irep_stack_score)
    assert irep_stack_score != lone_tree_score

    rip_tree = DecisionTreeClassifier(random_state=42)
    rip_stack_estimators = [("rip", rip), ("tree", rip_tree)]
    rip_stack = StackingClassifier(estimators=rip_stack_estimators,
                                   final_estimator=LogisticRegression())
    rip_stack.fit(sktrain_x, sktrain_y)
    rip_stack_score = rip_stack.score(sktest_x, sktest_y)
    # print('rip_stack_score',rip_stack_score)
    assert rip_stack_score != lone_tree_score
Example No. 6
 def perform_stacking(self):
     eclfs = [(k, v) for k, v in self.__classifiers.items()]
     clf = StackingClassifier(estimators=eclfs,
                              final_estimator=LogisticRegression(),
                              cv=5,
                              verbose=1,
                              n_jobs=-1)
     clf.fit(self.__train_x, self.__train_y)
     # score() returns a single accuracy, so there is no spread to report
     score = clf.score(self.__test_x, self.__test_y)
     print("Accuracy: %0.2f [%s]" % (score, 'StackingClassifier'))
     self.plot_conf_mat(clf, 'StackingClassifier')
     return clf
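The original print pattern ("+/-") expects a distribution of scores, while score() returns one number. A sketch of a variant for the method body that recovers mean and spread via cross-validation (same private attributes assumed):

     from sklearn.model_selection import cross_val_score

     # hypothetical variant: cross-validate the stack to obtain a score array
     cv_scores = cross_val_score(clf, self.__train_x, self.__train_y,
                                 cv=5, n_jobs=-1)
     print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
           (cv_scores.mean(), cv_scores.std(), 'StackingClassifier'))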
Example No. 7
def ensemble_(feat, tar, split):
    scaler = MinMaxScaler()
    x_tr, x_te, y_tr, y_te = train_test_split(feat, tar, test_size=split, shuffle=True)
    scaler.fit(x_tr)
    x_tr = scaler.transform(x_tr)
    x_te = scaler.transform(x_te)
    
    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': np.arange(1, 25)}
    knn_gs = GridSearchCV(knn, params_knn, cv=5)
    knn_gs.fit(x_tr, y_tr)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)
    
    rf = RandomForestClassifier()
    params_rf = {'n_estimators': [50, 100, 200,300,400]}
    rf_gs = GridSearchCV(rf, params_rf, cv=5)
    rf_gs.fit(x_tr, y_tr)
    rf_best = rf_gs.best_estimator_
    print(rf_gs.best_params_)
    
    
    log_reg = LogisticRegression()
    log_reg.fit(x_tr, y_tr)
    
    print('knn: {}'.format(knn_best.score(x_te, y_te)))
    print('rf: {}'.format(rf_best.score(x_te, y_te)))
    print('log_reg: {}'.format(log_reg.score(x_te, y_te)))
    
    estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]
    ensemble = VotingClassifier(estimators, voting='hard')
    ensemble.fit(x_tr, y_tr)
    print("ensemble voting score: {}".format(ensemble.score(x_te, y_te)))

    ensemble_bagging = BaggingClassifier(base_estimator=RandomForestClassifier(),
                                         n_estimators=10)
    ensemble_bagging.fit(x_tr, y_tr)
    print("ensemble bagging score: {}".format(ensemble_bagging.score(x_te, y_te)))

    ensemble_stacking = StackingClassifier(estimators=estimators,
                                           final_estimator=LogisticRegression())
    ensemble_stacking.fit(x_tr, y_tr)
    print("ensemble stacking score: {}".format(ensemble_stacking.score(x_te, y_te)))
Example No. 8
    sex = lineX[4]
    # ageGroup = math.floor(lineX[2] / 20)
    # if ageGroup not in dictAgeSurvived[sex]:
    #     dictAgeSurvived[sex][ageGroup] = 0
    #     dictAgeAll[sex][ageGroup] = 0
    # dictAgeSurvived[sex][ageGroup] += result
    # dictAgeAll[sex][ageGroup] = dictAgeAll[sex][ageGroup] + 1

# x = np.arange(len(dictAgeAll[0].keys()))
# width = 0.35

# resultSex = []
# for sex in range(2):
#     resultSex.append([])
#     for i in range(len(dictAgeAll[0].keys())):
#         resultSex[sex].append(dictAgeSurvived[sex][i] / dictAgeAll[sex][i])

# fig, ax = plt.subplots()
# ax.bar(x - width/2, resultSex[0], width=width, label='female')
# ax.bar(x + width/2, resultSex[1], width=width, label='male')
# ax.set_xticks(x)
# ax.set_xticklabels(['0-20', '20-40', '40-60', '60-80'])
x = np.linspace(0, 80, 4)

# resultSex is computed in the commented block above; uncomment it (and the
# dictAgeSurvived/dictAgeAll accumulation) before running this plot
plt.plot(x, resultSex[0], 'red')
plt.plot(x, resultSex[1], 'blue')

plt.show()

print(clf.score(X, y))
Example No. 9
# Step 2: split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=40,
                                                    stratify=y)

# Step 3: training
clf = StackingClassifier(estimators=[
    ('knn',
     KNeighborsClassifier(n_neighbors=7,
                          weights='distance',
                          leaf_size=1,
                          metric='manhattan')),
    ('ridge',
     RidgeClassifier(random_state=40, class_weight='balanced', alpha=0.00001)),
],
                         final_estimator=LogisticRegression(
                             random_state=40,
                             class_weight='balanced',
                             max_iter=10000),
                         cv=5,
                         n_jobs=-1)
clf.fit(X_train, y_train)
score_train = clf.score(X_train, y_train)
score_test = clf.score(X_test, y_test)
print(
    'scores for StackingClassifier(score on training set/testing set):{:.2f}/{:.2f}'
    .format(score_train, score_test))
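RidgeClassifier exposes no predict_proba, so with the default stack_method='auto' the stack passes its decision_function output to the final estimator, while the KNN contributes class probabilities. The fitted stack_method_ attribute records the choice per estimator:

print(clf.stack_method_)  # e.g. ['predict_proba', 'decision_function']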
Example No. 10
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris


def load_data():
    data = load_iris()
    x, y = data.data, data.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    return x_train, x_test, y_train, y_test


if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data()
    estimators = [('logist', LogisticRegression(max_iter=500)),
                  ('knn', KNeighborsClassifier(n_neighbors=3))]
    stacking = StackingClassifier(estimators=estimators,
                                  final_estimator=DecisionTreeClassifier())
    stacking.fit(x_train, y_train)
    acc = stacking.score(x_test, y_test)
    print("模型在测试集上的准确率为:", acc)
Example No. 11
stk = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression(C=0.1, class_weight='balanced',
                                                            random_state=1234),
                         cv=5)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1234)

svc.fit(X_train, y_train)
svc.score(X_train, y_train)
svc.score(X_test, y_test)

rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)

dt.fit(X_train, y_train)
dt.score(X_train, y_train)
dt.score(X_test, y_test)

vot_c.fit(X_train, y_train)
vot_c.score(X_train, y_train)
vot_c.score(X_test, y_test)

stk.fit(X_train, y_train)
stk.score(X_train, y_train)
stk.score(X_test, y_test)

svc.score(X_test, y_test)
rf.score(X_test, y_test)
dt.score(X_test, y_test)
vot_c.score(X_test, y_test)
stk.score(X_test, y_test)
Example No. 12
# (the head of the base_learners list is truncated in this snippet)
                  GradientBoostingClassifier(n_estimators=100,
                                             learning_rate=1.0,
                                             max_depth=1,
                                             random_state=0))]
layer_two_estimators = [
    ('dt_2', DecisionTreeClassifier()),
    ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
]
layer_two = StackingClassifier(estimators=layer_two_estimators,
                               final_estimator=LogisticRegression())
# clf = StackingClassifier(estimators=base_learners, final_estimator=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1))
stack_clf = StackingClassifier(estimators=base_learners,
                               final_estimator=LogisticRegression(),
                               cv=10)
stack_clf.fit(train[:, 0:(n2 - 1)], train[:, (n2 - 1)])
stack_acc = stack_clf.score(test[:, 0:(n2 - 1)], test[:, (n2 - 1)])

#voting
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

v_clf = VotingClassifier(estimators=base_learners, voting='soft')
v_clf.fit(data[:, 0:(n2 - 1)], data[:, (n2 - 1)])
v_scores = cross_val_score(v_clf,
                           data[:, 0:(n2 - 1)],
                           data[:, (n2 - 1)],
                           scoring='accuracy',
                           cv=5).mean()

df = pd.read_csv('test.csv', delimiter=',')
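The fragment above builds a second stacking layer (layer_two) but fits stack_clf with a plain LogisticRegression meta-learner. A self-contained sketch of the two-layer pattern, with the truncated head of base_learners filled in by assumption:

from sklearn.datasets import load_iris
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier, StackingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

base_learners = [
    ('dt', DecisionTreeClassifier(random_state=0)),  # assumed head of the list
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                      max_depth=1, random_state=0)),
]
layer_two_estimators = [
    ('dt_2', DecisionTreeClassifier()),
    ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
]
layer_two = StackingClassifier(estimators=layer_two_estimators,
                               final_estimator=LogisticRegression())

# chaining: the inner stack serves as the meta-learner of the outer one
clf = StackingClassifier(estimators=base_learners, final_estimator=layer_two)
print(clf.fit(X_train, y_train).score(X_test, y_test))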
Example No. 13
                                 algorithm=bestalgo,
                                 leaf_size=bestleaf,
                                 p=2,
                                 metric='minkowski')
     clf4 = svm.SVC(C=bestC,
                    kernel=ker,
                    gamma=bestgamma,
                    decision_function_shape='ovo',
                    random_state=0)
     estimator = [('NB', clf1), ('RF', clf2), ('KNN', clf3), ('SVM', clf4)]
     clf5 = StackingClassifier(estimators=estimator,
                               final_estimator=LogisticRegressionCV(
                                   cv=5, random_state=0),
                               stack_method='auto',
                               n_jobs=-1).fit(X_train, Y_train)
     Stackscore = clf5.score(X_test, Y_test)
     Stackscores.append(Stackscore)
     ALLmodscores.append((Stackscore, Stackmodelname))
     Stackmodelresults = {
         'Stack': clf5,
         'X_train': X_train,
         'X_test': X_test,
         'Y_train': Y_train,
         'Y_test': Y_test
     }
     ALLmod[Stackmodelname] = Stackmodelresults
     print(Stackmodelname + ' done.')
 ALLmodscores_sorted = sorted(ALLmodscores, reverse=True)
 LRmeanscore = (round(np.mean(LRscores), 4), 'LRmeanscore')
 SVMmeanscore = (round(np.mean(SVMscores), 4), 'SVMmeanscore')
 KNNmeanscore = (round(np.mean(KNNscores), 4), 'KNNmeanscore')
Example No. 14
                               },
                               tol=1e-6)

model = StackingClassifier(estimators=estimator,
                           final_estimator=final_est,
                           cv=10)

model.fit(xtrain_sample, ytrain_sample)

yhat = model.predict(xtest_std)
auc = roc_auc_score(ytest, yhat)

print(classification_report(ytest, yhat))
print("AUC: ", round(auc, 2))

model.score(xtrain_std, ytrain)
model.score(xtest_std, ytest)

# Logistic regression with best fitting tuning parameter
logreg = LogisticRegression(C=0.7,
                            class_weight={
                                True: 1.32,
                                False: 1.08
                            },
                            tol=1e-6)

logreg.fit(xtrain_sample, ytrain_sample)

yhat = logreg.predict(xtest_std)
auc = roc_auc_score(ytest, yhat)
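Both AUC computations in this example pass hard predict() labels to roc_auc_score, which understates the AUC; the conventional input is a probability score. A sketch for the logistic model (the stacked model above supports the same call):

yscore = logreg.predict_proba(xtest_std)[:, 1]
print("AUC (from probabilities):", round(roc_auc_score(ytest, yscore), 2))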
Example No. 15
 modname = selectmodrocs[0][3]
 selectmodels = selectmodrocresults[KEYS[0]]
 selectmodel = selectmodels[modname]
 selectmodelname = list(selectmodel.keys())[0]
 selectmod = selectmodel[selectmodelname]
 try:
     testacc = selectmod.score(select_U_data_test_mod, WHO_test)
 except Exception as e:
     params = selectmod.get_params()
     X_train = selectmodel[list(selectmodel.keys())[1]]
     Y_train = selectmodel[list(selectmodel.keys())[3]]
     try:
         selectmod = svm.SVC(C=params['C'], kernel=params['kernel'], gamma=params['gamma'],decision_function_shape='ovo', random_state=0).fit(X_train, Y_train)
     except Exception as e:
         selectmod = StackingClassifier(estimators=params['estimators'],final_estimator=params['final_estimator'],cv=params['cv'],stack_method=params['stack_method'],n_jobs=-1).fit(X_train,Y_train)
     testacc = selectmod.score(select_U_data_test_mod, WHO_test)
 results_entry = {
     'allmodrocresults': allmodrocresults,
     'allmodmeanaucs': allmodmeanaucs_sorted,
     'select_U_features_mod': select_features_mod,
     'select_U_data_mod': select_U_data_mod,
     'select_U_data_test_mod': select_U_data_test_mod,
     'trainCVacc': trainCVacc,
     'testacc': testacc,
     'selectmodel': selectmod
 }
 allgroups_testaccs2.append((testacc, featurename, modname))
 allgroups_testresults2[key] = results_entry
 featurenum = len(select_U_features_mod)
 # route the identical entry to the per-feature-count accumulators
 group_accs = {2: groups2_testaccs2, 3: groups3_testaccs2,
               4: groups4_testaccs2, 5: groups5_testaccs2}
 group_results = {2: groups2_testresults2, 3: groups3_testresults2,
                  4: groups4_testresults2, 5: groups5_testresults2}
 if featurenum in group_accs:
     group_accs[featurenum].append((testacc, featurename, modname))
     group_results[featurenum][key] = results_entry
Example No. 16

# Third
forest = RandomForestClassifier(n_estimators=13,
                                criterion='entropy',
                                min_samples_leaf=10,
                                max_leaf_nodes=20,
                                random_state=80)
forest.fit(trainData, Y)


# Logistic regression
lr = LogisticRegression(solver='lbfgs', random_state=80)
base_estimators = [('SVM', svm), ('Bagging DT', bagging), ('DecisionForest', forest)]
sclf = StackingClassifier(estimators=base_estimators, final_estimator=lr, cv=2)
sclf.fit(trainData, Y)

score1 = sclf.score(trainData, Y)
print('Stacking classifier training score:', score1)


# Predictions
names = ['dog.1049.jpg', 'dog.1028.jpg', 'dog.1011.jpg', 'cat.1016.jpg']

for name in names:
    singleImage = cv2.imread('data/test/' + name)
    histt = extract_histogram(singleImage)
    histt2 = histt.reshape(1, -1)
    prediction = sclf.predict(histt2)
    proba = sclf.predict_proba(histt2)
    print(f'Predictions for {name}:', proba)
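extract_histogram is not defined in this snippet; a common implementation for this cats-vs-dogs exercise, assuming a flattened, normalized 8-bin-per-channel BGR color histogram:

def extract_histogram(image, bins=(8, 8, 8)):
    # hypothetical helper: 3D color histogram, normalized and flattened
    hist = cv2.calcHist([image], [0, 1, 2], None, list(bins),
                        [0, 256, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()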
Example No. 17
        print(root, dir, files)
        x_train, y_train = get_600_data(os.path.join(root, files[0]))
        x_test, y_test = get_600_data(os.path.join(root, files[1]))

        nca.fit(x_train, y_train)

        # clf3 = svm.SVC(kernel='rbf', decision_function_shape='ovo', degree=1, gamma='scale', coef0=1.0, shrinking=True, probability=True)
        clf1 = GradientBoostingClassifier(n_estimators=100, random_state=123)
        clf2 = RandomForestClassifier(n_estimators=1000, random_state=11)
        model = StackingClassifier(
            estimators=[('GBM', clf1), ('RF', clf2)],
            final_estimator=LogisticRegression(solver='liblinear'))

        # model.fit(x_train,y_train)
        model.fit(nca.transform(x_train), y_train)
        acc = model.score(nca.transform(x_test), y_test)

        print(acc)

        # predict = model.predict(x_test)
        #
        # import sklearn.metrics as metrics
        # print ("Accuracy: {}%".format(metrics.accuracy_score(y_test, predict)))
        # print ("Precision: {}%".format(100*metrics.precision_score(y_test, predict, average="weighted")))
        # print ("Recall: {}%".format(100*metrics.recall_score(y_test, predict, average="weighted")))
        # print ("f1_score: {}%".format(100*metrics.f1_score(y_test, predict, average="weighted")))
        # print (metrics.confusion_matrix(y_test, predict))
        #
        accuracy += acc

        # precision_0 += metrics.precision_score(y_test, predict, pos_label=0, average="binary")
Example No. 18
 print("RFmodels done.")
 for k in Stackmodels.keys():
     modname = k
     modelresults = Stackmodels[k]
     Stack = modelresults['Stack']
     X_test = modelresults['X_test']
     Y_test = modelresults['Y_test']
     X_train = modelresults['X_train']
     Y_train = modelresults['Y_train']
     try:
         Y_pred = Stack.predict_proba(X_test)[:, 1]
     except Exception as e:
         params = Stack.get_params()
         Stack = StackingClassifier(estimators=params['estimators'], final_estimator=params['final_estimator'], cv=params['cv'], stack_method=params['stack_method'], n_jobs=-1).fit(X_train, Y_train)
         Y_pred = Stack.predict_proba(X_test)[:, 1]
     Stackscore = Stack.score(X_test, Y_test)
     fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
     roc_auc = auc(fpr, tpr)
     if roc_auc < 0.5:
         Y_pred = Stack.predict_proba(X_test)[:, 0]
         fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
         roc_auc = auc(fpr, tpr)
     Y_pred2 = []
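     # Youden's J statistic: choose the threshold that maximizes tpr - fpr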
     optimal_idxf = np.argmax(tpr-fpr)
     optimal_thresholdsf = thresholds[optimal_idxf]
     for prob in Y_pred:
         if prob >= optimal_thresholdsf:
             Y_pred2.append(1)
         else:
             Y_pred2.append(0)
     Stackscore_th = accuracy_score(Y_test, Y_pred2)
Example No. 19
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

#SVM KNN: ('s2', clf_res.best_estimator_), ('s', gs_knn.best_estimator_)
#SVM XGB: ('s2', clf_res.best_estimator_), ('s4',grid_search.best_estimator_)
#SVM Random: ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_)
#KNN XGB: ('s', gs_knn.best_estimator_), ('s4',grid_search.best_estimator_)
#KNN Random: ('s', gs_knn.best_estimator_), ('s3', gridF.best_estimator_)
#XGB Random: ('s4',grid_search.best_estimator_), ('s3', gridF.best_estimator_)
#SVM KNN XGB: ('s', gs_knn.best_estimator_), ('s2', clf_res.best_estimator_), ('s4',grid_search.best_estimator_)
#SVM KNN Random: ('s', gs_knn.best_estimator_), ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_)
#SVM XGB Random: ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_), ('s4',grid_search.best_estimator_)
#KNN XGB Random: ('s3', gridF.best_estimator_), ('s4',grid_search.best_estimator_), ('s', gs_knn.best_estimator_)
#SVM KNN XGB Random: ('s2', clf_res.best_estimator_), ('s3', gridF.best_estimator_), ('s4',grid_search.best_estimator_), ('s', gs_knn.best_estimator_)
estimators_res = [('s2', clf_res.best_estimator_),
                  ('s3', gridF.best_estimator_), ('s4', xb),
                  ('s', gs_knn.best_estimator_)]

stacked = StackingClassifier(estimators=estimators_res,
                             final_estimator=LogisticRegression())

stacked.fit(feat_train_res, y_train)

print(stacked.score(feat_train_res, y_train))
print(stacked.score(feat_val_res, y_val))
print(stacked.score(feat_test_res, y_test))
Example No. 20
names = []
scores = []
for name, model in models:
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)
    tr_split = pd.DataFrame({'Name': names, 'Score': scores})
    print(tr_split)
    
from sklearn.ensemble import StackingClassifier
estimators = list(models)
lr = LogisticRegression()
sclf = StackingClassifier(estimators=estimators, final_estimator=lr)
# fit on the scaled features, matching the base models above
sclf.fit(x_train_scaled, y_train)
print(sclf.score(x_test_scaled, y_test))



#saving the chosen model

import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

rf_gscv = RandomForestClassifier(criterion='entropy', min_samples_leaf=3, 
                                  min_samples_split=7, n_estimators=30, n_jobs=-1, 
                                  random_state=123).fit(x_train_scaled, y_train)
joblib.dump(rf_gscv, 'q2d_rf_GSCV.pkl')


#Make Predictions using the saved model
rf_gscv = joblib.load("q2d_rf_GSCV.pkl")
Example No. 21
ensembles_results['ensemble'].append('VotingClassifier_soft')
ensembles_results['train_score'].append(soft_voting_ensemble.score(X_train, y_train))
ensembles_results['validation_score'].append(soft_voting_ensemble.score(X_validation,y_validation))


# In[148]:


# Secondly, I will use StackingClassifier with the same DecisionTreeClassifier + KNeighborsClassifier 

stacking_ensemble = StackingClassifier(estimators=[('knn_estimator',knn_estimator), ('dtc_estimator',dtc_estimator)])

stacking_ensemble.fit(X_train, y_train)

ensembles_results['ensemble'].append('StackingClassifier')
ensembles_results['train_score'].append(stacking_ensemble.score(X_train, y_train))
ensembles_results['validation_score'].append(stacking_ensemble.score(X_validation,y_validation))


# In[149]:


# Compare StackingClassifier with AdaBoostClassifier as the final estimator
# (the original passed final_estimator=None, which falls back to
# LogisticRegression; AdaBoostClassifier here matches the comment's intent)
ada_stacking_ensemble = StackingClassifier(estimators=[('knn_estimator', knn_estimator),
                                                       ('dtc_estimator', dtc_estimator)],
                                           final_estimator=AdaBoostClassifier(),
                                           stack_method='auto',
                                           n_jobs=-1)

ada_stacking_ensemble.fit(X_train, y_train)
Example No. 22
class Classifier(object):
    def __init__(self,
                 in_model_code,
                 db,
                 y_col="party",
                 label_col="county_fips",
                 where_clauses=None,
                 data_view="master_data",
                 year_col="year",
                 year_test=2020):
        self.db = db
        self.mc = in_model_code
        self.drop_cols = db.query(ModelDropCol).filter_by(
            model_code_id=self.mc.id).all()

        where = self.db.query(ModelWhereClause).filter_by(
            model_code=self.mc).all()
        if where:
            self.where = " where " + (" and ".join([wc.sql for wc in where]))
        else:
            self.where = ""

        self.engine_string = database_string
        self.query = f"select * from {data_view}{self.where}"
        self.df = pandas.read_sql_query(
            self.query,
            database_string).drop(columns=[dc.column for dc in self.drop_cols])

        self.y = self.df[y_col].to_numpy()
        self.x = self.df.drop(columns=y_col).to_numpy()

        self.model_obj = self.db.query(Model).filter_by(
            model_code=self.mc).first()
        if not self.model_obj:

            rf = RandomForestClassifier(n_estimators=10, random_state=42)
            svr = make_pipeline(
                StandardScaler(),
                LinearSVC(random_state=42, dual=False, max_iter=1000))
            knn = KNeighborsClassifier(n_neighbors=3)
            nb = GaussianNB()
            classifiers = [("rf", rf), ("svr", svr), ("knn", knn), ("nb", nb)]
            self.model = StackingClassifier(
                estimators=classifiers, final_estimator=LogisticRegression())
            self.accuracy = None
            self.model_obj = Model(model_code=self.mc, accuracy=self.accuracy)
            self.db.add(self.model_obj)
            self.train()
            self.save()
        else:
            self.model = pickle.loads(self.model_obj.model_object)
            self.accuracy = self.model_obj.accuracy

    def train(self):
        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.33)
        self.model.fit(x_train, y_train)
        self.accuracy = self.model.score(x_test, y_test)

    def save(self):
        self.model_obj.model_object = pickle.dumps(self.model)
        self.model_obj.accuracy = self.accuracy
        self.db.commit()

    def predict(self, fips, in_file_path=None):
        """
        Currently hard coded to predict for 2020, or the latest election in which all data
        as available, but not trained on.
        """
        if "2020" in self.mc.id:
            raise IOError(
                "Must be a non-2020 model code to predict 2020 results.")
        year = 2020
        logging.info(f"Selecting {self.mc.id} model ({self.mc.description})")
        if fips in ["ALL", "*"]:
            and_clause = ""
            logging.info("Predicting all counties...")
            all_counties = True
        else:
            and_clause = f" and county_fips = {fips}"
            all_counties = False
        max_year = self.db.execute(
            f"select max(year) from ({self.query})").scalar()
        search_year = max_year - 4

        data = pandas.read_sql_query(
            f"select * from ({self.query}) where year = '{search_year}'{and_clause}",
            self.engine_string).drop(
                columns=[dc.column for dc in self.drop_cols])

        fields = list(data.columns)
        county_fips_idx = None
        for i, f in enumerate(fields):
            if f == "county_fips":
                county_fips_idx = i - 1
                break

        y = data["party"].to_numpy()
        x = data.drop(columns=["party"]).to_numpy()

        predictions = self.model.predict(x)
        out_predictions = []
        fips_to_county = {}
        logging.info("Predictions:")
        i = 0

        for val in x:
            pred = predictions[i]
            county_id = str(int(val[county_fips_idx])).zfill(6)
            if county_id in fips_to_county:
                county = fips_to_county[county_id]
            else:
                county = self.db.query(County).filter_by(id=county_id).first()
                fips_to_county[county_id] = county

            logging.info(f"{county.name} ({county.id}): {pred}")
            out_predictions.append({
                "party_prediction": pred,
                "county_fips": county_id,
                "county_name": county.name,
                "state_fips": county.state.id,
                "state_code": county.state.code
            })
            i += 1

        if in_file_path:
            logging.info(f"Writing output to {in_file_path}")
            out_cols = [
                "party_prediction", "county_fips", "county_name", "state_fips",
                "state_code"
            ]
            with open(in_file_path, "w") as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=out_cols)
                writer.writeheader()
                writer.writerows(out_predictions)
        return out_predictions
Example No. 23
     estimators=estimators, final_estimator=gdbt_clf, cv=5)

# clf = clf.fit(X, t)
# score = clf.score(X, t)
# print(score)
kf = KFold(n_splits=10)
index = []
scores = []
train_scores2 = []
clfs = []
from sklearn.base import clone

for train_index, validate_index in kf.split(X):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = t[train_index], t[validate_index]
    # clone before fitting so each fold keeps its own fitted model; refitting
    # the same object would leave every clfs entry pointing at the last fit
    fold_clf = clone(clf).fit(X_train, y_train)
    clfs.append(fold_clf)
    score = fold_clf.score(X_validate, y_validate)
    scores.append(score)
    print(score)
    score = fold_clf.score(X_train, y_train)
    train_scores2.append(score)
    index.append([train_index, validate_index])
    print(score)

max_index = scores.index(max(scores))
clf = clfs[max_index]

result = clf.predict_proba(values_test1)
p = features2.index('listing_id')

list_id = values2[:, p].reshape((values2.shape[0], 1))
result = np.append(list_id, result, axis=1)
Example No. 24
# %% Build pipeline
scaler = StandardScaler().fit(X_train)
encoder = LabelEncoder().fit(y_train)

X_train, y_train = scaler.transform(X_train), encoder.transform(y_train)
X_dev, y_dev = scaler.transform(X_dev), encoder.transform(y_dev)

# %%
estimators = [
    ('svm', LinearSVC(C=0.0001)),
    ('log', LogisticRegression(penalty='l2', C=0.001, max_iter=1000))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=GradientBoostingClassifier()
)

clf.fit(X_train, y_train)

pred_train, pred_dev = clf.predict(X_train), clf.predict(X_dev)
train_acc = clf.score(X_train, y_train)
dev_acc = clf.score(X_dev, y_dev)
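# UAR = unweighted average recall, i.e. macro-averaged recall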
train_uar = recall_score(y_train, pred_train, average='macro')
dev_uar = recall_score(y_dev, pred_dev, average='macro')

print(f"train_acc = {train_acc:.2f}, dev_acc = {dev_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, dev_uar = {dev_uar:.2f}")

"""
train_acc = 0.83, dev_acc = 0.47
train_uar = 0.83, dev_uar = 0.47
"""
Example No. 25
        optimizer='rmsprop',
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=['accuracy'])

    return _neural_net


neural_net = build_neural_net()

print('\nFitting Neural Net...')
neural_net.fit(X_train, y_train, epochs=10)

# add a predictions layer
neural_net.add(layers.Lambda(lambda x: tf.math.round(x)))
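# note: with from_logits=True the model outputs logits; applying tf.sigmoid
# before tf.math.round would be the usual way to get 0/1 predictions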

neural_net.predict(X_test)

print('\nEvaluating model...')
neural_net.evaluate(X_test, y_test)

stacking_clf = StackingClassifier(estimators=estimators,
                                  final_estimator=LogisticRegression(
                                      max_iter=10000, random_state=seed),
                                  n_jobs=-1,
                                  verbose=1)

# evaluate our final model
stacking_clf.fit(X_train, y_train)
score = stacking_clf.score(X_test, y_test)
print('\nStacking Classifier Score on the test set: {:.4f}'.format(score))