Python StackingClassifier.predict_proba示例，mlxtend.classifier.StackingClassifier.predict_proba Python示例

示例#1

0

显示文件

文件： Xudong_Li_工件预测_stacking_adaboost.py 项目： 5663015/my_kaggle

def kfold_train(mode):
    """Run repeated K-fold training and return averaged test-set probabilities.

    For each of ``n_repeats`` differently seeded ``KFold`` shuffles, a fresh
    model of the requested kind is fitted on every training split, scored on
    the held-out split (accuracy and log loss are printed) and then used to
    predict ``x_test``. The test-set predictions of all fitted models are
    averaged.

    Relies on module-level names: ``x_train``, ``labels``, ``x_test``, the
    ``*_model()`` factories and, for ``'stack'``, the meta-classifier ``lr``.
    ``labels`` is reduced with ``argmax`` along axis 1 to obtain class ids
    (presumably one-hot encoded -- confirm against the caller).

    :param mode: one of ``'cat'``, ``'lgb'``, ``'xgb'`` or ``'stack'``.
    :return: array of shape ``(x_test.shape[0], 4)`` with the mean
        ``predict_proba`` output over all ``n_repeats * n_splits`` models.
    :raises ValueError: if ``mode`` is not a supported value.
    """
    # Fail fast instead of hitting an unbound ``model`` inside the loop.
    if mode not in ('cat', 'lgb', 'xgb', 'stack'):
        raise ValueError('unknown mode: %r' % (mode,))

    n_repeats = 10
    n_splits = 5
    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    for i in range(n_repeats):
        print(str(i + 1) + ' th kflod' + '*' * 50)
        # A different seed per repeat yields a different shuffle each time.
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            train_target = np.argmax(train_y, 1)
            valid_target = np.argmax(test_y, 1)
            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(train_x,
                          train_target,
                          eval_set=(test_x, valid_target))
            elif mode == 'lgb':
                model = lightgbm_model()
                model.fit(train_x,
                          train_target,
                          eval_set=(test_x, valid_target),
                          verbose=False)
            elif mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, train_target, verbose=True)
            else:  # mode == 'stack'
                model = StackingClassifier(classifiers=[
                    xgboost_model(),
                    catboost_model(),
                    lightgbm_model(),
                    adaboost_model()
                ],
                                           use_probas=True,
                                           average_probas=False,
                                           meta_classifier=lr)
                model.fit(train_x, train_target)
            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(valid_target, np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))
            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    # One prediction was accumulated per (repeat, fold) pair.
    prediction = prediction / float(n_repeats * n_splits)
    return prediction

示例#2

0

显示文件

文件： test_stacking_classifier.py 项目： thaolinhnp/Machine_Learning

def test_use_features_in_secondary_sparse_input_predict_proba():
    """Check class-0 probabilities of a stack fitted on CSR input.

    The stack has one random-forest base learner and forwards the original
    features to the logistic-regression meta-classifier
    (``use_features_in_secondary=True``).
    """
    np.random.seed(123)
    base_learner = RandomForestClassifier(n_estimators=10)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    stack = StackingClassifier(classifiers=[base_learner],
                               use_features_in_secondary=True,
                               meta_classifier=meta_learner)
    stack.fit(sparse.csr_matrix(X), y)

    # Only the first three samples are compared against reference values.
    rows = [0, 1, 2]
    sparse_rows = sparse.csr_matrix(X[rows])
    class0_proba = stack.predict_proba(sparse_rows)[:, 0]
    reference = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(class0_proba, reference, decimal=3)

示例#3

0

显示文件

文件： test_stacking_classifier.py 项目： lw3259111/mlxtend

def test_use_features_in_secondary_predict_proba():
    """Check class-0 probabilities when raw features reach the meta-classifier.

    A random forest and a Gaussian naive Bayes model are stacked under a
    logistic-regression meta-classifier with
    ``use_features_in_secondary=True``; the first three samples' class-0
    probabilities are compared to reference values at 3 decimals.
    """
    np.random.seed(123)
    meta_learner = LogisticRegression()
    base_learners = [RandomForestClassifier(), GaussianNB()]
    stack = StackingClassifier(classifiers=base_learners,
                               use_features_in_secondary=True,
                               meta_classifier=meta_learner)
    stack.fit(X, y)

    rows = [0, 1, 2]
    class0_proba = stack.predict_proba(X[rows])[:, 0]
    reference = np.array([0.911, 0.829, 0.885])
    np.testing.assert_almost_equal(class0_proba, reference, decimal=3)

示例#4

0

显示文件

文件： test_stacking_classifier.py 项目： NextNight/mlxtend

def test_use_features_in_secondary_predict_proba():
    """Verify ``predict_proba`` of a two-learner stack that reuses input features.

    The stack (random forest + Gaussian NB, logistic-regression meta model,
    ``use_features_in_secondary=True``) is fitted on the module-level
    ``X``/``y`` and its class-0 probabilities for samples 0..2 must match the
    expected values to 3 decimal places.
    """
    np.random.seed(123)
    forest = RandomForestClassifier()
    naive_bayes = GaussianNB()
    logreg = LogisticRegression()
    ensemble = StackingClassifier(classifiers=[forest, naive_bayes],
                                  use_features_in_secondary=True,
                                  meta_classifier=logreg)
    ensemble.fit(X, y)

    sample_ids = [0, 1, 2]
    probabilities = ensemble.predict_proba(X[sample_ids])
    observed = probabilities[:, 0]
    np.testing.assert_almost_equal(observed,
                                   np.array([0.911, 0.829, 0.885]),
                                   3)

示例#5

0

显示文件

    def model_test(self,model,best_params):
        """Fit a stacking classifier on the training set and return its test AUC.

        The stack uses KNN, RFC and GNB base learners (built through
        ``self.model_init``) with ``self.model_init(model)`` as the
        meta-classifier. ``best_params`` is applied via ``set_params`` before
        fitting on ``self.train`` / ``self.train_label['label']``.

        :param model: model key passed to ``self.model_init`` for the
            meta-classifier; compared case-insensitively against
            'LR', 'RFC' and 'XGB' to decide which diagnostics to print.
        :param best_params: dict of parameters forwarded to
            ``StackingClassifier.set_params``.
        :return: ROC AUC of the class-1 probabilities on ``self.test``.
        """
        # Single-argument print(...) calls behave identically on Python 2 and 3.
        print('Model Test')
        print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        lr = self.model_init(model)
        clf1 = self.model_init('KNN')
        clf2 = self.model_init('RFC')
        clf3 = self.model_init('GNB')
        sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],meta_classifier=lr)

        sclf.set_params(**best_params)

        train_data = self.train.values.copy()
        train_label = self.train_label['label'].values.copy()

        sclf.fit(train_data, train_label)

        model_key = model.upper()
        if model_key=='LR':
            # Fixed: the original read ``clf.coef_`` although no ``clf`` exists
            # in this scope (NameError); ``sclf`` is the fitted estimator.
            # NOTE(review): confirm the stacking classifier actually exposes
            # ``coef_`` -- it may live on the fitted meta-classifier instead.
            coef=sclf.coef_.reshape(sclf.coef_.shape[1])
            ind=coef.argsort()
            # Names of the 30 columns with the largest coefficients.
            att=self.train.columns[ind[-30:]].tolist()
            print(att)
        elif model_key in ('RFC', 'XGB'):
            # Both tree-based branches did exactly the same thing.
            imp=sclf.feature_importances_
            print(imp)
            ind=imp.argsort()
            # Names of the 30 columns with the largest importances.
            att=self.train.columns[ind[-30:]].tolist()
            print(att)

        test_data = self.test.values.copy()
        test_label = self.test_label['label'].values.copy()
        test_label = test_label.reshape(test_label.shape[0])

        res_proba=sclf.predict_proba(test_data)
        # AUC is computed from the second probability column (class 1).
        res_auc=roc_auc_score(test_label,res_proba[:,1])

        print('Model: {0} ; Test: {1}'.format(model,res_auc))

        print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        return res_auc

示例#6

0

显示文件

文件： test_stacking_classifier.py 项目： rasbt/mlxtend

def test_use_features_in_secondary_sparse_input_predict_proba():
    """Sparse-input variant: class-0 probabilities must match reference values.

    A single 10-tree random forest is stacked under a liblinear/ovr
    logistic regression that also receives the original features. Both
    fitting and prediction use ``scipy.sparse.csr_matrix`` inputs.
    """
    np.random.seed(123)
    logreg = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    ensemble = StackingClassifier(classifiers=[forest],
                                  use_features_in_secondary=True,
                                  meta_classifier=logreg)

    train_matrix = sparse.csr_matrix(X)
    ensemble.fit(train_matrix, y)

    sample_ids = [0, 1, 2]
    query_matrix = sparse.csr_matrix(X[sample_ids])
    observed = ensemble.predict_proba(query_matrix)[:, 0]
    np.testing.assert_almost_equal(observed,
                                   np.array([0.910, 0.829, 0.882]),
                                   3)

示例#7

0

显示文件

文件： test_stacking_classifier.py 项目： thaolinhnp/Machine_Learning

def test_use_features_in_secondary_predict_proba():
    """Check class-0 probabilities of a seeded two-learner stack on iris data.

    Data comes from ``iris_data()``. The base learners are a 10-tree random
    forest (``random_state=1``) and Gaussian NB; the meta-classifier is a
    liblinear/ovr logistic regression (``random_state=1``) that also sees
    the original features.
    """
    np.random.seed(123)
    features, targets = iris_data()
    base_learners = [
        RandomForestClassifier(n_estimators=10, random_state=1),
        GaussianNB(),
    ]
    meta_learner = LogisticRegression(solver='liblinear',
                                      multi_class='ovr',
                                      random_state=1)
    stack = StackingClassifier(classifiers=base_learners,
                               use_features_in_secondary=True,
                               meta_classifier=meta_learner)
    stack.fit(features, targets)

    rows = [0, 1, 2]
    class0_proba = stack.predict_proba(features[rows])[:, 0]
    reference = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(class0_proba, reference, decimal=3)

示例#8

0

显示文件

文件： test_stacking_classifier.py 项目： rasbt/mlxtend

def test_use_features_in_secondary_predict_proba():
    """Verify ``predict_proba`` for a stack that forwards input features.

    Fits random forest + Gaussian NB under a logistic-regression meta model
    on the data returned by ``iris_data()`` and compares the class-0
    probabilities of samples 0..2 with expected values (3 decimals).
    """
    np.random.seed(123)
    data, target = iris_data()

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    naive_bayes = GaussianNB()
    logreg = LogisticRegression(solver='liblinear',
                                multi_class='ovr',
                                random_state=1)
    ensemble = StackingClassifier(classifiers=[forest, naive_bayes],
                                  use_features_in_secondary=True,
                                  meta_classifier=logreg)
    ensemble.fit(data, target)

    sample_ids = [0, 1, 2]
    probabilities = ensemble.predict_proba(data[sample_ids])
    observed = probabilities[:, 0]
    np.testing.assert_almost_equal(observed,
                                   np.array([0.916, 0.828, 0.889]),
                                   3)

示例#9

0

显示文件

文件： voting_pre.py 项目： SunJackson/card_risk

 def stacking_model(self, X_train, X_test, y_train, bst_xgb, bst_lgb):
     """Stack a tuned base model under a logistic-regression meta-classifier.

     The original note describes stacking the two best models (lgb and xgb)
     with a simple LR meta-classifier to refine the predictions of already
     tuned models. As written, only ``bst_lgb`` is passed as a base
     classifier; ``bst_xgb`` is accepted but not used.

     The fitted stack is persisted with ``joblib.dump`` to
     ``./models/train_model_<self.num>.m``.

     :param X_train: training features
     :param X_test: test features
     :param y_train: training labels
     :param bst_xgb: tuned xgb model (unused here)
     :param bst_lgb: tuned lgb model, used as the base classifier
     :return: second column of ``predict_proba`` for ``X_test``
     """
     meta_learner = linear_model.LogisticRegression(random_state=7)
     stack = StackingClassifier(
         classifiers=[bst_lgb],
         use_probas=True,
         average_probas=False,
         meta_classifier=meta_learner)
     stack.fit(X_train, y_train)
     # Predict first, then persist the fitted stack (same order as before).
     positive_proba = stack.predict_proba(X_test)[:, 1]
     model_path = "./models/train_model_{}.m".format(self.num)
     joblib.dump(stack, model_path)
     return positive_proba

示例#10

0

显示文件

文件： voting_pre.py 项目： SunJackson/card_risk

 def stacking_model2(self, X_train, X_test, y_train, bst_xgb, bst_forest,
                     bst_gradient, bst_lgb):
     """Combine four algorithms in one stack and predict on the test set.

     The four ``bst_*`` arguments are passed directly as base classifiers
     (the original comments call them the "best parameters" of each
     algorithm); a logistic regression with ``random_state=7`` is the
     meta-classifier and is fed the base learners' probabilities.

     :param X_train: training set
     :param X_test: test set
     :param y_train: training labels
     :param bst_xgb: tuned xgb model
     :param bst_forest: tuned forest model
     :param bst_gradient: tuned gradient model
     :param bst_lgb: tuned lgb model
     :return: second column of ``predict_proba`` for ``X_test``
     """
     base_learners = [bst_xgb, bst_forest, bst_gradient, bst_lgb]
     meta_learner = linear_model.LogisticRegression(random_state=7)
     stack = StackingClassifier(classifiers=base_learners,
                                use_probas=True,
                                average_probas=False,
                                meta_classifier=meta_learner)
     stack.fit(X_train, y_train)
     return stack.predict_proba(X_test)[:, 1]

示例#11

0

显示文件

文件： stacking_fit.py 项目： ghbj/Prediction-of-quality-coincidence-rate-of-typical-workpieces-in-discrete-manufacturing-processes

# Script fragment: fit a two-learner stack and write per-group mean class
# probabilities to a CSV file. `X` and `data_train` / `data_test` are defined
# earlier in the file (not shown here).

# Training target: column 10 of the training frame, skipping the first row.
y = data_train.iloc[1:, 10]
y = np.array(y)
# Test features: columns 5..9 of the test frame, skipping the first row.
X_test = data_test.iloc[1:, 5:10]
# print(X_test)
X_test = np.array(X_test)

# clf1 = cbt.CatBoostClassifier(iterations=1000,task_type='GPU',loss_function='MultiClass')
# clf2 = lgb.LGBMClassifier(num_leaves=31,bagging_fraction=0.5,feature_fraction=0.8,max_depth=10,n_estimators=200)
clf2 = RandomForestClassifier()
clf3 = xgb.XGBClassifier(n_estimators=500)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf2, clf3], meta_classifier=lr)

sclf.fit(X, y)

y_predict = sclf.predict_proba(X_test)
# print(y_predict[0])
# Split the predictions into consecutive chunks of 50 rows.
y_predcit_50 = [y_predict[i:i + 50]
                for i in range(0, y_predict.shape[0], 50)]  # 120 groups of 50 each (per the original author)

A = []
# Sum the columns of each group and divide by 50.
# NOTE(review): a final chunk shorter than 50 rows would still be divided by 50.
for i in y_predcit_50:
    a = np.sum(i, axis=0) / 50
    A.append(a)
A = pd.DataFrame(A)
A.columns = ['Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio']
A.to_csv('submission_stacking.csv', index=True, index_label='Group')

# B = pd.DataFrame()
# B.insert(0, 'Excellent ratio', A['0'])

示例#12

0

显示文件

文件： Xudong_Li_B榜0.68114746.py 项目： 5663015/my_kaggle

def kfold_train(mode):
    """Run repeated K-fold training, print an offline score, return test probabilities.

    For each of ``n`` differently seeded ``KFold`` shuffles, a fresh model of
    the requested kind is fitted per training split, scored on the held-out
    split (accuracy and log loss are printed) and used to predict ``x_test``.
    Out-of-fold probabilities are written next to the matching rows of
    ``train_data`` and later aggregated per ``group_<k>`` column to print an
    offline score ``1 / (1 + 10 * MAE)`` (mean and standard deviation).

    Relies on module-level names: ``x_train``, ``labels``, ``x_test``,
    ``train_data``, the ``*_model()`` factories and, for ``'stack'``, the
    meta-classifier ``lr``. ``train_data`` must provide the ``label_*`` and
    ``group_0`` .. ``group_99`` columns used by the offline evaluation.

    :param mode: one of ``'cat'``, ``'lgb'``, ``'xgb'`` or ``'stack'``.
    :return: array of shape ``(x_test.shape[0], 4)`` with the mean
        ``predict_proba`` output over all ``n * n_splits`` fitted models.
    :raises ValueError: if ``mode`` is not a supported value.
    """
    # Fail fast instead of hitting an unbound ``model`` inside the loop.
    if mode not in ('cat', 'lgb', 'xgb', 'stack'):
        raise ValueError('unknown mode: %r' % (mode,))

    prob_cols = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
    label_cols = ['label_Excellent', 'label_Good', 'label_Pass', 'label_Fail']

    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    result_list = []
    n = 10
    n_splits = 5
    for i in range(n):
        print(str(i + 1) + ' th kflod' + '*' * 50)
        result = []
        # A different seed per repeat yields a different shuffle each time.
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            train_target = np.argmax(train_y, 1)
            valid_target = np.argmax(test_y, 1)
            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(train_x,
                          train_target,
                          eval_set=(test_x, valid_target))
            elif mode == 'lgb':
                model = lightgbm_model()
                model.fit(train_x,
                          train_target,
                          eval_set=(test_x, valid_target),
                          verbose=False)
            elif mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, train_target, verbose=True)
            else:  # mode == 'stack'
                model = StackingClassifier(classifiers=[
                    catboost_model(),
                    lightgbm_model(),
                    xgboost_model(),
                    adaboost_model()
                ],
                                           use_probas=True,
                                           average_probas=False,
                                           meta_classifier=lr)
                model.fit(train_x, train_target)
            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(valid_target, np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))
            # Keep out-of-fold probabilities for the offline validation below.
            X_valid = train_data.iloc[test_index, :].copy()
            X_valid.loc[:, prob_cols] = pred
            result.append(X_valid)
            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
        result_list.append(pd.concat(result))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    # One prediction was accumulated per (repeat, fold) pair.
    prediction = prediction / float(n_splits * n)
    # Offline evaluation
    scores = []
    for group in range(100):
        for oof_frame in result_list:
            # Columns are selected with a list: selecting several columns
            # from a groupby with a bare tuple is rejected by newer pandas.
            temp = oof_frame.groupby(
                ['group_%s' % group],
                as_index=False)[prob_cols + label_cols].mean()
            a = np.abs(temp.loc[:, prob_cols].values -
                       temp.loc[:, label_cols].values).mean()
            scores.append(1 / (1 + 10 * a))
    print("线下mae评估：", np.mean(scores), np.std(scores))

    return prediction

示例#13

0

显示文件

文件： script_11.py 项目： inigooalonso/Project0

# Script fragment (Python 2 syntax): stack previously built classifiers for
# devices without events, report the validation log loss, then write the
# combined predictions to a gzip-compressed CSV. The gbm*/clfNN*/clfLR models,
# the X_*/y_* arrays, `predictor_ev`, `le` and `test_dev` are defined outside
# this fragment.
#clfKNN = KNeighborsClassifier(n_neighbors=5)
#clfKNN.fit(X_train_noev, y_train_noev)
#print log_loss(y_cv_noev, clfKNN.predict_proba(X_cv_noev))
#
##NB
#clfNB = MultinomialNB(alpha=1.0)
#clfNB.fit(X_train_noev, y_train_noev)
#print log_loss(y_cv_noev, clfNB.predict_proba(X_cv_noev))

#Combine results of classifiers
print "Stacking classifiers for devices with no events"
clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
meta = LogisticRegression()
# Base learners and meta-classifier are passed positionally; the meta model is
# trained on the base learners' probabilities (use_probas=True).
stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)

stack.fit(X_train_noev, y_train_noev)
# Validation log loss of the stacked model.
print log_loss(y_cv_noev, stack.predict_proba(X_cv_noev))
y_pred_noev = stack.predict_proba(X_test_noev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
#    return y_pred_noev

y_pred_ev = predictor_ev()
#y_pred_noev = predictor_noev()

# Write results
# NOTE(review): np.hstack expects ONE sequence of arrays; passing two
# positional arrays looks like a bug. Since the columns are le.classes_, the
# two prediction sets presumably need row-wise stacking, e.g.
# np.vstack((y_pred_ev, y_pred_noev)) -- confirm this, and the row order
# relative to test_dev, before changing.
result = pd.DataFrame(np.hstack(y_pred_ev, y_pred_noev), columns=le.classes_)
result["device_id"] = test_dev
result = result.set_index("device_id")
result.to_csv('stacking_1.gz', index=True,
              index_label='device_id', compression="gzip")

示例#14

0

显示文件

def main():
    # excel_file= 'training.xlsx'
    # df=pd.DataFrame(pd.read_excel(excel_file))
    # excel_file_test= 'test1.xlsx'
    # df1=pd.DataFrame(pd.read_excel(excel_file_test))

    # a=[]
    # b=[]
    # a1=[]
    # b1=[]
    # for url in df['url']:
    #     a.append(url)

    # for output in df['phishing']:
    #     b.append(output)

    # for url1 in df1['url']:
    #     a1.append(url1)

    # for output in df1['result']:
    #     b1.append(output)

    excel_file = 'training.xlsx'
    df1 = pd.DataFrame(pd.read_excel(excel_file))
    length = (len(df1) / 100)
    length = round(length * 80)

    df = pd.DataFrame(df1[0:length])
    df1 = pd.DataFrame(df1[length:])

    a = []
    b = []
    a1 = []
    b1 = []
    for url in df['url']:
        a.append(url)

    for output in df['phishing']:
        b.append(output)

    for url1 in df1['url']:
        a1.append(url1)

    for output in df1['phishing']:
        b1.append(output)

    c = []
    d = []
    for url1, output1 in zip(a, b):
        url = url1
        output = output1
        c.append(extract_feature_train(url, output))

    for url1, output1 in zip(a1, b1):
        url = url1
        output = output1
        d.append(extract_feature_test(url, output))

    df = pd.DataFrame(c,
                      columns=[
                          'r', 'length_of_url', 'http_has', 'suspicious_char',
                          'prefix_suffix', 'dots', 'slash', 'phis_term',
                          'sub_domain', 'ip_contain'
                      ])

    df.to_csv('id3.csv', sep=',', encoding='utf-8')

    df_test = pd.DataFrame(d,
                           columns=[
                               'r', 'length_of_url', 'http_has',
                               'suspicious_char', 'prefix_suffix', 'dots',
                               'slash', 'phis_term', 'sub_domain', 'ip_contain'
                           ])

    df_test.to_csv('feature_test.csv', sep=',', encoding='utf-8')

    data_train = importdata_train()
    data_test = importdata_test()
    X, Y = splitdataset(data_train)
    X1, Y1 = splitdataset(data_test)
    clf = svm.SVC(kernel='linear')
    clf.fit(X, Y)

    model = XGBClassifier(max_depth=5,
                          learning_rate=0.01,
                          n_estimators=100,
                          gamma=0,
                          min_child_weight=1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=0.005)
    model.fit(X, Y)

    gnb = GaussianNB()
    gnb.fit(X, Y)

    #STACKING

    df = pd.read_csv("stack.csv")
    df1 = pd.read_csv("feature_test_stack1.csv")
    target = 'r'

    X_train = df.loc[:, df.columns != target]
    Y_train = df.loc[:, df.columns == target]
    X_test = df1.loc[:, df1.columns != target]
    Y_test = df1.loc[:, df1.columns == target]

    #Stacking Classifier
    # knn1 = KNeighborsClassifier()
    # log_reg1 = LogisticRegression()
    # svm1 = SVC(probability=True)
    print(
        "___________________________Stacking__________________________________________"
    )
    clf = svm.SVC(kernel='linear')
    rf2 = RandomForestClassifier(bootstrap=True,
                                 max_depth=70,
                                 max_features='auto',
                                 min_samples_leaf=4,
                                 min_samples_split=10,
                                 n_estimators=400)
    xgb = XGBClassifier()
    classifiers = [clf, xgb]
    sc = StackingClassifier(classifiers, meta_classifier=rf2)
    sc.fit(X_train, Y_train)

    print("_____________Report___________________")
    y_pred4 = sc.predict(X_test)  #prediction or testing
    acc4 = cal_accuracy(Y_test, y_pred4)

    #confusion Matrix
    matrix = confusion_matrix(Y_test, y_pred4)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    #fig = pyplot.gcf()
    fig.canvas.set_window_title('Stacking')
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    #ROC_AUC curve
    probs = sc.predict_proba(X_test)
    probs = probs[:, 1]
    auc = roc_auc_score(Y_test, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y_test)
    fpr, tpr, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('Stacking')
    title = 'Stacking'
    plot_roc_curve(fpr, tpr)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = sc.predict(X_test)
    print(classification_report(Y_test, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer = ClassificationReport(sc, classes=classes, support=True)
    visualizer.fit(X_train, Y_train)
    visualizer.score(X_test, Y_test)
    #fig.canvas.set_window_title('Stacking')
    g = visualizer.poof()

    print(
        "___________________________XGBOOST__________________________________________"
    )
    model = XGBClassifier(max_depth=5,
                          learning_rate=0.01,
                          n_estimators=100,
                          gamma=0,
                          min_child_weight=1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=0.005)
    model.fit(X, Y)
    y_pred1 = model.predict(X1)
    print("_____________Report___________________")
    acc1 = cal_accuracy(Y1, y_pred1)
    # print("_____________user input ___________________")

    #confusion Matrix
    import matplotlib.pyplot as plt1
    matrix = confusion_matrix(Y1, y_pred1)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt1.xticks(tick_marks, class_names)
    plt1.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt1.tight_layout()
    plt1.title('Confusion matrix', y=1.1)
    plt1.ylabel('Actual label')
    plt1.xlabel('Predicted label')
    fig.canvas.set_window_title('XGBoost')
    plt.show()

    #ROC_AUC curve
    probs = model.predict_proba(X1)
    probs = probs[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y1)
    fpr1, tpr1, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('XGBoost')
    plot_roc_curve(fpr1, tpr1)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = model.predict(X1)
    print(classification_report(Y1, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer1 = ClassificationReport(model, classes=classes, support=True)
    visualizer1.fit(X, Y)
    visualizer1.score(X1, Y1)
    #fig.canvas.set_window_title('XGBoost')
    g = visualizer1.poof()

    print(
        "___________________________SVM__________________________________________"
    )
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(X, Y)
    print("_____________Report___________________")
    y_pred = clf.predict(X1)
    #print(cal_accuracy(Y1, y_pred))
    acc2 = cal_accuracy(Y1, y_pred)
    #print("_____________user input ___________________")

    #confusion Matrix
    matrix = confusion_matrix(Y1, y_pred)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    fig.canvas.set_window_title('SVM')
    plt.show()

    #ROC_AUC curve
    probs = clf.predict_proba(X1)
    probs = probs[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y1)
    fpr, tpr, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('SVM')
    plot_roc_curve(fpr, tpr)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = clf.predict(X1)
    print(classification_report(Y1, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer = ClassificationReport(clf, classes=classes, support=True)
    visualizer.fit(X, Y)
    visualizer.score(X1, Y1)
    #fig.canvas.set_window_title('SVM')
    g = visualizer.poof()

    print(
        "___________________________Naive Bayes__________________________________________"
    )
    gnb = GaussianNB()
    gnb.fit(X, Y)
    print("_____________Report___________________")
    y_pred = gnb.predict(X1)
    #print(cal_accuracy(Y1, y_pred))
    acc3 = cal_accuracy(Y1, y_pred)
    #print("_____________user input ___________________")

    #confusion Matrix
    matrix = confusion_matrix(Y1, y_pred)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    fig.canvas.set_window_title('NB')
    plt.show()

    #ROC_AUC curve
    probs = gnb.predict_proba(X1)
    probs = probs[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y1)
    fpr, tpr, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('NB')
    plot_roc_curve(fpr, tpr)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = gnb.predict(X1)
    print(classification_report(Y1, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer = ClassificationReport(gnb, classes=classes, support=True)
    visualizer.fit(X, Y)
    visualizer.score(X1, Y1)
    #fig.canvas.set_window_title('NB')
    g = visualizer.poof()

    labels = [' XGboost', 'SVM', 'NB', 'Stacking']
    #sizes = [5, neg_per, neu_per]
    sizes = [acc1, acc2, acc3, acc4]
    index = np.arange(len(labels))
    plt.bar(index, sizes)
    plt.xlabel('Algorithm', fontsize=20)
    plt.ylabel('Accuracy', fontsize=20)
    plt.xticks(index, labels, fontsize=10, rotation=0)
    plt.title('comparative study')
    plt.show()

    #GUI
    class MainFrame(wx.Frame):
        def __init__(self, parent):
            wx.Frame.__init__(self,
                              parent,
                              id=wx.ID_ANY,
                              title=wx.EmptyString,
                              pos=wx.DefaultPosition,
                              size=wx.Size(500, 300),
                              style=wx.DEFAULT_FRAME_STYLE | wx.TAB_TRAVERSAL)

            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

            bSizer3 = wx.BoxSizer(wx.VERTICAL)

            self.m_staticText2 = wx.StaticText(self, wx.ID_ANY, u"Enter URL",
                                               wx.DefaultPosition,
                                               wx.DefaultSize, 0)
            self.m_staticText2.Wrap(-1)
            bSizer3.Add(self.m_staticText2, 0, wx.ALL, 5)

            self.text1 = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString,
                                     wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.text1, 0, wx.ALL | wx.EXPAND, 5)

            self.predictButton = wx.Button(self, wx.ID_ANY, u"Predict_XGBOOST",
                                           wx.DefaultPosition, wx.DefaultSize,
                                           0)
            bSizer3.Add(self.predictButton, 0, wx.ALL | wx.EXPAND, 5)

            self.m_button2 = wx.Button(self, wx.ID_ANY, u"Predict_SVM",
                                       wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.m_button2, 0, wx.ALL | wx.EXPAND, 5)

            self.m_button3 = wx.Button(self, wx.ID_ANY, u"Predict_NB",
                                       wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.m_button3, 0, wx.ALL | wx.EXPAND, 5)

            self.m_button4 = wx.Button(self, wx.ID_ANY, u"Predict_STACKING",
                                       wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.m_button4, 0, wx.ALL | wx.EXPAND, 5)

            # self.label1 = wx.StaticText( self, wx.ID_ANY, u"Result", wx.DefaultPosition, wx.DefaultSize, 0 )
            # self.label1.Wrap( -1 )
            # bSizer3.Add( self.label1, 0, wx.ALL|wx.ALIGN_CENTER_HORIZONTAL, 5 )

            # self.text2 = wx.TextCtrl( self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.DefaultSize, 0 )
            # bSizer3.Add( self.text2, 0, wx.RIGHT|wx.EXPAND, 5 )

            self.SetSizer(bSizer3)
            self.Layout()

            self.Centre(wx.BOTH)

            # Connect Events
            self.predictButton.Bind(wx.EVT_BUTTON, self.click)
            self.m_button2.Bind(wx.EVT_BUTTON, self.svm)
            self.m_button3.Bind(wx.EVT_BUTTON, self.nb)
            self.m_button4.Bind(wx.EVT_BUTTON, self.stacking)

        def __del__(self):
            pass

        # Virtual event handlers, overide them in your derived class

        #XGBOOST
        def click(self, event):
            """Classify the URL in ``self.text1`` with ``model`` and pop up the verdict.

            A prediction of ``'no'`` counts as legitimate: ``'Legitimate'`` is
            printed, a "LEGITIMATE" popup is shown and the URL is opened with
            ``webbrowser``. Any other prediction shows a "PHISING" popup.
            Any exception raised on the way is reported only by printing
            ``'error'``.
            """
            try:
                url = self.text1.GetValue()
                features = np.array([extract_feature_usertest(url)])
                verdict = model.predict(features.reshape(1, -1))[0]
                is_legitimate = (verdict == 'no')

                class MyDialog1(wx.Dialog):
                    """Popup showing one static text inside a labelled box."""

                    def __init__(self, parent, box_label, message, size):
                        wx.Dialog.__init__(self,
                                           parent,
                                           id=wx.ID_ANY,
                                           title=wx.EmptyString,
                                           pos=wx.DefaultPosition,
                                           size=size,
                                           style=wx.DEFAULT_DIALOG_STYLE)
                        self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                        box_sizer = wx.StaticBoxSizer(
                            wx.StaticBox(self, wx.ID_ANY, box_label),
                            wx.VERTICAL)
                        self.m_staticText1 = wx.StaticText(
                            box_sizer.GetStaticBox(), wx.ID_ANY, message,
                            wx.DefaultPosition, wx.DefaultSize, 0)
                        self.m_staticText1.Wrap(-1)
                        box_sizer.Add(self.m_staticText1, 0,
                                      wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)

                        self.SetSizer(box_sizer)
                        self.Layout()
                        self.Centre(wx.BOTH)

                if is_legitimate:
                    print('Legitimate')
                    popup_app = wx.App(False)
                    dialog = MyDialog1(None, u"POP-UP", u"LEGITIMATE",
                                       wx.Size(159, 114))
                    dialog.Show(True)
                    # The page is opened before the popup's event loop starts.
                    webbrowser.open(url)
                    popup_app.MainLoop()
                else:
                    popup_app = wx.App(False)
                    dialog = MyDialog1(None, u"Error", u"PHISING",
                                       wx.Size(200, 150))
                    dialog.Show(True)
                    popup_app.MainLoop()
            except Exception:
                print('error')

    #SVM

        def svm(self, event):
            """Train a linear SVM on ``X``/``Y`` and classify the URL in ``self.text1``.

            The prediction comes from the SVM fitted in this handler. A
            prediction of ``'no'`` counts as legitimate: ``'Legitimate'`` is
            printed, a "LEGITIMATE" popup is shown and the URL is opened once
            with ``webbrowser``. Any other prediction shows a "PHISING" popup.
            Exceptions raised after training are reported only by printing
            ``'error'``.
            """
            clf = svm.SVC(kernel='linear', probability=True)
            clf.fit(X, Y)

            class MyDialog1(wx.Dialog):
                """Popup showing one static text inside a labelled box."""

                def __init__(self, parent, box_label, message):
                    wx.Dialog.__init__(self,
                                       parent,
                                       id=wx.ID_ANY,
                                       title=wx.EmptyString,
                                       pos=wx.DefaultPosition,
                                       size=wx.Size(159, 114),
                                       style=wx.DEFAULT_DIALOG_STYLE)
                    self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                    box_sizer = wx.StaticBoxSizer(
                        wx.StaticBox(self, wx.ID_ANY, box_label), wx.VERTICAL)
                    self.m_staticText1 = wx.StaticText(
                        box_sizer.GetStaticBox(), wx.ID_ANY, message,
                        wx.DefaultPosition, wx.DefaultSize, 0)
                    self.m_staticText1.Wrap(-1)
                    box_sizer.Add(self.m_staticText1, 0,
                                  wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)

                    self.SetSizer(box_sizer)
                    self.Layout()
                    self.Centre(wx.BOTH)

            try:
                url = self.text1.GetValue()
                features = np.array([extract_feature_usertest(url)])
                # Use the SVM trained above; the original queried the global
                # ``model`` here, so the fitted SVM was never consulted.
                verdict = clf.predict(features.reshape(1, -1))[0]

                popup_app = wx.App(False)
                if verdict == 'no':
                    print('Legitimate')
                    dialog = MyDialog1(None, u"POP-UP", u"LEGITIMATE")
                    dialog.Show(True)
                    # Open the page exactly once, before the popup loop runs.
                    webbrowser.open(url)
                else:
                    dialog = MyDialog1(None, u"Error", u"PHISING")
                    dialog.Show(True)
                popup_app.MainLoop()
            except Exception:
                print('error')

    #NAIVE BAYES

        def nb(self, event):
            """Classify the URL in ``self.text1`` with ``gnb`` and pop up the verdict.

            A prediction of ``'no'`` counts as legitimate: ``'Legitimate'`` is
            printed, a "LEGITIMATE" popup is shown and the URL is opened once
            with ``webbrowser``. Any other prediction shows a "PHISING" popup.
            Any exception raised on the way is reported only by printing
            ``'error'``.
            """

            class MyDialog1(wx.Dialog):
                """Popup showing one static text inside a labelled box."""

                def __init__(self, parent, box_label, message):
                    wx.Dialog.__init__(self,
                                       parent,
                                       id=wx.ID_ANY,
                                       title=wx.EmptyString,
                                       pos=wx.DefaultPosition,
                                       size=wx.Size(159, 114),
                                       style=wx.DEFAULT_DIALOG_STYLE)
                    self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                    box_sizer = wx.StaticBoxSizer(
                        wx.StaticBox(self, wx.ID_ANY, box_label), wx.VERTICAL)
                    self.m_staticText1 = wx.StaticText(
                        box_sizer.GetStaticBox(), wx.ID_ANY, message,
                        wx.DefaultPosition, wx.DefaultSize, 0)
                    self.m_staticText1.Wrap(-1)
                    box_sizer.Add(self.m_staticText1, 0,
                                  wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)

                    self.SetSizer(box_sizer)
                    self.Layout()
                    self.Centre(wx.BOTH)

            try:
                url = self.text1.GetValue()
                features = np.array([extract_feature_usertest(url)])
                verdict = gnb.predict(features.reshape(1, -1))[0]

                popup_app = wx.App(False)
                if verdict == 'no':
                    print('Legitimate')
                    dialog = MyDialog1(None, u"POP-UP", u"LEGITIMATE")
                    dialog.Show(True)
                    # Open the page exactly once; the original opened it a
                    # second time after the popup loop had returned.
                    webbrowser.open(url)
                else:
                    dialog = MyDialog1(None, u"Error", u"PHISING")
                    dialog.Show(True)
                popup_app.MainLoop()
            except Exception:
                print('error')

    #STACKING

        def stacking(self, event):
            """Train a stacked classifier from ``stack.csv`` and classify the URL in ``self.text1``.

            The first level is a linear SVM plus an XGBoost classifier, the
            meta level a random forest. Column ``'r'`` of ``stack.csv`` is the
            target; every other column is a feature. A prediction of ``0``
            counts as legitimate: ``'Legitimate'`` is printed, a "LEGITIMATE"
            popup is shown and the URL is opened with ``webbrowser``. Any
            other prediction shows a "PHISING" popup.

            Side effect: the entered URL is stored in the module-level global
            ``url4``.
            """
            df = pd.read_csv("stack.csv")
            target = 'r'

            X_train = df.loc[:, df.columns != target]
            Y_train = df.loc[:, df.columns == target]
            global url4
            url4 = self.text1.GetValue()

            xgb = XGBClassifier()
            clf = svm.SVC(kernel='linear')
            classifiers = [clf, xgb]
            # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
            # no longer accepted by current scikit-learn releases.
            rf2 = RandomForestClassifier(bootstrap=True,
                                         max_depth=70,
                                         max_features='sqrt',
                                         min_samples_leaf=4,
                                         min_samples_split=10,
                                         n_estimators=400)
            sc = StackingClassifier(classifiers, meta_classifier=rf2)
            sc.fit(X_train, Y_train)

            # Build a one-row frame with exactly the training feature columns.
            url_features = extract_feature_usertest_stack(url4)
            output_data = pd.DataFrame([url_features], columns=X_train.columns)
            userpredict4 = sc.predict(output_data)

            class MyDialog1(wx.Dialog):
                """Popup showing one static text inside a labelled box."""

                def __init__(self, parent, box_label, message):
                    wx.Dialog.__init__(self,
                                       parent,
                                       id=wx.ID_ANY,
                                       title=wx.EmptyString,
                                       pos=wx.DefaultPosition,
                                       size=wx.Size(159, 114),
                                       style=wx.DEFAULT_DIALOG_STYLE)
                    self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                    box_sizer = wx.StaticBoxSizer(
                        wx.StaticBox(self, wx.ID_ANY, box_label), wx.VERTICAL)
                    self.m_staticText1 = wx.StaticText(
                        box_sizer.GetStaticBox(), wx.ID_ANY, message,
                        wx.DefaultPosition, wx.DefaultSize, 0)
                    self.m_staticText1.Wrap(-1)
                    box_sizer.Add(self.m_staticText1, 0,
                                  wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)

                    self.SetSizer(box_sizer)
                    self.Layout()
                    self.Centre(wx.BOTH)

            popup_app = wx.App(False)
            if userpredict4[0] == 0:
                print('Legitimate')
                dialog = MyDialog1(None, u"POP-UP", u"LEGITIMATE")
                dialog.Show(True)
                # The original passed ``url`` here, a name this method never
                # binds; the URL entered by the user is held in ``url4``.
                webbrowser.open(url4)
            else:
                dialog = MyDialog1(None, u"Error", u"PHISING")
                dialog.Show(True)
            popup_app.MainLoop()
    # Start the GUI: create the wx application object, build and show the
    # main window, then run the application's main event loop.
    app1 = wx.App(False)

    frame = MainFrame(None)
    frame.Show(True)
    app1.MainLoop()

示例#15

0

显示文件

    'lgbmclassifier__max_depth': max_depth,
    'lgbmclassifier__subsample': subsample,
    #'catboostclassifier__n_estimators':n_estimators,
    #'catboostclassifier__max_depth': max_depth,
    #'randomforestclassifier__n_estimators':[100],
    #'randomforestclassifier__max_depth': [3],
    'meta-logisticregression__C': C
}

# Grid-search ``params`` over the estimator currently bound to ``grid``; the
# search object then replaces it under the same name.
# GridSearchCV accepts no ``early_stopping_rounds`` argument (passing one
# raises TypeError at construction), so none is passed here.
grid = GridSearchCV(estimator=grid,
                    param_grid=params,
                    cv=3,
                    refit=True,
                    verbose=3,
                    n_jobs=n_job,
                    scoring='roc_auc')

print('fitting')
grid.fit(x, y)

joblib.dump(grid, 'export/trend_model.pkl')

# ROC AUC on the training data, from column 1 of the predicted probabilities.
predicted = grid.predict_proba(x)[:, 1]
print('trian roc: ', roc_auc_score(y, predicted))

# Same probability column for the test set, indexed by ``test_ids``.
predicted = pd.Series(grid.predict_proba(test)[:, 1])
predicted.index = test_ids
predicted.to_csv('export/trend_predict_test.csv')

示例#16

0

显示文件

文件： Load-FeatureSelection-ModelTuning.py 项目： aphoti01/EarlyAlert

                        random_state=0,
                        n_estimators=100)
def _confusion_at(proba, threshold):
    """Confusion matrix of ``ytest`` vs. column 1 of ``proba`` cut at ``threshold``."""
    return m.confusion_matrix(ytest,
                              (proba[:, 1] >= threshold).astype('int'))


# Stacking ensemble: ``rf`` at the first level, ``xgc`` as meta-classifier,
# fed with the (non-averaged) first-level probabilities.
sclf = StackingClassifier(
    classifiers=[rf],
    meta_classifier=xgc,
    use_probas=True,
    average_probas=False,
)

# Each model is trained on its own feature matrix.
gbc.fit(xtr, ytrain)
rf.fit(xtr1, ytrain)
ada.fit(xtr2, ytrain)
sclf.fit(xtrain, ytrain)

gbc_pred_proba = gbc.predict_proba(xte)
rf_pred_proba = rf.predict_proba(xte1)
ada_pred_proba = ada.predict_proba(xte2)
sc_pred_proba = sclf.predict_proba(xtest)

# Confusion matrices at the 0.5 cut-off.
gbc_cm = _confusion_at(gbc_pred_proba, 0.5)
rf_cm = _confusion_at(rf_pred_proba, 0.5)
ada_cm = _confusion_at(ada_pred_proba, 0.5)
sc_cm = _confusion_at(sc_pred_proba, 0.5)

# Sweep the cut-off for the gradient-boosting model and print ``myf1`` of
# each resulting confusion matrix.
k = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in k:
    gbc_cm = _confusion_at(gbc_pred_proba, i)
    print(myf1(gbc_cm))

# Threshold grid left in ``k`` for whatever follows this excerpt.
k = [0.65, 0.68, 0.7, 0.72, 0.75]

示例#17

0

显示文件

文件： mlxtendStacking.py 项目： MlGroupsWJ/Customer-Satisfication

# Second XGBoost configuration.
params2.update({
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'learning_rate': 0.02,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.5,
    'n_estimators': 500,
})

# Third XGBoost configuration: shallower trees, more of them.
params3 = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'learning_rate': 0.02,
    'max_depth': 4,
    'subsample': 0.6,
    'colsample_bytree': 0.5,
    'n_estimators': 600,
}

# One first-level classifier per configuration.
clfs = [XGBClassifier(**cfg) for cfg in (params1, params2, params3)]
clf1, clf2, clf3 = clfs

# Meta-classifier.
lrc = linear_model.LogisticRegression(C=0.5, max_iter=300)

# The last column of ``train_data`` is the target; the rest are features.
x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

mlxcf = StackingClassifier(classifiers=clfs,
                           meta_classifier=lrc,
                           use_probas=True,
                           average_probas=True)
mlxcf.fit(x_train, y_train)

# The last probability column is written out as the submission's TARGET.
y_pred = mlxcf.predict_proba(test_data)[:, -1]
submission = pd.DataFrame({"ID": IDlist, "TARGET": y_pred})
submission.to_csv("../Result/mxltendStackingXGB.csv", index=False)

示例#18

0

显示文件

文件： 4_1_Model_fusion.py 项目： blacksevenzqj/zoubo

# Work on the first 100 rows of the iris data only.
iris = load_iris()
X, y = iris.data[:100], iris.target[:100]

# Two first-level pipelines, each a logistic regression on its own columns.
pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())

sclf = StackingClassifier(
    classifiers=[pipe1, pipe2],
    meta_classifier=LogisticRegression(),
)
sclf.fit(X, y)

decision_scores = sclf.decision_function(X)
stacking_auc = roc_auc_score(y, sclf.predict_proba(X)[:, 1])
print("Val auc Score of Stacking: {:f}".format(stacking_auc))

# Diagnostic figures drawn from the decision scores, one row per helper.
fig, axe = plt.subplots(2, 2, figsize=(30, 20))
rlb.ComprehensiveIndicatorFigure(y, decision_scores, axe[0], 1)
rlb.ComprehensiveIndicatorSkLibFigure(y, decision_scores, axe[1])

# In[]:
# 5、ROC Curve with decision_function
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import roc_curve, auc
import numpy as np

示例#19

0

显示文件

    def model_evaluation(Train,
                         Valid,
                         Test,
                         comparative,
                         bootstrap=False,
                         n_estimators: int = 200,
                         max_depth: int = 50,
                         oob_score: bool = False,
                         class_weight='balanced_subsample',
                         sampling=None,
                         label='FRAUDE',
                         model='ert'):
        """Cross-validate a classifier and plot threshold and confusion diagnostics.

        ``Train``, ``Valid`` and ``Test`` are concatenated and scored with
        5-fold stratified cross-validation, so every row gets exactly one
        out-of-fold probability, stored at that row's own position. Recall,
        precision and F2 are computed for 200 thresholds in [0.1, 1.0]; the
        threshold with the best F2 is used for the two confusion matrices
        plotted at the end. F2 weights recall more heavily than precision.

        Args:
            Train, Valid, Test: DataFrames containing the ``label`` column
                and an ``id_siniestro`` column.
            comparative: DataFrame with ``id_siniestro`` and ``FRAUDE``
                columns. Its ``id_siniestro`` column is cast to int in place.
            bootstrap, n_estimators, max_depth, oob_score, class_weight:
                forwarded to the tree ensembles.
            sampling: ``None`` for no resampling, ``'ALLKNN'`` for
                ``under_sampling``; any other value is passed to
                ``over_sampling`` as ``model``. Any resampling disables
                ``class_weight``.
            label: name of the target column.
            model: ``'ert'``, ``'gb'``, ``'lxgb'``, or a name starting with
                ``'stacked'`` and ending in ``'_ERT'``, ``'_GB'`` or
                ``'_LXGB'`` (the suffix selects the meta-classifier).

        Returns:
            None.

        Raises:
            ValueError: if ``model`` names no known configuration.
        """
        # Resampling replaces class weighting.
        if sampling is not None:
            class_weight = None

        def build_model(n_rows, n_cols):
            """Return an unfitted estimator for ``model``, sized to one fold."""
            # Sizes scale with the fold; floored at 1 so that a small fold
            # cannot yield an invalid value of 0.
            min_sample_leaf = max(1, round(n_rows * 0.005))
            min_sample_split = min_sample_leaf * 10
            max_features = max(1, round(n_cols / 3))

            if model == 'ert':
                return ensemble.ExtraTreesClassifier(
                    criterion='entropy',
                    bootstrap=bootstrap,
                    min_samples_leaf=min_sample_leaf,
                    min_samples_split=min_sample_split,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    max_features=max_features,
                    oob_score=oob_score,
                    random_state=531,
                    verbose=1,
                    class_weight=class_weight,
                    n_jobs=-1)

            if model == 'gb':
                return ensemble.GradientBoostingClassifier(
                    loss='deviance',
                    learning_rate=0.01,
                    n_estimators=200,
                    subsample=1.0,
                    criterion='friedman_mse',
                    min_samples_split=min_sample_split,
                    min_samples_leaf=min_sample_leaf,
                    min_weight_fraction_leaf=0.,
                    max_depth=max_depth,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    init=None,
                    random_state=531,
                    max_features=None,
                    verbose=1,
                    max_leaf_nodes=None,
                    warm_start=False,
                    presort='auto')

            if model == 'lxgb':
                return lgbx.LGBMClassifier(
                    boosting_type="gbdt",
                    num_leaves=2000,
                    max_depth=200,
                    learning_rate=0.005,
                    n_estimators=300,
                    max_bin=500,
                    objective='binary',
                    min_split_gain=0.,
                    min_child_weight=5,
                    min_child_samples=min_sample_leaf,
                    subsample=1.,
                    subsample_freq=1,
                    colsample_bytree=1.,
                    reg_alpha=0.,
                    reg_lambda=0.,
                    random_state=531,
                    n_jobs=-1,
                    silent=True)

            if model.startswith('stacked'):
                ERT = ensemble.ExtraTreesClassifier(
                    bootstrap=bootstrap,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    oob_score=oob_score,
                    class_weight=class_weight,
                    min_samples_leaf=min_sample_leaf,
                    min_samples_split=min_sample_split,
                    max_features='auto',
                    n_jobs=-1)

                Gboost = ensemble.GradientBoostingClassifier(
                    n_estimators=n_estimators,
                    learning_rate=0.005,
                    max_depth=max_depth,
                    loss='deviance',
                    random_state=531,
                    min_samples_split=min_sample_split,
                    min_samples_leaf=min_sample_leaf)

                Light_Gboost = lgbx.LGBMClassifier(
                    boosting_type="gbdt",
                    num_leaves=2000,
                    max_depth=-1,
                    learning_rate=0.005,
                    n_estimators=300,
                    max_bin=500,
                    objective='binary',
                    min_split_gain=0.,
                    min_child_weight=5,
                    min_child_samples=min_sample_leaf,
                    subsample=1.,
                    subsample_freq=1,
                    colsample_bytree=1.,
                    reg_alpha=0.,
                    reg_lambda=0.,
                    random_state=531,
                    n_jobs=-1,
                    silent=False)

                # The suffix picks the meta-classifier; the other two models
                # form the first level.
                if model.endswith('_ERT'):
                    return StackingClassifier(
                        classifiers=[Gboost, Light_Gboost],
                        meta_classifier=ERT,
                        average_probas=True,
                        use_probas=True)
                if model.endswith('_GB'):
                    return StackingClassifier(
                        classifiers=[ERT, Light_Gboost],
                        meta_classifier=Gboost,
                        average_probas=True,
                        use_probas=True)
                if model.endswith('_LXGB'):
                    return StackingClassifier(
                        classifiers=[ERT, Gboost],
                        meta_classifier=Light_Gboost,
                        average_probas=True,
                        use_probas=True)

            raise ValueError('unknown model: %r' % (model,))

        tresholds = np.linspace(0.1, 1.0, 200)

        # shuffle=False: folds are deterministic, so no random_state is
        # given (current scikit-learn rejects one when shuffle is off).
        skf = StratifiedKFold(n_splits=5, shuffle=False)

        # NOTE(review): reset_index() keeps the previous index as a column,
        # which then remains among the features below - confirm intended.
        data = pd.concat([Train, Valid, Test], axis=0).reset_index()
        print(data.shape)
        X = data.drop([label] + ['id_siniestro'], axis=1)
        y = data[[label]]
        y_true = data[label].values

        # Out-of-fold probabilities, written at each row's own position so
        # they stay aligned with ``y_true`` whatever the fold order is.
        oof_proba = np.empty(shape=[X.shape[0], 2])
        for fold, (train_index, test_index) in enumerate(
                skf.split(X.values, y_true), start=1):
            X_train = X.iloc[train_index].values
            X_test = X.iloc[test_index].values
            y_train = y.iloc[train_index].values

            if sampling is None:
                pass
            elif sampling == 'ALLKNN':
                X_train, y_train = under_sampling(X_train, y_train)
            else:
                X_train, y_train = over_sampling(X_train,
                                                 y_train,
                                                 model=sampling)

            fileModel = build_model(y_train.shape[0], X_train.shape[1])
            fileModel.fit(X_train, y_train)
            oof_proba[test_index] = fileModel.predict_proba(X_test)
            print('fold', fold, 'scored rows', test_index.shape)
            del X_train, X_test, y_train

        # Column 1 of predict_proba is used as the score.
        y_pred_score = oof_proba[:, 1]
        print('min', y_pred_score.min())
        print('max', y_pred_score.max())

        scores = []
        for treshold in tresholds:
            y_hat = (y_pred_score > treshold).astype(int)
            scores.append([
                recall_score(y_pred=y_hat, y_true=y_true),
                precision_score(y_pred=y_hat, y_true=y_true),
                fbeta_score(y_pred=y_hat, y_true=y_true, beta=2)
            ])

        scores = np.array(scores)
        best = scores[:, 2].argmax()
        print('F-Score', scores[best, 2], best)
        # Recall / precision / F2 at the threshold with the best F2.
        print('scores', scores[best])
        print(scores)

        plot.plot(tresholds, scores[:, 0], label='$Recall$')
        plot.plot(tresholds, scores[:, 1], label='$Precision$')
        plot.plot(tresholds, scores[:, 2], label='$F_2$')
        plot.ylabel('Score')
        plot.xlabel('Threshold')
        plot.legend(loc='best')
        plot.show()
        plot.close()

        final_tresh = tresholds[best]
        print(final_tresh)

        y_hat_test = (y_pred_score > final_tresh).astype(int)

        data['id_siniestro'] = data['id_siniestro'].map(int)
        comparative['id_siniestro'] = comparative['id_siniestro'].map(int)
        data = pd.merge(data,
                        comparative[['id_siniestro', 'FRAUDE']],
                        how='left',
                        on='id_siniestro')
        # NOTE(review): this needs a 'FRAUDE_Clusters' column in the merged
        # frame and an un-suffixed 'FRAUDE' column from ``comparative`` -
        # presumably the caller passes label='FRAUDE_Clusters'; confirm.
        cnf_matrix = confusion_matrix(data['FRAUDE_Clusters'].values,
                                      y_hat_test)
        plot_confusion_matrix(cnf_matrix,
                              classes=['Normal', 'Abnormal'],
                              title='Confusion matrix')

        cnf_matrix = confusion_matrix(data['FRAUDE'].values, y_hat_test)
        plot_confusion_matrix(cnf_matrix,
                              classes=['Unknown', 'Fraud'],
                              title='Confusion matrix')

        return None

示例#20

0

显示文件

# # Homework
# * Ensembling classification predictions works quite differently from
#   regression.
# Classification blending is easiest to combine on probabilities, so how
# should classification stacking be written so that the first-level models
# hand their probabilities to the meta level as features?

# In[14]:

from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(subsample=0.70,
                                            n_estimators=15,
                                            max_features='sqrt',
                                            max_depth=4,
                                            learning_rate=0.3)
# use_probas=True makes the first-level classifiers pass their predicted
# class probabilities (instead of hard labels) to the meta-classifier,
# which is what the exercise asks for.
stacking = StackingClassifier(classifiers=[gdbt, rf],
                              use_probas=True,
                              meta_classifier=meta_estimator)

# In[15]:

stacking.fit(df, train_Y)
stacking_pred = stacking.predict_proba(df2)
sub = pd.DataFrame({'name': ids, 'poi': stacking_pred[:, 1]})
# DataFrame.append no longer exists in pandas 2.x; concat does the same job.
sub = pd.concat([sub, df3], ignore_index=True)
sub.to_csv('poi_stacking_4.csv', index=False)

# In[ ]:

示例#21

0

显示文件

    def _model_constructor(self):
        """Fit a stacking ensemble on a re-balanced copy of the training data.

        Features come from ``self.train_custom_features``; labels are the
        ``is_duplicate`` column of ``TRAIN_DATA_FILE``. The training set is
        rebuilt as: all negatives, the first 80 % of the positives, then all
        negatives again. Two random forests and two extra-trees classifiers
        form the first level; a logistic regression on their probabilities is
        the meta-classifier.

        Side effects: sets ``self.clfs`` and ``self.model``.

        Returns:
            tuple: ``(fitted stacking classifier, log loss on the training data)``.
        """
        ########################################
        ## sample train/validation data
        ########################################
        print("Loading train data...")
        X_train = self.train_custom_features

        df_train = pd.read_csv(TRAIN_DATA_FILE, encoding="utf-8")
        y_train = df_train['is_duplicate'].values

        # Re-balance: negatives twice, first 80 % of the positives once.
        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        pos_kept = pos_train.iloc[:int(0.8 * len(pos_train))]
        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        X_train = pd.concat((neg_train, pos_kept, neg_train)).values
        y_train = np.array([0] * neg_train.shape[0]
                           + [1] * pos_kept.shape[0]
                           + [0] * neg_train.shape[0])
        print("New duplicate content:", np.mean(y_train))
        del pos_train, pos_kept, neg_train

        ESTIMATORS = 180
        self.clfs = [
            RandomForestClassifier(n_estimators=ESTIMATORS,
                                   n_jobs=-1,
                                   criterion='entropy',
                                   verbose=1),
            RandomForestClassifier(n_estimators=ESTIMATORS,
                                   n_jobs=-1,
                                   criterion='gini',
                                   verbose=1,
                                   warm_start=True),
            ExtraTreesClassifier(n_estimators=ESTIMATORS,
                                 n_jobs=-1,
                                 criterion='gini',
                                 verbose=1),
            ExtraTreesClassifier(n_estimators=ESTIMATORS,
                                 n_jobs=-1,
                                 criterion='entropy',
                                 verbose=1,
                                 warm_start=True),
        ]
        lr = LogisticRegression()

        sclf = StackingClassifier(classifiers=self.clfs,
                                  use_probas=True,
                                  average_probas=False,
                                  verbose=2,
                                  meta_classifier=lr)

        sclf.fit(X_train, y_train)

        self.model = sclf

        # Loss is measured on the same data the model was trained on.
        bst_val_score = log_loss(y_train, sclf.predict_proba(X_train)[:, 1])
        print("Model train loss:", bst_val_score)

        return (sclf, bst_val_score)

示例#22

0

显示文件

model_final.fit(train_stack, train_y)
pre = model_final.predict_proba(test_stack)

# First-level learners for the mlxtend stack.
model1 = RandomForestClassifier(n_estimators=100)
model2 = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=4,
    learning_rate=0.1,
    n_estimators=100,
    num_leaves=124,
    max_depth=13,
    bagging_fraction=0.66,
    feature_fraction=0.88,
    bagging_freq=66,
    min_data_in_leaf=86,
    min_child_weight=8.8,
    min_split_gain=0.02,
    reg_lambda=3,
    reg_alpha=6.7,
    n_jobs=8,
)
model3 = MLPClassifier(alpha=1e-05,
                       hidden_layer_sizes=(5, 2),
                       random_state=1,
                       solver='sgd')

# Meta-classifier, trained on the first-level probabilities.
model_final = RandomForestClassifier(n_estimators=100)

sclf = StackingClassifier(
    classifiers=[model1, model2, model3],
    meta_classifier=model_final,
    use_probas=True,
    average_probas=False,
)
sclf.fit(train_X, train_y)

temp = sclf.predict_proba(X_test)

# Write the predictions: one probability column per class.
result = pd.read_csv('sample_submit.csv')
for _class_idx in range(4):
    result['label_%d' % _class_idx] = temp[:, _class_idx]
result.to_csv('submit1.csv', index=False)

示例#23

0

显示文件

文件： script_11.py 项目： inigooalonso/Project0

def predictor_ev():
    print "Building Neural Net classifiers for devices with events"
    n_input = X_train_ev.shape[1]
    n_train = X_train_ev.shape[0]
    
    from keras.models import Sequential
    from keras.layers import Dense, Activation
    from keras.layers.core import Dropout
    from keras.layers.advanced_activations import PReLU
    from keras.regularizers import l2
    from keras.optimizers import Adadelta
    from keras.optimizers import SGD
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.callbacks import ModelCheckpoint
    
    def create_model(n_hidden_layers=1, nodes=[50], reg=1.0, dropouts=[.5], acts=['relu']):
        n_in = n_input    
        model = Sequential()
        for i in xrange(n_hidden_layers):
            n_out = nodes[i]
            dropout = dropouts[i]
            act = acts[i]
            model.add(Dense(output_dim=n_out, input_dim=n_in, W_regularizer=l2(reg)))
            model.add(Activation(act))
            model.add(Dropout(dropout))
            n_in = n_out
        model.add(Dense(output_dim=12, W_regularizer=l2(reg)))
        model.add(Activation("softmax"))
        # Compile model
        adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
        sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
        return model
    
    class KerasClassifier2(KerasClassifier):
            
        def __init__(self, build_fn, fn_args, random_state=0, nb_epoch=10, batch_size=500, verbose=2):
            self.random_state = random_state
            self.nb_epoch = nb_epoch
            self.batch_size = batch_size
            self.verbose = verbose
            super(KerasClassifier2, self).__init__(build_fn, **fn_args)
            self.classes_= np.arange(12)
            self.n_classes_ = 12
            self.model = build_fn(**fn_args)
            
        def fit(self, X, y, sample_weight=None):
            return super(KerasClassifier2, self).fit(X, indicator(y),
                             verbose = self.verbose, sample_weight=sample_weight,
                             validation_data=(X_cv_ev, indicator(y_cv_ev)),
                             nb_epoch=self.nb_epoch, batch_size=self.batch_size)
    
    
        def predict_proba(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)
            
        def predict(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)            
    
    nn1_args = {'n_hidden_layers': 2, 'nodes': [600, 400], 'reg': 1.8,
                'dropouts': [.3, .4], 'acts': ['relu', 'relu']}
    nn2_args = {'n_hidden_layers': 3, 'nodes': [300, 100, 50], 'reg': 2.0,
                'dropouts': [.2, .4, .5], 'acts': ['relu', 'relu', 'relu']}
    nn3_args = {'n_hidden_layers': 4, 'nodes': [1001, 511, 245, 99], 'reg': 2.0,
                'dropouts': [.2, .3, .2, .3], 'acts': ['relu', 'relu', 'relu', 'relu']}
    nn4_args = {'n_hidden_layers': 1, 'nodes': [500], 'reg': 1.2,
                'dropouts': [.25], 'acts': ['relu']}
    nn5_args = {'n_hidden_layers': 5, 'nodes': [1343, 1012, 757, 539, 117],
                'reg': 2.5, 'dropouts': [.2, .3, .4, .4, .4],
                'acts': ['relu', 'relu', 'relu', 'relu', 'relu']}
    
    clfNN1 = KerasClassifier2(create_model, nn1_args, random_state=5, nb_epoch=5)
    clfNN2 = KerasClassifier2(create_model, nn2_args, random_state=23, nb_epoch=11)
    clfNN3 = KerasClassifier2(create_model, nn3_args, random_state=710, nb_epoch=6)
    clfNN4 = KerasClassifier2(create_model, nn4_args, random_state=5072, nb_epoch=6)
    clfNN5 = KerasClassifier2(create_model, nn5_args, random_state=2016, nb_epoch=12)
    
    print "Building XGBoost classifiers for devices with events"
    xgb_params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3.5,
    }
    
    class XGBClassifier2(xgb.XGBClassifier):
    
        def __init__(self, max_depth=xgb_params['max_depth'],
                     objective='multi:softprob', missing=None, 
                     learning_rate=xgb_params['eta'], n_estimators=40, subsample=1,
                     reg_alpha=xgb_params['alpha'], seed=2016, booster='gblinear'):
            super(XGBClassifier2, self).__init__(max_depth=max_depth, seed=seed,
                        objective=objective, missing=missing,
                        learning_rate=learning_rate, n_estimators=n_estimators,
                        subsample=subsample, reg_alpha=reg_alpha)
            self.booster = xgb_params['booster']
            
        def fit(self, X, y):
            super(XGBClassifier2, self).fit(X.tocsc(), y, eval_metric='mlogloss',
                                            eval_set=[(X_cv_ev.tocsc(), y_cv_ev)])
    
    gbm1 = XGBClassifier2(seed=0, booster='gblinear', n_estimators=28)
    gbm2 = XGBClassifier2(seed=6, booster='gblinear', n_estimators=28)
    gbm3 = XGBClassifier2(seed=151, booster='gbtree', n_estimators=28)
    gbm4 = XGBClassifier2(seed=1047, booster='gbtree', n_estimators=28)
    gbm5 = XGBClassifier2(seed=22, booster='dart', n_estimators=28)
    
    print "Building Logistic Regression classifier for devices with events"
    clfLR = LogisticRegression(C=.02, random_state=2016, multi_class='multinomial', solver='newton-cg')
    
    #Combine results of classifiers
    print "Stacking classifiers for devices with events"
    clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
    meta = LogisticRegression()
    stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)
    
    stack.fit(X_train_ev, y_train_ev)
    print log_loss(y_cv_ev, stack.predict_proba(X_cv_ev))
    y_pred_ev = stack.predict_proba(X_test_ev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
    return y_pred_ev

示例#24

0

显示文件

文件： FQSC_credit_amount_tiaocan.py 项目： dataisfree/data_science_note

    meta_classifier=clf_lg)
label = ['stacking']
sclf.fit(X_train_standar, y_train)

# Cross-validated accuracy of the stacked model on the standardized
# training data.
score_stacking = cross_val_score(sclf,
                                 X_train_standar,
                                 y_train,
                                 scoring='accuracy')
# The F1 run used to be computed and thrown away; keep the scores so the
# cross-validation work is not wasted.
score_stacking_f1 = cross_val_score(sclf,
                                    X_train_standar,
                                    y_train,
                                    scoring='f1')
score_mean_sclf = score_stacking.mean()
print('stacking final score\'s mean is % .2f' % score_mean_sclf)

print('accuracy: %.2f (+/- %.2f) [%s]' %
      (score_stacking.mean(), score_stacking.std(), label))

# Hold-out evaluation on the standardized test features.
result_stacking = sclf.predict(X_test_stander)
result_stacking_proba = sclf.predict_proba(X_test_stander)
clf_stacking_test_score = sclf.score(X_test_stander, y_test)

# The curve needs continuous scores for the positive class, taken from the
# same standardized features the model was trained on. The previous call
# passed hard labels predicted on the unscaled X_test.
precision, recall, thresholds = precision_recall_curve(
    y_test, result_stacking_proba[:, 1])

# Classify as positive only when the predicted probability reaches 0.8.
report = result_stacking_proba[:, 1] >= 0.8
print(classification_report(y_test, report, target_names=['0', '1']))

# ==============================================================================
# Model persistence
# os.chdir(u'D:\【01】行健金融\【01】数据中心\【05】数据分析项目\【03】2018\May\规则引擎_分期商城_风控+授信')
# joblib.dump(sclf, 'stackingpkl.pkl')
# joblib.dump(scaler, 'scaler.pkl')

# ==============================================================================

示例#25

0

显示文件

文件：最终模型.py 项目： ytyz1307zzh/Similar-Crowd-Labeling

def stack_test(train_x, train_y, test_x, test_y):
    """Fit a LightGBM/XGBoost stack and report its AUC on the test split.

    Three LightGBM classifiers are stacked under an XGBoost meta-classifier
    that is trained on their averaged class probabilities. The AUC of the
    positive-class probability on ``test_x`` / ``test_y`` is written to
    ``<dir_path>/auc_score.txt`` (``dir_path`` is a module-level name) and
    echoed to stdout.

    Returns:
        The fitted ``StackingClassifier``.
    """
    print("start stacking test")

    # Settings shared by every LightGBM base learner; only the boosting
    # type differs between them.
    lgb_params = dict(num_leaves=50,
                      reg_alpha=0.0,
                      reg_lambda=1,
                      max_depth=-1,
                      n_estimators=2000,
                      objective='binary',
                      min_child_weight=50,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      subsample_freq=1,
                      learning_rate=0.1,
                      random_state=2018,
                      n_jobs=-1)
    clf1 = lgb.LGBMClassifier(boosting_type='gbdt', **lgb_params)
    clf2 = lgb.LGBMClassifier(boosting_type='dart', **lgb_params)
    # NOTE(review): clf3 is configured identically to clf1 (same seed too);
    # confirm the duplicate base learner is intended.
    clf3 = lgb.LGBMClassifier(boosting_type='gbdt', **lgb_params)

    clf4 = XGBClassifier(max_depth=5,
                         learning_rate=0.1,
                         n_estimators=2000,
                         objective='binary:logistic',
                         booster='gbtree',
                         n_jobs=-1,
                         min_child_weight=50,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         random_state=2018)

    stack_clf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   meta_classifier=clf4,
                                   use_probas=True,
                                   average_probas=True,
                                   verbose=1)

    stack_clf.fit(train_x, train_y)
    pred_score = stack_clf.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, pred_score)

    # Use a context manager so the report file is flushed and closed; the
    # handle used to be left open.
    with open(dir_path + r'/auc_score.txt', 'w') as output:
        print("auc score is {}".format(auc_score), file=output)
    print("auc score is {}".format(auc_score))

    return stack_clf

示例#26

0

显示文件

文件： Data_model_building.py 项目： zhaiwc/Analyse_Tool

class cls_model_stack():
    """Stack several named base classifiers under a meta-classifier.

    Parameters
    ----------
    listModelName : names of the base models to train; names outside the
        supported list are skipped by ``fit``.
    isGridSearch : forwarded to every ``cls_model``.
    dict_para : optional mapping ``model name -> parameter settings`` passed
        to ``cls_model.set_parameters``. Defaults to an empty dict.
    meta_reg : meta-classifier to use, ``'logistic'`` or ``'knn'``.
    """

    # Base model names that fit() knows how to build.
    _BASIC_CLS = ('logistic', 'knn', 'svm', 'dt', 'rf', 'adaBoost', 'gbm', 'xgb', 'bp')
    # Supported values of meta_reg.
    _META_CHOICES = ('logistic', 'knn')

    def __init__(self, listModelName, isGridSearch=True, dict_para=None, meta_reg='logistic'):
        self.listModelName = listModelName
        self.isGridSearch = isGridSearch
        # A fresh dict per instance: a shared ``{}`` default would leak
        # settings between instances.
        self.dict_para = {} if dict_para is None else dict_para
        self.meta_reg = meta_reg
        # Default state
        self.train_model = defaultdict(list)
        self.stack = None

    def fit(self, x, y):
        '''
        Fit every requested base model, then fit the stacking classifier.

        ``x`` and ``y`` must expose ``.values`` (they are passed to the
        stack as arrays).

        Raises
        ------
        ValueError
            If ``meta_reg`` is not a supported meta-classifier name.
        '''
        # Validate first so a bad setting fails before any base model is
        # trained (it used to surface as an unbound local afterwards).
        if self.meta_reg not in self._META_CHOICES:
            raise ValueError(
                "meta_reg must be one of %r, got %r"
                % (self._META_CHOICES, self.meta_reg))

        model_list = []
        for model_name in self.listModelName:
            if model_name not in self._BASIC_CLS:
                continue

            model = cls_model(model_name, isGridSearch=self.isGridSearch)
            # Apply the user's custom parameter settings, if any were given
            # for this model.
            if model_name in self.dict_para:
                model.set_parameters(self.dict_para[model_name])
            # Fit the base model
            model.fit(x, y)
            model_list.append(model.cls_model)
            self.train_model[model_name] = model

        if self.meta_reg == 'logistic':
            meta_cls = linear_model.LogisticRegression()
        else:
            meta_cls = KNeighborsClassifier()

        self.stack = StackingClassifier(classifiers=model_list, meta_classifier=meta_cls)
        self.stack.fit(x.values, y.values.reshape(len(y)))

    def predict(self, x):
        """Return the stacked model's predictions for ``x``."""
        return self.stack.predict(x)

    def get_vip(self, stack_method='avg', isplot=True):
        """Combine the variable importances of the trained base models.

        Models whose ``get_vip`` returns ``None`` are ignored. With
        ``stack_method='avg'`` the importances are averaged per variable,
        sorted and returned as a one-column DataFrame. Any other value
        leaves the per-model results as a plain list (no 'weight' method is
        implemented). Returns ``None`` when no model reports importances.
        """
        res = []
        for key in self.train_model:
            vip = self.train_model[key].get_vip(isplot=False)
            if vip is not None:
                res.append(vip)

        # Merge the results of the different models
        if len(res) == 0:
            return None

        temp = pd.concat(res, axis=1)
        if stack_method == 'avg':
            res = temp.mean(axis=1).sort_values()
            res = pd.DataFrame(res, columns=['variable importance'])

        # Draw the bar chart
        if isplot:
            plt = Data_plot.plot_bar_analysis(res)
            plt.title('variable importance')
            plt.show()

        return res

    def predict_proba(self, x):
        """Return class probabilities for ``x``, or ``None`` on failure.

        Best-effort by design: any ordinary error (for example an unfitted
        stack) yields ``None``. Interpreter-level exceptions such as
        ``KeyboardInterrupt`` are no longer swallowed.
        """
        x_pred = np.array(x)
        try:
            return self.stack.predict_proba(x_pred)
        except Exception:
            return None

示例#27

0

显示文件

def stack_test(train_x, train_y, predict_x, res):
    """Fit a five-model stack and write positive-class scores to a CSV.

    Three LightGBM variants (gbdt, dart, rf) and two MLPs are stacked under
    a LightGBM meta-classifier trained on their class probabilities. The
    positive-class probability for ``predict_x`` is stored in
    ``res['score']`` (rounded to six decimals) and ``res`` is written to
    ``<dir_path>/submission.csv``.

    Returns the fitted ``StackingClassifier``.
    """
    print("start test")

    # Every LightGBM model uses these settings; only boosting_type varies.
    shared_lgb = dict(
        num_leaves=31,
        reg_alpha=0.0,
        reg_lambda=1,
        max_bin=150,
        max_depth=-1,
        n_estimators=500,
        objective='binary',
        subsample=0.8,
        colsample_bytree=0.8,
        subsample_freq=1,
        learning_rate=0.1,
        random_state=2018,
        n_jobs=-1,
    )
    # Settings common to both neural networks.
    shared_mlp = dict(
        hidden_layer_sizes=(100, 100, 100),
        activation='relu',
        alpha=0.001,
        random_state=2018,
        learning_rate_init=0.1,
    )

    clf1 = lgb.LGBMClassifier(boosting_type='gbdt', **shared_lgb)
    clf2 = lgb.LGBMClassifier(boosting_type='dart', **shared_lgb)
    clf3 = MLPClassifier(solver='adam', **shared_mlp)
    clf4 = lgb.LGBMClassifier(boosting_type='rf', **shared_lgb)
    clf5 = MLPClassifier(learning_rate='invscaling', solver='sgd', **shared_mlp)

    # Disabled alternatives kept for reference:
    # clf5=XGBClassifier(
    #     max_depth=5, learning_rate=0.1, n_estimators=500, objective='binary:logistic',
    #     booster='gbtree', n_jobs=-1, min_child_weight=5,scale_pos_weight=10,
    #     subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,random_state=2018
    #  )
    #
    # clf6=XGBClassifier(
    #     max_depth=5, learning_rate=0.1, n_estimators=500, objective='binary:logistic',
    #     booster='gbtree', n_jobs=-1, min_child_weight=5,
    #     subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,random_state=2018
    #  )

    # Meta-classifier.
    clf6 = lgb.LGBMClassifier(boosting_type='gbdt', **shared_lgb)

    base_learners = [clf1, clf2, clf3, clf4, clf5]
    stack_clf = StackingClassifier(
        classifiers=base_learners,
        meta_classifier=clf6,
        use_probas=True,
        verbose=1,
    )
    stack_clf.fit(train_x, train_y)

    positive_proba = stack_clf.predict_proba(predict_x)[:, 1]
    res['score'] = positive_proba
    # Round-trip through a '%.6f' string to cap the scores at six decimals.
    res['score'] = res['score'].apply(lambda p: float('%.6f' % p))
    res.to_csv(dir_path + r'/submission.csv', index=False)

    # Disabled cross-validation report kept for reference:
    # for clf, label in zip([clf1, clf2, clf3, clf4,stack_clf],
    #                   ['lgbm1', 'lgbm2', 'mlp','lgbm3', 'stack_clf']):
    #     scores =cross_val_score(clf, train_x, train_y, cv=4, scoring='f1')
    #     print("Accuracy: %0.2f (+/- %0.2f) [%s]"
    #       % (scores.mean(), scores.std(), label))
    return stack_clf

示例#28

0

显示文件

文件： iris_stack3.py 项目： andong0323/stacking-1

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

################## load packages #####################
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

################## load data #####################
iris = datasets.load_iris()
x = iris.data
y = iris.target

################## define classifier #####################

# Each base pipeline sees only its own pair of feature columns.
first_pair = ColumnSelector(cols=(0, 1))
second_pair = ColumnSelector(cols=(2, 3))
pipe1 = make_pipeline(first_pair, LogisticRegression())
pipe2 = make_pipeline(second_pair, LogisticRegression())

# A logistic regression combines the two pipelines' outputs.
meta_lr = LogisticRegression()
sclf = StackingClassifier(
    classifiers=[pipe1, pipe2],
    meta_classifier=meta_lr,
)

################## fit and predict #####################
sclf.fit(x, y)

predicted_labels = sclf.predict(x)
print(predicted_labels)

########### predict class probability ###########
predicted_probas = sclf.predict_proba(x)
print(predicted_probas)

示例#29

0

显示文件

Test_lg = Test_df[Concat_df_list].values


# Stacking ensemble of the XGBoost and logistic-regression models
clf = XGBClassifier(max_depth=4, learning_rate=0.1,
                    n_estimators=80, silent=True,
                    objective="binary:logistic", booster='gbtree',
                    min_child_weight=3, subsample=0.8,
                    gamma=0)

# This import used to carry a stray leading space, which is an
# IndentationError at module level.
from sklearn.linear_model import LogisticRegression
clf2 = LogisticRegression(C=0.1, penalty='l2', tol=1e-4)

from sklearn.ensemble import RandomForestClassifier
# NOTE(review): clf4 is built but not part of the stack below -- confirm
# whether it should be added or is used further down the script.
clf4 = RandomForestClassifier(n_estimators=400, oob_score=True)


from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier


# The meta learner is a logistic regression trained on the base models'
# class probabilities.
eclf = StackingClassifier(classifiers=[clf, clf2],
                          meta_classifier=LogisticRegression(C=0.1, penalty='l2', tol=1e-4),
                          use_probas=True, verbose=3)
eclf.fit(Train_lg, Train_label)
R = eclf.predict_proba(Test_lg)
# NOTE(review): ids come from `Test` while the features come from `Test_df`;
# confirm both refer to the same rows in the same order.
instance_id_list = Test['cust_id'].values

# One line per customer: id and probability of the positive class.
with open('1.csv', 'w') as f:
    f.write('cust_id,pred_prob\n')
    for cust_id, proba in zip(instance_id_list, R):
        f.write('%d,%f\n' % (cust_id, float(proba[1])))

示例#30

0

显示文件

class FraudModel(object):
    def __init__(self,
                 alpha=0.1,
                 n_jobs=-1,
                 max_features='sqrt',
                 n_estimators=1000,
                 RandomForest=True,
                 KMeansFeatures=True,
                 NaiveBayes=True):
        """
        INPUT:
        - alpha = Additive laplace smoothing parameter for NaiveBayes
        - n_jobs = Number of jobs to run RFC on
        - max_features = Number of features to consider on RFC
        - n_estimators = Number of trees in RFC
        - RandomForest = Bool, run RFC
        - KMeansFeatures = Bool, include K means features in RFC
        - NaiveBayes = Bool, run MNB

        ATTRIBUTES:
        - RFC = Random Forest Classifier
        - MNB = Multinomial Naive Bayes Classifier
        - LogR = Logistic regression used as the stack's meta classifier
        - STK = Stacking classifier over RFC and MNB
        """
        self.RFC = RandomForestClassifier(n_jobs=n_jobs,
                                          max_features=max_features,
                                          n_estimators=n_estimators)
        self.MNB = MultinomialNB(alpha=alpha)
        self.LogR = LogisticRegression()
        self.STK = StackingClassifier(classifiers=[self.RFC, self.MNB],
                                      meta_classifier=self.LogR,
                                      use_probas=True)

        self.RandomForest = RandomForest
        self.KMeansFeatures = KMeansFeatures
        self.NaiveBayes = NaiveBayes

    def _random_forest_features(self, X, word_counts):
        """Return the random-forest feature frame for X (no 'description')."""
        if self.KMeansFeatures:
            X_cluster = compute_cluster_distance(word_counts,
                                                 self.cluster_centers)
            merged = pd.merge(X_cluster,
                              X,
                              left_index=True,
                              right_index=True)
            return merged.drop(columns=['description'])
        # Without K-means features the forest sees X minus the raw text.
        # This also covers NaiveBayes-only text processing, where RF_X used
        # to be left undefined.
        return X.drop(columns=['description'])

    def fit(self, X, y):
        """
        INPUT:
        - X: dataframe representing feature matrix for training data
        - y: series representing labels for training data
        """
        word_counts = None

        # NLP
        if self.KMeansFeatures or self.NaiveBayes:
            desc_no_html = update_data_frame(X)
            self.tfidf = TfidfVectorizer(stop_words='english', max_features=10)
            word_counts = self.tfidf.fit_transform(
                desc_no_html['description_no_HTML'])

            if self.KMeansFeatures:
                # K-means
                desc_kmeans = KMeans(n_clusters=5, random_state=56, n_jobs=-1)
                desc_kmeans.fit(word_counts)
                self.cluster_centers = desc_kmeans.cluster_centers_

        RF_X = self._random_forest_features(X, word_counts)

        # Random Forest
        if self.RandomForest:
            self.RFC.fit(RF_X, y)

        # Naive Bayes
        if self.NaiveBayes:
            self.MNB.fit(word_counts, y)

        # Stacked Classifier
        if self.RandomForest and self.NaiveBayes:
            # NOTE(review): make_pipeline receives feature matrices as
            # pipeline steps and STK.fit is called without a feature matrix.
            # This does not look like the documented sklearn / mlxtend call
            # signatures -- confirm and rework; left unchanged here.
            RFCpipeline = make_pipeline(RF_X, self.RFC)

            MNBpipeline = make_pipeline(word_counts, self.MNB)

            self.STK.fit(y, classifiers=[RFCpipeline, MNBpipeline])

    def predict_proba(self, X):
        """
        INPUT:
        - X: dataframe representing feature matrix for data

        OUTPUT:
        - predict_proba output of the stacked model when both RandomForest
          and NaiveBayes are enabled, otherwise of whichever single model is
          enabled; None when neither is enabled
        """
        word_counts = None
        if self.KMeansFeatures or self.NaiveBayes:
            desc_no_html = update_data_frame(X)
            word_counts = self.tfidf.transform(
                desc_no_html['description_no_HTML'])

        RF_X = self._random_forest_features(X, word_counts)

        if self.RandomForest and self.NaiveBayes:
            # NOTE(review): the stack is given the raw X, not RF_X or
            # word_counts -- confirm once the stacking fit is reworked.
            return self.STK.predict_proba(X)
        if self.RandomForest:
            return self.RFC.predict_proba(RF_X)
        if self.NaiveBayes:
            return self.MNB.predict_proba(word_counts)
        return None

    def _log_loss(
        self,
        y_true,
    ):
        # Placeholder: not implemented.
        pass

示例#31

0

显示文件

    'meta-logisticregression__C': C
}

# NOTE(review): fit_params is not passed to anything in this part of the
# script -- confirm whether it is used elsewhere.
fit_params = {"early_stopping_rounds": 100}

# Wrap the estimator built earlier in the script in a randomized
# hyper-parameter search; `grid` is rebound to the search object.
grid = RandomizedSearchCV(grid,
                          n_jobs=n_jobs,
                          param_distributions=params,
                          verbose=3,
                          n_iter=n_iter_search,
                          cv=cv)

print('fitting')
grid.fit(x, y)

joblib.dump(grid, 'export/trend_model_random_%s.pkl' % version)

# Positive-class probability on the training data.
predicted = grid.predict_proba(x)[:, 1]
print('trian roc: ', roc_auc_score(y, predicted))
print('val roc: ', grid.best_score_)
print('best params: ', grid.best_params_)

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array and works on older versions too.
test_features = test.values if oversamp else test
predicted = pd.Series(grid.predict_proba(test_features)[:, 1])
predicted.index = test_ids
predicted.to_csv('export/trend_predict_random_%s_%s.csv' %
                 (version, int(time.time())))
# `a` is the start timestamp taken earlier in the script.
print('cost time: ', time.time() - a)

示例#32

0

显示文件

print("Support vector machines : Log Loss: %0.2f" %
      (log_loss(cv_y, sig_clf2.predict_proba(cv_x_onehotCoding))))
sig_clf3.fit(train_x_onehotCoding, train_y)
print("Naive Bayes : Log Loss: %0.2f" %
      (log_loss(cv_y, sig_clf3.predict_proba(cv_x_onehotCoding))))
print("-" * 50)

# Candidate values for the meta classifier's C.
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10]
# NOTE: despite its name, best_alpha tracks the lowest CV log loss seen so
# far, not the C value that produced it.
best_alpha = 999
for i in alpha:
    lr = LogisticRegression(C=i)
    sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3],
                              meta_classifier=lr,
                              use_probas=True)
    sclf.fit(train_x_onehotCoding, train_y)
    # Score once and reuse it; the CV set used to be predicted twice per
    # candidate.
    log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding))
    print("Stacking Classifer : for the value of alpha: %f Log Loss: %0.3f" %
          (i, log_error))
    if best_alpha > log_error:
        best_alpha = log_error

# NOTE(review): the final model hard-codes C=0.1 instead of using the search
# result above -- confirm this is intended.
lr = LogisticRegression(C=0.1)
sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3],
                          meta_classifier=lr,
                          use_probas=True)
sclf.fit(train_x_onehotCoding, train_y)

log_error = log_loss(train_y, sclf.predict_proba(train_x_onehotCoding))
print("Log loss (train) on the stacking classifier :", log_error)

log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding))
print("Log loss (CV) on the stacking classifier :", log_error)

示例#33

0

显示文件

文件： Final_submission.py 项目： abzeefly/Overfitting-Challenge

                             class_weight='balanced',
                             max_iter=10)
modelDT = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=5,
                                 min_samples_split=2)
modelXGB = XGBClassifier(max_depth=2,
                         gamma=2,
                         eta=0.8,
                         reg_alpha=0.5,
                         reg_lambda=0.5)

# Standardize the features. The scaler is fitted on the training data only
# and then applied to the evaluation data, so both sets share one scale.
# It used to be re-fitted on the evaluation set.
scaler = StandardScaler()
normX = scaler.fit_transform(normX)
X_eval = scaler.transform(X_fs_eval)

# Stack the classifiers using mlxtend, with the LR model also acting as the
# meta_classifier to give it more weight.
m = StackingClassifier(classifiers=[modelLR, modelDT, modelXGB],
                       use_probas=True,
                       meta_classifier=modelLR)

# Fit the model and predict on the *scaled* evaluation features. The
# unscaled X_fs_eval was passed before, which does not match the scaled
# training data.
m.fit(normX, normY)
pred = m.predict_proba(X_eval)[:, 1]

# Save the results into the file.
# NOTE(review): this overwrites the sample file it was read from -- confirm.
submission = pd.read_csv('sample_submission.csv')
submission['target'] = pred
submission.to_csv('sample_submission.csv', index=False)