def kfold_train(mode):
    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    for i in range(10):
        print(str(i + 1) + 'th kfold ' + '*' * 50)
        kf = KFold(n_splits=5, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(
                    train_x,
                    np.argmax(train_y, 1),
                    eval_set=(test_x, np.argmax(test_y, 1)),
                    #early_stopping_rounds=1000, verbose=False
                )
                #print(pd.DataFrame({'column': features, 'importance': model.feature_importances_}).sort_values(by='importance'))
            if mode == 'lgb':
                model = lightgbm_model()
                model.fit(
                    train_x,
                    np.argmax(train_y, 1),
                    eval_set=(test_x, np.argmax(test_y, 1)),
                    # early_stopping_rounds=50, verbose=True
                    verbose=False)
            if mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, np.argmax(train_y, 1), verbose=True)
            if mode == 'stack':
                model = StackingClassifier(classifiers=[
                    xgboost_model(),
                    catboost_model(),
                    lightgbm_model(),
                    adaboost_model()
                ],
                                           use_probas=True,
                                           average_probas=False,
                                           meta_classifier=lr)

                model.fit(train_x, np.argmax(train_y, 1))
            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(np.argmax(test_y, 1), np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))
            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    prediction = prediction / 50.
    return prediction
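The helper factories and module-level names that kfold_train() relies on (x_train, x_test, labels, lr, and the *_model() constructors) are not shown in the excerpt. A minimal sketch of what they might look like — the hyperparameters here are illustrative placeholders, not the originals:

import numpy as np
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

def catboost_model():
    # multiclass CatBoost; iteration count chosen arbitrarily for the sketch
    return CatBoostClassifier(iterations=500, loss_function='MultiClass', verbose=False)

def lightgbm_model():
    return LGBMClassifier(n_estimators=500, objective='multiclass')

def xgboost_model():
    return XGBClassifier(n_estimators=500, objective='multi:softprob')

def adaboost_model():
    return AdaBoostClassifier(n_estimators=200)

# meta-learner used by the 'stack' branch above
lr = LogisticRegression(max_iter=1000)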
def test_use_features_in_secondary_sparse_input_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    sclf = StackingClassifier(classifiers=[clf1],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(sparse.csr_matrix(X), y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(sparse.csr_matrix(X[idx]))[:, 0]
    expect = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(y_pred, expect, 3)
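What these use_features_in_secondary tests exercise: the meta-classifier trains on the original features stacked next to the level-1 outputs. A conceptual sketch of that meta-feature construction (not mlxtend's exact internals):

import numpy as np

def meta_input(X, level1_proba):
    # meta-features = [original features | base-model class probabilities]
    return np.hstack((X, level1_proba))

# e.g. 150 samples, 4 raw features, 3 class probabilities -> 7 meta-features
print(meta_input(np.zeros((150, 4)), np.zeros((150, 3))).shape)  # (150, 7)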
Example No. 3
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.911, 0.829, 0.885])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example No. 5
    def model_test(self, model, best_params):

        print('Model Test')
        print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        lr = self.model_init(model)
        clf1 = self.model_init('KNN')
        clf2 = self.model_init('RFC')
        clf3 = self.model_init('GNB')
        sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

        sclf.set_params(**best_params)

        train_data = self.train.values.copy()
        train_label = self.train_label['label'].values.copy()

        sclf.fit(train_data, train_label)

        if model.upper() == 'LR':
            coef = sclf.coef_.reshape(sclf.coef_.shape[1])
            ind = coef.argsort()
            att = self.train.columns[ind[-30:]].tolist()
            print(att)
        elif model.upper() in ('RFC', 'XGB'):
            imp = sclf.feature_importances_
            print(imp)
            ind = imp.argsort()
            att = self.train.columns[ind[-30:]].tolist()
            print(att)

        test_data = self.test.values.copy()
        test_label = self.test_label['label'].values.copy()
        test_label = test_label.reshape(test_label.shape[0])

        res_proba = sclf.predict_proba(test_data)
        res_auc = roc_auc_score(test_label, res_proba[:, 1])

        print('Model: {0} ; Test: {1}'.format(model, res_auc))

        print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        return res_auc
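A hypothetical best_params for the stack above, assuming model_init() returns the usual scikit-learn classes. mlxtend names nested estimators by their lowercased class name and the meta-learner with a 'meta-' prefix — the same convention as the 'meta-logisticregression__C' key in Example No. 15 below; the values here are placeholders:

best_params = {
    'kneighborsclassifier__n_neighbors': 5,        # clf1
    'randomforestclassifier__n_estimators': 200,   # clf2
    'meta-logisticregression__C': 1.0,             # meta-learner
}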
Example No. 6
def test_use_features_in_secondary_sparse_input_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    sclf = StackingClassifier(classifiers=[clf1],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(sparse.csr_matrix(X), y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(
        sparse.csr_matrix(X[idx])
    )[:, 0]
    expect = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(y_pred, expect, 3)
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    X, y = iris_data()
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr',
                              random_state=1)
    clf1 = RandomForestClassifier(n_estimators=10, random_state=1)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example No. 9
    def stacking_model(self, X_train, X_test, y_train, bst_xgb, bst_lgb):
        '''
        Stack the two models with the best overall performance, lgb and xgb.
        The meta-classifier is a fairly simple LR model that further refines
        the predictions on top of the already-trained, parameter-tuned base
        models.
        :param X_train:
        :param X_test:
        :param y_train:
        :param bst_xgb:
        :param bst_lgb:
        :return:
        '''
        lr = linear_model.LogisticRegression(random_state=7)
        # stack both tuned base models, as described in the docstring
        sclf = StackingClassifier(classifiers=[bst_xgb, bst_lgb],
                                  use_probas=True,
                                  average_probas=False,
                                  meta_classifier=lr)
        sclf.fit(X_train, y_train)
        predictions = sclf.predict_proba(X_test)[:, 1]
        joblib.dump(sclf, "./models/train_model_{}.m".format(self.num))
        return predictions
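A hedged usage sketch for the model persisted above; the path mirrors the joblib.dump() call, and num stands in for self.num:

import joblib

def load_and_score(features, num):
    """Reload the persisted stacker and return P(class == 1)."""
    sclf = joblib.load("./models/train_model_{}.m".format(num))
    return sclf.predict_proba(features)[:, 1]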
Example No. 10
    def stacking_model2(self, X_train, X_test, y_train, bst_xgb, bst_forest,
                        bst_gradient, bst_lgb):
        '''
        Combine four algorithms.
        :param X_train: training set
        :param X_test: test set
        :param y_train: training labels
        :param bst_xgb: tuned xgb model
        :param bst_forest: tuned random forest model
        :param bst_gradient: tuned gradient boosting model
        :param bst_lgb: tuned lgb model
        :return: predictions
        '''
        lr = linear_model.LogisticRegression(random_state=7)
        sclf = StackingClassifier(
            classifiers=[bst_xgb, bst_forest, bst_gradient, bst_lgb],
            use_probas=True,
            average_probas=False,
            meta_classifier=lr)
        sclf.fit(X_train, y_train)
        predictions = sclf.predict_proba(X_test)[:, 1]
        return predictions
y = data_train.iloc[1:, 10]
y = np.array(y)
X_test = data_test.iloc[1:, 5:10]
# print(X_test)
X_test = np.array(X_test)

# clf1 = cbt.CatBoostClassifier(iterations=1000,task_type='GPU',loss_function='MultiClass')
# clf2 = lgb.LGBMClassifier(num_leaves=31,bagging_fraction=0.5,feature_fraction=0.8,max_depth=10,n_estimators=200)
clf2 = RandomForestClassifier()
clf3 = xgb.XGBClassifier(n_estimators=500)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf2, clf3], meta_classifier=lr)

sclf.fit(X, y)

y_predict = sclf.predict_proba(X_test)
# print(y_predict[0])
y_predict_50 = [y_predict[i:i + 50]
                for i in range(0, y_predict.shape[0], 50)]  # 120 groups of 50 rows each

A = []
# sum each group column-wise, then divide by 50 to average
for i in y_predict_50:
    a = np.sum(i, axis=0) / 50
    A.append(a)
A = pd.DataFrame(A)
A.columns = ['Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio']
A.to_csv('submission_stacking.csv', index=True, index_label='Group')

# B = pd.DataFrame()
# B.insert(0, 'Excellent ratio', A['0'])
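The block-averaging loop above can be collapsed into one vectorized step, assuming y_predict has 120 × 50 rows and 4 columns:

A_vec = y_predict.reshape(-1, 50, 4).mean(axis=1)  # same values as A above, shape (120, 4)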
Example No. 12
def kfold_train(mode):
    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    result_list = []
    n = 10
    for i in range(n):
        print(str(i + 1) + 'th kfold ' + '*' * 50)
        result = []
        kf = KFold(n_splits=5, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(
                    train_x,
                    np.argmax(train_y, 1),
                    eval_set=(test_x, np.argmax(test_y, 1)),
                    #early_stopping_rounds=1000, verbose=False
                )
                #print(pd.DataFrame({'column': features, 'importance': model.feature_importances_}).sort_values(by='importance'))
            if mode == 'lgb':
                model = lightgbm_model()
                model.fit(
                    train_x,
                    np.argmax(train_y, 1),
                    eval_set=(test_x, np.argmax(test_y, 1)),
                    # early_stopping_rounds=50, verbose=True
                    verbose=False)
            if mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, np.argmax(train_y, 1), verbose=True)
            if mode == 'stack':
                model = StackingClassifier(classifiers=[
                    catboost_model(),
                    lightgbm_model(),
                    xgboost_model(),
                    adaboost_model()
                ],
                                           use_probas=True,
                                           average_probas=False,
                                           meta_classifier=lr)
                model.fit(train_x, np.argmax(train_y, 1))
            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(np.argmax(test_y, 1), np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))
            # kept for offline validation
            X_valid = train_data.iloc[test_index, :].copy()
            X_valid.loc[:, [
                'prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail'
            ]] = pred
            result.append(X_valid)
            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
        result_list.append(pd.concat(result))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    prediction = prediction / (5. * n)
    # offline evaluation
    mean = []
    for group in range(100):
        for result in result_list:
            cols_prob = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
            cols_label = ['label_Excellent', 'label_Good', 'label_Pass', 'label_Fail']
            # select the groupby columns with a list (a bare tuple is invalid
            # in modern pandas)
            temp = result.groupby(['group_%s' % group],
                                  as_index=False)[cols_prob + cols_label].mean()
            a = np.abs(temp[cols_prob].values - temp[cols_label].values).mean()
            mean.append(1 / (1 + 10 * a))
    print("线下mae评估:", np.mean(mean), np.std(mean))

    return prediction
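A minimal numeric illustration of the offline score printed above — 1 / (1 + 10 · MAE) over group-averaged probabilities versus group-averaged one-hot labels:

import numpy as np

pred = np.array([[0.30, 0.40, 0.20, 0.10]])   # group-mean predicted ratios
true = np.array([[0.25, 0.45, 0.20, 0.10]])   # group-mean true ratios
a = np.abs(pred - true).mean()                # MAE = 0.025
print(1.0 / (1.0 + 10.0 * a))                 # 0.8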
Example No. 13
#clfKNN = KNeighborsClassifier(n_neighbors=5)
#clfKNN.fit(X_train_noev, y_train_noev)
#print log_loss(y_cv_noev, clfKNN.predict_proba(X_cv_noev))
#
##NB
#clfNB = MultinomialNB(alpha=1.0)
#clfNB.fit(X_train_noev, y_train_noev)
#print log_loss(y_cv_noev, clfNB.predict_proba(X_cv_noev))

#Combine results of classifiers
print "Stacking classifiers for devices with no events"
clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
meta = LogisticRegression()
stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)

stack.fit(X_train_noev, y_train_noev)
print(log_loss(y_cv_noev, stack.predict_proba(X_cv_noev)))
y_pred_noev = stack.predict_proba(X_test_noev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
#    return y_pred_noev

y_pred_ev = predictor_ev()
#y_pred_noev = predictor_noev()

# Write results
result = pd.DataFrame(np.vstack((y_pred_ev, y_pred_noev)), columns=le.classes_)  # stack the two device groups row-wise
result["device_id"] = test_dev
result = result.set_index("device_id")
result.to_csv('stacking_1.gz', index=True,
              index_label='device_id', compression="gzip")
Example No. 14
def main():
    # excel_file= 'training.xlsx'
    # df=pd.DataFrame(pd.read_excel(excel_file))
    # excel_file_test= 'test1.xlsx'
    # df1=pd.DataFrame(pd.read_excel(excel_file_test))

    # a=[]
    # b=[]
    # a1=[]
    # b1=[]
    # for url in df['url']:
    #     a.append(url)

    # for output in df['phishing']:
    #     b.append(output)

    # for url1 in df1['url']:
    #     a1.append(url1)

    # for output in df1['result']:
    #     b1.append(output)

    excel_file = 'training.xlsx'
    df1 = pd.DataFrame(pd.read_excel(excel_file))
    length = round(len(df1) / 100 * 80)  # 80/20 train/validation split

    df = pd.DataFrame(df1[0:length])
    df1 = pd.DataFrame(df1[length:])

    a = []
    b = []
    a1 = []
    b1 = []
    for url in df['url']:
        a.append(url)

    for output in df['phishing']:
        b.append(output)

    for url1 in df1['url']:
        a1.append(url1)

    for output in df1['phishing']:
        b1.append(output)

    c = []
    d = []
    for url1, output1 in zip(a, b):
        url = url1
        output = output1
        c.append(extract_feature_train(url, output))

    for url1, output1 in zip(a1, b1):
        url = url1
        output = output1
        d.append(extract_feature_test(url, output))

    df = pd.DataFrame(c,
                      columns=[
                          'r', 'length_of_url', 'http_has', 'suspicious_char',
                          'prefix_suffix', 'dots', 'slash', 'phis_term',
                          'sub_domain', 'ip_contain'
                      ])

    df.to_csv('id3.csv', sep=',', encoding='utf-8')

    df_test = pd.DataFrame(d,
                           columns=[
                               'r', 'length_of_url', 'http_has',
                               'suspicious_char', 'prefix_suffix', 'dots',
                               'slash', 'phis_term', 'sub_domain', 'ip_contain'
                           ])

    df_test.to_csv('feature_test.csv', sep=',', encoding='utf-8')

    data_train = importdata_train()
    data_test = importdata_test()
    X, Y = splitdataset(data_train)
    X1, Y1 = splitdataset(data_test)
    clf = svm.SVC(kernel='linear')
    clf.fit(X, Y)

    model = XGBClassifier(max_depth=5,
                          learning_rate=0.01,
                          n_estimators=100,
                          gamma=0,
                          min_child_weight=1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=0.005)
    model.fit(X, Y)

    gnb = GaussianNB()
    gnb.fit(X, Y)

    #STACKING

    df = pd.read_csv("stack.csv")
    df1 = pd.read_csv("feature_test_stack1.csv")
    target = 'r'

    X_train = df.loc[:, df.columns != target]
    Y_train = df.loc[:, df.columns == target]
    X_test = df1.loc[:, df1.columns != target]
    Y_test = df1.loc[:, df1.columns == target]

    #Stacking Classifier
    # knn1 = KNeighborsClassifier()
    # log_reg1 = LogisticRegression()
    # svm1 = SVC(probability=True)
    print(
        "___________________________Stacking__________________________________________"
    )
    clf = svm.SVC(kernel='linear')
    rf2 = RandomForestClassifier(bootstrap=True,
                                 max_depth=70,
                                 max_features='auto',
                                 min_samples_leaf=4,
                                 min_samples_split=10,
                                 n_estimators=400)
    xgb = XGBClassifier()
    classifiers = [clf, xgb]
    sc = StackingClassifier(classifiers, meta_classifier=rf2)
    sc.fit(X_train, Y_train)

    print("_____________Report___________________")
    y_pred4 = sc.predict(X_test)  #prediction or testing
    acc4 = cal_accuracy(Y_test, y_pred4)

    #confusion Matrix
    matrix = confusion_matrix(Y_test, y_pred4)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    #fig = pyplot.gcf()
    fig.canvas.set_window_title('Stacking')
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    #ROC_AUC curve
    probs = sc.predict_proba(X_test)
    probs = probs[:, 1]
    auc = roc_auc_score(Y_test, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y_test)
    fpr, tpr, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('Stacking')
    title = 'Stacking'
    plot_roc_curve(fpr, tpr)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = sc.predict(X_test)
    print(classification_report(Y_test, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer = ClassificationReport(sc, classes=classes, support=True)
    visualizer.fit(X_train, Y_train)
    visualizer.score(X_test, Y_test)
    #fig.canvas.set_window_title('Stacking')
    g = visualizer.poof()

    print(
        "___________________________XGBOOST__________________________________________"
    )
    model = XGBClassifier(max_depth=5,
                          learning_rate=0.01,
                          n_estimators=100,
                          gamma=0,
                          min_child_weight=1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=0.005)
    model.fit(X, Y)
    y_pred1 = model.predict(X1)
    print("_____________Report___________________")
    acc1 = cal_accuracy(Y1, y_pred1)
    # print("_____________user input ___________________")

    #confusion Matrix
    import matplotlib.pyplot as plt1
    matrix = confusion_matrix(Y1, y_pred1)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt1.xticks(tick_marks, class_names)
    plt1.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt1.tight_layout()
    plt1.title('Confusion matrix', y=1.1)
    plt1.ylabel('Actual label')
    plt1.xlabel('Predicted label')
    fig.canvas.set_window_title('XGBoost')
    plt.show()

    #ROC_AUC curve
    probs = model.predict_proba(X1)
    probs = probs[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y1)
    fpr1, tpr1, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('XGBoost')
    plot_roc_curve(fpr1, tpr1)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = model.predict(X1)
    print(classification_report(Y1, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer1 = ClassificationReport(model, classes=classes, support=True)
    visualizer1.fit(X, Y)
    visualizer1.score(X1, Y1)
    #fig.canvas.set_window_title('XGBoost')
    g = visualizer1.poof()

    print(
        "___________________________SVM__________________________________________"
    )
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(X, Y)
    print("_____________Report___________________")
    y_pred = clf.predict(X1)
    #print(cal_accuracy(Y1, y_pred))
    acc2 = cal_accuracy(Y1, y_pred)
    #print("_____________user input ___________________")

    #confusion Matrix
    matrix = confusion_matrix(Y1, y_pred)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    fig.canvas.set_window_title('SVM')
    plt.show()

    #ROC_AUC curve
    probs = clf.predict_proba(X1)
    probs = probs[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y1)
    fpr, tpr, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('SVM')
    plot_roc_curve(fpr, tpr)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = clf.predict(X1)
    print(classification_report(Y1, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer = ClassificationReport(clf, classes=classes, support=True)
    visualizer.fit(X, Y)
    visualizer.score(X1, Y1)
    #fig.canvas.set_window_title('SVM')
    g = visualizer.poof()

    print(
        "___________________________Naive Bayes__________________________________________"
    )
    gnb = GaussianNB()
    gnb.fit(X, Y)
    print("_____________Report___________________")
    y_pred = gnb.predict(X1)
    #print(cal_accuracy(Y1, y_pred))
    acc3 = cal_accuracy(Y1, y_pred)
    #print("_____________user input ___________________")

    #confusion Matrix
    matrix = confusion_matrix(Y1, y_pred)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    fig.canvas.set_window_title('NB')
    plt.show()

    #ROC_AUC curve
    probs = gnb.predict_proba(X1)
    probs = probs[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    le = preprocessing.LabelEncoder()
    y_test1 = le.fit_transform(Y1)
    fpr, tpr, thresholds = roc_curve(y_test1, probs)
    #fig.canvas.set_window_title('NB')
    plot_roc_curve(fpr, tpr)

    #Classification Report
    target_names = ['Yes', 'No']
    prediction = gnb.predict(X1)
    print(classification_report(Y1, prediction, target_names=target_names))
    classes = ["Yes", "No"]
    visualizer = ClassificationReport(gnb, classes=classes, support=True)
    visualizer.fit(X, Y)
    visualizer.score(X1, Y1)
    #fig.canvas.set_window_title('NB')
    g = visualizer.poof()

    labels = [' XGboost', 'SVM', 'NB', 'Stacking']
    #sizes = [5, neg_per, neu_per]
    sizes = [acc1, acc2, acc3, acc4]
    index = np.arange(len(labels))
    plt.bar(index, sizes)
    plt.xlabel('Algorithm', fontsize=20)
    plt.ylabel('Accuracy', fontsize=20)
    plt.xticks(index, labels, fontsize=10, rotation=0)
    plt.title('comparative study')
    plt.show()

    #GUI
    class MainFrame(wx.Frame):
        def __init__(self, parent):
            wx.Frame.__init__(self,
                              parent,
                              id=wx.ID_ANY,
                              title=wx.EmptyString,
                              pos=wx.DefaultPosition,
                              size=wx.Size(500, 300),
                              style=wx.DEFAULT_FRAME_STYLE | wx.TAB_TRAVERSAL)

            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

            bSizer3 = wx.BoxSizer(wx.VERTICAL)

            self.m_staticText2 = wx.StaticText(self, wx.ID_ANY, u"Enter URL",
                                               wx.DefaultPosition,
                                               wx.DefaultSize, 0)
            self.m_staticText2.Wrap(-1)
            bSizer3.Add(self.m_staticText2, 0, wx.ALL, 5)

            self.text1 = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString,
                                     wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.text1, 0, wx.ALL | wx.EXPAND, 5)

            self.predictButton = wx.Button(self, wx.ID_ANY, u"Predict_XGBOOST",
                                           wx.DefaultPosition, wx.DefaultSize,
                                           0)
            bSizer3.Add(self.predictButton, 0, wx.ALL | wx.EXPAND, 5)

            self.m_button2 = wx.Button(self, wx.ID_ANY, u"Predict_SVM",
                                       wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.m_button2, 0, wx.ALL | wx.EXPAND, 5)

            self.m_button3 = wx.Button(self, wx.ID_ANY, u"Predict_NB",
                                       wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.m_button3, 0, wx.ALL | wx.EXPAND, 5)

            self.m_button4 = wx.Button(self, wx.ID_ANY, u"Predict_STACKING",
                                       wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.m_button4, 0, wx.ALL | wx.EXPAND, 5)

            # self.label1 = wx.StaticText( self, wx.ID_ANY, u"Result", wx.DefaultPosition, wx.DefaultSize, 0 )
            # self.label1.Wrap( -1 )
            # bSizer3.Add( self.label1, 0, wx.ALL|wx.ALIGN_CENTER_HORIZONTAL, 5 )

            # self.text2 = wx.TextCtrl( self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.DefaultSize, 0 )
            # bSizer3.Add( self.text2, 0, wx.RIGHT|wx.EXPAND, 5 )

            self.SetSizer(bSizer3)
            self.Layout()

            self.Centre(wx.BOTH)

            # Connect Events
            self.predictButton.Bind(wx.EVT_BUTTON, self.click)
            self.m_button2.Bind(wx.EVT_BUTTON, self.svm)
            self.m_button3.Bind(wx.EVT_BUTTON, self.nb)
            self.m_button4.Bind(wx.EVT_BUTTON, self.stacking)

        def __del__(self):
            pass

        # Virtual event handlers, override them in your derived class

        #XGBOOST
        def click(self, event):
            try:
                url = self.text1.GetValue()
                e = np.array([extract_feature_usertest(url)])
                userpredict1 = model.predict(e.reshape(1, -1))
                if (userpredict1[0] == 'no'):
                    # self.text2.SetValue(str("Legitimate"))
                    print('Legitimate')

                    class MyDialog1(wx.Dialog):
                        def __init__(self, parent):
                            wx.Dialog.__init__(self,
                                               parent,
                                               id=wx.ID_ANY,
                                               title=wx.EmptyString,
                                               pos=wx.DefaultPosition,
                                               size=wx.Size(159, 114),
                                               style=wx.DEFAULT_DIALOG_STYLE)

                            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                            sbSizer1 = wx.StaticBoxSizer(
                                wx.StaticBox(self, wx.ID_ANY, u"POP-UP"),
                                wx.VERTICAL)

                            self.m_staticText1 = wx.StaticText(
                                sbSizer1.GetStaticBox(), wx.ID_ANY,
                                u"LEGITIMATE", wx.DefaultPosition,
                                wx.DefaultSize, 0)
                            self.m_staticText1.Wrap(-1)
                            sbSizer1.Add(self.m_staticText1, 0,
                                         wx.ALL | wx.ALIGN_CENTER_HORIZONTAL,
                                         5)

                            self.SetSizer(sbSizer1)
                            self.Layout()

                            self.Centre(wx.BOTH)

                    app3 = wx.App(False)
                    frame = MyDialog1(None)
                    frame.Show(True)
                    webbrowser.open(url)
                    app3.MainLoop()

                else:

                    class MyDialog1(wx.Dialog):
                        def __init__(self, parent):
                            wx.Dialog.__init__(self,
                                               parent,
                                               id=wx.ID_ANY,
                                               title=wx.EmptyString,
                                               pos=wx.DefaultPosition,
                                               size=wx.Size(200, 150),
                                               style=wx.DEFAULT_DIALOG_STYLE)

                            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                            sbSizer1 = wx.StaticBoxSizer(
                                wx.StaticBox(self, wx.ID_ANY, u"Error"),
                                wx.VERTICAL)

                            self.m_staticText1 = wx.StaticText(
                                sbSizer1.GetStaticBox(), wx.ID_ANY, u"PHISHING",
                                wx.DefaultPosition, wx.DefaultSize, 0)
                            self.m_staticText1.Wrap(-1)
                            sbSizer1.Add(self.m_staticText1, 0,
                                         wx.ALL | wx.ALIGN_CENTER_HORIZONTAL,
                                         5)

                            self.SetSizer(sbSizer1)
                            self.Layout()

                            self.Centre(wx.BOTH)

                            def __del__(self):
                                pass

# Virtual event handlers, override them in your derived class

                            def click(self, event):
                                event.Skip()

                    app2 = wx.App(False)
                    frame = MyDialog1(None)
                    frame.Show(True)
                    app2.MainLoop()

                    # self.text2.SetValue(str("Phishing"))
                    # print('Phishing')
            except Exception:
                print('error')

    #SVM

        def svm(self, event):
            clf = svm.SVC(kernel='linear', probability=True)
            clf.fit(X, Y)
            try:
                url = self.text1.GetValue()
                e = np.array([extract_feature_usertest(url)])
                userpredict1 = model.predict(e.reshape(1, -1))
                if (userpredict1[0] == 'no'):
                    # self.text2.SetValue(str("Legitimate"))
                    print('Legitimate')

                    class MyDialog1(wx.Dialog):
                        def __init__(self, parent):
                            wx.Dialog.__init__(self,
                                               parent,
                                               id=wx.ID_ANY,
                                               title=wx.EmptyString,
                                               pos=wx.DefaultPosition,
                                               size=wx.Size(159, 114),
                                               style=wx.DEFAULT_DIALOG_STYLE)

                            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                            sbSizer1 = wx.StaticBoxSizer(
                                wx.StaticBox(self, wx.ID_ANY, u"POP-UP"),
                                wx.VERTICAL)

                            self.m_staticText1 = wx.StaticText(
                                sbSizer1.GetStaticBox(), wx.ID_ANY,
                                u"LEGITIMATE", wx.DefaultPosition,
                                wx.DefaultSize, 0)
                            self.m_staticText1.Wrap(-1)
                            sbSizer1.Add(self.m_staticText1, 0,
                                         wx.ALL | wx.ALIGN_CENTER_HORIZONTAL,
                                         5)

                            self.SetSizer(sbSizer1)
                            self.Layout()

                            self.Centre(wx.BOTH)

                    app2 = wx.App(False)
                    frame = MyDialog1(None)
                    frame.Show(True)
                    webbrowser.open(url)
                    app2.MainLoop()

                else:

                    class MyDialog1(wx.Dialog):
                        def __init__(self, parent):
                            wx.Dialog.__init__(self,
                                               parent,
                                               id=wx.ID_ANY,
                                               title=wx.EmptyString,
                                               pos=wx.DefaultPosition,
                                               size=wx.Size(159, 114),
                                               style=wx.DEFAULT_DIALOG_STYLE)

                            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                            sbSizer1 = wx.StaticBoxSizer(
                                wx.StaticBox(self, wx.ID_ANY, u"Error"),
                                wx.VERTICAL)

                            self.m_staticText1 = wx.StaticText(
                                sbSizer1.GetStaticBox(), wx.ID_ANY, u"PHISHING",
                                wx.DefaultPosition, wx.DefaultSize, 0)
                            self.m_staticText1.Wrap(-1)
                            sbSizer1.Add(self.m_staticText1, 0,
                                         wx.ALL | wx.ALIGN_CENTER_HORIZONTAL,
                                         5)

                            self.SetSizer(sbSizer1)
                            self.Layout()

                            self.Centre(wx.BOTH)

                    app2 = wx.App(False)
                    frame = MyDialog1(None)
                    frame.Show(True)
                    app2.MainLoop()

                    def __del__(self):
                        pass

                    # self.text2.SetValue(str("Phishing"))
                    # print('Phishing')
            except Exception:
                print('error')

    #NAIVE BAYES

        def nb(self, event):
            try:
                url = self.text1.GetValue()
                e = np.array([extract_feature_usertest(url)])
                userpredict1 = gnb.predict(e.reshape(1, -1))
                if (userpredict1[0] == 'no'):
                    # self.text2.SetValue(str("Legitimate"))
                    print('Legitimate')

                    class MyDialog1(wx.Dialog):
                        def __init__(self, parent):
                            wx.Dialog.__init__(self,
                                               parent,
                                               id=wx.ID_ANY,
                                               title=wx.EmptyString,
                                               pos=wx.DefaultPosition,
                                               size=wx.Size(159, 114),
                                               style=wx.DEFAULT_DIALOG_STYLE)

                            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                            sbSizer1 = wx.StaticBoxSizer(
                                wx.StaticBox(self, wx.ID_ANY, u"POP-UP"),
                                wx.VERTICAL)

                            self.m_staticText1 = wx.StaticText(
                                sbSizer1.GetStaticBox(), wx.ID_ANY,
                                u"LEGITIMATE", wx.DefaultPosition,
                                wx.DefaultSize, 0)
                            self.m_staticText1.Wrap(-1)
                            sbSizer1.Add(self.m_staticText1, 0,
                                         wx.ALL | wx.ALIGN_CENTER_HORIZONTAL,
                                         5)

                            self.SetSizer(sbSizer1)
                            self.Layout()

                            self.Centre(wx.BOTH)

                    app2 = wx.App(False)
                    frame = MyDialog1(None)
                    frame.Show(True)
                    webbrowser.open(url)
                    app2.MainLoop()

                else:

                    class MyDialog1(wx.Dialog):
                        def __init__(self, parent):
                            wx.Dialog.__init__(self,
                                               parent,
                                               id=wx.ID_ANY,
                                               title=wx.EmptyString,
                                               pos=wx.DefaultPosition,
                                               size=wx.Size(159, 114),
                                               style=wx.DEFAULT_DIALOG_STYLE)

                            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                            sbSizer1 = wx.StaticBoxSizer(
                                wx.StaticBox(self, wx.ID_ANY, u"Error"),
                                wx.VERTICAL)

                            self.m_staticText1 = wx.StaticText(
                                sbSizer1.GetStaticBox(), wx.ID_ANY, u"PHISHING",
                                wx.DefaultPosition, wx.DefaultSize, 0)
                            self.m_staticText1.Wrap(-1)
                            sbSizer1.Add(self.m_staticText1, 0,
                                         wx.ALL | wx.ALIGN_CENTER_HORIZONTAL,
                                         5)

                            self.SetSizer(sbSizer1)
                            self.Layout()

                            self.Centre(wx.BOTH)

                    app2 = wx.App(False)
                    frame = MyDialog1(None)
                    frame.Show(True)
                    app2.MainLoop()
                    # self.text2.SetValue(str("Phishing"))
                    # print('Phishing')
            except Exception:
                print('error')

    #STACKING

        def stacking(self, event):
            df = pd.read_csv("stack.csv")
            df1 = pd.read_csv("feature_test_stack1.csv")
            target = 'r'

            X_train = df.loc[:, df.columns != target]
            Y_train = df.loc[:, df.columns == target]
            X_test = df1.loc[:, df1.columns != target]
            Y_test = df1.loc[:, df1.columns == target]
            global url4
            url4 = self.text1.GetValue()
            xgb = XGBClassifier()
            clf = svm.SVC(kernel='linear')
            classifiers = [clf, xgb]
            rf2 = RandomForestClassifier(bootstrap=True,
                                         max_depth=70,
                                         max_features='auto',
                                         min_samples_leaf=4,
                                         min_samples_split=10,
                                         n_estimators=400)
            sc = StackingClassifier(classifiers, meta_classifier=rf2)
            sc.fit(X_train, Y_train)
            # e1=extract_feature_usertest(url4)
            # userpredict4 = sc.predict(e1)

            col = df.columns
            col = col[:-1]
            e4 = extract_feature_usertest_stack(url4)
            output_data = e4
            output_data = pd.DataFrame([output_data], columns=col)
            userpredict4 = sc.predict(output_data)
            if (userpredict4[0] == 0):
                # self.text2.SetValue(str("Legitimate"))
                print('Legitimate')

                class MyDialog1(wx.Dialog):
                    def __init__(self, parent):
                        wx.Dialog.__init__(self,
                                           parent,
                                           id=wx.ID_ANY,
                                           title=wx.EmptyString,
                                           pos=wx.DefaultPosition,
                                           size=wx.Size(159, 114),
                                           style=wx.DEFAULT_DIALOG_STYLE)

                        self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                        sbSizer1 = wx.StaticBoxSizer(
                            wx.StaticBox(self, wx.ID_ANY, u"POP-UP"),
                            wx.VERTICAL)

                        self.m_staticText1 = wx.StaticText(
                            sbSizer1.GetStaticBox(), wx.ID_ANY, u"LEGITIMATE",
                            wx.DefaultPosition, wx.DefaultSize, 0)
                        self.m_staticText1.Wrap(-1)
                        sbSizer1.Add(self.m_staticText1, 0,
                                     wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)

                        self.SetSizer(sbSizer1)
                        self.Layout()

                        self.Centre(wx.BOTH)

                app2 = wx.App(False)
                frame = MyDialog1(None)
                frame.Show(True)
                webbrowser.open(url4)
                app2.MainLoop()

            else:

                class MyDialog1(wx.Dialog):
                    def __init__(self, parent):
                        wx.Dialog.__init__(self,
                                           parent,
                                           id=wx.ID_ANY,
                                           title=wx.EmptyString,
                                           pos=wx.DefaultPosition,
                                           size=wx.Size(159, 114),
                                           style=wx.DEFAULT_DIALOG_STYLE)

                        self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)

                        sbSizer1 = wx.StaticBoxSizer(
                            wx.StaticBox(self, wx.ID_ANY, u"Error"),
                            wx.VERTICAL)

                        self.m_staticText1 = wx.StaticText(
                            sbSizer1.GetStaticBox(), wx.ID_ANY, u"PHISHING",
                            wx.DefaultPosition, wx.DefaultSize, 0)
                        self.m_staticText1.Wrap(-1)
                        sbSizer1.Add(self.m_staticText1, 0,
                                     wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)

                        self.SetSizer(sbSizer1)
                        self.Layout()

                        self.Centre(wx.BOTH)

                app2 = wx.App(False)
                frame = MyDialog1(None)
                frame.Show(True)
                app2.MainLoop()
                # self.text2.SetValue(str("Phishing"))
                # print('Phishing')
    app1 = wx.App(False)

    frame = MainFrame(None)
    frame.Show(True)
    app1.MainLoop()
Example No. 15
    'lgbmclassifier__max_depth': max_depth,
    'lgbmclassifier__subsample': subsample,
    #'catboostclassifier__n_estimators':n_estimators,
    #'catboostclassifier__max_depth': max_depth,
    #'randomforestclassifier__n_estimators':[100],
    #'randomforestclassifier__max_depth': [3],
    'meta-logisticregression__C': C
}

grid = GridSearchCV(estimator=grid,
                    param_grid=params,
                    cv=3,
                    refit=True,
                    verbose=3,
                    n_jobs=n_job,
                    # early_stopping_rounds is not a GridSearchCV argument;
                    # passing it raises a TypeError
                    scoring='roc_auc')

print('fitting')
grid.fit(x, y)

joblib.dump(grid, 'export/trend_model.pkl')

predicted = grid.predict_proba(x)
predicted = list(map(lambda x: x[1], predicted))
print('train roc: ', roc_auc_score(y, predicted))

predicted = pd.Series(grid.predict_proba(test)[:, 1])
predicted.index = test_ids
predicted.to_csv('export/trend_predict_test.csv')
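A self-contained sketch of the same grid-search pattern on synthetic data, restricted to arguments GridSearchCV actually accepts; the parameter naming follows the mlxtend convention shown in the grid above:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
stack = StackingClassifier(classifiers=[RandomForestClassifier(random_state=0)],
                           meta_classifier=LogisticRegression(),
                           use_probas=True)
demo_params = {'randomforestclassifier__n_estimators': [50, 100],
               'meta-logisticregression__C': [0.1, 1.0]}
demo_grid = GridSearchCV(stack, demo_params, cv=3, scoring='roc_auc', refit=True)
demo_grid.fit(X_demo, y_demo)
print(demo_grid.best_params_)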
                        random_state=0,
                        n_estimators=100)
sclf = StackingClassifier(classifiers=[rf],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=xgc)

gbc.fit(xtr, ytrain)
rf.fit(xtr1, ytrain)
ada.fit(xtr2, ytrain)
sclf.fit(xtrain, ytrain)

gbc_pred_proba = gbc.predict_proba(xte)
rf_pred_proba = rf.predict_proba(xte1)
ada_pred_proba = ada.predict_proba(xte2)
sc_pred_proba = sclf.predict_proba(xtest)

gbc_cm = m.confusion_matrix(ytest, (gbc_pred_proba[:, 1] >= 0.5).astype('int'))
rf_cm = m.confusion_matrix(ytest, (rf_pred_proba[:, 1] >= 0.5).astype('int'))
ada_cm = m.confusion_matrix(ytest, (ada_pred_proba[:, 1] >= 0.5).astype('int'))
sc_cm = m.confusion_matrix(ytest, (sc_pred_proba[:, 1] >= 0.5).astype('int'))

k = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#k=[0.55,0.57,0.6,0.62,0.65]
for i in k:
    gbc_cm = m.confusion_matrix(ytest,
                                (gbc_pred_proba[:, 1] >= i).astype('int'))
    print(myf1(gbc_cm))

k = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
k = [0.65, 0.68, 0.7, 0.72, 0.75]
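myf1() is not defined in this excerpt; a plausible implementation, assuming scikit-learn's confusion-matrix layout [[TN, FP], [FN, TP]]:

def myf1(cm):
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return (2 * precision * recall / (precision + recall)
            if (precision + recall) else 0.0)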
params2['objective'] = 'binary:logistic'
params2['booster'] = 'gbtree'
params2['learning_rate'] = 0.02
params2['max_depth'] = 5
params2['subsample'] = 0.6
params2['colsample_bytree'] = 0.5
params2['n_estimators'] = 500

params3 = {}
params3['objective'] = 'binary:logistic'
params3['booster'] = 'gbtree'
params3['learning_rate'] = 0.02
params3['max_depth'] = 4
params3['subsample'] = 0.6
params3['colsample_bytree'] = 0.5
params3['n_estimators'] = 600

clf1 = XGBClassifier(**params1)
clf2 = XGBClassifier(**params2)
clf3 = XGBClassifier(**params3)
clfs = [clf1, clf2, clf3]

lrc = linear_model.LogisticRegression(C=0.5, max_iter=300)

x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
mlxcf = StackingClassifier(clfs, lrc, use_probas=True, average_probas=True)
mlxcf.fit(x_train, y_train)
y_pred = mlxcf.predict_proba(test_data)[:, -1]
submission = pd.DataFrame({"ID": IDlist, "TARGET": y_pred})
submission.to_csv("../Result/mxltendStackingXGB.csv", index=False)
Example No. 18
iris = load_iris()
X = iris.data[:100]
y = iris.target[:100]

pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())

sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                          meta_classifier=LogisticRegression())

sclf.fit(X, y)
decision_scores = sclf.decision_function(X)
print("Val auc Score of Stacking: %f" %
      (roc_auc_score(y,
                     sclf.predict_proba(X)[:, 1])))

fig, axe = plt.subplots(2, 2, figsize=(30, 20))
rlb.ComprehensiveIndicatorFigure(y, decision_scores, axe[0], 1)
rlb.ComprehensiveIndicatorSkLibFigure(y, decision_scores, axe[1])

# In[]:
# 5、ROC Curve with decision_function
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import roc_curve, auc
import numpy as np
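The excerpt stops at the imports; a minimal StackingCVClassifier continuation under them, reusing the X and y defined at the top of this example (the settings are illustrative, not from the original):

clf1 = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(n_estimators=50, random_state=0)
sclf_cv = StackingCVClassifier(classifiers=[clf1, clf2],
                               meta_classifier=LogisticRegression(),
                               use_probas=True, cv=3, random_state=0)
sclf_cv.fit(X, y)
fpr, tpr, _ = roc_curve(y, sclf_cv.predict_proba(X)[:, 1])
print('Stacking CV AUC: %.3f' % auc(fpr, tpr))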
Example No. 19
    def model_evaluation(Train,
                         Valid,
                         Test,
                         comparative,
                         bootstrap=False,
                         n_estimators: int = 200,
                         max_depth: int = 50,
                         oob_score: bool = False,
                         class_weight='balanced_subsample',
                         sampling=None,
                         label='FRAUDE',
                         model='ert'):

        # With beta = 2, we give the same importance to Recall and Precision
        if sampling is not None:
            class_weight = None
        model_name = str(sampling)

        # fileModel.fit(xTrain.drop(['id_siniestro'], axis=1).values, yTrain.values)

        # print(np.median(fileModel.predict_proba(Valid[Valid[label] == 0].drop([label] + ['id_siniestro'], axis=1).values)))
        # print(np.median(fileModel.predict_proba(Valid[Valid[label] == 1].drop([label] + ['id_siniestro'], axis=1).values)))

        tresholds = np.linspace(0.1, 1.0, 200)

        scores = []
        y_pred_score = np.empty(shape=[0, 2])
        predicted_index = np.empty(shape=[0])
        # y_pred_score = fileModel.predict_proba(Valid.drop([label] + ['id_siniestro'], axis=1).values)
        skf = StratifiedKFold(n_splits=5, shuffle=False)  # random_state only applies when shuffle=True
        Test = pd.concat([Train, Valid, Test], axis=0).reset_index()
        print(Test.shape)
        X = Test.drop([label] + ['id_siniestro'], axis=1)
        y = Test[[label]]
        for train_index, test_index in skf.split(X.values, y[label].values):
            X_train, X_test = X.loc[train_index].values, X.loc[test_index].values
            y_train, y_test = y.loc[train_index].values, y.loc[test_index].values
            if sampling is None:
                pass
            elif sampling == 'ALLKNN':
                X_train, y_train = under_sampling(X_train, y_train)
                class_weight = None
            else:
                X_train, y_train = over_sampling(X_train,
                                                 y_train,
                                                 model=sampling)
                class_weight = None

            min_sample_leaf = round(y_train.shape[0] * 0.005)
            min_sample_split = min_sample_leaf * 10
            max_features = round(X_train.shape[1] / 3)
            if model == 'ert':
                fileModel = ensemble.ExtraTreesClassifier(
                    criterion='entropy',
                    bootstrap=bootstrap,
                    min_samples_leaf=min_sample_leaf,
                    min_samples_split=min_sample_split,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    max_features=max_features,
                    oob_score=oob_score,
                    random_state=531,
                    verbose=1,
                    class_weight=class_weight,
                    n_jobs=-1)
            elif model == 'gb':
                fileModel = ensemble.GradientBoostingClassifier(
                    loss='deviance',
                    learning_rate=0.01,
                    n_estimators=200,
                    subsample=1.0,
                    criterion='friedman_mse',
                    min_samples_split=min_sample_split,
                    min_samples_leaf=min_sample_leaf,
                    min_weight_fraction_leaf=0.,
                    max_depth=max_depth,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    init=None,
                    random_state=531,
                    max_features=None,
                    verbose=1,
                    max_leaf_nodes=None,
                    warm_start=False,
                    presort='auto')
            elif model == 'lxgb':
                fileModel = lgbx.LGBMClassifier(
                    boosting_type="gbdt",
                    num_leaves=2000,
                    max_depth=200,
                    learning_rate=0.005,
                    n_estimators=300,
                    max_bin=500,
                    objective='binary',
                    min_split_gain=0.,
                    min_child_weight=5,
                    min_child_samples=min_sample_leaf,
                    subsample=1.,
                    subsample_freq=1,
                    colsample_bytree=1.,
                    reg_alpha=0.,
                    reg_lambda=0.,
                    random_state=531,
                    n_jobs=-1,
                    silent=True)

            elif model.startswith('stacked'):

                ERT = ensemble.ExtraTreesClassifier(
                    bootstrap=bootstrap,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    oob_score=oob_score,
                    class_weight=class_weight,
                    min_samples_leaf=min_sample_leaf,
                    min_samples_split=min_sample_split,
                    max_features='auto',
                    n_jobs=-1)

                Gboost = ensemble.GradientBoostingClassifier(
                    n_estimators=n_estimators,
                    learning_rate=0.005,
                    max_depth=max_depth,
                    loss='deviance',
                    random_state=531,
                    min_samples_split=min_sample_split,
                    min_samples_leaf=min_sample_leaf)

                Light_Gboost = lgbx.LGBMClassifier(
                    boosting_type="gbdt",
                    num_leaves=2000,
                    max_depth=-1,
                    learning_rate=0.005,
                    n_estimators=300,
                    max_bin=500,
                    objective='binary',
                    min_split_gain=0.,
                    min_child_weight=5,
                    min_child_samples=min_sample_leaf,
                    subsample=1.,
                    subsample_freq=1,
                    colsample_bytree=1.,
                    reg_alpha=0.,
                    reg_lambda=0.,
                    random_state=531,
                    n_jobs=-1,
                    silent=False)
                if model.endswith('_ERT'):
                    fileModel = StackingClassifier(
                        classifiers=[Gboost, Light_Gboost],
                        meta_classifier=ERT,
                        average_probas=True,
                        use_probas=True)
                elif model.endswith('_GB'):
                    fileModel = StackingClassifier(
                        classifiers=[ERT, Light_Gboost],
                        meta_classifier=Gboost,
                        average_probas=True,
                        use_probas=True)
                elif model.endswith('_LXGB'):
                    fileModel = StackingClassifier(
                        classifiers=[ERT, Gboost],
                        meta_classifier=Light_Gboost,
                        average_probas=True,
                        use_probas=True)

            fileModel.fit(X_train, y_train)
            y_pred_score_i = fileModel.predict_proba(X_test)
            y_pred_score = np.append(y_pred_score, y_pred_score_i, axis=0)
            print(y_pred_score.shape)
            print(test_index.shape)
            print(predicted_index.shape)
            predicted_index = np.append(predicted_index, test_index, axis=0)
            print(predicted_index)
            del X_train, X_test, y_train, y_test

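        # drop the class-0 probability column, keeping only P(class=1)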
        y_pred_score = np.delete(y_pred_score, 0, axis=1)
        print('min', y_pred_score.min())
        print('max', y_pred_score.max())

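        # sweep candidate thresholds, scoring recall, precision, and F2 at each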
        for treshold in tresholds:
            y_hat = (y_pred_score > treshold).astype(int)
            y_hat = y_hat.tolist()
            y_hat = [item for sublist in y_hat for item in sublist]

            scores.append([
                recall_score(y_pred=y_hat, y_true=Test[label].values),
                precision_score(y_pred=y_hat, y_true=Test[label].values),
                fbeta_score(y_pred=y_hat, y_true=Test[label].values, beta=2)
            ])

        scores = np.array(scores)
        print('F-Score', scores[:, 2].max(), scores[:, 2].argmax())
        print('scores', scores[scores[:, 2].argmax()])
        print(scores)

        plot.plot(tresholds, scores[:, 0], label='$Recall$')
        plot.plot(tresholds, scores[:, 1], label='$Precision$')
        plot.plot(tresholds, scores[:, 2], label='$F_2$')
        plot.ylabel('Score')
        plot.xlabel('Threshold')
        plot.legend(loc='best')
        plot.show()
        plot.close()

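        # pick the threshold that maximizes the F2 score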
        final_tresh = tresholds[scores[:, 2].argmax()]
        print(final_tresh)

        y_hat_test = (y_pred_score > final_tresh).astype(int)
        y_hat_test = y_hat_test.tolist()
        y_hat_test = [item for sublist in y_hat_test for item in sublist]

        Test['id_siniestro'] = Test['id_siniestro'].map(int)
        comparative['id_siniestro'] = comparative['id_siniestro'].map(int)
        Test = pd.merge(Test,
                        comparative[['id_siniestro', 'FRAUDE']],
                        how='left',
                        on='id_siniestro')
        cnf_matrix = confusion_matrix(Test['FRAUDE_Clusters'].values,
                                      y_hat_test)
        plot_confusion_matrix(cnf_matrix,
                              classes=['Normal', 'Abnormal'],
                              title='Confusion matrix')

        cnf_matrix = confusion_matrix(Test['FRAUDE'].values, y_hat_test)
        plot_confusion_matrix(cnf_matrix,
                              classes=['Unknown', 'Fraud'],
                              title='Confusion matrix')

        return None
Exemplo n.º 20
0
# # Assignment
# * Ensemble generalization for classification prediction works quite differently from regression.
# Since classification Blending has to be done on probabilities to ensemble easily,
# how should classification Stacking be written so that the first-level models output probabilities as features?
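# One possible answer, sketched below: pass use_probas=True (with average_probas=False)
# so each first-level model feeds its class probabilities to the meta-classifier.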

# In[14]:

from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(subsample=0.70,
                                            n_estimators=15,
                                            max_features='sqrt',
                                            max_depth=4,
                                            learning_rate=0.3)
"""
Your Code Here
"""
stacking = StackingClassifier(classifiers=[gdbt, rf],
                              use_probas=True,
                              average_probas=False,
                              meta_classifier=meta_estimator)

# In[15]:

stacking.fit(df, train_Y)
stacking_pred = stacking.predict_proba(df2)
sub = pd.DataFrame({'name': ids, 'poi': stacking_pred[:, 1]})
sub = pd.concat([sub, df3], ignore_index=True)
sub.to_csv('poi_stacking_4.csv', index=False)

# In[ ]:
Exemplo n.º 21
0
    def _model_constructor(self):
        ########################################
        ## sample train/validation data
        ########################################
        print("Loading train data...")
        X_train = self.train_custom_features
        
        df_train = pd.read_csv(TRAIN_DATA_FILE, encoding="utf-8")

        y_train = df_train['is_duplicate'].values
        
        # Up/down-sampling: duplicate the negatives and keep only 80% of the positives
        pos_train = X_train[y_train == 1]
        neg_train = X_train[y_train == 0]
        X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train)).values
        y_train = np.array([0] * neg_train.shape[0]
                           + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0]
                           + [0] * neg_train.shape[0])
        print("New duplicate content:", np.mean(y_train))
        del pos_train, neg_train


        ESTIMATORS = 180
        self.clfs = [ # MLPClassifier(hidden_layer_sizes=(300, 200, 100),
                      #               activation="relu",
                      #               learning_rate="adaptive",
                      #               verbose=True,                                   
                      #               batch_size=128,
                      #               max_iter=10,
                      #               tol=0.001,
                      #               early_stopping=True,                                    
                      #               warm_start=False),
                      # MLPClassifier(hidden_layer_sizes=(128, 128, 128),
                      #              activation="relu",
                      #              learning_rate="adaptive",
                      #              verbose=True,                                   
                      #              batch_size=64,
                      #              early_stopping=True),
                      RandomForestClassifier(n_estimators=ESTIMATORS,
                                             n_jobs=-1,
                                             criterion='entropy',
                                             verbose=1),                      
                      RandomForestClassifier(n_estimators=ESTIMATORS,
                                             n_jobs=-1,
                                             criterion='gini',
                                             verbose=1,
                                             warm_start=True),
                      ExtraTreesClassifier(n_estimators=ESTIMATORS,
                                           n_jobs=-1,
                                           criterion='gini',
                                           verbose=1),
                      ExtraTreesClassifier(n_estimators=ESTIMATORS,
                                           n_jobs=-1,
                                           criterion='entropy',
                                           verbose=1,
                                           warm_start=True)]
        lr = LogisticRegression()

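        # stack with use_probas=True so the logistic meta-learner sees each
        # base model's class probabilities rather than hard labels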
        sclf = StackingClassifier(classifiers=self.clfs,
                                  use_probas=True,
                                  average_probas=False,
                                  verbose=2,
                                  meta_classifier=lr)

        sclf.fit(X_train, y_train)

        self.model = sclf
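        # note: the loss below is computed on the training data, so it is optimistic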
        
        bst_val_score = log_loss(y_train, sclf.predict_proba(X_train)[:, 1])
        print("Model train loss:", bst_val_score)
                
        return (sclf, bst_val_score)
Exemplo n.º 22
0
model_final.fit(train_stack,train_y)
pre=model_final.predict_proba(test_stack)
    
model1=RandomForestClassifier(n_estimators = 100)
model2=lgb.LGBMClassifier(boosting_type='gbdt', objective='multiclass', num_class=4,
                                   learning_rate=0.1, n_estimators=100,
                                   num_leaves=124, max_depth=13, 
                                   bagging_fraction=0.66, feature_fraction=0.88,
                                   bagging_freq=66, min_data_in_leaf=86,
                                   min_child_weight=8.8, min_split_gain=0.02,
                                   reg_lambda=3, reg_alpha=6.7,
                                   n_jobs= 8
                                  )   
model3= MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,solver='sgd')

model_final=RandomForestClassifier(n_estimators = 100)

sclf = StackingClassifier(classifiers=[model1, model2, model3], 
                          meta_classifier=model_final,use_probas=True,average_probas=False)
sclf.fit(train_X,train_y)

temp=sclf.predict_proba(X_test)

# write out the prediction results
result=pd.read_csv('sample_submit.csv')
result['label_0']=temp[:,0]
result['label_1']=temp[:,1]
result['label_2']=temp[:,2]
result['label_3']=temp[:,3]
result.to_csv('submit1.csv',index=False)
Exemplo n.º 23
0
def predictor_ev():
    print "Building Neural Net classifiers for devices with events"
    n_input = X_train_ev.shape[1]
    n_train = X_train_ev.shape[0]
    
    from keras.models import Sequential
    from keras.layers import Dense, Activation
    from keras.layers.core import Dropout
    from keras.layers.advanced_activations import PReLU
    from keras.regularizers import l2
    from keras.optimizers import Adadelta
    from keras.optimizers import SGD
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.callbacks import ModelCheckpoint
    
    def create_model(n_hidden_layers=1, nodes=[50], reg=1.0, dropouts=[.5], acts=['relu']):
        n_in = n_input    
        model = Sequential()
        for i in xrange(n_hidden_layers):
            n_out = nodes[i]
            dropout = dropouts[i]
            act = acts[i]
            model.add(Dense(output_dim=n_out, input_dim=n_in, W_regularizer=l2(reg)))
            model.add(Activation(act))
            model.add(Dropout(dropout))
            n_in = n_out
        model.add(Dense(output_dim=12, W_regularizer=l2(reg)))
        model.add(Activation("softmax"))
        # Compile model
        adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
        sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
        return model
    
    class KerasClassifier2(KerasClassifier):
            
        def __init__(self, build_fn, fn_args, random_state=0, nb_epoch=10, batch_size=500, verbose=2):
            self.random_state = random_state
            self.nb_epoch = nb_epoch
            self.batch_size = batch_size
            self.verbose = verbose
            super(KerasClassifier2, self).__init__(build_fn, **fn_args)
            self.classes_= np.arange(12)
            self.n_classes_ = 12
            self.model = build_fn(**fn_args)
            
        def fit(self, X, y, sample_weight=None):
            return super(KerasClassifier2, self).fit(X, indicator(y),
                             verbose = self.verbose, sample_weight=sample_weight,
                             validation_data=(X_cv_ev, indicator(y_cv_ev)),
                             nb_epoch=self.nb_epoch, batch_size=self.batch_size)
    
    
        def predict_proba(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)
            
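        # NB: predict here deliberately returns class probabilities, not labels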
        def predict(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)            
    
    nn1_args = {'n_hidden_layers': 2, 'nodes': [600, 400], 'reg': 1.8,
                'dropouts': [.3, .4], 'acts': ['relu', 'relu']}
    nn2_args = {'n_hidden_layers': 3, 'nodes': [300, 100, 50], 'reg': 2.0,
                'dropouts': [.2, .4, .5], 'acts': ['relu', 'relu', 'relu']}
    nn3_args = {'n_hidden_layers': 4, 'nodes': [1001, 511, 245, 99], 'reg': 2.0,
                'dropouts': [.2, .3, .2, .3], 'acts': ['relu', 'relu', 'relu', 'relu']}
    nn4_args = {'n_hidden_layers': 1, 'nodes': [500], 'reg': 1.2,
                'dropouts': [.25], 'acts': ['relu']}
    nn5_args = {'n_hidden_layers': 5, 'nodes': [1343, 1012, 757, 539, 117],
                'reg': 2.5, 'dropouts': [.2, .3, .4, .4, .4],
                'acts': ['relu', 'relu', 'relu', 'relu', 'relu']}
    
    clfNN1 = KerasClassifier2(create_model, nn1_args, random_state=5, nb_epoch=5)
    clfNN2 = KerasClassifier2(create_model, nn2_args, random_state=23, nb_epoch=11)
    clfNN3 = KerasClassifier2(create_model, nn3_args, random_state=710, nb_epoch=6)
    clfNN4 = KerasClassifier2(create_model, nn4_args, random_state=5072, nb_epoch=6)
    clfNN5 = KerasClassifier2(create_model, nn5_args, random_state=2016, nb_epoch=12)
    
    print "Building XGBoost classifiers for devices with events"
    xgb_params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3.5,
    }
    
    class XGBClassifier2(xgb.XGBClassifier):
    
        def __init__(self, max_depth=xgb_params['max_depth'],
                     objective='multi:softprob', missing=None, 
                     learning_rate=xgb_params['eta'], n_estimators=40, subsample=1,
                     reg_alpha=xgb_params['alpha'], seed=2016, booster='gblinear'):
            super(XGBClassifier2, self).__init__(max_depth=max_depth, seed=seed,
                        objective=objective, missing=missing,
                        learning_rate=learning_rate, n_estimators=n_estimators,
                        subsample=subsample, reg_alpha=reg_alpha)
            self.booster = xgb_params['booster']
            
        def fit(self, X, y):
            super(XGBClassifier2, self).fit(X.tocsc(), y, eval_metric='mlogloss',
                                            eval_set=[(X_cv_ev.tocsc(), y_cv_ev)])
            return self
    
    gbm1 = XGBClassifier2(seed=0, booster='gblinear', n_estimators=28)
    gbm2 = XGBClassifier2(seed=6, booster='gblinear', n_estimators=28)
    gbm3 = XGBClassifier2(seed=151, booster='gbtree', n_estimators=28)
    gbm4 = XGBClassifier2(seed=1047, booster='gbtree', n_estimators=28)
    gbm5 = XGBClassifier2(seed=22, booster='dart', n_estimators=28)
    
    print "Building Logistic Regression classifier for devices with events"
    clfLR = LogisticRegression(C=.02, random_state=2016, multi_class='multinomial', solver='newton-cg')
    
    #Combine results of classifiers
    print "Stacking classifiers for devices with events"
    clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
    meta = LogisticRegression()
    stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)
    
    stack.fit(X_train_ev, y_train_ev)
    print log_loss(y_cv_ev, stack.predict_proba(X_cv_ev))
    y_pred_ev = stack.predict_proba(X_test_ev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
    return y_pred_ev
    meta_classifier=clf_lg)
label = ['stacking']
sclf.fit(X_train_standar, y_train)
score_stacking = cross_val_score(sclf,
                                 X_train_standar,
                                 y_train,
                                 scoring='accuracy')
cross_val_score(sclf, X_train_standar, y_train, scoring='f1')
score_mean_sclf = score_stacking.mean()
print('stacking final score\'s mean is % .2f' % score_mean_sclf)

print('accuracy: %.2f (+/- %.2f) [%s]' %
      (score_stacking.mean(), score_stacking.std(), label))

result_stacking = sclf.predict(X_test_stander)
result_stacking_proba = sclf.predict_proba(X_test_stander)
clf_stacking_test_score = sclf.score(X_test_stander, y_test)

precision, recall, thresholds = precision_recall_curve(
    y_test, result_stacking_proba[:, 1])
report = result_stacking_proba[:, 1] >= 0.8
print(classification_report(y_test, report, target_names=['0', '1']))

# ==============================================================================
# Model persistence
# os.chdir(u'D:\【01】行健金融\【01】数据中心\【05】数据分析项目\【03】2018\May\规则引擎_分期商城_风控+授信')
# joblib.dump(sclf, 'stackingpkl.pkl')
# joblib.dump(scaler, 'scaler.pkl')

# ==============================================================================
def stack_test(train_x, train_y, test_x, test_y):
    print("start stacking test")
    clf1 = lgb.LGBMClassifier(boosting_type='gbdt',
                              num_leaves=50,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_depth=-1,
                              n_estimators=2000,
                              objective='binary',
                              min_child_weight=50,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf2 = lgb.LGBMClassifier(boosting_type='dart',
                              num_leaves=50,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_depth=-1,
                              n_estimators=2000,
                              objective='binary',
                              min_child_weight=50,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf3 = lgb.LGBMClassifier(boosting_type='gbdt',
                              num_leaves=50,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_depth=-1,
                              n_estimators=2000,
                              objective='binary',
                              min_child_weight=50,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)

    clf4 = XGBClassifier(max_depth=5,
                         learning_rate=0.1,
                         n_estimators=2000,
                         objective='binary:logistic',
                         booster='gbtree',
                         n_jobs=-1,
                         min_child_weight=50,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         random_state=2018)

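    # average_probas=True averages the base models' probability vectors
    # instead of concatenating them as meta-features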
    stack_clf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   meta_classifier=clf4,
                                   use_probas=True,
                                   average_probas=True,
                                   verbose=1)

    stack_clf.fit(train_x, train_y)
    pred_score = stack_clf.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, pred_score)
    with open(dir_path + r'/auc_score.txt', 'w') as output:
        print("auc score is {}".format(auc_score), file=output)
    print("auc score is {}".format(auc_score))

    return stack_clf
Exemplo n.º 26
0
class cls_model_stack():
    def __init__(self, listModelName, isGridSearch=True, dict_para=None, meta_reg='logistic'):

        self.listModelName = listModelName
        self.isGridSearch = isGridSearch
        # avoid a mutable default argument
        self.dict_para = dict_para if dict_para is not None else {}
        self.meta_reg = meta_reg
        # default attributes
        self.train_model = defaultdict(list)
        self.stack = None
    
    def fit(self, x, y):
        '''
        Fit the base models, then the stacking meta-classifier.
        '''
        model_list = []
        basic_cls = ['logistic','knn','svm','dt','rf','adaBoost','gbm','xgb','bp']
        for model_name in self.listModelName:
            if model_name in basic_cls:

                cls = cls_model(model_name,isGridSearch = self.isGridSearch)
                
                if model_name in self.dict_para.keys():
                    # if the user supplied custom parameter ranges, apply them to this model
                    cls.set_parameters(self.dict_para[model_name])
                # fit the model
                cls.fit(x, y)
                model_list.append(cls.cls_model)
                
                self.train_model[model_name] = cls
        
        if self.meta_reg == 'logistic':
            meta_cls = linear_model.LogisticRegression()
            
        elif self.meta_reg == 'knn':
            meta_cls = KNeighborsClassifier()
            
        self.stack = StackingClassifier(classifiers = model_list,meta_classifier = meta_cls)
        self.stack.fit(x.values,y.values.reshape(len(y)))
    
    def predict(self,x):
        return self.stack.predict(x)
    
    def get_vip(self,stack_method = 'avg',isplot = True):
        res = []
        idx = []
        for i,key in enumerate(self.train_model):
            vip = self.train_model[key].get_vip(isplot = False)
            if vip is not None:
                res.append(vip)
                idx.append(i)
        # fuse the importance results from the different models
        if len(res) == 0:
            res = None
        else:
            temp = pd.concat(res,axis = 1)
            if stack_method == 'avg':
                res = temp.mean(axis = 1).sort_values()
                res = pd.DataFrame(res,columns = ['variable importance'])
#            elif stack_method == 'weight':
#                pass
#                res = np.dot(temp.values,self.stack.coef_[idx])
#                res = pd.DataFrame(res,index = temp.index,columns = ['variable importance']).sort_values('variable importance')
            
            # draw a bar chart
            if isplot:
                plt = Data_plot.plot_bar_analysis(res)
                plt.title('variable importance')
                plt.show()
            
        return res

    def predict_proba(self, x):
        x_pred = np.array(x)
        try:
            res = self.stack.predict_proba(x_pred)
        except Exception:
            res = None
        return res
Exemplo n.º 27
0
def stack_test(train_x, train_y, predict_x, res):
    print("start test")
    clf1 = lgb.LGBMClassifier(boosting_type='gbdt',
                              num_leaves=31,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_bin=150,
                              max_depth=-1,
                              n_estimators=500,
                              objective='binary',
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf2 = lgb.LGBMClassifier(boosting_type='dart',
                              num_leaves=31,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_bin=150,
                              max_depth=-1,
                              n_estimators=500,
                              objective='binary',
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf3 = MLPClassifier(hidden_layer_sizes=(100, 100, 100),
                         activation='relu',
                         solver='adam',
                         alpha=0.001,
                         random_state=2018,
                         learning_rate_init=0.1)
    clf4 = lgb.LGBMClassifier(boosting_type='rf',
                              num_leaves=31,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_bin=150,
                              max_depth=-1,
                              n_estimators=500,
                              objective='binary',
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    clf5 = MLPClassifier(hidden_layer_sizes=(100, 100, 100),
                         activation='relu',
                         learning_rate='invscaling',
                         solver='sgd',
                         alpha=0.001,
                         random_state=2018,
                         learning_rate_init=0.1)
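    # NB: the MLPs (clf3, clf5) are scale-sensitive; this assumes the inputs
    # were standardized upstream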
    '''
    clf5=XGBClassifier(
        max_depth=5, learning_rate=0.1, n_estimators=500, objective='binary:logistic',
        booster='gbtree', n_jobs=-1, min_child_weight=5,scale_pos_weight=10,
        subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,random_state=2018
     )
     
    clf6=XGBClassifier(
        max_depth=5, learning_rate=0.1, n_estimators=500, objective='binary:logistic',
        booster='gbtree', n_jobs=-1, min_child_weight=5,
        subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,random_state=2018
     )
     '''
    clf6 = lgb.LGBMClassifier(boosting_type='gbdt',
                              num_leaves=31,
                              reg_alpha=0.0,
                              reg_lambda=1,
                              max_bin=150,
                              max_depth=-1,
                              n_estimators=500,
                              objective='binary',
                              subsample=0.8,
                              colsample_bytree=0.8,
                              subsample_freq=1,
                              learning_rate=0.1,
                              random_state=2018,
                              n_jobs=-1)
    stack_clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5],
                                   meta_classifier=clf6,
                                   use_probas=True,
                                   verbose=1)
    stack_clf.fit(train_x, train_y)
    res['score'] = stack_clf.predict_proba(predict_x)[:, 1]
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv(dir_path + r'/submission.csv', index=False)
    '''
    for clf, label in zip([clf1, clf2, clf3, clf4,stack_clf],
                      ['lgbm1', 'lgbm2', 'mlp','lgbm3', 'stack_clf']):
        scores =cross_val_score(clf, train_x, train_y, cv=4, scoring='f1')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
          '''
    return stack_clf
Exemplo n.º 28
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

################## load packages #####################
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

################## load data #####################
iris = datasets.load_iris()
x, y = iris.data, iris.target

################## define classifier #####################

pipe1 = make_pipeline(ColumnSelector(cols=(0, 1)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(2, 3)), LogisticRegression())
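# each pipeline trains a logistic regression on its own pair of feature
# columns; the meta-classifier then combines their predictions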

sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                          meta_classifier=LogisticRegression())

################## fit and predict #####################
sclf.fit(x, y)

print(sclf.predict(x))

########### predict class probability ###########
print(sclf.predict_proba(x))
Exemplo n.º 29
0
Test_lg=Test_df[Concat_df_list].values


# stacking fusion of the xgboost and lr models
clf = XGBClassifier(max_depth=4, learning_rate=0.1,
                    n_estimators=80, silent=True,
                    objective="binary:logistic", booster='gbtree',
                    min_child_weight=3, subsample=0.8, gamma=0)

from sklearn.linear_model import LogisticRegression
clf2 = LogisticRegression(C=0.1, penalty='l2', tol=1e-4)

from sklearn.ensemble import RandomForestClassifier
clf4 = RandomForestClassifier(n_estimators=400, oob_score=True)



from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier


eclf = StackingClassifier(classifiers=[clf, clf2],
                          meta_classifier=LogisticRegression(C=0.1, penalty='l2', tol=1e-4),
                          use_probas=True,
                          verbose=3)
eclf.fit(Train_lg, Train_label)
R=eclf.predict_proba(Test_lg)
instance_id_list=Test['cust_id'].values

with open('1.csv','w') as f:
    f.write('cust_id,pred_prob\n')
    for i in range(len(instance_id_list)):
        f.write('%d,%f\n'%(instance_id_list[i],float(R[i][1])))
Exemplo n.º 30
0
class FraudModel(object):
    def __init__(self,
                 alpha=0.1,
                 n_jobs=-1,
                 max_features='sqrt',
                 n_estimators=1000,
                 RandomForest=True,
                 KMeansFeatures=True,
                 NaiveBayes=True):
        """
        INPUT:
        - alpha = Additive laplace smoothing parameter for NaiveBayes
        - n_jobs = Number of jobs to run RFC on
        - max_features = Number of features to consider on RFC
        - n_estimators = Number of trees in RFC
        - RandomForest = Bool, run RFC
        - KMeansFeatures = Bool, include K means features in RFC
        - NaiveBayes = Bool, run MNB

        ATTRIBUTES:
        - RFC = Random Forest Classifier
        - MNB = Multinomial Naive Bayes Classifier
        """
        self.RFC = RandomForestClassifier(n_jobs=n_jobs,
                                          max_features=max_features,
                                          n_estimators=n_estimators)
        self.MNB = MultinomialNB(alpha=alpha)
        self.LogR = LogisticRegression()
        self.STK = StackingClassifier(classifiers=[self.RFC, self.MNB],
                                      meta_classifier=self.LogR,
                                      use_probas=True)

        self.RandomForest = RandomForest
        self.KMeansFeatures = KMeansFeatures
        self.NaiveBayes = NaiveBayes

    def fit(self, X, y):
        """
        INPUT:
        - X: dataframe representing feature matrix for training data
        - y: series representing labels for training data
        """

        # NLP
        if self.KMeansFeatures == True or self.NaiveBayes == True:
            desc_no_html = update_data_frame(X)
            self.tfidf = TfidfVectorizer(stop_words='english', max_features=10)
            word_counts = self.tfidf.fit_transform(
                desc_no_html['description_no_HTML'])

            if self.KMeansFeatures == True:
                # K-means
                desc_kmeans = KMeans(n_clusters=5, random_state=56, n_jobs=-1)
                desc_kmeans.fit(word_counts)
                self.cluster_centers = desc_kmeans.cluster_centers_
                X_cluster = compute_cluster_distance(word_counts,
                                                     self.cluster_centers)
                RF_X = pd.merge(X_cluster,
                                X,
                                left_index=True,
                                right_index=True).drop(columns=['description'])
        else:
            RF_X = X.drop(columns=['description'])

        # Random Forest
        if self.RandomForest == True:
            # Random Forest
            self.RFC.fit(RF_X, y)

        if self.NaiveBayes == True:
            # Naive Bayes
            self.MNB.fit(word_counts, y)

        # Stacked Classifier
        if self.RandomForest == True and self.NaiveBayes == True:
            # mlxtend's StackingClassifier.fit expects a single shared feature
            # matrix (make_pipeline takes estimators, not data), so fit the
            # stack on RF_X; this assumes RF_X is non-negative, as
            # MultinomialNB requires
            self.STK.fit(RF_X, y)

    def predict_proba(self, X):
        """
        INPUT:
        - X: dataframe representing feature matrix for data

        OUTPUT:
        - predicted class probabilities for each row of X
        """
        if self.KMeansFeatures == True or self.NaiveBayes == True:
            desc_no_html = update_data_frame(X)
            word_counts = self.tfidf.transform(
                desc_no_html['description_no_HTML'])

            if self.KMeansFeatures == True:
                X_cluster = compute_cluster_distance(word_counts,
                                                     self.cluster_centers)
                RF_X = pd.merge(X_cluster,
                                X,
                                left_index=True,
                                right_index=True).drop(columns=['description'])
        else:
            RF_X = X.drop(columns=['description'])

        if self.RandomForest == True and self.NaiveBayes == False:
            RFC_preds = self.RFC.predict_proba(RF_X)
            return RFC_preds
        elif self.RandomForest == False and self.NaiveBayes == True:
            NB_preds = self.MNB.predict_proba(word_counts)
            return NB_preds
        elif self.RandomForest == True and self.NaiveBayes == True:
            # use the same feature matrix the stack was fitted on
            STK_preds = self.STK.predict_proba(RF_X)
            return STK_preds

    def _log_loss(self, y_true):
        pass
Exemplo n.º 31
0
    'meta-logisticregression__C': C
}

fit_params = {"early_stopping_rounds": 100}

grid = RandomizedSearchCV(grid,
                          n_jobs=n_jobs,
                          param_distributions=params,
                          verbose=3,
                          n_iter=n_iter_search,
                          cv=cv)

print('fitting')
grid.fit(x, y)

joblib.dump(grid, 'export/trend_model_random_%s.pkl' % version)

predicted = grid.predict_proba(x)
predicted = list(map(lambda x: x[1], predicted))
print('train roc: ', roc_auc_score(y, predicted))
print('val roc: ', grid.best_score_)
print('best params: ', grid.best_params_)
if oversamp:
    predicted = pd.Series(grid.predict_proba(test.values)[:, 1])
else:
    predicted = pd.Series(grid.predict_proba(test)[:, 1])
predicted.index = test_ids
predicted.to_csv('export/trend_predict_random_%s_%s.csv' %
                 (version, int(time.time())))
print('cost time: ', time.time() - a)
Exemplo n.º 32
0
print("Support vector machines : Log Loss: %0.2f" %
      (log_loss(cv_y, sig_clf2.predict_proba(cv_x_onehotCoding))))
sig_clf3.fit(train_x_onehotCoding, train_y)
print("Naive Bayes : Log Loss: %0.2f" %
      (log_loss(cv_y, sig_clf3.predict_proba(cv_x_onehotCoding))))
print("-" * 50)
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10]
best_alpha = alpha[0]
best_log_error = float('inf')
for i in alpha:
    lr = LogisticRegression(C=i)
    sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3],
                              meta_classifier=lr,
                              use_probas=True)
    sclf.fit(train_x_onehotCoding, train_y)
    log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding))
    print("Stacking Classifier : for the value of alpha: %f Log Loss: %0.3f" %
          (i, log_error))
    # track the alpha with the lowest CV log loss
    if log_error < best_log_error:
        best_log_error = log_error
        best_alpha = i

lr = LogisticRegression(C=0.1)
sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3],
                          meta_classifier=lr,
                          use_probas=True)
sclf.fit(train_x_onehotCoding, train_y)

log_error = log_loss(train_y, sclf.predict_proba(train_x_onehotCoding))
print("Log loss (train) on the stacking classifier :", log_error)

log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding))
print("Log loss (CV) on the stacking classifier :", log_error)
modelLR = LogisticRegression(class_weight='balanced',
                             max_iter=10)
modelDT = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=5,
                                 min_samples_split=2)
modelXGB = XGBClassifier(max_depth=2,
                         gamma=2,
                         eta=0.8,
                         reg_alpha=0.5,
                         reg_lambda=0.5)

# standardize both datasets with the same fitted scaler for consistent inputs
scaler = StandardScaler()
normX = scaler.fit_transform(normX)
X_eval = scaler.transform(X_fs_eval)

#stack the classifiers using mlxtend, make LR model the meta_classifier to give it more weight
m = StackingClassifier(classifiers=[modelLR, modelDT, modelXGB],
                       use_probas=True,
                       meta_classifier=modelLR)

#fit the model and save the predictions
m.fit(normX, normY)
pred = m.predict_proba(X_eval)[:, 1]

#save the results into the file
submission = pd.read_csv('sample_submission.csv')
submission['target'] = pred
submission.to_csv('sample_submission.csv', index=False)