def kfold_train(mode, n_repeats=10, n_splits=5):
    """Train one model family with repeated K-fold CV and average test predictions.

    The module-level ``x_train`` / ``labels`` arrays are split ``n_repeats``
    times into ``n_splits`` shuffled folds (the repeat index is used as the
    ``KFold`` random state).  A fresh model is fitted on every fold, scored on
    the held-out part, and its ``predict_proba`` output on the module-level
    ``x_test`` is accumulated.

    :param mode: model family to train: ``'cat'``, ``'lgb'``, ``'xgb'`` or
        ``'stack'``.
    :param n_repeats: number of reshuffled K-fold rounds (default 10).
    :param n_splits: number of folds per round (default 5).
    :return: the accumulated ``x_test`` probabilities divided by the number of
        fitted models (``n_repeats * n_splits``).
    :raises ValueError: if ``mode`` is not one of the supported names.
    """
    supported_modes = ('cat', 'lgb', 'xgb', 'stack')
    if mode not in supported_modes:
        # Previously an unknown mode surfaced as a NameError on `model`
        # only after the first fold had already been prepared.
        raise ValueError(
            'unknown mode %r, expected one of %s' % (mode, supported_modes))

    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    for i in range(n_repeats):
        print(str(i + 1) + ' th kflod' + '*' * 50)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            # `labels` rows are reduced to class indices with argmax before
            # being handed to the estimators.
            train_target = np.argmax(train_y, 1)
            test_target = np.argmax(test_y, 1)

            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(
                    train_x,
                    train_target,
                    eval_set=(test_x, test_target),
                    # early_stopping_rounds=1000,
                    verbose=False)
            elif mode == 'lgb':
                model = lightgbm_model()
                model.fit(
                    train_x,
                    train_target,
                    eval_set=(test_x, test_target),
                    # early_stopping_rounds=50, verbose=True
                    verbose=False)
            elif mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, train_target, verbose=True)
            else:  # mode == 'stack'
                model = StackingClassifier(
                    classifiers=[
                        xgboost_model(),
                        catboost_model(),
                        lightgbm_model(),
                        adaboost_model()
                    ],
                    use_probas=True,
                    average_probas=False,
                    meta_classifier=lr)
                model.fit(train_x, train_target)

            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(test_target, np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))

            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    # One prediction was accumulated per fitted model, so average over all
    # of them instead of the previously hard-coded 50.
    prediction = prediction / float(n_repeats * n_splits)
    return prediction
def test_use_features_in_secondary_sparse_input_predict_proba():
    """Stacking with ``use_features_in_secondary`` on CSR input.

    Fits on the sparse version of the module-level ``X`` / ``y`` and checks
    the first-class probability of the first three samples to 3 decimals.
    """
    np.random.seed(123)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    stack = StackingClassifier(
        classifiers=[forest],
        use_features_in_secondary=True,
        meta_classifier=meta_learner)
    stack.fit(sparse.csr_matrix(X), y)

    first_rows = sparse.csr_matrix(X[[0, 1, 2]])
    first_class_proba = stack.predict_proba(first_rows)[:, 0]
    np.testing.assert_almost_equal(
        first_class_proba, np.array([0.910, 0.829, 0.882]), 3)
def test_use_features_in_secondary_predict_proba():
    """Stacking with ``use_features_in_secondary`` on dense input.

    Fits a random forest + Gaussian NB stack on the module-level ``X`` / ``y``
    and checks the first-class probability of the first three samples to
    3 decimals.
    """
    np.random.seed(123)
    meta_learner = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    stack = StackingClassifier(
        classifiers=[forest, bayes],
        use_features_in_secondary=True,
        meta_classifier=meta_learner)
    stack.fit(X, y)

    first_class_proba = stack.predict_proba(X[[0, 1, 2]])[:, 0]
    np.testing.assert_almost_equal(
        first_class_proba, np.array([0.911, 0.829, 0.885]), 3)
def model_test(self,model,best_params):
    """Fit a KNN/RFC/GNB stack with a ``model`` meta-classifier and score it.

    The stack is configured with ``best_params``, fitted on ``self.train`` /
    ``self.train_label['label']`` and evaluated on ``self.test`` /
    ``self.test_label['label']``.

    :param model: name passed to ``self.model_init`` to build the
        meta-classifier; also selects which attribute summary is printed
        (``'LR'``, ``'RFC'`` or ``'XGB'``, case-insensitive).
    :param best_params: keyword parameters applied with ``set_params``.
    :return: ROC AUC of the second ``predict_proba`` column on the test set.
    """
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('Model Test')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],meta_classifier=lr)
    sclf.set_params(**best_params)

    train_data = self.train.values.copy()
    train_label = self.train_label['label'].values.copy()
    sclf.fit(train_data, train_label)

    model_key = model.upper()
    if model_key=='LR':
        # The original reshaped with `clf.coef_`, but no `clf` exists in this
        # method (NameError); the fitted stack is the object being inspected.
        # NOTE(review): confirm the stacking estimator exposes `coef_`.
        coef=sclf.coef_.reshape(sclf.coef_.shape[1])
        ind=coef.argsort()
        att=self.train.columns[ind[-30:]].tolist()
        print(att)
    elif model_key in ('RFC','XGB'):
        # Both tree-based branches printed the same importance summary.
        imp=sclf.feature_importances_
        print(imp)
        ind=imp.argsort()
        att=self.train.columns[ind[-30:]].tolist()
        print(att)

    test_data = self.test.values.copy()
    test_label = self.test_label['label'].values.copy()
    test_label = test_label.reshape(test_label.shape[0])
    res_proba=sclf.predict_proba(test_data)
    res_auc=roc_auc_score(test_label,res_proba[:,1])
    print('Model: {0} ; Test: {1}'.format(model,res_auc))
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return res_auc
def test_use_features_in_secondary_sparse_input_predict_proba():
    """Check ``predict_proba`` of a one-forest stack fitted on CSR input.

    Uses the module-level ``X`` / ``y``; the first-class probabilities of the
    first three samples must match the reference values to 3 decimals.
    """
    np.random.seed(123)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    stack = StackingClassifier(
        classifiers=[forest],
        use_features_in_secondary=True,
        meta_classifier=meta_learner)
    stack.fit(sparse.csr_matrix(X), y)

    sample_rows = [0, 1, 2]
    sparse_samples = sparse.csr_matrix(X[sample_rows])
    observed = stack.predict_proba(sparse_samples)[:, 0]
    reference = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(observed, reference, 3)
def test_use_features_in_secondary_predict_proba():
    """Check ``predict_proba`` of a seeded forest + NB stack on ``iris_data()``.

    The first-class probabilities of the first three samples must match the
    reference values to 3 decimals.
    """
    np.random.seed(123)
    X, y = iris_data()
    meta_learner = LogisticRegression(
        solver='liblinear', multi_class='ovr', random_state=1)
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    bayes = GaussianNB()
    stack = StackingClassifier(
        classifiers=[forest, bayes],
        use_features_in_secondary=True,
        meta_classifier=meta_learner)
    stack.fit(X, y)

    observed = stack.predict_proba(X[[0, 1, 2]])[:, 0]
    reference = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(observed, reference, 3)
def stacking_model(self, X_train, X_test, y_train, bst_xgb, bst_lgb):
    '''
    Stack already-tuned models under a simple logistic-regression meta
    classifier to further refine the predictions, then persist the stack.

    Only ``bst_lgb`` is placed in the stack; ``bst_xgb`` is accepted but not
    used by this method.

    :param X_train: training features
    :param X_test: features to predict on
    :param y_train: training labels
    :param bst_xgb: tuned xgb model (unused here)
    :param bst_lgb: tuned lgb model used as the single base classifier
    :return: second ``predict_proba`` column for ``X_test``
    '''
    meta_learner = linear_model.LogisticRegression(random_state=7)
    stack = StackingClassifier(
        classifiers=[bst_lgb],
        use_probas=True,
        average_probas=False,
        meta_classifier=meta_learner)
    stack.fit(X_train, y_train)

    positive_proba = stack.predict_proba(X_test)[:, 1]
    # The fitted stack is saved under a name derived from self.num.
    model_path = "./models/train_model_{}.m".format(self.num)
    joblib.dump(stack, model_path)
    return positive_proba
def stacking_model2(self, X_train, X_test, y_train, bst_xgb, bst_forest,
                    bst_gradient, bst_lgb):
    '''
    Combine four algorithms in one stack with a logistic-regression meta
    classifier.

    :param X_train: training set
    :param X_test: test set
    :param y_train: training labels
    :param bst_xgb: tuned xgb model
    :param bst_forest: tuned forest model
    :param bst_gradient: tuned gradient-boosting model
    :param bst_lgb: tuned lgb model
    :return: second ``predict_proba`` column for ``X_test``
    '''
    base_models = [bst_xgb, bst_forest, bst_gradient, bst_lgb]
    meta_learner = linear_model.LogisticRegression(random_state=7)
    stack = StackingClassifier(
        classifiers=base_models,
        use_probas=True,
        average_probas=False,
        meta_classifier=meta_learner)
    stack.fit(X_train, y_train)
    return stack.predict_proba(X_test)[:, 1]
# Labels: column 10 of the training frame, skipping its first row.
y = np.array(data_train.iloc[1:, 10])
# Test features: columns 5..9 of the test frame, skipping its first row.
X_test = np.array(data_test.iloc[1:, 5:10])
# print(X_test)

# Alternative base learners that were tried:
# clf1 = cbt.CatBoostClassifier(iterations=1000,task_type='GPU',loss_function='MultiClass')
# clf2 = lgb.LGBMClassifier(num_leaves=31,bagging_fraction=0.5,feature_fraction=0.8,max_depth=10,n_estimators=200)
clf2 = RandomForestClassifier()
clf3 = xgb.XGBClassifier(n_estimators=500)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf2, clf3], meta_classifier=lr)
sclf.fit(X, y)

y_predict = sclf.predict_proba(X_test)
# print(y_predict[0])

# Cut the predictions into consecutive groups of 50 rows
# (original note: 120 groups of 50 each).
group_starts = range(0, y_predict.shape[0], 50)
y_predcit_50 = [y_predict[start:start + 50] for start in group_starts]

# Sum each group's columns and divide by the fixed group size of 50.
A = pd.DataFrame(
    [np.sum(group, axis=0) / 50 for group in y_predcit_50],
    columns=['Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio'])
A.to_csv('submission_stacking.csv', index=True, index_label='Group')
# B = pd.DataFrame()
# B.insert(0, 'Excellent ratio', A['0'])
def kfold_train(mode, n_repeats=10, n_splits=5):
    """Repeated K-fold training with an offline group-level MAE evaluation.

    The module-level ``x_train`` / ``labels`` arrays are split ``n_repeats``
    times into ``n_splits`` shuffled folds (the repeat index is the ``KFold``
    random state).  Every fold's model is scored on its held-out part, its
    out-of-fold probabilities are stored next to the matching rows of the
    module-level ``train_data`` frame, and its ``predict_proba`` output on
    ``x_test`` is accumulated.

    :param mode: model family to train: ``'cat'``, ``'lgb'``, ``'xgb'`` or
        ``'stack'``.
    :param n_repeats: number of reshuffled K-fold rounds (default 10).
    :param n_splits: number of folds per round (default 5).
    :return: the accumulated ``x_test`` probabilities divided by the number of
        fitted models (``n_repeats * n_splits``).
    :raises ValueError: if ``mode`` is not one of the supported names.
    """
    supported_modes = ('cat', 'lgb', 'xgb', 'stack')
    if mode not in supported_modes:
        # Previously an unknown mode surfaced as a NameError on `model`.
        raise ValueError(
            'unknown mode %r, expected one of %s' % (mode, supported_modes))

    prob_cols = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
    label_cols = ['label_Excellent', 'label_Good', 'label_Pass', 'label_Fail']

    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    result_list = []
    for i in range(n_repeats):
        print(str(i + 1) + ' th kflod' + '*' * 50)
        result = []
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            # `labels` rows are reduced to class indices with argmax before
            # being handed to the estimators.
            train_target = np.argmax(train_y, 1)
            test_target = np.argmax(test_y, 1)

            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(
                    train_x,
                    train_target,
                    eval_set=(test_x, test_target),
                    # early_stopping_rounds=1000,
                    verbose=False)
            elif mode == 'lgb':
                model = lightgbm_model()
                model.fit(
                    train_x,
                    train_target,
                    eval_set=(test_x, test_target),
                    # early_stopping_rounds=50, verbose=True
                    verbose=False)
            elif mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, train_target, verbose=True)
            else:  # mode == 'stack'
                model = StackingClassifier(
                    classifiers=[
                        catboost_model(),
                        lightgbm_model(),
                        xgboost_model(),
                        adaboost_model()
                    ],
                    use_probas=True,
                    average_probas=False,
                    meta_classifier=lr)
                model.fit(train_x, train_target)

            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(test_target, np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))

            # Keep the out-of-fold probabilities for the offline validation.
            X_valid = train_data.iloc[test_index, :].copy()
            X_valid.loc[:, prob_cols] = pred
            result.append(X_valid)

            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
        result_list.append(pd.concat(result))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    # One prediction was accumulated per fitted model.
    prediction = prediction / (float(n_splits) * n_repeats)

    # Offline evaluation: per 'group_<g>' column, compare the mean predicted
    # probabilities with the mean labels and turn the MAE into a score.
    mean = []
    for group in range(100):
        for result in result_list:
            # Columns are selected with a list: indexing a groupby with a
            # bare tuple of names is no longer accepted by pandas.
            temp = result.groupby(
                ['group_%s' % group],
                as_index=False)[prob_cols + label_cols].mean()
            a = np.abs(temp.loc[:, prob_cols].values -
                       temp.loc[:, label_cols].values).mean()
            mean.append(1 / (1 + 10 * a))
    print("线下mae评估:", np.mean(mean), np.std(mean))
    return prediction
#clfKNN = KNeighborsClassifier(n_neighbors=5) #clfKNN.fit(X_train_noev, y_train_noev) #print log_loss(y_cv_noev, clfKNN.predict_proba(X_cv_noev)) # ##NB #clfNB = MultinomialNB(alpha=1.0) #clfNB.fit(X_train_noev, y_train_noev) #print log_loss(y_cv_noev, clfNB.predict_proba(X_cv_noev)) #Combine results of classifiers print "Stacking classifiers for devices with no events" clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR] meta = LogisticRegression() stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1) stack.fit(X_train_noev, y_train_noev) print log_loss(y_cv_noev, stack.predict_proba(X_cv_noev)) y_pred_noev = stack.predict_proba(X_test_noev) #y_pre = (pred_prob_nn+y_pre)/2.0 # return y_pred_noev y_pred_ev = predictor_ev() #y_pred_noev = predictor_noev() # Write results result = pd.DataFrame(np.hstack(y_pred_ev, y_pred_noev), columns=le.classes_) result["device_id"] = test_dev result = result.set_index("device_id") result.to_csv('stacking_1.gz', index=True, index_label='device_id', compression="gzip")
def _evaluate_classifier(estimator, window_title, X_fit, Y_fit, X_eval,
                         Y_eval):
    """Print and plot the standard evaluation of one fitted classifier.

    Shows the confusion-matrix heat map (figure window named
    ``window_title``), prints the AUC, draws the ROC curve, prints the
    classification report and renders the ``ClassificationReport``
    visualizer, which is fitted on ``X_fit`` / ``Y_fit``.

    :return: whatever ``cal_accuracy`` returns for the predictions on
        ``X_eval``.
    """
    print("_____________Report___________________")
    y_pred = estimator.predict(X_eval)
    accuracy = cal_accuracy(Y_eval, y_pred)

    # Confusion matrix
    matrix = confusion_matrix(Y_eval, y_pred)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    fig.canvas.set_window_title(window_title)
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC / AUC on the second predict_proba column
    probs = estimator.predict_proba(X_eval)[:, 1]
    print('AUC: %.2f' % roc_auc_score(Y_eval, probs))
    encoded_truth = preprocessing.LabelEncoder().fit_transform(Y_eval)
    fpr, tpr, _ = roc_curve(encoded_truth, probs)
    plot_roc_curve(fpr, tpr)

    # Classification report (text and visualizer)
    print(classification_report(Y_eval, y_pred, target_names=['Yes', 'No']))
    visualizer = ClassificationReport(estimator,
                                      classes=["Yes", "No"],
                                      support=True)
    visualizer.fit(X_fit, Y_fit)
    visualizer.score(X_eval, Y_eval)
    visualizer.poof()
    return accuracy


def _show_verdict(url, is_legitimate, phishing_size=(159, 114)):
    """Pop up the LEGITIMATE / PHISING dialog for ``url``.

    A legitimate URL is also printed as 'Legitimate' and opened in the web
    browser (once); a phishing URL only gets the dialog.
    """
    if is_legitimate:
        print('Legitimate')
        box_label, message, size = u"POP-UP", u"LEGITIMATE", (159, 114)
    else:
        box_label, message, size = u"Error", u"PHISING", phishing_size

    # NOTE(review): this keeps the original pattern of creating a fresh
    # wx.App and running its MainLoop from inside an event handler; consider
    # switching to dialog.ShowModal() on the existing application.
    app = wx.App(False)
    dialog = wx.Dialog(None,
                       id=wx.ID_ANY,
                       title=wx.EmptyString,
                       pos=wx.DefaultPosition,
                       size=wx.Size(*size),
                       style=wx.DEFAULT_DIALOG_STYLE)
    dialog.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)
    box = wx.StaticBoxSizer(wx.StaticBox(dialog, wx.ID_ANY, box_label),
                            wx.VERTICAL)
    text = wx.StaticText(box.GetStaticBox(), wx.ID_ANY, message,
                         wx.DefaultPosition, wx.DefaultSize, 0)
    text.Wrap(-1)
    box.Add(text, 0, wx.ALL | wx.ALIGN_CENTER_HORIZONTAL, 5)
    dialog.SetSizer(box)
    dialog.Layout()
    dialog.Centre(wx.BOTH)
    dialog.Show(True)
    if is_legitimate:
        webbrowser.open(url)
    app.MainLoop()


def main():
    """Build features, train and compare four classifiers, then start the GUI.

    Steps:
      1. Split 'training.xlsx' 80/20, extract URL features and write them to
         'id3.csv' and 'feature_test.csv'.
      2. Evaluate a stacking classifier (data from 'stack.csv' and
         'feature_test_stack1.csv'), XGBoost, a linear SVM and Gaussian naive
         Bayes, and plot their accuracies side by side.
      3. Open a wx frame where a URL can be classified by any of the four.
    """
    feature_columns = [
        'r', 'length_of_url', 'http_has', 'suspicious_char', 'prefix_suffix',
        'dots', 'slash', 'phis_term', 'sub_domain', 'ip_contain'
    ]

    # ---- feature extraction -------------------------------------------
    raw = pd.DataFrame(pd.read_excel('training.xlsx'))
    split_at = round((len(raw) / 100) * 80)  # first 80% of rows train
    train_part = pd.DataFrame(raw[0:split_at])
    test_part = pd.DataFrame(raw[split_at:])

    train_rows = [
        extract_feature_train(url, output)
        for url, output in zip(train_part['url'], train_part['phishing'])
    ]
    test_rows = [
        extract_feature_test(url, output)
        for url, output in zip(test_part['url'], test_part['phishing'])
    ]
    pd.DataFrame(train_rows, columns=feature_columns).to_csv(
        'id3.csv', sep=',', encoding='utf-8')
    pd.DataFrame(test_rows, columns=feature_columns).to_csv(
        'feature_test.csv', sep=',', encoding='utf-8')

    data_train = importdata_train()
    data_test = importdata_test()
    X, Y = splitdataset(data_train)
    X1, Y1 = splitdataset(data_test)
    # The SVM / XGBoost / NB models used to be fitted here as well, but each
    # was rebuilt and refitted in its own section below before any use.

    # ---- stacking -------------------------------------------------------
    stack_train = pd.read_csv("stack.csv")
    stack_test = pd.read_csv("feature_test_stack1.csv")
    target = 'r'
    X_train = stack_train.loc[:, stack_train.columns != target]
    Y_train = stack_train.loc[:, stack_train.columns == target]
    X_test = stack_test.loc[:, stack_test.columns != target]
    Y_test = stack_test.loc[:, stack_test.columns == target]

    print(
        "___________________________Stacking__________________________________________"
    )
    rf2 = RandomForestClassifier(bootstrap=True,
                                 max_depth=70,
                                 max_features='auto',
                                 min_samples_leaf=4,
                                 min_samples_split=10,
                                 n_estimators=400)
    sc = StackingClassifier([svm.SVC(kernel='linear'),
                             XGBClassifier()],
                            meta_classifier=rf2)
    sc.fit(X_train, Y_train)
    acc4 = _evaluate_classifier(sc, 'Stacking', X_train, Y_train, X_test,
                                Y_test)

    # ---- XGBoost --------------------------------------------------------
    print(
        "___________________________XGBOOST__________________________________________"
    )
    model = XGBClassifier(max_depth=5,
                          learning_rate=0.01,
                          n_estimators=100,
                          gamma=0,
                          min_child_weight=1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=0.005)
    model.fit(X, Y)
    acc1 = _evaluate_classifier(model, 'XGBoost', X, Y, X1, Y1)

    # ---- SVM ------------------------------------------------------------
    print(
        "___________________________SVM__________________________________________"
    )
    svm_clf = svm.SVC(kernel='linear', probability=True)
    svm_clf.fit(X, Y)
    acc2 = _evaluate_classifier(svm_clf, 'SVM', X, Y, X1, Y1)

    # ---- Naive Bayes ----------------------------------------------------
    print(
        "___________________________Naive Bayes__________________________________________"
    )
    gnb = GaussianNB()
    gnb.fit(X, Y)
    acc3 = _evaluate_classifier(gnb, 'NB', X, Y, X1, Y1)

    # ---- accuracy comparison -------------------------------------------
    labels = [' XGboost', 'SVM', 'NB', 'Stacking']
    sizes = [acc1, acc2, acc3, acc4]
    index = np.arange(len(labels))
    plt.bar(index, sizes)
    plt.xlabel('Algorithm', fontsize=20)
    plt.ylabel('Accuracy', fontsize=20)
    plt.xticks(index, labels, fontsize=10, rotation=0)
    plt.title('comparative study')
    plt.show()

    # ---- GUI --------------------------------------------------------------
    class MainFrame(wx.Frame):
        """URL entry field plus one prediction button per classifier."""

        def __init__(self, parent):
            wx.Frame.__init__(self,
                              parent,
                              id=wx.ID_ANY,
                              title=wx.EmptyString,
                              pos=wx.DefaultPosition,
                              size=wx.Size(500, 300),
                              style=wx.DEFAULT_FRAME_STYLE | wx.TAB_TRAVERSAL)
            self.SetSizeHintsSz(wx.DefaultSize, wx.DefaultSize)
            bSizer3 = wx.BoxSizer(wx.VERTICAL)

            self.m_staticText2 = wx.StaticText(self, wx.ID_ANY, u"Enter URL",
                                               wx.DefaultPosition,
                                               wx.DefaultSize, 0)
            self.m_staticText2.Wrap(-1)
            bSizer3.Add(self.m_staticText2, 0, wx.ALL, 5)

            self.text1 = wx.TextCtrl(self, wx.ID_ANY, wx.EmptyString,
                                     wx.DefaultPosition, wx.DefaultSize, 0)
            bSizer3.Add(self.text1, 0, wx.ALL | wx.EXPAND, 5)

            buttons = (
                ('predictButton', u"Predict_XGBOOST", self.click),
                ('m_button2', u"Predict_SVM", self.svm),
                ('m_button3', u"Predict_NB", self.nb),
                ('m_button4', u"Predict_STACKING", self.stacking),
            )
            for attr_name, caption, handler in buttons:
                button = wx.Button(self, wx.ID_ANY, caption,
                                   wx.DefaultPosition, wx.DefaultSize, 0)
                bSizer3.Add(button, 0, wx.ALL | wx.EXPAND, 5)
                button.Bind(wx.EVT_BUTTON, handler)
                setattr(self, attr_name, button)

            self.SetSizer(bSizer3)
            self.Layout()
            self.Centre(wx.BOTH)

        def __del__(self):
            pass

        def _classify_entered_url(self, estimator, phishing_size=(159, 114)):
            """Classify the URL in the text box with ``estimator``."""
            try:
                url = self.text1.GetValue()
                features = np.array([extract_feature_usertest(url)])
                verdict = estimator.predict(features.reshape(1, -1))
                _show_verdict(url, verdict[0] == 'no', phishing_size)
            except Exception as exc:
                # Best effort: a bad URL must not take the GUI down, but say
                # what went wrong instead of a bare 'error'.
                print('error', exc)

        # XGBOOST
        def click(self, event):
            self._classify_entered_url(model, phishing_size=(200, 150))

        # SVM
        def svm(self, event):
            # Uses the fitted SVM; the handler used to refit an SVC on every
            # click and then predict with the XGBoost model by mistake.
            self._classify_entered_url(svm_clf)

        # NAIVE BAYES
        def nb(self, event):
            self._classify_entered_url(gnb)

        # STACKING
        def stacking(self, event):
            df = pd.read_csv("stack.csv")
            df1 = pd.read_csv("feature_test_stack1.csv")
            target = 'r'
            X_train = df.loc[:, df.columns != target]
            Y_train = df.loc[:, df.columns == target]
            global url4
            url4 = self.text1.GetValue()

            rf2 = RandomForestClassifier(bootstrap=True,
                                         max_depth=70,
                                         max_features='auto',
                                         min_samples_leaf=4,
                                         min_samples_split=10,
                                         n_estimators=400)
            sc = StackingClassifier([svm.SVC(kernel='linear'),
                                     XGBClassifier()],
                                    meta_classifier=rf2)
            sc.fit(X_train, Y_train)

            # Feature frame uses every column of stack.csv except the last.
            col = df.columns[:-1]
            output_data = pd.DataFrame(
                [extract_feature_usertest_stack(url4)], columns=col)
            userpredict4 = sc.predict(output_data)
            # `url4` is the entered URL; the original opened an undefined
            # `url` here, which raised NameError for legitimate results.
            _show_verdict(url4, userpredict4[0] == 0)

    app1 = wx.App(False)
    frame = MainFrame(None)
    frame.Show(True)
    app1.MainLoop()
'lgbmclassifier__max_depth': max_depth, 'lgbmclassifier__subsample': subsample, #'catboostclassifier__n_estimators':n_estimators, #'catboostclassifier__max_depth': max_depth, #'randomforestclassifier__n_estimators':[100], #'randomforestclassifier__max_depth': [3], 'meta-logisticregression__C': C } grid = GridSearchCV(estimator=grid, param_grid=params, cv=3, refit=True, verbose=3, n_jobs=n_job, early_stopping_rounds=100, scoring='roc_auc') print('fitting') grid.fit(x, y) joblib.dump(grid, 'export/trend_model.pkl') predicted = grid.predict_proba(x) predicted = list(map(lambda x: x[1], predicted)) print('trian roc: ', roc_auc_score(y, predicted)) predicted = pd.Series(grid.predict_proba(test)[:, 1]) predicted.index = test_ids predicted.to_csv('export/trend_predict_test.csv')
random_state=0, n_estimators=100)
# (The line above closes a constructor call that starts outside this excerpt.)

# Stack with `rf` as the only base learner and `xgc` as meta classifier.
sclf = StackingClassifier(classifiers=[rf], use_probas=True, average_probas=False, meta_classifier=xgc)

# Each standalone model is fitted on its own training matrix; the stack is
# fitted on xtrain.
gbc.fit(xtr, ytrain)
rf.fit(xtr1, ytrain)
ada.fit(xtr2, ytrain)
sclf.fit(xtrain, ytrain)

# Class probabilities on the matching test matrices.
gbc_pred_proba = gbc.predict_proba(xte)
rf_pred_proba = rf.predict_proba(xte1)
ada_pred_proba = ada.predict_proba(xte2)
sc_pred_proba = sclf.predict_proba(xtest)

# Confusion matrices with the second probability column cut at 0.5.
gbc_cm = m.confusion_matrix(ytest, (gbc_pred_proba[:, 1] >= 0.5).astype('int'))
rf_cm = m.confusion_matrix(ytest, (rf_pred_proba[:, 1] >= 0.5).astype('int'))
ada_cm = m.confusion_matrix(ytest, (ada_pred_proba[:, 1] >= 0.5).astype('int'))
sc_cm = m.confusion_matrix(ytest, (sc_pred_proba[:, 1] >= 0.5).astype('int'))

# Sweep the decision threshold for the gbc model and print myf1() of each
# resulting confusion matrix (this overwrites the 0.5 gbc_cm above).
k = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#k=[0.55,0.57,0.6,0.62,0.65]
for i in k:
    gbc_cm = m.confusion_matrix(ytest, (gbc_pred_proba[:, 1] >= i).astype('int'))
    print(myf1(gbc_cm))

# Threshold grid for whatever follows this excerpt; the second assignment
# replaces the first.
k = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
k = [0.65, 0.68, 0.7, 0.72, 0.75]
# Hyper-parameters of the second XGBoost base learner.
params2.update({
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'learning_rate': 0.02,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.5,
    'n_estimators': 500,
})

# Hyper-parameters of the third XGBoost base learner.
params3 = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'learning_rate': 0.02,
    'max_depth': 4,
    'subsample': 0.6,
    'colsample_bytree': 0.5,
    'n_estimators': 600,
}

clf1, clf2, clf3 = (XGBClassifier(**settings)
                    for settings in (params1, params2, params3))
clfs = [clf1, clf2, clf3]
lrc = linear_model.LogisticRegression(C=0.5, max_iter=300)

# Last column of train_data is the target, everything before it is a feature.
x_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

mlxcf = StackingClassifier(clfs,
                           lrc,
                           use_probas=True,
                           average_probas=True)
mlxcf.fit(x_train, y_train)

# Last predict_proba column becomes the submitted TARGET value.
y_pred = mlxcf.predict_proba(test_data)[:, -1]
submission = pd.DataFrame({"ID": IDlist, "TARGET": y_pred})
submission.to_csv("../Result/mxltendStackingXGB.csv", index=False)
# Two-class subset of iris: only the first 100 rows are used.
iris = load_iris()
X, y = iris.data[:100], iris.target[:100]

# Each base pipeline sees its own subset of the feature columns.
pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())

sclf = StackingClassifier(
    classifiers=[pipe1, pipe2],
    meta_classifier=LogisticRegression())
sclf.fit(X, y)

decision_scores = sclf.decision_function(X)
second_column_proba = sclf.predict_proba(X)[:, 1]
print("Val auc Score of Stacking: %f" % (roc_auc_score(y, second_column_proba)))

# First axes row: rlb indicator figure; second row: its sklearn-based variant.
fig, axe = plt.subplots(2, 2, figsize=(30, 20))
rlb.ComprehensiveIndicatorFigure(y, decision_scores, axe[0], 1)
rlb.ComprehensiveIndicatorSkLibFigure(y, decision_scores, axe[1])

# In[]:
# 5. ROC Curve with decision_function
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import roc_curve, auc
import numpy as np
def model_evaluation(Train,
                     Valid,
                     Test,
                     comparative,
                     bootstrap=False,
                     n_estimators: int = 200,
                     max_depth: int = 50,
                     oob_score: bool = False,
                     class_weight='balanced_subsample',
                     sampling=None,
                     label='FRAUDE',
                     model='ert'):
    """Cross-validate a classifier, pick the best F2 threshold and plot results.

    ``Train``, ``Valid`` and ``Test`` are concatenated into a single frame and
    scored with 5-fold stratified cross-validation. The out-of-fold
    positive-class probabilities are swept over 200 thresholds in [0.1, 1.0];
    recall, precision and F2 are plotted and the threshold with the highest
    F2 is used for the final confusion matrices.

    Args:
        Train, Valid, Test: DataFrames holding the ``label`` column and an
            ``'id_siniestro'`` column; every other column is used as a feature.
        comparative: DataFrame with ``'id_siniestro'`` and ``'FRAUDE'``
            columns, left-merged onto the pooled data for the second matrix.
        bootstrap, n_estimators, max_depth, oob_score, class_weight:
            forwarded to the tree ensembles built below.
        sampling: ``None`` for no resampling, ``'ALLKNN'`` for
            ``under_sampling``; any other value is passed to
            ``over_sampling(model=sampling)``. Resampling disables
            ``class_weight``.
        label: name of the target column.
        model: ``'ert'``, ``'gb'``, ``'lxgb'``, or a name starting with
            ``'stacked'`` and ending in ``'_ERT'``, ``'_GB'`` or ``'_LXGB'``
            (the suffix selects the meta-classifier).

    Returns:
        None. Results are printed and plotted.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    stacked_suffixes = ('_ERT', '_GB', '_LXGB')
    is_stacked = model.startswith('stacked')
    # Fail fast: an unknown name used to surface as an unbound-variable error
    # only after the first fold had been prepared.
    if model not in ('ert', 'gb', 'lxgb') and not (
            is_stacked and model.endswith(stacked_suffixes)):
        raise ValueError('Unsupported model: %r' % (model,))

    # F-beta with beta = 2 weights recall more heavily than precision.
    if sampling is not None:
        # Resampling already rebalances the classes.
        class_weight = None

    tresholds = np.linspace(0.1, 1.0, 200)
    scores = []
    y_pred_score = np.empty(shape=[0, 2])
    predicted_index = np.empty(shape=[
        0,
    ])

    # random_state is meaningless without shuffling and is rejected by
    # current scikit-learn, so it is not passed.
    skf = StratifiedKFold(n_splits=5, shuffle=False)
    # NOTE(review): reset_index() adds an 'index' column that is kept as a
    # feature below - confirm this is intended.
    Test = pd.concat([Train, Valid, Test], axis=0).reset_index()
    print(Test.shape)
    X = Test.drop([label] + ['id_siniestro'], axis=1)
    y = Test[[label]]

    for train_index, test_index in skf.split(X.values, y[label].values):
        X_train, X_test = X.loc[train_index].values, X.loc[test_index].values
        y_train, y_test = y.loc[train_index].values, y.loc[test_index].values

        if sampling == 'ALLKNN':
            X_train, y_train = under_sampling(X_train, y_train)
            class_weight = None
        elif sampling is not None:
            X_train, y_train = over_sampling(X_train, y_train, model=sampling)
            class_weight = None

        # Leaf/split sizes scale with the (possibly resampled) fold size.
        min_sample_leaf = round(y_train.shape[0] * 0.005)
        min_sample_split = min_sample_leaf * 10
        max_features = round(X_train.shape[1] / 3)

        if model == 'ert':
            fileModel = ensemble.ExtraTreesClassifier(
                criterion='entropy',
                bootstrap=bootstrap,
                min_samples_leaf=min_sample_leaf,
                min_samples_split=min_sample_split,
                n_estimators=n_estimators,
                max_depth=max_depth,
                max_features=max_features,
                oob_score=oob_score,
                random_state=531,
                verbose=1,
                class_weight=class_weight,
                n_jobs=-1)
        elif model == 'gb':
            fileModel = ensemble.GradientBoostingClassifier(
                loss='deviance',
                learning_rate=0.01,
                n_estimators=200,
                subsample=1.0,
                criterion='friedman_mse',
                min_samples_split=min_sample_split,
                min_samples_leaf=min_sample_leaf,
                min_weight_fraction_leaf=0.,
                max_depth=max_depth,
                min_impurity_decrease=0.,
                min_impurity_split=None,
                init=None,
                random_state=531,
                max_features=None,
                verbose=1,
                max_leaf_nodes=None,
                warm_start=False,
                presort='auto')
        elif model == 'lxgb':
            fileModel = lgbx.LGBMClassifier(
                boosting_type="gbdt",
                num_leaves=2000,
                max_depth=200,
                learning_rate=0.005,
                n_estimators=300,
                max_bin=500,
                objective='binary',
                min_split_gain=0.,
                min_child_weight=5,
                min_child_samples=min_sample_leaf,
                subsample=1.,
                subsample_freq=1,
                colsample_bytree=1.,
                reg_alpha=0.,
                reg_lambda=0.,
                random_state=531,
                n_jobs=-1,
                silent=True)
        else:
            # Stacked variants: two of the three learners feed the third.
            ERT = ensemble.ExtraTreesClassifier(
                bootstrap=bootstrap,
                n_estimators=n_estimators,
                max_depth=max_depth,
                oob_score=oob_score,
                class_weight=class_weight,
                min_samples_leaf=min_sample_leaf,
                min_samples_split=min_sample_split,
                max_features='auto',
                n_jobs=-1)
            Gboost = ensemble.GradientBoostingClassifier(
                n_estimators=n_estimators,
                learning_rate=0.005,
                max_depth=max_depth,
                loss='deviance',
                random_state=531,
                min_samples_split=min_sample_split,
                min_samples_leaf=min_sample_leaf)
            Light_Gboost = lgbx.LGBMClassifier(
                boosting_type="gbdt",
                num_leaves=2000,
                max_depth=-1,
                learning_rate=0.005,
                n_estimators=300,
                max_bin=500,
                objective='binary',
                min_split_gain=0.,
                min_child_weight=5,
                min_child_samples=min_sample_leaf,
                subsample=1.,
                subsample_freq=1,
                colsample_bytree=1.,
                reg_alpha=0.,
                reg_lambda=0.,
                random_state=531,
                n_jobs=-1,
                silent=False)
            if model.endswith('_ERT'):
                fileModel = StackingClassifier(
                    classifiers=[Gboost, Light_Gboost],
                    meta_classifier=ERT,
                    average_probas=True,
                    use_probas=True)
            elif model.endswith('_GB'):
                fileModel = StackingClassifier(
                    classifiers=[ERT, Light_Gboost],
                    meta_classifier=Gboost,
                    average_probas=True,
                    use_probas=True)
            else:  # '_LXGB', guaranteed by the validation above
                fileModel = StackingClassifier(
                    classifiers=[ERT, Gboost],
                    meta_classifier=Light_Gboost,
                    average_probas=True,
                    use_probas=True)

        fileModel.fit(X_train, y_train)
        y_pred_score_i = fileModel.predict_proba(X_test)
        y_pred_score = np.append(y_pred_score, y_pred_score_i, axis=0)
        print(y_pred_score.shape)
        print(test_index.shape)
        print(predicted_index.shape)
        predicted_index = np.append(predicted_index, test_index, axis=0)
        print(predicted_index)
        del X_train, X_test, y_train, y_test

    # Predictions were accumulated fold by fold; put them back in row order
    # so they line up with Test[label] when scored below.
    row_order = np.argsort(predicted_index, kind='stable')
    y_pred_score = y_pred_score[row_order]

    # Drop column 0 so only the second probability column remains.
    y_pred_score = np.delete(y_pred_score, 0, axis=1)
    print('min', y_pred_score.min())
    print('max', y_pred_score.max())

    y_true = Test[label].values
    for treshold in tresholds:
        y_hat = (y_pred_score > treshold).astype(int).ravel().tolist()
        scores.append([
            recall_score(y_pred=y_hat, y_true=y_true),
            precision_score(y_pred=y_hat, y_true=y_true),
            fbeta_score(y_pred=y_hat, y_true=y_true, beta=2)
        ])

    scores = np.array(scores)
    best_idx = scores[:, 2].argmax()
    print('F-Score', scores[:, 2].max(), best_idx)
    # Row (recall, precision, F2) at the best-F2 threshold.
    print('scores', scores[best_idx])
    print(scores)

    plot.plot(tresholds, scores[:, 0], label='$Recall$')
    plot.plot(tresholds, scores[:, 1], label='$Precision$')
    plot.plot(tresholds, scores[:, 2], label='$F_2$')
    plot.ylabel('Score')
    plot.xlabel('Threshold')
    plot.legend(loc='best')
    plot.show()
    plot.close()

    final_tresh = tresholds[best_idx]
    print(final_tresh)
    y_hat_test = (y_pred_score > final_tresh).astype(int).ravel().tolist()

    Test['id_siniestro'] = Test['id_siniestro'].map(int)
    comparative['id_siniestro'] = comparative['id_siniestro'].map(int)
    # NOTE(review): the columns read below only exist if the pooled data has
    # 'FRAUDE_Clusters' and does not already carry its own 'FRAUDE' column
    # (otherwise the merge suffixes it) - confirm against callers.
    Test = pd.merge(Test,
                    comparative[['id_siniestro', 'FRAUDE']],
                    how='left',
                    on='id_siniestro')

    cnf_matrix = confusion_matrix(Test['FRAUDE_Clusters'].values, y_hat_test)
    plot_confusion_matrix(cnf_matrix,
                          classes=['Normal', 'Abnormal'],
                          title='Confusion matrix')

    cnf_matrix = confusion_matrix(Test['FRAUDE'].values, y_hat_test)
    plot_confusion_matrix(cnf_matrix,
                          classes=['Unknown', 'Fraud'],
                          title='Confusion matrix')
    return None
# # Assignment
# * Ensemble generalization for classification differs a lot from regression.
# Since blending for classification works best on probabilities,
# how should classification stacking be written so that the first-layer
# models output probabilities as features?

# In[14]:

from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(subsample=0.70,
                                            n_estimators=15,
                                            max_features='sqrt',
                                            max_depth=4,
                                            learning_rate=0.3)
""" Your Code Here """
# NOTE(review): use_probas is left at its default here, so the question
# above may still be unanswered - confirm whether use_probas=True is wanted.
stacking = StackingClassifier(classifiers=[gdbt, rf],
                              meta_classifier=meta_estimator)

# In[15]:

stacking.fit(df, train_Y)
stacking_pred = stacking.predict_proba(df2)
sub = pd.DataFrame({'name': ids, 'poi': stacking_pred[:, 1]})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
sub = pd.concat([sub, df3], ignore_index=True)
sub.to_csv('poi_stacking_4.csv', index=False)

# In[ ]:
def _model_constructor(self):
    """Fit a stacked tree ensemble on the custom training features.

    Labels are read from the ``is_duplicate`` column of ``TRAIN_DATA_FILE``.
    The training set is rebalanced by keeping the first 80% of the positive
    rows and including the negative rows twice. Two random forests and two
    extra-trees classifiers are stacked under a logistic-regression
    meta-classifier that consumes their class probabilities.

    Returns:
        tuple: ``(sclf, train_loss)`` - the fitted ``StackingClassifier``
        (also stored on ``self.model``) and its log loss measured on the
        same rebalanced training data it was fitted on.
    """
    ########################################
    ## sample train/validation data
    ########################################
    print("Loading train data...")
    X_train = self.train_custom_features

    df_train = pd.read_csv(TRAIN_DATA_FILE, encoding="utf-8")
    y_train = df_train['is_duplicate'].values

    # Up/down sampling: negatives twice, first 80% of positives once.
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    pos_kept = pos_train.iloc[:int(0.8 * len(pos_train))]
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    X_train = pd.concat((neg_train, pos_kept, neg_train)).values
    y_train = np.array([0] * neg_train.shape[0] +
                       [1] * pos_kept.shape[0] +
                       [0] * neg_train.shape[0])
    print("New duplicate content:", np.mean(y_train))
    del pos_train, neg_train, pos_kept

    ESTIMATORS = 180

    # Two MLPClassifier base learners were previously listed here, disabled.
    self.clfs = [
        RandomForestClassifier(n_estimators=ESTIMATORS,
                               n_jobs=-1,
                               criterion='entropy',
                               verbose=1),
        RandomForestClassifier(n_estimators=ESTIMATORS,
                               n_jobs=-1,
                               criterion='gini',
                               verbose=1,
                               warm_start=True),
        ExtraTreesClassifier(n_estimators=ESTIMATORS,
                             n_jobs=-1,
                             criterion='gini',
                             verbose=1),
        ExtraTreesClassifier(n_estimators=ESTIMATORS,
                             n_jobs=-1,
                             criterion='entropy',
                             verbose=1,
                             warm_start=True)]

    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=self.clfs,
                              use_probas=True,
                              average_probas=False,
                              verbose=2,
                              meta_classifier=lr)
    sclf.fit(X_train, y_train)
    self.model = sclf

    # Despite its name this is a training loss, not a validation score.
    bst_val_score = log_loss(y_train, sclf.predict_proba(X_train)[:, 1])
    print("Model train loss:", bst_val_score)
    return (sclf, bst_val_score)
model_final.fit(train_stack,train_y) pre=model_final.predict_proba(test_stack) model1=RandomForestClassifier(n_estimators = 100) model2=lgb.LGBMClassifier(boosting_type='gbdt', objective='multiclass', num_class=4, learning_rate=0.1, n_estimators=100, num_leaves=124, max_depth=13, bagging_fraction=0.66, feature_fraction=0.88, bagging_freq=66, min_data_in_leaf=86, min_child_weight=8.8, min_split_gain=0.02, reg_lambda=3, reg_alpha=6.7, n_jobs= 8 ) model3= MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,solver='sgd') model_final=RandomForestClassifier(n_estimators = 100) sclf = StackingClassifier(classifiers=[model1, model2, model3], meta_classifier=model_final,use_probas=True,average_probas=False) sclf.fit(train_X,train_y) temp=sclf.predict_proba(X_test) # 输出预测结果 result=pd.read_csv('sample_submit.csv') result['label_0']=temp[:,0] result['label_1']=temp[:,1] result['label_2']=temp[:,2] result['label_3']=temp[:,3] result.to_csv('submit1.csv',index=False)
def predictor_ev():
    """Stack neural-net, XGBoost and logistic-regression models (Python 2).

    Builds five Keras networks, five XGBoost classifiers and one multinomial
    logistic regression, stacks them with ``StackingClassifier`` using their
    class probabilities, fits on ``X_train_ev``/``y_train_ev``, prints the log
    loss on ``X_cv_ev``/``y_cv_ev`` and returns the stacked probabilities for
    ``X_test_ev``. All data sets are read from the enclosing module scope.
    """
    print "Building Neural Net classifiers for devices with events"
    n_input = X_train_ev.shape[1]
    n_train = X_train_ev.shape[0]  # not used below

    # PReLU and ModelCheckpoint are imported but not used below.
    from keras.models import Sequential
    from keras.layers import Dense, Activation
    from keras.layers.core import Dropout
    from keras.layers.advanced_activations import PReLU
    from keras.regularizers import l2
    from keras.optimizers import Adadelta
    from keras.optimizers import SGD
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.callbacks import ModelCheckpoint

    def create_model(n_hidden_layers=1, nodes=[50], reg=1.0, dropouts=[.5], acts=['relu']):
        """Build and compile a dense softmax network with 12 outputs.

        ``nodes``, ``dropouts`` and ``acts`` are indexed per hidden layer, so
        each needs at least ``n_hidden_layers`` entries. ``reg`` is the L2
        weight-regularisation strength applied to every dense layer.
        """
        n_in = n_input
        model = Sequential()
        for i in xrange(n_hidden_layers):
            n_out = nodes[i]
            dropout = dropouts[i]
            act = acts[i]
            model.add(Dense(output_dim=n_out, input_dim=n_in, W_regularizer=l2(reg)))
            model.add(Activation(act))
            model.add(Dropout(dropout))
            n_in = n_out
        model.add(Dense(output_dim=12, W_regularizer=l2(reg)))
        model.add(Activation("softmax"))
        # Compile model
        adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
        # The SGD optimizer is constructed but only adadelta is passed to compile.
        sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
        return model

    class KerasClassifier2(KerasClassifier):
        """KerasClassifier wrapper with fixed 12-class metadata for stacking."""

        def __init__(self, build_fn, fn_args, random_state=0, nb_epoch=10, batch_size=500, verbose=2):
            # NOTE(review): random_state is stored but nothing visible here
            # seeds Keras/NumPy with it - confirm whether it has any effect.
            self.random_state = random_state
            self.nb_epoch = nb_epoch
            self.batch_size = batch_size
            self.verbose = verbose
            super(KerasClassifier2, self).__init__(build_fn, **fn_args)
            self.classes_= np.arange(12)
            self.n_classes_ = 12
            self.model = build_fn(**fn_args)

        def fit(self, X, y, sample_weight=None):
            # Labels go through indicator() (defined elsewhere) before
            # fitting; the CV set is passed as validation data.
            return super(KerasClassifier2, self).fit(X, indicator(y), verbose = self.verbose, sample_weight=sample_weight, validation_data=(X_cv_ev, indicator(y_cv_ev)), nb_epoch=self.nb_epoch, batch_size=self.batch_size)

        def predict_proba(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)

        def predict(self, X):
            # NOTE(review): returns probabilities, not class labels - confirm
            # this is deliberate (the stack below uses use_probas=True).
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)

    # Architectures for the five networks (one entry per hidden layer).
    nn1_args = {'n_hidden_layers': 2, 'nodes': [600, 400], 'reg': 1.8,
                'dropouts': [.3, .4], 'acts': ['relu', 'relu']}
    nn2_args = {'n_hidden_layers': 3, 'nodes': [300, 100, 50], 'reg': 2.0,
                'dropouts': [.2, .4, .5], 'acts': ['relu', 'relu', 'relu']}
    nn3_args = {'n_hidden_layers': 4, 'nodes': [1001, 511, 245, 99], 'reg': 2.0,
                'dropouts': [.2, .3, .2, .3], 'acts': ['relu', 'relu', 'relu', 'relu']}
    nn4_args = {'n_hidden_layers': 1, 'nodes': [500], 'reg': 1.2,
                'dropouts': [.25], 'acts': ['relu']}
    nn5_args = {'n_hidden_layers': 5, 'nodes': [1343, 1012, 757, 539, 117], 'reg': 2.5,
                'dropouts': [.2, .3, .4, .4, .4], 'acts': ['relu', 'relu', 'relu', 'relu', 'relu']}

    clfNN1 = KerasClassifier2(create_model, nn1_args, random_state=5, nb_epoch=5)
    clfNN2 = KerasClassifier2(create_model, nn2_args, random_state=23, nb_epoch=11)
    clfNN3 = KerasClassifier2(create_model, nn3_args, random_state=710, nb_epoch=6)
    clfNN4 = KerasClassifier2(create_model, nn4_args, random_state=5072, nb_epoch=6)
    clfNN5 = KerasClassifier2(create_model, nn5_args, random_state=2016, nb_epoch=12)

    print "Building XGBoost classifiers for devices with events"
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster": "gblinear",
        "max_depth": 6,
        "eval_metric": "mlogloss",
        "eta": 0.07,
        "silent": 1,
        "alpha": 3.5,
    }

    class XGBClassifier2(xgb.XGBClassifier):
        """XGBClassifier with defaults taken from xgb_params and CV-set eval."""

        def __init__(self, max_depth=xgb_params['max_depth'], objective='multi:softprob', missing=None, learning_rate=xgb_params['eta'], n_estimators=40, subsample=1, reg_alpha=xgb_params['alpha'], seed=2016, booster='gblinear'):
            super(XGBClassifier2, self).__init__(max_depth=max_depth, seed=seed, objective=objective, missing=missing, learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, reg_alpha=reg_alpha)
            # NOTE(review): the booster argument is ignored; self.booster is
            # always xgb_params['booster'] ('gblinear'), so the 'gbtree' and
            # 'dart' requests below do not reach this attribute - confirm intent.
            self.booster = xgb_params['booster']

        def fit(self, X, y):
            # X is converted with .tocsc(); the result of the parent fit is
            # not returned, so this method returns None.
            super(XGBClassifier2, self).fit(X.tocsc(), y, eval_metric='mlogloss', eval_set=[(X_cv_ev.tocsc(), y_cv_ev)])

    gbm1 = XGBClassifier2(seed=0, booster='gblinear', n_estimators=28)
    gbm2 = XGBClassifier2(seed=6, booster='gblinear', n_estimators=28)
    gbm3 = XGBClassifier2(seed=151, booster='gbtree', n_estimators=28)
    gbm4 = XGBClassifier2(seed=1047, booster='gbtree', n_estimators=28)
    gbm5 = XGBClassifier2(seed=22, booster='dart', n_estimators=28)

    print "Building Logistic Regression classifier for devices with events"
    clfLR = LogisticRegression(C=.02, random_state=2016, multi_class='multinomial', solver='newton-cg')

    #Combine results of classifiers
    print "Stacking classifiers for devices with events"
    clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
    meta = LogisticRegression()
    stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)
    stack.fit(X_train_ev, y_train_ev)
    # Log loss of the stack on the cross-validation set.
    print log_loss(y_cv_ev, stack.predict_proba(X_cv_ev))
    y_pred_ev = stack.predict_proba(X_test_ev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
    return y_pred_ev
meta_classifier=clf_lg) label = ['stacking'] sclf.fit(X_train_standar, y_train) score_stacking = cross_val_score(sclf, X_train_standar, y_train, scoring='accuracy') cross_val_score(sclf, X_train_standar, y_train, scoring='f1') score_mean_sclf = score_stacking.mean() print('stacking final score\'s mean is % .2f' % score_mean_sclf) print('accuracy: %.2f (+/- %.2f) [%s]' % (score_stacking.mean(), score_stacking.std(), label)) result_stacking = sclf.predict(X_test_stander) result_stacking_proba = sclf.predict_proba(X_test_stander) clf_stacking_test_score = sclf.score(X_test_stander, y_test) precision, recall, thresholds = precision_recall_curve(y_test, sclf.predict(X_test)) report = result_stacking_proba[:, 1] >= 0.8 print(classification_report(y_test, report, target_names=['0', '1'])) # ============================================================================== # 模型持久化 # os.chdir(u'D:\【01】行健金融\【01】数据中心\【05】数据分析项目\【03】2018\May\规则引擎_分期商城_风控+授信') # joblib.dump(sclf, 'stackingpkl.pkl') # joblib.dump(scaler, 'scaler.pkl') # ==============================================================================
def stack_test(train_x, train_y, test_x, test_y):
    """Fit a stacked classifier and report its ROC AUC on held-out data.

    Three LightGBM classifiers (gbdt, dart, gbdt) are the base learners and
    an XGBoost classifier is the meta-classifier; base-learner probabilities
    are averaged before being passed to it.

    Args:
        train_x, train_y: training features and binary labels.
        test_x, test_y: evaluation features and labels used for the AUC.

    Returns:
        The fitted ``StackingClassifier``. The AUC is printed and also
        written to ``auc_score.txt`` under the module-level ``dir_path``.
    """
    print("start stacking test")

    # Settings shared by all three LightGBM base learners.
    lgbm_params = dict(num_leaves=50,
                       reg_alpha=0.0,
                       reg_lambda=1,
                       max_depth=-1,
                       n_estimators=2000,
                       objective='binary',
                       min_child_weight=50,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       subsample_freq=1,
                       learning_rate=0.1,
                       random_state=2018,
                       n_jobs=-1)
    clf1 = lgb.LGBMClassifier(boosting_type='gbdt', **lgbm_params)
    clf2 = lgb.LGBMClassifier(boosting_type='dart', **lgbm_params)
    clf3 = lgb.LGBMClassifier(boosting_type='gbdt', **lgbm_params)
    clf4 = XGBClassifier(max_depth=5,
                         learning_rate=0.1,
                         n_estimators=2000,
                         objective='binary:logistic',
                         booster='gbtree',
                         n_jobs=-1,
                         min_child_weight=50,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         random_state=2018)

    stack_clf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   meta_classifier=clf4,
                                   use_probas=True,
                                   average_probas=True,
                                   verbose=1)
    stack_clf.fit(train_x, train_y)

    pred_score = stack_clf.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, pred_score)

    message = "auc score is {}".format(auc_score)
    # The context manager closes (and flushes) the report file; the handle
    # was previously left open.
    with open(dir_path + r'/auc_score.txt', 'w') as output:
        print(message, file=output)
    print(message)
    return stack_clf
class cls_model_stack():
    """Stack several ``cls_model`` classifiers under a meta-classifier.

    Args:
        listModelName: names of the base models to fit; names outside the
            supported list in ``fit`` are silently skipped.
        isGridSearch: forwarded to each ``cls_model``.
        dict_para: optional mapping ``model name -> parameters`` handed to
            ``cls_model.set_parameters``. Defaults to an empty dict.
        meta_reg: meta-classifier kind, ``'logistic'`` or ``'knn'``.
    """

    def __init__(self, listModelName, isGridSearch=True, dict_para=None, meta_reg='logistic'):
        self.listModelName = listModelName
        self.isGridSearch = isGridSearch
        # A fresh dict per instance; a literal ``{}`` default would be shared
        # by every instance created without this argument.
        self.dict_para = {} if dict_para is None else dict_para
        self.meta_reg = meta_reg
        # Defaults
        self.train_model = defaultdict(list)
        self.stack = None

    def fit(self, x, y):
        """Fit every requested base model, then the stacking classifier.

        Args:
            x: feature frame; ``x.values`` is passed to the stack.
            y: target; ``y.values`` is flattened to one dimension.

        Raises:
            ValueError: if ``meta_reg`` is neither ``'logistic'`` nor ``'knn'``.
        """
        # Resolve the meta-classifier first so a bad name fails before any
        # base model is trained.
        if self.meta_reg == 'logistic':
            meta_cls = linear_model.LogisticRegression()
        elif self.meta_reg == 'knn':
            meta_cls = KNeighborsClassifier()
        else:
            raise ValueError('Unsupported meta_reg: %r' % (self.meta_reg,))

        model_list = []
        basic_cls = ['logistic', 'knn', 'svm', 'dt', 'rf', 'adaBoost', 'gbm', 'xgb', 'bp']

        for model_name in self.listModelName:
            if model_name not in basic_cls:
                continue
            cls = cls_model(model_name, isGridSearch=self.isGridSearch)
            if model_name in self.dict_para:
                # Apply the user-supplied parameter ranges for this model.
                cls.set_parameters(self.dict_para[model_name])
            # Fit the base model
            cls.fit(x, y)
            model_list.append(cls.cls_model)
            self.train_model[model_name] = cls

        self.stack = StackingClassifier(classifiers=model_list, meta_classifier=meta_cls)
        self.stack.fit(x.values, y.values.reshape(len(y)))

    def predict(self, x):
        """Return the stack's class predictions for ``x``."""
        return self.stack.predict(x)

    def get_vip(self, stack_method='avg', isplot=True):
        """Combine the variable importances of the fitted base models.

        Args:
            stack_method: ``'avg'`` averages the per-model importances
                row-wise and sorts them; any other value leaves the
                per-model results as a list.
            isplot: draw a bar chart of the result when one is available.

        Returns:
            A one-column DataFrame (``'avg'``), the raw list (other
            methods), or ``None`` when no base model reports importances.
        """
        res = []
        idx = []
        for i, key in enumerate(self.train_model):
            vip = self.train_model[key].get_vip(isplot=False)
            if vip is not None:
                res.append(vip)
                idx.append(i)

        # Merge the results of the different models
        if len(res) == 0:
            res = None
        else:
            temp = pd.concat(res, axis=1)
            if stack_method == 'avg':
                res = temp.mean(axis=1).sort_values()
                res = pd.DataFrame(res, columns=['variable importance'])

        # Bar chart; skipped when there is nothing to draw.
        if isplot and res is not None:
            bar_plot = Data_plot.plot_bar_analysis(res)
            bar_plot.title('variable importance')
            bar_plot.show()
        return res

    def predict_proba(self, x):
        """Return class probabilities for ``x``, or ``None`` on failure.

        Failures inside the stack's ``predict_proba`` are reported as
        ``None`` (best effort); interpreter-exit signals such as
        ``KeyboardInterrupt`` are no longer swallowed.
        """
        x_pred = np.array(x)
        try:
            return self.stack.predict_proba(x_pred)
        except Exception:
            return None
def stack_test(train_x, train_y, predict_x, res):
    """Train the stacked model and write the submission scores.

    Base learners are three LightGBM classifiers (gbdt, dart, rf) and two
    MLPs (adam, sgd with inverse-scaling learning rate); a LightGBM gbdt
    model is the meta-classifier fed with the base-learner probabilities.

    Args:
        train_x, train_y: training features and binary labels.
        predict_x: features to score.
        res: frame that receives the ``'score'`` column and is written to
            ``submission.csv`` under the module-level ``dir_path``.

    Returns:
        The fitted ``StackingClassifier``.
    """
    print("start test")

    # Hyper-parameters common to every LightGBM model in this stack.
    lgbm_common = dict(num_leaves=31,
                       reg_alpha=0.0,
                       reg_lambda=1,
                       max_bin=150,
                       max_depth=-1,
                       n_estimators=500,
                       objective='binary',
                       subsample=0.8,
                       colsample_bytree=0.8,
                       subsample_freq=1,
                       learning_rate=0.1,
                       random_state=2018,
                       n_jobs=-1)
    # Hyper-parameters common to both MLPs.
    mlp_common = dict(hidden_layer_sizes=(100, 100, 100),
                      activation='relu',
                      alpha=0.001,
                      random_state=2018,
                      learning_rate_init=0.1)

    clf1 = lgb.LGBMClassifier(boosting_type='gbdt', **lgbm_common)
    clf2 = lgb.LGBMClassifier(boosting_type='dart', **lgbm_common)
    clf3 = MLPClassifier(solver='adam', **mlp_common)
    clf4 = lgb.LGBMClassifier(boosting_type='rf', **lgbm_common)
    clf5 = MLPClassifier(learning_rate='invscaling', solver='sgd', **mlp_common)
    # Disabled alternatives kept for reference:
    # clf5=XGBClassifier(
    #     max_depth=5, learning_rate=0.1, n_estimators=500,
    #     objective='binary:logistic', booster='gbtree',
    #     n_jobs=-1, min_child_weight=5,scale_pos_weight=10,
    #     subsample=0.8, colsample_bytree=0.8,
    #     reg_alpha=0, reg_lambda=1,random_state=2018
    # )
    # clf6=XGBClassifier(
    #     max_depth=5, learning_rate=0.1, n_estimators=500,
    #     objective='binary:logistic', booster='gbtree',
    #     n_jobs=-1, min_child_weight=5,
    #     subsample=0.8, colsample_bytree=0.8,
    #     reg_alpha=0, reg_lambda=1,random_state=2018
    # )
    clf6 = lgb.LGBMClassifier(boosting_type='gbdt', **lgbm_common)

    base_learners = [clf1, clf2, clf3, clf4, clf5]
    stack_clf = StackingClassifier(classifiers=base_learners,
                                   meta_classifier=clf6,
                                   use_probas=True,
                                   verbose=1)
    stack_clf.fit(train_x, train_y)

    positive_proba = stack_clf.predict_proba(predict_x)[:, 1]
    res['score'] = positive_proba
    # Round-trip through a 6-decimal string to fix the written precision.
    res['score'] = res['score'].apply(lambda value: float('%.6f' % value))
    res.to_csv(dir_path + r'/submission.csv', index=False)
    # Disabled cross-validation report kept for reference:
    # for clf, label in zip([clf1, clf2, clf3, clf4,stack_clf], ['lgbm1', 'lgbm2', 'mlp','lgbm3', 'stack_clf']):
    #     scores =cross_val_score(clf, train_x, train_y, cv=4, scoring='f1')
    #     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    return stack_clf
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Stack two column-restricted logistic regressions on the iris data set."""

# ---------------------------------------------------------------- packages
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

# -------------------------------------------------------------------- data
iris = datasets.load_iris()
x = iris.data
y = iris.target

# ------------------------------------------------------------- classifiers
# Each base learner only sees its own pair of feature columns.
pipe1 = make_pipeline(ColumnSelector(cols=(0, 1)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(2, 3)), LogisticRegression())
sclf = StackingClassifier(
    classifiers=[pipe1, pipe2],
    meta_classifier=LogisticRegression(),
)

# ---------------------------------------------------------- fit and predict
sclf.fit(x, y)
predicted_labels = sclf.predict(x)
print(predicted_labels)

# ------------------------------------------------------ class probabilities
predicted_probabilities = sclf.predict_proba(x)
print(predicted_probabilities)
Test_lg=Test_df[Concat_df_list].values #xgboost,lr模型stacking融合 clf= XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=80, silent=True, objective="binary:logistic", booster='gbtree',min_child_weight=3,subsample=0.8, gamma=0) from sklearn.linear_model import LogisticRegression clf2= LogisticRegression(C=0.1, penalty='l2', tol=1e-4) from sklearn.ensemble import RandomForestClassifier clf4= RandomForestClassifier(n_estimators=400,oob_score=True) from sklearn.ensemble import VotingClassifier from mlxtend.classifier import StackingClassifier eclf= StackingClassifier(classifiers=[clf,clf2], meta_classifier=LogisticRegression(C=0.1, penalty='l2', tol=1e-4), use_probas=True, verbose=3) eclf.fit(Train_lg, Train_label) R=eclf.predict_proba(Test_lg) instance_id_list=Test['cust_id'].values with open('1.csv','w') as f: f.write('cust_id,pred_prob\n') for i in range(len(instance_id_list)): f.write('%d,%f\n'%(instance_id_list[i],float(R[i][1])))
class FraudModel(object):
    """Fraud classifier combining a random forest, naive Bayes and a stack.

    Depending on the boolean switches, ``fit`` trains a random forest on the
    tabular features (optionally extended with K-means cluster distances of
    the TF-IDF description vectors), a multinomial naive Bayes model on the
    TF-IDF vectors, and attempts to fit a stacking classifier over both.
    """

    def __init__(self, alpha=0.1, n_jobs=-1, max_features='sqrt', n_estimators=1000, RandomForest=True, KMeansFeatures=True, NaiveBayes=True):
        """
        INPUT:
        - alpha = Additive Laplace smoothing parameter for NaiveBayes
        - n_jobs = Number of jobs to run RFC on
        - max_features = Number of features to consider on RFC
        - n_estimators = Number of trees in RFC
        - RandomForest = Bool, run RFC
        - KMeansFeatures = Bool, include K-means features in RFC
        - NaiveBayes = Bool, run MNB

        ATTRIBUTES:
        - RFC = Random Forest Classifier
        - MNB = Multinomial Naive Bayes Classifier
        - LogR = Logistic Regression used as the stack's meta-classifier
        - STK = StackingClassifier over RFC and MNB (uses probabilities)
        """
        self.RFC = RandomForestClassifier(n_jobs=n_jobs, max_features=max_features, n_estimators=n_estimators)
        self.MNB = MultinomialNB(alpha=alpha)
        self.LogR = LogisticRegression()
        self.STK = StackingClassifier(classifiers=[self.RFC, self.MNB], meta_classifier=self.LogR, use_probas=True)
        self.RandomForest = RandomForest
        self.KMeansFeatures = KMeansFeatures
        self.NaiveBayes = NaiveBayes

    def fit(self, X, y):
        """
        INPUT:
        - X: dataframe representing feature matrix for training data
        - y: series representing labels for training data
        """
        # NLP: TF-IDF vectors of the HTML-stripped description, needed by
        # both the K-means features and the naive Bayes model.
        if self.KMeansFeatures == True or self.NaiveBayes == True:
            desc_no_html = update_data_frame(X)
            self.tfidf = TfidfVectorizer(stop_words='english', max_features=10)
            word_counts = self.tfidf.fit_transform(
                desc_no_html['description_no_HTML'])
        if self.KMeansFeatures == True:
            # K-means: cluster the TF-IDF vectors and keep the centers so
            # predict_proba can compute the same distance features.
            desc_kmeans = KMeans(n_clusters=5, random_state=56, n_jobs=-1)
            desc_kmeans.fit(word_counts)
            self.cluster_centers = desc_kmeans.cluster_centers_
            X_cluster = compute_cluster_distance(word_counts, self.cluster_centers)
            RF_X = pd.merge(X_cluster, X, left_index=True, right_index=True).drop(columns=['description'])
        else:
            RF_X = X.drop(columns=['description'])
        # Random Forest
        if self.RandomForest == True:
            # Random Forest
            self.RFC.fit(RF_X, y)
        if self.NaiveBayes == True:
            # Naive Bayes
            self.MNB.fit(word_counts, y)
        # Stacked Classifier
        if self.RandomForest == True and self.NaiveBayes == True:
            # NOTE(review): make_pipeline is handed data (RF_X / word_counts)
            # as a pipeline step, and STK.fit receives y in the X position
            # plus a 'classifiers' keyword - this does not look like a valid
            # pipeline or StackingClassifier.fit call; confirm and redesign.
            RFCpipeline = make_pipeline(RF_X, self.RFC)
            MNBpipeline = make_pipeline(word_counts, self.MNB)
            self.STK.fit(y, classifiers=[RFCpipeline, MNBpipeline])

    def predict_proba(self, X):
        """
        INPUT:
        - X: dataframe representing feature matrix for data

        OUTPUT:
        - class probabilities from the RFC alone, the MNB alone, or the
          stacked classifier when both are enabled; nothing is returned
          (None) when both RandomForest and NaiveBayes are disabled
        """
        if self.KMeansFeatures == True or self.NaiveBayes == True:
            desc_no_html = update_data_frame(X)
            # Reuse the vectorizer fitted in fit().
            word_counts = self.tfidf.transform(
                desc_no_html['description_no_HTML'])
        if self.KMeansFeatures == True:
            X_cluster = compute_cluster_distance(word_counts, self.cluster_centers)
            RF_X = pd.merge(X_cluster, X, left_index=True, right_index=True).drop(columns=['description'])
        else:
            RF_X = X.drop(columns=['description'])
        if self.RandomForest == True and self.NaiveBayes == False:
            RFC_preds = self.RFC.predict_proba(RF_X)
            return RFC_preds
        elif self.RandomForest == False and self.NaiveBayes == True:
            NB_preds = self.MNB.predict_proba(word_counts)
            return NB_preds
        elif self.RandomForest == True and self.NaiveBayes == True:
            # NOTE(review): the stack is given the raw X, not RF_X or
            # word_counts as used for the individual models - confirm.
            STK_preds = self.STK.predict_proba(X)
            return STK_preds

    def _log_loss(
        self,
        y_true,
    ):
        # Placeholder: not implemented.
        pass
'meta-logisticregression__C': C } fit_params = {"early_stopping_rounds": 100} grid = RandomizedSearchCV(grid, n_jobs=n_jobs, param_distributions=params, verbose=3, n_iter=n_iter_search, cv=cv) print('fitting') grid.fit(x, y) joblib.dump(grid, 'export/trend_model_random_%s.pkl' % version) predicted = grid.predict_proba(x) predicted = list(map(lambda x: x[1], predicted)) print('trian roc: ', roc_auc_score(y, predicted)) print('val roc: ', grid.best_score_) print('best params: ', grid.best_params_) if oversamp: predicted = pd.Series(grid.predict_proba(test.as_matrix())[:, 1]) else: predicted = pd.Series(grid.predict_proba(test)[:, 1]) predicted.index = test_ids predicted.to_csv('export/trend_predict_random_%s_%s.csv' % (version, int(time.time()))) print('cost time: ', time.time() - a)
print("Support vector machines : Log Loss: %0.2f" % (log_loss(cv_y, sig_clf2.predict_proba(cv_x_onehotCoding)))) sig_clf3.fit(train_x_onehotCoding, train_y) print("Naive Bayes : Log Loss: %0.2f" % (log_loss(cv_y, sig_clf3.predict_proba(cv_x_onehotCoding)))) print("-" * 50) alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10] best_alpha = 999 for i in alpha: lr = LogisticRegression(C=i) sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3], meta_classifier=lr, use_probas=True) sclf.fit(train_x_onehotCoding, train_y) print("Stacking Classifer : for the value of alpha: %f Log Loss: %0.3f" % (i, log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding)))) log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding)) if best_alpha > log_error: best_alpha = log_error lr = LogisticRegression(C=0.1) sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3], meta_classifier=lr, use_probas=True) sclf.fit(train_x_onehotCoding, train_y) log_error = log_loss(train_y, sclf.predict_proba(train_x_onehotCoding)) print("Log loss (train) on the stacking classifier :", log_error) log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding)) print("Log loss (CV) on the stacking classifier :", log_error)
class_weight='balanced', max_iter=10) modelDT = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=5, min_samples_split=2) modelXGB = XGBClassifier(max_depth=2, gamma=2, eta=0.8, reg_alpha=0.5, reg_lambda=0.5) #turn these datasets to scalers for consistent fitting scaler = StandardScaler() normX = scaler.fit_transform(normX) X_eval = scaler.fit_transform(X_fs_eval) #stack the classifiers using mlxtend, make LR model the meta_classifier to give it more weight m = StackingClassifier(classifiers=[modelLR, modelDT, modelXGB], use_probas=True, meta_classifier=modelLR) #fit the model and save the predictions m.fit(normX, normY) pred = m.predict_proba(X_fs_eval)[:, 1] #save the results into the file submission = pd.read_csv('sample_submission.csv') submission['target'] = pred submission.to_csv('sample_submission.csv', index=False)