import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

import model_validation


def random_forest_churn(X_train, y_train, X_test, y_test):
    # load feature names (one per line) from the training conf file
    fea_path = "conf/u_plan_train_conf"
    col_names = []
    with open(fea_path, 'r') as file:
        for line in file:
            col_names.append(line.strip('\n'))

    # model construction
    estimator_RF = RandomForestClassifier(n_estimators=116,
                                          max_depth=18,
                                          max_features=10,
                                          random_state=0,
                                          n_jobs=-1)

    # model training
    model_RF = estimator_RF.fit(X_train, y_train)

    # model prediction
    predict_RF = model_RF.predict(X_test)
    y_predict_proba = model_RF.predict_proba(X_test)

    # model validation
    accu_scr, prec_scr, rec_scr, f1_scr = model_validation.model_valid(y_test, predict_RF)
    print("RandomForest model accuracy : " + str(accu_scr))
    print("RandomForest model precision : " + str(prec_scr))
    print("RandomForest model recall : " + str(rec_scr))
    print("RandomForest model F1 score : " + str(f1_scr))

    # generate roc
    fpr, tpr, thresholds = roc_curve(y_test, y_predict_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    # get RF feature importance, paired with feature names and sorted descending
    importance_score_list = []
    for i in range(len(model_RF.feature_importances_)):
        importance_score_list.append([model_RF.feature_importances_[i], col_names[i]])
    importance_score_list = sorted(importance_score_list, key=lambda x: x[0], reverse=True)

    # print RF importance features
    print("=" * 50)
    print("RandomForest feature importance")
    for i in range(len(importance_score_list)):
        print(importance_score_list[i][1] + '\t' + str(importance_score_list[i][0]))
    print("=" * 50)

    # # scores
    # score_RF_train = model_RF.score(X_train, y_train)
    # score_RF_test = model_RF.score(X_test, y_test)
    # # feature importance plot
    # fig, ax = plt.subplots(figsize=(7, 5))
    # ax.bar(col_names, model_RF.feature_importances_)
    # ax.set_title("Feature Importances")
    # fig.savefig('pic/RF_feature_importance.png')

    return fpr, tpr, roc_auc
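# All three trainers in this module unpack a four-tuple from
# model_validation.model_valid, whose source is not part of this file.
# Below is a minimal sketch of that helper, assuming it simply wraps the
# standard sklearn.metrics scorers and returns (accuracy, precision,
# recall, f1) in that order -- the real module may do more (e.g. print a
# confusion matrix), so treat this as illustrative only.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def model_valid_sketch(y_true, y_pred):
    # assumes binary labels in {0, 1}; positive class is 1
    accu_scr = accuracy_score(y_true, y_pred)
    prec_scr = precision_score(y_true, y_pred)
    rec_scr = recall_score(y_true, y_pred)
    f1_scr = f1_score(y_true, y_pred)
    return accu_scr, prec_scr, rec_scr, f1_scr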
def lightGBM_churn(X_train, y_train, X_test, y_test):
    # model construction
    lgb_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                   num_leaves=63,
                                   max_depth=-1,
                                   learning_rate=0.1,
                                   n_estimators=200,
                                   max_bin=255,
                                   subsample_for_bin=2000,
                                   objective=None,
                                   min_split_gain=0.0,
                                   min_child_weight=2,
                                   scale_pos_weight=2,
                                   min_child_samples=20,
                                   subsample=0.8,
                                   subsample_freq=1,
                                   colsample_bytree=0.8,
                                   reg_alpha=1,
                                   reg_lambda=1,
                                   random_state=12,
                                   n_jobs=8,
                                   silent=True)

    # model training with early stopping on the held-out set
    lgb_churn_model = lgb_model.fit(X_train, y_train,
                                    eval_metric='auc',
                                    eval_set=[(X_test, y_test)],
                                    early_stopping_rounds=100)

    # model prediction
    y_pred = lgb_churn_model.predict(X_test)
    y_pred_proba = lgb_churn_model.predict_proba(X_test)

    # model validation
    accu_scr, prec_scr, rec_scr, f1_scr = model_validation.model_valid(y_test, y_pred)
    print("lightGBM model accuracy : " + str(accu_scr))
    print("lightGBM model precision : " + str(prec_scr))
    print("lightGBM model recall : " + str(rec_scr))
    print("lightGBM model F1 score : " + str(f1_scr))

    # generate roc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    return fpr, tpr, roc_auc
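# Version note (assumption about the lightgbm release in use): passing
# early_stopping_rounds to fit() and silent=True to the constructor is the
# pre-4.0 API. On lightgbm >= 4.0 the same early stopping is configured
# through callbacks instead; a sketch of the equivalent call:
#
# lgb_churn_model = lgb_model.fit(X_train, y_train,
#                                 eval_metric='auc',
#                                 eval_set=[(X_test, y_test)],
#                                 callbacks=[lgb.early_stopping(stopping_rounds=100)])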
def xgboost_churn(X_train, y_train, X_test, y_test):
    # load conf: 'y' is the label column, the conf file lists the feature names
    name = "xgboost_churn"
    fea_path = "conf/u_plan_train_conf"
    col_names = ['y']
    with open(fea_path, 'r') as file:
        for line in file:
            col_names.append(line.strip('\n'))
    real_fea_names = col_names[1:]
    X_df = pd.DataFrame(X_train, columns=real_fea_names)  # used by the explainer block below

    # model training
    dtrain = xgb.DMatrix(X_train, y_train)
    lmda = 1.0  # L2 regularization for the explainer model below
    params = {
        "objective": "binary:logistic",
        "max_depth": 6,
        "eta": 0.3,
        "gamma": 1,
        "colsample_bytree": 0.8,
        "min_child_weight": 5,
        "subsample": 0.8
    }
    best_iteration = 150
    bst = xgb.train(params, dtrain, best_iteration)

    # model validation: threshold predicted probabilities at 0.5
    dtest = xgb.DMatrix(X_test)
    y_pred_proba = bst.predict(dtest)
    y_pred = np.array([1 if item >= 0.5 else 0 for item in y_pred_proba])
    accu_scr, prec_scr, rec_scr, f1_scr = model_validation.model_valid(y_test, y_pred)
    print("xgboost model accuracy : " + str(accu_scr))
    print("xgboost model precision : " + str(prec_scr))
    print("xgboost model recall : " + str(rec_scr))
    print("xgboost model F1 score : " + str(f1_scr))

    # generate roc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # get importance score and rank: the DMatrix was built without
    # feature_names, so get_score keys are 'f0', 'f1', ...; the index after
    # the leading 'f' maps back into real_fea_names
    importance_score_dict = bst.get_score(importance_type="gain")
    importance_score_list = []
    for k, v in importance_score_dict.items():
        importance_score_list.append((v, int(k[1:])))
    importance_score_list = sorted(importance_score_list, key=lambda x: x[0], reverse=True)

    # print xgboost importance features
    print("=" * 50)
    print("xgboost feature importance")
    for i in range(len(importance_score_list)):
        print(real_fea_names[importance_score_list[i][1]] + '\t' + str(importance_score_list[i][0]))
    print("=" * 50)

    # keep the top features (fewer than 20 may have nonzero gain)
    top_20_f = []
    for i in range(min(20, len(importance_score_list))):
        top_20_f.append(real_fea_names[importance_score_list[i][1]])

    # # explainer model training
    # params = {"objective": "binary:logistic", 'silent': 1, 'eval_metric': 'auc', 'base_score': 0.5, "lambda": lmda}
    # bst = xgb.train(params, dtrain, best_iteration)
    #
    # # calculate per-feature logit contributions
    # tree_lst = xgb_exp.model2table(bst, lmda=lmda)
    # leaf_lsts = bst.predict(dtrain, pred_leaf=True)
    # fea_logit = [[] for _ in range(len(real_fea_names))]
    # for i, leaf_lst in enumerate(leaf_lsts):
    #     dist = xgb_exp.logit_contribution(tree_lst, leaf_lst)
    #     for idx in range(len(real_fea_names)):
    #         if 'f' + str(idx) in dist:
    #             fea_logit[idx].append(-dist['f' + str(idx)])
    #         else:
    #             fea_logit[idx].append(0)
    #
    # # explainer plot: one log-scale scatter panel per top feature
    # fig = plt.figure(figsize=(30, 30))
    # fig.suptitle('Feature User Churn')
    # for i in range(len(top_20_f)):
    #     fea = top_20_f[i]
    #     idx = real_fea_names.index(fea)
    #     fea_data = X_df[fea].values
    #     ax = fig.add_subplot(5, 4, i + 1)
    #     ax.set_xscale('log')
    #     ax.set_xlim(min([i for i in fea_data if i > 0]), max(fea_data))
    #     ax.scatter(fea_data, fea_logit[idx], s=0.1)
    #     ax.hlines(0, min([i for i in fea_data if i > 0]), max(fea_data), linewidth=0.5)
    #     ax.set_title(top_20_f[i])
    # plt.subplots_adjust(wspace=0.3, hspace=0.3)
    # plt.savefig('pic/' + name + '_' + str(best_iteration) + '.png', dpi=800)

    return fpr, tpr, roc_auc
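# Usage sketch (not part of the original pipeline): each trainer above
# returns (fpr, tpr, roc_auc), so the three ROC curves can be overlaid on
# one plot for comparison. plot_roc_comparison is a hypothetical helper
# name; it assumes the train/test split is prepared elsewhere and that the
# pic/ directory exists.
def plot_roc_comparison(X_train, y_train, X_test, y_test):
    curves = [
        ('RandomForest', random_forest_churn(X_train, y_train, X_test, y_test)),
        ('lightGBM', lightGBM_churn(X_train, y_train, X_test, y_test)),
        ('xgboost', xgboost_churn(X_train, y_train, X_test, y_test)),
    ]
    fig, ax = plt.subplots(figsize=(7, 5))
    for model_name, (fpr, tpr, roc_auc) in curves:
        ax.plot(fpr, tpr, label=model_name + ' (AUC = %.3f)' % roc_auc)
    ax.plot([0, 1], [0, 1], linestyle='--', linewidth=0.5)  # chance diagonal
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Churn model ROC comparison')
    ax.legend(loc='lower right')
    fig.savefig('pic/roc_comparison.png')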