def random_forest_churn(X_train, y_train, X_test, y_test):
    """Train a random-forest churn classifier and evaluate it on the test split.

    Fits a ``RandomForestClassifier`` on ``(X_train, y_train)``, prints the
    accuracy / precision / recall / F1 values returned by
    ``model_validation.model_valid`` followed by a feature-importance table
    sorted from most to least important, and builds the ROC curve from
    column 1 of ``predict_proba`` (presumably the positive class of a binary
    problem -- confirm against the label encoding).

    Feature names are read from ``conf/u_plan_train_conf`` (one name per
    line) and paired positionally with the model's feature importances.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        Tuple ``(fpr, tpr, roc_auc)`` as produced by ``roc_curve`` and
        ``auc`` on the test split.
    """
    fea_path = "conf/u_plan_train_conf"
    with open(fea_path, 'r') as conf_file:
        # rstrip('\r\n') also handles CRLF files; blank lines carry no name.
        col_names = [
            name
            for name in (line.rstrip('\r\n') for line in conf_file)
            if name
        ]

    # model construction
    # n_jobs=-1 uses every available core. The previous value (-11) left ten
    # cores idle and fell back to a single worker on smaller machines.
    # random_state is fixed, so the fitted model does not depend on n_jobs.
    estimator_RF = RandomForestClassifier(n_estimators=116,
                                          max_depth=18,
                                          max_features=10,
                                          random_state=0,
                                          n_jobs=-1)

    # model training
    model_RF = estimator_RF.fit(X_train, y_train)

    # model prediction
    predict_RF = model_RF.predict(X_test)
    y_predict_proba = model_RF.predict_proba(X_test)
    # Evaluate once; the earlier code called model_valid twice and threw the
    # first result away.
    accu_scr, prec_scr, rec_scr, f1_scr = model_validation.model_valid(
        y_test, predict_RF)

    print("RandomForest模型准确率 : " + str(accu_scr))
    print("RandomForest模型精确率 : " + str(prec_scr))
    print("RandomForest模型召回率 : " + str(rec_scr))
    print("RandomForest模型F1 score : " + str(f1_scr))

    # generate roc
    fpr, tpr, thresholds = roc_curve(y_test, y_predict_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    # Rank features by importance. If the conf file lists fewer names than
    # the model has features, fall back to a positional placeholder instead
    # of crashing after the (expensive) training has already finished.
    importance_score_list = sorted(
        (
            (score, col_names[idx] if idx < len(col_names) else "f" + str(idx))
            for idx, score in enumerate(model_RF.feature_importances_)
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )

    print("=" * 50)
    print("RandomForest feature importance")
    for score, fea_name in importance_score_list:
        print(fea_name + '\t' + str(score))
    print("=" * 50)

    return fpr, tpr, roc_auc
# Example #2 (0)
def lightGBM_churn(X_train, y_train, X_test, y_test):
    """Train a LightGBM churn classifier and evaluate it on the test split.

    Fits an ``LGBMClassifier`` with AUC-based early stopping (patience of
    100 rounds), prints the accuracy / precision / recall / F1 values
    returned by ``model_validation.model_valid``, and builds the ROC curve
    from column 1 of ``predict_proba`` (presumably the positive class of a
    binary problem -- confirm against the label encoding).

    NOTE(review): ``(X_test, y_test)`` is also the early-stopping evaluation
    set, so the reported test metrics are measured on data that influenced
    the number of boosting rounds.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix; also used as the early-stopping set.
        y_test: Test labels; also used as the early-stopping set.

    Returns:
        Tuple ``(fpr, tpr, roc_auc)`` as produced by ``roc_curve`` and
        ``auc`` on the test split.
    """
    # model construction
    # NOTE(review): `silent` is no longer a named constructor argument in
    # recent LightGBM releases (`verbose=-1` is the usual replacement) --
    # confirm against the pinned lightgbm version before changing it.
    lgb_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                   num_leaves=63,
                                   max_depth=-1,
                                   learning_rate=0.1,
                                   n_estimators=200,
                                   max_bin=255,
                                   subsample_for_bin=2000,
                                   objective=None,
                                   min_split_gain=0.0,
                                   min_child_weight=2,
                                   scale_pos_weight=2,
                                   min_child_samples=20,
                                   subsample=0.8,
                                   subsample_freq=1,
                                   colsample_bytree=0.8,
                                   reg_alpha=1,
                                   reg_lambda=1,
                                   random_state=12,
                                   n_jobs=8,
                                   silent=True)

    # model training
    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds` keyword of fit() was removed in LightGBM 4.0,
    # while the callback form is accepted by both older and newer releases.
    lgb_churn_model = lgb_model.fit(
        X_train,
        y_train,
        eval_metric='auc',
        eval_set=[(X_test, y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=100)])

    # model prediction
    y_pred = lgb_churn_model.predict(X_test)
    y_pred_proba = lgb_churn_model.predict_proba(X_test)
    accu_scr, prec_scr, rec_scr, f1_scr = model_validation.model_valid(
        y_test, y_pred)

    print("lightGBM模型准确率 : " + str(accu_scr))
    print("lightGBM模型精确率 : " + str(prec_scr))
    print("lightGBM模型召回率 : " + str(rec_scr))
    print("lightGBM模型F1 score : " + str(f1_scr))

    # generate roc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    return fpr, tpr, roc_auc
# Example #3 (0)
def xgboost_churn(X_train, y_train, X_test, y_test):
    """Train an XGBoost churn classifier and evaluate it on the test split.

    Trains a ``binary:logistic`` booster for a fixed 150 rounds, thresholds
    the predicted probabilities at 0.5, prints the accuracy / precision /
    recall / F1 values returned by ``model_validation.model_valid``, and
    prints the gain-based feature importances sorted from highest to lowest.

    Feature names are read from ``conf/u_plan_train_conf`` (one name per
    line) and looked up by the column index encoded in the booster's
    feature ids.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        Tuple ``(fpr, tpr, roc_auc)`` as produced by ``roc_curve`` and
        ``auc`` on the test split.
    """
    # load conf
    fea_path = "conf/u_plan_train_conf"
    with open(fea_path, 'r') as conf_file:
        # rstrip('\r\n') also handles CRLF files; blank lines carry no name.
        real_fea_names = [
            name
            for name in (line.rstrip('\r\n') for line in conf_file)
            if name
        ]

    # model training
    dtrain = xgb.DMatrix(X_train, label=y_train)
    params = {
        "objective": "binary:logistic",
        "max_depth": 6,
        "eta": 0.3,
        "gamma": 1,
        "colsample_bytree": 0.8,
        "min_child_weight": 5,
        "subsample": 0.8
    }
    best_iteration = 150
    bst = xgb.train(params, dtrain, best_iteration)

    # model validation
    dtest = xgb.DMatrix(X_test)
    y_pred_proba = bst.predict(dtest)
    # Vectorised 0.5 threshold on the predicted probabilities.
    y_pred = (y_pred_proba >= 0.5).astype(int)
    accu_scr, prec_scr, rec_scr, f1_scr = model_validation.model_valid(
        y_test, y_pred)

    print("xgboost模型准确率 : " + str(accu_scr))
    print("xgboost模型精确率 : " + str(prec_scr))
    print("xgboost模型召回率 : " + str(rec_scr))
    print("xgboost模型F1 score : " + str(f1_scr))

    # generate roc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # get importance score and rank
    # get_score() only reports features that were used in at least one
    # split, so this list can be shorter than real_fea_names.
    # NOTE(review): int(fea_id[1:]) assumes the booster's default feature
    # ids "f<column index>", i.e. X_train carries no column names of its
    # own -- confirm that callers pass a plain array.
    importance_score_list = sorted(
        (
            (gain, int(fea_id[1:]))
            for fea_id, gain in bst.get_score(importance_type="gain").items()
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # get xgboost importance features
    print("=" * 50)
    print("xgboost feature importance")
    for gain, fea_idx in importance_score_list:
        print(real_fea_names[fea_idx] + '\t' + str(gain))
    print("=" * 50)

    return fpr, tpr, roc_auc