예제 #1
0
    def XGB_over_sample(data):
        """Score XGBoost trained on an over-sampled training split.

        The data is split into train/test parts, only the training part is
        over-sampled with ``OverSample.over_sample_own``, and the predictions
        for the untouched test part are scored and written to
        ``data/score/xgb_oversample.csv``.

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        # Data before any sampling.
        X, y = DataPreprocessing.read_X_y(data)
        print('原始数据:')
        DataTools.print_data_ratio(y)
        X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
        print('分割后的测试集:')
        DataTools.print_data_ratio(y_test)
        print('分割后的训练集:')
        DataTools.print_data_ratio(y_train)

        # Over-sample the training split only; the test split stays as-is.
        X_os, y_os = OverSample.over_sample_own(X_train, y_train)
        print('上采样后的训练集:')
        DataTools.print_data_ratio(y_os)

        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # replacement for timing a section of code.
        start_time = time.perf_counter()
        y_predict = ModelXGB.xgb_predict(X_os, y_os, X_test)
        y_predict_prob = ModelXGB.xgb_predict_prob(X_os, y_os, X_test)
        cost_time = time.perf_counter() - start_time

        result = DataTools.compute_score_list(y_test, y_predict,
                                              y_predict_prob, cost_time)
        pd.DataFrame(result).to_csv('data/score/xgb_oversample.csv')
        print('结果已保存至score文件夹下 ^_^')
예제 #2
0
 def smoteEE_own(X, y):
     """Draw one balanced under-sampled subset and enlarge class 1 with SMOTE.

     As many class-0 rows as there are class-1 rows are drawn at random
     (without replacement) and joined with all class-1 rows. SMOTE is then
     asked for ``ceil(1.5 * n_class_1)`` class-1 samples.

     Args:
         X: Feature DataFrame; rows are selected by the index labels of ``y``.
         y: Label DataFrame with a ``Class`` column (values 0 and 1 are used).

     Returns:
         ``(X_resampled, y_resampled)`` as DataFrames. The feature frame
         carries the column names of ``X``; the label frame has one ``Class``
         column.
     """
     fraud_indices = np.array(y[y.Class == 1].index)
     number_records_fraud = len(fraud_indices)
     normal_indices = y[y.Class == 0].index
     # Under-sample class 0 down to the size of class 1.
     random_normal_indices = np.array(
         np.random.choice(normal_indices, number_records_fraud, replace=False))
     # All class-1 labels followed by the sampled class-0 labels.
     under_ee_indices = np.concatenate([fraud_indices, random_normal_indices])
     # .loc selects by index label (not by position).
     X_ee_sample = X.loc[under_ee_indices, :]
     y_ee_sample = y.loc[under_ee_indices, :]
     print('EE下采样后的训练集:')
     DataTools.print_data_ratio(y_ee_sample)
     # NOTE(review): `ratio`, `kind` and `fit_sample` look like the older
     # imbalanced-learn API -- confirm the pinned version before upgrading.
     sm = SMOTE(ratio={1: math.ceil(number_records_fraud * 1.5)},
                # random_state=0,
                kind='regular',
                )
     X_ee_smote_train_array, y_ee_smote_train_array = sm.fit_sample(
         X_ee_sample, y_ee_sample.values.ravel())
     # Reuse the caller's column names instead of a hard-coded V1..V28 +
     # normAmount list, so the function also works for other feature sets.
     X_ee_smote_train = pd.DataFrame(X_ee_smote_train_array,
                                     columns=list(X.columns))
     y_ee_smote_train = pd.DataFrame(y_ee_smote_train_array, columns=['Class'])
     return X_ee_smote_train, y_ee_smote_train
예제 #3
0
 def LR_origin_data(data):
     """Fit logistic regression on the unsampled data and plot the outcome.

     Reads features/labels from ``data``, splits them, obtains the C
     parameter from the training split, predicts the test split and draws
     the confusion matrix and the ROC curve.

     Args:
         data: Raw dataset passed to ``DataPreprocessing.read_X_y``.
     """
     # Data before any sampling.
     features, labels = DataPreprocessing.read_X_y(data)
     train_X, test_X, train_y, test_y = DataTools.data_split(features, labels)

     # Train LR to obtain the C parameter.
     best_c = ClassifierLR.c_param_scores(train_X, train_y)

     # Fit the model and predict the test split.
     predictions = ClassifierLR.fit_model_LR(train_X, train_y, test_X, best_c)

     # Compute the confusion matrix and plot it.
     PlotTools.plot_confusion_matrix(
         DataTools.compute_confusion_matrix(test_y, predictions),
         title='Confusion matrix')

     # Plot the ROC curve.
     ResultLR.LR_plot_ROC(train_X, train_y, test_X, test_y, best_c)
예제 #4
0
    def LR_under_sample_test(data):
        """Fit and evaluate logistic regression on under-sampled data only.

        Both the training and the test split come from the under-sampled
        data. Draws the confusion matrix, the ROC curve and the
        precision-recall curve.

        Args:
            data: Dataset handed to ``UnderSample.under_sample_own``.
        """
        # New data after under-sampling.
        # NOTE(review): under_sample_own receives a single argument here;
        # confirm this matches its current signature.
        X_us, y_us = UnderSample.under_sample_own(data)
        train_X, test_X, train_y, test_y = DataTools.data_split(X_us, y_us)

        # Train LR to obtain the C parameter.
        best_c = ClassifierLR.c_param_scores(train_X, train_y)

        # Fit the model and predict the under-sampled test split.
        predictions = ClassifierLR.fit_model_LR(train_X, train_y, test_X,
                                                best_c)

        # Compute the confusion matrix and plot it.
        PlotTools.plot_confusion_matrix(
            DataTools.compute_confusion_matrix(test_y, predictions),
            title='Confusion matrix')

        # Plot the ROC curve.
        ResultLR.LR_plot_ROC(train_X, train_y, test_X, test_y, best_c)

        # Threshold plot (disabled):
        # ResultLR.LR_plot_threshold(train_X, train_y, test_X, test_y, best_c)

        # Plot the precision-recall curve.
        ResultLR.LR_plot_precision_recall(train_X, train_y, test_X, test_y,
                                          best_c)
예제 #5
0
    def model_all_own(clf, X_train, y_train, X_test, y_test):
        """Fit ``clf``, predict the test set and collect its scores.

        Args:
            clf: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            X_train: Training features.
            y_train: Training labels; ``.values.ravel()`` is applied before
                fitting.
            X_test: Test features.
            y_test: Test labels.

        Returns:
            ``(result, roc_pr)``: ``result`` maps recall/acc/precision/f1/f5/
            auc/gmean/time to their values, ``roc_pr`` holds ``fpr`` and
            ``tpr``.
        """
        model_name = clf.__class__.__name__

        print('*******************************************')
        print(model_name, '开始fit...')

        # The timed section covers fitting plus both prediction calls.
        started = time()
        clf.fit(X_train, y_train.values.ravel())
        predicted_labels = clf.predict(X_test)
        predicted_proba = clf.predict_proba(X_test)
        elapsed = time() - started

        scores = DataTools.compute_score(y_test, predicted_labels,
                                         predicted_proba)
        recall_, accuracy_, precision_, f1_, f5_, auc_, g_mean_, fpr_, tpr_ = scores

        # Key order is kept as-is because callers turn this dict into a
        # DataFrame row.
        result = {
            'recall': recall_,
            'acc': accuracy_,
            'precision': precision_,
            'f1': f1_,
            'f5': f5_,
            'auc': auc_,
            'gmean': g_mean_,
            'time': elapsed,
        }
        roc_pr = {'fpr': fpr_, 'tpr': tpr_}

        print("{} 训练结束,耗时: {:.4f} ".format(model_name, elapsed))

        return result, roc_pr
예제 #6
0
 def ee_own(X, y):
     """Draw one under-sampled subset: all class-1 rows plus random class-0 rows.

     ``int(1.05 * n_class_1)`` class-0 rows are sampled without replacement.

     Args:
         X: Feature DataFrame; rows are selected by the index labels of ``y``.
         y: Label DataFrame with a ``Class`` column (values 0 and 1 are used).

     Returns:
         ``(X_subset, y_subset)`` restricted to the selected rows.
     """
     fraud_idx = np.array(y[y.Class == 1].index)
     normal_pool = y[y.Class == 0].index

     # Under-sample class 0 to slightly more rows than class 1 has.
     n_normal = int(len(fraud_idx) * 1.05)
     sampled_normal = np.array(
         np.random.choice(normal_pool, n_normal, replace=False))

     # All class-1 labels followed by the sampled class-0 labels.
     keep = np.concatenate([fraud_idx, sampled_normal])

     # .loc selects by index label (not by position).
     X_subset, y_subset = X.loc[keep, :], y.loc[keep, :]

     print('EE采样后的训练集:')
     DataTools.print_data_ratio(y_subset)
     return X_subset, y_subset
예제 #7
0
    def XGB_under_sample(data):
        """Score XGBoost trained on an under-sampled training split.

        The data is split into train/test parts, only the training part is
        under-sampled with ``UnderSample.under_sample_own``, and predictions
        for the untouched test part are scored, written to
        ``data/score2/xgb_us2.csv`` and plotted (confusion matrix and ROC).

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        # Data before any sampling.
        X, y = DataPreprocessing.read_X_y(data)
        print('原始数据:')
        DataTools.print_data_ratio(y)
        X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
        print('分割后的训练集:')
        DataTools.print_data_ratio(y_train)

        # New training data after under-sampling.
        X_under_sample_train, y_under_sample_train = UnderSample.under_sample_own(
            X_train, y_train)
        print('下采样后的训练集:')
        DataTools.print_data_ratio(y_under_sample_train)

        # Parameter tuning hooks (disabled):
        # ModelXGB.xgb_cv_param(X_under_sample_train, y_under_sample_train)
        # ModelXGB.xgb_gridSearchCV(X_under_sample_train, y_under_sample_train)

        # Train and predict. time.clock() was removed in Python 3.8;
        # time.perf_counter() is the replacement for timing a code section.
        start_time = time.perf_counter()
        y_predict = ModelXGB.xgb_predict(X_under_sample_train,
                                         y_under_sample_train, X_test)
        y_predict_prob = ModelXGB.xgb_predict_prob(X_under_sample_train,
                                                   y_under_sample_train,
                                                   X_test)
        cost_time = time.perf_counter() - start_time

        result = DataTools.compute_score_list(y_test, y_predict,
                                              y_predict_prob, cost_time)
        pd.DataFrame(result).to_csv('data/score2/xgb_us2.csv')
        print('结果已保存至score文件夹下 ^_^')

        # Compute and plot the confusion matrix.
        cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
        PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')

        # ROC curve from column 1 of the predicted probabilities.
        PlotTools.plot_roc_curve(y_test, y_predict_prob[:, 1])
예제 #8
0
 def plot_thresholds(y_true, y_pred_proba):
     """Plot a 3x3 grid of confusion matrices for thresholds 0.1 .. 0.9.

     A sample is predicted positive when column 1 of ``y_pred_proba`` is
     strictly greater than the threshold.

     Args:
         y_true: True labels, passed to ``DataTools.compute_confusion_matrix``.
         y_pred_proba: 2-D array indexed as ``[:, 1]`` for the compared scores.
     """
     thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
     class_names = [0, 1]
     plt.figure(figsize=(10, 10))
     # Subplot positions are 1-based.
     for position, threshold in enumerate(thresholds, start=1):
         predicted_positive = y_pred_proba[:, 1] > threshold
         plt.subplot(3, 3, position)
         cnf_matrix = DataTools.compute_confusion_matrix(
             y_true, predicted_positive)
         # The title states the strict comparison that is actually applied
         # (it used to read ">=").
         PlotTools.plot_confusion_matrix(cnf_matrix,
                                         classes=class_names,
                                         title='Threshold > %s' % threshold)
예제 #9
0
    def LR_EE_smote(data):
        """Average logistic-regression scores over several EE+SMOTE subsets.

        For each of ``num_subsets`` rounds a new subset is drawn from the
        training split with ``SmoteEE.smoteEE_own``, exported to
        ``data/subsets/lr_subset<i>.csv``, and used to fit a model that is
        scored on the common test split. The mean scores are written to
        ``data/score/lr_ee_tuned.csv``.

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        # Number of subsets (one model is trained per subset).
        num_subsets = 10
        X, y = DataPreprocessing.read_X_y(data)
        print('原始数据:')
        DataTools.print_data_ratio(y)
        X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
        print('分割后的测试集:')
        DataTools.print_data_ratio(y_test)
        print('分割后的训练集:')
        DataTools.print_data_ratio(y_train)

        result = {}
        result_recall = []
        result_acc = []
        result_precision = []
        result_f1 = []
        result_auc = []
        result_gmean = []
        result_fpr_temps = []
        result_tpr_temps = []

        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # replacement for timing a section of code.
        start_time = time.perf_counter()
        for i in range(num_subsets):
            print(
                '******************************************************************************'
            )
            print('第 ', i + 1, ' 个分类器开始:')

            # Data after EE & SMOTE.
            X_ee_smote, y_ee_smote = SmoteEE.smoteEE_own(X_train, y_train)
            DataTools.print_data_ratio(y_ee_smote)

            # Parameter tuning (disabled):
            # print('训练集子集%d:' % (i + 1))
            # ClassifierLR.lr_grid_search_cv(X_ee_smote, y_ee_smote)

            pd.concat([X_ee_smote, y_ee_smote],
                      axis=1).to_csv('data/subsets/lr_subset%d.csv' % (i + 1))
            print('第%d个 子集导出成功!' % (i + 1))
            print('训练集子集%d:' % (i + 1))
            DataTools.print_data_ratio(y_ee_smote)

            # 0.1 is the fourth argument of both calls -- presumably a fixed
            # regularisation parameter C; confirm against ClassifierLR.
            y_predict = ClassifierLR.fit_model_LR(X_ee_smote, y_ee_smote,
                                                  X_test, 0.1)
            y_predict_prob = ClassifierLR.lr_predict_proba(
                X_ee_smote, y_ee_smote, X_test, 0.1)

            scores = DataTools.compute_score(y_test, y_predict, y_predict_prob)
            # compute_score is unpacked into nine values (with f5 after f1)
            # by other callers; star-unpacking accepts both the 8- and the
            # 9-value layout instead of raising "too many values to unpack".
            # The extra value is not recorded, so the CSV columns are
            # unchanged.
            recall_, accuracy_, precision_, f1_, *_extra, auc_, g_mean_, fpr_, tpr_ = scores
            result_recall.append(recall_)
            result_acc.append(accuracy_)
            result_precision.append(precision_)
            result_f1.append(f1_)
            result_auc.append(auc_)
            result_gmean.append(g_mean_)
            result_fpr_temps.append(fpr_)
            result_tpr_temps.append(tpr_)

        end_time = time.perf_counter()
        result['time'] = end_time - start_time
        result['recall'] = np.mean(result_recall)
        result['acc'] = np.mean(result_acc)
        result['precision'] = np.mean(result_precision)
        result['f1'] = np.mean(result_f1)
        result['auc'] = np.mean(result_auc)
        result['gmean'] = np.mean(result_gmean)
        # One row per subset; .mean() averages column-wise across subsets.
        result['fpr'] = pd.DataFrame(result_fpr_temps).mean()
        result['tpr'] = pd.DataFrame(result_tpr_temps).mean()
        pd.DataFrame(result).to_csv('data/score/lr_ee_tuned.csv')
        print('结果已保存至score文件夹下 ^_^')
예제 #10
0
    def XGB_EE(data):
        """Average XGBoost scores over several EE under-sampled subsets.

        For each of ``num_subsets`` rounds a new subset is drawn from the
        training split with ``EE.ee_own``, exported to
        ``data/subsets/subset_ee<i>.csv``, and used to train a model that is
        scored on the common test split. The mean scores are written to
        ``data/score2/xgb_ee3.csv``; the confusion matrix and ROC curve are
        plotted afterwards.

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        # Number of subsets (one model is trained per subset).
        num_subsets = 5
        X, y = DataPreprocessing.read_X_y(data)
        X_train_tmp, X_test, y_train_tmp, y_test = DataTools.data_split(X, y)
        # A validation part is split off the training data but not used
        # below; the split is kept so the training part stays the same.
        X_train, _X_validate, y_train, _y_validate = DataTools.data_split(
            X_train_tmp, y_train_tmp)

        result = {}
        result_recall = []
        result_acc = []
        result_precision = []
        result_f1 = []
        result_f5 = []
        result_auc = []
        result_gmean = []
        result_fpr_temps = []
        result_tpr_temps = []

        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # replacement for timing a section of code.
        start_time = time.perf_counter()
        for i in range(num_subsets):
            print(
                '******************************************************************************'
            )
            print('第 ', i + 1, ' 个分类器开始:')
            # Data after EE sampling.
            X_ee, y_ee = EE.ee_own(X_train, y_train)
            pd.concat([X_ee, y_ee],
                      axis=1).to_csv('data/subsets/subset_ee%d.csv' % (i + 1))
            print('第%d个 子集导出成功!' % (i + 1))
            print('训练集子集%d:' % (i + 1))
            DataTools.print_data_ratio(y_ee)

            # Parameter tuning (disabled):
            # ModelXGB.xgb_cv_param(X_ee, y_ee)
            # ModelXGB.xgb_gridSearchCV(X_ee, y_ee)
            # return

            y_predict = ModelXGB.xgb_predict(X_ee, y_ee, X_test)
            y_predict_prob = ModelXGB.xgb_predict_prob(X_ee, y_ee, X_test)
            recall_, accuracy_, precision_, f1_, f5_, auc_, g_mean_, fpr_, tpr_ = \
                DataTools.compute_score(y_test, y_predict, y_predict_prob)
            result_recall.append(recall_)
            result_acc.append(accuracy_)
            result_precision.append(precision_)
            result_f1.append(f1_)
            result_f5.append(f5_)
            result_auc.append(auc_)
            result_gmean.append(g_mean_)
            result_fpr_temps.append(fpr_)
            result_tpr_temps.append(tpr_)

        end_time = time.perf_counter()
        result['time'] = end_time - start_time
        result['recall'] = np.mean(result_recall)
        result['acc'] = np.mean(result_acc)
        result['precision'] = np.mean(result_precision)
        result['f1'] = np.mean(result_f1)
        result['f5'] = np.mean(result_f5)
        result['auc'] = np.mean(result_auc)
        result['gmean'] = np.mean(result_gmean)
        # One row per subset; .mean() averages column-wise across subsets.
        result['fpr'] = pd.DataFrame(result_fpr_temps).mean()
        result['tpr'] = pd.DataFrame(result_tpr_temps).mean()
        pd.DataFrame(result).to_csv('data/score2/xgb_ee3.csv')
        print('结果已保存至score文件夹下 ^_^')

        # The plots below use the predictions of the LAST subset only
        # (y_predict / y_predict_prob still hold the final loop iteration).
        # Compute and plot the confusion matrix.
        cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
        PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')

        PlotTools.plot_roc_curve(y_test, y_predict_prob[:, 1])
예제 #11
0
    def XGB_smote(data):
        """Score XGBoost trained on a SMOTE-resampled training split.

        The data is split into train/test parts, only the training part is
        resampled with ``SmoteOrigin.smote_own``, and predictions for the
        untouched test part are scored, written to
        ``data/score/xgb_smote.csv`` and shown as a confusion matrix.

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        # Data before any sampling.
        X, y = DataPreprocessing.read_X_y(data)
        print('原始数据:')
        DataTools.print_data_ratio(y)
        X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
        print('分割后的测试集:')
        DataTools.print_data_ratio(y_test)
        print('分割后的训练集:')
        DataTools.print_data_ratio(y_train)

        # Training data after SMOTE.
        X_smote, y_smote = SmoteOrigin.smote_own(X_train, y_train)
        print('SMOTE后的训练集:')
        DataTools.print_data_ratio(y_smote)

        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # replacement for timing a section of code.
        start_time = time.perf_counter()
        y_predict = ModelXGB.xgb_predict(X_smote, y_smote, X_test)
        y_predict_prob = ModelXGB.xgb_predict_prob(X_smote, y_smote, X_test)
        cost_time = time.perf_counter() - start_time

        result = DataTools.compute_score_list(y_test, y_predict,
                                              y_predict_prob, cost_time)
        pd.DataFrame(result).to_csv('data/score/xgb_smote.csv')
        print('结果已保存至score文件夹下 ^_^')

        # Compute and plot the confusion matrix.
        cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
        PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')
예제 #12
0
    def XGB_origin(data):
        """Score XGBoost trained on the unsampled training split.

        The data is split into train/test parts and the training part is
        split once more into train/validation. The model is trained on the
        final training part, scored on the test part, and the scores are
        written to ``data/score2/xgb_origin.csv`` and plotted (confusion
        matrix and ROC).

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        X, y = DataPreprocessing.read_X_y(data)
        print('原始数据:')
        DataTools.print_data_ratio(y)
        X_train_temp, X_test, y_train_temp, y_test = DataTools.data_split(X, y)
        print('分割后的测试集:')
        DataTools.print_data_ratio(y_test)
        print('分割后的训练集temp:')
        DataTools.print_data_ratio(y_train_temp)

        # The validation features are not used below; only the validation
        # label ratio is printed.
        X_train, X_validate, y_train, y_validate = DataTools.data_split(
            X_train_temp, y_train_temp)
        print('分割后的验证集:')
        DataTools.print_data_ratio(y_validate)
        print('分割后的训练集:')
        DataTools.print_data_ratio(y_train)

        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # replacement for timing a section of code.
        start_time = time.perf_counter()
        y_predict = ModelXGB.xgb_predict(X_train, y_train, X_test)
        y_predict_prob = ModelXGB.xgb_predict_prob(X_train, y_train, X_test)
        cost_time = time.perf_counter() - start_time

        result = DataTools.compute_score_list(y_test, y_predict,
                                              y_predict_prob, cost_time)
        pd.DataFrame(result).to_csv('data/score2/xgb_origin.csv')
        print('结果已保存至score文件夹下 ^_^')

        # Compute and plot the confusion matrix.
        cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
        PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')

        # ROC curve from column 1 of the predicted probabilities.
        PlotTools.plot_roc_curve(y_test, y_predict_prob[:, 1])
예제 #13
0
    def result_all_model(data):
        """Compare several classifiers trained on an over-sampled training set.

        The data is split into train/test parts and the training part is
        split once more into train/validation. The final training part is
        over-sampled with ``OverSample.over_sample_own``. Each classifier is
        then fitted and scored through ``ModelAll.model_all_own``, its ROC
        curve is plotted, and one row per model is written to
        ``data/score3/all_models_os.csv``.

        Args:
            data: Raw dataset passed straight to ``DataPreprocessing.read_X_y``.
        """
        X, y = DataPreprocessing.read_X_y(data)
        print('原始数据:')
        DataTools.print_data_ratio(y)
        X_train_temp, X_test, y_train_temp, y_test = DataTools.data_split(X, y)
        print('分割后的测试集:')
        DataTools.print_data_ratio(y_test)
        print('分割后的训练集temp:')
        DataTools.print_data_ratio(y_train_temp)

        X_train, X_test_validate, y_train, y_test_validate = DataTools.data_split(
            X_train_temp, y_train_temp)
        print('分割后的验证集:')
        DataTools.print_data_ratio(y_test_validate)
        # This ratio is for the training split; the label used to say
        # "test set".
        print('分割后的训练集:')
        DataTools.print_data_ratio(y_train)

        # Alternative resampling strategies (disabled):
        # X_os_train, y_os_train = SmoteOrigin.smote_own(X_train, y_train)
        # X_os_train, y_os_train = UnderSample.under_sample_own(X_train, y_train)

        # Parameter tuning (disabled):
        # ModelAll.grid_search_cv_all(X_train, y_train)
        # ModelXGB.xgb_gridSearchCV(X_ee, y_ee)
        # return

        X_os_train, y_os_train = OverSample.over_sample_own(X_train, y_train)
        # The over-sampled data is the training set; the label used to say
        # "test set".
        print('上采样后的训练集:')
        DataTools.print_data_ratio(y_os_train)

        clf_lr = LogisticRegression()
        clf_dt = DecisionTreeClassifier()
        clf_aba = AdaBoostClassifier()
        # n_estimators=300,
        # learning_rate=0.28,
        # random_state=321)

        clf_gbdt = GradientBoostingClassifier()
        # class_weight='balanced',
        # max_depth=5,
        # criterion='entropy')

        clf_rf = RandomForestClassifier()
        # n_estimators=15,
        # class_weight='balanced',
        # max_depth=5)
        clf_xgb = XGBClassifier()
        # learning_rate=0.1,
        # n_estimators=70,
        # max_depth=4,
        # min_child_weight=1,
        # gamma=0,
        # objective='binary:logistic',
        # # subsample=0.6,
        # # colsample_bytree=0.4,
        # reg_lambda=0.1)

        results = {}
        # for clf in [clf_xgb]:
        for clf in [clf_dt, clf_lr, clf_aba, clf_gbdt, clf_xgb, clf_rf]:
            clf_name = clf.__class__.__name__
            results[clf_name], roc_list = ModelAll.model_all_own(
                clf, X_os_train, y_os_train, X_test, y_test)
            # Plot the ROC curve of this model.
            PlotTools.plot_roc_curve2(roc_list['fpr'], roc_list['tpr'],
                                      clf_name)

        # (class name, row label) in the row order of the exported table.
        row_labels = [
            ('LogisticRegression', 'LR'),
            ('DecisionTreeClassifier', 'DT'),
            ('AdaBoostClassifier', 'ADA'),
            ('GradientBoostingClassifier', 'GBDT'),
            ('RandomForestClassifier', 'RF'),
            ('XGBClassifier', 'XGB'),
        ]
        all_pd = pd.concat([
            pd.DataFrame(results[class_name], index=[label])
            for class_name, label in row_labels
        ])
        all_pd.to_csv('data/score3/all_models_os.csv')
        print('结果已保存至score文件夹下 ^_^')