예제 #1
0
def model_fuse_evaluation(model,train_data,valid_data,test_data,feature_used):
    '''
    Score a classifier on the validation, test and training frames.

    The model is only queried (predict_proba / predict); no fit call is made
    here, so it has to be usable for prediction when passed in.

    model: estimator exposing predict_proba and predict
    train_data / valid_data / test_data: frames indexable by feature_used and
        containing a 'user_type' label column
    feature_used: column selection passed straight to each frame

    return:
    list in the order
    [valid_ks, valid_auc, test_ks, test_auc, train_auc, train_ks, valid_accuracy]
    (note that for the training set AUC comes before KS).
    '''
    label_col = 'user_type'
    valid_features, valid_target = valid_data[feature_used], valid_data[label_col]
    test_features, test_target = test_data[feature_used], test_data[label_col]
    train_features, train_target = train_data[feature_used], train_data[label_col]

    # Validation set: scores from column 1 of predict_proba, plus hard labels.
    valid_scores = model.predict_proba(valid_features)[:, 1]
    valid_labels = model.predict(valid_features)
    ks_valid = plot_ks_curve(valid_scores, valid_target)
    auc_valid = roc_auc_score(valid_target, valid_scores)

    # Test set.
    test_scores = model.predict_proba(test_features)[:, 1]
    ks_test = plot_ks_curve(test_scores, test_target)
    auc_test = roc_auc_score(test_target, test_scores)

    # Training set.
    train_scores = model.predict_proba(train_features)[:, 1]
    ks_train = plot_ks_curve(train_scores, train_target)
    auc_train = roc_auc_score(train_target, train_scores)

    # Classification report and accuracy are computed on the validation set only.
    print(classification_report(valid_target.values, valid_labels,
                                target_names=['0', '1']))
    valid_accuracy = metrics.accuracy_score(valid_target, valid_labels)

    return [ks_valid, auc_valid, ks_test, auc_test,
            auc_train, ks_train, valid_accuracy]
예제 #2
0
def submodel_evaluation(train_data, valid_data, model_list,
                        category_feature, numeric_feature):
    '''
    Fit each candidate model inside a PMMLPipeline and score it on valid_data.

    train_data / valid_data: frames containing the feature columns and a
        'user_type' label column
    model_list: iterable of classifier instances
    category_feature / numeric_feature: column lists; they are concatenated
        (category first) to select the model inputs

    return:
    dict mapping the model's class name to [ks, auc, accuracy] measured on the
    validation frame. Two models of the same class share a key, so the later
    one overwrites the earlier one.
    '''
    feature_cols = category_feature + numeric_feature
    label_col = 'user_type'
    train_features, train_target = train_data[feature_cols], train_data[label_col]
    valid_features, valid_target = valid_data[feature_cols], valid_data[label_col]

    # One transformer object is built up front and reused by every pipeline.
    shared_mapper = feature_union(category_feature, numeric_feature)

    results = {}
    for estimator in model_list:
        estimator_name = estimator.__class__.__name__
        print('model %s evaluation'%estimator_name)

        pipeline = PMMLPipeline([
            ('mapper', shared_mapper),
            ('classifier', estimator),
        ])
        pipeline.fit(train_features, train_target)

        valid_scores = pipeline.predict_proba(valid_features)[:, 1]
        valid_labels = pipeline.predict(valid_features)

        results[estimator_name] = [
            plot_ks_curve(valid_scores, valid_target),
            roc_auc_score(valid_target, valid_scores),
            metrics.accuracy_score(valid_target, valid_labels),
        ]
    return results
def pipe_train_test_evaluate(data_dict, pipeline_estimator):
    '''
    Fit a pipeline on the 'train' entry and evaluate it on every entry.

    data_dict: dict of datasets; must contain a 'train' key and may contain
        further keys (test_xxx, ...). Each value is a mapping with 'X' and 'y'.
    pipeline_estimator: pipeline model; it is fitted in place.

    return:
    detail_result: per dataset, the predicted probabilities ('predict') and
        the true labels ('true')
    statistic_result: per dataset, the metrics ('auc' from plot_roc_curve,
        'ks' from plot_ks_curve)
    '''
    # Train on the 'train' entry.
    train_split = data_dict.get('train')
    pipeline_estimator.fit(train_split.get('X'), train_split.get('y'))

    detail_result, statistic_result = {}, {}
    # Predict on every dataset, the training set included.
    for split_name, split in data_dict.items():
        features, labels = split.get('X'), split.get('y')
        scores = pipeline_estimator.predict_proba(features)[:, 1]
        auc_value = plot_roc_curve(labels, scores)
        ks_value = plot_ks_curve(scores, labels, is_score=False, n=10)
        detail_result[split_name] = {'predict': scores, 'true': labels.values}
        statistic_result[split_name] = {'auc': auc_value, 'ks': ks_value}
    return detail_result, statistic_result
예제 #4
0
def gbm_cv_evaluate(X,y,total_features,category_features,cv,groups=None,X_test=None,y_test=None,params_dict=None):
    '''
    Cross-validated evaluation of a single LightGBM model (to be deprecated).

    X: feature data, pandas.DataFrame
    y: label data, pandas.Series
    total_features: list of all features fed to the model
    category_features: list of categorical features fed to the model
    cv: splitter object; cv.split(X, y, groups) yields the folds
    groups: group labels forwarded to cv.split
    X_test: optional test features
    y_test: optional test labels
    params_dict: parameters forwarded to lgb.train

    return:
    detail_result: per-fold metrics and best iteration, pandas.DataFrame
    statistic_result: dict with mean / std of every metric
    '''
    has_test = X_test is not None

    train_auc_list, train_ks_list = [], []
    valid_auc_list, valid_ks_list = [], []
    if has_test:
        test_auc_list, test_ks_list = [], []
    else:
        # Without a test set the test metrics are a scalar NaN, which the
        # DataFrame constructor and np.mean / np.std below both accept.
        test_auc_list = test_ks_list = np.nan
    best_iteration_list = []

    # Iterate over the folds.
    for train_index, valid_index in cv.split(X, y, groups):
        # Slice out this fold's validation and training data.
        fold_valid_X, fold_valid_y = X.iloc[valid_index][total_features], y.iloc[valid_index]
        fold_train_X, fold_train_y = X.iloc[train_index][total_features], y.iloc[train_index]

        # categorical_feature is only passed when the list is non-empty.
        dataset_kwargs = {}
        if len(category_features) != 0:
            dataset_kwargs['categorical_feature'] = category_features
        train_data = lgb.Dataset(fold_train_X, label=fold_train_y, **dataset_kwargs)
        valid_data = lgb.Dataset(fold_valid_X, label=fold_valid_y, **dataset_kwargs)

        # Train with early stopping monitored on the validation fold.
        booster = lgb.train(params_dict, train_data, num_boost_round=20000,
                            valid_sets=[valid_data], verbose_eval=None,
                            early_stopping_rounds=100)
        best_round = booster.best_iteration

        # Predict with the best iteration.
        train_pred = booster.predict(fold_train_X, num_iteration=best_round)
        valid_pred = booster.predict(fold_valid_X, num_iteration=best_round)

        # Evaluate and record (train first, then validation).
        train_auc_list.append(plot_roc_curve(fold_train_y, train_pred))
        train_ks_list.append(plot_ks_curve(train_pred, fold_train_y, is_score=False, n=10))
        valid_auc_list.append(plot_roc_curve(fold_valid_y, valid_pred))
        valid_ks_list.append(plot_ks_curve(valid_pred, fold_valid_y, is_score=False, n=10))

        if has_test:
            test_pred = booster.predict(X_test[total_features], num_iteration=best_round)
            test_auc_list.append(plot_roc_curve(y_test, test_pred))
            test_ks_list.append(plot_ks_curve(test_pred, y_test, is_score=False, n=10))

        best_iteration_list.append(best_round)

    detail_result = pd.DataFrame(data={
        'test_auc': test_auc_list,
        'test_ks': test_ks_list,
        'valid_auc': valid_auc_list,
        'valid_ks': valid_ks_list,
        'train_ks': train_ks_list,
        'train_auc': train_auc_list,
        'best_iteration': best_iteration_list,
    })

    statistic_result = {
        'train_auc_mean': np.mean(train_auc_list),
        'train_auc_std': np.std(train_auc_list),
        'train_ks_mean': np.mean(train_ks_list),
        'train_ks_std': np.std(train_ks_list),
        'valid_auc_mean': np.mean(valid_auc_list),
        'valid_auc_std': np.std(valid_auc_list),
        'valid_ks_mean': np.mean(valid_ks_list),
        'valid_ks_std': np.std(valid_ks_list),
        'test_auc_mean': np.mean(test_auc_list),
        'test_auc_std': np.std(test_auc_list),
        'test_ks_mean': np.mean(test_ks_list),
        'test_ks_std': np.std(test_ks_list),
    }

    # Console summary, reusing the statistics computed above.
    print('train AUC:{0:.4f}, std:{1:.4f}.'.format(statistic_result['train_auc_mean'],
                                                   statistic_result['train_auc_std']),
          'train KS:{0:.4f}, std:{1:.4f}.'.format(statistic_result['train_ks_mean'],
                                                  statistic_result['train_ks_std']))
    print('valid AUC:{0:.4f}, std:{1:.4f}.'.format(statistic_result['valid_auc_mean'],
                                                   statistic_result['valid_auc_std']),
          'valid KS:{0:.4f}, std:{1:.4f}.'.format(statistic_result['valid_ks_mean'],
                                                  statistic_result['valid_ks_std']))
    if has_test:
        print('test AUC:{0:.4f}, std:{1:.4f}.'.format(statistic_result['test_auc_mean'],
                                                      statistic_result['test_auc_std']),
              'test KS:{0:.4f}, std:{1:.4f}.'.format(statistic_result['test_ks_mean'],
                                                     statistic_result['test_ks_std']))
    print('best_iteration:',best_iteration_list)
    return detail_result,statistic_result
예제 #5
0
def pipe_cv_evaluate_old(X,y,pipeline_estimator,cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=10),groups=None,X_test=None,y_test=None):
    '''
    Cross-validated evaluation of a single pipeline.
    To be deprecated; pipe_cv_evaluate is the recommended replacement.

    X: feature data, pandas.DataFrame
    y: label data, pandas.Series
    pipeline_estimator: pipeline model; the same object is refitted in place
        on every fold, so after the call it holds the last fold's fit
    cv: splitter object; cv.split(X, y, groups) yields the folds
    groups: group labels forwarded to cv.split
    X_test: optional test features
    y_test: optional test labels

    return:
    detail_result: per-fold metrics, pandas.DataFrame
    statistic_result: dict with mean / std of every metric
    '''
    has_test = X_test is not None

    valid_auc_list = []
    valid_ks_list = []
    train_auc_list = []
    train_ks_list = []
    if has_test:
        test_auc_list = []
        test_ks_list = []
    else:
        # Without a test set the test metrics are a scalar NaN, which the
        # DataFrame constructor and np.mean / np.std below both accept.
        test_auc_list = np.nan
        test_ks_list = np.nan

    # Iterate over the folds.
    for train_index, valid_index in cv.split(X,y,groups):
        # Slice out this fold's validation and training data.
        X_valid,y_valid = X.iloc[valid_index],y.iloc[valid_index]
        X_train,y_train = X.iloc[train_index],y.iloc[train_index]

        # Fit on the training fold only. Fitting on the full (X, y) would let
        # the model see the validation rows, which leaks labels into the
        # validation score and makes every fold's result identical.
        pipeline_estimator.fit(X_train,y_train)

        # Predict.
        y_pred_train = pipeline_estimator.predict_proba(X_train)[:,1]
        y_pred_valid = pipeline_estimator.predict_proba(X_valid)[:,1]

        # Evaluate and record.
        train_auc_list.append(plot_roc_curve(y_train,y_pred_train))
        train_ks_list.append(plot_ks_curve(y_pred_train,y_train, is_score=False, n=10))
        valid_auc_list.append(plot_roc_curve(y_valid,y_pred_valid))
        valid_ks_list.append(plot_ks_curve(y_pred_valid,y_valid, is_score=False, n=10))
        if has_test:
            y_pred_test = pipeline_estimator.predict_proba(X_test)[:,1]
            test_auc_list.append(plot_roc_curve(y_test,y_pred_test))
            test_ks_list.append(plot_ks_curve(y_pred_test,y_test, is_score=False, n=10))

    detail_result = pd.DataFrame(data={'test_auc':test_auc_list,
                                       'test_ks':test_ks_list,
                                       'valid_auc':valid_auc_list,
                                       'valid_ks':valid_ks_list,
                                       'train_ks':train_ks_list,
                                       'train_auc':train_auc_list
                                       })
    statistic_result = {'train_auc_mean':np.mean(train_auc_list),
                        'train_auc_std':np.std(train_auc_list),
                        'train_ks_mean':np.mean(train_ks_list),
                        'train_ks_std':np.std(train_ks_list),
                        'valid_auc_mean':np.mean(valid_auc_list),
                        'valid_auc_std':np.std(valid_auc_list),
                        'valid_ks_mean':np.mean(valid_ks_list),
                        'valid_ks_std':np.std(valid_ks_list),
                        'test_auc_mean':np.mean(test_auc_list),
                        'test_auc_std':np.std(test_auc_list),
                        'test_ks_mean':np.mean(test_ks_list),
                        'test_ks_std':np.std(test_ks_list)
                        }
    print('train AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(train_auc_list), np.std(train_auc_list)),\
          'train KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(train_ks_list), np.std(train_ks_list)))
    print('valid AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(valid_auc_list), np.std(valid_auc_list)),\
          'valid KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(valid_ks_list), np.std(valid_ks_list)))
    if has_test:
        print('test AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(test_auc_list), np.std(test_auc_list)),\
              'test KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(test_ks_list), np.std(test_ks_list)))
    return detail_result,statistic_result
def gbdt_cv_evaluate_earlystop(data_dict,
                               gbdt_estimator,
                               total_features,
                               category_features,
                               cv=StratifiedKFold(n_splits=5,
                                                  shuffle=True,
                                                  random_state=10),
                               groups=None):
    '''
    Cross-validated evaluation of a GBDT-style model trained with early
    stopping (intended for XGB, GBM, GBDT, CAT and similar estimators).

    data_dict: dict of datasets; must contain a 'train' key and may contain
        further keys (test_xxx, ...). Each value is a mapping with 'X' and 'y'.
    gbdt_estimator: GBDT-type estimator; refitted in place on every fold
    total_features: list of all features fed to the model
    category_features: list of categorical features fed to the model
    cv: splitter object; cv.split(X, y, groups) yields the folds
    groups: group labels forwarded to cv.split

    return:
    fold_detail_result: per fold, predicted probabilities and true labels of
        every dataset
    fold_statistic_result: per fold, the metrics of every dataset
    fold_best_iteration_result: per fold, the best iteration found by early
        stopping
    '''
    # Work on a copy so the per-fold updates below never touch the caller's
    # dict (top-level copy only; memory use could be optimised later).
    data_dict = data_dict.copy()

    # The full training set; each fold is sliced from it.
    full_train = data_dict.get('train')
    X = full_train.get('X')
    y = full_train.get('y')

    fold_detail_result, fold_statistic_result = {}, {}
    fold_best_iteration_result = {}

    # Iterate over the folds.
    for fold_n, (train_index, valid_index) in enumerate(cv.split(X, y, groups)):
        # The fold is labelled with the group of its first validation row.
        group_fold = 'NULL' if groups is None else str(groups[valid_index[0]])
        print('正在进行第{}折的验证,验证组号为{}'.format(fold_n, group_fold))

        # Outer validation fold and the remaining training rows.
        X_outer_valid, y_outer_valid = X.iloc[valid_index], y.iloc[valid_index]
        fold_X, fold_y = X.iloc[train_index], y.iloc[train_index]

        # Hold out 20% of the training rows as the early-stopping set.
        X_train, X_inner_valid, y_train, y_inner_valid = train_test_split(
            fold_X, fold_y, test_size=0.2, random_state=0)

        # Replace 'train' and add the two validation sets for this fold; any
        # other datasets supplied by the caller stay in place.
        data_dict.update({
            'train': {'X': X_train, 'y': y_train},
            'inner_valid': {'X': X_inner_valid, 'y': y_inner_valid},
            'outer_valid': {'X': X_outer_valid, 'y': y_outer_valid},
        })

        # Train. categorical_feature is only passed when category_features is
        # not equal to an empty list.
        train_matrix = X_train[total_features]
        fit_options = {}
        if not (category_features == []):
            fit_options['categorical_feature'] = category_features
        fit_options.update(
            eval_metric=['auc'],
            eval_set=[(X_inner_valid[total_features], y_inner_valid)],
            early_stopping_rounds=500,
            verbose=500,
        )
        gbdt_estimator.fit(train_matrix, y_train, **fit_options)

        # Read the best iteration; the attribute name differs by library.
        if hasattr(gbdt_estimator, 'best_iteration_'):
            best_iteration = gbdt_estimator.best_iteration_
        elif hasattr(gbdt_estimator, 'best_iteration'):
            best_iteration = gbdt_estimator.best_iteration
        else:
            raise ValueError(
                "cannot find best_iteration in {0}".format(gbdt_estimator))

        # Predict and score every dataset currently in data_dict.
        detail_result, statistic_result = {}, {}
        for split_name, split in data_dict.items():
            features, labels = split.get('X'), split.get('y')
            scores = gbdt_estimator.predict_proba(features[total_features])[:, 1]
            auc_value = plot_roc_curve(labels, scores)
            ks_value = plot_ks_curve(scores, labels, is_score=False, n=10)
            detail_result[split_name] = {'predict': scores, 'true': labels.values}
            statistic_result[split_name] = {'auc': auc_value, 'ks': ks_value}

        # Store this fold's results under one shared key.
        fold_key = 'fold_' + str(fold_n) + '_group_' + group_fold
        fold_detail_result[fold_key] = detail_result
        fold_statistic_result[fold_key] = statistic_result
        fold_best_iteration_result[fold_key] = best_iteration
    return fold_detail_result, fold_statistic_result, fold_best_iteration_result
        'X': test_x,
        'y': test_y
    }
}
# Example script: train and evaluate three pipelines on the same data_dict,
# then compare their results on the 'test' dataset with several plots.
# NOTE(review): data_dict and imbRandom3/4/5 are defined outside this excerpt;
# the labels below suggest three resampling strategies - confirm.
# These two dicts are not referenced again in the visible part of the script.
model_detail_result = {}
model_statistic_result = {}

# Each call fits the given pipeline on data_dict['train'] and scores every
# dataset in data_dict.
model_detail_result5, model_statistic_result5 = pipe_train_test_evaluate(
    data_dict, imbRandom5)
model_detail_result4, model_statistic_result4 = pipe_train_test_evaluate(
    data_dict, imbRandom4)
model_detail_result3, model_statistic_result3 = pipe_train_test_evaluate(
    data_dict, imbRandom3)


# Combine the per-model detail results under a display label, passing 'test'
# as the dataset selector (presumably picks each model's 'test' entry - verify
# against model_result_combine).
model_predict_result = model_result_combine({'RandomSample':model_detail_result5,\
                                             'SMOTEENN':model_detail_result4,\
                                             'Smote':model_detail_result3},'test')
### KS curve (SMOTEENN model only)
ks = plot_ks_curve(model_predict_result.get('SMOTEENN').get('predict'),\
                   model_predict_result.get('SMOTEENN').get('true'),n=10,return_graph=True)
## ROC curves for all models
roc_dict, auc_dict = plot_multi_roc_curve_dict_type(model_predict_result)
## Pass-rate vs. reject-rate curve
bad_rate_result = plot_multi_reject_bad_curve_dict_type(model_predict_result)
## PR (precision-recall) curves
plot_multi_PR_curve_dict_type(model_predict_result)
### Density curve of the predicted probabilities (SMOTEENN model only)
plot_density_curve(model_predict_result.get('SMOTEENN').get('true'),\
                   model_predict_result.get('SMOTEENN').get('predict'))