def model_fuse_evaluation(model, train_data, valid_data, test_data, feature_used):
    """Evaluate a fitted (fused) classifier on the train/valid/test splits.

    Args:
        model: fitted classifier exposing ``predict`` / ``predict_proba``.
        train_data, valid_data, test_data: DataFrames containing the feature
            columns plus a binary ``'user_type'`` label column.
        feature_used: list of feature column names fed to the model.

    Returns:
        list: ``[valid_ks, valid_auc, ks_test, auc_test, auc_train, ks_train,
        accuracy]`` — order kept identical to the original implementation so
        existing callers that index into the list keep working.
    """
    def _split_scores(data):
        # Positive-class probability, then KS and AUC for one data split.
        X, y = data[feature_used], data['user_type']
        proba = model.predict_proba(X)[:, 1]
        return plot_ks_curve(proba, y), roc_auc_score(y, proba)

    # Same evaluation order as before: valid, then test, then train
    # (plot_ks_curve may draw — keep its call order stable).
    valid_ks, valid_auc = _split_scores(valid_data)
    ks_test, auc_test = _split_scores(test_data)
    ks_train, auc_train = _split_scores(train_data)

    # Hard-label metrics are reported on the validation split only.
    X_valid = valid_data[feature_used]
    y_valid = valid_data['user_type']
    predict_label = model.predict(X_valid)
    print(classification_report(y_valid.values, predict_label, target_names=['0', '1']))
    accuracy = metrics.accuracy_score(y_valid, predict_label)

    # NOTE: result order intentionally mixes ks/auc ordering between splits
    # (auc_train before ks_train) to stay backward-compatible with callers.
    return [valid_ks, valid_auc, ks_test, auc_test, auc_train, ks_train, accuracy]
def submodel_evaluation(train_data, valid_data, model_list,
                        category_feature, numeric_feature):
    """Fit each candidate model inside a PMML pipeline and score it on the
    validation split.

    Args:
        train_data, valid_data: DataFrames holding the feature columns plus a
            binary ``'user_type'`` label column.
        model_list: iterable of unfitted sklearn-style classifiers.
        category_feature, numeric_feature: lists of column names; their
            concatenation is the model input.

    Returns:
        dict: model class name -> ``[ks, auc, accuracy]`` on the valid split.
    """
    features = category_feature + numeric_feature
    X_train, y_train = train_data[features], train_data['user_type']
    X_valid, y_valid = valid_data[features], valid_data['user_type']

    # Shared preprocessing transformer for every candidate model.
    transformer = feature_union(category_feature, numeric_feature)

    results = {}
    for estimator in model_list:
        name = estimator.__class__.__name__
        print('model %s evaluation' % name)
        pipeline = PMMLPipeline([
            ('mapper', transformer),
            ('classifier', estimator),
        ])
        pipeline.fit(X_train, y_train)
        proba = pipeline.predict_proba(X_valid)[:, 1]
        labels = pipeline.predict(X_valid)
        results[name] = [
            plot_ks_curve(proba, valid_data['user_type']),
            roc_auc_score(y_valid, proba),
            metrics.accuracy_score(y_valid, labels),
        ]
    return results
def pipe_train_test_evaluate(data_dict, pipeline_estimator):
    '''
    Fit a pipeline on the 'train' split and evaluate it on every split
    present in data_dict (including 'train' itself).

    data_dict: dict of datasets keyed by name ('train', 'test_xxx', ...);
        each value is a dict with 'X' (features) and 'y' (labels).
    pipeline_estimator: sklearn-style pipeline with fit / predict_proba.

    return:
        detail_result: per-split predicted probabilities and true labels
        statistic_result: per-split {'auc', 'ks'} metrics
    '''
    ## train on the designated training split
    train_split = data_dict.get('train')
    pipeline_estimator.fit(train_split.get('X'), train_split.get('y'))

    detail_result, statistic_result = {}, {}

    ## score every split with the fitted pipeline
    for split_name, split in data_dict.items():
        split_X = split.get('X')
        split_y = split.get('y')
        proba = pipeline_estimator.predict_proba(split_X)[:, 1]
        split_auc = plot_roc_curve(split_y, proba)
        split_ks = plot_ks_curve(proba, split_y, is_score=False, n=10)
        detail_result[split_name] = {'predict': proba, 'true': split_y.values}
        statistic_result[split_name] = {'auc': split_auc, 'ks': split_ks}

    return detail_result, statistic_result
def gbm_cv_evaluate(X,y,total_features,category_features,cv,groups=None,X_test=None,y_test=None,params_dict=None):
    '''
    CV evaluation of a single LightGBM model. DEPRECATED — kept for
    backward compatibility.

    X: feature data, pandas.DataFrame
    y: labels, pandas.Series
    total_features: list of all modeling features
    category_features: list of categorical features (subset of total_features)
    cv: splitter providing split(X, y, groups)
    groups: optional group labels for grouped CV
    X_test: optional held-out test features
    y_test: optional held-out test labels
    params_dict: LightGBM training parameters

    return:
        detail_result: per-fold metrics, pandas.DataFrame
        statistic_result: aggregated mean/std metrics, dict
    '''
    valid_auc_list = []
    valid_ks_list = []
    train_auc_list = []
    train_ks_list = []
    if X_test is None:
        # Scalars here broadcast to a constant NaN column when the
        # detail_result DataFrame is built below.
        test_auc_list = np.nan
        test_ks_list = np.nan
    else:
        test_auc_list = []
        test_ks_list = []
    best_iteration_list = []
    ## iterate over CV folds
    for fold_n, (train_index, valid_index) in enumerate(cv.split(X,y,groups)):
        ## slice out this fold's training and validation data
        X_valid,y_valid = X.iloc[valid_index][total_features],y.iloc[valid_index]
        X_train,y_train = X.iloc[train_index][total_features],y.iloc[train_index]
        if len(category_features) == 0:
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)
        else:
            train_data = lgb.Dataset(X_train, label=y_train,categorical_feature=category_features)
            valid_data = lgb.Dataset(X_valid, label=y_valid,categorical_feature=category_features)
        ## train with early stopping on the validation fold
        # (large num_boost_round is a ceiling; early_stopping_rounds=100
        #  decides the effective number of iterations)
        model = lgb.train(params_dict,train_data,num_boost_round=20000,\
            valid_sets = [valid_data],verbose_eval=None,early_stopping_rounds=100)
        ## predict at the best iteration found by early stopping
        y_pred_train = model.predict(X_train,num_iteration=model.best_iteration)
        y_pred_valid = model.predict(X_valid,num_iteration=model.best_iteration)
        ## evaluate AUC / KS on train and validation folds
        train_auc = plot_roc_curve(y_train,y_pred_train)
        train_ks = plot_ks_curve(y_pred_train,y_train, is_score=False, n=10)
        valid_auc = plot_roc_curve(y_valid,y_pred_valid)
        valid_ks = plot_ks_curve(y_pred_valid,y_valid, is_score=False, n=10)
        ## record fold results
        train_auc_list.append(train_auc)
        train_ks_list.append(train_ks)
        valid_auc_list.append(valid_auc)
        valid_ks_list.append(valid_ks)
        if X_test is not None:
            # Held-out test set is scored with each fold's model.
            y_pred_test = model.predict(X_test[total_features],num_iteration=model.best_iteration)
            test_auc = plot_roc_curve(y_test,y_pred_test)
            test_ks = plot_ks_curve(y_pred_test,y_test, is_score=False, n=10)
            test_auc_list.append(test_auc)
            test_ks_list.append(test_ks)
        best_iteration_list.append(model.best_iteration)
    # One row per fold; test columns are NaN when no test set was given.
    detail_result = pd.DataFrame(data={'test_auc':test_auc_list,
                                       'test_ks':test_ks_list,
                                       'valid_auc':valid_auc_list,
                                       'valid_ks':valid_ks_list,
                                       'train_ks':train_ks_list,
                                       'train_auc':train_auc_list,
                                       'best_iteration':best_iteration_list
                                       })
    statistic_result = {
        'train_auc_mean':np.mean(train_auc_list),
        'train_auc_std':np.std(train_auc_list),
        'train_ks_mean':np.mean(train_ks_list),
        'train_ks_std':np.std(train_ks_list),
        'valid_auc_mean':np.mean(valid_auc_list),
        'valid_auc_std':np.std(valid_auc_list),
        'valid_ks_mean':np.mean(valid_ks_list),
        'valid_ks_std':np.std(valid_ks_list),
        'test_auc_mean':np.mean(test_auc_list),
        'test_auc_std':np.std(test_auc_list),
        'test_ks_mean':np.mean(test_ks_list),
        'test_ks_std':np.std(test_ks_list)
    }
    print('train AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(train_auc_list), np.std(train_auc_list)),\
        'train KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(train_ks_list), np.std(train_ks_list)))
    print('valid AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(valid_auc_list), np.std(valid_auc_list)),\
        'valid KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(valid_ks_list), np.std(valid_ks_list)))
    if X_test is not None:
        print('test AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(test_auc_list), np.std(test_auc_list)),\
            'test KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(test_ks_list), np.std(test_ks_list)))
    print('best_iteration:',best_iteration_list)
    return detail_result,statistic_result
def pipe_cv_evaluate_old(X,y,pipeline_estimator,cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=10),groups=None,X_test=None,y_test=None):
    '''
    CV evaluation of a single pipeline. DEPRECATED — prefer pipe_cv_evaluate.

    X: feature data, pandas.DataFrame
    y: labels, pandas.Series
    pipeline_estimator: sklearn-style pipeline with fit / predict_proba
    cv: splitter providing split(X, y, groups)
    groups: optional group labels for grouped CV
    X_test: optional held-out test features
    y_test: optional held-out test labels

    return:
        detail_result: per-fold metrics, pandas.DataFrame
        statistic_result: aggregated mean/std metrics, dict
    '''
    valid_auc_list = []
    valid_ks_list = []
    train_auc_list = []
    train_ks_list = []
    if X_test is None:
        # Scalars broadcast to constant NaN columns in detail_result below.
        test_auc_list = np.nan
        test_ks_list = np.nan
    else:
        test_auc_list = []
        test_ks_list = []
    ## iterate over CV folds
    for fold_n, (train_index, valid_index) in enumerate(cv.split(X,y,groups)):
        ## slice out this fold's training and validation data
        X_valid,y_valid = X.iloc[valid_index],y.iloc[valid_index]
        X_train,y_train = X.iloc[train_index],y.iloc[train_index]
        ## BUG FIX: previously this called fit(X, y) — training on the FULL
        ## dataset inside the CV loop, so validation rows leaked into
        ## training and every fold metric was optimistically biased.
        ## Fit on the training fold only.
        pipeline_estimator.fit(X_train,y_train)
        ## predict positive-class probabilities
        y_pred_train = pipeline_estimator.predict_proba(X_train)[:,1]
        y_pred_valid = pipeline_estimator.predict_proba(X_valid)[:,1]
        ## evaluate AUC / KS on train and validation folds
        train_auc = plot_roc_curve(y_train,y_pred_train)
        train_ks = plot_ks_curve(y_pred_train,y_train, is_score=False, n=10)
        valid_auc = plot_roc_curve(y_valid,y_pred_valid)
        valid_ks = plot_ks_curve(y_pred_valid,y_valid, is_score=False, n=10)
        ## record fold results
        train_auc_list.append(train_auc)
        train_ks_list.append(train_ks)
        valid_auc_list.append(valid_auc)
        valid_ks_list.append(valid_ks)
        if X_test is not None:
            # Held-out test set is scored with each fold's model.
            y_pred_test = pipeline_estimator.predict_proba(X_test)[:,1]
            test_auc = plot_roc_curve(y_test,y_pred_test)
            test_ks = plot_ks_curve(y_pred_test,y_test, is_score=False, n=10)
            test_auc_list.append(test_auc)
            test_ks_list.append(test_ks)
    # One row per fold; test columns are NaN when no test set was given.
    detail_result = pd.DataFrame(data={'test_auc':test_auc_list,
                                       'test_ks':test_ks_list,
                                       'valid_auc':valid_auc_list,
                                       'valid_ks':valid_ks_list,
                                       'train_ks':train_ks_list,
                                       'train_auc':train_auc_list
                                       })
    statistic_result = {
        'train_auc_mean':np.mean(train_auc_list),
        'train_auc_std':np.std(train_auc_list),
        'train_ks_mean':np.mean(train_ks_list),
        'train_ks_std':np.std(train_ks_list),
        'valid_auc_mean':np.mean(valid_auc_list),
        'valid_auc_std':np.std(valid_auc_list),
        'valid_ks_mean':np.mean(valid_ks_list),
        'valid_ks_std':np.std(valid_ks_list),
        'test_auc_mean':np.mean(test_auc_list),
        'test_auc_std':np.std(test_auc_list),
        'test_ks_mean':np.mean(test_ks_list),
        'test_ks_std':np.std(test_ks_list)
    }
    print('train AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(train_auc_list), np.std(train_auc_list)),\
        'train KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(train_ks_list), np.std(train_ks_list)))
    print('valid AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(valid_auc_list), np.std(valid_auc_list)),\
        'valid KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(valid_ks_list), np.std(valid_ks_list)))
    if X_test is not None:
        print('test AUC:{0:.4f}, std:{1:.4f}.'.format(np.mean(test_auc_list), np.std(test_auc_list)),\
            'test KS:{0:.4f}, std:{1:.4f}.'.format(np.mean(test_ks_list), np.std(test_ks_list)))
    return detail_result,statistic_result
def gbdt_cv_evaluate_earlystop(data_dict, gbdt_estimator, total_features, category_features, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=10), groups=None):
    '''
    Cross-validated evaluation of a GBDT-family model with early stopping
    during training; applicable to XGB, LightGBM, GBDT, CatBoost, etc.

    data_dict: dict of datasets keyed by name ('train', 'test_xxx', ...);
        each value is a dict with 'X' (features) and 'y' (labels)
    gbdt_estimator: GBDT-style estimator (sklearn API with fit/eval_set)
    total_features: list of all modeling features
    category_features: list of categorical features (LightGBM-style
        categorical_feature fit kwarg; empty list to skip)
    cv: splitter providing split(X, y, groups)
    groups: optional group labels for grouped CV

    return:
        fold_detail_result: per-fold, per-split predicted probabilities and
            true labels
        fold_statistic_result: per-fold, per-split {'auc', 'ks'} metrics
        fold_best_iteration_result: per-fold early-stopping best iteration
    '''
    ## Shallow copy so the per-fold splits inserted below do not replace the
    ## caller's entries; TODO: revisit memory cost of this copy.
    # NOTE(review): copy() is shallow — the inner 'X'/'y' objects are shared
    # with the caller; only the top-level key bindings are protected.
    data_dict = data_dict.copy()
    ## pull out the training set used for CV splitting
    X = data_dict.get('train').get('X')
    y = data_dict.get('train').get('y')
    fold_detail_result = {}
    fold_statistic_result = {}
    fold_best_iteration_result = {}
    ## iterate over CV folds
    for fold_n, (train_index, valid_index) in enumerate(cv.split(X, y, groups)):
        # Label the fold by its (first) validation group when groups are used.
        if groups is None:
            group_fold = 'NULL'
        else:
            group_fold = str(groups[valid_index[0]])
        print('正在进行第{}折的验证,验证组号为{}'.format(fold_n, group_fold))
        ## slice out this fold's training and (outer) validation data
        X_outer_valid, y_outer_valid = X.iloc[valid_index], y.iloc[valid_index]
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        ## carve an inner validation set out of the training fold for
        ## early stopping (80/20 split, fixed seed)
        X_train, X_inner_valid, y_train, y_inner_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=0)
        ## register this fold's splits so the scoring loop below covers them
        ## alongside any extra test sets already present in data_dict
        temp_dict = {
            'train': {
                'X': X_train,
                'y': y_train
            },
            'inner_valid': {
                'X': X_inner_valid,
                'y': y_inner_valid
            },
            'outer_valid': {
                'X': X_outer_valid,
                'y': y_outer_valid
            }
        }
        data_dict.update(temp_dict)
        ## train with early stopping on the inner validation set
        #gbdt_estimator.set_params({'n_estimators':20000})
        if category_features == []:
            gbdt_estimator.fit(X_train[total_features],
                               y_train,
                               eval_metric=['auc'],
                               eval_set=[(X_inner_valid[total_features],
                                          y_inner_valid)],
                               early_stopping_rounds=500,
                               verbose=500)
        else:
            gbdt_estimator.fit(X_train[total_features],
                               y_train,
                               categorical_feature=category_features,
                               eval_metric=['auc'],
                               eval_set=[(X_inner_valid[total_features],
                                          y_inner_valid)],
                               early_stopping_rounds=500,
                               verbose=500)
        ## fetch the best iteration — attribute name varies across libraries
        ## (e.g. best_iteration_ vs best_iteration)
        if hasattr(gbdt_estimator, 'best_iteration_'):
            best_iteration = gbdt_estimator.best_iteration_
        elif hasattr(gbdt_estimator, 'best_iteration'):
            best_iteration = gbdt_estimator.best_iteration
        else:
            raise ValueError(
                "cannot find best_iteration in {0}".format(gbdt_estimator))
        ## score every split (train / inner_valid / outer_valid / test_xxx)
        detail_result = {}
        statistic_result = {}
        for key in data_dict.keys():
            temp_X = data_dict.get(key).get('X')
            temp_y = data_dict.get(key).get('y')
            temp_predict = gbdt_estimator.predict_proba(
                temp_X[total_features])[:, 1]
            temp_auc = plot_roc_curve(temp_y, temp_predict)
            temp_ks = plot_ks_curve(temp_predict, temp_y, is_score=False, n=10)
            detail_result[key] = {
                'predict': temp_predict,
                'true': temp_y.values
            }
            statistic_result[key] = {'auc': temp_auc, 'ks': temp_ks}
        ## store this fold's results keyed by fold number and group label
        fold_detail_result['fold_' + str(fold_n) + '_group_' +
                           group_fold] = detail_result
        fold_statistic_result['fold_' + str(fold_n) + '_group_' +
                              group_fold] = statistic_result
        fold_best_iteration_result['fold_' + str(fold_n) + '_group_' +
                                   group_fold] = best_iteration
    return fold_detail_result, fold_statistic_result, fold_best_iteration_result
'X': test_x, 'y': test_y } } model_detail_result = {} model_statistic_result = {} model_detail_result5, model_statistic_result5 = pipe_train_test_evaluate( data_dict, imbRandom5) model_detail_result4, model_statistic_result4 = pipe_train_test_evaluate( data_dict, imbRandom4) model_detail_result3, model_statistic_result3 = pipe_train_test_evaluate( data_dict, imbRandom3) model_predict_result = model_result_combine({'RandomSample':model_detail_result5,\ 'SMOTEENN':model_detail_result4,\ 'Smote':model_detail_result3},'test') ### ks曲线 ks = plot_ks_curve(model_predict_result.get('SMOTEENN').get('predict'),\ model_predict_result.get('SMOTEENN').get('true'),n=10,return_graph=True) ## roc曲线 roc_dict, auc_dict = plot_multi_roc_curve_dict_type(model_predict_result) ## 通过率vs拒绝率曲线 bad_rate_result = plot_multi_reject_bad_curve_dict_type(model_predict_result) ## PR曲线 plot_multi_PR_curve_dict_type(model_predict_result) ### 预测概率目的曲线 plot_density_curve(model_predict_result.get('SMOTEENN').get('true'),\ model_predict_result.get('SMOTEENN').get('predict'))