def convert_scrd2rebin(score_card):
    """Convert a deployed score card (sheet "2_评分卡", shared by logistic and
    xgb deployments) back into a ``rebin_spec`` dict for WOE transformation.

    Intended companion usage::

        bin_obj = mt.BinWoe()
        X_cat_train = bin_obj.convert_to_category(data[selected], var_dict, rebin_spec)
        woe_iv_df_coarse = bin_obj.calculate_woe_all(X_cat_train, y_train, var_dict, rebin_spec)

    so offline data can be binned and summarised (bad rate, distribution).

    Args:
        score_card (DataFrame): the deployed score card table.

    Returns:
        dict: rebin spec keyed by the middle-layer variable name
        ('中间层指标名称'); numeric variables map to boundary specs, varchar
        variables map to {1-based bin index: list of raw categories}.
    """
    named_rows = score_card[score_card['中间层指标名称'].notnull()]
    numeric_vars = list(
        named_rows.loc[named_rows['数据类型'] != 'varchar', '中间层指标名称'].unique())
    categorical_vars = list(
        named_rows.loc[named_rows['数据类型'] == 'varchar', '中间层指标名称'].unique())

    rebin_spec = {}
    for var in numeric_vars:
        var_bins = score_card.loc[score_card['中间层指标名称'] == var, '分箱']
        rebin_spec[var] = mt.BinWoe().obtain_boundaries(var_bins)
    for var in categorical_vars:
        var_bins = score_card.loc[score_card['中间层指标名称'] == var, '分箱']
        # Each score-card row lists the raw categories of one bin, comma-joined.
        rebin_spec[var] = {
            idx + 1: raw.split(',') for idx, raw in enumerate(var_bins)
        }
    return rebin_spec
def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
    """Group WOE-transformed variables into clusters via FeatureAgglomeration.

    Args:
        X_cat (pd.DataFrame): binned (categorical) X.
        woe_iv_df (pd.DataFrame): WOE/IV table driving the transformation.
        n_clusters (int): number of feature clusters. default=15

    Returns:
        tuple: (DataFrame with columns ['指标英文', 'cluster'] sorted by
        cluster id, the WOE-transformed X)
    """
    X_transformed = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
    # Fit on at most 20k rows to keep the agglomeration affordable.
    if len(X_transformed) > 20000:
        fit_sample = X_transformed.sample(20000)
    else:
        fit_sample = X_transformed.copy()
    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    agglo.fit(fit_sample)
    vars_clusters = pd.DataFrame(data={
        '指标英文': X_transformed.columns.tolist(),
        'cluster': list(agglo.labels_),
    }).sort_values('cluster')
    return vars_clusters, X_transformed
def get_decile(score, varscore, all_Y, y_col, label, RESULT_PATH, manual_cut_bounds=[]):
    """Build a KS decile table for ``score`` against the given Y definition,
    enriched with the per-bin average of each variable score.

    Args:
        score (pd.Series): scores, index is apply_id.
        varscore (pd.DataFrame): per-variable scores, index is apply_id.
        all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14'];
            'Y_build' name can be changed. index is apply_id.
        y_col (str): column name of Y.
        label (str): sample set label.
        RESULT_PATH (str): result folder (kept for interface compatibility).
        manual_cut_bounds (list): optional preset score boundaries; when empty
            the decile boundaries are derived from the KS table.

    Returns:
        tuple: (decile DataFrame — empty when fewer than 11 labelled rows —
        and the cut boundaries actually used)
    """
    merged_data = score.to_frame('score').merge(
        all_Y, left_index=True, right_index=True)
    merged_data = merged_data.loc[merged_data[y_col].notnull()]

    # Need a minimally sized labelled sample to compute a meaningful decile.
    if len(merged_data) > 10:
        ks_decile = mt.Performance().calculate_ks_by_decile(
            merged_data.score, np.array(merged_data[y_col]), 'decile', 10,
            manual_cut_bounds=manual_cut_bounds)
        ks_decile.loc[:, 'sample_set'] = label
    else:
        ks_decile = pd.DataFrame()

    varscore_new = varscore.merge(
        score.to_frame('score'), left_index=True, right_index=True)

    if ks_decile.empty:
        return pd.DataFrame(), manual_cut_bounds

    if len(manual_cut_bounds) == 0:
        manual_cut_bounds = mt.BinWoe().obtain_boundaries(
            ks_decile['分箱'])['cut_boundaries']
    varscore_new.loc[:, 'score_10bin'] = pd.cut(
        varscore_new.score, manual_cut_bounds).astype(str)
    bin_varscore_avg = (varscore_new.groupby('score_10bin')
                        .mean()
                        .reset_index()
                        .rename(columns={'score_10bin': '分箱'}))
    ks_decile = ks_decile.merge(bin_varscore_avg, on='分箱')
    return ks_decile, manual_cut_bounds
def approval_rate_anaysis(backscore, RESULT_PATH, model_label, y_col):
    """Simulate approval rate vs. approved bad rate by sweeping the score
    cutoff across the RUNBOOK decile bins.

    Args:
        backscore (pd.Series): back-scored sample to bin.
        RESULT_PATH (str): folder containing 'all_decile.xlsx'.
        model_label (str): model identifier used to filter the decile table.
        y_col (str): Y definition used to filter the decile table.

    Returns:
        pd.DataFrame: one row per score bin (high score first) with
        distribution, cumulative ApprovalRate and ApprovedBadRate.
    """
    all_decile_df = pd.read_excel(os.path.join(RESULT_PATH, 'all_decile.xlsx'))
    runbook_mask = ((all_decile_df.sample_set == 'RUNBOOK')
                    & (all_decile_df.model_label == model_label)
                    & (all_decile_df.Y_definition == y_col))
    runbook = all_decile_df.loc[runbook_mask].copy()

    point_bounds = mt.BinWoe().obtain_boundaries(
        runbook['分箱'])['cut_boundaries']
    binned_scores = pd.cut(backscore, point_bounds).astype(str)

    dist_ct = (binned_scores.value_counts()
               .to_frame('样本量')
               .sort_index()
               .reset_index()
               .rename(columns={'index': '分箱'}))
    dist_ct.loc[:, '分布占比'] = dist_ct['样本量'] / dist_ct['样本量'].sum()
    # Remember the natural bin order before merging/re-sorting.
    dist_ct = dist_ct.reset_index().rename(columns={'index': 'bin_order'})
    dist_ct = dist_ct.merge(runbook[['分箱', '逾期率']], on='分箱', how='left')
    # Highest score bins first: cumulating downwards simulates lowering the cutoff.
    dist_ct = dist_ct.sort_values('bin_order', ascending=False)
    dist_ct.loc[:, 'ApprovalRate'] = dist_ct['分布占比'].cumsum()
    dist_ct.loc[:, 'ApprovedBadRate'] = (
        (dist_ct['分布占比'] * dist_ct['逾期率']).cumsum()
        / dist_ct['分布占比'].cumsum())
    dist_ct.loc[:, 'model_label'] = model_label
    return dist_ct
def score_stability(score_dict, all_Y, y_col, train_set_name, RESULT_PATH,
                    model_label):
    """Compute score PSI of every sample set against the training sample set.

    Args:
        score_dict (dict): the key is the label of the sample set, e.g.
            'train', 'test', 'oot' etc. Should be the same as
            data_cat_dict.keys(). The value holds the scores (a 'score'
            column), index is apply_id.
        all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14'], 'Y_build'
            name can be changed. index is apply_id. (Kept for interface
            compatibility; not used in the computation.)
        y_col (str): column name of Y.
        train_set_name (str): key of the training set in ``score_dict``.
        RESULT_PATH (str): folder containing 'all_decile.xlsx'.
        model_label (str): model identifier used to filter the decile table.

    Returns:
        pd.DataFrame: PSI rows tagged with 'model_label' and 'compare_set';
        empty when there is no sample set to compare against.
    """
    all_decile_df = pd.read_excel(os.path.join(RESULT_PATH, 'all_decile.xlsx'))
    the_bool = ((all_decile_df.model_label == model_label)
                & (all_decile_df.sample_set == train_set_name)
                & (all_decile_df.Y_definition == y_col))
    train_decile = all_decile_df.loc[the_bool].copy()
    point_bounds = mt.BinWoe().obtain_boundaries(
        train_decile['分箱'])['cut_boundaries']

    compare_set_names = [
        i for i in list(score_dict.keys()) if i != train_set_name
    ]
    # Bug fix: the original crashed here when score_dict only contained the
    # training set (unbound loop variable + pd.concat on an empty list).
    if not compare_set_names:
        return pd.DataFrame()

    d_list = []
    base_columns = None
    for label in compare_set_names:
        logging.log(18, label + ' data set starts score PSI calculation')
        tmp = mt.Performance().score_psi(score_dict[train_set_name]['score'],
                                         score_dict[label]['score'],
                                         point_bounds)
        # Capture the PSI columns BEFORE the bookkeeping column is added;
        # the original read tmp.columns afterwards and therefore selected
        # 'compare_set' twice in the output.
        base_columns = list(tmp.columns.values)
        tmp.loc[:, 'compare_set'] = '%s_vs_%s' % (train_set_name, label)
        d_list.append(tmp)

    columns_order = ['model_label', 'compare_set'] + base_columns
    all_score_psi = pd.concat(d_list)
    all_score_psi.loc[:, 'model_label'] = model_label
    return all_score_psi[columns_order]
def generate_xgb_deployment_documents(model_label, eda_table, model_spec,
                                      xgb_importance_score, var_dict,
                                      model_decile, model_result, RESULT_PATH,
                                      test_case_data, test_nums,
                                      woe_iv_df_coarse,
                                      production_name_map={}):
    """Generate the XGBoost model deployment document bundle.

    Args:
        model_label (str): model name.
        eda_table (DataFrame): EDA results.
        model_spec (dict): content of 'XGBoost输出数据和分箱明细.pkl'
            (keys: 'rebin_spec', 'bin_to_label', 'dummy_var_name_map').
        xgb_importance_score (DataFrame): xgboost feature importance ranking.
        var_dict (DataFrame): data dictionary.
        model_decile (DataFrame): modeling decile table.
        model_result (dict): model results; must contain 'model_final',
            'p_train' and 'p_test'.
        RESULT_PATH (str): result folder.
        test_case_data (DataFrame): test data used to generate test cases;
            for deployment make sure apply_id is the index.
        test_nums (int): number of test cases to generate.
        woe_iv_df_coarse (DataFrame): coarse-binning result from modeling.
        production_name_map (dict): pass when a variable's online name
            differs from its modeling name. key=modeling name,
            value=online name. default={} means no mismatch.
    """
    if not os.path.exists(os.path.join(RESULT_PATH, 'deployment')):
        os.makedirs(os.path.join(RESULT_PATH, 'deployment'))
    DEPLOY_PATH = os.path.join(RESULT_PATH, 'deployment')

    rebin_spec = model_spec['rebin_spec']
    bin_to_label = model_spec['bin_to_label']
    dummy_var_name_map = model_spec['dummy_var_name_map']

    # Persist the raw importance table before any renaming.
    impt_writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH,
                     '%s_variable_importance.xlsx' % model_label))
    xgb_importance_score.to_excel(impt_writer, 'local', index=False)
    impt_writer.save()

    xgb_importance_score = xgb_importance_score.rename(
        columns={
            'feature': 'XGB衍生入模名称',
            'fscore': '指标用于Split数据的数量',
            'imp_pct': '指标用于Split数据的数量占比'
        })
    # Recover the raw middle-layer variable behind each derived feature.
    xgb_importance_score[['XGB变量转换类型', '中间层指标名称']] = xgb_importance_score['XGB衍生入模名称']\
        .apply(lambda x: pd.Series(mt.BinWoe().xgboost_obtain_raw_variable(x, var_dict)))
    xgb_importance_score['建模时指标名称'] = xgb_importance_score['中间层指标名称'].copy()
    xgb_importance_score['建模时XGB衍生入模名称'] = xgb_importance_score[
        'XGB衍生入模名称'].copy()

    # Apply modeling-name -> production-name renames everywhere they occur.
    for original, new_name in production_name_map.items():
        a = xgb_importance_score.loc[xgb_importance_score['建模时指标名称'] == original, '建模时XGB衍生入模名称']\
            .apply(lambda x: x.replace(original, new_name))
        xgb_importance_score.loc[xgb_importance_score['建模时指标名称'] ==
                                 original, 'XGB衍生入模名称'] = a
        if original in rebin_spec:
            rebin_spec[new_name] = rebin_spec.pop(original)
        if original in bin_to_label:
            bin_to_label[new_name] = bin_to_label.pop(original)
        # Important: rename inside var_dict as well, otherwise
        # rebin_spec_json = mu.process_rebin_spec(rebin_spec, var_dict,
        # num_variables + bin_variables) fails later.
        if original in list(var_dict['指标英文']):
            var_dict['指标英文'].replace({original: new_name}, inplace=True)
    xgb_importance_score['中间层指标名称'] = xgb_importance_score['中间层指标名称'].replace(
        production_name_map)

    xgb_importance_score = var_dict[['数据源', '指标英文', '指标中文', '数据类型']]\
        .rename(columns={'指标英文': '中间层指标名称'})\
        .merge(xgb_importance_score, on='中间层指标名称', how='right')
    xgb_importance_score.insert(
        5, '输出打分指标名称',
        xgb_importance_score['XGB衍生入模名称'].apply(
            lambda x: 'mlr_' + str(x) + '_xgb_' + model_label))
    xgb_importance_score = xgb_importance_score.append(
        {'输出打分指标名称': 'mlr_creditscore_xgb_' + model_label},
        ignore_index=True)
    xgb_importance_score = xgb_importance_score.append(
        {'输出打分指标名称': 'mlr_prob_xgb_' + model_label}, ignore_index=True)

    selected_variables = xgb_importance_score['建模时指标名称'].unique()
    model_eda = eda_table.loc[eda_table['指标英文'].isin(
        selected_variables)].copy()
    model_eda['指标英文'] = model_eda['指标英文'].replace(production_name_map)

    woe_iv_df_coarse_copy = woe_iv_df_coarse.copy()
    if len(production_name_map) > 0:
        for j, k in production_name_map.items():
            woe_iv_df_coarse_copy = woe_iv_df_coarse_copy.replace(j, k)

    xgb_importance_score.rename(columns={'数据源': '指标分类'}, inplace=True)
    xgb_importance_score_bin_result = pd.merge(
        xgb_importance_score.drop_duplicates('中间层指标名称')[[
            '指标分类', '中间层指标名称', 'XGB衍生入模名称', '输出打分指标名称',
            '指标用于Split数据的数量', '指标用于Split数据的数量占比', 'XGB变量转换类型',
            '建模时指标名称', '建模时XGB衍生入模名称'
        ]],
        woe_iv_df_coarse_copy,
        left_on='中间层指标名称', right_on='指标英文', how='left')[[
            '指标分类', '中间层指标名称', '指标中文', 'XGB衍生入模名称', '输出打分指标名称',
            '指标用于Split数据的数量', '指标用于Split数据的数量占比', 'XGB变量转换类型',
            '建模时指标名称', '建模时XGB衍生入模名称', '数据类型', '指标类型', '分箱',
            '分箱对应原始分类', 'N', '分布占比', 'WOE', '逾期率'
        ]]
    # Prefer the raw-category description of a bin when one exists.
    xgb_importance_score_bin_result[
        '分箱'] = xgb_importance_score_bin_result.apply(
            lambda x: x['分箱对应原始分类'] if x['分箱对应原始分类'] else x['分箱'],
            axis=1)
    xgb_importance_score_bin_result.drop(['分箱对应原始分类'], axis=1,
                                         inplace=True)

    writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))
    xgb_importance_score_bin_result.to_excel(writer, '2_模型变量重要性排序及分箱统计',
                                             index=False)
    model_decile.to_excel(writer, '3_模型decile', index=False)
    model_eda.to_excel(writer, '4_模型EDA', index=False)
    writer.save()
    logging.info("""第六步部署:生成XGBoost部署文档。
    1. 模型部署Excel文档存储于:%s
    2. 需添加『0_文档修订记录』、『1_信息总览』页面。详见其他正式部署文档文件。并存储于『/Seafile/模型共享/模型部署文档/』相应文件夹中
    """ % os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))

    # Rename the booster's feature names: modeling names -> current names.
    derive_name_map = dict(
        zip(xgb_importance_score['建模时XGB衍生入模名称'],
            xgb_importance_score['XGB衍生入模名称']))
    xgbmodel = model_result['model_final']
    var_list = []
    for i in xgbmodel.__dict__['feature_names']:
        try:
            var_list.append(derive_name_map[i])
        except KeyError:
            # Feature has no rename entry: keep its original name.
            var_list.append(i)
    xgbmodel.__dict__['feature_names'] = var_list
    if len(production_name_map) > 0:
        var_list_copy = []
        for i in var_list:
            # Apply every production rename to the name, then append once
            # (the original loop appended per mapping entry, producing
            # duplicates whenever the map held more than one entry).
            r = i
            for j, k in production_name_map.items():
                r = r.replace(j, k)
            var_list_copy.append(r)
        xgbmodel.__dict__['feature_names'] = var_list_copy

    num_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'num_vars_origin',
        '中间层指标名称'].unique())
    bin_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'bin_vars',
        '中间层指标名称'].unique())
    rebin_spec_json = mu.process_rebin_spec(rebin_spec, var_dict,
                                            num_variables + bin_variables)
    bin_to_label = {
        k: v for k, v in bin_to_label.items() if k in bin_variables
    }

    # Describe, per transform type, how each raw variable enters the model.
    var_transform_method = {}
    var_transform_method['num_vars_origin'] = num_variables
    var_transform_method['bin_vars'] = bin_variables
    var_transform_method['dummy_vars'] = {}
    dummy_vars_df = xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'dummy_vars'].copy()
    dummy_vars_df.loc[:, "dummy原始名称"] = dummy_vars_df['建模时XGB衍生入模名称'].apply(
        lambda x: dummy_var_name_map[x])
    dummy_vars_df.loc[:, 'dummy原始对应分类'] = dummy_vars_df.loc[:, "dummy原始名称"].apply(
        lambda x: x.split('DUMMY')[-1])
    for original_variable in dummy_vars_df['中间层指标名称'].unique():
        cat_list = list(
            dummy_vars_df.loc[dummy_vars_df['中间层指标名称'] == original_variable,
                              'dummy原始对应分类'].unique())
        var_transform_method['dummy_vars'][original_variable] = cat_list

    production_dummy_var_name_map = {}
    for new_dummy_name, old_dummy_name in dummy_var_name_map.items():
        if new_dummy_name != old_dummy_name:
            if new_dummy_name.split('DUMMY')[0] in production_name_map:
                var_name = new_dummy_name.split('DUMMY')[0]
                prod_var_name = production_name_map[var_name]
                production_dummy_var_name_map[old_dummy_name.replace(
                    var_name, prod_var_name)] = new_dummy_name.replace(
                        var_name, prod_var_name)
            else:
                production_dummy_var_name_map[old_dummy_name] = new_dummy_name
    var_transform_method['dummy_var_name_map'] = production_dummy_var_name_map

    save_data_to_json(rebin_spec_json, DEPLOY_PATH,
                      '%s_selected_rebin_spec.json' % model_label)
    save_data_to_json(bin_to_label, DEPLOY_PATH,
                      '%s_bin_to_label.json' % model_label)
    save_data_to_python2_pickle(xgbmodel, DEPLOY_PATH,
                                '%s_xgbmodel.pkl' % model_label)
    save_data_to_json(var_transform_method, DEPLOY_PATH,
                      '%s_var_transform_method.json' % model_label)

    # Generate the model's test cases.
    if test_nums < len(test_case_data):
        test_case_data_final = test_case_data[:test_nums]
    else:
        test_case_data_final = test_case_data.copy()
    if len(production_name_map) > 0:
        columns = []
        for i in test_case_data_final.columns:
            # Apply every production rename to the column, then append once.
            r = i
            for j, k in production_name_map.items():
                r = r.replace(j, k)
            columns.append(r)
        test_case_data_final.columns = columns

    offline_model_score = pd.concat([
        model_result['p_train'].apply(
            mt.Performance().p_to_score).to_frame('offline_model_score'),
        model_result['p_test'].apply(
            mt.Performance().p_to_score).to_frame('offline_model_score')
    ])
    model_used_origin_features = [
        i for i in list(set(xgb_importance_score['中间层指标名称']))
        if str(i) != 'nan' and i is not None
    ]
    test_case_data_final_with_xgbScore = pd.merge(
        test_case_data_final[model_used_origin_features],
        offline_model_score,
        left_index=True, right_index=True)

    def get_xgb_testcase(test_data, modelName, productName):
        # Build the curl command that replays one application online.
        strs = '''curl -d 'varparams={"modelName":"%s","productName":"%s","applyId": 123, ''' % (
            modelName, productName)
        for i, j in test_data.items():
            strs = strs + '"%s"' % i + ':"%s"' % j + ', '
        final = strs[:-2] + '''}' -X POST http://localhost:8080/modelInvoke'''
        return final

    test_case_data_final_with_xgbScore[
        'test_case'] = test_case_data_final_with_xgbScore[
            test_case_data_final_with_xgbScore.columns.difference(
                ['offline_model_score'])].to_dict('records')
    test_case_data_final_with_xgbScore[
        'curl_script'] = test_case_data_final_with_xgbScore['test_case'].apply(
            get_xgb_testcase, args=(model_label, model_label))
    test_case_data_final_with_xgbScore[[
        'offline_model_score', 'curl_script'
    ]].to_csv(os.path.join(DEPLOY_PATH,
                           '%s_xgbmodel_testcase.csv' % model_label),
              index=None)

    # Generate test cases for the deployment platform.
    generate_xgb_testcases(model_label, test_case_data_final, var_dict,
                           DEPLOY_PATH, model_result)
    # Bug fix: the original message read '1. %save_data_dict_to_pickle'
    # (leaking junk text into the log) and listed '%sbin_to_label.json',
    # which does not match the file actually saved above.
    logging.info("""第六步部署:生成XGBoost部署文档。
    线上部署配置文件存储于%s路径下
    1. %s
    2. %s
    3. %s
    4. %s
    """ % (
        DEPLOY_PATH,
        '%s_selected_rebin_spec.json' % model_label,
        '%s_bin_to_label.json' % model_label,
        '%s_var_transform_method.json' % model_label,
        '%s_xgbmodel.pkl' % model_label,
    ))
# 相关性分析删除的变量 exclusion_cols = vif_result.corr_exclude_vars + vif_result.vif_dropvars exclusion_cols # # 调整粗分箱 selected = list(set(selected) - set(exclusion_cols)) selected ################################################################ #S9: 适用于逻辑回归的调分箱, 衍生bin_num, dummy变量的处理方式 ################################################################ # ## 训练集按照新的rebin_spec进行分箱 #apply new bin cutting to the data bin_obj = mt.BinWoe() new_X = X_train[selected] X_cat_train = bin_obj.convert_to_category(new_X, var_dict, rebin_spec) woe_iv_df_coarse = bin_obj.calculate_woe_all(X_cat_train, y_train, var_dict, rebin_spec) # 字符型变量编码前的原始值写入粗分箱文件中 mt.add_origin_categorical_val(woe_iv_df_coarse, all_encoding_map) # ##将woe_iv_df_coarse存储出去,为了部署监控使用 save_data_to_pickle(woe_iv_df_coarse, Result_path, 'woe_iv_df_coarse.pkl') # ## 测试集按照新的rebin_spec进行分箱 new_X = X_test[selected] X_cat_test = bin_obj.convert_to_category(new_X, var_dict, rebin_spec) # # 按月查看训练集分箱以后的分布以及逾期率 # ## 将分好箱的数据与原数据中的y以及时间标识合并
def xgb_online_score_validity(data, input_vars, var_dict, var_transform_method,
                              model_booster, model_decile, onLineScoreName):
    '''Post-launch monitoring for an xgb model: find applications with
    unfilled missing values, online/offline scoring mismatches, and the
    model-score PSI.

    Args:
        data (DataFrame): long-format online data; must contain apply_id,
            var_code and create_at columns. Follow the deployment-monitoring
            SQL conventions when extracting the PSI data; if the unique key
            is not apply_id, rename that key column to apply_id first.
        input_vars (list): raw variable names.
        var_transform_method (dict): the variable-transform file saved at
            deployment time.
        var_dict (DataFrame): data dictionary.
        model_booster (Booster): the booster file saved at deployment time.
        model_decile (DataFrame): the modeling-time decile table.
        onLineScoreName (str): column name of the online xgb score.

    Returns:
        data_input_vars_missing (DataFrame): applications with unfilled
            missing values.
        data_scored_applys_wrong (DataFrame): mis-scored application detail.
        model_decile_summary (DataFrame): model-score PSI table.
    '''
    # Keep only the latest record per (apply_id, var_code).
    data = data.sort_values(['apply_id','var_code','created_at'],ascending=False)\
        .drop_duplicates(['apply_id','var_code'])
    # Pivot long data to one row per application.
    data_pivot_all = data.pivot(index='apply_id',columns='var_code',values='var_val')
    # Applications with at least one missing model input.
    data_input_vars_missing = data_pivot_all[data_pivot_all[input_vars].isnull().sum(axis=1)>0]
    data_scored_applys = data_pivot_all[data_pivot_all[input_vars].isnull().sum(axis=1)==0]
    # Raw inputs before entering the model.
    data_scored_applys_origin = data_scored_applys[input_vars]
    # Cast values to the dtypes declared in the data dictionary.
    data_scored_applys_origin = mu.convert_right_data_type(data_scored_applys_origin, var_dict)[0]
    try:
        # Dummy-encode the variables that were dummy-encoded at training time.
        data_dummys = pd.get_dummies(data_scored_applys_origin[list(var_transform_method['dummy_vars'].keys())]\
            ,columns=list(var_transform_method['dummy_vars'].keys()), drop_first=False\
            , prefix_sep='DUMMY')
        # Data entering the model: raw columns plus dummies.
        data_in_model = pd.merge(data_scored_applys_origin,data_dummys,left_index=True,right_index=True)
    except:
        # Best-effort fallback: presumably the model has no dummy variables
        # — TODO confirm and narrow this bare except.
        data_in_model = data_scored_applys_origin
    data_in_model.rename(columns=var_transform_method['dummy_var_name_map'],inplace=True)
    # Convert to the xgb input format, keeping the booster's feature order.
    data_in_model_DMatrix = xgb.DMatrix(data_in_model[model_booster.__dict__['feature_names']])
    data_scored_applys['offLine_prob'] = model_booster.predict(data_in_model_DMatrix)
    data_scored_applys['offLineScore'] = data_scored_applys['offLine_prob']\
        .apply(lambda x : mt.Performance().p_to_score(x))
    # Applications whose online score differs from the offline re-score.
    data_scored_applys_wrong = data_scored_applys[data_scored_applys[[onLineScoreName,'offLineScore']]\
        .apply(lambda x : float(x[onLineScoreName]) != float(x['offLineScore']),axis=1)]
    data_scored_applys_right = data_scored_applys[data_scored_applys[[onLineScoreName,'offLineScore']]\
        .apply(lambda x : float(x[onLineScoreName]) == float(x['offLineScore']),axis=1)]
    # PSI: bin the current online scores with the modeling-time boundaries.
    point_bounds_XGBRandom = mt.BinWoe().obtain_boundaries(model_decile['分箱'])['cut_boundaries']
    model_decile_curr = pd.cut(data_scored_applys_right[onLineScoreName].astype(float)\
        ,point_bounds_XGBRandom).value_counts().sort_index().to_frame('curr_num').reset_index()
    model_decile_final = pd.concat([model_decile,model_decile_curr],axis=1)
    model_decile_final['建模分布'] = model_decile_final['样本数']/model_decile_final['样本数'].sum()
    model_decile_final['curr_dist'] = model_decile_final['curr_num']/model_decile_final['curr_num'].sum()
    # Per-bin PSI term; zero when either distribution share is zero.
    model_decile_final['psi'] = model_decile_final.apply(lambda x : (x['建模分布'] - x['curr_dist'])\
        *log(x['建模分布']/x['curr_dist'])\
        if x['curr_dist']!=0 and x['建模分布']!=0 else 0\
        ,axis=1)
    model_decile_summary = model_decile_final[['分箱','样本数','建模分布','curr_num','curr_dist','psi']]
    return data_input_vars_missing,data_scored_applys_wrong,model_decile_summary
def verify_var_and_score_validity(data_origin, score_card_deploy,
                                  rebin_spec_bin_adjusted_deploy,
                                  model_decile_deploy):
    '''Post-launch monitoring for a score-card model: find applications with
    unfilled missing values, per-variable and model-level scoring mismatches
    between online and offline, and PSI at both levels.

    Args:
        data_origin (DataFrame): long-format online data; must contain
            apply_id, var_code and create_at columns. Follow the
            deployment-monitoring SQL conventions; if the unique key is not
            apply_id, rename that key column to apply_id first.
        score_card_deploy (DataFrame): the score card AS DEPLOYED.
        rebin_spec_bin_adjusted_deploy (dict): the deployed variable-binning
            .json content, with 'numerical' and 'categorical' sections.
        model_decile_deploy (DataFrame): the deployed model decile table
            (contained in the deployed score card).

    Returns:
        wrong_data (DataFrame): applications with unfilled missing values.
        wrong_score_vars (dict): variable -> mis-scored application detail.
        wrong_model_score (DataFrame): model-score mismatch detail.
        var_dist_data_with_score_card (DataFrame): variable PSI.
        score_dist_with_decile (DataFrame): model-score PSI.
    '''
    # Rename the deployed score-card headers to the modeling-time names.
    score_card_deploy.rename(columns={'打分': '变量打分', '中间层指标名称': '指标英文'},
                             inplace=True)
    # Variables used by the model and their score-output names.
    model_selected = [i for i in set(list(score_card_deploy['指标英文']))
                      if str(i) != 'nan']
    mlr_selected_vars = [i for i in set(list(score_card_deploy['输出打分指标名称']))
                         if str(i) != 'nan']
    # Build a data dictionary from the score card.
    var_dict = score_card_deploy.drop_duplicates('指标英文')
    var_dict = var_dict[var_dict['指标英文'].isin(model_selected)]
    # Rebuild the modeling-time score card, adding an intercept row.
    # .copy() avoids SettingWithCopy surprises when mutating the slice below.
    score_card_modeling = score_card_deploy[
        score_card_deploy['指标英文'].isin(model_selected)].copy()
    # Bug fix: .ix was removed in pandas 1.0 — use label-based .loc instead.
    score_card_modeling.loc[len(score_card_modeling) + 1, '指标英文'] = 'intercept'
    const_var = [i for i in set(list(score_card_deploy['输出打分指标名称']))
                 if 'const' in i][0]
    const_value = float(score_card_deploy.loc[
        score_card_deploy['输出打分指标名称'] == const_var, '变量打分'])
    score_card_modeling.loc[
        score_card_modeling['指标英文'] == 'intercept', '变量打分'] = const_value
    # Rebuild the modeling-time rebin_spec (numerical + categorical merged).
    rebin_spec_bin_adjusted_modeling = rebin_spec_bin_adjusted_deploy['numerical']
    rebin_spec_bin_adjusted_modeling.update(
        rebin_spec_bin_adjusted_deploy['categorical'])
    # Map each model variable to its online score column.
    modelVar_scoreVar_dict = {}
    for j, i in score_card_deploy[['指标英文', '输出打分指标名称']]\
            .drop_duplicates(['指标英文']).iterrows():
        if str(i['指标英文']) != 'nan':
            modelVar_scoreVar_dict[i['指标英文']] = i['输出打分指标名称']
    # Deduplicate the raw data; fall back when created_at is absent
    # (bare except narrowed to the KeyError sort_values actually raises).
    try:
        data_origin = data_origin.sort_values(
            ['apply_id', 'var_code', 'created_at'], ascending=False)\
            .drop_duplicates(['apply_id', 'var_code', 'created_at'])
    except KeyError:
        try:
            data_origin = data_origin.sort_values(
                ['apply_id', 'var_code'], ascending=False)\
                .drop_duplicates(['apply_id', 'var_code'])
        except KeyError:
            pass
    data_origin_pivot = data_origin.pivot(index='apply_id',
                                          columns='var_code',
                                          values='var_val')
    # Applications with at least one missing variable.
    wrong_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1) > 0]
    wrong_data.reset_index(inplace=True)
    # Applications with complete inputs, cast to dictionary dtypes.
    right_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1) == 0]
    right_data_converted = convert_right_data_type(
        right_data[model_selected], var_dict)[0]
    # Bin the data and re-score it offline with the score card.
    bin_obj = mt.BinWoe()
    data_origin_pivot_cat = bin_obj.convert_to_category(
        right_data_converted[model_selected], var_dict,
        rebin_spec_bin_adjusted_modeling)
    data_origin_pivot_score_all = mt.Performance().calculate_score_by_scrd(
        data_origin_pivot_cat[model_selected], score_card_modeling)
    data_origin_pivot_score_vars = data_origin_pivot_score_all[0]
    data_origin_pivot_score_model = data_origin_pivot_score_all[1]
    data_origin_pivot_cat.columns = [
        i + '_cat' for i in data_origin_pivot_cat.columns]
    data_origin_pivot_score_vars.columns = [
        i + '_varScoreBymodel' for i in data_origin_pivot_score_vars.columns]
    data_origin_pivot_all = pd.merge(
        pd.merge(data_origin_pivot, data_origin_pivot_cat,
                 left_index=True, right_index=True),
        data_origin_pivot_score_vars, left_index=True, right_index=True)
    # Collect, per variable, the applications whose online variable score
    # disagrees with the offline re-score.
    wrong_score_vars = {}
    for i in model_selected:
        wrong_dt = data_origin_pivot_all[data_origin_pivot_all.apply(
            lambda x: float(x[modelVar_scoreVar_dict[i]]) !=
            float(x[i + '_varScoreBymodel']), axis=1)][[
                i, i + '_cat', i + '_varScoreBymodel',
                modelVar_scoreVar_dict[i]]]
        wrong_score_vars[i] = wrong_dt.reset_index()
    # Combine online and offline total scores.
    mlr_score_vars = [i for i in mlr_selected_vars if 'creditscore' in i][0]
    model_on_off_score = pd.merge(
        data_origin_pivot_score_model.to_frame('back_score'),
        data_origin_pivot_all[mlr_score_vars].to_frame('mlr_score'),
        left_index=True, right_index=True)
    wrong_model_score = model_on_off_score[model_on_off_score.apply(
        lambda x: float(x['back_score']) != float(x['mlr_score']), axis=1)]
    wrong_model_score.reset_index(inplace=True)
    # Variable-level PSI against the score-card distribution.
    var_dist = []
    for i in model_selected:
        tmp = (data_origin_pivot_cat[i + '_cat'].value_counts())\
            .to_frame('allNum').reset_index()
        tmp['dist'] = tmp['allNum'] / len(data_origin_pivot_cat)
        tmp['指标英文'] = i
        tmp.columns = ['分箱', 'allNum', 'dist', '指标英文']
        var_dist.append(tmp)
    var_dist_data = pd.concat(var_dist)[['指标英文', '分箱', 'allNum', 'dist']]
    var_dist_data['分箱'] = var_dist_data['分箱'].astype(str)
    var_dist_data_with_score_card = pd.merge(score_card_modeling,
                                             var_dist_data,
                                             on=['指标英文', '分箱'])
    # Per-bin PSI term; zero when either share is zero.
    var_dist_data_with_score_card['PSI'] = var_dist_data_with_score_card.apply(
        lambda x: (x['dist'] - x['分布占比']) * log(x['dist'] / x['分布占比'])
        if x['dist'] != 0 and x['分布占比'] != 0 else 0, axis=1)
    var_dist_data_with_score_card = var_dist_data_with_score_card[[
        '指标分类', '指标英文', '变量中文', '数据类型', '分箱', 'N', '分布占比',
        'allNum', 'dist', 'PSI']]
    # Model-score PSI: bin online scores with the deployed decile boundaries.
    point_bounds = mt.BinWoe().obtain_boundaries(
        model_decile_deploy[u'分箱'])['cut_boundaries']
    mlr_score_dist = (pd.cut(data_origin_pivot_all[mlr_score_vars].astype(float),
                             point_bounds)
                      .value_counts().to_frame('allNum')).reset_index()
    mlr_score_dist['dist'] = mlr_score_dist['allNum'] / len(data_origin_pivot_all)
    mlr_score_dist['index'] = mlr_score_dist['index'].astype(str)
    model_decile_deploy['样本分布'] = model_decile_deploy['样本数'] / sum(
        model_decile_deploy['样本数'])
    score_dist_with_decile = pd.merge(model_decile_deploy, mlr_score_dist,
                                      left_on='分箱', right_on='index')[[
                                          '分箱', '样本数', '样本分布',
                                          'allNum', 'dist']]
    score_dist_with_decile['PSI'] = score_dist_with_decile.apply(
        lambda x: (x['dist'] - x['样本分布']) * log(x['dist'] / x['样本分布'])
        if x['dist'] != 0 and x['样本分布'] != 0 else 0, axis=1)
    return wrong_data, wrong_score_vars, wrong_model_score,\
        var_dist_data_with_score_card, score_dist_with_decile
import os import sys import pickle import time from datetime import datetime from xgboost import XGBClassifier from sklearn.model_selection import GridSearchCV from sklearn.metrics import accuracy_score, roc_curve import utils3.misc_utils as mu import utils3.summary_statistics1 as ss import utils3.feature_selection as fs fs_obj = fs.FeatureSelection() import utils3.metrics as mt pf = mt.Performance() bw = mt.BinWoe() import utils3.plot_tools as pt import matplotlib.pyplot as plt import utils3.xgboost_model as xm import utils3.filing as fl import utils3.data_io_utils as du ## 数据预处理、字典、eda、缺失填充 def data_preprocess(df, in_dict, var_dict=None): ''' 输入原始数据,输出缺失填充好的数据、字典、eda结果 'known_missing': {-999:-9999, -1111:-9999} #将data中已知表示缺失的值赋值为-9999 ''' #index_id=in_dict['index_id'] #label=in_dict['target']
def overall_ranking(self, X, y, var_dict, args_dict, methods,
                    num_max_bins=10, n_clusters=15, verbose=True):
    """Combined variable ranking across several selection algorithms.

    Args:
        X (pd.DataFrame): wide table of x variables, assumed to have missing
            values already handled. If not, they are processed here.
        y (pd.Series): y label.
        var_dict (pd.DataFrame): variable dictionary. Must contain:
            [u'数据源', u'指标英文', u'指标中文', u'数据类型', u'指标类型'].
        args_dict (dict): keys are algorithm names such as
            ['random_forest', 'svm', 'xgboost', 'IV', 'lasso']; each value is
            a dict that may contain 'grid_search' and 'param'. Defaults are
            used when absent.
        methods (list): subset of
            ['random_forest', 'svm', 'xgboost', 'IV', 'lasso'].
        num_max_bins (int): number of bins for automatic binning. default=10
        n_clusters (int): number of feature clusters. default=15
        verbose (bool): verbosity flag passed through to the helpers.

    Returns:
        X_cat: auto-binned X.
        X_transformed: WOE-transformed X.
        woe_iv_df (pd.DataFrame): auto-classing WOE/IV table, ready to paste
            into the output document.
        rebin_spec: binning spec rebuilt from the WOE table.
        result (pd.DataFrame): per-variable metrics, per-method rank and the
            combined overall rank (plus cluster assignment).
    """
    num_missing = pd.isnull(X).sum().sum()
    if num_missing > 0:
        X = mu.process_missing(X, var_dict, verbose=verbose)
    result_df_list = []
    # IV: calculate_iv bins internally, so it needs the raw data with only
    # missing values processed. Computing IV is mandatory.
    bin_obj = mt.BinWoe()
    logging.info('Overall ranking prep: auto binning started')
    X_cat, all_encoding_map, all_spec_dict = bin_obj.binning(
        X, y, var_dict, num_max_bins, verbose=verbose)
    logging.info('Overall ranking prep: auto binning completed')
    woe_iv_df = bin_obj.calculate_woe_all(X_cat, y, var_dict, all_spec_dict,
                                          verbose=verbose)
    logging.info('Overall ranking prep: woe calculation completed')
    rebin_spec = bin_obj.create_rebin_spec(woe_iv_df, all_spec_dict,
                                           all_encoding_map)
    iv_result = woe_iv_df[['指标英文', 'IV']].copy()\
        .rename(columns={'指标英文':'var_code'})\
        .drop_duplicates().dropna()
    iv_result.loc[:, 'iv_rank'] = iv_result.IV.rank(ascending=False)
    result_df_list.append(iv_result)
    logging.info('Overall ranking prep: IV completed')

    # Label-encode categorical variables; numerical ones stay as-is.
    def _encoding_categorical(data, var_dict):
        new_data = data.copy()
        if '数据类型' in var_dict.columns.values:
            cateogory_col = var_dict.loc[var_dict['指标英文'].isin(new_data.columns) &\
                (var_dict['数据类型']=='varchar'), '指标英文'].tolist()
        else:
            cateogory_col = [
                col for col in new_data.columns
                if new_data[col].dtype == 'object'
            ]
        for col in cateogory_col:
            # Without filling NA as 'missing', each NA would get its own label.
            new_data.loc[:, col] = mu.label_encode(data[col])
        return new_data

    X = _encoding_categorical(X, var_dict)
    if 'random_forest' in methods:
        random_forest_result = self.random_forest(X, y, \
            args_dict['random_forest'].get('grid_search', False), \
            args_dict['random_forest'].get('param', None))
        result_df_list.append(random_forest_result)
        logging.info('Overall ranking prep: Random Forest completed')
    # Too slow: SVC is already libsvm-based, there is no faster option.
    if 'svm' in methods:
        svm_result = self.svm(X, y)
        result_df_list.append(svm_result)
        logging.info('Overall ranking prep: SVM completed')
    if 'xgboost' in methods:
        xgboost_result = self.xgboost(X, y, \
            args_dict['xgboost'].get('grid_search', False), \
            args_dict['xgboost'].get('param', None))
        result_df_list.append(xgboost_result)
        logging.info('Overall ranking prep: Xgboost completed')
    if 'lasso' in methods:
        lasso_result = self.lasso(X, y)
        result_df_list.append(lasso_result)
        logging.info('Overall ranking prep: LASSO completed')
    # Outer-merge all per-method metric tables on var_code.
    result = reduce(lambda left, right: left.merge(right, on='var_code', how='outer'),\
        result_df_list)
    # Overall rank = mean of the individual method ranks.
    rank_columns = [i for i in result.columns if 'rank' in i]
    result.loc[:, 'overall_rank'] = (result[rank_columns] * 1.0).mean(1)
    result = var_dict.loc[:, ['指标英文', '指标中文','数据源']].drop_duplicates()\
        .merge(result.rename(columns={'var_code': '指标英文'}), \
        on='指标英文')
    cluster_result, X_transformed = self.variable_clustering(
        X_cat, woe_iv_df, n_clusters=n_clusters)
    result = result.merge(cluster_result, on='指标英文', how='left')
    return X_cat, X_transformed, woe_iv_df, rebin_spec, result