def summarize_perf_metrics(score_dict, all_Y, y_col):
    """
    Summarize model performance (KS, AUC) across sample sets and Y definitions
    into a single table.

    Args:
        score_dict (dict): the key is the label of the sample set, e.g. 'train',
            'test', 'oot' etc. Should be the same as data_cat_dict.keys().
            The value holds the scores. Index is apply_id.
        all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14']; the 'Y_build'
            name can be changed. Index is apply_id.
        y_col (str): the column name of Y
    """
    metrics_result_build = {}
    for label, score in score_dict.items():
        logging.log(18, label + ' data set starts performance calculation')
        merged_data = score['score'].to_frame('score')\
                                    .merge(all_Y, left_index=True, right_index=True)
        if merged_data[y_col].count() > 10:
            merged_data = merged_data.loc[merged_data[y_col].notnull()]
            metrics_result_build[label] = {}
            # AUC is computed on 1/score because a lower score implies higher risk
            metrics_result_build[label]['AUC'] = mt.Performance(
            ).calculate_auc(merged_data[y_col], 1. / merged_data.score)
            metrics_result_build[label]['KS'] = mt.Performance(
            ).calculate_ks_by_score(merged_data[y_col], merged_data.score)
    perf_result = pd.DataFrame(metrics_result_build)
    perf_result = perf_result.reset_index()
    perf_result = perf_result.rename(columns={'index': 'metrics'})
    return perf_result
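# Hedged usage sketch for summarize_perf_metrics: the toy apply_id-indexed
# frames below are illustrative assumptions, not project data. The call itself
# is left commented because it requires utils3.metrics (mt) to be importable.
import numpy as np
import pandas as pd

_idx = pd.Index(['apply_%03d' % i for i in range(200)], name='apply_id')
_score_dict = {
    'train': pd.DataFrame({'score': np.random.uniform(300, 900, 150)}, index=_idx[:150]),
    'test':  pd.DataFrame({'score': np.random.uniform(300, 900, 50)},  index=_idx[150:]),
}
_all_Y = pd.DataFrame({'Y_build': np.random.binomial(1, 0.1, 200)}, index=_idx)
# perf_table = summarize_perf_metrics(_score_dict, _all_Y, y_col='Y_build')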
def get_decile(score, varscore, all_Y, y_col, label, RESULT_PATH,
               manual_cut_bounds=None):
    """
    Compute the score decile table for the given scores under the given Y definition.

    Args:
        score (pd.Series): scores, index is apply_id
        varscore (pd.DataFrame): per-variable scores, index is apply_id
        all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14']; the 'Y_build'
            name can be changed. Index is apply_id.
        y_col (str): column name of Y
        label (str): sample set label
        RESULT_PATH (str): result path
        manual_cut_bounds (list): optional manual score cut boundaries; derived
            from the decile table when omitted
    """
    # Avoid the mutable-default-argument pitfall: a shared [] default would
    # leak boundaries between calls.
    if manual_cut_bounds is None:
        manual_cut_bounds = []
    merged_data = score.to_frame('score')\
                       .merge(all_Y, left_index=True, right_index=True)
    merged_data = merged_data.loc[merged_data[y_col].notnull()]
    if len(merged_data) > 10:
        ks_decile = mt.Performance().calculate_ks_by_decile(
            merged_data.score, np.array(merged_data[y_col]), 'decile', 10,
            manual_cut_bounds=manual_cut_bounds)
        ks_decile.loc[:, 'sample_set'] = label
    else:
        ks_decile = pd.DataFrame()
    varscore_new = varscore.merge(score.to_frame('score'),
                                  left_index=True,
                                  right_index=True)
    if not ks_decile.empty:
        if len(manual_cut_bounds) == 0:
            manual_cut_bounds = mt.BinWoe().obtain_boundaries(
                ks_decile['分箱'])['cut_boundaries']
        varscore_new.loc[:, 'score_10bin'] = pd.cut(varscore_new.score,
                                                    manual_cut_bounds).astype(str)
        bin_varscore_avg = varscore_new.groupby('score_10bin').mean()\
                                       .reset_index()\
                                       .rename(columns={'score_10bin': '分箱'})
        ks_decile = ks_decile.merge(bin_varscore_avg, on='分箱')
        return ks_decile, manual_cut_bounds
    else:
        return pd.DataFrame(), manual_cut_bounds
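# Standalone sketch of the binned-average step used above: pd.cut assigns each
# score to a decile bin, then a groupby over the bins averages the per-variable
# scores within each bin. Toy data; all names here are illustrative.
import numpy as np
import pandas as pd

_scores = pd.Series(np.random.uniform(300, 900, 1000))
_bounds = [300, 400, 500, 600, 700, 800, 900]
_bins = pd.cut(_scores, _bounds).astype(str)
_avg_per_bin = pd.DataFrame({'var_a': np.random.randn(1000)}).groupby(_bins).mean()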
def score_stability(score_dict, all_Y, y_col, train_set_name, RESULT_PATH,
                    model_label):
    """
    Compute the score stability (PSI) of every sample set against the training set.

    Args:
        score_dict (dict): the key is the label of the sample set, e.g. 'train',
            'test', 'oot' etc. Should be the same as data_cat_dict.keys().
            The value holds the scores. Index is apply_id.
        all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14']; the 'Y_build'
            name can be changed. Index is apply_id.
        y_col (str): column name of Y
        train_set_name (str): label of the training set in score_dict
        RESULT_PATH (str): path containing all_decile.xlsx
        model_label (str): model version name
    """
    all_decile_df = pd.read_excel(os.path.join(RESULT_PATH, 'all_decile.xlsx'))
    the_bool = ((all_decile_df.model_label == model_label)
                & (all_decile_df.sample_set == train_set_name)
                & (all_decile_df.Y_definition == y_col))
    train_decile = all_decile_df.loc[the_bool].copy()
    point_bounds = mt.BinWoe().obtain_boundaries(
        train_decile['分箱'])['cut_boundaries']
    d_list = []
    compare_set_names = [
        i for i in list(score_dict.keys()) if i != train_set_name
    ]
    for label in compare_set_names:
        logging.log(18, label + ' data set starts score PSI calculation')
        tmp = mt.Performance().score_psi(score_dict[train_set_name]['score'],
                                         score_dict[label]['score'],
                                         point_bounds)
        # Capture the PSI table's own columns before adding the bookkeeping
        # column, so 'compare_set' is not duplicated in the final column order.
        base_columns = list(tmp.columns.values)
        tmp.loc[:, 'compare_set'] = '%s_vs_%s' % (train_set_name, label)
        d_list.append(tmp)
    columns_order = ['model_label', 'compare_set'] + base_columns
    all_score_psi = pd.concat(d_list)
    all_score_psi.loc[:, 'model_label'] = model_label
    return all_score_psi[columns_order]
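# Standalone PSI reference, to make the score_psi call above concrete: PSI is
# the sum over bins of (p_i - q_i) * ln(p_i / q_i) for the two distributions
# being compared. This is a generic sketch, not the utils3 implementation.
from math import log

def _psi(expected_dist, actual_dist):
    """expected_dist/actual_dist: same-length lists of bin proportions."""
    return sum((p - q) * log(p / q)
               for p, q in zip(expected_dist, actual_dist)
               if p > 0 and q > 0)

# _psi([0.1] * 10, [0.12, 0.08] + [0.1] * 8) -> small value, i.e. stable score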
# 1) Set the primary key as the index; 2) declare y early in the code;
# 3) train-test mode and train-test-OOT mode
import sys
sys.path.append('/Users/yantingting/Documents/Code/CodePackage/')
from utils3.summary_statistics import *
from utils3.data_bins import *
from utils3.common_fun import *
from utils3.stepregression import *
from utils3.scorecard import *
import utils3.metrics as mt
import pandas as pd
pd.set_option('display.max_columns', None)
import os

pf = mt.Performance()
data_path = '/Users/yantingting/Documents/Code/'
result_path = '/Users/yantingting/Documents/Code/result'
if not os.path.exists(result_path):
    os.makedirs(result_path)

df = pd.read_excel(os.path.join(data_path, 'sample_data.xlsx'))
df.set_index(['company_name_md5'], inplace=True)
df.head()
# Overall bad rate
df['flag'].sum() / df['flag'].count()

# Step 1: build the data dictionary, to manage the variables available for modeling
my_dict = DataSummary().create_dict(df, is_available='Y', data_sorce='TYC',
def generate_xgb_deployment_documents(model_label, eda_table, model_spec,
                                      xgb_importance_score, var_dict,
                                      model_decile, model_result, RESULT_PATH,
                                      test_case_data, test_nums,
                                      woe_iv_df_coarse,
                                      production_name_map={}):
    """
    Generate the XGBoost model deployment documents.

    Args:
        model_label (str): model name
        eda_table (DataFrame): EDA result
        model_spec (dict): contents of the XGBoost output-and-binning-detail .pkl file
        xgb_importance_score (DataFrame): XGBoost variable importance ranking
        var_dict (DataFrame): data dictionary
        model_decile (DataFrame): modeling decile
        model_result (dict): model result
        RESULT_PATH (str): result path
        test_case_data (DataFrame): test data used to generate test cases; if
            used for deployment, make sure apply_id is the index
        test_nums (int): number of test cases to generate
        woe_iv_df_coarse (DataFrame): coarse binning result from modeling
        production_name_map (dict): pass in when a variable's online English
            name differs from the one used during modeling. key = modeling-time
            name, value = online name. default={} means no naming mismatch.
    """
    if not os.path.exists(os.path.join(RESULT_PATH, 'deployment')):
        os.makedirs(os.path.join(RESULT_PATH, 'deployment'))
    DEPLOY_PATH = os.path.join(RESULT_PATH, 'deployment')

    rebin_spec = model_spec['rebin_spec']
    bin_to_label = model_spec['bin_to_label']
    dummy_var_name_map = model_spec['dummy_var_name_map']

    impt_writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s_variable_importance.xlsx' % model_label))
    xgb_importance_score.to_excel(impt_writer, 'local', index=False)
    impt_writer.save()

    xgb_importance_score = xgb_importance_score.rename(
        columns={
            'feature': 'XGB衍生入模名称',
            'fscore': '指标用于Split数据的数量',
            'imp_pct': '指标用于Split数据的数量占比'
        })
    xgb_importance_score[['XGB变量转换类型', '中间层指标名称']] = xgb_importance_score['XGB衍生入模名称']\
        .apply(lambda x: pd.Series(mt.BinWoe().xgboost_obtain_raw_variable(x, var_dict)))
    xgb_importance_score['建模时指标名称'] = xgb_importance_score['中间层指标名称'].copy()
    xgb_importance_score['建模时XGB衍生入模名称'] = xgb_importance_score[
        'XGB衍生入模名称'].copy()
    for original, new_name in production_name_map.items():
        a = xgb_importance_score.loc[xgb_importance_score['建模时指标名称'] == original, '建模时XGB衍生入模名称']\
            .apply(lambda x: x.replace(original, new_name))
        xgb_importance_score.loc[xgb_importance_score['建模时指标名称'] == original,
                                 'XGB衍生入模名称'] = a
        if original in rebin_spec:
            rebin_spec[new_name] = rebin_spec.pop(original)
        if original in bin_to_label:
            bin_to_label[new_name] = bin_to_label.pop(original)
        # Important: rename the variable inside the data dictionary as well,
        # otherwise rebin_spec_json = mu.process_rebin_spec(rebin_spec,
        # var_dict, num_variables+bin_variables) below will fail.
        if original in list(var_dict['指标英文']):
            var_dict['指标英文'].replace({original: new_name}, inplace=True)
    xgb_importance_score['中间层指标名称'] = xgb_importance_score['中间层指标名称'].replace(
        production_name_map)
    xgb_importance_score = var_dict[['数据源', '指标英文', '指标中文', '数据类型']]\
        .rename(columns={'指标英文': '中间层指标名称'})\
        .merge(xgb_importance_score, on='中间层指标名称', how='right')
    xgb_importance_score.insert(
        5, '输出打分指标名称',
        xgb_importance_score['XGB衍生入模名称'].apply(
            lambda x: 'mlr_' + str(x) + '_xgb_' + model_label))
    xgb_importance_score = xgb_importance_score.append(
        {'输出打分指标名称': 'mlr_creditscore_xgb_' + model_label},
        ignore_index=True)
    xgb_importance_score = xgb_importance_score.append(
        {'输出打分指标名称': 'mlr_prob_xgb_' + model_label}, ignore_index=True)

    selected_variables = xgb_importance_score['建模时指标名称'].unique()
    model_eda = eda_table.loc[eda_table['指标英文'].isin(selected_variables)].copy()
    model_eda['指标英文'] = model_eda['指标英文'].replace(production_name_map)

    woe_iv_df_coarse_copy = woe_iv_df_coarse.copy()
    if len(production_name_map) > 0:
        for j, k in production_name_map.items():
            woe_iv_df_coarse_copy = woe_iv_df_coarse_copy.replace(j, k)

    xgb_importance_score.rename(columns={'数据源': '指标分类'}, inplace=True)
    xgb_importance_score_bin_result = pd.merge(
        xgb_importance_score.drop_duplicates('中间层指标名称')[[
            '指标分类', '中间层指标名称', 'XGB衍生入模名称', '输出打分指标名称',
            '指标用于Split数据的数量', '指标用于Split数据的数量占比', 'XGB变量转换类型',
            '建模时指标名称', '建模时XGB衍生入模名称'
        ]],
        woe_iv_df_coarse_copy,
        left_on='中间层指标名称',
        right_on='指标英文',
        how='left')[[
            '指标分类', '中间层指标名称', '指标中文', 'XGB衍生入模名称', '输出打分指标名称',
            '指标用于Split数据的数量', '指标用于Split数据的数量占比', 'XGB变量转换类型',
            '建模时指标名称', '建模时XGB衍生入模名称', '数据类型', '指标类型', '分箱',
            '分箱对应原始分类', 'N', '分布占比', 'WOE', '逾期率'
        ]]
    # Use the original category labels where they exist. pd.notnull is needed
    # because a bare truthiness test would treat NaN as True.
    xgb_importance_score_bin_result['分箱'] = xgb_importance_score_bin_result.apply(
        lambda x: x['分箱对应原始分类']
        if pd.notnull(x['分箱对应原始分类']) and x['分箱对应原始分类'] else x['分箱'],
        axis=1)
    xgb_importance_score_bin_result.drop(['分箱对应原始分类'], axis=1, inplace=True)

    writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))
    xgb_importance_score_bin_result.to_excel(writer, '2_模型变量重要性排序及分箱统计',
                                             index=False)
    model_decile.to_excel(writer, '3_模型decile', index=False)
    model_eda.to_excel(writer, '4_模型EDA', index=False)
    writer.save()
    logging.info("""Deployment step 6: generate the XGBoost deployment documents.
    1. The deployment Excel file is stored at: %s
    2. Add the『0_文档修订记录』and『1_信息总览』sheets manually (see the other formal
       deployment documents), and store the file in the corresponding folder
       under『/Seafile/模型共享/模型部署文档/』.
    """ % os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))

    derive_name_map = dict(
        zip(xgb_importance_score['建模时XGB衍生入模名称'],
            xgb_importance_score['XGB衍生入模名称']))
    xgbmodel = model_result['model_final']
    var_list = []
    for i in xgbmodel.__dict__['feature_names']:
        try:
            var_list.append(derive_name_map[i])
        except KeyError:
            var_list.append(i)
    xgbmodel.__dict__['feature_names'] = var_list
    if len(production_name_map) > 0:
        var_list_copy = []
        for i in var_list:
            # Apply every rename to the name, then append it once
            for j, k in production_name_map.items():
                i = i.replace(j, k)
            var_list_copy.append(i)
        xgbmodel.__dict__['feature_names'] = var_list_copy
    num_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'num_vars_origin',
        '中间层指标名称'].unique())
    bin_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'bin_vars',
        '中间层指标名称'].unique())
    rebin_spec_json = mu.process_rebin_spec(rebin_spec, var_dict,
                                            num_variables + bin_variables)
    bin_to_label = {
        k: v for k, v in bin_to_label.items() if k in bin_variables
    }
    var_transform_method = {}
    var_transform_method['num_vars_origin'] = num_variables
    var_transform_method['bin_vars'] = bin_variables
    var_transform_method['dummy_vars'] = {}
    dummy_vars_df = xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'dummy_vars'].copy()
    dummy_vars_df.loc[:, 'dummy原始名称'] = dummy_vars_df['建模时XGB衍生入模名称'].apply(
        lambda x: dummy_var_name_map[x])
    dummy_vars_df.loc[:, 'dummy原始对应分类'] = dummy_vars_df.loc[:, 'dummy原始名称'].apply(
        lambda x: x.split('DUMMY')[-1])
    for original_variable in dummy_vars_df['中间层指标名称'].unique():
        cat_list = list(
            dummy_vars_df.loc[dummy_vars_df['中间层指标名称'] == original_variable,
                              'dummy原始对应分类'].unique())
        var_transform_method['dummy_vars'][original_variable] = cat_list
    production_dummy_var_name_map = {}
    for new_dummy_name, old_dummy_name in dummy_var_name_map.items():
        if new_dummy_name != old_dummy_name:
            if new_dummy_name.split('DUMMY')[0] in production_name_map:
                var_name = new_dummy_name.split('DUMMY')[0]
                prod_var_name = production_name_map[var_name]
                production_dummy_var_name_map[old_dummy_name.replace(
                    var_name, prod_var_name)] = new_dummy_name.replace(
                        var_name, prod_var_name)
            else:
                production_dummy_var_name_map[old_dummy_name] = new_dummy_name
    var_transform_method['dummy_var_name_map'] = production_dummy_var_name_map
    save_data_to_json(rebin_spec_json, DEPLOY_PATH,
                      '%s_selected_rebin_spec.json' % model_label)
    save_data_to_json(bin_to_label, DEPLOY_PATH,
                      '%s_bin_to_label.json' % model_label)
    save_data_to_python2_pickle(xgbmodel, DEPLOY_PATH,
                                '%s_xgbmodel.pkl' % model_label)
    save_data_to_json(var_transform_method, DEPLOY_PATH,
                      '%s_var_transform_method.json' % model_label)

    # Generate the model's test cases
    if test_nums < len(test_case_data):
        test_case_data_final = test_case_data[:test_nums]
    else:
        test_case_data_final = test_case_data.copy()
    if len(production_name_map) > 0:
        columns = []
        for i in test_case_data_final.columns:
            # Apply every rename to the column name, then append it once
            for j, k in production_name_map.items():
                i = i.replace(j, k)
            columns.append(i)
        test_case_data_final.columns = columns
    offline_model_score = pd.concat([
        model_result['p_train'].apply(
            mt.Performance().p_to_score).to_frame('offline_model_score'),
        model_result['p_test'].apply(
            mt.Performance().p_to_score).to_frame('offline_model_score')
    ])
    model_used_origin_features = [
        i for i in list(set(xgb_importance_score['中间层指标名称']))
        if str(i) != 'nan' and i is not None
    ]
    test_case_data_final_with_xgbScore = pd.merge(
        test_case_data_final[model_used_origin_features],
        offline_model_score,
        left_index=True,
        right_index=True)

    def get_xgb_testcase(test_data, modelName, productName):
        strs = '''curl -d 'varparams={"modelName":"%s","productName":"%s","applyId": 123, ''' % (
            modelName, productName)
        for i, j in test_data.items():
            strs = strs + '"%s"' % i + ':"%s"' % j + ', '
        final = strs[:-2] + '''}' -X POST http://localhost:8080/modelInvoke'''
        return final

    test_case_data_final_with_xgbScore[
        'test_case'] = test_case_data_final_with_xgbScore[
            test_case_data_final_with_xgbScore.columns.difference(
                ['offline_model_score'])].to_dict('records')
    test_case_data_final_with_xgbScore[
        'curl_script'] = test_case_data_final_with_xgbScore['test_case'].apply(
            get_xgb_testcase, args=(model_label, model_label))
    test_case_data_final_with_xgbScore[[
        'offline_model_score', 'curl_script'
    ]].to_csv(os.path.join(DEPLOY_PATH,
                           '%s_xgbmodel_testcase.csv' % model_label),
              index=None)
    # Generate test cases for the 秒算 model deployment platform
    generate_xgb_testcases(model_label, test_case_data_final, var_dict,
                           DEPLOY_PATH, model_result)
    logging.info("""Deployment step 6: generate the XGBoost deployment documents.
    The online deployment config files are stored under %s:
    1. %s
    2. %s
    3. %s
    4. %s
    """ % (
        DEPLOY_PATH,
        '%s_selected_rebin_spec.json' % model_label,
        '%s_bin_to_label.json' % model_label,
        '%s_var_transform_method.json' % model_label,
        '%s_xgbmodel.pkl' % model_label,
    ))
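# Standalone sketch of the production_name_map renaming pattern used above:
# dict keys in the binning spec and derived feature names are rewritten from
# the modeling-time variable name to the online one. Toy names, illustrative.
_production_name_map = {'ctcard_6m_cnt': 'ct_card_cnt_6m'}
_rebin_spec = {'ctcard_6m_cnt': {'cut_boundaries': [0, 1, 3, 10]}}
_features = ['ctcard_6m_cnt_bin', 'other_var']
for _old, _new in _production_name_map.items():
    if _old in _rebin_spec:
        _rebin_spec[_new] = _rebin_spec.pop(_old)
    _features = [f.replace(_old, _new) for f in _features]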
def generate_xgb_testcases(model_label, x_with_apply_id, var_dict, RESULT_PATH,
                           model_result=None, is_backscoring=False):
    '''
    :param model_label (str): model name as defined when the model was released
    :param x_with_apply_id (DataFrame): the X variables; must contain apply_id,
        and apply_id must be set as the index
    :param var_dict (DataFrame): variable dictionary
    :param model_result (dict): the model result from modeling (the modeling
        模型结果.pkl file). Must contain p_train. Make sure the index of
        model_result['p_train'] is apply_id; likewise for model_result['p_test'].
    :param is_backscoring (boolean): True for back-scoring, where the modeling
        sample's scores are not needed; False to generate test cases from the
        modeling sample, in which case model_result must also be passed.
    :return:
    '''

    def auto_xgb_testcase(test_data, modelName):
        '''
        :param test_data (dict):
        :param modelName (str):
        :return: JSON-like request string
        '''
        strs = '''{"modelName":"%s","productName":"%s","applyId": 123, ''' % (
            modelName, modelName)
        for i, j in test_data.items():
            strs = strs + '"%s"' % i + ':"%s"' % j + ', '
        final = strs[:-2] + '''}'''
        return final

    x_columns = list(
        set((var_dict['指标英文']).unique()).intersection(
            set(x_with_apply_id.columns.values)))
    X = x_with_apply_id[x_columns]
    X = mu.process_missing(X, var_dict, verbose=True)
    X = mu.convert_right_data_type(X, var_dict)[0]
    if is_backscoring:
        X['origin_features'] = X.to_dict('records')
        X['var_param'] = X.apply(
            lambda row: auto_xgb_testcase(row['origin_features'], model_label),
            axis=1)
        test_cases = X[['var_param']]
        # Back-scoring has no offline score to compare against; use a sentinel
        test_cases['offline_model_score'] = -8887
    else:
        model_sample_score = pd.concat([
            model_result['p_train'].apply(
                mt.Performance().p_to_score).to_frame('offline_model_score'),
            model_result['p_test'].apply(
                mt.Performance().p_to_score).to_frame('offline_model_score')
        ])
        model_sample_x_with_score = model_sample_score.merge(x_with_apply_id,
                                                             left_index=True,
                                                             right_index=True,
                                                             how='inner')
        model_sample_x_with_score[
            'origin_features'] = model_sample_x_with_score[
                model_sample_x_with_score.columns.difference(
                    ['offline_model_score'])].to_dict('records')
        model_sample_x_with_score[
            'var_param'] = model_sample_x_with_score.apply(
                lambda row: auto_xgb_testcase(row['origin_features'],
                                              model_label),
                axis=1)
        test_cases = model_sample_x_with_score[[
            'offline_model_score', 'var_param'
        ]]
    result_df = test_cases[['offline_model_score', 'var_param']]
    result_df.reset_index(inplace=True)
    result_df.columns = ['applyId', 'offline_model_score', 'var_param']
    writer = pd.ExcelWriter(
        os.path.join(RESULT_PATH, '%s_testcases.xlsx' % model_label))
    result_df.to_excel(writer, index=False)
    writer.save()
    return
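# Hedged sketch of the var_param payload format produced by auto_xgb_testcase
# above: a JSON-like string carrying modelName/productName/applyId followed by
# the raw feature values. The feature dict and model name are illustrative;
# the string format mirrors the code.
_features = {'age': 30, 'income': 5000}
_payload = '{"modelName":"demo_v1","productName":"demo_v1","applyId": 123, '
_payload += ', '.join('"%s":"%s"' % (k, v) for k, v in _features.items())
_payload += '}'
# -> {"modelName":"demo_v1","productName":"demo_v1","applyId": 123, "age":"30", "income":"5000"}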
def get_score_card(model_label, DATA_PATH, RESULT_PATH, woe_file_name,
                   rebin_spec_name, var_dict):
    """
    Compute and save the scorecard for the given model_label (keyed by label
    for extensibility across model versions).

    Args:
        model_label (str): model version name
        DATA_PATH (str): path containing the pickled model result and rebin spec
        RESULT_PATH (str): storage path
        woe_file_name (str): file name of the stored woe_iv_df, used to compute
            the scorecard
        rebin_spec_name (str): file name of the coarse-binning (or scorecard
            modeling) boundary spec
        var_dict (pd.DataFrame): standard variable dictionary

    Returns:
        saves the scorecard to: RESULT_PATH, '%s_score_card.xlsx' % model_label
    """
    rebin_spec = load_data_from_pickle(DATA_PATH, rebin_spec_name)
    model_result = load_data_from_pickle(DATA_PATH, '%s模型结果.pkl' % model_label)
    coefficients = model_result['model_final'].params.to_frame('beta')\
        .reset_index().rename(columns={'index': 'var_code'})
    woe_iv_df_coarse = pd.read_excel(os.path.join(RESULT_PATH, woe_file_name))
    woe_iv_df_coarse.loc[:, '指标英文'] = woe_iv_df_coarse.loc[:, '指标英文'].astype(str)
    cleaned_woe = woe_iv_df_coarse.loc[
        woe_iv_df_coarse['指标英文'].isin(coefficients.var_code),
        ['指标英文', '分箱', '分箱对应原始分类', 'N', '分布占比', 'WOE', '逾期率', 'Bad分布占比']]
    # The WOE table is computed from the actual data, so the data may be
    # missing some bins of some variables.
    should_haves_list = []
    for var_name in cleaned_woe['指标英文'].unique():
        var_type = woe_iv_df_coarse.loc[woe_iv_df_coarse['指标英文'] == var_name,
                                        '数据类型'].iloc[0]
        if var_type == 'varchar':
            if var_name in rebin_spec:
                complete_bins_df = pd.Series(list(
                    rebin_spec[var_name].keys())).to_frame('分箱')
                complete_bins_df.loc[:, '指标英文'] = var_name
            else:
                complete_bins_df = cleaned_woe.loc[
                    cleaned_woe['指标英文'] == var_name, ['指标英文', '分箱']].copy()
        else:
            a = pd.Series(0)
            # This yields the complete, ordered set of bins, independent of
            # whether each bin actually occurs in the data.
            complete_bins = [
                str(i) for i in pd.cut(
                    a, rebin_spec[var_name]['cut_boundaries']).cat.categories
            ]
            if var_type == 'integer':
                other_categories = [
                    int(i) for i in rebin_spec[var_name]['other_categories']
                ]
            if var_type == 'float':
                other_categories = [
                    float(i) for i in rebin_spec[var_name]['other_categories']
                ]
            complete_bins_list = sorted(other_categories) + list(complete_bins)
            complete_bins_df = pd.Series(complete_bins_list).to_frame('分箱')
            complete_bins_df.loc[:, '指标英文'] = var_name
        should_haves_list.append(complete_bins_df)
    should_haves_df = pd.concat(should_haves_list)
    should_haves_df = should_haves_df.astype(str)
    cleaned_woe[['指标英文', '分箱']] = cleaned_woe[['指标英文', '分箱']].astype(str)
    cleaned_woe = should_haves_df.merge(cleaned_woe, on=['指标英文', '分箱'],
                                        how='left')
    cleaned_woe['N'] = cleaned_woe['N'].fillna(0)
    cleaned_woe['分布占比'] = cleaned_woe['分布占比'].fillna(0)
    cleaned_woe['WOE'] = cleaned_woe['WOE'].fillna(0)
    score_card = mt.Performance().calculate_score(coefficients, cleaned_woe,
                                                  'var_score', var_dict)
    score_card.to_excel(os.path.join(RESULT_PATH,
                                     '%s_score_card.xlsx' % model_label),
                        index=False)
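# Standalone sketch of the complete-bins trick above: cutting a throwaway
# Series against the stored boundaries yields every interval, in order,
# whether or not the data contains it. Toy boundaries, illustrative.
import pandas as pd

_bounds = [0, 10, 20, 50]
_all_bins = [str(i) for i in pd.cut(pd.Series(0), _bounds).cat.categories]
# -> ['(0, 10]', '(10, 20]', '(20, 50]']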
# Load the Hyperopt-tuned XGBoost model result (head of this call was
# truncated; reconstructed assuming the load_data_from_pickle pattern used
# elsewhere in this project)
model_result_hyperopt = load_data_from_pickle(
    Data_path, '%s模型结果.pkl' % (model_label + 'XGBHyperopt'))
plot_obj.plot_for_the_model(Result_path, model_result_hyperopt, with_test=True)

# #### Inspect the split points of every tree in the XGBHyperopt (tuned) model
model_result_hyperopt['model_final'].dump_model(
    Result_path + '/model_result_hyperopt_trees.txt')

# #### Inspect each XGBHyperopt tree's leaf information and population
train_model_result_hyperopt_leafs = ss.get_xgboost_tree_leaf_dist_and_badRate(
    model_result_hyperopt, X_train[selected], y_train, y_col_name)
train_model_result_hyperopt_leafs.head(10)

""" 10.4 XGBHyperopt model decile """
train_XGBHyperopt_score = model_result_hyperopt['p_train'].apply(
    mt.Performance().p_to_score)
test_XGBHyperopt_score = model_result_hyperopt['p_test'].apply(
    mt.Performance().p_to_score)
# Training set
ks_decile_XGBHyperopt_train = mt.Performance().calculate_ks_by_decile(
    train_XGBHyperopt_score, np.array(y_train), 'decile', q=10)
ks_decile_XGBHyperopt_train
# Test set: reuse the training-set bin boundaries
point_bounds_XGBHyperopt = mt.BinWoe().obtain_boundaries(
    ks_decile_XGBHyperopt_train['分箱'])['cut_boundaries']
ks_decile_XGBHyperopt_test = mt.Performance().calculate_ks_by_decile(
    test_XGBHyperopt_score, np.array(y_test), 'decile', q=10,
    manual_cut_bounds=point_bounds_XGBHyperopt)
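# mt.Performance().p_to_score is internal to utils3; for intuition only, a
# common PDO-style probability-to-score mapping looks like the generic sketch
# below. base_score/base_odds/pdo are illustrative assumptions, not utils3's
# actual parameters.
from math import log

def _p_to_score(p, base_score=600, base_odds=1 / 50., pdo=20):
    """Map default probability p to a score: higher score = lower risk."""
    odds = p / (1 - p)                       # odds of being bad
    factor = pdo / log(2)                    # points to double the odds
    offset = base_score + factor * log(base_odds)
    return offset - factor * log(odds)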
def xgb_online_score_validity(data, input_vars, var_dict, var_transform_method,
                              model_booster, model_decile, onLineScoreName):
    '''
    Post-launch monitoring for an XGBoost model: checks for unfilled missing
    values and scoring errors, and computes the score PSI.

    Args:
        data (DataFrame): must contain apply_id, var_code and created_at
            columns. Follow the deployment-monitoring conventions strictly when
            writing the PSI-monitoring SQL and this will hold; if the unique
            key is not apply_id, rename that column to apply_id.
        input_vars (list): raw variables
        var_dict (DataFrame): data dictionary
        var_transform_method (json): the variable transformation file from
            deployment
        model_booster (booster): the booster file from deployment
        model_decile (DataFrame): the model decile from modeling
        onLineScoreName: column name of the online xgb model score

    Returns:
        data_input_vars_missing (DataFrame): all applies with unfilled missing values
        data_scored_applys_wrong (DataFrame): scoring error details
        model_decile_summary (DataFrame): model score PSI
    '''
    data = data.sort_values(['apply_id', 'var_code', 'created_at'], ascending=False)\
               .drop_duplicates(['apply_id', 'var_code'])
    # Pivot the data from long to wide
    data_pivot_all = data.pivot(index='apply_id', columns='var_code',
                                values='var_val')
    # Applies with missing variable values
    data_input_vars_missing = data_pivot_all[
        data_pivot_all[input_vars].isnull().sum(axis=1) > 0]
    data_scored_applys = data_pivot_all[
        data_pivot_all[input_vars].isnull().sum(axis=1) == 0]
    # Raw data before entering the model
    data_scored_applys_origin = data_scored_applys[input_vars]
    # Convert the data to the types declared in the dictionary
    data_scored_applys_origin = mu.convert_right_data_type(
        data_scored_applys_origin, var_dict)[0]
    try:
        # Dummy-encode the variables that require it
        data_dummys = pd.get_dummies(
            data_scored_applys_origin[list(var_transform_method['dummy_vars'].keys())],
            columns=list(var_transform_method['dummy_vars'].keys()),
            drop_first=False, prefix_sep='DUMMY')
        # Data entering the model
        data_in_model = pd.merge(data_scored_applys_origin, data_dummys,
                                 left_index=True, right_index=True)
    except KeyError:
        data_in_model = data_scored_applys_origin
    data_in_model.rename(columns=var_transform_method['dummy_var_name_map'],
                         inplace=True)
    # Convert to the xgb input format
    data_in_model_DMatrix = xgb.DMatrix(
        data_in_model[model_booster.__dict__['feature_names']])
    data_scored_applys['offLine_prob'] = model_booster.predict(data_in_model_DMatrix)
    data_scored_applys['offLineScore'] = data_scored_applys['offLine_prob']\
        .apply(lambda x: mt.Performance().p_to_score(x))
    # Applies whose online and offline scores disagree
    data_scored_applys_wrong = data_scored_applys[
        data_scored_applys[[onLineScoreName, 'offLineScore']]
        .apply(lambda x: float(x[onLineScoreName]) != float(x['offLineScore']),
               axis=1)]
    data_scored_applys_right = data_scored_applys[
        data_scored_applys[[onLineScoreName, 'offLineScore']]
        .apply(lambda x: float(x[onLineScoreName]) == float(x['offLineScore']),
               axis=1)]
    # PSI
    point_bounds_XGBRandom = mt.BinWoe().obtain_boundaries(
        model_decile['分箱'])['cut_boundaries']
    model_decile_curr = pd.cut(
        data_scored_applys_right[onLineScoreName].astype(float),
        point_bounds_XGBRandom).value_counts().sort_index()\
        .to_frame('curr_num').reset_index()
    model_decile_final = pd.concat([model_decile, model_decile_curr], axis=1)
    model_decile_final['建模分布'] = model_decile_final['样本数'] / model_decile_final['样本数'].sum()
    model_decile_final['curr_dist'] = model_decile_final['curr_num'] / model_decile_final['curr_num'].sum()
    # PSI term per bin: (p - q) * ln(p / q); requires `from math import log`
    model_decile_final['psi'] = model_decile_final.apply(
        lambda x: (x['建模分布'] - x['curr_dist']) * log(x['建模分布'] / x['curr_dist'])
        if x['curr_dist'] != 0 and x['建模分布'] != 0 else 0,
        axis=1)
    model_decile_summary = model_decile_final[['分箱', '样本数', '建模分布',
                                               'curr_num', 'curr_dist', 'psi']]
    return data_input_vars_missing, data_scored_applys_wrong, model_decile_summary
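# Standalone sketch of the long-to-wide pivot both monitoring functions rely
# on: one row per (apply_id, var_code) becomes one apply_id-indexed row with a
# column per var_code, after which unfilled values are easy to flag. Toy data.
import pandas as pd

_long = pd.DataFrame({
    'apply_id': [1, 1, 2, 2],
    'var_code': ['age', 'income', 'age', 'income'],
    'var_val':  ['30', '5000', '41', None],
})
_wide = _long.pivot(index='apply_id', columns='var_code', values='var_val')
_missing = _wide[_wide.isnull().sum(axis=1) > 0]   # applies with unfilled values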
def verify_var_and_score_validity(data_origin, score_card_deploy,
                                  rebin_spec_bin_adjusted_deploy,
                                  model_decile_deploy):
    '''
    Post-launch monitoring for a scorecard model: checks for unfilled missing
    values and scoring errors, and computes variable and score PSI.

    Args:
        data_origin (DataFrame): must contain apply_id, var_code and created_at
            columns. Follow the deployment-monitoring conventions strictly when
            writing the PSI-monitoring SQL and this will hold; if the unique
            key is not apply_id, rename that column to apply_id.
        score_card_deploy (DataFrame): note this is the deployment-time scorecard
        rebin_spec_bin_adjusted_deploy (dict): the deployment-time variable
            binning .json file
        model_decile_deploy (DataFrame): the deployment-time model decile,
            found in the deployed scorecard

    Returns:
        wrong_data (DataFrame): all applies with unfilled missing values
        wrong_score_vars (dict): variables and their scoring error details
        wrong_model_score (DataFrame): model score error details
        var_dist_data_with_score_card (DataFrame): variable PSI
        score_dist_with_decile (DataFrame): model score PSI
    '''
    # Rename the deployed scorecard columns to the modeling-time names
    score_card_deploy.rename(columns={'打分': '变量打分', '中间层指标名称': '指标英文'},
                             inplace=True)
    # Variables used in the model and their corresponding score field names
    model_selected = [i for i in set(list(score_card_deploy['指标英文']))
                      if str(i) != 'nan']
    mlr_selected_vars = [i for i in set(list(score_card_deploy['输出打分指标名称']))
                         if str(i) != 'nan']
    # Build the data dictionary
    var_dict = score_card_deploy.drop_duplicates('指标英文')
    var_dict = var_dict[var_dict['指标英文'].isin(model_selected)]
    # Build the modeling-time scorecard used for scoring
    # (.loc replaces the long-removed DataFrame.ix accessor)
    score_card_modeling = score_card_deploy[score_card_deploy['指标英文'].isin(model_selected)]
    score_card_modeling.loc[len(score_card_modeling) + 1, '指标英文'] = 'intercept'
    const_var = [i for i in set(list(score_card_deploy['输出打分指标名称']))
                 if 'const' in i][0]
    const_value = float(score_card_deploy.loc[
        score_card_deploy['输出打分指标名称'] == const_var, '变量打分'])
    score_card_modeling.loc[score_card_modeling['指标英文'] == 'intercept',
                            '变量打分'] = const_value
    # Build the modeling-time rebin_spec
    rebin_spec_bin_adjusted_modeling = rebin_spec_bin_adjusted_deploy['numerical']
    rebin_spec_bin_adjusted_modeling.update(rebin_spec_bin_adjusted_deploy['categorical'])
    # Map each model variable name to its score field name
    modelVar_scoreVar_dict = {}
    for j, i in score_card_deploy[['指标英文', '输出打分指标名称']]\
            .drop_duplicates(['指标英文']).iterrows():
        if str(i['指标英文']) != 'nan':
            modelVar_scoreVar_dict[i['指标英文']] = i['输出打分指标名称']
    # Normalize the raw data
    try:
        data_origin = data_origin.sort_values(['apply_id', 'var_code', 'created_at'], ascending=False)\
                                 .drop_duplicates(['apply_id', 'var_code', 'created_at'])
    except KeyError:
        try:
            data_origin = data_origin.sort_values(['apply_id', 'var_code'], ascending=False)\
                                     .drop_duplicates(['apply_id', 'var_code'])
        except KeyError:
            pass
    data_origin_pivot = data_origin.pivot(index='apply_id', columns='var_code',
                                          values='var_val')
    # Applies with missing variables
    wrong_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1) > 0]
    wrong_data.reset_index(inplace=True)
    # Applies with complete data, ready for scoring
    right_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1) == 0]
    right_data_converted = convert_right_data_type(right_data[model_selected],
                                                   var_dict)[0]
    # Convert the data to bins
    bin_obj = mt.BinWoe()
    data_origin_pivot_cat = bin_obj.convert_to_category(
        right_data_converted[model_selected], var_dict,
        rebin_spec_bin_adjusted_modeling)
    data_origin_pivot_score_all = mt.Performance().calculate_score_by_scrd(
        data_origin_pivot_cat[model_selected], score_card_modeling)
    data_origin_pivot_score_vars = data_origin_pivot_score_all[0]
    data_origin_pivot_score_model = data_origin_pivot_score_all[1]
    data_origin_pivot_cat.columns = [i + '_cat' for i in data_origin_pivot_cat.columns]
    data_origin_pivot_score_vars.columns = [
        i + '_varScoreBymodel' for i in data_origin_pivot_score_vars.columns]
    data_origin_pivot_all = pd.merge(
        pd.merge(data_origin_pivot, data_origin_pivot_cat,
                 left_index=True, right_index=True),
        data_origin_pivot_score_vars, left_index=True, right_index=True)
    # Collect the applies whose per-variable scores disagree
    wrong_score_vars = {}
    for i in model_selected:
        wrong_dt = data_origin_pivot_all[data_origin_pivot_all.apply(
            lambda x: float(x[modelVar_scoreVar_dict[i]]) != float(x[i + '_varScoreBymodel']),
            axis=1)][[i, i + '_cat', i + '_varScoreBymodel', modelVar_scoreVar_dict[i]]]
        wrong_score_vars[i] = wrong_dt.reset_index()
    # Combine online and offline model scores
    mlr_score_vars = [i for i in mlr_selected_vars if 'creditscore' in i][0]
    model_on_off_score = pd.merge(
        data_origin_pivot_score_model.to_frame('back_score'),
        data_origin_pivot_all[mlr_score_vars].to_frame('mlr_score'),
        left_index=True, right_index=True)
    wrong_model_score = model_on_off_score[model_on_off_score.apply(
        lambda x: float(x['back_score']) != float(x['mlr_score']), axis=1)]
    wrong_model_score.reset_index(inplace=True)
    # Variable PSI
    var_dist = []
    for i in model_selected:
        tmp = (data_origin_pivot_cat[i + '_cat'].value_counts())\
            .to_frame('allNum').reset_index()
        tmp['dist'] = tmp['allNum'] / len(data_origin_pivot_cat)
        tmp['指标英文'] = i
        tmp.columns = ['分箱', 'allNum', 'dist', '指标英文']
        var_dist.append(tmp)
    var_dist_data = pd.concat(var_dist)[['指标英文', '分箱', 'allNum', 'dist']]
    var_dist_data['分箱'] = var_dist_data['分箱'].astype(str)
    var_dist_data_with_score_card = pd.merge(score_card_modeling, var_dist_data,
                                             on=['指标英文', '分箱'])
    var_dist_data_with_score_card['PSI'] = var_dist_data_with_score_card.apply(
        lambda x: (x['dist'] - x['分布占比']) * log(x['dist'] / x['分布占比'])
        if x['dist'] != 0 and x['分布占比'] != 0 else 0,
        axis=1)
    var_dist_data_with_score_card = var_dist_data_with_score_card[
        ['指标分类', '指标英文', '变量中文', '数据类型', '分箱', 'N', '分布占比',
         'allNum', 'dist', 'PSI']]
    # Score PSI: get the score cut boundaries
    point_bounds = mt.BinWoe().obtain_boundaries(
        model_decile_deploy['分箱'])['cut_boundaries']
    mlr_score_dist = (pd.cut(data_origin_pivot_all[mlr_score_vars].astype(float),
                             point_bounds)
                      .value_counts().to_frame('allNum')).reset_index()
    mlr_score_dist['dist'] = mlr_score_dist['allNum'] / len(data_origin_pivot_all)
    mlr_score_dist['index'] = mlr_score_dist['index'].astype(str)
    model_decile_deploy['样本分布'] = model_decile_deploy['样本数'] / sum(model_decile_deploy['样本数'])
    score_dist_with_decile = pd.merge(model_decile_deploy, mlr_score_dist,
                                      left_on='分箱', right_on='index')[
        ['分箱', '样本数', '样本分布', 'allNum', 'dist']]
    score_dist_with_decile['PSI'] = score_dist_with_decile.apply(
        lambda x: (x['dist'] - x['样本分布']) * log(x['dist'] / x['样本分布'])
        if x['dist'] != 0 and x['样本分布'] != 0 else 0,
        axis=1)
    return wrong_data, wrong_score_vars, wrong_model_score, \
        var_dist_data_with_score_card, score_dist_with_decile
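# Standalone sketch of the online-vs-offline comparison pattern used above:
# cast both sides to float before comparing, since online scores typically
# arrive as strings. Toy frame, illustrative values.
import pandas as pd

_scores = pd.DataFrame({'mlr_score': ['650', '701'], 'back_score': [650.0, 700.0]})
_wrong = _scores[_scores.apply(
    lambda x: float(x['mlr_score']) != float(x['back_score']), axis=1)]
# -> only the second row, where the online and back-scored values differ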