def output_score_psi(file_in, model_vars, score_right_list=None):
    """Write the train vs. cross-time comparison workbook for the model features.

    The cross-time sample is read from ``file_in``, the pickled WOE/bin
    dictionary from the d02 output is applied to it, and the training frame
    plus the cross-time frame are handed to ``func_tt_vardescrible`` together
    with an Excel writer for ``<d04 path>_cross_result_vars_psi.xlsx``.
    Afterwards every ``*_var_error_value.csv`` file found in the root output
    directory is de-duplicated and rewritten in place.

    Args:
        file_in: Path of the cross-time data file (read via ``tool.read_file``).
        model_vars: Names of the model features. The list is not modified.
        score_right_list: Not used by this function.

    Returns:
        None.
    """
    df = tool.read_file(file_in)
    param_dic = d00_param.param_dic
    project_name = param_dic['project_name']
    path_ = tool.prepare_path(project_name, 'd04_')
    tag = param_dic['tag']
    key = param_dic['key']

    # Build a new list so the caller's ``model_vars`` is not extended in place.
    fin_model_vars = list(model_vars) + [key, tag]

    Xpath_ = tool.prepare_path(project_name, 'd02_')
    # NOTE: pickle.load executes arbitrary code; only load project-generated files.
    with open(Xpath_ + '_vars_woe_bin_dic.pkl', "rb") as f:
        dic_woe = pickle.load(f)

    print(10 * '*' + 'start to apply woe' + 10 * '*')
    # ``str_vars`` is not defined in this function; it is resolved from the
    # enclosing module at call time.
    m_data_cross = apply_woe(df[fin_model_vars],
                             dic_woe,
                             str_vars,
                             key,
                             tag,
                             path=path_ + '_cross')
    print(10 * '*' + 'finish apply woe' + 10 * '*')

    m_data_train = tool.read_file('./data/' + project_name +
                                  '_m_data_train_bin.pkl')

    # key and tag are part of the scan, but only names that have a matching
    # '<name>_woe' column in the cross-time frame are kept.
    nd_vars_woe = [
        name + "_woe" for name in fin_model_vars
        if name + "_woe" in m_data_cross.columns
    ]
    for col in nd_vars_woe:
        m_data_train[col] = m_data_train[col].astype(float)
        m_data_cross[col] = m_data_cross[col].astype(float)
    # 'xxx_woe' -> 'xxx_bin'
    nd_vars_bin = [col[:-4] + '_bin' for col in nd_vars_woe]

    writer = pd.ExcelWriter(path_ + '_cross_result_vars_psi.xlsx')
    func_tt_vardescrible(m_data_train, m_data_cross, writer, nd_vars_bin, tag)

    # De-duplicate the unmatched-value reports found in the root directory.
    root_path = path_.replace('d04_' + project_name, '')
    error_file = [
        name for name in os.listdir(root_path)
        if name.find('_var_error_value.csv') >= 0
    ]
    for ii in error_file:
        err_df = tool.read_file(root_path + ii, header=-1)
        err_df.columns = ['var_name', 'value', 'flag']
        err_df.drop_duplicates(inplace=True)
        err_df.to_csv(root_path + ii, index=False)
        print(10 * '*' + '存在特征未匹配到分箱请查看文件%s' % root_path + ii + 10 * '*')

    writer.save()
if __name__ == "__main__":
    # Entry point of the d01 step: load the raw data set, write the data
    # resource description, build the summary and run ``d01_main``.
    #
    # The names assigned below are module-level globals. Helpers defined
    # elsewhere in this file may read some of them, so none are removed even
    # when they look unused here -- TODO confirm which ones are really needed.
    with tool.Timer() as t:
        param_dic = d00_param.param_dic
        import os

        # Create the data directory if it is missing. ``exist_ok`` avoids the
        # check-then-create race of a separate ``os.path.exists`` test.
        os.makedirs('./data', exist_ok=True)

        # Input data is expected in the ./data folder of the run directory.
        file_in = './data/' + param_dic['data_file']
        percent_list = param_dic['percentiles']
        project_name = param_dic['project_name']
        tag = param_dic['tag']
        key = param_dic['key']
        bin_image = param_dic['bin_image']
        test_size = param_dic['test_size']
        ex_cols = param_dic['ex_cols']
        auto_bin_num = param_dic['auto_bin_num']

        data = tool.read_file(file_in)
        path_ = tool.prepare_path(project_name, 'd01_')
        # Drop the rows whose tag value equals 2.
        data = data[data[tag] != 2]
        write_data_resource(data, auto_bin_num)

        key_num = data[key].count()  # total number of samples
        # Every column except the key column (set difference: order not kept).
        data_col = list(set(data.columns.tolist()) - {key})
        newdata = data[data_col]
        fin_data = get_summary_data(newdata)
        d01_main(project_name, data, tag, test_size)
    # NOTE(review): printed after the ``with`` block on the assumption that
    # Timer fills ``secs`` when the context exits -- confirm against tool.Timer.
    print(t.secs)
'mean': u'平均值' } param_dic = d00_param.param_dic project_name = param_dic['project_name'] tag = param_dic['tag'] key = param_dic['key'] cut_bin_num = param_dic['auto_bin_num'] datafile_input = param_dic['file_input'] derivate_var_list = param_dic['derivate_var_list'] derivate_agg_list = param_dic['derivate_agg_list'] derivate_var_n = param_dic['derivate_var_n'] output_data_file = param_dic['output_data_file'] output_datasource_file = param_dic['output_datasource_file'] data_var_type = pd.read_excel('./data/data_source.xlsx') # 输入数据集 data = tool.read_file('./data/' + datafile_input) chn_dict = data_var_type.set_index('var_name').T.to_dict('list') # 输入衍生特征list及需衍生计算指标 # derivate_var_list = ['age', # 'usermobileonlinedd', # 'certi_city_level', # 'phone_city_level', ] # derivate_agg_list = ['sum', 'min', 'max', 'mean', 'minus', 'rate', 'std'] # # 举例两两组合衍生即n=2 # derivate_var_n = 2 # # 输出衍生变量数据集路径 # output_data_file = './data/data_derivated.csv' # # 输出衍生变量中文名路径 # output_datasource_file = './data/data_source_derivated.csv' all_list = combine(derivate_var_list, derivate_var_n) new_data_var_type = []
def output_score(file_in, model_vars, score_right_list=None):
    """Score the cross-time sample with the saved model and write the report.

    Steps, in order:

    1. Read the cross-time sample from ``file_in`` and apply the pickled
       WOE/bin dictionary from the d02 output.
    2. Load ``<d04 path>_fin.model`` and call ``cal_ks_test`` on the
       cross-time frame, using the ``odds`` value stored in the
       ``评分卡刻度及公式`` sheet of ``<d04 path>_result.xlsx``.
    3. Bin the cross-time scores with ``get_bin`` and compare their
       distribution with ``<d04 path>_modeling_score.csv`` (PSI per bin and
       in total).
    4. Write ``<d04 path>_cross_result.xlsx``: summary sheet, per-bin
       comparison sheet, score detail sheet, plus whatever
       ``func_tt_vardescrible`` adds.
    5. De-duplicate every ``*_var_error_value.csv`` file in the root output
       directory.

    Args:
        file_in: Path of the cross-time data file (read via ``tool.read_file``).
        model_vars: Names of the model features. The list is not modified.
        score_right_list: Passed as second argument to ``get_bin`` for every
            score.

    Returns:
        None.
    """
    df = tool.read_file(file_in)
    param_dic = d00_param.param_dic
    project_name = param_dic['project_name']
    path_ = tool.prepare_path(project_name, 'd04_')
    tag = param_dic['tag']
    key = param_dic['key']

    # Build a new list so the caller's ``model_vars`` is not extended in place.
    fin_model_vars = list(model_vars) + [key, tag]

    Xpath_ = tool.prepare_path(project_name, 'd02_')
    # NOTE: pickle.load executes arbitrary code; only load project-generated files.
    with open(Xpath_ + '_vars_woe_bin_dic.pkl', "rb") as f:
        dic_woe = pickle.load(f)

    scale_df = pd.read_excel(path_ + '_result.xlsx', sheet_name=u'评分卡刻度及公式')
    odds = float(scale_df['odds'][0])

    print(10 * '*' + 'start to apply woe' + 10 * '*')
    # ``str_vars``, ``base_score`` and ``double_score`` are not defined in
    # this function; they are resolved from the enclosing module at call time.
    m_data_cross = apply_woe(df[fin_model_vars],
                             dic_woe,
                             str_vars,
                             key,
                             tag,
                             path=path_ + '_cross')
    print(10 * '*' + 'finish apply woe' + 10 * '*')

    m_data_train = tool.read_file('./data/' + project_name +
                                  '_m_data_train_bin.pkl')

    # key and tag are part of the scan, but only names that have a matching
    # '<name>_woe' column in the cross-time frame are kept.
    nd_vars_woe = [
        name + "_woe" for name in fin_model_vars
        if name + "_woe" in m_data_cross.columns
    ]
    for col in nd_vars_woe:
        m_data_train[col] = m_data_train[col].astype(float)
        m_data_cross[col] = m_data_cross[col].astype(float)
    # 'xxx_woe' -> 'xxx_bin'
    nd_vars_bin = [col[:-4] + '_bin' for col in nd_vars_woe]

    # Load the fitted model; only the KS summary and the scored frame of the
    # six values returned by cal_ks_test are used below.
    result = joblib.load(path_ + '_fin.model')
    _, _, _, ks_df, _, df_rs_score = cal_ks_test(result,
                                                 m_data_cross,
                                                 tag,
                                                 base_score=base_score,
                                                 double_score=double_score,
                                                 odds=odds)

    writer = pd.ExcelWriter(path_ + '_cross_result.xlsx')
    title_Style = writer.book.add_format(tool.title_Style)
    cell_Style = writer.book.add_format(tool.cell_Style)

    # Cross-time summary metrics sheet.
    ws_ks_df = writer.book.add_worksheet('跨时间集汇总指标')

    # Copy before adding a column so the assignment never targets a view.
    df_rs_score = df_rs_score[[key, tag, 'score']].copy()
    df_rs_score['score_bin'] = df_rs_score['score'].apply(
        lambda x: get_bin(x, score_right_list))

    df_score = pd.read_csv(path_ + '_modeling_score.csv')
    model_ks, model_df = gb_add_woe(df_score, tag, 'artificial')
    cross_ks, cross_df = gb_add_woe(df_rs_score, tag, 'artificial')
    dist_cols = ['bin', 'good', 'bad', 'bad_rate', 'bin_pct']
    model_df = model_df[dist_cols]
    cross_df = cross_df[dist_cols]
    # Index both frames by bin so the concat below aligns rows on the bin.
    model_df.index = model_df['bin']
    cross_df.index = cross_df['bin']
    df_score_res = pd.concat([model_df, cross_df],
                             axis=1,
                             keys=['TRAIN', 'CROSS'],
                             sort=False)

    # Per-bin PSI term: (p_train - p_cross) * ln(p_train / p_cross).
    train_pct = df_score_res[('TRAIN', 'bin_pct')]
    cross_pct = df_score_res[('CROSS', 'bin_pct')]
    df_score_res['PSI'] = (train_pct - cross_pct) * np.log(
        train_pct / cross_pct)
    # Bins missing on one side yield NaN and are left out of the total.
    psi = sum([ii for ii in df_score_res['PSI'] if not pd.isnull(ii)])
    psi_df = pd.DataFrame({'describe': [psi]})
    psi_df.index = ['PSI']

    all_result = pd.concat([ks_df, psi_df])
    all_result = all_result.reset_index()
    all_result.columns = ['跨时间集', 'describe']
    print(all_result)
    df_score_res = df_score_res.reset_index(drop=True)

    row_num = all_result.shape[0]
    summary_df = all_result.fillna('')
    ws_ks_df.write_row(0, 0, summary_df.columns, title_Style)
    for ii in range(row_num):
        ws_ks_df.write_row(1 + ii, 0, summary_df.loc[ii, ], cell_Style)
    ws_ks_df.set_column(0, summary_df.shape[1], 20)

    # Score-band distribution: train vs. cross-time.
    df_score_res.to_excel(writer, sheet_name='各分数段训练_跨时间集分布对比', startcol=0)
    df_rs_score.to_excel(writer, sheet_name='跨时间集打分明细', index=False)

    # PSI summary and bin-level comparison of the model features.
    func_tt_vardescrible(m_data_train, m_data_cross, writer, nd_vars_bin, tag)

    # Values of the cross-time sample that matched no bin: de-duplicate the
    # report files (nothing happens when no such file exists).
    root_path = path_.replace('d04_' + project_name, '')
    error_file = [
        name for name in os.listdir(root_path)
        if name.find('_var_error_value.csv') >= 0
    ]
    for ii in error_file:
        err_df = tool.read_file(root_path + ii, header=-1)
        err_df.columns = ['var_name', 'value', 'flag']
        err_df.drop_duplicates(inplace=True)
        err_df.to_csv(root_path + ii, index=False)
        print(10 * '*' + '存在特征未匹配到分箱请查看文件%s' % root_path + ii + 10 * '*')

    writer.save()
def model(model_ex_corr=True):
    """Fit the stepwise model on the WOE training data and write the report.

    The training/test WOE frames come from one of two sources:

    * When the ``特征IV汇总`` sheet of ``<d02 path>_info.xlsx`` has the column
      ``是否因超过共线性阀值剔除``, the pickled ``_m_data_train.pkl`` /
      ``_m_data_test.pkl`` frames are read. With ``model_ex_corr`` true only
      features flagged ``0`` in that column are used, otherwise all features
      listed in the sheet.
    * When the column is missing and ``model_ex_corr`` is true, WOE is applied
      to ``_df_train.pkl`` / ``_df_test.pkl`` here, features are selected with
      ``filter_by_corr`` and the resulting frames are pickled.

    The selected features are ordered with ``func_sort_col`` and fitted with
    ``func_stepwise_1``. The model is dumped to ``<d04 path>_fin.model``,
    ``func_report`` is called, the ``*_var_error_value.csv`` files are
    de-duplicated, and the binned scores are written to
    ``<d04 path>_modeling_score.csv``.

    Args:
        model_ex_corr: Whether features are filtered for collinearity (see
            above).

    Raises:
        ValueError: If the collinearity column is missing and
            ``model_ex_corr`` is false, because no data source applies.

    Returns:
        None.
    """
    # NOTE(review): the source of this function was collapsed onto single
    # lines; the branch layout below is reconstructed -- confirm that the
    # pickling really belongs to the ``elif model_ex_corr`` branch only.
    start_time = time.time()
    param_dic = d00_param.param_dic
    project_name = param_dic['project_name']
    path_ = tool.prepare_path(project_name, 'd04_')
    tag = param_dic['tag']
    key = param_dic['key']
    corr_limit = param_dic['corr_limit']
    Xpath_ = tool.prepare_path(project_name, 'd02_')

    info = pd.read_excel(Xpath_ + '_info.xlsx', sheet_name='特征IV汇总')
    new_info = info[[u'特征简称', u'特征中文名', 'IV', u'覆盖率']]
    new_info_dict = new_info.set_index(u'特征简称').to_dict('index')

    # At most 3000 features with the highest IV, followed by key and tag.
    fin_vars = info.sort_values('IV',
                                ascending=False)[:3000][u'特征简称'].tolist()
    fin_vars.extend([key, tag])

    has_corr_flag = u'是否因超过共线性阀值剔除' in list(info.columns)
    if has_corr_flag:
        m_data_train = tool.read_file('./data/' + project_name +
                                      '_m_data_train.pkl')
        m_data_test = tool.read_file('./data/' + project_name +
                                     '_m_data_test.pkl')
        if model_ex_corr:
            nd_vars_woe = (info[info['是否因超过共线性阀值剔除'] == 0]['特征简称'] +
                           '_woe').tolist()
        else:
            nd_vars_woe = (info['特征简称'] + '_woe').tolist()
    elif model_ex_corr:
        # NOTE: pickle.load executes arbitrary code; only load
        # project-generated files.
        with open(Xpath_ + '_vars_woe_bin_dic.pkl', "rb") as f:
            dic_woe = pickle.load(f)
        df_train = tool.read_file('./data/' + project_name + '_df_train.pkl')
        df_test = tool.read_file('./data/' + project_name + '_df_test.pkl')

        print(10 * '*' + 'start apply woe.' + 10 * '*')
        # ``str_vars`` is not defined in this function; it is resolved from
        # the enclosing module at call time.
        m_data_train = apply_woe(df_train[fin_vars],
                                 dic_woe,
                                 str_vars,
                                 key,
                                 tag,
                                 path=path_)
        m_data_test = apply_woe(df_test[fin_vars],
                                dic_woe,
                                str_vars,
                                key,
                                tag,
                                path=path_)
        print(10 * '*' + 'finish apply woe.' + 10 * '*')

        nd_vars_woe = filter_by_corr(m_data_train,
                                     fin_vars,
                                     nec_vars=[],
                                     corr_limit=corr_limit)
        for col in nd_vars_woe:
            m_data_train[col] = m_data_train[col].astype(float)
            m_data_test[col] = m_data_test[col].astype(float)

        # Keep one copy that still has the '*_bin' columns, then strip them
        # from the frames used for modelling. The test frame is reduced to
        # the same columns as the stripped training frame.
        m_data_train.to_pickle('./data/' + project_name +
                               '_m_data_train_bin.pkl')
        keep_cols = [c for c in m_data_train if c[-4:] != '_bin']
        m_data_train = m_data_train[keep_cols]
        m_data_test = m_data_test[keep_cols]
        m_data_train.to_pickle('./data/' + project_name + '_m_data_train.pkl')
        m_data_test.to_pickle('./data/' + project_name + '_m_data_test.pkl')
        print(10 * '*' + 'finish vars check corr')
    else:
        # No branch can provide data here; fail with a clear message instead
        # of an UnboundLocalError further down.
        raise ValueError(
            "model(): '_info.xlsx' has no collinearity flag column and "
            "model_ex_corr is False, so no training data can be prepared")

    sort_vars = func_sort_col(nd_vars_woe, m_data_train, tag)[1]
    # ``base_score``, ``double_score`` and ``score_right_list`` are not
    # defined in this function; they are resolved from the enclosing module.
    train_cols, result, vars_dic, ks_df, error_var, odds, A, B = func_stepwise_1(
        sort_vars,
        m_data_train,
        tag,
        pmml_btn=False,
        base_score=base_score,
        double_score=double_score)

    # Save the fitted model.
    joblib.dump(result, path_ + '_fin.model')

    df_score = func_report(m_data_train,
                           result,
                           tag,
                           base_score=base_score,
                           double_score=double_score,
                           save_path=path_,
                           test=m_data_test,
                           info=None,
                           odds=odds,
                           score_detail_btn=False,
                           data_dic=new_info_dict)

    # De-duplicate the unmatched-value reports found in the root directory.
    root_path = path_.replace('d04_' + project_name, '')
    error_file = [
        name for name in os.listdir(root_path)
        if name.find('_var_error_value.csv') >= 0
    ]
    for ii in error_file:
        err_df = tool.read_file(root_path + ii, header=-1)
        err_df.columns = ['var_name', 'value', 'flag']
        err_df.drop_duplicates(inplace=True)
        err_df.to_csv(root_path + ii, index=False)
        print(10 * '*' + '存在特征未匹配到分箱请查看文件%s' % root_path + ii + 10 * '*')

    # Copy before adding a column so the assignment never targets a view.
    df_score = df_score[[key, tag, 'score']].copy()
    df_score['score_bin'] = df_score['score'].apply(
        lambda x: get_bin(x, score_right_list))
    df_score.to_csv(path_ + '_modeling_score.csv', index=False)
    print(10 * '*' + 'finish logstic modeling')
    print('spend times:', time.time() - start_time)
def model_tune(tune_del_vars=None,
               tune_nec_vars=None,
               is_all_necess=False,
               score_right_list=None):
    """Refit the stepwise model with manually removed and/or forced features.

    Reads the pickled ``_m_data_train.pkl`` / ``_m_data_test.pkl`` frames,
    chooses the candidate WOE columns, orders them with ``func_sort_col``,
    fits with ``func_stepwise_1`` (passing ``nec_vars=tune_nec_vars``), dumps
    the model to ``<d04 path>_fin.model``, calls ``func_report`` and writes
    the binned scores to ``<d04 path>_modeling_score.csv``.

    Args:
        tune_del_vars: Feature names to leave out of the candidates.
            Defaults to an empty list.
        tune_nec_vars: Feature names passed on as ``nec_vars``. Defaults to
            an empty list.
        is_all_necess: When true, the candidates are exactly the
            ``tune_nec_vars`` that have a ``'<name>_woe'`` column in the
            training frame; ``tune_del_vars`` is ignored.
        score_right_list: Passed as second argument to ``get_bin`` for every
            score. Defaults to an empty list.

    Returns:
        None.
    """
    # NOTE(review): the source of this function was collapsed onto single
    # lines; the nesting of the branches below is reconstructed -- confirm.
    tune_del_vars = [] if tune_del_vars is None else tune_del_vars
    tune_nec_vars = [] if tune_nec_vars is None else tune_nec_vars
    score_right_list = [] if score_right_list is None else score_right_list

    start_time = time.time()
    param_dic = d00_param.param_dic
    project_name = param_dic['project_name']
    path_ = tool.prepare_path(project_name, 'd04_')
    tag = param_dic['tag']
    key = param_dic['key']
    corr_limit = param_dic['corr_limit']
    Xpath_ = tool.prepare_path(project_name, 'd02_')

    info = pd.read_excel(Xpath_ + '_info.xlsx', sheet_name='特征IV汇总')
    new_info = info[[u'特征简称', u'特征中文名', 'IV', u'覆盖率']]
    new_info_dict = new_info.set_index(u'特征简称').to_dict('index')
    # At most 3000 features with the highest IV.
    ranked_vars = info.sort_values(
        'IV', ascending=False)[:3000][u'特征简称'].tolist()

    m_data_train = tool.read_file('./data/' + project_name +
                                  '_m_data_train.pkl')
    m_data_test = tool.read_file('./data/' + project_name +
                                 '_m_data_test.pkl')

    if is_all_necess:
        fin_vars = tune_nec_vars
        nd_vars_woe = [
            name + "_woe" for name in fin_vars
            if name + "_woe" in m_data_train.columns
        ]
    else:
        # A real list, not a lazy ``filter`` object: on Python 3 the latter
        # can be consumed only once and has no len(), which is unsafe to hand
        # to filter_by_corr.
        fin_vars = [name for name in ranked_vars if name not in tune_del_vars]
        # ``model_ex_corr`` is not a parameter of this function; it is
        # resolved from the enclosing module at call time.
        if model_ex_corr:
            nd_vars_woe = filter_by_corr(m_data_train,
                                         fin_vars,
                                         nec_vars=tune_nec_vars,
                                         corr_limit=corr_limit)
        else:
            nd_vars_woe = [
                name + "_woe" for name in fin_vars
                if name + "_woe" in m_data_train.columns
            ]

    for col in nd_vars_woe:
        m_data_train[col] = m_data_train[col].astype(float)
        m_data_test[col] = m_data_test[col].astype(float)
    print(10 * '*' + 'finish vars check corr')

    sort_vars = func_sort_col(nd_vars_woe, m_data_train, tag)[1]
    # ``base_score`` and ``double_score`` are not defined in this function;
    # they are resolved from the enclosing module at call time.
    train_cols, result, vars_dic, ks_df, error_var, odds, A, B = func_stepwise_1(
        sort_vars,
        m_data_train,
        tag,
        pmml_btn=False,
        base_score=base_score,
        double_score=double_score,
        nec_vars=tune_nec_vars)

    # Save the fitted model.
    joblib.dump(result, path_ + '_fin.model')

    df_score = func_report(m_data_train,
                           result,
                           tag,
                           base_score=base_score,
                           double_score=double_score,
                           save_path=path_,
                           test=m_data_test,
                           info=None,
                           odds=odds,
                           score_detail_btn=False,
                           data_dic=new_info_dict)

    # Copy before adding a column so the assignment never targets a view.
    df_score = df_score[[key, tag, 'score']].copy()
    df_score['score_bin'] = df_score['score'].apply(
        lambda x: get_bin(x, score_right_list))
    df_score.to_csv(path_ + '_modeling_score.csv', index=False)
    print(10 * '*' + 'odds:', odds)
    print(10 * '*' + 'finish logstic modeling')
    print('spend times:', time.time() - start_time)
if __name__ == "__main__": with tool.Timer() as t: param_dic = d00_param.param_dic project_name = param_dic['project_name'] path_ = tool.prepare_path(project_name, 'd02_') ex_cols = param_dic['ex_cols'] tag = param_dic['tag'] key = param_dic['key'] test_size = param_dic['test_size'] bin_ex_corr = param_dic['bin_ex_corr'] mono = param_dic['mono'] auto_bin_num = param_dic['auto_bin_num'] nece_var = param_dic['nece_var'] output_auto_cut_bin = param_dic['output_auto_cut_bin'] output_uncut_bin = param_dic['output_uncut_bin'] df_train = tool.read_file('./data/' + project_name + '_df_train.pkl') df_test = tool.read_file('./data/' + project_name + '_df_test.pkl') if output_auto_cut_bin: cut_bin_choose(df_train, df_test, ex_cols, path_, tag, mono=mono, test_size=test_size, bin_ex_corr=bin_ex_corr) if output_uncut_bin: cut_bin_choose_uncut(df_train, df_test, ex_cols, path_,