def process_woe_trans(in_data_path=None,
                      rst_path=None,
                      out_path=None,
                      config_path=None):
    '''
    Apply previously fitted WOE rules to a dataset and export it as CSV.

    Parameters
    ----------
    in_data_path : Path of the dataset handed to ``cfg.load_file``.
    rst_path : Path of the pickle file holding the fitted rule objects.
        Each object must expose a ``var_name`` attribute.
    out_path : Destination passed to ``DataFrame.to_csv``.
    config_path : Path of the config file handed to ``cfg.load_file``.

    Return
    ------
    None. The transformed frame is written to ``out_path``.
    '''
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    # Imputation never adds or drops columns, so look them up once.
    columns = set(cfg.dataset_train.columns)

    # Fill nulls: -1 for variables to be binned, 'missing' for discrete ones.
    for var in cfg.bin_var_list:
        if var in columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1
    for var in cfg.discrete_var_list:
        if var in columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                                  var] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # NOTE: pickle.load can execute arbitrary code; rst_path must point to a
    # trusted file. The context manager closes the file even if loading fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Replace every column that has a fitted rule with its transformed values.
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(
            cfg.dataset_train[r.var_name], r)

    cfg.dataset_train.to_csv(out_path)
def process_woe_trans(in_data_path=None, rst_path=None, config_path=None):
    '''
    Apply previously fitted WOE rules to a dataset and return the result.

    Parameters
    ----------
    in_data_path : Path of the dataset handed to ``cfg.load_file``.
    rst_path : Path of the pickle file holding the fitted rule objects.
        Each object must expose a ``var_name`` attribute.
    config_path : Optional path of the config file. When omitted, the
        original hard-coded location is used, so existing callers behave
        exactly as before.

    Return
    ------
    The dataframe held by the config object after transformation.
    '''
    if config_path is None:
        config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model_201705.csv'

    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    # The column is named 'cs_cpd' in the input but handled as 'cpd' below.
    cfg.dataset_train = cfg.dataset_train.rename(columns={'cs_cpd': 'cpd'})

    # Imputation never adds or drops columns, so look them up once.
    columns = set(cfg.dataset_train.columns)

    # Fill nulls: -1 for variables to be binned, 'missing' for discrete ones.
    for var in cfg.bin_var_list:
        if var in columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1
    for var in cfg.discrete_var_list:
        if var in columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                                  var] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # NOTE: pickle.load can execute arbitrary code; rst_path must point to a
    # trusted file. The context manager closes the file even if loading fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Replace every column that has a fitted rule with its transformed values.
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(
            cfg.dataset_train[r.var_name], r)

    return cfg.dataset_train
def proc_validattion(dataset_path, config_path, model_path):
    '''
    Score a dataset with a pickled classifier and report its KS statistic.

    Parameters
    ----------
    dataset_path : Path of the dataset handed to ``cfg.load_file``.
    config_path : Path of the config file. Previously this argument was
        printed and then silently replaced by a hard-coded path; it is now
        honoured, and the hard-coded path is only a fallback for an empty
        value.
    model_path : Path of a pickle file containing a dict with the keys
        ``'clf'`` (object exposing ``predict_proba``) and
        ``'features_list'`` (columns fed to the classifier).

    Return
    ------
    ks : Value returned by ``compute_ks(y_hat, y_test)``.
    '''
    print('####PROC VALIDATION#####')
    print('dataset_path:\n%s' % (dataset_path,))
    print('config_path:\n%s' % (config_path,))
    print('model_path:\n%s' % (model_path,))

    if not config_path:
        config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv'

    cfg = config.config()
    cfg.load_file(config_path, dataset_path)

    # Fill nulls with 0 for both binned and discrete variables.
    columns = set(cfg.dataset_train.columns)
    for var_list in (cfg.bin_var_list, cfg.discrete_var_list):
        for var in var_list:
            if var in columns:
                cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                                      var] = 0

    # NOTE: pickle.load can execute arbitrary code; model_path must point to
    # a trusted file. The context manager closes it even if loading fails.
    with open(model_path, 'rb') as model_file:
        clf_model = pickle.load(model_file)

    clf = clf_model['clf']
    X_test = cfg.dataset_train[clf_model['features_list']]
    y_test = cfg.dataset_train['target']

    # Column 1 of predict_proba is used as the score passed to compute_ks.
    y_hat = clf.predict_proba(X_test)[:, 1]
    ks = compute_ks(y_hat, y_test)

    print('global_bt: %s' % (cfg.global_bt,))
    print('global_gt: %s' % (cfg.global_gt,))
    print('ks: %s' % (ks,))
    return ks
def process_woe_trans(dataset, rst_path=None, config_path=None):
    '''
    Transform the features of a dataframe using fitted InfoValue objects.

    Parameters
    ----------
    dataset : Pandas dataframe to transform; it is handed to
        ``cfg.set_dataset``.
    rst_path : Path of the pickled list of InfoValue objects.
    config_path : Path of the config file to load.

    Return
    ------
    dataset_transformed : Copy of the config's dataframe after the
        transformation.
    '''
    settings = config.config()
    settings.load_file(config_path)
    settings.set_dataset(dataset)

    frame = settings.dataset_train
    present = list(frame.columns)

    # Only variables that actually exist in the dataframe are processed.
    continuous = [name for name in settings.bin_var_list if name in present]
    categorical = [
        name for name in settings.discrete_var_list if name in present
    ]

    # Missing values: -1 for binned variables, 'missing' for discrete ones.
    for names, filler in ((continuous, -1), (categorical, 'missing')):
        for name in names:
            frame.loc[frame[name].isnull(), name] = filler

    change_feature_dtype(frame, settings.variable_type)

    with open(rst_path, 'rb') as handle:
        fitted = pickle.load(handle)

    # Overwrite each fitted variable's column with its transformed values.
    for info in fitted:
        frame[info.var_name] = woe_trans(frame[info.var_name], info)

    return frame.copy()
def process_woe_trans(in_data_path=None,
                      rst_path=None,
                      out_path=None,
                      config_path=None):
    '''
    Apply fitted WOE rules to a CSV dataset and export the result,
    printing a timestamped progress line for each stage.

    Parameters
    ----------
    in_data_path : Path of the input CSV. It is passed to ``cfg.load_file``
        and also read directly with ``pd.read_csv``; the transformation is
        applied to the directly-read frame.
    rst_path : Path of the pickle file holding the fitted rule objects.
        Each object must expose a ``var_name`` attribute.
    out_path : Destination CSV, written without the index.
    config_path : Optional path of the config file. When omitted, the
        original hard-coded location is used.

    Return
    ------
    None. The transformed frame is written to ``out_path``.
    '''

    def _log(message):
        # Single-argument print gives identical output on Python 2 and 3.
        print('%s %s' % (time.asctime(), message))

    if config_path is None:
        config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model.csv'

    _log('load config file')
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    _log('fill na')
    dataset = pd.read_csv(in_data_path)
    columns = set(dataset.columns)

    _log('fill na continuous variables')
    for var in cfg.bin_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = -1

    _log('fill na discrete variables')
    for var in cfg.discrete_var_list:
        if var in columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    _log('change feature dtypes')
    fp.change_feature_dtype(dataset, cfg.variable_type)

    _log('load woe rule')
    # NOTE: pickle.load can execute arbitrary code; rst_path must point to a
    # trusted file. The context manager closes the file even if loading fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Replace every column that has a fitted rule with its transformed values.
    for r in rst:
        print('woe trans: %s' % (r.var_name,))
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path, index=False)
    print('%s\tSUCCESS EXPORT FILE: \n%s' % (time.asctime(), out_path))
def process_train_woe(infile_path=None,
                      outfile_path=None,
                      rst_path=None,
                      config_path=None):
    '''
    Fit WOE rules on a training dataset and persist them.

    Parameters
    ----------
    infile_path : Path of the training dataset handed to ``cfg.load_file``.
    outfile_path : Path passed to ``eval.eval_feature_detail`` for the
        feature detail output.
    rst_path : Path the list of fitted rule objects is pickled to.
    config_path : Optional path of the config file. When omitted, the
        original hard-coded location is used.

    Return
    ------
    feature_detail : Result of ``eval.eval_feature_detail``.
    rst : List of objects returned by ``fp.proc_woe_continuous`` and
        ``fp.proc_woe_discrete``, continuous variables first.
    '''
    print('run into process_train_woe: \n%s' % time.asctime())

    if config_path is None:
        config_path = 'E:\\Code\\Python_ML_Code\\cs_model\\config\\config_cs_model_pos_m2.csv'

    cfg = config.config()
    cfg.load_file(config_path, infile_path)

    def _present(names):
        # Keep only the configured variables that exist in the dataframe.
        columns = set(cfg.dataset_train.columns)
        return [name for name in names if name in columns]

    bin_var_list = _present(cfg.bin_var_list)
    for var in bin_var_list:
        # Fill nulls of variables to be binned with -1.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    rst = []

    print('process woe transformation of continuous variables: \n%s' %
          time.asctime())
    print('cfg.global_bt %s' % (cfg.global_bt,))
    print('cfg.global_gt %s' % (cfg.global_gt,))
    for var in bin_var_list:
        rst.append(
            fp.proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                   cfg.global_gt, cfg.min_sample, alpha=0.05))

    print('process woe transformation of discrete variables: \n%s' %
          time.asctime())
    for var in _present(cfg.discrete_var_list):
        # Fill nulls of discrete variables with the 'missing' category.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = 'missing'
        rst.append(
            fp.proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                                 cfg.global_gt, cfg.min_sample, alpha=0.05))

    feature_detail = eval.eval_feature_detail(rst, outfile_path)

    print('save woe transformation rule into pickle: \n%s' % time.asctime())
    # The context manager closes the file even if pickling fails.
    with open(rst_path, 'wb') as rule_file:
        pickle.dump(rst, rule_file)

    return feature_detail, rst
# -*- coding:utf-8 -*-
# Ad-hoc script: loads one training CSV plus its config, prints the global
# settings and runs continuous WOE processing on two variables.
__author__ = 'maomaochong'

import pandas as pd
import woe.feature_process as fp
import woe.config as config

dataset_train_path1 = r'E:\work_file\mmt_application_card\raw_data\mmt_application_model_feature_ftrain1.csv'
config_path = r'E:\work_file\mmt_application_card\config\config_mmt_application_model.csv'

dataset = pd.read_csv(dataset_train_path1)

# Fill nulls of the discrete variable with the 'missing' category.
var = 'data_status'
dataset.loc[dataset[var].isnull(), var] = 'missing'

cfg = config.config()
cfg.load_file(config_path, dataset_train_path1)

# Single-argument print gives identical output on Python 2 and 3.
print('cfg.global_bt %s' % (cfg.global_bt,))
print('cfg.global_gt %s' % (cfg.global_gt,))
print('cfg.min_sample %s' % (cfg.min_sample,))

# Discrete processing of 'data_status' is currently disabled:
# rst = fp.proc_woe_discrete(dataset,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05)

# The results of the two calls below are not stored.
var = 'pos_sales_commission'
fp.proc_woe_continuous(dataset, var, cfg.global_bt, cfg.global_gt,
                       cfg.min_sample, alpha=0.05)

var = 'pos_dd_fail_cnt'
fp.proc_woe_continuous(dataset, var, cfg.global_bt, cfg.global_gt,
                       cfg.min_sample, alpha=0.05)
def process_train_woe(dataset,
                      outfile_path=None,
                      rst_path=None,
                      config_path=None,
                      min_sample_weight_config=None):
    '''
    Fit WOE InfoValue objects on a training dataframe and persist them.

    Parameters
    ----------
    dataset : Pandas dataframe with the training data; it is handed to
        ``cfg.set_dataset``.
    outfile_path : Path passed to ``eval.eval_feature_detail`` for the
        feature detail output.
    rst_path : Path the list of InfoValue objects is pickled to.
    config_path : Path of the config file to load.
    min_sample_weight_config : Value handed to
        ``cfg.load_min_sample_weight_config``; the per-variable minimum
        sample is then read through ``cfg.get_min_sample``.

    Return
    ------
    feature_detail : Result of ``eval.eval_feature_detail``.
    rst : List of fitted objects, continuous variables first.
    '''
    settings = config.config()
    settings.load_file(config_path)
    settings.set_dataset(dataset)
    settings.load_min_sample_weight_config(min_sample_weight_config)
    print(settings.min_sample_weight_config)

    frame = settings.dataset_train
    present = list(frame.columns)

    # Only variables that actually exist in the dataframe are processed.
    continuous = [name for name in settings.bin_var_list if name in present]
    categorical = [
        name for name in settings.discrete_var_list if name in present
    ]

    # Binned variables get -1 for nulls before the dtype cast.
    for name in continuous:
        frame.loc[frame[name].isnull(), name] = -1

    change_feature_dtype(frame, settings.variable_type)

    # Continuous variables first, in config order.
    rst = [
        proc_woe_continuous(frame,
                            name,
                            settings.global_bt,
                            settings.global_gt,
                            settings.get_min_sample(name),
                            alpha=0.05) for name in continuous
    ]

    # Discrete variables are imputed one at a time, right before fitting.
    for name in categorical:
        frame.loc[frame[name].isnull(), name] = 'missing'
        fitted = proc_woe_discrete(frame,
                                   name,
                                   settings.global_bt,
                                   settings.global_gt,
                                   settings.get_min_sample(name),
                                   alpha=0.05)
        rst.append(fitted)

    feature_detail = eval.eval_feature_detail(rst, outfile_path)

    with open(rst_path, 'wb') as handle:
        pickle.dump(rst, handle)

    return feature_detail, rst
def process_train_woe(infile_path=None,
                      outfile_path=None,
                      rst_path=None,
                      config_path=None):
    '''
    Fit WOE rules on a training dataset and persist them.

    Parameters
    ----------
    infile_path : Path of the training dataset handed to ``cfg.load_file``.
    outfile_path : Path passed to ``eval.eval_feature_detail`` for the
        feature detail output.
    rst_path : Path the list of fitted rule objects is pickled to.
    config_path : Path of the config file handed to ``cfg.load_file``.

    Return
    ------
    feature_detail : Result of ``eval.eval_feature_detail``.
    rst : List of objects returned by ``proc_woe_continuous`` and
        ``proc_woe_discrete``, continuous variables first.
    '''
    print('run into process_train_woe: ', time.asctime())

    cfg = config.config()
    cfg.load_file(config_path, infile_path)

    def _present(names):
        # Keep only the configured variables that exist in the dataframe.
        columns = set(cfg.dataset_train.columns)
        return [name for name in names if name in columns]

    bin_var_list = _present(cfg.bin_var_list)
    for var in bin_var_list:
        # Fill nulls of variables to be binned with -1.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1

    change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    rst = []

    print('process woe transformation of continuous variables: ',
          time.asctime())
    print('cfg.global_bt', cfg.global_bt)
    print('cfg.global_gt', cfg.global_gt)

    # Process continuous variables.
    for var in bin_var_list:
        rst.append(
            proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                cfg.global_gt, cfg.min_sample, alpha=0.05))

    # Process discrete variables.
    print('process woe transformation of discrete variables: ',
          time.asctime())
    for var in _present(cfg.discrete_var_list):
        # Fill nulls of discrete variables with the 'missing' category.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = 'missing'
        rst.append(
            proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                              cfg.global_gt, cfg.min_sample, alpha=0.05))

    feature_detail = eval.eval_feature_detail(rst, outfile_path)

    print('save woe transformation rule into pickle: ', time.asctime())
    # The context manager closes the file even if pickling fails.
    with open(rst_path, 'wb') as rule_file:
        pickle.dump(rst, rule_file)

    return feature_detail, rst
def process_train_woe(infile_path=None,
                      outfile_path=None,
                      rst_path=None,
                      config_path=None):
    '''
    Resume WOE fitting: process only the variables that have no rule yet
    in the pickle at ``rst_path`` and re-save the pickle after each one.

    Parameters
    ----------
    infile_path : Path of the training dataset handed to ``cfg.load_file``.
    outfile_path : Path passed to ``eval.eval_feature_detail`` for the
        feature detail output.
    rst_path : Path of the pickle holding the list of fitted rule objects.
        The file must already exist; it is read first and rewritten after
        every newly processed variable.
    config_path : Optional path of the config file. When omitted, the
        original hard-coded location is used.

    Return
    ------
    feature_detail : Result of ``eval.eval_feature_detail``.
    rst : List of rule objects, previously stored ones first.
    '''
    print('run into process_train_woe: \n%s' % time.asctime())

    if config_path is None:
        config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model_lr.csv'

    cfg = config.config()
    cfg.load_file(config_path, infile_path)

    def _load_rules():
        # NOTE: pickle.load can execute arbitrary code; rst_path must point
        # to a trusted file.
        with open(rst_path, 'rb') as rule_file:
            return pickle.load(rule_file)

    def _dump_rules(rules):
        with open(rst_path, 'wb') as rule_file:
            pickle.dump(rules, rule_file)

    rst = _load_rules()

    # Variables that already have a stored rule are skipped below.
    exists_var_list = set(rule.var_name for rule in rst)

    def _pending(names):
        # Configured variables present in the data and not yet processed.
        columns = set(cfg.dataset_train.columns)
        return [
            name for name in names
            if name in columns and name not in exists_var_list
        ]

    bin_var_list = _pending(cfg.bin_var_list)
    for var in bin_var_list:
        # Fill nulls of variables to be binned with -1.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    print('process woe transformation of continuous variables: \n%s' %
          time.asctime())
    print('cfg.global_bt %s' % (cfg.global_bt,))
    print('cfg.global_gt %s' % (cfg.global_gt,))
    for var in bin_var_list:
        print(var)
        if rst:
            # Re-read the checkpoint before extending it, as the original
            # did whenever at least one rule was already stored.
            rst = _load_rules()
            print('load')
        rst.append(
            fp.proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                   cfg.global_gt, cfg.min_sample, alpha=0.05))
        _dump_rules(rst)
        print('dump')

    print('process woe transformation of discrete variables: \n%s' %
          time.asctime())
    for var in _pending(cfg.discrete_var_list):
        print(var)
        # Fill nulls of discrete variables with the 'missing' category.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = 'missing'
        if rst:
            rst = _load_rules()
            print('load')
        rst.append(
            fp.proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                                 cfg.global_gt, cfg.min_sample, alpha=0.05))
        _dump_rules(rst)
        print('dump')

    feature_detail = eval.eval_feature_detail(rst, outfile_path)
    return feature_detail, rst
def process_train_woe(infile_path=None,
                      outfile_path=None,
                      rst_path=None,
                      config_path=None,
                      rebin_feature_path=None):
    '''
    Fit WOE rules on a training dataset, including variables whose bins
    are supplied by a re-binning file, print a per-variable report and
    persist the rules.

    Parameters
    ----------
    infile_path : Path of the training dataset handed to ``cfg.load_file``.
    outfile_path : Path passed to ``woeeval.eval_feature_detail``.
    rst_path : Path the list of fitted rule objects is pickled to.
    config_path : Path of the config file handed to ``cfg.load_file``.
    rebin_feature_path : Path of the re-binning file handed to
        ``cfg.load_file``. Its rows are read from ``cfg.dataset_rebin``
        through the ``var_name`` and ``split`` columns.

    Return
    ------
    feature_detail : Result of ``woeeval.eval_feature_detail``.
    rst : List of fitted rule objects in processing order: continuous,
        re-binned continuous, discrete, re-binned discrete.

    Notes
    -----
    Sets ``pd.options.display.float_format`` globally, as the original did.
    '''
    import pandas as pd

    # All prints take a single pre-formatted string so the output is the
    # same on Python 2 and Python 3.
    print('run into process_train_woe: \n%s' % time.asctime())

    cfg = config.config()
    cfg.load_file(config_path, infile_path, rebin_feature_path)

    def _present(names):
        # Keep only the configured variables that exist in the dataframe.
        columns = set(cfg.dataset_train.columns)
        return [name for name in names if name in columns]

    def _rebin_rows(name):
        # Copy the slice so later assignment does not write through to (or
        # warn about) cfg.dataset_rebin.
        return cfg.dataset_rebin.loc[cfg.dataset_rebin['var_name'] ==
                                     name].copy()

    bin_var_list = _present(cfg.bin_var_list)

    # Alias (not a copy) of the training frame; only used for row counts.
    orig_dataset_train = cfg.dataset_train

    change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    rst = []
    print('cfg.global_bt %s' % (cfg.global_bt,))
    print('cfg.global_gt %s' % (cfg.global_gt,))
    print('cfg.global_categorical_missing %s' %
          (cfg.global_categorical_missing,))
    print('cfg.global_numeric_missing %s' % (cfg.global_numeric_missing,))

    # Continuous variables.
    print('process woe transformation of continuous variables: \n%s' %
          time.asctime())
    for var in bin_var_list:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_numeric_missing
        rst.append(
            proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                cfg.global_gt, cfg.min_sample,
                                cfg.global_numeric_missing,
                                cfg.global_categorical_missing, alpha=0.05))

    # Continuous variables with split points supplied by the rebin file.
    print('process woe transformation of continuous variables based on '
          'rebin logic: \n%s' % time.asctime())
    rebin_var_list = _present(cfg.rebin_var_list)
    for var in rebin_var_list:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_numeric_missing
        var_df = _rebin_rows(var)
        split_list = list(np.unique(var_df[['split']].astype(float)))
        rst.append(
            proc_woe_continuous_rebin(cfg.dataset_train, var, split_list,
                                      cfg.global_bt, cfg.global_gt,
                                      cfg.min_sample,
                                      cfg.global_numeric_missing,
                                      cfg.global_categorical_missing,
                                      alpha=0.05))

    # Discrete variables.
    print('process woe transformation of discrete variables: \n%s' %
          time.asctime())
    discrete_var_list = _present(cfg.discrete_var_list)
    for var in discrete_var_list:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_categorical_missing
        rst.append(
            proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                              cfg.global_gt, cfg.min_sample,
                              cfg.global_numeric_missing,
                              cfg.global_categorical_missing, alpha=0.05))

    # Discrete variables with groupings supplied by the rebin file.
    print('process woe transformation of discrete variables based on '
          'rebin logic: \n%s' % time.asctime())
    rebin_discrete_var_list = _present(cfg.rebin_discrete_var_list)
    for var in rebin_discrete_var_list:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_categorical_missing
        var_df = _rebin_rows(var)
        var_df.loc[:, 'split'] = var_df['split'].astype(object)
        rebin_list = list(np.unique(var_df[['split']]))
        rst.append(
            proc_woe_discrete_rebin(cfg.dataset_train, var, rebin_list,
                                    cfg.global_bt, cfg.global_gt,
                                    cfg.min_sample,
                                    cfg.global_numeric_missing,
                                    cfg.global_categorical_missing,
                                    alpha=0.05))

    feature_detail = woeeval.eval_feature_detail(rst, outfile_path)

    pd.options.display.float_format = '{:.3f}'.format

    # Per-variable report: counts, bin details and a WOE plot.
    missing_markers = [
        cfg.global_numeric_missing, cfg.global_categorical_missing
    ]
    report_vars = (bin_var_list + rebin_var_list + discrete_var_list +
                   rebin_discrete_var_list)
    for var in report_vars:
        # Rows whose value equals one of the two missing-value markers.
        missing_obs = cfg.dataset_train.loc[cfg.dataset_train[var].isin(
            missing_markers)].shape[0]
        total_obs = orig_dataset_train[var].shape[0]
        valid_obs = total_obs - missing_obs
        # Guard against an empty frame instead of raising ZeroDivisionError.
        if total_obs:
            pct_valid = valid_obs * 100.0 / total_obs
        else:
            pct_valid = float('nan')
        print('variable =  %s \t# obs =  %s \t# valid =  %s \t%% valid =  %s'
              % (var, total_obs, valid_obs, pct_valid))

        df = feature_detail.loc[feature_detail['var_name'] == var]
        print(df[[
            'split_list', 'sub_total_sample_num', 'positive_sample_num',
            'weight_positive_freq', 'weight_negative_freq',
            'perc_cum_weight_freq', 'perc_cum_weight_positive_freq',
            'perc_cum_weight_negative_freq', 'woe_list', 'iv_list', 'ks_list'
        ]])
        woeeval.plot_woe(df, var)

    s = 'summary of WOE transformation'
    print(s.center(60, '-'))
    smry_df = feature_detail[['var_name', 'iv', 'maxks', 'linearity'
                              ]].drop_duplicates().sort_values(
                                  'iv', ascending=False)
    print(smry_df)

    print('save woe transformation rule into pickle: \n%s' % time.asctime())
    # The context manager closes the file even if pickling fails.
    with open(rst_path, 'wb') as rule_file:
        pickle.dump(rst, rule_file)

    return feature_detail, rst
def process_woe_trans(in_data_path=None,
                      rst_path=None,
                      out_path=None,
                      config_path=None,
                      rebin_feature_path=None):
    '''
    Apply fitted WOE rules (including re-binned variables) to a dataset
    and export the result as CSV.

    Parameters
    ----------
    in_data_path : Path of the dataset handed to ``cfg.load_file``.
    rst_path : Path of the pickle file holding the fitted rule objects.
        Each object must expose a ``var_name`` attribute.
    out_path : Destination passed to ``DataFrame.to_csv``.
    config_path : Path of the config file handed to ``cfg.load_file``.
    rebin_feature_path : Path of the re-binning file handed to
        ``cfg.load_file``. Its rows are read from ``cfg.dataset_rebin``
        through the ``var_name`` and ``split`` columns.

    Return
    ------
    None. The transformed frame is written to ``out_path``.
    '''
    cfg = config.config()
    cfg.load_file(config_path, in_data_path, rebin_feature_path)

    # Imputation and relabelling never add or drop columns; look up once.
    columns = set(cfg.dataset_train.columns)

    def _present(names):
        # Keep only the configured variables that exist in the dataframe.
        return [name for name in names if name in columns]

    # Numeric variables (plain and re-binned) get the numeric marker.
    for var in _present(cfg.bin_var_list):
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_numeric_missing
    for var in _present(cfg.rebin_var_list):
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_numeric_missing

    # Discrete variables get the categorical marker.
    for var in _present(cfg.discrete_var_list):
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_categorical_missing

    for var in _present(cfg.rebin_discrete_var_list):
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = cfg.global_categorical_missing

        # First use the split values from the rebin file to re-bin the
        # existing dataset, so that the WOE transformation is applied to
        # the grouped labels. The slice is copied so the assignment below
        # does not write through to (or warn about) cfg.dataset_rebin.
        rebin_var_df = cfg.dataset_rebin.loc[cfg.dataset_rebin['var_name'] ==
                                             var].copy()
        rebin_var_df['split'] = rebin_var_df['split'].astype(object)
        rebin_list = list(np.unique(rebin_var_df[['split']]))

        for rebin_val in rebin_list:
            # NOTE(security): eval executes whatever text is stored in the
            # rebin file's 'split' column; only use trusted rebin files.
            # The values appear to be list literals, so ast.literal_eval is
            # probably a safe replacement -- confirm against real files.
            members = eval(rebin_val)
            # Group label: the literal text without brackets and quotes.
            label = str(rebin_val).strip('[]').replace("'", '')
            cfg.dataset_train.loc[cfg.dataset_train[var].isin(members),
                                  var] = label

    change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # NOTE: pickle.load can execute arbitrary code; rst_path must point to a
    # trusted file. The context manager closes the file even if loading fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Replace every column that has a fitted rule with its transformed values.
    for r in rst:
        cfg.dataset_train[r.var_name] = woe_trans(
            cfg.dataset_train[r.var_name], r)

    cfg.dataset_train.to_csv(out_path)