def process_woe_trans(in_data_path=None, rst_path=None):
    """Apply previously pickled WOE rules to a dataset and return it.

    The configuration is loaded from a hard-coded CSV path together with the
    data at ``in_data_path``; the ``cs_cpd`` column is renamed to ``cpd``;
    nulls are filled; dtypes are coerced; finally every rule stored in the
    pickle at ``rst_path`` is applied to its column.

    :param in_data_path: path of the dataset handed to ``cfg.load_file``
    :param rst_path: path of the pickle holding an iterable of WOE rules;
        each rule must expose ``var_name``
    :return: ``cfg.dataset_train`` with the rule columns replaced by the
        result of ``fp.woe_trans``
    """
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model_201705.csv'

    cfg = config.config()
    cfg.load_file(config_path, in_data_path)
    cfg.dataset_train = cfg.dataset_train.rename(columns={'cs_cpd': 'cpd'})

    # Fill nulls: -1 for the variables listed in bin_var_list, 'missing' for
    # those in discrete_var_list. Variables absent from the frame are skipped.
    for var in cfg.bin_var_list:
        if var in cfg.dataset_train.columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1
    for var in cfg.discrete_var_list:
        if var in cfg.dataset_train.columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # NOTE(security): pickle.load can execute arbitrary code; rst_path must
    # point at a trusted file. The context manager closes the handle even
    # when unpickling fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Training dataset WOE transformation
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(
            cfg.dataset_train[r.var_name], r)

    return cfg.dataset_train
def process_woe_trans(in_data_path=None, rst_path=None, out_path=None, config_path=None):
    """Apply pickled WOE rules to a dataset and export the result as CSV.

    :param in_data_path: path of the dataset handed to ``cfg.load_file``
    :param rst_path: path of the pickle holding an iterable of WOE rules;
        each rule must expose ``var_name``
    :param out_path: destination passed to ``DataFrame.to_csv``
    :param config_path: path of the configuration file handed to
        ``cfg.load_file``
    :return: None
    """
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    # Fill nulls: -1 for the variables listed in bin_var_list, 'missing' for
    # those in discrete_var_list. Variables absent from the frame are skipped.
    for var in cfg.bin_var_list:
        if var in cfg.dataset_train.columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1
    for var in cfg.discrete_var_list:
        if var in cfg.dataset_train.columns:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # NOTE(security): pickle.load can execute arbitrary code; rst_path must
    # point at a trusted file. The context manager closes the handle even
    # when unpickling fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Training dataset WOE transformation
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(
            cfg.dataset_train[r.var_name], r)

    cfg.dataset_train.to_csv(out_path)
def process_woe_trans(cfg=None, in_data_path=None, rst_path=None, out_path=None):
    """Apply the first pickled WOE rule to a CSV dataset and export it.

    The dataset is read directly with pandas; ``cfg`` only supplies the
    variable lists and the dtype mapping.

    :param cfg: loaded configuration exposing ``bin_var_list``,
        ``discrete_var_list`` and ``variable_type``
    :param in_data_path: CSV file read with ``pd.read_csv``
    :param rst_path: path of a pickle whose element ``[0]`` is the rule to
        apply (it must expose ``var_name``)
    :param out_path: destination passed to ``DataFrame.to_csv``
    :return: None
    """
    dataset = pd.read_csv(in_data_path)

    # Fill nulls: -1 for the variables listed in bin_var_list, 'missing' for
    # those in discrete_var_list. Variables absent from the frame are skipped.
    for var in cfg.bin_var_list:
        if var in dataset.columns:
            dataset.loc[dataset[var].isnull(), var] = -1
    for var in cfg.discrete_var_list:
        if var in dataset.columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # NOTE(security): pickle.load can execute arbitrary code; rst_path must
    # point at a trusted file. The context manager closes the handle even
    # when unpickling fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Only the first element is applied.
    # NOTE(review): presumably the pickle holds a (rule, feature_detail)
    # pair for a single feature -- confirm against the writer of rst_path.
    r = rst[0]
    dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path)
    print('%s\tSUCCESS EXPORT FILE: \n%s' % (time.asctime(), out_path))
def single_process(var):
    """Train the WOE rule for one feature against the shared target file.

    Reads the target CSV from a fixed location, coerces its dtypes, stores
    the global counts and the minimum sample size on the module-level
    ``cfg``, then delegates to ``process_train_woe``.

    :param var: feature name forwarded as ``feature_name``
    :return: whatever ``process_train_woe`` returns
    """
    target = pd.read_csv(
        r'E:\ScoreCard\cs_model\cs_m1_pos_model_daily\raw_data\dataset_split_by_cols\target.csv'
    )
    fp.change_feature_dtype(target, cfg.variable_type)

    # global_bt is the sum of the target column; global_gt is the remainder
    # of the row count.
    cfg.global_bt = sum(target.target)
    row_count = target.shape[0]
    cfg.global_gt = row_count - cfg.global_bt

    # Minimum sample size is 5% of the rows, truncated to an integer.
    cfg.min_sample = int(row_count * 0.05)

    return process_train_woe(cfg, feature_name=var, target=target)
def process_train_woe(cfg=None, feature_name=None, target=None):
    """Train and persist the WOE rule for a single feature file.

    The feature is read from ``<feature_name>.csv`` in a fixed directory.
    Its first column is treated as the variable: when that name appears in
    ``cfg.bin_var_list`` it is processed with ``fp.proc_woe_continuous``
    (nulls filled with -1), otherwise with ``fp.proc_woe_discrete`` (nulls
    filled with ``'missing'``). The result is pickled to
    ``<feature_name>.pkl`` in a fixed rule directory.

    :param cfg: configuration exposing ``bin_var_list``, ``variable_type``,
        ``global_bt``, ``global_gt`` and ``min_sample``
    :param feature_name: base name of the feature CSV and of the output pickle
    :param target: DataFrame merged with the feature on their reset index
    :return: tuple ``(riv, feature_detail)``, the same object that is pickled
    """
    print('run into process_train_woe: \n%s %s' % (feature_name, time.asctime()))

    feature_path = 'E:\\ScoreCard\\cs_model\\cs_m1_pos_model_daily\\raw_data\\dataset_split_by_cols\\'
    feature_path = feature_path + feature_name + '.csv'
    feature = pd.read_csv(feature_path)

    var = feature.columns[0]
    is_continuous = var in list(cfg.bin_var_list)

    if not is_continuous:
        # process woe transformation of discrete variables
        print('process woe transformation of discrete variables: \n%s'
              % time.asctime())

    # Fill nulls in the variable column only. The previous row-wide
    # assignment (feature.loc[mask] = filler) overwrote every column of the
    # affected rows, which is only harmless for single-column files.
    filler = -1 if is_continuous else 'missing'
    feature.loc[feature[var].isnull(), var] = filler

    fp.change_feature_dtype(feature, cfg.variable_type)

    # reset_index() exposes the row position as an 'index' column in both
    # frames; it is dropped again once the merge is done.
    dataset = pd.merge(feature.reset_index(),
                       target.reset_index()).drop('index', axis=1)

    # Drop the local references before the rule computation, as the original
    # code did, so the inputs are not kept alive by this frame.
    del feature
    del target

    proc_woe = fp.proc_woe_continuous if is_continuous else fp.proc_woe_discrete
    riv = proc_woe(dataset, var, cfg.global_bt, cfg.global_gt,
                   cfg.min_sample, alpha=0.05)

    # NOTE(review): `eval` here must be the project's evaluation module, not
    # the builtin -- confirm against the file's imports.
    feature_detail = eval.eval_feature_detail([riv])

    rst_path = 'E:\\ScoreCard\\cs_model\\cs_m1_pos_model_daily\\gendata\\WOE_Rule\\'
    rst_path = rst_path + feature_name + '.pkl'
    result = (riv, feature_detail)

    # The context manager closes the file even when pickling fails.
    with open(rst_path, 'wb') as rule_file:
        pickle.dump(result, rule_file)

    return result
def process_woe01_trans(cfg=None, rst=None, dataset=None, out_path=None):
    """Fill nulls, coerce dtypes, apply ``woe01_trans`` per rule, export CSV.

    ``dataset`` is modified in place.

    :param cfg: configuration exposing ``bin_var_list``,
        ``discrete_var_list`` and ``variable_type``
    :param rst: iterable of rules, each exposing ``var_name``
    :param dataset: DataFrame to transform
    :param out_path: destination passed to ``DataFrame.to_csv``
    :return: None
    """
    # Variables from bin_var_list that exist in the frame get -1 for nulls.
    numeric_names = [name for name in cfg.bin_var_list
                     if name in list(dataset.columns)]
    for name in numeric_names:
        null_rows = dataset[name].isnull()
        dataset.loc[null_rows, name] = -1

    # Variables from discrete_var_list that exist in the frame get 'missing'.
    category_names = [name for name in cfg.discrete_var_list
                      if name in list(dataset.columns)]
    for name in category_names:
        null_rows = dataset[name].isnull()
        dataset.loc[null_rows, name] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # Replace each rule's column with its transformed values.
    for rule in rst:
        column = rule.var_name
        dataset[column] = woe01_trans(dataset[column], rule)

    dataset.to_csv(out_path)
def process_woe_trans(in_data_path=None, rst_path=None, out_path=None):
    """Apply pickled WOE rules to a CSV dataset, logging progress, and export.

    The configuration is loaded from a hard-coded CSV path (together with
    ``in_data_path``); the dataset itself is then read separately with
    pandas, null-filled, dtype-coerced, transformed rule by rule and written
    to ``out_path`` without the index column.

    :param in_data_path: CSV file with the data to transform
    :param rst_path: path of the pickle holding an iterable of WOE rules;
        each rule must expose ``var_name``
    :param out_path: destination passed to ``DataFrame.to_csv``
    :return: None
    """
    def _log(message):
        # Timestamped progress line: "<asctime> <message>".
        print('%s %s' % (time.asctime(), message))

    _log('load config file')
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model.csv'
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    _log('fill na')
    dataset = pd.read_csv(in_data_path)

    _log('fill na continuous variables')
    for var in cfg.bin_var_list:
        if var in dataset.columns:
            dataset.loc[dataset[var].isnull(), var] = -1

    _log('fill na discrete variables')
    for var in cfg.discrete_var_list:
        if var in dataset.columns:
            dataset.loc[dataset[var].isnull(), var] = 'missing'

    _log('change feature dtypes')
    fp.change_feature_dtype(dataset, cfg.variable_type)

    _log('load woe rule')
    # NOTE(security): pickle.load can execute arbitrary code; rst_path must
    # point at a trusted file. The context manager closes the handle even
    # when unpickling fails.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Training dataset WOE transformation
    for r in rst:
        print('woe trans: %s' % (r.var_name,))
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path, index=False)
    print('%s\tSUCCESS EXPORT FILE: \n%s' % (time.asctime(), out_path))
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None):
    """Train WOE rules for every configured variable and pickle them.

    Variables from ``cfg.bin_var_list`` are processed with
    ``fp.proc_woe_continuous`` and variables from ``cfg.discrete_var_list``
    with ``fp.proc_woe_discrete``; variables absent from the training frame
    are skipped.

    :param infile_path: path of the dataset handed to ``cfg.load_file``
    :param outfile_path: forwarded to ``eval.eval_feature_detail``
    :param rst_path: destination of the pickled rule list
    :return: tuple ``(feature_detail, rst)``
    """
    print('run into process_train_woe: \n%s' % time.asctime())

    config_path = 'E:\\Code\\Python_ML_Code\\cs_model\\config\\config_cs_model_pos_m2.csv'
    cfg = config.config()
    cfg.load_file(config_path, infile_path)

    train = cfg.dataset_train
    bin_var_list = [var for var in cfg.bin_var_list if var in train.columns]

    # fill null
    for var in bin_var_list:
        train.loc[train[var].isnull(), var] = -1

    # change feature dtypes
    fp.change_feature_dtype(train, cfg.variable_type)

    rst = []

    # process woe transformation of continuous variables
    print('process woe transformation of continuous variables: \n%s'
          % time.asctime())
    print('cfg.global_bt %s' % (cfg.global_bt,))
    print('cfg.global_gt %s' % (cfg.global_gt,))
    for var in bin_var_list:
        rst.append(fp.proc_woe_continuous(
            train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample,
            alpha=0.05))

    # process woe transformation of discrete variables
    print('process woe transformation of discrete variables: \n%s'
          % time.asctime())
    discrete_var_list = [var for var in cfg.discrete_var_list
                         if var in train.columns]
    for var in discrete_var_list:
        # Discrete nulls are filled here, after the dtype change, exactly as
        # in the original ordering.
        train.loc[train[var].isnull(), var] = 'missing'
        rst.append(fp.proc_woe_discrete(
            train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample,
            alpha=0.05))

    # NOTE(review): `eval` here must be the project's evaluation module, not
    # the builtin -- confirm against the file's imports.
    feature_detail = eval.eval_feature_detail(rst, outfile_path)

    print('save woe transformation rule into pickle: \n%s' % time.asctime())
    # The context manager closes the file even when pickling fails.
    with open(rst_path, 'wb') as rule_file:
        pickle.dump(rst, rule_file)

    return feature_detail, rst
import pandas as pd import woe.config as config import woe.feature_process as fp import woe.eval as eval config_path = os.getcwd() + '\\woe\\examples\\config.csv' data_path = os.getcwd() + '\\woe\\examples\\UCI_Credit_Card.csv' cfg = config.config() cfg.load_file(config_path, data_path) for var in cfg.bin_var_list: # fill null cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0 # change feature dtypes fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type) rst = [] # process woe transformation of continuous variables for var in cfg.bin_var_list: rst.append( fp.proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample, alpha=0.05)) # process woe transformation of discrete variables for var in cfg.discrete_var_list:
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None):
    """Resume WOE rule training, checkpointing the rule list after each variable.

    The rule list already pickled at ``rst_path`` is loaded first; variables
    whose rules are present in it are skipped. After every newly trained
    variable the full list is written back to ``rst_path``.

    :param infile_path: path of the dataset handed to ``cfg.load_file``
    :param outfile_path: forwarded to ``eval.eval_feature_detail``
    :param rst_path: pickle holding the rule list; it must already exist
        (an empty pickled list is accepted) and is rewritten on every step
    :return: tuple ``(feature_detail, rst)``
    """
    def _load_rules():
        # NOTE(security): pickle.load can execute arbitrary code; rst_path
        # must point at a trusted file.
        with open(rst_path, 'rb') as rule_file:
            return pickle.load(rule_file)

    def _dump_rules(rules):
        with open(rst_path, 'wb') as rule_file:
            pickle.dump(rules, rule_file)

    print('run into process_train_woe: \n%s' % time.asctime())

    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model_lr.csv'
    cfg = config.config()
    cfg.load_file(config_path, infile_path)

    rst = _load_rules()
    exists_var_list = set(rule.var_name for rule in rst)

    train = cfg.dataset_train
    bin_var_list = [
        var for var in cfg.bin_var_list
        if var in train.columns and var not in exists_var_list
    ]

    # fill null
    for var in bin_var_list:
        train.loc[train[var].isnull(), var] = -1

    # change feature dtypes
    fp.change_feature_dtype(train, cfg.variable_type)

    # process woe transformation of continuous variables
    print('process woe transformation of continuous variables: \n%s'
          % time.asctime())
    print('cfg.global_bt %s' % (cfg.global_bt,))
    print('cfg.global_gt %s' % (cfg.global_gt,))
    for var in bin_var_list:
        print(var)
        # Re-read the checkpoint before extending it, unless nothing has
        # been stored yet (kept from the original flow).
        if rst:
            rst = _load_rules()
            print('load')
        rst.append(fp.proc_woe_continuous(
            train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample,
            alpha=0.05))
        _dump_rules(rst)
        print('dump')

    # process woe transformation of discrete variables
    print('process woe transformation of discrete variables: \n%s'
          % time.asctime())
    discrete_var_list = [
        var for var in cfg.discrete_var_list
        if var in train.columns and var not in exists_var_list
    ]
    for var in discrete_var_list:
        print(var)
        # fill null
        train.loc[train[var].isnull(), var] = 'missing'
        if rst:
            rst = _load_rules()
            print('load')
        rst.append(fp.proc_woe_discrete(
            train, var, cfg.global_bt, cfg.global_gt, cfg.min_sample,
            alpha=0.05))
        _dump_rules(rst)
        print('dump')

    # NOTE(review): `eval` here must be the project's evaluation module, not
    # the builtin -- confirm against the file's imports.
    feature_detail = eval.eval_feature_detail(rst, outfile_path)
    return feature_detail, rst
:return: null value,replace null value inplace """ for var in [tmp for tmp in bin_var_list if tmp in list(dataset.columns)]: # fill null dataset.loc[dataset[var].isnull(), (var)] = continuous_filler for var in [ tmp for tmp in discrete_var_list if tmp in list(dataset.columns) ]: # fill null dataset.loc[dataset[var].isnull(), (var)] = discrete_filler fillna(df_train, cfg.bin_var_list, cfg.discrete_var_list) fillna(df_validation, cfg.bin_var_list, cfg.discrete_var_list) fp.change_feature_dtype(df_train, cfg.variable_type) fp.change_feature_dtype(df_validation, cfg.variable_type) candidate_var_list = [ 'avg_days', 'bptp_ratio', 'city', 'due_periods_ratio', 'finish_periods_ratio', 'intime_pay', 'kptp_ratio', 'most_contact_3m', 'person_app_age', 'person_sex', 'recent_contact_day', 'rej_count', 'rpy_cn', 'seq_delay_days', 'state_sagroup', 'tot_credit_amount', 'value_balance_ratio', 'value_income_ratio' ] reload(eval) eval.eval_feature_stability(civ_list, df_train, df_validation, candidate_var_list,