def AllWoeCals(self, vrs=None):
    """
    vrs: version tag used for version control of the saved IV/WOE files.
    """
    ivs = {}
    # tqdm used as a context manager closes the progress bar on normal exit and
    # on KeyboardInterrupt alike, so no explicit t.close() is needed.
    with tqdm(self.ftrs.keys()) as t:
        for i in t:
            self.setTgt(self.data[[i, 'label']])
            if self.ftrs[i] == 'str':
                self.strWoe_cal()
            elif self.ftrs[i] in ['int', 'float']:
                self.woe_cal()
            elif isinstance(self.ftrs[i], dict):
                # a dict spec maps string values to ordered codes before WOE
                self._setStrValue(self.ftrs[i], ifraise=False)
                self.woeDetail[i]['str2orders'] = self.ftrs[i]
                self.woe_cal()
            ivs[i] = self.getIVinfo()
    if vrs is not None:
        tools.putFile(self.path + '/feature_process_methods/IVstat',
                      'IVs_' + vrs + '.json', ivs)
        tools.putFile(self.path + '/feature_process_methods/IVstat',
                      'woeDetail_' + vrs + '.json', self.woeDetail)
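# For reference, a minimal sketch of the WOE/IV arithmetic that woe_cal() and
# getIVinfo() presumably perform per feature (their implementations are not part
# of this excerpt; `binned` and `label` are hypothetical inputs: a binned
# feature column and a 0/1 target).
def _woe_iv_sketch(binned, label, eps=0.5):
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'bin': binned, 'label': label})
    grp = df.groupby('bin')['label'].agg(['sum', 'count'])
    bad = grp['sum'] + eps                  # events per bin, smoothed to avoid log(0)
    good = grp['count'] - grp['sum'] + eps  # non-events per bin, smoothed
    bad_pct, good_pct = bad / bad.sum(), good / good.sum()
    woe = np.log(bad_pct / good_pct)        # WOE_bin = ln(%bad / %good)
    iv = float(((bad_pct - good_pct) * woe).sum())  # IV = sum((%bad - %good) * WOE)
    return woe, iv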
            'type_1,type_2': 0
        }
    }
    smy['woeCal']['ft_lbs_dis_label'] = {
        'type_info': {
            'd0': 1,
            'd1_300': 2,
            'd301_800': 3,
            'd801_2500': 4,
            'd2501_8000': 5,
            'd8001_20000': 6,
            'd20000_': 7
        }
    }
    tools.putFile(path + '/' + version, 'process_methods.json', smy)

if ifselect:
    # logistic regression on non-WOE-encoded features is recommended as well,
    # to compare model performance
    # set the logistic-regression parameters and hold out a test set
    prc_methods = tools.getJson(path + '/' + version + '/process_methods.json')
    data, oot, data_lb, oot_lb = train_test_split(raw_data.drop('label', axis=1),
                                                  raw_data['label'],
                                                  test_size=0.2,
                                                  random_state=rnd_seed)
    params = {'ifconst': True, 'ifnull': True}
    print('------------------------------------ IV calculation ------------------------------------')
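    # Illustration only (not part of the pipeline): a common way to consume the
    # IV statistics written by AllWoeCals is to keep features whose IV clears a
    # floor; 0.02 is a conventional cutoff, and the file name 'IVs_v1.json' is a
    # placeholder for whatever version tag was actually passed.
    # ivs = tools.getJson(path + '/feature_process_methods/IVstat/IVs_v1.json')
    # selected = [ft for ft, iv in ivs.items() if iv is not None and iv >= 0.02]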
        elif k in to_drop_list:
            smy['toDrop'].append({k: 'no feature'})
        elif js[k]['dist'] <= 1:
            smy['toDrop'].append({k: 'unique_value'})
        elif js[k]['type'] == 'str' and js[k]['dist'] > 30:
            smy['toDrop'].append({k: 'too much chars'})
        elif raw_data[k].isnull().sum() / total_data > 0.98:
            smy['toDrop'].append({k: 'too much missing'})
        elif js[k]['type'] == 'str':
            smy['str_col'].append(k)
        elif js[k]['type'] == 'int':
            smy['int_col'].append(k)
        elif js[k]['type'] == 'float':
            smy['float_col'].append(k)
    tools.putFile(path, 'summary.json', smy)

if ftr_stat:
    # read back the raw data and the summary generated above
    print("Reading related information...")
    smy = tools.getJson(path + '/summary.json')
    toDrop = smy.get('toDrop')
    toDropList = [list(a.keys())[0] for a in toDrop
                  if list(a.values())[0] != 'no feature']
    #ids = smy.get('ids')
    str_col = smy.get('str_col')
    int_col = smy.get('int_col')
    float_col = smy.get('float_col')
    #toOneHot = smy.get('toOneHot')
def ft_type_check(path, output, header=True, size_c=1000):
    """
    Check the feature types of the sample data.
    """
    fsize = os.path.getsize(path) / 1024**2
    if fsize > size_c:
        # file is too large: assess types from the first 2000 rows only
        warnings.warn('sample data is used to assess data type')
        with open(path, 'r') as f:
            txt = [re.sub(r'[\t,|]', ',', a).strip().split(',')
                   for a in f.readlines()[:2000 + int(header)]]
        if header:
            hds = txt[0]
            vls = txt[1:]
        else:
            hds = ['f' + str(a) for a in range(len(txt[0]))]
            vls = txt
        all_data = pd.DataFrame(vls, columns=hds)
        all_data = all_data.replace({'': None})
    else:
        if header:
            all_data = pd.read_csv(path, sep=',', header=0, engine='python')
        else:
            all_data = pd.read_csv(path, sep=',', header=None, engine='python')
            all_data.columns = ['f' + str(a) for a in list(all_data.columns.values)]

    if all_data.shape[1] == 1:
        warnings.warn("parsing probably failed: only one column detected")

    hds = list(all_data.columns.values)
    rlts = {}
    for i in hds:
        tmp = all_data[i].dropna()
        try:
            # np.float/np.int were removed in NumPy >= 1.24; use builtins instead
            tmp = tmp.astype(float)
            unq = len(tmp.unique())
            if unq > 100:
                rlts[i] = {'type': 'float', 'dist': len(all_data[i].unique())}
            else:
                rlts[i] = {'type': 'int', 'dist': len(all_data[i].unique())}
        except (ValueError, TypeError):
            rlts[i] = {'type': 'str', 'dist': len(all_data[i].unique())}

    if fsize > size_c:
        nm = 'type_info_sample.json'
    else:
        nm = 'type_info.json'
    tools.putFile(output, nm, rlts)
    return rlts
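# Hypothetical usage (paths and values are placeholders): writes
# 'type_info.json', or 'type_info_sample.json' when the file exceeds size_c MB,
# under `output`, and returns the same {feature: {'type': ..., 'dist': ...}} map.
# rlts = ft_type_check('data/raw.csv', 'data/meta', header=True)
# rlts['f0']  ->  {'type': 'float', 'dist': 1523}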
if if_gnrt_smy:
    smy = {'undo': [], 'fill': {}, 'cap': {}, 'var2char': {}, 'onehot': {}, 'woeCal': {}}
    for i in int_col + float_col:
        smy['undo'] += [i]
    # special handling for the string-valued features
    smy['var2char']['ft_tag_age'] = {'0-17': 1, '18-24': 2, '25-34': 3,
                                     '35-44': 4, '45+': 5}
    smy['var2char']['ft_gz_grey_list'] = {'micro_loan_5_': 2,
                                          'micro_loan_3_4': 1,
                                          'micro_loan_5_,type_1': 2,
                                          'micro_loan_5_,type_2': 2,
                                          'micro_loan_3_4,type_1': 1,
                                          'micro_loan_3_4,type_2': 1,
                                          'type_1': 0,
                                          'type_2': 0,
                                          'type_1,type_2': 0}
    smy['var2char']['ft_lbs_dis_label'] = {'d0': 1, 'd1_300': 2, 'd301_800': 3,
                                           'd801_2500': 4, 'd2501_8000': 5,
                                           'd8001_20000': 6, 'd20000_': 7}
    tools.putFile(path + '/' + version, 'process_methods.json', smy)

if ifselect:
    # 维信-style approach: random feature-subset selection, evaluated mainly by gain
    # uses xgboost; features generally need no special treatment apart from capping
    # scores are computed over randomly drawn feature subsets
    # an OOT (out-of-time) set must be held out
    model_params = {
        #'booster': 'gbtree',
        'objective': 'binary:logistic',   # for multi-class problems, set:
        #'eval_metric': 'auc',
        #'num_class': 10,                 # number of classes, used with multi:softmax
        'gamma': 0.2,      # post-pruning control; larger is more conservative, typically 0.1-0.2
        'max_depth': 2,    # tree depth; deeper trees are more prone to overfitting
        'int_col': [],
        'float_col': [],
        'str_col': [],
        'toDrop': []
    }
    total_data = len(raw_data)
    try:
        js = tools.getJson(path + '/' + 'type_info.json')
    except Exception:
        # fall back to the sampled type info when the full file is absent
        js = tools.getJson(path + '/' + 'type_info_sample.json')
    print('generating summary')
    for k, v in tqdm(js.items()):
        if k == smy['label'] or k == smy['dayno']:
            continue
        elif k in to_drop_list:
            smy['toDrop'].append({k: 'no feature'})
        elif js[k]['dist'] <= 1:
            smy['toDrop'].append({k: 'unique_value'})
        elif js[k]['type'] == 'str' and js[k]['dist'] > 30:
            smy['toDrop'].append({k: 'too much chars'})
        elif raw_data[k].isnull().sum() / total_data > 0.98:
            smy['toDrop'].append({k: 'too much missing'})
        elif js[k]['type'] == 'str':
            smy['str_col'].append(k)
        elif js[k]['type'] == 'int':
            smy['int_col'].append(k)
        elif js[k]['type'] == 'float':
            smy['float_col'].append(k)
    tools.putFile(path, 'summary.json', smy)
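    # For orientation, the summary.json written above has roughly this shape
    # (feature names and values are illustrative only):
    # {
    #     "label": "label", "dayno": "dayno",
    #     "int_col": ["ft_a", ...], "float_col": ["ft_b", ...], "str_col": ["ft_c", ...],
    #     "toDrop": [{"ft_d": "unique_value"}, {"ft_e": "too much missing"}]
    # }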
                tmp['bf'] = spurs.getIVinfo()
                spurs.setTgt(df_af)
                spurs.woe_cal()
                tmp['af'] = spurs.getIVinfo()
                ivs[i] = tmp
            except Exception:
                ivs[i] = {'bf': None, 'af': None}
except KeyboardInterrupt:
    t.close()
    raise
t.close()
# dump the features for which WOE could not be computed
tools.putFile(path + '/feature_stat', 'invalidIV_mdf.json', spurs.getInvalid())
tools.putFile(path + '/feature_stat', 'misStat_mdf.json', mis)
tools.putFile(path + '/feature_stat', 'ivStat_mdf.json', ivs)
tools.putFile(path + '/feature_stat', 'ksStat_mdf.json', kses)
tools.putFile(path + '/feature_stat', 'tvStat_mdf.json', tvs)
tools.putFile(path + '/feature_stat', 'psiStat_mdf.json', psis)
# assemble one wide table: missing-rate stats joined with the IV and KS stats
final = pd.DataFrame(mis).T
tmp = pd.DataFrame(ivs).T
tmp.columns = ['iv_' + str(a) for a in tmp.columns]
final = pd.merge(left=final, right=tmp, left_index=True,
                 right_index=True, how='outer')
tmp = pd.DataFrame(kses).T