# Persist the merged-bin boundaries for continuous variables so later runs
# can reuse the Chi-merge result without recomputing it.
# Use a context manager so the handle is closed even if pickle.dump raises.
with open(folderOfData + 'continous_merged_dict.pkl', 'wb+') as file3:
    pickle.dump(continous_merged_dict, file3)

# Step 4: WOE encoding and IV (Information Value) computation.
WOE_dict = {}
IV_dict = {}
# WOE-encode the binned variables, which cover four cases:
# 1. Categorical variables with fewer than 5 distinct values that need no
#    merging — stored in less_value_features.
# 2. Categorical variables with fewer than 5 distinct values that required
#    merging — merged variables stored in var_bin_list.
# 3. Categorical variables with more than 5 distinct values that required
#    merging — merged variables stored in var_bin_list.
# 4. Continuous variables — binned variables stored in var_bin_list.
all_var = var_bin_list + less_value_features
for var in all_var:
    # CalcWOE returns both the per-bin WOE mapping and the variable's IV
    # in a single pass over the training data.
    woe_iv = scorecard_function.CalcWOE(trainData, var, 'y')
    WOE_dict[var] = woe_iv['WOE']
    IV_dict[var] = woe_iv['IV']

# Persist the WOE encodings for scoring/inference later.
with open(folderOfData + 'WOE_dict.pkl', 'wb+') as file4:
    pickle.dump(WOE_dict, file4)

# Sort variables by IV in descending order to ease later feature selection.
IV_dict_sorted = sorted(IV_dict.items(), key=lambda x: x[1], reverse=True)
IV_values = [i[1] for i in IV_dict_sorted]
IV_name = [i[0] for i in IV_dict_sorted]
plt.title('feature IV')
plt.bar(range(len(IV_values)), IV_values)
plt.show()
    # NOTE(review): these two lines are the tail of a function whose `def`
    # begins outside this view — it appears to pick the bin boundary in
    # `bins` closest (by whatever metric built `t`) to the input; confirm
    # against the full definition before editing.
    p = t.index(min(t))
    return bins[p]

# Replace each raw value with a representative value of its Chi-merge bin.
# NOTE(review): this script runs at module level, where locals() is
# globals() — the locals()[...] assignments rely on that and would silently
# break inside a function.  *_chi entries are presumed to be the per-column
# Chi-merge boundaries built earlier — TODO confirm.
for x in X.columns:
    locals()[x+'_me'] = copy.deepcopy(X[x])
    locals()[x+'_me'] = locals()[x+'_me'].map(lambda e:close(e,globals()[x+'_chi'])) # inside the lambda the outer variable must be reached via globals()
X_me = pd.concat([locals()[x+'_me'] for x in X.columns],axis = 1)

# Alternatively, treat each bin as a discrete category (for continuous
# values; truly categorical data would still need a LabelEncoder).
for x in X.columns:
    locals()[x+'_le'] = X[x].map(lambda e:scorecard_function.AssignBin(e, globals()[x+'_chi'], special_attribute=[]))
X_le = pd.concat([locals()[x+'_le'] for x in X.columns],axis = 1)

# WOE encoding — a supervised encoding commonly used for credit scorecards.
for x in X.columns:
    # CalcWOE is computed on the label-encoded bins joined with the target y.
    locals()['woe_'+x] = scorecard_function.CalcWOE(pd.concat([X_le,y],axis=1), x, 'y')['WOE']
    locals()[x+'_woe'] = X_le[x].map(lambda e:globals()['woe_'+x][e]) # mapped through the xi_le bin labels
X_woe = pd.concat([locals()[x+'_woe'] for x in X.columns],axis = 1)

# Standardization (zero mean, unit variance) of the raw and bin-representative
# feature matrices.
from sklearn.preprocessing import StandardScaler
s = StandardScaler().fit(X)
X_s = pd.DataFrame(s.transform(X),columns = X.columns)
me_s = StandardScaler().fit(X_me)
X_me_s = pd.DataFrame(me_s.transform(X_me),columns = X_me.columns)
# Resampling