Пример #1
0
# Pickle `continous_merged_dict` into the data folder.
# The `with` block closes the handle even when pickle.dump raises, which the
# previous open()/close() pair did not guarantee.
with open(folderOfData + 'continous_merged_dict.pkl', 'wb+') as file3:
    pickle.dump(continous_merged_dict, file3)
'''
第四步:WOE编码、计算IV
'''
# Results keyed by variable name: the 'WOE' and 'IV' entries returned by
# scorecard_function.CalcWOE for each variable.
WOE_dict = {}
IV_dict = {}
# Encode the variables obtained after binning. They comprise:
# 1. categorical variables with fewer than 5 initial values that need no
#    merging; these are stored in less_value_features
# 2. categorical variables with fewer than 5 initial values that need
#    merging; the merged new variables are stored in var_bin_list
# 3. categorical variables with more than 5 initial values that need
#    merging; the merged new variables are stored in var_bin_list
# 4. continuous variables; the binned new variables are stored in var_bin_list
all_var = var_bin_list + less_value_features
for var in all_var:
    # 'y' is the name of the target column passed to CalcWOE.
    woe_iv = scorecard_function.CalcWOE(trainData, var, 'y')
    WOE_dict[var] = woe_iv['WOE']
    IV_dict[var] = woe_iv['IV']

# Persist the WOE mappings. The `with` block closes the handle even when
# pickle.dump raises, which the previous open()/close() pair did not guarantee.
with open(folderOfData + 'WOE_dict.pkl', 'wb+') as file4:
    pickle.dump(WOE_dict, file4)

# Rank the variables by IV in descending order so that later variable
# selection is easier.
IV_dict_sorted = sorted(
    IV_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Split the ranked (name, IV) pairs into two parallel lists.
IV_name = [name for name, _ in IV_dict_sorted]
IV_values = [iv for _, iv in IV_dict_sorted]

# Bar chart of the IV values, one bar per variable in ranked order.
plt.bar(range(len(IV_values)), IV_values)
plt.title('feature IV')
plt.show()
Пример #2
0
        p = t.index(min(t))
        return bins[p]

    # Encode every column of X by mapping each value through close(), using
    # the object stored under the global name "<column>_chi".
    # NOTE(review): close() is only partly visible from here; it appears to
    # return the entry of the "<column>_chi" list selected by a minimum
    # criterion -- confirm against its definition.
    _me_cols = []
    for x in X.columns:
        # The lambda looks "<column>_chi" up through globals() because, inside
        # the lambda, locals() would refer to the lambda's own scope.
        # Series.map returns a new Series, so X is left untouched and no
        # defensive deepcopy is needed.
        _me = X[x].map(lambda e: close(e, globals()[x+'_chi']))
        # Keep publishing the dynamic "<column>_me" name as before.
        locals()[x+'_me'] = _me
        _me_cols.append(_me)
    # The encoded columns are collected in a plain list instead of being read
    # back with locals() inside a list comprehension: on Python 3.0-3.11 a
    # comprehension has its own scope, so that lookup raised KeyError.
    X_me = pd.concat(_me_cols, axis=1)

    # Alternatively, treat each bin as a discretised category (this is meant
    # for continuous values; categorical data would still need a LabelEncoder).
    _le_cols = []
    for x in X.columns:
        # Each value is assigned a bin via AssignBin, using the object stored
        # under the global name "<column>_chi" and no special attributes.
        _le = X[x].map(lambda e: scorecard_function.AssignBin(e, globals()[x+'_chi'], special_attribute=[]))
        # Keep publishing the dynamic "<column>_le" name as before.
        locals()[x+'_le'] = _le
        _le_cols.append(_le)
    # The encoded columns are collected in a plain list instead of being read
    # back with locals() inside a list comprehension: on Python 3.0-3.11 a
    # comprehension has its own scope, so that lookup raised KeyError.
    X_le = pd.concat(_le_cols, axis=1)

    # WOE encoding: a supervised way of assigning values, commonly used in
    # credit scorecards.
    _woe_cols = []
    for x in X.columns:
        # CalcWOE receives the binned features joined with the target; 'y' is
        # the name of the target column. Only its 'WOE' entry is used here.
        _woe_map = scorecard_function.CalcWOE(pd.concat([X_le, y], axis=1), x, 'y')['WOE']
        # Keep publishing the dynamic "woe_<column>" name as before.
        locals()['woe_'+x] = _woe_map
        # Translate the binned column of X_le through the WOE mapping. The
        # mapping is bound to a plain variable rather than written through
        # locals() and read back through globals(), which only worked when
        # both refer to the same namespace.
        _woe = X_le[x].map(lambda e: _woe_map[e])
        # Keep publishing the dynamic "<column>_woe" name as before.
        locals()[x+'_woe'] = _woe
        _woe_cols.append(_woe)
    # The encoded columns are collected in a plain list instead of being read
    # back with locals() inside a list comprehension: on Python 3.0-3.11 a
    # comprehension has its own scope, so that lookup raised KeyError.
    X_woe = pd.concat(_woe_cols, axis=1)



    # Standardisation: fit one scaler per feature matrix and wrap the
    # transformed values in a DataFrame carrying the same column labels.
    from sklearn.preprocessing import StandardScaler

    # Scaler for X.
    s = StandardScaler()
    s.fit(X)
    X_s = pd.DataFrame(data=s.transform(X), columns=X.columns)

    # Scaler for X_me.
    me_s = StandardScaler()
    me_s.fit(X_me)
    X_me_s = pd.DataFrame(data=me_s.transform(X_me), columns=X_me.columns)


    #重采样