Exemplo n.º 1
0
    'self_card_query_in6m', 'self_card_query_in3m', 'self_card_query_in1m',
    'self_card_query_in12m', 'self_card_query_in24m', 'self_loan_query_in6m',
    'self_loan_query_in3m', 'self_loan_query_in1m', 'self_loan_query_in12m',
    'self_loan_query_in24m', 'self_loan_query_de_f_in6m',
    'self_loan_query_de_f_in3m', 'self_loan_query_de_f_in1m',
    'self_loan_query_de_f_in12m', 'self_loan_query_de_f_in24m',
    'self_loan_card_query_in6m', 'self_loan_card_query_in3m',
    'self_loan_card_query_in1m', 'self_loan_card_query_in12m',
    'self_loan_card_query_in24m', 'y'
]]

# Same-value check: run the dominant-value filter on df1 with ratiolimit=0.95.
# NOTE(review): presumably drops features whose most frequent value exceeds
# that share of rows -- confirm against step01_feature_engine.
df2, feature_primaryvalue_ratio = step01_feature_engine.select_primaryvalue_ratio(
    df1, ratiolimit=0.95)
# Inspect the missing-value ratios of the remaining features.
df3, null_ratio = step01_feature_engine.select_null_ratio(df2)

# Draw a histogram for every column that survived the same-value check.
# The series is read from df2 itself -- the frame whose columns are being
# iterated -- so every lookup is guaranteed to resolve.
var = list(df2.columns)
for col in var:
    step06_draw_plot.drawHistogram(df2[col])

# Inspect the missing values again, this time on df3.
#step01_feature_engine.fill_null_data(df3)
# Bare expression: the per-column null counts are only shown when this line is
# run interactively; as a script statement the result is discarded.
df3.isnull().sum(axis=0).sort_values(ascending=False)
# select_null_ratio hands back a pair (it is unpacked into two names where it
# is first called above); unpack it here too so that null_ratio holds only the
# second element instead of the whole pair.
_filtered, null_ratio = step01_feature_engine.select_null_ratio(df3)

# Replace every remaining missing value with 0, then re-check the null counts
# (again only visible interactively).
df4 = df3.fillna(0)
df4.isnull().sum(axis=0).sort_values(ascending=False)

# Original note: keep the variables whose IV is greater than 0.02.
# NOTE(review): the 0.02 threshold is not visible in this call -- presumably
# applied inside filter_iv; confirm.
new_data, iv_value = step01_feature_engine.filter_iv(df4, group=5)
Exemplo n.º 2
0
    df2, ratiolimit=0.942)

# Columns singled out for IV calculation and binning of their missing values
# (per the original note), together with the target column 'y'.
null_columns = [
    'mths_since_last_record',
    'mths_since_recent_bc_dlq',
    'mths_since_last_major_derog',
    'mths_since_recent_revol_delinq',
    'mths_since_last_delinq',
    'il_util',
    'mths_since_recent_inq',
    'y',
]
null_data = df3[null_columns]

# Dump that subset to CSV (comma-separated, no index column, UTF-8).
csvfile = r"F:\TS\Lending_Club\04_output\06_null_data\null_data.csv"
null_data.to_csv(csvfile, sep=',', index=False, encoding='utf-8')

# Original note: apart from mths_since_recent_inq and il_util, all of these
# variables have an IV below 0.01 and can simply be dropped.

# Inspect missing values with ratiolimit=0.40. Original note: the variable
# count goes from 122 to 83; the few variables with 40%-80% missing carry
# little information, so they need no separate "missing" encoding.
df4, null_ratio = step01_feature_engine.select_null_ratio(df3, ratiolimit=0.40)

# Descriptive statistics, one row per variable (describe() output transposed,
# with the variable name moved out of the index into a column).
summary = df.describe()
ds = summary.transpose().reset_index()

# Strip the trailing percent sign from the percentage columns and cast them
# to float.
for pct_col in ('revol_util', 'int_rate'):
    df4[pct_col] = df4[pct_col].str.rstrip('%').astype('float')

# Same treatment for the loan term.
# NOTE(review): rstrip('months') strips any trailing run of the characters
# m/o/n/t/h/s rather than the literal suffix "months"; that is harmless only
# while the text in front of it ends in a character outside that set.
df4['term'] = df4['term'].str.rstrip('months').astype('float')

##############################################
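#Encode or drop the string-typed columns

##Drop meaningless or duplicated variables; the variable count goes from 83 to 72
##Drop post-loan variables, since they would leak information into the application model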