# NOTE(review): this section of the script had been collapsed onto a single
# physical line, so everything after the first '#' was parsed as a comment and
# only the first assignment ran. The statements are restored below, one per
# line, in their original order and with their original names.
a = step01_feature_engine.fill_null_data(lf)
#==============================================================================
# Plotting: draw one bar chart per object-dtype column of `lf`.
objectColumns = lf.select_dtypes(include=["object"]).columns
var = lf[objectColumns].columns
for i in var:
    step06_draw_plot.drawBar(lf[i])

# Draw one histogram per float-dtype column of `lf`.
# NOTE(review): `objectColumns` is rebound to the float columns here; the name
# is kept as-is because later parts of the script may read it.
objectColumns = lf.select_dtypes(include=["float"]).columns
var = lf[objectColumns].columns
for i in var:
    step06_draw_plot.drawHistogram(lf[i])

# Same-value check (per the original comment), using 0.931 as the ratio limit.
lf2, feature_primaryvalue_ratio = step01_feature_engine.select_primaryvalue_ratio(
    lf, ratiolimit=0.931)

# Print the string-typed variables (per the original comment).
step01_feature_engine.check_feature_binary(lf2)

# Inspect the distribution of each discrete value (per the original comment).
step01_feature_engine.watch_obj(lf2)

# Build the mapping used to convert ordinal variables (per the original comment).
# NOTE(review): the `mapping_dict1` literal is cut off at the edge of this view.
# Its visible text is kept verbatim below as comment text (which is how the
# collapsed line was parsed) until the rest of the literal is restored.
# mapping_dict1 = { "var1": { "无": 0, "近151-180天": 1, "近121-150天": 1, "近91-120天": 1, "近61-90天": 1,
'same_com_lo_qurry_num_3m', 'same_com_lo_card_num_3m', 'same_com_insur_qurry_num_3m', 'self_loan_dv_in1m', 'self_loan_dv_in3m', 'self_loan_dv_in6m', 'self_loan_dv_in12m', 'self_loan_dv_in24m', 'self_card_query_in6m', 'self_card_query_in3m', 'self_card_query_in1m', 'self_card_query_in12m', 'self_card_query_in24m', 'self_loan_query_in6m', 'self_loan_query_in3m', 'self_loan_query_in1m', 'self_loan_query_in12m', 'self_loan_query_in24m', 'self_loan_query_de_f_in6m', 'self_loan_query_de_f_in3m', 'self_loan_query_de_f_in1m', 'self_loan_query_de_f_in12m', 'self_loan_query_de_f_in24m', 'self_loan_card_query_in6m', 'self_loan_card_query_in3m', 'self_loan_card_query_in1m', 'self_loan_card_query_in12m', 'self_loan_card_query_in24m', 'y' ]] #同值化检查 df2, feature_primaryvalue_ratio = step01_feature_engine.select_primaryvalue_ratio( df1, ratiolimit=0.95) #查看缺失值情况 df3, null_ratio = step01_feature_engine.select_null_ratio(df2) var = list(df2.columns) for i in var: step06_draw_plot.drawHistogram(df[i]) #查看缺失值情况 #step01_feature_engine.fill_null_data(df3) df3.isnull().sum(axis=0).sort_values(ascending=False) null_ratio = step01_feature_engine.select_null_ratio(df3) df4 = df3.fillna(0) df4.isnull().sum(axis=0).sort_values(ascending=False)