# Multivariate analysis: keep only the variables whose pairwise correlation is
# below the 0.6 threshold. Per the original author's note, the correlation
# matrix is compared pair by pair and the variable with the smaller IV is the
# one that gets dropped.
per_col = step02_modle_plot.fillter_pearson(pearson_coef, threshold=0.60)
print('保留了变量有:', len(per_col))
print(per_col)

# Author's note: 136 variables went in and 37 were retained. The retained
# names are pinned here by hand, together with the target column 'y'.
lf5_columns = [
    'var8', 'var94', 'var17', 'var7', 'var13', 'var139', 'var78', 'var53',
    'var121', 'var97', 'var147', 'var113', 'var59', 'var57', 'var27',
    'var114', 'var26', 'var144', 'var154', 'var2', 'var141', 'var136',
    'var65', 'var135', 'var123', 'var107', 'var108', 'var122', 'var40',
    'var118', 'var133', 'var89', 'var19', 'var14', 'var134', 'var145',
    'var156',
    'y',
]
lf5 = new_data[lf5_columns]

# Plot the correlations of the reduced frame to look at collinearity again.
pearson_coef = step02_modle_plot.plot_pearson(lf5)

# Persist the reduced frame.
lf5_csv_path = r"C:\Users\Administrator\Desktop\data.csv"
lf5.to_csv(lf5_csv_path)

# IV-based filtering with 5 groups; the IV table is exported for inspection.
# NOTE(review): the target is a legacy .xls workbook; recent pandas releases
# no longer bundle an .xls writer - confirm the pandas version in use.
data, iv_value = step01_feature_engine.filter_iv(lf5, group=5)
iv_report_path = r"F:\TS\external_data_test\电话邦\通善_测试结果\output\iv_value_2.xls"
iv_value.to_excel(iv_report_path)

# Split into features / target and check variance inflation factors.
# Author's note: two variables had VIF > 10, i.e. they are collinear.
X, y = step01_feature_engine.x_y_data(data)
vif_data = step01_feature_engine.judge_vif(X)

# Resample with the project's SMOTE helper, then fit the baseline model.
X, y = step01_feature_engine.smote_data(X, y)
model = step03_built_modle.baseline_model(X, y)
# pvals = pvals.to_dict()
#
# Disabled feature-standardisation step, kept for reference:
#   from sklearn.preprocessing import StandardScaler  # import the module
#   sc = StandardScaler()
#   X[Col] = sc.fit_transform(X[Col])

# Handle class imbalance. Author's note: this approach is recommended when
# there are too few samples.
# NOTE(review): the statements immediately before this section already
# resample with smote_data and fit baseline_model - confirm that running
# both steps a second time here is intended.
X, y = step01_feature_engine.smote_data(X, y)
model = step03_built_modle.baseline_model(X, y)

# Output recorded by the author for the baseline model:
#   confusion_matrix
#   [[1531  861]
#    [ 764 1628]]
#   accuracy_score  0.668147373922
#   precision_score 0.663346613546
#   recall_score    0.691588785047
#   ROC_AUC is      0.728932086353
#   K-S score       0.354737526648

# Or: fit the alternative model instead; this rebinds `model`.
model = log_model_test(X, y)