def coef():
    """Compute the Pearson correlation of each numeric feature with the label.

    Loads the pickled bunch 'data_set.data', drops the categorical columns
    (Pearson correlation is meaningless for them), and correlates every
    remaining numeric training column against the training label.

    Returns:
        pandas.DataFrame with columns ['feature', 'corr'] — one row per
        numeric feature.  (Originally the result was computed and silently
        discarded; it is now returned, which is backward compatible since
        callers previously received None.)
    """
    data = readbunchobj('data_set.data')
    cate_feature_col = ['gender', 'receipt_address', 'household_register']
    X = data.X_train
    y = data.y_train
    col = data.col
    # keep only the numeric feature columns
    x_num_col = [i for i in list(col) if i not in cate_feature_col]
    X = X[x_num_col]
    # np.corrcoef returns a 2x2 matrix; [0][1] is corr(feature, label)
    coef_list = [[c, np.corrcoef(X[c], y)[0][1]] for c in X.columns]
    coef_df = pd.DataFrame(coef_list, columns=['feature', 'corr'])
    return coef_df
def t_model(y_test, y_pred):
    """Print the confusion-matrix cells and binary-classification metrics.

    Args:
        y_test: ground-truth binary labels.
        y_pred: predicted binary labels.
    """
    c_m = metrics.confusion_matrix(y_test, y_pred)
    # sklearn confusion_matrix layout: [[TN, FP], [FN, TP]]
    print('真反例:{0}\n假反例:{1}\n真正例:{2}\n假正例:{3}\n'.format(
        c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
    print("召回率:%.4f" % metrics.recall_score(y_test, y_pred))
    print("查准率:%.4f" % metrics.precision_score(y_test, y_pred))
    print("F1:%.4f" % metrics.f1_score(y_test, y_pred))
    print("roc_auc:%.4f" % metrics.roc_auc_score(y_test, y_pred))
    # NOTE(review): this prints precision*recall, which is not the standard
    # F-measure (that would be 2PR/(P+R), already printed as F1 above) —
    # confirm the intended metric before relying on this line.
    print("F-measure:%.4f" % (metrics.recall_score(y_test, y_pred)
                              * metrics.precision_score(y_test, y_pred)))


if __name__ == '__main__':
    # Train a bagged boosting ensemble on the WOE-encoded dataset and report metrics.
    data = readbunchobj('dataset_woe.data')
    X_train = pd.DataFrame(data.X_train)
    X_test = data.X_test
    y_train = data.y_train
    y_test = data.y_test
    n = 100  # number of bootstrap sub-samples
    start = time.time()
    clf_list, clf_score = bagging_boost_fit(X_train, y_train, n)
    end = time.time()
    # bug fix: original printed start - end, i.e. a negative elapsed time
    print(end - start)
    y_pred = bagging_boost_predict(X_test, clf_list, clf_score)
    t_model(y_test, y_pred)
def get_data():
    """Load and return the pickled bunch stored in 'data.data'."""
    return readbunchobj('data.data')
x_num_col = [i for i in list(col) if i not in cate_feature_col] # prep = StandardScaler() # X = prep.fit_transform(X[x_num_col]) # X = pd.DataFrame(data=X, columns=x_num_col) X = X[x_num_col] coef_list = [] for c in X.columns: coef_ = np.corrcoef(X[c], y) coef_list.append([c, coef_[0][1]]) coef_df = pd.DataFrame(coef_list) if __name__ == '__main__': dataset = readbunchobj('data.data') data = dataset.data label = dataset.label positive_index = label[label['label'] == 1].index.values negative_index = label[label['label'] == 0].index.values data_1 = data.loc[positive_index] data_0 = data.loc[negative_index] miss_df = data.isna().sum() / len(data) plt.barh(miss_df.index, miss_df.values) col = [ 'years', 'score', 'account_rank', 'deal_order_number', 'avg_order_amount', 'max_pay_amount', 'last_consume_days',
return list(sum_y[0]) def test_model(y_test, y_pred): c_m = metrics.confusion_matrix(y_test, y_pred) print('真反例:{0}\n假反例:{1}\n真正例:{2}\n假正例:{3}\n'.format( c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1])) print("召回率:%.4f" % metrics.recall_score(y_test, y_pred)) print("查准率:%.4f" % metrics.precision_score(y_test, y_pred)) print("F1:%.4f" % metrics.f1_score(y_test, y_pred)) print("roc_auc:%.4f" % metrics.roc_auc_score(y_test, y_pred)) print("F-measure:%.4f" % (metrics.recall_score(y_test, y_pred) * metrics.precision_score(y_test, y_pred))) if __name__ == '__main__': data = readbunchobj('dataset.data') X_train = pd.DataFrame(data.X_train) X_test = data.X_test y_train = data.y_train y_test = data.y_test categorical_features_indices = np.where((X_train.dtypes != np.float) & ( X_train.dtypes != np.int64))[0] # 类型特征的索引 n = 50 # 做100个子样本 clf_list, clf_score = bagging_boost_fit(X_train, y_train, n) y_pred = bagging_boost_predict(X_test, clf_list, clf_score) test_model(y_test, y_pred)
vae = Model(inputs, outputs, name='vae_mlp') # reconstruction_loss = binary_crossentropy(inputs, outputs) # reconstruction_loss *= original_dim # kl_loss = 1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma) # kl_loss = K.sum(kl_loss, axis=-1) # kl_loss *= -0.5 # vae_loss = K.mean(reconstruction_loss + kl_loss) # vae.add_loss(vae_loss) vae.compile(optimizer='adam', loss=lambda y_true, y_pred: y_pred) # train data import pandas as pd from prepare import readbunchobj from sklearn.preprocessing import MinMaxScaler data = readbunchobj('dataset_delstr.data') x_train = np.array(data.X_train) x_test = np.array(data.X_test) y_train = data.y_train y_test = data.y_test scl = MinMaxScaler() x_train = scl.fit_transform(x_train) x_test = scl.transform(x_test) train = pd.DataFrame(x_train) train['target'] = y_train train_0 = train[train['target'] == 0] # 多数类 train_1 = train[train['target'] == 1] # 少数类 resample_num = 2000 # 需要生成的少数类样本数目
    # NOTE(review): tail of a feature-transform function whose def is outside
    # this chunk; concatenates sigmoid- and tanh-activated encodings column-wise.
    x_train_transform = np.hstack((x_train_transform_sigmoid, x_train_transform_tanh))
    x_test_transform = np.hstack((x_test_transform_sigmoid, x_test_transform_tanh))
    return x_train_transform, x_test_transform


if __name__ == '__main__':
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # use GPU with ID=0
    # NOTE(review): TF1-style session config; tf.ConfigProto/tf.GPUOptions do
    # not exist in TF2 without tf.compat.v1 — confirm the TensorFlow version.
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5  # allocate at most 50% of GPU MEM
    config.gpu_options.allow_growth = True  # allocate dynamically
    data = readbunchobj('d:/py/credit_risk/dataset_delstr.data')
    Xtrain = np.array(data.X_train)
    Xtest = data.X_test
    y_train = data.y_train
    y_test = data.y_test
    prep = MinMaxScaler()  # scale every feature into [0, 1]
    Xtrain = prep.fit_transform(Xtrain)
    Xtest = prep.transform(Xtest)  # reuse the scaling fitted on the training set
    n, m = Xtrain.shape  # n samples, m features
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    # presumably network layer sizes, input dim m then hidden widths — TODO confirm
    list1 = [m, 21, 17, 9]
    eta = 0.01  # learning rate
    training_epochs = 30
    # NOTE(review): typo of "batch_size" — kept as-is, code outside this chunk
    # may reference this name.
    bitch_size = 100