def gbdt_lr_train(libsvmFileName):
    """Train and compare GBDT, LR, leaf-encoded LR and combined-feature LR.

    Loads a libsvm-format dataset, splits it 70/30, and prints test AUC for:
      1) GradientBoostingClassifier on the raw features,
      2) LogisticRegression on the raw features,
      3) LogisticRegression on one-hot encoded GBDT leaf indices,
      4) LogisticRegression on leaf encodings concatenated with raw features.

    Parameters
    ----------
    libsvmFileName : str
        Path to the libsvm-format sample file.
    """
    # Load samples (X_all is a scipy sparse matrix).
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # GBDT model.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Predict & AUC. predict_proba accepts sparse input directly; the
    # original densified X_test with .toarray(), an avoidable memory blow-up.
    y_pred_gbdt = gbdt.predict_proba(X_test)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode samples as GBDT leaf indices: one column per tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices. Fit on the training leaves only and
    # ignore unseen leaves at test time, so no test-set information leaks
    # into the encoding (the original fit on train+test jointly).
    gbdtenc = OneHotEncoder(handle_unknown='ignore')
    X_train_trans = gbdtenc.fit_transform(X_train_leaves)
    X_test_trans = gbdtenc.transform(X_test_leaves)

    # LR on the encoded leaf features.
    lr = LogisticRegression()
    lr.fit(X_train_trans, y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_test_trans)[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on leaf encodings combined with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_train_trans, X_train])
    X_test_ext = hstack([X_test_trans, X_test])
    print(X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Train GBDT + leaf-encoded LR, persist the LR with pickle, and print
    the reloaded model's predicted probabilities on the held-out split.

    NOTE(review): the Train_tab / Train_libsvm parameters are accepted but
    never used — the data is always read from "sample_libsvm_data.txt".
    Confirm whether that is intended.
    """
    # Load samples from the hardcoded libsvm file.
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # Train/test split (90/10).
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.1, random_state=42)

    # Fit the GBDT.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Leaf-index encoding: one column per tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode train+test leaves together so both share one column space.
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # LR on [leaf one-hots | raw features].
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    lr.fit(X_train_ext, y_train)

    # Persist and reload the LR. The original passed bare open() handles to
    # pickle and never closed them; `with` guarantees closure on all paths.
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(lr, model_file)
    # Load the model back from disk.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
def gbdt_lr_train():
    """5-fold CV comparison of GBDT, raw-feature LR, leaf-encoded LR and
    combined-feature LR; prints per-fold and mean AUC for each model.

    Uses the module-level X, y arrays — assumes they support integer-array
    indexing (numpy arrays); TODO confirm at the definition site.
    """
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]

        # GBDT on the raw features.
        gbdt = GradientBoostingClassifier(
            n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('基于原有特征的gbdt auc: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)

        # LR on the raw features.
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('基于原有特征的LR AUC: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)

        # Encode samples as this fold's GBDT leaf indices.
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]

        # Fit the one-hot encoder on training-fold leaves only and ignore
        # unseen validation leaves, so the validation fold cannot leak into
        # the encoding (the original fit on train+valid jointly).
        gbdtenc = OneHotEncoder(handle_unknown='ignore')
        X_train_trans = gbdtenc.fit_transform(X_train_leaves)
        X_valid_trans = gbdtenc.transform(X_valid_leaves)

        # LR on the encoded leaf features.
        lr = LogisticRegression()
        lr.fit(X_train_trans, y_train)
        y_pred_gbdtlr1 = lr.predict_proba(X_valid_trans)[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)

        # LR on leaf encodings combined with the raw features.
        lr = LogisticRegression(n_jobs=-1)
        X_train_ext = hstack([X_train_trans, X_train])
        X_valid_ext = hstack([X_valid_trans, X_valid])
        print(X_train_ext.shape)
        lr.fit(X_train_ext, y_train)
        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)

    # Mean AUC across folds for each model.
    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("gbdt原始特征cv_gbdt:", cv_gbdt)
    print("lr原始特征cv_lr:", cv_lr)
    print("lr基于gbdt的特征cv_lr_trans:", cv_lr_trans)
    print("lr基于gbdt特征个原始特征cv_lr_trans_raw:", cv_lr_trans_raw)
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline,
                  holdout_rows=57562):
    """Train GBDT + LR stacking on dataframe features, score with log loss.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training / testing data (test must carry 'instance_id' for online mode).
    gbdt_features : list
        Column names fed to the GBDT.
    lr_features : list
        Column names appended to the leaf encodings for the combined LR.
    target : str
        Label column name in `train` (and in `test` when offline).
    name : str
        Suffix for the online submission file.
    isOnline : bool
        False: evaluate on `test`. True: sanity-check on the tail of `train`
        and write the submission csv.
    holdout_rows : int, default 57562
        Trailing training rows used for online-mode evaluation (replaces the
        original hard-coded magic number; default preserves old behavior).
    """
    # GBDT model.
    gbdt = GradientBoostingClassifier(
        n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    gbdt.fit(train[gbdt_features], train[target])

    # Evaluate the GBDT alone.
    if not isOnline:
        y_pred_gbdt = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    else:
        # Online mode: score the tail of the training data as a sanity check.
        y_pred_gbdt = gbdt.predict_proba(train[gbdt_features].tail(holdout_rows))[:, 1]
        gbdt_test_log_loss = log_loss(train[target].tail(holdout_rows), y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # Encode rows as GBDT leaf indices: one column per tree.
    X_train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]

    # One-hot encode train+test leaves together so both share one column space.
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # LR on the encoded leaf features.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], train[target])
    if not isOnline:
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR log_loss: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # LR on leaf encodings combined with the raw LR features.
    lr = LogisticRegression()
    X_train_ext = hstack([X_trans[:train_rows, :], train[lr_features]])
    X_test_ext = hstack([X_trans[train_rows:, :], test[lr_features]])
    print("gbdt output", X_trans[:train_rows, :].shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)
    lr.fit(X_train_ext, train[target])
    if not isOnline:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('基于组合特征的LR log_loss: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')
        # Write the online submission (placement inside the online branch is
        # inferred from the original flow — confirm against the caller).
        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        # Save online submission result.
        test[['instance_id', 'predicted_score']].to_csv(
            '../baseline_' + name + '.csv', index=False, sep=' ')
        print('Saved result success!')
def gbdt_lr_train(libsvmFileName):
    """Debug variant of the GBDT+LR pipeline with leaf-cardinality reporting.

    Same flow as the sibling pipelines, but additionally prints shapes and,
    for every tree, how many distinct leaves the training data reaches.

    Parameters
    ----------
    libsvmFileName : str
        Path to the libsvm-format sample file.
    """
    # Load samples.
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)
    # Converted from Python 2 `print` statements so this block is valid
    # Python 3 like the rest of the file (output is unchanged).
    print("train data shape: ", X_train.shape)

    # GBDT model.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Predict & AUC.
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode samples as GBDT leaf indices.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    # Distinct leaves reached per tree; a set replaces the original
    # dict-with-dummy-values idiom (same printed counts).
    for i in range(len(X_train_leaves[0])):
        distinct_leaves = set(X_train_leaves[:, i])
        print("F%d: %d" % (i, len(distinct_leaves)))

    # One-hot encode the leaf indices (dense output for the debug prints).
    (train_rows, cols) = X_train_leaves.shape
    # NOTE(review): `sparse=` was renamed `sparse_output=` in sklearn >= 1.2.
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])

    # LR on the encoded leaf features.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on leaf encodings combined with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("gbdt leaves cross", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
# 原始结果为:('test:', 0.081939773937662927) # Accuracy : 0.6848 """ """ # lr对原始特征样本模型训练 lr = LogisticRegression() lr.fit(X_train, Y_train) # 预测及AUC评测 Y_predict_LR = lr.predict_proba(X_test)[:, 1] print('test:', log_loss(Y_test, Y_predict_LR)) print "before Accuracy : %.4f" % metrics.roc_auc_score(Y_test, Y_predict_LR) # ('test:', 0.095052240862119497) # before Accuracy : 0.5413 """ # GBDT编码原有特征 X_train_leaves = gbdt.apply(X_train)[:, :, 0] X_test_leaves = gbdt.apply(X_test)[:, :, 0] # apply方法只有gdbt里面才有 # xgboost里没有。。 # 对所有特征进行ont-hot编码 (train_rows, cols) = X_train_leaves.shape gbdtenc = OneHotEncoder() X_trans = gbdtenc.fit_transform( np.concatenate((X_train_leaves, X_test_leaves), axis=0)) # print X_trans.shape # (478111, 797) """ # 定义LR模型 lr = LogisticRegression(n_jobs=1) # lr对gbdt特征编码后的样本模型训练
# Label-encode the categorical columns in place.
# NOTE(review): one LabelEncoder instance is reused, but each fit_transform
# call refits it, so the per-column mappings are independent.
encoder = LabelEncoder()
data['V2'] = encoder.fit_transform(data['V2'])
data['V4'] = encoder.fit_transform(data['V4'])
data['V5'] = encoder.fit_transform(data['V5'])
# Apply the shared feature-processing routine (defined elsewhere) to both sets.
data_process(train_agg)
data_process(test_agg)
# Free intermediate objects created earlier in the script (out of view here).
del a,gp,gp_day_mean,gp_day_var,gp1,gp2,gp3,gp4,index1,l,m1,m2,m3,merge_log,ss,ss2,t1,t2,t3,train_flg
# GBDT: construct new features from leaf indices.
gbdt = GradientBoostingClassifier(loss='exponential',learning_rate=0.12,n_estimators=60, max_depth=3,random_state=42,max_features=None)
# Train on all aggregate columns except the id and label.
X_train=train_agg.drop(['USRID','FLAG'],axis=1)
y_train=train_agg['FLAG']
# Fit the GBDT.
gbdt.fit(X_train, y_train)
# Encode each row as its GBDT leaf indices (one column per tree).
X_train_leaves = gbdt.apply(X_train)[:,:,0]
X_test_leaves=gbdt.apply(test_agg.drop('USRID',axis=1))[:,:,0]
(train_rows, cols) = X_train_leaves.shape
# One-hot encode train+test leaves together so both share one column space.
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
# Combine leaf one-hots with the original aggregate features (densified).
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())
# Restore id/label column names lost in the hstack; the 494/495 positions
# presumably correspond to USRID/FLAG in the combined layout — TODO confirm
# they stay correct if the feature count changes.
X_train_agg.rename(columns={494: "USRID",495:"FLAG"},inplace=True)
X_test_agg.rename(columns={494: "USRID"},inplace=True)
# Build the final training/testing sets by joining the log-derived features.
train_data=pd.merge(X_train_agg,train_log,on='USRID',how='left')
test_data=pd.merge(X_test_agg,test_log,on='USRID',how='left')
test_size=0.25, random_state=0) # 不生成新的特征,直接训练 clf = GradientBoostingClassifier(n_estimators=50) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) y_prob = clf.predict_proba(X_test)[:, 1] acc = accuracy_score(y_test, y_pred) auc = roc_auc_score(y_test, y_prob) print("Original featrues") print("GBDT_ACC: {:.6f}".format(acc)) print("GBDT_AUC: {:.6f}".format(auc)) # 生成的新特征, apply方法返回每个样本在每颗树叶节点的索引矩阵 X_train_leaves = clf.apply(X_train)[:, :, 0] X_test_leaves = clf.apply(X_test)[:, :, 0] # 将X_train_leaves, X_test_leaves在axis=0方向上合并,再进行OneHotEncoder操作 All_leaves = np.r_[X_train_leaves, X_test_leaves] # 索引矩阵每列不是0/1二值型离散特征,因此需要OneHotEncoder操作 enc = OneHotEncoder(categories='auto') new_features = enc.fit_transform(All_leaves) # 根据原训练集、测试集的索引对新特征予以拆分 train_samples = X_train.shape[0] X_train_new = new_features[:train_samples, :] X_test_new = new_features[train_samples:, :] # 将初始训练集与GBDT新生成的特征联合后再训练LR
class Predict():
    """GBDT leaf-encoding + LogisticRegression trainer/predictor.

    On construction, fits a GradientBoostingClassifier on the embedded toy
    dataset, one-hot encodes its leaf indices, and fits a LogisticRegression
    on [leaf one-hots | raw features]. `Predict` then scores new rows.
    """

    def __init__(self):
        self.gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
        self.lr = LogisticRegression(n_jobs=-1)
        # Embedded toy dataset: binary labels and 6-feature rows.
        Train_tab = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        Train_libsvm = [[1, 1, 1, 1, 1, 1], [2, 2, 2, 1, 2, 2], [1, 1, 1, 1, 3, 1], [2, 2, 2, 1, 4, 1], [3, 3, 2, 1, 5, 2], [2, 2, 2, 1, 6, 1], [4, 4, 3, 1, 6, 2], [5, 5, 3, 1, 7, 2], [2, 2, 2, 1, 8, 1], [2, 2, 2, 1, 6, 1], [2, 2, 2, 1, 9, 2], [6, 6, 2, 1, 8, 3], [1, 1, 1, 1, 10, 1], [2, 2, 2, 1, 4, 2], [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 10, 2], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 12, 1], [2, 2, 2, 1, 2, 1], [5, 5, 3, 1, 13, 2], [2, 2, 2, 1, 14, 1], [7, 7, 2, 1, 15, 2], [1, 1, 1, 1, 16, 1], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 17, 1], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 19, 2], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 20, 1], [2, 2, 2, 1, 10, 1], [2, 2, 2, 1, 14, 2], [5, 5, 3, 1, 15, 2], [5, 5, 3, 1, 21, 2], [2, 2, 2, 1, 21, 1], [1, 1, 1, 1, 22, 1], [6, 6, 2, 1, 5, 2], [2, 2, 2, 1, 1, 2], [8, 8, 2, 1, 15, 3], [4, 4, 3, 1, 23, 2], [9, 9, 2, 2, 6, 2], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 10, 2], [5, 5, 3, 1, 24, 2], [2, 2, 2, 1, 20, 1], [2, 2, 2, 1, 8, 1], [5, 5, 3, 1, 2, 2], [6, 6, 2, 1, 3, 3], [1, 1, 1, 1, 19, 1], [2, 2, 2, 1, 12, 2], [2, 2, 2, 1, 25, 1], [1, 1, 1, 1, 2, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 10, 1], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 14, 2], [1, 1, 1, 1, 19, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 9, 1], [2, 2, 2, 1, 20, 2], [2, 2, 2, 1, 4, 2], [1, 1, 1, 1, 4, 1], [2, 2, 2, 1, 26, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 4, 2], [2, 2, 2, 1, 23, 1], [5, 5, 3, 1, 13, 2], [3, 3, 2, 1, 22, 2], [2, 2, 2, 1, 11, 2], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 9, 1], [1, 1, 1, 1, 9, 1], [2, 2, 2, 1, 12, 2], [2, 2, 2, 1, 20, 1], [2, 2, 2, 1, 1, 2], [1, 1, 1, 1, 14, 1], [10, 10, 2, 1, 23, 3], [5, 5, 3, 1, 21, 2], [1, 1, 1, 1, 1, 1], [2, 2, 2, 1, 19, 2], [1, 1, 1, 1, 23, 1], [2, 2, 2, 1, 20, 1], [1, 1, 1, 1, 14, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 19, 1], [5, 5, 3, 1, 19, 2], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 14, 1], [11, 11, 2, 1, 10, 1], [2, 2, 2, 1, 14, 2], [1, 1, 1, 1, 22, 1], [9, 9, 2, 2, 27, 2], [4, 4, 3, 1, 1, 2], [4, 4, 3, 1, 12, 2], [2, 2, 2, 1, 6, 1], [4, 4, 3, 1, 8, 2], [1, 1, 1, 1, 16, 1], [1, 1, 1, 1, 28, 1], [2, 2, 2, 1, 15, 2], [1, 1, 1, 1, 3, 1], [2, 2, 2, 1, 14, 1], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 24, 2], [2, 2, 2, 1, 23, 1], [2, 2, 2, 1, 8, 1], [2, 2, 2, 1, 21, 2], [6, 6, 2, 1, 6, 2], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 12, 1], [5, 5, 3, 1, 23, 2], [1, 1, 1, 1, 29, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 2, 2], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 30, 1], [2, 2, 2, 1, 8, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 23, 2], [5, 5, 3, 1, 9, 2], [4, 4, 3, 1, 1, 2], [9, 9, 2, 2, 19, 2], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 1, 2], [10, 10, 2, 1, 30, 1], [9, 9, 2, 2, 24, 2], [5, 5, 3, 1, 14, 2], [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 22, 2], [2, 2, 2, 1, 26, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 1, 1], [4, 4, 3, 1, 2, 2], [3, 3, 2, 1, 29, 2], [2, 2, 2, 1, 6, 2], [2, 2, 2, 1, 9, 2], [2, 2, 2, 1, 16, 2], [5, 5, 3, 1, 13, 2], [13, 13, 2, 1, 3, 2], [2, 2, 2, 1, 27, 1], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 29, 2], [3, 3, 2, 1, 12, 2], [2, 2, 2, 1, 2, 2], [2, 2, 2, 1, 5, 1], [5, 5, 3, 1, 28, 2], [6, 6, 2, 1, 22, 3], [1, 1, 1, 1, 5, 1], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 21, 2], [2, 2, 2, 1, 1, 1], [2, 2, 2, 1, 19, 1], [2, 2, 2, 1, 4, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 4, 2], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 18, 1], [1, 1, 1, 1, 23, 1], [9, 9, 2, 2, 25, 2], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 5, 1], [10, 10, 2, 1, 2, 3], [2, 2, 2, 1, 9, 2], [2, 2, 2, 1, 14, 2], [1, 1, 1, 1, 26, 1], [1, 1, 1, 1, 3, 1], [14, 14, 2, 1, 23, 2], [4, 4, 3, 1, 2, 2], [2, 2, 2, 1, 23, 2]]
        self.gbdt_lr_train(Train_tab, Train_libsvm)

    def gbdt_lr_train(self, Train_tab, Train_libsvm):
        """Fit the GBDT, the leaf one-hot encoder and the combined-feature LR.

        The original also loaded "sample_libsvm_data.txt" into unused
        variables, which crashed when the file was absent; that dead read
        has been removed.
        """
        # Train/test split; the held-out 10% is unused here — new data is
        # scored via Predict().
        X_train, X_test, y_train, y_test = train_test_split(
            Train_libsvm, Train_tab, test_size=0.1, random_state=42)
        self.gbdt.fit(X_train, y_train)
        # Leaf indices, one column per tree (kept on self for compatibility).
        self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
        (self.train_rows, cols) = self.X_train_leaves.shape
        # Fit the one-hot encoder ONCE on the training leaves and keep it.
        # The original refit a fresh encoder at predict time on train+test
        # leaves, so unseen leaves changed the column count and crashed the
        # LR; handle_unknown='ignore' keeps the feature space fixed.
        self.gbdtenc = OneHotEncoder(handle_unknown='ignore')
        X_train_trans = self.gbdtenc.fit_transform(self.X_train_leaves)
        # LR on [leaf one-hots | raw features].
        X_train_ext = hstack([X_train_trans, X_train])
        self.lr.fit(X_train_ext, y_train)

    def Predict(self, X_test):
        """Return P(class=1) for each row of X_test as a plain list.

        NOTE(review): the method name shadows the class name; kept for
        interface compatibility.
        """
        X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
        # Reuse the encoder fitted during training — stable column space.
        X_test_trans = self.gbdtenc.transform(X_test_leaves)
        X_test_ext = hstack([X_test_trans, X_test])
        y_pred_gbdtlr2 = self.lr.predict_proba(X_test_ext)[:, 1]
        return list(y_pred_gbdtlr2)
def gbdt_lr_train(libsvmFileName):
    """Verbose/debug GBDT+LR pipeline: prints types, shapes and AUCs.

    Parameters
    ----------
    libsvmFileName : str
        Path to the libsvm-format sample file.
    """
    # Load samples (X_all is a scipy sparse matrix).
    X_all, y_all = load_svmlight_file(libsvmFileName)
    print(type(X_all))
    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # GBDT model.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Predict & AUC. The original called predict_proba twice on the same
    # input and discarded the first result; compute it once.
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode samples as GBDT leaf indices.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    # NOTE(review): these mutate numpy's GLOBAL print options for the rest of
    # the process; kept to preserve behavior, but consider localizing with
    # np.printoptions(...) as a context manager.
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode train+test leaves together so both share one column space.
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)

    # LR on the encoded leaf features.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on leaf encodings combined with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("组合特征的个数:", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)