def select_feature(x_train, y_train, x_test):
    """Augment train/test features with XGBoost leaf-index features.

    Fits an XGBClassifier on ``x_train`` and appends one column per tree
    holding the leaf index each sample reaches (``clf.apply``).

    Args:
        x_train: pd.DataFrame of training features.
        y_train: raw training labels; encoded via the project's ``label_encode``.
        x_test: pd.DataFrame of test features.

    Returns:
        Tuple ``(x_train, x_test)`` with the leaf-index columns appended.
    """
    y_train, num_class, lbl = label_encode(y_train)
    clf = XGBClassifier(
        learning_rate=0.1,          # default is 0.3
        n_estimators=20,            # number of trees
        max_depth=5,
        min_child_weight=1,
        gamma=0.5,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softmax',  # multi-class softmax objective
        nthread=8,                  # CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,
        reg_lambda=1,
        seed=2017)                  # random seed
    clf.fit(x_train, y_train)
    # BUG FIX: build the leaf-index frames with the SAME index as the source
    # frames. A bare pd.DataFrame(...) gets a fresh RangeIndex, and
    # pd.concat(axis=1) aligns on the index, which silently produced
    # NaN-padded, misaligned rows whenever x_train/x_test did not carry a
    # default 0..n-1 index.
    train_leaves = pd.DataFrame(clf.apply(x_train), index=x_train.index)
    x_train = pd.concat([x_train, train_leaves], axis=1)
    test_leaves = pd.DataFrame(clf.apply(x_test), index=x_test.index)
    x_test = pd.concat([x_test, test_leaves], axis=1)
    return x_train, x_test
def TreeEmbedding():
    """Build and persist per-task XGBoost leaf-index ("tree embedding") datasets.

    For each task: train an XGBClassifier on the task's training CSV, map both
    splits through ``apply`` to per-tree leaf indices, min-max scale every
    leaf-index column to [0, 1], re-attach the label column, and save the
    results next to the source files as ``*_embedding.csv``. The total leaf
    count is also written to ``./data/parameters.conf`` for downstream use.
    """
    for task_no in range(args.task_num):
        train_file = os.path.join(data_dir, "task{}_train.csv".format(task_no))
        test_file = os.path.join(data_dir, "task{}_test.csv".format(task_no))
        train_data = pd.read_csv(train_file)
        test_data = pd.read_csv(test_file)

        ID = 'id'
        target = 'label'
        # Everything except the id and label columns is a model feature.
        columns = [c for c in train_data.columns if c not in [ID, target]]
        X_train, y_train = train_data[columns], train_data[target]
        X_test, y_test = test_data[columns], test_data[target]

        booster = XGBClassifier(n_estimators=500, max_depth=4,
                                subsample=0.6, colsample_bytree=0.6)
        booster.fit(X_train, y_train)
        X_train_embedding = booster.apply(X_train)
        X_test_embedding = booster.apply(X_test)

        field_num = X_train_embedding.shape[1]
        field_id = ["t{}".format(i + 1) for i in range(field_num)]
        # Per-tree max leaf index, summed over trees — an upper bound on the
        # total number of distinct leaves seen on the training split.
        leaves_num = sum(X_train_embedding[:, i].max() for i in range(field_num))
        print("leaves num: {}".format(leaves_num))
        with open('./data/parameters.conf', 'w') as file:
            file.write("leaves_num:{}".format(leaves_num))

        train_data_path = train_file.replace(".csv", "_embedding.csv")
        test_data_path = test_file.replace(".csv", "_embedding.csv")
        print('saving RandomTreesEmbedding datasets...')

        def embedding_frame(embedding, labels):
            # Min-max scale each leaf-index column to [0, 1] using the split's
            # own column min/max, then attach the label column.
            frame = pd.DataFrame(data=embedding, columns=field_id)
            for field_name in field_id:
                col = frame[field_name]
                frame[field_name] = (col - col.min()) / (col.max() - col.min())
            frame['label'] = labels
            return frame

        # Persist the train and test embeddings.
        embedding_frame(X_train_embedding, y_train).to_csv(train_data_path,
                                                           index_label='id')
        embedding_frame(X_test_embedding, y_test).to_csv(test_data_path,
                                                         index_label='id')
def Trees_Leaf(str, X_train, x_test, Y_train, y_test, n_estimators=2000):
    """Map train/test samples to per-tree leaf indices with the chosen ensemble.

    Fits the ensemble named by ``str`` ('rf', 'gbdt', 'xgb' or 'lgb') on the
    training split and returns a pair of arrays of shape
    ``(n_samples, n_estimators)`` holding the leaf index every sample lands in
    for each tree. Falls through (implicitly returning None) for any other
    value of ``str``.
    """
    if str == 'rf':
        from sklearn.ensemble import RandomForestClassifier
        forest = RandomForestClassifier(bootstrap=True, oob_score=True,
                                        criterion='gini',
                                        n_estimators=n_estimators)
        forest.fit(X_train, Y_train)
        return (forest.apply(X_train).reshape(-1, n_estimators),
                forest.apply(x_test).reshape(-1, n_estimators))
    elif str == 'gbdt':
        from sklearn.ensemble import GradientBoostingClassifier
        booster = GradientBoostingClassifier(n_estimators=n_estimators,
                                             random_state=10,
                                             subsample=0.6,
                                             max_depth=7)
        booster.fit(X_train, Y_train)
        return (booster.apply(X_train).reshape(-1, n_estimators),
                booster.apply(x_test).reshape(-1, n_estimators))
    elif str == 'xgb':
        import xgboost as xgb
        from xgboost import XGBClassifier
        model = XGBClassifier(max_depth=15, learning_rate=0.1,
                              n_estimators=n_estimators, min_child_weight=5,
                              max_delta_step=0, subsample=0.8,
                              colsample_bytree=0.7, reg_alpha=0,
                              reg_lambda=0.4, scale_pos_weight=0.8,
                              silent=True, objective='binary:logistic',
                              missing=None, eval_metric='auc', seed=1440,
                              gamma=0)
        model.fit(X_train, Y_train)
        return (model.apply(X_train).reshape(-1, n_estimators),
                model.apply(x_test).reshape(-1, n_estimators))
    elif str == 'lgb':
        import lightgbm as lgb
        # Wrap the raw splits in LightGBM dataset objects.
        data_train = lgb.Dataset(X_train, Y_train,
                                 free_raw_data=False, silent=True)
        data_test = lgb.Dataset(x_test, y_test, reference=data_train,
                                free_raw_data=False, silent=True)
        # NOTE(review): 'boosting_type' and 'boosting' are aliases for the
        # same LightGBM setting; setting both ('gbdt' vs 'dart') is
        # ambiguous — confirm which one is intended.
        params = {
            'n_estimators': n_estimators,
            'boosting_type': 'gbdt',
            'boosting': 'dart',
            'objective': 'binary',
            'metric': 'binary_logloss',
            'learning_rate': 0.01,
            'num_leaves': 25,
            'max_depth': 3,
            'max_bin': 10,
            'min_data_in_leaf': 8,
            'feature_fraction': 0.6,
            'bagging_fraction': 1,
            'bagging_freq': 0,
            'lambda_l1': 0,
            'lambda_l2': 0,
            'min_split_gain': 0
        }
        gbm = lgb.train(params,                         # parameter dict
                        data_train,                     # training set
                        num_boost_round=n_estimators,   # boosting iterations
                        verbose_eval=2000,              # log every N rounds
                        valid_sets=data_test,           # validation set
                        early_stopping_rounds=30)       # early-stopping patience
        # pred_leaf=True yields the leaf index per sample per tree.
        return (gbm.predict(X_train, pred_leaf=True).reshape(-1, n_estimators),
                gbm.predict(x_test, pred_leaf=True).reshape(-1, n_estimators))
objective="binary:logistic", 损失函数 booster='gbtree', 求解方式 """
# Small XGBoost demo classifier (objective/max_depth echo the string above).
algo = XGBClassifier(n_estimators=10, objective="binary:logistic", max_depth=3)
# 7. Train the model
algo.fit(x_train, y_train)
# 8. Evaluate the model
train_predict = algo.predict(x_train)
test_predict = algo.predict(x_test)
print("测试集上的效果(准确率):{}".format(algo.score(x_test, y_test)))
print("训练集上的效果(准确率):{}".format(algo.score(x_train, y_train)))
print("测试集上的效果(分类评估报告):\n{}".format(classification_report(
    y_test, test_predict)))
print("训练集上的效果(分类评估报告):\n{}".format(
    classification_report(y_train, train_predict)))
# 9. Misc
print("返回的预测概率值:\n{}".format(algo.predict_proba(x_test)))
# 10. Other special APIs
print("各个特征属性的重要性权重:\n{}".format(algo.feature_importances_))
# Leaf-node indices
print("*" * 100)
x_test2 = x_test.iloc[:2, :]
print(x_test2)
# ``apply`` returns the leaf-node index each sample reaches in every tree
print(algo.apply(x_test2))
# Report hit metrics at several review-budget thresholds over the test scores.
# NOTE(review): ``test_pred`` comes from an earlier stage (outside this view)
# and is reassigned by the LR stage at the bottom — confirm the intended order.
y_prob = test_pred
for i in [99, 98, 95, 90]:
    # Flag the top (100 - i)% highest-scored transactions.
    threshold = np.percentile(y_prob, i)
    print(
        f'Checking top {100-i}% suspicious transactions: {len(y_prob[y_prob > threshold])}'
    )
    # assumes xgb_testy holds 0/1 labels, so the mean over the flagged subset
    # is precision and the label sum ratio is recall — TODO confirm
    precision = np.mean(xgb_testy[y_prob > threshold])
    recall = sum(xgb_testy[y_prob > threshold]) / sum(xgb_testy)
    # Share of total revenue captured among the flagged transactions.
    revenue_recall = sum(
        revenue_test[y_prob > threshold]) / sum(revenue_test)
    print(
        f'Precision: {round(precision, 4)}, Recall: {round(recall, 4)}, Seized Revenue (Recall): {round(revenue_recall, 4)}'
    )

# get leaf index from xgboost model
X_train_leaves = xgb_clf.apply(xgb_trainx)
X_valid_leaves = xgb_clf.apply(xgb_validx)
X_test_leaves = xgb_clf.apply(xgb_testx)
train_rows = X_train_leaves.shape[0]

# one-hot encoding for leaf index
# Encoder is fitted on the training leaves only; valid/test are transformed
# with the same categories.
xgbenc = OneHotEncoder(categories="auto")
lr_trainx = xgbenc.fit_transform(X_train_leaves)
lr_validx = xgbenc.transform(X_valid_leaves)
lr_testx = xgbenc.transform(X_test_leaves)

# model
# Stack a logistic regression on top of the one-hot leaf features.
print("Training Logistic regression model...")
lr = LogisticRegression()
lr.fit(lr_trainx, xgb_trainy)
test_pred = lr.predict_proba(lr_testx)[:, 1]
# Baseline: train directly on the original features, no generated ones.
clf = XGBClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
# Fixed typo in the report header ("featrues" -> "features").
print("Original features")
print("XGB_ACC: {:.6f}".format(acc))
print("XGB_AUC: {:.6f}".format(auc))

# Generated features: ``apply`` returns, per sample, the leaf-node index
# reached in every tree.
X_train_leaves = clf.apply(X_train)
X_test_leaves = clf.apply(X_test)

# Stack X_train_leaves and X_test_leaves along axis=0, then one-hot encode.
# NOTE(review): fitting the encoder on train+test jointly avoids unseen-leaf
# errors but mixes test data into preprocessing — confirm this is acceptable
# (the alternative is fit on train with handle_unknown='ignore').
All_leaves = np.r_[X_train_leaves, X_test_leaves]
# Leaf indices are categorical, not 0/1 binary, hence the OneHotEncoder pass.
enc = OneHotEncoder(categories='auto')
new_features = enc.fit_transform(All_leaves)

# Split the encoded matrix back into train/test by the original row counts.
train_samples = X_train.shape[0]
X_train_new = new_features[:train_samples, :]
X_test_new = new_features[train_samples:, :]
# The original features joined with the GBDT-generated ones feed the LR next.