Example #1
import pandas as pd
from xgboost import XGBClassifier


def select_feature(x_train, y_train, x_test):
    # label_encode is a project helper defined elsewhere in the source file
    y_train, num_class, lbl = label_encode(y_train)
    clf = XGBClassifier(
        learning_rate=0.1,  # default is 0.3
        n_estimators=20,  # number of trees
        max_depth=5,
        min_child_weight=1,
        gamma=0.5,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softmax',  # multiclass softmax objective
        nthread=8,  # number of CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,
        reg_lambda=1,
        seed=2017)  # random seed

    clf.fit(x_train, y_train)
    # apply() returns each sample's leaf index in every tree; keep the
    # original row index so the concat below does not misalign rows
    new_feature = pd.DataFrame(clf.apply(x_train), index=x_train.index)
    x_train = pd.concat([x_train, new_feature], axis=1)
    new_feature = pd.DataFrame(clf.apply(x_test), index=x_test.index)
    x_test = pd.concat([x_test, new_feature], axis=1)
    return x_train, x_test
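
A minimal usage sketch for this helper; the `label_encode` below is a hypothetical stand-in for the project's own helper, which is defined elsewhere:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def label_encode(y):
    # hypothetical stand-in for the project's label_encode helper
    lbl = LabelEncoder()
    return lbl.fit_transform(y), len(lbl.classes_), lbl


data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
x_tr, x_te, y_tr, y_te = train_test_split(X, data.target, random_state=0)
x_tr_aug, x_te_aug = select_feature(x_tr.reset_index(drop=True), y_tr,
                                    x_te.reset_index(drop=True))
# the augmented frames gain one leaf-index column per fitted tree
print(x_tr_aug.shape, x_te_aug.shape)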
Example #2
import os

import pandas as pd
from xgboost import XGBClassifier


def TreeEmbedding():
    # args and data_dir are module-level globals in the original script
    for task_no in range(args.task_num):
        train_file = os.path.join(data_dir, "task{}_train.csv".format(task_no))
        test_file = os.path.join(data_dir, "task{}_test.csv".format(task_no))
        train_data = pd.read_csv(train_file)
        test_data = pd.read_csv(test_file)
        ID = 'id'
        target = 'label'
        columns = [x for x in train_data.columns if x not in [ID, target]]
        X_train = train_data[columns]
        y_train = train_data[target]
        X_test = test_data[columns]
        y_test = test_data[target]
        xgb = XGBClassifier(n_estimators=500,
                            max_depth=4,
                            subsample=0.6,
                            colsample_bytree=0.6)
        xgb.fit(X_train, y_train)
        X_train_embedding = xgb.apply(X_train)
        X_test_embedding = xgb.apply(X_test)
        field_num = X_train_embedding.shape[1]
        field_id = ["t{}".format(i + 1) for i in range(field_num)]

        # total leaf-vocabulary size: sum of the largest leaf index
        # seen in each tree (XGBoost leaf ids are per-tree node ids)
        leaves_num = sum(
            X_train_embedding[:, i].max() for i in range(field_num))
        print("leaves num: {}".format(leaves_num))
        with open('./data/parameters.conf', 'w') as file:
            file.write("leaves_num:{}".format(leaves_num))
        train_data_path = train_file.replace(".csv", "_embedding.csv")
        test_data_path = test_file.replace(".csv", "_embedding.csv")

        print('saving tree-embedding datasets...')
        # save the train-set tree embedding to disk
        X_train_embedding_df = pd.DataFrame(data=X_train_embedding,
                                            columns=field_id)
        # min-max scale each tree's leaf indices into [0, 1]
        for field_name in field_id:
            col = X_train_embedding_df[field_name]
            X_train_embedding_df[field_name] = (col - col.min()) / (col.max() - col.min())
        # add the label column
        X_train_embedding_df['label'] = y_train
        X_train_embedding_df.to_csv(train_data_path, index_label='id')

        # save the test set (scaled by its own min/max here,
        # not by the training-set statistics)
        X_test_embedding_df = pd.DataFrame(data=X_test_embedding,
                                           columns=field_id)
        for field_name in field_id:
            col = X_test_embedding_df[field_name]
            X_test_embedding_df[field_name] = (col - col.min()) / (col.max() - col.min())
        X_test_embedding_df['label'] = y_test
        X_test_embedding_df.to_csv(test_data_path, index_label='id')
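
A short sketch of reading the saved artifacts back, assuming data_dir is the current directory and task 0 exists (file names follow the code above):

import pandas as pd

# leaf-vocabulary size written by TreeEmbedding()
with open('./data/parameters.conf') as f:
    leaves_num = int(f.read().split(':')[1])

train_df = pd.read_csv('task0_train_embedding.csv', index_col='id')
X = train_df.drop(columns=['label'])
y = train_df['label']
print(leaves_num, X.shape, y.shape)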
Example #3
def Trees_Leaf(method, X_train, x_test, Y_train, y_test, n_estimators=2000):
    """Fit the chosen tree ensemble and return per-tree leaf indices."""
    if method == 'rf':
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(bootstrap=True,
                                     oob_score=True,
                                     criterion='gini',
                                     n_estimators=n_estimators)
        clf.fit(X_train, Y_train)
        lv_train = clf.apply(X_train).reshape(-1, n_estimators)
        lv_test = clf.apply(x_test).reshape(-1, n_estimators)
        return lv_train, lv_test
    elif method == 'gbdt':
        from sklearn.ensemble import GradientBoostingClassifier
        gbm = GradientBoostingClassifier(n_estimators=n_estimators,
                                         random_state=10,
                                         subsample=0.6,
                                         max_depth=7)
        gbm.fit(X_train, Y_train)
        lv_train = gbm.apply(X_train).reshape(-1, n_estimators)
        lv_test = gbm.apply(x_test).reshape(-1, n_estimators)
        return lv_train, lv_test
    elif method == 'xgb':
        from xgboost import XGBClassifier
        xgbm = XGBClassifier(max_depth=15,
                             learning_rate=0.1,
                             n_estimators=n_estimators,
                             min_child_weight=5,
                             max_delta_step=0,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             reg_alpha=0,
                             reg_lambda=0.4,
                             scale_pos_weight=0.8,
                             objective='binary:logistic',
                             eval_metric='auc',
                             random_state=1440,
                             gamma=0)
        xgbm.fit(X_train, Y_train)
        lv_train = xgbm.apply(X_train).reshape(-1, n_estimators)
        lv_test = xgbm.apply(x_test).reshape(-1, n_estimators)
        return lv_train, lv_test
    elif method == 'lgb':
        import lightgbm as lgb
        # convert to LightGBM Dataset format
        data_train = lgb.Dataset(X_train,
                                 Y_train,
                                 free_raw_data=False)
        data_test = lgb.Dataset(x_test,
                                y_test,
                                reference=data_train,
                                free_raw_data=False)
        params = {
            'boosting': 'dart',  # DART mode (early stopping is ignored for dart)
            'objective': 'binary',
            'metric': 'binary_logloss',
            'learning_rate': 0.01,
            'num_leaves': 25,
            'max_depth': 3,
            'max_bin': 10,
            'min_data_in_leaf': 8,
            'feature_fraction': 0.6,
            'bagging_fraction': 1,
            'bagging_freq': 0,
            'lambda_l1': 0,
            'lambda_l2': 0,
            'min_split_gain': 0
        }
        gbm = lgb.train(
            params,  # parameter dict
            data_train,  # training set
            num_boost_round=n_estimators,  # number of boosting rounds
            valid_sets=[data_test],  # validation set
            callbacks=[lgb.log_evaluation(2000),  # log every 2000 rounds
                       lgb.early_stopping(30)])  # early-stopping patience
        # pred_leaf=True returns each sample's leaf index per tree; if early
        # stopping fires, the matrix has best_iteration columns instead of
        # n_estimators and the reshape below would fail
        lv_train = gbm.predict(X_train,
                               pred_leaf=True).reshape(-1, n_estimators)
        lv_test = gbm.predict(x_test, pred_leaf=True).reshape(-1, n_estimators)
        return lv_train, lv_test
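
A usage sketch with synthetic binary-classification data (the small n_estimators just keeps the demo fast):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

# leaf-index matrices of shape (n_samples, n_estimators)
lv_train, lv_test = Trees_Leaf('rf', X_tr, X_te, y_tr, y_te, n_estimators=100)
print(lv_train.shape, lv_test.shape)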
Example #4
objective="binary:logistic", 损失函数
booster='gbtree', 求解方式
"""
algo = XGBClassifier(n_estimators=10, objective="binary:logistic", max_depth=3)

# 7. Train the model
algo.fit(x_train, y_train)

# 8. Evaluate the model
train_predict = algo.predict(x_train)
test_predict = algo.predict(x_test)
print("测试集上的效果(准确率):{}".format(algo.score(x_test, y_test)))
print("训练集上的效果(准确率):{}".format(algo.score(x_train, y_train)))
print("测试集上的效果(分类评估报告):\n{}".format(classification_report(
    y_test, test_predict)))
print("训练集上的效果(分类评估报告):\n{}".format(
    classification_report(y_train, train_predict)))

# 9. Other outputs
print("Predicted class probabilities:\n{}".format(algo.predict_proba(x_test)))

# 10. Other model-specific APIs
print("Feature importance weights:\n{}".format(algo.feature_importances_))

# Leaf-node indices
print("*" * 100)
x_test2 = x_test.iloc[:2, :]
print(x_test2)
# apply() returns the index of the leaf each sample lands in, one column per tree
print(algo.apply(x_test2))
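
Steps 1-6, which produce x_train/x_test, are not part of this excerpt; a hypothetical stand-in that makes the snippet above runnable end-to-end (the breast-cancer dataset here is just an illustration):

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# stand-in for the elided steps 1-6 (data loading and splitting)
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
x_train, x_test, y_train, y_test = train_test_split(
    X, data.target, test_size=0.2, random_state=42)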
Example #5
        y_prob = test_pred
        for i in [99, 98, 95, 90]:
            threshold = np.percentile(y_prob, i)
            print(
                f'Checking top {100-i}% suspicious transactions: {len(y_prob[y_prob > threshold])}'
            )
            precision = np.mean(xgb_testy[y_prob > threshold])
            recall = sum(xgb_testy[y_prob > threshold]) / sum(xgb_testy)
            revenue_recall = sum(
                revenue_test[y_prob > threshold]) / sum(revenue_test)
            print(
                f'Precision: {round(precision, 4)}, Recall: {round(recall, 4)}, Seized Revenue (Recall): {round(revenue_recall, 4)}'
            )

        # get leaf index from xgboost model
        X_train_leaves = xgb_clf.apply(xgb_trainx)
        X_valid_leaves = xgb_clf.apply(xgb_validx)
        X_test_leaves = xgb_clf.apply(xgb_testx)
        train_rows = X_train_leaves.shape[0]

        # one-hot encode the leaf indices; ignore leaf ids that appear
        # only in the validation/test splits
        xgbenc = OneHotEncoder(categories="auto", handle_unknown="ignore")
        lr_trainx = xgbenc.fit_transform(X_train_leaves)
        lr_validx = xgbenc.transform(X_valid_leaves)
        lr_testx = xgbenc.transform(X_test_leaves)

        # fit a logistic regression on the one-hot encoded leaf features
        print("Training Logistic regression model...")
        lr = LogisticRegression()
        lr.fit(lr_trainx, xgb_trainy)
        test_pred = lr.predict_proba(lr_testx)[:, 1]
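
The block at the top ranks transactions by predicted probability and reports precision/recall among the top k% most suspicious; a self-contained illustration of the same computation on synthetic data (all names here are hypothetical):

import numpy as np

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000)  # ground-truth illicit labels
y_prob = rng.random(1000)               # model scores
revenue = rng.random(1000) * 100        # revenue attached to each transaction

threshold = np.percentile(y_prob, 95)   # keep the top 5% most suspicious
flagged = y_prob > threshold
precision = y_true[flagged].mean()
recall = y_true[flagged].sum() / y_true.sum()
seized = revenue[flagged].sum() / revenue.sum()  # share of revenue seized
print(precision, recall, seized)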
Example #6

import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline: train directly on the original features, without generating new ones
clf = XGBClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
print("Original featrues")
print("XGB_ACC: {:.6f}".format(acc))
print("XGB_AUC: {:.6f}".format(auc))


# Generate new features: apply() returns each sample's leaf index in every tree
X_train_leaves = clf.apply(X_train)
X_test_leaves = clf.apply(X_test)

# Stack X_train_leaves and X_test_leaves along axis=0, then one-hot encode them together
All_leaves = np.r_[X_train_leaves, X_test_leaves]

# Each leaf-index column is a multi-valued categorical, not a 0/1 binary feature, so it needs one-hot encoding
enc = OneHotEncoder(categories='auto')
new_features = enc.fit_transform(All_leaves)

# Split the new features back into train and test by the original row count
train_samples = X_train.shape[0]
X_train_new = new_features[:train_samples, :]
X_test_new = new_features[train_samples:, :]

# Join the original training features with the new GBDT-derived features, then train an LR (completed in the sketch below)
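
The listing ends here; a minimal completion under the same variable names (a sketch assuming X_train/X_test are NumPy arrays and y_train/y_test the matching labels):

from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression

# combine the original features with the one-hot leaf features (sparse-aware)
X_train_ext = hstack([X_train_new, X_train])
X_test_ext = hstack([X_test_new, X_test])

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_ext, y_train)
y_prob_lr = lr.predict_proba(X_test_ext)[:, 1]
print("XGB + LR AUC: {:.6f}".format(roc_auc_score(y_test, y_prob_lr)))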