def fit_model(self, data, target, test):
        clf = XGBRegressor(learning_rate=self.learning_rate,
                           n_estimators=self.n_estimators,
                           max_depth=self.max_depth,
                           min_child_weight=self.min_child_weight,
                           gamma=self.gamma,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           objective=self.objective,
                           nthread=self.nthread,
                           scale_pos_weight=self.scale_pos_weight,
                           reg_alpha=self.reg_alpha,
                           reg_lambda=self.reg_lambda,
                           seed=self.seed)
        data = np.array(data).astype(float)
        test = np.array(test).astype(float)
        # Fit the feature scaler on the training data only
        scaler = MinMaxScaler()
        data = scaler.fit_transform(data)
        test = scaler.transform(test)
        # Use a separate scaler for the target: refitting the feature scaler
        # would overwrite its parameters, and MinMaxScaler expects 2-D input
        target_scaler = MinMaxScaler()
        target = target_scaler.fit_transform(
            np.array(target).reshape(-1, 1)).ravel()

        clf.fit(data, target)
        new_feature = clf.apply(data)
        new_test = clf.apply(test)
        X_train_new = self.mergeToOne(pd.DataFrame(data), new_feature)
        X_test_new = self.mergeToOne(pd.DataFrame(test), new_test)
        X_train_new = pd.DataFrame(X_train_new)
        X_test_new = pd.DataFrame(X_test_new)
        return X_train_new, target, X_test_new
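Every example on this page relies on a mergeToOne helper that is not shown. A minimal sketch of what it presumably does, assuming it simply appends the leaf indices returned by clf.apply() to the original feature matrix:

import numpy as np

def mergeToOne(X, new_feature):
    # Coerce both inputs to 2-D numpy arrays, then horizontally stack the
    # original features with the tree-leaf indices from XGBRegressor.apply()
    X = np.asarray(X, dtype=float)
    new_feature = np.asarray(new_feature)
    if new_feature.ndim == 1:
        new_feature = new_feature.reshape(-1, 1)
    return np.hstack((X, new_feature))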
Example #2
 def fit_model_split(self, X_train, y_train, X_test, y_test):
     # X_train_1 (40%) is used to fit the booster; X_train_2 (60%) is
     # combined with the new leaf features to form the new training set
     X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
         X_train, y_train, test_size=0.6, random_state=0)
     clf = XGBRegressor(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
     clf.fit(X_train_1, y_train_1)
     # y_pre = clf.predict(X_train_2)
     # y_pro = clf.predict_proba(X_train_2)[:, 1]
     # print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro))
     # print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre))
     new_feature = clf.apply(X_train_2)
     X_train_new2 = self.mergeToOne(X_train_2, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample size is 40% smaller than before")
     return X_train_new2, y_train_2, X_test_new, y_test
Example #3
 def fit_model(self, X_train, y_train, X_test, y_test):
     clf = XGBRegressor(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
     clf.fit(X_train, y_train)
     # y_pre = clf.predict(X_test)
     # y_pro = clf.predict_proba(X_test)[:, 1]
     # print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
     # print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
     new_feature = clf.apply(X_train)
     X_train_new = self.mergeToOne(X_train, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample size remains the same")
     return X_train_new, y_train, X_test_new, y_test
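The leaf indices returned by apply() are categorical identifiers rather than ordinal values, so GBDT-feature pipelines typically one-hot encode them before training a linear model on the merged matrix. A self-contained sketch on toy data (shapes and hyperparameters are illustrative only):

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = rng.rand(200)

clf = XGBRegressor(n_estimators=10, max_depth=3)
clf.fit(X, y)
leaves = clf.apply(X)                        # shape (200, 10): one leaf id per tree
encoder = OneHotEncoder(handle_unknown='ignore')
leaf_onehot = encoder.fit_transform(leaves)  # sparse one-hot leaf features
X_new = np.hstack((X, leaf_onehot.toarray()))
print(X_new.shape)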
Example #4
 def fit_model_split(self, X_train, y_train, X_test, y_test):
     # X_train_1 is used to fit the booster; X_train_2 is combined with
     # the new leaf features to form the new training set
     X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
         X_train, y_train, test_size=0.6, random_state=0)
     clf = XGBRegressor(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
     clf.fit(X_train_1, y_train_1)
     new_feature = clf.apply(X_train_2)
     X_train_new2 = self.mergeToOne(X_train_2, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     return X_train_new2, y_train_2, X_test_new, y_test
Example #5
import gc
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBRegressor

# mergeToOne and save_importances are project-local helpers; a sketch of
# mergeToOne is given after Example #1.


def get_XgbRegressor(train_data,
                     train_target,
                     test_data,
                     feature_names,
                     parameters,
                     early_stopping_rounds,
                     num_folds,
                     eval_metric,
                     model_name='model',
                     stratified=False):
    '''
    :param train_data: must be a numpy array
    :param train_target: training target values
    :param parameters: XGBRegressor parameters, applied via set_params
    :param early_stopping_rounds: rounds without improvement before stopping
    :param num_folds: number of CV folds
    :param eval_metric: a custom callable or a built-in metric name
    :return: the fitted regressor and the fold-averaged test predictions
    '''
    reg = XGBRegressor()
    reg.set_params(**parameters)

    # Initialize accumulators for OOF predictions, test predictions, importances
    oof_preds = np.zeros((train_data.shape[0], ))
    sub_preds = np.zeros((test_data.shape[0], ))
    feature_importance_df = pd.DataFrame()
    cv_result = []

    # K-fold cross-validation (stratified splitting requires a discrete target)
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1234)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1234)
    X_train_newfeature = None  # filled on the first fold
    for n_fold, (train_index, val_index) in enumerate(
            folds.split(train_data, train_target)):
        train_X = train_data[train_index]
        val_X = train_data[val_index]
        train_Y = train_target[train_index]
        val_Y = train_target[val_index]
        # Once the parameters are roughly set, hold out part of the data as a
        # validation set, put the train and validation sets in a watchlist, and
        # make the number of rounds large enough (e.g. 100000) that you can watch
        # the validation score every round; if the validation error starts
        # rising after some round, you can stop the run.
        watchlist = [(train_X, train_Y), (val_X, val_Y)]

        # Early stopping watches the validation eval metric; eval_set must be
        # passed, and its last entry is used as the validation set (note: in
        # xgboost >= 2.0, early_stopping_rounds moves to the constructor)
        reg.fit(train_X,
                train_Y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=watchlist,
                eval_metric=eval_metric)

        # Generate the new GBDT leaf-index features for the validation fold
        new_feature = reg.apply(val_X)
        fold_newfeature = mergeToOne(val_X, new_feature)
        if X_train_newfeature is None:
            X_train_newfeature = fold_newfeature
        else:
            # Stack this fold's merged features under the previous folds'
            X_train_newfeature = np.concatenate(
                (X_train_newfeature, fold_newfeature), axis=0)
        print('Fold %d new-feature shape: %s' % (n_fold + 1, X_train_newfeature.shape))
        # Record out-of-fold predictions for the validation indices
        oof_preds[val_index] = reg.predict(val_X)
        # Accumulate test predictions; averaged over the folds after the loop
        sub_preds += reg.predict(test_data)
        result = mean_absolute_error(val_Y, oof_preds[val_index])
        print('Fold %2d MAE : %.6f' % (n_fold + 1, result))
        cv_result.append(round(result, 5))
        gc.collect()
        # importance_type defaults to 'gain'; set it in the model parameters
        # if a different importance type is needed
        # Save the per-fold feature importances
        gain = reg.feature_importances_
        fold_importance_df = pd.DataFrame({
            'feature': feature_names,
            'gain': 100 * gain / gain.sum(),
            'fold': n_fold,
        }).sort_values('gain', ascending=False)
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
    # Average the accumulated test predictions over the folds
    sub_preds = sub_preds / folds.n_splits
    new_feature = reg.apply(test_data)
    X_test_newfeature = mergeToOne(test_data, new_feature)

    if not os.path.isdir('./sub'):
        os.makedirs('./sub')
    pd.DataFrame(oof_preds, columns=['class']).to_csv(
        './sub/val_{}.csv'.format(model_name), index=False)
    pd.DataFrame(sub_preds, columns=['class']).to_csv(
        './sub/test_{}.csv'.format(model_name), index=False)
    print('cv_result', cv_result)

    if not os.path.isdir('./gbdt_newfeature'):
        os.makedirs('./gbdt_newfeature')

    np.save("./gbdt_newfeature/train_newfeature.npy", X_train_newfeature)
    np.save("./gbdt_newfeature/test_newfeature.npy", X_test_newfeature)
    save_importances(feature_importance_df, model_name)
    return reg, sub_preds
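A usage sketch for get_XgbRegressor, assuming train_data, train_target, and test_data are already-loaded numpy arrays and feature_names is a matching list of column names; the parameter values below are illustrative only (the dict is passed straight to XGBRegressor.set_params):

parameters = {
    'learning_rate': 0.05,
    'n_estimators': 10000,   # deliberately large; early stopping picks the round
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
}
reg, test_preds = get_XgbRegressor(train_data,
                                   train_target,
                                   test_data,
                                   feature_names=feature_names,
                                   parameters=parameters,
                                   early_stopping_rounds=100,
                                   num_folds=5,
                                   eval_metric='mae',
                                   model_name='xgb_reg',
                                   stratified=False)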