def fit_model(self, data, target, test):
    """Fit an XGBRegressor on min-max-scaled data and append leaf-index features.

    :param data: training features (array-like; converted to float ndarray)
    :param target: training target (scaled with its own MinMaxScaler;
        NOTE(review): MinMaxScaler expects 2-D input — confirm target shape)
    :param test: test features (transformed with the scaler fitted on ``data``)
    :return: (augmented train DataFrame, scaled target, augmented test DataFrame)
    """
    clf = XGBRegressor(
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        gamma=self.gamma,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        objective=self.objective,
        nthread=self.nthread,
        scale_pos_weight=self.scale_pos_weight,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        seed=self.seed)
    data = np.array(data).astype(float)
    # Fit the feature scaler on train only; apply the same scaling to test.
    # (Original assigned the useless `temp = scaler.fit(data)` — removed.)
    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)
    test = scaler.transform(test)
    # Use a SEPARATE scaler for the target: the original refit the feature
    # scaler on the target, silently clobbering its state. Numerically
    # identical here (features were already transformed), but far clearer.
    target_scaler = MinMaxScaler()
    target = target_scaler.fit_transform(target)
    clf.fit(data, target)
    # Leaf indices of each sample in every tree become new features.
    new_feature = clf.apply(data)
    new_test = clf.apply(test)
    X_train_new = self.mergeToOne(pd.DataFrame(data), new_feature)
    X_test_new = self.mergeToOne(pd.DataFrame(test), new_test)
    X_train_new = pd.DataFrame(X_train_new)
    X_test_new = pd.DataFrame(X_test_new)
    return X_train_new, target, X_test_new
def fit_model_split(self, X_train, y_train, X_test, y_test):
    """Fit on one slice of the training set and augment the other with leaf features.

    X_train_1 is used to fit the booster; X_train_2 is merged with the
    booster's leaf-index features and returned as the new training set
    (so the returned training set has 40% fewer samples than X_train).

    :return: (augmented X_train_2, y_train_2, augmented X_test, y_test)
    """
    # Hold out 60% of the rows to receive the generated features.
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.6, random_state=0)
    clf = XGBRegressor(
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        gamma=self.gamma,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        objective=self.objective,
        nthread=self.nthread,
        scale_pos_weight=self.scale_pos_weight,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        seed=self.seed)
    clf.fit(X_train_1, y_train_1)
    # (Removed commented-out AUC/accuracy debug prints — dead code.)
    # Leaf indices of each sample in every tree become new features.
    new_feature = clf.apply(X_train_2)
    X_train_new2 = self.mergeToOne(X_train_2, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set of sample size 0.4 fewer than before")
    return X_train_new2, y_train_2, X_test_new, y_test
def fit_model(self, X_train, y_train, X_test, y_test):
    """Fit on the full training set and append leaf-index features to both sets.

    Unlike :meth:`fit_model_split`, the booster is trained on all of
    X_train, so the returned training set keeps every sample.

    :return: (augmented X_train, y_train, augmented X_test, y_test)
    """
    clf = XGBRegressor(
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        gamma=self.gamma,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        objective=self.objective,
        nthread=self.nthread,
        scale_pos_weight=self.scale_pos_weight,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        seed=self.seed)
    clf.fit(X_train, y_train)
    # (Removed commented-out AUC/accuracy debug prints — dead code.)
    # Leaf indices of each sample in every tree become new features.
    new_feature = clf.apply(X_train)
    X_train_new = self.mergeToOne(X_train, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set sample number remains the same")
    return X_train_new, y_train, X_test_new, y_test
def fit_model_split(self, X_train, y_train, X_test, y_test):
    """Train a booster on 40% of the data, augment the remaining 60% with leaf features.

    One slice fits the booster; the held-out slice (and X_test) are merged
    with the booster's leaf indices and returned as the new datasets.

    :return: (augmented held-out train, its targets, augmented X_test, y_test)
    """
    # Reserve 60% of the rows to receive the generated features.
    part_fit, part_augment, y_fit, y_augment = train_test_split(
        X_train, y_train, test_size=0.6, random_state=0)
    booster_params = dict(
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        gamma=self.gamma,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        objective=self.objective,
        nthread=self.nthread,
        scale_pos_weight=self.scale_pos_weight,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        seed=self.seed,
    )
    booster = XGBRegressor(**booster_params)
    booster.fit(part_fit, y_fit)
    # Leaf indices of every tree serve as additional features.
    augmented_train = self.mergeToOne(part_augment, booster.apply(part_augment))
    augmented_test = self.mergeToOne(X_test, booster.apply(X_test))
    return augmented_train, y_augment, augmented_test, y_test
def get_XgbRegressor(train_data, train_target, test_data, feature_names,
                     parameters, early_stopping_rounds, num_folds,
                     eval_metric, model_name='model', stratified=False):
    """K-fold train an XGBRegressor; save OOF/test predictions and leaf features.

    :param train_data: numpy array of training features
    :param train_target: numpy array of targets
    :param test_data: numpy array of test features
    :param feature_names: feature names for the importance report
    :param parameters: dict of XGBRegressor hyper-parameters
    :param early_stopping_rounds: early-stopping patience per fold
    :param num_folds: number of CV folds
    :param eval_metric: built-in metric name or custom callable
    :param model_name: tag used in the output file names
    :param stratified: use StratifiedKFold instead of KFold
    :return: (regressor fitted on the last fold, averaged test predictions)

    Side effects: writes ./sub/val_*.csv, ./sub/test_*.csv,
    ./gbdt_newfeature/*.npy and calls save_importances().
    """
    reg = XGBRegressor()
    reg.set_params(**parameters)

    oof_preds = np.zeros((train_data.shape[0],))
    sub_preds = np.zeros((test_data.shape[0],))
    feature_importance_df = pd.DataFrame()
    cv_result = []

    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1234)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1234)

    # Accumulator for the per-fold leaf-index features; None until fold 0.
    # (Original used a fragile `shape[0] == 1` sentinel that could collide
    # with a genuine single-row fold.)
    X_train_newfeature = None
    for n_fold, (train_index, val_index) in enumerate(
            folds.split(train_data, train_target)):
        train_X, val_X = train_data[train_index], train_data[val_index]
        train_Y, val_Y = train_target[train_index], train_target[val_index]

        # The LAST eval_set entry drives early stopping: once validation
        # error stops improving for `early_stopping_rounds`, training halts.
        watchlist = [(train_X, train_Y), (val_X, val_Y)]
        reg.fit(train_X, train_Y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=watchlist, eval_metric=eval_metric)

        # GBDT leaf indices as new features for the validation fold.
        new_feature = reg.apply(val_X)
        fold_features = mergeToOne(val_X, new_feature)
        if X_train_newfeature is None:
            X_train_newfeature = fold_features
        else:
            # BUGFIX: the original overwrote the accumulator with only the
            # current fold, then concatenated mergeToOne(new_feature, val_X)
            # — arguments swapped. Append this fold's features instead.
            X_train_newfeature = np.concatenate(
                (X_train_newfeature, fold_features), axis=0)
        print(X_train_newfeature)

        # Out-of-fold predictions fill the slots of this fold's rows.
        oof_preds[val_index] = reg.predict(val_X)
        # Sum test predictions across folds; averaged after the loop.
        sub_preds += reg.predict(test_data)

        result = mean_absolute_error(val_Y, reg.predict(val_X))
        print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result))
        cv_result.append(round(result, 5))
        gc.collect()

        # Per-fold feature importances (default importance_type is 'gain';
        # change it in `parameters` if another type is wanted).
        gain = reg.feature_importances_
        fold_importance_df = pd.DataFrame({
            'feature': feature_names,
            'gain': 100 * gain / gain.sum(),
            'fold': n_fold,
        }).sort_values('gain', ascending=False)
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

    # Average the accumulated test predictions over all folds.
    sub_preds = sub_preds / folds.n_splits
    new_feature = reg.apply(test_data)
    X_test_newfeature = mergeToOne(test_data, new_feature)

    if not os.path.isdir('./sub'):
        os.makedirs('./sub')
    pd.DataFrame(oof_preds, columns=['class']).to_csv(
        './sub/val_{}.csv'.format(model_name), index=False)
    pd.DataFrame(sub_preds, columns=['class']).to_csv(
        './sub/test_{}.csv'.format(model_name), index=False)
    print('cv_result', cv_result)

    if not os.path.isdir('./gbdt_newfeature'):
        os.makedirs('./gbdt_newfeature')
    np.save("./gbdt_newfeature/train_newfeature.npy", X_train_newfeature)
    np.save("./gbdt_newfeature/test_newfeature.npy", X_test_newfeature)
    save_importances(feature_importance_df, model_name)
    return reg, sub_preds