def fit_model(n_estimators=500, learning_rate=0.26):
    """Train a gradient-boosting regressor on the global train split.

    Fits the model, persists it and its test-set predictions, prints
    MSE and R^2 on the test split, then plots per-iteration deviance
    and relative feature importances.

    Parameters
    ----------
    n_estimators : int
        Number of boosting stages (previously ignored — `params`
        hard-coded 100; now honoured).
    learning_rate : float
        Shrinkage per stage (previously ignored — `params` hard-coded
        0.1; now honoured).

    Relies on module-level globals: x_train, y_train, x_test, y_test,
    save_model, save_result.
    """
    # NOTE(review): loss='ls' was renamed 'squared_error' in scikit-learn
    # 1.0 and removed in 1.2 — confirm the installed sklearn version.
    params = {
        'n_estimators': n_estimators,    # was hard-coded 100, ignoring the argument
        'max_depth': 200,
        'min_samples_split': 400,
        'learning_rate': learning_rate,  # was hard-coded 0.1, ignoring the argument
        'verbose': 2,
        'loss': 'ls',
        'random_state': 0,
    }

    # BUG FIX: the original built this model and then immediately
    # overwrote it with `xgb = XGBRegressor(verbose=2)`, discarding the
    # configuration above. The plotting code below needs the sklearn
    # GradientBoostingRegressor API (staged_predict, loss_, train_score_),
    # which XGBRegressor does not provide, so the overwrite also made the
    # deviance plot crash. Keep the configured sklearn model.
    model = ensemble.GradientBoostingRegressor(**params)

    file_name = 'E:\\data\\DiDiData\\data_csv\\result\\default_para_xgb_pair_result'
    global x_train, y_train
    model.fit(x_train, y_train)
    save_model(file_name, model)

    # Predict once and reuse the result (the original predicted twice).
    model_result = model.predict(x_test)
    save_result(file_name, list(model_result))

    mse = mean_squared_error(y_test, model_result)
    print("MSE: %.4f" % mse)  # mean squared error on the held-out split
    r2 = r2_score(y_test, model_result)
    # R^2 goodness of fit: sum((pred-mean)^2) / sum((true-mean)^2);
    # closer to 1 is better.
    print("r^2 on test data : %f" % r2)

    # --- Plot training / test deviance per boosting iteration ---
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(model.staged_predict(x_test)):
        test_score[i] = model.loss_(y_test, y_pred)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, model.train_score_,
             'b-', label='Training Set Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, test_score,
             'r-', label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')

    # --- Plot feature importances relative to the largest one ---
    feature_importance = model.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    columns = x_train.columns  # assumes x_train is a pandas DataFrame — TODO confirm
    plt.yticks(pos, columns[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()