Example #1
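Example #1 trains a scikit-learn GradientBoostingRegressor on pre-split global data, saves the model and its test-set predictions, reports MSE and R², and plots train/test deviance per boosting iteration together with relative feature importances.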

import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# save_model and save_result are assumed to be project helpers defined elsewhere

def fit_model(n_estimators=500, learning_rate=0.26):
    # wire the function arguments into params (they were previously unused)
    params = {
        'n_estimators': n_estimators,
        'max_depth': 200,
        'min_samples_split': 400,
        'learning_rate': learning_rate,
        'verbose': 2,
        'loss': 'ls',  # renamed to 'squared_error' in newer scikit-learn releases
        'random_state': 0
    }
    # params = {'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'verbose': 1, 'loss': 'ls', 'random_state': 0}
    # params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'loss': 'ls'}
    # params = {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 6,
    #           'learning_rate': 0.26}
    gbrt = ensemble.GradientBoostingRegressor(**params)
    # gbrt = ensemble.GradientBoostingRegressor(verbose=2)
    # gbrt = XGBRegressor(verbosity=2)  # XGBRegressor lacks staged_predict,
    # train_score_ and loss_, which the deviance plot below relies on, so the
    # sklearn model stays active here

    # gbrt = ensemble.GradientBoostingRegressor(loss='ls',n_estimators = 300,max_depth = 300, learning_rate = 0.1, verbose = 2, min_samples_leaf = 256, min_samples_split = 256)
    fileName = 'E:\\data\\DiDiData\\data_csv\\result\\default_para_xgb_pair_result'

    # x_train, y_train, x_test and y_test are module-level globals; the
    # `global` statement is optional here since they are only read, never assigned
    global x_train, y_train
    # gbrt.fit(x, y)
    # gbrt.fit(x_test, y_test)
    gbrt.fit(x_train, y_train)
    save_model(fileName, gbrt)
    model_result = gbrt.predict(x_test)
    save_result(fileName, list(model_result))
    mse = mean_squared_error(y_test, model_result)  # reuse the predictions instead of predicting twice
    print("MSE: %.4f" % mse)  # 输出均方误差
    r2 = r2_score(y_test, model_result)
    print("r^2 on test data : %f" %
          r2)  # R^2 拟合优度=(预测值-均值)^2之和/(真实值-均值)^2之和,越接近1越好
    # return mse

    # Plot training deviance

    # compute test set deviance
    test_score = np.zeros((params['n_estimators'], ), dtype=np.float64)

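    # staged_predict yields the prediction after each boosting stage, so the
    # loop below traces how the test-set deviance evolves per iteration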
    for i, y_pred in enumerate(gbrt.staged_predict(x_test)):
        # note: loss_ and the 'ls' loss name are deprecated in newer
        # scikit-learn releases; mean_squared_error(y_test, y_pred) gives the
        # same curve for squared-error loss
        test_score[i] = gbrt.loss_(y_test, y_pred)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1,
             gbrt.train_score_,
             'b-',
             label='Training Set Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1,
             test_score,
             'r-',
             label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')

    # Plot feature importance
    feature_importance = gbrt.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    columns = x_train.columns
    plt.yticks(pos, columns[sorted_idx])
    # plt.yticks(pos, boston.feature_names[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
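
A minimal driver sketch, assuming the global splits come from a pandas DataFrame and that save_model / save_result are simple pickle/text helpers; the feature file name, the 'label' column, and the helper bodies below are illustrative assumptions, not the repo's actual code:

import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

def save_model(fileName, model):
    # hypothetical helper: pickle the fitted model next to the result file
    with open(fileName + '.pkl', 'wb') as f:
        pickle.dump(model, f)

def save_result(fileName, result):
    # hypothetical helper: write one prediction per line
    with open(fileName + '.txt', 'w') as f:
        f.writelines('%s\n' % v for v in result)

# assumed feature file and label column; adjust to the real dataset layout
df = pd.read_csv('E:\\data\\DiDiData\\data_csv\\features.csv')
x = df.drop(columns=['label'])
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

fit_model()  # defaults: n_estimators=500, learning_rate=0.26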