# Example 1
def xgbmodelfit(alg, datamatrix, cvfolds):
    """Tune ``n_estimators`` for an XGBoost regressor via early-stopped CV.

    Runs ``xgb.cv`` with early stopping to find the optimal number of
    boosting rounds, sets it on ``alg``, refits on the full data, prints
    the CV/train RMSE summary and plots the CV learning curves.

    Parameters
    ----------
    alg : xgboost sklearn-API regressor (e.g. ``XGBRegressor``)
    datamatrix : ``xgb.DMatrix`` built from the training data
    cvfolds : fold specification forwarded to ``xgb.cv``'s ``folds``

    Returns
    -------
    The fitted ``alg`` with ``n_estimators`` set to the CV optimum.

    Notes
    -----
    Relies on module-level ``X``, ``y`` and ``cols`` — TODO: consider
    passing them in explicitly.
    """
    xgb_param = alg.get_xgb_params()
    cvresult = xgb.cv(xgb_param,
                      datamatrix,
                      num_boost_round=alg.get_params()['n_estimators'],
                      folds=cvfolds,
                      metrics='rmse',
                      early_stopping_rounds=50)
    # Early stopping trims cvresult to the optimal number of rounds.
    n = cvresult.shape[0]
    alg.set_params(n_estimators=n)
    alg.fit(X, y)
    # BUG FIX: sklearn's root_mean_squared_error takes (y_true, y_pred),
    # not (estimator, X, y) — predict first, then score.
    rmse = root_mean_squared_error(y, alg.predict(X))
    print("optimal n_estimator is %d" % n)
    print("With optimal n_estimator, mean CV test RMSE is %.4f" %
          cvresult['test-rmse-mean'][n - 1])
    print("With optimal n_estimator, mean CV train RMSE is %.4f" %
          cvresult['train-rmse-mean'][n - 1])
    print("RMSE of xgb entire data is %.4f" % (rmse))
    # Build the round index once instead of four identical comprehensions.
    rounds = list(range(n))
    plot_cv_traintestscores(cvresult['train-rmse-mean'],
                            cvresult['test-rmse-mean'], rounds,
                            'n_estimators')
    plot_cv_traintestscores(cvresult['train-rmse-mean'][40:50],
                            cvresult['test-rmse-mean'][40:50],
                            rounds[40:50], 'n_estimators')
    plot_cv_traintestscores(cvresult['train-rmse-mean'][50:],
                            cvresult['test-rmse-mean'][50:],
                            rounds[50:], 'n_estimators')
    plot_cv_testscores(cvresult['test-rmse-mean'][50:],
                       rounds[50:], 'n_estimators')

    # Plot the top-20 feature importances. The original's bare
    # `feat_imp[0:20]` expression was a notebook display no-op — removed.
    plot_FI_tree(alg, cols, 20)
    return alg
# Example 2
def simpleRF(X, y, test_frac):
    """Fit a baseline RandomForestRegressor and report train/test metrics.

    Splits the data, fits a default random forest on the training split,
    prints R2 and RMSE on both splits, and plots the top-20 feature
    importances (uses the module-level ``cols`` for feature names).

    Parameters
    ----------
    X, y : feature matrix and target vector
    test_frac : fraction of the data held out as the test set

    Returns
    -------
    The fitted RandomForestRegressor (trained on the train split only).
    """
    rf_model = ensemble.RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac)
    rf_model.fit(X_train, y_train)
    print("R2 on training data %.2f" % rf_model.score(X_train, y_train))
    print("RMSE on training data %2.3f" %
          np.sqrt(mean_squared_error(y_train, rf_model.predict(X_train))))
    print("R2 on test data %.2f" % rf_model.score(X_test, y_test))
    print("RMSE on test data %2.3f" %
          np.sqrt(mean_squared_error(y_test, rf_model.predict(X_test))))

    # Plot the top-20 feature importances. The original's bare `FI[0:20]`
    # expression was a notebook display no-op — removed, along with the
    # then-unused local.
    plot_FI_tree(rf_model, cols, topn=20)
    return rf_model
# Example 3
    plt.figure().set_size_inches(8, 6)
    for ridx, depth in enumerate(max_depths):
        if ridx >= 3:
            plt.plot(n_estimators,
                     test_rmse[ridx, :],
                     label="max_depth: " + str(depth))
    plt.xlabel('# of trees')
    plt.ylabel('Mean RMSE on test set (5-fold CV)')
    plt.legend(loc='upper left')

    return tuneRF


#RF0 = simpleRF(X,y)
# Driver: tune each RandomForest hyperparameter in turn (the tune_*
# helpers are defined elsewhere in this file) and plot the top-20
# feature importances after each step.
# NOTE(review): the bare `FI[0:20]` expressions only display output in a
# notebook; when run as a script they are no-ops.
RF1 = tune_Nestimator(X, y)
FI = plot_FI_tree(RF1, cols, topn=20)
FI[0:20]

RF2 = tune_maxdepth(X, y)
FI = plot_FI_tree(RF2, cols, topn=20)
FI[0:20]

# Joint grid search over n_estimators and max_depth; presumably returns a
# fitted GridSearchCV-like object (it exposes .best_estimator_) — confirm
# against the helper's definition.
tuneRF3 = tune_Nestimators_maxdepth(X, y)
FI = plot_FI_tree(tuneRF3.best_estimator_, cols, topn=20)
FI[0:20]

RF3 = tune_min_samples_leaf(X, y)
RF4 = tune_min_samples_split(X, y)
RF5 = tune_max_features(X, y)

# make predictions with best model, most complex model, simplest model