# Example no. 1
def tune_maxdepth(X, y):
    """
    Tune only one parameter: the maximum depth in each single tree.

    Runs a grid search over max_depth for a 60-tree random forest,
    plots train/test RMSE curves, and returns the refit best estimator.
    NOTE(review): relies on module-level `inner_cv` and
    `plot_cv_traintestscores` defined elsewhere in this file.
    """
    # takes 23s to run
    rf_model = ensemble.RandomForestRegressor(n_estimators=60)
    # 10 integer depths evenly spaced in [2, 20]
    max_depths = np.linspace(2, 20, 10, endpoint=True, dtype=int)
    grid_para = [{'max_depth': max_depths}]
    tuneRF = GridSearchCV(rf_model,
                          grid_para,
                          cv=inner_cv,
                          refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    t1 = time.time()
    tuneRF.fit(X, y)
    t2 = time.time()
    print(t2 - t1, " seconds\n")
    # scores are negated MSE, so negate and sqrt to get RMSE
    plot_cv_traintestscores(np.sqrt(-tuneRF.cv_results_['mean_train_score']),
                            np.sqrt(-tuneRF.cv_results_['mean_test_score']),
                            max_depths, 'max tree depth')
    print("best RMSE: ", np.sqrt(-tuneRF.best_score_))
    # fixed: label previously said "best n_estimator" although this
    # function tunes max_depth
    print("best max_depth: ", tuneRF.best_params_)
    print(tuneRF.best_estimator_)
    return tuneRF.best_estimator_
# Example no. 2
def tune_Nestimator(X, y):
    """
    Tune only one parameter: the number of estimators in each single tree
    """
    # roughly 47 seconds of runtime
    candidate_counts = np.linspace(20, 300, 15, endpoint=True, dtype=int)
    search = GridSearchCV(ensemble.RandomForestRegressor(max_depth=14),
                          [{'n_estimators': candidate_counts}],
                          cv=inner_cv,
                          refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    start = time.time()
    search.fit(X, y)
    print(time.time() - start, " seconds\n")
    # convert negated-MSE scores to RMSE for plotting
    train_rmse = np.sqrt(-search.cv_results_['mean_train_score'])
    test_rmse = np.sqrt(-search.cv_results_['mean_test_score'])
    plot_cv_traintestscores(train_rmse, test_rmse,
                            candidate_counts, '# of trees')
    plot_cv_testscores(test_rmse, candidate_counts, '# of trees')
    print("best RMSE: ", np.sqrt(-search.best_score_))
    print("best n_estimator: ", search.best_params_)
    print(search.best_estimator_)
    return search.best_estimator_
# Example no. 3
def tune_max_features(X, y):
    """
    Tune only one parameter: the max_features at each split.

    Grid-searches max_features for a 100-tree random forest (depth fixed
    at 14), plots train/test RMSE curves, and returns the refit best
    estimator. NOTE(review): relies on module-level `inner_cv`,
    `plot_cv_traintestscores` and `plot_cv_testscores`.
    """
    # takes 23s to run
    rf_model = ensemble.RandomForestRegressor(n_estimators=100,
                                              max_depth=14,
                                              min_samples_leaf=1,
                                              min_samples_split=2)
    # 11 integer candidates evenly spaced in [24, 44]
    max_features = np.linspace(24, 44, 11, endpoint=True, dtype=int)
    print(max_features)
    grid_para = [{'max_features': max_features}]
    tuneRF = GridSearchCV(rf_model,
                          grid_para,
                          cv=inner_cv,
                          refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    t1 = time.time()
    tuneRF.fit(X, y)
    t2 = time.time()
    print(t2 - t1, " seconds\n")
    # scores are negated MSE, so negate and sqrt to get RMSE
    plot_cv_traintestscores(np.sqrt(-tuneRF.cv_results_['mean_train_score']),
                            np.sqrt(-tuneRF.cv_results_['mean_test_score']),
                            max_features, 'max_features')
    plot_cv_testscores(np.sqrt(-tuneRF.cv_results_['mean_test_score']),
                       max_features, 'max_features')
    print("best RMSE: ", np.sqrt(-tuneRF.best_score_))
    # fixed: label previously said "best n_estimator" although this
    # function tunes max_features
    print("best max_features: ", tuneRF.best_params_)
    print(tuneRF.best_estimator_)
    return tuneRF.best_estimator_
# Example no. 4
def xgbmodelfit(alg, datamatrix, cvfolds):
    """
    Pick the optimal n_estimators for an XGBoost model via xgb.cv with
    early stopping, then refit the model with that count.

    Parameters
    ----------
    alg : xgboost sklearn-API estimator (e.g. XGBRegressor)
    datamatrix : xgb.DMatrix holding the training data
    cvfolds : CV fold specification passed to xgb.cv's `folds` argument

    Returns
    -------
    The refit estimator with n_estimators set to the CV-optimal round count.

    NOTE(review): refits on module-level globals `X`, `y` (not on
    `datamatrix`) and reads global `cols` for the importance plot — verify
    these match the data inside `datamatrix`.
    """
    xgb_param = alg.get_xgb_params()
    # early_stopping_rounds stops boosting once test RMSE plateaus, so
    # cvresult ends at the optimal round count
    cvresult = xgb.cv(xgb_param,
                      datamatrix,
                      num_boost_round=alg.get_params()['n_estimators'],
                      folds=cvfolds,
                      metrics='rmse',
                      early_stopping_rounds=50)
    alg.set_params(n_estimators=cvresult.shape[0])
    alg.fit(X, y)
    rmse = root_mean_squared_error(alg, X, y)
    n = cvresult.shape[0]
    print("optimal n_estimator is %d" % n)
    print("With optimal n_estimator, mean CV test RMSE is %.4f" %
          cvresult['test-rmse-mean'][n - 1])
    print("With optimal n_estimator, mean CV train RMSE is %.4f" %
          cvresult['train-rmse-mean'][n - 1])
    print("RMSE of xgb entire data is %.4f" % (rmse))
    # full learning curve, then a zoom into rounds 50-150 where the
    # train/test gap develops
    plot_cv_traintestscores(cvresult['train-rmse-mean'],
                            cvresult['test-rmse-mean'], [i for i in range(n)],
                            'n_estimators')
    plot_cv_traintestscores(cvresult['train-rmse-mean'][50:150],
                            cvresult['test-rmse-mean'][50:150],
                            [i for i in range(n)][50:150], 'n_estimators')
    plot_cv_testscores(cvresult['test-rmse-mean'][50:],
                       [i for i in range(n)][50:], 'n_estimators')

    # fixed: removed the no-op expression statement `feat_imp[0:20]`
    # (computed a slice and discarded it) and a commented-out plot call
    plot_FI_tree(alg, cols, 20)
    return alg
def tune_max_depth(X, y):
    """
    Tune only one parameter: the max_depth of a decision tree.

    Grid-searches integer depths 1..25, plots train/test RMSE curves,
    and returns the refit best estimator. NOTE(review): relies on
    module-level `inner_cv` and `plot_cv_traintestscores`.
    """
    tree_model = tree.DecisionTreeRegressor()
    # fixed: added dtype=int — the original linspace produced floats,
    # but DecisionTreeRegressor requires an integer max_depth (every
    # other grid in this file already uses dtype=int)
    max_depths = np.linspace(1, 25, 25, endpoint=True, dtype=int)
    grid_para = [{'max_depth': max_depths}]
    tuneDT1 = GridSearchCV(tree_model,
                           grid_para,
                           cv=inner_cv,
                           refit=True,
                           return_train_score=True,
                           scoring='neg_mean_squared_error')
    t1 = time.time()
    tuneDT1.fit(X, y)
    t2 = time.time()
    print(t2 - t1, " seconds\n")
    # scores are negated MSE, so negate and sqrt to get RMSE
    plot_cv_traintestscores(np.sqrt(-tuneDT1.cv_results_['mean_train_score']),
                            np.sqrt(-tuneDT1.cv_results_['mean_test_score']),
                            max_depths, 'max_depth')
    print("best RMSE: ", np.sqrt(-tuneDT1.best_score_))
    print("best max_depth: ", tuneDT1.best_params_)
    print(tuneDT1.best_estimator_)
    return tuneDT1.best_estimator_
def tune_leaf(X, y):
    """
    Tune only one parameter: min_samples_leaf of a decision tree,
    reporting the depth the winning tree actually grew to.
    """
    leaf_sizes = np.linspace(1, 20, 20, endpoint=True, dtype=int)
    search = GridSearchCV(tree.DecisionTreeRegressor(),
                          [{'min_samples_leaf': leaf_sizes}],
                          cv=inner_cv,
                          refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    start = time.time()
    search.fit(X, y)
    print(time.time() - start, " seconds\n")
    # convert negated-MSE scores to RMSE for plotting
    train_rmse = np.sqrt(-search.cv_results_['mean_train_score'])
    test_rmse = np.sqrt(-search.cv_results_['mean_test_score'])
    plot_cv_traintestscores(train_rmse, test_rmse,
                            leaf_sizes, 'min_samples_leaf')
    print("best RMSE: ", np.sqrt(-search.best_score_))
    print("best min_samples_leaf: ", search.best_params_)
    print("max_depth in best tree: ", search.best_estimator_.tree_.max_depth)
    print(search.best_estimator_)
    return search.best_estimator_