def tune_maxdepth(X, y):
    """Tune one RandomForestRegressor hyperparameter: ``max_depth``.

    Grid-searches 10 integer depths in [2, 20] with the module-level
    ``inner_cv`` folds, plots train/test RMSE curves, prints timing and
    the best result, and returns the refit best estimator.
    NOTE(review): ~23 s on the author's data per the original comment.
    """
    # Forest size fixed at 60 so only max_depth varies in this sweep.
    rf_model = ensemble.RandomForestRegressor(n_estimators=60)
    max_depths = np.linspace(2, 20, 10, endpoint=True, dtype=int)
    grid_para = [{'max_depth': max_depths}]
    tuneRF = GridSearchCV(rf_model, grid_para, cv=inner_cv, refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    t1 = time.time()
    tuneRF.fit(X, y)
    t2 = time.time()
    print(t2 - t1, " seconds\n")
    # Scores are negative MSE; negate and sqrt to plot/report RMSE.
    plot_cv_traintestscores(np.sqrt(-tuneRF.cv_results_['mean_train_score']),
                            np.sqrt(-tuneRF.cv_results_['mean_test_score']),
                            max_depths, 'max tree depth')
    print("best RMSE: ", np.sqrt(-tuneRF.best_score_))
    # BUG FIX: label previously said "best n_estimator" but this search
    # tunes max_depth — the printed params are {'max_depth': ...}.
    print("best max_depth: ", tuneRF.best_params_)
    print(tuneRF.best_estimator_)
    return tuneRF.best_estimator_
def tune_Nestimator(X, y):
    """Tune one RandomForestRegressor hyperparameter: ``n_estimators``.

    Sweeps 15 integer forest sizes in [20, 300] (tree depth fixed at 14)
    via GridSearchCV over the module-level ``inner_cv`` folds, plots the
    RMSE curves, prints timing and the winner, and returns the refit
    best estimator. (~47 s on the author's data per the original note.)
    """
    base_forest = ensemble.RandomForestRegressor(max_depth=14)
    n_estimators = np.linspace(20, 300, 15, endpoint=True, dtype=int)
    grid_para = [{'n_estimators': n_estimators}]
    tuneRF = GridSearchCV(base_forest, grid_para, cv=inner_cv, refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    started = time.time()
    tuneRF.fit(X, y)
    print(time.time() - started, " seconds\n")
    # Convert negative-MSE scores to RMSE once, then reuse for both plots.
    train_rmse = np.sqrt(-tuneRF.cv_results_['mean_train_score'])
    test_rmse = np.sqrt(-tuneRF.cv_results_['mean_test_score'])
    plot_cv_traintestscores(train_rmse, test_rmse, n_estimators, '# of trees')
    plot_cv_testscores(test_rmse, n_estimators, '# of trees')
    print("best RMSE: ", np.sqrt(-tuneRF.best_score_))
    print("best n_estimator: ", tuneRF.best_params_)
    print(tuneRF.best_estimator_)
    return tuneRF.best_estimator_
def tune_max_features(X, y):
    """Tune one RandomForestRegressor hyperparameter: ``max_features``.

    Sweeps 11 integer values in [24, 44] for the number of features
    considered at each split (other hyperparameters fixed), plots the
    train/test RMSE curves, prints timing and the best result, and
    returns the refit best estimator.
    NOTE(review): ~23 s on the author's data per the original comment.
    """
    rf_model = ensemble.RandomForestRegressor(n_estimators=100, max_depth=14,
                                              min_samples_leaf=1,
                                              min_samples_split=2)
    max_features = np.linspace(24, 44, 11, endpoint=True, dtype=int)
    print(max_features)
    grid_para = [{'max_features': max_features}]
    tuneRF = GridSearchCV(rf_model, grid_para, cv=inner_cv, refit=True,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')
    t1 = time.time()
    tuneRF.fit(X, y)
    t2 = time.time()
    print(t2 - t1, " seconds\n")
    # Scores are negative MSE; negate and sqrt to plot/report RMSE.
    plot_cv_traintestscores(np.sqrt(-tuneRF.cv_results_['mean_train_score']),
                            np.sqrt(-tuneRF.cv_results_['mean_test_score']),
                            max_features, 'max_features')
    plot_cv_testscores(np.sqrt(-tuneRF.cv_results_['mean_test_score']),
                       max_features, 'max_features')
    print("best RMSE: ", np.sqrt(-tuneRF.best_score_))
    # BUG FIX: label previously said "best n_estimator" but this search
    # tunes max_features — the printed params are {'max_features': ...}.
    print("best max_features: ", tuneRF.best_params_)
    print(tuneRF.best_estimator_)
    return tuneRF.best_estimator_
def xgbmodelfit(alg, datamatrix, cvfolds):
    """Pick the optimal ``n_estimators`` for an XGBoost model via CV.

    Runs ``xgb.cv`` with early stopping (50 rounds) up to the model's
    current ``n_estimators`` cap, sets the model to the round count CV
    selected, refits it, prints/plots the CV RMSE trajectories and the
    top feature importances, and returns the (mutated, fitted) ``alg``.

    NOTE(review): fits and scores on module-level ``X``/``y`` rather than
    on ``datamatrix``, and uses globals ``cols``, ``plot_FI_tree`` and
    ``root_mean_squared_error`` — confirm those are in scope at call time.
    """
    xgb_param = alg.get_xgb_params()
    # Early stopping trims boosting rounds once test RMSE stops improving.
    cvresult = xgb.cv(xgb_param, datamatrix,
                      num_boost_round=alg.get_params()['n_estimators'],
                      folds=cvfolds, metrics='rmse',
                      early_stopping_rounds=50)
    n = cvresult.shape[0]  # number of rounds CV kept = optimal n_estimators
    alg.set_params(n_estimators=n)
    alg.fit(X, y)
    rmse = root_mean_squared_error(alg, X, y)
    print("optimal n_estimator is %d" % n)
    print("With optimal n_estimator, mean CV test RMSE is %.4f" % cvresult['test-rmse-mean'][n - 1])
    print("With optimal n_estimator, mean CV train RMSE is %.4f" % cvresult['train-rmse-mean'][n - 1])
    print("RMSE of xgb entire data is %.4f" % (rmse))
    rounds = list(range(n))  # x-axis for the per-round RMSE plots
    plot_cv_traintestscores(cvresult['train-rmse-mean'],
                            cvresult['test-rmse-mean'], rounds, 'n_estimators')
    # Zoomed view of rounds 50-150 where the curves typically flatten.
    plot_cv_traintestscores(cvresult['train-rmse-mean'][50:150],
                            cvresult['test-rmse-mean'][50:150],
                            rounds[50:150], 'n_estimators')
    plot_cv_testscores(cvresult['test-rmse-mean'][50:], rounds[50:],
                       'n_estimators')
    feat_imp = plot_FI_tree(alg, cols, 20)
    # BUG FIX: the original bare expression `feat_imp[0:20]` was a no-op
    # outside a notebook; print the top-20 importances explicitly.
    print(feat_imp[0:20])
    return alg
def tune_max_depth(X, y):
    """Tune one DecisionTreeRegressor hyperparameter: ``max_depth``.

    Grid-searches depths 1..25 with the module-level ``inner_cv`` folds,
    plots train/test RMSE curves, prints timing and the best result, and
    returns the refit best estimator.
    """
    tree_model = tree.DecisionTreeRegressor()
    # BUG FIX: np.linspace defaults to float64, but scikit-learn requires
    # max_depth to be an integer (float values fail parameter validation).
    # Forcing dtype=int also matches every other tuning helper in this file.
    max_depths = np.linspace(1, 25, 25, endpoint=True, dtype=int)
    grid_para = [{'max_depth': max_depths}]
    tuneDT1 = GridSearchCV(tree_model, grid_para, cv=inner_cv, refit=True,
                           return_train_score=True,
                           scoring='neg_mean_squared_error')
    t1 = time.time()
    tuneDT1.fit(X, y)
    t2 = time.time()
    print(t2 - t1, " seconds\n")
    # Scores are negative MSE; negate and sqrt to plot/report RMSE.
    plot_cv_traintestscores(np.sqrt(-tuneDT1.cv_results_['mean_train_score']),
                            np.sqrt(-tuneDT1.cv_results_['mean_test_score']),
                            max_depths, 'max_depth')
    print("best RMSE: ", np.sqrt(-tuneDT1.best_score_))
    print("best max_depth: ", tuneDT1.best_params_)
    print(tuneDT1.best_estimator_)
    return tuneDT1.best_estimator_
def tune_leaf(X, y):
    """Tune one DecisionTreeRegressor hyperparameter: ``min_samples_leaf``.

    Grid-searches leaf sizes 1..20 with the module-level ``inner_cv``
    folds, plots the train/test RMSE curves, prints timing, the winning
    parameters and the depth of the winning tree, then returns the refit
    best estimator.
    """
    regressor = tree.DecisionTreeRegressor()
    leaf_sizes = np.linspace(1, 20, 20, endpoint=True, dtype=int)
    grid_para = [{'min_samples_leaf': leaf_sizes}]
    tuneDT2 = GridSearchCV(regressor, grid_para, cv=inner_cv, refit=True,
                           return_train_score=True,
                           scoring='neg_mean_squared_error')
    started = time.time()
    tuneDT2.fit(X, y)
    print(time.time() - started, " seconds\n")
    # Convert negative-MSE scores to RMSE before plotting.
    train_rmse = np.sqrt(-tuneDT2.cv_results_['mean_train_score'])
    test_rmse = np.sqrt(-tuneDT2.cv_results_['mean_test_score'])
    plot_cv_traintestscores(train_rmse, test_rmse, leaf_sizes,
                            'min_samples_leaf')
    print("best RMSE: ", np.sqrt(-tuneDT2.best_score_))
    print("best min_samples_leaf: ", tuneDT2.best_params_)
    print("max_depth in best tree: ", tuneDT2.best_estimator_.tree_.max_depth)
    print(tuneDT2.best_estimator_)
    return tuneDT2.best_estimator_