def test_input_errors_randomized(params, expected_error_message):
    # Input validation checks that only apply to HalvingRandomSearchCV.
    # `params` is forwarded as keyword arguments to the constructor, and
    # fitting must raise a ValueError whose message matches
    # `expected_error_message`.
    estimator = FastClassifier()
    distributions = {'a': [1]}
    X, y = make_classification(100)

    search = HalvingRandomSearchCV(
        estimator=estimator, param_distributions=distributions, **params
    )
    with pytest.raises(ValueError, match=expected_error_message):
        search.fit(X, y)
def test_random_search_discrete_distributions(
    param_distributions, expected_n_candidates
):
    # When more candidates are requested (n_candidates=10) than the
    # distributions can provide, random search must sample only the
    # appropriate number of candidates. How many parameters are sampled
    # depends on whether the distributions are 'all lists' or not (see
    # ParameterSampler for details). This overlaps with the checks in
    # ParameterSampler, but interaction bugs were discovered during the
    # development of successive halving, hence this dedicated test.
    X, y = make_classification(n_samples=1024, random_state=0)
    estimator = FastClassifier()

    search = HalvingRandomSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_candidates=10,
    )
    search.fit(X, y)

    # Only the number of candidates of the first iteration is checked.
    n_first_iteration = search.n_candidates_[0]
    assert n_first_iteration == expected_n_candidates
def test_random_search(max_resources, n_candidates, expected_n_candidates):
    # Run a randomized halving search over two `norm` distributions and make
    # sure the number of candidates generated for the first iteration is as
    # expected.
    X, y = make_classification(n_samples=1024, random_state=0)
    distributions = {'a': norm, 'b': norm}
    estimator = FastClassifier()

    search = HalvingRandomSearchCV(
        estimator=estimator,
        param_distributions=distributions,
        n_candidates=n_candidates,
        cv=2,
        max_resources=max_resources,
        factor=2,
        min_resources=4,
    )
    search.fit(X, y)

    assert search.n_candidates_[0] == expected_n_candidates

    if n_candidates != 'exhaust':
        return
    # With 'exhaust', the last iteration must use as many resources as we
    # can.
    assert search.n_resources_[-1] == max_resources
max_depth=[2, 5, 10], min_samples_leaf=[1, 5, 10, 20], min_samples_split=[5, 10, 20, 30, 50], ) alpha = 0.05 neg_mean_pinball_loss_05p_scorer = make_scorer( mean_pinball_loss, alpha=alpha, greater_is_better=False, # maximize the negative loss ) gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0) search_05p = HalvingRandomSearchCV( gbr, param_grid, resource="n_estimators", max_resources=250, min_resources=50, scoring=neg_mean_pinball_loss_05p_scorer, n_jobs=2, random_state=0, ).fit(X_train, y_train) pprint(search_05p.best_params_) # %% # We observe that the hyper-parameters that were hand-tuned for the median # regressor are in the same range as the hyper-parameters suitable for the 5th # percentile regressor. # # Let's now tune the hyper-parameters for the 95th percentile regressor. We # need to redefine the `scoring` metric used to select the best model, along # with adjusting the alpha parameter of the inner gradient boosting estimator # itself:
# One RandomState instance is handed to the dataset generator, the forest and
# the search below, so the statements are kept in their original order.
rng = np.random.RandomState(0)

X, y = datasets.make_classification(n_samples=700, random_state=rng)

clf = RandomForestClassifier(n_estimators=20, random_state=rng)

# Lists are sampled uniformly; `randint` objects are sampled as distributions.
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 11),
    "min_samples_split": randint(2, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

rsh = HalvingRandomSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    factor=2,
    random_state=rng,
)
rsh.fit(X, y)

# %%
# We can now use the `cv_results_` attribute of the search estimator to inspect
# and plot the evolution of the search.

results = pd.DataFrame(rsh.cv_results_)
# Stringify the parameter dicts so they can serve as column labels.
results['params_str'] = results['params'].apply(str)
# Keep a single row per (candidate, iteration) pair before pivoting.
results = results.drop_duplicates(subset=('params_str', 'iter'))
mean_scores = results.pivot(
    index='iter',
    columns='params_str',
    values='mean_test_score',
)
ax = mean_scores.plot(legend=False, alpha=.6)
n_iter=5, scoring='roc_auc', n_jobs=-1) clf = clf.fit(train_features, train_labels) # Score model score_randomized = roc_auc_score(test_labels, clf.predict_proba(test_features)[:, 1]) print(f'ROC AUC Score for RandomizedSearchCV model: {score_randomized}') print(clf.best_params_) # Fit rf model with HalvingRandomSearchCV clf_halving = HalvingRandomSearchCV(pipe, param_grid, cv=cv, verbose=1, scoring='roc_auc', n_jobs=-1, aggressive_elimination=True, factor=2, min_resources=20) clf_halving = clf_halving.fit(train_features, train_labels) # Score model score_halving = roc_auc_score(test_labels, clf_halving.predict_proba(test_features)[:, 1]) print(f'ROC AUC Score for HalvingRandomSearchCV model: {score_halving}') print(clf_halving.best_params_) print(f'ROC AUC Score for out of the box model: {score_rf}') print(f'ROC AUC Score for RandomizedSearchCV model: {score_randomized}') print(f'ROC AUC Score for HalvingRandomSearchCV model: {score_halving}')
search_multi.fit(X_train, y_train) time_end = timeit.default_timer() time_elapsed = time_end - time_start print('Execution time (hour:min:sec): {}'.format( str(dt.timedelta(seconds=time_elapsed)))) print('Best parameter (CV score = {:.3f}):'.format( search_multi.best_score_)) print(search_multi.best_params_) elif search_type == 'HalvingRandomSearchCV': # Bandit-based successive halving strategy. time_start = timeit.default_timer() search_multi = HalvingRandomSearchCV(estimator=pipe, param_distributions=param_dists, cv=TimeSeriesSplit(n_splits=3), scoring='neg_mean_squared_error', factor=2, refit=True, n_jobs=NJOBS) search_multi.fit(X_train, y_train) time_end = timeit.default_timer() time_elapsed = time_end - time_start print('Execution time (hour:min:sec): {}'.format( str(dt.timedelta(seconds=time_elapsed)))) print('Best parameter (CV score = {:.3f}):'.format( search_multi.best_score_)) print(search_multi.best_params_) else: raise NotImplementedError('Search method "{}" is not recognized ' 'or implemented!'.format(search_type))
def search_best_rf(self, n_trees=2500, saveStats=True):
    """
    Search for the best Random Forest model.

    Runs a HalvingRandomSearchCV over ``max_features``, ``max_depth`` and
    ``min_impurity_decrease`` of a RandomForestRegressor, using
    ``self.feat_tsf_dataset`` as features and ``self.labels_dataset`` as
    targets, then persists the outcome of the search.

    Parameters
    ----------
    n_trees : int, default=2500
        Value passed as ``n_estimators`` to the RandomForestRegressor.
    saveStats : bool, default=True
        When true, the cross-validation results are written to
        'data/cv_hyperparams_model.csv'.

    Returns
    -------
    str
        Message reporting the elapsed time in minutes with one decimal.

    Notes
    -----
    Side effects: the best estimator is dumped to 'model_params.joblib' and
    the best parameters are written as JSON text to the file 'model_params'.
    """
    import os

    # Process time
    start = time.time()

    # Datasets
    feat_tsf = self.feat_tsf_dataset
    labels = self.labels_dataset

    # max_features candidates: fractions starting at 0.20 in steps of 0.01
    # (rounded to hide float noise from np.arange), plus 'sqrt' and 1.0.
    max_features_list = [
        round(elem, 2) for elem in np.arange(0.20, 0.66, 0.01).tolist()
    ]
    max_features_list.append('sqrt')
    # 1.0 means "all features", which is what 'auto' meant for regressors;
    # the 'auto' string itself was removed in scikit-learn 1.3.
    max_features_list.append(1.0)

    # max_depth candidates: from a quarter of the largest value found in
    # self.depth_of_trees up to (but excluding) that value, plus None.
    max_n_trees = self.depth_of_trees.max()[0]
    max_depth_list = np.arange(int(max_n_trees / 4), max_n_trees, 1).tolist()
    max_depth_list.append(None)

    # min_impurity_decrease candidates in steps of 0.01, starting at 0.01
    min_impurity_decrease_list = [
        round(elem, 2) for elem in np.arange(0.01, 0.26, 0.01).tolist()
    ]

    param_grid = {
        "max_features": max_features_list,
        "max_depth": max_depth_list,
        "min_impurity_decrease": min_impurity_decrease_list
    }

    # RF model to test
    rf = RandomForestRegressor(bootstrap=True,
                               oob_score=True,
                               n_estimators=n_trees,
                               random_state=7)

    # Define and execute the search
    grid_cv = HalvingRandomSearchCV(estimator=rf,
                                    param_distributions=param_grid,
                                    random_state=7,
                                    max_resources='auto',
                                    verbose=3).fit(feat_tsf, labels)

    df_results = pd.DataFrame(grid_cv.cv_results_)

    # Save CV results
    if saveStats:
        # Create the target directory first so a finished search is not lost
        # to a missing 'data' folder.
        os.makedirs('data', exist_ok=True)
        df_results.to_csv('data/cv_hyperparams_model.csv')

    print("Best Params:")
    print(grid_cv.best_params_)

    print("Saving model in 'model_params.joblib'")
    # Write the joblib file with the best model
    dump(grid_cv.best_estimator_, 'model_params.joblib')

    # Save the parameters of the best model as JSON.
    # NOTE(review): the previous docstring mentioned 'model_params_rf.json',
    # but the code has always written to 'model_params' -- confirm which
    # name the consumers of this file expect before renaming it.
    json_txt = json.dumps(grid_cv.best_params_, indent=4)
    with open('model_params', 'w') as file:
        file.write(json_txt)

    # End time
    end = time.time()
    time_elapsed = round((end - start) / 60, 1)

    # '%.1f' keeps the decimal computed above ('%1.f' rounded it away).
    return 'Time elapsed minutes: %.1f' % time_elapsed
_ = halving_cv.fit(X, y)

# Deal with class imbalance: weight the "Yes" class by the ratio of "No"
# labels to "Yes" labels found in y.
counts = pd.Series(y.flatten()).value_counts()
scale_pos_weight = counts["No"] / counts["Yes"]

# Search space for the randomized halving search.
param_grid_2 = {
    "max_depth": [3, 4, 5],
    "gamma": [5, 30, 50],
    "learning_rate": [0.01, 0.1, 0.3, 0.5],
    "min_child_weight": [1, 3, 5],
    "reg_lambda": [50, 100, 300],
    # Single-element list: scale_pos_weight stays fixed for every candidate.
    "scale_pos_weight": [scale_pos_weight],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
}

from sklearn.model_selection import HalvingRandomSearchCV

halving_random_cv = HalvingRandomSearchCV(
    estimator=xgb_cl,
    param_distributions=param_grid_2,
    scoring="roc_auc",
    n_jobs=-1,
    n_candidates="exhaust",
    factor=4,
)
_ = halving_random_cv.fit(X, y)
if do_search == "halving": distributions = dict( lr=expon(1e-2), sampler_lr=expon(1e-1), sampler=["mala", "langevin", "tempered mala", "tempered langevin"], weight_decay=expon(1e-3), # max_iter=poisson(30), replay_prob=beta(a=9, b=1), adversary_weight=beta(a=1, b=1), num_units=poisson(32), num_layers=poisson(3), max_replay=poisson(10), ) clf_cv = HalvingRandomSearchCV(clf, distributions, random_state=0, n_jobs=5, resource="max_iter", max_resources=max_resources) search = clf_cv.fit(X.values) clf = clf_cv.best_estimator_ elif do_search == "bohb": distributions = CS.ConfigurationSpace(seed=42) distributions.add_hyperparameter( CSH.UniformFloatHyperparameter("lr", 1e-4, 3e-1, log=True, default_value=6e-3)) distributions.add_hyperparameter( CSH.UniformFloatHyperparameter("sampler_lr", 1e-4,