def test_multi_best_regression(self):
    """Check multi-metric best-result bookkeeping on a regression task.

    For every search backend, fit a ``TuneSearchCV`` scored with two
    metrics and refit on ``neg_mean_absolute_error``. Then assert that
    ``best_score_`` equals the largest
    ``mean_test_neg_mean_absolute_error`` entry in ``cv_results_`` and that
    ``best_params_`` is the parameter set stored next to that entry.
    """
    x, y = make_regression(n_samples=100, n_features=10, n_informative=5)
    model = SGDRegressor()
    parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
    scoring = ("neg_mean_absolute_error", "neg_mean_squared_error")
    search_methods = ["random", "bayesian", "hyperopt", "bohb"]
    for search_method in search_methods:
        # subTest labels a failure with the backend that produced it and
        # lets the remaining backends run instead of aborting the loop.
        with self.subTest(search_optimization=search_method):
            tune_search = TuneSearchCV(
                model,
                parameter_grid,
                scoring=scoring,
                search_optimization=search_method,
                cv=2,
                n_trials=3,
                n_jobs=1,
                refit="neg_mean_absolute_error")
            tune_search.fit(x, y)

            scores = tune_search.cv_results_[
                "mean_test_neg_mean_absolute_error"]
            self.assertAlmostEqual(
                tune_search.best_score_, max(scores), places=10)

            p = tune_search.cv_results_["params"]
            # Pair each mean score with its params and keep the params of
            # the highest-scoring pair (first one wins on ties).
            cv_best_param = max(
                zip(scores, p), key=lambda pair: pair[0])[1]
            self.assertEqual(tune_search.best_params_, cv_best_param)
def test_multi_best_classification_scoring_dict(self):
    """Check multi-metric best-result bookkeeping with a scoring dict.

    For every search backend, fit a ``TuneSearchCV`` on the digits data
    with ``scoring`` given as a name-to-scorer dict and ``refit="acc"``.
    Then assert that ``best_score_`` equals the largest ``mean_test_acc``
    entry in ``cv_results_`` and that ``best_params_`` is the parameter set
    stored next to that entry.
    """
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    model = SGDClassifier()
    parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
    scoring = {"acc": "accuracy", "f1": "f1_micro"}
    search_methods = ["random", "bayesian", "hyperopt", "bohb"]
    for search_method in search_methods:
        # subTest labels a failure with the backend that produced it and
        # lets the remaining backends run instead of aborting the loop.
        with self.subTest(search_optimization=search_method):
            tune_search = TuneSearchCV(
                model,
                parameter_grid,
                scoring=scoring,
                search_optimization=search_method,
                cv=2,
                n_trials=3,
                n_jobs=1,
                refit="acc")
            tune_search.fit(x, y)

            scores = tune_search.cv_results_["mean_test_acc"]
            self.assertAlmostEqual(
                tune_search.best_score_, max(scores), places=10)

            p = tune_search.cv_results_["params"]
            # Pair each mean score with its params and keep the params of
            # the highest-scoring pair (first one wins on ties).
            cv_best_param = max(
                zip(scores, p), key=lambda pair: pair[0])[1]
            self.assertEqual(tune_search.best_params_, cv_best_param)
def test_multi_best(self):
    """Check best-result bookkeeping for a two-metric search.

    Fits ``TuneSearchCV`` on the digits data with ``refit="accuracy"`` and
    asserts that ``best_score_`` matches the maximum of
    ``cv_results_["mean_test_accuracy"]`` and that ``best_params_`` is the
    parameter set recorded with that maximum.
    """
    digits = datasets.load_digits()
    features, labels = digits.data, digits.target

    grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
    metrics = ("accuracy", "f1_micro")
    search = TuneSearchCV(
        SGDClassifier(),
        grid,
        scoring=metrics,
        max_iters=20,
        refit="accuracy")
    search.fit(features, labels)

    self.assertAlmostEqual(
        search.best_score_,
        max(search.cv_results_["mean_test_accuracy"]),
        places=10)

    candidates = search.cv_results_["params"]
    accuracy_means = search.cv_results_["mean_test_accuracy"]
    # Highest mean accuracy wins; on ties the earliest candidate is kept.
    _, expected_params = max(
        zip(accuracy_means, candidates), key=lambda pair: pair[0])
    self.assertEqual(search.best_params_, expected_params)
def test_multi_refit_false(self):
    """Check that best-result attributes raise when ``refit=False``.

    After fitting a two-metric ``TuneSearchCV`` with ``refit=False``,
    reading ``best_score_``, ``best_index_`` or ``best_params_`` must raise
    a ``ValueError`` whose message contains the refit explanation.
    """
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    model = SGDClassifier()
    parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
    scoring = ("accuracy", "f1_micro")
    tune_search = TuneSearchCV(
        model,
        parameter_grid,
        scoring=scoring,
        search_optimization="random",
        cv=2,
        n_trials=3,
        n_jobs=1,
        refit=False)
    tune_search.fit(x, y)

    expected_fragment = ("instance was initialized with refit=False. "
                         "For multi-metric evaluation,")
    for attr_name in ("best_score_", "best_index_", "best_params_"):
        # subTest names the attribute in the report if a check fails.
        with self.subTest(attribute=attr_name):
            with self.assertRaises(ValueError) as exc:
                getattr(tune_search, attr_name)
            # assertIn prints both strings on failure, unlike
            # assertTrue(a in b) which only reports "False is not true".
            self.assertIn(expected_fragment, str(exc.exception))
def test_warn_early_stop(self):
    """Check the ``max_iters = 1`` warning when early stopping is requested.

    Constructing ``TuneSearchCV`` with ``early_stopping=True`` and no
    ``max_iters`` argument is expected to emit a ``UserWarning`` matching
    ``"max_iters = 1"`` for both estimators tried here.
    """
    cases = (
        (LogisticRegression, {"C": [1, 2]}),
        (SGDClassifier, {"epsilon": [0.1, 0.2]}),
    )
    for estimator_cls, grid in cases:
        with self.assertWarnsRegex(UserWarning, "max_iters = 1"):
            # The estimator is built inside the context, as each original
            # call did.
            TuneSearchCV(estimator_cls(), grid, early_stopping=True)
def test_pipeline_early_stop(self):
    """Check early stopping on a ``Pipeline`` estimator.

    With ``pipeline_auto_early_stop=False`` the constructor must raise a
    ``ValueError`` carrying the "not supported" message. Without that flag
    the same pipeline is constructed and fitted on the digits data.
    """
    digits = datasets.load_digits()
    x, y = digits.data, digits.target

    steps = [("reduce_dim", PCA()), ("classify", SGDClassifier())]
    pipe = Pipeline(steps)
    parameter_grid = [
        {
            "classify__alpha": [1e-4, 1e-1, 1],
            "classify__epsilon": [0.01, 0.1]
        },
    ]

    unsupported_message = (
        "Early stopping is not supported because the estimator does "
        "not have `partial_fit`, does not support warm_start, or "
        "is a tree classifier. Set `early_stopping=False`."
    )
    with self.assertRaises(ValueError) as exc:
        TuneSearchCV(
            pipe,
            parameter_grid,
            early_stopping=True,
            pipeline_auto_early_stop=False,
            max_iters=10)
    self.assertTrue(unsupported_message in str(exc.exception))

    # Same pipeline without the explicit opt-out: build and fit it.
    tune_search = TuneSearchCV(
        pipe, parameter_grid, early_stopping=True, max_iters=10)
    tune_search.fit(x, y)
def test_plateau(self):
    """Check that a ``TrialPlateauStopper`` ends trials early.

    The test is skipped when ``ray.tune.stopper.TrialPlateauStopper``
    cannot be imported. Otherwise it fits a search over a
    ``PlateauClassifier(converge_after=4)`` with ``max_iters=20`` and
    asserts that every trial reports at most 8 training iterations.
    """
    try:
        from ray.tune.stopper import TrialPlateauStopper
    except ImportError:
        # skipTest raises unittest.SkipTest, so execution never continues
        # past this call and no explicit return is needed.
        self.skipTest("`TrialPlateauStopper` not available in "
                      "current Ray version.")

    X, y = make_classification(
        n_samples=50, n_features=50, n_informative=3, random_state=0)

    clf = PlateauClassifier(converge_after=4)
    stopper = TrialPlateauStopper(metric="objective")

    search = TuneSearchCV(
        clf, {"foo_param": [2.0, 3.0, 4.0]},
        cv=2,
        max_iters=20,
        stopper=stopper,
        early_stopping=True)
    search.fit(X, y)

    print(search.cv_results_)

    for iters in search.cv_results_["training_iteration"]:
        # Converges after 4 iterations, but the stopper needs another
        # 4 to detect it converged.
        self.assertLessEqual(iters, 8)
def sweep(
    self,
    params: Dict,
    X,
    y,
    search_algorithm: str = "bayesian",
    num_trials: int = 3,
    scoring_func: str = "r2",
):
    """Run an early-stopping ``TuneSearchCV`` sweep over ``self.model``.

    Args:
        params: Search space handed to ``TuneSearchCV`` as its second
            positional argument.
        X: Training inputs; converted with ``torch.tensor(...).float()``
            and moved to ``self.device``.
        y: Training targets; converted the same way as ``X``.
        search_algorithm: Forwarded as ``search_optimization``.
        num_trials: Forwarded as ``n_trials``.
        scoring_func: Forwarded as ``scoring``.

    Returns:
        The ``TuneSearchCV`` instance after ``fit(X, y)`` has been called.
    """
    # Imported lazily so tune_sklearn is only needed when a sweep runs.
    from tune_sklearn import TuneSearchCV

    X, y = (
        torch.tensor(X).float().to(device=self.device),
        torch.tensor(y).float().to(device=self.device),
    )
    tune_search = TuneSearchCV(
        self.model,
        params,
        search_optimization=search_algorithm,
        n_trials=num_trials,
        early_stopping=True,
        scoring=scoring_func,
    )
    tune_search.fit(X, y)
    return tune_search
def test_warm_start_detection(self):
    """Check the ``early_stop_type`` chosen for several estimators.

    Without ``early_stopping`` the voting, decision-tree and logistic
    regression searches must all report ``EarlyStopping.NO_EARLY_STOP``.
    With ``early_stopping=True`` logistic regression must report
    ``WARM_START_ITER`` and a random forest ``WARM_START_ENSEMBLE``.
    """
    from sklearn.ensemble import VotingClassifier, RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.linear_model import LogisticRegression

    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    # Keyword arguments shared by every search constructed below.
    shared = dict(n_jobs=1, max_iters=10, local_dir="./test-result")

    voting = VotingClassifier(estimators=[(
        "rf", RandomForestClassifier(n_estimators=50, random_state=0))])
    voting_search = TuneSearchCV(voting, parameter_grid, **shared)
    self.assertEqual(voting_search.early_stop_type,
                     EarlyStopping.NO_EARLY_STOP)

    tree = DecisionTreeClassifier(random_state=0)
    tree_search = TuneSearchCV(tree, parameter_grid, **shared)
    self.assertEqual(tree_search.early_stop_type,
                     EarlyStopping.NO_EARLY_STOP)

    logistic = LogisticRegression()
    logistic_search = TuneSearchCV(logistic, parameter_grid, **shared)
    self.assertEqual(logistic_search.early_stop_type,
                     EarlyStopping.NO_EARLY_STOP)

    logistic_early = TuneSearchCV(
        logistic, parameter_grid, early_stopping=True, **shared)
    self.assertEqual(logistic_early.early_stop_type,
                     EarlyStopping.WARM_START_ITER)

    forest = RandomForestClassifier()
    forest_early = TuneSearchCV(
        forest, parameter_grid, early_stopping=True, **shared)
    self.assertEqual(forest_early.early_stop_type,
                     EarlyStopping.WARM_START_ENSEMBLE)
def test_early_stop_catboost_warn(self):
    """Check the warnings emitted for CatBoost with early stopping.

    ``max_iters=10`` is expected to produce a ``UserWarning`` matching
    ``"Catboost"``; ``max_iters=1`` one matching ``"max_iters"``.
    """
    from catboost import CatBoostClassifier

    expectations = (("Catboost", 10), ("max_iters", 1))
    for pattern, iteration_cap in expectations:
        with self.assertWarnsRegex(UserWarning, pattern):
            TuneSearchCV(
                CatBoostClassifier(), {"learning_rate": [0.1, 0.5]},
                early_stopping=True,
                max_iters=iteration_cap)
def test_early_stop_lightgbm_warn(self):
    """Check the warnings emitted for LightGBM with early stopping.

    ``max_iters=10`` is expected to produce a ``UserWarning`` matching
    ``"lightgbm"``; ``max_iters=1`` one matching ``"max_iters"``.
    """
    from lightgbm import LGBMClassifier

    expectations = (("lightgbm", 10), ("max_iters", 1))
    for pattern, iteration_cap in expectations:
        with self.assertWarnsRegex(UserWarning, pattern):
            TuneSearchCV(
                LGBMClassifier(), {"learning_rate": [0.1, 0.5]},
                early_stopping=True,
                max_iters=iteration_cap)
def test_early_stop_xgboost_warn(self):
    """Check the warnings emitted for XGBoost with early stopping.

    ``max_iters=10`` is expected to produce a ``UserWarning`` matching
    ``"github.com"``; ``max_iters=1`` one matching ``"max_iters"``.
    """
    from xgboost.sklearn import XGBClassifier

    expectations = (("github.com", 10), ("max_iters", 1))
    for pattern, iteration_cap in expectations:
        with self.assertWarnsRegex(UserWarning, pattern):
            TuneSearchCV(
                XGBClassifier(), {"C": [1, 2]},
                early_stopping=True,
                max_iters=iteration_cap)
def test_warm_start_detection(self):
    """Check ``_can_early_stop()`` for three estimator types.

    The random-forest and decision-tree searches must report ``False``;
    the logistic-regression search must report ``True``.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.linear_model import LogisticRegression

    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    # Keyword arguments shared by every search constructed below.
    shared = dict(n_jobs=1, max_iters=10, local_dir="./test-result")

    forest = RandomForestClassifier(max_depth=2, random_state=0)
    forest_search = TuneSearchCV(forest, parameter_grid, **shared)
    self.assertFalse(forest_search._can_early_stop())

    tree = DecisionTreeClassifier(random_state=0)
    tree_search = TuneSearchCV(tree, parameter_grid, **shared)
    self.assertFalse(tree_search._can_early_stop())

    logistic = LogisticRegression()
    logistic_search = TuneSearchCV(logistic, parameter_grid, **shared)
    self.assertTrue(logistic_search._can_early_stop())
def tune_remote(x_train, y_train):
    """Fit a three-trial ``TuneSearchCV`` over a random forest.

    Args:
        x_train: Training inputs passed to ``fit``.
        y_train: Training targets passed to ``fit``.

    Returns:
        The ``TuneSearchCV`` instance after fitting.
    """
    forest = RandomForestClassifier()
    search_space = {
        "n_estimators": randint(20, 80),
        "max_depth": randint(2, 10)
    }
    searcher = TuneSearchCV(forest, search_space, n_trials=3)
    searcher.fit(x_train, y_train)
    return searcher
def test_trivial_cv_results_attr(self):
    """Check ``cv_results_`` exists after searching a one-point space.

    Non-regression test: ``grid_scores_`` wouldn't be set by
    ``dcv.GridSearchCV`` when the "grid" held a single point. Both the
    grid search and the randomized search are fitted and checked.
    """
    clf = MockClassifier()

    single_point_grid = TuneGridSearchCV(clf, {"foo_param": [1]}, cv=3)
    single_point_grid.fit(X, y)
    self.assertTrue(hasattr(single_point_grid, "cv_results_"))

    single_point_random = TuneSearchCV(
        clf, {"foo_param": [0]}, n_iter=1, cv=3)
    single_point_random.fit(X, y)
    self.assertTrue(hasattr(single_point_random, "cv_results_"))
def test_warn_reduce_maxiters(self):
    """Check the ``max_iters is set`` warning without early stopping.

    Passing ``max_iters=10`` without ``early_stopping`` is expected to emit
    a ``UserWarning`` matching ``"max_iters is set"`` for both a random
    forest and an ``SGDClassifier``.
    """
    from sklearn.ensemble import RandomForestClassifier

    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    shared = dict(max_iters=10, local_dir="./test-result")
    forest = RandomForestClassifier(max_depth=2, random_state=0)

    with self.assertWarnsRegex(UserWarning, "max_iters is set"):
        TuneSearchCV(forest, parameter_grid, **shared)

    with self.assertWarnsRegex(UserWarning, "max_iters is set"):
        TuneSearchCV(SGDClassifier(), parameter_grid, **shared)
def sweep(self, params: Dict, X, y):
    """Run a three-trial ``TuneSearchCV`` sweep over ``self.model``.

    Args:
        params: Search space, passed as ``param_distributions``.
        X: Training inputs passed to ``fit``.
        y: Training targets passed to ``fit``.

    Returns:
        The ``TuneSearchCV`` instance after fitting.
    """
    # early_stopping and use_gpu are not passed (both were commented out),
    # so TuneSearchCV uses its defaults for them.
    searcher = TuneSearchCV(
        self.model,
        param_distributions=params,
        n_trials=3,
    )
    searcher.fit(X, y)
    return searcher
def sweep(self, X, y, params: Dict = None):
    """Run a three-trial ``TuneSearchCV`` sweep over ``self.model``.

    Args:
        X: Training inputs passed to ``fit``.
        y: Training targets passed to ``fit``.
        params: Search space, passed as ``param_distributions``. Must be
            non-empty.

    Returns:
        The ``TuneSearchCV`` instance after fitting.

    Raises:
        NotImplementedError: If ``params`` is ``None`` or otherwise falsy.
    """
    if not params:
        # No default search space is provided for a missing/empty one.
        raise NotImplementedError

    # early_stopping and use_gpu are not passed (both were commented out),
    # so TuneSearchCV uses its defaults for them.
    searcher = TuneSearchCV(
        self.model,
        param_distributions=params,
        n_trials=3,
    )
    searcher.fit(X, y)
    return searcher
def test_warm_start_error(self):
    """Check early stopping is rejected for a random forest.

    With ``early_stopping=False`` the search is constructed and
    ``_can_early_stop()`` must be ``False``; with ``early_stopping=True``
    the constructor must raise ``ValueError``.
    """
    from sklearn.ensemble import RandomForestClassifier

    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    forest = RandomForestClassifier(max_depth=2, random_state=0)
    shared = dict(n_jobs=1, max_iters=10, local_dir="./test-result")

    no_early_stop = TuneSearchCV(
        forest, parameter_grid, early_stopping=False, **shared)
    self.assertFalse(no_early_stop._can_early_stop())

    with self.assertRaises(ValueError):
        TuneSearchCV(forest, parameter_grid, early_stopping=True, **shared)
def test_early_stop_xgboost_pipeline(self):
    """Construct an early-stopping search over an XGBoost pipeline.

    The test makes no assertions; it passes as long as the constructor
    does not raise.
    """
    from xgboost.sklearn import XGBClassifier
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([("model", XGBClassifier())])
    grid = {"model__C": [1, 2]}
    TuneSearchCV(
        pipeline,
        grid,
        early_stopping=True,
        pipeline_auto_early_stop=True,
        cv=2,
        n_trials=2,
        max_iters=10)
def test_early_stop_lightgbm_pipeline(self):
    """Construct an early-stopping search over a LightGBM pipeline.

    The test makes no assertions; it passes as long as the constructor
    does not raise.
    """
    from lightgbm import LGBMClassifier
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([("model", LGBMClassifier())])
    grid = {"model__learning_rate": [0.1, 0.5]}
    TuneSearchCV(
        pipeline,
        grid,
        early_stopping=True,
        pipeline_auto_early_stop=True,
        cv=2,
        n_trials=2,
        max_iters=10)
def test_local_mode(self):
    """Check that fitting with ``n_jobs=1`` calls ``ray.init`` in local mode.

    ``ray.init`` is wrapped (still executed) so its call arguments can be
    inspected; the ``local_mode`` keyword of the recorded call must be
    truthy.
    """
    import ray

    digits = datasets.load_digits()
    x, y = digits.data, digits.target

    parameter_grid = {
        "alpha": Real(1e-4, 1e-1, 1),
        "epsilon": Real(0.01, 0.1)
    }
    tune_search = TuneSearchCV(
        SGDClassifier(),
        parameter_grid,
        n_jobs=1,
        max_iters=10,
        local_dir="./test-result")

    with patch.object(ray, "init", wraps=ray.init) as wrapped_init:
        tune_search.fit(x, y)

    # call_args is an (args, kwargs) pair for the most recent call.
    init_kwargs = wrapped_init.call_args[1]
    self.assertTrue(init_kwargs["local_mode"])
def test_early_stop_catboost_pipeline(self):
    """Construct an early-stopping search over a CatBoost pipeline.

    The test makes no assertions; it passes as long as the constructor
    does not raise.
    """
    from catboost import CatBoostClassifier
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([("model", CatBoostClassifier())])
    grid = {"model__learning_rate": [0.1, 0.5]}
    TuneSearchCV(
        pipeline,
        grid,
        early_stopping=True,
        pipeline_auto_early_stop=True,
        cv=2,
        n_trials=2,
        max_iters=10)
def test_max_iters(self):
    """Check that no trial exceeds ``max_iters`` training iterations.

    A ``PlateauClassifier(converge_after=20)`` is searched with
    ``max_iters=6``; every ``training_iteration`` entry in ``cv_results_``
    must be at most 6.
    """
    X, y = make_classification(
        n_samples=50, n_features=50, n_informative=3, random_state=0)

    estimator = PlateauClassifier(converge_after=20)
    grid = {"foo_param": [2.0, 3.0, 4.0]}
    search = TuneSearchCV(
        estimator, grid, cv=2, max_iters=6, early_stopping=True)
    search.fit(X, y)

    print(search.cv_results_)

    iteration_counts = search.cv_results_["training_iteration"]
    for count in iteration_counts:
        # Stop after 6 iterations.
        self.assertLessEqual(count, 6)
def test_local_dir(self):
    """Check that fitting leaves something in ``local_dir``.

    Fits a search with a ``MedianStoppingRule`` scheduler and
    ``local_dir="./test-result"``, then asserts that directory is not
    empty.
    """
    digits = datasets.load_digits()
    x, y = digits.data, digits.target

    estimator = SGDClassifier()
    parameter_grid = {
        "alpha": Real(1e-4, 1e-1, 1),
        "epsilon": Real(0.01, 0.1)
    }
    scheduler = MedianStoppingRule(grace_period=10.0)

    tune_search = TuneSearchCV(
        estimator,
        parameter_grid,
        early_stopping=scheduler,
        max_iters=10,
        local_dir="./test-result")
    tune_search.fit(x, y)

    entries = os.listdir("./test-result")
    self.assertTrue(len(entries) != 0)
def _test_method(self, search_method):
    """Fit ``self.clf`` with one search backend and range-check the winner.

    Args:
        search_method: Value forwarded as ``search_optimization``.

    The refitted ``best_estimator_`` must have ``alpha`` in
    ``[1e-4, 0.5]``, ``epsilon`` in ``[0.01, 0.05]`` and ``penalty`` equal
    to ``"elasticnet"`` or ``"l1"``.
    """
    digits = datasets.load_digits()
    x, y = digits.data, digits.target

    tune_search = TuneSearchCV(
        self.clf,
        self.parameter_grid,
        search_optimization=search_method,
        cv=2,
        n_trials=3,
        n_jobs=1,
        refit=True)
    tune_search.fit(x, y)

    params = tune_search.best_estimator_.get_params()
    # Echo only the hyperparameters this test cares about.
    tuned_keys = ("alpha", "epsilon", "penalty")
    print({k: v for k, v in params.items() if k in tuned_keys})

    alpha_in_range = 1e-4 <= params["alpha"] <= 0.5
    self.assertTrue(alpha_in_range)
    epsilon_in_range = 0.01 <= params["epsilon"] <= 0.05
    self.assertTrue(epsilon_in_range)
    self.assertTrue(params["penalty"] in ("elasticnet", "l1"))
def tune_ray(clf, params, X_train, y_train, X_test, y_test, n_params=-1,
             max_epochs=-1, n_jobs=4):
    """Time an early-stopping ``TuneSearchCV`` fit on a cloned estimator.

    Args:
        clf: Estimator to clone; the clone gets ``prefix="ray"`` via
            ``set_params``.
        params: Search space handed to ``TuneSearchCV``.
        X_train: Training inputs passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_test: Accepted but not used by this function.
        y_test: Accepted but not used by this function.
        n_params: Forwarded as ``n_iter`` and recorded in the result.
        max_epochs: Forwarded as ``max_iters`` and recorded in the result.
        n_jobs: Only recorded in the result; it is not passed to the
            search.

    Returns:
        A ``(search, data)`` pair: the fitted ``TuneSearchCV`` and a dict
        with the library name, fit duration, start timestamp and the
        ``n_params`` / ``n_jobs`` / ``max_epochs`` values.
    """
    # Single 80/20 shuffle split with a fixed seed.
    split = ShuffleSplit(test_size=0.20, n_splits=1, random_state=42)

    clf = clone(clf).set_params(prefix="ray")

    from tune_sklearn import TuneSearchCV

    search = TuneSearchCV(
        clf,
        params,
        cv=split,
        early_stopping=True,
        max_iters=max_epochs,
        n_iter=n_params,
        random_state=42,
        refit=False,
    )

    start = time()
    search.fit(X_train, y_train)
    fit_time = time() - start

    data = {
        "library": "ray",
        "fit_time": fit_time,
        "start_time": start,
        "n_params": n_params,
        "n_jobs": n_jobs,
        "max_epochs": max_epochs,
    }
    return search, data
def test_timeout(self):
    """Check that ``time_budget_s`` bounds the wall-clock time of ``fit``.

    Fits a search over ``SleepClassifier`` with a 5 second budget and
    asserts the whole fit takes less than 25 seconds.
    """
    X, y = make_classification(
        n_samples=50, n_features=50, n_informative=3, random_state=0)

    estimator = SleepClassifier()
    # SleepClassifier sleeps for `foo_param` seconds, `cv` times.
    # Thus, the time budget is exhausted after testing the first two
    # `foo_param`s.
    grid = {"foo_param": [1.1, 1.2, 2.5]}
    search = TuneSearchCV(
        estimator,
        grid,
        time_budget_s=5.0,
        cv=2,
        max_iters=5,
        early_stopping=True)

    started_at = time.time()
    search.fit(X, y)
    elapsed = time.time() - started_at

    print(search)
    # Without timeout we would need over 50 seconds for this to
    # finish. Allow for some initialization overhead
    self.assertLess(elapsed, 25.0)
def _test_seed_run(self, search_optimization, seed):
    """Check that two identically seeded searches sample the same params.

    Args:
        search_optimization: Backend name forwarded to ``TuneSearchCV``.
        seed: Value used as ``random_state``. A ``str`` is converted to a
            new ``np.random.RandomState(int(seed))`` for each run; any
            other value is passed through unchanged.

    Two searches are built and fitted one after the other, and their
    ``cv_results_["params"]`` sequences must be equal. On mismatch the
    seeds of both searches are printed before the failure is re-raised.
    """
    digits = datasets.load_digits()
    x, y = digits.data, digits.target

    parameters = {
        "classify__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1],
        "classify__epsilon": [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
    }
    pipe = Pipeline([("reduce_dim", PCA()), ("classify", SGDClassifier())])

    def run_search():
        # A string seed gets a brand-new RandomState per run so both runs
        # are constructed from the same integer seed.
        if isinstance(seed, str):
            random_state = np.random.RandomState(seed=int(seed))
        else:
            random_state = seed
        search = TuneSearchCV(
            pipe,
            parameters.copy(),
            early_stopping=True,
            max_iters=1,
            search_optimization=search_optimization,
            random_state=random_state)
        search.fit(x, y)
        return search

    tune_search_1 = run_search()
    tune_search_2 = run_search()

    try:
        self.assertSequenceEqual(tune_search_1.cv_results_["params"],
                                 tune_search_2.cv_results_["params"])
    except AssertionError:
        print(f"Seeds: {tune_search_1.seed} == {tune_search_2.seed}?")
        raise
def test_warm_start_error(self):
    """Check the cases where requesting early stopping must fail.

    A voting classifier reports ``_can_early_stop()`` as ``False`` and
    raises ``ValueError`` when ``early_stopping=True``. Logistic regression
    tuned over ``max_iter`` and a random forest tuned over
    ``n_estimators`` must also raise ``ValueError`` with early stopping on.
    """
    from sklearn.ensemble import VotingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    shared = dict(n_jobs=1, max_iters=10, local_dir="./test-result")

    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    voting = VotingClassifier(estimators=[(
        "rf", RandomForestClassifier(n_estimators=50, random_state=0))])
    no_early_stop = TuneSearchCV(
        voting, parameter_grid, early_stopping=False, **shared)
    self.assertFalse(no_early_stop._can_early_stop())

    with self.assertRaises(ValueError):
        TuneSearchCV(voting, parameter_grid, early_stopping=True, **shared)

    # The remaining cases tune the parameter named in each grid with early
    # stopping on; the constructor is expected to reject both.
    logistic = LogisticRegression()
    with self.assertRaises(ValueError):
        TuneSearchCV(
            logistic, {"max_iter": [1, 2]}, early_stopping=True, **shared)

    forest = RandomForestClassifier()
    with self.assertRaises(ValueError):
        TuneSearchCV(
            forest, {"n_estimators": [1, 2]}, early_stopping=True,
            **shared)