Example #1
    def test_multi_best_regression(self):
        x, y = make_regression(n_samples=100, n_features=10, n_informative=5)
        model = SGDRegressor()
        parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}

        scoring = ("neg_mean_absolute_error", "neg_mean_squared_error")

        search_methods = ["random", "bayesian", "hyperopt", "bohb"]
        for search_method in search_methods:

            tune_search = TuneSearchCV(
                model,
                parameter_grid,
                scoring=scoring,
                search_optimization=search_method,
                cv=2,
                n_trials=3,
                n_jobs=1,
                refit="neg_mean_absolute_error")
            tune_search.fit(x, y)
            self.assertAlmostEqual(
                tune_search.best_score_,
                max(tune_search.cv_results_[
                    "mean_test_neg_mean_absolute_error"]),
                places=10)

            p = tune_search.cv_results_["params"]
            scores = tune_search.cv_results_[
                "mean_test_neg_mean_absolute_error"]
            cv_best_param = max(
                list(zip(scores, p)), key=lambda pair: pair[0])[1]
            self.assertEqual(tune_search.best_params_, cv_best_param)
Example #2
    def test_multi_best_classification_scoring_dict(self):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target
        model = SGDClassifier()

        parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
        scoring = {"acc": "accuracy", "f1": "f1_micro"}
        search_methods = ["random", "bayesian", "hyperopt", "bohb"]
        for search_method in search_methods:

            tune_search = TuneSearchCV(
                model,
                parameter_grid,
                scoring=scoring,
                search_optimization=search_method,
                cv=2,
                n_trials=3,
                n_jobs=1,
                refit="acc")
            tune_search.fit(x, y)
            self.assertAlmostEqual(
                tune_search.best_score_,
                max(tune_search.cv_results_["mean_test_acc"]),
                places=10)

            p = tune_search.cv_results_["params"]
            scores = tune_search.cv_results_["mean_test_acc"]
            cv_best_param = max(
                list(zip(scores, p)), key=lambda pair: pair[0])[1]
            self.assertEqual(tune_search.best_params_, cv_best_param)
Example #3
    def test_multi_refit_false(self):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target
        model = SGDClassifier()

        parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
        scoring = ("accuracy", "f1_micro")

        tune_search = TuneSearchCV(
            model,
            parameter_grid,
            scoring=scoring,
            search_optimization="random",
            cv=2,
            n_trials=3,
            n_jobs=1,
            refit=False)
        tune_search.fit(x, y)
        with self.assertRaises(ValueError) as exc:
            tune_search.best_score_
        self.assertTrue(("instance was initialized with refit=False. "
                         "For multi-metric evaluation,") in str(exc.exception))
        with self.assertRaises(ValueError) as exc:
            tune_search.best_index_
        self.assertTrue(("instance was initialized with refit=False. "
                         "For multi-metric evaluation,") in str(exc.exception))
        with self.assertRaises(ValueError) as exc:
            tune_search.best_params_
        self.assertTrue(("instance was initialized with refit=False. "
                         "For multi-metric evaluation,") in str(exc.exception))
Example #4
    def test_multi_best(self):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target

        parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}

        scoring = ("accuracy", "f1_micro")

        tune_search = TuneSearchCV(
            SGDClassifier(),
            parameter_grid,
            scoring=scoring,
            max_iters=20,
            refit="accuracy")
        tune_search.fit(x, y)
        self.assertAlmostEqual(
            tune_search.best_score_,
            max(tune_search.cv_results_["mean_test_accuracy"]),
            places=10)

        p = tune_search.cv_results_["params"]
        scores = tune_search.cv_results_["mean_test_accuracy"]
        cv_best_param = max(list(zip(scores, p)), key=lambda pair: pair[0])[1]
        self.assertEqual(tune_search.best_params_, cv_best_param)
Example #5
    def test_pipeline_early_stop(self):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target

        pipe = Pipeline([("reduce_dim", PCA()), ("classify", SGDClassifier())])
        parameter_grid = [
            {
                "classify__alpha": [1e-4, 1e-1, 1],
                "classify__epsilon": [0.01, 0.1]
            },
        ]

        with self.assertRaises(ValueError) as exc:
            TuneSearchCV(
                pipe,
                parameter_grid,
                early_stopping=True,
                pipeline_auto_early_stop=False,
                max_iters=10)
        self.assertTrue((
            "Early stopping is not supported because the estimator does "
            "not have `partial_fit`, does not support warm_start, or "
            "is a tree classifier. Set `early_stopping=False`."
        ) in str(exc.exception))

        tune_search = TuneSearchCV(
            pipe, parameter_grid, early_stopping=True, max_iters=10)
        tune_search.fit(x, y)
    def test_plateau(self):
        try:
            from ray.tune.stopper import TrialPlateauStopper
        except ImportError:
            self.skipTest("`TrialPlateauStopper` not available in "
                          "current Ray version.")
            return

        X, y = make_classification(n_samples=50,
                                   n_features=50,
                                   n_informative=3,
                                   random_state=0)

        clf = PlateauClassifier(converge_after=4)

        stopper = TrialPlateauStopper(metric="objective")

        search = TuneSearchCV(clf, {"foo_param": [2.0, 3.0, 4.0]},
                              cv=2,
                              max_iters=20,
                              stopper=stopper,
                              early_stopping=True)

        search.fit(X, y)

        print(search.cv_results_)

        for iters in search.cv_results_["training_iteration"]:
            # Converges after 4 iterations, but the stopper needs another
            # 4 to detect it converged.
            self.assertLessEqual(iters, 8)
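A sketch of the same stopper with its plateau window spelled out explicitly (the `std` / `num_results` argument names assume Ray's current `TrialPlateauStopper` signature):

from ray.tune.stopper import TrialPlateauStopper

# Sketch: stop a trial once `metric` stays within `std` for `num_results`
# consecutive reports. The default window of 4 results is what adds the
# extra iterations before the plateau is detected in the test above.
stopper = TrialPlateauStopper(metric="objective", std=0.01, num_results=4)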
    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
    ):

        from tune_sklearn import TuneGridSearchCV, TuneSearchCV

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        tune_search = TuneSearchCV(
            self.model,
            params,
            search_optimization=search_algorithm,
            n_trials=num_trials,
            early_stopping=True,
            scoring=scoring_func,
        )
        tune_search.fit(X, y)

        return tune_search
Example #8
def tune_remote(x_train, y_train):
    clf = RandomForestClassifier()
    param_distributions = {
        "n_estimators": randint(20, 80),
        "max_depth": randint(2, 10)
    }

    tune_search = TuneSearchCV(clf, param_distributions, n_trials=3)

    tune_search.fit(x_train, y_train)
    return tune_search
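A hypothetical driver for tune_remote (the data setup below is illustrative; the original call site is not shown in the excerpt):

# Hypothetical usage of tune_remote; make_classification stands in for the
# real training data.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

search = tune_remote(x_train, y_train)
print(search.best_params_)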
Example #9
    def test_trivial_cv_results_attr(self):
        # Test search over a "grid" with only one point.
        # Non-regression test: grid_scores_ wouldn't be set by
        # dcv.GridSearchCV.
        clf = MockClassifier()
        grid_search = TuneGridSearchCV(clf, {"foo_param": [1]}, cv=3)
        grid_search.fit(X, y)
        self.assertTrue(hasattr(grid_search, "cv_results_"))

        random_search = TuneSearchCV(clf, {"foo_param": [0]}, n_iter=1, cv=3)
        random_search.fit(X, y)
        self.assertTrue(hasattr(random_search, "cv_results_"))
Example #10
    def sweep(self, params: Dict, X, y):

        tune_search = TuneSearchCV(
            self.model,
            param_distributions=params,
            n_trials=3,
            # early_stopping=True,
            # use_gpu=True
        )

        tune_search.fit(X, y)

        return tune_search
Example #11
    def sweep(self, X, y, params: Dict = None):
        if not params:
            raise NotImplementedError

        tune_search = TuneSearchCV(
            self.model,
            param_distributions=params,
            n_trials=3,
            # early_stopping=True,
            # use_gpu=True
        )

        tune_search.fit(X, y)

        return tune_search
Example #12
    def test_local_mode(self):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target

        clf = SGDClassifier()
        parameter_grid = {
            "alpha": Real(1e-4, 1e-1, 1),
            "epsilon": Real(0.01, 0.1)
        }
        tune_search = TuneSearchCV(clf,
                                   parameter_grid,
                                   n_jobs=1,
                                   max_iters=10,
                                   local_dir="./test-result")
        import ray
        with patch.object(ray, "init", wraps=ray.init) as wrapped_init:
            tune_search.fit(x, y)
        self.assertTrue(wrapped_init.call_args[1]["local_mode"])
    def _test_seed_run(self, search_optimization, seed):
        digits = datasets.load_digits()

        x = digits.data
        y = digits.target

        parameters = {
            "classify__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1],
            "classify__epsilon": [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
        }

        pipe = Pipeline([("reduce_dim", PCA()), ("classify", SGDClassifier())])

        if isinstance(seed, str):
            _seed = np.random.RandomState(seed=int(seed))
        else:
            _seed = seed
        tune_search_1 = TuneSearchCV(pipe,
                                     parameters.copy(),
                                     early_stopping=True,
                                     max_iters=1,
                                     search_optimization=search_optimization,
                                     random_state=_seed)
        tune_search_1.fit(x, y)

        if isinstance(seed, str):
            _seed = np.random.RandomState(seed=int(seed))
        else:
            _seed = seed
        tune_search_2 = TuneSearchCV(pipe,
                                     parameters.copy(),
                                     early_stopping=True,
                                     max_iters=1,
                                     search_optimization=search_optimization,
                                     random_state=_seed)
        tune_search_2.fit(x, y)

        try:
            self.assertSequenceEqual(tune_search_1.cv_results_["params"],
                                     tune_search_2.cv_results_["params"])
        except AssertionError:
            print(f"Seeds: {tune_search_1.seed} == {tune_search_2.seed}?")
            raise
    def test_max_iters(self):
        X, y = make_classification(n_samples=50,
                                   n_features=50,
                                   n_informative=3,
                                   random_state=0)

        clf = PlateauClassifier(converge_after=20)

        search = TuneSearchCV(clf, {"foo_param": [2.0, 3.0, 4.0]},
                              cv=2,
                              max_iters=6,
                              early_stopping=True)

        search.fit(X, y)

        print(search.cv_results_)

        for iters in search.cv_results_["training_iteration"]:
            # Stop after 6 iterations.
            self.assertLessEqual(iters, 6)
    def _test_method(self, search_method):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target

        tune_search = TuneSearchCV(self.clf,
                                   self.parameter_grid,
                                   search_optimization=search_method,
                                   cv=2,
                                   n_trials=3,
                                   n_jobs=1,
                                   refit=True)
        tune_search.fit(x, y)
        params = tune_search.best_estimator_.get_params()
        print({
            k: v
            for k, v in params.items() if k in ("alpha", "epsilon", "penalty")
        })
        self.assertTrue(1e-4 <= params["alpha"] <= 0.5)
        self.assertTrue(0.01 <= params["epsilon"] <= 0.05)
        self.assertTrue(params["penalty"] in ("elasticnet", "l1"))
Example #16
    def test_local_dir(self):
        digits = datasets.load_digits()
        x = digits.data
        y = digits.target

        clf = SGDClassifier()
        parameter_grid = {
            "alpha": Real(1e-4, 1e-1, 1),
            "epsilon": Real(0.01, 0.1)
        }

        scheduler = MedianStoppingRule(grace_period=10.0)

        tune_search = TuneSearchCV(clf,
                                   parameter_grid,
                                   early_stopping=scheduler,
                                   max_iters=10,
                                   local_dir="./test-result")
        tune_search.fit(x, y)

        self.assertTrue(len(os.listdir("./test-result")) != 0)
Example #17
def tune_ray(clf,
             params,
             X_train,
             y_train,
             X_test,
             y_test,
             n_params=-1,
             max_epochs=-1,
             n_jobs=4):
    common = dict(random_state=42)
    split = ShuffleSplit(test_size=0.20, n_splits=1, random_state=42)
    clf = clone(clf).set_params(prefix="ray")
    from tune_sklearn import TuneSearchCV

    search = TuneSearchCV(
        clf,
        params,
        cv=split,
        early_stopping=True,
        max_iters=max_epochs,
        n_iter=n_params,
        random_state=42,
        refit=False,
    )

    start = time()
    search.fit(X_train, y_train)
    fit_time = time() - start

    data = {
        "library": "ray",
        "fit_time": fit_time,
        "start_time": start,
        "n_params": n_params,
        "n_jobs": n_jobs,
        "max_epochs": max_epochs,
    }
    return search, data
    def test_timeout(self):
        X, y = make_classification(n_samples=50,
                                   n_features=50,
                                   n_informative=3,
                                   random_state=0)

        clf = SleepClassifier()
        # SleepClassifier sleeps for `foo_param` seconds, `cv` times.
        # Thus, the time budget is exhausted after testing the first two
        # `foo_param`s.
        search = TuneSearchCV(clf, {"foo_param": [1.1, 1.2, 2.5]},
                              time_budget_s=5.0,
                              cv=2,
                              max_iters=5,
                              early_stopping=True)

        start = time.time()
        search.fit(X, y)
        taken = time.time() - start

        print(search)
        # Without timeout we would need over 50 seconds for this to
        # finish. Allow for some initialization overhead
        self.assertLess(taken, 25.0)
Example #19
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

model = lgb.LGBMClassifier()
param_dists = {
    "n_estimators": [400, 700, 1000],
    "colsample_bytree": [0.7, 0.8],
    "max_depth": [15, 20, 25],
    "num_leaves": [50, 100, 200],
    "reg_alpha": [1.1, 1.2, 1.3],
    "reg_lambda": [1.1, 1.2, 1.3],
    "min_split_gain": [0.3, 0.4],
    "subsample": [0.7, 0.8, 0.9],
    "subsample_freq": [20]
}

gs = TuneSearchCV(model, param_dists, n_trials=5, scoring="accuracy")
gs.fit(X_train, y_train)
print(gs.cv_results_)

pred = gs.predict(X_test)
correct = 0
for i in range(len(y_test)):
    if pred[i] == y_test[i]:
        correct += 1
print("Accuracy:", correct / len(pred))
Example #20
from tune_sklearn import TuneSearchCV

# Other imports
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

# Set training and validation sets
X, y = make_classification(n_samples=11000,
                           n_features=1000,
                           n_informative=50,
                           n_redundant=0,
                           n_classes=10,
                           class_sep=2.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)

# Example parameter distributions to tune from SGDClassifier
# Note the use of (low, high) tuples instead of lists if non-random
# optimization is desired; a contrasting sketch follows this block
param_dists = {"alpha": (1e-4, 1e-1), "epsilon": (1e-2, 1e-1)}

bohb_tune_search = TuneSearchCV(
    SGDClassifier(),
    param_distributions=param_dists,
    n_trials=20,
    max_iters=100,
    search_optimization="bohb",
)

bohb_tune_search.fit(X_train, y_train)
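A sketch contrasting the two parameter-space styles mentioned in the comment above (my reading of tune-sklearn's convention: lists enumerate discrete candidates, tuples give a (low, high) range for Bayesian-style searchers):

# Discrete candidates, typically used with search_optimization="random".
random_spec = {"alpha": [1e-4, 1e-2, 1e-1], "epsilon": [0.01, 0.1]}
random_search = TuneSearchCV(
    SGDClassifier(), random_spec, search_optimization="random", n_trials=5)

# Continuous (low, high) ranges, used by "bayesian", "hyperopt", "bohb".
bayes_spec = {"alpha": (1e-4, 1e-1), "epsilon": (1e-2, 1e-1)}
bayes_search = TuneSearchCV(
    SGDClassifier(), bayes_spec, search_optimization="bayesian", n_trials=5)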
pipe = Pipeline([("reduce_dim", PCA()), ("classify", SGDClassifier())])

param_grid = [
    {
        "classify__alpha": [1e-4, 1e-1, 1],
        "classify__epsilon": [0.01, 0.1]
    },
]

random = TuneSearchCV(pipe,
                      param_grid,
                      search_optimization="random",
                      early_stopping=True,
                      max_iters=10,
                      pipeline_auto_early_stop=True)
random.fit(X, y)
print(random.cv_results_)

grid = TuneGridSearchCV(pipe,
                        param_grid=param_grid,
                        early_stopping=True,
                        max_iters=10,
                        pipeline_auto_early_stop=True)
grid.fit(X, y)
print(grid.cv_results_)

# warm start (iteration-based early stopping)

pipe = Pipeline([("reduce_dim", PCA()),
                 ("classify", LogisticRegression(max_iter=1000))])
Example #22
# A parameter grid for XGBoost
params = {
    "min_child_weight": [1, 5, 10],
    "gamma": [0.5, 1, 1.5, 2, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "max_depth": [3, 4, 5],
}

xgb = XGBClassifier(
    learning_rate=0.02,
    n_estimators=50,
    objective="binary:logistic",
    nthread=4,
    # tree_method="gpu_hist"  # this enables GPU.
    # See https://github.com/dmlc/xgboost/issues/2819
)

digit_search = TuneSearchCV(
    xgb,
    param_distributions=params,
    n_trials=3,
    early_stopping=True,
    # use_gpu=True # Commented out for testing on github actions,
    # but this is how you would use gpu
)

digit_search.fit(x_train, y_train)
print(digit_search.best_params_)
print(digit_search.cv_results_)
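The snippet assumes x_train / y_train already exist. A hypothetical setup, kept binary to match the "binary:logistic" objective above:

# Hypothetical data preparation for the XGBoost example (not in the original).
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

x, y = load_digits(return_X_y=True)
mask = y < 2  # restrict to two classes for the binary objective
x_train, x_test, y_train, y_test = train_test_split(
    x[mask], y[mask], test_size=0.2)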
Example #23
    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
        splitting_criteria: str = "CV",
        test_indices: Union[None, List[int]] = None,
        num_splits: int = 5,
    ) -> pd.DataFrame:

        if self.scale_data:
            X, y = self.scalar(X, y)

        if splitting_criteria.lower() == "cv":
            cv = None
        elif splitting_criteria.lower() == "timeseries":
            cv = TimeSeriesSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "grouped":
            cv = GroupShuffleSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "fixed":
            if type(test_indices) != list:
                raise ValueError(
                    "fixed split used but no test-indices provided...")
            cv = PredefinedSplit(test_fold=test_indices)
        else:
            raise ValueError(
                f"Unknown splitting criteria provided: {splitting_criteria}, "
                "should be one of [cv, timeseries, grouped, fixed]")

        # early stopping only supported for learners that have a
        # `partial_fit` method
        from tune_sklearn import TuneSearchCV
        import mlflow
        import time

        mlflow.set_tracking_uri(os.path.join("file:/", os.getcwd(), "outputs"))

        # start mlflow auto-logging
        # mlflow.sklearn.autolog()

        if search_algorithm.lower() == "bohb":
            early_stopping = True

        if search_algorithm.lower() in (
                "bohb", "bayesian", "hyperopt", "optuna"):
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                cv=cv,
                n_trials=num_trials,
                early_stopping=early_stopping,
                scoring=scoring_func,
                loggers=["csv", "tensorboard"],
                verbose=1,
            )
        elif search_algorithm == "grid":
            search = GridSearchCV(
                self.model,
                param_grid=params,
                refit=True,
                cv=cv,
                scoring=scoring_func,
                verbose=1,
            )
        elif search_algorithm == "random":
            search = RandomizedSearchCV(
                self.model,
                param_distributions=params,
                refit=True,
                cv=cv,
                scoring=scoring_func,
                verbose=1,
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
            )

        # with mlflow.start_run() as run:
        search.fit(X, y)
        self.model = search.best_estimator_
        results_df = pd.DataFrame(search.cv_results_)
        if not pathlib.Path(results_csv_path).parent.exists():
            pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True,
                                                        parents=True)
        final_path = (results_csv_path[:-4] + "_" +
                      time.strftime("%Y%m%d-%H%M%S") + ".csv")
        logger.info(f"Saving sweeping results to {final_path}")
        results_df.to_csv(final_path)
        logger.info(f"Best hyperparams: {search.best_params_}")
        logger.info(f"Best score: {search.best_score_}")

        return results_df
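For the "fixed" branch above, PredefinedSplit expects one fold label per sample, with -1 marking rows that never appear in a test fold. A small illustration, independent of the class above:

# Sketch: how PredefinedSplit interprets test_fold labels.
from sklearn.model_selection import PredefinedSplit

test_indices = [-1, -1, -1, 0, 0, 0]  # -1 = always train, 0 = test fold 0
cv = PredefinedSplit(test_fold=test_indices)
for train_idx, test_idx in cv.split():
    print(train_idx, test_idx)  # -> [0 1 2] [3 4 5]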
Example #24
from tune_sklearn import TuneSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from ray.tune.schedulers import MedianStoppingRule
import numpy as np

digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

clf = SGDClassifier()
parameter_grid = {"alpha": (1e-4, 1), "epsilon": (0.01, 0.1)}

scheduler = MedianStoppingRule(grace_period=10.0)

tune_search = TuneSearchCV(clf,
                           parameter_grid,
                           search_optimization="bayesian",
                           n_iter=3,
                           early_stopping=scheduler,
                           max_iters=10)
tune_search.fit(x_train, y_train)

pred = tune_search.predict(x_test)
accuracy = np.count_nonzero(np.array(pred) == np.array(y_test)) / len(pred)
print(accuracy)
Example #25
   'reg_lambda': [1e-5, 1e-2, 0.45],
   'subsample': [0.6, 0.95]
}

t_search = TuneSearchCV(
    xgb_model,
    param_distributions=parameters_for_testing,
    n_trials=3,
    early_stopping=True,
    use_gpu=True  # requires a GPU-enabled setup; set to False on CPU-only machines
)

# gsearch1 = GridSearchCV(estimator=xgb_model, param_grid=parameters_for_testing,
#                         n_jobs=6, iid=False, verbose=10,
#                         scoring='neg_mean_squared_error')
t_search.fit(X_train, y_train)
print(t_search.scorer_)
print('best params')
print(t_search.best_params_)
print('best score')
print(t_search.best_score_)

final_xgb = xgboost.XGBRegressor(colsample_bytree=0.6,
                                 gamma=0.1,
                                 min_child_weight=1.5,
                                 learning_rate=0.07,
                                 max_depth=5,
                                 n_estimators=1000,
                                 reg_alpha=0.01,
                                 reg_lambda=1e-05,
                                 subsample=0.95)

trained = final_xgb.fit(X_train, y_train)

y_pred = trained.predict(X_test)
def mean_absolute_percentage_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return  np.mean(np.abs((y_test - y_pred) / y_test)) * 100
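The helper above is defined but never called in the excerpt; a one-line use, assuming the y_test / y_pred computed earlier:

# Report MAPE for the refit XGBoost model above.
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))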
Example #26
    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
        splitting_criteria: str = "timeseries",
        num_splits: int = 5,
        test_indices=None,  # fold labels, required for the "fixed" criterion
    ):

        start_dir = str(pathlib.Path(os.getcwd()).parent)
        module_dir = str(pathlib.Path(__file__).parent)
        # temporarily change directory to file directory and then reset
        os.chdir(module_dir)

        if self.scale_data:
            X, y = self.scalar(X, y)

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )

        if splitting_criteria.lower() == "cv":
            cv = None
        elif splitting_criteria.lower() == "timeseries":
            cv = TimeSeriesSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "grouped":
            cv = GroupShuffleSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "fixed":
            if type(test_indices) != list:
                raise ValueError(
                    "fixed split used but no test-indices provided...")
            cv = PredefinedSplit(test_fold=test_indices)
        else:
            raise ValueError(
                f"Unknown splitting criteria provided: {splitting_criteria}, "
                "should be one of [cv, timeseries, grouped, fixed]")

        if search_algorithm.lower() == "bohb":
            early_stopping = True

        if search_algorithm.lower() in (
                "bohb", "bayesian", "hyperopt", "optuna"):
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                n_trials=num_trials,
                early_stopping=early_stopping,
                scoring=scoring_func,
            )
        elif search_algorithm == "grid":
            search = GridSearchCV(
                self.model,
                param_grid=params,
                refit=True,
                cv=num_trials,
                scoring=scoring_func,
            )
        elif search_algorithm == "random":
            search = RandomizedSearchCV(
                self.model,
                param_distributions=params,
                refit=True,
                cv=num_trials,
                scoring=scoring_func,
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, bayesian, or random"
            )
        with mlflow.start_run() as run:
            search.fit(X, y)
        self.model = search.best_estimator_

        # set path back to initial
        os.chdir(start_dir)

        results_df = pd.DataFrame(search.cv_results_)
        logger.info(f"Best hyperparams: {search.best_params_}")

        if not pathlib.Path(results_csv_path).parent.exists():
            pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True,
                                                        parents=True)
        logger.info(f"Saving sweeping results to {results_csv_path}")
        logger.info(f"Best score: {search.best_score_}")
        results_df.to_csv(results_csv_path)
        cols_keep = [col for col in results_df if "param_" in col]
        cols_keep += ["mean_test_score"]

        results_df = results_df[cols_keep]

        return results_df
Example #27
    def test_random_search_cv_results(self):
        # Make a dataset with a lot of noise to get various kind of prediction
        # errors across CV folds and parameter settings
        X, y = make_classification(n_samples=200,
                                   n_features=100,
                                   n_informative=3,
                                   random_state=0)

        # scipy.stats dists now support `seed`, but we still support scipy 0.12,
        # which doesn't. Hence the assertions in the test for
        # random_search alone should not depend on randomization.
        n_splits = 3
        n_search_iter = 30
        params = dict(C=expon(scale=10), gamma=expon(scale=0.1))
        random_search = TuneSearchCV(
            SVC(),
            n_iter=n_search_iter,
            cv=n_splits,
            param_distributions=params,
            return_train_score=True,
        )
        random_search.fit(X, y)

        param_keys = ("param_C", "param_gamma")
        score_keys = (
            "mean_test_score",
            "mean_train_score",
            "rank_test_score",
            "rank_train_score",
            "split0_test_score",
            "split1_test_score",
            "split2_test_score",
            "split0_train_score",
            "split1_train_score",
            "split2_train_score",
            "std_test_score",
            "std_train_score",
            "time_total_s",
        )
        n_cand = n_search_iter

        def test_check_cv_results_array_types(cv_results, param_keys,
                                              score_keys):
            # Check if the search `cv_results`'s array are of correct types
            self.assertTrue(
                all(
                    isinstance(cv_results[param], np.ma.MaskedArray)
                    for param in param_keys))
            self.assertTrue(
                all(cv_results[key].dtype == object for key in param_keys))
            self.assertFalse(
                any(
                    isinstance(cv_results[key], np.ma.MaskedArray)
                    for key in score_keys))
            self.assertTrue(
                all(cv_results[key].dtype == np.float64 for key in score_keys
                    if not key.startswith("rank")))
            self.assertEqual(cv_results["rank_test_score"].dtype, np.int32)

        def test_check_cv_results_keys(cv_results, param_keys, score_keys,
                                       n_cand):
            # Test the search.cv_results_ contains all the required results
            assert_array_equal(sorted(cv_results.keys()),
                               sorted(param_keys + score_keys + ("params", )))
            self.assertTrue(
                all(cv_results[key].shape == (n_cand, )
                    for key in param_keys + score_keys))

        cv_results = random_search.cv_results_
        # Check results structure
        test_check_cv_results_array_types(cv_results, param_keys, score_keys)
        test_check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
        # For random_search, all the param array vals should be unmasked
        self.assertFalse(
            any(cv_results["param_C"].mask)
            or any(cv_results["param_gamma"].mask))
Example #28
    "n_estimators": randint(50, 1000),
    "max_depth": randint(2, 7),
    'max_features': randint(5, 25),
    'min_weight_fraction_leaf': [0.0, 0.03, 0.05, 0.07, 0.10, 0.15],
    'min_impurity_decrease': [0.0, 0.00001, 0.0001, 0.001, 0.01, 0.1]
}
tune_search = TuneSearchCV(estimator=rf,
                           param_distributions=param_random,
                           search_optimization="random",
                           early_stopping=False,
                           n_iter=30,
                           scoring='accuracy',
                           n_jobs=12,
                           cv=cv,
                           verbose=1)
tune_search.fit(X_train, y_train, sample_weight=sample_weights)

# bayesian search
tune_search = TuneSearchCV(rf,
                           param_random,
                           search_optimization='bayesian',
                           max_iters=100,
                           scoring='accuracy',
                           n_jobs=12,
                           cv=cv,
                           verbose=1)
tune_search.fit(X_train, y_train, sample_weight=sample_weights)

# scores
clf_predictions = tune_search.predict(X_test)
tune_search.best_params_
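A possible follow-up, scoring the held-out predictions (assumes the X_test / y_test split from the surrounding script, which is not shown here):

# Hedged follow-up: report the tuned parameters and held-out accuracy.
from sklearn.metrics import accuracy_score

print("Best params:", tune_search.best_params_)
print("Test accuracy:", accuracy_score(y_test, clf_predictions))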