def _execute_experiment(self): """Instantiate and run a :class:`experiments.CVExperiment` after checking for duplicate keys Notes ----- As described in the Notes of :meth:`BaseOptimizationProtocol.set_experiment_guidelines`, the `auto_start` kwarg of :meth:`experiments.CVExperiment.__init__` is set to False in order to check for duplicated keys""" self._update_current_hyperparameters() self.current_experiment = CVExperiment( # model=None, # TODO: May need to pass `model` from `set_experiment_guidelines` model_initializer=self.model_initializer, model_init_params=self.current_init_params, model_extra_params=self.current_extra_params, feature_selector=self.feature_selector, preprocessing_pipeline=self.preprocessing_pipeline, preprocessing_params=self.preprocessing_params, notes=self.notes, do_raise_repeated=self.do_raise_repeated, auto_start=False, ) self.current_experiment.preparation_workflow() # Future Hunter, if multi-cross_experiment_keys ever supported, this will be a problem. Should've fixed it earlier, dummy if self.current_experiment.hyperparameter_key.key not in self.tested_keys: self.tested_keys.append(self.current_experiment.hyperparameter_key.key) self.current_experiment.experiment_workflow() self.current_score = get_path( self.current_experiment.last_evaluation_results, self.target_metric ) self.successful_iterations += 1 self._clean_up_experiment()
def test_invalid_environment(monkeypatch, env_fixture_0):
    """Test that initializing an Experiment raises `EnvironmentInvalidError` when there is an
    active `Environment`, but :attr:`hyperparameter_hunter.environment.Environment.current_task`
    is not None"""
    # Currently have a valid `settings.G.Env` (`env_fixture_0`), so give it a fake `current_task`
    monkeypatch.setattr(settings.G.Env, "current_task", "some other task")

    with pytest.raises(EnvironmentInvalidError, match="Current experiment must finish before .*"):
        CVExperiment(LogisticRegression, dict())
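
# A hypothetical companion sketch (not part of the original suite): assuming `CVExperiment`
# performs the analogous check when no `Environment` exists at all -- i.e. `settings.G.Env` is
# None -- and raises `EnvironmentInactiveError`, the "inactive" counterpart to the test above
# might look like this.
def test_inactive_environment(monkeypatch):
    """Sketch: initializing an Experiment with no active `Environment` should raise
    `EnvironmentInactiveError` (assumed behavior)"""
    # Simulate never having created an `Environment` by clearing `settings.G.Env`
    monkeypatch.setattr(settings.G, "Env", None)

    with pytest.raises(EnvironmentInactiveError):
        CVExperiment(LogisticRegression, dict())
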
class BaseOptimizationProtocol(metaclass=MergedOptimizationMeta):
    def __init__(
        self,
        target_metric=None,
        iterations=1,
        verbose=1,
        read_experiments=True,
        reporter_parameters=None,
    ):
        """Base class for intermediate base optimization protocol classes

        Parameters
        ----------
        target_metric: Tuple, default=('oof', <first key in :attr:`environment.Environment.metrics`>)
            A path denoting the metric to be used to compare completed Experiments within the
            Optimization Protocol. The first value should be one of ['oof', 'holdout', 'in_fold'].
            The second value should be the name of a metric being recorded according to the values
            supplied in :attr:`environment.Environment.metrics_params`. See the documentation for
            :func:`metrics.get_formatted_target_metric` for more info. Any values returned by, or
            given as the `target_metric` input to, :func:`metrics.get_formatted_target_metric` are
            acceptable values for :attr:`BaseOptimizationProtocol.target_metric`
        iterations: Int, default=1
            The number of distinct experiments to execute
        verbose: Int in [0, 1, 2], default=1
            Verbosity mode for console logging. 0: Silent. 1: Show only logs from the Optimization
            Protocol. 2: In addition to logs shown when verbose=1, also show the logs from
            individual Experiments
        read_experiments: Boolean, default=True
            If True, all Experiment records that fit in the current :attr:`space` and guidelines,
            and match :attr:`algorithm_name`, will be read in and used to fit any optimizers
        reporter_parameters: Dict, or None, default={}
            Additional parameters passed to :meth:`reporting.OptimizationReporter.__init__`. Note:
            Unless provided explicitly, the key "do_maximize" will be added by default to
            `reporter_parameters`, with a value inferred from the `direction` of
            :attr:`target_metric` in `G.Env.metrics`. In nearly all cases, the "do_maximize" key
            should be ignored, as there are very few reasons to explicitly include it

        Notes
        -----
        By default, 'script_backup' for Experiments is blacklisted when executed within
        :class:`BaseOptimizationProtocol` since it would just repeatedly create copies of the
        same, unchanged file (this file).
        So don't expect any script_backup files for Experiments executed during optimization
        rounds"""
        #################### Optimization Protocol Parameters ####################
        self.target_metric = target_metric
        self.iterations = iterations
        self.verbose = verbose
        self.read_experiments = read_experiments
        self.reporter_parameters = reporter_parameters or {}

        #################### Experiment Guidelines ####################
        self.model_initializer = None
        self.model_init_params = None
        self.model_extra_params = None
        self.feature_selector = None
        self.preprocessing_pipeline = None
        self.preprocessing_params = None
        self.notes = None
        self.do_raise_repeated = True

        #################### Search Parameters ####################
        self.dimensions = []
        self.search_bounds = dict()

        self.space = None
        self.similar_experiments = []
        self.best_experiment = None
        self.best_score = None
        self.successful_iterations = 0
        self.skipped_iterations = 0
        self.tested_keys = []
        self._search_space_size = None

        self.current_init_params = None
        self.current_extra_params = None

        #################### Identification Attributes ####################
        self.algorithm_name = None
        self.module_name = None
        self.current_experiment = None
        self.current_score = None

        #################### Keras-Specific Attributes ####################
        self.dummy_layers = []
        self.dummy_compile_params = dict()

        self.init_iter_attrs = []
        self.extra_iter_attrs = []

        self.logger = None
        self._preparation_workflow()
        self.do_maximize = G.Env.metrics[self.target_metric[-1]].direction == "max"

    ##################################################
    # Core Methods:
    ##################################################
    # TODO: Add `model` here, with a `TranslateTrace` decorator, and document it below
    def set_experiment_guidelines(
        self,
        model_initializer,
        model_init_params,
        model_extra_params=None,
        feature_selector=None,
        preprocessing_pipeline=None,
        preprocessing_params=None,
        notes=None,
        do_raise_repeated=True,
    ):
        """Provide arguments necessary to instantiate :class:`experiments.CVExperiment`. This
        method has the same signature as :meth:`experiments.BaseExperiment.__init__` except where
        noted

        Parameters
        ----------
        model_initializer: Class, or functools.partial, or class instance
            The algorithm class being used to initialize a model
        model_init_params: Dict, or object
            The dictionary of arguments given when creating a model instance with
            `model_initializer` via the `__init__` method of :class:`models.Model`. Any kwargs
            that are considered valid by the `__init__` method of `model_initializer` are valid in
            `model_init_params`
        model_extra_params: Dict, or None, default=None
            A dictionary of extra parameters passed to :class:`models.Model`. This is used to
            provide parameters to models' non-initialization methods (like `fit`, `predict`,
            `predict_proba`, etc.), which is especially relevant for neural networks
        feature_selector: List of str, callable, list of booleans, default=None
            The value provided when splitting apart the input data for all provided DataFrames.
            `feature_selector` is provided as the second argument for calls to
            `pandas.DataFrame.loc` in :meth:`BaseExperiment._initial_preprocessing`. If None,
            `feature_selector` is set to all columns in :attr:`train_dataset`, less
            :attr:`target_column`, and :attr:`id_column`
        preprocessing_pipeline: ...
            ... Experimental...
        preprocessing_params: ...
            ... Experimental...
        notes: String, or None, default=None
            Additional information about the Experiment that will be saved with the Experiment's
            description result file.
            This serves no purpose other than to facilitate saving Experiment details in a more
            readable format
        do_raise_repeated: Boolean, default=True
            If True and this Experiment locates a previous Experiment's results with matching
            Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
            warning will be logged

        Notes
        -----
        The `auto_start` kwarg is not available here because
        :meth:`BaseOptimizationProtocol._execute_experiment` sets it to False in order to check
        for duplicated keys before running the whole Experiment. This is the most notable
        difference between calling :meth:`set_experiment_guidelines` and instantiating
        :class:`experiments.CVExperiment`"""
        self.model_initializer = model_initializer

        self.model_init_params = identify_algorithm_hyperparameters(self.model_initializer)
        try:
            self.model_init_params.update(model_init_params)
        except TypeError:
            self.model_init_params.update(dict(build_fn=model_init_params))

        self.model_extra_params = model_extra_params if model_extra_params is not None else {}
        self.feature_selector = feature_selector if feature_selector is not None else []
        self.preprocessing_pipeline = preprocessing_pipeline or {}
        self.preprocessing_params = preprocessing_params if preprocessing_params is not None else {}
        self.notes = notes
        self.do_raise_repeated = do_raise_repeated

        if self.do_raise_repeated is False:
            G.warn_("WARNING: Setting `do_raise_repeated`=False allows duplicated Experiments")

        self.algorithm_name, self.module_name = identify_algorithm(self.model_initializer)
        self._validate_guidelines()

        #################### Deal with Keras ####################
        if self.module_name == "keras":
            (
                reusable_build_fn,
                reusable_wrapper_params,
                dummy_layers,
                dummy_compile_params,
            ) = keras_prep_workflow(
                self.model_initializer,
                self.model_init_params["build_fn"],
                self.model_extra_params,
                self.source_script,
            )
            self.model_init_params = dict(build_fn=reusable_build_fn)
            self.model_extra_params = reusable_wrapper_params
            self.dummy_layers = dummy_layers
            self.dummy_compile_params = dummy_compile_params
            # FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam'

        self.set_dimensions()

    def set_dimensions(self):
        """Locate given hyperparameters that are `space` choice declarations and add them to
        :attr:`dimensions`"""
        all_dimension_choices = []

        #################### Remap Extra Objects ####################
        if self.module_name == "keras":
            from keras.initializers import Initializer as KerasInitializer
            from keras.callbacks import Callback as KerasCB

            self.init_iter_attrs.append(lambda _p, _k, _v: isinstance(_v, KerasInitializer))
            self.extra_iter_attrs.append(lambda _p, _k, _v: isinstance(_v, KerasCB))

        #################### Collect Choice Dimensions ####################
        init_dim_choices = get_choice_dimensions(self.model_init_params, self.init_iter_attrs)
        extra_dim_choices = get_choice_dimensions(self.model_extra_params, self.extra_iter_attrs)

        for (path, choice) in init_dim_choices:
            choice._name = ("model_init_params",) + path
            all_dimension_choices.append(choice)

        for (path, choice) in extra_dim_choices:
            choice._name = ("model_extra_params",) + path
            all_dimension_choices.append(choice)

        self.dimensions = all_dimension_choices

        if self.module_name == "keras":
            self.model_extra_params = link_choice_ids(
                self.dummy_layers,
                self.dummy_compile_params,
                self.model_extra_params,
                self.dimensions,
            )

    def go(self):
        """Begin hyperparameter optimization process after experiment guidelines have been set and
        search dimensions are in place.
        This process includes the following: setting the hyperparameter space; locating similar
        experiments to be used as learning material for :class:`SKOptimizationProtocol` s; and
        executing :meth:`_optimization_loop`, which actually sets off the Experiment execution
        process"""
        if self.model_initializer is None:
            raise ValueError("Experiment guidelines must be set before starting optimization")

        _reporter_params = dict(dict(do_maximize=self.do_maximize), **self.reporter_parameters)
        self.logger = OptimizationReporter([_.name for _ in self.dimensions], **_reporter_params)

        self.tested_keys = []
        self._set_hyperparameter_space()
        self._find_similar_experiments()

        loop_start_time = datetime.now()
        self._optimization_loop()
        loop_end_time = datetime.now()
        G.log_(f"Optimization loop completed in {loop_end_time - loop_start_time}")
        G.log_(f'Best score was {self.best_score} from Experiment "{self.best_experiment}"')
        self._clean_up_optimization()

    ##################################################
    # Helper Methods:
    ##################################################
    def _optimization_loop(self, iteration=0):
        """Perform Experiment execution loop while `iteration` < `iterations`. At each iteration,
        an Experiment will be executed, its results will be logged, and it will be compared to the
        current best experiment

        Parameters
        ----------
        iteration: Int, default=0
            The current iteration in the optimization loop"""
        self.logger.print_optimization_header()

        while iteration < self.iterations:
            try:
                self._execute_experiment()
            except RepeatedExperimentError:
                # G.debug_(F'Skipping repeated Experiment: {_ex!s}\n')
                if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size:
                    G.log_("Hyperparameter search space has been exhausted")
                    break
                self.skipped_iterations += 1
                continue
            except StopIteration:
                if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size:
                    G.log_("Hyperparameter search space has been exhausted")
                    break
                # G.debug_(f'Re-initializing hyperparameter grid after testing {len(self.tested_keys)} keys')
                self._set_hyperparameter_space()
                continue

            self.logger.print_result(
                self.current_hyperparameters_list,
                self.current_score,
                experiment_id=self.current_experiment.experiment_id,
            )

            if (
                (self.best_experiment is None)  # First evaluation
                or (self.do_maximize and (self.best_score < self.current_score))  # New best max
                or (not self.do_maximize and (self.best_score > self.current_score))  # New best min
            ):
                self.best_experiment = self.current_experiment.experiment_id
                self.best_score = self.current_score

            iteration += 1

    def _execute_experiment(self):
        """Instantiate and run a :class:`experiments.CVExperiment` after checking for duplicate
        keys

        Notes
        -----
        As described in the Notes of :meth:`BaseOptimizationProtocol.set_experiment_guidelines`,
        the `auto_start` kwarg of :meth:`experiments.CVExperiment.__init__` is set to False in
        order to check for duplicated keys"""
        self._update_current_hyperparameters()
        self.current_experiment = CVExperiment(
            # model=None,  # TODO: May need to pass `model` from `set_experiment_guidelines`
            model_initializer=self.model_initializer,
            model_init_params=self.current_init_params,
            model_extra_params=self.current_extra_params,
            feature_selector=self.feature_selector,
            preprocessing_pipeline=self.preprocessing_pipeline,
            preprocessing_params=self.preprocessing_params,
            notes=self.notes,
            do_raise_repeated=self.do_raise_repeated,
            auto_start=False,
        )
        self.current_experiment.preparation_workflow()
        # Future Hunter, if multi-cross_experiment_keys ever supported, this will be a problem.
        # Should've fixed it earlier, dummy
        if self.current_experiment.hyperparameter_key.key not in self.tested_keys:
            self.tested_keys.append(self.current_experiment.hyperparameter_key.key)

        self.current_experiment.experiment_workflow()
        self.current_score = get_path(
            self.current_experiment.last_evaluation_results, self.target_metric
        )
        self.successful_iterations += 1
        self._clean_up_experiment()

    @staticmethod
    def _clean_up_optimization():
        """Perform any cleanup necessary after completion of the optimization loop. Most notably,
        this handles removal of temporary model files created for Keras optimization"""
        for (root, dirs, files) in walk(TEMP_MODULES_DIR_PATH, topdown=False):
            for file in files:
                if file.startswith("__temp_"):
                    remove(f"{root}/{file}")
            try:
                rmdir(root)
            except OSError:
                G.warn_(f"Unidentified file found in temporary directory: {root}")

    def _clean_up_experiment(self):
        """Perform any cleanup necessary after completion of an Experiment"""
        if self.module_name == "keras":
            K.clear_session()

    def _update_current_hyperparameters(self):
        """Update :attr:`current_init_params`, and :attr:`current_extra_params` according to the
        upcoming set of hyperparameters to be searched"""
        current_hyperparameters = self._get_current_hyperparameters().items()

        init_params = {
            _k[1:]: _v for _k, _v in current_hyperparameters if _k[0] == "model_init_params"
        }
        extra_params = {
            _k[1:]: _v for _k, _v in current_hyperparameters if _k[0] == "model_extra_params"
        }
        # TODO: Replace above two with `general_utils.subdict` call that modifies key to a slice

        # FLAG: At this point, `dummy_layers` shows "kernel_initializer" as `orthogonal` instance with "__hh" attrs
        # FLAG: HOWEVER, the `orthogonal` instance does have `gain` set to the correct dummy value, ...
        # FLAG: ... so it might be ok, as long as experiment matching can still work with that

        self.current_init_params = deep_restricted_update(
            self.model_init_params, init_params, iter_attrs=self.init_iter_attrs
        )
        self.current_extra_params = deep_restricted_update(
            self.model_extra_params, extra_params, iter_attrs=self.extra_iter_attrs
        )

        if (self.module_name == "keras") and ("callbacks" in self.current_extra_params):
            self.current_extra_params["callbacks"] = reinitialize_callbacks(
                self.current_extra_params["callbacks"]
            )
        # No need to reinitialize Keras `initializers` - Their values are passed to `build_fn` via extra `params`

    ##################################################
    # Abstract Methods:
    ##################################################
    @abstractmethod
    def _set_hyperparameter_space(self):
        """Initialize :attr:`space` according to the provided hyperparameter search dimensions"""
        raise NotImplementedError()

    @abstractmethod
    def _get_current_hyperparameters(self):
        """Retrieve the upcoming set of hyperparameters to be searched

        Returns
        -------
        current_hyperparameters: Dict
            The next set of hyperparameters that will be searched"""
        raise NotImplementedError()

    @property
    @abstractmethod
    def search_space_size(self):
        """The number of different hyperparameter permutations possible given the current
        hyperparameter search space"""
        raise NotImplementedError()

    ##################################################
    # Utility Methods:
    ##################################################
    def _preparation_workflow(self):
        """Perform housekeeping tasks to prepare for core functionality like validating the
        `Environment` and parameters, and updating the verbosity of individual Experiments"""
        self._validate_environment()
        self._validate_parameters()
        self._update_verbosity()

    @staticmethod
    def _validate_environment():
        """Check that there is a currently active and unoccupied Environment instance"""
        if G.Env is None:
            raise EnvironmentInactiveError()
        if G.Env.current_task is None:
            G.log_(f'Validated Environment with key: "{G.Env.cross_experiment_key}"')
        else:
            raise EnvironmentInvalidError("Must finish current task before starting a new one")

    def _validate_parameters(self):
        """Ensure provided input parameters are properly formatted"""
        self.target_metric = get_formatted_target_metric(
            self.target_metric, G.Env.metrics, default_dataset="oof"
        )

    def _validate_guidelines(self):
        """Ensure provided Experiment guideline parameters are properly formatted"""
        target_column = G.Env.target_column
        id_column = G.Env.id_column
        train_dataset = G.Env.train_dataset.copy()

        self.feature_selector = self.feature_selector or train_dataset.columns.values
        restricted_cols = [_ for _ in target_column + [id_column] if _ is not None]
        self.feature_selector = [_ for _ in self.feature_selector if _ not in restricted_cols]

    def _find_similar_experiments(self):
        """Look for Experiments that were performed under similar conditions (algorithm and
        cross-experiment parameters)"""
        if self.read_experiments is False:
            return

        self.logger.print_saved_results_header()

        model_params = dict(
            model_init_params=self.model_init_params,
            model_extra_params=self.model_extra_params,
            preprocessing_pipeline=self.preprocessing_pipeline,
            preprocessing_params=self.preprocessing_params,
            feature_selector=self.feature_selector,
        )

        if self.module_name == "keras":
            model_params["model_init_params"]["layers"] = self.dummy_layers
            model_params["model_init_params"]["compile_params"] = self.dummy_compile_params

        experiment_finder = finder_selector(self.module_name)(
            self.algorithm_name,
            self.module_name,
            G.Env.cross_experiment_key,
            self.target_metric,
            self.space,
            G.Env.result_paths["global_leaderboard"],
            G.Env.result_paths["description"],
            model_params,
        )
        experiment_finder.find()
        self.similar_experiments = experiment_finder.similar_experiments

    def _update_verbosity(self):
        """Update :attr:`environment.Environment.reporting_params` if required by :attr:`verbose`"""
        #################### Mute non-critical console logging for Experiments ####################
        if self.verbose in [0, 1]:
            G.Env.reporting_params.setdefault("console_params", {})["level"] = "CRITICAL"

        #################### Blacklist 'script_backup' ####################
        G.Env.result_paths["script_backup"] = None
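
# ------------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of this module): how a concrete subclass of
# `BaseOptimizationProtocol` is typically driven -- build an `Environment`, call
# `set_experiment_guidelines` with `space` choice declarations in `model_init_params`, then `go()`.
# The concrete protocol name (`BayesianOptimization`), the dimension classes (`Integer`,
# `Categorical`), and the exact `Environment` keyword names used below are assumptions that may
# differ between library versions; adjust to whatever your installed version exposes.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    from hyperparameter_hunter import Environment, BayesianOptimization, Integer, Categorical

    # Build a small toy dataset so the sketch is self-contained
    X, y = make_classification(n_samples=200, n_features=10, random_state=32)
    train_df = pd.DataFrame(X, columns=[f"feature_{_}" for _ in range(10)])
    train_df["target"] = y

    env = Environment(
        train_dataset=train_df,
        results_path="HyperparameterHunterAssets",  # Assumed keyword; older versions may differ
        metrics=["roc_auc_score"],  # Assumed keyword; see `Environment.metrics_params` docs
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )

    # `target_metric` follows the ('oof'/'holdout'/'in_fold', <metric name>) path format described
    # in `BaseOptimizationProtocol.__init__` above
    opt = BayesianOptimization(target_metric=("oof", "roc_auc_score"), iterations=5)
    opt.set_experiment_guidelines(
        model_initializer=RandomForestClassifier,
        model_init_params=dict(
            n_estimators=Integer(50, 200),  # Choice declarations become `opt.dimensions`
            max_depth=Categorical([3, 5, 10]),
        ),
    )
    opt.go()  # Runs `_optimization_loop`, executing one `CVExperiment` per iteration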