def __init__(
    # TODO: Make `model_init_params` an optional kwarg - If not given, algorithm defaults used
    self,
    # model=None,
    # model_initializer=None,
    # model_init_params=None,
    # TODO: Convert below 2 to above 3 lines for `TranslateTrace`
    model_initializer,
    model_init_params,
    model_extra_params=None,
    feature_selector=None,
    preprocessing_pipeline=None,
    preprocessing_params=None,
    notes=None,
    do_raise_repeated=False,
    auto_start=True,
    target_metric=None,
):
    # TODO: When `TranslateTrace` added document `model` below with expectation that if `model`
    # TODO: ... given, (`model_initializer`, `model_init_params`) should not be, and vice versa
    # TODO: `model` (Class instance, default=None);
    # TODO: `model_initializer`/`model_init_params` docstring types += "default=None"
    """Base class for :class:`BaseCVExperiment`

    Parameters
    ----------
    model_initializer: Class, or functools.partial, or class instance
        The algorithm class being used to initialize a model
    model_init_params: Dict, or object
        The dictionary of arguments given when creating a model instance with `model_initializer`
        via the `__init__` method of :class:`models.Model`. Any kwargs that are considered valid
        by the `__init__` method of `model_initializer` are valid in `model_init_params`
    model_extra_params: Dict, or None, default=None
        A dictionary of extra parameters passed to :class:`models.Model`. This is used to provide
        parameters to models' non-initialization methods (like `fit`, `predict`, `predict_proba`,
        etc.), and for neural networks
    feature_selector: List of str, callable, list of booleans, default=None
        The value provided when splitting apart the input data for all provided DataFrames.
        `feature_selector` is provided as the second argument for calls to `pandas.DataFrame.loc`
        in :meth:`BaseExperiment._initial_preprocessing`. If None, `feature_selector` is set to
        all columns in :attr:`train_dataset`, less :attr:`target_column`, and :attr:`id_column`
    preprocessing_pipeline: ...
        ... Experimental...
    preprocessing_params: ...
        ... Experimental...
    notes: String, or None, default=None
        Additional information about the Experiment that will be saved with the Experiment's
        description result file. This serves no purpose other than to facilitate saving
        Experiment details in a more readable format
    do_raise_repeated: Boolean, default=False
        If True and this Experiment locates a previous Experiment's results with matching
        Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
        warning will be logged
    auto_start: Boolean, default=True
        If True, after the Experiment is initialized, it will automatically call
        :meth:`BaseExperiment.preparation_workflow`, followed by
        :meth:`BaseExperiment.experiment_workflow`, effectively completing all essential tasks
        without requiring additional method calls
    target_metric: Tuple, str, default=('oof', <:attr:`environment.Environment.metrics`[0]>)
        A path denoting the metric to be used to compare completed Experiments or to use for
        certain early stopping procedures in some model classes. The first value should be one of
        ['oof', 'holdout', 'in_fold']. The second value should be the name of a metric being
        recorded according to the values supplied in
        :attr:`environment.Environment.metrics_params`. See the documentation for
        :func:`metrics.get_formatted_target_metric` for more info.
        Any values returned by, or used as the `target_metric` input to this function are
        acceptable values for :attr:`BaseExperiment.target_metric`"""
    # self._model_original = model  # TODO: Add for `TranslateTrace`
    self.model_initializer = model_initializer

    # Start from the algorithm's default hyperparameters, then overlay any user-supplied values.
    # If `model_init_params` is not a dict (e.g. a Keras `build_fn` callable), the `TypeError`
    # branch wraps it as `build_fn` instead
    self.model_init_params = identify_algorithm_hyperparameters(self.model_initializer)
    try:
        self.model_init_params.update(model_init_params)
    except TypeError:
        self.model_init_params.update(dict(build_fn=model_init_params))

    self.model_extra_params = model_extra_params if model_extra_params is not None else {}
    self.feature_selector = feature_selector if feature_selector is not None else []
    self.preprocessing_pipeline = preprocessing_pipeline or {}
    self.preprocessing_params = preprocessing_params if preprocessing_params is not None else {}

    self.notes = notes
    self.do_raise_repeated = do_raise_repeated
    self.auto_start = auto_start
    self.target_metric = target_metric

    #################### Attributes From Active Environment ####################
    G.Env.initialize_reporting()
    self._validate_environment()

    self.train_dataset = G.Env.train_dataset.copy()
    try:
        self.holdout_dataset = G.Env.holdout_dataset.copy()
    except AttributeError:
        self.holdout_dataset = G.Env.holdout_dataset
    try:
        self.test_dataset = G.Env.test_dataset.copy()
    except AttributeError:
        self.test_dataset = G.Env.test_dataset

    self.target_column = G.Env.target_column
    self.id_column = G.Env.id_column
    self.do_predict_proba = G.Env.do_predict_proba
    self.prediction_formatter = G.Env.prediction_formatter
    self.metrics_params = G.Env.metrics_params
    self.experiment_params = G.Env.cross_experiment_params
    self.cv_params = G.Env.cv_params
    self.result_paths = G.Env.result_paths
    self.cross_experiment_key = G.Env.cross_experiment_key

    #################### Instantiate Other Attributes ####################
    self.train_input_data = None
    self.train_target_data = None
    self.holdout_input_data = None
    self.holdout_target_data = None
    self.test_input_data = None
    self.model = None
    self.metrics = None  # Set by :class:`metrics.ScoringMixIn`
    self.stat_aggregates = dict()
    self.result_description = None

    #################### Experiment Identification Attributes ####################
    self.experiment_id = None
    self.hyperparameter_key = None
    self.algorithm_name, self.module_name = identify_algorithm(self.model_initializer)

    ScoringMixIn.__init__(self, **self.metrics_params if self.metrics_params else {})

    if self.auto_start is True:
        self.preparation_workflow()
        self.experiment_workflow()
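
# Illustrative note (not part of the original source): acceptable `target_metric` values, per the
# docstring above. Assuming the active `Environment` records "roc_auc_score" as its first metric:
#
#     target_metric=None                        # -> ('oof', 'roc_auc_score'), the documented default
#     target_metric=('oof', 'roc_auc_score')    # compare Experiments on out-of-fold ROC-AUC
#     target_metric=('holdout', 'f1_score')     # compare on a holdout metric instead
#
# Normalization of abbreviated or missing values is handled by
# :func:`metrics.get_formatted_target_metric`; see its documentation for the authoritative behavior.
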
def set_experiment_guidelines(
    self,
    model_initializer,
    model_init_params,
    model_extra_params=None,
    feature_selector=None,
    preprocessing_pipeline=None,
    preprocessing_params=None,
    notes=None,
    do_raise_repeated=True,
):
    """Provide the arguments necessary to instantiate
    :class:`experiments.CrossValidationExperiment`. This method has the same signature as
    :meth:`experiments.BaseExperiment.__init__` except where noted

    Parameters
    ----------
    model_initializer: Class, or functools.partial, or class instance
        The algorithm class being used to initialize a model
    model_init_params: Dict, or object
        The dictionary of arguments given when creating a model instance with `model_initializer`
        via the `__init__` method of :class:`models.Model`. Any kwargs that are considered valid
        by the `__init__` method of `model_initializer` are valid in `model_init_params`
    model_extra_params: Dict, or None, default=None
        A dictionary of extra parameters passed to :class:`models.Model`. This is used to provide
        parameters to models' non-initialization methods (like `fit`, `predict`, `predict_proba`,
        etc.), and for neural networks
    feature_selector: List of str, callable, list of booleans, default=None
        The value provided when splitting apart the input data for all provided DataFrames.
        `feature_selector` is provided as the second argument for calls to `pandas.DataFrame.loc`
        in :meth:`BaseExperiment._initial_preprocessing`. If None, `feature_selector` is set to
        all columns in :attr:`train_dataset`, less :attr:`target_column`, and :attr:`id_column`
    preprocessing_pipeline: ...
        ... Experimental...
    preprocessing_params: ...
        ... Experimental...
    notes: String, or None, default=None
        Additional information about the Experiment that will be saved with the Experiment's
        description result file. This serves no purpose other than to facilitate saving
        Experiment details in a more readable format
    do_raise_repeated: Boolean, default=True
        If True and this Experiment locates a previous Experiment's results with matching
        Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
        warning will be logged

    Notes
    -----
    The `auto_start` kwarg is not available here because
    :meth:`BaseOptimizationProtocol._execute_experiment` sets it to False in order to check for
    duplicated keys before running the whole Experiment.
    This is the most notable difference between calling :meth:`set_experiment_guidelines` and
    instantiating :class:`experiments.CrossValidationExperiment`"""
    self.model_initializer = model_initializer

    self.model_init_params = identify_algorithm_hyperparameters(self.model_initializer)
    try:
        self.model_init_params.update(model_init_params)
    except TypeError:
        self.model_init_params.update(dict(build_fn=model_init_params))

    self.model_extra_params = model_extra_params
    self.feature_selector = feature_selector
    self.preprocessing_pipeline = preprocessing_pipeline
    self.preprocessing_params = preprocessing_params
    self.notes = notes
    self.do_raise_repeated = do_raise_repeated

    if self.do_raise_repeated is False:
        G.warn_(
            'WARNING: Setting `do_raise_repeated`=False will allow Experiments to be unnecessarily duplicated'
        )

    self.algorithm_name, self.module_name = identify_algorithm(self.model_initializer)
    self._validate_guidelines()

    #################### Deal with Keras ####################
    if self.module_name == 'keras':
        reusable_build_fn, reusable_wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow(
            self.model_initializer,
            self.model_init_params['build_fn'],
            self.model_extra_params,
            self.source_script,
        )
        self.model_init_params = dict(build_fn=reusable_build_fn)
        self.model_extra_params = reusable_wrapper_params
        self.dummy_layers = dummy_layers
        self.dummy_compile_params = dummy_compile_params
        # FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam'

    self.set_dimensions()
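
# Usage sketch (illustrative only, not from the original source): a concrete OptPro typically
# provides these guidelines once before starting its search. The names used below
# (`BayesianOptimization`, `Integer`, `XGBClassifier`, and `go()`) are assumptions about the
# surrounding library and the user's model, not definitions taken from this excerpt:
#
#     opt = BayesianOptimization(iterations=12)
#     opt.set_experiment_guidelines(
#         model_initializer=XGBClassifier,
#         model_init_params=dict(
#             max_depth=Integer(2, 10),   # a searchable dimension
#             subsample=0.5,              # a fixed hyperparameter value
#         ),
#     )
#     opt.go()
#
# Because `auto_start` is not exposed here, each candidate Experiment is only executed after the
# protocol has checked its hyperparameter key for duplicates, as noted in the docstring above.
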
def __init__(
    self,
    model_initializer,
    model_init_params=None,
    model_extra_params=None,
    feature_engineer=None,
    feature_selector=None,
    notes=None,
    do_raise_repeated=False,
    auto_start=True,
    target_metric=None,
):
    """One-off Experimentation base class

    **Bare-bones Description:** Runs the cross-validation scheme defined by `Environment`, during
    which 1) Datasets are processed according to `feature_engineer`; 2) Models are built by
    instantiating `model_initializer` with `model_init_params`; 3) Models are trained on
    processed data, optionally using parameters from `model_extra_params`; 4) Results are logged
    and recorded for each fitting period; 5) Descriptions, predictions, results (both averages
    and individual periods), etc. are saved.

    **What's the Big Deal?** The most important takeaway from the above description is that
    descriptions/results are THOROUGH and REUSABLE. By thorough, I mean that all of a model's
    hyperparameters are saved, not just the ones given in `model_init_params`. This may sound
    odd, but it's important because it makes results reusable during optimization, when you may
    be using a different set of hyperparameters. It helps with other things like preventing
    duplicate experiments and ensembling, as well. But the big part is that this transforms
    hyperparameter optimization from an isolated, throwaway process we can only afford when an ML
    project is sufficiently "mature" to a process that covers the entire lifespan of a project.
    No Experiment is forgotten or wasted. Optimization is automatically given the data it needs
    to succeed by drawing on all your past Experiments and optimization rounds.

    The Experiment has three primary missions:

    1. Act as scaffold for organizing ML Experimentation and optimization
    2. Record Experiment descriptions and results
    3. Eliminate lots of repetitive/error-prone boilerplate code

    Providing a scaffold for the entire ML process is critical because without a standardized
    format, everything we do looks different. Without a unified scaffold, development is slower,
    more confusing, and less adaptable. One of the benefits of standardizing the format of ML
    Experimentation is that it enables us to exhaustively record all the important
    characteristics of an Experiment, as well as an assortment of customizable result files --
    all in a way that allows them to be reused in the future.

    **What About Data/Metrics?** Experiments require an active
    :class:`~hyperparameter_hunter.environment.Environment` in order to function, from which the
    Experiment collects important cross-experiment parameters, such as datasets, metrics,
    cross-validation schemes, and even callbacks to inherit, among many other properties
    documented in :class:`~hyperparameter_hunter.environment.Environment`

    Parameters
    ----------
    model_initializer: Class, or functools.partial, or class instance
        Algorithm class used to initialize a model, such as XGBoost's `XGBRegressor`, or
        SKLearn's `KNeighborsClassifier`; although, there are hundreds of possibilities across
        many different ML libraries. `model_initializer` is expected to define at least `fit` and
        `predict` methods. `model_initializer` will be initialized with `model_init_params`, and
        its "extra" methods (`fit`, `predict`, etc.) will be invoked with parameters in
        `model_extra_params`
    model_init_params: Dict, or object (optional)
        Dictionary of arguments given to create an instance of `model_initializer`. Any kwargs
        that are considered valid by the `__init__` method of `model_initializer` are valid in
        `model_init_params`.
        One of the key features that makes HyperparameterHunter so magical is that **ALL**
        hyperparameters in the signature of `model_initializer` (and their default values) are
        discovered -- whether or not they are explicitly given in `model_init_params`. Not only
        does this make Experiment result descriptions incredibly thorough, it also makes
        optimization smoother, more effective, and far less work for the user. For example, take
        LightGBM's `LGBMRegressor`, with `model_init_params`=`dict(learning_rate=0.2)`.
        HyperparameterHunter recognizes that this differs from the default of 0.1. It also
        recognizes that `LGBMRegressor` is actually initialized with more than a dozen other
        hyperparameters we didn't bother mentioning, and it records their values, too. So if we
        want to optimize `num_leaves` tomorrow, the OptPro doesn't start from scratch. It knows
        that we ran an Experiment that didn't explicitly mention `num_leaves`, but its default
        value was 31, and it uses this information to fuel optimization -- all without us having
        to manually keep track of tons of janky collections of hyperparameters. In fact, we
        really don't need to go out of our way at all. HyperparameterHunter just acts as our
        faithful lab assistant, keeping track of all the stuff we'd rather not worry about
    model_extra_params: Dict (optional)
        Dictionary of extra parameters for models' non-initialization methods (like `fit`,
        `predict`, `predict_proba`, etc.), and for neural networks. To specify parameters for an
        extra method, place them in a dict named for the extra method to which the parameters
        should be given. For example, to call `fit` with `early_stopping_rounds`=5, use
        `model_extra_params`=`dict(fit=dict(early_stopping_rounds=5))`. For models whose `fit`
        methods have a kwarg like `eval_set` (such as XGBoost's), one can use the
        `DatasetSentinel` attributes of the current active
        :class:`~hyperparameter_hunter.environment.Environment`, documented under its
        "Attributes" section and under
        :attr:`~hyperparameter_hunter.environment.Environment.train_input`. An example using
        several DatasetSentinels can be found in HyperparameterHunter's
        [XGBoost Classification Example](https://github.com/HunterMcGushion/hyperparameter_hunter/blob/master/examples/xgboost_examples/classification.py)
    feature_engineer: `FeatureEngineer`, or list (optional)
        Feature engineering/transformation/pre-processing steps to apply to datasets defined in
        :class:`~hyperparameter_hunter.environment.Environment`. If list, will be used to
        initialize :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`, and can
        contain any of the following values:

        1. :class:`~hyperparameter_hunter.feature_engineering.EngineerStep` instance
        2. Function input to :class:`~hyperparameter_hunter.feature_engineering.EngineerStep`

        For important information on properly formatting `EngineerStep` functions, please see the
        documentation of :class:`~hyperparameter_hunter.feature_engineering.EngineerStep`.
        OptPros can perform hyperparameter optimization of `feature_engineer` steps. This
        capability adds a third allowed value to the above list and is documented in
        :meth:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro.forge_experiment`
    feature_selector: List of str, callable, or list of booleans (optional)
        Column names to include as input data for all provided DataFrames. If None,
        `feature_selector` is set to all columns in :attr:`train_dataset`, less
        :attr:`target_column`, and :attr:`id_column`.
        `feature_selector` is provided as the second argument for calls to
        `pandas.DataFrame.loc` when constructing datasets
    notes: String (optional)
        Additional information about the Experiment that will be saved with the Experiment's
        description result file. This serves no purpose other than to facilitate saving
        Experiment details in a more readable format
    do_raise_repeated: Boolean, default=False
        If True and this Experiment locates a previous Experiment's results with matching
        Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
        warning will be logged
    auto_start: Boolean, default=True
        If True, after the Experiment is initialized, it will automatically call
        :meth:`BaseExperiment.preparation_workflow`, followed by
        :meth:`BaseExperiment.experiment_workflow`, effectively completing all essential tasks
        without requiring additional method calls
    target_metric: Tuple, str, default=('oof', <:attr:`environment.Environment.metrics`[0]>)
        Path denoting the metric to be used to compare completed Experiments or to use for
        certain early stopping procedures in some model classes. The first value should be one of
        ['oof', 'holdout', 'in_fold']. The second value should be the name of a metric being
        recorded according to the values supplied in
        :attr:`hyperparameter_hunter.environment.Environment.metrics_params`. See the
        documentation for :func:`hyperparameter_hunter.metrics.get_formatted_target_metric` for
        more info. Any values returned by, or used as the `target_metric` input to this function
        are acceptable values for `target_metric`

    See Also
    --------
    :meth:`hyperparameter_hunter.optimization.protocol_core.BaseOptPro.forge_experiment`
        OptPro method to define hyperparameter search scaffold for building Experiments during
        optimization. This method follows the same format as Experiment initialization, but it
        adds the ability to provide hyperparameter values as ranges to search over, via
        subclasses of :class:`~hyperparameter_hunter.space.dimensions.Dimension`. The other
        notable difference is that `forge_experiment` removes the `auto_start` and
        `target_metric` kwargs, which is described in the `forge_experiment` docstring

    Notes
    -----
    :class:`~hyperparameter_hunter.environment.Environment`
        Provides critical information on how Experiments should be conducted, as well as the data
        to be used by Experiments. An `Environment` must be active before executing any
        Experiment or OptPro
    :func:`~hyperparameter_hunter.callbacks.bases.lambda_callback`
        Enables customization of the Experimentation process and access to all Experiment
        internals through a collection of methods that are invoked at all the important periods
        over an Experiment's lifespan.
        These can be provided via the `experiment_callbacks` kwarg of
        :class:`~hyperparameter_hunter.environment.Environment`, and the callback classes
        literally get thrown in to the parent classes of the Experiment, so they're kind of a big
        deal"""
    self.model_initializer = model_initializer

    self.model_init_params = identify_algorithm_hyperparameters(self.model_initializer)
    model_init_params = model_init_params if model_init_params is not None else {}
    try:
        self.model_init_params.update(model_init_params)
    except TypeError:
        self.model_init_params.update(dict(build_fn=model_init_params))

    self.model_extra_params = model_extra_params if model_extra_params is not None else {}

    self.feature_engineer = feature_engineer
    if not isinstance(self.feature_engineer, FeatureEngineer):
        self.feature_engineer = FeatureEngineer(self.feature_engineer)

    self.feature_selector = feature_selector if feature_selector is not None else []

    self.notes = notes
    self.do_raise_repeated = do_raise_repeated
    self.auto_start = auto_start
    self.target_metric = target_metric

    #################### Attributes From Active Environment ####################
    G.Env.initialize_reporting()
    self._validate_environment()

    self.train_dataset = G.Env.train_dataset.copy()
    try:
        self.holdout_dataset = G.Env.holdout_dataset.copy()
    except AttributeError:
        self.holdout_dataset = G.Env.holdout_dataset
    try:
        self.test_dataset = G.Env.test_dataset.copy()
    except AttributeError:
        self.test_dataset = G.Env.test_dataset

    self.target_column = G.Env.target_column
    self.id_column = G.Env.id_column
    self.do_predict_proba = G.Env.do_predict_proba
    self.prediction_formatter = G.Env.prediction_formatter
    self.metrics_params = G.Env.metrics_params
    self.experiment_params = G.Env.cross_experiment_params
    self.cv_params = G.Env.cv_params
    self.result_paths = G.Env.result_paths
    self.cross_experiment_key = G.Env.cross_experiment_key

    #################### Dataset Attributes ####################
    self.data_train = None
    self.data_oof = None
    self.data_holdout = None
    self.data_test = None

    #################### Other Attributes ####################
    self.model = None
    self.metrics = None  # Set by :class:`metrics.ScoringMixIn`
    self.stat_aggregates = dict()
    self.result_description = None

    #################### Experiment Identification Attributes ####################
    self.experiment_id = None
    self.hyperparameter_key = None
    self.algorithm_name, self.module_name = identify_algorithm(self.model_initializer)

    ScoringMixIn.__init__(self, **self.metrics_params if self.metrics_params else {})

    if self.auto_start is True:
        self.preparation_workflow()
        self.experiment_workflow()
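
# Usage sketch (illustrative only): a minimal one-off Experiment built with the kwargs documented
# above. It assumes an :class:`~hyperparameter_hunter.environment.Environment` has already been
# activated, and that the concrete class is exported as `CVExperiment` (an assumption about the
# surrounding library, not taken from this excerpt). `RandomForestClassifier` is just an example
# `model_initializer`:
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     experiment = CVExperiment(
#         model_initializer=RandomForestClassifier,
#         model_init_params=dict(n_estimators=200, max_depth=5),
#         model_extra_params=dict(fit=dict(sample_weight=None)),  # routed to `fit`, per docstring
#     )
#
# With `auto_start=True` (the default), initialization immediately runs `preparation_workflow()`
# and then `experiment_workflow()`, so no further method calls are required.
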