def save_result(self):
    """Write the Experiment description to '<result_path>/<experiment_id>.json', creating
    :attr:`result_path` first if it does not yet exist.

    Returns
    -------
    'break'
        Returned when :attr:`do_full_save` is a callable and rejects the description object.
        This signals :class:`recorders.RecorderList` to stop recording result files"""
    target_file = f"{self.result_path}/{self.experiment_id}.json"

    try:
        write_json(target_file, self.result, do_clear=False)
    except FileNotFoundError:
        # `result_path` directory does not exist yet - create it, then retry the write
        os.makedirs(self.result_path, exist_ok=False)
        write_json(target_file, self.result, do_clear=False)

    if self.do_full_save is None:
        return
    if not self.do_full_save(self.result):
        G.warn(
            'Breaking out of result-saving loop early! Remaining result files will not be saved'
        )
        return 'break'
def update_custom_environment_params(self):
    """Update null instance attributes using values from `environment_params_path`, falling
    back to :attr:`DEFAULT_PARAMS`.

    Raises
    ------
    TypeError
        If `environment_params_path` is neither None nor a str, or if its file does not
        contain a dict"""
    allowed_parameter_keys = [k for k, v in signature(Environment).parameters.items() if v.kind == v.KEYWORD_ONLY]
    user_defaults = {}

    if (not isinstance(self.environment_params_path, str)) and (self.environment_params_path is not None):
        raise TypeError('environment_params_path must be a str, not {}: {}'.format(*type_val(self.environment_params_path)))

    try:
        user_defaults = read_json(self.environment_params_path)
    except TypeError:
        # `read_json(None)` raises TypeError; `environment_params_path=None` means "no file",
        # so `user_defaults` remains {} in that case
        if self.environment_params_path is not None:
            raise
    # NOTE: A redundant `except FileNotFoundError: raise` clause was removed here - the
    # exception propagates identically without it

    if not isinstance(user_defaults, dict):
        raise TypeError('environment_params_path must contain a dict. Received {}: {}'.format(*type_val(user_defaults)))

    #################### Check user_defaults ####################
    for k, v in user_defaults.items():
        if k not in allowed_parameter_keys:
            G.warn('\n\t'.join([
                'Invalid key ({}) in user-defined default Environment parameter file at "{}". If expected to do something,',
                'it really won\'t, so it should be removed or fixed. The following are valid default keys: {}'
            ]).format(k, self.environment_params_path, allowed_parameter_keys))
        elif getattr(self, k) is None:
            # Only fill attributes that are still null - explicit values always win
            setattr(self, k, v)
            G.debug('Environment kwarg "{}" was set to user default at "{}"'.format(k, self.environment_params_path))

    #################### Check Module Default Environment Arguments ####################
    for k in allowed_parameter_keys:
        if getattr(self, k) is None:
            setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
def initialize_preprocessing_stage(self):
    """Determine the preprocessing stage according to class attributes or method input.

    Returns
    -------
    str
        One of 'pre_cv' or 'intra_cv'. When :attr:`preprocessing_stage` is 'infer',
        'intra_cv' is returned if both validation datasets are set, and 'pre_cv' if neither
        is set

    Raises
    ------
    ValueError
        If the stage is invalid or cannot be inferred, and :attr:`fail_gracefully` is not
        True"""
    _stages, _err = ['pre_cv', 'intra_cv'], 'Unknown error occurred.'
    _i_strs = ['validation_input_data', 'validation_target_data']
    _i_sets = [getattr(self, _) for _ in _i_strs]

    if self.preprocessing_stage in _stages:
        return self.preprocessing_stage
    elif self.preprocessing_stage == 'infer':
        # Idiom fix: `all(_i_sets)`/`any(_i_sets)` replace the redundant list comprehensions
        if all(_i_sets):
            return 'intra_cv'
        elif any(_i_sets):
            # Exactly one of the two validation datasets is set - inference is ambiguous
            _err = "Inference failed. {} types must be same. Received: {}".format(
                _i_strs, [type(_) for _ in _i_sets])
        else:
            return 'pre_cv'
    else:
        _err = "preprocessing_stage must be in {}. Received type {}: {}".format(
            _stages, *type_val(self.preprocessing_stage))

    if self.fail_gracefully is True:
        G.warn(_err)
        return 'pre_cv'
    else:
        raise ValueError(_err)
def __call__(self, wrapped, instance, args, kwargs):
    """Invoke `wrapped`, first filling missing keyword arguments from a user parameters file.

    Parameters
    ----------
    wrapped: Callable
        The wrapped callable; its keyword-only parameters define the keys accepted from the
        parameters file
    instance: Object
        Part of the wrapper signature (unused here)
    args: Tuple
        Positional arguments for `wrapped`
    kwargs: Dict
        Keyword arguments for `wrapped`; missing entries may be filled from the file

    Returns
    -------
    Object
        Result of `wrapped(*args, **kwargs)` after file defaults are merged in"""
    file = self.file
    file_params = {}

    #################### Locate Parameters File ####################
    if not file and self.key is not None:
        # `args[self.key]` raises TypeError when `self.key` is a str index into the args
        # tuple; `suppress` makes this positional lookup best-effort
        with suppress(TypeError):
            file = kwargs.get(self.key, None) or args[self.key]

    if file:  # If `file=None`, continue with empty dict of `file_params`
        file_params = read_json(file)

        if not isinstance(file_params, dict):
            raise TypeError("{} must have dict, not {}".format(
                file, file_params))

    #################### Check Valid Parameters for `wrapped` ####################
    ok_keys = [
        k for k, v in signature(wrapped).parameters.items()
        if v.kind == v.KEYWORD_ONLY
    ]

    for k, v in file_params.items():
        if k not in ok_keys:
            if self.verbose:
                G.warn(
                    f"Invalid key ({k}) in user parameters file: {file}")

        # NOTE(review): keys absent from `ok_keys` still fall through to the injection
        # below - confirm whether invalid keys are intentionally forwarded to `wrapped`
        if k not in kwargs:
            kwargs[k] = v

            if self.verbose:
                G.debug(
                    f"Parameter `{k}` set to user default in parameters file: '{file}'"
                )

    return wrapped(*args, **kwargs)
def on_fold_end(self):
    """Record the simple average of epochs elapsed across the fold's runs, then defer to the
    parent callback"""
    # Unused `rep_key` ("rep_{}".format(self._rep)) removed - only `fold_key` is used below
    fold_key = "fold_{}".format(self._fold)

    #################### Simple Average of Fold's Runs ####################
    try:
        self.stat_aggregates["epochs_elapsed"][fold_key]["simple_average"] = np.average(
            self.stat_aggregates["epochs_elapsed"][fold_key]["run_values"]
        )
    except KeyError:
        # self.stat_aggregates does not have 'epochs_elapsed' key - epochs never recorded in first place
        pass
    except TypeError:
        vals = self.stat_aggregates["epochs_elapsed"][fold_key]["run_values"]
        G.warn(
            "\n".join(
                [
                    # BUG FIX: `fold_key` was not interpolated before - the message printed
                    # the literal text "fold_key" instead of its value
                    f"TypeError encountered when averaging stat_aggregates['epochs_elapsed']['{fold_key}']:",
                    "\tValues: {}".format(vals),
                    "\tTypes: {}".format([type(_) for _ in vals]),
                    "If the above values are numbers and you want them averaged, fix me ASAP! If not, ignore me",
                ]
            )
        )

    super().on_fold_end()
def handle_complex_types(self):
    """Locate complex types in :attr:`parameters`, create hashes for them, add lookup entries
    linking their original values to their hashes, then update their values in
    :attr:`parameters` to their hashes to facilitate Description saving"""
    if self.tested_keys_dir is None:  # Key-making blacklisted
        return

    # Maps hash -> list of parameter names whose DataFrame produced that hash, used to
    # detect identical DataFrames after remapping
    dataframe_hashes = {}

    def visit(path, key, value):
        """Check whether a parameter is of a complex type. If not, return it unchanged.
        Otherwise, 1) create a hash for its value; 2) save a complex type lookup entry
        linking `key`, `value`, and the hash for `value`; and 3) return the hashed value
        with `key`, instead of the original complex-typed `value`

        Parameters
        ----------
        path: Tuple
            The path of keys that leads to `key`
        key: Str
            The parameter name
        value: *
            The value of the parameter `key`

        Returns
        -------
        Tuple of (`key`, value), in which value is either unchanged or a hash for the
        original `value`"""
        if isinstance(value, BaseKerasCallback):
            return (key, keras_callback_to_dict(value))
        if isinstance(value, Sentinel):
            return (key, value.sentinel)
        elif callable(value) or isinstance(value, pd.DataFrame):
            hashed_value = make_hash_sha256(value)

            if isinstance(value, pd.DataFrame):
                dataframe_hashes.setdefault(hashed_value, []).append(key)

            try:
                self.add_complex_type_lookup_entry(path, key, value, hashed_value)
            except FileNotFoundError:
                # Lookup directory missing - create it, then retry the entry
                os.makedirs(self.key_attribute_lookup_dir, exist_ok=False)
                self.add_complex_type_lookup_entry(path, key, value, hashed_value)

            return (key, hashed_value)
        return (key, value)

    self.parameters = remap(self.parameters, visit=visit)

    #################### Check for Identical DataFrames ####################
    for df_hash, df_names in dataframe_hashes.items():
        if len(df_names) > 1:
            G.warn(
                f"The dataframes: {df_names} have an identical hash: {df_hash!s}. This implies the dataframes are "
                + "identical, which is probably unintentional. If left alone, scores may be misleading!"
            )
def results_path(self, value):
    """Set :attr:`_results_path`, appending the assets dirname when missing and creating the
    directory if it does not exist. A None value disables result storage entirely."""
    self._results_path = value

    if self._results_path is None:
        G.warn("Received results_path=None. Results will not be stored at all.")
        return
    if not isinstance(self._results_path, str):
        raise TypeError(f"results_path must be None or str, not {value}")

    if not self._results_path.endswith(ASSETS_DIRNAME):
        self._results_path = os.path.join(self._results_path, ASSETS_DIRNAME)
    if not os.path.exists(self._results_path):
        make_dirs(self._results_path, exist_ok=True)
def fit(self):
    """Train the Keras model on the training data, falling back to the standard fit method
    if the direct call fails. Always attempts to record elapsed epochs and reload the best
    `ModelCheckpoint` weights afterward."""
    try:
        self.model_history = self.model.fit(self.train_input, self.train_target)
    except Exception as exception:
        G.warn(f"KerasModel.fit() failed with Exception: {exception}\nAttempting standard fit method")
        super().fit()
    finally:
        #################### Record Epochs Elapsed if Model has 'epoch' Attribute ####################
        with suppress(AttributeError):
            self.epochs_elapsed = len(self.model_history.epoch)

        #################### Load Model Checkpoint if Possible ####################
        checkpoints = [
            _ for _ in self.extra_params.get("callbacks", [])
            if _.__class__.__name__ == "ModelCheckpoint"
        ]
        for checkpoint in checkpoints:
            self.model.model.load_weights(checkpoint.filepath)
def update_custom_environment_params(self):
    """Update null instance attributes using values from `environment_params_path`, falling
    back to :attr:`DEFAULT_PARAMS`.

    Raises
    ------
    TypeError
        If `environment_params_path` is neither None nor a str, or if the file it names does
        not contain a dict"""
    allowed_parameter_keys = [
        k for k, v in signature(Environment).parameters.items() if v.kind == v.KEYWORD_ONLY
    ]
    user_defaults = {}

    if (not isinstance(self.environment_params_path, str)) and (
            self.environment_params_path is not None):
        raise TypeError(
            f"Non-str `environment_params_path`: {self.environment_params_path}"
        )

    try:
        user_defaults = read_json(self.environment_params_path)
    except TypeError:
        if self.environment_params_path is not None:
            raise  # If `environment_params_path=None`, no error raised - `user_defaults` continues as {}
    # NOTE: A redundant `except FileNotFoundError: raise` clause was removed here - an
    # unhandled FileNotFoundError propagates identically on its own

    if not isinstance(user_defaults, dict):
        raise TypeError(
            "environment_params_path must have dict, not {}".format(
                user_defaults))

    #################### Check user_defaults ####################
    for k, v in user_defaults.items():
        if k not in allowed_parameter_keys:
            G.warn("\n\t".join([
                "Invalid key ({}) in user-defined default Environment parameter file at '{}'. If expected to do something,",
                "it really won't, so it should be removed or fixed. The following are valid default keys: {}",
            ]).format(k, self.environment_params_path, allowed_parameter_keys))
        elif getattr(self, k) is None:
            # Only fill attributes that are still null - explicit values always win
            setattr(self, k, v)
            G.debug(
                f"Environment.`{k}` set to user default: '{self.environment_params_path}'"
            )

    #################### Check Module Default Environment Arguments ####################
    for k in allowed_parameter_keys:
        if getattr(self, k) is None:
            setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
def experiment_workflow(self):
    """Run the full experiment process: repeated-run detection, seeding, preprocessing,
    execution, result saving, and cleanup"""
    #################### Handle Repeated Experiments ####################
    if self.hyperparameter_key.exists is True:
        message = f'{self!r} has already been run'
        if self.do_raise_repeated is True:
            self._clean_up()
            raise RepeatedExperimentError(message)
        G.warn(message)

    self._initialize_random_seeds()
    self._initial_preprocessing()
    self.execute()

    #################### Save Experiment Results ####################
    recorders = RecorderList(file_blacklist=G.Env.file_blacklist)
    recorders.format_result()
    G.log(f'Saving results for Experiment: "{self.experiment_id}"')
    recorders.save_result()

    self._clean_up()
def experiment_workflow(self):
    """Run the full experiment process: repeated-run detection, seeding, execution, result
    saving, and cleanup"""
    #################### Handle Repeated Experiments ####################
    if self.hyperparameter_key.exists is True:
        message = f"{self!r} has already been run"
        if self.do_raise_repeated is True:
            self._clean_up()
            raise RepeatedExperimentError(message)
        G.debug(message)
        G.warn("WARNING: Duplicate experiment!")

    self._initialize_random_seeds()
    self.execute()

    #################### Save Experiment Results ####################
    recorders = RecorderList(
        file_blacklist=G.Env.file_blacklist, extra_recorders=G.Env.experiment_recorders
    )
    recorders.format_result()
    G.log(f"Saving results for Experiment: '{self.experiment_id}'")
    recorders.save_result()

    self._clean_up()
def on_fold_end(self):
    """Average the epoch counts recorded for the fold's runs, then defer to the parent
    callback"""
    rep_key, fold_key = 'rep_{}'.format(self._rep), 'fold_{}'.format(self._fold)

    #################### Simple Average of Fold's Runs ####################
    try:
        fold_record = self.stat_aggregates['epochs_elapsed'][fold_key]
        fold_record['simple_average'] = np.average(fold_record['run_values'])
    except KeyError:
        # 'epochs_elapsed' was never recorded, so there is nothing to average
        pass
    except TypeError:
        run_values = self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']
        G.warn('\n'.join([
            'TypeError encountered when averaging stat_aggregates[{}][{}]:'.format('epochs_elapsed', fold_key),
            '\tValues: {}'.format(run_values),
            '\tTypes: {}'.format([type(_) for _ in run_values]),
            'If the above values are numbers and you want them averaged, fix me ASAP! If not, ignore me'
        ]))

    super().on_fold_end()
def update_custom_environment_params(self):
    """Fill any still-null Environment attributes, first from the file at
    `environment_params_path`, then from :attr:`DEFAULT_PARAMS`"""
    valid_keys = [
        name for name, param in signature(Environment).parameters.items()
        if param.kind == param.KEYWORD_ONLY
    ]
    user_defaults = {}

    try:
        user_defaults = read_json(self.environment_params_path)
    except (TypeError, OSError):
        # If `environment_params_path=None`, no error raised - `user_defaults` continues as {}
        if self.environment_params_path is not None:
            raise

    if not isinstance(user_defaults, dict):
        raise TypeError(
            "environment_params_path must have dict, not {}".format(
                user_defaults))

    #################### Check user_defaults ####################
    for key, default in user_defaults.items():
        if key not in valid_keys:
            G.warn(
                f"Invalid key ({key}) in user Environment parameters: {self.environment_params_path}"
            )
        elif getattr(self, key) is None:
            setattr(self, key, default)
            G.debug(
                f"Environment.`{key}` set to user default: '{self.environment_params_path}'"
            )

    #################### Check Module Default Environment Arguments ####################
    for key in valid_keys:
        if getattr(self, key) is None:
            setattr(self, key, self.DEFAULT_PARAMS.get(key, None))
def validate_file_blacklist(blacklist):
    """Check that `blacklist` is an acceptable specification of result files to skip.

    Parameters
    ----------
    blacklist: List of strings, or None
        The result files that should not be saved. The special value "ALL" blacklists every
        result file

    Returns
    -------
    blacklist: List
        If not empty, acceptable list of result file types to blacklist

    Notes
    -----
    'heartbeat': saved by copying/renaming the general "Heartbeat.log" file (which tracks
    the in-progress experiment) rather than generating a new file.

    'script_backup': saved immediately after an experiment starts, to capture the executed
    file before any edits and to survive catastrophic failures. If the initiating file does
    not end in ".py" (e.g. Jupyter notebooks), the experiment behaves as though
    "script_backup" were blacklisted.

    'description'/'tested_keys': the bare minimum of experiment recording - blacklisting
    either means the experiment effectively never took place. Blacklisting 'tested_keys'
    also excludes the "KeyAttributeLookup" directory from updates.

    'current_heartbeat': the general 'HyperparameterHunterAssets/Heartbeat.log' file.
    Blacklisting it automatically blacklists 'heartbeat' as well, since the per-experiment
    heartbeat is copied from the general one"""
    valid_values = [
        # 'checkpoint',
        "description",
        "heartbeat",
        "predictions_holdout",
        "predictions_in_fold",
        "predictions_oof",
        "predictions_test",
        "script_backup",
        "tested_keys",
        "current_heartbeat",
    ]

    if blacklist == "ALL":
        G.warn('WARNING: Received `blacklist`="ALL". Nothing will be saved')
        return blacklist
    if not blacklist:
        return []
    if not isinstance(blacklist, list):
        raise TypeError("Expected blacklist to be a list, not: {}".format(blacklist))

    non_string_entries = [(type(_).__name__, _) for _ in blacklist if not isinstance(_, str)]
    if non_string_entries:
        raise TypeError("Expected blacklist contents to be strings, not: {}".format(non_string_entries))

    for entry in blacklist:
        if entry not in valid_values:
            raise ValueError(f"Invalid blacklist value: {entry}.\nExpected one of: {valid_values}")
        if entry in ("description", "tested_keys"):
            G.warn(f"Including {entry!r} in blacklist will severely impede library functionality")

    # Blacklist experiment-specific heartbeat if general (current) heartbeat is blacklisted
    if ("current_heartbeat" in blacklist) and ("heartbeat" not in blacklist):
        blacklist.append("heartbeat")

    return blacklist
def custom_pipeline_method_builder(self, functionality, name=None):
    """Attach `functionality` to this instance as a custom pipeline method.

    Parameters
    ----------
    functionality: Callable
        Performs all desired transformations/alterations/work for this pipeline step. It
        receives no arguments beyond `self`, so it should operate directly on the class's
        data attributes ('self.train_input_data', 'self.train_target_data',
        'self.validation_input_data', 'self.validation_target_data',
        'self.holdout_input_data', 'self.holdout_target_data', 'self.test_input_data').
        Tunable inputs (like imputation values) should be read from
        `self.preprocessing_params` rather than passed as arguments, so they are recorded
        among the experiment's hyperparameters and can be changed between experiments
    name: String, or None, default=None
        Name for the new method. If None, `functionality.__name__` is used. If the chosen
        name already exists on the instance, a numeric suffix is appended until the name is
        unique

    Returns
    -------
    name: str
        The name of the new method that was created

    Raises
    ------
    TypeError
        If `functionality` is not callable"""
    if not callable(functionality):
        raise TypeError(
            'Custom pipeline methods must be callable. Received type {}'.format(type(functionality)))

    #################### Resolve Method Name ####################
    if name is None:
        # BUG FIX: previously an unimplemented TODO - `name=None` crashed in `hasattr` below
        name = functionality.__name__

    base_name, suffix = name, 0
    while hasattr(self, name):
        # BUG FIX: previously `_name = name + ''` left the name unchanged, so any naming
        # conflict looped forever; a numeric suffix guarantees termination
        suffix += 1
        _name = '{}_{}'.format(base_name, suffix)
        G.warn(
            'Encountered naming conflict in custom_pipeline_method_builder with "{}". Trying "{}"'
            .format(name, _name))
        name = _name

    #################### Create New Custom Method ####################
    setattr(self, name, functionality)
    return name
def handle_complex_types(self):
    """Locate complex types in :attr:`parameters`, create hashes for them, add lookup entries
    linking their original values to their hashes, then update their values in
    :attr:`parameters` to their hashes to facilitate Description saving"""
    # Maps hash -> list of parameter names whose DataFrame produced that hash, used to
    # detect identical DataFrames after remapping
    dataframe_hashes = {}

    def enter(path, key, value):
        """Produce iterable of attributes to remap for instances of :class:`metrics.Metric`"""
        if isinstance(value, Metric):
            metric_attrs = ["name", "metric_function", "direction"]
            return ({}, [(_, getattr(value, _)) for _ in metric_attrs])
        return default_enter(path, key, value)

    def visit(path, key, value):
        """Check whether a parameter is of a complex type. If not, return it unchanged.
        Otherwise, 1) create a hash for its value; 2) save a complex type lookup entry
        linking `key`, `value`, and the hash for `value`; and 3) return the hashed value
        with `key`, instead of the original complex-typed `value`

        Parameters
        ----------
        path: Tuple
            The path of keys that leads to `key`
        key: Str
            The parameter name
        value: *
            The value of the parameter `key`

        Returns
        -------
        Tuple of (`key`, value), in which value is either unchanged or a hash for the
        original `value`"""
        if isinstance(value, BaseKerasCallback):
            return (key, keras_callback_to_dict(value))
        if isinstance(value, Sentinel):
            return (key, value.sentinel)
        elif callable(value) or isinstance(value, pd.DataFrame):
            # TODO: Check here if callable, and using a `Trace`d model/model_initializer
            # TODO: If so, pass extra kwargs to below `make_hash_sha256`, which are eventually given to `hash_callable`
            # TODO: Notably, `ignore_source_lines=True` should be included
            # FLAG: Also, look into adding package version number to hashed attributes
            hashed_value = make_hash_sha256(value)

            if isinstance(value, pd.DataFrame):
                dataframe_hashes.setdefault(hashed_value, []).append(key)

            if self.tested_keys_dir is not None:  # Key-making not blacklisted
                try:
                    self.add_complex_type_lookup_entry(path, key, value, hashed_value)
                except (FileNotFoundError, OSError):
                    # Lookup directory missing - create the nested path, then retry
                    make_dirs(os.path.join(self.lookup_dir, *path), exist_ok=False)
                    self.add_complex_type_lookup_entry(path, key, value, hashed_value)

            return (key, hashed_value)
        return (key, value)

    self.parameters = remap(self.parameters, visit=visit, enter=enter)

    #################### Check for Identical DataFrames ####################
    for df_hash, df_names in dataframe_hashes.items():
        if len(df_names) > 1:
            G.warn(
                f"The dataframes: {df_names} have an identical hash: {df_hash!s}. This implies the dataframes are "
                + "identical, which is probably unintentional. If left alone, scores may be misleading!"
            )
def validate_file_blacklist(blacklist):
    """Check that `blacklist` is an acceptable specification of result files to skip.

    Parameters
    ----------
    blacklist: List of strings, or None
        The result files that should not be saved. The special value "ALL" blacklists every
        result file

    Returns
    -------
    blacklist: List
        If not empty, acceptable list of result file types to blacklist

    Notes
    -----
    'heartbeat': saved by copying/renaming the general "Heartbeat.log" file (which tracks
    the in-progress experiment) rather than generating a new file.

    'script_backup': saved immediately after an experiment starts, to capture the executed
    file before any edits and to survive catastrophic failures.

    'description'/'tested_keys': the bare minimum of experiment recording - blacklisting
    either means the experiment effectively never took place. Blacklisting 'tested_keys'
    also excludes the "KeyAttributeLookup" directory from updates"""
    valid_values = [
        # 'checkpoint',
        'description',
        'heartbeat',
        'predictions_holdout',
        'predictions_in_fold',
        'predictions_oof',
        'predictions_test',
        'script_backup',
        'tested_keys',
    ]

    if blacklist == 'ALL':
        G.warn('WARNING: Received `blacklist`="ALL". Nothing will be saved')
        return blacklist
    if not blacklist:
        return []
    if not isinstance(blacklist, list):
        raise TypeError('Expected blacklist to be a list, but received {}: {}'.format(type(blacklist), blacklist))

    non_string_entries = [(type(_).__name__, _) for _ in blacklist if not isinstance(_, str)]
    if non_string_entries:
        raise TypeError('Expected contents of blacklist to be strings, but received {}'.format(non_string_entries))

    for entry in blacklist:
        if entry not in valid_values:
            raise ValueError('Received invalid blacklist value: {}.\nExpected one of: [{}]'.format(entry, valid_values))
        if entry in ['description', 'tested_keys']:
            G.warn(f'Including {entry!r} in file_blacklist will severely impede the functionality of this library')

    return blacklist
def validate_parameters(self):
    """Ensure the provided parameters are valid and properly formatted"""
    #################### root_results_path ####################
    if self.root_results_path is None:
        G.warn('Received root_results_path=None. Results will not be stored at all.')
    elif isinstance(self.root_results_path, str):
        # Ensure the path ends with the assets dirname, and that the directory exists
        if not self.root_results_path.endswith(ASSETS_DIRNAME):
            self.root_results_path = os.path.join(self.root_results_path, ASSETS_DIRNAME)
            self.result_paths['root'] = self.root_results_path
        if not os.path.exists(self.root_results_path):
            os.makedirs(self.root_results_path, exist_ok=True)
    else:
        raise TypeError('root_results_path must be None or str, not {}: {}'.format(*type_val(self.root_results_path)))

    #################### verbose ####################
    if not isinstance(self.verbose, bool):
        raise TypeError('verbose must be a boolean. Received {}: {}'.format(*type_val(self.verbose)))

    #################### file_blacklist ####################
    self.file_blacklist = validate_file_blacklist(self.file_blacklist)

    #################### Train/Test Datasets ####################
    # Str values are treated as .csv file paths and read into DataFrames
    if isinstance(self.train_dataset, str):
        self.train_dataset = pd.read_csv(self.train_dataset)
    if isinstance(self.test_dataset, str):
        self.test_dataset = pd.read_csv(self.test_dataset)

    #################### metrics_params/metrics_map ####################
    if (self.metrics_map is not None) and ('metrics_map' in self.metrics_params.keys()):
        raise ValueError(
            '`metrics_map` may be provided as a kwarg, or as a key in `metrics_params`, but NOT BOTH. Received: ' +
            F'\n `metrics_map`={self.metrics_map}\n `metrics_params`={self.metrics_params}'
        )
    else:
        if self.metrics_map is None:
            # NOTE(review): raises KeyError if `metrics_params` lacks 'metrics_map' -
            # confirm callers guarantee that one of the two is always provided
            self.metrics_map = self.metrics_params['metrics_map']
        # Merge so `metrics_params` always carries the resolved `metrics_map`
        self.metrics_params = {**dict(metrics_map=self.metrics_map), **self.metrics_params}

    #################### cross_validation_type ####################
    # Str values are resolved to the matching sklearn cross-validation class
    if isinstance(self.cross_validation_type, str):
        try:
            self.cross_validation_type = sk_cv.__getattribute__(self.cross_validation_type)
        except AttributeError:
            raise AttributeError('`sklearn.model_selection._split` has no attribute "{}".'.format(self.cross_validation_type))

    #################### to_csv_params ####################
    # 'path_or_buf' is dropped - each recorder supplies its own target path
    self.to_csv_params = {_k: _v for _k, _v in self.to_csv_params.items() if _k != 'path_or_buf'}

    #################### cross_experiment_params ####################
    self.cross_experiment_params = dict(
        cross_validation_type=self.cross_validation_type,
        runs=self.runs,
        global_random_seed=self.global_random_seed,
        random_seeds=self.random_seeds,
        random_seed_bounds=self.random_seed_bounds,
    )

    #################### experiment_callbacks ####################
    if not isinstance(self.experiment_callbacks, list):
        self.experiment_callbacks = [self.experiment_callbacks]
    for callback in self.experiment_callbacks:
        if not isclass(callback):
            raise TypeError(F'experiment_callbacks must be classes. Received {type(callback)}: {callback}')
        if callback.__name__ != 'LambdaCallback':
            raise ValueError(F'experiment_callbacks must be LambdaCallback instances, not {callback.__name__}: {callback}')
def parameterize_compiled_keras_model(model):
    """Traverse a compiled Keras model to gather critical information about the layers used to
    construct its architecture, and the parameters used to compile it

    Parameters
    ----------
    model: Instance of :class:`keras.wrappers.scikit_learn.<KerasClassifier; KerasRegressor>`
        A compiled instance of a Keras model, made using the Keras `wrappers.scikit_learn`
        module. This must be a completely valid Keras model, which means that it often must
        be the result of
        :func:`library_helpers.keras_optimization_helper.initialize_dummy_model`. Using the
        resulting dummy model ensures the model will pass Keras checks that would otherwise
        reject instances of `space.Space` descendants used to provide hyperparameter choices

    Returns
    -------
    layers: List
        A list containing a dict for each layer found in the architecture of `model`. A
        layer dict should contain the following keys: ['class_name', '__hh_default_args',
        '__hh_default_kwargs', '__hh_used_args', '__hh_used_kwargs']
    compile_params: Dict
        The parameters used on the call to :meth:`model.compile`. If a value for a certain
        parameter was not explicitly provided, its default value will be included in
        `compile_params`"""
    # NOTE: Tested optimizer and loss with both callable and string inputs - Converted to callables automatically

    ##################################################
    # Model Compile Parameters
    ##################################################
    compile_params = dict()

    compile_params["optimizer"] = get_keras_attr(
        model, "optimizer").__class__.__name__.lower()
    compile_params["optimizer_params"] = get_keras_attr(
        model, "optimizer").get_config()
    compile_params["metrics"] = get_keras_attr(model, "metrics")
    compile_params["metrics_names"] = get_keras_attr(model, "metrics_names")
    compile_params["loss_functions"] = get_keras_attr(model, "loss_functions")
    compile_params["loss_function_names"] = [
        _.__name__ for _ in compile_params["loss_functions"]
    ]
    # FLAG: BELOW PARAMETERS SHOULD ONLY BE DISPLAYED IF EXPLICITLY GIVEN (probably have to be in key by default, though):
    compile_params["loss_weights"] = get_keras_attr(model, "loss_weights")
    compile_params["sample_weight_mode"] = get_keras_attr(
        model, "sample_weight_mode")
    compile_params["weighted_metrics"] = get_keras_attr(
        model, "weighted_metrics")
    compile_params["target_tensors"] = get_keras_attr(model, "target_tensors",
                                                      default=None)
    compile_params["compile_kwargs"] = get_keras_attr(model, "_function_kwargs")

    ##################################################
    # Model Architecture
    ##################################################
    # Collect, per layer, the class name plus the hyperparameter-tracking attributes
    # enumerated in `HH_ARG_ATTRS` (None when a layer lacks the attribute)
    layers = []
    for layer in get_keras_attr(model, "layers"):
        layer_obj = dict(class_name=layer.__class__.__name__)
        for hh_attr in HH_ARG_ATTRS:
            layer_obj[hh_attr] = getattr(layer, hh_attr, None)
        layers.append(layer_obj)

    ##################################################
    # Handle Custom Losses/Optimizers
    ##################################################
    # Anything outside the stock keras modules cannot be hashed/saved - warn that such
    # objects are identified by name alone
    if any([
            _.__module__ != "keras.losses"
            for _ in compile_params["loss_functions"]
    ]):
        G.warn(
            "Custom loss functions will not be hashed and saved, meaning they are identified only by their names."
            + "\nIf you plan on tuning loss functions at all, please ensure custom functions are not given the same names as any"
            + " of Keras's loss functions. Otherwise, naming conflicts may occur and make results very confusing."
        )
    if get_keras_attr(model, "optimizer").__module__ != "keras.optimizers":
        G.warn(
            "Custom optimizers will not be hashed and saved, meaning they are identified only by their names."
            + "\nIf you plan on tuning optimizers at all, please ensure custom optimizers are not given the same names as any"
            + " of Keras's optimizers. Otherwise, naming conflicts may occur and make results very confusing."
        )

    return layers, compile_params
def parameterize_compiled_keras_model(model):
    """Traverse a compiled Keras model to gather critical information about the layers used to
    construct its architecture, and the parameters used to compile it

    Parameters
    ----------
    model: Instance of :class:`keras.wrappers.scikit_learn.<KerasClassifier; KerasRegressor>`
        A compiled instance of a Keras model, made using the Keras `wrappers.scikit_learn` module

    Returns
    -------
    layers: List
        A list containing a dict for each layer found in the architecture of `model`. A layer dict
        should contain the following keys: ['class_name', '__hh_default_args',
        '__hh_default_kwargs', '__hh_used_args', '__hh_used_kwargs']
    compile_params: Dict
        The parameters used on the call to :meth:`model.compile`. If a value for a certain
        parameter was not explicitly provided, its default value will be included in
        `compile_params`"""
    # NOTE: Tested optimizer and loss with both callable and string inputs - Converted to callables automatically
    # TODO: MIGHT NEED TO CHECK KERAS VERSION...
    # TODO: If `model.model` attribute access breaks for older Keras versions, add a version check
    # TODO: ... here to fall back to reading the attributes directly off `model`

    # The scikit_learn wrapper stores the compiled Keras model on its `model` attribute; hoist the
    # lookup once instead of repeating `model.model` for every parameter below
    inner_model = model.model

    ##################################################
    # Model Compile Parameters
    ##################################################
    compile_params = dict()
    compile_params["optimizer"] = inner_model.optimizer.__class__.__name__.lower()
    compile_params["optimizer_params"] = inner_model.optimizer.get_config()
    compile_params["metrics"] = inner_model.metrics
    compile_params["metrics_names"] = inner_model.metrics_names
    compile_params["loss_functions"] = inner_model.loss_functions
    compile_params["loss_function_names"] = [_.__name__ for _ in compile_params["loss_functions"]]
    # FLAG: BELOW PARAMETERS SHOULD ONLY BE DISPLAYED IF EXPLICITLY GIVEN (probably have to be in key by default, though):
    compile_params["loss_weights"] = inner_model.loss_weights
    compile_params["sample_weight_mode"] = inner_model.sample_weight_mode
    compile_params["weighted_metrics"] = inner_model.weighted_metrics
    # `target_tensors` is absent on some Keras versions; a missing attribute simply means None
    compile_params["target_tensors"] = getattr(inner_model, "target_tensors", None)
    # noinspection PyProtectedMember
    compile_params["compile_kwargs"] = inner_model._function_kwargs

    ##################################################
    # Model Architecture
    ##################################################
    hh_attributes = [
        "__hh_default_args",
        "__hh_default_kwargs",
        "__hh_used_args",
        "__hh_used_kwargs",
    ]
    layers = []
    for layer in inner_model.layers:
        layer_obj = dict(class_name=layer.__class__.__name__)
        for hh_attr in hh_attributes:
            # Layers not built through the project's wrappers lack these attributes; record None
            layer_obj[hh_attr] = getattr(layer, hh_attr, None)
        layers.append(layer_obj)

    ##################################################
    # Handle Custom Losses/Optimizers
    ##################################################
    # Generator (not list) inside `any`, so the scan short-circuits on the first custom loss
    if any(_.__module__ != "keras.losses" for _ in compile_params["loss_functions"]):
        G.warn(
            "Custom loss functions will not be hashed and saved, meaning they are identified only by their names."
            + "\nIf you plan on tuning loss functions at all, please ensure custom functions are not given the same names as any"
            + " of Keras's loss functions. Otherwise, naming conflicts may occur and make results very confusing."
        )
    if inner_model.optimizer.__module__ != "keras.optimizers":
        G.warn(
            "Custom optimizers will not be hashed and saved, meaning they are identified only by their names."
            + "\nIf you plan on tuning optimizers at all, please ensure custom optimizers are not given the same names as any"
            + " of Keras's optimizers. Otherwise, naming conflicts may occur and make results very confusing."
        )

    return layers, compile_params
def validate_parameters(self):
    """Ensure the provided parameters are valid and properly formatted

    Normalizes several attributes in place: appends the assets dirname to
    :attr:`root_results_path` (creating the directory if needed), wraps a string
    :attr:`target_column` in a list, loads string dataset paths via pandas, reconciles
    :attr:`metrics_map`/:attr:`metrics_params`, resolves a string
    :attr:`cross_validation_type` to its `sklearn.model_selection` class, strips
    "path_or_buf" from :attr:`to_csv_params`, builds :attr:`cross_experiment_params`,
    and validates :attr:`experiment_callbacks`

    Raises
    ------
    TypeError
        If :attr:`root_results_path` is neither None nor str, or if any member of
        :attr:`experiment_callbacks` is not a class
    ValueError
        If `metrics_map` is given both as a kwarg and as a `metrics_params` key, or if
        an experiment callback class is not named "LambdaCallback"
    AttributeError
        If a string :attr:`cross_validation_type` does not name a class in
        `sklearn.model_selection._split`"""
    #################### root_results_path ####################
    if self.root_results_path is None:
        # No results directory at all -- recording is disabled entirely (see blacklist below)
        G.warn(
            "Received root_results_path=None. Results will not be stored at all."
        )
    elif isinstance(self.root_results_path, str):
        # Ensure the path ends with the standard assets directory name, and record the
        # (possibly updated) root in `result_paths`
        if not self.root_results_path.endswith(ASSETS_DIRNAME):
            self.root_results_path = os.path.join(self.root_results_path, ASSETS_DIRNAME)
            self.result_paths["root"] = self.root_results_path
        if not os.path.exists(self.root_results_path):
            make_dirs(self.root_results_path, exist_ok=True)
    else:
        raise TypeError(
            f"root_results_path must be None or str, not {self.root_results_path}"
        )
    #################### target_column ####################
    # Downstream code expects a list of target column names
    if isinstance(self.target_column, str):
        self.target_column = [self.target_column]
    #################### file_blacklist ####################
    self.file_blacklist = validate_file_blacklist(self.file_blacklist)
    if self.root_results_path is None:
        # With no results directory, blacklist every result file
        self.file_blacklist = "ALL"
    #################### Train/Test Datasets ####################
    # String datasets are treated as CSV filepaths and loaded eagerly
    if isinstance(self.train_dataset, str):
        self.train_dataset = pd.read_csv(self.train_dataset)
    if isinstance(self.test_dataset, str):
        self.test_dataset = pd.read_csv(self.test_dataset)
    #################### metrics_params/metrics_map ####################
    if (self.metrics_map is not None) and ("metrics_map" in self.metrics_params.keys()):
        # Providing the map via both routes is ambiguous, so it is rejected outright
        raise ValueError(
            "`metrics_map` may be provided as a kwarg, or as a `metrics_params` key, but NOT BOTH. Received: "
            + f"\n `metrics_map`={self.metrics_map}\n `metrics_params`={self.metrics_params}"
        )
    else:
        # NOTE(review): if `metrics_map` is None AND absent from `metrics_params`, this
        # raises KeyError -- presumably callers always supply one of the two; confirm upstream
        if self.metrics_map is None:
            self.metrics_map = self.metrics_params["metrics_map"]
        self.metrics_map = format_metrics_map(self.metrics_map)
        # Merge so `metrics_params` always carries the formatted map; explicit
        # `metrics_params` entries win over the kwarg-derived default
        self.metrics_params = {
            **dict(metrics_map=self.metrics_map),
            **self.metrics_params
        }
    #################### cross_validation_type ####################
    # Resolve a string name (e.g. "KFold") to the sklearn splitter class of that name
    if isinstance(self.cross_validation_type, str):
        try:
            self.cross_validation_type = sk_cv.__getattribute__(
                self.cross_validation_type)
        except AttributeError:
            raise AttributeError(
                f"'{self.cross_validation_type}' not in `sklearn.model_selection._split`"
            )
    #################### to_csv_params ####################
    # "path_or_buf" is supplied by the recorders themselves, so strip it if present
    self.to_csv_params = {
        k: v
        for k, v in self.to_csv_params.items() if k != "path_or_buf"
    }
    #################### cross_experiment_params ####################
    # Bundle the parameters that identify a cross-experiment setup
    self.cross_experiment_params = dict(
        cross_validation_type=self.cross_validation_type,
        runs=self.runs,
        global_random_seed=self.global_random_seed,
        random_seeds=self.random_seeds,
        random_seed_bounds=self.random_seed_bounds,
    )
    #################### experiment_callbacks ####################
    # Accept a single callback class or a list; only LambdaCallback classes are allowed
    if not isinstance(self.experiment_callbacks, list):
        self.experiment_callbacks = [self.experiment_callbacks]
    for cb in self.experiment_callbacks:
        if not isclass(cb):
            raise TypeError(
                f"experiment_callbacks must be classes, not {type(cb)}: {cb}"
            )
        if cb.__name__ != "LambdaCallback":
            raise ValueError(
                f"experiment_callbacks must be LambdaCallback instances, not {cb}"
            )
def handle_complex_types(self):
    """Replace complex-typed values in :attr:`parameters` with hashes so the Description can be
    saved, recording lookup entries that link each hash back to its original value"""
    # Maps each DataFrame hash to every parameter name that produced it, so duplicates
    # can be reported after the remap completes
    df_hash_to_names = {}

    def _enter(path, key, value):
        """Tell `remap` which attributes to descend into for the project's complex types"""
        if isinstance(value, Metric):
            return ({}, [(attr, getattr(value, attr)) for attr in ("name", "metric_function", "direction")])
        if isinstance(value, EngineerStep):
            return ({}, list(value.get_key_data().items()))
        if isinstance(value, FeatureEngineer):
            return ({}, list(value.get_key_data().items()))
        return default_enter(path, key, value)

    def _visit(path, key, value):
        """Return (`key`, `value`) unchanged for simple types. For complex types, return
        (`key`, <serialized or hashed form of `value`>), saving a complex-type lookup entry
        for hashed values when key-making is enabled

        Parameters
        ----------
        path: Tuple
            The path of keys that leads to `key`
        key: Str
            The parameter name
        value: *
            The value of the parameter `key`

        Returns
        -------
        Tuple of (`key`, value), in which value is either unchanged or a hash for the
        original `value`"""
        if isinstance(value, BaseKerasCallback):
            return (key, keras_callback_to_dict(value))
        if isinstance(value, BaseKerasInitializer):
            return (key, keras_initializer_to_dict(value))
        if isinstance(value, Sentinel):
            return (key, value.sentinel)
        if callable(value) or isinstance(value, pd.DataFrame):
            # FLAG: Look into adding package version number to hashed attributes
            value_hash = make_hash_sha256(value)
            if isinstance(value, pd.DataFrame):
                df_hash_to_names.setdefault(value_hash, []).append(key)
            if self.tested_keys_dir is not None:  # Key-making not blacklisted
                self.add_complex_type_lookup_entry(path, key, value, value_hash)
            return (key, value_hash)
        return (key, value)

    self.parameters = remap(self.parameters, visit=_visit, enter=_enter)

    #################### Check for Identical DataFrames ####################
    for df_names in df_hash_to_names.values():
        if len(df_names) > 1:
            G.warn(
                f"The dataframes: {df_names} are identical. Scores may be misleading!"
            )