Example #1
    def save_result(self):
        """Save the Experiment description as a .json file, named after :attr:`experiment_id`. If :attr:`do_full_save` is a
        callable and returns False when given the description object, the result recording loop will be broken, and the remaining
        result files will not be saved

        Returns
        -------
        'break'
            This string will be returned if :attr:`do_full_save` is a callable and returns False when given the description
            object. This is the signal for :class:`recorders.RecorderList` to stop recording result files"""
        try:
            write_json(F'{self.result_path}/{self.experiment_id}.json',
                       self.result,
                       do_clear=False)
        except FileNotFoundError:
            os.makedirs(self.result_path, exist_ok=False)
            write_json(F'{self.result_path}/{self.experiment_id}.json',
                       self.result,
                       do_clear=False)

        if (self.do_full_save
                is not None) and (not self.do_full_save(self.result)):
            G.warn(
                'Breaking out of result-saving loop early! Remaining result files will not be saved'
            )
            return 'break'
    def update_custom_environment_params(self):
        """Try to update null parameters from environment_params_path, or DEFAULT_PARAMS"""
        allowed_parameter_keys = [k for k, v in signature(Environment).parameters.items() if v.kind == v.KEYWORD_ONLY]
        user_defaults = {}

        if (not isinstance(self.environment_params_path, str)) and (self.environment_params_path is not None):
            raise TypeError('environment_params_path must be a str, not {}: {}'.format(*type_val(self.environment_params_path)))

        try:
            user_defaults = read_json(self.environment_params_path)
        except TypeError:
            if self.environment_params_path is not None:
                raise
        except FileNotFoundError:
            raise  # Path was provided, but the file does not exist

        if not isinstance(user_defaults, dict):
            raise TypeError('environment_params_path must contain a dict. Received {}: {}'.format(*type_val(user_defaults)))

        #################### Check user_defaults ####################
        for k, v in user_defaults.items():
            if k not in allowed_parameter_keys:
                G.warn('\n\t'.join([
                    'Invalid key ({}) in user-defined default Environment parameter file at "{}". If expected to do something,',
                    'it really won\'t, so it should be removed or fixed. The following are valid default keys: {}'
                ]).format(k, self.environment_params_path, allowed_parameter_keys))
            elif getattr(self, k) is None:
                setattr(self, k, v)
                G.debug('Environment kwarg "{}" was set to user default at "{}"'.format(k, self.environment_params_path))

        #################### Check Module Default Environment Arguments ####################
        for k in allowed_parameter_keys:
            if getattr(self, k) is None:
                setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
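A minimal sketch of a `do_full_save` callable that exercises the early-exit path in `save_result` above. The key looked up here is hypothetical; adapt it to the fields your result description actually contains:

def do_full_save(result_description):
    # Hypothetical field - only keep full result files for well-scoring experiments
    return result_description.get("final_score", 0) > 0.8

When this returns False, `save_result` returns 'break', signaling `recorders.RecorderList` to skip the remaining result files.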
Example #3
    def initialize_preprocessing_stage(self):
        """Ensures preprocessing_stage can be set according to class attributes or method input"""
        _stages, _err = ['pre_cv', 'intra_cv'], 'Unknown error occurred.'
        _i_strs = ['validation_input_data', 'validation_target_data']
        _i_sets = [getattr(self, _) for _ in _i_strs]

        if self.preprocessing_stage in _stages:
            return self.preprocessing_stage
        elif self.preprocessing_stage == 'infer':
            if all(_i_sets):
                return 'intra_cv'
            elif any(_i_sets):
                _err = "Inference failed. {} must all be set or all be None. Received types: {}".format(
                    _i_strs, [type(_) for _ in _i_sets])
            else:
                return 'pre_cv'
        else:
            _err = "preprocessing_stage must be in {}. Received type {}: {}".format(
                _stages, *type_val(self.preprocessing_stage))

        if self.fail_gracefully is True:
            G.warn(_err)
            return 'pre_cv'
        else:
            raise ValueError(_err)
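The inference rule above, spelled out:

# validation_input_data, validation_target_data  ->  inferred preprocessing_stage
# both set      ->  'intra_cv'
# both None     ->  'pre_cv'
# only one set  ->  error (or warning + 'pre_cv' when `fail_gracefully` is True)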
Example #4
    def __call__(self, wrapped, instance, args, kwargs):
        file = self.file
        file_params = {}

        #################### Locate Parameters File ####################
        if not file and self.key is not None:
            # `key` may be a kwarg name (str) or a positional index (int);
            # `suppress(TypeError)` swallows whichever lookup doesn't apply
            with suppress(TypeError):
                file = kwargs.get(self.key, None) or args[self.key]

        if file:  # If `file=None`, continue with empty dict of `file_params`
            file_params = read_json(file)

        if not isinstance(file_params, dict):
            raise TypeError("{} must contain a dict, not {}".format(
                file, file_params))

        #################### Check Valid Parameters for `wrapped` ####################
        ok_keys = [
            k for k, v in signature(wrapped).parameters.items()
            if v.kind == v.KEYWORD_ONLY
        ]

        for k, v in file_params.items():
            if k not in ok_keys:
                if self.verbose:
                    G.warn(
                        f"Invalid key ({k}) in user parameters file: {file}")
            elif k not in kwargs:
                kwargs[k] = v
                if self.verbose:
                    G.debug(
                        f"Parameter `{k}` set to user default in parameters file: '{file}'"
                    )

        return wrapped(*args, **kwargs)
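The `(wrapped, instance, args, kwargs)` signature above follows the protocol used by `wrapt`-style decorators. Below is a self-contained sketch of the same default-merging idea using only the standard library; all names (`with_json_defaults`, `defaults.json`, `train`) are illustrative, not the library's:

from functools import wraps
from inspect import signature
import json

def with_json_defaults(path):
    """Fill missing keyword-only arguments of the decorated function from a JSON file"""
    def decorator(wrapped):
        ok_keys = [k for k, v in signature(wrapped).parameters.items()
                   if v.kind == v.KEYWORD_ONLY]

        @wraps(wrapped)
        def wrapper(*args, **kwargs):
            with open(path) as f:
                file_params = json.load(f)
            for k, v in file_params.items():
                if k in ok_keys and k not in kwargs:
                    kwargs[k] = v  # explicit call-site kwargs win over file defaults
            return wrapped(*args, **kwargs)
        return wrapper
    return decorator

@with_json_defaults("defaults.json")
def train(*, epochs=None, lr=None):
    return epochs, lr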
Example #5
    def on_fold_end(self):
        # G.log('AggregatorEpochsElapsed.on_fold_end()')

        rep_key, fold_key = "rep_{}".format(self._rep), "fold_{}".format(self._fold)

        #################### Simple Average of Fold's Runs ####################
        try:
            self.stat_aggregates["epochs_elapsed"][fold_key]["simple_average"] = np.average(
                self.stat_aggregates["epochs_elapsed"][fold_key]["run_values"]
            )
        except KeyError:
            # self.stat_aggregates does not have 'epochs_elapsed' key - epochs never recorded in first place
            pass
        except TypeError:
            vals = self.stat_aggregates["epochs_elapsed"][fold_key]["run_values"]
            G.warn(
                "\n".join(
                    [
                        f"TypeError encountered when averaging stat_aggregates['epochs_elapsed'][fold_key]:",
                        "\tValues: {}".format(vals),
                        "\tTypes: {}".format([type(_) for _ in vals]),
                        "If the above values are numbers and you want them averaged, fix me ASAP! If not, ignore me",
                    ]
                )
            )

        super().on_fold_end()
    def handle_complex_types(self):
        """Locate complex types in :attr:`parameters`, create hashes for them, add lookup entries
        linking their original values to their hashes, then update their values in
        :attr:`parameters` to their hashes to facilitate Description saving"""
        if self.tested_keys_dir is None:  # Key-making blacklisted
            return

        dataframe_hashes = {}

        def visit(path, key, value):
            """Check whether a parameter is of a complex type. If not, return it unchanged.
            Otherwise, 1) create a hash for its value; 2) save a complex type lookup entry linking
            `key`, `value`, and the hash for `value`; and 3) return the hashed value with `key`,
            instead of the original complex-typed `value`

            Parameters
            ----------
            path: Tuple
                The path of keys that leads to `key`
            key: Str
                The parameter name
            value: *
                The value of the parameter `key`

            Returns
            -------
            Tuple of (`key`, value), in which value is either unchanged or a hash for the original
            `value`"""
            if isinstance(value, BaseKerasCallback):
                return (key, keras_callback_to_dict(value))
            if isinstance(value, Sentinel):
                return (key, value.sentinel)
            elif callable(value) or isinstance(value, pd.DataFrame):
                hashed_value = make_hash_sha256(value)

                if isinstance(value, pd.DataFrame):
                    dataframe_hashes.setdefault(hashed_value, []).append(key)

                try:
                    self.add_complex_type_lookup_entry(path, key, value,
                                                       hashed_value)
                except FileNotFoundError:
                    os.makedirs(self.key_attribute_lookup_dir, exist_ok=False)
                    self.add_complex_type_lookup_entry(path, key, value,
                                                       hashed_value)

                return (key, hashed_value)
            return (key, value)

        self.parameters = remap(self.parameters, visit=visit)

        #################### Check for Identical DataFrames ####################
        for df_hash, df_names in dataframe_hashes.items():
            if len(df_names) > 1:
                G.warn(
                    f"The dataframes: {df_names} have an identical hash: {df_hash!s}. This implies the dataframes are "
                    + "identical, which is probably unintentional. If left alone, scores may be misleading!"
                )
    def results_path(self, value):
        self._results_path = value
        if self._results_path is None:
            G.warn("Received results_path=None. Results will not be stored at all.")
        elif isinstance(self._results_path, str):
            if not self._results_path.endswith(ASSETS_DIRNAME):
                self._results_path = os.path.join(self._results_path, ASSETS_DIRNAME)
                # self.result_paths["root"] = self.results_path
            if not os.path.exists(self._results_path):
                make_dirs(self._results_path, exist_ok=True)
        else:
            raise TypeError(f"results_path must be None or str, not {value}")
Example #8
    def fit(self):
        """Train model according to :attr:`extra_params['fit']` (if appropriate) on training data"""
        try:
            self.model_history = self.model.fit(self.train_input, self.train_target)
        except Exception as _ex:
            G.warn(f"KerasModel.fit() failed with Exception: {_ex}\nAttempting standard fit method")
            super().fit()
        finally:
            #################### Record Epochs Elapsed if Model has 'epoch' Attribute ####################
            with suppress(AttributeError):
                # self.epochs_elapsed = len(self.model.epoch)
                self.epochs_elapsed = len(self.model_history.epoch)

            #################### Load Model Checkpoint if Possible ####################
            for callback in self.extra_params.get("callbacks", []):
                if callback.__class__.__name__ == "ModelCheckpoint":
                    self.model.model.load_weights(callback.filepath)
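The checkpoint-reload loop above only needs a `ModelCheckpoint` among `extra_params["callbacks"]`. A hedged illustration using the standard Keras callback (the shape of `extra_params` is inferred from the `.get("callbacks", [])` call above):

from keras.callbacks import ModelCheckpoint

extra_params = {
    "callbacks": [ModelCheckpoint("best_weights.h5", save_best_only=True)],
}
# After fitting, the loop above matches the callback by class name and restores
# the best epoch's weights via `self.model.model.load_weights("best_weights.h5")`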
Example #9
    def update_custom_environment_params(self):
        """Try to update null parameters from environment_params_path, or DEFAULT_PARAMS"""
        allowed_parameter_keys = [
            k for k, v in signature(Environment).parameters.items()
            if v.kind == v.KEYWORD_ONLY
        ]
        user_defaults = {}

        if (not isinstance(self.environment_params_path, str)) and (
                self.environment_params_path is not None):
            raise TypeError(
                f"Non-str `environment_params_path`: {self.environment_params_path}"
            )

        try:
            user_defaults = read_json(self.environment_params_path)
        except TypeError:
            if self.environment_params_path is not None:
                raise
            # If `environment_params_path=None`, no error raised - `user_defaults` continues as {}
        except FileNotFoundError:
            raise  # Path was provided, but the file does not exist

        if not isinstance(user_defaults, dict):
            raise TypeError(
                "environment_params_path must contain a dict, not {}".format(
                    user_defaults))

        #################### Check user_defaults ####################
        for k, v in user_defaults.items():
            if k not in allowed_parameter_keys:
                G.warn("\n\t".join([
                    "Invalid key ({}) in user-defined default Environment parameter file at '{}'. If expected to do something,",
                    "it really won't, so it should be removed or fixed. The following are valid default keys: {}",
                ]).format(k, self.environment_params_path,
                          allowed_parameter_keys))
            elif getattr(self, k) is None:
                setattr(self, k, v)
                G.debug(
                    f"Environment.`{k}` set to user default: '{self.environment_params_path}'"
                )

        #################### Check Module Default Environment Arguments ####################
        for k in allowed_parameter_keys:
            if getattr(self, k) is None:
                setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
Example #10
    def experiment_workflow(self):
        """Define the actual experiment process, including execution, result saving, and cleanup"""
        if self.hyperparameter_key.exists is True:
            _ex = F'{self!r} has already been run'
            if self.do_raise_repeated is True:
                self._clean_up()
                raise RepeatedExperimentError(_ex)
            G.warn(_ex)

        self._initialize_random_seeds()
        self._initial_preprocessing()
        self.execute()

        recorders = RecorderList(file_blacklist=G.Env.file_blacklist)
        recorders.format_result()
        G.log(F'Saving results for Experiment: "{self.experiment_id}"')
        recorders.save_result()
        self._clean_up()
    def experiment_workflow(self):
        """Define the actual experiment process, including execution, result saving, and cleanup"""
        if self.hyperparameter_key.exists is True:
            _ex = f"{self!r} has already been run"
            if self.do_raise_repeated is True:
                self._clean_up()
                raise RepeatedExperimentError(_ex)
            G.debug(_ex)
            G.warn("WARNING: Duplicate experiment!")

        self._initialize_random_seeds()
        self.execute()

        #################### Save Experiment Results ####################
        recorders = RecorderList(file_blacklist=G.Env.file_blacklist,
                                 extra_recorders=G.Env.experiment_recorders)
        recorders.format_result()
        G.log(f"Saving results for Experiment: '{self.experiment_id}'")
        recorders.save_result()
        self._clean_up()
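Both workflow variants only require that `RecorderList` expose `format_result` and `save_result`. A hedged sketch of the loop implied here and by the 'break' contract documented in Example #1 (the internals are assumptions):

class RecorderList:
    def __init__(self, file_blacklist=None, extra_recorders=None):
        self.recorders = []  # assumed: instances of all non-blacklisted recorders

    def format_result(self):
        for recorder in self.recorders:
            recorder.format_result()

    def save_result(self):
        for recorder in self.recorders:
            if recorder.save_result() == "break":
                break  # a `do_full_save` veto halts the remaining recorders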
Example #12
    def on_fold_end(self):
        # G.log('AggregatorEpochsElapsed.on_fold_end()')

        rep_key, fold_key = 'rep_{}'.format(self._rep), 'fold_{}'.format(self._fold)

        #################### Simple Average of Fold's Runs ####################
        try:
            self.stat_aggregates['epochs_elapsed'][fold_key]['simple_average'] = np.average(
                self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']
            )
        except KeyError:
            # self.stat_aggregates does not have 'epochs_elapsed' key - epochs never recorded in first place
            pass
        except TypeError:
            G.warn('\n'.join([
                'TypeError encountered when averaging stat_aggregates[{}][{}]:'.format('epochs_elapsed', fold_key),
                '\tValues: {}'.format(self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']),
                '\tTypes: {}'.format([type(_) for _ in self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']]),
                'If the above values are numbers and you want them averaged, fix me ASAP! If not, ignore me'
            ]))

        super().on_fold_end()
Example #13
    def update_custom_environment_params(self):
        """Try to update null parameters from environment_params_path, or DEFAULT_PARAMS"""
        allowed_parameter_keys = [
            k for k, v in signature(Environment).parameters.items()
            if v.kind == v.KEYWORD_ONLY
        ]
        user_defaults = {}

        try:
            user_defaults = read_json(self.environment_params_path)
        except (TypeError, OSError):
            # If `environment_params_path=None`, no error raised - `user_defaults` continues as {}
            if self.environment_params_path is not None:
                raise

        if not isinstance(user_defaults, dict):
            raise TypeError(
                "environment_params_path must contain a dict, not {}".format(
                    user_defaults))

        #################### Check user_defaults ####################
        for k, v in user_defaults.items():
            if k not in allowed_parameter_keys:
                G.warn(
                    f"Invalid key ({k}) in user Environment parameters: {self.environment_params_path}"
                )
            elif getattr(self, k) is None:
                setattr(self, k, v)
                G.debug(
                    f"Environment.`{k}` set to user default: '{self.environment_params_path}'"
                )

        #################### Check Module Default Environment Arguments ####################
        for k in allowed_parameter_keys:
            if getattr(self, k) is None:
                setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
Example #14
def validate_file_blacklist(blacklist):
    """Validate contents of blacklist. For most values, the corresponding file is saved upon
    completion of the experiment. See the "Notes" section below for details on some special cases

    Parameters
    ----------
    blacklist: List of strings, or None
        The result files that should not be saved

    Returns
    -------
    blacklist: List
        If not empty, acceptable list of result file types to blacklist

    Notes
    -----
    'heartbeat': If the heartbeat file is saved, a new file is not generated and saved to the
    "Experiments/Heartbeats" directory as is the case with most other files. Instead, the general
    "Heartbeat.log" file is copied and renamed to the current experiment id, then saved to the
    appropriate dir. This is because the general "Heartbeat.log" file represents the heartbeat
    for whatever experiment is currently in progress.

    'script_backup': This file is saved as quickly as possible after starting a new experiment,
    rather than waiting for the experiment to end. There are two reasons for this behavior: 1) to
    avoid saving any changes that may have been made to a file after it has been executed, and 2)
    to have the offending file in the event of a catastrophic failure that results in no other
    files being saved. As stated in the documentation of the `file_blacklist` parameter of
    `Environment`, if the path of the file that initializes an Experiment does not end with a ".py"
    extension, the Experiment proceeds as if "script_backup" had been added to `blacklist`. This
    means that backup files will not be created for Jupyter notebooks (or any other non-".py" files)

    'description' and 'tested_keys': These two results types constitute a bare minimum of sorts for
    experiment recording. If either of these two are blacklisted, then as far as the library is
    concerned, the experiment never took place.

    'tested_keys' (continued): If this string is included in the blacklist, then the contents of the
    "KeyAttributeLookup" directory will also be excluded from the list of files to update

    'current_heartbeat': The general heartbeat file that should be stored at
    'HyperparameterHunterAssets/Heartbeat.log'. If this value is blacklisted, then 'heartbeat' is
    also added to `blacklist` automatically out of necessity. This is done because the heartbeat
    file for the current experiment cannot be created as a copy of the general heartbeat file if the
    general heartbeat file is never created in the first place"""
    valid_values = [
        # 'checkpoint',
        "description",
        "heartbeat",
        "predictions_holdout",
        "predictions_in_fold",
        "predictions_oof",
        "predictions_test",
        "script_backup",
        "tested_keys",
        "current_heartbeat",
    ]
    if blacklist == "ALL":
        G.warn('WARNING: Received `blacklist`="ALL". Nothing will be saved')
        return blacklist

    if not blacklist:
        return []
    elif not isinstance(blacklist, list):
        raise TypeError("Expected blacklist to be a list, not: {}".format(blacklist))
    elif not all([isinstance(_, str) for _ in blacklist]):
        invalid_files = [(type(_).__name__, _) for _ in blacklist if not isinstance(_, str)]
        raise TypeError("Expected blacklist contents to be strings, not: {}".format(invalid_files))

    for a_file in blacklist:
        if a_file not in valid_values:
            raise ValueError(f"Invalid blacklist value: {a_file}.\nExpected one of: {valid_values}")
        if a_file in ["description", "tested_keys"]:
            G.warn(f"Including {a_file!r} in blacklist will severely impede library functionality")

    # Blacklist experiment-specific heartbeat if general (current) heartbeat is blacklisted
    if ("current_heartbeat" in blacklist) and ("heartbeat" not in blacklist):
        blacklist.append("heartbeat")

    return blacklist
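Behavior of `validate_file_blacklist` at a glance:

validate_file_blacklist(None)                   # -> []
validate_file_blacklist("ALL")                  # -> "ALL" (warns that nothing will be saved)
validate_file_blacklist(["current_heartbeat"])  # -> ["current_heartbeat", "heartbeat"]
validate_file_blacklist(["nonsense"])           # raises ValueError
validate_file_blacklist("heartbeat")            # raises TypeError (must be a list)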
Example #15
    def custom_pipeline_method_builder(self, functionality, name=None):
        """
        Parameters
        ----------
        - functionality: Callable
            Performs all desired transformations/alterations/work for this pipeline step. This callable will not receive any input
            arguments, so don't expect any. Instead, it is implemented as a class method, so it has access to all class attributes
            and methods. To work properly, the class attributes: ['self.train_input_data', 'self.train_target_data',
            'self.validation_input_data', 'self.validation_target_data', 'self.holdout_input_data', 'self.holdout_target_data',
            'self.test_input_data'] are expected to be directly modified. See the "Notes"/"Examples" sections below for more
        - name: String, or None, default=None
            Suffix for the name of the new custom method. See below "Notes" section for details on method name creation

        Returns
        -------
        name: str
            The name of the new method that was created

        Notes
        -----
        - WARNING: Because the custom functionality is implemented as a class method, it is capable of modifying values that are
            not expected to change, or setting new attributes. Doing either of these is a bad idea. The only attributes that
            should be set are those listed in the above "Parameters" description for the "functionality" argument. Additionally,
            the only values that should be retrieved are the aforementioned "data" attributes, plus "self.preprocessing_params"
        - METHOD ARGUMENTS: If the custom functionality requires some input argument that could be subject to change later (like
            a hyperparameter), it should be included in the "preprocessing_params" argument that is provided at the initialization
            of this class. Then in the custom functionality, it can be retrieved with "self.preprocessing_params[<your_arg>]". See
            the "Examples" section below for details on how to do this. The two primary reasons for this behavior are as follows:
            - 1) to get around having to make sense of methods' expected arguments and the arguments actually input to them, and
            - 2) to include any necessary arguments in the experiment's hyperparameters.

        Examples
        --------
        >>> from hyperparameter_hunter.feature_engineering import PreprocessingPipelineMixIn
        >>> def my_function(self):
        >>>     self.train_input_data = self.train_input_data.fillna(self.preprocessing_params['my_imputer'])
        Notice in "my_function", "self" is the only input, "self.train_input_data" is directly modified, and instead of passing
        "my_imputer" as an input, it is referenced in "self.preprocessing_params". Now, the class can use "my_function" below.
        >>> preprocessor = PreprocessingPipelineMixIn(
        >>>     pipeline=[('my_function', my_function)],
        >>>     preprocessing_params=dict(my_imputer=-1), features=[], target_column=''
        >>> )
        The "pipeline" is set to include "my_function", which, after its creation, will be able to retrieve "my_imputer" from
        "self.preprocessing_params". Note that this example just demonstrates custom method building. It won't work as-is, without
        any train_input_data, among other things.
        Now in a later experiment, null values can be imputed to -2 instead of -1, just by changing "preprocessing_params":
        >>> preprocessor = PreprocessingPipelineMixIn(
        >>>     pipeline=[('my_function', my_function)],
        >>>     preprocessing_params=dict(my_imputer=-2), features=[], target_column=''
        >>> )
        This makes it much easier to keep track of the actual hyperparameters being used in an experiment than having to scour
        obscure functions for some number that may or may not even be declared inside.
        """
        if not callable(functionality):
            raise TypeError(
                'Custom pipeline methods must be callable. Received type {}'.
                format(type(functionality)))

        # TODO: Set name (using "functionality.__name__") if name is None

        while hasattr(self, name):
            _name = name + ''  # TODO: Make changes to "name" here
            # TODO: Do something to further modify name and check again
            G.warn(
                'Encountered naming conflict in custom_pipeline_method_builder with "{}". Trying "{}"'
                .format(name, _name))
            name = _name

        #################### Create New Custom Method ####################
        setattr(self, name, functionality)

        return name
Example #16
    def handle_complex_types(self):
        """Locate complex types in :attr:`parameters`, create hashes for them, add lookup entries
        linking their original values to their hashes, then update their values in
        :attr:`parameters` to their hashes to facilitate Description saving"""
        dataframe_hashes = {}

        def enter(path, key, value):
            """Produce iterable of attributes to remap for instances of :class:`metrics.Metric`"""
            if isinstance(value, Metric):
                metric_attrs = ["name", "metric_function", "direction"]
                return ({}, [(_, getattr(value, _)) for _ in metric_attrs])
            return default_enter(path, key, value)

        def visit(path, key, value):
            """Check whether a parameter is of a complex type. If not, return it unchanged.
            Otherwise, 1) create a hash for its value; 2) save a complex type lookup entry linking
            `key`, `value`, and the hash for `value`; and 3) return the hashed value with `key`,
            instead of the original complex-typed `value`

            Parameters
            ----------
            path: Tuple
                The path of keys that leads to `key`
            key: Str
                The parameter name
            value: *
                The value of the parameter `key`

            Returns
            -------
            Tuple of (`key`, value), in which value is either unchanged or a hash for the original
            `value`"""
            if isinstance(value, BaseKerasCallback):
                return (key, keras_callback_to_dict(value))
            if isinstance(value, Sentinel):
                return (key, value.sentinel)
            elif callable(value) or isinstance(value, pd.DataFrame):
                # TODO: Check here if callable, and using a `Trace`d model/model_initializer
                # TODO: If so, pass extra kwargs to below `make_hash_sha256`, which are eventually given to `hash_callable`
                # TODO: Notably, `ignore_source_lines=True` should be included
                # FLAG: Also, look into adding package version number to hashed attributes
                hashed_value = make_hash_sha256(value)

                if isinstance(value, pd.DataFrame):
                    dataframe_hashes.setdefault(hashed_value, []).append(key)

                if self.tested_keys_dir is not None:  # Key-making not blacklisted
                    try:
                        self.add_complex_type_lookup_entry(path, key, value, hashed_value)
                    except (FileNotFoundError, OSError):
                        make_dirs(os.path.join(self.lookup_dir, *path), exist_ok=False)
                        self.add_complex_type_lookup_entry(path, key, value, hashed_value)

                return (key, hashed_value)
            return (key, value)

        self.parameters = remap(self.parameters, visit=visit, enter=enter)

        #################### Check for Identical DataFrames ####################
        for df_hash, df_names in dataframe_hashes.items():
            if len(df_names) > 1:
                G.warn(
                    f"The dataframes: {df_names} have an identical hash: {df_hash!s}. This implies the dataframes are "
                    + "identical, which is probably unintentional. If left alone, scores may be misleading!"
                )
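`remap` and `default_enter` come from `boltons.iterutils`. A minimal, self-contained illustration of the visit-and-hash pattern used above; hashing `repr` via `hashlib` stands in for the library's `make_hash_sha256`, which hashes callables by their source instead:

from boltons.iterutils import remap
import hashlib

params = {"model_init_params": {"build_fn": print, "epochs": 5}}

def visit(path, key, value):
    if callable(value):  # complex type: replace the value with a hash
        return (key, hashlib.sha256(repr(value).encode()).hexdigest())
    return (key, value)  # simple values pass through unchanged

hashed = remap(params, visit=visit)
# hashed["model_init_params"]["build_fn"] is now a hex digest; "epochs" is untouched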
Example #17
def validate_file_blacklist(blacklist):
    """Validate contents of blacklist. For most values, the corresponding file is saved upon completion of the experiment. See
    the "Notes" section below for details on some special cases

    Parameters
    ----------
    blacklist: List of strings, or None
        The result files that should not be saved

    Returns
    -------
    blacklist: List
        If not empty, acceptable list of result file types to blacklist

    Notes
    -----
    'heartbeat': If the heartbeat file is saved, a new file is not generated and saved to the "Experiments/Heartbeats" directory
    as is the case with most other files. Instead, the general "Heartbeat.log" file is copied and renamed to the current
    experiment id, then saved to the appropriate dir. This is because the general "Heartbeat.log" file represents the heartbeat
    for whatever experiment is currently in progress.

    'script_backup': This file is saved as quickly as possible after starting a new experiment, rather than waiting for the
    experiment to end. There are two reasons for this behavior: 1) to avoid saving any changes that may have been made to a file
    after it has been executed, and 2) to have the offending file in the event of a catastrophic failure that results in no other
    files being saved.

    'description' and 'tested_keys': These two results types constitute a bare minimum of sorts for experiment recording. If
    either of these two are blacklisted, then as far as the library is concerned, the experiment never took place.

    'tested_keys' (continued): If this string is included in the blacklist, then the contents of the "KeyAttributeLookup"
    directory will also be excluded from the list of files to update"""
    valid_values = [
        # 'checkpoint',
        'description',
        'heartbeat',
        'predictions_holdout',
        'predictions_in_fold',
        'predictions_oof',
        'predictions_test',
        'script_backup',
        'tested_keys',
    ]
    if blacklist == 'ALL':
        G.warn('WARNING: Received `blacklist`="ALL". Nothing will be saved')
        return blacklist

    if not blacklist:
        return []
    elif not isinstance(blacklist, list):
        raise TypeError('Expected blacklist to be a list, but received {}: {}'.format(type(blacklist), blacklist))
    elif not all([isinstance(_, str) for _ in blacklist]):
        invalid_files = [(type(_).__name__, _) for _ in blacklist if not isinstance(_, str)]
        raise TypeError('Expected contents of blacklist to be strings, but received {}'.format(invalid_files))

    for a_file in blacklist:
        if a_file not in valid_values:
            raise ValueError('Received invalid blacklist value: {}.\nExpected one of: {}'.format(a_file, valid_values))
        if a_file in ['description', 'tested_keys']:
            G.warn(F'Including {a_file!r} in file_blacklist will severely impede the functionality of this library')

    return blacklist
Example #18
    def validate_parameters(self):
        """Ensure the provided parameters are valid and properly formatted"""
        #################### root_results_path ####################
        if self.root_results_path is None:
            G.warn('Received root_results_path=None. Results will not be stored at all.')
        elif isinstance(self.root_results_path, str):
            if not self.root_results_path.endswith(ASSETS_DIRNAME):
                self.root_results_path = os.path.join(self.root_results_path, ASSETS_DIRNAME)
                self.result_paths['root'] = self.root_results_path
            if not os.path.exists(self.root_results_path):
                os.makedirs(self.root_results_path, exist_ok=True)
        else:
            raise TypeError('root_results_path must be None or str, not {}: {}'.format(*type_val(self.root_results_path)))

        #################### verbose ####################
        if not isinstance(self.verbose, bool):
            raise TypeError('verbose must be a boolean. Received {}: {}'.format(*type_val(self.verbose)))

        #################### file_blacklist ####################
        self.file_blacklist = validate_file_blacklist(self.file_blacklist)

        #################### Train/Test Datasets ####################
        if isinstance(self.train_dataset, str):
            self.train_dataset = pd.read_csv(self.train_dataset)
        if isinstance(self.test_dataset, str):
            self.test_dataset = pd.read_csv(self.test_dataset)

        #################### metrics_params/metrics_map ####################
        if (self.metrics_map is not None) and ('metrics_map' in self.metrics_params.keys()):
            raise ValueError(
                '`metrics_map` may be provided as a kwarg, or as a key in `metrics_params`, but NOT BOTH. Received: ' +
                F'\n `metrics_map`={self.metrics_map}\n `metrics_params`={self.metrics_params}'
            )
        else:
            if self.metrics_map is None:
                self.metrics_map = self.metrics_params['metrics_map']
            self.metrics_params = {**dict(metrics_map=self.metrics_map), **self.metrics_params}

        #################### cross_validation_type ####################
        if isinstance(self.cross_validation_type, str):
            try:
                self.cross_validation_type = sk_cv.__getattribute__(self.cross_validation_type)
            except AttributeError:
                raise AttributeError('`sklearn.model_selection._split` has no attribute "{}".'.format(self.cross_validation_type))

        #################### to_csv_params ####################
        self.to_csv_params = {_k: _v for _k, _v in self.to_csv_params.items() if _k != 'path_or_buf'}

        #################### cross_experiment_params ####################
        self.cross_experiment_params = dict(
            cross_validation_type=self.cross_validation_type,
            runs=self.runs,
            global_random_seed=self.global_random_seed,
            random_seeds=self.random_seeds,
            random_seed_bounds=self.random_seed_bounds,
        )

        #################### experiment_callbacks ####################
        if not isinstance(self.experiment_callbacks, list):
            self.experiment_callbacks = [self.experiment_callbacks]
        for callback in self.experiment_callbacks:
            if not isclass(callback):
                raise TypeError(F'experiment_callbacks must be classes. Received {type(callback)}: {callback}')
            if callback.__name__ != 'LambdaCallback':
                raise ValueError(F'experiment_callbacks must be LambdaCallback instances, not {callback.__name__}: {callback}')
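The `metrics_map`/`metrics_params` rule above, schematically (other required `Environment` arguments elided):

Environment(train_dataset=df, metrics_map=["roc_auc_score"])                       # OK: kwarg only
Environment(train_dataset=df, metrics_params=dict(metrics_map=["roc_auc_score"]))  # OK: nested only
Environment(train_dataset=df, metrics_map=["roc_auc_score"],
            metrics_params=dict(metrics_map=["roc_auc_score"]))                    # ValueError: NOT BOTH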
示例#19
0
def parameterize_compiled_keras_model(model):
    """Traverse a compiled Keras model to gather critical information about the layers used to
    construct its architecture, and the parameters used to compile it

    Parameters
    ----------
    model: Instance of :class:`keras.wrappers.scikit_learn.<KerasClassifier; KerasRegressor>`
        A compiled instance of a Keras model, made using the Keras `wrappers.scikit_learn` module.
        This must be a completely valid Keras model, which means that it often must be the result
        of :func:`library_helpers.keras_optimization_helper.initialize_dummy_model`. Using the
        resulting dummy model ensures the model will pass Keras checks that would otherwise reject
        instances of `space.Space` descendants used to provide hyperparameter choices

    Returns
    -------
    layers: List
        A list containing a dict for each layer found in the architecture of `model`. A layer dict
        should contain the following keys: ['class_name', '__hh_default_args',
        '__hh_default_kwargs', '__hh_used_args', '__hh_used_kwargs']
    compile_params: Dict
        The parameters used on the call to :meth:`model.compile`. If a value for a certain parameter
        was not explicitly provided, its default value will be included in `compile_params`"""
    # NOTE: Tested optimizer and loss with both callable and string inputs - Converted to callables automatically
    ##################################################
    # Model Compile Parameters
    ##################################################
    compile_params = dict()

    compile_params["optimizer"] = get_keras_attr(
        model, "optimizer").__class__.__name__.lower()
    compile_params["optimizer_params"] = get_keras_attr(
        model, "optimizer").get_config()

    compile_params["metrics"] = get_keras_attr(model, "metrics")
    compile_params["metrics_names"] = get_keras_attr(model, "metrics_names")

    compile_params["loss_functions"] = get_keras_attr(model, "loss_functions")
    compile_params["loss_function_names"] = [
        _.__name__ for _ in compile_params["loss_functions"]
    ]

    # FLAG: BELOW PARAMETERS SHOULD ONLY BE DISPLAYED IF EXPLICITLY GIVEN (probably have to be in key by default, though):
    compile_params["loss_weights"] = get_keras_attr(model, "loss_weights")
    compile_params["sample_weight_mode"] = get_keras_attr(
        model, "sample_weight_mode")
    compile_params["weighted_metrics"] = get_keras_attr(
        model, "weighted_metrics")

    compile_params["target_tensors"] = get_keras_attr(model,
                                                      "target_tensors",
                                                      default=None)
    compile_params["compile_kwargs"] = get_keras_attr(model,
                                                      "_function_kwargs")

    ##################################################
    # Model Architecture
    ##################################################
    layers = []

    for layer in get_keras_attr(model, "layers"):
        layer_obj = dict(class_name=layer.__class__.__name__)

        for hh_attr in HH_ARG_ATTRS:
            layer_obj[hh_attr] = getattr(layer, hh_attr, None)

        layers.append(layer_obj)

    ##################################################
    # Handle Custom Losses/Optimizers
    ##################################################
    if any([
            _.__module__ != "keras.losses"
            for _ in compile_params["loss_functions"]
    ]):
        G.warn(
            "Custom loss functions will not be hashed and saved, meaning they are identified only by their names."
            +
            "\nIf you plan on tuning loss functions at all, please ensure custom functions are not given the same names as any"
            +
            " of Keras's loss functions. Otherwise, naming conflicts may occur and make results very confusing."
        )
    if get_keras_attr(model, "optimizer").__module__ != "keras.optimizers":
        G.warn(
            "Custom optimizers will not be hashed and saved, meaning they are identified only by their names."
            +
            "\nIf you plan on tuning optimizers at all, please ensure custom optimizers are not given the same names as any"
            +
            " of Keras's optimizers. Otherwise, naming conflicts may occur and make results very confusing."
        )

    return layers, compile_params
def parameterize_compiled_keras_model(model):
    """Traverse a compiled Keras model to gather critical information about the layers used to
    construct its architecture, and the parameters used to compile it

    Parameters
    ----------
    model: Instance of :class:`keras.wrappers.scikit_learn.<KerasClassifier; KerasRegressor>`
        A compiled instance of a Keras model, made using the Keras `wrappers.scikit_learn` module

    Returns
    -------
    layers: List
        A list containing a dict for each layer found in the architecture of `model`. A layer dict
        should contain the following keys: ['class_name', '__hh_default_args',
        '__hh_default_kwargs', '__hh_used_args', '__hh_used_kwargs']
    compile_params: Dict
        The parameters used on the call to :meth:`model.compile`. If a value for a certain parameter
        was not explicitly provided, its default value will be included in `compile_params`"""
    # NOTE: Tested optimizer and loss with both callable and string inputs - Converted to callables automatically

    # TODO: MIGHT NEED TO CHECK KERAS VERSION...
    # TODO: If the "TEST" lines below don't work for older Keras versions, add check here to set `model = model.model`...
    # TODO: ... For newer Keras versions, but leave it alone for older versions

    ##################################################
    # Model Compile Parameters
    ##################################################
    compile_params = dict()
    # compile_params['optimizer'] = model.optimizer.__class__.__name__  # -> 'Adam'  # FLAG: ORIGINAL
    compile_params["optimizer"] = model.model.optimizer.__class__.__name__.lower()  # FLAG: TEST
    # compile_params['optimizer_params'] = model.optimizer.get_config()  # -> {**kwargs}  # FLAG: ORIGINAL
    compile_params["optimizer_params"] = model.model.optimizer.get_config()  # FLAG: TEST

    # compile_params['metrics'] = model.metrics  # -> ['accuracy']  # FLAG: ORIGINAL
    compile_params["metrics"] = model.model.metrics  # FLAG: TEST
    # compile_params['metrics_names'] = model.metrics_names  # -> ['loss', 'acc']  # FLAG: ORIGINAL
    compile_params["metrics_names"] = model.model.metrics_names  # FLAG: TEST

    compile_params["loss_functions"] = model.model.loss_functions
    compile_params["loss_function_names"] = [_.__name__ for _ in compile_params["loss_functions"]]

    # FLAG: BELOW PARAMETERS SHOULD ONLY BE DISPLAYED IF EXPLICITLY GIVEN (probably have to be in key by default, though):
    # compile_params['loss_weights'] = model.loss_weights  # -> None, [], or {}  # FLAG: ORIGINAL
    compile_params["loss_weights"] = model.model.loss_weights  # FLAG: TEST
    # compile_params['sample_weight_mode'] = model.sample_weight_mode  # -> None, or ''  # FLAG: ORIGINAL
    compile_params["sample_weight_mode"] = model.model.sample_weight_mode  # FLAG: TEST
    # compile_params['weighted_metrics'] = model.weighted_metrics  # -> None, or []  # FLAG: ORIGINAL
    compile_params["weighted_metrics"] = model.model.weighted_metrics  # FLAG: TEST

    try:
        # compile_params['target_tensors'] = model.target_tensors  # FLAG: ORIGINAL
        compile_params["target_tensors"] = model.model.target_tensors  # FLAG: TEST
    except AttributeError:
        compile_params["target_tensors"] = None

    # noinspection PyProtectedMember
    compile_params["compile_kwargs"] = model.model._function_kwargs  # -> {}

    ##################################################
    # Model Architecture
    ##################################################
    hh_attributes = [
        "__hh_default_args",
        "__hh_default_kwargs",
        "__hh_used_args",
        "__hh_used_kwargs",
    ]
    layers = []

    # for layer in model.layers:  # FLAG: ORIGINAL
    for layer in model.model.layers:  # FLAG: TEST
        layer_obj = dict(class_name=layer.__class__.__name__)

        for hh_attr in hh_attributes:
            layer_obj[hh_attr] = getattr(layer, hh_attr, None)

        layers.append(layer_obj)

    ##################################################
    # Handle Custom Losses/Optimizers
    ##################################################
    if any([_.__module__ != "keras.losses" for _ in compile_params["loss_functions"]]):
        G.warn(
            "Custom loss functions will not be hashed and saved, meaning they are identified only by their names."
            + "\nIf you plan on tuning loss functions at all, please ensure custom functions are not given the same names as any"
            + " of Keras's loss functions. Otherwise, naming conflicts may occur and make results very confusing."
        )
    # if model.optimizer.__module__ != 'keras.optimizers':  # FLAG: ORIGINAL
    if model.model.optimizer.__module__ != "keras.optimizers":  # FLAG: TEST
        G.warn(
            "Custom optimizers will not be hashed and saved, meaning they are identified only by their names."
            + "\nIf you plan on tuning optimizers at all, please ensure custom optimizers are not given the same names as any"
            + " of Keras's optimizers. Otherwise, naming conflicts may occur and make results very confusing."
        )

    return layers, compile_params
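In the first variant, `get_keras_attr` hides the `model` vs `model.model` indirection that the ORIGINAL/TEST pairs in this second variant make explicit. A hedged sketch of what such a helper could look like (the real `hyperparameter_hunter` helper may differ; `max_depth` is an assumption):

def get_keras_attr(model, attr, max_depth=3, default=NotImplemented):
    """Look up `attr` on `model`, descending through nested `.model` wrappers"""
    for _ in range(max_depth):
        try:
            return getattr(model, attr)
        except AttributeError:
            model = getattr(model, "model", None)
            if model is None:
                break
    if default is not NotImplemented:
        return default
    raise AttributeError(attr)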
Example #21
    def validate_parameters(self):
        """Ensure the provided parameters are valid and properly formatted"""
        #################### root_results_path ####################
        if self.root_results_path is None:
            G.warn(
                "Received root_results_path=None. Results will not be stored at all."
            )
        elif isinstance(self.root_results_path, str):
            if not self.root_results_path.endswith(ASSETS_DIRNAME):
                self.root_results_path = os.path.join(self.root_results_path,
                                                      ASSETS_DIRNAME)
                self.result_paths["root"] = self.root_results_path
            if not os.path.exists(self.root_results_path):
                make_dirs(self.root_results_path, exist_ok=True)
        else:
            raise TypeError(
                f"root_results_path must be None or str, not {self.root_results_path}"
            )

        #################### target_column ####################
        if isinstance(self.target_column, str):
            self.target_column = [self.target_column]

        #################### file_blacklist ####################
        self.file_blacklist = validate_file_blacklist(self.file_blacklist)

        if self.root_results_path is None:
            self.file_blacklist = "ALL"

        #################### Train/Test Datasets ####################
        if isinstance(self.train_dataset, str):
            self.train_dataset = pd.read_csv(self.train_dataset)
        if isinstance(self.test_dataset, str):
            self.test_dataset = pd.read_csv(self.test_dataset)

        #################### metrics_params/metrics_map ####################
        if (self.metrics_map is not None) and ("metrics_map"
                                               in self.metrics_params.keys()):
            raise ValueError(
                "`metrics_map` may be provided as a kwarg, or as a `metrics_params` key, but NOT BOTH. Received: "
                +
                f"\n `metrics_map`={self.metrics_map}\n `metrics_params`={self.metrics_params}"
            )
        else:
            if self.metrics_map is None:
                self.metrics_map = self.metrics_params["metrics_map"]
            self.metrics_map = format_metrics_map(self.metrics_map)
            self.metrics_params = {
                **dict(metrics_map=self.metrics_map),
                **self.metrics_params
            }

        #################### cross_validation_type ####################
        if isinstance(self.cross_validation_type, str):
            try:
                self.cross_validation_type = sk_cv.__getattribute__(
                    self.cross_validation_type)
            except AttributeError:
                raise AttributeError(
                    f"'{self.cross_validation_type}' not in `sklearn.model_selection._split`"
                )

        #################### to_csv_params ####################
        self.to_csv_params = {
            k: v
            for k, v in self.to_csv_params.items() if k != "path_or_buf"
        }

        #################### cross_experiment_params ####################
        self.cross_experiment_params = dict(
            cross_validation_type=self.cross_validation_type,
            runs=self.runs,
            global_random_seed=self.global_random_seed,
            random_seeds=self.random_seeds,
            random_seed_bounds=self.random_seed_bounds,
        )

        #################### experiment_callbacks ####################
        if not isinstance(self.experiment_callbacks, list):
            self.experiment_callbacks = [self.experiment_callbacks]
        for cb in self.experiment_callbacks:
            if not isclass(cb):
                raise TypeError(
                    f"experiment_callbacks must be classes, not {type(cb)}: {cb}"
                )
            if cb.__name__ != "LambdaCallback":
                raise ValueError(
                    f"experiment_callbacks must be LambdaCallback instances, not {cb}"
                )
    def handle_complex_types(self):
        """Locate complex types in :attr:`parameters`, create hashes for them, add lookup entries
        linking their original values to their hashes, then update their values in
        :attr:`parameters` to their hashes to facilitate Description saving"""
        dataframe_hashes = {}

        def enter(path, key, value):
            """Produce iterable of attributes to remap for instances of :class:`metrics.Metric`"""
            if isinstance(value, Metric):
                metric_attrs = ["name", "metric_function", "direction"]
                return ({}, [(_, getattr(value, _)) for _ in metric_attrs])

            if isinstance(value, EngineerStep):
                return ({}, list(value.get_key_data().items()))
            if isinstance(value, FeatureEngineer):
                return ({}, list(value.get_key_data().items()))

            return default_enter(path, key, value)

        def visit(path, key, value):
            """Check whether a parameter is of a complex type. If not, return it unchanged.
            Otherwise, 1) create a hash for its value; 2) save a complex type lookup entry linking
            `key`, `value`, and the hash for `value`; and 3) return the hashed value with `key`,
            instead of the original complex-typed `value`

            Parameters
            ----------
            path: Tuple
                The path of keys that leads to `key`
            key: Str
                The parameter name
            value: *
                The value of the parameter `key`

            Returns
            -------
            Tuple of (`key`, value), in which value is either unchanged or a hash for the original
            `value`"""
            if isinstance(value, BaseKerasCallback):
                return (key, keras_callback_to_dict(value))
            if isinstance(value, BaseKerasInitializer):
                return (key, keras_initializer_to_dict(value))
            if isinstance(value, Sentinel):
                return (key, value.sentinel)
            elif callable(value) or isinstance(value, pd.DataFrame):
                # FLAG: Look into adding package version number to hashed attributes
                hashed_value = make_hash_sha256(value)

                if isinstance(value, pd.DataFrame):
                    dataframe_hashes.setdefault(hashed_value, []).append(key)

                if self.tested_keys_dir is not None:  # Key-making not blacklisted
                    self.add_complex_type_lookup_entry(path, key, value,
                                                       hashed_value)
                return (key, hashed_value)
            return (key, value)

        self.parameters = remap(self.parameters, visit=visit, enter=enter)

        #################### Check for Identical DataFrames ####################
        for df_hash, df_names in dataframe_hashes.items():
            if len(df_names) > 1:
                G.warn(
                    f"The dataframes: {df_names} are identical. Scores may be misleading!"
                )