Example #1
    def on_training_start(self, num_epochs, num_batches_per_epoch, start_epoch,
                          start_batch, start_update_iter):

        if MetricsLoggingMode.will_log_window_average_metrics(
                self._logging_mode):
            if self._batch_averaging_window is None:
                try:
                    self._batch_averaging_window = len(self._dataset)
                    if self._batch_averaging_window == math.inf:
                        self._log.error(
                            f"The batch average window of the {self._dataset_name} data set is infinite, "
                            f"unable to calculate window average. "
                            f"Please set a finite batch_processing_window during construction of "
                            f"this TestMetricsLogger.")
                        self._valid = False
                    else:
                        self._log.debug(
                            f"Set batch averaging window to number of batches in "
                            f"{self._dataset_name} data set : {self._batch_averaging_window}"
                        )
                except Exception as e:
                    _.log_exception(
                        self._log,
                        f"Unable to assess data set length to set the batch averaging window, "
                        f"{self} will not function", e)
                    self._valid = False

        return super().on_training_start(num_epochs, num_batches_per_epoch,
                                         start_epoch, start_batch,
                                         start_update_iter)
Example #2
File: training.py Project: nuhame/mlpug
    def _set_metric_window_state(self, metric_path, window_state):
        success = True
        try:
            window_length = window_state['length']

            if self.num_batches_per_epoch and (window_length !=
                                               self.num_batches_per_epoch):
                # override when different number of batches per epoch is given (or calculated)
                # during construction
                window = self.sliding_window_factory(
                    length=self.num_batches_per_epoch,
                    init_window_values=window_state['window'],
                    name=metric_path)
            else:
                window = self.sliding_window_factory(state=window_state)

            self._metric_windows[metric_path] = window
        except Exception as e:
            _.log_exception(
                self._log,
                f"Unable to set metric window state for {metric_path}, skipped ...",
                e)
            success = False

        return success
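Examples #2, #18 and #19 below all assume a sliding-window object that exposes a `length`, a `window` list, a `slide(value)` method and a state dict with 'length' and 'window' keys. A minimal sketch of such an object follows; the class name and details are assumptions for illustration, not mlpug's actual implementation.

# Minimal sliding-window sketch; names and state layout are assumed,
# not taken from mlpug.
class SlidingWindow:

    def __init__(self, length=None, init_window_values=None, name=None, state=None):
        if state is not None:
            # Restore from a previously saved state dict
            length = state['length']
            init_window_values = state['window']

        self.name = name
        self.length = length
        # Keep at most the last `length` initial values
        self.window = list(init_window_values or [])[-length:]

    def slide(self, value):
        # Append the newest value, drop the oldest when the window is full
        self.window.append(value)
        if len(self.window) > self.length:
            self.window.pop(0)

    def get_state(self):
        return {'length': self.length, 'window': list(self.window)}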
Example #3
File: training.py Project: nuhame/mlpug
    def set_learning_rate_for(self, optimizer_name, lr):
        """

        Set learning rate for specific optimizer `optimizer_name` to `lr`

        :param optimizer_name:
        :param lr:

        :return: True on success, else False
        """
        optimizer = self.get_optimizer(optimizer_name)
        if not hasattr(optimizer, 'param_groups'):
            self._log.error(
                f"No valid optimizer available with name {optimizer_name}, unable to set learning rate"
            )
            return False

        try:
            for group in optimizer.param_groups:
                group['lr'] = lr
        except Exception as e:
            _.log_exception(
                self._log,
                f"Unable to set learning rate for optimizer {optimizer_name}",
                e)
            return False

        self._log.debug(
            f"Learning rate of optimizer {optimizer_name} set to : {lr}")

        return True
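A usage sketch; the `trainer` instance and the optimizer name 'encoder' are hypothetical.

# Hypothetical usage: lower the learning rate of a single optimizer.
if not trainer.set_learning_rate_for('encoder', 1e-4):
    # The method logs the failure itself; react to the boolean result here
    raise RuntimeError("Could not update the learning rate")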
Example #4
    def _create_training_checkpoint(self):
        checkpoint_fname = self.training_checkpoint_file_name()

        if self._backup_before_override:
            try:
                if not self._backup_checkpoint(checkpoint_fname):
                    self._log.error("Unable to backup last training checkpoint, "
                                    "will not override override training checkpoint with new one")
                    return None
            except Exception as e:
                _.log_exception(self._log, f"A problem occurred backing up the last training checkpoint, "
                                           f"will not override override training checkpoint with new one", e)
                return None

        try:
            state, success = self._gather_training_checkpoint_data()
            if state is not None:
                if not success:
                    self._log.warn("Gathering the training checkpoint data was not completely successful, "
                                   "will save available checkpoint data anyway ...")

                self._log.debug(f"Saving training checkpoint : {checkpoint_fname}")
                self._save_training_checkpoint(checkpoint_fname, state)
            else:
                return None
        except Exception as e:
            _.log_exception(self._log, f"Unable to save training checkpoint", e)
            return None

        return checkpoint_fname
Example #5
File: training.py Project: nuhame/mlpug
    def set_optimizers_state(self, state):
        """

        :param state:
        :return: success (True, False)
        """
        if not _.is_callable(getattr(state, 'items', None)):
            self._log.error("State is invalid, unable to set optimizers state")
            return False

        success = True
        for name, optimizer_state in state.items():
            optimizer = self.get_optimizer(name)
            if optimizer is None:
                self._log.error(
                    f"Optimizer {name} not found, unable to set state")
                success = False
                continue

            try:
                self._set_optimizer_state(optimizer, optimizer_state, name)
            except Exception as e:
                _.log_exception(self._log,
                                f"Unable to set state for optimizer {name}", e)
                success = False

        return success
Example #6
    def _gather_model_checkpoint_data(self):
        """

        :return: state, success
        """
        state, success = self.trainer.get_model_components_state()

        if state is not None:
            if not success:
                self._log.warn("Getting the model components state was not completely successful, "
                               "continuing anyway ...")

            if self._model_hyper_parameters is not None:
                state['hyper_parameters'] = self._model_hyper_parameters

            try:
                manager_state, manager_state_success = self.training_manager.get_state_for_model_checkpoint()
                state['manager_state'] = manager_state

                if not manager_state_success:
                    self._log.warn("Getting the manager state for the model checkpoint was not successful, "
                                   "will continue anyway ...")
                    success = False

            except Exception as e:
                _.log_exception(self._log, "Unable to add manager state to model checkpoint, "
                                           "continuing anyway ...", e)
                success = False
        else:
            # In any case when the state is None, gathering the model checkpoint data is not successful
            success = False

        return state, success
Example #7
    def _save_checkpoint_state(self):
        try:
            fname = self.checkpoint_state_file_name()

            # only backup if exists
            if os.path.isfile(fname):
                backup_success = self._copy(fname, "%s.backup" % fname)
                if not backup_success:
                    self._log.error(
                        "Backing up checkpoint state unsuccessful, "
                        "will not override checkpoint state file with new state."
                    )
                    return

            if self._simulation_mode or self._debug_mode:
                self._log.debug("Saving checkpoint state to [%s]" % fname)

            if not self._simulation_mode:
                with open(fname, 'wb') as f:
                    pickle.dump(
                        {
                            "model_quality": self._model_quality,
                            "model_iter": self._model_iter,
                            "best_model": self._best_model,
                            "best_model_quality": self._best_model_quality,
                            "earliest_good_model": self._earliest_good_model,
                            "earliest_good_model_iter":
                            self._earliest_good_model_iter,
                            "iter": self._iter
                        }, f)
        except Exception as e:
            _.log_exception(self._log, "Unable to save checkpoint state", e)
Example #8
    def _create_model_checkpoint(self, model_fn=None, file_to_copy=None):
        if model_fn is None:
            model_fn = self.latest_model_file_name()

        if file_to_copy:
            if not self._copy(file_to_copy, model_fn):
                self._log.error(f"Unable to create model checkpoint based on file : {file_to_copy}")
                return None
        else:
            try:
                state, success = self._gather_model_checkpoint_data()
                if state is not None:
                    if not success:
                        self._log.warn("Gathering the model checkpoint data was not completely successful, "
                                       "will save available checkpoint data anyway ...")

                    self._log.debug(f"Saving model checkpoint : {model_fn}")
                    self._save_model_checkpoint(model_fn, state)
                else:
                    return None
            except Exception as e:
                _.log_exception(self._log, f"A problem occurred saving the latest model as checkpoint", e)
                return None

        return model_fn
Example #9
    def _save_current_model_as_temp(self, checkpoint_fname):
        try:
            fd, fname = tempfile.mkstemp(dir=self._temp_model_path)
            # mkstemp returns an open OS-level file descriptor; close it,
            # only the generated path is needed here
            os.close(fd)

            if checkpoint_fname is None:
                if self._simulation_mode or self._debug_mode:
                    self._log.debug(
                        "Saving current model weights to temp. file [%s]" %
                        fname)

                if not self._simulation_mode:
                    self._model.save_weights(fname)

                return fname
            else:
                if self._simulation_mode or self._debug_mode:
                    self._log.debug(
                        "Saving current model to temp. file: "
                        "Copying model checkpoint [%s] to temp. file [%s]" %
                        (checkpoint_fname, fname))

                if not self._simulation_mode:
                    if self._copy(checkpoint_fname, fname):
                        # Success
                        return fname
                    else:
                        return None

        except Exception as e:
            _.log_exception(
                self._log,
                "Unable to save or copy current model as temp. model", e)
            return None
Example #10
    def _calc_eta(self, logs):

        current = self._get_logs_base(logs)

        eta_str = None
        try:
            training_params = current["training_params"]

            average_batch_duration = training_params["window_average"][
                "duration"]
            if average_batch_duration and average_batch_duration > 0:
                batch_step = current["batch_step"]
                final_batch_step = logs["final_batch_step"]
                num_batches_to_go = final_batch_step - batch_step + 1

                eta_seconds = int(
                    round(average_batch_duration * num_batches_to_go))

                eta_str = str(datetime.timedelta(seconds=eta_seconds))
            else:
                eta_str = "[UNKNOWN]"
        except Exception as e:
            _.log_exception(self._log, "Unable to calculate epoch ETA", e)

        return eta_str
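A worked example of the ETA arithmetic, using illustrative values:

import datetime

# With a window-averaged batch duration of 0.5s, 200 batches to go
# results in an ETA of 100 seconds, formatted as '0:01:40'
average_batch_duration = 0.5          # seconds
batch_step, final_batch_step = 300, 499
num_batches_to_go = final_batch_step - batch_step + 1   # 200

eta_seconds = int(round(average_batch_duration * num_batches_to_go))  # 100
print(str(datetime.timedelta(seconds=eta_seconds)))                   # 0:01:40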
Example #11
File: training.py Project: nuhame/mlpug
    def set_model_components_state(self, state):
        """

        :param state:
        :return: success (True or False)
        """
        if not _.is_callable(getattr(state, 'items', None)):
            self._log.error(
                "State is invalid, unable to set model components state")
            return False

        success = True
        for name, model_state in state.items():
            model = self.get_model_component(name)
            if model is None:
                self._log.error(
                    f"Model {name} not found, unable to set state")
                success = False
                continue

            try:
                self._set_model_state(model, model_state, name)
            except Exception as e:
                _.log_exception(self._log,
                                f"Unable to set state for model {name}", e)
                success = False

        return success
Example #12
File: training.py Project: nuhame/mlpug
    def get_state(self):
        """

        :return: state, success
        """

        if not self.instance_valid():
            self._log.error(
                'TrainingManager is not valid, unable to get training manager state'
            )
            return None, False

        state = {
            "manager": {
                "epoch": self.epoch,
                "batch_step": self.batch_step,
                "global_iter": self.global_iter,
                "logs": self.logs,
                "metric_windows": self._get_metric_windows_state()
            },
            "callbacks": {},
            "experiment_data": self.experiment_data
        }

        success = True
        for cb_idx, callback in enumerate(self.callbacks):
            cb_name = self.cb_names[cb_idx]
            cb_hash = self.cb_hashes[cb_idx]

            try:
                cb_state, cb_success = callback.get_state()

                success &= cb_success

                if cb_hash in state["callbacks"] and cb_state != state[
                        "callbacks"][cb_hash]:
                    self._log.error(
                        f"There is already a callback {cb_name} in the TrainingManager state "
                        f"with exactly the same hash; its state will be overridden now. "
                        f"Please ensure that all your callbacks have unique names/hashes.\n"
                        f"Callback index = {cb_idx}\n"
                        f"Callback hash  = {cb_hash}\n")
                    success = False

                state["callbacks"][cb_hash] = cb_state
            except Exception as e:
                _.log_exception(
                    self._log, f"Failed to get state of callback {cb_name}, "
                    f"unable to add callback state to Training Manager state.\n"
                    f"Callback index = {cb_idx}\n"
                    f"Callback hash  = {cb_hash}\n", e)
                success = False

        state["trainer"], get_trainer_state_success = self.trainer.get_state()
        success &= get_trainer_state_success

        return state, success
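A bare-bones sketch of persisting the returned state; the `training_manager` variable and file name are hypothetical (in mlpug this is handled by the checkpoint machinery shown in Example #4).

import pickle

# Hypothetical usage: persist the complete TrainingManager state
state, success = training_manager.get_state()
if not success:
    print("State gathering was only partially successful, saving anyway")

with open('training-checkpoint.pickle', 'wb') as f:
    pickle.dump(state, f)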
Example #13
    def _copy(self, source_fname, dest_fname):
        try:
            self._log.debug("Copying model:\n[%s] ==> \n[%s]\n" % (source_fname, dest_fname))
            copyfile(source_fname, dest_fname)
        except Exception as e:
            _.log_exception(self._log, "Unable to copy [%s]" % source_fname, e)
            return False

        return True
Example #14
    def _setup(self):
        try:
            success = self._setup_writer_for(self._writer_type)
        except Exception as e:
            _.log_exception(self._log, "An exception occurred setting up the Tensorboard callback", e)
            success = False

        self._valid = success

        return self._valid
Example #15
    def _setup_temp_model_path(self):
        try:
            self._temp_model_path = os.path.join(self._model_path,
                                                 'temp-models')
            if not os.path.exists(self._temp_model_path):
                os.makedirs(self._temp_model_path)
        except Exception as e:
            _.log_exception(
                self._log,
                "Unable to use temp. model path: %s" % self._temp_model_path,
                e)
Example #16
    def _get_epoch_duration(self, logs):
        current = self._get_logs_base(logs)

        duration_str = None
        try:
            epoch_duration = int(
                round(current["training_params"]["epoch"]["duration"]))
            duration_str = str(datetime.timedelta(seconds=epoch_duration))
        except Exception as e:
            _.log_exception(self._log, "Unable to get epoch duration", e)

        return duration_str
Example #17
    def reduce(self,
               batch_metric_data_lists,
               metrics_output,
               batch_metric_reducer_funcs=None,
               dataset_name=None):
        """

        Use the `batch_metric_reducer_funcs` to reduce the lists available in the `batch_metric_data_lists` dict

        The keys of `batch_metric_data_lists` must be the key path to the reduced metrics. Example:
        batch_metric_data_lists = {
            'loss': [ ... data to use to calculated loss ... ],
            'classification.F1': [ ... data to use to calculated F1 ... ]
        }

        To get the correct key paths you can use the `get_key_paths` as a utility

        :param batch_metric_data_lists: See above.
        :type batch_metric_data_lists: Dict

        :param metrics_output: Dict to which the reduced metrics will be written
        :type metrics_output: Dict

        :param batch_metric_reducer_funcs: Optional alternative dict wth reducer functions
        :type batch_metric_reducer_funcs: dict

        :param dataset_name: Used for logging purposes
        :type dataset_name:

        :return: True on success, else False
        :rtype:
        """
        success = True

        if batch_metric_reducer_funcs is None:
            batch_metric_reducer_funcs = self._batch_metric_reducer_funcs

        for metric_path, batch_metric_data_list in batch_metric_data_lists.items():
            try:
                reducer_func = get_value_at(metric_path,
                                            batch_metric_reducer_funcs)
                set_value_at(metric_path, metrics_output,
                             reducer_func(batch_metric_data_list))
            except Exception as e:
                _.log_exception(
                    self._log,
                    f"Exception occurred reducing {metric_path} for "
                    f"{'' if dataset_name is None else dataset_name} dataset "
                    f"batch metric data", e)
                success = False

        return success
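A usage sketch following the docstring's key-path convention; the `evaluator` instance, the reducer functions and the data values are all illustrative.

import statistics

# Illustrative input: key paths map to lists of per-batch data
batch_metric_data_lists = {
    'loss': [0.9, 0.7, 0.6],
    'classification.F1': [0.61, 0.65, 0.70]
}

# Reducers are looked up via the same (nested) key paths
reducers = {
    'loss': statistics.mean,
    'classification': {'F1': statistics.mean}
}

metrics_output = {}
evaluator.reduce(batch_metric_data_lists,
                 metrics_output,
                 batch_metric_reducer_funcs=reducers,
                 dataset_name='validation')
# metrics_output now holds the reduced values, e.g.
# {'loss': 0.733..., 'classification': {'F1': 0.653...}}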
Example #18
File: training.py Project: nuhame/mlpug
    def _calc_window_sum(self, metric_path):
        try:
            window = self._metric_windows[metric_path]
            if window is None:
                return None

            window_data = window.window
            return sum(window_data) if len(window_data) > 0 else None
        except Exception as e:
            _.log_exception(
                self._log,
                f"Exception occurred calculating sliding window average for {metric_path}.",
                e)
            return None
Example #19
File: training.py Project: nuhame/mlpug
    def _update_window(self, metric_path):
        try:
            window = self._metric_windows[metric_path]
            if window is None:
                return

            value = get_value_at(metric_path, self.logs["current"])

            window.slide(value)
        except Exception as e:
            _.log_exception(
                self._log,
                f"Exception occurred updating sliding window {metric_path}, skipped...",
                e)
Example #20
    def _save_checkpoint(self):
        try:
            fname = self.latest_model_file_name()
            if self._simulation_mode or self._debug_mode:
                self._log.debug("Saving checkpoint as [%s]" % fname)

            if not self._simulation_mode:
                self._model.save_weights(fname)

            return fname
        except Exception as e:
            _.log_exception(self._log, "Unable to save current model", e)

        return None
Example #21
    def _get_average_batch_duration(self, logs):
        current = self._get_logs_base(logs)

        duration_str = "[UNKNOWN]"
        try:
            duration = current["training_params"]["window_average"]["duration"]
            if duration and duration > 0.0:
                duration = int(duration * 1000)
                duration_str = f"{duration}ms"
        except Exception as e:
            _.log_exception(self._log, "Unable to get average batch duration",
                            e)

        return duration_str
Example #22
    def _copy(self, source_fname, dest_fname):
        success = True

        try:
            if self._simulation_mode or self._debug_mode:
                self._log.debug("Copying model: [%s] ==> [%s]" %
                                (source_fname, dest_fname))

            if not self._simulation_mode:
                copyfile(source_fname, dest_fname)
        except Exception as e:
            _.log_exception(self._log, "Unable to copy [%s]" % source_fname, e)
            success = False

        return success
Example #23
    def _setup(self):
        try:
            # Metrics writer is used for batch level and epoch level
            success = self._setup_writer_for(METRIC_WRITER)

            if self._show_batch_level and self._show_batch_window_averages:
                success &= self._setup_writer_for(WINDOW_AVERAGED_METRICS_WRITER)

        except Exception as e:
            _.log_exception(self._log, "An exception occurred setting up the Tensorboard callback", e)
            success = False

        self._valid = success

        return self._valid
Example #24
    def _save_current_model_as_best(self):
        model_fn = self.best_model_file_name()

        if self._backup_before_override:
            err_msg = "A problem occurred backing up the last best model checkpoint, " \
                      "will not override override model checkpoint with new one. \n\n" \
                      "*** Please ensure there is enough disk space to store the backups and checkpoints ***\n\n"
            try:
                if not self._backup_checkpoint(model_fn):
                    self._log.error(err_msg)
                    return None
            except Exception as e:
                _.log_exception(self._log, err_msg, e)
                return None

        return self._create_model_checkpoint(model_fn=model_fn)
Example #25
    def _remove_model(self, model_fname):
        try:
            if self._simulation_mode or self._debug_mode:
                self._log.debug("Removing model: [%s]" % model_fname)

            if not os.path.isfile(model_fname):
                if self._simulation_mode or self._debug_mode:
                    self._log.debug(
                        "File does not exists, will not remove ...")

                return

            if not self._simulation_mode:
                os.remove(model_fname)
        except Exception as e:
            _.log_exception(self._log,
                            "Unable to remove file [%s]" % model_fname, e)
Example #26
    def _update_lr(self, logs, iter_name):
        current = self._get_logs_base(logs)

        model_quality = self._get_model_quality(current)
        training_iter = current[iter_name]

        success = False
        try:
            success = self._exec_schedulers(training_iter, model_quality)
            if not success:
                self._log.error("Updating of the learning rate(s) failed.")
        except Exception as e:
            _.log_exception(
                self._log, "An unexpected error occurred, "
                "execution of the learning rate scheduler(s) failed", e)

        return success
Example #27
File: training.py Project: nuhame/mlpug
    def _set_metric_windows_states(self, state):
        success = True
        try:
            metric_windows_state = get_value_at("manager.metric_windows",
                                                state)
            if metric_windows_state is None:
                return success

            for metric_path, window_state in metric_windows_state.items():
                success &= self._set_metric_window_state(
                    metric_path, window_state)

        except Exception as e:
            _.log_exception(
                self._log, f"Unable to set metric windows state, skipped ...",
                e)
            success = False

        return success
Example #28
File: training.py Project: nuhame/mlpug
    def get_optimizers_state(self):
        """

        :return: state, success (True or False)
        """
        state = {}

        optimizers = self.get_optimizers()

        success = True
        for name, optimizer in optimizers.items():
            try:
                state[name] = self._get_optimizer_state(optimizer, name)
            except Exception as e:
                _.log_exception(self._log,
                                f"Unable to get state for optimizer {name}", e)
                success = False

        return state, success
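Together with set_optimizers_state (Example #5) this enables a save/restore round trip; a minimal sketch with a hypothetical `trainer`.

# Round-trip sketch combining Examples #5 and #28; 'trainer' is assumed
optimizers_state, success = trainer.get_optimizers_state()

# ... later, e.g. when resuming training from a checkpoint ...
if not trainer.set_optimizers_state(optimizers_state):
    # set_optimizers_state() logs per-optimizer failures itself
    raise RuntimeError("Could not restore the optimizer state")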
Example #29
File: training.py Project: nuhame/mlpug
    def get_model_components_state(self):
        """

        :return: state, success (True or False)
        """
        state = {}

        model_components = self.get_model_components()

        success = True
        for name, model in model_components.items():
            try:
                state[name] = self._get_model_state(model, name)
            except Exception as e:
                _.log_exception(self._log,
                                f"Unable to get state for model {name}", e)
                success = False

        return state, success
Example #30
File: training.py Project: nuhame/mlpug
    def set_learning_rate(self, lr):
        """
        Convenience method to set the learning rate of all optimizers to `lr`

        :param lr: Learning rate to set
        :return: True on success else False
        """
        success = True

        optimizer_names = self.get_optimizers().keys()
        for opt_name in optimizer_names:
            try:
                success &= self.set_learning_rate_for(opt_name, lr)
            except Exception as e:
                _.log_exception(
                    self._log,
                    f"Unable to set learning rate for optimizer {opt_name}", e)
                success = False

        return success
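A usage sketch; `trainer` and the learning rate value are illustrative.

# Hypothetical usage: one call updates every registered optimizer
trainer.set_learning_rate(5e-5)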