def on_training_start(self, num_epochs, num_batches_per_epoch, start_epoch, start_batch, start_update_iter):
    if MetricsLoggingMode.will_log_window_average_metrics(self._logging_mode):
        if self._batch_averaging_window is None:
            try:
                self._batch_averaging_window = len(self._dataset)
                if self._batch_averaging_window == math.inf:
                    self._log.error(
                        f"The batch average window of the {self._dataset_name} data set is infinite, "
                        f"unable to calculate window average. "
                        f"Please set a finite batch_processing_window during construction of "
                        f"this TestMetricsLogger.")
                    self._valid = False
                else:
                    self._log.debug(
                        f"Set batch averaging window to number of batches in "
                        f"{self._dataset_name} data set : {self._batch_averaging_window}")
            except Exception as e:
                _.log_exception(
                    self._log,
                    f"Unable to assess data set length to set the batch averaging window, "
                    f"{self} will not function", e)
                self._valid = False

    return super().on_training_start(num_epochs, num_batches_per_epoch, start_epoch, start_batch, start_update_iter)
def _set_metric_window_state(self, metric_path, window_state):
    success = True
    try:
        window_length = window_state['length']
        if self.num_batches_per_epoch and (window_length != self.num_batches_per_epoch):
            # override when a different number of batches per epoch is given (or calculated)
            # during construction
            window = self.sliding_window_factory(
                length=self.num_batches_per_epoch,
                init_window_values=window_state['window'],
                name=metric_path)
        else:
            window = self.sliding_window_factory(state=window_state)

        self._metric_windows[metric_path] = window
    except Exception as e:
        _.log_exception(
            self._log,
            f"Unable to set metric window state for {metric_path}, skipped ...", e)
        success = False

    return success
def set_learning_rate_for(self, optimizer_name, lr):
    """
    Set learning rate for specific optimizer `optimizer_name` to `lr`

    :param optimizer_name:
    :param lr:

    :return: True on success, else False
    """
    optimizer = self.get_optimizer(optimizer_name)
    if not hasattr(optimizer, 'param_groups'):
        self._log.error(
            f"No valid optimizer available with name {optimizer_name}, unable to set learning rate")
        return False

    try:
        for group in optimizer.param_groups:
            group['lr'] = lr
    except Exception as e:
        _.log_exception(
            self._log,
            f"Unable to set learning rate for optimizer {optimizer_name}", e)
        return False

    self._log.debug(
        f"Learning rate of optimizer {optimizer_name} set to : {lr}")

    return True
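# Usage sketch (illustration only, not part of the library): assuming an object
# `trainer` exposing set_learning_rate_for() as above, with an optimizer registered
# under the hypothetical name "encoder_optimizer":
#
#     if not trainer.set_learning_rate_for("encoder_optimizer", 1e-4):
#         logger.warning("Could not update the encoder learning rate")
#
# The method only touches `param_groups`, so it works for any optimizer exposing
# that attribute (e.g. PyTorch optimizers); other optimizer types are rejected.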
def _create_training_checkpoint(self):
    checkpoint_fname = self.training_checkpoint_file_name()

    if self._backup_before_override:
        try:
            if not self._backup_checkpoint(checkpoint_fname):
                self._log.error("Unable to backup last training checkpoint, "
                                "will not override training checkpoint with new one")
                return None
        except Exception as e:
            _.log_exception(self._log,
                            f"A problem occurred backing up the last training checkpoint, "
                            f"will not override training checkpoint with new one", e)
            return None

    try:
        state, success = self._gather_training_checkpoint_data()
        if state is not None:
            if not success:
                self._log.warn("Gathering the training checkpoint data was not completely successful, "
                               "will save available checkpoint data anyway ...")

            self._log.debug(f"Saving training checkpoint : {checkpoint_fname}")
            self._save_training_checkpoint(checkpoint_fname, state)
        else:
            return None
    except Exception as e:
        _.log_exception(self._log, f"Unable to save training checkpoint", e)
        return None

    return checkpoint_fname
def set_optimizers_state(self, state):
    """

    :param state:

    :return: success (True, False)
    """
    if not _.is_callable(getattr(state, 'items', None)):
        self._log.error("State is invalid, unable to set optimizers state")
        return False

    success = True
    for name, optimizer_state in state.items():
        optimizer = self.get_optimizer(name)
        if optimizer is None:
            self._log.error(
                f"No optimizer found with name {name}, unable to set state")
            success = False
            continue

        try:
            self._set_optimizer_state(optimizer, optimizer_state, name)
        except Exception as e:
            _.log_exception(self._log,
                            f"Unable to set state for optimizer {name}", e)
            success = False

    return success
def _gather_model_checkpoint_data(self):
    """

    :return: state, success
    """
    state, success = self.trainer.get_model_components_state()
    if state is not None:
        if not success:
            self._log.warn("Getting the model components state was not completely successful, "
                           "continuing anyway ...")

        if self._model_hyper_parameters is not None:
            state['hyper_parameters'] = self._model_hyper_parameters

        try:
            manager_state, manager_state_success = self.training_manager.get_state_for_model_checkpoint()
            state['manager_state'] = manager_state

            if not manager_state_success:
                self._log.warn("Getting the manager state for the model checkpoint was not successful, "
                               "will continue anyway ...")
                success = False
        except Exception as e:
            _.log_exception(self._log,
                            "Unable to add manager state to model checkpoint, "
                            "continuing anyway ...", e)
            success = False
    else:
        # In any case when the state is None, gathering the model checkpoint data is not successful
        success = False

    return state, success
def _save_checkpoint_state(self):
    try:
        fname = self.checkpoint_state_file_name()

        # only backup if exists
        if os.path.isfile(fname):
            backup_success = self._copy(fname, "%s.backup" % fname)
            if not backup_success:
                self._log.error(
                    "Backing up checkpoint state unsuccessful, "
                    "will not override checkpoint state file with new state.")
                return

        if self._simulation_mode or self._debug_mode:
            self._log.debug("Saving checkpoint state to [%s]" % fname)

        if not self._simulation_mode:
            with open(fname, 'wb') as f:
                pickle.dump({
                    "model_quality": self._model_quality,
                    "model_iter": self._model_iter,
                    "best_model": self._best_model,
                    "best_model_quality": self._best_model_quality,
                    "earliest_good_model": self._earliest_good_model,
                    "earliest_good_model_iter": self._earliest_good_model_iter,
                    "iter": self._iter
                }, f)
    except Exception as e:
        _.log_exception(self._log, "Unable to save checkpoint state", e)
def _create_model_checkpoint(self, model_fn=None, file_to_copy=None):
    if model_fn is None:
        model_fn = self.latest_model_file_name()

    if file_to_copy:
        if not self._copy(file_to_copy, model_fn):
            self._log.error(f"Unable to create model checkpoint based on file : {file_to_copy}")
            return None
    else:
        try:
            state, success = self._gather_model_checkpoint_data()
            if state is not None:
                if not success:
                    self._log.warn("Gathering the model checkpoint data was not completely successful, "
                                   "will save available checkpoint data anyway ...")

                self._log.debug(f"Saving model checkpoint : {model_fn}")
                self._save_model_checkpoint(model_fn, state)
            else:
                return None
        except Exception as e:
            _.log_exception(self._log,
                            f"A problem occurred saving the latest model as checkpoint", e)
            return None

    return model_fn
def _save_current_model_as_temp(self, checkpoint_fname):
    try:
        fname = tempfile.mkstemp(dir=self._temp_model_path)
        fname = fname[1]

        if checkpoint_fname is None:
            if self._simulation_mode or self._debug_mode:
                self._log.debug(
                    "Saving current model weights to temp. file [%s]" % fname)

            if not self._simulation_mode:
                self._model.save_weights(fname)

            return fname
        else:
            if self._simulation_mode or self._debug_mode:
                self._log.debug(
                    "Saving current model to temp. file: "
                    "Copying model checkpoint [%s] to temp. file [%s]" %
                    (checkpoint_fname, fname))

            if not self._simulation_mode:
                if self._copy(checkpoint_fname, fname):
                    # Success
                    return fname
                else:
                    return None
    except Exception as e:
        _.log_exception(
            self._log, "Unable to save or copy current model as temp. model", e)
        return None
def _calc_eta(self, logs):
    current = self._get_logs_base(logs)

    eta_str = None
    try:
        training_params = current["training_params"]
        average_batch_duration = training_params["window_average"]["duration"]

        if average_batch_duration and average_batch_duration > 0:
            batch_step = current["batch_step"]
            final_batch_step = logs["final_batch_step"]

            num_batches_to_go = final_batch_step - batch_step + 1

            eta_seconds = int(round(average_batch_duration * num_batches_to_go))
            eta_str = str(datetime.timedelta(seconds=eta_seconds))
        else:
            eta_str = "[UNKNOWN]"
    except Exception as e:
        _.log_exception(self._log, "Unable to calculate epoch ETA", e)

    return eta_str
def set_model_components_state(self, state):
    """

    :param state:

    :return: success (True or False)
    """
    if not _.is_callable(getattr(state, 'items', None)):
        self._log.error(
            "State is invalid, unable to set model components state")
        return False

    success = True
    for name, model_state in state.items():
        model = self.get_model_component(name)
        if model is None:
            self._log.error(
                f"No model found with name {name}, unable to set state")
            success = False
            continue

        try:
            self._set_model_state(model, model_state, name)
        except Exception as e:
            _.log_exception(self._log,
                            f"Unable to set state for model {name}", e)
            success = False

    return success
def get_state(self):
    """

    :return: state, success
    """
    if not self.instance_valid():
        self._log.error(
            'TrainingManager is not valid, unable to get training manager state')
        return None, False

    state = {
        "manager": {
            "epoch": self.epoch,
            "batch_step": self.batch_step,
            "global_iter": self.global_iter,
            "logs": self.logs,
            "metric_windows": self._get_metric_windows_state()
        },
        "callbacks": {},
        "experiment_data": self.experiment_data
    }

    success = True
    for cb_idx, callback in enumerate(self.callbacks):
        cb_name = self.cb_names[cb_idx]
        cb_hash = self.cb_hashes[cb_idx]

        try:
            cb_state, cb_success = callback.get_state()
            success &= cb_success

            if cb_hash in state["callbacks"] and cb_state != state["callbacks"][cb_hash]:
                self._log.error(
                    f"There is already a callback {cb_name} in the TrainingManager state "
                    f"with exactly the same hash, this state will be overridden now. "
                    f"Please ensure that all your callbacks have unique names/hashes.\n"
                    f"Callback index = {cb_idx}\n"
                    f"Callback hash = {cb_hash}\n")
                success = False

            state["callbacks"][cb_hash] = cb_state
        except Exception as e:
            _.log_exception(
                self._log,
                f"Failed to get state of callback {cb_name}, "
                f"unable to add callback state to Training Manager state.\n"
                f"Callback index = {cb_idx}\n"
                f"Callback hash = {cb_hash}\n", e)
            success = False

    state["trainer"], get_trainer_state_success = self.trainer.get_state()
    success &= get_trainer_state_success

    return state, success
def _copy(self, source_fname, dest_fname):
    try:
        self._log.debug("Copying model:\n[%s] ==> \n[%s]\n" %
                        (source_fname, dest_fname))
        copyfile(source_fname, dest_fname)
    except Exception as e:
        _.log_exception(self._log, "Unable to copy [%s]" % source_fname, e)
        return False

    return True
def _setup(self):
    try:
        success = self._setup_writer_for(self._writer_type)
    except Exception as e:
        _.log_exception(self._log,
                        "An exception occurred setting up the Tensorboard callback", e)
        success = False

    self._valid = success

    return self._valid
def _setup_temp_model_path(self):
    try:
        self._temp_model_path = os.path.join(self._model_path, 'temp-models')
        if not os.path.exists(self._temp_model_path):
            os.makedirs(self._temp_model_path)
    except Exception as e:
        _.log_exception(
            self._log,
            "Unable to use temp. model path: %s" % self._temp_model_path, e)
def _get_epoch_duration(self, logs):
    current = self._get_logs_base(logs)

    duration_str = None
    try:
        epoch_duration = int(round(current["training_params"]["epoch"]["duration"]))
        duration_str = str(datetime.timedelta(seconds=epoch_duration))
    except Exception as e:
        _.log_exception(self._log, "Unable to get epoch duration", e)

    return duration_str
def reduce(self, batch_metric_data_lists, metrics_output, batch_metric_reducer_funcs=None, dataset_name=None):
    """
    Use the `batch_metric_reducer_funcs` to reduce the lists available in the `batch_metric_data_lists` dict.
    The keys of `batch_metric_data_lists` must be the key paths to the reduced metrics.

    Example:

    batch_metric_data_lists = {
        'loss': [ ... data to use to calculate loss ... ],
        'classification.F1': [ ... data to use to calculate F1 ... ]
    }

    To get the correct key paths you can use `get_key_paths` as a utility.

    :param batch_metric_data_lists: See above.
    :type batch_metric_data_lists: Dict

    :param metrics_output: Dict to which the reduced metrics will be written
    :type metrics_output: Dict

    :param batch_metric_reducer_funcs: Optional alternative dict with reducer functions
    :type batch_metric_reducer_funcs: dict

    :param dataset_name: Used for logging purposes
    :type dataset_name:

    :return: True on success, else False
    :rtype:
    """
    success = True

    if batch_metric_reducer_funcs is None:
        batch_metric_reducer_funcs = self._batch_metric_reducer_funcs

    for metric_path, batch_metric_data_list in batch_metric_data_lists.items():
        try:
            reducer_func = get_value_at(metric_path, batch_metric_reducer_funcs)
            set_value_at(metric_path, metrics_output, reducer_func(batch_metric_data_list))
        except Exception as e:
            _.log_exception(
                self._log,
                f"Exception occurred reducing {metric_path} for "
                f"{'' if dataset_name is None else dataset_name} dataset "
                f"batch metric data", e)
            success = False

    return success
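# Usage sketch (illustration only; `evaluator`, the reducer functions and the metric
# values are made-up examples, not part of the library): assuming reducer functions
# were registered under the same key paths used in `batch_metric_data_lists`, e.g.
# {'loss': lambda values: sum(values) / len(values)}, calling reduce() writes the
# reduced values into `metrics_output` using the same nested key paths:
#
#     batch_metric_data_lists = {
#         "loss": [0.72, 0.65, 0.61],          # per-batch loss values
#         "classification.F1": [...]           # per-batch data for a hypothetical F1 reducer
#     }
#     metrics_output = {}
#     ok = evaluator.reduce(batch_metric_data_lists, metrics_output, dataset_name="validation")
#     # On success, metrics_output could look like:
#     # {"loss": 0.66, "classification": {"F1": ...}}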
def _calc_window_sum(self, metric_path):
    try:
        window = self._metric_windows[metric_path]
        if window is None:
            return None

        window_data = window.window

        return sum(window_data) if len(window_data) > 0 else None
    except Exception as e:
        _.log_exception(
            self._log,
            f"Exception occurred calculating sliding window sum for {metric_path}.", e)
        return None
def _update_window(self, metric_path):
    try:
        window = self._metric_windows[metric_path]
        if window is None:
            return

        value = get_value_at(metric_path, self.logs["current"])
        window.slide(value)
    except Exception as e:
        _.log_exception(
            self._log,
            f"Exception occurred updating sliding window {metric_path}, skipped...", e)
def _save_checkpoint(self):
    try:
        fname = self.latest_model_file_name()

        if self._simulation_mode or self._debug_mode:
            self._log.debug("Saving checkpoint as [%s]" % fname)

        if not self._simulation_mode:
            self._model.save_weights(fname)

        return fname
    except Exception as e:
        _.log_exception(self._log, "Unable to save current model", e)
        return None
def _get_average_batch_duration(self, logs):
    current = self._get_logs_base(logs)

    duration_str = "[UNKNOWN]"
    try:
        duration = current["training_params"]["window_average"]["duration"]
        if duration and duration > 0.0:
            duration = int(duration * 1000)
            duration_str = f"{duration}ms"
    except Exception as e:
        _.log_exception(self._log, "Unable to get average batch duration", e)

    return duration_str
def _copy(self, source_fname, dest_fname):
    success = True
    try:
        if self._simulation_mode or self._debug_mode:
            self._log.debug("Copying model: [%s] ==> [%s]" %
                            (source_fname, dest_fname))

        if not self._simulation_mode:
            copyfile(source_fname, dest_fname)
    except Exception as e:
        _.log_exception(self._log, "Unable to copy [%s]" % source_fname, e)
        success = False

    return success
def _setup(self):
    try:
        # Metrics writer is used for batch level and epoch level
        success = self._setup_writer_for(METRIC_WRITER)

        if self._show_batch_level and self._show_batch_window_averages:
            success &= self._setup_writer_for(WINDOW_AVERAGED_METRICS_WRITER)
    except Exception as e:
        _.log_exception(self._log,
                        "An exception occurred setting up the Tensorboard callback", e)
        success = False

    self._valid = success

    return self._valid
def _save_current_model_as_best(self):
    model_fn = self.best_model_file_name()

    if self._backup_before_override:
        err_msg = "A problem occurred backing up the last best model checkpoint, " \
                  "will not override model checkpoint with new one. \n\n" \
                  "*** Please ensure there is enough disk space to store the backups and checkpoints ***\n\n"
        try:
            if not self._backup_checkpoint(model_fn):
                self._log.error(err_msg)
                return None
        except Exception as e:
            _.log_exception(self._log, err_msg, e)
            return None

    return self._create_model_checkpoint(model_fn=model_fn)
def _remove_model(self, model_fname):
    try:
        if self._simulation_mode or self._debug_mode:
            self._log.debug("Removing model: [%s]" % model_fname)

        if not os.path.isfile(model_fname):
            if self._simulation_mode or self._debug_mode:
                self._log.debug("File does not exist, will not remove ...")
            return

        if not self._simulation_mode:
            os.remove(model_fname)
    except Exception as e:
        _.log_exception(self._log, "Unable to remove file [%s]" % model_fname, e)
def _update_lr(self, logs, iter_name):
    current = self._get_logs_base(logs)

    model_quality = self._get_model_quality(current)
    training_iter = current[iter_name]

    success = False
    try:
        success = self._exec_schedulers(training_iter, model_quality)
        if not success:
            self._log.error("Updating of the learning rate(s) failed.")
    except Exception as e:
        _.log_exception(
            self._log,
            "An unexpected error occurred, "
            "execution of the learning rate scheduler(s) failed", e)

    return success
def _set_metric_windows_states(self, state):
    success = True
    try:
        metric_windows_state = get_value_at("manager.metric_windows", state)
        if metric_windows_state is None:
            return success

        for metric_path, window_state in metric_windows_state.items():
            success &= self._set_metric_window_state(metric_path, window_state)
    except Exception as e:
        _.log_exception(
            self._log,
            f"Unable to set metric windows state, skipped ...", e)
        success = False

    return success
def get_optimizers_state(self):
    """

    :return: state, success (True or False)
    """
    state = {}
    optimizers = self.get_optimizers()

    success = True
    for name, optimizer in optimizers.items():
        try:
            state[name] = self._get_optimizer_state(optimizer, name)
        except Exception as e:
            _.log_exception(self._log,
                            f"Unable to get state for optimizer {name}", e)
            success = False

    return state, success
def get_model_components_state(self):
    """

    :return: state, success (True or False)
    """
    state = {}
    model_components = self.get_model_components()

    success = True
    for name, model in model_components.items():
        try:
            state[name] = self._get_model_state(model, name)
        except Exception as e:
            _.log_exception(self._log,
                            f"Unable to get state for model {name}", e)
            success = False

    return state, success
def set_learning_rate(self, lr):
    """
    Convenience method to set the learning rate of all optimizers to `lr`

    :param lr: Learning rate to set

    :return: True on success else False
    """
    success = True

    optimizer_names = self.get_optimizers().keys()
    for opt_name in optimizer_names:
        try:
            success &= self.set_learning_rate_for(opt_name, lr)
        except Exception as e:
            _.log_exception(
                self._log,
                f"Unable to set learning rate for optimizer {opt_name}", e)
            success = False

    return success