def save_checkpoint(self, filename, extra_state): """Save all training state in a checkpoint file.""" if distributed_utils.is_master(self.args): # only save one checkpoint extra_state["metrics"] = metrics.state_dict() checkpoint_utils.save_state( filename, self.args, self.get_model().state_dict(), self.get_criterion(), self.optimizer, self.lr_scheduler, self.get_num_updates(), self._optim_history, extra_state, )
def save_checkpoint(self, filename, extra_state): """Save all training state in a checkpoint file.""" if self.is_data_parallel_master: # only save one checkpoint extra_state["metrics"] = metrics.state_dict() extra_state["previous_training_time"] = self.cumulative_training_time() checkpoint_utils.save_state( filename, self.args, self.get_model().state_dict(), self.get_criterion(), self.optimizer, self.lr_scheduler, self.get_num_updates(), self._optim_history, extra_state, )
def save_checkpoint(self, filename, extra_state): """Save all training state in a checkpoint file.""" if self.is_data_parallel_master: # only save one checkpoint logger.info(f"Saving checkpoint to {filename}") extra_state["metrics"] = metrics.state_dict() extra_state["previous_training_time"] = self.cumulative_training_time() checkpoint_utils.save_state( filename, self.cfg, self.model.state_dict(), self.criterion, self.optimizer, self.lr_scheduler, self.get_num_updates(), self._optim_history, extra_state, ) logger.info(f"Finished saving checkpoint to {filename}")