def hpc_save(self, folderpath: str, logger):
    """Write an HPC checkpoint into ``folderpath`` and return the file path.

    The file is named ``hpc_ckpt_<n>.ckpt`` where ``n`` is one more than
    ``self.max_ckpt_in_folder(folderpath)``. The logger is saved first, and
    the model's ``on_hpc_save`` hook receives the checkpoint before it is
    written.

    Args:
        folderpath: Directory that receives the checkpoint; coerced to ``str``.
        logger: Logger whose ``save()`` is called before the checkpoint is
            assembled.

    Returns:
        The path of the checkpoint file that was written.
    """

    def _ensure_folder(path):
        # create the checkpoint directory when it is missing
        if not gfile.exists(path):
            makedirs(path)

    # the tests pass a path object, so normalise to str first
    folderpath = str(folderpath)
    _ensure_folder(folderpath)

    # flush the logger so that all metrics are captured
    logger.save()

    next_index = self.max_ckpt_in_folder(folderpath) + 1

    # the folder is checked again right before the file path is built
    _ensure_folder(folderpath)
    ckpt_file = os.path.join(folderpath, f'hpc_ckpt_{next_index}.ckpt')

    # give the model a chance to act on the checkpoint
    module = self.get_model()
    checkpoint = self.dump_checkpoint()
    module.on_hpc_save(checkpoint)

    # do the actual save
    # TODO: fix for anything with multiprocess DP, DDP, DDP2
    try:
        atomic_save(checkpoint, ckpt_file)
    except AttributeError as err:
        # drop the hyper-parameter entry (if any) and retry the save
        checkpoint.pop(LightningModule.CHECKPOINT_HYPER_PARAMS_KEY, None)
        rank_zero_warn(
            'warning, `module_arguments` dropped from checkpoint.'
            f' An attribute is not picklable {err}'
        )
        atomic_save(checkpoint, ckpt_file)

    return ckpt_file
def _save_model(self, filepath, trainer, pl_module):
    """Save a checkpoint to ``filepath`` through ``self.save_function``.

    Args:
        filepath: Destination of the checkpoint file.
        trainer: Trainer whose ``dev_debugger`` records the save.
        pl_module: Not used by this method.

    Raises:
        ValueError: If ``self.save_function`` is ``None``.
    """
    # in debugging runs, keep a history of checkpoint saves
    trainer.dev_debugger.track_checkpointing_history(filepath)

    # make sure the parent directory exists
    parent_dir = os.path.dirname(filepath)
    if not gfile.exists(parent_dir):
        makedirs(parent_dir)

    # the save itself is delegated to the configured callable
    if self.save_function is None:
        raise ValueError(".save_function() not set")
    self.save_function(filepath, self.save_weights_only)
def experiment(self) -> SummaryWriter:
    r"""
    Return the underlying tensorboard writer, creating it on first use.

    The writer is cached on ``self._experiment``. On first use the root
    directory is created if it is set and missing, and a ``SummaryWriter``
    is built with ``log_dir=self.log_dir`` and the stored keyword arguments.

    To use TensorBoard features in your
    :class:`~pytorch_lightning.core.lightning.LightningModule` do the following.

    Example::

        self.logger.experiment.some_tensorboard_function()

    """
    writer = self._experiment
    if writer is None:
        # log directories may only be initialised when the rank is 0
        assert rank_zero_only.rank == 0, 'tried to init log dirs in non global_rank=0'

        root = self.root_dir
        if root and not gfile.exists(str(root)):
            makedirs(root)

        writer = SummaryWriter(log_dir=self.log_dir, **self._kwargs)
        self._experiment = writer
    return writer
def on_train_start(self, trainer, pl_module):
    """
    Resolve the checkpoint directory at runtime when none was configured.

    Nothing happens if ``self.dirpath`` is already set. Otherwise the file
    name template becomes ``'{epoch}'`` and the directory is chosen as follows:

    * trainer has a logger:
      ``<save_dir>/<logger.name>/<version>/checkpoints``, where ``save_dir`` is
      ``trainer.weights_save_path`` if it differs from
      ``trainer.default_root_dir``, else ``logger.save_dir`` falling back to
      ``trainer.default_root_dir``. A non-string logger version is rendered
      as ``version_<value>``.
    * trainer has no logger: ``<trainer.weights_save_path>/checkpoints``.

    The resulting directory is created if it does not exist.
    """
    if self.dirpath is not None:
        return  # short circuit

    self.filename = '{epoch}'

    logger = trainer.logger
    if logger is None:
        ckpt_path = os.path.join(trainer.weights_save_path, "checkpoints")
    else:
        if trainer.weights_save_path != trainer.default_root_dir:
            # a user-changed weights_save_path overrides everything else
            save_dir = trainer.weights_save_path
        else:
            save_dir = logger.save_dir or trainer.default_root_dir

        version = logger.version
        if not isinstance(version, str):
            version = f'version_{version}'

        ckpt_path = os.path.join(save_dir, logger.name, version, "checkpoints")

    self.dirpath = ckpt_path

    assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0'
    if not gfile.exists(self.dirpath):
        makedirs(self.dirpath)
def __init__(self, filepath: Optional[str] = None, monitor: str = 'val_loss', verbose: bool = False,
             save_last: bool = False, save_top_k: int = 1, save_weights_only: bool = False,
             mode: str = 'auto', period: int = 1, prefix: str = ''):
    """Configure the checkpoint callback.

    Args:
        filepath: Directory or file path for checkpoints. An existing
            directory is used as-is with the file name template ``'{epoch}'``;
            any other value is split into directory and file name (local
            paths are resolved with ``os.path.realpath`` first) and the
            directory is created. ``None`` leaves both unset so the trainer
            can determine them at runtime.
        monitor: Name of the monitored quantity. In ``'auto'`` mode a name
            containing ``'acc'`` or starting with ``'fmeasure'`` selects
            ``'max'``, anything else selects ``'min'``.
        verbose: Stored on the instance.
        save_last: Stored on the instance.
        save_top_k: Stored on the instance. A warning is issued when it is
            positive and ``filepath`` is an existing non-empty directory.
        save_weights_only: Stored on the instance.
        mode: One of ``'min'``, ``'max'`` or ``'auto'``. An unknown value
            triggers a ``RuntimeWarning`` and falls back to ``'auto'``.
        period: Stored on the instance.
        prefix: Stored on the instance.
    """
    super().__init__()
    if filepath:
        # the tests pass in a py.path.local but we want a str
        filepath = str(filepath)

    # NOTE(review): the condition is `save_top_k > 0` while the message says
    # `!= 0` -- confirm which one is intended.
    if save_top_k > 0 and filepath is not None and gfile.isdir(filepath) and len(gfile.listdir(filepath)) > 0:
        rank_zero_warn(
            f"Checkpoint directory {filepath} exists and is not empty with save_top_k != 0."
            " All files in this directory will be deleted when a checkpoint is saved!"
        )
    self._rank = 0

    self.monitor = monitor
    self.verbose = verbose
    if filepath is None:  # will be determined by trainer at runtime
        self.dirpath, self.filename = None, None
    else:
        if gfile.isdir(filepath):
            self.dirpath, self.filename = filepath, '{epoch}'
        else:
            if not is_remote_path(filepath):  # dont normalize remote paths
                filepath = os.path.realpath(filepath)
            self.dirpath, self.filename = os.path.split(filepath)
        makedirs(self.dirpath)  # calls with exist_ok
    self.save_last = save_last
    self.save_top_k = save_top_k
    self.save_weights_only = save_weights_only
    self.period = period
    self.epoch_last_check = None
    self.prefix = prefix
    self.best_k_models = {}
    # {filename: monitor}
    self.kth_best_model_path = ''
    self.best_model_score = 0
    self.best_model_path = ''
    self.save_function = None
    self.warned_result_obj = False

    # `np.inf` is the canonical spelling; the `np.Inf` alias was removed in NumPy 2.0
    torch_inf = torch.tensor(np.inf)
    auto_is_max = 'acc' in self.monitor or self.monitor.startswith('fmeasure')
    mode_dict = {
        'min': (torch_inf, 'min'),
        'max': (-torch_inf, 'max'),
        'auto': (-torch_inf, 'max') if auto_is_max else (torch_inf, 'min'),
    }

    if mode not in mode_dict:
        rank_zero_warn(
            f'ModelCheckpoint mode {mode} is unknown, '
            f'fallback to auto mode.', RuntimeWarning)
        mode = 'auto'

    self.kth_value, self.mode = mode_dict[mode]