Example #1
    def _do_save(self, trainer: 'pl.Trainer', filepath: str) -> None:
        # in debugging, track when we save checkpoints
        trainer.dev_debugger.track_checkpointing_history(filepath)

        # make paths
        if trainer.is_global_zero or tpu_training_and_local_rank_zero(trainer):
            self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)

        # delegate the saving to the trainer
        trainer.save_checkpoint(filepath, self.save_weights_only)
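
Each of these methods guards filesystem writes with tpu_training_and_local_rank_zero(trainer), whose definition is not shown here. A minimal sketch of what such a helper could look like, assuming the trainer exposes its accelerator type via the private _device_type attribute and a local_rank property, as in the PyTorch Lightning versions this code appears to come from:

    import pytorch_lightning as pl
    from pytorch_lightning.utilities import DeviceType

    def tpu_training_and_local_rank_zero(trainer: 'pl.Trainer') -> bool:
        # On TPU, each host saves through its own local-rank-zero process, so
        # the write guard must pass for local rank 0 on every host, not only
        # for the single global rank 0 process.
        return trainer._device_type == DeviceType.TPU and trainer.local_rank == 0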
Example #2
    def _save_none_monitor_checkpoint(
            self, trainer: 'pl.Trainer',
            monitor_candidates: Dict[str, _METRIC]) -> None:
        if self.monitor is not None or self.save_top_k == 0:
            return

        filepath = self._get_metric_interpolated_filepath_name(
            monitor_candidates, trainer)
        self._save_model(trainer, filepath)

        if (self.save_top_k is None and self.best_model_path
                and self.best_model_path != filepath
                and (trainer.is_global_zero
                     or tpu_training_and_local_rank_zero(trainer))):
            self._del_model(self.best_model_path)

        self.best_model_path = filepath
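
This path is taken when no metric is monitored. A usage sketch, assuming this version's defaults of monitor=None and save_top_k=None: each new checkpoint replaces the previous one, so only the most recent file is kept on disk.

    from pytorch_lightning.callbacks import ModelCheckpoint

    # With no monitored metric, saving is handled by
    # _save_none_monitor_checkpoint: the previous best_model_path is deleted
    # after each save, keeping a single rolling checkpoint on disk.
    checkpoint_cb = ModelCheckpoint(dirpath='checkpoints/', monitor=None)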
Example #3
    def _save_last_checkpoint(self, trainer: 'pl.Trainer',
                              monitor_candidates: Dict[str, _METRIC]) -> None:
        if not self.save_last:
            return

        filepath = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST,
                                                monitor_candidates)
        filepath = os.path.join(self.dirpath,
                                f"{filepath}{self.FILE_EXTENSION}")

        self._save_model(trainer, filepath)

        if (self.last_model_path and self.last_model_path != filepath
                and (trainer.is_global_zero
                     or tpu_training_and_local_rank_zero(trainer))):
            self._del_model(self.last_model_path)

        self.last_model_path = filepath
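
Enabling this behavior only requires save_last=True. A usage sketch, assuming the class defaults CHECKPOINT_NAME_LAST = "last" and FILE_EXTENSION = ".ckpt", so the file lands at <dirpath>/last.ckpt:

    from pytorch_lightning.callbacks import ModelCheckpoint

    # _save_last_checkpoint writes "<dirpath>/last.ckpt" on every save event
    # and deletes the previous "last" file once the new one is written.
    checkpoint_cb = ModelCheckpoint(dirpath='checkpoints/', save_last=True)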
Example #4
    def __resolve_ckpt_dir(self, trainer: 'pl.Trainer') -> None:
        """
        Determines model checkpoint save directory at runtime. References attributes from the
        trainer's logger to determine where to save checkpoints.
        The base path for saving weights is set in this priority:

        1.  Checkpoint callback's path (if passed in)
        2.  The default_root_dir from trainer if trainer has no logger
        3.  The weights_save_path from trainer, if user provides it
        4.  User provided weights_saved_path

        The base path gets extended with logger name and version (if these are available)
        and subfolder "checkpoints".
        """
        if self.dirpath is not None:
            return  # short circuit

        if trainer.logger is not None:
            if trainer.weights_save_path != trainer.default_root_dir:
                # the user has changed weights_save_path, it overrides anything
                save_dir = trainer.weights_save_path
            else:
                save_dir = trainer.logger.save_dir or trainer.default_root_dir

            version = (trainer.logger.version
                       if isinstance(trainer.logger.version, str)
                       else f"version_{trainer.logger.version}")
            ckpt_path = os.path.join(save_dir, str(trainer.logger.name),
                                     version, "checkpoints")
        else:
            ckpt_path = os.path.join(trainer.weights_save_path, "checkpoints")

        ckpt_path = trainer.training_type_plugin.broadcast(ckpt_path)

        self.dirpath = ckpt_path

        if (not trainer.fast_dev_run
                and (trainer.is_global_zero
                     or tpu_training_and_local_rank_zero(trainer))):
            self._fs.makedirs(self.dirpath, exist_ok=True)
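
For a concrete sense of the resolution, a usage sketch assuming the default TensorBoardLogger (name "lightning_logs", save_dir equal to default_root_dir): with no dirpath on the callback and no customized weights_save_path, __resolve_ckpt_dir produces <default_root_dir>/lightning_logs/version_<n>/checkpoints.

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    # Neither dirpath nor weights_save_path is customized, so checkpoints are
    # resolved to "<default_root_dir>/lightning_logs/version_<n>/checkpoints".
    trainer = Trainer(callbacks=[ModelCheckpoint()])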