def _define_logger(self, capture_warnings=True):
    """Creates the logger if not already created. Called in init"""
    # Use double-checked locking to avoid taking lock unnecessarily.
    if self._logger is not None:
        return self._logger

    with self._logger_lock:
        try:
            self._logger = _logging.getLogger("mridc_logger")
            # By default, silence all loggers except the logger for rank 0
            self.remove_stream_handlers()

            # If MRIDC_TESTING is set, add a streamhandler to all ranks
            if get_envbool(MRIDC_ENV_VARNAME_TESTING, False):
                old_factory = _logging.getLogRecordFactory()

                def record_factory(*args, **kwargs):
                    record = old_factory(*args, **kwargs)
                    record.rank = self.rank
                    return record

                _logging.setLogRecordFactory(record_factory)
                self.add_stream_handlers(formatter=DebugMRIDCFormatter)
            elif is_global_rank_zero():
                self.add_stream_handlers()

            # Add memoryhandlers, essentially buffers. They are used to save messages that we will flush to file
            # once the appropriate file handlers are added.
            if is_global_rank_zero():
                # Add a memoryhandler for error messages. Only logged on rank 0
                self._handlers["memory_err"] = MemoryHandler(-1)
                self._handlers["memory_err"].addFilter(lambda record: record.levelno > _logging.INFO)
                formatter = BaseMRIDCFormatter
                self._handlers["memory_err"].setFormatter(formatter())
                self._logger.addHandler(self._handlers["memory_err"])

            # Add a memoryhandler for all messages on all ranks
            self._handlers["memory_all"] = MemoryHandler(-1)
            formatter = BaseMRIDCFormatter
            self._handlers["memory_all"].setFormatter(formatter())
            self._logger.addHandler(self._handlers["memory_all"])
        finally:
            level = Logger.INFO
            if get_envbool(MRIDC_ENV_VARNAME_TESTING, False):
                level = Logger.DEBUG
            self.set_verbosity(verbosity_level=level)
            self.captureWarnings(capture_warnings)

    self._logger.propagate = False
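# A minimal, standalone sketch of the double-checked locking pattern referenced in `_define_logger`
# above: the fast path returns the cached logger without taking the lock, and the lock is only
# acquired when initialization may still be needed. The class and names below are illustrative
# only and are not part of the MRIDC API.
import logging
import threading


class _LazyLoggerExample:
    def __init__(self):
        self._logger = None
        self._lock = threading.Lock()

    def get(self):
        # First (unlocked) check: cheap early exit once the logger exists.
        if self._logger is not None:
            return self._logger
        with self._lock:
            # Second (locked) check: another thread may have initialized it in the meantime.
            if self._logger is None:
                self._logger = logging.getLogger("example_logger")
                self._logger.propagate = False
            return self._logger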
def save_to(self, model, save_path: str):
    """
    Saves model instance (weights and configuration) into .mridc file. You can use the "restore_from" method to \
    fully restore the instance from a .mridc file.

    A .mridc file is an archive (tar.gz) with the following:
        - model_config.yaml - model configuration in .yaml format. You can deserialize this into the cfg argument \
        for the model's constructor
        - model_weights.ckpt - model checkpoint

    Parameters
    ----------
    model: ModelPT object to be saved.
    save_path: Path to .mridc file where model instance should be saved
    """
    if is_global_rank_zero():
        with tempfile.TemporaryDirectory() as tmpdir:
            config_yaml = os.path.join(tmpdir, self.model_config_yaml)
            model_weights = os.path.join(tmpdir, self.model_weights_ckpt)
            model.to_config_file(path2yaml_file=config_yaml)
            if hasattr(model, "artifacts") and model.artifacts is not None:
                self._handle_artifacts(model, mridc_file_folder=tmpdir)
                # We should not update self._cfg here - the model can still be in use
                self._update_artifact_paths(model, path2yaml_file=config_yaml)
            self._save_state_dict_to_disk(model.state_dict(), model_weights)
            self._make_mridc_file_from_folder(filename=save_path, source_dir=tmpdir)
    else:
        return
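# Hedged usage sketch: the docstring above describes a .mridc file as a tar.gz archive containing
# model_config.yaml and a model weights checkpoint. Assuming a file produced by `save_to`, the
# archive contents can be listed with the standard library; the file name below is illustrative.
import tarfile

with tarfile.open("model.mridc", "r:gz") as archive:
    for member in archive.getnames():
        print(member)  # e.g. model_config.yaml and the model weights checkpoint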
def __init__(self, capture_warnings=True):
    self._logger = None
    # Multi-GPU runs run in separate processes, thread locks shouldn't be needed
    self._logger_lock = threading.Lock()
    self._handlers = {}
    self.old_warnings_showwarning = None
    self._define_logger(capture_warnings)
    self.once_logged = set()
    self.rank = 0 if is_global_rank_zero() else "UNK"
def check_explicit_log_dir(
    trainer: Trainer, explicit_log_dir: List[Union[Path, str]], exp_dir: str, name: str, version: str
) -> Tuple[Path, str, str, str]:
    """
    Checks that the passed arguments are compatible with explicit_log_dir.

    Parameters
    ----------
    trainer: The trainer to check.
    explicit_log_dir: The explicit log dir to check.
    exp_dir: The experiment directory to check.
    name: The experiment name to check.
    version: The experiment version to check.

    Returns
    -------
    The log_dir, exp_dir, name, and version that should be used.

    Raises
    ------
    LoggerMisconfigurationError
    """
    if trainer.logger is not None:
        raise LoggerMisconfigurationError(
            "The pytorch lightning trainer that was passed to exp_manager contained a logger, and explicit_log_dir: "
            f"{explicit_log_dir} was passed to exp_manager. Please remove the logger from the lightning trainer."
        )
    # Checking only (explicit_log_dir) vs (exp_dir and version).
    # The `name` will be used as the actual name of checkpoint/archive.
    if exp_dir or version:
        logging.error(
            f"exp_manager received explicit_log_dir: {explicit_log_dir} and at least one of exp_dir: {exp_dir}, "
            f"or version: {version}. Please note that exp_dir, name, and version will be ignored."
        )
    if is_global_rank_zero() and Path(str(explicit_log_dir)).exists():
        logging.warning(f"Exp_manager is logging to {explicit_log_dir}, but it already exists.")
    return Path(str(explicit_log_dir)), str(explicit_log_dir), "", ""
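# Hedged usage sketch: `check_explicit_log_dir` (defined above) validates that an explicit log
# directory does not conflict with a trainer-attached logger or with exp_dir/version arguments.
# The trainer construction and path below are illustrative; exact return values depend on the inputs.
from pathlib import Path

from pytorch_lightning import Trainer

trainer = Trainer(logger=False)  # a logger on the trainer would raise LoggerMisconfigurationError
log_dir, exp_dir, name, version = check_explicit_log_dir(
    trainer, [Path("./experiments/run1")], exp_dir=None, name="", version=None
)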
def _del_model_without_trainer(self, filepath: str) -> None:
    """
    Delete a model without a trainer.

    Parameters
    ----------
    filepath: The path to the model to delete.
    """
    app_state = AppState()
    if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1:
        # filepath needs to be updated to include mp_rank
        filepath = mridc.utils.model_utils.inject_model_parallel_rank(filepath)  # type: ignore

    # each model parallel rank needs to remove its model
    if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
        try:
            self._fs.rm(filepath)
            logging.info(f"Removed checkpoint: {filepath}")
        except FileNotFoundError:
            logging.info(f"Tried to remove checkpoint: {filepath} but failed.")
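# Illustrative sketch only: the call to `mridc.utils.model_utils.inject_model_parallel_rank`
# above rewrites a checkpoint path so that each model-parallel rank removes its own file. The
# helper below is a hypothetical stand-in showing the general idea (inserting an mp_rank
# subdirectory into the path), not the actual MRIDC implementation.
import os


def _inject_mp_rank_example(filepath: str, mp_rank: int) -> str:
    # e.g. "results/checkpoints/model.ckpt" -> "results/checkpoints/mp_rank_00/model.ckpt"
    dirname, basename = os.path.split(filepath)
    return os.path.join(dirname, f"mp_rank_{mp_rank:02d}", basename)


print(_inject_mp_rank_example("results/checkpoints/model.ckpt", 0))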
def get_log_dir(
    trainer: Trainer,
    exp_dir: str = None,
    name: str = None,
    version: str = None,
    explicit_log_dir: str = None,
    use_datetime_version: bool = True,
    resume_if_exists: bool = False,
) -> Tuple[Path, str, str, str]:
    """
    Obtains the log_dir used for exp_manager.

    Parameters
    ----------
    trainer: The trainer to check.
    exp_dir: The experiment directory to check.
    name: The experiment name to check.
    version: The experiment version to check.
    explicit_log_dir: The explicit log dir to check.
    use_datetime_version: Whether to use datetime versioning.
    resume_if_exists: Whether to resume if the log_dir already exists.

    Raises
    ------
    LoggerMisconfigurationError: If the trainer is incompatible with the arguments.
    NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
    ValueError: If resume is True and more than one checkpoint was found.
    """
    if explicit_log_dir:  # If explicit log_dir was passed, short circuit
        return check_explicit_log_dir(trainer, [Path(explicit_log_dir)], exp_dir, name, version)  # type: ignore

    # Default exp_dir to ./mridc_experiments if None was passed
    _exp_dir = exp_dir
    if exp_dir is None:
        _exp_dir = str(Path.cwd() / "mridc_experiments")

    # If the user has already defined a logger for the trainer, use the logger defaults for logging directory
    if trainer.logger is not None:
        if trainer.logger.save_dir:
            if exp_dir:
                raise LoggerMisconfigurationError(
                    "The pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's "
                    f"save_dir was not None, and exp_dir ({exp_dir}) was not None. If trainer.logger.save_dir "
                    "exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir "
                    "must be None."
                )
            _exp_dir = trainer.logger.save_dir
        if name:
            raise LoggerMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: "
                f"{name} was also passed to exp_manager. If the trainer contains a "
                "logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None."
            )
        name = trainer.logger.name
        version = f"version_{trainer.logger.version}"
    # Use user-defined exp_dir, project_name, exp_name, and versioning options
    else:
        name = name or "default"
        version = version or os.environ.get(MRIDC_ENV_VARNAME_VERSION)

        if not version:
            if resume_if_exists:
                logging.warning(
                    "No version folders would be created under the log folder as 'resume_if_exists' is enabled."
                )
                version = None
            elif is_global_rank_zero():
                if use_datetime_version:
                    version = time.strftime("%Y-%m-%d_%H-%M-%S")
                else:
                    tensorboard_logger = TensorBoardLogger(save_dir=_exp_dir, name=name, version=version)
                    version = f"version_{tensorboard_logger.version}"
        os.environ[MRIDC_ENV_VARNAME_VERSION] = "" if version is None else version

    log_dir = Path(str(_exp_dir)) / Path(str(name)) / Path("" if version is None else str(version))
    return log_dir, str(_exp_dir), str(name), str(version)
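# Hedged usage sketch of `get_log_dir` above: with no logger on the trainer, no explicit_log_dir,
# and datetime versioning enabled, the log_dir is assembled as exp_dir/name/version. The trainer
# construction and names below are illustrative.
from pytorch_lightning import Trainer

trainer = Trainer(logger=False)
log_dir, exp_dir, name, version = get_log_dir(
    trainer, exp_dir="./mridc_experiments", name="my_experiment", use_datetime_version=True
)
# On global rank 0, log_dir is e.g. mridc_experiments/my_experiment/2023-01-01_12-00-00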
def check_resume(
    trainer: Trainer,
    log_dir: str,
    resume_past_end: bool = False,
    resume_ignore_no_checkpoint: bool = False,
):
    """
    Checks that resume=True was used correctly with the arguments passed to exp_manager. Sets
    trainer._checkpoint_connector.resume_from_checkpoint_fit_path as necessary.

    Parameters
    ----------
    trainer: The trainer that is being used.
    log_dir: The directory where the logs are being saved.
    resume_past_end: Whether to resume from the end of the experiment.
    resume_ignore_no_checkpoint: Whether to ignore if there is no checkpoint to resume from.

    Raises
    ------
    NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
    ValueError: If resume is True and more than one checkpoint was found.
    """
    if not log_dir:
        raise ValueError(f"Resuming requires the log_dir {log_dir} to be passed to exp_manager")

    checkpoint_dir = Path(Path(log_dir) / "checkpoints")
    checkpoint = None
    end_checkpoints = list(checkpoint_dir.rglob("*end.ckpt"))
    last_checkpoints = list(checkpoint_dir.rglob("*last.ckpt"))

    if not checkpoint_dir.exists():
        if not resume_ignore_no_checkpoint:
            raise NotFoundError(f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume.")
        logging.warning(f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Training from scratch.")
        return
    if end_checkpoints:
        if not resume_past_end:
            raise ValueError(
                f"Found {end_checkpoints[0]} indicating that the last training run has already completed."
            )
        if len(end_checkpoints) > 1:
            if "mp_rank" in str(end_checkpoints[0]):
                checkpoint = end_checkpoints[0]
            else:
                raise ValueError(f"Multiple checkpoints {end_checkpoints} match *end.ckpt.")
        logging.info(f"Resuming from {end_checkpoints[0]}")
    elif not last_checkpoints:
        if not resume_ignore_no_checkpoint:
            raise NotFoundError(f"There were no checkpoints found in {checkpoint_dir}. Cannot resume.")
        logging.warning(f"There were no checkpoints found in {checkpoint_dir}. Training from scratch.")
        return
    elif len(last_checkpoints) > 1:
        if "mp_rank" not in str(last_checkpoints[0]) and "tp_rank" not in str(last_checkpoints[0]):
            raise ValueError(f"Multiple checkpoints {last_checkpoints} match *last.ckpt.")
        checkpoint = last_checkpoints[0]
        checkpoint = mridc.utils.model_utils.uninject_model_parallel_rank(checkpoint)  # type: ignore
    else:
        logging.info(f"Resuming from {last_checkpoints[0]}")
        checkpoint = last_checkpoints[0]

    trainer._checkpoint_connector.resume_from_checkpoint_fit_path = str(checkpoint)

    if is_global_rank_zero():
        if files_to_move := [child for child in Path(log_dir).iterdir() if child.is_file()]:
            # Move old files to a new folder
            other_run_dirs = Path(log_dir).glob("run_*")
            run_count = sum(bool(fold.is_dir()) for fold in other_run_dirs)
            new_run_dir = Path(Path(log_dir) / f"run_{run_count}")
            new_run_dir.mkdir()
            for _file in files_to_move:
                move(str(_file), str(new_run_dir))
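# Hedged usage sketch: `check_resume` above is called by exp_manager when resume_if_exists is
# True. It looks for *end.ckpt / *last.ckpt files under <log_dir>/checkpoints and, if one is
# found, points trainer._checkpoint_connector.resume_from_checkpoint_fit_path at it. The path
# and trainer construction below are illustrative.
from pytorch_lightning import Trainer

trainer = Trainer(logger=False)
check_resume(
    trainer,
    log_dir="./mridc_experiments/my_experiment/2023-01-01_12-00-00",
    resume_past_end=False,
    resume_ignore_no_checkpoint=True,  # warn and train from scratch if no checkpoint exists
)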
def exp_manager(trainer: Trainer, cfg: Optional[Union[DictConfig, Dict]] = None) -> Optional[Path]:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning \
    paradigm of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will \
    get exp_dir, name, and version from the logger. Otherwise, it will use the exp_dir and name arguments to create \
    the logging directory. exp_manager also allows for explicit folder creation via explicit_log_dir.

    The version can be a datetime string or an integer. Datetime versioning can be disabled by setting \
    use_datetime_version to False. exp_manager optionally creates TensorBoardLogger, WandBLogger, and \
    ModelCheckpoint objects from pytorch lightning. It copies sys.argv, and git information if available, to the \
    logging directory. It creates a log file for each process to log its output into.

    exp_manager additionally has a resume feature (resume_if_exists) which can be used to continue training from \
    the constructed log_dir. When you need to continue training repeatedly (such as on a cluster where you need \
    multiple consecutive jobs), you need to avoid creating the version folders. Therefore, from v1.0.0, when \
    resume_if_exists is set to True, creating the version folders is skipped.

    Parameters
    ----------
    trainer: The lightning trainer object.
    cfg: Can have the following keys:
        - explicit_log_dir: Can be used to override exp_dir/name/version folder creation. Defaults to None, which \
        will use exp_dir, name, and version to construct the logging directory.
        - exp_dir: The base directory to create the logging directory. Defaults to None, which logs to \
        ./mridc_experiments.
        - name: The name of the experiment. Defaults to None which turns into "default" via name = name or "default".
        - version: The version of the experiment. Defaults to None which uses either a datetime string or \
        lightning's TensorboardLogger system of using version_{int}.
        - use_datetime_version: Whether to use a datetime string for version. Defaults to True.
        - resume_if_exists: Whether this experiment is resuming from a previous run. If True, it sets \
        trainer._checkpoint_connector.resume_from_checkpoint_fit_path so that the trainer should auto-resume. \
        exp_manager will move files under log_dir to log_dir/run_{int}. Defaults to False. From v1.0.0, when \
        resume_if_exists is True, we would not create version folders to make it easier to find the log folder for \
        next runs.
        - resume_past_end: exp_manager errors out if resume_if_exists is True and a checkpoint matching \*end.ckpt \
        indicates that a previous training run fully completed. This behaviour can be disabled, in which case the \
        \*end.ckpt will be loaded, by setting resume_past_end to True. Defaults to False.
        - resume_ignore_no_checkpoint: exp_manager errors out if resume_if_exists is True and no checkpoint could be \
        found. This behaviour can be disabled, in which case exp_manager will print a message and continue without \
        restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False.
        - create_tensorboard_logger: Whether to create a tensorboard logger and attach it to the pytorch lightning \
        trainer. Defaults to True.
        - summary_writer_kwargs: A dictionary of kwargs that can be passed to lightning's TensorboardLogger class. \
        Note that log_dir is passed by exp_manager and cannot exist in this dict. Defaults to None.
        - create_wandb_logger: Whether to create a Weights and Biases logger and attach it to the pytorch lightning \
        trainer. Defaults to False.
        - wandb_logger_kwargs: A dictionary of kwargs that can be passed to lightning's WandBLogger class. Note \
        that name and project are required parameters if create_wandb_logger is True. Defaults to None.
        - create_checkpoint_callback: Whether to create a ModelCheckpoint callback and attach it to the pytorch \
        lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the most recent \
        checkpoint under \*last.ckpt, and the final checkpoint after training completes under \*end.ckpt. \
        Defaults to True.
        - files_to_copy: A list of files to copy to the experiment logging directory. Defaults to None which copies \
        no files.
        - log_local_rank_0_only: Whether to only create log files for local rank 0. Defaults to False. Set this to \
        True if you are using DDP with many GPUs and do not want many log files in your exp dir.
        - log_global_rank_0_only: Whether to only create log files for global rank 0. Defaults to False. Set this \
        to True if you are using DDP with many GPUs and do not want many log files in your exp dir.

    Returns
    -------
    The final logging directory where logging files are saved. Usually the concatenation of exp_dir, name, and \
    version.
    """
    # Add rank information to logger
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_devices + local_rank
    logging.rank = global_rank

    if cfg is None:
        logging.error("exp_manager did not receive a cfg argument. It will be disabled.")
        return None

    if trainer.fast_dev_run:
        logging.info("Trainer was called with fast_dev_run. exp_manager will return without any functionality.")
        return None

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(trainer, cfg)  # Ensures that trainer options are compliant with MRIDC and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
        resume_if_exists=cfg.resume_if_exists,
    )

    if cfg.resume_if_exists:
        check_resume(trainer, str(log_dir), cfg.resume_past_end, cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == "":
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # Update app_state with log_dir, exp_dir, etc.
    app_state = AppState()
    app_state.log_dir = log_dir
    app_state.exp_dir = exp_dir
    app_state.name = name
    app_state.version = version
    app_state.checkpoint_name = checkpoint_name
    app_state.create_checkpoint_callback = cfg.create_checkpoint_callback
    app_state.checkpoint_callback_params = cfg.checkpoint_callback_params

    # Create the logging directory if it does not exist
    os.makedirs(log_dir, exist_ok=True)  # Cannot limit creation to global zero as all ranks write to own log file
    logging.info(f"Experiments will be logged at {log_dir}")
    trainer._default_root_dir = log_dir

    if cfg.log_local_rank_0_only is True and cfg.log_global_rank_0_only is True:
        raise ValueError(
            "Cannot set both log_local_rank_0_only and log_global_rank_0_only to True. "
            "Please set either one or neither."
        )

    # This is set if the env var MRIDC_TESTING is set to True.
    mridc_testing = get_envbool(MRIDC_ENV_VARNAME_TESTING, False)

    log_file = log_dir / f"mridc_log_globalrank-{global_rank}_localrank-{local_rank}.txt"

    # Handle logging to file. Logs local rank 0 only
    if local_rank == 0 and cfg.log_local_rank_0_only and not mridc_testing:
        logging.add_file_handler(log_file)
    elif global_rank == 0 and cfg.log_global_rank_0_only and mridc_testing:
        logging.add_file_handler(log_file)
    else:
        # Log on all ranks.
        logging.add_file_handler(log_file)

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks,
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            [Path(exp_dir)],
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    # Add timing callbacks
    if cfg.log_step_timing:
        timing_callback = TimingCallback(timer_kwargs=cfg.step_timing_kwargs or {})
        trainer.callbacks.insert(0, timing_callback)

    if cfg.create_checkpoint_callback:
        configure_checkpointing(
            trainer, log_dir, checkpoint_name, cfg.resume_if_exists, cfg.checkpoint_callback_params
        )

    if is_global_rank_zero():
        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / "cmd-args.log", "w", encoding="utf-8") as _file:
            _file.write(" ".join(sys.argv))

        # Try to get git hash
        git_repo, git_hash = get_git_hash()
        if git_repo:
            with open(log_dir / "git-info.log", "w", encoding="utf-8") as _file:
                _file.write(f"commit hash: {git_hash}")
                _file.write(get_git_diff())

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / "mridc_error_log.txt")

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / "lightning_logs.txt", log_dir / "mridc_error_log.txt")

    return log_dir
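# Hedged usage sketch of `exp_manager` above: a trainer plus a cfg dict (or DictConfig) using a
# subset of the keys documented in the docstring. The values and experiment name are illustrative.
# The trainer flags shown here (disabling lightning's own logger and checkpointing so exp_manager
# can manage them) depend on the pytorch-lightning version and on what error_checks /
# configure_checkpointing require.
from pytorch_lightning import Trainer

trainer = Trainer(logger=False, enable_checkpointing=False, max_epochs=1)
log_dir = exp_manager(
    trainer,
    {
        "exp_dir": "./mridc_experiments",
        "name": "my_experiment",
        "use_datetime_version": True,
        "resume_if_exists": False,
        "create_tensorboard_logger": True,
        "create_checkpoint_callback": True,
    },
)
print(log_dir)  # e.g. mridc_experiments/my_experiment/<datetime version>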