def nemo_topk_check_previous_run(self):
    """Rebuilds the top-k checkpoint bookkeeping from checkpoints already on disk when resuming."""
    try:
        self.best_k_models
        self.kth_best_model_path
        self.best_model_score
        self.best_model_path
    except AttributeError:
        raise AttributeError(
            "Lightning's ModelCheckpoint was updated. NeMoModelCheckpoint will need an update."
        )
    self.best_k_models = {}
    self.kth_best_model_path = ""
    self.best_model_score = None
    self.best_model_path = ""

    checkpoints = list(Path(self.dirpath).rglob("*.ckpt"))
    for checkpoint in checkpoints:
        if 'mp_rank' in str(checkpoint) or 'tp_rank' in str(checkpoint):
            checkpoint = uninject_model_parallel_rank(checkpoint)
        checkpoint = str(checkpoint)
        if checkpoint[-10:] == '-last.ckpt':
            continue
        # Parse the monitored metric value out of the checkpoint filename.
        index = checkpoint.find(self.monitor)
        if index != -1:
            index += len(self.monitor) + 1  # skip past the monitor name and the '=' separator
            match = re.search('[A-Za-z]', checkpoint[index:])
            if match:
                value = checkpoint[index : index + match.start() - 1]  # -1 due to separator hyphen
                self.best_k_models[checkpoint] = float(value)
    if len(self.best_k_models) < 1:
        return  # No saved checkpoints yet

    _reverse = False if self.mode == "min" else True

    best_k_models = sorted(self.best_k_models, key=self.best_k_models.get, reverse=_reverse)

    ### This section should be ok as rank zero will delete all excess checkpoints, since all other ranks are
    ### instantiated after rank zero. models_to_delete should be 0 for all other ranks.
    if self.model_parallel_size is not None:
        models_to_delete = len(best_k_models) - self.model_parallel_size * self.save_top_k
    else:
        models_to_delete = len(best_k_models) - self.save_top_k
    logging.debug(f'Number of models to delete: {models_to_delete}')
    for _ in range(models_to_delete):
        model = best_k_models.pop(-1)
        self.best_k_models.pop(model)
        self._del_model_without_trainer(model)
        logging.debug(f"Removed checkpoint: {model}")

    self.kth_best_model_path = best_k_models[-1]
    self.best_model_path = best_k_models[0]
    self.best_model_score = self.best_k_models[self.best_model_path]
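
# Illustrative sketch; an assumption, not part of the original module. It shows how the parsing
# loop in nemo_topk_check_previous_run recovers the monitored metric from a checkpoint filename
# that follows the f'{name}--{{{monitor}:.4f}}-{{epoch}}' template set up in configure_checkpointing
# below. The checkpoint name and monitor used here are hypothetical.
def _example_parse_monitor_value() -> float:
    import re

    monitor = 'val_loss'
    checkpoint = 'megatron_gpt--val_loss=0.6532-epoch=2.ckpt'  # hypothetical filename
    index = checkpoint.find(monitor)
    assert index != -1, "monitor must appear in the filename"
    index += len(monitor) + 1  # skip past 'val_loss='
    match = re.search('[A-Za-z]', checkpoint[index:])  # first letter after the number ('e' of 'epoch')
    value = checkpoint[index : index + match.start() - 1]  # -1 drops the separator hyphen
    return float(value)  # -> 0.6532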
def configure_checkpointing(
    trainer: 'pytorch_lightning.Trainer', log_dir: Path, name: str, resume: bool, params: 'DictConfig'
):
    """Adds a ModelCheckpoint callback to the trainer.

    Raises CheckpointMisconfigurationError if the trainer already has a ModelCheckpoint callback
    or if weights_save_path was passed to the Trainer.
    """
    for callback in trainer.callbacks:
        if isinstance(callback, ModelCheckpoint):
            raise CheckpointMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a ModelCheckpoint "
                "and create_checkpoint_callback was set to True. Please either set create_checkpoint_callback "
                "to False, or remove ModelCheckpoint from the lightning trainer"
            )
    if Path(trainer.weights_save_path) != Path.cwd():
        raise CheckpointMisconfigurationError(
            "The pytorch lightning trainer was passed weights_save_path. This variable is ignored by exp_manager"
        )

    # Create the callback and attach it to the trainer
    if "filepath" in params:
        if params.filepath is not None:
            logging.warning("filepath is deprecated. Please switch to dirpath and filename instead")
            if params.dirpath is None:
                params.dirpath = Path(params.filepath).parent
            if params.filename is None:
                params.filename = Path(params.filepath).name
        with open_dict(params):
            del params["filepath"]
    if params.dirpath is None:
        params.dirpath = Path(log_dir / 'checkpoints')
    if params.filename is None:
        params.filename = f'{name}--{{{params.monitor}:.4f}}-{{epoch}}'
    if params.prefix is None:
        params.prefix = name
    NeMoModelCheckpoint.CHECKPOINT_NAME_LAST = params.filename + '-last'

    logging.debug(params.dirpath)
    logging.debug(params.filename)
    logging.debug(params.prefix)

    # Warn early if the trainer configuration makes it unlikely that the monitored validation
    # metric will ever be logged.
    if "val" in params.monitor:
        if (
            trainer.max_epochs is not None
            and trainer.max_epochs != -1
            and trainer.max_epochs < trainer.check_val_every_n_epoch
        ):
            logging.error(
                "The checkpoint callback was told to monitor a validation value but trainer.max_epochs("
                f"{trainer.max_epochs}) was less than trainer.check_val_every_n_epoch({trainer.check_val_every_n_epoch}"
                f"). It is very likely this run will fail with ModelCheckpoint(monitor='{params.monitor}') not found "
                "in the returned metrics. Please ensure that validation is run within trainer.max_epochs."
            )
        elif trainer.max_steps is not None:
            logging.warning(
                "The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to "
                f"{trainer.max_steps}. Please ensure that max_steps will run for at least "
                f"{trainer.check_val_every_n_epoch} epochs to ensure that checkpointing will not error out."
            )

    checkpoint_callback = NeMoModelCheckpoint(n_resume=resume, **params)
    checkpoint_callback.last_model_path = trainer.checkpoint_connector.resume_from_checkpoint_fit_path or ""
    if 'mp_rank' in checkpoint_callback.last_model_path or 'tp_rank' in checkpoint_callback.last_model_path:
        checkpoint_callback.last_model_path = uninject_model_parallel_rank(checkpoint_callback.last_model_path)
    trainer.callbacks.append(checkpoint_callback)
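
# Illustrative sketch; an assumption, not part of the original module. It spells out the rough
# shape of the `params` DictConfig that configure_checkpointing consumes. The concrete values are
# hypothetical; in practice they come from exp_manager's checkpoint callback configuration.
from omegaconf import OmegaConf

_example_checkpoint_params = OmegaConf.create(
    {
        'filepath': None,  # deprecated; translated into dirpath/filename above
        'dirpath': None,  # defaults to <log_dir>/checkpoints
        'filename': None,  # defaults to f'{name}--{{{monitor}:.4f}}-{{epoch}}'
        'monitor': 'val_loss',
        'mode': 'min',
        'save_top_k': 3,
        'prefix': None,  # defaults to the experiment name
    }
)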
def check_resume(
    trainer: 'pytorch_lightning.Trainer',
    log_dir: str,
    resume_past_end: bool = False,
    resume_ignore_no_checkpoint: bool = False,
):
    """Checks that resume=True was used correctly with the arguments passed to exp_manager.
    Sets trainer.checkpoint_connector.resume_from_checkpoint_fit_path as necessary.

    Raises:
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and no checkpoints could be found.
        ValueError: If resume is True and more than one checkpoint was found.
    """
    if not log_dir:
        raise ValueError(f"Resuming requires the log_dir {log_dir} to be passed to exp_manager")

    checkpoint_dir = Path(Path(log_dir) / "checkpoints")

    checkpoint = None
    end_checkpoints = list(checkpoint_dir.rglob("*end.ckpt"))
    last_checkpoints = list(checkpoint_dir.rglob("*last.ckpt"))
    if not checkpoint_dir.exists():
        if resume_ignore_no_checkpoint:
            logging.warning(
                f"There was no checkpoint folder at checkpoint_dir: {checkpoint_dir}. Training from scratch."
            )
            return
        else:
            raise NotFoundError(f"There was no checkpoint folder at checkpoint_dir: {checkpoint_dir}. Cannot resume.")
    elif len(end_checkpoints) > 0:
        if resume_past_end:
            if len(end_checkpoints) > 1:
                if 'mp_rank' in str(end_checkpoints[0]):
                    checkpoint = end_checkpoints[0]
                else:
                    raise ValueError(f"Multiple checkpoints {end_checkpoints} match *end.ckpt.")
            else:
                checkpoint = end_checkpoints[0]
            logging.info(f"Resuming from {checkpoint}")
        else:
            raise ValueError(
                f"Found {end_checkpoints[0]} indicating that the last training run has already completed."
            )
    elif not len(last_checkpoints) > 0:
        if resume_ignore_no_checkpoint:
            logging.warning(f"There were no checkpoints found in {checkpoint_dir}. Training from scratch.")
            return
        else:
            raise NotFoundError(f"There were no checkpoints found in {checkpoint_dir}. Cannot resume.")
    elif len(last_checkpoints) > 1:
        if 'mp_rank' in str(last_checkpoints[0]) or 'tp_rank' in str(last_checkpoints[0]):
            checkpoint = last_checkpoints[0]
            checkpoint = uninject_model_parallel_rank(checkpoint)
        else:
            raise ValueError(f"Multiple checkpoints {last_checkpoints} match *last.ckpt.")
    else:
        logging.info(f"Resuming from {last_checkpoints[0]}")
        checkpoint = last_checkpoints[0]

    trainer.checkpoint_connector.resume_from_checkpoint_fit_path = str(checkpoint)

    if is_global_rank_zero():
        # Check to see if any files exist that need to be moved
        files_to_move = []
        for child in Path(log_dir).iterdir():
            if child.is_file():
                files_to_move.append(child)

        if len(files_to_move) > 0:
            # Move old files to a new run_<N> folder so the resumed run does not overwrite them
            other_run_dirs = Path(log_dir).glob("run_*")
            run_count = 0
            for fold in other_run_dirs:
                if fold.is_dir():
                    run_count += 1
            new_run_dir = Path(Path(log_dir) / f"run_{run_count}")
            new_run_dir.mkdir()
            for _file in files_to_move:
                move(str(_file), str(new_run_dir))
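
# Illustrative sketch; an assumption, not part of the original module. It demonstrates the
# run-folder rotation that check_resume performs on global rank zero: loose files in log_dir are
# moved into a fresh run_<N> directory, where N counts the existing run_* folders. The function
# name is hypothetical.
def _example_run_rotation(log_dir: str) -> None:
    from pathlib import Path
    from shutil import move

    run_count = sum(1 for fold in Path(log_dir).glob("run_*") if fold.is_dir())
    new_run_dir = Path(log_dir) / f"run_{run_count}"
    new_run_dir.mkdir()
    # Snapshot the directory listing before moving files out of it.
    for child in list(Path(log_dir).iterdir()):
        if child.is_file():
            move(str(child), str(new_run_dir))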