def get_tensor(self, name: Union[str, NmTensor], compute: bool = True):
    """Returns the value associated with a tensor, optionally computing it
    if it has not already been set.

    Args:
        name (str, NmTensor): The user-defined name for a tensor, or the
            NmTensor itself.
        compute (bool): If True and the tensor has not already been computed,
            there will be an attempt to create a call DAG and then do a
            forward pass on this call DAG to compute the tensor. If False,
            None is returned if the tensor has not been computed yet.
            Defaults to True.

    Returns:
        (torch.Tensor or None) representing the computed value of the
        requested name. Returns None if compute is False and the tensor has
        not been computed yet.
    """
    if isinstance(name, NmTensor):
        unique_name = name.unique_name
    else:
        unique_name = AppState().tensor_names[name]
    tensor_value = self.tensor_dict[unique_name]
    if tensor_value is None and compute:
        nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name]
        callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self)
        callchain.insert(0, ())
        self._action.nm_graph_forward_pass(callchain, self.tensor_dict)
        tensor_value = self.tensor_dict[unique_name]
    return tensor_value

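# Illustrative usage sketch (hypothetical names, not part of the original source):
# fetching a tensor value from a training-state instance inside a callback.
def _example_get_tensor_usage(state):
    # Computes "loss" via a forward pass over the call DAG if it is not cached yet.
    loss = state.get_tensor("loss")
    # Peeks at the cache only; returns None if "loss" was not computed this step.
    cached_loss = state.get_tensor("loss", compute=False)
    return loss, cached_loss
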
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    trainer = None
    if cfg.trainer.precision == 16:
        trainer = Trainer(
            plugins=[
                NLPDDPPlugin(),
                NLPNativeMixedPrecisionPlugin(
                    init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                    growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                ),
            ],
            **cfg.trainer,
        )
    elif cfg.trainer.precision == 'bf16':
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPNativeBfloat16PrecisionPlugin()], **cfg.trainer)
    else:
        trainer = Trainer(plugins=[NLPDDPPlugin(), NLPPrecisionPlugin()], **cfg.trainer)

    app_state = AppState()
    app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronGPTModel.restore_from(
        cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
    )

    # Note: most NeMo models must have the data paths configured before instantiating the model.
    # MegatronGPTModel sets up the data in the PTL method .setup, which happens after DDP spawns.
    model.cfg.data.splits_string = cfg.model.data.splits_string

    trainer.test(model)

def __init__(self, producer, producer_args, output_port_name, ntype=None):
    """NmTensor constructor.

    Args:
        producer (NeuralModule): object which produced this tensor.
        producer_args (dict): a dictionary of port_name->NmTensor of
            arguments which were sent to producer to create this tensor.
        output_port_name (str): name of the producer's output port that
            emitted this tensor.
        ntype (NeuralType): neural type of this tensor.
    """
    super(NmTensor, self).__init__(axes=ntype.axes, elements_type=ntype.elements_type, optional=ntype.optional)
    # producer is None: a special case present in some of the unit tests.
    if producer is None:
        self._producer_name = "None"
    else:
        self._producer_name = producer.name
    self._producer_args = producer_args
    self._output_port_name = output_port_name
    self._name = output_port_name
    self._uuid = str(uuid.uuid4())
    # Remember step at which this tensor was created.
    self._step_number = AppState().active_graph.step_number
    # List of tuples (step number, module name, input port name)
    self._consumers = []
    AppState().tensor_names.register(self)

def on_save_checkpoint(self, trainer, pl_module, checkpoint):
    output = super().on_save_checkpoint(trainer, pl_module, checkpoint)
    if not self.always_save_nemo:
        return output

    # Load the best model and then re-save it
    app_state = AppState()
    # since we are creating tarfile artifacts we need to update the .nemo path
    app_state.model_restore_path = os.path.abspath(
        os.path.expanduser(os.path.join(self.dirpath, self.prefix + self.postfix))
    )
    if self.save_best_model:
        if not os.path.exists(self.best_model_path):
            return output
        if self.best_model_path == self.previous_best_path:
            return output
        self.previous_best_path = self.best_model_path

        old_state_dict = deepcopy(pl_module.state_dict())
        checkpoint = torch.load(self.best_model_path, map_location='cpu')
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']

        # get a new instance of the model
        pl_module.load_state_dict(checkpoint, strict=True)
        pl_module.save_to(save_path=app_state.model_restore_path)
        pl_module.load_state_dict(old_state_dict, strict=True)
    else:
        pl_module.save_to(save_path=app_state.model_restore_path)
    return output

def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file", type=str, default="", required=True, help="Pass path to model's .nemo file")
    parser.add_argument("--prompt", type=str, default="", required=True, help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate", type=int, default=16, required=False, help="How many tokens to add to prompt")
    parser.add_argument("--tensor_model_parallel_size", type=int, default=1, required=True)
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    # trainer required for restoring model parallel models
    trainer = Trainer(
        plugins=NLPDDPPlugin(), devices=args.tensor_model_parallel_size, precision=16, accelerator='gpu'
    )

    app_state = AppState()
    if args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronT5Model.restore_from(restore_path=args.model_file, trainer=trainer)
    model.freeze()

    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
    }

    dataset = T5RequestDataset(request, model.tokenizer)
    request_dl = DataLoader(dataset)
    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")

def main(cfg) -> None:
    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    app_state = AppState()
    app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
        app_state.pipeline_model_parallel_split_rank,
    ) = fake_initialize_model_parallel(
        world_size=app_state.model_parallel_size,
        rank=trainer.global_rank,
        tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
        pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
        pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
    )

    if cfg.model_file is not None:
        if not os.path.exists(cfg.model_file):
            raise ValueError(f"Model file {cfg.model_file} does not exist")
        model = MegatronNMTModel.restore_from(
            restore_path=cfg.model_file, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.checkpoint_dir is not None:
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    logging.info(f"Translating: {cfg.srctext}")
    src_text = []
    translations = []
    with open(cfg.srctext, 'r') as src_f, open(cfg.tgtout, 'w') as tgt_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == cfg.batch_size:
                translations = model.translate(
                    text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,
                )
                for translation in translations:
                    tgt_f.write(translation + "\n")
                src_text = []
        if len(src_text) > 0:
            translations = model.translate(text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,)
            for translation in translations:
                tgt_f.write(translation + "\n")

def _handle_artifacts(self, model, nemo_file_folder):
    tarfile_artifacts = []
    app_state = AppState()
    for conf_path, artiitem in model.artifacts.items():
        if artiitem.path_type == model_utils.ArtifactPathType.LOCAL_PATH:
            if not os.path.exists(artiitem.path):
                raise FileNotFoundError(f"Artifact {conf_path} not found at location: {artiitem.path}")

            # Generate a new unique artifact name and copy it to nemo_file_folder.
            # Note: uuid.uuid4().hex is guaranteed to be 32 characters long.
            artifact_base_name = os.path.basename(artiitem.path)
            artifact_uniq_name = f"{uuid.uuid4().hex}_{artifact_base_name}"
            shutil.copy2(artiitem.path, os.path.join(nemo_file_folder, artifact_uniq_name))

            # Update artifacts registry
            artiitem.hashed_path = "nemo:" + artifact_uniq_name
            model.artifacts[conf_path] = artiitem
        elif artiitem.path_type == model_utils.ArtifactPathType.TAR_PATH:
            # process all tarfile artifacts in one go, so preserve the key-value pairs
            tarfile_artifacts.append((conf_path, artiitem))
        else:
            raise ValueError("Directly referencing artifacts from other nemo files isn't supported yet")

    # Process current tarfile artifacts by unpacking the previous tarfile and extracting the artifacts
    # that are currently required.
    model_metadata = app_state.get_model_metadata_from_guid(model.model_guid)
    if len(tarfile_artifacts) > 0 and model_metadata.restoration_path is not None:
        # Need to step into the nemo archive to extract the file.
        # Get the path where the command is executed - the artifacts will be "retrieved" there
        # (original .nemo behavior)
        cwd = os.getcwd()
        try:
            # Step into the nemo archive to try and find the file
            with tempfile.TemporaryDirectory() as archive_dir:
                self._unpack_nemo_file(path2file=model_metadata.restoration_path, out_folder=archive_dir)
                os.chdir(archive_dir)
                for conf_path, artiitem in tarfile_artifacts:
                    # Get basename and copy it to nemo_file_folder
                    if 'nemo:' in artiitem.path:
                        artifact_base_name = artiitem.path.split('nemo:')[1]
                    else:
                        artifact_base_name = os.path.basename(artiitem.path)
                    # no need to hash here as we are in tarfile_artifacts which are already hashed
                    artifact_uniq_name = artifact_base_name
                    shutil.copy2(artifact_base_name, os.path.join(nemo_file_folder, artifact_uniq_name))

                    # Update artifacts registry
                    new_artiitem = model_utils.ArtifactItem()
                    new_artiitem.path = "nemo:" + artifact_uniq_name
                    new_artiitem.path_type = model_utils.ArtifactPathType.TAR_PATH
                    model.artifacts[conf_path] = new_artiitem
        finally:
            # change back to the original working directory
            os.chdir(cwd)

def test_mock_save_to_restore_chained(self):
    with tempfile.NamedTemporaryFile('w') as empty_file, tempfile.NamedTemporaryFile('w') as empty_file2:
        # Write some data
        empty_file.writelines(["*****\n"])
        empty_file.flush()

        # Update config + create model
        cfg = _mock_model_config()
        cfg.model.temp_file = empty_file.name

        # Create model
        model = MockModel(cfg=cfg.model, trainer=None)
        model = model.to('cpu')

        assert model.temp_file == empty_file.name

        def save_copy(model, save_folder, restore_folder):
            # Where model will be saved
            model_save_path = os.path.join(save_folder, f"{model.__class__.__name__}.nemo")
            model.save_to(save_path=model_save_path)
            # Where model will be restored from
            model_restore_path = os.path.join(restore_folder, f"{model.__class__.__name__}.nemo")
            shutil.copy(model_save_path, model_restore_path)
            return model_restore_path

        # Save test
        with tempfile.TemporaryDirectory() as level4:
            with tempfile.TemporaryDirectory() as level3:
                with tempfile.TemporaryDirectory() as level2:
                    with tempfile.TemporaryDirectory() as level1:
                        path = save_copy(model, level1, level2)
                    model_copy2 = model.__class__.restore_from(path)
                    path = save_copy(model_copy2, level2, level3)
                model_copy3 = model.__class__.restore_from(path)
                path = save_copy(model_copy3, level3, level4)
            model_copy = model.__class__.restore_from(path)

        # Restore test
        assert model_copy.temp_data == ["*****\n"]

        # AppState test
        appstate = AppState()
        metadata = appstate.get_model_metadata_from_guid(model_copy.model_guid)
        assert metadata.guid != model.model_guid
        assert metadata.restoration_path == path

def set_world_size(self, trainer: Trainer):
    """Determines the world size from the PyTorch Lightning Trainer and then
    updates AppState.

    Args:
        trainer (Trainer): PyTorch Lightning Trainer object
    """
    # Update AppState with world information from trainer
    if isinstance(trainer, Trainer):
        app_state = AppState()
        if trainer.num_gpus and trainer.num_nodes:
            app_state.world_size = trainer.num_gpus * trainer.num_nodes
    else:
        logging.warning('World size can only be set by PyTorch Lightning Trainer.')

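# Worked sketch of the arithmetic above (hypothetical values): with 4 GPUs per
# node on 2 nodes, AppState().world_size would be set to 4 * 2 = 8.
def _example_world_size(num_gpus=4, num_nodes=2):
    return num_gpus * num_nodes  # 8
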
def __init__(self, model_name, config, vocab_file, model_parallel_size=None, model_parallel_rank=None):
    super().__init__()

    self._model_parallel_size = model_parallel_size
    self._model_parallel_rank = model_parallel_rank
    self._restore_path = None
    self._app_state = None
    self._model_name = model_name

    if not os.path.exists(vocab_file):
        raise ValueError(f'Vocab file not found at {vocab_file}')

    # convert config to dictionary
    if isinstance(config, DictConfig):
        config = OmegaConf.to_container(config)
    config["vocab_file"] = vocab_file
    config['tokenizer_type'] = 'BertWordPieceLowerCase'
    config['lazy_mpu_init'] = True
    config['onnx_safe'] = True

    # if 'model_parallel_size' in config:
    if self._model_parallel_size is not None:
        app_state = AppState()
        self._app_state = app_state

        # must be set for model parallel megatron-lm
        os.environ["WORLD_SIZE"] = str(app_state.world_size)
        os.environ["RANK"] = str(self._model_parallel_rank)

        extra_args_provider = self._update_megatron_args(tensor_model_parallel_size=self._model_parallel_size)
    else:
        extra_args_provider = self._update_megatron_args()

    # configure globals for megatron
    set_pipeline_model_parallel_rank(0)  # pipeline model parallelism not implemented in NeMo
    set_pipeline_model_parallel_world_size(1)  # pipeline model parallelism not implemented in NeMo

    # Initialize part of the Megatron global state that is needed for its constructor.
    # We set the 'lazy_mpu_init' flag on to make Megatron do only the initialization that does not depend
    # on DDP being initialized yet (and we don't want Megatron to initialize DDP itself either),
    # and to return a hook for us to call after PTL has torch.distributed initialized
    # (or, if there is no PTL, as in the case of inference, we initialize torch.distributed ourselves).
    # We call and clear this hook on the first call to forward().
    self._lazy_init_fn = initialize_megatron(
        extra_args_provider=extra_args_provider, args_defaults=config, ignore_unknown_args=True
    )

    # read Megatron arguments back
    args = get_args()
    logging.info(f'Megatron-lm argparse args: {args}')

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
    )

    self.config = OmegaConf.create(config)  # key used for checkpoints
    self._hidden_size = self.language_model.hidden_size

def _save_last_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
    """Overrides the PTL method to account for model parallel checkpoints.
    Checks for data parallel rank 0 rather than global rank 0.
    """
    app_state = AppState()
    if app_state.model_parallel_size is not None:
        if not self.save_last:
            return

        filepath = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST, monitor_candidates)
        filepath = os.path.join(self.dirpath, f"{filepath}{self.FILE_EXTENSION}")
        self._save_model(trainer, filepath)

        # for model parallel we need to delete models for each model parallel rank
        if self.last_model_path and self.last_model_path != filepath and app_state.data_parallel_rank == 0:
            self._del_model(self.last_model_path)

        self.last_model_path = filepath
    else:
        return super()._save_last_checkpoint(trainer, monitor_candidates)

def __save_to(self, path, state):
    if state["global_rank"] is not None and state["global_rank"] != 0:
        return

    if not os.path.isdir(path):
        logging.info(f"Creating {path} folder")
        os.makedirs(path, exist_ok=True)

    unique_mod_names = set()
    for module in AppState().modules:
        if module.num_weights > 0:
            if str(module) in unique_mod_names:
                raise NotImplementedError(
                    "There were two instances of the same module. Please overwrite __str__() of one of the "
                    "modules."
                )
            unique_mod_names.add(str(module))
            if self._step_freq > -1:
                filename = f"{module}-STEP-{state['step']}.pt"
            else:
                filename = f"{module}-EPOCH-{state['epoch']}.pt"
            module.save_to(os.path.join(path, filename))

    if self._step_freq > -1:
        filename = f"trainer-STEP-{state['step']}.pt"
        state.save_state_to(f"{path}/{filename}")
        self._saved_ckpts.append(f"-{state['step']}.pt")
    else:
        filename = f"trainer-EPOCH-{state['epoch']}.pt"
        state.save_state_to(f"{path}/{filename}")
        self._saved_ckpts.append(f"-{state['epoch']}.pt")

    if len(self._saved_ckpts) > self._ckpt2keep:
        for end in self._saved_ckpts[: -self._ckpt2keep]:
            for file in glob.glob(f'{path}/*{end}'):
                os.remove(file)
        self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :]
    logging.info(f'Saved checkpoint: {path}/{filename}')

def rename(self, new_name: str):
    """Renames the tensor from its old name to a new user-defined name for easy
    access within callbacks. Note that a tensor's unique_name is never changed;
    this simply adds a reference from new_name -> tensor.unique_name.

    Args:
        new_name (str): the tensor's new name.
    """
    AppState().tensor_names.rename_NmTensor(self, new_name)
    self._name = new_name

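# Illustrative usage sketch (hypothetical tensor and state objects, not from
# the original source): rename gives a tensor a friendly key while its
# unique_name stays stable.
def _example_rename(loss_tensor, training_state):
    loss_tensor.rename("train_loss")
    # The tensor is now reachable under its new name, e.g. inside a callback:
    return training_state.get_tensor("train_loss")
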
def __restore_from(self, path, state):
    if not os.path.isdir(path):
        if self._force_load:
            raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.")
        logging.warning(f"Checkpoint folder {path} not found!")
        return
    logging.info(f"Found checkpoint folder {path}. Will attempt to restore checkpoints from it.")

    modules_to_restore = []
    modules_to_restore_name = []
    for module in AppState().modules:
        if module.num_weights > 0:
            modules_to_restore.append(module)
            modules_to_restore_name.append(str(module))

    step_check = None
    try:
        module_checkpoints, steps = get_checkpoint_from_dir(modules_to_restore_name, path, return_steps=True)

        # If the steps are different, print a warning message
        for step in steps:
            if step_check is None:
                step_check = step
            elif step != step_check:
                logging.warning("Restoring from module checkpoints where the training step does not match")
                break

        for mod, checkpoint in zip(modules_to_restore, module_checkpoints):
            mod.restore_from(checkpoint, state["local_rank"])
    except ValueError as e:
        if self._force_load:
            raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.")
        logging.warning(e)
        logging.warning(
            f"Checkpoint folder {path} was present but nothing was restored. Continuing training from random "
            "initialization."
        )
        return

    try:
        trainer_checkpoints, steps = get_checkpoint_from_dir(["trainer"], path, return_steps=True)
        if step_check is not None and step_check != steps[0]:
            logging.error(
                "The step we are restoring from the trainer checkpoint does not match one or more steps that "
                "are being restored from modules."
            )
        state.restore_state_from(trainer_checkpoints[0])
    except ValueError as e:
        logging.warning(e)
        logging.warning(
            "Trainer state such as optimizer state and current step/epoch was not restored. Pretrained weights "
            "have still been restored and fine-tuning should continue normally."
        )
        return

def setup(self, stage):
    # Set up metric tracking here: the data parallel group is only initialized
    # once `fit` or `test` is called on the trainer.
    app = AppState()
    self.classification_report = ClassificationReport(
        num_classes=len(self.classes),
        label_ids=self.label_ids,
        mode='micro',
        dist_sync_on_step=True,
        process_group=app.data_parallel_group,
    )

def __init__(self, model_name, config, vocab_file, model_parallel_size=None):
    super().__init__()

    self._model_parallel_size = model_parallel_size
    self._restore_path = None
    self._app_state = None

    if not os.path.exists(vocab_file):
        raise ValueError(f'Vocab file not found at {vocab_file}')

    config["vocab_file"] = vocab_file
    config['tokenizer_type'] = 'BertWordPieceLowerCase'
    config['lazy_mpu_init'] = True
    config['onnx_safe'] = True

    # if 'model_parallel_size' in config:
    if self._model_parallel_size is not None:
        app_state = AppState()
        self._app_state = app_state

        # must be set for model parallel megatron-lm
        os.environ["WORLD_SIZE"] = str(app_state.world_size)

        # used to set model_parallel_size in the megatron-lm argparser
        def _update_model_parallel_arg(parser):
            parser.set_defaults(model_parallel_size=self._model_parallel_size)
            return parser

        extra_args_provider = _update_model_parallel_arg
    else:
        extra_args_provider = None

    # Initialize part of the Megatron global state that is needed for its constructor.
    # We set the 'lazy_mpu_init' flag on to make Megatron do only the initialization that does not depend
    # on DDP being initialized yet (and we don't want Megatron to initialize DDP itself either),
    # and to return a hook for us to call after PTL has torch.distributed initialized.
    # We call this hook during .forward
    # TODO: can we call this hook using the PTL hook .setup()?
    self._lazy_init_fn = initialize_megatron(
        extra_args_provider=extra_args_provider, args_defaults=config, ignore_unknown_args=True
    )

    # read Megatron arguments back
    args = get_args()
    logging.info(f'Megatron-lm argparse args: {args}')

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
    )

    self.config = OmegaConf.create(config)  # key used for checkpoints
    self._hidden_size = self.language_model.hidden_size

def __init__(self, action: 'Actions'):
    """A class used to wrap the current training state of an Actions.train()
    function. This class holds a mapping of tensor.unique_name -> its backend
    tensor (e.g. a PyTorch tensor), or None if the tensor has not yet been
    computed on the current step.

    Args:
        action (Actions): The Actions object this state is associated with.
    """
    tensor_naming_registry = AppState().tensor_names
    self.tensor_dict = dict.fromkeys(tensor_naming_registry.unique_names, None)
    self._action = action

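# Minimal sketch of the mapping built above (the unique names here are
# hypothetical): every registered tensor starts out mapped to None and is only
# filled in by a forward pass.
def _example_tensor_dict():
    unique_names = ["encoder~~logits~~0", "loss~~loss~~0"]  # hypothetical unique names
    tensor_dict = dict.fromkeys(unique_names, None)
    assert all(value is None for value in tensor_dict.values())
    return tensor_dict
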
def _del_model_without_trainer(self, filepath: str) -> None:
    app_state = AppState()
    if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1:
        # filepath needs to be updated to include mp_rank
        filepath = inject_model_parallel_rank(filepath)

    # each model parallel rank needs to remove its model
    if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
        try:
            self._fs.rm(filepath)
            logging.info(f"Removed checkpoint: {filepath}")
        except Exception:
            logging.info(f"Tried to remove checkpoint: {filepath} but failed.")

def split_partition(model, partitions, tp_size, write_path=None):
    if len(partitions) != 1:
        raise ValueError(
            "Can only split partitions of a model with TP=1. For partitions of models with TP>1, merge first."
        )

    if tp_size < 1:
        raise ValueError("TP size must be >= 1.")

    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.model_parallel_size = tp_size
    app_state.model_parallel_rank = tp_size - 1

    idx = 0
    splits = []
    for _, param in model.named_parameters():
        if param.shape == partitions[0][idx].shape:
            split = [partitions[0][idx].data] * tp_size
        elif param.shape[0] == partitions[0][idx].shape[0]:
            split = torch.split(partitions[0][idx].data, param.shape[-1], dim=-1)
        else:
            split = torch.split(partitions[0][idx].data, param.shape[0], dim=0)
        splits.append(split)
        idx += 1

    for i in range(tp_size - 1, -1, -1):
        app_state.model_parallel_rank = i

        idx = 0
        for name, param in model.named_parameters():
            split_val = splits[idx][i]

            if param.shape != split_val.shape:
                logging.info(
                    f"Warning: Shape mismatch for parameter {name}. Required shape: {param.shape}, "
                    f"split shape: {split_val.shape}. Padding to match required size."
                )

                if split_val.shape[1:] == param.shape[1:]:
                    pad = [0, 0] * len(split_val.shape)
                    pad[-1] = param.shape[0] - split_val.shape[0]
                    split_val = torch.nn.functional.pad(split_val, pad, 'constant')
                elif split_val.shape[:-1] == param.shape[:-1]:
                    pad = [0, param.shape[-1] - split_val.shape[-1]]
                    split_val = torch.nn.functional.pad(split_val, pad, 'constant')
                else:
                    raise RuntimeError(
                        f"Cannot handle parameter {name}. Required shape: {param.shape}, "
                        f"split shape: {split_val.shape}."
                    )

            param.data = split_val
            idx += 1

        if write_path is not None:
            model.save_to(write_path)

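# Worked sketch of the splitting rule above (assumed toy shapes, not from the
# original source): when the target parameter keeps dim 0 of the full weight,
# the split runs along the last dim (column parallel); otherwise along dim 0
# (row parallel).
def _example_tp_split(tp_size=2):
    import torch

    full_weight = torch.arange(24.0).reshape(4, 6)  # full TP=1 weight
    # Column-parallel case: per-rank shape is [4, 3], so split into chunks of 3 on dim -1.
    col_shards = torch.split(full_weight, 6 // tp_size, dim=-1)
    # Row-parallel case: per-rank shape is [2, 6], so split into chunks of 2 on dim 0.
    row_shards = torch.split(full_weight, 4 // tp_size, dim=0)
    return col_shards, row_shards
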
def on_train_end(self, trainer, pl_module):
    if trainer.fast_dev_run:
        return None

    app_state = AppState()
    if app_state.model_parallel_size is not None:
        # TODO: make this work for model parallel; need to call on data parallel rank 0 and update best_model_path
        return None

    # Load the best model and then re-save it
    if self.save_best_model:
        trainer.checkpoint_connector.restore(self.best_model_path)
    pl_module.save_to(save_path=os.path.join(self.dirpath, self.prefix + self.postfix))

def forward(self, input_ids, attention_mask, token_type_ids):
    app_state = AppState()
    if app_state.model_parallel_size is None:
        self.complete_lazy_init()

    extended_attention_mask = bert_extended_attention_mask(attention_mask)
    position_ids = bert_position_ids(input_ids)

    sequence_output = self.language_model(
        input_ids=input_ids,
        position_ids=position_ids,
        attention_mask=extended_attention_mask,
        tokentype_ids=token_type_ids,
    )
    return sequence_output

def _del_model_without_trainer(self, filepath: str) -> None:
    app_state = AppState()
    if app_state.model_parallel_size is not None:
        # filepath needs to be updated to include mp_rank
        dirname = os.path.dirname(filepath)
        basename = os.path.basename(filepath)
        filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'

    # each model parallel rank needs to remove its model
    if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
        try:
            self._fs.rm(filepath)
            logging.info(f"Removed checkpoint: {filepath}")
        except Exception:
            logging.info(f"Tried to remove checkpoint: {filepath} but failed.")

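# Path sketch (hypothetical filepath): for model parallel rank 1, the rewrite
# above turns 'ckpts/megatron--step=100.ckpt' into
# 'ckpts/mp_rank_01/megatron--step=100.ckpt'.
def _example_mp_rank_path(filepath='ckpts/megatron--step=100.ckpt', mp_rank=1):
    import os

    dirname = os.path.dirname(filepath)
    basename = os.path.basename(filepath)
    return f'{dirname}/mp_rank_{mp_rank:02d}/{basename}'
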
def on_action_start(self, state):
    num_parameters = 0
    unique_mod_names = set()
    for module in AppState().modules:
        if module.num_weights > 0:
            if str(module) in unique_mod_names:
                raise NotImplementedError(
                    "There were two instances of the same module. Please overwrite __str__() of one of the "
                    "modules."
                )
            unique_mod_names.add(str(module))
            num_parameters += module.num_weights

    logging.info(f"Found {len(unique_mod_names)} modules with weights:")
    for name in unique_mod_names:
        logging.info(f"{name}")
    logging.info(f"Total model parameters: {num_parameters}")

    self.__restore_from(self._load_from_folder, state)

def test_value_sharing(self):
    # Create first instance of AppState.
    x = AppState()
    x.test_value = "ala"

    # Create second instance of AppState and test the value.
    y = AppState()
    assert y.test_value == "ala"

    # Change the value through the second instance and test the first one.
    y.test_value = "ola"
    assert x.test_value == "ola"

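# Minimal sketch of the pattern this test exercises (NeMo's actual AppState
# implementation may differ): a metaclass that always returns one shared
# instance, so writes through any handle are visible through every handle.
class _ExampleSingletonMeta(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]


class _ExampleAppState(metaclass=_ExampleSingletonMeta):
    pass
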
def __restore_from(self, path, state):
    if not os.path.isdir(path):
        if self._force_load:
            raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.")
        logging.warning(f"Checkpoint folder {path} not found!")
        return
    logging.info(f"Found checkpoint folder {path}. Will attempt to restore checkpoints from it.")

    modules_to_restore = []
    modules_to_restore_name = []
    for module in AppState().modules:
        if module.num_weights > 0:
            modules_to_restore.append(module)
            modules_to_restore_name.append(str(module))

    try:
        module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path)
        for mod, checkpoint in zip(modules_to_restore, module_checkpoints):
            mod.restore_from(checkpoint, state["local_rank"])
    except ValueError as e:
        if self._force_load:
            raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.")
        logging.warning(e)
        logging.warning(
            f"Checkpoint folder {path} was present but nothing was restored. Continuing training from random "
            "initialization."
        )
        return

    try:
        trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path)
        state.restore_state_from(trainer_checkpoints[0])
    except ValueError as e:
        logging.warning(e)
        logging.warning(
            "Trainer state such as optimizer state and current step/epoch was not restored. Pretrained weights "
            "have still been restored and fine-tuning should continue normally."
        )
        return

def _del_model(self, trainer: "pl.Trainer", filepath: str) -> None:
    """Overrides the PTL method to account for model parallel checkpoints.
    Updates the checkpoint path based on the model parallel rank.
    """
    app_state = AppState()
    if app_state.model_parallel_size is not None:
        # filepath needs to be updated to include mp_rank
        dirname = os.path.dirname(filepath)
        basename = os.path.basename(filepath)
        filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'

        # each model parallel rank needs to remove its model
        if app_state.data_parallel_rank == 0:
            super()._del_model(trainer, filepath)
            logging.info(f"Removed model parallel checkpoint: {filepath}")
    else:
        return super()._del_model(trainer, filepath)

def __init__(self, datasets, weights):
    self.datasets = datasets
    num_datasets = len(datasets)
    assert num_datasets == len(weights)

    self.size = 0
    for dataset in self.datasets:
        self.size += len(dataset)

    # Normalize weights.
    weights = np.array(weights, dtype=np.float64)
    sum_weights = np.sum(weights)
    assert sum_weights > 0.0
    weights /= sum_weights

    # Build indices.
    start_time = time.time()
    assert num_datasets < 255
    self.dataset_index = np.zeros(self.size, dtype=np.uint8)
    self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

    app_state = AppState()
    try:
        if app_state.local_rank == 0:
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            compile_helper()
        torch.distributed.barrier()
        from nemo.collections.nlp.data.language_modeling.megatron import helpers
    except Exception:
        raise Exception('Could not compile helpers.')

    helpers.build_blending_indices(
        self.dataset_index,
        self.dataset_sample_index,
        weights,
        num_datasets,
        self.size,
        torch.distributed.get_rank() == 0,
    )

    logging.info(
        '> elapsed time for building blendable dataset indices: {:.2f} (sec)'.format(time.time() - start_time)
    )

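# Worked sketch of the normalization above: raw mixing weights are rescaled to
# sum to 1 before index building, e.g. [1, 2, 1] -> [0.25, 0.5, 0.25].
def _example_normalize_weights(weights=(1.0, 2.0, 1.0)):
    import numpy as np

    w = np.array(weights, dtype=np.float64)
    return w / w.sum()
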
def _save_none_monitor_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
    """Overrides the PTL method to account for model parallel checkpoints.
    Checks for data parallel rank 0 rather than global rank 0.
    """
    app_state = AppState()
    if app_state.model_parallel_size is not None:
        if self.monitor is not None or self.save_top_k == 0:
            return

        filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer)
        self._save_model(trainer, filepath)

        if (
            self.save_top_k is None
            and self.best_model_path
            and self.best_model_path != filepath
            and app_state.data_parallel_rank == 0
        ):
            self._del_model(self.best_model_path)

        self.best_model_path = filepath
    else:
        return super()._save_none_monitor_checkpoint(trainer, monitor_candidates)

def main(cfg) -> None:
    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    # Load prompt tuned model; virtual_prompt_model_file must be provided in config
    if cfg.get('virtual_prompt_model_file', None) is not None:
        # Update frozen GPT model path in case it has changed
        prompt_learning_cfg = MegatronGPTPromptLearningModel.restore_from(
            cfg.virtual_prompt_model_file, trainer=trainer, return_config=True
        )
        with open_dict(prompt_learning_cfg):
            prompt_learning_cfg.language_model_path = cfg.gpt_model_file

        # Now load prompt learning model with frozen gpt model base
        model = MegatronGPTPromptLearningModel.restore_from(
            restore_path=cfg.virtual_prompt_model_file, trainer=trainer, override_config_path=prompt_learning_cfg
        )

    # Or load regular GPT model
    elif cfg.gpt_model_file:
        model = MegatronGPTModel.restore_from(restore_path=cfg.gpt_model_file, trainer=trainer)
    elif cfg.checkpoint_dir:
        app_state = AppState()
        if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
            app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
            (
                app_state.tensor_model_parallel_rank,
                app_state.pipeline_model_parallel_rank,
                app_state.model_parallel_size,
                app_state.data_parallel_size,
                app_state.pipeline_model_parallel_split_rank,
            ) = fake_initialize_model_parallel(
                world_size=app_state.model_parallel_size,
                rank=trainer.global_rank,
                tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
                pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
                pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
            )
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    # Have to turn off activations_checkpoint_method for inference
    try:
        model.model.language_model.encoder.activations_checkpoint_method = None
    except AttributeError:
        pass

    try:
        model.frozen_model.language_model.encoder.activations_checkpoint_method = None
    except AttributeError:
        pass

    length_params: LengthParam = {
        "max_length": cfg.inference.tokens_to_generate,
        "min_length": cfg.inference.min_tokens_to_generate,
    }

    sampling_params: SamplingParam = {
        "use_greedy": cfg.inference.greedy,
        "temperature": cfg.inference.temperature,
        "top_k": cfg.inference.top_k,
        "top_p": cfg.inference.top_p,
        "repetition_penalty": cfg.inference.repetition_penalty,
        "add_BOS": cfg.inference.add_BOS,
        "all_probs": cfg.inference.all_probs,
        "compute_logprob": cfg.inference.compute_logprob,
    }

    # First method of running text generation: call the model.generate method
    response = model.generate(
        inputs=OmegaConf.to_container(cfg.prompts), length_params=length_params, sampling_params=sampling_params
    )

    print("***************************")
    print(response)
    print("***************************")

    # Second method of running text generation: call trainer.predict
    collate_fn = None
    if cfg.get('virtual_prompt_model', False):
        collate_fn = lambda x: list(x)

    ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
    request_dl = DataLoader(dataset=ds, collate_fn=collate_fn, batch_size=2)
    config = OmegaConf.to_container(cfg.inference)
    model.set_inference_config(config)
    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")

    # Third method of running text generation: use the inference server
    if cfg.server:
        if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0:
            server = MegatronServer(model.cuda())
            server.run("0.0.0.0", port=cfg.port)

        while True:
            choice = torch.cuda.LongTensor(1)
            torch.distributed.broadcast(choice, 0)
            if choice[0].item() == 0:
                generate(model.cuda())

def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Path:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning
    paradigm of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will
    get exp_dir, name, and version from the logger. Otherwise it will use the exp_dir and name arguments to create
    the logging directory. exp_manager also allows for explicit folder creation via explicit_log_dir.

    The version can be a datetime string or an integer. Datetime version can be disabled if use_datetime_version
    is set to False. exp_manager optionally creates TensorBoardLogger, WandBLogger, and ModelCheckpoint objects
    from pytorch lightning. It copies sys.argv, and git information if available, to the logging directory. It
    creates a log file for each process to log their output into.

    exp_manager additionally has a resume feature (resume_if_exists) which can be used to continue training from
    the constructed log_dir. When you need to continue training repeatedly (such as on a cluster where you need
    multiple consecutive jobs), you need to avoid creating the version folders. Therefore, from v1.0.0, when
    resume_if_exists is set to True, creating the version folders is skipped.

    Args:
        trainer (pytorch_lightning.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:

            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults
                to None, which will use exp_dir, name, and version to construct the logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory. Defaults to None, which logs
                to ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None, which turns into "default" via
                name = name or "default".
            - version (str): The version of the experiment. Defaults to None, which uses either a datetime string
                or lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version. Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets
                trainer.checkpoint_connector.resume_from_checkpoint_fit_path so that the trainer should auto-resume.
                exp_manager will move files under log_dir to log_dir/run_{int}. Defaults to False. From v1.0.0,
                when resume_if_exists is True, we do not create version folders to make it easier to find the log
                folder for next runs.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True and a checkpoint matching
                *end.ckpt exists, indicating a previous training run fully completed. This behaviour can be
                disabled, in which case the *end.ckpt will be loaded, by setting resume_past_end to True. Defaults
                to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no
                checkpoint could be found. This behaviour can be disabled, in which case exp_manager will print a
                message and continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults
                to False.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch
                lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's
                TensorboardLogger class. Note that log_dir is passed by exp_manager and cannot exist in this dict.
                Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Biases logger and attach it to the
                pytorch lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger
                class. Note that name and project are required parameters if create_wandb_logger is True. Defaults
                to None.
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
                pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the
                most recent checkpoint under *last.ckpt, and the final checkpoint after training completes under
                *end.ckpt. Defaults to True.
            - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None,
                which copies no files.

    Returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    """
    # Add rank information to logger
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_gpus + local_rank
    logging.rank = global_rank
    world_size = trainer.world_size

    if cfg is None:
        logging.error("exp_manager did not receive a cfg argument. It will be disabled.")
        return
    if trainer.fast_dev_run:
        logging.info("Trainer was called with fast_dev_run. exp_manager will return without any functionality.")
        return

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(trainer, cfg)  # Ensures that trainer options are compliant with NeMo and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
        resume_if_exists=cfg.resume_if_exists,
    )

    if cfg.resume_if_exists:
        check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == '':
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # update app_state with log_dir, exp_dir, etc.
    app_state = AppState()
    app_state.log_dir = log_dir
    app_state.exp_dir = exp_dir
    app_state.name = name
    app_state.version = version
    app_state.checkpoint_name = checkpoint_name
    app_state.create_checkpoint_callback = cfg.create_checkpoint_callback
    app_state.checkpoint_callback_params = cfg.checkpoint_callback_params

    # Create the logging directory if it does not exist
    os.makedirs(log_dir, exist_ok=True)  # Cannot limit creation to global zero as all ranks write to their own log file
    logging.info(f'Experiments will be logged at {log_dir}')
    trainer._default_root_dir = log_dir

    # Handle logging to file
    if get_envbool(NEMO_ENV_VARNAME_TESTING, False) or world_size <= 32:
        # If NEMO_TESTING is set (debug mode) or there are 32 ranks or fewer, save all log files
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif world_size <= 256 and local_rank == 0:
        # If there are 256 ranks or fewer, try to save 1 log file per "machine"
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif global_rank == 0:
        # If running more than 256 ranks, only save 1 log file
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks,
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            exp_dir,
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    # add step-timing callback to the loggers
    if cfg.log_step_timing:
        timing_callback = TimingCallback(timer_kwargs=cfg.step_timing_kwargs or {})
        trainer.callbacks.insert(0, timing_callback)

    if cfg.create_checkpoint_callback:
        configure_checkpointing(trainer, log_dir, checkpoint_name, cfg.resume_if_exists, cfg.checkpoint_callback_params)

    if is_global_rank_zero():
        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file:
            _file.write(" ".join(sys.argv))

        # Try to get git hash
        git_repo, git_hash = get_git_hash()
        if git_repo:
            with open(log_dir / 'git-info.log', 'w', encoding='utf-8') as _file:
                _file.write(f'commit hash: {git_hash}')
                _file.write(get_git_diff())

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / 'nemo_error_log.txt')

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt')

    return log_dir

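# Illustrative call (hypothetical config values, not from the original source):
# a plain dict is merged against the ExpManagerConfig schema, and the resolved
# log_dir is returned.
def _example_exp_manager(trainer):
    return exp_manager(
        trainer,
        {
            "exp_dir": "./nemo_experiments",
            "name": "my_experiment",
            "create_tensorboard_logger": True,
            "create_checkpoint_callback": True,
        },
    )
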