Example #1
    def _default_save_to(self, save_path: str):
        app_state = AppState()
        if app_state.model_parallel_size is not None:
            # each model parallel rank creates its own .nemo file;
            # once all .nemo files exist, global rank 0 merges the
            # per-rank checkpoints into a single archive

            base_dir = os.path.dirname(save_path)  # use the directory to merge mp_rank .nemo files into one

            # update save_path based on model parallel_rank
            base_path = os.path.splitext(save_path)[0]  # everything except the extension

            mp_save_path = f'{base_path}_mp_rank_{app_state.model_parallel_rank:02d}.nemo'

            if app_state.data_parallel_rank == 0:
                super()._default_save_to(mp_save_path)

            # barrier so that all processes have finished writing their weights before creating .nemo file
            torch.distributed.barrier()

            if is_global_rank_zero():
                # extract all tar files
                for mp_rank in range(app_state.model_parallel_size):
                    mp_tar_path = f'{base_path}_mp_rank_{mp_rank:02d}.nemo'
                    mp_tar = tarfile.open(mp_tar_path, 'r:gz')
                    mp_tar.extractall(path=os.path.join(base_dir, f'mp_rank_{mp_rank:02d}'))
                    mp_tar.close()
                    os.remove(mp_tar_path)

                # move rank 0 .nemo extract to base_path
                shutil.move(os.path.join(base_dir, 'mp_rank_00'), base_path)

                # move mp_rank_00 checkpoint to mp_rank_00 directory inside base_path
                os.mkdir(os.path.join(base_path, 'mp_rank_00'))
                shutil.move(os.path.join(base_path, 'model_weights.ckpt'), os.path.join(base_path, 'mp_rank_00'))

                # move other mp_rank checkpoints from base_dir to base_path
                for mp_rank in range(1, app_state.model_parallel_size):
                    os.mkdir(os.path.join(base_path, f'mp_rank_{mp_rank:02d}'))
                    shutil.move(
                        os.path.join(base_dir, f'mp_rank_{mp_rank:02d}', 'model_weights.ckpt'),
                        os.path.join(base_path, f'mp_rank_{mp_rank:02d}'),
                    )
                    # clean up leftover directory
                    shutil.rmtree(os.path.join(base_dir, f'mp_rank_{mp_rank:02d}'))

                # create tar file from base_path
                self._make_nemo_file_from_folder(save_path, base_path)

                # clean up base_path
                shutil.rmtree(base_path)

        elif is_global_rank_zero():
            return super()._default_save_to(save_path)
        else:
            return
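
The method above illustrates a pattern that recurs throughout these examples: every model-parallel rank writes its own artifact, a barrier waits for all writers, and only global rank 0 assembles the final archive. Below is a minimal, framework-agnostic sketch of that pattern; write_shard and the merge step are illustrative stand-ins, not NeMo code.

import os
import shutil

import torch.distributed as dist


def save_sharded(save_dir: str, rank: int, write_shard) -> None:
    """Each rank writes its shard; rank 0 merges after a barrier (illustrative sketch)."""
    os.makedirs(save_dir, exist_ok=True)
    write_shard(os.path.join(save_dir, f"shard_{rank:02d}.bin"))  # every rank writes its own file

    if dist.is_available() and dist.is_initialized():
        dist.barrier()  # wait until all shards are on disk

    if rank == 0:
        with open(os.path.join(save_dir, "merged.bin"), "wb") as out:
            for name in sorted(os.listdir(save_dir)):
                if name.startswith("shard_"):
                    with open(os.path.join(save_dir, name), "rb") as shard:
                        shutil.copyfileobj(shard, out)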
Example #2
File: g2ps.py Project: ggrunin/NeMo
    def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'):
        if phoneme_dict_path is None:
            # This code path downloads a file, but it is not rank-zero guarded.
            # Check whether torch distributed is available; if not, have global rank zero download the
            # corpora and make all other ranks sleep for a minute.
            if torch.distributed.is_available(
            ) and torch.distributed.is_initialized():
                group = torch.distributed.group.WORLD
                if is_global_rank_zero():
                    try:
                        nltk.data.find('corpora/cmudict.zip')
                    except LookupError:
                        nltk.download('cmudict', quiet=True)
                torch.distributed.barrier(group=group)
            elif is_global_rank_zero():
                logging.error(
                    f"Torch distributed needs to be initialized before you initialized EnglishG2p. This class is prone to "
                    "data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this "
                    "before rank 0, errors might result.")
                try:
                    nltk.data.find('corpora/cmudict.zip')
                except LookupError:
                    nltk.download('cmudict', quiet=True)
            else:
                logging.error(
                    f"Torch distributed needs to be initialized before you initialized EnglishG2p. This class is prone to "
                    "data access race conditions. This process is not rank 0, and now going to sleep for 1 min. If this "
                    "rank wakes from sleep prior to rank 0 finishing downloading, errors might result."
                )
                time.sleep(60)

            logging.warning(
                f"English g2p_dict will be used from nltk.corpus.cmudict.dict(), because phoneme_dict_path=None. "
                "Note that nltk.corpus.cmudict.dict() has old version (0.6) of CMUDict. "
                "You can use the latest official version of CMUDict (0.7b) with additional changes from NVIDIA directly from NeMo "
                "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.01."
            )

            return nltk.corpus.cmudict.dict()

        _alt_re = re.compile(r'\([0-9]+\)')
        g2p_dict = {}
        with open(phoneme_dict_path, encoding=encoding) as file:
            for line in file:
                if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
                    parts = line.split('  ')
                    word = re.sub(_alt_re, '', parts[0])
                    word = word.lower()

                    pronunciation = parts[1].strip().split(" ")
                    if word in g2p_dict:
                        g2p_dict[word].append(pronunciation)
                    else:
                        g2p_dict[word] = [pronunciation]
        return g2p_dict
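
The download guard above (rank 0 downloads, the other ranks either hit a barrier or sleep) can be factored into a small helper. The sketch below is an illustration under stated assumptions, not NeMo's API; the environment-variable fallback follows common launcher conventions.

import os
import time

import torch.distributed as dist


def download_once(download_fn, fallback_sleep_s: int = 60) -> None:
    """Run download_fn on global rank 0 only; other ranks wait (illustrative sketch)."""
    if dist.is_available() and dist.is_initialized():
        if dist.get_rank() == 0:
            download_fn()
        dist.barrier()  # everyone proceeds only after rank 0 has the data
    elif int(os.environ.get("RANK", 0)) == 0:
        download_fn()
    else:
        # No process group yet: best-effort sleep, mirroring the race-prone
        # behaviour warned about in the log messages above.
        time.sleep(fallback_sleep_s)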
Example #3
    def _define_logger(self, capture_warnings=True):
        """ Creates the logger if not already created. Called in init"""

        # Use double-checked locking to avoid taking lock unnecessarily.
        if self._logger is not None:
            return self._logger

        with self._logger_lock:
            try:
                self._logger = _logging.getLogger("nemo_logger")
                # By default, silence all loggers except the logger for rank 0
                self.remove_stream_handlers()
                # If NEMO_TESTING is set, add a streamhandler to all ranks
                if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                    old_factory = _logging.getLogRecordFactory()

                    def record_factory(*args, **kwargs):
                        record = old_factory(*args, **kwargs)
                        record.rank = self.rank
                        return record

                    _logging.setLogRecordFactory(record_factory)
                    self.add_stream_handlers(formatter=DebugNeMoFormatter)
                elif is_global_rank_zero():
                    self.add_stream_handlers()

                # Add memoryhandlers, essentially buffers. They are used to save messages that we will flush to file
                # once the appropriate file handlers are added.
                if is_global_rank_zero():
                    # Add a memoryhandler for error messages. Only logged on rank 0
                    self._handlers["memory_err"] = MemoryHandler(-1)
                    self._handlers["memory_err"].addFilter(
                        lambda record: record.levelno > _logging.INFO)
                    formatter = BaseNeMoFormatter
                    self._handlers["memory_err"].setFormatter(formatter())
                    self._logger.addHandler(self._handlers["memory_err"])
                # Add a memoryhandler for all messages on all ranks
                self._handlers["memory_all"] = MemoryHandler(-1)
                formatter = BaseNeMoFormatter
                self._handlers["memory_all"].setFormatter(formatter())
                self._logger.addHandler(self._handlers["memory_all"])

            finally:
                level = Logger.INFO
                if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                    level = Logger.DEBUG
                self.set_verbosity(verbosity_level=level)
                self.captureWarnings(capture_warnings)

        self._logger.propagate = False
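
The handler setup above hinges on double-checked locking: a lock-free fast path when the logger already exists, and handler creation under a thread lock otherwise. Below is a stripped-down sketch of the idiom; the class and handler names are illustrative, not NeMo's.

import logging
import threading


class LazyLogger:
    """Create a configured logger at most once, even under concurrent access (sketch)."""

    def __init__(self):
        self._logger = None
        self._lock = threading.Lock()

    def get(self) -> logging.Logger:
        if self._logger is not None:  # fast path: no lock needed
            return self._logger
        with self._lock:
            if self._logger is None:  # re-check inside the lock
                logger = logging.getLogger("example_logger")
                logger.addHandler(logging.StreamHandler())
                logger.propagate = False
                self._logger = logger
        return self._logger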
Example #4
def check_explicit_log_dir(trainer: 'pytorch_lightning.Trainer',
                           explicit_log_dir: [Path, str], exp_dir: str,
                           name: str, version: str) -> (Path, str, str, str):
    """ Checks that the passed arguments are compatible with explicit_log_dir.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name or version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raises:
        LoggerMisconfigurationError
    """
    if trainer.logger is not None:
        raise LoggerMisconfigurationError(
            "The pytorch lightning trainer that was passed to exp_manager contained a logger and explicit_log_dir: "
            f"{explicit_log_dir} was pass to exp_manager. Please remove the logger from the lightning trainer."
        )
    # Checking only (explicit_log_dir) vs (exp_dir and version).
    # The `name` will be used as the actual name of checkpoint/archive.
    if exp_dir or version:
        logging.error(
            f"exp_manager received explicit_log_dir: {explicit_log_dir} and at least one of exp_dir: {exp_dir}, "
            f"or version: {version}. Please note that exp_dir, name, and version will be ignored."
        )
    if is_global_rank_zero() and Path(explicit_log_dir).exists():
        logging.warning(
            f"Exp_manager is logging to {explicit_log_dir}, but it already exists."
        )
    return Path(explicit_log_dir), str(explicit_log_dir), "", ""
Example #5
    def prepare_data(self):
        """
        Preprocesses schemas and dialogues and caches the result.
        """
        if self.data_prepared:
            return
        schema_config = {
            "MAX_NUM_CAT_SLOT": self._cfg.dataset.max_num_cat_slot,
            "MAX_NUM_NONCAT_SLOT": self._cfg.dataset.max_num_noncat_slot,
            "MAX_NUM_VALUE_PER_CAT_SLOT": self._cfg.dataset.max_value_per_cat_slot,
            "MAX_NUM_INTENT": self._cfg.dataset.max_num_intent,
            "NUM_TASKS": NUM_TASKS,
            "MAX_SEQ_LENGTH": self._cfg.dataset.max_seq_length,
        }
        all_schema_json_paths = []
        for dataset_split in ['train', 'test', 'dev']:
            all_schema_json_paths.append(os.path.join(self._cfg.dataset.data_dir, dataset_split, "schema.json"))
        schemas = Schema(all_schema_json_paths)

        self.dialogues_processor = SGDDataProcessor(
            task_name=self._cfg.dataset.task_name,
            data_dir=self._cfg.dataset.data_dir,
            dialogues_example_dir=self._cfg.dataset.dialogues_example_dir,
            tokenizer=self.tokenizer,
            schemas=schemas,
            schema_config=schema_config,
            subsample=self._cfg.dataset.subsample,
        )

        if is_global_rank_zero():
            overwrite_dial_files = not self._cfg.dataset.use_cache
            self.dialogues_processor.save_dialog_examples(overwrite_dial_files=overwrite_dial_files)

        self.data_prepared = True
Example #6
    def setup(self, stage: str) -> None:
        """ PTL hook that is called on all DDP processes. """

        if stage == 'fit':

            # adds self.bert_model config to .nemo file
            if hasattr(self, 'bert_model') and self.bert_model is not None:
                self.register_bert_model()

            app_state = AppState()

            if app_state.model_parallel_size is not None:

                self._trainer.checkpoint_connector = NLPCheckpointConnector(
                    self._trainer)

                # Configure checkpointing for model parallel
                if app_state.create_checkpoint_callback:
                    # global rank 0 is configured by exp_manager
                    if not is_global_rank_zero() and app_state.data_parallel_rank == 0:
                        configure_checkpointing(
                            self._trainer,
                            app_state.log_dir,
                            app_state.checkpoint_name,
                            app_state.checkpoint_callback_params,
                        )
Example #7
    def save_to(self, model, save_path: str):
        """
        Saves model instance (weights and configuration) into .nemo file.
        You can use "restore_from" method to fully restore instance from .nemo file.

        .nemo file is an archive (tar.gz) with the following:
            model_config.yaml - model configuration in .yaml format. You can deserialize this into the cfg argument for the model's constructor
            model_weights.ckpt - model checkpoint

        Args:
            model: ModelPT object to be saved.
            save_path: Path to .nemo file where model instance should be saved
        """

        if is_global_rank_zero():
            with tempfile.TemporaryDirectory() as tmpdir:
                config_yaml = os.path.join(tmpdir, self.model_config_yaml)
                model_weights = os.path.join(tmpdir, self.model_weights_ckpt)
                model.to_config_file(path2yaml_file=config_yaml)
                if hasattr(model, 'artifacts') and model.artifacts is not None:
                    self._handle_artifacts(model, nemo_file_folder=tmpdir)
                    # We should not update self._cfg here - the model can still be in use
                    self._update_artifact_paths(model,
                                                path2yaml_file=config_yaml)
                self._save_state_dict_to_disk(model.state_dict(),
                                              model_weights)
                self._make_nemo_file_from_folder(filename=save_path,
                                                 source_dir=tmpdir)
        else:
            return
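
_make_nemo_file_from_folder packs the temporary directory into a tar.gz archive. A minimal stand-in for that step using only the standard library is shown below; the body is an assumption for illustration, not the NeMo implementation.

import tarfile


def make_archive_from_folder(filename: str, source_dir: str) -> None:
    """Pack source_dir into a gzipped tarball at filename (illustrative sketch)."""
    with tarfile.open(filename, "w:gz") as tar:
        # arcname="." keeps the paths inside the archive relative to source_dir.
        tar.add(source_dir, arcname=".")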
Example #8
def main(cfg: ParallelAlignmentConfig):
    if cfg.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=cfg.model, map_location="cpu")
    elif cfg.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model, map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    cfg.predict_ds.return_sample_id = True
    cfg.return_predictions = False
    cfg.use_cer = False
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model._cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)
    # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank.
    global_rank = trainer.node_rank * trainer.num_devices + int(os.environ.get("LOCAL_RANK", 0))
    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
    output_ctm_dir = os.path.join(cfg.output_path, "ctm")
    predictor_writer = ASRCTMPredictionWriter(
        dataset=data_loader.dataset,
        output_file=output_file,
        output_ctm_dir=output_ctm_dir,
        time_per_frame=cfg.model_stride * model._cfg.preprocessor['window_stride'],
    )
    trainer.callbacks.extend([predictor_writer])

    aligner_wrapper = AlignerWrapperModel(model=model, cfg=cfg.aligner_args)
    trainer.predict(model=aligner_wrapper, dataloaders=data_loader, return_predictions=cfg.return_predictions)
    samples_num = predictor_writer.close_output_file()

    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, f"predictions_all.json")
        logging.info(f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'tw', encoding="utf-8") as outf:
            for rank in range(trainer.world_size):
                input_file = os.path.join(cfg.output_path, f"predictions_{rank}.json")
                with open(input_file, 'r', encoding="utf-8") as inpf:
                    lines = inpf.readlines()
                    samples_num += len(lines)
                    outf.writelines(lines)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
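
The aggregation step above (each rank writes predictions_{rank}.json, rank 0 concatenates them after a barrier) generalizes to any per-rank output. A condensed sketch, assuming all per-rank files already exist:

import os

import torch.distributed as dist


def aggregate_rank_outputs(output_dir: str, world_size: int) -> int:
    """Concatenate per-rank prediction files into one on rank 0 (illustrative sketch)."""
    if dist.is_initialized():
        dist.barrier()  # make sure every rank has finished writing
        if dist.get_rank() != 0:
            return 0
    total = 0
    with open(os.path.join(output_dir, "predictions_all.json"), "w", encoding="utf-8") as outf:
        for rank in range(world_size):
            with open(os.path.join(output_dir, f"predictions_{rank}.json"), encoding="utf-8") as inpf:
                lines = inpf.readlines()
                total += len(lines)
                outf.writelines(lines)
    return total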
Example #9
    def __init__(self, capture_warnings=True):

        self._logger = None
        # Multi-GPU runs run in separate processes, thread locks shouldn't be needed
        self._logger_lock = threading.Lock()
        self._handlers = dict()
        self.old_warnings_showwarning = None
        self._define_logger(capture_warnings)
        self.once_logged = set()
        self.rank = 0 if is_global_rank_zero() else "UNK"
Example #10
    def _del_model_without_trainer(self, filepath: str) -> None:
        app_state = AppState()
        if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1:
            # filepath needs to be updated to include mp_rank
            filepath = inject_model_parallel_rank(filepath)

        # each model parallel rank needs to remove its model
        if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
            try:
                self._fs.rm(filepath)
                logging.info(f"Removed checkpoint: {filepath}")
            except:
                logging.info(f"Tried to remove checkpoint: {filepath} but failed.")
Example #11
    def _del_model_without_trainer(self, filepath: str) -> None:
        app_state = AppState()
        if app_state.model_parallel_size is not None:
            # filepath needs to be updated to include mp_rank
            dirname = os.path.dirname(filepath)
            basename = os.path.basename(filepath)
            filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'

        # each model parallel rank needs to remove its model
        if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
            try:
                self._fs.rm(filepath)
                logging.info(f"Removed checkpoint: {filepath}")
            except:
                logging.info(f"Tried to remove checkpoint: {filepath} but failed.")
Example #12
    def __init__(self, datasets, weights):

        self.datasets = datasets
        num_datasets = len(datasets)
        assert num_datasets == len(weights)

        self.size = 0
        for dataset in self.datasets:
            self.size += len(dataset)

        # Normalize weights.
        weights = np.array(weights, dtype=np.float64)
        sum_weights = np.sum(weights)
        assert sum_weights > 0.0
        weights /= sum_weights

        # Build indices.
        start_time = time.time()
        assert num_datasets < 255
        self.dataset_index = np.zeros(self.size, dtype=np.uint8)
        self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

        try:
            if is_global_rank_zero():
                from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

                compile_helper()
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except:
            raise Exception(f'Could not compile helpers.')
        helpers.build_blending_indices(
            self.dataset_index,
            self.dataset_sample_index,
            weights,
            num_datasets,
            self.size,
            torch.distributed.get_rank() == 0,
        )
        logging.info('> elapsed time for building blendable dataset indices: '
                     '{:.2f} (sec)'.format(time.time() - start_time))
Example #13
    def save_to(self, save_path: str):
        """
        Saves model instance (weights and configuration) into an EFF archive or a .nemo file.
        You can use the "restore_from" method to fully restore the instance from a .nemo file.

        .nemo file is an archive (tar.gz) with the following:
            model_config.yaml - model configuration in .yaml format. You can deserialize this into the cfg argument for the model's constructor
            model_weights.ckpt - model checkpoint

        Args:
            save_path: Path to .nemo file where model instance should be saved
        """

        # Add nemo rank check as well
        if not is_global_rank_zero():
            return

        if _EFF_PRESENT_ and self.use_eff_save():
            # Save EFF archive.
            self._eff_save_to(save_path)
        else:
            # Save .nemo tar archive.
            self._default_save_to(save_path)
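
The early return at the top of save_to is the simplest rank guard in this collection. The same effect can be factored into a decorator; the sketch below reads the rank from common launcher environment variables as an assumption and is not part of NeMo's API.

import functools
import os


def _is_rank_zero() -> bool:
    # Assumption: the global rank is exposed via RANK or SLURM_PROCID.
    return int(os.environ.get("RANK", os.environ.get("SLURM_PROCID", 0))) == 0


def rank_zero_only(fn):
    """Make the wrapped function a no-op on every rank except global rank 0 (sketch)."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if not _is_rank_zero():
            return None
        return fn(*args, **kwargs)
    return wrapper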
Example #14
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.tensor_model_parallel_size = 1
            app_state.tensor_model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )

        try:
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            if is_global_rank_zero():
                compile_helper()

            if torch.distributed.is_available() and torch.distributed.is_initialized():
                torch.distributed.barrier()

            from nemo.collections.nlp.data.language_modeling.megatron import helpers

            logging.info('Megatron dataset helper compiled successfully.')
        except ImportError:
            raise ImportError(
                f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )
Example #15
def check_resume(
    trainer: 'pytorch_lightning.Trainer',
    log_dir: str,
    resume_past_end: bool = False,
    resume_ignore_no_checkpoint: bool = False,
):
    """Checks that resume=True was used correctly with the arguments pass to exp_manager. Sets
    trainer.checkpoint_connector.resume_from_checkpoint_fit_path as necessary.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name or version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raises:
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True and more than one checkpoint was found.
    """

    if not log_dir:
        raise ValueError(
            f"Resuming requires the log_dir {log_dir} to be passed to exp_manager"
        )

    checkpoint_dir = Path(Path(log_dir) / "checkpoints")

    checkpoint = None
    end_checkpoints = list(checkpoint_dir.rglob("*end.ckpt"))
    last_checkpoints = list(checkpoint_dir.rglob("*last.ckpt"))
    if not checkpoint_dir.exists():
        if resume_ignore_no_checkpoint:
            logging.warning(
                f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Training from scratch."
            )
            return
        else:
            raise NotFoundError(
                f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume."
            )
    elif len(end_checkpoints) > 0:
        if resume_past_end:
            if len(end_checkpoints) > 1:
                if 'mp_rank' in str(end_checkpoints[0]):
                    checkpoint = end_checkpoints[0]
                else:
                    raise ValueError(
                        f"Multiple checkpoints {end_checkpoints} that matches *end.ckpt."
                    )
            logging.info(f"Resuming from {end_checkpoints[0]}")
        else:
            raise ValueError(
                f"Found {end_checkpoints[0]} indicating that the last training run has already completed."
            )
    elif not len(last_checkpoints) > 0:
        if resume_ignore_no_checkpoint:
            logging.warning(
                f"There were no checkpoints found in {checkpoint_dir}. Training from scratch."
            )
            return
        else:
            raise NotFoundError(
                f"There were no checkpoints found in {checkpoint_dir}. Cannot resume."
            )
    elif len(last_checkpoints) > 1:
        if 'mp_rank' in str(last_checkpoints[0]):
            checkpoint = last_checkpoints[0]
        else:
            raise ValueError(
                f"Multiple checkpoints {last_checkpoints} that matches *last.ckpt."
            )
    else:
        logging.info(f"Resuming from {last_checkpoints[0]}")
        checkpoint = last_checkpoints[0]

    trainer.checkpoint_connector.resume_from_checkpoint_fit_path = str(
        checkpoint)

    if is_global_rank_zero():
        # Check to see if any files exist that need to be moved
        files_to_move = []
        for child in Path(log_dir).iterdir():
            if child.is_file():
                files_to_move.append(child)

        if len(files_to_move) > 0:
            # Move old files to a new folder
            other_run_dirs = Path(log_dir).glob("run_*")
            run_count = 0
            for fold in other_run_dirs:
                if fold.is_dir():
                    run_count += 1
            new_run_dir = Path(Path(log_dir) / f"run_{run_count}")
            new_run_dir.mkdir()
            for _file in files_to_move:
                move(str(_file), str(new_run_dir))
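
The resume logic above hinges on discovering candidate checkpoints with Path.rglob and treating *end.ckpt and *last.ckpt files differently. A tiny usage sketch of the discovery step alone, with a hypothetical directory:

from pathlib import Path

checkpoint_dir = Path("nemo_experiments/default/checkpoints")  # hypothetical path
end_checkpoints = list(checkpoint_dir.rglob("*end.ckpt"))
last_checkpoints = list(checkpoint_dir.rglob("*last.ckpt"))
print(f"found {len(end_checkpoints)} *end.ckpt and {len(last_checkpoints)} *last.ckpt files")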
Example #16
def exp_manager(trainer: 'pytorch_lightning.Trainer',
                cfg: Optional[Union[DictConfig, Dict]] = None) -> Path:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning paradigm
    of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will get exp_dir,
    name, and version from the logger. Otherwise it will use the exp_dir and name arguments to create the logging
    directory. exp_manager also allows for explicit folder creation via explicit_log_dir.

    The version can be a datetime string or an integer. The datetime version can be disabled if use_datetime_version is
    set to False. It optionally creates TensorBoardLogger, WandBLogger, and ModelCheckpoint objects from pytorch lightning.
    It copies sys.argv and git information, if available, to the logging directory. It creates a log file for each
    process to log its output into.

    exp_manager additionally has a resume feature (resume_if_exists) which can be used to continue training from
    the constructed log_dir. When you need to continue training repeatedly (for example, on a cluster where you need
    multiple consecutive jobs), you need to avoid creating the version folders. Therefore, from v1.0.0, when
    resume_if_exists is set to True, the version folders are not created.

    Args:
        trainer (pytorch_lightning.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:
            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults to
                None, which will use exp_dir, name, and version to construct the logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory. Defaults to None, which logs to
                ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None which turns into "default" via name = name or
                "default".
            - version (str): The version of the experiment. Defaults to None which uses either a datetime string or
                lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version. Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets
                trainer.checkpoint_connector.resume_from_checkpoint_fit_path so that the trainer auto-resumes. exp_manager will move files
                under log_dir to log_dir/run_{int}. Defaults to False. From v1.0.0, when resume_if_exists is True,
                version folders are not created, making it easier to find the log folder for subsequent runs.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True and a checkpoint matching
                *end.ckpt indicating a previous training run fully completed. This behaviour can be disabled, in which
                case the *end.ckpt will be loaded by setting resume_past_end to True. Defaults to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint
                could be found. This behaviour can be disabled, in which case exp_manager will print a message and
                continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch
                lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger
                class. Note that log_dir is passed by exp_manager and cannot exist in this dict. Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Biases logger and attach it to the pytorch
                lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger
                class. Note that name and project are required parameters if create_wandb_logger is True.
                Defaults to None.
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
                pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the most
                recent checkpoint under *last.ckpt, and the final checkpoint after training completes under *end.ckpt.
                Defaults to True.
            - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None which
                copies no files.

    Returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    """
    # Add rank information to logger
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_gpus + local_rank
    logging.rank = global_rank
    world_size = trainer.world_size

    if cfg is None:
        logging.error(
            "exp_manager did not receive a cfg argument. It will be disabled.")
        return
    if trainer.fast_dev_run:
        logging.info(
            "Trainer was called with fast_dev_run. exp_manager will return without any functionality."
        )
        return

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(
            f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig"
        )
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(
        trainer, cfg
    )  # Ensures that trainer options are compliant with NeMo and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
        resume_if_exists=cfg.resume_if_exists,
    )

    if cfg.resume_if_exists:
        check_resume(trainer, log_dir, cfg.resume_past_end,
                     cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == '':
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # update app_state with log_dir, exp_dir, etc
    app_state = AppState()
    app_state.log_dir = log_dir
    app_state.exp_dir = exp_dir
    app_state.name = name
    app_state.version = version
    app_state.checkpoint_name = checkpoint_name
    app_state.create_checkpoint_callback = cfg.create_checkpoint_callback
    app_state.checkpoint_callback_params = cfg.checkpoint_callback_params

    # Create the logging directory if it does not exist
    os.makedirs(
        log_dir, exist_ok=True
    )  # Cannot limit creation to global zero as all ranks write to own log file
    logging.info(f'Experiments will be logged at {log_dir}')
    trainer._default_root_dir = log_dir

    # Handle logging to file
    if get_envbool(NEMO_ENV_VARNAME_TESTING, False) or world_size <= 32:
        # If NEMO_TESTING is set (debug mode) or if less than 32 ranks save all log files
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif world_size <= 256 and local_rank == 0:
        # If less than 256 ranks, try to save 1 log file per "machine"
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif global_rank == 0:
        # If running more than 256 ranks, only save 1 log file
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            exp_dir,
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    # add loggers timing callbacks
    if cfg.log_step_timing:
        timing_callback = TimingCallback(
            timer_kwargs=cfg.step_timing_kwargs or {})
        trainer.callbacks.insert(0, timing_callback)

    if cfg.create_checkpoint_callback:
        configure_checkpointing(trainer, log_dir, checkpoint_name,
                                cfg.resume_if_exists,
                                cfg.checkpoint_callback_params)

    if is_global_rank_zero():
        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file:
            _file.write(" ".join(sys.argv))

        # Try to get git hash
        git_repo, git_hash = get_git_hash()
        if git_repo:
            with open(log_dir / 'git-info.log', 'w',
                      encoding='utf-8') as _file:
                _file.write(f'commit hash: {git_hash}')
                _file.write(get_git_diff())

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / 'nemo_error_log.txt')

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt',
                                      log_dir / 'nemo_error_log.txt')

    return log_dir
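
Because trainer.global_rank is not populated until fit(), exp_manager reconstructs the global rank from the node rank, the per-node device count, and LOCAL_RANK. A compact sketch of that computation on its own; the environment variable follows the torch.distributed launcher convention and the default of 0 is an assumption.

import os


def guess_global_rank(node_rank: int, devices_per_node: int) -> int:
    """Approximate the global rank before the trainer or torch.distributed is set up (sketch)."""
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    return node_rank * devices_per_node + local_rank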
Example #17
    def setup(self, stage: str) -> None:
        """ PTL hook that is called after DDP is initialized.
            Called at the beginning of fit and test.

        Args:
            stage (str): either 'fit' or 'test'
        """
        # TODO: implement model parallel for test stage
        if stage == 'fit':
            # set find_unused_parameters to True by default for NLP models
            if isinstance(self.trainer.accelerator.training_type_plugin,
                          DDPPlugin):
                self.trainer.accelerator.training_type_plugin._ddp_kwargs[
                    'find_unused_parameters'] = True

            # adds self.bert_model config to .nemo file
            if hasattr(self, 'bert_model') and self.bert_model is not None:
                self.register_bert_model()

            app_state = AppState()

            if app_state.model_parallel_size is not None:

                if app_state.model_parallel_group is None:
                    self.init_model_parallel(app_state.global_rank,
                                             app_state.world_size)

                # mpu grad clipping needs parameters to have the attribute model_parallel
                parameters = self._trainer.get_model().parameters()
                for p in parameters:
                    if not hasattr(p, 'model_parallel'):
                        p.model_parallel = False

                # Update PTL trainer to use our configure_ddp
                self._trainer.accelerator_backend.ddp_plugin.configure_ddp = self.configure_ddp
                # Update PTL trainer to use our _clip_gradients
                self._trainer.accelerator_backend._clip_gradients = self._clip_gradients
                self._trainer.checkpoint_connector = NLPCheckpointConnector(
                    self._trainer)

                # Configure checkpointing for model parallel
                if app_state.create_checkpoint_callback:
                    # global rank 0 is configured by exp_manager
                    if not is_global_rank_zero() and app_state.data_parallel_rank == 0:
                        configure_checkpointing(
                            self._trainer,
                            app_state.log_dir,
                            app_state.checkpoint_name,
                            app_state.checkpoint_callback_params,
                        )

                if isinstance(self.bert_model, MegatronBertEncoder):
                    self.bert_model.complete_lazy_init()

                    # model parallel checkpoints need to be restored after torch.distributed is initialized
                    if self._trainer.resume_from_checkpoint is not None:
                        # update path based on model parallel rank
                        filepath = self._trainer.resume_from_checkpoint
                        dirname = os.path.dirname(os.path.dirname(filepath))
                        basename = os.path.basename(filepath)
                        filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
                        self._trainer.resume_from_checkpoint = filepath
                        logging.info(
                            f'Resuming training from checkpoint {self._trainer.resume_from_checkpoint}'
                        )
                        # need to set checkpoint version for megatron-lm
                        checkpoint_version = torch.load(
                            self._trainer.resume_from_checkpoint).get(
                                'checkpoint_version', None)
                        if checkpoint_version is not None:
                            set_checkpoint_version(checkpoint_version)
                        else:
                            logging.warning(
                                'Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.'
                            )
                            set_checkpoint_version(0)
                    else:
                        logging.info(
                            f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}"
                        )
                        self.bert_model.restore_weights(
                            self.bert_model._restore_path)

                    logging.info(
                        "Replacing sampler with model parallel sampler")
                    mp_sampler = torch.utils.data.distributed.DistributedSampler(
                        self._train_dl.dataset,
                        num_replicas=app_state.data_parallel_size,
                        rank=app_state.data_parallel_rank,
                    )
                    mp_dl = self._trainer.replace_sampler(
                        self._train_dl, mp_sampler)
                    self._train_dl = mp_dl
                else:
                    raise NotImplementedError(
                        f'The BERT encoder: {self.bert_model} does not support model parallelism yet.'
                    )
Example #18
    def __init__(
        self,
        punct=True,
        stresses=False,
        spaces=True,
        chars=False,
        *,
        space=' ',
        silence=None,
        apostrophe=True,
        oov=Base.OOV,
        sep='|',  # To be able to distinguish between 2/3 letters codes.
        add_blank_at="last_but_one",
        pad_with_space=False,
        improved_version_g2p=False,
        phoneme_dict_path=None,
    ):
        labels = []
        self.space, labels = len(labels), labels + [space]  # Space

        if silence is not None:
            self.silence, labels = len(labels), labels + [silence]  # Silence

        labels.extend(self.CONSONANTS)
        vowels = list(self.VOWELS)

        if stresses:
            vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))]
        labels.extend(vowels)

        if chars:
            labels.extend(string.ascii_lowercase)

        if apostrophe:
            labels.append("'")  # Apostrophe

        if punct:
            labels.extend(self.PUNCT)

        super().__init__(labels, oov=oov, sep=sep, add_blank_at=add_blank_at)

        self.punct = punct
        self.stresses = stresses
        self.spaces = spaces
        self.pad_with_space = pad_with_space

        # g2p_en tries to run download_corpora() on import, but it is not rank-zero guarded.
        # Check whether torch distributed is available; if not, have global rank zero download the
        # corpora and make all other ranks sleep for a minute.
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            group = torch.distributed.group.WORLD
            if is_global_rank_zero():
                download_corpora()
            torch.distributed.barrier(group=group)
        elif is_global_rank_zero():
            logging.error(
                f"Torch distributed needs to be initialized before you initialized {self}. This class is prone to "
                "data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this "
                "before rank 0, errors might result."
            )
            download_corpora()
        else:
            logging.error(
                f"Torch distributed needs to be initialized before you initialized {self}. This class is prone to "
                "data access race conditions. This process is not rank 0, and now going to sleep for 1 min. If this "
                "rank wakes from sleep prior to rank 0 finishing downloading, errors might result."
            )
            time.sleep(60)

        import g2p_en  # noqa pylint: disable=import-outside-toplevel

        _g2p = g2p_en.G2p()
        _g2p.variables = None

        if improved_version_g2p:
            self.g2p = G2p(_g2p, phoneme_dict_path)
        else:
            self.g2p = _g2p
Example #19
def main(cfg: ParallelTranscriptionConfig):
    if cfg.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=cfg.model,
                                      map_location="cpu")
    elif cfg.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model,
                                              map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=cfg.model,
                                         map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    cfg.predict_ds.return_sample_id = True
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds,
                                        train_ds=model.cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)
    # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank.
    global_rank = trainer.node_rank * trainer.num_gpus + int(
        os.environ.get("LOCAL_RANK", 0))
    output_file = os.path.join(cfg.output_path,
                               f"predictions_{global_rank}.json")
    predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset,
                                           output_file=output_file)
    trainer.callbacks.extend([predictor_writer])

    predictions = trainer.predict(model=model,
                                  dataloaders=data_loader,
                                  return_predictions=cfg.return_predictions)
    if predictions is not None:
        predictions = list(itertools.chain.from_iterable(predictions))
    samples_num = predictor_writer.close_output_file()

    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    pred_text_list = []
    text_list = []
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, f"predictions_all.json")
        logging.info(
            f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'w') as outf:
            for rank in range(trainer.world_size):
                input_file = os.path.join(cfg.output_path,
                                          f"predictions_{rank}.json")
                with open(input_file, 'r') as inpf:
                    lines = inpf.readlines()
                    for line in lines:
                        item = json.loads(line)
                        pred_text_list.append(item["pred_text"])
                        text_list.append(item["text"])
                        outf.write(json.dumps(item) + "\n")
                        samples_num += 1
        wer_cer = word_error_rate(hypotheses=pred_text_list,
                                  references=text_list,
                                  use_cer=cfg.use_cer)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
        logging.info("{} for all predictions is {:.4f}.".format(
            "CER" if cfg.use_cer else "WER", wer_cer))
Example #20
def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
                        max_num_samples, max_seq_length, short_seq_prob, seed,
                        name, binary_head):
    """Get a list that maps a sample index to a starting sentence index, end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank(
    ) == 0 and not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        logging.info(
            ' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.
        try:
            if is_global_rank_zero():
                compile_helper()
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError:
            raise ImportError(
                f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )

        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length,
            short_seq_prob,
            seed,
            verbose,
            2 if binary_head else 1,
        )
        logging.info(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        logging.info(
            ' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        logging.info(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    torch.distributed.barrier()
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(
        counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(
        counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() // torch.distributed.get_world_size(
            group=parallel_state.get_tensor_model_parallel_group()))

    # Load indexed dataset.
    logging.info(
        ' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename,
                              allow_pickle=True,
                              mmap_mode='r')
    logging.info(
        '    loaded indexed file in {:3.3f} seconds'.format(time.time() -
                                                            start_time))
    logging.info('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
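
The index-mapping flow above is another build-once/read-everywhere pattern: rank 0 materializes a .npy file, a barrier (plus the all-reduce sanity count) guarantees it exists, and every rank memory-maps the same file. A condensed sketch of that flow, with the builder function as an illustrative stand-in:

import os

import numpy as np
import torch.distributed as dist


def build_or_load_index(path: str, builder) -> np.ndarray:
    """Rank 0 builds and saves the index; all ranks then mmap-load it (illustrative sketch)."""
    # path is expected to end in ".npy" (np.save appends the suffix otherwise).
    if (not dist.is_initialized()) or dist.get_rank() == 0:
        if not os.path.isfile(path):
            np.save(path, builder(), allow_pickle=True)
    if dist.is_initialized():
        dist.barrier()  # the file must exist before any rank reads it
    return np.load(path, allow_pickle=True, mmap_mode='r')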
Example #21
    def __init__(
        self,
        data_dir: str,
        dialogues_example_dir: str,
        tokenizer: object,
        cfg=None,
    ):
        """
        Constructs DialogueSGDDataProcessor
        Args:
            data_dir: path to data directory
            dialogues_example_dir: path to store processed dialogue examples
            tokenizer: tokenizer object
            cfg: cfg container for dataset
        """
        self.data_dir = data_dir
        self.cfg = cfg

        self._task_name = self.cfg.task_name  # e.g. "sgd_single_domain"
        self._subsample = self.cfg.subsample

        all_schema_json_paths = []
        for dataset_split in ['train', 'test', 'dev']:
            all_schema_json_paths.append(
                os.path.join(self.cfg.data_dir, dataset_split, "schema.json"))
        self.schemas = Schema(all_schema_json_paths)

        self.schema_config = {
            "MAX_NUM_CAT_SLOT": self.cfg.max_num_cat_slot,
            "MAX_NUM_NONCAT_SLOT": self.cfg.max_num_noncat_slot,
            "MAX_NUM_VALUE_PER_CAT_SLOT": self.cfg.max_value_per_cat_slot,
            "MAX_NUM_INTENT": self.cfg.max_num_intent,
            "NUM_TASKS": self.cfg.num_tasks,
            "MAX_SEQ_LENGTH": self.cfg.max_seq_length,
        }

        train_file_range = FILE_RANGES[self._task_name]["train"]
        dev_file_range = FILE_RANGES[self._task_name]["dev"]
        test_file_range = FILE_RANGES[self._task_name]["test"]

        self._file_ranges = {
            "train": train_file_range,
            "dev": dev_file_range,
            "test": test_file_range,
        }

        self._seen_services = {
            "train": set(),
            "dev": set(),
            "test": set(),
        }

        self._tokenizer = tokenizer

        self._dialogues_example_dir = dialogues_example_dir

        self.dial_files = {}

        # slots_relation_list.np contains the candidate list of slots for each (service, slot) pair. It is consulted
        # when the dialogue switches between two services and no value can be found for a slot in the current user
        # utterance. This file is generated from the dialogues in the training set.
        self.slots_relation_file = os.path.join(
            dialogues_example_dir,
            f"{self._task_name}_train_slots_relation_list.np")
        for dataset in ["train", "dev", "test"]:
            # Process dialogue files
            dial_file = f"{self._task_name}_{dataset}_examples.json"
            dial_file = os.path.join(dialogues_example_dir, dial_file)
            self.dial_files[(self._task_name, dataset)] = dial_file

            dialog_paths = DialogueSGDDataProcessor.get_dialogue_files(
                data_dir, dataset, self._task_name)
            dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths)
            for dialog in dialogs:
                self._seen_services[dataset].update(set(dialog['services']))

        if is_global_rank_zero():
            overwrite_dial_files = not self.cfg.use_cache
            self.save_dialog_examples(
                overwrite_dial_files=overwrite_dial_files)
Example #22
def _build_index_mappings(name, data_prefix, documents, sizes, num_samples,
                          seq_length, seed):
    """Build doc-idx, sample-idx, and shuffle-idx.
    doc-idx: is an array (ordered) of documents to be used in training.
    sample-idx: is the start document index and document offset for each
       training sample.
    shuffle-idx: maps the sample index into a random index into sample-idx.
    """
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0:
        if ((not os.path.isfile(doc_idx_filename))
                or (not os.path.isfile(sample_idx_filename))
                or (not os.path.isfile(shuffle_idx_filename))):

            logging.info(
                ' > WARNING: could not find index map files, building '
                'the indices on rank 0 ...')

            # For the last epoch, decide whether to include the entire epoch
            # in the global shuffle or not.

            # If we need only one epoch, then separating out the last epoch
            # does not mean anything.
            if num_epochs == 1:
                separate_last_epoch = False
                print(
                    ' > only one epoch required, setting '
                    'separate_last_epoch to False',
                    flush=True)

            else:
                # Get the number of samples for the last epoch
                num_samples_from_epochs_minus_one = (
                    (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
                last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
                assert last_epoch_num_samples >= 0, 'last epoch number of samples should be non-negative.'
                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
                assert last_epoch_num_samples < (
                    num_samples_per_epoch +
                    1), 'last epoch number of samples exceeded max value.'
                # If we have fewer than 80% of the samples for the last epoch,
                # separate out the epoch and treat it differently.
                # Note: the 80% number is just based on common sense and can
                # be adjusted if needed.
                separate_last_epoch = last_epoch_num_samples < int(
                    0.80 * num_samples_per_epoch)
                if separate_last_epoch:
                    string = (
                        ' > last epoch number of samples ({}) is smaller '
                        'than 80% of number of samples per epoch ({}), '
                        'setting separate_last_epoch to True')
                else:
                    string = (' > last epoch number of samples ({}) is not smaller '
                              'than 80% of number of samples per epoch ({}), '
                              'setting separate_last_epoch to False')
                print(string.format(last_epoch_num_samples,
                                    num_samples_per_epoch),
                      flush=True)

            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
                                     separate_last_epoch)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            logging.info(' > elapsed time to build and save doc-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            try:
                if is_global_rank_zero():
                    from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

                    compile_helper()
                from nemo.collections.nlp.data.language_modeling.megatron import helpers
            except Exception as e:
                raise Exception('Could not compile helpers.') from e

            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                                  num_epochs, tokens_per_epoch)
            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                               num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            logging.info(
                ' > elapsed time to build and save sample-idx mapping '
                '(seconds): {:4f}'.format(time.time() - start_time))
            # shuffle-idx.
            start_time = time.time()
            # -1 is due to the data structure used to retrieve the index:
            #    sample i --> [sample_idx[i], sample_idx[i+1])
            if separate_last_epoch:
                num_samples_ = num_samples_from_epochs_minus_one
            else:
                num_samples_ = sample_idx.shape[0] - 1
            shuffle_idx = _build_shuffle_idx(num_samples_,
                                             sample_idx.shape[0] - 1, np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            logging.info(
                ' > elapsed time to build and save shuffle-idx mapping'
                ' (seconds): {:4f}'.format(time.time() - start_time))

    torch.distributed.barrier()

    counts = torch.cuda.LongTensor([1])
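    # Sanity check: each rank contributes a 1; after reducing over the data-parallel and then the
    # pipeline-parallel group the count equals data_parallel_size * pipeline_model_parallel_size,
    # which must match world_size // tensor_model_parallel_size.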
    torch.distributed.all_reduce(
        counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(
        counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() // torch.distributed.get_world_size(
            group=parallel_state.get_tensor_model_parallel_group()))

    # Load mappings.
    start_time = time.time()
    logging.info(' > loading doc-idx mapping from {}'.format(doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info(
        ' > loading sample-idx mapping from {}'.format(sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info(
        ' > loading shuffle-idx mapping from {}'.format(shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename,
                          allow_pickle=True,
                          mmap_mode='r')
    logging.info(
        '    loaded indexed file in {:3.3f} seconds'.format(time.time() -
                                                            start_time))
    logging.info('    total number of samples: {}'.format(sample_idx.shape[0]))
    logging.info('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx
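
The 80% heuristic above is easier to follow with concrete numbers. A small worked example, using purely illustrative values for tokens_per_epoch, seq_length and num_samples (and assuming _num_epochs returns the smallest epoch count whose token budget covers num_samples):

# Illustrative values only -- not taken from any real dataset or run.
tokens_per_epoch = 10_000
seq_length = 512
num_samples = 50
num_epochs = 3  # smallest count with ((num_epochs * tokens_per_epoch - 1) // seq_length) >= num_samples

# Same arithmetic as in _build_index_mappings above.
num_samples_from_epochs_minus_one = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length  # 39
last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one                     # 11
num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length                                 # 19

# 11 < int(0.80 * 19) == 15, so the last epoch is shuffled separately from the first two.
separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch)
assert separate_last_epoch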
Exemplo n.º 23
0
def get_log_dir(
    trainer: 'pytorch_lightning.Trainer',
    exp_dir: str = None,
    name: str = None,
    version: str = None,
    explicit_log_dir: str = None,
    use_datetime_version: bool = True,
    resume_if_exists: bool = False,
) -> (Path, str, str, str):
    """
    Obtains the log_dir used for exp_manager.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment
        explicit_log_dir (str): The explicit path to the log folder. Defaults to False.
        use_datetime_version (bool): Uses date and time as the version of the log folder. Defaults to True.
        resume_if_exists (bool): if resume_if_exists of the exp_manager's config is enabled or not. When enabled, the
            version folders would not get created.

    Raise:
        LoggerMisconfigurationError: If trainer is incompatible with arguments
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True, and there were more than 1 checkpoint could found.
    """
    if explicit_log_dir:  # If explicit log_dir was passed, short circuit
        return check_explicit_log_dir(trainer, explicit_log_dir, exp_dir, name,
                                      version)

    # Default exp_dir to ./nemo_experiments if None was passed
    _exp_dir = exp_dir
    if exp_dir is None:
        _exp_dir = str(Path.cwd() / 'nemo_experiments')

    # If the user has already defined a logger for the trainer, use the logger defaults for logging directory
    if trainer.logger is not None:
        if trainer.logger.save_dir:
            if exp_dir:
                raise LoggerMisconfigurationError(
                    "The pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's "
                    f"save_dir was not None, and exp_dir ({exp_dir}) was not None. If trainer.logger.save_dir "
                    "exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir "
                    "must be None.")
            _exp_dir = trainer.logger.save_dir
        if name:
            raise LoggerMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: "
                f"{name} was also passed to exp_manager. If the trainer contains a "
                "logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None."
            )
        name = trainer.logger.name
        version = f"version_{trainer.logger.version}"
    # Use user-defined exp_dir, project_name, exp_name, and versioning options
    else:
        name = name or "default"
        version = version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None)

        if not version:
            if resume_if_exists:
                logging.warning(
                    "No version folders will be created under the log folder since 'resume_if_exists' is enabled."
                )
                version = None
            elif is_global_rank_zero():
                if use_datetime_version:
                    version = time.strftime('%Y-%m-%d_%H-%M-%S')
                else:
                    tensorboard_logger = TensorBoardLogger(
                        save_dir=Path(_exp_dir), name=name, version=version)
                    version = f"version_{tensorboard_logger.version}"
                os.environ[
                    NEMO_ENV_VARNAME_VERSION] = "" if version is None else version

    log_dir = Path(_exp_dir) / Path(
        str(name)) / Path("" if version is None else str(version))
    return log_dir, str(_exp_dir), name, version
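
For orientation, here is what the default resolution works out to on global rank zero when no logger, explicit_log_dir, name, or version is supplied and use_datetime_version is True. This is only a sketch that mirrors the logic above; the timestamp is illustrative.

from pathlib import Path
import time

# Defaults mirrored from get_log_dir above (no trainer.logger, no explicit_log_dir).
exp_dir = str(Path.cwd() / 'nemo_experiments')
name = 'default'
version = time.strftime('%Y-%m-%d_%H-%M-%S')  # e.g. '2024-01-01_12-00-00' (illustrative)

log_dir = Path(exp_dir) / Path(name) / Path(version)
print(log_dir)  # e.g. <cwd>/nemo_experiments/default/2024-01-01_12-00-00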
Exemplo n.º 24
0
    def __init__(
        self,
        text_file: str,
        label_file: str,
        max_seq_length: int,
        tokenizer: TokenizerSpec,
        num_samples: int = -1,
        pad_label: str = 'O',
        label_ids: Dict[str, int] = None,
        ignore_extra_tokens: bool = False,
        ignore_start_end: bool = False,
        use_cache: bool = True,
    ):
        """ Initializes BertTokenClassificationDataset. """

        data_dir = os.path.dirname(text_file)
        filename = os.path.basename(text_file)

        if not filename.endswith('.txt'):
            raise ValueError("{text_file} should have extension .txt")

        vocab_size = getattr(tokenizer, "vocab_size", 0)
        features_pkl = os.path.join(
            data_dir,
            "cached_{}_{}_{}_{}_{}".format(filename, tokenizer.name,
                                           str(max_seq_length),
                                           str(vocab_size), str(num_samples)),
        )

        master_device = is_global_rank_zero()
        features = None
        if master_device and (not use_cache
                              or not os.path.exists(features_pkl)):
            if num_samples == 0:
                raise ValueError("num_samples has to be positive", num_samples)

            with open(text_file, 'r') as f:
                text_lines = f.readlines()

            labels_lines = []
            with open(label_file, 'r') as f:
                for line in f:
                    line = line.strip().split()
                    labels_lines.append(line)

            if len(labels_lines) != len(text_lines):
                raise ValueError(
                    "The labels file must contain one line of labels for every line in the text file")

            if num_samples > 0:
                dataset = list(zip(text_lines, labels_lines))
                dataset = dataset[:num_samples]

                dataset = list(zip(*dataset))
                text_lines = dataset[0]
                labels_lines = dataset[1]

            features = get_features(
                queries=text_lines,
                max_seq_length=max_seq_length,
                tokenizer=tokenizer,
                pad_label=pad_label,
                raw_labels=labels_lines,
                label_ids=label_ids,
                ignore_extra_tokens=ignore_extra_tokens,
                ignore_start_end=ignore_start_end,
            )

            with open(features_pkl, "wb") as out_f:
                pickle.dump(features, out_f)
            logging.info(f'features saved to {features_pkl}')

        # wait until the master process writes to the processed data files
        if torch.distributed.is_initialized():
            torch.distributed.barrier()

        if features is None:
            with open(features_pkl, 'rb') as in_f:
                features = pickle.load(in_f)
            logging.info(f'features restored from {features_pkl}')

        self.all_input_ids = features[0]
        self.all_segment_ids = features[1]
        self.all_input_mask = features[2]
        self.all_subtokens_mask = features[3]
        self.all_loss_mask = features[4]
        self.all_labels = features[5]
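
One detail worth noting in this constructor: the cache filename encodes everything that affects preprocessing (input file, tokenizer, sequence length, vocabulary size, sample count), so changing any of them produces a different cache file instead of silently reusing a stale one. A quick illustration with hypothetical values; the paths and tokenizer name are placeholders, not taken from a real setup.

import os

# Hypothetical inputs, purely for illustration.
text_file = '/data/ner/text_train.txt'
tokenizer_name = 'bert-base-uncased'
max_seq_length = 128
vocab_size = 30522
num_samples = -1

# Same construction as in __init__ above.
data_dir = os.path.dirname(text_file)
filename = os.path.basename(text_file)
features_pkl = os.path.join(
    data_dir,
    "cached_{}_{}_{}_{}_{}".format(filename, tokenizer_name, str(max_seq_length),
                                   str(vocab_size), str(num_samples)),
)
print(features_pkl)  # /data/ner/cached_text_train.txt_bert-base-uncased_128_30522_-1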