示例#1
0
def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    seed=1234,
):

    # updating NeMo globals
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.model_parallel_size = tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(
        local_rank, tensor_model_parallel_size)

    # update apex.mpu globals
    set_tensor_model_parallel_world_size(tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.model_parallel_rank)

    # pipeline model parallelism not implemented in NeMo yet
    set_pipeline_model_parallel_rank(0)
    set_pipeline_model_parallel_world_size(1)

    _set_random_seed(seed)

    app_state._is_megatron_initialized = True
示例#2
0
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )

        try:
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            compile_helper()
            logging.info('Megatron dataset helper compiled successfully.')
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError:
            raise ImportError(
                f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )
示例#3
0
def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    micro_batch_size=None,
    global_batch_size=None,
    seed=1234,
    apex_transformer_log_level=30,
):

    # updating NeMo globals
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.local_rank = local_rank
    app_state.tensor_model_parallel_size = tensor_model_parallel_size
    app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
    ) = fake_initialize_model_parallel(
        world_size=world_size,
        rank=global_rank,
        tensor_model_parallel_size_=tensor_model_parallel_size,
        pipeline_model_parallel_size_=pipeline_model_parallel_size,
    )

    # update apex.transformer globals
    set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank)

    # pipeline model parallelism not implemented in NeMo yet
    set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank)
    set_pipeline_model_parallel_world_size(
        app_state.pipeline_model_parallel_size)

    _set_random_seed(seed)

    if global_batch_size and micro_batch_size is not None:
        # TODO: add rampup_batch_size here when we have it implemented
        setup_microbatch_calculator(
            rank=global_rank,
            global_batch_size=global_batch_size,
            micro_batch_size=micro_batch_size,
            data_parallel_size=app_state.data_parallel_size,
            rampup_batch_size=None,
        )

    app_state._is_megatron_initialized = True

    set_logging_level(apex_transformer_log_level)
示例#4
0
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )
示例#5
0
    def restore_from(
        cls,
        restore_path: str,
        override_config_path: Optional[Union[OmegaConf, str]] = None,
        map_location: Optional[torch.device] = None,
        strict: bool = True,
        return_config: bool = False,
        trainer: Trainer = None,
        save_restore_connector: SaveRestoreConnector = None,
    ):
        """
        Restores model instance (weights and configuration) from .nemo file.

        Args:
            restore_path: path to .nemo file from which model should be instantiated
            override_config_path: path to a yaml config that will override the internal
                config file or an OmegaConf / DictConfig object representing the model config.
            map_location: Optional torch.device() to map the instantiated model to a device.
                By default (None), it will select a GPU if available, falling back to CPU otherwise.
            strict: Passed to load_state_dict. Set to True by default.
            return_config: If set to true, will return just the underlying config of the restored
                model as an OmegaConf DictConfig object without instantiating the model.
            trainer: PyTorch Lightning trainer. Must be passed in order to use model parallel .nemo

            Example:
                ```
                model = nemo.collections.nlp.models.TokenClassificationModel.restore_from('token_classification.nemo')
                assert isinstance(model, nemo.collections.nlp.models.TokenClassificationModel)
                ```

        Returns:
            An instance of type cls or its underlying config (if return_config is set).
        """
        if save_restore_connector is None:
            save_restore_connector = SaveRestoreConnector()

        if not os.path.exists(restore_path):
            raise FileNotFoundError(f"Can't find {restore_path}")

        app_state = AppState()
        app_state.model_restore_path = os.path.abspath(
            os.path.expanduser(restore_path))

        # detect if we have a model parallel .nemo file
        with tempfile.TemporaryDirectory() as tmpdir:
            cwd = os.getcwd()
            os.chdir(tmpdir)
            # detect if model parallel from tarfile
            tar = tarfile.open(app_state.model_restore_path, "r:gz")
            names = tar.getnames()
            mp_ranks = []
            for name in names:
                if 'mp_rank' in name:
                    mp_ranks.append(name)
            if mp_ranks:
                app_state.model_parallel_size = len(
                    mp_ranks
                ) // 2  # directory and file are included in getnames()

                # get checkpoint version
                checkpoint_version_member = None
                for member in tar.getmembers():
                    if 'megatron_checkpoint_version.json' in member.name:
                        checkpoint_version_member = member
                tar.extract(checkpoint_version_member, tmpdir)
                with open(checkpoint_version_member.name, 'r') as f:
                    checkpoint_version = json.load(f).get(
                        'checkpoint_version', None)
                logging.info(
                    (f'Detected model parallel .nemo file: {restore_path}. '
                     f'Assuming megatron model parallelism with '
                     f'model_parallel_size: {app_state.model_parallel_size} '
                     f'and checkpoint version: {checkpoint_version}'))
            tar.close()
            os.chdir(cwd)

        if app_state.model_parallel_size is not None:
            if not isinstance(trainer, Trainer):
                raise ValueError(
                    "trainer must be a PyTorch Lightning Trainer to restore model parallel .nemo files."
                )

            if checkpoint_version is None:
                raise ValueError(
                    "Restoring from megatron model parallel .nemo but could not find megatron checkpoint version."
                )
            else:
                logging.info(
                    f"Setting megatron checkpoint version: {checkpoint_version}"
                )
                set_checkpoint_version(checkpoint_version)

            app_state.world_size = trainer.num_gpus * trainer.num_nodes

            if trainer.local_rank is not None:
                app_state.local_rank = trainer.local_rank
            else:
                raise ValueError(
                    "trainer.local_rank is None. local_rank needed to restore model parallel models."
                )

            model_parallel_rank = compute_model_parallel_rank(
                trainer.local_rank, app_state.model_parallel_size)
            app_state.model_parallel_rank = model_parallel_rank

            cls.update_save_restore_connector(save_restore_connector)
            restored_model = cls._save_restore_connector.restore_from(
                cls, app_state.model_restore_path, override_config_path,
                map_location, strict, return_config)
            restored_model.set_trainer(trainer)
            return restored_model
        else:
            return super().restore_from(
                app_state.model_restore_path,
                override_config_path,
                map_location,
                strict,
                return_config,
                save_restore_connector=save_restore_connector,
            )