Example #1
def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    seed=1234,
):

    # updating NeMo globals
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.local_rank = local_rank
    app_state.model_parallel_size = tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(
        local_rank, tensor_model_parallel_size)

    # update apex.mpu globals
    set_tensor_model_parallel_world_size(tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.model_parallel_rank)

    # pipeline model parallelism not implemented in NeMo yet
    set_pipeline_model_parallel_rank(0)
    set_pipeline_model_parallel_world_size(1)

    _set_random_seed(seed)

    app_state._is_megatron_initialized = True
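
A minimal call-site sketch for the function above, assuming the job is launched with torchrun (or torch.distributed.launch) so that WORLD_SIZE, RANK, and LOCAL_RANK are present in the environment; the tensor parallel size of 2 is purely illustrative:

import os

# Hypothetical launcher-driven call site: rank information is read from the
# standard torch.distributed environment variables set by torchrun.
initialize_model_parallel_for_nemo(
    world_size=int(os.environ.get("WORLD_SIZE", "1")),
    global_rank=int(os.environ.get("RANK", "0")),
    local_rank=int(os.environ.get("LOCAL_RANK", "0")),
    tensor_model_parallel_size=2,  # illustrative: shard each layer across 2 GPUs
    seed=1234,
)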
Example #2
def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    micro_batch_size=None,
    global_batch_size=None,
    seed=1234,
    apex_transformer_log_level=30,
):

    # updating NeMo globals
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.local_rank = local_rank
    app_state.tensor_model_parallel_size = tensor_model_parallel_size
    app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
    ) = fake_initialize_model_parallel(
        world_size=world_size,
        rank=global_rank,
        tensor_model_parallel_size_=tensor_model_parallel_size,
        pipeline_model_parallel_size_=pipeline_model_parallel_size,
    )

    # update apex.transformer globals
    set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank)

    # update apex.transformer pipeline model parallel globals
    set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank)
    set_pipeline_model_parallel_world_size(
        app_state.pipeline_model_parallel_size)

    _set_random_seed(seed)

    if global_batch_size is not None and micro_batch_size is not None:
        # TODO: add rampup_batch_size here when we have it implemented
        setup_microbatch_calculator(
            rank=global_rank,
            global_batch_size=global_batch_size,
            micro_batch_size=micro_batch_size,
            data_parallel_size=app_state.data_parallel_size,
            rampup_batch_size=None,
        )

    app_state._is_megatron_initialized = True

    set_logging_level(apex_transformer_log_level)
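
A sketch of how the extended signature might be called for an 8-GPU job split into tensor parallel 2 x pipeline parallel 2, which leaves a data parallel size of 2. The batch sizes are illustrative; the apex microbatch calculator generally expects global_batch_size to be divisible by micro_batch_size * data_parallel_size:

import os

# Hypothetical 8-GPU configuration (2-way tensor parallel x 2-way pipeline parallel).
initialize_model_parallel_for_nemo(
    world_size=int(os.environ.get("WORLD_SIZE", "8")),
    global_rank=int(os.environ.get("RANK", "0")),
    local_rank=int(os.environ.get("LOCAL_RANK", "0")),
    tensor_model_parallel_size=2,
    pipeline_model_parallel_size=2,
    micro_batch_size=4,
    global_batch_size=256,  # 256 = 4 (micro) * 2 (data parallel) * 32 microbatches
    seed=1234,
)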
Example #3
    @classmethod
    def restore_from(
        cls,
        restore_path: str,
        override_config_path: Optional[Union[OmegaConf, str]] = None,
        map_location: Optional[torch.device] = None,
        strict: bool = True,
        return_config: bool = False,
        trainer: Trainer = None,
        save_restore_connector: SaveRestoreConnector = None,
    ):
        """
        Restores model instance (weights and configuration) from .nemo file.

        Args:
            restore_path: path to .nemo file from which model should be instantiated
            override_config_path: path to a yaml config that will override the internal
                config file or an OmegaConf / DictConfig object representing the model config.
            map_location: Optional torch.device() to map the instantiated model to a device.
                By default (None), it will select a GPU if available, falling back to CPU otherwise.
            strict: Passed to load_state_dict. Set to True by default.
            return_config: If set to true, will return just the underlying config of the restored
                model as an OmegaConf DictConfig object without instantiating the model.
            trainer: PyTorch Lightning trainer. Must be passed in order to restore model
                parallel .nemo files.
            save_restore_connector: An optional SaveRestoreConnector that controls how the
                .nemo archive is read; a default SaveRestoreConnector is created when None.

            Example:
                ```
                model = nemo.collections.nlp.models.TokenClassificationModel.restore_from('token_classification.nemo')
                assert isinstance(model, nemo.collections.nlp.models.TokenClassificationModel)
                ```

        Returns:
            An instance of type cls or its underlying config (if return_config is set).
        """
        if save_restore_connector is None:
            save_restore_connector = SaveRestoreConnector()

        if not os.path.exists(restore_path):
            raise FileNotFoundError(f"Can't find {restore_path}")

        app_state = AppState()
        app_state.model_restore_path = os.path.abspath(
            os.path.expanduser(restore_path))

        # detect if we have a model parallel .nemo file
        with tempfile.TemporaryDirectory() as tmpdir:
            cwd = os.getcwd()
            os.chdir(tmpdir)
            # detect if model parallel from tarfile
            tar = tarfile.open(app_state.model_restore_path, "r:gz")
            names = tar.getnames()
            mp_ranks = []
            for name in names:
                if 'mp_rank' in name:
                    mp_ranks.append(name)
            if mp_ranks:
                app_state.model_parallel_size = len(
                    mp_ranks
                ) // 2  # directory and file are included in getnames()

                # get checkpoint version
                checkpoint_version_member = None
                for member in tar.getmembers():
                    if 'megatron_checkpoint_version.json' in member.name:
                        checkpoint_version_member = member
                tar.extract(checkpoint_version_member, tmpdir)
                with open(checkpoint_version_member.name, 'r') as f:
                    checkpoint_version = json.load(f).get(
                        'checkpoint_version', None)
                logging.info(
                    (f'Detected model parallel .nemo file: {restore_path}. '
                     f'Assuming megatron model parallelism with '
                     f'model_parallel_size: {app_state.model_parallel_size} '
                     f'and checkpoint version: {checkpoint_version}'))
            tar.close()
            os.chdir(cwd)

        if app_state.model_parallel_size is not None:
            if not isinstance(trainer, Trainer):
                raise ValueError(
                    "trainer must be a PyTorch Lightning Trainer to restore model parallel .nemo files."
                )

            if checkpoint_version is None:
                raise ValueError(
                    "Restoring from megatron model parallel .nemo but could not find megatron checkpoint version."
                )
            else:
                logging.info(
                    f"Setting megatron checkpoint version: {checkpoint_version}"
                )
                set_checkpoint_version(checkpoint_version)

            app_state.world_size = trainer.num_gpus * trainer.num_nodes

            if trainer.local_rank is not None:
                app_state.local_rank = trainer.local_rank
            else:
                raise ValueError(
                    "trainer.local_rank is None. local_rank needed to restore model parallel models."
                )

            model_parallel_rank = compute_model_parallel_rank(
                trainer.local_rank, app_state.model_parallel_size)
            app_state.model_parallel_rank = model_parallel_rank

            cls.update_save_restore_connector(save_restore_connector)
            restored_model = cls._save_restore_connector.restore_from(
                cls, app_state.model_restore_path, override_config_path,
                map_location, strict, return_config)
            restored_model.set_trainer(trainer)
            return restored_model
        else:
            return super().restore_from(
                app_state.model_restore_path,
                override_config_path,
                map_location,
                strict,
                return_config,
                save_restore_connector=save_restore_connector,
            )
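
A sketch of restoring a model parallel .nemo archive through the method above. The class name, file path, and GPU counts are placeholders; the point is that a PyTorch Lightning Trainer must be supplied, because the model parallel branch above raises a ValueError without one:

from pytorch_lightning import Trainer

# Placeholder model class and archive name; any model class exposing this
# restore_from would work the same way.
trainer = Trainer(gpus=2, num_nodes=1)
model = MyMegatronModel.restore_from(
    restore_path="my_model_tp2.nemo",  # hypothetical tensor parallel .nemo archive
    trainer=trainer,                   # required for the model parallel restore path
)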
Example #4
def get_megatron_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
) -> Tuple[MegatronBertEncoder, str]:
    """
    Returns MegatronBertEncoder and a default or user-specified path to the checkpoint file

    Args:
        pretrained_model_name: model name from MEGATRON_CONFIG_MAP
            for example: megatron-bert-cased
        config_dict: model configuration parameters
        config_file: path to model configuration file. Takes precedence over config_dict if both supplied.
        checkpoint_file: path to checkpoint file or directory if using model parallel.

    Returns:
        model: MegatronBertEncoder
        checkpoint_file: path to checkpoint file or directory
    """
    config = None
    # get default config and checkpoint
    if config_file:
        with open(config_file) as f:
            config = json.load(f)
            # replace dashes with underscores in config keys
            fixed_config = {}
            for key in config.keys():
                fixed_key = key.replace("-", "_")
                if fixed_key == 'max_seq_length':
                    fixed_key = 'max_position_embeddings'
                fixed_config[fixed_key] = config[key]
            # 'vocab_size' is no longer used.
            if 'vocab_size' in fixed_config:
                fixed_config.pop('vocab_size')
            config = fixed_config
    elif config_dict:
        config = config_dict
    elif pretrained_model_name in get_megatron_lm_models_list():
        config = get_megatron_config(pretrained_model_name)
    else:
        raise ValueError(f"{pretrained_model_name} is not supported")

    if config is None:
        raise ValueError(
            f"config_file or config_dict is required for {pretrained_model_name}"
        )

    if not checkpoint_file:
        checkpoint_file = get_megatron_checkpoint(pretrained_model_name)

    vocab = get_megatron_vocab_file(pretrained_model_name)

    # if checkpoint path is a directory, then we automatically compute model parallel size,
    # and model parallel rank
    if os.path.isdir(checkpoint_file):
        app_state = AppState()
        model_parallel_size = len(os.listdir(checkpoint_file))
        app_state.model_parallel_size = model_parallel_size
        logging.info((f'restore_path: {checkpoint_file} is a directory. '
                      f'Assuming megatron model parallelism with '
                      f'model_parallel_size: {model_parallel_size}'))
        # try to get the local rank from the environment
        local_rank = None
        try:
            local_rank = int(os.environ['LOCAL_RANK'])
        except (KeyError, ValueError):
            logging.info('Environment variable LOCAL_RANK not yet specified')
        if local_rank is not None:
            app_state.local_rank = local_rank
        else:
            # if local_rank is None then assume we are on the main process
            local_rank = 0
        model_parallel_rank = compute_model_parallel_rank(
            local_rank, model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank
    else:
        model_parallel_size = None
        model_parallel_rank = None

    model = MegatronBertEncoder(
        model_name=pretrained_model_name,
        config=config,
        vocab_file=vocab,
        model_parallel_size=model_parallel_size,
        model_parallel_rank=model_parallel_rank,
    )

    return model, checkpoint_file
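
A usage sketch for the helper above, reusing the megatron-bert-cased name from the docstring; when checkpoint_file is omitted, the default checkpoint for that pretrained name is resolved internally:

# Hypothetical call: whether this pretrained name is actually present in
# MEGATRON_CONFIG_MAP depends on the installed NeMo version.
model, checkpoint_file = get_megatron_lm_model(
    pretrained_model_name="megatron-bert-cased",
)
print(type(model).__name__)  # MegatronBertEncoder
print(checkpoint_file)       # resolved path to the checkpoint file (or directory)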