Example #1
File: lm_utils.py  Project: manneh/NeMo
def get_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
) -> BertModule:
    """
    Helper function to instantiate a language model encoder, either from scratch or a pretrained model.
    If only pretrained_model_name is passed, a pretrained model is returned.
    If a configuration is passed, whether as a file or dictionary, the model is initialized with random weights.

    Args:
        pretrained_model_name: pretrained model name, for example, bert-base-uncased or megatron-bert-cased.
            See get_pretrained_lm_models_list() for full list.
        config_dict: path to the model configuration dictionary
        config_file: path to the model configuration file
        checkpoint_file: path to the pretrained model checkpoint
        vocab_file: path to vocab_file to be used with Megatron-LM

    Returns:
        Pretrained BertModule
    """

    # check valid model type
    if not pretrained_model_name or pretrained_model_name not in get_pretrained_lm_models_list(
            include_external=False):
        logging.warning(
            f'{pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
            f'will be using AutoModel from HuggingFace.')

    # warning when user passes both configuration dict and file
    if config_dict and config_file:
        logging.warning(
            f"Both config_dict and config_file were found, defaulting to use config_file: {config_file} will be used."
        )

    if "megatron" in pretrained_model_name:
        model, checkpoint_file = get_megatron_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=pretrained_model_name,
            checkpoint_file=checkpoint_file,
            vocab_file=vocab_file,
        )
    else:
        model = get_huggingface_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=pretrained_model_name,
        )

    if checkpoint_file:
        app_state = AppState()
        if not app_state.is_model_being_restored and not os.path.exists(
                checkpoint_file):
            raise ValueError(f'{checkpoint_file} not found')
        model.restore_weights(restore_path=checkpoint_file)

    return model
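A minimal usage sketch for this signature. The import path below assumes the standard NeMo layout where lm_utils.py lives under nemo.collections.nlp.modules.common, and the config file name is a hypothetical placeholder:

from nemo.collections.nlp.modules.common.lm_utils import get_lm_model

# Load a pretrained HuggingFace encoder by name only.
encoder = get_lm_model(pretrained_model_name="bert-base-uncased")

# Initialize the same architecture with random weights from a config file
# (hypothetical path); a checkpoint_file could then restore trained weights.
scratch_encoder = get_lm_model(
    pretrained_model_name="bert-base-uncased",
    config_file="bert_config.json",
)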
Example #2
def get_lm_model(
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
    trainer: Optional[Trainer] = None,
    cfg: DictConfig = None,
) -> BertModule:
    """
    Helper function to instantiate a language model encoder, either from scratch or a pretrained model.
    If only a pretrained model name is set in cfg.language_model, a pretrained model is returned.
    If a configuration is passed, whether as a file or dictionary, the model is initialized with random weights.

    Args:
        config_dict: path to the model configuration dictionary
        config_file: path to the model configuration file
        vocab_file: path to vocab_file to be used with Megatron-LM
        trainer: an instance of a PyTorch Lightning trainer
        cfg: a model configuration
    Returns:
        Pretrained BertModule
    """

    # check valid model type
    if cfg.language_model.get('pretrained_model_name'):
        if (not cfg.language_model.pretrained_model_name
                or cfg.language_model.pretrained_model_name
                not in get_pretrained_lm_models_list(include_external=False)):
            logging.warning(
                f'{cfg.language_model.pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
                f'will be using AutoModel from HuggingFace.')

    # warning when user passes both configuration dict and file
    if config_dict and config_file:
        logging.warning(
            f"Both config_dict and config_file were found, defaulting to use config_file: {config_file} will be used."
        )

    pretrain_model_name = ''
    if cfg.get('language_model') and cfg.language_model.get(
            'pretrained_model_name', ''):
        pretrain_model_name = cfg.language_model.get('pretrained_model_name',
                                                     '')
    all_pretrained_megatron_bert_models = get_megatron_pretrained_bert_models()
    if (cfg.tokenizer is not None
            and cfg.tokenizer.get("tokenizer_name", "") is not None
            and "megatron" in cfg.tokenizer.get("tokenizer_name", "")
        ) or pretrain_model_name in all_pretrained_megatron_bert_models:
        import torch

        from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel

        class Identity(torch.nn.Module):
            def __init__(self):
                super(Identity, self).__init__()

            def forward(self, x, *args):
                return x

        if cfg.language_model.get("lm_checkpoint"):
            model = MegatronBertModel.restore_from(
                restore_path=cfg.language_model.lm_checkpoint, trainer=trainer)
        else:
            model = MegatronBertModel.from_pretrained(
                cfg.language_model.get('pretrained_model_name'),
                trainer=trainer)
        # remove the heads that are only relevant for pretraining
        model.model.lm_head = Identity()
        model.model.binary_head = Identity()
        model.model.language_model.pooler = Identity()

    else:
        model = get_huggingface_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=cfg.language_model.pretrained_model_name,
        )

        if cfg.language_model.get("lm_checkpoint"):
            app_state = AppState()
            if not app_state.is_model_being_restored and not os.path.exists(
                    cfg.language_model.lm_checkpoint):
                raise ValueError(
                    f'{cfg.language_model.lm_checkpoint} not found')
            model.restore_weights(
                restore_path=cfg.language_model.lm_checkpoint)

    return model
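A minimal usage sketch for this cfg-driven variant. The OmegaConf structure below mirrors the keys the function reads (language_model and tokenizer); the concrete values and the import path are illustrative assumptions:

from omegaconf import OmegaConf

from nemo.collections.nlp.modules.common.lm_utils import get_lm_model

cfg = OmegaConf.create({
    "language_model": {
        "pretrained_model_name": "bert-base-uncased",  # non-Megatron name -> HuggingFace path
        "lm_checkpoint": None,
    },
    "tokenizer": {"tokenizer_name": "bert-base-uncased"},
})

# Returns a HuggingFace encoder; a Megatron model or tokenizer name in cfg
# would route through MegatronBertModel instead.
encoder = get_lm_model(cfg=cfg)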
Example #3
def get_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
    nemo_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
    trainer: Optional[Trainer] = None,
) -> BertModule:
    """
    Helper function to instantiate a language model encoder, either from scratch or a pretrained model.
    If only pretrained_model_name is passed, a pretrained model is returned.
    If a configuration is passed, whether as a file or dictionary, the model is initialized with random weights.

    Args:
        pretrained_model_name: pretrained model name, for example, bert-base-uncased or megatron-bert-cased.
            See get_pretrained_lm_models_list() for full list.
        config_dict: path to the model configuration dictionary
        config_file: path to the model configuration file
        checkpoint_file: path to the pretrained model checkpoint
        nemo_file: path to a .nemo file with a pretrained Megatron-BERT model
        vocab_file: path to vocab_file to be used with Megatron-LM
        trainer: an instance of a PyTorch Lightning trainer, used when restoring from a .nemo file

    Returns:
        Pretrained BertModule
    """

    # check valid model type
    if not pretrained_model_name or pretrained_model_name not in get_pretrained_lm_models_list(include_external=False):
        logging.warning(
            f'{pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
            f'will be using AutoModel from HuggingFace.'
        )

    # warning when user passes both configuration dict and file
    if config_dict and config_file:
        logging.warning(
            f"Both config_dict and config_file were found, defaulting to use config_file: {config_file} will be used."
        )

    if nemo_file is not None:
        import torch

        from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel

        class Identity(torch.nn.Module):
            def __init__(self):
                super(Identity, self).__init__()

            def forward(self, x, *args):
                return x

        model = MegatronBertModel.restore_from(restore_path=nemo_file, trainer=trainer)
        # remove the heads that are only relevant for pretraining
        model.model.lm_head = Identity()
        model.model.binary_head = Identity()
        model.model.language_model.pooler = Identity()
    else:
        model = get_huggingface_lm_model(
            config_dict=config_dict, config_file=config_file, pretrained_model_name=pretrained_model_name,
        )

    if checkpoint_file:
        app_state = AppState()
        if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
            raise ValueError(f'{checkpoint_file} not found')
        model.restore_weights(restore_path=checkpoint_file)

    return model
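A minimal usage sketch for this variant, again assuming the lm_utils import path under nemo.collections.nlp.modules.common; the .nemo file name is a hypothetical placeholder:

from nemo.collections.nlp.modules.common.lm_utils import get_lm_model

# Megatron-BERT encoder restored from a .nemo archive; the pretraining-only
# heads are replaced with Identity modules inside get_lm_model.
megatron_encoder = get_lm_model(
    pretrained_model_name="megatron-bert-cased",
    nemo_file="megatron_bert.nemo",
)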