from typing import List, Optional

from hydra.utils import instantiate
from transformers import AutoConfig, AutoModel

from nemo.utils import logging


def get_tokenizer_list() -> List[str]:
    """Returns all supported tokenizer names."""
    s = set(get_pretrained_lm_models_list())
    s.update(set(get_huggingface_pretrained_lm_models_list(include_external=True)))
    return ["sentencepiece", "char", "word"] + list(s)
def get_pretrained_lm_models_list(include_external: bool = False) -> List[str]:
    """Returns the list of supported pretrained model names.

    Args:
        include_external: If True, include all HuggingFace model names, not only
            the language models supported in NeMo.
    """
    return get_huggingface_pretrained_lm_models_list(include_external=include_external)
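# Usage sketch for the two listing helpers above (illustrative, not part of the
# module itself): the exact names returned depend on the installed NeMo and
# transformers versions.
#
#     >>> "sentencepiece" in get_tokenizer_list()
#     True
#     >>> names = get_pretrained_lm_models_list(include_external=True)
#     >>> isinstance(names, list)
#     True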
def __init__(
    self,
    model_name: Optional[str] = None,
    pretrained: bool = False,
    config_dict: Optional[dict] = None,
    checkpoint_file: Optional[str] = None,
):
    """Gets a HuggingFace-based model to be used as an encoder in NeMo NLP.

    Use the model_name arg to get a named model architecture. Available model names
    can be found with get_huggingface_pretrained_lm_models_list() or by going to
    https://huggingface.co/models. Use the pretrained arg to get the named model
    architecture with or without pretrained weights.

    If model_name is None, a custom configuration can be passed in via config_dict.
    For example, to instantiate a HuggingFace BERT model with a custom configuration:

        config_dict={
            '_target_': 'transformers.BertConfig',
            'hidden_size': 1536
        }

    Args:
        model_name (Optional[str]): Named model architecture from HuggingFace.
            Defaults to None.
        pretrained (bool): If True, use pretrained weights; if False, use the same
            architecture with randomly initialized weights. Defaults to False.
        config_dict (Optional[dict]): Custom configuration for the HuggingFace model.
            Defaults to None.
        checkpoint_file (Optional[str]): Path to a local checkpoint providing weights
            for the transformer. Defaults to None.
    """
    super().__init__()

    if checkpoint_file:
        raise NotImplementedError('Restoring from checkpoint file not implemented yet.')

    model = None
    if model_name is not None:
        if model_name in get_huggingface_pretrained_lm_models_list(include_external=True):
            if pretrained:
                if config_dict:
                    # Callers may inject vocab_size; drop it before verifying that
                    # no other configuration overrides were requested. (Guarding the
                    # pop avoids crashing when config_dict is None, the expected
                    # case for pretrained models.)
                    config_dict.pop('vocab_size', None)
                if config_dict:
                    raise ValueError(
                        f'When using pretrained model, config_dict should be None or empty. Got: {config_dict}'
                    )
                model = AutoModel.from_pretrained(model_name)
            else:
                # Same architecture, randomly initialized weights.
                cfg = AutoConfig.from_pretrained(model_name)
                model = AutoModel.from_config(cfg)
        else:
            logging.error(f'{model_name} not found in list of HuggingFace pretrained models')
    else:
        if pretrained:
            raise ValueError(f'If not using model_name, then pretrained should be False. Got: {pretrained}.')
        # Build the model from a custom config, e.g. {'_target_': 'transformers.BertConfig', ...}.
        cfg = instantiate(config_dict)
        model = AutoModel.from_config(cfg)

    if model is None:
        raise ValueError(f'Unable to instantiate a HuggingFace model for model_name={model_name}.')

    self._hidden_size = model.config.hidden_size
    self._vocab_size = model.config.vocab_size
    self._encoder = model
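# Usage sketch for the constructor above. The enclosing class is not shown in
# this excerpt, so the name `HuggingFaceEncoderModule` below is an assumption
# based on NeMo naming conventions and may differ in the actual source.
#
#     # Named architecture with pretrained weights:
#     encoder = HuggingFaceEncoderModule(model_name='bert-base-uncased', pretrained=True)
#
#     # Custom architecture with randomly initialized weights:
#     encoder = HuggingFaceEncoderModule(
#         config_dict={'_target_': 'transformers.BertConfig', 'hidden_size': 1536}
#     )
#     encoder._hidden_size  # -> 1536, taken from model.config.hidden_size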