def test_ids_to_text(self, test_data_dir):
    """Round-trip check: text -> ids -> text must reproduce the input exactly."""
    tokenizer = YouTokenToMeTokenizer(test_data_dir + self.model_name)
    original = "a b c e f g h i"
    round_tripped = tokenizer.ids_to_text(tokenizer.text_to_ids(original))
    assert round_tripped == original
def test_text_to_ids(self, test_data_dir):
    """Special-token strings typed literally into raw text must not be encoded
    as the special-token ids."""
    tokenizer = YouTokenToMeTokenizer(test_data_dir + self.model_name)
    ids = tokenizer.text_to_ids("<BOS> a b c <UNK> e f g h i <EOS>")
    for special_id in (tokenizer.bos_id, tokenizer.unk_id, tokenizer.eos_id):
        assert ids.count(special_id) == 0
def test_text_to_tokens(self, test_data_dir):
    """Special-token strings typed literally into raw text must not survive
    tokenization as single special tokens."""
    tokenizer = YouTokenToMeTokenizer(test_data_dir + self.model_name)
    produced = tokenizer.text_to_tokens("<BOS> a b c e <UNK> f g h i <EOS>")
    for special in ("<BOS>", "<UNK>", "<EOS>"):
        assert produced.count(special) == 0
def test_tokens_to_ids(self, test_data_dir):
    """tokens_to_ids preserves length and maps each explicit special token to
    the tokenizer's corresponding special id."""
    tokenizer = YouTokenToMeTokenizer(test_data_dir + self.model_name)
    tokens = "<BOS> a b c <UNK> e f <UNK> g h i <EOS>".split()
    ids = tokenizer.tokens_to_ids(tokens)
    assert len(ids) == len(tokens)
    # Expected multiplicity of each special id in the encoded sequence.
    expected = {tokenizer.bos_id: 1, tokenizer.eos_id: 1, tokenizer.unk_id: 2}
    for special_id, count in expected.items():
        assert ids.count(special_id) == count
def test_ids_to_tokens(self, test_data_dir):
    """tokens -> ids -> tokens is the identity mapping, element by element."""
    tokenizer = YouTokenToMeTokenizer(test_data_dir + self.model_name)
    tokens = "<BOS> a b c <UNK> e f <UNK> g h i <EOS>".split()
    result = tokenizer.ids_to_tokens(tokenizer.tokens_to_ids(tokens))
    assert len(result) == len(tokens)
    for got, expected in zip(result, tokens):
        assert got == expected
def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        vocab_file: path to vocab file
        merges_file: path to merges file (used by BPE-based HuggingFace/Megatron tokenizers)
        special_tokens: dict of special tokens
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt the standard
            segmentation procedure of BPE to help model better learn word compositionality and become
            robust to segmentation errors. It has empirically been shown to improve inference time
            BLEU scores.
    """
    special_tokens_dict = {} if special_tokens is None else special_tokens

    if 'megatron' in tokenizer_name:
        if not HAVE_APEX:
            raise RuntimeError("Apex required to use megatron.")
        if vocab_file is None:
            # Download the pretrained Megatron vocab/merges files when none is supplied.
            vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file(
                tokenizer_name
            )
            merges_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_merges_file(
                tokenizer_name
            )
        tokenizer_name = get_megatron_tokenizer(tokenizer_name)

    if tokenizer_name == 'sentencepiece':
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens, legacy=True
        )
    elif tokenizer_name == 'yttm':
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout)
    elif tokenizer_name == 'word':
        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    elif tokenizer_name == 'char':
        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)

    # Anything else is treated as a HuggingFace pretrained model name.
    logging.info(
        f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}"
    )
    return AutoTokenizer(
        pretrained_model_name=tokenizer_name,
        vocab_file=vocab_file,
        merges_file=merges_file,
        **special_tokens_dict,
        use_fast=use_fast,
    )
def get_nmt_tokenizer(
    library: str = 'yttm',
    model_name: Optional[str] = None,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
    r2l: Optional[bool] = False,
):
    """
    Args:
        library: tokenizer library to use: 'yttm', 'huggingface', 'sentencepiece',
            'megatron', or 'byte-level'
        model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        vocab_file: path to vocab file
        merges_file: path to merges file (used by BPE-based HuggingFace/Megatron tokenizers)
        special_tokens: dict of special tokens
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt the standard
            segmentation procedure of BPE to help model better learn word compositionality and become
            robust to segmentation errors. It has empirically been shown to improve inference time
            BLEU scores.
        r2l: Whether to return subword IDs from right to left
    """
    special_tokens_dict = {} if special_tokens is None else special_tokens

    if library == 'yttm':
        logging.info(f'Getting YouTokenToMeTokenizer with model: {tokenizer_model} with r2l: {r2l}.')
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout, r2l=r2l)
    elif library == 'huggingface':
        logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}')
        return AutoTokenizer(
            pretrained_model_name=model_name,
            vocab_file=vocab_file,
            merges_file=merges_file,
            **special_tokens_dict,
            use_fast=use_fast,
        )
    elif library == 'sentencepiece':
        logging.info(f'Getting SentencePiece with model: {tokenizer_model}')
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens_dict
        )
    elif library == 'byte-level':
        # Fix: no placeholders in this message, so a plain string (not an f-string) is used.
        logging.info('Using byte-level tokenization')
        return ByteLevelTokenizer()
    elif library == 'megatron':
        # Map NeMo-style aliases to the canonical Megatron tokenizer name when known.
        if model_name in megatron_tokenizer_model_map:
            model_name = megatron_tokenizer_model_map[model_name]
        logging.info(
            f'Getting Megatron tokenizer for pretrained model name: {model_name} and custom vocab file: {vocab_file}'
        )
        return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file)
    else:
        # Fix: the original implicit string concatenation produced "tokenizerlibraries."
        # (missing space between the two adjacent literals).
        raise NotImplementedError(
            'Currently we only support "yttm", "huggingface", "sentencepiece", "megatron", and '
            '"byte-level" tokenizer libraries.'
        )
def get_nmt_tokenizer(
    library: str = 'yttm',
    model_name: Optional[str] = None,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Args:
        library: tokenizer library to use: 'yttm', 'huggingface', or 'sentencepiece'
        model_name: if using a pretrained model from NeMo or HuggingFace
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        vocab_file: path to vocab file
        special_tokens: dict of special tokens
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt the standard
            segmentation procedure of BPE to help model better learn word compositionality and become
            robust to segmentation errors. It has empirically been shown to improve inference time
            BLEU scores.
    """
    special_tokens_dict = {} if special_tokens is None else special_tokens

    if library == 'yttm':
        logging.info(f'Getting YouTokenToMeTokenizer with model: {tokenizer_model}.')
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout)
    elif library == 'huggingface':
        logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}')
        return AutoTokenizer(
            pretrained_model_name=model_name,
            vocab_file=vocab_file,
            **special_tokens_dict,
            use_fast=use_fast,
        )
    elif library == 'sentencepiece':
        # Fix: log the model file actually loaded (tokenizer_model), not model_name,
        # which is ignored by this branch.
        logging.info(f'Getting SentencePiece with model: {tokenizer_model}')
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens_dict
        )
    else:
        raise NotImplementedError(
            'Currently we only support "yttm", "huggingface", and "sentencepiece" tokenizer library.'
        )
def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
    """
    special_tokens_dict = special_tokens if special_tokens is not None else {}

    if 'megatron' in tokenizer_name:
        # Fetch the pretrained Megatron vocab file when none was provided explicitly.
        if vocab_file is None:
            vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file(
                tokenizer_name
            )
        tokenizer_name = get_megatron_tokenizer(tokenizer_name)

    if tokenizer_name == 'sentencepiece':
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens
        )
    if tokenizer_name == 'yttm':
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout)
    if tokenizer_name == 'word':
        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    if tokenizer_name == 'char':
        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)

    # Any other name is assumed to be a HuggingFace pretrained model.
    return AutoTokenizer(
        pretrained_model_name=tokenizer_name,
        vocab_file=vocab_file,
        **special_tokens_dict,
        use_fast=use_fast,
    )
def get_nmt_tokenizer(
    library: str = 'yttm',
    model_name: Optional[str] = None,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Args:
        model_name: if using a pretrained model from NeMo or HuggingFace
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
    """
    if library == 'yttm':
        logging.info(f'Getting YouTokenToMeTokenizer with model: {tokenizer_model}.')
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout)

    if library == 'huggingface':
        # special_tokens is only consulted for the HuggingFace path.
        special_tokens_dict = special_tokens if special_tokens is not None else {}
        logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}')
        return AutoTokenizer(
            pretrained_model_name=model_name,
            vocab_file=vocab_file,
            **special_tokens_dict,
            use_fast=use_fast,
        )

    raise NotImplementedError(
        'Currently we only support "yttm" and "huggingface" tokenizer library.'
    )