def test_add_special_tokens(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name, legacy=True)
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)
    assert tokenizer.vocab_size == tokenizer.original_vocab_size + len(set(special_tokens.values()))
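# For context, MODEL_SPECIAL_TOKENS (defined elsewhere in the test module) maps
# special-token roles to token strings. The assertion above counts distinct
# *values* because several roles may share one string. The dict below is an
# illustrative assumption of its shape, not the real constant:
EXAMPLE_SPECIAL_TOKENS = {
    "pad_token": "[PAD]",
    "unk_token": "[UNK]",
    "bos_token": "[CLS]",
    "cls_token": "[CLS]",   # shares "[CLS]" with bos_token
    "eos_token": "[SEP]",
    "sep_token": "[SEP]",   # shares "[SEP]" with eos_token
    "mask_token": "[MASK]",
}
# set(EXAMPLE_SPECIAL_TOKENS.values()) has 5 entries, so the vocab would grow by 5,
# which is why the test uses len(set(special_tokens.values())) rather than len(special_tokens).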
def test_tokens_to_text(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    result = tokenizer.tokens_to_text(tokens)
    assert text == result
def test_ids_to_text(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    text = "<cls> a b c <sep> e f g h i </s>"
    ids = tokenizer.text_to_ids(text)
    result = tokenizer.ids_to_text(ids)
    assert text == result
def test_tokens_to_text(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    # <cls> is a user_defined_symbol in the test tokenizer model
    text = "<cls> a b c e f g h i"
    tokens = tokenizer.text_to_tokens(text)
    result = tokenizer.tokens_to_text(tokens)
    assert text == result
def test_text_to_ids(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    # <cls> is a user_defined_symbol in the test tokenizer model
    # <unk>, <sep>, <s>, and </s> are control symbols
    ids = tokenizer.text_to_ids("<cls> a b c <sep> e f g h i </s>")
    assert ids.count(tokenizer.token_to_id("<cls>")) == 1
    assert ids.count(tokenizer.token_to_id("<sep>")) == 0
    assert ids.count(tokenizer.token_to_id("</s>")) == 0
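# The assertions above rely on SentencePiece's distinction between the two symbol
# kinds: user-defined symbols are extracted from raw text as whole pieces, while
# control symbols are reserved ids that encoding raw text never emits (they must
# be inserted by the application). A minimal sketch of training a model with that
# behavior; the corpus path, prefix, and vocab size are hypothetical, not the
# recipe used for the actual test asset:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="corpus.txt",                 # hypothetical training text
    model_prefix="toy_spm",             # writes toy_spm.model / toy_spm.vocab
    vocab_size=128,                     # hypothetical size
    user_defined_symbols=["<cls>"],     # encoded as single pieces when seen in raw text
    control_symbols=["<sep>"],          # reserved ids, never produced by encoding raw text
)

sp = spm.SentencePieceProcessor(model_file="toy_spm.model")
# "<cls>" comes out as one piece; "<sep>" is split into ordinary sub-pieces.
print(sp.encode("<cls> a b <sep>", out_type=str))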
def test_text_to_tokens(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name, legacy=True)
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    assert len(tokens) == len(text.split())
    assert tokens.count("[CLS]") == 1
    assert tokens.count("[MASK]") == 1
    assert tokens.count("[SEP]") == 2
def test_ids_to_tokens(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    tokens = ["<cls>", "a", "b", "c", "<sep>", "e", "f", "<sep>", "g", "h", "i", "</s>"]
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)
    assert len(result) == len(tokens)
    for i in range(len(result)):
        assert result[i] == tokens[i]
def test_tokens_to_ids(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name, legacy=True)
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    assert len(ids) == len(tokens)
    assert ids.count(tokenizer.token_to_id("[CLS]")) == 1
    assert ids.count(tokenizer.token_to_id("[MASK]")) == 1
    assert ids.count(tokenizer.token_to_id("[SEP]")) == 2
def test_ids_to_text(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name, legacy=True)
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    ids = tokenizer.text_to_ids(text)
    result = tokenizer.ids_to_text(ids)
    assert text == result
def test_ids_to_tokens(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name, legacy=True)
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)
    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)
    assert len(result) == len(tokens)
    for i in range(len(result)):
        assert result[i] == tokens[i]
def get_monolingual_tokenizer(
    tokenizer_name=None,
    tokenizer_model=None,
    bpe_dropout=0.0,
):
    if tokenizer_name == 'yttm':
        if bpe_dropout is None:
            bpe_dropout = 0.0
        tokenizer = get_tokenizer(
            tokenizer_name=tokenizer_name,
            tokenizer_model=tokenizer_model,
            bpe_dropout=bpe_dropout,
        )
    elif tokenizer_name == 'sentencepiece':
        tokenizer = SentencePieceTokenizer(model_path=tokenizer_model)
    else:
        try:
            tokenizer = get_tokenizer(tokenizer_name, special_tokens={"pad_token": "[PAD]"})
        except Exception as e:
            raise ValueError(f'{tokenizer_name} is not supported by either NeMo or HuggingFace. {e}')

    return tokenizer
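# A short usage sketch of get_monolingual_tokenizer for the 'sentencepiece' branch.
# The model path below is a hypothetical placeholder, not a shipped asset:
tokenizer = get_monolingual_tokenizer(
    tokenizer_name='sentencepiece',
    tokenizer_model='tokenizer.model',   # hypothetical path to a trained .model file
)
ids = tokenizer.text_to_ids("a b c")
print(tokenizer.ids_to_text(ids))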
def test_tokens_to_ids(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    tokens = ["<cls>", "a", "b", "c", "<sep>", "e", "f", "<sep>", "g", "h", "i", "</s>"]
    ids = tokenizer.tokens_to_ids(tokens)
    assert len(ids) == len(tokens)
    assert ids.count(tokenizer.token_to_id("<cls>")) == 1
    assert ids.count(tokenizer.token_to_id("</s>")) == 1
    assert ids.count(tokenizer.token_to_id("<sep>")) == 2
class EnJaTokenizer:
    """
    Tokenizer for Japanese & English that does Moses tokenization followed by SentencePiece.

    Args:
        sp_tokenizer_model_path: String path to a SentencePiece model.
        lang_id: One of ['en', 'ja'].
    """

    def __init__(self, sp_tokenizer_model_path: str, lang_id: str):
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.sp_tokenizer = SentencePieceTokenizer(model_path=sp_tokenizer_model_path)

    def sp_tokenize(self, text: str) -> str:
        return ' '.join(self.sp_tokenizer.text_to_tokens(text))

    def tokenize(self, text, escape=False, return_str=False):
        """Tokenizes text using Moses -> SentencePiece."""
        text = self.moses_tokenizer.tokenize(text, escape=escape, return_str=True)
        text = self.sp_tokenize(text)
        return text if return_str else text.split()
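# A brief usage sketch of EnJaTokenizer; the .model path is a hypothetical placeholder:
en_tokenizer = EnJaTokenizer(sp_tokenizer_model_path='spm_enja.model', lang_id='en')
pieces = en_tokenizer.tokenize("This is a test.")                    # list of SentencePiece pieces
as_str = en_tokenizer.tokenize("This is a test.", return_str=True)   # same pieces, space-joined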