def test_ids_to_tokens(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)

    # Register the model's special tokens so they are tokenized as single pieces.
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)

    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    tokens = tokenizer.text_to_tokens(text)
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)

    # The tokens -> ids -> tokens round trip must reproduce the original tokens.
    assert len(result) == len(tokens)

    for i in range(len(result)):
        assert result[i] == tokens[i]
def test_ids_to_tokens(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)

    # Start from an explicit token sequence that includes special pieces.
    tokens = ["<cls>", "a", "b", "c", "<sep>", "e", "f", "<sep>", "g", "h", "i", "</s>"]
    ids = tokenizer.tokens_to_ids(tokens)
    result = tokenizer.ids_to_tokens(ids)

    # The tokens -> ids -> tokens round trip must reproduce the original tokens.
    assert len(result) == len(tokens)

    for i in range(len(result)):
        assert result[i] == tokens[i]
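
# A minimal sketch (not part of the test suite) of the round-trip property the two
# tests above verify: ids_to_tokens(tokens_to_ids(tokens)) should reproduce the
# original token sequence. It only assumes the tokenizer methods already used above
# (text_to_tokens, tokens_to_ids, ids_to_tokens) and the SentencePieceTokenizer
# import present elsewhere in this file; `model_path` is a hypothetical path to a
# trained SentencePiece model.
def _roundtrip_sketch(model_path: str, text: str) -> bool:
    tokenizer = SentencePieceTokenizer(model_path)
    tokens = tokenizer.text_to_tokens(text)
    recovered = tokenizer.ids_to_tokens(tokenizer.tokens_to_ids(tokens))
    return recovered == tokens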