def test_text_to_ids(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)

    # <cls> is a user_defined_symbol in the test tokenizer model;
    # <unk>, <sep>, <s>, and </s> are control symbols.
    # User-defined symbols are recognized in raw text, control symbols are not,
    # so only <cls> should show up in the ids.
    text = "<cls> a b c <sep> e f g h i </s>"
    ids = tokenizer.text_to_ids(text)

    assert ids.count(tokenizer.token_to_id("<cls>")) == 1
    assert ids.count(tokenizer.token_to_id("<sep>")) == 0
    assert ids.count(tokenizer.token_to_id("</s>")) == 0
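# A minimal sketch (an assumption, not part of this suite) of how a tokenizer model
# with this symbol layout could be trained with the sentencepiece library; the corpus
# path, output prefix, and vocab size below are illustrative placeholders:
#
#   import sentencepiece as spm
#
#   spm.SentencePieceTrainer.train(
#       input="corpus.txt",              # hypothetical training text
#       model_prefix="test_tokenizer",   # hypothetical output prefix
#       vocab_size=128,
#       user_defined_symbols=["<cls>"],  # matched when they appear in raw text
#       control_symbols=["<sep>"],       # reserved ids, never produced from raw text
#   )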
def test_text_to_ids(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
    special_tokens = MODEL_SPECIAL_TOKENS
    tokenizer.add_special_tokens(special_tokens)

    text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
    ids = tokenizer.text_to_ids(text)

    # Every whitespace-separated piece, including each special token,
    # should map to exactly one id.
    assert len(ids) == len(text.split())
    assert ids.count(tokenizer.token_to_id("[CLS]")) == 1
    assert ids.count(tokenizer.token_to_id("[MASK]")) == 1
    assert ids.count(tokenizer.token_to_id("[SEP]")) == 2
def test_tokens_to_ids(self, test_data_dir):
    tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)

    tokens = ["<cls>", "a", "b", "c", "<sep>", "e", "f", "<sep>", "g", "h", "i", "</s>"]
    ids = tokenizer.tokens_to_ids(tokens)

    # tokens_to_ids maps each token to a single id, so the special-token counts carry over.
    assert len(ids) == len(tokens)
    assert ids.count(tokenizer.token_to_id("<cls>")) == 1
    assert ids.count(tokenizer.token_to_id("</s>")) == 1
    assert ids.count(tokenizer.token_to_id("<sep>")) == 2