Example #1
    def test_add_special_tokens(self, test_data_dir):
        tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name, legacy=True)
        special_tokens = MODEL_SPECIAL_TOKENS
        tokenizer.add_special_tokens(special_tokens)

        # Each distinct special-token string should add exactly one new vocab entry.
        assert tokenizer.vocab_size == tokenizer.original_vocab_size + len(set(special_tokens.values()))
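
The fixtures these examples rely on are not shown in the listing. As a rough sketch, MODEL_SPECIAL_TOKENS is presumably a dict mapping special-token roles to the literal strings used in the test text, and SentencePieceTokenizer comes from NeMo's common tokenizer collection; the import path and the exact values below are assumptions inferred from how the tests use them, not taken from the listing.

from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer

# Assumed mapping of special-token roles to the literal strings that appear
# in the test text ("[CLS] a b c [MASK] e f [SEP] ..."). Values are illustrative.
MODEL_SPECIAL_TOKENS = {
    "bos_token": "[CLS]",
    "cls_token": "[CLS]",
    "eos_token": "[SEP]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
    "pad_token": "[PAD]",
    "unk_token": "[UNK]",
}

With a mapping like this, set(special_tokens.values()) in Example #1 collapses roles that share a string (bos/cls and eos/sep), which is why the vocab-size assertion counts unique tokens rather than dict entries.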
Example #2
    def test_ids_to_text(self, test_data_dir):
        tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
        special_tokens = MODEL_SPECIAL_TOKENS
        tokenizer.add_special_tokens(special_tokens)

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        ids = tokenizer.text_to_ids(text)
        result = tokenizer.ids_to_text(ids)

        # Encoding then decoding should round-trip the text, special tokens included.
        assert text == result
Example #3
    def test_text_to_ids(self, test_data_dir):
        tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
        special_tokens = MODEL_SPECIAL_TOKENS
        tokenizer.add_special_tokens(special_tokens)

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        ids = tokenizer.text_to_ids(text)

        # One id per whitespace-separated token; each special token maps to a single id.
        assert len(ids) == len(text.split())
        assert ids.count(tokenizer.token_to_id("[CLS]")) == 1
        assert ids.count(tokenizer.token_to_id("[MASK]")) == 1
        assert ids.count(tokenizer.token_to_id("[SEP]")) == 2
Example #4
    def test_ids_to_tokens(self, test_data_dir):
        tokenizer = SentencePieceTokenizer(test_data_dir + self.model_name)
        special_tokens = MODEL_SPECIAL_TOKENS
        tokenizer.add_special_tokens(special_tokens)

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        tokens = tokenizer.text_to_tokens(text)
        ids = tokenizer.tokens_to_ids(tokens)
        result = tokenizer.ids_to_tokens(ids)

        # ids_to_tokens should exactly invert tokens_to_ids.
        assert len(result) == len(tokens)

        for i in range(len(result)):
            assert result[i] == tokens[i]
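
For trying the same calls outside the pytest class, here is a minimal standalone sketch. The model path is a placeholder, the import and token mapping are the assumptions sketched after Example #1, and legacy=True is passed as in Example #1 (whether it is required for add_special_tokens depends on the tokenizer version).

# Standalone round trip mirroring Examples #2 and #3.
# "tokenizer.model" is a placeholder path to any trained SentencePiece model.
tokenizer = SentencePieceTokenizer("tokenizer.model", legacy=True)
tokenizer.add_special_tokens(MODEL_SPECIAL_TOKENS)

text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
ids = tokenizer.text_to_ids(text)

assert tokenizer.ids_to_text(ids) == text              # decoding restores the original text
assert ids.count(tokenizer.token_to_id("[SEP]")) == 2  # each special token is a single id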