Example #1
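Builds a tokenizer directly from local vocab/merges files, checks BPE tokenization of a toy string, and verifies token-to-id conversion including the bos/eos special tokens.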
    def test_full_blenderbot_small_tokenizer(self):
        tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file,
                                             **self.special_tokens_map)
        text = "adapt act apte"
        # "@@" marks a non-final BPE piece: "apte" splits into "ap@@" + "te".
        bpe_tokens = ["adapt", "act", "ap@@", "te"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token]

        # bos, the four BPE tokens, and eos map to consecutive ids 0-5
        # in the toy vocabulary.
        input_bpe_tokens = [0, 1, 2, 3, 4, 5]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens),
                             input_bpe_tokens)
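Example #1 depends on a fixture the snippet omits: self.vocab_file, self.merges_file, and self.special_tokens_map. Below is a minimal setUp sketch, assuming a toy vocabulary and merge rules chosen so the ids come out as the expected [0, 1, 2, 3, 4, 5]; the file names and merge list are assumptions, and json, os, and tempfile must be imported.

    # Hypothetical fixture (not part of the original snippet): writes a toy
    # vocab/merges pair to disk so that __start__=0, adapt=1, act=2,
    # ap@@=3, te=4, __end__=5, __unk__=6.
    def setUp(self):
        vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
        merges = ["#version: 0.2", "a p", "t e</w>", "ap t</w>",
                  "a d", "ad apt</w>", "a c", "ac t</w>", ""]
        self.special_tokens_map = {"bos_token": "__start__",
                                   "eos_token": "__end__",
                                   "unk_token": "__unk__"}
        self.tmpdirname = tempfile.mkdtemp()
        self.vocab_file = os.path.join(self.tmpdirname, "vocab.json")
        self.merges_file = os.path.join(self.tmpdirname, "merges.txt")
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(dict(zip(vocab, range(len(vocab))))))
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))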
Example #2
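Loads the pretrained facebook/blenderbot-90M tokenizer and checks that a sentence-final period is encoded the same way as a standalone one.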
    def test_empty_word_small_tok(self):
        tok = BlenderbotSmallTokenizer.from_pretrained(
            "facebook/blenderbot-90M")
        src_text = "I am a small frog ."
        src_text_dot = "."
        encoded = tok(src_text)["input_ids"]
        encoded_dot = tok(src_text_dot)["input_ids"]

        # The sentence-final " ." must get the same id as a standalone ".".
        assert encoded[-1] == encoded_dot[0]
Example #3
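Checks token-id lookup on the pretrained tokenizer and documents that the encode/decode round trip is lossy.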
    def test_special_tokens_small_tok(self):
        tok = BlenderbotSmallTokenizer.from_pretrained(
            "facebook/blenderbot-90M")
        # "sam" encodes to a single id (1384) in the pretrained vocab.
        assert tok("sam").input_ids == [1384]
        src_text = "I am a small frog."
        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
        decoded = tok.batch_decode(encoded,
                                   skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)[0]
        # Decoding is lossy: the text comes back lowercased, with a space
        # inserted before the final period.
        assert src_text != decoded  # I wish it did!
        assert decoded == "i am a small frog ."
Example #4
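A test helper that reloads the tokenizer files saved under the test's temporary directory, applying the fixture's special-tokens map on top of any keyword arguments.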
    def get_tokenizer(self, **kwargs):
        # Apply the fixture's special-tokens map over the caller's overrides,
        # then reload the tokenizer saved in the temp directory.
        kwargs.update(self.special_tokens_map)
        return BlenderbotSmallTokenizer.from_pretrained(
            self.tmpdirname, **kwargs)
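For context, a hypothetical call site for this helper, reusing the toy fixture sketched under Example #1 (the test name and assertion are illustrative additions, not part of the original):

    def test_get_tokenizer(self):
        # Illustrative only: reload from the temp dir and re-check the
        # tokenization from Example #1.
        tokenizer = self.get_tokenizer()
        assert tokenizer.tokenize("adapt act apte") == ["adapt", "act", "ap@@", "te"]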