def test_full_blenderbot_small_tokenizer(self):
    tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
    text = "adapt act apte"
    bpe_tokens = ["adapt", "act", "ap@@", "te"]
    tokens = tokenizer.tokenize(text)
    self.assertListEqual(tokens, bpe_tokens)

    # Wrap with bos/eos and check the full token -> id mapping.
    input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token]
    input_bpe_tokens = [0, 1, 2, 3, 4, 5]
    self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
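# A minimal sketch (hypothetical helper, not part of the test suite or the
# library) of the "@@" continuation convention checked above: every
# non-final piece of a split word carries a trailing "@@", so the pieces
# can be rejoined losslessly.
def _rejoin_bpe_pieces(pieces):
    """Undo the "@@" continuation markers to recover the original text."""
    return " ".join(pieces).replace("@@ ", "")

# e.g. ["adapt", "act", "ap@@", "te"] -> "adapt act apte"
assert _rejoin_bpe_pieces(["adapt", "act", "ap@@", "te"]) == "adapt act apte"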
def test_empty_word_small_tok(self):
    tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
    src_text = "I am a small frog ."
    src_text_dot = "."
    encoded = tok(src_text)["input_ids"]
    encoded_dot = tok(src_text_dot)["input_ids"]

    # The trailing "." must get the same id as a standalone ".".
    assert encoded[-1] == encoded_dot[0]
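# A hedged follow-up sketch, using the same public API and checkpoint as the
# test above and assuming "." is a whole vocabulary entry: the shared id can
# be mapped back to its surface form with convert_ids_to_tokens().
def _check_dot_id_roundtrip():
    tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
    dot_id = tok(".")["input_ids"][0]
    assert tok.convert_ids_to_tokens([dot_id]) == ["."]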
def test_special_tokens_small_tok(self):
    tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
    assert tok("sam").input_ids == [1384]

    src_text = "I am a small frog."
    encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
    decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Decoding is lossy: casing and punctuation spacing are not preserved.
    assert src_text != decoded  # I wish it did!
    assert decoded == "i am a small frog ."
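# A minimal sketch (same checkpoint as above) of why "sam" encodes to a
# single id: it is a whole-word vocabulary entry, so tokenize() yields one
# piece with no "@@" continuation split, and convert_tokens_to_ids() one id.
def _check_sam_is_single_piece():
    tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
    assert tok.tokenize("sam") == ["sam"]
    assert tok.convert_tokens_to_ids(["sam"]) == [1384]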
def get_tokenizer(self, **kwargs):
    kwargs.update(self.special_tokens_map)
    return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs)
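# A usage sketch, assuming the standard TokenizerTesterMixin pattern where
# setUp() saves a toy vocab and merges file into self.tmpdirname: callers
# can override any saved setting at load time, e.g.
#   tokenizer = self.get_tokenizer(bos_token="__start__")
# ("__start__" is only an illustrative value here, not a claim about the
# checkpoint's actual bos token.)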