def test_online_tokenizer_config(self): """this just tests that the online tokenizer files get correctly fetched and loaded via its tokenizer_config.json and it's not slow so it's run by normal CI """ tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2) self.assertListEqual([tokenizer.src_lang, tokenizer.tgt_lang], ["en", "ru"]) self.assertEqual(tokenizer.src_vocab_size, 21) self.assertEqual(tokenizer.tgt_vocab_size, 21)
def test_tokenizer_lower(self): tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en", do_lower_case=True) tokens = tokenizer.tokenize("USA is United States of America") expected = [ "us", "a</w>", "is</w>", "un", "i", "ted</w>", "st", "ates</w>", "of</w>", "am", "er", "ica</w>" ] self.assertListEqual(tokens, expected)
def tokenizer_en_ru(self): return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
def tokenizer_ru_en(self): return FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en")