def test_full_tokenizer(self): """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt""" tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file) text = "lower" bpe_tokens = ["low", "er</w>"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + ["<unk>"] input_bpe_tokens = [14, 15, 20] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def test_online_tokenizer_config(self): """this just tests that the online tokenizer files get correctly fetched and loaded via its tokenizer_config.json and it's not slow so it's run by normal CI """ tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2) self.assertListEqual([tokenizer.src_lang, tokenizer.tgt_lang], ["en", "ru"]) self.assertEqual(tokenizer.src_vocab_size, 21) self.assertEqual(tokenizer.tgt_vocab_size, 21)
def test_tokenizer_lower(self): tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en", do_lower_case=True) tokens = tokenizer.tokenize("USA is United States of America") expected = [ "us", "a</w>", "is</w>", "un", "i", "ted</w>", "st", "ates</w>", "of</w>", "am", "er", "ica</w>" ] self.assertListEqual(tokens, expected)
def tokenizer_en_ru(self): return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
def tokenizer_ru_en(self): return FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en")