def setUp(self): super().setUp() # We have a SentencePiece fixture for testing tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True) tokenizer.save_pretrained(self.tmpdirname)
def test_special_tokens_unaffacted_by_save_load(self): tmpdirname = tempfile.mkdtemp() original_special_tokens = self.tokenizer.fairseq_tokens_to_ids self.tokenizer.save_pretrained(tmpdirname) new_tok = PLBartTokenizer.from_pretrained(tmpdirname) self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
def setUpClass(cls): cls.tokenizer: PLBartTokenizer = PLBartTokenizer.from_pretrained( cls.checkpoint_name, language_codes="base", src_lang="python", tgt_lang="en_XX") cls.pad_token_id = 1 return cls
def test_full_base_tokenizer(self): tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True) tokens = tokenizer.tokenize("This is a test") self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [ value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382] ], ) tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", ".", ], ) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual( ids, [ value + tokenizer.fairseq_offset for value in [ 8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4 ] ], ) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual( back_tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "<unk>", ".", ], ) end = tokenizer.vocab_size language_tokens = [ tokenizer.convert_ids_to_tokens(x) for x in range(end - 4, end) ] self.assertListEqual(language_tokens, ["java", "python", "en_XX", "<mask>"])