Example #1
    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = PLBartTokenizer(SAMPLE_VOCAB,
                                    language_codes="base",
                                    keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
Example #2
    def test_special_tokens_unaffected_by_save_load(self):
        tmpdirname = tempfile.mkdtemp()  # `tempfile` is imported at module level
        original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
        self.tokenizer.save_pretrained(tmpdirname)
        new_tok = PLBartTokenizer.from_pretrained(tmpdirname)
        self.assertDictEqual(new_tok.fairseq_tokens_to_ids,
                             original_special_tokens)
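
Outside the test harness, the same save/load round-trip looks like the sketch below; a minimal version, assuming the public uclanlp/plbart-base checkpoint is reachable:

import tempfile

from transformers import PLBartTokenizer

tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base")
with tempfile.TemporaryDirectory() as tmpdir:
    tokenizer.save_pretrained(tmpdir)                   # writes the SentencePiece model + tokenizer config
    reloaded = PLBartTokenizer.from_pretrained(tmpdir)  # rebuild from the saved files
    # The fairseq special-token mapping survives the round-trip.
    assert reloaded.fairseq_tokens_to_ids == tokenizer.fairseq_tokens_to_ids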
Example #3
    @classmethod
    def setUpClass(cls):
        cls.tokenizer: PLBartTokenizer = PLBartTokenizer.from_pretrained(
            cls.checkpoint_name,
            language_codes="base",
            src_lang="python",
            tgt_lang="en_XX")
        cls.pad_token_id = 1
        return cls
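
The fixture pins src_lang and tgt_lang; PLBart then appends the matching language-code token after </s> on each source sequence, mBART-style. A minimal sketch of that behavior, using the public uclanlp/plbart-base checkpoint as a stand-in for cls.checkpoint_name (which is defined elsewhere in the test class):

from transformers import PLBartTokenizer

tokenizer = PLBartTokenizer.from_pretrained(
    "uclanlp/plbart-base",  # stand-in; the real test uses cls.checkpoint_name
    language_codes="base",
    src_lang="python",
    tgt_lang="en_XX")
ids = tokenizer("def add(a, b): return a + b")["input_ids"]
# The source sequence should end with </s> followed by the "python" code.
print(tokenizer.convert_ids_to_tokens(ids)[-2:])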
Example #4
    def test_full_base_tokenizer(self):
        tokenizer = PLBartTokenizer(SAMPLE_VOCAB,
                                    language_codes="base",
                                    keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens),
            [
                value + tokenizer.fairseq_offset
                for value in [285, 46, 10, 170, 382]
            ],
        )

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "9",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "é",
                ".",
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids,
            [
                value + tokenizer.fairseq_offset for value in [
                    8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66,
                    46, 72, 80, 6, 2, 4
                ]
            ],
        )

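        # "9" and "é" are missing from the fixture vocab, so their ids are the
        # <unk> id and the original pieces cannot be recovered below.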
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "<unk>",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "<unk>",
                ".",
            ],
        )

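        # The last four ids in the vocab are the language codes plus <mask>.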
        end = tokenizer.vocab_size
        language_tokens = [
            tokenizer.convert_ids_to_tokens(x) for x in range(end - 4, end)
        ]

        self.assertListEqual(language_tokens,
                             ["java", "python", "en_XX", "<mask>"])