Example #1
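These snippets appear to be test methods from the FNetTokenizer test class in the transformers library; SAMPLE_VOCAB is assumed to be the SentencePiece fixture mentioned in Example #3. This first test checks that build_inputs_with_special_tokens wraps a single sequence as [CLS] A [SEP] and a sequence pair as [CLS] A [SEP] B [SEP].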
    def test_sequence_builders(self):
        tokenizer = FNetTokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # single sequence: [CLS] A [SEP]
        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        # pair of sequences: [CLS] A [SEP] B [SEP]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
            tokenizer.sep_token_id
        ]
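As a hedged aside (not part of the original test): calling the tokenizer directly on a text pair should go through the same build_inputs_with_special_tokens path, so the output should show the same [CLS]/[SEP] layout. A minimal sketch, assuming the same SAMPLE_VOCAB fixture:

    pair_ids = tokenizer("sequence builders", "multi-sequence build")["input_ids"]
    assert pair_ids[0] == tokenizer.cls_token_id        # starts with [CLS]
    assert pair_ids.count(tokenizer.sep_token_id) == 2  # one [SEP] after each segment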
Example #2
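This test drives the full SentencePiece pipeline in both directions: text to tokens, tokens to ids, and ids back to tokens, including the lossy handling of out-of-vocabulary pieces.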
    def test_full_tokenizer(self):
        # keep_accents=True preserves accented characters instead of stripping them
        tokenizer = FNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁", "T", "his", "▁is", "▁a", "▁test"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [13, 1, 4398, 25, 21, 1289]
        )

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                "▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and",
                "▁this", "▁is", "▁fal", "s", "é", "."
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids,
            [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        # pieces that mapped to the unknown id (1 here: "I" and "é") come back as "<unk>"
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                "▁",
                "<unk>",
                "▁was",
                "▁born",
                "▁in",
                "▁9",
                "2000",
                ",",
                "▁and",
                "▁this",
                "▁is",
                "▁fal",
                "s",
                "<unk>",
                ".",
            ],
        )
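The "<unk>" entries above are the lossy part of the round trip: pieces missing from the fixture vocabulary ("I" and "é") map to the unknown id on the way in, so the original characters cannot be recovered on the way out. A minimal sketch of that behavior, under the same fixture assumption:

    unk_id = tokenizer.convert_tokens_to_ids(["I"])[0]   # out-of-vocab piece
    assert unk_id == tokenizer.unk_token_id              # maps to the unknown id (1 in this fixture)
    assert tokenizer.convert_ids_to_tokens([unk_id]) == [tokenizer.unk_token]  # "<unk>"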
Example #3
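This setUp fixture builds a tokenizer from the SentencePiece test vocab and saves it to a temporary directory so the shared tokenizer tests can reload it from disk.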
    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = FNetTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
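The individual tests can then reload the saved files, presumably via FNetTokenizer.from_pretrained(self.tmpdirname), the usual counterpart to save_pretrained.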