    def test_full_tokenizer(self):
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens,
                             ["▁", "[UNK]", "his", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [13, 1, 4398, 25, 21, 1289])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        # fmt: off
        self.assertListEqual(
            tokens,
            [
                "▁", "[UNK]", "▁was", "▁born", "▁in", "▁9", "2000", ",",
                "▁and", "▁this", "▁is", "▁fal", "s", "[UNK]", "."
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids,
            [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        # Ids map back to the raw SentencePiece piece "<unk>", not the "[UNK]" token that tokenize() produced above.
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                "▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",",
                "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."
            ],
        )

    def test_sequence_builders(self):
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
            tokenizer.sep_token_id
        ]
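
These methods assume the shared test-file scaffolding (imports, the SentencePiece fixture, and the enclosing test class), which the snippets themselves omit. A minimal sketch of that scaffolding; the fixture path is assumed from the transformers test-suite layout rather than taken from these examples:

import unittest

from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast
from transformers.testing_utils import get_tests_dir

# Assumed location of the SentencePiece test fixture used as SAMPLE_VOCAB.
SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")


class DebertaV2TokenizationTest(unittest.TestCase):
    # The test_* methods in these examples would live inside a class like this.
    ...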
Example #3
    def test_do_lower_case(self):
        # fmt: off
        sequence = " \tHeLLo!how  \n Are yoU?  "
        tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"]
        # fmt: on

        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True)
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(tokens, tokens_target)

        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                do_lower_case=True)
        rust_tokens = rust_tokenizer.convert_ids_to_tokens(
            rust_tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(rust_tokens, tokens_target)
Example #4
    def test_do_lower_case_false_split_by_punct(self):
        # fmt: off
        sequence = "I was born in 92000, and this is falsé."
        tokens_target = [
            "▁",
            "<unk>",
            "▁was",
            "▁born",
            "▁in",
            "▁9",
            "2000",
            "▁",
            ",",
            "▁and",
            "▁this",
            "▁is",
            "▁fal",
            "s",
            "<unk>",
            "▁",
            ".",
        ]
        # fmt: on

        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB,
                                       do_lower_case=False,
                                       split_by_punct=True)
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(tokens, tokens_target)

        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                do_lower_case=False,
                                                split_by_punct=True)
        rust_tokens = rust_tokenizer.convert_ids_to_tokens(
            rust_tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(rust_tokens, tokens_target)
Example #5
    def test_do_lower_case_false_split_by_punct_false(self):
        # fmt: off
        sequence = " \tHeLLo!how  \n Are yoU?  "
        tokens_target = [
            "▁", "<unk>", "e", "<unk>", "o", "!", "how", "▁", "<unk>", "re",
            "▁yo", "<unk>", "?"
        ]
        # fmt: on

        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB,
                                       do_lower_case=False,
                                       split_by_punct=False)
        tokens = tokenizer.convert_ids_to_tokens(
            tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(tokens, tokens_target)

        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                do_lower_case=False,
                                                split_by_punct=False)
        rust_tokens = rust_tokenizer.convert_ids_to_tokens(
            rust_tokenizer.encode(sequence, add_special_tokens=False))

        self.assertListEqual(rust_tokens, tokens_target)
Example #6
    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
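
setUp saves the SentencePiece tokenizer into self.tmpdirname so later tests can reload it from disk. A minimal sketch of that reload step; the helper name get_tokenizer follows the shared tokenizer-test mixin and is an assumption, not part of these snippets:

    def get_tokenizer(self, **kwargs):
        # Assumed helper: reload the tokenizer saved by setUp; kwargs let a test
        # override options such as do_lower_case or split_by_punct.
        return DebertaV2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)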
Example #7
    def test_full_tokenizer(self):
        sequence = "This is a test"
        ids_target = [13, 1, 4398, 25, 21, 1289]
        tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"]
        back_tokens_target = ["▁", "<unk>", "his", "▁is", "▁a", "▁test"]

        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB,
                                                keep_accents=True)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, ids_target)
        tokens = tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, tokens_target)
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, back_tokens_target)

        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(rust_ids, ids_target)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(rust_tokens, tokens_target)
        rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
        self.assertListEqual(rust_back_tokens, back_tokens_target)

        # fmt: off
        sequence = "I was born in 92000, and this is falsé."
        ids_target = [
            13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9
        ]
        tokens_target = [
            "▁",
            "I",
            "▁was",
            "▁born",
            "▁in",
            "▁9",
            "2000",
            ",",
            "▁and",
            "▁this",
            "▁is",
            "▁fal",
            "s",
            "é",
            ".",
        ]
        back_tokens_target = [
            "▁",
            "<unk>",
            "▁was",
            "▁born",
            "▁in",
            "▁9",
            "2000",
            ",",
            "▁and",
            "▁this",
            "▁is",
            "▁fal",
            "s",
            "<unk>",
            ".",
        ]
        # fmt: on

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, ids_target)
        tokens = tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, tokens_target)
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, back_tokens_target)

        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(rust_ids, ids_target)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(rust_tokens, tokens_target)
        rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
        self.assertListEqual(rust_back_tokens, back_tokens_target)
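
The same API works against a released checkpoint outside the test fixtures. A minimal usage sketch; the checkpoint name is an assumed example, and any SentencePiece-based DeBERTa-v2/v3 model behaves the same way:

from transformers import DebertaV2Tokenizer

# Assumed example checkpoint; requires the sentencepiece package.
tok = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
ids = tok.encode("I was born in 92000, and this is falsé.")
print(tok.convert_ids_to_tokens(ids))             # tokens include [CLS] ... [SEP]
print(tok.decode(ids, skip_special_tokens=True))   # reconstructs the text from the ids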