Example #1
    def normalize_tokenizations(cls, tokenizer, space_tokenization,
                                target_tokenization):
        """See tokenization_normalization.py for details"""
        space_tokenization = [token.lower() for token in space_tokenization]
        modified_space_tokenization = bow_tag_tokens(space_tokenization)
        modified_target_tokenization = process_sentencepiece_tokens(
            target_tokenization)

        return modified_space_tokenization, modified_target_tokenization
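
The helper bow_tag_tokens is not shown in this snippet. Below is a minimal sketch of what it plausibly does, inferred from the "<w>" markers used throughout this example: every whitespace-delimited token starts a new word, so each one receives a beginning-of-word tag. The helper name and the "<w>" marker come from the code here; the one-line implementation itself is an assumption.

# Hypothetical sketch: tag each whitespace-delimited token as beginning-of-word.
# Every token produced by splitting on spaces starts a word, so all of them
# get the "<w>" prefix.
def bow_tag_tokens(tokens):
    return ["<w>" + token for token in tokens]

# e.g. ["mr", ".", "immelt"] -> ["<w>mr", "<w>.", "<w>immelt"]
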
def test_process_sentencepiece_token_sequence():
    expected_adjusted_sentencepiece_tokens = [
        "<w>Mr",
        ".",
        "<w>I",
        "m",
        "mel",
        "t",
        "<w>chose",
        "<w>to",
        "<w>focus",
        "<w>on",
        "<w>the",
        "<w>in",
        "comp",
        "re",
        "hen",
        "s",
        "ibility",
        "<w>of",
        "<w>accounting",
        "<w>rules",
        ".",
    ]
    original_sentencepiece_tokens = [
        "▁Mr",
        ".",
        "▁I",
        "m",
        "mel",
        "t",
        "▁chose",
        "▁to",
        "▁focus",
        "▁on",
        "▁the",
        "▁in",
        "comp",
        "re",
        "hen",
        "s",
        "ibility",
        "▁of",
        "▁accounting",
        "▁rules",
        ".",
    ]
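    # NOTE: "tu" is assumed to be the tokenization-utilities module that defines
    # process_sentencepiece_tokens; its import is not part of this snippet.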
    adjusted_sentencepiece_tokens = tu.process_sentencepiece_tokens(
        original_sentencepiece_tokens)
    assert adjusted_sentencepiece_tokens == expected_adjusted_sentencepiece_tokens
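
For context, here is a sketch of process_sentencepiece_tokens that would satisfy the assertion above. It is reconstructed from the expected token lists, not taken from tokenization_normalization.py, so treat it as an illustration of the intended behavior rather than the actual implementation.

# Hypothetical sketch: SentencePiece marks the start of a word with "▁" (U+2581).
# Rewrite that marker to the "<w>" beginning-of-word tag and pass word-internal
# pieces through unchanged, matching the expected output in the test above.
def process_sentencepiece_tokens(tokens):
    processed = []
    for token in tokens:
        if token.startswith("▁"):
            processed.append("<w>" + token[1:])
        else:
            processed.append(token)
    return processed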