def normalize_tokenizations(cls, tokenizer, space_tokenization, target_tokenization):
    """See tokenization_normalization.py for details"""
    # Lowercase the space tokenization, then tag both sequences so they share
    # the same begin-of-word convention.
    space_tokenization = [token.lower() for token in space_tokenization]
    modified_space_tokenization = bow_tag_tokens(space_tokenization)
    modified_target_tokenization = process_sentencepiece_tokens(target_tokenization)
    return modified_space_tokenization, modified_target_tokenization
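# The two helpers called above are not shown in this snippet. A minimal sketch
# of what they could look like, assuming the convention exercised by the test
# below (the SentencePiece "▁" word marker maps to a "<w>" begin-of-word tag):
def bow_tag_tokens(tokens, bow_tag="<w>"):
    """Sketch: tag each space-delimited token as a word beginning."""
    # Every whitespace-delimited token starts a new word, so each one is tagged.
    return [bow_tag + token for token in tokens]


def process_sentencepiece_tokens(tokens, bow_tag="<w>"):
    """Sketch: replace the SentencePiece "▁" word marker with the "<w>" tag."""
    # Tokens without the "▁" prefix are word continuations and pass through unchanged.
    return [bow_tag + token[1:] if token.startswith("▁") else token for token in tokens]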
def test_process_sentencepiece_token_sequence():
    expected_adjusted_sentencepiece_tokens = [
        "<w>Mr", ".", "<w>I", "m", "mel", "t", "<w>chose", "<w>to", "<w>focus",
        "<w>on", "<w>the", "<w>in", "comp", "re", "hen", "s", "ibility",
        "<w>of", "<w>accounting", "<w>rules", ".",
    ]
    original_sentencepiece_tokens = [
        "▁Mr", ".", "▁I", "m", "mel", "t", "▁chose", "▁to", "▁focus",
        "▁on", "▁the", "▁in", "comp", "re", "hen", "s", "ibility",
        "▁of", "▁accounting", "▁rules", ".",
    ]
    adjusted_sentencepiece_tokens = tu.process_sentencepiece_tokens(original_sentencepiece_tokens)
    assert adjusted_sentencepiece_tokens == expected_adjusted_sentencepiece_tokens
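# A hedged companion check (not part of the original suite) for the
# bow_tag_tokens step used by normalize_tokenizations above, exercising the
# sketch defined earlier on the lowercased space tokenization of the same
# sentence:
def test_bow_tag_tokens_sketch():
    space_tokens = ["mr.", "immelt", "chose", "to", "focus", "on", "the",
                    "incomprehensibility", "of", "accounting", "rules."]
    expected_tagged_tokens = ["<w>mr.", "<w>immelt", "<w>chose", "<w>to", "<w>focus",
                              "<w>on", "<w>the", "<w>incomprehensibility", "<w>of",
                              "<w>accounting", "<w>rules."]
    assert bow_tag_tokens(space_tokens) == expected_tagged_tokens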