def test_merge_dagdra():
    token_list = pos_tok.tokenize("བཀྲ་ཤིས་-པ་")
    token_list = [t for t in token_list if t.text != "-"]  # remove the "-" inserted to ensure we have two tokens
    mp = MergeDagdra()
    mp.merge(token_list)
    assert len(token_list) == 1 and token_list[0].text == "བཀྲ་ཤིས་པ་"

    token_list = pos_tok.tokenize("བཀྲ་ཤིས་-པའོ།")
    token_list = [t for t in token_list if t.text != "-"]  # remove the "-" inserted to ensure we have two tokens
    mp.merge(token_list)
    assert len(token_list) == 3 and token_list[0].text == "བཀྲ་ཤིས་པ"


def test_tokenize():
    profile = "empty"
    main, custom = Config().get_tok_data_paths(profile)
    tok = Tokenize(Trie(BoSyl, profile, main, custom))
    tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
    tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500")
    tok.trie.inflect_n_modify_trie("མཐའ་")
    tok.trie.inflect_n_add_data("མཐའ་\tNOUN")
    in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་"
    preproc = TokChunks(in_str)
    preproc.serve_syls_to_trie()
    tokens = tok.tokenize(preproc)
    expected = dedent(
        """\
        text: "བཀྲ་ཤིས"
        text_cleaned: "བཀྲ་ཤིས་"
        text_unaffixed: "བཀྲ་ཤིས་"
        syls: ["བཀྲ", "ཤིས"]
        senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False |
        char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS|
        chunk_type: TEXT
        syls_idx: [[0, 1, 2], [4, 5, 6]]
        syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
        start: 5
        len: 7
        """
    )
    str(tokens[0])
    assert str(tokens[1]) == expected
    assert tokens[2].text == "། "
    assert tokens[2].chunk_type == "PUNCT"

    # add sense to བཀྲ་ཤིས་
    pos_tok.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500")
    tokens = pos_tok.tokenize(in_str)
    expected = dedent(
        """\
        text: "བཀྲ་ཤིས"
        text_cleaned: "བཀྲ་ཤིས་"
        text_unaffixed: "བཀྲ་ཤིས་"
        syls: ["བཀྲ", "ཤིས"]
        pos: NOUN
        lemma: བཀྲ་ཤིས་
        sense: བཀྲ་ཤིས་
        senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ |
        char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS|
        chunk_type: TEXT
        freq: 17500
        syls_idx: [[0, 1, 2], [4, 5, 6]]
        syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
        start: 5
        len: 7
        """
    )
    assert str(tokens[2]) == expected


def test_num_lemmas_missing():
    in_str = "སྟོང་ཕྲག་བརྒྱ་པ་སུམ་བརྒྱ་པ་བཅུ་པ་ལྔ་པ་"
    tokens = pos_tok.tokenize(in_str)
    assert [t.lemma for t in tokens] == [
        "སྟོང་ཕྲག་",
        "བརྒྱ་པ་",
        "སུམ་བརྒྱ་པ་",
        "བཅུ་པ་",
        "ལྔ་པ་",
    ]


def test_missing_token26():
    input_str = "སྲི་མོ་བཛྲ་ནོ་ཏི་སྟ་ཀཱི་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == [
        "སྲི་",
        "མོ་",
        "བཛྲ་",
        "ནོ་",
        "ཏི་",
        "སྟ་",
        "ཀཱི་",
    ]


def test_adjust_tokens():
    string = "ལ་ལ་ལ་ལ་ལ་བ་ཡོད།"
    token_list = pos_tok.tokenize(string, split_affixes=False)
    at = AdjustTokens(main=main, custom=custom)
    adjusted = at.adjust(token_list)

    # before adjustment, the repeated ལ་ syllables are segmented greedily as ལ་ལ་
    assert token_list[0].text == "ལ་ལ་"
    assert token_list[1].text == "ལ་ལ་"

    # after adjustment, the adjustment rules resplit them as ལ་ + ལ་ལ་ + ལ་
    assert adjusted[0].text == "ལ་"
    assert adjusted[0].pos == "PART"
    assert adjusted[1].text == "ལ་ལ་"
    assert adjusted[1].pos == "PART"
    assert adjusted[2].text == "ལ་"
    assert adjusted[2].pos == "PART"


def test_missing_token13():
    input_str = "།ཨོཾ་གི་རི་ཧི་རི་ཙི་རི། །ཨཱ་ཨཱ་ཤུ་མ་ཤ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == [
        "།",
        "ཨོཾ་",
        "གི་",
        "རི་",
        "ཧི་",
        "རི་",
        "ཙི་",
        "རི",
        "། །",
        "ཨཱ་",
        "ཨཱ་",
        "ཤུ་",
        "མ་",
        "ཤ་",
    ]


def test_segmentation_bug():
    tokens = pos_tok.tokenize("ལ་པོ་ལ་པོ་ལ་པོ་")
    assert len(tokens) == 3
    tokens = pos_tok.tokenize("ལ་མོ་ལ་མོ་ལ་མོ་")
    assert len(tokens) == 3
    tokens = pos_tok.tokenize("གྲོགས་པོ་གྲོགས་པོ་གྲོགས་པོ་")
    assert len(tokens) == 3
    tokens = pos_tok.tokenize("བདག་པོ་བདག་པོ་བདག་པོ་དང་")
    assert len(tokens) == 4
    tokens = pos_tok.tokenize("བདག་པོ་བདག་པོ་བདག་པོ་")
    assert len(tokens) == 3
    tokens = pos_tok.tokenize(
        "བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་"
    )
    assert len(tokens) == 9


def test_missing_token11():
    input_str = "སྦོམ་ཞིང་ཆེ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["སྦོམ་", "ཞིང་", "ཆེ་"]


def test_missing_token9():
    input_str = "བསྐོལ། །རྡོ་རྗེ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བསྐོལ", "། །", "རྡོ་རྗེ་"]


def test_missing_token8():
    input_str = "བི་སི་ནི་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བི་", "སི་", "ནི་"]


def test_missing_token59():
    input_str = "།་གླེན་ལྐུགས་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["།་", "གླེན་ལྐུགས་"]


def test_missing_token57():
    input_str = "སྒལ་བརྒྱུད་ཞབས་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["སྒལ་", "བརྒྱུད་", "ཞབས་"]


def test_missing_token44():
    input_str = "བཏུལ་མཚམས་བཅད་པ"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བཏུལ་", "མཚམས་", "བཅད་པ"]


def test_missing_token43():
    input_str = "གཅོད་འཁོར་ལོ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["གཅོད་", "འཁོར་ལོ་"]


def test_missing_token42():
    input_str = "ཞིབ་བས་སྦལ།"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["ཞིབ་", "བས་", "སྦལ", "།"]


def test_missing_token55():
    input_str = "བྷ་གར་སྦྱོར་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བྷ་", "གར་", "སྦྱོར་"]


def test_missing_token56():
    input_str = "བརྒྱུད་སྐུ་གདུང་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བརྒྱུད་", "སྐུ་གདུང་"]


def test_missing_token46():
    input_str = "བསྐུར་ལས་ཀྱི་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བསྐུར་", "ལས་", "ཀྱི་"]


def test_missing_token58():
    input_str = "བརྩེགས་ཆེ་མཆོག་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བརྩེགས་", "ཆེ་མཆོག་"]


def test_missing_token47():
    input_str = "འཁོས་དུ། །ཆེ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["འཁོས་", "དུ", "། །", "ཆེ་"]


def test_missing_token7():
    input_str = "གདབ། །ཨོཾ་ན་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["གདབ", "། །", "ཨོཾ་", "ན་"]


def test_missing_token48():
    input_str = "ནུ་ཧེ་རུ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["ནུ་", "ཧེ་", "རུ་"]


def test_missing_token1():
    input_str = "འཐུང་བུད་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["འཐུང་", "བུད་"]


def test_missing_token49():
    input_str = "བརྩེགས་རྣམ་པར་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བརྩེགས་", "རྣམ་པར་"]


def test_missing_token10():
    input_str = "བསྐུས་ཤིང་མཉེས་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བསྐུས་", "ཤིང་", "མཉེས་"]


def test_missing_token51():
    input_str = "ནུ་ཡེ་ཤེས་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["ནུ་", "ཡེ་ཤེས་"]


def test_missing_token12():
    input_str = "བྷ་ག་ཁ་ཆེ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བྷ་", "ག་", "ཁ་ཆེ་"]


def test_missing_token54():
    input_str = "བྷ་གར་འཁྱིལ། །ཨོཾ་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བྷ་", "གར་", "འཁྱིལ", "། །", "ཨོཾ་"]


def test_missing_token14():
    input_str = "བཟླས་བྱས་"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["བཟླས་", "བྱས་"]


def test_missing_token6():
    input_str = "རབ་བསྐུས་ནས།"
    tokens = pos_tok.tokenize(input_str, split_affixes=False)
    assert [t.text for t in tokens] == ["རབ་", "བསྐུས་", "ནས", "།"]