def load_tikhonov_dict(path):
    """Load a tab-separated dictionary file into a list of parsed words.

    Every non-blank line of the file must hold exactly two tab-separated
    fields, ``word<TAB>parse``; each pair is handed to ``parse_word`` and the
    results are collected in file order.

    Args:
        path: Path of the dictionary file to read.

    Returns:
        A list with one ``parse_word(word, parse)`` result per non-blank line.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    result = []
    # The dictionary holds Cyrillic text, so the encoding is pinned instead of
    # relying on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as dict_file:
        for line in dict_file:
            entry = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no entry and would otherwise break the unpacking below.
            if not entry:
                continue
            word, parse = entry.split('\t')
            result.append(parse_word(word, parse))
    return result
def gen_multi_lexeme_parse(lemma, lemma_parse, multform, multform_tail):
    """Build a morpheme parse string for ``multform`` from its lemma's parse.

    The lemma parse is copied and then rewritten letter by letter so that it
    spells ``multform``. The cases are tried in this order:

    1. ``multform == "люди"``: a fixed, hard-coded parse is returned.
    2. ``multform`` has the same length as ``lemma``: every position of the
       copied parse is overwritten with the corresponding letter.
    3. ``multform`` is longer and is exactly ``lemma`` followed by a single
       tail part: the tail letters are appended with label ``END``.
    4. Otherwise the first ``len(multform) - tail_len`` letters are written
       over the copied parse, surplus letters are dropped, and the tail parts
       are appended.

    Args:
        lemma: The lemma; passed to ``parse_word`` together with
            ``lemma_parse``.
        lemma_parse: Parse description of the lemma, in whatever form
            ``parse_word`` accepts.
        multform: The word form to produce a parse for.
        multform_tail: Tail description handed to ``parse_tail_parts``, which
            yields the tail parts and their total length.

    Returns:
        ``str()`` of the rewritten parse, or the hard-coded string for "люди".
    """
    parsed_lemma = parse_word(lemma, lemma_parse)
    multform_parse = parsed_lemma.copy()
    # Special case: this form gets a fixed parse regardless of the lemma.
    if multform == "люди":
        return "люд:ROOT/и:END"
    # Same length: overwrite the letters position by position.
    # (presumably set_letter leaves the labels untouched -- confirm)
    if len(lemma) == len(multform):
        for i in range(len(multform)):
            multform_parse.set_letter(i, multform[i])
        return str(multform_parse)
    tail_parts, tail_len = parse_tail_parts(multform_tail)
    if len(multform) > len(lemma):
        # multform is the unchanged lemma plus one tail part: just append
        # that part as an ending.
        if len(tail_parts
               ) == 1 and multform[:-len(tail_parts[0].text)] == lemma:
            # `new` is True only for the first appended letter.
            # (presumably it tells append_letter to start a new morpheme --
            # confirm against append_letter)
            new = True
            for letter in tail_parts[0].text:
                multform_parse.append_letter(letter, MorphemeLabel.END, new)
                new = False
            return str(multform_parse)
    # General case: rewrite the part of the word that precedes the tail.
    new = True
    for i in range(len(multform) - tail_len):
        if i < len(multform_parse):
            # NOTE(review): the label is compared with the string 'END' here
            # while labels are written as MorphemeLabel members elsewhere;
            # presumably get_label returns the label's string form -- confirm.
            if multform_parse.get_label(i) != 'END':
                multform_parse.set_letter(i, multform[i])
            else:
                # A position that was an ending in the lemma is relabelled as
                # a suffix, since the tail is appended after it.
                multform_parse.set_letter(i, multform[i])
                multform_parse.set_label(i, MorphemeLabel.SUFF)
                new = False
        else:
            # Past the end of the lemma parse: extend it with suffix letters.
            multform_parse.append_letter(multform[i], MorphemeLabel.SUFF, new)
            new = False
    # Drop whatever is left of the lemma beyond the rewritten part.
    while len(multform_parse) > len(multform) - tail_len:
        multform_parse.pop_letter()
    # Append every tail part except the last one.
    current_tail_pos = 0
    while current_tail_pos < len(tail_parts) - 1:
        if tail_parts[current_tail_pos].text == 'ь':
            # A part that is exactly 'ь' reuses the label of the last letter
            # and is appended with new=False.
            new = False
            label = multform_parse.get_last_label_value()
        else:
            label = MorphemeLabel.SUFF
            # A part starting with 'ь' is appended with new=False as well.
            new = tail_parts[current_tail_pos].text[0] != 'ь'
        for letter in tail_parts[current_tail_pos].text:
            multform_parse.append_letter(letter, label, new)
            new = False
        current_tail_pos += 1
    # The last tail part is an ending, unless its `new` attribute is false,
    # in which case it is labelled as a suffix.
    if current_tail_pos < len(tail_parts):
        label = MorphemeLabel.END
        if not tail_parts[current_tail_pos].new:
            label = MorphemeLabel.SUFF
        new = True
        for letter in tail_parts[current_tail_pos].text:
            multform_parse.append_letter(letter, label, new)
            new = False
    return str(multform_parse)
    # NOTE(review): this statement appears unreachable as written, because the
    # return directly above is unconditional -- confirm and remove if so.
    return str(lemma_parse)
def gen_parse2(common_part, tail, lemma, lemma_parse, is_gerund=False):
    """Build a morpheme parse string for the form ``common_part`` + ``tail``.

    The lemma parse is copied, cut down to the length of ``common_part`` and
    then extended with the parts returned by ``parse_tail_parts(tail)``. Each
    part is labelled PREF, ROOT, SUFF, END or POSTFIX depending on where the
    cut falls in the lemma parse and on the part's ``new`` attribute.

    Args:
        common_part: The leading part of the form that is shared with the
            lemma; only its length is used here.
        tail: Tail description handed to ``parse_tail_parts``, which yields
            the tail parts and their total length.
        lemma: The lemma; passed to ``parse_word`` together with
            ``lemma_parse``.
        lemma_parse: Parse description of the lemma, in whatever form
            ``parse_word`` accepts.
        is_gerund: When true, the last tail part is labelled SUFF instead of
            END.

    Returns:
        ``str()`` of the generated parse.

    Raises:
        Exception: "Unknown case" when the common part covers the whole lemma
            but more than one tail part remains; "Incorrect parse generated"
            when the result length differs from
            ``len(common_part) + tail_len``.
    """
    parsed_lemma = parse_word(lemma, lemma_parse)
    common_parse = parsed_lemma.copy()
    # This lemma is special-cased below: its intermediate tail parts are
    # labelled ROOT instead of SUFF.
    is_polden = lemma == "полдень"
    has_postfix = False
    if parsed_lemma.morphemes[-1].label == MorphemeLabel.POSTFIX:
        has_postfix = True
    tail_parts, tail_len = parse_tail_parts(tail)
    # When the lemma ends in a postfix, the last tail part is taken out and
    # appended separately at the very end with label POSTFIX.
    postfix_part = None
    if has_postfix:
        postfix_part = tail_parts[-1]
        tail_parts.pop()
    # Cut the copied parse down to the length of the common part.
    while len(common_parse) > len(common_part):
        common_parse.pop_letter()
    # If a tail follows, anything still labelled as an ending in the kept
    # part is relabelled as a suffix.
    # NOTE(review): the label is compared with the string 'END' here while
    # labels are written as MorphemeLabel members elsewhere; presumably
    # get_label returns the label's string form -- confirm.
    if len(tail_parts) > 0:
        for i in range(len(common_parse)):
            if common_parse.get_label(i) == 'END':
                common_parse.set_label(i, MorphemeLabel.SUFF)
    # True when nothing is kept at all or the kept part ends inside a prefix.
    first_is_prefix = False
    if len(common_parse) == 0 or common_parse.get_last_label_value(
    ) == MorphemeLabel.PREF:
        first_is_prefix = True
    if not tail_parts:
        return str(common_parse)
    current_pos = len(common_parse)
    # The kept part covers the whole lemma: the only allowed continuation is
    # a single ending (plus the postfix, if any).
    if current_pos >= len(lemma):
        if len(tail_parts) > 1:
            raise Exception("Unknown case")
        # `new` is True only for the first appended letter.
        # (presumably it tells append_letter to start a new morpheme --
        # confirm against append_letter)
        new = True
        for letter in tail_parts[0].text:
            common_parse.append_letter(letter, MorphemeLabel.END, new)
            new = False
        if has_postfix:
            new = True
            for letter in postfix_part.text:
                common_parse.append_letter(letter, MorphemeLabel.POSTFIX, new)
                new = False
        return str(common_parse)
    # Decide whether the root is already complete at the cut position.
    if parsed_lemma.get_label(current_pos) != 'ROOT':
        complete_root = True
    elif lemma[current_pos] == 'ь' and (
            len(parsed_lemma) == current_pos + 1
            or parsed_lemma.get_label(current_pos + 1) != 'ROOT'):
        # The cut is right before a root-final 'ь'.
        # NOTE(review): len(tail_parts[0].text[0]) is the length of a single
        # character and therefore never > 1; possibly len(tail_parts[0].text)
        # was intended -- confirm before changing, it alters generated parses.
        if tail_parts[0].text[0] != 'ь' or len(tail_parts[0].text[0]) > 1:
            complete_root = True
        else:
            complete_root = False
    else:
        complete_root = False
    current_tail_pos = 0
    if first_is_prefix:
        # A tail part whose `new` is false is appended with label PREF and
        # new=False, but only if some part of the lemma parse was kept.
        if not len(common_parse) == 0 and not tail_parts[current_tail_pos].new:
            for letter in tail_parts[current_tail_pos].text:
                common_parse.append_letter(letter, MorphemeLabel.PREF, False)
            current_tail_pos += 1
        # The next tail part is appended with label ROOT.
        if current_tail_pos < len(tail_parts):
            new = True
            for letter in tail_parts[current_tail_pos].text:
                common_parse.append_letter(letter, MorphemeLabel.ROOT, new)
                new = False
            current_tail_pos += 1
    # Root not complete and the next part's `new` is false: append that part
    # with label ROOT and new=False.
    if current_tail_pos < len(
            tail_parts
    ) and not complete_root and not tail_parts[current_tail_pos].new:
        for letter in tail_parts[current_tail_pos].text:
            common_parse.append_letter(letter, MorphemeLabel.ROOT, False)
        current_tail_pos += 1
    # Every remaining part except the last one is a suffix (ROOT for the
    # special-cased lemma); a part starting with 'ь' gets new=False.
    while current_tail_pos < len(tail_parts) - 1:
        new = tail_parts[current_tail_pos].text[0] != 'ь'
        label = MorphemeLabel.SUFF
        if is_polden:
            label = MorphemeLabel.ROOT
        for letter in tail_parts[current_tail_pos].text:
            common_parse.append_letter(letter, label, new)
            new = False
        current_tail_pos += 1
    # The last part is an ending by default; the two checks below turn it
    # into a suffix instead.
    if current_tail_pos < len(tail_parts):
        label = MorphemeLabel.END
        if not tail_parts[current_tail_pos].new and not tail_parts[
                current_tail_pos].text == 'ь':
            label = MorphemeLabel.SUFF
        # These specific tail texts, and any last part when is_gerund is set,
        # are labelled SUFF rather than END.
        if tail_parts[current_tail_pos].text in ('л', 'вши', 'ши', 'в', 'ну',
                                                 'ек', 'ин', 'ен',
                                                 'н') or is_gerund:
            label = MorphemeLabel.SUFF
        new = True
        for letter in tail_parts[current_tail_pos].text:
            common_parse.append_letter(letter, label, new)
            new = False
    if has_postfix:
        # NOTE(review): the postfix is appended with new=False here, whereas
        # the early-return path above starts it with new=True -- confirm
        # which of the two is intended.
        new = False
        for letter in postfix_part.text:
            common_parse.append_letter(letter, MorphemeLabel.POSTFIX, new)
            new = False
    # Sanity check: the result must spell exactly common_part + tail.
    if len(common_parse) != len(common_part) + tail_len:
        raise Exception("Incorrect parse generated")
    return str(common_parse)