def load_tikhonov_dict(path):
    """Load a tab-separated morpheme dictionary and parse every entry.

    Each non-blank line must hold exactly two tab-separated fields,
    ``word<TAB>parse``; both are handed to ``parse_word``.

    Args:
        path: Path of the dictionary file. It is read as UTF-8 text.

    Returns:
        A list with one ``parse_word(word, parse)`` result per entry, in
        file order. Blank lines are skipped.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    result = []
    # Explicit encoding: the dictionary is presumably Cyrillic text, so the
    # platform default encoding must not decide how it is decoded.
    with open(path, 'r', encoding='utf-8') as p:
        for line in p:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no entry; without this
                # guard the two-target unpacking below raises ValueError.
                continue
            word, parse = line.split('\t')
            result.append(parse_word(word, parse))
    return result
# Example #2
# 0
def gen_multi_lexeme_parse(lemma, lemma_parse, multform, multform_tail):
    """Derive the parse string of ``multform`` from the parse of its lemma.

    Args:
        lemma: Lemma text; passed to ``parse_word`` with ``lemma_parse``.
        lemma_parse: Parse of the lemma as accepted by ``parse_word``.
        multform: The word form to produce a parse for.
        multform_tail: Tail description handed to ``parse_tail_parts``,
            which yields the tail parts (each exposing ``.text`` and
            ``.new``) and the tail length.

    Returns:
        * the fixed string ``"люд:ROOT/и:END"`` when ``multform`` is "люди";
        * the lemma parse with its letters overwritten by those of
          ``multform`` when both words have the same length;
        * a parse rebuilt from the lemma parse plus the tail parts when
          ``multform`` is longer than the lemma;
        * ``str(lemma_parse)`` otherwise.
    """
    form_parse = parse_word(lemma, lemma_parse).copy()

    def append_part(text, label, first_is_new):
        # Only the first letter of a part may open a new morpheme.
        for letter in text:
            form_parse.append_letter(letter, label, first_is_new)
            first_is_new = False

    # Hard-coded irregular form.
    if multform == "люди":
        return "люд:ROOT/и:END"

    form_len = len(multform)
    if len(lemma) == form_len:
        # Same length: keep the labels, swap in the letters of the form.
        for pos, letter in enumerate(multform):
            form_parse.set_letter(pos, letter)
        return str(form_parse)

    tail_parts, tail_len = parse_tail_parts(multform_tail)

    if form_len <= len(lemma):
        return str(lemma_parse)

    # The form is the whole lemma followed by a single tail part: that part
    # is appended as the ending.
    if len(tail_parts) == 1:
        only_text = tail_parts[0].text
        if multform[:-len(only_text)] == lemma:
            append_part(only_text, MorphemeLabel.END, True)
            return str(form_parse)

    # Rewrite the part of the form that precedes the tail.
    stem_len = form_len - tail_len
    starts_new = True
    for pos in range(stem_len):
        letter = multform[pos]
        if pos >= len(form_parse):
            # Past the end of the lemma parse: grow it with suffix letters.
            form_parse.append_letter(letter, MorphemeLabel.SUFF, starts_new)
            starts_new = False
            continue
        keeps_label = form_parse.get_label(pos) != 'END'
        form_parse.set_letter(pos, letter)
        if not keeps_label:
            # A former ending letter inside the stem is relabelled as suffix.
            form_parse.set_label(pos, MorphemeLabel.SUFF)
            starts_new = False

    # Drop whatever the lemma parse still holds beyond the stem.
    while len(form_parse) > stem_len:
        form_parse.pop_letter()

    # Every tail part except the last one.
    last_index = len(tail_parts) - 1
    for index in range(last_index):
        text = tail_parts[index].text
        if text == 'ь':
            # A lone soft sign continues the preceding morpheme.
            label, starts_new = form_parse.get_last_label_value(), False
        else:
            label, starts_new = MorphemeLabel.SUFF, text[0] != 'ь'
        append_part(text, label, starts_new)

    # The last tail part: an ending when flagged ``new``, a suffix otherwise.
    if last_index >= 0:
        last_part = tail_parts[last_index]
        label = MorphemeLabel.END if last_part.new else MorphemeLabel.SUFF
        append_part(last_part.text, label, True)
    return str(form_parse)
def _append_tail_part(parse, text, label, new):
    """Append each letter of ``text`` to ``parse`` under ``label``.

    ``new`` is passed for the first letter only; every following letter is
    appended with ``False``.
    """
    for letter in text:
        parse.append_letter(letter, label, new)
        new = False


def gen_parse2(common_part, tail, lemma, lemma_parse, is_gerund=False):
    """Generate the parse string of a word form from its lemma's parse.

    The lemma parse is cut down to ``len(common_part)`` letters and the
    parts of ``tail`` are then appended with labels chosen from the state of
    the cut parse and from each part's ``.text`` / ``.new`` attributes.

    Args:
        common_part: Beginning of the form that is kept from the lemma; only
            its length is used here.
        tail: Tail description handed to ``parse_tail_parts``, which yields
            the tail parts and the tail length.
        lemma: Lemma text; passed to ``parse_word`` with ``lemma_parse``.
        lemma_parse: Parse of the lemma as accepted by ``parse_word``.
        is_gerund: When true, the last tail part is always labelled as a
            suffix.

    Returns:
        ``str()`` of the generated parse.

    Raises:
        Exception: "Unknown case" when the kept prefix already covers the
            whole lemma but more than one tail part remains; "Incorrect
            parse generated" when the final parse length differs from
            ``len(common_part) + tail_len``.
    """
    parsed_lemma = parse_word(lemma, lemma_parse)
    common_parse = parsed_lemma.copy()
    is_polden = lemma == "полдень"
    has_postfix = parsed_lemma.morphemes[-1].label == MorphemeLabel.POSTFIX

    tail_parts, tail_len = parse_tail_parts(tail)
    postfix_part = None

    if has_postfix:
        # The last tail part is taken to be the postfix and handled last.
        postfix_part = tail_parts.pop()

    # Cut the lemma parse down to the kept prefix.
    while len(common_parse) > len(common_part):
        common_parse.pop_letter()

    if len(tail_parts) > 0:
        # Something follows the prefix, so ending letters that survived the
        # cut are relabelled as suffix letters.
        for i in range(len(common_parse)):
            if common_parse.get_label(i) == 'END':
                common_parse.set_label(i, MorphemeLabel.SUFF)

    first_is_prefix = (
        len(common_parse) == 0
        or common_parse.get_last_label_value() == MorphemeLabel.PREF)

    if not tail_parts:
        return str(common_parse)

    current_pos = len(common_parse)
    if current_pos >= len(lemma):
        # The whole lemma was kept: the only supported tail is one ending.
        if len(tail_parts) > 1:
            raise Exception("Unknown case")
        _append_tail_part(common_parse, tail_parts[0].text,
                          MorphemeLabel.END, True)

        if has_postfix:
            _append_tail_part(common_parse, postfix_part.text,
                              MorphemeLabel.POSTFIX, True)

        return str(common_parse)

    # Decide whether the root is already complete at the cut position.
    if parsed_lemma.get_label(current_pos) != 'ROOT':
        complete_root = True
    elif lemma[current_pos] == 'ь' and (
            len(parsed_lemma) == current_pos + 1
            or parsed_lemma.get_label(current_pos + 1) != 'ROOT'):
        # The cut fell just before a root-final soft sign. The root is left
        # open only when the first tail part is exactly that soft sign.
        # The length is taken of the whole part text: the previous code
        # measured its first character, which is always 1, so the
        # "longer than one letter" test could never succeed.
        first_text = tail_parts[0].text
        complete_root = first_text[0] != 'ь' or len(first_text) > 1
    else:
        complete_root = False

    current_tail_pos = 0
    if first_is_prefix:
        # The kept prefix is empty or ends inside a prefix morpheme.
        if len(common_parse) != 0 and not tail_parts[current_tail_pos].new:
            # A non-new part continues that prefix morpheme.
            _append_tail_part(common_parse, tail_parts[current_tail_pos].text,
                              MorphemeLabel.PREF, False)
            current_tail_pos += 1
        if current_tail_pos < len(tail_parts):
            # The next part opens the root.
            _append_tail_part(common_parse, tail_parts[current_tail_pos].text,
                              MorphemeLabel.ROOT, True)
            current_tail_pos += 1

    if (current_tail_pos < len(tail_parts) and not complete_root
            and not tail_parts[current_tail_pos].new):
        # A non-new part after an unfinished root continues the root.
        _append_tail_part(common_parse, tail_parts[current_tail_pos].text,
                          MorphemeLabel.ROOT, False)
        current_tail_pos += 1

    # Every remaining part except the last one.
    while current_tail_pos < len(tail_parts) - 1:
        text = tail_parts[current_tail_pos].text
        # Special case for the lemma "полдень": these parts are root letters.
        label = MorphemeLabel.ROOT if is_polden else MorphemeLabel.SUFF
        _append_tail_part(common_parse, text, label, text[0] != 'ь')
        current_tail_pos += 1

    if current_tail_pos < len(tail_parts):
        last_part = tail_parts[current_tail_pos]
        label = MorphemeLabel.END
        if not last_part.new and not last_part.text == 'ь':
            label = MorphemeLabel.SUFF

        # These final parts are always labelled as suffixes.
        if last_part.text in ('л', 'вши', 'ши', 'в', 'ну',
                              'ек', 'ин', 'ен',
                              'н') or is_gerund:
            label = MorphemeLabel.SUFF

        _append_tail_part(common_parse, last_part.text, label, True)

    if has_postfix:
        # NOTE(review): the postfix is appended with new=False here but with
        # new=True in the early-return branch above - confirm which is meant.
        _append_tail_part(common_parse, postfix_part.text,
                          MorphemeLabel.POSTFIX, False)

    # Sanity check: kept prefix plus tail must account for every letter.
    if len(common_parse) != len(common_part) + tail_len:
        raise Exception("Incorrect parse generated")

    return str(common_parse)