Пример #1
0
def nom_and_gen_xml(content0, content1, content2, content3, content4):
    """Build the nodes for a lemma followed by its genitive form.

    Two input layouts are handled:

    * ``content0`` (right-stripped) ends with a comma: the comma is dropped
      from the lemma and the genitive material starts at ``content2``.
    * otherwise ``content0`` is used verbatim as the lemma and the genitive
      material starts at ``content3``.

    In both layouts, when the first genitive field is a lone ``-`` the
    inflected text is taken from the *following* field and prefixed with
    ``' -'``; otherwise the field itself is used, left-stripped and prefixed
    with a single space.  A trailing sense number (``'1.'`` or ``'1. a)'``)
    is cut off the inflected text and returned separately as bold nodes.

    Args:
        content0: Text holding the lemma (optionally ending with a comma).
        content1: Not read by this function; kept for interface
            compatibility.
        content2, content3, content4: Candidate fields for the ``-`` marker
            and the inflected form, depending on the layout.

    Returns:
        A 3-tuple ``(result, tag_span, sense_numbers_in_end)`` where
        ``result`` is ``[form_lemma, pc, *usg_gen, form_inflected]``,
        ``tag_span`` is the index of the last field used plus one
        (3, 4 or 5 -- presumably the number of source tags consumed;
        confirm against the caller), and ``sense_numbers_in_end`` is the
        list of bold nodes for a stripped sense number (empty if none).
    """

    def create_nodes_for_sense_numbers(sense_number):
        # One bold node per visible part of the sense number.
        nodes = []
        if sense_number == '1.':
            nodes.append(nf.create_bold_hi_node('1.'))
        elif sense_number == '1. a)':
            nodes.append(nf.create_bold_hi_node('1.'))
            nodes.append(nf.create_bold_hi_node(' a)'))
        return nodes

    def remove_sense_numbers(text):
        # The marker is known to sit at the END of the text, so cut at its
        # LAST occurrence; cutting at the first one would drop everything
        # after an earlier, unrelated '1.' inside the form.
        stripped = text.strip()
        for marker in ('1. a)', '1.'):
            if stripped.endswith(marker):
                return (text[:text.rindex(marker)],
                        create_nodes_for_sense_numbers(marker))
        return text, []

    fields = (content0, content1, content2, content3, content4)

    # endswith() is safe on an empty string, unlike indexing with [-1].
    lemma_text = content0.rstrip()
    if lemma_text.endswith(','):
        form_lemma = nf.create_form_lemma_node(lemma_text[:-1])
        first_gen_field = 2
    else:
        form_lemma = nf.create_form_lemma_node(content0)
        first_gen_field = 3
    usg_gen = nf.create_usg_node('gen.')

    if fields[first_gen_field].strip() == '-':
        # A lone dash: the actual ending lives in the next field.
        inflected_text, sense_numbers_in_end = remove_sense_numbers(
            fields[first_gen_field + 1])
        form_inflected = nf.create_form_inflected_node(' -' + inflected_text)
        tag_span = first_gen_field + 2
    else:
        inflected_text, sense_numbers_in_end = remove_sense_numbers(
            fields[first_gen_field])
        form_inflected = nf.create_form_inflected_node(
            ' ' + inflected_text.lstrip())
        tag_span = first_gen_field + 1

    pc = nf.create_pc_node(', ')

    # create_usg_node's result is unpacked, so it must be iterable.
    result = [form_lemma, pc, *usg_gen, form_inflected]
    return result, tag_span, sense_numbers_in_end
Пример #2
0
def adj_multiple_forms_xml(content0):
    """Encode a comma-separated list of adjective forms.

    ``content0`` is split on ``', '``.  The first piece becomes the lemma
    node; every further piece becomes an inflected-form node preceded by a
    ``', '`` punctuation node.

    Returns:
        The list of created nodes, in document order.
    """
    lemma, *other_forms = content0.split(', ')
    nodes = [nf.create_form_lemma_node(lemma)]

    for form in other_forms:
        inflected = nf.create_form_inflected_node(form)
        separator = nf.create_pc_node(', ')
        nodes += [separator, inflected]

    return nodes
def unknown_initial_xml(content0):
    """Encode the leading text of an entry of unknown type.

    ``content0`` is split on ``', '`` and pieces for which
    ``is_empty_string`` is true are discarded.  The first remaining piece
    becomes the lemma node, the others inflected-form nodes; consecutive
    nodes are separated by a ``', '`` punctuation node (none after the last
    one).

    Returns:
        The list of created nodes; empty when no piece survives filtering.
    """
    pieces = [p for p in content0.split(', ') if not is_empty_string(p)]
    last_index = len(pieces) - 1
    nodes = []

    for position, piece in enumerate(pieces):
        if position == 0:
            nodes.append(nf.create_form_lemma_node(piece))
            # TODO: Extra comma / missing comma
            if last_index > 0:
                nodes.append(nf.create_pc_node(', '))
            continue

        nodes.append(nf.create_form_inflected_node(piece))
        if position != last_index:
            nodes.append(nf.create_pc_node(', '))

    return nodes
Пример #4
0
def noun_xml(content0, content1):
    """Encode a noun head: lemma, optional inflected form, grammar group.

    ``content0`` is split at its LAST ``', '``: the text before it is the
    lemma and the text after it the inflected form.  When ``content0`` has
    no ``', '`` at all, the whole text is the lemma and no inflected form is
    emitted (previously this raised ``IndexError``).  An empty inflected
    part is skipped together with its separator.

    Args:
        content0: Lemma text, optionally followed by ``', '`` and an
            inflected form.
        content1: Text passed to ``nf.create_gram_grp``.

    Returns:
        ``[form_lemma, (pc, form_inflected)?, gram_grp]`` as a list.
    """
    lemma, separator, inflected = content0.rpartition(', ')
    if not separator:
        # rpartition puts the whole string in the LAST slot when the
        # separator is missing; treat it as a bare lemma instead.
        lemma, inflected = content0, ''

    result = [nf.create_form_lemma_node(lemma)]

    if inflected:
        result.append(nf.create_pc_node(', '))
        result.append(nf.create_form_inflected_node(inflected))

    result.append(nf.create_gram_grp(content1))
    return result
Пример #5
0
def verb_xml(entry_type, content0, content1):
    """Encode a verb head: lemma, further forms and the inflection type.

    ``content0`` is split on ``', '``; the first piece is the lemma, each
    following piece an inflected form preceded by a ``', '`` punctuation
    node.  Unless ``entry_type`` is ``'special_verb'``, a grammar group of
    type ``"iType"`` built from ``content1`` is appended.

    Returns:
        The list of created nodes, in document order.
    """
    lemma, *inflected_forms = content0.split(', ')
    nodes = [nf.create_form_lemma_node(lemma)]

    for form in inflected_forms:
        separator = nf.create_pc_node(', ')
        inflected = nf.create_form_inflected_node(form)
        nodes.extend((separator, inflected))

    if entry_type == 'special_verb':
        return nodes

    nodes.append(nf.create_gram_grp(content1, "iType"))
    return nodes
def unknown_entry_partially_encode(entry):
    """Re-encode ``entry.encoded_parts['morph_part']`` node by node.

    The current list of nodes is copied, the entry's list is reset to empty,
    and each old node is then either replaced by one or more newly created
    nodes or carried over unchanged.  The function mutates ``entry`` in
    place and returns ``None``.

    Per-node rules, checked in this order:

    1. The very first node is replaced by ``unknown_initial_xml`` of its
       text.
    2. Text wrapped in parentheses becomes an extra-morph node.
    3. Text found inside ``punctuation + '–'`` becomes a punctuation node.
    4. Italic nodes become a grammar group (``m``/``f``/``n``), are merged
       into the preceding form (the ``'и'`` case below), or become usage
       nodes.
    5. ``'1'``..``'4'`` not followed by a ``'.'`` node becomes an ``iType``
       grammar group.
    6. Anything else is kept as is.
    """
    # Iterate over a copy so the entry's own list can be rebuilt from empty.
    old_morph_part = copy(entry.encoded_parts['morph_part'])
    entry.encoded_parts['morph_part'] = []

    counter = 0
    while old_morph_part:
        # Default: carry the node over unchanged.
        content_node = [old_morph_part[0]]
        node_content = SafeString(old_morph_part[0].text)

        if counter == 0:
            # NOTE(review): this uses the raw ``.text`` rather than the
            # SafeString-wrapped ``node_content`` -- confirm it is intended.
            content_node = unknown_initial_xml(content_node[0].text)

        elif node_content.strip().startswith(
                '(') and node_content.strip().endswith(')'):
            content_node = [nf.create_extra_morph(node_content)]

        # NOTE(review): if ``punctuation`` is a str (e.g. string.punctuation)
        # this is a substring test, so an empty stripped text also matches --
        # verify.
        elif node_content.strip() in (punctuation + '–'):
            content_node = [nf.create_pc_node(node_content)]

        elif old_morph_part[0].get('rend') == "italic":
            if node_content.strip() in ('m', 'f', 'n'):
                content_node = [nf.create_gram_grp(node_content)]

            # Italic 'и' right after a single already-encoded form node and
            # right before a bold node: fold both the 'и' node and an orth
            # node for the bold text into that form instead of emitting
            # separate nodes.
            elif len(entry.encoded_parts['morph_part']) == 1 and len(old_morph_part) >= 2 and old_morph_part[1].get('rend' ) == 'bold' and \
                 node_content.strip() == 'и' and entry.encoded_parts['morph_part'][0].tag == nf.get_ns('form'):
                content_node = []
                entry.encoded_parts['morph_part'][0].append(old_morph_part[0])
                entry.encoded_parts['morph_part'][0].append(
                    nf.create_orth_node(SafeString(old_morph_part[1].text)))
                # Drop the 'и' node here; the pop at the end of the loop body
                # then removes the bold node that was just consumed.
                old_morph_part.pop(0)

            else:
                # Not wrapped in a list: the result is iterated below, so
                # create_usg_node is expected to return an iterable of nodes.
                content_node = nf.create_usg_node(node_content)

        # NOTE(review): ``old_morph_part[1].text`` is not wrapped in
        # SafeString here, unlike the other text accesses -- confirm it
        # cannot be None.
        elif node_content.strip() in ('1', '2', '3', '4') and (
                len(old_morph_part) == 1
                or old_morph_part[1].text.strip() != '.'):
            content_node = [nf.create_gram_grp(node_content, 'iType')]

        # List comprehension used only for its side effect (appending).
        [entry.encoded_parts['morph_part'].append(x) for x in content_node]
        old_morph_part.pop(0)
        counter += 1
Пример #7
0
def create_cit_nodes(node_content):
    """Split example/translation text into ``cit`` nodes.

    Trailing ``–``, ``.`` and ``;`` are first peeled off by
    ``separate_dash_dot_semi_colon_in_end_of_node_content``.  The remaining
    text is split on ``'; '`` into segments (empty ones are discarded) and
    each segment is scanned word by word:

    * the first word judged Cyrillic by ``has_more_cyrillic_than_latin``
      starts the translation; the words before it (if any) form the example;
    * a one-character Cyrillic word directly followed by a non-Cyrillic word
      does NOT start the translation;
    * a segment without such a word is emitted as a single example.

    Segments are separated by ``'; '`` punctuation nodes, and the peeled-off
    end punctuation nodes are appended last.

    Args:
        node_content: The text to encode; it is wrapped in ``SafeString``.

    Returns:
        A list of cit and punctuation nodes in document order.
    """
    result = []
    node_content, *separated_end_punctuation = \
        separate_dash_dot_semi_colon_in_end_of_node_content(
            SafeString(node_content))

    split_contents = [
        x for x in node_content.split('; ') if not is_empty_string(x)
    ]
    last_segment = len(split_contents) - 1

    for y, segment in enumerate(split_contents):
        # Split once per segment instead of on every access.
        words = segment.split(' ')
        last_word = len(words) - 1
        starts_with_space = words[0] == ''

        for i, word in enumerate(words):
            if is_empty_string(word) and i != last_word:
                continue

            starts_translation = has_more_cyrillic_than_latin(word)
            if starts_translation and len(word) == 1:
                # A lone Cyrillic letter followed by a non-Cyrillic word is
                # still part of the example.  The bounds check avoids the
                # IndexError the unguarded look-ahead raised when the lone
                # letter was the last word of the segment.
                following = SafeString(segment).split(' ')
                if i + 1 < len(following) and \
                        not has_more_cyrillic_than_latin(following[i + 1]):
                    starts_translation = False

            if starts_translation:
                # i == 1 after a leading space means the translation is the
                # first real word: keep the leading space, emit no example.
                leading_space_only = starts_with_space and i == 1
                if i > 0 and not leading_space_only:
                    result.append(nf.assemble_cit_nodes(
                        'example', ' '.join(words[:i]) + ' '))
                if leading_space_only:
                    translation = ' '.join(words)
                else:
                    translation = ' '.join(words[i:])
                result.append(
                    nf.assemble_cit_nodes('translation', translation))
                break

            if i == last_word:
                result.append(
                    nf.assemble_cit_nodes('example', ' '.join(words)))
                break

        if y < last_segment:
            result.append(nf.create_pc_node('; '))

    result.extend(separated_end_punctuation)
    return result
Пример #8
0
def separate_dash_dot_semi_colon_in_end_of_node_content(node_content):
    """Peel trailing ``–``, ``.`` and ``;`` off ``node_content``.

    The three marks are tested once each, in exactly that order, against
    the end of the (stripped) text.  For every mark found, a punctuation
    node is created and pushed to the front of the result, and the mark is
    removed from the text.  When the text ends with a space, the trailing
    whitespace is removed as well and the node text gets a trailing space.

    Note that an en dash (``–``) is what is searched for, while the node
    emitted for it contains an em dash (``—``).

    Returns:
        A ``deque`` whose first item is the remaining text, followed by the
        punctuation nodes in the order they appeared in the text.
    """
    # (mark looked for at the end, text of the punctuation node emitted)
    trailing_marks = (('–', '—'), ('.', '.'), (';', ';'))
    pieces = deque()

    for searched, emitted in trailing_marks:
        if not node_content.strip().endswith(searched):
            continue

        if node_content.endswith(' '):
            node_content = node_content.rstrip()[:-1]
            pc_text = emitted + ' '
        else:
            node_content = node_content[:-1]
            pc_text = emitted

        pieces.appendleft(nf.create_pc_node(pc_text))

    pieces.appendleft(node_content)
    return pieces
Пример #9
0
def encode_senses(entry):
    """Encode ``entry.raw_senses`` into sense containers and content nodes.

    ``entry.raw_senses`` is consumed from the front until it is empty.
    Nodes whose text is a (sub)sense number open a new sense container;
    every other node is converted to punctuation, usage, definition or
    cit nodes and appended to the most recent sense container (or directly
    to ``entry.encoded_parts['senses']`` when there is none yet).

    When ``raw_senses`` is empty on entry, the work is delegated to
    ``deal_with_completely_unknown_entry`` and nothing else happens.

    The function mutates ``entry`` in place and returns ``None``.
    """
    raw_senses = entry.raw_senses
    title_lemma = entry.title_lemma
    last_sense_container = None
    # Sense numbers seen so far; passed to create_subsense_number_node.
    numbers = []

    if raw_senses:
        fix_dot_in_next_node(raw_senses)


        if one_is_missing(raw_senses):
            add_missing_one(entry, title_lemma)
            numbers.append('1')

        if not is_numbered_entry(raw_senses):
            # Unnumbered entry: a single implicit sense container holds
            # everything.
            entry.encoded_parts['senses'].append(nf.create_sense_container_non_numbered(title_lemma))
            last_sense_container = entry.encoded_parts['senses'][0]
            numbers.append('1')

    else:
        deal_with_completely_unknown_entry(entry)


    while raw_senses:
        initial = SafeString(raw_senses[0].text).strip()
        initial = fix_cyrillic_letter(initial)

        if is_subsense_number(initial):

            # NOTE(review): ``continue`` skips the pop at the end of the loop
            # body, so fix_mixed_numbers presumably changes raw_senses itself
            # when it returns a truthy value -- verify, otherwise this loops
            # forever.
            if fix_mixed_numbers(entry, initial):
                continue

            # First character is the number; for numbers starting with 'I'
            # everything but the last character is taken instead.
            sense_number = initial[0]
            if sense_number == 'I':
                sense_number = initial[:-1]
            numbers.append(sense_number)
            last_sense_container = create_subsense_number_node(title_lemma, numbers, SafeString(raw_senses[0].text))
            append_sense_container_and_label(entry, last_sense_container)

        else:
            # Default: carry the raw node over unchanged.
            content_node = [raw_senses[0]]
            current_text = SafeString(raw_senses[0].text)

            # NOTE(review): if ``punctuation`` is a str this is a substring
            # test, so an empty stripped text also matches -- verify.
            if current_text.strip() in (punctuation + '–'):
                content_node = [nf.create_pc_node(current_text)]

            elif raw_senses[0].get('rend') == "italic":
                content_node = nf.create_usg_node(current_text)

            elif raw_senses[0].get('rend') == "bold" and has_more_cyrillic_than_latin(current_text):
                content_node = nf.create_def_node(current_text)

            # No container yet, or the container holds no cit/quote child so
            # far, and the text starts with a Cyrillic word: the leading
            # Cyrillic run is a definition, the rest (if any) is
            # example/translation material.
            # NOTE(review): ``not last_sense_container`` is a truthiness
            # test, while the check further down uses ``is not None`` --
            # confirm both behave the same for the container type.
            elif (not last_sense_container or len([x for x in last_sense_container.getchildren() if x.tag in (nf.get_ns('cit'), nf.get_ns('quote'))]) == 0) and \
                has_more_cyrillic_than_latin(current_text.strip().split(' ')[0]):

                    node_content, *separated_end_punctuation = separate_dash_dot_semi_colon_in_end_of_node_content(current_text)

                    content_node = []

                    found_latin = False

                    # ``i`` is read after the loop as the index of the first
                    # non-definition word.
                    i = 0
                    for i in range(len(node_content.split(' '))):
                        word = node_content.split(' ')[i]
                        if is_empty_string(word):
                            continue
                        # A one-character Cyrillic word followed by a
                        # non-Cyrillic word does not count as Cyrillic.
                        # NOTE(review): the ``[i+1]`` look-ahead has no
                        # bounds check; it looks like it raises IndexError
                        # when such a word is the last one -- confirm.
                        if has_more_cyrillic_than_latin(word) and not (len(word) == 1 and not has_more_cyrillic_than_latin(SafeString(node_content).split(' ')[i+1])):
                            pass
                        else:
                            found_latin = True
                            break
                    if not found_latin:
                        # Whole text is the definition.
                        def_node = nf.create_def_node(node_content)
                        content_node.extend(def_node)
                    else:
                        # Words before index i form the definition, the rest
                        # goes through create_cit_nodes.
                        def_node = nf.create_def_node(' '.join(node_content.split(' ')[:i]) + ' ')
                        content_node.extend(def_node)
                        cit_node = create_cit_nodes(' '.join(node_content.split(' ')[i:]))
                        content_node.extend(cit_node)

                    for punct_node in separated_end_punctuation:
                        content_node.append(punct_node)



            else:
                content_node = create_cit_nodes(current_text)

            # List comprehensions used only for their side effect
            # (appending).
            if last_sense_container is not None:
                [last_sense_container.append(x) for x in content_node]
            else:
                [entry.encoded_parts['senses'].append(x) for x in content_node]

        raw_senses.pop(0)