def nom_and_gen_xml(content0, content1, content2, content3, content4):
    """Encode a nominative + genitive headword group into TEI-ish nodes.

    Builds ``[form_lemma, <pc ', '>, *usg('gen.'), form_inflected]`` from the
    raw column contents, picking the inflected form from ``content2``/``content3``
    (or ``content3``/``content4`` when the lemma does not end in a comma).
    A trailing sense number ('1.' or '1. a)') is split off the inflected text
    and returned as separate bold <hi> nodes.

    ``content1`` is unused but kept for signature compatibility with callers.

    Returns:
        (result_nodes, tag_span, sense_number_nodes) where ``tag_span`` is the
        number of raw columns consumed.
    """
    result = []
    tag_span = 0
    sense_numbers_in_end = []

    def create_nodes_for_sense_numbers(sense_number):
        # One bold <hi> per sense-number token.
        res = []
        if sense_number == '1.':
            res.append(nf.create_bold_hi_node('1.'))
        elif sense_number == '1. a)':
            res.append(nf.create_bold_hi_node('1.'))
            res.append(nf.create_bold_hi_node(' a)'))
        return res

    def remove_sense_numbers(text):
        # Strip a trailing sense number off *text* and return it as nodes.
        # Fix: use rindex so the TRAILING occurrence is removed — index()
        # truncated at the first '1.' anywhere in the text.
        sense_numbers_in_end = []
        if text.strip().endswith('1.'):
            sense_numbers_in_end = create_nodes_for_sense_numbers('1.')
            text = text[:text.rindex('1.')]
        elif text.strip().endswith('1. a)'):
            sense_numbers_in_end = create_nodes_for_sense_numbers('1. a)')
            text = text[:text.rindex('1. a)')]
        return text, sense_numbers_in_end

    # Fix: endswith() instead of [-1] — the old indexing raised IndexError
    # on an empty / all-whitespace content0.
    if content0.rstrip().endswith(','):
        form_lemma = nf.create_form_lemma_node(content0.rstrip()[:-1])
        usg_gen = nf.create_usg_node('gen.')
        if content2.strip() == '-':
            # '-' placeholder: the real genitive ending sits one column later.
            content3, sense_numbers_in_end = remove_sense_numbers(content3)
            form_inflected = nf.create_form_inflected_node(' -' + content3)
            tag_span = 4
        else:
            content2, sense_numbers_in_end = remove_sense_numbers(content2)
            form_inflected = nf.create_form_inflected_node(' ' + content2.lstrip())
            tag_span = 3
    else:
        form_lemma = nf.create_form_lemma_node(content0)
        usg_gen = nf.create_usg_node('gen.')
        if content3.strip() == '-':
            content4, sense_numbers_in_end = remove_sense_numbers(content4)
            form_inflected = nf.create_form_inflected_node(' -' + content4)
            tag_span = 5
        else:
            content3, sense_numbers_in_end = remove_sense_numbers(content3)
            form_inflected = nf.create_form_inflected_node(' ' + content3.lstrip())
            tag_span = 4
    pc = nf.create_pc_node(', ')
    result.extend([form_lemma, pc, *usg_gen, form_inflected])
    return result, tag_span, sense_numbers_in_end
def adj_multiple_forms_xml(content0):
    """Encode an adjective headword plus its comma-separated variant forms.

    The first ', '-separated piece becomes the lemma; every following piece
    is emitted as a <pc ', '> node followed by an inflected-form node.
    """
    lemma, *variants = content0.split(', ')
    nodes = [nf.create_form_lemma_node(lemma)]
    for variant in variants:
        nodes.append(nf.create_pc_node(', '))
        nodes.append(nf.create_form_inflected_node(variant))
    return nodes
def unknown_initial_xml(content0):
    """Encode the initial chunk of an entry of unknown type.

    Splits on ', ' (empty pieces dropped): the first piece is the lemma,
    the rest are inflected forms, with <pc ', '> separators between all
    consecutive pieces.
    """
    parts = [part for part in content0.split(', ') if not is_empty_string(part)]
    nodes = []
    last = len(parts) - 1
    for idx, part in enumerate(parts):
        if idx == 0:
            nodes.append(nf.create_form_lemma_node(part))
            # Separator only when at least one inflected form follows.
            if len(parts) > 1:
                nodes.append(nf.create_pc_node(', '))
        elif idx == last:
            # Final form carries no trailing separator.
            nodes.append(nf.create_form_inflected_node(part))
        else:
            nodes.append(nf.create_form_inflected_node(part))
            nodes.append(nf.create_pc_node(', '))
    return nodes
def noun_xml(content0, content1):
    """Encode a noun headword: lemma, optional inflected tail, gender group.

    ``content0`` is split on its LAST ', ' into lemma + inflected form;
    ``content1`` becomes a gramGrp node appended at the end.

    Fix: guard the split result — when content0 contains no ', ',
    rsplit yields a single element and the old ``content0_split[1]``
    access raised IndexError.
    """
    result = []
    content0_split = content0.rsplit(', ', 1)
    result.append(nf.create_form_lemma_node(content0_split[0]))
    if len(content0_split) > 1 and content0_split[1]:
        result.append(nf.create_pc_node(', '))
        result.append(nf.create_form_inflected_node(content0_split[1]))
    result.append(nf.create_gram_grp(content1))
    return result
def verb_xml(entry_type, content0, content1):
    """Encode a verb headword with its comma-separated forms.

    The first ', '-separated piece is the lemma; each remaining piece is
    emitted as <pc ', '> + inflected form.  Unless the entry is a
    'special_verb', ``content1`` is appended as an iType gramGrp node.
    """
    lemma, *inflected = content0.split(', ')
    nodes = [nf.create_form_lemma_node(lemma)]
    for form in inflected:
        nodes.append(nf.create_pc_node(', '))
        nodes.append(nf.create_form_inflected_node(form))
    if entry_type != 'special_verb':
        nodes.append(nf.create_gram_grp(content1, "iType"))
    return nodes
def unknown_entry_partially_encode(entry):
    """Re-encode the morph part of an entry of unknown type, node by node.

    Works through a copy of ``entry.encoded_parts['morph_part']`` and
    rebuilds that list in place, classifying each raw node:

    * first node: decomposed via ``unknown_initial_xml``;
    * ``(...)``-wrapped text: an extra-morph node;
    * bare punctuation (incl. en dash): a <pc> node;
    * italic: gender marker ('m'/'f'/'n'), the 'и'+bold variant pattern,
      or otherwise a usage node;
    * a bare digit 1-4 not followed by a '.' node: a homonym index (iType);
    * anything else is passed through unchanged.
    """
    old_morph_part = copy(entry.encoded_parts['morph_part'])
    entry.encoded_parts['morph_part'] = []
    counter = 0
    while old_morph_part:
        # Default: pass the raw node through untouched.
        content_node = [old_morph_part[0]]
        node_content = SafeString(old_morph_part[0].text)
        if counter == 0:
            content_node = unknown_initial_xml(content_node[0].text)
        elif node_content.strip().startswith('(') and node_content.strip().endswith(')'):
            content_node = [nf.create_extra_morph(node_content)]
        elif node_content.strip() in (punctuation + '–'):
            # NOTE(review): `in` on a string is a substring test and
            # '' in s is always True, so an all-whitespace node also
            # becomes a <pc> node — confirm this is intended.
            content_node = [nf.create_pc_node(node_content)]
        elif old_morph_part[0].get('rend') == "italic":
            if node_content.strip() in ('m', 'f', 'n'):
                content_node = [nf.create_gram_grp(node_content)]
            elif len(entry.encoded_parts['morph_part']) == 1 \
                    and len(old_morph_part) >= 2 \
                    and old_morph_part[1].get('rend') == 'bold' \
                    and node_content.strip() == 'и' \
                    and entry.encoded_parts['morph_part'][0].tag == nf.get_ns('form'):
                # 'и' ("and") joining two bold variants: fold the conjunction
                # and the following bold form into the already-encoded <form>
                # element, consuming one extra raw node.
                content_node = []
                entry.encoded_parts['morph_part'][0].append(old_morph_part[0])
                entry.encoded_parts['morph_part'][0].append(
                    nf.create_orth_node(SafeString(old_morph_part[1].text)))
                old_morph_part.pop(0)
            else:
                content_node = nf.create_usg_node(node_content)
        elif node_content.strip() in ('1', '2', '3', '4') and (
                len(old_morph_part) == 1 or old_morph_part[1].text.strip() != '.'):
            content_node = [nf.create_gram_grp(node_content, 'iType')]
        # Fix: plain extend() instead of a side-effect list comprehension.
        entry.encoded_parts['morph_part'].extend(content_node)
        old_morph_part.pop(0)
        counter += 1
def create_cit_nodes(node_content):
    """Split mixed example/translation text into cit nodes.

    Trailing dash/dot/semicolon punctuation is split off first, then the
    text is cut on '; '.  Within each piece, the first word that is mostly
    cyrillic marks the start of the translation; everything before it is
    the (latin) example.  A single cyrillic letter followed by a latin
    word is treated as still belonging to the example side.

    Fix: the look-ahead ``words[i + 1]`` is now guarded — the old code
    raised IndexError when the LAST word was a single cyrillic character.
    The split is also hoisted out of the loop instead of being recomputed
    on every access.
    """
    result = []
    node_content, *separated_end_punctuation = \
        separate_dash_dot_semi_colon_in_end_of_node_content(SafeString(node_content))
    split_contents = [x for x in node_content.split('; ') if not is_empty_string(x)]
    for y, piece in enumerate(split_contents):
        words = piece.split(' ')
        last = len(words) - 1
        for i, word in enumerate(words):
            # Skip interior empty words; the last one must still terminate.
            if is_empty_string(word) and i != last:
                continue
            starts_translation = has_more_cyrillic_than_latin(word) and not (
                len(word) == 1
                and i + 1 <= last
                and not has_more_cyrillic_than_latin(words[i + 1]))
            if starts_translation:
                leading_translation = words[0] == '' and i == 1
                if i > 0 and not leading_translation:
                    # Everything before the cyrillic word is the example.
                    result.append(nf.assemble_cit_nodes(
                        'example', ' '.join(words[:i]) + ' '))
                if leading_translation:
                    # Piece began with an empty word: whole piece translates.
                    result.append(nf.assemble_cit_nodes('translation', ' '.join(words)))
                else:
                    result.append(nf.assemble_cit_nodes('translation', ' '.join(words[i:])))
                break
            elif i == last:
                # No cyrillic word found: the whole piece is an example.
                result.append(nf.assemble_cit_nodes('example', ' '.join(words)))
                break
        if y < len(split_contents) - 1:
            result.append(nf.create_pc_node('; '))
    result.extend(separated_end_punctuation)
    return result
def separate_dash_dot_semi_colon_in_end_of_node_content(node_content):
    """Split trailing dash / dot / semicolon marks off *node_content*.

    Checks, in this order, for a trailing en dash '–' (rendered as an em
    dash '—'), then '.', then ';'.  Each trailing mark found is removed
    from the text and turned into a <pc> node; when the mark is followed
    by trailing whitespace, one trailing space is kept in the <pc> text.

    Refactor: the three byte-identical stanzas (differing only in the mark)
    collapsed into one data-driven loop; the unused None-initialized
    locals were dropped.

    Returns:
        deque: remaining text first, then the <pc> nodes with the mark
        stripped last (closest to the text) coming first.
    """
    result = deque()
    # Order matters: dash is shed first, then dot, then semicolon.
    for mark, rendered in (('–', '—'), ('.', '.'), (';', ';')):
        if not node_content.strip().endswith(mark):
            continue
        if node_content.endswith(' '):
            # Drop trailing whitespace plus the mark; keep one space in
            # the punctuation node so spacing survives re-serialization.
            node_content = node_content.rstrip()[:-1]
            result.appendleft(nf.create_pc_node(rendered + ' '))
        else:
            result.appendleft(nf.create_pc_node(rendered))
            node_content = node_content[:-1]
    result.appendleft(node_content)
    return result
def encode_senses(entry):
    """Encode the raw sense nodes of *entry* into structured sense containers.

    Consumes ``entry.raw_senses`` front-to-back.  Numbered sub-senses open a
    new sense container; all other nodes are classified (punctuation, usage,
    definition, example/translation citations) and appended to the current
    container — or directly to ``entry.encoded_parts['senses']`` when no
    container exists yet.
    """
    raw_senses = entry.raw_senses
    title_lemma = entry.title_lemma
    last_sense_container = None
    numbers = []
    if raw_senses:
        fix_dot_in_next_node(raw_senses)
        if one_is_missing(raw_senses):
            # Entry jumps straight to sense '2': synthesize the missing '1'.
            add_missing_one(entry, title_lemma)
            numbers.append('1')
        if not is_numbered_entry(raw_senses):
            # Unnumbered entry: everything goes into one implicit container.
            entry.encoded_parts['senses'].append(nf.create_sense_container_non_numbered(title_lemma))
            last_sense_container = entry.encoded_parts['senses'][0]
            numbers.append('1')
    else:
        deal_with_completely_unknown_entry(entry)
    while raw_senses:
        initial = SafeString(raw_senses[0].text).strip()
        initial = fix_cyrillic_letter(initial)
        if is_subsense_number(initial):
            # NOTE(review): `continue` here relies on fix_mixed_numbers()
            # consuming from raw_senses, otherwise this loops forever — confirm.
            if fix_mixed_numbers(entry, initial):
                continue
            sense_number = initial[0]
            if sense_number == 'I':
                # Roman numeral: keep all but the trailing character.
                sense_number = initial[:-1]
            numbers.append(sense_number)
            last_sense_container = create_subsense_number_node(title_lemma, numbers, SafeString(raw_senses[0].text))
            append_sense_container_and_label(entry, last_sense_container)
        else:
            # Default: pass the raw node through untouched.
            content_node = [raw_senses[0]]
            current_text = SafeString(raw_senses[0].text)
            if current_text.strip() in (punctuation + '–'):
                # NOTE(review): substring test — '' in s is always True, so an
                # all-whitespace node also becomes a <pc> node; confirm intended.
                content_node = [nf.create_pc_node(current_text)]
            elif raw_senses[0].get('rend') == "italic":
                content_node = nf.create_usg_node(current_text)
            elif raw_senses[0].get('rend') == "bold" and has_more_cyrillic_than_latin(current_text):
                content_node = nf.create_def_node(current_text)
            elif (not last_sense_container or len([x for x in last_sense_container.getchildren() if x.tag in (nf.get_ns('cit'), nf.get_ns('quote'))]) == 0) and \
                    has_more_cyrillic_than_latin(current_text.strip().split(' ')[0]):
                # Cyrillic-leading text before any citation exists: split it
                # into a definition prefix and (optionally) a citation tail.
                node_content, *separated_end_punctuation = separate_dash_dot_semi_colon_in_end_of_node_content(current_text)
                content_node = []
                found_latin = False
                i = 0
                for i in range(len(node_content.split(' '))):
                    word = node_content.split(' ')[i]
                    if is_empty_string(word):
                        continue
                    # A single cyrillic letter followed by a latin word still
                    # counts as latin-side.
                    # NOTE(review): the [i+1] look-ahead can raise IndexError
                    # when the last word is a single cyrillic char — confirm
                    # the input data excludes that case.
                    if has_more_cyrillic_than_latin(word) and not (len(word) == 1 and not has_more_cyrillic_than_latin(SafeString(node_content).split(' ')[i+1])):
                        pass
                    else:
                        found_latin = True
                        break
                if not found_latin:
                    # Entirely cyrillic: the whole text is the definition.
                    def_node = nf.create_def_node(node_content)
                    content_node.extend(def_node)
                else:
                    # Words [:i] form the definition, [i:] the citation(s).
                    def_node = nf.create_def_node(' '.join(node_content.split(' ')[:i]) + ' ')
                    content_node.extend(def_node)
                    cit_node = create_cit_nodes(' '.join(node_content.split(' ')[i:]))
                    content_node.extend(cit_node)
                for punct_node in separated_end_punctuation:
                    content_node.append(punct_node)
            else:
                content_node = create_cit_nodes(current_text)
            if last_sense_container is not None:
                [last_sense_container.append(x) for x in content_node]
            else:
                # No container yet: append straight to the encoded senses list.
                [entry.encoded_parts['senses'].append(x) for x in content_node]
        raw_senses.pop(0)