def process(self, lexicon: Lexicon):
    """Build Latin WFL to DeriNet format.

    Reads the pickled, harmonised Word Formation Latin data from
    ``self.fname`` and populates ``lexicon`` with lexemes (plus
    morphological features), main derivation/conversion relations,
    secondary-parent references, references to split families, and
    compounds.

    :param lexicon: the Lexicon object to populate.
    :return: the populated Lexicon.
    """
    def parse_lemmas(l_lem, l_pos):
        """Translate a WFL lemma + tag into ``(upos, feats, lemid)``.

        ``l_pos`` has the shape ``<pos>_<gender>_<?>_<wid>``; the first
        character of <pos> selects the universal POS, the optional second
        character encodes declension / adjective class / conjugation.
        """
        parse_pos = {
            'A': 'ADJ', 'N': 'NOUN', 'V': 'VERB',
            'I': 'X', 'P': 'PRON', 'U': 'AUX'
        }
        gend_parse = {'m': 'Masc', 'f': 'Fem', 'n': 'Neut'}
        pos, gend, _, wid = l_pos.split('_')
        feat = {}
        if pos[0] == 'N':
            if gend in ('m', 'f', 'n'):
                feat['Gender'] = gend_parse[gend]
            if len(pos) > 1:
                feat['Declension'] = pos[1]
        elif pos[0] == 'A' and len(pos) > 1:
            feat['AdjClass'] = pos[1]
        elif pos[0] == 'V':
            if len(pos) <= 1:
                pass
            elif pos[1] in ('1', '2', '3', '4', '5'):
                feat['Conjugation'] = pos[1]
            elif pos[1] == 'A':
                # 'VA' marks an auxiliary verb; remap it to the AUX tag.
                pos = 'U'
        lid = l_lem + '#' + parse_pos[pos[0]] + '#' + wid
        return parse_pos[pos[0]], feat, lid

    # load data
    # FIX: use a context manager so the file handle is closed
    # deterministically (the original left the open file dangling).
    with open(self.fname, 'rb') as pickled:
        harm = pickle.load(pickled)

    # add lemmas, morphological features and segmentation
    for entry in harm:
        pos, feat, lid = parse_lemmas(entry['form'], entry['pos'])
        # check presence in the lexicon (due to compounds)
        present = lexicon.get_lexemes(lemma=entry['form'], pos=pos, lemid=lid)
        if len(present) == 0:
            lexicon.create_lexeme(lemma=entry['form'], pos=pos,
                                  feats=feat, lemid=lid)

    # add main relations and used afix,
    # add other derivational relations and used afix,
    # add references to splitted families,
    # add compounding
    for entry in harm:
        c_pos, _, c_lid = parse_lemmas(entry['form'], entry['pos'])
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos,
                                       lemid=c_lid)[0]

        # main derivational / conversion relation
        if entry['parent']:
            parse = entry['parent'][0][0].split('_')
            p_form = parse[0]
            p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                           lemid=p_lid)[0]
            afix = entry['parent'][1][3]
            typ = entry['parent'][1][2].replace('Derivation_', '')
            if typ in ('Prefix', 'Suffix'):
                lexicon.add_derivation(source=par_node, target=chi_node)
                # record the affix used on the freshly created relation
                chi_node.parent_relation.feats[typ] = afix
            elif typ == 'Conversion':
                lexicon.add_conversion(source=par_node, target=chi_node)

        # other (secondary) derivational relations, stored in misc
        if entry['others']:  # TODO: change place to 9th colummn;conversion
            parents = list()
            for other in entry['others']:
                parse = other[0][0].split('_')
                p_form = parse[0]
                p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))
                afix = other[1][3]
                typ = other[1][2].replace('Derivation_', '')
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                # skip the parent that already holds the main relation
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    if typ in ('Prefix', 'Suffix'):
                        p = par_node.lemid + '&' + typ + '=' + afix
                        p += '&Type=Derivation'
                        parents.append(p)
                    else:
                        parents.append(par_node.lemid + '&Type=' + typ)
            if parents:
                chi_node.misc['other_parents'] = '|'.join(parents)

        # references to the families this lexeme was split from
        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                parse = ref.split('_')
                p_form = parse[0]
                p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

        # compounding
        if entry['compounding']:
            # first compound parent: create it on demand
            p1_parse = entry['compounding'][0][0].split('_')
            p1_form = p1_parse[0]
            p1_attr = '_'.join(p1_parse[1:])
            p1_pos, p1_feat, p1_lid = parse_lemmas(p1_form, p1_attr)
            p1_node = lexicon.get_lexemes(lemma=p1_form, pos=p1_pos,
                                          lemid=p1_lid)
            if len(p1_node) == 0:
                lexicon.create_lexeme(lemma=p1_form, pos=p1_pos,
                                      feats=p1_feat, lemid=p1_lid)
            p1_node = lexicon.get_lexemes(lemma=p1_form, pos=p1_pos,
                                          lemid=p1_lid)[0]

            # second compound parent: create it on demand
            p2_parse = entry['compounding'][1][0].split('_')
            p2_form = p2_parse[0]
            p2_attr = '_'.join(p2_parse[1:])
            p2_pos, p2_feat, p2_lid = parse_lemmas(p2_form, p2_attr)
            p2_node = lexicon.get_lexemes(lemma=p2_form, pos=p2_pos,
                                          lemid=p2_lid)
            if len(p2_node) == 0:
                lexicon.create_lexeme(lemma=p2_form, pos=p2_pos,
                                      feats=p2_feat, lemid=p2_lid)
            p2_node = lexicon.get_lexemes(lemma=p2_form, pos=p2_pos,
                                          lemid=p2_lid)[0]

            # a compound needs two distinct, existing parents
            if p1_node == p2_node or not p1_node or not p2_node:
                continue
            lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build GCelex to DeriNet format.

    Reads the pickled, harmonised German CELEX data from ``self.fname``
    and populates ``lexicon`` with lexemes, main derivational relations,
    original segmentation features (stored in misc), and compounds.

    :param lexicon: the Lexicon object to populate.
    :return: the populated Lexicon.
    """
    # load data
    # FIX: use a context manager so the file handle is closed
    # deterministically (the original left the open file dangling).
    with open(self.fname, 'rb') as pickled:
        harm = pickle.load(pickled)

    parse_pos = {
        'N': 'NOUN', 'V': 'VERB', 'A': 'ADJ', 'D': 'ADV',
        'X': 'X', 'C': 'NUM', 'P': 'ADP'
    }

    def store_orig_features(node, orig_str):
        """Store the '#'-separated original segmentation annotations
        (hierarchical segmentation, flat segmentation, morpheme order)
        into ``node.misc``."""
        orig = orig_str.split('#')
        if len(orig) > 0 and orig != ['']:
            node.misc['segmentation_hierarch'] = orig[0]
        if len(orig) > 1:
            node.misc['segmentation'] = orig[1]
        if len(orig) > 2:
            node.misc['morpheme_order'] = orig[2]

    def compound_parent(raw, orig_str):
        """Return the lexeme for one compound parent (``raw`` is
        'oid_form_pos'), creating it and storing its original features
        if it is not present yet."""
        p_oid, p_form, p_pos = raw.split('_')
        p_pos = parse_pos[p_pos]
        # FIX: build the lemid in the same form#pos#oid layout used
        # everywhere else in this method (the original joined
        # oid#form#pos, so lexemes created in the first loop were never
        # found and duplicates were created).
        p_lid = '#'.join([p_form, p_pos, p_oid])
        nodes = lexicon.get_lexemes(lemma=p_form, pos=p_pos, lemid=p_lid)
        if len(nodes) == 0:
            lexicon.create_lexeme(lemma=p_form, pos=p_pos, lemid=p_lid)
            nodes = lexicon.get_lexemes(lemma=p_form, pos=p_pos, lemid=p_lid)
        node = nodes[0]
        store_orig_features(node, orig_str)
        return node

    # add lemmas and morphological features
    for entry in harm:
        oid, form = entry['form'].split('_')
        lid = form + '#' + parse_pos[entry['pos']] + '#' + oid
        lexicon.create_lexeme(lemma=form, pos=parse_pos[entry['pos']],
                              lemid=lid)

    # add main relations,
    # add original features,
    # add compounds
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        oid, form = entry['form'].split('_')
        c_lid = form + '#' + c_pos + '#' + oid
        chi_node = lexicon.get_lexemes(lemma=form, pos=c_pos,
                                       lemid=c_lid)[0]

        # main derivational relation
        if entry['parent']:
            p_oid, p_form, p_pos = entry['parent'][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form + '#' + p_pos + '#' + p_oid
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)

        # features
        store_orig_features(chi_node, entry['orig'])

        # compounds
        if entry['compounding']:
            p1_node = compound_parent(entry['compounding'][0][0],
                                      entry['compounding'][0][1])
            # FIX: the original fetched parent 2 with parent 1's
            # lemma/pos/lemid (copy-paste error), so parent-2 features
            # were written onto the parent-1 lexeme; the shared helper
            # removes that whole bug class.
            p2_node = compound_parent(entry['compounding'][1][0],
                                      entry['compounding'][1][1])
            # a compound needs two distinct, existing parents
            if p1_node == p2_node or not p1_node or not p2_node:
                continue
            lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

    return lexicon