import sys
from typing import IO, Optional

from nltk.corpus.reader.wordnet import Synset, WordNetError

from finntk.wordnet.reader import fiwn, get_en_fi_maps
from finntk.wordnet.utils import post_id_to_pre, pre2ss, pre_id_to_post, ss2pre

# NOTE: the finntk import locations for fiwn, post_id_to_pre and pre_id_to_post
# are assumptions based on where ss2pre/pre2ss live; transform_blocks,
# eq_matcher, BYPASS and lemma_intersect are project-internal helpers assumed
# to be imported elsewhere in this module.


def eurosense_reanchor(inf: IO, outf: IO):
    """
    Reanchors Eurosense lemmas which are actually forms including some "light"
    word like "ei" or "olla" by removing said unnecessary word.
    """
    EXTRA_BITS = {"ei", "olla"}
    fi2en, en2fi = get_en_fi_maps()

    def ann_reanchor(ann):
        all_lemma_names = []
        for _, synset in iter_synsets(ann.text):
            for lemma in synset.lemmas():
                all_lemma_names.append(lemma.name())
        if " " not in ann.attrib["lemma"]:
            return
        lem_begin, lem_rest = ann.attrib["lemma"].split(" ", 1)
        if lem_begin not in EXTRA_BITS:
            return
        anchor_begin = ann.attrib["anchor"].split(" ", 1)[0]
        for lemma_name in all_lemma_names:
            if lemma_name.split("_", 1)[0] in (anchor_begin, lem_begin):
                return
        ann.attrib["lemma"] = lem_rest
        ann.attrib["anchor"] = ann.attrib["anchor"].split(" ", 1)[1]

    transform_blocks(eq_matcher("annotation"), inf, ann_reanchor, outf)
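
# Worked example (hypothetical data, not from the corpus): given an annotation
# with lemma "olla tärkeä" and anchor "on tärkeä", where no FiWN lemma of its
# synsets starts with "olla" or "on", ann_reanchor rewrites it to lemma
# "tärkeä" / anchor "tärkeä". A minimal invocation sketch, assuming Eurosense
# XML exists at these illustrative paths:
#
#     with open("eurosense.in.xml", "rb") as inf, \
#             open("eurosense.reanchored.xml", "wb") as outf:
#         eurosense_reanchor(inf, outf)
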
def write_lemma(keyout, inst_id, lemma):
    fi2en, en2fi = get_en_fi_maps()
    if lemma is None:
        guess = "U"
    else:
        chosen_synset_fi_id = ss2pre(lemma.synset())
        if chosen_synset_fi_id not in fi2en:
            sys.stderr.write(
                "No fi2en mapping found for {} ({})\n".format(
                    chosen_synset_fi_id, lemma
                )
            )
            guess = "U"
        else:
            guess = pre_id_to_post(fi2en[chosen_synset_fi_id])
    keyout.write("{} {}\n".format(inst_id, guess))
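
# Output format sketch (hypothetical instance id; assumes finntk's FinnWordNet
# data is available): each call appends one "<instance id> <sense id>" line of
# a WSD answer key, with "U" standing in for an unknown sense:
#
#     write_lemma(sys.stdout, "eurosense.1.1", None)   # prints "eurosense.1.1 U"
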
def maybe_fi2en_ss(ss: Synset) -> Optional[Synset]:
    from finntk.wordnet.reader import get_en_fi_maps
    from finntk.wordnet.utils import ss2pre, pre2ss
    from nltk.corpus import wordnet

    fi2en, _en2fi = get_en_fi_maps()
    pre_fi = ss2pre(ss)
    pre_en = fi2en.get(pre_fi)
    if pre_en is None:
        return None
    try:
        return pre2ss(wordnet, pre_en)
    except WordNetError:
        return None
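
# Usage sketch (the synset name "koira.n.01" is illustrative; assumes the
# FinnWordNet data behind finntk's fiwn reader is installed):
#
#     en_ss = maybe_fi2en_ss(fiwn.synset("koira.n.01"))
#     # en_ss is the aligned PWN synset, or None if FiWN has no mapping
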
def _map_qf2(synset_obj):
    fi2en, en2fi = get_en_fi_maps()
    return fi2en[ss2pre(synset_obj)]
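
# Unlike maybe_fi2en_ss above, this helper assumes the fi2en mapping exists and
# will raise KeyError for unmapped synsets. Sketch (synset name illustrative):
#
#     en_pre_id = _map_qf2(fiwn.synset("koira.n.01"))
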
def eurosense_fix_lemmas(inf: IO, outf: IO, keep_unknown: bool, quiet: bool):
    """
    Eurosense contains many lemmas which are not in the set of lemmas for the
    synset in FinnWordNet. There are two reasons this might occur.

    Scenario A) Bad lemmatisation by Babelfy. In this case we can try and
    recover the correct lemma by lemmatising ourselves and combining with
    information from WordNet.

    Scenario B) Extra lemmas have been associated with the WordNet synset in
    BabelNet. In this case there's nothing to do, and we should usually just
    drop the annotation.
    """
    fi2en, en2fi = get_en_fi_maps()

    def ann_fix_lemmas(ann):
        # 1) check if their lemmatisation matches something in FiWN as is
        orig_lemma_str = ann.attrib["lemma"]
        orig_lemma_str = orig_lemma_str.replace("#", "").replace(" ", "_")

        def mk_lemma_synset_map(lower=False):
            lemma_synset_map = {}
            for synset_id, synset in iter_synsets(ann.text):
                for lemma in synset.lemmas():
                    lemma_str = lemma.name()
                    if lower:
                        lemma_str = lemma_str.lower()
                    lemma_synset_map.setdefault(lemma_str, set()).add(synset_id)
            return lemma_synset_map

        lemma_synset_map = mk_lemma_synset_map()
        if orig_lemma_str in lemma_synset_map:
            ann.text = " ".join(lemma_synset_map[orig_lemma_str])
            ann.attrib["lemma"] = orig_lemma_str
            return
        # 2) Try and just use the surface as is as the lemma
        lemmatised_anchor = ann.attrib["anchor"].replace(" ", "_")
        lemma_synset_map_lower = mk_lemma_synset_map(lower=True)
        if lemmatised_anchor.lower() in lemma_synset_map_lower:
            ann.text = " ".join(lemma_synset_map_lower[lemmatised_anchor.lower()])
            # XXX: Should be lemma in original case rather than anchor in
            # original case
            ann.attrib["lemma"] = lemmatised_anchor
            return
        # 3) Re-lemmatise the surface using OMorFi and try and match with FiWN
        anchor_bits = ann.attrib["anchor"].split(" ")
        matches = {}
        for lemma_str, synset_id in lemma_synset_map.items():
            lemma_bits = lemma_str.split("_")
            common = lemma_intersect(anchor_bits, lemma_bits)
            if common is not None:
                matches.setdefault(lemma_str, set()).update(synset_id)
        if len(matches) == 1:
            lemma, synsets = next(iter(matches.items()))
            ann.attrib["lemma"] = lemma
            ann.text = " ".join(synsets)
            return
        elif len(matches) > 1:
            if not quiet:
                sys.stderr.write(
                    "Multiple lemmas found for {}: {}\n".format(
                        ann.attrib["anchor"], matches
                    )
                )
        # If nothing has worked, it's probably scenario B as above
        elif len(matches) == 0:
            if not quiet:
                sys.stderr.write(
                    "No lemma found for {} {} {}\n".format(
                        ann.text, orig_lemma_str, lemmatised_anchor
                    )
                )
        if keep_unknown:
            ann.attrib["lemma"] = orig_lemma_str
        else:
            return BYPASS

    transform_blocks(eq_matcher("annotation"), inf, ann_fix_lemmas, outf)
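
# Invocation sketch (illustrative paths): drop annotations whose lemma cannot
# be reconciled with FiWN (scenario B above), logging the failures:
#
#     with open("eurosense.in.xml", "rb") as inf, \
#             open("eurosense.fixed.xml", "wb") as outf:
#         eurosense_fix_lemmas(inf, outf, keep_unknown=False, quiet=False)
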
def iter_synsets(synset_list):
    fi2en, en2fi = get_en_fi_maps()
    for synset_id in synset_list.split(" "):
        fi_pre_synset = en2fi[post_id_to_pre(synset_id)]
        synset = pre2ss(fiwn, fi_pre_synset)
        yield synset_id, synset
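
# Usage sketch: synset_list is the space-separated synset-id string carried in
# an annotation's text. The id format shown is an illustrative guess at the
# post-format ids expected here; unmapped ids would raise KeyError:
#
#     for synset_id, synset in iter_synsets("02084071-n"):
#         print(synset_id, [lemma.name() for lemma in synset.lemmas()])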