def eurosense_reanchor(inf: IO, outf: IO):
    """
    Reanchors Eurosense lemmas which are actually forms including some "light"
    word like ei and olla by removing said unnecessary word.
    """
    EXTRA_BITS = {"ei", "olla"}
    fi2en, en2fi = get_en_fi_maps()

    def ann_reanchor(ann):
        all_lemma_names = []
        for _, synset in iter_synsets(ann.text):
            for lemma in synset.lemmas():
                all_lemma_names.append(lemma.name())
        if " " not in ann.attrib["lemma"]:
            return
        lem_begin, lem_rest = ann.attrib["lemma"].split(" ", 1)
        if lem_begin not in EXTRA_BITS:
            return
        anchor_begin = ann.attrib["anchor"].split(" ", 1)[0]
        # Keep the light word if it is genuinely part of one of the synset's lemmas
        for lemma_name in all_lemma_names:
            if lemma_name.split("_", 1)[0] in (anchor_begin, lem_begin):
                return
        ann.attrib["lemma"] = lem_rest
        ann.attrib["anchor"] = ann.attrib["anchor"].split(" ", 1)[1]

    transform_blocks(eq_matcher("annotation"), inf, ann_reanchor, outf)
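
# Illustrative sketch only (the Finnish values are made up, not taken from the
# corpus): given an annotation with lemma="olla ihme" and anchor="on ihme",
# ann_reanchor above drops the light word, leaving lemma="ihme" and
# anchor="ihme". If one of the synset's own FiWN lemmas starts with "olla" or
# "on" (its first underscore-separated part), the annotation is left untouched.
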
def transform_senseval_contexts(
    inf: IO, transform_tokens: Callable[[List[str]], List[str]], outf: IO
) -> None:
    def transform_context(context: etree.ElementBase) -> etree.ElementBase:
        before = context.text
        head_tag = context[0]
        head = head_tag.text
        after = head_tag.tail
        before_tok = space_tokenize(before)
        head_tok = space_tokenize(head)
        after_tok = space_tokenize(after)
        sent: List[str] = before_tok + head_tok + after_tok
        new_sent = transform_tokens(sent)
        new_before = new_sent[:len(before_tok)]
        new_head = new_sent[len(before_tok):len(before_tok) + len(head_tok)]
        new_after = new_sent[len(before_tok) + len(head_tok):]
        context.text = "\n" + "".join(tok + " " for tok in new_before)
        head_tag.text = " ".join(tok for tok in new_head)
        head_tag.tail = "".join(" " + tok for tok in new_after) + "\n"
        return context

    transform_blocks(eq_matcher("context"), inf, transform_context, outf)
def lemma_to_synset(inf: IO, outf: IO):
    from stiff.munge.utils import synset_id_of_ann

    def l2ss(ann):
        ann.text = pre_id_to_post(synset_id_of_ann(ann))

    transform_blocks(eq_matcher("annotation"), inf, l2ss, outf)
def reann(input: IO, output: IO):
    extractor = FinExtractor()
    text = None

    def proc(elem):
        nonlocal text
        if elem.tag == "text":
            text = elem.text
        else:
            # Acceptable analysis lemmas for each existing annotation, keyed by
            # annotation position: the wnlemma-derived lemma (when usable) is
            # preferred, then the surface lemma
            valid = {}
            for ann in elem.xpath("annotation"):
                lemmas = []
                if "&" not in ann.attrib["wnlemma"]:
                    lemmas.append(wnlemma_to_analy_lemma(ann.attrib["wnlemma"]))
                lemmas.append(ann.attrib["lemma"])
                valid[key_ann(ann)] = lemmas
            # Carry over XXX: comments, keyed by the annotation they follow
            # (None = before any annotation)
            comments = {}
            for comment in elem.xpath("comment()"):
                if "XXX:" not in comment.text:
                    continue
                prev_annotation = comment.xpath("preceding-sibling::annotation")
                if prev_annotation:
                    key = key_ann(prev_annotation[-1])
                else:
                    key = None
                comments[key] = etree.tostring(comment, encoding="unicode")
            processed = set()
            tagging = extractor.extract(text)
            anns = []
            if None in comments:
                anns.append(comments[None])
            # For each matching token/tag pair keep the analysis whose lemma
            # appears earliest in valid[match]
            bests = {}
            for tok in tagging.tokens:
                for tag in tok.tags:
                    match = key_tok_tag(tok, tag)
                    if match not in valid or tag.lemma not in valid[match]:
                        continue
                    cur_priority = valid[match].index(tag.lemma)
                    if match in bests:
                        if bests[match][0] < cur_priority:
                            continue
                        assert bests[match][0] != cur_priority
                    bests[match] = (cur_priority, tok, tag)
            for _idx, tok, tag in bests.values():
                match = key_tok_tag(tok, tag)
                assert match not in processed
                anns.append(man_ann_ann("fi", tok, tag))
                if match in comments:
                    anns.append(comments[match])
                processed.add(match)
            new_elem = etree.fromstring("<div>{}</div>".format("".join(anns)))
            elem[:] = new_elem[:]
            assert len(valid) == len(processed)

    transform_blocks(in_matcher("text", "annotations"), input, proc, output)
def senseval_filter_lemma(lemmas, inf, outf, filter_key_out=None):
    lemma_poses = pickle.load(lemmas)
    filter_keys = set()

    def filter_lexelt(lexelt):
        if (str(lexelt.attrib["item"]), lexelt.attrib["pos"]) not in lemma_poses:
            return BYPASS
        elif filter_key_out:
            for instance in lexelt:
                filter_keys.add(instance.attrib["id"])

    transform_blocks(eq_matcher("lexelt"), inf, filter_lexelt, outf)

    if filter_key_out:
        pickle.dump(filter_keys, filter_key_out)
def senseval_rm_lemma(inf, outf, rm_key_out=None, lemmas=None):
    lemmas = lemmas.split(",") if lemmas else []
    rm_keys = set()

    def filter_lexelt(lexelt):
        if str(lexelt.attrib["item"]) in lemmas:
            if rm_key_out:
                for instance in lexelt:
                    rm_keys.add(instance.attrib["id"])
            return BYPASS

    transform_blocks(eq_matcher("lexelt"), inf, filter_lexelt, outf)

    if rm_key_out:
        pickle.dump(rm_keys, rm_key_out)
def stiff_select_wn(inf: IO, outf: IO, wn):
    from stiff.munge.utils import langs_of_wns

    selected_wns = set(wn)
    selected_langs = langs_of_wns(selected_wns)

    def filter_wns(wns):
        return [wn for wn in wns if wn in selected_wns]

    def select_wn(ann):
        # annotation[wordnets]
        ann_wns = ann.attrib["wordnets"].split()
        common_wns = filter_wns(ann_wns)
        if not len(common_wns):
            return BYPASS
        ann.attrib["wordnets"] = " ".join(common_wns)

        # annotation[wnlemma]
        wnlemma_bits = ann.attrib["wnlemma"].split(" ")
        new_wnlemma_bits = []
        for wnlemma in wnlemma_bits:
            wnlemma_dict = parse_qs_single(wnlemma)
            wnlemma_wns = wnlemma_dict["wn"].split(",")
            common_wns = filter_wns(wnlemma_wns)
            if not common_wns:
                continue
            wnlemma_dict["wn"] = ",".join(common_wns)
            new_wnlemma_bits.append(urlencode(wnlemma_dict))
        ann.attrib["wnlemma"] = " ".join(new_wnlemma_bits)

        # annotation > #text
        ann_langs = langs_of_wns(ann_wns)
        if len(ann_langs) <= len(selected_langs):
            return
        lemmas_str = ann.text
        bits = lemmas_str.split(" ")
        assert len(bits) <= 2
        if len(bits) <= 1:
            return
        if "eng" in selected_langs:
            ann.text = bits[0]
        else:
            ann.text = bits[1]

    transform_blocks(eq_matcher("annotation"), inf, select_wn, outf)
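
# Sketch of the attribute shapes assumed by stiff_select_wn above (only the
# "wordnets" attribute and the "wn" query-string key come from the code; the
# concrete wordnet names are hypothetical): "wordnets" is a space-separated
# list such as "fin qf2", and each space-separated "wnlemma" bit is a query
# string whose "wn" field is a comma-separated wordnet list, roughly
# "...&wn=fin,qf2". Bits whose wordnets are all deselected are dropped; the
# rest are re-encoded with urlencode.
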
def filter(input: IO, output: IO):
    extractor = FinExtractor()
    text = None

    def proc(elem):
        nonlocal text
        if elem.tag == "text":
            text = elem.text
        else:
            tagging = extractor.extract(text)
            anns = []
            for tok in tagging.tokens:
                for tag in tok.tags:
                    anns.append(man_ann_ann("fi", tok, tag))
            new_elem = etree.fromstring("<div>{}</div>".format("".join(anns)))
            elem[:] = new_elem[:]

    transform_blocks(in_matcher("text", "annotations"), input, proc, output)
def babelnet_lookup(inf: IO, map_bn2wn: IO, outf: IO):
    """
    This stage converts BabelNet ids to WordNet ids.
    """
    bn2wn_map: Dict[str, Set[str]] = {}
    for line in map_bn2wn:
        bn, wn_full = line[:-1].split("\t")
        wn_off = wn_full.split(":", 1)[1]
        bn2wn_map.setdefault(bn, set()).add(wn_off)

    def ann_bn2wn(ann):
        if ann.text not in bn2wn_map:
            return BYPASS
        wn_ids = bn2wn_map[ann.text]
        bits = []
        for wn_id in wn_ids:
            off, pos = wn_id[:-1], wn_id[-1]
            bits.append("{}-{}".format(off, pos))
        ann.text = " ".join(bits)

    transform_blocks(eq_matcher("annotation"), inf, ann_bn2wn, outf)
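
# Sketch of the mapping file format assumed by babelnet_lookup above (the
# concrete ids are made up): each line of map_bn2wn is tab-separated, e.g.
#   bn:00012345n<TAB>wn:12345678n
# so bn2wn_map maps "bn:00012345n" to {"12345678n"}, and ann_bn2wn rewrites an
# annotation whose text is "bn:00012345n" to "12345678-n" (offset-pos).
# Annotations with no mapping are bypassed.
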
def senseval_select_lemma(inf, keyin, outf, keyout, lemma_pos):
    if "." in lemma_pos:
        lemma, pos = lemma_pos.rsplit(".", 1)
    else:
        lemma = lemma_pos
        pos = None
    keys = set()

    def filter_lexelt(lexelt):
        if lexelt.attrib["item"] != lemma:
            return BYPASS
        if pos and lexelt.attrib["pos"] != pos:
            return BYPASS
        for instance in lexelt:
            keys.add(instance.attrib["id"])

    transform_blocks(eq_matcher("lexelt"), inf, filter_lexelt, outf)

    for line in keyin:
        if line.split(" ", 1)[0] not in keys:
            continue
        keyout.write(line)
def eurosense_fix_lemmas(inf: IO, outf: IO, keep_unknown: bool, quiet: bool):
    """
    Eurosense contains many lemmas which are not in the set of lemmas for the
    synset in FinnWordNet. There are two reasons this might occur.

    Scenario A) Bad lemmatisation by Babelfy. In this case we can try to
    recover the correct lemma by lemmatising ourselves and combining with
    information from WordNet.

    Scenario B) Extra lemmas have been associated with the WordNet synset in
    BabelNet. In this case there's nothing to do, and we should usually just
    drop the annotation.
    """
    fi2en, en2fi = get_en_fi_maps()

    def ann_fix_lemmas(ann):
        # 1) Check whether their lemmatisation matches something in FiWN as is
        orig_lemma_str = ann.attrib["lemma"]
        orig_lemma_str = orig_lemma_str.replace("#", "").replace(" ", "_")

        def mk_lemma_synset_map(lower=False):
            lemma_synset_map = {}
            for synset_id, synset in iter_synsets(ann.text):
                for lemma in synset.lemmas():
                    lemma_str = lemma.name()
                    if lower:
                        lemma_str = lemma_str.lower()
                    lemma_synset_map.setdefault(lemma_str, set()).add(synset_id)
            return lemma_synset_map

        lemma_synset_map = mk_lemma_synset_map()
        if orig_lemma_str in lemma_synset_map:
            ann.text = " ".join(lemma_synset_map[orig_lemma_str])
            ann.attrib["lemma"] = orig_lemma_str
            return

        # 2) Try to use the surface form as is as the lemma
        lemmatised_anchor = ann.attrib["anchor"].replace(" ", "_")
        lemma_synset_map_lower = mk_lemma_synset_map(lower=True)
        if lemmatised_anchor.lower() in lemma_synset_map_lower:
            ann.text = " ".join(lemma_synset_map_lower[lemmatised_anchor.lower()])
            # XXX: Should be lemma in original case rather than anchor in original case
            ann.attrib["lemma"] = lemmatised_anchor
            return

        # 3) Re-lemmatise the surface using OMorFi and try to match with FiWN
        anchor_bits = ann.attrib["anchor"].split(" ")
        matches = {}
        for lemma_str, synset_id in lemma_synset_map.items():
            lemma_bits = lemma_str.split("_")
            common = lemma_intersect(anchor_bits, lemma_bits)
            if common is not None:
                matches.setdefault(lemma_str, set()).update(synset_id)
        if len(matches) == 1:
            lemma, synsets = next(iter(matches.items()))
            ann.attrib["lemma"] = lemma
            ann.text = " ".join(synsets)
            return
        elif len(matches) > 1:
            if not quiet:
                sys.stderr.write(
                    "Multiple lemmas found for {}: {}\n".format(
                        ann.attrib["anchor"], matches
                    )
                )
        # If nothing has worked, it's probably scenario B as above
        elif len(matches) == 0:
            if not quiet:
                sys.stderr.write(
                    "No lemma found for {} {} {}\n".format(
                        ann.text, orig_lemma_str, lemmatised_anchor
                    )
                )
        if keep_unknown:
            ann.attrib["lemma"] = orig_lemma_str
        else:
            return BYPASS

    transform_blocks(eq_matcher("annotation"), inf, ann_fix_lemmas, outf)
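
# Worked sketch of the recovery steps in eurosense_fix_lemmas above (the
# Finnish values are hypothetical): for an annotation with lemma="iso kala" and
# anchor="isoja kaloja", step 1 looks for "iso_kala" among the FiWN lemmas of
# the annotated synsets, step 2 falls back to the lower-cased anchor
# "isoja_kaloja", and step 3 intersects the anchor words with each candidate
# lemma via lemma_intersect. If all three fail (scenario B), the annotation is
# dropped unless keep_unknown is set.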