Example #1
File: munge.py Project: frankier/STIFF
from typing import IO

def eurosense_reanchor(inf: IO, outf: IO):
    """
    Reanchors Eurosense lemmas which are actually forms that include a "light"
    word such as ei or olla, by removing the unnecessary word.
    """
    EXTRA_BITS = {"ei", "olla"}
    fi2en, en2fi = get_en_fi_maps()

    def ann_reanchor(ann):
        all_lemma_names = []
        for _, synset in iter_synsets(ann.text):
            for lemma in synset.lemmas():
                all_lemma_names.append(lemma.name())
        if " " not in ann.attrib["lemma"]:
            return
        lem_begin, lem_rest = ann.attrib["lemma"].split(" ", 1)
        if lem_begin not in EXTRA_BITS:
            return
        anchor_begin = ann.attrib["anchor"].split(" ", 1)[0]
        for lemma_name in all_lemma_names:
            if lemma_name.split("_", 1)[0] in (anchor_begin, lem_begin):
                return
        ann.attrib["lemma"] = lem_rest
        ann.attrib["anchor"] = ann.attrib["anchor"].split(" ", 1)[1]

    transform_blocks(eq_matcher("annotation"), inf, ann_reanchor, outf)
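A minimal invocation sketch (hypothetical: the file names are placeholders, and the binary open mode is an assumption about what transform_blocks expects):

with open("eurosense.xml", "rb") as inf, open("reanchored.xml", "wb") as outf:
    eurosense_reanchor(inf, outf)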
Example #2
import sys

def write_lemma(keyout, inst_id, lemma):
    fi2en, en2fi = get_en_fi_maps()
    if lemma is None:
        guess = "U"
    else:
        chosen_synset_fi_id = ss2pre(lemma.synset())
        if chosen_synset_fi_id not in fi2en:
            sys.stderr.write("No fi2en mapping found for {} ({})\n".format(
                chosen_synset_fi_id, lemma))
            guess = "U"
        else:
            guess = pre_id_to_post(fi2en[chosen_synset_fi_id])
    keyout.write("{} {}\n".format(inst_id, guess))
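A hypothetical call writing one key line per disambiguation instance; the instance id is a placeholder, and passing lemma=None exercises the "U" (unknown) fallback:

with open("system.key", "w") as keyout:
    write_lemma(keyout, "d000.s000.t000", None)  # writes "d000.s000.t000 U"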
Example #3
from typing import Optional

from nltk.corpus.reader.wordnet import Synset, WordNetError

def maybe_fi2en_ss(ss: Synset) -> Optional[Synset]:
    from finntk.wordnet.reader import get_en_fi_maps
    from finntk.wordnet.utils import ss2pre, pre2ss
    from nltk.corpus import wordnet

    fi2en, _en2fi = get_en_fi_maps()
    pre_fi = ss2pre(ss)
    pre_en = fi2en.get(pre_fi)
    if pre_en is None:
        return None
    try:
        return pre2ss(wordnet, pre_en)
    except WordNetError:
        return None
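A usage sketch, assuming finntk.wordnet.reader also exposes the FinnWordNet corpus reader as fiwn (as Example #6 below suggests) with an NLTK-style synsets() lookup; the query word is a placeholder:

from finntk.wordnet.reader import fiwn  # assumed location of the FiWN reader

for fi_ss in fiwn.synsets("koira"):  # "dog"
    en_ss = maybe_fi2en_ss(fi_ss)
    if en_ss is not None:
        print(fi_ss.name(), "->", en_ss.name())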
Example #4
File: fin.py Project: pombredanne/STIFF
def _map_qf2(synset_obj):
    # Raises KeyError if the synset has no fi2en mapping.
    fi2en, _en2fi = get_en_fi_maps()
    return fi2en[ss2pre(synset_obj)]
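Unlike maybe_fi2en_ss in Example #3, this raises KeyError for an unmapped synset. A caller that needs a total function might wrap it like this (a hypothetical helper, not part of the project):

def map_qf2_or_none(synset_obj):
    # Total wrapper around _map_qf2: None instead of KeyError.
    try:
        return _map_qf2(synset_obj)
    except KeyError:
        return None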
Example #5
File: munge.py Project: frankier/STIFF
import sys
from typing import IO

def eurosense_fix_lemmas(inf: IO, outf: IO, keep_unknown: bool, quiet: bool):
    """
    Eurosense contains many lemmas which are not in the set of lemmas for the
    synset in FinnWordNet. There are two reasons this might occur.

    Scenario A) Bad lemmatisation by Babelfy. In this case we can try to
    recover the correct lemma by lemmatising the anchor ourselves and
    combining the result with information from WordNet.

    Scenario B) Extra lemmas have been associated with the WordNet synset in
    BabelNet.  In this case there's nothing to do, and we should usually just
    drop the annotation.
    """
    fi2en, en2fi = get_en_fi_maps()

    def ann_fix_lemmas(ann):
        # 1) check if their lemmatisation matches something in FiWN as is
        orig_lemma_str = ann.attrib["lemma"]
        orig_lemma_str = orig_lemma_str.replace("#", "").replace(" ", "_")

        def mk_lemma_synset_map(lower=False):
            lemma_synset_map = {}
            for synset_id, synset in iter_synsets(ann.text):
                for lemma in synset.lemmas():
                    lemma_str = lemma.name()
                    if lower:
                        lemma_str = lemma_str.lower()
                    lemma_synset_map.setdefault(lemma_str,
                                                set()).add(synset_id)
            return lemma_synset_map

        lemma_synset_map = mk_lemma_synset_map()

        if orig_lemma_str in lemma_synset_map:
            ann.text = " ".join(lemma_synset_map[orig_lemma_str])
            ann.attrib["lemma"] = orig_lemma_str
            return
        # 2) Try and just use the surface as is as the lemma
        lemmatised_anchor = ann.attrib["anchor"].replace(" ", "_")

        lemma_synset_map_lower = mk_lemma_synset_map(lower=True)
        if lemmatised_anchor.lower() in lemma_synset_map_lower:
            ann.text = " ".join(
                lemma_synset_map_lower[lemmatised_anchor.lower()])
            # XXX: Should be lemma in original case rather than anchor in original case
            ann.attrib["lemma"] = lemmatised_anchor
            return
        # 3) Re-lemmatise the surface using OMorFi and try and match with FiWN
        anchor_bits = ann.attrib["anchor"].split(" ")
        matches = {}

        for lemma_str, synset_ids in lemma_synset_map.items():
            lemma_bits = lemma_str.split("_")
            common = lemma_intersect(anchor_bits, lemma_bits)
            if common is not None:
                matches.setdefault(lemma_str, set()).update(synset_ids)
        if len(matches) == 1:
            lemma, synsets = next(iter(matches.items()))
            ann.attrib["lemma"] = lemma
            ann.text = " ".join(synsets)
            return
        elif len(matches) > 1:
            if not quiet:
                sys.stderr.write(
                    "Multiple lemmas found found for {}: {}\n".format(
                        ann.attrib["anchor"], matches))
        # If nothing has worked, it's probably scenario B as above
        elif len(matches) == 0:
            if not quiet:
                sys.stderr.write("No lemma found for {} {} {}\n".format(
                    ann.text, orig_lemma_str, lemmatised_anchor))
        if keep_unknown:
            ann.attrib["lemma"] = orig_lemma_str
        else:
            return BYPASS

    transform_blocks(eq_matcher("annotation"), inf, ann_fix_lemmas, outf)
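As with eurosense_reanchor in Example #1, a direct-call sketch (paths are placeholders); keep_unknown=False lets the BYPASS return value drop annotations whose lemma could not be fixed:

with open("eurosense.xml", "rb") as inf, open("fixed.xml", "wb") as outf:
    eurosense_fix_lemmas(inf, outf, keep_unknown=False, quiet=True)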
Example #6
File: munge.py Project: frankier/STIFF
def iter_synsets(synset_list):
    # Resolve each space-separated English synset id to the corresponding
    # FinnWordNet synset via the en2fi mapping.
    _fi2en, en2fi = get_en_fi_maps()
    for synset_id in synset_list.split(" "):
        fi_pre_synset = en2fi[post_id_to_pre(synset_id)]
        synset = pre2ss(fiwn, fi_pre_synset)
        yield synset_id, synset
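An iteration sketch; the space-separated ids are illustrative placeholders standing in for the English post-form ids that post_id_to_pre expects:

for synset_id, synset in iter_synsets("02084071-n 02085374-n"):
    print(synset_id, [lemma.name() for lemma in synset.lemmas()])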