Example #1
def transform_senseval_contexts(inf: IO,
                                transform_tokens: Callable[[List[str]],
                                                           List[str]],
                                outf: IO) -> None:
    def transform_context(context: etree.ElementBase) -> etree.ElementBase:
        sent: List[str] = []
        before = context.text
        head_tag = context[0]
        head = head_tag.text
        after = head_tag.tail

        before_tok = space_tokenize(before)
        head_tok = space_tokenize(head)
        after_tok = space_tokenize(after)

        sent = before_tok + head_tok + after_tok
        new_sent = transform_tokens(sent)

        new_before = new_sent[:len(before_tok)]
        new_head = new_sent[len(before_tok):len(before_tok) + len(head_tok)]
        new_after = new_sent[len(before_tok) + len(head_tok):]

        context.text = "\n" + "".join(tok + " " for tok in new_before)
        head_tag.text = " ".join(new_head)
        head_tag.tail = "".join(" " + tok for tok in new_after) + "\n"
        return context

    transform_blocks(eq_matcher("context"), inf, transform_context, outf)
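All of the snippets on this page lean on a handful of helpers (transform_blocks, eq_matcher, chunk_cb, BYPASS, space_tokenize) that are presumably imported at module level in munge.py and therefore do not appear in the excerpts. As a minimal, illustrative sketch of the contract they are used with here, not the project's actual implementation: transform_blocks streams the input XML, hands each element matched by eq_matcher(tag) to the callback, lets the callback mutate it in place, and drops any element for which the callback returns BYPASS.

# Minimal sketch only; the real STIFF helper also copies through the
# surrounding document structure rather than emitting bare elements.
from typing import IO
from lxml import etree

BYPASS = object()  # sentinel: returning it from the callback drops the element


def eq_matcher(tag_name):
    return lambda elem: elem.tag == tag_name


def transform_blocks(matcher, inf: IO, transform, outf: IO) -> None:
    for _event, elem in etree.iterparse(inf, events=("end",)):
        if not matcher(elem):
            continue
        if transform(elem) is BYPASS:
            continue  # filtered out
        outf.write(etree.tostring(elem, encoding="unicode"))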
Example #2
File: munge.py Project: frankier/STIFF
def eurosense_reanchor(inf: IO, outf: IO):
    """
    Reanchors Eurosense lemmas which are actually forms including some "light"
    word like "ei" or "olla" by removing the unnecessary word.
    """
    EXTRA_BITS = {"ei", "olla"}
    fi2en, en2fi = get_en_fi_maps()

    def ann_reanchor(ann):
        all_lemma_names = []
        for _, synset in iter_synsets(ann.text):
            for lemma in synset.lemmas():
                all_lemma_names.append(lemma.name())
        if " " not in ann.attrib["lemma"]:
            return
        lem_begin, lem_rest = ann.attrib["lemma"].split(" ", 1)
        if lem_begin not in EXTRA_BITS:
            return
        anchor_begin = ann.attrib["anchor"].split(" ", 1)[0]
        for lemma_name in all_lemma_names:
            if lemma_name.split("_", 1)[0] in (anchor_begin, lem_begin):
                return
        ann.attrib["lemma"] = lem_rest
        ann.attrib["anchor"] = ann.attrib["anchor"].split(" ", 1)[1]

    transform_blocks(eq_matcher("annotation"), inf, ann_reanchor, outf)
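For concreteness, here is an invented before/after showing the string handling ann_reanchor performs; the attribute values are made up and only the splitting mirrors the code above.

# Hypothetical annotation, for illustration only:
#   <annotation lemma="olla valmis" anchor="on valmis">...</annotation>
lemma, anchor = "olla valmis", "on valmis"
lem_begin, lem_rest = lemma.split(" ", 1)   # "olla", "valmis"
anchor_rest = anchor.split(" ", 1)[1]       # "valmis"
# Assuming no WordNet lemma for the synsets starts with "olla" or "on",
# the annotation would become:
#   <annotation lemma="valmis" anchor="valmis">...</annotation>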
Example #3
File: munge.py Project: frankier/STIFF
def lemma_to_synset(inf: IO, outf: IO):
    from stiff.munge.utils import synset_id_of_ann

    def l2ss(ann):
        ann.text = pre_id_to_post(synset_id_of_ann(ann))

    transform_blocks(eq_matcher("annotation"), inf, l2ss, outf)
Example #4
File: munge.py Project: frankier/STIFF
def senseval_filter_lemma(lemmas, inf, outf, filter_key_out=None):
    lemma_poses = pickle.load(lemmas)
    filter_keys = set()

    def filter_lexelt(lexelt):
        if (str(lexelt.attrib["item"]),
                lexelt.attrib["pos"]) not in lemma_poses:
            return BYPASS
        elif filter_key_out:
            for instance in lexelt:
                filter_keys.add(instance.attrib["id"])

    transform_blocks(eq_matcher("lexelt"), inf, filter_lexelt, outf)

    if filter_key_out:
        pickle.dump(filter_keys, filter_key_out)
Example #5
File: munge.py Project: frankier/STIFF
def senseval_rm_lemma(inf, outf, rm_key_out=None, lemmas=None):
    lemmas = lemmas.split(",") if lemmas else []

    rm_keys = set()

    def filter_lexelt(lexelt):
        if str(lexelt.attrib["item"]) in lemmas:
            if rm_key_out:
                for instance in lexelt:
                    rm_keys.add(instance.attrib["id"])
            return BYPASS

    transform_blocks(eq_matcher("lexelt"), inf, filter_lexelt, outf)

    if rm_key_out:
        pickle.dump(rm_keys, rm_key_out)
Example #6
File: munge.py Project: frankier/STIFF
def stiff_select_wn(inf: IO, outf: IO, wn):
    from stiff.munge.utils import langs_of_wns

    selected_wns = set(wn)
    selected_langs = langs_of_wns(selected_wns)

    def filter_wns(wns):
        return [wn for wn in wns if wn in selected_wns]

    def select_wn(ann):
        # annotation[wordnets]
        ann_wns = ann.attrib["wordnets"].split()
        common_wns = filter_wns(ann_wns)
        if not len(common_wns):
            return BYPASS
        ann.attrib["wordnets"] = " ".join(common_wns)

        # annotation[wnlemma]
        wnlemma_bits = ann.attrib["wnlemma"].split(" ")
        new_wnlemma_bits = []
        for wnlemma in wnlemma_bits:
            wnlemma_dict = parse_qs_single(wnlemma)
            wnlemma_wns = wnlemma_dict["wn"].split(",")
            common_wns = filter_wns(wnlemma_wns)
            if not common_wns:
                continue
            wnlemma_dict["wn"] = ",".join(common_wns)
            new_wnlemma_bits.append(urlencode(wnlemma_dict))
        ann.attrib["wnlemma"] = " ".join(new_wnlemma_bits)

        # annotation > #text
        ann_langs = langs_of_wns(ann_wns)
        if len(ann_langs) <= len(selected_langs):
            return
        lemmas_str = ann.text
        bits = lemmas_str.split(" ")
        assert len(bits) <= 2
        if len(bits) <= 1:
            return
        if "eng" in selected_langs:
            ann.text = bits[0]
        else:
            ann.text = bits[1]

    transform_blocks(eq_matcher("annotation"), inf, select_wn, outf)
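The wnlemma attribute handled above is a space-separated list of query-string-style records; parse_qs_single is assumed here to parse one record into a flat dict, which urlencode then round-trips. A stdlib-only illustration of that round trip, in which the record and every field other than "wn" are invented:

from urllib.parse import parse_qs, urlencode

# Invented wnlemma record; only the "wn" field appears in the code above.
bit = "lemma=koira&wn=fin%2Cqf2"
record = {k: v[0] for k, v in parse_qs(bit).items()}  # {'lemma': 'koira', 'wn': 'fin,qf2'}
record["wn"] = ",".join(w for w in record["wn"].split(",") if w in {"fin"})
print(urlencode(record))  # lemma=koira&wn=fin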
Example #7
File: munge.py Project: frankier/STIFF
def babelnet_lookup(inf: IO, map_bn2wn: IO, outf: IO):
    """
    This stage converts BabelNet ids to WordNet ids.
    """
    bn2wn_map: Dict[str, Set[str]] = {}
    for line in map_bn2wn:
        bn, wn_full = line[:-1].split("\t")
        wn_off = wn_full.split(":", 1)[1]
        bn2wn_map.setdefault(bn, set()).add(wn_off)

    def ann_bn2wn(ann):
        if ann.text not in bn2wn_map:
            return BYPASS
        wn_ids = bn2wn_map[ann.text]
        bits = []
        for wn_id in wn_ids:
            off, pos = wn_id[:-1], wn_id[-1]
            bits.append("{}-{}".format(off, pos))
        ann.text = " ".join(bits)

    transform_blocks(eq_matcher("annotation"), inf, ann_bn2wn, outf)
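The mapping file is read as tab-separated lines whose second column is a prefixed WordNet offset, and the annotation text (a BabelNet id) is replaced by space-joined offset-pos strings. The same parsing applied to a single made-up line:

# Made-up mapping line; the real format is whatever map_bn2wn provides.
line = "bn:00012345n\twn:00001740n\n"
bn, wn_full = line[:-1].split("\t")   # "bn:00012345n", "wn:00001740n"
wn_off = wn_full.split(":", 1)[1]     # "00001740n"
off, pos = wn_off[:-1], wn_off[-1]
print("{}-{}".format(off, pos))       # "00001740-n"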
Example #8
File: munge.py Project: pombredanne/STIFF
def senseval_gather(indir: str, outf: IO, keyout: IO):
    """
    Gather individual per-word SenseEval files into one big file, usable by
    ItMakesSense and Context2Vec.
    """
    with lexical_sample(outf):
        for word_dir in listdir(indir):
            train_fn = pjoin(indir, word_dir, "train.xml")
            key_fn = pjoin(indir, word_dir, "train.key")
            with open(train_fn, "rb") as train_f:
                stream = etree.iterparse(train_f, events=("start", "end"))

                def cb(lexelt):
                    if not len(lexelt):
                        return
                    outf.write(etree.tostring(lexelt, encoding="unicode"))

                chunk_cb(stream, eq_matcher("lexelt"), cb)

            with open(key_fn) as key_f:
                keyout.write(key_f.read())
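lexical_sample is not shown in these snippets; from its use as a context manager wrapped around the gathered <lexelt> blocks, it presumably writes a single root element around them. A guess at its shape, where the root tag and XML header are assumptions not taken from the STIFF source:

from contextlib import contextmanager


@contextmanager
def lexical_sample_sketch(outf):
    # Assumed behaviour: wrap the gathered <lexelt> blocks in one root
    # element so the output is well-formed XML.
    outf.write('<?xml version="1.0" encoding="UTF-8" ?>\n<corpus>\n')
    yield
    outf.write("</corpus>\n")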
Example #9
File: munge.py Project: frankier/STIFF
def senseval_select_lemma(inf, keyin, outf, keyout, lemma_pos):
    if "." in lemma_pos:
        lemma, pos = lemma_pos.rsplit(".", 1)
    else:
        lemma = lemma_pos
        pos = None

    keys = set()

    def filter_lexelt(lexelt):
        if lexelt.attrib["item"] != lemma:
            return BYPASS
        if pos and lexelt.attrib["pos"] != pos:
            return BYPASS
        for instance in lexelt:
            keys.add(instance.attrib["id"])

    transform_blocks(eq_matcher("lexelt"), inf, filter_lexelt, outf)

    for line in keyin:
        if line.split(" ", 1)[0] not in keys:
            continue
        keyout.write(line)
Example #10
File: munge.py Project: frankier/STIFF
def eurosense_fix_lemmas(inf: IO, outf: IO, keep_unknown: bool, quiet: bool):
    """
    Eurosense contains many lemmas which are not in the set of lemmas for the
    synset in FinnWordNet. There are two reasons this might occur.

    Scenario A) Bad lemmatisation by Babelfy. In this case we can try to
    recover the correct lemma by lemmatising ourselves and combining the
    result with information from WordNet.

    Scenario B) Extra lemmas have been associated with the WordNet synset in
    BabelNet.  In this case there's nothing to do, and we should usually just
    drop the annotation.
    """
    fi2en, en2fi = get_en_fi_maps()

    def ann_fix_lemmas(ann):
        # 1) check if their lemmatisation matches something in FiWN as is
        orig_lemma_str = ann.attrib["lemma"]
        orig_lemma_str = orig_lemma_str.replace("#", "").replace(" ", "_")

        def mk_lemma_synset_map(lower=False):
            lemma_synset_map = {}
            for synset_id, synset in iter_synsets(ann.text):
                for lemma in synset.lemmas():
                    lemma_str = lemma.name()
                    if lower:
                        lemma_str = lemma_str.lower()
                    lemma_synset_map.setdefault(lemma_str,
                                                set()).add(synset_id)
            return lemma_synset_map

        lemma_synset_map = mk_lemma_synset_map()

        if orig_lemma_str in lemma_synset_map:
            ann.text = " ".join(lemma_synset_map[orig_lemma_str])
            ann.attrib["lemma"] = orig_lemma_str
            return
        # 2) Try and just use the surface as is as the lemma
        lemmatised_anchor = ann.attrib["anchor"].replace(" ", "_")

        lemma_synset_map_lower = mk_lemma_synset_map(lower=True)
        if lemmatised_anchor.lower() in lemma_synset_map_lower:
            ann.text = " ".join(
                lemma_synset_map_lower[lemmatised_anchor.lower()])
            # XXX: Should be lemma in original case rather than anchor in original case
            ann.attrib["lemma"] = lemmatised_anchor
            return
        # 3) Re-lemmatise the surface using OMorFi and try and match with FiWN
        anchor_bits = ann.attrib["anchor"].split(" ")
        matches = {}

        for lemma_str, synset_id in lemma_synset_map.items():
            lemma_bits = lemma_str.split("_")
            common = lemma_intersect(anchor_bits, lemma_bits)
            if common is not None:
                matches.setdefault(lemma_str, set()).update(synset_id)
        if len(matches) == 1:
            lemma, synsets = next(iter(matches.items()))
            ann.attrib["lemma"] = lemma
            ann.text = " ".join(synsets)
            return
        elif len(matches) > 1:
            if not quiet:
                sys.stderr.write(
                    "Multiple lemmas found found for {}: {}\n".format(
                        ann.attrib["anchor"], matches))
        # If nothing has worked, it's probably scenario B as above
        elif len(matches) == 0:
            if not quiet:
                sys.stderr.write("No lemma found for {} {} {}\n".format(
                    ann.text, orig_lemma_str, lemmatised_anchor))
        if keep_unknown:
            ann.attrib["lemma"] = orig_lemma_str
        else:
            return BYPASS

    transform_blocks(eq_matcher("annotation"), inf, ann_fix_lemmas, outf)
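A hypothetical driver showing how a munge step like this might be wired up directly from Python; the file names and open modes are invented, and in the real project these functions are exposed through a command-line interface.

# Invented file names and modes; eurosense_fix_lemmas is defined above.
with open("eurosense.in.xml", "rb") as inf, \
        open("eurosense.fixed.xml", "w") as outf:
    eurosense_fix_lemmas(inf, outf, keep_unknown=False, quiet=True)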