Example #1
def unified_test_dev_split(inf, ingoldf, keyin, goldkeyin, outf, keyout):
    gold_sent_iter = peekable(iter_sentences(ingoldf))
    rm_inst_ids = []

    def sent_rm_gold(sent):
        gold_sent = gold_sent_iter.peek(None)
        if gold_sent is not None and gold_sent.attrib["id"] == sent.attrib[
                "id"]:
            for instance in sent.xpath("./instance"):
                rm_inst_ids.append(instance.attrib["id"])
            next(gold_sent_iter)
            return BYPASS

    transform_sentences(inf, sent_rm_gold, outf)

    def next_rm():
        try:
            return rm_inst_ids.pop(0)
        except IndexError:
            return None

    rm_id = next_rm()
    for line in keyin:
        if rm_id == line.split()[0]:
            rm_id = next_rm()
            continue
        keyout.write(line)

    assert len(rm_inst_ids) == 0 and rm_id is None
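
A minimal invocation sketch with hypothetical paths: every sentence that also appears in the gold corpus is dropped from both the training XML and the training key.

# Hypothetical paths; the binary/text modes are a guess based on lxml's
# usual requirement of bytes for XML input.
with open("train.xml", "rb") as inf, \
        open("dev.xml", "rb") as ingoldf, \
        open("train.key") as keyin, \
        open("dev.key") as goldkeyin, \
        open("train-only.xml", "wb") as outf, \
        open("train-only.key", "w") as keyout:
    unified_test_dev_split(inf, ingoldf, keyin, goldkeyin, outf, keyout)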
Example #2
def iter_sentences_opensubs18_man_ann(stream):
    # XXX: This assumes a 1-1 imdb subtitle correspondence -- which should be
    # the case near the beginning where the man-ann takes place, but should be
    # fixed in general
    for sent in iter_sentences(stream):
        sources, imdb, sent_id = sent.attrib["id"].split("; ")
        sent_id = "stiff.{:010d}.000.{:08d}".format(int(imdb), int(sent_id))
        yield sent_id, sent
Example #3
def unigram(inf, keyout, wn):
    for sent in iter_sentences(inf):
        for instance in sent.xpath("instance"):
            inst_id = instance.attrib["id"]
            word, pos, lemmas = lemmas_from_instance(wn, instance)
            if not len(lemmas):
                sys.stderr.write("No lemma found for {} {}\n".format(word, pos))
                continue
            lemma = lemmas[0]
            write_lemma(keyout, inst_id, lemma)
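
A usage sketch for this first-sense baseline, assuming a WordNet object with an NLTK-style `lemmas(lemma, pos)` interface (which is what `lemmas_from_instance` appears to rely on); paths are hypothetical.

from nltk.corpus import wordnet  # or a FinnWordNet wrapper, as used elsewhere here

# write_lemma is assumed to emit one key line per disambiguated instance.
with open("corpus.xml", "rb") as inf, open("unigram.key", "w") as keyout:
    unigram(inf, keyout, wordnet)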
Example #4
def lex_ambg_hist_uni(inf, wn):
    hist = Counter()
    for sent in iter_sentences(inf):
        instances = sent.xpath("instance")
        for inst in instances:
            ambg = len(
                wn.lemmas(inst.attrib["lemma"],
                          UNI_POS_WN_MAP[inst.attrib["pos"]]))
            hist[ambg] += 1
    return hist
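
The returned `Counter` maps ambiguity (number of candidate lemmas for an instance) to how many instances had that ambiguity, so summary statistics fall out directly; a small sketch using the same arguments as above:

hist = lex_ambg_hist_uni(inf, wn)
total = sum(hist.values())
mean_ambg = sum(ambg * cnt for ambg, cnt in hist.items()) / total
for ambg, cnt in sorted(hist.items()):
    print(f"{ambg}\t{cnt}")
print(f"mean ambiguity {mean_ambg:.2f} over {total} instances")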
Example #5
def plot_train_entropies(
    eurosensetrainxml,
    eurosensetrainkey,
    stifftrainxml,
    stifftrainkey,
    semcorxml,
    semcorkey,
    outf,
):
    from statsmodels.sandbox.nonparametric import kernels
    from statsmodels.nonparametric.kde import bandwidths

    fig, (ax1, ax2, ax3) = pl.subplots(3,
                                       sharex=True,
                                       gridspec_kw={"hspace": 0.05})

    def add_to_data(data, dists):
        for dist in dists:
            insts = sum(dist.values())
            h = entropy(dist, insts)
            data.extend((h for _ in range(int(insts + 0.5))))

    # EuroSense/STIFF
    bw = None
    for inf, keyin, ax in [
        (eurosensetrainxml, eurosensetrainkey, ax1),
        (stifftrainxml, stifftrainkey, ax2),
    ]:
        data = []
        add_to_data(data, iter_dists_sup(inf, keyin))
        if bw is None:
            bw = bandwidths.select_bandwidth(data, "scott", kernels.Gaussian)
        sns.distplot(data, kde_kws=dict(bw=bw, gridsize=1000), ax=ax)
    # SemCor
    semcor_vocab = {}
    build_uni_sense_dist(iter_sentences(semcorxml), semcorkey, semcor_vocab)
    data = []
    add_to_data(data, semcor_vocab.values())
    # Plot
    sns.distplot(data, kde_kws=dict(bw=bw, gridsize=1000), ax=ax3)
    ax3.set_xlim(-0.1)
    ax3.set_xlabel("Entropy")
    ax1.set_ylabel("EuroSense instance density")
    ax2.set_ylabel("STIFF instance density")
    ax3.set_ylabel("SemCor instance density")
    fix_border(ax1)
    fix_border(ax2)
    fix_border(ax3)
    ax3.xaxis.set_minor_locator(MultipleLocator(0.1))
    fig.set_size_inches(441.0 / 72, 645.0 / 72)

    if outf:
        pl.savefig(outf, bbox_inches="tight")
    else:
        pl.show()
Example #6
def unified_to_senseval(inf: IO, keyin: IO, outdir: str):
    """

    Converts from the unified format to a Senseval-3 -style format in
    individual files. The resulting files should be directly usable to train a
    single word model with ItMakesSense or can be gathered using.

    This is a scatter type operation.
    """
    out_files: Dict[str, str] = {}
    for sent_elem in iter_sentences(inf):
        for inst in sent_elem.xpath("instance"):
            lemma_str = inst.attrib["lemma"].lower()
            pos_str = inst.attrib["pos"]
            pos_chr = UNI_POS_WN_MAP[pos_str]
            lemma_pos = "{}.{}".format(lemma_str, pos_chr)

            # Write XML
            out_dir = pjoin(outdir, lemma_pos)
            if lemma_pos not in out_files:
                makedirs(out_dir, exist_ok=True)
                out_fn = pjoin(out_dir, "train.xml")
                out_f = open(out_fn, "w")
                lexical_sample_head(out_f)
                lexelt_head(lemma_str, pos_chr, out_f)
            else:
                out_fn = out_files[lemma_pos]
                out_f = open(out_fn, "a")
            with instance(inst, out_f):
                write_context(sent_elem, inst, out_f)
            out_f.close()

            # Write key file
            key_fn = pjoin(out_dir, "train.key")
            key_line = keyin.readline()
            key_id, key_synset = key_line.rstrip().split(" ", 1)
            assert key_id == inst.attrib["id"]
            if lemma_pos not in out_files:
                key_f = open(key_fn, "w")
            else:
                key_f = open(key_fn, "a")
            out_line = "{} {} {}\n".format(lemma_pos, key_id, key_synset)
            key_f.write(out_line)
            key_f.close()

            # Add to out_files
            if lemma_pos not in out_files:
                out_files[lemma_pos] = out_fn

    for out_fn in out_files.values():
        with open(out_fn, "a") as out_f:
            lexelt_foot(out_f)
            lexical_sample_foot(out_f)
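
A usage sketch with hypothetical paths. Note that the key file is read in lockstep with the instances (see the assert on the instance id), so it must be in the same order as the XML.

# Hypothetical paths; one lemma.pos subdirectory is created under the output
# directory, each holding a train.xml and a train.key.
with open("unified.xml", "rb") as inf, open("unified.key") as keyin:
    unified_to_senseval(inf, keyin, "senseval-out")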
Example #7
def overlap_examples(inf):
    for sent in iter_sentences(inf):
        tok_lems = sent.xpath("./text[@id='zh-tok']")[0].text.split(" ")
        untok_lems = set()
        for ann in sent.xpath("./annotations/annotation[@lang='zh']"):
            anchor_positions = ann.attrib["anchor-positions"]
            for position in anchor_positions.split(" "):
                anchor = parse_qs_single(position)
                source = anchor["from-id"]
                if source == "zh-untok":
                    untok_lems.add(ann.attrib["lemma"])
        for untok_lem in untok_lems:
            if not any(untok_lem in tok_lem for tok_lem in tok_lems):
                print("Not a substring:", untok_lem)
                for text in sent.xpath("./text"):
                    print(text.text)
Example #8
def sent_report(inf, report_cb, subtotal=None):
    sents = 0
    done = False
    try:
        # XXX: take into account token length for coverage
        for sent in iter_sentences(inf):
            yield sent
            sents += 1
            if subtotal is not None and sents % subtotal == 0:
                print(f"Report at {sents} sentences:")
                report_cb()
        done = True
    finally:
        if sents:
            if not done:
                print(f"Terminated early after {sents} sentences.")
            else:
                print(f"Finished after {sents} sentences.")
            report_cb()
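
Because `sent_report` is a generator, the periodic and final reports only fire as sentences are pulled through it. A usage sketch with a hypothetical callback that tallies annotation counts:

from collections import Counter

ann_counts = Counter()  # hypothetical tally kept by the caller

def report_counts():
    print(dict(ann_counts))

for sent in sent_report(inf, report_counts, subtotal=1000):
    ann_counts[len(sent.xpath("annotations/annotation"))] += 1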
Example #9
def unified_to_ukb(inf, outf, extract_extra):
    from stiff.extract.fin import FinExtractor

    if extract_extra:
        extractor = FinExtractor()
    for sent_elem in iter_sentences(inf):
        bits = []
        for instance in sent_elem.xpath("instance"):
            id = instance.attrib["id"]
            lemma = instance.attrib["lemma"].lower()
            pos = UNI_POS_WN_MAP[instance.attrib["pos"]]
            bits.append(f"{lemma}#{pos}#{id}#1")
        if extract_extra:
            elems = sent_elem.xpath("wf|instance")
            toks = [node.text for node in elems]
            known_idxs = {
                idx for idx, elem in enumerate(elems) if elem.tag == "instance"
            }
            tagging = extractor.extract_toks(toks, list(fake_starts(toks)))
            for tok_idx, tok in enumerate(tagging.tokens):
                if tok_idx in known_idxs:
                    continue
                extra_id = 0
                lemma_poses = set()
                for tag in tok.tags:
                    for wn, lemma_obj in tag.lemma_objs:
                        lemma_name = lemma_obj.name().lower().strip()
                        if lemma_name == "":
                            continue
                        lemma_pos = lemma_obj.synset().pos()
                        if lemma_pos == "s":
                            lemma_pos = "a"
                        lemma_poses.add((lemma_name, lemma_pos))
                for lemma, pos in lemma_poses:
                    bits.append(f"{lemma}#{pos}#xT{tok_idx}N{extra_id}#0")
                    extra_id += 1
        if bits:
            outf.write(sent_elem.attrib["id"])
            outf.write("\n")
            outf.write(" ".join(bits))
            outf.write("\n")
Example #10
def lesk_pp(mean, inf, keyout, include_wfs, expand, exclude_cand, score_by):
    aggf = ALL_MEANS[mean]
    lesk_pp = LeskPP(numberbatch_multispace, aggf, False, expand)
    for sent_idx, sent in enumerate(iter_sentences(inf)):
        if include_wfs:
            instances = sent.xpath("instance|wf")
            sent = [inst.text for inst in instances]
            tagged_sent = sent_finnpos(sent)
        else:
            instances = sent.xpath("instance")
            tagged_sent = None
        sent_lemmas = []
        instance_ids = []
        # XXX: SHOULD add wfs too! (equiv to wn_filter)
        for idx, instance in enumerate(instances):
            if instance.tag == "wf":
                lemma_str = tagged_sent[idx][1]
                lemmas = []
            else:
                lemma_str, _pos, lemmas = lemmas_from_instance(fiwn_encnt, instance)
            sent_lemmas.append((lemma_str, lemmas))
            if instance.tag == "instance":
                instance_ids.append(instance.attrib["id"])
        disambg_order = sorted(
            (len(lemmas), idx)
            for idx, (lemma_str, lemmas) in enumerate(sent_lemmas)
            if len(lemmas) > 0
        )
        for ambiguity, lemma_idx in disambg_order:
            if ambiguity <= 1:
                continue
            lemma_str, lemmas = sent_lemmas[lemma_idx]

            # XXX: Should context_vec exclude the word being disambiguated
            context_vec = lesk_pp.mk_ctx_vec(
                sent_lemmas, *([lemma_idx] if exclude_cand else [])
            )
            if context_vec is None:
                logger.debug("No context vec, backing off to MFS")
                # Back off to MFS
                sent_lemmas[lemma_idx] = (lemma_str, [lemmas[0]])
            else:
                logger.debug(f"Got context vec {context_vec}")
                best_lemma = None
                best_score = -2
                for lemma in lemmas:
                    logger.debug(f"Considering lemma: {lemma}")
                    defn_vec = lesk_pp.mk_defn_vec(lemma)
                    logger.debug(f"Got defn_vec: {defn_vec}")
                    if defn_vec is None:
                        defn_ctx_score = 0
                    else:
                        defn_ctx_score = cosine_sim(defn_vec, context_vec)
                    try:
                        lemma_vec = mk_lemma_vec(lemma)
                    except KeyError:
                        # XXX: Is this reasonable, or should there be a penalty?
                        lemma_ctx_score = defn_ctx_score
                    else:
                        logger.debug(f"Got lemma_vec: {lemma_vec}")
                        lemma_ctx_score = cosine_sim(lemma_vec, context_vec)
                    if score_by == "both":
                        score = defn_ctx_score + lemma_ctx_score
                    elif score_by == "defn":
                        score = defn_ctx_score
                    elif score_by == "lemma":
                        score = lemma_ctx_score
                    else:
                        assert False
                    logger.debug(
                        f"Score: {score} ({defn_ctx_score} + {lemma_ctx_score})"
                    )
                    if score > best_score:
                        best_lemma = lemma
                        best_score = score
                sent_lemmas[lemma_idx] = (lemma_str, [best_lemma])
        instance_sent_lemmas = (x for x in sent_lemmas if len(x[1]) > 0)
        for (lemma_str, lemmas), inst_id in zip(instance_sent_lemmas, instance_ids):
            if lemmas[0] is None:
                continue
            write_lemma(keyout, inst_id, lemmas[0])
Example #11
def iter_sentences_eurosense(stream):
    for sent_elem in iter_sentences(stream):
        yield "eurosense.{:08d}".format(int(sent_elem.attrib["id"])), sent_elem
Example #12
def unified_to_senseval(
    inf: IO,
    keyin: IO,
    outdir: str,
    exclude_word: List[str],
    write_tag: bool,
    synset_group: bool,
    filter_key: Optional[IO],
):
    """
    Converts from the unified format to a Senseval-3 -style format in
    individual files. The resulting files should be directly usable to train a
    single word model with ItMakesSense or can be gathered using senseval-gather.

    This is a scatter type operation.
    """
    def train_out(tag):
        if tag:
            return "train.tag.xml"
        else:
            return "train.xml"

    seen_keys: Set[str] = set()
    filter = None
    if filter_key is not None:
        filter = pickle.load(filter_key)
    for sent_elem in iter_sentences(inf):
        for inst in sent_elem.xpath("instance"):

            def read_key():
                key_line = keyin.readline()
                key_id, key_synset = key_line.rstrip().split(" ", 1)
                assert key_id == inst.attrib["id"]
                return key_id, key_synset

            lemma_str = inst.attrib["lemma"].lower()
            key_id, key_synset = read_key()

            if lemma_str in exclude_word:
                continue

            pos_str = inst.attrib["pos"]
            pos_chr = UNI_POS_WN_MAP[pos_str]
            lemma_pos = "{}.{}".format(lemma_str, pos_chr)
            if synset_group:
                group_keys = key_synset.split(" ")
            else:
                group_keys = [lemma_pos]

            for group_key in group_keys:
                if filter is not None and group_key not in filter:
                    continue
                new_group = group_key not in seen_keys
                seen_keys.add(group_key)

                # Make dir
                group_dir = pjoin(outdir, group_key)
                if new_group:
                    makedirs(group_dir, exist_ok=True)

                # Write XML
                def write_xml(tag=False):
                    out_fn = pjoin(group_dir, train_out(tag))
                    if new_group:
                        out_f = open(out_fn, "w")
                        lexical_sample_head(out_f)
                        if synset_group:
                            lexelt_synset_head(group_key, out_f)
                        else:
                            lexelt_head(lemma_str, pos_chr, out_f)
                    else:
                        out_f = open(out_fn, "a")
                    with instance(inst, out_f):
                        write_context(sent_elem, inst, out_f, write_tag=tag)
                    out_f.close()

                write_xml()
                if write_tag:
                    write_xml(True)

                # Write key file
                key_fn = pjoin(group_dir, "train.key")
                if new_group:
                    key_f = open(key_fn, "w")
                else:
                    key_f = open(key_fn, "a")
                out_line = "{} {} {}\n".format(lemma_pos, key_id, key_synset)
                key_f.write(out_line)
                key_f.close()

    for group_key in seen_keys:

        def write_foot(tag=False):
            out_fn = pjoin(outdir, group_key, train_out(tag))
            with open(out_fn, "a") as out_f:
                lexelt_foot(out_f)
                lexical_sample_foot(out_f)

        write_foot(False)
        if write_tag:
            write_foot(True)
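
A sketch of invoking this extended variant with synset grouping and tagged-context output (paths hypothetical; `filter_key`, when given, appears to be a pickled collection of group keys, and `exclude_word` lists lemmas to skip).

with open("unified.xml", "rb") as inf, open("unified.key") as keyin:
    unified_to_senseval(
        inf,
        keyin,
        "senseval-out",
        exclude_word=["olla"],  # hypothetical exclusion list
        write_tag=True,
        synset_group=True,
        filter_key=None,
    )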
Example #13
def cov(inf, subtotal=None):
    """
    Produce a report of how much of a corpus in Eurosense/STIFF format is
    covered by annotations.
    """
    source = Stream()
    header = pd.DataFrame(
        {"toks": [], "anns": [], "unambg_anns": [], "uniq_anns": [], "cover": []}
    )
    sdf = DataFrame(source, example=header)
    sums = {}
    for col in ["toks", "anns", "unambg_anns", "uniq_anns", "cover"]:
        sums[col] = getattr(sdf, col).sum().stream.gather().sink_to_list()

    ambg_source = Stream()
    ambg_header = pd.DataFrame({"ambg": []})
    ambg_sdf = DataFrame(ambg_source, example=ambg_header)
    ambg_hist = ambg_sdf.ambg.value_counts().stream.gather().sink_to_list()

    def print_cov():
        sents = len(sums["anns"])
        print("Coverage at {} sentences:".format(sents))
        print(
            "Total annotations, unique annotations, unambiguous annotations, "
            "tokens, tokens covered, proportion of tokens covered"
        )
        print(
            sums["anns"][-1],
            sums["uniq_anns"][-1],
            sums["unambg_anns"][-1],
            sums["toks"][-1],
            sums["cover"][-1],
            sums["cover"][-1] / sums["toks"][-1],
        )
        print(ambg_hist[-1])

    try:
        # XXX: take into account token length for coverage
        for idx, sent in enumerate(iter_sentences(inf)):
            toks = len(sent.xpath("text")[0].text.split(" "))
            anns = sent.xpath("annotations/annotation")
            num_anns = len(anns)
            ann_index = {}
            cov_map = [0] * toks
            for ann in anns:
                tok, tok_len = get_ann_pos(ann)
                ann_index.setdefault(tok, []).append(ann)
                # Use a separate name so the sentence index `idx` from the
                # enumerate() above is not clobbered.
                for cov_idx in range(tok, tok + tok_len):
                    cov_map[cov_idx] += 1
            unambg_anns = 0
            uniq_anns = 0
            for ann_list in ann_index.values():
                ambg = len(ann_list)
                ambg_source.emit(pd.DataFrame({"ambg": [ambg]}))
                if ambg == 1:
                    unambg_anns += 1
                uniq_anns += 1
            source.emit(
                pd.DataFrame(
                    {
                        "toks": [toks],
                        "anns": [num_anns],
                        "unambg_anns": [unambg_anns],
                        "uniq_anns": [uniq_anns],
                        "cover": [toks - cov_map.count(0)],
                    }
                )
            )
            idx1 = idx + 1
            if subtotal is not None and idx1 % subtotal == 0:
                print_cov()
    finally:
        if len(sums["anns"]):
            print_cov()