Example #1
def priority_union(main_csvin, secondary_csvin, csvout):
    """
    Perform a priority union on (frame, synset) rels. MAIN_CSV gets priority
    """
    disagreements = agreements = 0
    csvout.write("pb,wn\n")
    # Skip the input header rows
    next(main_csvin)
    next(secondary_csvin)
    for lemma, main_clus, sec_clus in outer_join(
            gen_groupings(main_csvin), gen_groupings(secondary_csvin)):
        if sec_clus is None:
            write_grouping(lemma, main_clus[0], csvout)
        elif main_clus is None:
            write_grouping(lemma, sec_clus[0], csvout)
        else:
            sk_main_clus = synset_key_clus(main_clus[0])
            sk_sec_clus = synset_key_clus(sec_clus[0])
            for synset, clus_idx in sk_sec_clus.items():
                if synset in sk_main_clus:
                    if sk_main_clus[synset] == clus_idx:
                        agreements += 1
                    else:
                        disagreements += 1
                        print(
                            f"{lemma}: Main says {synset} goes in frame {sk_main_clus[synset]}, but secondary says {clus_idx}",
                            file=sys.stderr)
                else:
                    sk_main_clus[synset] = clus_idx
            write_grouping(lemma, clus_key_clus(sk_main_clus), csvout)
    print(f"Agreements: {agreements}; Disagreements: {disagreements}",
          file=sys.stderr)
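
The helpers synset_key_clus and clus_key_clus used above are not part of this listing. Judging purely from how priority_union calls them (a grouping is a dict mapping a group number to a list of synsets, as in the other examples), they are most likely a pair of inversions between that shape and a synset-keyed dict. The sketch below is an assumption based on that usage, not the project's actual implementation.

from collections import defaultdict


def synset_key_clus(grouping):
    # Assumed behaviour: invert {group_num: [synsets]} into {synset: group_num}
    return {
        synset: group_num
        for group_num, synsets in grouping.items()
        for synset in synsets
    }


def clus_key_clus(synset_keyed):
    # Assumed behaviour: invert {synset: group_num} back into {group_num: [synsets]}
    grouping = defaultdict(list)
    for synset, group_num in synset_keyed.items():
        grouping[group_num].append(synset)
    return dict(grouping)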
Example #2
def filter_clus(csvin, csvout, wn):
    # Imported for its side effects only (the module itself is not referenced)
    import stiff.wordnet.fin  # noqa
    csvin = skip_first(csvin, csvout)
    dropped_non_smap = 0
    dropped_lemmas = 0
    for lemma, groupings in gen_groupings(csvin):
        num_synsets = 0
        empty_groups = []
        for group_num, synsets in groupings.items():
            new_synsets = []
            for synset in synsets:
                if not is_smap(wn, lemma, synset):
                    dropped_non_smap += 1
                    continue
                new_synsets.append(synset)
                num_synsets += 1
            if new_synsets:
                groupings[group_num] = new_synsets
            else:
                empty_groups.append(group_num)
        for group_num in empty_groups:
            del groupings[group_num]
        if num_synsets >= 2:
            write_grouping(lemma, groupings, csvout)
        else:
            dropped_lemmas += 1
    print(
        f"Dropped non-smap: {dropped_non_smap}; Dropped lemmas: {dropped_lemmas}",
        file=sys.stderr)
Example #3
def eval(gold, test, multi_group):
    lemmas = 0
    cnt = Counter(**ZERO_CONFUSION)
    line = next(gold)
    assert line.strip() in HEADERS
    gold_gen = gen_gold_groupings(gold, multi_group)
    for lemma, gold_clus, test_clus in outer_join(gold_gen,
                                                  gen_groupings(test)):
        if gold_clus is None:
            # Lemma appears only in the test output; ignore it
            continue
        if test_clus is None:
            # Every lemma in the gold data must have been guessed
            raise UnguessedLemmaException(lemma)
        lemmas += 1
        try:
            if multi_group:
                for gc in gold_clus[0]:
                    eval_clus(gc, test_clus[0], cnt)
            else:
                eval_clus(gold_clus[0], test_clus[0], cnt)
        except UnguessedInstanceException as exc:
            exc.missing_lemma = lemma
            raise

    res = stats_dict(cnt)
    res["lemmas"] = lemmas
    return res
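
Example #1 and this example join two generators of (lemma, grouping) pairs with outer_join, and Examples #9 and #10 below do the same with inner_join. Neither helper appears in this listing; the sketch below is one way they could behave that is consistent with the call sites (each yielded side is a list of matched groupings, or None when the lemma is absent from that side). The buffering strategy is an assumption, the real helpers may instead stream over lemma-sorted input.

def outer_join(left_gen, right_gen):
    # Collect every (key, value) pair from each side, grouping values per key
    left, right = {}, {}
    for store, gen in ((left, left_gen), (right, right_gen)):
        for key, value in gen:
            store.setdefault(key, []).append(value)
    # Yield (key, left values or None, right values or None) for every key seen
    for key in {**left, **right}:
        yield key, left.get(key), right.get(key)


def inner_join(left_gen, right_gen):
    # Keep only keys that occur on both sides
    for key, left_values, right_values in outer_join(left_gen, right_gen):
        if left_values is not None and right_values is not None:
            yield key, left_values, right_values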
Example #4
def stats(csvin, wn, multi=False):
    """
    Write out stats for CSVIN
    """
    # Imported for its side effects only (the module itself is not referenced)
    import stiff.wordnet.fin  # noqa
    cnt = Counter()
    first_line = next(csvin).strip()
    if first_line == "manann,ref":
        inclusion_criteria = ('ambg', 'none')
        has_wiktionary = True
    elif first_line == "pb,wn":
        inclusion_criteria = GROUPING_INCLUSION_CRITERIA
        has_wiktionary = False
    else:
        assert False, f"Unrecognised header: {first_line!r}"

    if multi:
        for lemma, multi_groupings in gen_multi_groupings(csvin):
            for inc_crit in inclusion_criteria:
                if any(include_grouping(inc_crit, wn, lemma, groupings)
                       for groupings in multi_groupings):
                    cnt['lemmas_' + inc_crit] += 1
                    for groupings in multi_groupings:
                        count_groupings(cnt, has_wiktionary, inc_crit, wn,
                                        lemma, groupings)
    else:
        for lemma, groupings in gen_groupings(csvin):
            for inc_crit in inclusion_criteria:
                if include_grouping(inc_crit, wn, lemma, groupings):
                    cnt['lemmas_' + inc_crit] += 1
                    count_groupings(cnt, has_wiktionary, inc_crit, wn, lemma,
                                    groupings)

    for k, v in sorted(cnt.items()):
        print(k, v)
Example #5
def decompile(inf, out_dir):
    session = get_session()
    for lemma, grouping in gen_groupings(inf):
        with open(pjoin(out_dir, lemma), "w") as outf:
            first = True
            for group_num, synsets in grouping.items():
                if not first:
                    outf.write("\n")
                else:
                    first = False
                for synset in synsets:
                    outf.write(synset)
                    outf.write(" # ")
                    if is_wn_ref(synset):
                        sense = wordnet.of2ss(synset).definition()
                    else:
                        sense = session.execute(select([
                            word_sense.c.sense,
                        ]).select_from(joined).where(
                            (headword.c.name == lemma) &
                            (word_sense.c.sense_id == synset)
                        )).fetchone()["sense"]
                    tokens = word_tokenize(sense)
                    outf.write(" ".join(tokens))
                    outf.write("\n")
Example #6
def filter_repeats(csvin, csvout):
    num_filtered = 0
    csvin = skip_first(csvin, csvout)
    for lemma, grouping in gen_groupings(csvin):
        grouped, filtered = filter_grouping_repeats(grouping)
        num_filtered += len(filtered)
        # Write the grouping with repeats filtered out, not the original
        write_grouping(lemma, grouped, csvout)
    print(f"Filtered: {num_filtered}", file=sys.stderr)
Example #7
def from_conc(csvin, csvout, wn, contradictions):
    csvout.write("pb,wn\n")
    import stiff.wordnet.fin  # noqa
    next(csvin)  # skip the header
    conc_clus = read_conc_clust(csvin)
    # Rewind and skip the header again for the second pass over the input
    csvin.seek(0)
    next(csvin)
    all_synth_clus = mk_synth(gen_groupings(csvin), wn)
    proc_write_synth(all_synth_clus, conc_clus, contradictions, csvout)
Example #8
def create_sample_maps(multi, gold, guess):
    line_map = []
    gold_map = {}
    for lemma, groups in gen_gold_groupings(gold, multi):
        line_map.append(lemma)
        gold_map[lemma] = groups
    guess_map = {}
    for lemma, groups in gen_groupings(guess):
        guess_map[lemma] = groups
    return line_map, guess_map, gold_map
Example #9
def pre_cnt_assignments(gold, test, multi_group):
    line = next(gold)
    assert line.strip() in HEADERS
    index_map, rev_map, num_gold_instances = index_gold_instances(
        gold, multi_group)
    # Rewind for a second pass over the gold file
    gold.seek(0)
    line = next(gold)
    assert line.strip() in HEADERS
    gold_gen = gen_gold_groupings(gold, multi_group)
    cnts = {}
    for lemma, gold_clus, test_clus in inner_join(gold_gen,
                                                  gen_groupings(test)):
        if multi_group:
            for gc in gold_clus[0]:
                add_cluster_partial_cnts(lemma, gc, test_clus[0], cnts,
                                         index_map)
        else:
            add_cluster_partial_cnts(lemma, gold_clus[0], test_clus[0], cnts,
                                     index_map)
    return cnts
Example #10
def pre_cnt_lemmas(gold, test, multi_group):
    line = next(gold)
    assert line.strip() in HEADERS
    lemma_line_map = {}
    for idx, (lemma, groups) in enumerate(
            gen_gold_groupings(gold, multi_group)):
        lemma_line_map[lemma] = idx
    cnts = {}
    # Rewind for a second pass over the gold file
    gold.seek(0)
    line = next(gold)
    assert line.strip() in HEADERS
    gold_gen = gen_gold_groupings(gold, multi_group)
    for lemma, gold_clus, test_clus in inner_join(gold_gen,
                                                  gen_groupings(test)):
        cnt = Counter(**ZERO_CONFUSION)
        if multi_group:
            for gc in gold_clus[0]:
                eval_clus(gc, test_clus[0], cnt)
        else:
            eval_clus(gold_clus[0], test_clus[0], cnt)
        cnts[lemma_line_map[lemma]] = cnt
    return cnts
Example #11
def read_conc_clust(csvin):
    conc_clus = {}
    for lemma, groupings in gen_groupings(csvin):
        conc_clus[lemma] = groupings
    return conc_clus
Example #12
def gen_gold_groupings(gold, multi_group):
    if multi_group:
        return gen_multi_groupings(gold)
    else:
        return gen_groupings(gold)
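
Every example above reads and writes clusterings through gen_groupings and write_grouping, which are likewise not shown. The sketch below is purely illustrative: it assumes a two-column CSV layout of the form "lemma.group_num,synset" under a header such as "pb,wn", with all rows for a lemma appearing consecutively. The project's real column format may differ; the point is only the (lemma, {group_num: [synsets]}) structure that the examples iterate over.

from itertools import groupby


def gen_groupings(csvin):
    # Assumed layout: each row is "lemma.group_num,synset", header already consumed
    def parse(line):
        frame, synset = line.strip().split(",")
        lemma, group_num = frame.rsplit(".", 1)
        return lemma, group_num, synset

    rows = (parse(line) for line in csvin if line.strip())
    for lemma, lemma_rows in groupby(rows, key=lambda row: row[0]):
        grouping = {}
        for _, group_num, synset in lemma_rows:
            grouping.setdefault(group_num, []).append(synset)
        yield lemma, grouping


def write_grouping(lemma, grouping, csvout):
    # Write the grouping back out in the same assumed layout
    for group_num, synsets in grouping.items():
        for synset in synsets:
            csvout.write(f"{lemma}.{group_num},{synset}\n")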