def priority_union(main_csvin, secondary_csvin, csvout):
    """
    Perform a priority union on (frame, synset) rels. MAIN_CSV gets priority.

    Reads two grouping CSVs (header lines are skipped), writes a merged
    "pb,wn" CSV to *csvout*, and reports agreement/disagreement counts
    between the two inputs on stderr.
    """
    disagreements = agreements = 0
    csvout.write("pb,wn\n")
    # Skip the header line of both inputs before grouping.
    next(main_csvin)
    next(secondary_csvin)
    for lemma, main_clus, sec_clus in outer_join(
            gen_groupings(main_csvin), gen_groupings(secondary_csvin)):
        if sec_clus is None:
            # Lemma only in main: pass through unchanged.
            write_grouping(lemma, main_clus[0], csvout)
        elif main_clus is None:
            # Lemma only in secondary: pass through unchanged.
            write_grouping(lemma, sec_clus[0], csvout)
        else:
            # Both present: re-key both clusterings by synset so we can
            # merge synset-by-synset, with main taking priority on conflict.
            sk_main_clus = synset_key_clus(main_clus[0])
            sk_sec_clus = synset_key_clus(sec_clus[0])
            for synset, clus_idx in sk_sec_clus.items():
                if synset in sk_main_clus:
                    if sk_main_clus[synset] == clus_idx:
                        agreements += 1
                    else:
                        # Conflict: keep main's assignment, just log it.
                        disagreements += 1
                        print(
                            f"{lemma}: Main says {synset} goes in frame {sk_main_clus[synset]}, but secondary says {clus_idx}",
                            file=sys.stderr)
                else:
                    # Synset only known to secondary: adopt its assignment.
                    sk_main_clus[synset] = clus_idx
            write_grouping(lemma, clus_key_clus(sk_main_clus), csvout)
    print(f"Agreements: {agreements}; Disagreements: {disagreements}",
          file=sys.stderr)
def filter_clus(csvin, csvout, wn):
    """
    Filter the groupings in CSVIN down to synsets accepted by ``is_smap``
    for *wn*, writing the surviving groupings to CSVOUT.

    Groups left empty are dropped; lemmas with fewer than two surviving
    synsets are dropped entirely. Counts of both are reported on stderr.
    """
    import stiff.wordnet.fin
    csvin = skip_first(csvin, csvout)
    dropped_non_smap = 0
    dropped_lemmas = 0
    for lemma, groupings in gen_groupings(csvin):
        surviving_total = 0
        pruned = {}
        for group_num, synsets in groupings.items():
            kept = [synset for synset in synsets if is_smap(wn, lemma, synset)]
            dropped_non_smap += len(synsets) - len(kept)
            surviving_total += len(kept)
            # Groups with no surviving synsets are omitted entirely.
            if kept:
                pruned[group_num] = kept
        if surviving_total >= 2:
            write_grouping(lemma, pruned, csvout)
        else:
            dropped_lemmas += 1
    print(
        f"Dropped non-smap: {dropped_non_smap}; Dropped lemmas: {dropped_lemmas}",
        file=sys.stderr)
def eval(gold, test, multi_group):
    """
    Evaluate *test* clusterings against *gold*, returning a stats dict
    (confusion-derived scores plus a ``"lemmas"`` count).

    Raises UnguessedLemmaException when a gold lemma is missing from test,
    and re-raises UnguessedInstanceException tagged with the lemma it
    occurred on.
    """
    lemmas = 0
    cnt = Counter(**ZERO_CONFUSION)
    header = next(gold)
    assert header.strip() in HEADERS
    gold_gen = gen_gold_groupings(gold, multi_group)
    for lemma, gold_clus, test_clus in outer_join(gold_gen, gen_groupings(test)):
        if gold_clus is None:
            # right join
            continue
        if test_clus is None:
            # right join must == inner join
            raise UnguessedLemmaException(lemma)
        lemmas += 1
        # Normalize to a list of gold clusterings so both modes share one loop.
        gold_groups = gold_clus[0] if multi_group else [gold_clus[0]]
        try:
            for gc in gold_groups:
                eval_clus(gc, test_clus[0], cnt)
        except UnguessedInstanceException as exc:
            exc.missing_lemma = lemma
            raise
    res = stats_dict(cnt)
    res["lemmas"] = lemmas
    return res
def stats(csvin, wn, multi=False):
    """
    Write out stats for CSVIN

    The header line selects the file flavour: "manann,ref" (wiktionary
    annotations) or "pb,wn" (groupings). Counts are printed sorted by key.
    """
    import stiff.wordnet.fin
    cnt = Counter()
    header = next(csvin).strip()
    if header == "manann,ref":
        inclusion_criteria = ('ambg', 'none')
        has_wiktionary = True
    elif header == "pb,wn":
        inclusion_criteria = GROUPING_INCLUSION_CRITERIA
        has_wiktionary = False
    else:
        assert False
    # Treat the single-grouping stream as one-element multi-groupings so a
    # single loop handles both modes.
    if multi:
        stream = gen_multi_groupings(csvin)
    else:
        stream = ((lemma, [grouping]) for lemma, grouping in gen_groupings(csvin))
    for lemma, multi_groupings in stream:
        for inc_crit in inclusion_criteria:
            if any(include_grouping(inc_crit, wn, lemma, grouping)
                   for grouping in multi_groupings):
                cnt['lemmas_' + inc_crit] += 1
                for grouping in multi_groupings:
                    count_groupings(cnt, has_wiktionary, inc_crit, wn, lemma,
                                    grouping)
    for k, v in sorted(cnt.items()):
        print(k, v)
def decompile(inf, out_dir):
    """
    Decompile the groupings in *inf* into one plain-text file per lemma
    under *out_dir*.

    Each group is separated by a blank line; each synset line is written as
    "<synset> # <tokenized sense gloss>". Glosses come from WordNet for
    WordNet references, otherwise from the database via the current session.
    """
    session = get_session()
    for lemma, grouping in gen_groupings(inf):
        with open(pjoin(out_dir, lemma), "w") as outf:
            first = True
            for group_num, synsets in grouping.items():
                # Blank line between groups (but not before the first).
                if not first:
                    outf.write("\n")
                else:
                    first = False
                for synset in synsets:
                    outf.write(synset)
                    outf.write(" # ")
                    if is_wn_ref(synset):
                        # WordNet reference: gloss comes from the offset.
                        sense = wordnet.of2ss(synset).definition()
                    else:
                        # Otherwise look the sense text up in the database.
                        # NOTE(review): assumes exactly one matching row —
                        # fetchone() would raise TypeError on no match.
                        sense = session.execute(select([
                            word_sense.c.sense,
                        ]).select_from(joined).where(
                            (headword.c.name == lemma)
                            & (word_sense.c.sense_id == synset)
                        )).fetchone()["sense"]
                    tokens = word_tokenize(sense)
                    outf.write(" ".join(tokens))
                    outf.write("\n")
def filter_repeats(csvin, csvout):
    """
    Remove repeated synsets from each lemma's grouping in CSVIN, writing
    the de-duplicated groupings to CSVOUT.

    The number of filtered (repeated) synsets is reported on stderr.
    """
    num_filtered = 0
    csvin = skip_first(csvin, csvout)
    for lemma, grouping in gen_groupings(csvin):
        grouped, filtered = filter_grouping_repeats(grouping)
        num_filtered += len(filtered)
        # BUG FIX: write the de-duplicated `grouped` result. Previously the
        # untouched `grouping` was written, so the filtering was discarded
        # unless filter_grouping_repeats happened to mutate its argument.
        write_grouping(lemma, grouped, csvout)
    print(f"Filtered: {num_filtered}", file=sys.stderr)
def from_conc(csvin, csvout, wn, contradictions):
    """
    Build synthetic clusterings from CSVIN and reconcile them against the
    concrete clusterings read from the same file, writing the result as a
    "pb,wn" CSV to CSVOUT.
    """
    csvout.write("pb,wn\n")
    import stiff.wordnet.fin  # noqa
    # First pass: read the concrete clusterings (skipping the header).
    next(csvin)
    concrete = read_conc_clust(csvin)
    # Second pass over the same file: build the synthetic clusterings.
    csvin.seek(0)
    next(csvin)
    synthetic = mk_synth(gen_groupings(csvin), wn)
    proc_write_synth(synthetic, concrete, contradictions, csvout)
def create_sample_maps(multi, gold, guess):
    """
    Build lookup structures for sampling: the ordered list of gold lemmas,
    plus lemma->groups maps for both the guess and the gold files.

    Returns (line_map, guess_map, gold_map).
    """
    gold_pairs = list(gen_gold_groupings(gold, multi))
    # Preserve gold file order (including any duplicate lemmas).
    line_map = [lemma for lemma, _groups in gold_pairs]
    gold_map = {lemma: groups for lemma, groups in gold_pairs}
    guess_map = {lemma: groups for lemma, groups in gen_groupings(guess)}
    return line_map, guess_map, gold_map
def pre_cnt_assignments(gold, test, multi_group):
    """
    Accumulate per-assignment partial counts for every lemma shared by the
    gold and test files, keyed as produced by ``add_cluster_partial_cnts``.
    """
    assert next(gold).strip() in HEADERS
    index_map, rev_map, num_gold_instances = index_gold_instances(
        gold, multi_group)
    # Rewind for the second pass over the gold file.
    gold.seek(0)
    assert next(gold).strip() in HEADERS
    cnts = {}
    joined_pairs = inner_join(
        gen_gold_groupings(gold, multi_group), gen_groupings(test))
    for lemma, gold_clus, test_clus in joined_pairs:
        # Normalize to a list of gold clusterings so both modes share a loop.
        gold_groups = gold_clus[0] if multi_group else [gold_clus[0]]
        for gc in gold_groups:
            add_cluster_partial_cnts(lemma, gc, test_clus[0], cnts, index_map)
    return cnts
def pre_cnt_lemmas(gold, test, multi_group):
    """
    Compute a per-lemma confusion Counter for every lemma shared by gold and
    test, keyed by the lemma's line index within the gold file.
    """
    assert next(gold).strip() in HEADERS
    # First pass: record each gold lemma's line index.
    lemma_line_map = {
        lemma: idx
        for idx, (lemma, _groups)
        in enumerate(gen_gold_groupings(gold, multi_group))
    }
    cnts = {}
    # Rewind for the second pass over the gold file.
    gold.seek(0)
    assert next(gold).strip() in HEADERS
    joined_pairs = inner_join(
        gen_gold_groupings(gold, multi_group), gen_groupings(test))
    for lemma, gold_clus, test_clus in joined_pairs:
        cnt = Counter(**ZERO_CONFUSION)
        # Normalize to a list of gold clusterings so both modes share a loop.
        gold_groups = gold_clus[0] if multi_group else [gold_clus[0]]
        for gc in gold_groups:
            eval_clus(gc, test_clus[0], cnt)
        cnts[lemma_line_map[lemma]] = cnt
    return cnts
def read_conc_clust(csvin):
    """Read all concrete clusterings from CSVIN into a lemma-keyed dict."""
    return {lemma: groupings for lemma, groupings in gen_groupings(csvin)}
def gen_gold_groupings(gold, multi_group):
    """Yield gold groupings, dispatching on whether GOLD is multi-grouped."""
    generate = gen_multi_groupings if multi_group else gen_groupings
    return generate(gold)