def do_compute_intervals(args):
    """Write a TSV of bootstrap confidence intervals (micro/macro P, R, F1)
    for every system whose predictions (*.txt) live under args.preds."""
    assert os.path.exists(args.preds) and os.path.isdir(
        args.preds), "{} does not exist or is not a directory".format(args.preds)

    queries = load_queries(args.queries)
    entities = sorted(set(queries.values()))
    gold = load_gold(args.gold, queries)

    tsv = csv.writer(args.output, delimiter="\t")
    # Header: point estimate plus left/right interval bound for each metric.
    header = ["system"]
    for metric in ("micro-p", "micro-r", "micro-f1",
                   "macro-p", "macro-r", "macro-f1"):
        header += [metric, metric + "-left", metric + "-right"]
    tsv.writerow(header)

    for fname in os.listdir(args.preds):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        logger.info("Loading output for %s", runid)
        with open(os.path.join(args.preds, fname)) as handle:
            predictions = load_output(handle, queries)
        scores, correct, total = compute_entity_scores(queries, gold, predictions)

        def resampled_metric(sample):
            # Re-index the per-entity tallies onto a bootstrap sample of entities.
            s, c, t = {}, {}, {}
            for idx, entity in enumerate(sample):
                s[idx], c[idx], t[idx] = scores[entity], correct[entity], total[entity]
            return micro(s, c, t) + macro(s, c, t)

        stats = confidence_intervals(
            entities, resampled_metric, args.samples, args.confidence)
        tsv.writerow([runid, *stats.T.flatten()])
def do_standardized_evaluation(args):
    """Write a TSV of standardized macro-F1 scores with bootstrap confidence
    intervals, plus a score-matrix visualization to args.output_vis."""
    assert os.path.exists(args.preds) and os.path.isdir(
        args.preds), "{} does not exist or is not a directory".format(args.preds)

    queries = load_queries(args.queries)
    entities = sorted(set(queries.values()))
    gold = load_gold(args.gold, queries)

    per_system = {}
    for fname in os.listdir(args.preds):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        logger.info("Loading output for %s", runid)
        if runid == "LDC":
            # The "LDC" run is deliberately excluded from scoring here.
            continue
        with open(os.path.join(args.preds, fname)) as handle:
            predictions = load_output(handle, queries)
        per_system[runid] = compute_entity_scores(queries, gold, predictions)

    matrix = compute_score_matrix(per_system, entities)
    report_score_matrix(matrix, args.output_vis, sorted(per_system), sorted(queries))

    tsv = csv.writer(args.output, delimiter="\t")
    tsv.writerow(["system", "macro-sf1", "macro-sf1-left", "macro-sf1-right"])

    def standardized_metric(sample):
        # Re-index every system's per-entity tallies onto the bootstrap sample,
        # then standardize scores across systems.
        resampled = {}
        for runid, (S, C, T) in per_system.items():
            s, c, t = {}, {}, {}
            for idx, entity in enumerate(sample):
                s[idx], c[idx], t[idx] = S[entity], C[entity], T[entity]
            resampled[runid] = s, c, t
        return standardize_scores(compute_score_matrix(resampled, sample))

    stats = confidence_intervals(
        entities, standardized_metric, args.samples, args.confidence)
    logger.info("stats: %d, %d", *stats.shape)
    columns = stats.T
    for idx, runid in enumerate(sorted(per_system)):
        tsv.writerow([runid, *columns[idx]])
def do_mention_evaluation(args):
    """Score a single system's mention-level output: one line per key in the
    score tables, then aggregate micro and macro lines."""
    queries = load_queries(args.queries)
    gold = load_gold(args.gold, queries)
    predictions = load_output(args.pred, queries)
    S, C, T = compute_mention_scores(gold, predictions)

    out = args.output
    for key in sorted(S):
        # Per-key scores: run micro() on singleton tables for this key only.
        out.write("{} {:.04f} {:.04f} {:.04f}\n".format(
            key, *micro({key: S[key]}, {key: C[key]}, {key: T[key]})))
    out.write("micro {:.04f} {:.04f} {:.04f}\n".format(*micro(S, C, T)))
    out.write("macro {:.04f} {:.04f} {:.04f}\n".format(*macro(S, C, T)))
def load_data(args):
    """Load queries, gold annotations, and all system outputs from args.preds.

    Returns (queries, gold, outputs) where outputs maps runid -> predictions.
    Runs with empty output are dropped; an "LDC" run must be present.
    """
    queries = load_queries(args.queries)
    gold = load_gold(args.gold, queries)

    outputs = {}
    for fname in tqdm(os.listdir(args.preds), desc="Loading outputs"):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        with open(os.path.join(args.preds, fname)) as handle:
            predictions = load_output(handle, queries)
        if len(predictions) > 0:
            outputs[runid] = predictions

    logger.info("Loaded output for %d systems", len(outputs))
    assert "LDC" in outputs
    return queries, gold, outputs
def do_pooling_bias(args):
    """Measure pooling bias: score every system against (a) the full gold pool,
    (b) a leave-one-out (loo) pool built from all other systems' entries, and
    (c) a leave-team-out (lto) pool excluding the system's whole team, writing
    one TSV row per system to args.output."""
    assert os.path.exists(args.preds) and os.path.isdir(
        args.preds), "{} does not exist or is not a directory".format(args.preds)

    Q = load_queries(args.queries)
    gold = load_gold(args.gold, Q)
    outputs = {}
    for fname in os.listdir(args.preds):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        logger.info("Loading output for %s", runid)
        with open(os.path.join(args.preds, fname)) as f:
            outputs[runid] = load_output(f, Q)
    logger.info("Loaded output for %d systems", len(outputs))

    def pick_key(mode):
        """Return the entry-key function for the given scoring mode.

        BUG FIX: the original tested `mode == "closed-world" or "condensed"`,
        which is always truthy (non-empty string literal), so every mode —
        including "anydoc" and unknown modes — silently used `k` and the
        ValueError was unreachable. Membership tests restore the intent.
        """
        if mode in ("closed-world", "condensed"):
            return k
        if mode in ("anydoc", "condensed-anydoc"):
            return kn
        raise ValueError("Unsupported mode: " + mode)

    def make_loo_pool(gold, outputs, runid, mode="closed-world"):
        """Gold pool containing only entries submitted by systems other than runid."""
        key = pick_key(mode)
        valid_entries = set()
        for runid_, output in outputs.items():
            # Skip the system under evaluation. NOTE(review): the second test is
            # loop-invariant — `runid == 'SF_UTAustin1'` empties the pool when
            # evaluating that system; the comment ("making sure UTAustin doesn't
            # fudge our results") suggests `runid_ == 'SF_UTAustin1'` was meant.
            # Preserved as-is pending confirmation.
            if runid == runid_ or runid == 'SF_UTAustin1':
                continue
            valid_entries.update(key(entry) for entry in output)
        gold_ = [entry for entry in gold if key(entry) in valid_entries]
        logger.info("loo pool for %s contains %d entries", runid, len(gold_))
        return gold_

    def make_lto_pool(gold, outputs, runid, mode="closed-world"):
        """Gold pool containing only entries submitted by teams other than runid's team."""
        key = pick_key(mode)
        valid_entries = set()
        for runid_, output in outputs.items():
            # NOTE(review): same suspicious loop-invariant test as make_loo_pool.
            if teamid(runid) == teamid(runid_) or runid == 'SF_UTAustin1':
                continue
            valid_entries.update(key(entry) for entry in output)
        gold_ = [entry for entry in gold if key(entry) in valid_entries]
        logger.info("lto pool for %s contains %d entries", runid, len(gold_))
        return gold_

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow([
        "system",
        "micro-p", "micro-r", "micro-f1",
        "macro-p", "macro-r", "macro-f1",
        "micro-p-loo", "micro-r-loo", "micro-f1-loo",
        "macro-p-loo", "macro-r-loo", "macro-f1-loo",
        "micro-p-lto", "micro-r-lto", "micro-f1-lto",
        "macro-p-lto", "macro-r-lto", "macro-f1-lto",
    ])

    rows = []
    for runid, output in tqdm(outputs.items()):
        row = []
        # Full pool, then leave-one-out, then leave-team-out.
        S, C, T = compute_entity_scores(Q, gold, output, args.mode)
        row += micro(S, C, T) + macro(S, C, T)
        S, C, T = compute_entity_scores(
            Q, make_loo_pool(gold, outputs, runid, args.mode), output, args.mode)
        row += micro(S, C, T) + macro(S, C, T)
        S, C, T = compute_entity_scores(
            Q, make_lto_pool(gold, outputs, runid, args.mode), output, args.mode)
        row += micro(S, C, T) + macro(S, C, T)

        writer.writerow([runid] + row)
        args.output.flush()  # flush per row so partial results survive a crash
        rows.append([runid] + row)

    logger.info("Wrote %d rows of output", len(rows))
    args.output.flush()