示例#1
0
def do_compute_intervals(args):
    """Write a TSV of micro/macro P/R/F1 with bootstrap confidence intervals
    for every prediction file (*.txt) in the args.preds directory."""
    assert os.path.exists(args.preds) and os.path.isdir(args.preds), \
        "{} does not exist or is not a directory".format(args.preds)

    Q = load_queries(args.queries)
    E = sorted(set(Q.values()))
    gold = load_gold(args.gold, Q)

    writer = csv.writer(args.output, delimiter="\t")
    # Header: point estimate plus left/right interval bound for each metric.
    header = ["system"]
    for scope in ("micro", "macro"):
        for metric in ("p", "r", "f1"):
            header += [
                "{}-{}".format(scope, metric),
                "{}-{}-left".format(scope, metric),
                "{}-{}-right".format(scope, metric),
            ]
    writer.writerow(header)

    for fname in os.listdir(args.preds):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        logger.info("Loading output for %s", runid)

        with open(os.path.join(args.preds, fname)) as f:
            output = load_output(f, Q)
        S, C, T = compute_entity_scores(Q, gold, output)

        def compute_metric(entities):
            # Re-index the per-entity scores by position in the resample.
            S_ = {i: S[e] for i, e in enumerate(entities)}
            C_ = {i: C[e] for i, e in enumerate(entities)}
            T_ = {i: T[e] for i, e in enumerate(entities)}
            return micro(S_, C_, T_) + macro(S_, C_, T_)

        # Bootstrap over entities.
        stats = confidence_intervals(E, compute_metric, args.samples,
                                     args.confidence)
        writer.writerow([runid] + list(stats.T.flatten()))
示例#2
0
def do_standardized_evaluation(args):
    """Write a TSV of standardized macro F1 (with bootstrap confidence
    intervals) for every non-LDC prediction file in args.preds, plus a
    visualization of the system-by-entity score matrix."""
    assert os.path.exists(args.preds) and os.path.isdir(args.preds), \
        "{} does not exist or is not a directory".format(args.preds)

    Q = load_queries(args.queries)
    E = sorted(set(Q.values()))
    gold = load_gold(args.gold, Q)

    scores = {}
    for fname in os.listdir(args.preds):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        logger.info("Loading output for %s", runid)
        if runid == "LDC":
            continue
        with open(os.path.join(args.preds, fname)) as f:
            scores[runid] = compute_entity_scores(Q, gold, load_output(f, Q))

    X_rs = compute_score_matrix(scores, E)
    report_score_matrix(X_rs, args.output_vis, sorted(scores), sorted(Q))

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["system", "macro-sf1", "macro-sf1-left", "macro-sf1-right"])

    def compute_metric(entities):
        # Re-index every system's per-entity scores by position in the
        # resample, then standardize the resulting score matrix.
        subset = {}
        for runid, (S, C, T) in scores.items():
            subset[runid] = (
                {i: S[e] for i, e in enumerate(entities)},
                {i: C[e] for i, e in enumerate(entities)},
                {i: T[e] for i, e in enumerate(entities)},
            )
        return standardize_scores(compute_score_matrix(subset, entities))

    # Bootstrap over entities.
    stats = confidence_intervals(E, compute_metric, args.samples,
                                 args.confidence)
    logger.info("stats: %d, %d", *stats.shape)
    for row, runid in zip(stats.T, sorted(scores)):
        writer.writerow([runid] + list(row))
示例#3
0
def do_mention_evaluation(args):
    """Report per-slot, micro- and macro-averaged mention scores as plain
    text on args.output."""
    Q = load_queries(args.queries)
    gold = load_gold(args.gold, Q)
    output = load_output(args.pred, Q)

    S, C, T = compute_mention_scores(gold, output)

    line = "{} {:.04f} {:.04f} {:.04f}\n"
    for s in sorted(S):
        # Score each slot in isolation.
        args.output.write(line.format(
            s, *micro({s: S[s]}, {s: C[s]}, {s: T[s]})))
    args.output.write(line.format("micro", *micro(S, C, T)))
    args.output.write(line.format("macro", *macro(S, C, T)))
示例#4
0
def load_data(args):
    """Load queries, gold annotations, and every non-empty per-system
    prediction file (*.txt) from args.preds; return (Q, gold, outputs)."""
    Q = load_queries(args.queries)
    gold = load_gold(args.gold, Q)

    outputs = {}
    for fname in tqdm(os.listdir(args.preds), desc="Loading outputs"):
        if not fname.endswith(".txt"):
            continue
        runid = fname.split(".")[0]
        with open(os.path.join(args.preds, fname)) as f:
            output = load_output(f, Q)
        # Skip systems whose output is empty.
        if len(output) > 0:
            outputs[runid] = output
    logger.info("Loaded output for %d systems", len(outputs))
    assert "LDC" in outputs
    return Q, gold, outputs
示例#5
0
def do_pooling_bias(args):
    """Measure pooling bias: score every system against (1) the full gold
    pool, (2) a leave-one-out (loo) pool built from all other systems, and
    (3) a leave-team-out (lto) pool excluding the whole team, writing the
    micro/macro P/R/F1 numbers for all three as a TSV to args.output."""
    assert os.path.exists(args.preds) and os.path.isdir(
        args.preds), "{} does not exist or is not a directory".format(
            args.preds)

    Q = load_queries(args.queries)

    gold = load_gold(args.gold, Q)
    outputs = {}

    for fname in os.listdir(args.preds):
        if not fname.endswith(".txt"): continue
        runid = fname.split(".")[0]
        logger.info("Loading output for %s", runid)

        with open(os.path.join(args.preds, fname)) as f:
            outputs[runid] = load_output(f, Q)
    logger.info("Loaded output for %d systems", len(outputs))

    def entry_key(mode):
        """Return the entry-key function for the given scoring mode.

        BUG FIX: the original conditions were of the form
        `mode == "closed-world" or "condensed"`, which is always truthy
        (a non-empty string literal), so `key` was always `k` and the
        anydoc modes -- and the ValueError -- were unreachable.
        """
        if mode in ("closed-world", "condensed"):
            return k
        if mode in ("anydoc", "condensed-anydoc"):
            return kn
        raise ValueError("Unsupported mode: " + mode)

    def make_loo_pool(gold, outputs, runid, mode="closed-world"):
        """
        Create a new gold set which includes only the inputs from all other systems.
        """
        key = entry_key(mode)

        valid_entries = set()
        for runid_, output in outputs.items():
            # Making sure UTAustin doesn't make fudge our results
            # NOTE(review): this tests `runid` (the system under evaluation),
            # not `runid_` (the pool contributor); if runid == 'SF_UTAustin1'
            # every contributor is skipped and the pool is empty. Possibly
            # `runid_ == 'SF_UTAustin1'` was intended -- confirm before
            # changing, as it alters published numbers.
            if runid == runid_ or runid == 'SF_UTAustin1': continue
            valid_entries.update(key(entry) for entry in output)
        gold_ = [entry for entry in gold if key(entry) in valid_entries]
        logger.info("loo pool for %s contains %d entries", runid, len(gold_))
        return gold_

    def make_lto_pool(gold, outputs, runid, mode="closed-world"):
        """
        Create a new gold set which includes only the inputs from systems
        on other teams.
        """
        key = entry_key(mode)

        valid_entries = set()
        for runid_, output in outputs.items():
            # NOTE(review): same `runid == 'SF_UTAustin1'` concern as in
            # make_loo_pool above -- confirm whether `runid_` was intended.
            if teamid(runid) == teamid(runid_) or runid == 'SF_UTAustin1':
                continue
            valid_entries.update(key(entry) for entry in output)
        gold_ = [entry for entry in gold if key(entry) in valid_entries]
        logger.info("lto pool for %s contains %d entries", runid, len(gold_))
        return gold_

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow([
        "system",
        "micro-p",
        "micro-r",
        "micro-f1",
        "macro-p",
        "macro-r",
        "macro-f1",
        "micro-p-loo",
        "micro-r-loo",
        "micro-f1-loo",
        "macro-p-loo",
        "macro-r-loo",
        "macro-f1-loo",
        "micro-p-lto",
        "micro-r-lto",
        "micro-f1-lto",
        "macro-p-lto",
        "macro-r-lto",
        "macro-f1-lto",
    ])

    rows = []
    for runid, output in tqdm(outputs.items()):
        row = []
        # Score against the full gold pool.
        S, C, T = compute_entity_scores(Q, gold, output, args.mode)
        row += micro(S, C, T) + macro(S, C, T)

        # Score against the leave-one-out pool.
        S, C, T = compute_entity_scores(
            Q, make_loo_pool(gold, outputs, runid, args.mode), output,
            args.mode)
        row += micro(S, C, T) + macro(S, C, T)

        # Score against the leave-team-out pool.
        S, C, T = compute_entity_scores(
            Q, make_lto_pool(gold, outputs, runid, args.mode), output,
            args.mode)
        row += micro(S, C, T) + macro(S, C, T)

        row = [runid] + row
        writer.writerow(row)
        args.output.flush()  # keep partial results on disk during long runs
        rows.append(row)

    logger.info("Wrote %d rows of output", len(rows))

    args.output.flush()