示例#1
0
def do_build_table(args):
    """Bootstrap estimator trajectories for every (metric, prompt, system).

    Reads the records at ``args.input`` (and, optionally, per-metric summary
    statistics derived from ``args.input_means``), then for each combination
    of metric, prompt and system runs ``bootstrap_trajectory`` once with the
    "simple" estimator and once with the "model_variate" estimator.  Each
    trajectory is summarised as (mean, 10th percentile, 90th percentile)
    per step and passed through ``report_trajectory``; all reports are
    written to ``args.output`` with ``save_jsonl``.

    Side effects on ``args``: ``estimator_args`` is replaced by a dict and
    gets ``_g0`` / ``_var_g`` entries; ``data_prompt``, ``data_metric``,
    ``data_system`` and ``estimator`` are overwritten for every setting.

    Raises:
        ValueError: if 'human' is not among the metrics or 'reference' is
            not among the systems (both are removed with ``list.remove``).
    """
    args.estimator_args = dict(args.estimator_args or [])
    data = load_jsonl(args.input)
    data_means = load_jsonl(args.input_means) if args.input_means else None
    metric_ss = get_metric_ss(data_means) if data_means else {}

    # The first record is taken as representative of the prompt/metric layout.
    prompts = list(first(data)["prompts"])
    metrics = list(first(first(data)["prompts"].values()))
    metrics.remove('human')
    systems = sorted({datum["system"] for datum in data})
    systems.remove('reference')

    # Enumerate settings with metric as the outermost and system as the
    # innermost loop; kept as a list so tqdm can show a total.
    settings = []
    for metric in metrics:
        for prompt in prompts:
            for system in systems:
                settings.append((metric, prompt, system))

    trajectories = []
    for metric, prompt, system in tqdm(settings, desc="settings"):
        args.data_prompt = prompt
        args.data_metric = metric
        args.data_system = system

        # Project the data onto the current setting.
        fs, gs, hs, anns = apply_transforms(args, data)
        # The fs returned above is discarded and rebuilt as the mean of each
        # entry of hs (marked "FIXES BUG" by the original author).
        fs = np.array([np.mean(h) for h in hs])

        # Use precomputed statistics for the metric when they are available,
        # otherwise estimate them from the projected gs.
        if metric in metric_ss:
            g0, var_g = metric_ss[metric][prompt][system]
        else:
            g0, var_g = np.mean(gs), np.var(gs)
        args.estimator_args["_g0"] = g0
        args.estimator_args["_var_g"] = var_g

        truth = np.mean(fs)

        for estimator_name in ("simple", "model_variate"):
            args.estimator = estimator_name
            trajectory = bootstrap_trajectory(fs, gs, hs, anns,
                                              get_estimator(args),
                                              args.num_epochs)
            deviation = trajectory - truth
            # Columns: mean, then the 10th/90th percentiles of the deviation
            # from truth re-centred on truth.
            summary = np.stack([
                np.mean(trajectory, 0),
                truth + np.percentile(deviation, 10, 0),
                truth + np.percentile(deviation, 90, 0),
            ]).T
            trajectories.append(report_trajectory(args, truth, summary))

    save_jsonl(args.output, trajectories)
示例#2
0
def do_data_efficiency_table(args):
    """Draw a metric-by-system matrix of data efficiencies and save it.

    Reads one JSON object per line from ``args.input``, converts the records
    with ``get_data_efficiencies`` and plots, for the prompt given by
    ``args.data_prompt``, the squared value stored for every
    (metric, system) pair.  The figure is written to ``args.output``.

    Args:
        args: namespace providing ``input``, ``output``, ``data_prompt`` and
            ``with_title``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(args.input, "rt") as f:
        data = [json.loads(line) for line in f]
    data = get_data_efficiencies(data)

    prompt = args.data_prompt
    metrics = sorted(data.keys())
    # The task is the first PROMPTS key whose value contains this prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task]

    # Rows are metrics, columns are systems; the stored values are squared.
    X = np.array([[data[metric][prompt][system]**2 for system in systems]
                  for metric in metrics])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)

    draw_matrix(X,
                with_values=True,
                x_labels=[LABELS.get(s, s) for s in systems],
                y_labels=[LABELS.get(m, m) for m in metrics],
                vmin=0.9,
                vmax=1.3)

    plt.colorbar(label="Data efficiency")
    plt.xlabel("Systems")
    plt.ylabel("Metrics")

    if args.with_title:
        plt.title(r"Data efficiencies on {} using the {} prompt".format(
            LABELS.get(task, task),
            LABELS.get(prompt, prompt),
        ),
                  fontsize=14)

    plt.tight_layout()
    plt.savefig(args.output)
示例#3
0
def do_correlation_table(args):
    """Draw a metric-by-system correlation matrix for one prompt and save it.

    Loads records from ``args.input`` via ``load_jsonl``, converts them with
    ``get_correlations`` and selects the entry for ``args.data_prompt``.
    Columns are the systems of the prompt's task followed by the extra key
    "*"; rows are the metrics in sorted order.  The figure is written to
    ``args.output``.
    """
    prompt = args.data_prompt
    with open(args.input) as f:
        records = load_jsonl(f)
    correlations = get_correlations(records)[prompt]

    metrics = sorted(correlations.keys())
    # The task is the first PROMPTS key whose value contains this prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task] + ["*"]

    rows = []
    for metric in metrics:
        rows.append([correlations[metric][system] for system in systems])
    matrix = np.array(rows)

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)

    system_labels = [LABELS.get(s, s) for s in systems]
    metric_labels = [LABELS.get(m, m) for m in metrics]
    draw_matrix(
        matrix,
        with_values=True,
        x_labels=system_labels,
        y_labels=metric_labels,
    )

    plt.colorbar(label=r"Pearson ρ")
    plt.xlabel("Systems")
    plt.ylabel("Metrics")

    if args.with_title:
        title = r"Correlations on {} using the {} prompt".format(
            LABELS.get(task, task),
            LABELS.get(prompt, prompt),
        )
        plt.title(title, fontsize=14)

    plt.tight_layout()
    plt.savefig(args.output)
示例#4
0
def do_trajectory(args):
    """Plot interval widths of estimator trajectories against sample count.

    Reads trajectory records (one JSON object per line) from ``args.input``
    and, if given, from ``args.input_gold``.  For the system, metric and
    prompt selected on ``args`` it plots, per step, column 2 minus column 1
    of each record's "summary" array for:

    * the "simple" estimator,
    * the "model_variate" estimator,
    * the "model_variate" estimator from the gold file (only when loaded),
    * the "model_variate" estimator with the metric "gold".

    The figure is written to ``args.output``.

    Raises:
        KeyError: if a required (system, metric, prompt, estimator) record
            is missing from the input.
    """

    def _load_indexed(path):
        # Read a JSONL file and key every record by the fields identifying it.
        # The context manager closes the file instead of leaking the handle.
        with open(path, "rt") as f:
            records = [json.loads(line) for line in f]
        return {(obj["system"], obj["metric"], obj["prompt"],
                 obj["estimator"]): obj
                for obj in records}

    data = _load_indexed(args.input)
    data_gold = _load_indexed(args.input_gold) if args.input_gold else None

    colors = cm.tab10.colors

    system = args.data_system
    metric = args.data_metric
    prompt = args.data_prompt

    baseline = np.array(data[system, metric, prompt, "simple"]["summary"])
    model = np.array(data[system, metric, prompt, "model_variate"]["summary"])
    # An empty gold file is treated the same as no gold file (falsy dict).
    if data_gold:
        model_gold = np.array(data_gold[system, metric, prompt,
                                        "model_variate"]["summary"])
    gold = np.array(data[system, "gold", prompt, "model_variate"]["summary"])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)

    plt.xlabel("Number of samples")
    plt.ylabel(r"80% confidence interval")
    # Each curve is the width between summary columns 1 and 2.
    plt.plot(baseline.T[2] - baseline.T[1], color=colors[0], label="Humans")
    plt.plot(model.T[2] - model.T[1],
             color=colors[1],
             label="Humans + {}".format(LABELS.get(metric, metric)))
    if data_gold:
        plt.plot(model_gold.T[2] - model_gold.T[1],
                 ':',
                 color=colors[2],
                 label="Noiseless humans + {}".format(
                     LABELS.get(metric, metric)))
    plt.plot(gold.T[2] - gold.T[1],
             ':',
             color=colors[4],
             label="Humans + perfect metric")

    plt.xlim([0, 500])
    plt.ylim([0.05, 0.2])

    plt.legend()

    if args.with_title:
        # The task is the first PROMPTS key whose value contains this prompt.
        task = first(key for key, values in PROMPTS.items()
                     if prompt in values)
        plt.title(r"{} on {} using the {} prompt".format(
            LABELS.get(system, system),
            LABELS.get(task, task),
            LABELS.get(prompt, prompt),
        ),
                  fontsize=14)

    plt.tight_layout()
    plt.savefig(args.output)
示例#5
0
def do_instance_correlation(args):
    """Plot per-instance human judgement against a metric, one row per system.

    Reads one JSON object per line from ``args.input``.  For every system of
    the task that owns ``args.data_prompt`` it collects pairs of
    (gold value clipped to [-1, 1], metric value) from the records whose
    ";"-separated "system" field contains that system, then draws a scatter
    plot with a dashed least-squares line on that system's axes.  When
    ``args.bins`` is truthy the gold values are first snapped onto
    ``args.bins`` evenly spaced points and violin plots are added.  The
    figure is written to ``args.output``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(args.input) as f:
        data = [json.loads(line) for line in f]
    prompt, metric = args.data_prompt, args.data_metric
    # The task is the first PROMPTS key whose value contains this prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task]

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)
    plt.rc("figure", figsize=(6, 8))
    colors = cm.Dark2.colors[:len(systems)]

    # NOTE(review): the grid is fixed at four rows while axes are indexed by
    # position in `systems`; more than four systems would raise IndexError.
    # Confirm every task has at most four systems.
    fig, axs = plt.subplots(4, 1, sharex=True, sharey=True)

    def _thresh(y):
        # Clip a value into [-1, 1].
        return max(min(y, 1), -1)

    # Group the (gold, metric) pairs by system.
    xy = {
        system: np.array([[
            _thresh(datum["prompts"][prompt]["gold"]),
            datum["prompts"][prompt][metric]
        ] for datum in data if system in datum["system"].split(";")])
        for system in systems
    }

    if args.bins:
        # Bin edges span the clipped gold values of all records.
        y = np.array(
            [_thresh(datum["prompts"][prompt]["gold"]) for datum in data])
        distinct_values = np.linspace(y.min(), y.max(), args.bins)
        plt.xticks(distinct_values)

        for system in systems:
            xy[system] = _snap(xy[system], distinct_values)

        for i, system in enumerate(systems):
            violinplot(axs[i], xy[system], distinct_values, colors[i])

    for i, system in enumerate(systems):
        x, y = xy[system].T[0], xy[system].T[1]
        axs[i].scatter(x, y, alpha=0.3, marker='.', color=colors[i])

        # Degree-1 least-squares fit, drawn behind the points.
        coeffs = np.polyfit(x, y, 1)
        xlim = np.array([x.min(), x.max()])
        axs[i].plot(xlim,
                    xlim * coeffs[0] + coeffs[1],
                    linestyle='--',
                    linewidth=1,
                    zorder=-1,
                    color=colors[i])

        # System name as vertical text at data coordinates (1.2, 0.5).
        axs[i].text(1.2,
                    0.5,
                    LABELS.get(system, system),
                    va='center',
                    rotation='vertical')

    plt.xlabel(r"Human judgement ({})".format(LABELS.get(prompt, prompt)))
    # One y label for the whole figure, placed in figure coordinates.
    fig.text(0.01,
             0.5,
             LABELS.get(metric, metric),
             va='center',
             rotation='vertical')

    if args.with_title:
        axs[0].set_title(r"Instance-level correlation on {}".format(
            LABELS.get(task, task), ),
                         fontsize=14)

    plt.subplots_adjust(wspace=0, hspace=0.05)

    plt.savefig(args.output)
示例#6
0
def _snap(vs, points):
    """Replace the x of every (x, y) pair by a value taken from ``points``.

    For each pair in ``vs`` the new x is whatever ``first`` yields for the
    elements of ``points`` (in iteration order) that are >= x; y is kept
    unchanged.  The pairs are returned as a NumPy array.

    NOTE(review): the behaviour when no element of ``points`` is >= x is
    determined by ``first``, which is defined outside this block.
    """
    return np.array([(first(candidate
                            for candidate in points
                            if candidate >= x), y) for x, y in vs])
示例#7
0
def do_system_correlation(args):
    """Plot system-level human judgement against a metric and save the figure.

    Reads one JSON object per line from ``args.input`` and converts the
    records with ``make_bias_table`` (requested with the keys "lr" and "ur").
    The resulting table is indexed as ``data[system][key]`` with keys
    "default", "lr" and "ur"; each entry unpacks into two 3-element
    sequences (x first, y second) whose element 0 is plotted as the point
    and whose elements 1 and 2 serve as lower and upper error-bar bounds.
    # NOTE(review): the meaning of "lr"/"ur" is defined by make_bias_table,
    # which is outside this block.

    The figure contains a dashed least-squares line through the "default"
    points (y clipped to [-1, 1] for the fit only), error bars for every
    entry, and markers for the "lr", "ur" and "default" points coloured per
    system.  It is written to ``args.output``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(args.input) as f:
        data = [json.loads(line) for line in f]
    prompt, metric = args.data_prompt, args.data_metric
    # The task is the first PROMPTS key whose value contains this prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task]

    # Group the data by system.
    data = make_bias_table(data, prompt, metric, ["lr", "ur"])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)
    plt.rc("figure", figsize=(8, 6))
    colors = cm.Dark2.colors[:len(systems)]

    def _thresh(y):
        # Clip a value into [-1, 1].
        return max(min(y, 1), -1)

    # 0. Plot the xy correlation line through the "default" points.
    xy = np.array([[x, _thresh(y)] for system in systems
                   for (x, *_), (y, *_) in [data[system]["default"]]])
    xlim = np.array([xy.T[0].min(), xy.T[0].max()])
    coeffs = np.polyfit(xy.T[0], xy.T[1], 1)
    plt.plot(xlim,
             xlim * coeffs[0] + coeffs[1],
             linestyle='--',
             linewidth=2,
             zorder=-1)

    # 1. Plot error bars for every entry of every system (no markers);
    #    the y values are not clipped here.
    xy = np.array([[x, y] for system in systems
                   for (x, *_), (y, *_) in data[system].values()])
    xy_l = np.array([[x, y] for system in systems
                     for (_, x, _), (_, y, _) in data[system].values()])
    xy_u = np.array([[x, y] for system in systems
                     for (_, _, x), (_, _, y) in data[system].values()])
    plt.errorbar(xy.T[0],
                 xy.T[1],
                 xerr=[(xy - xy_l).T[0], (xy_u - xy).T[0]],
                 yerr=[(xy - xy_l).T[1], (xy_u - xy).T[1]],
                 capsize=2,
                 alpha=0.5,
                 linestyle='',
                 marker="",
                 zorder=-1)

    # 2. Plot markers, one per system and key, in the system's colour.
    xy = np.array([[x, y] for system in systems
                   for (x, *_), (y, *_) in [data[system]["default"]]])
    xy_lr = np.array([[x, y] for system in systems
                      for (x, *_), (y, *_) in [data[system]["lr"]]])
    xy_ur = np.array([[x, y] for system in systems
                      for (x, *_), (y, *_) in [data[system]["ur"]]])

    plt.scatter(xy_lr.T[0], xy_lr.T[1], color=colors, marker=">")
    plt.scatter(xy_ur.T[0], xy_ur.T[1], color=colors, marker="^")
    plt.scatter(xy.T[0], xy.T[1], 100, c=colors, marker="o")
    plt.xlabel(r"Human judgement ({})".format(LABELS.get(prompt, prompt)))
    plt.ylabel(LABELS.get(metric, metric))

    if args.with_title:
        plt.title(r"System-level correlation on {}".format(
            LABELS.get(task, task), ),
                  fontsize=14)

    plt.tight_layout()

    # The legend maps colours to systems via patches, not the plotted artists.
    plt.legend(handles=[
        mp.Patch(color=colors[i], label=LABELS.get(system, system))
        for i, system in enumerate(systems)
    ])

    plt.savefig(args.output)