def do_build_table(args):
    """Build a table of bootstrapped estimator trajectories and save it as JSONL.

    For every (metric, prompt, system) combination found in ``args.input``
    (excluding the 'human' metric and the 'reference' system), the data is
    projected with ``apply_transforms`` and both the "simple" and the
    "model_variate" estimators are run through ``bootstrap_trajectory``.
    Each trajectory is summarised as (mean, lower, upper), where the bounds
    are the 10th and 90th percentiles of the deviation from the truth,
    re-centred on the truth.

    Reads from ``args``: ``input``, ``input_means``, ``estimator_args``,
    ``num_epochs``, ``output``.
    Mutates ``args``: ``estimator_args``, ``data_prompt``, ``data_metric``,
    ``data_system`` and ``estimator`` are overwritten while iterating.
    """
    args.estimator_args = dict(args.estimator_args or [])

    data = load_jsonl(args.input)
    data_means = load_jsonl(args.input_means) if args.input_means else None
    metric_ss = get_metric_ss(data_means) if data_means else {}

    # The first record is used to discover which prompts and metrics exist.
    prompts = list(first(data)["prompts"])
    metrics = list(first(first(data)["prompts"].values()))
    metrics.remove('human')
    systems = sorted({datum["system"] for datum in data})
    systems.remove('reference')

    def _summarise(samples, truth):
        """Return rows of (mean, 10th-percentile bound, 90th-percentile bound)."""
        deviation = samples - truth
        return np.stack([
            np.mean(samples, 0),
            truth + np.percentile(deviation, 10, 0),
            truth + np.percentile(deviation, 90, 0),
        ]).T

    # NOTE: to restrict the table to the gold metric only, iterate over
    # ["gold"] instead of `metrics` below.
    settings = [(metric, prompt, system)
                for metric in metrics
                for prompt in prompts
                for system in systems]

    rows = []
    for metric, prompt, system in tqdm(settings, desc="settings"):
        args.data_prompt = prompt
        args.data_metric = metric
        args.data_system = system

        # Project the data for this setting.
        fs, gs, hs, anns = apply_transforms(args, data)
        # The fs returned above is deliberately replaced by the per-item mean
        # of hs (flagged "FIXES BUG" by the original author).
        fs = np.array([np.mean(h) for h in hs])

        # Use precomputed metric statistics when available, otherwise
        # estimate them from the projected metric values.
        if metric in metric_ss:
            g0, var_g = metric_ss[metric][prompt][system]
        else:
            g0, var_g = np.mean(gs), np.var(gs)
        args.estimator_args["_g0"] = g0
        args.estimator_args["_var_g"] = var_g

        truth = np.mean(fs)
        for estimator_name in ("simple", "model_variate"):
            args.estimator = estimator_name
            trajectory = bootstrap_trajectory(fs, gs, hs, anns,
                                              get_estimator(args),
                                              args.num_epochs)
            rows.append(
                report_trajectory(args, truth, _summarise(trajectory, truth)))

    save_jsonl(args.output, rows)
def do_data_efficiency_table(args):
    """Draw a metrics-by-systems matrix of data efficiencies for one prompt.

    Reads JSON lines from ``args.input``, converts them with
    ``get_data_efficiencies`` and plots the squared value for every
    (metric, system) pair under ``args.data_prompt``. The figure is written
    to ``args.output``; a title is added when ``args.with_title`` is set.
    """
    # Read under a context manager so the file handle is closed promptly
    # (the bare open() inside a comprehension leaked it).
    with open(args.input, "rt") as f:
        records = [json.loads(line) for line in f]
    data = get_data_efficiencies(records)

    prompt = args.data_prompt
    metrics = sorted(data)
    # The task is the PROMPTS key whose value collection contains the prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task]

    X = np.array([[data[metric][prompt][system]**2 for system in systems]
                  for metric in metrics])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)
    draw_matrix(
        X,
        with_values=True,
        x_labels=[LABELS.get(s, s) for s in systems],
        y_labels=[LABELS.get(m, m) for m in metrics],
        vmin=0.9,
        vmax=1.3,
    )

    plt.colorbar(label="Data efficiency")
    plt.xlabel("Systems")
    plt.ylabel("Metrics")

    if args.with_title:
        plt.title(r"Data efficiencies on {} using the {} prompt".format(
            LABELS.get(task, task),
            LABELS.get(prompt, prompt),
        ), fontsize=14)

    plt.tight_layout()
    plt.savefig(args.output)
def do_correlation_table(args):
    """Draw a metrics-by-systems matrix of correlations for one prompt.

    Loads records from ``args.input``, converts them with
    ``get_correlations`` and selects the entry for ``args.data_prompt``.
    The columns are the task's systems followed by the "*" entry. The
    figure is written to ``args.output``; a title is added when
    ``args.with_title`` is set.
    """
    prompt = args.data_prompt

    with open(args.input) as f:
        records = load_jsonl(f)
    correlations = get_correlations(records)[prompt]

    metrics = sorted(correlations)
    # The task is the PROMPTS key whose value collection contains the prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    columns = SYSTEMS[task] + ["*"]

    X = np.array([[correlations[metric][column] for column in columns]
                  for metric in metrics])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)

    x_labels = [LABELS.get(s, s) for s in columns]
    y_labels = [LABELS.get(m, m) for m in metrics]
    draw_matrix(X, with_values=True, x_labels=x_labels, y_labels=y_labels)

    plt.colorbar(label=r"Pearson ρ")
    plt.xlabel("Systems")
    plt.ylabel("Metrics")

    if args.with_title:
        title = r"Correlations on {} using the {} prompt".format(
            LABELS.get(task, task),
            LABELS.get(prompt, prompt),
        )
        plt.title(title, fontsize=14)

    plt.tight_layout()
    plt.savefig(args.output)
def do_trajectory(args):
    """Plot interval width against sample count for one system/metric/prompt.

    Reads trajectory records (JSON lines) from ``args.input`` and, when
    given, ``args.input_gold``. Records are indexed by
    (system, metric, prompt, estimator). For each plotted series the curve
    is column 2 minus column 1 of the record's "summary", labelled as the
    80% confidence interval.
    NOTE(review): presumably the summary rows are (mean, lower, upper) as
    produced by do_build_table -- confirm.

    The figure is written to ``args.output``; a title is added when
    ``args.with_title`` is set.
    """

    def _load_indexed(path):
        """Load JSON lines from path, keyed by (system, metric, prompt, estimator)."""
        # A context manager closes the handle; the original bare open()
        # inside a comprehension leaked it.
        with open(path, "rt") as f:
            return {(obj["system"], obj["metric"], obj["prompt"],
                     obj["estimator"]): obj
                    for obj in map(json.loads, f)}

    data = _load_indexed(args.input)
    data_gold = _load_indexed(args.input_gold) if args.input_gold else None

    colors = cm.tab10.colors
    system = args.data_system
    metric = args.data_metric
    prompt = args.data_prompt

    baseline = np.array(data[system, metric, prompt, "simple"]["summary"])
    model = np.array(data[system, metric, prompt, "model_variate"]["summary"])
    if data_gold:
        model_gold = np.array(
            data_gold[system, metric, prompt, "model_variate"]["summary"])
        # NOTE(review): the "gold" metric series is read from `data`, not
        # `data_gold` -- kept as in the original; confirm this is intended.
        gold = np.array(
            data[system, "gold", prompt, "model_variate"]["summary"])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)

    plt.xlabel("Number of samples")
    plt.ylabel(r"80% confidence interval")
    plt.plot(baseline.T[2] - baseline.T[1], color=colors[0], label="Humans")
    plt.plot(model.T[2] - model.T[1],
             color=colors[1],
             label="Humans + {}".format(LABELS.get(metric, metric)))
    if data_gold:
        plt.plot(model_gold.T[2] - model_gold.T[1],
                 ':',
                 color=colors[2],
                 label="Noiseless humans + {}".format(
                     LABELS.get(metric, metric)))
        plt.plot(gold.T[2] - gold.T[1],
                 ':',
                 color=colors[4],
                 label="Humans + perfect metric")

    plt.xlim([0, 500])
    plt.ylim([0.05, 0.2])
    plt.legend()

    if args.with_title:
        # The task is the PROMPTS key whose value collection contains the prompt.
        task = first(
            key for key, values in PROMPTS.items() if prompt in values)
        plt.title(r"{} on {} using the {} prompt".format(
            LABELS.get(system, system),
            LABELS.get(task, task),
            LABELS.get(prompt, prompt),
        ), fontsize=14)

    plt.tight_layout()
    plt.savefig(args.output)
def do_instance_correlation(args):
    """Plot per-instance human judgement against a metric, one panel per system.

    Reads JSON lines from ``args.input``. For each system of the task that
    owns ``args.data_prompt``, collects (gold, metric) pairs from records
    whose ";"-separated "system" field contains that system, with the gold
    value clamped to [-1, 1]. Each panel gets a scatter plot and a
    least-squares line; when ``args.bins`` is set the x values are snapped
    onto ``args.bins`` evenly spaced points and violin plots are added.

    The figure is written to ``args.output``; a title is added when
    ``args.with_title`` is set.
    """
    # Read under a context manager so the file handle is closed promptly.
    with open(args.input) as f:
        data = [json.loads(line) for line in f]

    prompt, metric = args.data_prompt, args.data_metric
    # The task is the PROMPTS key whose value collection contains the prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task]

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)
    plt.rc("figure", figsize=(6, 8))

    colors = cm.Dark2.colors[:len(systems)]

    # NOTE(review): the grid is fixed at 4 rows; a task with more than four
    # systems would index past `axs` below -- confirm all tasks have <= 4.
    fig, axs = plt.subplots(4, 1, sharex=True, sharey=True)

    def _thresh(y):
        """Clamp y to the interval [-1, 1]."""
        return max(min(y, 1), -1)

    # Group the (gold, metric) pairs by system.
    xy = {
        system: np.array([[
            _thresh(datum["prompts"][prompt]["gold"]),
            datum["prompts"][prompt][metric]
        ] for datum in data if system in datum["system"].split(";")])
        for system in systems
    }

    if args.bins:
        # 1. Snap the x values onto evenly spaced bin positions.
        y = np.array(
            [_thresh(datum["prompts"][prompt]["gold"]) for datum in data])
        distinct_values = np.linspace(y.min(), y.max(), args.bins)
        plt.xticks(distinct_values)

        for system in systems:
            xy[system] = _snap(xy[system], distinct_values)

        # 2. Make violin plots. These need `distinct_values`, which only
        # exists in this branch, so they are drawn only when binning.
        for i, system in enumerate(systems):
            violinplot(axs[i], xy[system], distinct_values, colors[i])

    for i, system in enumerate(systems):
        x, y = xy[system].T[0], xy[system].T[1]

        # 3. Raw points.
        axs[i].scatter(x, y, alpha=0.3, marker='.', color=colors[i])

        # 4. Least-squares line spanning the observed x range.
        coeffs = np.polyfit(x, y, 1)
        xlim = np.array([x.min(), x.max()])
        axs[i].plot(xlim,
                    xlim * coeffs[0] + coeffs[1],
                    linestyle='--',
                    linewidth=1,
                    zorder=-1,
                    color=colors[i])

        # 5. System name to the right of the panel.
        axs[i].text(1.2,
                    0.5,
                    LABELS.get(system, system),
                    va='center',
                    rotation='vertical')

    plt.xlabel(r"Human judgement ({})".format(LABELS.get(prompt, prompt)))
    # A single figure-level label for the metric, shared by all panels.
    fig.text(0.01,
             0.5,
             LABELS.get(metric, metric),
             va='center',
             rotation='vertical')

    if args.with_title:
        axs[0].set_title(r"Instance-level correlation on {}".format(
            LABELS.get(task, task),
        ), fontsize=14)

    plt.subplots_adjust(wspace=0, hspace=0.05)
    plt.savefig(args.output)
def _snap(vs, points):
    """Snap the x of every (x, y) pair in vs up onto the grid `points`.

    Each x is replaced by the first element of `points` (in iteration
    order) that is >= x; y is left untouched. The result is returned as a
    numpy array of the snapped pairs.

    NOTE(review): when no element of `points` is >= x the outcome depends
    on how `first` treats an empty iterable -- confirm.
    """
    snapped = [(first(p for p in points if p >= x), y) for x, y in vs]
    return np.array(snapped)
def do_system_correlation(args):
    """Plot system-level human judgement against a metric with error bars.

    Reads JSON lines from ``args.input`` and builds a per-system table with
    ``make_bias_table(data, prompt, metric, ["lr", "ur"])``. Each table
    entry is unpacked as a pair of 3-tuples, one for x and one for y.
    NOTE(review): the tuples are used as (value, lower, upper) -- confirm
    against make_bias_table.

    Draws a least-squares line through the "default" points (y clamped to
    [-1, 1] for the fit only), error bars for every entry, and markers for
    the "lr", "ur" and "default" entries coloured per system.

    The figure is written to ``args.output``; a title is added when
    ``args.with_title`` is set.
    """
    # Read under a context manager so the file handle is closed promptly.
    with open(args.input) as f:
        records = [json.loads(line) for line in f]

    prompt, metric = args.data_prompt, args.data_metric
    # The task is the PROMPTS key whose value collection contains the prompt.
    task = first(key for key, values in PROMPTS.items() if prompt in values)
    systems = SYSTEMS[task]

    # Group the data by system.
    data = make_bias_table(records, prompt, metric, ["lr", "ur"])

    plt.rc("font", size=16)
    plt.rc("text", usetex=False)
    plt.rc("figure", figsize=(8, 6))

    colors = cm.Dark2.colors[:len(systems)]

    def _thresh(y):
        """Clamp y to the interval [-1, 1]."""
        return max(min(y, 1), -1)

    # 0. Plot the xy correlation curve through the default points.
    xy = np.array([[x, _thresh(y)]
                   for system in systems
                   for (x, *_), (y, *_) in [data[system]["default"]]])
    xlim = np.array([xy.T[0].min(), xy.T[0].max()])
    coeffs = np.polyfit(xy.T[0], xy.T[1], 1)
    plt.plot(xlim,
             xlim * coeffs[0] + coeffs[1],
             linestyle='--',
             linewidth=2,
             zorder=-1)

    # 1. Plot actual data points with error bars (every entry per system).
    xy = np.array([[x, y]
                   for system in systems
                   for (x, *_), (y, *_) in data[system].values()])
    xy_l = np.array([[x, y]
                     for system in systems
                     for (_, x, _), (_, y, _) in data[system].values()])
    xy_u = np.array([[x, y]
                     for system in systems
                     for (_, _, x), (_, _, y) in data[system].values()])
    plt.errorbar(xy.T[0],
                 xy.T[1],
                 xerr=[(xy - xy_l).T[0], (xy_u - xy).T[0]],
                 yerr=[(xy - xy_l).T[1], (xy_u - xy).T[1]],
                 capsize=2,
                 alpha=0.5,
                 linestyle='',
                 marker="",
                 zorder=-1)

    # 2. Plot markers, one per system for each of the three entries.
    xy = np.array([[x, y]
                   for system in systems
                   for (x, *_), (y, *_) in [data[system]["default"]]])
    xy_lr = np.array([[x, y]
                      for system in systems
                      for (x, *_), (y, *_) in [data[system]["lr"]]])
    xy_ur = np.array([[x, y]
                      for system in systems
                      for (x, *_), (y, *_) in [data[system]["ur"]]])

    plt.scatter(xy_lr.T[0], xy_lr.T[1], color=colors, marker=">")
    plt.scatter(xy_ur.T[0], xy_ur.T[1], color=colors, marker="^")
    plt.scatter(xy.T[0], xy.T[1], 100, c=colors, marker="o")

    plt.xlabel(r"Human judgement ({})".format(LABELS.get(prompt, prompt)))
    plt.ylabel(LABELS.get(metric, metric))

    if args.with_title:
        plt.title(r"System-level correlation on {}".format(
            LABELS.get(task, task),
        ), fontsize=14)

    plt.tight_layout()
    plt.legend(handles=[
        mp.Patch(color=colors[i], label=LABELS.get(system, system))
        for i, system in enumerate(systems)
    ])

    plt.savefig(args.output)