def get_dataset(data_config, device="cpu"):
    """Take a data config and return (datasets, metadata): TensorDatasets for the
    train/valid/test splits plus a dict describing input shape and label/attribute types."""
    dataname = data_config.name
    val_size = data_config.val_size
    if dataname == "adult":
        data = load_adult(val_size=val_size)
        c_size = 2
        c_type = "binary"
    elif dataname == "health":
        data = load_health(val_size=val_size)
        c_size = 9
        c_type = "one_hot"
    else:
        logger.error(f"Invalid data name {dataname} specified")
        raise Exception(f"Invalid data name {dataname} specified")

    train, valid, test = data["train"], data["valid"], data["test"]
    # fall back to the test split when no validation split was requested
    if valid is None:
        valid = data["test"]

    # each split is a tuple (x, c, y): features, sensitive attribute, label
    return (
        Box({
            "train": TensorDataset(
                torch.tensor(train[0]).float().to(device),
                torch.tensor(train[1]).long().to(device),
                torch.tensor(train[2]).long().to(device),
            ),
            "test": TensorDataset(
                torch.tensor(test[0]).float().to(device),
                torch.tensor(test[1]).long().to(device),
                torch.tensor(test[2]).long().to(device),
            ),
            "valid": TensorDataset(
                torch.tensor(valid[0]).float().to(device),
                torch.tensor(valid[1]).long().to(device),
                torch.tensor(valid[2]).long().to(device),
            ),
        }),
        {
            "input_shape": train[0].shape[1:],
            "c_size": c_size,
            "c_type": c_type,
            "y_size": 2,
            "y_type": "binary",
        },
    )
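# Usage sketch (not part of the original code): a minimal, hypothetical example of
# calling get_dataset. The Box config with `name` and `val_size` matches the fields the
# function reads above; the DataLoader wrapping and batch size are illustrative only.
def _example_get_dataset_usage():
    from box import Box
    from torch.utils.data import DataLoader

    config = Box({"name": "adult", "val_size": 0.2})  # hypothetical config
    datasets, meta = get_dataset(config, device="cpu")

    # each split is a TensorDataset of (x, c, y): features, sensitive attribute, label
    train_loader = DataLoader(datasets.train, batch_size=128, shuffle=True)
    return train_loader, meta["input_shape"], meta["c_size"]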
    # (excerpt: inside get_optimal_front's loop over candidate dp values)
        # pairwise demographic parity constraints between sensitive groups i and j
        for i in range(p_y_c.shape[1]):
            for j in range(i + 1, p_y_c.shape[1]):
                constraints.extend([
                    -dp <= (p_y_c[1, i] - delta[i]) / p_c[i]
                    - (p_y_c[1, j] - delta[j]) / p_c[j],
                    (p_y_c[1, i] - delta[i]) / p_c[i]
                    - (p_y_c[1, j] - delta[j]) / p_c[j] <= dp,
                ])

        prob = cvxpy.Problem(objective, constraints)
        result = prob.solve()
        solution.append([result, dp])
        print(f"DP: {dp}, sol : {result}")
    return solution


if __name__ == "__main__":
    for data in ["adult", "health"]:
        # compute the ideal front for each dataset
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]
        solution = get_optimal_front(Y, C)
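# Self-contained sketch (not the repository's implementation) of the LP traced above:
# for each demographic-parity budget dp, find the best accuracy reachable by moving a
# mass delta[c] of positive predictions in each sensitive group. The pairwise
# constraints mirror the ones built above; the objective (maximize 1 minus the total
# flipped probability mass) is an assumption made here for illustration.
def _optimal_front_sketch(Y, C, dp_grid=(0.0, 0.05, 0.1, 0.15, 0.2)):
    import cvxpy
    import numpy

    groups = numpy.unique(C)
    p_c = numpy.array([(C == c).mean() for c in groups])
    # p_y_c[y, c] = p(Y = y, C = c)
    p_y_c = numpy.array([[numpy.logical_and(Y == y, C == c).mean() for c in groups]
                         for y in (0, 1)])

    front = []
    for dp in dp_grid:
        delta = cvxpy.Variable(len(groups))
        # assumed objective: accuracy = 1 - total mass of flipped predictions
        objective = cvxpy.Maximize(1 - cvxpy.sum(cvxpy.abs(delta)))
        constraints = []
        for i in range(len(groups)):
            for j in range(i + 1, len(groups)):
                rate_i = (p_y_c[1, i] - delta[i]) / p_c[i]  # p(yhat=1 | c=i)
                rate_j = (p_y_c[1, j] - delta[j]) / p_c[j]
                constraints += [rate_i - rate_j <= dp, rate_j - rate_i <= dp]
        acc = cvxpy.Problem(objective, constraints).solve()
        front.append([acc, dp])
    return front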
def area_over_curve_lp():
    def compute_ideal_area(Y, C):
        len_c = len(numpy.unique(C))
        len_y = len(numpy.unique(Y))
        p_y_c = numpy.zeros((len_y, len_c))
        for c in range(len_c):
            for y in range(len_y):
                p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
        print(p_y_c)

        # compute desired rate, i.e. p(y=1|C=c)
        desired_rate = p_y_c[1, :].mean()
        errors = p_y_c[1, :] - desired_rate
        majority_acc = max(numpy.mean(Y == 1), 1 - numpy.mean(Y == 1))
        max_dp = demographic_parity_difference(Y, Y, sensitive_features=C)

        solution = get_optimal_front(Y, C)
        # add the zero-error point at max_dp to the solution
        solution.append([1, max_dp])
        solution = numpy.array(solution)
        # sort by dp
        solution = solution[solution[:, 1].argsort()]
        # rectangle sum: (acc - majority_acc) * (dp_next - dp_cur)
        area = numpy.sum(
            (solution[:-1, 0] - majority_acc) * (solution[1:, 1] - solution[:-1, 1]))
        return area, majority_acc, max_dp

    # methods to compare
    methods = [
        "fcrl", "cvib_supervised", "lag-fairness", "maxent_arl", "laftr",
        "adv_forgetting"
    ]

    # compute AUC table
    area = {}
    for data in ["adult", "health"]:
        # compute ideal areas
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        norm_area, majority_acc, max_dp = compute_ideal_area(Y, C)

        area[data] = {}
        for idx, key in enumerate([
                "nn_1_layer", "nn_2_layer", "random_forest", "svm",
                "logistic_regression"
        ]):
            area[data][key] = {}
            for m in methods:
                if data == "health" and m == "laftr":
                    continue
                t = numpy.load(f"result/eval/{data}/{m}.npy",
                               allow_pickle=True).item()
                df = get_dataframe_from_results(t)

                # get pareto front
                pareto = df[[f'{key}_normalized_acc',
                             f'{key}_normalized_dp']].values
                # drop nan
                pareto = pareto[~numpy.isnan(pareto).any(axis=1)]
                pareto = get_pareto_front(pareto)
                pareto = numpy.array(pareto)
                pareto = pareto[pareto[:, 1].argsort()]

                # reject points that have more dp than the data itself
                THRESH = 1.0
                idx = pareto.shape[0]
                while idx > -1:
                    if pareto[idx - 1, 1] > THRESH * max_dp:
                        idx = idx - 1
                    else:
                        break
                pareto = pareto[:idx]
                if idx == -1:
                    area[data][key][m] = 0
                    print(f"No point found below dp_max for {m}, {data}")
                    continue

                # add (majority_acc, 0) as a reference point to anchor the horizontal bars,
                # and (pareto[-1, 0], max_dp), i.e. the max acc achievable at the data's dp
                pareto = numpy.concatenate(
                    [[[majority_acc, 0]], pareto, [[pareto[-1, 0], max_dp]]],
                    axis=0)

                # get area by summing rectangles: (acc - reference acc) * (dp_next - dp_cur)
                area[data][key][m] = numpy.sum(
                    (pareto[:-1, 0] - pareto[0, 0]) *
                    (pareto[1:, 1] - pareto[:-1, 1]))
                # normalize by the ideal area
                area[data][key][m] /= norm_area

    # dump to table
    for idx, key in enumerate([
            "nn_1_layer", "nn_2_layer", "random_forest", "svm",
            "logistic_regression"
    ]):
        table = Texttable()
        table.set_cols_align(["l", "c", "c"])
        table.header(["Method", "UCI Adult", "Heritage Health"])
        for m in methods:
            if m == "fcrl":
                table.add_row([
                    "FCRL (Ours)", area["adult"][key][m], area["health"][key][m]
                ])
            if m == "lag-fairness":
                table.add_row(
                    ["MIFR", area["adult"][key][m], area["health"][key][m]])
            if m == "maxent_arl":
                table.add_row([
                    "MaxEnt-ARL", area["adult"][key][m], area["health"][key][m]
                ])
            if m == "cvib_supervised":
                table.add_row(
                    ["CVIB", area["adult"][key][m], area["health"][key][m]])
            if m == "laftr":
                table.add_row(["LAFTR", area["adult"][key][m], "N/A"])
            if m == "adv_forgetting":
                table.add_row([
                    "Adversarial Forgetting", area["adult"][key][m],
                    area["health"][key][m]
                ])

        os_utils.safe_makedirs(os.path.join(FIGURES_FOLDER, "table"))
        with open(os.path.join(FIGURES_FOLDER, "table", f"{key}.better.tex"),
                  'w') as f:
            f.write(
                latextable.draw_latex(
                    table,
                    caption="Area Over Parity Accuracy Curve",
                    label=f"AOPAC_{key}"))
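# Toy worked example (hypothetical numbers) of the rectangle sum used above: the area
# over the parity-accuracy curve accumulates (acc - reference acc) times the gap to the
# next dp value, over points sorted by dp.
def _area_toy_example():
    import numpy

    # rows are (accuracy, dp), sorted by dp; the first row is the majority-accuracy
    # reference at dp = 0, the last row extends the best accuracy out to the data's max dp
    pareto = numpy.array([
        [0.76, 0.00],
        [0.80, 0.05],
        [0.84, 0.10],
        [0.84, 0.19],
    ])
    area = numpy.sum((pareto[:-1, 0] - pareto[0, 0]) *
                     (pareto[1:, 1] - pareto[:-1, 1]))
    # widths (0.00, 0.04, 0.08) x heights (0.05, 0.05, 0.09) -> 0.0092
    return area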
def figure9():
    def compute_ideal_stats(Y, C):
        len_c = len(numpy.unique(C))
        len_y = len(numpy.unique(Y))
        p_y_c = numpy.zeros((len_y, len_c))
        for c in range(len_c):
            for y in range(len_y):
                p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
        print(p_y_c)

        # compute desired rate, i.e. p(y=1|C=c)
        desired_rate = p_y_c[1, :].mean()
        errors = p_y_c[1, :] - desired_rate
        majority_acc = max(numpy.mean(Y == 1), 1 - numpy.mean(Y == 1))
        max_dp = demographic_parity_difference(Y, Y, sensitive_features=C)
        return 0, majority_acc, max_dp

    # temporarily bump font sizes; the originals are restored at the end
    fontsize = pyplot.rcParams.get("font.size")
    xlabelsize = pyplot.rcParams.get("xtick.labelsize")
    ylabelsize = pyplot.rcParams.get("ytick.labelsize")
    labelsize = pyplot.rcParams.get("axes.labelsize")
    titlesize = pyplot.rcParams.get("axes.titlesize")
    pyplot.rcParams.update({
        "font.size": 12,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "axes.labelsize": 16,
        "axes.titlesize": 20
    })

    for data in ["adult", "health"]:
        # compute ideal stats
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        _, RANDOM_ACC, MAX_DP = compute_ideal_stats(Y, C)

        t = numpy.load(f"result/eval/{data}/fcrl.npy", allow_pickle=True).item()
        df = get_dataframe_from_results(t)

        for idx, key in enumerate(["nn_1_layer"]):
            figure = pyplot.figure(figsize=(16, 8))
            ax = figure.add_subplot(1, 1, 1)

            pareto = get_pareto_front(
                df[[f'{key}_normalized_acc', f'{key}_normalized_dp']].values)
            pareto = numpy.array(pareto)
            pareto = pareto[pareto[:, 1].argsort()]

            # plot all the points
            df.plot(kind="scatter",
                    x=f'{key}_normalized_acc',
                    y=f'{key}_normalized_dp',
                    c="none",
                    edgecolors=COLOR[0],
                    linewidth=2,
                    marker=MARKER[0],
                    ax=ax,
                    s=SCATTER_MARKERSIZE,
                    label='All Models')
            ax.scatter(pareto[:, 0],
                       pareto[:, 1],
                       label="Pareto Front",
                       c="none",
                       edgecolors=COLOR[1],
                       linewidth=2,
                       marker=MARKER[1],
                       s=SCATTER_MARKERSIZE)

            # create bars
            ax.barh(y=pareto[:-1, 1],
                    width=pareto[:-1, 0] - RANDOM_ACC,
                    height=pareto[1:, 1] - pareto[:-1, 1],
                    left=RANDOM_ACC,
                    color="yellow",
                    alpha=0.2,
                    align="edge",
                    edgecolor="red")
            ax.barh(y=pareto[-1, 1],
                    height=MAX_DP - pareto[-1, 1],
                    width=pareto[-1, 0] - RANDOM_ACC,
                    left=RANDOM_ACC,
                    color="yellow",
                    alpha=0.2,
                    align="edge",
                    edgecolor="red")

            # ideal plot
            ax.plot([1, 1], [MAX_DP, 0], color="red", label="Ideal")

            # ideal plot, but from the LP front
            solution = get_optimal_front(Y, C)
            solution.append([1, MAX_DP])
            solution = numpy.array(solution)
            solution = solution[solution[:, 1].argsort()]
            ax.plot(solution[:, 0],
                    solution[:, 1],
                    color="cyan",
                    label="Ideal (LP)")

            # bounding box
            ax.plot([RANDOM_ACC, RANDOM_ACC], [0, MAX_DP],
                    color="gray",
                    linestyle="--")
            ax.plot([RANDOM_ACC, 1], [0, 0], color="gray", linestyle="--")
            ax.plot([RANDOM_ACC, 1], [MAX_DP, MAX_DP],
                    color="gray",
                    linestyle="--")

            ax.set_xlabel("Accuracy")
            ax.set_ylabel(r"$\Delta_{DP}$")
            # ax.set_title(
            #     "Acc Vs $\Delta_{DP}$" + f" ({'UCI Adult' if data == 'adult' else 'Heritage Health'})")
            ax.legend()
            ax.set_xlim(left=RANDOM_ACC - 0.005, right=1.005)

            os_utils.safe_makedirs(os.path.join(FIGURES_FOLDER, "appendix"))
            pyplot.savefig(os.path.join(FIGURES_FOLDER, "appendix",
                                        f"pareto_{data}_{key}.{FORMAT}"),
                           bbox_inches='tight')
            pyplot.close()

    # put values back
    pyplot.rcParams.update({
        "font.size": fontsize,
        "xtick.labelsize": xlabelsize,
        "ytick.labelsize": ylabelsize,
        "axes.labelsize": labelsize,
        "axes.titlesize": titlesize
    })
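# Design note: the manual save/update/restore of rcParams in figure9 could also be
# written with matplotlib's rc_context, which restores the previous values
# automatically even if the plotting code raises. A minimal sketch (the figure-building
# code itself is elided):
def _figure9_with_rc_context():
    from matplotlib import pyplot

    with pyplot.rc_context({
            "font.size": 12,
            "xtick.labelsize": 16,
            "ytick.labelsize": 16,
            "axes.labelsize": 16,
            "axes.titlesize": 20,
    }):
        pass  # ... build and save the figures here ...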
import numpy as np

from src.common.data.adult import load_adult

if __name__ == "__main__":
    data = load_adult(val_size=0)
    f_out_np = 'laftr/data/adult/adult.npz'

    train = data["train"]
    test = data["test"]

    D = {"training": {}, "test": {}}
    D["training"]["X"] = train[0]
    D["training"]["Y"] = train[2]
    D["training"]["A"] = train[1]
    D["test"]["X"] = test[0]
    D["test"]["Y"] = test[2]
    D["test"]["A"] = test[1]

    # Since we don't want to use the validation strategy, this split could be made as
    # small as possible so that nearly all of the training data is used; it should not
    # matter much.
    n = D['training']['X'].shape[0]
    shuf = np.random.permutation(n)
    valid_pct = 0.2
    valid_ct = int(n * valid_pct)
    valid_inds = shuf[:valid_ct]
    train_inds = shuf[valid_ct:]

    np.savez(f_out_np,
             x_train=D['training']['X'],
             x_test=D['test']['X'],
             y_train=D['training']['Y'],
             y_test=D['test']['Y'],
             attr_train=D['training']['A'],
             attr_test=D['test']['A'],
             train_inds=train_inds,
             valid_inds=valid_inds)
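# Read-back sketch (not part of the conversion script): the field names match what is
# saved above; how LAFTR itself consumes train_inds / valid_inds is an assumption here.
def _load_adult_npz(path='laftr/data/adult/adult.npz'):
    d = np.load(path)
    x_train, y_train, a_train = d['x_train'], d['y_train'], d['attr_train']
    # recover the train/validation split from the stored index arrays
    x_tr, x_va = x_train[d['train_inds']], x_train[d['valid_inds']]
    return (x_tr, x_va), (y_train, a_train), (d['x_test'], d['y_test'], d['attr_test'])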