def get_counterfactual_mp(args):
    """Generate counterfactual data for MEPS (special case, model "m1" only).

    The counterfactual outcome is not derived from regression shifts here; it
    is read directly from the pre-computed "R<ri>_y_counter.csv" parameter
    file and attached to the raw data as column "Y_count".

    Side effects: writes "R<ri>_count.csv" under counterfactual_data for each
    run; terminates the process on a bad configuration or on a row-count
    mismatch against args.val_n.
    """
    for ri in range(1, args.counter_run + 1):
        # FIX: was `args.src_data.strip("None")`, which strips the characters
        # N/o/n/e from both ends (e.g. "None_x" -> "_x") and raises
        # AttributeError on an actual None value; an explicit comparison
        # expresses the intended "provided and not the literal string 'None'".
        if args.src_data and args.src_data != "None":
            cur_df = pd.read_csv(args.src_data + ".csv")
        else:
            print("Need to specify the raw real data!!")
            exit()
        data_name = os.path.join(
            args.repo_dir, "parameter_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(ri)
        # get y shift
        if args.model_flag != "m1":
            print("Only support m1 model for MEPS as a special case!")
            exit()
        else:
            y_res = pd.read_csv(data_name + "_y_counter.csv")
            # assumes y_counter rows are positionally aligned with cur_df
            # rows — TODO confirm against the parameter-generation step
            cur_df["Y_count"] = y_res["y_counter"]
        output_f = os.path.join(
            args.repo_dir, "counterfactual_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(
            ri) + "_count.csv"
        writeToCSV(output_f, cur_df)
        # sanity check: every run must contain exactly args.val_n rows
        if cur_df.shape[0] != args.val_n:
            print("Error !!!!!", ri)
            exit()
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")
def get_counterfactual_data_syn(args):
    """Build counterfactual columns for the synthetic data.

    For every run, group intercept estimates are read from the fitted
    parameter files and each non-reference group is shifted onto the
    reference group ``args.counter_g``.  Models "m2"/"m4" additionally shift
    the mediator X and keep a "resolving" outcome based on the observed X.
    Writes "R<run>_count.csv" per run; exits on a row-count mismatch.
    """

    def lookup_estimate(res, label):
        # first "Estimate" value whose row label (column "Unnamed: 0") matches
        return res[res["Unnamed: 0"] == label]["Estimate"].values[0]

    for run_id in range(1, args.counter_run + 1):
        frame = pd.read_csv(
            os.path.join(args.repo_dir, args.data_dir) + "/" + args.data_flag
            + "/R" + str(run_id) + ".csv")
        other_groups = [g for g in frame["GR"].unique() if g != args.counter_g]
        input_cols = list(frame.columns)
        data_name = (os.path.join(args.repo_dir, "parameter_data") + "/"
                     + args.data_flag + "/" + args.model_flag + "/R" + str(run_id))
        y_res = pd.read_csv(data_name + "_y.csv")
        if args.model_flag in ["m2", "m4"]:
            x_res = pd.read_csv(data_name + "_x.csv")
            x_base = lookup_estimate(x_res, "GR" + args.counter_g)
            x_shifts = {args.counter_g: 0}
            x_shifts.update(
                {g: x_base - lookup_estimate(x_res, "GR" + g) for g in other_groups})
            frame["X_shift"] = frame["GR"].map(x_shifts)
            frame["X_count"] = frame["X"] + frame["X_shift"]
        y_base = lookup_estimate(y_res, "GR" + args.counter_g)
        y_shifts = {args.counter_g: 0}
        y_shifts.update(
            {g: y_base - lookup_estimate(y_res, "GR" + g) for g in other_groups})
        x_weight = lookup_estimate(y_res, "X")
        frame["Y_shift"] = frame["GR"].map(y_shifts)
        # NOTE(review): "X_count" is only created in the m2/m4 branch above,
        # so this line presumes the input already has it for other models —
        # confirm this function is only invoked for m2/m4.
        frame["Y_count"] = frame["Y_shift"] + x_weight * frame["X_count"]
        if args.model_flag in ["m2", "m4"]:
            # "resolving" variant keeps the observed mediator X
            frame["Y_count_resolve"] = frame["Y_shift"] + x_weight * frame["X"]
            frame = frame.loc[:, input_cols + ["X_count", "Y_count", "Y_count_resolve"]]
        else:
            frame = frame.loc[:, input_cols + ["Y_count"]]
        output_f = (os.path.join(args.repo_dir, "counterfactual_data") + "/"
                    + args.data_flag + "/" + args.model_flag + "/R" + str(run_id)
                    + "_count.csv")
        writeToCSV(output_f, frame)
        if frame.shape[0] != args.val_n:
            print("Error !!!!!", run_id)
            exit()
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")
def get_LTR_predict_data_single(args):
    """Attach per-run LTR (ListNet) prediction scores to the counterfactual data.

    For every test run the counterfactual CSV of that run is loaded and, for
    every experimental setting in ``args.settings`` (comma separated), the
    predictions of every "train__test" combination directory found on disk are
    merged in as new columns named "<train_y>__<test_y>__<setting>".  Rows
    without a prediction are filtered out, and the augmented frame is written
    back under counterfactual_data.
    """
    # column subsets used per setting; model "m2" carries a mediator X
    m2_keep_cols = {"Full": {"fair_count": ["G", "R", "X_count", "Y_count"],
                             "bias": ["G", "R", "X", "Y"],
                             "fair_res": ["G", "R", "X", "Y_count_resolve"]},
                    "Unaware": {"fair_count": ["X_count", "Y_count"],
                                "bias": ["X", "Y"],
                                "fair_res": ["X", "Y_count_resolve"]}}
    m1_keep_cols = {"Full": {"fair_count": ["G", "R", "Y_count"],
                             "bias": ["G", "R", "Y"]}}
    # special set for meps with a moderator age encoded as X
    meps_keep_cols = {"Full": {"fair_count": ["G", "R", "X", "Y_count"],
                               "bias": ["G", "R", "X", "Y"]}}
    for ri in range(1, args.test_run + 1):
        # FIX: was hard-coded "R1" + args.file_n, which re-read run 1's
        # counterfactual data for every run while predictions and the output
        # name were per-run; use the current run index.
        count_df = pd.read_csv(
            os.path.join(args.repo_dir, "counterfactual_data") + "/"
            + args.data_flag + "/" + args.model_flag + "/R" + str(ri)
            + args.file_n + ".csv")
        for expi in args.settings.split(","):
            if args.model_flag == "m2":
                col_map = m2_keep_cols[expi]
            elif args.data_flag == "mp":
                col_map = meps_keep_cols[expi]
            else:
                col_map = m1_keep_cols[expi]
            setting_dir = (os.path.join(args.repo_dir, "ranklib_data") + "/"
                           + args.data_flag + "/" + args.model_flag + "/" + expi)
            # include all the prediction sub-directories in this setting.
            # FIX: was `~os.path.isfile(...)` — bitwise NOT of a bool is always
            # truthy (~True == -2, ~False == -1), so the file filter never
            # excluded anything; `not` restores the intended directory check.
            all_fair_settings = [
                f for f in os.listdir(setting_dir)
                if not os.path.isfile(os.path.join(setting_dir, f)) and "." not in f
            ]
            for pred_di in all_fair_settings:
                # pred_di encodes "<train_setting>__<test_setting>"
                cols = col_map[pred_di.split("__")[-1]]
                train_cols = col_map[pred_di.split("__")[0]]
                ri_pred = get_prediction_scores(setting_dir + "/" + pred_di,
                                                "R" + str(ri), "ListNet")
                pred_y_col = train_cols[-1] + "__" + cols[-1] + "__" + expi.lower()
                # keep only rows that received a prediction, then add the scores
                count_df = count_df[count_df["UID"].isin([int(x) for x in ri_pred])]
                count_df[pred_y_col] = count_df["UID"].apply(lambda x: ri_pred[str(x)])
        if args.output_n:
            output_f = (os.path.join(args.repo_dir, "counterfactual_data") + "/"
                        + args.data_flag + "/" + args.model_flag + "/R" + str(ri)
                        + "_" + args.output_n + ".csv")
        else:
            output_f = (os.path.join(args.repo_dir, "counterfactual_data") + "/"
                        + args.data_flag + "/" + args.model_flag + "/R" + str(ri)
                        + ".csv")
        writeToCSV(output_f, count_df)
        if args.verbose:
            print("--- Save LTR predict in ", output_f, " --- \n")
def eval_counter_results(args):
    """Compute one ranking measure (args.measure) over every counterfactual file.

    For each counterfactual CSV matching args.file_n, each ranking column
    listed in args.rankings (comma separated) and each cutoff k in
    args.eval_ks, one "all" row plus one row per group is appended to the
    result frame, which is then written under evaluation_res as
    "Eval_R<run>_<measure>[<file_n>].csv".

    Supported measures: select_rate, score_utility, sensitivity, ap, igf, rKL.
    """
    res_df = pd.DataFrame(columns=["run", "rank", "k", "group", args.measure])
    counter_path = os.path.join(args.repo_dir, "counterfactual_data") + "/" + args.data_flag + "/" + args.model_flag + "/"
    all_files = get_files_with_name(counter_path, args.file_n)
    for ri, fi in enumerate(all_files):
        count_df = pd.read_csv(counter_path + fi)
        k_list = [int(x) for x in args.eval_ks.split(",")]
        # per-group counts/shares of the whole population (select_rate denominator)
        seti_quotas = get_quotas_count(count_df)
        for rank_all in args.rankings.split(","):
            # a "train__test" ranking is scored against its training column
            if "__" in rank_all:
                train_ranki = rank_all.split("__")[0]
            else:
                # NOTE(review): this takes the FIRST CHARACTER of the column
                # name; it only works if plain ranking columns are single
                # letters (e.g. "Y") — confirm, otherwise use `rank_all`.
                train_ranki = rank_all[0]
            orig_df = count_df.sort_values(by=train_ranki, ascending=False)
            # shift score to positive
            orig_df[train_ranki] = orig_df[train_ranki] + abs(orig_df[train_ranki].min())
            for ki in k_list:
                # optimal utility: sum of the k best training scores
                opt_u = sum(orig_df.head(ki)[train_ranki])
                res_row = [ri+1, rank_all, ki]
                all_row = res_row+["all"]
                if args.measure == "select_rate" :
                    # overall selection rate is 1 by definition
                    all_row.append(1)
                sort_df = get_sort_df(rank_all, count_df, ki, quotas_max=seti_quotas)
                if args.measure == "score_utility":
                    all_row.append(compute_score_util(list(sort_df["UID"]), orig_df, train_ranki, opt_u))
                sort_df["rank"] = list(range(1, ki + 1))
                # compute jacard index and kendall-tau distance at top-k ranking
                top_orig = orig_df.head(ki)
                if args.measure == "sensitivity":
                    all_row.append(compute_k_recall(list(top_orig["UID"]), list(sort_df["UID"]), ki))
                if args.measure == "ap":
                    all_row.append(compute_ap(list(top_orig["UID"]), list(sort_df["UID"]), ki))
                if args.measure == "igf":
                    all_row.append(compute_igf_ratio(list(sort_df["UID"]), orig_df, train_ranki))
                if args.measure == "rKL":
                    if args.data_flag == "cm":
                        # random permutate for COMPAS to avoid bias due to sorting with ties
                        all_row.append(compute_rKL(sort_df, orig_df, sort_col=rank_all, group_col=args.group_col))
                    else:
                        all_row.append(compute_rKL(sort_df, orig_df, group_col=args.group_col))
                res_df.loc[res_df.shape[0]] = all_row
                # group-level evaluation
                # share of each group inside the selected top-k
                cur_quotas = dict(sort_df['GR'].value_counts(normalize=True))
                for gi in list(orig_df[args.group_col].unique()):
                    gi_row = res_row + [gi]
                    if args.measure == "select_rate":
                        # selection rate to rank inside top-k
                        if gi in cur_quotas:
                            gi_row.append(cur_quotas[gi] / seti_quotas[gi])
                        else:
                            # group absent from the top-k selection
                            gi_row.append(0)
                    gi_df = sort_df[sort_df["GR"] == gi]
                    gi_orig = orig_df[orig_df["GR"] == gi]
                    if args.measure == "score_utility":
                        gi_row.append(compute_score_util(list(gi_df["UID"]), gi_orig, train_ranki))
                    gi_orig_k = top_orig[top_orig["GR"] == gi]
                    if args.measure == "sensitivity":
                        gi_row.append(compute_k_recall(list(gi_orig_k["UID"]), list(gi_df["UID"]), len(gi_orig_k["UID"])))
                    if args.measure == "ap":
                        gi_row.append(compute_ap(list(gi_orig_k["UID"]), list(gi_df["UID"]), ki))
                    if args.measure == "igf":
                        if not gi_df.shape[0]:
                            # no member of this group selected: sentinel -1
                            gi_row.append(-1)
                        else:
                            gi_row.append(compute_igf_ratio(list(gi_df["UID"]), gi_orig, train_ranki))
                    if args.measure == "rKL":
                        # not application to group
                        gi_row.append(-1)
                    res_df.loc[res_df.shape[0]] = gi_row
        if args.verbose:
            print("--- Done eval ", args.measure, " for ", args.data_flag, args.model_flag, ri+1, " --- \n")
        if "select" in args.measure:
            # selection rate for counterfactual and LTR
            output_f = os.path.join(args.repo_dir, "evaluation_res") + "/"+args.data_flag + "/"+ args.model_flag + "/Eval_R" + str(ri+1) + "_" + args.measure + args.file_n + ".csv"
        else:
            output_f = os.path.join(args.repo_dir, "evaluation_res") + "/"+args.data_flag + "/"+ args.model_flag + "/Eval_R" + str(ri+1) + "_" + args.measure + ".csv"
        writeToCSV(output_f, res_df)
        if args.verbose:
            print("--- Save eval file in ", output_f, " --- \n")
def generate_data(file_name, para_dict, src_data=None):
    """Generate one (semi-)synthetic dataset and write it to ``file_name``.

    Parameters
    ----------
    file_name : str
        Output CSV path (handed to writeToCSV).
    para_dict : dict
        Generation spec. The presence of key "values" selects fully-synthetic
        mode (categorical attributes + "quotas_budget" + "N"); "edge_weights"
        is a list of {target: {dependency: weight, ...}} formulas; "mius",
        "vars" and "intersectionality" parameterize score generation.
    src_data : str, optional
        Source CSV for the semi-real mode (used when "values" is absent).
    """
    if "values" in para_dict:
        # generate the categorical columns for synthetic data
        dataset = pd.DataFrame()
        categoricalData = pd.DataFrame(columns=para_dict["values"])
        for ai, ai_values in para_dict["values"].items():
            ai_col = []
            for vi in ai_values:
                # population share of value vi, summed over the two-character
                # (pairwise) quota keys that contain it
                vi_n = sum([
                    para_dict["quotas_budget"][x]
                    for x in para_dict["quotas_budget"]
                    if len(x) == 2 and vi in x
                ])
                ai_size = int(np.ceil(para_dict["N"] * vi_n))
                ai_col += [vi for _ in range(ai_size)]
            np.random.shuffle(ai_col)
            categoricalData[ai] = ai_col
        if categoricalData.shape[0] != para_dict["N"]:
            # FIX: the sampled frame was discarded (DataFrame.sample returns a
            # new frame, it does not mutate in place); assign it back so the
            # dataset really ends up with exactly N rows.
            categoricalData = categoricalData.sample(
                n=para_dict["N"]).reset_index(drop=True)
        # add categorical columns to dataset
        dataset = pd.concat([dataset, categoricalData], axis=1)
    else:
        # for semi real data
        dataset = pd.read_csv(src_data)
    # generate the continuous columns
    for cur_formula in para_dict["edge_weights"]:
        key_col = list(cur_formula.keys())[0]
        if len(cur_formula[key_col]) > 1:
            # multiple dependent columns: generate scores per dependency group
            if "values" in para_dict:
                dataset = dataset.groupby(list(
                    cur_formula[key_col].keys())).apply(
                        lambda x: generateScores(
                            x, cur_formula, para_dict["mius"],
                            para_dict["vars"],
                            value_dict=para_dict["values"],
                            inter_miu=para_dict["intersectionality"]["miu"],
                            inter_var=para_dict["intersectionality"]["var"]))
            else:
                dataset = dataset.groupby(list(
                    cur_formula[key_col].keys())).apply(
                        lambda x: generateScores(x, cur_formula,
                                                 para_dict["mius"],
                                                 para_dict["vars"],
                                                 sensi_cols=["G", "R"]))
        else:
            # single dependent column: simple linear scaling
            depend_col, depend_weight = list(cur_formula[key_col].items())[0]
            dataset[key_col] = dataset[depend_col] * depend_weight
    if "values" in para_dict:
        # combined group column, e.g. "GR" = G value concatenated with R value
        dataset["".join(list(para_dict["values"].keys()))] = dataset.apply(
            lambda x: "".join([x[i] for i in para_dict["values"]]), axis=1)
        dataset["UID"] = list(range(1, para_dict["N"] + 1))
        # drop intermediate score columns
        dataset = dataset[[
            x for x in dataset.columns if x not in ["X_i", "Y_d"]
        ]]
    else:
        # rename columns of the semi-real data to the canonical single letters
        for si in ["G", "R"]:
            dataset[si] = dataset[si].apply(lambda x: x[0].upper())
        dataset.drop(columns=['Y', 'X'], inplace=True)
        for new_coli in [list(x.keys())[0] for x in para_dict["edge_weights"]]:
            if "_" in new_coli:
                dataset.rename(columns={new_coli: new_coli[0]}, inplace=True)
    # save dataframe to a csv file
    writeToCSV(file_name, dataset)
def get_counterfactual_data_real(args):
    """Build counterfactual columns for the real data.

    Per run, the raw data named by ``args.src_data`` is loaded and every
    non-reference group is shifted onto the reference group
    ``args.counter_g`` using the fitted parameter files.  Models "m2"/"m4"
    also shift the mediator X and add a "resolving" outcome; their Y shifts
    come from per-group mediation files instead of "_y.csv".  Writes
    "R<run>_count.csv" per run; exits on a row-count mismatch.
    """

    def pick_est(res, label):
        # "Estimate" value of the row whose label column matches `label`
        return res[res["Unnamed: 0"] == label]["Estimate"].values[0]

    for run_id in range(1, args.counter_run + 1):
        if args.src_data.strip("None"):
            frame = pd.read_csv(args.src_data + ".csv")
        else:
            print("Need to specify the raw real data!!")
            exit()
        other_groups = [g for g in frame["GR"].unique() if g != args.counter_g]
        input_cols = list(frame.columns)
        data_name = (os.path.join(args.repo_dir, "parameter_data") + "/"
                     + args.data_flag + "/" + args.model_flag + "/R" + str(run_id))
        if args.model_flag in ["m2", "m4"]:
            x_res = pd.read_csv(data_name + "_x.csv")
            x_base = pick_est(x_res, "GR" + args.counter_g)
            x_shifts = {args.counter_g: 0}
            x_shifts.update(
                {g: x_base - pick_est(x_res, "GR" + g) for g in other_groups})
            frame["X_shift"] = frame["GR"].map(x_shifts)
            frame["X_count"] = frame["X"] + frame["X_shift"]
        # get y shift
        if args.model_flag in ["m1", "m3"]:
            y_res = pd.read_csv(data_name + "_y.csv")
            y_base = pick_est(y_res, "GR" + args.counter_g)
            y_shifts = {args.counter_g: 0}
            y_shifts.update(
                {g: y_base - pick_est(y_res, "GR" + g) for g in other_groups})
        else:
            y_shifts = {args.counter_g: 0}
            y_shifts_resolve = {args.counter_g: 0}
            for g in other_groups:
                med_est = pd.read_csv(data_name + "_" + g + "_med.csv")["Estimate"]
                # NOTE(review): rows 1/2 of the mediation file are assumed to
                # hold the resolving/total effect estimates — confirm against
                # the R script that produces these files.
                y_shifts[g] = -med_est[2]
                y_shifts_resolve[g] = -med_est[1]
        frame["Y_shift"] = frame["GR"].map(y_shifts)
        frame["Y_count"] = frame["Y"] + frame["Y_shift"]
        if args.model_flag in ["m2", "m4"]:
            frame["Y_shift_resolve"] = frame["GR"].map(y_shifts_resolve)
            frame["Y_count_resolve"] = frame["Y"] + frame["Y_shift_resolve"]
            frame = frame.loc[:, input_cols + ["X_count", "Y_count", "Y_count_resolve"]]
        else:
            frame = frame.loc[:, input_cols + ["Y_count"]]
        output_f = (os.path.join(args.repo_dir, "counterfactual_data") + "/"
                    + args.data_flag + "/" + args.model_flag + "/R" + str(run_id)
                    + "_count.csv")
        writeToCSV(output_f, frame)
        if frame.shape[0] != args.val_n:
            print("Error !!!!!", run_id)
            exit()
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")
def get_counterfactual_data_single_m(args):
    """Build counterfactual data with mediation on a single attribute.

    Only supports mediation on gender and models "m2"/"m4": the mediator X is
    shifted using the args.med_s estimates and Y is shifted through the fitted
    gender/race intercepts (plus the hardcoded "GM:RW" interaction), scaled by
    the mediation weight of args.other_g.  Writes "R<ri>_count.csv" per run
    and exits on a row-count mismatch against args.val_n.
    """
    # Only support mediation on gender now
    for ri in range(1, args.counter_run + 1):
        cur_df = pd.read_csv(
            os.path.join(args.repo_dir, args.data_dir) + "/" + args.data_flag
            + "/R" + str(ri) + ".csv")
        # counter_g is a single gender letter here while GR holds two-letter
        # codes, so no GR value equals it and group_list covers every group
        group_list = [x for x in cur_df["GR"].unique() if x != args.counter_g]
        orig_cols = list(cur_df.columns)
        data_name = os.path.join(
            args.repo_dir, "parameter_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(ri)
        if args.model_flag in ["m2", "m4"]:
            x_res = pd.read_csv(data_name + "_x.csv")
            # estimates of the mediated attribute; args.med_s is presumably
            # the "G" label prefix — TODO confirm against the caller
            counter_g_base = x_res[x_res["Unnamed: 0"] ==
                                   args.med_s + args.counter_g]["Estimate"].values[0]
            other_g_base = x_res[x_res["Unnamed: 0"] ==
                                 args.med_s + args.other_g]["Estimate"].values[0]
            x_shifts = {}
            for gi in group_list:
                if args.counter_g in gi:
                    # group already carries the counterfactual gender: no shift
                    x_shifts[gi] = 0
                else:
                    x_shifts[gi] = counter_g_base - other_g_base
            cur_df["X_shift"] = cur_df["GR"].apply(lambda x: x_shifts[x])
            cur_df["X_count"] = cur_df["X"] + cur_df["X_shift"]
        # get y shift
        if args.model_flag in ["m1", "m3"]:
            print(
                "Only support model m2 and m4 for mediation on single attribute!"
            )
            exit()
        else:
            y_res = pd.read_csv(data_name + "_y.csv")
            y_shifts = {}
            for gi in group_list:
                # gi[0] is the gender letter, gi[1] the race letter of a group
                if args.hidden_g in gi:
                    # for BM and BF
                    gi_inter = y_res[y_res["Unnamed: 0"] ==
                                     "G" + gi[0]]["Estimate"].values[0]
                else:
                    if "MW" in gi:
                        # NOTE(review): the "GM:RW" interaction label is
                        # hardcoded for the gender-mediation case — confirm
                        # it matches the fitted model's coefficient names
                        gi_inter = y_res[y_res["Unnamed: 0"] == "G" + gi[0]]["Estimate"].values[0] + \
                            y_res[y_res["Unnamed: 0"] == "R" + gi[1]]["Estimate"].values[0] + y_res[y_res["Unnamed: 0"] == "GM:RW"]["Estimate"].values[0]
                    else:
                        gi_inter = y_res[y_res["Unnamed: 0"] == "G" + gi[0]]["Estimate"].values[0] + \
                            y_res[y_res["Unnamed: 0"] == "R" + gi[1]]["Estimate"].values[0]
                # shift every group onto the hardcoded "GF" baseline
                y_shifts[gi] = -gi_inter + y_res[y_res["Unnamed: 0"] ==
                                                 "GF"]["Estimate"].values[0]
            # mediation weight of X, taken from args.other_g's mediation file
            med_weight = pd.read_csv(data_name + "_" + args.other_g +
                                     "_med.csv")["Estimate"][0]
        cur_df["Y_shift"] = cur_df["GR"].apply(lambda x: y_shifts[x])
        cur_df["Y_count"] = cur_df["Y_shift"] + med_weight * cur_df["X_count"]
        if args.model_flag in ["m2", "m4"]:
            # "resolving" variant keeps the observed mediator X
            cur_df["Y_count_resolve"] = cur_df[
                "Y_shift"] + med_weight * cur_df["X"]
            cur_df = cur_df.loc[:, orig_cols +
                                ["X_count", "Y_count", "Y_count_resolve"]]
        else:
            cur_df = cur_df.loc[:, orig_cols + ["Y_count"]]
        output_f = os.path.join(
            args.repo_dir, "counterfactual_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(
            ri) + "_count.csv"
        writeToCSV(output_f, cur_df)
        if cur_df.shape[0] != args.val_n:
            print("Error !!!!!", ri)
            exit()
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")