Exemplo n.º 1
0
def get_counterfactual_mp(args):
    """Attach pre-computed counterfactual Y scores to the raw MEPS data.

    For every run ``ri`` in ``1..args.counter_run`` the raw table
    ``args.src_data + ".csv"`` is read afresh, the ``y_counter`` column of
    ``<repo_dir>/parameter_data/<data_flag>/<model_flag>/R<ri>_y_counter.csv``
    is stored as ``Y_count`` and the result is written to
    ``<repo_dir>/counterfactual_data/<data_flag>/<model_flag>/R<ri>_count.csv``.

    Args:
        args: namespace providing ``counter_run``, ``src_data``, ``repo_dir``,
            ``data_flag``, ``model_flag``, ``val_n`` and ``verbose``.

    Raises:
        SystemExit: with status 1 when ``src_data`` is missing (``None``,
            empty or the literal string ``"None"``), when ``model_flag`` is
            not ``"m1"``, or when the written table does not have
            ``args.val_n`` rows.
    """
    import sys

    # special setting for MEPS
    for ri in range(1, args.counter_run + 1):
        # Compare against the sentinel explicitly: str.strip("None") strips a
        # *character set*, and fails outright when src_data is a real None.
        if not args.src_data or str(args.src_data) == "None":
            print("Need to specify the raw real data!!")
            sys.exit(1)
        cur_df = pd.read_csv(args.src_data + ".csv")

        data_name = os.path.join(
            args.repo_dir, "parameter_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(ri)

        # get y shift
        if args.model_flag != "m1":
            print("Only support m1 model for MEPS as a special case!")
            sys.exit(1)
        y_res = pd.read_csv(data_name + "_y_counter.csv")
        # Assignment aligns on the index; both frames come straight from
        # read_csv, so this pairs rows by position.
        cur_df["Y_count"] = y_res["y_counter"]

        output_f = os.path.join(
            args.repo_dir, "counterfactual_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(
            ri) + "_count.csv"
        writeToCSV(output_f, cur_df)
        # The row-count check runs after the file is written, as before.
        if cur_df.shape[0] != args.val_n:
            print("Error !!!!!", ri)
            sys.exit(1)
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")
Exemplo n.º 2
0
def get_counterfactual_data_syn(args):
    """Derive counterfactual columns for every synthetic run and save them.

    For each run the data table
    ``<repo_dir>/<data_dir>/<data_flag>/R<ri>.csv`` and the fitted-parameter
    tables ``<repo_dir>/parameter_data/<data_flag>/<model_flag>/R<ri>_y.csv``
    (plus ``_x.csv`` for models ``m2``/``m4``) are read.  Every group other
    than ``args.counter_g`` is shifted by the difference between the
    ``"GR" + counter_g`` estimate and its own ``"GR" + group`` estimate.

    The output keeps the original columns plus ``Y_count`` (and, for
    ``m2``/``m4``, ``X_count`` and ``Y_count_resolve``) and is written to
    ``<repo_dir>/counterfactual_data/<data_flag>/<model_flag>/R<ri>_count.csv``.
    The process exits if the written table does not have ``args.val_n`` rows.

    NOTE(review): ``Y_count`` is computed from ``X_count``, which this
    function only creates for ``m2``/``m4``; other models appear to rely on
    the input already having that column -- confirm.
    """

    def coef(table, term):
        # "Estimate" of the first row whose label column equals `term`.
        return table[table["Unnamed: 0"] == term]["Estimate"].values[0]

    for run_id in range(1, args.counter_run + 1):
        run_tag = "R" + str(run_id)
        cur_df = pd.read_csv("/".join([
            os.path.join(args.repo_dir, args.data_dir),
            args.data_flag,
            run_tag,
        ]) + ".csv")

        other_groups = [
            g for g in cur_df["GR"].unique() if g != args.counter_g
        ]
        base_columns = list(cur_df.columns)

        param_prefix = "/".join([
            os.path.join(args.repo_dir, "parameter_data"),
            args.data_flag,
            args.model_flag,
            run_tag,
        ])
        y_res = pd.read_csv(param_prefix + "_y.csv")

        mediated = args.model_flag in ["m2", "m4"]
        if mediated:
            x_res = pd.read_csv(param_prefix + "_x.csv")
            x_reference = coef(x_res, "GR" + args.counter_g)

            # The counterfactual group itself is never shifted.
            x_shifts = {args.counter_g: 0}
            for g in other_groups:
                x_shifts[g] = x_reference - coef(x_res, "GR" + g)

            cur_df["X_shift"] = cur_df["GR"].apply(lambda g: x_shifts[g])
            cur_df["X_count"] = cur_df["X"] + cur_df["X_shift"]

        y_reference = coef(y_res, "GR" + args.counter_g)
        y_shifts = {args.counter_g: 0}
        for g in other_groups:
            y_shifts[g] = y_reference - coef(y_res, "GR" + g)

        x_weight = coef(y_res, "X")

        cur_df["Y_shift"] = cur_df["GR"].apply(lambda g: y_shifts[g])
        cur_df["Y_count"] = cur_df["Y_shift"] + x_weight * cur_df["X_count"]

        if mediated:
            # Same Y shift, but applied to the unshifted X.
            cur_df["Y_count_resolve"] = (
                cur_df["Y_shift"] + x_weight * cur_df["X"])
            kept_columns = base_columns + [
                "X_count", "Y_count", "Y_count_resolve"
            ]
        else:
            kept_columns = base_columns + ["Y_count"]
        cur_df = cur_df.loc[:, kept_columns]

        output_f = "/".join([
            os.path.join(args.repo_dir, "counterfactual_data"),
            args.data_flag,
            args.model_flag,
            run_tag,
        ]) + "_count.csv"
        writeToCSV(output_f, cur_df)
        if cur_df.shape[0] != args.val_n:
            print("Error !!!!!", run_id)
            exit()
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")
Exemplo n.º 3
0
def get_LTR_predict_data_single(args):

    m2_keep_cols = {"Full": {"fair_count": ["G", "R", "X_count", "Y_count"],
                             "bias": ["G", "R", "X", "Y"],
                             "fair_res": ["G", "R", "X", "Y_count_resolve"]},

                    "Unaware": {"fair_count": ["X_count", "Y_count"],
                                "bias": ["X", "Y"],
                                "fair_res": ["X", "Y_count_resolve"]}}

    m1_keep_cols = {"Full": {"fair_count": ["G", "R", "Y_count"],
                             "bias": ["G", "R", "Y"]}}

    # special set for meps with a moderator age encoded as X
    meps_keep_cols = {"Full": {"fair_count": ["G", "R", "X", "Y_count"],
                               "bias": ["G", "R", "X", "Y"]}}


    for ri in range(1, args.test_run+1):
        count_df = pd.read_csv(os.path.join(args.repo_dir,"counterfactual_data") + "/" + args.data_flag + "/" + args.model_flag + "/R1"+args.file_n+".csv")

        for expi in args.settings.split(","):
            if args.model_flag == "m2":
                col_map = m2_keep_cols[expi]
            else:
                if args.data_flag == "mp":
                    col_map = meps_keep_cols[expi]
                else:
                    col_map = m1_keep_cols[expi]
            # include all the prediction in this setting
            all_fair_settings = [f for f in os.listdir(os.path.join(args.repo_dir,"ranklib_data") + "/" + args.data_flag + "/" + args.model_flag + "/"+expi) if ~os.path.isfile(os.path.join(os.path.join(args.repo_dir,"ranklib_data") + "/" + args.data_flag + "/" + args.model_flag + "/"+expi, f)) and "." not in f]
            for pred_di in all_fair_settings:
                cols = col_map[pred_di.split("__")[-1]]
                train_cols = col_map[pred_di.split("__")[0]]
                ri_pred = get_prediction_scores(os.path.join(args.repo_dir, "ranklib_data")+"/"+args.data_flag +"/" + args.model_flag + "/" + expi + "/" + pred_di, "R" + str(ri), "ListNet")

                pred_y_col = train_cols[-1]+ "__" + cols[-1] + "__" + expi.lower()

                count_df = count_df[count_df["UID"].isin([int(x) for x in ri_pred])]

                count_df[pred_y_col] = count_df["UID"].apply(lambda x: ri_pred[str(x)])


        if args.output_n:
            output_f = os.path.join(args.repo_dir, "counterfactual_data") + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(ri) + "_" + args.output_n + ".csv"
        else:
            output_f = os.path.join(args.repo_dir, "counterfactual_data") + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(ri) + ".csv"

        writeToCSV(output_f, count_df)
        if args.verbose:
            print("--- Save LTR predict in ", output_f, " --- \n")
Exemplo n.º 4
0
def eval_counter_results(args):
    """Evaluate one measure over the counterfactual files and save the table.

    Every file returned by ``get_files_with_name(counter_path, args.file_n)``
    is read from
    ``<repo_dir>/counterfactual_data/<data_flag>/<model_flag>/``.  For each
    ranking in the comma-separated ``args.rankings`` and each ``k`` in the
    comma-separated ``args.eval_ks``, one overall row (group ``"all"``) and
    one row per distinct value of ``args.group_col`` are appended to a result
    frame with columns ``run, rank, k, group, <args.measure>``.

    Measures handled here: ``select_rate``, ``score_utility``,
    ``sensitivity``, ``ap``, ``igf`` and ``rKL`` (``rKL`` is reported as -1
    at group level).

    The result is written under
    ``<repo_dir>/evaluation_res/<data_flag>/<model_flag>/``; ``args.file_n``
    is appended to the file name only when ``"select"`` occurs in
    ``args.measure``.

    NOTE(review): the output file name uses ``ri`` after the loop, i.e. the
    index of the last processed file; with no matching files ``ri`` is
    unbound at that point.
    NOTE(review): group filtering below uses the literal ``"GR"`` column
    while the group values are taken from ``args.group_col`` -- presumably
    these are expected to coincide; confirm against callers.
    """
    res_df = pd.DataFrame(columns=["run", "rank", "k", "group", args.measure])
    counter_path = os.path.join(args.repo_dir, "counterfactual_data") + "/" + args.data_flag + "/" + args.model_flag + "/"

    all_files = get_files_with_name(counter_path, args.file_n)

    for ri, fi in enumerate(all_files):
        # cur_run = int(fi.replace("_LTR", "").replace("R", "").replace(".csv",""))
        count_df = pd.read_csv(counter_path + fi)
        k_list = [int(x) for x in args.eval_ks.split(",")]

        # Indexed by group value below; presumably the per-group share in the
        # full data -- verify against get_quotas_count.
        seti_quotas = get_quotas_count(count_df)

        for rank_all in args.rankings.split(","):
            # "<train>__<test>" rankings are compared against the <train>
            # column; any other ranking name is compared against the column
            # named by its first character.
            if "__" in rank_all:
                train_ranki = rank_all.split("__")[0]
            else:
                train_ranki = rank_all[0]
            orig_df = count_df.sort_values(by=train_ranki, ascending=False)
            # shift score to positive
            orig_df[train_ranki] = orig_df[train_ranki] + abs(orig_df[train_ranki].min())

            for ki in k_list:
                # Sum of the reference scores of the top-k reference rows.
                opt_u = sum(orig_df.head(ki)[train_ranki])
                res_row = [ri+1, rank_all, ki]
                all_row = res_row+["all"]

                if args.measure == "select_rate" :
                    all_row.append(1)

                sort_df = get_sort_df(rank_all, count_df, ki, quotas_max=seti_quotas)

                if args.measure == "score_utility":
                    all_row.append(compute_score_util(list(sort_df["UID"]), orig_df, train_ranki, opt_u))


                # Requires sort_df to hold exactly ki rows.
                sort_df["rank"] = list(range(1, ki + 1))

                # top-k rows of the reference ranking, used by the
                # sensitivity and AP measures
                top_orig = orig_df.head(ki)

                if args.measure == "sensitivity":
                    all_row.append(compute_k_recall(list(top_orig["UID"]), list(sort_df["UID"]), ki))

                if args.measure == "ap":
                    all_row.append(compute_ap(list(top_orig["UID"]), list(sort_df["UID"]), ki))

                if args.measure == "igf":
                    all_row.append(compute_igf_ratio(list(sort_df["UID"]), orig_df, train_ranki))

                if args.measure == "rKL":
                    if args.data_flag == "cm": # randomly permute for COMPAS to avoid bias due to sorting with ties
                        all_row.append(compute_rKL(sort_df, orig_df, sort_col=rank_all, group_col=args.group_col))
                    else:
                        all_row.append(compute_rKL(sort_df, orig_df, group_col=args.group_col))

                # Append the overall row at the next integer label.
                res_df.loc[res_df.shape[0]] = all_row


                # group-level evaluation
                # Share of each "GR" value among the selected rows.
                cur_quotas = dict(sort_df['GR'].value_counts(normalize=True))
                for gi in list(orig_df[args.group_col].unique()):
                    gi_row = res_row + [gi]
                    if args.measure == "select_rate":
                        # selection rate to rank inside top-k
                        if gi in cur_quotas:
                            gi_row.append(cur_quotas[gi] / seti_quotas[gi])
                        else:
                            # group absent from the selection
                            gi_row.append(0)

                    gi_df = sort_df[sort_df["GR"] == gi]
                    gi_orig = orig_df[orig_df["GR"] == gi]

                    if args.measure == "score_utility":
                        gi_row.append(compute_score_util(list(gi_df["UID"]), gi_orig, train_ranki))


                    gi_orig_k = top_orig[top_orig["GR"] == gi]

                    if args.measure == "sensitivity":
                        gi_row.append(compute_k_recall(list(gi_orig_k["UID"]), list(gi_df["UID"]), len(gi_orig_k["UID"])))
                    if args.measure == "ap":
                        gi_row.append(compute_ap(list(gi_orig_k["UID"]), list(gi_df["UID"]), ki))

                    if args.measure == "igf":
                        if not gi_df.shape[0]:
                            # no selected rows for this group
                            gi_row.append(-1)
                        else:
                            gi_row.append(compute_igf_ratio(list(gi_df["UID"]), gi_orig, train_ranki))
                    if args.measure == "rKL": # not applicable at group level
                        gi_row.append(-1)
                    res_df.loc[res_df.shape[0]] = gi_row
        if args.verbose:
            print("--- Done eval ", args.measure, " for ", args.data_flag, args.model_flag, ri+1, " --- \n")
    if "select" in args.measure: # selection rate for counterfactual and LTR
        output_f = os.path.join(args.repo_dir, "evaluation_res") + "/"+args.data_flag + "/"+ args.model_flag + "/Eval_R" + str(ri+1) + "_" + args.measure + args.file_n + ".csv"
    else:
        output_f = os.path.join(args.repo_dir, "evaluation_res") + "/"+args.data_flag + "/"+ args.model_flag + "/Eval_R" + str(ri+1) + "_" + args.measure + ".csv"
    writeToCSV(output_f, res_df)
    if args.verbose:
        print("--- Save eval file in ", output_f, " --- \n")
Exemplo n.º 5
0
def generate_data(file_name, para_dict, src_data=None):
    """Generate a synthetic (or semi-real) data set and write it to CSV.

    When ``para_dict`` has a ``"values"`` entry, the categorical columns are
    synthesised: for each attribute and each of its values, the
    ``"quotas_budget"`` entries whose two-character key contains that value
    are summed and ``ceil(N * share)`` copies of the value are generated,
    then shuffled.  Otherwise the table is read from ``src_data``.

    Continuous columns are then produced from ``para_dict["edge_weights"]``,
    a sequence of single-key dicts ``{target: {parent: weight, ...}}``: a
    target with several parents is filled group-wise by ``generateScores``,
    a target with one parent is ``parent * weight``.

    Args:
        file_name: destination passed to ``writeToCSV``.
        para_dict: generation parameters; keys read here are ``"values"``
            (optional), ``"quotas_budget"``, ``"N"``, ``"edge_weights"``,
            ``"mius"``, ``"vars"`` and ``"intersectionality"``.
        src_data: CSV path of the real data; used only when ``"values"`` is
            absent from ``para_dict``.
    """
    if "values" in para_dict:
        # generate the categorical columns for synthetic data
        dataset = pd.DataFrame()
        categoricalData = pd.DataFrame(columns=para_dict["values"])
        for ai, ai_values in para_dict["values"].items():
            ai_col = []
            for vi in ai_values:
                vi_n = sum(
                    para_dict["quotas_budget"][x]
                    for x in para_dict["quotas_budget"]
                    if len(x) == 2 and vi in x
                )
                ai_size = int(np.ceil(para_dict["N"] * vi_n))
                ai_col += [vi] * ai_size
            np.random.shuffle(ai_col)
            categoricalData[ai] = ai_col
        if categoricalData.shape[0] != para_dict["N"]:
            # Rounding up can yield more than N rows.  DataFrame.sample
            # returns a new frame, so the result must be kept; the original
            # discarded it and the UID assignment below then failed on the
            # length mismatch.
            categoricalData = categoricalData.sample(
                n=para_dict["N"]).reset_index(drop=True)
        # add categorical columns to dataset
        dataset = pd.concat([dataset, categoricalData], axis=1)
    else:  # for semi real data
        dataset = pd.read_csv(src_data)

    # generate the continuous columns
    for cur_formula in para_dict["edge_weights"]:
        key_col = list(cur_formula.keys())[0]
        if len(cur_formula[key_col]) > 1:  # multiple dependent columns
            if "values" in para_dict:
                dataset = dataset.groupby(list(
                    cur_formula[key_col].keys())).apply(
                        lambda x: generateScores(
                            x,
                            cur_formula,
                            para_dict["mius"],
                            para_dict["vars"],
                            value_dict=para_dict["values"],
                            inter_miu=para_dict["intersectionality"]["miu"],
                            inter_var=para_dict["intersectionality"]["var"]))
            else:
                dataset = dataset.groupby(list(
                    cur_formula[key_col].keys())).apply(
                        lambda x: generateScores(x,
                                                 cur_formula,
                                                 para_dict["mius"],
                                                 para_dict["vars"],
                                                 sensi_cols=["G", "R"]))
        else:  # single dependent column
            depend_col, depend_weight = list(cur_formula[key_col].items())[0]
            dataset[key_col] = dataset[depend_col] * depend_weight

    if "values" in para_dict:
        # Combined group column: named by joining the attribute names, valued
        # by joining each row's attribute values.
        dataset["".join(list(para_dict["values"].keys()))] = dataset.apply(
            lambda x: "".join([x[i] for i in para_dict["values"]]), axis=1)
        dataset["UID"] = list(range(1, para_dict["N"] + 1))

        # Drop the intermediate columns, if present.
        dataset = dataset[[
            x for x in dataset.columns if x not in ["X_i", "Y_d"]
        ]]
    else:
        # Reduce G and R to their upper-cased first character.
        for si in ["G", "R"]:
            dataset[si] = dataset[si].apply(lambda x: x[0].upper())
        # Replace the original X / Y with the generated columns: any
        # generated column containing "_" is renamed to its first character.
        dataset.drop(columns=['Y', 'X'], inplace=True)
        for new_coli in [list(x.keys())[0] for x in para_dict["edge_weights"]]:
            if "_" in new_coli:
                dataset.rename(columns={new_coli: new_coli[0]}, inplace=True)

    # save dataframe to a csv file
    writeToCSV(file_name, dataset)
Exemplo n.º 6
0
def get_counterfactual_data_real(args):
    """Derive counterfactual columns for a real data set and save them.

    For every run ``ri`` in ``1..args.counter_run`` the raw table
    ``args.src_data + ".csv"`` is read afresh and per-group shifts are taken
    from ``<repo_dir>/parameter_data/<data_flag>/<model_flag>/R<ri>*``:

    * models ``m2``/``m4``: X shifts come from ``_x.csv`` (difference between
      the ``"GR" + counter_g`` estimate and each group's own estimate); Y
      shifts come from each group's ``_<group>_med.csv``, using the negated
      ``Estimate`` at label 2 for ``Y_count`` and at label 1 for
      ``Y_count_resolve``.
    * models ``m1``/``m3``: Y shifts come from ``_y.csv`` in the same way as
      the X shifts above.

    The output keeps the original columns plus ``Y_count`` (and ``X_count``,
    ``Y_count_resolve`` for ``m2``/``m4``) and is written to
    ``<repo_dir>/counterfactual_data/<data_flag>/<model_flag>/R<ri>_count.csv``.

    Raises:
        SystemExit: with status 1 when ``src_data`` is missing (``None``,
            empty or the literal string ``"None"``) or when the written table
            does not have ``args.val_n`` rows.
    """
    import sys

    def estimate(table, term):
        # "Estimate" of the first row whose label column equals `term`.
        return table[table["Unnamed: 0"] == term]["Estimate"].values[0]

    for ri in range(1, args.counter_run + 1):
        # Compare against the sentinel explicitly: str.strip("None") strips a
        # *character set*, and fails outright when src_data is a real None.
        if not args.src_data or str(args.src_data) == "None":
            print("Need to specify the raw real data!!")
            sys.exit(1)
        cur_df = pd.read_csv(args.src_data + ".csv")

        group_list = [x for x in cur_df["GR"].unique() if x != args.counter_g]
        orig_cols = list(cur_df.columns)

        data_name = os.path.join(
            args.repo_dir, "parameter_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(ri)

        mediated = args.model_flag in ["m2", "m4"]

        if mediated:
            x_res = pd.read_csv(data_name + "_x.csv")
            counter_g_base = estimate(x_res, "GR" + args.counter_g)

            # The counterfactual group itself is never shifted.
            x_shifts = {args.counter_g: 0}
            for gi in group_list:
                x_shifts[gi] = counter_g_base - estimate(x_res, "GR" + gi)

            cur_df["X_shift"] = cur_df["GR"].apply(lambda x: x_shifts[x])
            cur_df["X_count"] = cur_df["X"] + cur_df["X_shift"]

        # get y shift
        if args.model_flag in ["m1", "m3"]:
            y_res = pd.read_csv(data_name + "_y.csv")
            counter_g_base = estimate(y_res, "GR" + args.counter_g)
            y_shifts = {args.counter_g: 0}
            for gi in group_list:
                y_shifts[gi] = counter_g_base - estimate(y_res, "GR" + gi)
        else:
            y_shifts = {args.counter_g: 0}
            y_shifts_resolve = {args.counter_g: 0}
            for gi in group_list:
                g_res = pd.read_csv(data_name + "_" + gi +
                                    "_med.csv")["Estimate"]
                # NOTE(review): labels 2 and 1 presumably hold the total and
                # direct effect of the mediation output -- confirm.
                y_shifts[gi] = -g_res[2]
                y_shifts_resolve[gi] = -g_res[1]

        cur_df["Y_shift"] = cur_df["GR"].apply(lambda x: y_shifts[x])
        cur_df["Y_count"] = cur_df["Y"] + cur_df["Y_shift"]

        if mediated:
            cur_df["Y_shift_resolve"] = cur_df["GR"].apply(
                lambda x: y_shifts_resolve[x])
            cur_df["Y_count_resolve"] = cur_df["Y"] + cur_df["Y_shift_resolve"]
            cur_df = cur_df.loc[:, orig_cols +
                                ["X_count", "Y_count", "Y_count_resolve"]]
        else:
            cur_df = cur_df.loc[:, orig_cols + ["Y_count"]]

        output_f = os.path.join(
            args.repo_dir, "counterfactual_data"
        ) + "/" + args.data_flag + "/" + args.model_flag + "/R" + str(
            ri) + "_count.csv"
        writeToCSV(output_f, cur_df)
        # The row-count check runs after the file is written, as before.
        if cur_df.shape[0] != args.val_n:
            print("Error !!!!!", ri)
            sys.exit(1)
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")
Exemplo n.º 7
0
def get_counterfactual_data_single_m(args):
    """Derive counterfactual columns when mediation acts on one attribute.

    For each run the data table
    ``<repo_dir>/<data_dir>/<data_flag>/R<ri>.csv`` is read together with the
    parameter tables under
    ``<repo_dir>/parameter_data/<data_flag>/<model_flag>/R<ri>*``.

    * X shift (models ``m2``/``m4``): 0 for groups whose name contains
      ``args.counter_g``; otherwise the ``med_s + counter_g`` estimate minus
      the ``med_s + other_g`` estimate from ``_x.csv``.
    * Y shift: the ``"GF"`` estimate from ``_y.csv`` minus the group's own
      effect (the ``"G"`` term alone when the name contains
      ``args.hidden_g``; otherwise ``"G"`` plus ``"R"`` terms, plus
      ``"GM:RW"`` for names containing ``"MW"``).
    * ``Y_count`` / ``Y_count_resolve`` combine the Y shift with the first
      ``Estimate`` of ``_<other_g>_med.csv`` times ``X_count`` / ``X``.

    Models ``m1`` and ``m3`` are rejected with a message and the process
    exits.  The output is written to
    ``<repo_dir>/counterfactual_data/<data_flag>/<model_flag>/R<ri>_count.csv``
    and the process exits if it does not have ``args.val_n`` rows.
    """

    def coef(table, term):
        # "Estimate" of the first row whose label column equals `term`.
        return table[table["Unnamed: 0"] == term]["Estimate"].values[0]

    # Only support mediation on gender now
    for run_id in range(1, args.counter_run + 1):
        run_tag = "R" + str(run_id)
        cur_df = pd.read_csv("/".join([
            os.path.join(args.repo_dir, args.data_dir),
            args.data_flag,
            run_tag,
        ]) + ".csv")

        other_groups = [
            g for g in cur_df["GR"].unique() if g != args.counter_g
        ]
        base_columns = list(cur_df.columns)

        param_prefix = "/".join([
            os.path.join(args.repo_dir, "parameter_data"),
            args.data_flag,
            args.model_flag,
            run_tag,
        ])

        mediated = args.model_flag in ["m2", "m4"]
        if mediated:
            x_res = pd.read_csv(param_prefix + "_x.csv")

            counter_g_base = coef(x_res, args.med_s + args.counter_g)
            other_g_base = coef(x_res, args.med_s + args.other_g)
            x_shifts = {
                g: 0 if args.counter_g in g else counter_g_base - other_g_base
                for g in other_groups
            }

            cur_df["X_shift"] = cur_df["GR"].apply(lambda g: x_shifts[g])
            cur_df["X_count"] = cur_df["X"] + cur_df["X_shift"]

        # get y shift
        if args.model_flag in ["m1", "m3"]:
            print(
                "Only support model m2 and m4 for mediation on single attribute!"
            )
            exit()

        y_res = pd.read_csv(param_prefix + "_y.csv")

        y_shifts = {}
        for g in other_groups:
            if args.hidden_g in g:  # for BM and BF
                g_effect = coef(y_res, "G" + g[0])
            else:
                g_effect = coef(y_res, "G" + g[0]) + coef(y_res, "R" + g[1])
                if "MW" in g:
                    # add the interaction term for this group
                    g_effect = g_effect + coef(y_res, "GM:RW")

            y_shifts[g] = -g_effect + coef(y_res, "GF")

        med_table = pd.read_csv(param_prefix + "_" + args.other_g +
                                "_med.csv")
        med_weight = med_table["Estimate"][0]

        cur_df["Y_shift"] = cur_df["GR"].apply(lambda g: y_shifts[g])
        cur_df["Y_count"] = cur_df["Y_shift"] + med_weight * cur_df["X_count"]

        if mediated:
            # Same Y shift, but applied to the unshifted X.
            cur_df["Y_count_resolve"] = (
                cur_df["Y_shift"] + med_weight * cur_df["X"])
            kept_columns = base_columns + [
                "X_count", "Y_count", "Y_count_resolve"
            ]
        else:
            kept_columns = base_columns + ["Y_count"]
        cur_df = cur_df.loc[:, kept_columns]

        output_f = "/".join([
            os.path.join(args.repo_dir, "counterfactual_data"),
            args.data_flag,
            args.model_flag,
            run_tag,
        ]) + "_count.csv"
        writeToCSV(output_f, cur_df)
        if cur_df.shape[0] != args.val_n:
            print("Error !!!!!", run_id)
            exit()
        if args.verbose:
            print("--- Save counterfactual in", output_f, " --- \n")