def run_eval(dataset):

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()

    else:
        exit(1)

    # Note: random.seed() does not affect scikit-learn's splitter; pass
    # random_state explicitly to make the split reproducible.
    random.seed(1)
    rounds = 500
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=1)
    classifier = AdaFair(n_estimators=rounds,
                         saIndex=sa_index,
                         saValue=p_Group,
                         CSB="CSB1",
                         use_validation=True,
                         debug=True,
                         X_test=X_test,
                         y_test=y_test)
    classifier.fit(X_train, y_train)

    plot_per_round(rounds, classifier.performance, classifier.objective,
                   classifier.theta,
                   'Images/' + dataset + '_per_round_analysis.png')
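
A minimal sketch of how run_eval might be invoked as a script; the __main__ guard, the sys.argv handling, and the default dataset key are assumptions for illustration, not part of the original module:

import sys

if __name__ == "__main__":
    # Any dataset key handled above works here, e.g.
    #   python <this_module>.py compass-gender
    run_eval(sys.argv[1] if len(sys.argv) > 1 else "compass-gender")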
Example #2
def run_eval(dataset):
    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    base_learners = 200
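    # The 0/1 flag passed to train_classifier selects the non-cumulative vs.
    # cumulative fairness-cost variant (hence the no_cumul / cumul names).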
    no_cumul = train_classifier(X, y, sa_index, p_Group, 0, base_learners)
    cumul = train_classifier(X, y, sa_index, p_Group, 1, base_learners)

    plot_costs_per_round("Images/Costs/" + dataset, no_cumul, cumul)
Example #3
def run_eval(dataset, iterations):
    suffixes = ['NC AdaFair', 'AdaFair']
    create_temp_files(dataset, suffixes)

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    threads = []
    mutex = []
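    # One lock per classifier variant: each worker process receives
    # mutex[proc] so that writes to its shared temp results file
    # (dataset + suffixes[proc]) are serialised.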
    for lock in range(0, 2):
        mutex.append(Lock())

    random.seed(int(time.time()))

    for _ in range(0, iterations):
        start = time.time()

        sss = ShuffleSplit(n_splits=2, test_size=0.5)

        for train_index, test_index in sss.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for proc in range(1, 2):

                threads.append(
                    Process(target=train_classifier,
                            args=(X_train, X_test, y_train, y_test, sa_index,
                                  p_Group, dataset + suffixes[proc],
                                  mutex[proc], proc, 200)))

    for process in threads:
        process.start()

    for process in threads:
        process.join()

    threads = []

    print("elapsed time = " + str(time.time() - start))

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes,
                    "Images/" + dataset + "_single_vs_accum", dataset)
    delete_temp_files(dataset, suffixes)
Example #4
def run_eval(dataset, iterations):
    # suffixes = ['Zafar et al.', 'Adaboost', 'AdaFair', 'SMOTEBoost' ]
    suffixes = ['Zafar et al.', 'Adaboost', 'AdaFair CSB2', 'AdaFair CSB1']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()

    else:
        exit(1)
    create_temp_files(dataset, suffixes)

    # init parameters for zafar method (default settings)
    tau = 3.0
    mu = 1.2
    cons_type = 4
    sensitive_attrs = x_control.keys()
    loss_function = "logreg"
    EPS = 1e-6
    # sensitive_attrs_to_cov_thresh = {sensitive_attrs[0]: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}}
    sensitive_attrs_to_cov_thresh = {0: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}}
    cons_params = {"cons_type": cons_type, "tau": tau, "mu": mu,
                   "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

    threads = []
    mutex = []
    for lock in range(0, 8):
        mutex.append(Lock())

    random.seed(int(time.time()))

    for _ in range(0, iterations):

        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
        for train_index, test_index in sss.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for proc in range(0, 4):
                # Only proc == 3 ('AdaFair CSB1') is trained here; the other
                # variants, including the Zafar branch below, are skipped.
                if proc < 3:
                    time.sleep(1)
                    continue

                if proc > 0:
                    threads.append(
                        Process(target=train_classifier,
                                args=(copy.deepcopy(X_train), X_test,
                                      copy.deepcopy(y_train), y_test,
                                      sa_index, p_Group,
                                      dataset + suffixes[proc],
                                      mutex[proc], proc, 500, 1)))

                # elif proc == 0:
                #     temp_x_control_train = defaultdict(list)
                #     temp_x_control_test = defaultdict(list)
                #
                #     temp_x_control_train[sensitive_attrs[0]] = x_control[sensitive_attrs[0]][train_index]
                #     temp_x_control_test[sensitive_attrs[0]] = x_control[sensitive_attrs[0]][test_index]
                #
                #     x_zafar_train, y_zafar_train, x_control_train = ut.conversion(X[train_index], y[train_index],dict(temp_x_control_train), 1)
                #
                #     x_zafar_test, y_zafar_test, x_control_test = ut.conversion(X[test_index], y[test_index],dict(temp_x_control_test), 1)
                #
                #     threads.append(Process(target=train_zafar, args=(x_zafar_train, y_zafar_train, x_control_train,
                #                                                      x_zafar_test, y_zafar_test, x_control_test,
                #                                                      cons_params, loss_function, EPS,
                #                                                      dataset + suffixes[proc], mutex[proc],
                #                                                      sensitive_attrs)))
            break

    for process in threads:
        process.start()

    for process in threads:
        process.join()

    threads = []

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes, "Images/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
Example #5
def run_eval(dataset, iterations):
    suffixes = ['Adaboost', 'AdaFair', 'SMOTEBoost']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "credit":
        X, y, sa_index, p_Group, x_control = load_credit()
    elif dataset == "diabetes":
        X, y, sa_index, p_Group, x_control = load_diabetes()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()

    else:
        exit(1)
    create_temp_files(dataset, suffixes)
    threads = []
    mutex = []
    for lock in range(0, 8):
        mutex.append(Lock())
    print(dataset)
    random.seed(int(time.time()))

    for iteration in range(0, iterations):
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=.5,
                                     random_state=iteration)
        for train_index, test_index in sss.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # for proc in range(0, 3):
            #     threads.append(Process(target=train_classifier, args=( X_train, X_test, y_train, y_test, sa_index, p_Group, dataset + suffixes[proc], mutex[proc],proc, 500, 1, dataset)))
            threads.append(
                Process(target=train_classifier,
                        args=(X_train, X_test, y_train, y_test, sa_index,
                              p_Group, dataset + suffixes[1], mutex[1], 1, 500,
                              1, dataset)))

            break
    for process in threads:
        process.start()

    for process in threads:
        process.join()

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes, "Images/EqualOpportunity/" + dataset,
                    dataset)
    delete_temp_files(dataset, suffixes)
Example #6
def run_eval(dataset, iterations):

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    suffixes = ['AdaFair NoConf.', 'AdaFair']
    random.seed(int(time.time()))

    base_learners = 500
    steps = numpy.arange(0, 1.001, step=0.2)
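    # Sweep c over [0, 1] in 0.2 increments; the impact of c is what this
    # experiment measures (see plot_results_of_c_impact at the end).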

    create_temp_files(dataset, suffixes, steps)
    threads = []
    mutex = []
    for lock in range(0, 2):
        mutex.append(Lock())

    for iteration in range(0, iterations):
        start = time.time()

        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=.5,
                                     random_state=iteration)
        for train_index, test_index in sss.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for c in steps:
                threads.append(
                    Process(target=train_classifier,
                            args=(X_train, X_test, y_train, y_test, sa_index,
                                  p_Group, dataset + suffixes[1], mutex[1],
                                  base_learners, c)))

            break

        for process in threads:
            process.start()

        for process in threads:
            process.join()

        threads = []

        print("elapsed time = " + str(time.time() - start))

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix + "_dm", 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()
    plot_results_of_c_impact(results[0], results[1], steps, "Images/Impact_c/",
                             dataset)
    delete_temp_files(dataset, suffixes)
Example #7
def run_eval(dataset):

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    base_learners = 200
    adaboost, adaboost_weights, init_weights = train_classifier(
        X, y, sa_index, p_Group, 0, base_learners)
    csb1, csb1_weights, temp = train_classifier(X, y, sa_index, p_Group, 1,
                                                base_learners)
    csb2, csb2_weights, temp = train_classifier(X, y, sa_index, p_Group, 2,
                                                base_learners)

    # y is in {-1, +1}, so multiplying the ensemble scores by the labels
    # yields signed margins: positive for correct predictions, negative
    # for errors.
    adaboost *= y
    csb1 *= y
    csb2 *= y

    csb1_positives = csb1[y == 1]
    csb1_negatives = csb1[y == -1]

    csb2_positives = csb2[y == 1]
    csb2_negatives = csb2[y == -1]

    adaboost_positives = adaboost[y == 1]
    adaboost_negatives = adaboost[y == -1]

    num_bins = 50
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
    # fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(14,3))
    plt.rcParams.update({'font.size': 11})

    ax1.set_title("Positive CDF")
    ax1.grid(True)
    counts_ada_positives, bin_edges_ada_positives = numpy.histogram(
        adaboost_positives, bins=num_bins, density=True)
    cdf_ada_positives = numpy.cumsum(counts_ada_positives)
    ax1.plot(bin_edges_ada_positives[1:],
             cdf_ada_positives / cdf_ada_positives[-1],
             c='blue',
             label='AdaBoost')

    counts_csb1_positives, bin_edges_csb1_positives = numpy.histogram(
        csb1_positives, bins=num_bins, density=True)
    cdf_csb1_positives = numpy.cumsum(counts_csb1_positives)
    ax1.plot(bin_edges_csb1_positives[1:],
             cdf_csb1_positives / cdf_csb1_positives[-1],
             c='green',
             linestyle='-.',
             label='AdaFair NoConf')

    counts_csb2_positives, bin_edges_csb2_positives = numpy.histogram(
        csb2_positives, bins=num_bins, density=True)
    cdf_csb2_positives = numpy.cumsum(counts_csb2_positives)
    ax1.plot(bin_edges_csb2_positives[1:],
             cdf_csb2_positives / cdf_csb2_positives[-1],
             c='red',
             linestyle='--',
             label='AdaFair')
    ax1.legend(loc='best')
    ax1.set_xlabel("Margin")

    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')

    ax2.grid(True)

    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")

    counts_ada_negatives, bin_edges_ada_negatives = numpy.histogram(
        adaboost_negatives, bins=num_bins, density=True)
    cdf_ada_negatives = numpy.cumsum(counts_ada_negatives)
    ax2.plot(bin_edges_ada_negatives[1:],
             cdf_ada_negatives / cdf_ada_negatives[-1],
             c='blue',
             label='AdaBoost')
    ax2.set_ylabel("Cumulative Distribution")
    ax2.set_xlabel("Margin")

    counts_csb1_negatives, bin_edges_csb1_negatives = numpy.histogram(
        csb1_negatives, bins=num_bins, density=True)
    cdf_csb1_negatives = numpy.cumsum(counts_csb1_negatives)
    ax2.plot(bin_edges_csb1_negatives[1:],
             cdf_csb1_negatives / cdf_csb1_negatives[-1],
             c='green',
             linestyle='-.',
             label='AdaFair NoConf')
    counts_csb2_negatives, bin_edges_csb2_negatives = numpy.histogram(
        csb2_negatives, bins=num_bins, density=True)
    cdf_csb2_negatives = numpy.cumsum(counts_csb2_negatives)
    ax2.plot(bin_edges_csb2_negatives[1:],
             cdf_csb2_negatives / cdf_csb2_negatives[-1],
             c='red',
             linestyle='--',
             label='AdaFair')
    ax2.legend(loc='best')

    # index = numpy.arange(4)
    # bar_width = 0.2
    #
    # adaboost_weights = adaboost_weights.split(",")
    # init_weights = init_weights.split(",")
    # csb1_weights = csb1_weights.split(",")
    # csb2_weights = csb2_weights.split(",")
    #
    # ax3.set_title("Weights per group")
    # # ax3.set_ylabel("(%)")
    #
    #
    # prot_pos = [float(init_weights[4]), float(adaboost_weights[4]), float(csb1_weights[4]), float(csb2_weights[4])]
    # non_prot_pos = [float(init_weights[5]), float(adaboost_weights[5]), float(csb1_weights[5]), float(csb2_weights[5])]
    # prot_neg = [float(init_weights[6]), float(adaboost_weights[6]), float(csb1_weights[6]), float(csb2_weights[6])]
    # non_prot_neg = [float(init_weights[7]), float(adaboost_weights[7]), float(csb1_weights[7]), float(csb2_weights[7])]
    #
    # ax3.bar(index, prot_pos,label='Prot. Pos.', edgecolor='black', width= bar_width)
    # ax3.bar(index, non_prot_pos,label='Non-Prot. Pos.', bottom=prot_pos, edgecolor='red', width= bar_width)
    # ax3.bar(index, prot_neg,label='Prot. Neg.', bottom=[i+j for i,j in zip(prot_pos, non_prot_pos)],  edgecolor='green', width= bar_width)
    # ax3.bar(index, non_prot_neg,label='Non-Prot. Neg.', bottom=[i+j+z for i,j,z in zip(prot_pos, non_prot_pos, prot_neg)],  edgecolor='blue', width= bar_width)
    #
    #
    #
    # ax3.set_xticks([0  , 1 , 2 , 3 ])
    # ax3.grid(True)
    #
    # ax3.set_xticklabels(['Initial Weights','AdaBoost', 'AdaFair NoConf.', 'AdaFair'])
    # ax3.legend(loc='best', fancybox=True, framealpha=0.1)
    # plt.yticks(numpy.arange(0, 1.0001, step=0.1))

    # ax3.set_ylim([0.48, 0.52])
    # plt.rcParams.update({'font.size': 9})

    fig.tight_layout()

    # Save before plt.show(): once the interactive window is closed, savefig()
    # writes out an empty canvas. The per-axes legends set above already cover
    # both panels, so no figure-level plt.legend() call is needed.
    plt.savefig("Images/cdf_" + dataset + ".png")
    plt.show()
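
Example #7 above (and Example #8 below) repeats the same histogram-to-CDF idiom six times. A small helper along these lines could factor it out; the function name plot_margin_cdf and its signature are our own sketch, not part of the repository:

import numpy


def plot_margin_cdf(ax, margins, num_bins=50, **plot_kwargs):
    """Plot the empirical CDF of a margin distribution on the given axes."""
    # Normalised histogram -> running sum -> rescale so the curve ends at 1.0,
    # mirroring the inline computation in the examples.
    counts, bin_edges = numpy.histogram(margins, bins=num_bins, density=True)
    cdf = numpy.cumsum(counts)
    ax.plot(bin_edges[1:], cdf / cdf[-1], **plot_kwargs)

# Usage: plot_margin_cdf(ax1, adaboost_positives, c='blue', label='AdaBoost')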
Example #8
def run_eval(dataset):

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    base_learners = 200
    adaboost, adaboost_weights, init_weights = train_classifier(
        X, y, sa_index, p_Group, 0, base_learners)
    csb1, csb1_weights, temp = train_classifier(X, y, sa_index, p_Group, 1,
                                                base_learners)
    csb2, csb2_weights, temp = train_classifier(X, y, sa_index, p_Group, 2,
                                                base_learners)

    adaboost *= y
    csb1 *= y
    csb2 *= y

    csb1_positives = csb1[y == 1]
    csb1_negatives = csb1[y == -1]

    csb2_positives = csb2[y == 1]
    csb2_negatives = csb2[y == -1]

    adaboost_positives = adaboost[y == 1]
    adaboost_negatives = adaboost[y == -1]

    num_bins = 50
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
    # fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(14,3))
    plt.rcParams.update({'font.size': 11})

    ax1.set_title("Positive CDF")
    ax1.grid(True)
    counts_ada_positives, bin_edges_ada_positives = numpy.histogram(
        adaboost_positives, bins=num_bins, density=True)
    cdf_ada_positives = numpy.cumsum(counts_ada_positives)
    ax1.plot(bin_edges_ada_positives[1:],
             cdf_ada_positives / cdf_ada_positives[-1],
             c='blue',
             label='AdaBoost')

    counts_csb1_positives, bin_edges_csb1_positives = numpy.histogram(
        csb1_positives, bins=num_bins, density=True)
    cdf_csb1_positives = numpy.cumsum(counts_csb1_positives)
    ax1.plot(bin_edges_csb1_positives[1:],
             cdf_csb1_positives / cdf_csb1_positives[-1],
             c='green',
             linestyle='-.',
             label='AdaFair NoConf')

    counts_csb2_positives, bin_edges_csb2_positives = numpy.histogram(
        csb2_positives, bins=num_bins, density=True)
    cdf_csb2_positives = numpy.cumsum(counts_csb2_positives)
    ax1.plot(bin_edges_csb2_positives[1:],
             cdf_csb2_positives / cdf_csb2_positives[-1],
             c='red',
             linestyle='--',
             label='AdaFair')
    ax1.legend(loc='best')
    ax1.set_xlabel("Margin")
    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')
    ax2.grid(True)

    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")

    counts_ada_negatives, bin_edges_ada_negatives = numpy.histogram(
        adaboost_negatives, bins=num_bins, density=True)
    cdf_ada_negatives = numpy.cumsum(counts_ada_negatives)
    ax2.plot(bin_edges_ada_negatives[1:],
             cdf_ada_negatives / cdf_ada_negatives[-1],
             c='blue',
             label='AdaBoost')
    ax2.set_ylabel("Cumulative Distribution")
    ax2.set_xlabel("Margin")

    counts_csb1_negatives, bin_edges_csb1_negatives = numpy.histogram(
        csb1_negatives, bins=num_bins, density=True)
    cdf_csb1_negatives = numpy.cumsum(counts_csb1_negatives)
    ax2.plot(bin_edges_csb1_negatives[1:],
             cdf_csb1_negatives / cdf_csb1_negatives[-1],
             c='green',
             linestyle='-.',
             label='AdaFair NoConf')
    counts_csb2_negatives, bin_edges_csb2_negatives = numpy.histogram(
        csb2_negatives, bins=num_bins, density=True)
    cdf_csb2_negatives = numpy.cumsum(counts_csb2_negatives)
    ax2.plot(bin_edges_csb2_negatives[1:],
             cdf_csb2_negatives / cdf_csb2_negatives[-1],
             c='red',
             linestyle='--',
             label='AdaFair')
    ax2.legend(loc='best')

    fig.tight_layout()
    # As in the previous example: save before plt.show(), and drop the
    # redundant figure-level plt.legend() call.
    plt.savefig("Images/cdf_" + dataset + "_sp.png")
    plt.show()