Пример #1
0
def f1_score_test_sets(location, indices_info, n_splits, k_info,
                       threshold_info, best_lambda_euclidean,
                       best_lambda_mahalanobis):
    """Evaluate the F1 score of every approach on each test split.

    Loads the training data and the optimization artifacts once, then runs
    `f1_score_all_approaches` on each of the `n_splits` test splits, printing
    per-split results and finally the averages over all splits.
    """
    loaded_train_data = load_data(location, "train")
    loaded_optimization_info = load_optimization_info(location,
                                                      best_lambda_euclidean,
                                                      best_lambda_mahalanobis)

    splitted_test_sets = split_test_sets(n_splits, location)
    f1_results = []

    # enumerate replaces the hand-rolled split counter of the original
    for i, test_set in enumerate(splitted_test_sets):
        print("Split: " + str(i))
        f1_baseline, f1_luong, f1_zhang, f1_euclidean, f1_mahalanobis = f1_score_all_approaches(
            loaded_train_data, test_set, loaded_optimization_info,
            indices_info, k_info, threshold_info)
        f1_results.append({
            'baseline': f1_baseline,
            'luong': f1_luong,
            'zhang': f1_zhang,
            'euclidean': f1_euclidean,
            'mahalanobis': f1_mahalanobis
        })
        # running list of per-split results, printed after every split
        print(f1_results)
    print("F1")
    utils.print_avg_results_from_dictionary(f1_results)
Пример #2
0
def f1_score_val_set(location, indices_info, k_info, possible_thresholds,
                     best_lambda_euclidean, best_lambda_mahalanobis):
    """Print validation-set F1 scores of every approach for each candidate threshold."""
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location,
                                      best_lambda_euclidean,
                                      best_lambda_mahalanobis)

    for threshold in possible_thresholds:
        print("threshold: " + str(threshold))
        # every approach is evaluated with the same cut-off
        threshold_info = dict.fromkeys(
            ('baseline', 'luong', 'zhang', 'euclidean', 'mahalanobis'),
            threshold)
        approach_scores = f1_score_all_approaches(train_dict, val_dict,
                                                  opt_info, indices_info,
                                                  k_info, threshold_info)
        # baseline, luong, zhang, euclidean, mahalanobis -- in that order
        for score in approach_scores:
            print(score)
    return
Пример #3
0
def give_disc_scores_validation_set(location, indices_info, k_info,
                                    best_lambda_euclidean,
                                    best_lambda_mahalanobis):
    """Return a DataFrame of per-approach discrimination scores.

    Rows are indexed by the validation indices whose protected_info == 1;
    one column per approach.
    """
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location,
                                      best_lambda_euclidean,
                                      best_lambda_mahalanobis)

    protected_indices_val = np.where(val_dict['protected_info'] == 1)[0]

    (baseline_scores, luong_scores, zhang_scores, weighted_euclidean_scores,
     mahalanobis_scores) = give_all_disc_scores(train_dict, val_dict,
                                                opt_info, indices_info,
                                                k_info, "adult")

    # one column per approach, rows aligned with the protected validation rows
    return pd.DataFrame(
        {
            'Baseline': baseline_scores,
            'Luong': luong_scores,
            'Zhang': zhang_scores,
            'Weighted Euclidean': weighted_euclidean_scores,
            'Mahalanobis': mahalanobis_scores,
        },
        index=protected_indices_val)
def inspect_mistakes(location, k, threshold):
    """Print the confusion matrix of Luong discrimination labels on the validation set.

    Scores every protected validation instance with the Luong k-NN approach,
    thresholds the scores into binary discrimination labels, and compares them
    against the validation ground truth.
    """
    loaded_train_data = load_data(location, "train")
    loaded_val_data = load_data(location, "val")
    # NOTE(review): called without lambda arguments, unlike the other helpers
    # in this module -- confirm load_optimization_info supports this form.
    loaded_optimization_info = load_optimization_info(location)

    # Removed unused locals of the original: train_data, val_data and
    # weights_euclidean were loaded but never referenced.
    train_data_standardized = loaded_train_data['standardized_data']
    train_protected_info = loaded_train_data['protected_info']
    train_class_label = loaded_train_data['class_label']
    train_protected_indices = list(np.where(train_protected_info == 1)[0])
    train_unprotected_indices = list(np.where(train_protected_info == 2)[0])

    val_data_standardized = loaded_val_data['standardized_data']
    val_ground_truth = loaded_val_data['ground_truth']
    val_protected_info = loaded_val_data['protected_info']
    val_class_label = loaded_val_data['class_label']
    val_protected_indices = list(np.where(val_protected_info == 1)[0])

    indices_info = loaded_optimization_info['indices_info']

    discrimination_scores = give_all_disc_scores_Luong(
        k,
        class_info_train=train_class_label,
        protected_indices_train=train_protected_indices,
        unprotected_indices_train=train_unprotected_indices,
        training_set=train_data_standardized,
        protected_indices_test=val_protected_indices,
        class_info_test=val_class_label,
        test_set=val_data_standardized,
        indices_info=indices_info)
    disc_labels = utils.give_disc_label(discrimination_scores, threshold)

    print(confusion_matrix(val_ground_truth, disc_labels))
Пример #5
0
def find_best_reject_threshold_based_on_k_distance_plot(location, indices_info, k, technique, lambda_l1, title):
    """Pick a reject threshold from the knee of the sorted k-distance plot.

    Computes the distance from every protected, negative-class validation
    instance to the unprotected training instances under the chosen metric,
    finds the knee point of the sorted k-th-neighbour distances, plots the
    curve and returns the knee value as the threshold.
    """
    loaded_train_data = load_data(location, "train")
    loaded_test_data = load_data(location, "val")

    train_data_standardized = loaded_train_data['standardized_data']
    train_protected_info = loaded_train_data['protected_info']
    train_unprotected_indices = list(np.where(train_protected_info == 2)[0])

    val_data_standardized = loaded_test_data['standardized_data']
    # BUG FIX: the class labels must come from the validation split -- they are
    # intersected with validation protected indices below. The original read
    # them from loaded_train_data.
    val_class_label = loaded_test_data['class_label']
    val_protected_info = loaded_test_data['protected_info']

    val_indices_negative_class_label = set(np.where(val_class_label == 0)[0])
    val_protected_indices = set(np.where(val_protected_info == 1)[0])
    val_indices_protected_and_negative = val_indices_negative_class_label.intersection(val_protected_indices)

    train_data_standardized_unprotected = train_data_standardized.iloc[train_unprotected_indices]
    val_data_standardized_protected = val_data_standardized.iloc[list(val_indices_protected_and_negative)]

    loaded_optimization_info = load_optimization_info(location, lambda_l1, lambda_l1)

    if technique == 'euclidean':
        weights_euclidean = loaded_optimization_info['weights_euclidean']
        distance_matrix = cdist(val_data_standardized_protected.values, train_data_standardized_unprotected.values, weighted_euclidean_distance, weights=weights_euclidean, indices_info=indices_info)
    elif technique == 'mahalanobis':
        mahalanobis_matrix = loaded_optimization_info['mahalanobis_matrix']
        distance_matrix = cdist(val_data_standardized_protected.values, train_data_standardized_unprotected.values, mahalanobis_distance, weights=mahalanobis_matrix, indices_info=indices_info)
    else:
        # previously an unknown technique fell through to a NameError on
        # distance_matrix; fail with a clear message instead
        raise ValueError("Unknown technique: " + str(technique))

    sorted_distances = get_sorted_distances_to_k_neighbour(distance_matrix, k)
    best_threshold = find_knee_point_of_sorted_k_distance_plot_geometric(sorted_distances)
    print(best_threshold)
    sorted_k_distance_plot(sorted_distances, len(sorted_distances), k, title)

    return best_threshold
Пример #6
0
def find_best_threshold_based_on_demographic_parity(location, indices_info, k, technique, lambda_l1=0):
    """Pick a discrimination threshold so the number of flagged instances matches the demographic-parity gap.

    The gap is estimated as (positive labels in the unprotected group) minus
    (positive labels in the protected group) on the validation set; the
    threshold is then chosen so that roughly that many negative-class
    protected instances score above it.
    """
    loaded_train_data = load_data(location, "train")
    loaded_val_data = load_data(location, "val")
    loaded_optimization_info = load_optimization_info(location, lambda_l1, lambda_l1)

    discrimination_scores = np.array(give_disc_scores_one_technique(loaded_train_data, loaded_val_data, loaded_optimization_info, technique, indices_info, k))

    # -1 marks positive-class instances; keep negative-class scores only,
    # sorted from highest to lowest
    discrimination_scores_negative_class_labels_only = discrimination_scores[discrimination_scores != -1]
    reverse_sorted_disc_scores_neg_class_labels = np.sort(discrimination_scores_negative_class_labels_only)[::-1]

    # FIX: the original re-loaded the validation split a second time here;
    # reuse the already-loaded data instead.
    val_protected_info = loaded_val_data['protected_info']
    val_class_label = loaded_val_data['class_label']

    val_protected_indices = list(np.where(val_protected_info == 1)[0])
    val_unprotected_indices = list(np.where(val_protected_info == 2)[0])

    class_labels_unprotected = val_class_label[val_unprotected_indices]
    amount_of_positive_class_labels_unprotected = sum(class_labels_unprotected)
    print("Amount of positive class labels unprotected: " + str(amount_of_positive_class_labels_unprotected))

    class_labels_protected = val_class_label[val_protected_indices]
    amount_of_positive_class_labels_protected = sum(class_labels_protected)
    print("Amount of positive class labels protected: " + str(amount_of_positive_class_labels_protected))

    estimate_amount_of_discriminated_people = amount_of_positive_class_labels_unprotected - amount_of_positive_class_labels_protected
    print("Estimated amount of discriminated people: " + str(estimate_amount_of_discriminated_people))
    best_threshold = find_best_threshold_helper_function(reverse_sorted_disc_scores_neg_class_labels, estimate_amount_of_discriminated_people)
    print(best_threshold)
    return best_threshold
Пример #7
0
def compare_disc_detection_with_and_without_reject_option(
        location, indices_info, lambda_l1, k, reject_threshold, technique):
    """Compare AUC of discrimination detection with and without a reject option.

    Scores the validation set twice (with and without rejection), prints AUC
    scores for the non-rejected subset vs. the full set, and prints the scores
    that the no-reject model would have assigned to the rejected instances,
    split by their ground-truth label.

    Returns the array of rejected indices.
    """
    loaded_train_data = load_data(location, "train")
    loaded_val_data = load_data(location, "val")
    loaded_optimization_info = load_optimization_info(location, lambda_l1,
                                                      lambda_l1)

    val_ground_truth = np.array(loaded_val_data['ground_truth'])

    disc_scores_without_reject, _ = give_disc_scores_one_technique(
        loaded_train_data, loaded_val_data, loaded_optimization_info,
        technique, indices_info, k)
    disc_scores_with_reject, rejected_indices = give_disc_scores_with_reject_one_technique(
        loaded_train_data, loaded_val_data, loaded_optimization_info,
        technique, indices_info, k, reject_threshold)

    rejected_indices = np.array(rejected_indices)
    print("Number of rejected indices: " + str(len(rejected_indices)))
    print(rejected_indices)
    disc_scores_without_reject = np.array(disc_scores_without_reject)
    disc_scores_with_reject = np.array(disc_scores_with_reject)
    # -1000 is the sentinel score assigned to rejected instances
    rejected_protected_info_indices = np.where(
        disc_scores_with_reject == -1000)[0]
    not_rejected_protected_info_indices = np.where(
        disc_scores_with_reject != -1000)[0]

    val_ground_truth_of_non_rejected_indices = val_ground_truth[
        not_rejected_protected_info_indices]
    disc_scores_of_non_rejected_indices = disc_scores_without_reject[
        not_rejected_protected_info_indices]

    # AUC restricted to the instances the reject option kept
    print("AUC Scores with reject option")
    print(
        utils.get_auc_scores(val_ground_truth_of_non_rejected_indices,
                             disc_scores_of_non_rejected_indices))
    print("AUC Scores without reject option")
    print(utils.get_auc_scores(val_ground_truth, disc_scores_without_reject))

    # Inspect what the no-reject model would have said about the rejected
    # instances, split by whether they were truly discriminated.
    ground_truth_of_rejected_indices = np.array(
        val_ground_truth[rejected_protected_info_indices])
    disc_scores_normally_given_to_rejected_indices = disc_scores_without_reject[
        rejected_protected_info_indices]

    indices_where_ground_truth_discriminated = np.where(
        ground_truth_of_rejected_indices == 1)[0]
    indices_where_ground_truth_not_discriminated = np.where(
        ground_truth_of_rejected_indices == 0)[0]

    disc_scores_normally_given_to_discriminated_instances = disc_scores_normally_given_to_rejected_indices[
        indices_where_ground_truth_discriminated]
    disc_scores_normally_given_to_not_discriminated_instances = disc_scores_normally_given_to_rejected_indices[
        indices_where_ground_truth_not_discriminated]

    print(disc_scores_normally_given_to_discriminated_instances)
    print(disc_scores_normally_given_to_not_discriminated_instances)
    #print(sum(disc_scores_normally_given_to_discriminated_instances)/len(disc_scores_normally_given_to_discriminated_instances))
    #print(sum(disc_scores_normally_given_to_not_discriminated_instances)/len(disc_scores_normally_given_to_not_discriminated_instances))

    return rejected_indices
def comparing_algorithms_on_specific_indices(location, indices_info, k_info, best_lambda_euclidean, best_lambda_mahalanobis, indices_of_interest=None):
    """Print/compare the discrimination scores of all approaches on specific test indices.

    Generalized: the indices are now a parameter; the default keeps the
    original hard-coded index [772] for backward compatibility.
    """
    if indices_of_interest is None:
        indices_of_interest = [772]

    loaded_train_data = load_data(location, "train")
    loaded_val_data = load_data(location, "test")
    loaded_optimization_info = load_optimization_info(location, best_lambda_euclidean, best_lambda_mahalanobis)

    luong_scores, zhang_scores, weighted_euclidean_scores, mahalanobis_scores = give_disc_scores_to_given_indices(
        loaded_train_data, loaded_val_data,
        loaded_optimization_info, indices_info, k_info, "adult", indices_of_interest)
    return
Пример #9
0
def discrimination_detection_with_reject_option(location, indices_info,
                                                lambda_l1, k, reject_threshold,
                                                technique):
    """Run reject-option discrimination detection and return the rejected indices.

    Also prints how many validation instances were rejected.
    """
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location, lambda_l1, lambda_l1)

    _, rejected_indices = give_disc_scores_with_reject_one_technique(
        train_dict, val_dict, opt_info, technique, indices_info, k,
        reject_threshold)
    print(len(rejected_indices))
    return rejected_indices
Пример #10
0
def area_under_curve_validation_set(location, indices_info, k_info,
                                    best_lambda_euclidean,
                                    best_lambda_mahalanobis):
    """Print the AUC of all approaches on the validation set."""
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location,
                                      best_lambda_euclidean,
                                      best_lambda_mahalanobis)

    auc_scores = area_under_curve_all_approaches(train_dict, val_dict,
                                                 opt_info, indices_info,
                                                 k_info)
    print(auc_scores)
    return
Пример #11
0
def find_best_threshold_based_on_unprotected_region(
        location,
        indices_info,
        k,
        technique,
        lambda_l1=0,
        adult_or_admission="admission"):
    """Derive a score threshold from the unprotected group's distribution.

    Uses the Tukey upper fence (Q3 + 1.5 * IQR) of the unprotected,
    negative-class discrimination scores as the threshold, and shows a
    box/strip plot of both groups' score distributions.
    """
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location, lambda_l1, lambda_l1)

    scores_protected = np.array(
        give_disc_scores_one_technique(train_dict, val_dict, opt_info,
                                       technique, indices_info, k, 1,
                                       adult_or_admission))
    scores_unprotected = np.array(
        give_disc_scores_one_technique(train_dict, val_dict, opt_info,
                                       technique, indices_info, k, 2,
                                       adult_or_admission))

    # -1 marks positive-class instances; keep negative-class scores only
    prot_neg_scores = scores_protected[scores_protected != -1]
    unprot_neg_scores = scores_unprotected[scores_unprotected != -1]

    first_quartile = np.quantile(unprot_neg_scores, 0.25)
    third_quartile = np.quantile(unprot_neg_scores, 0.75)
    max_non_outlier = third_quartile + 1.5 * (third_quartile - first_quartile)
    print(max_non_outlier)

    boxplot(data=[unprot_neg_scores, prot_neg_scores],
            showmeans=True,
            meanprops={
                "marker": "o",
                "markerfacecolor": "grey",
                "markeredgecolor": "black",
                "markersize": "8"
            })
    stripplot(data=[unprot_neg_scores, prot_neg_scores], color=".3")

    plt.show()
    return max_non_outlier
Пример #12
0
def find_best_reject_threshold_based_on_percentage_rejected(location, indices_info, k, technique, lambda_l1, desired_percent_rejected):
    """Return the neighbour-distance threshold that rejects ~desired_percent_rejected of instances.

    Sorts the distances to the closest neighbours in descending order and
    returns the distance at the desired rejection count.
    """
    loaded_train_data = load_data(location, "train")
    loaded_test_data = load_data(location, "val")
    loaded_optimization_info = load_optimization_info(location, lambda_l1, lambda_l1)

    disc_scores, dist_to_closest_neighbours = give_disc_scores_one_technique(loaded_train_data, loaded_test_data, loaded_optimization_info, technique, indices_info, k)
    dist_to_closest_neighbours = np.array(dist_to_closest_neighbours)

    print(disc_scores)
    print(len(disc_scores))
    print(len(dist_to_closest_neighbours))
    desired_amount_rejected = round(len(dist_to_closest_neighbours) * desired_percent_rejected)
    print(desired_amount_rejected)
    # BUG FIX: when desired_amount_rejected is 0 the original indexed [-1] and
    # returned the *smallest* distance, rejecting nearly everything instead of
    # nothing. Return an unreachable threshold in that case.
    # (assumes instances with distance above the threshold are rejected -- TODO confirm)
    if desired_amount_rejected == 0:
        return np.inf
    sorted_dist_to_closest_neighbours = np.sort(dist_to_closest_neighbours)[::-1]
    print(sorted_dist_to_closest_neighbours)
    threshold = sorted_dist_to_closest_neighbours[desired_amount_rejected - 1]
    return threshold
Пример #13
0
def visualize_mahalanobis(data_location, lambda_l1_norm, title):
    """Print the learned Mahalanobis matrix and the training data projected through it.

    `title` is currently unused but kept for signature compatibility with
    `visualize_euclidean`.
    """
    data_dict = load_data(data_location, "train")
    loaded_optimization_info = load_optimization_info(
        data_location,
        lambda_l1_norm_euclidean=lambda_l1_norm,
        lambda_l1_norm_mahalanobis=lambda_l1_norm)
    mahalanobis_matrix = loaded_optimization_info['mahalanobis_matrix']
    # Removed unused locals of the original: protected_info, class_label and
    # discriminated_instances were loaded but never referenced.
    standardized_data = data_dict['standardized_data']

    print(mahalanobis_matrix)
    projected_data = utils.project_to_mahalanobis(standardized_data,
                                                  mahalanobis_matrix)
    print(projected_data)
Пример #14
0
def area_under_curve_test_sets(location,
                               indices_info,
                               n_splits,
                               k_info,
                               best_lambda_euclidean,
                               best_lambda_mahalanobis,
                               adult_or_admission="admission"):
    """Evaluate ROC and PR AUC of all approaches on each test split and print averages."""
    approaches = ('baseline', 'luong', 'zhang', 'euclidean', 'mahalanobis')

    loaded_train_data = load_data(location, "train")
    loaded_optimization_info = load_optimization_info(location,
                                                      best_lambda_euclidean,
                                                      best_lambda_mahalanobis)
    splitted_test_sets = split_test_sets(n_splits, location)
    roc_results = []
    pr_results = []
    # enumerate replaces the hand-rolled split counter of the original
    for i, test_set in enumerate(splitted_test_sets):
        print("Split: " + str(i))
        pr_aucs, roc_aucs = area_under_curve_all_approaches(
            loaded_train_data, test_set, loaded_optimization_info,
            indices_info, k_info, adult_or_admission)
        # keep only the per-approach entries of each result dict
        roc_results.append({name: roc_aucs[name] for name in approaches})
        pr_results.append({name: pr_aucs[name] for name in approaches})
        print(pr_aucs)
        print(roc_aucs)
    utils.print_avg_results_from_dictionary(roc_results)
    utils.print_avg_results_from_dictionary(pr_results)

    return
Пример #15
0
def find_best_threshold_based_on_estimated_number_of_discriminated_people(location, indices_info, k, technique, estimated_percent_of_discrimination, lambda_l1=0):
    """Pick a score threshold so the flagged count matches an assumed discrimination rate.

    Estimates how many protected instances were denied a positive label, then
    chooses the threshold so roughly that many negative-class instances score
    above it.
    """
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location, lambda_l1, lambda_l1)

    disc_scores = np.array(give_disc_scores_one_technique(train_dict, val_dict, opt_info, technique, indices_info, k))
    print("Amount of women: " + str(len(disc_scores)))
    # -1 marks positive-class instances; everything else is negative-class
    neg_class_scores = disc_scores[disc_scores != -1]
    n_positive = len(disc_scores) - len(neg_class_scores)
    print("Amount of positive class labels" + str(n_positive))
    print("Amount of negative class labels" + str(len(neg_class_scores)))
    # scale observed positives up to the assumed discrimination-free count
    estimated_total_positive = int(1 / (1 - estimated_percent_of_discrimination) * n_positive)
    estimated_discriminated = estimated_total_positive - n_positive
    print("Estimated number of discriminated people:" + str(estimated_discriminated))
    descending_scores = np.sort(neg_class_scores)[::-1]
    best_threshold = find_best_threshold_helper_function(descending_scores, estimated_discriminated)
    print(best_threshold)
    return best_threshold
Пример #16
0
def visualize_euclidean(data_location, lambda_l1_norm, title):
    """Plot positive vs. negative instances after projection with the learned Euclidean weights."""
    data_dict = load_data(data_location, "train")
    opt_info = load_optimization_info(
        data_location,
        lambda_l1_norm_euclidean=lambda_l1_norm,
        lambda_l1_norm_mahalanobis=lambda_l1_norm)

    standardized_data = data_dict['standardized_data']
    protected_info = data_dict['protected_info']
    class_label = data_dict['class_label']
    discriminated_instances = data_dict['discriminated_instances']

    euclidean_weights = opt_info['weights_euclidean']
    projected = utils.project_to_weighted_euclidean(standardized_data,
                                                    euclidean_weights)

    visualize_positive_vs_negative(projected, protected_info, class_label,
                                   title)
    return
Пример #17
0
def area_under_curve_validation_set_different_k(location, indices_info,
                                                possible_ks,
                                                best_lambda_euclidean,
                                                best_lambda_mahalanobis):
    """Print validation-set AUC of all approaches for each candidate k."""
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")
    opt_info = load_optimization_info(location,
                                      best_lambda_euclidean,
                                      best_lambda_mahalanobis)
    for k in possible_ks:
        print(k)
        # every approach is evaluated with the same number of neighbours
        k_info = dict.fromkeys(
            ('baseline', 'luong', 'zhang', 'euclidean', 'mahalanobis'), k)
        print(
            area_under_curve_all_approaches(train_dict, val_dict, opt_info,
                                            indices_info, k_info))
    return
Пример #18
0
def visualize_inter_and_intra_distances(location, lambda_l1_euclidean,
                                        lambda_l1_mahalanobis, indices_info):
    """Boxplot of inter- and intra-gender pairwise distances per distance measure.

    Computes pairwise training-set distance matrices for the Luong, weighted
    Euclidean and Mahalanobis metrics, splits the distances into
    within-protected, within-unprotected and between-group clusters, and shows
    them as grouped boxplots.
    """
    loaded_data = load_data(location, "train")
    loaded_optimization_info = load_optimization_info(location,
                                                      lambda_l1_euclidean,
                                                      lambda_l1_mahalanobis)

    standardized_data = loaded_data['standardized_data']
    protected_info = loaded_data['protected_info']

    euclidean_weights = loaded_optimization_info['weights_euclidean']
    mahalanobis_matrix = loaded_optimization_info['mahalanobis_matrix']

    luong_distances = utils.make_distance_matrix_based_on_distance_function(
        standardized_data, luong_distance, [], indices_info)
    weighted_euclidean_distances = utils.make_distance_matrix_based_on_distance_function(
        standardized_data, weighted_euclidean_distance, euclidean_weights,
        indices_info)
    mahalanobis_distances = utils.make_distance_matrix_based_on_distance_function(
        standardized_data, mahalanobis_distance, mahalanobis_matrix,
        indices_info)

    print("Luong")
    inter_prot_luong, inter_unprot_luong, intra_luong = utils.get_inter_and_intra_sens_distances(
        luong_distances, protected_info, 1)
    print("Weighted Euclidean")
    inter_prot_euclidean, inter_unprot_euclidean, intra_euclidean = utils.get_inter_and_intra_sens_distances(
        weighted_euclidean_distances, protected_info, 1)
    print("Mahalanobis")
    inter_prot_mahalanobis, inter_unprot_mahalanobis, intra_mahalanobis = utils.get_inter_and_intra_sens_distances(
        mahalanobis_distances, protected_info, 1)

    def _melted_frame(luong, euclidean, mahalanobis, cluster_label):
        # Long-format frame for one gender cluster (one row per distance).
        frame = pd.DataFrame(columns=["Luong", "Euclidean", "Mahalanobis"])
        frame['Luong'] = luong
        frame['Euclidean'] = euclidean
        frame['Mahalanobis'] = mahalanobis
        melted = pd.melt(frame)
        melted['Cluster'] = cluster_label
        return melted

    # FIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
    all_distances = pd.concat([
        _melted_frame(inter_prot_luong, inter_prot_euclidean,
                      inter_prot_mahalanobis, "Women vs. Women"),
        _melted_frame(inter_unprot_luong, inter_unprot_euclidean,
                      inter_unprot_mahalanobis, "Men vs. Men"),
        _melted_frame(intra_luong, intra_euclidean, intra_mahalanobis,
                      "Men vs. Women"),
    ])
    all_distances = all_distances.rename(columns={
        'variable': 'Measure',
        'value': 'Distance'
    })

    boxplot(x="Cluster",
            y="Distance",
            hue='Measure',
            data=all_distances,
            showmeans=True,
            meanprops={
                "marker": "o",
                "markerfacecolor": "white",
                "markeredgecolor": "black",
                "markersize": "10"
            })
    plt.xlabel("Gender clusters", size=14)
    plt.ylabel("Distance", size=14)
    plt.title("Inter- and intra distances within and between genders", size=18)
    plt.legend(loc='upper right')
    plt.show()
Пример #19
0
def decision_labels_properties_for_unprotected_group(location, indices_info, k, technique, unprotected_label, lambda_l1=0):
    """Return (negative-class, positive-class) decision-label predictions for the unprotected group.

    Dispatches on `technique`; returns 0 for an unrecognized technique (kept
    from the original contract).
    """
    train_dict = load_data(location, "train")
    val_dict = load_data(location, "val")

    train_data = train_dict['data']
    train_data_standardized = train_dict['standardized_data']
    train_protected_info = train_dict['protected_info']
    train_class_label = train_dict['class_label']
    train_unprotected_indices = list(np.where(train_protected_info == 2)[0])

    val_data = val_dict['data']
    val_data_standardized = val_dict['standardized_data']
    val_protected_info = val_dict['protected_info']
    val_class_label = val_dict['class_label']
    val_unprotected_indices = list(np.where(val_protected_info == unprotected_label)[0])

    # keyword arguments shared by every kNN-style technique below
    common_kwargs = dict(
        class_info_train=train_class_label,
        unprotected_indices_train=train_unprotected_indices,
        unprotected_indices_test=val_unprotected_indices,
        class_info_test=val_class_label,
        indices_info=indices_info)

    if technique == 'baseline':
        negative, positive = give_decision_labels_unprotected_group(
            k,
            training_set=train_data,
            test_set=val_data,
            distance_function=luong_distance,
            **common_kwargs)
        return negative, positive
    elif technique == 'luong':
        negative, positive = give_decision_labels_unprotected_group(
            k,
            training_set=train_data_standardized,
            test_set=val_data_standardized,
            distance_function=luong_distance,
            **common_kwargs)
        return negative, positive
    elif technique == 'zhang':
        negative, positive = get_zhang_decision_scores_unprotected_group(
            "adult",
            k=k,
            train_data=train_data,
            train_sens_attribute=train_protected_info,
            train_decision_attribute=train_class_label,
            test_data=val_data,
            test_sens_attribute=val_protected_info,
            test_decision_attribute=val_class_label)
        print(positive)
        print(negative)
        return negative, positive
    elif technique == 'euclidean':
        # NOTE(review): the mahalanobis lambda is hard-coded to 0.09 here,
        # unlike the 'mahalanobis' branch -- preserved as-is, confirm intent.
        weights = load_optimization_info(location, lambda_l1, 0.09)['weights_euclidean']
        negative, positive = give_decision_labels_unprotected_group(
            k,
            training_set=train_data_standardized,
            test_set=val_data_standardized,
            distance_function=weighted_euclidean_distance,
            weights=weights,
            **common_kwargs)
        return negative, positive
    elif technique == 'mahalanobis':
        matrix = load_optimization_info(location, lambda_l1, lambda_l1)['mahalanobis_matrix']
        negative, positive = give_decision_labels_unprotected_group(
            k,
            training_set=train_data_standardized,
            test_set=val_data_standardized,
            distance_function=mahalanobis_distance,
            weights=matrix,
            **common_kwargs)
        return negative, positive
    return 0