Example #1
def test_adult():
    ad = AdultDataset()
    # print(ad.feature_names)
    assert np.isclose(ad.labels.mean(), 0.2478, atol=5e-5)

    bldm = BinaryLabelDatasetMetric(ad)
    assert bldm.num_instances() == 45222
Example #2
def get_evaluation(dataset_orig_vt, y_pred, privileged_groups,
                   unprivileged_groups, unpriv_val, priv_val, pos_label):
    print('Accuracy')
    print(accuracy_score(dataset_orig_vt.labels, y_pred))
    dataset_orig_vt_copy1 = dataset_orig_vt.copy()
    dataset_orig_vt_copy1.labels = y_pred

    metric_transf_train1 = BinaryLabelDatasetMetric(
        dataset_orig_vt_copy1,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print('p-rule')
    print(
        min(metric_transf_train1.disparate_impact(),
            1 / metric_transf_train1.disparate_impact()))
    print('FPR for unpriv group')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == unpriv_val],
                  y_pred[orig_sens_att == unpriv_val], pos_label))
    print("FNR for unpriv group")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == unpriv_val],
                  y_pred[orig_sens_att == unpriv_val], pos_label))

    print('FPR for priv group')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == priv_val],
                  y_pred[orig_sens_att == priv_val], pos_label))
    print("FNR for priv group")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == priv_val],
                  y_pred[orig_sens_att == priv_val], pos_label))
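Note: the TNR and TPR helpers called above (and in the later train() examples) are not shown in any of these snippets and are not part of AIF360. A minimal sketch of what they might look like, assuming they take true labels, predicted labels, and the favorable label and compute the rates with NumPy, so that 1 - TNR gives the FPR and 1 - TPR gives the FNR as printed above:

import numpy as np

def TPR(y_true, y_pred, pos_label):
    # Hypothetical helper: true positive rate for the given favorable label.
    y_true, y_pred = np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()
    pos = y_true == pos_label
    return np.sum(y_pred[pos] == pos_label) / pos.sum() if pos.any() else 0.0

def TNR(y_true, y_pred, pos_label):
    # Hypothetical helper: true negative rate for the given favorable label.
    y_true, y_pred = np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()
    neg = y_true != pos_label
    return np.sum(y_pred[neg] != pos_label) / neg.sum() if neg.any() else 0.0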
Example #3
def get_metrics(dataset_train):
    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    #print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())
    return metric_orig_train.mean_difference()
Example #4
def test_adult_no_drop():
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])
    bldm = BinaryLabelDatasetMetric(ad)
    assert bldm.num_instances() == 48842
Example #5
def calculate_bias_measures(data_orig_train, data_orig_vt, unprivileged_groups,
                            privileged_groups):
    model = RandomForestClassifier().fit(
        data_orig_train.features,
        data_orig_train.labels.ravel(),
        sample_weight=data_orig_train.instance_weights)
    dataset = data_orig_vt
    dataset_pred = dataset.copy()
    dataset_pred.labels = model.predict(data_orig_vt.features)
    classified_metric_race = ClassificationMetric(
        dataset,
        dataset_pred,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    metric_pred_race = BinaryLabelDatasetMetric(
        dataset_pred,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print("Mean difference {}".format(metric_pred_race.mean_difference()))
    print("Disparate Metric {}".format(metric_pred_race.disparate_impact()))
    print("Equal Opportunity Difference {}".format(
        classified_metric_race.equal_opportunity_difference()))
    print("Average Abs Odds Difference {}".format(
        classified_metric_race.average_abs_odds_difference()))
    print("Theil index {}".format(classified_metric_race.theil_index()))
Example #6
def test_epsilon_all_groups():
    def custom_preprocessing(df):
        # slight workaround for non-binary protected attribute
        # feature should be categorical but protected attribute should be numerical
        mapping = {
            'Black': 0,
            'White': 1,
            'Asian-Pac-Islander': 2,
            'Amer-Indian-Eskimo': 3,
            'Other': 4
        }
        df['race-num'] = df.race.map(mapping)
        return df.fillna('Unknown')

    nonbinary_ad = AdultDataset(
        protected_attribute_names=['sex', 'native-country', 'race-num'],
        privileged_classes=[['Male'], ['United-States'], [1]],
        categorical_features=[
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race'
        ],
        custom_preprocessing=custom_preprocessing)
    # drop redundant race feature (not relevant to this test)
    index = nonbinary_ad.feature_names.index('race-num')
    nonbinary_ad.features = np.delete(nonbinary_ad.features, index, axis=1)
    nonbinary_ad.feature_names = np.delete(nonbinary_ad.feature_names, index)

    _, nonbinary_test = nonbinary_ad.split([32561], shuffle=False)
    dataset_metric = BinaryLabelDatasetMetric(nonbinary_test)
    eps_data = dataset_metric.smoothed_empirical_differential_fairness()
    assert eps_data == 2.063813731996515  # verified with reference implementation
Example #7
def mean_diff_values(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data, label_names=[target_variable],
                                protected_attribute_names=[protected_variable])
    privileged_group = []
    for v in data[protected_variable].unique()[data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    metric_orig = BinaryLabelDatasetMetric(df_aif, unprivileged_group, privileged_group)
    return abs(metric_orig.mean_difference().round(3))
Example #8
def fair_metrics(dataset, pred, pred_is_dataset=False):
    if pred_is_dataset:
        dataset_pred = pred
    else:
        dataset_pred = dataset.copy()
        dataset_pred.labels = pred

    cols = [
        'statistical_parity_difference', 'equal_opportunity_difference',
        'average_abs_odds_difference', 'disparate_impact', 'theil_index'
    ]
    obj_fairness = [[0, 0, 0, 1, 0]]

    fair_metrics = pd.DataFrame(data=obj_fairness,
                                index=['objective'],
                                columns=cols)

    for attr in dataset_pred.protected_attribute_names:
        idx = dataset_pred.protected_attribute_names.index(attr)
        privileged_groups = [{
            attr:
            dataset_pred.privileged_protected_attributes[idx][0]
        }]
        unprivileged_groups = [{
            attr:
            dataset_pred.unprivileged_protected_attributes[idx][0]
        }]

        classified_metric = ClassificationMetric(
            dataset,
            dataset_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        metric_pred = BinaryLabelDatasetMetric(
            dataset_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        acc = classified_metric.accuracy()

        row = pd.DataFrame([[
            metric_pred.mean_difference(),
            classified_metric.equal_opportunity_difference(),
            classified_metric.average_abs_odds_difference(),
            metric_pred.disparate_impact(),
            classified_metric.theil_index()
        ]],
                           columns=cols,
                           index=[attr])
        fair_metrics = pd.concat([fair_metrics, row])

    fair_metrics = fair_metrics.replace([-np.inf, np.inf], 2)

    return fair_metrics
Example #9
def calc_mean_diff(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data, label_names=[target_variable],
                                protected_attribute_names=[protected_variable])
    privileged_group = []
    for v in data[protected_variable].unique()[data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    metric_orig = BinaryLabelDatasetMetric(df_aif, unprivileged_group, privileged_group)
    print(metric_orig.mean_difference().round(3))
    if abs(metric_orig.mean_difference().round(3)) < 0.2:
        print('The algorithm can be considered to be not biased')
    else:
        print('There is a potential bias')
Example #10
def calc_disparity_index(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data, label_names=[target_variable],
                                protected_attribute_names=[protected_variable])
    privileged_group = []
    for v in data[protected_variable].unique()[data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    metric_orig = BinaryLabelDatasetMetric(df_aif, unprivileged_group, privileged_group)
    print('1-min(DI, 1/DI):', get_disparity_index(metric_orig.disparate_impact()).round(3))
    if get_disparity_index(metric_orig.disparate_impact()).round(3) < 0.2:
        print('The algorithm can be considered to be not biased')
    else:
        print('There is a potential bias')
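The get_disparity_index helper used here is not defined in the snippet; judging by the '1-min(DI, 1/DI)' label printed above, a plausible sketch (hypothetical, returning np.float64 so the .round(3) calls keep working) is:

import numpy as np

def get_disparity_index(di):
    # Hypothetical helper: 0 means perfect parity, larger values mean more disparity.
    return np.float64(1 - min(di, 1 / di))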
Example #11
def get_dataset_metrics_list(binary_dataset_list):
    #Set privileged and unprivileged groups
    privileged_groups= [{'sex':1}]
    unprivileged_groups= [{'sex': 0}]
    
    mean_diff_list = []
    disp_imp_list = []
    for dataset in binary_dataset_list:
        metrics = BinaryLabelDatasetMetric(dataset, 
                            unprivileged_groups=unprivileged_groups, 
                            privileged_groups=privileged_groups)
        mean_diff_list.append(metrics.mean_difference())
        disp_imp_list.append(1 - metrics.disparate_impact())
    return mean_diff_list, disp_imp_list
Example #12
def nondebiased_classifier(train, test, privileged_groups,
                           unprivileged_groups):
    sess = tf.Session()
    NN_model = AdversarialDebiasing(privileged_groups,
                                    unprivileged_groups,
                                    scope_name='nondebiased_classifier',
                                    debias=False,
                                    sess=sess)
    NN_model.fit(train)

    # predict outcome using the test set
    pred_NNmodel = NN_model.predict(test)
    sess.close()
    tf.reset_default_graph()

    # calculate accuracy
    accuracy = accuracy_score(y_true=test.labels, y_pred=pred_NNmodel.labels)

    # calculate fairness metrics
    metric_test = BinaryLabelDatasetMetric(
        pred_NNmodel,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    acc_test = ClassificationMetric(test,
                                    pred_NNmodel,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)
    equal_opportunity_difference = equal_opp_diff(test,
                                                  pred_NNmodel,
                                                  'sex',
                                                  privileged=1,
                                                  unprivileged=0,
                                                  favourable=1,
                                                  unfavourable=0)
    average_odds_difference = avg_odds_diff(test,
                                            pred_NNmodel,
                                            'sex',
                                            privileged=1,
                                            unprivileged=0,
                                            favourable=1,
                                            unfavourable=0)

    metrics = [
        metric_test.mean_difference(),
        acc_test.disparate_impact(), equal_opportunity_difference,
        average_odds_difference,
        acc_test.theil_index()
    ]

    return pred_NNmodel, accuracy, metrics
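The equal_opp_diff and avg_odds_diff helpers used in this and the next two examples are also not shown. A minimal sketch, assuming they recompute the group-wise rate differences directly from the two BinaryLabelDataset objects (the unfavourable argument is accepted but unused here):

import numpy as np

def _group_rates(test, pred, attr, group_value, favourable):
    # TPR and FPR for one protected group, measured against the favourable label.
    col = test.protected_attribute_names.index(attr)
    mask = test.protected_attributes[:, col] == group_value
    y_true = test.labels.ravel()[mask]
    y_pred = pred.labels.ravel()[mask]
    tpr = np.mean(y_pred[y_true == favourable] == favourable)
    fpr = np.mean(y_pred[y_true != favourable] == favourable)
    return tpr, fpr

def equal_opp_diff(test, pred, attr, privileged, unprivileged, favourable, unfavourable):
    # Hypothetical helper: TPR(unprivileged) - TPR(privileged).
    tpr_u, _ = _group_rates(test, pred, attr, unprivileged, favourable)
    tpr_p, _ = _group_rates(test, pred, attr, privileged, favourable)
    return tpr_u - tpr_p

def avg_odds_diff(test, pred, attr, privileged, unprivileged, favourable, unfavourable):
    # Hypothetical helper: mean of the FPR and TPR differences (unprivileged - privileged).
    tpr_u, fpr_u = _group_rates(test, pred, attr, unprivileged, favourable)
    tpr_p, fpr_p = _group_rates(test, pred, attr, privileged, favourable)
    return 0.5 * ((fpr_u - fpr_p) + (tpr_u - tpr_p))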
Example #13
def prejudice(train, test, unprivileged_groups, privileged_groups):
    prejudice_model = PrejudiceRemover(eta=100, sensitive_attr='sex')
    prejudice_model.fit(train)

    # predict outcome using the test set
    pred_prejudice = prejudice_model.predict(test)

    # calculate accuracy
    accuracy = accuracy_score(y_true=test.labels, y_pred=pred_prejudice.labels)

    # calculate fairness metrics
    metric_test = BinaryLabelDatasetMetric(
        pred_prejudice,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    acc_test = ClassificationMetric(test,
                                    pred_prejudice,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)
    equal_opportunity_difference = equal_opp_diff(test,
                                                  pred_prejudice,
                                                  'sex',
                                                  privileged=1,
                                                  unprivileged=0,
                                                  favourable=1,
                                                  unfavourable=0)
    average_odds_difference = avg_odds_diff(test,
                                            pred_prejudice,
                                            'sex',
                                            privileged=1,
                                            unprivileged=0,
                                            favourable=1,
                                            unfavourable=0)

    if acc_test.disparate_impact() == math.inf:
        disparate_impact = 5.0
    else:
        disparate_impact = acc_test.disparate_impact()

    metrics = [
        metric_test.mean_difference(), disparate_impact,
        equal_opportunity_difference, average_odds_difference,
        acc_test.theil_index()
    ]

    return pred_prejudice, accuracy, metrics
Example #14
def ensemble(test, pred_adversarial, pred_prejudice, pred_nondebiased,
             unprivileged_groups, privileged_groups):
    pred_labels = []
    for i in range(0, len(test.features)):
        arr = mode([
            pred_adversarial.labels[i], pred_prejudice.labels[i],
            pred_nondebiased.labels[i]
        ])
        pred_labels.append(arr[0][0])

    pred_ensemble = test.copy()
    pred_ensemble.labels = np.array(pred_labels)

    accuracy = accuracy_score(y_true=test.labels, y_pred=pred_ensemble.labels)

    metric_test = BinaryLabelDatasetMetric(
        pred_ensemble,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    acc_test = ClassificationMetric(test,
                                    pred_ensemble,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)
    equal_opportunity_difference = equal_opp_diff(test,
                                                  pred_ensemble,
                                                  'sex',
                                                  privileged=1,
                                                  unprivileged=0,
                                                  favourable=1,
                                                  unfavourable=0)
    average_odds_difference = avg_odds_diff(test,
                                            pred_ensemble,
                                            'sex',
                                            privileged=1,
                                            unprivileged=0,
                                            favourable=1,
                                            unfavourable=0)

    metrics = [
        metric_test.mean_difference(),
        acc_test.disparate_impact(), equal_opportunity_difference,
        average_odds_difference,
        acc_test.theil_index()
    ]

    return accuracy, metrics
Example #15
def create_binary(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data, label_names=[target_variable],
                                protected_attribute_names=[protected_variable])
    privileged_group = []
    for v in data[protected_variable].unique()[data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    return BinaryLabelDatasetMetric(df_aif, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
Example #16
def compute_statistical_parity(data, unpriv_group, priv_group):
    if isinstance(data, pd.DataFrame):
        transformed_data = BinaryLabelDataset(
            df=data,
            label_names=["two_year_recid"],
            protected_attribute_names=["race"],
            favorable_label=0,
            unfavorable_label=1)
    else:
        transformed_data = data

    metric_test_data = BinaryLabelDatasetMetric(
        transformed_data,
        unprivileged_groups=unpriv_group,
        privileged_groups=priv_group)
    parity_difference = metric_test_data.statistical_parity_difference()
    print(
        f"Mean difference (statistical parity difference) = {parity_difference}"
    )
    return parity_difference
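An illustrative call with a pandas input; the toy frame and its encoding (race: 1 = Caucasian, 0 = African-American) are assumptions for demonstration only:

import pandas as pd

toy = pd.DataFrame({
    "race":           [1, 1, 0, 0],
    "two_year_recid": [0, 1, 1, 1],
})
compute_statistical_parity(toy,
                           unpriv_group=[{"race": 0}],
                           priv_group=[{"race": 1}])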
Example #17
def plot_using_aif(df_predict,df_true):
   

    predict_list, true_list = [], []
    unpriv_label_list , priv_label_list = [], []
    for (u,p) in zip(unpriv_list,priv_list):
        cur_predict, cur_true = [], []

        unpriv_label = '+'.join(['-'.join([prot_attr_dict[key][u_el[key]] for key in u_el]) for u_el in u])
        priv_label = '+'.join(['-'.join([prot_attr_dict[key][p_el[key]] for key in p_el]) for p_el in p])

        print('-------------------------------------------------------------------')
        print('unpriv_label:-->',unpriv_label)
        print('-------------------------------------------------------------------')
        print('priv_label  :-->',priv_label)
        print('-------------------------------------------------------------------')
        print('\n\n')
        for i,label in enumerate(rating_names):
            #print('Fairness Metric for the label------>',label.upper())
        
            predict_dataset  = StandardDataset(df=predict_df_list[i], label_name=label, favorable_classes=[1.0,1.0],
                                protected_attribute_names=protected_attribute_names, privileged_classes=privileged_classes) 
            
            true_dataset  = StandardDataset(df=true_df_list[i], label_name=label, favorable_classes=[1.0,1.0],
                                protected_attribute_names=protected_attribute_names, privileged_classes=privileged_classes) 
            
           
            predict_dataset_metric = BinaryLabelDatasetMetric(predict_dataset, unprivileged_groups=u, privileged_groups=p)
            true_dataset_metric = BinaryLabelDatasetMetric(true_dataset, unprivileged_groups=u, privileged_groups=p)
            

            #classfication_metric = ClassificationMetric(true_dataset, predict_dataset, unprivileged_groups=u, privileged_groups=p)
            
            #x=classfication_metric.generalized_entropy_index()
            
            #print(label,':  -->','predicted :  -->',abs(predict_dataset_metric.disparate_impact()),'true :  -->',abs(true_dataset_metric.disparate_impact()))
            print(label,':  -->','predicted :  -->',abs(predict_dataset_metric.mean_difference()),'true :  -->',abs(true_dataset_metric.mean_difference()))
Example #18
def show_metrics(binary_dataset_list):
    #Set privileged and unprivileged groups
    privileged_groups= [{'sex':1}]
    unprivileged_groups= [{'sex': 0}]
    
    for dataset in binary_dataset_list:
        display(Markdown("#### Model  dataset metrics"))
        metrics = BinaryLabelDatasetMetric(dataset, 
                            unprivileged_groups=unprivileged_groups, 
                            privileged_groups=privileged_groups)

        ex_metrics = MetricTextExplainer(metrics)

        print(ex_metrics.mean_difference())
        print('\n')
        print(ex_metrics.disparate_impact())
Example #19
    def explain(self, request: Dict) -> Dict:
        inputs = request["instances"]
        predictions = np.array(request["outputs"])

        dataframe_predicted = pd.DataFrame(inputs, columns=self.feature_names)
        dataframe_predicted[self.label_names[0]] = predictions

        dataset_predicted = BinaryLabelDataset(
            favorable_label=self.favorable_label,
            unfavorable_label=self.unfavorable_label,
            df=dataframe_predicted,
            label_names=self.label_names,
            protected_attribute_names=['age'])

        metrics = BinaryLabelDatasetMetric(
            dataset_predicted,
            unprivileged_groups=self.unprivileged_groups,
            privileged_groups=self.privileged_groups)

        return {
            "predictions": predictions.tolist(),
            "metrics": {
                "base_rate":
                metrics.base_rate(),
                "consistency":
                metrics.consistency().tolist(),
                "disparate_impact":
                metrics.disparate_impact(),
                "num_instances":
                metrics.num_instances(),
                "num_negatives":
                metrics.num_negatives(),
                "num_positives":
                metrics.num_positives(),
                "statistical_parity_difference":
                metrics.statistical_parity_difference(),
            }
        }
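A hypothetical request for this explain() method could look as follows; the feature values and the explainer instance name are illustrative, not part of the snippet:

# Rows must line up with self.feature_names; 'outputs' holds the model's predictions.
request = {
    "instances": [[25, 1, 0], [47, 0, 1]],
    "outputs": [0, 1],
}
# response = explainer.explain(request)
# response["metrics"]["disparate_impact"]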
Example #20
def get_bias_amplification(train_data, prediction_data):
    privileged_groups= [{'sex':1}]
    unprivileged_groups= [{'sex': 0}]
    train_metrics = BinaryLabelDatasetMetric(train_data, 
                            unprivileged_groups=unprivileged_groups, 
                            privileged_groups=privileged_groups)
    
    prediction_metrics = BinaryLabelDatasetMetric(prediction_data, 
                            unprivileged_groups=unprivileged_groups, 
                            privileged_groups=privileged_groups)
    
    
    tedf = train_metrics.smoothed_empirical_differential_fairness()
    pedf = prediction_metrics.smoothed_empirical_differential_fairness()
    bias_amp = pedf - tedf
    return bias_amp
Example #21
def generate_fairness_report(dataset, privileged_groups, unprivileged_groups):
    print(f'Shape: {dataset.features.shape}\n')

    print(f'Favorable label: {dataset.favorable_label}')
    print(f'Unfavorable label: {dataset.unfavorable_label}\n')

    print('Protected attribute names:')
    print(dataset.protected_attribute_names)
    print()

    print('Privileged attribute values:')
    print(dataset.privileged_protected_attributes)
    print('Unprivileged attribute values:')
    print(dataset.unprivileged_protected_attributes)
    print()

    binary_label_metric = BinaryLabelDatasetMetric(
        dataset,
        privileged_groups=privileged_groups,
        unprivileged_groups=unprivileged_groups)
    print(f'Statistical parity difference: '
          f'{binary_label_metric.statistical_parity_difference()}')
    print(f'Disparate impact: {binary_label_metric.disparate_impact()}')
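A typical call, assuming an AIF360 dataset such as AdultDataset, where 'sex' is encoded with 1 = Male as the privileged value by default:

from aif360.datasets import AdultDataset

ad = AdultDataset()
generate_fairness_report(ad,
                         privileged_groups=[{'sex': 1}],
                         unprivileged_groups=[{'sex': 0}])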
Example #22
    def fit(self, dataset):
        RW = Reweighing(unprivileged_groups=self.unprivileged_group,
                        privileged_groups=self.privileged_group)

        mean_diff_metric = lambda dataset: BinaryLabelDatasetMetric(
            dataset,
            unprivileged_groups=self.unprivileged_group,
            privileged_groups=self.privileged_group).mean_difference()
        dataset_ = RW.fit_transform(dataset)

        print("before reweighing (meandiff):", mean_diff_metric(dataset),
              "after:", mean_diff_metric(dataset_))

        #reg_ = LogisticRegression(solver='liblinear',max_iter=1000000000, C=1000000000000000000000.0).fit(dataset_.features, dataset_.labels.ravel())

        reg = LogisticRegression(solver='liblinear', max_iter=1000000000).fit(
            dataset_.features,
            dataset_.labels.ravel(),
            sample_weight=dataset_.instance_weights)
        #print("reweighted",sorted(list(zip(dataset.feature_names,reg.coef_[0])),key=lambda x: abs(x[1])))

        #print(sorted(list(zip(dataset.feature_names,reg.coef_[0])),key=lambda x: abs(x[1])))

        self.h = reg
Example #23
def main():
    print('Calculate bias')
    np.random.seed(1)
    protected_attribute = 'ethnicity'
    dataset = load_preproc_data_heart([protected_attribute])

    privileged_groups = [{protected_attribute: 1}]
    unprivileged_groups = [{
        protected_attribute: 2
    }, {
        protected_attribute: 3
    }, {
        protected_attribute: 4
    }, {
        protected_attribute: 5
    }, {
        protected_attribute: 6
    }]

    data_orig_train, data_orig_vt = dataset.split([0.7], shuffle=True)
    data_orig_valid, data_orig_test = data_orig_vt.split([0.5], shuffle=True)

    metric_orig_train = BinaryLabelDatasetMetric(
        data_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print("Mean {}".format(metric_orig_train.mean_difference()))

    rw = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    data_transf_train = rw.fit_transform(data_orig_train)
    metric_transf_train = BinaryLabelDatasetMetric(
        data_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    print("Mean difference after transformation =%f " %
          metric_transf_train.mean_difference())

    calculate_bias_measures(data_orig_train, data_orig_vt, unprivileged_groups,
                            privileged_groups)
    calculate_bias_measures(data_orig_valid, data_orig_test,
                            unprivileged_groups, privileged_groups)
Example #24
def train():
    np.random.seed(10)

    def quantizePrior1(x):
        if x <= 0:
            return 0
        elif 1 <= x <= 3:
            return 1
        else:
            return 2

    def quantizeLOS(x):
        if x <= 7:
            return 0
        if 8 < x <= 93:
            return 1
        else:
            return 2

    def group_race(x):
        if x == "Caucasian":
            return 1.0
        else:
            return 0.0

    filepath = 'AIF360/aif360/data/raw/compas/compas-scores-two-years.csv'
    df = pd.read_csv(filepath, index_col='id', na_values=[])

    df['age_cat'] = df['age_cat'].replace('Greater than 45', 2)
    df['age_cat'] = df['age_cat'].replace('25 - 45', 1)
    df['age_cat'] = df['age_cat'].replace('Less than 25', 0)
    df['score_text'] = df['score_text'].replace('High', 1)
    df['score_text'] = df['score_text'].replace('Medium', 1)
    df['score_text'] = df['score_text'].replace('Low', 0)
    df['priors_count'] = df['priors_count'].apply(lambda x: quantizePrior1(x))
    df['length_of_stay'] = (
        pd.to_datetime(df['c_jail_out']) -
        pd.to_datetime(df['c_jail_in'])).apply(lambda x: x.days)
    df['length_of_stay'] = df['length_of_stay'].apply(lambda x: quantizeLOS(x))
    df = df.loc[
        ~df['race'].isin(['Native American', 'Hispanic', 'Asian', 'Other']), :]
    df['c_charge_degree'] = df['c_charge_degree'].replace({'F': 0, 'M': 1})

    df1 = df[[
        'priors_count', 'c_charge_degree', 'race', 'age_cat', 'score_text',
        'two_year_recid'
    ]]
    feature_list = []
    for index, row in df1.iterrows():
        feature_list.append('\t'.join(row.astype(str).to_list()))
    df1['feature_list'] = feature_list
    df3 = df1.groupby('feature_list').count() / len(df1.index)

    df2 = pd.DataFrame()
    df2['feature_list'] = list(df3.index)
    df2['prob_list'] = list(df3.priors_count)
    # Write back via .loc: assigning to the `row` copy returned by iterrows()
    # would not modify df2, so the reweighting would silently be lost.
    for index, row in df2.iterrows():
        if row['feature_list'][0] == '0' and row['feature_list'][
                -1] == '1' and 'African' in row['feature_list']:
            df2.loc[index, 'prob_list'] = row['prob_list'] * 10
        elif row['feature_list'][0] == '0' and row['feature_list'][-1] == '1':
            df2.loc[index, 'prob_list'] = row['prob_list'] * 7
        elif row['feature_list'][0] == '2' and row['feature_list'][-1] == '0':
            df2.loc[index, 'prob_list'] = row['prob_list'] * 7
    prob_list = list(df2.prob_list)

    df_new = pd.DataFrame()
    rng = np.random.default_rng()
    prob_list = np.array(prob_list)
    prob_list = prob_list / prob_list.sum()
    feature_list = rng.choice(list(df2.feature_list),
                              len(df1.index),
                              p=prob_list)
    var_list = [
        'priors_count', 'c_charge_degree', 'race', 'age_cat', 'score_text',
        'two_year_recid'
    ]
    # Collect the resampled columns in a plain dict; mutating vars()/locals()
    # inside a function is not guaranteed to create usable local variables.
    columns = {name: [] for name in var_list}
    for i in feature_list:
        tmp = i.split('\t')
        for j in range(len(var_list)):
            columns[var_list[j]].append(tmp[j])

    for i in var_list:
        df_new[i] = columns[i]

    df = df_new
    df1 = df[[
        'priors_count', 'c_charge_degree', 'race', 'age_cat', 'score_text',
        'two_year_recid'
    ]]

    tot = []
    for index, row in df1.iterrows():
        result = ''
        for j in df1.columns:
            result = result + str(row[j])
        tot.append(result)
    df['tmp_feature'] = tot
    df['mis_prob'] = 0
    for i in df['tmp_feature'].unique():
        if 'African' in i and i[-1] == '0':
            df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.1
        elif 'African' in i:
            df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.02
        elif 'African' not in i and i[-1] == '0':
            df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.02
        else:
            df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.02
    new_label = []
    for i, j in zip(df['mis_prob'], df['priors_count']):
        if np.random.binomial(1, i, 1)[0] == 1:
            new_label.append(3)
        else:
            new_label.append(j)
    df['priors_count'] = new_label
    print(len(df.loc[df['priors_count'] == 3, :].index))
    print(len(df.index))
    df['priors_count'] = df['priors_count'].astype(int)
    df['score_text'] = df['score_text'].astype(int)
    df['age_cat'] = df['age_cat'].astype(int)
    df['c_charge_degree'] = df['c_charge_degree'].astype(int)
    df['two_year_recid'] = df['two_year_recid'].astype(int)

    df['c_charge_degree'] = df['c_charge_degree'].replace({0: 'F', 1: 'M'})

    def quantizePrior(x):
        if x == 0:
            return '0'
        elif x == 1:
            return '1 to 3'
        elif x == 2:
            return 'More than 3'
        else:
            return 'missing'

    # Quantize length of stay
    def quantizeLOS(x):
        if x == 0:
            return '<week'
        if x == 1:
            return '<3months'
        else:
            return '>3 months'

    # Quantize length of stay
    def adjustAge(x):
        if x == 0:
            return '25 to 45'
        elif x == 1:
            return 'Greater than 45'
        elif x == 2:
            return 'Less than 25'

    def quantizeScore(x):
        if x == 1:
            return 'MediumHigh'
        else:
            return 'Low'

    def group_race(x):
        if x == "Caucasian":
            return 1.0
        else:
            return 0.0

    df['priors_count'] = df['priors_count'].apply(lambda x: quantizePrior(x))
    df['score_text'] = df['score_text'].apply(lambda x: quantizeScore(x))
    df['age_cat'] = df['age_cat'].apply(lambda x: adjustAge(x))
    # Recode sex and race
    df['race'] = df['race'].apply(lambda x: group_race(x))
    df['race'] = df['race'].astype(int)

    df['two_year_recid'] = df['two_year_recid'].astype(int)

    df = df[[
        'priors_count', 'c_charge_degree', 'race', 'age_cat', 'score_text',
        'two_year_recid'
    ]]

    df_train, df_test = train_test_split(df, test_size=0.3, random_state=10)

    all_protected_attribute_maps = {"race": {0.0: 0, 1.0: 1}}
    D_features = ['race']
    dataset_orig_train = CustomDataset(
        label_name='two_year_recid',
        favorable_classes=[0],
        protected_attribute_names=['race'],
        privileged_classes=[[1]],
        categorical_features=[
            'priors_count', 'c_charge_degree', 'age_cat', 'score_text'
        ],
        features_to_keep=[
            'priors_count', 'c_charge_degree', 'race', 'age_cat', 'score_text'
        ],
        df=df_train,
        metadata={
            'label_maps': [{
                1: 'Did recid.',
                0: 'No recid.'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        })

    dataset_orig_vt = CustomDataset(
        label_name='two_year_recid',
        favorable_classes=[0],
        protected_attribute_names=['race'],
        privileged_classes=[[1]],
        categorical_features=[
            'priors_count', 'c_charge_degree', 'age_cat', 'score_text'
        ],
        features_to_keep=[
            'priors_count', 'c_charge_degree', 'race', 'age_cat', 'score_text'
        ],
        df=df_test,
        metadata={
            'label_maps': [{
                1: 'Did recid.',
                0: 'No recid.'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        })

    privileged_groups = [{'race': 1}]
    unprivileged_groups = [{'race': 0}]
    optim_options = {
        "distortion_fun": get_distortion_compas,
        "epsilon": 0.04,
        "clist": [0.99, 1.99, 2.99],
        "dlist": [.1, 0.05, 0]
    }

    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    OP = OptimPreproc(OptTools,
                      optim_options,
                      unprivileged_groups=unprivileged_groups,
                      privileged_groups=privileged_groups)

    OP = OP.fit(dataset_orig_train)

    dataset_transf_cat_test = OP.transform(dataset_orig_vt, transform_Y=True)
    dataset_transf_cat_test = dataset_orig_vt.align_datasets(
        dataset_transf_cat_test)

    dataset_transf_cat_train = OP.transform(dataset_orig_train,
                                            transform_Y=True)
    dataset_transf_cat_train = dataset_orig_train.align_datasets(
        dataset_transf_cat_train)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
    y_train = dataset_transf_cat_train.labels.ravel()

    X_test = scale_transf.transform(dataset_transf_cat_test.features)  # reuse the scaler fitted on the training data

    lmod = LogisticRegression()
    lmod.fit(X_train, y_train)
    y_pred = lmod.predict(X_test)
    print('Without reweight')
    print('Accuracy')
    print(accuracy_score(dataset_orig_vt.labels, y_pred))

    dataset_orig_vt_copy1 = dataset_orig_vt.copy()
    dataset_orig_vt_copy1.labels = y_pred

    metric_transf_train1 = BinaryLabelDatasetMetric(
        dataset_orig_vt_copy1,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print('p-rule')
    print(metric_transf_train1.disparate_impact())
    print('CV')
    print(metric_transf_train1.mean_difference())
    print('FPR for unpriv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print("FNR for unpriv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print('FPR for priv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
    print("FNR for priv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
    df_weight = dataset_orig_train.convert_to_dataframe()[0]
    df_weight['weight'] = 1
    df_weight['is_missing'] = 0
    df_weight['tmp'] = ''
    tmp_result = []
    for i, j in zip(df_weight['race'], df_weight['two_year_recid']):
        tmp_result.append(str(i) + str(j))
    df_weight['tmp'] = tmp_result

    df_weight.loc[df_weight['priors_count=missing'] == 1, 'is_missing'] = 1

    for i in df_weight['tmp'].unique():
        df_weight.loc[
            (df_weight['tmp'] == i) & (df_weight['is_missing'] == 0),
            'weight'] = len(
                df_weight.loc[(df_weight['tmp'] == i), :].index) / len(
                    df_weight.loc[(df_weight['tmp'] == i) &
                                  (df_weight['is_missing'] == 0), :].index)
        df_weight.loc[(df_weight['tmp'] == i) & (df_weight['is_missing'] == 1),
                      'weight'] = len(df_weight.loc[
                          (df_weight['tmp'] == i) &
                          (df_weight['is_missing'] == 0), :].index) / len(
                              df_weight.loc[(df_weight['tmp'] == i), :].index)
    dataset_orig_train.instance_weights = np.array(df_weight['weight'])

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
    y_train = dataset_transf_cat_train.labels.ravel()
    X_test = scale_transf.transform(dataset_transf_cat_test.features)  # reuse the scaler fitted on the training data

    lmod = LogisticRegression()
    lmod.fit(X_train,
             y_train,
             sample_weight=dataset_orig_train.instance_weights)
    y_pred = lmod.predict(X_test)
    print('With reweight')
    print('Accuracy')
    print(accuracy_score(dataset_orig_vt.labels, y_pred))

    dataset_orig_vt_copy1 = dataset_orig_vt.copy()
    dataset_orig_vt_copy1.labels = y_pred

    metric_transf_train1 = BinaryLabelDatasetMetric(
        dataset_orig_vt_copy1,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print('p-rule')
    print(metric_transf_train1.disparate_impact())
    print('CV')
    print(metric_transf_train1.mean_difference())
    print('FPR for unpriv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print("FNR for unpriv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print('FPR for priv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
    print("FNR for priv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
Example #25
def main() -> None:

    # read from inventory
    filepath = ait_input.get_inventory_path('Data')

    # prepare column names as given by german.doc
    column_names = [
        'status', 'month', 'credit_history', 'purpose', 'credit_amount',
        'savings', 'employment', 'investment_as_income_percentage',
        'personal_status', 'other_debtors', 'residence_since', 'property',
        'age', 'installment_plans', 'housing', 'number_of_credits',
        'skill_level', 'people_liable_for', 'telephone', 'foreign_worker',
        'credit'
    ]

    # load into a dataframe
    df = data_loading(filepath=filepath,
                      column_names=column_names,
                      na_values=None)

    # prepare for mappings
    mappings = {
        'label_maps': [{
            1.0: 'Good Credit',
            2.0: 'Bad Credit'
        }],
        'protected_attribute_maps': [{
            1.0: 'Male',
            0.0: 'Female'
        }, {
            1.0: 'Old',
            0.0: 'Young'
        }]
    }

    # prepare for categorical features
    categorical_features = [
        'status', 'credit_history', 'purpose', 'savings', 'employment',
        'other_debtors', 'property', 'installment_plans', 'housing',
        'skill_level', 'telephone', 'foreign_worker'
    ]

    # load param
    protected_attribute = ait_input.get_method_param_value(
        'protected_attribute')
    privileged_classes = ait_input.get_method_param_value('privileged_classes')

    # input check
    ait_input_check(protected_attribute, privileged_classes)

    # prepare for structure from dataframe and edit data features setting
    dataset = StandardDataset(
        df=df,
        label_name='credit',
        favorable_classes=[1],
        protected_attribute_names=[protected_attribute],
        privileged_classes=[lambda x: x >= privileged_classes],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=None,
        features_to_drop=['personal_status', 'sex'],
        na_values=None,
        custom_preprocessing=preprocessing,
        metadata=mappings)

    # set two variables for the privileged (1) and unprivileged (0) values of the protected attribute.
    privileged_groups = [{protected_attribute: 1}]
    unprivileged_groups = [{protected_attribute: 0}]

    # compute fairness metric on original training dataset
    metric_fairness = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    print("Original training dataset: German Credit Data")
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f"
        % metric_fairness.mean_difference())
    print("unprivileged groups = %f" %
          metric_fairness.base_rate(privileged=False))
    print("privileged groups = %f" %
          metric_fairness.base_rate(privileged=True))

    # resource observed_predicted_plot
    save_metric_fairness_plot(metric_fairness, protected_attribute)

    # measures
    measure_mean_difference(metric_fairness.mean_difference())

    # ait.log
    move_log()
Example #26
    def fit(self, dataset_true, dataset_pred):
        """Estimates the optimal classification threshold and margin for reject
        option classification that optimizes the metric provided.

        Note:
            The `fit` function is a no-op for this algorithm.

        Args:
            dataset_true (BinaryLabelDataset): Dataset containing the true
                `labels`.
            dataset_pred (BinaryLabelDataset): Dataset containing the predicted
                `scores`.

        Returns:
            RejectOptionClassification: Returns self.
        """

        fair_metric_arr = np.zeros(self.num_class_thresh*self.num_ROC_margin)
        balanced_acc_arr = np.zeros_like(fair_metric_arr)
        ROC_margin_arr = np.zeros_like(fair_metric_arr)
        class_thresh_arr = np.zeros_like(fair_metric_arr)

        cnt = 0
        # Iterate through class thresholds
        for class_thresh in np.linspace(self.low_class_thresh,
                                        self.high_class_thresh,
                                        self.num_class_thresh):

            self.classification_threshold = class_thresh
            if class_thresh <= 0.5:
                low_ROC_margin = 0.0
                high_ROC_margin = class_thresh
            else:
                low_ROC_margin = 0.0
                high_ROC_margin = (1.0-class_thresh)

            # Iterate through ROC margins
            for ROC_margin in np.linspace(
                                low_ROC_margin,
                                high_ROC_margin,
                                self.num_ROC_margin):
                self.ROC_margin = ROC_margin

                # Predict using the current threshold and margin
                dataset_transf_pred = self.predict(dataset_pred)

                dataset_transf_metric_pred = BinaryLabelDatasetMetric(
                                             dataset_transf_pred,
                                             unprivileged_groups=self.unprivileged_groups,
                                             privileged_groups=self.privileged_groups)
                classified_transf_metric = ClassificationMetric(
                                             dataset_true,
                                             dataset_transf_pred,
                                             unprivileged_groups=self.unprivileged_groups,
                                             privileged_groups=self.privileged_groups)

                ROC_margin_arr[cnt] = self.ROC_margin
                class_thresh_arr[cnt] = self.classification_threshold

                # Balanced accuracy and fairness metric computations
                balanced_acc_arr[cnt] = 0.5*(classified_transf_metric.true_positive_rate()
                                       +classified_transf_metric.true_negative_rate())
                if self.metric_name == "Statistical parity difference":
                    fair_metric_arr[cnt] = dataset_transf_metric_pred.mean_difference()
                elif self.metric_name == "Average odds difference":
                    fair_metric_arr[cnt] = classified_transf_metric.average_odds_difference()
                elif self.metric_name == "Equal opportunity difference":
                    fair_metric_arr[cnt] = classified_transf_metric.equal_opportunity_difference()

                cnt += 1

        rel_inds = np.logical_and(fair_metric_arr >= self.metric_lb,
                                  fair_metric_arr <= self.metric_ub)
        if any(rel_inds):
            best_ind = np.where(balanced_acc_arr[rel_inds]
                                == np.max(balanced_acc_arr[rel_inds]))[0][0]
        else:
            warn("Unable to satisy fairness constraints")
            rel_inds = np.ones(len(fair_metric_arr), dtype=bool)
            best_ind = np.where(fair_metric_arr[rel_inds]
                                == np.min(fair_metric_arr[rel_inds]))[0][0]

        self.ROC_margin = ROC_margin_arr[rel_inds][best_ind]
        self.classification_threshold = class_thresh_arr[rel_inds][best_ind]

        return self
Example #27
# In[ ]:


new_dataset = BinaryLabelDataset(favorable_label=favorable_label,
                                unfavorable_label=unfavorable_label,
                                df=new_b_train,
                                label_names=['Attrition'],
                                protected_attribute_names=['Gender'],
                                unprivileged_protected_attributes=unprivileged_groups)


# In[ ]:


metric_orig_train = BinaryLabelDatasetMetric(new_dataset, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
display(Markdown("#### Original training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())


# In[ ]:



new_dataset.protected_attribute_names


# In[ ]:

Example #28
train1, test1 = dataset_orig.split([0.7], shuffle=True)
RW = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
RW.fit(train1)
dataset_transf_train = RW.transform(train1)
sess = tf.Session()
debiased_model = AdversarialDebiasing(privileged_groups=privileged_groups,
                                      unprivileged_groups=unprivileged_groups,
                                      scope_name='debiased_classifier',
                                      debias=True,
                                      sess=sess)
debiased_model.fit(train1)
dataset_debiasing_train = debiased_model.predict(train1)
dataset_debiasing_test = debiased_model.predict(test1)
metric_debiasing_train = BinaryLabelDatasetMetric(dataset_debiasing_train,
                                                  unprivileged_groups=unprivileged_groups,
                                                  privileged_groups=privileged_groups)
metric_debiasing_test = BinaryLabelDatasetMetric(dataset_debiasing_test,
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)
metric_dataset_debiasing_train.append(metric_debiasing_train.mean_difference())
metric_dataset_debiasing_test.append(metric_debiasing_test.mean_difference())

sess.close()
tf.reset_default_graph()
sess = tf.Session()
debiased_model = AdversarialDebiasing(privileged_groups=privileged_groups,
                                      unprivileged_groups=unprivileged_groups,
                                      scope_name='debiased_classifier',
Example #29
def train():
    privileged_groups = [{'race': 1}]
    unprivileged_groups = [{'race': 0}]
    dataset_orig = load_preproc_data_compas(['race'])

    optim_options = {
        "distortion_fun": get_distortion_compas,
        "epsilon": 0.05,
        "clist": [0.99, 1.99, 2.99],
        "dlist": [.1, 0.05, 0]
    }

    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.7],
                                                             shuffle=True)

    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    OP = OptimPreproc(OptTools,
                      optim_options,
                      unprivileged_groups=unprivileged_groups,
                      privileged_groups=privileged_groups)

    OP = OP.fit(dataset_orig_train)

    dataset_transf_cat_test = OP.transform(dataset_orig_vt, transform_Y=True)
    dataset_transf_cat_test = dataset_orig_vt.align_datasets(
        dataset_transf_cat_test)

    dataset_transf_cat_train = OP.transform(dataset_orig_train,
                                            transform_Y=True)
    dataset_transf_cat_train = dataset_orig_train.align_datasets(
        dataset_transf_cat_train)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
    y_train = dataset_transf_cat_train.labels.ravel()

    X_test = scale_transf.transform(dataset_transf_cat_test.features)  # reuse the scaler fitted on the training data

    lmod = LogisticRegression()
    lmod.fit(X_train, y_train)
    y_pred = lmod.predict(X_test)
    print('Without reweight')
    print('Accuracy')
    print(accuracy_score(dataset_orig_vt.labels, y_pred))

    dataset_orig_vt_copy1 = dataset_orig_vt.copy()
    dataset_orig_vt_copy1.labels = y_pred

    metric_transf_train1 = BinaryLabelDatasetMetric(
        dataset_orig_vt_copy1,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print('p-rule')
    print(metric_transf_train1.disparate_impact())
    print('CV')
    print(metric_transf_train1.mean_difference())
    print('FPR for unpriv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print("FNR for unpriv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print('FPR for priv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
    print("FNR for priv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
    df_weight = dataset_orig_train.convert_to_dataframe()[0]
    df_weight['weight'] = 1
    df_weight['is_missing'] = 0
    df_weight['tmp'] = ''
    tmp_result = []
    for i, j in zip(df_weight['race'], df_weight['two_year_recid']):
        tmp_result.append(str(i) + str(j))
    df_weight['tmp'] = tmp_result

    df_weight.loc[df_weight['priors_count=missing'] == 1, 'is_missing'] = 1

    for i in df_weight['tmp'].unique():
        df_weight.loc[
            (df_weight['tmp'] == i) & (df_weight['is_missing'] == 0),
            'weight'] = len(
                df_weight.loc[(df_weight['tmp'] == i), :].index) / len(
                    df_weight.loc[(df_weight['tmp'] == i) &
                                  (df_weight['is_missing'] == 0), :].index)
        df_weight.loc[(df_weight['tmp'] == i) & (df_weight['is_missing'] == 1),
                      'weight'] = len(df_weight.loc[
                          (df_weight['tmp'] == i) &
                          (df_weight['is_missing'] == 0), :].index) / len(
                              df_weight.loc[(df_weight['tmp'] == i), :].index)
    dataset_orig_train.instance_weights = np.array(df_weight['weight'])

    scale_transf = StandardScaler()
    #X_train = scale_transf.fit_transform(dataset_transf_cat_train.features[:,1:])
    X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
    y_train = dataset_transf_cat_train.labels.ravel()

    #X_test = scale_transf.fit_transform(dataset_transf_cat_test.features[:,1:])
    X_test = scale_transf.transform(dataset_transf_cat_test.features)  # reuse the scaler fitted on the training data

    lmod = LogisticRegression()
    lmod.fit(X_train,
             y_train,
             sample_weight=dataset_orig_train.instance_weights)
    y_pred = lmod.predict(X_test)
    print('With reweight')
    print('Accuracy')
    print(accuracy_score(dataset_orig_vt.labels, y_pred))

    dataset_orig_vt_copy1 = dataset_orig_vt.copy()
    dataset_orig_vt_copy1.labels = y_pred

    metric_transf_train1 = BinaryLabelDatasetMetric(
        dataset_orig_vt_copy1,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print('p-rule')
    print(metric_transf_train1.disparate_impact())
    print('CV')
    print(metric_transf_train1.mean_difference())
    print('FPR for unpriv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print("FNR for unpriv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 0], y_pred[
        orig_sens_att == 0], 0))
    print('FPR for priv')
    orig_sens_att = dataset_orig_vt.protected_attributes.ravel()
    print(1 - TNR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
    print("FNR for priv")
    print(1 - TPR(dataset_orig_vt.labels.ravel()[orig_sens_att == 1], y_pred[
        orig_sens_att == 1], 0))
Example #30
for i in range(validation_comp.shape[1]):
    f_c.append([])
    if (validation_comp.columns[i].find(unprivileged_group) != -1):
        prot.append(validation_comp.columns[i])
        unpriv_dict = {validation_comp.columns[i]: 1}
    if (validation_comp.columns[i].find(privileged_group) != -1):
        priv.append([1])
        prot.append(validation_comp.columns[i])
        priv_dict = {validation_comp.columns[i]: 1}
    else:
        priv.append([])

stdDs = StandardDataset(validation_comp, 'is_violent_recid', [0], prot, priv)
stdPred = StandardDataset(validation_pred, 'is_violent_recid', [0], prot, priv)
bi_met = BinaryLabelDatasetMetric(stdDs,
                                  privileged_groups=[priv_dict],
                                  unprivileged_groups=[unpriv_dict])
class_met = ClassificationMetric(stdDs,
                                 stdPred,
                                 unprivileged_groups=[unpriv_dict],
                                 privileged_groups=[priv_dict])

disparate_impact = bi_met.disparate_impact()
error_rate_ratio = class_met.error_rate_ratio()  # needed below for the er_y bar
eq_diff = class_met.equal_opportunity_difference()

#Create 2 Bar Graphs
x = [1]
di_y = [disparate_impact]
er_y = [error_rate_ratio]
plt.ylim(bottom=0, top=2)