def __init__(self, step_name, step, df, sensitive_att, target_col, input_score=True, clf_threshold=0.5):
        """
        :param step_name: str, name of the current input step.
        :param step: object of the initialized class.
        :param df: pandas dataframe, stores the data.
        :param sensitive_att: str, the name of a sensitive attribute.
        :param target_col: str, the name of the target attribute.
        :param input_score: boolean, represents whether the post-processor takes predicted scores as input. Default is True.
        :param clf_threshold: float in [0, 1], the threshold used to derive class labels from predicted scores.
        """
        if "pred_"+target_col not in df.columns:
            print("Require the predictions for ",target_col, " existing in the data!")
            raise ValueError
        super().__init__(step_name=step_name, df=df, sensitive_att=sensitive_att, target_col=target_col)
        # assume the data set has been encoded to numerical values,
        # initialize a BinaryLabelDataset from AIF 360
        aif_true_df = BinaryLabelDataset(df=df.drop(columns=["pred_"+target_col]), label_names=[target_col], protected_attribute_names=[sensitive_att])

        aif_pred_df = aif_true_df.copy()

        if input_score:
            aif_pred_df.scores = df["pred_"+target_col]
        else:
            aif_pred_df.labels = np.array([int(x >= clf_threshold) for x in df["pred_"+target_col]])
        self.input_score = input_score
        self.step = step.fit(aif_true_df, aif_pred_df)
        self.clf_threshold = clf_threshold
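A minimal usage sketch for the wrapper above, assuming a hypothetical enclosing class name FairPostProcessor and AIF360's EqOddsPostprocessing, which fits on a (true, predicted) dataset pair and matches input_score=False:

# Illustrative only: FairPostProcessor and the column names are assumptions.
from aif360.algorithms.postprocessing import EqOddsPostprocessing

eq_odds = EqOddsPostprocessing(unprivileged_groups=[{"sex": 0}],
                               privileged_groups=[{"sex": 1}])
# df must hold the ground truth in "income" and predictions in "pred_income"
step = FairPostProcessor(step_name="eq_odds", step=eq_odds, df=df,
                         sensitive_att="sex", target_col="income",
                         input_score=False)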
Example #2
def test_generalized_entropy_index():
    data = np.array([[0, 1],
                     [0, 0],
                     [1, 0],
                     [1, 1],
                     [1, 0],
                     [1, 0],
                     [2, 1],
                     [2, 0],
                     [2, 1],
                     [2, 1]])
    pred = data.copy()
    pred[[3, 9], -1] = 0
    pred[[4, 5], -1] = 1
    df = pd.DataFrame(data, columns=['feat', 'label'])
    df2 = pd.DataFrame(pred, columns=['feat', 'label'])
    bld = BinaryLabelDataset(df=df, label_names=['label'],
        protected_attribute_names=['feat'])
    bld2 = BinaryLabelDataset(df=df2, label_names=['label'],
        protected_attribute_names=['feat'])
    cm = ClassificationMetric(bld, bld2)

    assert cm.generalized_entropy_index() == 0.2

    pred = data.copy()
    pred[:, -1] = np.array([0, 1, 1, 0, 0, 0, 0, 1, 1, 1])
    df2 = pd.DataFrame(pred, columns=['feat', 'label'])
    bld2 = BinaryLabelDataset(df=df2, label_names=['label'],
        protected_attribute_names=['feat'])
    cm = ClassificationMetric(bld, bld2)

    assert cm.generalized_entropy_index() == 0.3
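For reference, generalized_entropy_index with the default alpha=2 follows Speicher et al.: each individual's benefit is b_i = y_pred_i - y_true_i + 1, and the index is (1 / (n * alpha * (alpha - 1))) * sum((b_i / mu)**alpha - 1), with mu the mean benefit. A quick hand check of the first assertion, reusing the first pred from the test above:

# Hand computation of the first assertion (alpha = 2).
b = pred[:, -1] - data[:, -1] + 1                # benefit vector [1 1 1 0 2 2 1 1 1 0]
mu = b.mean()                                    # 1.0
gei = ((b / mu) ** 2 - 1).sum() / (2 * len(b))   # 4 / 20 = 0.2

Example #3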
    def __init__(self, data_set, index_train, index_validate,
                 sensitive_variable):

        self.sensitive_variable_string = data_set.columns[sensitive_variable]

        self.s_train = data_set.iloc[:index_train, sensitive_variable]
        self.s_validate = data_set.iloc[index_train:index_validate,
                                        sensitive_variable]
        self.s_test = data_set.iloc[index_validate:, sensitive_variable]

        self.y_train = data_set.iloc[:index_train, -1]
        self.y_validate = data_set.iloc[index_train:index_validate, -1]
        self.y_test = data_set.iloc[index_validate:, -1]

        self.train = BinaryLabelDataset(
            df=data_set.iloc[:index_train, :],
            label_names=['label'],
            protected_attribute_names=[self.sensitive_variable_string],
            favorable_label=1,
            unfavorable_label=0)

        self.validate = BinaryLabelDataset(
            df=data_set.iloc[index_train:index_validate, :],
            label_names=['label'],
            protected_attribute_names=[self.sensitive_variable_string],
            favorable_label=1,
            unfavorable_label=0)

        self.test = BinaryLabelDataset(
            df=data_set.iloc[index_validate:, :],
            label_names=['label'],
            protected_attribute_names=[self.sensitive_variable_string],
            favorable_label=1,
            unfavorable_label=0)
Example #4
def test_between_group():
    data = np.array([[0, 0, 1], [0, 1, 0], [1, 1, 0], [1, 1, 1], [1, 0, 0],
                     [1, 0, 0]])
    pred = data.copy()
    pred[[0, 3], -1] = 0
    pred[[4, 5], -1] = 1
    df = pd.DataFrame(data, columns=['feat', 'feat2', 'label'])
    df2 = pd.DataFrame(pred, columns=['feat', 'feat2', 'label'])
    bld = BinaryLabelDataset(df=df,
                             label_names=['label'],
                             protected_attribute_names=['feat', 'feat2'])
    bld2 = BinaryLabelDataset(df=df2,
                              label_names=['label'],
                              protected_attribute_names=['feat', 'feat2'])
    cm = ClassificationMetric(bld,
                              bld2,
                              unprivileged_groups=[{
                                  'feat': 0
                              }],
                              privileged_groups=[{
                                  'feat': 1
                              }])

    b = np.array([0.5, 0.5, 1.25, 1.25, 1.25, 1.25])
    assert cm.between_group_generalized_entropy_index(
    ) == 1 / 12 * np.sum(b**2 - 1)
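For context, between_group_generalized_entropy_index applies the same formula after replacing each individual's benefit with the mean benefit of their group, which is where the b vector above comes from: the two feat == 0 rows average to 0.5 and the four feat == 1 rows to 1.25.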
Example #5
def dataset_from_matrix(x, dataset):
    df = pd.DataFrame(data=x, columns=dataset.feature_names + dataset.label_names)
    dataset_ = BinaryLabelDataset(df=df, label_names=dataset.label_names, protected_attribute_names=dataset.protected_attribute_names)

    dataset_ = dataset.align_datasets(dataset_)
    #dataset_.favorable_label = dataset.favorable_label
    dataset_.validate_dataset()
    return dataset_
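A hedged usage sketch for dataset_from_matrix, e.g. after perturbing a dataset's feature matrix (orig stands in for an existing BinaryLabelDataset):

# Illustrative only: rebuild an aligned AIF360 dataset from a modified matrix.
x = np.hstack([orig.features, orig.labels])   # the column layout expected above
x[:, 0] += 0.01                               # some perturbation of a feature
perturbed = dataset_from_matrix(x, orig)      # aligned and validated copy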
Example #6
def equal_ops_values(random_data, predicted_data, target_variable, protected_variable, unprivileged_input):
    random_data['Pred'] = np.random.binomial(1, .5, len(random_data))  # one random prediction per row
    dataset = BinaryLabelDataset(df=random_data, label_names=[target_variable], protected_attribute_names=[protected_variable])
    classified_dataset = BinaryLabelDataset(df=predicted_data, label_names=[target_variable], protected_attribute_names=[protected_variable])
    privileged_values = predicted_data[protected_variable].unique()
    privileged_group = [{protected_variable: v} for v in privileged_values if v != unprivileged_input]
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    metric = ClassificationMetric(dataset, classified_dataset, unprivileged_group, privileged_group)
    return abs(metric.equal_opportunity_difference())
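AIF360's equal_opportunity_difference is the true-positive-rate gap, TPR(unprivileged) - TPR(privileged), so taking the absolute value above treats bias in either direction the same.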
Example #7
def reweigh_and_predict(df1, df2):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = 5410  # len(df1)
    ntest = 1804  # len(df2)
    #df = pandas.read_csv("compas.csv")
    df = pandas.get_dummies(df,
                            prefix=['sex', 'race', 'c_charge_degree'],
                            drop_first=True)
    df = df.rename(
        columns={
            'race_Non-White': 'race',
            'sex_Male': 'sex',
            'c_charge_degree_M': 'charge_degree'
        })
    # set up the BinaryLabelDataset
    label_names = ['two_year_recid']
    protected_attribute_names = ['race']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    train_data = BinaryLabelDataset(
        df=train_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(
        df=test_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'race': 1}]
    unprivileged_groups = [{'race': 0}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod = LogisticRegression()
    lmod.fit(X_train,
             y_train,
             sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod.predict(X_train)

    dataset_transf_test_pred = test_data
    X_test = scale_transf.transform(dataset_transf_test_pred.features)  # reuse the training scaler; refitting on test data would leak
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod.predict(X_test)
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
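Note the design here: Reweighing does not alter features or labels; it only attaches instance_weights to the training split, which is why those weights are passed through to LogisticRegression.fit via sample_weight.

Example #8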
def create_df_aif(df_train, df_test, label, protected_attribute, metadata):
    df_train_aif = BinaryLabelDataset(df=df_train, label_names=[label],
                                      protected_attribute_names=[protected_attribute],
                                      instance_weights_name=None, unprivileged_protected_attributes=[],
                                      privileged_protected_attributes=[], metadata=metadata)

    df_test_aif = BinaryLabelDataset(df=df_test, label_names=[label],
                                     protected_attribute_names=[protected_attribute],
                                     instance_weights_name=None, unprivileged_protected_attributes=[],
                                     privileged_protected_attributes=[], metadata=metadata)
    return df_train_aif, df_test_aif
Example #9
def _preprocess_data(
    data, protected_attribute_name, protected_attribute_index, label_name, required_fairness
):
    import numpy as np
    import tensorflow as tf
    from pandas import DataFrame
    from aif360.datasets import BinaryLabelDataset

    dataset = BinaryLabelDataset(
        df=DataFrame(data),
        protected_attribute_names=[protected_attribute_name],
        label_names=[label_name],
        favorable_label=2,
        unfavorable_label=1,
    )
    train, test = dataset.split([0.8])

    from aif360.algorithms.inprocessing import AdversarialDebiasing

    sess = tf.compat.v1.Session()
    debiaser = AdversarialDebiasing(
        unprivileged_groups=({protected_attribute_name: 0},),
        privileged_groups=({protected_attribute_name: 1},),
        scope_name="debiaser",
        debias=True,
        sess=sess,
    )
    debiaser.fit(train)

    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(class_weight="balanced")

    X_tr = np.delete(train.features, protected_attribute_index, axis=1)
    y_tr = train.labels.ravel()
    model.fit(X_tr, y_tr)

    # AdversarialDebiasing.predict returns a dataset copy with debiased labels;
    # its features are unchanged, so drop the protected column before scoring.
    debiased_test = debiaser.predict(test)
    X_te = np.delete(debiased_test.features, protected_attribute_index, axis=1)
    test_pred = test.copy(deepcopy=True)
    test_pred.scores = model.predict(X_te)

    accuracy = np.mean(np.equal(test.scores, test_pred.scores))  # mean, not sum, so this is a rate in [0, 1]

    from aif360.metrics import ClassificationMetric
    disparate_impact = ClassificationMetric(
        test,
        test_pred,
        unprivileged_groups=({protected_attribute_name: 0},),
        privileged_groups=({protected_attribute_name: 1},),
    ).disparate_impact()

    print(f"Accuracy: {accuracy}")
    print(f"Disparate impact: {disparate_impact}")
    if disparate_impact > float(required_fairness):
        raise ValueError(
            f"Too unfair! Disparate impact was {disparate_impact} but must be less than {required_fairness}"
        )
Example #10
def test(dataset, model, x_test, thresh_arr, unprivileged_groups,
         privileged_groups):
    # relies on k, model_AIF, y_test, and A_test from the enclosing scope
    bld = BinaryLabelDataset(df=dataset,
                             label_names=['labels'],
                             protected_attribute_names=['age'])

    if np.isin(k, model_AIF):
        y_val_pred_prob = model.predict_proba(bld)
    else:
        y_val_pred_prob, A_val_pred_prob = model.predict_proba(x_test)

    metric_arrs = np.array([])  # flat array; eight metrics are appended per threshold
    for thresh in thresh_arr:
        if np.isin(k, model_AIF):
            y_val_pred = (y_val_pred_prob > thresh).astype(np.float64)
        else:
            y_val_pred = (y_val_pred_prob.numpy() > thresh).astype(np.float64)

        metric_arrs = np.append(metric_arrs,
                                roc_auc_score(y_test, y_val_pred_prob))

        if np.isin(k, model_AIF):
            metric_arrs = np.append(metric_arrs, 0)
        else:
            metric_arrs = np.append(metric_arrs,
                                    roc_auc_score(A_test, A_val_pred_prob))

        dataset_pred = dataset.copy()
        dataset_pred['labels'] = y_val_pred  # assign the column; attribute assignment would not update the dataframe
        bld2 = BinaryLabelDataset(df=dataset_pred,
                                  label_names=['labels'],
                                  protected_attribute_names=['age'])

        metric = ClassificationMetric(bld,
                                      bld2,
                                      unprivileged_groups=unprivileged_groups,
                                      privileged_groups=privileged_groups)

        metric_arrs = np.append(
            metric_arrs,
            ((metric.true_positive_rate() + metric.true_negative_rate()) / 2))
        metric_arrs = np.append(metric_arrs, metric.average_odds_difference())
        metric_arrs = np.append(metric_arrs, metric.disparate_impact())
        metric_arrs = np.append(metric_arrs,
                                metric.statistical_parity_difference())
        metric_arrs = np.append(metric_arrs,
                                metric.equal_opportunity_difference())
        metric_arrs = np.append(metric_arrs, metric.theil_index())

    return metric_arrs
Example #11
def reweigh_and_predict(df1, df2):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    #df = pandas.read_csv("UCIAdult.csv")
    df = pandas.get_dummies(df, prefix=['income', 'sex', 'native_country', 'marital_status',
                                        'workclass', 'occupation'], drop_first=True)
    df = df.rename(columns={'income_>50K': 'income', 'sex_Female': 'sex',
                            'native_country_United-States': 'native_country',
                            'marital_status_Not-Married': 'marital_status'})
    #df = df.drop(columns = ['Unnamed: 0'])
    # set up the BinaryLabelDataset
    label_names = ['income']
    protected_attribute_names = ['sex']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    train_data = BinaryLabelDataset(
        df=train_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(
        df=test_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod = LogisticRegression()
    lmod.fit(X_train,
             y_train,
             sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod.predict(X_train)

    dataset_transf_test_pred = test_data
    X_test = scale_transf.transform(dataset_transf_test_pred.features)  # reuse the training scaler; refitting on test data would leak
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod.predict(X_test)
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
Example #12
def odds_diff(random_data, predicted_data, target_variable, protected_variable, unprivileged_input):
    random_data['Pred'] = np.random.binomial(1, .5, len(random_data))  # one random prediction per row
    dataset = BinaryLabelDataset(df=random_data, label_names=[target_variable], protected_attribute_names=[protected_variable])
    classified_dataset = BinaryLabelDataset(df=predicted_data, label_names=[target_variable], protected_attribute_names=[protected_variable])
    privileged_values = predicted_data[protected_variable].unique()
    privileged_group = [{protected_variable: v} for v in privileged_values if v != unprivileged_input]
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    metric = ClassificationMetric(dataset, classified_dataset, unprivileged_group, privileged_group)
    avg_abs_odds = metric.average_abs_odds_difference()
    print(avg_abs_odds)
    if round(avg_abs_odds, 3) < 0.2:  # the metric is already non-negative, so no abs() is needed
        print('The algorithm can be considered unbiased')
    else:
        print('There is a potential bias')
Example #13
    def __init__(self, *args, **kwargs):
        # remove arguments for sim_args constructor
        sim_args_names = ['mutable_features', 'domains', 'cost_fns', 'discrete']
        sim_args = {k: kwargs.pop(k, None) for k in sim_args_names}
        self.means = kwargs.pop('means', [45, 60])
        self.N = kwargs.pop('N', 1000)
        self.threshold = kwargs.pop('threshold', 55)
        self.human_readable_labels = {}

        df = self._generateData(means=self.means, N=self.N, threshold=self.threshold)

        kwargs = {'df': df, 'label_names': ['y'], 'protected_attribute_names': ['group']}

        BinaryLabelDataset.__init__(self, **kwargs)
        SimMixin.__init__(self, **sim_args)
Example #14
def get_transformed_data(dataset='data/simulated_data.csv',
                         protected_attribute='group'):
    sample_data = pd.read_csv(dataset, header=0)

    pre_transform = BinaryLabelDataset(
        favorable_label=1.0,
        unfavorable_label=0.0,
        df=sample_data,
        label_names=['outcome'],
        protected_attribute_names=[protected_attribute])

    RW = Reweighing(unprivileged_groups=[{'group': 0}],
                    privileged_groups=[{'group': 1}])
    post_transform = RW.fit_transform(pre_transform)  # fit_transform covers the commented-out RW.fit step
    ds = post_transform.convert_to_dataframe()[0]
    X = ds.drop('outcome', axis=1)
    y = ds['outcome']
    return {
        'simulated_data': {
            'data': X.values,
            'labels': y.values,
            'participant_ids': np.arange(0, len(ds)),
            'feature_names': np.array([f for f in ds if f not in ['outcome']])
        }
    }
Example #15
def create_binary_dataset_sb():
    """Create an AIF360 binary dataset from the company CSV.

    Returns a BinaryLabelDataset with categorical columns one-hot encoded
    and 'sex' binarized (M -> 1).
    """
    data = pd.read_csv('../company_x_sb.csv', index_col='employee_id')
    data_with_label = data.copy()
    data_with_label['sex'] = (data_with_label['sex'] == 'M').astype(int)

    std_data = StandardDataset(df=data_with_label,
                               label_name='new_signing_bonus',
                               favorable_classes=[1],
                               protected_attribute_names=['sex'],
                               privileged_classes=[[1]],
                               categorical_features=['degree_level', 'dept'],
                               features_to_drop=['boss_id'])

    df_data = std_data.convert_to_dataframe()
    binary_dataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0,
                                        df=df_data[0], label_names=['new_signing_bonus'],
                                        protected_attribute_names=['sex'])
    
    return binary_dataset
Example #16
    def get_fairness_metrics():
        # relies on df, X, and ypred_class from the enclosing scope
        def get_bldm_metrics():
            metric_BLDM = BinaryLabelDatasetMetric(
                dataset, unprivileged_group, privileged_group)
            return {"Statistical Parity Difference": metric_BLDM.statistical_parity_difference(), "Disparate Impact": metric_BLDM.disparate_impact()}

        def get_cm_metrics():
            df_pred = X.copy()
            df_pred[df.columns[-1]] = np.expand_dims(ypred_class, axis=1)

            dataset_pred = BinaryLabelDataset(df=df_pred, label_names=[
                'action_taken_name'], protected_attribute_names=['applicant_sex_name_Female'])

            metric_CM = ClassificationMetric(
                dataset, dataset_pred, privileged_groups=privileged_group, unprivileged_groups=unprivileged_group)

            return {
                "Equal Opportunity Difference":   metric_CM.equal_opportunity_difference(),
                'Average Odds Difference': metric_CM.average_odds_difference(),
                "Accuracy Male": metric_CM.accuracy(privileged=True),
                "Accuracy Female":  metric_CM.accuracy(privileged=False)
            }

        dataset = BinaryLabelDataset(df=df, label_names=[
            'action_taken_name'], protected_attribute_names=['applicant_sex_name_Female'])

        privileged_group = [{'applicant_sex_name_Female': 0}]
        unprivileged_group = [{'applicant_sex_name_Female': 1}]

        return {**get_bldm_metrics(), **get_cm_metrics()}
Example #17
def make_dataset(features,
                 labels=None,
                 scores=None,
                 protected_columns=None,
                 privileged_groups=None,
                 unprivileged_groups=None,
                 favorable_label=None,
                 unfavorable_label=None):
    df = features.copy()

    if labels is None:
        labels = favorable_label  # no labels given: mark every instance favorable

    df['outcome'] = labels

    if scores is not None:
        scores_names = ['scores']  # BinaryLabelDataset expects a list of column names
        df['scores'] = scores
    else:
        scores_names = []

    dataset = BinaryLabelDataset(
        df=df,
        label_names=['outcome'],
        scores_names=scores_names,
        protected_attribute_names=protected_columns,
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label,
        unprivileged_protected_attributes=unprivileged_groups or [])  # guard the None default
    return dataset
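A minimal, illustrative call (toy column names; note that the privileged_groups parameter is accepted but unused above):

# Toy example; all names here are illustrative.
feats = pd.DataFrame({"sex": [0, 1, 1, 0], "x": [0.2, 0.5, 0.1, 0.9]})
ds = make_dataset(feats, labels=[1, 0, 1, 0],
                  protected_columns=["sex"],
                  favorable_label=1, unfavorable_label=0)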
Example #18
def get_adult_data():
    '''
    Preprocess the Adult data set by removing some features and loading it into a BinaryLabelDataset.
    You need to download the Adult dataset (both the adult.data and adult.test files) from https://archive.ics.uci.edu/ml/datasets/Adult
    '''

    headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'y']
    train = pd.read_csv('adult.data', header = None)
    test = pd.read_csv('adult.test', header = None)
    df = pd.concat([train, test], ignore_index=True)
    df.columns = headers

    df['y'] = df['y'].replace({' <=50K.': 0, ' >50K.': 1, ' >50K': 1, ' <=50K': 0 })

    df = df.drop(df[(df[headers[-2]] == ' ?') | (df[headers[6]] == ' ?')].index)
    df = pd.get_dummies(df, columns=[headers[1], headers[5], headers[6], headers[7], headers[9], headers[8], 'native-country'])

    delete_these = ['race_ Amer-Indian-Eskimo','race_ Asian-Pac-Islander','race_ Black','race_ Other', 'sex_ Female']

    delete_these += ['native-country_ Cambodia', 'native-country_ Canada', 'native-country_ China', 'native-country_ Columbia', 'native-country_ Cuba', 'native-country_ Dominican-Republic', 'native-country_ Ecuador', 'native-country_ El-Salvador', 'native-country_ England', 'native-country_ France', 'native-country_ Germany', 'native-country_ Greece', 'native-country_ Guatemala', 'native-country_ Haiti', 'native-country_ Holand-Netherlands', 'native-country_ Honduras', 'native-country_ Hong', 'native-country_ Hungary', 'native-country_ India', 'native-country_ Iran', 'native-country_ Ireland', 'native-country_ Italy', 'native-country_ Jamaica', 'native-country_ Japan', 'native-country_ Laos', 'native-country_ Mexico', 'native-country_ Nicaragua', 'native-country_ Outlying-US(Guam-USVI-etc)', 'native-country_ Peru', 'native-country_ Philippines', 'native-country_ Poland', 'native-country_ Portugal', 'native-country_ Puerto-Rico', 'native-country_ Scotland', 'native-country_ South', 'native-country_ Taiwan', 'native-country_ Thailand', 'native-country_ Trinadad&Tobago', 'native-country_ United-States', 'native-country_ Vietnam', 'native-country_ Yugoslavia']

    delete_these += ['fnlwgt', 'education']

    df.drop(delete_these, axis=1, inplace=True)

    return BinaryLabelDataset(df=df, label_names=['y'], protected_attribute_names=['sex_ Male', 'race_ White'])
Example #19
def prepare_dataset(features,
                    labels,
                    protected_attribute,
                    privileged_attribute_values,
                    unprivileged_attribute_values,
                    favorable_label=1.,
                    unfavorable_label=0.):
    """Prepare dataset for computing fairness metrics."""
    df = features.copy()
    df['outcome'] = labels

    return BinaryLabelDataset(
        df=df,
        label_names=['outcome'],
        scores_names=[],
        protected_attribute_names=[protected_attribute],
        privileged_protected_attributes=[
            np.array(privileged_attribute_values)
        ],
        unprivileged_protected_attributes=[
            np.array(unprivileged_attribute_values)
        ],
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label,
    )
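A short usage sketch (toy data; column and group names are illustrative):

# Toy usage: compute a group fairness metric on the prepared dataset.
from aif360.metrics import BinaryLabelDatasetMetric

feats = pd.DataFrame({"group": [0, 0, 1, 1]})
ds = prepare_dataset(feats, labels=[0, 1, 1, 1],
                     protected_attribute="group",
                     privileged_attribute_values=[1],
                     unprivileged_attribute_values=[0])
metric = BinaryLabelDatasetMetric(ds, unprivileged_groups=[{"group": 0}],
                                  privileged_groups=[{"group": 1}])
print(metric.statistical_parity_difference())  # 0.5 - 1.0 = -0.5

Example #20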
    def apply(self, df):
        """
        :param df: pandas dataframe, stores the data to apply the learned discretizer.
        :return: pandas dataframe, stores the data after discretize.
        """
        if self.na_mark:
            df = df.replace({self.na_mark:np.nan})
        if self.fair_aware: # fair-preprocessor
            aif_df = BinaryLabelDataset(df=df, label_names=[self.target_col], protected_attribute_names=[self.sensitive_att])
            if self.fit_flag: # fit has been initialized
                after_aif_df = self.step.transform(aif_df)
            else: # fit and transform is combined, e.g. DisparateImpactRemover
                after_aif_df = self.step.fit_transform(aif_df)

            after_df, _ = after_aif_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True)
            if self.weight_flag:
                preprocessed_weights = after_aif_df.instance_weights

        else: # regular preprocessor
            after_df = df.copy()
            for ai in self.focus_atts:
                after_df[ai] = self.step[ai].transform(np.array(after_df[ai]).reshape(-1, 1))

        if self.weight_flag: # for preprocessors that update weights, e.g. Reweighing (these are fair-aware, so preprocessed_weights is set above)
            return after_df, preprocessed_weights
        else:
            return after_df
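Example #21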
    def __init__(self, step_name, df, step=None, focus_atts=[], fit_flag=True, weight_flag=False, sensitive_att=None, target_col=None, fair_aware=False, na_mark=None):
        """
        :param step_name: str, name of the current input step.
        :param df: pandas dataframe, stores the data.
        :param step: object of the initialized class. If none, initialize here.
        :param focus_atts: lisf of str, each str represents the name of a column in above data that will be pre-processed.
        :param fit_flag: boolean, whether to initialize step object here.
        :param weight_flag: boolean, whether to output extra sample weight after fair-preprocessor.
        :param sensitive_att: str, the name of a sensitive attribute.
        :param target_col: str, the name of the target attribute.
        :param fair_aware: boolean, whether the preprocessor is fair-aware. Default is False. If true, sensitive_att and target_col can not be null.
        """
        super().__init__(step_name=step_name, df=df, focus_atts=focus_atts, sensitive_att=sensitive_att, target_col=target_col)

        if len(focus_atts) > 0 and fit_flag:
            fitted_step = {}
            for idx, ai in enumerate(focus_atts):
                fitted_step[ai] = step[ai].fit(np.array(df[ai]).reshape(-1, 1))
            self.step = fitted_step
        elif fair_aware and fit_flag: # for fair-preprocessors
            aif_df = BinaryLabelDataset(df=df, label_names=[target_col], protected_attribute_names=[sensitive_att])
            self.step = step.fit(aif_df)
        else:
            if step is not None:
                self.step = step

        # address different encodings of missing values
        self.na_mark = na_mark
        self.fair_aware = fair_aware
        self.fit_flag = fit_flag
        self.weight_flag = weight_flag
Example #22
def fairness_IBM(y_pred, Ztr, ytr, verbose=0):
    from collections import defaultdict
    from aif360.datasets import BinaryLabelDataset
    from aif360.metrics import ClassificationMetric

    assert np.array_equal(np.unique(Ztr),
                          np.array([0, 1])), "Z must contain both 0 and 1, and nothing else"
    # if len(ytr.shape) == 1:
    # ytr = np.expand_dims(ytr, -1)

    Ztr = np.squeeze(Ztr)
    if verbose:
        print(ytr.shape)
        print(Ztr.shape)
    unprivileged_groups = [{"zs": [0]}]
    privileged_groups = [{"zs": [1]}]
    metric_arrs = defaultdict(list)
    dict_ = {"y_true": ytr, "zs": Ztr}
    df = pd.DataFrame(dict_)
    dataset = BinaryLabelDataset(df=df,
                                 label_names=["y_true"],
                                 protected_attribute_names=["zs"],
                                 unprivileged_protected_attributes=[[0]],
                                 privileged_protected_attributes=[[1]])

    dataset_pred = dataset.copy()
    dataset_pred.labels = y_pred
    metric = ClassificationMetric(dataset,
                                  dataset_pred,
                                  unprivileged_groups=unprivileged_groups,
                                  privileged_groups=privileged_groups)

    # metric_arrs['bal_acc'].append((metric.true_positive_rate()
    #                              + metric.true_negative_rate()) / 2)
    metric_arrs["EA"].append(
        metric.accuracy(privileged=False) - metric.accuracy(privileged=True))
    # ASSUMING ALL OTHER METRICS RETURN U - P
    metric_arrs['EO'].append(metric.average_odds_difference())
    # The ideal value of this metric is 1.0.
    # A value < 1 implies a higher benefit for the privileged group,
    # and a value > 1 implies a higher benefit for the unprivileged group.
    metric_arrs['DI'].append(metric.disparate_impact() - 1)
    metric_arrs['DP'].append(metric.statistical_parity_difference())
    metric_arrs['EQ'].append(metric.equal_opportunity_difference())
    metric_arrs['TH'].append(metric.between_group_theil_index() * 10)
    results = pd.DataFrame(metric_arrs)
    return results
Example #23
def create_binary(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data, label_names=[target_variable],
                                protected_attribute_names=[protected_variable])
    privileged_values = data[protected_variable].unique()
    privileged_group = [{protected_variable: v} for v in privileged_values if v != unprivileged_input]
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    return BinaryLabelDatasetMetric(df_aif, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
Example #24
def mean_diff_values(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data, label_names=[target_variable],
                                protected_attribute_names=[protected_variable])
    privileged_values = data[protected_variable].unique()
    privileged_group = [{protected_variable: v} for v in privileged_values if v != unprivileged_input]
    unprivileged_group = [{protected_variable: unprivileged_input}] #female=0
    metric_orig = BinaryLabelDatasetMetric(df_aif, unprivileged_group, privileged_group)
    return abs(metric_orig.mean_difference().round(3))
Example #25
    def fit(self, data, labels, prot):
        ds = BinaryLabelDataset(df=data, label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        x = self.model_reweight.fit_transform(ds)
        index = x.feature_names.index(prot[0])
        x_train = np.delete(x.features, index, 1)
        y_train = x.labels.ravel()
        self.model.fit(x_train, y_train)
Example #26
def _make_dataset(data, outcome, protected_columns,
                  privileged_groups, unprivileged_groups,
                  favorable_label, unfavorable_label):
    df = data.copy()
    df['outcome'] = data[outcome].values

    dataset = BinaryLabelDataset(df=df, label_names=['outcome'], protected_attribute_names=protected_columns,
                                 favorable_label=favorable_label, unfavorable_label=unfavorable_label,
                                 unprivileged_protected_attributes=unprivileged_groups)
    return dataset
Example #27
def test_between_all_groups():
    data = np.array([[0, 1], [0, 0], [1, 0], [1, 1], [1, 0], [1, 0], [2, 1],
                     [2, 0], [2, 1], [2, 1]])
    pred = data.copy()
    pred[[3, 9], -1] = 0
    pred[[4, 5], -1] = 1
    df = pd.DataFrame(data, columns=['feat', 'label'])
    df2 = pd.DataFrame(pred, columns=['feat', 'label'])
    bld = BinaryLabelDataset(df=df,
                             label_names=['label'],
                             protected_attribute_names=['feat'])
    bld2 = BinaryLabelDataset(df=df2,
                              label_names=['label'],
                              protected_attribute_names=['feat'])
    cm = ClassificationMetric(bld, bld2)

    b = np.array([1, 1, 1.25, 1.25, 1.25, 1.25, 0.75, 0.75, 0.75, 0.75])
    assert cm.between_all_groups_generalized_entropy_index(
    ) == 1 / 20 * np.sum(b**2 - 1)
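between_all_groups_generalized_entropy_index is the same construction without designating privileged and unprivileged groups: every distinct value of the protected attribute forms its own group, so the three feat groups above contribute mean benefits of 1, 1.25, and 0.75.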
Example #28
    def fit(self, data, labels, prot):
        ds = BinaryLabelDataset(df=data, label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        x = self.model_reweight.fit_transform(ds)
        index = x.feature_names.index(prot[0])
        x_train = np.delete(x.features, index, 1)
        y_train = x.labels
        x_train = torch.tensor(x_train).float()
        y_train = torch.tensor(y_train).float()
        self.model.fit(x_train, y_train)
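Example #29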
    def apply(self, df):
        """
        :param df: pandas dataframe, stores the data to apply the learned discretizer.
        :return: pandas dataframe, stores the data after discretize.
        """

        # initialize AIF360 BinaryLabelDataset

        if self.input_score: # use score prediction to fit model, e.g. RejectOptionClassification, CalibratedEqOddsPostprocessing
            aif_pred_df = BinaryLabelDataset(df=df, label_names=[self.target_col], scores_names=[self.pred_target_col],
                                             protected_attribute_names=[self.sensitive_att])
        else: # use label prediction to fit model, e.g. EqOddsPostprocessing
            df["pred_label_"+self.target_col] = [int(x >= self.clf_threshold) for x in df[self.pred_target_col]]
            aif_pred_df = BinaryLabelDataset(df=df.drop(columns=[self.pred_target_col]), label_names=["pred_label_"+self.target_col],
                                         protected_attribute_names=[self.sensitive_att])

        after_aif_df = self.step.predict(aif_pred_df)
        after_df, _ = after_aif_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True)
        after_df[self.pred_target_col] = after_aif_df.labels

        return after_df
Example #30
def reweigh_and_predict(df1, df2):
  label_names = ['Y']
  protected_attribute_names = ['A']

  df = pandas.concat([df1, df2])
  ntrain = len(df1)
  ntest = len(df2)

  train_data = df.head(ntrain)
  test_data = df.tail(ntest)


  train_data = BinaryLabelDataset(df=train_data, label_names=label_names,
                                  protected_attribute_names=protected_attribute_names)
  test_data = BinaryLabelDataset(df=test_data, label_names=label_names,
                                 protected_attribute_names=protected_attribute_names)

  privileged_groups = [{'A': 0}]
  unprivileged_groups = [{'A': 1}]
  RW = Reweighing(unprivileged_groups=unprivileged_groups,
                 privileged_groups=privileged_groups)
  RW.fit(train_data)
  dataset_transf_train = RW.transform(train_data)

  scale_transf = StandardScaler()
  X_train = scale_transf.fit_transform(dataset_transf_train.features)
  y_train = dataset_transf_train.labels.ravel()

  lmod = LogisticRegression()
  lmod.fit(X_train, y_train,
        sample_weight=dataset_transf_train.instance_weights)
  y_train_pred = lmod.predict(X_train)

  dataset_transf_test_pred = test_data
  X_test = scale_transf.transform(dataset_transf_test_pred.features)  # reuse the training scaler; refitting on test data would leak
  y_test = dataset_transf_test_pred.labels
  dataset_transf_test_pred.scores = lmod.predict_proba(X_test)[:,1:2].ravel()
  Y_hat = dataset_transf_test_pred.scores

  return Y_hat