Example #1
def _preprocess_data(
    data, protected_attribute_name, protected_attribute_index, label_name, required_fairness
):
    import numpy as np
    import tensorflow as tf
    from pandas import DataFrame
    from aif360.datasets import BinaryLabelDataset

    dataset = BinaryLabelDataset(
        df=DataFrame(data),
        protected_attribute_names=[protected_attribute_name],
        label_names=[label_name],
        # Labels in `data` are expected to take the values 2 (favorable) and 1 (unfavorable)
        favorable_label=2,
        unfavorable_label=1,
    )
    train, test = dataset.split([0.8])

    from aif360.algorithms.inprocessing import AdversarialDebiasing

    # AdversarialDebiasing is built on the TF1 graph API, so disable eager execution first
    tf.compat.v1.disable_eager_execution()
    sess = tf.compat.v1.Session()
    debiaser = AdversarialDebiasing(
        unprivileged_groups=({protected_attribute_name: 0},),
        privileged_groups=({protected_attribute_name: 1},),
        scope_name="debiaser",
        debias=True,
        sess=sess,
    )
    debiaser.fit(train)

    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(class_weight="balanced")

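    # Drop the protected attribute column so the classifier never sees it directly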
    X_tr = np.delete(train.features, protected_attribute_index, axis=1)
    y_tr = train.labels.ravel()
    model.fit(X_tr, y_tr)

    # Run the test set through the debiaser, drop the protected column, and re-predict
    test_pred = test.copy(deepcopy=True)
    X_te = np.delete(debiaser.predict(test).features, protected_attribute_index, axis=1)
    test_pred.labels = model.predict(X_te).reshape(-1, 1)
    test_pred.scores = test_pred.labels

    # Fraction of test labels the model reproduces
    accuracy = np.mean(np.equal(test.labels, test_pred.labels))

    from aif360.metrics import ClassificationMetric
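    # Disparate impact is the ratio of predicted favorable-outcome rates:
    # Pr(favorable | unprivileged) / Pr(favorable | privileged); 1.0 means parity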
    disparate_impact = ClassificationMetric(
        test,
        test_pred,
        unprivileged_groups=({protected_attribute_name: 0},),
        privileged_groups=({protected_attribute_name: 1},),
    ).disparate_impact()

    print(f"Accuracy: {accuracy}")
    print(f"Disparate impact: {disparate_impact}")
    if disparate_impact > float(required_fairness):
        raise ValueError(
            f"Too unfair! Disparate impact was {disparate_impact} but must be less than {required_fairness}"
        )
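
For reference, a minimal invocation of the helper above might look like the following. This is a hypothetical sketch: the column names, the 1/2 label coding (matching the `favorable_label`/`unfavorable_label` settings), and the 0.8 threshold are illustrative assumptions, not part of the original example.

import numpy as np

# Hypothetical toy data: 'sex' is the binary protected column at feature index 0,
# 'x1' is a feature, and 'label' takes the values 1 (unfavorable) and 2 (favorable)
rng = np.random.default_rng(0)
data = {
    "sex": rng.integers(0, 2, size=200),
    "x1": rng.normal(size=200),
    "label": rng.integers(1, 3, size=200),
}

_preprocess_data(
    data,
    protected_attribute_name="sex",
    protected_attribute_index=0,
    label_name="label",
    required_fairness=0.8,
)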
Example #2

    def create_data():

        import os

        import numpy as np
        import pandas as pd
        import datatable as dt
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()  # aif360 pulls in tensorflow, so import it the DAI-safe way first
        from h2oaicore.systemutils import config  # provides hard_asserts, used below
        from aif360.datasets import BinaryLabelDataset
        from aif360.algorithms.preprocessing.reweighing import Reweighing

        """
        Update the below as needed
        """
        #########
        #########
        #########
        # Path to the data
        folder_path = 'tmp/'
        # Data file
        data_file = 'housing_train_proc.csv'
        full_data_file = folder_path + data_file

        if not os.path.isfile(full_data_file):
            # for testing, just return something
            if config.hard_asserts:
                return dt.Frame(np.array([[1, 2, 3], [4, 5, 6]]))
            else:
                return []

        train = pd.read_csv(full_data_file)

        validation_test_files = ['housing_test_proc.csv']

        validation_split = [0.6, 0.8]

        # Target column
        target = 'high_priced'
        favorable_label = 0
        unfavorable_label = 1

        # protected_group_info = [[protected column name 1, privileged level, unprivileged level],
        #                         [protected column name 2, privileged level, unprivileged level]]
        # The protected group columns need to be binary
        protected_group_info = [['hispanic', 0, 1], ['black', 0, 1]]
        #########
        #########
        #########

        # Set up protected group info
        protected_groups = [group_info[0] for group_info in protected_group_info]

        dataset_orig = BinaryLabelDataset(df=train, label_names=[target], favorable_label=favorable_label,
                                          unfavorable_label=unfavorable_label,
                                          protected_attribute_names=protected_groups)

        # One {column: level} dict per protected group, in the form aif360 expects
        privileged_groups = [{name: priv_level} for name, priv_level, _ in protected_group_info]
        unprivileged_groups = [{name: unpriv_level} for name, _, unpriv_level in protected_group_info]

        # Fit weights on the full dataset to be used on the external test set, if given
        RW_full = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        RW_full.fit(dataset_orig)
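        # Reweighing (Kamiran & Calders) gives each (group, label) cell the weight
        # P(group) * P(label) / P(group, label), so that group membership and label
        # become statistically independent under the weighted distribution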

        # Split the original data into train, validation, and test if applicable
        if len(validation_split) == 1:
            dataset_orig_train, dataset_orig_valid = dataset_orig.split(validation_split, shuffle=True)
        elif len(validation_split) == 2:
            dataset_orig_train_valid, dataset_orig_test = dataset_orig.split([validation_split[1]], shuffle=True)
            # Fit the weights on both the validation and test set for the test set split
            RW_train_valid = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
            RW_train_valid.fit(dataset_orig_train_valid)
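            # Rescale the train fraction: this second cut is taken inside the
            # train+valid subset, e.g. 0.6 / 0.8 = 0.75 of that subset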
            dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split(
                [validation_split[0] / (validation_split[1])], shuffle=True)
        else:
            dataset_orig_train = dataset_orig

        # Fit weights on the training set only
        RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        RW.fit(dataset_orig_train)
        dataset_transf_train = RW.transform(dataset_orig_train)

        # Add the weights to the training set
        train_df = pd.DataFrame(dataset_transf_train.features, columns=dataset_transf_train.feature_names)
        train_df[target] = dataset_transf_train.labels.ravel()
        train_df['weights'] = dataset_transf_train.instance_weights.ravel()

        # Collect the output datasets, keyed by the file names they will be saved under
        dataset_dict = {}
        dataset_dict[data_file.split('.')[0] + "_rw_train.csv"] = train_df

        # Add weights to the validation split (if a validation split was specified)
        if len(validation_split) >= 1:
            dataset_transf_valid = RW.transform(dataset_orig_valid)
            valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()
            dataset_dict[data_file.split('.')[0] + "_rw_validation.csv"] = valid_df

        # Add weights to the test split (if a test split was specified)
        if len(validation_split) >= 2:
            dataset_transf_test = RW_train_valid.transform(dataset_orig_test)
            test_df = pd.DataFrame(dataset_transf_test.features, columns=dataset_transf_test.feature_names)
            test_df[target] = dataset_transf_test.labels.ravel()
            test_df['weights'] = dataset_transf_test.instance_weights.ravel()
            dataset_dict[data_file.split('.')[0] + "_rw_test.csv"] = test_df

        # Add weights to the external test files (if provided)
        for valid_file in validation_test_files:
            valid = pd.read_csv(folder_path + valid_file)
            dataset_valid_orig = BinaryLabelDataset(df=valid, label_names=[target], favorable_label=favorable_label,
                                                    unfavorable_label=unfavorable_label,
                                                    protected_attribute_names=protected_groups)
            dataset_transf_valid = RW_full.transform(dataset_valid_orig)

            valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()

            dataset_dict[valid_file.split('.')[0] + "_rw_transformed.csv"] = valid_df

        return dataset_dict
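
Downstream, the 'weights' column this recipe attaches is meant to serve as per-row observation weights during model training. A minimal sketch, assuming one of the reweighed files produced above has been written to disk (the path and the model choice here are hypothetical):

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Hypothetical path: the reweighed training set produced by create_data()
train_df = pd.read_csv("tmp/housing_train_proc_rw_train.csv")

X = train_df.drop(columns=["high_priced", "weights"])
y = train_df["high_priced"]

# The reweighing weights enter training as sklearn sample weights
model = LogisticRegression(max_iter=1000)
model.fit(X, y, sample_weight=train_df["weights"])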