Пример #1
0
def german_dataset_age(name_prot=['age']):
    """Write the German credit data, keyed on age, to ``dataset/German_age.csv``.

    A ``GermanDataset`` is built with ``name_prot`` as the protected attribute
    names, values ``>= 25`` flagged as privileged, and the ``personal_status``
    and ``sex`` features dropped. Its dataframe form gets the ``credit``
    column renamed to ``labels`` before being saved. Nothing is returned.
    """
    def is_privileged(value):
        # Privileged class test applied to the protected attribute.
        return value >= 25

    german = GermanDataset(
        protected_attribute_names=name_prot,
        privileged_classes=[is_privileged],
        features_to_drop=['personal_status', 'sex'],
    )
    # convert_to_dataframe() yields a pair; only the dataframe is needed.
    frame = german.convert_to_dataframe()[0]
    frame = frame.rename(columns={'credit': 'labels'})
    frame.to_csv("dataset/German_age.csv")
Пример #2
0
def load_german_dataset():
    """
    Build the aif360 German Credit dataset and split it into two parts.

    'age' is the protected attribute, with age >= 25 treated as privileged.
    The sex-related features ('personal_status' and 'sex') are dropped.
    The split is shuffled and uses a 0.7 split point.

    :return: Tuple ``(first_part, second_part)`` produced by the split
    """
    options = {
        'protected_attribute_names': ['age'],
        'privileged_classes': [lambda x: x >= 25],
        'features_to_drop': ['personal_status', 'sex'],
    }
    german = GermanDataset(**options)
    train_part, test_part = german.split([0.7], shuffle=True)
    return train_part, test_part
def german_dataset(name_prot=['sex']):
    """Load the German credit data and split it into modelling pieces.

    A ``GermanDataset`` is built with ``name_prot`` as protected attribute
    names and the ``personal_status`` and ``age`` features dropped. In the
    resulting dataframe the ``credit`` column is renamed to ``labels`` and
    its values are remapped 1 -> 0 and 2 -> 1.

    Args:
        name_prot: List of protected attribute column names.

    Returns:
        Tuple ``(data, atribute, sensitive, output, privileged_groups,
        unprivileged_groups)`` where ``data`` is the full dataframe,
        ``atribute`` is ``data`` without the label and protected columns,
        ``sensitive`` holds the protected columns, ``output`` is the remapped
        label column, and the two group lists are fixed to ``sex`` being
        1 (privileged) and 0 (unprivileged).
    """
    dataset_orig = GermanDataset(protected_attribute_names=name_prot,
                                 features_to_drop=['personal_status', 'age'])

    # NOTE(review): the groups are hard-coded to 'sex' even when a different
    # name_prot is supplied - confirm this is intended.
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]

    data, _ = dataset_orig.convert_to_dataframe()
    data = data.rename(columns={'credit': 'labels'})
    sensitive = data[name_prot]

    # Assign the remapped labels back to the frame explicitly. Replacing
    # in place on a Series pulled out of the frame is a chained update that
    # does not reach ``data`` when pandas Copy-on-Write is active, which
    # would leave ``data`` and ``output`` with different label encodings.
    data['labels'] = data['labels'].replace({1: 0, 2: 1})
    output = data['labels']

    atribute = data.drop(columns='labels').drop(columns=name_prot)
    return data, atribute, sensitive, output, privileged_groups, unprivileged_groups
Пример #4
0
def load_dataset(name):
    """Instantiate one of the supported datasets by name.

    Args:
        name: One of ``'Adult'``, ``'German'`` or ``'Compas'`` (case
            sensitive).

    Returns:
        Tuple ``(dataset, name)``.

    Raises:
        ValueError: If ``name`` is not one of the supported names. Without
            this check the function failed with an ``UnboundLocalError`` on
            the return statement.
    """
    if name == 'Adult':
        ds = AdultDataset()
    elif name == 'German':
        ds = GermanDataset()
    elif name == 'Compas':
        ds = CompasDataset()
    else:
        raise ValueError(
            f"Unknown dataset name {name!r}; "
            "expected 'Adult', 'German' or 'Compas'"
        )
    return ds, name
Пример #5
0
def test_german():
    """The default German dataset should report exactly 1000 instances."""
    metric = BinaryLabelDatasetMetric(GermanDataset())
    instance_count = metric.num_instances()
    assert instance_count == 1000
Пример #6
0
def LoadData(dataset_name, protected_attribute_name, raw=True):
    """Load a dataset together with its group definitions and options.

    Supported combinations are ``adult`` with ``sex``/``race``, ``german``
    with ``sex``/``age`` and ``compas`` with ``sex``/``race``.

    Args:
        dataset_name: ``"adult"``, ``"german"`` or ``"compas"``.
        protected_attribute_name: Protected attribute to use; must be valid
            for the chosen dataset.
        raw: When true the dataset class is instantiated with its defaults
            (``AdultDataset()``, ``GermanDataset()``, ``CompasDataset()``);
            otherwise the matching ``load_preproc_data_*`` loader is called
            with ``[protected_attribute_name]``.

    Returns:
        Tuple ``(dataset, privileged_groups, unprivileged_groups,
        optim_options)``. The groups are ``[{attribute: 1}]`` and
        ``[{attribute: 0}]``. ``optim_options`` is a dict with the keys
        ``distortion_fun``, ``epsilon``, ``clist`` and ``dlist``.

    Raises:
        SystemExit: After printing a message, when the dataset/attribute
            combination is not supported.
    """
    supported_attributes = {
        "adult": ("sex", "race"),
        "german": ("sex", "age"),
        "compas": ("sex", "race"),
    }

    # Validate before touching any dataset. Previously the german branch
    # dereferenced the dataset even when none had been loaded (raw=False with
    # an unsupported attribute), crashing with UnboundLocalError instead of
    # reporting the unsupported combination.
    if protected_attribute_name not in supported_attributes.get(dataset_name, ()):
        print('No such dataset & group option:', dataset_name, protected_attribute_name)
        # Same exception exit() raised, without relying on the site module.
        raise SystemExit

    if dataset_name == "adult":
        if raw:
            dataset_original = AdultDataset()
        else:
            dataset_original = load_preproc_data_adult([protected_attribute_name])
        distortion_fun = get_distortion_adult
    elif dataset_name == "german":
        if raw:
            dataset_original = GermanDataset()
        else:
            dataset_original = load_preproc_data_german([protected_attribute_name])
        distortion_fun = get_distortion_german
        # Remap labels via 2 - label (1 stays 1, 2 becomes 0) and record 0 as
        # the unfavorable label.
        dataset_original.labels = 2 - dataset_original.labels
        dataset_original.unfavorable_label = 0.
    else:
        if raw:
            dataset_original = CompasDataset()
        else:
            dataset_original = load_preproc_data_compas([protected_attribute_name])
        distortion_fun = get_distortion_compas

    # Only the distortion function differs between datasets.
    optim_options = {
        "distortion_fun": distortion_fun,
        "epsilon": 0.05,
        "clist": [0.99, 1.99, 2.99],
        "dlist": [.1, 0.05, 0]
    }

    # NOTE(review): the previous code also assigned local group variables
    # that were never used; for compas/sex they were inverted (privileged
    # {'sex': 0}). The returned groups have always been attribute == 1 for
    # privileged and attribute == 0 for unprivileged, which is kept here -
    # confirm which encoding is intended for compas/sex.
    privileged_groups = [{protected_attribute_name: 1}]
    unprivileged_groups = [{protected_attribute_name: 0}]

    return dataset_original, privileged_groups, unprivileged_groups, optim_options
Пример #7
0
    def __init__(self, *args, **kwargs):
        """Initialise the German dataset part and the simulation mixin.

        The keyword arguments ``mutable_features``, ``domains``, ``cost_fns``
        and ``discrete`` are removed from ``kwargs`` and passed to
        ``SimMixin.__init__`` (``None`` for any that were not supplied).
        The remaining arguments go to ``GermanDataset.__init__`` after
        ``custom_preprocessing``, ``metadata`` and ``categorical_features``
        have been overwritten with the fixed values below.
        """
        # Pull out the options that belong to the SimMixin constructor.
        sim_args = {}
        for option in ('mutable_features', 'domains', 'cost_fns', 'discrete'):
            sim_args[option] = kwargs.pop(option, None)

        # Fixed dataset configuration; caller-supplied values are replaced.
        kwargs.update(
            custom_preprocessing=custom_preprocessing,
            metadata=default_mappings,
            categorical_features=[
                'credit_history', 'purpose', 'employment', 'other_debtors',
                'property', 'installment_plans', 'housing', 'skill_level',
                'telephone'
            ],
        )

        # Attribute codes mapped to readable descriptions, grouped by prefix.
        readable = {}
        # A4x
        readable.update({
            "A40": "car (new)",
            "A41": "car (used)",
            "A42": "furniture/equipment",
            "A43": "radio/television",
            "A44": "domestic appliances",
            "A45": "repairs",
            "A46": "education",
            "A47": "vacation",
            "A48": "retraining",
            "A49": "business",
            "A410": "others",
        })
        # A3x
        readable.update({
            "A30": "no credits taken",
            "A31": "all credits at this bank paid back duly",
            "A32": "existing credits paid back duly till now",
            "A33": "delay in paying off in the past",
            "A34": "critical account",
        })
        # A7x
        readable.update({
            "A71": "unemployed",
            "A72": "< 1 year",
            "A73": "1  <= ... < 4 years",
            "A74": "4  <= ... < 7 years",
            "A75": ">= 7 years",
        })
        # A10x
        readable.update({
            "A101": "none",
            "A102": "co-applicant",
            "A103": "guarantor",
        })
        # A12x
        readable.update({
            "A121": "real estate",
            "A122": "building society savings agreement/life insurance",
            "A123": "car or other",
            "A124": "unknown / no property",
        })
        # A14x
        readable.update({
            "A141": "bank",
            "A142": "stores",
            "A143": "none",
        })
        # A15x
        readable.update({
            "A151": "rent",
            "A152": "own",
            "A153": "for free",
        })
        # A17x
        readable.update({
            "A171": "unemployed/ unskilled  - non-resident",
            "A172": "unskilled - resident",
            "A173": "skilled employee / official",
            "A174":
            "management/ self-employed/ Highly qualified employee/ officer",
        })
        # A19x
        readable.update({
            "A191": "none",
            "A192": "yes, registered under the customers name",
        })
        self.human_readable_labels = readable

        GermanDataset.__init__(self, *args, **kwargs)
        SimMixin.__init__(self, **sim_args)
Пример #8
0
def german_dataset_sex(name_prot=['sex']):
    """Write the German credit data, keyed on sex, to ``dataset/German_sex.csv``.

    A ``GermanDataset`` is built with ``name_prot`` as the protected attribute
    names and the ``personal_status`` and ``age`` features dropped. Its
    dataframe form gets the ``credit`` column renamed to ``labels`` before
    being saved. Nothing is returned.
    """
    german = GermanDataset(
        protected_attribute_names=name_prot,
        features_to_drop=['personal_status', 'age'],
    )
    # convert_to_dataframe() yields a pair; only the dataframe is needed.
    frame = german.convert_to_dataframe()[0]
    frame = frame.rename(columns={'credit': 'labels'})
    frame.to_csv("dataset/German_sex.csv")
Пример #9
0
def load_preproc_data_german(protected_attributes=None):
    """
    Load and pre-process german credit dataset.
    Args:
        protected_attributes(list or None): If None use all possible protected
            attributes ('sex' and 'age'), else subset the protected attributes
            to the list.

    Returns:
        GermanDataset: An instance of GermanDataset with required pre-processing.

    """
    def custom_preprocessing(df):
        """ Custom pre-processing for German Credit Data

        Derives a binary 'sex' column from 'personal_status', binarises 'age'
        at 25 and collapses the coded categorical columns into coarse groups.
        Codes that are not listed in a grouping table become 'NA'.
        """
        credit_hist_groups = {
            'A30': 'None/Paid',
            'A31': 'None/Paid',
            'A32': 'None/Paid',
            'A33': 'Delay',
            'A34': 'Other',
        }
        employ_groups = {
            'A71': 'Unemployed',
            'A72': '1-4 years',
            'A73': '1-4 years',
            'A74': '4+ years',
            'A75': '4+ years',
        }
        savings_groups = {
            'A61': '<500',
            'A62': '<500',
            'A63': '500+',
            'A64': '500+',
            'A65': 'Unknown/None',
        }
        status_groups = {
            'A11': '<200',
            'A12': '<200',
            'A13': '200+',
            'A14': 'None',
        }

        def regroup(column, groups):
            # Map every code to its group, falling back to 'NA'.
            return df[column].apply(lambda code: groups.get(code, 'NA'))

        status_map = {
            'A91': 1.0,
            'A93': 1.0,
            'A94': 1.0,
            'A92': 0.0,
            'A95': 0.0
        }
        df['sex'] = df['personal_status'].replace(status_map)

        # group credit history, savings, and employment
        df['credit_history'] = regroup('credit_history', credit_hist_groups)
        df['savings'] = regroup('savings', savings_groups)
        df['employment'] = regroup('employment', employ_groups)
        # Builtin float: the np.float alias was removed from NumPy and
        # raises AttributeError on current releases.
        df['age'] = df['age'].apply(lambda x: float(x >= 25))
        df['status'] = regroup('status', status_groups)

        return df

    # Feature partitions
    XD_features = ['credit_history', 'savings', 'employment', 'sex', 'age']
    D_features = ['sex', 'age'
                  ] if protected_attributes is None else protected_attributes
    Y_features = ['credit']
    # Order-preserving difference, so the list is identical on every run
    # (a set difference has no stable order for strings).
    X_features = [name for name in XD_features if name not in D_features]
    categorical_features = ['credit_history', 'savings', 'employment']

    # privileged classes
    all_privileged_classes = {"sex": [1.0], "age": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            1.0: 'Male',
            0.0: 'Female'
        },
        "age": {
            1.0: 'Old',
            0.0: 'Young'
        }
    }

    return GermanDataset(
        label_name=Y_features[0],
        favorable_classes=[1],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        metadata={
            'label_maps': [{
                1.0: 'Good Credit',
                2.0: 'Bad Credit'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)
def main():
    """Run the German credit reweighing demo end to end.

    Steps: load the German dataset with 'age' as protected attribute, split
    it, print the mean-outcome difference on the training split, train and
    score a random forest, apply ``Reweighing`` to the training split, print
    the mean-outcome difference again, then re-split the reweighed data and
    train and score a second random forest.
    """
    import sys
    sys.path.insert(1, "../")

    import numpy as np
    np.random.seed(0)

    # Environment notes kept from the original script:
    #   pip install numba==0.43.0
    #   pip install --ignore-installed llvmlite==0.32.1

    from aif360.datasets import GermanDataset
    from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric as CM
    from aif360.algorithms.preprocessing import Reweighing

    from IPython.display import Markdown, display

    from sklearn.ensemble import RandomForestClassifier as RF

    privileged_groups = [{'age': 1}]
    unprivileged_groups = [{'age': 0}]

    def train_and_report(train, test):
        """Fit a random forest on ``train`` and print its scores on ``test``."""
        test_pred = test.copy(deepcopy=True)

        clf = RF()
        # Labels are flattened to the 1-D shape scikit-learn expects, and the
        # instance weights are passed on: Reweighing expresses its mitigation
        # only through those weights, so a fit without them ignores it.
        clf.fit(train.features,
                train.labels.ravel(),
                sample_weight=train.instance_weights)

        predictions = clf.predict(test.features)
        proba_predictions = clf.predict_proba(test.features)

        test_pred.scores = proba_predictions[:, 0].reshape(-1, 1)
        test_pred.labels = predictions.reshape(-1, 1)

        cm_pred_valid = CM(test,
                           test_pred,
                           unprivileged_groups=unprivileged_groups,
                           privileged_groups=privileged_groups)

        print("AIF360 metrics")
        for key in ["recall", "accuracy", "precision"]:
            # Look the metric method up by name instead of using eval().
            print("{} score is: {}".format(key, getattr(cm_pred_valid, key)()))

    # Step 2: load the dataset with its protected attribute and split it.
    dataset_orig = GermanDataset(
        # The dataset also has a "sex" protected attribute, which is not
        # considered in this evaluation.
        protected_attribute_names=['age'],
        # age >= 25 is considered privileged
        privileged_classes=[lambda x: x >= 25],
        # ignore sex-related attributes
        features_to_drop=['personal_status', 'sex'])
    dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7],
                                                               shuffle=True)

    # Step 3: fairness metric (mean difference) on the original training data.
    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    display(Markdown("#### Original training dataset"))
    # NOTE(review): the ".17" in this message is fixed text, not the computed
    # value - confirm it still matches what is printed.
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f. AKA the privileged group is getting .17 more positive outcomes in the training dataset."
        % metric_orig_train.mean_difference())
    print()

    train_and_report(dataset_orig_train, dataset_orig_test)

    # Step 4: mitigate bias with the Reweighing pre-processing algorithm.
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    dataset_transf_train = RW.fit_transform(dataset_orig_train)

    # Step 5: fairness metric on the transformed dataset.
    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    display(Markdown("#### Transformed training dataset"))
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f"
        % metric_transf_train.mean_difference())

    # Re-split the transformed data and evaluate a freshly trained model.
    dataset_transf_train, dataset_transf_test = dataset_transf_train.split(
        [0.7], shuffle=True)
    train_and_report(dataset_transf_train, dataset_transf_test)
# Configuration: which dataset to load and which of its two protected
# attributes to use.
dataset_used = "compas" # "adult", "german", "compas"
protected_attribute_used = 2 # 1, 2

# Load the dataset and define the privileged / unprivileged groups for the
# selected protected attribute (1 selects 'sex'; anything else selects the
# dataset's second attribute).
if dataset_used == "adult":
    # Alternative loader: load_preproc_data_adult()
    dataset_orig = AdultDataset()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]

elif dataset_used == "german":
    dataset_orig = GermanDataset()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'age': 1}]
        unprivileged_groups = [{'age': 0}]

elif dataset_used == "compas":
    # Alternative loader: CompasDataset()
    dataset_orig = load_preproc_data_compas()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        # This assignment was missing, leaving unprivileged_groups undefined
        # for the compas / race selection, unlike every parallel branch.
        unprivileged_groups = [{'race': 0}]
Пример #12
0
def get_data(dataset, protected_attribute, seed=101):
    """Load a dataset, pick its protected groups and split it three ways.

    Args:
        dataset: ``'adult'``, ``'german'``, ``'compas'`` or ``'bank'``.
        protected_attribute: Attribute name valid for the chosen dataset.
            ``'adult'`` and ``'compas'`` additionally accept
            ``'sex_or_race'``, which appends a column holding the logical OR
            of two existing feature columns and registers it as a protected
            attribute.
        seed: Seed passed to both shuffled splits.

    Returns:
        Tuple ``(train, valid, test, privileged_groups,
        unprivileged_groups)``. The data is first split at 0.6, then the
        remainder is split at 0.5.

    Raises:
        ValueError: If the dataset is unknown or the protected attribute is
            not available for it.
    """
    def reject_attribute():
        raise ValueError(
            f'protected attribute {protected_attribute} is not available for dataset {dataset}'
        )

    def groups_for(attribute, privileged_value=1):
        # Privileged group first, unprivileged (the other binary value) second.
        return [{attribute: privileged_value}], [{attribute: 1 - privileged_value}]

    def add_sex_or_race(ds, column_pair):
        # Append the OR of the two given feature columns as 'sex_or_race' and
        # register the new column as a protected attribute.
        ds.feature_names += ['sex_or_race']
        left, right = ds.features[:, column_pair].T
        combined = np.logical_or(left, right).astype(np.float64)
        ds.features = np.hstack([ds.features, np.expand_dims(combined, -1)])
        newest_column = ds.features[:, [-1]]
        ds.protected_attributes = np.hstack(
            [ds.protected_attributes, newest_column])
        ds.protected_attribute_names += ['sex_or_race']
        ds.privileged_protected_attributes += [np.array([1.])]
        ds.unprivileged_protected_attributes += [np.array([0.])]

    if dataset == 'adult':
        from aif360.datasets import AdultDataset
        dataset_orig = AdultDataset()
        if protected_attribute in ('sex', 'race'):
            privileged_groups, unprivileged_groups = groups_for(
                protected_attribute)
        elif protected_attribute == 'sex_or_race':
            add_sex_or_race(dataset_orig, [2, 3])
            privileged_groups, unprivileged_groups = groups_for('sex_or_race')
        else:
            reject_attribute()

    elif dataset == 'german':
        from aif360.datasets import GermanDataset
        dataset_orig = GermanDataset()
        if protected_attribute in ('sex', 'age'):
            privileged_groups, unprivileged_groups = groups_for(
                protected_attribute)
        else:
            reject_attribute()

    elif dataset == 'compas':
        from aif360.datasets import CompasDataset
        dataset_orig = CompasDataset()
        if protected_attribute == 'sex':
            # For compas, sex == 0 is the privileged group.
            privileged_groups, unprivileged_groups = groups_for(
                'sex', privileged_value=0)
        elif protected_attribute == 'sex_or_race':
            add_sex_or_race(dataset_orig, [0, 2])
            privileged_groups, unprivileged_groups = groups_for('sex_or_race')
        elif protected_attribute == 'race':
            privileged_groups, unprivileged_groups = groups_for('race')
        else:
            reject_attribute()

    elif dataset == 'bank':
        from aif360.datasets import BankDataset
        dataset_orig = BankDataset()
        if protected_attribute == 'age':
            privileged_groups, unprivileged_groups = groups_for('age')
        else:
            reject_attribute()

    else:
        raise ValueError(f'{dataset} is not an available dataset.')

    split_options = {'shuffle': True, 'seed': seed}
    dataset_orig_train, remainder = dataset_orig.split([0.6], **split_options)
    dataset_orig_valid, dataset_orig_test = remainder.split([0.5],
                                                            **split_options)

    return (dataset_orig_train, dataset_orig_valid, dataset_orig_test,
            privileged_groups, unprivileged_groups)