Пример #1
0
def test_adult():
    """Check the default Adult dataset's mean label and its instance count."""
    dataset = AdultDataset()
    mean_label = dataset.labels.mean()
    assert np.isclose(mean_label, 0.2478, atol=5e-5)

    instance_count = BinaryLabelDatasetMetric(dataset).num_instances()
    assert instance_count == 45222
Пример #2
0
def test_adult_no_drop():
    """Check that keeping only 'age' and 'education-num' yields 48842 rows."""
    kept_columns = ['age', 'education-num']
    dataset = AdultDataset(protected_attribute_names=['sex'],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=kept_columns)
    metric = BinaryLabelDatasetMetric(dataset)
    instance_count = metric.num_instances()
    assert instance_count == 48842
Пример #3
0
def test_epsilon_all_groups():
    """Check smoothed empirical differential fairness with a non-binary group.

    The Adult dataset is loaded with 'sex', 'native-country' and a numeric
    copy of 'race' ('race-num') as protected attributes, and the metric is
    evaluated on the part of the data after row 32561 (unshuffled split).
    """
    def custom_preprocessing(df):
        # slight workaround for non-binary protected attribute
        # feature should be categorical but protected attribute should be numerical
        mapping = {
            'Black': 0,
            'White': 1,
            'Asian-Pac-Islander': 2,
            'Amer-Indian-Eskimo': 3,
            'Other': 4
        }
        df['race-num'] = df.race.map(mapping)
        return df.fillna('Unknown')

    nonbinary_ad = AdultDataset(
        protected_attribute_names=['sex', 'native-country', 'race-num'],
        privileged_classes=[['Male'], ['United-States'], [1]],
        categorical_features=[
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race'
        ],
        custom_preprocessing=custom_preprocessing)
    # drop redundant race feature (not relevant to this test)
    index = nonbinary_ad.feature_names.index('race-num')
    nonbinary_ad.features = np.delete(nonbinary_ad.features, index, axis=1)
    nonbinary_ad.feature_names = np.delete(nonbinary_ad.feature_names, index)

    _, nonbinary_test = nonbinary_ad.split([32561], shuffle=False)
    dataset_metric = BinaryLabelDatasetMetric(nonbinary_test)
    eps_data = dataset_metric.smoothed_empirical_differential_fairness()
    # Compare with a tolerance: exact equality on a computed float breaks on
    # harmless differences in summation order between library versions.
    assert np.isclose(eps_data, 2.063813731996515)  # verified with reference implementation
Пример #4
0
def load_dataset(name):
    """Construct one of the supported datasets by name.

    Args:
        name: One of 'Adult', 'German' or 'Compas' (case-sensitive).

    Returns:
        A ``(dataset, name)`` tuple: the freshly constructed dataset object
        and the name that was passed in.

    Raises:
        ValueError: If ``name`` is not one of the supported names.
    """
    if name == 'Adult':
        ds = AdultDataset()
    elif name == 'German':
        ds = GermanDataset()
    elif name == 'Compas':
        ds = CompasDataset()
    else:
        # Without this branch an unknown name reached `return ds` and failed
        # with an UnboundLocalError that hid the real problem.
        raise ValueError(f"{name} is not an available dataset.")
    return ds, name
Пример #5
0
def Adult_dataset(name_prot='sex'):
    """Write a reduced Adult dataset to ``dataset/Adult.csv``.

    The dataset keeps 'age' and 'education-num' with 'sex' as the protected
    attribute; the 'income-per-year' column is renamed to 'labels' and the
    index is reset before the CSV is written.

    Args:
        name_prot: Accepted for signature compatibility; not used here.
    """
    adult = AdultDataset(protected_attribute_names=['sex'],
                         privileged_classes=[['Male']],
                         features_to_keep=['age', 'education-num'])
    frame, _attributes = adult.convert_to_dataframe()
    frame = frame.rename(columns={'income-per-year': 'labels'})
    frame = frame.reset_index(drop=True)
    frame.to_csv("dataset/Adult.csv")
Пример #6
0
def test_adult():
    """Check that the tau=0.9 MetaFairClassifier does not score a lower FDR
    statistic (third value returned by ``getStats``) than the tau=0 model.
    """
    protected = 'sex'
    numeric_columns = [
        'age', 'education-num', 'capital-gain', 'capital-loss',
        'hours-per-week'
    ]
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=numeric_columns)

    train, test = ad.split([32562])
    assert np.any(test.labels)

    def stats_for(predicted):
        """Run getStats on the predictions in `predicted` against `test`."""
        favorable = train.favorable_label
        # Both label vectors are recoded to +1 (favorable) / -1 (otherwise).
        signed_pred = [
            1 if y == favorable else -1 for y in list(predicted.labels)
        ]
        signed_true = np.array(
            [1 if y == [favorable] else -1 for y in test.labels])
        feature_frame = pd.DataFrame(data=test.features,
                                     columns=test.feature_names)
        return getStats(signed_true, signed_pred, feature_frame[protected])

    unconstrained = MetaFairClassifier(tau=0, sensitive_attr=protected)
    unconstrained.fit(train)
    _acc, _sr, unconstrainedFDR = stats_for(unconstrained.predict(test))

    constrained = MetaFairClassifier(tau=0.9, sensitive_attr=protected)
    constrained.fit(train)
    _acc, _sr, fdr = stats_for(constrained.predict(test))

    assert fdr >= unconstrainedFDR


#test_adult()
Пример #7
0
def test_instance_weights():
    """Check that Reweighing leaves the total instance weight unchanged."""
    dataset = AdultDataset(instance_weights_name='fnlwgt', features_to_drop=[])
    reweigher = Reweighing(unprivileged_groups=[{'sex': 0}],
                           privileged_groups=[{'sex': 1}])
    reweighted = reweigher.fit_transform(dataset)
    total_after = reweighted.instance_weights.sum()
    print(total_after)
    assert np.isclose(dataset.instance_weights.sum(), total_after)
def test_repair0():
    """Check that a repair level of 0 returns a dataset equal to the input."""
    original = AdultDataset(protected_attribute_names=['sex'],
                            privileged_classes=[['Male']],
                            categorical_features=[],
                            features_to_keep=['age', 'education-num'])

    remover = DisparateImpactRemover(repair_level=0.)
    repaired = remover.fit_transform(original)

    assert repaired == original
Пример #9
0
def Adult_dataset(name_prot='sex'):
    """Load a reduced Adult dataset and split it into modelling pieces.

    Args:
        name_prot: Column of the converted dataframe to return as the
            sensitive attribute (and to exclude from the attribute frame).

    Returns:
        A 6-tuple ``(data, attributes, sensitive, output, privileged_groups,
        unprivileged_groups)``: the full dataframe with the income column
        renamed to 'labels', the dataframe without 'labels' and
        ``name_prot``, the ``name_prot`` column, the dataset's label array,
        and the group definitions for 'sex'.
    """
    adult = AdultDataset(protected_attribute_names=['sex'],
                         privileged_classes=[['Male']],
                         features_to_keep=['age', 'education-num'])

    frame, _attributes = adult.convert_to_dataframe()
    frame.rename(columns={'income-per-year': 'labels'}, inplace=True)
    frame.reset_index(inplace=True, drop=True)

    sensitive = frame[name_prot]
    # Two separate drops, each on a new frame, so `frame` itself is untouched.
    predictors = frame.drop('labels', axis=1).drop(name_prot, axis=1)

    return (frame, predictors, sensitive, adult.labels,
            [{'sex': 1}], [{'sex': 0}])
def getAdultDataset():
    """Build a ``Domain`` and a sample matrix from the preprocessed Adult data.

    Returns:
        A ``(simpleDomain, simpleSamples)`` tuple. ``simpleDomain`` is a
        ``Domain`` over four feature names plus the dataset's first label
        name; ``simpleSamples`` is the feature matrix with the label values
        appended as a final column.
    """
    # The original also built an unused `AdultDataset()` here, which loaded
    # the full raw dataset for nothing; only the preprocessed one is needed.
    dataset_orig = load_preproc_data_adult(['sex'])

    # NOTE(review): these names are hard-coded; presumably they match the
    # column order of the preprocessed features -- confirm against the loader.
    features = ['race', 'sex', 'age decade', 'education years']
    domainArray = getAdultDomain()

    features.append(dataset_orig.label_names[0])
    simpleDomain = Domain(features, domainArray)

    # Flatten the per-row label entries into a plain list of values.
    labels = [y[0] for y in dataset_orig.labels]

    simpleSamples = np.c_[dataset_orig.features, labels]
    return simpleDomain, simpleSamples
Пример #11
0
def get_data(dataset_used, protected_attribute_used):
    """Load a dataset and split it 60/20/20 with group definitions.

    Args:
        dataset_used: "adult", "german", "compas" or "bank".
        protected_attribute_used: 1 selects the dataset's first protected
            attribute; any other value selects the second.

    Returns:
        ``(train, valid, test, privileged_groups, unprivileged_groups)``.

    Raises:
        ValueError: If ``dataset_used`` is not one of the names above.
    """
    if dataset_used == "adult":
        dataset_orig = AdultDataset()
        attribute = 'sex' if protected_attribute_used == 1 else 'race'
    elif dataset_used == "german":
        dataset_orig = load_preproc_data_german()
        # NOTE(review): shifts labels down by one without touching the
        # dataset's favorable/unfavorable label metadata -- confirm intended.
        dataset_orig.labels -= 1
        attribute = 'sex' if protected_attribute_used == 1 else 'age'
    elif dataset_used == "compas":
        dataset_orig = CompasDataset()
        attribute = 'sex' if protected_attribute_used == 1 else 'race'
    elif dataset_used == "bank":
        dataset_orig = BankDataset()
        attribute = 'age' if protected_attribute_used == 1 else 'race'
    else:
        raise ValueError(f"{dataset_used} is not an available dataset.")

    # Every branch uses value 1 for the privileged group and 0 otherwise.
    privileged_groups = [{attribute: 1}]
    unprivileged_groups = [{attribute: 0}]

    # 60% train, then the remainder halved into validation and test.
    train_part, remainder = dataset_orig.split([0.6], shuffle=True, seed=101)
    valid_part, test_part = remainder.split([0.5], shuffle=True, seed=101)

    return train_part, valid_part, test_part, privileged_groups, unprivileged_groups
Пример #12
0
from sklearn.linear_model import LogisticRegression

from aif360.datasets import AdultDataset
from aif360.algorithms.postprocessing import EqOddsPostprocessing
from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing
from aif360.metrics import ClassificationMetric

# Shared fixtures for the post-processing tests below: a three-way split of
# the default Adult dataset (split points 0.4 and 0.7) and a baseline
# logistic regression fitted on the first part.
train, val, test = AdultDataset().split([0.4, 0.7])
# scikit-learn expects a 1-D target; the dataset stores labels as a column
# (see the reshape((-1, 1)) below), so flatten it to avoid the
# DataConversionWarning raised for column-vector targets.
lr = LogisticRegression(solver='lbfgs').fit(train.features,
                                            train.labels.ravel())

# Copy of the validation split carrying the model's predictions and scores.
val_pred = val.copy()
val_pred.labels = lr.predict(val.features).reshape((-1, 1))
val_pred.scores = lr.predict_proba(val.features)[:, 1]

# Same for the test split.
pred = test.copy()
pred.labels = lr.predict(test.features).reshape((-1, 1))
pred.scores = lr.predict_proba(test.features)[:, 1]

# Baseline metric comparing true test labels with the unprocessed predictions.
cm_lr = ClassificationMetric(test,
                             pred,
                             unprivileged_groups=[{
                                 'sex': 0
                             }],
                             privileged_groups=[{
                                 'sex': 1
                             }])


def test_eqodds():
    eqo = EqOddsPostprocessing(unprivileged_groups=[{
        'sex': 0
Пример #13
0
def test_adult_test_set():
    """Check that the part after row 30162 contains a non-zero label."""
    split_index = 30162
    _train, held_out = AdultDataset().split([split_index])
    assert np.any(held_out.labels)
Пример #14
0
def get_data(dataset, protected_attribute, seed=101):
    """Load a dataset, pick group definitions and split it 60/20/20.

    Args:
        dataset: 'adult', 'german', 'compas' or 'bank'.
        protected_attribute: Name of the protected attribute; the accepted
            names depend on the dataset ('sex_or_race' is a derived column
            available for 'adult' and 'compas').
        seed: Seed passed to both shuffled splits.

    Returns:
        ``(train, valid, test, privileged_groups, unprivileged_groups)``.

    Raises:
        ValueError: If the dataset is unknown or the protected attribute is
            not available for it.
    """
    def reject_attribute():
        raise ValueError(
            f'protected attribute {protected_attribute} is not available for dataset {dataset}'
        )

    def groups_for(attribute, privileged_value=1):
        """Return (privileged_groups, unprivileged_groups) for a 0/1 attribute."""
        return ([{attribute: privileged_value}],
                [{attribute: 1 - privileged_value}])

    def add_sex_or_race(ds, col_a, col_b):
        """Append the logical OR of two feature columns as 'sex_or_race'.

        The new column is registered both as a feature and as a protected
        attribute. NOTE(review): col_a/col_b are presumably the positions of
        the sex and race columns -- confirm against feature_names.
        """
        ds.feature_names += ['sex_or_race']
        combined = np.logical_or(*ds.features[:, [col_a, col_b]].T)
        ds.features = np.hstack(
            [ds.features,
             np.expand_dims(combined.astype(np.float64), -1)])
        ds.protected_attributes = np.hstack(
            [ds.protected_attributes, ds.features[:, [-1]]])
        ds.protected_attribute_names += ['sex_or_race']
        ds.privileged_protected_attributes += [np.array([1.])]
        ds.unprivileged_protected_attributes += [np.array([0.])]

    if dataset == 'adult':
        from aif360.datasets import AdultDataset
        dataset_orig = AdultDataset()
        if protected_attribute == 'sex':
            privileged_groups, unprivileged_groups = groups_for('sex')
        elif protected_attribute == 'sex_or_race':
            add_sex_or_race(dataset_orig, 2, 3)
            privileged_groups, unprivileged_groups = groups_for('sex_or_race')
        elif protected_attribute == 'race':
            privileged_groups, unprivileged_groups = groups_for('race')
        else:
            reject_attribute()

    elif dataset == 'german':
        from aif360.datasets import GermanDataset
        dataset_orig = GermanDataset()
        if protected_attribute == 'sex':
            privileged_groups, unprivileged_groups = groups_for('sex')
        elif protected_attribute == 'age':
            privileged_groups, unprivileged_groups = groups_for('age')
        else:
            reject_attribute()

    elif dataset == 'compas':
        from aif360.datasets import CompasDataset
        dataset_orig = CompasDataset()
        if protected_attribute == 'sex':
            # Only branch where value 0 marks the privileged group.
            privileged_groups, unprivileged_groups = groups_for(
                'sex', privileged_value=0)
        elif protected_attribute == 'sex_or_race':
            add_sex_or_race(dataset_orig, 0, 2)
            privileged_groups, unprivileged_groups = groups_for('sex_or_race')
        elif protected_attribute == 'race':
            privileged_groups, unprivileged_groups = groups_for('race')
        else:
            reject_attribute()

    elif dataset == 'bank':
        from aif360.datasets import BankDataset
        dataset_orig = BankDataset()
        if protected_attribute == 'age':
            privileged_groups, unprivileged_groups = groups_for('age')
        else:
            reject_attribute()

    else:
        raise ValueError(f'{dataset} is not an available dataset.')

    # 60% train, then the remainder halved into validation and test.
    train_part, remainder = dataset_orig.split([0.6], shuffle=True, seed=seed)
    valid_part, test_part = remainder.split([0.5], shuffle=True, seed=seed)

    return train_part, valid_part, test_part, privileged_groups, unprivileged_groups
Пример #15
0
def test_adult_test_set():
    """Check that the first 15060-row part contains a non-zero label."""
    parts = AdultDataset().split([15060])
    test, train = parts
    has_nonzero_label = np.any(test.labels)
    assert has_nonzero_label
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from variable_cep import CalibratedEqOddsPostprocessing #modified for varying weight
from variable_cep import normed_rates
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve

## import dataset
dataset_used = "compas" # "adult", "german", "compas"
protected_attribute_used = 2 # 1, 2

# code to identify the protected attributes from all of the dataset features
if dataset_used == "adult":
    dataset_orig = AdultDataset()
#     dataset_orig = load_preproc_data_adult()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]
    
elif dataset_used == "german":
    dataset_orig = GermanDataset()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'age': 1}]
Пример #17
0
def LoadData(dataset_name,protected_attribute_name,raw=True):
	"""Load a dataset with its group definitions and optimizer options.

	Args:
		dataset_name: "adult", "german" or "compas".
		protected_attribute_name: "sex" or "race" for adult and compas,
			"sex" or "age" for german.
		raw: If True use the raw dataset class; otherwise use the matching
			``load_preproc_data_*`` loader for the chosen attribute.

	Returns:
		``(dataset, privileged_groups, unprivileged_groups, optim_options)``.

	Raises:
		SystemExit: After printing a message, when the dataset/attribute
			combination is not supported.
	"""
	import sys

	# Group definitions returned to the caller, keyed by attribute name.
	# NOTE(review): the original also computed per-branch locals (e.g.
	# privileged {'sex': 0} for compas) that were never returned; the
	# returned values are kept as they were -- confirm which is intended.
	protected_attribute_set={
		'sex':[[{'sex': 1}],[{'sex': 0}]],
		'age':[[{'age': 1}],[{'age': 0}]],
		'race':[[{'race': 1}],[{'race': 0}]]
	}
	supported_attributes={
		"adult":("sex","race"),
		"german":("sex","age"),
		"compas":("sex","race")
	}

	# Validate first: previously an unsupported attribute with raw=False hit
	# an UnboundLocalError in the german branch before this message printed.
	if protected_attribute_name not in supported_attributes.get(dataset_name,()):
		print('No such dataset & group option:', dataset_name, protected_attribute_name)
		sys.exit()

	if dataset_name == "adult":
		if raw:
			dataset_original = AdultDataset()
		else:
			dataset_original = load_preproc_data_adult([protected_attribute_name])
		distortion_fun = get_distortion_adult
	elif dataset_name == "german":
		if raw:
			dataset_original = GermanDataset()
		else:
			dataset_original = load_preproc_data_german([protected_attribute_name])
		distortion_fun = get_distortion_german
		# Recode labels as 2 - label and set the unfavorable label to 0
		# (presumably the source labels are coded 1/2 -- confirm).
		dataset_original.labels = 2 - dataset_original.labels
		dataset_original.unfavorable_label = 0.
	else:
		if raw:
			dataset_original = CompasDataset()
		else:
			dataset_original = load_preproc_data_compas([protected_attribute_name])
		distortion_fun = get_distortion_compas

	# Same settings for every dataset; only the distortion function differs.
	optim_options = {
		"distortion_fun": distortion_fun,
		"epsilon": 0.05,
		"clist": [0.99, 1.99, 2.99],
		"dlist": [.1, 0.05, 0]
	}

	privileged_groups,unprivileged_groups = protected_attribute_set[protected_attribute_name]
	return dataset_original,privileged_groups,unprivileged_groups,optim_options
Пример #18
0
import numpy as np

from aif360.datasets import AdultDataset
from aif360.metrics import ClassificationMetric
from aif360.algorithms.inprocessing import MetaFairClassifier

# Module-level fixtures shared by the tests below.
# Name of the protected attribute used for the dataset and the classifiers.
protected = 'sex'
# Adult dataset restricted to five listed columns, with an empty
# categorical-feature list and 'Male' as the privileged class.
ad = AdultDataset(protected_attribute_names=[protected],
                  privileged_classes=[['Male']],
                  categorical_features=[],
                  features_to_keep=[
                      'age', 'education-num', 'capital-gain', 'capital-loss',
                      'hours-per-week'
                  ])
# Unshuffled split at index 16281; note the first returned part is bound to
# `test` and the second to `train`.
test, train = ad.split([16281], shuffle=False)


def test_adult_sr():
    biased_model = MetaFairClassifier(tau=0,
                                      sensitive_attr=protected,
                                      type='sr',
                                      seed=123).fit(train)
    dataset_bias_test = biased_model.predict(test)

    biased_cm = ClassificationMetric(test,
                                     dataset_bias_test,
                                     unprivileged_groups=[{
                                         protected: 0
                                     }],
                                     privileged_groups=[{
                                         protected: 1
def test_adult():
    """Check that DisparateImpactRemover improves disparate impact.

    A logistic regression is fitted once on the scaled features and once on
    the features repaired with ``repair_level=1.0`` (the protected column is
    removed from the model inputs in both cases). The test asserts that the
    disparate impact after repair is larger than before and within 0.2 of 1.
    """
    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=[
                          'age', 'education-num', 'capital-gain',
                          'capital-loss', 'hours-per-week'
                      ])

    # copy=False is passed to the scaler (in-place scaling requested).
    scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)

    train, test = ad.split([32561])
    assert np.any(test.labels)

    # Fit the scaler on the training features only, then apply it to test.
    train.features = scaler.fit_transform(train.features)
    test.features = scaler.transform(test.features)

    # Position of the protected attribute; used as the column index to drop
    # from the model inputs below.
    index = train.feature_names.index(protected)
    X_tr = np.delete(train.features, index, axis=1)
    X_te = np.delete(test.features, index, axis=1)
    y_tr = train.labels.ravel()

    di = DisparateImpactRemover(repair_level=1.0)
    train_repd = di.fit_transform(train)
    # train_repd2 = di.fit_transform(train)
    # assert train_repd == train_repd2
    # The test split is repaired with its own fit_transform call.
    test_repd = di.fit_transform(test)

    # Repair must leave the protected attribute values untouched.
    assert np.all(
        train_repd.protected_attributes == train.protected_attributes)

    lmod = LogisticRegression(class_weight='balanced')
    # lmod = SVM(class_weight='balanced')
    lmod.fit(X_tr, y_tr)

    # Baseline predictions on the unrepaired test features.
    test_pred = test.copy()
    test_pred.labels = lmod.predict(X_te)

    X_tr_repd = np.delete(train_repd.features, index, axis=1)
    X_te_repd = np.delete(test_repd.features, index, axis=1)
    y_tr_repd = train_repd.labels.ravel()
    # Repair must not change the training labels.
    assert (y_tr == y_tr_repd).all()

    # The same estimator object is refitted on the repaired features; the
    # baseline predictions were already taken above.
    lmod.fit(X_tr_repd, y_tr_repd)
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te_repd)

    # Privileged / unprivileged group definitions for the metrics.
    p = [{protected: 1}]
    u = [{protected: 0}]

    cm = ClassificationMetric(test,
                              test_pred,
                              privileged_groups=p,
                              unprivileged_groups=u)
    before = cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(before))
    # print('Acc overall: {:.4}'.format(cm.accuracy()))

    repaired_cm = ClassificationMetric(test_repd,
                                       test_repd_pred,
                                       privileged_groups=p,
                                       unprivileged_groups=u)
    after = repaired_cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(after))
    # print('Acc overall: {:.4}'.format(repaired_cm.accuracy()))

    assert after > before
    assert abs(1 - after) <= 0.2
Пример #20
0
def test_adult():
    """Cross-check MetaFairClassifier FDR statistics against ClassificationMetric.

    For a tau=0 and a tau=0.9 classifier the test checks that the FDR value
    from ``getStats`` agrees with ``min(r, 1 / r)`` of
    ``ClassificationMetric.false_discovery_rate_ratio()``, and that the
    tau=0.9 value is not lower than the tau=0 value.
    """
    np.random.seed(1)

    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=[
                          'age', 'education-num', 'capital-gain',
                          'capital-loss', 'hours-per-week'
                      ])

    train, test = ad.split([32561])

    def metric_fdr_ratio(predicted):
        """Return the FDR ratio from ClassificationMetric, folded into (0, 1]."""
        cm = ClassificationMetric(test,
                                  predicted,
                                  unprivileged_groups=[{protected: 0}],
                                  privileged_groups=[{protected: 1}])
        ratio = cm.false_discovery_rate_ratio()
        return min(ratio, 1 / ratio)

    def stats_fdr(predicted):
        """Return the third value of getStats for `predicted` against `test`."""
        favorable = train.favorable_label
        # Labels are recoded to +1 (favorable) / -1 (otherwise).
        predictions = [
            1 if y == favorable else -1 for y in predicted.labels.ravel()
        ]
        y_test = np.array(
            [1 if y == favorable else -1 for y in test.labels.ravel()])
        x_control_test = pd.DataFrame(data=test.features,
                                      columns=test.feature_names)[protected]
        _acc, _sr, value = getStats(y_test, predictions, x_control_test)
        return value

    biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected)
    biased_model.fit(train)
    dataset_bias_test = biased_model.predict(test)

    unconstrainedFDR2 = metric_fdr_ratio(dataset_bias_test)
    unconstrainedFDR = stats_fdr(dataset_bias_test)
    assert np.isclose(unconstrainedFDR, unconstrainedFDR2)

    debiased_model = MetaFairClassifier(tau=0.9, sensitive_attr=protected)
    debiased_model.fit(train)
    dataset_debiasing_test = debiased_model.predict(test)

    # The original built a throwaway `list(...labels)` here that was
    # overwritten on the next statement; it has been dropped.
    fdr = stats_fdr(dataset_debiasing_test)
    fdr2 = metric_fdr_ratio(dataset_debiasing_test)
    assert np.isclose(fdr, fdr2)
    assert fdr2 >= unconstrainedFDR2
Пример #21
0
def load_preproc_data_adult(protected_attributes=None):
    """Build an AdultDataset with binned age and bucketed education years.

    Args:
        protected_attributes: List of protected attribute names drawn from
            'sex' and 'race'. When None, both are used.

    Returns:
        The AdultDataset constructed with the derived columns, using
        'Income Binary' as the label.
    """
    def custom_preprocessing(df):
        """Derive the binned columns and recode sex and race.

        Adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/Adult/code/Generate_Adult_Data.ipynb
        """
        np.random.seed(1)

        def group_edu(years):
            # Collapse the tails of the education range into three buckets.
            if years == -1:
                return 'missing_edu'
            if years <= 5:
                return '<6'
            if years >= 13:
                return '>12'
            return years

        def age_cut(decade):
            # Everything from 70 upwards shares one bucket.
            return '>=70' if decade >= 70 else decade

        def group_race(race):
            return 1.0 if race == "White" else 0.0

        # Age floored to its decade, then capped at the '>=70' bucket.
        decades = df['age'].apply(lambda x: x // 10 * 10)
        df['Age (decade)'] = decades.apply(age_cut)

        # Bucketed education years, stored as a categorical column.
        education = df['education-num'].apply(group_edu)
        df['Education Years'] = education.astype('category')

        # The label column is a copy of the income column under a new name.
        df['Income Binary'] = df['income-per-year']

        # Numeric recoding of the two protected attributes.
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(group_race)

        return df

    XD_features = ['Age (decade)', 'Education Years', 'sex', 'race']
    if protected_attributes is None:
        D_features = ['sex', 'race']
    else:
        D_features = protected_attributes
    Y_features = ['Income Binary']
    # Non-protected features are whatever remains after removing D_features.
    X_features = list(set(XD_features) - set(D_features))

    # Privileged value and display names for each supported attribute.
    all_privileged_classes = {"sex": [1.0], "race": [1.0]}
    all_protected_attribute_maps = {
        "sex": {1.0: 'Male', 0.0: 'Female'},
        "race": {1.0: 'White', 0.0: 'Non-white'},
    }

    privileged = [all_privileged_classes[name] for name in D_features]
    attribute_maps = [all_protected_attribute_maps[name] for name in D_features]
    metadata = {
        'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
        'protected_attribute_maps': attribute_maps,
    }

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=privileged,
        instance_weights_name=None,
        categorical_features=['Age (decade)', 'Education Years'],
        features_to_keep=X_features + Y_features + D_features,
        na_values=['?'],
        metadata=metadata,
        custom_preprocessing=custom_preprocessing)
Пример #22
0
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from aif360.datasets import AdultDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

# Module-level fixtures shared by the tests below.
# Adult dataset with three protected attributes; missing values are replaced
# by the string 'Unknown' through the custom preprocessing hook.
ad = AdultDataset(protected_attribute_names=['race', 'sex', 'native-country'],
                  privileged_classes=[['White'], ['Male'], ['United-States']],
                  categorical_features=[
                      'workclass', 'education', 'marital-status', 'occupation',
                      'relationship'
                  ],
                  custom_preprocessing=lambda df: df.fillna('Unknown'))
# Unshuffled split at index 32561 into a training and a test part.
adult_train, adult_test = ad.split([32561], shuffle=False)

# Standardize using statistics fitted on the training features only.
scaler = StandardScaler()
X = scaler.fit_transform(adult_train.features)
test_X = scaler.transform(adult_test.features)
clf = LogisticRegression(C=1.0, random_state=0, solver='liblinear')

# Copy of the test split whose labels are replaced by the classifier's
# predictions (the classifier is fitted in the same expression).
adult_pred = adult_test.copy()
adult_pred.labels = clf.fit(X, adult_train.labels.ravel()).predict(test_X)

# One metric over the true test labels, one over the predicted labels.
dataset_metric = BinaryLabelDatasetMetric(adult_test)
classifier_metric = BinaryLabelDatasetMetric(adult_pred)


def test_epsilon_dataset_binary_groups():
    """Check smoothed empirical differential fairness of the test labels."""
    eps_data = dataset_metric.smoothed_empirical_differential_fairness()
    # Compare with a tolerance: exact equality on a computed float breaks on
    # harmless differences in summation order between library versions.
    assert np.isclose(eps_data, 1.53679014653623)  # verified with reference implementation
Пример #23
0
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import tensorflow as tf

from aif360.datasets import AdultDataset
from aif360.sklearn.datasets import fetch_adult
from aif360.algorithms.inprocessing import AdversarialDebiasing as OldAdversarialDebiasing
from aif360.sklearn.inprocessing import AdversarialDebiasing

# Module-level fixtures shared by the tests below.
# Adult data from the aif360.sklearn loader, requested with numeric_only=True.
X, y, sample_weight = fetch_adult(numeric_only=True)
# Adult data from the older dataset class: 'fnlwgt' supplies the instance
# weights, five listed columns are kept, and the categorical-feature and
# features-to-drop lists are both empty.
adult = AdultDataset(instance_weights_name='fnlwgt',
                     categorical_features=[],
                     features_to_keep=[
                         'age', 'education-num', 'capital-gain',
                         'capital-loss', 'hours-per-week'
                     ],
                     features_to_drop=[])


def test_adv_debias_old_reproduce():
    """Test that the old AdversarialDebiasing is reproducible."""
    sess = tf.Session()
    old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{
        'sex': 0
    }],
                                          privileged_groups=[{
                                              'sex': 1
                                          }],
                                          scope_name='old_classifier',
                                          sess=sess,
Пример #24
0
def load_preproc_data_adult(protected_attributes=None):
    """Build an AdultDataset whose 'Education Years' has injected missing values.

    Args:
        protected_attributes: List of protected attribute names; only 'sex'
            is supported. When None, ``['sex']`` is used.

    Returns:
        The AdultDataset constructed with the derived columns, using
        'Income Binary' as the label.
    """
    def custom_preprocessing(df):
        """Derive the binned columns, then randomly blank out education values.

        The binning is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/Adult/code/Generate_Adult_Data.ipynb

        After recoding, each row's 'Education Years' is replaced by -1 with a
        probability that depends on the row's sex and income: 0.8 for rows
        with sex 0 and income '<=50K', 0.08 for rows with sex 1, and 0.04
        otherwise. The random draws are seeded with ``np.random.seed(1)``.
        """
        np.random.seed(1)
        # Group age by decade
        df['Age (decade)'] = df['age'].apply(lambda x: x // 10 * 10)

        def group_edu(x):
            if x == -1:
                return 'missing_edu'
            elif x <= 5:
                return '<6'
            elif x >= 13:
                return '>12'
            else:
                return x

        def age_cut(x):
            if x >= 70:
                return '>=70'
            else:
                return x

        def group_race(x):
            if x == "White":
                return 1.0
            else:
                return 0.0

        def missing_probability(key):
            # `key` starts with the string form of the sex value and contains
            # the income string (see how tmp_feature is built below).
            if '<=50K' in key and key[0] == '0':
                return 0.8
            elif key[0] == '1':
                return 0.08
            else:
                return 0.04

        # Cluster education and age attributes.
        # Limit education range
        df['Education Years'] = df['education-num'].apply(group_edu)
        df['Education Years'] = df['Education Years'].astype('category')

        # Limit age range
        df['Age (decade)'] = df['Age (decade)'].apply(age_cut)

        # Rename income variable
        df['Income Binary'] = df['income-per-year']

        # Recode sex and race
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(group_race)

        # Work on an explicit copy: adding columns to a slice of `df`
        # triggers pandas' chained-assignment (SettingWithCopy) warning.
        df1 = df[['sex', 'Education Years', 'Age (decade)', 'Income Binary']].copy()
        key_columns = list(df1.columns)
        # Per-row key: the string forms of the four columns concatenated.
        df1['tmp_feature'] = [
            ''.join(str(row[j]) for j in key_columns)
            for _, row in df1.iterrows()
        ]
        # Assign the probabilities in one pass; this also avoids writing
        # floats into an integer-initialised column through .loc.
        df1['mis_prob'] = df1['tmp_feature'].map(missing_probability)

        # One Bernoulli draw per row, in row order, so the seeded random
        # sequence is consumed exactly as before.
        new_label = [
            -1 if np.random.binomial(1, prob, 1)[0] == 1 else edu
            for prob, edu in zip(df1['mis_prob'], df1['Education Years'])
        ]
        df['Education Years'] = new_label
        print('Total number of missing values')
        print(len(df.loc[df['Education Years'] == -1, :].index))
        print('Total number of observations')
        print(len(df.index))
        return df

    XD_features = ['Age (decade)', 'Education Years', 'sex']
    D_features = ['sex'
                  ] if protected_attributes is None else protected_attributes
    Y_features = ['Income Binary']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['Age (decade)', 'Education Years']

    # privileged classes
    all_privileged_classes = {"sex": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {"sex": {1.0: 'Male', 0.0: 'Female'}}

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=['?'],
        metadata={
            'label_maps': [{
                1.0: '>50K',
                0.0: '<=50K'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)