예제 #1
0
def adult_sex_race():
    """Load the Adult dataset with sex and race as the sensitive attributes.

    The selected feature columns are one-hot encoded with
    ``pd.get_dummies`` and then standardized with ``StandardScaler``.
    The target column and the two sensitive columns are each
    label-encoded with their own ``LabelEncoder``.

    Returns:
        X: DataFrame of the encoded, standardized features.
        Y: NumPy array holding the label-encoded target.
        A: Result of ``unique_label`` applied to the label-encoded ``sex``
           and ``race`` columns (a NumPy array per the original
           documentation).
    """
    print('Fetching Adult data with sensitive_attribute=sex, race ...')

    from responsibly.dataset import AdultDataset

    dataset = AdultDataset()
    frame = dataset.df

    feature_names = [
        'age',
        'workclass',
        'education',
        'education-num',
        'marital_status',
        'occupation',
        'relationship',
        'capital_gain',
        'capital_loss',
        'hours_per_week',
        'native_country',
    ]

    # Features: one-hot encode the categoricals, then standardize everything.
    dummies = pd.get_dummies(frame[feature_names])
    scaled = preprocessing.StandardScaler().fit_transform(dummies)
    X = pd.DataFrame(scaled, columns=dummies.columns)

    # Target labels.
    Y = preprocessing.LabelEncoder().fit_transform(frame[dataset.target])

    # Sensitive attributes: encode each column, then combine them.
    sex_codes = preprocessing.LabelEncoder().fit_transform(frame['sex'])
    race_codes = preprocessing.LabelEncoder().fit_transform(frame['race'])
    A = unique_label(sex_codes, race_codes)

    return X, Y, A
예제 #2
0
    def _load_data(self):
        """Load the Adult dataset as float feature, sensitive and target arrays.

        Returns:
            tuple: ``(x, s, y)`` where

            * ``x`` is the feature matrix: the output of ``whiten`` on
              ``capital_gain``, ``capital_loss`` and ``hours_per_week``,
              followed by one indicator column per distinct value of
              ``workclass``, ``education``, ``marital_status`` and
              ``native_country`` (in that column order, categories in
              ``Series.unique()`` order);
            * ``s`` is an ``(n, 1)`` array that is 0 where ``race`` is
              ``'White'`` and 1 otherwise;
            * ``y`` is an ``(n, 1)`` array that is 1 where the target column
              equals ``'>50K'`` and 0 otherwise.

            All three arrays are cast to ``float``.
        """
        data = AdultDataset()
        frame = data.df

        # Use race as the sensitive attribute: White -> 0, everything else -> 1.
        s = (frame['race'] != 'White').values.astype(float).reshape(-1, 1)

        # Use capital gain/capital loss and hours per week as numeric features.
        numeric = whiten(
            data=frame[['capital_gain', 'capital_loss',
                        'hours_per_week']].values.astype(float))

        # Work class, education, marital status and native country as one-hot
        # indicators. Columns are gathered in a list and stacked once, so the
        # growing matrix is not re-copied for every category.
        blocks = [numeric]
        for column in ("workclass", "education", "marital_status",
                       "native_country"):
            values = frame[column]
            for category in values.unique():
                indicator = (values == category).values.astype(float)
                blocks.append(indicator.reshape(-1, 1))
        x = np.hstack(blocks)

        # Use actual income as target variable: >50K = 1, <=50K = 0.
        y = (frame[data.target] == '>50K').values.astype(float).reshape(-1, 1)

        return x.astype(float), s.astype(float), y.astype(float)
예제 #3
0
def adult_ds():
    """Create and return a new ``AdultDataset`` instance."""
    dataset = AdultDataset()
    return dataset
예제 #4
0
파일: data.py 프로젝트: kwekuka/ot
def adultDataset():
    """Return the Adult dataset as a cleaned DataFrame with coarse categories.

    Processing steps, in order:

    1. ``occupation`` is passed through the ``one_hot`` helper.
    2. ``education`` levels are merged into coarser groups.
    3. Rows whose ``race`` is ``Amer-Indian-Eskimo``, ``Asian-Pac-Islander``
       or ``Other`` are dropped.
    4. ``native_country`` becomes 1 for United-States and 0 otherwise.
    5. ``sex`` (Female=1, Male=0) and ``race`` (Black=1, White=0) are
       replaced by integers.
    6. ``education``, simplified ``workclass``, binarized ``marital_status``
       and ``relationship`` are passed through ``one_hot``.
    7. ``income_per_year`` is replaced by 0 (``<=50K``) / 1 (``>50K``).

    Returns:
        pandas.DataFrame: the transformed frame.
    """
    # Get the whole dataset, already nicely filtered for us by the library.
    adf = AdultDataset().df

    # Make the occupations one-hot. This runs before the race filter below,
    # exactly as in the original ordering.
    adf = one_hot(adf, "occupation", drop=True)

    # Clean up education: merge fine-grained levels into coarser groups.
    # NOTE(review): "Prof-school" is mapped to "HS-grad" - confirm intended.
    adf = adf.replace({
        'education': {
            'Assoc-acdm': "HS-grad",
            'Some-college': "HS-grad",
            'Assoc-voc': "HS-grad",
            "Prof-school": "HS-grad",
            "11th": "No-HSD",
            "9th": "No-HSD",
            "10th": "No-HSD",
            "12th": "No-HSD",
            "5th-6th": "No-HSD",
            "7th-8th": "No-HSD",
            "1st-4th": "No-HSD",
            "Preschool": "No-HSD",
        }
    })

    # Drop smaller racial categories for now. Take an explicit copy so the
    # column assignments below write to an independent frame instead of a
    # filtered slice (avoids SettingWithCopyWarning).
    dropped_races = ["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Other"]
    adf = adf[~adf["race"].isin(dropped_races)].copy()

    # United States (1) vs. non-US (0). Strip surrounding whitespace before
    # comparing: the raw UCI file has a leading space (" United-States")
    # while every other comparison in this function uses stripped values.
    # Comparing against the padded literal only would mark every row as
    # non-US whenever the loader has already stripped the values.
    is_us = adf['native_country'].str.strip() == 'United-States'
    adf['native_country'] = is_us.astype(int)

    # Make race and gender binary (yuck).
    adf = adf.replace({
        'sex': {
            'Female': 1,
            'Male': 0,
        },
        'race': {
            'Black': 1,
            'White': 0,
        },
    })

    # Make education one-hot.
    adf = one_hot(adf, "education", drop=True)

    # Simplify work classes.
    adf['workclass'] = adf['workclass'].replace(
        ['State-gov', 'Local-gov', 'Federal-gov'], 'Gov')
    adf['workclass'] = adf['workclass'].replace(
        ['Self-emp-inc', 'Self-emp-not-inc'], 'Self_employed')
    adf = one_hot(adf, "workclass", drop=True)

    # Make marital status binary (Single / Couple) before encoding it.
    adf['marital_status'] = adf['marital_status'].replace([
        'Divorced', 'Married-spouse-absent', 'Never-married', 'Separated',
        'Widowed'
    ], 'Single')
    adf['marital_status'] = adf['marital_status'].replace(
        ['Married-AF-spouse', 'Married-civ-spouse'], 'Couple')
    adf = one_hot(adf, "marital_status", drop=True)

    # Make relationships one-hot.
    adf = one_hot(adf, "relationship", drop=True)

    # Make the income variable binary.
    adf = adf.replace({'income_per_year': {
        '<=50K': 0,
        '>50K': 1,
    }})

    return adf