Example #1
import pandas as pd
from aif360.datasets import BinaryLabelDataset, StandardDataset


def create_binary_dataset_sb():
    """Create a binary dataset from the company CSV, where a set salary
    threshold has already been applied to produce the 'new_signing_bonus'
    label used for later predictions.

    Out - An AIF360 binary dataset with one-hot encoded categorical columns
    """
    data = pd.read_csv('../company_x_sb.csv', index_col='employee_id')
    data_with_label = data.copy()
    # Encode the protected attribute: 1 for male, 0 otherwise
    data_with_label['sex'] = (data_with_label['sex'] == 'M').astype(int)

    std_data = StandardDataset(df=data_with_label,
                               label_name='new_signing_bonus',
                               favorable_classes=[1],
                               protected_attribute_names=['sex'],
                               privileged_classes=[[1]],
                               categorical_features=['degree_level', 'dept'],
                               features_to_drop=['boss_id'])

    # convert_to_dataframe() returns (dataframe, attribute dict)
    df_data, _ = std_data.convert_to_dataframe()
    binary_dataset = BinaryLabelDataset(favorable_label=1,
                                        unfavorable_label=0,
                                        df=df_data,
                                        label_names=['new_signing_bonus'],
                                        protected_attribute_names=['sex'])

    return binary_dataset
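A minimal usage sketch (hedged: the group definitions below simply mirror the encoding performed inside the function) could compute a baseline fairness metric on the returned dataset:

# Sketch only: 'sex' == 1 (male) is the privileged group per the encoding above.
from aif360.metrics import BinaryLabelDatasetMetric

binary_dataset = create_binary_dataset_sb()
metric = BinaryLabelDatasetMetric(binary_dataset,
                                  unprivileged_groups=[{'sex': 0}],
                                  privileged_groups=[{'sex': 1}])
print('Mean difference:', metric.mean_difference())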
Example #2
import pandas as pd
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing
from fairlearn.metrics import demographic_parity_difference

# The helpers used below (name_columns, data_preprocessing, one_hot_encoding,
# normalization, split_samples, random_forest_classifier, predict,
# gender_performance, demographic_parity, equalized_odds, split_samples_fair,
# logistic_regression, predict_fair) are assumed to be defined elsewhere in
# this project.


def main(argv):
    df_data = pd.read_csv(r"adults_dataset/adult_train.csv")
    df_data = name_columns(df_data)
    df_test = pd.read_csv(r"adults_dataset/adult_test.csv")
    df_test = name_columns(df_test)

    df_data = data_preprocessing(df_data)
    df_test = data_preprocessing(df_test)

    # fig_proportion_of_rich(df_test, argv[1], False)

    df_data_encoded = one_hot_encoding(df_data)
    df_test_encoded = one_hot_encoding(df_test)

    normalization(df_data_encoded)
    normalization(df_test_encoded)

    samples = split_samples(df_data_encoded, df_test_encoded)
    
    model = random_forest_classifier(samples)

    predictions = predict(model, samples, False)

    # proportion_of_rich(argv[2], samples, predictions, False)

    gender_performance(df_test_encoded, predictions)
    demographic_parity(df_test_encoded, predictions)
    equalized_odds(df_test_encoded, predictions)

    #Kamiran and Calders
    train_sds = StandardDataset(df_data_encoded, label_name="earnings", favorable_classes=[1], 
                                protected_attribute_names=["sex"], privileged_classes=[[1]])

    test_sds = StandardDataset(df_test_encoded, label_name="earnings", favorable_classes=[1],
                               protected_attribute_names=["sex"], privileged_classes=[[1]])

    privileged_groups = [{"sex": 1.0}]
    unprivileged_groups = [{"sex": 0.0}]

    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    RW.fit(train_sds)

    test_sds_pred = test_sds.copy(deepcopy=True)
    # Reweighing was fitted on the training data, so apply it to the
    # training data and train the "fair" model on the reweighed samples
    train_sds_transf = RW.transform(train_sds)

    samples_fair = split_samples_fair(train_sds, test_sds, test_sds_pred)

    model_fair = logistic_regression(train_sds_transf)

    predictions_fair, test_pred = predict_fair(model_fair, samples_fair, True)
    test_pred = test_pred.astype(int)

    dpd = demographic_parity_difference(
        df_test_encoded.earnings, test_pred, sensitive_features=df_test_encoded.sex)

    print(f"Model demographic parity difference:", dpd)
Example #3
import pandas as pd
from aif360.datasets import StandardDataset


def to_dataframe(y_true, y_pred, y_prot):
    # Move the torch tensors to the CPU and convert them to NumPy arrays
    y_true = y_true.float().cpu().numpy()
    y_pred = y_pred.float().cpu().numpy()
    y_prot = y_prot.float().cpu().numpy()
    df = pd.DataFrame({
        'y_true': y_true,
        'y_pred': y_pred,
        'y_prot': y_prot
    })
    # Positional arguments: label_name, favorable_classes,
    # protected_attribute_names, privileged_classes
    dataset = StandardDataset(df, 'y_true', [1.], ['y_prot'], [[1.]])
    dataset.scores = y_pred.reshape(-1, 1)
    return dataset
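A sketch of how this helper might be called; the tensor values are made up for illustration:

# Hypothetical call with small made-up tensors.
import torch

y_true = torch.tensor([1., 0., 1., 0.])
y_pred = torch.tensor([0.9, 0.2, 0.7, 0.4])
y_prot = torch.tensor([1., 1., 0., 0.])
dataset = to_dataframe(y_true, y_pred, y_prot)
print(dataset.scores.shape)  # (4, 1)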
Example #4
from aif360.datasets import StandardDataset


def generate_formatted_dataframe(df, label_name, favorable_classes,
                                 protected_attribute_names, privileged_classes,
                                 categorical_features, features_to_keep,
                                 features_to_drop, na_values,
                                 custom_preprocessing, metadata):
    '''
    @usage:
        wrap the input data in an AIF360 StandardDataset; all remaining
        arguments are passed through to the StandardDataset constructor
        return: the 'standardized' dataset
    @param:
        - df: original input pandas dataframe
    '''
    # Transform into standardized dataframe
    dataset_standardized = StandardDataset(
        df,
        label_name,
        favorable_classes,
        protected_attribute_names,
        privileged_classes,
        categorical_features=categorical_features,
        features_to_keep=features_to_keep,
        features_to_drop=features_to_drop,
        na_values=na_values,
        custom_preprocessing=custom_preprocessing,
        metadata=metadata)

    return dataset_standardized
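Since every argument is forwarded to the StandardDataset constructor, a call might look like the following (the column names are hypothetical):

# Hypothetical call; 'outcome' and 'group' are illustrative column names.
ds = generate_formatted_dataframe(df,
                                  label_name='outcome',
                                  favorable_classes=[1],
                                  protected_attribute_names=['group'],
                                  privileged_classes=[[1]],
                                  categorical_features=[],
                                  features_to_keep=[],
                                  features_to_drop=[],
                                  na_values=[],
                                  custom_preprocessing=None,
                                  metadata=None)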
Example #5
def plot_using_aif(df_predict, df_true):
    # Relies on globals defined elsewhere in the script: unpriv_list,
    # priv_list, prot_attr_dict, rating_names, predict_df_list,
    # true_df_list, protected_attribute_names, privileged_classes.
    predict_list, true_list = [], []
    unpriv_label_list, priv_label_list = [], []
    for (u, p) in zip(unpriv_list, priv_list):
        cur_predict, cur_true = [], []

        # Build human-readable group labels from the protected-attribute maps
        unpriv_label = '+'.join(['-'.join([prot_attr_dict[key][u_el[key]]
                                           for key in u_el]) for u_el in u])
        priv_label = '+'.join(['-'.join([prot_attr_dict[key][p_el[key]]
                                         for key in p_el]) for p_el in p])

        print('-------------------------------------------------------------------')
        print('unpriv_label:-->', unpriv_label)
        print('-------------------------------------------------------------------')
        print('priv_label  :-->', priv_label)
        print('-------------------------------------------------------------------')
        print('\n\n')
        for i, label in enumerate(rating_names):
            predict_dataset = StandardDataset(
                df=predict_df_list[i], label_name=label,
                favorable_classes=[1.0],
                protected_attribute_names=protected_attribute_names,
                privileged_classes=privileged_classes)

            true_dataset = StandardDataset(
                df=true_df_list[i], label_name=label,
                favorable_classes=[1.0],
                protected_attribute_names=protected_attribute_names,
                privileged_classes=privileged_classes)

            predict_dataset_metric = BinaryLabelDatasetMetric(
                predict_dataset, unprivileged_groups=u, privileged_groups=p)
            true_dataset_metric = BinaryLabelDatasetMetric(
                true_dataset, unprivileged_groups=u, privileged_groups=p)

            # Alternative metrics (disparate_impact, ClassificationMetric's
            # generalized_entropy_index) were also tried here originally.
            print(label, ':  -->',
                  'predicted :  -->', abs(predict_dataset_metric.mean_difference()),
                  'true :  -->', abs(true_dataset_metric.mean_difference()))
Example #6
from aif360.datasets import StandardDataset


def preprocess_adultdataset(df):
    df['Age'] = df['age'].apply(lambda x: x // 10 * 10)

    df['Education_years'] = df['education.num'].apply(
        lambda x: '<6' if x <= 5 else ('>12' if x >= 13 else x))
    df['Education_years'] = df['Education_years'].astype('category')

    df['Age'] = df['Age'].apply(lambda x: '>=70' if x >= 70 else x)

    df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
    df['race'] = df['race'].apply(lambda x: 1.0 if x == 'White' else 0.0)

    protected_attribute = ['sex', 'race']
    label_name = 'income'
    categorical_features = ['Age', 'Education_years']
    features = categorical_features + [label_name] + protected_attribute

    privileged_class = {'sex': [1.0], 'race': [1.0]}
    protected_attribute_map = {
        'sex': {
            1.0: 'Male',
            0.0: 'Female'
        },
        'race': {
            1.0: 'White',
            0.0: 'Non-white'
        }
    }

    data = StandardDataset(
        df,
        label_name,
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=protected_attribute,
        privileged_classes=[privileged_class[x] for x in protected_attribute],
        categorical_features=categorical_features,
        features_to_keep=features,
        metadata={
            'label_maps': [{
                1.0: '>50K',
                0.0: '<=50K'
            }],
            'protected_attribute_maps':
            [protected_attribute_map[x] for x in protected_attribute]
        })
    return data
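A hedged usage sketch, assuming the raw UCI Adult data has already been loaded into a dataframe with the column names used above ('age', 'education.num', 'sex', 'race', 'income'):

# Hypothetical usage: the CSV path is an assumption.
import pandas as pd

df = pd.read_csv('adult.csv')
adult_data = preprocess_adultdataset(df)
print(adult_data.feature_names)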
Example #7
def main() -> None:
    # ait_input, data_loading, ait_input_check, preprocessing,
    # save_metric_fairness_plot, measure_mean_difference and move_log are
    # provided by the surrounding AIT (AI test harness) project.

    # read from inventory
    filepath = ait_input.get_inventory_path('Data')

    # prepare column names as given by german.doc
    column_names = [
        'status', 'month', 'credit_history', 'purpose', 'credit_amount',
        'savings', 'employment', 'investment_as_income_percentage',
        'personal_status', 'other_debtors', 'residence_since', 'property',
        'age', 'installment_plans', 'housing', 'number_of_credits',
        'skill_level', 'people_liable_for', 'telephone', 'foreign_worker',
        'credit'
    ]

    # load into a dataframe
    df = data_loading(filepath=filepath,
                      column_names=column_names,
                      na_values=None)

    # prepare for mappings
    mappings = {
        'label_maps': [{
            1.0: 'Good Credit',
            2.0: 'Bad Credit'
        }],
        'protected_attribute_maps': [{
            1.0: 'Male',
            0.0: 'Female'
        }, {
            1.0: 'Old',
            0.0: 'Young'
        }]
    }

    # prepare for categorical features
    categorical_features = [
        'status', 'credit_history', 'purpose', 'savings', 'employment',
        'other_debtors', 'property', 'installment_plans', 'housing',
        'skill_level', 'telephone', 'foreign_worker'
    ]

    # load param
    protected_attribute = ait_input.get_method_param_value(
        'protected_attribute')
    privileged_classes = ait_input.get_method_param_value('privileged_classes')

    # input check
    ait_input_check(protected_attribute, privileged_classes)

    # prepare the structure from the dataframe and edit the feature settings;
    # 'privileged_classes' loaded from the parameters is a numeric cutoff, so
    # membership in the privileged class is tested with a lambda
    dataset = StandardDataset(
        df=df,
        label_name='credit',
        favorable_classes=[1],
        protected_attribute_names=[protected_attribute],
        privileged_classes=[lambda x: x >= privileged_classes],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=None,
        features_to_drop=['personal_status', 'sex'],
        na_values=None,
        custom_preprocessing=preprocessing,
        metadata=mappings)

    # set two variables for the privileged (1) and unprivileged (0) values of the protected attribute
    privileged_groups = [{protected_attribute: 1}]
    unprivileged_groups = [{protected_attribute: 0}]

    # compute fairness metric on original training dataset
    metric_fairness = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    print("Original training dataset: German Credit Data")
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f"
        % metric_fairness.mean_difference())
    print("unprivileged groups = %f" %
          metric_fairness.base_rate(privileged=False))
    print("privileged groups = %f" %
          metric_fairness.base_rate(privileged=True))

    # resource observed_predicted_plot
    save_metric_fairness_plot(metric_fairness, protected_attribute)

    # measures
    measure_mean_difference(metric_fairness.mean_difference())

    # ait.log
    move_log()
Example #8
    def apply_model(self, data, scalers, adjusted_annotated_train_data,
                    pre_processor, learner, model):
        filtered_data = self.missing_value_handler.handle_missing(data)
        print(self.missing_value_handler.name(), 'removed',
              len(data) - len(filtered_data), 'instances from validation data')

        for numerical_attribute, scaler in scalers.items():
            numerical_attribute_data = np.array(
                filtered_data[numerical_attribute]).reshape(-1, 1)
            scaled = scaler.transform(numerical_attribute_data)
            filtered_data.loc[:, numerical_attribute] = scaled

        annotated_data = StandardDataset(
            df=filtered_data,
            label_name=self.label_name,
            favorable_classes=[self.positive_label],
            protected_attribute_names=self.protected_attribute_names,
            privileged_classes=self.privileged_classes,
            categorical_features=self.categorical_attribute_names,
            features_to_drop=self.attributes_to_drop_names,
            metadata=self.dataset_metadata)

        adjusted_annotated_data = self.preprocess_data(pre_processor,
                                                       annotated_data)

        train_feature_names = adjusted_annotated_train_data.feature_names
        current_feature_names = adjusted_annotated_data.feature_names

        feature_names_in_train_but_not_in_current = set(
            train_feature_names).difference(set(current_feature_names))

        print("Injecting zero columns for features not present",
              feature_names_in_train_but_not_in_current)

        validation_data_df, _ = adjusted_annotated_data.convert_to_dataframe()

        for feature_name in feature_names_in_train_but_not_in_current:
            validation_data_df.loc[:, feature_name] = 0.0

        adjusted_annotated_data.feature_names = train_feature_names
        adjusted_annotated_data.features = validation_data_df[
            train_feature_names].values.copy()

        adjusted_annotated_data_with_predictions = adjusted_annotated_data.copy()

        if learner.needs_annotated_data_for_prediction():
            adjusted_annotated_data_with_predictions = model.predict(
                adjusted_annotated_data)
        else:
            adjusted_annotated_data_with_predictions.labels = model.predict(
                adjusted_annotated_data.features)

            try:
                # Assumes the favorable-class probability sits in column 0
                # of predict_proba's output
                class_probs = model.predict_proba(
                    adjusted_annotated_data.features)
                adjusted_annotated_data_with_predictions.scores = class_probs[:, 0]
            except AttributeError:
                print("WARNING: MODEL CANNOT ASSIGN CLASS PROBABILITIES")

        return adjusted_annotated_data, adjusted_annotated_data_with_predictions
Example #9
    def run(self):
        """Executes all the possible experiments from the combination of  given
            learners, pre-processors and post-processors.
            
            No. of experiments = (#learners * #preprocessors * #postprocessors)
        """
        np.random.seed(self.fixed_random_seed)

        data = self.load_raw_data()

        all_train_data, test_and_validation_data = train_test_split(
            data,
            test_size=self.test_set_ratio + self.validation_set_ratio,
            random_state=self.fixed_random_seed)

        train_data = self.train_data_sampler.sample(all_train_data)

        second_split_ratio = self.test_set_ratio / (self.test_set_ratio +
                                                    self.validation_set_ratio)

        validation_data, test_data = train_test_split(
            test_and_validation_data,
            test_size=second_split_ratio,
            random_state=self.fixed_random_seed)

        self.missing_value_handler.fit(train_data)
        filtered_train_data = self.missing_value_handler.handle_missing(
            train_data)

        print(self.missing_value_handler.name(), 'removed',
              len(train_data) - len(filtered_train_data),
              'instances from training data')

        scalers = {}

        for numerical_attribute in self.numeric_attribute_names:
            numerical_attribute_data = np.array(
                filtered_train_data[numerical_attribute]).reshape(-1, 1)
            scaler = clone(self.numeric_attribute_scaler).fit(
                numerical_attribute_data)
            scaled = scaler.transform(numerical_attribute_data)

            filtered_train_data.loc[:, numerical_attribute] = scaled
            scalers[numerical_attribute] = scaler

        annotated_train_data = StandardDataset(
            df=filtered_train_data,
            label_name=self.label_name,
            favorable_classes=[self.positive_label],
            protected_attribute_names=self.protected_attribute_names,
            privileged_classes=self.privileged_classes,
            categorical_features=self.categorical_attribute_names,
            features_to_drop=self.attributes_to_drop_names,
            metadata=self.dataset_metadata)

        for pre_processor in self.pre_processors:
            for learner in self.learners:
                for post_processor in self.post_processors:
                    self.run_single_exp(annotated_train_data, validation_data,
                                        test_data, scalers, pre_processor,
                                        learner, post_processor)
        self.filter_optimal_results()
Example #10
import pandas as pd
from aif360.datasets import StandardDataset


def load_TaiwanDataset():

    filepath = "C:\\Users\\Johannes\\Desktop\\Code - Copy\\data\\UCI_Credit_Card.csv"
    df = pd.read_csv(filepath, sep=',', na_values=[])

    df = df.rename(columns={'default.payment.next.month': 'TARGET'})
    del df['ID']
    # Binarize AGE: 1.0 for applicants over 25 (privileged), 0.0 otherwise
    df['AGE'] = df['AGE'].apply(lambda x: 1.0 if x > 25 else 0.0)
    df['CREDIT_AMNT'] = df['BILL_AMT1'] - df['PAY_AMT1']

    XD_features = [
        "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE", "PAY_0", "PAY_2",
        "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2",
        "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1",
        "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
        "CREDIT_AMNT"
    ]
    D_features = ['AGE']
    Y_features = ['TARGET']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = [
        'SEX', 'EDUCATION', 'MARRIAGE', "PAY_0", "PAY_2", "PAY_3", "PAY_4",
        "PAY_5", "PAY_6"
    ]

    privileged_class = {"AGE": [1.0]}
    protected_attribute_map = {"AGE": {1.0: 'Old', 0.0: 'Young'}}

    def default_preprocessing(df):
        def label_sex(x):
            if x == 1:
                return 'Male'
            elif x == 2:
                return 'Female'
            else:
                return 'NA'

        def label_education(x):
            if x == 1:
                return 'graduate_school'
            elif x == 2:
                return 'university'
            elif x == 3:
                return 'high_school'
            else:
                return 'others'

        def label_marriage(x):
            if x == 1:
                return 'married'
            elif x == 2:
                return 'single'
            else:
                return 'others'

        #to be defined
        def label_pay(x):
            if x in [-2, -1]:
                return 0
            else:
                return x

        # group credit history, savings, and employment
        df['SEX'] = df['SEX'].apply(label_sex)
        df['EDUCATION'] = df['EDUCATION'].apply(label_education)
        df['MARRIAGE'] = df['MARRIAGE'].apply(label_marriage)

        pay_col = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
        for p in pay_col:
            df[p] = df[p].apply(label_pay)

        # Good credit == 1
        status_map = {0: 1.0, 1: 2.0}
        df['TARGET'] = df['TARGET'].replace(status_map)

        return df

    df_standard = StandardDataset(
        df=df,
        label_name=Y_features[0],
        favorable_classes=[1],
        protected_attribute_names=D_features,
        privileged_classes=[privileged_class["AGE"]],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        metadata={
            'label_maps': [{
                1.0: 'Good',
                2.0: 'Bad'
            }],
            'protected_attribute_maps': [protected_attribute_map]
        },
        custom_preprocessing=default_preprocessing)

    return df_standard
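The returned StandardDataset plugs straight into AIF360's dataset metrics; a short hedged sketch (the group definitions follow the AGE encoding above):

# Sketch only: compute the mean outcome difference between young and old.
from aif360.metrics import BinaryLabelDatasetMetric

taiwan_data = load_TaiwanDataset()
metric = BinaryLabelDatasetMetric(taiwan_data,
                                  unprivileged_groups=[{'AGE': 0.0}],
                                  privileged_groups=[{'AGE': 1.0}])
print('Mean difference:', metric.mean_difference())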
Example #11
# Assumes validation_comp, validation_pred, unprivileged_group and
# privileged_group are defined earlier in the script.
unpriv_dict = {}
priv_dict = {}
prot, priv = [], []
f_c = []
for i in range(validation_comp.shape[1]):
    f_c.append([])
    if validation_comp.columns[i].find(unprivileged_group) != -1:
        prot.append(validation_comp.columns[i])
        unpriv_dict = {validation_comp.columns[i]: 1}
    if validation_comp.columns[i].find(privileged_group) != -1:
        priv.append([1])
        prot.append(validation_comp.columns[i])
        priv_dict = {validation_comp.columns[i]: 1}
    else:
        priv.append([])

# Positional arguments: label_name, favorable_classes,
# protected_attribute_names, privileged_classes
stdDs = StandardDataset(validation_comp, 'is_violent_recid', [0], prot, priv)
stdPred = StandardDataset(validation_pred, 'is_violent_recid', [0], prot, priv)
bi_met = BinaryLabelDatasetMetric(stdDs,
                                  privileged_groups=[priv_dict],
                                  unprivileged_groups=[unpriv_dict])
class_met = ClassificationMetric(stdDs,
                                 stdPred,
                                 unprivileged_groups=[unpriv_dict],
                                 privileged_groups=[priv_dict])

disparate_impact = bi_met.disparate_impact()
#error_rate_ratio = class_met.error_rate_ratio()
eq_diff = class_met.equal_opportunity_difference()

# Create 2 bar graphs
x = [1]
Example #12
    print(
        '-------------------------------------------------------------------')
    print('unpriv_label:-->', unpriv_label)
    print(
        '-------------------------------------------------------------------')
    print('priv_label  :-->', priv_label)
    print(
        '-------------------------------------------------------------------')

    for i, label in enumerate(rating_names):
        print(label)

        scaler = MinMaxScaler(copy=False)

        train_dataset = StandardDataset(
            df=train_df_list[i],
            label_name=label,
            favorable_classes=[1.0],
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes)
        test_dataset = StandardDataset(
            df=test_df_list[i],
            label_name=label,
            favorable_classes=[1.0],
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes)
        train_dataset.features = scaler.fit_transform(train_dataset.features)
        # Scale the test set with the scaler fitted on the training set
        test_dataset.features = scaler.transform(test_dataset.features)

        index = [
            test_dataset.feature_names.index(x)
            for x in protected_attribute_names
        ]
Example #13
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from aif360.datasets import StandardDataset


def prepare_data(data,
                 priv_category,
                 priv_value,
                 target_label,
                 priv_target_value,
                 ignore_cols=None):
    """
    Prepare a dataset for bias mitigation.

    Parameters:
    data (pandas dataframe): Data to fix (for fairness)
    priv_category (string): Column name that contains the privileged value (e.g. Race, Gender, etc.)
    priv_value (string): Value or type in the column that denotes the privileged attribute (e.g. White, Male, etc.)
    target_label (string): Column name of the target variable (e.g. income, loan score, etc.)
    priv_target_value (string): Value in the target that favors the privileged (e.g. high income, favorable loan score, credit acceptance, etc.).
                                Must be boolean (so if the target is numeric, convert it to categorical by thresholding before processing).
    ignore_cols, optional (list of string): List of columns to exclude from bias assessment and modeling.

    Returns:
    data_priv (StandardDataset): Dataset prepared by aif360 for processing
    encoders (dict): Dictionary of encoding models
    numerical_features (list): List of numerical columns
    categorical_features (list): List of categorical columns
    """

    if ignore_cols:
        data = data.drop(ignore_cols, axis=1)

    # Get categorical features
    categorical_features = data.columns[data.dtypes == 'object']
    data_encoded = data.copy()

    # Store categorical names and encoders
    categorical_names = {}
    encoders = {}

    # Use Label Encoder for categorical columns (including target column)
    for feature in categorical_features:
        le = LabelEncoder()
        le.fit(data_encoded[feature])

        data_encoded[feature] = le.transform(data_encoded[feature])

        categorical_names[feature] = le.classes_
        encoders[feature] = le

    # Scale numeric columns
    numerical_features = [
        c for c in data.columns.values if c not in categorical_features
    ]

    for feature in numerical_features:
        val = data_encoded[feature].values[:, np.newaxis]
        mms = MinMaxScaler().fit(val)
        data_encoded[feature] = mms.transform(val)
        encoders[feature] = mms

    data_encoded = data_encoded.astype(float)

    privileged_class = np.where(
        categorical_names[priv_category] == priv_value)[0]
    encoded_target_label = np.where(
        categorical_names[target_label] == priv_target_value)[0]

    data_priv = StandardDataset(data_encoded,
                                label_name=target_label,
                                favorable_classes=encoded_target_label,
                                protected_attribute_names=[priv_category],
                                privileged_classes=[privileged_class])

    return data_priv, encoders, numerical_features, categorical_features
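A minimal usage sketch; the column names below are hypothetical:

# Hypothetical call on a dataframe 'df' with a categorical 'income' target.
dataset, encoders, num_cols, cat_cols = prepare_data(
    df,
    priv_category='sex',
    priv_value='Male',
    target_label='income',
    priv_target_value='>50K',
    ignore_cols=['id'])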
Example #14
import pickle

import numpy as np
import pandas as pd
from django.http import HttpResponse
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing


def train(request):
    # compute_metrics is a helper assumed to be defined elsewhere in the
    # project (as in AIF360's tutorial utilities)
    df = pd.read_csv('./training/resume_data_5000.csv')
    df = df.drop(df.columns[0], axis=1)
    dataset_orig = StandardDataset(df,
                                   label_name='Accepted',
                                   favorable_classes=[1],
                                   protected_attribute_names=['Gender'],
                                   privileged_classes=[[1]],
                                   categorical_features=['School'],
                                   features_to_drop=['Name'])
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.7],
                                                             shuffle=True)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True)

    privileged_groups = [{'Gender': 1}]
    unprivileged_groups = [{'Gender': 0}]

    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    orig_mean_difference = metric_orig_train.mean_difference()

    with open('./training/orig_mean_difference.pkl', 'wb') as f:
        pickle.dump(orig_mean_difference, f)

    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    dataset_transf_train = RW.fit_transform(dataset_orig_train)
    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    transf_mean_difference = metric_transf_train.mean_difference()

    with open('./training/transf_mean_difference.pkl', 'wb') as f:
        pickle.dump(transf_mean_difference, f)

    # Logistic regression classifier and predictions
    scale_orig = StandardScaler()
    X_train = scale_orig.fit_transform(dataset_orig_train.features)
    y_train = dataset_orig_train.labels.ravel()
    w_train = dataset_orig_train.instance_weights.ravel()

    with open('./training/scaler.pkl', 'wb') as f:
        pickle.dump(scale_orig, f)

    lmod_orig = LogisticRegression(solver='lbfgs')
    lmod_orig.fit(X_train,
                  y_train,
                  sample_weight=dataset_orig_train.instance_weights)
    y_train_pred = lmod_orig.predict(X_train)

    pos_ind = np.where(
        lmod_orig.classes_ == dataset_orig_train.favorable_label)[0][0]

    dataset_orig_train_pred = dataset_orig_train.copy()
    dataset_orig_train_pred.labels = y_train_pred

    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    X_valid = scale_orig.transform(dataset_orig_valid_pred.features)
    y_valid = dataset_orig_valid_pred.labels
    dataset_orig_valid_pred.scores = lmod_orig.predict_proba(
        X_valid)[:, pos_ind].reshape(-1, 1)

    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = scale_orig.transform(dataset_orig_test_pred.features)
    y_test = dataset_orig_test_pred.labels
    dataset_orig_test_pred.scores = lmod_orig.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)

    num_thresh = 100
    ba_arr = np.zeros(num_thresh)
    class_thresh_arr = np.linspace(0.01, 0.99, num_thresh)
    for idx, class_thresh in enumerate(class_thresh_arr):

        fav_inds = dataset_orig_valid_pred.scores > class_thresh
        dataset_orig_valid_pred.labels[
            fav_inds] = dataset_orig_valid_pred.favorable_label
        dataset_orig_valid_pred.labels[
            ~fav_inds] = dataset_orig_valid_pred.unfavorable_label

        classified_metric_orig_valid = ClassificationMetric(
            dataset_orig_valid,
            dataset_orig_valid_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        ba_arr[idx] = 0.5 * (classified_metric_orig_valid.true_positive_rate()
                             + classified_metric_orig_valid.true_negative_rate())

    best_ind = np.where(ba_arr == np.max(ba_arr))[0][0]
    best_class_thresh = class_thresh_arr[best_ind]

    bal_acc_arr_orig = []
    disp_imp_arr_orig = []
    avg_odds_diff_arr_orig = []

    for thresh in tqdm(class_thresh_arr):
        fav_inds = dataset_orig_test_pred.scores > thresh
        dataset_orig_test_pred.labels[
            fav_inds] = dataset_orig_test_pred.favorable_label
        dataset_orig_test_pred.labels[
            ~fav_inds] = dataset_orig_test_pred.unfavorable_label

        metric_test_bef = compute_metrics(dataset_orig_test,
                                          dataset_orig_test_pred,
                                          unprivileged_groups,
                                          privileged_groups,
                                          disp=False)

        if thresh == best_class_thresh:
            with open('./training/metrics_orig.pkl', 'wb') as f:
                pickle.dump(metric_test_bef,
                            f,
                            protocol=pickle.HIGHEST_PROTOCOL)

        bal_acc_arr_orig.append(metric_test_bef["Balanced accuracy"])
        avg_odds_diff_arr_orig.append(
            metric_test_bef["Average odds difference"])
        disp_imp_arr_orig.append(metric_test_bef["Disparate impact"])

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod_transf = LogisticRegression(solver='lbfgs')
    lmod_transf.fit(X_train,
                    y_train,
                    sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod_transf.predict(X_train)

    dataset_transf_test_pred = dataset_orig_test.copy(deepcopy=True)
    # Use the already-fitted scaler on the test features (no refitting)
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod_transf.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)

    bal_acc_arr_transf = []
    disp_imp_arr_transf = []
    avg_odds_diff_arr_transf = []

    for thresh in tqdm(class_thresh_arr):
        fav_inds = dataset_transf_test_pred.scores > thresh
        dataset_transf_test_pred.labels[
            fav_inds] = dataset_transf_test_pred.favorable_label
        dataset_transf_test_pred.labels[
            ~fav_inds] = dataset_transf_test_pred.unfavorable_label

        metric_test_aft = compute_metrics(dataset_orig_test,
                                          dataset_transf_test_pred,
                                          unprivileged_groups,
                                          privileged_groups,
                                          disp=False)

        if thresh == best_class_thresh:
            with open('./training/metrics_transf.pkl', 'wb') as f:
                pickle.dump(metric_test_aft,
                            f,
                            protocol=pickle.HIGHEST_PROTOCOL)

        bal_acc_arr_transf.append(metric_test_aft["Balanced accuracy"])
        avg_odds_diff_arr_transf.append(
            metric_test_aft["Average odds difference"])
        disp_imp_arr_transf.append(metric_test_aft["Disparate impact"])

    with open('./training/model_orig.pkl', 'wb') as f:
        pickle.dump(lmod_orig, f)
    with open('./training/model_transf.pkl', 'wb') as f:
        pickle.dump(lmod_transf, f)

    return HttpResponse('Model trained')
Example #15
from aif360.datasets import StandardDataset


def preprocess_germandataset(df):
    def group_credit_hist(x):
        if x in [
                'no credits taken/ all credits paid back duly',
                'all credits at this bank paid back duly',
                'existing credits paid back duly till now'
        ]:
            return 'None/Paid'
        elif x == 'delay in paying off in the past':
            return 'Delay'
        elif x == 'critical account/ other credits existing (not at this bank)':
            return 'Other'
        else:
            return 'NA'

    def group_employ(x):
        if x == 'unemployed':
            return 'Unemployed'
        elif x in ['... < 1 year ', '1 <= ... < 4 years']:
            return '1-4 years'
        elif x in ['4 <= ... < 7 years', '.. >= 7 years']:
            return '4+ years'
        else:
            return 'NA'

    def group_savings(x):
        if x in ['... < 100 DM', '100 <= ... < 500 DM']:
            return '<500'
        elif x in ['500 <= ... < 1000 DM ', '.. >= 1000 DM ']:
            return '500+'
        elif x == 'unknown/ no savings account':
            return 'Unknown/None'
        else:
            return 'NA'

    def group_status(x):
        if x in ['< 0 DM', '0 <= ... < 200 DM']:
            return '<200'
        elif x in ['>= 200 DM / salary assignments for at least 1 year']:
            return '200+'
        elif x == 'no checking account':
            return 'None'
        else:
            return 'NA'

    status_map = {
        'male : divorced/separated': 1.0,
        'male : single': 1.0,
        'male : married/widowed': 1.0,
        'female : divorced/separated/married': 0.0,
        'female : single': 0.0
    }

    df['personal_status_sex'] = df['personal_status_sex'].replace(status_map)
    df['credit_history'] = df['credit_history'].apply(group_credit_hist)
    df['savings'] = df['savings'].apply(group_savings)
    df['present_emp_since'] = df['present_emp_since'].apply(group_employ)
    # np.float was removed from NumPy; the built-in float works here
    df['age'] = df['age'].apply(lambda x: float(x >= 25))
    df['account_check_status'] = df['account_check_status'].apply(group_status)

    df = df.rename(
        columns={
            'default': 'credit',
            'present_emp_since': 'employment',
            'account_check_status': 'status',
            'personal_status_sex': 'sex'
        })

    protected_attribute = ['sex', 'age']
    label_name = 'credit'
    categorical_features = ['credit_history', 'savings', 'employment']
    features = categorical_features + [label_name] + protected_attribute

    privileged_class = {'sex': [1.0], 'age': [1.0]}

    protected_attribute_map = {
        "sex": {
            1.0: 'male',
            0.0: 'female'
        },
        "age": {
            1.0: 'old',
            0.0: 'young'
        }
    }

    data = StandardDataset(
        df,
        label_name,
        favorable_classes=[1],
        protected_attribute_names=protected_attribute,
        privileged_classes=[privileged_class[x] for x in protected_attribute],
        categorical_features=categorical_features,
        features_to_keep=features,
        metadata={
            'label_maps': [{
                1.0: 'Good Credit',
                2.0: 'Bad Credit'
            }],
            'protected_attribute_maps':
            [protected_attribute_map[x] for x in protected_attribute]
        })

    return data
Example #16
import pandas as pd
from aif360.datasets import StandardDataset


def preprocess_compasdataset(df):
    df = df[[
        'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
        'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
        'two_year_recid', 'c_jail_in', 'c_jail_out'
    ]]

    # Indices of data samples to keep
    ix = df['days_b_screening_arrest'] <= 30
    ix = (df['days_b_screening_arrest'] >= -30) & ix
    ix = (df['is_recid'] != -1) & ix
    ix = (df['c_charge_degree'] != "O") & ix
    ix = (df['score_text'] != 'N/A') & ix
    df = df.loc[ix, :].copy()  # copy to avoid chained-assignment warnings
    df['length_of_stay'] = (
        pd.to_datetime(df['c_jail_out']) -
        pd.to_datetime(df['c_jail_in'])).apply(lambda x: x.days)

    # Restrict races to African-American and Caucasian
    df = df.loc[
        ~df['race'].isin(['Native American', 'Hispanic', 'Asian', 'Other']), :]

    df = df[[
        'sex', 'race', 'age_cat', 'c_charge_degree', 'score_text',
        'priors_count', 'is_recid', 'two_year_recid', 'length_of_stay'
    ]]

    df['priors_count'] = df['priors_count'].apply(lambda x: 0 if x <= 0 else (
        '1 to 3' if 1 <= x <= 3 else 'More than 3'))
    # Note: the original '8 < x <= 93' silently sent 8-day stays to
    # '>3months'; the boundary is corrected here
    df['length_of_stay'] = df['length_of_stay'].apply(
        lambda x: '<week' if x <= 7 else ('<3months' if x <= 93 else '>3months'))
    df['score_text'] = df['score_text'].apply(
        lambda x: 'MediumHigh' if (x == 'High') | (x == 'Medium') else x)
    df['age_cat'] = df['age_cat'].apply(lambda x: '25 to 45'
                                        if x == '25 - 45' else x)

    df['sex'] = df['sex'].replace({'Female': 1.0, 'Male': 0.0})
    df['race'] = df['race'].apply(lambda x: 1.0 if x == 'Caucasian' else 0.0)

    df = df[[
        'two_year_recid', 'sex', 'race', 'age_cat', 'priors_count',
        'c_charge_degree'
    ]]

    protected_attributes = ['sex', 'race']
    label_name = 'two_year_recid'
    categorical_features = ['age_cat', 'priors_count', 'c_charge_degree']
    features = categorical_features + [label_name] + protected_attributes

    # privileged classes
    privileged_classes = {"sex": [1.0], "race": [1.0]}

    # protected attribute maps
    protected_attribute_map = {
        "sex": {
            0.0: 'Male',
            1.0: 'Female'
        },
        "race": {
            1.0: 'Caucasian',
            0.0: 'Not Caucasian'
        }
    }

    data = StandardDataset(
        df,
        label_name,
        favorable_classes=[0],
        protected_attribute_names=protected_attributes,
        privileged_classes=[
            privileged_classes[x] for x in protected_attributes
        ],
        categorical_features=categorical_features,
        features_to_keep=features,
        metadata={
            'label_maps': [{
                1.0: 'Did recid.',
                0.0: 'No recid.'
            }],
            'protected_attribute_maps':
            [protected_attribute_map[x] for x in protected_attributes]
        })

    return data
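As with the other loaders, the resulting dataset plugs straight into AIF360's metrics; a short hedged sketch (the CSV path is an assumption):

# Sketch assuming the ProPublica COMPAS scores CSV is available locally.
import pandas as pd
from aif360.metrics import BinaryLabelDatasetMetric

compas_df = pd.read_csv('compas-scores-two-years.csv')
compas_data = preprocess_compasdataset(compas_df)
metric = BinaryLabelDatasetMetric(compas_data,
                                  unprivileged_groups=[{'race': 0.0}],
                                  privileged_groups=[{'race': 1.0}])
print('Disparate impact:', metric.disparate_impact())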