示例#1
0
    def get_df(self, repaired=False):
        """Return the preprocessed bank-marketing dataframe.

        Loads the raw semicolon-separated CSV, renames the label column
        'y' to 'target', min-max scales the continuous attributes,
        one-hot encodes any known sensitive attribute that is
        continuous, and drops rows containing NaN values.

        ``repaired`` is accepted for interface compatibility with the
        other dataset loaders; it is not used here.
        """
        raw_file = (os.path.dirname(os.path.realpath(__file__)) +
                    "/../raw/bank-additional-full.csv")
        df = pd.read_csv(raw_file, sep=";")
        df = df.rename(columns={'y': 'target'})

        # Every column must be classified as categorical or continuous.
        n_classified = (len(self.categorical_attributes) +
                        len(self.continuous_attributes))
        assert n_classified == len(df.columns), (
            "Error in classifying columns:" + str(n_classified) + " " +
            str(len(df.columns)))

        # Min-max scale continuous attributes to [0, 1].
        df[self.continuous_attributes] = MinMaxScaler().fit_transform(
            df[self.continuous_attributes])
        self.keep_columns = list(df.columns)

        # A continuous sensitive attribute is discretized and one-hot
        # encoded so it can be treated as categorical downstream.
        for sensitive in self.known_sensitive_attributes:
            if sensitive not in self.continuous_attributes:
                continue
            df = utils.get_discretized_df(df,
                                          columns_to_discretize=[sensitive])
            df = utils.get_one_hot_encoded_df(df, [sensitive])
            self.continuous_attributes.remove(sensitive)

        if self.verbose:
            print("-number of samples: (before dropping nan rows)", len(df))
        df = df.dropna()  # drop rows with null values
        if self.verbose:
            print("-number of samples: (after dropping nan rows)", len(df))
        return df
示例#2
0
    def get_df(self, repaired=False):
        """Return the preprocessed German-credit dataframe.

        Reads the CSV at ``self.filename``, verifies the
        categorical/continuous column classification, one-hot encodes
        continuous sensitive attributes, min-max scales the remaining
        continuous attributes, binarizes the label ('good' -> 1,
        'bad' -> 0), and drops rows containing NaN values.

        ``repaired`` is accepted for interface compatibility with the
        other dataset loaders; it is not used here.
        """
        df = pd.read_csv(self.filename)

        # Every column must be classified as categorical or continuous.
        assert len(self.categorical_attributes) + len(
            self.continuous_attributes) == len(
                df.columns), "Error in classifying columns"
        self.keep_columns = list(df.columns)

        # A continuous sensitive attribute is discretized and one-hot
        # encoded so it can be treated as categorical downstream.
        for known_sensitive_attribute in self.known_sensitive_attributes:
            if (known_sensitive_attribute in self.continuous_attributes):
                df = utils.get_discretized_df(
                    df, columns_to_discretize=[known_sensitive_attribute])
                df = utils.get_one_hot_encoded_df(
                    df, [known_sensitive_attribute])
                self.continuous_attributes.remove(known_sensitive_attribute)

        # Min-max scale continuous attributes to [0, 1].
        scaler = MinMaxScaler()
        df[self.continuous_attributes] = scaler.fit_transform(
            df[self.continuous_attributes])

        # Binarize the label; values outside the map become NaN and are
        # removed by the dropna below.
        df['target'] = df['target'].map({'good': 1, 'bad': 0})

        if (self.verbose):
            print("-number of samples: (before dropping nan rows)", len(df))
        # drop rows with null values
        df = df.dropna()
        if (self.verbose):
            print("-number of samples: (after dropping nan rows)", len(df))

        return df
示例#3
0
    def get_df(self, repaired=False):
        """Return the preprocessed COMPAS dataframe.

        Reads the CSV at ``self.filename``, restricts it to
        ``self.keep_columns``, one-hot encodes continuous sensitive
        attributes, min-max scales the remaining continuous attributes,
        renames the label column 'two_year_recid' to 'target', and
        drops rows containing NaN values.

        ``repaired`` is accepted for interface compatibility with the
        other dataset loaders; it is not used here.
        """
        df = pd.read_csv(self.filename)
        df = df[self.keep_columns]

        # Every column must be classified as categorical or continuous.
        n_classified = (len(self.categorical_attributes) +
                        len(self.continuous_attributes))
        assert n_classified == len(df.columns), (
            "Error in classifying columns:" + str(n_classified) + " " +
            str(len(df.columns)))

        # A continuous sensitive attribute is discretized and one-hot
        # encoded so it can be treated as categorical downstream.
        for sensitive in self.known_sensitive_attributes:
            if sensitive not in self.continuous_attributes:
                continue
            df = utils.get_discretized_df(df,
                                          columns_to_discretize=[sensitive])
            df = utils.get_one_hot_encoded_df(df, [sensitive])
            self.continuous_attributes.remove(sensitive)

        # Min-max scale continuous attributes to [0, 1].
        df[self.continuous_attributes] = MinMaxScaler().fit_transform(
            df[self.continuous_attributes])

        # Standardize the label column name and keep keep_columns in sync.
        df = df.rename(columns={'two_year_recid': 'target'})
        self.keep_columns.remove('two_year_recid')
        self.keep_columns.append("target")

        if self.verbose:
            print("-number of samples: (before dropping nan rows)", len(df))
        df = df.dropna()  # drop rows with null values
        if self.verbose:
            print("-number of samples: (after dropping nan rows)", len(df))

        return df
示例#4
0
def init_synthetic():
    """Build (or load) a random binary dataset and fit logistic regression.

    If data/sample.csv exists it is loaded; otherwise a 200x10 random
    0/1 matrix is generated, named col-0..col-8 plus 'target', and
    cached to that file.  Features are one-hot encoded, split 70/30,
    and a logistic regression is fit; accuracies are printed.

    Returns:
        (weights, bias, attributes, sensitive_attributes, probs) where
        the last three come from utils.get_statistics_from_df.
    """
    filename = "data/sample.csv"

    if os.path.isfile(filename):
        dataframe = pd.read_csv(filename)
    else:
        rows, cols = 200, 10
        matrix = np.random.randint(2, size=(rows, cols))
        dataframe = pd.DataFrame.from_records(matrix)
        dataframe.columns = (['col-' + str(i) for i in range(cols - 1)] +
                             ['target'])
        dataframe.to_csv(filename, index=False)

    # Split features from label.
    X = dataframe.drop(['target'], axis=1)
    y = dataframe['target']

    # One-hot encode every feature column.
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # 70/30 train-test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    known_sensitive_attributes = ['col-0']
    attributes, sensitive_attributes, probs = utils.get_statistics_from_df(
        X_train, known_sensitive_attributes)

    # For a linear classifier, use sklearn's logistic regression.
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)

    print("\nFeatures: ", X_train.columns.to_list())
    print("\nWeights: ", clf.coef_)
    print("\nBias:", clf.intercept_[0])
    assert len(clf.coef_[0]) == len(X_train.columns), \
        "Error: wrong dimension of features and weights"

    predict_train = clf.predict(X_train)
    predict_test = clf.predict(X_test)

    print("Train accuracy:", metrics.accuracy_score(y_train, predict_train),
          "positive ratio: ", y_train.mean())
    print("Test accuracy:", metrics.accuracy_score(y_test, predict_test),
          "positive ratio: ", y_test.mean())
    print("Train set positive prediction", predict_train.mean())
    print("Test set positive prediction", predict_test.mean())
    print()

    return (clf.coef_[0], clf.intercept_[0], attributes,
            sensitive_attributes, probs)
示例#5
0
def init_iris():
    """Prepare the binarized Iris task and fit a logistic regression.

    Class 2 of the Iris target is merged into class 0 so the problem is
    binary.  All columns are discretized, one-hot encoded, and split
    70/30 into train/test.

    Returns:
        (weights, bias, attributes, sensitive_attributes, probs) where
        the last three come from utils.get_statistics_from_df.
    """
    target = "target"
    dataset = load_iris()
    # Merge class 2 into class 0 -> binary labels {0, 1}.
    dataset[target] = np.where(dataset[target] == 2, 0, dataset[target])

    # Convert the sklearn bunch to a dataframe.
    data_df = utils.sklearn_to_df(dataset)

    # Discretize every column.
    data = utils.get_discretized_df(
        data_df, columns_to_discretize=data_df.columns.to_list())

    # Split features from label.
    X = data.drop(['target'], axis=1)
    y = data['target']

    # One-hot encode every feature column.
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # 70/30 train-test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    known_sensitive_attributes = [['sepal length (cm)_1']]
    attributes, sensitive_attributes, probs = utils.get_statistics_from_df(
        X_train, known_sensitive_attributes)

    # For a linear classifier, use sklearn's logistic regression.
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)

    print("\nFeatures: ", X_train.columns.to_list())
    print("\nWeights: ", clf.coef_)
    print("\nBias:", clf.intercept_[0])
    assert len(clf.coef_[0]) == len(X_train.columns), \
        "Error: wrong dimension of features and weights"

    print("Train Accuracy Score: ", clf.score(X_train, y_train),
          "positive ratio: ", y_train.mean())
    print("Test Accuracy Score: ", clf.score(X_test, y_test),
          "positive ratio: ", y_test.mean())

    return (clf.coef_[0], clf.intercept_[0], attributes,
            sensitive_attributes, probs)
示例#6
0
    def get_df(self, repaired=False):
        """Return the preprocessed Titanic dataframe.

        Loads the CSV at ``self.filename``, assigns canonical column
        names, drops ``self.ignore_columns`` and NaN rows, min-max
        scales continuous attributes, one-hot encodes continuous
        sensitive attributes, and writes the reduced dataset next to
        the raw data.  When ``repaired`` is True the pre-repaired CSV
        is loaded and returned instead.
        """
        df = pd.read_csv(self.filename)
        # Canonical column names for the raw file (reproduced verbatim,
        # including the original spellings).
        df.columns = [
            'passenger class', 'name', 'sex', 'age',
            'siblings or spouce aboard', 'parents or childred aboard',
            'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body',
            'home destination', 'target'
        ]

        df = df.drop(self.ignore_columns, axis=1)
        if self.verbose:
            print("-number of samples: (before dropping nan rows)", len(df))
        df = df.dropna()  # drop rows with null values
        if self.verbose:
            print("-number of samples: (after dropping nan rows)", len(df))

        # Every remaining column must be classified as categorical or
        # continuous.
        n_categorical = len(self.categorical_attributes)
        n_continuous = len(self.continuous_attributes)
        assert n_categorical + n_continuous == len(df.columns), (
            str(n_categorical) + " " + str(n_continuous) + " " +
            str(len(df.columns)))
        self.keep_columns = list(df.columns)

        # Min-max scale continuous attributes to [0, 1].
        df[self.continuous_attributes] = MinMaxScaler().fit_transform(
            df[self.continuous_attributes])

        # A continuous sensitive attribute is discretized and one-hot
        # encoded so it can be treated as categorical downstream.
        for sensitive in self.known_sensitive_attributes:
            if sensitive not in self.continuous_attributes:
                continue
            df = utils.get_discretized_df(df,
                                          columns_to_discretize=[sensitive])
            df = utils.get_one_hot_encoded_df(df, [sensitive])
            self.continuous_attributes.remove(sensitive)

        base_dir = os.path.dirname(os.path.realpath(__file__))
        df.to_csv(base_dir + "/../raw/reduced_titanic.csv", index=False)

        if repaired:
            df = pd.read_csv(base_dir + "/../raw/repaired_titanic.csv")

        return df
示例#7
0
    def get_df(self, repaired=False):
        """Return the preprocessed Adult (census income) dataframe.

        Loads the CSV at ``self.filename``, min-max scales continuous
        attributes, restricts to ``self.keep_columns``, one-hot encodes
        continuous sensitive attributes, binarizes the label
        ('<=50K' -> 0, '>50K' -> 1) under the name 'target', writes the
        reduced dataset next to the raw data, and drops NaN rows.  When
        ``repaired`` is True the pre-repaired CSV is loaded instead.
        """
        df = pd.read_csv(self.filename)

        # Min-max scale continuous attributes to [0, 1].
        df[self.continuous_attributes] = MinMaxScaler().fit_transform(
            df[self.continuous_attributes])

        df = df[self.keep_columns]

        # A continuous sensitive attribute is discretized and one-hot
        # encoded so it can be treated as categorical downstream.
        for sensitive in self.known_sensitive_attributes:
            if sensitive not in self.continuous_attributes:
                continue
            df = utils.get_discretized_df(df,
                                          columns_to_discretize=[sensitive])
            df = utils.get_one_hot_encoded_df(df, [sensitive])
            self.continuous_attributes.remove(sensitive)

        # Binarize the label and standardize its column name; keep
        # keep_columns in sync.
        df['income-per-year'] = df['income-per-year'].map({
            '<=50K': 0,
            '>50K': 1
        })
        df = df.rename(columns={'income-per-year': 'target'})
        self.keep_columns.remove('income-per-year')
        self.keep_columns.append("target")

        base_dir = os.path.dirname(os.path.realpath(__file__))
        df.to_csv(base_dir + "/../raw/reduced_adult.csv", index=False)

        if repaired:
            df = pd.read_csv(base_dir + "/../raw/repaired_adult.csv")

        if self.verbose:
            print("-number of samples: (before dropping nan rows)", len(df))
        df = df.dropna()  # drop rows with null values
        if self.verbose:
            print("-number of samples: (after dropping nan rows)", len(df))

        return df
示例#8
0
def init_synthetic():
    """Build (or load) a small random binary dataset and fit a decision tree.

    If data/sample.csv exists it is loaded; otherwise a 200x4 random
    0/1 matrix is generated, named col-0..col-2 plus 'target', and
    cached to that file.  Features are one-hot encoded, split 70/30,
    and a decision tree is fit; accuracies are printed.

    Returns:
        (clf, sensitive_attributes, X_train, X_test)
    """
    filename = "data/sample.csv"

    if os.path.isfile(filename):
        dataframe = pd.read_csv(filename)
    else:
        rows, cols = 200, 4
        matrix = np.random.randint(2, size=(rows, cols))
        dataframe = pd.DataFrame.from_records(matrix)
        dataframe.columns = (['col-' + str(i) for i in range(cols - 1)] +
                             ['target'])
        dataframe.to_csv(filename, index=False)

    # Split features from label.
    X = dataframe.drop(['target'], axis=1)
    y = dataframe['target']

    # One-hot encode every feature column.
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # 70/30 train-test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    sensitive_attributes = _get_sensitive_attibutes(['col-0'],
                                                    X_train.columns.to_list())

    clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
    predict_train = clf.predict(X_train)
    predict_test = clf.predict(X_test)

    print("Train accuracy:", metrics.accuracy_score(y_train, predict_train),
          "positive ratio: ", y_train.mean())
    print("Test accuracy:", metrics.accuracy_score(y_test, predict_test),
          "positive ratio: ", y_test.mean())
    print("Train set positive prediction", predict_train.mean())
    print("Test set positive prediction", predict_test.mean())

    return clf, sensitive_attributes, X_train, X_test
示例#9
0
def init(dataset,
         repaired=False,
         verbose=False,
         compute_equalized_odds=False,
         thread=0,
         remove_column=None):
    """Train (or load cached) interpretable CNF rule classifiers (imli)
    over a 5-fold split of the dataset.

    Parameters:
        dataset: object exposing get_df(), name, config,
            continuous_attributes and known_sensitive_attributes.
        repaired: forwarded to dataset.get_df().
        verbose: if True, print the learned rule and accuracy per fold.
        compute_equalized_odds: if True, also return the per-fold
            labels (y_trains, y_tests).
        thread: integer suffix for the temporary working directory so
            concurrent runs do not collide.
        remove_column: optional column name dropped from the dataframe
            (and from dataset.continuous_attributes) before training.

    Returns:
        clfs, X_trains, X_tests, dataset.known_sensitive_attributes
        (plus y_trains, y_tests when compute_equalized_odds is True).
    """

    df = dataset.get_df(repaired=repaired)

    if (remove_column is not None):
        assert isinstance(remove_column, str)
        df = df.drop([remove_column], axis=1)
        if (remove_column in dataset.continuous_attributes):
            dataset.continuous_attributes.remove(remove_column)

    # discretize continuous attributes
    data = utils.get_discretized_df(
        df,
        columns_to_discretize=dataset.continuous_attributes,
        verbose=verbose)

    # get X,y
    X = data.drop(['target'], axis=1)
    y = data['target']

    # one-hot encode every feature column
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list(), verbose=verbose)

    # 5-fold cross-validation with a fixed seed for reproducibility
    skf = KFold(n_splits=5, shuffle=True, random_state=10)
    skf.get_n_splits(X, y)

    X_trains = []
    y_trains = []
    X_tests = []
    y_tests = []
    clfs = []

    os.system("mkdir -p data/model/")
    cnt = 0

    for train, test in skf.split(X, y):

        X_trains.append(X.iloc[train])
        y_trains.append(y.iloc[train])
        X_tests.append(X.iloc[test])
        y_tests.append(y.iloc[test])

        # Cache path encodes dataset name, config and fold index (and
        # the removed column, if any).
        if (remove_column is None):
            store_file = "data/model/CNF_" + dataset.name + "_" + str(
                dataset.config) + "_" + str(cnt) + ".pkl"
        else:
            store_file = "data/model/CNF_" + dataset.name + "_remove_" + remove_column.replace(
                " ", "_") + "_" + str(dataset.config) + "_" + str(cnt) + ".pkl"

        if (not os.path.isfile(store_file)):
            # Train in a per-thread temp dir, then clean it up.
            os.system("mkdir -p data/temp_" + str(thread))
            clf = imli(num_clause=2,
                       data_fidelity=10,
                       work_dir="data/temp_" + str(thread),
                       rule_type="CNF",
                       verbose=False)
            clf.fit(X_trains[-1].values, y_trains[-1].values)
            os.system("rm -r data/temp_" + str(thread))

            # save the classifier
            with open(store_file, 'wb') as fid:
                pickle.dump(clf, fid)

        else:
            # Load the classifier
            with open(store_file, 'rb') as fid:
                clf = pickle.load(fid)

        clfs.append(clf)

        if (verbose):
            print("\nFeatures: ", X_trains[-1].columns.to_list())
            print("Number of features:", len(X_trains[-1].columns.to_list()))
            print("\nlearned rule:")
            print(clf.get_rule(X_trains[-1].columns.to_list()))

        if (verbose):
            print(
                "\nTrain Accuracy Score: ",
                metrics.accuracy_score(clf.predict(X_trains[-1].values),
                                       y_trains[-1].values),
                "positive ratio: ", y_trains[-1].mean())
            print(
                "Test Accuracy Score: ",
                metrics.accuracy_score(clf.predict(X_tests[-1].values),
                                       y_tests[-1].values), "positive ratio: ",
                y_tests[-1].mean())

        cnt += 1

    if (compute_equalized_odds):
        return clfs, X_trains, X_tests, dataset.known_sensitive_attributes, y_trains, y_tests

    return clfs, X_trains, X_tests, dataset.known_sensitive_attributes
示例#10
0
def init(dataset,
         repaired=False,
         verbose=False,
         compute_equalized_odds=False,
         depth=5,
         remove_column=None):
    """Train (or load cached) depth-limited decision trees over a
    5-fold split of the dataset.

    Parameters:
        dataset: object exposing get_df(), name, config,
            categorical_attributes and known_sensitive_attributes.
        repaired: forwarded to dataset.get_df().
        verbose: if True, print per-fold accuracies and positive
            prediction ratios.
        compute_equalized_odds: if True, also return the per-fold
            labels (y_trains, y_tests).
        depth: max_depth of each decision tree; also part of the model
            cache filename.
        remove_column: optional column name dropped from X before
            encoding.

    Returns:
        clfs, X_trains, X_tests, dataset.known_sensitive_attributes
        (plus y_trains, y_tests when compute_equalized_odds is True).
    """

    df = dataset.get_df(repaired=repaired)

    # get X,y
    X = df.drop(['target'], axis=1)
    y = df['target']

    if (remove_column is not None):
        assert isinstance(remove_column, str)
        X = X.drop([remove_column], axis=1)

    # one-hot encode categorical attributes only
    X = utils.get_one_hot_encoded_df(X,
                                     dataset.categorical_attributes,
                                     verbose=verbose)

    # 5-fold cross-validation with a fixed seed for reproducibility
    skf = KFold(n_splits=5, shuffle=True, random_state=10)
    skf.get_n_splits(X, y)

    X_trains = []
    y_trains = []
    X_tests = []
    y_tests = []
    clfs = []

    os.system("mkdir -p data/model/")
    cnt = 0
    for train, test in skf.split(X, y):

        X_trains.append(X.iloc[train])
        y_trains.append(y.iloc[train])
        X_tests.append(X.iloc[test])
        y_tests.append(y.iloc[test])

        # Cache path encodes dataset name, config, depth and fold index
        # (and the removed column, if any).
        if (remove_column is None):
            store_file = "data/model/DT_" + dataset.name + "_" + str(
                dataset.config) + "_" + str(depth) + "_" + str(cnt) + ".pkl"
        else:
            store_file = "data/model/DT_" + dataset.name + "_remove_" + remove_column.replace(
                " ", "_") + "_" + str(dataset.config) + "_" + str(
                    depth) + "_" + str(cnt) + ".pkl"

        if (not os.path.isfile(store_file)):

            clf = tree.DecisionTreeClassifier(max_depth=depth)
            clf.fit(X_trains[-1], y_trains[-1])

            # save the classifier
            with open(store_file, 'wb') as fid:
                pickle.dump(clf, fid)

        else:
            # Load the classifier
            with open(store_file, 'rb') as fid:
                clf = pickle.load(fid)

        clfs.append(clf)

        predict_train = clf.predict(X_trains[-1])
        predict_test = clf.predict(X_tests[-1])

        if (verbose):
            print("\nTrain accuracy:",
                  metrics.accuracy_score(y_trains[-1], predict_train),
                  "positive ratio: ", y_trains[-1].mean())
            print("Test accuracy:",
                  metrics.accuracy_score(y_tests[-1], predict_test),
                  "positive ratio: ", y_tests[-1].mean())
            print("Train set positive prediction", predict_train.mean())
            print("Test set positive prediction", predict_test.mean())

        cnt += 1

    if (compute_equalized_odds):
        return clfs, X_trains, X_tests, dataset.known_sensitive_attributes, y_trains, y_tests

    return clfs, X_trains, X_tests, dataset.known_sensitive_attributes
示例#11
0
def init_iris():
    """Prepare a binarized Iris dataset and fit a decision tree.

    Class 2 of the Iris target is merged into class 0 so the task is
    binary.  The first column is treated as the sensitive attribute: it
    is discretized, one-hot encoded, and its encoded column names are
    collected into a 2d list (one inner list per sensitive attribute).

    Returns:
        (clf, feature_names, sensitive_attributes, X_train, X_test)
    """

    # dataset.data is a np matrix,
    # dataset.target is a np array
    # dataset['features'] is the list of features in the original dataset

    # prepare iris dataset for binary classification
    target = "target"
    dataset = sklearn.datasets.load_iris()
    # merge class 2 into class 0 -> binary labels {0, 1}
    dataset[target] = np.where(dataset[target] == 2, 0, dataset[target])

    # get df (note: `dataset` is rebound from the sklearn bunch to a
    # dataframe here)
    dataset = utils.sklearn_to_df(dataset)

    # index of the single column treated as sensitive
    index_of_sensitive_features = 0

    # discretize sensitive attributes
    data = utils.get_discretized_df(
        dataset,
        columns_to_discretize=[
            dataset.columns.to_list()[index_of_sensitive_features]
        ])

    # get X,y
    X = data.drop(['target'], axis=1)
    y = data['target']

    # one-hot
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # split into train_test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    # Extract new names of sensitive attributes
    _sensitive_attributes = {
    }  # it is a map because each entry contains all one-hot encoded variables
    # NOTE(review): the right-hand side of these `in` checks is a single
    # column-name *string*, so `in` performs a substring test rather
    # than an equality/membership test — presumably an exact or prefix
    # match was intended; verify against the column naming produced by
    # utils.get_one_hot_encoded_df.
    for _column in X_train.columns.to_list():
        if ("_" in _column and _column.split("_")[0]
                in dataset.columns.to_list()[index_of_sensitive_features]):
            if (_column.split("_")[0] not in _sensitive_attributes):
                _sensitive_attributes[_column.split("_")[0]] = [_column]
            else:
                _sensitive_attributes[_column.split("_")[0]].append(_column)
        elif (_column
              in dataset.columns.to_list()[index_of_sensitive_features]):
            if (_column not in _sensitive_attributes):
                _sensitive_attributes[_column] = [_column]
            else:
                _sensitive_attributes[_column].append(_column)

    # Finally make a 2d list
    sensitive_attributes = []
    for key in _sensitive_attributes:
        sensitive_attributes.append(_sensitive_attributes[key])

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    predict_train = clf.predict(X_train)
    predict_test = clf.predict(X_test)

    print("Train accuracy:", metrics.accuracy_score(y_train, predict_train),
          "positive ratio: ", y_train.mean())
    print("Test accuracy:", metrics.accuracy_score(y_test, predict_test),
          "positive ratio: ", y_test.mean())
    print("Train set positive prediction", predict_train.mean())
    print("Test set positive prediction", predict_test.mean())

    return clf, X_train.columns.to_list(
    ), sensitive_attributes, X_train, X_test
示例#12
0
def init(dataset,
         classifier="lr",
         repaired=False,
         verbose=False,
         compute_equalized_odds=False,
         remove_column=None):
    """Train (or load cached) linear classifiers over a 5-fold split.

    Parameters:
        dataset: object exposing get_df(), name, config,
            categorical_attributes and known_sensitive_attributes.
        classifier: "lr" (logistic regression) or "svm-linear"
            (linear-kernel SVC); any other value raises ValueError.
        repaired: forwarded to dataset.get_df().
        verbose: if True, print features, weights, bias and accuracy
            per fold (assumes the fitted model exposes coef_ and
            intercept_, which both supported classifiers do).
        compute_equalized_odds: if True, also return the per-fold
            labels (y_trains, y_tests).
        remove_column: optional column name dropped from X before
            encoding.

    Returns:
        clfs, X_trains, X_tests, dataset.known_sensitive_attributes
        (plus y_trains, y_tests when compute_equalized_odds is True).
    """
    df = dataset.get_df(repaired=repaired)

    # discretize
    # df =  utils.get_discretized_df(df, columns_to_discretize=dataset.continuous_attributes)

    # get X,y
    X = df.drop(['target'], axis=1)
    y = df['target']

    if (remove_column is not None):
        assert isinstance(remove_column, str)
        X = X.drop([remove_column], axis=1)

    # one-hot encode the categorical attributes only
    X = utils.get_one_hot_encoded_df(X, dataset.categorical_attributes)
    # X = utils.get_one_hot_encoded_df(X,X.columns.to_list())

    # 5-fold cross-validation with a fixed seed for reproducibility
    skf = KFold(n_splits=5, shuffle=True, random_state=10)
    skf.get_n_splits(X, y)

    X_trains = []
    y_trains = []
    X_tests = []
    y_tests = []
    clfs = []

    cnt = 0
    os.system("mkdir -p data/model/")

    for train, test in skf.split(X, y):

        X_trains.append(X.iloc[train])
        y_trains.append(y.iloc[train])
        X_tests.append(X.iloc[test])
        y_tests.append(y.iloc[test])

        clf = None

        if (classifier == "lr"):
            # Cache path encodes classifier type, dataset name, config
            # and fold index (and the removed column, if any).
            if (remove_column is None):
                store_file = "data/model/LR_" + dataset.name + "_" + str(
                    dataset.config) + "_" + str(cnt) + ".pkl"
            else:
                store_file = "data/model/LR_" + dataset.name + "_remove_" + remove_column.replace(
                    " ", "_") + "_" + str(
                        dataset.config) + "_" + str(cnt) + ".pkl"

            if (not os.path.isfile(store_file)):
                #  For linear classifier, we use Logistic regression model of sklearn
                clf = LogisticRegression(class_weight='balanced',
                                         solver='liblinear',
                                         random_state=0)
                clf.fit(X_trains[-1], y_trains[-1])

                # save the classifier
                with open(store_file, 'wb') as fid:
                    pickle.dump(clf, fid)

            else:
                # Load the classifier
                with open(store_file, 'rb') as fid:
                    clf = pickle.load(fid)

        elif (classifier == "svm-linear"):
            if (remove_column is None):
                store_file = "data/model/SVM_" + dataset.name + "_" + str(
                    dataset.config) + "_" + str(cnt) + ".pkl"
            else:
                store_file = "data/model/SVM_" + dataset.name + "_remove_" + remove_column.replace(
                    " ", "_") + "_" + str(
                        dataset.config) + "_" + str(cnt) + ".pkl"
            if (not os.path.isfile(store_file)):
                # Linear-kernel support vector classifier
                clf = SVC(kernel="linear")
                clf.fit(X_trains[-1], y_trains[-1])

                # save the classifier
                with open(store_file, 'wb') as fid:
                    pickle.dump(clf, fid)

            else:
                # Load the classifier
                with open(store_file, 'rb') as fid:
                    clf = pickle.load(fid)

        else:
            raise ValueError(classifier)

        clfs.append(clf)

        if (verbose):
            print("\nFeatures: ", X_trains[-1].columns.to_list())
            print("Number of features:", len(X_trains[-1].columns.to_list()))
            print("\nWeights: ", clf.coef_[0])
            print("\nBias:", clf.intercept_[0])
            assert len(clf.coef_[0]) == len(
                X_trains[-1].columns
            ), "Error: wrong dimension of features and weights"

            print("Train Accuracy Score: ",
                  clf.score(X_trains[-1], y_trains[-1]), "positive ratio: ",
                  y_trains[-1].mean())
            print("Test Accuracy Score: ", clf.score(X_tests[-1], y_tests[-1]),
                  "positive ratio: ", y_tests[-1].mean())

        cnt += 1

    if (compute_equalized_odds):
        return clfs, X_trains, X_tests, dataset.known_sensitive_attributes, y_trains, y_tests

    return clfs, X_trains, X_tests, dataset.known_sensitive_attributes