def get_df(self, repaired=False):
    df = pd.read_csv(os.path.dirname(os.path.realpath(__file__)) +
                     "/../raw/bank-additional-full.csv", sep=";")
    df.rename(columns={'y': 'target'}, inplace=True)

    assert len(self.categorical_attributes) + len(self.continuous_attributes) == len(df.columns), \
        "Error in classifying columns: " + str(len(self.categorical_attributes) +
                                               len(self.continuous_attributes)) + " " + str(len(df.columns))

    # scale continuous attributes to [0, 1]
    scaler = MinMaxScaler()
    df[self.continuous_attributes] = scaler.fit_transform(df[self.continuous_attributes])

    self.keep_columns = list(df.columns)

    # discretize and one-hot encode continuous sensitive attributes
    for known_sensitive_attribute in self.known_sensitive_attributes:
        if known_sensitive_attribute in self.continuous_attributes:
            df = utils.get_discretized_df(df, columns_to_discretize=[known_sensitive_attribute])
            df = utils.get_one_hot_encoded_df(df, [known_sensitive_attribute])
            self.continuous_attributes.remove(known_sensitive_attribute)

    if self.verbose:
        print("-number of samples: (before dropping nan rows)", len(df))

    # drop rows with null values
    df = df.dropna()

    if self.verbose:
        print("-number of samples: (after dropping nan rows)", len(df))

    return df
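# All of the get_df loaders in this section lean on two helpers from utils.
# Their real implementations live elsewhere in the repo; what follows is only
# a minimal sketch of the assumed behavior (quantile binning via pd.qcut and
# one-hot encoding via pd.get_dummies), included so the data flow above is
# easy to follow. The _sketch suffix marks these as hypothetical stand-ins.
import pandas as pd

def get_discretized_df_sketch(df, columns_to_discretize, bins=4):
    # replace each listed continuous column with integer bin labels
    df = df.copy()
    for column in columns_to_discretize:
        df[column] = pd.qcut(df[column], q=bins, labels=False, duplicates='drop')
    return df

def get_one_hot_encoded_df_sketch(df, columns_to_encode):
    # expand each listed column into 0/1 indicator columns named <col>_<value>
    return pd.get_dummies(df, columns=columns_to_encode, prefix_sep='_')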
def get_df(self, repaired=False):
    df = pd.read_csv(self.filename)

    assert len(self.categorical_attributes) + len(self.continuous_attributes) == len(df.columns), \
        "Error in classifying columns"

    self.keep_columns = list(df.columns)

    # discretize and one-hot encode continuous sensitive attributes
    for known_sensitive_attribute in self.known_sensitive_attributes:
        if known_sensitive_attribute in self.continuous_attributes:
            df = utils.get_discretized_df(df, columns_to_discretize=[known_sensitive_attribute])
            df = utils.get_one_hot_encoded_df(df, [known_sensitive_attribute])
            self.continuous_attributes.remove(known_sensitive_attribute)

    # scale
    scaler = MinMaxScaler()
    df[self.continuous_attributes] = scaler.fit_transform(df[self.continuous_attributes])

    # binarize the target label
    df['target'] = df['target'].map({'good': 1, 'bad': 0})

    # df.to_csv("data/raw/reduced_german.csv", index=False)
    # if repaired:
    #     df = pd.read_csv("data/raw/repaired_german.csv")

    if self.verbose:
        print("-number of samples: (before dropping nan rows)", len(df))

    # drop rows with null values
    df = df.dropna()

    if self.verbose:
        print("-number of samples: (after dropping nan rows)", len(df))

    return df
def get_df(self, repaired=False):
    df = pd.read_csv(self.filename)
    df = df[self.keep_columns]

    assert len(self.categorical_attributes) + len(self.continuous_attributes) == len(df.columns), \
        "Error in classifying columns: " + str(len(self.categorical_attributes) +
                                               len(self.continuous_attributes)) + " " + str(len(df.columns))

    for known_sensitive_attribute in self.known_sensitive_attributes:
        if known_sensitive_attribute in self.continuous_attributes:
            df = utils.get_discretized_df(df, columns_to_discretize=[known_sensitive_attribute])
            df = utils.get_one_hot_encoded_df(df, [known_sensitive_attribute])
            self.continuous_attributes.remove(known_sensitive_attribute)

    # scale
    scaler = MinMaxScaler()
    df[self.continuous_attributes] = scaler.fit_transform(df[self.continuous_attributes])

    df.rename(columns={'two_year_recid': 'target'}, inplace=True)
    self.keep_columns.remove('two_year_recid')
    self.keep_columns.append("target")

    if self.verbose:
        print("-number of samples: (before dropping nan rows)", len(df))

    # drop rows with null values
    df = df.dropna()

    if self.verbose:
        print("-number of samples: (after dropping nan rows)", len(df))

    return df
def init_synthetic():
    filename = "data/sample.csv"
    if os.path.isfile(filename):
        dataframe = pd.read_csv(filename)
    else:
        # generate a random binary dataset and cache it
        cols = 10
        rows = 200
        matrix = np.random.randint(2, size=(rows, cols))
        dataframe = pd.DataFrame.from_records(matrix)
        dataframe.columns = ['col-' + str(i) for i in range(cols - 1)] + ['target']
        dataframe.to_csv(filename, index=False)

    # get X, y
    X = dataframe.drop(['target'], axis=1)
    y = dataframe['target']

    # one-hot
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # split into train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    known_sensitive_attributes = ['col-0']
    attributes, sensitive_attributes, probs = utils.get_statistics_from_df(X_train, known_sensitive_attributes)

    # For a linear classifier, we use the logistic regression model of sklearn
    clf = LogisticRegression(random_state=0)
    clf = clf.fit(X_train, y_train)

    print("\nFeatures: ", X_train.columns.to_list())
    print("\nWeights: ", clf.coef_)
    print("\nBias:", clf.intercept_[0])

    assert len(clf.coef_[0]) == len(X_train.columns), "Error: wrong dimension of features and weights"

    # print("Train Accuracy Score: ", clf.score(X_train, y_train), "positive ratio: ", y_train.mean())
    # print("Test Accuracy Score: ", clf.score(X_test, y_test), "positive ratio: ", y_test.mean())

    predict_train = clf.predict(X_train)
    predict_test = clf.predict(X_test)
    print("Train accuracy:", metrics.accuracy_score(y_train, predict_train), "positive ratio: ", y_train.mean())
    print("Test accuracy:", metrics.accuracy_score(y_test, predict_test), "positive ratio: ", y_test.mean())
    print("Train set positive prediction", predict_train.mean())
    print("Test set positive prediction", predict_test.mean())
    print()

    return clf.coef_[0], clf.intercept_[0], attributes, sensitive_attributes, probs
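# Usage sketch (illustrative, not part of the original module): the weights and
# bias returned by init_synthetic() define the linear decision function, so a
# prediction for a 0/1 feature vector x can be recovered as sign(w . x + b),
# matching sklearn's LogisticRegression.predict. The all-zeros instance below
# is a made-up example input.
import numpy as np

weights, bias, attributes, sensitive_attributes, probs = init_synthetic()
x = np.zeros(len(weights))  # hypothetical instance, one value per feature
predicted_positive = np.dot(weights, x) + bias > 0
print("positive prediction:", predicted_positive)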
def init_iris(): """ Returns weights, bias, features (including sensitive features), and sensitive features """ # loading dataset target = "target" dataset = load_iris() dataset[target] = np.where(dataset[target] == 2, 0, dataset[target]) # get df data_df = utils.sklearn_to_df(dataset) # discretize data = utils.get_discretized_df( data_df, columns_to_discretize=data_df.columns.to_list()) # get X,y X = data.drop(['target'], axis=1) y = data['target'] # one-hot X = utils.get_one_hot_encoded_df(X, X.columns.to_list()) # split into train_test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) known_sensitive_attributes = [['sepal length (cm)_1']] attributes, sensitive_attributes, probs = utils.get_statistics_from_df( X_train, known_sensitive_attributes) # For linear classifier, we use Logistic regression model of sklearn clf = LogisticRegression(random_state=0) clf = clf.fit(X_train, y_train) print("\nFeatures: ", X_train.columns.to_list()) print("\nWeights: ", clf.coef_) print("\nBias:", clf.intercept_[0]) assert len(clf.coef_[0]) == len( X_train.columns), "Error: wrong dimension of features and weights" print("Train Accuracy Score: ", clf.score(X_train, y_train), "positive ratio: ", y_train.mean()) print("Test Accuracy Score: ", clf.score(X_test, y_test), "positive ratio: ", y_test.mean()) return clf.coef_[0], clf.intercept_[ 0], attributes, sensitive_attributes, probs
def get_df(self, repaired=False):
    df = pd.read_csv(self.filename)
    # column names follow the raw file; the misspellings ('spouce', 'childred')
    # are kept as-is since self.ignore_columns may refer to them elsewhere
    df.columns = [
        'passenger class', 'name', 'sex', 'age', 'siblings or spouce aboard',
        'parents or childred aboard', 'ticket', 'fare', 'cabin', 'embarked',
        'boat', 'body', 'home destination', 'target'
    ]
    df = df.drop(self.ignore_columns, axis=1)

    if self.verbose:
        print("-number of samples: (before dropping nan rows)", len(df))

    # drop rows with null values
    df = df.dropna()

    if self.verbose:
        print("-number of samples: (after dropping nan rows)", len(df))

    assert len(self.categorical_attributes) + len(self.continuous_attributes) == len(df.columns), \
        str(len(self.categorical_attributes)) + " " + str(len(self.continuous_attributes)) + \
        " " + str(len(df.columns))

    self.keep_columns = list(df.columns)

    # scale
    scaler = MinMaxScaler()
    df[self.continuous_attributes] = scaler.fit_transform(df[self.continuous_attributes])

    # discretize and one-hot encode continuous sensitive attributes
    for known_sensitive_attribute in self.known_sensitive_attributes:
        if known_sensitive_attribute in self.continuous_attributes:
            df = utils.get_discretized_df(df, columns_to_discretize=[known_sensitive_attribute])
            df = utils.get_one_hot_encoded_df(df, [known_sensitive_attribute])
            self.continuous_attributes.remove(known_sensitive_attribute)

    # df['sex'] = df['sex'].map({'female': 0, 'male': 1})

    df.to_csv(os.path.dirname(os.path.realpath(__file__)) + "/../raw/reduced_titanic.csv", index=False)

    if repaired:
        df = pd.read_csv(os.path.dirname(os.path.realpath(__file__)) + "/../raw/repaired_titanic.csv")

    return df
def get_df(self, repaired=False):
    df = pd.read_csv(self.filename)

    # scale
    scaler = MinMaxScaler()
    df[self.continuous_attributes] = scaler.fit_transform(df[self.continuous_attributes])

    df = df[self.keep_columns]

    # discretize and one-hot encode continuous sensitive attributes
    for known_sensitive_attribute in self.known_sensitive_attributes:
        if known_sensitive_attribute in self.continuous_attributes:
            df = utils.get_discretized_df(df, columns_to_discretize=[known_sensitive_attribute])
            df = utils.get_one_hot_encoded_df(df, [known_sensitive_attribute])
            self.continuous_attributes.remove(known_sensitive_attribute)

    # binarize the income label and rename it to 'target'
    df['income-per-year'] = df['income-per-year'].map({'<=50K': 0, '>50K': 1})
    df.rename(columns={'income-per-year': 'target'}, inplace=True)
    self.keep_columns.remove('income-per-year')
    self.keep_columns.append("target")

    # df['race'] = df['race'].map({'White': 'White', 'Black': 'Others', 'Asian-Pac-Islander': 'Others',
    #                              'Amer-Indian-Eskimo': 'Others', 'Other': 'Others'})

    df.to_csv(os.path.dirname(os.path.realpath(__file__)) + "/../raw/reduced_adult.csv", index=False)

    if repaired:
        df = pd.read_csv(os.path.dirname(os.path.realpath(__file__)) + "/../raw/repaired_adult.csv")

    if self.verbose:
        print("-number of samples: (before dropping nan rows)", len(df))

    # drop rows with null values
    df = df.dropna()

    if self.verbose:
        print("-number of samples: (after dropping nan rows)", len(df))

    return df
def init_synthetic():
    filename = "data/sample.csv"
    if os.path.isfile(filename):
        dataframe = pd.read_csv(filename)
    else:
        # generate a random binary dataset and cache it
        cols = 4
        rows = 200
        matrix = np.random.randint(2, size=(rows, cols))
        dataframe = pd.DataFrame.from_records(matrix)
        dataframe.columns = ['col-' + str(i) for i in range(cols - 1)] + ['target']
        dataframe.to_csv(filename, index=False)

    # get X, y
    X = dataframe.drop(['target'], axis=1)
    y = dataframe['target']

    # one-hot
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # split into train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # helper name (including its spelling) kept as defined elsewhere in the module
    sensitive_attributes = _get_sensitive_attibutes(['col-0'], X_train.columns.to_list())

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    predict_train = clf.predict(X_train)
    predict_test = clf.predict(X_test)
    print("Train accuracy:", metrics.accuracy_score(y_train, predict_train), "positive ratio: ", y_train.mean())
    print("Test accuracy:", metrics.accuracy_score(y_test, predict_test), "positive ratio: ", y_test.mean())
    print("Train set positive prediction", predict_train.mean())
    print("Test set positive prediction", predict_test.mean())

    return clf, sensitive_attributes, X_train, X_test
def init(dataset, repaired=False, verbose=False, compute_equalized_odds=False,
         thread=0, remove_column=None):
    df = dataset.get_df(repaired=repaired)

    if remove_column is not None:
        assert isinstance(remove_column, str)
        df = df.drop([remove_column], axis=1)
        if remove_column in dataset.continuous_attributes:
            dataset.continuous_attributes.remove(remove_column)

    # discretize
    data = utils.get_discretized_df(df, columns_to_discretize=dataset.continuous_attributes, verbose=verbose)

    # get X, y
    X = data.drop(['target'], axis=1)
    y = data['target']

    # one-hot
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list(), verbose=verbose)

    # 5-fold cross validation
    skf = KFold(n_splits=5, shuffle=True, random_state=10)
    skf.get_n_splits(X, y)

    X_trains = []
    y_trains = []
    X_tests = []
    y_tests = []
    clfs = []
    clf_negs = []

    os.system("mkdir -p data/model/")
    cnt = 0
    for train, test in skf.split(X, y):
        X_trains.append(X.iloc[train])
        y_trains.append(y.iloc[train])
        X_tests.append(X.iloc[test])
        y_tests.append(y.iloc[test])

        if remove_column is None:
            store_file = "data/model/CNF_" + dataset.name + "_" + str(dataset.config) + "_" + str(cnt) + ".pkl"
        else:
            store_file = "data/model/CNF_" + dataset.name + "_remove_" + \
                remove_column.replace(" ", "_") + "_" + str(dataset.config) + "_" + str(cnt) + ".pkl"

        if not os.path.isfile(store_file):
            os.system("mkdir -p data/temp_" + str(thread))
            clf = imli(num_clause=2, data_fidelity=10, work_dir="data/temp_" + str(thread),
                       rule_type="CNF", verbose=False)
            clf.fit(X_trains[-1].values, y_trains[-1].values)
            os.system("rm -r data/temp_" + str(thread))
            # save the classifier
            with open(store_file, 'wb') as fid:
                pickle.dump(clf, fid)
        else:
            # load the classifier
            with open(store_file, 'rb') as fid:
                clf = pickle.load(fid)

        clfs.append(clf)

        if verbose:
            print("\nFeatures: ", X_trains[-1].columns.to_list())
            print("Number of features:", len(X_trains[-1].columns.to_list()))
            print("\nlearned rule:")
            print(clf.get_rule(X_trains[-1].columns.to_list()))

        if verbose:
            print("\nTrain Accuracy Score: ",
                  metrics.accuracy_score(clf.predict(X_trains[-1].values), y_trains[-1].values),
                  "positive ratio: ", y_trains[-1].mean())
            print("Test Accuracy Score: ",
                  metrics.accuracy_score(clf.predict(X_tests[-1].values), y_tests[-1].values),
                  "positive ratio: ", y_tests[-1].mean())

        cnt += 1

    if compute_equalized_odds:
        return clfs, X_trains, X_tests, dataset.known_sensitive_attributes, y_trains, y_tests

    return clfs, X_trains, X_tests, dataset.known_sensitive_attributes
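# Usage sketch (illustrative, not part of the original module): averaging test
# accuracy of the learned CNF rules over the five folds. `dataset` is assumed
# to be one of the dataset objects whose get_df methods appear above (i.e., it
# exposes get_df, name, config, continuous_attributes, and
# known_sensitive_attributes).
import numpy as np
from sklearn import metrics

clfs, X_trains, X_tests, sensitive, y_trains, y_tests = init(
    dataset, verbose=False, compute_equalized_odds=True)
fold_accuracies = [
    metrics.accuracy_score(y_tests[i], clfs[i].predict(X_tests[i].values))
    for i in range(len(clfs))
]
print("mean test accuracy over folds:", np.mean(fold_accuracies))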
def init(dataset, repaired=False, verbose=False, compute_equalized_odds=False,
         depth=5, remove_column=None):
    df = dataset.get_df(repaired=repaired)

    # get X, y
    X = df.drop(['target'], axis=1)
    y = df['target']

    if remove_column is not None:
        assert isinstance(remove_column, str)
        X = X.drop([remove_column], axis=1)

    # one-hot
    X = utils.get_one_hot_encoded_df(X, dataset.categorical_attributes, verbose=verbose)

    # 5-fold cross validation
    skf = KFold(n_splits=5, shuffle=True, random_state=10)
    skf.get_n_splits(X, y)

    X_trains = []
    y_trains = []
    X_tests = []
    y_tests = []
    clfs = []

    os.system("mkdir -p data/model/")
    cnt = 0
    for train, test in skf.split(X, y):
        X_trains.append(X.iloc[train])
        y_trains.append(y.iloc[train])
        X_tests.append(X.iloc[test])
        y_tests.append(y.iloc[test])

        if remove_column is None:
            store_file = "data/model/DT_" + dataset.name + "_" + str(dataset.config) + \
                "_" + str(depth) + "_" + str(cnt) + ".pkl"
        else:
            store_file = "data/model/DT_" + dataset.name + "_remove_" + \
                remove_column.replace(" ", "_") + "_" + str(dataset.config) + \
                "_" + str(depth) + "_" + str(cnt) + ".pkl"

        if not os.path.isfile(store_file):
            clf = tree.DecisionTreeClassifier(max_depth=depth)
            clf.fit(X_trains[-1], y_trains[-1])
            tree_preds = clf.predict_proba(X_tests[-1])[:, 1]  # currently unused
            # save the classifier
            with open(store_file, 'wb') as fid:
                pickle.dump(clf, fid)
        else:
            # load the classifier
            with open(store_file, 'rb') as fid:
                clf = pickle.load(fid)

        clfs.append(clf)

        # clf = tree.DecisionTreeClassifier()
        # clf = clf.fit(X_train, y_train)

        predict_train = clf.predict(X_trains[-1])
        predict_test = clf.predict(X_tests[-1])

        if verbose:
            print("\nTrain accuracy:", metrics.accuracy_score(y_trains[-1], predict_train),
                  "positive ratio: ", y_trains[-1].mean())
            print("Test accuracy:", metrics.accuracy_score(y_tests[-1], predict_test),
                  "positive ratio: ", y_tests[-1].mean())
            print("Train set positive prediction", predict_train.mean())
            print("Test set positive prediction", predict_test.mean())

        cnt += 1

    if compute_equalized_odds:
        return clfs, X_trains, X_tests, dataset.known_sensitive_attributes, y_trains, y_tests

    return clfs, X_trains, X_tests, dataset.known_sensitive_attributes
def init_iris():
    # dataset.data is a np matrix,
    # dataset.target is a np array,
    # dataset['feature_names'] is the list of features in the original dataset

    # prepare the iris dataset for binary classification
    target = "target"
    dataset = sklearn.datasets.load_iris()
    dataset[target] = np.where(dataset[target] == 2, 0, dataset[target])

    # get df
    dataset = utils.sklearn_to_df(dataset)

    index_of_sensitive_features = 0

    # discretize sensitive attributes
    data = utils.get_discretized_df(
        dataset, columns_to_discretize=[dataset.columns.to_list()[index_of_sensitive_features]])

    # get X, y
    X = data.drop(['target'], axis=1)
    y = data['target']

    # one-hot
    X = utils.get_one_hot_encoded_df(X, X.columns.to_list())

    # split into train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # Extract the new names of sensitive attributes.
    # It is a map because each entry contains all one-hot encoded variables.
    _sensitive_attributes = {}
    for _column in X_train.columns.to_list():
        if "_" in _column and _column.split("_")[0] in dataset.columns.to_list()[index_of_sensitive_features]:
            if _column.split("_")[0] not in _sensitive_attributes:
                _sensitive_attributes[_column.split("_")[0]] = [_column]
            else:
                _sensitive_attributes[_column.split("_")[0]].append(_column)
        elif _column in dataset.columns.to_list()[index_of_sensitive_features]:
            if _column not in _sensitive_attributes:
                _sensitive_attributes[_column] = [_column]
            else:
                _sensitive_attributes[_column].append(_column)

    # finally make a 2d list
    sensitive_attributes = []
    for key in _sensitive_attributes:
        sensitive_attributes.append(_sensitive_attributes[key])

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    predict_train = clf.predict(X_train)
    predict_test = clf.predict(X_test)
    print("Train accuracy:", metrics.accuracy_score(y_train, predict_train), "positive ratio: ", y_train.mean())
    print("Test accuracy:", metrics.accuracy_score(y_test, predict_test), "positive ratio: ", y_test.mean())
    print("Train set positive prediction", predict_train.mean())
    print("Test set positive prediction", predict_test.mean())

    return clf, X_train.columns.to_list(), sensitive_attributes, X_train, X_test
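# Illustration (hypothetical column names): the grouping loop above collects
# the one-hot columns of a single sensitive attribute into one list, so that
# downstream fairness code can treat them as mutually exclusive categories.
# Below is a simplified, self-contained version of the same idea.
columns = ['sepal length (cm)_0', 'sepal length (cm)_1', 'petal width (cm)_2']
groups = {}
for column in columns:
    prefix = column.rsplit('_', 1)[0]          # attribute name without the bin suffix
    groups.setdefault(prefix, []).append(column)
print(list(groups.values()))
# -> [['sepal length (cm)_0', 'sepal length (cm)_1'], ['petal width (cm)_2']]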
def init(dataset, classifier="lr", repaired=False, verbose=False, compute_equalized_odds=False, remove_column=None): df = dataset.get_df(repaired=repaired) # discretize # df = utils.get_discretized_df(df, columns_to_discretize=dataset.continuous_attributes) # get X,y X = df.drop(['target'], axis=1) y = df['target'] if (remove_column is not None): assert isinstance(remove_column, str) X = X.drop([remove_column], axis=1) # one-hot X = utils.get_one_hot_encoded_df(X, dataset.categorical_attributes) # X = utils.get_one_hot_encoded_df(X,X.columns.to_list()) skf = KFold(n_splits=5, shuffle=True, random_state=10) skf.get_n_splits(X, y) X_trains = [] y_trains = [] X_tests = [] y_tests = [] clfs = [] cnt = 0 os.system("mkdir -p data/model/") for train, test in skf.split(X, y): X_trains.append(X.iloc[train]) y_trains.append(y.iloc[train]) X_tests.append(X.iloc[test]) y_tests.append(y.iloc[test]) clf = None if (classifier == "lr"): if (remove_column is None): store_file = "data/model/LR_" + dataset.name + "_" + str( dataset.config) + "_" + str(cnt) + ".pkl" else: store_file = "data/model/LR_" + dataset.name + "_remove_" + remove_column.replace( " ", "_") + "_" + str( dataset.config) + "_" + str(cnt) + ".pkl" if (not os.path.isfile(store_file)): # For linear classifier, we use Logistic regression model of sklearn clf = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=0) clf.fit(X_trains[-1], y_trains[-1]) # save the classifier with open(store_file, 'wb') as fid: pickle.dump(clf, fid) else: # Load the classifier with open(store_file, 'rb') as fid: clf = pickle.load(fid) elif (classifier == "svm-linear"): if (remove_column is None): store_file = "data/model/SVM_" + dataset.name + "_" + str( dataset.config) + "_" + str(cnt) + ".pkl" else: store_file = "data/model/SVM_" + dataset.name + "_remove_" + remove_column.replace( " ", "_") + "_" + str( dataset.config) + "_" + str(cnt) + ".pkl" if (not os.path.isfile(store_file)): # For linear classifier, we use Logistic regression model of sklearn clf = SVC(kernel="linear") clf.fit(X_trains[-1], y_trains[-1]) # save the classifier with open(store_file, 'wb') as fid: pickle.dump(clf, fid) else: # Load the classifier with open(store_file, 'rb') as fid: clf = pickle.load(fid) else: raise ValueError(classifier) clfs.append(clf) if (verbose): print("\nFeatures: ", X_trains[-1].columns.to_list()) print("Number of features:", len(X_trains[-1].columns.to_list())) print("\nWeights: ", clf.coef_[0]) print("\nBias:", clf.intercept_[0]) assert len(clf.coef_[0]) == len( X_trains[-1].columns ), "Error: wrong dimension of features and weights" print("Train Accuracy Score: ", clf.score(X_trains[-1], y_trains[-1]), "positive ratio: ", y_trains[-1].mean()) print("Test Accuracy Score: ", clf.score(X_tests[-1], y_tests[-1]), "positive ratio: ", y_tests[-1].mean()) cnt += 1 if (compute_equalized_odds): return clfs, X_trains, X_tests, dataset.known_sensitive_attributes, y_trains, y_tests return clfs, X_trains, X_tests, dataset.known_sensitive_attributes