def test_predict_different_argument_lengths(sensitive_features,
                                            sensitive_feature_names,
                                            X_transform, y_transform,
                                            sensitive_features_transform,
                                            constraints):
    X = X_transform(_format_as_list_of_lists(sensitive_features))
    y = y_transform(labels_ex)
    sensitive_features_ = sensitive_features_transform(sensitive_features)
    adjusted_predictor = ThresholdOptimizer(
        unconstrained_predictor=ExamplePredictor(),
        constraints=constraints)
    adjusted_predictor.fit(X, y, sensitive_features=sensitive_features_)

    with pytest.raises(ValueError,
                       match=DIFFERENT_INPUT_LENGTH_ERROR_MESSAGE.format(
                           "X and sensitive_features")):
        adjusted_predictor.predict(
            X,
            sensitive_features=sensitive_features_transform(
                sensitive_features[:-1]))

    with pytest.raises(ValueError,
                       match=DIFFERENT_INPUT_LENGTH_ERROR_MESSAGE.format(
                           "X and sensitive_features")):
        adjusted_predictor.predict(
            X_transform(_format_as_list_of_lists(sensitive_features))[:-1],
            sensitive_features=sensitive_features_)
def test_random_state_threshold_optimizer():
    """Test that the random_state argument works as expected.

    This test case reproduces the problem reported in issue 588 if the
    random_state does not work as intended within ThresholdOptimizer.
    https://github.com/fairlearn/fairlearn/issues/588
    """
    X_train, X_test, y_train, y_test, race_train, race_test = _get_test_data()

    # Train a simple logistic regression model
    lr = LogisticRegression(max_iter=1000, random_state=0)
    lr.fit(X_train, y_train)

    # Train threshold optimizer
    to = ThresholdOptimizer(estimator=lr, constraints='equalized_odds',
                            grid_size=1000)
    to.fit(X_train, y_train, sensitive_features=race_train)

    # The same random_state must always yield identical predictions,
    # while a different random_state should yield different ones.
    y_pred_test = to.predict(X_test, sensitive_features=race_test,
                             random_state=0)
    for _ in range(100):
        assert (y_pred_test == to.predict(
            X_test, sensitive_features=race_test, random_state=0)).all()
    assert (y_pred_test != to.predict(
        X_test, sensitive_features=race_test, random_state=1)).any()
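# The test above relies on a _get_test_data helper that is not shown in this
# section. A hypothetical, minimal sketch of such a helper (the synthetic
# data, column names, and split sizes are assumptions for illustration only):
import numpy as np
import pandas as pd


def _get_test_data():
    rng = np.random.default_rng(42)
    n = 400
    X = pd.DataFrame({"f1": rng.normal(size=n), "f2": rng.normal(size=n)})
    # Labels loosely correlated with f1 so a logistic regression can learn.
    y = pd.Series((X["f1"] + 0.5 * rng.normal(size=n) > 0).astype(int))
    race = pd.Series(rng.choice(["group_a", "group_b"], size=n))
    # Return X_train, X_test, y_train, y_test, race_train, race_test.
    return (X.iloc[:300], X.iloc[300:], y.iloc[:300], y.iloc[300:],
            race.iloc[:300], race.iloc[300:])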
def test_predict_different_argument_lengths(data_X_y_sf, constraints):
    adjusted_predictor = ThresholdOptimizer(
        estimator=ExamplePredictor(scores_ex),
        constraints=constraints,
        predict_method="predict",
    )
    adjusted_predictor.fit(data_X_y_sf.X, data_X_y_sf.y,
                           sensitive_features=data_X_y_sf.sensitive_features)

    with pytest.raises(
            ValueError,
            match="Found input variables with inconsistent numbers of samples"):
        adjusted_predictor.predict(
            data_X_y_sf.X,
            sensitive_features=data_X_y_sf.sensitive_features[:-1])

    with pytest.raises(
            ValueError,
            match="Found input variables with inconsistent numbers of samples"):
        adjusted_predictor.predict(
            data_X_y_sf.X[:-1],
            sensitive_features=data_X_y_sf.sensitive_features)
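# ExamplePredictor is a test stub that is not defined in this section. A
# plausible minimal sketch (an assumption for illustration, not the actual
# helper): it ignores the features and simply replays the fixed scores it
# was constructed with, so the tests get deterministic outputs.
import numpy as np


class ExamplePredictor:
    def __init__(self, scores=None):
        self._scores = scores

    def fit(self, X, y, **kwargs):
        # Nothing to learn; the scores are fixed at construction time.
        return self

    def predict(self, X):
        return np.asarray(self._scores)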
def threshold_optimizer(X_train, Y_train, A_train, X_test, A_test, model,
                        constraint):
    """Post-process ``model`` with a ThresholdOptimizer.

    Parameters:
    X_train: input data for training the model
    Y_train: list of ground truths
    A_train: sensitive features for the training data
    X_test: input data to predict on
    A_test: sensitive features for the test data
    model: the estimator to post-process
    constraint: either "demographic_parity" or "equalized_odds"

    Returns the predictions of the optimized model on X_test.
    """
    postprocess_est = ThresholdOptimizer(estimator=model,
                                         constraints=constraint)

    # A balanced data set is obtained by sampling the same number of points
    # from the majority class (Y=0) as there are in the minority class (Y=1).
    Y_train = pd.Series(Y_train)
    balanced_idx1 = Y_train[Y_train == 1].index
    pp_train_idx = balanced_idx1.union(
        Y_train[Y_train == 0].sample(n=balanced_idx1.size,
                                     random_state=1234).index)
    X_train_balanced = X_train.loc[pp_train_idx, :]
    Y_train_balanced = Y_train.loc[pp_train_idx]
    A_train_balanced = A_train.loc[pp_train_idx]

    postprocess_est.fit(X_train_balanced, Y_train_balanced,
                        sensitive_features=A_train_balanced)
    postprocess_preds = postprocess_est.predict(X_test,
                                                sensitive_features=A_test)
    return postprocess_preds
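# A minimal usage sketch for the helper above. The synthetic data, column
# names, and the LogisticRegression base model are illustrative assumptions,
# not part of the original snippet.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from fairlearn.postprocessing import ThresholdOptimizer  # used by the helper

rng = np.random.default_rng(0)
n = 200
X = pd.DataFrame({"f1": rng.normal(size=n), "f2": rng.normal(size=n)})
y = pd.Series((X["f1"] > 0).astype(int))
A = pd.Series(rng.choice(["group_a", "group_b"], size=n))

# Simple split: first 150 rows for training, the rest for testing.
train, test = X.index[:150], X.index[150:]
base_model = LogisticRegression().fit(X.loc[train], y.loc[train])

preds = threshold_optimizer(X.loc[train], y.loc[train], A.loc[train],
                            X.loc[test], A.loc[test],
                            base_model, "demographic_parity")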
def test_predict_multiple_sensitive_features_columns_error(
        sensitive_features, sensitive_feature_names, X_transform, y_transform,
        constraints):
    X = X_transform(_format_as_list_of_lists(sensitive_features))
    y = y_transform(labels_ex)
    sensitive_features_ = pd.DataFrame({
        "A1": sensitive_features,
        "A2": sensitive_features
    })
    adjusted_predictor = ThresholdOptimizer(
        unconstrained_predictor=ExamplePredictor(),
        constraints=constraints)
    adjusted_predictor.fit(X, y, sensitive_features=sensitive_features_)

    with pytest.raises(ValueError,
                       match=MULTIPLE_DATA_COLUMNS_ERROR_MESSAGE.format(
                           "sensitive_features")):
        adjusted_predictor.predict(X, sensitive_features=sensitive_features_)
def run_thresholdoptimizer_classification(estimator):
    """Run classification test with ThresholdOptimizer."""
    X, Y, A = fetch_adult()

    to = ThresholdOptimizer(estimator=estimator, prefit=False)
    to.fit(X, Y, sensitive_features=A)
    results = to.predict(X, sensitive_features=A)
    assert results is not None
def test_predict_output_0_or_1(data_X_y_sf, constraints):
    adjusted_predictor = ThresholdOptimizer(
        estimator=ExamplePredictor(scores_ex),
        constraints=constraints,
        predict_method='predict')
    adjusted_predictor.fit(data_X_y_sf.X, data_X_y_sf.y,
                           sensitive_features=data_X_y_sf.sensitive_features)

    predictions = adjusted_predictor.predict(
        data_X_y_sf.X, sensitive_features=data_X_y_sf.sensitive_features)
    for prediction in predictions:
        assert prediction in [0, 1]
def test_predict_output_0_or_1(sensitive_features, sensitive_feature_names,
                               X_transform, y_transform,
                               sensitive_features_transform, constraints):
    X = X_transform(_format_as_list_of_lists(sensitive_features))
    y = y_transform(labels_ex)
    sensitive_features_ = sensitive_features_transform(sensitive_features)
    adjusted_predictor = ThresholdOptimizer(
        unconstrained_predictor=ExamplePredictor(),
        constraints=constraints)
    adjusted_predictor.fit(X, y, sensitive_features=sensitive_features_)

    predictions = adjusted_predictor.predict(
        X, sensitive_features=sensitive_features_)
    for prediction in predictions:
        assert prediction in [0, 1]
class demographic_parity_classifier(base_binary_classifier):
    def fit(self, _X, _Y, _classifier_name="logistic", _predictor="hard"):
        my_erm_classifier = erm_classifier(self.train_X, self.train_Y)
        my_erm_classifier.fit(self.train_X, self.train_Y,
                              classifier_name=_classifier_name)
        self.model = ThresholdOptimizer(estimator=my_erm_classifier,
                                        constraints="demographic_parity",
                                        prefit=True)
        self.model.fit(self.train_X, self.train_Y,
                       sensitive_features=self.sensitive_train,
                       _predictor=_predictor)

    def predict(self, x_samples, sensitive_features):
        y_samples = self.model.predict(
            x_samples, sensitive_features=sensitive_features)
        return y_samples

    def get_accuracy(self, X, y_true, sensitive_features):
        y_pred = self.predict(X, sensitive_features)
        return 1 - np.sum(np.power(y_pred - y_true, 2)) / len(y_true)

    def predict_proba(self, x_samples, sensitive_features):
        y_samples = self.model._pmf_predict(
            x_samples, sensitive_features=sensitive_features)
        return y_samples
def run_thresholdoptimizer_classification(estimator):
    """Run classification test with ThresholdOptimizer."""
    X_train, Y_train, A_train, X_test, Y_test, A_test = fetch_adult()

    unmitigated = copy.deepcopy(estimator)
    unmitigated.fit(X_train, Y_train)
    unmitigated_predictions = unmitigated.predict(X_test)

    to = ThresholdOptimizer(estimator=estimator,
                            prefit=False,
                            predict_method='predict')
    to.fit(X_train, Y_train, sensitive_features=A_train)

    mitigated_predictions = to.predict(X_test, sensitive_features=A_test)

    dp_diff_unmitigated = demographic_parity_difference(
        Y_test, unmitigated_predictions, sensitive_features=A_test)
    dp_diff_mitigated = demographic_parity_difference(
        Y_test, mitigated_predictions, sensitive_features=A_test)
    assert dp_diff_mitigated <= dp_diff_unmitigated
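# For reference, demographic_parity_difference is the gap between the largest
# and smallest group-level selection rates. A quick hand computation on toy
# data (values chosen purely for illustration):
import numpy as np
from fairlearn.metrics import demographic_parity_difference

y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([1, 1, 0, 0, 1, 0])
groups = np.array(["a", "a", "a", "b", "b", "b"])
# Selection rates: group a -> 2/3, group b -> 1/3; difference = 1/3.
print(demographic_parity_difference(y_true, y_pred, sensitive_features=groups))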
class fair_classifier(pseudo_classifier):
    def __init__(self, train_X, train_y, train_score_y, sensitive_train,
                 test_X, test_y, test_score_y, sensitive_test, metric,
                 sensitive_features_dict=None, HARD=False):
        self.train_X = train_X
        self.train_Y = train_y
        if HARD:
            self.train_score_Y = np.round(train_score_y)
        else:
            self.train_score_Y = train_score_y
        self.sensitive_train = sensitive_train
        self.test_X = test_X
        self.test_Y = test_y
        if HARD:
            self.test_score_Y = np.round(test_score_y)
        else:
            self.test_score_Y = test_score_y
        self.sensitive_test = sensitive_test
        self.sensitive_features_dict = sensitive_features_dict
        self.erm_classifier = pseudo_classifier(
            self.train_X, self.train_Y, self.train_score_Y,
            self.sensitive_train, self.test_X, self.test_Y,
            self.test_score_Y, self.sensitive_test)
        assert metric in ["equalized_odds", "demographic_parity"]
        self.metric = metric

    def fit(self):
        self.erm_classifier.fit(self.train_X, self.train_Y)
        self.model = ThresholdOptimizer(estimator=self.erm_classifier,
                                        constraints=self.metric,
                                        prefit=True)
        self.model.fit(self.train_X, self.train_Y,
                       sensitive_features=self.sensitive_train)

    def predict(self, x_samples, sensitive_features):
        y_samples = self.model.predict(
            x_samples, sensitive_features=sensitive_features)
        return y_samples

    def get_accuracy(self, X, y_true, sensitive_features):
        y_pred = self.predict(X, sensitive_features)
        return 1 - np.sum(np.power(y_pred - y_true, 2)) / len(y_true)

    def predict_prob(self, x_samples, sensitive_features):
        y_samples = self.model._pmf_predict(
            x_samples, sensitive_features=sensitive_features)
        return y_samples

    def get_avg_group_confusion_matrix(self, sensitive_features, X, true_Y):
        # Produces average tp/fp/tn/fn/accuracy per group. Basically
        # get_group_confusion_matrix, but modified to return average values
        # where possible. For a trained classifier, computes the true positive
        # and true negative rates based on group identity (currently only
        # works for binary labels).
        groups = np.unique(sensitive_features)
        tp_rate = {}
        fp_rate = {}
        tn_rate = {}
        fn_rate = {}

        true_pos_index = np.where(true_Y == 1)
        true_neg_index = np.where(true_Y == 0)

        # Calculate probability of classification for each input
        y_pred_prob = self.predict_prob(X, sensitive_features)

        # Calculate average probability of correct classification
        # (i.e. expected accuracy)
        avg_micro_acc = (np.sum(y_pred_prob[true_pos_index][:, 1]) + np.sum(
            y_pred_prob[true_neg_index][:, 0])) / len(true_Y)
        print("Average Overall Accuracy: ", avg_micro_acc)
        micro_auc = roc_auc_score(true_Y, y_pred_prob[:, 1])
        print("Overall AUC: ", micro_auc)

        out_dict = {}  # The format is: {group: [tp, tn, fp, fn, accuracy, auc]}
        avg_macro_acc = 0
        macro_auc = 0
        for index, group in enumerate(groups):
            indices = np.where(sensitive_features == group)[0]
            true_class = true_Y[indices]
            pred_prob = y_pred_prob[indices]
            true_pos_index = np.where(true_class == 1)[0]
            true_neg_index = np.where(true_class == 0)[0]
            if len(true_pos_index) == 0 or len(true_neg_index) == 0:
                print("No true positives or no true negatives in this group")
                continue

            # Find avg rates (i.e. avg probability of tp/tn/fp/fn)
            tp = np.sum(pred_prob[true_pos_index][:, 1]) / len(true_pos_index)
            tn = np.sum(pred_prob[true_neg_index][:, 0]) / len(true_neg_index)
            fp = np.sum(pred_prob[true_neg_index][:, 1]) / len(true_neg_index)
            fn = np.sum(pred_prob[true_pos_index][:, 0]) / len(true_pos_index)
            tp_rate[group] = tp
            tn_rate[group] = tn
            fp_rate[group] = fp
            fn_rate[group] = fn

            # Expected accuracy
            accuracy = (np.sum(pred_prob[true_pos_index][:, 1]) + np.sum(
                pred_prob[true_neg_index][:, 0])) / len(true_class)
            avg_macro_acc += accuracy
            auc = roc_auc_score(true_class, pred_prob[:, 1])
            macro_auc += auc
            out_dict[group] = [tp, tn, fp, fn, accuracy, auc]

            print(group, "average confusion matrix")
            if tp == 0 and fp == 0:
                print("None classified as positive in group", group)
                print("\t Average Group Accuracy: ", accuracy)
            else:
                # Can't compute F1 from these, since we're dealing with
                # average values:
                # precision = tp / (tp + fp)
                # recall = tp / (tp + fn)
                # f1 = 2 * precision * recall / (precision + recall)
                # print("\t F1 score: ", f1)
                print("\t Average Group Accuracy: ", accuracy)
                print("\t Group AUC: ", auc)
                print("\t Average True positive rate:", tp)
                print("\t Average True negative rate:", tn)
                print("\t Average False positive rate:", fp)
                print("\t Average False negative rate:", fn)

        avg_macro_acc /= len(groups)
        macro_auc /= len(groups)
        return out_dict, {
            "Accuracy": (avg_micro_acc, avg_macro_acc),
            "AUC": (micro_auc, macro_auc)
        }
    return scores


from fairlearn.postprocessing import ThresholdOptimizer

estimator_wrapper = LogisticRegressionAsRegression(estimator).fit(
    X_train, y_train)
postprocessed_predictor_EO = ThresholdOptimizer(
    estimator=estimator_wrapper,
    constraints="equalized_odds",
    prefit=True)
postprocessed_predictor_EO.fit(X_train, y_train,
                               sensitive_features=sensitive_features_train)

fairness_aware_predictions_EO_train = postprocessed_predictor_EO.predict(
    X_train, sensitive_features=sensitive_features_train)
fairness_aware_predictions_EO_test = postprocessed_predictor_EO.predict(
    X_test, sensitive_features=sensitive_features_test)

# show only test data related plot by default - uncomment the next lines to
# see the training data plot as well
# show_proportions(
#     X_train, sensitive_features_train, fairness_aware_predictions_EO_train,
#     y_train,
#     description="equalized odds with postprocessed model on training data:",
#     plot_row_index=1)
show_proportions(
    X_test, sensitive_features_test, fairness_aware_predictions_EO_test,
    # The snippet was cut off here; the closing arguments below are
    # reconstructed by analogy with the commented-out training-data call.
    y_test,
    description="equalized odds with postprocessed model on test data:",
    plot_row_index=2)
        LogisticRegression(solver="liblinear", fit_intercept=True),
    ),
])

# %%
# Below we will pass the pipeline to some of our mitigation techniques,
# starting with :class:`fairlearn.postprocessing.ThresholdOptimizer`:

threshold_optimizer = ThresholdOptimizer(
    estimator=pipeline,
    constraints="demographic_parity",
    predict_method="predict_proba",
    prefit=False,
)
threshold_optimizer.fit(X_train, y_train, sensitive_features=A_train)
print(threshold_optimizer.predict(X_test, sensitive_features=A_test))
print(
    json.dumps(
        threshold_optimizer.interpolated_thresholder_.interpolation_dict,
        default=str,
        indent=4,
    ))
plot_threshold_optimizer(threshold_optimizer)

# %%
# Similarly, :class:`fairlearn.reductions.ExponentiatedGradient` works with
# pipelines. Since it internally requires the :code:`sample_weight` parameter
# of the underlying estimator, we need to tell it how to pass
# :code:`sample_weight` to just the :code:`"classifier"` step, using the step
# name followed by two underscores and :code:`sample_weight`, as sketched
# below.
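# %%
# A hedged sketch of that call (not part of the original excerpt): it assumes
# the pipeline's final step is registered under the name ``"classifier"``, as
# the text above describes, and uses the public :code:`sample_weight_name`
# parameter of :class:`fairlearn.reductions.ExponentiatedGradient`:
from fairlearn.reductions import DemographicParity, ExponentiatedGradient

exponentiated_gradient = ExponentiatedGradient(
    estimator=pipeline,
    constraints=DemographicParity(),
    # Route sample_weight to the "classifier" step of the pipeline.
    sample_weight_name="classifier__sample_weight",
)
exponentiated_gradient.fit(X_train, y_train, sensitive_features=A_train)
print(exponentiated_gradient.predict(X_test))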
def test_threshold_optimizer_multiple_sensitive_features():
    # Create sensitive features so that the third column is the first two
    # combined. The name a2 is long because that caused bug #728: the merged
    # names got cut off, so multiple groups were merged internally. To guard
    # against that, this test case checks even internal representations.
    X = pd.DataFrame([
        [0, 4], [6, 2], [1, 3], [10, 5],
        [1, 7], [-2, 1], [3, 10], [14, 5],
        [1, 3], [1, 5], [1, 7], [-5, 9],
        [3, 13], [7, 1], [-8, 4], [9, 1],
    ])
    y = pd.Series([0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0])
    a1 = "a"
    a2 = "a very very very very very very very long group name"
    a3 = "a group name with commas ,, in , it"
    a4 = "a group name with backslashes \\ in \\\\ it"
    A = pd.DataFrame(
        [
            [a1, a3, a1 + a3],
            [a1, a3, a1 + a3],
            [a2, a3, a2 + a3],
            [a2, a3, a2 + a3],
            [a2, a3, a2 + a3],
            [a2, a3, a2 + a3],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a2, a4, a2 + a4],
            [a1, a4, a1 + a4],
            [a1, a4, a1 + a4],
        ],
        columns=["SF1", "SF2", "SF1+2"],
    )

    estimator = LinearRegression()
    estimator.fit(X, y)

    postprocess_est_multi = ThresholdOptimizer(
        estimator=estimator,
        constraints="demographic_parity",
        objective="accuracy_score",
        prefit=True,
        predict_method="predict",
    )
    postprocess_est_combined = ThresholdOptimizer(
        estimator=estimator,
        constraints="demographic_parity",
        objective="accuracy_score",
        prefit=True,
        predict_method="predict",
    )
    postprocess_est_multi.fit(X, y,
                              sensitive_features=A.loc[:, ["SF1", "SF2"]])
    postprocess_est_combined.fit(X, y, sensitive_features=A.loc[:, "SF1+2"])

    X_test = pd.concat([
        pd.DataFrame([[5, 4], [7, 2], [0, 3], [1, 2],
                      [-2, 9], [1, 1], [0, 5], [-3, 3]]),
        X,
    ])
    A_test = pd.concat([
        pd.DataFrame(
            [
                [a1, a3, a1 + a3],
                [a1, a3, a1 + a3],
                [a2, a3, a2 + a3],
                [a2, a3, a2 + a3],
                [a2, a4, a2 + a4],
                [a2, a4, a2 + a4],
                [a1, a4, a1 + a4],
                [a1, a4, a1 + a4],
            ],
            columns=["SF1", "SF2", "SF1+2"],
        ),
        A,
    ])
    y_test = pd.concat([pd.Series([0, 1, 0, 1, 0, 1, 0, 1]), y])

    y_pred_multi = postprocess_est_multi.predict(
        X_test, sensitive_features=A_test.loc[:, ["SF1", "SF2"]],
        random_state=1)
    y_pred_combined = postprocess_est_combined.predict(
        X_test, sensitive_features=A_test.loc[:, "SF1+2"], random_state=1)

    metricframe_multi = MetricFrame(
        metrics=fairness_metrics,
        y_true=y_test,
        y_pred=y_pred_multi,
        sensitive_features=A_test.loc[:, ["SF1", "SF2"]],
    )
    metricframe_combined = MetricFrame(
        metrics=fairness_metrics,
        y_true=y_test,
        y_pred=y_pred_combined,
        sensitive_features=A_test.loc[:, "SF1+2"],
    )

    # multi - names after escaping
    a3_escaped = a3.replace(",", "\\,")
    a4_escaped = a4.replace("\\", "\\\\")
    a13 = f"{a1},{a3_escaped}"
    a14 = f"{a1},{a4_escaped}"
    a23 = f"{a2},{a3_escaped}"
    a24 = f"{a2},{a4_escaped}"

    assert (metricframe_combined.overall == metricframe_multi.overall).all()
    assert (metricframe_combined.by_group.loc[a1 + a3]
            == metricframe_multi.by_group.loc[(a1, a3)]).all()
    assert (metricframe_combined.by_group.loc[a2 + a3]
            == metricframe_multi.by_group.loc[(a2, a3)]).all()
    assert (metricframe_combined.by_group.loc[a1 + a4]
            == metricframe_multi.by_group.loc[(a1, a4)]).all()
    assert (metricframe_combined.by_group.loc[a2 + a4]
            == metricframe_multi.by_group.loc[(a2, a4)]).all()

    # comparing string representations of interpolation dicts is sufficient
    assert str(
        postprocess_est_combined.interpolated_thresholder_
        .interpolation_dict[a1 + a3]) == str(
            postprocess_est_multi.interpolated_thresholder_
            .interpolation_dict[a13])
    assert str(
        postprocess_est_combined.interpolated_thresholder_
        .interpolation_dict[a1 + a4]) == str(
            postprocess_est_multi.interpolated_thresholder_
            .interpolation_dict[a14])
    assert str(
        postprocess_est_combined.interpolated_thresholder_
        .interpolation_dict[a2 + a3]) == str(
            postprocess_est_multi.interpolated_thresholder_
            .interpolation_dict[a23])
    assert str(
        postprocess_est_combined.interpolated_thresholder_
        .interpolation_dict[a2 + a4]) == str(
            postprocess_est_multi.interpolated_thresholder_
            .interpolation_dict[a24])
def test_perf(perf_test_configuration, request):
    print("Starting with test case {}".format(request.node.name))
    print("Downloading dataset")
    dataset = datasets[perf_test_configuration.dataset]()
    X_train, X_test = dataset.get_X()
    y_train, y_test = dataset.get_y()
    print("Done downloading dataset")

    if perf_test_configuration.dataset == "adult_uci":
        # sensitive feature is 8th column (sex)
        sensitive_features_train = X_train[:, 7]
        sensitive_features_test = X_test[:, 7]
    elif perf_test_configuration.dataset == "diabetes_sklearn":
        # sensitive feature is 2nd column (sex); features have been scaled,
        # but the sensitive feature needs to be str or int
        sensitive_features_train = X_train[:, 1].astype(str)
        sensitive_features_test = X_test[:, 1].astype(str)
        # labels can't be floats as of now
        y_train = y_train.astype(int)
        y_test = y_test.astype(int)
    elif perf_test_configuration.dataset == "compas":
        # sensitive feature is either race or sex
        # TODO add another case where we use sex as well, or both (?)
        sensitive_features_train, sensitive_features_test = \
            dataset.get_sensitive_features('race')
        y_train = y_train.astype(int)
        y_test = y_test.astype(int)
    else:
        raise ValueError("Sensitive features unknown for dataset {}".format(
            perf_test_configuration.dataset))

    print("Fitting estimator")
    estimator = models[perf_test_configuration.predictor]()
    unconstrained_predictor = models[perf_test_configuration.predictor]()
    unconstrained_predictor.fit(X_train, y_train)
    print("Done fitting estimator")

    start_time = time()

    if perf_test_configuration.mitigator == ThresholdOptimizer.__name__:
        mitigator = ThresholdOptimizer(
            unconstrained_predictor=unconstrained_predictor,
            constraints=DEMOGRAPHIC_PARITY)
    elif perf_test_configuration.mitigator == ExponentiatedGradient.__name__:
        mitigator = ExponentiatedGradient(estimator=estimator,
                                          constraints=DemographicParity())
    elif perf_test_configuration.mitigator == GridSearch.__name__:
        mitigator = GridSearch(estimator=estimator,
                               constraints=DemographicParity())
    else:
        raise Exception("Unknown mitigation technique.")

    print("Fitting mitigator")
    mitigator.fit(X_train, y_train,
                  sensitive_features=sensitive_features_train)

    if perf_test_configuration.mitigator == ThresholdOptimizer.__name__:
        mitigator.predict(X_test,
                          sensitive_features=sensitive_features_test,
                          random_state=1)
    else:
        mitigator.predict(X_test)

    # TODO evaluate accuracy/fairness tradeoff
    total_time = time() - start_time
    print("Total time taken: {}s".format(total_time))
    print("Maximum allowed time: {}s".format(
        perf_test_configuration.max_time_consumption))
    assert total_time <= perf_test_configuration.max_time_consumption

    print("\n\n===============================================================\n\n")