def test_valid_inputs(self, transformX, transformY, transformA, A_two_dim):
    """Smoke-test that fit() accepts every supported input transformation.

    The transformX/transformY/transformA fixtures wrap the raw data in the
    various container types GridSearch is expected to handle.
    """
    grid_search = GridSearch(self.estimator, self.disparity_criterion, grid_size=2)
    X, Y, A = _quick_data(A_two_dim)
    grid_search.fit(
        transformX(X),
        transformY(Y),
        sensitive_features=transformA(A),
    )
    assert_n_grid_search_results(2, grid_search)
def test_bgl_lagrange_specifications(A_two_dim):
    """An explicit Lagrange-multiplier grid must reproduce the generated one.

    Runs one GridSearch with ``grid_size=5`` (auto-generated multipliers) and
    one with an equivalent hand-built ``grid`` DataFrame, then checks both the
    multipliers and the fitted model coefficients agree pairwise.
    """
    a0_count = 13
    a1_count = 4
    a0_label = 5
    a1_label = 3
    a0_factor = 1
    a1_factor = 16

    X, y, A = _simple_regression_data(
        a0_count, a1_count, a0_factor, a1_factor, a0_label, a1_label, A_two_dim
    )

    estimator = LinearRegression()

    # Build the explicit multiplier grid, including a zero Lagrange multiplier.
    # BUGFIX: pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
    # pd.Index with an explicit int64 dtype is the supported equivalent.
    idx = pd.Index(sorted([a0_label, a1_label]), dtype="int64")
    l0_series = pd.Series([2.0, 0.0], index=idx)
    l1_series = pd.Series([1.5, 0.5], index=idx)
    l2_series = pd.Series([1.0, 1.0], index=idx)
    l3_series = pd.Series([0.5, 1.5], index=idx)
    l4_series = pd.Series([0.0, 2.0], index=idx)
    grid_df = pd.concat(
        [l0_series, l1_series, l2_series, l3_series, l4_series], axis=1
    )

    grid_search1 = GridSearch(
        copy.deepcopy(estimator),
        constraints=BoundedGroupLoss(ZeroOneLoss()),
        grid_size=5,
    )
    grid_search2 = GridSearch(
        copy.deepcopy(estimator),
        constraints=BoundedGroupLoss(ZeroOneLoss()),
        grid=grid_df,
    )

    tradeoffs = [0, 0.25, 0.5, 0.75, 1]

    grid_search1.fit(X, y, sensitive_features=A)
    grid_search2.fit(X, y, sensitive_features=A)

    assert_n_grid_search_results(len(tradeoffs), grid_search1)
    assert_n_grid_search_results(len(tradeoffs), grid_search2)

    # Check we generated the same multipliers
    for i in range(len(tradeoffs)):
        lm1 = grid_search1.lambda_vecs_[i]
        lm2 = grid_search2.lambda_vecs_[i]
        assert lm1.equals(lm2)

    # Check the models are the same
    for i in range(len(tradeoffs)):
        coef1 = grid_search1.predictors_[i].coef_
        coef2 = grid_search2.predictors_[i].coef_
        assert np.array_equal(coef1, coef2)
def test_bgl_unfair(A_two_dim):
    """BoundedGroupLoss on deliberately unfair regression data.

    Pins the best predictor's output and each grid point's predictions on two
    probe rows against previously observed values.
    """
    a0_count = 5
    a1_count = 7
    a0_label = 2
    a1_label = 3
    a0_factor = 1
    a1_factor = 16
    grid_size = 7

    X, Y, A = _simple_regression_data(
        a0_count, a1_count, a0_factor, a1_factor, a0_label, a1_label, A_two_dim
    )

    bgl_square_loss = BoundedGroupLoss(SquareLoss(-np.inf, np.inf))
    grid_search = GridSearch(
        LinearRegression(), constraints=bgl_square_loss, grid_size=grid_size
    )

    grid_search.fit(X, Y, sensitive_features=A)
    assert_n_grid_search_results(grid_size, grid_search)

    probe = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [a0_label, a1_label],
            "constant_ones_feature": [1, 1],
        }
    )

    best_predict = grid_search.predict(probe)
    assert np.allclose([-1.91764706, 9.61176471], best_predict)

    all_predict = [predictor.predict(probe) for predictor in grid_search.predictors_]

    # TODO: investigate where the different outcomes for the first grid point are from, likely
    # due to some ignored data points at the edge resulting in another solution with the same
    # least squares loss (i.e. both solutions acceptable).
    # Reflects https://github.com/fairlearn/fairlearn/issues/265
    first_matches = logging_all_close(
        [[3.2, 11.2]], [all_predict[0]]
    ) or logging_all_close([[3.03010885, 11.2]], [all_predict[0]])
    assert first_matches

    assert logging_all_close(
        [
            [-3.47346939, 10.64897959],
            [-2.68, 10.12],
            [-1.91764706, 9.61176471],
            [-1.18461538, 9.12307692],
            [-0.47924528, 8.65283019],
            [0.2, 0.7],
        ],
        all_predict[1:],
    )
def test_can_specify_and_generate_lambda_vecs(A_two_dim):
    """Explicit and generated multiplier grids must agree for DemographicParity.

    Builds a three-column grid DataFrame by hand, fits one GridSearch with it
    and another with ``grid_size=3``, and checks the resulting multipliers and
    model coefficients match column by column.
    """
    score_threshold = 0.4
    number_a0 = 32
    number_a1 = 24
    a0_label = 11
    a1_label = 3

    X, y, A = _simple_threshold_data(
        number_a0, number_a1, score_threshold, score_threshold, a0_label, a1_label
    )

    estimator = LogisticRegression(
        solver="liblinear", fit_intercept=True, random_state=97
    )

    iterables = [["+", "-"], ["all"], sorted([a0_label, a1_label])]
    midx = pd.MultiIndex.from_product(iterables, names=["sign", "event", "group_id"])
    lagrange_negative_series = pd.Series([0.0, 0.0, 0.0, 2.0], index=midx)
    lagrange_zero_series = pd.Series(np.zeros(4), index=midx)
    lagrange_positive_series = pd.Series([0.0, 2.0, 0.0, 0.0], index=midx)
    grid_df = pd.concat(
        [lagrange_negative_series, lagrange_zero_series, lagrange_positive_series],
        axis=1,
    )

    num_grid_points = 3
    grid_search1 = GridSearch(
        copy.deepcopy(estimator),
        constraints=DemographicParity(),
        grid_size=num_grid_points,
    )
    grid_search2 = GridSearch(
        copy.deepcopy(estimator), constraints=DemographicParity(), grid=grid_df
    )

    # Try both ways of specifying the Lagrange multipliers
    grid_search2.fit(X, y, sensitive_features=A)
    grid_search1.fit(X, y, sensitive_features=A)

    assert_n_grid_search_results(num_grid_points, grid_search1)
    assert_n_grid_search_results(num_grid_points, grid_search2)

    # Check we generated the same multipliers
    for i in range(num_grid_points):
        assert grid_search1.lambda_vecs_[i].equals(grid_search2.lambda_vecs_[i])

    # Check the models are the same
    for i in range(num_grid_points):
        assert np.array_equal(
            grid_search1.predictors_[i].coef_, grid_search2.predictors_[i].coef_
        )
def test_demographicparity_fair_uneven_populations_with_grid_offset(
    A_two_dim, offset
):
    """DemographicParity grid search with an offset applied to the multiplier grid.

    The Lagrangian-multiplier grid is shifted by ``offset`` (fixture) and the
    resulting predictions/probabilities on two probe rows are pinned against
    previously observed values.
    """
    # Grid of Lagrangian multipliers has some initial offset
    score_threshold = 0.625
    number_a0 = 4
    number_a1 = 4
    a0_label = 17
    a1_label = 37
    grid_size = 11

    iterables = [["+", "-"], ["all"], [a0_label, a1_label]]
    midx = pd.MultiIndex.from_product(iterables, names=["sign", "event", "group_id"])
    grid_offset = pd.Series(offset, index=midx)

    X, Y, A = _simple_threshold_data(
        number_a0,
        number_a1,
        score_threshold,
        score_threshold,
        a0_label,
        a1_label,
        A_two_dim,
    )

    grid_search = GridSearch(
        LogisticRegression(solver="liblinear", fit_intercept=True),
        constraints=DemographicParity(),
        grid_size=grid_size,
        grid_offset=grid_offset,
    )

    grid_search.fit(X, Y, sensitive_features=A)
    assert_n_grid_search_results(grid_size, grid_search)

    probe = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [a0_label, a1_label],
            "constant_ones_feature": [1, 1],
        }
    )

    assert np.array_equal(grid_search.predict(probe), [0, 1])

    assert np.allclose(
        grid_search.predict_proba(probe),
        [[0.55069845, 0.44930155], [0.41546008, 0.58453992]],
    )

    # The first grid point behaves differently from the selected best model.
    assert np.array_equal(grid_search.predictors_[0].predict(probe), [1, 0])
def test_demographicparity_fair_uneven_populations(A_two_dim):
    """DemographicParity grid search on unevenly labelled threshold data.

    Variant of test_demographicparity_already_fair with unequal populations in
    the two classes and an adjustable threshold; pins predictions and
    probabilities on two probe rows against previously observed values.
    """
    score_threshold = 0.625
    number_a0 = 4
    number_a1 = 4
    a0_label = 17
    a1_label = 37
    grid_size = 11

    X, Y, A = _simple_threshold_data(
        number_a0,
        number_a1,
        score_threshold,
        score_threshold,
        a0_label,
        a1_label,
        A_two_dim,
    )

    grid_search = GridSearch(
        LogisticRegression(solver="liblinear", fit_intercept=True),
        constraints=DemographicParity(),
        grid_size=grid_size,
    )

    grid_search.fit(X, Y, sensitive_features=A)
    assert_n_grid_search_results(grid_size, grid_search)

    probe = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [a0_label, a1_label],
            "constant_ones_feature": [1, 1],
        }
    )

    assert np.array_equal(grid_search.predict(probe), [0, 1])

    assert np.allclose(
        grid_search.predict_proba(probe),
        [[0.53748641, 0.46251359], [0.46688736, 0.53311264]],
    )

    # The first grid point behaves differently from the selected best model.
    assert np.array_equal(grid_search.predictors_[0].predict(probe), [1, 0])
def test_lambda_vec_zero_unchanged_model(A_two_dim):
    """An all-zero Lagrange multiplier must leave the estimator unmitigated.

    Fits a plain LogisticRegression and a GridSearch whose only grid column is
    zeros, then checks the mitigated best predictor has identical coefficients
    to the unmitigated model.
    """
    score_threshold = 0.6
    number_a0 = 64
    number_a1 = 24
    a0_label = 7
    a1_label = 22

    X, y, A = _simple_threshold_data(
        number_a0,
        number_a1,
        score_threshold,
        score_threshold,
        a0_label,
        a1_label,
        A_two_dim,
    )

    estimator = LogisticRegression(
        solver="liblinear", fit_intercept=True, random_state=97
    )

    # Train an unmitigated estimator
    unmitigated_estimator = copy.deepcopy(estimator)
    unmitigated_estimator.fit(X, y)

    # Do the grid search with a zero Lagrange multiplier
    iterables = [["+", "-"], ["all"], [a0_label, a1_label]]
    midx = pd.MultiIndex.from_product(iterables, names=["sign", "event", "group_id"])
    grid_df = pd.DataFrame(pd.Series(np.zeros(4), index=midx))

    grid_search = GridSearch(estimator, constraints=DemographicParity(), grid=grid_df)
    grid_search.fit(X, y, sensitive_features=A)
    assert_n_grid_search_results(1, grid_search)

    # Check coefficients
    assert np.array_equal(
        grid_search.predictors_[grid_search.best_idx_].coef_,
        unmitigated_estimator.coef_,
    )