def fit(self, x, y, **kwargs):
    _, y_train, sensitive_features = _validate_and_reformat_input(
        x, y, enforce_binary_labels=False, **kwargs)
    if self.loss == "square":
        # squared loss reweighting
        X, A, Y, W = augment.augment_data_sq(x, sensitive_features,
                                             y_train, self.Theta)
    elif self.loss == "absolute":
        # absolute loss reweighting (uniform)
        X, A, Y, W = augment.augment_data_ab(x, sensitive_features,
                                             y_train, self.Theta)
    elif self.loss == "logistic":
        # logistic reweighting
        X, A, Y, W = augment.augment_data_logistic(x, sensitive_features,
                                                   y_train, self.Theta)
    else:
        raise ValueError("Loss not supported: " + str(self.loss))
    if self.constraints == "DP":
        # demographic parity constraint
        self.constraints = DemographicParity_Theta()
        self.expgrad = ExponentiatedGradient(self.estimator, self.constraints,
                                             self.eps, error_weights=W)
        self.expgrad.fit(X, Y, sensitive_features=A)
        self.weights_ = self.expgrad.weights_
        self.best_classifier = lambda X: _mean_pred(
            X, self.expgrad._hs, self.expgrad.weights_)
        self._hs = self.expgrad._hs
        self.predictors_ = self.expgrad.predictors_
        self.best_gap_ = self.expgrad.best_gap_
        self.last_iter_ = self.expgrad.last_iter_
        self.best_iter_ = self.expgrad.best_iter_
        self.n_oracle_calls_ = self.expgrad.n_oracle_calls_
        self.n_classifiers = len(self._hs)
    else:
        raise ValueError("Constraint not supported: " + str(self.constraints))
def _pmf_predict(self, X, *, sensitive_features):
    """Probability mass function.

    :param X: Feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param sensitive_features: Sensitive features to identify groups by,
        currently allows only a single column
    :type sensitive_features: Currently 1D array as numpy.ndarray, list,
        pandas.DataFrame, or pandas.Series
    :return: array of tuples with probabilities for predicting 0 or 1,
        respectively. The two numbers in each tuple need to add up to 1.
    :rtype: numpy.ndarray
    """
    check_is_fitted(self)
    base_predictions = np.array(self.estimator_.predict(X))
    _, base_predictions_vector, sensitive_feature_vector = _validate_and_reformat_input(
        X, y=base_predictions, sensitive_features=sensitive_features,
        expect_y=True, enforce_binary_labels=False)

    positive_probs = 0.0 * base_predictions_vector
    for a, interpolation in self.interpolation_dict.items():
        interpolated_predictions = \
            interpolation.p0 * interpolation.operation0(base_predictions_vector) + \
            interpolation.p1 * interpolation.operation1(base_predictions_vector)
        if 'p_ignore' in interpolation:
            interpolated_predictions = \
                interpolation.p_ignore * interpolation.prediction_constant + \
                (1 - interpolation.p_ignore) * interpolated_predictions
        positive_probs[sensitive_feature_vector == a] = \
            interpolated_predictions[sensitive_feature_vector == a]
    return np.array([1.0 - positive_probs, positive_probs]).transpose()
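# Illustrative sketch (standalone, made-up numbers; not part of the class):
# the return value above stacks P(y_hat=0) and P(y_hat=1) per sample, so each
# row sums to 1.
import numpy as np

positive_probs = np.array([0.2, 0.7, 0.5])
pmf = np.array([1.0 - positive_probs, positive_probs]).transpose()
# pmf == [[0.8, 0.2], [0.3, 0.7], [0.5, 0.5]]; pmf.sum(axis=1) == [1., 1., 1.]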
def load_data(self, X, y, *, sensitive_features):
    """Load data into the moment object."""
    X_train, y_train, sf_train, _ = \
        _validate_and_reformat_input(X, y, enforce_binary_labels=False,
                                     sensitive_features=sensitive_features)
    if self.no_groups:
        sf_train = y_train.apply(lambda v: _ALL)

    # The following uses X and not X_train so that the estimators get X untouched
    super().load_data(X, y_train, sensitive_features=sf_train)
    self.prob_attr = self.tags.groupby(_GROUP_ID).size() / self.total_samples
    self.index = self.prob_attr.index
    self.default_objective_lambda_vec = self.prob_attr

    # fill in the information about the basis
    attr_vals = self.tags[_GROUP_ID].unique()
    self.pos_basis = pd.DataFrame()
    self.neg_basis = pd.DataFrame()
    self.neg_basis_present = pd.Series(dtype='float64')
    zero_vec = pd.Series(0.0, self.index)
    i = 0
    for attr in attr_vals:
        self.pos_basis[i] = 0 + zero_vec
        self.neg_basis[i] = 0 + zero_vec
        self.pos_basis[i][attr] = 1
        self.neg_basis_present.at[i] = False
        i += 1
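# Illustrative sketch (standalone; "group_id" is a stand-in for _GROUP_ID):
# how the group probabilities above fall out of a groupby over the tags frame.
import pandas as pd

tags = pd.DataFrame({"group_id": ["a", "a", "b", "a"]})
prob_attr = tags.groupby("group_id").size() / len(tags)
# prob_attr: a -> 0.75, b -> 0.25; its index becomes the moment's index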
def predict(self, X, *, sensitive_features, random_state=None):
    """Predict label for each sample in X while taking into account sensitive features.

    :param X: feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param sensitive_features: sensitive features to identify groups by,
        currently allows only a single column
    :type sensitive_features: currently 1D array as numpy.ndarray, list,
        pandas.DataFrame, or pandas.Series
    :param random_state: set to a constant for reproducibility
    :type random_state: int
    :return: predictions in numpy.ndarray
    """
    if random_state is not None:
        # seed numpy's generator, since np.random.rand is used below
        np.random.seed(random_state)
    self._validate_post_processed_predictor_is_fitted()
    _, _, sensitive_feature_vector = _validate_and_reformat_input(
        X, y=None, sensitive_features=sensitive_features, expect_y=False,
        enforce_binary_labels=True)
    unconstrained_predictions = self._unconstrained_predictor.predict(X)

    positive_probs = _vectorized_prediction(
        self._post_processed_predictor_by_sensitive_feature,
        sensitive_feature_vector, unconstrained_predictions)
    return (positive_probs >= np.random.rand(len(positive_probs))) * 1
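# Illustrative sketch (standalone, made-up numbers): the final line of
# predict() performs randomized thresholding, so each sample is predicted 1
# with probability positive_probs[i]; seeding makes the draw reproducible.
import numpy as np

np.random.seed(0)
positive_probs = np.array([0.9, 0.1, 0.5])
preds = (positive_probs >= np.random.rand(len(positive_probs))) * 1
# preds is a 0/1 integer array; repeated unseeded calls vary between draws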
def fit(self, X, y, *, sensitive_features, **kwargs):
    """Fit the model.

    The fit is based on training features and labels, sensitive features,
    as well as the fairness-unaware predictor or estimator. If an estimator
    was passed in the constructor this fit method will call
    `fit(X, y, **kwargs)` on said estimator.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    :param sensitive_features: sensitive features to identify groups by,
        currently allows only a single column
    :type sensitive_features: currently 1D array as numpy.ndarray, list,
        pandas.DataFrame, or pandas.Series
    """
    if self.estimator is None:
        raise ValueError(ESTIMATOR_ERROR_MESSAGE)

    if self.constraints not in _SUPPORTED_CONSTRAINTS:
        raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

    _, _, sensitive_feature_vector = _validate_and_reformat_input(
        X, y, sensitive_features=sensitive_features,
        enforce_binary_labels=True)

    # postprocessing can't handle 0/1 as floating point numbers, so this converts it to int
    if type(y) in [np.ndarray, pd.DataFrame, pd.Series]:
        y = y.astype(int)
    else:
        y = [int(y_val) for y_val in y]

    if not self.prefit:
        self.estimator_ = clone(self.estimator).fit(X, y, **kwargs)
    else:
        try:
            check_is_fitted(self.estimator)
            self.estimator_ = self.estimator
        except NotFittedError:
            self.estimator_ = clone(self.estimator).fit(X, y, **kwargs)

    scores = self.estimator_.predict(X)
    threshold_optimization_method = None
    if self.constraints == DEMOGRAPHIC_PARITY:
        threshold_optimization_method = \
            self._threshold_optimization_demographic_parity
    elif self.constraints == EQUALIZED_ODDS:
        threshold_optimization_method = \
            self._threshold_optimization_equalized_odds
    else:
        raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

    self._post_processed_predictor_by_sensitive_feature = threshold_optimization_method(
        sensitive_feature_vector, y, scores, self.grid_size, self.flip)
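# Hypothetical usage sketch for the fit method above (assumes the
# fairlearn-style public API; the toy data is made up):
import numpy as np
from sklearn.linear_model import LogisticRegression
from fairlearn.postprocessing import ThresholdOptimizer

X = np.array([[0.1], [0.4], [0.6], [0.9]] * 5)
y = np.array([0, 0, 1, 1] * 5)
A = np.array(["a", "b", "a", "b"] * 5)

postprocessor = ThresholdOptimizer(estimator=LogisticRegression(),
                                   constraints="demographic_parity")
postprocessor.fit(X, y, sensitive_features=A)
preds = postprocessor.predict(X, sensitive_features=A, random_state=0)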
def load_data(self, X, y, *, sensitive_features, control_features=None):
    """Load the specified data into the object."""
    _, y_train, sf_train, cf_train = \
        _validate_and_reformat_input(X, y, enforce_binary_labels=True,
                                     sensitive_features=sensitive_features,
                                     control_features=control_features)
    # The following uses X so that the estimators get X untouched
    super().load_data(X, y_train, sensitive_features=sf_train)
    self.index = [_ALL]
def load_data(self, X, y, *, sensitive_features, control_features=None):
    """Load the specified data into the object."""
    _, y_train, sf_train, cf_train = \
        _validate_and_reformat_input(X, y, enforce_binary_labels=True,
                                     sensitive_features=sensitive_features,
                                     control_features=control_features)
    base_event = pd.Series(data=_ALL, index=y_train.index)
    event = _merge_event_and_control_columns(base_event, cf_train)
    super().load_data(X, y_train, event=event, sensitive_features=sf_train)
def load_data(self, X, y, *, sensitive_features, control_features=None):
    """Load the specified data into the object."""
    _, y_train, sf_train, cf_train = \
        _validate_and_reformat_input(X, y, enforce_binary_labels=True,
                                     sensitive_features=sensitive_features,
                                     control_features=control_features)
    base_event = y_train.apply(lambda v: _LABEL + "=" + str(v))
    event = _merge_event_and_control_columns(base_event, cf_train)
    super().load_data(X, y_train, event=event, sensitive_features=sf_train)
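# Illustrative sketch (standalone; "label" is a stand-in for _LABEL): the
# base_event above tags each sample with its label, so downstream constraints
# are enforced separately within y=0 and y=1, as in equalized odds.
import pandas as pd

y_train = pd.Series([0, 1, 1, 0])
base_event = y_train.apply(lambda v: "label=" + str(v))
# base_event: ["label=0", "label=1", "label=1", "label=0"]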
def load_data(self, X, y, *, sensitive_features, control_features=None):
    """Load the specified data into the object."""
    _, y_train, sf_train, cf_train = \
        _validate_and_reformat_input(X, y, enforce_binary_labels=True,
                                     sensitive_features=sensitive_features,
                                     control_features=control_features)
    # The `where` clause puts `np.nan` on all values where `y != 0`.
    base_event = y_train.apply(lambda v: _LABEL + "=" + str(v)).where(
        y_train == 0)
    event = _merge_event_and_control_columns(base_event, cf_train)
    super().load_data(X, y_train, event=event, sensitive_features=sf_train)
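# Illustrative sketch (standalone): pandas .where keeps values where the
# condition holds and fills NaN elsewhere, which is how samples with y != 0
# drop out of this moment's events.
import pandas as pd

y_train = pd.Series([0, 1, 0, 1])
base_event = y_train.apply(lambda v: "label=" + str(v)).where(y_train == 0)
# base_event: ["label=0", NaN, "label=0", NaN]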
def fit(self, x, y, **kwargs):
    _, y_train, sensitive_features = _validate_and_reformat_input(
        x, y, enforce_binary_labels=False, **kwargs)
    if self.loss == "square":
        # squared loss reweighting
        X, A, Y, W = augment.augment_data_sq(x, sensitive_features,
                                             y_train, self.Theta)
    elif self.loss == "absolute":
        # absolute loss reweighting (uniform)
        X, A, Y, W = augment.augment_data_ab(x, sensitive_features,
                                             y_train, self.Theta)
    elif self.loss == "logistic":
        # logistic reweighting
        X, A, Y, W = augment.augment_data_logistic(x, sensitive_features,
                                                   y_train, self.Theta)
    else:
        raise ValueError("Loss not supported: " + str(self.loss))
    if self.constraints == "DP":
        # demographic parity constraint
        self.constraints = DemographicParity_Theta()
        self.grid_search = GridSearch(self.estimator, self.constraints,
                                      self.selection_rule,
                                      self.constraint_weight, self.grid_size,
                                      self.grid_limit, self.grid_offset,
                                      self.grid, W)
        self.grid_search.fit(X, Y, sensitive_features=A)
    else:
        raise ValueError("Constraint not supported: " + str(self.constraints))
def _pmf_predict(self, X, *, sensitive_features):
    """Probability mass function.

    :param X: Feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param sensitive_features: Sensitive features to identify groups by,
        currently allows only a single column
    :type sensitive_features: Currently 1D array as numpy.ndarray, list,
        pandas.DataFrame, or pandas.Series
    :return: array of tuples with probabilities for predicting 0 or 1,
        respectively. The two numbers in each tuple need to add up to 1.
    :rtype: numpy.ndarray
    """
    self._validate_post_processed_predictor_is_fitted()
    _, _, sensitive_feature_vector = _validate_and_reformat_input(
        X, y=None, sensitive_features=sensitive_features, expect_y=False,
        enforce_binary_labels=True)
    positive_probs = _vectorized_prediction(
        self._post_processed_predictor_by_sensitive_feature,
        sensitive_feature_vector, self._unconstrained_predictor.predict(X))
    return np.array([[1.0 - p, p] for p in positive_probs])
def fit(self, X, y, **kwargs):
    """Run the grid search.

    This will result in multiple copies of the estimator being made, and the
    :code:`fit(X)` method of each one called.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    :param sensitive_features: A (currently) required keyword argument listing the
        feature used by the constraints object
    :type sensitive_features: numpy.ndarray, pandas.DataFrame, pandas.Series, or list (for now)
    """
    self.predictors_ = []
    self.lambda_vecs_ = pd.DataFrame(dtype=np.float64)
    self.objectives_ = []
    self.gammas_ = pd.DataFrame(dtype=np.float64)
    self.oracle_execution_times_ = []

    if isinstance(self.constraints, ClassificationMoment):
        logger.debug("Classification problem detected")
        is_classification_reduction = True
    else:
        logger.debug("Regression problem detected")
        is_classification_reduction = False

    _, y_train, sensitive_features_train = _validate_and_reformat_input(
        X, y, enforce_binary_labels=is_classification_reduction, **kwargs)

    kwargs[_KW_SENSITIVE_FEATURES] = sensitive_features_train

    # Prep the parity constraints and objective
    logger.debug("Preparing constraints and objective")
    self.constraints.load_data(X, y_train, **kwargs)
    objective = self.constraints.default_objective()
    objective.load_data(X, y_train, **kwargs)

    # Basis information
    pos_basis = self.constraints.pos_basis
    neg_basis = self.constraints.neg_basis
    neg_allowed = self.constraints.neg_basis_present
    objective_in_the_span = (self.constraints.default_objective_lambda_vec
                             is not None)

    if self.grid is None:
        logger.debug("Creating grid of size %i", self.grid_size)
        grid = _GridGenerator(self.grid_size, self.grid_limit, pos_basis,
                              neg_basis, neg_allowed, objective_in_the_span,
                              self.grid_offset).grid
    else:
        logger.debug("Using supplied grid")
        grid = self.grid

    # Fit the estimates
    logger.debug("Setup complete. Starting grid search")
    for i in grid.columns:
        lambda_vec = grid[i]
        logger.debug("Obtaining weights")
        weights = self.constraints.signed_weights(lambda_vec)
        if not objective_in_the_span:
            weights = weights + objective.signed_weights()

        if is_classification_reduction:
            logger.debug("Applying relabelling for classification problem")
            y_reduction = 1 * (weights > 0)
            weights = weights.abs()
        else:
            y_reduction = y_train

        y_reduction_unique = np.unique(y_reduction)
        if len(y_reduction_unique) == 1:
            logger.debug("y_reduction had single value. Using DummyClassifier")
            current_estimator = DummyClassifier(strategy='constant',
                                                constant=y_reduction_unique[0])
        else:
            logger.debug("Using underlying estimator")
            current_estimator = copy.deepcopy(self.estimator)

        oracle_call_start_time = time()
        current_estimator.fit(X, y_reduction, sample_weight=weights)
        oracle_call_execution_time = time() - oracle_call_start_time
        logger.debug("Call to estimator complete")

        def predict_fct(X):
            return current_estimator.predict(X)

        self.predictors_.append(current_estimator)
        self.lambda_vecs_[i] = lambda_vec
        self.objectives_.append(objective.gamma(predict_fct)[0])
        self.gammas_[i] = self.constraints.gamma(predict_fct)
        self.oracle_execution_times_.append(oracle_call_execution_time)

    logger.debug("Selecting best_result")
    if self.selection_rule == TRADEOFF_OPTIMIZATION:
        def loss_fct(i):
            return self.objective_weight * self.objectives_[i] + \
                self.constraint_weight * self.gammas_[i].max()
        losses = [loss_fct(i) for i in range(len(self.objectives_))]
        self.best_idx_ = losses.index(min(losses))
    else:
        raise RuntimeError("Unsupported selection rule")

    return
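# Hypothetical usage sketch for the grid search above (assumes the
# fairlearn-style public API; the toy data is made up):
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from fairlearn.reductions import DemographicParity, GridSearch

X = np.array([[0.0], [0.3], [0.6], [1.0]] * 10)
y = np.array([0, 0, 1, 1] * 10)
A = np.array(["a", "b", "a", "b"] * 10)

sweep = GridSearch(DecisionTreeClassifier(max_depth=2),
                   constraints=DemographicParity(),
                   grid_size=10)
sweep.fit(X, y, sensitive_features=A)
# sweep.predictors_ holds one fitted estimator per lambda vector in the grid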
def fit(self, X, y, *, sensitive_features, **kwargs):
    """Fit the model.

    The fit is based on training features and labels, sensitive features,
    as well as the fairness-unaware predictor or estimator. If an estimator
    was passed in the constructor this fit method will call
    `fit(X, y, **kwargs)` on said estimator.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    :param sensitive_features: sensitive features to identify groups by,
        currently allows only a single column
    :type sensitive_features: currently 1D array as numpy.ndarray, list,
        pandas.DataFrame, or pandas.Series
    """
    if self.estimator is None:
        raise ValueError(BASE_ESTIMATOR_NONE_ERROR_MESSAGE)

    if self.constraints in SIMPLE_CONSTRAINTS:
        if self.objective not in OBJECTIVES_FOR_SIMPLE_CONSTRAINTS:
            raise ValueError(
                NOT_SUPPORTED_OBJECTIVES_FOR_SIMPLE_CONSTRAINTS_ERROR_MESSAGE
                .format(self.constraints))
    elif self.constraints == "equalized_odds":
        if self.objective not in OBJECTIVES_FOR_EQUALIZED_ODDS:
            raise ValueError(
                NOT_SUPPORTED_OBJECTIVES_FOR_EQUALIZED_ODDS_ERROR_MESSAGE)
    else:
        raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

    _, _, sensitive_feature_vector = _validate_and_reformat_input(
        X, y, sensitive_features=sensitive_features,
        enforce_binary_labels=True)

    # postprocessing can't handle 0/1 as floating point numbers, so this converts it to int
    if type(y) in [np.ndarray, pd.DataFrame, pd.Series]:
        y = y.astype(int)
    else:
        y = [int(y_val) for y_val in y]

    if not self.prefit:
        # Following is on two lines due to issue when estimator comes from TensorFlow
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, y, **kwargs)
    else:
        try:
            check_is_fitted(self.estimator)
        except NotFittedError:
            warn(BASE_ESTIMATOR_NOT_FITTED_WARNING.format(type(self).__name__))
        self.estimator_ = self.estimator

    scores = self.estimator_.predict(X)

    if self.constraints == "equalized_odds":
        self.x_metric_ = "false_positive_rate"
        self.y_metric_ = "true_positive_rate"
        threshold_optimization_method = \
            self._threshold_optimization_for_equalized_odds
    else:
        self.x_metric_ = SIMPLE_CONSTRAINTS[self.constraints]
        self.y_metric_ = self.objective
        threshold_optimization_method = \
            self._threshold_optimization_for_simple_constraints

    self.interpolated_thresholder_ = threshold_optimization_method(
        sensitive_feature_vector, y, scores)
    return self
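# Hypothetical usage sketch for the newer fit above, pairing a simple
# constraint with an objective (assumes the fairlearn-style public API;
# the constraint and objective names are examples, not an exhaustive list):
from sklearn.linear_model import LogisticRegression
from fairlearn.postprocessing import ThresholdOptimizer

postprocessor = ThresholdOptimizer(estimator=LogisticRegression(),
                                   constraints="true_positive_rate_parity",
                                   objective="balanced_accuracy_score")
# after postprocessor.fit(X, y, sensitive_features=A), the fitted
# interpolated_thresholder_ produces the randomized group-specific thresholds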
def fit(self, X, y, **kwargs):
    """Return a fair classifier under specified fairness constraints.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    """
    self.lambda_vecs_EG_ = pd.DataFrame()
    self.lambda_vecs_LP_ = pd.DataFrame()
    self.lambda_vecs_ = pd.DataFrame()

    if isinstance(self.constraints, ClassificationMoment):
        logger.debug("Classification problem detected")
        is_classification_reduction = True
    else:
        logger.debug("Regression problem detected")
        is_classification_reduction = False

    _, y_train, sensitive_features = _validate_and_reformat_input(
        X, y, enforce_binary_labels=is_classification_reduction, **kwargs)

    n = y_train.shape[0]

    logger.debug("...Exponentiated Gradient STARTING")

    B = 1 / self.eps
    lagrangian = _Lagrangian(X, sensitive_features, y_train, self.estimator,
                             self.constraints, self.eps, B)

    theta = pd.Series(0, lagrangian.constraints.index)
    Qsum = pd.Series(dtype="float64")
    gaps_EG = []
    gaps = []
    Qs = []

    last_regret_checked = _REGRET_CHECK_START_T
    last_gap = np.PINF
    for t in range(0, self.max_iter):
        logger.debug("...iter=%03d", t)

        # set lambdas for every constraint
        lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
        self.lambda_vecs_EG_[t] = lambda_vec
        lambda_EG = self.lambda_vecs_EG_.mean(axis=1)

        # select classifier according to best_h method
        h, h_idx = lagrangian.best_h(lambda_vec)

        if t == 0:
            if self.nu is None:
                self.nu = _ACCURACY_MUL * (
                    h(X) - y_train).abs().std() / np.sqrt(n)
            eta_min = self.nu / (2 * B)
            eta = self.eta0 / B
            logger.debug(
                "...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                self.eps, B, self.nu, self.max_iter, eta_min)

        if h_idx not in Qsum.index:
            Qsum.at[h_idx] = 0.0
        Qsum[h_idx] += 1.0
        gamma = lagrangian.gammas[h_idx]
        Q_EG = Qsum / Qsum.sum()
        result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, self.nu)
        gap_EG = result_EG.gap()
        gaps_EG.append(gap_EG)

        if t == 0 or not self.run_linprog_step:
            gap_LP = np.PINF
        else:
            # saddle point optimization over the convex hull of
            # classifiers returned so far
            Q_LP, self.lambda_vecs_LP_[t], result_LP = \
                lagrangian.solve_linprog(self.nu)
            gap_LP = result_LP.gap()

        # keep values from exponentiated gradient or linear programming
        if gap_EG < gap_LP:
            Qs.append(Q_EG)
            gaps.append(gap_EG)
        else:
            Qs.append(Q_LP)
            gaps.append(gap_LP)

        logger.debug(
            "%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f, gap=%.6f, disp=%.3f, "
            "err=%.3f, gap_LP=%.6f",
            _INDENTATION, eta, result_EG.L_low, result_EG.L, result_EG.L_high,
            gap_EG, result_EG.gamma.max(), result_EG.error, gap_LP)

        if (gaps[t] < self.nu) and (t >= _MIN_ITER):
            # solution found
            break

        # update regret
        if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
            best_gap = min(gaps_EG)
            if best_gap > last_gap * _SHRINK_REGRET:
                eta *= _SHRINK_ETA
            last_regret_checked = t
            last_gap = best_gap

        # update theta based on learning rate
        theta += eta * (gamma - self.eps)

    # retain relevant result data
    gaps_series = pd.Series(gaps)
    gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
    self.best_iter_ = gaps_best.index[-1]
    self.best_gap_ = gaps[self.best_iter_]
    self.weights_ = Qs[self.best_iter_]
    self._hs = lagrangian.hs
    for h_idx in self._hs.index:
        if h_idx not in self.weights_.index:
            self.weights_.at[h_idx] = 0.0

    self.last_iter_ = len(Qs) - 1
    self.predictors_ = lagrangian.predictors
    self.n_oracle_calls_ = lagrangian.n_oracle_calls
    self.n_oracle_calls_dummy_returned_ = lagrangian.n_oracle_calls_dummy_returned
    self.oracle_execution_times_ = lagrangian.oracle_execution_times
    self.lambda_vecs_ = lagrangian.lambdas

    logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                 self.eps, B, self.nu, self.max_iter, eta_min)
    logger.debug(
        "...last_iter=%d, best_iter=%d, best_gap=%.6f, n_oracle_calls=%d, n_hs=%d",
        self.last_iter_, self.best_iter_, self.best_gap_,
        lagrangian.n_oracle_calls, len(lagrangian.predictors))
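# Hypothetical usage sketch for the fit above (assumes the fairlearn-style
# public API; the toy data is made up):
import numpy as np
from sklearn.linear_model import LogisticRegression
from fairlearn.reductions import DemographicParity, ExponentiatedGradient

X = np.array([[0.0], [0.3], [0.6], [1.0]] * 10)
y = np.array([0, 0, 1, 1] * 10)
A = np.array(["a", "b", "a", "b"] * 10)

mitigator = ExponentiatedGradient(LogisticRegression(),
                                  constraints=DemographicParity(),
                                  eps=0.05)
mitigator.fit(X, y, sensitive_features=A)
# weights_ is the mixing distribution over predictors_ in the returned
# randomized classifier; best_gap_ is the duality gap at best_iter_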
def fit(self, X, y, **kwargs):
    """Return a fair classifier under specified fairness constraints.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    """
    _, y_train, A = _validate_and_reformat_input(X, y, **kwargs)

    n = y_train.shape[0]

    logger.debug("...Exponentiated Gradient STARTING")

    B = 1 / self._eps
    lagrangian = _Lagrangian(X, A, y_train, self._estimator,
                             self._constraints, self._eps, B)

    theta = pd.Series(0, lagrangian.constraints.index)
    Qsum = pd.Series()
    lambdas = pd.DataFrame()
    gaps_EG = []
    gaps = []
    Qs = []

    last_regret_checked = _REGRET_CHECK_START_T
    last_gap = np.PINF
    for t in range(0, self._T):
        logger.debug("...iter=%03d", t)

        # set lambdas for every constraint
        lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
        lambdas[t] = lambda_vec
        lambda_EG = lambdas.mean(axis=1)

        # select classifier according to best_h method
        h, h_idx = lagrangian.best_h(lambda_vec)
        pred_h = h(X)

        if t == 0:
            if self._nu is None:
                self._nu = _ACCURACY_MUL * (
                    pred_h - y_train).abs().std() / np.sqrt(n)
            eta_min = self._nu / (2 * B)
            eta = self._eta_mul / B
            logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f",
                         self._eps, B, self._nu, self._T, eta_min)

        if h_idx not in Qsum.index:
            Qsum.at[h_idx] = 0.0
        Qsum[h_idx] += 1.0
        gamma = lagrangian.gammas[h_idx]
        Q_EG = Qsum / Qsum.sum()
        result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, self._nu)
        gap_EG = result_EG.gap()
        gaps_EG.append(gap_EG)

        if t == 0 or not _RUN_LP_STEP:
            gap_LP = np.PINF
        else:
            # saddle point optimization over the convex hull of
            # classifiers returned so far
            Q_LP, _, result_LP = lagrangian.solve_linprog(self._nu)
            gap_LP = result_LP.gap()

        # keep values from exponentiated gradient or linear programming
        if gap_EG < gap_LP:
            Qs.append(Q_EG)
            gaps.append(gap_EG)
        else:
            Qs.append(Q_LP)
            gaps.append(gap_LP)

        logger.debug(
            "%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f"
            ", gap=%.6f, disp=%.3f, err=%.3f, gap_LP=%.6f",
            _INDENTATION, eta, result_EG.L_low, result_EG.L, result_EG.L_high,
            gap_EG, result_EG.gamma.max(), result_EG.error, gap_LP)

        if (gaps[t] < self._nu) and (t >= _MIN_T):
            # solution found
            break

        # update regret
        if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
            best_gap = min(gaps_EG)
            if best_gap > last_gap * _SHRINK_REGRET:
                eta *= _SHRINK_ETA
            last_regret_checked = t
            last_gap = best_gap

        # update theta based on learning rate
        theta += eta * (gamma - self._eps)

    # retain relevant result data
    gaps_series = pd.Series(gaps)
    gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
    self._best_t = gaps_best.index[-1]
    self._best_gap = gaps[self._best_t]
    self._weights = Qs[self._best_t]
    hs = lagrangian.hs
    for h_idx in hs.index:
        if h_idx not in self._weights.index:
            self._weights.at[h_idx] = 0.0

    self._last_t = len(Qs) - 1
    self._best_classifier = lambda X: _mean_pred(X, hs, self._weights)
    self._classifiers = lagrangian.classifiers
    self._n_oracle_calls = lagrangian.n_oracle_calls
    self._oracle_calls_execution_time = lagrangian.oracle_calls_execution_time

    logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f",
                 self._eps, B, self._nu, self._T, eta_min)
    logger.debug(
        "...last_t=%d, best_t=%d, best_gap=%.6f, n_oracle_calls=%d, n_hs=%d",
        self._last_t, self._best_t, self._best_gap,
        lagrangian.n_oracle_calls, len(lagrangian.classifiers))
def fit(self, X, y, **kwargs):
    """Return a fair classifier under specified fairness constraints.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    """
    # EG: exponentiated gradient; LP: linear programming
    self.lambda_vecs_EG_ = pd.DataFrame()
    self.lambda_vecs_LP_ = pd.DataFrame()
    self.lambda_vecs_ = pd.DataFrame()

    if isinstance(self.constraints, ClassificationMoment):
        logger.debug("Classification problem detected")
        is_classification_reduction = True
    else:
        logger.debug("Regression problem detected")
        is_classification_reduction = False

    _, y_train, sensitive_features = _validate_and_reformat_input(
        X, y, enforce_binary_labels=is_classification_reduction, **kwargs)

    # Note that certain estimators rely on metadata encoded in X which may be
    # stripped during the reformatting process, so mitigation methods should
    # ideally use the input X instead of the returned X for training
    # estimators and leave potential reformatting of X to the estimator.
    n = y_train.shape[0]

    if self.error_weights is None:
        self.error_weights = pd.Series(1, y_train.index)
    else:
        self.error_weights = n * self.error_weights / self.error_weights.sum()

    logger.debug("...Exponentiated Gradient STARTING")

    # Per the error analysis, B is proportional to the reciprocal of eps.
    B = 1 / self.eps
    lagrangian = _Lagrangian(X, sensitive_features, y_train, self.estimator,
                             self.constraints, self.eps, B,
                             error_weights=self.error_weights)

    theta = pd.Series(0, lagrangian.constraints.index)  # starting value is 0
    Qsum = pd.Series(dtype="float64")
    gaps_EG = []
    gaps = []
    Qs = []

    last_regret_checked = _REGRET_CHECK_START_T  # default value is 5
    last_gap = np.PINF
    for t in range(0, self.max_iter):
        logger.debug("...iter=%03d", t)

        # set lambdas for every constraint
        lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
        self.lambda_vecs_EG_[t] = lambda_vec
        # lambda_hat: the running mean of the lambda vectors so far
        lambda_EG = self.lambda_vecs_EG_.mean(axis=1)

        # select classifier according to best_h method
        h, h_idx = lagrangian.best_h(lambda_vec)

        # nu and the learning rate eta are initialized from the first best response
        if t == 0:
            if self.nu is None:
                # unweighted variant:
                # self.nu = _ACCURACY_MUL * (h(X) - y_train).abs().std() / np.sqrt(n)
                self.nu = _ACCURACY_MUL * (
                    self.error_weights *
                    (h(X) - y_train)).abs().std() / np.sqrt(n)
            eta_min = self.nu / (2 * B)
            eta = self.eta0 / B
            logger.debug(
                "...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                self.eps, B, self.nu, self.max_iter, eta_min)

        # the best response may be a previously seen classifier
        if h_idx not in Qsum.index:
            Qsum.at[h_idx] = 0.0
        Qsum[h_idx] += 1.0
        gamma = lagrangian.gammas[h_idx]
        Q_EG = Qsum / Qsum.sum()  # Q_hat: empirical distribution over best responses
        # eval_gap computes L_low and L_high by calling best_h and best_lambda
        result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, self.nu)
        gap_EG = result_EG.gap()
        gaps_EG.append(gap_EG)

        if t == 0 or not self.run_linprog_step:
            gap_LP = np.PINF
        else:
            # saddle point optimization over the convex hull of
            # classifiers returned so far
            Q_LP, self.lambda_vecs_LP_[t], result_LP = \
                lagrangian.solve_linprog(self.nu)
            gap_LP = result_LP.gap()

        # keep values from exponentiated gradient or linear programming
        if gap_EG < gap_LP:
            Qs.append(Q_EG)
            gaps.append(gap_EG)
        else:
            Qs.append(Q_LP)
            gaps.append(gap_LP)

        logger.debug(
            "%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f, gap=%.6f, disp=%.3f, "
            "err=%.3f, gap_LP=%.6f",
            _INDENTATION, eta, result_EG.L_low, result_EG.L, result_EG.L_high,
            gap_EG, result_EG.gamma.max(), result_EG.error, gap_LP)

        if (gaps[t] < self.nu) and (t >= _MIN_ITER):
            # solution found
            break

        # update regret
        if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
            best_gap = min(gaps_EG)
            if best_gap > last_gap * _SHRINK_REGRET:
                eta *= _SHRINK_ETA
            last_regret_checked = t
            last_gap = best_gap

        # update theta based on learning rate
        theta += eta * (gamma - self.eps)

    # retain relevant result data
    gaps_series = pd.Series(gaps)
    gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
    self.best_iter_ = gaps_best.index[-1]
    self.best_gap_ = gaps[self.best_iter_]
    self.weights_ = Qs[self.best_iter_]  # best Q
    self._hs = lagrangian.hs
    for h_idx in self._hs.index:
        if h_idx not in self.weights_.index:
            self.weights_.at[h_idx] = 0.0

    self.last_iter_ = len(Qs) - 1
    self.predictors_ = lagrangian.predictors
    self.n_oracle_calls_ = lagrangian.n_oracle_calls
    self.n_oracle_calls_dummy_returned_ = lagrangian.n_oracle_calls_dummy_returned
    self.oracle_execution_times_ = lagrangian.oracle_execution_times
    self.lambda_vecs_ = lagrangian.lambdas

    logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                 self.eps, B, self.nu, self.max_iter, eta_min)
    logger.debug(
        "...last_iter=%d, best_iter=%d, best_gap=%.6f, n_oracle_calls=%d, n_hs=%d",
        self.last_iter_, self.best_iter_, self.best_gap_,
        lagrangian.n_oracle_calls, len(lagrangian.predictors))
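# Illustrative sketch (standalone, made-up numbers): the multiplicative-
# weights update used above. theta accumulates eta * (gamma - eps) per
# constraint; lambda_vec is its softmax-like transform, so every entry is
# positive and the entries sum to less than B.
import numpy as np
import pandas as pd

B = 10.0
theta = pd.Series([0.2, -0.1], index=["g=a", "g=b"])
lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
# lambda_vec.sum() == B * exp(theta).sum() / (1 + exp(theta).sum()) < B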
def fit(self, X, y, **kwargs):
    """Run the grid search.

    This will result in multiple copies of the estimator being made, and the
    :code:`fit(X)` method of each one called.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    :param sensitive_features: A (currently) required keyword argument listing the
        feature used by the constraints object
    :type sensitive_features: numpy.ndarray, pandas.DataFrame, pandas.Series, or list (for now)
    """
    if isinstance(self.constraints, ClassificationMoment):
        logger.debug("Classification problem detected")
        is_classification_reduction = True
    else:
        logger.debug("Regression problem detected")
        is_classification_reduction = False

    X_train, y_train, sensitive_features_train = _validate_and_reformat_input(
        X, y, enforce_binary_sensitive_feature=True,
        enforce_binary_labels=is_classification_reduction, **kwargs)

    kwargs[_KW_SENSITIVE_FEATURES] = sensitive_features_train

    # Prep the parity constraints and objective
    logger.debug("Preparing constraints and objective")
    self.constraints.load_data(X_train, y_train, **kwargs)
    objective = self.constraints.default_objective()
    objective.load_data(X_train, y_train, **kwargs)

    # Basis information
    pos_basis = self.constraints.pos_basis
    neg_basis = self.constraints.neg_basis
    neg_allowed = self.constraints.neg_basis_present
    objective_in_the_span = (self.constraints.default_objective_lambda_vec
                             is not None)

    if self.grid is None:
        logger.debug("Creating grid of size %i", self.grid_size)
        grid = _GridGenerator(self.grid_size, self.grid_limit, pos_basis,
                              neg_basis, neg_allowed,
                              objective_in_the_span).grid
    else:
        logger.debug("Using supplied grid")
        grid = self.grid

    # Fit the estimates
    logger.debug("Setup complete. Starting grid search")
    self._all_results = []
    for i in grid.columns:
        lambda_vec = grid[i]
        logger.debug("Obtaining weights")
        weights = self.constraints.signed_weights(lambda_vec)
        if not objective_in_the_span:
            weights = weights + objective.signed_weights()

        if is_classification_reduction:
            logger.debug("Applying relabelling for classification problem")
            y_reduction = 1 * (weights > 0)
            weights = weights.abs()
        else:
            y_reduction = y_train

        current_estimator = copy.deepcopy(self.estimator)
        logger.debug("Calling underlying estimator")
        oracle_call_start_time = time()
        current_estimator.fit(X, y_reduction, sample_weight=weights)
        oracle_call_execution_time = time() - oracle_call_start_time
        logger.debug("Call to underlying estimator complete")

        def predict_fct(X):
            return current_estimator.predict(X)

        nxt = GridSearchResult(current_estimator, lambda_vec,
                               objective.gamma(predict_fct)[0],
                               self.constraints.gamma(predict_fct),
                               oracle_call_execution_time)
        self._all_results.append(nxt)

    logger.debug("Selecting best_result")
    if self.selection_rule == TRADEOFF_OPTIMIZATION:
        def loss_fct(x):
            return self.objective_weight * x.objective + \
                self.constraint_weight * x.gamma.max()
        self._best_result = min(self._all_results, key=loss_fct)
    else:
        raise RuntimeError("Unsupported selection rule")

    return