def _get_error_rate_and_constraints(preds, labels, didi, I):
    """
    Evaluate the prediction error together with the fairness constraint violation.

    :param preds: model predictions.
    :param labels: reference targets the predictions are compared against.
    :param didi: reference DIDI value; the fairness bound is 20% of it.
    :param I: indicator matrices forwarded to ``utils.didi_r``.
    :return: a pair ``(error, violations)`` where ``error`` is the value of
        ``utils.mean_squared_error(preds, labels)`` and ``violations`` is a
        one-element list holding ``utils.didi_r(preds, I) - 0.2 * didi``.
    """
    mse = utils.mean_squared_error(preds, labels)
    violations = [utils.didi_r(preds, I) - 0.2 * didi]
    return mse, violations
def _training_generator(self,
                        x,
                        y,
                        minibatch_size,
                        num_iterations_per_loop=1,
                        num_loops=1):
    """
    Generator that trains the model and yields its raw predictions after every loop.

    Each loop runs ``num_iterations_per_loop`` minibatch updates of
    ``self.train_op``. Minibatches are consecutive windows over a randomly
    shuffled ordering of the rows; a window that reaches the end of the data
    wraps around to the beginning.

    :param x: input array, indexed along its first axis.
    :param y: target array, indexed with the same row indices as ``x``.
    :param minibatch_size: requested minibatch size (capped at the number of rows).
    :param num_iterations_per_loop: number of training steps between two yields.
    :param num_loops: number of yields.
    :return: yields the output of ``self.predictions_tensor`` evaluated on ``x``.
    """
    n_samples = x.shape[0]
    minibatch_size = min(minibatch_size, n_samples)
    shuffled = list(range(n_samples))
    random.shuffle(shuffled)

    cursor = 0
    for _loop in xrange(num_loops):
        for _ in xrange(num_iterations_per_loop):
            # Collect positions until the minibatch is full, restarting from
            # position 0 whenever the window runs past the last row.
            positions = []
            while len(positions) < minibatch_size:
                stop = cursor + minibatch_size - len(positions)
                if stop < n_samples:
                    positions += range(cursor, stop)
                    cursor = stop
                else:
                    positions += range(cursor, n_samples)
                    cursor = 0

            # Map window positions to actual row indices through the shuffle.
            rows = [shuffled[pos] for pos in positions]
            batch_feed = self._feed_dict_helper(
                x[rows],
                y[rows],
                [indicator[rows] for indicator in self.I_train.values()])
            self.session.run(self.train_op, feed_dict=batch_feed)

        # Constraint values on the full data; evaluated but not used further
        # (kept for debugging purposes).
        slack = self.session.run(
            self.mp.constraints(),
            feed_dict=self._feed_dict_helper(
                x, y, list(self.I_train.values())))

        raw_output = self.session.run(
            self.predictions_tensor,
            feed_dict=self._feed_dict_helper(x))

        # Thresholded predictions and relative DIDI; also debugging-only values.
        hard_preds = (1 + np.sign(raw_output)) / 2
        perc_didi = utils.didi_r(hard_preds, self.I_train) / self.didi_tr

        yield raw_output
def cst_info(self, x, y):
    """
    Report the constraint satisfaction level associated to the inputs.

    Whether the data belongs to the train or the test set is inferred from the
    number of points in ``x``: it is matched against the length of the first
    train indicator vector and then against the first test one.

    :param x: input data; only its length is used.
    :param y: values whose DIDI is computed via ``utils.didi_r``.
    :return: dictionary with the single key ``'DIDI perc. index'``, i.e. the
        DIDI of ``y`` divided by the reference DIDI of the matching set.
    :raises ValueError: if the length of ``x`` matches neither set.
    """
    n_points = len(x)
    n_train = len(self.I_train[0])

    if n_points == n_train:
        # The train set takes precedence when both sets have the same size.
        indicators, reference = self.I_train, self.didi_tr
    else:
        n_test = len(self.I_test[0])
        if n_points != n_test:
            raise ValueError(
                "Cannot infer indicator matrix from input data. Input array has "
                "shape %d, with matrices having shape %d and %d" %
                (n_points, n_train, n_test))
        indicators, reference = self.I_test, self.didi_ts

    return {'DIDI perc. index': utils.didi_r(y, indicators) / reference}
def cross_val():
    """
    Run a 5-fold cross-validation of the TFCO fair classifier on the adult data.

    For every fold the non-protected features and the targets are min-max
    scaled (scalers fitted on the training part only), the indicator matrices
    and the reference DIDI values are computed, and a ``TFCOFairCls`` model is
    trained. Accuracy and relative DIDI are then reported for three solutions:
    the last iterate, the best iterate and the m-stochastic solution. Mean and
    standard deviation of every metric over the folds are printed at the end.
    """
    # New class implementation.
    xnp, xp, y = data_gen.load_adult()

    results = {
        'Last_iterate_train_acc': [],
        'Last_iterate_test_acc': [],
        'Last_iterate_train_ct': [],
        'Last_iterate_test_ct': [],
        'Best_iterate_train_acc': [],
        'Best_iterate_test_acc': [],
        'Best_iterate_train_ct': [],
        'Best_iterate_test_ct': [],
        'Stoch_iterate_train_acc': [],
        'Stoch_iterate_test_acc': [],
        'Stoch_iterate_train_ct': [],
        'Stoch_iterate_test_ct': [],
    }

    nfolds = 5
    fsize = int(np.ceil(len(xnp) / nfolds))
    for fidx in range(nfolds):
        print(f'\n### Processing fold {fidx}')
        # Build a full index set
        idx = np.arange(len(xnp))
        # Separate index sets: fold `fidx` is a contiguous slice used as test set.
        tridx = np.hstack((idx[:fidx * fsize], idx[(fidx + 1) * fsize:]))
        tsidx = idx[fidx * fsize:(fidx + 1) * fsize]

        # Separate training and test data
        xptr = xp[tridx]
        ytr = y[tridx]
        xpts = xp[tsidx]
        yts = y[tsidx]

        # Standardize the non-protected features (scaler fitted on the train set).
        scl = MinMaxScaler()
        xnptr = scl.fit_transform(xnp[tridx])
        xnpts = scl.transform(xnp[tsidx])

        # Add protected features.
        xtr = np.hstack([xnptr, xptr])
        xts = np.hstack([xnpts, xpts])

        # Scale the targets with a separate scaler.
        scl = MinMaxScaler()
        ytr = scl.fit_transform(ytr)
        yts = scl.transform(yts)

        print("Computing indicator matrices.")
        I_train = utils.compute_indicator_matrix_c(xptr)
        I_test = utils.compute_indicator_matrix_c(xpts)
        didi_tr = utils.didi_c(ytr, I_train)
        didi_ts = utils.didi_c(yts, I_test)

        tfco_model = TFCOFairCls(input_dim=xtr.shape[1], output_dim=1,
                                 I_train=I_train, didi_tr=didi_tr)

        minibatch_size = 200
        iterations_per_loop = 200
        loops = 100

        train_pred, test_pred = tfco_model._full_training(
            xtr, xts, ytr, minibatch_size, iterations_per_loop, loops)

        # Metrics of every iterate on the training set.
        train_errors = []
        train_violations = []
        train_didi = []
        train_acc = []

        for p in train_pred:
            p_class = (1 + np.sign(p)) / 2
            err, viol = _get_error_rate_and_didi(p, ytr.reshape(-1, 1), didi_tr, I_train)
            acc = accuracy_score(ytr, p_class)
            didi = utils.didi_r(p_class, I_train) / didi_tr

            train_errors.append(err)
            train_violations.append(viol)
            train_didi.append(didi)
            train_acc.append(acc)

        # Metrics of every iterate on the test set.
        test_errors = []
        test_violations = []
        test_didi = []
        test_acc = []

        for p in test_pred:
            p_class = (1 + np.sign(p)) / 2
            err, viol = _get_error_rate_and_didi(p, yts.reshape(-1, 1), didi_ts, I_test)
            acc = accuracy_score(yts, p_class)
            didi = utils.didi_r(p_class, I_test) / didi_ts

            test_errors.append(err)
            test_violations.append(viol)
            test_didi.append(didi)
            test_acc.append(acc)

        train_violations = np.array(train_violations)

        print("Train Acc.", train_acc[-1])
        print("Train DIDI.", train_didi[-1])
        print("Test Acc.", test_acc[-1])
        print("Test DIDI.", test_didi[-1])

        print("Improving using Best Iterate instead of Last Iterate.")
        #
        # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the
        # best choice; the paper suggests a simple heuristic to choose the best iterate out of the ones found
        # after each epoch. The heuristic ranks each of the solutions by accuracy and by fairness separately,
        # with respect to the training data. Any solutions which satisfy the constraints are equally ranked top
        # in terms of fairness. Each solution thus has two ranks, and the chosen solution is the one with the
        # smallest maximum of the two ranks.
        #
        # This solution can be calculated using find_best_candidate_index given the list of training errors and
        # violations associated with each of the epochs.
        best_cand_index = tfco.find_best_candidate_index(
            train_errors, train_violations)

        print("Train Acc.", train_acc[best_cand_index])
        print("Train DIDI.", train_didi[best_cand_index])
        print("Test Acc.", test_acc[best_cand_index])
        # Fixed: this line used to print the test accuracy under the DIDI label.
        print("Test DIDI.", test_didi[best_cand_index])

        print("m-stochastic solution.")
        # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic
        # solution down to one that is supported on at most (m+1) points, where m is the number of constraints,
        # and is guaranteed to be at least as good as the T-stochastic solution.
        #
        # This solution can be computed using find_best_candidate_distribution by passing in the training errors
        # and violations found at each epoch; it returns the weight of each constituent.
        cand_dist = tfco.find_best_candidate_distribution(
            train_errors, train_violations)
        print(cand_dist)

        # Expected metrics under the candidate distribution.
        m_stoch_train_acc = np.dot(cand_dist, train_acc)
        m_stoch_train_didi = np.dot(cand_dist, train_didi)
        m_stoch_test_acc = np.dot(cand_dist, test_acc)
        m_stoch_test_didi = np.dot(cand_dist, test_didi)

        print("Train Acc", m_stoch_train_acc)
        print("Train DIDI", m_stoch_train_didi)
        print("Test Acc", m_stoch_test_acc)
        print("Test DIDI", m_stoch_test_didi)

        results['Last_iterate_train_acc'].append(train_acc[-1])
        results['Last_iterate_test_acc'].append(test_acc[-1])
        results['Last_iterate_train_ct'].append(train_didi[-1])
        results['Last_iterate_test_ct'].append(test_didi[-1])
        results['Best_iterate_train_acc'].append(train_acc[best_cand_index])
        results['Best_iterate_test_acc'].append(test_acc[best_cand_index])
        results['Best_iterate_train_ct'].append(train_didi[best_cand_index])
        results['Best_iterate_test_ct'].append(test_didi[best_cand_index])
        results['Stoch_iterate_train_acc'].append(m_stoch_train_acc)
        results['Stoch_iterate_test_acc'].append(m_stoch_test_acc)
        results['Stoch_iterate_train_ct'].append(m_stoch_train_didi)
        results['Stoch_iterate_test_ct'].append(m_stoch_test_didi)

    # Summary over the folds: mean and standard deviation of every metric.
    for k, val in results.items():
        print(k, np.mean(val), np.std(val))
def adjust_targets(self, y, p, alpha, beta, use_prob):
    """
    Solve the optimization model returning the targets closest to ``y`` that
    satisfy the fairness constraint.

    The model has one continuous variable in [0, 1] per data point. The sum
    over the protected groups of the absolute difference between the overall
    mean and the group mean of the variables is constrained to stay below
    ``self.constraint_value``.

    :param y: true targets (flattened internally).
    :param p: current model predictions (flattened internally).
    :param alpha: weight of the regularization term; the distance from ``p``
        is scaled by ``1 / alpha`` in the objective.
    :param beta: radius of the ball around ``p`` within which the search is
        constrained when ``p`` is already feasible and ``beta >= 0``.
    :param use_prob: not used by this method.
    :return: numpy array with the adjusted targets.
    """
    assert (alpha == 0 or p is not None)
    # NOTE(review): p is flattened unconditionally right below, so p=None fails
    # even when alpha == 0; likewise (1.0 / alpha) in the regularized objective
    # cannot be evaluated for alpha == 0 - confirm the intended contract.

    # Input adjusting.
    y = y.reshape(-1)
    p = p.reshape(-1)

    # Determine whether the current predictions already satisfy the constraint.
    _feasible = (utils.didi_r(p, self.I_train) <= self.constraint_value)

    # Model declaration, with a time limit.
    mod = CPModel('Fairness Reg Problem')
    mod.parameters.timelimit = _CPLEX_TIME_LIMIT

    # Variable declaration.
    n_points = len(y)
    idx_var = list(range(n_points))
    x = mod.continuous_var_list(keys=idx_var, lb=0.0, ub=1.0, name='y')

    # Fairness constraint: instead of adding a penalization term to the objective function, the fairness
    # term is imposed to stay below a certain threshold.
    abs_val = mod.continuous_var_list(keys=self.I_train.keys())
    for key, val in self.I_train.items():
        Np = np.sum(val)
        if not Np > 0:
            # Empty group: nothing to constrain.
            continue
        group_gap = (1.0 / n_points) * mod.sum(x) - \
                    (1.0 / Np) * mod.sum([val[j] * x[j] for j in idx_var])
        # Linearization of the absolute value.
        mod.add_constraint(abs_val[key] >= group_gap)
        mod.add_constraint(abs_val[key] >= -group_gap)

    constraint = .0 + mod.sum(abs_val)
    mod.add_constraint(constraint <= self.constraint_value, ctname='fairness_cnst')

    def mean_squared_gap(reference):
        # Mean squared distance between the model variables and `reference`.
        return (1.0 / n_points) * mod.sum(
            [(reference[i] - x[i]) * (reference[i] - x[i]) for i in idx_var])

    # Objective Function.
    y_loss = mean_squared_gap(y)
    p_loss = mean_squared_gap(p)

    if _feasible and beta >= 0:
        # Constrain search on a ball.
        mod.add(p_loss <= beta)
        mod.minimize(y_loss)
    else:
        # Adds a regularization term to make sure the new targets are not too far from the actual
        # network's output.
        mod.minimize(y_loss + (1.0 / alpha) * p_loss)

    # A previously explored "ball search" variant (231020) first minimized the distance from p on a
    # cloned model to find the minimum radius that assures feasibility, and then imposed 1.05 times
    # that radius as a constraint before minimizing y_loss. It is currently disabled.

    # Problem solving.
    mod.solve()

    # Check solution.
    self._check_solution(mod)

    # Obtain the adjusted targets.
    y_opt = np.array([var.solution_value for var in x])

    return y_opt
def validate(self):
    """
    Run the MACS process on every validation fold and store the logged results.

    For each of the ``self.nfolds`` folds the data is split through
    ``self.get_train_val_index``, scaled with ``self.scaler``, and a master
    and a learner are built according to ``self.dataset``, ``self.mtype`` and
    ``self.ltype``. The results collected by the logger are stored in
    ``self.results`` under the key ``fold_<index>``.

    :raises ValueError: if the master type or the learner type is unknown for
        the selected dataset.
    :raises NotImplementedError: for the 'cnd' learner on the adult dataset.
    """

    def make_learner(factories):
        # Instantiate the learner selected by self.ltype among the available ones.
        factory = factories.get(self.ltype)
        if factory is None:
            raise ValueError(f'Unknown learner type "{self.ltype}"')
        return factory()

    for ii in range(self.nfolds):
        is_balance = self.dataset in BALANCE_DATASET

        # TRAIN TEST SPLIT
        train_idx, test_idx = self.get_train_val_index(ii)
        xnp_train, xnp_test = self.xnp_tr[train_idx], self.xnp_tr[test_idx]
        y_train, y_test = self.y_tr[train_idx], self.y_tr[test_idx]

        # STANDARDIZATION (scaler fitted on the train split only).
        if is_balance:
            # Targets are left unscaled here.
            x_train = self.scaler.fit_transform(xnp_train)
            x_test = self.scaler.transform(xnp_test)
        else:
            xp_train, xp_test = self.xp_tr[train_idx], self.xp_tr[test_idx]
            # Scale the non-protected features, then add the protected ones.
            x_train = np.hstack([self.scaler.fit_transform(xnp_train), xp_train])
            x_test = np.hstack([self.scaler.transform(xnp_test), xp_test])
            # The same scaler object is re-fitted on the targets.
            y_train = self.scaler.fit_transform(y_train)
            y_test = self.scaler.transform(y_test)

        if is_balance:
            # Data shapes.
            input_dim = x_train.shape[1]
            output_dim = len(np.unique(y_train))

            # Build the master
            if self.mtype != 'balance':
                raise ValueError(f'Unknown master type "{self.mtype}"')
            self.master = BalancedCountsMaster(nclasses=len(np.unique(y_train)))

            # Build the learner.
            self.learner = make_learner({
                'cvx': lambda: cls.BalanceMultiLogRegressor(self.alpha),
                'sbrnn': lambda: cls.SBRNN(input_dim, output_dim, self.alpha),
                'lbrf': lambda: cls.LowBiasRandomForestLearner(input_dim, output_dim),
                'lr': lambda: cls.LogisticRegressionLearner(input_dim, output_dim),
                'rf': lambda: cls.RandomForestLearner(input_dim, output_dim),
                'nn': lambda: cls.NeuralNetworkLearner(input_dim, output_dim),
            })

        elif self.dataset == 'adult':
            print("Computing indicator matrices.")
            I_train = utils.compute_indicator_matrix_c(xp_train)
            I_test = utils.compute_indicator_matrix_c(xp_test)
            didi_tr = utils.didi_c(y_train, I_train)
            didi_ts = utils.didi_c(y_test, I_test)

            # Build the master
            if self.mtype != 'fairness':
                raise ValueError(f'Unknown master type "{self.mtype}"')
            self.master = FairnessClsMaster(I_train, I_test, didi_tr, didi_ts)

            input_dim = x_train.shape[1]
            output_dim = len(np.unique(y_train))

            # Build the learner.
            if self.ltype == 'cnd':
                # Kamiran and Calders method: not available.
                raise NotImplementedError
            self.learner = make_learner({
                'cvx': lambda: cls.FairBinLogRegressor(self.alpha, I_train),
                # The TFCO classifier uses a single output unit.
                'tfco': lambda: tfco_cls.TFCOFairCls(x_train.shape[1], 1, I_train, didi_tr),
                'lbrf': lambda: cls.LowBiasRandomForestLearner(input_dim, output_dim),
                'lr': lambda: cls.LogisticRegressionLearner(input_dim, output_dim),
                'rf': lambda: cls.RandomForestLearner(input_dim, output_dim),
                'nn': lambda: cls.NeuralNetworkLearner(input_dim, output_dim),
            })

        elif self.dataset == 'crime':
            print("Computing indicator matrices.")
            I_train = utils.compute_indicator_matrix_r(xp_train)
            I_test = utils.compute_indicator_matrix_r(xp_test)
            didi_tr = utils.didi_r(y_train, I_train)
            didi_ts = utils.didi_r(y_test, I_test)

            # Build the master
            if self.mtype != 'fairness':
                raise ValueError(f'Unknown master type "{self.mtype}"')
            self.master = FairnessRegMaster(I_train, I_test, didi_tr, didi_ts)

            # Build the learner.
            self.learner = make_learner({
                'cvx': lambda: rgs.FairRegressor(self.alpha, I_train),
                'tfco': lambda: tfco_reg.TFCOFairReg(x_train.shape[1], 1, I_train, didi_tr),
                'lbrf': lambda: rgs.LowBiasRandomForestLearner(),
                'lr': lambda: rgs.LRegressor(),
                'gb': lambda: rgs.GBTree(),
                'nn': lambda: rgs.Net((x_train.shape[1],), 1),
            })

        # Loggers
        params = dict(fold=ii, alpha=self.alpha, beta=self.beta,
                      init=self.initial_step, use_prob=self.use_prob)
        wb_log = WandBLogger(self.learner, self.master, x_train, y_train, x_test, y_test,
                             params, f'{self.dataset}')
        cst_log = CustomLogger(self.learner, self.master, x_train, y_train, nfold=ii,
                               x_test=x_test, y_test=y_test)
        self.logger = MultiLogger([cst_log, wb_log])

        # Start the MACS process
        mp = macs.MACS(self.learner, self.master, self.logger)
        mp.fit(x_train, y_train, self.iterations, self.alpha, self.beta, self.initial_step,
               use_prob=self.use_prob)

        self.results[f'fold_{ii}'] = self.logger.results
def test():
    """
    Train the TFCO fair regressor on the crime data and print its metrics.

    The first 80% of the points are used for training and the rest for
    testing. Non-protected features and targets are min-max scaled with
    scalers fitted on the training part. R2 score and relative DIDI are
    printed for the last iterate, the best iterate and the m-stochastic
    solution.
    """
    # Data with our preprocessing routines.
    from sklearn.preprocessing import MinMaxScaler
    xnp, xp, y = data_gen.load_crime()

    scl = MinMaxScaler()
    train_pts = int(0.8 * len(xnp))
    xnptr = scl.fit_transform(xnp[:train_pts])
    xnpts = scl.transform(xnp[train_pts:])
    xptr = xp[:train_pts]
    xpts = xp[train_pts:]
    ytr = y[:train_pts]
    yts = y[train_pts:]

    # Add protected features.
    xtr = np.hstack([xnptr, xptr])
    xts = np.hstack([xnpts, xpts])

    # Scale the targets with a separate scaler.
    scl = MinMaxScaler()
    ytr = scl.fit_transform(ytr)
    yts = scl.transform(yts)

    I_train = utils.compute_indicator_matrix_r(xptr)
    I_test = utils.compute_indicator_matrix_r(xpts)
    didi_tr = utils.didi_r(ytr, I_train)
    didi_ts = utils.didi_r(yts, I_test)

    tfco_model = TFCOFairReg(input_dim=xtr.shape[1], output_dim=1, I_train=I_train, didi_tr=didi_tr)

    minibatch_size = 200
    iterations_per_loop = 200
    loops = 80

    train_pred, test_pred = tfco_model._full_training(xtr, xts, ytr, minibatch_size, iterations_per_loop, loops)

    # Metrics of every iterate on the training set.
    train_errors = []
    train_violations = []
    train_didi = []
    train_r2 = []

    for p in train_pred:
        err, viol = _get_error_rate_and_constraints(p, ytr.reshape(-1, 1), didi_tr, I_train)
        r2 = r2_score(ytr, p)
        didi = utils.didi_r(p, I_train) / didi_tr

        train_errors.append(err)
        train_violations.append(viol)
        train_didi.append(didi)
        train_r2.append(r2)

    # Metrics of every iterate on the test set.
    test_errors = []
    test_violations = []
    test_didi = []
    test_r2 = []

    for p in test_pred:
        err, viol = _get_error_rate_and_constraints(p, yts.reshape(-1, 1), didi_ts, I_test)
        r2 = r2_score(yts, p)
        didi = utils.didi_r(p, I_test) / didi_ts

        test_errors.append(err)
        test_violations.append(viol)
        test_didi.append(didi)
        test_r2.append(r2)

    train_violations = np.array(train_violations)

    print("Train R2", train_r2[-1])
    print("Train DIDI", train_didi[-1])
    # Fixed: the test metrics used to be printed under "Train ..." labels.
    print("Test R2", test_r2[-1])
    print("Test DIDI", test_didi[-1])

    print("Improving using Best Iterate instead of Last Iterate.")
    #
    # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best
    # choice; the paper suggests a simple heuristic to choose the best iterate out of the ones found after each
    # epoch. The heuristic ranks each of the solutions by accuracy and by fairness separately, with respect to
    # the training data. Any solutions which satisfy the constraints are equally ranked top in terms of fairness.
    # Each solution thus has two ranks, and the chosen solution is the one with the smallest maximum of the two.
    #
    # This solution can be calculated using find_best_candidate_index given the list of training errors and
    # violations associated with each of the epochs.
    best_cand_index = tfco.find_best_candidate_index(train_errors, train_violations)

    print("Train R2", train_r2[best_cand_index])
    print("Train DIDI", train_didi[best_cand_index])
    print("Test R2", test_r2[best_cand_index])
    print("Test DIDI", test_didi[best_cand_index])

    print("m-stochastic solution.")
    # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic solution
    # down to one that is supported on at most (m+1) points, where m is the number of constraints, and is
    # guaranteed to be at least as good as the T-stochastic solution.
    #
    # This solution can be computed using find_best_candidate_distribution by passing in the training errors and
    # violations found at each epoch; it returns the weight of each constituent.
    cand_dist = tfco.find_best_candidate_distribution(train_errors, train_violations)
    print(cand_dist)

    # Expected metrics under the candidate distribution.
    m_stoch_train_r2 = np.dot(cand_dist, train_r2)
    m_stoch_train_didi = np.dot(cand_dist, train_didi)
    m_stoch_test_r2 = np.dot(cand_dist, test_r2)
    m_stoch_test_didi = np.dot(cand_dist, test_didi)

    print("Train R2", m_stoch_train_r2)
    print("Train DIDI", m_stoch_train_didi)
    print("Test R2", m_stoch_test_r2)
    print("Test DIDI", m_stoch_test_didi)