Exemplo n.º 1
0
def _get_error_rate_and_didi(preds, labels, didi, I):
    """
    Score a set of predictions: error rate plus fairness-constraint slack.

    Returns a pair ``(error, violations)`` where ``error`` is
    ``error_rate(preds, labels)`` and ``violations`` is a one-element list
    holding ``utils.didi_c(preds, I) - 0.2 * didi``, i.e. how far the DIDI of
    the predictions lies above 20% of the reference ``didi`` (the value is
    non-positive when the predictions are within that budget).
    """
    misclassification = error_rate(preds, labels)
    observed_didi = utils.didi_c(preds, I)
    allowed_didi = 0.2 * didi
    return misclassification, [observed_didi - allowed_didi]
Exemplo n.º 2
0
    def _training_generator(self,
                            x,
                            y,
                            minibatch_size,
                            num_iterations_per_loop=1,
                            num_loops=1):
        """
        Train on shuffled minibatches, yielding raw predictions after each loop.

        The rows of ``x`` are visited through a random permutation that is
        drawn once up front. A cursor walks over that permutation; when it
        reaches the end it wraps back to the start (the permutation is not
        re-drawn), so a single minibatch may join the tail and the head of
        the permutation.

        Each of the ``num_loops`` outer loops performs
        ``num_iterations_per_loop`` training steps, then evaluates the
        constraints and the prediction tensor on the full ``x`` and yields
        the prediction values.
        """
        num_rows = x.shape[0]
        # A minibatch can never be larger than the data set itself.
        minibatch_size = min(minibatch_size, num_rows)
        shuffled_rows = list(range(num_rows))
        random.shuffle(shuffled_rows)

        cursor = 0
        for _loop in xrange(num_loops):
            for _step in xrange(num_iterations_per_loop):
                # Collect positions in the permutation until the batch is full.
                positions = []
                while len(positions) < minibatch_size:
                    stop = cursor + minibatch_size - len(positions)
                    if stop < num_rows:
                        positions.extend(range(cursor, stop))
                        cursor = stop
                    else:
                        # Take what is left and wrap around to the beginning.
                        positions.extend(range(cursor, num_rows))
                        cursor = 0

                batch_rows = [shuffled_rows[pos] for pos in positions]
                self.session.run(
                    self.train_op,
                    feed_dict=self._feed_dict_helper(
                        x[batch_rows], y[batch_rows],
                        [I[batch_rows] for I in self.I_train.values()]))

            # Diagnostics on the full training set. The values below are
            # evaluated but not used further (the prints that reported them
            # are disabled); the evaluations themselves are kept.
            constraints = self.mp.constraints()
            slack = self.session.run(
                constraints,
                feed_dict=self._feed_dict_helper(
                    x, y, list(self.I_train.values())))

            p = self.session.run(self.predictions_tensor,
                                 feed_dict=self._feed_dict_helper(x))

            # Map the sign of the raw output onto {0, 0.5, 1}.
            preds = (1 + np.sign(p)) / 2
            perc_didi = utils.didi_c(preds, self.I_train) / self.didi_tr

            yield p
Exemplo n.º 3
0
    def cst_info(self, x, y):
        """
        Return the constraint cost (satisfaction) associated to the inputs.

        The data split is inferred from ``len(x)``: it is compared first with
        the length of ``self.I_train[0]`` and then with ``self.I_test[0]``.
        The result is a one-entry dict whose value is ``utils.didi_c(y, I)``
        divided by the reference DIDI of the matching split.

        Raises:
            ValueError: if ``len(x)`` matches neither indicator matrix.
        """
        n_points = len(x)

        # Pick the indicator matrices / reference DIDI of the matching split.
        if n_points == len(self.I_train[0]):
            split = (self.I_train, self.didi_tr)
        elif n_points == len(self.I_test[0]):
            split = (self.I_test, self.didi_ts)
        else:
            raise ValueError(
                "Cannot infer indicator matrix from input data. Input array has "
                "shape %d, with matrices having shape %d and %d" %
                (n_points, len(self.I_train[0]), len(self.I_test[0])))

        indicators, reference_didi = split
        return {'DIDI perc. index': utils.didi_c(y, indicators) / reference_didi}
Exemplo n.º 4
0
def cross_val():
    """
    Run 5-fold cross-validation of the TFCO fair classifier on the adult data.

    For every fold the non-protected features are min-max scaled (the scaler
    is fitted on the training part only), the protected features are appended
    and a ``TFCOFairCls`` model is trained. Accuracy and the DIDI ratio (DIDI
    of the predicted classes divided by the DIDI of the fold's labels) are
    printed and collected for three ways of choosing a solution among the
    training iterates: the last iterate, the best iterate and the
    m-stochastic mixture. The mean and standard deviation over the folds are
    printed at the end. Nothing is returned.
    """
    # New class implementation.
    xnp, xp, y = data_gen.load_adult()

    results = {
        'Last_iterate_train_acc': [],
        'Last_iterate_test_acc': [],
        'Last_iterate_train_ct': [],
        'Last_iterate_test_ct': [],
        'Best_iterate_train_acc': [],
        'Best_iterate_test_acc': [],
        'Best_iterate_train_ct': [],
        'Best_iterate_test_ct': [],
        'Stoch_iterate_train_acc': [],
        'Stoch_iterate_test_acc': [],
        'Stoch_iterate_train_ct': [],
        'Stoch_iterate_test_ct': [],
    }
    nfolds = 5
    fsize = int(np.ceil(len(xnp) / nfolds))
    # The full index set does not depend on the fold: build it once.
    idx = np.arange(len(xnp))
    for fidx in range(nfolds):
        print(f'\n### Processing fold {fidx}')

        # Separate index sets: the fold's slice is the test set, the rest is
        # the training set.
        tridx = np.hstack((idx[:fidx * fsize], idx[(fidx + 1) * fsize:]))
        tsidx = idx[fidx * fsize:(fidx + 1) * fsize]

        # Separate training and test data
        xptr = xp[tridx]
        ytr = y[tridx]
        xpts = xp[tsidx]
        yts = y[tsidx]

        # Standardize train set (the test set reuses the training statistics).
        scl = MinMaxScaler()
        xnptr = scl.fit_transform(xnp[tridx])
        xnpts = scl.transform(xnp[tsidx])

        # Add protected features.
        xtr = np.hstack([xnptr, xptr])
        xts = np.hstack([xnpts, xpts])

        # The targets get their own scaler, again fitted on the train part.
        scl = MinMaxScaler()
        ytr = scl.fit_transform(ytr)
        yts = scl.transform(yts)

        print("Computing indicator matrices.")
        I_train = utils.compute_indicator_matrix_c(xptr)
        I_test = utils.compute_indicator_matrix_c(xpts)
        didi_tr = utils.didi_c(ytr, I_train)
        didi_ts = utils.didi_c(yts, I_test)

        tfco_model = TFCOFairCls(input_dim=xtr.shape[1],
                                 output_dim=1,
                                 I_train=I_train,
                                 didi_tr=didi_tr)

        minibatch_size = 200
        iterations_per_loop = 200
        loops = 100

        train_pred, test_pred = tfco_model._full_training(
            xtr, xts, ytr, minibatch_size, iterations_per_loop, loops)

        # NOTE(review): the reference DIDI above is computed with
        # utils.didi_c, while the per-iterate DIDI below uses utils.didi_r —
        # confirm this mix is intended.
        train_errors = []
        train_violations = []
        train_didi = []
        train_acc = []

        for p in train_pred:
            # Map the sign of the raw model output onto a class label.
            p_class = (1 + np.sign(p)) / 2
            err, viol = _get_error_rate_and_didi(p, ytr.reshape(-1, 1),
                                                 didi_tr, I_train)
            acc = accuracy_score(ytr, p_class)
            didi = utils.didi_r(p_class, I_train) / didi_tr
            train_errors.append(err)
            train_violations.append(viol)
            train_didi.append(didi)
            train_acc.append(acc)

        test_errors = []
        test_violations = []
        test_didi = []
        test_acc = []

        for p in test_pred:
            p_class = (1 + np.sign(p)) / 2
            err, viol = _get_error_rate_and_didi(p, yts.reshape(-1, 1),
                                                 didi_ts, I_test)
            acc = accuracy_score(yts, p_class)
            didi = utils.didi_r(p_class, I_test) / didi_ts
            test_errors.append(err)
            test_violations.append(viol)
            test_didi.append(didi)
            test_acc.append(acc)

        train_violations = np.array(train_violations)
        print("Train Acc.", train_acc[-1])
        print("Train DIDI.", train_didi[-1])

        print("Test Acc.", test_acc[-1])
        print("Test DIDI.", test_didi[-1])

        print("Improving using Best Iterate instead of Last Iterate.")
        #
        # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best
        # choice; the paper suggests a simple heuristic to choose the best iterate out of the ones found after each epoch.
        # The heuristic proceeds by ranking each of the solutions based on accuracy and fairness separately with respect
        # to the training data. Any solutions which satisfy the constraints are equally ranked top in terms of fairness.
        # Each solution thus has two ranks. Then, the chosen solution is the one with the smallest maximum of the two
        # ranks.
        #
        # This solution can be calculated using find_best_candidate_index given the list of training errors and
        # violations associated with each of the epochs.

        best_cand_index = tfco.find_best_candidate_index(
            train_errors, train_violations)

        print("Train Acc.", train_acc[best_cand_index])
        print("Train DIDI.", train_didi[best_cand_index])

        print("Test Acc.", test_acc[best_cand_index])
        # Report the DIDI of the best iterate (not its accuracy a second time).
        print("Test DIDI.", test_didi[best_cand_index])

        print("m-stochastic solution.")
        # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic solution down
        # to one that is supported on at most (m+1) points, where m is the number of constraints, and is guaranteed to
        # be at least as good as the T-stochastic solution.
        #
        # This solution can be computed using find_best_candidate_distribution by passing in the training errors and
        # violations found at each epoch; it returns the weight of each constituent.

        cand_dist = tfco.find_best_candidate_distribution(
            train_errors, train_violations)
        print(cand_dist)

        # Expected metrics under the candidate distribution.
        m_stoch_train_acc = np.dot(cand_dist, train_acc)
        m_stoch_train_didi = np.dot(cand_dist, train_didi)
        m_stoch_test_acc = np.dot(cand_dist, test_acc)
        m_stoch_test_didi = np.dot(cand_dist, test_didi)

        print("Train Acc", m_stoch_train_acc)
        print("Train DIDI", m_stoch_train_didi)
        print("Test Acc", m_stoch_test_acc)
        print("Test DIDI", m_stoch_test_didi)

        results['Last_iterate_train_acc'].append(train_acc[-1])
        results['Last_iterate_test_acc'].append(test_acc[-1])
        results['Last_iterate_train_ct'].append(train_didi[-1])
        results['Last_iterate_test_ct'].append(test_didi[-1])

        results['Best_iterate_train_acc'].append(train_acc[best_cand_index])
        results['Best_iterate_test_acc'].append(test_acc[best_cand_index])
        results['Best_iterate_train_ct'].append(train_didi[best_cand_index])
        results['Best_iterate_test_ct'].append(test_didi[best_cand_index])

        results['Stoch_iterate_train_acc'].append(m_stoch_train_acc)
        results['Stoch_iterate_test_acc'].append(m_stoch_test_acc)
        results['Stoch_iterate_train_ct'].append(m_stoch_train_didi)
        results['Stoch_iterate_test_ct'].append(m_stoch_test_didi)

    # Summary over the folds: mean and standard deviation of every metric.
    for k, val in results.items():
        print(k, np.mean(val), np.std(val))
Exemplo n.º 5
0
    def adjust_targets(self, y, p, alpha, beta, use_prob=False):
        """
        Solve the optimization model that returns the adjusted targets
        satisfying the fairness constraint.

        A binary decision variable is created per sample. The sum over the
        protected groups of the absolute deviation terms built from
        ``self.I_train`` is constrained to stay below
        ``self.constraint_value``. If the current predictions are already
        feasible and ``beta >= 0`` the model minimizes the distance from
        ``y`` subject to ``p_loss <= beta``; otherwise it minimizes
        ``y_loss + (1 / alpha) * p_loss``.

        Args:
            y: target labels (expected to be 0/1); flattened internally.
            p: current predictions. With ``use_prob`` this is an array of
                per-class scores whose columns 0 and 1 are read as class
                probabilities; otherwise the predicted labels.
            alpha: weight parameter; the prediction loss is scaled by
                ``1 / alpha`` in the projection objective.
            beta: upper bound on the prediction loss in the feasible case.
            use_prob: use a log-likelihood prediction loss built from the
                probabilities instead of the label mismatch.

        Returns:
            A numpy integer array with the adjusted 0/1 target of each sample.
        """
        assert (alpha == 0 or p is not None)

        if use_prob:
            prob = p.copy()
            # Output clipping to avoid infinities in the logarithms below.
            prob = np.clip(prob, a_min=.01, a_max=.99)
            p = np.argmax(prob, axis=1)

        # Input adjusting.
        y = y.reshape(-1)
        p = p.reshape(-1)

        # Determine feasibility
        _feasible = (utils.didi_c(p, self.I_train) <= self.constraint_value)
        print(f'Current solution is feasible: {_feasible}')

        # Model declaration.
        mod = CPModel('Fairness Cls Problem')

        # Set a time limit (seconds).
        mod.parameters.timelimit = _CPLEX_TIME_LIMIT

        # Variable declaration: one binary variable per sample.
        n_points = len(y)
        idx_var = list(range(n_points))
        x = mod.binary_var_list(keys=idx_var, name='y')

        # Fairness constraint: instead of adding a penalization term in the objective function - as done by
        # Phebe et al - I impose the fairness term to stay below a certain threshold.
        abs_val = mod.continuous_var_list(keys=self.I_train.keys())
        for key, I in self.I_train.items():
            Np = np.sum(I)
            # Empty groups contribute no deviation term.
            if Np > 0:
                tmp = 2 * (mod.sum(x) / n_points -
                           mod.sum([I[j] * x[j] for j in idx_var]) / Np)

                # abs_val[key] >= |tmp|, linearized as two inequalities.
                mod.add_constraint(abs_val[key] >= tmp)
                mod.add_constraint(abs_val[key] >= -tmp)

        mod.add_constraint(mod.sum(abs_val) <= self.constraint_value,
                           ctname='fairness_cnst')

        # Objective Function.
        # Fraction of samples whose adjusted target differs from y.
        y_loss = (1.0 / n_points) * mod.sum(
            [y[i] * (1 - x[i]) + (1 - y[i]) * x[i] for i in idx_var])
        if use_prob:
            # Negative mean log-likelihood of the adjusted targets.
            p_loss = -(1.0 / n_points) * mod.sum([
                x[i] * np.log(prob[i][1]) + (1 - x[i]) * np.log(prob[i][0])
                for i in idx_var
            ])
        else:
            # Fraction of samples whose adjusted target differs from p.
            p_loss = (1.0 / n_points) * mod.sum(
                [p[i] * (1 - x[i]) + (1 - p[i]) * x[i] for i in idx_var])

        if _feasible and beta >= 0:
            # Search in a ball: beta bounds the prediction loss.
            mod.add(p_loss <= beta)
            # Minimize distance w.r.t. the targets
            mod.minimize(y_loss)
        else:
            # Project (with tie breaking).
            # NOTE(review): alpha == 0 passes the assertion above but raises
            # ZeroDivisionError here — confirm the intended handling.
            mod.minimize(y_loss + (1.0 / alpha) * p_loss)

        # Problem solving.
        mod.solve()

        # Check solution.
        self._check_solution(mod)

        # Obtain the adjusted targets. The solver reports floats that can sit
        # slightly off 0/1 (within its integrality tolerance), so round before
        # casting: plain int() would truncate e.g. 0.9999999 down to 0.
        y_opt = np.array(
            [int(round(x[i].solution_value)) for i in range(n_points)])

        print("Total flips: %d" % np.sum(np.abs(y_opt - p)))

        return y_opt
Exemplo n.º 6
0
    def validate(self):
        """
        Run the MACS process on every cross-validation fold.

        For each of the ``self.nfolds`` folds the method splits the training
        data with ``self.get_train_val_index``, scales it with
        ``self.scaler``, builds the master and learner selected by
        ``self.dataset``, ``self.mtype`` and ``self.ltype``, fits a
        ``macs.MACS`` instance and stores the logger results under
        ``self.results['fold_<k>']``.

        Raises:
            ValueError: if the dataset, the master type or the learner type
                is not recognized.
            NotImplementedError: for the 'cnd' learner on the adult dataset.
        """
        for ii in range(self.nfolds):

            # TRAIN TEST SPLIT (common to every dataset).
            train_idx, test_idx = self.get_train_val_index(ii)
            xnp_train, y_train = self.xnp_tr[train_idx], self.y_tr[train_idx]
            xnp_test, y_test = self.xnp_tr[test_idx], self.y_tr[test_idx]

            if self.dataset in BALANCE_DATASET:
                # STANDARDIZATION: fit on the train part, reuse on the test
                # part. The targets are left untouched here.
                x_train = self.scaler.fit_transform(xnp_train)
                x_test = self.scaler.transform(xnp_test)

                # Data shapes.
                input_dim = x_train.shape[1]
                output_dim = len(np.unique(y_train))

                # Build the master
                if self.mtype == 'balance':
                    self.master = BalancedCountsMaster(nclasses=output_dim)
                else:
                    raise ValueError(f'Unknown master type "{self.mtype}"')

                # Build the learner.
                if self.ltype == 'cvx':
                    self.learner = cls.BalanceMultiLogRegressor(self.alpha)

                elif self.ltype == 'sbrnn':
                    self.learner = cls.SBRNN(input_dim, output_dim, self.alpha)

                elif self.ltype == 'lbrf':
                    self.learner = cls.LowBiasRandomForestLearner(input_dim, output_dim)

                elif self.ltype == 'lr':
                    self.learner = cls.LogisticRegressionLearner(input_dim, output_dim)

                elif self.ltype == 'rf':
                    self.learner = cls.RandomForestLearner(input_dim, output_dim)

                elif self.ltype == 'nn':
                    self.learner = cls.NeuralNetworkLearner(input_dim, output_dim)

                else:
                    raise ValueError(f'Unknown learner type "{self.ltype}"')

            elif self.dataset in ('adult', 'crime'):
                xp_train, xp_test = self.xp_tr[train_idx], self.xp_tr[test_idx]

                # STANDARDIZATION of the non-protected features.
                xnp_train = self.scaler.fit_transform(xnp_train)
                xnp_test = self.scaler.transform(xnp_test)

                # Add protected features.
                x_train = np.hstack([xnp_train, xp_train])
                x_test = np.hstack([xnp_test, xp_test])

                # The same scaler object is re-fitted on the targets; the
                # features were already transformed above, so they are not
                # affected.
                y_train = self.scaler.fit_transform(y_train)
                y_test = self.scaler.transform(y_test)

                input_dim = x_train.shape[1]

                if self.dataset == 'adult':
                    print("Computing indicator matrices.")
                    I_train = utils.compute_indicator_matrix_c(xp_train)
                    I_test = utils.compute_indicator_matrix_c(xp_test)
                    didi_tr = utils.didi_c(y_train, I_train)
                    didi_ts = utils.didi_c(y_test, I_test)

                    # Build the master
                    if self.mtype == 'fairness':
                        self.master = FairnessClsMaster(I_train, I_test, didi_tr, didi_ts)
                    else:
                        raise ValueError(f'Unknown master type "{self.mtype}"')

                    output_dim = len(np.unique(y_train))

                    # Build the learner.
                    if self.ltype == 'cvx':
                        self.learner = cls.FairBinLogRegressor(self.alpha, I_train)

                    elif self.ltype == 'cnd':
                        # Kamiran and Calders method (not available).
                        raise NotImplementedError

                    elif self.ltype == 'tfco':
                        # The TFCO classifier uses a single output unit.
                        self.learner = tfco_cls.TFCOFairCls(input_dim, 1, I_train, didi_tr)

                    elif self.ltype == 'lbrf':
                        self.learner = cls.LowBiasRandomForestLearner(input_dim, output_dim)

                    elif self.ltype == 'lr':
                        self.learner = cls.LogisticRegressionLearner(input_dim, output_dim)

                    elif self.ltype == 'rf':
                        self.learner = cls.RandomForestLearner(input_dim, output_dim)

                    elif self.ltype == 'nn':
                        self.learner = cls.NeuralNetworkLearner(input_dim, output_dim)

                    else:
                        raise ValueError(f'Unknown learner type "{self.ltype}"')

                else:  # 'crime'
                    print("Computing indicator matrices.")
                    I_train = utils.compute_indicator_matrix_r(xp_train)
                    I_test = utils.compute_indicator_matrix_r(xp_test)
                    didi_tr = utils.didi_r(y_train, I_train)
                    didi_ts = utils.didi_r(y_test, I_test)

                    # Build the master
                    if self.mtype == 'fairness':
                        self.master = FairnessRegMaster(I_train, I_test, didi_tr, didi_ts)
                    else:
                        raise ValueError(f'Unknown master type "{self.mtype}"')

                    # Build the learner.
                    if self.ltype == 'cvx':
                        self.learner = rgs.FairRegressor(self.alpha, I_train)

                    elif self.ltype == 'tfco':
                        self.learner = tfco_reg.TFCOFairReg(input_dim, 1, I_train, didi_tr)

                    elif self.ltype == 'lbrf':
                        self.learner = rgs.LowBiasRandomForestLearner()

                    elif self.ltype == 'lr':
                        self.learner = rgs.LRegressor()

                    elif self.ltype == 'gb':
                        self.learner = rgs.GBTree()

                    elif self.ltype == 'nn':
                        self.learner = rgs.Net((input_dim,), 1)

                    else:
                        raise ValueError(f'Unknown learner type "{self.ltype}"')

            else:
                # Without this guard an unrecognized dataset would silently
                # reuse the master/learner of a previous fold or fail later
                # with an unrelated error.
                raise ValueError(f'Unknown dataset "{self.dataset}"')

            # Loggers
            params = dict(fold=ii, alpha=self.alpha, beta=self.beta, init=self.initial_step, use_prob=self.use_prob)
            wb_log = WandBLogger(self.learner, self.master, x_train, y_train, x_test, y_test, params, f'{self.dataset}')
            cst_log = CustomLogger(self.learner, self.master, x_train, y_train, nfold=ii, x_test=x_test, y_test=y_test)
            self.logger = MultiLogger([cst_log, wb_log])
            # Start the MACS process
            mp = macs.MACS(self.learner, self.master, self.logger)
            mp.fit(x_train, y_train, self.iterations, self.alpha, self.beta, self.initial_step, use_prob=self.use_prob)
            self.results[f'fold_{ii}'] = self.logger.results