# Imports assumed by the methods below (a hypothetical module header;
# logistic_loss, get_fvs_and_labels, Instance, and TRIMLearner come from the
# surrounding library).
import math
from copy import deepcopy
from typing import List

import numpy as np


def _train_helper(self):
        """
        Helper function for self.train()
        """

        self.lnr.irl_selection = self.irl_selection
        self.lnr.train()
        self.loss = (logistic_loss(self.training_instances, self.lnr) /
                     sum(self.irl_selection))

        iteration = 0
        old_irl_selection = np.full(len(self.irl_selection), -1)
        while np.any(self.irl_selection != old_irl_selection):
            old_irl_selection = deepcopy(self.irl_selection)

            # Keep only the instances whose loss falls below the threshold
            self.irl_selection = (self.loss <
                                  self.loss_threshold).astype(int)

            # We must retain at least 50% of the instances for training,
            # since we assume at least 50% of the data is clean.
            if sum(self.irl_selection) < 0.5 * len(self.training_instances):
                raise ValueError('Fewer than 50% of the instances fall '
                                 'below the loss threshold.')

            if self.verbose:
                print('IRL Iteration:', iteration, '- number of instances:',
                      sum(self.irl_selection))

            self.lnr.irl_selection = self.irl_selection
            self.lnr.train()
            self.loss = (logistic_loss(self.training_instances, self.lnr) /
                         sum(self.irl_selection))

            iteration += 1
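

# A minimal, self-contained sketch of the fixed-point loop above. The
# retrain(selection) and per_instance_loss(model) callables are hypothetical
# stand-ins for self.lnr.train() and logistic_loss() in the method above.
def irl_fixed_point(n, retrain, per_instance_loss, loss_threshold):
    selection = np.ones(n, dtype=int)
    old_selection = np.full(n, -1)
    while np.any(selection != old_selection):
        old_selection = selection.copy()
        model = retrain(selection)
        loss = per_instance_loss(model) / selection.sum()
        # Keep the instances whose normalized loss is below the threshold
        selection = (loss < loss_threshold).astype(int)
        if selection.sum() < 0.5 * n:
            raise ValueError('Fewer than 50% of the instances retained.')
    return selection
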
    def _calculate_constants(self, instances: List[Instance]):
        """
        Calculates constants needed for the alternating minimization loop.
        :param instances: the list of Instances
        :return: half_n, n, orig_loss, feature_vectors, labels, cost
        """

        half_n = len(instances)
        n = half_n * 2  # size of new (doubled) input

        feature_vectors = []
        labels = []
        labels_flipped = []
        for inst in instances:
            feature_vectors.append(inst.get_feature_vector())
            labels.append(inst.get_label())
            labels_flipped.append(-1 * inst.get_label())
        feature_vectors = np.array(feature_vectors + feature_vectors)
        labels = np.array(labels + labels_flipped)

        fvs, _ = get_fvs_and_labels(instances)
        # Loss under the current learner on the original (unflipped) labels,
        # duplicated to line up with the doubled input.
        orig_loss = logistic_loss(fvs, self.learner, labels[:half_n])
        orig_loss = np.concatenate([orig_loss, orig_loss])

        # Zero cost for the original copies; self.cost applies to the
        # label-flipped copies.
        cost = np.concatenate([np.zeros(half_n), np.array(self.cost)])

        return half_n, n, orig_loss, feature_vectors, labels, cost
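
# Hedged illustration of the doubling performed above: each instance appears
# twice, once with its original label and once flipped, and only the flipped
# copies carry an attack cost. The numbers here are made up.
def _doubling_example():
    feats = np.array([[1.0, 2.0], [3.0, 4.0]])
    lbls = np.array([1, -1])
    doubled_feats = np.concatenate([feats, feats])    # shape (4, 2)
    doubled_lbls = np.concatenate([lbls, -lbls])      # [1, -1, -1, 1]
    cost = np.concatenate([np.zeros(2), np.array([0.5, 0.5])])
    return doubled_feats, doubled_lbls, cost
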
    def train(self):
        """
        Train on the set of training instances.
        """

        if len(self.training_instances) < 2:
            raise ValueError('Must have at least 2 instances to train.')

        fvs, labels = get_fvs_and_labels(self.training_instances)
        orig_fvs, orig_labels = deepcopy(fvs), deepcopy(labels)

        # Heuristic base cutoff, scaled down by 100 so the sweep starts small
        cutoff = 10 * math.log(len(self.training_instances)) / fvs.shape[1]
        cutoff /= 100

        base_cutoff = cutoff
        factor = 1
        max_cutoff = cutoff * 100

        if self.verbose:
            print('\nBase cutoff:', cutoff, '\nMax cutoff:', max_cutoff, '\n')

        best_loss = None
        best_w = None
        iteration = 0

        while cutoff < max_cutoff:
            result = self._remove_outliers(fvs, labels, cutoff)
            if not result:
                # Advance the cutoff before retrying; a bare continue here
                # would loop forever on the same cutoff.
                factor += 1
                cutoff = base_cutoff * factor
                continue

            fvs, labels = result

            # w is the label-weighted mean of the retained feature vectors
            self.w = np.full(fvs.shape[1], 0.0)
            for i, fv in enumerate(fvs):
                self.w += labels[i] * fv
            self.w /= fvs.shape[0]

            loss = logistic_loss(fvs, self, labels)
            loss = sum(loss) / fvs.shape[0]

            if self.verbose:
                print('\nORL Iteration:', iteration, '- factor:', factor,
                      '- cutoff:', cutoff, '- loss:', loss, '\n')

            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_w = deepcopy(self.w)

            factor += 1
            cutoff = base_cutoff * factor
            fvs, labels = deepcopy(orig_fvs), deepcopy(orig_labels)
            iteration += 1

        self.w = best_w
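

# Sketch of the sweep above in isolation, assuming a remove_outliers(fvs,
# labels, cutoff) stand-in for self._remove_outliers that returns the
# retained (fvs, labels) or None. The weight vector is the label-weighted
# mean of the retained features, as in the loop above.
def cutoff_sweep(fvs, labels, remove_outliers, base_cutoff, max_factor=100):
    best_loss, best_w = None, None
    for factor in range(1, max_factor + 1):
        result = remove_outliers(fvs, labels, base_cutoff * factor)
        if not result:
            continue
        kept_fvs, kept_labels = result
        w = (kept_labels[:, None] * kept_fvs).mean(axis=0)
        # Mean logistic loss of the linear scores under w
        margins = kept_labels * kept_fvs.dot(w)
        loss = np.mean(np.log1p(np.exp(-margins)))
        if best_loss is None or loss < best_loss:
            best_loss, best_w = loss, w.copy()
    return best_w
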
    def train(self):
        """
        Train on the set of training instances.
        """

        if len(self.training_instances) < 2:
            raise ValueError('Must have at least 2 instances to train.')

        step_size = 1 / len(self.training_instances)
        best_poison_percentage = 0.05
        best_lnr = None
        best_loss = None

        self.poison_percentage = 0.05
        self.n = int(
            (1 - self.poison_percentage) * len(self.training_instances))
        self.lnr.n = self.n

        while self.poison_percentage < 0.5:
            self.lnr.train()
            self.lnr.redo_problem_on_train = False

            # Trimmed loss: keep only the n smallest per-instance losses
            loss = logistic_loss(self.training_instances, self.lnr) / self.n
            loss.sort()
            loss = sum(loss[:self.n])

            if self.verbose:
                print('\nPoison Percentage:', self.poison_percentage,
                      '- loss:', loss, '\n')

            if best_loss is None or loss < best_loss:
                best_poison_percentage = self.poison_percentage
                best_loss = loss
                best_lnr = deepcopy(
                    (self.lnr.training_instances, self.lnr.n, self.lnr.lda,
                     self.lnr.verbose, self.lnr.w, self.lnr.b))

            self.poison_percentage += step_size
            self.n = int(
                (1 - self.poison_percentage) * len(self.training_instances))
            self.lnr.n = self.n

        self.poison_percentage = best_poison_percentage
        self.n = int(
            (1 - self.poison_percentage) * len(self.training_instances))
        self.lnr = TRIMLearner(best_lnr[0], best_lnr[1], best_lnr[2],
                               best_lnr[3])
        self.lnr.w, self.lnr.b = best_lnr[4], best_lnr[5]
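

# The sweep above scores each assumed poison percentage by the sum of the
# n smallest per-instance losses (the TRIM criterion). A minimal helper
# expressing just that criterion:
def trimmed_loss(per_instance_loss, n):
    """Sum of the n smallest per-instance losses."""
    return np.sort(per_instance_loss)[:n].sum()
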
    def train(self):
        """
        Train on the set of training instances.
        """

        if len(self.training_instances) < 2:
            raise ValueError('Must have at least 2 instances to train.')

        self.irl_selection = np.full(len(self.training_instances), 1)
        self.lnr.set_training_instances(self.training_instances)
        self.lnr.train()
        self.lnr.redo_problem_on_train = False
        self.loss = (logistic_loss(self.training_instances, self.lnr) /
                     sum(self.irl_selection))

        # np.sort returns a sorted copy; slicing with [:] would alias a view,
        # and sorting that in place would also reorder self.loss.
        sorted_loss = np.sort(self.loss)
        # Step size: mean of the positive gaps between consecutive losses
        gaps = sorted_loss[1:] - sorted_loss[:-1]
        step_size = np.mean(gaps[gaps > 0])
        max_loss_threshold = np.max(self.loss)
        best_loss_threshold = np.median(self.loss)
        best_lnr = None
        best_loss = None
        loss_list = []

        if self.verbose:
            print('Minimum loss threshold:', best_loss_threshold,
                  '\nMaximum loss threshold:', max_loss_threshold,
                  '\nStep size:', step_size)

        self.loss_threshold = best_loss_threshold

        while self.loss_threshold < max_loss_threshold:
            self.irl_selection = np.full(len(self.training_instances), 1)
            try:
                self._train_helper()
            except ValueError:
                # Raised by _train_helper when too few instances remain
                if self.verbose:
                    print('\nLoss threshold:', self.loss_threshold,
                          '- FAILURE\n')
                self.loss_threshold += step_size
                continue

            self.lnr.n = sum(self.irl_selection)
            # Total loss over the selected (irl_selection == 1) instances
            loss = np.dot(self.loss, self.irl_selection)

            if self.verbose:
                print('\nLoss threshold:', self.loss_threshold, '- loss:',
                      loss, '\n')

            if len(loss_list) > 1 and loss_list[-2] == loss_list[-1] == loss:
                print('\n---Exiting early: increasing the threshold no '
                      'longer changes the loss---\n')
                break
            loss_list.append(loss)

            if best_loss is None or loss < best_loss:
                best_loss_threshold = self.loss_threshold
                best_loss = loss
                best_lnr = deepcopy(
                    (self.lnr.training_instances, self.lnr.n, self.lnr.lda,
                     self.lnr.verbose, self.lnr.w, self.lnr.b,
                     self.lnr.irl_selection))

            self.loss_threshold += step_size

        self.loss_threshold = best_loss_threshold
        self.lnr = TRIMLearner(best_lnr[0], best_lnr[1], best_lnr[2],
                               best_lnr[3])
        self.lnr.w, self.lnr.b = best_lnr[4], best_lnr[5]
        self.lnr.irl_selection = best_lnr[6]
        self.irl_selection = best_lnr[6]
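

# Hedged sketch of the threshold schedule used above: start at the median
# per-instance loss and advance by the mean positive gap between consecutive
# sorted losses until the maximum loss is reached.
def threshold_schedule(per_instance_loss):
    sorted_loss = np.sort(per_instance_loss)
    gaps = sorted_loss[1:] - sorted_loss[:-1]
    step = np.mean(gaps[gaps > 0])
    threshold = np.median(per_instance_loss)
    while threshold < sorted_loss[-1]:
        yield threshold
        threshold += step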