Example #1
    def _add_training_data(self, X: modALinput, y: modALinput) -> None:
        """
        Adds the new data and label to the known data, but does not retrain the model.

        Args:
            X: The new samples for which the labels are supplied by the expert.
            y: Labels corresponding to the new instances in X.

        Note:
            If the classifier has been fitted, the features in X have to agree with the training samples which the
            classifier has seen.
        """
        check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
                  force_all_finite=self.force_all_finite)

        if self.X_training is None:
            self.X_training = X
            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
            self.y_training = y
        else:
            try:
                self.X_training = data_vstack((self.X_training, X))
                self.Xt_training = data_vstack((
                    self.Xt_training,
                    self.transform_without_estimating(X)
                )) if self.on_transformed else None
                self.y_training = data_vstack((self.y_training, y))
            except ValueError:
                raise ValueError('the dimensions of the new training data and label must '
                                 'agree with the training data and labels provided so far')
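
In practice `_add_training_data` is not called directly: `ActiveLearner.teach()` uses it to stack the new samples onto `X_training`/`y_training` before refitting the estimator. A minimal usage sketch of that flow; the synthetic data and the RandomForestClassifier estimator are illustrative assumptions, not part of the snippet above:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner

# seed the learner with a small labeled set; it is stored as X_training/y_training
X_initial = np.random.rand(10, 4)
y_initial = np.random.randint(0, 2, size=10)
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_initial, y_training=y_initial)

# teach() appends the new samples via _add_training_data and then refits the estimator
X_new = np.random.rand(3, 4)
y_new = np.random.randint(0, 2, size=3)
learner.teach(X_new, y_new)
print(learner.X_training.shape)  # (13, 4)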
Example #2
def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
                 unlabeled: modALinput,
                 uncertainty_scores: np.ndarray,
                 n_instances: int,
                 metric: Union[str, Callable],
                 n_jobs: Union[int, None]) -> np.ndarray:
    """
    Query our top :n_instances: to request for labeling.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    Args:
        classifier: One of modAL's supported active learning models.
        unlabeled: Set of records to be considered for our active learning model.
        uncertainty_scores: The uncertainty scores computed by the classifier for each record in the unlabeled set.
        n_instances: Limit on the number of records to query from our unlabeled set.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        The indices of the top n_instances ranked unlabeled samples.
    """
    # Make a local copy of our classifier's training data.
    # Define our record container and record the best cold start instance in the case of cold start.

    # transform unlabeled data if needed
    if classifier.on_transformed:
        unlabeled = classifier.transform_without_estimating(unlabeled)

    if classifier.X_training is None:
        best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
        instance_index_ranking = [best_coldstart_instance_index]
    elif classifier.X_training.shape[0] > 0:
        labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
        instance_index_ranking = []
    
    # The maximum number of records to sample.
    ceiling = np.minimum(unlabeled.shape[0], n_instances) - len(instance_index_ranking)

    # mask for unlabeled initialized as transparent
    mask = np.ones(unlabeled.shape[0], dtype=bool)

    for _ in range(ceiling):

        # Receive the instance and corresponding index from our unlabeled copy that scores highest.
        instance_index, instance, mask = select_instance(X_training=labeled, X_pool=unlabeled,
                                                         X_uncertainty=uncertainty_scores, mask=mask,
                                                         metric=metric, n_jobs=n_jobs)

        # Add the instance we've considered for labeling to our labeled set. Although we don't
        # know its label yet, we want further iterations to consider the newly added instance so
        # that we don't query the same instance redundantly.
        labeled = data_vstack((labeled, instance))

        # Finally, append our instance's index to the bottom of our ranking.
        instance_index_ranking.append(instance_index)

    # Return numpy array, not a list.
    return np.array(instance_index_ranking)
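
`ranked_batch` is normally reached through modAL's `uncertainty_batch_sampling` query strategy rather than called directly. A minimal sketch of that route, assuming a LogisticRegression estimator and synthetic data for illustration:

from functools import partial

import numpy as np
from sklearn.linear_model import LogisticRegression
from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling

# uncertainty_batch_sampling computes the uncertainty scores and delegates the ranking to ranked_batch
preset_batch = partial(uncertainty_batch_sampling, n_instances=5)
learner = ActiveLearner(estimator=LogisticRegression(),
                        query_strategy=preset_batch,
                        X_training=np.random.rand(20, 3),
                        y_training=np.random.randint(0, 2, size=20))

X_pool = np.random.rand(100, 3)
query_idx, query_samples = learner.query(X_pool)  # indices come from ranked_batch's ranking
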
def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                             p_subsample: float = 1.0, n_instances: int = 1,
                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error
            is to be estimated.
        X: The samples.
        loss: The loss function to be used. Can be 'binary' or 'log'.
        p_subsample: Probability of keeping a sample from the pool when
            calculating expected error. Significantly improves runtime
            for large sample pools.
        n_instances: The number of instances to be sampled.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.


    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """

    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    cloned_estimator = clone(learner.estimator)

    for x_idx, x in enumerate(X):
        # subsample the data if needed
        if np.random.rand() <= p_subsample:
            # estimate the expected error
            for y_idx, y in enumerate(possible_labels):
                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X)
                # compute the refitted loss without overwriting the `loss` argument,
                # and compare strings with == rather than identity
                if loss == 'binary':
                    refitted_loss = _proba_uncertainty(refitted_proba)
                elif loss == 'log':
                    refitted_loss = _proba_entropy(refitted_proba)

                expected_error[x_idx] += np.sum(refitted_loss)*X_proba[x_idx, y_idx]

        else:
            expected_error[x_idx] = np.inf

    if not random_tie_break:
        query_idx = multi_argmax(expected_error, n_instances)
    else:
        query_idx = shuffled_argmax(expected_error, n_instances)

    return query_idx, X[query_idx]
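
A sketch of how `expected_error_reduction` can be plugged in as a query strategy. Because the strategy refits the estimator once per candidate sample and per class, `p_subsample` is set below 1.0 here to keep the cost down; the synthetic pool and the LogisticRegression estimator are assumptions for illustration:

import numpy as np
from sklearn.linear_model import LogisticRegression
from modAL.models import ActiveLearner
from modAL.expected_error import expected_error_reduction

learner = ActiveLearner(estimator=LogisticRegression(),
                        query_strategy=expected_error_reduction,
                        X_training=np.random.rand(20, 2),
                        y_training=np.random.randint(0, 2, size=20))

X_pool = np.random.rand(200, 2)
# keyword arguments passed to query() are forwarded to the query strategy
query_idx, query_samples = learner.query(X_pool, p_subsample=0.3, n_instances=2)
learner.teach(query_samples, np.random.randint(0, 2, size=2))  # labels would come from an oracle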