Пример #1
0
def vote_entropy_sampling(
        committee: BaseCommittee,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break=True,
        **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Query strategy that ranks pool samples by the committee's vote entropy.

    Args:
        committee: The committee whose members' disagreement is measured.
        X: The pool of samples to query from.
        n_instances: How many samples to select.
        random_tie_break: When truthy (the default for this strategy), the
            selection is delegated to ``shuffled_argmax`` so that ties between
            equal utility scores are broken randomly; otherwise
            ``multi_argmax`` is used.
        **disagreement_measure_kwargs: Extra keyword arguments forwarded
            unchanged to ``vote_entropy``.

    Returns:
        A pair ``(query_idx, X[query_idx])``: the indices selected from X and
        the corresponding instances.
    """
    utility = vote_entropy(committee, X, **disagreement_measure_kwargs)

    # Pick the selector once; both helpers share the same call signature.
    select = shuffled_argmax if random_tie_break else multi_argmax
    chosen = select(utility, n_instances=n_instances)

    return chosen, X[chosen]
Пример #2
0
def entropy_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Entropy sampling query strategy: prefer the instances whose predicted class
    probabilities have the largest entropy.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: How many samples to select.
        random_tie_break: If True, selection goes through ``shuffled_argmax``
            so that ties between equal utility scores are broken randomly;
            otherwise ``multi_argmax`` is used.
        **uncertainty_measure_kwargs: Extra keyword arguments forwarded
            unchanged to ``classifier_entropy``.

    Returns:
        A pair ``(query_idx, X[query_idx])``: the indices selected from X and
        the corresponding instances.
    """
    scores = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)

    if random_tie_break:
        picked = shuffled_argmax(scores, n_instances=n_instances)
    else:
        picked = multi_argmax(scores, n_instances=n_instances)

    selected_instances = X[picked]
    return picked, selected_instances
Пример #3
0
def margin_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Margin sampling query strategy: prefer the instances where the gap between
    the most likely and the second most likely class is the smallest.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: How many samples to select.
        random_tie_break: If True, selection goes through ``shuffled_argmax``
            so that ties between equal utility scores are broken randomly;
            otherwise ``multi_argmax`` is used.
        **uncertainty_measure_kwargs: Extra keyword arguments forwarded
            unchanged to ``classifier_margin``.

    Returns:
        A pair ``(query_idx, X[query_idx])``: the indices selected from X and
        the corresponding instances.
    """
    # The selection helpers maximise, so the margin is negated to make the
    # smallest margin the best candidate.
    utility = -classifier_margin(classifier, X, **uncertainty_measure_kwargs)

    chooser = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = chooser(utility, n_instances=n_instances)

    return query_idx, X[query_idx]
Пример #4
0
def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
             n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Max Loss query strategy for SVM multilabel classification.

    Reference: Li et al., Multilabel SVM active learning for image
    classification (http://dx.doi.org/10.1109/ICIP.2004.1421535).

    Args:
        classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM
            model such as the ones from sklearn.svm. The function will execute for other models as well,
            but the mathematical calculations in Li et al. work only for SVM-s.
        X_pool: The pool of samples to query from.
        n_instances: How many samples to select; must not exceed ``len(X_pool)``.
        random_tie_break: If True, selection goes through ``shuffled_argmax``
            so that ties between equal utility scores are broken randomly;
            otherwise ``multi_argmax`` is used.

    Returns:
        A pair ``(query_idx, X_pool[query_idx])``: the indices selected from
        X_pool and the corresponding instances.

    Raises:
        AssertionError: If ``n_instances`` is larger than ``len(X_pool)``.
    """
    assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'

    # For every sample, the column with the highest predicted probability.
    class_probabilities = classifier.predict_proba(X_pool)
    top_classes = class_probabilities.argmax(axis=1)
    sample_loss = _SVM_loss(classifier, X_pool, most_certain_classes=top_classes)

    selector = shuffled_argmax if random_tie_break else multi_argmax
    picked = selector(sample_loss, n_instances)

    return picked, X_pool[picked]
Пример #5
0
def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
                       random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    SVM binary minimum multilabel active learning strategy. For details see the paper
    Klaus Brinker, On Active Learning in Multi-label Classification
    (https://link.springer.com/chapter/10.1007%2F3-540-31314-1_24)

    For every sample the absolute decision-function value of each binary SVM
    is computed; the sample's score is the smallest of these values, and the
    sample with the smallest score is queried.

    Args:
        classifier: The multilabel classifier for which the labels are to be queried. Must expose the
            fitted binary SVMs as ``classifier.estimator.estimators_``, each providing
            ``decision_function`` (such as the models from sklearn.svm).
        X_pool: The pool of samples to query from.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            so that ties between equal scores are broken randomly. Otherwise
            ``np.argmin`` is used.

    Returns:
        The index of the instance from X_pool chosen to be labelled;
        the instance from X_pool chosen to be labelled. Without random tie
        break the index is the scalar returned by ``np.argmin``; with it, the
        index is whatever ``shuffled_argmax`` returns.
    """

    # One row per sample, one column per binary SVM.
    decision_function = np.array([svm.decision_function(X_pool)
                                  for svm in classifier.estimator.estimators_]).T

    min_abs_dist = np.min(np.abs(decision_function), axis=1)

    if not random_tie_break:
        query_idx = np.argmin(min_abs_dist)
    else:
        # shuffled_argmax maximises, so the distances are negated to select
        # the *smallest* one, matching the np.argmin branch above.
        query_idx = shuffled_argmax(-min_abs_dist)

    return query_idx, X_pool[query_idx]
Пример #6
0
def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput,
              n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    AvgScore query strategy for multilabel classification.

    Reference: Esuli and Sebastiani, Active Learning Strategies for
    Multi-Label Text Classification
    (http://dx.doi.org/10.1007/978-3-642-00958-7_12).

    Each sample's utility is the mean over classes of
    ``predict_proba * (predict - 1/2)``.

    Args:
        classifier: The multilabel classifier for which the labels are to be queried.
        X_pool: The pool of samples to query from.
        n_instances: How many samples to select.
        random_tie_break: If True, selection goes through ``shuffled_argmax``
            so that ties between equal utility scores are broken randomly;
            otherwise ``multi_argmax`` is used.

    Returns:
        A pair ``(query_idx, X_pool[query_idx])``: the indices selected from
        X_pool and the corresponding instances.
    """
    confidence = classifier.predict_proba(X_pool)
    predictions = classifier.predict(X_pool)

    # Shift predictions by one half before weighting with the confidence,
    # then average across the class axis.
    per_sample_score = np.mean(confidence * (predictions - 0.5), axis=1)

    if random_tie_break:
        picked = shuffled_argmax(per_sample_score, n_instances)
    else:
        picked = multi_argmax(per_sample_score, n_instances)

    return picked, X_pool[picked]