示例#1
0
def entropy_sampling(classifier, X, n_instances=1, **uncertainty_measure_kwargs):
    """
    Entropy-based query strategy: picks the samples whose predicted class
    probabilities have the largest entropy.

    Parameters
    ----------
    classifier: sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
        Classifier whose predictions are used to score the pool.

    X: numpy.ndarray of shape (n_samples, n_features)
        Pool of candidate samples.

    n_instances: int
        How many samples to select.

    uncertainty_measure_kwargs: keyword arguments
        Forwarded unchanged to ``classifier_entropy``.

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        Indices into X of the samples chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The samples from X chosen to be labelled.
    """
    scores = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)
    selected = multi_argmax(scores, n_instances=n_instances)
    return selected, X[selected]
def mean_max_loss(
        classifier: OneVsRestClassifier,
        X_pool: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Mean Max Loss query strategy for multilabel classification with SVMs.

    Reference: Li et al., Multilabel SVM active learning for image classification
    (http://dx.doi.org/10.1109/ICIP.2004.1421535)

    Args:
        classifier: The multilabel classifier to query labels for. It should be an SVM model
            (e.g. from sklearn.svm); other models will run, but the derivation in Li et al.
            only holds for SVMs.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried; must not exceed ``len(X_pool)``.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.

    Returns:
        The indices of the instances from X_pool chosen to be labelled;
        the instances from X_pool chosen to be labelled.

    Raises:
        AssertionError: If ``n_instances`` is larger than ``len(X_pool)``.
    """

    assert len(
        X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'

    svm_loss = _SVM_loss(classifier, X_pool)
    # Choose the selection routine once instead of branching around the call.
    selector = shuffled_argmax if random_tie_break else multi_argmax
    chosen = selector(svm_loss, n_instances)

    return chosen, X_pool[chosen]
def avg_score(classifier: OneVsRestClassifier,
              X_pool: modALinput,
              n_instances: int = 1,
              random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    AvgScore query strategy for multilabel classification.

    Reference: Esuli and Sebastiani., Active Learning Strategies for Multi-Label
    Text Classification (http://dx.doi.org/10.1007/978-3-642-00958-7_12)

    Args:
        classifier: The multilabel classifier to query labels for.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.

    Returns:
        The indices of the instances from X_pool chosen to be labelled;
        the instances from X_pool chosen to be labelled.
    """

    confidence = classifier.predict_proba(X_pool)
    predicted = classifier.predict(X_pool)
    # Per-class score: confidence weighted by the prediction shifted by one half
    # (presumably mapping 0/1 labels to -0.5/+0.5 -- confirm), averaged per sample.
    mean_scores = np.mean(confidence * (predicted - 0.5), axis=1)

    selector = shuffled_argmax if random_tie_break else multi_argmax
    chosen = selector(mean_scores, n_instances)

    return chosen, X_pool[chosen]
示例#4
0
def max_disagreement_sampling(committee, X, n_instances=1, **disagreement_measure_kwargs):
    """
    Maximum disagreement query strategy, scored with ``KL_max_disagreement``.

    Parameters
    ----------
    committee: Committee object
        Committee whose disagreement is used to score the pool.

    X: numpy.ndarray of shape (n_samples, n_features)
        Pool of candidate samples.

    n_instances: int
        How many samples to select.

    disagreement_measure_kwargs:
        Forwarded unchanged to ``KL_max_disagreement``.

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        Indices into X of the samples chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The samples from X chosen to be labelled.
    """
    scores = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)
    selected = multi_argmax(scores, n_instances=n_instances)
    return selected, X[selected]
示例#5
0
def max_std_sampling(regressor, X, n_instances=1, **predict_kwargs):
    """
    Standard deviation query strategy for regressors: picks the samples with the
    largest predicted standard deviation.

    Parameters
    ----------
    regressor:
        Model whose ``predict(X, return_std=True, ...)`` yields a
        (prediction, standard deviation) pair.

    X: numpy.ndarray of shape (n_samples, n_features)
        Pool of candidate samples.

    n_instances: int
        How many samples to select.

    predict_kwargs:
        Forwarded unchanged to ``regressor.predict``.

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        Indices into X of the samples chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The samples from X chosen to be labelled.
    """
    # Only the standard deviation is needed; the predicted mean is discarded.
    predicted_std = regressor.predict(X, return_std=True, **predict_kwargs)[1]
    # Flatten to one value per sample before ranking.
    flat_std = predicted_std.reshape(len(X))
    selected = multi_argmax(flat_std, n_instances=n_instances)
    return selected, X[selected]
def entropy_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Entropy-based query strategy: picks the samples whose predicted class
    probabilities have the largest entropy.

    Args:
        classifier: The classifier to query labels for.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.
        **uncertainty_measure_kwargs: Forwarded unchanged to ``classifier_entropy``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    scores = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)

    selector = shuffled_argmax if random_tie_break else multi_argmax
    chosen = selector(scores, n_instances=n_instances)

    return chosen, X[chosen]
示例#7
0
def max_EI(optimizer, X, tradeoff=0, n_instances=1):
    """
    Maximum expected improvement (EI) query strategy: picks the samples with the
    highest expected improvement.

    :param optimizer:
        The BayesianEstimator object the utility is calculated for.
    :type optimizer:
        modAL.models.BayesianEstimator object

    :param X:
        The samples whose expected improvement is evaluated.
    :type X:
        numpy.ndarray of shape (n_samples, n_features)

    :param tradeoff:
        Tradeoff value forwarded to ``optimizer_EI``.
    :type tradeoff:
        float

    :param n_instances:
        Number of samples to be queried.
    :type n_instances:
        int

    :returns:
      - **query_idx** *(numpy.ndarray of shape (n_instances, ))* --
        The indices of the instances from X chosen to be labelled.
      - **X[query_idx]** *(numpy.ndarray of shape (n_instances, n_features))* --
        The instances from X chosen to be labelled.
    """
    expected_improvement = optimizer_EI(optimizer, X, tradeoff=tradeoff)
    selected = multi_argmax(expected_improvement, n_instances=n_instances)
    return selected, X[selected]
示例#8
0
def margin_sampling(classifier,
                    X,
                    n_instances=1,
                    **uncertainty_measure_kwargs):
    """
    Margin-based query strategy: picks the samples for which the gap between the
    most likely and the second most likely class is smallest.

    Parameters
    ----------
    classifier: sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
        Classifier whose predictions are used to score the pool.

    X: numpy.ndarray of shape (n_samples, n_features)
        Pool of candidate samples.

    n_instances: int
        How many samples to select.

    uncertainty_measure_kwargs: keyword arguments
        Forwarded unchanged to ``classifier_margin``.

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        Indices into X of the samples chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The samples from X chosen to be labelled.
    """
    margins = classifier_margin(classifier, X, **uncertainty_measure_kwargs)
    # The margin is negated so that the smallest margins rank highest.
    selected = multi_argmax(-margins, n_instances=n_instances)
    return selected, X[selected]
def uncertainty_sampling(classifier: BaseEstimator,
                         X: modALinput,
                         n_instances: int = 1,
                         random_tie_break: bool = False,
                         **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Uncertainty query strategy: selects the instances the classifier is least sure about.

    Args:
        classifier: The classifier to query labels for.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.
        **uncertainty_measure_kwargs: Forwarded unchanged to ``classifier_uncertainty``.

    Returns:
        The result of ``multi_argmax`` / ``shuffled_argmax`` applied to the uncertainty
        scores, i.e. the selection over X. X itself is not indexed or returned here.
    """
    scores = classifier_uncertainty(classifier, X,
                                    **uncertainty_measure_kwargs)

    selector = shuffled_argmax if random_tie_break else multi_argmax
    return selector(scores, n_instances=n_instances)
示例#10
0
def mean_max_loss(classifier: OneVsRestClassifier,
                  X_pool: modALinput,
                  n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
    """
    Mean Max Loss query strategy for multilabel classification with SVMs.

    Reference: Li et al., Multilabel SVM active learning for image classification
    (http://dx.doi.org/10.1109/ICIP.2004.1421535)

    Args:
        classifier: The multilabel classifier to query labels for. It should be an SVM model
            (e.g. from sklearn.svm); other models will run, but the derivation in Li et al.
            only holds for SVMs.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried; must not exceed ``len(X_pool)``.

    Returns:
        The indices of the instances from X_pool chosen to be labelled; the instances from
        X_pool chosen to be labelled.

    Raises:
        AssertionError: If ``n_instances`` is larger than ``len(X_pool)``.
    """

    assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'

    chosen = multi_argmax(_SVM_loss(classifier, X_pool), n_instances)
    return chosen, X_pool[chosen]
示例#11
0
def entropy_sampling(classifier,
                     X,
                     n_instances=1,
                     **uncertainty_measure_kwargs):
    """
    Entropy-based query strategy: picks the samples whose predicted class
    probabilities have the largest entropy.

    Parameters
    ----------
    classifier: sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
        Classifier whose predictions are used to score the pool.

    X: numpy.ndarray of shape (n_samples, n_features)
        Pool of candidate samples.

    n_instances: int
        How many samples to select.

    uncertainty_measure_kwargs: keyword arguments
        Forwarded unchanged to ``classifier_entropy``.

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        Indices into X of the samples chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The samples from X chosen to be labelled.
    """
    scores = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)
    selected = multi_argmax(scores, n_instances=n_instances)
    return selected, X[selected]
示例#12
0
def avg_score(classifier: OneVsRestClassifier,
              X_pool: modALinput,
              n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
    """
    AvgScore query strategy for multilabel classification.

    Reference: Esuli and Sebastiani., Active Learning Strategies for Multi-Label
    Text Classification (http://dx.doi.org/10.1007/978-3-642-00958-7_12)

    Args:
        classifier: The multilabel classifier to query labels for.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried.

    Returns:
        The indices of the instances from X_pool chosen to be labelled; the instances from
        X_pool chosen to be labelled.
    """

    confidence = classifier.predict_proba(X_pool)
    predicted = classifier.predict(X_pool)
    # Per-class score: confidence weighted by the prediction shifted by one half
    # (presumably mapping 0/1 labels to -0.5/+0.5 -- confirm), averaged per sample.
    mean_scores = np.mean(confidence * (predicted - 0.5), axis=1)
    chosen = multi_argmax(mean_scores, n_instances)

    return chosen, X_pool[chosen]
示例#13
0
def max_disagreement_sampling(committee: BaseCommittee,
                              X: modALinput,
                              n_instances: int = 1,
                              random_tie_break=False,
                              **disagreement_measure_kwargs) -> np.ndarray:
    """
    Maximum disagreement query strategy, scored with ``KL_max_disagreement``.

    Args:
        committee: The committee to query labels for.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.
        **disagreement_measure_kwargs: Forwarded unchanged to ``KL_max_disagreement``.

    Returns:
        The result of ``multi_argmax`` / ``shuffled_argmax`` applied to the disagreement
        scores, i.e. the selection over X. X itself is not indexed or returned here.
    """
    scores = KL_max_disagreement(committee, X,
                                 **disagreement_measure_kwargs)

    selector = shuffled_argmax if random_tie_break else multi_argmax
    return selector(scores, n_instances=n_instances)
示例#14
0
def max_std_sampling(regressor: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break=False,
                     **predict_kwargs) -> np.ndarray:
    """
    Standard deviation query strategy for regressors.

    Args:
        regressor: The regressor to query labels for; its ``predict`` is called with
            ``return_std=True``.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.
        **predict_kwargs: Forwarded unchanged to ``regressor.predict``.

    Returns:
        The result of ``multi_argmax`` / ``shuffled_argmax`` applied to the per-sample
        standard deviations, i.e. the selection over X. X itself is not indexed or
        returned here.
    """
    # Only the standard deviation is needed; the predicted mean is discarded.
    predicted_std = regressor.predict(X, return_std=True, **predict_kwargs)[1]
    # Flatten to one value per sample before ranking.
    flat_std = predicted_std.reshape(X.shape[0])

    selector = shuffled_argmax if random_tie_break else multi_argmax
    return selector(flat_std, n_instances=n_instances)
示例#15
0
def margin_sampling(classifier, X, n_instances=1, **uncertainty_measure_kwargs):
    """
    Margin-based query strategy: picks the samples for which the gap between the
    most likely and the second most likely class is smallest.

    Parameters
    ----------
    classifier: sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
        Classifier whose predictions are used to score the pool.

    X: numpy.ndarray of shape (n_samples, n_features)
        Pool of candidate samples.

    n_instances: int
        How many samples to select.

    uncertainty_measure_kwargs: keyword arguments
        Forwarded unchanged to ``classifier_margin``.

    Returns
    -------
    query_idx: numpy.ndarray of shape (n_instances, )
        Indices into X of the samples chosen to be labelled.

    X[query_idx]: numpy.ndarray of shape (n_instances, n_features)
        The samples from X chosen to be labelled.
    """
    margins = classifier_margin(classifier, X, **uncertainty_measure_kwargs)
    # The margin is negated so that the smallest margins rank highest.
    selected = multi_argmax(-margins, n_instances=n_instances)
    return selected, X[selected]
示例#16
0
def max_UCB(optimizer, X, beta=1, n_instances=1):
    """
    Maximum upper confidence bound (UCB) query strategy: picks the samples with
    the highest upper confidence bound.

    :param optimizer:
        The BayesianEstimator object the utility is calculated for.
    :type optimizer:
        modAL.models.BayesianEstimator object

    :param X:
        The samples whose upper confidence bound is evaluated.
    :type X:
        numpy.ndarray of shape (n_samples, n_features)

    :param beta:
        Beta value forwarded to ``optimizer_UCB``.
    :type beta:
        float

    :param n_instances:
        Number of samples to be queried.
    :type n_instances:
        int

    :returns:
      - **query_idx** *(numpy.ndarray of shape (n_instances, ))* --
        The indices of the instances from X chosen to be labelled.
      - **X[query_idx]** *(numpy.ndarray of shape (n_instances, n_features))* --
        The instances from X chosen to be labelled.
    """
    upper_bounds = optimizer_UCB(optimizer, X, beta=beta)
    selected = multi_argmax(upper_bounds, n_instances=n_instances)
    return selected, X[selected]
def margin_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Margin-based query strategy: picks the samples for which the gap between the
    most likely and the second most likely class is smallest.

    Args:
        classifier: The classifier to query labels for.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is made with ``shuffled_argmax`` instead of
            ``multi_argmax``, which can be used to break ties between equal scores randomly.
        **uncertainty_measure_kwargs: Forwarded unchanged to ``classifier_margin``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    margins = classifier_margin(classifier, X, **uncertainty_measure_kwargs)

    # The margin is negated so that the smallest margins rank highest.
    selector = shuffled_argmax if random_tie_break else multi_argmax
    chosen = selector(-margins, n_instances=n_instances)

    return chosen, X[chosen]
示例#18
0
def max_std_sampling(regressor, X, n_instances=1, **predict_kwargs):
    """
    Standard deviation query strategy for regressors: picks the samples with the
    largest predicted standard deviation.

    :param regressor:
        Model whose ``predict(X, return_std=True, ...)`` yields a
        (prediction, standard deviation) pair.

    :param X:
        The pool of samples to query from.
    :type X:
        numpy.ndarray of shape (n_samples, n_features)

    :param n_instances:
        Number of samples to be queried.
    :type n_instances:
        int

    :param predict_kwargs:
        Forwarded unchanged to ``regressor.predict``.
    :type predict_kwargs:
        keyword arguments

    :returns:
      - **query_idx** *(numpy.ndarray of shape (n_instances, ))* --
        The indices of the instances from X chosen to be labelled.

      - **X[query_idx]** *(numpy.ndarray of shape (n_instances, n_features))* --
        The instances from X chosen to be labelled.
    """
    # Only the standard deviation is needed; the predicted mean is discarded.
    predicted_std = regressor.predict(X, return_std=True, **predict_kwargs)[1]
    # Flatten to one value per sample before ranking.
    flat_std = predicted_std.reshape(len(X))
    selected = multi_argmax(flat_std, n_instances=n_instances)
    return selected, X[selected]
    def _query(self, X, pool_idx, n_instances=1, proba=None):
        """Select pool samples by uncertainty (one minus the highest class probability).

        ``proba`` is restricted to the rows listed in ``pool_idx`` before scoring,
        and the selection is mapped back through ``pool_idx``. Despite the ``None``
        default, ``proba`` must be supplied because it is indexed immediately.
        ``self.random_tie_break`` switches the selection from ``multi_argmax`` to
        ``shuffled_argmax``.

        Returns the selected entries of ``pool_idx`` and the matching rows of ``X``.
        """
        pool_proba = proba[pool_idx]
        scores = 1 - np.max(pool_proba, axis=1)

        selector = shuffled_argmax if self.random_tie_break else multi_argmax
        # Positions are relative to the pool; translate them to entries of pool_idx.
        picked = pool_idx[selector(scores, n_instances=n_instances)]

        return picked, X[picked]
示例#20
0
    def _query(self, X, pool_idx, n_instances=1, proba=None):
        """Select the pool samples with the highest value in column 1 of ``proba``.

        ``proba`` is restricted to the rows listed in ``pool_idx`` before scoring,
        and the selection is mapped back through ``pool_idx``. Despite the ``None``
        default, ``proba`` must be supplied because it is indexed immediately.
        ``self.random_tie_break`` switches the selection from ``multi_argmax`` to
        ``shuffled_argmax``.

        Returns the selected entries of ``pool_idx`` and the matching rows of ``X``.
        """
        # Column 1 is the score being maximised (presumably the positive class -- confirm).
        scores = proba[pool_idx][:, 1]

        selector = shuffled_argmax if self.random_tie_break else multi_argmax
        # Positions are relative to the pool; translate them to entries of pool_idx.
        picked = pool_idx[selector(scores, n_instances=n_instances)]

        return picked, X[picked]
def max_sampling(classifier: BaseEstimator,
                 X: modALinput,
                 n_instances: int = 1,
                 random_tie_break: bool = False,
                 pool_idx=None,
                 query_kwargs=None,
                 **kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum sampling query strategy.
    Selects the samples with the highest prediction probability.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, the selection is made with ``shuffled_argmax`` instead
        of ``multi_argmax``. This can be used to break the tie when the
        highest utility score is not unique.
    pool_idx:
        Indices of the rows of X that may be selected. Defaults to all rows.
    query_kwargs: dict, optional
        Shared state between query strategies. ``'pred_proba'`` is read
        as a cache of predictions and refreshed when its length does not
        match X; ``'current_queries'`` is updated with ``"max"`` for every
        selected index (created as an empty dict if absent). A fresh dict
        is used when omitted, so nothing leaks between calls.
    **kwargs:
        Keyword arguments to be passed to ``classifier.predict_proba``.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # A mutable default dict would be shared (and mutated) across calls.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)

    # First attempt to get the probabilities from the dictionary.
    proba = query_kwargs.get('pred_proba', [])
    if len(proba) != n_samples:
        try:
            proba = classifier.predict_proba(X, **kwargs)
        except NotFittedError:
            # Unfitted model: every sample gets the same score.
            proba = np.ones(shape=(n_samples, ))
        query_kwargs['pred_proba'] = proba

    pool_proba = proba[pool_idx]
    # The unfitted fallback is one-dimensional; only index column 1 when
    # a per-class probability matrix is actually available.
    scores = pool_proba[:, 1] if np.ndim(pool_proba) > 1 else pool_proba

    selector = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = selector(scores, n_instances=n_instances)

    # Record which strategy picked each sample, keyed by its index in X.
    current_queries = query_kwargs.setdefault('current_queries', {})
    for idx in query_idx:
        current_queries[pool_idx[idx]] = "max"

    return pool_idx[query_idx], X[pool_idx[query_idx]]
def query_selection(model, X, config, n_instances=1, al_epoch=None,
                    al_num_workers=5, **kwargs):
    """
        Query the ids of the most promising data
        :param model: segmentation model that is supposed to be trained by al loop
        :param X: Data for prediction, type list of Datasets e.g. DataGenerator objects
                The form as list is so that never too much data at once is in calculation
        :param config: config parameters; the keys read here are
                'information_estimation', 'reduce_segmentation',
                'result_rootdir', 'exp_name' and 'al_iterations'
        :param n_instances: number of instances to select
        :param al_epoch: index of the current active-learning iteration; 0 starts
                a fresh utilities record, any other value updates the stored one
        :param al_num_workers: number of workers passed to ``model.predict``
        :param kwargs: accepted for interface compatibility, not used here
        :return: indices of the queried (best) data
        Note: the ids returned are the indices of the position in the data
              provided, not the indices of the hdf5 file used in
              CustomActiveLearner!
    """
    # choose the type of utility function used for calculation of utility
    utility_functions = {'entropy': _proba_entropy,
                         'uncertainty': _proba_uncertainty,
                         'margin': _proba_margin}
    utility_function = utility_functions[config['information_estimation']]
    # choose how segmentation is condensed to a single utility value
    # (using utility function from above)
    reduction_functions = {'value_of_means': _value_of_means,
                           'mean_of_values': _mean_of_values}
    reduction_function = reduction_functions[config['reduce_segmentation']]

    # utility evaluation using the predictions of the model for the data;
    # the leading empty array keeps the result defined when X is empty
    utility_chunks = [np.array([])]
    for dataset in X:
        print('Calculating utilities of {0} batches'.format(len(dataset)))
        predictions = model.predict(dataset, workers=al_num_workers,
                                    use_multiprocessing=True)
        utility_chunks.append(reduction_function(predictions, utility_function))
    utilities = np.concatenate(utility_chunks)

    # selecting the best instances
    query_idx = multi_argmax(utilities, n_instances=n_instances)

    # save utility values of queried instances
    pickle_path = Path(config['result_rootdir'],
                       'al_utilities' + '_' + config['exp_name'] + '.pickle')
    if not os.path.exists(pickle_path):
        with open(pickle_path, 'w'):
            pass
    with open(pickle_path, 'rb+') as f:
        if al_epoch == 0:
            # rows of later iterations stay uninitialised until they are filled
            history = np.empty((config['al_iterations'], n_instances))
        else:
            # NOTE: pickle is only safe here because this file is written by
            # this function; never point it at untrusted data.
            history = pickle.load(f)
        history[al_epoch] = utilities[query_idx]
        # Rewind and truncate so the file holds exactly one pickle. Without
        # this, the dump is appended after the loaded object and the next
        # load returns the stale first record.
        f.seek(0)
        f.truncate()
        pickle.dump(history, f)

    return query_idx
示例#23
0
def uncertainty_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        pool_idx=None,
        query_kwargs=None,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Uncertainty sampling query strategy.
    Selects the least sure instances for labelling.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, the selection is made with ``shuffled_argmax`` instead
        of ``multi_argmax``. This can be used to break the tie when the
        highest utility score is not unique.
    pool_idx:
        Indices of the rows of X that may be selected. Defaults to all rows.
    query_kwargs: dict, optional
        Shared state between query strategies. Its ``'pred_proba'`` entry
        is reset to an empty list before the dict is handed to
        ``classifier_uncertainty``. A fresh dict is used when omitted, so
        nothing leaks between calls.
    **uncertainty_measure_kwargs:
        Keyword arguments to be passed for
        the uncertainty measure function.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # A mutable default dict would be shared (and mutated) across calls.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)
    query_kwargs['pred_proba'] = []

    uncertainty = classifier_uncertainty(classifier, X[pool_idx], query_kwargs,
                                         **uncertainty_measure_kwargs)

    selector = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = selector(uncertainty, n_instances=n_instances)

    # The scores were computed on X[pool_idx], so the selected positions are
    # relative to the pool and must be mapped back to row indices of X.
    selected = np.asarray(pool_idx)[query_idx]

    return selected, X[selected]
示例#24
0
def max_UCB(optimizer: BaseLearner,
            X: modALinput,
            beta: float = 1,
            n_instances: int = 1) -> np.ndarray:
    """
    Maximum upper confidence bound (UCB) query strategy.

    Args:
        optimizer: The :class:`~modAL.models.BayesianOptimizer` object the utility is calculated for.
        X: The samples whose upper confidence bound is evaluated.
        beta: Beta value forwarded to ``optimizer_UCB``.
        n_instances: Number of samples to be queried.

    Returns:
        The result of ``multi_argmax`` applied to the UCB scores, i.e. the selection over X.
        X itself is not indexed or returned here.
    """
    return multi_argmax(optimizer_UCB(optimizer, X, beta=beta),
                        n_instances=n_instances)
示例#25
0
def uncertainty_sampling(classifier: BaseEstimator, X: modALinput,
                         n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Uncertainty query strategy: selects the instances the classifier is least sure about.

    Args:
        classifier: The classifier to query labels for.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        **uncertainty_measure_kwargs: Forwarded unchanged to ``classifier_uncertainty``.

    Returns:
        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
    """
    scores = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
    chosen = multi_argmax(scores, n_instances=n_instances)
    return chosen, X[chosen]
示例#26
0
def max_EI(optimizer: BaseLearner,
           X: modALinput,
           tradeoff: float = 0,
           n_instances: int = 1) -> np.ndarray:
    """
    Maximum expected improvement (EI) query strategy.

    Args:
        optimizer: The :class:`~modAL.models.BayesianOptimizer` object the utility is calculated for.
        X: The samples whose expected improvement is evaluated.
        tradeoff: Tradeoff value forwarded to ``optimizer_EI``.
        n_instances: Number of samples to be queried.

    Returns:
        The result of ``multi_argmax`` applied to the EI scores, i.e. the selection over X.
        X itself is not indexed or returned here.
    """
    return multi_argmax(optimizer_EI(optimizer, X, tradeoff=tradeoff),
                        n_instances=n_instances)
示例#27
0
def margin_sampling(classifier: BaseEstimator, X: modALinput,
                    n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Margin-based query strategy: picks the samples for which the gap between the most likely and the second
    most likely class is smallest.

    Args:
        classifier: The classifier to query labels for.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        **uncertainty_measure_kwargs: Forwarded unchanged to ``classifier_margin``.

    Returns:
        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
    """
    margins = classifier_margin(classifier, X, **uncertainty_measure_kwargs)
    # The margin is negated so that the smallest margins rank highest.
    chosen = multi_argmax(-margins, n_instances=n_instances)
    return chosen, X[chosen]
示例#28
0
文件: acquisition.py 项目: yyht/modAL
def max_PI(optimizer: BaseLearner,
           X: modALinput,
           tradeoff: float = 0,
           n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum probability of improvement (PI) query strategy: picks the samples with the highest
    probability of improvement.

    Args:
        optimizer: The :class:`~modAL.models.BayesianOptimizer` object the utility is calculated for.
        X: The samples whose probability of improvement is evaluated.
        tradeoff: Tradeoff value forwarded to ``optimizer_PI``.
        n_instances: Number of samples to be queried.

    Returns:
        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
    """
    improvement_proba = optimizer_PI(optimizer, X, tradeoff=tradeoff)
    chosen = multi_argmax(improvement_proba, n_instances=n_instances)
    return chosen, X[chosen]
示例#29
0
def max_std_sampling(regressor: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Query the samples for which the regressor reports the largest standard deviation.

    Args:
        regressor: The regressor whose predictive standard deviation is used;
            its ``predict`` is called with ``return_std=True``.
        X: The pool of candidate samples.
        n_instances: How many samples to pick from the pool.
        **predict_kwargs: Extra keyword arguments forwarded to ``regressor.predict``.

    Returns:
        A pair ``(indices, samples)``: the indices selected from X and the
        corresponding entries ``X[indices]``.
    """
    # Only the second element of the (prediction, std) pair is needed here.
    _prediction, raw_std = regressor.predict(X, return_std=True, **predict_kwargs)

    # Flatten to one value per sample before ranking.
    n_samples = X.shape[0]
    per_sample_std = raw_std.reshape(n_samples)

    selected = multi_argmax(per_sample_std, n_instances=n_instances)
    return selected, X[selected]
示例#30
0
def vote_entropy_sampling(
        committee: BaseCommittee,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break=False,
        **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Vote entropy sampling that also logs how many samples have non-zero disagreement.

    Side effect: the number of entries in the vote entropy result that differ
    from zero is appended to ``committee.version_sizes`` on every call.

    Args:
        committee: The committee whose votes are evaluated; must expose a
            ``version_sizes`` list.
        X: The pool of candidate samples.
        n_instances: How many samples to pick from the pool.
        random_tie_break: If truthy, ``shuffled_argmax`` performs the selection
            instead of ``multi_argmax``.
        **disagreement_measure_kwargs: Extra keyword arguments forwarded to
            ``vote_entropy``.

    Returns:
        A pair ``(indices, samples)``: the indices selected from X and the
        corresponding entries ``X[indices]``.
    """
    scores = vote_entropy(committee, X, **disagreement_measure_kwargs)

    # Count the samples whose vote entropy is not exactly zero and record it.
    nonzero_count = sum(1 for score in scores if score != 0)
    committee.version_sizes.append(nonzero_count)

    select = shuffled_argmax if random_tie_break else multi_argmax
    chosen = select(scores, n_instances=n_instances)

    return chosen, X[chosen]
示例#31
0
def vote_entropy_sampling(
        committee: BaseCommittee,
        X: modALinput,
        n_instances: int = 1,
        **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Query the samples with the highest vote entropy among committee members.

    Args:
        committee: The committee whose votes are evaluated.
        X: The pool of candidate samples.
        n_instances: How many samples to pick from the pool.
        **disagreement_measure_kwargs: Extra keyword arguments forwarded to
            ``vote_entropy``.

    Returns:
        A pair ``(indices, samples)``: the indices selected from X and the
        corresponding entries ``X[indices]``.
    """
    selected = multi_argmax(
        vote_entropy(committee, X, **disagreement_measure_kwargs),
        n_instances=n_instances,
    )
    return selected, X[selected]
示例#32
0
def max_disagreement_sampling(committee,
                              X,
                              n_instances=1,
                              **disagreement_measure_kwargs):
    """
    Maximum disagreement sampling strategy.

    Scores every sample in X with ``KL_max_disagreement`` and queries the
    top-scoring ones.

    Args:
        committee: The Committee object for which the labels are to be queried.
        X: The pool of samples to query from, a numpy.ndarray of shape
            (n_samples, n_features).
        n_instances: Number of samples to be queried (int).
        **disagreement_measure_kwargs: Keyword arguments forwarded to the
            disagreement measure function.

    Returns:
        A pair ``(query_idx, X[query_idx])``: the indices of the instances
        chosen from X, shape (n_instances, ), and the chosen instances
        themselves, shape (n_instances, n_features).
    """
    scores = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)
    chosen = multi_argmax(scores, n_instances=n_instances)
    return chosen, X[chosen]