import pytest

# Assumed import: these examples exercise skactiveml's check_scalar helper.
from skactiveml.utils import check_scalar


def test_check_scalar_valid(x, target_type, min_val, max_val):
    """Test that check_scalar raises no error and emits no warning if valid
    inputs are provided."""
    # Record all warnings (pytest.warns(None) is deprecated as of pytest 7);
    # the test passes only if none were emitted.
    with pytest.warns(None) as record:
        check_scalar(x, "test_name", target_type=target_type,
                     min_val=min_val, max_val=max_val)
    assert len(record) == 0
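
In the test suite these arguments are supplied by a `@pytest.mark.parametrize`
decorator, which the snippet omits. A minimal sketch of such a
parametrization; the concrete cases are illustrative assumptions, not the
project's actual ones:

import pytest
from skactiveml.utils import check_scalar  # assumed import, as above


@pytest.mark.parametrize("x, target_type, min_val, max_val", [
    (3, int, 2, 5),         # integer inside the allowed range [2, 5]
    (2.5, float, None, 5),  # float checked against an upper bound only
])
def test_check_scalar_valid_cases(x, target_type, min_val, max_val):
    # Same idea as the test above: valid input must pass silently.
    with pytest.warns(None) as record:
        check_scalar(x, "test_name", target_type=target_type,
                     min_val=min_val, max_val=max_val)
    assert len(record) == 0
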
def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val,
                              err_msg):
    """Test that check_scalar raises the right error if an invalid input is
    given."""
    with pytest.raises(Exception) as raised_error:
        check_scalar(x, target_name, target_type=target_type,
                     min_val=min_val, max_val=max_val)
    # Both the message and the type of the raised exception must match.
    assert str(raised_error.value) == str(err_msg)
    assert type(raised_error.value) == type(err_msg)
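
Because `err_msg` is an expected exception instance, the test can compare both
the message and the exception type. An illustrative parametrization; the exact
message strings depend on the check_scalar implementation and are assumptions
here:

import pytest
from skactiveml.utils import check_scalar  # assumed import, as above


@pytest.mark.parametrize(
    "x, target_name, target_type, min_val, max_val, err_msg", [
        # Wrong type; the expected message text is an assumption.
        (1, "test_name", float, 2, 4,
         TypeError("test_name must have type 'float' but has type 'int'.")),
        # Out of range; the expected message text is an assumption.
        (1, "test_name", int, 2, 4,
         ValueError("test_name must be between 2 and 4 but is 1.")),
    ])
def test_check_scalar_invalid_cases(x, target_name, target_type, min_val,
                                    max_val, err_msg):
    with pytest.raises(Exception) as raised_error:
        check_scalar(x, target_name, target_type=target_type,
                     min_val=min_val, max_val=max_val)
    assert str(raised_error.value) == str(err_msg)
    assert type(raised_error.value) == type(err_msg)
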
Example #4
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X as training data and y as class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples)
            It contains the class labels of the training samples.
        sample_weight : array-like of shape (n_samples)
            It contains the weights of the training samples' class labels.
            It must have the same shape as y.

        Returns
        -------
        self : PWC
            The fitted PWC instance.
        """
        # Check input parameters.
        X, y, sample_weight = self._validate_data(X, y, sample_weight)

        # Check whether metric is available.
        if self.metric not in PWC.METRICS and not callable(self.metric):
            raise ValueError("The parameter 'metric' must be callable or "
                             "in {}".format(KERNEL_PARAMS.keys()))

        # Check number of neighbors which must be a positive integer.
        if self.n_neighbors is not None:
            check_scalar(self.n_neighbors,
                         name='n_neighbors',
                         min_val=1,
                         target_type=int)

        # Ensure that metric_dict is a Python dictionary.
        self.metric_dict_ = self.metric_dict if self.metric_dict is not None \
            else {}
        if not isinstance(self.metric_dict_, dict):
            raise TypeError("'metric_dict' must be a Python dictionary.")

        self._check_n_features(X, reset=True)

        # Store train samples.
        self.X_ = X.copy()

        # Convert labels to count vectors.
        if self.n_features_in_ is None:
            self.V_ = 0
        else:
            self.V_ = compute_vote_vectors(y=y,
                                           w=sample_weight,
                                           classes=np.arange(len(
                                               self.classes_)))

        return self
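
A minimal usage sketch for this fit method. The constructor arguments and the
toy data below are assumptions based on the skactiveml API (recent releases
rename PWC to ParzenWindowClassifier), not taken from the snippet itself:

import numpy as np
from skactiveml.classifier import PWC

# Assumed toy data: three labeled samples from two classes.
X_train = np.array([[0.0, 1.0], [1.0, 0.0], [0.4, 0.6]])
y_train = np.array([0, 1, 0])

pwc = PWC(classes=[0, 1])
pwc.fit(X_train, y_train)
print(pwc.predict(X_train))  # e.g., [0 1 0]
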
def test_check_scalar_valid(x, target_type, min_val, max_val):
    """Test that check_scalar raises no error and emits no warning if valid
    inputs are provided."""
    with pytest.warns(None) as record:
        # Keyword arguments avoid relying on the parameter order of
        # check_scalar, which differs between implementations.
        check_scalar(x, "test_name", target_type=target_type,
                     min_val=min_val, max_val=max_val)
    assert len(record) == 0
    def query(self,
              X_cand,
              clf,
              X=None,
              y=None,
              sample_weight=None,
              return_utilities=False,
              batch_size=1):
        """Ask the query strategy which sample in 'X_cand' to query.

        Parameters
        ----------
        X_cand : array-like, shape (n_candidate_samples, n_features)
            Candidate samples from which the strategy can select.
        clf : skactiveml.classifier.CMM
            GMM-based classifier to be trained.
        X : array-like, shape (n_samples, n_features), optional (default=None)
            Complete training data set.
        y : array-like, shape (n_samples), optional (default=None)
            Labels of the training data set.
        sample_weight : array-like, shape (n_samples), optional (default=None)
            Weights of the training samples in `X`.
        return_utilities : bool, optional (default=False)
            If True, also return the utilities based on the query strategy.
        batch_size : int, optional (default=1)
            The number of samples to be selected in one active learning (AL)
            cycle.

        Returns
        -------
        query_indices : numpy.ndarray, shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.
        utilities : numpy.ndarray, shape (batch_size, n_samples)
            The utilities of all candidate samples after each selected
            sample of the batch, e.g., `utilities[0]` indicates the utilities
            used for selecting the first sample (with index `query_indices[0]`)
            of the batch.
        """
        # Validate input.
        X_cand, return_utilities, batch_size, random_state = \
            self._validate_data(X_cand, return_utilities, batch_size,
                                self.random_state, reset=True)

        # Check input training data.
        X = check_array(X, ensure_min_samples=0)
        self._check_n_features(X, reset=False)
        y = column_or_1d(y)

        # Check classifier type.
        check_type(clf, 'clf', CMM)

        # Storage for query indices.
        query_indices = np.full(batch_size, fill_value=-1, dtype=int)

        # Check lmbda.
        lmbda = self.lmbda
        if lmbda is None:
            lmbda = np.min(((batch_size - 1) * 0.05, 0.5))
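            # Illustrative arithmetic (not part of the source): for
            # batch_size=1 this default is min(0 * 0.05, 0.5) = 0.0, i.e.,
            # the diversity term is switched off; from batch_size=11 upwards
            # it is capped at 0.5.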
        check_scalar(lmbda,
                     target_type=float,
                     name='lmbda',
                     min_val=0,
                     max_val=1)

        # Fit the classifier and get the probabilities.
        clf = fit_if_not_fitted(clf, X, y, sample_weight)
        P_cand = clf.predict_proba(X_cand)
        R_cand = clf.mixture_model_.predict_proba(X_cand)
        is_lbld = is_labeled(y, missing_label=clf.missing_label)
        if np.sum(is_lbld) >= 1:
            R_lbld = clf.mixture_model_.predict_proba(X[is_lbld])
        else:
            R_lbld = np.array([0])

        # Compute distance according to Eq. 9 in [1].
        P_cand_sorted = np.sort(P_cand, axis=1)
        distance_cand = np.log(
            (P_cand_sorted[:, -1] + 1.e-5) / (P_cand_sorted[:, -2] + 1.e-5))
        distance_cand = (distance_cand - np.min(distance_cand) + 1.e-5) / (
            np.max(distance_cand) - np.min(distance_cand) + 1.e-5)
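        # The 1.e-5 offsets guard against log(0) and division by zero and map
        # the normalized distances into the half-open interval (0, 1].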

        # Compute densities according to Eq. 10 in [1].
        density_cand = clf.mixture_model_.score_samples(X_cand)
        density_cand = (density_cand - np.min(density_cand) + 1.e-5) / (
            np.max(density_cand) - np.min(density_cand) + 1.e-5)

        # Compute distributions according to Eq. 11 in [1].
        R_lbld_sum = np.sum(R_lbld, axis=0, keepdims=True)
        R_sum = R_cand + R_lbld_sum
        R_mean = R_sum / (len(R_lbld) + 1)
        distribution_cand = clf.mixture_model_.weights_ - R_mean
        distribution_cand = np.maximum(np.zeros_like(distribution_cand),
                                       distribution_cand)
        distribution_cand = 1 - np.sum(distribution_cand, axis=1)

        # Compute rho according to Eq. 15 in [1].
        diff = np.sum(
            np.abs(clf.mixture_model_.weights_ - np.mean(R_lbld, axis=0)))
        rho = min(1, diff)

        # Compute e_dwus according to Eq. 13 in [1].
        e_dwus = np.mean((1 - P_cand_sorted[:, -1]) * density_cand)

        # Normalization such that alpha, beta, and rho sum up to one.
        alpha = (1 - rho) * e_dwus
        beta = 1 - rho - alpha

        # Compute utilities to select sample.
        utilities = np.empty((batch_size, len(X_cand)), dtype=float)
        utilities[0] = alpha * (1 - distance_cand) \
            + beta * density_cand + rho * distribution_cand
        query_indices[0] = rand_argmax(utilities[0], random_state)
        is_selected = np.zeros(len(X_cand), dtype=bool)
        is_selected[query_indices[0]] = True

        if batch_size > 1:
            # Compute e_us according to Eq. 14 in [1].
            e_us = np.mean(1 - P_cand_sorted[:, -1])

            # Normalize the coefficients alpha, beta, and rho such that they
            # sum up to one together with lmbda.
            rho = min(rho, 1 - lmbda)
            alpha = (1 - (rho + lmbda)) * (1 - e_us)
            beta = 1 - (rho + lmbda) - alpha

            for i in range(1, batch_size):
                # Update distributions according to Eq. 11 in [1].
                R_sum = R_cand + np.sum(
                    R_cand[is_selected], axis=0, keepdims=True) + R_lbld_sum
                R_mean = R_sum / (len(R_lbld) + len(query_indices) + 1)
                distribution_cand = clf.mixture_model_.weights_ - R_mean
                distribution_cand = np.maximum(
                    np.zeros_like(distribution_cand), distribution_cand)
                distribution_cand = 1 - np.sum(distribution_cand, axis=1)

                # Compute diversity according to Eq. 12 in [1].
                diversity_cand = -np.log(density_cand +
                                         np.sum(density_cand[is_selected])) / (
                                             len(query_indices) + 1)
                diversity_cand = (diversity_cand - np.min(diversity_cand)) / (
                    np.max(diversity_cand) - np.min(diversity_cand))

                # Compute utilities to select sample.
                utilities[i] = alpha * (1 - distance_cand) \
                    + beta * density_cand + lmbda * diversity_cand \
                    + rho * distribution_cand
                utilities[i, is_selected] = np.nan
                query_indices[i] = rand_argmax(utilities[i], random_state)
                is_selected[query_indices[i]] = True

        # Check whether utilities are to be returned.
        if return_utilities:
            return query_indices, utilities
        else:
            return query_indices
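
A minimal usage sketch for this query method. The strategy class name
(skactiveml.pool.FourDS), the CMM construction, and the toy data are
assumptions based on the skactiveml API, not taken from the snippet itself:

import numpy as np
from sklearn.mixture import GaussianMixture
from skactiveml.classifier import CMM
from skactiveml.pool import FourDS

# Assumed toy data: two labeled samples and two unlabeled candidates
# (np.nan marks a missing label).
X = np.array([[0.0, 0.0], [1.0, 1.0], [0.5, 0.4], [0.2, 0.8]])
y = np.array([0, 1, np.nan, np.nan])
X_cand = X[2:]

# GMM-based classifier, as required by the type check in the method above.
gmm = GaussianMixture(n_components=2, random_state=0).fit(X)
clf = CMM(mixture_model=gmm, classes=[0, 1], missing_label=np.nan)

qs = FourDS(random_state=0)
query_indices = qs.query(X_cand, clf, X=X, y=y)
print(query_indices)  # e.g., [0], the index of the selected candidate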