def test_check_scalar_valid(x, target_type, min_val, max_val):
    """Test that check_scalar returns no error/warning if valid inputs are
    provided.

    Parameters
    ----------
    x : object
        Scalar value handed to `check_scalar`.
    target_type : type or tuple
        Acceptable type(s) for `x`.
    min_val : float or int or None
        Lower bound forwarded to `check_scalar`.
    max_val : float or int or None
        Upper bound forwarded to `check_scalar`.
    """
    # Local import keeps this block self-contained.
    import warnings

    # `pytest.warns(None)` is deprecated since pytest 7 and rejected by
    # pytest 8, so the warnings are recorded with the standard library.
    with warnings.catch_warnings(record=True) as record:
        # Make sure every warning is recorded, even ones that were already
        # triggered once elsewhere in the test session.
        warnings.simplefilter("always")
        check_scalar(x, "test_name", target_type=target_type,
                     min_val=min_val, max_val=max_val)
    assert not record
def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val,
                              err_msg):
    """Check that `check_scalar` raises the expected error for a bad input.

    The raised exception has to agree with `err_msg` in two respects: its
    message (compared via `str`) and its exact type.
    """
    bounds = {"min_val": min_val, "max_val": max_val}
    with pytest.raises(Exception) as exc_info:
        check_scalar(x, target_name, target_type=target_type, **bounds)

    raised = exc_info.value
    # Message first, then the exact exception class (no subclass match).
    assert str(raised) == str(err_msg)
    assert type(raised) is type(err_msg)
def fit(self, X, y, sample_weight=None):
    """Fit the classifier on the training samples `X` and labels `y`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix of the training samples.
    y : array-like of shape (n_samples)
        Class labels of the training samples.
    sample_weight : array-like of shape (n_samples), optional (default=None)
        Weights of the training samples' class labels. It must have the
        same shape as `y`.

    Returns
    -------
    self : PWC
        The PWC fitted on the training data.

    Raises
    ------
    ValueError
        If `metric` is neither callable nor contained in `PWC.METRICS`.
    TypeError
        If `metric_dict` is neither `None` nor a Python dictionary.
    """
    # Validate the training data before touching any hyperparameter.
    X, y, sample_weight = self._validate_data(X, y, sample_weight)

    # The metric has to be a known metric name or a callable.
    metric = self.metric
    if not (metric in PWC.METRICS or callable(metric)):
        raise ValueError(
            "The parameter 'metric' must be callable or in {}".format(
                KERNEL_PARAMS.keys()
            )
        )

    # If given, the number of neighbors must be a positive integer.
    n_neighbors = self.n_neighbors
    if n_neighbors is not None:
        check_scalar(n_neighbors, name='n_neighbors', min_val=1,
                     target_type=int)

    # Fall back to an empty dictionary when no metric arguments are given;
    # the fitted attribute is set before its type is checked.
    if self.metric_dict is None:
        self.metric_dict_ = {}
    else:
        self.metric_dict_ = self.metric_dict
    if not isinstance(self.metric_dict_, dict):
        raise TypeError("'metric_dict' must be a Python dictionary.")

    self._check_n_features(X, reset=True)

    # Keep a private copy of the training samples.
    self.X_ = X.copy()

    # Turn the labels into (weighted) vote vectors, one column per class.
    if self.n_features_in_ is not None:
        class_indices = np.arange(len(self.classes_))
        self.V_ = compute_vote_vectors(y=y, w=sample_weight,
                                       classes=class_indices)
    else:
        self.V_ = 0

    return self
def test_check_scalar_valid(x, target_type, min_val, max_val):
    """Test that check_scalar returns no error/warning if valid inputs are
    provided.

    Parameters
    ----------
    x : object
        Scalar value handed to `check_scalar`.
    target_type : type or tuple
        Acceptable type(s) for `x`.
    min_val : float or int or None
        Lower bound forwarded to `check_scalar`.
    max_val : float or int or None
        Upper bound forwarded to `check_scalar`.
    """
    # Local import keeps this block self-contained.
    import warnings

    # `pytest.warns(None)` is deprecated since pytest 7 and rejected by
    # pytest 8, so the warnings are recorded with the standard library.
    with warnings.catch_warnings(record=True) as record:
        # Make sure every warning is recorded, even ones that were already
        # triggered once elsewhere in the test session.
        warnings.simplefilter("always")
        # Pass the optional arguments by keyword: positional `min_val` /
        # `max_val` break where `check_scalar` declares them keyword-only.
        check_scalar(x, "test_name", target_type=target_type,
                     min_val=min_val, max_val=max_val)
    assert not record
def query(self, X_cand, clf, X=None, y=None, sample_weight=None,
          return_utilities=False, batch_size=1):
    """Ask the query strategy which samples in `X_cand` to query.

    The utility of each candidate is a weighted sum of a distance term, a
    density term, a class-distribution term and, from the second sample of
    a batch on, a diversity term. The weights `alpha`, `beta`, `rho` and
    `lmbda` are computed inside this method; the equation numbers in the
    comments refer to reference [1], which is not part of this block.

    Parameters
    ----------
    X_cand : array-like, shape (n_candidate_samples, n_features)
        Candidate samples from which the strategy can select.
    clf : skactiveml.classifier.CMM
        GMM-based classifier to be trained. Its type is checked against
        `CMM`, and it is fitted via `fit_if_not_fitted`.
    X : array-like, shape (n_samples, n_features), optional (default=None)
        Complete training data set.
        NOTE(review): `X` is passed to `check_array` unconditionally, so
        the default `None` may not actually be supported -- confirm.
    y : array-like, shape (n_samples), optional (default=None)
        Labels of the training data set.
        NOTE(review): `y` is passed to `column_or_1d` unconditionally, so
        the default `None` may not actually be supported -- confirm.
    sample_weight : array-like, shape (n_samples), optional (default=None)
        Weights of training samples in `X`. Only forwarded to
        `fit_if_not_fitted`; it is not validated in this method.
    return_utilities : bool, optional (default=False)
        If true, also return the utilities based on the query strategy.
    batch_size : int, optional (default=1)
        The number of samples to be selected in one AL cycle.

    Returns
    -------
    query_indices : numpy.ndarray, shape (batch_size)
        The query_indices indicate for which candidate sample a label is
        to be queried, e.g., `query_indices[0]` indicates the first
        selected sample.
    utilities : numpy.ndarray, shape (batch_size, n_candidate_samples)
        The utilities of all candidate samples after each selected sample
        of the batch, e.g., `utilities[0]` indicates the utilities used
        for selecting the first sample (with index `query_indices[0]`) of
        the batch. Candidates already selected in the batch are set to
        `np.nan` in the later rows. Only returned if `return_utilities`
        is true.
    """
    # Validate input.
    X_cand, return_utilities, batch_size, random_state = \
        self._validate_data(X_cand, return_utilities, batch_size,
                            self.random_state, reset=True)

    # Check input training data.
    X = check_array(X, ensure_min_samples=0)
    self._check_n_features(X, reset=False)
    y = column_or_1d(y)

    # Check classifier type.
    check_type(clf, 'clf', CMM)

    # Storage for query indices; -1 marks slots that are not yet filled.
    query_indices = np.full(batch_size, fill_value=-1, dtype=int)

    # Check lmbda. If it is not set, it is derived from the batch size:
    # 0.05 per additional sample of the batch, capped at 0.5.
    lmbda = self.lmbda
    if lmbda is None:
        lmbda = np.min(((batch_size - 1) * 0.05, 0.5))
    check_scalar(lmbda, target_type=float, name='lmbda', min_val=0,
                 max_val=1)

    # Fit the classifier and get the probabilities: class probabilities
    # from the classifier (`P_*`) and probabilities from its mixture model
    # (`R_*`).
    clf = fit_if_not_fitted(clf, X, y, sample_weight)
    P_cand = clf.predict_proba(X_cand)
    R_cand = clf.mixture_model_.predict_proba(X_cand)
    is_lbld = is_labeled(y, missing_label=clf.missing_label)
    if np.sum(is_lbld) >= 1:
        R_lbld = clf.mixture_model_.predict_proba(X[is_lbld])
    else:
        # Without any labeled sample, a single zero is used as placeholder.
        R_lbld = np.array([0])

    # Compute distance according to Eq. 9 in [1]: log-ratio of the largest
    # to the second largest class probability, rescaled via min/max. The
    # constant 1.e-5 avoids divisions by zero.
    P_cand_sorted = np.sort(P_cand, axis=1)
    distance_cand = np.log(
        (P_cand_sorted[:, -1] + 1.e-5) / (P_cand_sorted[:, -2] + 1.e-5))
    distance_cand = (distance_cand - np.min(distance_cand) + 1.e-5) / (
            np.max(distance_cand) - np.min(distance_cand) + 1.e-5)

    # Compute densities according to Eq. 10 in [1], rescaled via min/max
    # in the same way as the distances.
    density_cand = clf.mixture_model_.score_samples(X_cand)
    density_cand = (density_cand - np.min(density_cand) + 1.e-5) / (
            np.max(density_cand) - np.min(density_cand) + 1.e-5)

    # Compute distributions according to Eq. 11 in [1]: for each candidate,
    # one minus the summed positive part of the difference between the
    # mixture weights and the mean over labeled samples plus the candidate.
    R_lbld_sum = np.sum(R_lbld, axis=0, keepdims=True)
    R_sum = R_cand + R_lbld_sum
    R_mean = R_sum / (len(R_lbld) + 1)
    distribution_cand = clf.mixture_model_.weights_ - R_mean
    distribution_cand = np.maximum(np.zeros_like(distribution_cand),
                                   distribution_cand)
    distribution_cand = 1 - np.sum(distribution_cand, axis=1)

    # Compute rho according to Eq. 15 in [1]; it is capped at one.
    diff = np.sum(
        np.abs(clf.mixture_model_.weights_ - np.mean(R_lbld, axis=0)))
    rho = min(1, diff)

    # Compute e_dwus according to Eq. 13 in [1].
    e_dwus = np.mean((1 - P_cand_sorted[:, -1]) * density_cand)

    # Normalization such that alpha, beta, and rho sum up to one.
    alpha = (1 - rho) * e_dwus
    beta = 1 - rho - alpha

    # Compute utilities to select sample. The first sample of the batch is
    # selected without the diversity term.
    utilities = np.empty((batch_size, len(X_cand)), dtype=float)
    utilities[0] = alpha * (
            1 - distance_cand) + beta * density_cand + \
        rho * distribution_cand
    query_indices[0] = rand_argmax(utilities[0], random_state)
    is_selected = np.zeros(len(X_cand), dtype=bool)
    is_selected[query_indices[0]] = True

    if batch_size > 1:
        # Compute e_us according to Eq. 14 in [1].
        e_us = np.mean(1 - P_cand_sorted[:, -1])

        # Normalization of the coefficients alpha, beta, and rho such
        # that these coefficients plus
        # lmbda sum up to one.
        rho = min(rho, 1 - lmbda)
        alpha = (1 - (rho + lmbda)) * (1 - e_us)
        beta = 1 - (rho + lmbda) - alpha

    # Select the remaining samples of the batch one after another.
    for i in range(1, batch_size):
        # Update distributions according to Eq. 11 in [1], now also
        # counting the candidates selected so far.
        # NOTE(review): `len(query_indices)` is always `batch_size`, not
        # the number of samples selected so far -- confirm this is
        # intended.
        R_sum = R_cand + np.sum(
            R_cand[is_selected], axis=0, keepdims=True) + R_lbld_sum
        R_mean = R_sum / (len(R_lbld) + len(query_indices) + 1)
        distribution_cand = clf.mixture_model_.weights_ - R_mean
        distribution_cand = np.maximum(
            np.zeros_like(distribution_cand), distribution_cand)
        distribution_cand = 1 - np.sum(distribution_cand, axis=1)

        # Compute diversity according to Eq. 12 in [1].
        # NOTE(review): unlike the distance and density terms, this
        # min/max rescaling has no 1.e-5 offset, so identical diversity
        # values lead to a division by zero -- confirm this is acceptable.
        diversity_cand = -np.log(density_cand + np.sum(density_cand[is_selected])) / (
                len(query_indices) + 1)
        diversity_cand = (diversity_cand - np.min(diversity_cand)) / (
                np.max(diversity_cand) - np.min(diversity_cand))

        # Compute utilities to select sample.
        utilities[i] = alpha * (
                1 - distance_cand) + beta * density_cand + \
            lmbda * diversity_cand \
            + rho * distribution_cand
        # Mask the candidates that were already selected in this batch.
        utilities[i, is_selected] = np.nan
        query_indices[i] = rand_argmax(utilities[i], random_state)
        is_selected[query_indices[i]] = True

    # Check whether utilities are to be returned.
    if return_utilities:
        return query_indices, utilities
    else:
        return query_indices