def _add_training_data(self, X: modALinput, y: modALinput) -> None:
    """
    Adds the new data and label to the known data, but does not retrain the model.

    Args:
        X: The new samples for which the labels are supplied by the expert.
        y: Labels corresponding to the new instances in X.

    Note:
        If the classifier has been fitted, the features in X have to agree with the training samples which
        the classifier has seen.
    """
    check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
              force_all_finite=self.force_all_finite)

    if self.X_training is None:
        # First batch of data: store it (and its transformed form, if requested) as-is.
        self.X_training = X
        self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
        self.y_training = y
    else:
        # Subsequent batches: stack the new samples and labels onto the stored training data.
        try:
            self.X_training = data_vstack((self.X_training, X))
            self.Xt_training = data_vstack((
                self.Xt_training,
                self.transform_without_estimating(X)
            )) if self.on_transformed else None
            self.y_training = data_vstack((self.y_training, y))
        except ValueError:
            raise ValueError('the dimensions of the new training data and label must '
                             'agree with the training data and labels provided so far')
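
# Usage sketch (illustrative, not part of this module): _add_training_data is normally reached
# indirectly through ActiveLearner.teach(), which stacks the new samples onto the stored
# training set before refitting. The RandomForestClassifier and the random toy arrays below
# are assumptions for demonstration only.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner

X_initial, y_initial = np.random.rand(10, 4), np.random.randint(0, 2, size=10)
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_initial, y_training=y_initial)

X_new, y_new = np.random.rand(3, 4), np.random.randint(0, 2, size=3)
learner.teach(X_new, y_new)  # internally calls _add_training_data(X_new, y_new), then refits
assert learner.X_training.shape[0] == 13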
def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
                 unlabeled: modALinput,
                 uncertainty_scores: np.ndarray,
                 n_instances: int,
                 metric: Union[str, Callable],
                 n_jobs: Union[int, None]) -> np.ndarray:
    """
    Query our top :n_instances: to request for labeling.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    Args:
        classifier: One of modAL's supported active learning models.
        unlabeled: Set of records to be considered for our active learning model.
        uncertainty_scores: The classifier's uncertainty scores for each record in the unlabeled set.
        n_instances: Limit on the number of records to query from our unlabeled set.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        The indices of the top n_instances ranked unlabelled samples.
    """
    # Transform the unlabeled data if the classifier queries on the transformed feature space.
    if classifier.on_transformed:
        unlabeled = classifier.transform_without_estimating(unlabeled)

    # Make a local copy of our classifier's training data and define our record container.
    # In the cold-start case, record the best cold-start instance first.
    if classifier.X_training is None:
        best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled,
                                                                            metric=metric,
                                                                            n_jobs=n_jobs)
        instance_index_ranking = [best_coldstart_instance_index]
    elif classifier.X_training.shape[0] > 0:
        labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
        instance_index_ranking = []

    # The maximum number of records to sample.
    ceiling = np.minimum(unlabeled.shape[0], n_instances) - len(instance_index_ranking)

    # Mask for the unlabeled set, initialized as fully transparent (every record still selectable).
    mask = np.ones(unlabeled.shape[0], dtype=bool)

    for _ in range(ceiling):
        # Receive the instance and corresponding index from our unlabeled copy that scores highest.
        instance_index, instance, mask = select_instance(X_training=labeled, X_pool=unlabeled,
                                                         X_uncertainty=uncertainty_scores, mask=mask,
                                                         metric=metric, n_jobs=n_jobs)

        # Add the instance we've considered for labeling to our labeled set. Although we don't
        # know its label yet, we want further iterations to consider the newly added instance so
        # that we don't query the same instance redundantly.
        labeled = data_vstack((labeled, instance))

        # Finally, append our instance's index to the bottom of our ranking.
        instance_index_ranking.append(instance_index)

    # Return a numpy array, not a list.
    return np.array(instance_index_ranking)
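
# Usage sketch (illustrative, not part of this module): ranked_batch is the core of modAL's
# batch-mode strategy and is normally invoked through uncertainty_batch_sampling, wired in as
# the learner's query_strategy. The estimator, random pool, and batch size below are
# assumptions for demonstration only.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling

X_pool = np.random.rand(200, 4)
X_seed, y_seed = np.random.rand(10, 4), np.random.randint(0, 2, size=10)

learner = ActiveLearner(estimator=RandomForestClassifier(),
                        query_strategy=uncertainty_batch_sampling,
                        X_training=X_seed, y_training=y_seed)

# Ask for a batch of 5 records; ranked_batch trades off uncertainty against similarity
# to the already-labeled (and previously selected) records, per Cardoso et al.
query_idx, query_instances = learner.query(X_pool, n_instances=5)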
def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                             p_subsample: float = 1.0, n_instances: int = 1,
                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error is to be estimated.
        X: The samples.
        loss: The loss function to be used. Can be 'binary' or 'log'.
        p_subsample: Probability of keeping a sample from the pool when calculating expected error.
            Significantly improves runtime for large sample pools.
        n_instances: The number of instances to be sampled.
        random_tie_break: If True, shuffles utility scores to randomize the order. This can be used to break
            the tie when the highest utility score is not unique.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    cloned_estimator = clone(learner.estimator)

    for x_idx, x in enumerate(X):
        # Subsample the pool: keep this sample with probability p_subsample, otherwise exclude it.
        if np.random.rand() <= p_subsample:
            # Estimate the expected error after labeling x with each possible label,
            # weighted by the current predicted probability of that label.
            for y_idx, y in enumerate(possible_labels):
                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X)
                if loss == 'binary':
                    nloss = _proba_uncertainty(refitted_proba)
                elif loss == 'log':
                    nloss = _proba_entropy(refitted_proba)

                expected_error[x_idx] += np.sum(nloss)*X_proba[x_idx, y_idx]
        else:
            # Excluded samples get infinite expected error so they are never selected.
            expected_error[x_idx] = np.inf

    # The best instances are the ones with the smallest expected error, so rank by the negated scores.
    if not random_tie_break:
        query_idx = multi_argmax(-expected_error, n_instances)
    else:
        query_idx = shuffled_argmax(-expected_error, n_instances)

    return query_idx, X[query_idx]
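
# Usage sketch (illustrative, not part of this module): expected_error_reduction can be wired
# in as the learner's query_strategy via functools.partial. The LogisticRegression estimator,
# pool sizes, and p_subsample value below are assumptions for demonstration only; since the
# estimator is refitted for every (sample, label) pair, subsampling matters on larger pools.
from functools import partial

import numpy as np
from sklearn.linear_model import LogisticRegression
from modAL.models import ActiveLearner
from modAL.expected_error import expected_error_reduction

X_seed, y_seed = np.random.rand(20, 3), np.random.randint(0, 2, size=20)
X_pool = np.random.rand(50, 3)

learner = ActiveLearner(estimator=LogisticRegression(),
                        query_strategy=partial(expected_error_reduction, loss='log', p_subsample=0.5),
                        X_training=X_seed, y_training=y_seed)

query_idx, query_instances = learner.query(X_pool, n_instances=2)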