def estimate_competence(self, competence_region, distances=None,
                        predictions=None):
    """Estimate the accuracy and the diversity of each base classifier
    :math:`c_{i}` for the classification of the query samples.

    Two criteria are computed over the region of competence of every
    query sample. The accuracy is the mean of the ``DSEL_processed_``
    entries of the neighbors. The diversity is obtained by calling
    ``compute_pairwise_diversity`` once per query sample with the
    neighbors' targets, their ``BKS_DSEL_`` entries and
    ``self.diversity_func_``.

    Parameters
    ----------
    competence_region : array of shape (n_samples, n_neighbors)
        Indices of the k nearest neighbors for each test sample.

    distances : array of shape (n_samples, n_neighbors)
        Distances from the k nearest neighbors to the query.
        Not used by this method.

    predictions : array of shape (n_samples, n_classifiers)
        Predictions of the base classifiers for all test examples.
        Not used by this method.

    Notes
    -----
    This technique uses both the accuracy and the diversity information
    to perform dynamic selection. For this reason two arrays are
    returned instead of a single ndarray of competence estimates.

    Returns
    -------
    accuracy : array of shape (n_samples, n_classifiers)
        Local accuracy estimates (competences) of the base classifiers
        for all query samples.

    diversity : array of shape (n_samples, n_classifiers)
        Average pairwise diversity of each base classifier for all test
        examples.
    """
    n_queries = competence_region.shape[0]

    # Mean over the neighbors axis gives one accuracy per classifier.
    accuracy = np.mean(self.DSEL_processed_[competence_region, :], axis=1)

    neighbor_outputs = self.BKS_DSEL_[competence_region, :]
    neighbor_targets = self.DSEL_target_[competence_region]

    # TODO: optimize this part with numpy instead of a Python loop.
    diversity = np.zeros((n_queries, self.n_classifiers_))
    for row in range(n_queries):
        diversity[row, :] = compute_pairwise_diversity(
            neighbor_targets[row, :],
            neighbor_outputs[row, :, :],
            self.diversity_func_,
        )

    return accuracy, diversity
def fit(self, X, y):
    """Train the DS model by setting the clustering algorithm and
    pre-processing the information required to apply the DS method.

    First the data is divided into K clusters. Then, for each cluster,
    the N most accurate classifiers are selected. From those, the J most
    diverse (or least diverse, when ``self.more_diverse`` is false)
    classifiers are kept to compose the ensemble of the cluster. The
    selected classifier indices are stored in ``self.indices``.

    Parameters
    ----------
    X : array of shape = [n_samples, n_features]
        Data used to fit the model.

    y : array of shape = [n_samples]
        Class labels of each example in X.

    Returns
    -------
    self
        The fitted estimator, so that calls can be chained.
    """
    y_ind = self.setup_label_encoder(y)
    self._set_dsel(X, y_ind)
    labels = self.roc_algorithm.fit_predict(X)

    # For each cluster, estimate the most accurate and then the most (or
    # least) diverse classifiers among them.
    for cluster_index in range(self.k):
        # Indices of the samples assigned to this cluster.
        sample_indices = np.where(labels == cluster_index)[0]

        # Accuracy of each classifier inside the cluster.
        # NOTE(review): a cluster without samples makes np.mean return NaN
        # here; confirm whether empty clusters can occur.
        accuracy = np.mean(self.processed_dsel[sample_indices, :], axis=0)
        self.accuracy_cluster[cluster_index, :] = accuracy

        # The N most accurate classifiers, best first.
        accuracy_indices = np.argsort(accuracy)[::-1][0:self.N]

        # Target labels of the cluster's samples, needed for the diversity
        # computation.
        targets = self.DSEL_target[sample_indices]
        self.diversity_cluster[cluster_index, :] = \
            compute_pairwise_diversity(targets,
                                       self.BKS_dsel[sample_indices, :],
                                       self.diversity_func)

        diversity_of_selected = self.diversity_cluster[cluster_index,
                                                       accuracy_indices]

        if self.more_diverse:
            diversity_indices = np.argsort(
                diversity_of_selected)[::-1][0:self.J]
        else:
            diversity_indices = np.argsort(diversity_of_selected)[0:self.J]

        # Map positions within the top-N list back to classifier indices.
        self.indices[cluster_index, :] = accuracy_indices[diversity_indices]

    # The docstring contract (and scikit-learn convention) is to return
    # the estimator itself; previously the method returned None.
    return self
def estimate_competence(self, query, predictions=None):
    """Estimate the accuracy and the diversity of each base classifier
    :math:`c_{i}` for the classification of the query samples.

    The region of competence of every query sample is obtained from
    ``self._get_region_competence``. The accuracy is the mean of the
    ``processed_dsel`` entries of the neighbors. The diversity is
    obtained by calling ``compute_pairwise_diversity`` once per query
    sample with the neighbors' targets, their ``BKS_dsel`` entries and
    ``self.diversity_func``.

    Parameters
    ----------
    query : array of shape = [n_samples, n_features]
        The query samples.

    predictions : array of shape = [n_samples, n_classifiers]
        Predictions of the base classifiers for all test examples.
        Not used by this method.

    Notes
    -----
    This technique uses both the accuracy and the diversity information
    to perform dynamic selection. For this reason two arrays are
    returned instead of a single ndarray of competence estimates.

    Returns
    -------
    accuracy : array of shape = [n_samples, n_classifiers]
        Local accuracy estimates (competences) of the base classifiers
        for all query samples.

    diversity : array of shape = [n_samples, n_classifiers]
        Average pairwise diversity of each base classifier for all test
        examples.
    """
    _dists, neighbor_idx = self._get_region_competence(query)

    # Mean accuracy of every base classifier over each sample's neighbors.
    accuracy = np.mean(self.processed_dsel[neighbor_idx, :], axis=1)

    neighbor_outputs = self.BKS_dsel[neighbor_idx, :]
    neighbor_targets = self.DSEL_target[neighbor_idx]

    # TODO: optimize this part with numpy instead of a Python loop.
    # Computing the diversity matrix becomes computationally expensive
    # when the region of competence is large.
    n_queries = query.shape[0]
    diversity = np.zeros((n_queries, self.n_classifiers))
    for row in range(n_queries):
        diversity[row, :] = compute_pairwise_diversity(
            neighbor_targets[row, :],
            neighbor_outputs[row, :, :],
            self.diversity_func,
        )

    return accuracy, diversity
def _preprocess_clusters(self):
    """Pre-select the ensemble of base classifiers of every cluster.

    For each cluster found by ``self.clustering_`` the scores returned
    by ``self.get_scores_`` and the pairwise diversity of every base
    classifier are computed over the DSEL samples assigned to that
    cluster. Doing this ahead of time makes the test routines faster,
    since the ensemble of each cluster is already defined.

    The attributes ``performance_cluster_`` and ``diversity_cluster_``
    store, respectively, the score and the diversity of each base
    classifier for each cluster. The attribute ``indices_`` stores the
    pre-selected base classifiers of each cluster.
    """
    cluster_labels = self.clustering_.predict(self.DSEL_data_)

    for cluster in range(self.clustering_.n_clusters):
        # DSEL samples that fall into the current cluster.
        members = np.where(cluster_labels == cluster)[0]

        # Score of every classifier restricted to this cluster.
        scores = self.get_scores_(members)
        self.performance_cluster_[cluster, :] = scores

        # Keep the N_ best-scoring classifiers, best first.
        top_performers = np.argsort(scores)[::-1][0:self.N_]

        # Diversity is computed for all classifiers using the targets of
        # the samples in the cluster.
        self.diversity_cluster_[cluster, :] = compute_pairwise_diversity(
            self.DSEL_target_[members],
            self.BKS_DSEL_[members, :],
            self.diversity_func_,
        )

        candidate_diversity = self.diversity_cluster_[cluster,
                                                      top_performers]

        # Ascending order picks the least diverse; reverse it when the
        # most diverse classifiers are wanted.
        ranking = np.argsort(candidate_diversity)
        if self.more_diverse:
            ranking = ranking[::-1]

        self.indices_[cluster, :] = top_performers[ranking[0:self.J_]]