예제 #1
0
    def estimate_competence(self,
                            competence_region,
                            distances=None,
                            predictions=None):
        """Estimate the accuracy and the diversity of each base classifier
        :math:`c_{i}` inside the region of competence of every query sample.

        Two criteria are computed. The accuracy of a base classifier is the
        mean of its entries in ``DSEL_processed_`` over the neighbors that
        form the region of competence. The diversity is obtained by calling
        ``compute_pairwise_diversity`` once per query sample, using the
        neighbors' targets (``DSEL_target_``) and the stored predictions of
        the base classifiers for those neighbors (``BKS_DSEL_``).

        Parameters
        ----------
        competence_region : array of shape (n_samples, n_neighbors)
            Indices of the k nearest neighbors for each test sample.

        distances : array of shape (n_samples, n_neighbors), default=None
            Distances from the k nearest neighbors to the query. Not used
            by this method.

        predictions : array of shape (n_samples, n_classifiers), default=None
            Predictions of the base classifiers for all test examples. Not
            used by this method.

        Notes
        -----
        This technique uses both the accuracy and the diversity information
        to perform dynamic selection. For this reason the method returns a
        tuple with these two arrays instead of a single ndarray containing
        the competence level estimates for each base classifier.

        Returns
        -------
        accuracy : array of shape (n_samples, n_classifiers)
            Local accuracy estimates (competences) of the base classifiers
            for all query samples.

        diversity : array of shape (n_samples, n_classifiers)
            Diversity of each base classifier, as computed by
            ``compute_pairwise_diversity``, for all query samples.

        """
        # Average over the neighbors axis -> one value per (query, classifier)
        neighbor_hits = self.DSEL_processed_[competence_region, :]
        accuracy = np.mean(neighbor_hits, axis=1)

        neighbor_outputs = self.BKS_DSEL_[competence_region, :]
        neighbor_targets = self.DSEL_target_[competence_region]

        # TODO: optimize this part with numpy instead of for loops
        n_queries = competence_region.shape[0]
        diversity = np.zeros((n_queries, self.n_classifiers_))
        for row in range(n_queries):
            # Each query is handled separately with its own neighborhood.
            diversity[row, :] = compute_pairwise_diversity(
                neighbor_targets[row, :],
                neighbor_outputs[row, :, :],
                self.diversity_func_)

        return accuracy, diversity
예제 #2
0
    def fit(self, X, y):
        """Train the DS model by setting the clustering algorithm and
        pre-processing the information required to apply the DS method.

        First the data is divided into K clusters. Then, for each cluster,
        the N most accurate classifiers are selected. From those N
        classifiers, the J most (or least, when ``more_diverse`` is False)
        diverse ones are kept to compose the ensemble of the corresponding
        cluster. An ensemble of classifiers is thus assigned to each of the
        K clusters.

        The per-cluster results are written to ``accuracy_cluster``,
        ``diversity_cluster`` and ``indices``.

        Parameters
        ----------
        X : array of shape = [n_samples, n_features]
            Data used to fit the model.

        y : array of shape = [n_samples]
            Class labels of each example in X.

        Returns
        -------
        self
            The fitted instance, so that calls can be chained.
        """

        y_ind = self.setup_label_encoder(y)
        self._set_dsel(X, y_ind)
        labels = self.roc_algorithm.fit_predict(X)

        # For each cluster, estimate the accuracy and the diversity of the
        # base classifiers and pre-select the ensemble assigned to it.
        for cluster_index in range(self.k):

            # Get the indices of the samples in the corresponding cluster.
            sample_indices = np.where(labels == cluster_index)[0]

            # Compute accuracy of each classifier in this cluster
            accuracy = np.mean(self.processed_dsel[sample_indices, :], axis=0)
            self.accuracy_cluster[cluster_index, :] = accuracy

            # Get the N most accurate classifiers for the corresponding
            # cluster (argsort is ascending, hence the reversal).
            accuracy_indices = np.argsort(accuracy)[::-1][0:self.N]

            # Get the target labels for the samples in the corresponding
            # cluster for the diversity calculation.
            targets = self.DSEL_target[sample_indices]
            self.diversity_cluster[cluster_index, :] = \
                compute_pairwise_diversity(targets,
                                           self.BKS_dsel[sample_indices, :],
                                           self.diversity_func)

            # Only the diversity of the N pre-selected classifiers matters
            # for the second selection stage.
            diversity_of_selected = self.diversity_cluster[cluster_index,
                                                           accuracy_indices]

            if self.more_diverse:
                diversity_indices = np.argsort(
                    diversity_of_selected)[::-1][0:self.J]
            else:
                diversity_indices = np.argsort(
                    diversity_of_selected)[0:self.J]

            # diversity_indices index into accuracy_indices; map them back
            # to positions in the pool of classifiers.
            self.indices[cluster_index, :] = \
                accuracy_indices[diversity_indices]

        # The docstring (and the scikit-learn estimator convention) promise
        # that fit returns the estimator itself.
        return self
예제 #3
0
    def estimate_competence(self, query, predictions=None):
        """Estimate the accuracy and the diversity of each base classifier
        :math:`c_{i}` inside the region of competence of every query sample.

        The region of competence of each query is obtained from
        ``_get_region_competence``. The accuracy of a base classifier is the
        mean of its entries in ``processed_dsel`` over the neighbors in that
        region. The diversity is obtained by calling
        ``compute_pairwise_diversity`` once per query sample, using the
        neighbors' targets (``DSEL_target``) and the stored predictions of
        the base classifiers for those neighbors (``BKS_dsel``).

        Parameters
        ----------
        query : array of shape = [n_samples, n_features]
                The query samples.

        predictions : array of shape = [n_samples, n_classifiers]
                      Predictions of the base classifiers for all test
                      examples. Not used by this method.

        Notes
        ------
        This technique uses both the accuracy and the diversity information
        to perform dynamic selection. For this reason the method returns a
        tuple with these two arrays instead of a single ndarray containing
        the competence level estimates for each base classifier.

        Returns
        -------
        accuracy : array of shape = [n_samples, n_classifiers]
                   Local accuracy estimates (competences) of the base
                   classifiers for all query samples.

        diversity : array of shape = [n_samples, n_classifiers]
                    Diversity of each base classifier, as computed by
                    ``compute_pairwise_diversity``, for all query samples.

        """
        # Only the neighbor indices are needed; the first returned value is
        # discarded.
        region = self._get_region_competence(query)[1]

        # Average over the neighbors axis -> one value per (query, classifier)
        neighbor_hits = self.processed_dsel[region, :]
        accuracy = np.mean(neighbor_hits, axis=1)

        neighbor_outputs = self.BKS_dsel[region, :]
        neighbor_targets = self.DSEL_target[region]

        # TODO: optimize this part with numpy instead of for loops
        # The per-query diversity computation becomes computationally
        # expensive when the region of competence is large.
        n_queries = query.shape[0]
        diversity = np.zeros((n_queries, self.n_classifiers))
        for row in range(n_queries):
            diversity[row, :] = compute_pairwise_diversity(
                neighbor_targets[row, :],
                neighbor_outputs[row, :, :],
                self.diversity_func)

        return accuracy, diversity
예제 #4
0
    def _preprocess_clusters(self):
        """Preprocess the competence as well as the average diversity of each
        base classifier for each specific cluster.

        This process makes the test routines faster, since the ensemble of
        classifiers of each cluster is already predefined.

        The class attributes Accuracy_cluster_ and diversity_cluster_ stores
        the accuracy and diversity information respectively of each base
        classifier for each cluster. The attribute indices_ stores the
        pre-selected base classifiers for each cluster.
        """
        labels = self.clustering_.predict(self.DSEL_data_)

        for cluster_index in range(self.clustering_.n_clusters):

            # Get the indices_ of the samples in the corresponding cluster.
            sample_indices = np.where(labels == cluster_index)[0]

            # Compute performance metric of each classifier in this cluster
            score_classifier = self.get_scores_(sample_indices)

            self.performance_cluster_[cluster_index, :] = score_classifier

            # Get the N_ most accurate classifiers in the cluster
            performance_indices = np.argsort(score_classifier)[::-1][0:self.N_]

            # Get the target labels for the samples in the corresponding
            #  cluster for the diversity calculation.

            targets = self.DSEL_target_[sample_indices]
            self.diversity_cluster_[cluster_index, :] = \
                compute_pairwise_diversity(targets,
                                           self.BKS_DSEL_[sample_indices, :],
                                           self.diversity_func_)

            diversity_of_selected = self.diversity_cluster_[
                cluster_index, performance_indices]

            if self.more_diverse:
                diversity_indices = np.argsort(
                    diversity_of_selected)[::-1][0:self.J_]
            else:
                diversity_indices = np.argsort(
                    diversity_of_selected)[0:self.J_]

            self.indices_[
                cluster_index, :] = performance_indices[diversity_indices]