Example #1
    def _compute_eigenvectors_ncluster(self, affinity, constraint_matrix=None):
        """Perform eigen decomposition and estiamte the number of clusters.

    Perform affinity refinement, eigen decomposition and sort eigenvectors by
    the real part of eigenvalues. Estimate the number of clusters using EigenGap
    principle.

    Args:
      affinity: the affinity matrix of input data
      constraint_matrix: numpy array of shape (n_samples, n_samples). The
        constraint matrix with prior information

    Returns:
      eigenvectors: sorted eigenvectors. numpy array of shape
      (n_samples, n_samples)
      n_clusters: number of clusters as an integer
      max_delta_norm: normalized maximum eigen gap
    """
        # Perform refinement operations on the affinity matrix.
        for refinement_name in self.refinement_options.refinement_sequence:
            refinement_operator = self.refinement_options.get_refinement_operator(
                refinement_name)
            affinity = refinement_operator.refine(affinity)

        if (self.constraint_options
                and not self.constraint_options.apply_before_refinement):
            # Perform the constraint operation after refinement
            affinity = self.constraint_options.constraint_operator.adjust_affinity(
                affinity, constraint_matrix)

        if not self.laplacian_type or self.laplacian_type == LaplacianType.Affinity:
            # Perform eigen decomposition.
            (eigenvalues,
             eigenvectors) = utils.compute_sorted_eigenvectors(affinity)
            # Get number of clusters.
            n_clusters, max_delta_norm = utils.compute_number_of_clusters(
                eigenvalues,
                max_clusters=self.max_clusters,
                stop_eigenvalue=self.stop_eigenvalue,
                eigengap_type=self.eigengap_type,
                descend=True)
        else:
            # Compute Laplacian matrix
            laplacian_norm = laplacian.compute_laplacian(
                affinity, laplacian_type=self.laplacian_type)
            # Perform eigen decomposition. Eigenvalues are sorted in
            # ascending order.
            (eigenvalues,
             eigenvectors) = utils.compute_sorted_eigenvectors(laplacian_norm,
                                                               descend=False)
            # Get number of clusters. Eigenvalues are sorted in ascending order.
            n_clusters, max_delta_norm = utils.compute_number_of_clusters(
                eigenvalues,
                max_clusters=self.max_clusters,
                eigengap_type=self.eigengap_type,
                descend=False)
        return eigenvectors, n_clusters, max_delta_norm
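
The EigenGap step above can be illustrated with a small self-contained sketch. The helper below (`eigengap_num_clusters` is a hypothetical name, not the library's `utils.compute_number_of_clusters`) sorts the eigenvalues, computes the gaps between consecutive values, and returns the position of the largest gap as the estimated number of clusters; the returned gap is raw, not normalized like `max_delta_norm` above.

import numpy as np

def eigengap_num_clusters(eigenvalues, max_clusters=None, descend=True):
    # Sort eigenvalues: descending for affinity eigenvalues, ascending for
    # Laplacian eigenvalues, matching the two branches in the example above.
    values = np.sort(np.asarray(eigenvalues))
    if descend:
        values = values[::-1]
    limit = min(max_clusters or len(values), len(values))
    # Gaps between consecutive eigenvalues within the first `limit` values.
    gaps = np.abs(values[:limit - 1] - values[1:limit])
    k = int(np.argmax(gaps)) + 1
    return k, float(gaps[k - 1])

# Affinity eigenvalues with a clear drop after the third value -> k = 3.
print(eigengap_num_clusters(np.array([5.0, 4.8, 4.5, 0.3, 0.2])))  # (3, ~4.2)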
Example #2
    def test_3by2_matrix(self):
        X = np.array([[1, 2], [3, 4], [1, 3]])
        affinity = utils.compute_affinity_matrix(X)
        w, v = utils.compute_sorted_eigenvectors(affinity)
        self.assertEqual((3,), w.shape)
        self.assertEqual((3, 3), v.shape)
        self.assertGreater(w[0], w[1])
        self.assertGreater(w[1], w[2])
Example #3
    def test_ascend(self):
        matrix = np.array([[1, 2], [3, 4], [1, 3]])
        affinity = utils.compute_affinity_matrix(matrix)
        w, v = utils.compute_sorted_eigenvectors(affinity, descend=False)
        self.assertEqual((3,), w.shape)
        self.assertEqual((3, 3), v.shape)
        self.assertLess(w[0], w[1])
        self.assertLess(w[1], w[2])
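
For reference, the sorting behaviour tested in these two examples can be sketched in a few lines of NumPy. This is an illustration under the assumption of a symmetric affinity matrix, not the library's actual `compute_sorted_eigenvectors`:

import numpy as np

def sorted_eigenvectors(matrix, descend=True):
    # eigh returns real eigenvalues in ascending order for symmetric input.
    eigenvalues, eigenvectors = np.linalg.eigh(matrix)
    if descend:
        # Reverse to descending order and reorder the eigenvector columns too.
        eigenvalues = eigenvalues[::-1]
        eigenvectors = eigenvectors[:, ::-1]
    return eigenvalues, eigenvectors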
    def predict(self, X):
        """Perform spectral clustering on data X.

        Args:
            X: numpy array of shape (n_samples, n_features)

        Returns:
            labels: numpy array of shape (n_samples,)

        Raises:
            TypeError: if X has wrong type
            ValueError: if X has wrong shape
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("X must be a numpy array")
        if len(X.shape) != 2:
            raise ValueError("X must be 2-dimensional")
        #  Compute affinity matrix.
        affinity = utils.compute_affinity_matrix(X)

        # Refinement operations on the affinity matrix.
        for refinement_name in self.refinement_sequence:
            op = self._get_refinement_operator(refinement_name)
            affinity = op.refine(affinity)

        # Perform eigen decomposition.
        (eigenvalues,
         eigenvectors) = utils.compute_sorted_eigenvectors(affinity)
        # Get number of clusters.
        k = utils.compute_number_of_clusters(eigenvalues, self.stop_eigenvalue)
        if self.min_clusters is not None:
            k = max(k, self.min_clusters)
        if self.max_clusters is not None:
            k = min(k, self.max_clusters)

        # Get spectral embeddings.
        spectral_embeddings = eigenvectors[:, :k]

        # Run K-Means++ on spectral embeddings.
        # Note: the correct approach would be to use a K-Means implementation
        # that supports a customized distance measure such as cosine distance.
        # The scikit-learn implementation does NOT, which is inconsistent
        # with the paper.
        kmeans_clusterer = KMeans(n_clusters=k,
                                  init="k-means++",
                                  max_iter=300,
                                  random_state=0)
        labels = kmeans_clusterer.fit_predict(spectral_embeddings)
        return labels
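
As noted in the comment above, scikit-learn's KMeans does not support cosine distance. A common workaround, sketched below as an assumption rather than the library's behaviour (the helper name `cosine_kmeans_labels` is hypothetical), is to L2-normalize each spectral embedding first: for unit vectors, ||a - b||^2 = 2 - 2*cos(a, b), so Euclidean K-Means on the normalized rows approximates cosine-distance clustering.

import numpy as np
from sklearn.cluster import KMeans

def cosine_kmeans_labels(spectral_embeddings, k):
    # L2-normalize each row so Euclidean distance tracks cosine distance.
    norms = np.linalg.norm(spectral_embeddings, axis=1, keepdims=True)
    normalized = spectral_embeddings / np.maximum(norms, 1e-10)
    kmeans = KMeans(n_clusters=k, init="k-means++", max_iter=300,
                    random_state=0)
    return kmeans.fit_predict(normalized)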
    def get_eigen_inputs(self, X, sparse=False, **kwargs):
        """Get the values used as input to Kmeans.

        Args:
            X: numpy array to perform eigen-decomposition on
            sparse: whether or not to use sparse eigen-decomposition
            **kwargs: extra arguments passed to
                spectralcluster.utils.compute_sorted_eigenvectors

        Returns:
            k: predicted number of clusters
            affinity: the refined affinity matrix
            eigenvectors: real eigenvectors of the affinity matrix
            eigenvalues:  real eigenvalues of the affinity matrix

        Raises:
            TypeError: if X has wrong type
            ValueError: if X has wrong shape, or if name is an unknown
                refinement operation
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("X must be a numpy array")
        if len(X.shape) != 2:
            raise ValueError("X must be 2-dimensional")
        #  Compute affinity matrix.
        affinity = utils.compute_affinity_matrix(X)

        # Refinement operations on the affinity matrix.
        for refinement_name in self.refinement_sequence:
            op = self._get_refinement_operator(refinement_name)
            affinity = op.refine(affinity)

        # Perform eigen decomposition.
        (eigenvalues,
         eigenvectors) = utils.compute_sorted_eigenvectors(affinity,
                                                           sparse=sparse,
                                                           **kwargs)
        # Get number of clusters.
        k = utils.compute_number_of_clusters(eigenvalues, self.max_clusters,
                                             self.stop_eigenvalue)
        if self.min_clusters is not None:
            k = max(k, self.min_clusters)

        return k, affinity, eigenvectors, eigenvalues
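
When `sparse=True` is requested, a partial eigen-decomposition is typically much cheaper than a full one for large affinity matrices. The sketch below shows one way this could look using `scipy.sparse.linalg.eigsh`; the function name `sparse_top_eigenvectors` and the `num_eigs` parameter are illustrative assumptions, not necessarily what `compute_sorted_eigenvectors` accepts.

import numpy as np
from scipy.sparse.linalg import eigsh

def sparse_top_eigenvectors(affinity, num_eigs=10):
    # eigsh computes only the num_eigs largest-magnitude eigenpairs and
    # requires num_eigs < n for an n x n matrix.
    num_eigs = min(num_eigs, affinity.shape[0] - 1)
    eigenvalues, eigenvectors = eigsh(affinity, k=num_eigs, which="LM")
    # eigsh does not guarantee descending order, so sort explicitly.
    order = np.argsort(eigenvalues)[::-1]
    return eigenvalues[order], eigenvectors[:, order]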
    def predict(self, X):
        """Perform spectral clustering on data X.

        Args:
            X: numpy array of shape (n_samples, n_features)

        Returns:
            labels: numpy array of shape (n_samples,)

        Raises:
            TypeError: if X has wrong type
            ValueError: if X has wrong shape, or we see an unknown refinement
                operation
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("X must be a numpy array")
        if len(X.shape) != 2:
            raise ValueError("X must be 2-dimensional")
        #  Compute affinity matrix.
        affinity = utils.compute_affinity_matrix(X)

        # Refinement operations on the affinity matrix.
        for op in self.refinement_sequence:
            if op == "CropDiagonal":
                affinity = refinement.CropDiagonal().refine(affinity)
            elif op == "GaussianBlur":
                affinity = refinement.GaussianBlur(
                    self.gaussian_blur_sigma).refine(affinity)
            elif op == "RowWiseThreshold":
                affinity = refinement.RowWiseThreshold(
                    self.p_percentile,
                    self.thresholding_soft_multiplier).refine(affinity)
            elif op == "Symmetrize":
                affinity = refinement.Symmetrize().refine(affinity)
            elif op == "Diffuse":
                affinity = refinement.Diffuse().refine(affinity)
            elif op == "RowWiseNormalize":
                affinity = refinement.RowWiseNormalize().refine(affinity)
            else:
                raise ValueError("Unknown refinement operation: {}".format(op))

        # Perform eigen decomposition.
        (eigenvalues,
         eigenvectors) = utils.compute_sorted_eigenvectors(affinity)
        # Get number of clusters.
        k = utils.compute_number_of_clusters(eigenvalues, self.stop_eigenvalue)
        if self.min_clusters is not None:
            k = max(k, self.min_clusters)
        if self.max_clusters is not None:
            k = min(k, self.max_clusters)

        # Get spectral embeddings.
        spectral_embeddings = eigenvectors[:, :k]

        # Run K-Means++ on spectral embeddings.
        # Note: the correct approach would be to use a K-Means implementation
        # that supports a customized distance measure such as cosine distance.
        # The scikit-learn implementation does NOT, which is inconsistent
        # with the paper.
        kmeans_clusterer = KMeans(n_clusters=k,
                                  init="k-means++",
                                  max_iter=300,
                                  random_state=0)
        labels = kmeans_clusterer.fit_predict(spectral_embeddings)
        return labels
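
Finally, a hedged end-to-end usage sketch. It assumes the `SpectralClusterer` class from older releases of the `spectralcluster` package, whose constructor exposes the same parameters that the `predict` method above reads from `self`; parameter names may differ in other versions of the library.

import numpy as np
from spectralcluster import SpectralClusterer  # assumed pre-refactoring API

# Toy input: two well-separated groups of 2-D points.
X = np.array([
    [1.0, 1.0], [1.1, 0.9], [0.9, 1.1],   # group A
    [5.0, 5.0], [5.1, 4.9], [4.9, 5.1],   # group B
])

clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=4,
    p_percentile=0.95,
    gaussian_blur_sigma=1)

labels = clusterer.predict(X)
print(labels)  # expected: two distinct labels, e.g. [0 0 0 1 1 1]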