def _compute_eigenvectors_ncluster(self, affinity, constraint_matrix=None):
    """Refine the affinity, eigen-decompose it and estimate the cluster count.

    The affinity matrix is first passed through every operator named in
    `self.refinement_options.refinement_sequence`. If constraint options are
    configured and are not flagged as `apply_before_refinement`, the
    constraint operator then adjusts the refined affinity. Finally either the
    affinity itself or its Laplacian is decomposed, and the number of clusters
    is estimated from the eigenvalues using the EigenGap principle.

    Args:
      affinity: the affinity matrix of input data
      constraint_matrix: numpy array of shape (n_samples, n_samples). The
        constraint matrix with prior information

    Returns:
      eigenvectors: sorted eigenvectors. numpy array of shape
        (n_samples, n_samples)
      n_clusters: number of clusters as an integer
      max_delta_norm: normalized maximum eigen gap
    """
    # Apply the configured refinement operators, in order.
    refinement_options = self.refinement_options
    for name in refinement_options.refinement_sequence:
        operator = refinement_options.get_refinement_operator(name)
        affinity = operator.refine(affinity)

    # Constraints that were not applied before refinement are applied now.
    constraint_options = self.constraint_options
    if constraint_options and not constraint_options.apply_before_refinement:
        affinity = constraint_options.constraint_operator.adjust_affinity(
            affinity, constraint_matrix)

    decompose_affinity = (
        not self.laplacian_type
        or self.laplacian_type == LaplacianType.Affinity)

    if not decompose_affinity:
        # Laplacian path: eigenvalues are requested in ascending order, and
        # the cluster count is estimated on that ascending sequence.
        laplacian_norm = laplacian.compute_laplacian(
            affinity, laplacian_type=self.laplacian_type)
        eigenvalues, eigenvectors = utils.compute_sorted_eigenvectors(
            laplacian_norm, descend=False)
        n_clusters, max_delta_norm = utils.compute_number_of_clusters(
            eigenvalues,
            max_clusters=self.max_clusters,
            eigengap_type=self.eigengap_type,
            descend=False)
        return eigenvectors, n_clusters, max_delta_norm

    # Affinity path: decompose the refined affinity matrix directly.
    eigenvalues, eigenvectors = utils.compute_sorted_eigenvectors(affinity)
    n_clusters, max_delta_norm = utils.compute_number_of_clusters(
        eigenvalues,
        max_clusters=self.max_clusters,
        stop_eigenvalue=self.stop_eigenvalue,
        eigengap_type=self.eigengap_type,
        descend=True)
    return eigenvectors, n_clusters, max_delta_norm
def test_max_clusters(self):
    """Check the estimated cluster count with and without `max_clusters`."""
    eigenvalues = np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5])
    cluster_cap = 2

    # Without a cap, the estimate for these eigenvalues is 5.
    uncapped = utils.compute_number_of_clusters(eigenvalues)
    self.assertEqual(5, uncapped)

    # With a cap of 2, the estimate equals the cap.
    capped = utils.compute_number_of_clusters(
        eigenvalues, max_clusters=cluster_cap)
    self.assertEqual(cluster_cap, capped)
def test_max_clusters(self):
    """Check cluster count and eigen-gap value with and without a cap."""
    eigenvalues = np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5])
    cluster_cap = 2

    # Without a cap: 5 clusters, gap value close to 1.2.
    uncapped = utils.compute_number_of_clusters(eigenvalues)
    n_uncapped, gap_uncapped = uncapped
    self.assertEqual(5, n_uncapped)
    uncapped_gap_ok = np.allclose(1.2, gap_uncapped, atol=0.01)
    self.assertTrue(uncapped_gap_ok)

    # With a cap of 2: the count equals the cap, gap value close to 1.125.
    capped = utils.compute_number_of_clusters(
        eigenvalues, max_clusters=cluster_cap)
    n_capped, gap_capped = capped
    self.assertEqual(cluster_cap, n_capped)
    capped_gap_ok = np.allclose(1.125, gap_capped, atol=0.01)
    self.assertTrue(capped_gap_ok)
def predict(self, X):
    """Cluster the rows of X with spectral clustering.

    Args:
      X: numpy array of shape (n_samples, n_features)

    Returns:
      labels: numpy array of shape (n_samples,)

    Raises:
      TypeError: if X is not a numpy array
      ValueError: if X is not 2-dimensional
    """
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a numpy array")
    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional")

    # Build the affinity matrix, then run it through each refinement
    # operator in the configured order.
    refined = utils.compute_affinity_matrix(X)
    for name in self.refinement_sequence:
        refined = self._get_refinement_operator(name).refine(refined)

    # Eigen decomposition of the refined affinity.
    eigenvalues, eigenvectors = utils.compute_sorted_eigenvectors(refined)

    # Estimate the number of clusters, then clamp it: the lower bound is
    # applied first and the upper bound second, so the upper bound wins if
    # the two conflict.
    n_clusters = utils.compute_number_of_clusters(
        eigenvalues, self.stop_eigenvalue)
    if self.min_clusters is not None:
        n_clusters = max(n_clusters, self.min_clusters)
    if self.max_clusters is not None:
        n_clusters = min(n_clusters, self.max_clusters)

    # The spectral embedding is the first `n_clusters` eigenvector columns.
    embeddings = eigenvectors[:, :n_clusters]

    # Run K-Means++ on the embeddings.
    # Note: strictly, this should use a K-Means implementation that supports
    # a custom distance measure such as cosine distance. The scikit-learn
    # implementation does NOT, which is inconsistent with the paper.
    clusterer = KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        max_iter=300,
        random_state=0)
    return clusterer.fit_predict(embeddings)
def get_eigen_inputs(self, X, sparse=False, **kwargs):
    """Compute the quantities that are used as input to K-Means.

    Args:
      X: numpy array to perform eigen-decomposition on
      sparse: whether or not to use sparse eigen-decomposition
      **kwargs: extra arguments forwarded to
        utils.compute_sorted_eigenvectors

    Returns:
      k: predicted number of clusters
      affinity: the refined affinity matrix
      eigenvectors: eigenvectors from utils.compute_sorted_eigenvectors
      eigenvalues: eigenvalues from utils.compute_sorted_eigenvectors

    Raises:
      TypeError: if X is not a numpy array
      ValueError: if X is not 2-dimensional. The original documentation also
        lists an unknown refinement operation name as a cause; that check is
        not in this method and presumably lives in
        `_get_refinement_operator`.
    """
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a numpy array")
    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional")

    # Build the affinity matrix, then run it through each refinement
    # operator in the configured order.
    refined = utils.compute_affinity_matrix(X)
    for name in self.refinement_sequence:
        refined = self._get_refinement_operator(name).refine(refined)

    # Eigen decomposition; `sparse` and any extra options go to the helper.
    decomposition = utils.compute_sorted_eigenvectors(
        refined, sparse=sparse, **kwargs)
    eigenvalues, eigenvectors = decomposition

    # Estimate the number of clusters, then apply the optional lower bound.
    n_clusters = utils.compute_number_of_clusters(
        eigenvalues, self.max_clusters, self.stop_eigenvalue)
    if self.min_clusters is not None:
        n_clusters = max(n_clusters, self.min_clusters)

    return n_clusters, refined, eigenvectors, eigenvalues
def test_5_values(self):
    """Five eigenvalues with a large drop after the third give 3 clusters."""
    spectrum = np.array([1.0, 0.9, 0.8, 0.2, 0.1])
    n_clusters = utils.compute_number_of_clusters(spectrum)
    self.assertEqual(3, n_clusters)
def predict(self, X):
    """Cluster the rows of X with spectral clustering.

    Args:
      X: numpy array of shape (n_samples, n_features)

    Returns:
      labels: numpy array of shape (n_samples,)

    Raises:
      TypeError: if X is not a numpy array
      ValueError: if X is not 2-dimensional, or if the refinement sequence
        contains an unknown operation name
    """
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a numpy array")
    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional")

    affinity = utils.compute_affinity_matrix(X)

    # Refine the affinity matrix: choose the operator for each configured
    # name, then apply it. An unrecognized name aborts before any further
    # refinement is attempted.
    for name in self.refinement_sequence:
        if name == "CropDiagonal":
            operator = refinement.CropDiagonal()
        elif name == "GaussianBlur":
            operator = refinement.GaussianBlur(self.gaussian_blur_sigma)
        elif name == "RowWiseThreshold":
            operator = refinement.RowWiseThreshold(
                self.p_percentile, self.thresholding_soft_multiplier)
        elif name == "Symmetrize":
            operator = refinement.Symmetrize()
        elif name == "Diffuse":
            operator = refinement.Diffuse()
        elif name == "RowWiseNormalize":
            operator = refinement.RowWiseNormalize()
        else:
            raise ValueError("Unknown refinement operation: {}".format(name))
        affinity = operator.refine(affinity)

    # Eigen decomposition of the refined affinity.
    eigenvalues, eigenvectors = utils.compute_sorted_eigenvectors(affinity)

    # Estimate the number of clusters, then clamp it: the lower bound is
    # applied first and the upper bound second, so the upper bound wins if
    # the two conflict.
    n_clusters = utils.compute_number_of_clusters(
        eigenvalues, self.stop_eigenvalue)
    if self.min_clusters is not None:
        n_clusters = max(n_clusters, self.min_clusters)
    if self.max_clusters is not None:
        n_clusters = min(n_clusters, self.max_clusters)

    # The spectral embedding is the first `n_clusters` eigenvector columns.
    embeddings = eigenvectors[:, :n_clusters]

    # Run K-Means++ on the embeddings.
    # Note: strictly, this should use a K-Means implementation that supports
    # a custom distance measure such as cosine distance. The scikit-learn
    # implementation does NOT, which is inconsistent with the paper.
    clusterer = KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        max_iter=300,
        random_state=0)
    return clusterer.fit_predict(embeddings)
def test_ascend(self):
    """Check the estimate when called with `descend=False`."""
    spectrum = np.array([1.0, 0.9, 0.8, 0.2, 0.1])
    estimate = utils.compute_number_of_clusters(
        spectrum, max_clusters=3, descend=False)
    n_clusters, max_delta_norm = estimate

    self.assertEqual(2, n_clusters)
    gap_matches = np.allclose(0.88, max_delta_norm, atol=0.01)
    self.assertTrue(gap_matches)
def test_5_values(self):
    """Five eigenvalues with a large drop after the third give 3 clusters."""
    spectrum = np.array([1.0, 0.9, 0.8, 0.2, 0.1])
    estimate = utils.compute_number_of_clusters(spectrum)
    n_clusters, max_delta_norm = estimate

    self.assertEqual(3, n_clusters)
    # The expected gap value is 4.0, the ratio 0.8 / 0.2 across the drop.
    gap_matches = np.allclose(4.0, max_delta_norm, atol=0.01)
    self.assertTrue(gap_matches)