def _svd(self, array, n_components, n_discard):
    """Compute leading singular vectors of ``array`` and drop the first few.

    Parameters
    ----------
    array : matrix to decompose (passed straight to the SVD routine).
    n_components : number of singular vectors requested from the solver.
    n_discard : number of leading vectors removed from both results.

    Returns
    -------
    (u, v) : left singular vectors with the first ``n_discard`` columns
        removed, and the transposed right singular vectors with the first
        ``n_discard`` rows removed before transposition.
    """
    method = self.svd_method
    if method == "randomized":
        # Only forward n_oversamples when the user configured it.
        extra = {}
        if self.n_svd_vecs is not None:
            extra["n_oversamples"] = self.n_svd_vecs
        left, _, right_t = randomized_svd(array, n_components,
                                          random_state=self.random_state,
                                          **extra)
    elif method == "arpack":
        left, _, right_t = svds(array, k=n_components, ncv=self.n_svd_vecs)
        if np.isnan(right_t).any():
            # Some eigenvalues of A * A.T are negative, so sqrt() yields
            # nan and poisons vectors in vt; recompute from A.T * A.
            gram = safe_sparse_dot(array.T, array)
            right_t = eigsh(gram, ncv=self.n_svd_vecs)[1].T
        if np.isnan(left).any():
            gram = safe_sparse_dot(array, array.T)
            left = eigsh(gram, ncv=self.n_svd_vecs)[1]

    assert_all_finite(left)
    assert_all_finite(right_t)
    return left[:, n_discard:], right_t[n_discard:].T
def _svd(self, array, n_components, n_discard):
    """Return singular vectors ``u`` and ``v`` after discarding the leading ones.

    ``n_components`` vectors are requested from the configured solver
    (``self.svd_method`` is ``'randomized'`` or ``'arpack'``); the first
    ``n_discard`` of them are then removed from both factors.
    """
    def _eigenvectors(gram):
        # Fallback used when svds() produced nan entries.
        return eigsh(gram, ncv=self.n_svd_vecs)[1]

    if self.svd_method == 'randomized':
        options = {}
        if self.n_svd_vecs is not None:
            options.update(n_oversamples=self.n_svd_vecs)
        u, _, vt = randomized_svd(array, n_components,
                                  random_state=self.random_state,
                                  **options)
    elif self.svd_method == 'arpack':
        u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
        vt_is_broken = np.any(np.isnan(vt))
        if vt_is_broken:
            # Negative eigenvalues of A * A.T make sqrt() return nan,
            # which shows up as nan vectors in vt.
            vt = _eigenvectors(safe_sparse_dot(array.T, array)).T
        u_is_broken = np.any(np.isnan(u))
        if u_is_broken:
            u = _eigenvectors(safe_sparse_dot(array, array.T))

    for factor in (u, vt):
        assert_all_finite(factor)

    kept_u = u[:, n_discard:]
    kept_vt = vt[n_discard:]
    return kept_u, kept_vt.T
def custom_svd(array, n_components, n_discard, n_svd_vecs):
    """ARPACK-based SVD with an eigen-decomposition fallback for nan output.

    Requests ``n_components`` singular vectors from ``svds`` (``ncv`` set to
    ``n_svd_vecs``).  If either factor contains nan it is recomputed with
    ``eigsh`` on the corresponding Gram matrix.  The first ``n_discard``
    vectors are dropped from both factors before returning ``(u, v)``.
    """
    left, _, right_t = svds(array, k=n_components, ncv=n_svd_vecs)

    if np.isnan(right_t).any():
        right_t = eigsh(safe_sparse_dot(array.T, array), ncv=n_svd_vecs)[1].T
    if np.isnan(left).any():
        left = eigsh(safe_sparse_dot(array, array.T), ncv=n_svd_vecs)[1]

    assert_all_finite(left)
    assert_all_finite(right_t)
    return left[:, n_discard:], right_t[n_discard:].T
def spectralcluster(A, n_cluster, n_neighbors=6, random_state=None,
                    eigen_tol=0.0):
    """Cluster the nodes of affinity matrix ``A`` via a spectral embedding.

    Parameters
    ----------
    A : affinity / adjacency matrix handed to ``graph_laplacian``.
    n_cluster : number of eigenvectors computed and of k-means clusters.
    n_neighbors : accepted for interface compatibility; not used here.
    random_state : seed or generator forwarded to ``check_random_state``.
    eigen_tol : tolerance passed to ``eigsh``.

    Returns
    -------
    (eigenvalues, y, centroids, labels) : the raw ``eigsh`` eigenvalues, the
        embedding fed to k-means, and the k-means centroids and labels.
    """
    # ``degree_term`` is the diagonal returned alongside the Laplacian.
    lap, degree_term = graph_laplacian(A, normed=True, return_diag=True)
    # Set the diagonal of the Laplacian before the eigen-decomposition.
    lap = _set_diag(lap, 1)

    # Negate in place, then ask for the eigenvalues closest to 1.0
    # (shift-invert mode with which='LM').
    lap *= -1
    eigenvalues, eigenvectors = eigsh(lap, k=n_cluster, sigma=1.0,
                                      which='LM', tol=eigen_tol)

    reversed_rows = eigenvectors.T[n_cluster::-1] * degree_term
    y = _deterministic_vector_sign_flip(reversed_rows)[:n_cluster].T

    rng = check_random_state(random_state)
    centroids, labels, _ = k_means(y, n_cluster, random_state=rng)
    return eigenvalues, y, centroids, labels
def my_uniteigenvector_zeroeigenvalue_cluster(k):
    """Spectral clustering of the pickled test graph into ``k`` clusters.

    Loads ``data/undirected(fortest).gpickle``, drops the last node, takes
    ``k`` eigenvectors of the negated normalized Laplacian, overwrites the
    last eigenvector with a constant unit vector, normalizes every row to
    unit length and runs k-means on the rows.

    Returns
    -------
    labels : cluster label per node, as returned by ``k_means``.
    """
    graph = nx.read_gpickle('data/undirected(fortest).gpickle')
    kept_nodes = graph.nodes()[:-1]
    n_nodes = len(kept_nodes)
    adjacency = nx.adjacency_matrix(graph, nodelist=kept_nodes,
                                    weight='weight')

    normed_lap, _ = graph_laplacian(adjacency, normed=True, return_diag=True)
    _, vectors = eigsh(normed_lap * (-1), k=k, sigma=1.0, which='LM',
                       tol=0.0)

    # Replace the last eigenvector (zero eigenvalue) by the constant
    # unit vector.
    vectors[:, -1] = np.full(n_nodes, 1.0 / math.sqrt(n_nodes))

    # Normalize every row to a unit vector.
    for row_index in range(len(vectors)):
        row = vectors[row_index]
        vectors[row_index] = row / float(np.linalg.norm(row))

    labels = k_means(vectors, k, random_state=None, n_init=100)[1]
    return labels
def embed(self, laplacian, diagonal, k, tol=0):
    """Embed the graph using eigenvectors of the negated Laplacian.

    ``k + 1`` eigenvectors of smallest magnitude are computed, reversed,
    multiplied by ``diagonal`` and the first one is dropped.

    Returns
    -------
    Array with one row per node; passed through ``scale(..., axis=1)``
    when ``self.do_scale`` is truthy.
    """
    n_vectors = k + 1
    _, vectors = eigsh(-laplacian, k=n_vectors, which='SM', tol=tol)
    weighted = vectors.T[n_vectors::-1] * diagonal
    coordinates = weighted[1:n_vectors].T
    if not self.do_scale:
        return coordinates
    return scale(coordinates, axis=1)
def runEmbed(data, n_components):
    """Diffusion-style embedding from the leading eigenpairs of ``data``.

    ``n_components`` eigenpairs are computed; the eigenvectors are divided
    by the leading one, which is then dropped, so the result has
    ``n_components - 1`` columns.  Each column is weighted by
    ``lambda / (1 - lambda)`` of its eigenvalue.
    """
    evals, evecs = eigsh(data, k=n_components)
    # eigsh output is reversed so the largest eigenvalue comes first.
    evals, evecs = evals[::-1], evecs[:, ::-1]

    leading = evecs[:, 0]
    psi = evecs / leading[:, None]

    tail = evals[1:]
    weights = tail / (1 - tail)
    return psi[:, 1:(n_components + 1)] * weights[:n_components][None, :]
def runEmbed(data, n_components):
    """Return the eigenvector embedding of the symmetric matrix ``data``.

    The ``n_components`` eigenpairs from ``eigsh`` are put in reverse
    order, every eigenvector is expressed relative to the first one, and
    the remaining ``n_components - 1`` vectors are scaled by
    ``lambda / (1 - lambda)``.
    """
    lambdas, vectors = eigsh(data, k=n_components)
    lambdas = lambdas[::-1]
    vectors = np.fliplr(vectors)

    first_column = vectors[:, 0][:, np.newaxis]
    psi = vectors / first_column

    lambdas = lambdas[1:] / (1 - lambdas[1:])
    selected = psi[:, 1:(n_components + 1)]
    scaling = lambdas[:n_components][np.newaxis, :]
    return selected * scaling
def seriation(self, A):
    """Order the nodes of ``A`` by one eigenvector of its graph Laplacian.

    Parameters
    ----------
    A : dense array/matrix or sparse matrix of pairwise affinities.  It is
        no longer modified: the diagonal is zeroed on a private copy.

    Returns
    -------
    sort_index : indices that sort the selected eigenvector (row 1 of the
        reversed ``eigsh`` output) in ascending order.
    """
    n_components = 2
    eigen_tol = 0.00001

    if sparse.issparse(A):
        # todense() already allocates a fresh matrix.
        A = A.todense()
    else:
        # np.fill_diagonal works in place; copy so the caller's matrix
        # keeps its diagonal.
        A = A.copy()
    np.fill_diagonal(A, 0)

    laplacian, _ = graph_laplacian(A, return_diag=True)
    # Negate, then use shift-invert mode around 1.0 with which='LM'.
    laplacian *= -1
    _, diffusion_map = eigsh(laplacian, k=n_components, sigma=1.0,
                             which='LM', tol=eigen_tol)

    # Rows are the eigenvectors in reversed order (no rescaling by the
    # Laplacian diagonal is applied here).
    embedding = diffusion_map.T[n_components::-1]
    return np.argsort(embedding[1])
def main(argv):
    """Embed a correlation matrix and cluster it for a range of k.

    Reads the ``data`` variable from ``./test.mat`` (HDF5), builds a
    normalized affinity matrix, computes an eigenvector embedding, runs
    k-means for every cluster count in ``[comp_min, comp_max)`` and writes
    the label vectors to ``./test_results.mat``.

    Parameters
    ----------
    argv : command-line arguments; currently unused.
    """
    # Set defaults:
    n_components_embedding = 25
    comp_min = 2
    comp_max = 20 + 1
    varname = 'data'
    filename = './test'

    # Import files.  The context manager closes the HDF5 handle once the
    # data has been copied into memory.
    with h5py.File('%s.mat' % filename, 'r') as f:
        dataCorr = np.array(f.get(varname))

    # Prep matrix: shift values from [-1, 1] to [0, 1] and normalize by the
    # square roots of the row sums.
    K = (dataCorr + 1) / 2.
    v = np.sqrt(np.sum(K, axis=1))
    A = K / (v[:, None] * v[None, :])
    del K
    # Zero out non-positive entries.
    A = A * (A > 0)

    # Run embedding
    lambdas, vectors = eigsh(A, k=n_components_embedding)
    lambdas = lambdas[::-1]
    vectors = vectors[:, ::-1]
    psi = vectors / vectors[:, 0][:, None]
    lambdas = lambdas[1:] / (1 - lambdas[1:])
    embedding = (psi[:, 1:(n_components_embedding + 1)]
                 * lambdas[:n_components_embedding][None, :])

    # Run kmeans clustering
    def kmeans(embedding, n_components):
        est = KMeans(n_clusters=n_components, n_jobs=-1, init='k-means++',
                     n_init=300)
        est.fit_transform(embedding)
        # ``float`` is what the removed ``np.float`` alias pointed to.
        return est.labels_.astype(float)

    results = [kmeans(embedding, n_components)
               for n_components in range(comp_min, comp_max)]

    savemat('%s_results.mat' % filename, {'results': results})
def runFiedler(conn):
    """Two-component eigenvector embedding of connectivity matrix ``conn``.

    Based on https://github.com/margulies/topography.  ``conn`` is shifted
    with ``(conn + 1) / 2``, normalized by the square roots of its row
    sums, and the three leading eigenpairs are computed; the first
    eigenvector only serves as the normalizer, so two columns are returned.
    """
    n_keep = 2

    # Prep for embedding.
    kernel = (conn + 1) / 2.
    root_degree = np.sqrt(np.sum(kernel, axis=1))
    affinity = kernel / (root_degree[:, None] * root_degree[None, :])
    del kernel
    affinity = np.squeeze(affinity * [affinity > 0])

    # Diffusion embedding.
    evals, evecs = eigsh(affinity, k=n_keep + 1)
    del affinity
    evals, evecs = evals[::-1], evecs[:, ::-1]

    psi = evecs / evecs[:, 0][:, None]
    weights = evals[1:] / (1 - evals[1:])
    return psi[:, 1:(n_keep + 1 + 1)] * weights[:n_keep + 1][None, :]
def test_arpack_eigsh_initialization():
    """Non-regression test for the eigsh start vector.

    Null-space computation behaves better when eigsh is initialized from
    [-1, 1] (as in original ARPACK) than from [0, 1], with which this test
    could fail.
    """
    rng = check_random_state(42)

    base = rng.rand(50, 50)
    spd = np.dot(base.T, base)  # create s.p.d. matrix
    shifted = graph_laplacian(spd) + 1e-7 * np.identity(spd.shape[0])

    # New initialization [-1, 1]; it was [0, 1] before.
    start_vector = rng.uniform(-1, 1, shifted.shape[0])
    eigenvalues = eigsh(shifted, k=5, sigma=0.0, v0=start_vector)[0]

    # Eigenvalues of an s.p.d. matrix should be nonnegative;
    # eigenvalues[0] is the smallest.
    assert_greater_equal(eigenvalues[0], 0)
def test_arpack_eigsh_initialization():
    """Check that eigsh seeded from [-1, 1] yields nonnegative eigenvalues.

    Non-regression test: with the former [0, 1] initialization the
    null-space computation was worse and this test could fail.
    """
    n_eigenvalues = 5
    random_state = check_random_state(42)

    sample = random_state.rand(50, 50)
    gram = sample.T.dot(sample)  # create s.p.d. matrix
    n_rows = gram.shape[0]
    matrix = laplacian(gram) + 1e-7 * np.identity(n_rows)

    # Start vector drawn from [-1, 1], as in original ARPACK.
    v0 = random_state.uniform(-1, 1, matrix.shape[0])
    smallest_first, _ = eigsh(matrix, k=n_eigenvalues, sigma=0.0, v0=v0)

    # The smallest eigenvalue of an s.p.d. matrix must not be negative.
    assert_greater_equal(smallest_first[0], 0)
def _fit_transform(self, K):
    """Fit using the kernel matrix ``K``.

    Centers ``K`` with ``self._centerer``, computes the leading eigenpairs
    and stores them on the instance.

    Attributes set
    --------------
    lambdas_, alphas_ : selected eigenvalues (descending) and the matching
        eigenvectors as columns.
    evals_, evecs_ : full decomposition of the centered kernel; only set
        by the ``'dense'`` solver.

    Returns
    -------
    K : the centered kernel matrix.

    Raises
    ------
    ValueError : if the resolved eigen solver is neither ``'dense'`` nor
        ``'arpack'``.
    """
    # center kernel
    K = self._centerer.fit_transform(K)

    n_samples = K.shape[0]
    if self.n_components is None:
        n_components = n_samples
    else:
        n_components = min(n_samples, self.n_components)

    # compute eigenvectors
    if self.eigen_solver == 'auto':
        if n_samples > 200 and n_components < 10:
            eigen_solver = 'arpack'
        else:
            eigen_solver = 'dense'
    else:
        eigen_solver = self.eigen_solver

    if eigen_solver == 'dense':
        index_range = (n_samples - n_components, n_samples - 1)
        try:
            self.lambdas_, self.alphas_ = linalg.eigh(
                K, subset_by_index=index_range)
        except TypeError:
            # Older SciPy releases only know the legacy ``eigvals`` keyword.
            self.lambdas_, self.alphas_ = linalg.eigh(
                K, eigvals=index_range)
        self.evals_, self.evecs_ = linalg.eigh(K)
    elif eigen_solver == 'arpack':
        self.lambdas_, self.alphas_ = eigsh(K, n_components,
                                            which="LA",
                                            tol=self.tol,
                                            maxiter=self.max_iter)
    else:
        # Previously fell through and reused stale (or missing) attributes.
        raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)

    # sort eigenvectors in descending order
    indices = self.lambdas_.argsort()[::-1]
    self.lambdas_ = self.lambdas_[indices]
    self.alphas_ = self.alphas_[:, indices]

    # remove eigenvectors with a zero eigenvalue
    if self.remove_zero_eig or self.n_components is None:
        positive = self.lambdas_ > 0
        self.alphas_ = self.alphas_[:, positive]
        self.lambdas_ = self.lambdas_[positive]

    return K
def main(argv):
    """Run the embedding + k-means pipeline on ``./test.mat``.

    Loads variable ``data`` from the HDF5 file, converts it to a normalized
    affinity matrix, embeds it with the leading eigenvectors, clusters the
    embedding with k-means for each cluster count from ``comp_min`` up to
    (but excluding) ``comp_max`` and saves all label vectors to
    ``./test_results.mat``.

    Parameters
    ----------
    argv : command-line arguments; not used by the current implementation.
    """
    # Set defaults:
    n_components_embedding = 25
    comp_min = 2
    comp_max = 20 + 1
    varname = 'data'
    filename = './test'

    # Import files; ``with`` guarantees the HDF5 file is closed again.
    with h5py.File('%s.mat' % filename, 'r') as f:
        dataCorr = np.array(f.get(varname))

    # Prep matrix
    K = (dataCorr + 1) / 2.
    v = np.sqrt(np.sum(K, axis=1))
    A = K / (v[:, None] * v[None, :])
    del K
    # Keep positive entries only (elementwise mask, no 3-D temporary).
    A = A * (A > 0)

    # Run embedding
    lambdas, vectors = eigsh(A, k=n_components_embedding)
    lambdas = lambdas[::-1]
    vectors = vectors[:, ::-1]
    psi = vectors / vectors[:, 0][:, None]
    lambdas = lambdas[1:] / (1 - lambdas[1:])
    embedding = (psi[:, 1:(n_components_embedding + 1)]
                 * lambdas[:n_components_embedding][None, :])

    # Run kmeans clustering
    def kmeans(embedding, n_components):
        est = KMeans(n_clusters=n_components, n_jobs=-1, init='k-means++',
                     n_init=300)
        est.fit_transform(embedding)
        labels = est.labels_
        # Builtin ``float`` replaces the removed ``np.float`` alias.
        return labels.astype(float)

    results = []
    for n_components in range(comp_min, comp_max):
        results.append(kmeans(embedding, n_components))

    savemat('%s_results.mat' % filename, {'results': results})
def DoFiedler(conn):
    """Compute a two-column eigenvector embedding of ``conn``.

    The matrix is shifted via ``(conn + 1) / 2`` and normalized by the
    square roots of its row sums.  Three eigenpairs are requested; the
    leading eigenvector is used as the divisor and then discarded.
    """
    n_components_embedding = 2
    n_requested = n_components_embedding + 1

    # prep for embedding
    K = (conn + 1) / 2.
    v = np.sqrt(np.sum(K, axis=1))
    normalizer = v[:, None] * v[None, :]
    A = K / normalizer
    del K
    A = np.squeeze(A * [A > 0])

    # diffusion embedding
    lambdas, vectors = eigsh(A, k=n_requested)
    del A
    lambdas = lambdas[::-1]
    vectors = vectors[:, ::-1]

    leading = vectors[:, 0]
    psi = vectors / leading[:, None]
    lambdas = lambdas[1:] / (1 - lambdas[1:])

    columns = psi[:, 1:(n_requested + 1)]
    return columns * lambdas[:n_requested][None, :]
def predict_k(affinity_matrix):
    """Estimate the number of clusters from the largest eigengap.

    Computes ``n_samples - 1`` eigenvalues of the normalized graph
    Laplacian of ``affinity_matrix`` and returns the 1-based position of
    the eigenvalue that precedes the largest gap between consecutive
    eigenvalues.
    """
    lap, _ = graph_laplacian(affinity_matrix, normed=True, return_diag=True)
    lap = _set_diag(lap, 1, norm_laplacian=True)

    n_eigenvalues = affinity_matrix.shape[0] - 1
    raw, _ = eigsh(-lap, k=n_eigenvalues, which="LM", sigma=1.0,
                   maxiter=5000)
    spectrum = -raw[::-1]  # Reverse and sign inversion.

    widest_gap = 0
    left_of_widest = 0
    pairs = zip(spectrum[:-1], spectrum[1:])
    for left_index, (lower, upper) in enumerate(pairs):
        gap = upper - lower
        if gap > widest_gap:
            widest_gap = gap
            left_of_widest = left_index

    return left_of_widest + 1
def DoFiedler(conn):
    """Eigenvector embedding of ``conn`` built from five eigenpairs.

    ``conn`` is expected to hold values between -1 and 1; it is mapped to
    a non-negative similarity matrix, normalized by the square roots of
    its row sums, and decomposed.  The leading eigenvector is used as the
    divisor and dropped, so four columns are returned.
    """
    n_components_embedding = 5

    # K: matrix of similarities / kernel matrix / Gram matrix.
    # (conn + 1) / 2 makes conn non-negative.
    similarity = (conn + 1) / 2.
    # axis=1: square roots of the row sums of the similarity matrix.
    root_sums = np.sqrt(np.sum(similarity, axis=1))
    # Outer product of the root row sums, used as elementwise divisor.
    divisor = root_sums[:, None] * root_sums[None, :]
    affinity = similarity / divisor
    del similarity
    affinity = np.squeeze(affinity * [affinity > 0])

    evals, evecs = eigsh(affinity, k=n_components_embedding)
    del affinity

    # Reverse the eigsh output so the largest eigenvalue comes first.
    evals = evals[::-1]
    evecs = evecs[:, ::-1]
    psi = evecs / evecs[:, 0][:, None]

    # Begin from the second largest eigenvalue and its eigenvector.
    tail = evals[1:]
    weights = tail / (1 - tail)
    kept = psi[:, 1:(n_components_embedding + 1)]
    return kept * weights[:n_components_embedding][None, :]
fullsize = len(dataAll) del dataAll # correlate dataCorr = np.corrcoef(np.transpose(np.array(dataNorm))) del dataNorm dataCorr[np.isnan(dataCorr)] = 0 # prep for embedding K = (dataCorr + 1) / 2. del dataCorr v = np.sqrt(np.sum(K, axis=1)) A = K/(v[:, None] * v[None, :]) del K A = np.squeeze(A * [A > 0]) # diffusion embedding lambdas, vectors = eigsh(A, k=n_components_embedding+1) del A lambdas = lambdas[::-1] vectors = vectors[:, ::-1] psi = vectors/vectors[:, 0][:, None] lambdas = lambdas[1:] / (1 - lambdas[1:]) embedding = psi[:, 1:(n_components_embedding + 1 + 1)] * lambdas[:n_components_embedding+1][None, :] # kmeans clustering results = [] for n_components in xrange(comp_min, comp_max+1): est = KMeans(n_clusters=n_components, n_jobs=-1, init='k-means++', n_init=300) est.fit_transform(embedding) labels = est.labels_ clust = labels.astype(np.float) # reinsert zeros: padded = np.zeros(fullsize)
def spectral_embedding(self, adjacency, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=0.0, drop_first=True):
    """Spectral embedding that also returns the solver's eigenvalues.

    Adapted from scikit-learn; see the original at
    https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/spectral_embedding_.py#L133

    Customization 1: ``lambdas`` is returned together with the embedded
    matrix.
    Customization 2: ``norm_laplacian`` is always True.

    Parameters
    ----------
    adjacency : matrix passed through ``check_symmetric`` and then to
        ``graph_laplacian``.
    n_components : number of embedding dimensions; one extra eigenvector is
        computed when ``drop_first`` is true.
    eigen_solver : None (treated as 'arpack'), 'arpack', 'lobpcg' or 'amg'.
    random_state : seed or generator used for the lobpcg start block.
    eigen_tol : tolerance passed to ``eigsh``.
    drop_first : when true, the first row of the embedding is dropped.

    Returns
    -------
    (embedding, lambdas) : the transposed embedding and the eigenvalues
        exactly as produced by whichever solver call ran last.  ``lambdas``
        is not reordered, negated back or truncated to match the embedding
        (in the ARPACK path it comes from the negated Laplacian; in the
        ``eigh`` path it holds every eigenvalue).

    Raises
    ------
    ValueError : unknown ``eigen_solver``, 'amg' requested without pyamg,
        or a lobpcg-based embedding with a single row.
    """
    norm_laplacian = True
    adjacency = check_symmetric(adjacency)

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        # pyamg is optional; only complain when it was explicitly requested.
        if eigen_solver == "amg":
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is "
                "not available.")

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn("Graph is not fully connected, spectral embedding"
                      " may not work as expected.")

    laplacian, dd = graph_laplacian(adjacency, normed=norm_laplacian,
                                    return_diag=True)
    if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian) or
             n_nodes < 5 * n_components)):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol)
            # Reverse the order of the eigenvectors and rescale by dd.
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    if eigen_solver == 'amg':
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        laplacian = _set_diag(laplacian, 1, norm_laplacian)
        ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
        M = ml.aspreconditioner()
        # Random start block whose first column is replaced by dd.
        X = random_state.rand(laplacian.shape[0], n_components + 1)
        X[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    elif eigen_solver == "lobpcg":
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            # NOTE(review): eigh returns the full spectrum here, so lambdas
            # has one entry per node rather than n_components entries.
            lambdas, diffusion_map = eigh(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
            if embedding.shape[0] == 1:
                raise ValueError

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T, lambdas
    else:
        return embedding[:n_components].T, lambdas
def null_space(M, k, k_skip=1, eigen_solver='dense', tol=1E-6, max_iter=100,
               random_state=None):
    """Find the null space of a matrix M.

    Parameters
    ----------
    M : {array, matrix, sparse matrix, LinearOperator}
        Input covariance matrix: should be symmetric positive semi-definite.
        With ``eigen_solver='dense'`` a dense ``M`` may be overwritten
        (``eigh`` is called with ``overwrite_a=True``).

    k : integer
        Number of eigenvalues/vectors to return

    k_skip : integer, optional
        Number of low eigenvalues to skip.

    eigen_solver : string, {'auto', 'arpack', 'dense'}
        auto : 'arpack' is chosen when ``M.shape[0] > 200`` and
            ``k + k_skip < 10``, otherwise 'dense'.
        arpack : use arnoldi iteration in shift-invert mode.
            For this method, M may be a dense matrix, sparse matrix,
            or general linear operator.
            Warning: ARPACK can be unstable for some problems.  It is
            best to try several random seeds in order to check results.
        dense : use standard dense matrix operations for the eigenvalue
            decomposition.  For this method, M must be an array
            or matrix type.  This method should be avoided for
            large problems.

    tol : float, optional
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : maximum number of iterations for 'arpack' method
        not used if eigen_solver=='dense'

    random_state : numpy.RandomState or int, optional
        The generator or seed used to determine the starting vector
        for arpack iterations.  Defaults to numpy.random.

    Returns
    -------
    The shape of the result depends on the solver:

    arpack : ``(embedding_vectors, eigen_values)`` with the first
        ``k_skip`` eigenpairs removed.
    dense : ``(embedding_vectors, evals, evecs)`` where
        ``embedding_vectors`` are the eigenvectors with indices
        ``k_skip .. k + k_skip - 1`` ordered by absolute eigenvalue,
        ``evals`` are all eigenvalues in descending order and ``evecs``
        the matching eigenvectors stored in columns (``evecs[:, i]``
        belongs to ``evals[i]``).

    Raises
    ------
    ValueError : ARPACK failed, or ``eigen_solver`` is not recognized.
    """
    if eigen_solver == 'auto':
        if M.shape[0] > 200 and k + k_skip < 10:
            eigen_solver = 'arpack'
        else:
            eigen_solver = 'dense'

    if eigen_solver == 'arpack':
        random_state = check_random_state(random_state)
        # Initialize in [-1, 1] as original ARPACK does; a [0, 1] start
        # vector is known to give worse null-space results.
        v0 = random_state.uniform(-1, 1, M.shape[0])
        try:
            eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                tol=tol, maxiter=max_iter,
                                                v0=v0)
        except RuntimeError as msg:
            raise ValueError("Error in determining null-space with ARPACK. "
                             "Error message: '%s'. "
                             "Note that method='arpack' can fail when the "
                             "weight matrix is singular or otherwise "
                             "ill-behaved.  method='dense' is recommended. "
                             "See online documentation for more information."
                             % msg)

        return eigen_vectors[:, k_skip:], eigen_values[k_skip:]
    elif eigen_solver == 'dense':
        if hasattr(M, 'toarray'):
            M = M.toarray()
        index_range = (k_skip, k + k_skip - 1)
        try:
            eigen_values, eigen_vectors = eigh(
                M, subset_by_index=index_range)
        except TypeError:
            # Older SciPy releases only know the legacy ``eigvals`` keyword.
            eigen_values, eigen_vectors = eigh(M, eigvals=index_range)
        index = np.argsort(np.abs(eigen_values))

        # Full spectrum, reordered so the largest eigenvalue comes first.
        evals, evecs = eigh(M, overwrite_a=True)
        order = np.argsort(evals)[::-1]
        evals = evals[order]
        evecs = evecs[:, order]
        return eigen_vectors[:, index], evals, evecs
    else:
        raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
def predict_k(affinity_matrix):
    """Predict the number of clusters based on the eigengap.

    Parameters
    ----------
    affinity_matrix : array-like or sparse matrix, shape: (n_samples, n_samples)
        Adjacency matrix.  Each element of this matrix contains a measure
        of similarity between two of the data points.

    Returns
    -------
    k : integer
        Estimated number of clusters.

    Notes
    -----
    If the graph is not fully connected, a zero component counts as a
    single cluster.

    References
    ----------
    A Tutorial on Spectral Clustering, 2007
        Luxburg, Ulrike
        http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf
    """
    # With normed=True, L = D^(-1/2) * (D - A) * D^(-1/2); otherwise
    # L = D - A.  normed=True is recommended.
    normed_laplacian = graph_laplacian(affinity_matrix, normed=True,
                                       return_diag=True)[0]
    laplacian = _set_diag(normed_laplacian, 1)

    # N - 1 eigenvalues are requested; this may lead to slow execution.
    n_components = affinity_matrix.shape[0] - 1

    # Shift-invert mode: it provides more than just a fast way to obtain a
    # few small eigenvalues.
    # http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
    # The normalized Laplacian has eigenvalues between 0 and 2, so I - L
    # has eigenvalues between -1 and 1.
    solved = eigsh(-laplacian, k=n_components, which="LM", sigma=1.0,
                   maxiter=5000)
    eigenvalues = -solved[0][::-1]  # Reverse and sign inversion.

    max_gap = 0
    gap_pre_index = 0
    for position, gap in enumerate(np.diff(eigenvalues)):
        if gap > max_gap:
            max_gap, gap_pre_index = gap, position

    return gap_pre_index + 1
cat_tuple = (cat, matrix[cat].mean()) cat_perc.append(cat_tuple) # sort category percentages cat_perc = sorted(cat_perc, key=lambda x: x[1]) graph = cosine_similarity(matrix) # use cosine similarity, as in Noulas et al. # https://github.com/mingmingyang/auto_spectral_clustering/blob/master/autosp.py # how to calculate spectral clusters norm_laplacian, dd = graph_laplacian(graph, normed=True, return_diag=True) laplacian = _set_diag(norm_laplacian, 1, norm_laplacian=True) n_components = graph.shape[0] - 1 eigenvalues, eigenvectors = eigsh(-laplacian, k=n_components, which="LM", sigma=1.0, maxiter=5000) eigenvalues = -eigenvalues[::-1] max_gap = 0 gap_pre_index = 0 for i in range(1, eigenvalues.size): gap = eigenvalues[i] - eigenvalues[i - 1] if gap > max_gap: max_gap = gap gap_pre_index = i - 1 k = gap_pre_index + 1 print k
def spectral_embedding(laplacian, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=1e-20, drop_first=False):
    """Variation of the sklearn spectral embedding taking a Laplacian.

    Project the samples on the first eigenvectors of the supplied graph
    Laplacian.  Unlike the sklearn function this one derives from, the
    Laplacian is passed in directly rather than computed from an adjacency
    matrix, and the rescaling vector is read from the Laplacian's diagonal
    (``laplacian.diagonal()``) before any ``_set_diag`` call.

    Care must be taken that the matrix is symmetric so that the eigenvector
    decomposition works as expected.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    laplacian : array-like or sparse matrix, shape: (n_samples, n_samples)
        The laplacian matrix of the graph to embed.

    n_components : integer, optional, default 8
        The dimension of the projection subspace.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}, default None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.  None selects 'arpack'.

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigenvectors decomposition.

    eigen_tol : float, optional, default=1e-20
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    drop_first : bool, optional, default=False
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector
        for connected graph, but for spectral clustering, this should be
        kept as False to retain the first eigenvector.

    Returns
    -------
    embedding : array, shape=(n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        Unknown ``eigen_solver``, 'amg' requested without pyamg, or a
        lobpcg-based embedding with a single row.

    Notes
    -----
    Spectral embedding is most useful when the graph has one connected
    component. If the graph has many components, the first few
    eigenvectors will simply uncover the connected components of the graph.

    References
    ----------
    * http://en.wikipedia.org/wiki/LOBPCG

    * Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method
      Andrew V. Knyazev
      http://dx.doi.org/10.1137%2FS1064827500366124
    """
    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        # pyamg is optional; only complain when it was explicitly requested.
        if eigen_solver == "amg":
            raise ValueError("The eigen_solver was set to 'amg', but pyamg is "
                             "not available.")

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = laplacian.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    # Rescaling vector: the diagonal of the Laplacian as passed in.
    dd = laplacian.diagonal()

    if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian) or
             n_nodes < 5 * n_components)):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol)
            # Reverse the order of the eigenvectors and rescale by dd.
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    if eigen_solver == 'amg':
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        laplacian = _set_diag(laplacian, 1)
        ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
        M = ml.aspreconditioner()
        # Random start block whose first column is replaced by dd.
        X = random_state.rand(laplacian.shape[0], n_components + 1)
        X[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    elif eigen_solver == "lobpcg":
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            lambdas, diffusion_map = eigh(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
            if embedding.shape[0] == 1:
                raise ValueError

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
from sklearn.utils.arpack import eigsh

# Exploratory script: compares the leading eigenvalues of the negated graph
# Laplacian obtained through TruncatedSVD with those returned by eigsh, then
# fits a SpectralEmbedding on the same features.

# Fetch the feature matrix from the service.
app = service.prodbox.CinemaService()
X = app.getWeightedSearchFeatures(15)

# 10-nearest-neighbours graph and its Laplacian (second argument True).
graph = kneighbors_graph(X, 10)
lap = graph_laplacian(graph, True)

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=30, algorithm="arpack")
lap = spectral_embedding_._set_diag(lap, 1)
svd.fit(-lap)
# Diagonal of components * (-lap) * components.T.
# NOTE(review): presumably Rayleigh quotients of the SVD components; this
# relies on todense() returning a matrix so ``*`` is a matrix product.
eigenvalues = np.diag(svd.components_ * (-lap).todense() * svd.components_.T)
# Same number of eigenvalues via eigsh in shift-invert mode around 1.
eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
print(eigenvalues)
print(eigenvalues2)

se = SpectralEmbedding(n_components=30, eigen_solver='arpack',
                       affinity="nearest_neighbors")
se.fit(X)
app.quit()

# TODO: check budget distribution, draw budget conditionally
out = connected_components(graph)
def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=0.0,
                       norm_laplacian=True, drop_first=True,
                       mode=None):
    """Project the sample on the first eigen vectors of the graph Laplacian.

    The adjacency matrix is used to compute a (by default normalized) graph
    Laplacian whose spectrum (especially the eigen vectors associated to the
    smallest eigen values) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must be taken to always make the affinity matrix symmetric
    so that the eigen vector decomposition works as expected.

    Parameters
    ----------
    adjacency : array-like or sparse matrix, shape: (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : integer, optional
        The dimension of the projection subspace.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. By default, arpack is used.

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigen vectors decomposition (eigen_solver 'amg' or 'lobpcg').

    eigen_tol : float, optional, default=0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    norm_laplacian : bool, optional, default=True
        Forwarded as ``normed`` to ``graph_laplacian``.

    drop_first : bool, optional, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    mode : deprecated
        Former name of ``eigen_solver``. When given it overrides
        ``eigen_solver`` and a DeprecationWarning is emitted.

    Returns
    -------
    embedding : array, shape=(n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If ``eigen_solver`` is not one of the supported values, if 'amg' is
        requested but pyamg cannot be imported, or if the lobpcg based
        solvers produce an embedding with a single row.

    Notes
    -----
    Spectral embedding is most useful when the graph has one connected
    component. If the graph has many components, the first few eigenvectors
    will simply uncover the connected components of the graph.

    References
    ----------
    * http://en.wikipedia.org/wiki/LOBPCG

    * Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method
      Andrew V. Knyazev
      http://dx.doi.org/10.1137%2FS1064827500366124
    """
    # TODO(MMP): the docstring summary above was flagged "TO CHANGE THIS".

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        # pyamg is optional: only complain when it was explicitly requested.
        if eigen_solver == "amg" or mode == "amg":
            raise ValueError("The eigen_solver was set to 'amg', but pyamg is "
                             "not available.")

    if mode is not None:
        warnings.warn("'mode' was renamed to eigen_solver "
                      "and will be removed in 0.15.",
                      DeprecationWarning)
        eigen_solver = mode

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector: request one extra component so
    # that n_components remain once the first one is discarded.
    if drop_first:
        n_components = n_components + 1

    # Check that the matrix given is symmetric. The difference with the
    # transpose is antisymmetric, so any deviation shows up as a positive
    # entry somewhere and no absolute value is needed.
    if ((not sparse.isspmatrix(adjacency) and
         not np.all((adjacency - adjacency.T) < 1e-10)) or
        (sparse.isspmatrix(adjacency) and
         not np.all((adjacency - adjacency.T).data < 1e-10))):
        warnings.warn("Graph adjacency matrix should be symmetric. "
                      "Converted to be symmetric by average with its "
                      "transpose.")
    adjacency = .5 * (adjacency + adjacency.T)

    if not _graph_is_connected(adjacency):
        warnings.warn("Graph is not fully connected, spectral embedding"
                      " may not work as expected.")

    laplacian, dd = graph_laplacian(adjacency,
                                    normed=norm_laplacian, return_diag=True)
    if (eigen_solver == 'arpack'
        or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian)
             or n_nodes < 5 * n_components)):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            lambdas, diffusion_map = eigsh(-laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol)
            # The start index is past the last row, so this slice simply
            # yields all rows in reversed order.
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"

    if eigen_solver == 'amg':
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs native floats. np.float64 is what the removed
        # ``np.float`` alias resolved to.
        laplacian = laplacian.astype(np.float64)
        laplacian = _set_diag(laplacian, 1)
        ml = smoothed_aggregation_solver(atleast2d_or_csr(laplacian))
        M = ml.aspreconditioner()
        X = random_state.rand(laplacian.shape[0], n_components + 1)
        X[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    elif eigen_solver == "lobpcg":
        # lobpcg needs native floats (one conversion covers both branches).
        laplacian = laplacian.astype(np.float64)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to symeig, so we short circuit it
            if sparse.isspmatrix(laplacian):
                # toarray() gives a plain ndarray instead of np.matrix.
                laplacian = laplacian.toarray()
            lambdas, diffusion_map = symeig(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
            if embedding.shape[0] == 1:
                raise ValueError

    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
# Ad-hoc experiment: compare the spectrum obtained through TruncatedSVD with
# the one returned by ARPACK's shift-invert eigsh on the same (negated)
# graph Laplacian, then fit a SpectralEmbedding on the raw features.
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.arpack import eigsh

app = service.prodbox.CinemaService()
X = app.getWeightedSearchFeatures(15)
graph = kneighbors_graph(X, 10)

# Laplacian of the 10-nearest-neighbours graph, diagonal then reset to 1.
lap = spectral_embedding_._set_diag(graph_laplacian(graph, True), 1)

svd = TruncatedSVD(n_components=30, algorithm="arpack")
svd.fit(-lap)

# Diagonal of V * (-L) * V^T for the fitted components.
# NOTE(review): relies on todense() returning np.matrix so that `*` is a
# matrix product -- confirm.
components = svd.components_
eigenvalues = np.diag(components * (-lap).todense() * components.T)
eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
print(eigenvalues)
print(eigenvalues2)

se = SpectralEmbedding(
    n_components=30,
    eigen_solver='arpack',
    affinity="nearest_neighbors",
)
se.fit(X)
app.quit()

# TODO : check budget distribution, draw budget conditionally
out = connected_components(graph)
def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
               random_state=None):
    """
    Find the null space of a matrix M.

    Parameters
    ----------
    M : {array, matrix, sparse matrix, LinearOperator}
        Input covariance matrix: should be symmetric positive semi-definite

    k : integer
        Number of eigenvalues/vectors to return

    k_skip : integer, optional
        Number of low eigenvalues to skip.

    eigen_solver : string, {'auto', 'arpack', 'dense'}
        auto : algorithm will attempt to choose the best method for input data
        arpack : use arnoldi iteration in shift-invert mode.
                    For this method, M may be a dense matrix, sparse matrix,
                    or general linear operator.
                    Warning: ARPACK can be unstable for some problems.  It is
                    best to try several random seeds in order to check results.
        dense  : use standard dense matrix operations for the eigenvalue
                    decomposition.  For this method, M must be an array
                    or matrix type.  This method should be avoided for
                    large problems.

    tol : float, optional
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : maximum number of iterations for 'arpack' method
        not used if eigen_solver=='dense'

    random_state : numpy.RandomState or int, optional
        The generator or seed used to determine the starting vector for arpack
        iterations.  Defaults to numpy.random.

    Returns
    -------
    eigen_vectors : array, shape (M.shape[0], k)
        Eigenvectors kept after skipping the first ``k_skip`` ones.

    error : float
        Sum of the eigenvalues associated with the returned vectors.

    Raises
    ------
    ValueError
        If ARPACK fails with a RuntimeError, or if ``eigen_solver`` is not
        one of the recognized values.
    """
    if eigen_solver == 'auto':
        # ARPACK only for large matrices with few requested vectors.
        if M.shape[0] > 200 and k + k_skip < 10:
            eigen_solver = 'arpack'
        else:
            eigen_solver = 'dense'

    if eigen_solver == 'arpack':
        random_state = check_random_state(random_state)
        # initialize with [-1,1] as in ARPACK
        v0 = random_state.uniform(-1, 1, M.shape[0])
        try:
            eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                tol=tol, maxiter=max_iter,
                                                v0=v0)
        except RuntimeError as msg:
            raise ValueError("Error in determining null-space with ARPACK. "
                             "Error message: '%s'. "
                             "Note that method='arpack' can fail when the "
                             "weight matrix is singular or otherwise "
                             "ill-behaved.  method='dense' is recommended. "
                             "See online documentation for more information."
                             % msg)

        return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])

    elif eigen_solver == 'dense':
        if hasattr(M, 'toarray'):
            M = M.toarray()
        # Inclusive index range of the eigenpairs to compute.
        index_range = (k_skip, k + k_skip - 1)
        try:
            eigen_values, eigen_vectors = eigh(
                M, subset_by_index=index_range, overwrite_a=True)
        except TypeError:
            # SciPy < 1.5 only knows the legacy ``eigvals`` keyword (which
            # was removed again in SciPy 1.9). The unknown-keyword TypeError
            # is raised before any work is done, so M is still intact.
            eigen_values, eigen_vectors = eigh(
                M, eigvals=index_range, overwrite_a=True)
        index = np.argsort(np.abs(eigen_values))
        return eigen_vectors[:, index], np.sum(eigen_values)

    else:
        raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)