def test_connectivity(seed=36):
    """Check that the graph connectivity test works as expected.

    Each adjacency matrix is checked as a dense array and as CSR/CSC
    sparse matrices; all three representations must agree.
    """
    cases = [
        # Node 0 is isolated from the rest of the graph -> disconnected.
        (np.array([[1, 0, 0, 0, 0],
                   [0, 1, 1, 0, 0],
                   [0, 1, 1, 1, 0],
                   [0, 0, 1, 1, 1],
                   [0, 0, 0, 1, 1]]), False),
        # Edge (0, 1) chains every node together -> connected.
        (np.array([[1, 1, 0, 0, 0],
                   [1, 1, 1, 0, 0],
                   [0, 1, 1, 1, 0],
                   [0, 0, 1, 1, 1],
                   [0, 0, 0, 1, 1]]), True),
    ]
    for graph, expected in cases:
        assert_equal(_graph_is_connected(graph), expected)
        assert_equal(_graph_is_connected(csr_matrix(graph)), expected)
        assert_equal(_graph_is_connected(csc_matrix(graph)), expected)
def compute_markov_matrix(self, skip_checks=False, overwrite=False):
    """Build the anisotropic Markov transition matrix of the current
    connectivity matrix.

    Slightly modified code originally written by Satrajit Ghosh
    ([email protected] github.com/satra/mapalign)

    Parameters
    ----------
    skip_checks : bool
        If True, skip the (expensive) graph-connectivity check.
    overwrite : bool
        If True, reuse ``self.current_cmat_`` as scratch space instead of
        working on a copy.

    Returns
    -------
    L_alpha : ndarray or sparse matrix, shape (n, n)
        Row-stochastic matrix P = D^{-1} W_alpha where
        W_alpha = D^{-alpha} W D^{-alpha}, with alpha read from
        ``self.diff_alpha``.

    Raises
    ------
    ValueError
        If the graph is disconnected (only checked when ``skip_checks``
        is False).
    """
    import numpy as np
    import scipy.sparse as sps

    L = self.current_cmat_
    alpha = self.diff_alpha
    use_sparse = sps.issparse(L)

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    L_alpha = L if overwrite else L.copy()

    if alpha > 0:
        # Step 2: symmetric anisotropic normalization W_alpha = D^-a W D^-a.
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            # Scale columns via CSR data/indices, then rows, by
            # round-tripping through a transpose (assumes CSR layout).
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3: row-normalize to obtain the Markov (random-walk) matrix.
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    return L_alpha
def compute_markov_matrix(L, alpha=0.5, diffusion_time=0, skip_checks=False,
                          overwrite=False):
    """Computes a markov transition matrix from the affinity matrix.

    Code written by Satra Ghosh (https://github.com/satra/mapalign)

    Parameters
    ----------
    L : ndarray or sparse matrix, shape (n, n)
        Symmetric, non-negative affinity matrix.
    alpha : float in [0, 1]
        Anisotropic diffusion parameter; 0 skips the symmetric
        normalization step.
    diffusion_time : float
        Unused here; kept for signature compatibility with the diffusion
        map routines.
    skip_checks : bool
        If True, skip the (expensive) graph-connectivity check.
    overwrite : bool
        If True, reuse ``L`` as scratch space instead of copying it.

    Returns
    -------
    L_alpha : ndarray or sparse matrix, shape (n, n)
        Row-stochastic matrix P = D^{-1} W_alpha with
        W_alpha = D^{-alpha} W D^{-alpha}.

    Raises
    ------
    ValueError
        If the graph is disconnected (only when ``skip_checks`` is False).
    """
    use_sparse = sps.issparse(L)

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    L_alpha = L if overwrite else L.copy()

    if alpha > 0:
        # Step 2: symmetric anisotropic normalization W_alpha = D^-a W D^-a.
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            # Scale columns via CSR data/indices, then rows, by
            # round-tripping through a transpose (assumes CSR layout).
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3: row-normalize to obtain the Markov (random-walk) matrix.
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    return L_alpha
def test_connectivity(seed=36):
    """_graph_is_connected must agree across dense, CSR and CSC inputs."""
    # Node 0 has no edge to the rest of the graph -> disconnected.
    disconnected = np.array([[1, 0, 0, 0, 0],
                             [0, 1, 1, 0, 0],
                             [0, 1, 1, 1, 0],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1]])
    for rep in (disconnected,
                sparse.csr_matrix(disconnected),
                sparse.csc_matrix(disconnected)):
        assert not _graph_is_connected(rep)

    # Adding the (0, 1) edge chains all nodes together -> connected.
    connected = np.array([[1, 1, 0, 0, 0],
                          [1, 1, 1, 0, 0],
                          [0, 1, 1, 1, 0],
                          [0, 0, 1, 1, 1],
                          [0, 0, 0, 1, 1]])
    for rep in (connected,
                sparse.csr_matrix(connected),
                sparse.csc_matrix(connected)):
        assert _graph_is_connected(rep)
def test_connectivity(seed=36):
    """Test that graph connectivity detection works as expected."""
    def check(graph, expected):
        # Every representation (dense, CSR, CSC) must give the same answer.
        for mat in (graph, csr_matrix(graph), csc_matrix(graph)):
            assert_equal(_graph_is_connected(mat), expected)

    # Node 0 is isolated -> disconnected.
    check(np.array([[1, 0, 0, 0, 0],
                    [0, 1, 1, 0, 0],
                    [0, 1, 1, 1, 0],
                    [0, 0, 1, 1, 1],
                    [0, 0, 0, 1, 1]]), False)

    # Chain graph linking every node -> connected.
    check(np.array([[1, 1, 0, 0, 0],
                    [1, 1, 1, 0, 0],
                    [0, 1, 1, 1, 0],
                    [0, 0, 1, 1, 1],
                    [0, 0, 0, 1, 1]]), True)
def compute_markov_matrix(L, alpha=0.5, diffusion_time=0, skip_checks=False,
                          overwrite=False):
    """Build the anisotropic Markov (random-walk) matrix from an affinity
    matrix.

    Parameters
    ----------
    L : ndarray or sparse matrix, shape (n, n)
        Symmetric, non-negative affinity matrix.
    alpha : float in [0, 1]
        Anisotropic diffusion parameter; 0 skips the symmetric
        normalization step.
    diffusion_time : float
        Unused here; kept for signature compatibility with the diffusion
        map routines.
    skip_checks : bool
        If True, skip the (expensive) graph-connectivity check.
    overwrite : bool
        If True, reuse ``L`` as scratch space instead of copying it.

    Returns
    -------
    L_alpha : ndarray or sparse matrix, shape (n, n)
        Row-stochastic matrix P = D^{-1} W_alpha with
        W_alpha = D^{-alpha} W D^{-alpha}.

    Raises
    ------
    ValueError
        If the graph is disconnected (only when ``skip_checks`` is False).
    """
    import numpy as np
    import scipy.sparse as sps

    use_sparse = sps.issparse(L)

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    L_alpha = L if overwrite else L.copy()

    if alpha > 0:
        # Step 2: symmetric anisotropic normalization W_alpha = D^-a W D^-a.
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            # Scale columns via CSR data/indices, then rows, by
            # round-tripping through a transpose (assumes CSR layout).
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3: row-normalize to obtain the Markov matrix.
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    return L_alpha
def diffusemapReduction(dist, epi=0.5, mode='manual', n_neighbors=10, n_components=10):
    """Reduce a distance matrix to a diffusion-map embedding.

    Thresholds ``dist`` (automatically when ``mode == 'auto'``, otherwise
    with the supplied ``epi``), shows the thresholded matrix in figure 10,
    then embeds it with a diffusion map using alpha=0.

    Returns the embedding and the result dict, with the ``epi`` actually
    used stored under key ``'epi'``. Raises ValueError if the thresholded
    graph is disconnected. ``n_neighbors`` is currently unused.
    """
    if mode == 'auto':
        thresholded, epi = thresholdDistanceMatrix(dist)
    else:
        thresholded, epi = thresholdDistanceMatrix(dist, epi=epi, mode='manual')

    # Visualize the thresholded affinity structure.
    plt.figure(10)
    plt.imshow(thresholded)
    plt.colorbar()

    if not _graph_is_connected(thresholded):
        raise ValueError('Graph is disconnected')

    embedding, result = compute_diffusion_map(thresholded, alpha=0.,
                                              n_components=n_components)
    result['epi'] = epi
    return embedding, result
def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
                          skip_checks=False, overwrite=False):
    """Compute the diffusion maps of a symmetric similarity matrix

    L : matrix N x N
       L is symmetric and L(x, y) >= 0

    alpha: float [0, 1]
        Setting alpha=1 and the diffusion operator approximates the
        Laplace-Beltrami operator. We then recover the Riemannian geometry
        of the data set regardless of the distribution of the points. To
        describe the long-term behavior of the point distribution of a
        system of stochastic differential equations, we can use alpha=0.5
        and the resulting Markov chain approximates the Fokker-Planck
        diffusion. With alpha=0, it reduces to the classical graph Laplacian
        normalization.

    n_components: int
        The number of diffusion map components to return. Due to the
        spectrum decay of the eigenvalues, only a few terms are necessary to
        achieve a given relative accuracy in the sum M^t.

    diffusion_time: float >= 0
        use the diffusion_time (t) step transition matrix M^t
        t not only serves as a time parameter, but also has the dual role of
        scale parameter. One of the main ideas of diffusion framework is
        that running the chain forward in time (taking larger and larger
        powers of M) reveals the geometric structure of X at larger and
        larger scales (the diffusion process).
        t = 0 empirically provides a reasonable balance from a clustering
        perspective. Specifically, the notion of a cluster in the data set
        is quantified as a region in which the probability of escaping this
        region is low (within a certain time t).

    skip_checks: bool
        Avoid expensive pre-checks on input data. The caller has to make
        sure that input data is valid or results will be undefined.

    overwrite: bool
        Optimize memory usage by re-using input matrix L as scratch space.

    References
    ----------
    [1] https://en.wikipedia.org/wiki/Diffusion_map
    [2] Coifman, R.R.; S. Lafon. (2006). "Diffusion maps". Applied and
    Computational Harmonic Analysis 21: 5-30. doi:10.1016/j.acha.2006.04.006
    """
    import numpy as np
    import scipy.sparse as sps

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    ndim = L.shape[0]
    if overwrite:
        L_alpha = L
    else:
        L_alpha = L.copy()

    if alpha > 0:
        # Step 2: anisotropic normalization W_alpha = D^-alpha W D^-alpha.
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            # Scale columns via the CSR data/indices arrays, then the rows,
            # by round-tripping through a transpose (assumes CSR layout).
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3: row-normalize to the Markov (random-walk) transition matrix.
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    M = L_alpha

    from scipy.sparse.linalg import eigsh, eigs

    # Step 4: leading eigendecomposition of M. eigs is used (M is not
    # symmetric after row normalization); the eigsh branch below is dead
    # but kept from the original code.
    func = eigs
    if n_components is not None:
        lambdas, vectors = func(M, k=n_components + 1)
    else:
        # No component count requested: take sqrt(N) candidates and
        # auto-select below.
        lambdas, vectors = func(M, k=max(2, int(np.sqrt(ndim))))
    del M

    if func == eigsh:
        # eigsh returns ascending eigenvalues; flip to descending.
        lambdas = lambdas[::-1]
        vectors = vectors[:, ::-1]
    else:
        # eigs returns complex values in arbitrary order; keep the real
        # parts and sort descending.
        lambdas = np.real(lambdas)
        vectors = np.real(vectors)
        lambda_idx = np.argsort(lambdas)[::-1]
        lambdas = lambdas[lambda_idx]
        vectors = vectors[:, lambda_idx]

    # Step 5: normalize by the stationary (first) eigenvector, then weight
    # the remaining eigenvectors by the (possibly multi-scale) eigenvalues.
    psi = vectors / vectors[:, [0]]
    diffusion_times = diffusion_time
    if diffusion_time == 0:
        # Multi-scale diffusion map: report the effective per-component
        # diffusion times and use lambda / (1 - lambda) weights.
        diffusion_times = np.exp(1. - np.log(1 - lambdas[1:]) / np.log(lambdas[1:]))
        lambdas = lambdas[1:] / (1 - lambdas[1:])
    else:
        lambdas = lambdas[1:] ** float(diffusion_time)

    # Auto-select the number of components from the eigenvalue drop-off
    # (keep components whose ratio to the largest exceeds the threshold).
    lambda_ratio = lambdas / lambdas[0]
    threshold = max(0.05, lambda_ratio[-1])
    n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0])
    n_components_auto = min(n_components_auto, ndim)
    if n_components is None:
        n_components = n_components_auto
    embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :]

    result = dict(lambdas=lambdas, vectors=vectors,
                  n_components=n_components, diffusion_time=diffusion_times,
                  n_components_auto=n_components_auto)
    return embedding, result
def diffusion_mapping(adj, n_components=10, alpha=0.5, diffusion_time=0,
                      random_state=None):
    """Compute diffusion map of affinity matrix.

    Parameters
    ----------
    adj : ndarray or sparse matrix, shape = (n, n)
        Affinity matrix.
    n_components : int or None, optional
        Number of eigenvectors. If None, selection of `n_components` is based
        on 95% drop-off in eigenvalues. When `n_components` is None, the
        maximum number of eigenvectors is restricted to
        ``n_components <= sqrt(n)``. Default is 10.
    alpha : float, optional
        Anisotropic diffusion parameter, ``0 <= alpha <= 1``. Default is 0.5.
    diffusion_time : int, optional
        Diffusion time or scale. If ``diffusion_time == 0`` use multi-scale
        diffusion maps. Default is 0.
    random_state : int or None, optional
        Random state. Default is None.

    Returns
    -------
    v : ndarray, shape (n, n_components)
        Eigenvectors of the affinity matrix in same order.
    w : ndarray, shape (n_components,)
        Eigenvalues of the affinity matrix in descending order.

    References
    ----------
    * Coifman, R.R.; S. Lafon. (2006). "Diffusion maps".
      Applied and Computational Harmonic Analysis 21: 5-30.
      doi:10.1016/j.acha.2006.04.006
    * Joseph W.R., Peter E.F., Ann B.L., Chad M.S. Accurate parameter
      estimation for star formation history in galaxies using SDSS spectra.
    """
    rs = check_random_state(random_state)
    use_sparse = ssp.issparse(adj)

    # Make symmetric
    if not is_symmetric(adj, tol=1E-10):
        warnings.warn('Affinity is not symmetric. Making symmetric.')
        adj = make_symmetric(adj, check=False, copy=True, sparse_format='coo')
    else:  # Copy anyways because we will be working on the matrix
        adj = adj.tocoo(copy=True) if use_sparse else adj.copy()

    # Check connected
    if not _graph_is_connected(adj):
        warnings.warn('Graph is not fully connected.')

    ###########################################################
    # Step 2
    ###########################################################
    # When alpha=0, you get back the diffusion map based on the random
    # walk-style diffusion operator (and Laplacian Eigenmaps). For alpha=1,
    # the diffusion operator approximates the Laplace-Beltrami operator and
    # for alpha=0.5, you get Fokker-Planck diffusion. The anisotropic
    # diffusion parameter: alpha in [0, 1]
    # W(alpha) = D^{-alpha} W D^{-alpha}
    if alpha > 0:
        if use_sparse:
            # COO layout: row/col index arrays let us scale rows and
            # columns directly on the data array.
            d = np.power(adj.sum(axis=1).A1, -alpha)
            adj.data *= d[adj.row]
            adj.data *= d[adj.col]
        else:
            d = adj.sum(axis=1, keepdims=True)
            d = np.power(d, -alpha)
            adj *= d.T
            adj *= d

    ###########################################################
    # Step 3
    ###########################################################
    # Diffusion operator (row-stochastic Markov matrix)
    # P(alpha) = D(alpha)^{-1} W(alpha)
    if use_sparse:
        d_alpha = np.power(adj.sum(axis=1).A1, -1)
        adj.data *= d_alpha[adj.row]
    else:
        adj *= np.power(adj.sum(axis=1, keepdims=True), -1)

    ###########################################################
    # Step 4
    ###########################################################
    if n_components is None:
        # Auto-selection: consider at most sqrt(n) candidate components.
        n_components = max(2, int(np.sqrt(adj.shape[0])))
        auto_n_comp = True
    else:
        auto_n_comp = False

    # For repeatability of results
    v0 = rs.uniform(-1, 1, adj.shape[0])

    # Find largest eigenvalues and eigenvectors
    w, v = eigsh(adj, k=n_components + 1, which='LM', tol=0, v0=v0)
    # Sort descending
    w, v = w[::-1], v[:, ::-1]

    ###########################################################
    # Step 5
    ###########################################################
    # Force first eigenvector to be all ones.
    v /= v[:, [0]]

    # Largest eigenvalue should be equal to one too
    w /= w[0]

    # Discard first (largest) eigenvalue and eigenvector
    w, v = w[1:], v[:, 1:]

    if diffusion_time <= 0:
        # use multi-scale diffusion map, ref [4]
        # considers all scales: t=1,2,3,...
        w /= (1 - w)
    else:
        # Raise eigenvalues to the power of diffusion time
        w **= diffusion_time

    if auto_n_comp:
        # Choose n_comp to coincide with a 95 % drop-off
        # in the eigenvalue multipliers, ref [4]
        lambda_ratio = w / w[0]

        # If all eigenvalues larger than 0.05, select all
        # (i.e., sqrt(adj.shape[0]))
        threshold = max(0.05, lambda_ratio[-1])
        # Index of the first ratio at/below the threshold.
        n_components = np.argmin(lambda_ratio > threshold)

        w = w[:n_components]
        v = v[:, :n_components]

    # Rescale eigenvectors with eigenvalues
    v *= w[None, :]

    # Consistent sign (s.t. largest value of element eigenvector is pos)
    v *= np.sign(v[np.abs(v).argmax(axis=0), range(v.shape[1])])

    return v, w
def laplacian_eigenmaps(adj, n_components=10, norm_laplacian=True,
                        random_state=None):
    """Compute embedding using Laplacian eigenmaps.

    Adapted from Scikit-learn to also provide eigenvalues.

    Parameters
    ----------
    adj : 2D ndarray or sparse matrix
        Affinity matrix.
    n_components : int, optional
        Number of eigenvectors. Default is 10.
    norm_laplacian : bool, optional
        If True use normalized Laplacian. Default is True.
    random_state : int or None, optional
        Random state. Default is None.

    Returns
    -------
    v : 2D ndarray, shape (n, n_components)
        Eigenvectors of the affinity matrix in same order. Where `n` is
        the number of rows of the affinity matrix.
    w : 1D ndarray, shape (n_components,)
        Eigenvalues of the affinity matrix in ascending order.

    References
    ----------
    * Belkin, M. and Niyogi, P. (2003). Laplacian Eigenmaps for
      dimensionality reduction and data representation.
      Neural Computation 15(6): 1373-96. doi:10.1162/089976603321780317
    """
    rs = check_random_state(random_state)

    # Make symmetric
    if not is_symmetric(adj, tol=1E-10):
        warnings.warn('Affinity is not symmetric. Making symmetric.')
        adj = make_symmetric(adj, check=False)

    # Check connected
    if not _graph_is_connected(adj):
        warnings.warn('Graph is not fully connected.')

    lap, dd = laplacian(adj, normed=norm_laplacian, return_diag=True)
    if norm_laplacian:
        # Set the diagonal of the normalized Laplacian to 1 exactly.
        if ssp.issparse(lap):
            lap.setdiag(1)
        else:
            np.fill_diagonal(lap, 1)

    # Negate so the smallest Laplacian eigenvalues become the largest of
    # -L, which suits eigsh's shift-invert mode below.
    lap *= -1
    # Deterministic starting vector for repeatable ARPACK results.
    v0 = rs.uniform(-1, 1, lap.shape[0])
    w, v = eigsh(lap, k=n_components + 1, sigma=1, which='LM', tol=0, v0=v0)

    # Sort descending and change sign of eigenvalues
    w, v = -w[::-1], v[:, ::-1]

    if norm_laplacian:
        # Recover u = D^-1/2 x from the normalized-Laplacian eigenvectors.
        v /= dd[:, None]

    # Drop smallest (trivial) eigenpair
    w, v = w[1:], v[:, 1:]

    # Consistent sign (s.t. largest value of element eigenvector is pos)
    v *= np.sign(v[np.abs(v).argmax(axis=0), range(v.shape[1])])

    return v, w
# Diffusion-map embedding of the pairwise-distance matrix dist_Y.
thresholdDistanceMatrix(dist_Y)

# using diffusion mapping to do this: turn distances into a Gaussian
# affinity with kernel scale epi.
epi = 100
dist_thresh = np.exp(-dist_Y**2 / epi / 2.0)

# Row-normalized Markov matrix, kept for the commented-out eigen analysis
# below. Work on a copy: the original code aliased M = dist_thresh, so the
# in-place row normalization silently destroyed the symmetric affinity
# matrix before it was plotted and embedded.
M = dist_thresh.copy()
for i in range(M.shape[0]):
    M[i] = M[i] / np.sum(M[i])

from sklearn.manifold.spectral_embedding_ import _graph_is_connected

plt.figure(10)
plt.imshow(dist_thresh)
plt.colorbar()

if not _graph_is_connected(dist_thresh):
    raise ValueError('Graph is disconnected')

embedding, result = compute_diffusion_map(dist_thresh, n_components=5)

# w, v = np.linalg.eig(M)
# U, S, V = np.linalg.svd(M)
# plt.figure(1)
# plt.plot(np.real(v[:, 1]))
# plt.figure(2)
# plt.plot(np.real(v[:, 2]))
def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
                          skip_checks=False, overwrite=False):
    """Compute the diffusion maps of a symmetric similarity matrix

    L : matrix N x N
       L is symmetric and L(x, y) >= 0

    alpha: float [0, 1]
        Setting alpha=1 and the diffusion operator approximates the
        Laplace-Beltrami operator. We then recover the Riemannian geometry
        of the data set regardless of the distribution of the points. To
        describe the long-term behavior of the point distribution of a
        system of stochastic differential equations, we can use alpha=0.5
        and the resulting Markov chain approximates the Fokker-Planck
        diffusion. With alpha=0, it reduces to the classical graph Laplacian
        normalization.

    n_components: int
        The number of diffusion map components to return. Due to the
        spectrum decay of the eigenvalues, only a few terms are necessary to
        achieve a given relative accuracy in the sum M^t.

    diffusion_time: float >= 0
        use the diffusion_time (t) step transition matrix M^t
        t not only serves as a time parameter, but also has the dual role of
        scale parameter. One of the main ideas of diffusion framework is
        that running the chain forward in time (taking larger and larger
        powers of M) reveals the geometric structure of X at larger and
        larger scales (the diffusion process).
        t = 0 empirically provides a reasonable balance from a clustering
        perspective. Specifically, the notion of a cluster in the data set
        is quantified as a region in which the probability of escaping this
        region is low (within a certain time t).

    skip_checks: bool
        Avoid expensive pre-checks on input data. The caller has to make
        sure that input data is valid or results will be undefined.

    overwrite: bool
        Optimize memory usage by re-using input matrix L as scratch space.

    References
    ----------
    [1] https://en.wikipedia.org/wiki/Diffusion_map
    [2] Coifman, R.R.; S. Lafon. (2006). "Diffusion maps". Applied and
    Computational Harmonic Analysis 21: 5-30. doi:10.1016/j.acha.2006.04.006
    """
    import numpy as np
    import scipy.sparse as sps

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError("Graph is disconnected")

    ndim = L.shape[0]
    if overwrite:
        L_alpha = L
    else:
        L_alpha = L.copy()

    if alpha > 0:
        # Step 2: anisotropic normalization W_alpha = D^-alpha W D^-alpha.
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            # Scale columns via the CSR data/indices arrays, then the rows,
            # by round-tripping through a transpose (assumes CSR layout).
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, np.newaxis] * L_alpha
            L_alpha = L_alpha * d_alpha[np.newaxis, :]

    # Step 3: row-normalize to the Markov (random-walk) transition matrix.
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, np.newaxis] * L_alpha

    M = L_alpha

    from scipy.sparse.linalg import eigsh, eigs

    # Step 4: leading eigendecomposition of M. eigs is used (M is not
    # symmetric after row normalization); the eigsh branch below is dead
    # but kept from the original code.
    func = eigs
    if n_components is not None:
        lambdas, vectors = func(M, k=n_components + 1)
    else:
        # No component count requested: take sqrt(N) candidates and
        # auto-select below.
        lambdas, vectors = func(M, k=max(2, int(np.sqrt(ndim))))
    del M

    if func == eigsh:
        # eigsh returns ascending eigenvalues; flip to descending.
        lambdas = lambdas[::-1]
        vectors = vectors[:, ::-1]
    else:
        # eigs returns complex values in arbitrary order; keep the real
        # parts and sort descending.
        lambdas = np.real(lambdas)
        vectors = np.real(vectors)
        lambda_idx = np.argsort(lambdas)[::-1]
        lambdas = lambdas[lambda_idx]
        vectors = vectors[:, lambda_idx]

    # Step 5: normalize by the stationary (first) eigenvector, then weight
    # the remaining eigenvectors by the eigenvalue multipliers.
    psi = vectors / vectors[:, [0]]
    if diffusion_time == 0:
        # Multi-scale weighting lambda / (1 - lambda).
        lambdas = lambdas[1:] / (1 - lambdas[1:])
    else:
        lambdas = lambdas[1:] ** float(diffusion_time)

    # Auto-select the number of components from the eigenvalue drop-off
    # (keep components whose ratio to the largest exceeds the threshold).
    lambda_ratio = lambdas / lambdas[0]
    threshold = max(0.05, lambda_ratio[-1])
    n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0])
    n_components_auto = min(n_components_auto, ndim)
    if n_components is None:
        n_components = n_components_auto
    embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :]
    result = dict(
        lambdas=lambdas,
        vectors=vectors,
        n_components=n_components,
        diffusion_time=diffusion_time,
        n_components_auto=n_components_auto,
    )
    return embedding, result
def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
                          verbose=False, skip_checks=False):
    """Compute the diffusion-map embedding of a symmetric affinity matrix.

    Adapted from https://github.com/satra/mapalign/blob/master/mapalign/embed.py

    Parameters
    ----------
    L : ndarray or sparse matrix, shape (N, N)
        Symmetric, non-negative affinity matrix.
    alpha : float in [0, 1]
        Anisotropic diffusion parameter (0 = graph Laplacian,
        0.5 = Fokker-Planck, 1 = Laplace-Beltrami).
    n_components : int or None
        Number of embedding components; if None, chosen automatically from
        the eigenvalue drop-off.
    diffusion_time : float >= 0
        Diffusion time t; t == 0 uses the multi-scale
        lambda / (1 - lambda) weighting.
    verbose : bool
        Print progress messages.
    skip_checks : bool
        If True, skip the graph-connectivity pre-check. Added (with a
        backward-compatible default) for consistency with the other
        variants of this function.

    Returns
    -------
    embedding : ndarray, shape (N, n_components)
    result : dict
        Keys: lambdas, vectors, n_components, diffusion_time,
        n_components_auto.
    """
    import numpy as np
    import scipy.sparse as sps

    use_sparse = False
    if sps.issparse(L):
        use_sparse = True

    if not skip_checks:
        from sklearn.manifold.spectral_embedding_ import _graph_is_connected
        if not _graph_is_connected(L):
            raise ValueError('Graph is disconnected')

    if verbose:
        print('checked conditions')

    ndim = L.shape[0]
    L_alpha = L.copy()

    if alpha > 0:
        if verbose:
            print('step2')
        # Step 2: anisotropic normalization W_alpha = D^-alpha W D^-alpha.
        d = np.array(L_alpha.sum(axis=1)).flatten()
        d_alpha = np.power(d, -alpha)
        if use_sparse:
            # Scale columns via CSR data/indices, then rows, by
            # round-tripping through a transpose (assumes CSR layout).
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
            L_alpha.data *= d_alpha[L_alpha.indices]
            L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
        else:
            L_alpha = d_alpha[:, None] * L_alpha * d_alpha[None, :]

    # Step 3: row-normalize to the Markov transition matrix.
    if verbose:
        print('step 3')
    d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
    if use_sparse:
        L_alpha.data *= d_alpha[L_alpha.indices]
    else:
        L_alpha = d_alpha[:, None] * L_alpha

    M = L_alpha

    # sklearn.utils.arpack (used by the original) was removed from
    # scikit-learn; use the scipy equivalents directly, matching the other
    # copies of this function in this file.
    from scipy.sparse.linalg import eigsh, eigs

    # Step 4: leading eigendecomposition of M.
    if verbose:
        print('step 4')
    func = eigs
    if n_components is not None:
        lambdas, vectors = func(M, k=n_components + 1)
    else:
        lambdas, vectors = func(M, k=max(2, int(np.sqrt(ndim))))
    del M

    if func == eigsh:
        lambdas = lambdas[::-1]
        vectors = vectors[:, ::-1]
    else:
        # eigs returns complex values in arbitrary order; keep the real
        # parts and sort descending.
        lambdas = np.real(lambdas)
        vectors = np.real(vectors)
        lambda_idx = np.argsort(lambdas)[::-1]
        lambdas = lambdas[lambda_idx]
        vectors = vectors[:, lambda_idx]

    # Step 5: normalize by the stationary eigenvector and weight the
    # remaining components by the eigenvalue multipliers.
    if verbose:
        print('step 5')
    psi = vectors / vectors[:, [0]]
    if diffusion_time == 0:
        lambdas = lambdas[1:] / (1 - lambdas[1:])
    else:
        lambdas = lambdas[1:] ** float(diffusion_time)

    lambda_ratio = lambdas / lambdas[0]
    threshold = max(0.05, lambda_ratio[-1])
    n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0])
    n_components_auto = min(n_components_auto, ndim)
    if n_components is None:
        n_components = n_components_auto
    embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :]
    result = dict(lambdas=lambdas, vectors=vectors,
                  n_components=n_components, diffusion_time=diffusion_time,
                  n_components_auto=n_components_auto)
    return embedding, result
def spectral_embedding(self, adjacency, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=0.0, drop_first=True):
    """Spectral embedding of a symmetric affinity matrix.

    see original at
    https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/spectral_embedding_.py#L133
    custermize1: return lambdas with the embedded matrix.
    custermize2: norm_laplacian is always True

    Parameters
    ----------
    adjacency : array-like or sparse matrix, shape (n, n)
        Symmetric affinity matrix.
    n_components : int
        Number of embedding dimensions returned.
    eigen_solver : {None, 'arpack', 'lobpcg', 'amg'}
        Eigenvalue solver; None falls back to 'arpack'.
    random_state : int, RandomState or None
        Seed for the lobpcg/amg starting vectors.
    eigen_tol : float
        Convergence tolerance passed to ARPACK.
    drop_first : bool
        Drop the first (constant) eigenvector of the Laplacian.

    Returns
    -------
    (embedding, lambdas) : (ndarray of shape (n, n_components), ndarray)
        Embedding coordinates and the corresponding eigenvalues.
    """
    norm_laplacian = True
    adjacency = check_symmetric(adjacency)

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        if eigen_solver == "amg":
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is "
                "not available.")
    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn("Graph is not fully connected, spectral embedding"
                      " may not work as expected.")

    laplacian, dd = graph_laplacian(adjacency, normed=norm_laplacian,
                                    return_diag=True)
    if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)):
        # lobpcg used with eigen_solver='amg' has bugs for low number of
        # nodes for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen/lobpcg/lobpcg.py#L237
        # or matlab:
        # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1. ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0. This leads to slow convergence. So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0. This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol)
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    if eigen_solver == 'amg':
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        laplacian = _set_diag(laplacian, 1, norm_laplacian)
        ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
        M = ml.aspreconditioner()
        X = random_state.rand(laplacian.shape[0], n_components + 1)
        # Seed the first column with the degree vector (constant mode).
        X[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    elif eigen_solver == "lobpcg":
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            lambdas, diffusion_map = eigh(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
        if embedding.shape[0] == 1:
            raise ValueError

    # Make eigenvector signs deterministic across runs.
    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T, lambdas
    else:
        return embedding[:n_components].T, lambdas