def test_graph_laplacian():
    """Check graph_laplacian on several dense matrices and their CSR twins.

    For every input and both ``normed`` settings the Laplacian must be
    symmetric and the dense and sparse code paths must agree.  The
    unnormalized Laplacian must additionally have zero column sums.
    """
    vander = np.vander(np.arange(4))
    dense_inputs = (
        np.arange(10) * np.arange(10)[:, np.newaxis],
        np.ones((7, 7)),
        np.eye(19),
        vander + vander.T,
    )
    check_close = np.testing.assert_array_almost_equal

    for dense in dense_inputs:
        as_csr = sparse.csr_matrix(dense)
        n_nodes = dense.shape[0]
        for normed in (True, False):
            dense_lap = graph_laplacian(dense, normed=normed)
            if not normed:
                # Only the unnormalized Laplacian is checked for zero sums.
                check_close(dense_lap.sum(axis=0), np.zeros(n_nodes))
            check_close(dense_lap.T, dense_lap)
            sparse_lap = graph_laplacian(as_csr, normed=normed)
            check_close(dense_lap, sparse_lap.toarray())
def my_uniteigenvector_zeroeigenvalue_cluster(k):
    """Spectral-cluster the pickled test graph into ``k`` groups.

    The graph is read from ``data/undirected(fortest).gpickle`` and its last
    node is left out.  ``k`` eigenvectors of the negated normalized Laplacian
    are computed with ``eigsh`` (``sigma=1.0``, ``which='LM'``), the last
    eigenvector column is replaced by the constant unit vector, every row is
    rescaled to unit length and the rows are clustered with k-means.

    Parameters
    ----------
    k : int
        Number of eigenvectors requested and number of clusters.

    Returns
    -------
    labels
        The label array returned by ``k_means``.
    """
    graph = nx.read_gpickle('data/undirected(fortest).gpickle')
    kept_nodes = graph.nodes()[:-1]
    n_kept = len(kept_nodes)

    adjacency = nx.adjacency_matrix(graph, nodelist=kept_nodes,
                                    weight='weight')
    norm_lap, _ = graph_laplacian(adjacency, normed=True, return_diag=True)
    _, vectors = eigsh(norm_lap * (-1), k=k, sigma=1.0, which='LM', tol=0.0)

    # The original author labels this column "zero eigenvalue": it is forced
    # to the constant vector with unit Euclidean norm.
    vectors[:, -1] = np.full(n_kept, 1.0 / math.sqrt(n_kept))

    # Normalize every row (one row per node) to a unit vector.
    for row_idx, row in enumerate(vectors):
        vectors[row_idx] = row / float(np.linalg.norm(row))

    _, labels, _ = k_means(vectors, k, random_state=None, n_init=100)
    return labels
def fit(self, X, y, unlabeled_data=None):
    """Fit the classifier on labeled samples plus unlabeled samples.

    Labeled and unlabeled samples are stacked into ``self.X_``, a kernel
    matrix is computed on the stacked data, its rows belonging to unlabeled
    samples are zeroed, and (when ``self.nu2`` is non-zero) ``nu2`` times the
    product of the graph Laplacian of an RBF kernel with the kernel matrix is
    added.  The result is handed to the parent class' ``fit`` together with
    the labels, where unlabeled samples carry the label ``-1``.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_labeled, n_features)
        Labeled samples.
    y : array-like, shape (n_labeled,)
        Labels of ``X``.
    unlabeled_data : array-like or sparse matrix, shape (n_unlabeled, n_features)
        Unlabeled samples.  Despite the ``None`` default it is required.

    Raises
    ------
    ValueError
        If ``unlabeled_data`` is None.

    Notes
    -----
    ``self.gamma`` is overwritten with ``1 / n_features`` when it is None,
    and ``self.kernel_params`` is (re)built on every call.
    """
    if unlabeled_data is None:
        # Fail early with an explicit message instead of an AttributeError
        # on ``None.shape`` a few lines further down.
        raise ValueError("unlabeled_data is required: pass the unlabeled "
                         "samples to fit().")

    num_labeled = X.shape[0]
    num_unlabeled = unlabeled_data.shape[0]
    num_data = num_labeled + num_unlabeled

    # Indicator vector: 1 for labeled rows, 0 for unlabeled rows.
    labeled = np.zeros((num_data,), dtype=np.float32)
    labeled[:num_labeled] = 1.0

    if issparse(X):
        self.X_ = vstack((util.cast_to_float32(X),
                          util.cast_to_float32(unlabeled_data)),
                         format='csr')
    else:
        self.X_ = np.concatenate((util.cast_to_float32(X),
                                  util.cast_to_float32(unlabeled_data)))

    self.gamma = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
    self.kernel_params = {'gamma': self.gamma,
                          'degree': self.degree,
                          'coef0': self.coef0}

    kernel_matrix = pairwise_kernels(self.X_, metric=self.kernel,
                                     filter_params=True,
                                     **self.kernel_params)

    # Same result as np.dot(np.diag(labeled), kernel_matrix) but without
    # building an n x n diagonal matrix and doing a full matrix product.
    A = labeled[:, np.newaxis] * kernel_matrix

    if self.nu2 != 0:
        # The Laplacian is always built from an RBF kernel; reuse the kernel
        # matrix when it already is one.
        if self.kernel == 'rbf':
            laplacian_kernel_matrix = kernel_matrix
        else:
            laplacian_kernel_matrix = rbf_kernel(self.X_, gamma=self.gamma)
        laplacian = graph_laplacian(laplacian_kernel_matrix,
                                    normed=self.normalize_laplacian)
        A += self.nu2 * np.dot(laplacian, kernel_matrix)

    # Unlabeled samples are marked with the label -1 for the parent class.
    y = np.concatenate((y, -np.ones((num_unlabeled,), dtype=np.float32)),
                       axis=0)
    super(LapRLSC, self).fit(A, y, class_for_unlabeled=-1)
def spectralcluster(A, n_cluster, n_neighbors=6, random_state=None,
                    eigen_tol=0.0):
    """Cluster a graph with k-means on its spectral embedding.

    Parameters
    ----------
    A : array-like or sparse matrix
        Affinity / adjacency matrix handed to ``graph_laplacian``.
    n_cluster : int
        Number of eigenvectors computed and number of k-means clusters.
    n_neighbors : int, default 6
        Accepted for interface compatibility; not used by this function.
    random_state : int, RandomState instance or None
        Passed through ``check_random_state`` and on to ``k_means``.
    eigen_tol : float, default 0.0
        Tolerance forwarded to ``eigsh``.

    Returns
    -------
    eigenvalues, y, centroids, labels
        The eigenvalues returned by ``eigsh`` (computed on the negated
        Laplacian), the embedding fed to k-means, and the k-means centroids
        and labels.
    """
    # ``degree_scale`` is the diagonal returned alongside the Laplacian.
    norm_lap, degree_scale = graph_laplacian(A, normed=True,
                                             return_diag=True)
    # Set the diagonal of the Laplacian before the eigendecomposition.
    norm_lap = _set_diag(norm_lap, 1)

    # Negate in place and ask eigsh for the largest-magnitude eigenvalues
    # around sigma=1.0.
    norm_lap *= -1
    eigenvalues, eigenvectors = eigsh(norm_lap, k=n_cluster, sigma=1.0,
                                      which='LM', tol=eigen_tol)

    reversed_vectors = eigenvectors.T[n_cluster::-1]
    flipped = _deterministic_vector_sign_flip(reversed_vectors * degree_scale)
    y = flipped[:n_cluster].T

    rng = check_random_state(random_state)
    centroids, labels, _ = k_means(y, n_cluster, random_state=rng)
    return eigenvalues, y, centroids, labels
def caculate_key_frame(filenames):
    """Print cumulative eigenvalue sums of the frames' normalized Laplacian.

    Feature vectors are generated from ``filenames``, turned into an
    adjacency matrix, and the eigenvalues of its normalized graph Laplacian
    are sorted in descending order.  The cumulative sum of those eigenvalues
    and the successive differences of that cumulative sum are printed.

    Parameters
    ----------
    filenames
        Passed unchanged to ``generate_vector``.

    Returns
    -------
    None
        The function only prints.
    """
    vector_data = generate_vector(filenames)
    adjacent_matrix = gen_matrix_gausses(vector_data)
    lap_m, _ = graph_laplacian(adjacent_matrix, normed=True,
                               return_diag=True)

    eigenvalues, _ = eig(lap_m)
    descending = np.sort(eigenvalues)[::-1]
    cum_w = np.cumsum(descending)

    # Call-form print: same output on Python 2, valid syntax on Python 3.
    print(cum_w)
    print(np.diff(cum_w))
def test_graph_laplacian():
    """Dense and CSR inputs must yield the same symmetric graph Laplacian.

    Runs over four test matrices with ``normed`` both True and False.  The
    zero-column-sum property is asserted for the unnormalized case only.
    """
    outer = np.arange(10) * np.arange(10)[:, np.newaxis]
    ones = np.ones((7, 7))
    identity = np.eye(19)
    sym_vander = np.vander(np.arange(4)) + np.vander(np.arange(4)).T
    almost_equal = np.testing.assert_array_almost_equal

    for mat in (outer, ones, identity, sym_vander):
        sp_mat = sparse.csr_matrix(mat)
        n_nodes = mat.shape[0]
        for normed in (True, False):
            from_dense = graph_laplacian(mat, normed=normed)
            if not normed:
                almost_equal(from_dense.sum(axis=0), np.zeros(n_nodes))
            almost_equal(from_dense.T, from_dense)
            from_sparse = graph_laplacian(sp_mat, normed=normed).toarray()
            almost_equal(from_dense, from_sparse)
def _build_graph(self):
    """Return the negated normalized Laplacian with a zeroed diagonal.

    The normalized graph Laplacian of ``self.similarity_matrix`` is negated
    and its diagonal entries are set to 0.0.

    Returns
    -------
    laplacian : ndarray or sparse matrix
        Same container kind as returned by ``graph_laplacian``.
    """
    laplacian = graph_laplacian(self.similarity_matrix, normed=True)
    laplacian = -laplacian
    if sparse.isspmatrix(laplacian):
        # Relies on the sparse result exposing COO-style ``row``/``col``/
        # ``data`` attributes -- NOTE(review): confirm graph_laplacian's
        # sparse return format.
        diag_mask = (laplacian.row == laplacian.col)
        laplacian.data[diag_mask] = 0.0
    else:
        # Every (n + 1)-th element of the flattened square array lies on the
        # diagonal.  The size is taken from the Laplacian itself so that no
        # ``len()`` call is made on the input matrix.
        n_samples = laplacian.shape[0]
        laplacian.flat[::n_samples + 1] = 0.0
    return laplacian
def _build_graph(self):
    """Build the Label Spreading graph matrix from the graph Laplacian.

    Returns the negated normalized Laplacian of the affinity matrix of
    ``self.X_`` with its diagonal set to 0.0.
    """
    # Compute the affinity matrix (or gram matrix).
    if self.kernel == 'knn':
        self.nn_fit = None
    n_samples = self.X_.shape[0]
    affinity_matrix = self._get_kernel(self.X_)

    negated = -graph_laplacian(affinity_matrix, normed=True)

    if not sparse.isspmatrix(negated):
        # Dense case: stride n + 1 over the flat view walks the diagonal.
        negated.flat[::n_samples + 1] = 0.0
        return negated

    # Sparse case: zero the stored entries whose row equals their column.
    on_diagonal = (negated.row == negated.col)
    negated.data[on_diagonal] = 0.0
    return negated
def test_spectral_embedding_unnormalized():
    """spectral_embedding must handle the unnormalized Laplacian correctly.

    The embedding from ``spectral_embedding(norm_laplacian=False)`` is
    compared with a manual computation based on a dense ``eigh`` of the
    unnormalized graph Laplacian.
    """
    rng = np.random.RandomState(36)
    samples = rng.randn(10, 30)
    sims = rbf_kernel(samples)
    n_components = 8

    from_api = spectral_embedding(sims, norm_laplacian=False,
                                  n_components=n_components,
                                  drop_first=False)

    # Reference: dense eigendecomposition, scaled by the returned diagonal
    # and sign-normalized the same way.
    laplacian, dd = graph_laplacian(sims, normed=False, return_diag=True)
    _, diffusion_map = eigh(laplacian)
    scaled = diffusion_map.T[:n_components] * dd
    manual = _deterministic_vector_sign_flip(scaled).T

    assert_array_almost_equal(from_api, manual)
def seriation(self, A):
    """Order the nodes of a similarity matrix by a Laplacian eigenvector.

    The diagonal of ``A`` is zeroed, two eigenvectors of the negated graph
    Laplacian are computed with ``eigsh`` (``sigma=1.0``, ``which='LM'``),
    and the node indices are sorted by the second row of the reversed
    eigenvector array.

    Parameters
    ----------
    A : ndarray or sparse matrix
        Square similarity matrix.  It is not modified: sparse input is
        densified and dense input is copied before the diagonal is zeroed.

    Returns
    -------
    sort_index : ndarray of int
        Permutation returned by ``np.argsort``.
    """
    n_components = 2
    eigen_tol = 0.00001

    if sparse.issparse(A):
        A = A.todense()
    else:
        # np.fill_diagonal writes in place; copy so the caller's matrix
        # keeps its diagonal.
        A = A.copy()
    np.fill_diagonal(A, 0)

    laplacian, _ = graph_laplacian(A, return_diag=True)
    laplacian *= -1
    _, diffusion_map = eigsh(laplacian, k=n_components, sigma=1.0,
                             which='LM', tol=eigen_tol)

    # Unlike the spectral-embedding code this is modelled on, the rows are
    # not rescaled by the Laplacian's diagonal.
    embedding = diffusion_map.T[n_components::-1]
    return np.argsort(embedding[1])
def test_arpack_eigsh_initialization():
    """Non-regression test for the ``eigsh`` start-vector initialization.

    Null-space computation is better when ``v0`` is drawn from [-1, 1] (as
    in original ARPACK) than from [0, 1]; with the latter this test could
    fail.
    """
    rng = check_random_state(42)

    base = rng.rand(50, 50)
    gram = np.dot(base.T, base)  # create s.p.d. matrix
    shifted = graph_laplacian(gram) + 1e-7 * np.identity(gram.shape[0])

    k = 5
    start_vector = rng.uniform(-1, 1, shifted.shape[0])
    smallest, _ = eigsh(shifted, k=k, sigma=0.0, v0=start_vector)

    # Eigenvalues of an s.p.d. matrix should be nonnegative; the first
    # returned value is the smallest.
    assert_greater_equal(smallest[0], 0)
def test_arpack_eigsh_initialization():
    """Check that ``eigsh`` started from a [-1, 1] vector stays nonnegative.

    Non-regression test: initializing ``eigsh`` from [-1, 1] instead of
    [0, 1] gives a better null-space computation.
    """
    random_state = check_random_state(42)
    n_nodes = 50

    seed_matrix = random_state.rand(n_nodes, n_nodes)
    spd = np.dot(seed_matrix.T, seed_matrix)  # create s.p.d. matrix
    regularized = graph_laplacian(spd) + 1e-7 * np.identity(spd.shape[0])

    # New initialization [-1, 1] (as in original ARPACK); it was [0, 1]
    # before, with which this test could fail.
    v0 = random_state.uniform(-1, 1, regularized.shape[0])
    eigenvalues, _ = eigsh(regularized, k=5, sigma=0.0, v0=v0)

    # eigenvalues[0] is the smallest one and must not be negative.
    assert_greater_equal(eigenvalues[0], 0)
def cluster(self, affinities):
    """Cluster samples from an affinity matrix and return their labels.

    The normalized graph Laplacian of ``affinities`` is embedded with
    ``self.embed`` and the embedding is clustered with k-means.  Besides the
    returned labels, ``self.embedding``, ``self.labels`` and
    ``self.centroids`` are updated.  ``self.centroids`` holds, for each
    k-means centroid, the index of the embedded sample with the smallest
    squared Euclidean distance to it.
    """
    laplacian, diagonal = graphutil.graph_laplacian(
        affinities, normed=True, return_diag=True)
    self.embedding = self.embed(laplacian, diagonal, self.k, self.tol)

    centroid_vals, self.labels, _ = k_means(
        self.embedding, self.k,
        random_state=self.rand,
        n_init=self.n_init,
        init=self.init_centroids)

    def nearest_sample(centroid):
        # Index of the embedded point closest to ``centroid``.
        squared = [np.sum((centroid - point) ** 2)
                   for point in self.embedding]
        return np.argmin(squared)

    self.centroids = [nearest_sample(c) for c in centroid_vals]
    return self.labels
def _check_manifold_gradient(self):
    """Numerically verify the gradient of the manifold regularization term.

    A random problem is generated (10 examples, 5 binary features, a
    symmetric affinity matrix with 100 on the diagonal).  The closed-form
    gradient ``4 * beta * O^T L O w / (n_examples * n**2)`` is compared,
    component by component, with a central finite difference of the
    regularization loss.

    Returns
    -------
    bool
        True when every component agrees under ``np.allclose``; False as
        soon as one component differs.  A PASSED/FAILED line is printed
        either way.
    """
    import sys  # local import: only used for the unterminated status line

    # Write the prefix without a newline so the verdict ends up on the same
    # line (equivalent to the Python 2 ``print "...",`` form).
    sys.stdout.write(
        "Gradient verification for the manifold regularization term... ")

    n_examples = 10
    n_features = 5
    beta = np.random.rand() * 100.0
    epsilon = 1e-6

    # Initialization
    O = np.random.randint(0, 2, (n_examples, n_features))
    A = np.random.rand(n_examples, n_examples)
    A = np.dot(A.T, A)  # Make it symmetric
    np.fill_diagonal(A, 100.)
    L = graph_laplacian(A)
    w = np.random.rand(n_features)

    normalizer = n_examples * L.shape[0] ** 2

    # Compute the gradient according to the expression
    OLO = np.dot(np.dot(O.T, L), O)
    gradient = 4 * beta * np.dot(OLO, w) / normalizer

    # Compute the empirical gradient estimate
    def loss(w):
        total = 0.0
        for i in range(O.shape[0]):
            for j in range(O.shape[0]):
                total += A[i, j] * (np.dot(w, O[i]) - np.dot(w, O[j])) ** 2
        total /= normalizer
        return beta * total

    # Check the gradient for each component of w
    for i in range(w.shape[0]):
        w_1 = w.copy()
        w_2 = w.copy()
        w_1[i] += epsilon
        w_2[i] -= epsilon
        empirical_gradient = (loss(w_1) - loss(w_2)) / (2 * epsilon)
        if not np.allclose(empirical_gradient, gradient[i]):
            print("FAILED. Expected gradient: %.8f Calculated gradient: %.8f"
                  % (empirical_gradient, gradient[i]))
            return False

    # Reached only when no component failed.
    print("PASSED")
    return True
def create_laplacian(Adjacency, norm_lap=None, sparse=None):
    """Find the graph Laplacian of a weighted adjacency matrix.

    Parameters
    ----------
    Adjacency : sparse NxN array
        Weighted adjacency matrix.
    norm_lap : bool or None, default None
        Forwarded to ``graph_laplacian`` as ``normed``.
    sparse : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Laplacian : NxN laplacian array
        As returned by ``graph_laplacian``.
    Diagonal : NxN sparse diagonal matrix
        Built with ``spdiags`` from the diagonal vector that
        ``graph_laplacian`` returns.
    """
    n_nodes = Adjacency.shape[0]
    laplacian, diag_values = graph_laplacian(Adjacency, normed=norm_lap,
                                             return_diag=True)
    diag_matrix = spdiags(data=diag_values, diags=[0], m=n_nodes, n=n_nodes)
    return laplacian, diag_matrix
def predict_k(affinity_matrix):
    """Estimate the number of clusters from the largest eigengap.

    Parameters
    ----------
    affinity_matrix : array-like or sparse matrix, shape (n_samples, n_samples)
        Pairwise similarities.

    Returns
    -------
    k : int
        One plus the index of the first largest positive gap between
        consecutive eigenvalues; 1 when no gap is positive.
    """
    normed_laplacian, _ = graph_laplacian(affinity_matrix, normed=True,
                                          return_diag=True)
    laplacian = _set_diag(normed_laplacian, 1, norm_laplacian=True)

    # All but one eigenpair are requested.
    n_components = affinity_matrix.shape[0] - 1
    solved, _ = eigsh(-laplacian, k=n_components, which="LM", sigma=1.0,
                      maxiter=5000)
    # The solver ran on the negated matrix: reverse the order and flip the
    # sign.
    eigenvalues = -solved[::-1]

    best_gap = 0
    best_index = 0
    consecutive = zip(eigenvalues[:-1], eigenvalues[1:])
    for index, (lower, upper) in enumerate(consecutive):
        gap = upper - lower
        if gap > best_gap:
            best_gap = gap
            best_index = index

    return best_index + 1
def create_laplacian(Adjacency, norm_lap=None, method='Personal',
                     sparse=None):
    """Find the graph Laplacian of a weighted adjacency matrix.

    Parameters
    ----------
    Adjacency : sparse NxN array
        Weighted adjacency matrix.
    norm_lap : bool or None, default None
        Forwarded to ``graph_laplacian`` as ``normed``; only used by the
        'sklearn' / 'scikit' method.
    method : {'Personal', 'personal', 'sklearn', 'scikit'}
        'Personal' computes ``D - Adjacency`` directly, with ``D`` the
        diagonal matrix of row sums.  'sklearn' / 'scikit' delegates to
        ``graph_laplacian``.
    sparse : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Laplacian : NxN laplacian array
    Diagonal : NxN sparse diagonal matrix

    Raises
    ------
    ValueError
        If ``method`` is not one of the recognized names.
    """
    if method in ('personal', 'Personal'):
        n_nodes = Adjacency.shape[0]
        row_sums = np.squeeze(np.asarray(Adjacency.sum(axis=1)))
        D = spdiags(data=row_sums, diags=[0], m=n_nodes, n=n_nodes)
        return D - Adjacency, D

    if method in ('sklearn', 'scikit'):
        n_nodes = Adjacency.shape[0]
        L, diag_values = graph_laplacian(Adjacency, normed=norm_lap,
                                         return_diag=True)
        D = spdiags(data=diag_values, diags=[0], m=n_nodes, n=n_nodes)
        return L, D

    # The two literal fragments used to be joined without a space.
    raise ValueError('Unrecognized Graph Laplacian method '
                     'construction.')
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import SpectralEmbedding, spectral_embedding_
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.arpack import eigsh
from sklearn.utils.graph import graph_laplacian
from sklearn.utils.sparsetools import connected_components

import service.prodbox

# Compare two ways of getting 30 eigenvalues of the negated normalized
# Laplacian of a 10-nearest-neighbour graph: TruncatedSVD (ARPACK) versus
# shift-invert eigsh.
app = service.prodbox.CinemaService()
X = app.getWeightedSearchFeatures(15)
graph = kneighbors_graph(X, 10)

lap = graph_laplacian(graph, True)
lap = spectral_embedding_._set_diag(lap, 1)

svd = TruncatedSVD(n_components=30, algorithm="arpack")
svd.fit(-lap)

# Diagonal of V * (-L) * V^T with V the fitted SVD components.
components = svd.components_
dense_negated = (-lap).todense()
eigenvalues = np.diag(components * dense_negated * components.T)

eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)

print(eigenvalues)
print(eigenvalues2)

se = SpectralEmbedding(n_components=30, eigen_solver='arpack',
                       affinity="nearest_neighbors")
def predict_k(affinity_matrix):
    """Predict the number of clusters based on the eigengap.

    Parameters
    ----------
    affinity_matrix : array-like or sparse matrix, shape (n_samples, n_samples)
        Adjacency matrix.  Each element contains a measure of similarity
        between two of the data points.

    Returns
    -------
    k : int
        Estimated number of clusters.

    Notes
    -----
    If the graph is not fully connected, the zero component counts as a
    single cluster.

    References
    ----------
    A Tutorial on Spectral Clustering, 2007
    Luxburg, Ulrike
    http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf
    """
    # With normed=True, L = D^(-1/2) * (D - A) * D^(-1/2); otherwise
    # L = D - A.  normed=True is recommended.
    normed_laplacian, _ = graph_laplacian(affinity_matrix, normed=True,
                                          return_diag=True)
    laplacian = _set_diag(normed_laplacian, 1)

    # N - 1 eigenpairs are requested, which may be slow for large N.
    n_components = affinity_matrix.shape[0] - 1

    # Shift-invert mode provides more than just a fast way to obtain a few
    # small eigenvalues, see
    # http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
    # The normalized Laplacian has eigenvalues between 0 and 2, so I - L has
    # eigenvalues between -1 and 1.
    solved, _ = eigsh(-laplacian, k=n_components, which="LM", sigma=1.0,
                      maxiter=5000)
    eigenvalues = -solved[::-1]  # Reverse and sign inversion.

    widest = 0
    widest_start = 0
    pairs = zip(eigenvalues[:-1], eigenvalues[1:])
    for start, (previous, current) in enumerate(pairs):
        gap = current - previous
        if gap > widest:
            widest = gap
            widest_start = start

    return widest_start + 1
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import SpectralEmbedding, spectral_embedding_
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.arpack import eigsh
from sklearn.utils.graph import graph_laplacian
from sklearn.utils.sparsetools import connected_components

import service.prodbox

# Compare eigenvalues of the negated normalized Laplacian obtained through
# TruncatedSVD (ARPACK) and through shift-invert eigsh, then fit a
# SpectralEmbedding on the same features.
app = service.prodbox.CinemaService()
X = app.getWeightedSearchFeatures(15)
graph = kneighbors_graph(X, 10)

lap = graph_laplacian(graph, True)
lap = spectral_embedding_._set_diag(lap, 1)

svd = TruncatedSVD(n_components=30, algorithm="arpack")
svd.fit(-lap)

# Diagonal of V * (-L) * V^T with V the fitted SVD components.
components = svd.components_
dense_negated = (-lap).todense()
eigenvalues = np.diag(components * dense_negated * components.T)

eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)

print(eigenvalues)
print(eigenvalues2)

se = SpectralEmbedding(n_components=30, eigen_solver='arpack',
                       affinity="nearest_neighbors")
se.fit(X)
def laplacian_matrix(B, d, normed=False):
    """Return the graph Laplacian of the similarity matrix of ``B`` and ``d``.

    ``B`` and ``d`` are passed unchanged to ``similarity_matrix``; ``normed``
    is forwarded to ``graph_laplacian``.
    """
    similarity = similarity_matrix(B, d)
    laplacian = graph_laplacian(similarity, normed=normed)
    return laplacian
percent = perc_matrix.div(perc_matrix['total'], axis='index') * 100 # calculate row percentage percent = percent.drop(['total'], axis=1) # drop total column cat_perc = [] for cat in matrix.columns: cat_tuple = (cat, matrix[cat].mean()) cat_perc.append(cat_tuple) # sort category percentages cat_perc = sorted(cat_perc, key=lambda x: x[1]) graph = cosine_similarity(matrix) # use cosine similarity, as in Noulas et al. # https://github.com/mingmingyang/auto_spectral_clustering/blob/master/autosp.py # how to calculate spectral clusters norm_laplacian, dd = graph_laplacian(graph, normed=True, return_diag=True) laplacian = _set_diag(norm_laplacian, 1, norm_laplacian=True) n_components = graph.shape[0] - 1 eigenvalues, eigenvectors = eigsh(-laplacian, k=n_components, which="LM", sigma=1.0, maxiter=5000) eigenvalues = -eigenvalues[::-1] max_gap = 0 gap_pre_index = 0 for i in range(1, eigenvalues.size): gap = eigenvalues[i] - eigenvalues[i - 1] if gap > max_gap:
def spectral_embedding(self, adjacency, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=0.0, drop_first=True):
    """Embed a graph with eigenvectors of its normalized Laplacian.

    Customized copy of scikit-learn's ``spectral_embedding``, see
    https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/spectral_embedding_.py#L133
    Two things differ from the original: the eigenvalues are returned
    together with the embedding, and the Laplacian is always normalized.

    Parameters
    ----------
    adjacency : array-like or sparse matrix, shape (n_samples, n_samples)
        Adjacency (affinity) matrix; passed through ``check_symmetric``.
    n_components : int, default 8
        Dimension of the embedding.
    eigen_solver : {None, 'arpack', 'lobpcg', 'amg'}, default None
        Eigensolver; None means 'arpack'.  'amg' needs pyamg.
    random_state : int, RandomState instance or None
        Seeds the random start vectors of the lobpcg-based solvers.
    eigen_tol : float, default 0.0
        Tolerance forwarded to ``eigsh`` on the ARPACK path.
    drop_first : bool, default True
        Whether to drop the first eigenvector from the embedding.

    Returns
    -------
    embedding : array, shape (n_samples, n_components)
    lambdas : array
        Eigenvalues exactly as returned by the solver that produced the
        embedding.  On the ARPACK path these are eigenvalues of the negated
        Laplacian in solver order; they are not re-ordered to match the
        embedding columns.

    Raises
    ------
    ValueError
        If ``eigen_solver`` is unknown, if 'amg' is requested but pyamg
        cannot be imported, or if a lobpcg-based solve yields an embedding
        with a single row.
    """
    norm_laplacian = True
    adjacency = check_symmetric(adjacency)

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        if eigen_solver == "amg":
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is "
                "not available.")

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn("Graph is not fully connected, spectral embedding"
                      " may not work as expected.")

    laplacian, dd = graph_laplacian(adjacency,
                                    normed=norm_laplacian, return_diag=True)

    # lobpcg used with eigen_solver='amg' has bugs for low number of nodes,
    # for details see the source code in scipy:
    # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
    # /lobpcg/lobpcg.py#L237
    # or matlab:
    # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
    # so 'amg' is routed to ARPACK for dense or small problems.
    use_arpack = (eigen_solver == 'arpack' or
                  eigen_solver != 'lobpcg' and
                  (not sparse.isspmatrix(laplacian) or
                   n_nodes < 5 * n_components))

    if use_arpack:
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means).
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol)
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails.  We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == 'amg':
        # This is an ``elif`` on purpose: when 'amg' was routed to ARPACK
        # above, the Laplacian has been negated in place and the ARPACK
        # result must not be overwritten by a second solve.
        #
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        laplacian = _set_diag(laplacian, 1, norm_laplacian)
        multilevel = smoothed_aggregation_solver(
            check_array(laplacian, 'csr'))
        preconditioner = multilevel.aspreconditioner()
        start = random_state.rand(laplacian.shape[0], n_components + 1)
        start[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, start,
                                        M=preconditioner, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    # A plain ``if``: also reached when ARPACK failed and fell back above.
    if eigen_solver == "lobpcg":
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            lambdas, diffusion_map = eigh(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            start = random_state.rand(laplacian.shape[0], n_components + 1)
            start[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, start, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
            if embedding.shape[0] == 1:
                raise ValueError

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T, lambdas
    return embedding[:n_components].T, lambdas
def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=0.0,
                       norm_laplacian=True, drop_first=True,
                       mode=None):
    """Project the sample on the first eigen vectors of the graph Laplacian.

    The adjacency matrix is used to compute a (by default normalized) graph
    Laplacian whose spectrum (especially the eigen vectors associated to the
    smallest eigen values) has an interpretation in terms of minimal number
    of cuts necessary to split the graph into comparably sized components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    The input is always symmetrized by averaging it with its transpose; a
    warning is issued when it was not symmetric to begin with.

    Parameters
    ----------
    adjacency : array-like or sparse matrix, shape: (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : integer, optional
        The dimension of the projection subspace.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.  None means 'arpack'.

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigen vectors decomposition.

    eigen_tol : float, optional, default=0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    norm_laplacian : bool, optional, default=True
        Forwarded to ``graph_laplacian`` as ``normed``.

    drop_first : bool, optional, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    mode : None or str
        Deprecated alias of ``eigen_solver``; when given it overrides
        ``eigen_solver`` and a DeprecationWarning is issued.

    Returns
    -------
    embedding : array, shape=(n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If the solver name is unknown, if 'amg' is requested but pyamg
        cannot be imported, or if a lobpcg-based solve yields an embedding
        with a single row.

    Notes
    -----
    Spectral embedding is most useful when the graph has one connected
    component. If the graph has many components, the first few eigenvectors
    will simply uncover the connected components of the graph.

    References
    ----------
    * http://en.wikipedia.org/wiki/LOBPCG

    * Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method
      Andrew V. Knyazev
      http://dx.doi.org/10.1137%2FS1064827500366124
    """
    # TODO(MMP): the original author marked the description above as
    # "TO CHANGE THIS".

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        if eigen_solver == "amg" or mode == "amg":
            raise ValueError("The eigen_solver was set to 'amg', but pyamg is "
                             "not available.")

    if mode is not None:
        warnings.warn("'mode' was renamed to eigen_solver "
                      "and will be removed in 0.15.",
                      DeprecationWarning)
        eigen_solver = mode

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    # Check that the matrices given is symmetric.  ``A - A.T`` is
    # antisymmetric, so any entry pair that differs shows up as a positive
    # value and no abs() is needed.
    difference = adjacency - adjacency.T
    if sparse.isspmatrix(adjacency):
        is_symmetric = np.all(difference.data < 1e-10)
    else:
        is_symmetric = np.all(difference < 1e-10)
    if not is_symmetric:
        warnings.warn("Graph adjacency matrix should be symmetric. "
                      "Converted to be symmetric by average with its "
                      "transpose.")
    adjacency = .5 * (adjacency + adjacency.T)

    if not _graph_is_connected(adjacency):
        warnings.warn("Graph is not fully connected, spectral embedding"
                      " may not work as expected.")

    laplacian, dd = graph_laplacian(adjacency,
                                    normed=norm_laplacian, return_diag=True)

    # lobpcg used with eigen_solver='amg' has bugs for low number of nodes,
    # for details see the source code in scipy:
    # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
    # /lobpcg/lobpcg.py#L237
    # or matlab:
    # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
    # so 'amg' is routed to ARPACK for dense or small problems.
    use_arpack = (eigen_solver == 'arpack' or
                  eigen_solver != 'lobpcg' and
                  (not sparse.isspmatrix(laplacian) or
                   n_nodes < 5 * n_components))

    if use_arpack:
        laplacian = _set_diag(laplacian, 1)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means).
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            lambdas, diffusion_map = eigsh(-laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol)
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"

    elif eigen_solver == 'amg':
        # ``elif`` on purpose: when 'amg' was routed to ARPACK above, the
        # result must not be recomputed with the solver that branch avoids.
        #
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs native (double precision) floats
        laplacian = laplacian.astype(np.float64)
        laplacian = _set_diag(laplacian, 1)
        multilevel = smoothed_aggregation_solver(atleast2d_or_csr(laplacian))
        preconditioner = multilevel.aspreconditioner()
        start = random_state.rand(laplacian.shape[0], n_components + 1)
        start[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, start,
                                        M=preconditioner, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    # A plain ``if``: also reached when ARPACK failed and fell back above.
    if eigen_solver == "lobpcg":
        # lobpcg needs native (double precision) floats
        laplacian = laplacian.astype(np.float64)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to symeig, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.todense()
            lambdas, diffusion_map = symeig(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            start = random_state.rand(laplacian.shape[0], n_components + 1)
            start[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, start, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
            if embedding.shape[0] == 1:
                raise ValueError

    if drop_first:
        return embedding[1:n_components].T
    return embedding[:n_components].T
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.utils.graph import graph_laplacian


def assign_undirected_weight(W, i, j, v):
    """Set the weight of the undirected edge (i, j) in ``W`` to ``v``.

    Both ``W[i, j]`` and ``W[j, i]`` are written so that ``W`` stays
    symmetric.
    """
    W[i, j] = W[j, i] = v


# Small 5-node weighted graph used to inspect the normalized Laplacian.
n = 5
W = np.zeros((n, n))
assign_undirected_weight(W, 0, 1, 0.08)
assign_undirected_weight(W, 0, 2, 0.09)
assign_undirected_weight(W, 1, 2, 0.45)
assign_undirected_weight(W, 1, 3, 0.22)
assign_undirected_weight(W, 1, 4, 0.24)
assign_undirected_weight(W, 2, 3, 0.2)
assign_undirected_weight(W, 2, 4, 0.19)
assign_undirected_weight(W, 3, 4, 1)

adjacency = W
# Call-form print: same output on Python 2, valid syntax on Python 3.
print(W)

laplacian, dd = graph_laplacian(adjacency, normed=True, return_diag=True)
print(laplacian)
print(dd)
# Merge the test split into the training split; everything below works on
# the combined data.
test_data = np.array(test_data)
test_label = np.array(test_label)
train_data = np.concatenate((train_data, test_data))
train_label = np.concatenate((train_label, test_label))

# Call-form print: same output on Python 2, valid syntax on Python 3.
print('nsample: ' + str(train_data.shape[0]))
print('nclass: ' + str(np.unique(train_label).shape[0]))

gamma_value = 5.0
affinity_matrix = rbf_kernel(train_data, gamma=gamma_value)

from sklearn.utils.graph import graph_laplacian
from sklearn.utils.extmath import _deterministic_vector_sign_flip

### calculate laplacian matrix
laplacian, dd = graph_laplacian(affinity_matrix, normed=True,
                                return_diag=True)
# The Laplacian is negated in place.
laplacian *= -1.

nclass = np.unique(train_label).shape[0]
nsample = train_data.shape[0]

#### Configuring AdaGrad
print('mini batch size = 100')
master_stepsize = 0.0025
outer_iter = 600
nsampleround = 50
ncols = 2
auto_corr = 0.0
ndim = nclass
print('nsampleround: ' + str(nsampleround))
print('ncols: ' + str(ncols))
def fit(self, X, X_species, y, orthologs, species_graph_adjacency,
        species_graph_names):
    """Fit the model

    Parameters
    ----------
    X: array_like, dtype=float, shape=(n_examples, n_features)
        The feature vectors of each labeled example.

    X_species: array_like, dtype=str, shape=(n_examples,)
        The name of the species to which each example belongs.

    y: array_like, dtype=float, shape(n_examples,)
        The labels of the examples in X.

    orthologs: dict
        A dictionary in which the keys are indices of X and the values are
        another dict, which contains the orthologous sequences and their
        species names. TIP: use an HDF5 file to store this information if
        the data doesn't fit into memory (string keys are then used).
        Note: assumes that there is at most 1 ortholog per species.

        ex: {0: {"species": ["species1", "species5", "species2"],
                 "X": [[0, 2, 1, 4],    # Ortholog 1
                       [9, 4, 3, 1],    # Ortholog 2
                       [0, 0, 2, 1]]},  # Ortholog 3
             1: {"species": ["species1", "species3"],
                 "X": [[1, 4, 7, 6],
                       [4, 4, 9, 3]]}}

    species_graph_adjacency: array_like, dtype=float, shape=(n_species, n_species)
        The adjacency matrix of the species graph.

    species_graph_names: array_like, dtype=str, shape(n_species,)
        The names of the species in the graph. The names should follow the
        same order as the adjacency matrix.
        ex: species_graph_names[4] relates to species_graph_adjacency[4]
            and species_graph_adjacency[:, 4].

    Returns
    -------
    None
        The learned weight vector is stored in ``self.w``.

    Note
    ----
    It is recommended to center the features vectors for the examples and
    their orthologs using a standard scaler.
    (see: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html).
    """
    # Create a mapping between species names and indices in the graph
    # adjacency matrix
    n_species = len(species_graph_names)
    idx_by_species = dict(zip(species_graph_names, range(n_species)))

    if self.fit_intercept:
        # Add a feature for each example that serves as bias
        X = np.hstack((X, np.ones(X.shape[0]).reshape(-1, 1)))

    # Precompute the laplacian of the species graph
    # Note: we assume that there is one entry per species. This sacrifices a
    # bit of memory, but allows the precomputation of the graph laplacian.
    L = graph_laplacian(species_graph_adjacency,
                        normed=self.normalize_laplacian)
    L *= 2.0 * self.beta

    matrix_to_invert = np.zeros((X.shape[1], X.shape[1]))

    # H5py doesn't support integer keys, so string keys are used for it.
    # ``h`` is presumably the h5py module -- confirm against the file's
    # imports.
    use_string_keys = isinstance(orthologs, h.File)

    # Compute the Phi^t x L x Phi product, where L is the block diagonal
    # matrix with blocks equal to variable L
    for i, x in enumerate(X):
        # Keep the integer ``i`` intact: it still indexes X_species below.
        key = str(i) if use_string_keys else i
        entry = orthologs[key]

        if len(entry["species"]) > 0:
            # Load the orthologs of X and create a matrix that also
            # contains x
            x_orthologs_species = [idx_by_species[s]
                                   for s in entry["species"]]
            # asarray lets plain nested lists (as in the docstring example)
            # be used as well as arrays or HDF5 datasets.
            x_orthologs_feats = np.asarray(entry["X"])
            if self.fit_intercept:
                # Add this bias term
                bias = np.ones(x_orthologs_feats.shape[0]).reshape(-1, 1)
                x_orthologs_feats = np.hstack((x_orthologs_feats, bias))

            # One row per species of the graph; species without an
            # ortholog keep an all-zero row.
            X_tmp = np.zeros((n_species, x_orthologs_feats.shape[1]))
            X_tmp[x_orthologs_species] = x_orthologs_feats
            X_tmp[idx_by_species[X_species[i]]] = x

            # Compute the efficient product and add it to the nasty product
            matrix_to_invert += np.dot(np.dot(X_tmp.T, L), X_tmp)

    # Compute the Phi^T x Phi matrix product that includes the labeled
    # examples only
    matrix_to_invert += np.dot(X.T, X)

    # Compute the alpha * I product
    matrix_to_invert += self.alpha * np.eye(X.shape[1])

    # Compute the value of w, the predictor that minimizes the objective
    # function
    self.w = np.dot(np.dot(np.linalg.inv(matrix_to_invert), X.T),
                    y).reshape(-1, )
def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
                       random_state=None, eigen_tol=0.0,
                       norm_laplacian=True, drop_first=True):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must be taken to always make the affinity matrix symmetric
    so that the eigenvector decomposition works as expected.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : array-like or sparse matrix, shape: (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : integer, optional, default 8
        The dimension of the projection subspace.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}, default None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigenvectors decomposition when eigen_solver == 'amg'.
        By default, arpack is used.

    eigen_tol : float, optional, default=0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    drop_first : bool, optional, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    norm_laplacian : bool, optional, default=True
        If True, then compute normalized Laplacian. The eigenvectors are
        divided by the diagonal returned with the Laplacian only in that
        case.

    Returns
    -------
    embedding : array, shape=(n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If ``eigen_solver`` is 'amg' but pyamg cannot be imported, if
        ``eigen_solver`` is not a recognised value, or if a lobpcg-based
        solver yields a single eigenvector.

    Notes
    -----
    Spectral embedding is most useful when the graph has one connected
    component. If the graph has many components, the first few eigenvectors
    will simply uncover the connected components of the graph.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method
      Andrew V. Knyazev
      http://dx.doi.org/10.1137%2FS1064827500366124
    """
    adjacency = check_symmetric(adjacency)

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        if eigen_solver == "amg":
            raise ValueError("The eigen_solver was set to 'amg', but pyamg is "
                             "not available.")

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'"
                         % eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn("Graph is not fully connected, spectral embedding"
                      " may not work as expected.")

    # dd is the diagonal returned alongside the Laplacian.
    # NOTE(review): presumably the square roots of the degrees when
    # norm_laplacian is True and the raw degrees otherwise -- confirm against
    # graph_laplacian.
    laplacian, dd = graph_laplacian(adjacency, normed=norm_laplacian,
                                    return_diag=True)

    if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian) or
             n_nodes < 5 * n_components)):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            v0 = random_state.uniform(-1, 1, laplacian.shape[0])
            lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                           sigma=1.0, which='LM',
                                           tol=eigen_tol, v0=v0)
            embedding = diffusion_map.T[n_components::-1]
            if norm_laplacian:
                # Per the model's assumption the eigenvectors are divided
                # (not multiplied) by dd. Skipped for the unnormalized
                # Laplacian, whose eigenvectors need no rescaling.
                embedding = embedding / dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == 'amg':
        # `elif`: when the ARPACK branch above was taken for an 'amg' request
        # (dense or small problem), AMG must not run a second time on the
        # Laplacian that ARPACK negated in place.
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        laplacian = _set_diag(laplacian, 1, norm_laplacian)
        ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
        M = ml.aspreconditioner()
        X = random_state.rand(laplacian.shape[0], n_components + 1)
        X[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T
        if norm_laplacian:
            # Division (not multiplication) by dd, normalized case only
            embedding = embedding / dd
        if embedding.shape[0] == 1:
            raise ValueError("The eigensolver returned a single eigenvector; "
                             "cannot build a spectral embedding from it.")

    # Plain `if` so that the ARPACK failure fallback above reaches this branch
    if eigen_solver == "lobpcg":
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian, dtype=np.float64,
                                accept_sparse=True)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            lambdas, diffusion_map = eigh(laplacian)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # Division (not multiplication) by dd, normalized case only
                embedding = embedding / dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
                                            largest=False, maxiter=2000)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # Division (not multiplication) by dd, normalized case only
                embedding = embedding / dd
            if embedding.shape[0] == 1:
                raise ValueError("The eigensolver returned a single "
                                 "eigenvector; cannot build a spectral "
                                 "embedding from it.")

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T