def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
                  method='ap'):
    """Define clusters given the similarity matrix and the threshold.

    The graph described by ``similarity_matrix`` is first split into
    connected components (treated as undirected).  Every component with
    more than one element is then clustered on its own with the selected
    method, and the per-component labels are offset so that cluster ids
    are unique across components.

    Parameters
    ----------
    similarity_matrix : sparse matrix, shape (n_samples, n_samples)
        Pairwise similarities.  The code calls ``.toarray()`` on its
        sub-blocks and adds the transpose plus an identity to them, so it
        presumably stores a single triangle with an empty diagonal
        -- TODO confirm against the caller.
    threshold : float, optional
        Distance cut passed to ``fcluster``; only used when
        ``method == 'hc'``.
    max_iter : int, optional
        Forwarded to ``AffinityPropagation``; only used when
        ``method == 'ap'``.
    method : {'ap', 'dbscan', 'hc'}, optional
        Clustering algorithm applied inside each connected component.

    Returns
    -------
    numpy.ndarray
        ``np.array(extra.flatten(clusters))``: one cluster id per sample.

    Raises
    ------
    ValueError
        If ``method`` is unknown and at least one connected component has
        more than one element.
    """
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    # The estimator is built once and re-fitted on every component.
    estimator = None
    if method == 'dbscan':
        estimator = DBSCAN(metric='precomputed', min_samples=1, eps=.2,
                           n_jobs=-1)
    elif method == 'ap':
        # NOTE(review): preference='median' is kept as in the original;
        # confirm that the AffinityPropagation in scope accepts a string.
        estimator = AffinityPropagation(affinity='precomputed',
                                        max_iter=max_iter,
                                        preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]

        if idxs.shape[0] <= 1:
            # Connected component contains just 1 element: it is its own
            # cluster.
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
            continue

        # Sub-matrix of the component, symmetrised and given a unit
        # diagonal so that ``1 - sm`` has zero self-distance (assuming the
        # input diagonal is empty).
        sm = similarity_matrix[idxs][:, idxs]
        sm += sm.T + scipy.sparse.eye(sm.shape[0])

        if method == 'hc':
            # Hierarchical clustering on the condensed distance matrix.
            dists = squareform(1 - sm.toarray())
            links = fastcluster.linkage(dists, method='ward')
            try:
                clusters_ = fcluster(links, threshold, 'distance')
            except ValueError as err:
                # Fall back to a single cluster for the whole component.
                logging.critical(err)
                clusters_ = np.zeros(1, dtype=int)
        elif method == 'dbscan':
            clusters_ = estimator.fit(1. - sm.toarray()).labels_
        elif method == 'ap':
            clusters_ = estimator.fit(sm).labels_
        else:
            raise ValueError("clustering method %s unknown" % method)

        # Make the local labels start at 1.  The original only handled a
        # minimum of exactly 0; a negative label (e.g. -1) was left as is
        # and, once offset, collided with the previous component's ids.
        # A new array is built so the estimator's ``labels_`` is not
        # modified in place.
        lowest = np.min(clusters_)
        if lowest < 1:
            clusters_ = clusters_ + (1 - lowest)

        clusters_ = clusters_ + prev_max_clust
        clusters[idxs] = clusters_
        prev_max_clust = int(np.max(clusters_))

    return np.array(extra.flatten(clusters))
def _fix_connectivity(X, connectivity, n_components=None, affinity="euclidean"): """ Fixes the connectivity matrix - copies it - makes it symmetric - converts it to LIL if necessary - completes it if necessary """ n_samples = X.shape[0] if (connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples): raise ValueError('Wrong shape for connectivity matrix: %s ' 'when X is %s' % (connectivity.shape, X.shape)) # Make the connectivity matrix symmetric: connectivity = connectivity + connectivity.T # Convert connectivity matrix to LIL if not sparse.isspmatrix_lil(connectivity): if not sparse.isspmatrix(connectivity): connectivity = sparse.lil_matrix(connectivity) else: connectivity = connectivity.tolil() # Compute the number of nodes n_components, labels = connected_components(connectivity) if n_components > 1: warnings.warn("the number of connected components of the " "connectivity matrix is %d > 1. Completing it to avoid " "stopping the tree early." % n_components, stacklevel=2) # XXX: Can we do without completing the matrix? for i in xrange(n_components): idx_i = np.where(labels == i)[0] Xi = X[idx_i] for j in xrange(i): idx_j = np.where(labels == j)[0] Xj = X[idx_j] D = pairwise_distances(Xi, Xj, metric=affinity) ii, jj = np.where(D == np.min(D)) ii = ii[0] jj = jj[0] connectivity[idx_i[ii], idx_j[jj]] = True connectivity[idx_j[jj], idx_i[ii]] = True return connectivity, n_components
def _graph_is_connected(graph): """ Return whether the graph is connected (True) or Not (False) Parameters ---------- graph : array-like or sparse matrix, shape: (n_samples, n_samples) adjacency matrix of the graph, non-zero weight means an edge between the nodes Returns ------- is_connected : bool True means the graph is fully connected and False means not """ if sparse.isspmatrix(graph): # sparse graph, find all the connected components n_connected_components, _ = connected_components(graph) return n_connected_components == 1 else: # dense graph, find all connected components start from node 0 return _graph_connected_component(graph, 0).sum() == graph.shape[0]
dist_matrix[i] = 0 dist_matrix.flat[::N + 1] = 0 return dist_matrix dist_matrix = sparse.csr_matrix(generate_graph(20)) # auto graph_sk = graph_shortest_path(dist_matrix, directed = False) graph_sp = shortest_path(dist_matrix, directed = False) assert_array_almost_equal(graph_sk, graph_sp) # Floyd-Warshall graph_sk = graph_shortest_path(dist_matrix, directed = False, method = 'FW') graph_sp = shortest_path(dist_matrix, directed = False, method = 'FW') assert_array_almost_equal(graph_sk, graph_sp) # Dijkstra's graph_sk = graph_shortest_path(dist_matrix, directed = False, method = 'D') graph_sp = shortest_path(dist_matrix, directed = False, method = 'D') assert_array_almost_equal(graph_sk, graph_sp) from sklearn.utils.sparsetools import connected_components from scipy.sparse.csgraph import connected_components as c_c dist_matrix = sparse.csr_matrix(generate_graph(100)) (n_sk, labs_sk) = connected_components(dist_matrix) (n_sp, labs_sp) = c_c(dist_matrix)
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.arpack import eigsh

# Scratch check: estimate the leading eigenvalues of -lap in two ways
# (through the components fitted by TruncatedSVD and through eigsh) and
# print both so they can be compared by eye.
app = service.prodbox.CinemaService()
try:
    X = app.getWeightedSearchFeatures(15)
    graph = kneighbors_graph(X, 10)
    lap = graph_laplacian(graph, True)

    svd = TruncatedSVD(n_components=30, algorithm="arpack")
    lap = spectral_embedding_._set_diag(lap, 1)
    svd.fit(-lap)
    # NOTE(review): ``*`` is presumably a matrix product here because
    # ``todense()`` yields a numpy matrix -- confirm the type of ``lap``.
    eigenvalues = np.diag(
        svd.components_ * (-lap).todense() * svd.components_.T)
    eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
    print(eigenvalues)
    print(eigenvalues2)

    se = SpectralEmbedding(n_components=30, eigen_solver='arpack',
                           affinity="nearest_neighbors")
    se.fit(X)
finally:
    # Shut the service down even when one of the steps above raises.
    app.quit()

# TODO: check budget distribution, draw budget conditionally
out = connected_components(graph)
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.arpack import eigsh

# Scratch check: the leading eigenvalues of -lap are estimated once from
# the components fitted by TruncatedSVD and once with eigsh, then both
# are printed for a visual comparison.
app = service.prodbox.CinemaService()
try:
    X = app.getWeightedSearchFeatures(15)
    graph = kneighbors_graph(X, 10)
    lap = graph_laplacian(graph, True)

    svd = TruncatedSVD(n_components=30, algorithm="arpack")
    lap = spectral_embedding_._set_diag(lap, 1)
    svd.fit(-lap)
    # NOTE(review): ``*`` presumably acts as a matrix product because
    # ``todense()`` yields a numpy matrix -- confirm the type of ``lap``.
    eigenvalues = np.diag(
        svd.components_ * (-lap).todense() * svd.components_.T)
    eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
    print(eigenvalues)
    print(eigenvalues2)

    se = SpectralEmbedding(n_components=30, eigen_solver='arpack',
                           affinity="nearest_neighbors")
    se.fit(X)
finally:
    # Release the service even if a step above fails.
    app.quit()

# TODO: check budget distribution, draw budget conditionally
out = connected_components(graph)