def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray],
        personalization: Optional[Union[dict, np.ndarray]] = None) -> 'PageRank':
    """Standard PageRank with restart.

    Parameters
    ----------
    adjacency :
        Adjacency matrix.
    personalization :
        If ``None``, the uniform distribution is used.
        Otherwise, a non-negative, non-zero vector or a dictionary must be provided.

    Returns
    -------
    self: :class:`PageRank`
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError("The adjacency is not square. See BiPageRank.")
    rso = RandomSurferOperator(adjacency, self.damping_factor, personalization, False)
    self.scores_ = rso.solve(self.solver, self.n_iter)
    return self
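
# Usage sketch (illustrative, not part of the source): assumes this `fit`
# belongs to scikit-network's PageRank class; the import path, damping factor
# and toy graph below are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.ranking import PageRank

adjacency = sparse.csr_matrix(np.array([[0, 1, 1, 0],
                                        [0, 0, 1, 0],
                                        [1, 0, 0, 1],
                                        [0, 0, 1, 0]]))
pagerank = PageRank(damping_factor=0.85)
scores = pagerank.fit(adjacency).scores_  # one score per node

# Personalized variant: restart on node 0 only.
scores_pers = pagerank.fit(adjacency, personalization={0: 1.}).scores_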
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Harmonic':
    """Harmonic centrality for connected graphs.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Harmonic`
    """
    adjacency = check_format(adjacency)
    n = adjacency.shape[0]
    if not is_square(adjacency):
        raise ValueError("The adjacency is not square. Please use 'bipartite2undirected' or 'bipartite2directed'.")
    indices = np.arange(n)
    paths = shortest_path(adjacency, n_jobs=self.n_jobs, indices=indices)
    # Set the diagonal to 1 so that the inversion below is well defined,
    # then zero it out: a node does not contribute to its own centrality.
    np.fill_diagonal(paths, 1)
    inv = 1 / paths
    np.fill_diagonal(inv, 0)
    self.scores_ = inv.dot(np.ones(n))
    return self
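
# Usage sketch (illustrative, not part of the source): assumes this `fit`
# belongs to scikit-network's Harmonic class; the import path may vary by
# version and the toy graph is an assumption.
import numpy as np
from scipy import sparse
from sknetwork.ranking import Harmonic

# Path graph 0 - 1 - 2: node 1 is the most central.
adjacency = sparse.csr_matrix(np.array([[0, 1, 0],
                                        [1, 0, 1],
                                        [0, 1, 0]]))
scores = Harmonic().fit(adjacency).scores_
# scores[1] == 2 (two neighbors at distance 1); scores[0] == 1 + 1/2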
def largest_connected_component(adjacency: Union[sparse.csr_matrix, np.ndarray], return_labels: bool = False):
    """Extract the largest connected component of a graph.
    Bipartite graphs are treated as undirected ones.

    Parameters
    ----------
    adjacency :
        Adjacency or biadjacency matrix of the graph.
    return_labels : bool
        Whether to return the indices of the new nodes in the original graph.

    Returns
    -------
    new_adjacency : sparse.csr_matrix
        Adjacency or biadjacency matrix of the largest connected component.
    indices : array or tuple of array
        Indices of the nodes in the original graph.
        For biadjacency matrices, ``indices[0]`` corresponds to the rows and ``indices[1]`` to the columns.
    """
    adjacency = check_format(adjacency)
    n_samples, n_features = adjacency.shape
    if not is_square(adjacency):
        bipartite: bool = True
        full_adjacency = sparse.bmat([[None, adjacency], [adjacency.T, None]], format='csr')
    else:
        bipartite: bool = False
        full_adjacency = adjacency

    n_components, labels = connected_components(full_adjacency)
    unique_labels, counts = np.unique(labels, return_counts=True)
    component_label = unique_labels[np.argmax(counts)]
    component_indices = np.where(labels == component_label)[0]

    if bipartite:
        split_ix = np.searchsorted(component_indices, n_samples)
        samples_ix, features_ix = component_indices[:split_ix], component_indices[split_ix:] - n_samples
    else:
        samples_ix, features_ix = component_indices, component_indices
    new_adjacency = adjacency[samples_ix, :]
    new_adjacency = (new_adjacency.tocsc()[:, features_ix]).tocsr()

    if return_labels:
        if bipartite:
            return new_adjacency, (samples_ix, features_ix)
        else:
            return new_adjacency, samples_ix
    else:
        return new_adjacency
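
# Usage sketch (illustrative, not part of the source): the import path below
# is an assumption and has changed across scikit-network versions.
import numpy as np
from scipy import sparse
from sknetwork.utils import largest_connected_component

# Two components: {0, 1} and the isolated node {2}.
adjacency = sparse.csr_matrix(np.array([[0, 1, 0],
                                        [1, 0, 0],
                                        [0, 0, 0]]))
new_adjacency, index = largest_connected_component(adjacency, return_labels=True)
# new_adjacency has shape (2, 2) and index == array([0, 1])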
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Closeness':
    """Closeness centrality for connected graphs.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Closeness`
    """
    adjacency = check_format(adjacency)
    n = adjacency.shape[0]
    if not is_square(adjacency):
        raise ValueError("The adjacency is not square. Please use 'bipartite2undirected' or 'bipartite2directed'.")
    if not is_connected(adjacency):
        raise ValueError("The graph must be connected.")

    if self.method == 'exact':
        nb_samples = n
        indices = np.arange(n)
    elif self.method == 'approximate':
        # The number of sampled sources grows as log(n) / tol^2.
        nb_samples = min(int(log(n) / self.tol ** 2), n)
        indices = np.random.choice(np.arange(n), nb_samples, replace=False)
    else:
        raise ValueError("Method should be either 'exact' or 'approximate'.")

    paths = shortest_path(adjacency, n_jobs=self.n_jobs, indices=indices)
    self.scores_ = ((n - 1) * nb_samples / n) / paths.T.dot(np.ones(nb_samples))
    return self
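
# Usage sketch (illustrative, not part of the source): assumes this `fit`
# belongs to scikit-network's Closeness class with the 'exact' method; the
# import path and toy graph are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.ranking import Closeness

# Star graph: the hub (node 0) has the highest closeness.
adjacency = sparse.csr_matrix(np.array([[0, 1, 1, 1],
                                        [1, 0, 0, 0],
                                        [1, 0, 0, 0],
                                        [1, 0, 0, 0]]))
scores = Closeness(method='exact').fit(adjacency).scores_
# scores[0] == 1.0 (hub at distance 1 from everyone); each leaf gets 3/5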
def cosine_modularity(adjacency, embedding: np.ndarray, col_embedding=None,
                      resolution=1., weights='degree', return_all: bool = False):
    """Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(x_j)}{2}\\right)`

    where :math:`\\pi(x_i)` is the projection of :math:`x_i` onto the unit sphere.

    For bipartite graphs with column embedding :math:`y`, the metric is:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(y_j)}{2}\\right)`

    This metric is normalized to lie between -1 and 1 (for :math:`\\gamma = 1`).

    Parameters
    ----------
    adjacency : sparse.csr_matrix or np.ndarray
        Adjacency matrix of the graph.
    embedding : np.ndarray
        Embedding of the nodes.
    col_embedding : None or np.ndarray
        For biadjacency matrices, embedding of the columns.
    resolution : float
        Resolution parameter.
    weights : ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all : bool, default = ``False``
        Whether to return (fit, diversity, modularity) or modularity alone.

    Returns
    -------
    modularity : float
    fit : float, optional
    diversity : float, optional
    """
    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if col_embedding is None:
        if not is_square(adjacency):
            raise ValueError('col_embedding cannot be None for non-square adjacency matrices.')
        else:
            col_embedding = embedding.copy()

    # Project both embeddings onto the unit sphere.
    # Copy before normalizing so the caller's arrays are not modified in place,
    # and divide only the rows with positive norm to keep the shapes aligned.
    row_norms = np.linalg.norm(embedding, axis=1)
    col_norms = np.linalg.norm(col_embedding, axis=1)
    norm_row_emb = embedding.copy()
    norm_row_emb[row_norms > 0] /= row_norms[row_norms > 0][:, np.newaxis]
    norm_col_emb = col_embedding.copy()
    norm_col_emb[col_norms > 0] /= col_norms[col_norms > 0][:, np.newaxis]

    row_probs = check_probs(weights, adjacency)
    col_probs = check_probs(weights, adjacency.T)

    fit: float = 0.5 * (1 + (np.multiply(norm_row_emb, adjacency.dot(norm_col_emb))).sum() / total_weight)
    div: float = 0.5 * (1 + (embedding.T.dot(row_probs)).dot(col_embedding.T.dot(col_probs)))

    if return_all:
        return fit, div, fit - resolution * div
    else:
        return fit - resolution * div
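
# Usage sketch (illustrative, not part of the source): evaluates a spectral
# embedding with the metric above; the class names are scikit-network's, but
# the import paths and toy graph are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.embedding import Spectral, cosine_modularity

# Two triangles joined by a single edge.
edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
row, col = zip(*edges)
adjacency = sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(6, 6))
adjacency = adjacency + adjacency.T  # symmetrize: undirected graph

embedding = Spectral(embedding_dimension=2).fit(adjacency).embedding_
modularity = cosine_modularity(adjacency, embedding)
fit, diversity, modularity = cosine_modularity(adjacency, embedding, return_all=True)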
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Spectral':
    """Fits the model from data in adjacency.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph (symmetric matrix).

    Returns
    -------
    self: :class:`Spectral`
    """
    adjacency = check_format(adjacency).asfptype()

    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square. See BiSpectral.')

    if not is_symmetric(adjacency):
        raise ValueError('The adjacency matrix is not symmetric. '
                         'Either convert it to a symmetric matrix or use BiSpectral.')

    n = adjacency.shape[0]

    if self.solver == 'auto':
        solver = auto_solver(adjacency.nnz)
        if solver == 'lanczos':
            self.solver: EigSolver = LanczosEig()
        else:
            self.solver: EigSolver = HalkoEig()

    if self.embedding_dimension > n - 2:
        warnings.warn(Warning("The dimension of the embedding must be less than the number of nodes - 1."))
        n_components = n - 2
    else:
        n_components = self.embedding_dimension + 1

    if (self.regularization is None or self.regularization == 0.) and not is_connected(adjacency):
        warnings.warn(Warning("The graph is not connected and low-rank regularization is not active. "
                              "This can cause errors in the computation of the embedding."))

    if isinstance(self.solver, HalkoEig) and not self.normalized_laplacian:
        raise NotImplementedError("Halko solver is not yet compatible with the regular Laplacian. "
                                  "Call 'fit' with 'normalized_laplacian' = True or force the Lanczos solver.")

    weights = adjacency.dot(np.ones(n))
    regularization = self.regularization
    if regularization:
        if self.relative_regularization:
            regularization = regularization * weights.sum() / n ** 2
        weights += regularization * n

    if self.normalized_laplacian:
        # Finding the largest eigenvalues of the normalized adjacency is easier for the solver
        # than finding the smallest eigenvalues of the normalized Laplacian.
        normalizing_matrix = diag_pinv(np.sqrt(weights))

        if regularization:
            norm_adjacency = NormalizedAdjacencyOperator(adjacency, regularization)
        else:
            norm_adjacency = normalizing_matrix.dot(adjacency.dot(normalizing_matrix))

        self.solver.which = 'LA'
        self.solver.fit(matrix=norm_adjacency, n_components=n_components)
        eigenvalues = 1 - self.solver.eigenvalues_
        # Eigenvalues of the Laplacian in increasing order.
        index = np.argsort(eigenvalues)
        # Skip the first (trivial) eigenvalue.
        eigenvalues = eigenvalues[index][1:]
        # Keep only the eigenvectors associated with positive eigenvalues of the normalized adjacency matrix.
        eigenvectors = self.solver.eigenvectors_[:, index][:, 1:] * (eigenvalues < 1 - self.tol)
        embedding = np.array(normalizing_matrix.dot(eigenvectors))
    else:
        if regularization:
            laplacian = LaplacianOperator(adjacency, regularization)
        else:
            weight_matrix = sparse.diags(weights, format='csr')
            laplacian = weight_matrix - adjacency

        self.solver.which = 'SM'
        self.solver.fit(matrix=laplacian, n_components=n_components)
        eigenvalues = self.solver.eigenvalues_[1:]
        embedding = self.solver.eigenvectors_[:, 1:]

    if self.scaling:
        if self.scaling == 'multiply':
            eigenvalues = np.minimum(eigenvalues, 1)
            embedding *= np.sqrt(1 - eigenvalues)
        elif self.scaling == 'divide':
            inv_eigenvalues = np.zeros_like(eigenvalues)
            index = np.where(eigenvalues > 0)[0]
            inv_eigenvalues[index] = 1 / eigenvalues[index]
            embedding *= np.sqrt(inv_eigenvalues)
        else:
            warnings.warn(Warning("The scaling must be 'multiply' or 'divide'. No scaling done."))

    self.embedding_ = embedding
    self.eigenvalues_ = eigenvalues
    self.regularization_ = regularization
    return self
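
# Usage sketch (illustrative, not part of the source): assumes this `fit`
# belongs to scikit-network's Spectral class; the constructor defaults and
# toy graph are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.embedding import Spectral

# 5-node cycle, built as a symmetric CSR matrix.
n = 5
rows = np.arange(n)
cols = (rows + 1) % n
adjacency = sparse.csr_matrix((np.ones(2 * n), (np.r_[rows, cols], np.r_[cols, rows])), shape=(n, n))

spectral = Spectral(embedding_dimension=2)
spectral.fit(adjacency)
print(spectral.embedding_.shape)  # (5, 2)
print(spectral.eigenvalues_)      # Laplacian eigenvalues, smallest first, trivial one skipped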
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Louvain':
    """Clustering using the chosen optimizer.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Louvain`
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square. Use BiLouvain() instead.')
    n = adjacency.shape[0]

    out_weights = check_probs('degree', adjacency)
    in_weights = check_probs('degree', adjacency.T)

    nodes = np.arange(n)
    if self.shuffle_nodes:
        nodes = self.random_state.permutation(nodes)
        adjacency = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

    graph = AggregateGraph(adjacency, out_weights, in_weights)

    membership = sparse.identity(n, format='csr')
    increase = True
    iteration_count = 0
    self.log.print("Starting with", graph.n_nodes, "nodes.")
    while increase:
        iteration_count += 1
        self.algorithm.fit(graph)
        if self.algorithm.score_ <= self.agg_tol:
            increase = False
        else:
            agg_membership = membership_matrix(self.algorithm.labels_)
            membership = membership.dot(agg_membership)
            graph.aggregate(agg_membership)
            if graph.n_nodes == 1:
                break
        self.log.print("Iteration", iteration_count, "completed with", graph.n_nodes, "clusters and",
                       self.algorithm.score_, "increment.")
        if iteration_count == self.max_agg_iter:
            break

    if self.sorted_cluster:
        labels = reindex_clusters(membership.indices)
    else:
        labels = membership.indices
    if self.shuffle_nodes:
        reverse = np.empty(nodes.size, nodes.dtype)
        reverse[nodes] = np.arange(nodes.size)
        labels = labels[reverse]

    self.labels_ = labels
    self.iteration_count_ = iteration_count
    self.aggregate_graph_ = graph.norm_adjacency * adjacency.data.sum()
    return self
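
# Usage sketch (illustrative, not part of the source): assumes this `fit`
# belongs to scikit-network's Louvain class; the import path and toy graph
# are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.clustering import Louvain

# Two triangles joined by a single edge: an obvious two-cluster structure.
edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
row, col = zip(*edges)
adjacency = sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(6, 6))
adjacency = adjacency + adjacency.T  # symmetrize: undirected graph

labels = Louvain().fit(adjacency).labels_
# Expected: labels separate {0, 1, 2} from {3, 4, 5}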
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Paris':
    """Agglomerative clustering using the nearest neighbor chain.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Paris`
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square. Use BiParis() instead.')
    n = adjacency.shape[0]
    sym_adjacency = adjacency + adjacency.T

    weights = self.weights
    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    if self.engine == 'python':
        aggregate_graph = AggregateGraph(sym_adjacency, out_weights, in_weights)

        connected_components = []
        dendrogram = []

        while len(aggregate_graph.cluster_sizes) > 0:
            # Pick an arbitrary remaining cluster to start a new chain.
            node = None
            for node in aggregate_graph.cluster_sizes:
                break
            chain = [node]
            while chain:
                node = chain.pop()
                if aggregate_graph.neighbors[node]:
                    max_sim = -float("inf")
                    nearest_neighbor = None
                    for neighbor in aggregate_graph.neighbors[node]:
                        sim = aggregate_graph.similarity(node, neighbor)
                        if sim > max_sim:
                            nearest_neighbor = neighbor
                            max_sim = sim
                        elif sim == max_sim:
                            nearest_neighbor = min(neighbor, nearest_neighbor)
                    if chain:
                        nearest_neighbor_last = chain.pop()
                        if nearest_neighbor_last == nearest_neighbor:
                            # Reciprocal nearest neighbors: merge the two clusters.
                            dendrogram.append([node, nearest_neighbor, 1. / max_sim,
                                               aggregate_graph.cluster_sizes[node]
                                               + aggregate_graph.cluster_sizes[nearest_neighbor]])
                            aggregate_graph.merge(node, nearest_neighbor)
                        else:
                            chain.append(nearest_neighbor_last)
                            chain.append(node)
                            chain.append(nearest_neighbor)
                    else:
                        chain.append(node)
                        chain.append(nearest_neighbor)
                else:
                    connected_components.append((node, aggregate_graph.cluster_sizes[node]))
                    del aggregate_graph.cluster_sizes[node]

        node, cluster_size = connected_components.pop()
        for next_node, next_cluster_size in connected_components:
            cluster_size += next_cluster_size
            dendrogram.append([node, next_node, float("inf"), cluster_size])
            node = aggregate_graph.next_cluster
            aggregate_graph.next_cluster += 1

        dendrogram = np.array(dendrogram)
        if self.reorder:
            dendrogram = reorder_dendrogram(dendrogram)

        self.dendrogram_ = dendrogram
        return self

    elif self.engine == 'numba':
        n = np.int32(adjacency.shape[0])
        indices, indptr, data = sym_adjacency.indices, sym_adjacency.indptr, sym_adjacency.data

        dendrogram = fit_core(n, out_weights, in_weights, data, indices, indptr)
        dendrogram = np.array(dendrogram)
        if self.reorder:
            dendrogram = reorder_dendrogram(dendrogram)

        self.dendrogram_ = dendrogram
        return self

    else:
        raise ValueError('Unknown engine.')
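
# Usage sketch (illustrative, not part of the source): assumes this `fit`
# belongs to scikit-network's Paris class; 'python' matches the engine branch
# shown above, the import path and toy graph are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.hierarchy import Paris

edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
row, col = zip(*edges)
adjacency = sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(6, 6))
adjacency = adjacency + adjacency.T

dendrogram = Paris(engine='python').fit(adjacency).dendrogram_
# dendrogram has shape (n - 1, 4): each row gives the two merged clusters,
# the merge height (1 / similarity) and the size of the new cluster.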
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized by the mutual information of the graph.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')
    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')
    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')
    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
            node_sampling[t] = node_sampling[node1 - n]
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
            node_sampling[t] = node_sampling[node2 - n]
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        node_sampling[t] += aggregate_graph.cluster_out_weights[node1] * aggregate_graph.cluster_in_weights[node2] + \
            aggregate_graph.cluster_out_weights[node2] * aggregate_graph.cluster_in_weights[node1]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(node1, node2)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(out_weights, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(in_weights, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information
    return score
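
# Usage sketch (illustrative, not part of the source): scores a Paris
# dendrogram with the metric above; the import paths and toy graph are
# assumptions.
import numpy as np
from scipy import sparse
from sknetwork.hierarchy import Paris, tree_sampling_divergence

edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
row, col = zip(*edges)
adjacency = sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(6, 6))
adjacency = adjacency + adjacency.T

dendrogram = Paris().fit(adjacency).dendrogram_
score = tree_sampling_divergence(adjacency, dendrogram)  # in [0, 1] since normalized=True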
def dasgupta_score(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform') -> float:
    """Dasgupta's score of a hierarchy, defined as 1 - Dasgupta's cost.

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    score : float
        Dasgupta's score of the hierarchy, normalized to get a value between 0 and 1.

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')
    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    cluster_weight = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
        height[t] = dendrogram[t][2]
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        cluster_weight[t] = aggregate_graph.cluster_out_weights[node1] + aggregate_graph.cluster_out_weights[node2] \
            + aggregate_graph.cluster_in_weights[node1] + aggregate_graph.cluster_in_weights[node2]
        aggregate_graph.merge(node1, node2)

    cost: float = edge_sampling.dot(cluster_weight) / 2
    return 1 - cost
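
# Usage sketch (illustrative, not part of the source): Dasgupta's score of a
# Paris dendrogram; the import paths and toy graph are assumptions.
import numpy as np
from scipy import sparse
from sknetwork.hierarchy import Paris, dasgupta_score

edges = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]
row, col = zip(*edges)
adjacency = sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(6, 6))
adjacency = adjacency + adjacency.T

dendrogram = Paris().fit(adjacency).dendrogram_
score = dasgupta_score(adjacency, dendrogram)  # between 0 and 1, higher is better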