def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray, LinearOperator],
        seeds: Optional[Union[dict, np.ndarray]] = None) -> 'PageRank':
    """Fit algorithm to data.

    Parameters
    ----------
    adjacency :
        Adjacency matrix.
    seeds :
        Parameter to be used for Personalized PageRank.
        Restart distribution as a vector or a dict (node: weight).
        If ``None``, the uniform distribution is used (no personalization, default).

    Returns
    -------
    self: :class:`PageRank`
    """
    if not isinstance(adjacency, LinearOperator):
        adjacency = check_format(adjacency)
    check_square(adjacency)
    seeds = seeds2probs(adjacency.shape[0], seeds)
    self.scores_ = get_pagerank(adjacency, seeds, damping_factor=self.damping_factor,
                                n_iter=self.n_iter, solver=self.solver, tol=self.tol)
    return self
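
# Usage sketch (not part of the method above): personalized PageRank through the `seeds`
# argument, assuming the public sknetwork.ranking.PageRank class wraps this `fit`.
# The restart distribution is given as a dict {node: weight}.
from sknetwork.data import karate_club
from sknetwork.ranking import PageRank

adjacency = karate_club()
pagerank = PageRank(damping_factor=0.85)
scores = pagerank.fit(adjacency, seeds={0: 1.}).scores_  # restart always at node 0
top_nodes = scores.argsort()[::-1][:5]  # nodes closest to node 0 in the PageRank sense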
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Harmonic':
    """Harmonic centrality for connected graphs.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Harmonic`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n = adjacency.shape[0]
    indices = np.arange(n)

    paths = shortest_path(adjacency, n_jobs=self.n_jobs, indices=indices)

    np.fill_diagonal(paths, 1)
    inv = (1 / paths)
    np.fill_diagonal(inv, 0)
    self.scores_ = inv.dot(np.ones(n))
    return self
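
# Usage sketch, assuming sknetwork.ranking.Harmonic exposes the `fit` above.
# The harmonic centrality of node i is the sum over j != i of 1 / d(i, j).
from sknetwork.data import karate_club
from sknetwork.ranking import Harmonic

adjacency = karate_club()
harmonic = Harmonic()
scores = harmonic.fit(adjacency).scores_  # one centrality value per node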
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'LaplacianEmbedding':
    """Compute the graph embedding.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph (symmetric matrix).

    Returns
    -------
    self: :class:`LaplacianEmbedding`
    """
    adjacency = check_format(adjacency).asfptype()
    check_square(adjacency)
    check_symmetry(adjacency)
    n = adjacency.shape[0]

    regularize: bool = not (self.regularization is None or self.regularization == 0.)
    check_scaling(self.scaling, adjacency, regularize)

    if regularize:
        solver: EigSolver = LanczosEig()
    else:
        solver = set_solver(self.solver, adjacency)
    n_components = 1 + check_n_components(self.n_components, n - 2)

    weights = adjacency.dot(np.ones(n))
    regularization = self.regularization
    if regularization:
        if self.relative_regularization:
            regularization = regularization * weights.sum() / n**2
        weights += regularization * n
        laplacian = LaplacianOperator(adjacency, regularization)
    else:
        weight_diag = sparse.diags(weights, format='csr')
        laplacian = weight_diag - adjacency

    solver.which = 'SM'
    solver.fit(matrix=laplacian, n_components=n_components)
    eigenvalues = solver.eigenvalues_[1:]
    eigenvectors = solver.eigenvectors_[:, 1:]

    embedding = eigenvectors.copy()

    if self.scaling:
        eigenvalues_inv_diag = diag_pinv(eigenvalues**self.scaling)
        embedding = eigenvalues_inv_diag.dot(embedding.T).T

    if self.normalized:
        embedding = normalize(embedding, p=2)

    self.embedding_ = embedding
    self.eigenvalues_ = eigenvalues
    self.eigenvectors_ = eigenvectors
    self.regularization_ = regularization

    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Closeness':
    """Closeness centrality for connected graphs.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Closeness`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_connected(adjacency)
    n = adjacency.shape[0]

    if self.method == 'exact':
        n_sources = n
        sources = np.arange(n)
    elif self.method == 'approximate':
        n_sources = min(int(log(n) / self.tol**2), n)
        sources = np.random.choice(np.arange(n), n_sources, replace=False)
    else:
        raise ValueError("Method should be either 'exact' or 'approximate'.")

    dists = distance(adjacency, n_jobs=self.n_jobs, sources=sources)

    self.scores_ = ((n - 1) * n_sources / n) / dists.T.dot(np.ones(n_sources))

    return self
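
# Usage sketch, assuming sknetwork.ranking.Closeness exposes the `fit` above.
# The 'approximate' method samples about log(n) / tol**2 source nodes instead of all n.
from sknetwork.data import karate_club
from sknetwork.ranking import Closeness

adjacency = karate_club()  # connected graph, as required by check_connected
closeness = Closeness(method='approximate', tol=0.1)
scores = closeness.fit(adjacency).scores_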
def dasgupta_cost(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform',
                  normalized: bool = False) -> float:
    """Dasgupta's cost of a hierarchy.

    Expected size (weights = ``'uniform'``) or expected volume (weights = ``'degree'``)
    of the cluster induced by random edge sampling
    (closest ancestor of the two nodes in the hierarchy).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` or ``'uniform'`` (default).
    normalized :
        If ``True``, normalized cost (between 0 and 1).

    Returns
    -------
    cost : float
        Cost.

    Example
    -------
    >>> from sknetwork.hierarchy import dasgupta_cost, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> cost = dasgupta_cost(adjacency, dendrogram)
    >>> np.round(cost, 2)
    3.33

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    edge_sampling, _, cluster_weight = get_sampling_distributions(adjacency, dendrogram, weights)

    cost = edge_sampling.dot(cluster_weight)

    if not normalized:
        if weights == 'degree':
            cost *= adjacency.data.sum()
        else:
            cost *= n

    return cost
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray],
        seeds: Optional[Union[dict, np.ndarray]] = None,
        initial_state: Optional = None) -> 'Diffusion':
    """Compute the diffusion (temperature at equilibrium).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    seeds :
        Temperatures of border nodes (dictionary or vector). Negative temperatures ignored.
    initial_state :
        Initial state of temperatures.

    Returns
    -------
    self: :class:`Diffusion`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n: int = adjacency.shape[0]
    if seeds is None:
        self.scores_ = np.ones(n) / n
        return self

    seeds = check_seeds(seeds, n)
    b, border = limit_conditions(seeds)
    tmin, tmax = np.min(b[border]), np.max(b)

    interior: sparse.csr_matrix = sparse.diags(~border, shape=(n, n), format='csr', dtype=float)
    diffusion_matrix = interior.dot(normalize(adjacency))

    if initial_state is None:
        if tmin != tmax:
            initial_state = b[border].mean() * np.ones(n)
        else:
            initial_state = np.zeros(n)
    initial_state[border] = b[border]

    if self.n_iter > 0:
        scores = initial_state
        for i in range(self.n_iter):
            scores = diffusion_matrix.dot(scores)
            scores[border] = b[border]
    else:
        a = sparse.eye(n, format='csr', dtype=float) - diffusion_matrix
        scores, info = bicgstab(a, b, atol=0., x0=initial_state)
        self._scipy_solver_info(info)

    if tmin != tmax:
        self.scores_ = np.clip(scores, tmin, tmax)
    else:
        self.scores_ = scores
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray],
        seeds: Optional[Union[dict, np.ndarray]] = None,
        init: Optional[float] = None) -> 'Dirichlet':
    """Compute the solution to the Dirichlet problem (temperatures at equilibrium).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    seeds :
        Temperatures of seed nodes (dictionary or vector). Negative temperatures ignored.
    init :
        Temperature of non-seed nodes in initial state.
        If ``None``, use the average temperature of seed nodes (default).

    Returns
    -------
    self: :class:`Dirichlet`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n: int = adjacency.shape[0]
    if seeds is None:
        self.scores_ = np.ones(n) / n
        return self

    seeds = check_seeds(seeds, n)
    border = (seeds >= 0)

    if init is None:
        scores = seeds[border].mean() * np.ones(n)
    else:
        scores = init * np.ones(n)
    scores[border] = seeds[border]

    if self.n_iter > 0:
        diffusion = DirichletOperator(adjacency, self.damping_factor, border)
        for i in range(self.n_iter):
            scores = diffusion.dot(scores)
            scores[border] = seeds[border]
    else:
        a = DeltaDirichletOperator(adjacency, self.damping_factor, border)
        b = -seeds
        b[~border] = 0
        scores, info = bicgstab(a, b, atol=0., x0=scores)
        self._scipy_solver_info(info)

    tmin, tmax = seeds[border].min(), seeds[border].max()
    self.scores_ = np.clip(scores, tmin, tmax)
    return self
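
# Usage sketch, assuming sknetwork.ranking.Dirichlet exposes the `fit` above.
# Seed nodes keep fixed temperatures; the other nodes converge to the equilibrium solution.
from sknetwork.data import karate_club
from sknetwork.ranking import Dirichlet

adjacency = karate_club()
dirichlet = Dirichlet()
scores = dirichlet.fit(adjacency, seeds={0: 1., 33: 0.}).scores_  # hot node 0, cold node 33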
def __init__(self, adjacency: Union[sparse.csr_matrix, np.ndarray], coeffs: np.ndarray):
    if coeffs.shape[0] == 0:
        raise ValueError('A polynomial requires at least one coefficient.')
    adjacency = check_format(adjacency)
    check_square(adjacency)
    shape = adjacency.shape
    dtype = adjacency.dtype
    super(Polynome, self).__init__(dtype=dtype, shape=shape)

    self.adjacency = adjacency
    self.coeffs = coeffs
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray],
        seeds: Optional[Union[dict, np.ndarray]] = None,
        init: Optional[float] = None) -> 'Diffusion':
    """Compute the diffusion (temperatures at equilibrium).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    seeds :
        Temperatures of seed nodes in initial state (dictionary or vector).
        Negative temperatures ignored.
    init :
        Temperature of non-seed nodes in initial state.
        If ``None``, use the average temperature of seed nodes (default).

    Returns
    -------
    self: :class:`Diffusion`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n: int = adjacency.shape[0]
    if seeds is None:
        self.scores_ = np.ones(n) / n
        return self

    seeds = check_seeds(seeds, n)
    border = (seeds >= 0)

    if init is None:
        scores = seeds[border].mean() * np.ones(n)
    else:
        scores = init * np.ones(n)
    scores[border] = seeds[border]

    diffusion = DirichletOperator(adjacency, self.damping_factor)
    for i in range(self.n_iter):
        scores = diffusion.dot(scores)

    self.scores_ = scores
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'RandomProjection':
    """Compute the graph embedding.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`RandomProjection`
    """
    adjacency = check_format(adjacency).asfptype()
    check_square(adjacency)
    n = adjacency.shape[0]

    random_generator = check_random_state(self.random_state)
    random_matrix = random_generator.normal(size=(n, self.n_components))
    # make the matrix orthogonal
    random_matrix, _ = np.linalg.qr(random_matrix)

    factor = random_matrix
    embedding = factor.copy()

    if self.random_walk:
        transition = normalize(adjacency)
    else:
        transition = adjacency

    for t in range(self.n_iter):
        factor = self.alpha * transition.dot(factor)
        embedding += factor

    if self.normalized:
        embedding = normalize(embedding, p=2)

    self.embedding_ = embedding
    return self
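
# Usage sketch, assuming the class is exposed as sknetwork.embedding.RandomProjection
# (the import path is an assumption). The embedding accumulates alpha**t * T**t * G
# for an orthogonalized Gaussian random matrix G.
from sknetwork.data import karate_club
from sknetwork.embedding import RandomProjection

adjacency = karate_club()
projection = RandomProjection(n_components=2)
embedding = projection.fit(adjacency).embedding_  # shape (n, 2)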
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'LouvainHierarchy':
    """Fit algorithm to data.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`LouvainHierarchy`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    tree = self._recursive_louvain(adjacency, self.depth)
    dendrogram, _ = get_dendrogram(tree)
    dendrogram = np.array(dendrogram)
    dendrogram[:, 2] -= min(dendrogram[:, 2])

    self.dendrogram_ = reorder_dendrogram(dendrogram)
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray, LinearOperator],
        seeds: Optional[Union[dict, np.ndarray]] = None) -> 'PageRank':
    """Fit algorithm to data.

    Parameters
    ----------
    adjacency :
        Adjacency matrix.
    seeds :
        If ``None``, the uniform distribution is used.
        Otherwise, a non-negative, non-zero vector or a dictionary must be provided.

    Returns
    -------
    self: :class:`PageRank`
    """
    if not isinstance(adjacency, LinearOperator):
        adjacency = check_format(adjacency)
    check_square(adjacency)
    seeds = seeds2probs(adjacency.shape[0], seeds)
    self.scores_ = get_pagerank(adjacency, seeds, damping_factor=self.damping_factor,
                                n_iter=self.n_iter, solver=self.solver, tol=self.tol)
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Louvain':
    """Fit algorithm to the data.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Louvain`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    n_nodes = adjacency.shape[0]
    probs_out = check_probs('degree', adjacency)
    probs_in = check_probs('degree', adjacency.T)

    nodes = np.arange(n_nodes)
    if self.shuffle_nodes:
        nodes = self.random_state.permutation(nodes)
        adjacency = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

    adjacency_norm = adjacency / adjacency.data.sum()

    membership = sparse.identity(n_nodes, format='csr')
    increase = True
    count_aggregations = 0
    self.log.print("Starting with", n_nodes, "nodes.")
    while increase:
        count_aggregations += 1

        current_labels, pass_increase = self._optimize(n_nodes, adjacency_norm, probs_out, probs_in)
        _, current_labels = np.unique(current_labels, return_inverse=True)

        if pass_increase <= self.tol_aggregation:
            increase = False
        else:
            membership_agg = membership_matrix(current_labels)
            membership = membership.dot(membership_agg)
            n_nodes, adjacency_norm, probs_out, probs_in = self._aggregate(adjacency_norm, probs_out,
                                                                           probs_in, membership_agg)

            if n_nodes == 1:
                break
        self.log.print("Aggregation", count_aggregations, "completed with", n_nodes, "clusters and ",
                       pass_increase, "increment.")
        if count_aggregations == self.n_aggregations:
            break

    if self.sort_clusters:
        labels = reindex_labels(membership.indices)
    else:
        labels = membership.indices
    if self.shuffle_nodes:
        reverse = np.empty(nodes.size, nodes.dtype)
        reverse[nodes] = np.arange(nodes.size)
        labels = labels[reverse]

    self.labels_ = labels
    self._secondary_outputs(adjacency)

    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], position_init: Optional[np.ndarray] = None,
        n_iter: Optional[int] = None) -> 'Spring':
    """Compute layout.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph, treated as undirected.
    position_init : np.ndarray
        Custom initial positions of the nodes. Shape must be (n, 2).
        If ``None``, use the value of self.position_init.
    n_iter : int
        Number of iterations to update positions.
        If ``None``, use the value of self.n_iter.

    Returns
    -------
    self: :class:`Spring`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    if not is_symmetric(adjacency):
        adjacency = directed2undirected(adjacency)
    n = adjacency.shape[0]

    position = np.zeros((n, 2))
    if position_init is None:
        if self.position_init == 'random':
            position = np.random.randn(n, 2)
        elif self.position_init == 'spectral':
            position = Spectral(n_components=2, normalized=False).fit_transform(adjacency)
    elif isinstance(position_init, np.ndarray):
        if position_init.shape == (n, 2):
            position = position_init.copy()
        else:
            raise ValueError('Initial position has invalid shape.')
    else:
        raise TypeError('Initial position must be a numpy array.')

    if n_iter is None:
        n_iter = self.n_iter

    if self.strength is None:
        strength = np.sqrt((1 / n))
    else:
        strength = self.strength

    delta_x: float = position[:, 0].max() - position[:, 0].min()
    delta_y: float = position[:, 1].max() - position[:, 1].min()
    step_max: float = 0.1 * max(delta_x, delta_y)
    step: float = step_max / (n_iter + 1)
    delta = np.zeros((n, 2))
    for iteration in range(n_iter):
        delta *= 0
        for i in range(n):
            indices = adjacency.indices[adjacency.indptr[i]:adjacency.indptr[i + 1]]
            data = adjacency.data[adjacency.indptr[i]:adjacency.indptr[i + 1]]

            grad: np.ndarray = (position[i] - position)  # shape (n, 2)
            distance: np.ndarray = np.linalg.norm(grad, axis=1)  # shape (n,)
            distance = np.where(distance < 0.01, 0.01, distance)

            attraction = np.zeros(n)
            attraction[indices] += data * distance[indices] / strength

            repulsion = (strength / distance)**2

            delta[i]: np.ndarray = (grad * (repulsion - attraction)[:, np.newaxis]).sum(axis=0)  # shape (2,)
        length = np.linalg.norm(delta, axis=0)
        length = np.where(length < 0.01, 0.1, length)
        delta = delta * step_max / length
        position += delta
        step_max -= step
        err: float = np.linalg.norm(delta) / n
        if err < self.tol:
            break

    self.embedding_ = position
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Louvain':
    """Fit algorithm to the data.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Louvain`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n = adjacency.shape[0]

    if self.modularity == 'potts':
        probs_ou = check_probs('uniform', adjacency)
        probs_in = probs_ou.copy()
    elif self.modularity == 'newman':
        probs_ou = check_probs('degree', adjacency)
        probs_in = probs_ou.copy()
    elif self.modularity == 'dugue':
        probs_ou = check_probs('degree', adjacency)
        probs_in = check_probs('degree', adjacency.T)
    else:
        raise ValueError('Unknown modularity function.')

    nodes = np.arange(n, dtype=np.int32)
    if self.shuffle_nodes:
        nodes = self.random_state.permutation(nodes)
        adjacency = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

    adjacency_clust = adjacency / adjacency.data.sum()

    membership = sparse.identity(n, format='csr')
    increase = True
    count_aggregations = 0
    self.log.print("Starting with", n, "nodes.")
    while increase:
        count_aggregations += 1

        labels_clust, pass_increase = self._optimize(adjacency_clust, probs_ou, probs_in)
        _, labels_clust = np.unique(labels_clust, return_inverse=True)

        if pass_increase <= self.tol_aggregation:
            increase = False
        else:
            membership_clust = membership_matrix(labels_clust)
            membership = membership.dot(membership_clust)
            adjacency_clust, probs_ou, probs_in = self._aggregate(adjacency_clust, probs_ou, probs_in,
                                                                  membership_clust)

            n = adjacency_clust.shape[0]
            if n == 1:
                break
        self.log.print("Aggregation", count_aggregations, "completed with", n, "clusters and ",
                       pass_increase, "increment.")
        if count_aggregations == self.n_aggregations:
            break

    if self.sort_clusters:
        labels = reindex_labels(membership.indices)
    else:
        labels = membership.indices
    if self.shuffle_nodes:
        reverse = np.empty(nodes.size, nodes.dtype)
        reverse[nodes] = np.arange(nodes.size)
        labels = labels[reverse]

    self.labels_ = labels
    self._secondary_outputs(adjacency)

    return self
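
# Usage sketch, assuming sknetwork.clustering.Louvain exposes the `fit` above.
# 'dugue' uses out-degrees and in-degrees separately, which matters for directed graphs;
# 'newman' and 'potts' are the classical undirected variants.
from sknetwork.data import karate_club
from sknetwork.clustering import Louvain

adjacency = karate_club()
louvain = Louvain(modularity='newman', shuffle_nodes=True, random_state=42)
labels = louvain.fit(adjacency).labels_  # one cluster label per node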
def dasgupta_cost(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform',
                  normalized: bool = False) -> float:
    """Dasgupta's cost of a hierarchy.

    * Graphs
    * Digraphs

    Expected size (weights = ``'uniform'``) or expected weight (weights = ``'degree'``)
    of the cluster induced by random edge sampling
    (closest ancestor of the two nodes in the hierarchy).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` or ``'uniform'`` (default).
    normalized :
        If ``True``, normalized cost (between 0 and 1).

    Returns
    -------
    cost : float
        Cost.

    Example
    -------
    >>> from sknetwork.hierarchy import dasgupta_cost, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> cost = dasgupta_cost(adjacency, dendrogram)
    >>> np.round(cost, 2)
    3.33

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    aggregate_graph, height, edge_sampling, cluster_weight, _, _ = _instanciate_vars(adjacency, weights)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
        height[t] = dendrogram[t][2]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        cluster_weight[t] = aggregate_graph.cluster_out_weights[i] + aggregate_graph.cluster_out_weights[j] \
            + aggregate_graph.cluster_in_weights[i] + aggregate_graph.cluster_in_weights[j]
        aggregate_graph.merge(i, j)

    cost: float = edge_sampling.dot(cluster_weight) / 2

    if not normalized:
        if weights == 'degree':
            cost *= adjacency.data.sum()
        else:
            cost *= n

    return cost
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()

    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = _instanciate_vars(
        adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))

        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))

        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information

    return score
def randomized_eig(matrix, n_components: int, which='LM', n_oversamples: int = 10, n_iter='auto',
                   power_iteration_normalizer: Union[str, None] = 'auto', random_state=None,
                   one_pass: bool = False):
    """Truncated randomized eigenvalue decomposition.

    Parameters
    ----------
    matrix: ndarray or sparse matrix
        Matrix to decompose
    n_components: int
        Number of singular values and vectors to extract.
    which: str
        Which eigenvalues to compute. ``'LM'`` for Largest Magnitude and ``'SM'`` for Smallest Magnitude.
        Any other entry will result in Largest Magnitude.
    n_oversamples : int (default=10)
        Additional number of random vectors to sample the range of ``matrix`` so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of ``matrix`` is ``n_components + n_oversamples``.
        Smaller number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values.
    n_iter: int or 'auto' (default is 'auto')
        See :meth:`randomized_range_finder`
    power_iteration_normalizer: ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None``
        See :meth:`randomized_range_finder`
    random_state: int, RandomState instance or None, optional (default=None)
        See :meth:`randomized_range_finder`
    one_pass: bool (default=False)
        Whether to use algorithm 5.6 instead of 5.3. 5.6 requires less access to the original matrix,
        while 5.3 is more accurate.

    Returns
    -------
    eigenvalues: np.ndarray
    eigenvectors: np.ndarray

    References
    ----------
    Finding structure with randomness: Stochastic algorithms for constructing
    approximate matrix decompositions
    Halko, et al., 2009
    http://arxiv.org/abs/arXiv:0909.4061
    """
    check_square(adjacency=matrix)
    random_state = check_random_state(random_state)
    n_random = n_components + n_oversamples

    shift_value: float = 0.  # upper bound on spectral radius
    if which == 'SM':
        try:
            shift_value = (abs(matrix).dot(np.ones(matrix.shape[1]))).max()
        except TypeError:
            shift_value: float = 1.1 * randomized_eig(matrix, n_components=1)[0][0]
        matrix *= -1
        if isinstance(matrix, SparseLR):
            matrix += shift_value * sparse.identity(matrix.shape[0], format='csr')
        else:
            matrix += shift_value * sparse.identity(matrix.shape[0])

    if n_iter == 'auto':
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA.
        n_iter = 7 if n_components < .1 * min(matrix.shape) else 4

    range_matrix, random_matrix, random_proj = randomized_range_finder(matrix, n_random, n_iter,
                                                                       power_iteration_normalizer,
                                                                       random_state, True)
    if one_pass:
        approx_matrix = np.linalg.lstsq(random_matrix.T.dot(range_matrix),
                                        random_proj.T.dot(range_matrix), None)[0].T
    else:
        approx_matrix = (matrix.dot(range_matrix)).T.dot(range_matrix)

    eigenvalues, eigenvectors = np.linalg.eig(approx_matrix)
    del approx_matrix
    # eigenvalues indices in decreasing order
    values_order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[values_order]
    eigenvectors = np.dot(range_matrix, eigenvectors)[:, values_order]

    if which == 'SM':
        eigenvalues = shift_value - eigenvalues

    return eigenvalues[:n_components], eigenvectors[:, :n_components]
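
# Usage sketch of the helper above on a small symmetric matrix, assuming it is importable
# together with its internal dependencies (check_square, randomized_range_finder, SparseLR),
# e.g. from sknetwork.linalg; the import path is an assumption.
import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
dense = rng.rand(20, 20)
matrix = sparse.csr_matrix(dense + dense.T)  # symmetric, so the spectrum is real
eigenvalues, eigenvectors = randomized_eig(matrix, n_components=3, random_state=0)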
def modularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray,
               weights: Union[str, np.ndarray] = 'degree', weights_in: Union[str, np.ndarray] = 'degree',
               resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Modularity of a clustering (node partition).

    * Graphs
    * Digraphs

    The modularity of a clustering is

    :math:`Q = \\sum_{i,j}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w_iw_j}{w^2}\\right)\\delta_{c_i,c_j}`
    for graphs,

    :math:`Q = \\sum_{i,j}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w^+_iw^-_j}{w^2}\\right)\\delta_{c_i,c_j}`
    for digraphs,

    where

    * :math:`c_i` is the cluster of node :math:`i`,\n
    * :math:`w_i` is the weight of node :math:`i`,\n
    * :math:`w^+_i, w^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    Parameters
    ----------
    adjacency:
        Adjacency matrix of the graph.
    labels:
        Labels of nodes, vector of size :math:`n`.
    weights :
        Weights of nodes. ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_in :
        In-weights of nodes. ``'degree'`` (default), ``'uniform'`` or custom weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Example
    -------
    >>> from sknetwork.clustering import modularity
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> labels = np.array([0, 0, 1, 1, 0])
    >>> np.round(modularity(adjacency, labels), 2)
    0.11
    """
    adjacency = check_format(adjacency).astype(float)
    check_square(adjacency)

    if len(labels) != adjacency.shape[0]:
        raise ValueError('Dimension mismatch between labels and adjacency matrix.')

    probs_row = check_probs(weights, adjacency)
    probs_col = check_probs(weights_in, adjacency.T)
    membership = membership_matrix(labels)

    fit: float = membership.multiply(adjacency.dot(membership)).data.sum() / adjacency.data.sum()
    div: float = membership.T.dot(probs_col).dot(membership.T.dot(probs_row))
    mod: float = fit - resolution * div
    if return_all:
        return mod, fit, div
    else:
        return mod
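
# Usage sketch of return_all: modularity decomposes as fit - resolution * diversity.
import numpy as np
from sknetwork.data import house
from sknetwork.clustering import modularity

adjacency = house()
labels = np.array([0, 0, 1, 1, 0])
mod, fit, div = modularity(adjacency, labels, return_all=True)
assert np.isclose(mod, fit - div)  # with the default resolution = 1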
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], position_init: Optional[np.ndarray] = None,
        n_iter: Optional[int] = None) -> 'Spring':
    """Compute layout.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph, treated as undirected.
    position_init : np.ndarray
        Custom initial positions of the nodes. Shape must be (n, n_components).
        If ``None``, use the value of self.position_init.
    n_iter : int
        Number of iterations to update positions.
        If ``None``, use the value of self.n_iter.

    Returns
    -------
    self: :class:`Spring`
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    if not is_symmetric(adjacency):
        adjacency = directed2undirected(adjacency)
    n = adjacency.shape[0]

    position = np.zeros((n, self.n_components))
    if position_init is None:
        if self.position_init == 'random':
            position = np.random.randn(n, self.n_components)
        elif self.position_init == 'spectral':
            position = Spectral(n_components=self.n_components, normalized=False).fit_transform(adjacency)
    elif isinstance(position_init, np.ndarray):
        if position_init.shape == (n, self.n_components):
            position = position_init.copy()
        else:
            raise ValueError('Initial position has invalid shape.')
    else:
        raise TypeError('Initial position must be a numpy array.')

    if n_iter is None:
        n_iter = self.n_iter

    if self.strength is None:
        strength = np.sqrt((1 / n))
    else:
        strength = self.strength

    pos_max = position.max(axis=0)
    pos_min = position.min(axis=0)
    step_max: float = 0.1 * (pos_max - pos_min).max()
    step: float = step_max / (n_iter + 1)
    tree = None
    delta = np.zeros((n, self.n_components))
    for iteration in range(n_iter):
        delta *= 0
        if self.approx_radius > 0:
            tree = cKDTree(position)

        for i in range(n):
            # attraction
            indices = adjacency.indices[adjacency.indptr[i]:adjacency.indptr[i+1]]
            attraction = adjacency.data[adjacency.indptr[i]:adjacency.indptr[i+1]] / strength

            grad = position[i] - position[indices]
            attraction *= np.linalg.norm(grad, axis=1)
            attraction = (grad * attraction[:, np.newaxis]).sum(axis=0)

            # repulsion
            if tree is None:
                grad: np.ndarray = (position[i] - position)  # shape (n, n_components)
                distance: np.ndarray = np.linalg.norm(grad, axis=1)  # shape (n,)
            else:
                neighbors = tree.query_ball_point(position[i], self.approx_radius)
                grad: np.ndarray = (position[i] - position[neighbors])  # shape (n_neigh, n_components)
                distance: np.ndarray = np.linalg.norm(grad, axis=1)  # shape (n_neigh,)

            distance = np.where(distance < 0.01, 0.01, distance)
            repulsion = (grad * (strength / distance)[:, np.newaxis] ** 2).sum(axis=0)

            # total force
            delta[i]: np.ndarray = repulsion - attraction

        length = np.linalg.norm(delta, axis=0)
        length = np.where(length < 0.01, 0.1, length)
        delta = delta * step_max / length
        position += delta
        step_max -= step
        err: float = np.linalg.norm(delta) / n
        if err < self.tol:
            break

    self.embedding_ = position
    return self
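
# Usage sketch, assuming sknetwork.embedding.Spring exposes the `fit` above
# (force-directed layout, 2-dimensional by default).
from sknetwork.data import karate_club
from sknetwork.embedding import Spring

adjacency = karate_club()
spring = Spring()
position = spring.fit(adjacency).embedding_  # one point per node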
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()

    edge_sampling, node_sampling, _ = get_sampling_distributions(adjacency, dendrogram, weights)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        weights_row = get_probs(weights, adjacency)
        weights_col = get_probs(weights, adjacency.T)
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))

        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))

        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        if mutual_information > 0:
            score /= mutual_information

    return score
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], pos_init: Optional[np.ndarray] = None,
        n_iter: Optional[int] = None) -> 'ForceAtlas':
    """Compute layout.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph, treated as undirected.
    pos_init :
        Position to start with. Random if not provided.
    n_iter : int
        Number of iterations to update positions.
        If ``None``, use the value of self.n_iter.

    Returns
    -------
    self: :class:`ForceAtlas`
    """
    # verify the format of the adjacency matrix
    adjacency = check_format(adjacency)
    check_square(adjacency)
    if not is_symmetric(adjacency):
        adjacency = directed2undirected(adjacency)
    n = adjacency.shape[0]

    # setting of the tolerance according to the size of the graph
    if n < 5000:
        tolerance = 0.1
    elif 5000 <= n < 50000:  # pragma: no cover
        tolerance = 1
    else:  # pragma: no cover
        tolerance = 10

    if n_iter is None:
        n_iter = self.n_iter

    # initial position of the nodes of the graph
    if pos_init is None:
        position: np.ndarray = np.random.randn(n, self.n_components)
    else:
        if pos_init.shape != (n, self.n_components):
            raise ValueError('The initial position does not have valid dimensions.')
        else:
            position = pos_init

    # compute the vector with the degree of each node
    degree: np.ndarray = adjacency.dot(np.ones(adjacency.shape[1])) + 1

    # initialization of variation of position of nodes
    resultants = np.zeros(n)
    delta: np.ndarray = np.zeros((n, self.n_components))
    swing_vector: np.ndarray = np.zeros(n)
    global_speed = 1

    for iteration in range(n_iter):
        delta *= 0
        global_swing = 0
        global_traction = 0

        if self.approx_radius > 0:
            tree = cKDTree(position)
        else:
            tree = None

        for i in range(n):
            # attraction
            indices = adjacency.indices[adjacency.indptr[i]:adjacency.indptr[i + 1]]
            attraction = position[i] - position[indices]

            if self.lin_log:
                attraction = np.sign(attraction) * np.log(1 + np.abs(10 * attraction))
            attraction = attraction.sum(axis=0)

            # repulsion
            if tree is None:
                neighbors = np.arange(n)
            else:
                neighbors = tree.query_ball_point(position[i], self.approx_radius)

            grad: np.ndarray = (position[i] - position[neighbors])  # shape (n_neigh, n_components)
            distance: np.ndarray = np.linalg.norm(grad, axis=1)  # shape (n_neigh,)
            distance = np.where(distance < 0.01, 0.01, distance)

            repulsion = grad * (degree[neighbors] / distance)[:, np.newaxis]
            repulsion *= self.repulsive_factor * degree[i]
            repulsion = repulsion.sum(axis=0)

            # gravity
            gravity = self.gravity_factor * degree[i] * grad
            gravity = gravity.sum(axis=0)

            # forces resultant applied on node i for traction, swing and speed computation
            force = repulsion - attraction - gravity
            resultant_new: float = np.linalg.norm(force)
            resultant_old: float = resultants[i]
            swing_node: float = np.abs(resultant_new - resultant_old)  # force variation applied on node i
            swing_vector[i] = swing_node
            global_swing += (degree[i] + 1) * swing_node
            traction: float = np.abs(resultant_new + resultant_old) / 2  # traction force applied on node i
            global_traction += (degree[i] + 1) * traction

            node_speed = self.speed * global_speed / (1 + global_speed * np.sqrt(swing_node))
            if node_speed > self.speed_max / resultant_new:  # pragma: no cover
                node_speed = self.speed_max / resultant_new

            delta[i]: np.ndarray = node_speed * force
            resultants[i] = resultant_new

        global_speed = tolerance * global_traction / global_swing
        position += delta  # calculating displacement and final position of points after iteration

        if (swing_vector < 1).all():
            break  # if the swing of all nodes is zero, then convergence is reached and we break.

    self.embedding_ = position
    return self
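
# Usage sketch, assuming sknetwork.embedding.ForceAtlas exposes the `fit` above.
from sknetwork.data import karate_club
from sknetwork.embedding import ForceAtlas

adjacency = karate_club()
force_atlas = ForceAtlas()
position = force_atlas.fit(adjacency).embedding_  # layout in the plane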
def cosine_modularity(adjacency, embedding: np.ndarray, embedding_col=None, resolution=1., weights='degree',
                      return_all: bool = False):
    """Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w^+_iw^-_j}{w^2}\\right) \\left(\\dfrac{1 + \\cos(x_i, x_j)}{2}\\right)`

    where

    * :math:`w^+_i, w^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight of the graph.

    For bipartite graphs with column embedding :math:`y`, the metric is

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_{1,i}w_{2,j}}{w^2}\\right) \\left(\\dfrac{1 + \\cos(x_i, y_j)}{2}\\right)`

    where

    * :math:`w_{1,i}, w_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w = 1^TB1` is the total weight of the graph.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    embedding :
        Embedding of the nodes.
    embedding_col :
        Embedding of the columns (for bipartite graphs).
    resolution :
        Resolution parameter.
    weights : ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all :
        If ``True``, also return fit and diversity

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Example
    -------
    >>> from sknetwork.embedding import cosine_modularity
    >>> from sknetwork.data import karate_club
    >>> graph = karate_club(metadata=True)
    >>> adjacency = graph.adjacency
    >>> embedding = graph.position
    >>> np.round(cosine_modularity(adjacency, embedding), 2)
    0.35
    """
    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if embedding_col is None:
        check_square(adjacency)
        embedding_col = embedding.copy()

    embedding_row_norm = normalize(embedding, p=2)
    embedding_col_norm = normalize(embedding_col, p=2)

    probs_row = check_probs(weights, adjacency)
    probs_col = check_probs(weights, adjacency.T)

    if isinstance(embedding_row_norm, sparse.csr_matrix) and isinstance(embedding_col_norm, sparse.csr_matrix):
        fit: float = 0.5 * (1 + (embedding_row_norm.multiply(adjacency.dot(embedding_col_norm))).sum()
                            / total_weight)
    else:
        fit: float = 0.5 * (1 + (np.multiply(embedding_row_norm, adjacency.dot(embedding_col_norm))).sum()
                            / total_weight)

    div: float = 0.5 * (1 + (embedding.T.dot(probs_row)).dot(embedding_col.T.dot(probs_col)))

    if return_all:
        return fit, div, fit - resolution * div
    else:
        return fit - resolution * div
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Spectral':
    """Compute the graph embedding.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph (symmetric matrix).

    Returns
    -------
    self: :class:`Spectral`
    """
    adjacency = check_format(adjacency).asfptype()
    check_square(adjacency)
    check_symmetry(adjacency)
    n = adjacency.shape[0]

    solver = set_solver(self.solver, adjacency)
    n_components = 1 + check_n_components(self.n_components, n - 2)

    regularize: bool = not (self.regularization is None or self.regularization == 0.)
    check_scaling(self.scaling, adjacency, regularize)

    weights = adjacency.dot(np.ones(n))
    regularization = self.regularization
    if regularization:
        if self.relative_regularization:
            regularization = regularization * weights.sum() / n**2
        weights += regularization * n

    # Spectral decomposition of the normalized adjacency matrix
    weights_inv_sqrt_diag = diag_pinv(np.sqrt(weights))

    if regularization:
        norm_adjacency = NormalizedAdjacencyOperator(adjacency, regularization)
    else:
        norm_adjacency = weights_inv_sqrt_diag.dot(adjacency.dot(weights_inv_sqrt_diag))

    solver.which = 'LA'
    solver.fit(matrix=norm_adjacency, n_components=n_components)
    eigenvalues = solver.eigenvalues_
    index = np.argsort(-eigenvalues)[1:]  # skip first eigenvalue
    eigenvalues = eigenvalues[index]
    eigenvectors = weights_inv_sqrt_diag.dot(solver.eigenvectors_[:, index])

    embedding = eigenvectors.copy()

    if self.scaling:
        eigenvalues_inv_diag = diag_pinv((1 - eigenvalues)**self.scaling)
        embedding = eigenvalues_inv_diag.dot(embedding.T).T

    if self.normalized:
        embedding = normalize(embedding, p=2)

    self.embedding_ = embedding
    self.eigenvalues_ = eigenvalues
    self.eigenvectors_ = eigenvectors
    self.regularization_ = regularization

    return self
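
# Usage sketch, assuming sknetwork.embedding.Spectral exposes the `fit` above.
# The embedding is given by the leading non-trivial eigenvectors of the normalized adjacency.
from sknetwork.data import karate_club
from sknetwork.embedding import Spectral

adjacency = karate_club()
spectral = Spectral(n_components=2)
embedding = spectral.fit(adjacency).embedding_  # shape (n, 2)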
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Spectral':
    """Compute the graph embedding.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph (symmetric matrix).

    Returns
    -------
    self: :class:`Spectral`
    """
    adjacency = check_format(adjacency).asfptype()
    check_square(adjacency)
    check_symmetry(adjacency)
    n = adjacency.shape[0]

    if self.solver == 'auto':
        solver = auto_solver(adjacency.nnz)
        if solver == 'lanczos':
            self.solver: EigSolver = LanczosEig()
        else:  # pragma: no cover
            self.solver: EigSolver = HalkoEig()

    n_components = check_n_components(self.n_components, n - 2)
    n_components += 1

    if self.equalize and (self.regularization is None or self.regularization == 0.) \
            and not is_connected(adjacency):
        raise ValueError("The option 'equalize' is valid only if the graph is connected or with regularization. "
                         "Call 'fit' either with 'equalize' = False or positive 'regularization'.")

    weights = adjacency.dot(np.ones(n))
    regularization = self.regularization
    if regularization:
        if self.relative_regularization:
            regularization = regularization * weights.sum() / n**2
        weights += regularization * n

    if self.normalized_laplacian:
        # Finding the largest eigenvalues of the normalized adjacency is easier for the solver
        # than finding the smallest eigenvalues of the normalized laplacian.
        weights_inv_sqrt_diag = diag_pinv(np.sqrt(weights))

        if regularization:
            norm_adjacency = NormalizedAdjacencyOperator(adjacency, regularization)
        else:
            norm_adjacency = weights_inv_sqrt_diag.dot(adjacency.dot(weights_inv_sqrt_diag))

        self.solver.which = 'LA'
        self.solver.fit(matrix=norm_adjacency, n_components=n_components)
        eigenvalues = 1 - self.solver.eigenvalues_
        # eigenvalues of the Laplacian in increasing order
        index = np.argsort(eigenvalues)[1:]  # skip first eigenvalue
        eigenvalues = eigenvalues[index]
        # eigenvectors of the Laplacian, skip first eigenvector
        eigenvectors = np.array(weights_inv_sqrt_diag.dot(self.solver.eigenvectors_[:, index]))
    else:
        if regularization:
            laplacian = LaplacianOperator(adjacency, regularization)
        else:
            weight_diag = sparse.diags(weights, format='csr')
            laplacian = weight_diag - adjacency

        self.solver.which = 'SM'
        self.solver.fit(matrix=laplacian, n_components=n_components)
        eigenvalues = self.solver.eigenvalues_[1:]
        eigenvectors = self.solver.eigenvectors_[:, 1:]

    embedding = eigenvectors.copy()

    if self.equalize:
        eigenvalues_sqrt_inv_diag = diag_pinv(np.sqrt(eigenvalues))
        embedding = eigenvalues_sqrt_inv_diag.dot(embedding.T).T

    if self.barycenter:
        eigenvalues_diag = sparse.diags(eigenvalues)
        subtract = eigenvalues_diag.dot(embedding.T).T
        if not self.normalized_laplacian:
            weights_inv_diag = diag_pinv(weights)
            subtract = weights_inv_diag.dot(subtract)
        embedding -= subtract

    if self.normalized:
        embedding = normalize(embedding, p=2)

    self.embedding_ = embedding
    self.eigenvalues_ = eigenvalues
    self.eigenvectors_ = eigenvectors
    self.regularization_ = regularization

    return self