def _instantiate_vars(adjacency: sparse.csr_matrix, weights: str = 'uniform'):
    """Build the aggregate graph and node weights shared by the metrics.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    weights :
        Weighting scheme forwarded to ``get_probs`` (default ``'uniform'``).

    Returns
    -------
    aggregate_graph :
        ``AggregateGraph`` built from the row weights, the column weights and
        the symmetrized adjacency matrix.
    weights_row :
        Result of ``get_probs`` on the adjacency matrix.
    weights_col :
        Result of ``get_probs`` on the transposed adjacency matrix.
    """
    row_probs = get_probs(weights, adjacency)
    col_probs = get_probs(weights, adjacency.T)

    # The aggregate graph is built on the symmetrized version of the graph,
    # passed as raw CSR components with float edge weights.
    undirected = directed2undirected(adjacency)
    graph = AggregateGraph(
        row_probs,
        col_probs,
        undirected.data.astype(float),
        undirected.indices,
        undirected.indptr,
    )
    return graph, row_probs, col_probs
def _instantiate_vars(adjacency: sparse.csr_matrix, weights: str = 'uniform'):
    """Build the aggregate graph, node weights and work buffers for the metrics.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    weights :
        Weighting scheme forwarded to ``get_probs`` (default ``'uniform'``).

    Returns
    -------
    aggregate_graph :
        ``AggregateGraph`` built from the row weights, the column weights and
        the symmetrized adjacency matrix.
    height, cluster_weight, edge_sampling :
        Three independent zero-filled vectors of length ``n - 1``, where ``n``
        is the number of rows of ``adjacency``.
    weights_row :
        Result of ``get_probs`` on the adjacency matrix.
    weights_col :
        Result of ``get_probs`` on the transposed adjacency matrix.
    """
    n_nodes = adjacency.shape[0]

    row_probs = get_probs(weights, adjacency)
    col_probs = get_probs(weights, adjacency.T)

    # The aggregate graph is built on the symmetrized version of the graph,
    # passed as raw CSR components with float edge weights.
    undirected = directed2undirected(adjacency)
    graph = AggregateGraph(
        row_probs,
        col_probs,
        undirected.data.astype(float),
        undirected.indices,
        undirected.indptr,
    )

    # One slot per entry of the (n - 1)-sized buffers; each buffer is a
    # distinct array so callers can fill them independently.
    height, cluster_weight, edge_sampling = (np.zeros(n_nodes - 1) for _ in range(3))
    return graph, height, cluster_weight, edge_sampling, row_probs, col_probs
def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray], force_bipartite: bool = False) -> 'Louvain':
    """Fit algorithm to data.

    Parameters
    ----------
    input_matrix :
        Adjacency matrix or biadjacency matrix of the graph.
    force_bipartite :
        If ``True``, force the input matrix to be considered as a biadjacency matrix even if square.

    Returns
    -------
    self: :class:`Louvain`

    Raises
    ------
    ValueError
        If ``self.modularity`` is not one of ``'potts'``, ``'newman'`` or ``'dugue'``.
    """
    self._init_vars()

    kind = self.modularity

    # Only the 'dugue' modularity asks for a directed adjacency matrix.
    extra_options = {'force_directed': True} if kind == 'dugue' else {}
    adjacency, self.bipartite = get_adjacency(input_matrix, force_bipartite=force_bipartite, **extra_options)
    n_nodes = adjacency.shape[0]

    if kind not in ('potts', 'newman', 'dugue'):
        raise ValueError('Unknown modularity function.')
    probs_out = get_probs('uniform' if kind == 'potts' else 'degree', adjacency)
    if kind == 'dugue':
        probs_in = get_probs('degree', adjacency.T)
    else:
        probs_in = probs_out.copy()

    # Optionally visit the nodes in random order; undone on the labels below.
    order = np.arange(n_nodes)
    if self.shuffle_nodes:
        order = self.random_state.permutation(order)
        adjacency = adjacency[order, :].tocsc()[:, order].tocsr()

    # Work on the adjacency matrix rescaled so that its entries sum to 1.
    adjacency_cluster = adjacency / adjacency.data.sum()

    # Node-to-cluster assignment; starts with every node in its own cluster.
    membership = sparse.identity(n_nodes, format='csr')
    n_clusters = n_nodes
    count_aggregations = 0
    self.log.print("Starting with", n_nodes, "nodes.")

    while True:
        count_aggregations += 1

        labels_cluster, pass_increase = self._optimize(adjacency_cluster, probs_out, probs_in)
        # Relabel clusters as consecutive integers starting at 0.
        _, labels_cluster = np.unique(labels_cluster, return_inverse=True)

        converged = pass_increase <= self.tol_aggregation
        if not converged:
            membership_cluster = membership_matrix(labels_cluster)
            membership = membership.dot(membership_cluster)
            adjacency_cluster, probs_out, probs_in = self._aggregate(
                adjacency_cluster, probs_out, probs_in, membership_cluster)

            n_clusters = adjacency_cluster.shape[0]
            if n_clusters == 1:
                # Everything merged into one cluster: stop without logging.
                break

        self.log.print("Aggregation", count_aggregations, "completed with", n_clusters, "clusters and ",
                       pass_increase, "increment.")
        if converged or count_aggregations == self.n_aggregations:
            break

    labels = membership.indices
    if self.sort_clusters:
        labels = reindex_labels(labels)

    if self.shuffle_nodes:
        # Invert the permutation so labels follow the original node order.
        inverse_order = np.empty(order.size, order.dtype)
        inverse_order[order] = np.arange(order.size)
        labels = labels[inverse_order]

    self.labels_ = labels
    if self.bipartite:
        self._split_vars(input_matrix.shape)
    self._secondary_outputs(input_matrix)

    return self
def cosine_modularity(adjacency, embedding: np.ndarray, embedding_col=None, resolution=1., weights='degree',
                      return_all: bool = False):
    """Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w^+_iw^-_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\cos(x_i, x_j)}{2}\\right)`

    where

    * :math:`w^+_i, w^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight of the graph.

    For bipartite graphs with column embedding :math:`y`, the metric is

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_{1,i}w_{2,j}}{w^2}\\right)
    \\left(\\dfrac{1 + \\cos(x_i, y_j)}{2}\\right)`

    where

    * :math:`w_{1,i}, w_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w = 1^TB1` is the total weight of the graph.

    Both the fit term (first part of the sum) and the diversity term (second part) are computed
    from the L2-normalized embeddings, so the score does not depend on the scale of the embedding.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    embedding :
        Embedding of the nodes.
    embedding_col :
        Embedding of the columns (for bipartite graphs). If ``None``, the adjacency matrix must be
        square and a copy of ``embedding`` is used.
    resolution :
        Resolution parameter.
    weights : ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all :
        If ``True``, also return fit and diversity.

    Returns
    -------
    modularity : float
        Returned alone when ``return_all`` is ``False``.
    (fit, diversity, modularity) : tuple of float
        Returned when ``return_all`` is ``True``. Note the order: the modularity comes **last**.

    Example
    -------
    >>> from sknetwork.embedding import cosine_modularity
    >>> from sknetwork.data import karate_club
    >>> graph = karate_club(metadata=True)
    >>> adjacency = graph.adjacency
    >>> embedding = graph.position
    >>> bool(-1 <= cosine_modularity(adjacency, embedding) <= 1)
    True
    """
    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if embedding_col is None:
        check_square(adjacency)
        embedding_col = embedding.copy()

    embedding_row_norm = normalize(embedding, p=2)
    embedding_col_norm = normalize(embedding_col, p=2)

    probs_row = get_probs(weights, adjacency)
    probs_col = get_probs(weights, adjacency.T)

    # Fit: average over edges of (1 + cos(x_i, y_j)) / 2.
    edge_cosines = np.multiply(embedding_row_norm, adjacency.dot(embedding_col_norm)).sum()
    fit: float = 0.5 * (1 + edge_cosines / total_weight)

    # Diversity: average over node pairs of (1 + cos(x_i, y_j)) / 2, which factorizes as a dot
    # product of the weighted means of the *normalized* embeddings. Using the raw embeddings
    # here would replace the cosine by an unnormalized dot product and break the formula above.
    mean_row = embedding_row_norm.T.dot(probs_row)
    mean_col = embedding_col_norm.T.dot(probs_col)
    div: float = 0.5 * (1 + mean_row.dot(mean_col))

    score = fit - resolution * div
    if return_all:
        # Order kept as (fit, diversity, modularity) for backward compatibility.
        return fit, div, score
    return score
def bimodularity(biadjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray, labels_col: np.ndarray,
                 weights: Union[str, np.ndarray] = 'degree', weights_col: Union[str, np.ndarray] = 'degree',
                 resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Bimodularity of the clustering (for bipartite graphs).

    The bimodularity of a clustering is

    :math:`Q = \\sum_{i}\\sum_{j}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{d_{1,i}d_{2,j}}{w^2}\\right)
    \\delta_{c_{1,i},c_{2,j}}`

    where

    * :math:`c_{1,i}, c_{2,j}` are the clusters of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`d_{1,i}, d_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w = 1^TB1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    The score is obtained by calling :func:`modularity` on the directed graph returned by
    ``bipartite2directed``, with row weights zero-padded over the columns and column weights
    zero-padded over the rows.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph (shape :math:`n_1 \\times n_2`).
    labels :
        Labels of rows, vector of size :math:`n_1`.
    labels_col:
        Labels of columns, vector of size :math:`n_2`.
    weights :
        Weights of nodes.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_col :
        Weights of columns.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If ``labels`` or ``labels_col`` does not match the corresponding dimension of ``biadjacency``.

    Example
    -------
    >>> from sknetwork.clustering import bimodularity
    >>> from sknetwork.data import star_wars
    >>> biadjacency = star_wars()
    >>> labels = np.array([1, 1, 0, 0])
    >>> labels_col = np.array([1, 0, 0])
    >>> np.round(bimodularity(biadjacency, labels, labels_col), 2)
    0.22
    """
    biadjacency = check_format(biadjacency).astype(float)
    n_row, n_col = biadjacency.shape

    if len(labels) != n_row:
        raise ValueError('Dimension mismatch between labels and biadjacency matrix.')
    if len(labels_col) != n_col:
        raise ValueError('Dimension mismatch between labels_col and biadjacency matrix.')

    directed = bipartite2directed(biadjacency)

    # Rows come first and columns second in the stacked node ordering:
    # out-weights live on the row part, in-weights on the column part.
    out_probs = np.hstack((get_probs(weights, biadjacency), np.zeros(n_col)))
    in_probs = np.hstack((np.zeros(n_row), get_probs(weights_col, biadjacency.T)))
    stacked_labels = np.hstack((labels, labels_col))

    return modularity(directed, stacked_labels, out_probs, in_probs, resolution, return_all)
def modularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray,
               weights: Union[str, np.ndarray] = 'degree', weights_in: Union[str, np.ndarray, None] = 'degree',
               resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Modularity of a clustering.

    The modularity of a clustering is

    :math:`Q = \\dfrac{1}{w} \\sum_{i,j}\\left(A_{ij} - \\gamma \\dfrac{d_id_j}{w}\\right)\\delta_{c_i,c_j}`
    for graphs,

    :math:`Q = \\dfrac{1}{w} \\sum_{i,j}\\left(A_{ij} - \\gamma \\dfrac{d^+_id^-_j}{w}\\right)\\delta_{c_i,c_j}`
    for directed graphs,

    where

    * :math:`c_i` is the cluster of node :math:`i`,\n
    * :math:`d_i` is the weight of node :math:`i`,\n
    * :math:`d^+_i, d^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    Parameters
    ----------
    adjacency:
        Adjacency matrix of the graph.
    labels:
        Labels of nodes, vector of size :math:`n` .
    weights :
        Weights of nodes.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_in :
        In-weights of nodes.
        ``'degree'`` (default), ``'uniform'``, custom weights or ``None``.
        If ``None``, taken equal to weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If the length of ``labels`` differs from the number of rows of ``adjacency``.

    Example
    -------
    >>> from sknetwork.clustering import modularity
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> labels = np.array([0, 0, 1, 1, 0])
    >>> np.round(modularity(adjacency, labels), 2)
    0.11
    """
    adjacency = check_format(adjacency).astype(float)
    check_square(adjacency)

    if len(labels) != adjacency.shape[0]:
        raise ValueError('Dimension mismatch between labels and adjacency matrix.')

    # Documented fallback: without explicit in-weights, reuse the out-weight specification.
    if weights_in is None:
        weights_in = weights

    probs_row = get_probs(weights, adjacency)
    probs_col = get_probs(weights_in, adjacency.T)
    membership = membership_matrix(labels)

    # Fit: share of the total edge weight that falls inside clusters.
    fit: float = membership.multiply(adjacency.dot(membership)).data.sum() / adjacency.data.sum()
    # Diversity: sum over clusters of (in-weight of the cluster) * (out-weight of the cluster).
    div: float = membership.T.dot(probs_col).dot(membership.T.dot(probs_row))
    mod: float = fit - resolution * div

    if return_all:
        return mod, fit, div
    return mod
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n_nodes = adjacency.shape[0]
    check_min_size(n_nodes, 2)

    # Rescale the edge weights so that they sum to 1.
    adjacency.data /= adjacency.data.sum()

    edge_sampling, node_sampling, _ = get_sampling_distributions(adjacency, dendrogram, weights)

    # Restrict to entries with non-zero edge sampling so the logarithm is defined.
    support = np.nonzero(edge_sampling)[0]
    edge_mass = edge_sampling[support]
    score = edge_mass.dot(np.log(edge_mass / node_sampling[support]))

    if not normalized:
        return score

    weights_row = get_probs(weights, adjacency)
    weights_col = get_probs(weights, adjacency.T)

    # Diagonal scalings holding the inverse of the stored row / column weights.
    row_scaling = sparse.diags(weights_row, shape=(n_nodes, n_nodes), format='csr')
    row_scaling.data = 1 / row_scaling.data
    col_scaling = sparse.diags(weights_col, shape=(n_nodes, n_nodes), format='csr')
    col_scaling.data = 1 / col_scaling.data
    sampling_ratio = row_scaling.dot(adjacency.dot(col_scaling))

    # Same product with unit diagonals: edge weights on the same stored entries as the ratio,
    # so both ``data`` arrays can be combined term by term below.
    row_scaling.data = np.ones(row_scaling.data.size)
    col_scaling.data = np.ones(col_scaling.data.size)
    edge_weights = row_scaling.dot(adjacency.dot(col_scaling))

    mutual_information = edge_weights.data.dot(np.log(sampling_ratio.data))
    if mutual_information > 0:
        score /= mutual_information
    return score