def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'KMeans':
    """Embed the nodes of the graph, then cluster the embedding with K-means.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self : :class:`KMeans`
    """
    n_nodes = adjacency.shape[0]
    check_n_clusters(self.n_clusters, n_nodes)

    # Step 1: embedding of the graph; step 2: K-means on the embedding vectors.
    vectors = self.embedding_method.fit_transform(adjacency)
    dense_kmeans = KMeansDense(self.n_clusters)
    dense_kmeans.fit(vectors)

    # Labels are passed through reindex_labels only when sort_clusters is set.
    self.labels_ = reindex_labels(dense_kmeans.labels_) if self.sort_clusters else dense_kmeans.labels_

    self._secondary_outputs(adjacency)
    return self
def cut_straight(dendrogram: np.ndarray, n_clusters: Optional[int] = None, threshold: Optional[float] = None,
                 sort_clusters: bool = True, return_dendrogram: bool = False) \
        -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Cut a dendrogram and return the corresponding clustering.

    Parameters
    ----------
    dendrogram :
        Dendrogram. Row ``t`` merges nodes ``dendrogram[t, 0]`` and ``dendrogram[t, 1]`` at height
        ``dendrogram[t, 2]``; the merged node gets index ``n + t``.
    n_clusters :
        Number of clusters (optional).
        The number of clusters can be larger than n_clusters in case of equal heights in the dendrogram.
        With ``n_clusters=1``, all merges are applied.
    threshold :
        Threshold on height (optional).
        If both n_clusters and threshold are ``None``, n_clusters is set to 2.
        If both are given, the larger of the two resulting cut heights is used.
    sort_clusters :
        If ``True``, sorts clusters in decreasing order of size.
    return_dendrogram :
        If ``True``, returns the dendrogram formed by the clusters up to the root.

    Returns
    -------
    labels : np.ndarray
        Cluster of each node.
    dendrogram_aggregate : np.ndarray
        Dendrogram starting from clusters (leaves = clusters).

    Raises
    ------
    ValueError
        If ``return_dendrogram`` is ``True`` and the heights (third column) are not non-decreasing.

    Example
    -------
    >>> from sknetwork.hierarchy import cut_straight
    >>> dendrogram = np.array([[0, 1, 0, 2], [2, 3, 1, 3]])
    >>> cut_straight(dendrogram)
    array([0, 0, 1])
    """
    check_dendrogram(dendrogram)
    n = dendrogram.shape[0] + 1

    if return_dendrogram and not np.all(np.diff(dendrogram[:, 2]) >= 0):
        raise ValueError("The third column of the dendrogram must be non-decreasing.")

    cluster = {i: [i] for i in range(n)}
    if n_clusters is None:
        # Without an explicit target, either fall back to 2 clusters or let the threshold alone decide.
        n_clusters = 2 if threshold is None else n
    else:
        check_n_clusters(n_clusters, n, n_min=1)

    if n_clusters == 1:
        # There are only n - 1 heights, so index n - 1 does not exist. A single cluster means
        # that every merge must pass the strict "height < cut" test below.
        cut = np.inf
    else:
        cut = np.sort(dendrogram[:, 2])[n - n_clusters]
    if threshold is not None:
        cut = max(cut, threshold)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        # Merge only below the cut, and only if both children are still whole clusters
        # (i.e. all merges in their subtrees were applied).
        if dendrogram[t][2] < cut and i in cluster and j in cluster:
            cluster[n + t] = cluster.pop(i) + cluster.pop(j)

    return get_labels(dendrogram, cluster, sort_clusters, return_dendrogram)
def cut_straight(
    dendrogram: np.ndarray,
    n_clusters: int = 2,
    sort_clusters: bool = True,
    return_dendrogram: bool = False
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Cut a dendrogram and return the corresponding clustering.

    Parameters
    ----------
    dendrogram :
        Dendrogram. Row ``t`` merges nodes ``dendrogram[t, 0]`` and ``dendrogram[t, 1]`` at height
        ``dendrogram[t, 2]``; the merged node gets index ``n + t``.
    n_clusters :
        Number of clusters. With ``n_clusters=1``, all merges are applied.
    sort_clusters :
        If ``True``, sorts clusters in decreasing order of size.
    return_dendrogram :
        If ``True``, returns the dendrogram formed by the clusters up to the root.

    Returns
    -------
    labels : np.ndarray
        Cluster of each node.
    dendrogram_aggregate : np.ndarray
        Dendrogram starting from clusters (leaves = clusters).

    Raises
    ------
    ValueError
        If ``return_dendrogram`` is ``True`` and the heights (third column) are not non-decreasing.

    Example
    -------
    >>> from sknetwork.hierarchy import cut_straight
    >>> dendrogram = np.array([[0, 1, 0, 2], [2, 3, 1, 3]])
    >>> cut_straight(dendrogram)
    array([0, 0, 1])
    """
    check_dendrogram(dendrogram)
    n = dendrogram.shape[0] + 1
    check_n_clusters(n_clusters, n, n_min=1)

    if return_dendrogram and not np.all(np.diff(dendrogram[:, 2]) >= 0):
        raise ValueError("The third column of the dendrogram must be non-decreasing.")

    cluster = {i: [i] for i in range(n)}

    if n_clusters == 1:
        # There are only n - 1 heights, so index n - 1 does not exist. A single cluster means
        # that every merge must pass the strict "height < cut" test below.
        cut = np.inf
    else:
        cut = np.sort(dendrogram[:, 2])[n - n_clusters]

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        # Merge only below the cut, and only if both children are still whole clusters
        # (i.e. all merges in their subtrees were applied).
        if dendrogram[t][2] < cut and i in cluster and j in cluster:
            cluster[n + t] = cluster.pop(i) + cluster.pop(j)

    return get_labels(dendrogram, cluster, sort_clusters, return_dendrogram)
def aggregate_dendrogram(dendrogram: np.ndarray, n_clusters: int = 2, return_counts: bool = False) \
        -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Aggregate a dendrogram in order to get a certain number of leaves.
    The leaves in the output dendrogram correspond to subtrees in the input one.

    Parameters
    ----------
    dendrogram :
        The input to aggregate. The fourth column is read as the size of the subtree created by each row.
    n_clusters :
        Number of clusters (or leaves) to keep.
    return_counts :
        If ``True``, returns an array of counts corresponding to the sizes of the merged subtrees.
        The sum of the counts is equal to the number of samples in the input dendrogram.

    Returns
    -------
    new_dendrogram :
        Aggregated dendrogram (the last ``n_clusters - 1`` rows of the input). The nodes are reindexed from 0.
    counts :
        Size of the subtrees corresponding to each leaf in new_dendrogram.
        Only returned if ``return_counts`` is ``True``.
    """
    n_nodes: int = dendrogram.shape[0] + 1
    check_n_clusters(n_clusters, n_nodes, n_min=1)

    # Keep only the last n_clusters - 1 merges.
    new_dendrogram = dendrogram[n_nodes - n_clusters:].copy()

    # All node indices referenced by the kept merges, in increasing order. The first n_clusters
    # of them are the new leaves; the others are nodes created by the kept merges themselves.
    node_indices = np.array(sorted(set(new_dendrogram[:, 0]).union(set(new_dendrogram[:, 1]))))
    new_index = {ix: i for i, ix in enumerate(node_indices)}

    for row in new_dendrogram:
        row[0] = new_index[row[0]]
        row[1] = new_index[row[1]]

    if not return_counts:
        return new_dendrogram

    if n_clusters == 1:
        # No merge is kept: the single leaf is the whole tree.
        counts = np.array([n_nodes])
    else:
        leaves = node_indices[:n_clusters].astype(int)
        # A leaf with index < n_nodes is an original sample of size 1. Only leaves created by a
        # merge (index >= n_nodes) have a size stored in the dendrogram, at row (index - n_nodes).
        # Indexing with a negative (index - n_nodes) would silently wrap around to an unrelated row.
        is_merged = leaves >= n_nodes
        counts = np.ones(len(leaves), dtype=int)
        counts[is_merged] = dendrogram[leaves[is_merged] - n_nodes, 3].astype(int)

    return new_dendrogram, counts.astype(int)
def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'KMeans':
    """Embed the graph, then cluster the embedding with K-means.

    Parameters
    ----------
    input_matrix :
        Adjacency matrix or biadjacency matrix of the graph.

    Returns
    -------
    self : :class:`KMeans`
    """
    self._init_vars()

    # Validate the input; when co-clustering, the bound on n_clusters is rows + columns.
    check_format(input_matrix)
    n_max = np.sum(input_matrix.shape) if self.co_cluster else input_matrix.shape[0]
    check_n_clusters(self.n_clusters, n_max)

    # Embed the graph.
    vectors, self.bipartite = get_embedding(input_matrix, self.embedding_method, self.co_cluster)

    # Cluster the embedding vectors.
    dense_kmeans = KMeansDense(self.n_clusters)
    dense_kmeans.fit(vectors)

    # Labels are passed through reindex_labels only when sort_clusters is set.
    self.labels_ = reindex_labels(dense_kmeans.labels_) if self.sort_clusters else dense_kmeans.labels_

    # Derived outputs.
    if self.co_cluster:
        self._split_vars(input_matrix.shape)
    self._secondary_outputs(input_matrix)

    return self
def fit(self, biadjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiKMeans':
    """Embed the bipartite graph, then cluster the embedding with K-means.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph.

    Returns
    -------
    self : :class:`BiKMeans`
    """
    n_row, n_col = biadjacency.shape
    check_n_clusters(self.n_clusters, n_row)

    # Embedding: rows and columns stacked when co-clustering, default embedding otherwise.
    embedder = self.embedding_method
    embedder.fit(biadjacency)
    if self.co_cluster:
        vectors = np.vstack((embedder.embedding_row_, embedder.embedding_col_))
    else:
        vectors = embedder.embedding_

    # Clustering of the embedding vectors.
    dense_kmeans = KMeansDense(self.n_clusters)
    dense_kmeans.fit(vectors)

    # Labels are passed through reindex_labels only when sort_clusters is set.
    labels = reindex_labels(dense_kmeans.labels_) if self.sort_clusters else dense_kmeans.labels_
    self.labels_ = labels

    if self.co_cluster:
        self._split_vars(n_row)
    else:
        self.labels_row_ = labels

    # Optional output: membership matrices.
    if self.return_membership:
        row_membership = membership_matrix(self.labels_row_, n_labels=self.n_clusters)
        if self.labels_col_ is None:
            # No column labels: go from rows to columns and back with the row membership only.
            self.membership_row_ = normalize(biadjacency.dot(biadjacency.T.dot(row_membership)))
        else:
            col_membership = membership_matrix(self.labels_col_, n_labels=self.n_clusters)
            self.membership_row_ = normalize(biadjacency.dot(col_membership))
            self.membership_col_ = normalize(biadjacency.T.dot(row_membership))
        self.membership_ = self.membership_row_

    # Optional output: biadjacency matrix aggregated by cluster.
    if self.return_aggregate:
        row_membership = membership_matrix(self.labels_row_, n_labels=self.n_clusters)
        aggregate = sparse.csr_matrix(row_membership.T.dot(biadjacency))
        if self.labels_col_ is not None:
            col_membership = membership_matrix(self.labels_col_, n_labels=self.n_clusters)
            aggregate = aggregate.dot(col_membership)
        self.biadjacency_ = aggregate

    return self