def compute_kmeans(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Cluster the columns of the corpus matrix with `sklearn.cluster.KMeans`.

    See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Args:
        corpus: Corpus whose `data` matrix is clustered.
        tokens: Tokens whose columns are selected via `corpus.token_indices`;
            when None, every column of `corpus.data` is used.
        n_clusters: Number of clusters requested from KMeans.
        **kwargs: Forwarded unchanged to the `sklearn.cluster.KMeans` constructor.

    Returns:
        A `KMeansCorpusClusters` built from the corpus, the given tokens, and a
        `KMeansResult` holding the fitted centroids and labels.
    """
    if tokens is None:
        data: scipy.sparse.spmatrix = corpus.data
    else:
        data = corpus.data[:, corpus.token_indices(tokens)]

    # The matrix is transposed so that each selected column becomes one observation.
    model = sklearn.cluster.KMeans(n_clusters=n_clusters, **kwargs)
    model.fit(data.T)

    result = KMeansResult(centroids=model.cluster_centers_, labels=model.labels_)
    return KMeansCorpusClusters(corpus, tokens, result)
def compute_kmeans2(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Computes KMeans clusters using `scipy.cluster.vq.kmeans2`.

    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans2.html

    Args:
        corpus: Corpus whose `data` matrix is clustered.
        tokens: Tokens whose columns are selected via `corpus.token_indices`;
            when None, every column of `corpus.data` is used.
        n_clusters: Number of clusters, passed to `kmeans2` as its `k` argument.
        **kwargs: Forwarded unchanged to `scipy.cluster.vq.kmeans2`.

    Returns:
        A `KMeansCorpusClusters` built from the corpus, the given tokens, and a
        `KMeansResult` holding the centroids and labels returned by `kmeans2`.
    """
    data: scipy.sparse.spmatrix = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]

    # `toarray()` yields a plain ndarray. `todense()` would yield the legacy
    # `numpy.matrix` type for spmatrix input, which NumPy no longer recommends
    # and which SciPy rejects when its array-API input validation is enabled.
    # The transpose makes each selected column one observation.
    observations = data.T.toarray()

    # Non-floating data (e.g. integer counts) is promoted to float64 before
    # clustering -- presumably because kmeans2 works on floating-point input.
    if not np.issubdtype(observations.dtype, np.floating):
        observations = observations.astype(np.float64)

    centroids, labels = scipy.cluster.vq.kmeans2(observations, n_clusters, **kwargs)
    return KMeansCorpusClusters(corpus, tokens, KMeansResult(centroids=centroids, labels=labels))
def compute_hca(
    corpus: VectorizedCorpus, tokens: List[str], linkage_method: str = 'ward', linkage_metric: str = 'euclidean'
) -> HCACorpusClusters:
    """Computes HCA clusters using `scipy.cluster.hierarchy.linkage`.

    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

    Args:
        corpus: Corpus whose `data` matrix is clustered.
        tokens: Tokens whose columns are selected via `corpus.token_indices`;
            when None, every column of `corpus.data` is used.
        linkage_method: Passed to `linkage` as `method`.
        linkage_metric: Passed to `linkage` as `metric`. Per the SciPy
            documentation, the 'centroid', 'median' and 'ward' methods are
            only correctly defined for the Euclidean metric.

    Returns:
        An `HCACorpusClusters` built from the corpus, the given tokens, and the
        linkage matrix.
    """
    data = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]

    # `toarray()` yields a plain ndarray. `todense()` would yield the legacy
    # `numpy.matrix` type for spmatrix input, which NumPy no longer recommends
    # and which SciPy rejects when its array-API input validation is enabled.
    # The transpose makes each selected column one observation.
    observations = data.T.toarray()

    # From the SciPy documentation:
    #   A (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with
    #   indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A
    #   cluster with an index less than n corresponds to one of the original
    #   observations. The distance between clusters Z[i, 0] and Z[i, 1] is
    #   given by Z[i, 2]. The fourth value Z[i, 3] represents the number of
    #   original observations in the newly formed cluster.
    linkage_matrix = linkage(observations, method=linkage_method, metric=linkage_metric)

    return HCACorpusClusters(corpus, tokens, linkage_matrix)
def test_token_indices(corpus: VectorizedCorpus):
    """Check that `token_indices` maps 'a' and 'c' to indices 0 and 2.

    Three tokens are requested but only two indices are expected, so 'z' must
    contribute nothing -- presumably because it is absent from the fixture's
    vocabulary (confirm against the `corpus` fixture).
    """
    requested = ['a', 'c', 'z']
    indices = corpus.token_indices(requested)
    assert indices == [0, 2]