def test_sklearn_kmeans(assign_labels):
    sc = SpectralClustering(n_components=25,
                            random_state=0,
                            assign_labels=assign_labels,
                            kmeans_params={'n_clusters': 8})
    sc.fit(X)
    assert isinstance(sc.assign_labels_, sklearn.cluster.KMeans)
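These test snippets assume a module-level dask array X; a minimal sketch of that setup, with the dataset size and chunking as assumptions rather than taken from the source:

import sklearn.cluster
from dask_ml.cluster import SpectralClustering
from dask_ml.datasets import make_blobs

# Small dask-backed blob dataset shared across the test module.
X, y = make_blobs(n_samples=200, n_features=5, chunks=100, random_state=0)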
Example #2
def test_callable_affinity():
    affinity = partial(
        metrics.pairwise.pairwise_kernels,
        metric="rbf",
        filter_params=True,
        gamma=1.0 / len(X),
    )
    sc = SpectralClustering(affinity=affinity, gamma=None)
    sc.fit(X)
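A callable affinity is applied like pairwise_kernels above: it takes the data and returns an affinity matrix. A hedged sketch of a hand-rolled equivalent (the function name and gamma value are assumptions):

from sklearn.metrics.pairwise import rbf_kernel

def rbf_affinity(X, Y=None):
    # Return an (n_samples_X, n_samples_Y) affinity matrix.
    return rbf_kernel(X, Y, gamma=0.5)

sc = SpectralClustering(affinity=rbf_affinity)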
Example #3
def test_basic(as_ndarray, persist_embedding):
    sc = SpectralClustering(
        n_components=25, random_state=0, persist_embedding=persist_embedding
    )
    if as_ndarray:
        X_ = X.compute()
    else:
        X_ = X
    sc.fit(X_)
    assert len(sc.labels_) == len(X_)
Example #4
    def run(self):
        if self.word_vectors not in {"fasttext", "word2vec"}:
            raise ValueError(
                f'Expected fasttext or word2vec; got {self.word_vectors}')

        print(
            f'Initializing dask dataframe of word embeddings at {datetime.now()}'
        )
        ddf = dask.dataframe.read_csv(config.ARTICLE_EMBEDDINGS_DIR /
                                      f'{self.word_vectors}_to_csv' / "*.part")

        print(
            f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}'
        )
        X = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
        X = X.to_dask_array(lengths=True)

        # Perform k-means clustering
        print(f'Starting K-Means clustering at {datetime.now()}')
        k_means_clustering_model = KMeans(n_clusters=self.num_clusters,
                                          n_jobs=-1,
                                          max_iter=config.K_MEANS_MAX_ITER)
        # fit() returns the estimator; the cluster assignments live on labels_
        k_means_cluster_labels = k_means_clustering_model.fit(X).labels_

        # Write k-means results to disk
        print(
            f'Joining K-means results and writing to disk at {datetime.now()}')
        # Attach the label array as a column (its chunks align with ddf's partitions)
        k_means_results_ddf = ddf.assign(k_means_label=k_means_cluster_labels)
        k_means_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_k_means'
        k_means_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(k_means_results_ddf, k_means_ddf_output_path)

        # Perform spectral clustering
        print(f'Starting Spectral clustering at {datetime.now()}')
        spectral_clustering_model = SpectralClustering(
            n_clusters=self.num_clusters,
            n_jobs=-1,
            persist_embedding=True,
            kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
        # As above, fit() returns the estimator rather than the labels
        spectral_cluster_labels = spectral_clustering_model.fit(X).labels_

        # Write spectral results to disk
        print(
            f'Joining Spectral results and writing to disk at {datetime.now()}'
        )
        # Attach the spectral labels as a column, as with the k-means results
        spectral_results_ddf = ddf.assign(spectral_label=spectral_cluster_labels)
        spectral_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_spectral'
        spectral_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(spectral_results_ddf, spectral_ddf_output_path)

        # And save the success flag
        with self.output().open("w") as f:
            # f.write(f'Clustering {self.word_vectors} k={self.num_clusters}: {silhouette_score_result}' + "\n")
            # f.write(spectral_clustering_model.get_params(deep=True))
            f.write(f'{self.word_vectors}: Success!')
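For context, this run() reads like the body of a Luigi task (self.output() is Luigi's target protocol). A hedged sketch of the wrapper it might live in; the class name, parameters, and flag path are assumptions:

import luigi

class ClusterEmbeddings(luigi.Task):
    word_vectors = luigi.Parameter()
    num_clusters = luigi.IntParameter(default=8)

    def output(self):
        # Success flag written at the end of run()
        return luigi.LocalTarget(
            str(config.CLUSTERING_RESULTS_DIR /
                f'{self.word_vectors}_clustering.SUCCESS'))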
Example #5
def test_affinity_raises():
    sc = SpectralClustering(affinity="foo")
    with pytest.raises(ValueError) as m:
        sc.fit(X)

    assert m.match("Unknown affinity metric name 'foo'")

    sc = SpectralClustering(affinity=np.array([]))
    with pytest.raises(TypeError) as m:
        sc.fit(X)

    assert m.match("Unexpected type for affinity 'ndarray'")
Example #6
def test_assign_labels_raises():
    sc = SpectralClustering(assign_labels="foo")
    with pytest.raises(ValueError) as m:
        sc.fit(X)

    assert m.match("Unknown 'assign_labels' 'foo'")

    sc = SpectralClustering(assign_labels=dict())
    with pytest.raises(TypeError) as m:
        sc.fit(X)

    assert m.match("Invalid type ")
Example #7
def test_spectral_clustering(Xl_blobs_easy):
    X, y = Xl_blobs_easy
    X = (X - X.mean(0)) / X.std(0)
    model = SpectralClustering(
        random_state=0, n_clusters=3, n_components=5, gamma=None
    ).fit(X)
    labels = model.labels_.compute()
    y = y.compute()

    idx = [(y == i).argmax() for i in range(3)]
    grouped_idx = [np.where(y == y[idx[i]])[0] for i in range(3)]

    # Cluster ids are arbitrary, so instead of comparing labels to y directly,
    # check that each ground-truth group received a single predicted label.
    for indices in grouped_idx:
        assert len(set(labels[indices])) == 1
Example #8
def test_spectral_clustering():
    S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]])

    model = SpectralClustering(random_state=0, n_clusters=2,
                               n_components=4).fit(S)
    labels = model.labels_.compute()
    # Binary cluster ids are defined only up to a swap; canonicalize so the
    # first block of points gets label 1.
    if labels[0] == 0:
        labels = 1 - labels

    assert_array_equal(labels, [1, 1, 1, 0, 0, 0, 0])
Example #9
def test_n_components_raises():
    # dask-ml embeds X by computing the exact affinity for only n_components
    # sampled rows, so n_components must be smaller than the number of samples.
    sc = SpectralClustering(n_components=len(X))
    with pytest.raises(ValueError) as m:
        sc.fit(X)
    assert m.match("n_components")
Example #10
def test_callable_affinity():
    affinity = partial(metrics.pairwise.pairwise_kernels,
                       metric='rbf',
                       filter_params=True)
    sc = SpectralClustering(affinity=affinity)
    sc.fit(X)
Example #11
def test_basic(data, persist_embedding):
    sc = SpectralClustering(n_components=25,
                            random_state=0,
                            persist_embedding=persist_embedding)
    sc.fit(data)
    assert len(sc.labels_) == len(data)
Example #12
def dask_spectral(feat, n_clusters, **kwargs):
    from dask_ml.cluster import SpectralClustering
    spectral = SpectralClustering(n_clusters=n_clusters,
                                  affinity='rbf',
                                  random_state=0).fit(feat)
    return spectral.labels_.compute()
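A hedged usage sketch for the helper above (the feature shape, chunking, and cluster count are assumptions):

import dask.array as da

feat = da.random.random((1000, 16), chunks=(250, 16))
labels = dask_spectral(feat, n_clusters=4)  # numpy array of cluster ids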