def single_chunk_blobs():
    """Return an (X, y) blobs pair held in a single block.

    Both arrays use a chunksize of 100 (one chunk total), which makes
    this fixture convenient for testing ``partial_fit`` methods.
    """
    return make_blobs(chunks=100, random_state=0)
def Xl_blobs_easy():
    """Return an (X, labels) tuple for an easy classification problem.

    The centers are very spread out relative to the cluster standard
    deviation, so the clustering is easy.
    """
    spread_centers = np.array([[-7, -7], [0, 0], [7, 7]])
    return make_blobs(
        cluster_std=0.1,
        centers=spread_centers,
        chunks=50,
        random_state=0,
    )
def _prep_data(self, reg=False):
    """Attach a synthetic chunked dataset to ``self`` and return ``self``.

    Parameters
    ----------
    reg : bool, default False
        When True, build a regression dataset with ``make_regression``;
        otherwise build a noisy two-center blob dataset with
        ``make_blobs``.
    """
    self.n_samples = int(1e5)
    self.chunk_size = int(1e4)
    # Number of dask chunks implied by the sample count / chunk size.
    self.n_chunks = np.ceil(self.n_samples / self.chunk_size).astype(int)

    # Keyword arguments shared by both dataset generators.
    shared_kwargs = dict(
        n_samples=self.n_samples,
        chunks=self.chunk_size,
        random_state=0,
        n_features=40,
    )
    if reg:
        self.x, self.y = make_regression(**shared_kwargs)
    else:
        self.x, self.y = make_blobs(centers=2, cluster_std=100, **shared_kwargs)
    return self
from functools import partial
import numpy as np
import pytest
import sklearn.cluster
from dask_ml import metrics
from dask_ml.cluster import SpectralClustering
from dask_ml.datasets import make_blobs

# Module-level fixture: a small dask array of blobs split into two
# 100-row chunks, shared by the tests below.
X, y = make_blobs(n_samples=200, chunks=100, random_state=0)


@pytest.mark.parametrize("as_ndarray", [False, True])
@pytest.mark.parametrize("persist_embedding", [True, False])
def test_basic(as_ndarray, persist_embedding):
    """Smoke test: fitting assigns one label per sample.

    Runs over both a dask array and an in-memory ndarray input, and
    with persist_embedding on and off.
    """
    sc = SpectralClustering(
        n_components=25, random_state=0, persist_embedding=persist_embedding
    )
    if as_ndarray:
        # Materialize the dask array to exercise the ndarray code path.
        X_ = X.compute()
    else:
        X_ = X
    sc.fit(X_)
    assert len(sc.labels_) == len(X_)


@pytest.mark.parametrize(
    "assign_labels", [sklearn.cluster.KMeans(n_init=2), "sklearn-kmeans"]
)
# NOTE(review): this definition continues beyond the visible chunk; body
# intentionally left untouched here.
def test_sklearn_kmeans(assign_labels):