def test_score(nrows, ncols, nclusters, n_parts, input_type, client):
    """Compare the distributed KMeans score with the combined local model.

    Blobs are generated with a fixed seed, a dask KMeans is fitted on them,
    and the score reported by the distributed model is compared with the
    score that the model returned by ``get_combined_model()`` gives on the
    fully computed training data. The two must differ by less than 1e-3.

    ``input_type`` selects whether the data is fed as a dask-cudf
    dataframe ("dataframe") or left as returned by ``make_blobs``
    ("array"). ``client`` is accepted but not referenced in the body.
    """
    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        n_parts=n_parts,
        cluster_std=0.01,
        shuffle=False,
        random_state=10,
    )

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        # The converted labels are bound to both names, as in the original.
        y_train = y = to_dask_cudf(y)
    elif input_type == "array":
        X_train = X
        y_train = y

    dist_model = cumlKMeans(
        init="k-means||", n_clusters=nclusters, random_state=10
    )
    dist_model.fit(X_train)

    # Score through the distributed estimator first ...
    actual_score = dist_model.score(X_train)

    # ... then through the single-GPU model built from it.
    combined = dist_model.get_combined_model()
    expected_score = combined.score(X_train.compute())

    score_gap = abs(actual_score - expected_score)
    assert score_gap < 1e-3
def test_transform(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Check the shape and cluster assignment of ``KMeans.transform``.

    A dask client is opened on ``cluster``, a KMeans model is fitted on
    unshuffled blobs generated with a fixed seed, and the training data is
    transformed. The result must have shape ``(nrows, nclusters)`` (or
    ``(nrows,)`` when there is a single cluster), and the argmin over each
    transformed row is compared with the generated labels through the
    adjusted Rand score.

    ``input_type`` selects dask-cudf dataframes ("dataframe") or the
    objects returned by ``make_blobs`` as-is ("array").
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)
        y = y.astype('int64')

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            X_train, y_train = X, y
            labels = cp.squeeze(y_train.compute())

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        xformed = cumlModel.transform(X_train).compute()
        if input_type == "dataframe":
            xformed = cp.array(xformed
                               if len(xformed.shape) == 1
                               else xformed.as_gpu_matrix())

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows,)]
        else:
            assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(
            xformed.reshape((int(nrows), int(nclusters))), axis=1)

        # NOTE(review): this only asserts that the score is truthy
        # (non-zero), not that it equals 1.0 - confirm that is intended.
        assert sk_adjusted_rand_score(cp.asnumpy(labels),
                                      cp.asnumpy(xformed_labels))

    finally:
        # Client(cluster) may have raised before `client` was rebound;
        # closing unconditionally would replace that error with an
        # AttributeError on None.
        if client is not None:
            client.close()
def test_weighted_kmeans(nrows, ncols, nclusters, n_parts, client):
    """Check that heavily weighted samples end up as the cluster centroids.

    Blobs are generated around widely spread random centers with a large
    ``cluster_std``. Every sample gets a tiny weight except one sample per
    label, which gets a weight of 5000. After fitting with these sample
    weights, each heavily weighted sample must lie within an L1 distance
    of 1.0 of its assigned center, and every other sample must be more
    than 1000.0 away from its assigned center.

    ``client`` is accepted but not referenced in the body.
    """
    cluster_std = 10000.0

    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    # Using fairly high variance between points in clusters
    wt = cp.full(nrows, 0.00001, dtype=cp.float64)

    bound = nclusters * 100000

    # Open the space really large
    centers = cp.random.uniform(-bound, bound, size=(nclusters, ncols))

    X_cudf, y = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=centers,
                           n_parts=n_parts,
                           cluster_std=cluster_std,
                           shuffle=False,
                           verbose=False,
                           random_state=10)

    # Materialize the labels once; they do not change between iterations,
    # so there is no need to recompute the dask collection per cluster.
    true_labels = cp.array(y.compute())

    # Choose one sample from each label and increase its weight
    # (argmax on the boolean mask picks the first sample with that label).
    for i in range(nclusters):
        wt[cp.argmax(true_labels == i).item()] = 5000.0

    cumlModel = cumlKMeans(verbose=0,
                           init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    chunk_parts = int(nrows / n_parts)
    sample_weights = da.from_array(wt, chunks=(chunk_parts, ))
    cumlModel.fit(X_cudf, sample_weight=sample_weights)

    X = X_cudf.compute()

    labels_ = cumlModel.predict(X_cudf).compute()
    cluster_centers_ = cumlModel.cluster_centers_

    for i in range(nrows):
        label = labels_[i]
        actual_center = cluster_centers_[label]

        diff = sum(abs(X[i] - actual_center))

        # The large weight should be the centroid
        if wt[i] > 1.0:
            assert diff < 1.0

        # Otherwise it should be pretty far away
        else:
            assert diff > 1000.0
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):
    """Fit and predict with dask KMeans and compare against true labels.

    A dask client is opened on ``cluster``, blobs are generated with a
    fixed seed, and a KMeans model is fitted and used to predict the
    training data (``delayed_predict`` is forwarded to ``predict``). The
    test checks the number of output partitions, the number and range of
    predicted labels, and that the adjusted Rand score against the
    generated labels is exactly 1.0.
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        score = adjusted_rand_score(labels, cp.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score

    finally:
        # Client(cluster) may have raised before `client` was rebound;
        # closing unconditionally would replace that error with an
        # AttributeError on None.
        if client is not None:
            client.close()
def test_transform(nrows, ncols, nclusters, n_parts, cluster):
    """Check the shape and cluster assignment of ``KMeans.transform``.

    A dask client is opened on ``cluster``, a KMeans model is fitted on
    unshuffled blobs generated with a fixed seed, and the training data is
    transformed. The result must have shape ``(nrows, nclusters)`` (or
    ``(nrows,)`` when there is a single cluster), and the argmin over each
    transformed row is compared with the generated labels through the
    adjusted Rand score.
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               shuffle=False, random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        labels = np.squeeze(y.compute().to_pandas().values)

        xformed = cumlModel.transform(X_cudf).compute()

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows, )]
        else:
            assert xformed.shape == (nrows, nclusters)

        xformed = cp.array(xformed
                           if len(xformed.shape) == 1
                           else xformed.as_gpu_matrix())

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(xformed.reshape(
            (int(nrows), int(nclusters))), axis=1)

        # NOTE(review): this only asserts that the score is truthy
        # (non-zero), not that it equals 1.0 - confirm that is intended.
        assert adjusted_rand_score(labels,
                                   cp.squeeze(xformed_labels.get()))

    finally:
        # Client(cluster) may have raised before `client` was rebound;
        # closing unconditionally would replace that error with an
        # AttributeError on None.
        if client is not None:
            client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
    """Fit and predict with dask KMeans and compare against true labels.

    Blobs are generated with a fixed seed, a KMeans model is fitted on
    them and used to predict the training data, with ``delayed_predict``
    passed as the ``delayed`` argument. The number of output partitions
    (dataframe input) or chunks along the first axis (array input) must
    equal ``n_parts`` when it is given, otherwise the number of workers
    known to ``client``. Predicted labels must cover ``0 .. nclusters-1``
    and reach an adjusted Rand score of exactly 1.0 against the generated
    labels.
    """
    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        n_parts=n_parts,
        cluster_std=0.01,
        random_state=10,
    )

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train = X
        y_train = y

    model = cumlKMeans(
        init="k-means||", n_clusters=nclusters, random_state=10
    )
    model.fit(X_train)
    predicted = model.predict(X_train, delayed=delayed_predict)

    n_workers = len(client.has_what().keys())

    # Verifying we are grouping partitions. This should be changed soon.
    parts_len = n_workers if n_parts is None else n_parts

    if input_type == "dataframe":
        assert predicted.npartitions == parts_len
        cumlPred = predicted.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(predicted.chunks[0]) == parts_len
        cumlPred = cp.array(predicted.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)
    print(str(score))
    assert 1.0 == score
def test_score(nrows, ncols, nclusters, n_parts, cluster):
    """Compare ``KMeans.score`` with a hand-computed sum of squared distances.

    A dask client is opened on ``cluster`` and a KMeans model is fitted on
    unshuffled blobs generated with a fixed seed. The expected score is
    accumulated sample by sample as the squared Euclidean distance between
    each point and the center of its predicted cluster. The negated
    expected value must lie within ``SCORE_EPS`` of the score reported by
    the model.
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               shuffle=False, random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        actual_score = cumlModel.score(X_cudf)

        X = cp.array(X_cudf.compute().as_gpu_matrix())

        predictions = cumlModel.predict(X_cudf).compute()
        predictions = cp.array(predictions)

        centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())

        expected_score = 0
        for idx, label in enumerate(predictions):
            # Distinct names here so the blob labels `y` are not shadowed.
            point = X[idx]
            center = centers[label]

            dist = np.sqrt(np.sum((point - center)**2))
            expected_score += dist**2

        # The model score is compared against the negated accumulated sum.
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        # Client(cluster) may have raised before `client` was rebound;
        # closing unconditionally would replace that error with an
        # AttributeError on None.
        if client is not None:
            client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):
    """Fit and predict with dask KMeans and compare against true labels.

    A dask client is opened on ``cluster`` and closed when the test ends.
    Blobs are generated with a fixed seed, a KMeans model is fitted and
    used to predict the training data. The test checks the number of
    output partitions, the number and range of predicted labels, and that
    the adjusted Rand score against the generated labels is exactly 1.0.
    """
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(
            nrows, ncols, nclusters, n_parts,
            cluster_std=0.01,
            verbose=True,
            random_state=10,
        )
        wait(X_cudf)

        model = cumlKMeans(
            verbose=1,
            init="k-means||",
            n_clusters=nclusters,
            random_state=10,
        )
        model.fit(X_cudf)
        predicted = model.predict(X_cudf)

        n_workers = len(client.has_what().keys())

        # Verifying we are grouping partitions. This should be changed soon.
        use_requested_parts = n_parts is not None and n_parts < n_workers
        expected_parts = n_parts if use_requested_parts else n_workers
        assert predicted.npartitions == expected_parts

        from sklearn.metrics import adjusted_rand_score

        cumlPred = predicted.compute().to_pandas().values

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = y.compute().to_pandas().values
        flat_labels = labels.reshape(labels.shape[0])

        score = adjusted_rand_score(flat_labels, cumlPred)

        assert 1.0 == score

    finally:
        client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, client=None):
    """Compare cuML dask KMeans predictions with dask-ml KMeans.

    Both models are fitted on the same blobs (generated with a fixed seed)
    and their predictions on the training data must reach an adjusted
    Rand score of exactly 1.0 against each other.

    When ``client`` is None the test creates its own ``LocalCUDACluster``
    and ``Client`` and closes both before returning, including when the
    body raises. A client passed in by the caller is left open.
    """
    owns_cluster = client is None
    cluster = None

    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from dask_ml.cluster import KMeans as dmlKMeans

        from cuml.test.dask.utils import dask_make_blobs

        X_df, X_cudf = dask_make_blobs(nrows, ncols, nclusters, n_parts,
                                       cluster_std=0.1, verbose=True,
                                       random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        daskmlModel1 = dmlKMeans(init="k-means||", n_clusters=nclusters,
                                 random_state=10)

        cumlModel.fit(X_cudf)
        daskmlModel1.fit(X_df)

        cumlLabels = cumlModel.predict(X_cudf)
        daskmlLabels1 = daskmlModel1.predict(X_df)

        from sklearn.metrics import adjusted_rand_score

        cumlPred = cumlLabels.compute().to_pandas().values
        daskmlPred1 = daskmlLabels1.compute()

        score = adjusted_rand_score(cumlPred, daskmlPred1)

    finally:
        # Release only what this test created, and do it even when the
        # body raised, so a failing run does not leak the local cluster.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()

    assert 1.0 == score
def test_transform(nrows, ncols, nclusters, n_parts, cluster):
    """Check the shape and cluster assignment of ``KMeans.transform``.

    A dask client is opened on ``cluster`` and closed when the test ends.
    A KMeans model is fitted on blobs generated with a fixed seed and the
    training data is transformed. The result must have shape
    ``(nrows, nclusters)``, and the argmin over each transformed row is
    compared with the generated labels through the adjusted Rand score.
    """
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(
            nrows, ncols, nclusters, n_parts,
            cluster_std=0.01,
            verbose=True,
            random_state=10,
        )
        wait(X_cudf)

        model = cumlKMeans(
            verbose=0,
            init="k-means||",
            n_clusters=nclusters,
            random_state=10,
        )
        model.fit(X_cudf)

        raw_labels = y.compute().to_pandas().values
        labels = raw_labels.reshape(raw_labels.shape[0])

        xformed = model.transform(X_cudf).compute()

        assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        distances = xformed.to_pandas().to_numpy()
        xformed_labels = np.argmin(distances, axis=1)

        from sklearn.metrics import adjusted_rand_score

        # As in the original, the score is only asserted to be truthy.
        assert adjusted_rand_score(labels, xformed_labels)

    finally:
        client.close()
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Compare ``KMeans.score`` with a hand-computed sum of squared distances.

    A dask client is opened on ``cluster`` and a KMeans model is fitted on
    unshuffled blobs generated with a fixed seed. The expected score is
    accumulated sample by sample as the squared Euclidean distance between
    each point and the center of its predicted cluster. The negated
    expected value must lie within ``SCORE_EPS`` of the score reported by
    the model.

    ``input_type`` selects dask-cudf dataframes ("dataframe") or the
    objects returned by ``make_blobs`` as-is ("array").
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        if input_type == "dataframe":
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)

            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):
            # Distinct names here so the blob labels `y` are not shadowed.
            point = X[idx]
            center = centers[label]

            dist = cp.sqrt(cp.sum((point - center)**2))
            expected_score += dist**2

        # The model score is compared against the negated accumulated sum.
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        # Client(cluster) may have raised before `client` was rebound;
        # closing unconditionally would replace that error with an
        # AttributeError on None.
        if client is not None:
            client.close()