def test_score(nrows, ncols, nclusters, n_parts, input_type, client):
    """Compare the distributed KMeans score with the combined model's score.

    Blobs are generated with ``make_blobs``, a dask KMeans model is fitted
    on them, and the score it reports must be within 1e-3 of the score
    that the model returned by ``get_combined_model()`` reports on the
    computed (non-distributed) training data.
    """
    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        n_parts=n_parts,
        cluster_std=0.01,
        shuffle=False,
        random_state=10,
    )

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
        y = y_train
    elif input_type == "array":
        X_train, y_train = X, y

    dist_model = cumlKMeans(
        init="k-means||", n_clusters=nclusters, random_state=10
    )
    dist_model.fit(X_train)
    actual_score = dist_model.score(X_train)

    # Reference value: the same data scored by the combined model.
    combined = dist_model.get_combined_model()
    expected_score = combined.score(X_train.compute())

    difference = abs(actual_score - expected_score)
    assert difference < 1e-3
def test_predict_proba(dataset, datatype, n_neighbors, n_parts, batch_size,
                       client):
    """Compare class probabilities of the local and distributed KNN models.

    A local classifier (``lKNNClf``) and a distributed one (``dKNNClf``)
    are fitted on the same training data; their ``predict_proba`` outputs
    on the test data are handed to ``check_probabilities``.

    Note: ``batch_size`` and the ``y_test`` element of ``dataset`` are
    accepted but not used by this test.
    """
    X_train, X_test, y_train, y_test = dataset

    # Local reference model.
    reference = lKNNClf(n_neighbors=n_neighbors)
    reference.fit(X_train, y_train)
    l_probas = reference.predict_proba(X_test)

    # Distribute the inputs.
    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    as_cudf = datatype == 'dask_cudf'
    if as_cudf:
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    distributed = dKNNClf(client=client, n_neighbors=n_neighbors)
    distributed.fit(X_train, y_train)
    d_probas = distributed.predict_proba(X_test, convert_dtype=True)
    d_probas = da.compute(d_probas)[0]

    if as_cudf:
        # DataFrames become matrices; anything else is turned into an
        # array with a trailing axis appended.
        d_probas = [
            item.as_matrix() if isinstance(item, DataFrame)
            else item.to_array()[..., np.newaxis]
            for item in d_probas
        ]

    check_probabilities(l_probas, d_probas)
def test_extract_partitions_shape(nrows, ncols, n_parts, input_type,
                                  colocated, client):
    """Check that extracted partitions keep the row counts of the input.

    The per-partition lengths of the dask inputs are recorded, then the
    partitions gathered through ``DistributedDataHandler.create`` are
    compared against those lengths, one by one. With ``colocated`` set,
    ``(X, y)`` is passed as a tuple and both members of every part are
    checked; otherwise only ``X`` is passed.
    """
    is_frame_like = input_type in ("dataframe", "series")

    X_arr, y_arr = make_blobs(n_samples=nrows, n_features=ncols,
                              n_parts=n_parts)

    if is_frame_like:
        X = to_dask_cudf(X_arr)
        y = to_dask_cudf(y_arr)
    elif input_type == "array":
        X, y = X_arr, y_arr

    if input_type == "series":
        # Only X is narrowed to its first column; y is left as converted.
        X = X[X.columns[0]]

    if is_frame_like:
        X_len_parts = X.map_partitions(len).compute()
        y_len_parts = y.map_partitions(len).compute()
    elif input_type == "array":
        X_len_parts = X.chunks[0]
        y_len_parts = y.chunks[0]

    if colocated:
        ddh = DistributedDataHandler.create((X, y), client)
        parts = [future.result() for _worker, future in ddh.gpu_futures]
        for i, part in enumerate(parts):
            assert (part[0].shape[0] == X_len_parts[i]) and (
                part[1].shape[0] == y_len_parts[i])
    else:
        ddh = DistributedDataHandler.create(X, client)
        parts = [future.result() for _worker, future in ddh.gpu_futures]
        for i, part in enumerate(parts):
            assert part.shape[0] == X_len_parts[i]
def test_extract_partitions_worker_list(nrows, ncols, n_parts, input_type,
                                        colocated, cluster):
    """Check that the data handler exposes one future per partition.

    A client is opened on ``cluster`` for the duration of the test and
    always closed afterwards. The number of entries in
    ``ddh.gpu_futures`` must equal ``n_parts``.
    """
    client = Client(cluster)

    try:
        wants_cudf = input_type in ("dataframe", "series")

        X_arr, y_arr = make_blobs(n_samples=int(nrows), n_features=ncols,
                                  n_parts=n_parts)

        if wants_cudf:
            X = to_dask_cudf(X_arr)
            y = to_dask_cudf(y_arr)
        elif input_type == "array":
            X, y = X_arr, y_arr

        if input_type == "series":
            X = X[X.columns[0]]

        # Colocated mode hands X and y over together as a tuple.
        handler_input = (X, y) if colocated else X
        ddh = DistributedDataHandler.create(handler_input, client)

        # Second element of each gpu_futures entry is the part itself.
        parts = [entry[1] for entry in ddh.gpu_futures]
        assert len(parts) == n_parts

    finally:
        client.close()
def test_transform(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Check the shape and content of distributed KMeans.transform output.

    Blobs are generated and a dask KMeans model is fitted on them. The
    transformed data must have shape ``(nrows, nclusters)`` (a flat
    ``(nrows,)`` result is also accepted when ``nclusters == 1``), and
    the argmin over its columns is compared with the generated labels
    through ``sk_adjusted_rand_score``.

    A client is opened on ``cluster`` for the test and closed afterwards.
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)
        y = y.astype('int64')

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            X_train, y_train = X, y
            labels = cp.squeeze(y_train.compute())

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        xformed = cumlModel.transform(X_train).compute()
        if input_type == "dataframe":
            # One-dimensional results are wrapped directly; anything else
            # is converted through as_gpu_matrix() first.
            xformed = cp.array(xformed
                               if len(xformed.shape) == 1
                               else xformed.as_gpu_matrix())

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows,)]
        else:
            assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(
            xformed.reshape((int(nrows), int(nclusters))), axis=1)

        # NOTE(review): this asserts only that the score is truthy
        # (non-zero); confirm whether a comparison against an expected
        # value was intended.
        assert sk_adjusted_rand_score(cp.asnumpy(labels),
                                      cp.asnumpy(xformed_labels))

    finally:
        # Client(cluster) itself may have raised, leaving client as None;
        # closing unconditionally would replace that error with an
        # AttributeError.
        if client is not None:
            client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
    """Fit and predict with distributed KMeans and verify the labelling.

    Checks, in order: the number of output partitions/chunks, the number
    of predicted rows, the range of the predicted labels
    (``0 .. nclusters - 1``) and that ``adjusted_rand_score`` between the
    generated and the predicted labels is exactly 1.0.
    """
    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        n_parts=n_parts,
        cluster_std=0.01,
        random_state=10,
    )

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train, y_train = X, y

    kmeans = cumlKMeans(
        init="k-means||", n_clusters=nclusters, random_state=10
    )
    kmeans.fit(X_train)
    cumlLabels = kmeans.predict(X_train, delayed=delayed_predict)

    n_workers = len(list(client.has_what().keys()))

    # Verifying we are grouping partitions. This should be changed soon.
    parts_len = n_parts if n_parts is not None else n_workers

    if input_type == "dataframe":
        assert cumlLabels.npartitions == parts_len
        cumlPred = cumlLabels.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(cumlLabels.chunks[0]) == parts_len
        cumlPred = cp.array(cumlLabels.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)
    print(str(score))

    assert 1.0 == score
def test_predict_and_score(dataset, datatype, n_neighbors, n_parts,
                           batch_size, client):
    """Compare local and distributed KNN regression outputs and scores.

    The local model's predictions, indices and distances must pass
    ``exact_match`` against the distributed model's. The rounded local
    r2 score is then compared (``abs=1e-2``) with the distributed score:
    taken from ``d_model.score`` for ``'dask_array'`` inputs, and
    recomputed by hand with ``r2_score`` for every other datatype.
    """
    X_train, X_test, y_train, y_test = dataset
    # Keep the undistributed targets for the hand-computed score below.
    np_y_test = y_test

    # --- local reference -------------------------------------------------
    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_distances, l_indices = l_model.kneighbors(X_test)
    l_outputs = l_model.predict(X_test)
    local_out = (l_outputs, l_indices, l_distances)
    handmade_local_score = round(float(r2_score(y_test, l_outputs)), 3)

    # --- distribute the inputs --------------------------------------------
    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    is_array = datatype == 'dask_array'
    is_cudf = datatype == 'dask_cudf'

    if is_cudf:
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    # --- distributed model --------------------------------------------------
    d_model = dKNNReg(client=client, n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = d_model.predict(
        X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if is_array:
        distributed_score = d_model.score(X_test, y_test)
        distributed_score = round(float(distributed_score), 3)

    if is_cudf:
        distributed_out = [
            item.as_matrix() if isinstance(item, DataFrame)
            else item.to_array()[..., np.newaxis]
            for item in distributed_out
        ]

    exact_match(local_out, distributed_out)

    if is_array:
        assert distributed_score == pytest.approx(handmade_local_score,
                                                  abs=1e-2)
    else:
        y_pred = distributed_out[0]
        handmade_distributed_score = round(
            float(r2_score(np_y_test, y_pred)), 3)
        assert handmade_distributed_score == pytest.approx(
            handmade_local_score, abs=1e-2)
def test_pca_fit(nrows, ncols, n_parts, input_type, cluster):
    """Compare fitted attributes of distributed PCA with scikit-learn PCA.

    Both models are fitted with ``n_components=5`` and ``whiten=True`` on
    the same blobs; ``singular_values_``, ``components_``,
    ``explained_variance_`` and ``explained_variance_ratio_`` must agree
    according to ``array_equal`` with tolerance 1e-1. Signs are ignored
    for ``components_`` only.

    A client is opened on ``cluster`` for the test and always closed.
    """
    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        # A failure here is deliberately left to propagate: printing and
        # continuing would only make the test fail further down with an
        # unrelated NameError/AttributeError on ``cupca``.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_train)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X_cpu)

        from cuml.test.utils import array_equal

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

        for attr in all_attr:
            # Component vectors are compared without regard to sign.
            with_sign = attr != 'components_'
            cuml_res = getattr(cupca, attr)
            # NOTE(review): numpy.ndarray has no ``as_matrix`` method, so
            # this branch would raise if it were ever taken; the type
            # check looks like it was meant for a different container.
            # Left unchanged pending confirmation.
            if type(cuml_res) == np.ndarray:
                cuml_res = cuml_res.as_matrix()
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
    finally:
        client.close()
def test_pca_fit(data_info, input_type, client):
    """Compare distributed TruncatedSVD attributes with scikit-learn's.

    ``data_info`` unpacks to ``(nrows, ncols, n_parts)``. For the
    ``nrows == int(9e6)`` case on a GPU with less than 48 units of
    ``pytest.max_gpu_memory``, the problem is either scaled down (when
    ``pytest.adapt_stress_test`` is set) or the test is skipped.

    Both models are fitted with ``n_components=5``. ``singular_values_``
    is compared with tolerance 1, the other attributes with 1e-1; signs
    are ignored for ``components_`` only.
    """
    nrows, ncols, n_parts = data_info

    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            # The two literals are concatenated, so the separating space
            # has to be part of the first one.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10,
                      dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        # Component vectors are compared without regard to sign.
        with_sign = attr != 'components_'
        cuml_res = getattr(cutsvd, attr)
        # NOTE(review): numpy.ndarray has no ``to_numpy`` method, so this
        # branch would raise if it were ever taken; the type check looks
        # like it was meant for a different container. Left unchanged
        # pending confirmation.
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.to_numpy()
        skl_res = getattr(sktsvd, attr)
        # Singular values get a looser tolerance than the other attributes.
        tolerance = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tolerance,
                           with_sign=with_sign)
def test_pca_fit(data_info, input_type, cluster):
    """Compare distributed TruncatedSVD attributes with scikit-learn's.

    ``data_info`` unpacks to ``(nrows, ncols, n_parts)``. Both models are
    fitted with ``n_components=5`` while a client on ``cluster`` is open;
    the client is closed before the fitted attributes are compared.
    ``singular_values_`` is compared with tolerance 1, the remaining
    attributes with 1e-1; signs are ignored for ``components_`` only.
    """
    client = Client(cluster)
    nrows, ncols, n_parts = data_info

    try:
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(
            n_samples=nrows,
            n_features=ncols,
            centers=1,
            n_parts=n_parts,
            cluster_std=0.5,
            random_state=10,
            dtype=np.float32,
        )

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_train)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X_cpu)

        all_attr = [
            'singular_values_',
            'components_',
            'explained_variance_',
            'explained_variance_ratio_',
        ]

    finally:
        client.close()

    # The comparisons below run after the client has been closed.
    for name in all_attr:
        with_sign = name not in ['components_']
        cuml_res = getattr(cutsvd, name)
        # NOTE(review): numpy.ndarray has no ``as_matrix`` method, so this
        # branch would raise if it were ever taken - confirm the intended
        # type check.
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sktsvd, name)
        if name == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def _check_input_fit(self, X, is_categories=False):
    """Normalise the input of fit for the multi-gpu model.

    Dask arrays and cupy arrays mark the input type as ``'array'``, are
    transposed first when ``is_categories`` is true, and are then
    converted: a cupy array into a ``DataFrame``, anything else through
    ``to_dask_cudf`` using ``self.client``. Any other input marks the
    input type as ``'df'`` and is returned untouched.
    """
    array_types = (dask.array.core.Array, cp.ndarray)

    if not isinstance(X, array_types):
        self._set_input_type('df')
        return X

    self._set_input_type('array')

    if is_categories:
        X = X.transpose()

    if isinstance(X, cp.ndarray):
        return DataFrame(X)
    return to_dask_cudf(X, client=self.client)
def test_score(dataset, datatype, n_neighbors, n_parts, client):
    """Check the distributed KNN model's score against a manual score.

    The value returned by ``d_model.score`` must equal the result of
    ``accuracy_score`` applied to the computed test targets and the
    model's computed predictions.
    """
    X_train, X_test, y_train, y_test = dataset

    if not n_parts:
        # Fall back to one partition per entry reported by the client.
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    is_cudf = datatype == 'dask_cudf'

    if is_cudf:
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    d_model = dKNNReg(client=client, n_neighbors=n_neighbors)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = d_model.predict(
        X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if is_cudf:
        distributed_out = [
            item.as_matrix() if isinstance(item, DataFrame)
            else item.to_array()[..., np.newaxis]
            for item in distributed_out
        ]

    cuml_score = d_model.score(X_test, y_test)

    # Materialise the targets in a form comparable with the predictions.
    y_test = y_test.compute()
    if is_cudf:
        y_test = y_test.as_matrix()

    manual_score = accuracy_score(y_test, distributed_out[0])

    assert cuml_score == manual_score
def test_predict(dataset, datatype, n_neighbors, n_parts, batch_size, client):
    """Compare local and distributed KNN regression predictions.

    The local model's predictions, indices and distances are passed to
    ``match_test`` together with the distributed model's, and the
    ``accuracy_score`` between the two sets of predictions must exceed
    0.12.
    """
    X_train, X_test, y_train, y_test = dataset

    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_distances, l_indices = l_model.kneighbors(X_test)
    l_outputs = l_model.predict(X_test)
    local_out = (l_outputs, l_indices, l_distances)

    if not n_parts:
        # Fall back to one partition per entry reported by the client.
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    d_model = dKNNReg(client=client, n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = \
        d_model.predict(X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if datatype == 'dask_cudf':
        distributed_out = [
            o.as_matrix() if isinstance(o, DataFrame)
            else o.to_array()[..., np.newaxis]
            for o in distributed_out
        ]

    match_test(local_out, distributed_out)
    # Without ``assert`` the comparison is evaluated and thrown away, so
    # the threshold would never actually be enforced.
    assert accuracy_score(local_out[0], distributed_out[0]) > 0.12
def test_predict_and_score(dataset, datatype, parameters, client):
    """Compare local and distributed KNN regression predictions and score.

    ``parameters`` unpacks to ``(n_neighbors, n_parts, batch_size)``.
    The computed distributed predictions must pass ``exact_match``
    against the local ones, and the rounded distributed score must be
    within 1e-2 of the rounded local r2 score.
    """
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset

    # Local reference model and its hand-computed score.
    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_outputs = l_model.predict(X_test)
    handmade_local_score = round(float(r2_score(y_test, l_outputs)), 3)

    # Distribute the inputs.
    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    d_model = dKNNReg(client=client, n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)

    d_outputs = d_model.predict(X_test, convert_dtype=True).compute()
    if isinstance(d_outputs, DataFrame):
        d_outputs = d_outputs.to_numpy()

    exact_match(l_outputs, d_outputs)

    distributed_score = round(float(d_model.score(X_test, y_test)), 3)
    assert distributed_score == pytest.approx(handmade_local_score, abs=1e-2)
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Check distributed KMeans.score against a hand-computed value.

    After fitting, every sample's squared distance to the centre of its
    predicted cluster is accumulated. The negated total must lie within
    ``SCORE_EPS`` of the score reported by the model.

    A client is opened on ``cluster`` for the test and closed afterwards.
    """
    client = None

    try:
        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        # Bring samples, predictions and centres into indexable arrays.
        if input_type == "dataframe":
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)
            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):
            sample = X[idx]
            center = centers[label]

            dist = cp.sqrt(cp.sum((sample - center)**2))
            expected_score += dist**2

        # The hand-computed total is negated before being compared with
        # the model's score.
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        # Client(cluster) itself may have raised, leaving client as None;
        # closing unconditionally would replace that error with an
        # AttributeError.
        if client is not None:
            client.close()