def test_n_init_cluster_consistency(random_state):

    nclusters = 8
    X, y = get_data_consistency_test()

    cuml_kmeans = cuml.KMeans(verbose=0, init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    initial_clusters = cuml_kmeans.cluster_centers_

    cuml_kmeans = cuml.KMeans(verbose=0, init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    assert array_equal(initial_clusters, cuml_kmeans.cluster_centers_)
def test_n_init_cluster_consistency(random_state):

    cluster_std = 1.0
    nrows = 100000
    ncols = 100
    nclusters = 8

    X, y = make_blobs(nrows, ncols, nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(verbose=0, init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    initial_clusters = cuml_kmeans.cluster_centers_

    cuml_kmeans = cuml.KMeans(verbose=0, init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    assert array_equal(initial_clusters, cuml_kmeans.cluster_centers_)
def test_score(nrows, ncols, nclusters):

    X, y = make_blobs(nrows, ncols, nclusters,
                      cluster_std=0.01,
                      random_state=10)

    cuml_kmeans = cuml.KMeans(verbose=1, init="k-means||",
                              n_clusters=nclusters,
                              random_state=10)

    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)

    centers = cp.array(cuml_kmeans.cluster_centers_.as_gpu_matrix())

    expected_score = 0
    for idx, label in enumerate(predictions):
        x = X[idx]
        y = centers[label]

        dist = np.sqrt(np.sum((x - y)**2))
        expected_score += dist**2

    assert actual_score + SCORE_EPS \
        >= (-1 * expected_score) \
        >= actual_score - SCORE_EPS
def test_rand_index_score(name, nrows):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    cu_score = cu_ars(y, cu_y_pred)
    cu_score_using_sk = sk_ars(y, cu_y_pred)

    assert array_equal(cu_score, cu_score_using_sk)
def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):

    # Using fairly high variance between points in clusters
    cluster_std = 1.0
    np.random.seed(random_state)

    # Set weight per sample to be from 1 to max_weight
    wt = np.random.randint(1, high=max_weight, size=nrows)

    X, y = make_blobs(nrows, ncols, nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X, sample_weight=wt)
    cu_score = cuml_kmeans.score(X)

    sk_kmeans = cluster.KMeans(random_state=random_state,
                               n_clusters=nclusters)
    sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt)
    sk_score = sk_kmeans.score(cp.asnumpy(X))

    assert abs(cu_score - sk_score) <= cluster_std * 1.5
def test_kmeans_sequential_plus_plus_init(nrows, ncols, nclusters,
                                          random_state):

    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    X, y = make_blobs(nrows, ncols, nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(verbose=0, init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    cu_score = cuml_kmeans.score(X)

    kmeans = cluster.KMeans(random_state=random_state,
                            n_clusters=nclusters)
    kmeans.fit(X.copy_to_host())
    sk_score = kmeans.score(X.copy_to_host())

    assert abs(cu_score - sk_score) <= cluster_std * 1.5
def test_kmeans_sklearn_comparison_default(name, nrows):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'],
                              random_state=12,
                              n_init=10,
                              output_type='numpy')

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)

    kmeans = cluster.KMeans(random_state=12,
                            n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2
def test_score(nrows, ncols, nclusters):

    X, y = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=10,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)

    centers = cuml_kmeans.cluster_centers_

    expected_score = 0
    for idx, label in enumerate(predictions):
        x = X[idx]
        y = cp.array(centers[label])

        dist = cp.sqrt(cp.sum((x - y)**2))
        expected_score += dist**2

    assert actual_score + SCORE_EPS \
        >= (-1 * expected_score) \
        >= actual_score - SCORE_EPS
def test_traditional_kmeans_plus_plus_init(nrows, ncols, nclusters,
                                           random_state):

    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    X, y = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)
    cu_score = cuml_kmeans.score(X)

    kmeans = cluster.KMeans(random_state=random_state,
                            n_clusters=nclusters)
    kmeans.fit(cp.asnumpy(X))
    sk_score = kmeans.score(cp.asnumpy(X))

    assert abs(cu_score - sk_score) <= cluster_std * 1.5
def test_kmeans_sklearn_comparison_default(name, nrows):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)

    kmeans = cluster.KMeans(random_state=12,
                            n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    # cuML score should be in a close neighborhood around scikit-learn's
    assert sk_score - 0.03 <= cu_score <= sk_score + 0.03
def test_score(nrows, ncols, nclusters, random_state):

    X, y = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')

    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)

    centers = cuml_kmeans.cluster_centers_

    expected_score = 0.0
    for idx, label in enumerate(predictions):
        x = X[idx, :]
        y = cp.array(centers[label, :], dtype=cp.float32)

        sq_euc_dist = cp.sum(cp.square((x - y)))
        expected_score += sq_euc_dist

    expected_score *= -1

    cp.testing.assert_allclose(actual_score, expected_score,
                               atol=0.1, rtol=1e-5)
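# Aside (added note, not from the original test): the loop above recomputes the
# negative sum of squared distances to the assigned centers by hand. Assuming
# the cuML estimator follows scikit-learn's convention that score(X) is the
# opposite of the k-means objective and exposes that objective as inertia_,
# an equivalent check would be:
#
#     cp.testing.assert_allclose(actual_score, -cuml_kmeans.inertia_,
#                                atol=0.1, rtol=1e-5)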
def re_cluster(self, gdf, new_figerprints=None, new_chembl_ids=None):
    if gdf.shape[0] == 0:
        return None

    # Before reclustering remove all columns that may interfere
    ids = gdf['id']
    chembl_ids = gdf['chembl_id']

    gdf.drop(['x', 'y', 'cluster', 'id', 'chembl_id'], axis=1, inplace=True)
    if new_figerprints is not None and new_chembl_ids is not None:
        # Add new fingerprints and ChEMBL ids before reclustering
        if self.pca:
            new_figerprints = self.pca.transform(new_figerprints)

        if self.enable_gpu:
            fp_df = cudf.DataFrame(
                new_figerprints,
                index=[idx for idx in range(
                    self.orig_df.shape[0],
                    self.orig_df.shape[0] + len(new_figerprints))],
                columns=gdf.columns)
        else:
            fp_df = pandas.DataFrame(
                new_figerprints,
                index=[idx for idx in range(
                    self.orig_df.shape[0],
                    self.orig_df.shape[0] + len(new_figerprints))],
                columns=gdf.columns)

        gdf = gdf.append(fp_df, ignore_index=True)
        # Update the original dataframe so it works on reload
        fp_df['id'] = fp_df.index
        self.orig_df = self.orig_df.append(fp_df, ignore_index=True)
        chembl_ids = chembl_ids.append(
            cudf.Series(new_chembl_ids), ignore_index=True)
        ids = ids.append(fp_df['id'], ignore_index=True)

        self.chembl_ids.extend(new_chembl_ids)

        del fp_df

    if self.enable_gpu:
        kmeans_float = cuml.KMeans(n_clusters=self.n_clusters)
    else:
        kmeans_float = sklearn.cluster.KMeans(n_clusters=self.n_clusters)
    kmeans_float.fit(gdf)

    Xt = self.umap.fit_transform(gdf)

    # Add back the columns required for plotting and for correlating data
    # between re-clusterings
    if self.enable_gpu:
        gdf.add_column('x', Xt[0].to_array())
        gdf.add_column('y', Xt[1].to_array())
        gdf.add_column('cluster', kmeans_float.labels_.to_gpu_array())
        gdf.add_column('chembl_id', chembl_ids)
        gdf.add_column('id', ids)
    else:
        gdf['x'] = Xt[:, 0]
        gdf['y'] = Xt[:, 1]
        gdf['cluster'] = kmeans_float.labels_
        gdf['chembl_id'] = chembl_ids
        gdf['id'] = ids

    return gdf
def KMeans(data, cluster):
    warnings.filterwarnings('ignore')

    # Hand the torch tensor to cuDF via DLPack, cluster on the GPU with cuML,
    # then return the labels to torch the same way.
    data_pack = torch.utils.dlpack.to_dlpack(data)
    data_df = cudf.from_dlpack(data_pack)

    model = cuml.KMeans(n_clusters=cluster)
    result = model.fit(data_df)
    labels = torch.utils.dlpack.from_dlpack(result.labels_.to_dlpack())

    warnings.filterwarnings('once')
    return labels
def test_kmeans_sklearn_comparison(name, nrows):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X).to_array()

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred = kmeans.fit_predict(X)

        # Noisy circles clusters are rotated in the results; since we are
        # comparing the two, we only need to check that both clusterings
        # assign approximately the same number of points to each cluster.
        calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)

        score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3

        if name == 'noisy_circles':
            assert (calculation < 4e-3) and score_test

        else:
            if name == 'aniso':
                # aniso dataset border points tend to differ on the frontier
                # between clusters when compared to sklearn
                tol = 2e-2
            else:
                # We allow up to 5 points to be different for the other
                # datasets, to be robust to small behavior changes between
                # library versions. Visually it is very clear that the
                # algorithm works. An option to plot may be added in a
                # future version if desired.
                tol = 1e-2
            assert (clusters_equal(sk_y_pred, cu_y_pred,
                                   params['n_clusters'],
                                   tol=tol)) and score_test
def _query8(self):
    self._loadTables('query8')
    train = {}
    for i in range(12):
        train['c{}'.format(i)] = np.random.rand(1000)
    train = cudf.DataFrame(train)
    kmeans = cuml.KMeans(n_clusters=8)
    kmeans.fit(train)
    rideIndex = self._createIndex(
        self.rideTable,
        'ride.start',
    )
    locationFilter = self.locationTable[
        self.locationTable['loc.locationId'] == 0]
    locationPolygon = self._createBox(
        locationFilter,
        'loc.bounds',
    )
    startTime = time.time()
    (joinRide, numbaTime) = self._spatialJoinDist(self.rideTable,
                                                  locationFilter,
                                                  'ride.start',
                                                  'loc.bounds',
                                                  rideIndex,
                                                  locationPolygon,
                                                  0.0)
    featureName = [
        'ride.c0', 'ride.c1', 'ride.c2', 'ride.c3', 'ride.c4', 'ride.c5',
        'ride.c6', 'ride.c7', 'ride.c8', 'ride.c9', 'ride.c10', 'ride.c11'
    ]
    join0 = joinRide.merge(self.riderTable,
                           left_on='ride.riderId',
                           right_on='rider.riderId')
    groupRider = join0.groupby(['rider.riderId'])[featureName].mean()
    groupRider['cluster'] = kmeans.predict(groupRider)
    endTime = time.time()
    groupRider.to_csv(
        'query8_gpu.csv',
        index=False,
    )
    return endTime - startTime - numbaTime
def test_all_kmeans_params(n_rows, n_clusters, max_iter, init,
                           oversampling_factor, max_samples_per_batch):

    np.random.seed(0)
    X = np.random.rand(1000, 10)

    if init == 'preset':
        init = np.random.rand(n_clusters, 10)

    cuml_kmeans = cuml.KMeans(n_clusters=n_clusters,
                              max_iter=max_iter,
                              init=init,
                              oversampling_factor=oversampling_factor,
                              max_samples_per_batch=max_samples_per_batch)

    cuml_kmeans.fit_predict(X)
def test_kmeans_clusters_blobs(nrows, ncols, nclusters,
                               random_state, cluster_std):

    X, y = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=random_state)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')

    preds = cuml_kmeans.fit_predict(X)

    assert adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y)) >= 0.99
def test_all_kmeans_params(n_clusters, max_iter, init, oversampling_factor,
                           max_samples_per_batch, random_state):

    np.random.seed(0)
    X = np.random.rand(1000, 10)

    if init == 'preset':
        init = np.random.rand(n_clusters, 10)

    cuml_kmeans = cuml.KMeans(n_clusters=n_clusters,
                              max_iter=max_iter,
                              init=init,
                              random_state=random_state,
                              oversampling_factor=oversampling_factor,
                              max_samples_per_batch=max_samples_per_batch,
                              output_type='cupy')

    cuml_kmeans.fit_predict(X)
def test_kmeans_sklearn_comparison(name):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, 10000)

    params = default_base.copy()
    params.update(pat[1])

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    clustering_algorithms = (
        ('sk_Kmeans', kmeans),
        ('cuml_Kmeans', cuml_kmeans),
    )

    sk_y_pred, _ = fit_predict(clustering_algorithms[0][1],
                               clustering_algorithms[0][0], X)

    cu_y_pred, _ = fit_predict(clustering_algorithms[1][1],
                               clustering_algorithms[1][0], X)

    # Noisy circles clusters are rotated in the results; since we are
    # comparing the two, we only need to check that both clusterings
    # assign approximately the same number of points to each cluster.
    if name == 'noisy_circles':
        assert (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred) < 2e-3

    else:
        assert clusters_equal(sk_y_pred, cu_y_pred, params['n_clusters'])
def test_kmeans_sklearn_comparison(name, nrows):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

        # Noisy circles clusters are rotated in the results; since we are
        # comparing the two, we only need to check that both clusterings
        # assign approximately the same number of points to each cluster.
        calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)

        print(cuml_kmeans.score(X), kmeans.score(X))

        score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3

        if name == 'noisy_circles':
            assert (calculation < 2e-3) and score_test

        else:
            assert (clusters_equal(sk_y_pred, cu_y_pred,
                                   params['n_clusters'])) and score_test
import cuml
from cuml.test.utils import array_equal

import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import make_regression
import pickle
from sklearn.manifold import trustworthiness

regression_models = dict(
    LinearRegression=cuml.LinearRegression(),
    Lasso=cuml.Lasso(),
    Ridge=cuml.Ridge(),
    ElasticNet=cuml.ElasticNet()
)

solver_models = dict(CD=cuml.CD(), SGD=cuml.SGD(eta0=0.005))

cluster_models = dict(KMeans=cuml.KMeans())

decomposition_models = dict(
    PCA=cuml.PCA(),
    TruncatedSVD=cuml.TruncatedSVD(),
)

decomposition_models_xfail = dict(
    GaussianRandomProjection=cuml.GaussianRandomProjection(),
    SparseRandomProjection=cuml.SparseRandomProjection()
)

neighbor_models = dict(NearestNeighbors=cuml.NearestNeighbors())

dbscan_model = dict(DBSCAN=cuml.DBSCAN())

umap_model = dict(UMAP=cuml.UMAP())
regression_models = {
    "LinearRegression":
        lambda fit_intercept=True: cuml.LinearRegression(
            fit_intercept=fit_intercept),
    "Lasso":
        lambda fit_intercept=True: cuml.Lasso(fit_intercept=fit_intercept),
    "Ridge":
        lambda fit_intercept=True: cuml.Ridge(fit_intercept=fit_intercept),
    "ElasticNet":
        lambda fit_intercept=True: cuml.ElasticNet(
            fit_intercept=fit_intercept)
}

solver_models = {
    "CD": lambda: cuml.CD(),
    "SGD": lambda: cuml.SGD(eta0=0.005),
    "QN": lambda: cuml.QN(loss="softmax")
}

cluster_models = {"KMeans": lambda: cuml.KMeans()}

decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}

decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {"NearestNeighbors": lambda: cuml.NearestNeighbors()}

dbscan_model = {"DBSCAN": lambda: cuml.DBSCAN()}
    if enable_gpu:
        pca = cuml.PCA(n_components=pca_components)
    else:
        pca = sklearn.decomposition.PCA(n_components=pca_components)

    df_fingerprints = pca.fit_transform(df_fingerprints)
    print('Runtime PCA time (hh:mm:ss.ms) {}'.format(
        datetime.now() - task_start_time))
else:
    pca = False
    print('PCA has been skipped')

# KMeans
task_start_time = datetime.now()
n_clusters = 7
if enable_gpu:
    kmeans_float = cuml.KMeans(n_clusters=n_clusters)
else:
    kmeans_float = sklearn.cluster.KMeans(n_clusters=n_clusters)
kmeans_float.fit(df_fingerprints)
print('Runtime Kmeans time (hh:mm:ss.ms) {}'.format(
    datetime.now() - task_start_time))

# UMAP
task_start_time = datetime.now()
if enable_gpu:
    umap = cuml.UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
else:
    umap = umap.UMAP()

Xt = umap.fit_transform(df_fingerprints)
print('Runtime UMAP time (hh:mm:ss.ms) {}'.format(
    datetime.now() - task_start_time))
def _cluster(self, embedding, n_pca):
    """
    Generates a UMAP transformation and KMeans labels from the
    molecular fingerprints.
    """
    if hasattr(embedding, 'compute'):
        embedding = embedding.compute()

    embedding = embedding.reset_index()

    # Before reclustering remove all columns that may interfere
    embedding, prop_series = self._remove_non_numerics(embedding)
    self.n_molecules, n_obs = embedding.shape

    if self.context.is_benchmark:
        molecular_embedding_sample, spearman_index = \
            self._random_sample_from_arrays(embedding,
                                            n_samples=self.n_spearman)

    if n_pca and n_obs > n_pca:
        with MetricsLogger('pca', self.n_molecules) as ml:
            if self.pca is None:
                self.pca = cuml.PCA(n_components=n_pca)
                self.pca.fit(embedding)
            embedding = self.pca.transform(embedding)

    with MetricsLogger('kmeans', self.n_molecules) as ml:
        if self.n_molecules < MIN_RECLUSTER_SIZE:
            raise Exception(
                'Reclustering less than %d molecules is not supported.' %
                MIN_RECLUSTER_SIZE)

        kmeans_cuml = cuml.KMeans(n_clusters=self.n_clusters)
        kmeans_cuml.fit(embedding)
        kmeans_labels = kmeans_cuml.predict(embedding)

        ml.metric_name = 'silhouette_score'
        ml.metric_func = batched_silhouette_scores
        ml.metric_func_kwargs = {}
        ml.metric_func_args = (None, None)

        if self.context.is_benchmark:
            (embedding_sample, kmeans_labels_sample), _ = \
                self._random_sample_from_arrays(
                    embedding, kmeans_labels, n_samples=self.n_silhouette)
            ml.metric_func_args = (embedding_sample, kmeans_labels_sample)

    with MetricsLogger('umap', self.n_molecules) as ml:
        umap = cuml.manifold.UMAP()
        Xt = umap.fit_transform(embedding)

        ml.metric_name = 'spearman_rho'
        ml.metric_func = self._compute_spearman_rho
        ml.metric_func_args = (None, None)

        if self.context.is_benchmark:
            X_train_sample, _ = self._random_sample_from_arrays(
                embedding, index=spearman_index)
            ml.metric_func_args = (molecular_embedding_sample, X_train_sample)

    # Add back the columns required for plotting and for correlating data
    # between re-clusterings
    embedding['cluster'] = kmeans_labels
    embedding['x'] = Xt[0]
    embedding['y'] = Xt[1]

    # Add back the prop columns
    for col in prop_series.keys():
        embedding[col] = prop_series[col]

    return embedding
regression_models = {
    "LinearRegression": lambda: cuml.LinearRegression(),
    "Lasso": lambda: cuml.Lasso(),
    "Ridge": lambda: cuml.Ridge(),
    "ElasticNet": lambda: cuml.ElasticNet()
}

solver_models = {
    "CD": lambda: cuml.CD(),
    "SGD": lambda: cuml.SGD(eta0=0.005)
}

cluster_models = {
    "KMeans": lambda: cuml.KMeans()
}

decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}

decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {
    "NearestNeighbors": lambda: cuml.NearestNeighbors()
}
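# A minimal usage sketch (added here, not part of the original module): factory
# dictionaries like the ones above are typically iterated to build each
# estimator, fit it, round-trip it through pickle, and verify the restored
# model still predicts the same labels. The toy data, cluster count, and the
# use of predict() for the comparison are illustrative assumptions.
import pickle

import numpy as np
import cuml

X = np.random.rand(100, 5).astype(np.float32)

for name, build in {"KMeans": lambda: cuml.KMeans(n_clusters=3)}.items():
    model = build()
    model.fit(X)

    # Serialize and restore the fitted estimator, then compare predictions
    restored = pickle.loads(pickle.dumps(model))
    assert np.allclose(model.predict(X), restored.predict(X))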