def re_cluster(self, gdf, new_figerprints=None, new_chembl_ids=None):
    """Re-run KMeans + UMAP on a fingerprint dataframe.

    Drops the plotting/bookkeeping columns, optionally appends newly
    added fingerprints, then recomputes cluster labels and 2-D UMAP
    coordinates and re-attaches the columns needed by the visualizer.

    NOTE(review): the parameter name ``new_figerprints`` (sic) is kept
    as-is for backward compatibility with existing callers.

    :param gdf: cudf.DataFrame of fingerprints, carrying the columns
        'x', 'y', 'cluster', 'id' and 'chembl_id' from a previous run.
    :param new_figerprints: optional array-like of new fingerprints to
        append before re-clustering.
    :param new_chembl_ids: optional list of ChEMBL ids matching
        ``new_figerprints`` (both must be given together).
    :returns: the re-clustered cudf.DataFrame, or None when ``gdf`` is
        empty.
    """
    if gdf.shape[0] == 0:
        return None

    # Before reclustering remove all columns that may interfere.
    # (The unused ``ids = gdf['id']`` local from the previous revision
    # was removed; 'id' is rebuilt from the index below.)
    chembl_ids = gdf['chembl_id']
    gdf.drop(['x', 'y', 'cluster', 'id', 'chembl_id'], inplace=True)

    if new_figerprints is not None and new_chembl_ids is not None:
        # Append the new fingerprints and ChEMBL ids before reclustering
        # so they participate in both KMeans and UMAP.
        fp_df = cudf.DataFrame(new_figerprints, columns=gdf.columns)
        gdf = gdf.append(fp_df, ignore_index=True)
        chembl_ids = chembl_ids.append(
            cudf.Series(new_chembl_ids), ignore_index=True)

    kmeans_float = KMeans(n_clusters=self.n_clusters)
    kmeans_float.fit(gdf)
    Xt = self.umap.fit_transform(gdf)

    # Add back the columns required for plotting and for correlating data
    # between re-clustering runs.
    gdf.add_column('x', Xt[0].to_array())
    gdf.add_column('y', Xt[1].to_array())
    gdf.add_column('id', gdf.index)
    gdf.add_column('chembl_id', chembl_ids)
    gdf.add_column('cluster', kmeans_float.labels_.to_array())
    return gdf
def kmeans_fit(X):
    """Train a KMeans model on ``X``.

    Hyper-parameters come from the module-level ``params`` object and
    the precomputed ``X_init`` centroids; the fitted estimator is
    returned so callers can read labels / centers from it.
    """
    model = KMeans(
        n_clusters=params.n_clusters,
        tol=params.tol,
        max_iter=params.maxiter,
        init=X_init,
        max_samples_per_batch=params.samples_per_batch,
    )
    model.fit(X)
    return model
def _cluster(self, embedding):
    """Cluster molecule embeddings and attach 2-D plot coordinates.

    Pipeline: sparse random projection -> KMeans -> jittered x/y
    columns. Non-numeric property columns are set aside up front and
    re-attached at the end. Each stage runs inside a ``MetricsLogger``
    context; the ``ml.metric_*`` fields are assigned AFTER the measured
    call so the logger can compute quality metrics on exit — do not
    reorder these statements.

    :param embedding: dataframe of molecule embeddings plus property
        columns (schema defined by the caller — not visible here).
    :returns: the numeric embedding dataframe with added 'cluster',
        'x', 'y' columns and the original property columns restored.
    """
    logger.info('Computing cluster...')
    embedding = embedding.reset_index()
    n_molecules = embedding.shape[0]

    # Before reclustering remove all columns that may interfere
    embedding, prop_series = self._remove_non_numerics(embedding)

    with MetricsLogger('random_proj', n_molecules) as ml:
        # Sparse random projection of the raw embedding matrix.
        srp = self.srp_embedding.fit_transform(embedding.values)

        ml.metric_name = 'spearman_rho'
        ml.metric_func = self._compute_spearman_rho
        ml.metric_func_args = (embedding, embedding, srp)

    with MetricsLogger('kmeans', n_molecules) as ml:
        kmeans_cuml = KMeans(n_clusters=self.n_clusters)
        kmeans_cuml.fit(srp)
        kmeans_labels = kmeans_cuml.predict(srp)

        ml.metric_name = 'silhouette_score'
        ml.metric_func = batched_silhouette_scores
        ml.metric_func_kwargs = {}
        # Default (None, None) means "no metric inputs" outside
        # benchmark mode; benchmark runs score a random subsample of
        # size self.n_silhouette to bound the silhouette cost.
        ml.metric_func_args = (None, None)
        if self.context.is_benchmark:
            (srp_sample, kmeans_labels_sample), _ = self._random_sample_from_arrays(
                srp, kmeans_labels, n_samples=self.n_silhouette)
            ml.metric_func_args = (srp_sample, kmeans_labels_sample)

    # Add back the column required for plotting and to correlating data
    # between re-clustering. Jitter de-overlaps coincident points.
    srp = self.rand_jitter(srp)
    embedding['cluster'] = kmeans_labels
    embedding['x'] = srp[:, 0]
    embedding['y'] = srp[:, 1]

    # Add back the prop columns
    for col in prop_series.keys():
        embedding[col] = prop_series[col]

    return embedding
# Demo script: exercise cuML KMeans fit / fit_predict on tiny 2-D inputs.

# Six 2-D points in two column-separated groups (x == 1.0 vs x == 4.0).
y = np.asarray(
    [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0],
     [4.0, 2.0], [4.0, 4.0], [4.0, 0.0]],
    dtype=np.float32)
x = np2cudf(y)

# Query points for predict() (used only by the commented-out section below).
q = np.asarray([[0, 0], [4, 4]], dtype=np.float32)
p = np2cudf(q)

# Four points used as the actual fit input.
a = np.asarray([[1.0, 1.0], [1.0, 2.0], [3.0, 2.0], [4.0, 3.0]],
               dtype=np.float32)
b = np2cudf(a)

print("input:")
print(b)

print("\nCalling fit")
# n_gpu=-1: use all available GPUs (legacy cuML parameter).
kmeans_float = KMeans(n_clusters=2, n_gpu=-1)
kmeans_float.fit(b)

print("labels:")
print(kmeans_float.labels_)
print("cluster_centers:")
print(kmeans_float.cluster_centers_)

# Commented-out (string-literal) predict demo, kept as-is.
'''
print("\nCalling Predict")
print("labels:")
print(kmeans_float.predict(p))
print("cluster_centers:")
print(kmeans_float.cluster_centers_)
'''

print("\nCalling fit_predict")
kmeans_float2 = KMeans(n_clusters=2, n_gpu=-1)
# NOTE(review): the fit_predict call on kmeans_float2 continues past
# this chunk of the file.
print("labels:")
# Tail of a loop whose header lies above this chunk: cap the number of
    # molecules read at ``max``. NOTE(review): ``max`` shadows the builtin
    # and the loop-body indent level is reconstructed — confirm against the
    # enclosing loop.
    count += 1
    if count > max:
        break

logger.info('Initializing Morgan fingerprints...')

# Compute Morgan fingerprints for all SMILES in parallel via dask.bag.
results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()
np_array_fingerprints = np.stack(results).astype(np.float32)

# take np.array shape (n_mols, nBits) for GPU DataFrame
gdf = np2cudf(np_array_fingerprints)

# prepare one set of clusters
n_clusters = 7
kmeans_float = KMeans(n_clusters=n_clusters)
kmeans_float.fit(gdf)

# UMAP: project fingerprints to 2-D for plotting.
umap = UMAP(n_neighbors=100,
            a=1.0,
            b=1.0,
            learning_rate=1.0)
Xt = umap.fit_transform(gdf)

# Attach plot coordinates and cluster labels to the GPU dataframe.
gdf.add_column('x', Xt[0].to_array())
gdf.add_column('y', Xt[1].to_array())
gdf.add_column('cluster', kmeans_float.labels_)

# start dash
v = chemvisualize.ChemVisualization(
    gdf.copy(), n_clusters, chemblID_list)