示例#1
0
    def re_cluster(self, gdf, new_figerprints=None, new_chembl_ids=None):
        """Re-run k-means and UMAP on ``gdf`` and restore its helper columns.

        The helper columns ``x``, ``y``, ``cluster``, ``id`` and
        ``chembl_id`` are dropped so that only the remaining columns are fed
        to the algorithms; all five are added back from the fresh results.

        Args:
            gdf: frame holding the data plus the five helper columns listed
                above (presumably a cudf DataFrame of fingerprint bits --
                confirm against the caller). The drop is requested in place,
                so the object passed in is modified as well.
            new_figerprints: optional extra rows to append before
                re-clustering; they are wrapped in a ``cudf.DataFrame`` that
                uses the remaining column names of ``gdf``.
            new_chembl_ids: ids belonging to ``new_figerprints``. Extra rows
                are appended only when both arguments are given.

        Returns:
            ``None`` when ``gdf`` has no rows, otherwise the frame with
            ``x``, ``y``, ``id``, ``chembl_id`` and ``cluster`` re-populated
            (``x``/``y`` come from the first two UMAP output columns,
            ``cluster`` from the k-means labels, ``id`` from the index).
        """
        if gdf.shape[0] == 0:
            return None

        # Keep the ChEMBL ids aside: the column is dropped below and is
        # re-attached once clustering is done. 'id' is rebuilt from the
        # index afterwards, so it does not need to be saved.
        chembl_ids = gdf['chembl_id']

        # Before reclustering remove all columns that may interfere
        gdf.drop(['x', 'y', 'cluster', 'id', 'chembl_id'], inplace=True)
        if new_figerprints is not None and new_chembl_ids is not None:
            # Add new fingerprints and ChEMBL ids before reclustering
            fp_df = cudf.DataFrame(new_figerprints, columns=gdf.columns)
            gdf = gdf.append(fp_df, ignore_index=True)
            chembl_ids = chembl_ids.append(
                cudf.Series(new_chembl_ids), ignore_index=True)

        kmeans_float = KMeans(n_clusters=self.n_clusters)
        kmeans_float.fit(gdf)

        Xt = self.umap.fit_transform(gdf)

        # Add back the columns required for plotting and for correlating
        # data between re-clusterings
        gdf.add_column('x', Xt[0].to_array())
        gdf.add_column('y', Xt[1].to_array())
        gdf.add_column('id', gdf.index)
        gdf.add_column('chembl_id', chembl_ids)
        gdf.add_column('cluster', kmeans_float.labels_.to_array())
        return gdf
示例#2
0
def kmeans_fit(X):
    """Fit a KMeans estimator on ``X`` and return the fitted estimator.

    The hyper-parameters are read from the surrounding ``params`` object and
    the initial centroids from ``X_init``; both are expected to be defined
    outside this function.
    """
    settings = dict(
        n_clusters=params.n_clusters,
        tol=params.tol,
        max_iter=params.maxiter,
        init=X_init,
        max_samples_per_batch=params.samples_per_batch,
    )
    estimator = KMeans(**settings)
    estimator.fit(X)
    return estimator
示例#3
0
    def _cluster(self, embedding):
        """Project ``embedding``, cluster the projection and attach results.

        The index is reset and the columns split off by
        ``self._remove_non_numerics`` are set aside. The remaining values are
        transformed with ``self.srp_embedding`` and clustered with k-means
        (``self.n_clusters`` clusters). Both steps run inside a
        ``MetricsLogger`` context that is told which metric function to use.

        Returns:
            The embedding with ``cluster``, ``x`` and ``y`` columns added
            (``x``/``y`` are the first two jittered projection columns),
            followed by the property columns that were set aside.
        """
        logger.info('Computing cluster...')
        embedding = embedding.reset_index()
        molecule_count = embedding.shape[0]

        # Set the property columns aside; they are re-attached at the end.
        embedding, prop_series = self._remove_non_numerics(embedding)

        with MetricsLogger('random_proj', molecule_count) as proj_log:
            projected = self.srp_embedding.fit_transform(embedding.values)

            proj_log.metric_name = 'spearman_rho'
            proj_log.metric_func = self._compute_spearman_rho
            proj_log.metric_func_args = (embedding, embedding, projected)

        with MetricsLogger('kmeans', molecule_count) as kmeans_log:
            model = KMeans(n_clusters=self.n_clusters)
            model.fit(projected)
            labels = model.predict(projected)

            kmeans_log.metric_name = 'silhouette_score'
            kmeans_log.metric_func = batched_silhouette_scores
            kmeans_log.metric_func_kwargs = {}
            # Placeholder arguments; real samples are only drawn when
            # benchmarking.
            kmeans_log.metric_func_args = (None, None)
            if self.context.is_benchmark:
                sampled, _ = self._random_sample_from_arrays(
                    projected, labels, n_samples=self.n_silhouette)
                projected_sample, labels_sample = sampled
                kmeans_log.metric_func_args = (
                    projected_sample, labels_sample)

        # Restore the columns needed for plotting and for correlating data
        # between re-clusterings.
        jittered = self.rand_jitter(projected)
        embedding['cluster'] = labels
        embedding['x'] = jittered[:, 0]
        embedding['y'] = jittered[:, 1]

        # Re-attach the property columns that were set aside.
        for col in prop_series.keys():
            embedding[col] = prop_series[col]

        return embedding
示例#4
0
# Small KMeans demo: build a few float32 point sets, copy them into GPU
# frames with np2cudf, fit a 2-cluster model and print the results.

# Six 2-D points; the GPU copy `x` is not referenced again in the visible
# part of this script.
y = np.asarray(
    [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0], [4.0, 2.0], [4.0, 4.0], [4.0, 0.0]],
    dtype=np.float32)
x = np2cudf(y)
# Two query points; `p` is only referenced inside the disabled predict
# section further down.
q = np.asarray([[0, 0], [4, 4]], dtype=np.float32)
p = np2cudf(q)
# Four 2-D points; `b` is the input actually fitted below.
a = np.asarray([[1.0, 1.0], [1.0, 2.0], [3.0, 2.0], [4.0, 3.0]],
               dtype=np.float32)
b = np2cudf(a)
print("input:")
print(b)

print("\nCalling fit")
# NOTE(review): n_gpu=-1 presumably requests all available GPUs -- confirm
# against the installed cuML version.
kmeans_float = KMeans(n_clusters=2, n_gpu=-1)
kmeans_float.fit(b)
print("labels:")
print(kmeans_float.labels_)
print("cluster_centers:")
print(kmeans_float.cluster_centers_)
# The predict demo below is disabled by wrapping it in a bare string
# literal, so none of it runs.
'''
print("\nCalling Predict")
print("labels:")
print(kmeans_float.predict(p))
print("cluster_centers:")
print(kmeans_float.cluster_centers_)
'''

print("\nCalling fit_predict")
# A second, independent estimator with the same settings as the first.
kmeans_float2 = KMeans(n_clusters=2, n_gpu=-1)
print("labels:")
示例#5
0
            count+=1
            if count>max:
                break

    logger.info('Initializing Morgan fingerprints...')
    results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()

    np_array_fingerprints = np.stack(results).astype(np.float32)

    # take np.array shape (n_mols, nBits) for GPU DataFrame
    gdf = np2cudf(np_array_fingerprints)

    # prepare one set of clusters
    n_clusters = 7
    kmeans_float = KMeans(n_clusters=n_clusters)
    kmeans_float.fit(gdf)
    
    # UMAP
    umap = UMAP(n_neighbors=100,
                a=1.0,
                b=1.0,
                learning_rate=1.0)
    Xt = umap.fit_transform(gdf)
    gdf.add_column('x', Xt[0].to_array())
    gdf.add_column('y', Xt[1].to_array())

    gdf.add_column('cluster', kmeans_float.labels_)

    # start dash
    v = chemvisualize.ChemVisualization(
        gdf.copy(), n_clusters, chemblID_list)