示例#1
0
    def re_cluster(self, gdf, new_figerprints=None, new_chembl_ids=None):
        """Re-fit KMeans and UMAP on ``gdf`` and re-attach the plotting columns.

        The bookkeeping columns ``x``, ``y``, ``cluster``, ``id`` and
        ``chembl_id`` are dropped from ``gdf`` *in place* before fitting, so
        the caller's frame is modified. When both ``new_figerprints`` and
        ``new_chembl_ids`` are given, the new rows are appended before
        fitting; the appended frame is a new object.

        Parameters
        ----------
        gdf : dataframe (presumably a cuDF DataFrame -- confirm with callers)
            Fingerprint columns plus the five bookkeeping columns listed
            above.
        new_figerprints : optional
            Additional rows; used to build a frame with the same columns as
            the fingerprint data.
        new_chembl_ids : optional
            Identifiers for the additional rows, appended to the existing
            ``chembl_id`` values.

        Returns
        -------
        The frame that was fitted, with ``x``, ``y``, ``id``, ``chembl_id``
        and ``cluster`` columns added, or ``None`` when ``gdf`` has no rows.
        """
        if gdf.shape[0] == 0:
            return None

        # Keep the identifiers so they can be re-attached after fitting.
        chembl_ids = gdf['chembl_id']

        # Before re-clustering, remove all columns that may interfere.
        gdf.drop(['x', 'y', 'cluster', 'id', 'chembl_id'], inplace=True)
        if new_figerprints is not None and new_chembl_ids is not None:
            # Add the new fingerprints and ChEMBL ids before re-clustering.
            fp_df = cudf.DataFrame(new_figerprints, columns=gdf.columns)
            gdf = gdf.append(fp_df, ignore_index=True)
            chembl_ids = chembl_ids.append(
                cudf.Series(new_chembl_ids), ignore_index=True)

        kmeans_float = KMeans(n_clusters=self.n_clusters)
        kmeans_float.fit(gdf)

        Xt = self.umap.fit_transform(gdf)

        # Add back the columns required for plotting and for correlating data
        # between re-clustering runs.
        gdf.add_column('x', Xt[0].to_array())
        gdf.add_column('y', Xt[1].to_array())
        gdf.add_column('id', gdf.index)
        gdf.add_column('chembl_id', chembl_ids)
        gdf.add_column('cluster', kmeans_float.labels_.to_array())
        return gdf
示例#2
0
def kmeans_fit(X):
    """Build a KMeans estimator from ``params`` and fit it on ``X``.

    The cluster count, tolerance, iteration limit and batch size are read
    from ``params``; ``X_init`` is passed as the ``init`` argument. Both
    names come from the enclosing scope.

    Returns the fitted estimator.
    """
    estimator_options = {
        'n_clusters': params.n_clusters,
        'tol': params.tol,
        'max_iter': params.maxiter,
        'init': X_init,
        'max_samples_per_batch': params.samples_per_batch,
    }
    estimator = KMeans(**estimator_options)
    estimator.fit(X)
    return estimator
示例#3
0
def kmeans(X, k, round_values=True):
    """ Summarize a dataset with k mean samples weighted by the number of data points they
    each represent.

    Parameters
    ----------
    X : numpy.array or pandas.DataFrame or any scipy.sparse matrix
        Matrix of data samples to summarize (# samples x # features)
    k : int
        Number of means to use for approximation.
    round_values : bool
        For all i, round the ith dimension of each mean sample to match the nearest value
        from X[:,i]. This ensures discrete features always get a valid value.

    Returns
    -------
    DenseData object.

    Raises
    ------
    RuntimeError
        If ``rapids_installed`` is false.
    """
    if not rapids_installed:
        # Adjacent literals instead of a backslash continuation: the latter
        # embeds the source indentation in the message.
        raise RuntimeError(
            "cuML is required to use GPU explainers. "
            "Check https://rapids.ai/start.html "
            "for more information on how to install it.")

    import re  # local import: only the version check below needs it

    # Compare the release numerically. Comparing the raw strings orders them
    # character by character, which is not version order in general.
    version_text = str(cuml.__version__)
    release = re.match(r'(\d+)\.(\d+)', version_text)
    if release is not None:
        use_cuml_sampling = tuple(
            int(part) for part in release.groups()) >= (21, 8)
    else:
        # Unparseable version: keep the previous string comparison.
        use_cuml_sampling = version_text >= '21.08'

    if use_cuml_sampling:
        from cuml.explainer.sampling import kmeans_sampling
        summary, group_names, labels = kmeans_sampling(X,
                                                       k,
                                                       round_values,
                                                       detailed=True)

        return DenseData(summary, group_names, None, 1.0 * np.bincount(labels))

    # For backward compatibility
    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values

    # in case there are any missing values in data impute them
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imp.fit_transform(X)

    model = KMeans(n_clusters=k, random_state=0).fit(X)

    if round_values:
        for j in range(X.shape[1]):
            # Extract (and, for sparse input, densify) each column once
            # rather than once per cluster centre.
            # sparse support courtesy of @PrimozGodec
            column = X[:, j].toarray().flatten() if issparse(X) else X[:, j]
            for i in range(k):
                ind = np.argmin(np.abs(column - model.cluster_centers_[i, j]))
                model.cluster_centers_[i, j] = X[ind, j]
    return DenseData(model.cluster_centers_, group_names, None,
                     1.0 * np.bincount(model.labels_))
示例#4
0
    def _cluster(self, embedding):
        """Project ``embedding`` with ``self.srp_embedding`` and cluster it.

        The index is reset and ``self._remove_non_numerics`` returns the frame
        to cluster together with the set-aside property columns. The frame's
        values are projected, KMeans with ``self.n_clusters`` clusters is
        fitted on the projection, and the predicted labels plus the first two
        projection components (after ``self.rand_jitter``) are stored as
        ``cluster``, ``x`` and ``y``. The property columns are then restored.

        Both stages run inside a ``MetricsLogger`` context on which the
        metric name, function and arguments are set. The silhouette arguments
        are ``(None, None)`` unless ``self.context.is_benchmark`` is set, in
        which case a sample of ``self.n_silhouette`` entries is used.

        Returns the updated embedding frame.
        """
        logger.info('Computing cluster...')
        embedding = embedding.reset_index()
        n_molecules = embedding.shape[0]

        # Set aside the columns that must not take part in the clustering.
        embedding, prop_series = self._remove_non_numerics(embedding)

        with MetricsLogger('random_proj', n_molecules) as proj_metrics:
            projected = self.srp_embedding.fit_transform(embedding.values)

            proj_metrics.metric_name = 'spearman_rho'
            proj_metrics.metric_func = self._compute_spearman_rho
            proj_metrics.metric_func_args = (embedding, embedding, projected)

        with MetricsLogger('kmeans', n_molecules) as kmeans_metrics:
            model = KMeans(n_clusters=self.n_clusters)
            model.fit(projected)
            cluster_labels = model.predict(projected)

            kmeans_metrics.metric_name = 'silhouette_score'
            kmeans_metrics.metric_func = batched_silhouette_scores
            kmeans_metrics.metric_func_kwargs = {}
            kmeans_metrics.metric_func_args = (None, None)
            if self.context.is_benchmark:
                sampled_arrays, _ = self._random_sample_from_arrays(
                    projected, cluster_labels, n_samples=self.n_silhouette)
                projected_sample, labels_sample = sampled_arrays
                kmeans_metrics.metric_func_args = (projected_sample,
                                                   labels_sample)

        # Re-attach the columns needed for plotting and for correlating data
        # across re-clustering runs.
        projected = self.rand_jitter(projected)
        embedding['cluster'] = cluster_labels
        embedding['x'] = projected[:, 0]
        embedding['y'] = projected[:, 1]

        # Restore the property columns that were set aside above.
        for prop_name in prop_series.keys():
            embedding[prop_name] = prop_series[prop_name]

        return embedding
示例#5
0
def kmeans_sampling(X, k, round_values=True, detailed=False, random_state=0):
    """
    Adapted from :
    https://github.com/slundberg/shap/blob/9411b68e8057a6c6f3621765b89b24d82bee13d4/shap/utils/_legacy.py
    Summarize a dataset (X) using weighted k-means.

    Parameters
    ----------
    X : cuDF or Pandas DataFrame/Series, numpy arrays or cuda_array_interface
        compliant device array.
        Data to be summarized, shape (n_samples, n_features)
    k : int
        Number of means to use for approximation.
    round_values : bool; default=True
        For all i, round the ith dimension of each mean sample to match the
        nearest value from X[:,i]. This ensures discrete features always get
        a valid value.
    detailed: bool; default=False
        To return details of group names and cluster labels of all data points
    random_state: int; default=0
        Sets the random state.

    Returns
    -------
    summary : Summary of the data, shape (k, n_features)
    group_names : Names of the features. For DataFrame input these are the
                  frame's columns, for Series input the series name, and
                  otherwise the column positions as strings.
    labels : Cluster labels of the data points in the original dataset,
             shape (n_samples, 1)

    Only ``summary`` is returned unless ``detailed`` is true.

    Raises
    ------
    TypeError
        If ``get_supported_input_type`` does not recognise the type of ``X``.
    """
    output_dtype = get_supported_input_type(X)
    _output_dtype_str = determine_array_type(X)
    cuml.internals.set_api_output_type(_output_dtype_str)

    if output_dtype is None:
        # Adjacent literals instead of backslash continuations: the latter
        # embed the source indentation in the message.
        raise TypeError(
            f"Type of input {type(X)} is not supported. Supported "
            "dtypes: cuDF DataFrame, cuDF Series, cupy, numba, "
            "numpy, pandas DataFrame, pandas Series")

    if "DataFrame" in str(output_dtype):
        group_names = X.columns
        X = cp.array(X.values, copy=False)
    elif "Series" in str(output_dtype):
        # ``elif`` keeps DataFrame input out of the generic branch below,
        # which would replace its column names with positional names.
        group_names = X.name
        X = cp.array(X.values.reshape(-1, 1), copy=False)
    else:
        # it's either numpy, cupy or numba
        X = cp.array(X, copy=False)
        try:
            # more than one column
            group_names = [str(i) for i in range(X.shape[1])]
        except IndexError:
            # one column
            X = X.reshape(-1, 1)
            group_names = ['0']

    # in case there are any missing values in data impute them
    imp = SimpleImputer(missing_values=cp.nan,
                        strategy='mean',
                        output_type=_output_dtype_str)
    X = imp.fit_transform(X)

    kmeans = KMeans(n_clusters=k,
                    random_state=random_state,
                    output_type=_output_dtype_str).fit(X)

    if round_values:
        for j in range(X.shape[1]):
            # Extract each column once rather than once per cluster centre.
            # sparse support courtesy of @PrimozGodec
            column = X[:, j].toarray().flatten() if issparse(X) else X[:, j]
            for i in range(k):
                ind = cp.argmin(
                    cp.abs(column - kmeans.cluster_centers_[i, j]))
                kmeans.cluster_centers_[i, j] = X[ind, j]
    summary = kmeans.cluster_centers_
    labels = kmeans.labels_

    if detailed:
        return summary, group_names, labels
    return summary
示例#6
0

# Demo script: build small float32 point sets, convert them with np2cudf and
# fit KMeans on one of them, printing the labels and the cluster centres.

# Three input arrays are converted; only ``b`` is used by the statements
# visible in this part of the script (``x`` and ``p`` are not read here).
y = np.asarray(
    [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0], [4.0, 2.0], [4.0, 4.0], [4.0, 0.0]],
    dtype=np.float32)
x = np2cudf(y)
q = np.asarray([[0, 0], [4, 4]], dtype=np.float32)
p = np2cudf(q)
a = np.asarray([[1.0, 1.0], [1.0, 2.0], [3.0, 2.0], [4.0, 3.0]],
               dtype=np.float32)
b = np2cudf(a)
print("input:")
print(b)

print("\nCalling fit")
# NOTE(review): n_gpu=-1 presumably requests all available GPUs -- confirm
# against the KMeans version in use.
kmeans_float = KMeans(n_clusters=2, n_gpu=-1)
kmeans_float.fit(b)
print("labels:")
print(kmeans_float.labels_)
print("cluster_centers:")
print(kmeans_float.cluster_centers_)
# The block below is a bare string literal, so the predict demo inside it is
# disabled and does not run.
'''
print("\nCalling Predict")
print("labels:")
print(kmeans_float.predict(p))
print("cluster_centers:")
print(kmeans_float.cluster_centers_)
'''

print("\nCalling fit_predict")
# A second estimator is created for the fit_predict demo; the call that uses
# it is not visible in this part of the script.
kmeans_float2 = KMeans(n_clusters=2, n_gpu=-1)
示例#7
0
            smiles_list.append(fields[1].decode("utf-8"))
            count+=1
            if count>max:
                break

    logger.info('Initializing Morgan fingerprints...')
    results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()

    np_array_fingerprints = np.stack(results).astype(np.float32)

    # take np.array shape (n_mols, nBits) for GPU DataFrame
    gdf = np2cudf(np_array_fingerprints)

    # prepare one set of clusters
    n_clusters = 7
    kmeans_float = KMeans(n_clusters=n_clusters)
    kmeans_float.fit(gdf)
    
    # UMAP
    umap = UMAP(n_neighbors=100,
                a=1.0,
                b=1.0,
                learning_rate=1.0)
    Xt = umap.fit_transform(gdf)
    gdf.add_column('x', Xt[0].to_array())
    gdf.add_column('y', Xt[1].to_array())

    gdf.add_column('cluster', kmeans_float.labels_)

    # start dash
    v = chemvisualize.ChemVisualization(