Example #1
 def generate_PCA(self, features):
     # Fit a PCA model configured by command-line flags and project `features`.
     pca = PCA(n_components=FLAGS.pca_n_components,
               svd_solver=FLAGS.pca_svd_solver,
               iterated_power=FLAGS.pca_iterated_power,
               tol=FLAGS.pca_tol,
               random_state=FLAGS.pca_random_state,
               copy=FLAGS.pca_copy,
               whiten=FLAGS.pca_whiten,
               ).fit_transform(features)
     # fit_transform() already returns a NumPy array; the original
     # `pca.as_matrix()` call (a removed pandas DataFrame method) would
     # raise AttributeError here.
     return pca
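The FLAGS object is not defined in the snippet; a minimal sketch of the assumed absl.flags setup (flag names are taken from the call above, defaults are illustrative, not the original author's values):

from absl import flags
from sklearn.decomposition import PCA

FLAGS = flags.FLAGS
flags.DEFINE_integer('pca_n_components', 50, 'Number of principal components.')
flags.DEFINE_string('pca_svd_solver', 'auto', 'SVD solver passed to PCA.')
flags.DEFINE_integer('pca_iterated_power', 7, 'Iterations for randomized SVD.')
flags.DEFINE_float('pca_tol', 0.0, 'Tolerance for singular values.')
flags.DEFINE_integer('pca_random_state', 0, 'Random seed.')
flags.DEFINE_bool('pca_copy', True, 'Copy the input data before fitting.')
flags.DEFINE_bool('pca_whiten', False, 'Whiten the components.')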
Example #2
import math

def pca(adata, n_components=50, train_ratio=0.35, n_batches=50, gpu=False):

    """
    Performs a batched PCA by training on the first `train_ratio` samples
    and transforming in `n_batches` number of batches.

    Parameters
    ----------

    adata : anndata.AnnData of shape (n_cells, n_genes)
        Annotated data object for which to perform PCA

    n_components : int
        Number of principal components to keep

    train_ratio : float
        Fraction of cells (in (0, 1]) to use for training

    n_batches : int
        Number of batches to use for transform

    gpu : bool
        Uses Scikit-Learn for CPU (gpu=False) and RAPIDS cuML for GPU
        (gpu=True)
    """

    train_size = math.ceil(adata.X.shape[0] * train_ratio)

    if gpu:
        from cuml.decomposition import PCA
        import cupy as cp
    else:
        from sklearn.decomposition import PCA
        import numpy as cp

    # Train on the leading slice only, then project the full matrix in batches.
    pca = PCA(n_components=n_components).fit(adata.X[:train_size])

    embeddings = cp.zeros((adata.X.shape[0], n_components))
    batch_size = int(embeddings.shape[0] / n_batches)
    for batch in range(n_batches):
        start_idx = batch * batch_size
        end_idx = start_idx + batch_size

        # Fold any remainder rows into the final batch.
        if adata.X.shape[0] - end_idx < batch_size:
            end_idx = adata.X.shape[0]

        embeddings[start_idx:end_idx, :] = cp.asarray(pca.transform(adata.X[start_idx:end_idx]))
        
    if gpu:
        # Copy the CuPy array back to host memory as a NumPy array.
        embeddings = embeddings.get()

    adata.obsm["X_pca"] = embeddings
    return adata
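A minimal usage sketch (the input file name is illustrative, not from the original):

import anndata

adata = anndata.read_h5ad("pbmc3k.h5ad")  # hypothetical input file
adata = pca(adata, n_components=50, train_ratio=0.35, n_batches=50, gpu=False)
print(adata.obsm["X_pca"].shape)  # (n_cells, 50)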
Example #3
 def re_cluster(self, adata_copy):
     # Re-run the clustering pipeline with RAPIDS-accelerated implementations.
     # PCA here is cuML's GPU PCA: fit_transform returns a CuPy array, and
     # .get() copies it back to host memory for scanpy.
     adata_copy.obsm["X_pca"] = PCA(
         n_components=self.n_components).fit_transform(adata_copy.X).get()
     sc.pp.neighbors(adata_copy,
                     n_neighbors=self.n_neighbors,
                     n_pcs=self.knn_n_pcs,
                     method='rapids')
     sc.tl.umap(adata_copy,
                min_dist=self.umap_min_dist,
                spread=self.umap_spread,
                method='rapids')
     sc.tl.louvain(adata_copy,
                   flavor='rapids',
                   resolution=self.louvain_resolution)
     return adata_copy
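The method relies on imports that are not shown in the snippet; presumably something like:

import scanpy as sc
from cuml.decomposition import PCA  # GPU PCA, which explains the .get() call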
Example #4
# read data (file_name and dataset_name come from the surrounding script;
# the imports below are presumably what the original used)
import pandas as pd
from datetime import datetime as dt
from cuml.decomposition import PCA

X = pd.read_csv(file_name)
X = X.apply(pd.to_numeric, errors='coerce')
# Drop any label column so only numeric features remain.
for label_col in ('class', 'target', 'variety'):
    if label_col in X:
        X = X.drop(label_col, axis=1)

X = X.values
X = X.astype("float32")
sklearn_X = X  # kept aside for a separate scikit-learn comparison run
n_components = min(X.shape)
time_init_pca = dt.now()
print(
    "CUML Running PCA with {} features, {} samples, and {} n_components on dataset {}"
    .format(X.shape[1], X.shape[0], n_components, dataset_name))
pca = PCA(n_components=n_components,
          copy=True,
          whiten=False,
          svd_solver='jacobi',  # cuML-specific solver
          tol=1.e-3,
          iterated_power=15,
          random_state=42)
time_fit_transform = dt.now()
X_transformed = pca.fit_transform(X)
# total_seconds() replaces the original .microseconds, which only returns the
# sub-second component and under-reports any run longer than one second.
print("CUML Time for fit_transform {}ms".format(
    (dt.now() - time_fit_transform).total_seconds() * 1000))
print("CUML Total time {}ms".format(
    (dt.now() - time_init_pca).total_seconds() * 1000))
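A matching CPU baseline for the sklearn_X copy, continuing the script above (a hedged sketch; the original presumably did something similar with scikit-learn):

from sklearn.decomposition import PCA as skPCA

time_init_sk = dt.now()
sk_transformed = skPCA(n_components=n_components,
                       svd_solver='auto',
                       random_state=42).fit_transform(sklearn_X)
print("SKLEARN Total time {}ms".format(
    (dt.now() - time_init_sk).total_seconds() * 1000))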
Example #5
def pca_gpu():
    # Fit cuML's GPU PCA on the normalized loan dataframe.
    # n_components and loan_norm_rdf are globals defined elsewhere in the
    # script; PCA_gpu is presumably an alias such as:
    #   from cuml.decomposition import PCA as PCA_gpu
    print("gpu pca")
    pca_loan_gpu = PCA_gpu(n_components=n_components)
    pca_loan_gpu.fit(loan_norm_rdf)
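The function fits the model but never applies it. cuML's PCA follows the scikit-learn estimator API, so a self-contained usage sketch looks like this (all data and names are illustrative, not from the original script):

from cuml.decomposition import PCA
import numpy as np

data = np.random.rand(1000, 20).astype("float32")  # illustrative input
model = PCA(n_components=5).fit(data)
projected = model.transform(data)          # project onto the fitted components
print(model.explained_variance_ratio_)     # variance explained per component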