from sklearn.decomposition import PCA


def generate_PCA(self, features):
    # Project `features` onto the leading principal components; every PCA
    # hyperparameter is read from the command-line FLAGS object.
    transformed = PCA(n_components=FLAGS.pca_n_components,
                      svd_solver=FLAGS.pca_svd_solver,
                      iterated_power=FLAGS.pca_iterated_power,
                      tol=FLAGS.pca_tol,
                      random_state=FLAGS.pca_random_state,
                      copy=FLAGS.pca_copy,
                      whiten=FLAGS.pca_whiten,
                      ).fit_transform(features)
    # fit_transform already returns a NumPy array; the original call to
    # .as_matrix() was a bug (that method belonged to pandas DataFrames
    # and has since been removed).
    return transformed
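# A minimal sketch of the flag definitions generate_PCA above relies on,
# assuming FLAGS comes from absl.flags (the original does not show where
# FLAGS is defined; the defaults below are illustrative only).
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer('pca_n_components', 50, 'Number of components to keep.')
flags.DEFINE_string('pca_svd_solver', 'auto', 'SVD solver passed to PCA.')
flags.DEFINE_integer('pca_iterated_power', 7, 'Iterations for the randomized solver.')
flags.DEFINE_float('pca_tol', 0.0, 'Singular-value tolerance (arpack solver).')
flags.DEFINE_integer('pca_random_state', 42, 'Seed for reproducible results.')
flags.DEFINE_boolean('pca_copy', True, 'Fit on a copy of the input data.')
flags.DEFINE_boolean('pca_whiten', False, 'Whiten the transformed output.')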
import math


def pca(adata, n_components=50, train_ratio=0.35, n_batches=50, gpu=False):
    """
    Performs a batched PCA by training on the first `train_ratio` fraction
    of samples and transforming the full matrix in `n_batches` batches.

    Parameters
    ----------
    adata : anndata.AnnData of shape (n_cells, n_genes)
        Annotated data object for which to perform PCA
    n_components : int
        Number of principal components to keep
    train_ratio : float
        Fraction of cells to use for training
    n_batches : int
        Number of batches to use for the transform
    gpu : bool
        Uses Scikit-Learn for CPU (gpu=False) and RAPIDS cuML for GPU
        (gpu=True)
    """
    train_size = math.ceil(adata.X.shape[0] * train_ratio)

    # Select the backend: `cp` is NumPy on CPU and CuPy on GPU, so the
    # array code below is identical for both paths.
    if gpu:
        from cuml.decomposition import PCA
        import cupy as cp
    else:
        from sklearn.decomposition import PCA
        import numpy as cp

    # Fit only on the leading `train_size` cells, then transform all cells.
    pca = PCA(n_components=n_components).fit(adata.X[:train_size])

    embeddings = cp.zeros((adata.X.shape[0], n_components))
    batch_size = int(embeddings.shape[0] / n_batches)
    for batch in range(n_batches):
        start_idx = batch * batch_size
        end_idx = start_idx + batch_size
        # Fold any remainder rows into the final batch.
        if adata.X.shape[0] - end_idx < batch_size:
            end_idx = adata.X.shape[0]
        embeddings[start_idx:end_idx, :] = cp.asarray(
            pca.transform(adata.X[start_idx:end_idx]))

    # Move GPU results back to host memory before storing them on adata.
    if gpu:
        embeddings = embeddings.get()

    adata.obsm["X_pca"] = embeddings
    return adata
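# A minimal usage sketch for the batched pca() helper above, assuming an
# AnnData object with a dense .X matrix. The synthetic data here is
# illustrative only, not from the original.
import anndata
import numpy as np

adata = anndata.AnnData(np.random.rand(10000, 2000).astype(np.float32))
adata = pca(adata, n_components=50, train_ratio=0.35, n_batches=50, gpu=False)
print(adata.obsm["X_pca"].shape)  # (10000, 50)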
def re_cluster(self, adata_copy):
    #### rerun clusterings
    # cuML's PCA returns a device array, so .get() copies the embedding
    # back to host memory before it is stored on the AnnData object.
    adata_copy.obsm["X_pca"] = PCA(
        n_components=self.n_components).fit_transform(adata_copy.X).get()
    # Rebuild the kNN graph, UMAP layout, and Louvain clusters, all via
    # the RAPIDS-accelerated scanpy backends.
    sc.pp.neighbors(adata_copy, n_neighbors=self.n_neighbors,
                    n_pcs=self.knn_n_pcs, method='rapids')
    sc.tl.umap(adata_copy, min_dist=self.umap_min_dist,
               spread=self.umap_spread, method='rapids')
    sc.tl.louvain(adata_copy, flavor='rapids',
                  resolution=self.louvain_resolution)
    return adata_copy
import pandas as pd
from datetime import datetime as dt
from cuml import PCA

# read data (file_name and dataset_name are defined by the surrounding script)
X = pd.read_csv(file_name)
X = X.apply(pd.to_numeric, errors='coerce')
# Drop any label column so PCA sees only the features.
if 'class' in X:
    X = X.drop('class', axis=1)
if 'target' in X:
    X = X.drop('target', axis=1)
if 'variety' in X:
    X = X.drop('variety', axis=1)
X = X.values
X = X.astype("float32")
sklearn_X = X
n_components = min(X.shape)

time_init_pca = dt.now()
print(
    "CUML Running PCA with {} features, {} samples, and {} n_components on dataset {}"
    .format(X.shape[1], X.shape[0], n_components, dataset_name))
pca = PCA(n_components=n_components, copy=True, whiten=False,
          svd_solver='jacobi', tol=1.e-3, iterated_power=15, random_state=42)

time_fit_transform = dt.now()
X_transformed = pca.fit_transform(X)
# total_seconds() is correct for runs longer than one second; the original
# .microseconds field reported only the sub-second remainder of the delta.
print("CUML Time for transform {}ms".format(
    (dt.now() - time_fit_transform).total_seconds() * 1000))
print("CUML Total time {}ms".format(
    (dt.now() - time_init_pca).total_seconds() * 1000))
def pca_gpu():
    print("gpu pca")
    # PCA_gpu, n_components, and loan_norm_rdf are module-level names
    # defined elsewhere in the script.
    pca_loan_gpu = PCA_gpu(n_components=n_components)
    pca_loan_gpu.fit(loan_norm_rdf)
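# A minimal setup sketch for pca_gpu() above, assuming PCA_gpu is cuML's
# PCA imported under an alias and loan_norm_rdf is a normalized cuDF
# DataFrame. All names and data here are illustrative assumptions, not
# from the original.
import numpy as np
import cudf
from cuml.decomposition import PCA as PCA_gpu

n_components = 10
# 100 rows x 20 feature columns as a stand-in for the normalized loan data.
loan_norm_rdf = cudf.DataFrame(
    {f"f{i}": np.random.rand(100).astype("float32") for i in range(20)})

pca_gpu()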