def test_cell_lines():
    print("Testing on Cell Lines...")

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

        if os.path.exists("./result/cell_lines_cpu_z.npy"):
            Z_cpu = np.load("./result/cell_lines_cpu_z.npy")
            print("Precalculated CPU mode result is loaded.")
        else:
            start_cpu = time.time()
            Z_cpu = harmonize(X, df_metadata, 'dataset')
            end_cpu = time.time()
            print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
            np.save("./result/cell_lines_cpu_z.npy", Z_cpu)

        if os.path.exists("./result/cell_lines_gpu_z.npy"):
            Z_gpu = np.load("./result/cell_lines_gpu_z.npy")
            print("Precalculated GPU mode result is loaded.")
        else:
            start_gpu = time.time()
            Z_gpu = harmonize(X, df_metadata, 'dataset', use_gpu = True)
            end_gpu = time.time()
            print("Time spent in GPU mode = {:.2f}s.".format(end_gpu - start_gpu))
            np.save("./result/cell_lines_gpu_z.npy", Z_gpu)

        Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

        check_metrics(Z_cpu, Z_R, prefix = "cell_lines_cpu")
        check_metrics(Z_gpu, Z_R, prefix = "cell_lines_gpu")

        if os.path.exists("./result/cell_lines_result.h5ad"):
            adata = None
        else:
            n_obs = X.shape[0]
            adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
            adata.obsm['X_pca'] = X
            pg.neighbors(adata, rep = 'pca')
            pg.umap(adata)

        umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
        if len(umap_list) < 4:
            plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix = "cell_lines", batch_key = 'dataset')
def test_mantonbm():
    print("Testing on MantonBM...")

    z_files = [f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)]
    if len(z_files) < 3:
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        adata.obs['Individual'] = pd.Categorical(adata.obs['Channel'].apply(lambda s: s.split('_')[0][-1]))

        if os.path.exists("./result/MantonBM_cpu_z.npy"):
            Z_cpu = np.load("./result/MantonBM_cpu_z.npy")
            print("Precalculated CPU mode result is loaded.")
        else:
            start_cpu = time.time()
            Z_cpu = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel')
            end_cpu = time.time()
            print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
            np.save("./result/MantonBM_cpu_z.npy", Z_cpu)

        if os.path.exists("./result/MantonBM_gpu_z.npy"):
            Z_gpu = np.load("./result/MantonBM_gpu_z.npy")
            print("Precalculated GPU mode result is loaded.")
        else:
            start_gpu = time.time()
            Z_gpu = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel', use_gpu = True)
            end_gpu = time.time()
            print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
            np.save("./result/MantonBM_gpu_z.npy", Z_gpu)

        Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

        check_metrics(Z_cpu, Z_R, prefix = "MantonBM_cpu")
        check_metrics(Z_gpu, Z_R, prefix = "MantonBM_gpu")

        if os.path.exists("./result/MantonBM_result.h5ad"):
            adata = None

        umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)]
        if len(umap_list) < 4:
            plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix = "MantonBM", batch_key = 'Individual')
def transform(self, ds: loompy.LoomConnection, normalizer: Normalizer, cells: np.ndarray = None) -> np.ndarray:
    if cells is None:
        cells = np.arange(ds.shape[1])

    transformed = np.zeros((cells.shape[0], self.pca.n_components_))
    j = 0

    # Support out-of-order datasets
    key = None
    if "Accession" in ds.row_attrs:
        key = "Accession"

    layer = self.layer if self.layer is not None else ""
    for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[layer], key=key):
        vals = normalizer.transform(view.layers[layer][:, :], selection)
        n_cells_in_batch = selection.shape[0]
        transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
        j += n_cells_in_batch

    if self.test_significance:
        # Must select significant components only once, and reuse for future transformations
        if self.sigs is None:
            pvalue_KS = np.zeros(transformed.shape[1])  # p-value of each component
            for i in range(1, transformed.shape[1]):
                (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
            self.sigs = np.where(pvalue_KS < 0.1)[0]
            if len(self.sigs) == 0:
                self.sigs = (0, 1)
        transformed = transformed[:, self.sigs]

    if self.batch_keys is not None and len(self.batch_keys) > 0:
        keys_df = pd.DataFrame.from_dict({k: ds.ca[k] for k in self.batch_keys})
        transformed = harmonize(transformed, keys_df, batch_key=self.batch_keys)

    return transformed
def pre_step1(adata):
    adata.var['highly_variable'] = meanCVfit(adata)
    adata.raw = adata
    sc.pp.scale(adata, max_value=10)       # scale
    sc.tl.pca(adata, svd_solver='arpack')  # run PCA

    Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='Batch')
    adata.obsm['X_harmony'] = Z

    # need these b/c will re-run kNN in UMAP 2D space
    sc.pp.neighbors(adata, n_neighbors=25, use_rep='X_harmony')
    sc.tl.umap(adata)

    return adata
def test_mantonbm():
    print("Testing on MantonBM dataset...")

    z_files = [f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/MantonBM_result.h5ad"):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        adata.obs['Individual'] = pd.Categorical(adata.obs['Channel'].apply(lambda s: s.split('_')[0][-1]))

        if os.path.exists("./result/MantonBM_torch_z.npy"):
            Z_torch = np.load("./result/MantonBM_torch_z.npy")
            print("Precalculated embedding by harmony-pytorch is loaded.")
        else:
            start_torch = time.time()
            Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = 'Channel')
            end_torch = time.time()
            print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
            np.save("./result/MantonBM_torch_z.npy", Z_torch)

        if os.path.exists("./result/MantonBM_py_z.npy"):
            Z_py = np.load("./result/MantonBM_py_z.npy")
            print("Precalculated embedding by harmonypy is loaded.")
        else:
            start_py = time.time()
            ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
            end_py = time.time()
            print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
            Z_py = np.transpose(ho.Z_corr)
            np.save("./result/MantonBM_py_z.npy", Z_py)

        Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

        check_metric(Z_torch, Z_py, Z_R, prefix = "MantonBM", norm = 'r')
        check_metric(Z_torch, Z_py, Z_R, prefix = "MantonBM", norm = 'L2')

        if os.path.exists("./result/MantonBM_result.h5ad"):
            adata = None

        umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)]
        if len(umap_list) < 4:
            plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "MantonBM", batch_key = "Individual")
def test_pbmc():
    print("Testing on 10x pbmc dataset...")

    z_files = [f for f in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/pbmc_result.h5ad"):
        adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")

        if os.path.exists("./result/pbmc_torch_z.npy"):
            Z_torch = np.load("./result/pbmc_torch_z.npy")
            print("Precalculated embedding by harmony-pytorch is loaded.")
        else:
            start_torch = time.time()
            Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = 'Channel')
            end_torch = time.time()
            print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
            np.save("./result/pbmc_torch_z.npy", Z_torch)

        if os.path.exists("./result/pbmc_py_z.npy"):
            Z_py = np.load("./result/pbmc_py_z.npy")
            print("Precalculated embedding by harmonypy is loaded.")
        else:
            start_py = time.time()
            ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
            end_py = time.time()
            print(ho.objective_harmony)
            print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
            Z_py = np.transpose(ho.Z_corr)
            np.save("./result/pbmc_py_z.npy", Z_py)

        Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")

        check_metric(Z_torch, Z_py, Z_R, prefix = "pbmc", norm = 'r')
        check_metric(Z_torch, Z_py, Z_R, prefix = "pbmc", norm = 'L2')

        if os.path.exists("./result/pbmc_result.h5ad"):
            adata = None

        umap_list = [f for f in os.listdir("./plots") if re.match("pbmc.*.pdf", f)]
        if len(umap_list) < 4:
            plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "pbmc", batch_key = "Channel")
def init_recording(self):
    data = self.recorder.toggle()
    if data:
        print(data)
        wave_gen, filename, duration_midi = data

        # Treat very short notes (< 0.12 s) as rests by zeroing their pitch
        for i in range(len(duration_midi)):
            if duration_midi[i][0] < 0.12:
                duration_midi[i] = (duration_midi[i][0], 0)

        # Harmonize the recorded melody into voices of (duration, pitch) tuples
        duration_midi = harmony.harmonize(duration_midi)
        self.live_wave = wave_gen
        print([[i[1] for i in j] for j in duration_midi])

        # Convert note durations from seconds to ticks (480 ticks per beat)
        tempo = 120
        multiplier = 1 / 60 * tempo * 480
        converted_midi_duration = [[(i * multiplier, j) for i, j in k] for k in duration_midi]

        # One NoteSequencer per voice
        for i in converted_midi_duration:
            self.seq.append(NoteSequencer(self.sched, self.synth, 1, (0, 0), i, True))
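# A minimal, self-contained sketch of the seconds-to-ticks conversion performed above,
# assuming 480 ticks per beat as the multiplier in init_recording implies.
# The duration_midi value here is hypothetical, just to show the arithmetic.
tempo = 120                                   # beats per minute
ticks_per_beat = 480
multiplier = tempo / 60 * ticks_per_beat      # 960 ticks per second at 120 BPM

duration_midi = [[(0.5, 60), (0.25, 64)]]     # one voice of (seconds, MIDI pitch) pairs
converted = [[(sec * multiplier, pitch) for sec, pitch in voice] for voice in duration_midi]
print(converted)                              # [[(480.0, 60), (240.0, 64)]]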
def test_cell_lines():
    print("Testing on cell lines dataset...")

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")
        source_loaded = True

        if os.path.exists("./result/cell_lines_torch_z.npy"):
            Z_torch = np.load("./result/cell_lines_torch_z.npy")
            print("Precalculated embedding by harmony-pytorch is loaded.")
        else:
            start_torch = time.time()
            Z_torch = harmonize(X, df_metadata, batch_key = 'dataset')
            end_torch = time.time()
            print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
            np.save("./result/cell_lines_torch_z.npy", Z_torch)

        if os.path.exists("./result/cell_lines_py_z.npy"):
            Z_py = np.load("./result/cell_lines_py_z.npy")
            print("Precalculated embedding by harmonypy is loaded.")
        else:
            start_py = time.time()
            ho = run_harmony(X, df_metadata, ['dataset'])
            end_py = time.time()
            print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
            print(ho.objective_harmony)
            Z_py = np.transpose(ho.Z_corr)
            np.save("./result/cell_lines_py_z.npy", Z_py)

        Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

        check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'r')
        check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'L2')

        if os.path.exists("./result/cell_lines_result.h5ad"):
            adata = None
        else:
            n_obs = X.shape[0]
            adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
            adata.obsm['X_pca'] = X
            pg.neighbors(adata, rep = 'pca')
            pg.umap(adata)

        umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
        if len(umap_list) < 4:
            plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "cell_lines", batch_key = "dataset")

    if os.path.exists("./result/cell_lines_result.h5ad"):
        adata = pg.read_input("./result/cell_lines_result.h5ad", h5ad_mode = 'r')

        stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = 'harmony')
        print("kBET for Harmony: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(stat = stat, pval = pvalue, ac_rate = ac_rate))

        stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = 'py')
        print("kBET for harmonypy: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(stat = stat, pval = pvalue, ac_rate = ac_rate))

        stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = 'torch')
        print("kBET for harmony-pytorch: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(stat = stat, pval = pvalue, ac_rate = ac_rate))
def run_harmony(
    data: MultimodalData,
    rep: str = 'pca',
    n_jobs: int = -1,
    n_clusters: int = None,
    random_state: int = 0,
) -> str:
    """Batch correction on PCs using Harmony.

    This is a wrapper of `harmony-pytorch <https://github.com/lilab-bcb/harmony-pytorch>`_ package, which is a Pytorch implementation of Harmony algorithm [Korsunsky19]_.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``.
        Which representation to use as input of Harmony, default is PCA.

    n_jobs: ``int``, optional, default: ``-1``.
        Number of threads to use for the KMeans clustering used in Harmony. ``-1`` refers to using all available threads.

    n_clusters: ``int``, optional, default: ``None``.
        Number of Harmony clusters. Default is ``None``, which asks Harmony to estimate this number from the data.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Harmony algorithm. This keyword is ``rep + '_harmony'``, where ``rep`` is the input parameter above.

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_rep]``: The embedding calculated by Harmony algorithm.

    Examples
    --------
    >>> pg.run_harmony(data, rep = "pca", n_jobs = 10, random_state = 25)
    """
    if not is_categorical_dtype(data.obs['Channel']):
        data.obs['Channel'] = pd.Categorical(data.obs['Channel'])
    if data.obs['Channel'].cat.categories.size == 1:
        logger.warning("Warning: data only contains 1 channel. Cannot apply Harmony!")
        return rep

    try:
        from harmony import harmonize
    except ImportError as e:
        print(f"ERROR: {e}")
        print("ERROR: Need Harmony! Try 'pip install harmony-pytorch'.")
        import sys
        sys.exit(-1)

    logger.info("Start integration using Harmony.")
    out_rep = rep + '_harmony'
    data.obsm['X_' + out_rep] = harmonize(
        X_from_rep(data, rep),
        data.obs,
        'Channel',
        n_clusters = n_clusters,
        n_jobs_kmeans = n_jobs,
        random_state = random_state,
    )
    return out_rep
#6. PCA
sc.pp.regress_out(adata, ['n_counts', 'perc_others'])
sc.tl.pca(adata, svd_solver='arpack')

#7. copy PCA adata for harmony
ad_pca = adata.copy()

#8. calculate neighbors
sc.pp.neighbors(adata, n_neighbors=nn, n_pcs=npc)

#9. embed - umap
sc.tl.umap(adata, n_components=2, random_state=42)
adata.write(f'{fd_out}/concat_merged.h5ad')

#-----------------------------harmony-----------------------------------
#1. rename ad_pca
adata = ad_pca.copy()

#2. harmony
Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='sample')
adata.obsm['X_harmony'] = Z

#3. calculate neighbors
sc.pp.neighbors(adata, n_neighbors=nn, n_pcs=npc, use_rep='X_harmony')

#4. embed - umap
sc.tl.umap(adata, n_components=2, random_state=42)

#5. save
adata.write(f'{fd_out}/harmony_merged.h5ad')
def run_Harmony(adata, batch_key='orig.ident'):
    return harmonize(adata.obsm['X_pca'], adata.obs, batch_key=batch_key)
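# A hedged usage sketch for the wrapper above, following the X_harmony pattern used in the
# other Scanpy-based snippets in this section. The adata object, its 'orig.ident' column,
# and a precomputed adata.obsm['X_pca'] are assumed to exist.
import scanpy as sc

adata.obsm['X_harmony'] = run_Harmony(adata, batch_key='orig.ident')
sc.pp.neighbors(adata, use_rep='X_harmony')  # rebuild the kNN graph on the corrected embedding
sc.tl.umap(adata)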
def subcluster_iteration(adata_in, min_cells=10, nhvgs=2000, npcs=20, n_neighbors=50,
                         min_dist=1.0, spread=2.0, resolution=1.,
                         umap_genestoplot=['CD14'], pc_genestoplot=['CD14'],
                         other_plot=['DPIc', 'louvain'], random_state=14,
                         harmony=False, harmony_key='frz_status',
                         regress_out_keys=None, n_jobs_regress=1,
                         harmony_theta=2, scale=True):
    '''Assumes input data is already log TP10K normalized'''
    _adata = adata_in.copy()

    sc.pp.filter_genes(_adata, min_cells=min_cells)
    sc.pp.highly_variable_genes(_adata, n_top_genes=nhvgs)
    _adata = _adata[:, _adata.var['highly_variable']]

    if regress_out_keys is not None:
        _adata = _adata.copy()
        sc.pp.regress_out(_adata, regress_out_keys, n_jobs=n_jobs_regress, copy=False)

    if scale:
        sc.pp.scale(_adata, max_value=10)

    sc.tl.pca(_adata, svd_solver='arpack', random_state=14)
    sc.pl.pca(_adata, components=['1,2', '3,4', '5,6', '7,8'], color=pc_genestoplot, ncols=4, use_raw=True)
    sc.pl.pca_loadings(_adata, components=[1, 2, 3, 4, 5])
    sc.pl.pca_variance_ratio(_adata, log=True)

    if harmony:
        Z = harmonize(_adata.obsm['X_pca'], _adata.obs, batch_key=harmony_key,
                      random_state=random_state, theta=harmony_theta)
        _adata.obsm['X_harmony'] = Z
        sc.pp.neighbors(_adata, n_neighbors=n_neighbors, n_pcs=npcs,
                        random_state=random_state, use_rep='X_harmony')
    else:
        sc.pp.neighbors(_adata, n_neighbors=n_neighbors, n_pcs=npcs, random_state=random_state)

    sc.tl.umap(_adata, min_dist=min_dist, spread=spread, random_state=random_state)

    np.random.seed(random_state)
    sc.tl.leiden(_adata, resolution=resolution, random_state=random_state)

    fig = sc.pl.umap(_adata, color=umap_genestoplot, use_raw=True)
    fig = sc.pl.umap(_adata, color=other_plot)

    sc.tl.rank_genes_groups(_adata, 'leiden', method='wilcoxon')
    display(pd.DataFrame(_adata.uns['rank_genes_groups']['names']).head(20))

    return _adata
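# An illustrative call only (not from the source): re-cluster one Leiden cluster with Harmony
# correction over the 'frz_status' batch column. The adata object and its 'leiden' labels are
# hypothetical; the input is assumed to be log TP10K normalized, as the docstring requires.
adata_sub = subcluster_iteration(
    adata[adata.obs['leiden'] == '3'].copy(),
    harmony=True,
    harmony_key='frz_status',
    harmony_theta=2,
    resolution=0.8,
)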
def run_harmony(
    data: Union[MultimodalData, UnimodalData],
    batch: str = "Channel",
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    n_clusters: int = None,
    random_state: int = 0,
    use_gpu: bool = False,
    max_iter_harmony: int = 10,
) -> str:
    """Batch correction on PCs using Harmony.

    This is a wrapper of `harmony-pytorch <https://github.com/lilab-bcb/harmony-pytorch>`_ package, which is a Pytorch implementation of Harmony algorithm [Korsunsky19]_.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    batch: ``str``, optional, default: ``"Channel"``.
        Which attribute in data.obs field represents batches, default is "Channel".

    rep: ``str``, optional, default: ``"pca"``.
        Which representation to use as input of Harmony, default is PCA.

    n_comps: ``int``, optional, default: ``None``.
        Number of components to be used in the ``rep``. If ``n_comps == None``, use all components; otherwise, use the minimum of ``n_comps`` and ``rep``'s dimensions.

    n_jobs: ``int``, optional, default: ``-1``.
        Number of threads to use in Harmony. ``-1`` refers to using all physical CPU cores.

    n_clusters: ``int``, optional, default: ``None``.
        Number of Harmony clusters. Default is ``None``, which asks Harmony to estimate this number from the data.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator.

    use_gpu: ``bool``, optional, default: ``False``.
        If ``True``, use GPU if available. Otherwise, use CPU only.

    max_iter_harmony: ``int``, optional, default: ``10``.
        Maximum iterations on running Harmony if not converged.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Harmony algorithm. This keyword is ``rep + '_harmony'``, where ``rep`` is the input parameter above.

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_rep]``: The embedding calculated by Harmony algorithm.

    Examples
    --------
    >>> pg.run_harmony(data, rep = "pca", n_jobs = 10, random_state = 25)
    """
    if not check_batch_key(data, batch, "Cannot apply Harmony!"):
        return rep

    try:
        from harmony import harmonize
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed Harmony! Try 'pip install harmony-pytorch'.")
        sys.exit(-1)

    logger.info("Start integration using Harmony.")
    out_rep = rep + "_harmony"
    data.obsm["X_" + out_rep] = harmonize(
        X_from_rep(data, rep, n_comps),
        data.obs,
        batch,
        n_clusters = n_clusters,
        n_jobs = n_jobs,
        random_state = random_state,
        use_gpu = use_gpu,
        max_iter_harmony = max_iter_harmony,
    )
    return out_rep
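# A short usage sketch (not from the source): point downstream steps at the returned representation.
# pg.neighbors(..., rep=...) appears in the test snippets above; passing rep to pg.umap the same way
# is an assumption here, and the input file name is hypothetical.
import pegasus as pg

data = pg.read_input("example_data.h5ad")
out_rep = pg.run_harmony(data, rep = "pca", n_jobs = 8, random_state = 0)

pg.neighbors(data, rep = out_rep)  # kNN graph on the Harmony-corrected embedding
pg.umap(data, rep = out_rep)       # assumed: umap accepts a rep argument like neighbors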