def test_aggregate_10x_matrices(self):
    """Aggregate two 10x matrices and verify the stacked result.

    Both inputs are read individually, aggregated through the sample sheet,
    and the aggregated file is then compared block by block with the inputs:
    cell count, feature count, matrix values and barcode prefixes.
    """
    v3_input = pg.read_input(
        "tests/pegasus-test-data/input/heart_1k_v3/filtered_feature_bc_matrix.h5",
        genome="mm10",
    )
    v2_input = pg.read_input(
        "tests/pegasus-test-data/input/heart_1k_v2/filtered_gene_bc_matrices_h5.h5",
        genome="mm10",
    )
    pg.aggregate_matrices(
        "tests/pegasus-test-data/input/aggregate_test.csv",
        what_to_return='aggregate_test',
    )
    combined = pg.read_input("aggregate_test.h5sc", genome="mm10")

    self.assertEqual(
        v3_input.shape[0] + v2_input.shape[0],
        combined.shape[0],
        "Cell dimension is incorrect",
    )
    self.assertEqual(
        v3_input.shape[1], combined.shape[1], "Feature dimension is incorrect"
    )

    # The first block of rows is compared with the v3 input, the rest with v2.
    v3_rows = list(range(v3_input.shape[0]))
    v2_rows = list(range(v3_input.shape[0], v3_input.shape[0] + v2_input.shape[0]))
    v3_block = combined[v3_rows, :]
    v2_block = combined[v2_rows, :]

    self.assertEqual((v3_block.X != v3_input.X).sum(), 0, "Values differ")
    self.assertEqual((v2_block.X != v2_input.X).sum(), 0, "Values differ")

    first_v3_barcode = v3_block.obs.index.values[0]
    self.assertTrue(first_v3_barcode.startswith("heart_1k_v3"), "Prefix not added")
    first_v2_barcode = v2_block.obs.index.values[0]
    self.assertTrue(first_v2_barcode.startswith("heart_1k_v2"), "Prefix not added")
def test_read_write_h5ad(self):
    """Write a dataset read from an mtx directory to h5ad and read it back."""
    source_dir = "tests/pegasus-test-data/input/hgmm_1k_v3_filtered_feature_bc_matrix/"
    original = pg.read_input(source_dir)

    pg.write_output(original, "test.h5ad")
    reloaded = pg.read_input("test.h5ad")

    # The round trip must leave the data unchanged.
    assert_adata_equal(self, original, reloaded)
def test_read_write_old_5ad_backed_whitelist(self):
    """Rewrite only ``obs`` of an h5ad opened with ``h5ad_mode="r+"``.

    The fixture is copied into the working directory first, because the
    write targets the same file name that was opened.
    """
    working_copy = "test_obsm_compound.h5ad"
    shutil.copy(
        "tests/pegasus-test-data/input/test_obsm_compound.h5ad",
        working_copy,
    )

    backed = pg.read_input(working_copy, h5ad_mode="r+")
    pg.write_output(backed, working_copy, whitelist=["obs"])

    reloaded = pg.read_input(working_copy)
    assert_adata_equal(self, backed, reloaded)
def test_output(self):
    """Check the h5ad and loom exports against ``self.data`` and look for the log."""
    expected_shape = self.data.shape

    h5ad_result = pg.read_input("tests/result.mm10-rna.h5ad")
    self.assertEqual(
        expected_shape, h5ad_result.shape, "H5AD format's shape is inconsistent!"
    )

    loom_result = pg.read_input("tests/result.mm10-rna.loom")
    self.assertEqual(
        expected_shape, loom_result.shape, "Loom format's shape is inconsistent!"
    )

    files_in_tests = os.listdir('tests')
    self.assertIn('result.log', files_in_tests, 'Clustering log is lost!')
def test_csv(self):
    """Read a small CSV with and without chunking and compare the results."""
    frame = pd.DataFrame(
        index=["a", "b", "c"],
        data={"a": [1, 2, 3], "b": [4, 5, 6]},
    )
    frame.to_csv("test.csv")

    # The loaded matrix is transposed before being compared with the frame.
    adata = pg.read_input("test.csv", genome="test").T
    np.testing.assert_array_equal(frame.values, adata.X.toarray())
    np.testing.assert_array_equal(frame.index.values, adata.obs.index.values)
    np.testing.assert_array_equal(frame.columns.values, adata.var.index.values)

    # Chunk sizes 1..4 cover smaller than, equal to, and larger than the row count.
    for chunk_size in range(1, 5):
        chunked = pg.read_input(
            "test.csv", genome="test", chunk_size=chunk_size
        ).T
        assert_adata_equal(self, adata, chunked)
def test_write_mtx(self):
    """Write a dataset as mtx plus obs/var CSV files and rebuild it from them."""
    original = pg.read_input(
        "tests/pegasus-test-data/input/heart_1k_v3/filtered_feature_bc_matrix.h5"
    )
    # Extra columns, so the obs/var side files carry something to restore.
    original.var['test'] = 1.0
    original.obs['test'] = 1.0

    output_dir = 'test_mtx/mm10'
    matrix_path = os.path.join(output_dir, 'matrix.mtx.gz')
    pg.write_output(original, matrix_path)

    rebuilt = pg.read_input(output_dir)

    # Drop the Channel column produced by the reader; obs columns are taken
    # from the written csv instead.
    del rebuilt.obs['Channel']
    obs_table = pd.read_csv(os.path.join(output_dir, 'obs.csv.gz'), index_col=0)
    rebuilt.obs = rebuilt.obs.join(obs_table)

    var_table = pd.read_csv(os.path.join(output_dir, 'var.csv.gz'), index_col=0)
    rebuilt.var = rebuilt.var.join(var_table)
    del rebuilt.var['featuretype']

    assert_adata_equal(self, original, rebuilt, obs_blacklist=['Channel'])
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None, **keywords):
    """
    Read a data file and optionally join extra obs/var annotations onto it.

    Paths ending in ``.txt`` (case-insensitive) are parsed as a delimited
    table with pandas (delimiter sniffed, first column used as the index) and
    wrapped in an ``anndata.AnnData``. Every other path is handed to
    ``pg.read_input`` together with ``**keywords``.

    Parameters
    ----------

    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame. A list or tuple of these is
        also accepted; each one is joined onto ``adata.obs`` in order.
    var: {str, pd.DataFrame}
        Path to var data file or a data frame. A list or tuple of these is
        also accepted; each one is joined onto ``adata.var`` in order.
    obs_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids.
        Passed unchanged to ``filter_adata``.
    var_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in var, or a list of ids.
        Passed unchanged to ``filter_adata``.
    **keywords
        Forwarded to ``pg.read_input`` (ignored for ``.txt`` input).

    Returns
    -------
    Annotated data matrix, as returned by ``filter_adata``.
    """
    if str(path).lower().endswith('.txt'):
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        adata = anndata.AnnData(
            X=df.values,
            obs=pd.DataFrame(index=df.index),
            var=pd.DataFrame(index=df.columns),
        )
    else:
        adata = pg.read_input(path, **keywords)

    def get_df(meta):
        """Return ``meta`` as a data frame, reading it from disk or gs:// if it is a path."""
        if isinstance(meta, pd.DataFrame):
            return meta
        tmp_path = None
        try:
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            # Metadata tables must carry an 'id' column, which becomes the index.
            return pd.read_csv(meta, sep=None, index_col='id', engine='python')
        finally:
            # Remove the downloaded copy even when parsing fails.
            if tmp_path is not None:
                os.remove(tmp_path)

    def as_items(value):
        """Wrap a single path/data frame in a list; pass lists and tuples through."""
        return value if isinstance(value, (list, tuple)) else [value]

    if obs is not None:
        for item in as_items(obs):
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        for item in as_items(var):
            adata.var = adata.var.join(get_df(item))

    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
def test_demux(self):
    """Check the demultiplexing output: shape, obs columns, plots and the zarr file."""
    demuxed = pg.read_input("tests/cb_cc_demux.zarr.zip")
    self.assertEqual(demuxed.shape, (737280, 33694), "Demux data shape differs!")

    obs_columns = demuxed.obs.columns
    self.assertIn('demux_type', obs_columns, "Demux type is lost!")
    self.assertIn('assignment', obs_columns, "Cell assignment is lost!")

    # Four diagnostic PDFs are expected next to the results.
    diagnostic_plots = glob.glob("tests/cb_cc.*.pdf")
    self.assertEqual(len(diagnostic_plots), 4, "Demux diagnosis plots are missing!")

    files_in_tests = os.listdir('tests')
    self.assertIn(
        'cb_cc.out.demuxEM.zarr.zip',
        files_in_tests,
        "Demultiplexed RNA matrix is lost!",
    )
def test_citeseq(self):
    """Check both modalities of the CITE-Seq output and their shapes."""
    merged = pg.read_input("tests/cb_cc_citeseq.zarr.zip")

    expected_modalities = {'GRCh38-citeseq', 'GRCh38-rna'}
    self.assertSetEqual(
        set(merged.list_data()), expected_modalities, "Some modality is missing!"
    )

    obs_columns = merged.obs.columns
    self.assertIn('demux_type', obs_columns, "Demux type is lost!")
    self.assertIn('assignment', obs_columns, "Cell assignment is lost!")
    self.assertEqual(merged.shape, (737280, 33694), "RNA data shape differs!")

    # Switch to the antibody modality and check its own shape.
    merged.select_data('GRCh38-citeseq')
    self.assertEqual(merged.shape, (578353, 31), "CITE-Seq data shape differs!")
def test_mantonbm():
    """Compare harmony-pytorch, harmonypy and a reference embedding on MantonBM.

    Embeddings cached under ./result are reused; missing ones are computed,
    timed and saved. The reference embedding is read from
    ./result/MantonBM_harmony_z.txt. UMAP plots are drawn only when fewer
    than four matching PDFs exist under ./plots.
    """
    print("Testing on MantonBM dataset...")

    result_h5ad = "./result/MantonBM_result.h5ad"
    torch_npy = "./result/MantonBM_torch_z.npy"
    py_npy = "./result/MantonBM_py_z.npy"

    cached_embeddings = [
        name for name in os.listdir("./result")
        if re.match("MantonBM.*_z.(txt|npy)", name)
    ]
    # The raw data is only loaded when something still has to be computed from it.
    if len(cached_embeddings) < 3 or not os.path.exists(result_h5ad):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        # Last character of the part of 'Channel' before the first underscore.
        individual = adata.obs['Channel'].apply(lambda s: s.split('_')[0][-1])
        adata.obs['Individual'] = pd.Categorical(individual)

    if not os.path.exists(torch_npy):
        began = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = 'Channel')
        finished = time.time()
        print("Time spent for harmony-pytorch = {:.2f}s.".format(finished - began))
        np.save(torch_npy, Z_torch)
    else:
        Z_torch = np.load(torch_npy)
        print("Precalculated embedding by harmony-pytorch is loaded.")

    if not os.path.exists(py_npy):
        began = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        finished = time.time()
        print("Time spent for harmonypy = {:.2f}s.".format(finished - began))
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_npy, Z_py)
    else:
        Z_py = np.load(py_npy)
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

    for norm in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix = "MantonBM", norm = norm)

    # When a result file already exists, no data object is passed to the plots.
    if os.path.exists(result_h5ad):
        adata = None

    existing_plots = [
        name for name in os.listdir("./plots")
        if re.match("MantonBM.*.pdf", name)
    ]
    if len(existing_plots) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "MantonBM", batch_key = "Individual")
def test_clustering(self):
    """Check the clustered CITE-Seq result: modalities, obs, embeddings, cell counts."""
    result = pg.read_input("tests/citeseq_result.zarr.zip")

    expected_modalities = {'GRCh38-citeseq', 'GRCh38-rna'}
    self.assertSetEqual(
        set(result.list_data()), expected_modalities, "Some modality is missing!"
    )

    # Cell count of the modality selected right after loading.
    n_rna_cells = result.shape[0]

    self.assertNotIn('demux_type', result.obs.columns, "Demux type is not removed!")
    n_assignments = result.obs['assignment'].cat.categories.size
    self.assertEqual(n_assignments, 7, "Not all cells are demultiplexed singlets!")

    self.assertIn('X_citeseq', result.obsm.keys(), "CITE-Seq coordinates are lost!")
    self.assertEqual(
        result.obsm['X_citeseq_umap'].shape[1],
        result.obsm['X_umap'].shape[1],
        "Some of UMAP embeddings is lost!",
    )

    result.select_data('GRCh38-citeseq')
    n_citeseq_cells = result.shape[0]
    self.assertEqual(
        n_rna_cells,
        n_citeseq_cells,
        "Two modalities have inconsistent number of cells!",
    )
def test_pbmc():
    """Compare harmony-pytorch, harmonypy and a reference embedding on 10x pbmc.

    Embeddings cached under ./result are reused; missing ones are computed,
    timed and saved. The reference embedding is read from
    ./result/pbmc_harmony_z.txt. UMAP plots are drawn only when fewer than
    four matching PDFs exist under ./plots.
    """
    print("Testing on 10x pbmc dataset...")

    result_h5ad = "./result/pbmc_result.h5ad"
    torch_npy = "./result/pbmc_torch_z.npy"
    py_npy = "./result/pbmc_py_z.npy"

    cached_embeddings = [
        name for name in os.listdir("./result")
        if re.match("pbmc.*_z.(txt|npy)", name)
    ]
    # The raw data is only loaded when something still has to be computed from it.
    if len(cached_embeddings) < 3 or not os.path.exists(result_h5ad):
        adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")

    if not os.path.exists(torch_npy):
        began = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = 'Channel')
        finished = time.time()
        print("Time spent for harmony-pytorch = {:.2f}s.".format(finished - began))
        np.save(torch_npy, Z_torch)
    else:
        Z_torch = np.load(torch_npy)
        print("Precalculated embedding by harmony-pytorch is loaded.")

    if not os.path.exists(py_npy):
        began = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        finished = time.time()
        print(ho.objective_harmony)
        print("Time spent for harmonypy = {:.2f}s.".format(finished - began))
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_npy, Z_py)
    else:
        Z_py = np.load(py_npy)
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")

    for norm in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix = "pbmc", norm = norm)

    # When a result file already exists, no data object is passed to the plots.
    if os.path.exists(result_h5ad):
        adata = None

    existing_plots = [
        name for name in os.listdir("./plots")
        if re.match("pbmc.*.pdf", name)
    ]
    if len(existing_plots) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "pbmc", batch_key = "Channel")
def test_mantonbm():
    """Compare CPU-mode and GPU-mode harmonize results with a reference on MantonBM.

    Embeddings cached under ./result are reused; missing ones are computed,
    timed and saved. The reference embedding is read from
    ./result/MantonBM_harmony_z.txt. UMAP plots are drawn only when fewer
    than four matching PDFs exist under ./plots.
    """
    print("Testing on MantonBM...")

    result_h5ad = "./result/MantonBM_result.h5ad"
    cpu_npy = "./result/MantonBM_cpu_z.npy"
    gpu_npy = "./result/MantonBM_gpu_z.npy"

    z_files = [f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)]
    # Load the raw data when an embedding is missing OR when no result file
    # exists. Without the second condition `adata` stayed unbound whenever all
    # embeddings were cached but the result file was absent, and the
    # plot_umap call below raised UnboundLocalError.
    if len(z_files) < 3 or not os.path.exists(result_h5ad):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        # Last character of the part of 'Channel' before the first underscore.
        adata.obs['Individual'] = pd.Categorical(
            adata.obs['Channel'].apply(lambda s: s.split('_')[0][-1])
        )

    if os.path.exists(cpu_npy):
        Z_cpu = np.load(cpu_npy)
        print("Precalculated CPU mode result is loaded.")
    else:
        start_cpu = time.time()
        Z_cpu = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel')
        end_cpu = time.time()
        print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
        np.save(cpu_npy, Z_cpu)

    if os.path.exists(gpu_npy):
        Z_gpu = np.load(gpu_npy)
        print("Precalculated GPU mode result is loaded.")
    else:
        start_gpu = time.time()
        Z_gpu = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel', use_gpu = True)
        end_gpu = time.time()
        print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
        np.save(gpu_npy, Z_gpu)

    Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

    check_metrics(Z_cpu, Z_R, prefix = "MantonBM_cpu")
    check_metrics(Z_gpu, Z_R, prefix = "MantonBM_gpu")

    # When a result file already exists, no data object is passed to the plots.
    if os.path.exists(result_h5ad):
        adata = None

    umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix = "MantonBM", batch_key = 'Individual')
def test_cell_lines():
    """Compare harmony-pytorch, harmonypy and a reference embedding on cell lines.

    Embeddings cached under ./result are reused; missing ones are computed,
    timed and saved. The reference embedding is read from
    ./result/cell_lines_harmony_z.txt. UMAP plots are drawn only when fewer
    than four matching PDFs exist under ./plots. If the result h5ad exists at
    the end, kBET statistics are printed for the three representations.
    """
    print("Testing on cell lines dataset...")

    result_h5ad = "./result/cell_lines_result.h5ad"
    torch_npy = "./result/cell_lines_torch_z.npy"
    py_npy = "./result/cell_lines_py_z.npy"

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    # The PCA matrix and metadata are only loaded when something still has to
    # be computed from them.
    if len(z_files) < 3 or not os.path.exists(result_h5ad):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    if os.path.exists(torch_npy):
        Z_torch = np.load(torch_npy)
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(X, df_metadata, batch_key = 'dataset')
        end_torch = time.time()
        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save(torch_npy, Z_torch)

    if os.path.exists(py_npy):
        Z_py = np.load(py_npy)
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(X, df_metadata, ['dataset'])
        end_py = time.time()
        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
        print(ho.objective_harmony)
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_npy, Z_py)

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'r')
    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'L2')

    if os.path.exists(result_h5ad):
        adata = None
    else:
        # Placeholder expression matrix with two columns; only obs and the
        # PCA embedding are filled in.
        n_obs = X.shape[0]
        adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
        adata.obsm['X_pca'] = X
        pg.neighbors(adata, rep = 'pca')
        pg.umap(adata)

    umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "cell_lines", batch_key = "dataset")

    if os.path.exists(result_h5ad):
        adata = pg.read_input(result_h5ad, h5ad_mode = 'r')

        # (label printed, representation key) in the original reporting order.
        methods = (
            ("Harmony", 'harmony'),
            ("harmonypy", 'py'),
            ("harmony-pytorch", 'torch'),
        )
        for label, rep in methods:
            stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = rep)
            print(
                "kBET for {label}: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(
                    label = label, stat = stat, pval = pvalue, ac_rate = ac_rate
                )
            )
def test_mtx_v3_dir(self):
    """Read the hgmm_1k v3 mtx directory and check its first dimension."""
    mtx_dir = "tests/pegasus-test-data/input/hgmm_1k_v3_filtered_feature_bc_matrix/"
    loaded = pg.read_input(mtx_dir)
    n_rows = loaded.shape[0]
    self.assertEqual(n_rows, 1046)
def test_mtx_v2_dir(self):
    """Read the hgmm_1k v2 mtx directory (hg19) and check its first dimension."""
    mtx_dir = "tests/pegasus-test-data/input/hgmm_1k_filtered_gene_bc_matrices/hg19/"
    loaded = pg.read_input(mtx_dir)
    n_rows = loaded.shape[0]
    self.assertEqual(n_rows, 504)
def test_read_write_old_5ad(self):
    """Read the test_obsm_compound.h5ad fixture, write it out, and read it back."""
    fixture = "tests/pegasus-test-data/input/test_obsm_compound.h5ad"
    original = pg.read_input(fixture)

    pg.write_output(original, "test.h5ad")
    reloaded = pg.read_input("test.h5ad")

    # The round trip must leave the data unchanged.
    assert_adata_equal(self, original, reloaded)
if __name__ == "__main__":
    import pandas as pd
    import pegasus as pg
    import argparse

    # Three positional arguments: input h5ad, harmony CSV, output path.
    parser = argparse.ArgumentParser(
        description='Update the X_pca with the results of harmony')
    for positional in ('h5ad_filename', 'harmony_csv', 'output'):
        parser.add_argument(positional, type=str)
    args = vars(parser.parse_args())

    # Transpose the CSV values and drop the first row of the transposed
    # matrix (the id of the pc).
    harmony_table = pd.read_csv(args["harmony_csv"])
    pca = harmony_table.values.T[1:]

    # Overwrite the stored PCA embedding and save under the new name.
    adata = pg.read_input(args["h5ad_filename"])
    adata.obsm["X_pca"] = pca
    pg.write_output(adata, args["output"])
def _expression_frame(data):
    """Return ``data.X`` as a sparse DataFrame labelled with obs/var index values."""
    frame = pd.DataFrame.sparse.from_spmatrix(data.X)
    frame.index = data.obs.index.tolist()
    frame.columns = data.var.index.tolist()
    return frame


def main():
    """Aggregate, QC, normalise and cluster the samples, with and without Harmony.

    Steps, all written under the ``args.output`` prefix:

    1. run ``pegasus aggregate_matrix`` on ``args.input_csv``;
    2. compute QC metrics, save the filter statistics and three QC violin plots;
    3. filter, select robust genes, log-normalise, and pickle the normalised
       matrix (plus the antibody matrix when ``args.citeseq`` is set);
    4. baseline clustering/UMAP without batch correction;
    5. Harmony-corrected clustering/UMAP, written to zarr, CSV and a scanpy plot.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pegasus aggregate_matrix`` command exits with a non-zero status.
    """
    import subprocess

    args = my_args()
    out = args.output

    # Argument list instead of a shell string: no shell interpretation of the
    # paths, and a failed aggregation stops the run here rather than at the
    # read_input call below.
    subprocess.run(
        ["pegasus", "aggregate_matrix", str(args.input_csv), str(out)],
        check=True,
    )

    zarr_file = "%s.zarr.zip" % (out)
    data = pg.read_input(zarr_file)

    if args.citeseq:
        data.select_data("%s-rna" % (data.uns['genome']))

    pg.qc_metrics(data,
                  percent_mito=args.MT_percent,
                  mito_prefix=args.MT_prefix,
                  max_genes=args.max_genes)
    df_qc = pg.get_filter_stats(data)
    df_qc.to_csv("%s_qc_get_filter_stats.csv" % (out))

    pg.qcviolin(data, plot_type='gene')
    plt.savefig("%s_qcviolin_gene.pdf" % (out), bbox_inches='tight')
    pg.qcviolin(data, plot_type='count')
    plt.savefig("%s_qcviolin_UMI_count.pdf" % (out), bbox_inches='tight')
    pg.qcviolin(data, plot_type='mito')
    plt.savefig("%s_qcviolin_UMI_mito.pdf" % (out), bbox_inches='tight')

    # filtering
    pg.filter_data(data)
    pg.identify_robust_genes(data, percent_cells=0.05)
    pg.log_norm(data)
    print(data.obs['Channel'].value_counts())

    # save log norm data, rna
    _expression_frame(data).to_pickle("%s.rna.log_norm.pkl" % (out))

    if args.citeseq:
        data.select_data("%s-citeseq" % (data.uns['genome']))
        _expression_frame(data).to_pickle("%s.antibody.log_norm.pkl" % (out))
        data.select_data("%s-rna" % (data.uns['genome']))

    # without batch correction
    data_baseline = data.copy()
    pg.highly_variable_features(data_baseline, consider_batch=False, n_top=4000)
    pg.hvfplot(data_baseline)
    plt.savefig("%s_hvfplot_noBC.pdf" % (out), bbox_inches='tight')

    pg.pca(data_baseline, n_components=200)
    pg.neighbors(data_baseline, K=200)
    pg.louvain(data_baseline, resolution=2)
    pg.umap(data_baseline, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_baseline, attrs=['louvain_labels', 'Channel'], basis='umap')
    plt.savefig("%s_without_BC.pdf" % (out), bbox_inches='tight')

    # with batch correction
    pg.highly_variable_features(data, consider_batch=True, n_top=4000)
    pg.hvfplot(data)
    # Own file name: this plot used to be saved as "_hvfplot_noBC.pdf" and
    # overwrote the uncorrected plot written above.
    plt.savefig("%s_hvfplot_BC.pdf" % (out), bbox_inches='tight')

    data_harmony = data.copy()
    pg.pca(data_harmony, n_components=200)
    harmony_key = pg.run_harmony(data_harmony)
    pg.neighbors(data_harmony, rep=harmony_key, K=200)
    pg.louvain(data_harmony, rep=harmony_key, resolution=2)
    pg.umap(data_harmony, rep=harmony_key, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'], basis='umap')
    plt.savefig("%s_Harmony_BC.pdf" % (out), bbox_inches='tight')
    pg.write_output(data_harmony, "%s_harmony.zarr" % (out))

    # NOTE(review): the citeseq modality is selected here regardless of
    # args.citeseq, and the UMAP export further down reads obs/obsm while that
    # modality is still selected - confirm both are intended.
    ddf = _expression_frame(data_harmony)
    data_harmony.select_data("%s-citeseq" % (data_harmony.uns['genome']))
    ddf2 = _expression_frame(data_harmony)

    df_all = pd.concat([ddf, ddf2], axis=1)
    df_all = df_all.sparse.to_dense()
    df_all = df_all.round(3)
    df_all.to_csv("%s.Harmony_correction.data.csv" % (out))

    ### original harmony UMAP data
    # Kept in its own name so that `out` stays the output prefix throughout.
    umap_df = data_harmony.obs.copy()
    umap_df['UMAP1'] = data_harmony.obsm['X_umap'][:, 0]
    umap_df['UMAP2'] = data_harmony.obsm['X_umap'][:, 1]

    from anndata import AnnData
    ann = AnnData(X=umap_df[['UMAP1', 'UMAP2']],
                  obs=umap_df[['Channel', 'louvain_labels']])

    import scanpy as sc
    sc.pl.scatter(ann,
                  x="UMAP1",
                  y="UMAP2",
                  color='louvain_labels',
                  legend_loc='on data',
                  legend_fontsize=12,
                  legend_fontoutline=2,
                  frameon=False,
                  title='clustering of cells')
    plt.savefig("%s_Scapy_UMAP.png" % (out), bbox_inches='tight')
    umap_df.to_csv("%s_Harmony_UMAP.csv" % (out))
import os
import pegasus as pg
# Script skeleton: loads one dataset at import time; the processing logic is
# not filled in yet.
#RNA_markers = snakemake.input.RNA
#integrated_markers = snakemake.input.integrated
####################
# GLOBAL VARIABLES #
####################
# NOTE(review): get_args is neither defined nor imported in the visible code -
# confirm where it comes from.
args = get_args()
# The input file name is hard-coded; `args` is not used in the visible code.
adata = pg.read_input("MantonBM_nonmix_subset.h5sc")
#directory = snakemake.params.out_dir
############
# FUNCTION #
############
###############
# MAIN #
###############
# NOTE(review): the original line break is lost here. As written, main() only
# calls itself (unbounded recursion if invoked); if the intent was an empty
# main() followed by a module-level main() call, the function needs a body -
# confirm.
def main(): main()
def __init__(self, *args, **kwargs):
    """Initialise the test case and load the two result files the tests read."""
    super().__init__(*args, **kwargs)
    # Loaded once per test-case instance, in this order.
    aggr_path = "tests/aggr.zarr.zip"
    result_path = "tests/result.zarr.zip"
    self.aggr_data = pg.read_input(aggr_path)
    self.data = pg.read_input(result_path)