def get_human_22_fake_genome():
    """Build a MockGenome for human chromosome 22 from bundled sample data.

    Reads the gzipped msgpack gene and transcript tables shipped under
    ``mbf_align/`` in the sample data, moves their index back into a regular
    column via ``reset_index()`` and wraps both in a ``MockGenome`` whose only
    chromosome is ``"22"`` with a length of 50,818,468.

    Returns:
        The ``MockGenome`` built from the two tables.
    """
    from mbf_genomics.testing import MockGenome
    import gzip

    def _read_table(relative_name):
        # Open the archive in a ``with`` block so the underlying file handle
        # is closed as soon as the table is parsed instead of lingering until
        # garbage collection.
        # NOTE: ``pd.read_msgpack`` was deprecated in pandas 0.25 and removed
        # in 1.0, so this helper needs an older pandas to run.
        with gzip.GzipFile(get_sample_data(Path(relative_name))) as handle:
            return pd.read_msgpack(handle).reset_index()

    genes = _read_table("mbf_align/hs_22_genes.msgpack.gz")
    tr = _read_table("mbf_align/hs_22_transcripts.msgpack.gz")
    return MockGenome(
        df_genes=genes, df_transcripts=tr, chr_lengths={"22": 50_818_468}
    )
def get_human_22_fake_genome():
    """Build a MockGenome for human chromosome 22 using ``chr22`` naming.

    Reads the gzipped msgpack gene and transcript tables located through
    ``mbf_sampledata.get_sample_path``, moves their index back into a regular
    column via ``reset_index()``, overwrites the ``chr`` column of both tables
    with ``"chr22"`` and wraps them in a ``MockGenome`` whose only chromosome
    is ``"chr22"`` with a length of 50,818,468.

    Returns:
        The ``MockGenome`` built from the two tables.
    """
    import gzip

    def _read_table(relative_name):
        # Open the archive in a ``with`` block so the underlying file handle
        # is closed as soon as the table is parsed instead of lingering until
        # garbage collection.
        # NOTE: ``pd.read_msgpack`` was deprecated in pandas 0.25 and removed
        # in 1.0, so this helper needs an older pandas to run.
        sample_path = mbf_sampledata.get_sample_path(relative_name)
        with gzip.GzipFile(sample_path) as handle:
            return pd.read_msgpack(handle).reset_index()

    genes = _read_table("mbf_align/hs_22_genes.msgpack.gz")
    tr = _read_table("mbf_align/hs_22_transcripts.msgpack.gz")
    # Use the "chr"-prefixed chromosome name consistently in both tables and
    # in the chromosome length mapping below.
    genes["chr"] = "chr22"
    tr["chr"] = "chr22"
    return MockGenome(
        df_genes=genes, df_transcripts=tr, chr_lengths={"chr22": 50_818_468}
    )
def test_simple(self):
    """Check that the Description annotator adds a ``description`` column.

    A three-gene mock genome is created together with a gene metadata table
    indexed by ``gene_stable_id``; after annotation the descriptions given in
    that table are expected to appear, ordered by ``gene_stable_id``.
    """
    gene_table = pd.DataFrame(
        {
            "stable_id": ["a", "b", "c"],
            "chr": "1",
            "tss": [0, 100, 1000],
            "tes": [10, 101, 1010],
        }
    )
    meta_table = pd.DataFrame(
        {
            "gene_stable_id": ["a", "b", "c"],
            "description": ["hello", "world", "!"],
        }
    ).set_index("gene_stable_id")
    mock_genome = MockGenome(gene_table, df_genes_meta=meta_table)

    gene_set = genes.Genes(mock_genome)
    gene_set += genes.annotators.Description()
    force_load(gene_set.annotate())
    ppg.run_pipegraph()

    assert "description" in gene_set.df.columns
    ordered = gene_set.df.sort_values("gene_stable_id")
    expected_descriptions = ["hello", "world", "!"]
    assert (ordered["description"] == expected_descriptions).all()
def test_simple(self, tmpdir):
    """Check that the FromFile annotator merges a column from a TSV file.

    A table with ids ``a``, ``b`` and ``d`` is dumped to ``dump.tsv`` inside
    ``tmpdir`` and attached to a three-gene mock genome (``a``, ``b``, ``c``)
    by matching ``index_vals`` against ``gene_stable_id``. Rows 0 and 1 are
    expected to receive the file's values and row 2 the ``fill_value`` of -1
    (presumably because gene ``c`` has no entry in the file).
    """
    gene_table = pd.DataFrame(
        {
            "stable_id": ["a", "b", "c"],
            "chr": "1",
            "tss": [0, 100, 1000],
            "tes": [10, 101, 1010],
        }
    )
    meta_table = pd.DataFrame(
        {
            "gene_stable_id": ["a", "b", "c"],
            "description": ["hello", "world", "!"],
        }
    ).set_index("gene_stable_id")
    mock_genome = MockGenome(gene_table, df_genes_meta=meta_table)
    gene_set = genes.Genes(mock_genome)

    # Write the table that the annotator will read back from disk.
    extra_ids = ["a", "b", "d"]
    extra_table = pd.DataFrame(
        {"testcol": [1, 2, 3], "index_vals": ["a", "b", "d"]},
        index=extra_ids,
    )
    dump_file = Path(tmpdir) / "dump.tsv"
    extra_table.to_csv(dump_file, sep="\t", index=False)

    gene_set += genes.annotators.FromFile(
        dump_file,
        columns_to_add=["testcol"],
        index_column_table="index_vals",
        index_column_genes="gene_stable_id",
        fill_value=-1,
    )
    force_load(gene_set.annotate())
    ppg.run_pipegraph()

    print(gene_set.df.index)
    print(gene_set.df)
    assert "testcol" in gene_set.df.columns
    # Row label -> expected value; the last row falls back to fill_value.
    for row_label, expected_value in ((0, 1), (1, 2), (2, -1)):
        assert gene_set.df.loc[row_label]["testcol"] == expected_value
    assert len(gene_set.df) == 3