def test_align_and_extract_umis(new_pipegraph):
    """Check the XC/XM tags written by the AnnotateFastqBarcodes post-processor.

    The same scenario is run once per sample folder (plain and ``_gz``); each
    run starts a fresh pipegraph and inspects the first read fetched from the
    post-processed BAM.
    """
    from mbf_align.post_process import AnnotateFastqBarcodes

    sample_folders = [
        get_sample_path(Path("mbf_align/sample_extract_barcodes")),
        get_sample_path(Path("mbf_align/sample_extract_barcodes_gz")),
    ]
    for sample_folder in sample_folders:
        new_pipegraph.new_pipegraph()
        genome = get_human_22_fake_genome()
        mbf_qualitycontrol.prune_qc(lambda _: False)

        raw_sample = Sample(
            "test",
            str(sample_folder),
            False,
            pairing="only_second",
            vid="AA123",
        )
        aligned = AlignedSample(
            "test", str(sample_folder / "test.bam"), genome, False, "AA123"
        )
        # Tag name -> [start, stop] pair handed to the post-processor.
        tag_slices = {"XC": [0, 4], "XM": [7, 7 + 4]}
        annotated = aligned.post_process(
            AnnotateFastqBarcodes(raw_sample, tag_slices)
        )
        ppg.run_pipegraph()

        bam = annotated.get_bam()
        first_read = next(bam.fetch())
        print(first_read.tags)
        assert first_read.get_tag("XC") == "AGTC"
        assert first_read.get_tag("XM") == "TGAC"
def test_quick_run(self, new_pipegraph, per_test_store):
    """Run PeakZilla on the bundled input/background BAMs.

    Verifies the number of rows in the resulting regions and that the vids of
    both samples show up in the result's ``vid``.
    """
    from mbf_sampledata import get_sample_path, get_human_22_fake_genome
    from mbf_align.lanes import AlignedSample
    import mbf_qualitycontrol

    new_pipegraph.quiet = False
    mbf_qualitycontrol.disable_qc()

    input_bam = get_sample_path("mbf_externals/input.bam")
    background_bam = get_sample_path("mbf_externals/background.bam")
    genome = get_human_22_fake_genome()
    input_lane = AlignedSample(
        "input", input_bam, genome, is_paired=False, vid="AA000"
    )
    background_lane = AlignedSample(
        "background", background_bam, genome, is_paired=False, vid="AA001"
    )

    caller_options = {"-c": "1.01", "-s": "0.1"}
    peaks = PeakZilla().call_peaks(input_lane, background_lane, caller_options)
    peaks.write()
    ppg.util.global_pipegraph.run()

    assert len(peaks.df) == 37
    for expected_vid in ("AA000", "AA001"):
        assert expected_vid in peaks.vid
def test_subtract_subset(self, new_pipegraph):
    """Subtract a subset BAM from the full ChIP-seq BAM and count what is left.

    ``subtract_bam`` is called with (output, full BAM, subset BAM); the
    per-reference totals from the output's index statistics must sum to
    80495.
    """
    from mbf_sampledata import get_sample_path
    from mbf_bam import subtract_bam

    full_bam = get_sample_path("mbf_align/chipseq_chr22.bam")
    subset_bam = get_sample_path(
        "mbf_align/chipseq_chr22_subset_plus_unmapped.bam"
    )
    output = "output.bam"
    print(full_bam, full_bam.exists())
    print(subset_bam, subset_bam.exists())

    subtract_bam(
        str(output), str(full_bam.absolute()), str(subset_bam.absolute())
    )

    expected_total = 80495
    # Use the handle as a context manager so the BAM is closed even when the
    # statistics lookup raises.
    with pysam.Samfile(output) as result:
        total = sum(stat.total for stat in result.get_index_statistics())
    assert expected_total == total
def test_extended_minus_background(self, new_pipegraph):
    """Using a lane as its own background must give an all-zero signal.

    Also checks that the lane's load job is among the dependencies reported by
    the smoothing strategy.
    """
    genome = get_human_22_fake_genome()
    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )

    region_start = 41842000
    region_width = 1000
    region_df = pd.DataFrame(
        {
            "chr": ["chr22"],
            "start": [region_start],
            "stop": [region_start + region_width],
        }
    )

    extend = 10
    strategy = smooth.SmoothExtendedReadsMinusBackground(
        {lane1.name: lane1}, extend
    )
    calculated = strategy.calc(region_df, lane1)

    expected = np.zeros((1, region_width))
    assert (expected == calculated).all()
    assert lane1.load() in strategy.get_dependencies(lane1)
def test_volcano_plot(self):
    """Exactly one un-pruned QC job remains for a filtered t-test comparison.

    A predicate matching job ids containing "volcano" is handed to
    ``prune_qc``; the first output file of the remaining job is compared
    against the reference image.
    """
    ppg.util.global_pipegraph.quiet = False
    import mbf_sampledata

    counts_path = mbf_sampledata.get_sample_path(
        "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
    )
    counts = pd.read_csv(counts_path, sep=" ")
    counts.columns = [str(name) for name in counts.columns]

    groups = {
        "treated": [c for c in counts.columns if c.startswith("treated")],
        "untreated": [c for c in counts.columns if c.startswith("untreated")],
    }
    ddf = DelayedDataFrame("pasilla", counts)
    comparison = Comparisons(ddf, groups).a_vs_b(
        "treated", "untreated", TTest()
    )
    comparison.filter([("log2FC", "|>=", 2.0), ("FDR", "<=", 0.05)])

    prune_qc(lambda job: "volcano" in job.job_id)
    run_pipegraph()

    remaining_jobs = [job for job in get_qc_jobs() if not job._pruned]
    print(remaining_jobs)
    assert len(remaining_jobs) == 1
    assert_image_equal(remaining_jobs[0].filenames[0])
def test_by_annotator(self, new_pipegraph_no_qc):
    """``order.ByAnnotator`` orders regions by the values an annotator yields.

    A stub annotator returning 1, 3, 2 for the three regions must produce the
    order [0, 2, 1].
    """
    genome = get_human_22_fake_genome()
    start = 17750239
    region_df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": start + offset,
                "stop": start + offset + 1000,
            }
            for offset in (0, 20000, 30000)
        ]
    )
    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    lanes = {lane1.name: lane1}
    # Three regions x four bins, all zero.
    raw_data = {lane1.name: np.zeros((3, 4), dtype=int)}
    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: region_df, [], genome
    )

    class FakeAnno(mbf_genomics.annotator.Annotator):
        """Annotator stub that yields a fixed value per region."""

        columns = ["colA"]

        def calc(self, df):
            return pd.Series([1, 3, 2])

    ordering = order.ByAnnotator(FakeAnno())
    first_dependency = ordering.get_dependencies(plot_regions, lanes)[0]
    ppg.JobGeneratingJob("shu", lambda: None).depends_on(first_dependency)
    ppg.run_pipegraph()
    plot_regions._load()

    norm_data = norm.AsIs().calc(lanes, raw_data)
    res_order, clusters = ordering.calc(
        plot_regions, lanes, raw_data, norm_data
    )
    assert (res_order == [0, 2, 1]).all()
def test_smooth(self, new_pipegraph_no_qc):
    """Render a heatmap with RegionFromCenter(1000) and SmoothExtendedReads.

    The plot written to ``test.png`` is compared against the reference image.
    """
    genome = get_human_22_fake_genome()
    region_bounds = [
        (36925 * 1000 - 1000, 36925 * 1000 + 1000),
        (31485 * 1000 - 2000, 31485 * 1000 + 2000),
        (41842 * 1000, (41842 * 1000) + 1),
    ]
    region_df = pd.DataFrame(
        [
            {"chr": "chr22", "start": begin, "stop": end}
            for begin, end in region_bounds
        ]
    )
    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: region_df, [], genome
    )

    # Two lanes backed by the same BAM file.
    lane_list = [
        mbf_align.lanes.AlignedSample(
            lane_name,
            mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
            genome,
            False,
            None,
        )
        for lane_name in ("one", "two")
    ]

    heatmap = mbf_heatmap.chipseq.Heatmap(
        plot_regions,
        lane_list,
        region_strategy=regions.RegionFromCenter(1000),
        smoothing_strategy=smooth.SmoothExtendedReads(),
    )
    output_png = "test.png"
    heatmap.plot(output_png, norm.AsIs(), order.FirstLaneSum())
    ppg.run_pipegraph()
    assert_image_equal(output_png)
def test_rename_raises_on_no_replacement(self, new_pipegraph):
    """An empty rename mapping makes the pipegraph run fail.

    After the failed run no output file may exist, and the job's exception
    text must contain "No replacement happened".
    """
    ppg.util.global_pipegraph.quiet = False
    source_bam = get_sample_path("mbf_align/ex2.bam")
    target_name = "out.bam"
    rename_job = job_reheader_and_rename_chromosomes(
        source_bam, target_name, {}
    )

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    # These checks run after the expected failure has been caught.
    assert not Path("out.bam").exists()
    assert "No replacement happened" in str(rename_job.exception)
def get_human_22_fake_genome():
    """Build a ``MockGenome`` from the bundled gene and transcript tables.

    Both tables are read from gzip-compressed msgpack sample files, get their
    index reset into columns and a constant ``chr`` column of "chr22".

    Returns:
        A ``MockGenome`` built from the two tables with a single chromosome
        length entry for "chr22".
    """
    import gzip

    def _load_table(sample_name):
        # Read one gzipped msgpack table; the context manager closes the
        # compressed stream, which was previously left open.
        sample_file = mbf_sampledata.get_sample_path(sample_name)
        with gzip.GzipFile(sample_file) as handle:
            # NOTE(review): pd.read_msgpack was removed in pandas 1.0 - this
            # helper needs an older pandas or re-encoded sample data.
            return pd.read_msgpack(handle).reset_index()

    genes = _load_table("mbf_align/hs_22_genes.msgpack.gz")
    tr = _load_table("mbf_align/hs_22_transcripts.msgpack.gz")
    genes["chr"] = "chr22"
    tr["chr"] = "chr22"
    return MockGenome(
        df_genes=genes,
        df_transcripts=tr,
        chr_lengths={"chr22": 50_818_468},
    )
def test_deseq2_with_and_without_additional_columns(self):
    """DESeq2 results must differ when extra samples are used for variance.

    The same treated-vs-untreated comparison is annotated twice, with
    ``include_other_samples_for_variance`` set to True and to False, and the
    resulting p and log2FC columns are required to differ in every row.
    """
    import mbf_sampledata

    counts_path = mbf_sampledata.get_sample_path(
        "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
    )
    pasilla_data = pd.read_csv(counts_path, sep=" ")
    pasilla_data.columns = [str(name) for name in pasilla_data.columns]
    print(pasilla_data.columns)
    # Duplicate one replicate per condition under a new column name.
    pasilla_data = pasilla_data.assign(
        treated_fake=pasilla_data.treated2fb,
        untreated_fake=pasilla_data.untreated2fb,
    )

    column_names = pasilla_data.columns
    gts = {
        "treated": [
            c
            for c in column_names
            if c.startswith("treated") and "3" not in c
        ],
        "untreated": [
            c
            for c in column_names
            if c.startswith("untreated") and "3" not in c
        ],
        "other": [c for c in column_names if "3" in c],
    }
    assert len(gts["other"]) == 2
    grouped_count = sum(map(len, gts.values()))
    assert grouped_count + 1 == len(pasilla_data.columns)  # GeneId

    ddf = DelayedDataFrame("ex", pasilla_data)
    comparisons = Comparisons(ddf, gts)
    with_other = comparisons.a_vs_b(
        "treated",
        "untreated",
        DESeq2Unpaired(),
        include_other_samples_for_variance=True,
    )
    without_other = comparisons.a_vs_b(
        "treated",
        "untreated",
        DESeq2Unpaired(),
        include_other_samples_for_variance=False,
    )
    force_load(ddf.add_annotator(with_other))
    force_load(ddf.add_annotator(without_other))
    # NOTE(review): run_pipegraph() is not called here (it was commented out)
    # - confirm ddf.df is populated without it.
    df = ddf.df
    print(df.head())
    df.to_csv("test.csv")

    # this is a fairly weak test, but it shows that it at least does
    # *something*
    for key in ("p", "log2FC"):
        assert (
            df[with_other[key]] != pytest.approx(df[without_other[key]])
        ).all()
def test_rename(self, new_pipegraph):
    """Renaming chr1/chr2 yields a BAM whose references are exactly shu/sha."""
    ppg.util.global_pipegraph.quiet = False
    source_bam = get_sample_path("mbf_align/ex2.bam")
    output = "out.bam"
    job_reheader_and_rename_chromosomes(
        source_bam, output, {"chr1": "shu", "chr2": "sha"}
    )
    ppg.run_pipegraph()

    assert Path(output).exists()
    # Read the header inside a context manager so the BAM handle is closed
    # before the assertion runs.
    with pysam.Samfile(output) as renamed:
        references = set(renamed.references)
    assert references == {"shu", "sha"}
def test_by_column(self, new_pipegraph_no_qc):
    """``order.ByAnnotator`` given a column name plus a conversion function.

    The regions carry ``colA`` values "a", "c", "b"; with ``ord`` applied to
    each value the expected order is [0, 2, 1].
    """
    genome = get_human_22_fake_genome()
    start = 17750239
    offsets_and_labels = [(0, "a"), (20000, "c"), (30000, "b")]
    region_df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": start + offset,
                "stop": start + offset + 1000,
                "colA": label,
            }
            for offset, label in offsets_and_labels
        ]
    )
    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    lanes = {lane1.name: lane1}
    ordering = order.ByAnnotator(
        "colA", func=lambda values: [ord(value) for value in values]
    )
    # Three regions x four bins, all zero.
    raw_data = {lane1.name: np.zeros((3, 4), dtype=int)}
    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: region_df, [], genome
    )

    ppg.JobGeneratingJob("shu", lambda: None).depends_on(plot_regions.load())
    ppg.run_pipegraph()
    plot_regions._load()

    norm_data = norm.AsIs().calc(lanes, raw_data)
    res_order, clusters = ordering.calc(
        plot_regions, lanes, raw_data, norm_data
    )
    assert (res_order == [0, 2, 1]).all()
def test_extended(self, new_pipegraph):
    """Compare ``SmoothExtendedReads(extend).calc`` to hand-built coverage.

    The expected vector is assembled from a list of known reads, each given
    as (position, is_reverse, cigar), with every read lengthened by
    ``extend`` bases on one side depending on its strand.
    """
    genome = get_human_22_fake_genome()
    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    start = 41842000
    region_df = pd.DataFrame(
        {"chr": ["chr22"], "start": [start], "stop": [start + 1000]}
    )
    extend = 10
    calculated = smooth.SmoothExtendedReads(extend).calc(region_df, lane1)

    known_reads = [
        (41842170, True, [(0, 36)]),
        (41842241, False, [(0, 36)]),
        (41842399, False, [(0, 36)]),
        (41842416, False, [(0, 36)]),
        (41842602, True, [(0, 36)]),
        (41842687, False, [(0, 36)]),
        (41842689, True, [(0, 36)]),
        (41842730, True, [(0, 36)]),
        (41842750, False, [(0, 36)]),
        (41842770, True, [(0, 36)]),
        (41842796, True, [(0, 36)]),
        (41842942, False, [(0, 36)]),
        (41842985, False, [(0, 36)]),
    ]
    expected = np.zeros(1000)
    for absolute_pos, is_reverse, cigar in known_reads:
        relative_pos = absolute_pos - start
        print(relative_pos)
        aligned_length = cigar[0][1]
        if is_reverse:  # extend downstream!
            low = relative_pos - extend
            high = relative_pos + aligned_length
        else:
            low = relative_pos
            high = relative_pos + aligned_length + extend
        expected[low:high] += 1
    expected = expected.reshape((1, 1000))

    assert expected.shape == calculated.shape
    if (expected != calculated).any():
        # Print every differing position to ease debugging before failing.
        pairs = zip(expected[0], calculated[0])
        for index, (want, got) in enumerate(pairs):
            if want != got:
                print(index, want, got)
    assert (expected == calculated).all()
def test_simple(self, new_pipegraph_no_qc):
    """Render a heatmap with RegionAsIs and SmoothRaw for two lanes.

    The plot written to ``test.png`` is compared against the reference image.
    """
    genome = get_human_22_fake_genome()
    start = 17750239
    region_df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": start + offset,
                "stop": start + offset + 1000,
            }
            for offset in (0, 20000, 30000)
        ]
    )
    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: region_df, [], genome
    )

    # Two lanes backed by the same BAM file.
    lane_list = [
        mbf_align.lanes.AlignedSample(
            lane_name,
            mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
            genome,
            False,
            None,
        )
        for lane_name in ("one", "two")
    ]

    heatmap = mbf_heatmap.chipseq.Heatmap(
        plot_regions,
        lane_list,
        region_strategy=regions.RegionAsIs(),
        smoothing_strategy=smooth.SmoothRaw(),
    )
    output_png = "test.png"
    heatmap.plot(output_png, norm.AsIs(), order.AsIs())
    ppg.run_pipegraph()
    assert_image_equal(output_png)
def test_get_reads_in_exon():
    """Every read returned for a gene's exons overlaps the checked interval.

    ``get_reads_in_exons`` must return a non-empty result for gene
    ENSG00000128228, and each read must overlap [start, stop) by at least
    one base.
    """
    import mbf_sampledata
    import pysam

    genome = mbf_sampledata.get_human_22_fake_genome()
    gene = genome.genes["ENSG00000128228"]
    start = 21642302 - 1
    stop = 21644299

    bam_path = mbf_sampledata.get_sample_path(
        "mbf_align/rnaseq_spliced_chr22.bam"
    )
    # Keep the reads in use only while the BAM is open, and make sure the
    # handle is closed even when an assertion fails.
    with pysam.Samfile(bam_path) as bam:
        reads = gene.get_reads_in_exons(bam)
        assert reads
        for read in reads:
            overlap = read.get_overlap(start, stop)
            assert overlap > 0
def _get_tuch_data(self):
    """Load the Tuch et al. count table through R and return it as a DataFrame.

    An R function ``load_data`` is defined inline. It reads the file, builds
    a ``DGEList`` from columns 3-8 (counts) and 1-2 (genes), keeps only rows
    whose RefSeq id is known to ``org.Hs.egREFSEQ``, adds Entrez gene ids and
    symbols, sorts by total count (descending), drops duplicated symbols and
    binds gene annotation and counts together.

    Returns:
        The converted data frame with its ten columns renamed; it is asserted
        to contain 10519 rows.
    """
    import mbf_sampledata
    import mbf_r
    import rpy2.robjects as ro

    path = mbf_sampledata.get_sample_path("mbf_comparisons/TuchEtAlS1.csv")
    # directly from the manual (presumably the edgeR user's guide), plus minus:
    #   "To make this file, we downloaded Table S1 from Tuch et al. [39],
    #    deleted some unnecessary columns and edited the column headings
    #    slightly:"
    #
    # The R source needs one statement per line; R cannot parse several
    # statements on one line without separators.
    ro.r(
        """load_data = function(path) {
    rawdata <- read.delim(path, check.names=FALSE, stringsAsFactors=FALSE)
    library(edgeR)
    y <- DGEList(counts=rawdata[,3:8], genes=rawdata[,1:2])
    library(org.Hs.eg.db)
    idfound <- y$genes$idRefSeq %in% mappedRkeys(org.Hs.egREFSEQ)
    y <- y[idfound,]
    egREFSEQ <- toTable(org.Hs.egREFSEQ)
    m <- match(y$genes$idRefSeq, egREFSEQ$accession)
    y$genes$EntrezGene <- egREFSEQ$gene_id[m]
    egSYMBOL <- toTable(org.Hs.egSYMBOL)
    m <- match(y$genes$EntrezGene, egSYMBOL$gene_id)
    y$genes$Symbol <- egSYMBOL$symbol[m]
    o <- order(rowSums(y$counts), decreasing=TRUE)
    y <- y[o,]
    d <- duplicated(y$genes$Symbol)
    y <- y[!d,]
    cbind(y$genes, y$counts)
}
"""
    )
    df = mbf_r.convert_dataframe_from_r(ro.r("load_data")(str(path)))
    df.columns = [
        "idRefSeq",
        "nameOfGene",
        "EntrezGene",
        "Symbol",
        "8.N",
        "8.T",
        "33.N",
        "33.T",
        "51.N",
        "51.T",
    ]
    assert len(df) == 10519
    return df
def test_deseq2(self):
    """Annotate a treated-vs-untreated DESeq2 comparison and walk a reference table.

    The reference text is split into rows; rows that are empty or start with
    "#" are skipped, the remaining ones would be compared against the
    log2FC, p and FDR columns of the result.
    """
    import mbf_sampledata

    counts_path = mbf_sampledata.get_sample_path(
        "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
    )
    pasilla_data = pd.read_csv(counts_path, sep=" ")
    pasilla_data.columns = [str(name) for name in pasilla_data.columns]
    gts = {
        group: [c for c in pasilla_data.columns if c.startswith(group)]
        for group in ("treated", "untreated")
    }
    ddf = DelayedDataFrame("ex", pasilla_data)
    comparisons = Comparisons(ddf, gts)
    a = comparisons.a_vs_b("treated", "untreated", DESeq2Unpaired())
    force_load(ddf.add_annotator(a))
    run_pipegraph()

    # NOTE(review): the reference text starts with "#", as does every "##"
    # record in it, so the filter below never lets a row through and no
    # comparison is actually executed - confirm whether that is intended.
    check = (
        "# This is deseq2 version specific data- probably needs fixing if upgrading deseq2 "
        "## baseMean log2FoldChange lfcSE stat pvalue padj "
        "## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> "
        "## FBgn0039155 453 -3.72 0.160 -23.2 1.63e-119 1.35e-115 "
        "## FBgn0029167 2165 -2.08 0.103 -20.3 1.43e-91 5.91e-88 "
        "## FBgn0035085 367 -2.23 0.137 -16.3 6.38e-60 1.75e-56 "
        "## FBgn0029896 258 -2.21 0.159 -13.9 5.40e-44 1.11e-40 "
        "## FBgn0034736 118 -2.56 0.185 -13.9 7.66e-44 1.26e-40 "
    )
    df = ddf.df.sort_values(a["FDR"])
    df = df.set_index("Gene")

    # Result column key -> index of the matching field in a reference row.
    checked_fields = (("log2FC", 2), ("p", 5), ("FDR", 6))
    for line in check.split("\n"):
        line = line.strip()
        if not line or line[0] == "#":
            continue
        fields = line.split()
        for column_key, field_index in checked_fields:
            # NOTE(review): DataFrame.ix no longer exists in pandas >= 1.0;
            # this would need .loc if the loop body ever runs.
            self.assertAlmostEqual(
                df.ix[fields[0]][a[column_key]],
                float(fields[field_index]),
                places=2,
            )
def test_correlation(self):
    """Exactly one un-pruned QC job remains after creating a Comparisons object.

    A predicate matching job ids containing "correlation" is handed to
    ``prune_qc``; the first output file of the remaining job is compared
    against the reference image.
    """
    ppg.util.global_pipegraph.quiet = False
    import mbf_sampledata

    counts_path = mbf_sampledata.get_sample_path(
        "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
    )
    counts = pd.read_csv(counts_path, sep=" ")
    counts.columns = [str(name) for name in counts.columns]

    groups = {
        "treated": [c for c in counts.columns if c.startswith("treated")],
        "untreated": [c for c in counts.columns if c.startswith("untreated")],
    }
    ddf = DelayedDataFrame("pasilla", counts)
    # Only constructed; the returned object itself is not used by the test.
    Comparisons(ddf, groups)

    prune_qc(lambda job: "correlation" in job.job_id)
    run_pipegraph()

    remaining_jobs = [job for job in get_qc_jobs() if not job._pruned]
    print(remaining_jobs)
    assert len(remaining_jobs) == 1
    assert_image_equal(remaining_jobs[0].filenames[0])
def get_alignment_stats(self, bam_filename):
    """Return fixed dummy statistics after checking which BAM was passed in.

    Asserts that ``bam_filename`` resolves to the bundled
    ``rnaseq_spliced_chr22.bam`` sample file, then returns ``{"Hello": 23}``.
    """
    received = Path(bam_filename).resolve()
    expected = get_sample_path("mbf_align/rnaseq_spliced_chr22.bam").resolve()
    assert received == expected
    return {"Hello": 23}
def test_ithlane_max(self, new_pipegraph):
    """Exercise ``order.IthLaneMax`` with index and lane-object arguments.

    Covers: rejection of a lane name (AttributeError), KeyError when data for
    the selected lane is missing or the lane is not among the given lanes,
    and the resulting order for several raw-data layouts.
    """
    genome = get_human_22_fake_genome()
    start = 17750239
    region_df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": start + offset,
                "stop": start + offset + 1000,
            }
            for offset in (0, 20000, 30000)
        ]
    )
    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    lane2 = mbf_align.lanes.AlignedSample(
        "two",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )

    # A plain lane name is not accepted as selector.
    with pytest.raises(AttributeError):
        order.IthLaneMax(lane1.name)

    by_second_index = order.IthLaneMax(1)
    raw_data = {
        lane1.name: np.array(
            [
                [0, 0, 5, 0],
                [2, 1, 1, 1],
                [1, 0, 0, 0],
            ]
        )
    }
    print(raw_data)
    print(raw_data[lane1.name].max(axis=1))

    lanes = {lane1.name: lane1, lane2.name: lane2}
    norm_data = norm.AsIs().calc(lanes, raw_data)
    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: region_df, [], genome
    )

    # Only lane1 has raw data at this point.
    with pytest.raises(KeyError):
        by_second_index.calc(
            plot_regions,
            {lane1.name: lane1, lane2.name: lane2},
            raw_data,
            norm_data,
        )

    by_lane_object = order.IthLaneMax(lane2)
    # lane2 is not part of the lanes handed in.
    with pytest.raises(KeyError):
        by_lane_object.calc(
            plot_regions, {lane1.name: lane1}, raw_data, norm_data
        )

    raw_data[lane2.name] = raw_data[lane1.name].copy()
    res_order, clusters = by_lane_object.calc(
        plot_regions, lanes, raw_data, norm_data
    )
    assert clusters is None
    # remember, from top to bottom in plotting later on.
    assert (res_order == [2, 1, 0]).all()

    raw_data[lane2.name] = np.array(
        [
            [0, 0, 0, 0],
            [5, 1, 1, 0],
            [1, 0, 0, 4],
        ]
    )
    by_first_index = order.IthLaneMax(0)
    res_order, clusters = by_first_index.calc(
        plot_regions, lanes, raw_data, norm_data
    )
    # remember, from top to bottom in plotting later on.
    assert (res_order == [2, 1, 0]).all()

    by_second_index = order.IthLaneMax(1)
    res_order, clusters = by_second_index.calc(
        plot_regions, lanes, raw_data, norm_data
    )
    # remember, from top to bottom in plotting later on.
    assert (res_order == [0, 2, 1]).all()