def test_exceeding_max_cycle(self, new_pipegraph):
    """Dependency cycles both below and above the cycle-detection depth
    limit must raise CycleError (and not hang or blow the stack)."""
    max_depth = 50  # this raisess...
    # Build a chain of max_depth - 1 jobs, then close the loop -> a cycle
    # just below the detection limit.
    jobs = []
    for x in range(0, max_depth - 1):
        # Bind x at definition time: a bare `lambda:` would late-bind the
        # loop variable and every job would write the final x's file.
        j = ppg.FileGeneratingJob(str(x), lambda x=x: write(str(x), str(x)))
        if jobs:
            j.depends_on(jobs[-1])
        jobs.append(j)
    jobs[0].depends_on(j)

    def inner():
        ppg.run_pipegraph()

    assertRaises(ppg.CycleError, inner)

    # Second graph: a cycle well above the limit must still raise cleanly.
    new_pipegraph.new_pipegraph()
    jobs = []
    for x in range(0, max_depth + 100):
        j = ppg.FileGeneratingJob(str(x), lambda x=x: write(str(x), str(x)))
        if jobs:
            j.depends_on(jobs[-1])
        jobs.append(j)
    jobs[0].depends_on(j)
    with pytest.raises(ppg.CycleError):
        ppg.run_pipegraph()
def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
    self, new_pipegraph
):
    """When job A is invalidated by a parameter change, a job C that is
    generated at runtime and depends on A must be invalidated as well.

    The append-files (out/Ac, out/Cx) count executions: "A"/"C" after the
    first run, "AA"/"CC" after the invalidated second run.
    """
    a = ppg.FileGeneratingJob("out/A", lambda: writeappend("out/A", "out/Ac", "A"))
    p = ppg.ParameterInvariant("p", "p")
    a.depends_on(p)

    def gen():
        # C is only created while the graph is running
        c = ppg.FileGeneratingJob(
            "out/C", lambda: writeappend("out/C", "out/Cx", "C")
        )
        c.depends_on(a)

    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/Ac") == "A"
    assert read("out/C") == "C"
    assert read("out/Cx") == "C"

    # Second run: only the parameter changes ("p" -> "p2"); A is invalidated
    # and the generated C must rerun too.
    new_pipegraph.new_pipegraph()
    a = ppg.FileGeneratingJob("out/A", lambda: writeappend("out/A", "out/Ac", "A"))
    p = ppg.ParameterInvariant("p", "p2")
    a.depends_on(p)
    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/Ac") == "AA"
    assert read("out/Cx") == "CC"
def test_unpickle_bug_prevents_single_job_from_unpickling(self):
    """If one invariant's stored state cannot be de-pickled on the next run,
    only the jobs depending on it are rerun; unrelated jobs keep their state."""
    def do_a():
        write("out/A", "A")
        append("out/As", "A")  # execution counter for A

    ppg.FileGeneratingJob("out/A", do_a)

    def do_b():
        write("out/B", "A")
        append("out/Bs", "A")  # execution counter for B

    job_B = ppg.FileGeneratingJob("out/B", do_b)
    # CantDepickle presumably pickles fine but fails on de-pickling -
    # see its definition elsewhere in this file.
    cd = CantDepickle()
    job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
    job_B.depends_on(job_parameter_unpickle_problem)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    assert read("out/Bs") == "A"
    print("second run")

    ppg.new_pipegraph(dump_graph=False)
    ppg.FileGeneratingJob("out/A", do_a)
    job_B = ppg.FileGeneratingJob("out/B", do_b)
    job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
    job_B.depends_on(job_parameter_unpickle_problem)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"  # A was untouched by the unpickling problem
    assert read("out/B") == "A"
    assert (
        read("out/Bs") == "AA"
    )  # this one got rerun because we could not load the invariant...
def test_reruns_just_plot_if_plot_changed(self, new_pipegraph):
    """If only the plot function's code changes between runs, the cached
    calc result is reused ("A" stays) while the plot is redone ("BB")."""
    def calc():
        append("out/calc", "A")  # counts calc executions
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150))}
        )

    def plot(df):
        append("out/plot", "B")  # counts plot executions
        return pyggplot.Plot(df).add_scatter("X", "Y")

    of = "out/test.png"
    ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    assert magic(of).find(b"PNG image") != -1
    assert read("out/calc") == "A"
    assert read("out/plot") == "B"

    new_pipegraph.new_pipegraph()

    # Same calc (code-identical), different plot code -> only plot reruns.
    def plot2(df):
        append("out/plot", "B")
        return pyggplot.Plot(df).add_scatter("Y", "X")

    ppg.PlotJob(of, calc, plot2)
    ppg.run_pipegraph()
    assert magic(of).find(b"PNG image") != -1
    assert read("out/calc") == "A"
    assert read("out/plot") == "BB"
def test_raises_on_non_dependend_job_injection2(self):
    """A DependencyInjectionJob may only inject dependencies into jobs that
    depend on it; leaving an injected job unconnected must fail the injector
    with a JobContractError while unrelated jobs still run."""
    dummy = Dummy()
    target = "out/A"

    def write_sum():
        write(target, dummy.A + dummy.B)

    consumer = ppg.FileGeneratingJob(target, write_sum)
    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    def inject():
        def load_a():
            return "A"

        def load_b():
            return "B"

        attr_a = ppg.AttributeLoadingJob("dlA", dummy, "A", load_a)
        ppg.AttributeLoadingJob("dlB", dummy, "B", load_b)
        consumer.depends_on(attr_a)
        # dlB is created but never wired up - that's the contract violation

    injector = ppg.DependencyInjectionJob("C", inject)
    consumer.depends_on(injector)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert not (os.path.exists(target))  # since the gen job crashed
    assert os.path.exists(
        "out/D"
    )  # since it has no relation to the gen job actually...
    assert isinstance(injector.exception, ppg.JobContractError)
    assert "case 1" in str(injector.exception)
def test_no_rerun_if_calc_change_but_ignore_codechanges(self, new_pipegraph):
    """With ignore_code_changes(), a changed calc function must NOT trigger
    a rerun - both execution counters stay at a single run."""
    def calc():
        append("out/calc", "A")  # counts calc executions
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150))}
        )

    def plot(df):
        append("out/plot", "B")  # counts plot executions
        return pyggplot.Plot(df).add_scatter("X", "Y")

    of = "out/test.png"
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    assert magic(of).find(b"PNG image") != -1
    assert read("out/calc") == "A"
    assert read("out/plot") == "B"

    new_pipegraph.new_pipegraph()

    # calc2 differs from calc only by dead code - normally enough to
    # invalidate, but ignore_code_changes() below suppresses that.
    def calc2():
        append("out/calc", "A")
        x = 5  # noqa: E157,F841
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150))}
        )

    job = ppg.PlotJob(of, calc2, plot)
    job.ignore_code_changes()
    ppg.run_pipegraph()
    assert magic(of).find(b"PNG image") != -1
    assert read("out/calc") == "A"
    assert read("out/plot") == "B"
def test_basic_prune(self):
    """A pruned job is skipped while unpruned jobs still run."""
    ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    pruned = ppg.FileGeneratingJob("B", lambda: write("B", "B"))
    pruned.prune()
    ppg.run_pipegraph()
    assert Path("A").read_text() == "A"
    assert not Path("B").exists()
def test_reruns_both_if_calc_changed(self):
    """Changing the calc function's code must invalidate and rerun both the
    calc step ("AA") and the dependent plot step ("BB").

    Modernized from unittest-style self.assert* to plain asserts for
    consistency with the rest of this file.
    """
    import pydataframe

    def calc():
        append('out/calc', 'A')  # counts calc executions
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        append('out/plot', 'B')  # counts plot executions
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    assert magic(of).find('PNG image') != -1
    assert read('out/calc') == 'A'
    assert read('out/plot') == 'B'

    ppg.new_pipegraph(rc_gen(), quiet=True)

    # calc2 differs only by dead code - still a code change, so both rerun.
    def calc2():
        append('out/calc', 'A')
        x = 5  # noqa: F841
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    job = ppg.PlotJob(of, calc2, plot)
    ppg.run_pipegraph()
    assert magic(of).find('PNG image') != -1
    assert read('out/calc') == 'AA'
    assert read('out/plot') == 'BB'
def test_no_rerun_if_ignore_code_changes_and_plot_changes(self):
    """With ignore_code_changes(), a changed plot function must not trigger
    a rerun - counters stay at one execution.

    Modernized from unittest-style self.assert* to plain asserts for
    consistency with the rest of this file.
    """
    import pydataframe

    def calc():
        append('out/calc', 'A')  # counts calc executions
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        append('out/plot', 'B')  # counts plot executions
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    assert magic(of).find('PNG image') != -1
    assert read('out/calc') == 'A'
    assert read('out/plot') == 'B'

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def plot2(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('Y', 'X')

    job = ppg.PlotJob(of, calc, plot2)
    job.ignore_code_changes()  # the changed plot code must be ignored
    ppg.run_pipegraph()
    assert magic(of).find('PNG image') != -1
    assert read('out/calc') == 'A'
    assert read('out/plot') == 'B'
def test_plotjob_fails(self):
    """CombinedPlotJob argument validation plus failure propagation: a calc
    returning None fails its cache job, and the combined plot depending on
    it fails with error_reason 'Indirect'."""
    def calc():
        return None  # deliberately broken - not a DataFrame

    def calc2():
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150)), "w": "B"}
        )

    def plot(df):
        return pyggplot.Plot(df).add_scatter("X", "Y")

    p1 = ppg.PlotJob("out/A.png", calc, plot)
    p2 = ppg.PlotJob("out/B.png", calc2, plot)
    import pathlib

    pc = ppg.CombinedPlotJob(
        pathlib.Path("out/C.png"), [p1, p2], {"facet": "w"}
    )
    # NOTE(review): presumably rejected for re-defining out/C.png with
    # different arguments - confirm against CombinedPlotJob's docs
    with pytest.raises(ValueError):
        ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1, p2], [])
    with pytest.raises(ValueError):
        ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1], {"facet": "w"})
    # fresh output paths are fine
    ppg.CombinedPlotJob(pathlib.Path("out/D.png"), [p1, p2], [])
    ppg.CombinedPlotJob(pathlib.Path("out/E.png"), [p1, p2], {"facet": "w"})
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "did not return a" in str(p1.cache_job.exception)
    assert pc.error_reason == "Indirect"  # failed because p1's calc failed
def test_basic(self):
    """PlotJob end-to-end: renders the primary PNG, writes the .tsv data
    dump and the calc cache; an add_another_plot target gets its own PNG
    but reuses the cache (no extra cache files)."""
    def build_data():
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150))}
        )

    def scatter_xy(df):
        return pyggplot.Plot(df).add_scatter("X", "Y")

    def scatter_yx_sized(df):
        result = pyggplot.Plot(df).add_scatter("Y", "X")
        result.width = 5
        result.height = 2
        return result

    target = "out/test.png"
    plot_job = ppg.PlotJob(target, build_data, scatter_xy)
    plot_job.add_fiddle(lambda p: p.scale_x_log10())
    plot_job.add_another_plot("out/test2.png", scatter_yx_sized)
    ppg.run_pipegraph()
    assert magic(target).find(b"PNG image") != -1
    assert os.path.exists(target + ".tsv")
    assert os.path.exists("cache/out/test.png")
    assert os.path.exists("out/test2.png")
    assert not os.path.exists("cache/out/test2.png")
    assert not os.path.exists("cache/out/test2.png.tsv")
def test_job_creation_after_pipegraph_run_raises(self):
    """Creating a job after the pipegraph has run must raise ValueError.

    Uses pytest.raises for consistency with the rest of this file instead
    of the assertRaises helper.
    """
    ppg.new_pipegraph(quiet=True, dump_graph=False)
    ppg.run_pipegraph()
    with pytest.raises(ValueError):
        ppg.FileGeneratingJob("A", lambda: None)
def test_raises_on_non_dependend_job_injection2_can_be_ignored(self):
    """Same scenario as test_raises_on_non_dependend_job_injection2, but
    with check_for_dependency_injections=False the contract check is
    skipped and the pipeline runs through."""
    o = Dummy()
    of = "out/A"

    def do_write():
        write(of, o.A)  # + o.B - but B is not in the dependency chain!

    job = ppg.FileGeneratingJob(of, do_write)
    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    def generate_deps():
        def load_a():
            return "A"

        def load_b():
            return "B"

        dlA = ppg.AttributeLoadingJob("dlA", o, "A", load_a)
        ppg.AttributeLoadingJob("dlB", o, "B", load_b)
        job.depends_on(dlA)
        # dlB is deliberately left unconnected - the violation we ignore

    gen_job = ppg.DependencyInjectionJob(
        "C", generate_deps, check_for_dependency_injections=False
    )
    job.depends_on(gen_job)
    ppg.run_pipegraph()
    assert os.path.exists(of)  # the check was disabled, so everything ran
def test_jobs_concurrent_jobs_run_concurrently(self):
    """Two single-core jobs on a two-core coordinator must overlap in time.

    Overlap is detected via timestamps: the job that started first must
    still be running when the second one starts.

    Fix: the original checked `jobA.start_time is None` only AFTER already
    comparing start_times - a never-run job would have raised TypeError on
    the comparison before the intended ValueError. Guard first.
    """
    ppg.new_pipegraph(
        ppg.resource_coordinators.LocalSystem(max_cores_to_use=2),
        quiet=True,
        dump_graph=False,
    )
    jobA = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
    jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
    jobA.cores_needed = 1
    jobB.cores_needed = 1
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/B") == "B"
    # Guard before comparing - None start_times would raise TypeError below.
    if jobA.start_time is None:
        raise ValueError("JobA did not run")
    if jobB.start_time is None:
        raise ValueError("JobB did not run")
    if jobA.start_time < jobB.start_time:
        first_job = jobA
        second_job = jobB
    else:
        first_job = jobB
        second_job = jobA
    print(
        "times",
        first_job.start_time,
        first_job.stop_time,
        second_job.start_time,
        second_job.stop_time,
    )
    # concurrency: the first job was still running when the second started
    assert first_job.stop_time > second_job.start_time
def test_tempfile_not_run_on_prune(self):
    """Pruning the sole consumer of a temp file prunes the temp job too."""
    temp = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
    consumer = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
    consumer.depends_on(temp)
    consumer.prune()
    ppg.run_pipegraph()
    assert not Path('B').exists()
    assert not Path('A').exists()
def test_run_may_be_called_only_once(self):
    """Calling run_pipegraph twice on the same graph must raise ValueError.

    Uses pytest.raises for consistency with the rest of this file instead
    of the assertRaises helper.
    """
    ppg.new_pipegraph(quiet=True, dump_graph=False)
    ppg.run_pipegraph()
    with pytest.raises(ValueError):
        ppg.run_pipegraph()
def test_pruning_final_jobs_directly(self):
    """A pruned FinalJob is skipped while regular jobs still run."""
    ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    ppg.FileGeneratingJob("B", lambda: write("B", "B"))
    final = ppg.FinalJob("shu", lambda: write("C", "C"))
    final.prune()
    ppg.run_pipegraph()
    assert Path("A").read_text() == "A"
    assert Path("B").read_text() == "B"
    assert not Path("C").exists()
def test_pdf(self):
    """A PlotJob with a .pdf target renders a PDF document.

    Modernized from unittest-style self.assertTrue to a plain assert for
    consistency with the rest of this file; dropped the unused local.
    """
    import pydataframe

    def calc():
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.pdf'
    ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    assert magic(of).find('PDF document') != -1
def test_can_not_run_twice(self):
    """A second run_pipegraph call must raise with a clear message.

    Replaced the try/assert-False pattern with pytest.raises for
    consistency with the rest of this file.
    """
    ppg.new_pipegraph(dump_graph=False)
    ppg.run_pipegraph()
    with pytest.raises(ValueError) as excinfo:
        ppg.run_pipegraph()
    assert "Each pipegraph may be run only once." in str(excinfo.value)
def test_basic(self):
    """Smoke test: PlotJob renders a PNG from a pydataframe DataFrame.

    Modernized from unittest-style self.assertTrue to a plain assert for
    consistency with the rest of this file; dropped the unused local.
    """
    ppg.new_pipegraph(rc_gen(), quiet=False)
    import pydataframe

    def calc():
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    assert magic(of).find('PNG image') != -1
def test_ignored_if_generating_within_filegenerating(self):
    """Creating a job inside a FileGeneratingJob's callback is silently
    ignored (unlike inside a DataLoadingJob, where it raises)."""
    write_job = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "aa"))

    def sneaky_generator():
        # this mid-run job definition is simply dropped
        ppg.FileGeneratingJob("out/B", lambda: write("out/B", "aa"))
        write("out/C", "c")

    producer = ppg.FileGeneratingJob("out/C", sneaky_generator)
    write_job.depends_on(producer)
    ppg.run_pipegraph()
    assert read("out/C") == "c"
def test_basic(self):
    """A JobGeneratingJob's callback may add several jobs; all of them run."""
    def spawn_jobs():
        ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
        ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))

    ppg.JobGeneratingJob("genjob", spawn_jobs)
    ppg.run_pipegraph()
    for outfile, content in (("out/A", "A"), ("out/B", "B"), ("out/C", "C")):
        assert read(outfile) == content
def a():
    # NOTE(review): named `a`, not `test_...` - pytest will not collect
    # this; presumably disabled deliberately. Confirm before renaming.
    # Scenario: jobB depends on the generator jobA, and the generated jobC
    # in turn depends on jobB.
    jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))

    def genA():
        jobC = ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))
        jobC.depends_on(jobB)

    jobA = ppg.JobGeneratingJob("A", genA)
    jobB.depends_on(jobA)
    ppg.run_pipegraph()
    assert read("out/B") == "B"
    assert read("out/C") == "C"
def test_tempfile_still_run_if_needed_for_other(self):
    """Pruning one consumer of a temp file leaves the temp job alive for the
    remaining consumer; the temp file is removed afterwards as usual."""
    temp = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
    pruned_consumer = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
    kept_consumer = ppg.FileGeneratingJob("C", lambda: write("C", "C" + read("A")))
    pruned_consumer.depends_on(temp)
    kept_consumer.depends_on(temp)
    pruned_consumer.prune()
    ppg.run_pipegraph()
    assert not Path('B').exists()
    assert Path('C').exists()
    assert Path('C').read_text() == 'CA'
    assert not Path('A').exists()  # temp file cleaned up after use
def test_raises_if_generating_within_dataload(self):
    """Creating jobs inside a DataLoadingJob is forbidden and must fail."""
    ppg.util.global_pipegraph.quiet = False
    downstream = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "aa"))

    def illegal_load():
        # job creation while the pipeline runs - must be rejected
        ppg.FileGeneratingJob("out/B", lambda: write("out/B", "aa"))

    loader = ppg.DataLoadingJob("load_data", illegal_load)
    downstream.depends_on(loader)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "Trying to add new jobs to running pipeline" in str(loader.exception)
def test_generated_job_depending_on_each_other(self):
    """A generator may create two jobs where one depends on the other."""
    # basic idea: jobgen A creates filegen B plus filegen C depending on B
    def spawn():
        producer = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
        consumer = ppg.FileGeneratingJob(
            "out/C", lambda: write("out/C", read("out/B"))
        )
        consumer.depends_on(producer)

    ppg.JobGeneratingJob("A", spawn)
    ppg.run_pipegraph()
    assert read("out/B") == "B"
    assert read("out/C") == "B"  # C copied B's output
def test_non_default_status_filename(self):
    """invariant_status_filename can be overridden; the status is written
    to that file and not to the default location."""
    try:
        forget_job_status("shu.dat")  # clean slate for the custom file
        forget_job_status()  # ...and for the default one
        ppg.new_pipegraph(
            quiet=True, invariant_status_filename="shu.dat", dump_graph=False
        )
        ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        ppg.run_pipegraph()
        assert os.path.exists("shu.dat")
        assert not (os.path.exists(ppg.graph.invariant_status_filename_default))
    finally:
        forget_job_status("shu.dat")  # don't leak state into other tests
def test_pdf(self):
    """A PlotJob with a .pdf target produces a PDF document."""
    def build_data():
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150))}
        )

    def scatter(df):
        return pyggplot.Plot(df).add_scatter("X", "Y")

    target = "out/test.pdf"
    ppg.PlotJob(target, build_data, scatter)
    ppg.run_pipegraph()
    assert magic(target).find(b"PDF document") != -1
def test_can_not_add_jobs_after_run(self):
    """Job creation after the pipegraph ran must raise with a clear message.

    Replaced the try/assert-False pattern with pytest.raises for
    consistency with the rest of this file.
    """
    ppg.new_pipegraph(dump_graph=False)
    ppg.run_pipegraph()
    with pytest.raises(ValueError) as excinfo:
        ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
    assert (
        "This pipegraph was already run. You need to create a new one for more jobs"
        in str(excinfo.value)
    )
def test_injecting_multiple_stages(self):
    """Job generators may nest: A generates B, B generates C, and C
    generates the actual file job - every stage must execute."""
    def stage_a():
        def stage_b():
            def stage_c():
                ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

            ppg.JobGeneratingJob("C", stage_c)

        ppg.JobGeneratingJob("B", stage_b)

    ppg.JobGeneratingJob("A", stage_a)
    ppg.run_pipegraph()
    assert read("out/D") == "D"
def test_complete(self):
    """CombinedPlotJob happy path plus argument validation.

    Accepted: facet-column list, empty list, dict plus fiddle callback.
    Rejected: plain string facet, non-path output, wrong suffix, empty job
    list, non-PlotJob entries.
    """
    def calc():
        return pd.DataFrame({
            "X": list(range(0, 100)),
            "Y": list(range(50, 150)),
            "w": "A"
        })

    def calc2():
        return pd.DataFrame({
            "X": list(range(0, 100)),
            "Y": list(range(50, 150)),
            "w": "B"
        })

    def plot(df):
        return pyggplot.Plot(df).add_scatter("X", "Y")

    p1 = ppg.PlotJob("out/A.png", calc, plot)
    p2 = ppg.PlotJob("out/B.png", calc2, plot)
    import pathlib

    ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1, p2], ["w"])
    ppg.CombinedPlotJob(pathlib.Path("out/D.png"), [p1, p2], [])
    ppg.CombinedPlotJob(
        pathlib.Path("out/E.png"),
        [p1, p2],
        {"facets": "w"},
        fiddle=lambda p: p.scale_x_log10(),
    )
    # invalid argument combinations
    with pytest.raises(ValueError):
        ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1, p2], "w")
    with pytest.raises(TypeError):
        ppg.CombinedPlotJob(5, [p1, p2], "w")
    with pytest.raises(ValueError):
        ppg.CombinedPlotJob("out/D.something", [p1, p2], "w")
    with pytest.raises(ValueError):
        ppg.CombinedPlotJob("out/D.png", [], "w")
    with pytest.raises(ValueError):
        ppg.CombinedPlotJob("out/D.png", [p1, p2.job_id], "w")

    ppg.run_pipegraph()
    assert magic("out/C.png").find(b"PNG image") != -1
    assert magic("out/D.png").find(b"PNG image") != -1
    assert magic("out/E.png").find(b"PNG image") != -1
def test_smooth(self, new_pipegraph_no_qc):
    """Heatmap with smoothed, extended reads over hand-picked chr22 windows
    matches the stored reference image."""
    genome = get_human_22_fake_genome()
    # three regions: a 2kb window, a 4kb window, and a single-base region
    df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": 36925 * 1000 - 1000,
                "stop": 36925 * 1000 + 1000,
            },
            {
                "chr": "chr22",
                "start": 31485 * 1000 - 2000,
                "stop": 31485 * 1000 + 2000,
            },
            {"chr": "chr22", "start": 41842 * 1000, "stop": (41842 * 1000) + 1},
        ]
    )
    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: df, [], genome
    )
    # two lanes backed by the same bam file - exercises multi-lane plotting
    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    lane2 = mbf_align.lanes.AlignedSample(
        "two",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    h = mbf_heatmap.chipseq.Heatmap(
        plot_regions,
        [lane1, lane2],
        region_strategy=regions.RegionFromCenter(1000),
        smoothing_strategy=smooth.SmoothExtendedReads(),
    )
    fn = "test.png"
    h.plot(fn, norm.AsIs(), order.FirstLaneSum())
    ppg.run_pipegraph()
    assert_image_equal(fn)
def test_write_trim_predefines(tmpdir, scouter):
    """write_predefined_sequences dumps the predefined sequences as tsv,
    including the duplicate / deduplication bookkeeping columns."""
    scouter.write_predefined_sequences()
    outputfile = scouter.result_dir / "predefined_sequences.tsv"
    ppg.run_pipegraph()
    df = pd.read_csv(outputfile, sep="\t")
    scouter.assert_predefined(df["Full Sequence"].values, df["Sequence"].values)
    assert outputfile.exists()
    # re-read with Name as index to check the per-entry flags
    df_new = pd.read_csv(outputfile, sep="\t")
    df_new.index = df_new["Name"]
    print(df.head())
    # test3/test4 share a sequence: both are flagged Duplicate, but only the
    # first survives deduplication
    assert df_new.loc["1>A_test3"]["Duplicate"]
    assert df_new.loc["1>A_test4"]["Duplicate"]
    assert df_new.loc["1>A_test3"]["Deduplicated"]
    assert not df_new.loc["1>A_test4"]["Deduplicated"]
    assert df_new.loc["1>A_test3"][
        "Duplicate Entries"] == "1>A_test3;1>A_test4"
def test_anno_returning_series(self):
    """An annotator may return a pd.Series; it becomes the declared column."""
    ddf = DelayedDataFrame(
        "shu",
        lambda: pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        ).set_index("idx"),
    )

    class SeriesAnno(Annotator):
        columns = ["C"]

        def calc(self, df):
            return pd.Series(list(range(len(df))))

    ddf += SeriesAnno()
    ppg.JobGeneratingJob("shu", lambda: 55).depends_on(ddf.annotate())
    ppg.run_pipegraph()
    assert (ddf.df["C"] == [0, 1, 2]).all()
def test_raises_if_calc_returns_non_df(self):
    """If calc returns something other than a DataFrame, the cache job must
    fail with a JobContractError.

    Replaced the try/except pattern with pytest.raises for consistency with
    the rest of this file; removed the dead commented-out import.
    """
    def calc():
        return None  # deliberately not a DataFrame

    def plot(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert isinstance(job.cache_job.exception, ppg.JobContractError)
def test_simple(self):
    """CachedDataLoadingJob computes once, stores via its callback, and
    downstream jobs see the loaded value."""
    holder = Dummy()

    def compute():
        return ", ".join(str(x) for x in range(0, 100))

    def stash(value):
        holder.a = value

    cache_job = ppg.CachedDataLoadingJob("out/mycalc", compute, stash)
    target = "out/A"

    def dump():
        write(target, holder.a)

    ppg.FileGeneratingJob(target, dump).depends_on(cache_job)
    ppg.run_pipegraph()
    assert read(target) == ", ".join(str(x) for x in range(0, 100))
def test_lane_with_job_generating_fastq(self):
    """A Sample may take a job (instead of filenames) as its input; the
    generated fastq is prepared, saved gzipped, and the temp copy removed."""
    def gen_fastq(fn):
        with open(fn, "wb") as op:
            op.write(b"@shu\nAGTC\n+\nasdf")  # one minimal fastq record

    job = FileGeneratingJob("input.fastq", gen_fastq)
    lane = Sample("Sample_a", job, False, vid="VA000")
    assert lane.vid == "VA000"
    temp_job = lane.prepare_input()
    assert job in temp_job.prerequisites  # wired into the dependency graph
    real_job = lane.save_input()
    ppg.run_pipegraph()
    assert not Path(temp_job.filenames[0]).exists()  # temp input cleaned up
    assert Path(real_job.filenames[0]).exists()
    with gzip.GzipFile(real_job.filenames[0], "r") as op:
        lines = op.readlines()
    assert len(lines) == 4  # the single record survived the round trip
def test_basic(self, new_pipegraph):
    """DependencyInjectionJob injects two AttributeLoadingJobs which the
    dependent FileGeneratingJob then consumes."""
    # TODO: there is a problem with this apporach. The AttributeLoadingJob
    # references different objects, since it get's pickled alongside with the method,
    # and depickled again, and then it's not the same object anymore,
    # so the FileGeneratingJob and the AttributeLoadingJob in this test
    # reference different objects.
    # I'm not sure how to handle this right now though.

    # I have an idea: Do JobGraphModifyingJobs in each slave, and send back just the
    # dependency data (and new job name).
    # that way, we can still execute on any slave, and all the pointers should be
    # right.
    new_pipegraph.new_pipegraph()

    o = Dummy()
    of = "out/A"

    def do_write():
        # logging.info("Accessing dummy (o) %i in pid %s" % (id(o), os.getpid()))
        write(of, o.A + o.B)

    job = ppg.FileGeneratingJob(of, do_write)

    def generate_deps():
        def load_a():
            # logging.info('executing load A')
            return "A"

        def load_b():
            # logging.info('executing load B')
            return "B"

        # logging.info("Creating dl on %i in pid %s" % (id(o), os.getpid()))
        dlA = ppg.AttributeLoadingJob("dlA", o, "A", load_a)
        # logging.info("created dlA")
        dlB = ppg.AttributeLoadingJob("dlB", o, "B", load_b)
        job.depends_on(dlA)
        job.depends_on(dlB)
        return [dlA, dlB]

    gen_job = ppg.DependencyInjectionJob("C", generate_deps)
    job.depends_on(gen_job)
    ppg.run_pipegraph()
    assert read(of) == "AB"
def test_two_differenct_annotators_with_identical_column_names_raise_on_creation(
    self
):
    # NOTE(review): the name says "raise on creation", but the final
    # pytest.raises is commented out - as written this only checks the
    # caching/identity behaviour of annotators sharing a cache_name.
    a = DummyAnnotatable("A")
    columns_called = [False]

    class DA(Annotator):
        def __init__(self, prefix):
            self.prefix = prefix
            self.cache_name = prefix

        @property
        def columns(self):
            raise ValueError()
            # NOTE(review): unreachable - nothing after the raise ever runs,
            # so columns_called can never be set by this class
            columns_called[0] = True
            return ["%s-A" % self.prefix]

        def calc(self, df):
            ll = len(df)
            return pd.DataFrame({"DA1-A": [0] * ll})

    class DA2(Annotator):
        cache_name = "DA2"

        def __init__(self, prefix):
            self.prefix = prefix

        @property
        def columns(self):
            columns_called[0] = True
            return ["%s-A" % self.prefix]

        def annotate(self, df):
            ll = len(df)
            return pd.DataFrame({"DA1-A": [0] * ll})

    a += DA("DA-1")
    d = DA("DA-2")
    a += d  # still ok.
    a += d  # still ok...a
    # same cache_name -> presumably the same cached instance; confirm
    # against the Annotator registry implementation
    assert DA("DA-2") is d
    assert columns_called[0] is False
    # with pytest.raises(ppg.RuntimeError):
    ppg.run_pipegraph()
def test_logging(new_pipegraph):
    """pypipegraph's logger emits INFO/DEBUG lines to an attached handler,
    and unrelated root-logger messages do not leak in.

    Fix: the FileHandler was never closed, leaking the file handle past the
    test; also replaced `not (x in y)` with the idiomatic `x not in y`.
    """
    import logging

    my_logger = logging.getLogger('pypipegraph')
    h = logging.FileHandler(filename='ppg.log', mode='w')
    my_logger.addHandler(h)
    # goes to the root logger - must NOT show up in ppg.log
    logging.getLogger().warning("Should not be in the log.")
    try:
        my_logger.setLevel(logging.DEBUG)
        f = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        h.setFormatter(f)
        ppg.FileGeneratingJob('out/A', lambda: write('out/A', 'A'))
        ppg.run_pipegraph()
    finally:
        my_logger.removeHandler(h)
        h.close()  # release the log file handle
    assert os.path.exists('ppg.log')
    d = read('ppg.log')
    assert 'Should not be in the log.\n' not in d
    assert 'pypipegraph - INFO' in d
    assert 'pypipegraph - DEBUG' in d
def test_injecting_filegenerating_job(self):
    """A DependencyInjectionJob may inject a FileGeneratingJob which the
    dependent job then consumes."""
    target = "out/A"

    def copy_b():
        write(target, read("out/B"))

    consumer = ppg.FileGeneratingJob(target, copy_b)

    def inject():
        def write_B():
            write("out/B", "B")

        injected = ppg.FileGeneratingJob("out/B", write_B)
        consumer.depends_on(injected)

    injector = ppg.DependencyInjectionJob("gen_job", inject)
    consumer.depends_on(injector)
    ppg.run_pipegraph()
    assert read("out/A") == "B"
def test_count_fastq_trimmed(tmpdir, raw_lane, scouter):
    """After trimming, the per-sequence read counts in the tsv match the
    known counts for the fixture lane."""
    scouter.write_fastq_count_trimmed(raw_lane)
    ppg.run_pipegraph()
    output_file = scouter.result_dir / f"{raw_lane.name}_{scouter.name}_all_reads_trimmed.tsv"
    df = pd.read_csv(output_file, sep="\t")
    # expected trimmed-sequence -> read-count pairs for the fixture data
    expected = {
        "TTGCTTTACCTCCTTTTAGTTGGCCTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTGCA": 2,
        "TTGCTTTACCTCCTTTTAGTTGGCCTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTGC": 2,
        "TTGCTTTACCTCCTTTTAGTTGGCCTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTG": 1,
        "TTGCTTTACCTCCTTTTAGCCTCTTTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTGCA": 2,
        "AGGAATCGCTTTACCTCCTTTTAGTTGAAATTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGT": 1
    }
    for _, row in df.iterrows():
        assert expected[row["Sequence"]] == row["Count"]
def test_cached_jobs_get_depencies_only_on_the_lazy_filegenerator_not_on_the_loading_job(
        self):
    """Dependencies added to a CachedAttributeLoadingJob must land on its
    internal calculating job (lfg), not on the loading wrapper itself.

    Idiom fix: `not (x in y)` -> `x not in y`.
    """
    o = Dummy()

    def calc():
        return list(range(0, o.b))

    job = ppg.CachedAttributeLoadingJob("a", o, "a", calc)

    def do_b():
        return 100

    jobB = ppg.AttributeLoadingJob("b", o, "b", do_b)
    job.depends_on(jobB)
    # the wrapper stays free of the dependency; only the lazy calculator gets it
    assert jobB not in job.prerequisites
    assert jobB in job.lfg.prerequisites
    ppg.run_pipegraph()
    assert jobB.was_invalidated
    assert job.was_invalidated
def test_anno_not_returning_enough_rows_and_no_index(self):
    """An annotator returning too few rows without a usable index must fail
    with a length/index mismatch."""
    class BrokenAnno(Annotator):
        columns = ["X"]

        def calc(self, df):
            return pd.DataFrame({"X": [1]})  # one row for a 3-row frame

    ddf = DelayedDataFrame(
        "shu",
        lambda: pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        ).set_index("idx"),
    )
    ddf += BrokenAnno()
    load_job = ddf.anno_jobs["X"]
    ppg.JobGeneratingJob("shu", lambda: 55).depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "Length and index mismatch " in str(load_job().exception)
def test_raises_if_plot_returns_non_plot(self):
    """A plot function returning None must fail the job with a
    JobContractError.

    Replaced the try/except pattern with pytest.raises for consistency with
    the rest of this file; removed the dead commented-out import.
    """
    def calc():
        return pd.DataFrame({
            "X": list(range(0, 100)),
            "Y": list(range(50, 150))
        })

    def plot(df):
        return None  # deliberately not a pyggplot.Plot

    of = "out/test.png"
    job = ppg.PlotJob(of, calc, plot)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert isinstance(job.exception, ppg.JobContractError)
def test_all_transcripts(self, mock_download, shared_prebuild):
    """df_transcripts contains all Ustilago maydis transcripts with correct
    coordinates, exon tuples, and categorical chr/biotype columns."""
    g = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
    ppg.run_pipegraph()
    df = g.df_transcripts
    assert "gene_stable_id" in df.columns
    assert len(df) == 6928 + 4  # from the a2 locus
    assert df["chr"].dtype.name == "category"
    assert df["biotype"].dtype.name == "category"
    # spot-check one transcript end to end
    assert df.loc["KIS71021"].chr == "2"
    assert df.loc["KIS71021"].strand == 1
    assert df.loc["KIS71021"].start == 354_742
    assert df.loc["KIS71021"].stop == 356_690
    assert df.loc["KIS71021"].gene_stable_id == "UMAG_12118"
    assert df.loc["KIS71021"].biotype == "protein_coding"
    assert df.loc["KIS71021"].exons == ((354_742, 354_936), (355_222, 356_690))
    assert df.loc["KIS71021"].exon_stable_ids == ("KIS71021-1", "KIS71021-2")
def test_annotator_raising(self):
    """An exception inside an annotator's calc surfaces on its lfg job."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class RaiseAnno(Annotator):
        columns = ["aa"]
        cache_name = "empty"

        def calc(self, df):
            raise ValueError("hello")

    ddf += RaiseAnno()
    force_load(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    failing_job = ddf.anno_jobs[RaiseAnno().get_cache_name()]
    assert "hello" in str(failing_job.lfg.exception)
def test_multi_level(self):
    """Filters may be chained; each level carries its own annotators, and
    annotator values reflect the row count at annotation time."""
    root = DelayedDataFrame(
        "shu",
        lambda: pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        ).set_index("idx"),
    )
    level_one = root.filter("sha", lambda df: df["C"] == 4, Constant("C", 4))
    level_one += LenAnno("count")
    level_two = level_one.filter("shc", lambda df: df["A"] >= 2)
    level_two += LenAnno("count2")
    level_two.write()
    ppg.run_pipegraph()
    assert len(level_two.df) == 2
    assert (level_two.df["A"] == [2, 3]).all()
    assert (level_two.df["count"] == "count3").all()  # annotated on 3 rows
    assert (level_two.df["count2"] == "count22").all()  # annotated on 2 rows
def test_cached_dataloading_job_does_not_load_its_preqs_on_cached(
    self, new_pipegraph
):
    """Once a CachedDataLoadingJob's cache file exists, rerunning does not
    re-execute its prerequisites or the calc - only the load callback.

    The append-files out/A, out/B and out/Cx count executions of the
    DataLoadingJob, the calc, and the load callback respectively.
    """
    o = Dummy()

    def a():
        o.a = "A"
        append("out/A", "A")

    def calc():
        append("out/B", "B")
        return o.a * 2

    def load(value):
        o.c = value
        append("out/Cx", "C")  # not C, that's the cached file, you know...

    def output():
        write("out/D", o.c)

    dl = ppg.DataLoadingJob("out/A", a)
    ca = ppg.CachedDataLoadingJob("out/C", calc, load)
    fg = ppg.FileGeneratingJob("out/D", output)
    fg.depends_on(ca)
    ca.depends_on(dl)
    ppg.run_pipegraph()
    assert read("out/D") == "AA"  # we did write the final result
    assert read("out/A") == "A"  # ran the dl job
    assert read("out/B") == "B"  # ran the calc job...
    assert read("out/Cx") == "C"  # ran the load jobo
    os.unlink("out/D")  # so the filegen and the loadjob of cached should rerun...

    new_pipegraph.new_pipegraph()
    dl = ppg.DataLoadingJob("out/A", a)
    ca = ppg.CachedDataLoadingJob("out/C", calc, load)
    fg = ppg.FileGeneratingJob("out/D", output)
    fg.depends_on(ca)
    ca.depends_on(dl)
    ppg.run_pipegraph()
    assert read("out/D") == "AA"  # we did write the final result
    assert read("out/A") == "A"  # did not run the dl job
    assert read("out/B") == "B"  # did not run the calc job again
    assert read("out/Cx") == "CC"  # did run the load job again
def test_protein_creation(self):
    """Proteins translated from gtf + cds must match Ensembl's published
    pep fasta: we may produce extras (e.g. tRNAs), but no missing entries
    and no sequence mismatches."""
    g = FileBasedGenome(
        "Candidatus_carsonella",
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
        ),
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
        ),
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
        ),
        None,
        ProkaryoticCode(),
    )
    g.download_genome()
    g.job_transcripts()
    ppg.run_pipegraph()
    should = dict(
        iter_fasta(
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            )))
    # keep only the id portion of each fasta header
    should = {k[:k.find(b" ")]: v for (k, v) in should.items()}
    actual = dict(iter_fasta(g.find_file("pep.fasta")))
    if actual != should:
        assert not set(should.keys()).difference(set(actual.keys(
        )))  # they are all here, we just have more (tRNA...)
        # dump every differing sequence for debugging, then fail
        for k in should:
            if actual[k] != should[k]:
                print(k)
                print(len(actual[k]))
                print(len(should[k]))
                print(actual[k])
                print(should[k])
                # print(g.get_cds_sequence(k.decode('utf-8')))
            # else:
            #     print('ok', k)
            # assert actual[k] == should[k]
        assert False
def test_being_generated(self):
    """CachedDataLoadingJobs may themselves be created by a generator."""
    holder = Dummy()

    def compute():
        return 55

    def stash(value):
        holder.a = value

    def dump():
        write("out/A", str(holder.a))

    def spawn():
        cache_job = ppg.CachedDataLoadingJob("out/B", compute, stash)
        writer = ppg.FileGeneratingJob("out/A", dump)
        writer.depends_on(cache_job)

    ppg.JobGeneratingJob("out/C", spawn)
    ppg.run_pipegraph()
    assert read("out/A") == "55"
def test_generated_job_depending_on_each_other_one_of_them_is_loading(self):
    # basic idea. You have jobgen A,
    # it not only creates filegenB, but also DataloadingC that depends on B
    # does that work
    def gen():
        def load():
            # the loaded value is handed over via a module global, not a file
            global shu
            shu = "123"

        def do_write():
            global shu
            write("out/A", shu)

        dl = ppg.DataLoadingJob("dl", load)
        jobB = ppg.FileGeneratingJob("out/A", do_write)
        jobB.depends_on(dl)

    ppg.JobGeneratingJob("gen", gen)
    ppg.run_pipegraph()
    assert read("out/A") == "123"
def test_generated_job_depends_on_failing_job(self, new_pipegraph):
    """A generated job whose dependency fails is marked 'Indirect'; the
    generator itself reports no error of its own."""
    # import logging
    # new_pipegraph.new_pipegraph(log_file="debug.log", log_level=logging.DEBUG)
    def fn_a():
        raise ValueError()

    def fn_b():
        # generated at runtime, depending on the (failing) job a
        c = ppg.FileGeneratingJob("c", lambda: write("c", read("a")))
        c.depends_on(a)
        return [c]

    a = ppg.FileGeneratingJob("a", fn_a)
    b = ppg.JobGeneratingJob("b", fn_b)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert isinstance(a.exception, ValueError)
    assert a.error_reason == "Exception"
    assert b.error_reason == "no error"  # the generator itself succeeded
    assert ppg.util.global_pipegraph.jobs["c"].error_reason == "Indirect"
def test_transcript_wrong_order(self):
    """A gtf with start > stop on a transcript must fail the transcript job
    with a clear error."""
    g = FileBasedGenome(
        "Candidatus_carsonella",
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
        ),
        # deliberately broken gtf: a transcript with start > stop
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_wrong_order.gtf.gz"
        ),
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
        ),
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
        ),
    )
    job = g.job_transcripts()
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "start > stop" in str(job.exception)
def test_anno_returning_string(self):
    """An annotator returning a plain string (neither DataFrame nor Series)
    must fail its load job."""
    ddf = DelayedDataFrame(
        "shu",
        lambda: pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        ).set_index("idx"),
    )

    class SeriesAnno(Annotator):
        columns = ["C", "D"]

        def calc(self, df):
            return "abc"  # wrong type on purpose

    ddf += SeriesAnno()
    load_job = ddf.anno_jobs["C"]
    ppg.JobGeneratingJob("shu", lambda: 55).depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "result was no dataframe" in str(load_job().lfg.exception)
def test_DynamicColumNames(self):
    """Annotators may declare their columns via a property instead of a
    class attribute."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class Dynamic(Annotator):
        @property
        def columns(self):
            return ["a"]

        def calc(self, df):
            return pd.DataFrame({"a": ["x", "y"]})

    ddf += Dynamic()
    ddf.anno_jobs[Dynamic().get_cache_name()]
    force_load(ddf.annotate())
    ppg.run_pipegraph()
    assert_frame_equal(
        ddf.df, pd.DataFrame({"A": [1, 2], "B": ["c", "d"], "a": ["x", "y"]})
    )
def test_annotator_missing_columns(self):
    """An annotator that never declares `columns` fails with an
    AttributeError when the columns are first needed."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class MissingColumnNames(Annotator):
        cache_name = "MissingColumnNames"

        def calc(self, df):
            return pd.DataFrame({})

        def __repr__(self):
            return "MissingColumnNames()"

    ddf += MissingColumnNames()
    load_job = ddf.anno_jobs["MissingColumnNames"]
    force_load(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "AttributeError" in repr(load_job().lfg.exception)
def test_add_another_not_returning_plot(self):
    """A secondary plot function returning None fails its job with a
    JobContractError."""
    def build_data():
        return pd.DataFrame(
            {"X": list(range(0, 100)), "Y": list(range(50, 150))}
        )

    def good_plot(df):
        return pyggplot.Plot(df).add_scatter("X", "Y")

    def bad_plot(df):
        return  # deliberately returns None instead of a plot

    target = "out/test.png"
    primary = ppg.PlotJob(target, build_data, good_plot)
    primary.add_fiddle(lambda p: p.scale_x_log10())
    secondary = primary.add_another_plot("out/test2.png", bad_plot)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert isinstance(secondary.exception, ppg.JobContractError)
def test_lying_about_columns(self):
    """An annotator whose calc delivers different columns than it declared
    must fail the load job."""
    ddf = DelayedDataFrame(
        "shu",
        lambda: pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        ).set_index("idx"),
    )

    class SeriesAnno(Annotator):
        columns = ["C"]

        def calc(self, df):
            return pd.DataFrame({"D": [0, 1, 2]})  # declared C, delivers D

    ddf += SeriesAnno()
    load_job = ddf.anno_jobs["C"]
    ppg.JobGeneratingJob("shu", lambda: 55).depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "declared different " in str(load_job().exception)
def test_empty_gtf_and_cdna_and_protein(self):
    """A genome without gtf/cdna/protein inputs yields empty frames instead
    of crashing."""
    genome = FileBasedGenome(
        "Candidatus_carsonella",
        get_sample_data(
            "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
        ),
        None,
        None,
    )
    genome.download_genome()
    assert genome.gtf_filename is None
    assert genome.cdna_fasta_filename is None
    genome.job_transcripts()
    genome.job_genes()
    genome.job_proteins()
    ppg.run_pipegraph()
    for frame in (
        genome.df_transcripts,
        genome.get_gtf(),
        genome.df_genes,
        genome.df_proteins,
    ):
        assert len(frame) == 0