def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
    self, new_pipegraph
):
    """Changing A's ParameterInvariant must rerun A and the generated job C.

    Run 1 uses parameter value "p", run 2 uses "p2". After the second run
    the assertions expect "out/Ac" and "out/Cx" to hold doubled content.
    NOTE(review): presumably ``writeappend`` appends to its second file on
    every execution, making those files run counters -- confirm in the helper.
    """

    def declare_upstream(parameter):
        # Same job id and callback in both graphs; only the invariant's
        # value differs between the runs.
        job = ppg.FileGeneratingJob(
            "out/A", lambda: writeappend("out/A", "out/Ac", "A")
        )
        invariant = ppg.ParameterInvariant("p", parameter)
        job.depends_on(invariant)
        return job

    def gen():
        generated = ppg.FileGeneratingJob(
            "out/C", lambda: writeappend("out/C", "out/Cx", "C")
        )
        # ``upstream`` is looked up when gen() runs, so each graph wires C
        # to the job declared for that graph.
        generated.depends_on(upstream)

    # --- first graph -------------------------------------------------------
    upstream = declare_upstream("p")
    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/Ac") == "A"
    assert read("out/C") == "C"
    assert read("out/Cx") == "C"

    # --- second graph: parameter changed ------------------------------------
    new_pipegraph.new_pipegraph()
    upstream = declare_upstream("p2")
    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/Ac") == "AA"
    assert read("out/Cx") == "CC"
def test_invalidation(self, new_pipegraph):
    """A generated file job whose callback changed must produce new output.

    The first graph generates "out/D" with content "D"; the second graph
    generates the same job id with a callback writing "E", and the file is
    expected to contain "E" afterwards.
    """

    def gen_writing_d():
        ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    ppg.JobGeneratingJob("A", gen_writing_d)
    ppg.run_pipegraph()
    assert read("out/D") == "D"

    new_pipegraph.new_pipegraph()

    # Kept as a separate literal lambda (not a shared parameterised one):
    # the differing callback is what this test exercises.
    def gen_writing_e():
        ppg.FileGeneratingJob("out/D", lambda: write("out/D", "E"))

    ppg.JobGeneratingJob("A", gen_writing_e)
    ppg.run_pipegraph()
    assert read("out/D") == "E"
def test_annos_dependening(self):
    """Adding annotator B, which lists A in ``dep_annos``, yields both columns.

    Only B is added to the frame; the assertions require that column "aa"
    (declared by A) is present as well and that "ab" equals "aa" + "b".
    """

    class A(Annotator):
        cache_name = "hello"
        columns = ["aa"]

        def calc(self, df):
            # Constant "a" for every row, aligned on the input index.
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "hello2"
        columns = ["ab"]

        def dep_annos(self):
            return [A()]

        def calc(self, df):
            # Reads the column declared by A.
            return df["aa"] + "b"

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    ddf += B()

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    ppg.run_pipegraph()

    result_columns = ddf.df.columns
    assert "ab" in result_columns
    assert "aa" in result_columns
    assert (ddf.df["ab"] == (ddf.df["aa"] + "b")).all()
def test_by_annotator(self, new_pipegraph_no_qc):
    """``order.ByAnnotator`` with an annotator returning [1, 3, 2] gives [0, 2, 1].

    Three 1000 bp regions on chr22 are ordered by the values a fake
    annotator supplies for column "colA".
    """
    genome = get_human_22_fake_genome()
    start = 17750239
    offsets = (0, 20000, 30000)
    df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": start + offset,
                "stop": start + offset + 1000,
            }
            for offset in offsets
        ]
    )

    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    lanes = {lane1.name: lane1}
    # All-zero 3x4 integer matrix (one row per region).
    raw_data = {lane1.name: np.zeros((3, 4), dtype=int)}

    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: df, [], genome
    )

    class FakeAnno(mbf_genomics.annotator.Annotator):
        columns = ["colA"]

        def calc(self, df):
            return pd.Series([1, 3, 2])

    ordering = order.ByAnnotator(FakeAnno())
    first_dependency = ordering.get_dependencies(plot_regions, lanes)[0]
    ppg.JobGeneratingJob("shu", lambda: None).depends_on(first_dependency)
    ppg.run_pipegraph()

    plot_regions._load()
    norm_data = norm.AsIs().calc(lanes, raw_data)
    res_order, clusters = ordering.calc(plot_regions, lanes, raw_data, norm_data)
    assert (res_order == [0, 2, 1]).all()
def gen():
    # Three nested generators: this one declares job generator "B", which
    # declares job generator "C", which finally declares file job "out/D".
    def genB():
        def genC():
            ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

        ppg.JobGeneratingJob("C", genC)

    ppg.JobGeneratingJob("B", genB)
def gen():
    # Declares the cached loading job here, and the file job that depends
    # on it one generation later (inside the nested generator).
    calc_job = ppg.CachedDataLoadingJob("out/B", calc, store)

    def gen2():
        ppg.FileGeneratingJob("out/A", dump).depends_on(calc_job)

    ppg.JobGeneratingJob("out/D", gen2)
def test_invalidation_multiple_stages(self, new_pipegraph):
    """Run a three-level chain of job generators (A -> B -> C) three times.

    ``counter`` records how often the innermost generator ``genC`` executes;
    the assertions require exactly one execution per pipegraph run. For the
    third run the chain is redefined so that "out/D" is written with "E"
    instead of "D", and the file is expected to hold the new content.
    """
    # One-element list so the nested functions can mutate the shared count.
    counter = [0]

    def count():
        counter[0] += 1
        return str(counter[0])

    def gen():
        def genB():
            def genC():
                count()
                ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

            ppg.JobGeneratingJob("C", genC)

        ppg.JobGeneratingJob("B", genB)

    # Run 1: fresh graph.
    ppg.JobGeneratingJob("A", gen)
    ppg.run_pipegraph()
    assert read("out/D") == "D"
    assert counter[0] == 1
    new_pipegraph.new_pipegraph()
    # Run 2: identical generators; genC is expected to execute once more.
    ppg.JobGeneratingJob("A", gen)
    ppg.run_pipegraph()
    assert read("out/D") == "D"
    assert counter[0] == 2
    new_pipegraph.new_pipegraph()

    # Run 3: same job ids, but the innermost callback now writes "E".
    def gen():
        def genB():
            def genC():
                count()
                ppg.FileGeneratingJob("out/D", lambda: write("out/D", "E"))

            ppg.JobGeneratingJob("C", genC)

        ppg.JobGeneratingJob("B", genB)

    ppg.JobGeneratingJob("A", gen)
    ppg.run_pipegraph()
    assert read("out/D") == "E"
    assert counter[0] == 3
def test_basic(self):
    """A job generator declaring three file jobs results in all three files."""

    def gen():
        ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
        ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))

    ppg.JobGeneratingJob("genjob", gen)
    ppg.run_pipegraph()

    expected_content = (("out/A", "A"), ("out/B", "B"), ("out/C", "C"))
    for filename, content in expected_content:
        assert read(filename) == content
def force_load(job, prefix=None):
    """Make sure a data loading job has been loaded (if applicable).

    Outside of a pipegraph (``ppg.inside_ppg()`` is false) nothing happens
    and ``None`` is returned. Otherwise a no-op ``JobGeneratingJob`` named
    ``<prefix>_force_load`` is created that depends on ``job``, and the
    result of that ``depends_on`` call is returned.

    The prefix is ``job.job_id`` whenever ``job`` is a ``ppg.Job`` (a passed
    ``prefix`` is ignored in that case); for anything else the given
    ``prefix`` is used, falling back to ``"fl_<n>"`` built from the
    module-level counter ``fl_count``.
    """
    global fl_count

    if not ppg.inside_ppg():
        return None

    if isinstance(job, ppg.Job):
        prefix = job.job_id
    elif prefix is None:
        fl_count += 1
        prefix = "fl_%i" % fl_count

    loader = ppg.JobGeneratingJob(prefix + "_force_load", lambda: None)
    return loader.depends_on(job)
def test_adding_in_job_generating_raises(self):
    """Calling ``add_annotator`` inside a job generator fails the run.

    The run must raise ``ppg.RuntimeError`` and the generating job must
    carry a ``ppg.JobContractError``.
    """
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
    )

    def gen():
        ddf.add_annotator(Constant("shu", 5))

    generator_job = ppg.JobGeneratingJob("x", gen)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert isinstance(generator_job.exception, ppg.JobContractError)
def a():
    # File job B depends on job generator A, while the job C that A
    # generates depends on B in turn.
    file_b = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))

    def genA():
        file_c = ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))
        file_c.depends_on(file_b)

    generator_a = ppg.JobGeneratingJob("A", genA)
    file_b.depends_on(generator_a)
    ppg.run_pipegraph()

    for filename, content in (("out/B", "B"), ("out/C", "C")):
        assert read(filename) == content
def test_by_column(self, new_pipegraph_no_qc):
    """Ordering by the existing column "colA" (values a, c, b) gives [0, 2, 1].

    ``order.ByAnnotator`` receives the column name plus a ``func`` that maps
    each value to its ordinal via ``ord``.
    """
    genome = get_human_22_fake_genome()
    start = 17750239
    offsets_and_labels = ((0, "a"), (20000, "c"), (30000, "b"))
    df = pd.DataFrame(
        [
            {
                "chr": "chr22",
                "start": start + offset,
                "stop": start + offset + 1000,
                "colA": label,
            }
            for offset, label in offsets_and_labels
        ]
    )

    lane1 = mbf_align.lanes.AlignedSample(
        "one",
        mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
        genome,
        False,
        None,
    )
    lanes = {lane1.name: lane1}
    ordering = order.ByAnnotator("colA", func=lambda x: [ord(y) for y in x])
    # All-zero 3x4 integer matrix (one row per region).
    raw_data = {lane1.name: np.zeros((3, 4), dtype=int)}

    plot_regions = mbf_genomics.regions.GenomicRegions(
        "testregions", lambda: df, [], genome
    )
    trigger = ppg.JobGeneratingJob("shu", lambda: None)
    trigger.depends_on(plot_regions.load())
    ppg.run_pipegraph()

    plot_regions._load()
    norm_data = norm.AsIs().calc(lanes, raw_data)
    res_order, clusters = ordering.calc(plot_regions, lanes, raw_data, norm_data)
    assert (res_order == [0, 2, 1]).all()
def test_injecting_multiple_stages(self):
    """A file job declared three generator levels deep still gets built."""

    def gen():
        # Level 1 declares generator "B" ...
        def genB():
            # ... level 2 declares generator "C" ...
            def genC():
                # ... and level 3 declares the actual file job.
                ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

            ppg.JobGeneratingJob("C", genC)

        ppg.JobGeneratingJob("B", genB)

    ppg.JobGeneratingJob("A", gen)
    ppg.run_pipegraph()

    produced = read("out/D")
    assert produced == "D"
def test_generated_job_depending_on_each_other_one_of_them_is_Invariant(
    self, new_pipegraph
):
    """Generated file job B depends on a generated ParameterInvariant C.

    Three runs, each in a fresh graph:
    1. B writes "B" with invariant value ("ccc",).
    2. B's callback would write "C", but code changes are ignored and the
       invariant value is unchanged -- the file is expected to stay "B".
    3. The invariant value becomes ("DDD",) -- the file is expected to be "C".
    """
    # Basic idea: job generator A creates not only file job B, but also the
    # ParameterInvariant C that B depends on. Does that work?
    def gen():
        jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
        jobB.ignore_code_changes()
        jobC = ppg.ParameterInvariant("C", ("ccc",))
        jobB.depends_on(jobC)

    ppg.JobGeneratingJob("A", gen)
    ppg.run_pipegraph()
    assert read("out/B") == "B"
    new_pipegraph.new_pipegraph()

    # Same invariant value, different callback body.
    def gen2():
        jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "C"))
        jobB.ignore_code_changes()
        jobC = ppg.ParameterInvariant("C", ("ccc",))
        jobB.depends_on(jobC)

    ppg.JobGeneratingJob("A", gen2)
    ppg.run_pipegraph()
    assert read("out/B") == "B"  # no rerun
    new_pipegraph.new_pipegraph()

    # Same callback body as gen2, but a different invariant value.
    def gen3():
        jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "C"))
        jobB.ignore_code_changes()
        jobCX = ppg.ParameterInvariant("C", ("DDD",))
        jobB.depends_on(jobCX)

    ppg.JobGeneratingJob("A", gen3)
    ppg.run_pipegraph()
    assert read("out/B") == "C"  # did get rerun
def test_generated_job_depending_on_each_other(self):
    """Two jobs from the same generator may depend on one another.

    The generator declares file job B and file job C; C copies the content
    of "out/B" and depends on B, so both files must end up holding "B".
    """

    def gen():
        producer = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
        consumer = ppg.FileGeneratingJob(
            "out/C", lambda: write("out/C", read("out/B"))
        )
        consumer.depends_on(producer)

    ppg.JobGeneratingJob("A", gen)
    ppg.run_pipegraph()

    for filename in ("out/B", "out/C"):
        assert read(filename) == "B"
def test_annotator_coliding_with_non_anno_column(self):
    """An annotator for "A" on a frame that already has column "A" must fail.

    The run raises ``ppg.RuntimeError`` and the exception stored on the
    annotation job mentions "were already present".
    """

    def load_frame():
        frame = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return frame.set_index("idx")

    ddf = DelayedDataFrame("shu", load_frame)
    ddf += Constant("A", "aa")
    # anno_jobs holds callables; it is called only after the run, below.
    anno_job_getter = ddf.anno_jobs["A"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "were already present" in str(anno_job_getter().exception)
def test_raises_if_needs_more_ram_than_we_have(self):
    """A generated job demanding 1024**4 units of memory must fail the run.

    The run is expected to raise ``ppg.RuntimeError``, "out/A" must not be
    created, and the generated job must be marked failed with the reason
    "Needed to much memory/cores".
    """

    def gen():
        jobA = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        # 1024**4; 1 TiB if memory_needed is in bytes -- unit not visible here.
        jobA.memory_needed = 1024 * 1024 * 1024 * 1024

    ppg.JobGeneratingJob("genjob", gen)
    # pytest.raises fails the test by itself when nothing is raised, so the
    # manual "raise ValueError('should not be reached')" guard is not needed.
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert not os.path.exists("out/A")  # since the gen job crashed
    jobGenerated = ppg.util.global_pipegraph.jobs["out/A"]
    assert jobGenerated.failed
    # Spelling matches the message this test has always compared against.
    assert jobGenerated.error_reason == "Needed to much memory/cores"
def test_jobgenerating_is_not_dependency_injection(self):
    """A job generator may not add dependencies to a pre-existing job.

    The generator makes the already declared "out/D" job depend on a newly
    created "out/C" job. Expected outcome: the run raises, the generator
    carries a ``JobContractError``, its own side effect ("out/E") happened,
    "out/C" was never built, and "out/D" was still written.
    """
    preexisting = ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    def gen():
        write("out/E", "E")
        injected = ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))
        preexisting.depends_on(injected)

    generator_job = ppg.JobGeneratingJob("genjob", gen)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert isinstance(generator_job.exception, ppg.JobContractError)
    assert read("out/E") == "E"
    # that job never makes it to the pipeline
    assert not os.path.exists("out/C")
    assert read("out/D") == "D"
def test_generated_jobs_that_can_not_run_right_away_because_of_dataloading_do_not_crash(
    self,
):
    """Generated file jobs depending on attribute loading jobs must succeed.

    One file job depends on a loading job declared before the run, the
    other on a loading job declared inside the generator.
    """
    holder = Dummy()
    existing_dl = ppg.AttributeLoadingJob("a", holder, "a", lambda: "Ashu")

    def gen():
        new_dl = ppg.AttributeLoadingJob("b", holder, "b", lambda: "Bshu")
        from_existing = ppg.FileGeneratingJob(
            "out/C", lambda: write("out/C", holder.a)
        )
        from_new = ppg.FileGeneratingJob(
            "out/D", lambda: write("out/D", holder.b)
        )
        from_existing.depends_on(existing_dl)
        from_new.depends_on(new_dl)

    ppg.JobGeneratingJob("E", gen)
    ppg.run_pipegraph()

    for filename, content in (("out/C", "Ashu"), ("out/D", "Bshu")):
        assert read(filename) == content
def test_job_generating_job_changing_cwd(new_pipegraph):
    """A job generator that calls ``os.chdir`` must not affect later jobs.

    The generator changes into "shu" and writes "b" there. The dependent
    file job writes the relative path "a", which the assertions expect next
    to "shu" rather than inside it.
    """
    from pathlib import Path

    os.mkdir("shu")

    def load():
        os.chdir("shu")
        Path("b").write_text("world")
        return 55

    file_job = ppg.FileGeneratingJob("a", lambda: Path("a").write_text("hello"))
    generator_job = ppg.JobGeneratingJob("b", load)
    file_job.depends_on(generator_job)
    ppg.run_pipegraph()

    for filename, content in (("a", "hello"), ("shu/b", "world")):
        assert read(filename) == content
def test_anno_not_returning_enough_rows_and_no_index_range_index_on_df(self):
    """An annotator returning one row for a three-row frame must fail.

    The run raises ``ppg.RuntimeError`` and the exception stored on the
    annotation job contains "Length and index mismatch ".
    """

    class BrokenAnno(Annotator):
        columns = ["X"]

        def calc(self, df):
            # Deliberately too short: one row instead of len(df).
            return pd.DataFrame({"X": [1]})

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
    )
    ddf += BrokenAnno()
    # anno_jobs holds callables; it is called only after the run, below.
    anno_job_getter = ddf.anno_jobs["X"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "Length and index mismatch " in str(anno_job_getter().exception)
def test_anno_returning_series(self):
    """An annotator may return a plain ``pd.Series`` for its single column.

    The frame is indexed by "idx" (x, y, z) while the returned series has a
    default index; column "C" is still expected to hold 0, 1, 2.
    """

    class SeriesAnno(Annotator):
        columns = ["C"]

        def calc(self, df):
            return pd.Series(list(range(len(df))))

    def load_frame():
        frame = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return frame.set_index("idx")

    ddf = DelayedDataFrame("shu", load_frame)
    ddf += SeriesAnno()

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    ppg.run_pipegraph()
    assert (ddf.df["C"] == [0, 1, 2]).all()
def test_generated_job_depends_on_failing_job(self, new_pipegraph):
    """A generated job depending on a failed job is reported as "Indirect".

    File job "a" raises ``ValueError``. Generator "b" declares file job "c"
    depending on "a". Expected: the run raises, "a" failed with reason
    "Exception", "b" reports "no error" and "c" reports "Indirect".
    """
    # For debugging, the graph can be recreated with a log_file / log_level.

    def fn_a():
        raise ValueError()

    def fn_b():
        dependent = ppg.FileGeneratingJob("c", lambda: write("c", read("a")))
        # ``failing`` is bound below, before the graph runs.
        dependent.depends_on(failing)
        return [dependent]

    failing = ppg.FileGeneratingJob("a", fn_a)
    generator_job = ppg.JobGeneratingJob("b", fn_b)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert isinstance(failing.exception, ValueError)
    assert failing.error_reason == "Exception"
    assert generator_job.error_reason == "no error"
    generated = ppg.util.global_pipegraph.jobs["c"]
    assert generated.error_reason == "Indirect"
def test_generated_job_depending_on_each_other_one_of_them_is_loading(self):
    """A generated file job may depend on a generated data loading job.

    The loading job sets the module-level global ``shu``; the file job
    writes that value to "out/A", which must then contain "123".
    """
    # Basic idea: job generator A creates a file job and also the
    # DataLoadingJob that the file job depends on. Does that work?

    def gen():
        def load():
            global shu
            shu = "123"

        def do_write():
            global shu
            write("out/A", shu)

        loader = ppg.DataLoadingJob("dl", load)
        writer = ppg.FileGeneratingJob("out/A", do_write)
        writer.depends_on(loader)

    ppg.JobGeneratingJob("gen", gen)
    ppg.run_pipegraph()

    produced = read("out/A")
    assert produced == "123"
def test_lying_about_columns(self):
    """An annotator declaring "C" but returning column "D" must fail.

    The run raises ``ppg.RuntimeError`` and the exception stored on the
    annotation job contains "declared different ".
    """

    class SeriesAnno(Annotator):
        columns = ["C"]

        def calc(self, df):
            # Returned column name does not match the declared one.
            return pd.DataFrame({"D": [0, 1, 2]})

    def load_frame():
        frame = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return frame.set_index("idx")

    ddf = DelayedDataFrame("shu", load_frame)
    ddf += SeriesAnno()
    # anno_jobs holds callables; it is called only after the run, below.
    anno_job_getter = ddf.anno_jobs["C"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "declared different " in str(anno_job_getter().exception)
def test_anno_returning_string(self):
    """An annotator whose ``calc`` returns a string must fail the run.

    The run raises ``ppg.RuntimeError``; the exception is read from the
    ``lfg`` attribute of the annotation job and must contain
    "result was no dataframe".
    """

    class SeriesAnno(Annotator):
        columns = ["C", "D"]

        def calc(self, df):
            return "abc"

    def load_frame():
        frame = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return frame.set_index("idx")

    ddf = DelayedDataFrame("shu", load_frame)
    ddf += SeriesAnno()
    # anno_jobs holds callables; it is called only after the run, below.
    anno_job_getter = ddf.anno_jobs["C"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "result was no dataframe" in str(anno_job_getter().lfg.exception)
def test_being_generated(self):
    """A ``CachedDataLoadingJob`` declared inside a job generator works.

    The cached job computes 55 and stores it on a dummy object; a generated
    file job depending on it writes the stored value to "out/A".
    """
    holder = Dummy()

    def calc():
        return 55

    def store(value):
        holder.a = value

    def dump():
        write("out/A", str(holder.a))

    def gen():
        cached = ppg.CachedDataLoadingJob("out/B", calc, store)
        writer = ppg.FileGeneratingJob("out/A", dump)
        writer.depends_on(cached)

    ppg.JobGeneratingJob("out/C", gen)
    ppg.run_pipegraph()

    produced = read("out/A")
    assert produced == "55"
def force_load(ddf):
    """Create a no-op job generator "shu" that depends on ``ddf.annotate()``."""
    annotation_jobs = ddf.annotate()
    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(annotation_jobs)
def inner():
    # Passes a string where the other call sites pass a callable.
    job_id = "out/a"
    not_a_callable = "shu"
    ppg.JobGeneratingJob(job_id, not_a_callable)
def inner():
    # Passes an int where the other call sites pass a string job id.
    not_a_string_id = 5
    ppg.JobGeneratingJob(not_a_string_id, lambda: 1)