示例#1
0
    def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
            self, new_pipegraph):
        """Change out/A's parameter and expect the generated out/C to rerun too.

        out/C is only created inside a JobGeneratingJob and depends on out/A.
        The secondary files out/Ac and out/Cx are expected to read "A"/"C"
        after the first run and "AA"/"CC" after the second one.
        """
        job_a = ppg.FileGeneratingJob(
            "out/A",
            lambda: writeappend("out/A", "out/Ac", "A"))
        job_a.depends_on(ppg.ParameterInvariant("p", "p"))

        def gen():
            # job_a is looked up when gen is called, so the second graph
            # below sees the rebound job_a.
            job_c = ppg.FileGeneratingJob(
                "out/C",
                lambda: writeappend("out/C", "out/Cx", "C"))
            job_c.depends_on(job_a)

        ppg.JobGeneratingJob("b", gen)
        ppg.run_pipegraph()
        after_first_run = (
            ("out/A", "A"),
            ("out/Ac", "A"),
            ("out/C", "C"),
            ("out/Cx", "C"),
        )
        for path, expected in after_first_run:
            assert read(path) == expected

        # Second graph: same jobs, but the ParameterInvariant value differs.
        new_pipegraph.new_pipegraph()
        job_a = ppg.FileGeneratingJob(
            "out/A",
            lambda: writeappend("out/A", "out/Ac", "A"))
        job_a.depends_on(ppg.ParameterInvariant("p", "p2"))
        ppg.JobGeneratingJob("b", gen)
        ppg.run_pipegraph()
        assert read("out/Ac") == "AA"
        assert read("out/Cx") == "CC"
示例#2
0
    def test_invalidation(self, new_pipegraph):
        """A generated job whose body changed between graphs must be redone.

        The first graph writes "D" to out/D; the second graph generates a job
        with the same id whose body writes "E", and out/D must then read "E".
        """

        def gen_initial():
            ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

        ppg.JobGeneratingJob("A", gen_initial)
        ppg.run_pipegraph()
        assert read("out/D") == "D"

        # Fresh graph, same job ids, different file content.
        new_pipegraph.new_pipegraph()

        def gen_changed():
            ppg.FileGeneratingJob("out/D", lambda: write("out/D", "E"))

        ppg.JobGeneratingJob("A", gen_changed)
        ppg.run_pipegraph()
        assert read("out/D") == "E"
示例#3
0
    def test_annos_dependening(self):
        """Adding B alone must yield both columns: B's "ab" and A's "aa".

        B names A in ``dep_annos`` and computes its column from A's column.
        """

        class A(Annotator):
            cache_name = "hello"
            columns = ["aa"]

            def calc(self, df):
                # constant column aligned to the input frame's index
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        class B(Annotator):
            cache_name = "hello2"
            columns = ["ab"]

            def dep_annos(self):
                return [A()]

            def calc(self, df):
                # reads the column produced by A
                return df["aa"] + "b"

        ddf = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        ddf += B()
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        ppg.run_pipegraph()

        for column in ("ab", "aa"):
            assert column in ddf.df.columns
        assert (ddf.df["ab"] == (ddf.df["aa"] + "b")).all()
    def test_by_annotator(self, new_pipegraph_no_qc):
        """Order three regions by an annotator column holding 1, 3, 2.

        The expected resulting order is [0, 2, 1].
        """
        genome = get_human_22_fake_genome()
        start = 17750239
        # three 1000 bp regions on chr22 at these offsets from `start`
        offsets = (0, 20000, 30000)
        df = pd.DataFrame(
            [
                {
                    "chr": "chr22",
                    "start": start + offset,
                    "stop": start + offset + 1000,
                }
                for offset in offsets
            ]
        )
        lane1 = mbf_align.lanes.AlignedSample(
            "one",
            mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
            genome,
            False,
            None,
        )
        lanes = {lane1.name: lane1}
        # all-zero 3x4 matrix, one row per region
        raw_data = {lane1.name: np.array([[0, 0, 0, 0] for _ in offsets])}
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        class FakeAnno(mbf_genomics.annotator.Annotator):
            columns = ["colA"]

            def calc(self, df):
                return pd.Series([1, 3, 2])

        o = order.ByAnnotator(FakeAnno())
        trigger = ppg.JobGeneratingJob("shu", lambda: None)
        trigger.depends_on(o.get_dependencies(plot_regions, lanes)[0])
        ppg.run_pipegraph()
        plot_regions._load()

        norm_data = norm.AsIs().calc(lanes, raw_data)
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        expected_order = [0, 2, 1]
        assert (res_order == expected_order).all()
示例#5
0
        def gen():
            """Create generating job "B", which creates "C", which creates out/D."""

            def make_c():
                def make_d():
                    # innermost level: the only job that writes a file
                    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

                ppg.JobGeneratingJob("C", make_d)

            ppg.JobGeneratingJob("B", make_c)
        def gen():
            """Create cached job out/B plus a generating job that adds out/A."""
            # calc, store and dump come from the enclosing scope
            cached = ppg.CachedDataLoadingJob("out/B", calc, store)

            def add_dump_job():
                writer = ppg.FileGeneratingJob("out/A", dump)
                writer.depends_on(cached)

            ppg.JobGeneratingJob("out/D", add_dump_job)
示例#7
0
    def test_invalidation_multiple_stages(self, new_pipegraph):
        """Three nested generation levels, executed over three graphs.

        The innermost generator bumps a counter each time it is called; the
        counter must read 1, 2 and 3 after the respective runs. The third
        graph changes the content written to out/D from "D" to "E".
        """
        calls = [0]  # one-element list so the closures below can mutate it

        def count():
            calls[0] += 1
            return str(calls[0])

        def gen():
            def level_b():
                def level_c():
                    count()
                    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

                ppg.JobGeneratingJob("C", level_c)

            ppg.JobGeneratingJob("B", level_b)

        # run 1: everything is new
        ppg.JobGeneratingJob("A", gen)
        ppg.run_pipegraph()
        assert read("out/D") == "D"
        assert calls[0] == 1

        # run 2: the very same generator again
        new_pipegraph.new_pipegraph()
        ppg.JobGeneratingJob("A", gen)
        ppg.run_pipegraph()
        assert read("out/D") == "D"
        assert calls[0] == 2

        # run 3: same structure, but out/D now receives "E"
        new_pipegraph.new_pipegraph()

        def gen_changed():
            def level_b():
                def level_c():
                    count()
                    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "E"))

                ppg.JobGeneratingJob("C", level_c)

            ppg.JobGeneratingJob("B", level_b)

        ppg.JobGeneratingJob("A", gen_changed)
        ppg.run_pipegraph()
        assert read("out/D") == "E"
        assert calls[0] == 3
示例#8
0
    def test_basic(self):
        """One JobGeneratingJob creates three file jobs; all three files must exist."""

        def create_file_jobs():
            ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
            ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
            ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))

        ppg.JobGeneratingJob("genjob", create_file_jobs)
        ppg.run_pipegraph()
        # each file holds its own letter
        for letter in ("A", "B", "C"):
            assert read("out/" + letter) == letter
示例#9
0
def force_load(job, prefix=None):
    """Make sure a data loading job has been loaded (if applicable).

    Outside of a pipegraph (``ppg.inside_ppg()`` is false) nothing happens
    and ``None`` is returned. Otherwise a no-op JobGeneratingJob named
    ``<prefix>_force_load`` is created and the result of its
    ``depends_on(job)`` call is returned.

    The prefix is chosen as follows:
    - ``job`` is a ``ppg.Job``: always ``job.job_id`` (a passed-in
      ``prefix`` is overridden),
    - otherwise: the given ``prefix``, or ``fl_<n>`` from the module level
      counter ``fl_count`` when no prefix was passed.
    """
    global fl_count
    if not ppg.inside_ppg():
        return None
    if isinstance(job, ppg.Job):
        prefix = job.job_id
    elif prefix is None:
        fl_count += 1
        prefix = "fl_%i" % fl_count
    loader = ppg.JobGeneratingJob(prefix + "_force_load", lambda: None)
    return loader.depends_on(job)
示例#10
0
    def test_adding_in_job_generating_raises(self):
        """Adding an annotator from within a JobGeneratingJob must fail the run.

        The run has to raise ppg.RuntimeError and the generating job has to
        carry a ppg.JobContractError.
        """
        ddf = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
        )

        def add_annotator_late():
            ddf.add_annotator(Constant("shu", 5))

        gen_job = ppg.JobGeneratingJob("x", add_annotator_late)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        assert isinstance(gen_job.exception, ppg.JobContractError)
示例#11
0
        def a():
            """Wire out/B -> generating job "A" -> generated out/C -> out/B and run."""
            file_b = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))

            def make_c():
                # the generated job depends on a job that itself depends on
                # the generating job
                file_c = ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))
                file_c.depends_on(file_b)

            generator = ppg.JobGeneratingJob("A", make_c)
            file_b.depends_on(generator)
            ppg.run_pipegraph()
            for letter in ("B", "C"):
                assert read("out/" + letter) == letter
    def test_by_column(self, new_pipegraph_no_qc):
        """Order three regions by column "colA" ("a", "c", "b") mapped through ord.

        The expected resulting order is [0, 2, 1].
        """
        genome = get_human_22_fake_genome()
        start = 17750239
        # (offset from `start`, colA value) for three 1000 bp regions on chr22
        region_specs = ((0, "a"), (20000, "c"), (30000, "b"))
        df = pd.DataFrame(
            [
                {
                    "chr": "chr22",
                    "start": start + offset,
                    "stop": start + offset + 1000,
                    "colA": label,
                }
                for offset, label in region_specs
            ]
        )
        lane1 = mbf_align.lanes.AlignedSample(
            "one",
            mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
            genome,
            False,
            None,
        )
        lanes = {lane1.name: lane1}
        o = order.ByAnnotator("colA", func=lambda x: [ord(y) for y in x])
        # all-zero 3x4 matrix, one row per region
        raw_data = {lane1.name: np.array([[0, 0, 0, 0] for _ in region_specs])}
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )
        trigger = ppg.JobGeneratingJob("shu", lambda: None)
        trigger.depends_on(plot_regions.load())
        ppg.run_pipegraph()
        plot_regions._load()

        norm_data = norm.AsIs().calc(lanes, raw_data)
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        expected_order = [0, 2, 1]
        assert (res_order == expected_order).all()
示例#13
0
    def test_injecting_multiple_stages(self):
        """Generating jobs nested three levels deep must still produce out/D."""

        def level_a():
            def level_b():
                def level_c():
                    # innermost level: the only job that writes a file
                    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

                ppg.JobGeneratingJob("C", level_c)

            ppg.JobGeneratingJob("B", level_b)

        ppg.JobGeneratingJob("A", level_a)
        ppg.run_pipegraph()
        assert read("out/D") == "D"
示例#14
0
    def test_generated_job_depending_on_each_other_one_of_them_is_Invariant(
        self, new_pipegraph
    ):
        """Generated file job out/B depends on a generated ParameterInvariant "C".

        Three graphs are run:
        1. out/B writes "B" with parameter ("ccc",).
        2. out/B's body would write "C", but code changes are ignored and the
           parameter is unchanged - the file must still read "B".
        3. The parameter becomes ("DDD",) - the file must now read "C".
        """

        def gen_initial():
            file_b = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
            file_b.ignore_code_changes()
            param = ppg.ParameterInvariant("C", ("ccc",))
            file_b.depends_on(param)

        ppg.JobGeneratingJob("A", gen_initial)
        ppg.run_pipegraph()
        assert read("out/B") == "B"

        new_pipegraph.new_pipegraph()

        def gen_code_changed():
            file_b = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "C"))
            file_b.ignore_code_changes()
            param = ppg.ParameterInvariant("C", ("ccc",))
            file_b.depends_on(param)

        ppg.JobGeneratingJob("A", gen_code_changed)
        ppg.run_pipegraph()
        # same parameter, code changes ignored: not rerun
        assert read("out/B") == "B"

        new_pipegraph.new_pipegraph()

        def gen_param_changed():
            file_b = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "C"))
            file_b.ignore_code_changes()
            param = ppg.ParameterInvariant("C", ("DDD",))
            file_b.depends_on(param)

        ppg.JobGeneratingJob("A", gen_param_changed)
        ppg.run_pipegraph()
        # parameter differs: rerun with the new body
        assert read("out/B") == "C"
示例#15
0
    def test_generated_job_depending_on_each_other(self):
        """One generating job creates out/B and out/C, where C depends on B.

        out/C copies the content of out/B, so both files must read "B".
        """

        def make_chain():
            producer = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
            consumer = ppg.FileGeneratingJob(
                "out/C", lambda: write("out/C", read("out/B"))
            )
            consumer.depends_on(producer)

        ppg.JobGeneratingJob("A", make_chain)
        ppg.run_pipegraph()
        for path in ("out/B", "out/C"):
            assert read(path) == "B"
示例#16
0
 def test_annotator_coliding_with_non_anno_column(self):
     """An annotator producing column "A", which the frame already has, must fail.

     The run has to raise ppg.RuntimeError and the annotation job's exception
     text has to contain "were already present".
     """

     def load_frame():
         frame = pd.DataFrame(
             {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
         )
         return frame.set_index("idx")

     ddf = DelayedDataFrame("shu", load_frame)
     ddf += Constant("A", "aa")
     anno_job = ddf.anno_jobs["A"]
     trigger = ppg.JobGeneratingJob("shu", lambda: 55)
     trigger.depends_on(ddf.annotate())
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     message = str(anno_job().exception)
     assert "were already present" in message
示例#17
0
    def test_raises_if_needs_more_ram_than_we_have(self):
        """A generated job asking for 1024**4 bytes of memory must fail the run.

        out/A must not be written, and the generated job must be marked as
        failed with the matching error reason.
        """

        def make_greedy_job():
            greedy = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
            greedy.memory_needed = 1024 ** 4

        ppg.JobGeneratingJob("genjob", make_greedy_job)
        try:
            ppg.run_pipegraph()
        except ppg.RuntimeError:
            pass
        else:
            raise ValueError("should not be reached")
        assert not os.path.exists("out/A")
        generated = ppg.util.global_pipegraph.jobs["out/A"]
        assert generated.failed
        assert generated.error_reason == "Needed to much memory/cores"
示例#18
0
    def test_jobgenerating_is_not_dependency_injection(self):
        """A generating job must not add dependencies to a pre-existing job.

        The generator's own side effect (out/E) is still expected, the newly
        created job out/C must never run, and the pre-existing out/D must
        still be built.
        """
        existing = ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

        def gen():
            write("out/E", "E")
            late = ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))
            # this is the contract violation under test
            existing.depends_on(late)

        gen_job = ppg.JobGeneratingJob("genjob", gen)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        assert isinstance(gen_job.exception, ppg.JobContractError)
        assert read("out/E") == "E"
        # the late job never makes it into the pipeline
        assert not os.path.exists("out/C")
        assert read("out/D") == "D"
示例#19
0
    def test_generated_jobs_that_can_not_run_right_away_because_of_dataloading_do_not_crash(
            self):
        """Generated file jobs may depend on pre-existing and on new loading jobs.

        out/C reads an attribute loaded by a job defined before the run,
        out/D one loaded by a job created inside the generator.
        """
        holder = Dummy()
        loader_a = ppg.AttributeLoadingJob("a", holder, "a", lambda: "Ashu")

        def gen():
            loader_b = ppg.AttributeLoadingJob("b", holder, "b", lambda: "Bshu")
            writer_c = ppg.FileGeneratingJob(
                "out/C", lambda: write("out/C", holder.a))
            writer_d = ppg.FileGeneratingJob(
                "out/D", lambda: write("out/D", holder.b))
            writer_c.depends_on(loader_a)
            writer_d.depends_on(loader_b)

        ppg.JobGeneratingJob("E", gen)
        ppg.run_pipegraph()
        assert read("out/C") == "Ashu"
        assert read("out/D") == "Bshu"
示例#20
0
def test_job_generating_job_changing_cwd(new_pipegraph):
    """A generating job that chdir()s must not misplace later outputs.

    The generator changes into "shu" and writes "b" there; file job "a",
    which depends on it, writes via a relative path and is expected to end
    up next to "shu", not inside it.
    """
    from pathlib import Path

    os.mkdir("shu")

    def change_dir_and_write():
        os.chdir("shu")
        Path("b").write_text("world")
        return 55

    file_job = ppg.FileGeneratingJob("a", lambda: Path("a").write_text("hello"))
    gen_job = ppg.JobGeneratingJob("b", change_dir_and_write)
    file_job.depends_on(gen_job)
    ppg.run_pipegraph()
    assert read("a") == "hello"
    assert read("shu/b") == "world"
示例#21
0
    def test_anno_not_returning_enough_rows_and_no_index_range_index_on_df(self):
        """An annotator returning one row for a three-row frame must fail.

        The run has to raise ppg.RuntimeError and the annotation job's
        exception text has to contain "Length and index mismatch ".
        """

        class BrokenAnno(Annotator):
            columns = ["X"]

            def calc(self, df):
                # deliberately too short
                return pd.DataFrame({"X": [1]})

        ddf = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
        )
        ddf += BrokenAnno()
        anno_job = ddf.anno_jobs["X"]
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(anno_job().exception)
        assert "Length and index mismatch " in message
示例#22
0
    def test_anno_returning_series(self):
        """An annotator may return a plain Series; column "C" must read 0, 1, 2."""

        def load_frame():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load_frame)

        class SeriesAnno(Annotator):
            columns = ["C"]

            def calc(self, df):
                # one value per row, default integer index
                return pd.Series(list(range(len(df))))

        ddf += SeriesAnno()
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        ppg.run_pipegraph()
        assert (ddf.df["C"] == [0, 1, 2]).all()
示例#23
0
    def test_generated_job_depends_on_failing_job(self, new_pipegraph):
        """A generated job depending on a failing job must be marked "Indirect".

        File job "a" raises ValueError; generating job "b" itself is expected
        to report "no error", while the generated job "c" (depending on "a")
        is expected to report "Indirect".
        """

        def always_fails():
            raise ValueError()

        def make_dependent():
            # failing_job is bound further down, before the graph is run
            dependent = ppg.FileGeneratingJob("c", lambda: write("c", read("a")))
            dependent.depends_on(failing_job)
            return [dependent]

        failing_job = ppg.FileGeneratingJob("a", always_fails)
        gen_job = ppg.JobGeneratingJob("b", make_dependent)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()

        assert isinstance(failing_job.exception, ValueError)
        assert failing_job.error_reason == "Exception"
        assert gen_job.error_reason == "no error"
        generated = ppg.util.global_pipegraph.jobs["c"]
        assert generated.error_reason == "Indirect"
示例#24
0
    def test_generated_job_depending_on_each_other_one_of_them_is_loading(self):
        """One generating job creates a DataLoadingJob and a file job using it.

        The loading job sets the module global ``shu``; the file job writes
        that value to out/A, which must then read "123".
        """

        def gen():
            def set_global():
                global shu
                shu = "123"

            def write_global():
                global shu
                write("out/A", shu)

            loader = ppg.DataLoadingJob("dl", set_global)
            writer = ppg.FileGeneratingJob("out/A", write_global)
            writer.depends_on(loader)

        ppg.JobGeneratingJob("gen", gen)
        ppg.run_pipegraph()
        assert read("out/A") == "123"
示例#25
0
    def test_lying_about_columns(self):
        """An annotator declaring column "C" but returning "D" must fail.

        The run has to raise ppg.RuntimeError and the annotation job's
        exception text has to contain "declared different ".
        """

        def load_frame():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load_frame)

        class SeriesAnno(Annotator):
            columns = ["C"]

            def calc(self, df):
                # returns a column other than the declared one
                return pd.DataFrame({"D": [0, 1, 2]})

        ddf += SeriesAnno()
        anno_job = ddf.anno_jobs["C"]
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(anno_job().exception)
        assert "declared different " in message
示例#26
0
    def test_anno_returning_string(self):
        """An annotator whose calc returns a plain string must fail.

        The run has to raise ppg.RuntimeError and the exception stored on the
        annotation job's ``lfg`` attribute has to mention
        "result was no dataframe".
        """

        def load_frame():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load_frame)

        class SeriesAnno(Annotator):
            columns = ["C", "D"]

            def calc(self, df):
                # neither a DataFrame nor a Series
                return "abc"

        ddf += SeriesAnno()
        anno_job = ddf.anno_jobs["C"]
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(anno_job().lfg.exception)
        assert "result was no dataframe" in message
示例#27
0
    def test_being_generated(self):
        """A CachedDataLoadingJob created inside a generating job must work.

        The cached value 55 is stored on a Dummy object and written to out/A
        by a file job depending on the cached job.
        """
        holder = Dummy()

        def calc():
            return 55

        def store(value):
            holder.a = value

        def dump():
            write("out/A", str(holder.a))

        def gen():
            cached = ppg.CachedDataLoadingJob("out/B", calc, store)
            writer = ppg.FileGeneratingJob("out/A", dump)
            writer.depends_on(cached)

        ppg.JobGeneratingJob("out/C", gen)
        ppg.run_pipegraph()
        assert read("out/A") == "55"
示例#28
0
def force_load(ddf):
    """Create JobGeneratingJob "shu" and make it depend on ``ddf.annotate()``."""
    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
示例#29
0
 def inner():
     """Construct a JobGeneratingJob whose second argument is a string, not a callable."""
     job_id = "out/a"
     not_a_callable = "shu"
     ppg.JobGeneratingJob(job_id, not_a_callable)
示例#30
0
 def inner():
     """Construct a JobGeneratingJob whose job id is the integer 5, not a string."""
     not_a_string_id = 5
     ppg.JobGeneratingJob(not_a_string_id, lambda: 1)