Exemplo n.º 1
0
    def test_exceeding_max_cycle(self, new_pipegraph):
        """Cyclic job chains must raise ppg.CycleError when the graph is run.

        Two chains are tried: one with ``max_depth - 1`` jobs and, on a fresh
        pipegraph, one with ``max_depth + 100`` jobs.
        """
        max_depth = 50

        def build_cyclic_chain(length):
            # Linear chain 0 <- 1 <- ... <- length-1, closed into a loop by
            # making the first job depend on the last one.
            chain = []
            for x in range(0, length):
                job = ppg.FileGeneratingJob(str(x), lambda: write(str(x), str(x)))
                if chain:
                    job.depends_on(chain[-1])
                chain.append(job)
            chain[0].depends_on(chain[-1])

        def inner():
            ppg.run_pipegraph()

        # Chain shorter than max_depth.
        build_cyclic_chain(max_depth - 1)
        assertRaises(ppg.CycleError, inner)

        # Chain much longer than max_depth, on a new graph.
        new_pipegraph.new_pipegraph()
        build_cyclic_chain(max_depth + 100)
        with pytest.raises(ppg.CycleError):
            ppg.run_pipegraph()
    def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
        self, new_pipegraph
    ):
        """A changed ParameterInvariant on A must rerun A and the generated job C.

        C is created only when the JobGeneratingJob "b" executes ``gen`` and it
        depends on A. The second files passed to ``writeappend`` ("out/Ac",
        "out/Cx") are expected to grow by one letter per execution.
        """
        job_a = ppg.FileGeneratingJob(
            "out/A", lambda: writeappend("out/A", "out/Ac", "A")
        )
        job_a.depends_on(ppg.ParameterInvariant("p", "p"))

        def gen():
            # job_a is resolved when gen() is called, so in the second run the
            # generated job hangs off the job_a of the second graph.
            job_c = ppg.FileGeneratingJob(
                "out/C", lambda: writeappend("out/C", "out/Cx", "C")
            )
            job_c.depends_on(job_a)

        ppg.JobGeneratingJob("b", gen)
        ppg.run_pipegraph()
        assert read("out/A") == "A"
        assert read("out/Ac") == "A"
        assert read("out/C") == "C"
        assert read("out/Cx") == "C"

        # Second graph: same jobs, but the parameter value changed to "p2".
        new_pipegraph.new_pipegraph()
        job_a = ppg.FileGeneratingJob(
            "out/A", lambda: writeappend("out/A", "out/Ac", "A")
        )
        job_a.depends_on(ppg.ParameterInvariant("p", "p2"))
        ppg.JobGeneratingJob("b", gen)
        ppg.run_pipegraph()
        assert read("out/Ac") == "AA"
        assert read("out/Cx") == "CC"
Exemplo n.º 3
0
    def test_unpickle_bug_prevents_single_job_from_unpickling(self):
        """An invariant that cannot be loaded again must only affect its own job.

        Job B depends on a ParameterInvariant holding a ``CantDepickle``
        instance; job A has no such dependency. In the second run the graph is
        expected to fail with ppg.RuntimeError, B to have run again and A not.
        """

        def do_a():
            write("out/A", "A")
            append("out/As", "A")

        def do_b():
            write("out/B", "A")
            append("out/Bs", "A")

        cd = CantDepickle()

        def declare_jobs():
            # Same job layout for both runs; the invariant value is the very
            # same `cd` object each time.
            ppg.FileGeneratingJob("out/A", do_a)
            job_b = ppg.FileGeneratingJob("out/B", do_b)
            job_b.depends_on(ppg.ParameterInvariant("C", (cd,)))

        declare_jobs()
        ppg.run_pipegraph()
        assert read("out/A") == "A"
        assert read("out/As") == "A"
        assert read("out/B") == "A"
        assert read("out/Bs") == "A"

        print("second run")
        ppg.new_pipegraph(dump_graph=False)
        declare_jobs()
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        assert read("out/A") == "A"
        assert read("out/As") == "A"
        assert read("out/B") == "A"
        # B was executed a second time because its invariant could not be loaded.
        assert read("out/Bs") == "AA"
Exemplo n.º 4
0
        def test_reruns_just_plot_if_plot_changed(self, new_pipegraph):
            """Changing only the plot function must rerun the plot, not the calc.

            "out/calc" and "out/plot" receive one letter per execution of the
            respective function, so their content counts the executions.
            """

            def calc():
                append("out/calc", "A")
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def plot(df):
                append("out/plot", "B")
                return pyggplot.Plot(df).add_scatter("X", "Y")

            def plot2(df):
                # Same as plot() but with the axes swapped.
                append("out/plot", "B")
                return pyggplot.Plot(df).add_scatter("Y", "X")

            png_path = "out/test.png"

            # First run: both functions execute once.
            ppg.PlotJob(png_path, calc, plot)
            ppg.run_pipegraph()
            assert magic(png_path).find(b"PNG image") != -1
            assert read("out/calc") == "A"
            assert read("out/plot") == "B"

            # Second run with the modified plot function.
            new_pipegraph.new_pipegraph()
            ppg.PlotJob(png_path, calc, plot2)
            ppg.run_pipegraph()
            assert magic(png_path).find(b"PNG image") != -1
            assert read("out/calc") == "A"
            assert read("out/plot") == "BB"
    def test_raises_on_non_dependend_job_injection2(self):
        """A DependencyInjectionJob creating a job nobody depends on must fail.

        ``generate_deps`` creates two AttributeLoadingJobs but attaches only
        dlA to the file job. The run is expected to raise ppg.RuntimeError and
        the injection job to carry a JobContractError mentioning "case 1".
        """
        holder = Dummy()
        out_path = "out/A"

        def do_write():
            write(out_path, holder.A + holder.B)

        job = ppg.FileGeneratingJob(out_path, do_write)
        ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

        def generate_deps():
            def load_a():
                return "A"

            def load_b():
                return "B"

            dl_a = ppg.AttributeLoadingJob("dlA", holder, "A", load_a)
            # dlB is created but never attached to `job`.
            ppg.AttributeLoadingJob("dlB", holder, "B", load_b)
            job.depends_on(dl_a)

        gen_job = ppg.DependencyInjectionJob("C", generate_deps)
        job.depends_on(gen_job)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()

        # The file job depends on the failed injection job, so it produced nothing.
        assert not (os.path.exists(out_path))
        # out/D is unrelated to the injection job and must exist.
        assert os.path.exists("out/D")
        assert isinstance(gen_job.exception, ppg.JobContractError)
        assert "case 1" in str(gen_job.exception)
Exemplo n.º 6
0
        def test_no_rerun_if_calc_change_but_ignore_codechanges(self, new_pipegraph):
            """With ignore_code_changes(), a modified calc function triggers no rerun.

            "out/calc" and "out/plot" get one letter appended per execution and
            must still hold a single letter after the second run.
            """

            def calc():
                append("out/calc", "A")
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def calc2():
                # Identical result; the extra assignment only alters the code.
                append("out/calc", "A")
                x = 5  # noqa: E157,F841
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def plot(df):
                append("out/plot", "B")
                return pyggplot.Plot(df).add_scatter("X", "Y")

            png_path = "out/test.png"

            # First run with the original calc.
            ppg.PlotJob(png_path, calc, plot)
            ppg.run_pipegraph()
            assert magic(png_path).find(b"PNG image") != -1
            assert read("out/calc") == "A"
            assert read("out/plot") == "B"

            # Second run: changed calc, but code changes are ignored.
            new_pipegraph.new_pipegraph()
            plot_job = ppg.PlotJob(png_path, calc2, plot)
            plot_job.ignore_code_changes()
            ppg.run_pipegraph()
            assert magic(png_path).find(b"PNG image") != -1
            assert read("out/calc") == "A"
            assert read("out/plot") == "B"
Exemplo n.º 7
0
 def test_basic_prune(self):
     """A pruned FileGeneratingJob must not produce its file; others still run."""
     ppg.FileGeneratingJob("A", lambda: write("A", "A"))
     pruned_job = ppg.FileGeneratingJob("B", lambda: write("B", "B"))
     pruned_job.prune()
     ppg.run_pipegraph()
     path_a, path_b = Path("A"), Path("B")
     assert path_a.read_text() == "A"
     assert not path_b.exists()
Exemplo n.º 8
0
    def test_reruns_both_if_calc_changed(self):
        """A changed calc function must rerun both the calc and the plot step.

        "out/calc" and "out/plot" get one letter appended per execution.
        """
        import pydataframe

        def calc():
            append("out/calc", "A")
            return pydataframe.DataFrame(
                {"X": list(range(0, 100)), "Y": list(range(50, 150))}
            )

        def calc2():
            # Same data as calc(); the extra assignment only alters the code.
            append("out/calc", "A")
            x = 5
            return pydataframe.DataFrame(
                {"X": list(range(0, 100)), "Y": list(range(50, 150))}
            )

        def plot(df):
            append("out/plot", "B")
            return pyggplot.Plot(df).add_scatter("X", "Y")

        png_path = "out/test.png"

        # First run.
        ppg.PlotJob(png_path, calc, plot)
        ppg.run_pipegraph()
        self.assertTrue(magic(png_path).find("PNG image") != -1)
        self.assertEqual(read("out/calc"), "A")
        self.assertEqual(read("out/plot"), "B")

        # Second run on a new graph with the modified calc function.
        ppg.new_pipegraph(rc_gen(), quiet=True)
        ppg.PlotJob(png_path, calc2, plot)
        ppg.run_pipegraph()
        self.assertTrue(magic(png_path).find("PNG image") != -1)
        self.assertEqual(read("out/calc"), "AA")
        self.assertEqual(read("out/plot"), "BB")
Exemplo n.º 9
0
    def test_no_rerun_if_ignore_code_changes_and_plot_changes(self):
        """With ignore_code_changes(), a modified plot function triggers no rerun.

        "out/calc" and "out/plot" get one letter appended per execution and
        must still hold a single letter after the second run.
        """
        import pydataframe

        def calc():
            append("out/calc", "A")
            return pydataframe.DataFrame(
                {"X": list(range(0, 100)), "Y": list(range(50, 150))}
            )

        def plot(df):
            append("out/plot", "B")
            return pyggplot.Plot(df).add_scatter("X", "Y")

        def plot2(df):
            # Same as plot() but with the axes swapped.
            append("out/plot", "B")
            return pyggplot.Plot(df).add_scatter("Y", "X")

        png_path = "out/test.png"

        # First run.
        ppg.PlotJob(png_path, calc, plot)
        ppg.run_pipegraph()
        self.assertTrue(magic(png_path).find("PNG image") != -1)
        self.assertEqual(read("out/calc"), "A")
        self.assertEqual(read("out/plot"), "B")

        # Second run: changed plot function, code changes ignored.
        ppg.new_pipegraph(rc_gen(), quiet=True)
        plot_job = ppg.PlotJob(png_path, calc, plot2)
        plot_job.ignore_code_changes()
        ppg.run_pipegraph()
        self.assertTrue(magic(png_path).find("PNG image") != -1)
        self.assertEqual(read("out/calc"), "A")
        self.assertEqual(read("out/plot"), "B")
Exemplo n.º 10
0
        def test_plotjob_fails(self):
            """A CombinedPlotJob fails indirectly when one of its PlotJobs fails.

            The first PlotJob's calc returns None; the run is expected to raise
            ppg.RuntimeError, with the error recorded on ``p1.cache_job`` and
            the combined job marked with error_reason "Indirect".
            """
            import pathlib

            def calc():
                return None

            def calc2():
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150)), "w": "B"}
                )

            def plot(df):
                return pyggplot.Plot(df).add_scatter("X", "Y")

            p1 = ppg.PlotJob("out/A.png", calc, plot)
            p2 = ppg.PlotJob("out/B.png", calc2, plot)

            combined_path = pathlib.Path("out/C.png")
            pc = ppg.CombinedPlotJob(combined_path, [p1, p2], {"facet": "w"})

            # Re-declaring out/C.png with other arguments is expected to raise
            # ValueError (presumably a redefinition check -- TODO confirm).
            with pytest.raises(ValueError):
                ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1, p2], [])
            with pytest.raises(ValueError):
                ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1], {"facet": "w"})

            # Different output files are accepted.
            ppg.CombinedPlotJob(pathlib.Path("out/D.png"), [p1, p2], [])
            ppg.CombinedPlotJob(pathlib.Path("out/E.png"), [p1, p2], {"facet": "w"})

            with pytest.raises(ppg.RuntimeError):
                ppg.run_pipegraph()
            assert "did not return a" in str(p1.cache_job.exception)
            assert pc.error_reason == "Indirect"
Exemplo n.º 11
0
        def test_basic(self):
            """A PlotJob writes its image, a .tsv and a cache entry; extra plots only the image."""

            def calc():
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def plot(df):
                return pyggplot.Plot(df).add_scatter("X", "Y")

            def plot2(df):
                p = pyggplot.Plot(df).add_scatter("Y", "X")
                p.width = 5
                p.height = 2
                return p

            png_path = "out/test.png"
            plot_job = ppg.PlotJob(png_path, calc, plot)
            plot_job.add_fiddle(lambda p: p.scale_x_log10())
            plot_job.add_another_plot("out/test2.png", plot2)
            ppg.run_pipegraph()

            # Primary plot: image, table dump and cache file.
            assert magic(png_path).find(b"PNG image") != -1
            assert os.path.exists(png_path + ".tsv")
            assert os.path.exists("cache/out/test.png")
            # Secondary plot: image only, nothing of its own in the cache.
            assert os.path.exists("out/test2.png")
            assert not os.path.exists("cache/out/test2.png")
            assert not os.path.exists("cache/out/test2.png.tsv")
Exemplo n.º 12
0
    def test_job_creation_after_pipegraph_run_raises(self):
        """Creating a job on a pipegraph that has already run must raise ValueError."""
        ppg.new_pipegraph(quiet=True, dump_graph=False)
        ppg.run_pipegraph()

        def create_job_too_late():
            ppg.FileGeneratingJob("A", lambda: None)

        assertRaises(ValueError, create_job_too_late)
Exemplo n.º 13
0
    def test_raises_on_non_dependend_job_injection2_can_be_ignored(self):
        """With check_for_dependency_injections=False an unattached injected job is tolerated.

        ``generate_deps`` creates dlA and dlB but attaches only dlA to the
        file job; the run is expected to succeed and write "out/A".
        """
        holder = Dummy()
        out_path = "out/A"

        def do_write():
            # Only A is used: B's loading job is not in the dependency chain.
            write(out_path, holder.A)

        job = ppg.FileGeneratingJob(out_path, do_write)
        ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

        def generate_deps():
            def load_a():
                return "A"

            def load_b():
                return "B"

            dl_a = ppg.AttributeLoadingJob("dlA", holder, "A", load_a)
            # dlB is created but never attached to `job`.
            ppg.AttributeLoadingJob("dlB", holder, "B", load_b)
            job.depends_on(dl_a)

        gen_job = ppg.DependencyInjectionJob(
            "C", generate_deps, check_for_dependency_injections=False
        )
        job.depends_on(gen_job)
        ppg.run_pipegraph()

        # The injection check is disabled, so the file job ran.
        assert os.path.exists(out_path)
Exemplo n.º 14
0
 def test_jobs_concurrent_jobs_run_concurrently(self):
     """Two single-core jobs on a two-core coordinator must overlap in time.

     Concurrency is inferred from the recorded start/stop times: the job that
     started first must stop only after the other one has started.

     Raises:
         ValueError: if one of the jobs has no recorded start time.
     """
     ppg.new_pipegraph(
         ppg.resource_coordinators.LocalSystem(max_cores_to_use=2),
         quiet=True,
         dump_graph=False,
     )
     jobA = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
     jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
     jobA.cores_needed = 1
     jobB.cores_needed = 1
     ppg.run_pipegraph()
     assert read("out/A") == "A"
     assert read("out/B") == "B"
     # Validate the timestamps *before* ordering them: on Python 3 comparing
     # None with a number raises TypeError, which would hide the real cause.
     if jobA.start_time is None:
         raise ValueError("JobA did not run")
     if jobB.start_time is None:
         raise ValueError("JobB did not run")
     if jobA.start_time < jobB.start_time:
         first_job, second_job = jobA, jobB
     else:
         first_job, second_job = jobB, jobA
     print(
         "times",
         first_job.start_time,
         first_job.stop_time,
         second_job.start_time,
         second_job.stop_time,
     )
     assert first_job.stop_time > second_job.start_time
Exemplo n.º 15
0
 def test_tempfile_not_run_on_prune(self):
     """A temp-file job whose only dependent is pruned must leave no file behind."""
     temp_job = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
     consumer = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
     consumer.depends_on(temp_job)
     consumer.prune()
     ppg.run_pipegraph()
     # Neither the pruned consumer nor its temp-file input exists afterwards.
     assert not Path("B").exists()
     assert not Path("A").exists()
Exemplo n.º 16
0
    def test_run_may_be_called_only_once(self):
        """Running the same pipegraph a second time must raise ValueError."""

        def run_again():
            ppg.run_pipegraph()

        ppg.new_pipegraph(quiet=True, dump_graph=False)
        ppg.run_pipegraph()
        assertRaises(ValueError, run_again)
Exemplo n.º 17
0
 def test_pruning_final_jobs_directly(self):
     """A pruned FinalJob must not run, while the regular jobs still do."""
     ppg.FileGeneratingJob("A", lambda: write("A", "A"))
     ppg.FileGeneratingJob("B", lambda: write("B", "B"))
     final_job = ppg.FinalJob("shu", lambda: write("C", "C"))
     final_job.prune()
     ppg.run_pipegraph()
     for name in ("A", "B"):
         assert Path(name).read_text() == name
     # The final job would have written C.
     assert not Path("C").exists()
Exemplo n.º 18
0
 def test_pdf(self):
     """A PlotJob with a .pdf target must produce a file identified as a PDF document."""
     import pydataframe

     def calc():
         return pydataframe.DataFrame(
             {"X": list(range(0, 100)), "Y": list(range(50, 150))}
         )

     def plot(df):
         return pyggplot.Plot(df).add_scatter("X", "Y")

     pdf_path = "out/test.pdf"
     ppg.PlotJob(pdf_path, calc, plot)
     ppg.run_pipegraph()
     self.assertTrue(magic(pdf_path).find("PDF document") != -1)
Exemplo n.º 19
0
    def test_can_not_run_twice(self):
        """A second run_pipegraph() on the same graph must raise a descriptive ValueError."""
        ppg.new_pipegraph(dump_graph=False)
        ppg.run_pipegraph()
        try:
            ppg.run_pipegraph()
        except ValueError as e:
            print(e)
            assert "Each pipegraph may be run only once." in str(e)
        else:
            # The second run went through without the expected exception.
            assert False
Exemplo n.º 20
0
 def test_basic(self):
     """A PlotJob with a .png target must produce a file identified as a PNG image."""
     ppg.new_pipegraph(rc_gen(), quiet=False)
     import pydataframe

     def calc():
         return pydataframe.DataFrame(
             {"X": list(range(0, 100)), "Y": list(range(50, 150))}
         )

     def plot(df):
         return pyggplot.Plot(df).add_scatter("X", "Y")

     png_path = "out/test.png"
     ppg.PlotJob(png_path, calc, plot)
     ppg.run_pipegraph()
     self.assertTrue(magic(png_path).find("PNG image") != -1)
Exemplo n.º 21
0
    def test_ignored_if_generating_within_filegenerating(self):
        """Creating a job inside a running FileGeneratingJob must not break the run."""
        downstream = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "aa"))

        def load():
            # Job creation from within a running file-generating job.
            ppg.FileGeneratingJob("out/B", lambda: write("out/B", "aa"))
            write("out/C", "c")

        upstream = ppg.FileGeneratingJob("out/C", load)
        downstream.depends_on(upstream)
        ppg.run_pipegraph()
        assert read("out/C") == "c"
Exemplo n.º 22
0
    def test_basic(self):
        """Jobs created by a JobGeneratingJob are executed within the same run."""

        def gen():
            ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
            ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
            ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))

        ppg.JobGeneratingJob("genjob", gen)
        ppg.run_pipegraph()
        # Each generated job wrote its own letter.
        for letter in ("A", "B", "C"):
            assert read("out/" + letter) == letter
Exemplo n.º 23
0
        def a():
            """Build and run a graph where a generated job depends on a job that depends on the generator."""
            file_job_b = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))

            def genA():
                # The generated job C depends on B, which itself depends on
                # the generating job A.
                file_job_c = ppg.FileGeneratingJob(
                    "out/C", lambda: write("out/C", "C")
                )
                file_job_c.depends_on(file_job_b)

            generator = ppg.JobGeneratingJob("A", genA)
            file_job_b.depends_on(generator)
            ppg.run_pipegraph()
            assert read("out/B") == "B"
            assert read("out/C") == "C"
Exemplo n.º 24
0
 def test_tempfile_still_run_if_needed_for_other(self):
     """A temp-file job still runs if a non-pruned job needs it, and is cleaned up after."""
     temp_job = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
     pruned = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
     kept = ppg.FileGeneratingJob("C", lambda: write("C", "C" + read("A")))
     for consumer in (pruned, kept):
         consumer.depends_on(temp_job)
     pruned.prune()
     ppg.run_pipegraph()
     assert not Path("B").exists()
     # C was built from the temp file's content ...
     assert Path("C").exists()
     assert Path("C").read_text() == "CA"
     # ... and the temp file itself is gone afterwards.
     assert not Path("A").exists()
Exemplo n.º 25
0
    def test_raises_if_generating_within_dataload(self):
        """Creating a job inside a DataLoadingJob must fail the run with a clear message."""
        ppg.util.global_pipegraph.quiet = False
        consumer = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "aa"))

        def load():
            # Job creation from within a running data-loading job.
            ppg.FileGeneratingJob("out/B", lambda: write("out/B", "aa"))

        loader = ppg.DataLoadingJob("load_data", load)
        consumer.depends_on(loader)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        assert "Trying to add new jobs to running pipeline" in str(loader.exception)
Exemplo n.º 26
0
    def test_generated_job_depending_on_each_other(self):
        """Two jobs created by the same JobGeneratingJob may depend on each other.

        Generator A creates file job B and file job C, where C reads B's
        output and therefore depends on B.
        """

        def gen():
            producer = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
            consumer = ppg.FileGeneratingJob(
                "out/C", lambda: write("out/C", read("out/B"))
            )
            consumer.depends_on(producer)

        ppg.JobGeneratingJob("A", gen)
        ppg.run_pipegraph()
        assert read("out/B") == "B"
        # C copied B's content.
        assert read("out/C") == "B"
Exemplo n.º 27
0
 def test_non_default_status_filename(self):
     """A custom invariant_status_filename is used instead of the default one."""
     status_file = "shu.dat"
     try:
         # Start from a clean slate for both the custom and the default file.
         forget_job_status(status_file)
         forget_job_status()
         ppg.new_pipegraph(
             quiet=True, invariant_status_filename=status_file, dump_graph=False
         )
         ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
         ppg.run_pipegraph()
         assert os.path.exists(status_file)
         default_file = ppg.graph.invariant_status_filename_default
         assert not (os.path.exists(default_file))
     finally:
         forget_job_status(status_file)
Exemplo n.º 28
0
        def test_pdf(self):
            """A PlotJob with a .pdf target must produce a file identified as a PDF document."""

            def calc():
                columns = {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                return pd.DataFrame(columns)

            def plot(df):
                return pyggplot.Plot(df).add_scatter("X", "Y")

            pdf_path = "out/test.pdf"
            ppg.PlotJob(pdf_path, calc, plot)
            ppg.run_pipegraph()
            assert magic(pdf_path).find(b"PDF document") != -1
Exemplo n.º 29
0
    def test_can_not_add_jobs_after_run(self):
        """Adding a job to an already-run pipegraph must raise a descriptive ValueError."""
        ppg.new_pipegraph(dump_graph=False)
        ppg.run_pipegraph()
        expected_message = (
            "This pipegraph was already run. You need to create a new one for more jobs"
        )
        try:
            ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        except ValueError as e:
            print(e)
            assert expected_message in str(e)
        else:
            # The job was accepted without the expected exception.
            assert False
Exemplo n.º 30
0
    def test_injecting_multiple_stages(self):
        """Job generators may be nested: A creates B, B creates C, C creates the file job."""

        def gen_c():
            ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

        def gen_b():
            ppg.JobGeneratingJob("C", gen_c)

        def gen_a():
            ppg.JobGeneratingJob("B", gen_b)

        ppg.JobGeneratingJob("A", gen_a)
        ppg.run_pipegraph()
        # The innermost generated job wrote its file.
        assert read("out/D") == "D"
Exemplo n.º 31
0
        def test_complete(self):
            """CombinedPlotJob accepts valid argument combinations and rejects invalid ones.

            Three combined plots (C, D, E) are declared from two PlotJobs; a
            series of malformed declarations must raise, and after the run all
            three outputs must be PNG images.
            """
            import pathlib

            def calc():
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150)), "w": "A"}
                )

            def calc2():
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150)), "w": "B"}
                )

            def plot(df):
                return pyggplot.Plot(df).add_scatter("X", "Y")

            p1 = ppg.PlotJob("out/A.png", calc, plot)
            p2 = ppg.PlotJob("out/B.png", calc2, plot)
            both = [p1, p2]

            # Valid declarations.
            ppg.CombinedPlotJob(pathlib.Path("out/C.png"), both, ["w"])
            ppg.CombinedPlotJob(pathlib.Path("out/D.png"), both, [])
            ppg.CombinedPlotJob(
                pathlib.Path("out/E.png"),
                both,
                {"facets": "w"},
                fiddle=lambda p: p.scale_x_log10(),
            )

            # Invalid declarations; the reason for each rejection is decided
            # inside CombinedPlotJob and is not visible in this test.
            with pytest.raises(ValueError):
                ppg.CombinedPlotJob(pathlib.Path("out/C.png"), [p1, p2], "w")
            with pytest.raises(TypeError):
                ppg.CombinedPlotJob(5, [p1, p2], "w")
            with pytest.raises(ValueError):
                ppg.CombinedPlotJob("out/D.something", [p1, p2], "w")
            with pytest.raises(ValueError):
                ppg.CombinedPlotJob("out/D.png", [], "w")
            with pytest.raises(ValueError):
                ppg.CombinedPlotJob("out/D.png", [p1, p2.job_id], "w")

            ppg.run_pipegraph()
            for png in ("out/C.png", "out/D.png", "out/E.png"):
                assert magic(png).find(b"PNG image") != -1
Exemplo n.º 32
0
    def test_smooth(self, new_pipegraph_no_qc):
        """Render a two-lane ChIP-seq heatmap with SmoothExtendedReads and compare the image."""
        genome = get_human_22_fake_genome()

        # Three regions on chr22: two centred windows and one 1-bp interval.
        region_rows = [
            {
                "chr": "chr22",
                "start": 36925 * 1000 - 1000,
                "stop": 36925 * 1000 + 1000,
            },
            {
                "chr": "chr22",
                "start": 31485 * 1000 - 2000,
                "stop": 31485 * 1000 + 2000,
            },
            {"chr": "chr22", "start": 41842 * 1000, "stop": (41842 * 1000) + 1},
        ]
        df = pd.DataFrame(region_rows)
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        def make_lane(name):
            # Both lanes are built from the same sample BAM file.
            return mbf_align.lanes.AlignedSample(
                name,
                mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
                genome,
                False,
                None,
            )

        lane1 = make_lane("one")
        lane2 = make_lane("two")

        heatmap = mbf_heatmap.chipseq.Heatmap(
            plot_regions,
            [lane1, lane2],
            region_strategy=regions.RegionFromCenter(1000),
            smoothing_strategy=smooth.SmoothExtendedReads(),
        )
        image_name = "test.png"
        heatmap.plot(image_name, norm.AsIs(), order.FirstLaneSum())
        ppg.run_pipegraph()
        assert_image_equal(image_name)
def test_write_trim_predefines(tmpdir, scouter):
    """Check the predefined-sequences table written by the scouter.

    The table is read back twice: once for ``scouter.assert_predefined`` and
    once indexed by "Name" to inspect the duplicate bookkeeping columns of
    the entries "1>A_test3" and "1>A_test4".
    """
    scouter.write_predefined_sequences()
    outputfile = scouter.result_dir / "predefined_sequences.tsv"
    ppg.run_pipegraph()

    df = pd.read_csv(outputfile, sep="\t")
    full_sequences = df["Full Sequence"].values
    sequences = df["Sequence"].values
    scouter.assert_predefined(full_sequences, sequences)
    assert outputfile.exists()

    df_new = pd.read_csv(outputfile, sep="\t")
    df_new.index = df_new["Name"]
    print(df.head())

    first, second = "1>A_test3", "1>A_test4"
    # Both entries are flagged as duplicates ...
    assert df_new.loc[first]["Duplicate"]
    assert df_new.loc[second]["Duplicate"]
    # ... but only the first one is marked as deduplicated.
    assert df_new.loc[first]["Deduplicated"]
    assert not df_new.loc[second]["Deduplicated"]
    assert df_new.loc[first]["Duplicate Entries"] == "1>A_test3;1>A_test4"
Exemplo n.º 34
0
    def test_anno_returning_series(self):
        """An annotator whose calc() returns a pd.Series must fill its single column."""

        def load():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load)

        class SeriesAnno(Annotator):
            columns = ["C"]

            def calc(self, df):
                # One running number per row, returned as a Series.
                return pd.Series(list(range(len(df))))

        ddf += SeriesAnno()
        # A dummy job depending on the annotation job forces it to run.
        ppg.JobGeneratingJob("shu", lambda: 55).depends_on(ddf.annotate())
        ppg.run_pipegraph()
        assert (ddf.df["C"] == [0, 1, 2]).all()
Exemplo n.º 35
0
    def test_raises_if_calc_returns_non_df(self):
        """A PlotJob whose calc returns None must fail with a JobContractError on its cache job."""

        def calc():
            return None

        def plot(df):
            append("out/plot", "B")
            return pyggplot.Plot(df).add_scatter("X", "Y")

        png_path = "out/test.png"
        job = ppg.PlotJob(png_path, calc, plot)
        try:
            ppg.run_pipegraph()
        except ppg.RuntimeError:
            # Expected: the run as a whole fails.
            pass
        else:
            raise ValueError("should not be reached")
        self.assertTrue(
            isinstance(job.cache_job.exception, ppg.JobContractError))
Exemplo n.º 36
0
    def test_simple(self):
        """A CachedDataLoadingJob hands the calculated value to store() for downstream jobs."""
        holder = Dummy()

        def calc():
            return ", ".join(str(x) for x in range(0, 100))

        def store(value):
            holder.a = value

        cached_job = ppg.CachedDataLoadingJob("out/mycalc", calc, store)
        out_path = "out/A"

        def do_write():
            # Writes whatever store() placed on the holder.
            write(out_path, holder.a)

        writer = ppg.FileGeneratingJob(out_path, do_write)
        writer.depends_on(cached_job)
        ppg.run_pipegraph()
        expected = ", ".join(str(x) for x in range(0, 100))
        assert read(out_path) == expected
Exemplo n.º 37
0
    def test_lane_with_job_generating_fastq(self):
        """A Sample can take a FileGeneratingJob as input; the saved result is a 4-line gzip."""

        def gen_fastq(fn):
            # One FASTQ record, four lines, no trailing newline.
            with open(fn, "wb") as op:
                op.write(b"@shu\nAGTC\n+\nasdf")

        fastq_job = FileGeneratingJob("input.fastq", gen_fastq)
        lane = Sample("Sample_a", fastq_job, False, vid="VA000")
        assert lane.vid == "VA000"

        temp_job = lane.prepare_input()
        assert fastq_job in temp_job.prerequisites
        real_job = lane.save_input()
        ppg.run_pipegraph()

        temp_output = Path(temp_job.filenames[0])
        saved_output = Path(real_job.filenames[0])
        assert not temp_output.exists()
        assert saved_output.exists()
        with gzip.GzipFile(real_job.filenames[0], "r") as op:
            lines = op.readlines()
        assert len(lines) == 4
Exemplo n.º 38
0
    def test_basic(self, new_pipegraph):
        """A DependencyInjectionJob adds two AttributeLoadingJobs to a file job.

        ``generate_deps`` creates dlA and dlB, attaches both to the file job
        and returns them; the file job then writes ``o.A + o.B`` and the
        output is expected to read "AB".
        """
        # TODO: there is a problem with this approach. The AttributeLoadingJob
        # references different objects, since it gets pickled alongside the method
        # and depickled again, and then it's not the same object anymore,
        # so the FileGeneratingJob and the AttributeLoadingJob in this test
        # reference different objects.
        # I'm not sure how to handle this right now though.

        # I have an idea: do JobGraphModifyingJobs in each slave, and send back just the
        # dependency data (and new job name).
        # That way, we can still execute on any slave, and all the pointers should be
        # right.
        new_pipegraph.new_pipegraph()

        o = Dummy()
        of = "out/A"

        def do_write():
            # logging.info("Accessing dummy (o) %i in pid %s" % (id(o), os.getpid()))
            write(of, o.A + o.B)

        job = ppg.FileGeneratingJob(of, do_write)

        def generate_deps():
            def load_a():
                # logging.info('executing load A')
                return "A"

            def load_b():
                # logging.info('executing load B')
                return "B"

            # logging.info("Creating dl on %i in pid %s" % (id(o), os.getpid()))
            dlA = ppg.AttributeLoadingJob("dlA", o, "A", load_a)
            # logging.info("created dlA")
            dlB = ppg.AttributeLoadingJob("dlB", o, "B", load_b)
            # Both loading jobs are attached to the file job here ...
            job.depends_on(dlA)
            job.depends_on(dlB)
            # ... and also returned to the injection job.
            return [dlA, dlB]

        gen_job = ppg.DependencyInjectionJob("C", generate_deps)
        job.depends_on(gen_job)
        ppg.run_pipegraph()
        assert read(of) == "AB"
Exemplo n.º 39
0
    def test_two_differenct_annotators_with_identical_column_names_raise_on_creation(
        self
    ):
        """Adding annotators must not evaluate their ``columns`` property.

        ``DA.columns`` raises ValueError on access, and ``columns_called``
        must still be False after three ``+=`` additions. The test also
        asserts that constructing ``DA("DA-2")`` again yields the very same
        object (presumably Annotator caches instances -- TODO confirm).
        NOTE(review): despite the test name, the expected-failure check
        around run_pipegraph() is commented out below.
        """
        a = DummyAnnotatable("A")
        # Mutable flag so the nested classes can record a `columns` access.
        columns_called = [False]

        class DA(Annotator):
            def __init__(self, prefix):
                self.prefix = prefix
                self.cache_name = prefix

            @property
            def columns(self):
                # Any access fails immediately; the two lines below the raise
                # are never executed.
                raise ValueError()
                columns_called[0] = True
                return ["%s-A" % self.prefix]

            def calc(self, df):
                ll = len(df)
                return pd.DataFrame({"DA1-A": [0] * ll})

        # NOTE(review): DA2 is defined but not used anywhere in this test.
        class DA2(Annotator):
            cache_name = "DA2"

            def __init__(self, prefix):
                self.prefix = prefix

            @property
            def columns(self):
                columns_called[0] = True
                return ["%s-A" % self.prefix]

            def annotate(self, df):
                ll = len(df)
                return pd.DataFrame({"DA1-A": [0] * ll})

        a += DA("DA-1")
        d = DA("DA-2")
        a += d  # still ok.
        a += d  # still ok: adding the same annotator object a second time.
        assert DA("DA-2") is d
        assert columns_called[0] is False
        # with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
Exemplo n.º 40
0
def test_logging(new_pipegraph):
    """Messages of the 'pypipegraph' logger reach a file handler attached to it.

    A FileHandler writing to 'ppg.log' is attached to the 'pypipegraph'
    logger, a single job is run at DEBUG level, and the log file must then
    contain INFO and DEBUG records of that logger but not the message sent
    to the root logger.
    """
    import logging

    my_logger = logging.getLogger('pypipegraph')
    h = logging.FileHandler(filename='ppg.log', mode='w')
    my_logger.addHandler(h)
    try:
        # Goes to the root logger only, so it must not show up in ppg.log.
        logging.getLogger().warning("Should not be in the log.")
        my_logger.setLevel(logging.DEBUG)
        f = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        h.setFormatter(f)
        ppg.FileGeneratingJob('out/A', lambda: write('out/A', 'A'))
        ppg.run_pipegraph()
    finally:
        my_logger.removeHandler(h)
        # Release the file handle; removeHandler() alone leaves it open.
        h.close()
    assert os.path.exists('ppg.log')
    d = read('ppg.log')
    assert 'Should not be in the log.\n' not in d
    assert 'pypipegraph - INFO' in d
    assert 'pypipegraph - DEBUG' in d
Exemplo n.º 41
0
    def test_injecting_filegenerating_job(self):
        """A DependencyInjectionJob may inject a FileGeneratingJob as a new prerequisite.

        The outer job copies "out/B" to "out/A"; the job producing "out/B" is
        created and attached only while the injection job runs.
        """
        out_path = "out/A"

        def do_write():
            write(out_path, read("out/B"))

        outer_job = ppg.FileGeneratingJob(out_path, do_write)

        def generate_dep():
            def write_B():
                write("out/B", "B")

            injected = ppg.FileGeneratingJob("out/B", write_B)
            outer_job.depends_on(injected)

        injector = ppg.DependencyInjectionJob("gen_job", generate_dep)
        outer_job.depends_on(injector)
        ppg.run_pipegraph()
        assert read("out/A") == "B"
def test_count_fastq_trimmed(tmpdir, raw_lane, scouter):
    """Compare the trimmed-read count table written by ``scouter`` to known counts.

    Every row of the resulting TSV must carry the count listed for its
    sequence below; a sequence that is not listed raises ``KeyError``.
    """
    scouter.write_fastq_count_trimmed(raw_lane)
    ppg.run_pipegraph()
    result_dir = scouter.result_dir
    table_path = result_dir / f"{raw_lane.name}_{scouter.name}_all_reads_trimmed.tsv"
    counts = pd.read_csv(table_path, sep="\t")
    # (sequence, expected count) pairs
    expected = dict(
        [
            (
                "TTGCTTTACCTCCTTTTAGTTGGCCTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTGCA",
                2,
            ),
            (
                "TTGCTTTACCTCCTTTTAGTTGGCCTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTGC",
                2,
            ),
            (
                "TTGCTTTACCTCCTTTTAGTTGGCCTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTG",
                1,
            ),
            (
                "TTGCTTTACCTCCTTTTAGCCTCTTTTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGTCTGCA",
                2,
            ),
            (
                "AGGAATCGCTTTACCTCCTTTTAGTTGAAATTGCCCCGGCCCCGGTCCCTTGCCAAAATGTCTTGTTTAGCCCCGGGTGCTCCTGTCGGGTCTTGACTGATTCACACTTGATATTCTTGTCTTCTGGTTCTTGCTCTGATGAGCACACGT",
                1,
            ),
        ]
    )
    for _, record in counts.iterrows():
        wanted = expected[record["Sequence"]]
        assert wanted == record["Count"]
Exemplo n.º 43
0
    def test_cached_jobs_get_depencies_only_on_the_lazy_filegenerator_not_on_the_loading_job(
            self):
        """``depends_on`` on a cached attribute job lands on its ``lfg`` job.

        After ``job.depends_on(jobB)`` the dependency must be listed in
        ``job.lfg.prerequisites`` and not in ``job.prerequisites``; both jobs
        report ``was_invalidated`` once the graph has run.
        """
        holder = Dummy()

        def compute_a():
            # Reads the attribute that the "b" job sets on ``holder``.
            return list(range(0, holder.b))

        def compute_b():
            return 100

        cached_job = ppg.CachedAttributeLoadingJob("a", holder, "a", compute_a)
        loading_job = ppg.AttributeLoadingJob("b", holder, "b", compute_b)
        cached_job.depends_on(loading_job)
        assert loading_job not in cached_job.prerequisites
        assert loading_job in cached_job.lfg.prerequisites
        ppg.run_pipegraph()
        assert loading_job.was_invalidated
        assert cached_job.was_invalidated
Exemplo n.º 44
0
    def test_anno_not_returning_enough_rows_and_no_index(self):
        """An annotator returning one row for a three-row frame fails the run.

        The message is read from the exception of the job obtained by calling
        ``anno_jobs["X"]``.
        """

        class BrokenAnno(Annotator):
            columns = ["X"]

            def calc(self, df):
                # Always a single row, whatever the length of ``df``.
                return pd.DataFrame({"X": [1]})

        def load_frame():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load_frame)
        ddf += BrokenAnno()
        anno_job = ddf.anno_jobs["X"]
        # Hang a trivial job off whatever ``annotate()`` returns.
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(anno_job().exception)
        assert "Length and index mismatch " in message
Exemplo n.º 45
0
        def test_raises_if_plot_returns_non_plot(self):
            """A PlotJob whose plot function returns ``None`` must fail the run.

            The run is expected to raise ``ppg.RuntimeError`` and the job's
            recorded exception must be a ``ppg.JobContractError``.
            """

            def calc():
                return pd.DataFrame({
                    "X": list(range(0, 100)),
                    "Y": list(range(50, 150))
                })

            def plot(df):
                # Deliberately not a plot object.
                return None

            of = "out/test.png"
            job = ppg.PlotJob(of, calc, plot)
            # pytest.raises fails the test if the run unexpectedly succeeds.
            with pytest.raises(ppg.RuntimeError):
                ppg.run_pipegraph()
            assert isinstance(job.exception, ppg.JobContractError)
Exemplo n.º 46
0
    def test_all_transcripts(self, mock_download, shared_prebuild):
        """Check the transcript table of ``EnsemblGenome("Ustilago_maydis", 33, ...)``.

        Verifies the row count, the categorical dtype of ``chr`` and
        ``biotype``, and every checked field of transcript ``KIS71021``.
        """
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        transcripts = genome.df_transcripts
        assert "gene_stable_id" in transcripts.columns
        assert len(transcripts) == 6928 + 4  # from the a2 locus
        for categorical in ("chr", "biotype"):
            assert transcripts[categorical].dtype.name == "category"

        record = transcripts.loc["KIS71021"]
        expected_fields = {
            "chr": "2",
            "strand": 1,
            "start": 354_742,
            "stop": 356_690,
            "gene_stable_id": "UMAG_12118",
            "biotype": "protein_coding",
            "exons": ((354_742, 354_936), (355_222, 356_690)),
            "exon_stable_ids": ("KIS71021-1", "KIS71021-2"),
        }
        for field, value in expected_fields.items():
            assert getattr(record, field) == value
Exemplo n.º 47
0
    def test_annotator_raising(self):
        """An exception raised in ``calc`` is kept on the annotation job's ``lfg``."""

        def load_frame():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        ddf = DelayedDataFrame("shu", load_frame)

        class RaiseAnno(Annotator):
            columns = ["aa"]
            cache_name = "empty"

            def calc(self, df):
                raise ValueError("hello")

        raising_anno = RaiseAnno()
        ddf += raising_anno
        force_load(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        # The job is looked up under the annotator's cache name.
        cache_key = RaiseAnno().get_cache_name()
        failed_job = ddf.anno_jobs[cache_key]
        assert "hello" in str(failed_job.lfg.exception)
Exemplo n.º 48
0
 def test_multi_level(self):
     """Annotators added on two nested filter levels both reach the final frame.

     The first filter is given ``Constant("C", 4)`` and selects on column
     ``C``; the second keeps rows with ``A >= 2``.  Each level gets its own
     ``LenAnno``.
     """

     def load_frame():
         frame = pd.DataFrame(
             {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
         )
         return frame.set_index("idx")

     parent = DelayedDataFrame("shu", load_frame)
     child = parent.filter("sha", lambda df: df["C"] == 4, Constant("C", 4))
     child += LenAnno("count")
     grandchild = child.filter("shc", lambda df: df["A"] >= 2)
     grandchild += LenAnno("count2")
     grandchild.write()
     ppg.run_pipegraph()
     assert len(grandchild.df) == 2
     assert (grandchild.df["A"] == [2, 3]).all()
     for column, value in (("count", "count3"), ("count2", "count22")):
         assert (grandchild.df[column] == value).all()
Exemplo n.º 49
0
    def test_cached_dataloading_job_does_not_load_its_preqs_on_cached(
        self, new_pipegraph
    ):
        """A cached value is reloaded without re-running its prerequisite.

        The same three-job graph is built and run twice; only ``out/D`` is
        deleted in between.  Every callback appends to a marker file, so the
        file contents show how often each one ran.
        """
        holder = Dummy()

        def load_a():
            holder.a = "A"
            append("out/A", "A")

        def compute():
            append("out/B", "B")
            return holder.a * 2

        def store(value):
            holder.c = value
            append("out/Cx", "C")  # not C, that's the cached file, you know...

        def dump():
            write("out/D", holder.c)

        def build_graph():
            # Identical job definitions for both runs.
            loader = ppg.DataLoadingJob("out/A", load_a)
            cached = ppg.CachedDataLoadingJob("out/C", compute, store)
            final = ppg.FileGeneratingJob("out/D", dump)
            final.depends_on(cached)
            cached.depends_on(loader)

        def check_files(expected):
            for path, content in expected.items():
                assert read(path) == content

        build_graph()
        ppg.run_pipegraph()
        # First run: final result written, every callback ran once.
        check_files({"out/D": "AA", "out/A": "A", "out/B": "B", "out/Cx": "C"})

        os.unlink("out/D")  # so the filegen and the loadjob of cached should rerun...
        new_pipegraph.new_pipegraph()
        build_graph()
        ppg.run_pipegraph()
        # Second run: out/A and out/B unchanged, only the load marker grew.
        check_files({"out/D": "AA", "out/A": "A", "out/B": "B", "out/Cx": "CC"})
Exemplo n.º 50
0
    def test_protein_creation(self):
        """Compare the generated protein fasta with the reference pep fasta.

        The genome is built without a protein fasta (``None``) and with
        ``ProkaryoticCode()``.  After the run, every reference protein must be
        present in ``pep.fasta`` with an identical sequence.  On a mismatch
        the differing entries are printed before the test fails.
        """
        g = FileBasedGenome(
            "Candidatus_carsonella",
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            ),
            None,
            ProkaryoticCode(),
        )
        g.download_genome()
        g.job_transcripts()
        ppg.run_pipegraph()

        should = dict(
            iter_fasta(
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
                )))
        # Keep only the identifier before the first space.  ``split`` leaves a
        # header without a space intact; slicing with ``find`` (which returns
        # -1 on a miss) would silently drop its last byte.
        should = {k.split(b" ", 1)[0]: v for (k, v) in should.items()}
        actual = dict(iter_fasta(g.find_file("pep.fasta")))
        if actual != should:
            # they are all here, we just have more (tRNA...)
            assert not set(should.keys()).difference(set(actual.keys()))
            for k in should:
                if actual[k] != should[k]:
                    print(k)
                    print(len(actual[k]))
                    print(len(should[k]))

                    print(actual[k])
                    print(should[k])
            # Explicit raise: unlike ``assert False`` it survives ``python -O``.
            raise AssertionError(
                "generated pep.fasta differs from the reference protein fasta"
            )
Exemplo n.º 51
0
    def test_being_generated(self):
        """A CachedDataLoadingJob created by a JobGeneratingJob feeds its dependant.

        Both the cached job and the file job depending on it are created
        inside the generator callback; the stored value ends up in ``out/A``.
        """
        holder = Dummy()

        def compute():
            return 55

        def remember(value):
            holder.a = value

        def write_out():
            write("out/A", str(holder.a))

        def generate():
            cached = ppg.CachedDataLoadingJob("out/B", compute, remember)
            writer = ppg.FileGeneratingJob("out/A", write_out)
            writer.depends_on(cached)

        ppg.JobGeneratingJob("out/C", generate)
        ppg.run_pipegraph()
        assert read("out/A") == "55"
Exemplo n.º 52
0
    def test_generated_job_depending_on_each_other_one_of_them_is_loading(self):
        """Generated file job may depend on a data-loading job from the same generator.

        One JobGeneratingJob creates both a DataLoadingJob that sets the
        module-level global ``shu`` and a FileGeneratingJob that writes that
        value to ``out/A``.
        """

        def generate():
            def set_global():
                global shu
                shu = "123"

            def write_global():
                # Only reads ``shu``, so no ``global`` declaration is needed.
                write("out/A", shu)

            loader = ppg.DataLoadingJob("dl", set_global)
            writer = ppg.FileGeneratingJob("out/A", write_global)
            writer.depends_on(loader)

        ppg.JobGeneratingJob("gen", generate)
        ppg.run_pipegraph()
        assert read("out/A") == "123"
Exemplo n.º 53
0
    def test_generated_job_depends_on_failing_job(self, new_pipegraph):
        """A generated job depending on a failed job is marked "Indirect".

        Job ``a`` raises ``ValueError``.  Job ``b`` generates job ``c``, which
        depends on ``a``.  The generator itself must report "no error".
        """

        def fail():
            raise ValueError()

        def generate():
            dependant = ppg.FileGeneratingJob("c", lambda: write("c", read("a")))
            dependant.depends_on(failing)
            return [dependant]

        failing = ppg.FileGeneratingJob("a", fail)
        generator = ppg.JobGeneratingJob("b", generate)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()

        assert isinstance(failing.exception, ValueError)
        assert failing.error_reason == "Exception"
        assert generator.error_reason == "no error"
        generated = ppg.util.global_pipegraph.jobs["c"]
        assert generated.error_reason == "Indirect"
Exemplo n.º 54
0
 def test_transcript_wrong_order(self):
     """Loading the ``transcript_wrong_order`` GTF fails with "start > stop"."""
     genome_fasta = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
     )
     gtf = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_wrong_order.gtf.gz"
     )
     cdna_fasta = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
     )
     protein_fasta = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
     )
     genome = FileBasedGenome(
         "Candidatus_carsonella", genome_fasta, gtf, cdna_fasta, protein_fasta
     )
     transcripts_job = genome.job_transcripts()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     message = str(transcripts_job.exception)
     assert "start > stop" in message
Exemplo n.º 55
0
    def test_anno_returning_string(self):
        """An annotator whose ``calc`` returns a plain string fails the run.

        The message is read from the ``lfg`` exception of the job obtained by
        calling ``anno_jobs["C"]``.
        """

        def load_frame():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load_frame)

        class SeriesAnno(Annotator):
            columns = ["C", "D"]

            def calc(self, df):
                # Not a DataFrame.
                return "abc"

        ddf += SeriesAnno()
        anno_job = ddf.anno_jobs["C"]
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(anno_job().lfg.exception)
        assert "result was no dataframe" in message
Exemplo n.º 56
0
    def test_DynamicColumNames(self):
        """An annotator may expose ``columns`` as a property."""

        def load_frame():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        ddf = DelayedDataFrame("shu", load_frame)

        class Dynamic(Annotator):
            @property
            def columns(self):
                return ["a"]

            def calc(self, df):
                return pd.DataFrame({"a": ["x", "y"]})

        ddf += Dynamic()
        # Lookup only; the result is discarded.
        ddf.anno_jobs[Dynamic().get_cache_name()]
        force_load(ddf.annotate())
        ppg.run_pipegraph()
        actual = ddf.df
        expected = pd.DataFrame({"A": [1, 2], "B": ["c", "d"], "a": ["x", "y"]})
        assert_frame_equal(actual, expected)
Exemplo n.º 57
0
    def test_annotator_missing_columns(self):
        """An annotator that defines no ``columns`` fails with an AttributeError.

        The error is read from the ``lfg`` exception of the job obtained by
        calling ``anno_jobs["MissingColumnNames"]``.
        """

        def load_frame():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        ddf = DelayedDataFrame("shu", load_frame)

        class MissingColumnNames(Annotator):
            # No ``columns`` attribute is defined on this class.
            cache_name = "MissingColumnNames"

            def calc(self, df):
                return pd.DataFrame({})

            def __repr__(self):
                return "MissingColumnNames()"

        ddf += MissingColumnNames()
        anno_job = ddf.anno_jobs["MissingColumnNames"]
        force_load(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        failure = repr(anno_job().lfg.exception)
        assert "AttributeError" in failure
Exemplo n.º 58
0
        def test_add_another_not_returning_plot(self):
            """A secondary plot function returning ``None`` fails its job.

            The primary plot is valid; the job returned by
            ``add_another_plot`` must record a ``ppg.JobContractError``.
            """

            def calc():
                xs = list(range(0, 100))
                ys = list(range(50, 150))
                return pd.DataFrame({"X": xs, "Y": ys})

            def plot(df):
                return pyggplot.Plot(df).add_scatter("X", "Y")

            def plot2(df):
                return None

            primary = ppg.PlotJob("out/test.png", calc, plot)
            primary.add_fiddle(lambda rendered: rendered.scale_x_log10())
            secondary = primary.add_another_plot("out/test2.png", plot2)
            with pytest.raises(ppg.RuntimeError):
                ppg.run_pipegraph()
            assert isinstance(secondary.exception, ppg.JobContractError)
Exemplo n.º 59
0
    def test_lying_about_columns(self):
        """An annotator declaring column ``C`` but returning ``D`` fails the run.

        The message is read from the exception of the job obtained by calling
        ``anno_jobs["C"]``.
        """

        def load_frame():
            frame = pd.DataFrame(
                {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
            )
            return frame.set_index("idx")

        ddf = DelayedDataFrame("shu", load_frame)

        class SeriesAnno(Annotator):
            columns = ["C"]

            def calc(self, df):
                # Returns "D" although "C" was declared.
                return pd.DataFrame({"D": [0, 1, 2]})

        ddf += SeriesAnno()
        anno_job = ddf.anno_jobs["C"]
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(anno_job().exception)
        assert "declared different " in message
Exemplo n.º 60
0
 def test_empty_gtf_and_cdna_and_protein(self):
     """A genome given only a DNA fasta yields empty annotation tables.

     ``None`` is passed for the two arguments after the fasta; the test then
     checks that ``gtf_filename`` and ``cdna_fasta_filename`` are ``None`` and
     that the transcript, GTF, gene and protein tables all have length 0
     after the run.
     """
     g = FileBasedGenome(
         "Candidatus_carsonella",
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
         ),
         None,
         None,
     )
     g.download_genome()
     assert g.gtf_filename is None
     assert g.cdna_fasta_filename is None
     # Request all three table jobs before running the graph.
     g.job_transcripts()
     g.job_genes()
     g.job_proteins()
     ppg.run_pipegraph()
     assert len(g.df_transcripts) == 0
     assert len(g.get_gtf()) == 0
     assert len(g.df_genes) == 0
     assert len(g.df_proteins) == 0