def test_name_must_be_str(self):
    """The job name must be a string and both callbacks must be callables."""
    # non-string name -> TypeError
    with pytest.raises(TypeError):
        ppg.CachedDataLoadingJob(123, lambda: 123, lambda: 5)
    # non-callable calc function -> ValueError
    with pytest.raises(ValueError):
        ppg.CachedDataLoadingJob("123", 123, lambda: 5)
    # non-callable load function -> ValueError
    with pytest.raises(ValueError):
        ppg.CachedDataLoadingJob("123", lambda: 5, 123)
def gen():
    """Create the cached loader, then a second generator that attaches a
    file-producing job depending on it."""
    cached_b = ppg.CachedDataLoadingJob("out/B", calc, store)

    def inner_gen():
        # The dump job must wait for the cached data to be loaded.
        writer = ppg.FileGeneratingJob("out/A", dump)
        writer.depends_on(cached_b)

    ppg.JobGeneratingJob("out/D", inner_gen)
def _anno_cache_and_calc(self, anno):
    """Build a CachedDataLoadingJob that computes an annotator's columns and
    merges them into ``self.ddf.df``.

    The calc callback runs the annotator (``calc_ddf`` preferred over ``calc``)
    and normalizes its result to a DataFrame; the load callback validates the
    produced columns against ``anno.columns`` and combines them with the
    existing DataFrame.

    Returns the cached loading job, wired to depend on the ddf's own load job,
    a FunctionInvariant on the annotator's calc function, any dependent
    annotators, and ``anno.deps(self.ddf)``.
    """

    def calc():
        # Run the annotator and normalize its output to a DataFrame.
        if not isinstance(anno.columns, list):
            raise ValueError("Columns was not a list")
        if hasattr(anno, "calc_ddf"):
            df = anno.calc_ddf(self.ddf)
        else:
            df = anno.calc(self.ddf.df)
        # A Series is acceptable only when exactly one column was declared.
        if isinstance(df, pd.Series) and len(anno.columns) == 1:
            df = pd.DataFrame({anno.columns[0]: df})
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                "result was no dataframe (or series and len(anno.columns) == 1)"
            )
        return df

    def load(df):
        # Validate the declared vs. produced column sets, then merge.
        s_should = set(anno.columns)
        if not len(s_should):
            raise ValueError("anno.columns was empty")
        s_actual = set(df.columns)
        if s_should != s_actual:
            # Fixed typo in error message: "actualy" -> "actually".
            raise ValueError(
                "Annotator declared different columns from those actually calculated: %s"
                % (s_should.symmetric_difference(s_actual))
            )
        if set(df.columns).intersection(self.ddf.df.columns):
            raise ValueError(
                "Annotator created columns that were already present.",
                self.ddf.name,
                anno.get_cache_name(),
                set(df.columns).intersection(self.ddf.df.columns),
            )
        self.ddf.df = _combine_annotator_df_and_old_df(df, self.ddf.df)

    (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
    job = ppg.CachedDataLoadingJob(
        self.ddf.cache_dir / anno.__class__.__name__ / anno.get_cache_name(),
        calc,
        load,
    )
    # Both the load and the calc need our ddf.df to be present.
    ppg.Job.depends_on(job, self.load())
    job.depends_on(
        self.load(),
        ppg.FunctionInvariant(
            self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
            anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
        ),
    )
    for d in anno.dep_annos():
        if d is not None:
            job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
    job.depends_on(anno.deps(self.ddf))
    # The actual computation happens in the lazy filegen part of the job.
    job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
    return job
def test_cached_dataloading_job_does_not_load_its_preqs_on_cached(
    self, new_pipegraph
):
    """Once the calc result is cached, rerunning must skip the upstream
    DataLoadingJob and the calc itself, while the load callback still runs."""
    holder = Dummy()

    def load_a():
        holder.a = "A"
        append("out/A", "A")

    def compute():
        append("out/B", "B")
        return holder.a * 2

    def store(value):
        holder.c = value
        append("out/Cx", "C")  # not C, that's the cached file, you know...

    def write_result():
        write("out/D", holder.c)

    upstream = ppg.DataLoadingJob("out/A", load_a)
    cached = ppg.CachedDataLoadingJob("out/C", compute, store)
    final = ppg.FileGeneratingJob("out/D", write_result)
    final.depends_on(cached)
    cached.depends_on(upstream)
    ppg.run_pipegraph()
    assert read("out/D") == "AA"  # we did write the final result
    assert read("out/A") == "A"  # ran the dl job
    assert read("out/B") == "B"  # ran the calc job...
    assert read("out/Cx") == "C"  # ran the load jobo

    # so the filegen and the loadjob of cached should rerun...
    os.unlink("out/D")
    new_pipegraph.new_pipegraph()
    upstream = ppg.DataLoadingJob("out/A", load_a)
    cached = ppg.CachedDataLoadingJob("out/C", compute, store)
    final = ppg.FileGeneratingJob("out/D", write_result)
    final.depends_on(cached)
    cached.depends_on(upstream)
    ppg.run_pipegraph()
    assert read("out/D") == "AA"  # we did write the final result
    assert read("out/A") == "A"  # did not run the dl job
    assert read("out/B") == "B"  # did not run the calc job again
    assert read("out/Cx") == "CC"  # did run the load job again
def load(self):
    """Return the cached job that loads this ddf's base DataFrame and records
    which columns existed before any annotators ran."""

    def load_func(df):
        self.ddf.df = df
        self.ddf.non_annotator_columns = self.ddf.df.columns

    job = ppg.CachedDataLoadingJob(
        self.ddf.cache_dir / "calc", self.loading_function, load_func
    )
    # Invalidate the cache when the loading function's code changes.
    invariant = ppg.FunctionInvariant(
        self.ddf.__class__.__name__ + "_" + self.ddf.name + "_load",
        self.loading_function,
    )
    job.depends_on(self.deps).depends_on(invariant)
    return job
def test_no_dependand_still_calc(self):
    """The cache file is produced even when nothing depends on the job."""
    holder = Dummy()

    def compute():
        return ", ".join(str(x) for x in range(0, 100))

    def store(value):
        holder.a = value

    ppg.CachedDataLoadingJob("out/mycalc", compute, store)
    # job.ignore_code_changes() #or it would run anyway... hm.
    assert not os.path.exists("out/mycalc")
    ppg.run_pipegraph()
    assert os.path.exists("out/mycalc")
def test_simple(self):
    """Happy path: calc result is cached, loaded, and consumed downstream."""
    holder = Dummy()

    def compute():
        return ", ".join(str(x) for x in range(0, 100))

    def store(value):
        holder.a = value

    cached = ppg.CachedDataLoadingJob("out/mycalc", compute, store)
    target = "out/A"

    def write_out():
        write(target, holder.a)

    ppg.FileGeneratingJob(target, write_out).depends_on(cached)
    ppg.run_pipegraph()
    assert read(target) == ", ".join(str(x) for x in range(0, 100))
def test_cached_jobs_get_depencies_only_on_the_lazy_filegenerator_not_on_the_loading_job(
    self
):
    """depends_on() on a cached job must route the dependency to the lazy
    filegen (lfg), not to the loading half."""
    holder = Dummy()

    def compute():
        return list(range(0, holder.b))

    def store(value):
        holder.a = value

    cached = ppg.CachedDataLoadingJob("a", compute, store)

    def provide_b():
        return 100

    attr_job = ppg.AttributeLoadingJob("b", holder, "b", provide_b)
    cached.depends_on(attr_job)
    # Only the calc side needs holder.b - the load side must stay clean.
    assert attr_job not in cached.prerequisites
    assert attr_job in cached.lfg.prerequisites
def test_preqrequisites_end_up_on_lfg(self):
    """Prerequisites added after construction land on the lfg, not the loader."""
    holder = Dummy()

    def compute():
        return ", ".join(str(x) for x in range(0, 100))

    def store(value):
        holder.a = value

    cached = ppg.CachedDataLoadingJob("out/mycalc", compute, store)
    target = "out/A"

    def write_out():
        write(target, holder.a)

    ppg.FileGeneratingJob(target, write_out).depends_on(cached)
    upstream = ppg.FileGeneratingJob("out/B", write_out)
    cached.depends_on(upstream)
    assert upstream not in cached.prerequisites
    assert upstream in cached.lfg.prerequisites
def test_cant_unpickle(self):
    """A corrupted cache file must surface as a ValueError on the job."""
    holder = Dummy()

    def compute():
        return ", ".join(str(x) for x in range(0, 100))

    def store(value):
        holder.a = value

    cached = ppg.CachedDataLoadingJob("out/mycalc", compute, store)
    cached.ignore_code_changes()
    # Poison the cache so unpickling must fail.
    write("out/mycalc", "no unpickling this")
    target = "out/A"

    def write_out():
        write(target, holder.a)

    ppg.FileGeneratingJob(target, write_out).depends_on(cached)
    with pytest.raises(ValueError):
        ppg.run_pipegraph()
    assert isinstance(cached.exception, ValueError)
    assert "Unpickling error" in str(cached.exception)
def _anno_cache_and_calc(self, anno):
    """Build a CachedDataLoadingJob that computes an annotator's columns and
    concatenates them onto ``self.ddf.df``.

    The calc callback runs the annotator (``calc_ddf`` preferred over ``calc``)
    and normalizes its result to a DataFrame; the load callback validates the
    produced columns against ``anno.columns``, re-aligns a RangeIndex result to
    the ddf's index when the lengths match, and concatenates column-wise.

    Returns the cached loading job, wired to depend on the ddf's own load job,
    a FunctionInvariant on the annotator's calc function, any dependent
    annotators, and ``anno.deps(self.ddf)``.
    """

    def calc():
        # Run the annotator and normalize its output to a DataFrame.
        if not isinstance(anno.columns, list):
            raise ValueError("Columns was not a list")
        if hasattr(anno, "calc_ddf"):
            df = anno.calc_ddf(self.ddf)
        else:
            df = anno.calc(self.ddf.df)
        # A Series is acceptable only when exactly one column was declared.
        if isinstance(df, pd.Series) and len(anno.columns) == 1:
            df = pd.DataFrame({anno.columns[0]: df})
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                "result was no dataframe (or series and len(anno.columns) == 1)"
            )
        return df

    def load(df):
        # Validate the declared vs. produced column sets, then merge.
        s_should = set(anno.columns)
        if not len(s_should):
            raise ValueError("anno.columns was empty")
        s_actual = set(df.columns)
        if s_should != s_actual:
            # Fixed typo in error message: "actualy" -> "actually".
            raise ValueError(
                "Annotator declared different columns from those actually calculated: %s"
                % (s_should.symmetric_difference(s_actual))
            )
        if set(df.columns).intersection(self.ddf.df.columns):
            raise ValueError(
                "Annotator created columns that were already present.",
                self.ddf.name,
                anno.get_cache_name(),
                set(df.columns).intersection(self.ddf.df.columns),
            )
        if isinstance(df.index, pd.RangeIndex):
            if len(df) == len(self.ddf.df):
                # Assume it's simply ordered by the df - adopt its index.
                df.index = self.ddf.df.index
            else:
                raise ValueError(
                    "Length and index mismatch between DataFrame and Annotator result - "
                    "Annotator must return either a DF with a compatible index "
                    "or at least one with the same length (and a RangeIndex)"
                )
        self.ddf.df = pd.concat([self.ddf.df, df], axis=1)

    (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
    job = ppg.CachedDataLoadingJob(
        self.ddf.cache_dir / anno.__class__.__name__ / anno.get_cache_name(),
        calc,
        load,
    )
    # Both the load and the calc need our ddf.df to be present.
    ppg.Job.depends_on(job, self.load())
    job.depends_on(
        self.load(),
        ppg.FunctionInvariant(
            self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
            anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
        ),
    )
    for d in anno.dep_annos():
        if d is not None:
            job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
    job.depends_on(anno.deps(self.ddf))
    # The actual computation happens in the lazy filegen part of the job.
    job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
    return job
def test_accepts(self):
    """Every job class must accept pathlib.Path objects wherever it accepts
    string filenames (including mixed Path/str lists)."""
    import pathlib

    write("aaa", "hello")
    write("bbb", "hello")
    write("ccc", "hello")
    time_inv = ppg.FileTimeInvariant(pathlib.Path("aaa"))
    multi_inv = ppg.MultiFileInvariant([pathlib.Path("bbb"), "ccc"])
    job_b = ppg.FileGeneratingJob(
        pathlib.Path("b"),
        lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
    )
    job_b.depends_on(time_inv)
    job_b.depends_on(multi_inv)
    dd = Dummy()

    def make_files():
        write("c", "cc" + read("g"))
        write("d", "dd" + read("h") + dd.attr)
        write("e", "ee" + read("i") + read("j"))

    job_c = ppg.MultiFileGeneratingJob(
        [pathlib.Path("c"), "d", pathlib.Path("e")], make_files
    )
    job_c.depends_on(job_b)
    func_inv = ppg.FunctionInvariant(pathlib.Path("f"), lambda x: x + 1)
    job_c.depends_on(func_inv)
    param_inv = ppg.ParameterInvariant(pathlib.Path("c"), "hello")
    job_c.depends_on(param_inv)
    temp_g = ppg.TempFileGeneratingJob(pathlib.Path("g"), lambda: write("g", "gg"))
    job_c.depends_on(temp_g)

    def make_temp_files():
        write("h", "hh")
        write("i", "ii")

    temp_hi = ppg.MultiTempFileGeneratingJob([pathlib.Path("h"), "i"], make_temp_files)
    job_c.depends_on(temp_hi)

    def make_temp_plus():
        write("j", "jjjj")
        write("k", "kkkk")

    temp_plus = ppg.TempFilePlusGeneratingJob(
        pathlib.Path("j"), pathlib.Path("k"), make_temp_plus
    )
    job_c.depends_on(temp_plus)
    cached_load = ppg.CachedDataLoadingJob(
        pathlib.Path("l"), lambda: write("l", "llll"), lambda res: res
    )
    job_c.depends_on(cached_load)
    cached_attr = ppg.CachedAttributeLoadingJob(
        pathlib.Path("m"), dd, "attr", lambda: "55"
    )
    job_c.depends_on(cached_attr)
    ppg.run_pipegraph()
    assert read("aaa") == "hello"
    assert read("b") == "bbhellohellohello"
    assert read("c") == "ccgg"
    assert read("d") == "ddhh55"
    assert read("e") == "eeiijjjj"
    # Temp files must have been cleaned up afterwards.
    assert not os.path.exists("g")
    assert not os.path.exists("h")
    assert not os.path.exists("i")
    assert not os.path.exists("j")
    assert read("k") == "kkkk"
def inner():
    # Name argument is an int, not a str - construction is expected to raise.
    ppg.CachedDataLoadingJob(5, lambda: 1, lambda value: 55)
def inner():
    # Load callback is a string, not a callable - construction is expected to raise.
    ppg.CachedDataLoadingJob("out/a", lambda value: 55, "shu")
def test_use_cores(self):
    """use_cores() is chainable and forwards the count to the lazy filegen job."""
    cached = ppg.CachedDataLoadingJob("out/C", lambda: 55, lambda x: None)
    assert cached.use_cores(5) is cached
    assert cached.lfg.cores_needed == 5