def test_job_creation_after_pipegraph_run_raises(self):
    def inner():
        ppg.FileGeneratingJob("A", lambda: None)

    ppg.new_pipegraph(quiet=True, dump_graph=False)
    ppg.run_pipegraph()
    assertRaises(ValueError, inner)
def test_no_rerun_if_ignore_code_changes_and_plot_changes(self):
    import pydataframe

    def calc():
        append('out/calc', 'A')
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'B')

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def plot2(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('Y', 'X')

    job = ppg.PlotJob(of, calc, plot2)
    job.ignore_code_changes()
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'B')
def test_reruns_just_plot_if_plot_changed(self):
    import pydataframe

    def calc():
        append('out/calc', 'A')
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'B')

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def plot2(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('Y', 'X')

    job = ppg.PlotJob(of, calc, plot2)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'BB')
def test_reruns_just_plot_if_plot_changed(self):
    def calc():
        append("out/calc", "A")
        return pd.DataFrame({"X": list(range(0, 100)), "Y": list(range(50, 150))})

    def plot(df):
        append("out/plot", "B")
        return pyggplot.Plot(df).add_scatter("X", "Y")

    of = "out/test.png"
    ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find(b"PNG image") != -1)
    self.assertEqual(read("out/calc"), "A")
    self.assertEqual(read("out/plot"), "B")

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def plot2(df):
        append("out/plot", "B")
        return pyggplot.Plot(df).add_scatter("Y", "X")

    ppg.PlotJob(of, calc, plot2)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find(b"PNG image") != -1)
    self.assertEqual(read("out/calc"), "A")
    self.assertEqual(read("out/plot"), "BB")
def test_unpickle_bug_prevents_single_job_from_unpickling(self):
    def do_a():
        write("out/A", "A")
        append("out/As", "A")

    ppg.FileGeneratingJob("out/A", do_a)

    def do_b():
        write("out/B", "A")
        append("out/Bs", "A")

    job_B = ppg.FileGeneratingJob("out/B", do_b)
    cd = CantDepickle()
    job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
    job_B.depends_on(job_parameter_unpickle_problem)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    assert read("out/Bs") == "A"
    print("second run")

    ppg.new_pipegraph(dump_graph=False)
    ppg.FileGeneratingJob("out/A", do_a)
    job_B = ppg.FileGeneratingJob("out/B", do_b)
    job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
    job_B.depends_on(job_parameter_unpickle_problem)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    assert read("out/Bs") == "AA"  # this one got rerun because we could not load the invariant...
def test_jobs_concurrent_jobs_run_concurrently(self):
    # we'll determine this by the respective start and end times
    ppg.new_pipegraph(
        ppg.resource_coordinators.LocalSystem(max_cores_to_use=2),
        quiet=True,
        dump_graph=False,
    )
    jobA = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
    jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
    jobA.cores_needed = 1
    jobB.cores_needed = 1
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/B") == "B"
    if jobA.start_time < jobB.start_time:
        first_job = jobA
        second_job = jobB
    else:
        first_job = jobB
        second_job = jobA
    print(
        "times",
        first_job.start_time,
        first_job.stop_time,
        second_job.start_time,
        second_job.stop_time,
    )
    if jobA.start_time is None:
        raise ValueError("JobA did not run")
    assert first_job.stop_time > second_job.start_time
def test_reruns_both_if_calc_changed(self):
    import pydataframe

    def calc():
        append('out/calc', 'A')
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'B')

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def calc2():
        append('out/calc', 'A')
        x = 5
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    job = ppg.PlotJob(of, calc2, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'AA')
    self.assertEqual(read('out/plot'), 'BB')
def test_run_may_be_called_only_once(self):
    ppg.new_pipegraph(quiet=True, dump_graph=False)
    ppg.run_pipegraph()

    def inner():
        ppg.run_pipegraph()

    assertRaises(ValueError, inner)
def test_can_not_run_twice(self):
    ppg.new_pipegraph(dump_graph=False)
    ppg.run_pipegraph()
    try:
        ppg.run_pipegraph()
        assert False  # "Exception not correctly raised"
    except ValueError as e:
        print(e)
        assert "Each pipegraph may be run only once." in str(e)
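# Hedged sketch, not one of the original tests: the error above ("Each pipegraph may be
# run only once.") implies that a second run needs a fresh graph.  The names below assume
# the same ppg/write helpers used throughout these tests.
ppg.new_pipegraph(dump_graph=False)
ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
ppg.run_pipegraph()

ppg.new_pipegraph(dump_graph=False)  # a fresh graph, so run_pipegraph() may be called again
ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
ppg.run_pipegraph()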
def test_basic(self):
    ppg.new_pipegraph(rc_gen(), quiet=False)
    import pydataframe

    def calc():
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
def test_can_not_add_jobs_after_run(self):
    ppg.new_pipegraph(dump_graph=False)
    ppg.run_pipegraph()
    try:
        ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        assert False  # "Exception not correctly raised"
    except ValueError as e:
        print(e)
        assert (
            "This pipegraph was already run. You need to create a new one for more jobs"
            in str(e)
        )
def test_indirect_cicle(self):
    ppg.new_pipegraph(quiet=True, dump_graph=False)
    jobA = ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    jobB = ppg.FileGeneratingJob("B", lambda: write("B", "A"))
    jobC = ppg.FileGeneratingJob("C", lambda: write("C", "A"))
    jobC.depends_on(jobB)
    jobB.depends_on(jobA)
    jobA.depends_on(jobC)

    def inner():
        ppg.run_pipegraph()

    assertRaises(ppg.CycleError, inner)
def test_non_default_status_filename(self):
    try:
        forget_job_status("shu.dat")
        forget_job_status()
        ppg.new_pipegraph(
            quiet=True, invariant_status_filename="shu.dat", dump_graph=False
        )
        ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        ppg.run_pipegraph()
        assert os.path.exists("shu.dat")
        assert not os.path.exists(ppg.graph.invariant_status_filename_default)
    finally:
        forget_job_status("shu.dat")
def test_basic(self):
    ppg.new_pipegraph(rc_gen(), quiet=False)

    def calc():
        return pd.DataFrame({"X": list(range(0, 100)), "Y": list(range(50, 150))})

    def plot(df):
        return pyggplot.Plot(df).add_scatter("X", "Y")

    of = "out/test.png"
    ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find(b"PNG image") != -1)
def transmit_pipegraph(self, jobs):
    global global_pipegraph
    try:
        pypipegraph.new_pipegraph(
            pypipegraph.resource_coordinators.DummyResourceCoordinator())
        global_pipegraph = pypipegraph.util.global_pipegraph
        jobs = cPickle.loads(jobs)  # which fills the global pipegraph...
        logger.info("received pipegraph")
        logger.info("job len %i" % len(jobs))
        for name in jobs:
            logger.info("adding %s" % name)
            global_pipegraph.add_job(jobs[name])
        logger.info("Loaded pipegraph. Num jobs: %i" % len(global_pipegraph.jobs))
        global_pipegraph.running = True
        return {'ok': True, 'exception': ''}
    except Exception, e:
        logger.info("Pipegraph loading failed")
        # format_tb() requires a traceback argument; format_exc() logs the current exception
        logger.info(traceback.format_exc())
        return {"ok": False, 'exception': str(e)}
def np(quiet=True, **kwargs):
    if not first[0]:
        Path(target_path).mkdir(parents=True, exist_ok=True)
        os.chdir(target_path)
        Path("logs").mkdir()
        Path("cache").mkdir()
        Path("results").mkdir()
        Path("out").mkdir()
        import logging

        h = logging.getLogger("pypipegraph")
        h.setLevel(logging.WARNING)
        first[0] = True

    rc = ppg.resource_coordinators.LocalSystem(1)
    ppg.new_pipegraph(rc, quiet=quiet, dump_graph=False, **kwargs)
    ppg.util.global_pipegraph.result_dir = Path("results")
    g = ppg.util.global_pipegraph
    g.new_pipegraph = np
    return g
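# Hedged usage sketch (assumed, not part of the original helper): np() is meant to be
# called per test to set up the working directory once and hand back a fresh pipegraph.
# test_example and the "out/example" path are hypothetical; write/read are the helpers
# used by the tests above.
def test_example():
    np()
    ppg.FileGeneratingJob("out/example", lambda: write("out/example", "X"))
    ppg.run_pipegraph()
    assert read("out/example") == "X"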
def test_no_rerun_if_calc_change_but_ignore_codechanges(self):
    import pydataframe

    def calc():
        append('out/calc', 'A')
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    def plot(df):
        append('out/plot', 'B')
        return pyggplot.Plot(df).add_scatter('X', 'Y')

    of = 'out/test.png'
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'B')

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def calc2():
        append('out/calc', 'A')
        x = 5
        return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

    job = ppg.PlotJob(of, calc2, plot)
    job.ignore_code_changes()
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find('PNG image') != -1)
    self.assertEqual(read('out/calc'), 'A')
    self.assertEqual(read('out/plot'), 'B')
def get_genome(name=None):
    global ppg_genome
    cache_dir = Path(__file__).parent / "run" / "genome_cache"
    if ppg_genome is None:
        old_pipegraph = ppg.util.global_pipegraph
        ppg.new_pipegraph()
        g = get_Candidatus_carsonella_ruddii_pv(
            name, cache_dir=cache_dir  # , ignore_code_changes=True
        )
        g.download_genome()
        # g.job_genes()
        # g.job_transcripts()
        ppg_genome = g
        ppg.run_pipegraph()
        ppg.util.global_pipegraph = old_pipegraph
    return InteractiveFileBasedGenome(
        name,
        ppg_genome._filename_lookups["genome.fasta"],
        ppg_genome._filename_lookups["cdna.fasta"],
        ppg_genome._filename_lookups["proteins.fasta"],
        ppg_genome._filename_lookups["genes.gtf"],
        ppg_genome.cache_dir,
    )
def test_no_rerun_if_calc_change_but_ignore_codechanges(self):
    def calc():
        append("out/calc", "A")
        return pd.DataFrame({"X": list(range(0, 100)), "Y": list(range(50, 150))})

    def plot(df):
        append("out/plot", "B")
        return pyggplot.Plot(df).add_scatter("X", "Y")

    of = "out/test.png"
    job = ppg.PlotJob(of, calc, plot)
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find(b"PNG image") != -1)
    self.assertEqual(read("out/calc"), "A")
    self.assertEqual(read("out/plot"), "B")

    ppg.new_pipegraph(rc_gen(), quiet=True)

    def calc2():
        append("out/calc", "A")
        x = 5  # noqa: E157,F841
        return pd.DataFrame({"X": list(range(0, 100)), "Y": list(range(50, 150))})

    job = ppg.PlotJob(of, calc2, plot)
    job.ignore_code_changes()
    ppg.run_pipegraph()
    self.assertTrue(magic(of).find(b"PNG image") != -1)
    self.assertEqual(read("out/calc"), "A")
    self.assertEqual(read("out/plot"), "B")
import pypipegraph
import urllib2
import hashlib

pypipegraph.new_pipegraph()
output_filename = "result.tab"  # where to store the final counts


# each call to download_job will return a job that downloads just this url.
def download_job(url):
    target_file = (
        "website_%s" % hashlib.md5(url).hexdigest()
    )  # we need a unique name for each target file.

    def do_download():
        request = urllib2.urlopen(url)
        data = request.read()
        request.close()
        file_handle = open(target_file, "wb")
        file_handle.write(data)
        file_handle.close()

    return pypipegraph.FileGeneratingJob(target_file, do_download)


def retrieve_urls():
    # now I said we were downloading these, but to make the tutorial independent,
    # we'll fake that bit, ok?
    # just pretend ;).
    return [
import pypipegraph
import urllib2
import hashlib

pypipegraph.new_pipegraph()
output_filename = 'result.tab'  # where to store the final counts


# each call to download_job will return a job that downloads just this url.
def download_job(url):
    target_file = 'website_%s' % hashlib.md5(
        url).hexdigest()  # we need a unique name for each target file.

    def do_download():
        request = urllib2.urlopen(url)
        data = request.read()
        request.close()
        file_handle = open(target_file, 'wb')
        file_handle.write(data)
        file_handle.close()

    return pypipegraph.FileGeneratingJob(target_file, do_download)


def retrieve_urls():
    # now I said we were downloading these, but to make the tutorial independent,
    # we'll fake that bit, ok?
    # just pretend ;).
    return [
        'http://code.google.com/p/pypipegraph',
        'http://code.google.com/p/pypipegraph/w/list'
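# Hedged continuation of the tutorial above (whose URL list is truncated; not in the
# original snippet): a final FileGeneratingJob can aggregate the downloaded files into
# output_filename once every download job has run.  count_job and the tab-separated
# byte counts are assumptions; job_id and depends_on(list) are used as in the other
# snippets in this file.
def count_job():
    download_jobs = [download_job(url) for url in retrieve_urls()]

    def do_count():
        op = open(output_filename, 'w')
        for job in download_jobs:
            # the job_id of a FileGeneratingJob is its target file
            data = open(job.job_id, 'rb').read()
            op.write('%s\t%i\n' % (job.job_id, len(data)))
        op.close()

    job = pypipegraph.FileGeneratingJob(output_filename, do_count)
    job.depends_on(download_jobs)  # depends_on accepts a list of jobs
    return job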
def install_bioconductor():
    bc_version = os.environ["BIOCONDUCTOR_VERSION"]
    cran_mode = os.environ["CRAN_MODE"]
    sources = ["cran", "software", "annotation", "experiment"]
    sources = {
        x: load_packages(x, os.environ["URL_%s" % x.upper()]).get() for x in sources
    }
    if bc_version in manual_overwrite:
        for src_name, src in manual_overwrite[bc_version].items():
            for pkg_name, url in src.items():
                sources[src_name][pkg_name]["url"] = url
    pkgs = list(sources.values())
    whitelist = os.environ["BIOCONDUCTOR_WHITELIST"].split(":")
    logging.basicConfig(
        filename="/anysnake/bioconductor/ppg.log", level=logging.INFO, filemode="w"
    )
    cpus = int(ppg.util.CPUs() * 1.25)  # rule of thumb to achieve maximum throughput
    ppg.new_pipegraph(
        invariant_status_filename="/anysnake/bioconductor/.ppg_status",
        resource_coordinator=ppg.resource_coordinators.LocalSystem(
            max_cores_to_use=cpus, interactive=False
        ),
    )
    jobs, prune_because_of_missing_preqs = build_jobs(pkgs)
    # now we have jobs for *every* R package
    # which we now need to filter down
    to_prune = set()
    to_prune.update(sources["annotation"].keys())
    to_prune.update(sources["experiment"].keys())
    to_prune.update(prune_because_of_missing_preqs)
    prune(jobs, to_prune)

    if cran_mode == "minimal":
        prune(jobs, sources["cran"])
        already_unpruned = set()
        for k in sources["software"]:
            for j in jobs[k]:
                unprune(j, already_unpruned)
        prune(jobs, to_prune)

    already_unpruned = set()
    for k in whitelist:
        if k in jobs:
            for j in jobs[k]:
                unprune(j, already_unpruned)
    if "_full_" in whitelist:
        for k in sources["software"]:
            for j in jobs[k]:
                unprune(j, already_unpruned)

    # still need to apply the blacklist, no matter whether __full__ was set!
    to_prune = set()
    to_prune.update(windows_only_packages(pkgs))
    to_prune.update(blacklist)
    if bc_version in blacklist_per_version:
        to_prune.update(blacklist_per_version[bc_version])
    prune(jobs, to_prune)

    ppg.util.global_pipegraph.connect_graph()
    ppg.run_pipegraph()
    for j in ppg.util.global_pipegraph.job_uniquifier.values():
        if j._pruned:
            print("pruned", j.job_id, "because of", j._pruned)
    write_done_sentinel(cran_mode, whitelist)
def run_exports(gen_additional_jobs=None, handle_ppg=True, settings='ovca'):
    if settings == 'ovca':
        apply_ovca_settings()
    else:
        raise ValueError("unknown settings value", settings)
    old = Path(os.getcwd()).absolute()
    os.chdir("/project")
    if handle_ppg:
        ppg.new_pipegraph()
    # os.chdir(old)
    to_wide_columns = {}
    jobs = []
    for cls in exporting_classes:
        instance = cls()
        if hasattr(instance, "exports"):
            instance.exports()
        out_prefix = getattr(instance, "out_prefix", "")
        for method_name in dir(instance):
            method = getattr(instance, method_name)
            if hasattr(method, "_output_name"):
                print(cls.__name__, method.__name__)
                output_filename = ("/project/processed/" + out_prefix
                                   + method._output_name + ".units")
                cwd = str(Path(method._abs_filename).parent)

                def write(output_filename=output_filename, method=method, cwd=cwd):
                    os.chdir(cwd)
                    df = method()
                    os.chdir("/project")
                    check_dataframe(out_prefix + method._output_name, df)
                    Path(output_filename).parent.mkdir(exist_ok=True, parents=True)
                    if "unit" in df:
                        for ii, (unit, sub_df) in enumerate(
                                df.groupby("unit", sort=True)):
                            try:
                                sub_df.to_parquet(
                                    output_filename[:output_filename.rfind(".")]
                                    + "." + str(ii) + ".parquet")
                            except:
                                sub_df.to_pickle("debug.pickle")
                                raise
                        Path(output_filename).write_text(
                            json.dumps(sorted(df.unit.unique())))
                    else:
                        df.to_parquet(
                            output_filename[:output_filename.rfind(".")] + ".0.parquet")
                        Path(output_filename).write_text(json.dumps(["nounit"]))
                    Path(output_filename + ".desc").write_text(method._description)

                job = ppg.MultiFileGeneratingJob(
                    [output_filename, output_filename + ".desc"], write)
                job.depends_on(
                    ppg.FunctionInvariant(output_filename + "_inner_func", method))
                if method._input_files:
                    job.depends_on(ppg.MultiFileInvariant(method._input_files))
                if method._deps:
                    if hasattr(method._deps, "__call__"):
                        deps = method._deps(method.__self__)
                    else:
                        deps = method._deps
                    job.depends_on(deps)
                print(output_filename)
                print("")
                os.chdir("/project")
                jobs.append(job)
                to_wide_columns[out_prefix + method._output_name] = method._wide_columns

    def dump_to_wide_columns(output_filename):
        Path(output_filename).write_text(json.dumps(to_wide_columns))

    jobs.append(
        ppg.FileGeneratingJob(
            "/project/processed/_to_wide_columns.json",
            dump_to_wide_columns).depends_on(
                ppg.ParameterInvariant(
                    "/project/processed/_to_wide_columns.json",
                    ppg.util.freeze(to_wide_columns),
                )))

    old = Path(os.getcwd()).absolute()
    if handle_ppg:
        os.chdir("/project")
        ppg.run_pipegraph()
    os.chdir(old)
    return jobs
def inner():
    ppg.new_pipegraph(quiet=True, dump_graph=False)
    jobA = ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    jobB = ppg.FileGeneratingJob("A", lambda: write("B", "A"))
    jobA.depends_on(jobB)
    jobB.depends_on(jobA)
def main(argv):
    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)")
        sys.exit(1)

    # This creates a global Pipegraph object
    # All new jobs will automatically register with it.
    pypipegraph.new_pipegraph()

    invariants_csv_files = []
    for file in glob.glob(os.path.join(argv[1], "*.csv")):
        invariants_csv_files.append(pypipegraph.FileTimeInvariant(file))

    dictdata_ids = []

    def load_dictdata_ids():
        cr = loaded_data["cr"]
        dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
        if len(dictdata_ids) == 0:
            dictdata_ids = cr.dictdata_ids_for_component(argv[2])
            if len(dictdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
                sys.exit(1)
        return dictdata_ids

    def create_corpusreader():
        cr = CorpusReaderDict(argv[1])
        return cr

    def set_corpusreader(value):
        loaded_data["cr"] = value
        loaded_data["dictdata_ids"] = load_dictdata_ids()

    cr_loading_job = pypipegraph.CachedDataLoadingJob(
        filename_corpusreader, create_corpusreader, set_corpusreader)
    cr_loading_job.depends_on(invariants_csv_files)

    def generate_dictdata_graph_job(dictdata_id):
        cr = loaded_data["cr"]
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        target_file = "{0}.dot".format(dictdata_string)

        # now, we need a function that builds the graph and stores it to target_file
        def generate_dictdata_graph():
            gr = Graph()
            src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
            tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
            if src_language_iso != 'spa' and tgt_language_iso != 'spa':
                raise(NoSpanishException)
            language_iso = None
            if tgt_language_iso == 'spa':
                language_iso = src_language_iso
            else:
                language_iso = tgt_language_iso
            bibtex_key = dictdata_string.split("_")[0]
            for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
                if src_language_iso == 'spa':
                    (head, translation) = (translation, head)
                head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
                translation = escape_string(translation)
                #translation_with_language = "{0}|{1}".format(translation, language_iso)

                #if head_with_source not in gr:
                gr.add_node(head_with_source, attr_dict={
                    "lang": language_iso,
                    "source": bibtex_key
                })

                #if translation not in gr:
                gr.add_node(translation, attr_dict={"lang": "spa"})

                #if not gr.has_edge((head_with_source, translation)):
                gr.add_edge(head_with_source, translation)

            output = codecs.open(target_file, "w", "utf-8")
            output.write(write(gr))
            output.close()

        return pypipegraph.FileGeneratingJob(target_file, generate_dictdata_graph)

    def gen_jobs():
        cr = loaded_data["cr"]
        jobs_generate_dot = [
            generate_dictdata_graph_job(dictdata_id)
            for dictdata_id in loaded_data["dictdata_ids"]
            if cr.src_language_iso_for_dictdata_id(dictdata_id) == "spa"
            or cr.tgt_language_iso_for_dictdata_id(dictdata_id) == "spa"
        ]
        for job in jobs_generate_dot:
            job.depends_on(cr_loading_job)

        def combine_graphs():
            gr = None
            for dictdata_id in loaded_data["dictdata_ids"]:
                #dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
                #target_file = "{0}.dot".format(dictdata_string)
                j = generate_dictdata_graph_job(dictdata_id)
                target_file = j.job_id
                IN = codecs.open(target_file, "r", "utf-8")
                if gr is None:
                    gr = read(IN.read())
                else:
                    gr2 = read(IN.read())
                    for node in gr2:
                        gr.add_node(node, gr2.node[node])
                    for n1, n2 in gr2.edges_iter():
                        gr.add_edge(n1, n2, gr2.edge[n1][n2])
                IN.close()
            OUT = codecs.open(filename_combined_graph, "w", "utf-8")
            OUT.write(write(gr))
            OUT.close()

        job_combine_graphs = pypipegraph.FileGeneratingJob(
            filename_combined_graph, combine_graphs)
        job_combine_graphs.depends_on(jobs_generate_dot)

    pypipegraph.JobGeneratingJob("makejobs", gen_jobs).depends_on(cr_loading_job)
    pypipegraph.run_pipegraph()
    ExonSmartStrandedPython)

work_dir = Path("_benchmark_read_counting")
work_dir.mkdir(exist_ok=True)
os.chdir(work_dir)
bam_name = (Path("results") / "aligned" / "STAR_2.6.1d" /
            "Drosophila_melanogaster_94" / "ERR2984187" / "ERR2984187.bam")
if not bam_name.exists():
    # leverage pipeline to get some sample data
    import mbf_align
    import mbf_externals

    ppg.new_pipegraph()
    genome = mbf_genomes.EnsemblGenome("Drosophila_melanogaster", 94)
    aligner = mbf_externals.aligners.STAR()
    # just some random Drosophila lane.
    samples = {"ERR2984187": "ERR2984187"}
    raw = {
        name: mbf_align.Sample(
            name,
            mbf_align.strategies.FASTQsFromAccession(err),
            reverse_reads=False,
            pairing="only_first",
        )
        for name, err in samples.items()
    }
if __name__ == "__main__":
    import time
    import os
    import sys

    sys.path.append("../../")
    import pypipegraph as ppg

    ppg.new_pipegraph()

    def run_long():
        time.sleep(20)
        raise ValueError()

    def run_short():
        time.sleep(5)
        with open("short.dat", "wb") as op:
            op.write(b"DONO")  # bytes, since the file is opened in binary mode

    if os.path.exists("short.dat"):
        os.unlink("short.dat")
    if os.path.exists("long.dat"):
        os.unlink("long.dat")
    job1 = ppg.FileGeneratingJob("long.dat", run_long)
    job2 = ppg.FileGeneratingJob("short.dat", run_short)
    job1.depends_on(job2)
    ppg.run_pipegraph()
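    # Hedged variation (assumption, not part of the original script): run_long() raises,
    # so the run_pipegraph() call above ends with a failure.  Based on the ppg.RuntimeError
    # used in the unpickle test earlier, an alternative ending that records the outcome
    # would replace the final call with:
    #
    #     try:
    #         ppg.run_pipegraph()
    #     except ppg.RuntimeError:
    #         # short.dat should exist (run_short succeeded), long.dat should not
    #         print("pipeline failed; short.dat exists:", os.path.exists("short.dat"))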