def check_output(self, job, wait=3):
    """ Raise exception if output files of job are missing. """
    for f in job.expanded_output:
        if not f.exists:
            logger.warning(
                "Output file {} not present. Waiting {} "
                "seconds to ensure that this is not because of filesystem "
                "latency.".format(f, wait)
            )
            while not f.exists and wait > 0:
                wait -= 1
                time.sleep(1)
            if not f.exists:
                raise MissingOutputException(
                    "Output file {} not "
                    "produced by rule {}.".format(f, job.rule.name),
                    lineno=job.rule.lineno,
                    snakefile=job.rule.snakefile,
                )
    input_maxtime = job.input_maxtime
    if input_maxtime is not None:
        output_mintime = job.output_mintime
        if output_mintime is not None and output_mintime < input_maxtime:
            raise RuleException(
                "Output files {} are older than input "
                "files. Did you extract an archive? Make sure that output "
                "files have a more recent modification date than the "
                "archive, e.g. by using 'touch'.".format(
                    ", ".join(job.expanded_output)),
                rule=job.rule,
            )
def prepare(self):
    """
    Prepare execution of job.
    This includes creation of directories and deletion of previously created
    dynamic files.
    """
    self.check_protected_output()

    unexpected_output = self.dag.reason(self).missing_output.intersection(
        self.existing_output)
    if unexpected_output:
        logger.warning(
            "Warning: the following output files of rule {} were not "
            "present when the DAG was created:\n{}".format(
                self.rule, unexpected_output))

    if self.dynamic_output:
        for f, _ in chain(*map(partial(self.expand_dynamic,
                                       restriction=self.wildcards,
                                       omit_value=_IOFile.dynamic_fill),
                               self.rule.dynamic_output)):
            os.remove(f)
    for f, f_ in zip(self.output, self.rule.output):
        f.prepare()
    for f in self.log:
        f.prepare()
    if self.benchmark:
        self.benchmark.prepare()
def print_exception(ex, linemaps, print_traceback=True):
    """
    Print an error message for a given exception.

    Arguments
    ex -- the exception
    linemaps -- a dict of dicts that maps, for each snakefile, the compiled
        line numbers to the corresponding source lines in the snakefile.
    """
    # traceback.print_exception(type(ex), ex, ex.__traceback__)
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        logger.critical(format_error(ex, lineno,
                                     linemaps=linemaps,
                                     snakefile=file,
                                     show_traceback=print_traceback))
        return
    if isinstance(ex, SyntaxError):
        logger.critical(
            format_error(ex, ex.lineno,
                         linemaps=linemaps,
                         snakefile=ex.filename,
                         show_traceback=print_traceback)
        )
    elif isinstance(ex, TokenError):
        logger.critical(format_error(ex, None,
                                     show_traceback=print_traceback))
    elif isinstance(ex, RuleException):
        for e in ex._include + [ex]:
            if not e.omit:
                logger.critical(
                    format_error(e, e.lineno,
                                 linemaps=linemaps,
                                 snakefile=e.filename,
                                 show_traceback=print_traceback)
                )
    elif isinstance(ex, WorkflowError):
        logger.critical(
            format_error(ex, ex.lineno,
                         linemaps=linemaps,
                         snakefile=ex.snakefile,
                         show_traceback=print_traceback)
        )
    elif isinstance(ex, KeyboardInterrupt):
        logger.warning("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
def schedule(self):
    """ Schedule jobs that are ready, maximizing cpu usage. """
    while True:
        try:
            self._open_jobs.wait()
        except:
            # this is likely caused by SIGTERM or SIGINT
            self._executor.shutdown()
            return False
        self._open_jobs.clear()
        if not self.keepgoing and self._errors:
            logger.warning("Will exit after finishing "
                           "currently running jobs.")
            self._executor.shutdown()
            return False
        if not any(self.open_jobs):
            self._executor.shutdown()
            return not self._errors
        needrun = list(self.open_jobs)
        assert needrun

        logger.debug("Ready jobs:\n\t" + "\n\t".join(map(str, needrun)))

        run = self.job_selector(needrun)
        logger.debug("Selected jobs:\n\t" + "\n\t".join(map(str, run)))
        self.running.update(run)
        for job in run:
            self.run(job)
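# The scheduling loop above blocks on self._open_jobs until new jobs become
# ready and re-checks the queue after each wake-up. Below is a minimal,
# self-contained sketch of that wait/clear/set pattern using the standard
# library only; all names here are illustrative, not Snakemake's API.
import threading

open_jobs = threading.Event()
pending = []

def scheduling_loop():
    while True:
        open_jobs.wait()       # block until a producer signals new work
        open_jobs.clear()      # reset so the next wait() blocks again
        while pending:
            print("running", pending.pop())
        break                  # single pass is enough for the sketch

def submit(job):
    pending.append(job)
    open_jobs.set()            # wake the scheduling loop

t = threading.Thread(target=scheduling_loop)
t.start()
submit("job-1")
t.join()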
def finish_job(self, job):
    super().finish_job(job)
    self.stats.report_job_end(job)
    try:
        self.workflow.persistence.finished(job)
    except IOError as e:
        logger.warning("Failed to remove marker file for job started "
                       "({}). Please ensure write permissions for the "
                       "directory {}".format(
                           e, self.workflow.persistence.path))
def _error(self, job):
    """ Clear jobs and stop the workflow. """
    with self._lock:
        self._errors = True
        self.running.remove(job)
        self.failed.add(job)
        if self.keepgoing:
            logger.warning("Job failed, going on with independent jobs.")
        else:
            self._open_jobs.set()
def _run(self, job, callback=None, error_callback=None):
    super()._run(job)
    self.stats.report_job_start(job)
    try:
        self.workflow.persistence.started(job)
    except IOError as e:
        logger.warning("Failed to set marker file for job started ({}). "
                       "Snakemake will work, but cannot ensure that output "
                       "files are complete in case of a kill signal or "
                       "power loss. Please ensure write permissions for the "
                       "directory {}".format(
                           e, self.workflow.persistence.path))
def lutime(f, times):
    # In some cases we are on a platform where os.supports_follow_symlinks
    # includes stat() but not utime(). This leads to an anomaly. In any case,
    # we never want to touch the target of a link.
    if os.utime in os.supports_follow_symlinks:
        # ...utime is well behaved
        return os.utime(f, times, follow_symlinks=False)
    elif not os.path.islink(f):
        # ...symlinks are not an issue here
        return os.utime(f, times)
    else:
        # ...problem system. Do nothing.
        logger.warning("Unable to set utime on symlink {}. Your Python "
                       "build does not support it.".format(f))
        return None
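# A hedged usage sketch of lutime() above: backdate a symlink's own timestamps
# without touching the file it points to. It assumes a POSIX system where
# unprivileged symlink creation works; paths are invented for illustration.
import os
import tempfile
import time

with tempfile.TemporaryDirectory() as d:
    target = os.path.join(d, "data.txt")
    link = os.path.join(d, "latest")
    open(target, "w").close()
    os.symlink(target, link)
    before = os.stat(target).st_mtime
    lutime(link, (time.time() - 3600, time.time() - 3600))  # backdate the link only
    assert os.stat(target).st_mtime == before               # target stays untouched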
def printjob(self, job):
    # skip dynamic jobs that will be "executed" only in dryrun mode
    if self.dag.dynamic(job):
        return

    def format_files(job, io, ruleio, dynamicio):
        for f in io:
            f_ = ruleio[f]
            if f in dynamicio:
                yield "{} (dynamic)".format(f_)
            else:
                yield f

    def format_ruleitem(name, value):
        return "" if not value else "\t{}: {}".format(name, value)

    desc = list()
    if not self.quiet:
        if job.message:
            desc.append(job.message)
        else:
            desc.append("{}rule {}:".format(self.rule_prefix(job),
                                            job.rule.name))
            for name, value in (
                    ("input", ", ".join(format_files(
                        job, job.input, job.ruleio, job.dynamic_input))),
                    ("output", ", ".join(format_files(
                        job, job.output, job.ruleio, job.dynamic_output))),
                    ("log", job.log),
                    ("reason",
                     self.dag.reason(job) if self.printreason else None)):
                if value:
                    desc.append(format_ruleitem(name, value))
            priority = self.dag.priority(job)
            if priority > 1:
                desc.append(format_ruleitem(
                    "priority",
                    "highest" if priority == Job.HIGHEST_PRIORITY
                    else priority))
            if self.printthreads and job.threads > 1:
                desc.append(format_ruleitem("threads", job.threads))
    if self.printshellcmds and job.shellcmd:
        desc.append(job.shellcmd)
    if desc:
        logger.info("\n".join(desc))
        if job.dynamic_output:
            logger.warning("Subsequent jobs will be added dynamically "
                           "depending on the output of this rule")
def remove(file, remove_non_empty_dir=False):
    if os.path.exists(file):
        if os.path.isdir(file):
            if remove_non_empty_dir:
                shutil.rmtree(file)
            else:
                try:
                    os.removedirs(file)
                except OSError as e:
                    # skip non-empty directories (errno 39 is ENOTEMPTY)
                    if e.errno == 39:
                        logger.info("Skipped removing non-empty directory "
                                    "{}".format(e.filename))
                    else:
                        logger.warning(str(e))
        else:
            os.remove(file)
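# The guard above compares against the magic number 39 (ENOTEMPTY on Linux).
# A small sketch of the same check written with the symbolic constant from
# the errno module, which reads more clearly and is portable; the function
# name is illustrative.
import errno
import os

def try_remove_empty_dirs(path):
    # Remove path and empty parent directories, skipping non-empty ones.
    try:
        os.removedirs(path)
    except OSError as e:
        if e.errno == errno.ENOTEMPTY:
            print("Skipped removing non-empty directory", e.filename)
        else:
            raise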
def finish(self, job, update_dynamic=True):
    self._finished.add(job)
    try:
        self._ready_jobs.remove(job)
    except KeyError:
        pass

    # mark depending jobs as ready
    for job_ in self.depending[job]:
        if self.needrun(job_) and self._ready(job_):
            self._ready_jobs.add(job_)

    if update_dynamic and job.dynamic_output:
        logger.warning("Dynamically updating jobs")
        newjob = self.update_dynamic(job)
        if newjob:
            # simulate that this job ran and was finished before
            self.omitforce.add(newjob)
            self._needrun.add(newjob)
            self._finished.add(newjob)

            self.postprocess()
            self.handle_protected(newjob)
def handle_temp(self, job):
    """ Remove temp files if they are no longer needed. """
    if self.notemp:
        return

    needed = lambda job_, f: any(
        f in files
        for j, files in self.depending[job_].items()
        if not self.finished(j) and self.needrun(j) and j != job)

    def unneeded_files():
        for job_, files in self.dependencies[job].items():
            for f in job_.temp_output & files:
                if not needed(job_, f):
                    yield f
        for f in filterfalse(partial(needed, job), job.temp_output):
            if f not in self.targetfiles:
                yield f

    for f in unneeded_files():
        logger.warning("Removing temporary output file {}".format(f))
        f.remove()
def update_dynamic(self, job):
    dynamic_wildcards = job.dynamic_wildcards
    if not dynamic_wildcards:
        # this happens e.g. in dryrun if output is not yet present
        return

    depending = list(filter(lambda job_: not self.finished(job_),
                            self.bfs(self.depending, job)))
    newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
        dynamic_wildcards, input=False)
    self.specialize_rule(job.rule, newrule)

    # no targetfile needed for job
    newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
    self.replace_job(job, newjob)
    for job_ in depending:
        if job_.dynamic_input:
            newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
            if newrule_ is not None:
                self.specialize_rule(job_.rule, newrule_)
                if not self.dynamic(job_):
                    logger.debug("Updating job {}.".format(job_))
                    newjob_ = Job(newrule_, self, targetfile=job_.targetfile)

                    unexpected_output = self.reason(
                        job_).missing_output.intersection(
                            newjob.existing_output)
                    if unexpected_output:
                        logger.warning(
                            "Warning: the following output files of rule {} "
                            "were not present when the DAG was created:"
                            "\n{}".format(newjob_.rule, unexpected_output))

                    self.replace_job(job_, newjob_)
    return newjob
def __init__( self, workflow, dag, cores, local_cores=1, dryrun=False, touch=False, cluster=None, cluster_status=None, cluster_config=None, cluster_sync=None, drmaa=None, drmaa_log_dir=None, kubernetes=None, container_image=None, tibanna=None, tibanna_sfn=None, google_lifesciences=None, google_lifesciences_regions=None, google_lifesciences_location=None, google_lifesciences_cache=False, tes=None, precommand="", preemption_default=None, preemptible_rules=None, tibanna_config=False, jobname=None, quiet=False, printreason=False, printshellcmds=False, keepgoing=False, max_jobs_per_second=None, max_status_checks_per_second=100, latency_wait=3, greediness=1.0, force_use_threads=False, assume_shared_fs=True, keepincomplete=False, keepmetadata=True, scheduler_type=None, scheduler_ilp_solver=None, ): """ Create a new instance of KnapsackJobScheduler. """ from ratelimiter import RateLimiter self.cluster = cluster self.cluster_config = cluster_config self.cluster_sync = cluster_sync self.dag = dag self.workflow = workflow self.dryrun = dryrun self.touch = touch self.quiet = quiet self.keepgoing = keepgoing self.running = set() self.failed = set() self.finished_jobs = 0 self.greediness = 1 self.max_jobs_per_second = max_jobs_per_second self.keepincomplete = keepincomplete self.keepmetadata = keepmetadata self.scheduler_type = scheduler_type self.scheduler_ilp_solver = scheduler_ilp_solver self.global_resources = { name: (sys.maxsize if res is None else res) for name, res in workflow.global_resources.items() } self.resources = dict(self.global_resources) use_threads = ( force_use_threads or (os.name != "posix") or cluster or cluster_sync or drmaa ) self._open_jobs = threading.Semaphore(0) self._lock = threading.Lock() self._errors = False self._finished = False self._job_queue = None self._submit_callback = self._noop self._finish_callback = partial( self._proceed, update_dynamic=not self.dryrun, print_progress=not self.quiet and not self.dryrun, ) self._local_executor = None if dryrun: self._executor = DryrunExecutor( workflow, dag, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, ) elif touch: self._executor = TouchExecutor( workflow, dag, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, ) elif cluster or cluster_sync or (drmaa is not None): if not workflow.immediate_submit: # No local jobs when using immediate submit! 
# Otherwise, they will fail due to missing input self._local_executor = CPUExecutor( workflow, dag, local_cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, cores=local_cores, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) if cluster or cluster_sync: if cluster_sync: constructor = SynchronousClusterExecutor else: constructor = partial( GenericClusterExecutor, statuscmd=cluster_status, max_status_checks_per_second=max_status_checks_per_second, ) self._executor = constructor( workflow, dag, None, submitcmd=(cluster or cluster_sync), cluster_config=cluster_config, jobname=jobname, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, assume_shared_fs=assume_shared_fs, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) if workflow.immediate_submit: self._submit_callback = partial( self._proceed, update_dynamic=False, print_progress=False, update_resources=False, handle_job_success=False, ) else: self._executor = DRMAAExecutor( workflow, dag, None, drmaa_args=drmaa, drmaa_log_dir=drmaa_log_dir, jobname=jobname, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, cluster_config=cluster_config, assume_shared_fs=assume_shared_fs, max_status_checks_per_second=max_status_checks_per_second, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) elif kubernetes: self._local_executor = CPUExecutor( workflow, dag, local_cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, cores=local_cores, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) self._executor = KubernetesExecutor( workflow, dag, kubernetes, container_image=container_image, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, cluster_config=cluster_config, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) elif tibanna: self._local_executor = CPUExecutor( workflow, dag, local_cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, use_threads=use_threads, latency_wait=latency_wait, cores=local_cores, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) self._executor = TibannaExecutor( workflow, dag, cores, tibanna_sfn, precommand=precommand, tibanna_config=tibanna_config, container_image=container_image, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) elif google_lifesciences: self._local_executor = CPUExecutor( workflow, dag, local_cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, cores=local_cores, ) self._executor = GoogleLifeSciencesExecutor( workflow, dag, cores, container_image=container_image, regions=google_lifesciences_regions, location=google_lifesciences_location, cache=google_lifesciences_cache, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, preemption_default=preemption_default, preemptible_rules=preemptible_rules, ) elif tes: self._local_executor = CPUExecutor( workflow, dag, local_cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, cores=local_cores, keepincomplete=keepincomplete, ) self._executor = TaskExecutionServiceExecutor( workflow, dag, cores=local_cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, latency_wait=latency_wait, tes_url=tes, 
container_image=container_image, ) else: self._executor = CPUExecutor( workflow, dag, cores, printreason=printreason, quiet=quiet, printshellcmds=printshellcmds, use_threads=use_threads, latency_wait=latency_wait, cores=cores, keepincomplete=keepincomplete, keepmetadata=keepmetadata, ) if self.max_jobs_per_second and not self.dryrun: max_jobs_frac = Fraction(self.max_jobs_per_second).limit_denominator() self.rate_limiter = RateLimiter( max_calls=max_jobs_frac.numerator, period=max_jobs_frac.denominator ) else: # essentially no rate limit self.rate_limiter = DummyRateLimiter() # Choose job selector (greedy or ILP) self.job_selector = self.job_selector_greedy if scheduler_type == "ilp": import pulp if pulp.apis.LpSolverDefault is None: logger.warning( "Falling back to greedy scheduler because no default " "solver is found for pulp (you have to install either " "coincbc or glpk)." ) else: self.job_selector = self.job_selector_ilp self._user_kill = None try: signal.signal(signal.SIGTERM, self.exit_gracefully) except ValueError: # If this fails, it is due to scheduler not being invoked in the main thread. # This can only happen with --gui, in which case it is fine for now. pass self._open_jobs.release()
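# The rate limiter set up above converts a possibly fractional
# max_jobs_per_second value into an integer (max_calls, period) pair via
# Fraction. A minimal illustration of that mapping with standard-library code
# only (the printed values are just examples):
from fractions import Fraction

for rate in (5, 0.5, 1.5):
    frac = Fraction(rate).limit_denominator()
    # e.g. 0.5 jobs/s -> at most 1 call per 2-second period
    print(rate, "->", frac.numerator, "call(s) per", frac.denominator, "s")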
def __init__(
    self,
    path,
    job,
    caption,
    env,
    category,
    wildcards_overwrite=None,
    mode_embedded=True,
):
    self.mode_embedded = mode_embedded
    self.path = path
    self.target = os.path.basename(path)
    self.size = os.path.getsize(self.path)
    logger.info("Adding {} ({:.2g} MB).".format(self.name, self.size / 1e6))
    self.raw_caption = caption
    self.mime, _ = mime_from_file(self.path)
    h = hashlib.sha256()
    h.update(path.encode())
    self.id = h.hexdigest()
    self.job = job
    self._wildcards = (job.wildcards
                       if wildcards_overwrite is None
                       else wildcards_overwrite)
    self.wildcards = logging.format_wildcards(self._wildcards)
    self.params = (logging.format_dict(job.params)
                   .replace("\n", r"\n")
                   .replace('"', r"\""))
    self.category = category
    self.table_content = None
    if self.is_table:
        if self.size > 1e6:
            logger.warning(
                "Table {} >1MB. Rendering as generic file.".format(self.path))
        else:
            with open(self.path) as table:
                dialect = None
                for prefix in range(10, 17):
                    try:
                        table.seek(0)
                        dialect = csv.Sniffer().sniff(table.read(prefix))
                        break
                    except csv.Error:
                        pass
                    except UnicodeDecodeError:
                        # table is not readable as UTF-8
                        break
                if dialect is None:
                    logger.warning(
                        "Failed to infer CSV/TSV dialect from table {}. "
                        "Rendering as generic file.".format(self.path))
                else:
                    table.seek(0)
                    reader = csv.reader(table, dialect)
                    columns = next(reader)
                    table = map(
                        lambda row: list(map(num_if_possible, row)), reader)
                    template = env.get_template("table.html")
                    html = template.render(columns=columns,
                                           table=table,
                                           name=self.name).encode()
                    self.table_content = html
                    self.mime = "text/html"
                    self.path = os.path.basename(self.path) + ".html"

    self.data_uri = self._data_uri()
    self.png_uri = self._png_uri()
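# Dialect detection above retries csv.Sniffer on slightly longer prefixes of
# the file until sniffing succeeds. A self-contained illustration of the same
# idea on an in-memory TSV sample (the sample data is invented):
import csv
import io

sample = "name\tvalue\na\t1\nb\t2\n"
buf = io.StringIO(sample)
dialect = None
for prefix in range(10, 17):
    try:
        buf.seek(0)
        dialect = csv.Sniffer().sniff(buf.read(prefix))
        break
    except csv.Error:
        pass
if dialect is not None:
    buf.seek(0)
    print(list(csv.reader(buf, dialect)))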
def auto_report(dag, path, stylesheet=None): try: from jinja2 import Template, Environment, PackageLoader except ImportError as e: raise WorkflowError( "Python package jinja2 must be installed to create reports.") mode_embedded = True if path.endswith(".zip"): mode_embedded = False elif not path.endswith(".html"): raise WorkflowError("Report file does not end with .html or .zip") custom_stylesheet = None if stylesheet is not None: try: with open(stylesheet) as s: custom_stylesheet = s.read() except (Exception, BaseException) as e: raise WorkflowError("Unable to read custom report stylesheet.", e) logger.info("Creating report...") env = Environment( loader=PackageLoader("snakemake", "report"), trim_blocks=True, lstrip_blocks=True, ) env.filters["get_resource_as_string"] = get_resource_as_string persistence = dag.workflow.persistence results = defaultdict(lambda: defaultdict(list)) records = defaultdict(JobRecord) recorded_files = set() for job in dag.jobs: for f in itertools.chain(job.expanded_output, job.input): if is_flagged(f, "report") and f not in recorded_files: if not f.exists: raise WorkflowError("File {} marked for report but does " "not exist.".format(f)) report_obj = get_flag_value(f, "report") def register_file(f, wildcards_overwrite=None): wildcards = wildcards_overwrite or job.wildcards category = Category(report_obj.category, wildcards=wildcards, job=job) subcategory = Category(report_obj.subcategory, wildcards=wildcards, job=job) results[category][subcategory].append( FileRecord( f, job, report_obj.caption, env, category, wildcards_overwrite=wildcards_overwrite, mode_embedded=mode_embedded, )) recorded_files.add(f) if os.path.isfile(f): register_file(f) if os.path.isdir(f): if not isinstance(report_obj.patterns, list): raise WorkflowError( "Invalid patterns given for report. Must be list.", rule=job.rule, ) if not report_obj.patterns: raise WorkflowError( "Directory marked for report but no file patterns given via patterns=[...]. " "See report documentation.", rule=job.rule, ) for pattern in report_obj.patterns: pattern = os.path.join(f, pattern) wildcards = glob_wildcards(pattern)._asdict() names = wildcards.keys() for w in zip(*wildcards.values()): w = dict(zip(names, w)) w.update(job.wildcards_dict) w = Wildcards(fromdict=w) f = apply_wildcards(pattern, w) register_file(f, wildcards_overwrite=w) for f in job.expanded_output: meta = persistence.metadata(f) if not meta: logger.warning("Missing metadata for file {}. Maybe metadata " "was deleted or it was created using an older " "version of Snakemake. 
This is a non critical " "warning.".format(f)) continue try: job_hash = meta["job_hash"] rule = meta["rule"] rec = records[(job_hash, rule)] rec.rule = rule rec.job = job rec.starttime = min(rec.starttime, meta["starttime"]) rec.endtime = max(rec.endtime, meta["endtime"]) rec.conda_env_file = None rec.conda_env = meta["conda_env"] rec.container_img_url = meta["container_img_url"] rec.output.append(f) except KeyError as e: print(e) logger.warning("Metadata for file {} was created with a too " "old Snakemake version.".format(f)) for subcats in results.values(): for catresults in subcats.values(): catresults.sort(key=lambda res: res.name) # prepare runtimes runtimes = [{ "rule": rec.rule, "runtime": rec.endtime - rec.starttime } for rec in sorted(records.values(), key=lambda rec: rec.rule)] # prepare end times timeline = [{ "rule": rec.rule, "starttime": datetime.datetime.fromtimestamp(rec.starttime).isoformat(), "endtime": datetime.datetime.fromtimestamp(rec.endtime).isoformat(), } for rec in sorted(records.values(), key=lambda rec: rec.rule)] # prepare per-rule information rules = defaultdict(list) for rec in records.values(): rule = RuleRecord(rec.job, rec) if rec.rule not in rules: rules[rec.rule].append(rule) else: merged = False for other in rules[rec.rule]: if rule == other: other.add(rec) merged = True break if not merged: rules[rec.rule].append(rule) # rulegraph rulegraph, xmax, ymax = rulegraph_d3_spec(dag) # configfiles configfiles = [ConfigfileRecord(f) for f in dag.workflow.configfiles] seen = set() files = [ seen.add(res.target) or res for cat in results.values() for subcat in cat.values() for res in subcat if res.target not in seen ] rst_links = textwrap.dedent(""" .. _Workflow: javascript:show_panel('workflow') .. _Statistics: javascript:show_panel('statistics') {% for cat, catresults in categories|dictsort %} .. _{{ cat.name }}: javascript:show_panel("{{ cat.id }}") {% endfor %} {% for res in files %} .. 
_{{ res.target }}: javascript:show_panel("{{ res.category.id }}") {% endfor %} """) for cat, subcats in results.items(): for subcat, catresults in subcats.items(): for res in catresults: res.render(env, rst_links, results, files) # global description text = "" if dag.workflow.report_text: with open(dag.workflow.report_text) as f: class Snakemake: config = dag.workflow.config text = f.read() + rst_links text = publish_parts( env.from_string(text).render(snakemake=Snakemake, categories=results, files=files), writer_name="html", )["body"] # record time now = "{} {}".format(datetime.datetime.now().ctime(), time.tzname[0]) results_size = sum(res.size for cat in results.values() for subcat in cat.values() for res in subcat) try: from pygments.formatters import HtmlFormatter except ImportError: raise WorkflowError( "Python package pygments must be installed to create reports.") template = env.get_template("report.html") logger.info("Downloading resources and rendering HTML.") rendered = template.render( results=results, results_size=results_size, configfiles=configfiles, text=text, rulegraph_nodes=rulegraph["nodes"], rulegraph_links=rulegraph["links"], rulegraph_width=xmax + 20, rulegraph_height=ymax + 20, runtimes=runtimes, timeline=timeline, rules=[rec for recs in rules.values() for rec in recs], version=__version__, now=now, pygments_css=HtmlFormatter(style="trac").get_style_defs(".source"), custom_stylesheet=custom_stylesheet, mode_embedded=mode_embedded, ) # TODO look into supporting .WARC format, also see (https://webrecorder.io) if not mode_embedded: with ZipFile(path, mode="w") as zipout: folder = Path(Path(path).stem) # store results in data folder for subcats in results.values(): for catresults in subcats.values(): for result in catresults: # write raw data if result.table_content is not None: zipout.writestr( str(folder.joinpath(result.data_uri)), result.table_content, ) else: zipout.write(result.path, str(folder.joinpath(result.data_uri))) # write thumbnail if result.is_img and result.png_content: zipout.writestr( str(folder.joinpath(result.png_uri)), result.png_content) # write report html zipout.writestr(str(folder.joinpath("report.html")), rendered) else: with open(path, "w", encoding="utf-8") as htmlout: htmlout.write(rendered) logger.info("Report created: {}.".format(path))
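# When the report target is a .zip (non-embedded mode), the rendered HTML and
# the raw result files are written into a folder named after the report file,
# as in the code above. A minimal stand-alone sketch of that layout using only
# the standard library; the file names below are invented for illustration.
from pathlib import Path
from zipfile import ZipFile

report_path = "report.zip"
folder = Path(Path(report_path).stem)  # -> "report/"
with ZipFile(report_path, mode="w") as zipout:
    zipout.writestr((folder / "report.html").as_posix(),
                    "<html>stub report</html>")
    zipout.writestr((folder / "data" / "table1.html").as_posix(),
                    "<table></table>")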
def execute( self, targets=None, dryrun=False, touch=False, cores=1, forcetargets=False, forceall=False, forcerun=None, prioritytargets=None, quiet=False, keepgoing=False, printshellcmds=False, printreason=False, printdag=False, cluster=None, immediate_submit=False, ignore_ambiguity=False, workdir=None, printrulegraph=False, stats=None, force_incomplete=False, ignore_incomplete=False, list_version_changes=False, list_code_changes=False, list_input_changes=False, list_params_changes=False, summary=False, output_wait=3, nolock=False, unlock=False, resources=None, notemp=False, nodeps=False, cleanup_metadata=None): self.global_resources = dict() if cluster or resources is None else resources self.global_resources["_cores"] = cores def rules(items): return map(self._rules.__getitem__, filter(self.is_rule, items)) def files(items): return map(os.path.relpath, filterfalse(self.is_rule, items)) if workdir is None: workdir = os.getcwd() if self._workdir is None else self._workdir os.chdir(workdir) if not targets: targets = [self.first_rule] if self.first_rule is not None else list() if prioritytargets is None: prioritytargets = list() if forcerun is None: forcerun = list() priorityrules = set(rules(prioritytargets)) priorityfiles = set(files(prioritytargets)) forcerules = set(rules(forcerun)) forcefiles = set(files(forcerun)) targetrules = set(chain( rules(targets), filterfalse(Rule.has_wildcards, priorityrules), filterfalse(Rule.has_wildcards, forcerules))) targetfiles = set(chain(files(targets), priorityfiles, forcefiles)) if forcetargets: forcefiles.update(targetfiles) forcerules.update(targetrules) dag = DAG( self, dryrun=dryrun, targetfiles=targetfiles, targetrules=targetrules, forceall=forceall, forcefiles=forcefiles, forcerules=forcerules, priorityfiles=priorityfiles, priorityrules=priorityrules, ignore_ambiguity=ignore_ambiguity, force_incomplete=force_incomplete, ignore_incomplete=ignore_incomplete, notemp=notemp) self.persistence = Persistence(nolock=nolock, dag=dag) if cleanup_metadata: for f in cleanup_metadata: self.persistence.cleanup_metadata(f) return True dag.init() dag.check_dynamic() if unlock: try: self.persistence.cleanup_locks() logger.warning("Unlocking working directory.") return True except IOError: logger.error("Error: Unlocking the directory {} failed. Maybe " "you don't have the permissions?") return False try: self.persistence.lock() except IOError: logger.critical("Error: Directory cannot be locked. Please make " "sure that no other Snakemake process is trying to create " "the same files in the following directory:\n{}\n" "If you are sure that no other " "instances of snakemake are running on this directory, " "the remaining lock was likely caused by a kill signal or " "a power loss. It can be removed with " "the --unlock argument.".format(os.getcwd())) return False dag.check_incomplete() dag.postprocess() if nodeps: missing_input = [f for job in dag.targetjobs for f in job.input if dag.needrun(job) and not os.path.exists(f)] logger.critical("Dependency resolution disabled (--nodeps) " "but missing input " "files detected. If this happens on a cluster, please make sure " "that you handle the dependencies yourself or turn of " "--immediate-submit. 
Missing input files:\n{}".format( "\n".join(missing_input))) return False if printdag: print(dag) return True elif printrulegraph: print(dag.rule_dot()) return True elif summary: print("\n".join(dag.summary())) return True elif list_version_changes: items = list(chain( *map(self.persistence.version_changed, dag.jobs))) if items: print(*items, sep="\n") return True elif list_code_changes: items = list(chain( *map(self.persistence.code_changed, dag.jobs))) if items: print(*items, sep="\n") return True elif list_input_changes: items = list(chain( *map(self.persistence.input_changed, dag.jobs))) if items: print(*items, sep="\n") return True elif list_params_changes: items = list(chain( *map(self.persistence.params_changed, dag.jobs))) if items: print(*items, sep="\n") return True scheduler = JobScheduler( self, dag, cores, dryrun=dryrun, touch=touch, cluster=cluster, immediate_submit=immediate_submit, quiet=quiet, keepgoing=keepgoing, printreason=printreason, printshellcmds=printshellcmds, output_wait=output_wait) if not dryrun and not quiet and len(dag): if cluster: logger.warning("Provided cluster nodes: {}".format(cores)) else: logger.warning("Provided cores: {}".format(cores)) logger.warning("\n".join(dag.stats())) success = scheduler.schedule() if success: if dryrun: if not quiet: logger.warning("\n".join(dag.stats())) elif stats: scheduler.stats.to_csv(stats) else: logger.critical( "Exiting because a job execution failed. " "Look above for error message") return False return True
def _set_inoutput_item(self, item, output=False, name=None): """ Set an item to be input or output. Arguments item -- the item inoutput -- either a Namedlist of input or output items name -- an optional name for the item """ inoutput = self.output if output else self.input if isinstance(item, str): # add the rule to the dependencies if isinstance(item, _IOFile): self.dependencies[item] = item.rule if output: if self.wildcard_constraints or self.workflow._wildcard_constraints: try: item = update_wildcard_constraints( item, self.wildcard_constraints, self.workflow._wildcard_constraints) except ValueError as e: raise IOFileException(str(e), snakefile=self.snakefile, lineno=self.lineno) else: if contains_wildcard_constraints(item): logger.warning( "wildcard constraints in inputs are ignored") _item = IOFile(item, rule=self) if is_flagged(item, "temp"): if output: self.temp_output.add(_item) if is_flagged(item, "protected"): if output: self.protected_output.add(_item) if is_flagged(item, "touch"): if output: self.touch_output.add(_item) if is_flagged(item, "dynamic"): if output: self.dynamic_output.add(_item) else: self.dynamic_input.add(_item) if is_flagged(item, "subworkflow"): if output: raise SyntaxError( "Only input files may refer to a subworkflow") else: # record the workflow this item comes from self.subworkflow_input[_item] = item.flags["subworkflow"] inoutput.append(_item) if name: inoutput.add_name(name) elif callable(item): if output: raise SyntaxError( "Only input files can be specified as functions") inoutput.append(item) if name: inoutput.add_name(name) else: try: start = len(inoutput) for i in item: self._set_inoutput_item(i, output=output) if name: # if the list was named, make it accessible inoutput.set_name(name, start, end=len(inoutput)) except TypeError: raise SyntaxError( "Input and output files have to be specified as strings or lists of strings." )
def validate(data, schema, set_default=True): """Validate data with JSON schema at given path. Args: data (object): data to validate. Can be a config dict or a pandas data frame. schema (str): Path to JSON schema used for validation. The schema can also be in YAML format. If validating a pandas data frame, the schema has to describe a row record (i.e., a dict with column names as keys pointing to row values). See http://json-schema.org. The path is interpreted relative to the Snakefile when this function is called. set_default (bool): set default values defined in schema. See http://python-jsonschema.readthedocs.io/en/latest/faq/ for more information """ try: import jsonschema from jsonschema import validators, RefResolver except ImportError: raise WorkflowError( "The Python 3 package jsonschema must be installed " "in order to use the validate directive.") if not os.path.isabs(schema): frame = inspect.currentframe().f_back # if workflow object is not available this has not been started from a workflow if "workflow" in frame.f_globals: workflow = frame.f_globals["workflow"] schema = os.path.join(workflow.current_basedir, schema) schemafile = schema schema = _load_configfile(schema, filetype="Schema") resolver = RefResolver( urljoin("file:", schemafile), schema, handlers={ "file": lambda uri: _load_configfile(re.sub("^file://", "", uri)) }, ) # Taken from http://python-jsonschema.readthedocs.io/en/latest/faq/ def extend_with_default(validator_class): validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for property, subschema in properties.items(): if "default" in subschema: instance.setdefault(property, subschema["default"]) for error in validate_properties(validator, properties, instance, schema): yield error return validators.extend(validator_class, {"properties": set_defaults}) Validator = validators.validator_for(schema) if Validator.META_SCHEMA["$schema"] != schema["$schema"]: logger.warning( "No validator found for JSON Schema version identifier '{}'". format(schema["$schema"])) logger.warning( "Defaulting to validator for JSON Schema version '{}'".format( Validator.META_SCHEMA["$schema"])) logger.warning("Note that schema file may not be validated correctly.") DefaultValidator = extend_with_default(Validator) if not isinstance(data, dict): try: import pandas as pd recordlist = [] if isinstance(data, pd.DataFrame): for i, record in enumerate(data.to_dict("records")): record = { k: v for k, v in record.items() if not pd.isnull(v) } try: if set_default: DefaultValidator( schema, resolver=resolver).validate(record) recordlist.append(record) else: jsonschema.validate(record, schema, resolver=resolver) except jsonschema.exceptions.ValidationError as e: raise WorkflowError( "Error validating row {} of data frame.".format(i), e) if set_default: newdata = pd.DataFrame(recordlist, data.index) newcol = ~newdata.columns.isin(data.columns) n = len(data.columns) for col in newdata.loc[:, newcol].columns: data.insert(n, col, newdata.loc[:, col]) n = n + 1 return except ImportError: pass raise WorkflowError("Unsupported data type for validation.") else: try: if set_default: DefaultValidator(schema, resolver=resolver).validate(data) else: jsonschema.validate(data, schema, resolver=resolver) except jsonschema.exceptions.ValidationError as e: raise WorkflowError("Error validating config file.", e)
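# validate() above fills in defaults from the schema when set_default=True,
# using the "extend_with_default" pattern from the jsonschema FAQ that the
# function itself references. A small runnable illustration of that behaviour
# with jsonschema alone; the schema and keys are invented for the example.
import jsonschema
from jsonschema import validators

def extend_with_default(validator_class):
    validate_properties = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        for prop, subschema in properties.items():
            if "default" in subschema:
                instance.setdefault(prop, subschema["default"])
        for error in validate_properties(validator, properties, instance, schema):
            yield error

    return validators.extend(validator_class, {"properties": set_defaults})

schema = {"type": "object",
          "properties": {"threads": {"type": "integer", "default": 4}}}
DefaultValidator = extend_with_default(jsonschema.Draft7Validator)
config = {}
DefaultValidator(schema).validate(config)
print(config)  # {'threads': 4}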
def decorate(ruleinfo): if ruleinfo.wildcard_constraints: rule.set_wildcard_constraints( *ruleinfo.wildcard_constraints[0], **ruleinfo.wildcard_constraints[1]) if ruleinfo.input: rule.set_input(*ruleinfo.input[0], **ruleinfo.input[1]) if ruleinfo.output: rule.set_output(*ruleinfo.output[0], **ruleinfo.output[1]) if ruleinfo.params: rule.set_params(*ruleinfo.params[0], **ruleinfo.params[1]) # handle default resources if self.default_resources is not None: rule.resources = copy.deepcopy(self.default_resources.parsed) if ruleinfo.threads is not None: if (not isinstance(ruleinfo.threads, int) and not isinstance(ruleinfo.threads, float) and not callable(ruleinfo.threads)): raise RuleException( "Threads value has to be an integer, float, or a callable.", rule=rule, ) if name in self.overwrite_threads: rule.resources["_cores"] = self.overwrite_threads[name] else: rule.resources["_cores"] = int(ruleinfo.threads) if ruleinfo.shadow_depth: if ruleinfo.shadow_depth not in (True, "shallow", "full", "minimal"): raise RuleException( "Shadow must either be 'minimal', 'shallow', 'full', " "or True (equivalent to 'full')", rule=rule, ) if ruleinfo.shadow_depth is True: rule.shadow_depth = "full" logger.warning( "Shadow is set to True in rule {} (equivalent to 'full'). It's encouraged to use the more explicit options 'minimal|shallow|full' instead." .format(rule)) else: rule.shadow_depth = ruleinfo.shadow_depth if ruleinfo.resources: args, resources = ruleinfo.resources if args: raise RuleException("Resources have to be named.") if not all( map(lambda r: isinstance(r, int) or callable(r), resources.values())): raise RuleException( "Resources values have to be integers or callables", rule=rule) rule.resources.update(resources) if ruleinfo.priority: if not isinstance(ruleinfo.priority, int) and not isinstance( ruleinfo.priority, float): raise RuleException("Priority values have to be numeric.", rule=rule) rule.priority = ruleinfo.priority if ruleinfo.version: rule.version = ruleinfo.version if ruleinfo.log: rule.set_log(*ruleinfo.log[0], **ruleinfo.log[1]) if ruleinfo.message: rule.message = ruleinfo.message if ruleinfo.benchmark: rule.benchmark = ruleinfo.benchmark if not self.run_local and ruleinfo.group is not None: rule.group = ruleinfo.group if ruleinfo.wrapper: if self.use_conda: rule.conda_env = snakemake.wrapper.get_conda_env( ruleinfo.wrapper, prefix=self.wrapper_prefix) # TODO retrieve suitable singularity image if self.use_env_modules and ruleinfo.env_modules: # If using environment modules and they are defined for the rule, # ignore conda and singularity directive below. # The reason is that this is likely intended in order to use # a software stack specifically compiled for a particular # HPC cluster. 
invalid_rule = not (ruleinfo.script or ruleinfo.wrapper or ruleinfo.shellcmd or ruleinfo.notebook) if invalid_rule: raise RuleException( "Modules directive is only allowed with " "shell, script, notebook, or wrapper directives (not with run)", rule=rule, ) from snakemake.deployment.env_modules import EnvModules rule.env_modules = EnvModules(*ruleinfo.env_modules) else: if ruleinfo.conda_env and self.use_conda: if not (ruleinfo.script or ruleinfo.wrapper or ruleinfo.shellcmd or ruleinfo.notebook): raise RuleException( "Conda environments are only allowed " "with shell, script, notebook, or wrapper directives " "(not with run).", rule=rule, ) if not (urllib.parse.urlparse(ruleinfo.conda_env).scheme or os.path.isabs(ruleinfo.conda_env)): ruleinfo.conda_env = os.path.join( self.current_basedir, ruleinfo.conda_env) rule.conda_env = ruleinfo.conda_env if self.use_singularity: invalid_rule = not (ruleinfo.script or ruleinfo.wrapper or ruleinfo.shellcmd or ruleinfo.notebook) if ruleinfo.singularity_img: if invalid_rule: raise RuleException( "Singularity directive is only allowed " "with shell, script, notebook or wrapper directives " "(not with run).", rule=rule, ) rule.singularity_img = ruleinfo.singularity_img elif self.global_singularity_img: if not invalid_rule: # skip rules with run directive rule.singularity_img = self.global_singularity_img rule.norun = ruleinfo.norun rule.docstring = ruleinfo.docstring rule.run_func = ruleinfo.func rule.shellcmd = ruleinfo.shellcmd rule.script = ruleinfo.script rule.notebook = ruleinfo.notebook rule.wrapper = ruleinfo.wrapper rule.cwl = ruleinfo.cwl rule.restart_times = self.restart_times rule.basedir = self.current_basedir ruleinfo.func.__name__ = "__{}".format(rule.name) self.globals[ruleinfo.func.__name__] = ruleinfo.func setattr(rules, rule.name, RuleProxy(rule)) if checkpoint: checkpoints.register(rule) return ruleinfo.func
def auto_report(dag, path): try: from jinja2 import Template, Environment, PackageLoader except ImportError as e: raise WorkflowError( "Python package jinja2 must be installed to create reports." ) if not path.endswith(".html"): raise WorkflowError("Report file does not end with .html") logger.info("Creating report...") env = Environment( loader=PackageLoader("snakemake", "report"), trim_blocks=True, lstrip_blocks=True, ) env.filters["get_resource_as_string"] = get_resource_as_string persistence = dag.workflow.persistence results = defaultdict(list) records = defaultdict(JobRecord) recorded_files = set() for job in dag.jobs: for f in itertools.chain(job.expanded_output, job.input): if is_flagged(f, "report") and f not in recorded_files: if not f.exists: raise WorkflowError( "File {} marked for report but does " "not exist.".format(f) ) if os.path.isfile(f): report_obj = get_flag_value(f, "report") category = Category(report_obj.category) results[category].append( FileRecord(f, job, report_obj.caption, env, category) ) recorded_files.add(f) for f in job.expanded_output: meta = persistence.metadata(f) if not meta: logger.warning( "Missing metadata for file {}. Maybe metadata " "was deleted or it was created using an older " "version of Snakemake. This is a non critical " "warning.".format(f) ) continue try: job_hash = meta["job_hash"] rule = meta["rule"] rec = records[(job_hash, rule)] rec.rule = rule rec.job = job rec.starttime = min(rec.starttime, meta["starttime"]) rec.endtime = max(rec.endtime, meta["endtime"]) rec.conda_env_file = None rec.conda_env = meta["conda_env"] rec.singularity_img_url = meta["singularity_img_url"] rec.output.append(f) except KeyError as e: print(e) logger.warning( "Metadata for file {} was created with a too " "old Snakemake version.".format(f) ) for catresults in results.values(): catresults.sort(key=lambda res: res.name) # prepare runtimes runtimes = [ {"rule": rec.rule, "runtime": rec.endtime - rec.starttime} for rec in sorted(records.values(), key=lambda rec: rec.rule) ] # prepare end times timeline = [ { "rule": rec.rule, "starttime": datetime.datetime.fromtimestamp(rec.starttime).isoformat(), "endtime": datetime.datetime.fromtimestamp(rec.endtime).isoformat(), } for rec in sorted(records.values(), key=lambda rec: rec.rule) ] # prepare per-rule information rules = defaultdict(list) for rec in records.values(): rule = RuleRecord(rec.job, rec) if rec.rule not in rules: rules[rec.rule].append(rule) else: merged = False for other in rules[rec.rule]: if rule == other: other.add(rec) merged = True break if not merged: rules[rec.rule].append(rule) # rulegraph rulegraph, xmax, ymax = rulegraph_d3_spec(dag) # configfiles configfiles = [ConfigfileRecord(f) for f in dag.workflow.configfiles] seen = set() files = [ seen.add(res.target) or res for cat in results.values() for res in cat if res.target not in seen ] rst_links = textwrap.dedent( """ .. _Results: #results .. _Rules: #rules .. _Statistics: #stats {% for cat, catresults in categories|dictsort %} .. _{{ cat.name }}: #{{ cat.id }} {% for res in files %} .. _{{ res.target }}: #{{ res.id }} {% endfor %} {% endfor %} .. 
_ """ ) for cat, catresults in results.items(): for res in catresults: res.render(env, rst_links, results, files) # global description text = "" if dag.workflow.report_text: with open(dag.workflow.report_text) as f: class Snakemake: config = dag.workflow.config text = f.read() + rst_links text = publish_parts( env.from_string(text).render( snakemake=Snakemake, categories=results, files=files ), writer_name="html", )["body"] # record time now = "{} {}".format(datetime.datetime.now().ctime(), time.tzname[0]) results_size = sum(res.size for cat in results.values() for res in cat) try: from pygments.formatters import HtmlFormatter except ImportError: raise WorkflowError( "Python package pygments must be installed to create reports." ) # render HTML template = env.get_template("report.html") with open(path, "w", encoding="utf-8") as out: out.write( template.render( results=results, results_size=results_size, configfiles=configfiles, text=text, rulegraph_nodes=rulegraph["nodes"], rulegraph_links=rulegraph["links"], rulegraph_width=xmax + 20, rulegraph_height=ymax + 20, runtimes=runtimes, timeline=timeline, rules=[rec for recs in rules.values() for rec in recs], version=__version__, now=now, pygments_css=HtmlFormatter(style="trac").get_style_defs(".source"), ) ) logger.info("Report created.")
def __init__(self, path, job, caption, env, category): self.path = path self.target = os.path.basename(path) self.size = os.path.getsize(self.path) logger.info("Adding {} ({:.2g} MB).".format(self.name, self.size / 1e6)) self.raw_caption = caption self.mime, _ = mime_from_file(self.path) self.id = uuid.uuid4() self.job = job self.wildcards = logging.format_wildcards(job.wildcards) self.params = logging.format_dict(job.params) self.png_uri = None self.category = category if self.is_img: convert = shutil.which("convert") if convert is not None: try: # 2048 aims at a reasonable balance between what displays # can show in a png-preview image and what renders quick # into a small enough png max_width = "2048" max_height = "2048" # '>' means only larger images scaled down to within max-dimensions max_spec = max_width + "x" + max_height + ">" png = sp.check_output( ["convert", "-resize", max_spec, self.path, "png:-"], stderr=sp.PIPE, ) uri = data_uri( png, os.path.basename(self.path) + ".png", mime="image/png" ) self.png_uri = uri except sp.CalledProcessError as e: logger.warning( "Failed to convert image to png with " "imagemagick convert: {}".format(e.stderr) ) else: logger.warning( "Command convert not in $PATH. Install " "imagemagick in order to have embedded " "images and pdfs in the report." ) if self.is_table: if self.size > 1e6: logger.warning( "Table {} >1MB. Rendering as generic file.".format(self.path) ) else: with open(self.path) as table: dialect = None for prefix in range(10, 17): try: table.seek(0) dialect = csv.Sniffer().sniff(table.read(prefix)) break except csv.Error: pass except UnicodeDecodeError: # table is not readable as UTF-8 break if dialect is None: logger.warning( "Failed to infer CSV/TSV dialect from table {}. " "Rendering as generic file.".format(self.path) ) else: table.seek(0) reader = csv.reader(table, dialect) columns = next(reader) table = map(lambda row: list(map(num_if_possible, row)), reader) template = env.get_template("table.html") html = template.render( columns=columns, table=table, name=self.name ).encode() self.mime = "text/html" self.path = os.path.basename(self.path) + ".html" self.data_uri = data_uri(html, self.path, mime=self.mime) return # fallback self.data_uri = data_uri_from_file(path)
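# The PNG thumbnail produced by `convert` above is embedded into the report as
# a data URI. A minimal sketch of that embedding step; the real code uses its
# own data_uri() helper, this only shows the base64 idea with invented input.
import base64

def simple_data_uri(payload: bytes, mime: str) -> str:
    return "data:{};base64,{}".format(mime, base64.b64encode(payload).decode())

print(simple_data_uri(b"\x89PNG...", "image/png")[:40])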
def fetch_from_ncbi(self, accession_list, destination_dir, force_overwrite=False, rettype="fasta", retmode="text", file_ext=None, combined_file_prefix=None, remove_separate_files=False, chunk_size=1, db="nuccore", **kwargs): """ This function downloads and saves files from NCBI. Adapted in part from the BSD-licensed code here: https://github.com/broadinstitute/viral-ngs/blob/master/util/genbank.py """ max_chunk_size = 500 # Conform to NCBI retreival guidelines by chunking into 500-accession chunks if # >500 accessions are specified and chunk_size is set to 1 # Also clamp chunk size to 500 if the user specified a larger value. if chunk_size > max_chunk_size or (len(accession_list) > max_chunk_size and chunk_size == 1): chunk_size = max_chunk_size outEx = {"fasta": "fasta", "ft": "tbl", "gb": "gbk"} output_directory = os.path.abspath(os.path.expanduser(destination_dir)) if not os.path.exists(output_directory): os.makedirs(output_directory) output_extension = str(file_ext) # ensure the extension starts with a ".", also allowing for passed-in # extensions that already have it if output_extension[:1] != ".": output_extension = "." + output_extension logger.info("Fetching {} entries from NCBI: {}\n".format( str(len(accession_list)), ", ".join(accession_list[:10]))) output_files = [] for chunk_num, chunk in enumerate( self._seq_chunks(accession_list, chunk_size)): # sleep to throttle requests to 2 per second per NCBI guidelines: # https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen time.sleep(0.5) acc_string = ",".join(chunk) # if the filename would be longer than Linux allows, simply say "chunk-chunk_num" if len(acc_string) + len(output_extension) <= 254: output_file_path = os.path.join(output_directory, acc_string + output_extension) else: output_file_path = os.path.join( output_directory, "chunk-{}".format(chunk_num) + output_extension) if not force_overwrite: logger.info("not overwriting, checking for existence") assert not os.path.exists(output_file_path), ( """File %s already exists. Consider removing this file or specifying a different output directory. The files for the accessions specified can be overwritten if you add force_overwrite flag. Processing aborted.""" % output_file_path) try_count = 1 while True: try: logger.info("Fetching file {}: {}, try #{}".format( chunk_num + 1, acc_string, try_count)) handle = self.entrez.efetch(db=db, rettype=rettype, retmode=retmode, id=acc_string, **kwargs) with open(output_file_path, "w") as outf: for line in handle: outf.write(line) output_files.append(output_file_path) except IOError: logger.warning( "Error fetching file {}: {}, try #{} probably because NCBI is too busy." .format(chunk_num + 1, acc_string, try_count)) try_count += 1 if try_count > 4: logger.warning("Tried too many times. Aborting.") raise # if the fetch failed, wait a few seconds and try again. logger.info("Waiting and retrying...") time.sleep(2) continue break # assert that we are not trying to remove the intermediate files without writing a combined file if remove_separate_files: assert combined_file_prefix, """The intermediate files can only be removed if a combined file is written via combined_file_prefix""" # build a path to the combined genome file if combined_file_prefix: concatenated_genome_file_path = os.path.join( output_directory, combined_file_prefix + output_extension) if not force_overwrite: assert not os.path.exists(concatenated_genome_file_path), ( """File %s already exists. 
Consider removing this file or specifying a different output directory. The files for the accessions specified can be overwritten if you add force_overwrite flag. Processing aborted.""" % output_file_path) # concatenate the files together into one genome file with open(concatenated_genome_file_path, "w") as outfile: for file_path in output_files: with open(file_path) as infile: for line in infile: outfile.write(line) # if the option is specified, remove the intermediate fasta files if remove_separate_files: while len(output_files) > 0: os.unlink(output_files.pop()) # add the combined file to the list of files returned output_files.append(concatenated_genome_file_path) # return list of files return output_files
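# fetch_from_ncbi() above batches accessions into chunks of at most 500 to
# follow NCBI retrieval guidelines. A self-contained sketch of that chunking;
# the helper _seq_chunks is assumed from context, this is a generic equivalent
# with invented accession names.
def chunks(items, size):
    for i in range(0, len(items), size):
        yield items[i:i + size]

accessions = ["ACC{:04d}".format(i) for i in range(1, 1202)]
sizes = [len(c) for c in chunks(accessions, 500)]
print(sizes)  # [500, 500, 201]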
def create(self, dryrun=False): """Create the conda enviroment.""" from snakemake.shell import shell self.check_is_file_based() # Read env file and create hash. env_file = self.file deploy_file = None pin_file = None tmp_env_file = None tmp_deploy_file = None tmp_pin_file = None if not isinstance(env_file, LocalSourceFile) or isinstance( env_file, LocalGitFile): with tempfile.NamedTemporaryFile(delete=False, suffix=".yaml") as tmp: # write to temp file such that conda can open it tmp.write(self.content) env_file = tmp.name tmp_env_file = tmp.name if self.post_deploy_file: with tempfile.NamedTemporaryFile( delete=False, suffix=".post-deploy.sh") as tmp: # write to temp file such that conda can open it tmp.write(self.content_deploy) deploy_file = tmp.name tmp_deploy_file = tmp.name if self.pin_file: with tempfile.NamedTemporaryFile(delete=False, suffix="pin.txt") as tmp: tmp.write(self.content_pin) pin_file = tmp.name tmp_pin_file = tmp.name else: env_file = env_file.get_path_or_uri() deploy_file = self.post_deploy_file pin_file = self.pin_file env_path = self.address if self.is_containerized: if not dryrun: try: shell.check_output( singularity.shellcmd( self._container_img.path, "[ -d '{}' ]".format(env_path), args=self._singularity_args, envvars=self.get_singularity_envvars(), quiet=True, ), stderr=subprocess.PIPE, ) except subprocess.CalledProcessError as e: raise WorkflowError( "Unable to find environment in container image. " "Maybe a conda environment was modified without containerizing again " "(see snakemake --containerize)?\nDetails:\n{}\n{}". format(e, e.stderr.decode())) return env_path else: # env should be present in the container return env_path # Check for broken environment if os.path.exists(os.path.join( env_path, "env_setup_start")) and not os.path.exists( os.path.join(env_path, "env_setup_done")): if dryrun: logger.info( "Incomplete Conda environment {} will be recreated.". format(self.file.simplify_path())) else: logger.info( "Removing incomplete Conda environment {}...".format( self.file.simplify_path())) shutil.rmtree(env_path, ignore_errors=True) # Create environment if not already present. if not os.path.exists(env_path): if dryrun: logger.info("Conda environment {} will be created.".format( self.file.simplify_path())) return env_path logger.info("Creating conda environment {}...".format( self.file.simplify_path())) env_archive = self.archive_file try: # Touch "start" flag file os.makedirs(env_path, exist_ok=True) with open(os.path.join(env_path, "env_setup_start"), "a") as f: pass # Check if env archive exists. Use that if present. 
if os.path.exists(env_archive): logger.info("Installing archived conda packages.") pkg_list = os.path.join(env_archive, "packages.txt") if os.path.exists(pkg_list): # read pacakges in correct order # this is for newer env archives where the package list # was stored packages = [ os.path.join(env_archive, pkg.rstrip()) for pkg in open(pkg_list) ] else: # guess order packages = glob(os.path.join(env_archive, "*.tar.bz2")) # install packages manually from env archive cmd = " ".join([ "conda", "create", "--quiet", "--yes", "--prefix '{}'".format(env_path), ] + packages) if self._container_img: cmd = singularity.shellcmd( self._container_img.path, cmd, args=self._singularity_args, envvars=self.get_singularity_envvars(), ) out = shell.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True) else: def create_env(env_file, filetype="yaml"): # Copy env file to env_path (because they can be on # different volumes and singularity should only mount one). # In addition, this allows to immediately see what an # environment in .snakemake/conda contains. target_env_file = env_path + f".{filetype}" shutil.copy(env_file, target_env_file) logger.info( "Downloading and installing remote packages.") strict_priority = ([ "conda config --set channel_priority strict &&" ] if self._container_img else []) subcommand = [self.frontend] yes_flag = ["--yes"] if filetype == "yaml": subcommand.append("env") yes_flag = [] cmd = (strict_priority + subcommand + [ "create", "--quiet", '--file "{}"'.format(target_env_file), '--prefix "{}"'.format(env_path), ] + yes_flag) cmd = " ".join(cmd) if self._container_img: cmd = singularity.shellcmd( self._container_img.path, cmd, args=self._singularity_args, envvars=self.get_singularity_envvars(), ) out = shell.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True) # cleanup if requested if self._cleanup is CondaCleanupMode.tarballs: logger.info("Cleaning up conda package tarballs.") shell.check_output("conda clean -y --tarballs") elif self._cleanup is CondaCleanupMode.cache: logger.info( "Cleaning up conda package tarballs and package cache." ) shell.check_output( "conda clean -y --tarballs --packages") return out if pin_file is not None: try: logger.info( f"Using pinnings from {self.pin_file.get_path_or_uri()}." ) out = create_env(pin_file, filetype="pin.txt") except subprocess.CalledProcessError as e: # remove potential partially installed environment shutil.rmtree(env_path, ignore_errors=True) advice = "" if isinstance(self.file, LocalSourceFile): advice = ( " If that works, make sure to update the pin file with " f"'snakedeploy pin-conda-env {self.file.get_path_or_uri()}'." ) logger.warning( f"Failed to install conda environment from pin file ({self.pin_file.get_path_or_uri()}). 
" f"Trying regular environment definition file.{advice}" ) out = create_env(env_file, filetype="yaml") else: out = create_env(env_file, filetype="yaml") # Execute post-deplay script if present if deploy_file: target_deploy_file = env_path + ".post-deploy.sh" shutil.copy(deploy_file, target_deploy_file) self.execute_deployment_script(env_file, target_deploy_file) # Touch "done" flag file with open(os.path.join(env_path, "env_setup_done"), "a") as f: pass logger.debug(out) logger.info( f"Environment for {self.file.get_path_or_uri()} created (location: {os.path.relpath(env_path)})" ) except subprocess.CalledProcessError as e: # remove potential partially installed environment shutil.rmtree(env_path, ignore_errors=True) raise CreateCondaEnvironmentException( f"Could not create conda environment from {env_file}:\nCommand:\n{e.cmd}\nOutput:\n{e.output}" ) if tmp_env_file: # temporary file was created os.remove(tmp_env_file) if tmp_deploy_file: os.remove(tmp_deploy_file) return env_path
if jobid is not None: with cls._lock: del cls._processes[jobid] if retcode: raise sp.CalledProcessError(retcode, cmd) return ret @staticmethod def iter_stdout(proc, cmd): for l in proc.stdout: yield l[:-1] retcode = proc.wait() if retcode: raise sp.CalledProcessError(retcode, cmd) # set bash as default shell on POSIX-compatible OS if os.name == "posix": if not shutil.which("bash"): logger.warning("Cannot set bash as default shell because it is not " "available in your PATH. Falling back to sh.") if not shutil.which("sh"): logger.warning("Cannot fall back to sh because it does not seem to be " "available on this system. Using whatever is " "defined as the default.") else: shell.executable("sh") else: shell.executable("bash")
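# The module-level block above probes the PATH for bash and falls back to sh.
# A self-contained sketch of the same probing pattern; the candidate list and
# warning text are illustrative.
import os
import shutil
import logging

def pick_default_shell(candidates=("bash", "sh")):
    """Return the first available shell from candidates, or None if none is found."""
    if os.name != "posix":
        return None
    for shell_name in candidates:
        if shutil.which(shell_name):
            return shell_name
        logging.warning("%s not found in PATH, trying next candidate.", shell_name)
    return None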
def _set_inoutput_item(self, item, output=False, name=None): """ Set an item to be input or output. Arguments item -- the item inoutput -- a Namedlist of either input or output items name -- an optional name for the item """ inoutput = self.output if output else self.input # Check to see if the item is a path, if so, just make it a string if isinstance(item, Path): item = str(item) if isinstance(item, str): if ON_WINDOWS: if isinstance(item, (_IOFile, AnnotatedString)): item = item.new_from(item.replace(os.sep, os.altsep)) else: item = item.replace(os.sep, os.altsep) rule_dependency = None if isinstance(item, _IOFile) and item.rule and item in item.rule.output: rule_dependency = item.rule item = self.apply_path_modifier( item, property="output" if output else "input" ) # Check to see that all flags are valid # Note that "remote", "dynamic", and "expand" are valid for both inputs and outputs. if isinstance(item, AnnotatedString): for flag in item.flags: if not output and flag in [ "protected", "temp", "temporary", "directory", "touch", "pipe", ]: logger.warning( "The flag '{}' used in rule {} is only valid for outputs, not inputs.".format( flag, self ) ) if output and flag in ["ancient"]: logger.warning( "The flag '{}' used in rule {} is only valid for inputs, not outputs.".format( flag, self ) ) # add the rule to the dependencies if rule_dependency is not None: self.dependencies[item] = rule_dependency if output: item = self._update_item_wildcard_constraints(item) else: if ( contains_wildcard_constraints(item) and self.workflow.mode != Mode.subprocess ): logger.warning( "Wildcard constraints in inputs are ignored. (rule: {})".format( self ) ) if self.workflow.all_temp and output: # mark as temp if all output files shall be marked as temp item = snakemake.io.flag(item, "temp") # record rule if this is an output file output _item = IOFile(item, rule=self) if is_flagged(item, "temp"): if output: self.temp_output.add(_item) if is_flagged(item, "protected"): if output: self.protected_output.add(_item) if is_flagged(item, "touch"): if output: self.touch_output.add(_item) if is_flagged(item, "dynamic"): if output: self.dynamic_output.add(_item) else: self.dynamic_input.add(_item) if is_flagged(item, "report"): report_obj = item.flags["report"] if report_obj.caption is not None: r = ReportObject( self.workflow.current_basedir.join(report_obj.caption), report_obj.category, report_obj.subcategory, report_obj.patterns, report_obj.htmlindex, ) item.flags["report"] = r if is_flagged(item, "subworkflow"): if output: raise SyntaxError("Only input files may refer to a subworkflow") else: # record the workflow this item comes from sub = item.flags["subworkflow"] if _item in self.subworkflow_input: other = self.subworkflow_input[_item] if sub != other: raise WorkflowError( "The input file {} is ambiguously " "associated with two subworkflows " "{} and {}.".format(item, sub, other), rule=self, ) self.subworkflow_input[_item] = sub inoutput.append(_item) if name: inoutput._add_name(name) elif callable(item): if output: raise SyntaxError("Only input files can be specified as functions") inoutput.append(item) if name: inoutput._add_name(name) else: try: start = len(inoutput) for i in item: self._set_inoutput_item(i, output=output) if name: # if the list was named, make it accessible inoutput._set_name(name, start, end=len(inoutput)) except TypeError: raise SyntaxError( "Input and output files have to be specified as strings or lists of strings." )
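# _set_inoutput_item() warns when a flag that only makes sense for outputs
# (protected, temp, temporary, directory, touch, pipe) is attached to an input,
# and when the input-only flag "ancient" is attached to an output. A compact
# sketch of that validation on plain data; the flag sets mirror the ones
# checked above, while the helper itself is illustrative.
import logging

OUTPUT_ONLY_FLAGS = {"protected", "temp", "temporary", "directory", "touch", "pipe"}
INPUT_ONLY_FLAGS = {"ancient"}

def warn_misplaced_flags(flags, is_output, rule_name):
    """Warn about flags that are not valid for the given item kind."""
    misplaced = (flags & INPUT_ONLY_FLAGS) if is_output else (flags & OUTPUT_ONLY_FLAGS)
    for flag in sorted(misplaced):
        kind = "outputs" if flag in OUTPUT_ONLY_FLAGS else "inputs"
        logging.warning("The flag '%s' used in rule %s is only valid for %s.",
                        flag, rule_name, kind)

# Example: warn_misplaced_flags({"temp"}, is_output=False, rule_name="map_reads")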
# library preparation kit specific configuration libprep_fn = srcdir("libprep.config") with open(libprep_fn) as fh: LIBPREP_CONF = yaml.load(fh, Loader=Loader) or {} kit = config.get("libprepkit") if kit is not None: if len(config["read_geometry"]) > 1: kit += " PE" else: kit += " SE" if kit in LIBPREP_CONF: # overwrite default config update_config(CONF, LIBPREP_CONF[kit]) else: if kit is None: logger.warning("Running without LIBPREPKIT defined!") else: logger.warning("`{}` is not a valid libprepkit name".format(kit)) sys.exit() # update config (config.yaml). Does not update if key exists update_config2(config, CONF) # update read geometry with delta_readlen if 'delta_readlen' in config and 'read_geometry' in config: read_geometry = config["read_geometry"] for i, val in enumerate(config['delta_readlen']): read_geometry[i] = int(read_geometry[i]) + int(val) config["read_geometry"] = read_geometry
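# The snippet above layers a kit-specific configuration over the defaults via
# update_config()/update_config2(). A minimal recursive merge illustrating the
# idea; this is a sketch and not necessarily the exact precedence rules of
# those helpers.
def merge_config(base, overrides):
    """Recursively merge overrides into base, returning base."""
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            merge_config(base[key], value)
        else:
            base[key] = value
    return base

# Example: merge_config({"read_geometry": [75, 75]}, {"read_geometry": [100, 100]})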
def job_selector_ilp(self, jobs): """ Job scheduling by optimization of resource usage by solving ILP using pulp """ import pulp from pulp import lpSum logger.info("Select jobs to execute...") # assert self.resources["_cores"] > 0 scheduled_jobs = { job: pulp.LpVariable( "job_{}".format(idx), lowBound=0, upBound=1, cat=pulp.LpInteger, ) for idx, job in enumerate(jobs) } size_gb = lambda f: f.size / 1e9 temp_files = { temp_file for job in jobs for temp_file in self.dag.temp_input(job) } temp_job_improvement = { temp_file: pulp.LpVariable( "temp_file_{}".format(idx), lowBound=0, upBound=1, cat="Continuous" ) for idx, temp_file in enumerate(temp_files) } temp_file_deletable = { temp_file: pulp.LpVariable( "deletable_{}".format(idx), lowBound=0, upBound=1, cat=pulp.LpInteger, ) for idx, temp_file in enumerate(temp_files) } prob = pulp.LpProblem("JobScheduler", pulp.LpMaximize) total_temp_size = max(sum([size_gb(temp_file) for temp_file in temp_files]), 1) total_core_requirement = sum( [max(job.resources.get("_cores", 1), 1) for job in jobs] ) # Objective function # Job priority > Core load # Core load > temp file removal # Instant removal > temp size prob += ( 2 * total_core_requirement * 2 * total_temp_size * lpSum([job.priority * scheduled_jobs[job] for job in jobs]) + 2 * total_temp_size * lpSum( [ max(job.resources.get("_cores", 1), 1) * scheduled_jobs[job] for job in jobs ] ) + total_temp_size * lpSum( [ temp_file_deletable[temp_file] * size_gb(temp_file) for temp_file in temp_files ] ) + lpSum( [ temp_job_improvement[temp_file] * size_gb(temp_file) for temp_file in temp_files ] ) ) # Constraints: for name in self.workflow.global_resources: prob += ( lpSum( [scheduled_jobs[job] * job.resources.get(name, 0) for job in jobs] ) <= self.resources[name] ) # Choose jobs that lead to "fastest" (minimum steps) removal of existing temp file remaining_jobs = self.remaining_jobs for temp_file in temp_files: prob += temp_job_improvement[temp_file] <= lpSum( [ scheduled_jobs[job] * self.required_by_job(temp_file, job) for job in jobs ] ) / lpSum([self.required_by_job(temp_file, job) for job in remaining_jobs]) prob += temp_file_deletable[temp_file] <= temp_job_improvement[temp_file] solver = ( pulp.get_solver(self.scheduler_ilp_solver) if self.scheduler_ilp_solver else pulp.apis.LpSolverDefault ) solver.msg = self.workflow.verbose # disable extensive logging try: prob.solve(solver) except pulp.apis.core.PulpSolverError as e: logger.warning( "Failed to solve scheduling problem with ILP solver. Falling back to greedy solver. " "Run Snakemake with --verbose to see the full solver output for debugging the problem." ) return self.job_selector_greedy(jobs) selected_jobs = set( job for job, variable in scheduled_jobs.items() if variable.value() == 1.0 ) if not selected_jobs: logger.warning( "Failed to solve scheduling problem with ILP solver. Falling back to greedy solver." "Run Snakemake with --verbose to see the full solver output for debugging the problem." ) return self.job_selector_greedy(jobs) for name in self.workflow.global_resources: self.resources[name] -= sum( [job.resources.get(name, 0) for job in selected_jobs] ) return selected_jobs
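# job_selector_ilp() encodes "which ready jobs should start now" as a 0/1
# integer program: one binary variable per job, one constraint per global
# resource, and an objective that favours high-priority jobs, core usage, and
# freeing temp files. A toy, self-contained version of that encoding with
# made-up job weights and a single core limit; all names and numbers below are
# illustrative.
import pulp
from pulp import lpSum

jobs = {"align": (4, 10), "sort": (2, 5), "qc": (1, 1)}  # name: (cores, priority)
available_cores = 4

selected = {
    name: pulp.LpVariable("job_{}".format(name), lowBound=0, upBound=1, cat=pulp.LpInteger)
    for name in jobs
}

prob = pulp.LpProblem("ToyJobScheduler", pulp.LpMaximize)
# objective: maximize the total priority of the selected jobs
prob += lpSum([priority * selected[name] for name, (cores, priority) in jobs.items()])
# constraint: selected jobs must not exceed the available cores
prob += lpSum([cores * selected[name] for name, (cores, priority) in jobs.items()]) <= available_cores

prob.solve()
chosen = [name for name, var in selected.items() if var.value() == 1.0]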
def check_localrules(self): undefined = self._localrules - set(rule.name for rule in self.rules) if undefined: logger.warning("localrules directive specifies rules that are not " "present in the Snakefile:\n{}\n".format( "\n".join(map("\t{}".format, undefined))))
def prepare(self): """ Prepare execution of job. This includes creation of directories and deletion of previously created dynamic files. Creates a shadow directory for the job if specified. """ self.check_protected_output() unexpected_output = self.dag.reason(self).missing_output.intersection( self.existing_output) if unexpected_output: logger.warning( "Warning: the following output files of rule {} were not " "present when the DAG was created:\n{}".format( self.rule, unexpected_output)) self.remove_existing_output() for f, f_ in zip(self.output, self.rule.output): f.prepare() self.download_remote_input() for f in self.log: f.prepare() if self.benchmark: self.benchmark.prepare() if not self.is_shadow: return # Create shadow directory structure self.shadow_dir = tempfile.mkdtemp( dir=self.rule.workflow.persistence.shadow_path) cwd = os.getcwd() if self.rule.shadow_depth == "minimal": # Re-create the directory structure in the shadow directory for (f, d) in set([(item, os.path.dirname(item)) for sublist in [self.input, self.output, self.log] if sublist is not None for item in sublist]): if d and not os.path.isabs(d): rel_path = os.path.relpath(d) # Only create subdirectories if not rel_path.split(os.path.sep)[0] == "..": os.makedirs(os.path.join(self.shadow_dir, rel_path), exist_ok=True) else: raise RuleException( "The following file name references a parent directory relative to your workdir.\n" "This isn't supported for shadow: \"minimal\". Consider using an absolute path instead.\n{}" .format(f), rule=self.rule) # Symlink the input files for rel_path in set([ os.path.relpath(f) for f in self.input if not os.path.isabs(f) ]): link = os.path.join(self.shadow_dir, rel_path) original = os.path.relpath(rel_path, os.path.dirname(link)) os.symlink(original, link) # Shallow simply symlink everything in the working directory. elif self.rule.shadow_depth == "shallow": for source in os.listdir(cwd): link = os.path.join(self.shadow_dir, source) os.symlink(os.path.abspath(source), link) elif self.rule.shadow_depth == "full": snakemake_dir = os.path.join(cwd, ".snakemake") for dirpath, dirnames, filenames in os.walk(cwd): # Must exclude .snakemake and its children to avoid infinite # loop of symlinks. if os.path.commonprefix([snakemake_dir, dirpath]) == snakemake_dir: continue for dirname in dirnames: if dirname == ".snakemake": continue relative_source = os.path.relpath( os.path.join(dirpath, dirname)) shadow = os.path.join(self.shadow_dir, relative_source) os.mkdir(shadow) for filename in filenames: source = os.path.join(dirpath, filename) relative_source = os.path.relpath(source) link = os.path.join(self.shadow_dir, relative_source) os.symlink(source, link)
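# For shadow: "minimal" the prepare() above rebuilds only the relative
# directory structure of the job's files inside a fresh temporary directory
# and symlinks the inputs into it. A stripped-down sketch of that layout step;
# it omits the parent-directory check the real code performs, and the paths
# are illustrative.
import os
import tempfile

def make_minimal_shadow(input_files, shadow_root=None):
    """Symlink relative input paths into a new shadow directory and return it."""
    shadow_dir = tempfile.mkdtemp(dir=shadow_root)
    for rel_path in {os.path.relpath(f) for f in input_files if not os.path.isabs(f)}:
        link = os.path.join(shadow_dir, rel_path)
        os.makedirs(os.path.dirname(link), exist_ok=True)
        # symlink target is expressed relative to the link location, as above
        os.symlink(os.path.relpath(rel_path, os.path.dirname(link)), link)
    return shadow_dir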
def script( path, basedir, input, output, params, wildcards, threads, resources, log, config, rulename, conda_env, singularity_img, singularity_args, bench_record, jobid, bench_iteration, shadow_dir, ): """ Load a script from the given basedir + path and execute it. Supports Python 3 and R. """ f = None try: path, source, language = get_source(path, basedir) if language == "python": wrapper_path = path[7:] if path.startswith("file://") else path snakemake = Snakemake( input, output, params, wildcards, threads, resources, log, config, rulename, bench_iteration, os.path.dirname(wrapper_path), ) snakemake = pickle.dumps(snakemake) # Obtain search path for current snakemake module. # The module is needed for unpickling in the script. # We append it at the end (as a fallback). searchpath = SNAKEMAKE_SEARCHPATH if singularity_img is not None: searchpath = singularity.SNAKEMAKE_MOUNTPOINT searchpath = '"{}"'.format(searchpath) # For local scripts, add their location to the path in case they use path-based imports if path.startswith("file://"): searchpath += ', "{}"'.format(os.path.dirname(path[7:])) preamble = textwrap.dedent(""" ######## Snakemake header ######## import sys; sys.path.extend([{searchpath}]); import pickle; snakemake = pickle.loads({snakemake}); from snakemake.logging import logger; logger.printshellcmds = {printshellcmds}; __real_file__ = __file__; __file__ = {file_override}; ######## Original script ######### """).format( searchpath=escape_backslash(searchpath), snakemake=snakemake, printshellcmds=logger.printshellcmds, file_override=repr(os.path.realpath(wrapper_path)), ) elif language == "r" or language == "rmarkdown": preamble = textwrap.dedent(""" ######## Snakemake header ######## library(methods) Snakemake <- setClass( "Snakemake", slots = c( input = "list", output = "list", params = "list", wildcards = "list", threads = "numeric", log = "list", resources = "list", config = "list", rule = "character", bench_iteration = "numeric", scriptdir = "character", source = "function" ) ) snakemake <- Snakemake( input = {}, output = {}, params = {}, wildcards = {}, threads = {}, log = {}, resources = {}, config = {}, rule = {}, bench_iteration = {}, scriptdir = {}, source = function(...){{ wd <- getwd() setwd(snakemake@scriptdir) source(...) setwd(wd) }} ) ######## Original script ######### """).format( REncoder.encode_namedlist(input), REncoder.encode_namedlist(output), REncoder.encode_namedlist(params), REncoder.encode_namedlist(wildcards), threads, REncoder.encode_namedlist(log), REncoder.encode_namedlist({ name: value for name, value in resources.items() if name != "_cores" and name != "_nodes" }), REncoder.encode_dict(config), REncoder.encode_value(rulename), REncoder.encode_numeric(bench_iteration), REncoder.encode_value( os.path.dirname(path[7:]) if path. 
startswith("file://") else os.path.dirname(path)), ) elif language == "julia": preamble = textwrap.dedent(""" ######## Snakemake header ######## struct Snakemake input::Dict output::Dict params::Dict wildcards::Dict threads::Int64 log::Dict resources::Dict config::Dict rule::String bench_iteration scriptdir::String #source::Any end snakemake = Snakemake( {}, #input::Dict {}, #output::Dict {}, #params::Dict {}, #wildcards::Dict {}, #threads::Int64 {}, #log::Dict {}, #resources::Dict {}, #config::Dict {}, #rule::String {}, #bench_iteration::Int64 {}, #scriptdir::String #, #source::Any ) ######## Original script ######### """.format( JuliaEncoder.encode_namedlist(input), JuliaEncoder.encode_namedlist(output), JuliaEncoder.encode_namedlist(params), JuliaEncoder.encode_namedlist(wildcards), JuliaEncoder.encode_value(threads), JuliaEncoder.encode_namedlist(log), JuliaEncoder.encode_namedlist({ name: value for name, value in resources.items() if name != "_cores" and name != "_nodes" }), JuliaEncoder.encode_dict(config), JuliaEncoder.encode_value(rulename), JuliaEncoder.encode_value(bench_iteration), JuliaEncoder.encode_value( os.path.dirname(path[7:]) if path. startswith("file://") else os.path.dirname(path)), ).replace("'", '"')) else: raise ValueError( "Unsupported script: Expecting either Python (.py), R (.R), RMarkdown (.Rmd) or Julia (.jl) script." ) dir = ".snakemake/scripts" os.makedirs(dir, exist_ok=True) with tempfile.NamedTemporaryFile(suffix="." + os.path.basename(path), dir=dir, delete=False) as f: if not language == "rmarkdown": f.write(preamble.encode()) f.write(source) else: # Insert Snakemake object after the RMarkdown header code = source.decode() pos = next(islice(re.finditer(r"---\n", code), 1, 2)).start() + 3 f.write(str.encode(code[:pos])) preamble = textwrap.dedent(""" ```{r, echo=FALSE, message=FALSE, warning=FALSE} %s ``` """ % preamble) f.write(preamble.encode()) f.write(str.encode(code[pos:])) if language == "python": py_exec = sys.executable if conda_env is not None: py = os.path.join(conda_env, "bin", "python") if os.path.exists(py): out = subprocess.check_output( [py, "--version"], stderr=subprocess.STDOUT, universal_newlines=True, ) ver = tuple( map(int, PY_VER_RE.match(out).group("ver_min").split("."))) if ver >= MIN_PY_VERSION: # Python version is new enough, make use of environment # to execute script py_exec = "python" else: logger.warning( "Conda environment defines Python " "version < {0}.{1}. Using Python of the " "master process to execute " "script. Note that this cannot be avoided, " "because the script uses data structures from " "Snakemake which are Python >={0}.{1} " "only.".format(*MIN_PY_VERSION)) if singularity_img is not None: # use python from image py_exec = "python" # use the same Python as the running process or the one from the environment shell("{py_exec} {f.name:q}", bench_record=bench_record) elif language == "r": if conda_env is not None and "R_LIBS" in os.environ: logger.warning("R script job uses conda environment but " "R_LIBS environment variable is set. This " "is likely not intended, as R_LIBS can " "interfere with R packages deployed via " "conda. Consider running `unset R_LIBS` or " "remove it entirely before executing " "Snakemake.") shell("Rscript --vanilla {f.name:q}", bench_record=bench_record) elif language == "rmarkdown": if len(output) != 1: raise WorkflowError( "RMarkdown scripts (.Rmd) may only have a single output file." 
) out = os.path.abspath(output[0]) shell( 'Rscript --vanilla -e \'rmarkdown::render("{f.name}", output_file="{out}", quiet=TRUE, knit_root_dir = "{workdir}", params = list(rmd="{f.name}"))\'', bench_record=bench_record, workdir=os.getcwd(), ) elif language == "julia": shell("julia {f.name:q}", bench_record=bench_record) except URLError as e: raise WorkflowError(e) finally: if f: os.remove(f.name)
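# script() works by prepending a generated "Snakemake header" to the user's
# script, writing the combined source under .snakemake/scripts/, and invoking
# the matching interpreter on that temporary file. A compact sketch of the
# write-and-run step for a Python script; the preamble content and file names
# are illustrative, not the exact header built above.
import os
import subprocess
import sys
import tempfile
import textwrap

def run_with_preamble(script_source, preamble="# injected header\n"):
    """Prepend a header to a Python script, persist it, and execute it."""
    os.makedirs(".snakemake/scripts", exist_ok=True)
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py",
                                     dir=".snakemake/scripts",
                                     delete=False) as handle:
        handle.write(textwrap.dedent(preamble))
        handle.write(script_source)
    try:
        subprocess.check_call([sys.executable, handle.name])
    finally:
        os.remove(handle.name)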
def report(text, path, stylesheet=os.path.join(os.path.dirname(__file__), "report.css"), defaultenc="utf8", template=None, metadata=None, **files): """ Create an HTML report using python docutils. Attention: This function needs Python docutils to be installed for the python installation you use with Snakemake. Arguments text -- The "restructured text" as it is expected by python docutils. path -- The path to the desired output file stylesheet -- An optional path to a css file that defines the style of the document. This defaults to <your snakemake install>/report.css. Use the default to get a hint how to create your own. defaultenc -- The encoding that is reported to the browser for embedded text files, defaults to utf8. template -- An optional path to a docutils HTML template. metadata -- E.g. an optional author name or email address. All other keyword args are intepreted as paths to files that shall be embedded into the document. They keywords will be available as link targets in the text. E.g. append a file as keyword arg via F1=input[0] and put a download link in the text like this: report(''' ============== Report for ... ============== Some text. A link to an embedded file: F1_. Further text. ''', outputpath, F1=input[0]) Instead of specifying each file as a keyword arg, you can also expand the input of your rule if it is completely named, e.g.: report(''' Some text... ''', outputpath, **input) """ outmime, _ = mimetypes.guess_type(path) if outmime != "text/html": raise ValueError("Path to report output has to be an HTML file.") from docutils.core import publish_file definitions = textwrap.dedent(""" .. role:: raw-html(raw) :format: html """) metadata = textwrap.dedent(""" .. container:: :name: metadata {metadata} {date} """).format(metadata=metadata, date=datetime.date.today().isoformat()) text = format(textwrap.dedent(text), stepout=2) attachments = [ textwrap.dedent(""" .. container:: :name: attachments """) ] for name, file in sorted(files.items()): mime, encoding = mimetypes.guess_type(file) if mime is None: mime = "text/plain" logger.warning("Could not detect mimetype for {}, assuming " "text/plain.".format(file)) if encoding is None: encoding = defaultenc with open(file, "rb") as f: data = base64.b64encode(f.read()) attachments.append(''' .. container:: :name: {name} [{name}] :raw-html:`<a href="data:{mime};charset={charset};filename={filename};base64,{data}" download="{filename}" draggable="true">{filename}</a>` '''.format(name=name, filename=os.path.basename(file), mime=mime, charset=encoding, data=data.decode())) text = definitions + text + "\n\n" + "\n\n".join(attachments) + metadata overrides = dict() if template is not None: overrides["template"] = template if stylesheet is not None: overrides["stylesheet_path"] = stylesheet html = open(path, "w") publish_file(source=io.StringIO(text), destination=html, writer_name="html", settings_overrides=overrides)
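# report() embeds each attached file directly into the HTML as a base64 data
# URI, so the rendered report is a single self-contained file. The embedding
# step in isolation; the href template mirrors the one used above, while the
# helper itself is only an illustration.
import base64
import mimetypes
import os

def data_uri_link(path, defaultenc="utf8"):
    """Return an <a> tag that carries the file content as a data URI."""
    mime, encoding = mimetypes.guess_type(path)
    mime = mime or "text/plain"
    encoding = encoding or defaultenc
    with open(path, "rb") as f:
        data = base64.b64encode(f.read()).decode()
    filename = os.path.basename(path)
    return ('<a href="data:{mime};charset={charset};filename={filename};base64,{data}" '
            'download="{filename}" draggable="true">{filename}</a>').format(
                mime=mime, charset=encoding, filename=filename, data=data)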
def decorate(ruleinfo): if ruleinfo.wildcard_constraints: rule.set_wildcard_constraints( *ruleinfo.wildcard_constraints[0], **ruleinfo.wildcard_constraints[1]) if ruleinfo.input: rule.set_input(*ruleinfo.input[0], **ruleinfo.input[1]) if ruleinfo.output: rule.set_output(*ruleinfo.output[0], **ruleinfo.output[1]) if ruleinfo.params: rule.set_params(*ruleinfo.params[0], **ruleinfo.params[1]) if ruleinfo.threads: if not isinstance(ruleinfo.threads, int) and not callable( ruleinfo.threads): raise RuleException( "Threads value has to be an integer or a callable.", rule=rule) rule.resources["_cores"] = ruleinfo.threads if ruleinfo.shadow_depth: if ruleinfo.shadow_depth not in (True, "shallow", "full", "minimal"): raise RuleException( "Shadow must either be 'minimal', 'shallow', 'full', " "or True (equivalent to 'full')", rule=rule) if ruleinfo.shadow_depth is True: rule.shadow_depth = 'full' logger.warning( "Shadow is set to True in rule {} (equivalent to 'full'). It's encouraged to use the more explicit options 'minimal|shallow|full' instead." .format(rule)) else: rule.shadow_depth = ruleinfo.shadow_depth if ruleinfo.resources: args, resources = ruleinfo.resources if args: raise RuleException("Resources have to be named.") if not all( map(lambda r: isinstance(r, int) or callable(r), resources.values())): raise RuleException( "Resources values have to be integers or callables", rule=rule) rule.resources.update(resources) if ruleinfo.priority: if (not isinstance(ruleinfo.priority, int) and not isinstance(ruleinfo.priority, float)): raise RuleException("Priority values have to be numeric.", rule=rule) rule.priority = ruleinfo.priority if ruleinfo.version: rule.version = ruleinfo.version if ruleinfo.log: rule.set_log(*ruleinfo.log[0], **ruleinfo.log[1]) if ruleinfo.message: rule.message = ruleinfo.message if ruleinfo.benchmark: rule.benchmark = ruleinfo.benchmark if not self.run_local and ruleinfo.group is not None: rule.group = ruleinfo.group if ruleinfo.wrapper: if self.use_conda: rule.conda_env = snakemake.wrapper.get_conda_env( ruleinfo.wrapper, prefix=self.wrapper_prefix) # TODO retrieve suitable singularity image if ruleinfo.conda_env and self.use_conda: if not (ruleinfo.script or ruleinfo.wrapper or ruleinfo.shellcmd): raise RuleException( "Conda environments are only allowed " "with shell, script, or wrapper directives " "(not with run).", rule=rule) if not (urllib.parse.urlparse(ruleinfo.conda_env).scheme or os.path.isabs(ruleinfo.conda_env)): ruleinfo.conda_env = os.path.join(self.current_basedir, ruleinfo.conda_env) rule.conda_env = ruleinfo.conda_env if self.use_singularity: invalid_rule = not (ruleinfo.script or ruleinfo.wrapper or ruleinfo.shellcmd) if ruleinfo.singularity_img: if invalid_rule: raise RuleException( "Singularity directive is only allowed " "with shell, script or wrapper directives " "(not with run).", rule=rule) rule.singularity_img = ruleinfo.singularity_img elif self.global_singularity_img: if not invalid_rule: # skip rules with run directive rule.singularity_img = self.global_singularity_img rule.norun = ruleinfo.norun rule.docstring = ruleinfo.docstring rule.run_func = ruleinfo.func rule.shellcmd = ruleinfo.shellcmd rule.script = ruleinfo.script rule.wrapper = ruleinfo.wrapper rule.cwl = ruleinfo.cwl rule.restart_times = self.restart_times ruleinfo.func.__name__ = "__{}".format(rule.name) self.globals[ruleinfo.func.__name__] = ruleinfo.func setattr(rules, rule.name, RuleProxy(rule)) if checkpoint: checkpoints.register(rule) return ruleinfo.func
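# decorate() validates directive values before attaching them to the rule:
# threads must be an int or callable, resources must be named ints or
# callables, priority must be numeric, and shadow must come from a fixed set.
# The same checks as a small stand-alone validator; the function name and
# error text are illustrative.
def validate_rule_settings(threads=None, resources=None, priority=None, shadow=None):
    """Raise ValueError for directive values the decorator above would reject."""
    if threads is not None and not isinstance(threads, int) and not callable(threads):
        raise ValueError("Threads value has to be an integer or a callable.")
    for name, value in (resources or {}).items():
        if not isinstance(value, int) and not callable(value):
            raise ValueError("Resource '{}' has to be an integer or a callable.".format(name))
    if priority is not None and not isinstance(priority, (int, float)):
        raise ValueError("Priority values have to be numeric.")
    if shadow is not None and shadow not in (True, "shallow", "full", "minimal"):
        raise ValueError("Shadow must be 'minimal', 'shallow', 'full', or True.")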
def prepare(self): """ Prepare execution of job. This includes creation of directories and deletion of previously created dynamic files. Creates a shadow directory for the job if specified. """ self.check_protected_output() unexpected_output = self.dag.reason(self).missing_output.intersection( self.existing_output) if unexpected_output: logger.warning( "Warning: the following output files of rule {} were not " "present when the DAG was created:\n{}".format( self.rule, unexpected_output)) for f, f_ in zip(self.output, self.rule.output): f.prepare() for f in self.files_to_download: f.download_from_remote() for f in self.log: f.prepare() if self.benchmark: self.benchmark.prepare() self.remove_existing_output() if not self.is_shadow: return # Create shadow directory structure self.shadow_dir = tempfile.mkdtemp( dir=self.rule.workflow.persistence.shadow_path) cwd = os.getcwd() # Shallow simply symlink everything in the working directory. if self.rule.shadow_depth == "shallow": for source in os.listdir(cwd): link = os.path.join(self.shadow_dir, source) os.symlink(os.path.abspath(source), link) elif self.rule.shadow_depth == "full": snakemake_dir = os.path.join(cwd, ".snakemake") for dirpath, dirnames, filenames in os.walk(cwd): # Must exclude .snakemake and its children to avoid infinite # loop of symlinks. if os.path.commonprefix([snakemake_dir, dirpath ]) == snakemake_dir: continue for dirname in dirnames: if dirname == ".snakemake": continue relative_source = os.path.relpath(os.path.join(dirpath, dirname)) shadow = os.path.join(self.shadow_dir, relative_source) os.mkdir(shadow) for filename in filenames: source = os.path.join(dirpath, filename) relative_source = os.path.relpath(source) link = os.path.join(self.shadow_dir, relative_source) os.symlink(source, link)
def job_selector_ilp(self, jobs): """ Job scheduling by optimization of resource usage by solving ILP using pulp """ import pulp from pulp import lpSum from stopit import ThreadingTimeout as Timeout, TimeoutException if len(jobs) == 1: logger.debug( "Using greedy selector because only single job has to be scheduled." ) return self.job_selector_greedy(jobs) with self._lock: if not self.resources["_cores"]: return set() # assert self.resources["_cores"] > 0 scheduled_jobs = { job: pulp.LpVariable( "job_{}".format(idx), lowBound=0, upBound=1, cat=pulp.LpInteger, ) for idx, job in enumerate(jobs) } def size_gb(f): if self.touch: # In case of touch mode, there is no need to prioritize based on size. # We cannot access it anyway, because the files might be temporary and # not present. return 0 else: return f.size / 1e9 temp_files = { temp_file for job in jobs for temp_file in self.dag.temp_input(job) } temp_job_improvement = { temp_file: pulp.LpVariable("temp_file_{}".format(idx), lowBound=0, upBound=1, cat="Continuous") for idx, temp_file in enumerate(temp_files) } temp_file_deletable = { temp_file: pulp.LpVariable( "deletable_{}".format(idx), lowBound=0, upBound=1, cat=pulp.LpInteger, ) for idx, temp_file in enumerate(temp_files) } prob = pulp.LpProblem("JobScheduler", pulp.LpMaximize) total_temp_size = max( sum([size_gb(temp_file) for temp_file in temp_files]), 1) total_core_requirement = sum( [max(job.resources.get("_cores", 1), 1) for job in jobs]) # Objective function # Job priority > Core load # Core load > temp file removal # Instant removal > temp size prob += (2 * total_core_requirement * 2 * total_temp_size * lpSum([ job.priority * scheduled_jobs[job] for job in jobs ]) + 2 * total_temp_size * lpSum([ max(job.resources.get("_cores", 1), 1) * scheduled_jobs[job] for job in jobs ]) + total_temp_size * lpSum([ temp_file_deletable[temp_file] * size_gb(temp_file) for temp_file in temp_files ]) + lpSum([ temp_job_improvement[temp_file] * size_gb(temp_file) for temp_file in temp_files ])) # Constraints: for name in self.workflow.global_resources: prob += (lpSum([ scheduled_jobs[job] * job.resources.get(name, 0) for job in jobs ]) <= self.resources[name]) # Choose jobs that lead to "fastest" (minimum steps) removal of existing temp file remaining_jobs = self.remaining_jobs for temp_file in temp_files: prob += temp_job_improvement[temp_file] <= lpSum([ scheduled_jobs[job] * self.required_by_job(temp_file, job) for job in jobs ]) / lpSum([ self.required_by_job(temp_file, job) for job in remaining_jobs ]) prob += (temp_file_deletable[temp_file] <= temp_job_improvement[temp_file]) try: with Timeout(10, swallow_exc=False): self._solve_ilp(prob) except TimeoutException as e: logger.warning( "Failed to solve scheduling problem with ILP solver in time (10s). " "Falling back to greedy solver.") return self.job_selector_greedy(jobs) except pulp.apis.core.PulpSolverError as e: logger.warning( "Failed to solve scheduling problem with ILP solver. Falling back to greedy solver. " "Run Snakemake with --verbose to see the full solver output for debugging the problem." ) return self.job_selector_greedy(jobs) selected_jobs = set(job for job, variable in scheduled_jobs.items() if variable.value() == 1.0) if not selected_jobs: # No selected jobs. This could be due to insufficient resources or a failure in the ILP solver # Hence, we silently fall back to the greedy solver to make sure that we don't miss anything. 
return self.job_selector_greedy(jobs) for name in self.workflow.global_resources: self.resources[name] -= sum( [job.resources.get(name, 0) for job in selected_jobs]) return selected_jobs
def handle_protected(self, job): """ Write-protect output files that are marked with protected(). """ for f in job.expanded_output: if f in job.protected_output: logger.warning("Write-protecting output file {}".format(f)) f.protect()
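# handle_protected() asks each flagged output file to write-protect itself.
# f.protect() is not shown in this excerpt; a plausible minimal implementation
# removes the write bits from the file mode. This is an assumption about its
# behaviour, kept as a sketch.
import os
import stat

def write_protect(path):
    """Drop user, group, and other write permissions on path."""
    mode = os.stat(path).st_mode
    os.chmod(path, mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH))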
def report( text, path, stylesheet=os.path.join(os.path.dirname(__file__), "report.css"), defaultenc="utf8", template=None, metadata=None, **files): """ Create an HTML report using python docutils. Attention: This function needs Python docutils to be installed for the python installation you use with Snakemake. Arguments text -- The "restructured text" as it is expected by python docutils. path -- The path to the desired output file stylesheet -- An optional path to a css file that defines the style of the document. This defaults to <your snakemake install>/report.css. Use the default to get a hint how to create your own. defaultenc -- The encoding that is reported to the browser for embedded text files, defaults to utf8. template -- An optional path to a docutils HTML template. metadata -- E.g. an optional author name or email address. All other keyword args are intepreted as paths to files that shall be embedded into the document. They keywords will be available as link targets in the text. E.g. append a file as keyword arg via F1=input[0] and put a download link in the text like this: report(''' ============== Report for ... ============== Some text. A link to an embedded file: F1_. Further text. ''', outputpath, F1=input[0]) Instead of specifying each file as a keyword arg, you can also expand the input of your rule if it is completely named, e.g.: report(''' Some text... ''', outputpath, **input) """ outmime, _ = mimetypes.guess_type(path) if outmime != "text/html": raise ValueError("Path to report output has to be an HTML file.") from docutils.core import publish_file definitions = textwrap.dedent(""" .. role:: raw-html(raw) :format: html """) metadata = textwrap.dedent(""" .. container:: :name: metadata {metadata} {date} """).format(metadata=metadata, date=datetime.date.today().isoformat()) text = format(textwrap.dedent(text), stepout=2) attachments = [textwrap.dedent(""" .. container:: :name: attachments """)] for name, file in sorted(files.items()): mime, encoding = mimetypes.guess_type(file) if mime is None: mime = "text/plain" logger.warning("Could not detect mimetype for {}, assuming " "text/plain.".format(file)) if encoding is None: encoding = defaultenc with open(file, "rb") as f: data = base64.b64encode(f.read()) attachments.append( ''' .. container:: :name: {name} [{name}] :raw-html:`<a href="data:{mime};charset={charset};filename={filename};base64,{data}" download="{filename}" draggable="true">{filename}</a>` '''.format( name=name, filename=os.path.basename(file), mime=mime, charset=encoding, data=data.decode())) text = definitions + text + "\n\n" + "\n\n".join(attachments) + metadata overrides = dict() if template is not None: overrides["template"] = template if stylesheet is not None: overrides["stylesheet_path"] = stylesheet html = open(path, "w") publish_file( source=io.StringIO(text), destination=html, writer_name="html", settings_overrides=overrides)
def snakemake(snakefile, listrules=False, cores=1, resources=None, workdir=None, targets=None, dryrun=False, touch=False, forcetargets=False, forceall=False, forcerun=None, prioritytargets=None, stats=None, printreason=False, printshellcmds=False, printdag=False, printrulegraph=False, nocolor=False, quiet=False, keepgoing=False, cluster=None, immediate_submit=False, standalone=False, ignore_ambiguity=False, snakemakepath=None, lock=True, unlock=False, cleanup_metadata=None, force_incomplete=False, ignore_incomplete=False, list_version_changes=False, list_code_changes=False, list_input_changes=False, list_params_changes=False, summary=False, output_wait=3, print_compilation=False, debug=False, notemp=False, nodeps=False, jobscript=None, timestamp=False): """ Run snakemake on a given snakefile. Note: at the moment, this function is not thread-safe! Arguments snakefile -- the snakefile. list -- list rules. jobs -- maximum number of parallel jobs (default: 1). directory -- working directory (default: current directory). rule -- execute this rule (default: first rule in snakefile). dryrun -- print the rules that would be executed, but do not execute them. forcethis -- force the selected rule to be executed forceall -- force all rules to be executed time_measurements -- measure the running times of all rules lock -- lock the working directory """ init_logger(nocolor=nocolor, stdout=dryrun, debug=debug, timestamp=timestamp) if not os.path.exists(snakefile): logger.error("Error: Snakefile \"{}\" not present.".format(snakefile)) return False if workdir: olddir = os.getcwd() workflow = Workflow( snakefile=snakefile, snakemakepath=snakemakepath, jobscript=jobscript) if standalone: try: # set the process group os.setpgrp() except: # ignore: if it does not work we can still work without it pass success = True try: workflow.include(snakefile, workdir=workdir, overwrite_first_rule=True, print_compilation=print_compilation) workflow.check() if not print_compilation: if listrules: workflow.list_rules() else: if not printdag and not printrulegraph: # handle subworkflows subsnakemake = partial( snakemake, cores=cores, resources=resources, dryrun=dryrun, touch=touch, printreason=printreason, printshellcmds=printshellcmds, nocolor=nocolor, quiet=quiet, keepgoing=keepgoing, cluster=cluster, immediate_submit=immediate_submit, standalone=standalone, ignore_ambiguity=ignore_ambiguity, snakemakepath=snakemakepath, lock=lock, unlock=unlock, cleanup_metadata=cleanup_metadata, force_incomplete=force_incomplete, ignore_incomplete=ignore_incomplete, output_wait=output_wait, debug=debug, notemp=notemp, nodeps=nodeps, jobscript=jobscript, timestamp=timestamp) for subworkflow in workflow.subworkflows: logger.warning("Executing subworkflow {}.".format(subworkflow.name)) if not subsnakemake(subworkflow.snakefile, workdir=subworkflow.workdir, targets=subworkflow.targets): success = False if workflow.subworkflows: logger.warning("Executing main workflow.") if success: success = workflow.execute( targets=targets, dryrun=dryrun, touch=touch, cores=cores, forcetargets=forcetargets, forceall=forceall, forcerun=forcerun, prioritytargets=prioritytargets, quiet=quiet, keepgoing=keepgoing, printshellcmds=printshellcmds, printreason=printreason, printrulegraph=printrulegraph, printdag=printdag, cluster=cluster, immediate_submit=immediate_submit, ignore_ambiguity=ignore_ambiguity, workdir=workdir, stats=stats, force_incomplete=force_incomplete, ignore_incomplete=ignore_incomplete, list_version_changes=list_version_changes, 
list_code_changes=list_code_changes, list_input_changes=list_input_changes, list_params_changes=list_params_changes, summary=summary, output_wait=output_wait, nolock=not lock, unlock=unlock, resources=resources, notemp=notemp, nodeps=nodeps, cleanup_metadata=cleanup_metadata ) except (Exception, BaseException) as ex: print_exception(ex, workflow.linemaps) success = False if workdir: os.chdir(olddir) if workflow.persistence: workflow.persistence.unlock() return success
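# The snakemake() entry point above can also be driven programmatically:
# subworkflows are executed first via the partial(snakemake, ...) call, then
# the main workflow runs, and the function returns a success flag. A minimal
# invocation using only keyword arguments from the signature shown above; the
# Snakefile path is illustrative.
if __name__ == "__main__":
    ok = snakemake(
        "Snakefile",          # workflow definition to load
        cores=4,              # maximum number of parallel jobs
        dryrun=True,          # only print what would be executed
        printshellcmds=True,  # show the shell command of each job
        keepgoing=False,      # stop submitting new jobs after an error
    )
    print("workflow finished successfully" if ok else "workflow failed")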
# load function for statistical models def load_model(model_yaml_file): with open(model_yaml_file) as fh: MODELS = yaml.load(fh, Loader=Loader) or {} config['models'] = MODELS config['model_names'] = list(MODELS.keys()) # library preparation kit specific configuration libprep_fn = srcdir('libprep.config') with open(libprep_fn) as fh: LIBPREP_CONF = yaml.load(fh, Loader=Loader) or {} kit = config.get('libprepkit') if kit in LIBPREP_CONF: LIBPREP = LIBPREP_CONF[kit] if 'reference_db' in LIBPREP: config['db']['reference_db'] = LIBPREP['reference_db'] else: if kit is None: logger.warning('Running without LIBPREPKIT defined!') else: logger.warning('`{}` is not a valid libprepkit name'.format(kit)) sys.exit() # docker images docker_fn = srcdir('docker.config') with open(docker_fn) as fh: dck = yaml.load(fh, Loader=Loader) or {} update_config2(config, dck)