def code(self):
    try:
        from pygments.lexers import get_lexer_by_name
        from pygments.formatters import HtmlFormatter
        from pygments import highlight
        import pygments.util
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")
    source, language = None, None
    # Prefer the shell command; otherwise load the script or wrapper source.
    if self._rule.shellcmd is not None:
        source = self._rule.shellcmd
        language = "bash"
    elif self._rule.script is not None:
        logger.info("Loading script code for rule {}".format(self.name))
        _, source, language = script.get_source(self._rule.script,
                                                self._rule.basedir)
        source = source.decode()
    elif self._rule.wrapper is not None:
        logger.info("Loading wrapper code for rule {}".format(self.name))
        _, source, language = script.get_source(
            wrapper.get_script(self._rule.wrapper,
                               prefix=self._rule.workflow.wrapper_prefix))
        source = source.decode()
    try:
        lexer = get_lexer_by_name(language)
        return highlight(
            source, lexer,
            HtmlFormatter(linenos=True, cssclass="source", wrapcode=True))
    except pygments.util.ClassNotFound:
        # Fall back to unhighlighted markup, interpolating the actual source
        # rather than the literal string "source".
        return "<pre><code>{}</code></pre>".format(source)
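
# Standalone sketch (not part of the class above) of the pygments calls that
# code() relies on; "echo hello" is just a placeholder command. With these
# formatter options, highlight() returns a <div class="source"> block whose
# <pre> carries line numbers and wraps the source in a <code> element.
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name

lexer = get_lexer_by_name("bash")
html = highlight(
    "echo hello",
    lexer,
    HtmlFormatter(linenos=True, cssclass="source", wrapcode=True),
)
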
def code(self):
    try:
        from pygments.lexers import get_lexer_by_name
        from pygments.formatters import HtmlFormatter
        from pygments import highlight
        import pygments.util
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")
    sources, language = None, None
    if self._rule.shellcmd is not None:
        sources = [self._rule.shellcmd]
        language = "bash"
    elif self._rule.script is not None and not contains_wildcard(
            self._rule.script):
        logger.info("Loading script code for rule {}".format(self.name))
        _, source, language = script.get_source(self._rule.script,
                                                self._rule.basedir)
        sources = [source.decode()]
    elif self._rule.wrapper is not None and not contains_wildcard(
            self._rule.wrapper):
        logger.info("Loading wrapper code for rule {}".format(self.name))
        _, source, language = script.get_source(
            wrapper.get_script(self._rule.wrapper,
                               prefix=self._rule.workflow.wrapper_prefix))
        sources = [source.decode()]
    elif self._rule.notebook is not None and not contains_wildcard(
            self._rule.notebook):
        _, source, language = script.get_source(self._rule.notebook,
                                                self._rule.basedir)
        language = language.split("_")[1]
        sources = notebook.get_cell_sources(source)
    else:
        # A run directive. There is no easy way yet to obtain
        # the actual uncompiled source code.
        sources = []
        language = "python"
    try:
        lexer = get_lexer_by_name(language)
        highlighted = [
            highlight(
                source, lexer,
                HtmlFormatter(linenos=True, cssclass="source", wrapcode=True),
            )
            for source in sources
        ]
        return highlighted
    except pygments.util.ClassNotFound:
        return [
            '<pre class="source"><code>{}</code></pre>'.format(source)
            for source in sources
        ]
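
# Hedged sketch of the cell extraction that notebook.get_cell_sources performs
# in the notebook branch above; the real Snakemake helper may differ in
# detail. A Jupyter .ipynb file is JSON whose code cells each carry their
# source text under the "source" key.
import json

def get_cell_sources_sketch(notebook_text):
    nb = json.loads(notebook_text)
    return [
        "".join(cell["source"])
        for cell in nb.get("cells", [])
        if cell.get("cell_type") == "code"
    ]
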
def _get_provenance_hash(self, job: Job):
    """
    Recursively calculate hash for the output of the given job
    and all upstream jobs in a blockchain fashion.

    This is based on an idea of Sven Nahnsen.
    Fails if job has more than one output file. The reason is that there
    is no way to generate a per-output file hash without generating the files.
    This hash, however, shall work without having to generate the files,
    just by describing all steps down to a given job.
    """
    if job in self._hashes:
        return self._hashes[job]

    workflow = job.dag.workflow
    h = hashlib.sha256()

    # Hash shell command or script.
    if job.is_shell:
        # We cannot use the formatted shell command, because it also contains
        # threads, resources, and filenames (which shall be irrelevant for
        # the hash).
        h.update(job.rule.shellcmd.encode())
    elif job.is_script:
        _, source, _ = script.get_source(
            job.rule.script,
            basedir=job.rule.basedir,
            wildcards=job.wildcards,
            params=job.params,
        )
        h.update(source)
    elif job.is_notebook:
        _, source, _ = script.get_source(
            job.rule.notebook,
            basedir=job.rule.basedir,
            wildcards=job.wildcards,
            params=job.params,
        )
        h.update(source)
    elif job.is_wrapper:
        _, source, _ = script.get_source(
            wrapper.get_script(job.rule.wrapper, prefix=workflow.wrapper_prefix),
            basedir=job.rule.basedir,
            wildcards=job.wildcards,
            params=job.params,
        )
        h.update(source)

    # Hash params.
    for key, value in sorted(job.params._allitems()):
        if key is not None:
            h.update(key.encode())
        # If this raises a TypeError, we cannot calculate a reliable hash.
        try:
            h.update(json.dumps(value, sort_keys=True).encode())
        except TypeError as e:
            raise WorkflowError(
                "Rule {} cannot be cached, because params "
                "are not JSON serializable. "
                "Consider converting them into a suitable format "
                "if you are sure that caching is necessary. "
                "Otherwise, deactivate caching for this rule "
                "by removing it from the --cache command line argument "
                "or removing the cache: true directive from the rule itself."
                .format(job.rule.name),
                e,
            )

    # Hash input files that are not generated by other jobs
    # (sorted by hash value).
    for file_hash in sorted(
            hash_file(f)
            for f in job.input
            if not any(f in depfiles
                       for depfiles in job.dag.dependencies[job].values())):
        h.update(file_hash.encode())

    # Hash used containers or conda environments.
    if workflow.use_conda and job.conda_env:
        if workflow.use_singularity and job.conda_env.container_img_url:
            h.update(job.conda_env.container_img_url.encode())
        h.update(job.conda_env.content)
    elif workflow.use_singularity and job.container_img_url:
        h.update(job.container_img_url.encode())

    # Generate hashes of dependencies, and add them in a blockchain fashion
    # (as input to the current hash, sorted by hash value).
    for dep_hash in sorted(
            self._get_provenance_hash(dep)
            for dep in set(job.dag.dependencies[job].keys())):
        h.update(dep_hash.encode())

    provenance_hash = h.hexdigest()

    # Store for re-use.
    self._hashes[job] = provenance_hash

    return provenance_hash
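
# Toy illustration, independent of the method above, of the "blockchain
# fashion" combination it uses: each node's hash covers its own description
# plus the sorted hashes of its dependencies, so any upstream change
# propagates to every downstream hash. The rule contents here are made up.
import hashlib

def combine(content, dep_hashes):
    h = hashlib.sha256()
    h.update(content)
    for dep_hash in sorted(dep_hashes):
        h.update(dep_hash.encode())
    return h.hexdigest()

hash_a = combine(b"rule a", [])
hash_b = combine(b"rule b", [hash_a])  # editing rule a changes hash_b as well
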
def _get_provenance_hash(self, job: Job):
    """
    Recursively calculate hash for the output of the given job
    and all upstream jobs in a blockchain fashion.

    This is based on an idea of Sven Nahnsen.
    Fails if job has more than one output file. The reason is that there
    is no way to generate a per-output file hash without generating the files.
    This hash, however, shall work without having to generate the files,
    just by describing all steps down to a given job.
    """
    if job in self._hashes:
        return self._hashes[job]

    workflow = job.dag.workflow
    h = hashlib.sha256()

    # Hash shell command or script.
    if job.is_shell:
        # We cannot use the formatted shell command, because it also contains
        # threads, resources, and filenames (which shall be irrelevant for
        # the hash).
        h.update(job.rule.shellcmd.encode())
    elif job.is_script:
        _, source, _ = script.get_source(job.rule.script)
        h.update(source)
    elif job.is_wrapper:
        _, source, _ = script.get_source(
            wrapper.get_script(job.rule.wrapper, prefix=workflow.wrapper_prefix))
        h.update(source)

    # Hash params.
    for key, value in sorted(job.params._allitems()):
        h.update(key.encode())
        # If this raises a TypeError, we cannot calculate a reliable hash.
        h.update(json.dumps(value, sort_keys=True).encode())

    # Hash input files that are not generated by other jobs.
    for f in job.input:
        if not any(f in depfiles
                   for depfiles in job.dag.dependencies[job].values()):
            # Open in binary mode and stream the file in blocks of 4K
            # to update the hash.
            with open(f, "rb") as infile:
                for byte_block in iter(lambda: infile.read(4096), b""):
                    h.update(byte_block)

    # Hash used containers or conda environments.
    if workflow.use_conda and job.conda_env:
        if workflow.use_singularity and job.conda_env.singularity_img_url:
            h.update(job.conda_env.singularity_img_url.encode())
        h.update(job.conda_env.content)
    elif workflow.use_singularity and job.singularity_img_url:
        h.update(job.singularity_img_url.encode())

    # Generate hashes of dependencies, and add them in a blockchain fashion
    # (as input to the current hash).
    for dep_hash in sorted(
            self._get_provenance_hash(dep)
            for dep in set(job.dag.dependencies[job].keys())):
        h.update(dep_hash.encode())

    provenance_hash = h.hexdigest()

    # Store for re-use.
    self._hashes[job] = provenance_hash

    return provenance_hash
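
# A hash_file helper like the one referenced in the newer version further
# above could look like this streaming sketch (the actual implementation may
# differ); it mirrors the inline 4K-block loop of the older version just
# shown.
import hashlib

def hash_file(path):
    h = hashlib.sha256()
    with open(path, "rb") as infile:
        for byte_block in iter(lambda: infile.read(4096), b""):
            h.update(byte_block)
    return h.hexdigest()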