def _free_resources(self, job):
    for name, value in job.resources.items():
        if name in self.resources:
            value = self.calc_resource(name, value)
            self.resources[name] += value
            logger.debug("Releasing {} {} (now {}).".format(
                value, name, self.resources[name]))
def update_dynamic(self, job):
    dynamic_wildcards = job.dynamic_wildcards
    if not dynamic_wildcards:
        # this happens e.g. in dryrun if output is not yet present
        return

    depending = list(filter(lambda job_: not self.finished(job_),
                            self.bfs(self.depending, job)))
    newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
        dynamic_wildcards, input=False)
    self.replace_rule(job.rule, newrule)

    # no targetfile needed for job
    newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
    self.replace_job(job, newjob)
    for job_ in depending:
        if job_.dynamic_input:
            newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
            if newrule_ is not None:
                self.replace_rule(job_.rule, newrule_)
                if not self.dynamic(job_):
                    logger.debug("Updating job {}.".format(job_))
                    newjob_ = Job(newrule_, self, targetfile=job_.targetfile)

                    unexpected_output = self.reason(
                        job_).missing_output.intersection(
                            newjob.existing_output)
                    if unexpected_output:
                        raise UnexpectedOutputException(newjob_.rule,
                                                        unexpected_output)

                    self.replace_job(job_, newjob_)
    return newjob
def schedule(self):
    """ Schedule jobs that are ready, maximizing cpu usage. """
    while True:
        try:
            self._open_jobs.wait()
        except:
            # this is caused by SIGTERM or SIGINT
            self._executor.shutdown()
            return False
        self._open_jobs.clear()
        if not self.keepgoing and self._errors:
            logger.warning("Will exit after finishing "
                           "currently running jobs.")
            self._executor.shutdown()
            return False
        if not any(self.open_jobs):
            self._executor.shutdown()
            return not self._errors
        needrun = list(self.open_jobs)
        assert needrun

        logger.debug("Ready jobs:\n\t" + "\n\t".join(map(str, needrun)))
        run = self.job_selector(needrun)
        logger.debug("Selected jobs:\n\t" + "\n\t".join(map(str, run)))
        self.running.update(run)
        for job in run:
            self.run(job)
def check_incomplete(self):
    if not self.ignore_incomplete:
        incomplete = self.incomplete_files
        if incomplete:
            if self.force_incomplete:
                logger.debug("Forcing incomplete files:")
                logger.debug("\t" + "\n\t".join(incomplete))
                self.forcefiles.update(incomplete)
            else:
                raise IncompleteFilesException(incomplete)
def schedule(self):
    """ Schedule jobs that are ready, maximizing cpu usage. """
    try:
        while True:
            # workaround so that the wait does not prevent keyboard interrupts
            while not self._open_jobs.wait(1):
                pass

            # obtain needrun and running jobs in a thread-safe way
            with self._lock:
                needrun = list(self.open_jobs)
                running = list(self.running)
            # free the event
            self._open_jobs.clear()

            # handle errors
            if not self.keepgoing and self._errors:
                logger.info("Will exit after finishing "
                            "currently running jobs.")
                if not running:
                    self._executor.shutdown()
                    logger.error(_ERROR_MSG_FINAL)
                    return False
                continue
            # normal shutdown because all jobs have been finished
            if not needrun and not running:
                self._executor.shutdown()
                if self._errors:
                    logger.error(_ERROR_MSG_FINAL)
                return not self._errors
            # continue if no new job needs to be executed
            if not needrun:
                continue

            logger.debug("Resources before job selection: {}".format(
                self.resources))
            logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                         "\n\t".join(map(str, needrun)))

            # select jobs by solving knapsack problem
            run = self.job_selector(needrun)
            logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                         "\n\t".join(map(str, run)))

            # update running jobs
            with self._lock:
                self.running.update(run)
            logger.debug(
                "Resources after job selection: {}".format(self.resources))

            # actually run jobs
            for job in run:
                self.run(job)
    except (KeyboardInterrupt, SystemExit):
        logger.info("Terminating processes on user request.")
        self._executor.cancel()
        with self._lock:
            running = list(self.running)
        for job in running:
            job.cleanup()
        return False
def print_exception(ex, linemaps):
    """
    Print an error message for a given exception.

    Arguments
    ex -- the exception
    linemaps -- a dict that maps, for each snakefile, the compiled
        line numbers to the source line numbers in the snakefile.
    """
    tb = "Full " + "".join(traceback.format_exception(type(ex), ex,
                                                      ex.__traceback__))
    logger.debug(tb)
    if isinstance(ex, SyntaxError) or isinstance(ex, IndentationError):
        logger.error(format_error(ex, ex.lineno,
                                  linemaps=linemaps,
                                  snakefile=ex.filename,
                                  show_traceback=True))
        return
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        logger.error(format_error(ex, lineno,
                                  linemaps=linemaps,
                                  snakefile=file,
                                  show_traceback=True))
        return
    elif isinstance(ex, TokenError):
        logger.error(format_error(ex, None, show_traceback=False))
    elif isinstance(ex, MissingRuleException):
        logger.error(format_error(ex, None,
                                  linemaps=linemaps,
                                  snakefile=ex.filename,
                                  show_traceback=False))
    elif isinstance(ex, RuleException):
        for e in ex._include + [ex]:
            if not e.omit:
                logger.error(format_error(e, e.lineno,
                                          linemaps=linemaps,
                                          snakefile=e.filename,
                                          show_traceback=True))
    elif isinstance(ex, WorkflowError):
        logger.error(format_error(ex, ex.lineno,
                                  linemaps=linemaps,
                                  snakefile=ex.snakefile,
                                  show_traceback=True))
    elif isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
def shellcmd(
    img_path,
    cmd,
    args="",
    quiet=False,
    envvars=None,
    shell_executable=None,
    container_workdir=None,
    is_python_script=False,
):
    """Execute shell command inside singularity container given optional
    args and environment variables to be passed."""
    if envvars:
        envvars = " ".join(
            "SINGULARITYENV_{}={}".format(k, v) for k, v in envvars.items()
        )
    else:
        envvars = ""

    if shell_executable is None:
        shell_executable = "sh"
    else:
        # Ensure to just use the name of the executable, not a path,
        # because we cannot be sure where it is located in the container.
        shell_executable = os.path.split(shell_executable)[-1]

    if is_python_script:
        # mount host snakemake module into container
        args += " --bind {}:{}".format(SNAKEMAKE_SEARCHPATH, SNAKEMAKE_MOUNTPOINT)

    if container_workdir:
        args += " --pwd {}".format(container_workdir)

    cmd = "{} singularity {} exec --home {} {} {} {} -c '{}'".format(
        envvars,
        "--quiet --silent" if quiet else "",
        os.getcwd(),
        args,
        img_path,
        shell_executable,
        cmd.replace("'", r"'\''"),
    )
    logger.debug(cmd)
    return cmd
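# A minimal usage sketch (image path, command, and workdir are made up for
# illustration). Tracing the format string above, a call like
#
#     shellcmd("/images/env.simg", "echo 'hi'", quiet=True,
#              container_workdir="/work")
#
# returns roughly (with os.getcwd() inlined as /cwd):
#
#     " singularity --quiet --silent exec --home /cwd  --pwd /work /images/env.simg sh -c 'echo '\''hi'\'''"
#
# The inner single quote is escaped as '\'' so the command survives the
# outer single-quoted `sh -c` wrapper.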
def pull(self, dryrun=False):
    if self.is_local:
        return
    if dryrun:
        logger.info("Singularity image {} will be pulled.".format(self.url))
        return
    logger.debug("Singularity image location: {}".format(self.path))
    if not os.path.exists(self.path):
        logger.info("Pulling singularity image {}.".format(self.url))
        try:
            p = subprocess.check_output(
                ["singularity", "pull", "--name",
                 "{}.simg".format(self.hash), self.url],
                cwd=self._img_dir,
                stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            raise WorkflowError("Failed to pull singularity image "
                                "from {}:\n{}".format(self.url,
                                                      e.stdout.decode()))
def cancel(self):
    """cancel execution, usually by way of control+c. Cleanup is done in
    shutdown (deleting cached workdirs in Google Cloud Storage).
    """
    import googleapiclient

    # projects.locations.operations/cancel
    operations = self._api.projects().locations().operations()

    for job in self.active_jobs:
        request = operations.cancel(name=job.jobname)
        logger.debug("Cancelling operation {}".format(job.jobid))
        try:
            self._retry_request(request)
        except (Exception, BaseException, googleapiclient.errors.HttpError):
            continue

    self.shutdown()
def _add_gpu(self, gpu_count):
    """Add a number of NVIDIA gpus to the current executor. This works
    by way of adding nvidia_gpu to the job default resources, and also
    changing the default machine type prefix to be n1, which is currently
    the only instance family that supports GPUs on the Life Sciences API.
    """
    if not gpu_count or gpu_count == 0:
        return

    logger.debug(
        "found resource request for {} GPUs. This will limit to n1 "
        "instance types.".format(gpu_count))
    self.workflow.default_resources.parsed["nvidia_gpu"] = gpu_count
    self.workflow.default_resources.args.append("nvidia_gpu=%s" % gpu_count)

    self._machine_type_prefix = self._machine_type_prefix or ""
    if not self._machine_type_prefix.startswith("n1"):
        self._machine_type_prefix = "n1"
def _wait_for_jobs(self):
    UNFINISHED_STATES = [
        "UNKNOWN",
        "INITIALIZING",
        "QUEUED",
        "RUNNING",
        "PAUSED",
    ]
    ERROR_STATES = [
        "EXECUTOR_ERROR",
        "SYSTEM_ERROR",
        "CANCELED",  # TODO: really call `error_callback` on this?
    ]
    while True:
        with self.lock:
            if not self.wait:
                return
            active_jobs = self.active_jobs
            self.active_jobs = list()
        still_running = list()
        for j in active_jobs:
            with self.status_rate_limiter:  # TODO: this doesn't seem to do anything?
                res = self.tes_client.get_task(j.jobid, view="MINIMAL")
                logger.debug("[TES] State of task '{id}': {state}".format(
                    id=j.jobid,
                    state=res.state,
                ))
                if res.state in UNFINISHED_STATES:
                    still_running.append(j)
                elif res.state in ERROR_STATES:
                    logger.info(
                        "[TES] Task errored: {id}".format(id=j.jobid))
                    j.error_callback(j.job)
                elif res.state == "COMPLETE":
                    logger.info(
                        "[TES] Task completed: {id}".format(id=j.jobid))
                    j.callback(j.job)
        with self.lock:
            self.active_jobs.extend(still_running)
        time.sleep(1 / self.max_status_checks_per_second)
def run(self, job, callback=None, submit_callback=None, error_callback=None):
    super()._run(job)
    workdir = os.getcwd()
    jobid = self.dag.jobid(job)

    jobscript = self.get_jobscript(job)
    jobfinished = os.path.join(self.tmpdir, "{}.jobfinished".format(jobid))
    jobfailed = os.path.join(self.tmpdir, "{}.jobfailed".format(jobid))
    self.spawn_jobscript(job, jobscript,
                         jobfinished=jobfinished,
                         jobfailed=jobfailed)

    deps = " ".join(self.external_jobid[f] for f in job.input
                    if f in self.external_jobid)
    try:
        submitcmd = job.format_wildcards(self.submitcmd,
                                         dependencies=deps,
                                         cluster=self.cluster_wildcards(job))
    except AttributeError as e:
        raise WorkflowError(str(e), rule=job.rule)

    try:
        ext_jobid = subprocess.check_output(
            '{submitcmd} "{jobscript}"'.format(submitcmd=submitcmd,
                                               jobscript=jobscript),
            shell=True).decode().split("\n")
    except subprocess.CalledProcessError as ex:
        raise WorkflowError(
            "Error executing jobscript (exit code {}):\n{}".format(
                ex.returncode, ex.output.decode()),
            rule=job.rule)

    if ext_jobid and ext_jobid[0]:
        ext_jobid = ext_jobid[0]
        self.external_jobid.update((f, ext_jobid) for f in job.output)
        logger.debug("Submitted job {} with external jobid {}.".format(
            jobid, ext_jobid))

    submit_callback(job)
    with self.lock:
        self.active_jobs.append(
            GenericClusterJob(job, callback, error_callback, jobscript,
                              jobfinished, jobfailed))
def parseYAMLHeader(filepath):
    """
    :param filepath: path to the file
    :return: String representation of the YAML header in the file,
        including inter-document framing ("---")
    """
    yamlHeader = []
    for i, line in enumerate(open(filepath).readlines()):
        # strip the leading "#'" comment marker from the line
        yamlHeader.append(line.strip()[2:])

        # terminate if that's already "#'---" (=end of YAML-designated area)
        if i != 0 and line.startswith("#'---"):
            break

    result = '\n'.join(yamlHeader)
    logger.debug("Got " + result + " as a result of parsing YAML header "
                 "from " + filepath + ".\n")
    return result
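# A hypothetical R script header this parser would accept (the "#'" prefix
# and "#'---" framing follow the wBuild convention checked above; the keys
# are illustrative only):
#
#     #'---
#     #' title: Count reads
#     #' wb:
#     #'  input: "data/counts.tsv"
#     #'---
#
# Stripping the first two characters of each line yields a plain YAML
# document framed by "---" lines, which parseYamlParams can then load.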
def run(self, job, callback=None, submit_callback=None, error_callback=None):
    super()._run(job)
    workdir = os.getcwd()
    jobid = self.dag.jobid(job)

    jobscript = self.get_jobscript(job)
    jobfinished = os.path.join(self.tmpdir, "{}.jobfinished".format(jobid))
    jobfailed = os.path.join(self.tmpdir, "{}.jobfailed".format(jobid))
    self.spawn_jobscript(job, jobscript,
                         jobfinished=jobfinished,
                         jobfailed=jobfailed)

    deps = " ".join(self.external_jobid[f] for f in job.input
                    if f in self.external_jobid)
    try:
        submitcmd = job.format_wildcards(self.submitcmd,
                                         dependencies=deps,
                                         cluster=self.cluster_wildcards(job))
    except AttributeError as e:
        raise WorkflowError(str(e), rule=job.rule)

    try:
        ext_jobid = subprocess.check_output(
            '{submitcmd} "{jobscript}"'.format(submitcmd=submitcmd,
                                               jobscript=jobscript),
            shell=True).decode().split("\n")
    except subprocess.CalledProcessError as ex:
        logger.error("Error submitting jobscript (exit code {}):\n{}".format(
            ex.returncode, ex.output.decode()))
        error_callback(job)
        return

    if ext_jobid and ext_jobid[0]:
        ext_jobid = ext_jobid[0]
        self.external_jobid.update((f, ext_jobid) for f in job.output)
        logger.debug("Submitted job {} with external jobid {}.".format(
            jobid, ext_jobid))

    submit_callback(job)
    with self.lock:
        self.active_jobs.append(
            GenericClusterJob(job, callback, error_callback, jobscript,
                              jobfinished, jobfailed))
def write_jobscript(self, job, jobscript, **kwargs):
    use_threads = "--force-use-threads" if not job.is_group() else ""

    envvars = "\\\n".join("export {}={};".format(var, os.environ[var])
                          for var in self.workflow.envvars)

    exec_job = self.format_job(self.exec_job,
                               job,
                               _quote_all=False,
                               use_threads=use_threads,
                               envvars=envvars,
                               **kwargs)
    content = self.format_job(self.jobscript, job, exec_job=exec_job, **kwargs)
    logger.debug("Jobscript:\n{}".format(content))
    with open(jobscript, "w") as f:
        print(content, file=f)
    os.chmod(jobscript, os.stat(jobscript).st_mode | stat.S_IXUSR)
def __init__(self, *args, keep_local=False, provider=None, **kwargs):
    super(RemoteObject, self).__init__(*args,
                                       keep_local=keep_local,
                                       provider=provider,
                                       **kwargs)

    bucket_name = "test-static-remote-bucket"
    test_files = ("test.txt", "out1.txt", "out2.txt")

    s3 = boto3.resource("s3")
    s3.create_bucket(Bucket=bucket_name)

    # "Upload" files that should be in S3 before tests...
    s3c = S3Helper()
    for test_file in test_files:
        if not s3c.exists_in_bucket(bucket_name, test_file):
            logger.debug(
                "Pre-populating remote bucket {} with file {}".format(
                    bucket_name, test_file))
            s3c.upload_to_s3(bucket_name, test_file)
def _globus(self, *args):
    retry = self.provider.retry
    cmd = ["globus-url-copy"] + list(args)
    for i in range(retry + 1):
        try:
            logger.debug(" ".join(cmd))
            return sp.run(cmd, check=True, stderr=sp.PIPE,
                          stdout=sp.PIPE).stdout.decode()
        except sp.CalledProcessError as e:
            if i == retry:
                raise WorkflowError(
                    "Error calling globus-url-copy {}:\n{}".format(
                        " ".join(cmd), e.stderr.decode()))
            else:
                # try again after some seconds
                time.sleep(1)
                continue
def __init__(self, *args, keep_local=False, provider=None, **kwargs):
    super(RemoteObject, self).__init__(*args,
                                       keep_local=keep_local,
                                       provider=provider,
                                       **kwargs)

    bucket_name = 'test-static-remote-bucket'
    test_files = ('test.txt', 'out1.txt', 'out2.txt')

    conn = boto.connect_s3()
    if bucket_name not in [b.name for b in conn.get_all_buckets()]:
        conn.create_bucket(bucket_name)

    # "Upload" files that should be in S3 before tests...
    s3c = S3Helper()
    for test_file in test_files:
        if not s3c.exists_in_bucket(bucket_name, test_file):
            logger.debug(
                "Pre-populating remote bucket {} with file {}".format(
                    bucket_name, test_file))
            s3c.upload_to_s3(bucket_name, test_file)
def _gfal(self, cmd, *args, retry=None, raise_workflow_error=True):
    if retry is None:
        retry = self.provider.retry
    _cmd = ["gfal-" + cmd] + list(args)
    for i in range(retry + 1):
        try:
            logger.debug(_cmd)
            return sp.run(_cmd, check=True, stderr=sp.PIPE,
                          stdout=sp.PIPE).stdout.decode()
        except sp.CalledProcessError as e:
            if i == retry:
                if raise_workflow_error:
                    raise WorkflowError(
                        "Error calling gfal-{}:\n{}".format(
                            cmd, e.stderr.decode()))
                else:
                    raise e
            else:
                # try again after some seconds
                time.sleep(1)
                continue
def update_dynamic(self, job):
    """Update the DAG by evaluating the output of the given job that
    contains dynamic output files."""
    dynamic_wildcards = job.dynamic_wildcards
    if not dynamic_wildcards:
        # this happens e.g. in dryrun if output is not yet present
        return

    depending = list(
        filter(lambda job_: not self.finished(job_),
               self.bfs(self.depending, job)))
    newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
        dynamic_wildcards, input=False)
    self.specialize_rule(job.rule, newrule)

    # no targetfile needed for job
    newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
    self.replace_job(job, newjob)
    for job_ in depending:
        if job_.dynamic_input:
            newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
            if newrule_ is not None:
                self.specialize_rule(job_.rule, newrule_)
                if not self.dynamic(job_):
                    logger.debug("Updating job {}.".format(job_))
                    newjob_ = Job(newrule_, self, targetfile=job_.targetfile)

                    unexpected_output = self.reason(
                        job_).missing_output.intersection(
                            newjob.existing_output)
                    if unexpected_output:
                        logger.warning(
                            "Warning: the following output files of rule {} were not "
                            "present when the DAG was created:\n{}".format(
                                newjob_.rule, unexpected_output))

                    self.replace_job(job_, newjob_)
    return newjob
def _get_bucket(self):
    """get a connection to the storage bucket (self.bucket) and exit
    if the name is taken or otherwise invalid.

    Parameters
    ==========
    workflow: the workflow object to derive the prefix from
    """
    import google

    # Hold path to requested subdirectory and main bucket
    bucket_name = self.workflow.default_remote_prefix.split("/")[0]
    self.gs_subdir = re.sub("^{}/".format(bucket_name), "",
                            self.workflow.default_remote_prefix)
    self.gs_logs = os.path.join(self.gs_subdir, "google-lifesciences-logs")

    # Case 1: The bucket already exists
    try:
        self.bucket = self._bucket_service.get_bucket(bucket_name)

    # Case 2: The bucket needs to be created
    except google.cloud.exceptions.NotFound:
        self.bucket = self._bucket_service.create_bucket(bucket_name)

    # Case 3: The bucket name is already taken or otherwise invalid
    except Exception as ex:
        logger.error("Cannot get or create {}:\n{}".format(bucket_name, ex))
        log_verbose_traceback(ex)
        raise ex

    logger.debug("bucket=%s" % self.bucket.name)
    logger.debug("subdir=%s" % self.gs_subdir)
    logger.debug("logs=%s" % self.gs_logs)
def _job_was_successful(self, status):
    """based on a status response (from
    projects.locations.operations.get), debug print the list of events
    and return True if all return codes are 0, False otherwise
    (indication of failure). If a nonzero exit status is found, we also
    debug print it for the user.
    """
    success = True

    # https://cloud.google.com/life-sciences/docs/reference/rest/v2beta/Event
    for event in status["metadata"]["events"]:

        logger.debug(event["description"])

        # Does it always result in fail for other failure reasons?
        if "failed" in event:
            success = False
            action = event.get("failed")
            logger.debug("{}: {}".format(action["code"], action["cause"]))

        elif "unexpectedExitStatus" in event:
            action = event.get("unexpectedExitStatus")

            if action["exitStatus"] != 0:
                success = False

                # Provide reason for the failure (desc includes exit code)
                msg = "%s" % event["description"]
                if "stderr" in action:
                    msg += ": %s" % action["stderr"]
                logger.debug(msg)

    return success
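# A minimal, hypothetical status payload of the shape this method walks
# (field names follow the v2beta Event reference linked above; the values
# are made up for illustration):
#
#     status = {
#         "metadata": {
#             "events": [
#                 {"description": "Worker released"},
#                 {
#                     "description": "Execution failed: exit status 1",
#                     "unexpectedExitStatus": {"exitStatus": 1,
#                                              "stderr": "oops"},
#                 },
#             ]
#         }
#     }
#
# For this input the method logs both event descriptions and returns False.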
def parseWBInfosFromRFiles(script_dir="Scripts", htmlPath="Output/html"):
    """
    :param script_dir: Relative path to the Scripts directory
    :param htmlPath: Relative path to the html output path
    :return: a list of dictionaries with fields:
        - file - the input R file
        - outputFile - where to put the output html file
        - param - parsed yaml params
    """
    parsedInfos = []
    #errorOccurred = False
    for filename in findFilesRecursive(script_dir, ['*.r', '*.R']):
        if not hasYAMLHeader(filename):
            # Ignore files without YAML infos
            continue
        header = parseYAMLHeader(filename)
        # run all the syntax checks - will raise an error if it fails
        yamlParamsDict = parseYamlParams(header, filename)
        if yamlParamsDict is None:
            # parsing error occurred; go on parsing the next file
            continue
        if type(yamlParamsDict) is str:
            # allow parsing one tag without a colon as a string;
            # put it in a dict and check later on
            yamlParamsDict = {yamlParamsDict: None}

        if 'wb' in yamlParamsDict:
            # the header contains wb information
            outFile = htmlPath + "/" + pathsepsToUnderscore(
                os.path.splitext(filename)[0]) + ".html"
            parsedInfos.append({
                'file': linuxify(filename),
                'outputFile': outFile,
                'param': yamlParamsDict
            })

    logger.debug("Parsed information from R files: " + str(parsedInfos))
    #if errorOccurred:
    #    raise ValueError("Errors occurred in parsing the R files. Please fix them.")  # TODO really raise a ValueError?
    return parsedInfos
def parseMDFiles(script_dir="Scripts", htmlPath="Output/html"):
    """
    :param script_dir: Relative path to the Scripts directory
    :param htmlPath: Relative path to the html output path
    :return: a list of dictionaries with fields:
        - file - the input .md file
        - outputFile - where to put the output html file
        - param - parsed yaml header - always an empty list
    """
    logger.debug("Finding .md files:\n")
    foundMDFiles = []
    for f in findFilesRecursive(script_dir, ['*.md']):
        outFile = htmlPath + "/" + pathsepsToUnderscore(
            os.path.splitext(f)[0]) + ".html"
        logger.debug("Found " + outFile + ".\n")
        foundMDFiles.append({
            'file': linuxify(f),
            'outputFile': outFile,
            'param': []
        })
    return foundMDFiles
def findFilesRecursive(startingPath, patterns):
    """
    :param startingPath: root path of the search
    :param patterns: patterns to search file names for
    :return: paths to files matching the patterns
    """
    matchedFilepaths = []
    for root, dirnames, filenames in os.walk(startingPath):
        # prune directories starting with '_' or '.' from the walk
        dirnames[:] = [d for d in dirnames if not d[0] == '_']
        dirnames[:] = [d for d in dirnames if not d[0] == '.']
        for file in reduce(operator.add,
                           (fnmatch.filter(filenames, p) for p in patterns)):
            checkFilename(file)
            absFilepath = os.path.join(root, file)
            if not absFilepath in matchedFilepaths:
                matchedFilepaths.append(absFilepath)
    sortedMatchedFilepaths = sorted(matchedFilepaths)
    conf = Config()
    regex = re.compile(conf.get("fileRegex"))
    reFiles = list(filter(regex.search, sortedMatchedFilepaths))
    logger.debug("Found files in scope of wBuild: " + str(reFiles) + ".\n")
    return reFiles
def run(self, job,
        callback=None,
        submit_callback=None,
        error_callback=None):
    super()._run(job)
    workdir = os.getcwd()
    jobid = self.dag.jobid(job)
    properties = job.json()

    jobscript = self.get_jobscript(job)
    jobfinished = os.path.join(self.tmpdir, "{}.jobfinished".format(jobid))
    jobfailed = os.path.join(self.tmpdir, "{}.jobfailed".format(jobid))
    with open(jobscript, "w") as f:
        print(format(self.jobscript,
                     workflow=self.workflow,
                     cores=self.cores),
              file=f)
    os.chmod(jobscript, os.stat(jobscript).st_mode | stat.S_IXUSR)

    deps = " ".join(self.external_jobid[f] for f in job.input
                    if f in self.external_jobid)
    submitcmd = job.format_wildcards(self.submitcmd, dependencies=deps)
    try:
        ext_jobid = subprocess.check_output(
            '{submitcmd} "{jobscript}"'.format(submitcmd=submitcmd,
                                               jobscript=jobscript),
            shell=True).decode().split("\n")
    except subprocess.CalledProcessError as ex:
        raise WorkflowError(
            "Error executing jobscript (exit code {}):\n{}".format(
                ex.returncode, ex.output.decode()),
            rule=job.rule)
    if ext_jobid and ext_jobid[0]:
        ext_jobid = ext_jobid[0]
        self.external_jobid.update((f, ext_jobid) for f in job.output)
        logger.debug("Submitted job {} with external jobid {}.".format(
            jobid, ext_jobid))

    thread = threading.Thread(target=self._wait_for_job,
                              args=(job, callback, error_callback,
                                    jobscript, jobfinished, jobfailed))
    thread.daemon = True
    thread.start()
    self.threads.append(thread)

    submit_callback(job)
def createSampleFileMapping(self):
    """
    create a sample file mapping with unique entries of existing files
    columns: [ID | ASSAY | FILE_TYPE | FILE_PATH]
    """

    assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'],
                     'DNA_ID': ['DNA_VCF_FILE']}
    assay_subsets = []
    for id_, file_types in assay_mapping.items():
        for file_type in file_types:
            df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy()
            df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True)
            df['ASSAY'] = id_
            df['FILE_TYPE'] = file_type
            assay_subsets.append(df)
    file_mapping = pd.concat(assay_subsets)

    # cleaning SAMPLE_FILE_MAPPING
    file_mapping.dropna(inplace=True)
    file_mapping.drop_duplicates(inplace=True)

    # check for missing files
    existing = utils.checkFileExists(file_mapping["FILE_PATH"])
    if len(existing) == 0:
        message = "File mapping is empty. "
        message += "Please check that all files in your sample annotation exist."
        raise FileNotFoundError(message)
    elif len(existing) < file_mapping.shape[0]:
        missing = set(file_mapping["FILE_PATH"]) - set(existing)
        logger.info(f"WARNING: {len(missing)} files missing in samples annotation. Ignoring...")
        logger.debug(f"Missing files: {missing}")
        file_mapping = file_mapping[file_mapping["FILE_PATH"].isin(existing)]

    # write file mapping
    file_mapping.to_csv(self.root / "file_mapping.csv", index=False)

    return file_mapping
def writeDependencyFile():
    """
    Entry point for writing .wBuild.depend.
    """
    #if not wbuildVersionIsCurrent():
    #    print(bcolors.WARNING + "Version of the project's static .wBuild lib is not the same as "
    #          "the dynamically loaded wBuild version. It is strongly recommended to update the "
    #          ".wBuild lib using 'wbuild update'; otherwise, the consistency of the build can "
    #          "not be guaranteed." + bcolors.ENDC)
    logger.info("Structuring dependencies...")
    conf = Config()
    htmlOutputPath = conf.get("htmlOutputPath")
    logger.debug("Loaded config.\n html output path (key htmlOutputPath): " +
                 htmlOutputPath + "\n")
    scriptsPath = conf.get("scriptsPath")
    wbData = parseWBInfosFromRFiles(script_dir=scriptsPath,
                                    htmlPath=htmlOutputPath)
    mdData = parseMDFiles(script_dir=scriptsPath, htmlPath=htmlOutputPath)
    dependFile = tempfile.NamedTemporaryFile('w', delete=False)
    with dependFile as f:
        # start off with the header
        f.write('######\n')
        f.write('#This is an autogenerated snakemake file by wBuild\n')
        f.write('#wBuild by Leonhard Wachutka\n')
        f.write('######\n')
        # write rules
        for r in wbData:
            writeRule(r, f)
        # write md rules
        for r in mdData:
            writeMdRule(r, f)
        # write build index rule
        writeIndexRule(wbData, mdData, f)
    logger.info("Dependencies file generated.\n")
    return dependFile.name
def _get_task(self, job, jobscript):
    import tes

    checkdir, _ = os.path.split(self.snakefile)

    task = {}
    task["name"] = job.format_wildcards(self.jobname)
    task["description"] = self._get_task_description(job)
    task["inputs"] = self._get_task_inputs(job, jobscript, checkdir)
    task["outputs"] = self._get_task_outputs(job, checkdir)
    task["executors"] = self._get_task_executors()
    task["resources"] = tes.models.Resources()

    # define resources
    if "_cores" in job.resources:
        task["resources"]["cpu_cores"] = job.resources["_cores"]
    if "mem_mb" in job.resources:
        task["resources"]["ram_gb"] = job.resources["mem_mb"] / 1000
    if "disk_mb" in job.resources:
        task["resources"]["disk_gb"] = job.resources["disk_mb"] / 1000

    tes_task = tes.Task(**task)
    logger.debug("[TES] Built task: {task}".format(task=tes_task))
    return tes_task
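# Illustration of the resource conversion above (values hypothetical): a job
# with resources {"_cores": 4, "mem_mb": 8000, "disk_mb": 20000} maps to
# cpu_cores=4, ram_gb=8.0, and disk_gb=20.0 in the TES task, since TES
# expresses memory and disk in GB rather than MB.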
def update_dynamic(self, job):
    dynamic_wildcards = job.dynamic_wildcards
    if not dynamic_wildcards:
        # this happens e.g. in dryrun if output is not yet present
        return

    depending = list(
        filter(lambda job_: not self.finished(job_),
               self.bfs(self.depending, job)))
    newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
        dynamic_wildcards, input=False)
    self.replace_rule(job.rule, newrule)

    # no targetfile needed for job
    newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
    self.replace_job(job, newjob)
    for job_ in depending:
        if job_.dynamic_input:
            newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
            if newrule_ is not None:
                self.replace_rule(job_.rule, newrule_)
                if not self.dynamic(job_):
                    logger.debug("Updating job {}.".format(job_))
                    newjob_ = Job(newrule_, self, targetfile=job_.targetfile)

                    unexpected_output = self.reason(
                        job_).missing_output.intersection(
                            newjob.existing_output)
                    if unexpected_output:
                        raise UnexpectedOutputException(newjob_.rule,
                                                        unexpected_output)

                    self.replace_job(job_, newjob_)
    return newjob
def unlock(self, *args):
    logger.debug("unlocking")
    for lockfile in self._lockfile.values():
        try:
            logger.debug("removing lock")
            os.remove(lockfile)
        except OSError as e:
            if e.errno != 2:  # missing file
                raise e
    logger.debug("removed all locks")
def mtime(self):
    if self.s3_key == "test.txt":
        logger.debug("test.txt is new")
        return float("inf")
    elif self.s3_key.startswith("out"):
        logger.debug("{} is old".format(self.s3_key))
        # For some reason this breaks if you return 0
        return 5
    else:
        logger.debug("Using real mtime for {}".format(self.s3_key))
        return super().mtime()
def test_wrappers(args_dict):
    """Run the test workflow of each selected wrapper, or clean up its
    output data if requested."""
    # Cleanup data and leave
    if args_dict["clean_output"]:
        logger.info("Removing output data")
        for wrapper_name in args_dict["wrappers"]:
            wrapper_workdir = os.path.join(args_dict["workdir"], wrapper_name)
            shutil.rmtree(wrapper_workdir, ignore_errors=True)
        sys.exit()

    # Test wrappers
    for wrapper_name in args_dict["wrappers"]:
        logger.warning("Testing Wrapper {}".format(wrapper_name))
        try:
            snakefile = get_snakefile_fn(workflow_dir=WRAPPER_DIR,
                                         workflow=wrapper_name)
            wrapper_workdir = os.path.join(args_dict["workdir"], wrapper_name)
            logger.debug("Working in directory: {}".format(wrapper_workdir))

            # Run Snakemake through the API
            snakemake(snakefile=snakefile,
                      workdir=wrapper_workdir,
                      config={"data_dir": DATA_DIR},
                      wrapper_prefix=WRAPPER_PREFIX,
                      use_conda=True,
                      cores=args_dict["cores"],
                      verbose=args_dict["verbose"],
                      quiet=args_dict["quiet"])
        finally:
            logger.debug("List of files generated: {}".format(
                os.listdir(wrapper_workdir)))
            shutil.rmtree(os.path.join(wrapper_workdir, ".snakemake"),
                          ignore_errors=True)
            if not args_dict["keep_output"]:
                logger.debug("Removing temporary directory")
                shutil.rmtree(wrapper_workdir, ignore_errors=True)
def _wait_for_jobs(self):
    """wait for jobs to complete. This means requesting their status,
    and then marking them as finished when a "done" parameter
    shows up. Even for finished jobs, the status request should still
    return a response.
    """
    import googleapiclient

    while True:
        # always use self.lock to avoid race conditions
        with self.lock:
            if not self.wait:
                return
            active_jobs = self.active_jobs
            self.active_jobs = list()
            still_running = list()

        # Loop through active jobs and act on status
        for j in active_jobs:

            # use self.status_rate_limiter to avoid too many API calls.
            with self.status_rate_limiter:

                # https://cloud.google.com/life-sciences/docs/reference/rest/v2beta/projects.locations.operations/get
                # Get status from projects.locations.operations/get
                operations = self._api.projects().locations().operations()
                request = operations.get(name=j.jobname)
                logger.debug("Checking status for operation {}".format(
                    j.jobid))

                try:
                    status = self._retry_request(request)
                except googleapiclient.errors.HttpError as ex:

                    # Operation name not found, even finished should be found
                    if ex.status == 404:
                        j.error_callback(j.job)
                        continue

                    # Unpredictable server (500) error
                    elif ex.status == 500:
                        logger.error(ex["content"].decode("utf-8"))
                        j.error_callback(j.job)

                except WorkflowError as ex:
                    print_exception(ex, self.workflow.linemaps)
                    j.error_callback(j.job)
                    continue

                # The operation is done
                if status.get("done", False) == True:

                    # Derive success/failure from status codes (prints too)
                    if self._job_was_successful(status):
                        j.callback(j.job)
                    else:
                        self.print_job_error(j.job, jobid=j.jobid)
                        j.error_callback(j.job)

                # The operation is still running
                else:
                    still_running.append(j)

        with self.lock:
            self.active_jobs.extend(still_running)
        sleep()
def log_verbose_traceback(ex):
    tb = "Full " + "".join(traceback.format_exception(type(ex), ex,
                                                      ex.__traceback__))
    logger.debug(tb)
def _generate_job_resources(self, job):
    """given a particular job, generate the resources that it needs,
    including default regions and the virtual machine configuration
    """
    # Right now, do a best effort mapping of resources to instance types
    cores = job.resources.get("_cores", 1)
    mem_mb = job.resources.get("mem_mb", 15360)

    # IOPS performance proportional to disk size
    disk_mb = job.resources.get("disk_mb", 512000)

    # Convert mb to gb
    disk_gb = math.ceil(disk_mb / 1024)

    # Look for if the user wants an nvidia gpu
    gpu_count = job.resources.get("nvidia_gpu") or job.resources.get("gpu")
    gpu_model = job.resources.get("gpu_model")

    # If a gpu model is specified without a count, we assume 1
    if gpu_model and not gpu_count:
        gpu_count = 1

    # Update default resources using decided memory and disk
    self.workflow.default_resources = self.default_resources
    self.workflow.default_resources.args = [
        "mem_mb=%s" % mem_mb,
        "disk_mb=%s" % disk_mb,
    ]
    self.workflow.default_resources.parsed["mem_mb"] = mem_mb
    self.workflow.default_resources.parsed["disk_mb"] = disk_mb

    # Job resource specification can be overridden by gpu preferences
    self.machine_type_prefix = job.resources.get("machine_type")

    # If gpu wanted, limit to N1 general family, and update arguments
    if gpu_count:
        self._add_gpu(gpu_count)

    machine_types = self.get_available_machine_types()

    # Alert the user of machine_types available before filtering
    # https://cloud.google.com/compute/docs/machine-types
    logger.debug(
        "found {} machine types across regions {} before filtering; "
        "to increase selection, define fewer regions".format(
            len(machine_types), self.regions))

    # First pass - eliminate anything that is too low in cpu/memory
    keepers = dict()

    # Also keep track of max cpus and memory, in case none available
    max_cpu = 1
    max_mem = 15360

    for name, machine_type in machine_types.items():
        max_cpu = max(max_cpu, machine_type["guestCpus"])
        max_mem = max(max_mem, machine_type["memoryMb"])
        if (machine_type["guestCpus"] < cores or
                machine_type["memoryMb"] < mem_mb):
            continue
        keepers[name] = machine_type

    # If a prefix is set, filter down to it
    if self.machine_type_prefix:
        machine_types = keepers
        keepers = dict()
        for name, machine_type in machine_types.items():
            if name.startswith(self.machine_type_prefix):
                keepers[name] = machine_type

    # If we don't have any contenders, workflow error
    if not keepers:
        if self.machine_type_prefix:
            raise WorkflowError(
                "Machine prefix {prefix} is too strict, or the resources "
                "cannot be satisfied, so there are no options "
                "available.".format(prefix=self.machine_type_prefix))
        else:
            raise WorkflowError(
                "You requested {requestMemory} MB memory, {requestCpu} cores. "
                "The maximum available are {availableMemory} MB memory and "
                "{availableCpu} cores. These resources cannot be satisfied. "
                "Please consider reducing the resource requirements of the "
                "corresponding rule.".format(
                    requestMemory=mem_mb,
                    requestCpu=cores,
                    availableCpu=max_cpu,
                    availableMemory=max_mem,
                ))

    # Now find (quasi) minimal machine type to satisfy constraints
    machine_types = keepers

    # Select the first as the "smallest"
    smallest = list(machine_types.keys())[0]
    min_cores = machine_types[smallest]["guestCpus"]
    min_mem = machine_types[smallest]["memoryMb"]

    for name, machine_type in machine_types.items():
        if (machine_type["guestCpus"] < min_cores and
                machine_type["memoryMb"] < min_mem):
            smallest = name
            min_cores = machine_type["guestCpus"]
            min_mem = machine_type["memoryMb"]

    selected = machine_types[smallest]
    logger.debug("Selected machine type {}:{}".format(
        smallest, selected["description"]))

    virtual_machine = {
        "machineType": smallest,
        "labels": {"app": "snakemake"},
        "bootDiskSizeGb": disk_gb,
        "preemptible": job.rule.name in self.preemptible_rules,
    }

    # If the user wants gpus, add accelerators here
    if gpu_count:
        accelerator = self._get_accelerator(gpu_count,
                                            zone=selected["zone"],
                                            gpu_model=gpu_model)
        virtual_machine["accelerators"] = [{
            "type": accelerator["name"],
            "count": gpu_count,
        }]

    resources = {"regions": self.regions, "virtualMachine": virtual_machine}
    return resources
def __init__(
    self,
    workflow,
    dag,
    cores,
    jobname="snakejob.{name}.{jobid}.sh",
    printreason=False,
    quiet=False,
    printshellcmds=False,
    container_image=None,
    regions=None,
    location=None,
    cache=False,
    latency_wait=3,
    local_input=None,
    restart_times=None,
    exec_job=None,
    max_status_checks_per_second=1,
    preemption_default=None,
    preemptible_rules=None,
):
    # Attach variables for easy access
    self.workflow = workflow
    self.quiet = quiet
    self.workdir = os.path.dirname(self.workflow.persistence.path)
    self._save_storage_cache = cache

    # Relative path for running on instance
    self._set_snakefile()

    # Prepare workflow sources for build package
    self._set_workflow_sources()

    exec_job = (exec_job or (
        "snakemake {target} --snakefile %s "
        "--force -j{cores} --keep-target-files --keep-remote "
        "--latency-wait 0 --scheduler {workflow.scheduler_type} "
        "--attempt 1 {use_threads} --max-inventory-time 0 "
        "{overwrite_config} {rules} --nocolor "
        "--notemp --no-hooks --nolock " % self.snakefile)
        + self.get_set_threads_args()
        + self.get_set_scatter_args())

    # Set preemptible instances
    self._set_preemptible_rules(preemption_default, preemptible_rules)

    # IMPORTANT: using Compute Engine API and not k8s == no support secrets
    self.envvars = list(self.workflow.envvars) or []

    # Quit early if we can't authenticate
    self._get_services()
    self._get_bucket()

    # Akin to Kubernetes, create a run namespace, default container image
    self.run_namespace = str(uuid.uuid4())
    self.container_image = container_image or get_container_image()
    self.regions = regions or ["us-east1", "us-west1", "us-central1"]

    # The project name is required, either from client or environment
    self.project = (os.environ.get("GOOGLE_CLOUD_PROJECT")
                    or self._bucket_service.project)

    # Determine API location based on user preference, and then regions
    self._set_location(location)

    # Tell the user right away the regions, location, and container
    logger.debug("regions=%s" % self.regions)
    logger.debug("location=%s" % self.location)
    logger.debug("container=%s" % self.container_image)

    # Keep track of build packages to clean up shutdown, and generate
    self._build_packages = set()
    targz = self._generate_build_source_package()
    self._upload_build_source_package(targz)

    # Save default resources to add later, since we need to add custom
    # default resources depending on the instance requested
    self.default_resources = self.workflow.default_resources
    self.workflow.default_resources.args = None

    super().__init__(
        workflow,
        dag,
        None,
        jobname=jobname,
        printreason=printreason,
        quiet=quiet,
        printshellcmds=printshellcmds,
        latency_wait=latency_wait,
        restart_times=restart_times,
        exec_job=exec_job,
        assume_shared_fs=False,
        max_status_checks_per_second=10,
    )
def _set_location(self, location=None):
    """The location is where the Google Life Sciences API is located.
    This can be meaningful if the requester has data residency
    requirements or multi-zone needs. To determine this value,
    we first use the locations API to determine locations available,
    and then compare them against:

    1. user specified location or prefix
    2. regions having the same prefix
    3. if cannot be satisfied, we throw an error.
    """
    # Derive available locations
    # See https://cloud.google.com/life-sciences/docs/concepts/locations
    locations = (self._api.projects().locations().list(
        name="projects/{}".format(self.project)).execute())

    locations = {
        x["locationId"]: x["name"] for x in locations.get("locations", [])
    }

    # Alert the user about locations available
    logger.debug("locations-available:\n%s" % "\n".join(locations))

    # If no locations, there is something wrong
    if not locations:
        raise WorkflowError(
            "No locations found for Google Life Sciences API.")

    # First pass, attempt to match the user-specified location (or prefix)
    if location:
        if location in locations:
            self.location = locations[location]
            return

        # It could be that a prefix was provided
        for contender in locations:
            if contender.startswith(location):
                self.location = locations[contender]
                return

        # If we get here and no match, alert user.
        raise WorkflowError(
            "Location or prefix requested %s is not available." % location)

    # If we get here, we need to select location from regions
    for region in self.regions:
        if region in locations:
            self.location = locations[region]
            return

    # If we get here, choose based on prefix
    prefixes = set([r.split("-")[0] for r in self.regions])
    regexp = "^(%s)" % "|".join(prefixes)
    for location in locations:
        if re.search(regexp, location):
            self.location = locations[location]
            return

    # If we get here, total failure of finding location
    raise WorkflowError(
        "No locations available for regions! Please specify a location "
        "with --google-lifesciences-location or extend "
        "--google-lifesciences-regions to find a Life Sciences location.")
def set_temporary_output(*rules):
    """Set the output of rules to temporary"""
    for rule in rules:
        logger.debug(
            "setting output of rule '{rule}' to temporary".format(rule=rule))
        rule.temp_output = set(rule.output)
def set_protected_output(*rules):
    """Set the output of rules to protected"""
    for rule in rules:
        logger.debug(
            "setting output of rule '{rule}' to protected".format(rule=rule))
        rule.protected_output = set(rule.output)