def __init__(
    self,
    wid: str,
    container: Type[Container],
    datadirectory: str,
    forwardedport: int,
    confdir: str,
    containerdir: str,  # for singularity containers
):
    import os.path

    Logger.debug(
        f"Preparing {container.__name__} MySQL container with info: wid={wid}, port={forwardedport}, confdir={confdir}"
    )
    self._containertype = container

    self.container: Container = container(
        self.MYSQL_CONTAINERNAME,
        instancename="mariadb-" + wid,
        containerdir=containerdir,
    )
    self.datadirectory = datadirectory
    self.forwardedport = forwardedport
    self.confdir = confdir
    self.startupscriptsdir = os.path.join(self.confdir, "startup")
    self.sqlconfdir = os.path.join(self.confdir, "conf")
    self.mysqldoverride = os.path.join(self.confdir, "mysqld")

def exec_command(self, command):
    cmd = ["docker", "exec", "-i", self.dockerid]
    requiresshell = not isinstance(command, list)
    if isinstance(command, list):
        cmd.extend(command)
    else:
        cmd.append(command)
    try:
        Logger.info("Executing command: " + " ".join(cmd))
        val = (
            subprocess.check_output(cmd, shell=requiresshell)
            .decode("ascii")
            .strip()
        )
    except subprocess.CalledProcessError as e:
        Logger.critical(f"Docker exec_command failed '{e}': {e.output or e.stderr}")
        # the exec failed, so check the container logs for more context
        try:
            logs_command = ["docker", "logs", self.dockerid]
            Logger.info("Checking docker logs: " + " ".join(logs_command))
            Logger.debug(subprocess.check_output(logs_command))
        except Exception:
            Logger.critical(f"Failed to get logs for container {self.dockerid}")

        return str(e), e.returncode

    return (val.strip() if val else val), 0

def get_digest(self, info: ContainerInfo) -> Optional[str]:
    try:
        token = self.get_token(info)
    except Exception as e:
        Logger.critical(
            f"Couldn't get digest for container (couldn't get token): '{str(info)}': {str(e)}"
        )
        return None

    if token:
        Logger.debug(
            f"Got token for '{info}': {token[: min(5, len(token) - 1)]}..."
        )

    try:
        req = self.build_request(info, token)
        Logger.debug(f"Requesting digest from: {req.full_url}")
        with request.urlopen(req) as response:
            rheaders = response.headers
            digest = rheaders.get("etag", rheaders.get("Docker-Content-Digest"))
            if digest is not None:
                digest = digest.replace("'", "").replace('"', "")
            return digest
    except Exception as e:
        Logger.critical(f"Couldn't get digest for container '{str(info)}': {str(e)}")

def __init__(self, dblocation, tablename, readonly=False):
    sqlitedict.logger.disabled = True
    ro = "r" if readonly else "c"
    Logger.debug(f"Opening connection to {dblocation}/{tablename} with mode {ro}")
    self.kvdb = sqlitedict.SqliteDict(
        dblocation, tablename=tablename, autocommit=True, flag=ro
    )

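# A minimal, hypothetical usage sketch of the SqliteDict opened above (the path and table
# name are illustrative only): it behaves like an ordinary dict persisted to a table in the
# SQLite file, flag="c" creates the table if it doesn't exist, and flag="r" opens read-only.
from sqlitedict import SqliteDict

with SqliteDict("/tmp/example-janis.db", tablename="progress", autocommit=True, flag="c") as kvdb:
    kvdb["submitted"] = True       # written through immediately because autocommit=True
    print(kvdb.get("submitted"))   # -> True
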
def start_container(self):
    command = ["singularity", "instance", "start"]

    # if self.environment_variables:
    #     command.extend(f"-e{k}={v}" for k, v in self.environment_variables.items())

    if self.bindpoints:
        command.extend(f"-B{v}:{k}" for k, v in self.bindpoints.items())

    if self.exposedports:
        command.extend(
            [
                "--net",
                "--network=none",
                "--network-args",
                *[f"portmap={v}:{k}/tcp" for k, v in self.exposedports.items()],
            ]
        )

    try:
        newenv = os.environ
        for k in self.environment_variables:
            newenv["SINGULARITYENV_" + k] = str(self.environment_variables[k])

        if not self.instancename:
            self.instancename = generate_new_id(set())

        command.extend([self.container_path(), self.instancename])
        Logger.info("Starting singularity with command: " + " ".join(command))

        out = subprocess.check_output(command, env=newenv, stderr=subprocess.STDOUT)
        Logger.debug(f"Singularity output: {out}")

        singrun = ["singularity", "run", "instance://" + self.instancename]
        Logger.debug("Started container, now running " + str(singrun))
        startprocess = subprocess.Popen(
            singrun, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        self.run_logger = ProcessLogger(
            startprocess,
            prefix="mysql: ",
            logfp=None,
            exit_function=self.runlogger_didexit,
        )

    except subprocess.CalledProcessError as e:
        raise Exception(
            f"An error occurred while starting a singularity container: {str(e)}.\n\tOutput: {str(e.output)}"
        )

def create_task_base(self, wf: Workflow, job: PreparedJob):
    forbiddenids = set()
    if job.store_in_central_db:
        try:
            with self.with_cursor() as cursor:
                forbiddenids = set(
                    t[0] for t in cursor.execute("SELECT id FROM tasks").fetchall()
                )
        except sqlite3.OperationalError as e:
            if "no such column: id" in repr(e):
                from shutil import move

                dt = datetime.utcnow()
                np = f"{job.db_path}.original-{dt.strftime('%Y%m%d')}"
                Logger.warn(f"Moving old janis-db to '{np}'")
                move(job.db_path, np)
                self._taskDB = None
                return self.create_task_base(wf=wf, job=job)
            raise

    submission_id = generate_new_id(forbiddenids)

    output_dir = fully_qualify_filename(job.output_dir)

    if not job.execution_dir:
        job.execution_dir = os.path.join(output_dir, "janis")
        Logger.debug(
            f"No execution-dir was provided, constructed one from the output-dir: {job.execution_dir}"
        )
    job.execution_dir = fully_qualify_filename(job.execution_dir)

    Logger.info(
        f"Starting task with id = '{submission_id}' | output dir: {job.output_dir} | execution dir: {job.execution_dir}"
    )

    row = TaskRow(
        submission_id, execution_dir=job.execution_dir, output_dir=output_dir
    )
    WorkflowManager.create_dir_structure(job.execution_dir)

    if job.store_in_central_db:
        self.get_lazy_db_connection().insert_task(row)
    else:
        Logger.info(
            f"Not storing task '{submission_id}' in database. To watch, use: 'janis watch {output_dir}'"
        )

    if self._connection:
        self._connection.commit()
        self._connection.close()
        self._taskDB = None
        self._connection = None

    return row

def deserialize_inner(val):
    if val is None:
        return None
    try:
        return json.loads(val)
    except Exception as ex:
        # Sometimes the MySQL database converts '"myvalue"' -> 'myvalue' (dropping the
        # quotes), so fall back to the plain string representation.
        Logger.debug(
            f"Couldn't deserialize value, using string representation instead (value: {repr(val)}): {repr(ex)}"
        )
        return str(val)

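# A small, self-contained sketch (standalone, not part of the module) of the fallback
# behaviour deserialize_inner relies on: JSON-encoded values round-trip through json.loads,
# while values whose quotes were stripped fail to parse and fall back to the plain string.
import json


def _fallback_load(val):
    try:
        return json.loads(val)
    except Exception:
        return str(val)


assert _fallback_load('"myvalue"') == "myvalue"  # proper JSON string
assert _fallback_load("myvalue") == "myvalue"    # quotes dropped -> falls back to str()
assert _fallback_load("[1, 2]") == [1, 2]        # JSON structures still deserialize
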
def db_connection(self):
    try:
        if self.readonly:
            Logger.debug(
                "Opening database connection in READONLY mode: " + self.db_path
            )
            return sqlite3.connect(f"file:{self.db_path}?mode=ro", uri=True)

        Logger.debug("Opening database connection: " + self.db_path)
        return sqlite3.connect(self.db_path)
    except Exception:
        Logger.critical("Error when opening DB connection to: " + self.db_path)
        raise

def db_connection(self):
    config = JanisConfiguration.manager()
    try:
        if self.readonly:
            Logger.debug(
                "Opening database connection in READONLY mode: " + config.dbpath
            )
            return sqlite3.connect(f"file:{config.dbpath}?mode=ro", uri=True)

        Logger.debug("Opening database connection: " + config.dbpath)
        return sqlite3.connect(config.dbpath)
    except Exception:
        Logger.critical("Error when opening DB connection to: " + config.dbpath)
        raise

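# A quick, self-contained illustration (temporary file, illustrative only) of the read-only
# URI form used by the two db_connection methods above: sqlite3 honours "file:<path>?mode=ro"
# when uri=True, and writes then fail with "attempt to write a readonly database".
import os
import sqlite3
import tempfile

_path = os.path.join(tempfile.mkdtemp(), "example.db")
sqlite3.connect(_path).close()  # create an empty database first

_ro = sqlite3.connect(f"file:{_path}?mode=ro", uri=True)
try:
    _ro.execute("CREATE TABLE t (id INTEGER)")  # rejected in mode=ro
except sqlite3.OperationalError as e:
    print(e)
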
def get(
    self,
    keys: Union[str, List[str]] = "*",
    where: Tuple[str, List[any]] = None,
    allow_operational_errors=True,
) -> Optional[List[T]]:
    jkeys = ", ".join(keys) if isinstance(keys, list) else keys
    if jkeys == "*":
        keys = [t.dbalias for t in self._base.keymap()]
        jkeys = ", ".join(keys) if isinstance(keys, list) else keys

    values = []
    whereclauses = []
    if self._scopes:
        scopes = self._scopes.items()
        whereclauses.extend(f"{k} = ?" for k, _ in scopes)
        values.extend(v for _, v in scopes)

    if where:
        whereclauses.append(where[0])
        values.extend(where[1])

    query = f"SELECT {jkeys} FROM {self._tablename}"

    if whereclauses:
        query += f" WHERE {' AND '.join(whereclauses)}"

    with self.with_cursor() as cursor:
        try:
            rows = cursor.execute(query, values).fetchall()
        except OperationalError as e:
            if not allow_operational_errors:
                raise e
            if "readonly database" in str(e):
                # mfranklin: this sometimes happens even though it's a SELECT query,
                # so skip for now rather than fail
                Logger.debug(
                    f"Got readonly error when running query: '{query}', skipping for now"
                )
                return None
            elif "locked" in str(e):
                Logger.debug(
                    f"We hit the janis database.{self._tablename} at the same time the janis process wrote to it, we'll skip for now"
                )
                return None
            raise

    parsedrows = [self._base.deserialize(keys, r) for r in rows]
    return parsedrows

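# A minimal sketch (table name, scope and where values are hypothetical) of the SELECT that
# get() assembles above: scope filters and the caller-supplied `where` clause are ANDed
# together, and the placeholder values are passed separately to cursor.execute().
example_keys = ["id", "status"]
example_scopes = {"submission_id": "abc123"}
example_where = ("status = ?", ["running"])

example_clauses = [f"{k} = ?" for k in example_scopes] + [example_where[0]]
example_values = list(example_scopes.values()) + example_where[1]
example_query = (
    f"SELECT {', '.join(example_keys)} FROM jobs WHERE {' AND '.join(example_clauses)}"
)
# example_query  -> "SELECT id, status FROM jobs WHERE submission_id = ? AND status = ?"
# example_values -> ["abc123", "running"]
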
def save_metadata_if_required(self):
    if self.database.progressDB.has(ProgressKeys.savedMetadata):
        return Logger.debug(f"Workflow '{self.wid}' has saved metadata, skipping")

    engine = self.get_engine()

    metadir = self.get_path_for_component(self.WorkflowManagerPath.metadata)
    if isinstance(engine, Cromwell):
        import json

        meta = engine.raw_metadata(self.get_engine_wid()).meta
        with open(os.path.join(metadir, "metadata.json"), "w+") as fp:
            json.dump(meta, fp)

    elif isinstance(engine, CWLTool):
        import json

        meta = engine.metadata(self.wid)
        self.set_status(meta.status)
        with open(os.path.join(metadir, "metadata.json"), "w+") as fp:
            json.dump(meta.outputs, fp)

    else:
        raise Exception(
            f"Don't know how to save metadata for engine '{engine.id()}'"
        )

    self.database.progressDB.set(ProgressKeys.savedMetadata)

def ensure_downloaded(self):
    pathed_container = self.container_path()

    if os.path.exists(pathed_container):
        return True

    command = self.get_build_instructions_for(pathed_container, self.container)
    Logger.info(
        "Couldn't find singularity container, building with: " + " ".join(command)
    )
    try:
        build_result = subprocess.check_output(command)
        Logger.debug(build_result)
    except subprocess.CalledProcessError as e:
        Logger.log_ex(e)

def submit_workflow_if_required(self):
    if self.database.progressDB.has(ProgressKeys.submitWorkflow):
        return Logger.log(
            f"Workflow '{self.wid}' has already been submitted, skipping"
        )

    fn_wf = self.database.workflowmetadata.submission_workflow
    fn_inp = self.database.workflowmetadata.submission_inputs
    fn_deps = self.database.workflowmetadata.submission_resources

    engine = self.get_engine()

    Logger.debug(f"Submitting task '{self.wid}' to '{engine.id()}'")
    self._engine_wid = engine.start_from_paths(self.wid, fn_wf, fn_inp, fn_deps)
    self.database.workflowmetadata.engine_wid = self._engine_wid

    Logger.info(
        f"Submitted workflow ({self.wid}), got engine id = '{self.get_engine_wid()}'"
    )
    self.database.progressDB.set(ProgressKeys.submitWorkflow)

def number_of_jobs(self):
    query = "SELECT COUNT(*) FROM jobs WHERE submission_id = ?"
    values = [self.submission_id]
    with self.with_cursor() as cursor:
        try:
            row = cursor.execute(query, values).fetchone()
            return row[0]
        except OperationalError as e:
            if "readonly database" in str(e):
                # mfranklin: this sometimes happens even though it's a SELECT query,
                # so skip for now rather than fail
                Logger.debug(
                    f"Got readonly error when running query: '{query}', skipping for now"
                )
                return None
            elif "locked" in str(e):
                Logger.debug(
                    f"We hit the janis database.{self._tablename} at the same time the janis process wrote to it, we'll skip for now"
                )
                return None
            raise

def check_extensions(inpid: str, datatype: DataType, path: str):
    """
    This method only WARNS about an incorrect extension
    """
    if not isinstance(datatype, File):
        return

    if not isinstance(path, str):
        Logger.warn(
            f"Expecting string type input '{inpid}' of type File, but received '{type(path)}'"
        )

    # check the extension (and in future, secondaries)
    pre_extensions = [
        datatype.extension,
        *list(datatype.alternate_extensions or []),
    ]
    extensions = {ext for ext in pre_extensions if ext is not None}

    if len(extensions) == 0:
        # skip because no extension
        return

    has_extension = False
    for ext in extensions:
        if path.endswith(ext):
            has_extension = True
            break

    if has_extension:
        # looks like we're sweet
        Logger.debug(
            f"Validated that the input for {inpid} had the expected extension for {datatype.id()}"
        )
        return

    Logger.warn(
        f"The input for '{inpid}' ({datatype.name()}) did not have the expected extension "
        f"{' OR '.join(extensions)}: {path}."
    )

def try_get_outputs_for(self, inpid, wf, inputs, output_dir, description):

    from janis_assistant.main import WorkflowManager, run_with_outputs

    if os.path.exists(output_dir):
        try:
            wm = WorkflowManager.from_path_get_latest_manager(
                output_dir, readonly=True
            )
            outs_raw = wm.database.outputsDB.get()
            outs = {
                o.id_: o.value or o.new_path
                for o in outs_raw
                if o.value or o.new_path
            }
            if len(outs) > 0:
                out_val = first_value(outs)
                Logger.info(
                    f"Using cached value of transformation ({description}) for {inpid}: {out_val}"
                )
                return out_val
            Logger.log(
                f"Didn't get any outputs from previous workflow manager when deriving input {inpid} ({description})"
            )

        except Exception as e:
            Logger.debug(
                f"Couldn't get outputs from existing output_path for {inpid}, '{output_dir}' ({description}): {e}"
            )

    outs = run_with_outputs(wf, inputs=inputs, output_dir=output_dir)
    if not outs or len(outs) < 1:
        Logger.critical(
            f"Couldn't get outputs from transformation ({description}) for '{inpid}'"
        )
        return None

    return first_value(outs)

def copy_outputs_if_required(self):
    if self.database.progressDB.has(ProgressKeys.copiedOutputs):
        return Logger.debug(f"Workflow '{self.wid}' has copied outputs, skipping")

    if self.database.workflowmetadata.status != TaskStatus.COMPLETED:
        return Logger.warn(
            f"Skipping copying outputs as workflow "
            f"status was not completed ({self.database.workflowmetadata.status})"
        )

    wf_outputs = self.database.outputsDB.get_all()
    engine_outputs = self.get_engine().outputs_task(self.get_engine_wid())
    eoutkeys = engine_outputs.keys()
    fs = self.environment.filescheme

    for out in wf_outputs:
        eout = engine_outputs.get(out.tag)

        if eout is None:
            Logger.warn(
                f"Couldn't find expected output with tag {out.tag}, found outputs ({', '.join(eoutkeys)})"
            )
            continue
        originalfile, newfilepath = self.copy_output(
            fs=fs,
            outputid=out.tag,
            prefix=out.prefix,
            tag=out.tags,
            secondaries=out.secondaries,
            extension=out.extension,
            engine_output=eout,
            iscopyable=out.iscopyable,
        )

        if isinstance(originalfile, list):
            originalfile = recursively_join(originalfile, "|")

        if isinstance(newfilepath, list):
            newfilepath = recursively_join(newfilepath, "|")

        self.database.outputsDB.update_paths(
            tag=out.tag, original_path=originalfile, new_path=newfilepath
        )

    self.database.progressDB.set(ProgressKeys.copiedOutputs)

    Logger.info(f"View the task outputs: file://{self.get_task_path()}")

def send_email(subject: str, body: str):
    import tempfile, os

    nots = PreparedJob.instance().notifications
    mail_program = nots.mail_program

    if not mail_program:
        return Logger.debug("Skipping email send as no mail program is configured")

    if not nots.email or nots.email.lower() == "none":
        Logger.debug("Skipping notify status change as no email")
        return

    emails: List[str] = (
        nots.email if isinstance(nots.email, list) else nots.email.split(",")
    )
    Logger.debug(f"Sending email with subject {subject} to {emails}")

    email_template = f"""\
Content-Type: text/html
To: {"; ".join(emails)}
From: {nots.from_email}
Subject: {subject}

{body}"""

    # 2020-08-24 mfranklin: Write to disk and cat, because some emails are just too big
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp:
            # write the rendered email to the temp file
            tmp.write(email_template)

        command = f"cat '{path}' | {mail_program}"
        Logger.log("Sending email with command: " + str(command.replace("\n", "\\n")))
        try:
            subprocess.call(command, shell=True)
            Logger.debug("Sent email successfully")
        except Exception as e:
            Logger.critical(f"Couldn't send email '{subject}' to {emails}: {e}")
    finally:
        os.remove(path)

def show_status_screen(self, **kwargs):
    """
    This function just polls the database for metadata every so often,
    and simply displays it. It will keep doing that until the task
    moves into a TERMINAL status.

    It's presumed that there's a janis-monitor that's watching the engine.

    The kwargs argument is for passing through formatting/refresh options
    through from the command-line.
    """

    if self.database.progressDB.has(ProgressKeys.workflowMovedToFinalState):
        meta = self.database.get_metadata()
        formatted = meta.format(**kwargs)
        print(formatted)
        return Logger.debug(f"Workflow '{self.wid}' has already finished, skipping")

    bl = None

    # try:
    #     import blessed
    #
    #     bl = blessed
    #
    # except Exception as e:
    #     txt = (
    #         "Couldn't load 'blessed' for screen display, defaulting back to clear(): "
    #         + str(e)
    #     )
    #     Logger.warn(txt)

    if bl is not None:
        self.poll_stored_metadata_with_blessed(bl)
    else:
        self.poll_stored_metadata_with_clear(**kwargs)

def prepare_and_output_workflow_to_evaluate_if_required(
    self,
    tool: Tool,
    translator: TranslatorBase,
    validation: Optional[ValidationRequirements],
    batchrun: Optional[BatchRunRequirements],
    hints: Dict[str, str],
    additional_inputs: dict,
    max_cores=None,
    max_memory=None,
    allow_empty_container=False,
    container_override: dict = None,
    check_files=True,
) -> Tool:
    if self.database.progressDB.has(ProgressKeys.saveWorkflow):
        return Logger.info(f"Saved workflow from task '{self.wid}', skipping.")

    Logger.debug(f"Saving workflow with id '{tool.id()}' to {translator.name}")

    outdir_workflow = self.get_path_for_component(self.WorkflowManagerPath.workflow)

    translator.translate(
        tool,
        to_console=False,
        to_disk=True,
        hints=hints,
        # This is just the base tool; we're going to potentially transform the inputs
        # and we only really care about the inputs for the workflow we're going to run.
        # We'll store the original workflow for provenance, but not to easily rerun.
        write_inputs_file=False,
        export_path=os.path.join(outdir_workflow, "original"),
        allow_empty_container=allow_empty_container,
        container_override=container_override,
    )

    Logger.info(f"Saved workflow with id '{tool.id()}' to '{outdir_workflow}'")

    modifiers = []
    if validation:
        modifiers.append(ValidatorPipelineModifier(validation))

    if batchrun:
        modifiers.append(BatchPipelineModifier(batchrun))

    modifiers.append(InputFileQualifierModifier)

    # THIS ONE SHOULD BE LAST
    modifiers.append(InputChecker(check_file_existence=check_files))

    tool_to_evaluate, additional_inputs = PipelineModifierBase.apply_many(
        modifiers, tool, additional_inputs, hints=hints
    )

    translator.translate(
        tool_to_evaluate,
        to_console=False,
        to_disk=True,
        with_resource_overrides=True,
        merge_resources=True,
        hints=hints,
        write_inputs_file=True,
        export_path=outdir_workflow,
        additional_inputs=additional_inputs,
        max_cores=max_cores,
        max_mem=max_memory,
        allow_empty_container=allow_empty_container,
        container_override=container_override,
    )

    self.evaluate_output_params(
        wf=tool_to_evaluate, additional_inputs=additional_inputs
    )

    self.database.progressDB.set(ProgressKeys.saveWorkflow)

    return tool_to_evaluate

def do_bed_fasta_contig_check(tool: Tool, inputs: Dict[str, any]):
    from janis_bioinformatics.data_types import Fasta, Bed, BedTabix

    supported_bed_types = (Bed, BedTabix)

    beds_inputs = []
    refs = []

    for i in tool.tool_inputs():
        if isinstance(i.intype, supported_bed_types) or (
            isinstance(i.intype, Array)
            and isinstance(i.intype.subtype(), supported_bed_types)
        ):
            beds_inputs.append(i)

        if (
            isinstance(i.intype, Fasta)
            and i.intype.secondary_files()
            and ".fai" in i.intype.secondary_files()
        ):
            refs.append(i)

    if len(refs) == 0:
        return
    if len(refs) > 1:
        Logger.info(
            "Skipping bioinformatics FASTA-BED file checks as there was more than 1 reference"
        )

    for inp_ref in refs:
        value_ref = inputs[inp_ref.id()]
        if not value_ref:
            Logger.warn(f"Skipping '{inp_ref.id()}' as no value was provided")
            continue

        ref_contigs = ContigChecker.get_list_of_contigs_from_fastafai(
            value_ref + ".fai"
        )

        if not ref_contigs:
            Logger.debug(
                f"Didn't get any contigs from ref {value_ref}.fai, skipping..."
            )
            continue

        for inp_bed in beds_inputs:
            value_bed = inputs[inp_bed.id()]
            is_array = isinstance(value_bed, list)
            beds = value_bed if is_array else [value_bed]
            for b_idx in range(len(beds)):
                bed = beds[b_idx]

                bed_contigs = ContigChecker.get_list_of_contigs_from_bed(bed)

                missing_contigs = bed_contigs - ref_contigs
                if missing_contigs:
                    inpname = f"{inp_bed.id()}.{b_idx}" if is_array else inp_bed.id()
                    contiglist = (
                        ", ".join(missing_contigs)
                        if len(missing_contigs) < 5
                        else (", ".join(list(missing_contigs)[:3]) + "...")
                    )
                    Logger.warn(
                        f"The BED file '{inpname}' contained {len(missing_contigs)} contigs ({contiglist}) that were missing from the reference: {value_ref}"
                    )

def guess_datatype_by_filename(filename: str):
    """
    We'll try to guess which datatype a file with name 'filename' is.
    Primarily, this looks at the extension, and whether the secondary files exist.
    :param filename:
    :return:
    """
    dts = JanisShed.get_all_datatypes()
    fs = FileScheme.get_type_by_prefix(filename)()
    if not isinstance(fs, LocalFileScheme):
        Logger.warn(
            f"The filescheme detected by Janis for '{filename}' was not LOCAL. This guess datatype process may rely on "
            f"polling the {fs.id()} file system to check if related files exist. This might have some financial cost involved."
        )

    file_exists_map = {}

    # each match has a score
    matches: List[Tuple[int, File]] = []

    for datatype in dts:
        if isclass(datatype):
            if not issubclass(datatype, File):
                continue
            datatype = get_instantiated_type(datatype)
        elif not isinstance(datatype, File):
            continue
        if not datatype.extension:
            continue
        datatype: File = datatype

        extensions = {datatype.extension, *(datatype.alternate_extensions or [])}

        matching_extension = None
        for ext in extensions:
            if filename.endswith(ext):
                matching_extension = ext
                break

        secondaries_match = True

        if datatype.secondary_files():
            for secondary in datatype.secondary_files():
                secondary_filename = apply_secondary_file_format_to_filename(
                    filename, secondary
                )
                if secondary not in file_exists_map:
                    file_exists_map[secondary] = fs.exists(secondary_filename)

                if not file_exists_map[secondary]:
                    secondaries_match = False
                    break
            if secondaries_match is False:
                continue

        # we got here, we're G
        if matching_extension is not None and secondaries_match:
            extension_reward = len(matching_extension) * EXTENSION_REWARD_MULTIPLER
            secondaries_reward = (
                len(datatype.secondary_files() or []) * SECONDARIES_REWARD_MULTIPLER
            )
            score = extension_reward + secondaries_reward

            matches.append((score, datatype))

    if len(matches) == 0:
        return None
    elif len(matches) == 1:
        return matches[0][1]
    else:
        matches = sorted(matches, key=lambda a: a[0], reverse=True)
        matched_dt = matches[0][1]
        ranked = ", ".join(f"{match[1].name()} ({match[0]})" for match in matches[1:])
        Logger.debug(
            f"There were {len(matches)} matching datatypes. Using {matched_dt.name()} ({matches[0][0]}) "
            f"as it was the best match from: {ranked}"
        )
        return matched_dt

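# A small sketch of the scoring used above. The multiplier values here are assumed for
# illustration only (the real EXTENSION_REWARD_MULTIPLER / SECONDARIES_REWARD_MULTIPLER
# constants are defined elsewhere in the module): longer matched extensions and more
# matched secondary files both push a datatype up the ranking.
_EXT_MULT = 1    # stands in for EXTENSION_REWARD_MULTIPLER, assumed value
_SEC_MULT = 20   # stands in for SECONDARIES_REWARD_MULTIPLER, assumed value


def _example_score(matching_extension: str, n_secondaries: int) -> int:
    return len(matching_extension) * _EXT_MULT + n_secondaries * _SEC_MULT


# e.g. a ".fasta" match with a present ".fai" secondary out-ranks a bare ".txt" match
assert _example_score(".fasta", 1) > _example_score(".txt", 0)
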
def __init__(
    self,
    config_dir: str = None,
    db_path: str = None,
    execution_dir=None,
    engine: str = None,
    cromwell: JanisConfigurationCromwell = None,
    template: JanisConfigurationTemplate = None,
    notifications: JanisConfigurationNotifications = None,
    environment: JanisConfigurationEnvironment = None,
    run_in_background: bool = None,
    digest_cache_location: str = None,
    # job information
    inputs: Dict = None,
    output_dir: str = None,
    keep_intermediate_files: bool = None,
    recipes: List[str] = None,
    hints: Dict[str, str] = None,
    allow_empty_container: bool = None,
    container_override: Dict[str, str] = None,
    skip_digest_lookup: bool = None,
    skip_digest_cache: bool = None,
    batchrun: Union[BatchRunRequirements, Dict] = None,
    store_in_central_db: bool = None,
    skip_file_check: bool = None,
    strict_inputs: bool = False,
    validation: ValidationRequirements = None,
    should_watch_if_background: bool = False,
    call_caching_enabled: bool = None,
    container_type: str = None,
    workflow_reference: str = None,
    post_run_script: str = None,
):
    """
    :param config_dir: The config_dir specifies where the janis.db, cromwell and workflow cache is.
    :param output_dir: The directory to copy outputs to. By default, execution occurs in this directory (under <output>/janis) unless overridden with the '--execution-dir' argument.
    :param execution_dir: The directory in which Janis meta and execution data is placed. If no execution directory is specified, it uses the path '<outputdir>/janis/'. Note that some templates may override the intermediate computation directory.
    :param engine:
    :param cromwell:
    :param template:
    :param notifications:
    :param run_in_background:
    :param digest_cache_location:
    :param inputs: YAML or JSON inputs file to provide values for the workflow (can specify multiple times)
    :param keep_intermediate_files: Do not remove execution directory on successful complete
    :param recipes:
    :param hints:
    :param allow_empty_container:
    :param container_override:
    :param skip_digest_lookup:
    :param skip_digest_cache:
    :param batchrun:
    :param store_in_central_db:
    :param skip_file_check:
    :param strict_inputs:
    :param validation:
    :param should_watch_if_background:
    :param call_caching_enabled:
    :param post_run_script:
    """
    self.config_dir = config_dir
    self.db_path = db_path
    self.output_dir = output_dir

    self.environment: JanisConfigurationEnvironment = parse_if_dict(
        JanisConfigurationEnvironment,
        environment or {},
        "environment",
        skip_if_empty=False,
    )

    self.engine = EngineType(engine) if engine else None

    requires_cromwell_config = self.engine == EngineType.cromwell
    self.cromwell: JanisConfigurationCromwell = parse_if_dict(
        JanisConfigurationCromwell,
        cromwell or {},
        "cromwell",
        skip_if_empty=not requires_cromwell_config,
    )
    self.template: JanisConfigurationTemplate = parse_if_dict(
        JanisConfigurationTemplate, template or {}, "template", skip_if_empty=False
    )
    self.notifications: JanisConfigurationNotifications = parse_if_dict(
        JanisConfigurationNotifications,
        notifications or {},
        "notifications",
        skip_if_empty=False,
    )

    self._workflow_reference = workflow_reference

    self.inputs = inputs
    self.hints = hints
    self.output_dir = output_dir
    self.execution_dir = execution_dir
    self.keep_intermediate_files = keep_intermediate_files
    self.recipes = recipes
    self.allow_empty_container = allow_empty_container
    self.container_override = container_override
    self.skip_digest_lookup = skip_digest_lookup
    self.skip_digest_cache = skip_digest_cache
    self.batchrun: Optional[BatchRunRequirements] = parse_if_dict(
        BatchRunRequirements, batchrun, "batchrun"
    )
    self.validation: Optional[ValidationRequirements] = parse_if_dict(
        ValidationRequirements, validation, "validation"
    )
    self.store_in_central_db = store_in_central_db
    self.skip_file_check = skip_file_check
    self.strict_inputs = strict_inputs
    self.should_watch_if_background = should_watch_if_background

    self.run_in_background = run_in_background
    self.digest_cache_location = digest_cache_location
    self.call_caching_enabled = call_caching_enabled

    self.post_run_script = post_run_script

    self.container_type = ContainerType(container_type)
    self._container = get_container_by_name(container_type)

    if not self._instance:
        Logger.debug("Setting prepared job")
        PreparedJob._instance = self
    else:
        Logger.debug("Setting prepared job, when already set")
        PreparedJob._instance = self

def check_input_for_correctness(self, inpid: str, dt: DataType, value: any):
    if isinstance(dt, Array):
        if isinstance(value, list):
            return [
                self.check_input_for_correctness(f"{inpid}[{idx}]", dt.subtype(), v)
                for idx, v in zip(range(len(value)), value)
            ]

    if not isinstance(dt, File):
        return value
    if not isinstance(value, str):
        Logger.warn(
            f"Expecting string type input '{inpid}' for type File, but received '{type(value)}'. "
            f"Janis won't transform this value, but you should confirm your inputs."
        )
        return value

    guessed_datatype = guess_datatype_by_filename(value)
    if not guessed_datatype:
        Logger.info(
            f"Couldn't guess datatype for {value}. Returning the value instead."
        )
        return value
    if dt.can_receive_from(guessed_datatype):
        Logger.debug(f"Input '{inpid}' had a compatible type")
        return value

    message_prefix = (
        f"The value for input '{inpid}' did not match the expected type {dt.name()} "
        f"through the extension and / or existence of secondary files"
    )
    if not guessed_datatype:
        Logger.warn(
            message_prefix
            + f"\nand Janis couldn't guess the datatype from the input for {inpid} and value '{value}'."
        )
        return value

    try:
        transformation = JanisShed.get_transformation_graph().find_connection(
            guessed_datatype, dt
        )
        steps = (
            "".join(t.type1.name() + " -> " for t in transformation)
            + transformation[-1].type2.name()
        )
        Logger.warn(
            message_prefix
            + f",\nJanis guessed the actual datatype for '{inpid}' from data '{value}' to be {guessed_datatype.id()}, "
            f"and Janis was able to determine a transformation in {len(transformation)} step(s): {steps}"
        )

        wf = JanisTransformation.convert_transformations_to_workflow(transformation)
        trans = wf.translate("wdl", to_console=False)[0]
        Logger.debug(
            f"Transforming {inpid} ({guessed_datatype.name()} -> {dt.name()}): {trans}"
        )
    except Exception as e:
        Logger.warn(
            message_prefix
            + f",\nbut Janis couldn't find a transformation between the guessed and expected type:"
            f" {guessed_datatype.name()} -> {dt.name()}: {str(e)}"
        )
        return value

    # maybe do some other things with respect to the path
    try:
        return self.try_get_outputs_for(
            inpid=inpid,
            wf=wf,
            inputs={wf.tool_inputs()[0].id(): value},
            output_dir=os.path.join(self.cache_dir, inpid),
            description=f"{guessed_datatype.name()} -> {dt.name()}",
        )

    except Exception as e:
        Logger.critical(
            f"An internal error occurred when performing the transformation for {inpid} "
            f"({guessed_datatype.name()} -> {dt.name()}): {str(e)}"
        )
        Logger.debug(traceback.format_exc())
        return value

def stop(self):
    Logger.debug("Received STOP request for MySQL container")
    self.container.stop_container()

def get_workflow_from_file(file, name, include_commandtools=False):
    # How to import a module given the full path
    # https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
    import importlib.util

    try:
        import sys

        basefilename = os.path.basename(file)

        sys.path.append(os.path.dirname(file))
        spec = importlib.util.spec_from_file_location("module.name", file)
        foo = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(foo)
        ptypes = get_janis_from_module_spec(
            foo, include_commandtools=include_commandtools, name=name
        )
    except Exception as e:
        raise Exception(
            f"Unrecognised python file when getting workflow / command tool: {file} :: {e}"
        )

    # Per https://github.com/PMCC-BioinformaticsCore/janis-core/issues/31, we'll use the following process:
    #   1. If a `name` is defined:
    #       - Force parse every token with a case-insensitive match
    #       - If a single item is returned from a case-sensitive match, then use that
    #   2. If multiple workflows are defined in the same file, use the last defined workflow
    #       - This covers the existing "if a single workflow is defined, use that" case
    #   3. If no tools were found, raise an Exception
    #   4. If multiple tools are defined in the file, use the last one:
    #       - If a name was defined, `warn` the user that the case-insensitive match returned no results and use the last one
    #       - Otherwise, just tell the user we'll use the last defined tool

    ptypes_casesensitive = [(k, v) for (k, v) in ptypes if k == name]
    if len(ptypes_casesensitive) == 1:
        return ptypes_casesensitive[0][1]

    if name is None:
        mains = [v for (k, v) in ptypes if k == "__JANIS_ENTRYPOINT"]
        if len(mains) > 0:
            Logger.debug(
                "Using workflow defined by '__JANIS_ENTRYPOINT' as no name was used"
            )
            return mains[0]

    wftypes = [
        t
        for t in ptypes
        if (
            issubclass(t[1], WorkflowBase)
            if isclass(t[1])
            else isinstance(t[1], WorkflowBase)
        )
    ]
    detected_tokens = ", ".join(
        f"'{x[0]}' ({x[1].__class__.__name__})" for x in ptypes
    )

    if len(wftypes) > 0:
        if len(wftypes) > 1:
            if name:
                Logger.warn(
                    f"Providing the `--name` parameter performs a case-insensitive search for the tokens in "
                    f"'{basefilename}', and a case-sensitive search returned no results. You had {len(wftypes)} "
                    f"tokens that matched this search. Janis will use the last one, defined as "
                    f"'{ptypes[-1][0]}' from: {detected_tokens}"
                )
            else:
                Logger.info(
                    f"Multiple workflows were found in '{basefilename}', using '{wftypes[-1][0]}'"
                )
        return wftypes[-1][1]

    if len(ptypes) == 0:
        raise Exception(
            f"There were no valid tools in '{file}', try running with the `--name YourToolName` parameter "
            f"to get more information (it might have abstract / unimplemented methods)."
        )
    if len(ptypes) > 1:
        if name:
            Logger.warn(
                f"Providing the `--name` parameter performs a case-insensitive search for the tokens in "
                f"'{basefilename}', and a case-sensitive search returned no results. You had {len(ptypes)} "
                f"tokens that matched this search. Janis will use the last one, defined as "
                f"'{ptypes[-1][0]}' from: {detected_tokens}"
            )
        else:
            Logger.info(
                f"There were multiple tools (and no workflows) detected in {basefilename}, "
                f"Janis will use '{ptypes[-1][0]}' (the last defined)"
            )
    return ptypes[-1][1]