def __init__(self):
    get_dir(DAGS_FOLDER)  # make sure the DAGs folder exists
    self.include_examples = False
    self.dag_template_with_tmp_folder = (
        "#!/usr/bin/env python3\n"
        "from cwl_airflow import CWLDAG, CWLJobDispatcher, CWLJobGatherer\n"
        "dag = CWLDAG(cwl_workflow='{0}', dag_id='{1}', default_args={{'tmp_folder':'{2}'}})\n"
        "dag.create()\n"
        "dag.add(CWLJobDispatcher(dag=dag), to='top')\n"
        "dag.add(CWLJobGatherer(dag=dag), to='bottom')"
    )
    self.wes_state_conversion = {
        "running": "RUNNING",
        "success": "COMPLETE",
        "failed": "EXECUTOR_ERROR"
    }
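# Illustrative sketch (not from the original module; the workflow path,
# dag_id, and tmp_folder values are hypothetical, and it assumes "os" and
# DAGS_FOLDER are available in this scope): rendering the template above
# into a new DAG file inside DAGS_FOLDER.
def example_render_dag_template(backend):
    dag_content = backend.dag_template_with_tmp_folder.format(
        "/data/workflows/my_workflow.cwl",  # {0} - cwl_workflow
        "my_workflow_dag",                  # {1} - dag_id
        "/tmp/cwl_tmp_folder"               # {2} - tmp_folder in default_args
    )
    with open(os.path.join(DAGS_FOLDER, "my_workflow_dag.py"), "w") as out_stream:
        out_stream.write(dag_content)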
def wes_collect_attachments(self, run_id):
    tempdir = tempfile.mkdtemp(
        dir=get_dir(
            path.abspath(
                conf_get("cwl", "tmp_folder", path.join(AIRFLOW_HOME, "cwl_tmp_folder"))
            )
        ),
        prefix="run_id_" + run_id + "_"
    )
    logging.debug(f"Save all attached files to {tempdir}")
    for k, ls in iterlists(connexion.request.files):
        logging.debug(f"Process attachment parameter {k}")
        if k == "workflow_attachment":
            for v in ls:
                try:
                    logging.debug(f"Process attached file {v}")
                    sp = v.filename.split("/")
                    fn = []
                    for p in sp:
                        if p not in ("", ".", ".."):
                            fn.append(secure_filename(p))
                    dest = path.join(tempdir, *fn)
                    if not path.isdir(path.dirname(dest)):
                        get_dir(path.dirname(dest))
                    logging.debug(f"Save {v.filename} to {dest}")
                    v.save(dest)
                except Exception as err:
                    raise ValueError(f"Failed to process attached file {v}, {err}")
    body = {}
    for k, ls in iterlists(connexion.request.form):
        logging.debug(f"Process form parameter {k}")
        for v in ls:
            try:
                if not v:
                    continue
                if k == "workflow_params":
                    job_file = path.join(tempdir, "job.json")
                    with open(job_file, "w") as f:
                        json.dump(json.loads(v), f, indent=4)
                    logging.debug(f"Save job file to {job_file}")
                    loader = Loader(load.jobloaderctx.copy())
                    job_order_object, _ = loader.resolve_ref(job_file, checklinks=False)
                    body[k] = job_order_object
                else:
                    body[k] = v
            except Exception as err:
                raise ValueError(f"Failed to process form parameter {k}, {v}, {err}")
    if "workflow_params" not in body or "workflow_url" not in body:
        raise ValueError("Missing 'workflow_params' or 'workflow_url' in submission")
    body["workflow_url"] = path.join(tempdir, secure_filename(body["workflow_url"]))
    return tempdir, body
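# Illustrative sketch of the client side (assumes the "requests" package is
# installed; the endpoint URL and file names are hypothetical): a multipart
# submission in the shape wes_collect_attachments() parses — one or more
# "workflow_attachment" file parts plus "workflow_params" and "workflow_url"
# form fields.
def example_submit_run():
    import json
    import requests
    with open("workflow.cwl", "rb") as wf_stream:
        response = requests.post(
            "http://127.0.0.1:8081/ga4gh/wes/v1/runs",
            files=[("workflow_attachment", ("workflow.cwl", wf_stream))],
            data={
                "workflow_params": json.dumps({"threads": 2}),  # dumped to job.json on the server
                "workflow_url": "workflow.cwl"  # joined with the run's tempdir on the server
            }
        )
    return response.json()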
def overwrite_deprecated_dag(dag_location, deprecated_dags_folder=None):
    """
    Loads DAG content from the "dag_location" file. Searches for the
    "dag.create()" command. If not found, we don't need to upgrade this
    DAG (it's either not from CWL-Airflow, or already in the new format).

    If "deprecated_dags_folder" is not None, copies the original DAG file
    there before upgrading. After copying the deprecated DAG to the
    "deprecated_dags_folder", updates ".airflowignore" with the DAG file
    basename to exclude it from Airflow parsing.

    The upgraded DAG will always include base64-encoded, gzip-compressed
    workflow content. In case "workflow_location" is a relative path, it
    will be resolved based on the dirname of "dag_location" (useful for
    tests only, because all our old DAGs always have an absolute path to
    the CWL file). The function doesn't backup or update the original
    CWL file.

    TODO: for more complicated DAG files that include "default_args",
    etc., this function should be extended accordingly.
    """

    with open(dag_location, "r+") as io_stream:  # open for both reading and writing
        dag_content = io_stream.read()
        if not re.search("dag\\.create\\(\\)", dag_content):  # do nothing if it wasn't an old-style DAG
            return
        workflow_location = get_absolute_path(  # resolve relative to dirname of "dag_location" (good for tests)
            re.search("(cwl_workflow\\s*=\\s*[\"|'])(.+?)([\"|'])", dag_content).group(2),
            os.path.dirname(dag_location)
        )
        dag_id = re.search("(dag_id\\s*=\\s*[\"|'])(.+?)([\"|'])", dag_content).group(2)
        compressed_workflow_content = get_compressed(
            fast_cwl_load(workflow_location)  # no "run" embedding or conversion to Workflow. If the DAG worked, the CWL should be ok too
        )
        if deprecated_dags_folder is not None:  # copy old DAG to the folder with deprecated DAGs, update ".airflowignore"
            get_dir(deprecated_dags_folder)  # try to create "deprecated_dags_folder" if it doesn't exist
            shutil.copy(dag_location, deprecated_dags_folder)  # copy DAG file
            ignore = os.path.join(deprecated_dags_folder, ".airflowignore")
            with open(ignore, "a") as output_stream:  # add deprecated DAG to ".airflowignore"
                output_stream.write(os.path.basename(dag_location) + "\n")
        io_stream.seek(0)  # rewind "dag_location" file to the beginning
        io_stream.write(DAG_TEMPLATE.format(compressed_workflow_content, dag_id))
        io_stream.truncate()  # remove leftover data if the new content is shorter than the original
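# Illustrative sketch (hypothetical folder locations): upgrading every DAG
# file in a folder, backing up old-style DAGs before they are rewritten.
def example_upgrade_old_dags(dags_folder, deprecated_dags_folder):
    for filename in os.listdir(dags_folder):
        if filename.endswith(".py") and filename != "__init__.py":
            overwrite_deprecated_dag(
                os.path.join(dags_folder, filename),
                deprecated_dags_folder  # pass None to skip the backup step
            )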
def __init__(self, simulated_reports_location=None):
    get_dir(DAGS_FOLDER)  # make sure the DAGs folder exists
    self.simulation_mode = False  # when set to True, will bypass execution of post_dag_runs and post_dags_dag_runs functions
    self.include_examples = False
    self.dag_template_with_tmp_folder = (
        "#!/usr/bin/env python3\n"
        "from cwl_airflow import CWLDAG, CWLJobDispatcher, CWLJobGatherer\n"
        "dag = CWLDAG(cwl_workflow='{0}', dag_id='{1}', default_args={{'tmp_folder':'{2}'}})\n"
        "dag.create()\n"
        "dag.add(CWLJobDispatcher(dag=dag), to='top')\n"
        "dag.add(CWLJobGatherer(dag=dag), to='bottom')"
    )
    self.wes_state_conversion = {
        "running": "RUNNING",
        "success": "COMPLETE",
        "failed": "EXECUTOR_ERROR"
    }
    self.validated_dags = {}  # maps DAG content md5 checksums to one of the statuses ["checking", "success", "error"]
    if simulated_reports_location is not None:
        try:
            self.suite_data = load_yaml(simulated_reports_location)
            self.simulation_mode = True
            logging.info(f"Running in simulation mode with reports from {simulated_reports_location}")
        except Exception as err:
            logging.error(f"Failed to load simulation data from {simulated_reports_location} \n {err}")
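# Illustrative sketch (the "CWLApiBackend" class name and the report location
# are assumptions, not confirmed by this excerpt): constructing the backend in
# simulation mode so that post_dag_runs and post_dags_dag_runs are bypassed.
def example_create_simulated_backend():
    return CWLApiBackend(simulated_reports_location="/tmp/simulated_reports.yaml")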
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.

    Updates tool locations to be absolute. Loads jobs and updates all
    input file locations to be absolute too (unless the --relative
    parameter was set). Adds "outputs_folder" to the job, as well as
    the "index" to indicate which test case was used. Adds run_id's
    as keys for easy access and proper test identification when
    receiving results.

    In case a test case fails to load, sets "finished" to True and
    writes the reason to "error".
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()  # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        logging.info(f"Read test case {i+1} to run {tool_location}")
        job_location = None
        job_data = {}
        if "job" in test_data:
            job_location = get_absolute_path(test_data["job"], suite_dir)
            try:
                if args.relative:  # skips relative path resolution as well as adding values from the workflow default inputs
                    job_data = load_yaml(job_location)
                else:
                    job_data = load_job(
                        workflow=tool_location,
                        job=job_location
                    )
            except Exception as ex:
                logging.error(f"Failed to load test case {i+1} to run {tool_location} with {job_location} \n {ex}")
                test_data.update({
                    "error": "Failed to load test case",
                    "finished": True
                })
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))
        test_data.update({
            "job": job_data,  # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i + 1,  # test case number, 1-based to correspond to --range
            "finished": test_data.get("finished", False)  # to indicate whether the test was finished or not
        })
        if "error" not in test_data:  # only report success if the test case actually loaded
            logging.info(f"Successfully loaded test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data  # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
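# Illustrative sketch (hypothetical paths; argparse.Namespace stands in for
# the parsed CLI arguments): loading the first two test cases from a suite.
def example_load_two_tests():
    from argparse import Namespace
    args = Namespace(
        suite="/data/tests/conformance_tests.yaml",
        range=[0, 1],            # 0-based indices into the suite file
        tmp="/tmp/cwl_test_runs",
        relative=False           # resolve job input locations to absolute paths
    )
    return load_test_suite(args)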
def get_temp_folders(task_id, job_data):
    """
    Creates a set of folders required for workflow execution.
    Uses "tmp_folder" from "job_data" as a parent folder.
    """

    step_tmp_folder = get_dir(os.path.join(job_data["tmp_folder"], task_id))
    step_cache_folder = get_dir(os.path.join(step_tmp_folder, task_id + "_step_cache"))
    step_outputs_folder = get_dir(os.path.join(step_tmp_folder, task_id + "_step_outputs"))
    step_report = os.path.join(step_tmp_folder, task_id + "_step_report.json")
    return step_tmp_folder, step_cache_folder, step_outputs_folder, step_report
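# Illustrative sketch (hypothetical task_id and tmp_folder) of the folder
# layout produced by get_temp_folders():
def example_step_folders():
    step_tmp, step_cache, step_outputs, report = get_temp_folders(
        task_id="bam_to_bedgraph",
        job_data={"tmp_folder": "/tmp/my_dag_run"}
    )
    # step_tmp     -> /tmp/my_dag_run/bam_to_bedgraph
    # step_cache   -> /tmp/my_dag_run/bam_to_bedgraph/bam_to_bedgraph_step_cache
    # step_outputs -> /tmp/my_dag_run/bam_to_bedgraph/bam_to_bedgraph_step_outputs
    # report       -> /tmp/my_dag_run/bam_to_bedgraph/bam_to_bedgraph_step_report.json
    return step_tmp, step_cache, step_outputs, report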
def execute(self, context):
    """
    Loads the job object from the context. Sets "tmp_folder" and
    "outputs_folder" if they have not been set before in the job.
    In case "tmp_folder" and/or "outputs_folder" were read from the
    job and are relative, resolves them based on the "tmp_folder"
    and/or "outputs_folder" from "cwl_args". Dumps the updated job
    as a json report file into the step's "tmp_folder". Writes the
    report file location to X-Com.
    """

    setup_cwl_logger(context["ti"])
    post_status(context)

    # for easy access
    dag_id = context["dag"].dag_id
    workflow = context["dag"].workflow
    run_id = context["run_id"].replace(":", "_").replace("+", "_")  # to make it dumpable by json
    cwl_args = context["dag"].default_args["cwl"]

    # Loads job from dag_run configuration. Sets defaults from "workflow". Fails on missing input files
    job_data = load_job(
        workflow=workflow,
        job=context["dag_run"].conf["job"],
        cwl_args=cwl_args
    )

    job_data["tmp_folder"] = get_dir(
        get_absolute_path(
            job_data.get(
                "tmp_folder",
                mkdtemp(dir=cwl_args["tmp_folder"], prefix=dag_id + "_" + run_id + "_")
            ),
            cwl_args["tmp_folder"]
        )
    )

    job_data["outputs_folder"] = get_dir(
        get_absolute_path(
            job_data.get(
                "outputs_folder",
                os.path.join(cwl_args["outputs_folder"], dag_id, run_id)
            ),
            cwl_args["outputs_folder"]
        )
    )

    _, _, _, step_report = get_temp_folders(task_id=self.task_id, job_data=job_data)
    dump_json(job_data, step_report)
    return step_report
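# Illustrative sketch (hypothetical input names and paths): the dag_run
# configuration shape that execute() above reads — the "job" entry may
# optionally pin "tmp_folder" and "outputs_folder", and relative values are
# resolved against the corresponding folders from cwl_args.
EXAMPLE_DAG_RUN_CONF = {
    "job": {
        "fastq_file": {"class": "File", "location": "/data/inputs/reads.fastq"},
        "threads": 4,
        "outputs_folder": "run_1_results"  # relative, resolved against cwl_args["outputs_folder"]
    }
}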
def copy_dags(airflow_home, source_folder=None):
    """
    Copies *.py files (DAGs) from "source_folder" (default
    ../../extensions/dags) to the DAGs folder, which is always
    {airflow_home}/dags.
    """

    if source_folder is None:
        source_folder = os.path.join(
            os.path.dirname(os.path.abspath(os.path.join(__file__, "../../"))),
            "extensions/dags",
        )
    target_folder = get_dir(os.path.join(airflow_home, "dags"))
    for root, dirs, files in os.walk(source_folder):
        for filename in files:
            if re.match(".*\\.py$", filename) and filename != "__init__.py":
                if not os.path.isfile(os.path.join(target_folder, filename)):
                    shutil.copy(os.path.join(root, filename), target_folder)
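# Illustrative sketch (hypothetical airflow_home): populating a fresh
# {airflow_home}/dags folder with the bundled DAGs. Note that files already
# present in the target folder are never overwritten.
def example_copy_bundled_dags():
    copy_dags(airflow_home="/home/user/airflow")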
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.

    Updates tool locations to be absolute; loads jobs and updates all
    input file locations to be absolute too. Adds "outputs_folder" to
    the job, as well as the "index" to indicate which test case was
    used. Adds run_id's as keys for easy access and proper test
    identification when receiving results.
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()  # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        if "job" in test_data:  # guard the lookup so test cases without a "job" don't raise KeyError
            job_location = get_absolute_path(test_data["job"], suite_dir)
            job_data = load_job(
                workflow=tool_location,
                job=job_location
            )
        else:
            job_location = None
            job_data = {}
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))
        test_data.update({
            "job": job_data,  # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i + 1,  # test case number, 1-based to correspond to --range
            "finished": False  # to indicate whether the test was finished or not
        })
        logging.info(f"Load test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data  # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
def get_default_cwl_args(preset_cwl_args=None):
    """
    Returns default arguments required by cwltool's functions with a
    few parameters added and overwritten (required by CWL-Airflow).
    Defaults can be preset through "preset_cwl_args" if provided. All
    new fields from "preset_cwl_args" will be added to the returned
    results.
    """

    preset_cwl_args = {} if preset_cwl_args is None else deepcopy(preset_cwl_args)

    # default arguments required by cwltool
    required_cwl_args = get_default_args()

    # update default arguments required by cwltool with those that were preset by the user
    required_cwl_args.update(preset_cwl_args)

    # update default arguments required by cwltool with those that might
    # be updated based on the higher priority of the airflow configuration
    # file. If the airflow configuration file doesn't include the
    # corresponding parameters, use those that were preset by the user, or defaults
    required_cwl_args.update({
        "tmp_folder": get_dir(
            conf_get("cwl", "tmp_folder", preset_cwl_args.get("tmp_folder", CWL_TMP_FOLDER))
        ),
        "outputs_folder": get_dir(  # for CWL-Airflow to store outputs if "outputs_folder" is not overwritten in the job
            conf_get("cwl", "outputs_folder", preset_cwl_args.get("outputs_folder", CWL_OUTPUTS_FOLDER))
        ),
        "inputs_folder": get_dir(  # for CWL-Airflow to resolve relative locations for input files if the job was loaded from a parsed object
            conf_get("cwl", "inputs_folder", preset_cwl_args.get("inputs_folder", CWL_INPUTS_FOLDER))
        ),
        "pickle_folder": get_dir(  # for CWL-Airflow to store pickled workflows
            conf_get("cwl", "pickle_folder", preset_cwl_args.get("pickle_folder", CWL_PICKLE_FOLDER))
        ),
        "use_container": conf_get(  # execute jobs in docker containers
            "cwl", "use_container", preset_cwl_args.get("use_container", CWL_USE_CONTAINER)
        ),
        "no_match_user": conf_get(  # disables passing the current uid to "docker run --user"
            "cwl", "no_match_user", preset_cwl_args.get("no_match_user", CWL_NO_MATCH_USER)
        ),
        "skip_schemas": conf_get(  # it looks like this doesn't influence anything in the latest cwltool
            "cwl", "skip_schemas", preset_cwl_args.get("skip_schemas", CWL_SKIP_SCHEMAS)
        ),
        "strict": conf_get("cwl", "strict", preset_cwl_args.get("strict", CWL_STRICT)),
        "quiet": conf_get("cwl", "quiet", preset_cwl_args.get("quiet", CWL_QUIET)),
        "rm_tmpdir": preset_cwl_args.get("rm_tmpdir", CWL_RM_TMPDIR),  # even if we can set it in "preset_cwl_args", it's better not to change it
        "move_outputs": preset_cwl_args.get("move_outputs", CWL_MOVE_OUTPUTS),  # even if we can set it in "preset_cwl_args", it's better not to change it
        "enable_dev": preset_cwl_args.get("enable_dev", CWL_ENABLE_DEV)  # fails to run without it when creating a workflow from a tool. TODO: Ask Peter?
    })

    return required_cwl_args
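# Illustrative sketch (hypothetical scratch folder): presetting a couple of
# defaults while keeping everything else. Values from the [cwl] section of
# the airflow configuration file, when present, still take priority over
# these presets.
def example_preset_cwl_args():
    return get_default_cwl_args(
        preset_cwl_args={
            "tmp_folder": "/scratch/cwl_tmp",  # used unless overridden in airflow.cfg
            "quiet": False
        }
    )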