def generate_salmon_rnaseq_dag(params: SequencingDagParameters) -> DAG:
    default_args = {
        "owner": "hubmap",
        "depends_on_past": False,
        "start_date": datetime(2019, 1, 1),
        "email": ["*****@*****.**"],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 1,
        "retry_delay": timedelta(minutes=1),
        "xcom_push": True,
        "queue": utils.map_queue_name("general"),
        "on_failure_callback": utils.create_dataset_state_error_callback(get_uuid_for_error),
    }

    with DAG(
        params.dag_id,
        schedule_interval=None,
        is_paused_upon_creation=False,
        default_args=default_args,
        max_active_runs=4,
        user_defined_macros={"tmp_dir_path": utils.get_tmp_dir_path},
    ) as dag:
        cwl_workflows = get_absolute_workflows(
            Path("salmon-rnaseq", "pipeline.cwl"),
            Path("portal-containers", "h5ad-to-arrow.cwl"),
            Path("portal-containers", "anndata-to-ui.cwl"),
        )

        def build_dataset_name(**kwargs):
            id_l = kwargs["dag_run"].conf["parent_submission_id"]
            inner_str = id_l if isinstance(id_l, str) else "_".join(id_l)
            return f"{dag.dag_id}__{inner_str}__{params.pipeline_name}"

        prepare_cwl1 = DummyOperator(task_id="prepare_cwl1")
        prepare_cwl2 = DummyOperator(task_id="prepare_cwl2")
        prepare_cwl3 = DummyOperator(task_id="prepare_cwl3")

        def build_cwltool_cmd1(**kwargs):
            ctx = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)

            data_dirs = ctx["parent_lz_path"]
            data_dirs = [data_dirs] if isinstance(data_dirs, str) else data_dirs
            print("data_dirs: ", data_dirs)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                "--relax-path-checks",
                "--debug",
                "--outdir",
                tmpdir / "cwl_out",
                "--parallel",
                cwl_workflows[0],
                "--assay",
                params.assay,
                "--threads",
                THREADS,
            ]
            for data_dir in data_dirs:
                command.append("--fastq_dir")
                command.append(data_dir)

            return join_quote_command_str(command)

        def build_cwltool_cmd2(**kwargs):
            ctx = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)

            data_dir = ctx["parent_lz_path"]
            print("data_dir: ", data_dir)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                cwl_workflows[1],
                "--input_dir",
                # This pipeline invocation runs in a 'hubmap_ui' subdirectory,
                # so use the parent directory as input
                "..",
            ]

            return join_quote_command_str(command)

        def build_cwltool_cmd3(**kwargs):
            ctx = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)

            data_dir = ctx["parent_lz_path"]
            print("data_dir: ", data_dir)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                cwl_workflows[2],
                "--input_dir",
                # This pipeline invocation runs in a 'hubmap_ui' subdirectory,
                # so use the parent directory as input
                "..",
            ]

            return join_quote_command_str(command)

        t_build_cmd1 = PythonOperator(
            task_id="build_cmd1",
            python_callable=build_cwltool_cmd1,
            provide_context=True,
        )

        t_build_cmd2 = PythonOperator(
            task_id="build_cmd2",
            python_callable=build_cwltool_cmd2,
            provide_context=True,
        )

        t_build_cmd3 = PythonOperator(
            task_id="build_cmd3",
            python_callable=build_cwltool_cmd3,
            provide_context=True,
        )

        t_pipeline_exec = BashOperator(
            task_id="pipeline_exec",
            bash_command=""" \
            tmp_dir={{tmp_dir_path(run_id)}} ; \
            {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \
            echo $?
            """,
        )

        t_convert_for_ui = BashOperator(
            task_id="convert_for_ui",
            bash_command=""" \
            tmp_dir={{tmp_dir_path(run_id)}} ; \
            ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
            cd "$tmp_dir"/cwl_out ; \
            mkdir -p hubmap_ui ; \
            cd hubmap_ui ; \
            {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \
            echo $?
            """,
        )

        t_convert_for_ui_2 = BashOperator(
            task_id="convert_for_ui_2",
            bash_command=""" \
            tmp_dir={{tmp_dir_path(run_id)}} ; \
            ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
            cd "$tmp_dir"/cwl_out ; \
            mkdir -p hubmap_ui ; \
            cd hubmap_ui ; \
            {{ti.xcom_pull(task_ids='build_cmd3')}} >> $tmp_dir/session.log 2>&1 ; \
            echo $?
            """,
        )

        t_maybe_keep_cwl1 = BranchPythonOperator(
            task_id="maybe_keep_cwl1",
            python_callable=utils.pythonop_maybe_keep,
            provide_context=True,
            op_kwargs={
                "next_op": "prepare_cwl2",
                "bail_op": "set_dataset_error",
                "test_op": "pipeline_exec",
            },
        )

        t_maybe_keep_cwl2 = BranchPythonOperator(
            task_id="maybe_keep_cwl2",
            python_callable=utils.pythonop_maybe_keep,
            provide_context=True,
            op_kwargs={
                "next_op": "prepare_cwl3",
                "bail_op": "set_dataset_error",
                "test_op": "convert_for_ui",
            },
        )

        t_maybe_keep_cwl3 = BranchPythonOperator(
            task_id="maybe_keep_cwl3",
            python_callable=utils.pythonop_maybe_keep,
            provide_context=True,
            op_kwargs={
                "next_op": "move_data",
                "bail_op": "set_dataset_error",
                "test_op": "convert_for_ui_2",
            },
        )

        t_send_create_dataset = PythonOperator(
            task_id="send_create_dataset",
            python_callable=utils.pythonop_send_create_dataset,
            provide_context=True,
            op_kwargs={
                "parent_dataset_uuid_callable": get_parent_dataset_uuid,
                "http_conn_id": "ingest_api_connection",
                "endpoint": "/datasets/derived",
                "dataset_name_callable": build_dataset_name,
                "dataset_types": [params.dataset_type],
            },
        )

        t_set_dataset_error = PythonOperator(
            task_id="set_dataset_error",
            python_callable=utils.pythonop_set_dataset_state,
            provide_context=True,
            trigger_rule="all_done",
            op_kwargs={
                "dataset_uuid_callable": get_dataset_uuid,
                "http_conn_id": "ingest_api_connection",
                "endpoint": "/datasets/status",
                "ds_state": "Error",
                "message": f"An error occurred in {params.pipeline_name}",
            },
        )

        send_status_msg = make_send_status_msg_function(
            dag_file=__file__,
            retcode_ops=[
                "pipeline_exec",
                "move_data",
                "convert_for_ui",
                "convert_for_ui_2",
            ],
            cwl_workflows=cwl_workflows,
        )

        t_send_status = PythonOperator(
            task_id="send_status_msg",
            python_callable=send_status_msg,
            provide_context=True,
        )

        t_log_info = LogInfoOperator(task_id="log_info")
        t_join = JoinOperator(task_id="join")
        t_create_tmpdir = CreateTmpDirOperator(task_id="create_tmpdir")
        t_cleanup_tmpdir = CleanupTmpDirOperator(task_id="cleanup_tmpdir")
        t_set_dataset_processing = SetDatasetProcessingOperator(
            task_id="set_dataset_processing")
        t_move_data = MoveDataOperator(task_id="move_data")

        (
            dag
            >> t_log_info
            >> t_create_tmpdir
            >> t_send_create_dataset
            >> t_set_dataset_processing
            >> prepare_cwl1
            >> t_build_cmd1
            >> t_pipeline_exec
            >> t_maybe_keep_cwl1
            >> prepare_cwl2
            >> t_build_cmd2
            >> t_convert_for_ui
            >> t_maybe_keep_cwl2
            >> prepare_cwl3
            >> t_build_cmd3
            >> t_convert_for_ui_2
            >> t_maybe_keep_cwl3
            >> t_move_data
            >> t_send_status
            >> t_join
        )
        t_maybe_keep_cwl1 >> t_set_dataset_error
        t_maybe_keep_cwl2 >> t_set_dataset_error
        t_maybe_keep_cwl3 >> t_set_dataset_error
        t_set_dataset_error >> t_join
        t_join >> t_cleanup_tmpdir

    return dag
           user_defined_macros={'tmp_dir_path': utils.get_tmp_dir_path}) as dag:

    def read_metadata_file(**kwargs):
        md_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                'rslt.yml')
        with open(md_fname, 'r') as f:
            scanned_md = yaml.safe_load(f)
        return scanned_md

    def get_blank_dataset_lz_path(**kwargs):
        return ''  # used to suppress sending of file metadata

    send_status_msg = make_send_status_msg_function(
        dag_file=__file__,
        retcode_ops=['run_md_extract', 'md_consistency_tests'],
        cwl_workflows=[],
        dataset_uuid_fun=get_dataset_uuid,
        dataset_lz_path_fun=get_blank_dataset_lz_path,
        metadata_fun=read_metadata_file)

    def wrapped_send_status_msg(**kwargs):
        if send_status_msg(**kwargs):
            scanned_md = read_metadata_file(**kwargs)  # Yes, it's getting re-read
            kwargs['ti'].xcom_push(
                key='collectiontype',
                value=(scanned_md['collectiontype']
                       if 'collectiontype' in scanned_md else None))
            if 'assay_type' in scanned_md:
                assay_type = scanned_md['assay_type']
            elif 'metadata' in scanned_md and 'assay_type' in scanned_md[
    # t_expand_symlinks = BashOperator(
    #     task_id='expand_symlinks',
    #     bash_command="""
    #     tmp_dir="{{tmp_dir_path(run_id)}}" ; \
    #     ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
    #     groupname="{{conf.as_dict()['connections']['OUTPUT_GROUP_NAME']}}" ; \
    #     cd "$ds_dir" ; \
    #     tar -xf symlinks.tar ; \
    #     echo $?
    #     """
    # )

    send_status_msg = make_send_status_msg_function(
        dag_file=__file__,
        retcode_ops=['pipeline_exec_cwl1', 'pipeline_exec_cwl2', 'move_data'],
        cwl_workflows=cwl_workflows,
    )

    t_send_status = PythonOperator(
        task_id='send_status_msg',
        python_callable=send_status_msg,
        provide_context=True,
    )

    t_log_info = LogInfoOperator(task_id='log_info')
    t_join = JoinOperator(task_id='join')
    t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir')
    t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir')
    t_set_dataset_processing = SetDatasetProcessingOperator(
        task_id='set_dataset_processing')
    t_move_data = MoveDataOperator(task_id='move_data')
bash_command=""" tmp_dir="{{tmp_dir_path(run_id)}}" ; \ ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ groupname="{{conf.as_dict()['connections']['OUTPUT_GROUP_NAME']}}" ; \ cd "$ds_dir" ; \ tar -xf symlinks.tar ; \ echo $? """) send_status_msg = make_send_status_msg_function( dag_file=__file__, retcode_ops=[ 'pipeline_exec_cwl_cytokit', 'pipeline_exec_cwl_sprm', 'pipeline_exec_cwl_create_vis_symlink_archive', 'pipeline_exec_cwl_ome_tiff_offsets', 'pipeline_exec_cwl_sprm_to_json', 'pipeline_exec_cwl_sprm_to_anndata', 'move_data', ], cwl_workflows=list(cwl_workflows.values()), ) t_send_status = PythonOperator(task_id='send_status_msg', python_callable=send_status_msg, provide_context=True) t_log_info = LogInfoOperator(task_id='log_info') t_join = JoinOperator(task_id='join') t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') t_set_dataset_processing = SetDatasetProcessingOperator(