Example #1
def generate_salmon_rnaseq_dag(params: SequencingDagParameters) -> DAG:
    default_args = {
        "owner": "hubmap",
        "depends_on_past": False,
        "start_date": datetime(2019, 1, 1),
        "email": ["*****@*****.**"],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 1,
        "retry_delay": timedelta(minutes=1),
        "xcom_push": True,
        "queue": utils.map_queue_name("general"),
        "on_failure_callback": utils.create_dataset_state_error_callback(get_uuid_for_error),
    }

    with DAG(
            params.dag_id,
            schedule_interval=None,
            is_paused_upon_creation=False,
            default_args=default_args,
            max_active_runs=4,
            user_defined_macros={"tmp_dir_path": utils.get_tmp_dir_path},
    ) as dag:

        cwl_workflows = get_absolute_workflows(
            Path("salmon-rnaseq", "pipeline.cwl"),
            Path("portal-containers", "h5ad-to-arrow.cwl"),
            Path("portal-containers", "anndata-to-ui.cwl"),
        )

        def build_dataset_name(**kwargs):
            id_l = kwargs["dag_run"].conf["parent_submission_id"]
            inner_str = id_l if isinstance(id_l, str) else "_".join(id_l)
            return f"{dag.dag_id}__{inner_str}__{params.pipeline_name}"

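        # The prepare_cwl* tasks are DummyOperator placeholders that mark where
        # each CWL stage begins in the task graph.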
        prepare_cwl1 = DummyOperator(task_id="prepare_cwl1")

        prepare_cwl2 = DummyOperator(task_id="prepare_cwl2")

        prepare_cwl3 = DummyOperator(task_id="prepare_cwl3")

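        # Assemble the cwltool command line for the main salmon-rnaseq pipeline,
        # reading the landing-zone path(s) from the dag_run configuration.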
        def build_cwltool_cmd1(**kwargs):
            ctx = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)

            data_dirs = ctx["parent_lz_path"]
            data_dirs = [data_dirs] if isinstance(data_dirs, str) else data_dirs
            print("data_dirs: ", data_dirs)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                "--relax-path-checks",
                "--debug",
                "--outdir",
                tmpdir / "cwl_out",
                "--parallel",
                cwl_workflows[0],
                "--assay",
                params.assay,
                "--threads",
                THREADS,
            ]
            for data_dir in data_dirs:
                command.append("--fastq_dir")
                command.append(data_dir)

            return join_quote_command_str(command)

        def build_cwltool_cmd2(**kwargs):
            ctx = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)
            data_dir = ctx["parent_lz_path"]
            print("data_dir: ", data_dir)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                cwl_workflows[1],
                "--input_dir",
                # This pipeline invocation runs in a 'hubmap_ui' subdirectory,
                # so use the parent directory as input
                "..",
            ]

            return join_quote_command_str(command)

        def build_cwltool_cmd3(**kwargs):
            ctx = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)
            data_dir = ctx["parent_lz_path"]
            print("data_dir: ", data_dir)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                cwl_workflows[2],
                "--input_dir",
                # This pipeline invocation runs in a 'hubmap_ui' subdirectory,
                # so use the parent directory as input
                "..",
            ]

            return join_quote_command_str(command)

        t_build_cmd1 = PythonOperator(
            task_id="build_cmd1",
            python_callable=build_cwltool_cmd1,
            provide_context=True,
        )

        t_build_cmd2 = PythonOperator(
            task_id="build_cmd2",
            python_callable=build_cwltool_cmd2,
            provide_context=True,
        )

        t_build_cmd3 = PythonOperator(
            task_id="build_cmd3",
            python_callable=build_cwltool_cmd3,
            provide_context=True,
        )

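        # Run the command built by build_cmd1 (pulled from XCom), capture its
        # output in session.log, and echo the shell return code so the branch
        # operator below can inspect it.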
        t_pipeline_exec = BashOperator(
            task_id="pipeline_exec",
            bash_command=""" \
            tmp_dir={{tmp_dir_path(run_id)}} ; \
            {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \
            echo $?
            """,
        )

        t_convert_for_ui = BashOperator(
            task_id="convert_for_ui",
            bash_command=""" \
            tmp_dir={{tmp_dir_path(run_id)}} ; \
            ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
            cd "$tmp_dir"/cwl_out ; \
            mkdir -p hubmap_ui ; \
            cd hubmap_ui ; \
            {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \
            echo $?
            """,
        )

        t_convert_for_ui_2 = BashOperator(
            task_id="convert_for_ui_2",
            bash_command=""" \
            tmp_dir={{tmp_dir_path(run_id)}} ; \
            ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
            cd "$tmp_dir"/cwl_out ; \
            mkdir -p hubmap_ui ; \
            cd hubmap_ui ; \
            {{ti.xcom_pull(task_ids='build_cmd3')}} >> $tmp_dir/session.log 2>&1 ; \
            echo $?
            """,
        )

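        # Each maybe_keep_cwl* task branches to the next stage when the tested
        # task succeeded, or to set_dataset_error when it failed.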
        t_maybe_keep_cwl1 = BranchPythonOperator(
            task_id="maybe_keep_cwl1",
            python_callable=utils.pythonop_maybe_keep,
            provide_context=True,
            op_kwargs={
                "next_op": "prepare_cwl2",
                "bail_op": "set_dataset_error",
                "test_op": "pipeline_exec",
            },
        )

        t_maybe_keep_cwl2 = BranchPythonOperator(
            task_id="maybe_keep_cwl2",
            python_callable=utils.pythonop_maybe_keep,
            provide_context=True,
            op_kwargs={
                "next_op": "prepare_cwl3",
                "bail_op": "set_dataset_error",
                "test_op": "convert_for_ui",
            },
        )

        t_maybe_keep_cwl3 = BranchPythonOperator(
            task_id="maybe_keep_cwl3",
            python_callable=utils.pythonop_maybe_keep,
            provide_context=True,
            op_kwargs={
                "next_op": "move_data",
                "bail_op": "set_dataset_error",
                "test_op": "convert_for_ui_2",
            },
        )

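        # Create the derived dataset record via the ingest API; its XCom result
        # is pulled as ds_dir by the convert_for_ui commands above.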
        t_send_create_dataset = PythonOperator(
            task_id="send_create_dataset",
            python_callable=utils.pythonop_send_create_dataset,
            provide_context=True,
            op_kwargs={
                "parent_dataset_uuid_callable": get_parent_dataset_uuid,
                "http_conn_id": "ingest_api_connection",
                "endpoint": "/datasets/derived",
                "dataset_name_callable": build_dataset_name,
                "dataset_types": [params.dataset_type],
            },
        )

        t_set_dataset_error = PythonOperator(
            task_id="set_dataset_error",
            python_callable=utils.pythonop_set_dataset_state,
            provide_context=True,
            trigger_rule="all_done",
            op_kwargs={
                "dataset_uuid_callable": get_dataset_uuid,
                "http_conn_id": "ingest_api_connection",
                "endpoint": "/datasets/status",
                "ds_state": "Error",
                "message": f"An error occurred in {params.pipeline_name}",
            },
        )

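        # Build the status-reporting callable from the return codes of the
        # tasks listed in retcode_ops.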
        send_status_msg = make_send_status_msg_function(
            dag_file=__file__,
            retcode_ops=[
                "pipeline_exec", "move_data", "convert_for_ui",
                "convert_for_ui_2"
            ],
            cwl_workflows=cwl_workflows,
        )
        t_send_status = PythonOperator(
            task_id="send_status_msg",
            python_callable=send_status_msg,
            provide_context=True,
        )

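        # Housekeeping operators: run logging, temp-dir creation and cleanup,
        # a join point for the success and error paths, and the final data move.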
        t_log_info = LogInfoOperator(task_id="log_info")
        t_join = JoinOperator(task_id="join")
        t_create_tmpdir = CreateTmpDirOperator(task_id="create_tmpdir")
        t_cleanup_tmpdir = CleanupTmpDirOperator(task_id="cleanup_tmpdir")
        t_set_dataset_processing = SetDatasetProcessingOperator(
            task_id="set_dataset_processing")
        t_move_data = MoveDataOperator(task_id="move_data")

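        # Happy-path ordering of the tasks; the error branches into
        # set_dataset_error are wired separately below.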
        (dag >> t_log_info >> t_create_tmpdir >> t_send_create_dataset >>
         t_set_dataset_processing >> prepare_cwl1 >> t_build_cmd1 >>
         t_pipeline_exec >> t_maybe_keep_cwl1 >> prepare_cwl2 >> t_build_cmd2 >>
         t_convert_for_ui >> t_maybe_keep_cwl2 >> prepare_cwl3 >> t_build_cmd3 >>
         t_convert_for_ui_2 >> t_maybe_keep_cwl3 >> t_move_data >>
         t_send_status >> t_join)
        t_maybe_keep_cwl1 >> t_set_dataset_error
        t_maybe_keep_cwl2 >> t_set_dataset_error
        t_maybe_keep_cwl3 >> t_set_dataset_error
        t_set_dataset_error >> t_join
        t_join >> t_cleanup_tmpdir

    return dag
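
For orientation, a minimal sketch of how this factory might be instantiated at module level. The field names match what the function reads from params above (dag_id, pipeline_name, assay, dataset_type); the concrete values, the variable name, and the assumption that SequencingDagParameters accepts these fields as keyword arguments are illustrative, not taken from the snippet.

# Hypothetical usage sketch -- the argument values below are illustrative only.
salmon_rnaseq_bulk_dag = generate_salmon_rnaseq_dag(
    SequencingDagParameters(
        dag_id="salmon_rnaseq_bulk",          # assumed DAG id
        pipeline_name="salmon-rnaseq-bulk",   # used in dataset names and error messages
        assay="bulk",                         # forwarded to the pipeline as --assay
        dataset_type="salmon_rnaseq_bulk",    # sent when creating the derived dataset
    )
)
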
Example #2

         user_defined_macros={'tmp_dir_path': utils.get_tmp_dir_path}) as dag:

    def read_metadata_file(**kwargs):
        md_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                'rslt.yml')
        with open(md_fname, 'r') as f:
            scanned_md = yaml.safe_load(f)
        return scanned_md

    def get_blank_dataset_lz_path(**kwargs):
        return ''  # used to suppress sending of file metadata

    send_status_msg = make_send_status_msg_function(
        dag_file=__file__,
        retcode_ops=['run_md_extract', 'md_consistency_tests'],
        cwl_workflows=[],
        dataset_uuid_fun=get_dataset_uuid,
        dataset_lz_path_fun=get_blank_dataset_lz_path,
        metadata_fun=read_metadata_file)

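    # If send_status_msg reports success, re-read the metadata file and push
    # the scanned 'collectiontype' to XCom; the assay type is extracted next.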
    def wrapped_send_status_msg(**kwargs):
        if send_status_msg(**kwargs):
            scanned_md = read_metadata_file(**kwargs)  # Yes, it's getting re-read
            kwargs['ti'].xcom_push(
                key='collectiontype',
                value=(scanned_md['collectiontype']
                       if 'collectiontype' in scanned_md else None))
            if 'assay_type' in scanned_md:
                assay_type = scanned_md['assay_type']
            elif 'metadata' in scanned_md and 'assay_type' in scanned_md[
Example #3
    # t_expand_symlinks = BashOperator(
    #     task_id='expand_symlinks',
    #     bash_command="""
    #     tmp_dir="{{tmp_dir_path(run_id)}}" ; \
    #     ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
    #     groupname="{{conf.as_dict()['connections']['OUTPUT_GROUP_NAME']}}" ; \
    #     cd "$ds_dir" ; \
    #     tar -xf symlinks.tar ; \
    #     echo $?
    #     """
    #     )

    send_status_msg = make_send_status_msg_function(
        dag_file=__file__,
        retcode_ops=['pipeline_exec_cwl1', 'pipeline_exec_cwl2', 'move_data'],
        cwl_workflows=cwl_workflows,
    )
    t_send_status = PythonOperator(
        task_id='send_status_msg',
        python_callable=send_status_msg,
        provide_context=True,
    )

    t_log_info = LogInfoOperator(task_id='log_info')
    t_join = JoinOperator(task_id='join')
    t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir')
    t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir')
    t_set_dataset_processing = SetDatasetProcessingOperator(
        task_id='set_dataset_processing')
    t_move_data = MoveDataOperator(task_id='move_data')
Example #4

    t_expand_symlinks = BashOperator(task_id='expand_symlinks',
                                     bash_command="""
        tmp_dir="{{tmp_dir_path(run_id)}}" ; \
        ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \
        groupname="{{conf.as_dict()['connections']['OUTPUT_GROUP_NAME']}}" ; \
        cd "$ds_dir" ; \
        tar -xf symlinks.tar ; \
        echo $?
        """)

    send_status_msg = make_send_status_msg_function(
        dag_file=__file__,
        retcode_ops=[
            'pipeline_exec_cwl_cytokit',
            'pipeline_exec_cwl_sprm',
            'pipeline_exec_cwl_create_vis_symlink_archive',
            'pipeline_exec_cwl_ome_tiff_offsets',
            'pipeline_exec_cwl_sprm_to_json',
            'pipeline_exec_cwl_sprm_to_anndata',
            'move_data',
        ],
        cwl_workflows=list(cwl_workflows.values()),
    )
    t_send_status = PythonOperator(task_id='send_status_msg',
                                   python_callable=send_status_msg,
                                   provide_context=True)

    t_log_info = LogInfoOperator(task_id='log_info')
    t_join = JoinOperator(task_id='join')
    t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir')
    t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir')
    t_set_dataset_processing = SetDatasetProcessingOperator(