Example #1
0
    def build_cwltool_cmd2(**kwargs):
        """Build the quoted cwltool command string for pipeline stage 2."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']

        # tmpdir is temp directory in /hubmap-tmp
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)

        # get data directory
        parent_data_dir = conf['parent_lz_path']
        print('parent_data_dir: ', parent_data_dir)
        # This stage reads input from stage 1
        data_dir = os.path.join(tmpdir, 'cwl_out', 'ometiff-pyramids')
        print('data_dir: ', data_dir)

        # Assemble the call to the CWL workflow
        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows[1])
        command.extend(['--input_directory', './ometiff-pyramids'])

        return join_quote_command_str(command)
Example #2
0
    def build_cwltool_cmd2(**kwargs):
        """Build the shell-quoted cwltool invocation for stage 2."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        data_dir = conf['parent_lz_path']
        print('data_dir: ', data_dir)
        # Walk upward from the installed cwltool package until a 'lib'
        # component is found; its parent directory contains 'bin'.
        cwltool_dir = os.path.dirname(cwltool.__file__)
        while cwltool_dir:
            cwltool_dir, tail = os.path.split(cwltool_dir)
            if tail == 'lib':
                break
        assert cwltool_dir, 'Failed to find cwltool bin directory'
        cwltool_dir = os.path.join(cwltool_dir, 'bin')

        # Prepend the cwltool bin directory to PATH for the subprocess
        command = [
            'env',
            'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
            'cwltool',
            os.fspath(PIPELINE_BASE_DIR / cwl_workflow2),
            '--input_dir',
            '.',
        ]

        command_str = ' '.join(map(shlex.quote, command))
        print('final command_str: %s' % command_str)
        return command_str
Example #3
0
    def build_cwltool_cmd1(**kwargs):
        """Build the shell-quoted cwltool command for pipeline stage 1.

        Reads the parent dataset path from the triggering DAG run's conf
        and returns a single quoted command string for a bash task.
        """
        ctx = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        data_dir = ctx['parent_lz_path']
        print('data_dir: ', data_dir)
        # Directory containing the cwltool executable; prepended to PATH below
        cwltool_dir = get_cwltool_bin_path()

        command = [
            'env',
            'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
            'cwltool',
            os.fspath(PIPELINE_BASE_DIR / cwl_workflow1),
            '--data_dir',
            data_dir,
        ]

        command_str = ' '.join(shlex.quote(piece) for piece in command)
        print('final command_str: %s' % command_str)
        return command_str
Example #4
0
        def build_cwltool_cmd1(**kwargs):
            """Build the quoted cwltool command for stage 1 of the pipeline."""
            conf = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)

            # conf may carry a single path or a list of paths
            data_dirs = conf["parent_lz_path"]
            if isinstance(data_dirs, str):
                data_dirs = [data_dirs]
            print("data_dirs: ", data_dirs)

            command = [
                *get_cwltool_base_cmd(tmpdir),
                "--relax-path-checks",
                "--debug",
                "--outdir",
                tmpdir / "cwl_out",
                "--parallel",
                cwl_workflows[0],
                "--assay",
                params.assay,
                "--threads",
                THREADS,
            ]
            # One --fastq_dir flag per input directory
            for fastq_dir in data_dirs:
                command.extend(["--fastq_dir", fastq_dir])

            return join_quote_command_str(command)
Example #5
0
        def build_cwltool_cmd2(**kwargs):
            """Build the quoted cwltool command for pipeline stage 2."""
            conf = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)
            data_dir = conf["parent_lz_path"]
            print("data_dir: ", data_dir)

            # Stage 2 runs inside tmpdir, so its input is the current directory
            command = list(get_cwltool_base_cmd(tmpdir))
            command.extend([cwl_workflows[1], "--input_dir", "."])

            return join_quote_command_str(command)
    def build_cwltool_cwl_cytokit(**kwargs):
        """Build the quoted cwltool command that runs the Cytokit workflow."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        data_dir = conf['parent_lz_path']
        print('data_dir: ', data_dir)

        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows['cytokit'])
        command.append('--gpus=0,1')  # pin the workflow to GPUs 0 and 1
        command.extend(['--data_dir', data_dir])

        return join_quote_command_str(command)
    def build_cwltool_cmd_sprm_to_anndata(**kwargs):
        """Build the quoted cwltool command for the sprm-to-anndata step."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        parent_data_dir = conf['parent_lz_path']
        print('parent_data_dir: ', parent_data_dir)
        # This stage reads input from stage 1
        data_dir = tmpdir / 'cwl_out'
        print('data_dir: ', data_dir)

        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows['sprm_to_anndata'])
        command.extend(['--input_dir', data_dir / 'sprm_outputs'])

        return join_quote_command_str(command)
    def build_cwltool_cmd_ome_tiff_offsets(**kwargs):
        """Build the quoted cwltool command for the OME-TIFF offsets step."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        parent_data_dir = conf['parent_lz_path']
        print('parent_data_dir: ', parent_data_dir)
        # Input comes from an earlier stage's output under tmpdir
        data_dir = tmpdir / 'cwl_out'
        print('data_dir: ', data_dir)

        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows['ome_tiff_offsets'])
        command.extend(['--input_dir', data_dir / 'stitched/expressions'])

        return join_quote_command_str(command)
Example #9
0
        def build_cwltool_cmd3(**kwargs):
            """Build the quoted cwltool command for pipeline stage 3."""
            conf = kwargs["dag_run"].conf
            run_id = kwargs["run_id"]
            tmpdir = utils.get_tmp_dir_path(run_id)
            print("tmpdir: ", tmpdir)
            data_dir = conf["parent_lz_path"]
            print("data_dir: ", data_dir)

            command = list(get_cwltool_base_cmd(tmpdir))
            command.append(cwl_workflows[2])
            # This pipeline invocation runs in a 'hubmap_ui' subdirectory,
            # so use the parent directory as input
            command.extend(["--input_dir", ".."])

            return join_quote_command_str(command)
Example #10
0
    def build_cwltool_cmd1(**kwargs):
        """Build the quoted cwltool command for pipeline stage 1."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        data_dir = conf['parent_lz_path']

        command = list(get_cwltool_base_cmd(tmpdir))
        command.extend(['--outdir', tmpdir / 'cwl_out', '--parallel'])
        command.append(cwl_workflows[0])
        command.extend(['--fastq_dir', data_dir, '--threads', THREADS])

        return join_quote_command_str(command)
Example #11
0
    def build_cwltool_cmd1(**kwargs):
        """Build the shell-quoted cwltool command for pipeline stage 1.

        Locates the cwltool executable relative to the installed cwltool
        package, prepends its directory to PATH, and returns the full
        invocation as a single quoted command string.
        """
        ctx = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        data_dir = ctx['parent_lz_path']
        print('data_dir: ', data_dir)
        # Walk upward from the cwltool package until a 'lib' path component
        # is found; its parent directory contains the matching 'bin'.
        cwltool_dir = os.path.dirname(cwltool.__file__)
        while cwltool_dir:
            part1, part2 = os.path.split(cwltool_dir)
            cwltool_dir = part1
            if part2 == 'lib':
                break
        assert cwltool_dir, 'Failed to find cwltool bin directory'
        cwltool_dir = os.path.join(cwltool_dir, 'bin')

        command = [
            'env',
            'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
            'cwltool',
            '--debug',
            '--outdir',
            os.path.join(tmpdir, 'cwl_out'),
            '--parallel',
            os.fspath(PIPELINE_BASE_DIR / cwl_workflow1),
            '--fastq_dir',
            data_dir,
            '--threads',
            str(THREADS),
        ]

        command_str = ' '.join(shlex.quote(piece) for piece in command)
        print('final command_str: %s' % command_str)
        return command_str
    def build_cwltool_cmd_create_vis_symlink_archive(**kwargs):
        """Build the quoted cwltool command for the symlink-archive step."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        parent_data_dir = conf['parent_lz_path']
        print('parent_data_dir: ', parent_data_dir)
        # Input comes from an earlier stage's output under tmpdir
        data_dir = tmpdir / 'cwl_out'
        print('data_dir: ', data_dir)

        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows['create_vis_symlink_archive'])
        command.extend(['--ometiff_dir', data_dir / 'stitched'])
        command.extend(['--sprm_output', data_dir / 'sprm_outputs'])

        return join_quote_command_str(command)
 def flex_maybe_spawn(**kwargs):
     """
     This is a generator which returns appropriate DagRunOrders

     When both metadata-extraction tasks exited with code 0, yields a
     (dag_id, DagRunOrder) pair for every downstream workflow registered
     for this dataset's collectiontype/assay_type; otherwise yields
     nothing.
     """
     print('kwargs:')
     pprint(kwargs)
     print('dag_run conf:')
     ctx = kwargs['dag_run'].conf
     pprint(ctx)
     md_extract_retcode = int(
         kwargs['ti'].xcom_pull(task_ids="run_md_extract"))
     md_consistency_retcode = int(
         kwargs['ti'].xcom_pull(task_ids="md_consistency_tests"))
     if md_extract_retcode == 0 and md_consistency_retcode == 0:
         collectiontype = kwargs['ti'].xcom_pull(key='collectiontype',
                                                 task_ids="send_status_msg")
         assay_type = kwargs['ti'].xcom_pull(key='assay_type',
                                             task_ids="send_status_msg")
         print('collectiontype: <{}>, assay_type: <{}>'.format(
             collectiontype, assay_type))
         md_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                 'rslt.yml')
         with open(md_fname, 'r') as f:
             md = yaml.safe_load(f)
         # NOTE: an earlier version first copied the whole dag_run conf into
         # payload and immediately discarded it; only this dict is used.
         payload = {
             'ingest_id': ctx['run_id'],
             'crypt_auth_tok': ctx['crypt_auth_tok'],
             'parent_lz_path': ctx['lz_path'],
             'parent_submission_id': ctx['submission_id'],
             'metadata': md,
             'dag_provenance_list': utils.get_git_provenance_list(__file__)
         }
         for next_dag in utils.downstream_workflow_iter(
                 collectiontype, assay_type):
             yield next_dag, DagRunOrder(payload=payload)
     else:
         return None
    def build_cwltool_cmd_sprm(**kwargs):
        """Build the quoted cwltool command for the SPRM analysis step."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        parent_data_dir = conf['parent_lz_path']
        print('parent_data_dir: ', parent_data_dir)
        # Input comes from an earlier stage's output under tmpdir
        data_dir = tmpdir / 'cwl_out'
        print('data_dir: ', data_dir)

        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows['sprm'])
        command.append('--enable_manhole')
        command.extend(['--image_dir', data_dir / 'stitched/expressions'])
        command.extend(['--mask_dir', data_dir / 'stitched/mask'])

        return join_quote_command_str(command)
    def build_cwltool_cwl_ome_tiff_pyramid(**kwargs):
        """Build the quoted cwltool command for the OME-TIFF pyramid step."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']

        # tmpdir is temp directory in /hubmap-tmp
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)

        # data directory is the stitched images, which are found in tmpdir
        data_dir = conf['parent_lz_path']
        print('data_dir: ', data_dir)

        # Assemble the call to the CWL workflow
        command = list(get_cwltool_base_cmd(tmpdir))
        command.append("--relax-path-checks")
        command.append(cwl_workflows['ome_tiff_pyramid'])
        command.extend(['--ometiff_directory', '.'])
        return join_quote_command_str(command)
Example #16
0
    def build_cwltool_cmd1(**kwargs):
        """Build the quoted cwltool command for pipeline stage 1."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']

        # tmpdir is temp directory in /hubmap-tmp
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)

        # data directory is input directory in /hubmap-data
        data_dir = conf['parent_lz_path']
        print('data_dir: ', data_dir)

        # Assemble the call to the CWL workflow
        command = list(get_cwltool_base_cmd(tmpdir))
        command.append(cwl_workflows[0])
        command.extend(['--ometiff_directory', data_dir])

        return join_quote_command_str(command)
    def build_cwltool_cmd1(**kwargs):
        """Build the quoted cwltool command for pipeline stage 1."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)

        # conf may carry a single path or a list of paths
        data_dirs = conf['parent_lz_path']
        if isinstance(data_dirs, str):
            data_dirs = [data_dirs]

        command = list(get_cwltool_base_cmd(tmpdir))
        command.extend(['--outdir', tmpdir / 'cwl_out', '--parallel'])
        command.append(cwl_workflows[0])
        command.extend(['--threads', THREADS])
        # One --sequence_directory flag per input directory
        for seq_dir in data_dirs:
            command.extend(['--sequence_directory', seq_dir])

        return join_quote_command_str(command)
Example #18
0
    def build_cwltool_cmd3(**kwargs):
        """Build the shell-quoted cwltool command for pipeline stage 3."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        parent_data_dir = conf['parent_lz_path']
        print('parent_data_dir: ', parent_data_dir)
        # This stage reads input from stage 1
        data_dir = os.path.join(tmpdir, 'cwl_out')
        print('data_dir: ', data_dir)
        cwltool_dir = get_cwltool_bin_path()

        # Prepend the cwltool bin directory to PATH for the subprocess
        command = [
            'env',
            'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
            'cwltool',
            os.fspath(PIPELINE_BASE_DIR / cwl_workflow3),
            '--input_dir',
            os.path.join(data_dir, 'sprm_outputs'),
        ]

        command_str = ' '.join(map(shlex.quote, command))
        print('final command_str: %s' % command_str)
        return command_str
Example #19
0
    def build_cwltool_cmd1(**kwargs):
        """Build a ';'-joined shell command string that sleeps, changes into
        the dataset directory, creates the cwl_out subdirectory, and copies
        the requested files there."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        tmp_subdir = tmpdir / 'cwl_out'
        data_dir = conf['parent_lz_path']

        try:
            delay_sec = int(conf['metadata']['delay_sec'])
        except ValueError:
            print("Could not parse delay_sec "
                  "{} ; defaulting to 30 sec".format(
                      conf['metadata']['delay_sec']))
            delay_sec = 30
        for fname in conf['metadata']['files_to_copy']:
            print(fname)

        commands = [[f'tmp_dir={tmpdir}']]
        commands.append(['sleep', delay_sec])
        commands.append(['cd', data_dir])
        commands.append(['mkdir', '-p', tmp_subdir])

        files_to_copy = conf['metadata']['files_to_copy']
        if files_to_copy:
            commands.append(['cp', *files_to_copy, tmp_subdir])

        print('command list:')
        pprint(commands)

        command_str = ' ; '.join(join_quote_command_str(cmd)
                                 for cmd in commands)
        print('overall command_str:', command_str)
        return command_str
Example #20
0
    def build_cwltool_cmd1(**kwargs):
        """Build a shell command string that sleeps, changes into the dataset
        directory, creates the cwl_out subdirectory, and copies the requested
        files there."""
        conf = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        tmp_subdir = os.path.join(tmpdir, 'cwl_out')
        print('tmp_subdir: ', tmp_subdir)
        data_dir = conf['parent_lz_path']
        print('data_dir: ', data_dir)

        try:
            delay_sec = int(conf['metadata']['delay_sec'])
        except ValueError:
            print("Could not parse delay_sec "
                  "{} ; defaulting to 30 sec".format(
                      conf['metadata']['delay_sec']))
            delay_sec = 30
        for fname in conf['metadata']['files_to_copy']:
            print(fname)

        command = ['sleep', '{}'.format(delay_sec), ';']
        command += ['cd', data_dir, ';']
        command += ['mkdir', '-p', '{}'.format(tmp_subdir), ';']

        files_to_copy = conf['metadata']['files_to_copy']
        if files_to_copy:
            command.append('cp')
            command += files_to_copy
            command.append(tmp_subdir)

        print('command list: ', command)
        # Quote every piece except the literal ';' separators
        quoted = [piece if piece == ';' else shlex.quote(piece)
                  for piece in command]
        command_str = 'tmp_dir="{}" ; '.format(tmpdir) + ' '.join(quoted)
        print('final command_str: %s' % command_str)
        return command_str
    def send_status_msg(**kwargs):
        """Report the dataset's final status to the ingest API.

        Pulls the return codes of the metadata-extraction tasks from XCom,
        builds a 'QA' payload (with the scanned metadata) on success or an
        'Invalid'/'Error' payload on failure, and PUTs it to
        /datasets/status. Also pushes 'collectiontype' and 'assay_type' to
        XCom so downstream tasks can pick follow-on workflows.
        """
        ctx = kwargs['dag_run'].conf
        retcode_ops = ['run_md_extract', 'md_consistency_tests']
        print('raw: ',
              [kwargs['ti'].xcom_pull(task_ids=op) for op in retcode_ops])
        retcodes = [
            int(kwargs['ti'].xcom_pull(task_ids=op)) for op in retcode_ops
        ]
        retcode_dct = {k: v for k, v in zip(retcode_ops, retcodes)}
        print('retcodes: ', retcode_dct)
        # Overall success requires every upstream task to have exited 0
        success = all([rc == 0 for rc in retcodes])
        ds_dir = ctx['lz_path']
        http_conn_id = 'ingest_api_connection'
        endpoint = '/datasets/status'
        method = 'PUT'
        headers = {
            'authorization':
            'Bearer ' + utils.decrypt_tok(ctx['crypt_auth_tok'].encode()),
            'content-type':
            'application/json'
        }
        # NOTE(review): this prints the decrypted bearer token to the task
        # log — consider redacting, as a sibling implementation does.
        print('headers:')
        pprint(headers)
        extra_options = []

        http = HttpHook(method, http_conn_id=http_conn_id)

        if success:
            # Load the metadata produced by the extraction step
            md_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                    'rslt.yml')
            with open(md_fname, 'r') as f:
                scanned_md = yaml.safe_load(f)
            dag_prv = utils.get_git_provenance_list([__file__])
            md = {'dag_provenance_list': dag_prv, 'metadata': scanned_md}
            # Inclusion of files information in this message is getting disabled due to size
            #md.update(utils.get_file_metadata_dict(ds_dir,
            #                                       utils.get_tmp_dir_path(kwargs['run_id'])))
            try:
                # Validate before sending; fall back to 'Error' status below
                assert_json_matches_schema(md, 'dataset_metadata_schema.yml')
                data = {
                    'dataset_id': ctx['submission_id'],
                    'status': 'QA',
                    'message': 'the process ran',
                    'metadata': md
                }
            except AssertionError as e:
                print('invalid metadata follows:')
                pprint(md)
                data = {
                    'dataset_id': ctx['submission_id'],
                    'status': 'Error',
                    'message':
                    'internal error; schema violation: {}'.format(e),
                    'metadata': {}
                }
            # Expose typing info so downstream DAG selection can run
            kwargs['ti'].xcom_push(
                key='collectiontype',
                value=(scanned_md['collectiontype']
                       if 'collectiontype' in scanned_md else None))
            kwargs['ti'].xcom_push(
                key='assay_type',
                value=(scanned_md['assay_type']
                       if 'assay_type' in scanned_md else None))
        else:
            # Find the first failing task and extract its error text
            for op in retcode_ops:
                if retcode_dct[op]:
                    if op == 'run_md_extract':
                        # Extraction failures leave their details in the log
                        log_fname = os.path.join(
                            utils.get_tmp_dir_path(kwargs['run_id']),
                            'session.log')
                        with open(log_fname, 'r') as f:
                            err_txt = '\n'.join(f.readlines())
                    else:
                        err_txt = kwargs['ti'].xcom_pull(task_ids=op,
                                                         key='err_msg')
                    break
            else:
                # No nonzero retcode found despite overall failure
                err_txt = 'Unknown error'
            data = {
                'dataset_id': ctx['submission_id'],
                'status': 'Invalid',
                'message': err_txt
            }
            kwargs['ti'].xcom_push(key='collectiontype', value=None)
        print('data: ')
        pprint(data)

        response = http.run(endpoint, json.dumps(data), headers, extra_options)
        print('response: ')
        pprint(response.json())
 def read_metadata_file(**kwargs):
     """Load and return the parsed rslt.yml metadata for this run."""
     tmp_dir = utils.get_tmp_dir_path(kwargs['run_id'])
     md_path = os.path.join(tmp_dir, 'rslt.yml')
     with open(md_path, 'r') as md_file:
         return yaml.safe_load(md_file)
Example #23
0
    def send_status_msg(**kwargs):
        """Report pipeline success or failure to the ingest API.

        Inspects the return codes of the pipeline_exec/move_data tasks,
        assembles a status payload — including provenance and file metadata
        on success, or the session log text on failure — and PUTs it to
        /datasets/status for the derived dataset created earlier in this DAG.
        """
        ctx = kwargs['dag_run'].conf
        retcode_ops = ['pipeline_exec', 'move_data']
        retcodes = [
            int(kwargs['ti'].xcom_pull(task_ids=op)) for op in retcode_ops
        ]
        print('retcodes: ', {k: v for k, v in zip(retcode_ops, retcodes)})
        # Overall success requires every upstream task to have exited 0
        success = all([rc == 0 for rc in retcodes])
        # The derived dataset was created by the send_create_dataset task
        derived_dataset_uuid = kwargs['ti'].xcom_pull(
            key='derived_dataset_uuid', task_ids="send_create_dataset")
        ds_dir = kwargs['ti'].xcom_pull(task_ids='send_create_dataset')
        if 'metadata_to_return' in ctx['metadata']:
            md_to_return = ctx['metadata']['metadata_to_return']
        else:
            md_to_return = {}
        http_conn_id = 'ingest_api_connection'
        endpoint = '/datasets/status'
        method = 'PUT'
        crypt_auth_tok = kwargs['dag_run'].conf['crypt_auth_tok']
        headers = {
            'authorization': 'Bearer ' + decrypt_tok(crypt_auth_tok.encode()),
            'content-type': 'application/json'
        }
        # print('headers:')
        # pprint(headers)  # reduce exposure of auth_tok
        extra_options = []

        http = HttpHook(method, http_conn_id=http_conn_id)

        if success:
            md = {'metadata': md_to_return}
            # Merge provenance: legacy dict form ('dag_provenance') or the
            # newer list form ('dag_provenance_list')
            if 'dag_provenance' in kwargs['dag_run'].conf:
                md['dag_provenance'] = kwargs['dag_run'].conf[
                    'dag_provenance'].copy()
                md['dag_provenance'].update(
                    utils.get_git_provenance_dict([__file__]))
            else:
                dag_prv = (kwargs['dag_run'].conf['dag_provenance_list']
                           if 'dag_provenance_list' in kwargs['dag_run'].conf
                           else [])
                dag_prv.extend(utils.get_git_provenance_list([__file__]))
                md['dag_provenance_list'] = dag_prv
            md.update(
                utils.get_file_metadata_dict(
                    ds_dir, utils.get_tmp_dir_path(kwargs['run_id']), []))
            try:
                # Validate before sending; fall back to 'Error' status below
                assert_json_matches_schema(md, 'dataset_metadata_schema.yml')
                data = {
                    'dataset_id': derived_dataset_uuid,
                    'status': 'QA',
                    'message': 'the process ran',
                    'metadata': md
                }
            except AssertionError as e:
                print('invalid metadata follows:')
                pprint(md)
                data = {
                    'dataset_id': derived_dataset_uuid,
                    'status': 'Error',
                    'message':
                    'internal error; schema violation: {}'.format(e),
                    'metadata': {}
                }
        else:
            # On failure, forward the session log contents as the message
            log_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                     'session.log')
            with open(log_fname, 'r') as f:
                err_txt = '\n'.join(f.readlines())
            data = {
                'dataset_id': derived_dataset_uuid,
                'status': 'Invalid',
                'message': err_txt
            }
        print('data: ')
        pprint(data)

        response = http.run(endpoint, json.dumps(data), headers, extra_options)
        print('response: ')
        pprint(response.json())