Example #1
def test_handle_processed_files(run_awsf_event_data_secondary_files):
    data = run_awsf_event_data_secondary_files
    tibanna_settings = data.get('_tibanna', {})
    # get env from the tibanna settings in the test payload
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      s3_keys=data.get('s3_keys'),
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    workflow_uuid = data['workflow_uuid']
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)

    output_files, pf_meta = handle_processed_files(workflow_info, tibanna)
    assert output_files
    assert len(output_files) == 3
    for of in output_files:
        if of['extension'] == '.pairs.gz':
            assert of['secondary_file_extensions'] == ['.pairs.gz.px2']
            assert of['secondary_file_formats'] == ['pairs_px2']
            assert of['extra_files']
        else:
            assert 'secondary_file_extensions' not in of
            assert 'secondary_file_formats' not in of

    assert pf_meta
    assert len(pf_meta) == 3
    for pf in pf_meta:
        pdict = pf.__dict__
        if pdict['file_format'] == 'pairs':
            assert pdict['extra_files'] == [{'file_format': 'pairs_px2'}]
        else:
            assert 'extra_files' not in pdict
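
For orientation, here is a minimal sketch of the shape the run_awsf_event_data_secondary_files fixture has to provide for this test; only the key names come from the code above, and the uuid and env values are placeholders.

run_awsf_event_data_secondary_files = {
    "workflow_uuid": "<uuid-of-a-workflow-with-secondary-output-files>",  # placeholder
    "_tibanna": {"env": "<fourfront-env>"},  # env handed to Tibanna()
    "s3_keys": None,   # optional; passed straight through to Tibanna
    "ff_keys": None,   # optional fourfront access keys
}
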
Example #2
def test_check_export_fastqc_e2e(fastqc_payload, ff_keys, tibanna_env):
    # let's make sure we have a valid fastq file
    fastqs = ff_utils.get_metadata("/search/?type=FileFastq&limit=1",
                                   ff_keys)['@graph'][0]

    filename = fastqs['upload_key'].split('/')[1]
    fastqc_payload['workflow']['task_input']['inputs']['input_fastq'][
        'name'] = filename
    fastqc_payload['workflow']['export_report'][0]['value'] = fastqs['uuid']
    fastqc_payload['ff_meta']['output_files'][0]['value'] = fastqs['uuid']

    try:
        fastqc_payload.update(tibanna_env)
        ret = check_export_handler(fastqc_payload, None)
    except Exception as e:
        if "409" in e:
            # duplicate UUID, just ignore that
            return
        if "NoSuchKey" in e.message:
            pytest.skip("file not on s3 ignorring test")
            return
        raise e
    assert json.dumps(ret)
    assert ret['workflow']
Example #3
File: service.py Project: j1z0/tibanna
def md5_updater(status, wf_file, ff_meta, tibanna):
    # get key
    ff_key = tibanna.ff_keys
    # get metadata about original input file
    accession = wf_file.runner.inputfile_accessions[0]
    original_file = ff_utils.get_metadata(accession, key=ff_key)

    if status.lower() == 'uploaded':
        md5 = wf_file.read()
        original_md5 = original_file.get('content_md5sum', False)
        current_status = original_file.get('status', "uploading")
        if original_md5 and original_md5 != md5:
            # file status to be upload failed / md5 mismatch
            print("no matcho")
            md5_updater("upload failed", wf_file, ff_meta, tibanna)
        else:
            new_file = {}
            # change status to uploaded only if it is uploading or upload failed
            if current_status in ["uploading", "upload failed"]:
                new_file['status'] = 'uploaded'
            new_file['content_md5sum'] = md5

            try:
                ff_utils.patch_metadata(new_file, accession, key=ff_key)
            except Exception as e:
                # TODO: catch a more specific exception
                # if the patch fails, mark the original file's status as upload failed
                new_file = {}
                new_file['status'] = 'upload failed'
                new_file['description'] = str(e)
                ff_utils.patch_metadata(new_file,
                                        original_file['uuid'],
                                        key=ff_key)
    elif status == 'upload failed':
        new_file = {}
        new_file['status'] = 'upload failed'
        ff_utils.patch_metadata(new_file, original_file['uuid'], key=ff_key)

    # nothing to patch to ff_meta
    return None
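
To summarize the two outcomes above, these are the patch payloads the updater can send to fourfront (a sketch; the accession, uuid and md5 values are placeholders):

# success path: record the checksum; status flips to 'uploaded' only when it was
# 'uploading' or 'upload failed'
ff_utils.patch_metadata({'status': 'uploaded', 'content_md5sum': '<md5>'},
                        '<file-accession>', key=ff_key)
# mismatch / failure path: mark the original file as upload failed
ff_utils.patch_metadata({'status': 'upload failed'},
                        '<original-file-uuid>', key=ff_key)
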
Example #4
File: service.py Project: j1z0/tibanna
def md5_updater(status, sbg, ff_meta, tibanna):
    # get key
    ff_key = tibanna.ff_keys
    # get metadata about original input file
    accession = get_inputfile_accession(sbg)
    original_file = ff_utils.get_metadata(accession, key=ff_key)

    if status == 'uploaded':
        md5 = tibanna.s3.read_s3(ff_meta.output_files[0]['upload_key']).strip()
        original_md5 = original_file.get('content_md5sum', False)
        if original_md5 and original_md5 != md5:
            # file status to be upload failed / md5 mismatch
            print("no matcho")
            md5_updater("upload failed", sbg, ff_meta, tibanna)
        else:
            new_file = {}
            new_file['status'] = 'uploaded'
            new_file['content_md5sum'] = md5

            try:
                ff_utils.patch_metadata(new_file, accession, key=ff_key)
            except Exception as e:
                # TODO: catch a more specific exception
                # if the patch fails, mark the original file's status as upload failed
                new_file = {}
                new_file['status'] = 'upload failed'
                new_file['description'] = str(e)
                ff_utils.patch_metadata(new_file,
                                        original_file['uuid'],
                                        key=ff_key)
    elif status == 'upload failed':
        new_file = {}
        new_file['status'] = 'upload failed'
        ff_utils.patch_metadata(new_file, original_file['uuid'], key=ff_key)

    # nothing to patch to ff_meta
    return None
Example #5
File: service.py Project: j1z0/tibanna
def handler(event, context):
    '''
    This is a generic function to run an SBG workflow
    based on the data passed in.

    workflow_uuid : for now, pass this in. Later we can add code to automatically retrieve it from app_name.
    Note that multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have a different uuid).
    '''
    # get incoming data
    input_file_list = event.get('input_files')
    app_name = event.get('app_name')
    parameter_dict = event.get('parameters')
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    tibanna_settings = event.get('_tibanna', {})
    # if they don't pass in env guess it from output_bucket
    env = tibanna_settings.get('env', '-'.join(output_bucket.split('-')[1:-1]))
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      s3_keys=event.get('s3_keys'),
                      ff_keys=event.get('ff_keys'),
                      settings=tibanna_settings)

    LOG.info("input data is %s" % event)
    # represents the SBG info we need
    sbg = sbg_utils.create_sbg_workflow(app_name, tibanna.sbg_keys)
    LOG.info("sbg is %s" % sbg.__dict__)

    # convert run parameters into SBG workflow argument format
    parameters, _ = sbg_utils.to_sbg_workflow_args(parameter_dict,
                                                   vals_as_string=True)

    # get argument format & type info from workflow
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)
    LOG.info("workflow info  %s" % workflow_info)
    if 'error' in workflow_info.get('@type', []):
        raise Exception("FATAL, can't lookupt workflow info for % fourfront" %
                        workflow_uuid)

    # workflow_info has a key 'arguments' whose value is a list of dicts like
    # { 'workflow_argument_name': ..., 'argument_type': ..., 'argument_format': ... }

    # get format-extension map
    try:
        fp_schema = ff_utils.get_metadata("profiles/file_processed.json",
                                          key=tibanna.ff_keys)
        fe_map = fp_schema.get('file_format_file_extension')
    except Exception as e:
        LOG.error(
            "Can't get format-extension map from file_processed schema. %s\n" %
            e)
        fe_map = {}  # fall back to an empty map so the extension lookup below doesn't raise NameError

    # processed file metadata
    output_files = []
    pf_meta = []  # initialized here so the return value works even when 'arguments' is absent
    try:
        if 'arguments' in workflow_info:
            for arg in workflow_info.get('arguments'):
                if (arg.get('argument_type') in [
                        'Output processed file', 'Output report file',
                        'Output QC file'
                ]):

                    of = dict()
                    of['workflow_argument_name'] = arg.get(
                        'workflow_argument_name')
                    of['type'] = arg.get('argument_type')
                    if 'argument_format' in arg:
                        # arguments with an argument_format are processed files; post metadata for them
                        pf = ff_utils.ProcessedFileMetadata(
                            file_format=arg.get('argument_format'))
                        try:
                            resp = pf.post(
                                key=tibanna.ff_keys
                            )  # actually post processed file metadata here
                            resp = resp.get('@graph')[0]
                            of['upload_key'] = resp.get('upload_key')
                            of['value'] = resp.get('uuid')
                        except Exception as e:
                            LOG.error(
                                "Failed to post Processed file metadata. %s\n"
                                % e)
                            LOG.error("resp" + str(resp) + "\n")
                            raise e
                        of['format'] = arg.get('argument_format')
                        of['extension'] = fe_map.get(
                            arg.get('argument_format'))
                        pf_meta.append(pf)
                    output_files.append(of)

    except Exception as e:
        LOG.error("output_files = " + str(output_files) + "\n")
        LOG.error("Can't prepare output_files information. %s\n" % e)
        raise e

    # create the ff_meta output info
    input_files = []
    for input_file in input_file_list:
        for idx, uuid in enumerate(ensure_list(input_file['uuid'])):
            input_files.append({
                'workflow_argument_name': input_file['workflow_argument_name'],
                'value': uuid,
                'ordinal': idx + 1
            })
    LOG.info("input_files is %s" % input_files)

    ff_meta = ff_utils.create_ffmeta(sbg,
                                     workflow_uuid,
                                     input_files,
                                     parameters,
                                     run_url=tibanna.settings.get('url', ''),
                                     output_files=output_files)
    LOG.info("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # mount all input files on SBG; this also updates sbg to store the import_ids
    for infile in input_file_list:
        imps = mount_on_sbg(infile, tibanna.s3_keys, sbg)
        infile['import_ids'] = imps

    # create a link to the output directory as well
    if output_bucket:
        sbg_volume = sbg_utils.create_sbg_volume_details()
        res = sbg.create_volumes(sbg_volume,
                                 output_bucket,
                                 public_key=tibanna.s3_keys['key'],
                                 secret_key=tibanna.s3_keys['secret'])
        vol_id = res.get('id')
        if not vol_id:
            # we got an error
            raise Exception("Unable to mount output volume, error is %s " %
                            res)
        sbg.output_volume_id = vol_id

    # let's not pass keys in plain text parameters
    return {
        "input_file_args": input_file_list,
        "workflow": sbg.as_dict(),
        "ff_meta": ff_meta.as_dict(),
        "pf_meta": [meta.as_dict() for meta in pf_meta],
        "_tibanna": tibanna.as_dict(),
        "parameter_dict": parameter_dict
    }
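
For reference, a minimal sketch of the Lambda event this handler reads, assembled only from the event.get() calls above; every concrete value is a placeholder, and mount_on_sbg may need extra per-file fields that are not visible in this snippet.

event = {
    "app_name": "<sbg-app-name>",        # placeholder
    "workflow_uuid": "<workflow-uuid>",  # fourfront workflow item to look up
    "output_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput",  # placeholder; env is guessed as 'fourfront-webdev' from this name
    "input_files": [
        {"workflow_argument_name": "input_file",  # placeholder argument name
         "uuid": "<file-uuid>"}
    ],
    "parameters": {},
    "_tibanna": {"env": "fourfront-webdev"},  # optional; overrides the bucket-based guess
    "s3_keys": None,                          # optional credentials
    "ff_keys": None,
}
handler(event, None)
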
Example #6
File: service.py Project: j1z0/tibanna
def fastqc_updater(status, sbg, ff_meta, tibanna):
    if status == 'uploading':
        # still uploading; wait until the upload finishes
        return
    # keys
    ff_key = tibanna.ff_keys
    # move files to proper s3 location
    accession = get_inputfile_accession(sbg, input_file_name='input_fastq')
    zipped_report = ff_meta.output_files[0]['upload_key'].strip()
    files_to_parse = ['summary.txt', 'fastqc_data.txt', 'fastqc_report.html']
    LOG.info("accession is %s" % accession)

    try:
        files = tibanna.s3.unzip_s3_to_s3(zipped_report,
                                          accession,
                                          files_to_parse,
                                          acl='public-read')
    except Exception as e:
        LOG.info(tibanna.s3.__dict__)
        raise Exception("%s (key={})\n".format(zipped_report) % e)
    # parse fastqc metadata
    meta = parse_fastqc(files['summary.txt']['data'],
                        files['fastqc_data.txt']['data'],
                        url=files['fastqc_report.html']['s3key'])
    LOG.info("fastqc meta is %s" % meta)

    # post fastqc quality metric metadata
    qc_meta = ff_utils.post_to_metadata(meta,
                                        'quality_metric_fastqc',
                                        key=ff_key)
    if qc_meta.get('@graph'):
        qc_meta = qc_meta['@graph'][0]

    LOG.info("qc_meta is %s" % qc_meta)
    # update original file as well
    try:
        original_file = ff_utils.get_metadata(accession, key=ff_key)
        LOG.info("original_file is %s" % original_file)
    except Exception as e:
        raise Exception(
            "Couldn't get metadata for accession {} : ".format(accession) +
            str(e))
    patch_file = {'quality_metric': qc_meta['@id']}
    try:
        ff_utils.patch_metadata(patch_file, original_file['uuid'], key=ff_key)
    except Exception as e:
        raise Exception("patch_metadata failed in fastqc_updater." + str(e) +
                        "original_file ={}\n".format(str(original_file)))

    # patch the workflow run, value_qc is used to make drawing graphs easier.
    output_files = ff_meta.output_files
    output_files[0]['value_qc'] = qc_meta['@id']
    retval = {
        "output_quality_metrics": [{
            "name": "quality_metric_fastqc",
            "value": qc_meta['@id']
        }],
        "output_files": output_files
    }

    LOG.info("retval is %s" % retval)
    return retval