def test_handle_processed_files(run_awsf_event_data_secondary_files):
    data = run_awsf_event_data_secondary_files
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, s3_keys=data.get('s3_keys'),
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    workflow_uuid = data['workflow_uuid']
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)

    output_files, pf_meta = handle_processed_files(workflow_info, tibanna)
    assert output_files
    assert len(output_files) == 3
    for of in output_files:
        if of['extension'] == '.pairs.gz':
            assert of['secondary_file_extensions'] == ['.pairs.gz.px2']
            assert of['secondary_file_formats'] == ['pairs_px2']
            assert of['extra_files']
        else:
            assert 'secondary_file_extensions' not in of
            assert 'secondary_file_formats' not in of

    assert pf_meta
    assert len(pf_meta) == 3
    for pf in pf_meta:
        pdict = pf.__dict__
        if pdict['file_format'] == 'pairs':
            assert pdict['extra_files'] == [{'file_format': 'pairs_px2'}]
        else:
            assert 'extra_files' not in pdict
def test_check_export_fastqc_e2e(fastqc_payload, ff_keys, tibanna_env):
    # let's make sure we have a valid fastqc file
    fastqs = ff_utils.get_metadata("/search/?type=FileFastq&limit=1",
                                   key=ff_keys)['@graph'][0]
    filename = fastqs['upload_key'].split('/')[1]
    fastqc_payload['workflow']['task_input']['inputs']['input_fastq']['name'] = filename
    fastqc_payload['workflow']['export_report'][0]['value'] = fastqs['uuid']
    fastqc_payload['ff_meta']['output_files'][0]['value'] = fastqs['uuid']
    try:
        fastqc_payload.update(tibanna_env)
        ret = check_export_handler(fastqc_payload, None)
    except Exception as e:
        if "409" in str(e):
            # duplicate UUID, just ignore that
            return
        if "NoSuchKey" in str(e):
            pytest.skip("file not on s3, ignoring test")
            return
        raise
    assert json.dumps(ret)
    assert ret['workflow']
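# A minimal sketch of what the tibanna_env fixture above might return, assuming it
# only needs to supply the '_tibanna' settings block that Tibanna(...) reads 'env'
# from (see handler below). The env name is hypothetical; the real fixture may
# carry more settings.
@pytest.fixture
def tibanna_env_sketch():
    return {'_tibanna': {'env': 'fourfront-webdev'}}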
def md5_updater(status, wf_file, ff_meta, tibanna):
    # get key
    ff_key = tibanna.ff_keys
    # get metadata about the original input file
    accession = wf_file.runner.inputfile_accessions[0]
    original_file = ff_utils.get_metadata(accession, key=ff_key)

    if status.lower() == 'uploaded':
        md5 = wf_file.read()
        original_md5 = original_file.get('content_md5sum', False)
        current_status = original_file.get('status', "uploading")
        if original_md5 and original_md5 != md5:
            # md5 mismatch: set file status to upload failed
            print("no matcho")
            md5_updater("upload failed", wf_file, ff_meta, tibanna)
        else:
            new_file = {}
            # change status to uploaded only if it is uploading or upload failed
            if current_status in ["uploading", "upload failed"]:
                new_file['status'] = 'uploaded'
            new_file['content_md5sum'] = md5
            try:
                ff_utils.patch_metadata(new_file, accession, key=ff_key)
            except Exception as e:
                # TODO: catch a more specific exception
                # if the patch fails, mark the original file as upload failed
                new_file = {'status': 'upload failed',
                            'description': str(e)}
                ff_utils.patch_metadata(new_file, original_file['uuid'], key=ff_key)
    elif status.lower() == 'upload failed':
        new_file = {'status': 'upload failed'}
        ff_utils.patch_metadata(new_file, original_file['uuid'], key=ff_key)
    # nothing to patch to ff_meta
    return None
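# A minimal sketch of how the md5-mismatch path of the md5_updater above could be
# exercised in isolation, assuming unittest.mock stand-ins for ff_utils and the
# wf_file/tibanna objects. The accession, uuid, and md5 values are hypothetical.
from unittest import mock

def test_md5_updater_mismatch_sketch():
    wf_file = mock.MagicMock()
    wf_file.read.return_value = 'abc123'                    # md5 computed by the workflow
    wf_file.runner.inputfile_accessions = ['4DNFIXXXXXXX']  # hypothetical accession
    tibanna = mock.MagicMock()
    with mock.patch.object(ff_utils, 'get_metadata',
                           return_value={'uuid': 'some-uuid',
                                         'content_md5sum': 'def456',  # differs -> mismatch
                                         'status': 'uploading'}), \
         mock.patch.object(ff_utils, 'patch_metadata') as patched:
        md5_updater('uploaded', wf_file, None, tibanna)
    # the recursive "upload failed" call should have patched the original file's status
    patched.assert_called_with({'status': 'upload failed'}, 'some-uuid',
                               key=tibanna.ff_keys)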
def md5_updater(status, sbg, ff_meta, tibanna):
    # get key
    ff_key = tibanna.ff_keys
    # get metadata about the original input file
    accession = get_inputfile_accession(sbg)
    original_file = ff_utils.get_metadata(accession, key=ff_key)

    if status == 'uploaded':
        md5 = tibanna.s3.read_s3(ff_meta.output_files[0]['upload_key']).strip()
        original_md5 = original_file.get('content_md5sum', False)
        if original_md5 and original_md5 != md5:
            # md5 mismatch: set file status to upload failed
            print("no matcho")
            md5_updater("upload failed", sbg, ff_meta, tibanna)
        else:
            new_file = {'status': 'uploaded',
                        'content_md5sum': md5}
            try:
                ff_utils.patch_metadata(new_file, accession, key=ff_key)
            except Exception as e:
                # TODO: catch a more specific exception
                # if the patch fails, mark the original file as upload failed
                new_file = {'status': 'upload failed',
                            'description': str(e)}
                ff_utils.patch_metadata(new_file, original_file['uuid'], key=ff_key)
    elif status == 'upload failed':
        new_file = {'status': 'upload failed'}
        ff_utils.patch_metadata(new_file, original_file['uuid'], key=ff_key)
    # nothing to patch to ff_meta
    return None
def handler(event, context):
    '''
    this is a generic function to run an sbg workflow based on the data passed in

    workflow_uuid : for now, pass this on. Later we can add code to automatically
    retrieve this from app_name.
    Note: multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have different uuids)
    '''
    # get incoming data
    input_file_list = event.get('input_files')
    app_name = event.get('app_name')
    parameter_dict = event.get('parameters')
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    tibanna_settings = event.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env', '-'.join(output_bucket.split('-')[1:-1]))
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, s3_keys=event.get('s3_keys'),
                      ff_keys=event.get('ff_keys'),
                      settings=tibanna_settings)
    LOG.info("input data is %s" % event)

    # represents the SBG info we need
    sbg = sbg_utils.create_sbg_workflow(app_name, tibanna.sbg_keys)
    LOG.info("sbg is %s" % sbg.__dict__)

    # represents the workflow metadata to be stored in fourfront
    parameters, _ = sbg_utils.to_sbg_workflow_args(parameter_dict, vals_as_string=True)

    # get argument format & type info from the workflow.
    # workflow_info has a key 'arguments' whose value is a list of
    # {'workflow_argument_name': ..., 'argument_type': ..., 'argument_format': ...}
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)
    LOG.info("workflow info %s" % workflow_info)
    if 'error' in workflow_info.get('@type', []):
        raise Exception("FATAL, can't look up workflow info for %s on fourfront" % workflow_uuid)

    # get format-extension map
    fe_map = {}
    try:
        fp_schema = ff_utils.get_metadata("profiles/file_processed.json", key=tibanna.ff_keys)
        fe_map = fp_schema.get('file_format_file_extension')
    except Exception as e:
        LOG.error("Can't get format-extension map from file_processed schema. %s\n" % e)

    # processed file metadata
    output_files = []
    pf_meta = []
    try:
        for arg in workflow_info.get('arguments', []):
            if arg.get('argument_type') in ['Output processed file',
                                            'Output report file',
                                            'Output QC file']:
                of = dict()
                of['workflow_argument_name'] = arg.get('workflow_argument_name')
                of['type'] = arg.get('argument_type')
                if 'argument_format' in arg:
                    # arguments with a format are processed files (as opposed to
                    # report or QC files); post their metadata to fourfront here
                    pf = ff_utils.ProcessedFileMetadata(file_format=arg.get('argument_format'))
                    resp = None
                    try:
                        # actually post processed file metadata here
                        resp = pf.post(key=tibanna.ff_keys)
                        resp = resp.get('@graph')[0]
                        of['upload_key'] = resp.get('upload_key')
                        of['value'] = resp.get('uuid')
                    except Exception as e:
                        LOG.error("Failed to post Processed file metadata. %s\n" % e)
                        LOG.error("resp" + str(resp) + "\n")
                        raise e
                    of['format'] = arg.get('argument_format')
                    of['extension'] = fe_map.get(arg.get('argument_format'))
                    pf_meta.append(pf)
                output_files.append(of)
    except Exception as e:
        LOG.error("output_files = " + str(output_files) + "\n")
        LOG.error("Can't prepare output_files information. %s\n" % e)
        raise e

    # create the ff_meta output info
    input_files = []
    for input_file in input_file_list:
        for idx, uuid in enumerate(ensure_list(input_file['uuid'])):
            input_files.append({'workflow_argument_name': input_file['workflow_argument_name'],
                                'value': uuid,
                                'ordinal': idx + 1})
    LOG.info("input_files is %s" % input_files)

    ff_meta = ff_utils.create_ffmeta(sbg, workflow_uuid, input_files, parameters,
                                     run_url=tibanna.settings.get('url', ''),
                                     output_files=output_files)
    LOG.info("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # mount all input files to sbg; this will also update sbg to store the import_ids
    for infile in input_file_list:
        imps = mount_on_sbg(infile, tibanna.s3_keys, sbg)
        infile['import_ids'] = imps

    # create a link to the output directory as well
    if output_bucket:
        sbg_volume = sbg_utils.create_sbg_volume_details()
        res = sbg.create_volumes(sbg_volume, output_bucket,
                                 public_key=tibanna.s3_keys['key'],
                                 secret_key=tibanna.s3_keys['secret'])
        vol_id = res.get('id')
        if not vol_id:
            # we got an error
            raise Exception("Unable to mount output volume, error is %s" % res)
        sbg.output_volume_id = vol_id

    # let's not pass keys in plain-text parameters
    return {"input_file_args": input_file_list,
            "workflow": sbg.as_dict(),
            "ff_meta": ff_meta.as_dict(),
            "pf_meta": [meta.as_dict() for meta in pf_meta],
            "_tibanna": tibanna.as_dict(),
            "parameter_dict": parameter_dict}
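# As a rough illustration of the event shape handler() consumes: only the top-level
# keys (input_files, app_name, parameters, workflow_uuid, output_bucket, _tibanna)
# are grounded in the code above; the bucket names, uuids, and per-file keys are
# hypothetical. If _tibanna.env is omitted, handler derives it from output_bucket
# ('elasticbeanstalk-fourfront-webdev-wfoutput' -> 'fourfront-webdev').
example_event = {
    "app_name": "md5",  # hypothetical app name
    "workflow_uuid": "00000000-0000-0000-0000-000000000000",
    "output_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput",
    "parameters": {"threads": 1},
    "input_files": [
        {"workflow_argument_name": "input_file",
         "uuid": "11111111-1111-1111-1111-111111111111"}
    ],
    "_tibanna": {"env": "fourfront-webdev"},
}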
def fastqc_updater(status, sbg, ff_meta, tibanna):
    if status == 'uploading':
        # wait until this bad boy is finished
        return
    # keys
    ff_key = tibanna.ff_keys
    # move files to the proper s3 location
    accession = get_inputfile_accession(sbg, input_file_name='input_fastq')
    zipped_report = ff_meta.output_files[0]['upload_key'].strip()
    files_to_parse = ['summary.txt', 'fastqc_data.txt', 'fastqc_report.html']
    LOG.info("accession is %s" % accession)
    try:
        files = tibanna.s3.unzip_s3_to_s3(zipped_report, accession, files_to_parse,
                                          acl='public-read')
    except Exception as e:
        LOG.info(tibanna.s3.__dict__)
        raise Exception("%s (key=%s)\n" % (e, zipped_report))

    # parse fastqc metadata
    meta = parse_fastqc(files['summary.txt']['data'],
                        files['fastqc_data.txt']['data'],
                        url=files['fastqc_report.html']['s3key'])
    LOG.info("fastqc meta is %s" % meta)

    # post fastqc metadata
    qc_meta = ff_utils.post_to_metadata(meta, 'quality_metric_fastqc', key=ff_key)
    if qc_meta.get('@graph'):
        qc_meta = qc_meta['@graph'][0]
    LOG.info("qc_meta is %s" % qc_meta)

    # update the original file as well
    try:
        original_file = ff_utils.get_metadata(accession, key=ff_key)
        LOG.info("original_file is %s" % original_file)
    except Exception as e:
        raise Exception("Couldn't get metadata for accession {}: ".format(accession) + str(e))
    patch_file = {'quality_metric': qc_meta['@id']}
    try:
        ff_utils.patch_metadata(patch_file, original_file['uuid'], key=ff_key)
    except Exception as e:
        raise Exception("patch_metadata failed in fastqc_updater. " + str(e) +
                        " original_file={}\n".format(str(original_file)))

    # patch the workflow run; value_qc is used to make drawing graphs easier
    output_files = ff_meta.output_files
    output_files[0]['value_qc'] = qc_meta['@id']
    retval = {"output_quality_metrics": [{"name": "quality_metric_fastqc",
                                          "value": qc_meta['@id']}],
              "output_files": output_files}
    LOG.info("retval is %s" % retval)
    return retval
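# parse_fastqc is referenced above but not shown. A rough sketch of what such a
# parser might do, assuming FastQC's standard tab-separated summary.txt format
# ("STATUS<TAB>Module Name<TAB>filename"); the field names in the returned dict
# are illustrative, not necessarily the real quality_metric_fastqc schema.
def parse_fastqc_sketch(summary_data, fastqc_data, url=None):
    meta = {'url': url} if url else {}
    # summary.txt: one module per line, e.g. "PASS\tBasic Statistics\tsample.fastq.gz"
    for line in summary_data.splitlines():
        fields = line.strip().split('\t')
        if len(fields) >= 2:
            status, module = fields[0], fields[1]
            meta[module] = status
    # fastqc_data.txt: the Basic Statistics block holds key/value pairs,
    # e.g. "Total Sequences\t1000000"
    for line in fastqc_data.splitlines():
        if line.startswith('Total Sequences'):
            meta['Total Sequences'] = int(line.split('\t')[1])
    return meta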