def check_mismatch_and_update(x, original_x, fieldname):
    if check_mismatch(x, original_x):
        raise Exception(fieldname + " not matching the original one")
    if x and not original_x:
        new_content[fieldname] = x
    printlog("check_mismatch_and_update: new_content = %s" % str(new_content))
def update_ffmeta_from_awsemfile(awsemfile, ff_meta, tibanna, custom_qc_fields=None):
    patch_meta = False
    upload_key = awsemfile.key
    status = awsemfile.status
    printlog("awsemfile res is %s" % status)
    if status == 'COMPLETED':
        patch_meta = OUTFILE_UPDATERS[awsemfile.argument_type](
            'uploaded', awsemfile, ff_meta, tibanna, other_fields=custom_qc_fields)
    elif status in ['FAILED']:
        patch_meta = OUTFILE_UPDATERS[awsemfile.argument_type](
            'upload failed', awsemfile, ff_meta, tibanna, other_fields=custom_qc_fields)
        ff_meta.run_status = 'error'
        ff_meta.patch(key=tibanna.ff_keys)
        raise Exception("Failed to export file %s" % upload_key)
    return patch_meta
def create_and_post_processed_file(ff_keys, file_format, secondary_file_formats,
                                   source_experiments=None, other_fields=None):
    printlog(file_format)
    if not file_format:
        raise Exception("file format for processed file must be provided")
    if secondary_file_formats:
        extra_files = [{"file_format": parse_formatstr(v)} for v in secondary_file_formats]
    else:
        extra_files = None
    pf = ProcessedFileMetadata(file_format=file_format,
                               extra_files=extra_files,
                               source_experiments=source_experiments,
                               other_fields=other_fields)
    # actually post processed file metadata here
    resp = pf.post(key=ff_keys)
    if resp and '@graph' in resp:
        resp = resp.get('@graph')[0]
    else:
        raise Exception("Failed to post Processed file metadata.\n")
    return pf, resp
def handle_postrun_json(bucket_name, jobid, event, raise_error=True, filesystem=None):
    postrunjson = "%s.postrun.json" % jobid
    if not does_key_exist(bucket_name, postrunjson):
        if raise_error:
            postrunjson_location = "https://s3.amazonaws.com/%s/%s" % (bucket_name, postrunjson)
            raise Exception("Postrun json not found at %s" % postrunjson_location)
        return None
    postrunjsoncontent = json.loads(read_s3(bucket_name, postrunjson))
    if 'instance_id' in event:
        update_postrun_json(postrunjsoncontent, event['instance_id'], filesystem)
    printlog("inside function handle_postrun_json")
    printlog("content=\n" + json.dumps(postrunjsoncontent, indent=4))
    try:
        boto3.client('s3').put_object(Bucket=bucket_name, Key=postrunjson,
                                      Body=json.dumps(postrunjsoncontent, indent=4).encode())
    except Exception as e:
        raise Exception("error in updating postrunjson %s" % str(e))
    add_postrun_json(postrunjsoncontent, event, RESPONSE_JSON_CONTENT_INCLUSION_LIMIT)
def test_tmp(update_ffmeta_tmpdata, tibanna_env):
    update_ffmeta_tmpdata.update(tibanna_env)
    with mock.patch('core.pony_utils.patch_metadata') as mock_request:
        ret = real_handler(update_ffmeta_tmpdata, None)
        # once for patch pf, once for workflow run
        assert mock_request.call_count == 3
    printlog(ret)
    assert ret
def test_register_to_higlass3(used_env):
    bucket = 'elasticbeanstalk-fourfront-webdev-wfoutput'
    bigbed_key = 'a34d5ea5-eada-4def-a4a7-c227b0d32395/4DNFIC624FKJ.bb'
    tibanna = Tibanna(used_env)
    with mock.patch('requests.post') as mock_request:
        res = register_to_higlass(tibanna, bucket, bigbed_key, 'bigwig', 'vector')
    mock_request.assert_called_once()
    printlog(res)
    assert res
def test_register_to_higlass2(used_env):
    bucket = 'elasticbeanstalk-fourfront-webdev-wfoutput'
    bigwig_key = 'a940cf00-6001-473e-80d1-1e4a43866863/4DNFI75GAT6T.bw'
    tibanna = Tibanna(used_env)
    with mock.patch('requests.post') as mock_request:
        res = register_to_higlass(tibanna, bucket, bigwig_key, 'bigwig', 'vector')
    mock_request.assert_called_once()
    printlog(res)
    assert res
def output_target_for_input_extra(target_inf, of, tibanna, overwrite_input_extra=False):
    extrafileexists = False
    printlog("target_inf = %s" % str(target_inf))  # debugging
    target_inf_meta = ff_utils.get_metadata(target_inf.get('value'),
                                            key=tibanna.ff_keys,
                                            ff_env=tibanna.env,
                                            add_on='frame=object',
                                            check_queue=True)
    target_format = parse_formatstr(of.get('format'))
    if target_inf_meta.get('extra_files'):
        for exf in target_inf_meta.get('extra_files'):
            if parse_formatstr(exf.get('file_format')) == target_format:
                extrafileexists = True
                if overwrite_input_extra:
                    exf['status'] = 'to be uploaded by workflow'
                break
        if not extrafileexists:
            new_extra = {'file_format': target_format, 'status': 'to be uploaded by workflow'}
            target_inf_meta['extra_files'].append(new_extra)
    else:
        new_extra = {'file_format': target_format, 'status': 'to be uploaded by workflow'}
        target_inf_meta['extra_files'] = [new_extra]
    if overwrite_input_extra or not extrafileexists:
        # first patch metadata
        printlog("extra_files_to_patch: %s" % str(target_inf_meta.get('extra_files')))  # debugging
        ff_utils.patch_metadata({'extra_files': target_inf_meta.get('extra_files')},
                                target_inf.get('value'),
                                key=tibanna.ff_keys,
                                ff_env=tibanna.env)
        # target key
        # NOTE: the target bucket is assumed to be the same as the output bucket,
        # i.e. the bucket for the input file should be the same as the output bucket,
        # which is true if both input and output are processed files.
        orgfile_key = target_inf_meta.get('upload_key')
        orgfile_format = parse_formatstr(target_inf_meta.get('file_format'))
        fe_map = FormatExtensionMap(tibanna.ff_keys)
        printlog("orgfile_key = %s" % orgfile_key)
        printlog("orgfile_format = %s" % orgfile_format)
        printlog("target_format = %s" % target_format)
        target_key = get_extra_file_key(orgfile_format, orgfile_key, target_format, fe_map)
        return target_key
    else:
        raise Exception("input already has extra: 'User overwrite_input_extra': true")
def as_dict(self):
    d = self.__dict__.copy()
    printlog(d)
    del d['client']
    del d['starttimes']
    del d['endtimes']
    del d['starttime']
    del d['endtime']
    del d['filesystem']
    del d['instance_id']
    return d
def create_ffmeta_input_files_from_pony_input_file_list(input_file_list):
    input_files_for_ffmeta = []
    for input_file in input_file_list:
        dim = flatten(create_dim(input_file['uuid']))
        if not dim:  # singlet
            dim = '0'
        uuid = flatten(input_file['uuid'])
        ordinal = create_ordinal(uuid)
        for d, u, o in zip(aslist(dim), aslist(uuid), aslist(ordinal)):
            infileobj = InputFileForWFRMeta(input_file['workflow_argument_name'], u, o,
                                            input_file.get('format_if_extra', ''), d)
            input_files_for_ffmeta.append(infileobj.as_dict())
    printlog("input_files_for_ffmeta is %s" % input_files_for_ffmeta)
    return input_files_for_ffmeta
def __init__(self, ff_keys):
    try:
        printlog("Searching in server : " + ff_keys['server'])
        ffe_all = search_metadata("/search/?type=FileFormat&frame=object", key=ff_keys)
    except Exception as e:
        raise Exception("Can't get the list of FileFormat objects. %s\n" % e)
    self.fe_dict = dict()
    printlog("**ffe_all = " + str(ffe_all))
    for k in ffe_all:
        file_format = k['file_format']
        self.fe_dict[file_format] = {
            'standard_extension': k['standard_file_extension'],
            'other_allowed_extensions': k.get('other_allowed_extensions', []),
            'extrafile_formats': k.get('extrafile_formats', [])
        }
def handler(event, context):
    # fix non json-serializable datetime startDate
    if 'Records' in event and 'eventTime' in event['Records']:
        event["Records"]["eventTime"] = str(event["Records"]["eventTime"])
    upload_key = event['Records'][0]['s3']['object']['key']
    accession = upload_key.split('/')[1].split('.')[0]
    if not accession.startswith('4DN'):
        printlog("Skipping trigger: not 4DN accession %s" % accession)
        return event
    client = boto3.client('stepfunctions', region_name=AWS_REGION)
    response = client.start_execution(
        stateMachineArn=STEP_FUNCTION_ARN(INITIATOR_STEP_FUNCTION_NAME),
        name=accession + '_' + str(uuid.uuid4()),
        input=json.dumps(event),
    )
    printlog(str(response))
    return event
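# Illustrative sketch (not part of the original module): a hand-written S3 PUT event of
# the shape handler() above expects, useful for local testing. The bucket name, uuid and
# accession below are hypothetical placeholders.
def _example_trigger_event():
    return {
        "Records": [{
            "s3": {
                "bucket": {"name": "elasticbeanstalk-fourfront-webdev-files"},
                # key format is "<uuid>/<accession>.<extension>"; the accession must
                # start with '4DN' for the trigger to start the initiator step function
                "object": {"key": "11111111-2222-3333-4444-555555555555/4DNFIEXAMPLE.fastq.gz"}
            }
        }]
    }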
def add_md5_filesize_to_pf_extra(pf, awsemfile):
    printlog("awsemfile.is_extra=%s" % awsemfile.is_extra)
    if awsemfile.is_extra:
        for pfextra in pf.extra_files:
            printlog("pfextra : %s" % str(pfextra))
            printlog("awsemfile.format_if_extra : %s" % awsemfile.format_if_extra)
            if pfextra.get('file_format') == awsemfile.format_if_extra:
                if awsemfile.md5:
                    pfextra['md5sum'] = awsemfile.md5
                if awsemfile.filesize:
                    pfextra['file_size'] = awsemfile.filesize
    printlog("add_md5_filesize_to_pf_extra: %s" % pf.extra_files)
def get_file_format(event):
    '''If the file extension matches the regular file format, returns (format, None).
    If it matches the format of one of the extra files, returns
    (format (e.g. 'pairs_px2'), 'extra').
    '''
    # guess env from bucket name
    bucket = event['Records'][0]['s3']['bucket']['name']
    env = '-'.join(bucket.split('-')[1:3])
    if env == 'fourfront-webprod':
        env = 'data'
    upload_key = event['Records'][0]['s3']['object']['key']
    uuid, object_key = upload_key.split('/')
    accession = object_key.split('.')[0]
    extension = object_key.replace(accession + '.', '')
    try:
        tibanna = Tibanna(env=env)
    except Exception as e:
        raise TibannaStartException("%s" % e)
    file_format, extra_formats = get_fileformats_for_accession(accession, tibanna.ff_keys, env)
    if file_format:
        fe_map = FormatExtensionMap(tibanna.ff_keys)
        printlog(fe_map)
        if extension == fe_map.get_extension(file_format):
            return (file_format, None)
        elif extension in fe_map.get_other_extensions(file_format):
            return (file_format, None)
        else:
            for extra_format in extra_formats:
                if extension == fe_map.get_extension(extra_format):
                    return (extra_format, 'extra')
                elif extension in fe_map.get_other_extensions(extra_format):
                    return (extra_format, 'extra')
            raise Exception("file extension not matching: %s vs %s (%s)"
                            % (extension, fe_map.get_extension(file_format), file_format))
    else:
        raise Exception("Cannot get input metadata")
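# Illustrative sketch (assumption, not from the original source): how a caller might
# interpret get_file_format()'s two-element return value for the same S3 event.
def _example_dispatch_on_file_format(event):
    file_format, extra = get_file_format(event)
    if extra == 'extra':
        # the uploaded object matched an extra-file format (e.g. 'pairs_px2'), not the main file
        printlog("extra file of format %s uploaded" % file_format)
    else:
        printlog("main file of format %s uploaded" % file_format)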
def create_wfr_output_files_and_processed_files(wf_meta, tibanna, pf_source_experiments=None,
                                                custom_fields=None,
                                                user_supplied_output_files=None):
    output_files = []
    pf_meta = []
    arg_type_list = ['Output processed file', 'Output report file',
                     'Output QC file', 'Output to-be-extra-input file']
    for arg in wf_meta.get('arguments', []):
        printlog("processing arguments %s" % str(arg))
        if arg.get('argument_type') in arg_type_list:
            if user_supplied_output_files:
                pf, resp = user_supplied_proc_file(user_supplied_output_files,
                                                   arg.get('workflow_argument_name'), tibanna)
                printlog("proc_file_for_arg_name returned %s \nfrom ff result of\n %s"
                         % (str(pf.__dict__), str(resp)))
            else:
                if arg.get('argument_type', '') == 'Output processed file':
                    argname = arg.get('workflow_argument_name')
                    pf, resp = create_and_post_processed_file(
                        tibanna.ff_keys,
                        arg.get('argument_format', ''),
                        arg.get('secondary_file_formats', []),
                        pf_source_experiments,
                        parse_custom_fields(custom_fields, argname))
                else:
                    pf = None
                    resp = dict()
            of = create_wfr_outputfiles(arg, resp)
            if pf:
                pf_meta.append(pf)
            if of:
                output_files.append(of.as_dict())
    return output_files, pf_meta
def register_to_higlass(tibanna, awsemfile_bucket, awsemfile_key, filetype, datatype):
    payload = {"filepath": awsemfile_bucket + "/" + awsemfile_key,
               "filetype": filetype,
               "datatype": datatype}
    higlass_keys = tibanna.s3.get_higlass_key()
    if not isinstance(higlass_keys, dict):
        raise Exception("Bad higlass keys found: %s" % higlass_keys)
    auth = (higlass_keys['key'], higlass_keys['secret'])
    headers = {'Content-Type': 'application/json',
               'Accept': 'application/json'}
    res = requests.post(higlass_keys['server'] + '/api/v1/link_tile/',
                        data=json.dumps(payload), auth=auth, headers=headers)
    printlog("LOG register_to_higlass (POST request response): " + str(res.json()))
    return res.json()['uuid']
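# Illustrative sketch (assumption, not from the original source): the shape of the
# higlass keys dict that tibanna.s3.get_higlass_key() is expected to return for
# register_to_higlass() above. Values are placeholders; only the key names mirror
# what the function reads.
_example_higlass_keys = {
    "server": "https://higlass.example.org",  # hypothetical higlass server
    "key": "someuser",
    "secret": "somesecret",
}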
def test_merge_source_experiment(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": ["d2c897ec-bdb2-47ce-b1b1-845daccaa571", "d2c897ec-bdb2-47ce-b1b1-845daccaa571"],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, ff_keys=data.get('ff_keys'), settings=tibanna_settings)
    res = merge_source_experiments(input_file['uuid'], tibanna.ff_keys, tibanna.env)
    printlog(res)
    assert 'fake_source_experiment' in res
def user_supplied_proc_file(user_supplied_output_files, arg_name, tibanna):
    if not user_supplied_output_files:
        raise Exception("user supplied processed files missing\n")
    of = [output for output in user_supplied_output_files
          if output.get('workflow_argument_name') == arg_name]
    if of:
        if len(of) > 1:
            raise Exception("multiple output files supplied with same workflow_argument_name")
        of = of[0]
        return ProcessedFileMetadata.get(of.get('uuid'), tibanna.ff_keys, tibanna.env,
                                         return_data=True)
    else:
        printlog("no output_files found in input_json matching arg_name")
        printlog("user_supplied_output_files: %s" % str(user_supplied_output_files))
        printlog("arg_name: %s" % str(arg_name))
        printlog("tibanna is %s" % str(tibanna))
        raise Exception("user supplied processed files missing\n")
def md5_updater(status, awsemfile, ff_meta, tibanna, **kwargs):
    # get key
    ff_key = tibanna.ff_keys
    # get metadata about original input file
    accession = awsemfile.runner.get_file_accessions('input_file')[0]
    format_if_extras = awsemfile.runner.get_format_if_extras('input_file')
    original_file = ff_utils.get_metadata(accession,
                                          key=ff_key,
                                          ff_env=tibanna.env,
                                          add_on='frame=object',
                                          check_queue=True)
    if status.lower() == 'uploaded':  # md5 report file is uploaded
        md5, content_md5 = parse_md5_report(awsemfile.read())
        # add file size to input file metadata
        input_file = awsemfile.runner.input_files()[0]
        file_size = boto3.client('s3').head_object(
            Bucket=input_file.bucket, Key=input_file.key).get('ContentLength', '')
        for format_if_extra in format_if_extras:
            printlog("format_if_extra : %s" % format_if_extra)
            new_file = _md5_updater(original_file, md5, content_md5, format_if_extra, file_size)
            if new_file:
                break
        printlog("new_file = %s" % str(new_file))
        if new_file:
            try:
                resp = ff_utils.patch_metadata(new_file, accession, key=ff_key)
                printlog(resp)
            except Exception as e:
                # TODO: specific exception
                # if patch fails, try to patch workflow status as failed
                raise e
    else:
        pass
    # nothing to patch to ff_meta
    return None
def real_handler(event, context):
    # check the status and other details of import
    '''
    this is to check if the task run is done:
    http://docs.sevenbridges.com/reference#get-task-execution-details
    '''
    # get data
    # used to automatically determine the environment
    tibanna_settings = event.get('_tibanna', {})
    try:
        tibanna = Tibanna(tibanna_settings['env'], settings=tibanna_settings)
    except Exception as e:
        raise TibannaStartException("%s" % e)
    ff_meta = create_ffmeta_awsem(app_name=event.get('ff_meta').get('awsem_app_name'),
                                  **event.get('ff_meta'))
    if event.get('error', False):
        ff_meta.run_status = 'error'
        ff_meta.description = event.get('error')
        patch_res = ff_meta.patch(key=tibanna.ff_keys)
        printlog("patch response: " + str(patch_res))
        # send a notification email before throwing the error
        if 'email' in event['config'] and event['config']['email']:
            try:
                send_notification_email(event['_tibanna']['settings']['run_name'],
                                        event['jobid'],
                                        ff_meta.run_status,
                                        event['_tibanna']['settings']['url'])
            except Exception as e:
                printlog("Cannot send email: %s" % e)
        raise Exception(event.get('error'))
    metadata_only = event.get('metadata_only', False)
    pf_meta = [ProcessedFileMetadata(**pf) for pf in event.get('pf_meta')]
    custom_qc_fields = event.get('custom_qc_fields', None)
    # ensure this bad boy is always initialized
    awsem = Awsem(event)
    # go through this and replace awsemfile_report with awsf format
    # actually the interface should look through ff_meta files and ask the runner
    # for the status of each file, so we just build a runner with an interface to sbg and awsem:
    # runner.output_files.length()
    # runner.output_files.file.status
    # runner.output_files.file.loc
    # runner.output_files.file.get
    awsem_output = awsem.output_files()
    awsem_output_extra = awsem.secondary_output_files()
    ff_output = len(ff_meta.output_files)
    if len(awsem_output) != ff_output:
        ff_meta.run_status = 'error'
        ff_meta.description = "%d files output expected %s" % (ff_output, len(awsem_output))
        ff_meta.patch(key=tibanna.ff_keys)
        raise Exception("Failing the workflow because output files = %d and ffmeta = %d"
                        % (len(awsem_output), ff_output))

    def update_metadata_from_awsemfile_list(awsemfile_list):
        patch_meta = False
        for awsemfile in awsemfile_list:
            patch_meta = update_ffmeta_from_awsemfile(awsemfile, ff_meta, tibanna, custom_qc_fields)
            if not metadata_only:
                update_pfmeta_from_awsemfile(awsemfile, pf_meta, tibanna)
        # allow a simple way for the updater to add appropriate metadata
        if patch_meta:
            ff_meta.__dict__.update(patch_meta)

    update_metadata_from_awsemfile_list(awsem_output)
    update_metadata_from_awsemfile_list(awsem_output_extra)
    # if we got all the awsemfiles, let's go ahead and update our ff_metadata object
    ff_meta.run_status = "complete"
    # add postrunjson log file to ff_meta as a url
    ff_meta.awsem_postrun_json = get_postrunjson_url(event)
    # make all the file awsemfile meta-data stuff here
    # TODO: fix bugs with ff_meta mapping for output and input file
    try:
        ff_meta.patch(key=tibanna.ff_keys)
    except Exception as e:
        raise Exception("Failed to update run_status %s" % str(e))
    # patch processed files - update only status, extra_files, md5sum and file_size
    if pf_meta:
        patch_fields = ['uuid', 'status', 'extra_files', 'md5sum', 'file_size', 'higlass_uid']
        try:
            for pf in pf_meta:
                printlog(pf.as_dict())
                pf.patch(key=tibanna.ff_keys, fields=patch_fields)
        except Exception as e:
            raise Exception("Failed to update processed metadata %s" % str(e))
    event['ff_meta'] = ff_meta.as_dict()
    event['pf_meta'] = [_.as_dict() for _ in pf_meta]
    # send a notification email after the job finishes
    if 'email' in event['config'] and event['config']['email']:
        try:
            send_notification_email(event['_tibanna']['settings']['run_name'],
                                    event['jobid'],
                                    event['ff_meta']['run_status'],
                                    event['_tibanna']['settings']['url'])
        except Exception as e:
            printlog("Cannot send email: %s" % e)
    return event
def launch_and_get_instance_id(launch_args, jobid, spot_instance=None, spot_duration=None,
                               behavior_on_capacity_limit='fail'):
    try:  # capturing stdout from the launch command
        os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'  # necessary? not sure, just put it in there
        ec2 = boto3.client('ec2')
    except Exception as e:
        raise Exception("Failed to create a client for EC2")
    if spot_instance:
        spot_options = {'SpotInstanceType': 'one-time',
                        'InstanceInterruptionBehavior': 'terminate'}
        if spot_duration:
            spot_options['BlockDurationMinutes'] = spot_duration
        launch_args.update({'InstanceMarketOptions': {'MarketType': 'spot',
                                                      'SpotOptions': spot_options}})
    try:
        res = 0
        res = ec2.run_instances(**launch_args)
    except Exception as e:
        if 'InsufficientInstanceCapacity' in str(e) or 'InstanceLimitExceeded' in str(e):
            if behavior_on_capacity_limit == 'fail':
                errmsg = "Instance limit exception - use 'behavior_on_capacity_limit' option "
                errmsg += "to change the behavior to wait_and_retry or retry_without_spot. %s" % str(e)
                raise EC2InstanceLimitException(errmsg)
            elif behavior_on_capacity_limit == 'wait_and_retry':
                errmsg = "Instance limit exception - wait and retry later: %s" % str(e)
                raise EC2InstanceLimitWaitException(errmsg)
            elif behavior_on_capacity_limit == 'retry_without_spot':
                if not spot_instance:
                    errmsg = "'behavior_on_capacity_limit': 'retry_without_spot' works only with "
                    errmsg += "'spot_instance': true. %s" % str(e)
                    raise Exception(errmsg)
                del launch_args['InstanceMarketOptions']
                try:
                    res = ec2.run_instances(**launch_args)
                    printlog("trying without spot : %s" % str(res))
                except Exception as e2:
                    errmsg = "Instance limit exception without spot instance %s" % str(e2)
                    raise EC2InstanceLimitException(errmsg)
        else:
            raise Exception("failed to launch instance for job {jobid}: {log}. %s".format(
                jobid=jobid, log=res) % e)
    try:
        instance_id = res['Instances'][0]['InstanceId']
    except Exception as e:
        raise Exception("failed to retrieve instance ID for job {jobid}".format(jobid=jobid))
    return instance_id
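# Illustrative sketch (assumption, not from the original source): the minimal shape of
# launch_args that launch_and_get_instance_id() passes through to ec2.run_instances().
# The AMI id and instance type are placeholders; only the key names mirror the boto3 API.
def _example_launch_args():
    return {
        'ImageId': 'ami-0123456789abcdef0',  # hypothetical Tibanna AMI
        'InstanceType': 't3.medium',
        'MinCount': 1,
        'MaxCount': 1,
        # when spot_instance is true, the function adds 'InstanceMarketOptions' itself, e.g.
        # {'MarketType': 'spot', 'SpotOptions': {'SpotInstanceType': 'one-time',
        #                                        'InstanceInterruptionBehavior': 'terminate'}}
    }

# usage sketch:
# instance_id = launch_and_get_instance_id(_example_launch_args(), jobid='abc123',
#                                          behavior_on_capacity_limit='wait_and_retry')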
def read_s3(bucket, object_name):
    response = boto3.client('s3').get_object(Bucket=bucket, Key=object_name)
    printlog(str(response))
    return response['Body'].read()
def handler(event, context):
    '''
    somewhere in the event data should be a jobid
    '''
    # s3 bucket that stores the output
    bucket_name = event['config']['log_bucket']
    # info about the job
    jobid = event['jobid']
    job_started = "%s.job_started" % jobid
    job_success = "%s.success" % jobid
    job_error = "%s.error" % jobid
    # check that this job has started; otherwise fail
    if not does_key_exist(bucket_name, job_started):
        raise EC2StartingException("Failed to find jobid %s, ec2 is probably still booting" % jobid)
    # check to see if the job has an error; report if so
    if does_key_exist(bucket_name, job_error):
        handle_postrun_json(bucket_name, jobid, event, False)
        raise AWSEMJobErrorException(
            "Job encountered an error: check log using invoke log --job-id=%s" % jobid)
    # check to see if the job has completed
    if does_key_exist(bucket_name, job_success):
        handle_postrun_json(bucket_name, jobid, event)
        print("completed successfully")
        return event
    # checking whether the instance was terminated for no reason
    instance_id = event.get('instance_id', '')
    if instance_id:  # skip this test by not giving instance_id in the event
        try:
            res = boto3.client('ec2').describe_instances(InstanceIds=[instance_id])
        except Exception as e:
            if 'InvalidInstanceID.NotFound' in str(e):
                raise EC2UnintendedTerminationException(
                    "EC2 is no longer found for job %s - please rerun." % jobid)
            else:
                raise e
        if not res['Reservations']:
            raise EC2UnintendedTerminationException(
                "EC2 is no longer found for job %s - please rerun." % jobid)
        else:
            ec2_state = res['Reservations'][0]['Instances'][0]['State']['Name']
            if ec2_state in ['stopped', 'shutting-down', 'terminated']:
                errmsg = "EC2 is terminated unintendedly for job %s - please rerun." % jobid
                printlog(errmsg)
                raise EC2UnintendedTerminationException(errmsg)
        # check CPU utilization for the past hour
        filesystem = '/dev/nvme1n1'  # doesn't matter for cpu utilization
        end = datetime.now(tzutc())
        start = end - timedelta(hours=1)
        jobstart_time = boto3.client('s3').get_object(
            Bucket=bucket_name, Key=job_started).get('LastModified')
        if jobstart_time + timedelta(hours=1) < end:
            cw_res = TibannaResource(instance_id, filesystem, start, end).as_dict()
            if 'max_cpu_utilization_percent' in cw_res:
                if not cw_res['max_cpu_utilization_percent'] \
                        or cw_res['max_cpu_utilization_percent'] < 1.0:
                    # the instance wasn't terminated -
                    # otherwise it would have been captured in the previous error
                    try:
                        boto3.client('ec2').terminate_instances(InstanceIds=[instance_id])
                    except Exception as e:
                        errmsg = ("Nothing has been running for the past hour for job %s, "
                                  "but cannot terminate the instance (cpu utilization (%s) : %s"
                                  % (jobid, str(cw_res['max_cpu_utilization_percent']), str(e)))
                        printlog(errmsg)
                        raise EC2IdleException(errmsg)
    # if none of the above
    raise StillRunningException("job %s still running" % jobid)
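# Illustrative sketch (assumption, not from the original source): the minimal event this
# status-check handler reads - a log bucket, the job id, and optionally the instance id.
# Values are placeholders.
_example_check_event = {
    "config": {"log_bucket": "tibanna-output"},
    "jobid": "abc123",
    "instance_id": "i-0123456789abcdef0",  # omit to skip the termination/idle checks
}
# handler(_example_check_event, None) raises StillRunningException until
# "<jobid>.success" or "<jobid>.error" appears in the log bucket.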
def update_pfmeta_from_awsemfile(awsemfile, pf_meta, tibanna):
    status = awsemfile.status
    printlog("awsemfile res is %s" % status)
    if status == 'COMPLETED':
        if awsemfile.argument_type == 'Output processed file':
            update_processed_file(awsemfile, pf_meta, tibanna)
def real_handler(event, context):
    '''
    this is a generic function to run an awsem workflow based on the data passed in

    workflow_uuid : for now, pass this on. Later we can add code to automatically retrieve
    this from app_name. Note that multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have a different uuid)
    '''
    # keep the input json on s3
    logbucket = event.get('config', {}).get('log_bucket', '')
    jobid = event.get('jobid', '')
    if logbucket and jobid:
        boto3.client('s3').put_object(Body=json.dumps(event, indent=4).encode('ascii'),
                                      Key=jobid + '.input.json',
                                      Bucket=logbucket)
    # get incoming data
    input_file_list = event.get('input_files')
    for infile in input_file_list:
        if not infile:
            raise Exception("malformed input, check your input_files")
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    parameters = ff_utils.convert_param(event.get('parameters'), True)
    tibanna_settings = event.get('_tibanna', {})
    if 'overwrite_input_extra' in event.get('config'):
        overwrite_input_extra = event.get('config')['overwrite_input_extra']
    else:
        overwrite_input_extra = event.get('overwrite_input_extra', False)
    tag = event.get('tag')
    # if they don't pass in env, guess it from output_bucket
    try:
        env = tibanna_settings.get('env', '-'.join(output_bucket.split('-')[1:-1]))
        printlog("Tibanna setting : env= " + env)
        # tibanna provides access to keys based on env and stuff like that
        tibanna = Tibanna(env, ff_keys=event.get('ff_keys'), settings=tibanna_settings)
        printlog("Tibanna ff_keys url : " + tibanna.ff_keys['server'])
        printlog("Tibanna.s3.url: " + tibanna.s3.url)
    except Exception as e:
        raise TibannaStartException("%s" % e)
    args = dict()
    # get argument format & type info from workflow
    wf_meta = ff_utils.get_metadata(workflow_uuid,
                                    key=tibanna.ff_keys,
                                    ff_env=tibanna.env,
                                    add_on='frame=object')
    printlog("workflow info %s" % wf_meta)
    if 'error' in wf_meta.get('@type', []):
        raise Exception("FATAL, can't lookup workflow info for %s fourfront" % workflow_uuid)
    # get cwl info from wf_meta
    for k in ['app_name', 'app_version', 'cwl_directory_url', 'cwl_main_filename',
              'cwl_child_filenames', 'wdl_directory_url', 'wdl_main_filename',
              'wdl_child_filenames']:
        printlog(wf_meta.get(k))
        args[k] = wf_meta.get(k, '')
    if not args['cwl_child_filenames']:
        args['cwl_child_filenames'] = []
    if not args['wdl_child_filenames']:
        args['wdl_child_filenames'] = []
    if 'workflow_language' in wf_meta and wf_meta['workflow_language'] == 'WDL':
        args['language'] = 'wdl'
    else:
        # switch to v1 if available
        if 'cwl_directory_url_v1' in wf_meta:  # use CWL v1
            args['cwl_directory_url'] = wf_meta['cwl_directory_url_v1']
            args['cwl_version'] = 'v1'
        else:
            args['cwl_version'] = 'draft3'
    # input file args for awsem
    for input_file in input_file_list:
        process_input_file_info(input_file, tibanna.ff_keys, tibanna.env, args)
    # create the ff_meta output info
    input_files_for_ffmeta = create_ffmeta_input_files_from_pony_input_file_list(input_file_list)
    # source experiments
    input_file_uuids = [_['uuid'] for _ in input_file_list]
    pf_source_experiments = merge_source_experiments(input_file_uuids, tibanna.ff_keys, tibanna.env)
    # processed file metadata
    output_files, pf_meta = \
        create_wfr_output_files_and_processed_files(
            wf_meta, tibanna, pf_source_experiments,
            custom_fields=event.get('custom_pf_fields'),
            user_supplied_output_files=event.get('output_files'))
    print("output files= %s" % str(output_files))
    # 4DN dcic award and lab are used here, unless provided in wfr_meta
    ff_meta = create_ffmeta_awsem(workflow_uuid, args['app_name'], args['app_version'],
                                  input_files_for_ffmeta,
                                  tag=tag,
                                  run_url=tibanna.settings.get('url', ''),
                                  output_files=output_files,
                                  parameters=parameters,
                                  extra_meta=event.get('wfr_meta'),
                                  jobid=jobid)
    printlog("ff_meta is %s" % ff_meta.__dict__)
    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)
    # parameters
    args['input_parameters'] = event.get('parameters')
    # output target
    args['output_target'] = dict()
    args['secondary_output_target'] = dict()
    for of in ff_meta.output_files:
        arg_name = of.get('workflow_argument_name')
        if of.get('type') == 'Output processed file':
            args['output_target'][arg_name] = of.get('upload_key')
        elif of.get('type') == 'Output to-be-extra-input file':
            target_inf = input_files_for_ffmeta[0]  # assume only one input for now
            target_key = output_target_for_input_extra(target_inf, of, tibanna,
                                                       overwrite_input_extra)
            args['output_target'][arg_name] = target_key
        else:
            random_tag = str(int(random.random() * 1000000000000))
            # add a random tag at the end for non-processed files (e.g. md5 report),
            # so that if two or more wfrs are triggered (e.g. one with the parent file,
            # one with an extra file) they will create different outputs.
            # Not implemented for processed files - it's tricky because processed files
            # must have a specific name.
            args['output_target'][arg_name] = ff_meta.uuid + '/' + arg_name + random_tag
        if 'secondary_file_formats' in of and 'extra_files' in of and of['extra_files']:
            for ext in of.get('extra_files'):
                if arg_name not in args['secondary_output_target']:
                    args['secondary_output_target'][arg_name] = [ext.get('upload_key')]
                else:
                    args['secondary_output_target'][arg_name].append(ext.get('upload_key'))
    # output bucket
    args['output_S3_bucket'] = event.get('output_bucket')
    # dependencies
    if 'dependency' in event:
        args['dependency'] = event['dependency']
    # initialize config parameters as null for benchmarking
    config = event['config']
    if 'instance_type' not in config:
        config['instance_type'] = ''
    if 'EBS_optimized' not in config:
        config['EBS_optimized'] = ''
    if 'ebs_size' not in config:
        config['ebs_size'] = 0
    if 'public_postrun_json' not in config:
        config['public_postrun_json'] = True
    event.update({"ff_meta": ff_meta.as_dict(),
                  'pf_meta': [meta.as_dict() for meta in pf_meta],
                  "_tibanna": tibanna.as_dict(),
                  "args": args})
    return event
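# Illustrative sketch (assumption, not from the original source): a minimal input event
# for the starter handler above. The uuids, bucket names and parameter values are
# placeholders; only the key names mirror what real_handler() reads.
_example_run_event = {
    "workflow_uuid": "00000000-0000-0000-0000-000000000000",
    "output_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput",
    "input_files": [{
        "workflow_argument_name": "input_pairs",
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "uuid": "d2c897ec-bdb2-47ce-b1b1-845daccaa571",
        "object_key": "4DNFI25JXLLI.pairs.gz"
    }],
    "parameters": {"nthreads": 4},  # hypothetical workflow parameter
    "config": {"log_bucket": "tibanna-output"},
    "_tibanna": {"env": "fourfront-webdev", "settings": {}},
    "jobid": "abc123"
}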
def _qc_updater(status, awsemfile, ff_meta, tibanna,
                quality_metric='quality_metric_fastqc',
                file_argument='input_fastq',
                report_html=None, datafiles=None,
                zipped=True, datajson_argument=None,
                other_fields=None):
    if datajson_argument == awsemfile.argument_name:
        return
    # avoid using [] as a default argument
    if datafiles is None:
        datafiles = ['summary.txt', 'fastqc_data.txt']
    if status == 'uploading':
        # wait until this bad boy is finished
        return
    # keys
    ff_key = tibanna.ff_keys
    # move files to proper s3 location
    # need to remove sbg from this line
    accession = awsemfile.runner.get_file_accessions(file_argument)[0]
    zipped_report = awsemfile.key
    files_to_parse = datafiles
    if report_html:
        files_to_parse.append(report_html)
    printlog("accession is %s" % accession)
    jsondata = dict()
    if zipped:
        try:
            files = awsemfile.s3.unzip_s3_to_s3(zipped_report, accession, files_to_parse,
                                                acl='public-read')
        except Exception as e:
            printlog(tibanna.s3.__dict__)
            raise Exception("%s (key={})\n".format(zipped_report) % e)
        printlog("files : %s" % str(files))
        filedata = [files[_]['data'] for _ in datafiles]
    else:
        if datajson_argument:
            datajson_key = awsemfile.runner.get_file_key(datajson_argument)
            jsondata0 = [json.loads(awsemfile.s3.read_s3(_)) for _ in datajson_key]
            for d in jsondata0:
                jsondata.update(d)
        filedata = [awsemfile.s3.read_s3(_) for _ in datafiles]
        reportdata = awsemfile.s3.read_s3(report_html)
        report_html = accession + 'qc_report.html'
        awsemfile.s3.s3_put(reportdata, report_html, acl='public-read')
        qc_url = 'https://s3.amazonaws.com/' + awsemfile.bucket + '/' + report_html
        files = {report_html: {'data': reportdata, 's3key': qc_url}}
    # schema. do not need to check_queue
    qc_schema = ff_utils.get_metadata("profiles/" + quality_metric + ".json",
                                      key=ff_key, ff_env=tibanna.env)
    # parse fastqc metadata
    if report_html in files:
        qc_url = files[report_html]['s3key']
    else:
        qc_url = None
    meta = parse_qc_table(filedata, qc_schema=qc_schema.get('properties'), url=qc_url)
    if jsondata:
        meta.update(jsondata)
    # custom fields
    if other_fields:
        for field in other_fields:
            meta.update(other_fields)
    printlog("qc meta is %s" % meta)
    # post fastq metadata
    qc_meta = ff_utils.post_metadata(meta, quality_metric, key=ff_key)
    if qc_meta.get('@graph'):
        qc_meta = qc_meta['@graph'][0]
    printlog("qc_meta is %s" % qc_meta)
    # update original file as well
    try:
        original_file = ff_utils.get_metadata(accession,
                                              key=ff_key,
                                              ff_env=tibanna.env,
                                              add_on='frame=object',
                                              check_queue=True)
        printlog("original_file is %s" % original_file)
    except Exception as e:
        raise Exception("Couldn't get metadata for accession {} : ".format(accession) + str(e))
    patch_file = {'quality_metric': qc_meta['@id']}
    try:
        ff_utils.patch_metadata(patch_file, original_file['uuid'], key=ff_key)
    except Exception as e:
        raise Exception("patch_metadata failed in fastqc_updater." + str(e) +
                        "original_file ={}\n".format(str(original_file)))
    # patch the workflow run; value_qc is used to make drawing graphs easier
    output_files = ff_meta.output_files
    output_files[0]['value_qc'] = qc_meta['@id']
    retval = {'output_files': output_files}
    printlog("retval is %s" % retval)
    return retval