def get_metadata(keypairs_file, schema_name=None, schema_class_name=None, uuid=None):
    """Fetch metadata from the portal using a keypairs file.

    The first non-None selector among schema_name / schema_class_name / uuid
    (checked in that order) decides what is fetched.

    :param keypairs_file: path to the access-key JSON file (the "default" entry is used)
    :param schema_name: endpoint/schema path, passed straight to get_FDN
    :param schema_class_name: item type, queried via "search/?type=<name>"
    :param uuid: item uuid, fetched directly
    :returns: the portal response dict, or None if no selector was given
    :raises ValueError: if keypairs_file does not exist
    """
    # explicit check instead of `assert`: asserts are stripped under `python -O`
    if not os.path.isfile(str(keypairs_file)):
        raise ValueError("keypairs file not found: %s" % keypairs_file)
    try:
        key = fdnDCIC.FDN_Key(keypairs_file, "default")
    except Exception as e:
        print(e)
        print("key error")
        raise e
    try:
        connection = fdnDCIC.FDN_Connection(key)
    except Exception as e:
        print(e)
        print("connection error")
        raise e
    try:
        if schema_name is not None:
            return fdnDCIC.get_FDN(schema_name, connection)
        if schema_class_name is not None:
            return fdnDCIC.get_FDN("search/?type=" + schema_class_name, connection)
        if uuid is not None:
            return fdnDCIC.get_FDN(uuid, connection)
    except Exception as e:
        print(e)
        print("get error")
        raise e
def get_species_from_expr(expr, connection):
    """get species for a given experiment"""
    exp_meta = expr if isinstance(expr, dict) else fdnDCIC.get_FDN(expr, connection)
    # experiment -> biosample -> first biosource -> individual -> organism
    biosources = fdnDCIC.get_FDN(exp_meta["biosample"], connection)["biosource"]
    individual = fdnDCIC.get_FDN(biosources[0], connection)["individual"]
    organism = fdnDCIC.get_FDN(individual, connection)['organism']
    return str(organism)
def get_metadata(obj_id, key='', connection=None, frame="object"):
    """Fetch an item from the portal, retrying on transient error responses.

    Retries while the response's @type contains 'error', sleeping through the
    full back-off schedule (2s, 4s, 12s).

    NOTE(review): the original initialized `retry = 1`, so the first back-off
    value (2s) was never used and only two retries ever happened; fixed to
    walk the whole schedule.

    :param obj_id: item id/uuid passed to get_FDN
    :param key: access key, forwarded to fdn_connection
    :param connection: existing connection, forwarded to fdn_connection
    :param frame: portal frame to request (default "object")
    :returns: the (possibly still erroneous) response dict
    """
    connection = fdn_connection(key, connection)
    res = fdnDCIC.get_FDN(obj_id, connection, frame=frame)
    backoff = [2, 4, 12]
    retry = 0
    while 'error' in res.get('@type', []) and retry < len(backoff):
        time.sleep(backoff[retry])
        retry += 1
        res = fdnDCIC.get_FDN(obj_id, connection, frame=frame)
    return res
def patch_to_metadata(keypairs_file, patch_item, schema_class_name=None, accession=None, uuid=None):
    """Patch item(s) on the portal with the given patch body.

    Targets are selected by the first non-None of schema_class_name (patches
    every item of that type), accession, or uuid.

    :param keypairs_file: path to the access-key JSON file ("default" entry)
    :param patch_item: dict of fields to patch
    :returns: the response of the last patch performed, or None if nothing matched
    :raises ValueError: if keypairs_file does not exist
    """
    # explicit check instead of `assert`: asserts are stripped under `python -O`
    if not os.path.isfile(keypairs_file):
        raise ValueError("keypairs file not found: %s" % keypairs_file)
    try:
        key = fdnDCIC.FDN_Key(keypairs_file, "default")
    except Exception as e:
        print(e)
        print("key error")
        raise e
    try:
        connection = fdnDCIC.FDN_Connection(key)
    except Exception as e:
        print(e)
        print("connection error")
        raise e
    try:
        if schema_class_name is not None:
            resp = fdnDCIC.get_FDN("/search/?type=" + schema_class_name, connection)
            items_uuids = [i['uuid'] for i in resp['@graph']]
        elif accession is not None:
            resp = fdnDCIC.get_FDN("/" + accession, connection)
            items_uuids = [resp.get('uuid')]
        elif uuid is not None:
            items_uuids = [uuid]
        else:
            items_uuids = []
    except Exception as e:
        print(e)
        print("get error")
        raise e
    # patch every matched item; `response` starts as None so an empty match
    # list no longer leaves the name unbound (original bug), and the error
    # message now says "patch error" rather than a copy-pasted "get error"
    response = None
    try:
        for item_uuid in items_uuids:
            response = fdnDCIC.patch_FDN(item_uuid, connection, patch_item)
        return response
    except Exception as e:
        print(e)
        print("patch error")
        raise e
def delete_field(post_json, del_field, connection=None):
    """Does a put to delete the given field.

    Fetches the item's raw frame, removes del_field from it, and PUTs the
    result back (a PUT drops any field not present in the body).

    :param post_json: item metadata containing at least 'uuid' (and optionally 'accession')
    :param del_field: name of the field to remove
    :param connection: portal connection used for get/put
    :returns: the put response dict
    :raises Exception: if the put fails or reports an error status
    """
    my_uuid = post_json.get("uuid")
    # BUG FIX: original read post_json.get("accesion") (typo), so the
    # accession was never carried into the raw frame
    my_accession = post_json.get("accession")
    raw_json = fdnDCIC.get_FDN(my_uuid, connection, frame="raw")
    # make sure identifying fields survive the PUT so they are not re-minted
    if not raw_json.get("uuid"):
        raw_json["uuid"] = my_uuid
    # if there is an accession, add it to raw so it does not get created again
    if my_accession:
        if not raw_json.get("accession"):
            raw_json["accession"] = my_accession
    # remove the field; BUG FIX: membership test instead of truthiness, so
    # fields holding falsy values (0, "", [], False) are also deleted
    if del_field in raw_json:
        del raw_json[del_field]
    # Do the put with raw_json
    try:
        response = fdnDCIC.put_FDN(my_uuid, connection, raw_json)
        if response.get('status') == 'error':
            raise Exception("error %s \n unable to delete field: %s \n of item: %s"
                            % (response, del_field, my_uuid))
    except Exception as e:
        raise Exception("error %s \n unable to delete field: %s \n of item: %s"
                        % (e, del_field, my_uuid))
    return response
def delete_wfr_many(wf_uuid, keypairs_file, run_status_filter=('error',),
                    input_source_experiment_filter=None, delete=True):
    """delete the wfr metadata for all wfr with a specific wf

    if run_status_filter is set, only those with the specific run_status are deleted
    run_status_filter : sequence of run_statuses e.g. ['started', 'error']
    if run_status_filter is None, it deletes everything
    if input_source_experiment_filter is set (an array, e.g.
    ['some_uuid', 'some_other_uuid', ...]), only wfr whose input source
    experiment is one of these specified are deleted.
    if delete is False, candidates are only printed, not deleted.

    NOTE(review): the default used to be the mutable list ['error']; an
    immutable tuple avoids the shared-mutable-default pitfall and behaves
    identically for membership tests.
    """
    connection = get_connection(keypairs_file)
    wfrsearch_resp = fdnDCIC.get_FDN('search/?workflow.uuid=' + wf_uuid + '&type=WorkflowRun',
                                     connection)
    for entry in wfrsearch_resp['@graph']:
        # skip entries that are already deleted
        if entry['status'] == 'deleted':
            continue
        # run_status filter
        if run_status_filter:
            if 'run_status' not in entry or entry['run_status'] not in run_status_filter:
                continue
        # input_source_experiment_filter
        if input_source_experiment_filter:
            sexp = get_wfr_input_source_experiment(entry, connection)
            if not set(sexp).intersection(input_source_experiment_filter):
                continue
        print('\n\ntobedeleted: ' + entry['uuid'] + ':' + str(entry))
        if delete:
            delete_wfr(entry, connection)
def testrun_md5(keypairs_file, workflow_name='tibanna_pony', env='webdev'):
    """Creates a random file object with no md5sum/content_md5sum and runs the
    md5 workflow. Waits 6 minutes for the workflow run to finish, then checks
    that the input file object has been updated with both checksums.
    """
    bucket = "elasticbeanstalk-fourfront-" + env + "-wfoutput"
    newfile = post_random_file(bucket, keypairs_file)
    file_uuid = newfile['uuid']
    file_accession = newfile['accession']
    run_config = {
        "ebs_type": "io1",
        "ebs_iops": 500,
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-cfb14bb5",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "shutdown_min": 30,
        "copy_to_s3": True,
        "launch_instance": True,
        "log_bucket": "tibanna-output",
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "key_name": "4dn-encode",
        "password": "",
    }
    input_json = {
        "config": run_config,
        "_tibanna": {
            "env": "fourfront-webdev",
            "run_type": "md5",
        },
        "parameters": {},
        "app_name": "md5",
        "workflow_uuid": "c77a117b-9a58-477e-aaa5-291a109a99f6",
        "input_files": [{
            "workflow_argument_name": "input_file",
            "bucket_name": bucket,
            "uuid": file_uuid,
            "object_key": file_accession + '.pairs.gz',
        }],
        "output_bucket": bucket,
    }
    resp = run_workflow(input_json, workflow=workflow_name)
    print(resp)
    # check the result on the portal after the run should have completed
    key = fdnDCIC.FDN_Key(keypairs_file, "default")
    connection = fdnDCIC.FDN_Connection(key)
    time.sleep(6 * 60)  # wait for 6 minutes
    filemeta = fdnDCIC.get_FDN(file_uuid, connection)
    content_md5sum = filemeta.get('content_md5sum')
    md5sum = filemeta.get('md5sum')
    if not (content_md5sum and md5sum):
        raise Exception('md5 step function run failed')
    print(content_md5sum)
    print(md5sum)
def get_datatype_for_expr(expr, connection):
    """get experiment type (e.g. 'in situ Hi-C') given an experiment id (or uuid)"""
    if isinstance(expr, dict):
        return expr['experiment_type']
    return fdnDCIC.get_FDN(expr, connection)['experiment_type']
def patch_to_metadata(keypairs_file, patch_item, schema_class_name=None, accession=None, uuid=None):
    """Patch item(s) on the portal with the given patch body.

    Targets are selected by the first non-None of schema_class_name (patches
    every item of that type), accession, or uuid.

    :param keypairs_file: path to the access-key JSON file ("default" entry)
    :param patch_item: dict of fields to patch
    :returns: the response of the last patch performed, or None if nothing matched
    :raises ValueError: if keypairs_file does not exist
    """
    # explicit check instead of `assert`: asserts are stripped under `python -O`
    if not os.path.isfile(keypairs_file):
        raise ValueError("keypairs file not found: %s" % keypairs_file)
    try:
        key = fdnDCIC.FDN_Key(keypairs_file, "default")
    except Exception as e:
        print(e)
        print("key error")
        raise e
    try:
        connection = fdnDCIC.FDN_Connection(key)
    except Exception as e:
        print(e)
        print("connection error")
        raise e
    try:
        if schema_class_name is not None:
            resp = fdnDCIC.get_FDN("/search/?type=" + schema_class_name, connection)
            items_uuids = [i['uuid'] for i in resp['@graph']]
        elif accession is not None:
            resp = fdnDCIC.get_FDN("/" + accession, connection)
            items_uuids = [resp.get('uuid')]
        elif uuid is not None:
            items_uuids = [uuid]
        else:
            items_uuids = []
    except Exception as e:
        print(e)
        print("get error")
        raise e
    # patch every matched item; `response` starts as None so an empty match
    # list no longer leaves the name unbound (original bug), and the error
    # message now says "patch error" rather than a copy-pasted "get error"
    response = None
    try:
        for item_uuid in items_uuids:
            response = fdnDCIC.patch_FDN(item_uuid, connection, patch_item)
        return response
    except Exception as e:
        print(e)
        print("patch error")
        raise e
def prep_input_file_entry_list_for_single_exp(input_argname, prev_workflow_uuid, prev_output_argument_name,
                                              connection, addon=None, wfuuid=None,
                                              datatype_filter=None, single=True):
    """Build the per-experiment input file entry list from the completed runs
    of a previous workflow, keyed by experiment."""
    # query completed runs of the previous workflow, straight from the database
    query = ('search/?type=WorkflowRunAwsem&workflow.uuid=' + prev_workflow_uuid
             + '&run_status=complete' + '&datastore=database')
    wfr_search = fdnDCIC.get_FDN(query, connection)
    return map_exp_to_inputfile_entry(wfr_search, input_argname, prev_output_argument_name,
                                      connection, addon=addon, wfuuid=wfuuid,
                                      datatype_filter=datatype_filter, single=single)
def release_all_wfr(keypairs_file,
                    searchterm='?run_status=complete&type=WorkflowRunAwsem&status=in+review+by+lab',
                    releaseterm='released to project'):
    """Patch every workflow run matched by searchterm to the status releaseterm."""
    connection = get_connection(keypairs_file)
    matched = fdnDCIC.get_FDN(searchterm, connection)
    for wfr in matched['@graph']:
        resp = fdnDCIC.patch_FDN(wfr['uuid'], connection,
                                 {'uuid': wfr['uuid'], 'status': releaseterm})
        print(resp)
def get_wfr_input_source_experiment(wfr_dict, connection):
    "returns all the input source experiments in a nonredundant list"
    if 'input_files' not in wfr_dict:
        return None
    source_exps = set()
    for input_entry in wfr_dict['input_files']:
        file_meta = fdnDCIC.get_FDN(input_entry['value'], connection)
        if 'source_experiments' in file_meta:
            source_exps.update(file_meta['source_experiments'])
    return list(source_exps)
def get_digestion_enzyme_for_expr(expr, connection):
    """get the digestion enzyme for a given experiment

    Returns enzyme name (e.g. HindIII), or None if the experiment has no
    digestion_enzyme field.
    """
    exp_meta = expr if isinstance(expr, dict) else fdnDCIC.get_FDN(expr, connection)
    if 'digestion_enzyme' not in exp_meta:
        return None
    # strip the '/enzymes/<name>/' @id down to the bare enzyme name
    enzyme_id = exp_meta['digestion_enzyme']
    return enzyme_id.replace('/enzymes/', '').replace('/', '')
def create_inputfile_entry(fileId, input_argname, connection, addon=None, wfr_input_filter=None, datatype_filter=None):
    """create an input file entry (uuid, accession, object_key)

    addon : list of strings (currently only 're' is available, to add
        restriction enzyme info)
    wfr_input_filter : workflow_uuid; return None if specified and the file
        already has a completed run of the specified workflow
    assumes file is a processed file (has source_experiments field)
    assumes single source_experiments
    """
    file_dict = fdnDCIC.get_FDN(fileId + '?datastore=database', connection)
    if 'uuid' not in file_dict:
        raise Exception("key error uuid: " + str(file_dict))
    file_uuid = file_dict['uuid']
    entry = {
        'uuid': file_uuid,
        'accession': file_dict['accession'],
        'object_key': file_dict['upload_key'].replace(file_uuid + '/', ''),
        'workflow_argument_name': input_argname,
    }
    # attach the (assumed single) source experiment and apply filters/addons
    source_exps = file_dict.get('source_experiments')
    if source_exps:
        sep_dict = fdnDCIC.get_FDN(source_exps[0], connection)
        entry['source_experiments'] = [sep_dict['@id']]
        if datatype_filter:
            # would be faster if it takes sep_dict. Leave it for now
            if get_datatype_for_expr(sep_dict, connection) not in datatype_filter:
                return None
        if addon and 're' in addon:
            entry['RE'] = get_digestion_enzyme_for_expr(sep_dict, connection)
    if wfr_input_filter:
        wfr_info = get_info_on_workflowrun_as_input(file_dict, connection)
        if 'complete' in wfr_info.get(wfr_input_filter, {}):
            return None
    return entry
def delete_wfr(wfr_dict, connection):
    """Mark a workflow run and all of its output files as deleted."""
    # output files go first
    for of_id in [f['value'] for f in wfr_dict.get('output_files', [])]:
        of_uuid = fdnDCIC.get_FDN(of_id, connection)['uuid']
        resp = fdnDCIC.patch_FDN(of_uuid, connection,
                                 {'uuid': of_uuid, 'status': 'deleted'})
        print(resp)
    # then the wfr itself
    wfr_uuid = wfr_dict['uuid']
    resp = fdnDCIC.patch_FDN(wfr_uuid, connection,
                             {'uuid': wfr_uuid, 'status': 'deleted'})
    print(resp)
def get_metadata(keypairs_file, schema_name=None, schema_class_name=None, uuid=None):
    """Fetch metadata from the portal using a keypairs file.

    The first non-None selector among schema_name / schema_class_name / uuid
    (checked in that order) decides what is fetched.

    :param keypairs_file: path to the access-key JSON file (the "default" entry is used)
    :param schema_name: endpoint/schema path, passed straight to get_FDN
    :param schema_class_name: item type, queried via "search/?type=<name>"
    :param uuid: item uuid, fetched directly
    :returns: the portal response dict, or None if no selector was given
    :raises ValueError: if keypairs_file does not exist
    """
    # explicit check instead of `assert`: asserts are stripped under `python -O`
    if not os.path.isfile(str(keypairs_file)):
        raise ValueError("keypairs file not found: %s" % keypairs_file)
    try:
        key = fdnDCIC.FDN_Key(keypairs_file, "default")
    except Exception as e:
        print(e)
        print("key error")
        raise e
    try:
        connection = fdnDCIC.FDN_Connection(key)
    except Exception as e:
        print(e)
        print("connection error")
        raise e
    try:
        if schema_name is not None:
            return fdnDCIC.get_FDN(schema_name, connection)
        if schema_class_name is not None:
            return fdnDCIC.get_FDN("search/?type=" + schema_class_name, connection)
        if uuid is not None:
            return fdnDCIC.get_FDN(uuid, connection)
    except Exception as e:
        print(e)
        print("get error")
        raise e
def get_info_on_workflowrun_as_input(file_dict, connection):
    """given a json for file, returns a dictionary with workflow uuids as keys.
    dictionary structure : dict{wf_uuid}{run_status} = [wfr_id1, wfr_id2, ... ]
    These workflow uuids are the ones in the workflow runs that have the given
    file as input
    """
    wfr_info = dict()
    for wfr in file_dict.get("workflow_run_inputs") or []:
        wfr_dict = fdnDCIC.get_FDN(wfr, connection)
        # strip the '/workflows/<uuid>/' @id down to the bare uuid
        wf = wfr_dict['workflow'].replace('/workflows/', '').replace('/', '')
        run_status = wfr_dict['run_status']
        wfr_info.setdefault(wf, {}).setdefault(run_status, []).append(wfr)
    return wfr_info
def get_expset_from_exp(expr, connection):
    """getting the experiment sets of an experiment"""
    exp_meta = fdnDCIC.get_FDN(expr, connection)
    return exp_meta['experiment_sets']
def get_metadata(obj_id, key='', connection=None):
    """Fetch an item from the portal (thin wrapper around get_FDN)."""
    conn = fdn_connection(key, connection)
    return fdnDCIC.get_FDN(obj_id, conn)
def get_nrawfiles_from_exp(expr, connection):
    """getting the number of raw files of an experiment"""
    exp_meta = fdnDCIC.get_FDN(expr, connection)
    return len(exp_meta['files'])
def get_allexp_from_expset(expset, connection):
    """getting all the experiments from an experiment set"""
    return fdnDCIC.get_FDN(expset, connection)['experiments_in_set']