# imports used throughout this module (assuming dcicutils and IPython are available)
import json
from dcicutils import ff_utils
from dcicutils.s3_utils import s3Utils
from IPython.display import display, HTML


def run_json(input_files, env, wf_info, run_name):
    """Create the trigger json that is posted to the fourfront endpoint."""
    my_s3_util = s3Utils(env=env)
    out_bucket = my_s3_util.outfile_bucket
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": wf_info['parameters'],
        "config": {
            "ebs_type": "gp2",
            "spot_instance": True,
            "json_bucket": "4dn-aws-pipeline-run-json",
            "ebs_iops": "",
            "shutdown_min": "now",
            "copy_to_s3": True,
            "launch_instance": True,
            "password": "",
            "log_bucket": "tibanna-output",
            "key_name": "4dn-encode"
        },
        "_tibanna": {
            "env": env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        }
    }
    # overwrite or add custom fields supplied in the workflow info
    for a_key in ['config', 'custom_pf_fields', 'overwrite_input_extra']:
        if a_key in wf_info:
            input_json[a_key] = wf_info[a_key]
    return input_json
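# Minimal usage sketch for run_json. All values below are hypothetical placeholders;
# real wf_info dictionaries, accessions and keys come from the portal metadata.
#
#     wf_info = {'wf_uuid': '<workflow-uuid>', 'wf_name': 'example-workflow',
#                'wfr_meta': {}, 'parameters': {}}
#     input_files = [{'workflow_argument_name': 'input_file',
#                     'object_key': '<ACCESSION>.fastq.gz',
#                     'bucket_name': '<raw-file-bucket>',
#                     'uuid': '<file-uuid>'}]
#     trigger_json = run_json(input_files, 'data', wf_info, 'example-workflow_run1')
#     # the trigger json is then posted the same way the run_missing_* helpers below do:
#     # ff_utils.post_metadata(trigger_json, 'WorkflowRun/run', key=my_key)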
def extract_file_info(obj_id, arg_name, env, rename=None):
    """Create the formatted input-file dictionary for a workflow argument."""
    auth = ff_utils.get_authentication_with_server({}, ff_env=env)
    my_s3_util = s3Utils(env=env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket
    # start a dictionary
    template = {"workflow_argument_name": arg_name}
    if rename:
        change_from = rename[0]
        change_to = rename[1]
    # if it is a list of items, change the structure
    if isinstance(obj_id, list):
        object_key = []
        uuid = []
        buckets = []
        for obj in obj_id:
            metadata = ff_utils.get_metadata(obj, key=auth)
            object_key.append(metadata['display_title'])
            uuid.append(metadata['uuid'])
            # get the bucket
            if 'FileProcessed' in metadata['@type']:
                my_bucket = out_bucket
            else:  # covers cases of FileFastq, FileReference, FileMicroscopy
                my_bucket = raw_bucket
            buckets.append(my_bucket)
        # check bucket consistency
        if len(set(buckets)) != 1:
            print('Files from different buckets', obj_id)
            return
        template['object_key'] = object_key
        template['uuid'] = uuid
        template['bucket_name'] = buckets[0]
        if rename:
            template['rename'] = [i.replace(change_from, change_to)
                                  for i in template['object_key']]
    # if obj_id is a string
    else:
        metadata = ff_utils.get_metadata(obj_id, key=auth)
        template['object_key'] = metadata['display_title']
        template['uuid'] = metadata['uuid']
        # get the bucket
        if 'FileProcessed' in metadata['@type']:
            my_bucket = out_bucket
        else:  # covers cases of FileFastq, FileReference, FileMicroscopy
            my_bucket = raw_bucket
        template['bucket_name'] = my_bucket
        if rename:
            template['rename'] = template['object_key'].replace(change_from, change_to)
    return template
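# Usage sketch for extract_file_info (accessions and environment are hypothetical
# placeholders):
#
#     # single file, posted under one workflow argument, with a rename rule
#     ta_f = extract_file_info('<file-accession>', 'chip.tas', 'data',
#                              rename=['bed', 'tagAlign'])
#     # list of files: object_key/uuid become lists and all files must share one bucket
#     fastqs = extract_file_info(['<acc1>', '<acc2>'], 'atac.fastqs', 'data')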
def run_missing_atac1(wf_info, organism, paired, files, obj_keys, my_env, my_key, run_name):
    """Build the trigger json for the atac-seq step 1 (alignment-only) run and post it
    to the fourfront endpoint."""
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket
    if organism == "human":
        org = 'hs'
        input_files = [
            {"object_key": "4DNFIMQPTYDY.bowtie2Index.tar",
             "rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.bowtie2_idx_tar",
             "uuid": "28ab6265-f426-4a23-bb8a-f28467ad505b"},
            {"object_key": "4DNFIZ1TGJZR.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.blacklist",
             "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"},
            {"object_key": "4DNFIZJB62D1.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.chrsz",
             "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"}
        ]
    elif organism == "mouse":
        org = 'mm'
        input_files = [
            {"object_key": "4DNFI2493SDN.bowtie2Index.tar",
             "rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.bowtie2_idx_tar",
             "uuid": "63e22058-79c6-4e24-8231-ca4afac29dda"},
            {"object_key": "4DNFIZ3FBPK8.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.blacklist",
             "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"},
            {"object_key": "4DNFIBP173GC.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.chrsz",
             "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"}
        ]
    else:
        raise ValueError('organism must be "human" or "mouse", got {}'.format(organism))
    input_files.append({
        "object_key": obj_keys,
        "bucket_name": raw_bucket,
        "workflow_argument_name": "atac.fastqs",
        "uuid": files
    })
    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    else:
        raise ValueError('paired must be "single" or "paired", got {}'.format(paired))
    parameters = {
        "atac.pipeline_type": 'atac',
        "atac.paired_end": chip_p,
        "atac.gensz": org,
        "atac.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
        "atac.disable_ataqc": True,
        "atac.enable_xcor": False,
        "atac.trim_adapter.auto_detect_adapter": True,
        "atac.bowtie2.cpu": 4,
        "atac.filter.cpu": 4,
        "atac.bam2ta.cpu": 4,
        "atac.trim_adapter.cpu": 4,
        "atac.align_only": True
    }
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(files)
        parameters['atac.fraglen'] = fraglist
    tag = '1.1.1'
    # create the trigger json that is posted to the fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))
def run_missing_chip2(control_set, wf_info, organism, target_type, paired, ta,
                      ta_xcor, ta_cnt, my_env, my_key, run_ids):
    """Build the trigger json for the chip-seq step 2 run (on tagAlign files) and post
    it to the fourfront endpoint."""
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket
    if organism == "human":
        org = 'hs'
        input_files = [
            {"object_key": "4DNFIZ1TGJZR.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.blacklist",
             "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"},
            {"object_key": "4DNFIZJB62D1.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.chrsz",
             "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"}
        ]
    elif organism == "mouse":
        org = 'mm'
        input_files = [
            {"object_key": "4DNFIZ3FBPK8.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.blacklist",
             "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"},
            {"object_key": "4DNFIBP173GC.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.chrsz",
             "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"}
        ]
    else:
        raise ValueError('organism must be "human" or "mouse", got {}'.format(organism))
    ta_f = extract_file_info(ta, 'chip.tas', my_env, rename=['bed', 'tagAlign'])
    input_files.append(ta_f)
    ta_xcor_f = extract_file_info(ta_xcor, 'chip.bam2ta_no_filt_R1.ta', my_env,
                                  rename=['bed', 'tagAlign'])
    input_files.append(ta_xcor_f)
    if control_set:
        ta_cnt = extract_file_info(ta_cnt, 'chip.ctl_tas', my_env, rename=['bed', 'tagAlign'])
        input_files.append(ta_cnt)
    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    else:
        raise ValueError('paired must be "single" or "paired", got {}'.format(paired))
    if not control_set:
        if target_type == 'histone':
            print('HISTONE WITHOUT CONTROL NEEDS ATTENTION (change to tf), skipping for now')
            return
    parameters = {
        "chip.pipeline_type": target_type,
        "chip.paired_end": chip_p,
        "chip.choose_ctl.always_use_pooled_ctl": True,
        "chip.qc_report.name": run_ids['run_name'],
        "chip.qc_report.desc": run_ids['desc'],
        "chip.gensz": org,
        "chip.xcor.cpu": 4,
        "chip.spp_cpu": 4
    }
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(ta)
        parameters['chip.fraglen'] = fraglist
    tag = '1.1.1'
    # create the trigger json that is posted to the fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_ids['run_name']
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))
def run_missing_chip1(control, wf_info, organism, target_type, paired, files,
                      obj_keys, my_env, my_key, run_name):
    """Build the trigger json for the chip-seq step 1 (alignment-only) run, for either
    control or target fastqs, and post it to the fourfront endpoint."""
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket
    if organism == "human":
        org = 'hs'
        input_files = [
            {"object_key": "4DNFIZQB369V.bwaIndex.tar",
             "rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.bwa_idx_tar",
             "uuid": "38077b98-3862-45cd-b4be-8e28e9494549"},
            {"object_key": "4DNFIZ1TGJZR.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.blacklist",
             "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"},
            {"object_key": "4DNFIZJB62D1.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.chrsz",
             "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"}
        ]
    elif organism == "mouse":
        org = 'mm'
        input_files = [
            {"object_key": "4DNFIZ2PWCC2.bwaIndex.tar",
             "rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.bwa_idx_tar",
             "uuid": "f4b63d31-65d8-437f-a76a-6bedbb52ae6f"},
            {"object_key": "4DNFIZ3FBPK8.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.blacklist",
             "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"},
            {"object_key": "4DNFIBP173GC.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "chip.chrsz",
             "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"}
        ]
    else:
        raise ValueError('organism must be "human" or "mouse", got {}'.format(organism))
    if control:
        input_files.append({
            "object_key": obj_keys,
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.ctl_fastqs",
            "uuid": files
        })
    else:
        input_files.append({
            "object_key": obj_keys,
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.fastqs",
            "uuid": files
        })
    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    else:
        raise ValueError('paired must be "single" or "paired", got {}'.format(paired))
    if control:
        parameters = {
            "chip.pipeline_type": target_type,
            "chip.paired_end": chip_p,
            "chip.choose_ctl.always_use_pooled_ctl": True,
            "chip.gensz": org,
            "chip.bam2ta_ctl.regex_grep_v_ta": "chr[MUE]|random|alt",
            "chip.bwa_ctl.cpu": 8,
            "chip.merge_fastq_ctl.cpu": 8,
            "chip.filter_ctl.cpu": 8,
            "chip.bam2ta_ctl.cpu": 8,
            "chip.align_only": True
        }
    else:
        parameters = {
            "chip.pipeline_type": target_type,
            "chip.paired_end": chip_p,
            "chip.choose_ctl.always_use_pooled_ctl": True,
            "chip.gensz": org,
            "chip.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
            "chip.bwa.cpu": 8,
            "chip.merge_fastq.cpu": 8,
            "chip.filter.cpu": 8,
            "chip.bam2ta.cpu": 8,
            "chip.xcor.cpu": 8,
            "chip.align_only": True
        }
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(files)
        parameters['chip.fraglen'] = fraglist
    tag = '1.1.1'
    # create the trigger json that is posted to the fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))
def find_pairs(my_rep_set, my_env, lookfor='pairs', exclude_miseq=True):
    """Find fastq files from an experiment set, excluding miseq files, and return them
    together with organism, enzyme and reference file information."""
    auth = ff_utils.get_authentication_with_server({}, ff_env=my_env)
    my_s3_util = s3Utils(env=my_env)
    report = {}
    rep_resp = my_rep_set['experiments_in_set']
    lab = [my_rep_set['lab']['@id']]
    enzymes = []
    organisms = []
    total_f_size = 0
    for exp in rep_resp:
        exp_resp = exp
        report[exp['accession']] = []
        if not organisms:
            biosample = exp['biosample']
            organisms = list(set([bs['individual']['organism']['name']
                                  for bs in biosample['biosource']]))
            if len(organisms) != 1:
                print('multiple organisms in set', my_rep_set['accession'])
                break
        exp_files = exp['files']
        enzyme = exp.get('digestion_enzyme')
        if enzyme:
            enzymes.append(enzyme['display_title'])
        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file['uuid'], key=auth)
            if not file_resp.get('file_size'):
                print("WARNING!", file_resp['accession'], 'does not have filesize')
            else:
                total_f_size += file_resp['file_size']
            # skip pair no 2
            if file_resp.get('paired_end') == '2':
                continue
            # exclude miseq
            if exclude_miseq:
                if file_resp.get('instrument') == 'Illumina MiSeq':
                    continue
            # some checks before running
            # check if status is deleted
            if file_resp['status'] == 'deleted':
                print('deleted file', file_resp['accession'], 'in', my_rep_set['accession'])
                continue
            # if there is no uploaded file in the file item, report and skip
            if not file_resp.get('filename'):
                print(file_resp['accession'], "does not have a file")
                continue
            # check if the file is in s3
            head_info = my_s3_util.does_key_exist(file_resp['upload_key'],
                                                  my_s3_util.raw_file_bucket)
            if not head_info:
                print(file_resp['accession'], "does not have a file in S3")
                continue
            # check that the file has a pair
            f1 = file_resp['@id']
            f2 = ""
            paired = ""
            # is there a pair?
            try:
                relations = file_resp['related_files']
                paired_files = [relation['file']['@id'] for relation in relations
                                if relation['relationship_type'] == 'paired with']
                assert len(paired_files) == 1
                f2 = paired_files[0]
                paired = "Yes"
            except (KeyError, AssertionError):
                paired = "No"
            # for experiments with unpaired fastq files
            if lookfor == 'single':
                if paired == 'No':
                    report[exp_resp['accession']].append(f1)
                else:
                    print('expected single files, found paired end')
                    return
            # for experiments with paired files
            else:
                if paired != 'Yes':
                    print('expected paired files, found single end')
                    return
                f2 = ''
                relations = file_resp.get('related_files')
                if not relations:
                    print(f1, 'does not have a pair')
                    return
                for relation in relations:
                    if relation['relationship_type'] == 'paired with':
                        f2 = relation['file']['@id']
                if not f2:
                    print(f1, 'does not have a pair')
                    return
                report[exp_resp['accession']].append((f1, f2))
    # get the organism
    if len(list(set(organisms))) == 1:
        organism = organisms[0]
    else:
        organism = None
    # get the enzyme
    if len(list(set(enzymes))) == 1:
        enz = enzymes[0]
    else:
        enz = None
    # bwa_index, chr_size and re_nz are module-level reference lookups (organism -> file)
    bwa = bwa_index.get(organism)
    chrsize = chr_size.get(organism)
    if re_nz.get(organism):
        enz_file = re_nz[organism].get(enz)
    else:
        print('no enzyme information for the organism {}'.format(organism))
        enz_file = None
    return (report, organism, enz, bwa, chrsize, enz_file,
            int(total_f_size / (1024 * 1024 * 1024)), lab)
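# Usage sketch for find_pairs (accession and environment are hypothetical; the
# replicate set item would normally be fetched with ff_utils.get_metadata, and the
# unpacking below assumes the set passed all checks, since find_pairs returns None
# on problems such as mismatched pairing):
#
#     my_rep_set = ff_utils.get_metadata('<replicate-set-accession>', key=auth)
#     (report, organism, enz, bwa, chrsize, enz_file,
#      total_size_gb, lab) = find_pairs(my_rep_set, 'data', lookfor='pairs')
#     # report maps each experiment accession to its (fastq1, fastq2) pairs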
def run_missing_atac2(wf_info, organism, paired, ta, my_env, my_key, run_name):
    """Build the trigger json for the atac-seq step 2 run (on tagAlign files) and post
    it to the fourfront endpoint."""
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket
    if organism == "human":
        org = 'hs'
        input_files = [
            {"object_key": "4DNFIZ1TGJZR.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.blacklist",
             "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"},
            {"object_key": "4DNFIZJB62D1.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.chrsz",
             "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"}
        ]
    elif organism == "mouse":
        org = 'mm'
        input_files = [
            {"object_key": "4DNFIZ3FBPK8.bed.gz",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.blacklist",
             "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"},
            {"object_key": "4DNFIBP173GC.chrom.sizes",
             "bucket_name": raw_bucket,
             "workflow_argument_name": "atac.chrsz",
             "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"}
        ]
    else:
        raise ValueError('organism must be "human" or "mouse", got {}'.format(organism))
    ta_f = extract_file_info(ta, 'atac.tas', my_env, rename=['bed', 'tagAlign'])
    input_files.append(ta_f)
    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    else:
        raise ValueError('paired must be "single" or "paired", got {}'.format(paired))
    parameters = {
        "atac.pipeline_type": 'atac',
        "atac.paired_end": chip_p,
        "atac.gensz": org,
        "atac.disable_ataqc": True,
        "atac.enable_xcor": False,
    }
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(ta)
        parameters['atac.fraglen'] = fraglist
    tag = '1.1.1'
    # create the trigger json that is posted to the fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))