def build(incl_map):
    nm = "vg_construct_index_map" if incl_map else "vg_construct_index"
    wf = dxpy.new_dxworkflow(title=nm,
                             name=nm,
                             description=nm,
                             project=project.get_id(),
                             folder=folder,
                             properties={"git_revision": git_revision})
    construct_applet = find_applet("vg_construct")
    construct_input = {}
    construct_stage_id = wf.add_stage(construct_applet,
                                      stage_input=construct_input,
                                      name="construct")
    index_input = {
        "vg_tar": dxpy.dxlink({"stage": construct_stage_id,
                               "outputField": "vg_tar"})
    }
    index_stage_id = wf.add_stage(find_applet("vg_index"),
                                  stage_input=index_input,
                                  name="index")
    if incl_map:
        map_input = {
            "vg_indexed_tar": dxpy.dxlink({"stage": index_stage_id,
                                           "outputField": "vg_indexed_tar"})
        }
        map_stage_id = wf.add_stage(find_applet("vg_map"),
                                    stage_input=map_input,
                                    name="map")
    return wf
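
# A minimal sketch of the pattern the example above uses: create a workflow,
# add a stage, then wire a downstream stage to an upstream output with a
# stage-reference dxlink that is resolved when the workflow runs. The project
# and applet IDs here are hypothetical placeholders, not real objects.
import dxpy

wf = dxpy.new_dxworkflow(title="example", project="project-xxxx")
first_stage_id = wf.add_stage("applet-aaaa", name="first")
wf.add_stage(
    "applet-bbbb",
    name="second",
    stage_input={"in_file": dxpy.dxlink({"stage": first_stage_id,
                                         "outputField": "out_file"})})
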
def main(project, folder, name):
    # Build the applet
    app_id, app_desc = upload_applet('.', None)
    app_handler = dxpy.DXApplet(app_id)

    # Build a workflow that uses that applet
    workflow = dxpy.new_dxworkflow(name=name, project=project, folder=folder)
    workflow.add_stage(app_id)

    # Delete the applet, to break the workflow
    app_handler.remove()
    return workflow.get_id()
def new_workflow(args):
    try_call(process_dataobject_args, args)
    try_call(process_single_dataobject_output_args, args)
    init_from = None
    if args.init is not None:
        if is_analysis_id(args.init):
            init_from = args.init
        else:
            init_project, _init_folder, init_result = try_call(
                resolve_existing_path, args.init, expected='entity')
            init_from = dxpy.get_handler(init_result['id'], project=init_project)
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = dxpy.config.get("DX_CLI_WD", "/")
        name = None
    else:
        project, folder, name = try_call(dxpy.utils.resolver.resolve_path,
                                         args.output)
    if args.output_folder is not None:
        try:
            # Try to resolve to a path in the project
            _ignore, args.output_folder, _ignore = resolve_path(
                args.output_folder, expected='folder')
        except:
            # But if not, just use the value directly
            pass
    try:
        dxworkflow = dxpy.new_dxworkflow(title=args.title,
                                         summary=args.summary,
                                         description=args.description,
                                         output_folder=args.output_folder,
                                         project=project,
                                         name=name,
                                         tags=args.tags,
                                         types=args.types,
                                         hidden=args.hidden,
                                         properties=args.properties,
                                         details=args.details,
                                         folder=folder,
                                         parents=args.parents,
                                         init_from=init_from)
        if args.brief:
            print(dxworkflow.get_id())
        else:
            dxpy.utils.describe.print_desc(
                dxworkflow.describe(incl_properties=True, incl_details=True),
                args.verbose)
    except:
        err_exit()
def createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId, appProjectId=None):
    '''This function will populate a workflow for the stepsToDo.'''
    if len(stepsToDo) < 1:
        return None
    if appProjectId == None:
        appProjectId = projectId

    # create a workflow object
    wf = dxpy.new_dxworkflow(title=extras['name'], name=extras['name'],
                             folder=resultsFolder, project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        appName = STEPS[step]['app']
        app = dxencode.find_applet_by_name(appName, appProjectId)
        appInputs = {}
        # file inputs
        for fileToken in STEPS[step]['inputs'].keys():
            appInp = STEPS[step]['inputs'][fileToken]
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                if isinstance(priors[fileToken], list):
                    appInputs[appInp] = []
                    for fid in priors[fileToken]:
                        appInputs[appInp] += [dxencode.get_file_link(fid)]
                else:
                    appInputs[appInp] = dxencode.get_file_link(priors[fileToken])
            else:
                print "ERROR: step '"+step+"' can't find input '"+fileToken+"'!"
                sys.exit(1)
        # Non-file app inputs
        if 'params' in STEPS[step]:
            for param in STEPS[step]['params'].keys():
                appParam = STEPS[step]['params'][param]
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print "ERROR: unable to locate '"+param+"' in extras."
                    sys.exit(1)
        # Add wf stage
        stageId = wf.add_stage(app, stage_input=appInputs, folder=resultsFolder)
        # outputs, which we will need to link to
        for fileToken in STEPS[step]['results'].keys():
            #appOut = STEPS[step]['results'][fileToken]
            appOut = fileToken  ## not the value
            prevStepResults[fileToken] = dxpy.dxlink({'stage': stageId,
                                                      'outputField': appOut})
    wfRun = wf.run({})
    return wfRun.describe()
def main():
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')

    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    replicates = []
    for rep in args.replicates:
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='glob', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)

    if not args.test:
        replicates = copy_files(replicates, project.get_id(), "/"+args.experiment)

    if not replicates:
        print "No replicates found in project: " + project.name
        print "Looking for " + ", ".join(args.replicates)
        sys.exit(1)

    paired = args.paired
    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata

    # Now create a new workflow
    spec_name = args.experiment+'-'+'-'.join([r.split('.')[0] for r in args.replicates])
    wf = dxpy.new_dxworkflow(title='dx_dna_me_'+spec_name,
                             name='ENCODE Bismark DNA-ME pipeline: '+spec_name,
                             description='The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment ' + args.experiment,
                             folder='/'+args.experiment,
                             project=project.get_id())

    populate_workflow(wf, replicates, args.experiment, paired, gender, organism, project.id)
def new_workflow(args):
    try_call(process_dataobject_args, args)
    try_call(process_single_dataobject_output_args, args)
    init_from = None
    if args.init is not None:
        if is_analysis_id(args.init):
            init_from = args.init
        else:
            init_project, _init_folder, init_result = try_call(
                resolve_existing_path, args.init, expected="entity")
            init_from = dxpy.get_handler(init_result["id"], project=init_project)
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = get_env_var("DX_CLI_WD", "/")
        name = None
    else:
        project, folder, name = dxpy.utils.resolver.resolve_path(args.output)
    if args.output_folder is not None:
        try:
            # Try to resolve to a path in the project
            _ignore, args.output_folder, _ignore = resolve_path(
                args.output_folder, expected="folder")
        except:
            # But if not, just use the value directly
            pass
    try:
        dxworkflow = dxpy.new_dxworkflow(
            title=args.title,
            summary=args.summary,
            description=args.description,
            output_folder=args.output_folder,
            project=project,
            name=name,
            tags=args.tags,
            types=args.types,
            hidden=args.hidden,
            properties=args.properties,
            details=args.details,
            folder=folder,
            parents=args.parents,
            init_from=init_from,
        )
        if args.brief:
            print(dxworkflow.get_id())
        else:
            dxpy.utils.describe.print_desc(
                dxworkflow.describe(incl_properties=True, incl_details=True),
                args.verbose)
    except:
        err_exit()
def new_workflow(args):
    try_call(process_dataobject_args, args)
    try_call(process_single_dataobject_output_args, args)
    init_from = None
    if args.init is not None:
        try:
            init_project, init_folder, init_result = try_call(
                resolve_existing_path, args.init, expected='entity')
            init_from = dxpy.get_handler(init_result['id'], project=init_project)
        except:
            init_from = args.init
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = os.environ.get('DX_CLI_WD', '/')
        name = None
    else:
        project, folder, name = dxpy.utils.resolver.resolve_path(args.output)
    if args.output_folder is not None:
        try:
            # Try to resolve to a path in the project
            ignore, args.output_folder, ignore2 = resolve_path(
                args.output_folder, expected='folder')
        except:
            # But if not, just use the value directly
            pass
    try:
        dxworkflow = dxpy.new_dxworkflow(title=args.title,
                                         summary=args.summary,
                                         description=args.description,
                                         output_folder=args.output_folder,
                                         project=project,
                                         name=name,
                                         tags=args.tags,
                                         types=args.types,
                                         hidden=args.hidden,
                                         properties=args.properties,
                                         details=args.details,
                                         folder=folder,
                                         parents=args.parents,
                                         init_from=init_from)
        if args.brief:
            print dxworkflow.get_id()
        else:
            dxpy.utils.describe.print_desc(
                dxworkflow.describe(incl_properties=True, incl_details=True),
                args.verbose)
    except:
        err_exit()
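
# The new_workflow() variants above all pass init_from through to
# dxpy.new_dxworkflow, which accepts either an existing workflow handler or an
# analysis ID and seeds the new workflow from it. A minimal sketch, assuming a
# placeholder workflow ID:
import dxpy

template = dxpy.DXWorkflow("workflow-xxxx")
clone = dxpy.new_dxworkflow(title="cloned workflow", init_from=template)
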
def build_workflow():
    if parameters["folder_provided"] == "false":
        wf = dxpy.new_dxworkflow(
            name='WARDEN_workflow',
            description='RNA-SEQ Workflow',
            output_folder=parameters["Output"],
        )
    else:
        wf = dxpy.new_dxworkflow(
            name='WARDEN_workflow',
            description='RNA-SEQ Workflow',
        )
    wf_outputs = []

    combine_counts_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["combine_counts"],
        state="closed",
        return_handler=True)
    limma_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["limma"],
        state="closed",
        return_handler=True)
    simple_DE_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["simple_DE"],
        state="closed",
        return_handler=True)

    sample_num = 0
    htseq_results = [dxpy.dxlink(count_id) for count_id in samples.values()]
    combine_input = {
        "count_files": htseq_results,
        "name_value": "htseq",
        "sample_files": [dxpy.dxlink(final_sample_list_id)]
    }
    combine_counts_stage_id = wf.add_stage(combine_counts_applet,
                                           stage_input=combine_input,
                                           instance_type="azure:mem2_ssd1_x1",
                                           name="COMBINE HTSEQ")
    wf_outputs += [
        {
            "name": "combined_counts",
            "class": "file",
            "outputSource": {
                "$dnanexus_link": {
                    "stage": combine_counts_stage_id,
                    "outputField": "count_file"
                }
            }
        },
    ]

    if parameters["limma_DE_viewer"] != "None":
        limma_viewer_project, limma_viewer_file = parameters["limma_DE_viewer"].split(":")
        limma_viewer_link = dxpy.dxlink({
            "project": limma_viewer_project,
            "id": limma_viewer_file
        })

    if parameters["limma_runnable"] == "true":
        limma_input = {
            "input_count_file": dxpy.dxlink({
                "stage": combine_counts_stage_id,
                "outputField": "count_file"
            }),
            "sample_list_file": dxpy.dxlink(final_sample_list_id),
            "calcNormFactors_method": parameters["calcNormFactors_method"],
            "filter_count_type": parameters["filter_count_type"],
            "filter_count": int(parameters["filter_count"]),
            "p_value_adjust": parameters["p_value_adjust"],
            "contrasts_file": dxpy.dxlink(comparisons_limma_id)
        }
        if parameters["limma_DE_viewer"] != "None":
            limma_input["difex_viewer"] = limma_viewer_link
        limma_stage_id = wf.add_stage(limma_applet,
                                      stage_input=limma_input,
                                      instance_type="azure:mem1_ssd1_x4",
                                      name="LIMMA")
        wf_outputs += [
            {
                "name": "limma_outfiles",
                "class": "array:file",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": limma_stage_id,
                        "outputField": "out_files"
                    }
                }
            },
            {
                "name": "limma_viewer",
                "class": "record",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": limma_stage_id,
                        "outputField": "viewer_bookmark"
                    }
                }
            },
        ]

    simple_DE_input = {
        "input_count_file": dxpy.dxlink({
            "stage": combine_counts_stage_id,
            "outputField": "count_file"
        }),
        "sample_list_file": dxpy.dxlink(final_sample_list_id),
        "contrasts_file": dxpy.dxlink(comparisons_all_id)
    }
    if parameters["limma_DE_viewer"] != "None":
        simple_DE_input["difex_viewer"] = limma_viewer_link
    simple_DE_stage_id = wf.add_stage(simple_DE_applet,
                                      stage_input=simple_DE_input,
                                      instance_type="azure:mem1_ssd1_x4",
                                      name="SIMPLE DIFFERENTIAL_EXPRESSION")
    wf_outputs += [
        {
            "name": "simple_DE_outfiles",
            "class": "array:file",
            "outputSource": {
                "$dnanexus_link": {
                    "stage": simple_DE_stage_id,
                    "outputField": "out_files"
                }
            }
        },
        {
            "name": "simple_DE_viewer",
            "class": "record",
            "outputSource": {
                "$dnanexus_link": {
                    "stage": simple_DE_stage_id,
                    "outputField": "viewer_bookmark"
                }
            }
        },
    ]

    wf.update(workflow_outputs=wf_outputs)
    wf.close()
    return wf.get_id()
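
# Sketch of the workflow-output wiring used in the WARDEN example above: each
# entry names a workflow-level output and points its outputSource at a stage's
# output field, after which the workflow is closed. The applet ID and field
# names are hypothetical placeholders.
import dxpy

wf = dxpy.new_dxworkflow(name="outputs_demo")
stage_id = wf.add_stage("applet-aaaa", name="produce")
wf.update(workflow_outputs=[{
    "name": "result",
    "class": "file",
    "outputSource": {
        "$dnanexus_link": {"stage": stage_id, "outputField": "out_file"}
    }
}])
wf.close()  # a closed workflow becomes immutable
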
def build_workflow(experiment, biorep_n, input_shield_stage_input, accession,
                   use_existing_folders):
    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))
    mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME,
                                         applet_project.get_id())
    logging.debug('Found applet %s' % (mapping_applet.name))
    input_shield_applet = find_applet_by_name(INPUT_SHIELD_APPLET_NAME,
                                              applet_project.get_id())
    logging.debug('Found applet %s' % (input_shield_applet.name))

    folders = ['workflows', 'fastqs', 'raw_bams', 'bams']
    folder_paths = ['/'.join([args.outf, folder_name,
                              experiment.get('accession'),
                              'rep%d' % (biorep_n)])
                    for folder_name in folders]
    paths_exist = [resolve_folder(output_project, folder_path)
                   for folder_path in folder_paths
                   if resolve_folder(output_project, folder_path)]
    if any(paths_exist):
        msg = "%s: output paths already exist: %s" % (
            experiment.get('accession'), paths_exist)
        if use_existing_folders:
            logging.warning(msg)
        else:
            msg += "\nUse --use_existing_folders to suppress but possibly create duplicate files"
            logging.error(msg)
            return None
    workflow_output_folder, fastq_output_folder, mapping_output_folder, final_output_folder = \
        tuple(create_folder(output_project, folder_path)
              for folder_path in folder_paths)

    if args.raw:
        workflow_title = ('Map %s rep%d to %s (no filter)'
                          % (experiment.get('accession'), biorep_n, args.assembly))
        workflow_name = 'ENCODE raw mapping pipeline'
    else:
        workflow_title = ('Map %s rep%d to %s and filter'
                          % (experiment.get('accession'), biorep_n, args.assembly))
        workflow_name = 'ENCODE mapping pipeline'
    if args.tag:
        workflow_title += ': %s' % (args.tag)

    workflow = dxpy.new_dxworkflow(title=workflow_title,
                                   name=workflow_name,
                                   project=output_project.get_id(),
                                   folder=workflow_output_folder)

    input_shield_stage_id = workflow.add_stage(
        input_shield_applet,
        name='Gather inputs %s rep%d' % (experiment.get('accession'), biorep_n),
        folder=fastq_output_folder,
        stage_input=input_shield_stage_input)

    input_names = [name for name in
                   ['reads1', 'reads2', 'crop_length', 'reference_tar',
                    'bwa_version', 'bwa_aln_params', 'samtools_version', 'debug']
                   if name in input_shield_stage_input]
    logging.debug('input_names: %s' % (input_names))
    mapping_stage_input = dict(zip(
        input_names,
        [dxpy.dxlink({'stage': input_shield_stage_id,
                      'outputField': input_name})
         for input_name in input_names]))
    logging.debug('mapping_stage_input: %s' % (mapping_stage_input))
    mapping_stage_id = workflow.add_stage(
        mapping_applet,
        name='Map %s rep%d' % (experiment.get('accession'), biorep_n),
        folder=mapping_output_folder,
        stage_input=mapping_stage_input)

    if not args.raw:
        filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME,
                                               applet_project.get_id())
        logging.debug('Found applet %s' % (filter_qc_applet.name))
        filter_qc_stage_id = workflow.add_stage(
            filter_qc_applet,
            name='Filter and QC %s rep%d' % (experiment.get('accession'), biorep_n),
            folder=final_output_folder,
            stage_input={
                'input_bam': dxpy.dxlink({'stage': mapping_stage_id,
                                          'outputField': 'mapped_reads'}),
                'paired_end': dxpy.dxlink({'stage': mapping_stage_id,
                                           'outputField': 'paired_end'}),
                'scrub': args.scrub
            })
        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME,
                                          applet_project.get_id())
        logging.debug('Found applet %s' % (xcor_applet.name))
        xcor_stage_id = workflow.add_stage(
            xcor_applet,
            name='Calculate cross-correlation %s rep%d'
                 % (experiment.get('accession'), biorep_n),
            folder=final_output_folder,
            stage_input={
                'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id,
                                          'outputField': 'filtered_bam'}),
                'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id,
                                           'outputField': 'paired_end'}),
                'spp_version': args.spp_version
            })

    '''
    This should all be done in the shield's postprocess entrypoint
    if args.accession_outputs:
        derived_from = input_shield_stage_input.get('reads1')
        if reads2:
            derived_from.append(reads2)
        files_json = {
            dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}): {
                'notes': 'Biorep%d | Mapped to %s'
                         % (biorep_n, input_shield_stage_input.get('reference_tar')),
                'lab': 'j-michael-cherry',
                'award': 'U41HG006992',
                'submitted_by': '*****@*****.**',
                'file_format': 'bam',
                'output_type': 'alignments',
                'derived_from': derived_from,
                'dataset': experiment.get('accession')}
        }
        output_shield_stage_id = workflow.add_stage(
            output_shield_applet,
            name='Accession outputs %s rep%d' % (experiment.get('accession'), biorep_n),
            folder=mapping_output_folder,
            stage_input={'files': [dxpy.dxlink({'stage': mapping_stage_id,
                                                'outputField': 'mapped_reads'})],
                         'files_json': files_json,
                         'key': input_shield_stage_input.get('key')}
        )
    '''
    return workflow
def main():
    args = get_args()
    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)
    if args.nomap and (args.rep1pe is None or args.rep2pe is None) and not blank_workflow:
        logging.error("With --nomap, endedness of replicates must be specified with --rep1pe and --rep2pe")
        raise ValueError

    if not args.target:
        target_type = 'default'  # default
    else:
        target_type = args.target.lower()
    if target_type not in WF.keys():
        logging.error('Target type %s is not recognized' % (target_type))
        sys.exit(2)

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))

    existing_folder = resolve_folder(output_project, args.outf)
    if not existing_folder:
        output_folder = create_folder(output_project, args.outf)
    elif args.use_existing_folders:
        output_folder = existing_folder
    else:
        assert (existing_folder and args.use_existing_folders), \
            'Output folder %s exists but --use_existing_folders is %s' % (
                existing_folder, args.use_existing_folders)
    logging.debug('Using output folder %s' % (output_folder))

    workflow = dxpy.new_dxworkflow(
        name=args.name or WF[target_type]['wf_name'],
        title=args.title or WF[target_type]['wf_title'],
        description=args.description or WF[target_type]['wf_description'],
        project=output_project.get_id(),
        folder=output_folder,
        properties={'pipeline_version': str(args.pipeline_version)})

    unary_control = args.unary_control or (
        args.rep1 and args.rep2 and args.ctl1 and not args.ctl2)

    if not args.genomesize:
        genomesize = None
    else:
        genomesize = args.genomesize
    if not args.chrom_sizes:
        chrom_sizes = None
    else:
        chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes))
    if not args.blacklist:
        blacklist = None
    else:
        blacklist = dxpy.dxlink(resolve_file(args.blacklist))

    run_idr = WF[target_type]['run_idr']

    if not args.nomap:
        # a "superstage" is just a dict with a name, name(s) of input files,
        # and then names and id's of stages that process that input
        # each superstage here could be implemented as a stage in a more
        # abstract workflow.  That stage would then call the various applets
        # that are separate stages here.
        mapping_superstages = [  # the order of this list is important in that
            {'name': 'Rep1', 'input_args': args.rep1},
            {'name': 'Rep2', 'input_args': args.rep2},
            {'name': 'Ctl1', 'input_args': args.ctl1}
        ]
        if not unary_control:
            mapping_superstages.append(
                {'name': 'Ctl2', 'input_args': args.ctl2})

        mapping_applet = find_applet_by_name(
            MAPPING_APPLET_NAME, applet_project.get_id())
        # mapping_output_folder = resolve_folder(
        #     output_project, output_folder + '/' + mapping_applet.name)
        mapping_output_folder = mapping_applet.name
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(
            FILTER_QC_APPLET_NAME, applet_project.get_id())
        filter_qc_output_folder = mapping_output_folder
        xcor_applet = find_applet_by_name(
            XCOR_APPLET_NAME, applet_project.get_id())
        xcor_output_folder = mapping_output_folder

        # in the first pass create the mapping stage id's so we can use JBOR's
        # to link inputs
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            mapped_stage_id = workflow.add_stage(
                mapping_applet,
                name='Map %s' % (superstage_name),
                folder=mapping_output_folder)
            mapping_superstage.update({'map_stage_id': mapped_stage_id})

        # in the second pass populate the stage inputs and build other stages
        rep1_stage_id = next(
            ss.get('map_stage_id') for ss in mapping_superstages
            if ss['name'] == 'Rep1')
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            superstage_id = mapping_superstage.get('map_stage_id')

            if mapping_superstage.get('input_args') or blank_workflow:
                mapping_stage_input = {}
                if superstage_name != "Rep1":
                    mapping_stage_input.update(
                        {'reference_tar': dxpy.dxlink(
                            {'stage': rep1_stage_id,
                             'inputField': 'reference_tar'})})
                else:
                    if args.reference:
                        mapping_stage_input.update(
                            {'reference_tar': dxpy.dxlink(reference_tar.get_id())})
                if not blank_workflow:
                    # read pairs assumed to be in order read1,read2
                    for arg_index, input_arg in enumerate(mapping_superstage['input_args']):
                        reads = dxpy.dxlink(resolve_file(input_arg).get_id())
                        mapping_stage_input.update({'reads%d' % (arg_index+1): reads})
                # this is now done in the first pass loop above
                # mapped_stage_id = workflow.add_stage(
                #     mapping_applet,
                #     name='Map %s' % (superstage_name),
                #     folder=mapping_output_folder,
                #     stage_input=mapping_stage_input
                # )
                # mapping_superstage.update({'map_stage_id': mapped_stage_id})
                workflow.update_stage(superstage_id, stage_input=mapping_stage_input)

                filter_qc_stage_id = workflow.add_stage(
                    filter_qc_applet,
                    name='Filter_QC %s' % (superstage_name),
                    folder=filter_qc_output_folder,
                    stage_input={
                        'input_bam': dxpy.dxlink({'stage': superstage_id,
                                                  'outputField': 'mapped_reads'}),
                        'paired_end': dxpy.dxlink({'stage': superstage_id,
                                                   'outputField': 'paired_end'})
                    })
                mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id})

                xcor_stage_id = workflow.add_stage(
                    xcor_applet,
                    name='Xcor %s' % (superstage_name),
                    folder=xcor_output_folder,
                    stage_input={
                        'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id,
                                                  'outputField': 'filtered_bam'}),
                        'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id,
                                                   'outputField': 'paired_end'}),
                        'spp_version': args.spp_version
                    })
                mapping_superstage.update({'xcor_stage_id': xcor_stage_id})

        exp_rep1_ta = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Rep1'),
             'outputField': 'tagAlign_file'})
        exp_rep1_cc = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Rep1'),
             'outputField': 'CC_scores_file'})
        exp_rep2_ta = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Rep2'),
             'outputField': 'tagAlign_file'})
        exp_rep2_cc = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Rep2'),
             'outputField': 'CC_scores_file'})
        ctl_rep1_ta = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Ctl1'),
             'outputField': 'tagAlign_file'})
        if unary_control:
            ctl_rep2_ta = ctl_rep1_ta
        else:
            ctl_rep2_ta = dxpy.dxlink(
                {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                               if ss['name'] == 'Ctl2'),
                 'outputField': 'tagAlign_file'})
        rep1_paired_end = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Rep1'),
             'outputField': 'paired_end'})
        rep2_paired_end = dxpy.dxlink(
            {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages
                           if ss['name'] == 'Rep2'),
             'outputField': 'paired_end'})
    else:  # skipped the mapping, so just bring in the inputs from arguments
        if not blank_workflow:
            exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
            exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
            ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
            ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
            exp_rep1_ta_desc = dxpy.describe(exp_rep1_ta)
            exp_rep2_ta_desc = dxpy.describe(exp_rep2_ta)
            exp_rep1_mapping_analysis_id = dxpy.describe(
                exp_rep1_ta_desc['createdBy']['job'])['analysis']
            exp_rep2_mapping_analysis_id = dxpy.describe(
                exp_rep2_ta_desc['createdBy']['job'])['analysis']
            exp_rep1_mapping_analysis = dxpy.describe(exp_rep1_mapping_analysis_id)
            exp_rep2_mapping_analysis = dxpy.describe(exp_rep2_mapping_analysis_id)
            exp_rep1_cc = next(
                stage['execution']['output']['CC_scores_file']
                for stage in exp_rep1_mapping_analysis.get('stages')
                if stage['execution']['executableName'] == 'xcor')
            exp_rep2_cc = next(
                stage['execution']['output']['CC_scores_file']
                for stage in exp_rep2_mapping_analysis.get('stages')
                if stage['execution']['executableName'] == 'xcor')
        else:
            exp_rep1_ta = None
            exp_rep2_ta = None
            ctl_rep1_ta = None
            ctl_rep2_ta = None
        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe
        # # here we need to calculate the cc scores files, because we're only being supplied tagAligns
        # # if we had mapped everything above we'd already have a handle to the cc file
        # xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id())
        # # xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name)
        # xcor_output_folder = xcor_only_applet.name
        # xcor_only_stages = []
        # exp_rep1_cc_stage_id = workflow.add_stage(
        #     xcor_only_applet,
        #     name="Rep1 cross-correlation",
        #     folder=xcor_output_folder,
        #     stage_input={
        #         'input_tagAlign': exp_rep1_ta,
        #         'paired_end': rep1_paired_end,
        #         'spp_version': args.spp_version
        #     }
        # )
        # xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
        # exp_rep1_cc = dxpy.dxlink(
        #     {'stage': exp_rep1_cc_stage_id,
        #      'outputField': 'CC_scores_file'})
        # exp_rep2_cc_stage_id = workflow.add_stage(
        #     xcor_only_applet,
        #     name="Rep2 cross-correlation",
        #     folder=xcor_output_folder,
        #     stage_input={
        #         'input_tagAlign': exp_rep2_ta,
        #         'paired_end': rep2_paired_end,
        #         'spp_version': args.spp_version
        #     }
        # )
        # xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
        # exp_rep2_cc = dxpy.dxlink(
        #     {'stage': exp_rep2_cc_stage_id,
        #      'outputField': 'CC_scores_file'})

    encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME,
                                              applet_project.get_id())
    encode_macs2_stages = []
    # peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name)
    peaks_output_folder = encode_macs2_applet.name

    macs2_stage_input = {
        'rep1_ta': exp_rep1_ta,
        'rep2_ta': exp_rep2_ta,
        'ctl1_ta': ctl_rep1_ta,
        'ctl2_ta': ctl_rep2_ta,
        'rep1_xcor': exp_rep1_cc,
        'rep2_xcor': exp_rep2_cc,
        'rep1_paired_end': rep1_paired_end,
        'rep2_paired_end': rep2_paired_end,
        'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
        'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)),
        'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as))
    }
    if genomesize:
        macs2_stage_input.update({'genomesize': genomesize})
    if chrom_sizes:
        macs2_stage_input.update({'chrom_sizes': chrom_sizes})
    encode_macs2_stage_id = workflow.add_stage(
        encode_macs2_applet,
        name='ENCODE Peaks',
        folder=peaks_output_folder,
        stage_input=macs2_stage_input)
    encode_macs2_stages.append({'name': 'ENCODE Peaks',
                                'stage_id': encode_macs2_stage_id})

    if run_idr:
        encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME,
                                                applet_project.get_id())
        encode_spp_stages = []
        # idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name)
        idr_peaks_output_folder = encode_spp_applet.name
        PEAKS_STAGE_NAME = 'SPP Peaks'
        peaks_stage_input = {
            'rep1_ta': exp_rep1_ta,
            'rep2_ta': exp_rep2_ta,
            'ctl1_ta': ctl_rep1_ta,
            'ctl2_ta': ctl_rep2_ta,
            'rep1_xcor': exp_rep1_cc,
            'rep2_xcor': exp_rep2_cc,
            'rep1_paired_end': rep1_paired_end,
            'rep2_paired_end': rep2_paired_end,
            'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
            'idr_peaks': True,
            'spp_version': args.spp_version
        }
        if chrom_sizes:
            peaks_stage_input.update({'chrom_sizes': chrom_sizes})
        else:
            peaks_stage_input.update(
                {'chrom_sizes': dxpy.dxlink({'stage': encode_macs2_stage_id,
                                             'inputField': 'chrom_sizes'})})
        encode_spp_stage_id = workflow.add_stage(
            encode_spp_applet,
            name=PEAKS_STAGE_NAME,
            folder=idr_peaks_output_folder,
            stage_input=peaks_stage_input)
        encode_spp_stages.append({'name': PEAKS_STAGE_NAME,
                                  'stage_id': encode_spp_stage_id})

        idr_applet = find_applet_by_name(IDR2_APPLET_NAME,
                                         applet_project.get_id())
        encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME,
                                                applet_project.get_id())
        idr_stages = []
        # idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name)
        idr_output_folder = idr_applet.name
        if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow:
            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR True Replicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1_peaks'}),
                    'rep2_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooled_peaks'})
                })
            idr_stages.append({'name': 'IDR True Replicates',
                               'stage_id': idr_stage_id})

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 1 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1pr1_peaks'}),
                    'rep2_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1pr2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1_peaks'})
                })
            idr_stages.append({'name': 'IDR Rep 1 Self-pseudoreplicates',
                               'stage_id': idr_stage_id})

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 2 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2pr1_peaks'}),
                    'rep2_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2pr2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2_peaks'})
                })
            idr_stages.append({'name': 'IDR Rep 2 Self-pseudoreplicates',
                               'stage_id': idr_stage_id})

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Pooled Pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooledpr1_peaks'}),
                    'rep2_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooledpr2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages
                                       if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooled_peaks'})
                })
            idr_stages.append({'name': 'IDR Pooled Pseudoreplicates',
                               'stage_id': idr_stage_id})

            final_idr_stage_input = {
                'reps_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in idr_stages
                                   if ss['name'] == 'IDR True Replicates'),
                     'outputField': 'IDR_peaks'}),
                'r1pr_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in idr_stages
                                   if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'),
                     'outputField': 'IDR_peaks'}),
                'r2pr_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in idr_stages
                                   if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'),
                     'outputField': 'IDR_peaks'}),
                'pooledpr_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in idr_stages
                                   if ss['name'] == 'IDR Pooled Pseudoreplicates'),
                     'outputField': 'IDR_peaks'}),
                'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                'rep1_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep1_fc_signal'}),
                'rep2_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep2_fc_signal'}),
                'pooled_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooled_fc_signal'})
            }
            if blacklist:
                final_idr_stage_input.update({'blacklist': blacklist})
            if chrom_sizes:
                final_idr_stage_input.update({'chrom_sizes': chrom_sizes})
            else:
                final_idr_stage_input.update(
                    {'chrom_sizes': dxpy.dxlink({'stage': encode_spp_stage_id,
                                                 'inputField': 'chrom_sizes'})})
            final_idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=final_idr_stage_input,
            )
            idr_stages.append({'name': 'Final IDR peak calls',
                               'stage_id': final_idr_stage_id})

    if target_type == 'histone':
        overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME,
                                                   applet_project.get_id())
        overlap_peaks_stages = []
        for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']:
            if peaktype == 'narrowpeaks':
                as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as))
                peak_type_extension = 'narrowPeak'
            elif peaktype == 'gappedpeaks':
                as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as))
                peak_type_extension = 'gappedPeak'
            elif peaktype == 'broadpeaks':
                as_file = dxpy.dxlink(resolve_file(args.broadpeak_as))
                peak_type_extension = 'broadPeak'

            overlap_peaks_stage_input = {
                'rep1_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep1_%s' % (peaktype)}),
                'rep2_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep2_%s' % (peaktype)}),
                'pooled_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooled_%s' % (peaktype)}),
                'pooledpr1_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooledpr1_%s' % (peaktype)}),
                'pooledpr2_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooledpr2_%s' % (peaktype)}),
                'as_file': as_file,
                'peak_type': peak_type_extension,
                'prefix': 'final',
                'rep1_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep1_fc_signal'}),
                'rep2_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep2_fc_signal'}),
                'pooled_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages
                                   if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooled_fc_signal'})
            }
            if chrom_sizes:
                overlap_peaks_stage_input.update({'chrom_sizes': chrom_sizes})
            else:
                overlap_peaks_stage_input.update(
                    {'chrom_sizes': dxpy.dxlink({'stage': encode_macs2_stage_id,
                                                 'inputField': 'chrom_sizes'})})
            overlap_peaks_stage_id = workflow.add_stage(
                overlap_peaks_applet,
                name='Final %s' % (peaktype),
                folder=peaks_output_folder,
                stage_input=overlap_peaks_stage_input)
            overlap_peaks_stages.append({'name': 'Final %s' % (peaktype),
                                         'stage_id': overlap_peaks_stage_id})

    if args.accession:
        accession_analysis_applet = find_applet_by_name(
            ACCESSION_ANALYSIS_APPLET_NAME, applet_project.get_id())
        accession_output_folder = accession_analysis_applet.name
        accession_stage_input = {
            'analysis_ids': ['self'],
            'force_patch': True,
            'wait_on_files': []
        }
        if target_type == 'histone':
            for stage in overlap_peaks_stages:
                for output_field in ['overlapping_peaks', 'overlapping_peaks_bb']:
                    accession_stage_input['wait_on_files'].append(
                        dxpy.dxlink({'stage': stage.get('stage_id'),
                                     'outputField': output_field}))
        elif run_idr:
            for output_field in ['conservative_set', 'conservative_set_bb',
                                 'optimal_set', 'optimal_set_bb']:
                accession_stage_input['wait_on_files'].append(
                    dxpy.dxlink({'stage': final_idr_stage_id,
                                 'outputField': output_field}))
        assert accession_stage_input['wait_on_files'], \
            "ERROR: workflow has no wait_on_files defined, so --accession is not supported."
        accession_stage_id = workflow.add_stage(
            accession_analysis_applet,
            name='Accession results',
            folder=accession_output_folder,
            stage_input=accession_stage_input)

    if args.yes:
        if args.debug:
            job_id = workflow.run(
                {}, folder=output_folder, priority='high',
                debug={'debugOn': ['AppInternalError', 'AppError']},
                delay_workspace_destruction=True,
                allow_ssh=['255.255.255.255'])
        else:
            job_id = workflow.run({}, folder=output_folder, priority='normal')
        logging.info("Running as job %s" % (job_id))
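
# A minimal self-contained sketch of the launch pattern in the example above:
# workflow.run() takes a dict of extra runtime inputs (empty here, since stage
# inputs were bound with add_stage/update_stage) and returns an analysis
# handler. The applet ID and folder are hypothetical placeholders.
import dxpy

wf = dxpy.new_dxworkflow(title="run_demo")
wf.add_stage("applet-aaaa", name="only_stage")
analysis = wf.run({}, folder="/results", priority="normal")
print(analysis.get_id())
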
def main():
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')

    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    replicates = []
    for rep in args.replicates:
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='exact', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)

    if not args.test:
        replicates = copy_files(replicates, project.get_id(), "/"+args.experiment)

    if not replicates:
        print "No replicates found in project: " + project.name
        print "Looking for " + ", ".join(args.replicates)
        sys.exit(1)

    inputs = {'rnd_seed': 12345}
    inputs['paired'] = args.paired
    inputs['gender'] = args.gender
    inputs['organism'] = args.organism
    inputs['library_id'] = args.library
    inputs['nthreads'] = args.nthreads
    #TODO determine paired or gender from ENCSR metadata

    # Now create a new workflow
    inputs['spec_name'] = args.experiment+'-'+'-'.join(
        [r.split('.')[0] for r in args.replicates])
    title_root = 'dx_long_rna_seq_'
    name_root = 'ENCODE Long RNA Seq: '
    desc = 'The ENCODE RNA Seq pipeline for long RNAs'
    if args.paired:
        title_root = title_root + '_paired_end '
        name_root = name_root + '(paired-end) '
        inputs['stranded'] = True
    else:
        title_root = title_root + '_single_end '
        name_root = name_root + '(single-end) '
        inputs['stranded'] = False

    if args.export:
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT,
                                           name_mode='exact',
                                           return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=title_root,
                                 name=name_root,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        wf = dxpy.new_dxworkflow(title=title_root+inputs['spec_name'],
                                 name=name_root+inputs['spec_name'],
                                 description=desc+' for experiment: '+args.experiment,
                                 folder='/'+args.experiment,
                                 project=project.get_id())

    populate_workflow(wf, replicates, args.experiment, inputs, project.id, args.export)
def main():
    args = get_args()

    ## resolve projects
    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    ## resolve replicates/fastq inputs
    paired = args.paired
    if not paired:
        if len(args.replicates) < 1:
            sys.exit('Need to have at least 1 replicate file (unpaired); use -r or --replicates')
        replicates = find_replicates(args.replicates, source_id, project,
                                     args.experiment, args.test)
        if not replicates:
            print "No replicates found in project: " + project.name
            print "Looking for " + ", ".join(args.replicates)
            sys.exit(1)
        dx_reps = {'reads': [dxpy.dxlink(r) for r in replicates]}
        rnames = '-'.join([r.split('.')[0] for r in args.replicates])
    else:
        if len(args.pair1) < 1 or len(args.pair2) < 1:
            sys.exit("Need to have at least 1 replicate in pair1 (--r1/--pair1) and pair2 (--r2/--pair2)")
        pair1reps = find_replicates(args.pair1, source_id, project,
                                    args.experiment, args.test)
        if not pair1reps:
            print "No replicates for pair1 found in project: " + project.name
            print "Looking for " + ", ".join(args.pair1)
            sys.exit(1)
        pair2reps = find_replicates(args.pair2, source_id, project,
                                    args.experiment, args.test)
        if not pair2reps:
            print "No replicates for pair2 found in project: " + project.name
            print "Looking for " + ", ".join(args.pair2)
            sys.exit(1)
        dx_reps = {'pair1_reads': [dxpy.dxlink(r) for r in pair1reps],
                   'pair2_reads': [dxpy.dxlink(r) for r in pair2reps]}
        rnames = '-'.join([r.split('.')[0] for r in args.pair1+args.pair2])

    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata

    # Now create a new workflow
    spec_name = args.experiment+'-'+rnames
    title_root = 'dx_dna_me_'
    name_root = 'ENCODE Bismark DNA-ME pipeline: '
    desc = 'The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment'
    if paired:
        title_root = title_root + '_paired_end'
        name_root = name_root + '(paired-end)'
    else:
        title_root = title_root + '_single_end'
        name_root = name_root + '(single-end)'

    if args.export:
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT,
                                           name_mode='exact',
                                           return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=title_root,
                                 name=name_root,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        wf = dxpy.new_dxworkflow(title='dx_dna_me_'+spec_name,
                                 name='ENCODE Bismark DNA-ME pipeline: '+spec_name,
                                 description='The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment ' + args.experiment,
                                 folder='/'+args.experiment,
                                 project=project.get_id())

    populate_workflow(wf, dx_reps, args.experiment, paired, gender, organism,
                      project.id, args.export)
def test_workflow_completion(self):
    dxworkflow = dxpy.new_dxworkflow(name="my workflow")
    self.assert_completion("dx run my", "my workflow ")
    dxworkflow.hide()
    self.assert_no_completions("dx run my")
def main(): args = get_args() blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if args.nomap and (args.rep1pe is None or args.rep2pe is None) and not blank_workflow: logging.error( "With --nomap, endedness of replicates must be specified with --rep1pe and --rep2pe" ) raise ValueError if not args.target: target_type = 'default' # default else: target_type = args.target.lower() if target_type not in WF.keys(): logging.error('Target type %s is not recognized') sys.exit(2) output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' % (output_project.name)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' % (applet_project.name)) existing_folder = resolve_folder(output_project, args.outf) if not existing_folder: output_folder = create_folder(output_project, args.outf) elif args.use_existing_folders: output_folder = existing_folder else: assert ( existing_folder and args.use_existing_folders ), 'Output folder %s exists but --use_existing_folders is %s' % ( existing_folder, args.use_existing_folders) logging.debug('Using output folder %s' % (output_folder)) workflow = dxpy.new_dxworkflow(name=args.name or WF[target_type]['wf_name'], title=args.title or WF[target_type]['wf_title'], description=args.description or WF[target_type]['wf_description'], project=output_project.get_id(), folder=output_folder) unary_control = args.unary_control or (args.rep1 and args.rep2 and args.ctl1 and not args.ctl2) if not args.genomesize: genomesize = None else: genomesize = args.genomesize if not args.chrom_sizes: chrom_sizes = None else: chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes)) if not args.blacklist: blacklist = None else: blacklist = dxpy.dxlink(resolve_file(args.blacklist)) run_idr = WF[target_type]['run_idr'] if not args.nomap: # a "superstage" is just a dict with a name, name(s) of input files, # and then names and id's of stages that process that input # each superstage here could be implemented as a stage in a more # abstract workflow. That stage would then call the various applets # that are separate # stages here. 
mapping_superstages = [ # the order of this list is important in that { 'name': 'Rep1', 'input_args': args.rep1 }, { 'name': 'Rep2', 'input_args': args.rep2 }, { 'name': 'Ctl1', 'input_args': args.ctl1 } ] if not unary_control: mapping_superstages.append({ 'name': 'Ctl2', 'input_args': args.ctl2 }) mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) # mapping_output_folder = resolve_folder( # output_project, output_folder + '/' + mapping_applet.name) mapping_output_folder = mapping_applet.name reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder # in the first pass create the mapping stage id's so we can use JBOR's # to link inputs for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') mapped_stage_id = workflow.add_stage(mapping_applet, name='Map %s' % (superstage_name), folder=mapping_output_folder) mapping_superstage.update({'map_stage_id': mapped_stage_id}) # in the second pass populate the stage inputs and build other stages rep1_stage_id = next( ss.get('map_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1') for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') superstage_id = mapping_superstage.get('map_stage_id') if mapping_superstage.get('input_args') or blank_workflow: mapping_stage_input = {} if superstage_name != "Rep1": mapping_stage_input.update({ 'reference_tar': dxpy.dxlink({ 'stage': rep1_stage_id, 'inputField': 'reference_tar' }) }) else: if args.reference: mapping_stage_input.update({ 'reference_tar': dxpy.dxlink(reference_tar.get_id()) }) if not blank_workflow: for arg_index, input_arg in enumerate( mapping_superstage['input_args'] ): #read pairs assumed be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update( {'reads%d' % (arg_index + 1): reads}) # this is now done in the first pass loop above # mapped_stage_id = workflow.add_stage( # mapping_applet, # name='Map %s' %(superstage_name), # folder=mapping_output_folder, # stage_input=mapping_stage_input # ) # mapping_superstage.update({'map_stage_id': mapped_stage_id}) workflow.update_stage(superstage_id, stage_input=mapping_stage_input) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' % (superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': superstage_id, 'outputField': 'mapped_reads' }), 'paired_end': dxpy.dxlink({ 'stage': superstage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update( {'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage(xcor_applet, name='Xcor %s' % (superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'filtered_bam' }), 'paired_end': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file' }) exp_rep1_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file' }) exp_rep2_ta = 
dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file' }) exp_rep2_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file' }) ctl_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file' }) if unary_control: ctl_rep2_ta = ctl_rep1_ta else: ctl_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file' }) rep1_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end' }) rep2_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end' }) else: #skipped the mapping, so just bring in the inputs from arguments if not blank_workflow: exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) else: exp_rep1_ta = None exp_rep2_ta = None ctl_rep1_ta = None ctl_rep2_ta = None rep1_paired_end = args.rep1pe rep2_paired_end = args.rep2pe #here we need to calculate the cc scores files, because we're only being supplied tagAligns #if we had mapped everything above we'd already have a handle to the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) # xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name) xcor_output_folder = xcor_only_applet.name xcor_only_stages = [] exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep1_ta, 'paired_end': rep1_paired_end }) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink({ 'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file' }) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep2_ta, 'paired_end': rep2_paired_end }) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = dxpy.dxlink({ 'stage': exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file' }) encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id()) encode_macs2_stages = [] # peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name) peaks_output_folder = encode_macs2_applet.name macs2_stage_input = { 'rep1_ta': exp_rep1_ta, 'rep2_ta': exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta': ctl_rep2_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_xcor': exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)), 'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)) } if genomesize: macs2_stage_input.update({'genomesize': genomesize}) if chrom_sizes: macs2_stage_input.update({'chrom_sizes': chrom_sizes}) encode_macs2_stage_id = workflow.add_stage(encode_macs2_applet, name='ENCODE Peaks', 
folder=peaks_output_folder, stage_input=macs2_stage_input) encode_macs2_stages.append({ 'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id }) if run_idr: encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id()) encode_spp_stages = [] # idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name) idr_peaks_output_folder = encode_spp_applet.name PEAKS_STAGE_NAME = 'SPP Peaks' peaks_stage_input = { 'rep1_ta': exp_rep1_ta, 'rep2_ta': exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta': ctl_rep2_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_xcor': exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'idr_peaks': True } if chrom_sizes: peaks_stage_input.update({'chrom_sizes': chrom_sizes}) else: peaks_stage_input.update({ 'chrom_sizes': dxpy.dxlink({ 'stage': encode_macs2_stage_id, 'inputField': 'chrom_sizes' }) }) encode_spp_stage_id = workflow.add_stage( encode_spp_applet, name=PEAKS_STAGE_NAME, folder=idr_peaks_output_folder, stage_input=peaks_stage_input) encode_spp_stages.append({ 'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id }) idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id()) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] # idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name) idr_output_folder = idr_applet.name if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks' }) }) idr_stages.append({ 'name': 'IDR True Replicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks' }) }) idr_stages.append({ 'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 
'rep2_peaks' }) }) idr_stages.append({ 'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks' }) }) idr_stages.append({ 'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id }) final_idr_stage_input = { 'reps_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks' }), 'r1pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'r2pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'pooledpr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'rep1_signal': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_fc_signal' }), 'rep2_signal': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_fc_signal' }), 'pooled_signal': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_fc_signal' }) } if blacklist: final_idr_stage_input.update({'blacklist': blacklist}) if chrom_sizes: final_idr_stage_input.update({'chrom_sizes': chrom_sizes}) else: final_idr_stage_input.update({ 'chrom_sizes': dxpy.dxlink({ 'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes' }) }) idr_stage_id = workflow.add_stage( encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input=final_idr_stage_input, ) idr_stages.append({ 'name': 'Final IDR peak calls', 'stage_id': idr_stage_id }) if target_type == 'histone': overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME, applet_project.get_id()) overlap_peaks_stages = [] for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']: if peaktype == 'narrowpeaks': as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as)) peak_type_extension = 'narrowPeak' elif peaktype == 'gappedpeaks': as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as)) peak_type_extension = 'gappedPeak' elif peaktype == 'broadpeaks': as_file = dxpy.dxlink(resolve_file(args.broadpeak_as)) peak_type_extension = 'broadPeak' overlap_peaks_stage_input = { 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_%s' % (peaktype) }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_%s' % (peaktype) }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for 
ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_%s' % (peaktype) }), 'pooledpr1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr1_%s' % (peaktype) }), 'pooledpr2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr2_%s' % (peaktype) }), 'as_file': as_file, 'peak_type': peak_type_extension, 'prefix': 'final', 'rep1_signal': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_fc_signal' }), 'rep2_signal': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_fc_signal' }), 'pooled_signal': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_fc_signal' }) } if chrom_sizes: overlap_peaks_stage_input.update({'chrom_sizes': chrom_sizes}) else: overlap_peaks_stage_input.update({ 'chrom_sizes': dxpy.dxlink({ 'stage': encode_macs2_stage_id, 'inputField': 'chrom_sizes' }) }) overlap_peaks_stage_id = workflow.add_stage( overlap_peaks_applet, name='Final %s' % (peaktype), folder=peaks_output_folder, stage_input=overlap_peaks_stage_input) overlap_peaks_stages.append({ 'name': 'Final %s' % (peaktype), 'stage_id': overlap_peaks_stage_id }) if args.yes: if args.debug: job_id = workflow.run( {}, folder=output_folder, priority='high', debug={'debugOn': ['AppInternalError', 'AppError']}, delay_workspace_destruction=True, allow_ssh=['255.255.255.255']) else: job_id = workflow.run({}, folder=output_folder, priority='normal') logging.info("Running as job %s" % (job_id))
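# Hedged sketch, not part of the pipelines above: workflow.run() takes the same arguments used in the launch blocks above and returns a DXAnalysis handler, so the value logged as "job %s" above is really an analysis ID. `wf` is assumed to be an existing dxpy.DXWorkflow handler.
import dxpy

def launch(wf, output_folder='/', debug=False):
    if debug:
        analysis = wf.run({}, folder=output_folder, priority='high',
                          debug={'debugOn': ['AppInternalError', 'AppError']},
                          delay_workspace_destruction=True)
    else:
        analysis = wf.run({}, folder=output_folder, priority='normal')
    return analysis.get_id()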
def main(): args = get_args() ## resolve projects project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME) print 'Project: ' + project.describe()['name'] print 'Experiment to analyze: ' + args.experiment if not project_has_folder(project, '/' + args.experiment): project.new_folder('/' + args.experiment) #TODO get all replicate ids from encoded DB from ENCSR (args.experiment) #TODO error out if ENCSR not found, status not complete etc. if args.test: source_id = project.get_id() else: source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id() ## resolve replicates/fastq inputs paired = args.paired if not paired: if len(args.replicates) < 1: sys.exit( 'Need to have at least 1 replicate file (unpaired); use -r or --replicates' ) replicates = find_replicates(args.replicates, source_id, project, args.experiment, args.test) if not replicates: print "No replicates found in project: " + project.name print "Looking for " + ", ".join(args.replicates) sys.exit(1) dx_reps = {'reads': [dxpy.dxlink(r) for r in replicates]} rnames = '-'.join([r.split('.')[0] for r in args.replicates]) else: if len(args.pair1) < 1 or len(args.pair2) < 1: sys.exit( "Need to have at least 1 replicate in pair1 (--r1/--pair1) and pair2 (--r2/--pair2)" ) pair1reps = find_replicates(args.pair1, source_id, project, args.experiment, args.test) if not pair1reps: print "No replicates for pair1 found in project: " + project.name print "Looking for " + ", ".join(args.pair1) sys.exit(1) pair2reps = find_replicates(args.pair2, source_id, project, args.experiment, args.test) if not pair2reps: print "No replicates for pair2 found in project: " + project.name print "Looking for " + ", ".join(args.pair2) sys.exit(1) dx_reps = { 'pair1_reads': [dxpy.dxlink(r) for r in pair1reps], 'pair2_reads': [dxpy.dxlink(r) for r in pair2reps] } rnames = '-'.join([r.split('.')[0] for r in args.pair1 + args.pair2]) gender = args.gender organism = args.organism #TODO determine paired or gender from ENCSR metadata # Now create a new workflow spec_name = args.experiment + '-' + rnames title_root = 'dx_dna_me_' name_root = 'ENCODE Bismark DNA-ME pipeline: ' desc = 'The ENCODE Bismark pipeline for WGBS shotgun methylation analysis' if paired: title_root = title_root + '_paired_end' name_root = name_root + '(paired-end)' else: title_root = title_root + '_single_end' name_root = name_root + '(single-end)' if args.export: project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT, name_mode='exact', return_handler=False)['id'] wf = dxpy.new_dxworkflow(title=title_root, name=name_root, description=desc, folder=PUBLIC_FOLDER, project=project_id) else: project_id = project.get_id() wf = dxpy.new_dxworkflow( title='dx_dna_me_' + spec_name, name='ENCODE Bismark DNA-ME pipeline: ' + spec_name, description= 'The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment ' + args.experiment, folder='/' + args.experiment, project=project.get_id()) populate_workflow(wf, dx_reps, args.experiment, paired, gender, organism, project.id, args.export)
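# Hedged sketch of the dxpy.new_dxworkflow() call pattern used above; the project name and folder here are hypothetical placeholders, not taken from the pipelines.
import dxpy

project_id = dxpy.find_one_project(name='My Analysis Project',
                                   name_mode='exact',
                                   return_handler=False)['id']
wf = dxpy.new_dxworkflow(title='demo_workflow',
                         name='Demo workflow',
                         description='Minimal new_dxworkflow example',
                         project=project_id,
                         folder='/demo')
print(wf.get_id())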
def main(): args = get_args() output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' % (output_project.name)) output_folder = resolve_folder(output_project, args.outf) logging.debug('Using output folder %s' % (output_folder)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' % (applet_project.name)) workflow = dxpy.new_dxworkflow(name=args.name, title=args.title, description=WF_DESCRIPTION, project=output_project.get_id(), folder=output_folder) blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if not args.genomesize: genomesize = None else: genomesize = args.genomesize if not args.chrom_sizes: chrom_sizes = None else: chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes)) if not args.blacklist: blacklist = None else: blacklist = dxpy.dxlink(resolve_file(args.blacklist)) if not args.nomap: #a "superstage" is just a dict with a name, name(s) of input files, and then names and IDs of stages that process that input #each superstage here could be implemented as a stage in a more abstract workflow. That stage would then call the various applets that are separate #stages here. mapping_superstages = [ # the order of this list is important: subsequent code assumes reps come before controls { 'name': 'Rep1', 'input_args': args.rep1 }, { 'name': 'Rep2', 'input_args': args.rep2 }, { 'name': 'Ctl1', 'input_args': args.ctl1 } ] if not args.unary_control: mapping_superstages.append({ 'name': 'Ctl2', 'input_args': args.ctl2 }) mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) mapping_output_folder = resolve_folder( output_project, output_folder + '/' + mapping_applet.name) reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder # in the first pass create the mapping stage IDs so we can use JBORs # to link inputs for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') mapped_stage_id = workflow.add_stage(mapping_applet, name='Map %s' % (superstage_name), folder=mapping_output_folder) mapping_superstage.update({'map_stage_id': mapped_stage_id}) # in the second pass populate the stage inputs and build other stages rep1_stage_id = next( ss.get('map_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1') for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') superstage_id = mapping_superstage.get('map_stage_id') if mapping_superstage.get('input_args') or blank_workflow: mapping_stage_input = {} if superstage_name != "Rep1": mapping_stage_input.update({ 'reference_tar': dxpy.dxlink({ 'stage': rep1_stage_id, 'inputField': 'reference_tar' }) }) else: if args.reference: mapping_stage_input.update({ 'reference_tar': dxpy.dxlink(reference_tar.get_id()) }) if not blank_workflow: for arg_index, input_arg in enumerate( mapping_superstage['input_args'] ): #read pairs assumed to be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update( {'reads%d' % (arg_index + 1): reads}) # this is now done in the first pass loop above # mapped_stage_id = workflow.add_stage( # mapping_applet, # name='Map %s' %(superstage_name), # folder=mapping_output_folder, # stage_input=mapping_stage_input # ) # mapping_superstage.update({'map_stage_id': mapped_stage_id}) 
workflow.update_stage(superstage_id, stage_input=mapping_stage_input) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' % (superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': superstage_id, 'outputField': 'mapped_reads' }), 'paired_end': dxpy.dxlink({ 'stage': superstage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update( {'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage(xcor_applet, name='Xcor %s' % (superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'filtered_bam' }), 'paired_end': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file' }) exp_rep1_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file' }) exp_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file' }) exp_rep2_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file' }) ctl_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file' }) if not args.unary_control: ctl_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file' }) else: ctl_rep2_ta = ctl_rep1_ta rep1_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end' }) rep2_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end' }) else: #skipped the mapping, so just bring in the inputs from arguments exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) rep1_paired_end = args.rep1pe rep2_paired_end = args.rep2pe #here we need to calculate the cc scores files, because we're only being supplied tagAligns #if we had mapped everything above we'd already have a handle to the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) xcor_output_folder = resolve_folder( output_project, output_folder + '/' + xcor_only_applet.name) xcor_only_stages = [] exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep1_ta, 'paired_end': rep1_paired_end }) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink({ 'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file' }) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep2_ta, 'paired_end': rep2_paired_end }) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = 
dxpy.dxlink({ 'stage': exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file' }) encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id()) encode_spp_stages = [] idr_peaks_output_folder = resolve_folder( output_project, output_folder + '/' + encode_spp_applet.name) PEAKS_STAGE_NAME = 'SPP Peaks' peaks_stage_input = { 'rep1_ta': exp_rep1_ta, 'rep2_ta': exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta': ctl_rep2_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_xcor': exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'idr_peaks': args.idr } if chrom_sizes: peaks_stage_input.update({'chrom_sizes': chrom_sizes}) encode_spp_stage_id = workflow.add_stage(encode_spp_applet, name=PEAKS_STAGE_NAME, folder=idr_peaks_output_folder, stage_input=peaks_stage_input) encode_spp_stages.append({ 'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id }) encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id()) encode_macs2_stages = [] peaks_output_folder = resolve_folder( output_project, output_folder + '/' + encode_macs2_applet.name) macs2_stage_input = { 'rep1_ta': exp_rep1_ta, 'rep2_ta': exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta': ctl_rep2_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_xcor': exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)), 'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)) } if genomesize: macs2_stage_input.update({'genomesize': genomesize}) if chrom_sizes: macs2_stage_input.update({'chrom_sizes': chrom_sizes}) else: macs2_stage_input.update({ 'chrom_sizes': dxpy.dxlink({ 'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes' }) }) encode_macs2_stage_id = workflow.add_stage(encode_macs2_applet, name='ENCODE Peaks', folder=peaks_output_folder, stage_input=macs2_stage_input) encode_macs2_stages.append({ 'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id }) if args.idr: # if args.idrversion == "1": # idr_applet = find_applet_by_name(IDR_APPLET_NAME, applet_project.get_id()) # elif args.idrversion == "2": # idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id()) # else: # logging.error("Invalid IDR version: %s" %(args.idrversion)) # idr_applet = None idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id()) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] idr_output_folder = resolve_folder( output_project, output_folder + '/' + idr_applet.name) if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks' }) }) idr_stages.append({ 'name': 'IDR True Replicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, 
stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks' }) }) idr_stages.append({ 'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks' }) }) idr_stages.append({ 'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks' }) }) idr_stages.append({ 'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id }) stage_input = { 'reps_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks' }), 'r1pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'r2pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'pooledpr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)) } if blacklist: stage_input.update({'blacklist': blacklist}) if chrom_sizes: stage_input.update({'chrom_sizes': chrom_sizes}) else: stage_input.update({ 'chrom_sizes': dxpy.dxlink({ 'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes' }) }) idr_stage_id = workflow.add_stage(encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input=stage_input) idr_stages.append({ 'name': 'Final IDR peak calls', 'stage_id': idr_stage_id }) if not (args.nomap): logging.debug("Mapping stages: %s" % (mapping_superstages)) else: logging.debug("xcor only stages: %s" % (xcor_only_stages)) # if not args.idronly: # logging.debug("Peak stages: %s" %(spp_stages)) logging.debug("Peak stages: %s" % (encode_spp_stages)) if args.idr: logging.debug("IDR stages: %s" % (idr_stages)) if args.yes: if args.debug: job_id = 
workflow.run( {}, priority='high', debug={'debugOn': ['AppInternalError', 'AppError']}, delay_workspace_destruction=True, allow_ssh=['255.255.255.255']) else: job_id = workflow.run({}, priority='high') logging.info("Running as job %s" % (job_id))
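# Hedged sketch of the two-pass pattern used in the function above: add the stages first so their IDs exist, then fill in inputs with update_stage(), including an inputField link that reuses another stage's input. `wf` and `applet` are assumed handlers; the field names are hypothetical.
import dxpy

def two_pass(wf, applet, reference_tar_id):
    rep1_id = wf.add_stage(applet, name='Map Rep1')
    rep2_id = wf.add_stage(applet, name='Map Rep2')
    wf.update_stage(rep1_id, stage_input={
        'reference_tar': dxpy.dxlink(reference_tar_id)})
    # Rep2 reuses Rep1's reference_tar *input* via an inputField link.
    wf.update_stage(rep2_id, stage_input={
        'reference_tar': dxpy.dxlink({'stage': rep1_id,
                                      'inputField': 'reference_tar'})})
    return rep1_id, rep2_id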
def main(): args = get_args() output_project = resolve_project(args.outp, 'w') logging.info('Found output project %s' %(output_project.name)) output_folder = resolve_folder(output_project, args.outf) logging.info('Using output folder %s' %(output_folder)) applet_project = resolve_project(args.applets, 'r') logging.info('Found applet project %s' %(applet_project.name)) workflow = dxpy.new_dxworkflow( title=WF_TITLE, name=args.name, description=WF_DESCRIPTION, project=output_project.get_id(), folder=output_folder) blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) #this whole strategy is fragile and unsatisfying #subsequent code assumes reps come before controls #a "superstage" is just a dict with a name, name(s) of input files, and then names and IDs of stages that process that input #each superstage here could be implemented as a stage in a more abstract workflow. That stage would then call the various applets that are separate #stages here. mapping_superstages = [ {'name': 'Rep1', 'input_args': args.rep1}, {'name': 'Rep2', 'input_args': args.rep2}, {'name': 'Ctl1', 'input_args': args.ctl1}, {'name': 'Ctl2', 'input_args': args.ctl2} # {'name': 'Pooled Reps', 'input_args': (args.rep1 and args.rep2)}, # {'name': 'Pooled Controls', 'input_args': (args.ctl1 and args.ctl2)} ##idea is to create a "stub" stage and then populate its input with the output of the pool stage, defined below ] mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) mapping_output_folder = resolve_folder(output_project, output_folder + '/' + mapping_applet.name) reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') if mapping_superstage.get('input_args') or blank_workflow: if blank_workflow: mapping_stage_input = None else: mapping_stage_input = {'reference_tar' : dxpy.dxlink(reference_tar.get_id())} for arg_index,input_arg in enumerate(mapping_superstage['input_args']): #read pairs assumed to be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update({'reads%d' %(arg_index+1): reads}) mapped_stage_id = workflow.add_stage( mapping_applet, name='Map %s' %(superstage_name), folder=mapping_output_folder, stage_input=mapping_stage_input, instance_type=args.instance_type ) mapping_superstage.update({'map_stage_id': mapped_stage_id}) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' %(superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': mapped_stage_id, 'outputField': 'mapped_reads'}), 'paired_end': dxpy.dxlink({'stage': mapped_stage_id, 'outputField': 'paired_end'}) }, instance_type=args.instance_type ) mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage( xcor_applet, name='Xcor %s' %(superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}), 'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}) }, instance_type=args.instance_type ) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) spp_applet = find_applet_by_name(SPP_APPLET_NAME, 
applet_project.get_id()) spp_stages = [] peaks_output_folder = resolve_folder(output_project, output_folder + '/' + spp_applet.name) if (args.rep1 and args.ctl1) or blank_workflow: rep1_spp_stage_id = workflow.add_stage( spp_applet, name='Peaks Rep1', folder=peaks_output_folder, stage_input={ 'experiment' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file'}), 'control': dxpy.dxlink( {'stage' : next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file'}), 'xcor_scores_input': dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file'}) }, instance_type=args.instance_type ) spp_stages.append({'name': 'Peaks Rep1', 'stage_id': rep1_spp_stage_id}) if (args.rep2 and args.ctl2) or blank_workflow: rep2_spp_stage_id = workflow.add_stage( spp_applet, name='Peaks Rep2', folder=peaks_output_folder, stage_input={ 'experiment' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file'}), 'control': dxpy.dxlink( {'stage' : next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file'}), 'xcor_scores_input': dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file'}) }, instance_type=args.instance_type ) spp_stages.append({'name': 'Peaks Rep2', 'stage_id': rep2_spp_stage_id}) encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id()) encode_spp_stages = [] if args.idr: idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name) if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: encode_spp_stage_id = workflow.add_stage( encode_spp_applet, name='Peaks for IDR', folder=idr_peaks_output_folder, stage_input={ 'rep1_ta' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file'}), 'rep2_ta' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file'}), 'ctl1_ta': dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file'}), 'ctl2_ta' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file'}), 'rep1_xcor' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file'}), 'rep2_xcor' : dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file'}), 'paired_end': dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end'}) #here we're assuming if rep1 is PE it's a PE experiment - need better error checking }, instance_type=args.instance_type ) encode_spp_stages.append({'name': 'Peaks for IDR', 'stage_id': encode_spp_stage_id}) idr_applet = find_applet_by_name(IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] if args.idr: idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name) if 
(args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'pooled_peaks'}) }, instance_type=args.instance_type ) idr_stages.append({'name': 'IDR True Replicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep1pr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep1pr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep1_peaks'}) }, instance_type=args.instance_type ) idr_stages.append({'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep2pr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep2pr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'rep2_peaks'}) }, instance_type=args.instance_type ) idr_stages.append({'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'pooledpr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'pooledpr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == 'Peaks for IDR'), 'outputField': 'pooled_peaks'}) }, instance_type=args.instance_type ) idr_stages.append({'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id}) logging.debug("Mapping stages: %s" %(mapping_superstages)) logging.debug("Peak stages: %s" %(spp_stages)) logging.debug("Peaks for IDR stages: %s" %(encode_spp_stages)) logging.debug("IDR stages: %s" %(idr_stages))
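# The stage lookups above repeat a next()-over-list idiom; a small helper sketch that captures it. Like the inline generator expressions, it raises StopIteration if no stage dict carries the requested name.
def stage_id_by_name(stages, name):
    return next(ss['stage_id'] for ss in stages if ss['name'] == name)

# e.g. dxpy.dxlink({'stage': stage_id_by_name(idr_stages, 'IDR True Replicates'),
#                   'outputField': 'IDR_peaks'})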
def main(): args = get_args() output_project = resolve_project(args.outp, 'w') logging.info('Found output project %s' % (output_project.name)) output_folder = resolve_folder(output_project, args.outf) logging.info('Using output folder %s' % (output_folder)) applet_project = resolve_project(args.applets, 'r') logging.info('Found applet project %s' % (applet_project.name)) workflow = dxpy.new_dxworkflow(name=args.name, title=args.title, description=WF_DESCRIPTION, project=output_project.get_id(), folder=output_folder) blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if not args.nomap: #a "superstage" is just a dict with a name, name(s) of input files, and then names and IDs of stages that process that input #each superstage here could be implemented as a stage in a more abstract workflow. That stage would then call the various applets that are separate #stages here. mapping_superstages = [ { 'name': 'Rep1', 'input_args': args.rep1 }, { 'name': 'Rep2', 'input_args': args.rep2 }, { 'name': 'Ctl1', 'input_args': args.ctl1 }, { 'name': 'Ctl2', 'input_args': args.ctl2 } # {'name': 'Pooled Reps', 'input_args': (args.rep1 and args.rep2)}, # {'name': 'Pooled Controls', 'input_args': (args.ctl1 and args.ctl2)} ##idea is to create a "stub" stage and then populate its input with the output of the pool stage, defined below ] mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) mapping_output_folder = resolve_folder( output_project, output_folder + '/' + mapping_applet.name) reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') if mapping_superstage.get('input_args') or blank_workflow: if blank_workflow: if args.reference: mapping_stage_input = { 'reference_tar': dxpy.dxlink(reference_tar.get_id()) } else: mapping_stage_input = None else: mapping_stage_input = { 'reference_tar': dxpy.dxlink(reference_tar.get_id()) } for arg_index, input_arg in enumerate( mapping_superstage['input_args'] ): #read pairs assumed to be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update( {'reads%d' % (arg_index + 1): reads}) mapped_stage_id = workflow.add_stage( mapping_applet, name='Map %s' % (superstage_name), folder=mapping_output_folder, stage_input=mapping_stage_input) mapping_superstage.update({'map_stage_id': mapped_stage_id}) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' % (superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': mapped_stage_id, 'outputField': 'mapped_reads' }), 'paired_end': dxpy.dxlink({ 'stage': mapped_stage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update( {'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage(xcor_applet, name='Xcor %s' % (superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'filtered_bam' }), 'paired_end': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 
'Rep1'), 'outputField': 'tagAlign_file' }) exp_rep1_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file' }) exp_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file' }) exp_rep2_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file' }) ctl_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file' }) ctl_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file' }) rep1_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end' }) rep2_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end' }) else: #skipped the mapping, so just bring in the inputs from arguments exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) rep1_paired_end = args.rep1pe rep2_paired_end = args.rep2pe #here we need to calculate the cc scores files, because we're only being supplied tagAligns #if we had mapped everything above we'd already have a handle to the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) xcor_output_folder = resolve_folder( output_project, output_folder + '/' + xcor_only_applet.name) xcor_only_stages = [] exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep1_ta, 'paired_end': rep1_paired_end }) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink({ 'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file' }) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep2_ta, 'paired_end': rep2_paired_end }) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = dxpy.dxlink({ 'stage': exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file' }) encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id()) encode_macs2_stages = [] peaks_output_folder = resolve_folder( output_project, output_folder + '/' + encode_macs2_applet.name) if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: encode_macs2_stage_id = workflow.add_stage( encode_macs2_applet, name='ENCODE Peaks', folder=peaks_output_folder, stage_input={ 'rep1_ta': exp_rep1_ta, 'rep2_ta': exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta': ctl_rep2_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_xcor': exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)), 'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)), 'genomesize': args.genomesize }) 
encode_macs2_stages.append({ 'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id }) #new applet here, similar to IDR, to do naive peak processing if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME, applet_project.get_id()) overlap_peaks_stages = [] for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']: if peaktype == 'narrowpeaks': as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as)) peak_type_extension = 'narrowPeak' elif peaktype == 'gappedpeaks': as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as)) peak_type_extension = 'gappedPeak' elif peaktype == 'broadpeaks': as_file = dxpy.dxlink(resolve_file(args.broadpeak_as)) peak_type_extension = 'broadPeak' overlap_peaks_stage_id = workflow.add_stage( overlap_peaks_applet, name='Overlap %s' % (peaktype), folder=peaks_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_%s' % (peaktype) }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_%s' % (peaktype) }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_%s' % (peaktype) }), 'pooledpr1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr1_%s' % (peaktype) }), 'pooledpr2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr2_%s' % (peaktype) }), 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'as_file': as_file, 'peak_type': peak_type_extension }) overlap_peaks_stages.append({ 'name': 'Overlap %s' % (peaktype), 'stage_id': overlap_peaks_stage_id }) #TODO - IDR on gapped and broad peaks if args.idr: idr_applet = find_applet_by_name(IDR_APPLET_NAME, applet_project.get_id()) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_peaks_output_folder = resolve_folder( output_project, output_folder + '/' + idr_applet.name) idr_output_folder = resolve_folder( output_project, output_folder + '/' + idr_applet.name) idr_stages = [] if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_narrowpeaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_narrowpeaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_narrowpeaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR True Replicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1pr1_narrowpeaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': 
next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1pr2_narrowpeaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_narrowpeaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2pr1_narrowpeaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2pr2_narrowpeaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_narrowpeaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr1_narrowpeaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr2_narrowpeaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_narrowpeaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id }) final_idr_stage_input = { 'reps_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks' }), 'r1pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'r2pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'pooledpr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)) } if args.blacklist: final_idr_stage_input.update( {'blacklist': dxpy.dxlink(resolve_file(args.blacklist))}) idr_stage_id = workflow.add_stage( encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input=final_idr_stage_input) idr_stages.append({ 'name': 'Final IDR peak calls', 'stage_id': idr_stage_id }) if not (args.nomap): logging.debug("Mapping stages: %s" % (mapping_superstages)) else: logging.debug("xcor only stages: %s" % (xcor_only_stages)) logging.debug("Peaks for ENCODE stages: %s" % (encode_macs2_stages)) logging.debug("Peak overlap stages: %s" % (overlap_peaks_stages)) if args.idr: logging.debug("IDR stages: %s" % (idr_stages)) if args.yes: if args.debug: job_id = workflow.run( {}, priority='high', debug={'debugOn': ['AppInternalError', 'AppError']}, 
delay_workspace_destruction=True, allow_ssh=['255.255.255.255']) else: job_id = workflow.run({}, priority='high') logging.info("Running as job %s" % (job_id))
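# Hedged sketch of the three dxpy.dxlink() forms used throughout the functions above; the IDs and field names here are hypothetical placeholders.
import dxpy

plain_link = dxpy.dxlink('file-xxxx')                 # a concrete data object
output_link = dxpy.dxlink({'stage': 'stage-xxxx',     # a stage's output field
                           'outputField': 'IDR_peaks'})
input_link = dxpy.dxlink({'stage': 'stage-xxxx',      # reuse a stage's input field
                          'inputField': 'chrom_sizes'})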
def main(): args = get_args() if len(args.replicates) < 1: sys.exit('Need to have at least 1 replicate file.') project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME) print 'Project: ' + project.describe()['name'] print 'Experiment to analyze: ' + args.experiment if not project_has_folder(project, '/' + args.experiment): project.new_folder('/' + args.experiment) #TODO get all replicate ids from encoded DB from ENCSR (args.experiment) #TODO error out if ENCSR not found, status not complete etc. if args.test: source_id = project.get_id() else: source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id() replicates = [] for rep in args.replicates: dx_rep = dxpy.find_data_objects(classname='file', name=rep, name_mode='exact', project=source_id, return_handler=False) replicates.extend(dx_rep) if not args.test: replicates = copy_files(replicates, project.get_id(), "/" + args.experiment) if not replicates: print "No replicates found in project: " + project.name print "Looking for " + ", ".join(args.replicates) sys.exit(1) inputs = {'rnd_seed': 12345} inputs['paired'] = args.paired inputs['gender'] = args.gender inputs['organism'] = args.organism inputs['library_id'] = args.library inputs['nthreads'] = args.nthreads #TODO determine paired or gender from ENCSR metadata # Now create a new workflow inputs['spec_name'] = args.experiment + '-' + '-'.join( [r.split('.')[0] for r in args.replicates]) title_root = 'dx_long_rna_seq_' name_root = 'ENCODE Long RNA Seq: ' desc = 'The ENCODE RNA Seq pipeline for long RNAs' if args.paired: title_root = title_root + '_paired_end ' name_root = name_root + '(paired-end) ' inputs['stranded'] = True else: title_root = title_root + '_single_end ' name_root = name_root + '(single-end) ' inputs['stranded'] = False if args.export: project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT, name_mode='exact', return_handler=False)['id'] wf = dxpy.new_dxworkflow(title=title_root, name=name_root, description=desc, folder=PUBLIC_FOLDER, project=project_id) else: project_id = project.get_id() wf = dxpy.new_dxworkflow(title=title_root + inputs['spec_name'], name=name_root + inputs['spec_name'], description=desc + ' for experiment: ' + args.experiment, folder='/' + args.experiment, project=project.get_id()) populate_workflow(wf, replicates, args.experiment, inputs, project.id, args.export)
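# Hedged sketch of the replicate-resolution pattern above: look up files by exact name in a source project and turn the results into input links. The project ID and file names here are hypothetical.
import dxpy

def find_file_links(names, project_id):
    links = []
    for name in names:
        for result in dxpy.find_data_objects(classname='file', name=name,
                                             name_mode='exact',
                                             project=project_id,
                                             return_handler=False):
            # with return_handler=False each result is a dict carrying
            # 'id' and 'project' keys
            links.append(dxpy.dxlink(result['id'], result['project']))
    return links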
def build_workflow(): if parameters["folder_provided"] == "false": wf = dxpy.new_dxworkflow( name='WARDEN_workflow', description='RNA-SEQ Workflow', output_folder=parameters["Output"], ) else: wf = dxpy.new_dxworkflow( name='WARDEN_workflow', description='RNA-SEQ Workflow', ) wf_outputs = [] fastqc_applet = dxpy.search.find_one_data_object(classname="applet", name=app_names["fastqc"], state="closed", return_handler=True) star_applet = dxpy.search.find_one_data_object(classname="applet", name=app_names["star"], state="closed", return_handler=True) combine_sj_tab_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["combine_sj_out"], state="closed", return_handler=True) sort_bam_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["sort_bam"], state="closed", return_handler=True) htseq_applet = dxpy.search.find_one_data_object(classname="applet", name=app_names["htseq"], state="closed", return_handler=True) genome_cov_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["genome_coverage"], state="closed", return_handler=True) bigwig_applet = dxpy.search.find_one_data_object(classname="applet", name=app_names["bigwig"], state="closed", return_handler=True) combine_counts_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["combine_counts"], state="closed", return_handler=True) combine_flagstat_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["combine_flagstat"], state="closed", return_handler=True) limma_applet = dxpy.search.find_one_data_object(classname="applet", name=app_names["limma"], state="closed", return_handler=True) simple_DE_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["simple_DE"], state="closed", return_handler=True) bw_viewer_applet = dxpy.search.find_one_data_object( classname="applet", name=app_names["bw_viewer"], state="closed", return_handler=True) sample_num = 0 htseq_results = [] bigwig_files = [] flagstat_files_arr = [] index_project, index_id = parameters["index_file"].split(":") gtf_project, gtf_id = parameters["gtf_file"].split(":") genome_length_project, genome_length_id = parameters[ "genome_sizes_file"].split(":") gene_length_project, gene_length_id = parameters["gene_length_file"].split( ":") fpkm_results = [] fpkm_log2_results = [] sj_out_files_arr = [] star_alignment_opts = { "outSAMunmapped": parameters["outSAMunmapped"], "outSAMattributes": parameters["outSAMattributes"], "outFilterMultimapNmax": int(parameters["outFilterMultimapNmax"]), "outFilterMismatchNmax": int(parameters["outFilterMismatchNmax"]), "alignIntronMax": int(parameters["alignIntronMax"]), "outSAMstrandField": parameters["outSAMstrandField"], "chimSegmentMin": int(parameters["chimSegmentMin"]), "sjdbOverhang": int(parameters["sjdbOverhang"]), "chimJunctionOverhangMin": int(parameters["chimJunctionOverhangMin"]), "subsample_target": int(parameters["STAR_subsample_n_reads"]), } if parameters["two_pass_alignment"] == 'true': for sample_name in samples: forward_id = samples[sample_name][0] forward_link = dxpy.dxlink(forward_id) align_input = {} align_input["first_pass"] = True align_input.update(star_alignment_opts) align_input["read_file1"] = forward_link star_instance = parameters["star_instance"] align_input["mark_duplicates"] = False align_input["generate_transcriptome_BAM"] = False align_input["star_index_archive"] = dxpy.dxlink({ "project": index_project, "id": index_id }) if parameters["sjdbFileChrStartEnd"] != "null" and 
parameters[ "sjdbFileChrStartEnd"] != '': sjdbFileChrStartEnd_project, sjdbFileChrStartEnd_id = parameters[ "sjdbFileChrStartEnd"].split(":") align_input["sjdbFileChrStartEnd"] = dxpy.dxlink({ "project": sjdbFileChrStartEnd_project, "id": sjdbFileChrStartEnd_id }) if parameters["indexed_with_gtf"] != "true": align_input["transcriptome_gtf"] = dxpy.dxlink({ "project": gtf_project, "id": gtf_id }) align_input["output_prefix"] = sample_name align_stage_id = "" if samples[sample_name][1] != "-": reverse_id = samples[sample_name][1] reverse_link = dxpy.dxlink(reverse_id) align_input["read_file2"] = reverse_link first_align_stage_id = wf.add_stage( star_applet, stage_input=align_input, instance_type=star_instance, folder="ALIGN_Pass1", name=sample_name + ":ALIGN_Pass1") else: first_align_stage_id = wf.add_stage( star_applet, stage_input=align_input, instance_type=star_instance, folder="ALIGN_Pass1", name=sample_name + ":ALIGN_Pass1") sj_out_files_arr.append( dxpy.dxlink({ "stage": first_align_stage_id, "outputField": "sj_tab_out" })) combine_sj_out_input = {"sj_out_files": sj_out_files_arr} combine_sj_out_stage_pass1_id = wf.add_stage( combine_sj_tab_applet, stage_input=combine_sj_out_input, instance_type="azure:mem2_ssd1_x1", name="COMBINE SJ OUT PASS1", folder="COMBINED_JUNCTIONS_PASS1") parameters["pass1_sj_out"] = dxpy.dxlink({ "stage": combine_sj_out_stage_pass1_id, "outputField": "combined_sj_out" }) for sample_name in samples: forward_id = samples[sample_name][0] forward_link = dxpy.dxlink(forward_id) if parameters["run_FastQC"] == 'true': forward_input = {"fastq_input": forward_link} fq_stage_id = wf.add_stage(fastqc_applet, stage_input=forward_input, instance_type="azure:mem2_ssd1_x2", folder="FASTQC", name=sample_name + ":Forward FASTQC") wf_outputs += [{ "name": sample_name + "_forward_fastqc_html", "class": "file", "outputSource": { "$dnanexus_link": { "stage": fq_stage_id, "outputField": "html_file" } } }, { "name": sample_name + "_forward_fastqc_zip", "class": "file", "outputSource": { "$dnanexus_link": { "stage": fq_stage_id, "outputField": "zip_file" } } }] align_input = {} align_input.update(star_alignment_opts) mark_duplicates = parameters["mark_duplicates"] if mark_duplicates == "false": mark_duplicates = False else: mark_duplicates = True align_input["mark_duplicates"] = mark_duplicates if parameters["generate_transcriptome_BAM"] == "true": align_input["generate_transcriptome_BAM"] = True else: align_input["generate_transcriptome_BAM"] = False star_instance = parameters["star_instance"] align_input["read_file1"] = forward_link align_input["star_index_archive"] = dxpy.dxlink({ "project": index_project, "id": index_id }) if parameters["indexed_with_gtf"] != "true": align_input["transcriptome_gtf"] = dxpy.dxlink({ "project": gtf_project, "id": gtf_id }) if "pass1_sj_out" in parameters: align_input["sjdbFileChrStartEnd"] = parameters["pass1_sj_out"] elif parameters["sjdbFileChrStartEnd"] != "null" and parameters[ "sjdbFileChrStartEnd"] != '': sjdbFileChrStartEnd_project, sjdbFileChrStartEnd_id = parameters[ "sjdbFileChrStartEnd"].split(":") align_input["sjdbFileChrStartEnd"] = dxpy.dxlink({ "project": sjdbFileChrStartEnd_project, "id": sjdbFileChrStartEnd_id }) align_input["output_prefix"] = sample_name align_stage_id = "" if samples[sample_name][1] != "-": reverse_id = samples[sample_name][1] reverse_link = dxpy.dxlink(reverse_id) if parameters["run_FastQC"] == 'true': rev_input = {"fastq_input": reverse_link} rev_fq_stage_id = wf.add_stage( fastqc_applet, stage_input=rev_input, 
instance_type="azure:mem2_ssd1_x2", folder="FASTQC", name=sample_name + ":Reverse FASTQC") wf_outputs += [{ "name": sample_name + "_reverse_fastqc_html", "class": "file", "outputSource": { "$dnanexus_link": { "stage": rev_fq_stage_id, "outputField": "html_file" } } }, { "name": sample_name + "_reverse_fastqc_zip", "class": "file", "outputSource": { "$dnanexus_link": { "stage": rev_fq_stage_id, "outputField": "zip_file" } } }] align_input["read_file2"] = reverse_link align_stage_id = wf.add_stage(star_applet, stage_input=align_input, instance_type=star_instance, folder="STAR", name=sample_name + ":ALIGN") else: align_stage_id = wf.add_stage(star_applet, stage_input=align_input, instance_type=star_instance, folder="STAR", name=sample_name + ":ALIGN") flagstat_files_arr.append( dxpy.dxlink({ "stage": align_stage_id, "outputField": "flagstat_out" })) sj_out_files_arr.append( dxpy.dxlink({ "stage": align_stage_id, "outputField": "sj_tab_out" })) wf_outputs += [ { "name": sample_name + "_star_bam", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "sorted_by_coord_bam" } } }, { "name": sample_name + "_star_log", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "log_final_out" } } }, { "name": sample_name + "_flagstat", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "flagstat_out" } } }, { "name": sample_name + "_star_splice_junctions", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "sj_tab_out" } } }, { "name": sample_name + "_star_chimeric_bam", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "chimeric_bam" } } }, { "name": sample_name + "_star_chimeric_junction", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "chimeric_junction" } } }, ] if parameters["generate_transcriptome_BAM"] == "true": wf_outputs += [ { "name": sample_name + "_star_transcriptome_bam", "class": "file", "outputSource": { "$dnanexus_link": { "stage": align_stage_id, "outputField": "to_transcriptome_bam" } } }, ] if parameters["generate_name_sorted_BAM"] == "true": sort_input = { "input_bam": dxpy.dxlink({ "stage": align_stage_id, "outputField": "sorted_by_coord_bam" }) } sort_stage_id = wf.add_stage(sort_bam_applet, stage_input=sort_input, instance_type="azure:mem2_ssd1_x2", name=sample_name + ":NAME SORT BAM", folder="STAR") wf_outputs += [ { "name": sample_name + "_name_sorted_bam", "class": "file", "outputSource": { "$dnanexus_link": { "stage": sort_stage_id, "outputField": "output_bam" } } }, ] htseq_input = { "input_bam": dxpy.dxlink({ "stage": sort_stage_id, "outputField": "output_bam" }) } htseq_input["order"] = "name" else: htseq_input = { "input_bam": dxpy.dxlink({ "stage": align_stage_id, "outputField": "sorted_by_coord_bam" }) } htseq_input["order"] = "pos" htseq_input["annotation_file"] = dxpy.dxlink({ "project": gtf_project, "id": gtf_id }) htseq_input["gene_length_file"] = dxpy.dxlink({ "project": gene_length_project, "id": gene_length_id }) htseq_input["prefix"] = sample_name htseq_input["strand"] = parameters["strandedness"] htseq_input["feature_type"] = parameters["feature_type"] htseq_input["id_attribute"] = parameters["id_attribute"] htseq_input["mode"] = parameters["mode"] htseq_input["nonunique"] = parameters["nonunique"] htseq_input["secondary_alignments"] = parameters[ "secondary_alignments"] 
htseq_input["supplementary_alignments"] = parameters[ "supplementary_alignments"] htseq_stage_id = wf.add_stage( htseq_applet, stage_input=htseq_input, instance_type=parameters["htseq_instance"], name=sample_name + ":HTSEQ COUNT", folder="HTSEQ") htseq_results.append( dxpy.dxlink({ "stage": htseq_stage_id, "outputField": "htseq_counts" })) wf_outputs += [ { "name": sample_name + "_htseqcounts", "class": "file", "outputSource": { "$dnanexus_link": { "stage": htseq_stage_id, "outputField": "htseq_counts" } } }, ] if parameters["id_attribute"] == "gene_name": fpkm_results.append((dxpy.dxlink({ "stage": htseq_stage_id, "outputField": "fpkm" }))) fpkm_log2_results.append((dxpy.dxlink({ "stage": htseq_stage_id, "outputField": "fpkm_log2" }))) wf_outputs += [ { "name": sample_name + "_fpkm", "class": "file", "outputSource": { "$dnanexus_link": { "stage": htseq_stage_id, "outputField": "fpkm" } } }, { "name": sample_name + "_fpkm_log2", "class": "file", "outputSource": { "$dnanexus_link": { "stage": htseq_stage_id, "outputField": "fpkm_log2" } } }, ] if parameters["run_coverage"] == 'true': gcb_input = {} gcb_input["input_bam"] = dxpy.dxlink({ "stage": align_stage_id, "outputField": "sorted_by_coord_bam" }) gcb_input["genome_sizes_file"] = dxpy.dxlink({ "project": genome_length_project, "id": genome_length_id }) gcb_input["strandedness"] = parameters["strandedness"] gcb_input["output_prefix"] = sample_name gcb_stage_id = wf.add_stage(genome_cov_applet, stage_input=gcb_input, instance_type="azure:mem3_ssd1_x8", name=sample_name + ":COVERAGE", folder="COVERAGE") bg2bw_all_input = {} bg2bw_all_input["bedgraph_file"] = dxpy.dxlink({ "stage": gcb_stage_id, "outputField": "all_coverage_file" }) bg2bw_all_input["genome_sizes_file"] = dxpy.dxlink({ "project": genome_length_project, "id": genome_length_id }) bg2bw_all_input["output_prefix"] = sample_name bg2bw_all_stage_id = wf.add_stage( bigwig_applet, stage_input=bg2bw_all_input, instance_type="azure:mem2_ssd1_x4", name=sample_name + ":BED To BW-ALL", folder="BIGWIG") bigwig_files.append( dxpy.dxlink({ "stage": bg2bw_all_stage_id, "outputField": "bigwig" })) wf_outputs += [ { "name": sample_name + "_all_bigwig", "class": "file", "outputSource": { "$dnanexus_link": { "stage": bg2bw_all_stage_id, "outputField": "bigwig" } } }, ] if parameters["strandedness"] != "no": bg2bw_pos_input = {} bg2bw_pos_input["bedgraph_file"] = dxpy.dxlink({ "stage": gcb_stage_id, "outputField": "pos_coverage_file" }) bg2bw_pos_input["genome_sizes_file"] = dxpy.dxlink({ "project": genome_length_project, "id": genome_length_id }) bg2bw_pos_input["output_prefix"] = sample_name bg2bw_pos_stage_id = wf.add_stage( bigwig_applet, stage_input=bg2bw_pos_input, instance_type="azure:mem2_ssd1_x4", name=sample_name + ":BED To BW-POS", folder="BIGWIG") wf_outputs += [ { "name": sample_name + "_pos_bigwig", "class": "file", "outputSource": { "$dnanexus_link": { "stage": bg2bw_pos_stage_id, "outputField": "bigwig" } } }, ] bg2bw_neg_input = {} bg2bw_neg_input["bedgraph_file"] = dxpy.dxlink({ "stage": gcb_stage_id, "outputField": "neg_coverage_file" }) bg2bw_neg_input["genome_sizes_file"] = dxpy.dxlink({ "project": genome_length_project, "id": genome_length_id }) bg2bw_neg_input["output_prefix"] = sample_name bg2bw_neg_stage_id = wf.add_stage( bigwig_applet, stage_input=bg2bw_neg_input, instance_type="azure:mem2_ssd1_x4", name=sample_name + ":BED To BW-NEG", folder="BIGWIG") wf_outputs += [ { "name": sample_name + "_neg_bigwig", "class": "file", "outputSource": { "$dnanexus_link": { "stage": 
bg2bw_neg_stage_id, "outputField": "bigwig" } } }, ] bigwig_files.append( dxpy.dxlink({ "stage": bg2bw_pos_stage_id, "outputField": "bigwig" })) bigwig_files.append( dxpy.dxlink({ "stage": bg2bw_neg_stage_id, "outputField": "bigwig" })) sample_num += 1 combine_input = { "count_files": htseq_results, "name_value": "htseq", "sample_files": [dxpy.dxlink(final_sample_list_id)] } combine_counts_stage_id = wf.add_stage(combine_counts_applet, stage_input=combine_input, instance_type="azure:mem2_ssd1_x1", name="COMBINE HTSEQ") wf_outputs += [ { "name": "combined_counts", "class": "file", "outputSource": { "$dnanexus_link": { "stage": combine_counts_stage_id, "outputField": "count_file" } } }, ] if parameters["id_attribute"] == "gene_name": combine_fpkm_input = { "count_files": fpkm_results, "name_value": "fpkm", "sample_files": [dxpy.dxlink(final_sample_list_id)] } combine_fpkm_stage_id = wf.add_stage( combine_counts_applet, stage_input=combine_fpkm_input, instance_type="azure:mem2_ssd1_x1", name="COMBINE FPKM") combine_fpkm_log2_input = { "count_files": fpkm_log2_results, "name_value": "fpkm.log2", "sample_files": [dxpy.dxlink(final_sample_list_id)] } combine_fpkm_log2_stage_id = wf.add_stage( combine_counts_applet, stage_input=combine_fpkm_log2_input, instance_type="azure:mem2_ssd1_x1", name="COMBINE FPKMlog2") wf_outputs += [ { "name": "combined_fpkm", "class": "file", "outputSource": { "$dnanexus_link": { "stage": combine_fpkm_stage_id, "outputField": "count_file" } } }, { "name": "combined_fpkm_log2", "class": "file", "outputSource": { "$dnanexus_link": { "stage": combine_fpkm_log2_stage_id, "outputField": "count_file" } } }, ] combine_flagstat_input = { "flagstat_files": flagstat_files_arr, "sample_list": dxpy.dxlink(final_sample_list_id) } combine_flagstat_stage_id = wf.add_stage( combine_flagstat_applet, stage_input=combine_flagstat_input, instance_type="azure:mem2_ssd1_x1", name="COMBINE FLAGSTAT", folder="STAR") wf_outputs += [ { "name": "combined_flagstat", "class": "file", "outputSource": { "$dnanexus_link": { "stage": combine_flagstat_stage_id, "outputField": "combined_flagstat" } } }, ] if parameters["BW_VIEWER"] != "None" and parameters[ "run_coverage"] == 'true': bw_project, bw_file = parameters["BW_VIEWER"].split(":") viewer_link = dxpy.dxlink({"project": bw_project, "id": bw_file}) bw_viewer_input = {"viewer": viewer_link, "bigwig_files": bigwig_files} bw_viewer_stage_id = wf.add_stage(bw_viewer_applet, stage_input=bw_viewer_input, instance_type="azure:mem2_ssd1_x1", name="BIGWIG_VIEWER", folder="BIGWIG") wf_outputs += [ { "name": "bw_viewer", "class": "record", "outputSource": { "$dnanexus_link": { "stage": bw_viewer_stage_id, "outputField": "viewer_bookmark" } } }, ] if parameters["limma_DE_viewer"] != "None": limma_viewer_project, limma_viewer_file = parameters[ "limma_DE_viewer"].split(":") limma_viewer_link = dxpy.dxlink({ "project": limma_viewer_project, "id": limma_viewer_file }) if parameters["run_limma"] == 'true' and parameters[ "limma_runnable"] == "true": limma_input = { "input_count_file": dxpy.dxlink({ "stage": combine_counts_stage_id, "outputField": "count_file" }), "sample_list_file": dxpy.dxlink(final_sample_list_id), "calcNormFactors_method": parameters["calcNormFactors_method"], "filter_count_type": parameters["filter_count_type"], "filter_count": int(parameters["filter_count"]), "p_value_adjust": parameters["p_value_adjust"], "contrasts_file": dxpy.dxlink(comparisons_limma_id) } if parameters["limma_DE_viewer"] != "None": limma_input["difex_viewer"] = 
limma_viewer_link limma_stage_id = wf.add_stage(limma_applet, stage_input=limma_input, instance_type="azure:mem1_ssd1_x4", name="LIMMA") wf_outputs += [ { "name": "limma_outfiles", "class": "array:file", "outputSource": { "$dnanexus_link": { "stage": limma_stage_id, "outputField": "out_files" } } }, { "name": "limma_viewer", "class": "record", "outputSource": { "$dnanexus_link": { "stage": limma_stage_id, "outputField": "viewer_bookmark" } } }, ] if parameters["run_simple_dif_ex"] == 'true': simple_DE_input = { "input_count_file": dxpy.dxlink({ "stage": combine_counts_stage_id, "outputField": "count_file" }), "sample_list_file": dxpy.dxlink(final_sample_list_id), "contrasts_file": dxpy.dxlink(comparisons_all_id) } if parameters["limma_DE_viewer"] != "None": simple_DE_input["difex_viewer"] = limma_viewer_link simple_DE_stage_id = wf.add_stage( simple_DE_applet, stage_input=simple_DE_input, instance_type="azure:mem1_ssd1_x4", name="SIMPLE DIFFERENTIAL_EXPRESSION") wf_outputs += [ { "name": "simple_DE_outfiles", "class": "array:file", "outputSource": { "$dnanexus_link": { "stage": simple_DE_stage_id, "outputField": "out_files" } } }, { "name": "simple_DE_viewer", "class": "record", "outputSource": { "$dnanexus_link": { "stage": simple_DE_stage_id, "outputField": "viewer_bookmark" } } }, ] wf.update(workflow_outputs=wf_outputs) wf.close() return wf.get_id()
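# NOTE (editor's sketch): the builder above repeats the same four-key output
# descriptor for every workflow-level output. A small helper in that spirit
# (the helper name is ours, not part of the script) keeps wf_outputs short.
def stage_output(name, stage_id, output_field, cls="file"):
    """Build one workflow_outputs entry that re-exports a stage's output."""
    return {
        "name": name,
        "class": cls,
        "outputSource": {
            "$dnanexus_link": {"stage": stage_id, "outputField": output_field}
        },
    }

# e.g. the per-sample STAR outputs would collapse to a comprehension:
# wf_outputs += [stage_output(sample_name + "_" + n, align_stage_id, f)
#                for n, f in [("star_bam", "sorted_by_coord_bam"),
#                             ("star_log", "log_final_out")]]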
def build_workflow(): wf = dxpy.new_dxworkflow(title='tcga_mc3_full_run', name='tcga_mc3_full_run', description='TCGA mc3 variant calling pipeline', project=args.project, folder=args.folder, properties={"git_revision": git_revision}) # variant calling tools pindel_applet = find_applet("pindel-tool") pindel_stage_id = wf.add_stage(pindel_applet) radia_applet = find_applet("radia-tool") radia_input = { "dnaNormalBam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBamFile" }), "dnaTumorBam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "fasta": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } radia_stage_id = wf.add_stage(radia_applet, stage_input=radia_input) somaticsniper_applet = find_applet("somaticsniper-tool") somaticsniper_input = { "normal": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBamFile" }), "tumor": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } somaticsniper_stage_id = wf.add_stage(somaticsniper_applet, stage_input=somaticsniper_input, instance_type="mem2_hdd2_x1") samtools_pileup_applet = find_applet("samtools-pileup-tool") samtools_pileup_normal_input = { "input1": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBamFile" }), "input1_index": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBaiFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } samtools_pileup_normal_stage_id = wf.add_stage( samtools_pileup_applet, stage_input=samtools_pileup_normal_input, instance_type="mem2_hdd2_x1") samtools_pileup_tumor_input = { "input1": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "input1_index": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBaiFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } samtools_pileup_tumor_stage_id = wf.add_stage( samtools_pileup_applet, stage_input=samtools_pileup_tumor_input, instance_type="mem2_hdd2_x2") muse_applet = find_applet("muse-tool") muse_input = { "tumor_bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "tumor_bai": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBaiFile" }), "normal_bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBamFile" }), "normal_bai": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBaiFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }), "dbsnp": dxpy.dxlink("file-Bj1V0400kF9Z3GqJY4ZbYbYj") } muse_stage_id = wf.add_stage(muse_applet, stage_input=muse_input) varscan_applet = find_applet("varscan-tool") varscan_input = { "normal_pileup": dxpy.dxlink({ "stage": samtools_pileup_normal_stage_id, "outputField": "pileup" }), "tumor_pileup": dxpy.dxlink({ "stage": samtools_pileup_tumor_stage_id, "outputField": "pileup" }) } varscan_stage_id = wf.add_stage(varscan_applet, stage_input=varscan_input, instance_type="mem2_hdd2_x2") mutect_applet = find_applet("mutect-tool") mutect_input = { "tumor_bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "tumor_bai": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBaiFile" }), "normal_bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "normalInputBamFile" }), "normal_bai": dxpy.dxlink({ 
"stage": pindel_stage_id, "inputField": "normalInputBaiFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }), "dbsnp": dxpy.dxlink("file-Bj1V0400kF9Z3GqJY4ZbYbYj"), "cosmic": dxpy.dxlink("file-Bk9g2kQ0kF9f9XG6VZf7VGKQ"), } mutect_stage_id = wf.add_stage(mutect_applet, stage_input=mutect_input) # fpfilter (somaticSniper, Varscan) fpfilter_applet = find_applet("fpfilter-tool") somatcisniper_fpfilter_input = { "vcf": dxpy.dxlink({ "stage": somaticsniper_stage_id, "outputField": "vcf" }), "bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } somaticsniper_fpfilter_stage_id = wf.add_stage( fpfilter_applet, stage_input=somatcisniper_fpfilter_input, name="fpfilter-tool(somaticSniper)", folder="fpfiltered") varscan_snp_fpfilter_input = { "vcf": dxpy.dxlink({ "stage": varscan_stage_id, "outputField": "snp_vcf" }), "bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } varscan_snp_fpfilter_stage_id = wf.add_stage( fpfilter_applet, stage_input=varscan_snp_fpfilter_input, name="fpfilter-tool(varscan SNP)", folder="fpfiltered") varscan_indel_fpfilter_input = { "vcf": dxpy.dxlink({ "stage": varscan_stage_id, "outputField": "indel_vcf" }), "bam": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "tumorInputBamFile" }), "reference": dxpy.dxlink({ "stage": pindel_stage_id, "inputField": "inputReferenceFile" }) } varscan_indel_fpfilter_stage_id = wf.add_stage( fpfilter_applet, stage_input=varscan_indel_fpfilter_input, name="fpfilter-tool(varscan INDEL)", folder="fpfiltered") # vcf_filter (All variant callers) vcf_filter_applet = find_applet("tcga-vcf-filter-tool") radia_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": radia_stage_id, "outputField": "filtered_output_vcf" }), "filterRejects": False } radia_vcf_filter_stage_id = wf.add_stage( vcf_filter_applet, stage_input=radia_vcf_filter_input, name="vcffilter-tool(radia)", folder="final_filtered") somaticsniper_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": somaticsniper_fpfilter_stage_id, "outputField": "annotated_output" }), "filterRejects": False } somaticsniper_vcf_filter_stage_id = wf.add_stage( vcf_filter_applet, stage_input=somaticsniper_vcf_filter_input, name="vcffilter-tool(somaticsniper)", folder="final_filtered") varscan_snp_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": varscan_snp_fpfilter_stage_id, "outputField": "annotated_output" }), "filterRejects": True } varscan_snp_vcf_filter_stage_id = wf.add_stage( vcf_filter_applet, stage_input=varscan_snp_vcf_filter_input, name="vcffilter-tool(varscan SNP)", folder="final_filtered") varscan_indel_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": varscan_indel_fpfilter_stage_id, "outputField": "annotated_output" }), "filterRejects": True } varscan_indel_vcf_filter_stage_id = wf.add_stage( vcf_filter_applet, stage_input=varscan_indel_vcf_filter_input, name="vcffilter-tool(varscan INDEL)", folder="final_filtered") muse_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": muse_stage_id, "outputField": "mutations" }), "filterRejects": False } muse_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=muse_vcf_filter_input, name="vcffilter-tool(muse)", folder="final_filtered") pindel_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": pindel_stage_id, 
"outputField": "outputSomaticVcf" }), "filterRejects": False } pindel_vcf_filter_stage_id = wf.add_stage( vcf_filter_applet, stage_input=pindel_vcf_filter_input, name="vcffilter-tool(pindel)", folder="final_filtered") mutect_vcf_filter_input = { "input_vcf": dxpy.dxlink({ "stage": mutect_stage_id, "outputField": "mutations" }), "filterRejects": True } mutect_vcf_filter_stage_id = wf.add_stage( vcf_filter_applet, stage_input=mutect_vcf_filter_input, name="vcffilter-tool(mutect)", folder="final_filtered") vcf_reheader_applet = find_applet("tcga-vcf-reheader") radia_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": radia_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "radia", "software_version": "1", "software_params": "--dnaNormalMinTotalBases 4 --dnaNormalMinAltBases 2 --dnaNormalBaseQual 10 --dnaNormalMapQual 10 --dnaTumorDescription TumorDNASample --dnaTumorMinTotalBases 4 --dnaTumorMinAltBases 2 --dnaTumorBaseQual 10 --dnaTumorMapQual 10 --dnaNormalMitochon=MT --dnaTumorMitochon=MT --genotypeMinDepth 2 --genotypeMinPct 0.100", "center": "ucsc.edu" } radia_vcf_reheader_stage_id = wf.add_stage( vcf_reheader_applet, stage_input=radia_vcf_reheader_input, name="vcf-reheader(radia)", folder="final_reheadered") """ sample_params = { "platform": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "platform"}), "participant_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "participant_uuid"}), "disease_code": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "disease_code"}), "normal_analysis_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_analysis_uuid"}), "normal_bam_name": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_bam_name"}), "normal_aliquot_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_aliquot_id"}), "normal_aliquot_barcode": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_aliquot_barcode"}), "tumor_analysis_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_analysis_uuid"}), "tumor_bam_name": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_bam_name"}), "tumor_aliquot_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_aliquot_uuid"}), "tumor_aliquot_barcode": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_aliquot_barcode"}) } """ somaticsniper_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": somaticsniper_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "somaticsniper", "software_version": "v1.0.5.0", "software_params": "-Q 40 -n NORMAL -q 1 -s 0.01 -r 0.001", "center": "wustl.edu" } #somaticsniper_vcf_reheader_input.update(sample_params) somaticsniper_vcf_reheader_stage_id = wf.add_stage( vcf_reheader_applet, stage_input=somaticsniper_vcf_reheader_input, name="vcf-reheader(somaticsniper)", folder="final_reheadered") varscan_snp_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": varscan_snp_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "varscan", "software_version": "2.3.9", "software_params": "--output-vcf 1 --min-coverage 3 --normal-purity 1 --p-value 0.99 --min-coverage-normal 8 --min-freq-for-hom 0.75 --min-var-freq 0.08 --somatic-p-value 0.05 --min-coverage-tumor 6 --tumor-purity 1", "center": "wustl.edu" } #varscan_snp_vcf_reheader_input.update(sample_params) varscan_snp_vcf_reheader_stage_id = wf.add_stage( 
vcf_reheader_applet, stage_input=varscan_snp_vcf_reheader_input, name="vcf-reheader(varscan SNP)", folder="final_reheadered") varscan_indel_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": varscan_indel_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "varscan", "software_version": "2.3.9", "software_params": "--output-vcf 1 --min-coverage 3 --normal-purity 1 --p-value 0.99 --min-coverage-normal 8 --min-freq-for-hom 0.75 --min-var-freq 0.08 --somatic-p-value 0.05 --min-coverage-tumor 6 --tumor-purity 1", "center": "wustl.edu" } #varscan_indel_vcf_reheader_input.update(sample_params) varscan_indel_vcf_reheader_stage_id = wf.add_stage( vcf_reheader_applet, stage_input=varscan_indel_vcf_reheader_input, name="vcf-reheader(varscan INDEL)", folder="final_reheadered") muse_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": muse_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "muse", "software_version": "v1.0rc", "software_params": "--mode wxs", "center": "mdanderson.org" } #muse_vcf_reheader_input.update(sample_params) muse_vcf_reheader_stage_id = wf.add_stage( vcf_reheader_applet, stage_input=muse_vcf_reheader_input, name="vcf-reheader(muse)", folder="final_reheadered") pindel_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": pindel_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "pindel", "software_version": "v0.2.5b8", "software_params": "--max_range_index 1 --window_size 5 --sequencing_error_rate 0.010000 --sensitivity 0.950000 --maximum_allowed_mismatch_rate 0.020000 --NM 2 --additional_mismatch 1 --min_perfect_match_around_BP 3 --min_inversion_size 50 --min_num_matched_bases 30 --balance_cutoff 0 --anchor_quality 0 --minimum_support_for_event 3 --report_long_insertions --report_duplications --report_inversions --report_breakpoints", "center": "wustl.edu" } #pindel_vcf_reheader_input.update(sample_params) pindel_vcf_reheader_stage_id = wf.add_stage( vcf_reheader_applet, stage_input=pindel_vcf_reheader_input, name="vcf-reheader(pindel)", folder="final_reheadered") mutect_vcf_reheader_input = { "input_vcf": dxpy.dxlink({ "stage": mutect_vcf_filter_stage_id, "outputField": "output_vcf" }), "software_name": "mutect", "software_version": "1.1.5", "software_params": "--initial_tumor_lod 4.0 --tumor_lod 10.0", "center": "broad.org" } mutect_vcf_reheader_stage_id = wf.add_stage( vcf_reheader_applet, stage_input=mutect_vcf_reheader_input, name="vcf-reheader(mutect)", folder="final_reheadered") return wf
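# NOTE (editor's sketch): the wiring above leans on one dxpy idiom worth
# calling out: dxpy.dxlink({"stage": ..., "inputField": ...}) makes a
# downstream stage reuse whatever object the upstream stage was *given*
# (here, the BAMs and reference bound to the pindel stage), so one runtime
# binding fans out to every caller. Minimal demo, assuming you are logged in;
# "applet-xxxx" and the input name "bam_in" are placeholders, not real IDs.
import dxpy

app = dxpy.DXApplet("applet-xxxx")  # placeholder applet ID
wf = dxpy.new_dxworkflow(title="inputField-link-demo")
first = wf.add_stage(app)
# Reference the *input* of the first stage, not one of its outputs:
shared_bam = dxpy.dxlink({"stage": first, "inputField": "bam_in"})
second = wf.add_stage(app, stage_input={"bam_in": shared_bam})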
return_handler=False) replicates = [dxpy.dxlink(r) for r in replicates] controls = dxpy.find_data_objects(classname='file', name='*.bam', name_mode='glob', project=project.get_id(), folder=CONTROLS_FOLDER, return_handler=False) controls = [dxpy.dxlink(c) for c in controls] else: if (len(args.replicates) < 1) or (len(args.controls) < 1): sys.exit( 'Need to have at least 1 replicate file and 1 control file.') project.new_folder(REPLICATES_FOLDER, True) project.new_folder(CONTROLS_FOLDER, True) replicates = copy_files(args.replicates, project, REPLICATES_FOLDER) controls = copy_files(args.controls, project, CONTROLS_FOLDER) if (len(replicates) < 1) or (len(controls) < 1): sys.exit('Need to have at least 1 replicate file and 1 control file.') # Now create a new workflow wf = dxpy.new_dxworkflow(title='dx_chip_seq', name='ENCODE ChIP-Seq 2.0', description='The ENCODE ChIP-Seq Pipeline 2.0', project=project.get_id()) populate_workflow(wf, replicates, controls, project.describe()['name'], args.sort_filter_and_remove_dups, args.duplicates_removed, args.gender, applets_project_id)
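# NOTE (editor's sketch): copy_files above is defined elsewhere in the
# script. Judging only from the call site, it clones the named files into a
# folder of the target project and returns links; a hedged, simplified guess
# at its behavior (name and semantics inferred, not confirmed by this excerpt):
import dxpy

def copy_files(file_ids, project, folder):
    """Clone each file into `folder` of `project`; return dxlinks to the copies."""
    links = []
    for fid in file_ids:
        handler = dxpy.get_handler(fid)
        if handler.describe()["project"] == project.get_id():
            links.append(dxpy.dxlink(handler))  # already in place; just link it
        else:
            links.append(dxpy.dxlink(handler.clone(project.get_id(), folder=folder)))
    return links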
def build_workflow(experiment, biorep_n, input_shield_stage_input, key): output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' % (output_project.name)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' % (applet_project.name)) mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (mapping_applet.name)) input_shield_applet = find_applet_by_name(INPUT_SHIELD_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (input_shield_applet.name)) workflow_output_folder = resolve_folder( output_project, args.outf + '/workflows/' + experiment.get('accession') + '/' + 'rep%d' % (biorep_n)) fastq_output_folder = resolve_folder( output_project, args.outf + '/fastqs/' + experiment.get('accession') + '/' + 'rep%d' % (biorep_n)) mapping_output_folder = resolve_folder( output_project, args.outf + '/raw_bams/' + experiment.get('accession') + '/' + 'rep%d' % (biorep_n)) if args.raw: workflow_title = 'Map %s rep%d to %s (no filter)' % ( experiment.get('accession'), biorep_n, args.assembly) workflow_name = 'ENCODE raw mapping pipeline' else: workflow_title = 'Map %s rep%d to %s and filter' % ( experiment.get('accession'), biorep_n, args.assembly) workflow_name = 'ENCODE mapping pipeline' if args.tag: workflow_title += ': %s' % (args.tag) workflow = dxpy.new_dxworkflow(title=workflow_title, name=workflow_name, project=output_project.get_id(), folder=workflow_output_folder) input_shield_stage_id = workflow.add_stage( input_shield_applet, name='Gather inputs %s rep%d' % (experiment.get('accession'), biorep_n), folder=fastq_output_folder, stage_input=input_shield_stage_input) mapping_stage_id = workflow.add_stage( mapping_applet, name='Map %s rep%d' % (experiment.get('accession'), biorep_n), folder=mapping_output_folder, stage_input={ 'input_JSON': dxpy.dxlink({ 'stage': input_shield_stage_id, 'outputField': 'output_JSON' }) }) if not args.raw: final_output_folder = resolve_folder( output_project, args.outf + '/bams/' + experiment.get('accession') + '/' + 'rep%d' % (biorep_n)) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (filter_qc_applet.name)) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter and QC %s rep%d' % (experiment.get('accession'), biorep_n), folder=final_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': mapping_stage_id, 'outputField': 'mapped_reads' }), 'paired_end': dxpy.dxlink({ 'stage': mapping_stage_id, 'outputField': 'paired_end' }) }) xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (xcor_applet.name)) xcor_stage_id = workflow.add_stage( xcor_applet, name='Calculate cross-correlation %s rep%d' % (experiment.get('accession'), biorep_n), folder=final_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'filtered_bam' }), 'paired_end': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'paired_end' }) }) ''' This should all be done in the shield's postprocess entrypoint if args.accession_outputs: derived_from = input_shield_stage_input.get('reads1') if reads2: derived_from.append(reads2) files_json = {dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}) : { 'notes': 'Biorep%d | Mapped to %s' %(biorep_n, input_shield_stage_input.get('reference_tar')), 'lab': 'j-michael-cherry', 'award': 'U41HG006992', 
'submitted_by': '*****@*****.**', 'file_format': 'bam', 'output_type': 'alignments', 'derived_from': derived_from, 'dataset': experiment.get('accession')} } output_shield_stage_id = workflow.add_stage( output_shield_applet, name='Accession outputs %s rep%d' %(experiment.get('accession'), biorep_n), folder=mapping_output_folder, stage_input={'files': [dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'})], 'files_json': files_json, 'key': input_shield_stage_input.get('key')} ) ''' return workflow
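# NOTE (editor's sketch): find_applet_by_name, used throughout these
# builders, is defined elsewhere. A plausible implementation under the
# assumption that applet names are unique within the applets project:
import dxpy

def find_applet_by_name(applet_name, applets_project_id):
    """Return a handler for the uniquely named applet; raise if absent or ambiguous."""
    return dxpy.find_one_data_object(
        classname="applet",
        name=applet_name,
        project=applets_project_id,
        zero_ok=False,   # raise if no match
        more_ok=False,   # raise if more than one match
        return_handler=True,
    )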
def build_workflow(experiment, biorep_n, input_shield_stage_input, accession, use_existing_folders): output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' % (output_project.name)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' % (applet_project.name)) mapping_applet = \ find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (mapping_applet.name)) input_shield_applet = \ find_applet_by_name(INPUT_SHIELD_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (input_shield_applet.name)) folders = ['workflows', 'fastqs', 'raw_bams', 'bams'] folder_paths = \ ['/'.join([args.outf, folder_name, experiment.get('accession'), 'rep%d' % (biorep_n)]) for folder_name in folders] paths_exist = \ [resolve_folder(output_project, folder_path) for folder_path in folder_paths if resolve_folder(output_project, folder_path)] if any(paths_exist): msg = "%s: output paths already exist: %s" % (experiment.get('accession'), paths_exist) if use_existing_folders: logging.warning(msg) else: msg += "\nUse --use_existing_folders to suppress this error, though existing files may be duplicated" logging.error(msg) return None workflow_output_folder, fastq_output_folder, mapping_output_folder, final_output_folder = \ tuple(create_folder(output_project, folder_path) for folder_path in folder_paths) if args.raw: workflow_title = \ ('Map %s rep%d to %s (no filter)' % (experiment.get('accession'), biorep_n, args.assembly)) workflow_name = 'ENCODE raw mapping pipeline' else: workflow_title = \ ('Map %s rep%d to %s and filter' % (experiment.get('accession'), biorep_n, args.assembly)) workflow_name = 'ENCODE mapping pipeline' if args.tag: workflow_title += ': %s' % (args.tag) workflow = dxpy.new_dxworkflow( title=workflow_title, name=workflow_name, project=output_project.get_id(), folder=workflow_output_folder ) input_shield_stage_id = workflow.add_stage( input_shield_applet, name='Gather inputs %s rep%d' % (experiment.get('accession'), biorep_n), folder=fastq_output_folder, stage_input=input_shield_stage_input ) input_names = \ [name for name in ['reads1', 'reads2', 'crop_length', 'reference_tar', 'bwa_version', 'bwa_aln_params', 'samtools_version', 'debug'] if name in input_shield_stage_input] logging.debug('input_names: %s' % (input_names)) mapping_stage_input = dict(zip( input_names, [dxpy.dxlink( {'stage': input_shield_stage_id, 'outputField': input_name}) for input_name in input_names])) logging.debug('mapping_stage_input: %s' % (mapping_stage_input)) mapping_stage_id = workflow.add_stage( mapping_applet, name='Map %s rep%d' % (experiment.get('accession'), biorep_n), folder=mapping_output_folder, stage_input=mapping_stage_input ) if not args.raw: filter_qc_applet = \ find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (filter_qc_applet.name)) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter and QC %s rep%d' % (experiment.get('accession'), biorep_n), folder=final_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}), 'paired_end': dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'paired_end'}), 'scrub': args.scrub } ) xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' % (xcor_applet.name)) xcor_stage_id = workflow.add_stage( xcor_applet, name='Calculate cross-correlation %s rep%d' % (experiment.get('accession'),
biorep_n), folder=final_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}), 'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}), 'spp_version': args.spp_version } ) ''' This should all be done in the shield's postprocess entrypoint if args.accession_outputs: derived_from = input_shield_stage_input.get('reads1') if reads2: derived_from.append(reads2) files_json = {dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}) : { 'notes': 'Biorep%d | Mapped to %s' %(biorep_n, input_shield_stage_input.get('reference_tar')), 'lab': 'j-michael-cherry', 'award': 'U41HG006992', 'submitted_by': '*****@*****.**', 'file_format': 'bam', 'output_type': 'alignments', 'derived_from': derived_from, 'dataset': experiment.get('accession')} } output_shield_stage_id = workflow.add_stage( output_shield_applet, name='Accession outputs %s rep%d' %(experiment.get('accession'), biorep_n), folder=mapping_output_folder, stage_input={'files': [dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'})], 'files_json': files_json, 'key': input_shield_stage_input.get('key')} ) ''' return workflow
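# NOTE (editor's sketch): the folder checks above assume resolve_folder
# returns something falsy for a missing folder and that create_folder makes
# the folder (with parents) and returns its path. A sketch consistent with
# those call sites; an inference, not the script's actual definitions:
import dxpy

def resolve_folder(project, path):
    """Return `path` if it already exists in `project`, else None."""
    try:
        project.list_folder(folder=path, only="folders")
        return path
    except dxpy.exceptions.ResourceNotFound:
        return None

def create_folder(project, path):
    """Create `path` (and any missing parents) in `project`, then return it."""
    project.new_folder(path, parents=True)
    return path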
def main(): args = get_args() blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if not blank_workflow: assert args.rep1, "Reads are required for rep1" assert args.ctl1, "Reads are required for ctl1" assert not args.nomap or args.rep1pe is not None, "With --nomap, endedness of rep1 must be specified with --rep1pe" assert not args.nomap or (not args.rep2 or args.rep2pe is not None), "With --nomap, endedness of rep2 must be specified with --rep2pe" if not args.target: target_type = 'default' # default else: target_type = args.target.lower() if target_type not in WF.keys(): logging.error('Target type %s is not recognized' % (target_type)) sys.exit(2) output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' % (output_project.name)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' % (applet_project.name)) existing_folder = resolve_folder(output_project, args.outf) if not existing_folder: output_folder = create_folder(output_project, args.outf) elif args.use_existing_folders: output_folder = existing_folder else: assert (existing_folder and args.use_existing_folders), 'Output folder %s exists but --use_existing_folders is %s' % (existing_folder, args.use_existing_folders) logging.debug('Using output folder %s' % (output_folder)) workflow = dxpy.new_dxworkflow( name=args.name or WF[target_type]['wf_name'], title=args.title or WF[target_type]['wf_title'], description=args.description or WF[target_type]['wf_description'], project=output_project.get_id(), folder=output_folder, properties={'pipeline_version': str(args.pipeline_version)}) unary_control = args.unary_control or (not blank_workflow and args.ctl2 is None) simplicate_experiment = args.simplicate_experiment or (args.rep1 and not args.rep2) if not args.genomesize: genomesize = None else: genomesize = args.genomesize if not args.chrom_sizes: chrom_sizes = None else: chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes)) if not args.blacklist: blacklist = None else: blacklist = dxpy.dxlink(resolve_file(args.blacklist)) run_idr = WF[target_type]['run_idr'] if not args.nomap: # a "superstage" is just a dict with a name, name(s) of input files, # and then names and id's of stages that process that input # each superstage here could be implemented as a stage in a more # abstract workflow. That stage would then call the various applets # that are separate # stages here.
mapping_superstages = [ # the order of this list is important in that {'name': 'Rep1', 'input_args': args.rep1} ] if not simplicate_experiment: mapping_superstages.append( {'name': 'Rep2', 'input_args': args.rep2}) mapping_superstages.append( {'name': 'Ctl1', 'input_args': args.ctl1}) if not unary_control and not simplicate_experiment: mapping_superstages.append( {'name': 'Ctl2', 'input_args': args.ctl2}) mapping_applet = find_applet_by_name( MAPPING_APPLET_NAME, applet_project.get_id()) # mapping_output_folder = resolve_folder( # output_project, output_folder + '/' + mapping_applet.name) mapping_output_folder = mapping_applet.name reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name( FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name( XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder # in the first pass create the mapping stage id's so we can use JBOR's # to link inputs for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') mapped_stage_id = workflow.add_stage( mapping_applet, name='Map %s' % (superstage_name), folder=mapping_output_folder ) mapping_superstage.update({'map_stage_id': mapped_stage_id}) # in the second pass populate the stage inputs and build other stages rep1_stage_id = next(ss.get('map_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1') for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') superstage_id = mapping_superstage.get('map_stage_id') if mapping_superstage.get('input_args') or blank_workflow: mapping_stage_input = {} if superstage_name != "Rep1": mapping_stage_input.update( {'reference_tar': dxpy.dxlink( {'stage': rep1_stage_id, 'inputField': 'reference_tar'})}) else: if args.reference: mapping_stage_input.update( {'reference_tar': dxpy.dxlink( reference_tar.get_id())}) if not blank_workflow: for arg_index, input_arg in enumerate(mapping_superstage['input_args']): #read pairs assumed be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update({'reads%d' %(arg_index+1): reads}) # this is now done in the first pass loop above # mapped_stage_id = workflow.add_stage( # mapping_applet, # name='Map %s' %(superstage_name), # folder=mapping_output_folder, # stage_input=mapping_stage_input # ) # mapping_superstage.update({'map_stage_id': mapped_stage_id}) workflow.update_stage(superstage_id, stage_input=mapping_stage_input) filter_qc_stage_input = { 'input_bam': dxpy.dxlink({'stage': superstage_id, 'outputField': 'mapped_reads'}), 'paired_end': dxpy.dxlink({'stage': superstage_id, 'outputField': 'paired_end'}) } if args.scrub is not None: filter_qc_stage_input.update({'scrub': args.scrub}) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' %(superstage_name), folder=filter_qc_output_folder, stage_input=filter_qc_stage_input ) mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage( xcor_applet, name='Xcor %s' %(superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}), 'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}), 'spp_version': args.spp_version } ) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in 
mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file'}) exp_rep1_cc = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file'}) rep1_paired_end = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end'}) if not simplicate_experiment: exp_rep2_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file'}) exp_rep2_cc = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file'}) rep2_paired_end = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end'}) else: exp_rep2_ta = None exp_rep2_cc = None rep2_paired_end = None ctl_rep1_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file'}) if not unary_control and not simplicate_experiment: ctl_rep2_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file'}) else: ctl_rep2_ta = None else: # skipped the mapping, so just bring in the inputs from arguments if not blank_workflow: exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep1_ta_desc = dxpy.describe(exp_rep1_ta) exp_rep1_mapping_analysis_id = dxpy.describe(exp_rep1_ta_desc['createdBy']['job'])['analysis'] exp_rep1_mapping_analysis = dxpy.describe(exp_rep1_mapping_analysis_id) rep1_xcor_stage_description = next( stage for stage in exp_rep1_mapping_analysis.get('stages') if stage['execution']['executableName'] == 'xcor') exp_rep1_cc = rep1_xcor_stage_description['execution']['output']['CC_scores_file'] if args.rep1pe is None: print("Inferring rep1 PE-ness from analysis") rep1_paired_end = rep1_xcor_stage_description['execution']['output']['paired_end'] else: rep1_paired_end = args.rep1pe if not simplicate_experiment: exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) exp_rep2_ta_desc = dxpy.describe(exp_rep2_ta) exp_rep2_mapping_analysis_id = dxpy.describe(exp_rep2_ta_desc['createdBy']['job'])['analysis'] exp_rep2_mapping_analysis = dxpy.describe(exp_rep2_mapping_analysis_id) rep2_xcor_stage_description = next( stage for stage in exp_rep2_mapping_analysis.get('stages') if stage['execution']['executableName'] == 'xcor') exp_rep2_cc = rep2_xcor_stage_description['execution']['output']['CC_scores_file'] if args.rep2pe is None: print("Inferring rep2 PE-ness from analysis") rep2_paired_end = rep2_xcor_stage_description['execution']['output']['paired_end'] else: rep2_paired_end = args.rep2pe else: exp_rep2_ta = None exp_rep2_cc = None rep2_paired_end = None ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) if not unary_control and not simplicate_experiment: ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) else: ctl_rep2_ta = None else: # blank workflow ctl_rep1_ta = None ctl_rep2_ta = None # here we need to calculate the cc scores files, because we're only # being supplied tagAligns # if we had mapped everything above we'd already have a handle to # the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) # xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name)
xcor_output_folder = xcor_only_applet.name xcor_only_stages = [] rep1_xcor_input = {'spp_version': args.spp_version} if args.rep1pe is not None: rep1_xcor_input.update({'paired_end': args.rep1pe}) exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input=rep1_xcor_input ) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink( {'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file'}) rep1_paired_end = dxpy.dxlink( {'stage': exp_rep1_cc_stage_id, 'outputField': 'paired_end'}) exp_rep1_ta = dxpy.dxlink( {'stage': exp_rep1_cc_stage_id, 'inputField': 'input_tagAlign'}) if not simplicate_experiment: rep2_xcor_input = {'spp_version': args.spp_version} if args.rep2pe is not None: rep2_xcor_input.update({'paired_end': args.rep2pe}) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input=rep2_xcor_input ) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = dxpy.dxlink( {'stage': exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file'}) rep2_paired_end = dxpy.dxlink( {'stage': exp_rep2_cc_stage_id, 'outputField': 'paired_end'}) exp_rep2_ta = dxpy.dxlink( {'stage': exp_rep2_cc_stage_id, 'inputField': 'input_tagAlign'}) else: exp_rep2_cc = None exp_rep2_ta = None rep2_paired_end = None if not args.maponly: encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id()) encode_macs2_stages = [] # peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name) peaks_output_folder = encode_macs2_applet.name # for simplicate experiments and/or unary controls, some of the ta inputs # will have the value None macs2_stage_input_mapping = { 'rep1_ta' : exp_rep1_ta, 'rep2_ta' : exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta' : ctl_rep2_ta, 'rep1_xcor' : exp_rep1_cc, 'rep2_xcor' : exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)), 'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)), 'genomesize': genomesize, 'chrom_sizes': chrom_sizes } # have to prune out any arguments with value None because DX will error # with arguments with null values macs2_stage_input = dict([(k,v) for k,v in macs2_stage_input_mapping.iteritems() if v is not None]) encode_macs2_stage_id = workflow.add_stage( encode_macs2_applet, name='ENCODE Peaks', folder=peaks_output_folder, stage_input=macs2_stage_input ) encode_macs2_stages.append({'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id}) if run_idr: encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id()) encode_spp_stages = [] # idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name) idr_peaks_output_folder = encode_spp_applet.name PEAKS_STAGE_NAME = 'SPP Peaks' # for simplicate experiments and/or unary controls, some of the ta inputs # will have the value None peaks_stage_input_mapping = { 'rep1_ta' : exp_rep1_ta, 'rep2_ta' : exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta' : ctl_rep2_ta, 'rep1_xcor' : exp_rep1_cc, 'rep2_xcor' : exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'idr_peaks': True, 'spp_version': args.spp_version, 'spp_instance': 
args.spp_instance } if chrom_sizes: peaks_stage_input_mapping.update({'chrom_sizes': chrom_sizes}) else: peaks_stage_input_mapping.update({'chrom_sizes': dxpy.dxlink({'stage': encode_macs2_stage_id, 'inputField': 'chrom_sizes'})}) # have to prune out any arguments with value None because DX will error # with arguments with null values peaks_stage_input = dict([(k,v) for k,v in peaks_stage_input_mapping.iteritems() if v is not None]) encode_spp_stage_id = workflow.add_stage( encode_spp_applet, name=PEAKS_STAGE_NAME, folder=idr_peaks_output_folder, stage_input=peaks_stage_input ) encode_spp_stages.append({'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id}) # TODO here I think we should abstract out all the IDR to one step like the two peak-calling steps idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id()) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] # idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name) idr_output_folder = idr_applet.name if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow or simplicate_experiment: idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks'}) } ) idr_stages.append({'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id}) if not simplicate_experiment: idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks'}) } ) idr_stages.append({'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks'}) } ) idr_stages.append({'name': 'IDR True Replicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr1_peaks'}), 
'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks'}) } ) idr_stages.append({'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id}) final_idr_stage_input = { 'r1pr_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'rep1_ta': exp_rep1_ta, 'rep1_xcor': exp_rep1_cc, 'paired_end': rep1_paired_end, # applies to replicated experiments, too 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'rep1_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_fc_signal'}) } if not simplicate_experiment: final_idr_stage_input.update({ 'reps_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks'}), 'r2pr_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'pooledpr_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'rep2_ta': exp_rep2_ta, 'rep2_xcor': exp_rep2_cc, 'rep2_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_fc_signal'}), 'pooled_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_fc_signal'}) }) if blacklist: final_idr_stage_input.update({'blacklist': blacklist}) if chrom_sizes: final_idr_stage_input.update({'chrom_sizes': chrom_sizes}) else: final_idr_stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes'})}) final_idr_stage_id = workflow.add_stage( encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input=final_idr_stage_input, ) idr_stages.append({'name': 'Final IDR peak calls', 'stage_id': final_idr_stage_id}) if target_type == 'histone': PEAKS_STAGE_NAME = "ENCODE Peaks" overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME, applet_project.get_id()) overlap_peaks_stages = [] for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']: if peaktype == 'narrowpeaks': as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as)) peak_type_extension = 'narrowPeak' elif peaktype == 'gappedpeaks': as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as)) peak_type_extension = 'gappedPeak' elif peaktype == 'broadpeaks': as_file = dxpy.dxlink(resolve_file(args.broadpeak_as)) peak_type_extension = 'broadPeak' overlap_peaks_stage_input = { 'rep1_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_%s' % (peaktype)}), 'rep2_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_%s' % (peaktype)}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_%s' % (peaktype)}), 'pooledpr1_peaks': dxpy.dxlink( 
{'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr1_%s' % (peaktype)}), 'pooledpr2_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr2_%s' % (peaktype)}), 'rep1_ta': exp_rep1_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_ta': exp_rep2_ta, 'rep2_xcor': exp_rep2_cc, 'paired_end': rep1_paired_end, # applies to replicated experiments, too 'as_file': as_file, 'peak_type': peak_type_extension, 'prefix': 'final', 'rep1_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_fc_signal'}), 'rep2_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_fc_signal'}), 'pooled_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_fc_signal'}) } if not simplicate_experiment else { 'rep1_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr1_%s' % (peaktype)}), 'rep2_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr2_%s' % (peaktype)}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_%s' % (peaktype)}), 'rep1_ta': exp_rep1_ta, 'rep1_xcor': exp_rep1_cc, 'paired_end': rep1_paired_end, # applies to replicated experiments, too 'as_file': as_file, 'peak_type': peak_type_extension, 'prefix': 'final', 'rep1_signal': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_fc_signal'}) } if chrom_sizes: overlap_peaks_stage_input.update({'chrom_sizes': chrom_sizes}) else: overlap_peaks_stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_macs2_stage_id, 'inputField': 'chrom_sizes'})}) overlap_peaks_stage_id = workflow.add_stage( overlap_peaks_applet, name='Final %s' % (peaktype), folder=peaks_output_folder, stage_input=overlap_peaks_stage_input ) overlap_peaks_stages.append({'name': 'Final %s' %(peaktype), 'stage_id': overlap_peaks_stage_id}) if args.yes: if args.debug: analysis = workflow.run({}, folder=output_folder, priority='high', debug={'debugOn': ['AppInternalError', 'AppError']}, delay_workspace_destruction=True, allow_ssh=['*']) else: analysis = workflow.run({}, folder=output_folder, priority='normal') analysis.set_properties({ "target_type": target_type, "unreplicated_experiment": str(simplicate_experiment), "unary_control": str(unary_control) }) print("Running %s as %s" % (analysis.name, analysis.get_id())) if args.accession: accession_analysis_applet = find_applet_by_name(ACCESSION_ANALYSIS_APPLET_NAME, applet_project.get_id()) accession_output_folder = '/' + accession_analysis_applet.name accession_job_input = { 'analysis_ids': [analysis.get_id()], 'wait_on_files': [] } if args.fqcheck is not None: accession_job_input.update({'fqcheck' : args.fqcheck}) if args.skip_control is not None: accession_job_input.update({'skip_control' : args.skip_control}) if args.force_patch is not None: accession_job_input.update({'force_patch': args.force_patch}) # assert accession_stage_input['wait_on_files'], "ERROR: workflow has no 
wait_on_files defined, so --accession is not supported." time.sleep(5) max_retries = 10 retries = max_retries while retries: try: accession_job = accession_analysis_applet.run( accession_job_input, name='Accession %s' % (analysis.name), folder=accession_output_folder, depends_on=analysis.describe()['dependsOn'] ) except Exception as e: logging.error("%s launching auto-accession ... %d retries left" % (e, retries)) time.sleep(5) retries -= 1 continue else: logging.info("Auto-accession will run as %s %s" % (accession_job.name, accession_job.get_id())) break else: logging.error("Auto-accession failed after %d retries" % (max_retries))
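# NOTE (editor's sketch): the launch-with-retries loop above is a reusable
# pattern; a minimal, self-contained generalization (generic Python, not
# tied to dxpy; the helper name is ours):
import logging
import time

def run_with_retries(launch, max_retries=10, delay=5):
    """Call `launch()` until it returns without raising or the budget runs out."""
    for attempt in range(max_retries):
        try:
            return launch()
        except Exception as e:
            logging.error("%s launching job ... %d retries left"
                          % (e, max_retries - attempt - 1))
            time.sleep(delay)
    logging.error("Launch failed after %d retries" % (max_retries))
    return None

# Used above it would read:
# accession_job = run_with_retries(lambda: accession_analysis_applet.run(
#     accession_job_input, name='Accession %s' % (analysis.name),
#     folder=accession_output_folder,
#     depends_on=analysis.describe()['dependsOn']))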
def main(): args = get_args() output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' %(output_project.name)) output_folder = resolve_folder(output_project, args.outf) logging.debug('Using output folder %s' %(output_folder)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' %(applet_project.name)) workflow = dxpy.new_dxworkflow( name=args.name, title=args.title, description=WF_DESCRIPTION, project=output_project.get_id(), folder=output_folder) blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if not args.genomesize: genomesize = None else: genomesize = args.genomesize if not args.chrom_sizes: chrom_sizes = None else: chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes)) if not args.blacklist: blacklist = None else: blacklist = dxpy.dxlink(resolve_file(args.blacklist)) if not args.nomap: #a "superstage" is just a dict with a name, name(s) of input files, and then names and id's of stages that process that input #each superstage here could be implemented as a stage in a more abstract workflow. That stage would then call the various applets that are separate #stages here. mapping_superstages = [ # the order of this list is important in that {'name': 'Rep1', 'input_args': args.rep1}, {'name': 'Rep2', 'input_args': args.rep2}, {'name': 'Ctl1', 'input_args': args.ctl1} ] if not args.unary_control: mapping_superstages.append({'name': 'Ctl2', 'input_args': args.ctl2}) mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) mapping_output_folder = resolve_folder(output_project, output_folder + '/' + mapping_applet.name) reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder # in the first pass create the mapping stage id's so we can use JBOR's # to link inputs for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') mapped_stage_id = workflow.add_stage( mapping_applet, name='Map %s' %(superstage_name), folder=mapping_output_folder ) mapping_superstage.update({'map_stage_id': mapped_stage_id}) # in the second pass populate the stage inputs and build other stages rep1_stage_id = next(ss.get('map_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1') for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') superstage_id = mapping_superstage.get('map_stage_id') if mapping_superstage.get('input_args') or blank_workflow: mapping_stage_input = {} if superstage_name != "Rep1": mapping_stage_input.update({'reference_tar': dxpy.dxlink({'stage': rep1_stage_id, 'inputField': 'reference_tar'})}) else: if args.reference: mapping_stage_input.update({'reference_tar' : dxpy.dxlink(reference_tar.get_id())}) if not blank_workflow: for arg_index, input_arg in enumerate(mapping_superstage['input_args']): #read pairs assumed be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update({'reads%d' %(arg_index+1): reads}) # this is now done in the first pass loop above # mapped_stage_id = workflow.add_stage( # mapping_applet, # name='Map %s' %(superstage_name), # folder=mapping_output_folder, # stage_input=mapping_stage_input # ) # mapping_superstage.update({'map_stage_id': mapped_stage_id}) 
workflow.update_stage(superstage_id, stage_input=mapping_stage_input) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' %(superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': superstage_id, 'outputField': 'mapped_reads'}), 'paired_end': dxpy.dxlink({'stage': superstage_id, 'outputField': 'paired_end'}) } ) mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage( xcor_applet, name='Xcor %s' %(superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}), 'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}) } ) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file'}) exp_rep1_cc = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file'}) exp_rep2_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file'}) exp_rep2_cc = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file'}) ctl_rep1_ta = dxpy.dxlink( {'stage' : next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file'}) if not args.unary_control: ctl_rep2_ta = dxpy.dxlink( {'stage' : next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file'}) else: ctl_rep2_ta = ctl_rep1_ta rep1_paired_end = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end'}) rep2_paired_end = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end'}) else: #skipped the mapping, so just bring in the inputs from arguments exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) rep1_paired_end = args.rep1pe rep2_paired_end = args.rep2pe #here we need to calculate the cc scores files, because we're only being supplied tagAligns #if we had mapped everything above we'd already have a handle to the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name) xcor_only_stages = [] exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep1_ta, 'paired_end': rep1_paired_end } ) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink( {'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file'}) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep2_ta, 'paired_end': rep2_paired_end } ) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = dxpy.dxlink( {'stage': 
exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file'}) encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id()) encode_spp_stages = [] idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name) PEAKS_STAGE_NAME = 'SPP Peaks' peaks_stage_input = { 'rep1_ta' : exp_rep1_ta, 'rep2_ta' : exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta' : ctl_rep2_ta, 'rep1_xcor' : exp_rep1_cc, 'rep2_xcor' : exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'idr_peaks': args.idr } if chrom_sizes: peaks_stage_input.update({'chrom_sizes': chrom_sizes}) encode_spp_stage_id = workflow.add_stage( encode_spp_applet, name=PEAKS_STAGE_NAME, folder=idr_peaks_output_folder, stage_input=peaks_stage_input ) encode_spp_stages.append({'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id}) encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id()) encode_macs2_stages = [] peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name) macs2_stage_input = { 'rep1_ta' : exp_rep1_ta, 'rep2_ta' : exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta' : ctl_rep2_ta, 'rep1_xcor' : exp_rep1_cc, 'rep2_xcor' : exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)), 'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)) } if genomesize: macs2_stage_input.update({'genomesize': genomesize}) if chrom_sizes: macs2_stage_input.update({'chrom_sizes': chrom_sizes}) else: macs2_stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes'})}) encode_macs2_stage_id = workflow.add_stage( encode_macs2_applet, name='ENCODE Peaks', folder=peaks_output_folder, stage_input=macs2_stage_input ) encode_macs2_stages.append({'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id}) if args.idr: # if args.idrversion == "1": # idr_applet = find_applet_by_name(IDR_APPLET_NAME, applet_project.get_id()) # elif args.idrversion == "2": # idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id()) # else: # logging.error("Invalid IDR version: %s" %(args.idrversion)) # idr_applet = None idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id()) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name) if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks'}) } ) idr_stages.append({'name': 'IDR True Replicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : 
dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks'}) } ) idr_stages.append({'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks'}) } ) idr_stages.append({'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr1_peaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr2_peaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks'}) } ) idr_stages.append({'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id}) stage_input = { 'reps_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks'}), 'r1pr_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'r2pr_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'pooledpr_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)) } if blacklist: stage_input.update({'blacklist': blacklist}) if chrom_sizes: stage_input.update({'chrom_sizes': chrom_sizes}) else: stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes'})}) idr_stage_id = workflow.add_stage( encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input=stage_input ) idr_stages.append({'name': 'Final IDR peak calls', 'stage_id': idr_stage_id}) if not (args.nomap): logging.debug("Mapping stages: %s" %(mapping_superstages)) else: logging.debug("xcor only stages: %s" %(xcor_only_stages)) # if not args.idronly: # logging.debug("Peak stages: %s" %(spp_stages)) logging.debug("Peak stages: %s" %(encode_spp_stages)) if args.idr: logging.debug("IDR stages: %s" %(idr_stages)) if args.yes: if args.debug: job_id = workflow.run({}, priority='high', debug={'debugOn': 
['AppInternalError', 'AppError']}, delay_workspace_destruction=True, allow_ssh=['255.255.255.255']) else: job_id = workflow.run({}, priority='high') logging.info("Running as job %s" %(job_id))
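Note the two-pass construction in this main(): the mapping stages are added empty first, then workflow.update_stage fills in each stage_input once the Rep1 stage ID exists, which is what lets the other stages point at Rep1's own reference_tar input via an inputField JBOR. A minimal sketch of that wiring, assuming a mapping applet looked up by name (the applet and field names are placeholders):

import dxpy

# Sketch: add stages first, wire inputs second, so one stage can reference
# another stage's *input* field. 'encode_map' and 'reference_tar' are assumptions.
mapping_applet = dxpy.search.find_one_data_object(
    classname='applet', name='encode_map', return_handler=True)
wf = dxpy.new_dxworkflow(title='two_pass_demo', project=dxpy.WORKSPACE_ID)
rep1_stage_id = wf.add_stage(mapping_applet, name='Map Rep1')
rep2_stage_id = wf.add_stage(mapping_applet, name='Map Rep2')
# Rep2 reuses whatever reference tarball the user supplies to Rep1.
wf.update_stage(rep2_stage_id, stage_input={
    'reference_tar': dxpy.dxlink(
        {'stage': rep1_stage_id, 'inputField': 'reference_tar'})})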
def main(): args = get_args() output_project = resolve_project(args.outp, 'w') logging.info('Found output project %s' % (output_project.name)) output_folder = resolve_folder(output_project, args.outf) logging.info('Using output folder %s' % (output_folder)) applet_project = resolve_project(args.applets, 'r') logging.info('Found applet project %s' % (applet_project.name)) workflow = dxpy.new_dxworkflow(name=WF_NAME, title=args.name, description=WF_DESCRIPTION, project=output_project.get_id(), folder=output_folder) blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if not args.nomap: #a "superstage" is just a dict with a name, name(s) of input files, and then names and id's of stages that process that input #each superstage here could be implemented as a stage in a more abstract workflow. That stage would then call the various applets that are separate #stages here. mapping_superstages = [ { 'name': 'Rep1', 'input_args': args.rep1 }, { 'name': 'Rep2', 'input_args': args.rep2 }, { 'name': 'Ctl1', 'input_args': args.ctl1 }, { 'name': 'Ctl2', 'input_args': args.ctl2 } # {'name': 'Pooled Reps', 'input_args': (args.rep1 and args.rep2)}, # {'name': 'Pooled Controls', 'input_args': (args.ctl1 and args.ctl2)} ##idea is to create a "stub" stage and then populate it's input with the output of the pool stage, defined below ] mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) mapping_output_folder = resolve_folder( output_project, output_folder + '/' + mapping_applet.name) reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') if mapping_superstage.get('input_args') or blank_workflow: if blank_workflow: mapping_stage_input = None else: mapping_stage_input = { 'reference_tar': dxpy.dxlink(reference_tar.get_id()) } for arg_index, input_arg in enumerate( mapping_superstage['input_args'] ): #read pairs assumed be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update( {'reads%d' % (arg_index + 1): reads}) mapped_stage_id = workflow.add_stage( mapping_applet, name='Map %s' % (superstage_name), folder=mapping_output_folder, stage_input=mapping_stage_input) mapping_superstage.update({'map_stage_id': mapped_stage_id}) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' % (superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': mapped_stage_id, 'outputField': 'mapped_reads' }), 'paired_end': dxpy.dxlink({ 'stage': mapped_stage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update( {'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage(xcor_applet, name='Xcor %s' % (superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'filtered_bam' }), 'paired_end': dxpy.dxlink({ 'stage': filter_qc_stage_id, 'outputField': 'paired_end' }) }) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file' }) exp_rep1_cc = dxpy.dxlink({ 'stage': next( 
ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file' }) exp_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file' }) exp_rep2_cc = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file' }) ctl_rep1_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file' }) ctl_rep2_ta = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file' }) rep1_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end' }) rep2_paired_end = dxpy.dxlink({ 'stage': next( ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end' }) else: #skipped the mapping, so just bring in the inputs from arguments exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) rep1_paired_end = args.rep1pe rep2_paired_end = args.rep2pe #here we need to calculate the cc scores files, because we're only being supplied tagAligns #if we had mapped everything above we'd already have a handle to the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) xcor_output_folder = resolve_folder( output_project, output_folder + '/' + xcor_only_applet.name) xcor_only_stages = [] exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep1_ta, 'paired_end': rep1_paired_end }) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink({ 'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file' }) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep2_ta, 'paired_end': rep2_paired_end }) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = dxpy.dxlink({ 'stage': exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file' }) # if not args.idronly: # spp_applet = find_applet_by_name(SPP_APPLET_NAME, applet_project.get_id()) # peaks_output_folder = resolve_folder(output_project, output_folder + '/' + spp_applet.name) # spp_stages = [] # if (args.rep1 and args.ctl1) or blank_workflow: # rep1_spp_stage_id = workflow.add_stage( # spp_applet, # name='Peaks Rep1', # folder=peaks_output_folder, # stage_input={ # 'experiment': exp_rep1_ta, # 'control': ctl_rep1_ta, # 'xcor_scores_input': exp_rep1_cc, # 'bigbed': True, # 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), # 'as_file': dxpy.dxlink(resolve_file(args.as_file)) # } # ) # spp_stages.append({'name': 'Peaks Rep1', 'stage_id': rep1_spp_stage_id}) # if (args.rep2 and args.ctl2) or blank_workflow: # rep2_spp_stage_id = workflow.add_stage( # spp_applet, # name='Peaks Rep2', # folder=peaks_output_folder, # stage_input={ # 'experiment': exp_rep2_ta, # 'control': ctl_rep2_ta, # 'xcor_scores_input': exp_rep2_cc, # 'bigbed': True, # 'chrom_sizes': 
dxpy.dxlink(resolve_file(args.chrom_sizes)), # 'as_file': dxpy.dxlink(resolve_file(args.as_file)) # } # ) # spp_stages.append({'name': 'Peaks Rep2', 'stage_id': rep2_spp_stage_id}) encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id()) encode_spp_stages = [] idr_peaks_output_folder = resolve_folder( output_project, output_folder + '/' + encode_spp_applet.name) PEAKS_STAGE_NAME = 'SPP Peaks' if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: encode_spp_stage_id = workflow.add_stage( encode_spp_applet, name=PEAKS_STAGE_NAME, folder=idr_peaks_output_folder, stage_input={ 'rep1_ta': exp_rep1_ta, 'rep2_ta': exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta': ctl_rep2_ta, 'rep1_xcor': exp_rep1_cc, 'rep2_xcor': exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'as_file': dxpy.dxlink(resolve_file(args.as_file)), 'idr_peaks': args.idr }) encode_spp_stages.append({ 'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id }) if args.idr: idr_applet = find_applet_by_name(IDR_APPLET_NAME, applet_project.get_id()) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] idr_output_folder = resolve_folder( output_project, output_folder + '/' + idr_applet.name) if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR True Replicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1pr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep1_peaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2pr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'rep2_peaks' }), 'idr_version': int(args.idrversion) }) 
idr_stages.append({ 'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id }) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr1_peaks' }), 'rep2_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooledpr2_peaks' }), 'pooled_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME), 'outputField': 'pooled_peaks' }), 'idr_version': int(args.idrversion) }) idr_stages.append({ 'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id }) blacklist = resolve_file(args.blacklist) idr_stage_id = workflow.add_stage( encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input={ 'reps_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks' }), 'r1pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'r2pr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'pooledpr_peaks': dxpy.dxlink({ 'stage': next( ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks' }), 'blacklist': dxpy.dxlink(blacklist.get_id()), 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'as_file': dxpy.dxlink(resolve_file(args.as_file)) }) idr_stages.append({ 'name': 'Final IDR peak calls', 'stage_id': idr_stage_id }) if not (args.nomap): logging.debug("Mapping stages: %s" % (mapping_superstages)) else: logging.debug("xcor only stages: %s" % (xcor_only_stages)) # if not args.idronly: # logging.debug("Peak stages: %s" %(spp_stages)) logging.debug("Peak stages: %s" % (encode_spp_stages)) if args.idr: logging.debug("IDR stages: %s" % (idr_stages)) if args.yes: job_id = workflow.run({}, delay_workspace_destruction=True) logging.info("Running as job %s" % (job_id))
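Every JBOR in these IDR blocks is built with the same next(...) scan over a list of {'name', 'stage_id'} records. That lookup can live in one small helper; a sketch (the helper name is ours, the record shape matches the lists built above):

import dxpy

def stage_output(stages, stage_name, output_field):
    # `stages` is a list of {'name': ..., 'stage_id': ...} records as above.
    stage_id = next(ss['stage_id'] for ss in stages if ss['name'] == stage_name)
    return dxpy.dxlink({'stage': stage_id, 'outputField': output_field})

# Usage, e.g.: reps_peaks = stage_output(idr_stages, 'IDR True Replicates', 'IDR_peaks')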
def main(): args = get_args() output_project = resolve_project(args.outp, 'w') logging.info('Found output project %s' %(output_project.name)) output_folder = resolve_folder(output_project, args.outf) logging.info('Using output folder %s' %(output_folder)) applet_project = resolve_project(args.applets, 'r') logging.info('Found applet project %s' %(applet_project.name)) workflow = dxpy.new_dxworkflow( title=WF_TITLE, name=args.name, description=WF_DESCRIPTION, project=output_project.get_id(), folder=output_folder) blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2) if not args.nomap: #a "superstage" is just a dict with a name, name(s) of input files, and then names and id's of stages that process that input #each superstage here could be implemented as a stage in a more abstract workflow. That stage would then call the various applets that are separate #stages here. mapping_superstages = [ {'name': 'Rep1', 'input_args': args.rep1}, {'name': 'Rep2', 'input_args': args.rep2}, {'name': 'Ctl1', 'input_args': args.ctl1}, {'name': 'Ctl2', 'input_args': args.ctl2} # {'name': 'Pooled Reps', 'input_args': (args.rep1 and args.rep2)}, # {'name': 'Pooled Controls', 'input_args': (args.ctl1 and args.ctl2)} ##idea is to create a "stub" stage and then populate it's input with the output of the pool stage, defined below ] mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) mapping_output_folder = resolve_folder(output_project, output_folder + '/' + mapping_applet.name) reference_tar = resolve_file(args.reference) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) filter_qc_output_folder = mapping_output_folder xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) xcor_output_folder = mapping_output_folder for mapping_superstage in mapping_superstages: superstage_name = mapping_superstage.get('name') if mapping_superstage.get('input_args') or blank_workflow: if blank_workflow: mapping_stage_input = None else: mapping_stage_input = {'reference_tar' : dxpy.dxlink(reference_tar.get_id())} for arg_index,input_arg in enumerate(mapping_superstage['input_args']): #read pairs assumed be in order read1,read2 reads = dxpy.dxlink(resolve_file(input_arg).get_id()) mapping_stage_input.update({'reads%d' %(arg_index+1): reads}) mapped_stage_id = workflow.add_stage( mapping_applet, name='Map %s' %(superstage_name), folder=mapping_output_folder, stage_input=mapping_stage_input ) mapping_superstage.update({'map_stage_id': mapped_stage_id}) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter_QC %s' %(superstage_name), folder=filter_qc_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': mapped_stage_id, 'outputField': 'mapped_reads'}), 'paired_end': dxpy.dxlink({'stage': mapped_stage_id, 'outputField': 'paired_end'}) } ) mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id}) xcor_stage_id = workflow.add_stage( xcor_applet, name='Xcor %s' %(superstage_name), folder=xcor_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}), 'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}) } ) mapping_superstage.update({'xcor_stage_id': xcor_stage_id}) exp_rep1_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'tagAlign_file'}) exp_rep1_cc = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in 
mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'CC_scores_file'}) exp_rep2_ta = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'tagAlign_file'}) exp_rep2_cc = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'CC_scores_file'}) ctl_rep1_ta = dxpy.dxlink( {'stage' : next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'), 'outputField': 'tagAlign_file'}) ctl_rep2_ta = dxpy.dxlink( {'stage' : next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'), 'outputField': 'tagAlign_file'}) rep1_paired_end = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'), 'outputField': 'paired_end'}) rep2_paired_end = dxpy.dxlink( {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'), 'outputField': 'paired_end'}) else: #skipped the mapping, so just bring in the inputs from arguments exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id()) exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id()) ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id()) ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id()) rep1_paired_end = args.rep1pe rep2_paired_end = args.rep2pe #here we need to calculate the cc scores files, because we're only being supplied tagAligns #if we had mapped everything above we'd already have a handle to the cc file xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id()) xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name) xcor_only_stages = [] exp_rep1_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep1 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep1_ta, 'paired_end': rep1_paired_end } ) xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id}) exp_rep1_cc = dxpy.dxlink( {'stage': exp_rep1_cc_stage_id, 'outputField': 'CC_scores_file'}) exp_rep2_cc_stage_id = workflow.add_stage( xcor_only_applet, name="Rep2 cross-correlation", folder=xcor_output_folder, stage_input={ 'input_tagAlign': exp_rep2_ta, 'paired_end': rep2_paired_end } ) xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id}) exp_rep2_cc = dxpy.dxlink( {'stage': exp_rep2_cc_stage_id, 'outputField': 'CC_scores_file'}) encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id()) encode_macs2_stages = [] peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name) if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: encode_macs2_stage_id = workflow.add_stage( encode_macs2_applet, name='ENCODE Peaks', folder=peaks_output_folder, stage_input={ 'rep1_ta' : exp_rep1_ta, 'rep2_ta' : exp_rep2_ta, 'ctl1_ta': ctl_rep1_ta, 'ctl2_ta' : ctl_rep2_ta, 'rep1_xcor' : exp_rep1_cc, 'rep2_xcor' : exp_rep2_cc, 'rep1_paired_end': rep1_paired_end, 'rep2_paired_end': rep2_paired_end, 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)), 'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)), 'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)), 'genomesize': args.genomesize } ) encode_macs2_stages.append({'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id}) #new applet here, similar to IDR, to do 
naive peak processing if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME, applet_project.get_id()) overlap_peaks_stages = [] for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']: if peaktype == 'narrowpeaks': as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as)) peak_type_extension = 'narrowPeak' elif peaktype == 'gappedpeaks': as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as)) peak_type_extension = 'gappedPeak' elif peaktype == 'broadpeaks': as_file = dxpy.dxlink(resolve_file(args.broadpeak_as)) peak_type_extension = 'broadPeak' overlap_peaks_stage_id = workflow.add_stage( overlap_peaks_applet, name='Overlap %s' %(peaktype), folder=peaks_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_%s' %(peaktype)}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_%s' %(peaktype)}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_%s' %(peaktype)}), 'pooledpr1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr1_%s' %(peaktype)}), 'pooledpr2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr2_%s' %(peaktype)}), 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'as_file': as_file, 'peak_type': peak_type_extension } ) overlap_peaks_stages.append({'name': 'Overlap %s' %(peaktype), 'stage_id': overlap_peaks_stage_id}) #TODO - IDR on gapped and broad peaks if args.idr: idr_applet = find_applet_by_name(IDR_APPLET_NAME, applet_project.get_id()) idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name) encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id()) idr_stages = [] idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name) if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow: idr_stage_id = workflow.add_stage( idr_applet, name='IDR True Replicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_narrowpeaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_narrowpeaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_narrowpeaks'}), 'idr_version': int(args.idrversion) } ) idr_stages.append({'name': 'IDR True Replicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 1 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1pr1_narrowpeaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1pr2_narrowpeaks'}), 'pooled_peaks':
dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep1_narrowpeaks'}), 'idr_version': int(args.idrversion) } ) idr_stages.append({'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Rep 2 Self-pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2pr1_narrowpeaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2pr2_narrowpeaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'rep2_narrowpeaks'}), 'idr_version': int(args.idrversion) } ) idr_stages.append({'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id}) idr_stage_id = workflow.add_stage( idr_applet, name='IDR Pooled Pseudoreplicates', folder=idr_output_folder, stage_input={ 'rep1_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr1_narrowpeaks'}), 'rep2_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooledpr2_narrowpeaks'}), 'pooled_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'), 'outputField': 'pooled_narrowpeaks'}), 'idr_version': int(args.idrversion) } ) idr_stages.append({'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id}) final_idr_stage_input = { 'reps_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'), 'outputField': 'IDR_peaks'}), 'r1pr_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'r2pr_peaks' : dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'pooledpr_peaks': dxpy.dxlink( {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'), 'outputField': 'IDR_peaks'}), 'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)), 'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)) } if args.blacklist: final_idr_stage_input.update({'blacklist': dxpy.dxlink(resolve_file(args.blacklist))}) idr_stage_id = workflow.add_stage( encode_idr_applet, name='Final IDR peak calls', folder=idr_output_folder, stage_input=final_idr_stage_input ) idr_stages.append({'name': 'Final IDR peak calls', 'stage_id': idr_stage_id}) if not (args.nomap): logging.debug("Mapping stages: %s" %(mapping_superstages)) else: logging.debug("xcor only stages: %s" %(xcor_only_stages)) logging.debug("Peaks for ENCODE stages: %s" %(encode_macs2_stages)) logging.debug("Peak overlap stages: %s" %(overlap_peaks_stages)) if args.idr: logging.debug("IDR stages: %s" %(idr_stages)) if args.yes: job_id = workflow.run({}, delay_workspace_destruction=True) logging.info("Running as job %s" %(job_id))
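Optional inputs in these builders (genomesize, chrom_sizes, blacklist) are only attached when the caller supplied a value. That guard pattern can be factored into one helper; a sketch under that assumption (the helper name is ours):

def with_optional(stage_input, **optional):
    # Return a copy of stage_input with only the non-None optional keys added,
    # mirroring the genomesize/chrom_sizes/blacklist guards above.
    merged = dict(stage_input)
    merged.update({key: value for key, value in optional.items() if value is not None})
    return merged

# Usage, e.g.: macs2_input = with_optional(base_input, genomesize=genomesize, chrom_sizes=chrom_sizes)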
def build_workflow(): wf = dxpy.new_dxworkflow(title='tcga_mc3_full_run', name='tcga_mc3_full_run', description='TCGA mc3 variant calling pipeline', project=args.project, folder=args.folder, properties={"git_revision": git_revision}) # variant calling tools pindel_applet = find_applet("pindel-tool") pindel_stage_id = wf.add_stage(pindel_applet) radia_applet = find_applet("radia-tool") radia_input = { "dnaNormalBam": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBamFile"}), "dnaTumorBam": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "fasta": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } radia_stage_id = wf.add_stage(radia_applet, stage_input=radia_input) somaticsniper_applet = find_applet("somaticsniper-tool") somaticsniper_input = { "normal": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBamFile"}), "tumor": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "reference": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } somaticsniper_stage_id = wf.add_stage(somaticsniper_applet, stage_input=somaticsniper_input, instance_type="mem2_hdd2_x1") samtools_pileup_applet = find_applet("samtools-pileup-tool") samtools_pileup_normal_input = { "input1" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBamFile"}), "input1_index" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBaiFile"}), "reference": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } samtools_pileup_normal_stage_id = wf.add_stage(samtools_pileup_applet, stage_input=samtools_pileup_normal_input, instance_type="mem2_hdd2_x1") samtools_pileup_tumor_input = { "input1" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "input1_index" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBaiFile"}), "reference": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } samtools_pileup_tumor_stage_id = wf.add_stage(samtools_pileup_applet, stage_input=samtools_pileup_tumor_input, instance_type="mem2_hdd2_x2") muse_applet = find_applet("muse-tool") muse_input = { "tumor_bam" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "tumor_bai" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBaiFile"}), "normal_bam" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBamFile"}), "normal_bai" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBaiFile"}), "reference" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}), "dbsnp": dxpy.dxlink("file-Bj1V0400kF9Z3GqJY4ZbYbYj") } muse_stage_id = wf.add_stage(muse_applet, stage_input=muse_input) varscan_applet = find_applet("varscan-tool") varscan_input = { "normal_pileup": dxpy.dxlink({"stage": samtools_pileup_normal_stage_id, "outputField": "pileup"}), "tumor_pileup": dxpy.dxlink({"stage": samtools_pileup_tumor_stage_id, "outputField": "pileup"}) } varscan_stage_id = wf.add_stage(varscan_applet, stage_input=varscan_input, instance_type="mem2_hdd2_x2") mutect_applet = find_applet("mutect-tool") mutect_input = { "tumor_bam" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "tumor_bai" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBaiFile"}), "normal_bam" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "normalInputBamFile"}), "normal_bai" : dxpy.dxlink({"stage": pindel_stage_id, 
"inputField": "normalInputBaiFile"}), "reference" : dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}), "dbsnp": dxpy.dxlink("file-Bj1V0400kF9Z3GqJY4ZbYbYj"), "cosmic": dxpy.dxlink("file-Bk9g2kQ0kF9f9XG6VZf7VGKQ"), } mutect_stage_id = wf.add_stage(mutect_applet, stage_input=mutect_input) # fpfilter (somaticSniper, Varscan) fpfilter_applet = find_applet("fpfilter-tool") somatcisniper_fpfilter_input = { "vcf": dxpy.dxlink({"stage": somaticsniper_stage_id, "outputField": "vcf"}), "bam": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "reference": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } somaticsniper_fpfilter_stage_id = wf.add_stage(fpfilter_applet, stage_input=somatcisniper_fpfilter_input, name="fpfilter-tool(somaticSniper)", folder="fpfiltered") varscan_snp_fpfilter_input = { "vcf": dxpy.dxlink({"stage": varscan_stage_id, "outputField": "snp_vcf"}), "bam": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "reference": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } varscan_snp_fpfilter_stage_id = wf.add_stage(fpfilter_applet, stage_input=varscan_snp_fpfilter_input, name="fpfilter-tool(varscan SNP)", folder="fpfiltered") varscan_indel_fpfilter_input = { "vcf": dxpy.dxlink({"stage": varscan_stage_id, "outputField": "indel_vcf"}), "bam": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "tumorInputBamFile"}), "reference": dxpy.dxlink({"stage": pindel_stage_id, "inputField": "inputReferenceFile"}) } varscan_indel_fpfilter_stage_id = wf.add_stage(fpfilter_applet, stage_input=varscan_indel_fpfilter_input, name="fpfilter-tool(varscan INDEL)", folder="fpfiltered") # vcf_filter (All variant callers) vcf_filter_applet = find_applet("tcga-vcf-filter-tool") radia_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": radia_stage_id, "outputField": "filtered_output_vcf"}), "filterRejects": False } radia_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=radia_vcf_filter_input, name="vcffilter-tool(radia)", folder="final_filtered") somaticsniper_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": somaticsniper_fpfilter_stage_id, "outputField": "annotated_output"}), "filterRejects": False } somaticsniper_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=somaticsniper_vcf_filter_input, name="vcffilter-tool(somaticsniper)", folder="final_filtered") varscan_snp_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": varscan_snp_fpfilter_stage_id, "outputField": "annotated_output"}), "filterRejects": True } varscan_snp_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=varscan_snp_vcf_filter_input, name="vcffilter-tool(varscan SNP)", folder="final_filtered") varscan_indel_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": varscan_indel_fpfilter_stage_id, "outputField": "annotated_output"}), "filterRejects": True } varscan_indel_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=varscan_indel_vcf_filter_input, name="vcffilter-tool(varscan INDEL)", folder="final_filtered") muse_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": muse_stage_id, "outputField": "mutations"}), "filterRejects": False } muse_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=muse_vcf_filter_input, name="vcffilter-tool(muse)", folder="final_filtered") pindel_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": pindel_stage_id, "outputField": "outputSomaticVcf"}), "filterRejects": False } 
pindel_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=pindel_vcf_filter_input, name="vcffilter-tool(pindel)", folder="final_filtered") mutect_vcf_filter_input = { "input_vcf": dxpy.dxlink({"stage": mutect_stage_id, "outputField": "mutations"}), "filterRejects": True } mutect_vcf_filter_stage_id = wf.add_stage(vcf_filter_applet, stage_input=mutect_vcf_filter_input, name="vcffilter-tool(mutect)", folder="final_filtered") vcf_reheader_applet = find_applet("tcga-vcf-reheader") radia_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": radia_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "radia", "software_version": "1", "software_params": "--dnaNormalMinTotalBases 4 --dnaNormalMinAltBases 2 --dnaNormalBaseQual 10 --dnaNormalMapQual 10 --dnaTumorDescription TumorDNASample --dnaTumorMinTotalBases 4 --dnaTumorMinAltBases 2 --dnaTumorBaseQual 10 --dnaTumorMapQual 10 --dnaNormalMitochon=MT --dnaTumorMitochon=MT --genotypeMinDepth 2 --genotypeMinPct 0.100", "center": "ucsc.edu" } radia_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=radia_vcf_reheader_input, name="vcf-reheader(radia)", folder="final_reheadered") """ sample_params = { "platform": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "platform"}), "participant_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "participant_uuid"}), "disease_code": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "disease_code"}), "normal_analysis_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_analysis_uuid"}), "normal_bam_name": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_bam_name"}), "normal_aliquot_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_aliquot_id"}), "normal_aliquot_barcode": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "normal_aliquot_barcode"}), "tumor_analysis_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_analysis_uuid"}), "tumor_bam_name": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_bam_name"}), "tumor_aliquot_uuid": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_aliquot_uuid"}), "tumor_aliquot_barcode": dxpy.dxlink({"stage": radia_vcf_reheader_stage_id, "inputField": "tumor_aliquot_barcode"}) } """ somaticsniper_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": somaticsniper_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "somaticsniper", "software_version": "v1.0.5.0", "software_params": "-Q 40 -n NORMAL -q 1 -s 0.01 -r 0.001", "center": "wustl.edu" } #somaticsniper_vcf_reheader_input.update(sample_params) somaticsniper_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=somaticsniper_vcf_reheader_input, name="vcf-reheader(somaticsniper)", folder="final_reheadered") varscan_snp_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": varscan_snp_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "varscan", "software_version": "2.3.9", "software_params": "--output-vcf 1 --min-coverage 3 --normal-purity 1 --p-value 0.99 --min-coverage-normal 8 --min-freq-for-hom 0.75 --min-var-freq 0.08 --somatic-p-value 0.05 --min-coverage-tumor 6 --tumor-purity 1", "center": "wustl.edu" } #varscan_snp_vcf_reheader_input.update(sample_params) varscan_snp_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=varscan_snp_vcf_reheader_input, 
name="vcf-reheader(varscan SNP)", folder="final_reheadered") varscan_indel_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": varscan_indel_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "varscan", "software_version": "2.3.9", "software_params": "--output-vcf 1 --min-coverage 3 --normal-purity 1 --p-value 0.99 --min-coverage-normal 8 --min-freq-for-hom 0.75 --min-var-freq 0.08 --somatic-p-value 0.05 --min-coverage-tumor 6 --tumor-purity 1", "center": "wustl.edu" } #varscan_indel_vcf_reheader_input.update(sample_params) varscan_indel_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=varscan_indel_vcf_reheader_input, name="vcf-reheader(varscan INDEL)", folder="final_reheadered") muse_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": muse_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "muse", "software_version": "v1.0rc", "software_params": "--mode wxs", "center": "mdanderson.org" } #muse_vcf_reheader_input.update(sample_params) muse_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=muse_vcf_reheader_input, name="vcf-reheader(muse)", folder="final_reheadered") pindel_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": pindel_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "pindel", "software_version": "v0.2.5b8", "software_params": "--max_range_index 1 --window_size 5 --sequencing_error_rate 0.010000 --sensitivity 0.950000 --maximum_allowed_mismatch_rate 0.020000 --NM 2 --additional_mismatch 1 --min_perfect_match_around_BP 3 --min_inversion_size 50 --min_num_matched_bases 30 --balance_cutoff 0 --anchor_quality 0 --minimum_support_for_event 3 --report_long_insertions --report_duplications --report_inversions --report_breakpoints", "center": "wustl.edu" } #pindel_vcf_reheader_input.update(sample_params) pindel_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=pindel_vcf_reheader_input, name="vcf-reheader(pindel)", folder="final_reheadered") mutect_vcf_reheader_input = { "input_vcf": dxpy.dxlink({"stage": mutect_vcf_filter_stage_id, "outputField": "output_vcf"}), "software_name": "mutect", "software_version": "1.1.5", "software_params": "--initial_tumor_lod 4.0 --tumor_lod 10.0", "center": "broad.org" } mutect_vcf_reheader_stage_id = wf.add_stage(vcf_reheader_applet, stage_input=mutect_vcf_reheader_input, name="vcf-reheader(mutect)", folder="final_reheadered") return wf
def build_workflow(experiment, biorep_n, input_shield_stage_input, key): output_project = resolve_project(args.outp, 'w') logging.debug('Found output project %s' %(output_project.name)) applet_project = resolve_project(args.applets, 'r') logging.debug('Found applet project %s' %(applet_project.name)) mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' %(mapping_applet.name)) input_shield_applet = find_applet_by_name(INPUT_SHIELD_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' %(input_shield_applet.name)) workflow_output_folder = resolve_folder(output_project, args.outf + '/workflows/' + experiment.get('accession') + '/' + 'rep%d' %(biorep_n)) fastq_output_folder = resolve_folder(output_project, args.outf + '/fastqs/' + experiment.get('accession') + '/' + 'rep%d' %(biorep_n)) mapping_output_folder = resolve_folder(output_project, args.outf + '/raw_bams/' + experiment.get('accession') + '/' + 'rep%d' %(biorep_n)) if args.raw: workflow_title = 'Map %s rep%d to %s (no filter)' %(experiment.get('accession'), biorep_n, args.assembly) workflow_name = 'ENCODE raw mapping pipeline' else: workflow_title = 'Map %s rep%d to %s and filter' %(experiment.get('accession'), biorep_n, args.assembly) workflow_name = 'ENCODE mapping pipeline' if args.tag: workflow_title += ': %s' %(args.tag) workflow = dxpy.new_dxworkflow( title=workflow_title, name=workflow_name, project=output_project.get_id(), folder=workflow_output_folder ) input_shield_stage_id = workflow.add_stage( input_shield_applet, name='Gather inputs %s rep%d' %(experiment.get('accession'), biorep_n), folder=fastq_output_folder, stage_input=input_shield_stage_input ) mapping_stage_id = workflow.add_stage( mapping_applet, name='Map %s rep%d' %(experiment.get('accession'), biorep_n), folder=mapping_output_folder, stage_input={'input_JSON': dxpy.dxlink({'stage': input_shield_stage_id, 'outputField': 'output_JSON'})} ) if not args.raw: final_output_folder = resolve_folder(output_project, args.outf + '/bams/' + experiment.get('accession') + '/' + 'rep%d' %(biorep_n)) filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' %(filter_qc_applet.name)) filter_qc_stage_id = workflow.add_stage( filter_qc_applet, name='Filter and QC %s rep%d' %(experiment.get('accession'), biorep_n), folder=final_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}), 'paired_end': dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'paired_end'}) } ) xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id()) logging.debug('Found applet %s' %(xcor_applet.name)) xcor_stage_id = workflow.add_stage( xcor_applet, name='Calculate cross-correlation %s rep%d' %(experiment.get('accession'), biorep_n), folder=final_output_folder, stage_input={ 'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}), 'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}) } ) ''' This should all be done in the shield's postprocess entrypoint if args.accession_outputs: derived_from = input_shield_stage_input.get('reads1') if reads2: derived_from.append(reads2) files_json = {dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}) : { 'notes': 'Biorep%d | Mapped to %s' %(biorep_n, input_shield_stage_input.get('reference_tar')), 'lab': 'j-michael-cherry', 'award': 'U41HG006992', 'submitted_by': 
'*****@*****.**', 'file_format': 'bam', 'output_type': 'alignments', 'derived_from': derived_from, 'dataset': experiment.get('accession')} } output_shield_stage_id = workflow.add_stage( output_shield_applet, name='Accession outputs %s rep%d' %(experiment.get('accession'), biorep_n), folder=mapping_output_folder, stage_input={'files': [dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'})], 'files_json': files_json, 'key': input_shield_stage_input.get('key')} ) ''' return workflow
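The map, filter_qc, and xcor stages above are chained at add_stage time, each consuming the previous stage's output fields. A condensed sketch of just that chain, with the applet lookups stubbed (applet names are placeholders):

import dxpy

def find_applet_by_name(applet_name, project_id):
    # Stub for the example's helper.
    return dxpy.search.find_one_data_object(
        classname='applet', name=applet_name, project=project_id, return_handler=True)

wf = dxpy.new_dxworkflow(title='map_filter_xcor_demo', project=dxpy.WORKSPACE_ID)
map_stage_id = wf.add_stage(
    find_applet_by_name('encode_map', dxpy.WORKSPACE_ID), name='Map')
filter_stage_id = wf.add_stage(
    find_applet_by_name('filter_qc', dxpy.WORKSPACE_ID), name='Filter and QC',
    stage_input={
        'input_bam': dxpy.dxlink({'stage': map_stage_id, 'outputField': 'mapped_reads'}),
        'paired_end': dxpy.dxlink({'stage': map_stage_id, 'outputField': 'paired_end'})})
wf.add_stage(
    find_applet_by_name('xcor', dxpy.WORKSPACE_ID), name='Calculate cross-correlation',
    stage_input={
        'input_bam': dxpy.dxlink({'stage': filter_stage_id, 'outputField': 'filtered_bam'}),
        'paired_end': dxpy.dxlink({'stage': filter_stage_id, 'outputField': 'paired_end'})})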
def build_workflow():
    # Create the workflow shell; the output folder is only pinned here when
    # the user did not supply one.
    if parameters["folder_provided"] == "false":
        wf = dxpy.new_dxworkflow(
            name='WARDEN_workflow',
            description='RNA-SEQ Workflow',
            output_folder=parameters["Output"])
    else:
        wf = dxpy.new_dxworkflow(
            name='WARDEN_workflow',
            description='RNA-SEQ Workflow')
    wf_outputs = []

    # Resolve every applet the workflow needs up front.
    htseq_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["htseq"],
        state="closed", return_handler=True)
    genome_cov_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["genome_coverage"],
        state="closed", return_handler=True)
    bigwig_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["bigwig"],
        state="closed", return_handler=True)
    combine_counts_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["combine_counts"],
        state="closed", return_handler=True)
    limma_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["limma"],
        state="closed", return_handler=True)
    simple_DE_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["simple_DE"],
        state="closed", return_handler=True)
    bw_viewer_applet = dxpy.search.find_one_data_object(
        classname="applet", name=app_names["bw_viewer"],
        state="closed", return_handler=True)

    sample_num = 0
    htseq_results = []
    bigwig_files = []
    # Reference files are given as "project-id:file-id" strings.
    index_project, index_id = parameters["index_file"].split(":")  # parsed but unused below
    gtf_project, gtf_id = parameters["gtf_file"].split(":")
    genome_length_project, genome_length_id = \
        parameters["genome_sizes_file"].split(":")
    gene_length_project, gene_length_id = \
        parameters["gene_length_file"].split(":")
    fpkm_results = []
    fpkm_log2_results = []

    # Per-sample stages: HTSeq counting plus optional coverage/bigwig tracks.
    for sample_name in samples:
        bam_id = samples[sample_name]
        bam_link = dxpy.dxlink(bam_id)
        htseq_input = {"input_bam": bam_link}
        if parameters["sort_order"] == "position":
            htseq_input["order"] = "pos"
        else:
            htseq_input["order"] = "name"
        htseq_input["annotation_file"] = dxpy.dxlink(
            {"project": gtf_project, "id": gtf_id})
        htseq_input["gene_length_file"] = dxpy.dxlink(
            {"project": gene_length_project, "id": gene_length_id})
        htseq_input["prefix"] = sample_name
        htseq_input["strand"] = parameters["strandedness"]
        htseq_input["feature_type"] = parameters["feature_type"]
        htseq_input["id_attribute"] = parameters["id_attribute"]
        htseq_input["mode"] = parameters["mode"]
        htseq_input["nonunique"] = parameters["nonunique"]
        htseq_input["secondary_alignments"] = parameters["secondary_alignments"]
        htseq_input["supplementary_alignments"] = parameters["supplementary_alignments"]
        htseq_stage_id = wf.add_stage(
            htseq_applet, stage_input=htseq_input,
            instance_type=parameters["htseq_instance"],
            name=sample_name + ":HTSEQ COUNT", folder="HTSEQ")
        htseq_results.append(dxpy.dxlink(
            {"stage": htseq_stage_id, "outputField": "htseq_counts"}))
        wf_outputs += [
            {"name": sample_name + "_htseqcounts", "class": "file",
             "outputSource": {"$dnanexus_link": {
                 "stage": htseq_stage_id, "outputField": "htseq_counts"}}},
        ]
        # FPKM outputs are only produced when counting by gene name.
        if parameters["id_attribute"] == "gene_name":
            fpkm_results.append(dxpy.dxlink(
                {"stage": htseq_stage_id, "outputField": "fpkm"}))
            fpkm_log2_results.append(dxpy.dxlink(
                {"stage": htseq_stage_id, "outputField": "fpkm_log2"}))
            wf_outputs += [
                {"name": sample_name + "_fpkm", "class": "file",
                 "outputSource": {"$dnanexus_link": {
                     "stage": htseq_stage_id, "outputField": "fpkm"}}},
                {"name": sample_name + "_fpkm_log2", "class": "file",
                 "outputSource": {"$dnanexus_link": {
                     "stage": htseq_stage_id, "outputField": "fpkm_log2"}}},
            ]
        if parameters["run_coverage"] == 'true':
            gcb_input = {}
            gcb_input["input_bam"] = bam_link
            if parameters["sort_order"] == "name":
                gcb_input["sorted"] = False
            else:
                gcb_input["sorted"] = True
            gcb_input["genome_sizes_file"] = dxpy.dxlink(
                {"project": genome_length_project, "id": genome_length_id})
            gcb_input["strandedness"] = parameters["strandedness"]
            gcb_input["output_prefix"] = sample_name
            gcb_stage_id = wf.add_stage(
                genome_cov_applet, stage_input=gcb_input,
                instance_type="azure:mem3_ssd1_x8",
                name=sample_name + ":COVERAGE", folder="COVERAGE")
            # Convert the combined-strand bedgraph to bigwig.
            bg2bw_all_input = {}
            bg2bw_all_input["bedgraph_file"] = dxpy.dxlink(
                {"stage": gcb_stage_id, "outputField": "all_coverage_file"})
            bg2bw_all_input["genome_sizes_file"] = dxpy.dxlink(
                {"project": genome_length_project, "id": genome_length_id})
            bg2bw_all_input["output_prefix"] = sample_name
            bg2bw_all_stage_id = wf.add_stage(
                bigwig_applet, stage_input=bg2bw_all_input,
                instance_type="azure:mem2_ssd1_x4",
                name=sample_name + ":BED To BW-ALL", folder="BIGWIG")
            bigwig_files.append(dxpy.dxlink(
                {"stage": bg2bw_all_stage_id, "outputField": "bigwig"}))
            wf_outputs += [
                {"name": sample_name + "_all_bigwig", "class": "file",
                 "outputSource": {"$dnanexus_link": {
                     "stage": bg2bw_all_stage_id, "outputField": "bigwig"}}},
            ]
            # Stranded data additionally gets per-strand tracks.
            if parameters["strandedness"] != "no":
                bg2bw_pos_input = {}
                bg2bw_pos_input["bedgraph_file"] = dxpy.dxlink(
                    {"stage": gcb_stage_id, "outputField": "pos_coverage_file"})
                bg2bw_pos_input["genome_sizes_file"] = dxpy.dxlink(
                    {"project": genome_length_project, "id": genome_length_id})
                bg2bw_pos_input["output_prefix"] = sample_name
                bg2bw_pos_stage_id = wf.add_stage(
                    bigwig_applet, stage_input=bg2bw_pos_input,
                    instance_type="azure:mem2_ssd1_x4",
                    name=sample_name + ":BED To BW-POS", folder="BIGWIG")
                wf_outputs += [
                    {"name": sample_name + "_pos_bigwig", "class": "file",
                     "outputSource": {"$dnanexus_link": {
                         "stage": bg2bw_pos_stage_id, "outputField": "bigwig"}}},
                ]
                bg2bw_neg_input = {}
                bg2bw_neg_input["bedgraph_file"] = dxpy.dxlink(
                    {"stage": gcb_stage_id, "outputField": "neg_coverage_file"})
                bg2bw_neg_input["genome_sizes_file"] = dxpy.dxlink(
                    {"project": genome_length_project, "id": genome_length_id})
                bg2bw_neg_input["output_prefix"] = sample_name
                bg2bw_neg_stage_id = wf.add_stage(
                    bigwig_applet, stage_input=bg2bw_neg_input,
                    instance_type="azure:mem2_ssd1_x4",
                    name=sample_name + ":BED To BW-NEG", folder="BIGWIG")
                wf_outputs += [
                    {"name": sample_name + "_neg_bigwig", "class": "file",
                     "outputSource": {"$dnanexus_link": {
                         "stage": bg2bw_neg_stage_id, "outputField": "bigwig"}}},
                ]
                bigwig_files.append(dxpy.dxlink(
                    {"stage": bg2bw_pos_stage_id, "outputField": "bigwig"}))
                bigwig_files.append(dxpy.dxlink(
                    {"stage": bg2bw_neg_stage_id, "outputField": "bigwig"}))
        sample_num += 1

    # Merge the per-sample count files into one matrix.
    combine_input = {
        "count_files": htseq_results,
        "name_value": "htseq",
        "sample_files": [dxpy.dxlink(final_sample_list_id)]
    }
    combine_counts_stage_id = wf.add_stage(
        combine_counts_applet, stage_input=combine_input,
        instance_type="azure:mem2_ssd1_x1", name="COMBINE HTSEQ")
    wf_outputs += [
        {"name": "combined_counts", "class": "file",
         "outputSource": {"$dnanexus_link": {
             "stage": combine_counts_stage_id, "outputField": "count_file"}}},
    ]
    if parameters["id_attribute"] == "gene_name":
        combine_fpkm_input = {
            "count_files": fpkm_results,
            "name_value": "fpkm",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        }
        combine_fpkm_stage_id = wf.add_stage(
            combine_counts_applet, stage_input=combine_fpkm_input,
            instance_type="azure:mem2_ssd1_x1", name="COMBINE FPKM")
        combine_fpkm_log2_input = {
            "count_files": fpkm_log2_results,
            "name_value": "fpkm.log2",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        }
        combine_fpkm_log2_stage_id = wf.add_stage(
            combine_counts_applet, stage_input=combine_fpkm_log2_input,
            instance_type="azure:mem2_ssd1_x1", name="COMBINE FPKMlog2")
        wf_outputs += [
            {"name": "combined_fpkm", "class": "file",
             "outputSource": {"$dnanexus_link": {
                 "stage": combine_fpkm_stage_id, "outputField": "count_file"}}},
            {"name": "combined_fpkm_log2", "class": "file",
             "outputSource": {"$dnanexus_link": {
                 "stage": combine_fpkm_log2_stage_id, "outputField": "count_file"}}},
        ]
    if parameters["BW_VIEWER"] != "None" and parameters["run_coverage"] == 'true':
        bw_project, bw_file = parameters["BW_VIEWER"].split(":")
        viewer_link = dxpy.dxlink({"project": bw_project, "id": bw_file})
        bw_viewer_input = {"viewer": viewer_link, "bigwig_files": bigwig_files}
        bw_viewer_stage_id = wf.add_stage(
            bw_viewer_applet, stage_input=bw_viewer_input,
            instance_type="azure:mem2_ssd1_x1",
            name="BIGWIG_VIEWER", folder="BIGWIG")
        wf_outputs += [
            {"name": "bw_viewer", "class": "record",
             "outputSource": {"$dnanexus_link": {
                 "stage": bw_viewer_stage_id, "outputField": "viewer_bookmark"}}},
        ]
    if parameters["limma_DE_viewer"] != "None":
        limma_viewer_project, limma_viewer_file = \
            parameters["limma_DE_viewer"].split(":")
        limma_viewer_link = dxpy.dxlink(
            {"project": limma_viewer_project, "id": limma_viewer_file})
    if parameters["run_limma"] == 'true' and parameters["limma_runnable"] == "true":
        limma_input = {
            "input_count_file": dxpy.dxlink(
                {"stage": combine_counts_stage_id, "outputField": "count_file"}),
            "sample_list_file": dxpy.dxlink(final_sample_list_id),
            "calcNormFactors_method": parameters["calcNormFactors_method"],
            "filter_count_type": parameters["filter_count_type"],
            "filter_count": int(parameters["filter_count"]),
            "p_value_adjust": parameters["p_value_adjust"],
            "contrasts_file": dxpy.dxlink(comparisons_limma_id)
        }
        if parameters["limma_DE_viewer"] != "None":
            limma_input["difex_viewer"] = limma_viewer_link
        limma_stage_id = wf.add_stage(
            limma_applet, stage_input=limma_input,
            instance_type="azure:mem1_ssd1_x4", name="LIMMA")
        wf_outputs += [
            {"name": "limma_outfiles", "class": "array:file",
             "outputSource": {"$dnanexus_link": {
                 "stage": limma_stage_id, "outputField": "out_files"}}},
            {"name": "limma_viewer", "class": "record",
             "outputSource": {"$dnanexus_link": {
                 "stage": limma_stage_id, "outputField": "viewer_bookmark"}}},
        ]
    if parameters["run_simple_dif_ex"] == 'true':
        simple_DE_input = {
            "input_count_file": dxpy.dxlink(
                {"stage": combine_counts_stage_id, "outputField": "count_file"}),
            "sample_list_file": dxpy.dxlink(final_sample_list_id),
            "contrasts_file": dxpy.dxlink(comparisons_all_id)
        }
        # Only attach the viewer when one was configured; referencing
        # limma_viewer_link unconditionally would raise a NameError when
        # limma_DE_viewer is "None".
        if parameters["limma_DE_viewer"] != "None":
            simple_DE_input["difex_viewer"] = limma_viewer_link
        simple_DE_stage_id = wf.add_stage(
            simple_DE_applet, stage_input=simple_DE_input,
            instance_type="azure:mem1_ssd1_x4",
            name="SIMPLE DIFFERENTIAL_EXPRESSION")
        wf_outputs += [
            {"name": "simple_DE_outfiles", "class": "array:file",
             "outputSource": {"$dnanexus_link": {
                 "stage": simple_DE_stage_id, "outputField": "out_files"}}},
            {"name": "simple_DE_viewer", "class": "record",
             "outputSource": {"$dnanexus_link": {
                 "stage": simple_DE_stage_id, "outputField": "viewer_bookmark"}}},
        ]

    # Publish the workflow-level outputs and freeze the workflow.
    wf.update(workflow_outputs=wf_outputs)
    wf.close()
    return wf.get_id()
def createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId,
                   appProjectId=None):
    '''Populates and runs a workflow covering the requested stepsToDo.'''
    if len(stepsToDo) < 1:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # Create a workflow object.
    wf = dxpy.new_dxworkflow(title=extras['name'], name=extras['name'],
                             folder=resultsFolder, project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be
    # generated by previous steps.
    prevStepResults = {}
    for step in stepsToDo:
        appName = STEPS[step]['app']
        app = dxencode.find_applet_by_name(appName, appProjectId)
        appInputs = {}
        # File inputs: prefer links to earlier stages' outputs, then fall
        # back to pre-existing files recorded in priors.
        for fileToken in STEPS[step]['inputs'].keys():
            appInp = STEPS[step]['inputs'][fileToken]
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                if isinstance(priors[fileToken], list):
                    appInputs[appInp] = []
                    for fid in priors[fileToken]:
                        appInputs[appInp] += [dxencode.get_file_link(fid)]
                else:
                    appInputs[appInp] = dxencode.get_file_link(priors[fileToken])
            else:
                print("ERROR: step '" + step + "' can't find input '" +
                      fileToken + "'!")
                sys.exit(1)
        # Non-file app inputs
        if 'params' in STEPS[step]:
            for param in STEPS[step]['params'].keys():
                appParam = STEPS[step]['params'][param]
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print("ERROR: unable to locate '" + param + "' in extras.")
                    sys.exit(1)
        # Add wf stage
        stageId = wf.add_stage(app, stage_input=appInputs, folder=resultsFolder)
        # Outputs, which we will need to link to
        for fileToken in STEPS[step]['results'].keys():
            #appOut = STEPS[step]['results'][fileToken]
            appOut = fileToken  ## the token itself, not the dict value
            prevStepResults[fileToken] = dxpy.dxlink({'stage': stageId,
                                                      'outputField': appOut})
    wfRun = wf.run({})
    return wfRun.describe()
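# --- A hypothetical example (not from the original source) of the STEPS ---
# table that createWorkflow() walks. Each step names the applet to run,
# maps logical file tokens to that applet's input fields, exposes extras
# keys as non-file parameters, and lists the output tokens later steps may
# consume. Step, applet, and field names here are illustrative only.
STEPS_EXAMPLE = {
    'align': {
        'app': 'my-align-applet',                  # applet looked up by name
        'inputs': {'reads': 'reads_fastq'},        # file token -> applet input field
        'params': {'nthreads': 'nthreads'},        # extras key -> applet input field
        'results': {'aligned_bam': 'aligned_bam'}  # tokens later steps can link to
    },
    'filter': {
        'app': 'my-filter-applet',
        'inputs': {'aligned_bam': 'input_bam'},    # consumes the align step's result
        'results': {'filtered_bam': 'filtered_bam'}
    }
}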