def main():
    '''Launch the 'fastqc-exp' applet for ENCODE3 mouse experiments that have fastq files.

    Searches the 'www' server for experiments matching ASSAY_TERM_ID and
    starts one applet run per experiment, skipping experiments that:
      * have no replicates,
      * have a first-replicate library 'size_range' other than '>200', or
      * report a starting quantity of fewer than 20 cells.
    '''
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()

    applet = dxencode.find_applet_by_name('fastqc-exp', pid)
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')

    query = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all'
             '&frame=embedded&replicates.library.biosample.donor.organism.name=mouse'
             '&files.file_format=fastq') % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)

    launched = 0
    for exp in res.json()['@graph']:
        acc = exp['accession']
        if len(exp['replicates']) == 0:
            print("Skipping %s (0 replicates)" % acc)
            continue

        # Only the first replicate's library is inspected.
        library = exp['replicates'][0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        units = library.get('nucleic_acid_starting_quantity_units', "")
        if units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))

        # NOTE(review): the strict '>' means cmnd.number + 1 runs are launched
        # before the loop stops -- confirm that is the intended limit.
        launched += 1
        if launched > cmnd.number:
            break
def createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId, appProjectId=None):
    '''Populate a workflow with one stage per step in stepsToDo and run it.

    Args:
        stepsToDo: ordered sequence of step names; each must be a key of the
            module-level STEPS table.
        priors: dict mapping a file token to a file id, or to a list of file
            ids, for inputs that are not produced by an earlier stage.
        extras: dict providing 'name' and 'description' for the workflow and
            a value for every key listed in a step's optional 'params'.
        resultsFolder: folder given to the workflow and to every stage.
        projectId: project in which the workflow is created.
        appProjectId: project searched for the applets; defaults to projectId.

    Returns:
        The describe() result of the launched workflow run, or None when
        stepsToDo is empty (or None).

    Prints an error and exits the process with status 1 when a step input is
    found neither among earlier stage outputs nor in priors, or when a step
    parameter is missing from extras.
    '''
    if not stepsToDo:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # create a workflow object
    wf = dxpy.new_dxworkflow(title=extras['name'], name=extras['name'],
                             folder=resultsFolder, project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be
    # generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        stepDef = STEPS[step]
        app = dxencode.find_applet_by_name(stepDef['app'], appProjectId)
        appInputs = {}

        # file inputs: outputs of earlier stages take precedence over priors
        for fileToken, appInp in stepDef['inputs'].items():
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                prior = priors[fileToken]
                if isinstance(prior, list):
                    appInputs[appInp] = [dxencode.get_file_link(fid) for fid in prior]
                else:
                    appInputs[appInp] = dxencode.get_file_link(prior)
            else:
                print("ERROR: step '" + step + "' can't find input '" + fileToken + "'!")
                sys.exit(1)

        # Non-file app inputs
        if 'params' in stepDef:
            for param, appParam in stepDef['params'].items():
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print("ERROR: unable to locate '" + param + "' in extras.")
                    sys.exit(1)

        # Add wf stage
        stageId = wf.add_stage(app, stage_input=appInputs, folder=resultsFolder)

        # outputs, which later stages will need to link to; the output field
        # name is the token itself, not the value stored under 'results'
        for fileToken in stepDef['results']:
            prevStepResults[fileToken] = dxpy.dxlink({'stage': stageId,
                                                      'outputField': fileToken})

    wfRun = wf.run({})
    return wfRun.describe()
def main():
    '''Launch the 'fastqc-exp' applet for ENCODE3 mouse experiments that have fastq files.

    Searches the 'www' server for experiments matching ASSAY_TERM_ID and
    starts one applet run per experiment, skipping experiments that:
      * have no replicates,
      * have a first-replicate library 'size_range' other than '>200', or
      * report a starting quantity of fewer than 20 cells.
    '''
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()

    applet = dxencode.find_applet_by_name('fastqc-exp', pid)
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')

    query = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all'
             '&frame=embedded&replicates.library.biosample.donor.organism.name=mouse'
             '&files.file_format=fastq') % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)

    launched = 0
    for exp in res.json()['@graph']:
        acc = exp['accession']
        if len(exp['replicates']) == 0:
            print("Skipping %s (0 replicates)" % acc)
            continue

        # Only the first replicate's library is inspected.
        library = exp['replicates'][0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        units = library.get('nucleic_acid_starting_quantity_units', "")
        if units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))

        # NOTE(review): the strict '>' means cmnd.number + 1 runs are launched
        # before the loop stops -- confirm that is the intended limit.
        launched += 1
        if launched > cmnd.number:
            break
def main():
    '''Post the finished pipeline results of one experiment replicate to the server.

    Steps, in order:
      1. Fetch the experiment named by args.experiment and pick the mapping
         for replicate (args.br, args.tr).
      2. Decide whether the replicate is paired-end or single-end; exit with
         status 1 if it has no reads or has both kinds.
      3. Locate prior results and the fastq inputs in the project.
      4. If any pipeline step still needs to run, report it and exit with
         status 0 without posting anything.
      5. For every prior result that has a POST_TEMPLATES entry, fill in the
         metadata and either fake the submission (args.test) or run the
         'validate-post' applet and wait for it.  A file whose template lists
         'derived_from' tokens is re-queued until at least one of those
         tokens has been submitted.

    Exits with status 1 on a missing replicate or an unsupported organism.
    '''
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    mapping['replicate'] = replicate

    # Look the organism up once so the error message cannot itself raise
    # KeyError when the mapping has no 'organism' entry.
    organism = mapping.get('organism', "Not Found")
    try:
        mapping['genome'] = GENOME_MAPPING[organism]
    except KeyError:
        print("Organism %s not currently supported" % organism)
        sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() takes exactly one argument; the counts form the tuple.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    psv = pipeline_specific_vars(args, mapping, pairedEnd)

    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    ## TODO this is a bunch of ugly
    if pairedEnd:
        paired_fqs = {'1': [], '2': []}
        read1s = []
        read2s = []
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
            read1s.append(p1['accession'])
            read2s.append(p2['accession'])
        pipePath = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        pipePath = STEP_ORDER['se']

    for step in pipePath:
        STEPS[step] = calculate_steps(step)
    pipeSteps = STEPS

    ## warning ugly kludge here
    # Flatten every step's 'results' table into one token -> glob lookup.
    file_globs = {}
    for stepDef in STEPS.values():
        file_globs.update(stepDef['results'])

    print("Checking for prior results...")
    priors = dxencode.find_prior_results(pipePath, pipeSteps, psv['resultsFolder'],
                                         file_globs, projectId)

    if pairedEnd:
        priors['pair1_reads'] = dxencode.find_file_set(paired_fqs["1"], projectId)
        priors['pair2_reads'] = dxencode.find_file_set(paired_fqs["2"], projectId)
        priors['all_reads'] = priors['pair1_reads'] + priors['pair2_reads']
        submitted = {'all_reads': read1s + read2s}
    else:
        priors['reads'] = dxencode.find_file_set(unpaired_fqs, projectId)
        priors['all_reads'] = priors['reads']
        submitted = {'all_reads': [f['accession'] for f in mapping['unpaired']]}

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = dxencode.determine_steps_to_run(pipePath, pipeSteps, priors,
                                                deprecateFiles, projectId, verbose=True)

    print("Checking for currently running analyses...")
    dxencode.check_run_log(psv['resultsFolder'], projectId, verbose=True)

    if len(stepsToDo):
        print("Pipeline incomplete, please resubmit jobs: %s" % stepsToDo)
        sys.exit(0)

    print(priors)
    to_submit = [k for k in priors.keys() if POST_TEMPLATES.get(k)]
    n = 0  # counts loop passes, including re-queued tokens
    print("Attempting to submit %s files to %s" % (len(to_submit), args.experiment))

    # The target server does not change between files.
    server = 'test' if args.testserver else 'www'

    while to_submit:
        # Safety valve against tokens that keep being re-queued forever.
        if n > len(priors) * len(priors):
            print("Too many itereations: %s" % priors)
            break
        token = to_submit.pop(0)
        print("%s %s - %s" % (token, priors[token], n))
        # NOTE: f_ob is the POST_TEMPLATES entry itself, so the assignments
        # below modify the module-level template in place.
        f_ob = POST_TEMPLATES.get(token, None)
        n += 1
        if not f_ob:
            continue

        derive_check = f_ob.get('derived_from', [])
        if derive_check:
            derived = [submitted[f] for f in derive_check if submitted.get(f)]
            if not derived:
                # None of the parent files has an accession yet; retry later.
                to_submit.append(token)
                continue
            f_ob['derived_from'] = list(itertools.chain(*derived))

        dxFile = dxpy.DXFile(dxid=priors[token])
        print("Post File: %s %s" % (token, dxFile.name))
        f_ob['dataset'] = args.experiment
        f_ob['lab'] = '/labs/j-michael-cherry/'
        f_ob['award'] = '/awards/U41HG006992/'
        f_ob['assembly'] = mapping['genome']
        ## temporary haxors until file display works
        f_ob['replicate'] = mapping['replicate_id']
        f_ob['notes'] = json.dumps(dxencode.create_notes(dxFile, get_software()))
        print(json.dumps(f_ob, sort_keys=True, indent=4, separators=(',', ': ')))

        if args.test:
            fake_acc = 'ENCFF%03dAAA' % n
            print("Fake submission: %s" % fake_acc)
            submitted[token] = [fake_acc]
        else:
            applet = dxencode.find_applet_by_name('validate-post', projectId)
            job = applet.run({
                "pipe_file": dxpy.dxlink(dxFile),
                "file_meta": f_ob,
                "key": server,
                "debug": True,
                "skipvalidate": args.skipvalidate or False
            })
            print("Submitting %s" % job.id)
            job.wait_on_done(interval=1)
            # Describe the finished job once and read both fields from it.
            output = job.describe()['output']
            accession = output.get('accession', "Unknown Acc")
            error = output.get('error', "Unknown Error")
            submitted[token] = [accession]
            print("Posted (%s): %s" % (error, accession))

    # Exit if test only
    if args.test:
        print("Fake submitted %s files." % n)
        sys.exit(0)
def createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId, appProjectId=None):
    '''Populate a workflow with one stage per step in stepsToDo and run it.

    Args:
        stepsToDo: ordered sequence of step names; each must be a key of the
            module-level STEPS table.
        priors: dict mapping a file token to a file id, or to a list of file
            ids, for inputs that are not produced by an earlier stage.
        extras: dict providing 'name' and 'description' for the workflow and
            a value for every key listed in a step's optional 'params'.
        resultsFolder: folder given to the workflow and to every stage.
        projectId: project in which the workflow is created.
        appProjectId: project searched for the applets; defaults to projectId.

    Returns:
        The describe() result of the launched workflow run, or None when
        stepsToDo is empty (or None).

    Prints an error and exits the process with status 1 when a step input is
    found neither among earlier stage outputs nor in priors, or when a step
    parameter is missing from extras.
    '''
    if not stepsToDo:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # create a workflow object
    wf = dxpy.new_dxworkflow(title=extras['name'], name=extras['name'],
                             folder=resultsFolder, project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be
    # generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        stepDef = STEPS[step]
        app = dxencode.find_applet_by_name(stepDef['app'], appProjectId)
        appInputs = {}

        # file inputs: outputs of earlier stages take precedence over priors
        for fileToken, appInp in stepDef['inputs'].items():
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                prior = priors[fileToken]
                if isinstance(prior, list):
                    appInputs[appInp] = [dxencode.get_file_link(fid) for fid in prior]
                else:
                    appInputs[appInp] = dxencode.get_file_link(prior)
            else:
                print("ERROR: step '" + step + "' can't find input '" + fileToken + "'!")
                sys.exit(1)

        # Non-file app inputs
        if 'params' in stepDef:
            for param, appParam in stepDef['params'].items():
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print("ERROR: unable to locate '" + param + "' in extras.")
                    sys.exit(1)

        # Add wf stage
        stageId = wf.add_stage(app, stage_input=appInputs, folder=resultsFolder)

        # outputs, which later stages will need to link to; the output field
        # name is the token itself, not the value stored under 'results'
        for fileToken in stepDef['results']:
            prevStepResults[fileToken] = dxpy.dxlink({'stage': stageId,
                                                      'outputField': fileToken})

    wfRun = wf.run({})
    return wfRun.describe()