def main():
    """Print a tab-separated metrics table for one experiment or for all of them.

    Extends the argument parser supplied by ``get_fastqc.get_args()`` with a
    ``--skipfq`` flag, resolves the project named by ``--project`` and writes
    the header row.  ``process_exp`` is then called either for the single
    experiment given on the command line or, with ``--all``, for every
    experiment returned by the search query that has at least one replicate.
    """
    parser = get_fastqc.get_args()
    parser.add_argument(
        "--skipfq",
        help="Skip parsing fastqc",
        action="store_true",
        required=False,
    )
    args = parser.parse_args()
    project = dxencode.get_project(args.project)

    # Header row: fixed columns, one column per label, one "lambda" column
    # per label, and the coverage column.  A single parenthesised argument
    # prints identically under the Python 2 print statement.
    lambda_labels = ["lambda " + label for label in labels]
    header = ["Experiment", "Replicate"] + labels + lambda_labels + ["Estimated Coverage"]
    print("\t".join(header))

    if args.experiment:
        process_exp(args.experiment, project, skipfq=args.skipfq)
        return
    if not args.all:
        return

    # --all: walk every experiment of the requested assay (default below).
    assay = args.assay or "OBI:0001863"
    query = (
        "/search/?type=experiment&assay_term_id=%s"
        "&award.rfa=ENCODE3&limit=all&frame=embedded"
        "&files.file_format=fastq"
    ) % assay
    response = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    for experiment in response.json()["@graph"]:
        accession = experiment["accession"]
        if len(experiment["replicates"]) == 0:
            # Nothing to report for an experiment without replicates.
            continue
        process_exp(accession, project, skipfq=args.skipfq)
def main():
    """Emit the per-replicate metrics table as tab-separated text.

    The parser from ``get_fastqc.get_args()`` gains a ``--skipfq`` switch.
    After resolving the project and printing the header row, the function
    processes either the one experiment named on the command line or, when
    ``--all`` is given, each experiment from the search results that has a
    non-empty ``replicates`` list.
    """
    cli = get_fastqc.get_args()
    cli.add_argument('--skipfq', help='Skip parsing fastqc',
                     action='store_true', required=False)
    args = cli.parse_args()
    project = dxencode.get_project(args.project)

    # Assemble the header from its four column groups before printing it.
    leading_columns = ['Experiment', 'Replicate']
    lambda_columns = ["lambda " + name for name in labels]
    trailing_columns = ['Estimated Coverage']
    columns = leading_columns + labels + lambda_columns + trailing_columns
    # Single-argument print(...) is equivalent to the Python 2 statement form.
    print("\t".join(columns))

    if args.experiment:
        process_exp(args.experiment, project, skipfq=args.skipfq)
    elif args.all:
        assay = args.assay or "OBI:0001863"
        query_template = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
                          '&limit=all&frame=embedded&files.file_format=fastq')
        search_url = SERVER + query_template % assay
        response = dxencode.encoded_get(search_url, AUTHID=AUTHID, AUTHPW=AUTHPW)
        experiments = response.json()['@graph']
        for exp in experiments:
            accession = exp['accession']
            has_replicates = len(exp['replicates']) > 0
            if has_replicates:
                process_exp(accession, project, skipfq=args.skipfq)
def main():
    """Report fastqc metrics for a file, or timings for one or all experiments.

    Exactly one of three modes runs, chosen in this order:

    * ``--file``: fetch the file object from the server, look up its fastqc
      metrics with ``get_fastqc`` and print them as JSON.  If the HTTP status
      check fails, a message is printed and the exception is re-raised.
    * ``--experiment``: call ``get_exp_time`` for that experiment.
    * ``--all``: search for experiments of the requested assay and call
      ``get_exp_time`` for each one that has replicates, skipping those whose
      first replicate's library has a size range other than ``>200`` or was
      started from fewer than 20 cells.
    """
    parser = get_args()
    args = parser.parse_args()
    project = dxencode.get_project(args.project)

    if args.file:
        response = dxencode.encoded_get(SERVER + args.file, AUTHID=AUTHID, AUTHPW=AUTHPW)
        try:
            response.raise_for_status()
        except:
            # Report which file was requested, then let the original error
            # propagate unchanged.
            print("Could not find %s in db" % args.file)
            raise
        file_obj = response.json()
        metrics = get_fastqc(file_obj['accession'], project)
        print(json.dumps(metrics))
        return

    if args.experiment:
        get_exp_time(args.experiment, project)
        return

    if not args.all:
        return

    assay = args.assay or "OBI:0001271"
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
        '&files.file_format=fastq'
    ) % assay
    search = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)

    for exp in search.json()['@graph']:
        acc = exp['accession']
        if len(exp['replicates']) == 0:
            continue

        # Only the first replicate's library is inspected for the filters.
        library = exp['replicates'][0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        quantity_units = library.get('nucleic_acid_starting_quantity_units', "")
        if quantity_units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, quantity_units, ncells))
                continue

        get_exp_time(acc, project)
def main():
    """Plan and launch (or dry-run) the workflow for one experiment replicate.

    Steps, in order:

    1. Fetch the experiment JSON and pick the read mapping for the replicate
       identified by ``args.br`` / ``args.tr``.
    2. Choose the genome (``lambda`` when ``args.maplambda`` is set, otherwise
       by organism) and decide between paired-end and single-end processing.
    3. Ensure the results folder exists (skipped in test mode), collect prior
       results, read files and reference files.
    4. Work out which steps still need to run, print the plan, and move
       superseded result files to ``<resultsFolder>/deprecated``.
    5. Unless ``args.test`` is set, create the workflow and log the run.

    Exits with status 1 when the experiment has no replicates, the replicate
    is not in the mapping, the organism is unsupported, or the replicate has
    no reads or a mix of paired and unpaired reads.  Exits with status 0 when
    there is nothing left to run or when running in test mode.

    Every ``print(...)`` call takes a single argument, so the output is the
    same as with the Python 2 print statement.
    """
    args = get_args()
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates') or len(exp['replicates']) < 1:
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    elif mapping['organism'] == 'mouse':
        genome = 'mm10'
    elif mapping['organism'] == 'human':
        genome = 'hg19'
    else:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each count needs its own len() call; a single len() given two
        # arguments raises TypeError before the message can be printed.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment, replicate,
                                    mapping['library'], pairedEnd, args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'
    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Group the fastq names by the 'paired_end' value of each file.
        paired_fqs = {'1': [], '2': []}
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda is passed as True regardless of args.maplambda -
    # confirm this is intended.
    priors = findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(priors, paired_fqs['1'], args.test,
                                                   'pair1_reads', resultsFolder,
                                                   arrayInput=True, projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(priors, paired_fqs['2'], args.test,
                                                   'pair2_reads', resultsFolder,
                                                   arrayInput=True, projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(priors, unpaired_fqs, args.test,
                                                   'reads', resultsFolder,
                                                   arrayInput=True, projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd, priors, deprecateFiles, projectId,
                                   force=args.force)

    # Report the plans
    print("Running '" + extras['title'] + "'")
    print(" on " + extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print(" " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print(" " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print(" " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)

    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print(" If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print(" rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)

    print("- Steps to run:")
    # 'steps' still refers to the STEP_ORDER entry chosen above.
    for step in steps:
        STEPS[step] = calculate_steps(step)
        if step in stepsToDo:
            print(" * " + STEPS[step]['app'] + " will be run")
        elif not step.startswith('concat'):
            # Steps whose name starts with 'concat' are not reported as done.
            print(" " + STEPS[step]['app'] + " has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        if args.test:
            print("Would move " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'.")
            for fid in deprecateFiles:
                print(" " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'...")
            dxencode.move_files(deprecateFiles, resultsFolder + "/deprecated", projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print(" We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print(" Launched " + wfRun['id'])
    print("(success)")
def main():
    """Post the finished pipeline results of one replicate to the server.

    Steps, in order:

    1. Fetch the experiment JSON and pick the read mapping for the replicate
       identified by ``args.br`` / ``args.tr``; record the replicate name and
       the genome (looked up in ``GENOME_MAPPING``) on the mapping.
    2. Decide between paired-end and single-end, compute the workflow steps
       and look up prior results and read files in the project.
    3. If any step still needs to run, report it and exit with status 0.
    4. Otherwise walk every prior result that has an entry in
       ``POST_TEMPLATES``.  A file whose template lists ``derived_from``
       tokens is deferred until at least one of those tokens has been
       submitted.  In test mode a fake accession is recorded; otherwise the
       ``validate-post`` applet is run and its reported accession recorded.

    Exits with status 1 when the experiment has no replicates, the replicate
    is not in the mapping, the organism is unsupported, or the replicate has
    no reads or a mix of paired and unpaired reads.  Exits with status 0 when
    the pipeline is incomplete, and at the end of a test run.

    Every ``print(...)`` call takes a single argument, so the output is the
    same as with the Python 2 print statement.
    """
    args = get_args()
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    mapping['replicate'] = replicate

    # Look the organism up once so the error message cannot itself raise
    # KeyError when the mapping has no 'organism' entry.
    organism = mapping.get('organism', "Not Found")
    try:
        mapping['genome'] = GENOME_MAPPING[organism]
    except KeyError:
        print("Organism %s not currently supported" % organism)
        sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each count needs its own len() call; a single len() given two
        # arguments raises TypeError before the message can be printed.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    psv = pipeline_specific_vars(args, mapping, pairedEnd)

    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    if pairedEnd:
        # Fastq names grouped by 'paired_end' value, plus the bare accessions
        # that count as already-submitted inputs.
        paired_fqs = {'1': [], '2': []}
        read1s = []
        read2s = []
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
            read1s.append(p1['accession'])
            read2s.append(p2['accession'])
        pipePath = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        pipePath = STEP_ORDER['se']
    for step in pipePath:
        STEPS[step] = calculate_steps(step)

    pipeSteps = STEPS
    # Flatten every step's result token -> glob entries into one dictionary.
    file_globs = {}
    for step_def in STEPS.values():
        file_globs.update(step_def['results'])

    print("Checking for prior results...")
    priors = dxencode.find_prior_results(pipePath, pipeSteps, psv['resultsFolder'],
                                         file_globs, projectId)

    if pairedEnd:
        priors['pair1_reads'] = dxencode.find_file_set(paired_fqs["1"], projectId)
        priors['pair2_reads'] = dxencode.find_file_set(paired_fqs["2"], projectId)
        priors['all_reads'] = priors['pair1_reads'] + priors['pair2_reads']
        submitted = {'all_reads': read1s + read2s}
    else:
        priors['reads'] = dxencode.find_file_set(unpaired_fqs, projectId)
        priors['all_reads'] = priors['reads']
        submitted = {'all_reads': [f['accession'] for f in mapping['unpaired']]}

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = dxencode.determine_steps_to_run(pipePath, pipeSteps, priors, deprecateFiles,
                                                projectId, verbose=True)

    print("Checking for currently running analyses...")
    dxencode.check_run_log(psv['resultsFolder'], projectId, verbose=True)

    if len(stepsToDo):
        print("Pipeline incomplete, please resubmit jobs: %s" % stepsToDo)
        sys.exit(0)

    print(priors)
    to_submit = [k for k in priors.keys() if POST_TEMPLATES.get(k)]
    n = 0  # skip reads
    # Name the target experiment rather than printing the literal text
    # "args.experiment".
    print("Attempting to submit %s files to %s" % (len(to_submit), args.experiment))
    while to_submit:
        # Deferred tokens are re-queued, so cap the number of iterations in
        # case a dependency can never be satisfied.
        if n > len(priors) * len(priors):
            print("Too many itereations: %s" % priors)
            break
        token = to_submit.pop(0)
        print("%s %s - %s" % (token, priors[token], n))
        f_ob = POST_TEMPLATES.get(token, None)
        n += 1
        if not f_ob:
            continue

        derive_check = f_ob.get('derived_from', [])
        if derive_check:
            derived = [submitted[f] for f in derive_check if submitted.get(f)]
            if not derived:
                # None of the inputs has been submitted yet; try again later.
                to_submit.append(token)
                continue
            f_ob['derived_from'] = list(itertools.chain(*derived))

        dxFile = dxpy.DXFile(dxid=priors[token])
        print("Post File: %s %s" % (token, dxFile.name))
        f_ob['dataset'] = args.experiment
        f_ob['lab'] = '/labs/j-michael-cherry/'
        f_ob['award'] = '/awards/U41HG006992/'
        f_ob['assembly'] = mapping['genome']
        ## temporary haxors until file display works
        # NOTE(review): reads 'replicate_id', not the 'replicate' key set
        # above - presumably supplied by choose_mapping_for_experiment; confirm.
        f_ob['replicate'] = mapping['replicate_id']
        f_ob['notes'] = json.dumps(dxencode.create_notes(dxFile, get_software()))
        print(json.dumps(f_ob, sort_keys=True, indent=4, separators=(',', ': ')))

        if args.testserver:
            server = 'test'
        else:
            server = 'www'

        if args.test:
            fake_acc = 'ENCFF%03dAAA' % n
            print("Fake submission: %s" % fake_acc)
            submitted[token] = [fake_acc]
        else:
            applet = dxencode.find_applet_by_name('validate-post', projectId)
            job = applet.run({
                "pipe_file": dxpy.dxlink(dxFile),
                "file_meta": f_ob,
                "key": server,
                "debug": True,
                "skipvalidate": args.skipvalidate or False
            })
            print("Submitting %s" % job.id)
            job.wait_on_done(interval=1)
            # Describe the finished job once and read both fields from it.
            job_output = job.describe()['output']
            accession = job_output.get('accession', "Unknown Acc")
            error = job_output.get('error', "Unknown Error")
            submitted[token] = [accession]
            print("Posted (%s): %s" % (error, accession))

    # Exit if test only
    if args.test:
        print("Fake submitted %s files." % n)
        sys.exit(0)
def main():
    """Plan and launch (or dry-run) the workflow for one experiment replicate.

    Steps, in order:

    1. Fetch the experiment JSON and pick the read mapping for the replicate
       identified by ``args.br`` / ``args.tr``.
    2. Choose the genome (``lambda`` when ``args.maplambda`` is set, otherwise
       by organism) and decide between paired-end and single-end processing.
    3. Ensure the results folder exists (skipped in test mode), collect prior
       results, read files and reference files.
    4. Work out which steps still need to run, print the plan, and move
       superseded result files to ``<resultsFolder>/deprecated``.
    5. Unless ``args.test`` is set, create the workflow and log the run.

    Exits with status 1 when the experiment has no replicates, the replicate
    is not in the mapping, the organism is unsupported, or the replicate has
    no reads or a mix of paired and unpaired reads.  Exits with status 0 when
    there is nothing left to run or when running in test mode.

    Every ``print(...)`` call takes a single argument, so the output is the
    same as with the Python 2 print statement.
    """
    args = get_args()
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (
        args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates') or len(exp['replicates']) < 1:
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    elif mapping['organism'] == 'mouse':
        genome = 'mm10'
    elif mapping['organism'] == 'human':
        genome = 'hg19'
    else:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    has_paired = bool(mapping['paired'])
    has_unpaired = bool(mapping['unpaired'])
    if has_paired and has_unpaired:
        # Each count needs its own len() call; a single len() given two
        # arguments raises TypeError before the message can be printed.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)
    if not has_paired and not has_unpaired:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    # Exactly one kind of read is present at this point.
    pairedEnd = has_paired

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment,
                                    replicate, mapping['library'], pairedEnd,
                                    args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'
    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Group the fastq names by the 'paired_end' value of each file.
        paired_fqs = {'1': [], '2': []}
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [
            f['accession'] + ".fastq.gz" for f in mapping['unpaired']
        ]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda is passed as True regardless of args.maplambda -
    # confirm this is intended.
    priors = findPriorResults(pairedEnd, resultsFolder, projectId,
                              maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(
            priors, paired_fqs['1'], args.test, 'pair1_reads', resultsFolder,
            arrayInput=True, projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(
            priors, paired_fqs['2'], args.test, 'pair2_reads', resultsFolder,
            arrayInput=True, projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(
            priors, unpaired_fqs, args.test, 'reads', resultsFolder,
            arrayInput=True, projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd, priors, deprecateFiles,
                                   projectId, force=args.force)

    # Report the plans
    print("Running '" + extras['title'] + "'")
    print(" on " + extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print(" " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print(" " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print(" " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)

    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print(" If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print(" rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)

    print("- Steps to run:")
    # 'steps' still refers to the STEP_ORDER entry chosen above.
    for step in steps:
        STEPS[step] = calculate_steps(step)
        if step in stepsToDo:
            print(" * " + STEPS[step]['app'] + " will be run")
        elif not step.startswith('concat'):
            # Steps whose name starts with 'concat' are not reported as done.
            print(" " + STEPS[step]['app'] + " has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        deprecated_folder = resultsFolder + "/deprecated"
        if args.test:
            print("Would move " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  deprecated_folder + "'.")
            for fid in deprecateFiles:
                print(" " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  deprecated_folder + "'...")
            dxencode.move_files(deprecateFiles, deprecated_folder, projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder,
                           projectId)

    print(" We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print(" Launched " + wfRun['id'])
    print("(success)")