def pipeline_specific_vars(self,args,verbose=False):
    '''Extends the generic launch variables with the values this pipeline needs.

    Starts from the dict returned by Launch.pipeline_specific_vars() and adds the
    annotation, thread count, random seed and annotation-aware result folders.
    Prints a message and exits with status 1 if the requested annotation is not
    allowed for the genome.  Returns the completed dict.
    '''
    psv = Launch.pipeline_specific_vars(self,args)

    # Resolve the annotation: a non-default genome combined with the default
    # annotation is switched to that genome's own default annotation.
    annotation = args.annotation
    genome = psv['genome']
    if genome != self.GENOME_DEFAULT and annotation == self.ANNO_DEFAULT:
        annotation = self.ANNO_DEFAULTS[genome]
    psv['annotation'] = annotation
    if annotation not in self.ANNO_ALLOWED[genome]:
        print(genome+" has no "+annotation+" annotation.")
        sys.exit(1)

    # Fixed settings for this pipeline
    psv['nthreads'] = 8
    psv['rnd_seed'] = 12345

    # A non-default annotation is made visible in the workflow title and name
    if annotation != self.ANNO_DEFAULTS[genome]:
        psv['title'] += ', ' + annotation
        psv['name'] += '_' + annotation

    # Remember the --no-tophat choice; when tophat is wanted no steps are pruned
    self.no_tophat = args.no_tophat
    if not self.no_tophat:
        self.PRUNE_STEPS = []

    # The results location depends on the annotation, so it is recalculated here
    results_loc = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,
                                           psv['exp_type'],genome,annotation)
    psv['resultsLoc'] = results_loc
    psv['resultsFolder'] = results_loc + psv['experiment'] + '/'
    self.update_rep_result_folders(psv)

    if verbose:
        print("Pipeline Specific Vars:")
        print(json.dumps(psv,indent=4))
    return psv
def pipeline_specific_vars(self,args,verbose=False):
    '''Extends the generic launch variables with the values the Rampage pipeline needs.

    Starts from the dict returned by Launch.pipeline_specific_vars(), validates the
    annotation and the paired-end expectation, labels the workflow (name, title,
    description) and sets the annotation-aware result folders.  Prints a message
    and exits with status 1 on a validation failure.  Returns the completed dict.
    '''
    psv = Launch.pipeline_specific_vars(self,args)

    # Resolve the annotation: a non-default genome combined with the default
    # annotation is switched to that genome's own default annotation.
    annotation = args.annotation
    genome = psv['genome']
    if genome != self.GENOME_DEFAULT and annotation == self.ANNO_DEFAULT:
        annotation = self.ANNO_DEFAULTS[genome]
    psv['annotation'] = annotation
    if annotation not in self.ANNO_ALLOWED[genome]:
        print(genome+" has no "+annotation+" annotation.")
        sys.exit(1)

    if not psv['paired_end']:
        print("Rampage is always expected to be paired-end but mapping says otherwise.")
        sys.exit(1)

    # Fixed settings for this pipeline
    psv['nthreads'] = 8
    psv['control'] = args.control

    # Labels go on the whole run when combined, otherwise on the first (only) replicate
    combined = psv['combined']
    run = psv if combined else psv['reps']['a']

    # Workflow labelling
    psv['description'] = "The ENCODE Rampage RNA pipeline for long RNAs"
    experiment = psv['experiment']
    gender = psv['gender']

    name = "rampage_"+genome
    if genome == 'mm10':
        name += annotation
    name += "XX" if gender == 'female' else "XY"
    name += "_"+experiment+"_" + run['rep_tech']
    run['name'] = name

    title = "Rampage RNA " + experiment + " - " + run['rep_tech']
    if not combined:
        title += " [library '"+run['library_id']+"']"
    title += " on " + genome+" - "+gender
    run['title'] = title

    # The results location depends on the annotation, so it is recalculated here
    psv['resultsLoc'] = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,
                                                 psv['exp_type'],genome,annotation)
    exp_folder = psv['resultsLoc'] + experiment + '/'
    psv['resultsFolder'] = exp_folder

    # Replicate 'a' always gets a sub-folder; replicate 'b' only for combined runs
    rep_ids = ['a','b'] if combined else ['a']
    for rep_id in rep_ids:
        rep = psv['reps'][rep_id]
        rep['resultsFolder'] = exp_folder + rep['rep_tech'] + '/'

    if verbose:
        print("Pipeline Specific Vars:")
        print(json.dumps(psv,indent=4))
    return psv
def pipeline_specific_vars(self,args,verbose=False):
    '''Adds pipeline specific variables to a dict, for use building the workflow.

    Starts from the dict returned by Launch.pipeline_specific_vars() and adds the
    annotation, thread count, control and annotation-aware result folders.
    Prints a message and exits with status 1 if the annotation is not allowed for
    the genome or if the mapping is not paired-end.  Returns the completed dict.
    '''
    psv = Launch.pipeline_specific_vars(self,args)

    # Could be multiple annotations supported per genome: a non-default genome
    # combined with the default annotation falls back to that genome's default.
    psv['annotation'] = args.annotation
    if psv['genome'] != self.GENOME_DEFAULT and psv['annotation'] == self.ANNO_DEFAULT:
        psv['annotation'] = self.ANNO_DEFAULTS[psv['genome']]
    if psv['annotation'] not in self.ANNO_ALLOWED[psv['genome']]:
        print(psv['genome']+" has no "+psv['annotation']+" annotation.")
        sys.exit(1)

    if not psv['paired_end']:
        print("Rampage is always expected to be paired-end but mapping says otherwise.")
        sys.exit(1)

    # Some specific settings
    psv['nthreads'] = 8
    psv['control'] = args.control

    # If annotation is not default, then add it to title and name
    if psv['annotation'] != self.ANNO_DEFAULTS[psv['genome']]:
        psv['title'] += ', ' + psv['annotation']
        psv['name'] += '_' + psv['annotation']

    # Must override results location because of annotation
    psv['resultsLoc'] = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,
                                                 psv['exp_type'],psv['genome'],psv['annotation'])
    psv['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/'
    self.update_rep_result_folders(psv)

    if verbose:
        print("Pipeline Specific Vars:")
        print(json.dumps(psv,indent=4))
    return psv
def pipeline_specific_vars(self,args,verbose=False):
    '''Extends the generic launch variables with the values the small-RNA pipeline needs.

    Starts from the dict returned by Launch.pipeline_specific_vars(), validates the
    annotation and the single-end expectation, and sets the annotation-aware result
    folders.  Prints a message and exits with status 1 on a validation failure.
    Returns the completed dict.
    '''
    psv = Launch.pipeline_specific_vars(self,args)

    # Now add pipeline specific variables and tests

    # Resolve the annotation: a non-default genome combined with the default
    # annotation is switched to that genome's own default annotation.
    annotation = args.annotation
    genome = psv['genome']
    if genome != self.GENOME_DEFAULT and annotation == self.ANNO_DEFAULT:
        annotation = self.ANNO_DEFAULTS[genome]
    psv['annotation'] = annotation
    if annotation not in self.ANNO_ALLOWED[genome]:
        print(genome+" has no "+annotation+" annotation.")
        sys.exit(1)

    # Paired-end mappings are rejected for this pipeline
    if psv['paired_end']:
        print("Small-RNA is always expected to be single-end but mapping says otherwise.")
        sys.exit(1)

    # Fixed settings for this pipeline
    psv['nthreads'] = 8

    # A non-default annotation is made visible in the workflow title and name
    if annotation != self.ANNO_DEFAULTS[genome]:
        psv['title'] += ', ' + annotation
        psv['name'] += '_' + annotation

    # The results location depends on the annotation, so it is recalculated here
    results_loc = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,
                                           psv['exp_type'],genome,annotation)
    psv['resultsLoc'] = results_loc
    psv['resultsFolder'] = results_loc + psv['experiment'] + '/'
    self.update_rep_result_folders(psv)

    if verbose:
        print("Pipeline Specific Vars:")
        print(json.dumps(psv,indent=4))
    return psv
def run(self):
    '''Runs recovery from start to finish using command line arguments.

    For every experiment accession in args.experiments this looks the experiment
    up in encoded, locates its folder in the DX project, works out which expected
    result files exist and have been posted, and attempts metadata recovery for
    each posted file.  Progress and summary counts are printed along the way.

    Exits with status 1 when no project can be determined, or when the number of
    halted experiments equals the number of experiments processed.
    '''
    args = self.get_args()
    self.test = args.test

    # Take the requested server first, then let --ignore-properties override it so
    # that the 'test' server really is mandated when that option is used.
    self.server_key = args.server
    self.ignore = False
    if args.ignore_properties:
        print("Ignoring DXFile properties (will post to test server)")
        self.ignore = args.ignore_properties
        self.server_key = 'test' # mandated because option is dangerous

    self.authid, self.authpw, self.server = dxencode.processkey(self.server_key)
    if self.server_key == "www":
        self.acc_prefix = "ENCFF"

    # An explicit --project wins over the project of the current environment
    self.proj_name = dxencode.env_get_current_project()
    if self.proj_name is None or args.project is not None:
        self.proj_name = args.project
    if self.proj_name is None:
        print("Please enter a '--project' to run in.")
        sys.exit(1)

    self.project = dxencode.get_project(self.proj_name)
    self.proj_id = self.project.get_id()
    print("== Running in project [%s] and will attempt recovery to the [%s] server ==" % \
        (self.proj_name,self.server_key))

    exp_count = 0
    halted = 0  # NOTE: never incremented in this method; kept for the summary line
    total_recovered = 0
    for exp_id in args.experiments:
        sys.stdout.flush() # Slow running job should flush to piped log
        self.exp_id = exp_id
        self.obj_cache["exp"] = {}  # clear exp cache, which will hold exp specific wf_run and step_run objects

        # 1) Lookup experiment type from encoded, based on accession
        print("Working on %s..." % self.exp_id)
        self.exp = dxencode.get_exp(self.exp_id,must_find=True,key=self.server_key)
        if self.exp is None or self.exp["status"] == "error":
            print("Unable to locate experiment %s in encoded (%s)" % (self.exp_id, self.server_key))
            continue
        self.exp_type = dxencode.get_exp_type(self.exp_id,self.exp,self.EXPERIMENT_TYPES_SUPPORTED)
        if self.exp_type is None:
            continue

        # 2) Locate the experiment accession named folder
        # NOTE: genome and annotation are not known for this exp yet, so the umbrella folder is just based on exp_type
        self.umbrella_folder = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,self.exp_type)
        self.exp_folder = dxencode.find_exp_folder(self.project,exp_id,self.umbrella_folder,warn=True)
        if self.exp_folder is None:
            continue
        print("- Examining %s:%s for '%s' results..." % \
            (self.proj_name, self.exp_folder, self.exp_type))

        # 3) Given the experiment type, determine the expected results
        self.pipeline = self.pipeline_specification(args,self.exp_type,self.exp_folder)
        self.replicates = self.find_replicate_folders(self.exp_folder, verbose=args.verbose)

        # 4) Given expected results locate any files (by glob) that should be posted for
        #    a) each single replicate (in the replicate sub-folders)
        #    b) combined replicates in the experiment folder itself
        files_expected = self.find_expected_files(self.exp_folder, self.replicates, verbose=args.verbose)
        print("- Found %d files that are available in DX." % len(files_expected))
        if len(files_expected) == 0:
            continue

        # 5) Of the expected files, find those that have already been posted
        files_posted = self.find_posted_files(files_expected, test=self.test, verbose=args.verbose)
        print("- Found %d files that have been posted" % len(files_posted))
        if len(files_posted) == 0:
            continue

        # 6) For each file that has been posted:
        exp_count += 1
        file_count = 0
        recovery_count = 0
        for (out_type,rep_tech,fid) in files_posted:
            sys.stdout.flush() # Slow running job should flush to piped log
            accession = self.found[fid]['accession']
            file_name = dxencode.file_path_from_fid(fid)

            # Skip files until the one named by --start-at (accession or file name
            # suffix) is reached; start_at is then cleared so later files proceed.
            if args.start_at is not None:
                if accession != args.start_at and not file_name.endswith(args.start_at):
                    continue
                print("- Starting at %s" % (file_name))
                args.start_at = None

            # a) discover all necessary dx information needed for post.
            # b) gather any other information necessary from dx and encoded.
            print("- Handle file %s %s" % (accession,file_name))
            payload = self.make_payload_obj(out_type,rep_tech,fid, verbose=args.verbose)

            file_count += 1
            # c) Update encoded database only if necessary.
            if self.file_metadata_recovery(fid,payload,args.test,verbose=args.verbose):
                recovery_count += 1

            if args.files != 0 and file_count >= args.files:  # Short circuit for test
                print("- Just trying %d file(s) by request" % file_count)
                break

        if not args.test:
            print("- For %s Processed %d file(s), recovered %s" % (self.exp_id, file_count, recovery_count))
        else:
            print("- For %s Processed %d file(s), would recover %s" % (self.exp_id, file_count, recovery_count))
        total_recovered += recovery_count

    if not args.test:
        print("Processed %d experiment(s), halted %d, recovered %d file(s)" % (exp_count, halted, total_recovered))
    else:
        print("Processed %d experiment(s), halted %d, would recover %d file(s)" % (exp_count, halted, total_recovered))
    # With halted never incremented, this only triggers when no experiment was processed
    if halted == exp_count:
        sys.exit(1)
    print("(finished)")