def PSL_to_dict(self, filename): """Process a PSL file to the dict format """ try: inf = open(filename, 'r') except Exception as inst: if type(inst) == IOError: logger.error("Failed to open mapping dictionary %s." % filename) raise inst read_map = {} i = 0 startT = time.time() psl_header = False for l in inf: i += 1 # Check for PSL header and skip 5 lines if it exists if i == 1 and l.split()[0] == 'psLayout': psl_header = True if psl_header and i <= 5: continue l2 = l.strip().split("\t") readid = keyfunction(self.params['sra'])( l2[9]) # .split("/")[0] # remove unique part of PE reads target = l2[13] # handle references built using assembled contigs: if len(target.split("_:_")) > 1: target = target.split("_:_")[1] if target not in read_map: read_map[target] = {} read_map[target][readid] = 1 logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." % (self.params['sample'], i, time.time() - startT)) return read_map
def PSL_to_dict(self, filename): """Process a PSL file to the dict format """ try: inf = open(filename, 'r') except Exception as inst: if type(inst) == IOError: logger.error("Failed to open mapping dictionary %s." % filename) raise inst read_map = {} i = 0 startT = time.time() psl_header = False for l in inf: i += 1 # Check for PSL header and skip 5 lines if it exists if i == 1 and l.split()[0] == 'psLayout': psl_header = True if psl_header and i <= 5: continue l2 = l.strip().split("\t") readid = keyfunction(self.params['sra'])(l2[9]) # .split("/")[0] # remove unique part of PE reads target = l2[13] # handle references built using assembled contigs: if len(target.split("_:_")) > 1: target = target.split("_:_")[1] if target not in read_map: read_map[target] = {} read_map[target][readid] = 1 logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." % (self.params['sample'], i, time.time() - startT)) return read_map
def SAM_to_dict(self, filename): """ Read a SAM file to a mapping dict and return it """ #Check for necessary files: if os.path.exists(filename) is False: raise exceptions.FatalError("Missing SAM file") try: inf = open(filename, 'r') except Exception as exc: txt = "Failed to open SAM file %s" % filename txt += '\n\t' + str(exc) raise exceptions.FatalError(txt) read_map = {} # target:{read} dictionary of dictionaries i = 0 startT = time.time() for l in inf: i += 1 if l[0] != "@": # skip header lines l2 = l.strip().split() if l2[2] == "*": # skip unmapped continue readid = l2[0].split("/")[0] target = l2[2] #handle references built using assembled contigs: if len(target.split("_:_")) == 3: target, status = target.split("_:_")[1:] # This keeps ARC from writing reads which mapped to finished contigs if status.startswith("Contig") or status.startswith("isogroup"): continue if target not in read_map: read_map[target] = {} read_map[target][readid] = 1 #Report total time: logger.info("Sample: %s, Processed %s lines from SAM in %s seconds." % (self.params['sample'], i, time.time() - startT)) return read_map
def RunMapAgainstReads(self): """ A pseudo-assembler for cases where we don't actually assemble reads and instead just write them out as contigs. """ #print "Creating finished file: " + os.path.join(self.params['target_dir'], 'finished') start = time.time() outf = open(os.path.join(self.params['target_dir'], 'finished'), 'w') outf.write("map_against_reads") sample = self.params['sample'] target = self.params['target'] logger.info("Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf.close()
def start(self): """ run through list of targets, check any that haven't finished already """ sample = self.params['sample'] completed = sum(self.params['targets'].values()) logger.info("Sample: %s AssemblyChecker started with %s of %s targets completed" % (sample, completed, len(self.params['targets']))) for target_folder in self.params['targets']: if not self.params['targets'][target_folder]: f = os.path.join(target_folder, 'finished') if os.path.exists(f): self.params['targets'][target_folder] = True logger.info("%s exists" % f) completed += 1 #Now check whether all have finished, if not, add a new AssemblyChecker to the queue if len(self.params['targets']) > sum(self.params['targets'].values()): #some jobs haven't completed yet checker_params = {} for k in self.params: checker_params[k] = self.params[k] #checker_params = deepcopy(self.params) # checker = AssemblyChecker(checker_params) time.sleep(5) # sleep 5 seconds before putting a checker back on the job_q self.submit(AssemblyChecker.to_job(checker_params)) logger.info("Sample: %s Assemblies not finished: %s of %s targets completed" % (sample, completed, len(self.params['targets']))) else: params = {} for k in self.params: params[k] = self.params[k] # params = deepcopy(self.params) # finisher = Finisher(params) logger.debug("Sample: %s, iteration %s, Submitting finisher job to queue." % (sample, self.params['iteration'])) self.submit(Finisher.to_job(params)) logger.info("Sample: %s Assemblies finished: %s of %s targets completed" % (sample, completed, len(self.params['targets'])))
def start(self, loglevel, configfile='ARC_config.txt'): try: logger.setup(loglevel=loglevel) logger.info("Reading config file...") config = Config(configfile) values = config.get() logger.info( "Setting up working directories and building indexes...") self.setup(values) spawn = Spawn(values) logger.info("Running ARC.") spawn.submit() spawn.run() logger.info("Cleaning up.") self.clean() return 0 except FatalError as e: logger.error("A fatal error was encountered. \n\t%s" % str(e)) return 1 except (KeyboardInterrupt, SystemExit): self.clean() logger.error("%s unexpectedly terminated" % (__name__)) return 1
def RunMapAgainstReads(self): """ A pseudo-assembler for cases where we don't actually assemble reads and instead just write them out as contigs. """ #print "Creating finished file: " + os.path.join(self.params['target_dir'], 'finished') start = time.time() outf = open(os.path.join(self.params['target_dir'], 'finished'), 'w') outf.write("map_against_reads") sample = self.params['sample'] target = self.params['target'] logger.info( "Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf.close()
def set_defaults(self): for key, value in self.OPTIONS.iteritems(): if key not in self.config: if value is None: raise exceptions.FatalError( "Error, %s required but not specificed in " "ARC_self.config.txt" % key) else: logger.info( "%s not specified in ARC_config.txt, defaulting to " "%s" % (key, value)) self.config[key] = value # Anything listed below here is not expected to be in the config but # needs initialized self.config['iteration'] = 0
def SAM_to_dict(self, filename): """ Read a SAM file to a mapping dict and return it """ #Check for necessary files: if os.path.exists(filename) is False: raise exceptions.FatalError("Missing SAM file") try: inf = open(filename, 'r') except Exception as exc: txt = "Failed to open SAM file %s" % filename txt += '\n\t' + str(exc) raise exceptions.FatalError(txt) read_map = {} # target:{read} dictionary of dictionaries i = 0 discards = 0 startT = time.time() for l in inf: i += 1 if l[0] != "@": # skip header lines l2 = l.strip().split() if l2[2] == "*": # skip unmapped continue readid = keyfunction(self.params['sra'])( l2[0]) # .split("/")[0] target = l2[2] # handle references built using assembled contigs: if len(target.split("_:_")) == 3: target, status = target.split("_:_")[1:] # This keeps ARC from writing reads which mapped to finished contigs if status.startswith("Contig") or status.startswith( "isogroup"): discards += 1 continue if target not in read_map: read_map[target] = {} read_map[target][readid] = 1 # Report total time: logger.info("Sample: %s, Processed %s lines from SAM in %s seconds." % (self.params['sample'], i, time.time() - startT)) if discards > 0: logger.info( "%s out of %s reads mapped to finished contigs and were not recruited for assembly." % (discards, i)) return read_map
def submit(self): # Get the number of samples from the configuration logger.info("Submitting initial mapping runs.") for sample in self.config['Samples']: s = self.config['Samples'][sample] params = {} for k in self.config: params[k] = self.config[k] params['working_dir'] = s['working_dir'] params['finished_dir'] = s['finished_dir'] #params['reference'] = s['reference'] params['reference'] = os.path.join(s['working_dir'], 'I000_contigs.fasta') params['sample'] = sample if 'PE1' in s and 'PE2' in s: params['PE1'] = s['PE1'] params['PE2'] = s['PE2'] if 'SE' in s: params['SE'] = s['SE'] # mapper = Mapper(params) self.q.put(Mapper.to_job(params))
def start(self): if not('mapper' in self.params): raise exceptions.FatalError("mapper not defined in params") if self.params['mapper'] == 'bowtie2': logger.info("Sample: %s Running bowtie2." % self.params['sample']) self.run_bowtie2() if self.params['mapper'] == 'blat': logger.info("Sample: %s Running blat." % self.params['sample']) self.run_blat() #Mapping is done, run splitreads: logger.info("Sample: %s Running splitreads." % self.params['sample']) self.splitreads()
def start(self): if not ('mapper' in self.params): raise exceptions.FatalError("mapper not defined in params") if self.params['mapper'] == 'bowtie2': logger.info("Sample: %s Running bowtie2." % self.params['sample']) self.run_bowtie2() if self.params['mapper'] == 'blat': logger.info("Sample: %s Running blat." % self.params['sample']) self.run_blat() #Mapping is done, run splitreads: logger.info("Sample: %s Running splitreads." % self.params['sample']) self.splitreads()
def start(self): """ run through list of targets, check any that haven't finished already """ sample = self.params['sample'] completed = sum(self.params['targets'].values()) logger.info( "Sample: %s AssemblyChecker started with %s of %s targets completed" % (sample, completed, len(self.params['targets']))) for target_folder in self.params['targets']: if not self.params['targets'][target_folder]: f = os.path.join(target_folder, 'finished') if os.path.exists(f): self.params['targets'][target_folder] = True logger.info("%s exists" % f) completed += 1 #Now check whether all have finished, if not, add a new AssemblyChecker to the queue if len(self.params['targets']) > sum(self.params['targets'].values()): #some jobs haven't completed yet checker_params = {} for k in self.params: checker_params[k] = self.params[k] #checker_params = deepcopy(self.params) # checker = AssemblyChecker(checker_params) time.sleep( 5 ) # sleep 5 seconds before putting a checker back on the job_q self.submit(AssemblyChecker.to_job(checker_params)) logger.info( "Sample: %s Assemblies not finished: %s of %s targets completed" % (sample, completed, len(self.params['targets']))) else: params = {} for k in self.params: params[k] = self.params[k] # params = deepcopy(self.params) # finisher = Finisher(params) logger.debug( "Sample: %s, iteration %s, Submitting finisher job to queue." % (sample, self.params['iteration'])) self.submit(Finisher.to_job(params)) logger.info( "Sample: %s Assemblies finished: %s of %s targets completed" % (sample, completed, len(self.params['targets'])))
def RunSpades(self): """ Several arguments can be passed to spades.py: -1 [PE1], -2 [PE2], -s [SE], and -o [target_dir] """ #Check that required params are available if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params) or ('assembly_SE' in self.params)): raise exceptions.FatalError('Missing self.params in RunSpades.') #Check that the files actually exist if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not(os.path.exists(self.params['assembly_PE1']) or not(os.path.exists(self.params['assembly_PE2']))): raise exceptions.FatalError('Missing PE files in RunSpades.') if 'assembly_SE' in self.params and not(os.path.exists(self.params['assembly_SE'])): raise exceptions.FatalError('Missing SE file in RunSpades.') sample = self.params['sample'] target = self.params['target'] #Build args for assembler call args = ['spades.py', '-t', '1'] if self.params['only-assembler'] and not self.params['last_assembly']: args.append("--only-assembler") if self.params['format'] == 'fasta': args.append('--only-assembler') # spades errors on read correction if the input isn't fastq if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params: args += ['-1', self.params['assembly_PE1'], '-2', self.params['assembly_PE2']] if 'assembly_SE' in self.params: args += ['-s', self.params['assembly_SE']] args += ['-o', os.path.join(self.params['target_dir'], 'assembly')] if self.params['verbose']: out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w') else: out = open(os.devnull, 'w') logger.debug("Sample: %s target: %s Running spades assembler." % (sample, target)) logger.info(" ".join(args)) killed = False failed = False start = time.time() try: #ret = subprocess.call(args, stderr=out, stdout=out) ret = subprocess.Popen(args, stdout=out, stderr=out) pid = ret.pid while ret.poll() is None: if time.time() - start > self.params['assemblytimeout']: ret.kill() killed = True logger.warn("Sample: %s target: %s Assembly killed after %s seconds." % (sample, target, time.time() - start)) break time.sleep(.5) except Exception as exc: txt = ("Sample: %s, Target: %s: Unhandeled error running Spades assembly" % (sample, target)) txt += '\n\t' + str(exc) logger.warn(txt) failed = True pass finally: out.close() #Ensure that assembler exits cleanly: self.kill_process_children(pid) if not killed and ret.poll() != 0: failed = True if failed: logger.info("Sample: %s target: %s iteration: %s Assembly failed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_failed") outf.close() elif killed: logger.info("Sample: %s target: %s iteration: %s Assembly killed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_killed") outf.close() else: #Run finished without error logger.info("Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_complete") outf.close()
def run_bowtie2(self): """ Builds idx and runs bowtie2 -I 0 -X 1500 --local Expects params: sample, target, reference, working_dir, PE1 and PE2 and/or SE """ #Check for necessary params: if not ('sample' in self.params and 'reference' in self.params and 'working_dir' in self.params and (('PE1' in self.params and 'PE2' in self.params) or 'SE' in self.params)): raise exceptions.FatalError('Missing params in run_bowtie2.') #Check for necessary files: if os.path.exists(self.params['reference']) is False: raise exceptions.FatalError("Missing reference file for mapping") if 'PE1' in self.params and 'PE2' in self.params: if not (os.path.exists(self.params['PE1']) and os.path.exists(self.params['PE2'])): raise exceptions.FatalError( "One or both PE files can not be found for mapping.") if 'SE' in self.params: if not os.path.exists(self.params['SE']): raise exceptions.FatalError("SE file cannot be found.") #Make idx directory try: working_dir = self.params['working_dir'] idx_dir = os.path.realpath(os.path.join(working_dir, 'idx')) os.mkdir(idx_dir) except Exception as exc: txt = "Sample: %s Error creating working directory." % ( self.params['sample']) + '\n\t' + str(exc) raise exceptions.FatalError(txt) #Check whether to log to temporary file, or default to os.devnull if 'verbose' in self.params: out = open(os.path.join(working_dir, "mapping_log.txt"), 'w') else: out = open(os.devnull, 'w') #Set up a path to the index base = os.path.join(idx_dir, 'idx') #Build index #The idea is to map against the finished contigs and in-progress # contigs, thereby ensuring that the -k parameter (or best map) # are respected properly, and avoid the situation where reads which # were mapped to a now finished target might later be mapped to a an # in-progress target. fin_outf = os.path.join(self.params['finished_dir'], 'contigs.fasta') args = ['bowtie2-build', '-f'] if os.path.exists(fin_outf) and os.path.getsize(fin_outf) > 0: args.append(','.join((fin_outf, self.params['reference']))) else: args.append(self.params['reference']) args.append(base) logger.info("Sample: %s Calling bowtie2-build." % self.params['sample']) logger.info(" ".join(args)) try: ret = subprocess.call(args, stdout=out, stderr=out) except Exception as exc: txt = ("Sample %s: Unhandeled error running bowtie2-build" % self.params['sample']) + '\n\t' + str(exc) # make sure that out is closed before throwing exception out.close() raise exceptions.FatalError(txt) if ret != 0: out.close() raise exceptions.FatalError( "Sample: %s Error creating bowtie2 index, check log file." % self.params['sample']) #Do bowtie2 mapping: n_bowtieprocs = int(round(max(float(self.params['nprocs'])/len(self.params['Samples']), 1))) args = ['bowtie2', '-I', '0', '-X', '1500', '--no-unal'] #Tune the sensitivity so that on the first iteration the mapper is # very sensitive. On later iterations the mapper is very specific. 
if self.params['iteration'] == 0 and self.params['sloppymapping']: args.append("--very-sensitive-local") else: args += ["--very-fast-local", "--mp", "12", "--rdg", "12,6", "--rfg", "12,6"] args += ['-p', str(n_bowtieprocs), '-x', base] if self.params['bowtie2_k'] > 1: args += ['-k', str(self.params['bowtie2_k'])] if self.params['format'] == 'fasta': args += ['-f'] if 'PE1' in self.params and 'PE2' in self.params: args += ['-1', self.params['PE1'], '-2', self.params['PE2']] if 'SE' in self.params: args += ['-U', self.params['SE']] args += ['-S', os.path.join(working_dir, 'mapping.sam')] logger.info( "Sample: %s Calling bowtie2 mapper" % self.params['sample']) logger.info(" ".join(args)) try: ret = subprocess.call(args, stdout=out, stderr=out) out.close() except Exception as exc: txt = ("Sample %s: Unhandeled error running bowtie2 mapping" % self.params['sample']) + '\n\t' + str(exc) raise exceptions.FatalError(txt) out.close() if ret != 0: raise exceptions.FatalError( "Sample %s: Bowtie2 mapping returned an error, check log file." % self.params['sample']) #Extract the SAM to a dict self.params['mapping_dict'] = self.SAM_to_dict( os.path.join(working_dir, 'mapping.sam')) #clean up intermediary files: os.remove(os.path.join(working_dir, 'mapping.sam')) os.system("rm -rf %s" % idx_dir)
def RunNewbler(self): #Code for running newbler """ Expects params keys: PE1 and PE2 and/or SE target_dir -urt """ #Check for necessary params: if not ( ('assembly_PE1' in self.params and 'assembly_PE2' in self.params) or 'assembly_SE' in self.params): raise exceptions.FatalError('Missing self.params in RunNewbler.') #Check for necessary files: if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not ( os.path.exists(self.params['assembly_PE1']) or not (os.path.exists(self.params['assembly_PE2']))): raise exceptions.FatalError('Missing PE files in RunNewbler.') if 'assembly_SE' in self.params and not (os.path.exists( self.params['assembly_SE'])): raise exceptions.FatalError('Missing SE file in RunNewbler.') sample = self.params['sample'] target = self.params['target'] killed = False failed = False #determine whether to pipe output to a file or /dev/null if self.params['verbose']: out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w') else: out = open(os.devnull, 'w') #Build args for newAssembly: args = ['newAssembly', '-force'] if self.params['last_assembly'] and self.params['cdna']: #only run with cdna switch on the final assembly args += ['-cdna'] args += [os.path.join(self.params['target_dir'], 'assembly')] logger.debug("Calling newAssembly for sample: %s target %s" % (sample, target)) logger.info(" ".join(args)) ret = subprocess.call(args, stdout=out, stderr=out) #Build args for addRun: if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params: args = [ 'addRun', os.path.join(self.params['target_dir'], 'assembly') ] args += [self.params['assembly_PE1']] logger.debug("Calling addRun for sample: %s target %s" % (sample, target)) logger.debug(" ".join(args)) ret = subprocess.call(args, stdout=out, stderr=out) args = [ 'addRun', os.path.join(self.params['target_dir'], 'assembly') ] args += [self.params['assembly_PE2']] logger.debug("Calling addRun for sample: %s target %s" % (sample, target)) logger.debug(" ".join(args)) ret = subprocess.call(args, stdout=out, stderr=out) if 'assembly_SE' in self.params: args = [ 'addRun', os.path.join(self.params['target_dir'], 'assembly') ] args += [self.params['assembly_SE']] logger.debug("Calling addRun for sample: %s target %s" % (sample, target)) logger.debug(" ".join(args)) ret = subprocess.call(args, stdout=out, stderr=out) #Build args for runProject args = ['runProject'] args += ['-cpu', '1'] if self.params['last_assembly'] and self.params['cdna']: args += ['-noace'] else: args += ['-nobig'] if self.params['urt'] and not self.params['last_assembly']: #only run with the -urt switch when it isn't the final assembly args += ['-urt'] if self.params['rip']: args += ['-rip'] args += [os.path.join(self.params['target_dir'], 'assembly')] try: start = time.time() logger.debug("Calling runProject for sample: %s target %s" % (sample, target)) logger.debug(" ".join(args)) ret = subprocess.Popen(args, stdout=out, stderr=out) pid = ret.pid while ret.poll() is None: if time.time() - start > self.params['assemblytimeout']: self.kill_process_children(pid) logger.warn( "Sample: %s target: %s iteration: %s Killing assembly after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) killed = True break time.sleep(.5) except Exception as exc: txt = "Sample: %s, Target: %s: Unhandeled error running Newbler assembly" % ( self.params['sample'], self.params['target']) txt += '\n\t' + str(exc) + "".join(traceback.format_exception) logger.warn(txt) failed = True pass finally: out.close() #Sometimes 
newbler doesn't seem to exit completely: self.kill_process_children(pid) #if ret != 0: #raise exceptions.RerunnableError("Newbler assembly failed.") if not killed and ret.poll() != 0: #raise exceptions.RerunnableError("Newbler assembly failed.") failed = True if failed: logger.info( "Sample: %s target: %s iteration: %s Assembly failed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_failed\t" + str(time.time() - start)) outf.close() if killed: logger.info( "Sample: %s target: %s iteration: %s Assembly killed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_killed\t" + str(time.time() - start)) outf.close() else: #Run finished without error logger.info( "Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_complete\t" + str(time.time() - start)) outf.close()
def setup(self, config): """ Set up working folder for each sample. Also assign a "safe_target" name to each target so that folder creation works. This is a little bit tricky because if the user has targets with the _:_ seperator in the name it messes up the splitter and SAM_to_dict. This code is therefore written with the assumption that the user has put the _:_ in the name purposely so that multiple entries in the reference fasta will be treated as a single target. """ format = config['format'] for sample in config['Samples']: s = config['Samples'][sample] working_dir = os.path.realpath(config['workingdirectory'] + '/working_' + sample) #working_dir = os.path.realpath('./working_' + sample) finished_dir = os.path.realpath('./finished_' + sample) config['Samples'][sample]['working_dir'] = working_dir config['Samples'][sample]['finished_dir'] = finished_dir if os.path.exists(working_dir): logger.info( "WARNING working directory already exists for " "sample %s, deleting old results if any." % (sample)) os.system('rm -rf %s' % finished_dir) os.system('rm -rf %s/t__*' % working_dir) os.system('rm -rf %s/*.psl' % working_dir) os.system('rm %s/I*_contigs.fasta' % working_dir) if os.path.exists('%s/idx' % working_dir): os.system('rm -rf %s/idx' % working_dir) os.mkdir(finished_dir) else: os.mkdir(working_dir) os.mkdir(finished_dir) # Create stats file: statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w') statsf.write('\t'.join( ['Sample', 'Target', 'Iteration', 'Reads']) + '\n') statsf.close() # Create Target Summary Table tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"), 'w') tstf.write('\t'.join( ['Sample', 'Target', 'RefLen', 'Status', 'Iteration', 'Reads', 'Contigs', 'ContigLength']) + '\n') tstf.close() #Create a stats file for cdna if config['cdna']: countsf = open(os.path.join(finished_dir, "isogroup_read_counts.tsv"), 'a') countsf.write('\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) + '\n') countsf.close() # Build a separate index for each read file in the input, put them # in working_dir #Consider parallelizing this? start = time.time() if 'PE1' in s: if not os.path.exists(os.path.join(working_dir, "PE1.idx")): print s['PE1'] p1 = SeqIO.index_db( os.path.join(working_dir, "PE1.idx"), s['PE1'], format, key_function=lambda x: x.split("/")[0]) if 'PE2' in s: if not os.path.exists(os.path.join(working_dir, "PE2.idx")): print s['PE2'] p2 = SeqIO.index_db( os.path.join(working_dir, "PE2.idx"), s['PE2'], format, key_function=lambda x: x.split("/")[0]) if len(p1) != len(p2): logger.error("The number of reads in %s and %s do not match, " "check the config for errors" % (s['PE1'], s['PE2'])) if 'SE' in s: if not os.path.exists(os.path.join(working_dir, "SE.idx")): print s['SE'] SeqIO.index_db( os.path.join(working_dir, "SE.idx"), s['SE'], format, key_function=lambda x: x.split("/")[0]) logger.info( "Sample: %s, indexed reads in %s seconds." % ( sample, time.time() - start)) #Read through the references, mask them if necessary #mapper_params['reference'] = os.path.join(self.params['working_dir'], 'I%03d' % self.params['iteration'] + '_contigs.fasta') # Read through the reference, set up a set of safe names for the targets. # Also create the Target Summary Table which is indexed by original target name (following ARC conventions) # Also mask sequences and write them to a new set of output files #safe_targets is a two-way lookup, meaning it has both the safe target ID and the contig ID. 
summary_stats = {} safe_targets = {} new_refsf = {} for sample in config['Samples']: s = config['Samples'][sample] new_refsf[sample] = open(os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w') i = 0 for t in SeqIO.parse(config['reference'], "fasta"): if len(t.name.split("_:_")) == 1: target = t.name else: target = t.name.split("_:_")[1] safe_targets[target] = "t__%06d" % i safe_targets["t__%06d" % i] = target i += 1 if target not in summary_stats: summary_stats[target] = {'targetLength': len(t)} else: summary_stats[target]['targetLength'] = (summary_stats[target]['targetLength'] + len(t)) #Write contigs: if config['maskrepeats']: #t.seq = Seq(str(mask_seq(t.seq.tostring(), config['mapper']))) t.seq = Seq(str(mask_seq(str(t.seq), config['mapper']))) #Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't if len(t) != t.seq.count('n'): for outf in new_refsf.values(): SeqIO.write(t, outf, "fasta") else: writeTargetStats(finished_dir=s['finished_dir'], sample=sample, target=target, targetLength=summary_stats[target]['targetLength'], status='MaskedOut', iteration=0, readcount=0, num_contigs=0, contig_length=0) del summary_stats[target] config['safe_targets'] = safe_targets config['summary_stats'] = summary_stats
def RunSpades(self): """ Several arguments can be passed to spades.py: -1 [PE1], -2 [PE2], -s [SE], and -o [target_dir] """ #Check that required params are available if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params) or ('assembly_SE' in self.params)): raise exceptions.FatalError('Missing self.params in RunSpades.') #Check that the files actually exist if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not ( os.path.exists(self.params['assembly_PE1']) or not (os.path.exists(self.params['assembly_PE2']))): raise exceptions.FatalError('Missing PE files in RunSpades.') if 'assembly_SE' in self.params and not (os.path.exists( self.params['assembly_SE'])): raise exceptions.FatalError('Missing SE file in RunSpades.') sample = self.params['sample'] target = self.params['target'] #Build args for assembler call args = ['spades.py', '-t', '1'] if self.params['only-assembler'] and not self.params['last_assembly']: args.append("--only-assembler") if self.params['format'] == 'fasta': args.append( '--only-assembler' ) # spades errors on read correction if the input isn't fastq if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params: args += [ '-1', self.params['assembly_PE1'], '-2', self.params['assembly_PE2'] ] if 'assembly_SE' in self.params: args += ['-s', self.params['assembly_SE']] args += ['-o', os.path.join(self.params['target_dir'], 'assembly')] if self.params['verbose']: out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w') else: out = open(os.devnull, 'w') logger.debug("Sample: %s target: %s Running spades assembler." % (sample, target)) logger.info(" ".join(args)) killed = False failed = False start = time.time() try: #ret = subprocess.call(args, stderr=out, stdout=out) ret = subprocess.Popen(args, stdout=out, stderr=out) pid = ret.pid while ret.poll() is None: if time.time() - start > self.params['assemblytimeout']: ret.kill() killed = True logger.warn( "Sample: %s target: %s Assembly killed after %s seconds." % (sample, target, time.time() - start)) break time.sleep(.5) except Exception as exc: txt = ( "Sample: %s, Target: %s: Unhandeled error running Spades assembly" % (sample, target)) txt += '\n\t' + str(exc) logger.warn(txt) failed = True pass finally: out.close() #Ensure that assembler exits cleanly: self.kill_process_children(pid) if not killed and ret.poll() != 0: failed = True if failed: logger.info( "Sample: %s target: %s iteration: %s Assembly failed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_failed") outf.close() elif killed: logger.info( "Sample: %s target: %s iteration: %s Assembly killed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_killed") outf.close() else: #Run finished without error logger.info( "Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start)) outf = open(os.path.join(self.params['target_dir'], "finished"), 'w') outf.write("assembly_complete") outf.close()
def splitreads(self): """ Split reads and then kick off assemblies once the reads are split for a target, use safe_targets for names""" self.params['iteration'] += 1 # Write out statistics for any/all targets which failed to recruit reads: for target in self.params['summary_stats'].keys(): # print "Target", target if target not in self.params['mapping_dict']: writeTargetStats(finished_dir=self.params['finished_dir'], sample=self.params['sample'], target=target, targetLength=self.params['summary_stats'][target]['targetLength'], status='NoReads', iteration=self.params['iteration'], readcount=0, num_contigs=0, contig_length=0) del self.params['summary_stats'][target] checker_params = {} for k in self.params: checker_params[k] = self.params[k] del checker_params['mapping_dict'] checker_params['targets'] = {} iteration = self.params['iteration'] # open previously created indexes: if 'PE1' in self.params and 'PE2' in self.params: idx_PE1 = SeqIO.index_db(os.path.join(self.params['working_dir'], "PE1.idx"), key_function=keyfunction(self.params['sra'])) idx_PE2 = SeqIO.index_db(os.path.join(self.params['working_dir'], "PE2.idx"), key_function=keyfunction(self.params['sra'])) if 'SE' in self.params: idx_SE = SeqIO.index_db(os.path.join(self.params['working_dir'], "SE.idx"), key_function=keyfunction(self.params['sra'])) if 'readcounts' not in checker_params: checker_params['readcounts'] = {} # if 'contigcounts' not in checker_params: # checker_params['contigcounts'] = {} statsf = open(os.path.join(self.params['finished_dir'], 'mapping_stats.tsv'), 'a') for target in self.params['mapping_dict']: startT = time.time() # logger.info("Running splitreads for Sample: %s target: %s" % (self.params['sample'], target)) target_dir = os.path.join(self.params['working_dir'], self.params['safe_targets'][target]) if target not in checker_params['readcounts']: checker_params['readcounts'][target] = Counter() # if target not in checker_params['contigcounts']: # checker_params['contigcounts'] = Counter() if os.path.exists(target_dir): os.system("rm -rf %s" % target_dir) os.mkdir(target_dir) reads = self.params['mapping_dict'][target] # track how many total reads were added for this cycle checker_params['readcounts'][target][iteration] = len(reads) statsf.write('\t'.join([self.params['sample'], target, str(iteration), str(len(reads))]) + '\n') SEs = PEs = 0 if 'PE1' and 'PE2' in self.params: outf_PE1 = open(os.path.join(target_dir, "PE1." + self.params['format']), 'w') outf_PE2 = open(os.path.join(target_dir, "PE2." + self.params['format']), 'w') if 'SE' in self.params: outf_SE = open(os.path.join(target_dir, "SE." 
+ self.params['format']), 'w') for readID in reads: if self.params['subsample'] < 1 and randint(0, 100) > self.params['subsample'] * 100: continue if 'PE1' in self.params and readID in idx_PE1: # read1 = idx_PE1[readID] # read2 = idx_PE2[readID] read1 = idx_PE1.get(readID, None) read2 = idx_PE2.get(readID, None) if read2 is None: raise exceptions.FatalError("ERROR: ReadID %s was found in PE1 file but not PE2" % readID) new_readID = readID.replace(":", "_") + ":0:0:0:0#0/" read1.id = read1.name = new_readID + "1" read2.id = read2.name = new_readID + "2" SeqIO.write(read1, outf_PE1, self.params['format']) SeqIO.write(read2, outf_PE2, self.params['format']) PEs += 1 elif 'SE' in self.params and readID in idx_SE: read1 = idx_SE[readID] read1.id = read1.name = readID.replace(":", "_") + ":0:0:0:0#0/" SeqIO.write(read1, outf_SE, self.params['format']) SEs += 1 if 'PE1' in self.params and 'PE2' in self.params: outf_PE1.close() outf_PE2.close() if 'SE' in self.params: outf_SE.close() #Build assembly job: assembly_params = {} assembly_params['target'] = target assembly_params['target_dir'] = target_dir assembly_params['iteration'] = iteration assembly_params['last_assembly'] = False assembler_keys = ['assembler', 'sample', 'verbose', 'format', 'assemblytimeout', 'map_against_reads', 'urt', 'numcycles', 'cdna', 'rip', 'only-assembler'] for k in assembler_keys: assembly_params[k] = self.params[k] cur_reads = checker_params['readcounts'][target][iteration] # note that this is a counter, so no key errors can occur previous_reads = checker_params['readcounts'][target][iteration - 1] #Turn off URT in situations where this will be the last iteration due to readcounts: if cur_reads <= previous_reads and iteration > 2 or iteration >= self.params['numcycles']: logger.info("Sample: %s target: %s iteration: %s Setting last_assembly to True" % (self.params['sample'], target, self.params['iteration'])) assembly_params['last_assembly'] = True #properly handle the case where no reads ended up mapping for the PE or SE inputs: if PEs > 0: assembly_params['assembly_PE1'] = os.path.join(target_dir, "PE1." + self.params['format']) assembly_params['assembly_PE2'] = os.path.join(target_dir, "PE2." + self.params['format']) if SEs > 0: assembly_params['assembly_SE'] = os.path.join(target_dir, "SE." + self.params['format']) #All reads have been written at this point, add an assembly to the queue: logger.info("Sample: %s target: %s iteration: %s Split %s reads in %s seconds" % (self.params['sample'], target, self.params['iteration'], len(reads), time.time() - startT)) #Only add an assembly job and AssemblyChecker target if is there are >0 reads: if PEs + SEs > 0: checker_params['targets'][target_dir] = False self.submit(Assembler.to_job(assembly_params)) statsf.close() logger.info("------------------------------------") logger.info("| Sample: %s Iteration %s of numcycles %s" % (checker_params['sample'], checker_params['iteration'], checker_params['numcycles'])) logger.info("------------------------------------") if 'PE1' in self.params and 'PE2' in self.params: idx_PE1.close() idx_PE2.close() del idx_PE1 del idx_PE2 if 'SE' in self.params: idx_SE.close() del idx_SE #Kick off a job which checks if all assemblies are done, and if not adds a copy of itself to the job queue if len(checker_params['targets']) > 0: # checker = AssemblyChecker(checker_params) self.submit(AssemblyChecker.to_job(checker_params)) else: logger.info("Sample: %s No reads mapped, no more work to do." % checker_params['sample'])
def run_blat(self): #Check for necessary params: if not ('sample' in self.params and 'reference' in self.params and 'working_dir' in self.params and (('PE1' in self.params and 'PE2' in self.params) or 'SE' in self.params)): raise exceptions.FatalError('Missing self.params in run_bowtie2.') #Check for necessary files: if os.path.exists(self.params['reference']) is False: raise exceptions.FatalError("Missing reference file for mapping") if 'PE1' in self.params and 'PE2' in self.params: if not (os.path.exists(self.params['PE1']) and os.path.exists(self.params['PE2'])): raise exceptions.FatalError( "One or both PE files can not be found for mapping.") if 'SE' in self.params: if not os.path.exists(self.params['SE']): raise exceptions.FatalError("SE file cannot be found.") #Blat doesn't need an index working_dir = self.params['working_dir'] #Check whether to log to temporary file, or default to os.devnull if 'verbose' in self.params: out = open(os.path.join(working_dir, "mapping_log.txt"), 'w') else: out = open(os.devnull, 'w') #Build a temporary txt file with all of the reads: allreads_outf = open(os.path.join(working_dir, 'reads.txt'), 'w') if 'PE1' in self.params and 'PE2' in self.params: allreads_outf.write(self.params['PE1'] + '\n') allreads_outf.write(self.params['PE2'] + '\n') if 'SE' in self.params: allreads_outf.write(self.params['SE'] + '\n') allreads_outf.close() #Do blat mapping args = [ 'blat', self.params['reference'], os.path.join(working_dir, 'reads.txt') ] if self.params['format'] == 'fastq': args.append('-fastq') if self.params['fastmap']: args.append('-fastMap') #Some new experimental params to increase specificity after the first iteration: if self.params['maskrepeats']: args.append("-mask=lower") if self.params['iteration'] > 0 or not self.params['sloppymapping']: args.append("-minIdentity=98") args.append("-minScore=40") args.append(os.path.join(working_dir, 'mapping.psl')) logger.info("Sample: %s Calling blat mapper" % self.params['sample']) logger.debug(" ".join(args)) try: ret = subprocess.call(args, stdout=out, stderr=out) except Exception as exc: txt = ( "Sample %s: Unhandeled error running blat mapping, check log file." % self.params['sample']) + '\n\t' + str(exc) raise exceptions.FatalError(txt) finally: out.close() if ret != 0: raise exceptions.FatalError( 'Sample: %s Error running blat mapping, check log file. \n\t %s' % (self.params['sample'], " ".join(args))) #Extract the PSL to a dict self.params['mapping_dict'] = self.PSL_to_dict( os.path.join(working_dir, 'mapping.psl')) #Cleanup os.remove(os.path.join(working_dir, 'mapping.psl')) out.close()
def write_target(self, target, target_folder, outf, finished=False, map_against_reads=False, killed=False, status=None): # either map_against_reads was passed in, or # no contigs were assembled and target isn't finished, or # assembler crashed and no contig file was created # --> write reads as contigs num_contigs = 0 # store how many contigs were written out contig_length = 0 if finished and status is None: status = 'Finished' if killed: status = 'Killed' if map_against_reads is False and killed is False: if self.params['assembler'] == 'newbler': contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454AllContigs.fna") elif self.params['assembler'] == 'spades': contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "contigs.fasta") #add support for a special output if this is the final assembly and newbler -cdna was used: if finished and self.params['cdna'] and self.params[ 'assembler'] == 'newbler': self.writeCDNAresults(target, target_folder, outf, contigf) elif os.path.exists(contigf): i = 0 contig_inf = open(contigf, 'r') for contig in SeqIO.parse(contig_inf, 'fasta'): i += 1 if finished: contig.name = contig.id = self.params[ 'sample'] + "_:_" + target + "_:_" + "Contig%03d" % i else: contig.name = contig.id = self.params[ 'sample'] + "_:_" + target + "_:_" + "Unfinished%03d" % i contig = contig.upper() #Only mask repeats on intermediate iterations. if self.params['maskrepeats'] and not finished: #contig.seq = Seq(str(mask_seq(contig.seq.tostring(), self.params['mapper']))) contig.seq = Seq( str( mask_seq(str(contig.seq), self.params['mapper']))) #Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't if len(contig.seq) != contig.seq.count('n'): SeqIO.write(contig, outf, "fasta") contig_length += len(contig.seq) contig_inf.close() logger.info( "Sample: %s target: %s iteration: %s Finished writing %s contigs " % (self.params['sample'], target, self.params['iteration'], i)) num_contigs += i #if i == 0 and finished is False and self.params['iteration'] < 2: # map_against_reads = True if map_against_reads: i = 0 logger.info("Sample %s target %s: Writing reads as contigs." % (self.params['sample'], target)) if 'PE1' in self.params and 'PE2' in self.params: inf_PE1n = os.path.join(target_folder, "PE1." + self.params['format']) inf_PE2n = os.path.join(target_folder, "PE2." + self.params['format']) if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n): inf_PE1 = open(inf_PE1n, 'r') inf_PE2 = open(inf_PE2n, 'r') for r in SeqIO.parse(inf_PE1, self.params['format']): i += 1 r.name = r.id = self.params[ 'sample'] + "_:_" + target + "_:_" + "Read%04d" % i SeqIO.write(r, outf, "fasta") for r in SeqIO.parse(inf_PE2, self.params['format']): i += 1 r.name = r.id = self.params[ 'sample'] + "_:_" + target + "_:_" + "Read%04d" % i SeqIO.write(r, outf, "fasta") inf_PE1.close() inf_PE2.close() if 'SE' in self.params: inf_SEn = os.path.join(target_folder, "SE." + self.params['format']) if os.path.exists(inf_SEn): inf_SE = open(inf_SEn, 'r') for r in SeqIO.parse(inf_SE, self.params['format']): i += 1 r.name = r.id = self.params[ 'sample'] + "_:_" + target + "_:_" + "Read%04d" % i SeqIO.write(r, outf, "fasta") inf_SE.close() num_contigs += i if finished or killed: #Write reads: if 'PE1' in self.params and 'PE2' in self.params: inf_PE1n = os.path.join(target_folder, "PE1." + self.params['format']) inf_PE2n = os.path.join(target_folder, "PE2." 
+ self.params['format']) if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n): inf_PE1 = open(inf_PE1n, 'r') inf_PE2 = open(inf_PE2n, 'r') outf_PE1 = open( os.path.join(self.params['finished_dir'], "PE1." + self.params['format']), 'a') outf_PE2 = open( os.path.join(self.params['finished_dir'], "PE2." + self.params['format']), 'a') for r in SeqIO.parse(inf_PE1, self.params['format']): r.description = self.params['sample'] + "_:_" + target SeqIO.write(r, outf_PE1, self.params['format']) for r in SeqIO.parse(inf_PE2, self.params['format']): r.description = self.params['sample'] + "_:_" + target SeqIO.write(r, outf_PE2, self.params['format']) outf_PE1.close() outf_PE2.close() if 'SE' in self.params: inf_SEn = os.path.join(target_folder, "SE." + self.params['format']) if os.path.exists(inf_SEn): inf_SE = open(inf_SEn, 'r') outf_SE = open( os.path.join(self.params['finished_dir'], "SE." + self.params['format']), 'a') for r in SeqIO.parse(inf_SE, self.params['format']): r.description = self.params['sample'] + "_:_" + target SeqIO.write(r, outf_SE, self.params['format']) outf_SE.close() # Finally a special case for situations where assembly of a target is killed, but contigs exist from # a previous assembly. Note that we only do this when not running in cDNA mode. if killed and self.params['iteration'] > 1 and not self.params['cdna']: #No contigs will be available, however contigs from the previous iteration will be present in # I00N_contigs.fasta, grab these and write them out instead logger.info( "Sample: %s target: %s iteration: %s Writing contigs from previous iteration." % (self.params['sample'], target, self.params['iteration'])) contigf = os.path.join( self.params['working_dir'], 'I%03d' % (self.params['iteration'] - 1) + '_contigs.fasta') if os.path.exists(contigf): for contig in SeqIO.parse(contigf, 'fasta'): if contig.id.split("_:_")[1] == target: contig.name = contig.id = contig.id.replace( "Unfinished", "Contig") SeqIO.write(contig, outf, "fasta") num_contigs += 1 contig_length += len(contig.seq) #Cleanup temporary assembly, and reads: if not self.params['keepassemblies']: os.system("rm -rf %s" % target_folder) #write out target stats: if finished or killed: writeTargetStats(finished_dir=self.params['finished_dir'], sample=self.params['sample'], target=target, targetLength=self.params['summary_stats'][target] ['targetLength'], status=status, iteration=self.params['iteration'], readcount=self.params['readcounts'][target][ self.params['iteration']], num_contigs=num_contigs, contig_length=contig_length) del self.params['summary_stats'][target] #writeTargetStats(target, status, num_contigs, contig_length, self.params) #summary_stats[target] = {'RefLen': len(t), 'Status': 'NA', 'Iteration': None, # 'Reads': None, 'Contigs': None, 'ContigLength': None} #[self.params['Sample'], target, 'TargetLength', 'Status', 'Iteration', 'Reads', 'Contigs', 'ContigLength']) + '\n') if finished or killed: return 0 else: return num_contigs
def run_bowtie2(self): """ Builds idx and runs bowtie2 -I 0 -X 1500 --local Expects params: sample, target, reference, working_dir, PE1 and PE2 and/or SE """ #Check for necessary params: if not ('sample' in self.params and 'reference' in self.params and 'working_dir' in self.params and (('PE1' in self.params and 'PE2' in self.params) or 'SE' in self.params)): raise exceptions.FatalError('Missing params in run_bowtie2.') #Check for necessary files: if os.path.exists(self.params['reference']) is False: raise exceptions.FatalError("Missing reference file for mapping") if 'PE1' in self.params and 'PE2' in self.params: if not (os.path.exists(self.params['PE1']) and os.path.exists(self.params['PE2'])): raise exceptions.FatalError( "One or both PE files can not be found for mapping.") if 'SE' in self.params: if not os.path.exists(self.params['SE']): raise exceptions.FatalError("SE file cannot be found.") #Make idx directory try: working_dir = self.params['working_dir'] idx_dir = os.path.realpath(os.path.join(working_dir, 'idx')) os.mkdir(idx_dir) except Exception as exc: txt = "Sample: %s Error creating working directory." % ( self.params['sample']) + '\n\t' + str(exc) raise exceptions.FatalError(txt) #Check whether to log to temporary file, or default to os.devnull if 'verbose' in self.params: out = open(os.path.join(working_dir, "mapping_log.txt"), 'w') else: out = open(os.devnull, 'w') #Set up a path to the index base = os.path.join(idx_dir, 'idx') #Build index #The idea is to map against the finished contigs and in-progress # contigs, thereby ensuring that the -k parameter (or best map) # are respected properly, and avoid the situation where reads which # were mapped to a now finished target might later be mapped to a an # in-progress target. fin_outf = os.path.join(self.params['finished_dir'], 'contigs.fasta') args = ['bowtie2-build', '-f'] if os.path.exists(fin_outf) and os.path.getsize(fin_outf) > 0: args.append(','.join((fin_outf, self.params['reference']))) else: args.append(self.params['reference']) args.append(base) logger.info("Sample: %s Calling bowtie2-build." % self.params['sample']) logger.info(" ".join(args)) try: ret = subprocess.call(args, stdout=out, stderr=out) except Exception as exc: txt = ("Sample %s: Unhandeled error running bowtie2-build" % self.params['sample']) + '\n\t' + str(exc) # make sure that out is closed before throwing exception out.close() raise exceptions.FatalError(txt) if ret != 0: out.close() raise exceptions.FatalError( "Sample: %s Error creating bowtie2 index, check log file." % self.params['sample']) #Do bowtie2 mapping: n_bowtieprocs = int( round( max( float(self.params['nprocs']) / len(self.params['Samples']), 1))) args = ['bowtie2', '-I', '0', '-X', '1500', '--no-unal'] #Tune the sensitivity so that on the first iteration the mapper is # very sensitive. On later iterations the mapper is very specific. 
if self.params['iteration'] == 0 and self.params['sloppymapping']: args.append("--very-sensitive-local") else: args += [ "--very-fast-local", "--mp", "12", "--rdg", "12,6", "--rfg", "12,6" ] args += ['-p', str(n_bowtieprocs), '-x', base] if self.params['bowtie2_k'] > 1: args += ['-k', str(self.params['bowtie2_k'])] if self.params['format'] == 'fasta': args += ['-f'] if 'PE1' in self.params and 'PE2' in self.params: args += ['-1', self.params['PE1'], '-2', self.params['PE2']] if 'SE' in self.params: args += ['-U', self.params['SE']] args += ['-S', os.path.join(working_dir, 'mapping.sam')] logger.info("Sample: %s Calling bowtie2 mapper" % self.params['sample']) logger.info(" ".join(args)) try: ret = subprocess.call(args, stdout=out, stderr=out) out.close() except Exception as exc: txt = ("Sample %s: Unhandeled error running bowtie2 mapping" % self.params['sample']) + '\n\t' + str(exc) raise exceptions.FatalError(txt) out.close() if ret != 0: raise exceptions.FatalError( "Sample %s: Bowtie2 mapping returned an error, check log file." % self.params['sample']) #Extract the SAM to a dict self.params['mapping_dict'] = self.SAM_to_dict( os.path.join(working_dir, 'mapping.sam')) #clean up intermediary files: os.remove(os.path.join(working_dir, 'mapping.sam')) os.system("rm -rf %s" % idx_dir)
def run(self): logger.info("Starting...") logger.debug("Setting up workers.") for i in range(self.nprocs): worker = ProcessRunner( i, self.q, self.status, self.stats, self.pid) self.workers.append(worker) worker.daemon = False worker.start() while True: try: self.q.join() # This shouldn't be needed but we will check just in case if self.all_workers_waiting(): logger.debug("Workers are all waiting and the queue is empty. Exiting") break else: logger.debug("Workers are not in a waiting state. Waiting for more.") time.sleep(5) except exceptions.FatalError: logger.error("A fatal error was encountered.") self.killall() raise except (KeyboardInterrupt, SystemExit): logger.error("Terminating processes") self.killall() raise except Exception as e: ex_type, ex, tb = sys.exc_info() logger.error("\n".join(traceback.format_exception(ex_type, ex, tb))) logger.error("An unhandled exception occurred") self.killall() raise finally: # Kill 'em all! self.killall() logger.info("-----") logger.info("%d processes returned ok." % (self.stats[0])) logger.info("%d processes had to be rerun." % (self.stats[1])) logger.info("-----") logger.info("%d Mapper jobs run." % (self.stats[2])) logger.info("%d Assembly jobs run." % (self.stats[3])) logger.info("%d Checker jobs run." % (self.stats[4])) logger.info("%d Finisher jobs run." % (self.stats[5])) logger.info("-----")
def writeCDNAresults(self, target, target_folder, outf, contigf): """ This is ONLY called when a cDNA target is finished. When doing a cDNA type run, it is very useful to have both the following: 1) All contigs that belong to a gene (isogroup) - It would be particularly good to re-orient these if they are in RC. 2) Total number of reads assembled in each gene (isogroup) Additionally it would be excellent to some day also get the following: 3) Transcript (isotig) structure 4) Estimate of isotig specific reads. """ if self.params['assembler'] == 'newbler': contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454AllContigs.fna") isotigsf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454IsotigsLayout.txt") readstatusf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454ReadStatus.txt") else: logger.info("WARNING writeCDNAresults called when assembler was not Newbler") return None if not (os.path.exists(contigf) and os.path.exists(isotigsf) and os.path.exists(readstatusf)): logger.info("CDNA WARNING MISSING FILE!! %s %s" % (target, self.params['sample'])) logger.info(contigf, os.path.exists(contigf)) logger.info(isotigsf, os.path.exists(isotigsf)) logger.info(readstatusf, os.path.exists(readstatusf)) return None #Storage data structures: isogroups = {} # A dict of isogroups which each contain an in-order list of contigs readcounts = Counter() # A dict of all contigs, these contain read counts (from ReadStatus) contig_orientation = {} contig_to_isogroup = {} contig_idx = SeqIO.index(contigf, "fasta") # Parse isotigsf: igroup = "" #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf for l in open(isotigsf, 'r'): #Handle lines with only a '\n' if l == '\n': pass #Handle lines for isogroup: elif l[0:9] == '>isogroup': igroup = l.strip().split()[0].strip(">") #Handle lines containing all contigs: elif l.strip().split()[0] == 'Contig': l2 = l.strip().split() contigs = map(lambda x: "contig" + x, l2[2:-1]) isogroups[igroup] = contigs for contig in contigs: if contig not in contig_orientation: contig_orientation[contig] = '+' contig_to_isogroup[contig] = igroup else: raise exceptions.FatalError('Contig %s in %s more than once' % (contig, contigf)) #Handle lines containing contig orientation info: elif l[0:6] == 'isotig': l2 = l[l.find(" ") + 1: l.rfind(" ") - 1] l3 = [l2[i:i+6] for i in range(0, len(l2), 6)] for i in range(len(l3)): if l3[i][0] == '<': # contig is in reverse orientation contig = isogroups[igroup][i] contig_orientation[contig] = '-' #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation) #Now parse readstatus: inf = open(readstatusf, 'r') inf.readline() # discard first line for l in inf: l2 = l.strip().split('\t') #Determine if this read was assembled if len(l2) == 8: contig = l2[2] # Note that there are some built in limits to the number of contigs that can be in an isogroup: # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/ # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file. 
if contig in contig_to_isogroup: readcounts[contig_to_isogroup[contig]] += 1 else: readcounts['ExceedsThreshold'] += 1 #print self.params['sample'], target, "Parse read status" #Finally, output all of this information appropriately: countsf = open(os.path.join(self.params['finished_dir'], "isogroup_read_counts.tsv"), 'a') sample = self.params['sample'] #First write out readcounts: sample \t target \t isogroup \t readcount for isogroup in readcounts: countsf.write('\t'.join([sample, target, isogroup, str(readcounts[isogroup])]) + '\n') countsf.close() #print self.params['sample'], target, "Wrote readcounts" #Next write the contigs in proper order and orientation: ncontigs = 0 nisogroups = 0 for isogroup in isogroups: nisogroups += 1 for contig in isogroups[isogroup]: ncontigs += 1 seqrec = contig_idx[contig] #print self.params['sample'], target, seqrec if contig_orientation[contig] == '-': seqrec.seq = seqrec.seq.reverse_complement() #print self.params['sample'], target, seqrec seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig #print self.params['sample'], target, seqrec SeqIO.write(seqrec, outf, "fasta") ## TODO: add support for the ExceedsThreshold contigs logger.info("Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups " % (self.params['sample'], target, self.params['iteration'], ncontigs, nisogroups))
def start(self):
    sample = self.params['sample']
    logger.info("Sample: %s Starting finisher" % self.params['sample'])
    finished_dir = self.params['finished_dir']
    sample_finished = False
    targets_written = 0
    iteration = self.params['iteration']

    # Set up output for both finished and additional-mapping outputs:
    fin_outf = open(os.path.join(finished_dir, 'contigs.fasta'), 'a')
    remap_outf = open(os.path.join(self.params['working_dir'],
                                   'I%03d' % self.params['iteration'] + '_contigs.fasta'), 'w')

    # Check whether the sample is globally finished:
    if self.params['iteration'] >= self.params['numcycles']:
        sample_finished = True

    # Loop over the current set of target folders:
    for target_folder in self.params['targets']:
        # Extract target-specific details:
        target_map_against_reads = False
        safe_target = target_folder.split("/")[-1]  # get last element of path name
        target = self.params['safe_targets'][safe_target]
        # readcounts is a Counter, so no key errors can occur:
        cur_reads = self.params['readcounts'][target][iteration]
        previous_reads = self.params['readcounts'][target][iteration - 1]

        # Get finished assembly status:
        with open(os.path.join(target_folder, 'finished'), 'r') as finishedf:
            l = finishedf.readline().strip().split()[0]
        logger.info("Sample: %s target: %s finishing target.." % (self.params['sample'], target))
        logger.info("Sample: %s target: %s iteration: %s Assembly reports status: %s." %
                    (sample, target, self.params['iteration'], l))

        if l in ('assembly_failed', 'map_against_reads'):
            target_map_against_reads = True
        if l == 'assembly_killed':
            # Only write out the reads; the assembly won't have contigs:
            self.write_target(target, target_folder, outf=fin_outf, finished=False,
                              map_against_reads=False, killed=True)
        elif sample_finished:
            # Everything goes into the final file/folders:
            self.write_target(target, target_folder, outf=fin_outf, finished=True)
        elif target_map_against_reads and cur_reads > previous_reads and iteration < 3:
            # Only map against reads if mapping is still improving and we haven't
            # already been mapping for multiple iterations:
            targets_written += self.write_target(target, target_folder, outf=remap_outf,
                                                 finished=False, map_against_reads=True)
        else:
            # Check read counts and either retire the target or send it back for
            # re-mapping, depending on mapped reads:
            if iteration > 1 and cur_reads != 0 and previous_reads != 0:
                # Use float division so the ratio isn't truncated under Python 2:
                if float(cur_reads) / previous_reads > self.params['max_incorporation']:
                    logger.info("Sample %s target %s hit a repetitive region, no more mapping will be done"
                                % (self.params['sample'], target))
                    self.write_target(target, target_folder, outf=fin_outf,
                                      finished=True, status='Repeat')
                elif cur_reads <= previous_reads and iteration > 2:
                    # Give the mapper a couple of extra iterations in case the first
                    # mapping got a lot of reads which didn't assemble:
                    logger.info("Sample %s target %s did not incorporate any more reads, no more mapping will be done"
                                % (self.params['sample'], target))
                    self.write_target(target, target_folder, outf=fin_outf, finished=True)
                else:
                    # Nothing fancy is going on; just write the contigs out for remapping:
                    targets_written += self.write_target(target, target_folder,
                                                         outf=remap_outf, finished=False)
            else:
                # Nothing fancy is going on; just write the contigs out for remapping:
                targets_written += self.write_target(target, target_folder,
                                                     outf=remap_outf, finished=False)

    fin_outf.flush()
    remap_outf.flush()
    fin_outf.close()
    remap_outf.close()

    if targets_written > 0:
        # Build a new mapper and put it on the queue:
        from ARC.runners import Mapper
        mapper_params = {}
        for k in self.params:
            mapper_params[k] = self.params[k]
        del mapper_params['targets']
        mapper_params['reference'] = os.path.join(self.params['working_dir'],
                                                  'I%03d' % self.params['iteration'] + '_contigs.fasta')
        self.submit(Mapper.to_job(mapper_params))
        logger.info("Sample: %s Added new mapper to queue: iteration %s" %
                    (self.params['sample'], self.params['iteration']))
    else:
        logger.info("Sample: %s Mapper not added to queue. Work finished." % self.params['sample'])
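# The assembler/finisher handshake above rides on a one-line 'finished' file:
# the assembler writes "<status>\t<elapsed>" and the finisher reads only the
# first whitespace-separated token. A minimal sketch of that round trip
# (standalone helper names and paths here are illustrative, not ARC's):
import os

def write_status(target_dir, status, elapsed):
    with open(os.path.join(target_dir, 'finished'), 'w') as outf:
        outf.write("%s\t%s" % (status, elapsed))

def read_status(target_dir):
    with open(os.path.join(target_dir, 'finished'), 'r') as inf:
        return inf.readline().strip().split()[0]

# write_status('/tmp/t__000001', 'assembly_complete', 12.3)
# read_status('/tmp/t__000001')  ->  'assembly_complete'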
def run_blat(self):
    # Check for necessary params:
    if not ('sample' in self.params and 'reference' in self.params and
            'working_dir' in self.params and
            (('PE1' in self.params and 'PE2' in self.params) or 'SE' in self.params)):
        raise exceptions.FatalError('Missing self.params in run_blat.')
    # Check for necessary files:
    if os.path.exists(self.params['reference']) is False:
        raise exceptions.FatalError("Missing reference file for mapping")
    if 'PE1' in self.params and 'PE2' in self.params:
        if not (os.path.exists(self.params['PE1']) and os.path.exists(self.params['PE2'])):
            raise exceptions.FatalError("One or both PE files cannot be found for mapping.")
    if 'SE' in self.params:
        if not os.path.exists(self.params['SE']):
            raise exceptions.FatalError("SE file cannot be found.")

    # Blat doesn't need an index.
    working_dir = self.params['working_dir']

    # Log to a temporary file in verbose mode, otherwise to os.devnull:
    if 'verbose' in self.params:
        out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
    else:
        out = open(os.devnull, 'w')

    # Build a temporary txt file listing all of the read files:
    allreads_outf = open(os.path.join(working_dir, 'reads.txt'), 'w')
    if 'PE1' in self.params and 'PE2' in self.params:
        allreads_outf.write(self.params['PE1'] + '\n')
        allreads_outf.write(self.params['PE2'] + '\n')
    if 'SE' in self.params:
        allreads_outf.write(self.params['SE'] + '\n')
    allreads_outf.close()

    # Do blat mapping:
    args = ['blat', self.params['reference'], os.path.join(working_dir, 'reads.txt')]
    if self.params['format'] == 'fastq':
        args.append('-fastq')
    if self.params['fastmap']:
        args.append('-fastMap')
    # Some experimental params to increase specificity after the first iteration:
    if self.params['maskrepeats']:
        args.append("-mask=lower")
    if self.params['iteration'] > 0 or not self.params['sloppymapping']:
        args.append("-minIdentity=98")
        args.append("-minScore=40")
    args.append(os.path.join(working_dir, 'mapping.psl'))

    logger.info("Sample: %s Calling blat mapper" % self.params['sample'])
    logger.debug(" ".join(args))
    try:
        ret = subprocess.call(args, stdout=out, stderr=out)
    except Exception as exc:
        txt = ("Sample %s: Unhandled error running blat mapping, check log file." %
               self.params['sample']) + '\n\t' + str(exc)
        raise exceptions.FatalError(txt)
    finally:
        out.close()
    if ret != 0:
        raise exceptions.FatalError('Sample: %s Error running blat mapping, check log file. \n\t %s' %
                                    (self.params['sample'], " ".join(args)))

    # Extract the PSL to a dict:
    self.params['mapping_dict'] = self.PSL_to_dict(os.path.join(working_dir, 'mapping.psl'))
    # Cleanup:
    os.remove(os.path.join(working_dir, 'mapping.psl'))
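# For illustration only: with hypothetical inputs, the argument list built in
# run_blat flattens to a command line following BLAT's
# "blat <database> <query> [options] <output.psl>" convention, e.g.:
#
#   blat I002_contigs.fasta reads.txt -fastq -minIdentity=98 -minScore=40 mapping.psl
#
# The -fastq flag assumes a BLAT build patched to accept fastq queries, as the
# code above implies; stock BLAT expects fasta/nib/2bit inputs.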
def log(self, msg):
    if logger.level() == logging.DEBUG:
        name = self.name
    else:
        name = self.__class__.__name__
    logger.info("%-12s| %s" % (name, msg))
def splitreads(self):
    """ Split reads, then kick off an assembly once the reads are split for a
    target; use safe_targets for folder names. """
    self.params['iteration'] += 1

    # Write out statistics for any/all targets which failed to recruit reads:
    for target in list(self.params['summary_stats'].keys()):
        if target not in self.params['mapping_dict']:
            writeTargetStats(finished_dir=self.params['finished_dir'],
                             sample=self.params['sample'],
                             target=target,
                             targetLength=self.params['summary_stats'][target]['targetLength'],
                             status='NoReads',
                             iteration=self.params['iteration'],
                             readcount=0,
                             num_contigs=0,
                             contig_length=0)
            del self.params['summary_stats'][target]

    checker_params = {}
    for k in self.params:
        checker_params[k] = self.params[k]
    del checker_params['mapping_dict']
    checker_params['targets'] = {}
    iteration = self.params['iteration']

    # Open previously created indexes:
    if 'PE1' in self.params and 'PE2' in self.params:
        idx_PE1 = SeqIO.index_db(os.path.join(self.params['working_dir'], "PE1.idx"),
                                 key_function=keyfunction(self.params['sra']))
        idx_PE2 = SeqIO.index_db(os.path.join(self.params['working_dir'], "PE2.idx"),
                                 key_function=keyfunction(self.params['sra']))
    if 'SE' in self.params:
        idx_SE = SeqIO.index_db(os.path.join(self.params['working_dir'], "SE.idx"),
                                key_function=keyfunction(self.params['sra']))
    if 'readcounts' not in checker_params:
        checker_params['readcounts'] = {}

    statsf = open(os.path.join(self.params['finished_dir'], 'mapping_stats.tsv'), 'a')
    for target in self.params['mapping_dict']:
        startT = time.time()
        target_dir = os.path.join(self.params['working_dir'], self.params['safe_targets'][target])
        if target not in checker_params['readcounts']:
            checker_params['readcounts'][target] = Counter()
        if os.path.exists(target_dir):
            os.system("rm -rf %s" % target_dir)
        os.mkdir(target_dir)

        reads = self.params['mapping_dict'][target]
        # Track how many total reads were added for this cycle:
        checker_params['readcounts'][target][iteration] = len(reads)
        statsf.write('\t'.join([self.params['sample'], target,
                                str(iteration), str(len(reads))]) + '\n')

        SEs = PEs = 0
        # Note: the original condition here ("if 'PE1' and 'PE2' in self.params")
        # only tested PE2; both keys must be checked:
        if 'PE1' in self.params and 'PE2' in self.params:
            outf_PE1 = open(os.path.join(target_dir, "PE1." + self.params['format']), 'w')
            outf_PE2 = open(os.path.join(target_dir, "PE2." + self.params['format']), 'w')
        if 'SE' in self.params:
            outf_SE = open(os.path.join(target_dir, "SE." + self.params['format']), 'w')

        for readID in reads:
            # Optionally subsample reads (keep each read with probability ~subsample):
            if self.params['subsample'] < 1 and randint(0, 100) > self.params['subsample'] * 100:
                continue
            if 'PE1' in self.params and readID in idx_PE1:
                read1 = idx_PE1.get(readID, None)
                read2 = idx_PE2.get(readID, None)
                if read2 is None:
                    raise exceptions.FatalError(
                        "ERROR: ReadID %s was found in PE1 file but not PE2" % readID)
                new_readID = readID.replace(":", "_") + ":0:0:0:0#0/"
                read1.id = read1.name = new_readID + "1"
                read2.id = read2.name = new_readID + "2"
                SeqIO.write(read1, outf_PE1, self.params['format'])
                SeqIO.write(read2, outf_PE2, self.params['format'])
                PEs += 1
            elif 'SE' in self.params and readID in idx_SE:
                read1 = idx_SE[readID]
                read1.id = read1.name = readID.replace(":", "_") + ":0:0:0:0#0/"
                SeqIO.write(read1, outf_SE, self.params['format'])
                SEs += 1

        if 'PE1' in self.params and 'PE2' in self.params:
            outf_PE1.close()
            outf_PE2.close()
        if 'SE' in self.params:
            outf_SE.close()

        # Build assembly job:
        assembly_params = {}
        assembly_params['target'] = target
        assembly_params['target_dir'] = target_dir
        assembly_params['iteration'] = iteration
        assembly_params['last_assembly'] = False
        assembler_keys = ['assembler', 'sample', 'verbose', 'format', 'assemblytimeout',
                          'map_against_reads', 'urt', 'numcycles', 'cdna', 'rip',
                          'only-assembler']
        for k in assembler_keys:
            assembly_params[k] = self.params[k]
        # readcounts is a Counter, so no key errors can occur:
        cur_reads = checker_params['readcounts'][target][iteration]
        previous_reads = checker_params['readcounts'][target][iteration - 1]

        # Turn off URT in situations where this will be the last iteration due to read counts:
        if (cur_reads <= previous_reads and iteration > 2) or iteration >= self.params['numcycles']:
            logger.info("Sample: %s target: %s iteration: %s Setting last_assembly to True" %
                        (self.params['sample'], target, self.params['iteration']))
            assembly_params['last_assembly'] = True

        # Properly handle the case where no reads ended up mapping for the PE or SE inputs:
        if PEs > 0:
            assembly_params['assembly_PE1'] = os.path.join(target_dir, "PE1." + self.params['format'])
            assembly_params['assembly_PE2'] = os.path.join(target_dir, "PE2." + self.params['format'])
        if SEs > 0:
            assembly_params['assembly_SE'] = os.path.join(target_dir, "SE." + self.params['format'])

        # All reads have been written at this point; add an assembly to the queue:
        logger.info("Sample: %s target: %s iteration: %s Split %s reads in %s seconds" %
                    (self.params['sample'], target, self.params['iteration'],
                     len(reads), time.time() - startT))

        # Only add an assembly job and AssemblyChecker target if there are >0 reads:
        if PEs + SEs > 0:
            checker_params['targets'][target_dir] = False
            self.submit(Assembler.to_job(assembly_params))

    statsf.close()
    logger.info("------------------------------------")
    logger.info("| Sample: %s Iteration %s of numcycles %s" %
                (checker_params['sample'], checker_params['iteration'],
                 checker_params['numcycles']))
    logger.info("------------------------------------")

    if 'PE1' in self.params and 'PE2' in self.params:
        idx_PE1.close()
        idx_PE2.close()
        del idx_PE1
        del idx_PE2
    if 'SE' in self.params:
        idx_SE.close()
        del idx_SE

    # Kick off a job which checks whether all assemblies are done and, if not,
    # adds a copy of itself back onto the job queue:
    if len(checker_params['targets']) > 0:
        self.submit(AssemblyChecker.to_job(checker_params))
    else:
        logger.info("Sample: %s No reads mapped, no more work to do." % checker_params['sample'])
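# The per-read subsampling in splitreads keeps each read independently with
# probability roughly equal to params['subsample']. A minimal self-contained
# sketch of the same decision rule (helper name is illustrative, not ARC's):
from random import randint

def keep_read(subsample):
    """Return True if a read should be kept at the given subsample fraction."""
    if subsample >= 1:
        return True  # subsampling disabled
    # randint(0, 100) is uniform over 101 integers, so the kept fraction is
    # only approximately `subsample`:
    return randint(0, 100) <= subsample * 100

# e.g. with subsample=0.25, roughly a quarter of calls return True.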
def RunNewbler(self):
    """
    Run the Newbler assembler.

    Expects params keys:
        assembly_PE1 and assembly_PE2, and/or assembly_SE
        target_dir
        urt
    """
    # Check for necessary params:
    if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params) or
            'assembly_SE' in self.params):
        raise exceptions.FatalError('Missing self.params in RunNewbler.')
    # Check for necessary files (both PE files must exist):
    if ('assembly_PE1' in self.params and 'assembly_PE2' in self.params and
            not (os.path.exists(self.params['assembly_PE1']) and
                 os.path.exists(self.params['assembly_PE2']))):
        raise exceptions.FatalError('Missing PE files in RunNewbler.')
    if 'assembly_SE' in self.params and not os.path.exists(self.params['assembly_SE']):
        raise exceptions.FatalError('Missing SE file in RunNewbler.')

    sample = self.params['sample']
    target = self.params['target']
    killed = False
    failed = False

    # Determine whether to pipe output to a file or /dev/null:
    if self.params['verbose']:
        out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w')
    else:
        out = open(os.devnull, 'w')

    # Build args for newAssembly:
    args = ['newAssembly', '-force']
    if self.params['last_assembly'] and self.params['cdna']:
        # Only run with the cdna switch on the final assembly:
        args += ['-cdna']
    args += [os.path.join(self.params['target_dir'], 'assembly')]
    logger.debug("Calling newAssembly for sample: %s target %s" % (sample, target))
    logger.debug(" ".join(args))
    ret = subprocess.call(args, stdout=out, stderr=out)

    # Build args for addRun:
    if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
        args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
        args += [self.params['assembly_PE1']]
        logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)

        args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
        args += [self.params['assembly_PE2']]
        logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)
    if 'assembly_SE' in self.params:
        args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
        args += [self.params['assembly_SE']]
        logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)

    # Build args for runProject:
    args = ['runProject']
    args += ['-cpu', '1']
    if self.params['last_assembly'] and self.params['cdna']:
        args += ['-noace']
    else:
        args += ['-nobig']
    if self.params['urt'] and not self.params['last_assembly']:
        # Only run with the -urt switch when it isn't the final assembly:
        args += ['-urt']
    if self.params['rip']:
        args += ['-rip']
    args += [os.path.join(self.params['target_dir'], 'assembly')]

    pid = None
    try:
        start = time.time()
        logger.debug("Calling runProject for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.Popen(args, stdout=out, stderr=out)
        pid = ret.pid
        while ret.poll() is None:
            if time.time() - start > self.params['assemblytimeout']:
                self.kill_process_children(pid)
                logger.warn("Sample: %s target: %s iteration: %s Killing assembly after %s seconds" %
                            (sample, target, self.params['iteration'], time.time() - start))
                killed = True
                break
            time.sleep(.5)
    except Exception as exc:
        txt = ("Sample: %s, Target: %s: Unhandled error running Newbler assembly" %
               (self.params['sample'], self.params['target']))
        txt += '\n\t' + str(exc) + "\n" + traceback.format_exc()
        logger.warn(txt)
        failed = True
    finally:
        out.close()
        # Sometimes Newbler doesn't seem to exit completely:
        if pid is not None:
            self.kill_process_children(pid)

    # A non-zero exit status also counts as a failure:
    if not killed and not failed and ret.poll() != 0:
        failed = True
    # killed/failed/complete are mutually exclusive, so chain them with elif
    # (the original fell through and could overwrite "assembly_failed" with
    # "assembly_complete"):
    if killed:
        logger.info("Sample: %s target: %s iteration: %s Assembly killed after %s seconds" %
                    (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_killed\t" + str(time.time() - start))
        outf.close()
    elif failed:
        logger.info("Sample: %s target: %s iteration: %s Assembly failed after %s seconds" %
                    (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_failed\t" + str(time.time() - start))
        outf.close()
    else:
        # Run finished without error:
        logger.info("Sample: %s target: %s iteration: %s Assembly finished in %s seconds" %
                    (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_complete\t" + str(time.time() - start))
        outf.close()
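# The runProject call above implements a poll-with-timeout loop by hand
# (Python 2's subprocess has no timeout argument). A minimal standalone
# sketch of the same pattern; run_with_timeout and its kill callback are
# illustrative names, not part of ARC:
import subprocess
import time

def run_with_timeout(args, timeout, kill_children):
    """Start `args`, poll every 0.5s, and kill the process tree on timeout.
    Returns (returncode, killed)."""
    proc = subprocess.Popen(args)
    start = time.time()
    while proc.poll() is None:
        if time.time() - start > timeout:
            kill_children(proc.pid)  # caller-supplied process-tree killer
            return (None, True)
        time.sleep(0.5)
    return (proc.returncode, False)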
def write_target(self, target, target_folder, outf, finished=False,
                 map_against_reads=False, killed=False, status=None):
    # Either map_against_reads was passed in, or
    # no contigs were assembled and the target isn't finished, or
    # the assembler crashed and no contig file was created
    # --> write the reads out as contigs.
    num_contigs = 0  # track how many contigs were written out
    contig_length = 0
    if finished and status is None:
        status = 'Finished'
    if killed:
        status = 'Killed'

    if map_against_reads is False and killed is False:
        contigf = None  # guard against unsupported assembler values
        if self.params['assembler'] == 'newbler':
            contigf = os.path.join(self.params['working_dir'], target_folder,
                                   "assembly", "assembly", "454AllContigs.fna")
        elif self.params['assembler'] == 'spades':
            contigf = os.path.join(self.params['working_dir'], target_folder,
                                   "assembly", "contigs.fasta")
        # Support a special output when this is the final assembly and
        # newbler -cdna was used:
        if finished and self.params['cdna'] and self.params['assembler'] == 'newbler':
            self.writeCDNAresults(target, target_folder, outf, contigf)
        elif contigf is not None and os.path.exists(contigf):
            i = 0
            contig_inf = open(contigf, 'r')
            for contig in SeqIO.parse(contig_inf, 'fasta'):
                i += 1
                if finished:
                    contig.name = contig.id = (self.params['sample'] + "_:_" + target +
                                               "_:_" + "Contig%03d" % i)
                else:
                    contig.name = contig.id = (self.params['sample'] + "_:_" + target +
                                               "_:_" + "Unfinished%03d" % i)
                contig = contig.upper()
                # Only mask repeats on intermediate iterations:
                if self.params['maskrepeats'] and not finished:
                    contig.seq = Seq(str(mask_seq(str(contig.seq), self.params['mapper'])))
                # Bowtie2 crashes if a contig is all 'n', so only write it out if it isn't:
                if len(contig.seq) != contig.seq.count('n'):
                    SeqIO.write(contig, outf, "fasta")
                    contig_length += len(contig.seq)
            contig_inf.close()
            logger.info("Sample: %s target: %s iteration: %s Finished writing %s contigs" %
                        (self.params['sample'], target, self.params['iteration'], i))
            num_contigs += i

    if map_against_reads:
        i = 0
        logger.info("Sample %s target %s: Writing reads as contigs." %
                    (self.params['sample'], target))
        if 'PE1' in self.params and 'PE2' in self.params:
            inf_PE1n = os.path.join(target_folder, "PE1." + self.params['format'])
            inf_PE2n = os.path.join(target_folder, "PE2." + self.params['format'])
            if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n):
                inf_PE1 = open(inf_PE1n, 'r')
                inf_PE2 = open(inf_PE2n, 'r')
                for r in SeqIO.parse(inf_PE1, self.params['format']):
                    i += 1
                    r.name = r.id = self.params['sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                    SeqIO.write(r, outf, "fasta")
                for r in SeqIO.parse(inf_PE2, self.params['format']):
                    i += 1
                    r.name = r.id = self.params['sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                    SeqIO.write(r, outf, "fasta")
                inf_PE1.close()
                inf_PE2.close()
        if 'SE' in self.params:
            inf_SEn = os.path.join(target_folder, "SE." + self.params['format'])
            if os.path.exists(inf_SEn):
                inf_SE = open(inf_SEn, 'r')
                for r in SeqIO.parse(inf_SE, self.params['format']):
                    i += 1
                    r.name = r.id = self.params['sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                    SeqIO.write(r, outf, "fasta")
                inf_SE.close()
        num_contigs += i

    if finished or killed:
        # Write the reads out to the finished folder:
        if 'PE1' in self.params and 'PE2' in self.params:
            inf_PE1n = os.path.join(target_folder, "PE1." + self.params['format'])
            inf_PE2n = os.path.join(target_folder, "PE2." + self.params['format'])
            if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n):
                inf_PE1 = open(inf_PE1n, 'r')
                inf_PE2 = open(inf_PE2n, 'r')
                outf_PE1 = open(os.path.join(self.params['finished_dir'],
                                             "PE1." + self.params['format']), 'a')
                outf_PE2 = open(os.path.join(self.params['finished_dir'],
                                             "PE2." + self.params['format']), 'a')
                for r in SeqIO.parse(inf_PE1, self.params['format']):
                    r.description = self.params['sample'] + "_:_" + target
                    SeqIO.write(r, outf_PE1, self.params['format'])
                for r in SeqIO.parse(inf_PE2, self.params['format']):
                    r.description = self.params['sample'] + "_:_" + target
                    SeqIO.write(r, outf_PE2, self.params['format'])
                outf_PE1.close()
                outf_PE2.close()
        if 'SE' in self.params:
            inf_SEn = os.path.join(target_folder, "SE." + self.params['format'])
            if os.path.exists(inf_SEn):
                inf_SE = open(inf_SEn, 'r')
                outf_SE = open(os.path.join(self.params['finished_dir'],
                                            "SE." + self.params['format']), 'a')
                for r in SeqIO.parse(inf_SE, self.params['format']):
                    r.description = self.params['sample'] + "_:_" + target
                    SeqIO.write(r, outf_SE, self.params['format'])
                outf_SE.close()

    # Finally, a special case for situations where assembly of a target was killed
    # but contigs exist from a previous assembly. Note that we only do this when
    # not running in cDNA mode.
    if killed and self.params['iteration'] > 1 and not self.params['cdna']:
        # No new contigs are available, but contigs from the previous iteration are
        # present in I00N_contigs.fasta; grab those and write them out instead:
        logger.info("Sample: %s target: %s iteration: %s Writing contigs from previous iteration." %
                    (self.params['sample'], target, self.params['iteration']))
        contigf = os.path.join(self.params['working_dir'],
                               'I%03d' % (self.params['iteration'] - 1) + '_contigs.fasta')
        if os.path.exists(contigf):
            for contig in SeqIO.parse(contigf, 'fasta'):
                if contig.id.split("_:_")[1] == target:
                    contig.name = contig.id = contig.id.replace("Unfinished", "Contig")
                    SeqIO.write(contig, outf, "fasta")
                    num_contigs += 1
                    contig_length += len(contig.seq)

    # Clean up the temporary assembly and reads:
    if not self.params['keepassemblies']:
        os.system("rm -rf %s" % target_folder)

    # Write out target stats:
    if finished or killed:
        writeTargetStats(finished_dir=self.params['finished_dir'],
                         sample=self.params['sample'],
                         target=target,
                         targetLength=self.params['summary_stats'][target]['targetLength'],
                         status=status,
                         iteration=self.params['iteration'],
                         readcount=self.params['readcounts'][target][self.params['iteration']],
                         num_contigs=num_contigs,
                         contig_length=contig_length)
        del self.params['summary_stats'][target]

    if finished or killed:
        return 0
    else:
        return num_contigs
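# Contig ids written above follow a "sample_:_target_:_ContigNNN" convention,
# and downstream code recovers the target with id.split("_:_")[1]. A minimal
# sketch of that round trip (helper names are illustrative, not ARC's):
def make_contig_id(sample, target, n, finished=True):
    label = ("Contig%03d" % n) if finished else ("Unfinished%03d" % n)
    return sample + "_:_" + target + "_:_" + label

def target_of(contig_id):
    return contig_id.split("_:_")[1]

# make_contig_id("s1", "geneA", 7)     ->  's1_:_geneA_:_Contig007'
# target_of('s1_:_geneA_:_Contig007')  ->  'geneA'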
def setup(self, config):
    """
    Set up the working folder for each sample. Also assign a "safe_target"
    name to each target so that folder creation works. This is a little bit
    tricky, because if the user has targets with the _:_ separator in the
    name it messes up the splitter and SAM_to_dict. This code is therefore
    written with the assumption that the user has put the _:_ in the name
    purposely, so that multiple entries in the reference fasta will be
    treated as a single target.
    """
    format = config['format']
    for sample in config['Samples']:
        s = config['Samples'][sample]
        working_dir = os.path.realpath(config['workingdirectory'] + '/working_' + sample)
        finished_dir = os.path.realpath('./finished_' + sample)
        config['Samples'][sample]['working_dir'] = working_dir
        config['Samples'][sample]['finished_dir'] = finished_dir
        if os.path.exists(working_dir):
            logger.info("WARNING working directory already exists for "
                        "sample %s, deleting old results if any." % (sample))
            os.system('rm -rf %s' % finished_dir)
            os.system('rm -rf %s/t__*' % working_dir)
            os.system('rm -rf %s/*.psl' % working_dir)
            os.system('rm -f %s/I*_contigs.fasta' % working_dir)
            if os.path.exists('%s/idx' % working_dir):
                os.system('rm -rf %s/idx' % working_dir)
            os.mkdir(finished_dir)
        else:
            os.mkdir(working_dir)
            os.mkdir(finished_dir)

        # Create the mapping stats file:
        statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w')
        statsf.write('\t'.join(['Sample', 'Target', 'Iteration', 'Reads']) + '\n')
        statsf.close()

        # Create the Target Summary Table:
        tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"), 'w')
        tstf.write('\t'.join(['Sample', 'Target', 'RefLen', 'Status', 'Iteration',
                              'Reads', 'Contigs', 'ContigLength']) + '\n')
        tstf.close()

        # Create a stats file for cdna:
        if config['cdna']:
            countsf = open(os.path.join(finished_dir, "isogroup_read_counts.tsv"), 'a')
            countsf.write('\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) + '\n')
            countsf.close()

        # Build a separate index for each read file in the input and put them in
        # working_dir. (Consider parallelizing this?)
        p1 = p2 = None
        index_file = None
        try:
            start = time.time()
            if 'PE1' in s:
                if not os.path.exists(os.path.join(working_dir, "PE1.idx")):
                    logger.info("Indexing %s" % s['PE1'])
                    index_file = os.path.join(working_dir, "PE1.idx")
                    p1 = SeqIO.index_db(index_file, s['PE1'], format,
                                        key_function=keyfunction(config['sra']))
            if 'PE2' in s:
                if not os.path.exists(os.path.join(working_dir, "PE2.idx")):
                    logger.info("Indexing %s" % s['PE2'])
                    index_file = os.path.join(working_dir, "PE2.idx")
                    p2 = SeqIO.index_db(index_file, s['PE2'], format,
                                        key_function=keyfunction(config['sra']))
                    # Only compare counts when both indexes were built in this run;
                    # otherwise p1 is undefined:
                    if p1 is not None and len(p1) != len(p2):
                        logger.error("The number of reads in %s and %s do not match, "
                                     "check the config for errors" % (s['PE1'], s['PE2']))
            if 'SE' in s:
                if not os.path.exists(os.path.join(working_dir, "SE.idx")):
                    logger.info("Indexing %s" % s['SE'])
                    index_file = os.path.join(working_dir, "SE.idx")
                    SeqIO.index_db(index_file, s['SE'], format,
                                   key_function=keyfunction(config['sra']))
        except (KeyboardInterrupt, SystemExit):
            if index_file is not None:
                logger.error("Removing partial index: %s" % index_file)
                os.unlink(index_file)
            raise
        logger.info("Sample: %s, indexed reads in %s seconds." % (sample, time.time() - start))

    # Read through the reference and set up a set of safe names for the targets.
    # Also create the Target Summary Table, which is indexed by original target
    # name (following ARC conventions), mask sequences if necessary, and write
    # them to a new set of output files.
    # safe_targets is a two-way lookup, meaning it holds both the safe target ID
    # and the contig ID as keys.
    summary_stats = {}
    safe_targets = {}
    new_refsf = {}
    for sample in config['Samples']:
        s = config['Samples'][sample]
        new_refsf[sample] = open(os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w')

    i = 0
    for t in SeqIO.parse(config['reference'], "fasta"):
        if len(t.name.split("_:_")) == 1:
            target = t.name
        else:
            target = t.name.split("_:_")[1]
        safe_targets[target] = "t__%06d" % i
        safe_targets["t__%06d" % i] = target
        i += 1
        if target not in summary_stats:
            summary_stats[target] = {'targetLength': len(t)}
        else:
            summary_stats[target]['targetLength'] += len(t)

        # Write contigs:
        if config['maskrepeats']:
            t.seq = Seq(str(mask_seq(str(t.seq), config['mapper'])))
        # Bowtie2 crashes if a contig is all 'n', so only write it out if it isn't:
        if len(t) != t.seq.count('n'):
            for outf in new_refsf.values():
                SeqIO.write(t, outf, "fasta")
        else:
            writeTargetStats(finished_dir=s['finished_dir'],
                             sample=sample,
                             target=target,
                             targetLength=summary_stats[target]['targetLength'],
                             status='MaskedOut',
                             iteration=0,
                             readcount=0,
                             num_contigs=0,
                             contig_length=0)
            del summary_stats[target]

    config['safe_targets'] = safe_targets
    config['summary_stats'] = summary_stats
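# safe_targets doubles as a bidirectional map by storing both directions in a
# single dict. A minimal sketch of the invariant (values are illustrative, and
# this assumes user target names never start with the generated "t__" prefix,
# so the two key spaces cannot collide):
safe = {}
safe['my_gene'] = 't__000000'
safe['t__000000'] = 'my_gene'
assert safe[safe['my_gene']] == 'my_gene'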
def info(self, msg):
    if self.loglevel == logging.DEBUG:
        name = self.name
    else:
        name = self.__class__.__name__
    logger.info("%-12s| %s" % (name, msg))
def run(self):
    logger.info("Starting...")
    logger.debug("Setting up workers.")
    for i in range(self.nprocs):
        worker = ProcessRunner(i, self.q, self.status, self.stats, self.pid)
        self.workers.append(worker)
        worker.daemon = False
        worker.start()
    while True:
        try:
            self.q.join()
            # This shouldn't be needed, but check just in case:
            if self.all_workers_waiting():
                logger.debug("Workers are all waiting and the queue is empty. Exiting")
                break
            else:
                logger.debug("Workers are not in a waiting state. Waiting for more.")
                time.sleep(5)
        except exceptions.FatalError:
            logger.error("A fatal error was encountered.")
            self.killall()
            raise
        except (KeyboardInterrupt, SystemExit):
            logger.error("Terminating processes")
            self.killall()
            raise
        except Exception:
            ex_type, ex, tb = sys.exc_info()
            logger.error("\n".join(traceback.format_exception(ex_type, ex, tb)))
            logger.error("An unhandled exception occurred")
            self.killall()
            raise
        finally:
            # Kill 'em all!
            self.killall()
    logger.info("-----")
    logger.info("%d processes returned ok." % (self.stats[0]))
    logger.info("%d processes had to be rerun." % (self.stats[1]))
    logger.info("-----")
    logger.info("%d Mapper jobs run." % (self.stats[2]))
    logger.info("%d Assembly jobs run." % (self.stats[3]))
    logger.info("%d Checker jobs run." % (self.stats[4]))
    logger.info("%d Finisher jobs run." % (self.stats[5]))
    logger.info("-----")