def start(self):
    if 'assembler' not in self.params:
        raise exceptions.FatalError("assembler not defined in params")
    if self.params['map_against_reads'] and self.params['iteration'] == 1:
        self.RunMapAgainstReads()
    elif self.params['assembler'] == 'newbler':
        self.RunNewbler()
    elif self.params['assembler'] == 'spades':
        self.RunSpades()
    else:
        raise exceptions.FatalError("Assembler %s isn't recognized." %
                                    self.params['assembler'])
def check_bins(self, bins):
    for bin in bins:
        try:
            subprocess.check_output(['which', bin])
        except subprocess.CalledProcessError:
            raise exceptions.FatalError(
                "Cannot find %s in path, or the 'which' "
                "command is missing" % (bin))
def SAM_to_dict(self, filename):
    """ Read a SAM file to a mapping dict and return it """
    #Check for necessary files:
    if os.path.exists(filename) is False:
        raise exceptions.FatalError("Missing SAM file")
    try:
        inf = open(filename, 'r')
    except Exception as exc:
        txt = "Failed to open SAM file %s" % filename
        txt += '\n\t' + str(exc)
        raise exceptions.FatalError(txt)
    read_map = {}  # target:{read} dictionary of dictionaries
    i = 0
    discards = 0
    startT = time.time()
    for l in inf:
        i += 1
        if l[0] != "@":  # skip header lines
            l2 = l.strip().split()
            if l2[2] == "*":  # skip unmapped
                continue
            readid = keyfunction(self.params['sra'])(l2[0])  # .split("/")[0]
            target = l2[2]
            #Handle references built using assembled contigs:
            if len(target.split("_:_")) == 3:
                target, status = target.split("_:_")[1:]
                # This keeps ARC from writing reads which mapped to finished contigs
                if status.startswith("Contig") or status.startswith("isogroup"):
                    discards += 1
                    continue
            if target not in read_map:
                read_map[target] = {}
            read_map[target][readid] = 1
    #Report total time:
    logger.info("Sample: %s, Processed %s lines from SAM in %s seconds." %
                (self.params['sample'], i, time.time() - startT))
    if discards > 0:
        logger.info(
            "%s out of %s reads mapped to finished contigs and were not recruited for assembly."
            % (discards, i))
    return read_map
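# Illustrative result of SAM_to_dict (read and target names below are hypothetical,
# not from the source). The RNAME column of each SAM record is either a plain target
# name or, for references that were built from previously assembled contigs, a name of
# the form "<sample>_:_<target>_:_<contig or isogroup id>". Records whose id starts
# with "Contig" or "isogroup" mapped to finished contigs and are only counted in
# 'discards'; everything else is recruited into the returned dict:
#
#   {'targetA': {'read_0001': 1, 'read_0002': 1},
#    'targetB': {'read_0003': 1}}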
def start(self):
    if 'mapper' not in self.params:
        raise exceptions.FatalError("mapper not defined in params")
    if self.params['mapper'] == 'bowtie2':
        logger.info("Sample: %s Running bowtie2." % self.params['sample'])
        self.run_bowtie2()
    elif self.params['mapper'] == 'blat':
        logger.info("Sample: %s Running blat." % self.params['sample'])
        self.run_blat()
    #Mapping is done, run splitreads:
    logger.info("Sample: %s Running splitreads." % self.params['sample'])
    self.splitreads()
def check(self):
    #Check that the reference file exists
    if 'reference' in self.config:
        self.config['reference'] = os.path.realpath(self.config['reference'])
        if not os.path.exists(self.config['reference']):
            raise exceptions.FatalError("Error, cannot find reference %s" %
                                        (self.config['reference']))
    else:
        raise exceptions.FatalError('Error, reference not included in %s' %
                                    self.filename)
    #Check to see if the samples are valid
    if len(self.config['Samples']) > 0:
        for sample in self.config['Samples']:
            pe_one = 'PE1' in self.config['Samples'][sample]
            pe_two = 'PE2' in self.config['Samples'][sample]
            pe = pe_one and pe_two
            se = 'SE' in self.config['Samples'][sample]
            if not (pe or se):
                raise exceptions.FatalError(
                    "Error, you must specify PE files and/or a SE file for "
                    "each sample.")
    else:
        raise exceptions.FatalError("Could not find samples in %s" %
                                    self.filename)
    if self.config['format'] not in self.FORMATS:
        raise exceptions.FatalError(
            "Error, file format not specified in ARC_config.txt.")
    if self.config['mapper'] not in self.MAPPERS:
        raise exceptions.FatalError("Error, mapper must be one of: %s" %
                                    (', '.join(self.MAPPERS.keys())))
    else:
        self.check_bins(self.MAPPERS[self.config['mapper']])
    if self.config['assembler'] not in self.ASSEMBLERS:
        raise exceptions.FatalError("Error, assembler must be one of: %s" %
                                    (', '.join(self.ASSEMBLERS.keys())))
    else:
        self.check_bins(self.ASSEMBLERS[self.config['assembler']])
    if self.config['subsample'] <= 0 or self.config['subsample'] > 1:
        raise exceptions.FatalError(
            "Error, you must specify a value greater than 0 and less than "
            "or equal to 1 for subsample")
def set_defaults(self):
    for key, value in self.OPTIONS.iteritems():
        if key not in self.config:
            if value is None:
                raise exceptions.FatalError(
                    "Error, %s required but not specified in "
                    "ARC_config.txt" % key)
            else:
                logger.info(
                    "%s not specified in ARC_config.txt, defaulting to "
                    "%s" % (key, value))
                self.config[key] = value
    # Anything listed below here is not expected to be in the config but
    # needs to be initialized
    self.config['iteration'] = 0
def __init__(self, filename):
    if os.path.exists(filename) is False:
        raise exceptions.FatalError(
            "Error, you must run ARC in a folder containing "
            "ARC_config.txt")
    self.filename = filename
    # Initialize config
    self.config = {}
    # Read config file, set the defaults, and check
    self.read()
    self.set_defaults()
    self.check()
    self.convert()
def read(self):
    infile = open(self.filename, 'r')
    # Read in comments and globals. Treats '##' as comments and '#' as
    # global variables.
    while True:
        line = infile.readline()
        if not line:
            break
        line = line.strip()
        # Blank line
        if line == "":
            continue
        arr = line.split()
        if arr[0] == "#":
            cfg = arr[1].split('=')
            if len(cfg) != 2 or cfg[1] == "":
                raise exceptions.FatalError(
                    "Error, parameters not specified correctly, please "
                    "use # name=value. Offending entry: \n\t%s" % arr[1])
            # Go ahead and convert the things that should be ints to ints
            key = cfg[0].strip()
            value = cfg[1].strip()
            if re.match(r"^[0-9]*\.[0-9]+$", value):
                self.config[key] = float(value)
            elif re.match(r"^[0-9]+$", value):
                self.config[key] = int(value)
            elif value in ('True', 'true'):
                self.config[key] = True
            elif value in ('False', 'false'):
                self.config[key] = False
            else:
                self.config[key] = value
        elif arr[0] == "##":
            pass
        else:
            # We just sucked in the header for the samples
            break
    # Now get the sample information
    self.config['Samples'] = {}
    while True:
        line = infile.readline()
        if not line:
            break
        line = line.strip()
        # Blank line or comment
        if line == "" or line[0] == '#':
            continue
        arr = line.split()
        if len(arr) != 3:
            raise exceptions.FatalError(
                "Error, sample description entry is not properly "
                "formatted! Offending entry: %s" % line)
        sample_id = arr[0].strip()
        filename = arr[1].strip()
        filetype = arr[2].strip()
        if sample_id not in self.config['Samples']:
            self.config['Samples'][sample_id] = {}
        if filetype in self.config['Samples'][sample_id]:
            raise exceptions.FatalError(
                "Error, same FileType specified more than once "
                "for sample_id %s." % sample_id)
        if not os.path.exists(filename):
            raise exceptions.FatalError(
                "%s file indicated but not found: %s" % (filetype, filename))
        else:
            self.config['Samples'][sample_id][filetype] = os.path.realpath(filename)
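# Illustrative ARC_config.txt matching what read() parses (file names and values
# here are hypothetical, and other option keys may exist). Lines whose first token
# is '#' are name=value globals, '##' lines are comments, the first other line is
# treated as the sample-table header, and each sample row has exactly three
# whitespace-separated columns: Sample_ID, FileName, FileType (PE1/PE2/SE).
#
#   ## Global settings
#   # reference=reference/targets.fasta
#   # numcycles=3
#   # mapper=bowtie2
#   # assembler=spades
#   # format=fastq
#   # subsample=1
#   Sample_ID  FileName                 FileType
#   sample1    reads/sample1_R1.fastq   PE1
#   sample1    reads/sample1_R2.fastq   PE2
#   sample1    reads/sample1_SE.fastq   SE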
def RunNewbler(self):
    #Code for running newbler
    """
    Expects params keys:
        PE1 and PE2 and/or SE
        target_dir
        -urt
    """
    #Check for necessary params:
    if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params)
            or 'assembly_SE' in self.params):
        raise exceptions.FatalError('Missing self.params in RunNewbler.')
    #Check for necessary files:
    if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (
            os.path.exists(self.params['assembly_PE1'])
            and os.path.exists(self.params['assembly_PE2'])):
        raise exceptions.FatalError('Missing PE files in RunNewbler.')
    if 'assembly_SE' in self.params and not os.path.exists(self.params['assembly_SE']):
        raise exceptions.FatalError('Missing SE file in RunNewbler.')
    sample = self.params['sample']
    target = self.params['target']
    killed = False
    failed = False

    #Determine whether to pipe output to a file or /dev/null
    if self.params['verbose']:
        out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w')
    else:
        out = open(os.devnull, 'w')

    #Build args for newAssembly:
    args = ['newAssembly', '-force']
    if self.params['last_assembly'] and self.params['cdna']:
        #only run with the cdna switch on the final assembly
        args += ['-cdna']
    args += [os.path.join(self.params['target_dir'], 'assembly')]
    logger.debug("Calling newAssembly for sample: %s target %s" % (sample, target))
    logger.info(" ".join(args))
    ret = subprocess.call(args, stdout=out, stderr=out)

    #Build args for addRun:
    if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
        args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
        args += [self.params['assembly_PE1']]
        logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)

        args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
        args += [self.params['assembly_PE2']]
        logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)
    if 'assembly_SE' in self.params:
        args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
        args += [self.params['assembly_SE']]
        logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)

    #Build args for runProject
    args = ['runProject']
    args += ['-cpu', '1']
    if self.params['last_assembly'] and self.params['cdna']:
        args += ['-noace']
    else:
        args += ['-nobig']
    if self.params['urt'] and not self.params['last_assembly']:
        #only run with the -urt switch when it isn't the final assembly
        args += ['-urt']
    if self.params['rip']:
        args += ['-rip']
    args += [os.path.join(self.params['target_dir'], 'assembly')]
    try:
        start = time.time()
        logger.debug("Calling runProject for sample: %s target %s" % (sample, target))
        logger.debug(" ".join(args))
        ret = subprocess.Popen(args, stdout=out, stderr=out)
        pid = ret.pid
        while ret.poll() is None:
            if time.time() - start > self.params['assemblytimeout']:
                self.kill_process_children(pid)
                logger.warn(
                    "Sample: %s target: %s iteration: %s Killing assembly after %s seconds"
                    % (sample, target, self.params['iteration'], time.time() - start))
                killed = True
                break
            time.sleep(.5)
    except Exception as exc:
        txt = "Sample: %s, Target: %s: Unhandled error running Newbler assembly" % (
            self.params['sample'], self.params['target'])
        txt += '\n\t' + str(exc) + "\n\t" + traceback.format_exc()
        logger.warn(txt)
        failed = True
    finally:
        out.close()

    #Sometimes newbler doesn't seem to exit completely:
    self.kill_process_children(pid)

    #if ret != 0:
    #    raise exceptions.RerunnableError("Newbler assembly failed.")
    if not killed and ret.poll() != 0:
        #raise exceptions.RerunnableError("Newbler assembly failed.")
        failed = True
    if failed:
        logger.info(
            "Sample: %s target: %s iteration: %s Assembly failed after %s seconds"
            % (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_failed\t" + str(time.time() - start))
        outf.close()
    elif killed:
        logger.info(
            "Sample: %s target: %s iteration: %s Assembly killed after %s seconds"
            % (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_killed\t" + str(time.time() - start))
        outf.close()
    else:
        #Run finished without error
        logger.info(
            "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
            % (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_complete\t" + str(time.time() - start))
        outf.close()
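# Roughly equivalent shell command sequence for the Newbler calls above
# (illustrative only; paths are hypothetical and the optional flags depend on
# the cdna/urt/rip/last_assembly settings):
#
#   newAssembly -force [-cdna] <target_dir>/assembly
#   addRun <target_dir>/assembly <assembly_PE1>
#   addRun <target_dir>/assembly <assembly_PE2>
#   addRun <target_dir>/assembly <assembly_SE>
#   runProject -cpu 1 [-noace|-nobig] [-urt] [-rip] <target_dir>/assembly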
def RunSpades(self):
    """
    Several arguments can be passed to spades.py:
        -1 [PE1], -2 [PE2], -s [SE], and -o [target_dir]
    """
    #Check that required params are available
    if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params)
            or ('assembly_SE' in self.params)):
        raise exceptions.FatalError('Missing self.params in RunSpades.')
    #Check that the files actually exist
    if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (
            os.path.exists(self.params['assembly_PE1'])
            and os.path.exists(self.params['assembly_PE2'])):
        raise exceptions.FatalError('Missing PE files in RunSpades.')
    if 'assembly_SE' in self.params and not os.path.exists(self.params['assembly_SE']):
        raise exceptions.FatalError('Missing SE file in RunSpades.')
    sample = self.params['sample']
    target = self.params['target']

    #Build args for assembler call
    args = ['spades.py', '-t', '1']
    if self.params['only-assembler'] and not self.params['last_assembly']:
        args.append("--only-assembler")
    if self.params['format'] == 'fasta':
        # spades errors on read correction if the input isn't fastq
        args.append('--only-assembler')
    if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
        args += ['-1', self.params['assembly_PE1'],
                 '-2', self.params['assembly_PE2']]
    if 'assembly_SE' in self.params:
        args += ['-s', self.params['assembly_SE']]
    args += ['-o', os.path.join(self.params['target_dir'], 'assembly')]
    if self.params['verbose']:
        out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w')
    else:
        out = open(os.devnull, 'w')
    logger.debug("Sample: %s target: %s Running spades assembler." % (sample, target))
    logger.info(" ".join(args))
    killed = False
    failed = False
    start = time.time()
    try:
        #ret = subprocess.call(args, stderr=out, stdout=out)
        ret = subprocess.Popen(args, stdout=out, stderr=out)
        pid = ret.pid
        while ret.poll() is None:
            if time.time() - start > self.params['assemblytimeout']:
                ret.kill()
                killed = True
                logger.warn(
                    "Sample: %s target: %s Assembly killed after %s seconds."
                    % (sample, target, time.time() - start))
                break
            time.sleep(.5)
    except Exception as exc:
        txt = "Sample: %s, Target: %s: Unhandled error running Spades assembly" % (
            sample, target)
        txt += '\n\t' + str(exc)
        logger.warn(txt)
        failed = True
    finally:
        out.close()

    #Ensure that the assembler exits cleanly:
    self.kill_process_children(pid)

    if not killed and ret.poll() != 0:
        failed = True
    if failed:
        logger.info(
            "Sample: %s target: %s iteration: %s Assembly failed after %s seconds"
            % (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_failed")
        outf.close()
    elif killed:
        logger.info(
            "Sample: %s target: %s iteration: %s Assembly killed after %s seconds"
            % (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_killed")
        outf.close()
    else:
        #Run finished without error
        logger.info(
            "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
            % (sample, target, self.params['iteration'], time.time() - start))
        outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
        outf.write("assembly_complete")
        outf.close()
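# Roughly equivalent shell command for the SPAdes call above (illustrative only;
# paths are hypothetical and --only-assembler is added per the settings above):
#
#   spades.py -t 1 [--only-assembler] \
#       -1 <target_dir>/PE1.fastq -2 <target_dir>/PE2.fastq [-s <target_dir>/SE.fastq] \
#       -o <target_dir>/assembly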
def writeCDNAresults(self, target, target_folder, outf, contigf):
    """
    This is ONLY called when a cDNA target is finished.

    When doing a cDNA type run, it is very useful to have both of the following:
    1) All contigs that belong to a gene (isogroup)
        - It would be particularly good to re-orient these if they are in RC.
    2) Total number of reads assembled in each gene (isogroup)

    Additionally it would be excellent to some day also get the following:
    3) Transcript (isotig) structure
    4) Estimate of isotig specific reads.
    """
    if self.params['assembler'] == 'newbler':
        contigf = os.path.join(self.params['working_dir'], target_folder,
                               "assembly", "assembly", "454AllContigs.fna")
        isotigsf = os.path.join(self.params['working_dir'], target_folder,
                                "assembly", "assembly", "454IsotigsLayout.txt")
        readstatusf = os.path.join(self.params['working_dir'], target_folder,
                                   "assembly", "assembly", "454ReadStatus.txt")
    else:
        logger.info("WARNING writeCDNAresults called when assembler was not Newbler")
        return None
    if not (os.path.exists(contigf) and os.path.exists(isotigsf)
            and os.path.exists(readstatusf)):
        logger.info("CDNA WARNING MISSING FILE!! %s %s" % (target, self.params['sample']))
        logger.info("%s exists: %s" % (contigf, os.path.exists(contigf)))
        logger.info("%s exists: %s" % (isotigsf, os.path.exists(isotigsf)))
        logger.info("%s exists: %s" % (readstatusf, os.path.exists(readstatusf)))
        return None

    #Storage data structures:
    isogroups = {}  # A dict of isogroups which each contain an in-order list of contigs
    readcounts = Counter()  # A dict of all contigs, these contain read counts (from ReadStatus)
    contig_orientation = {}
    contig_to_isogroup = {}
    contig_idx = SeqIO.index(contigf, "fasta")

    #Parse isotigsf:
    igroup = ""
    #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf
    for l in open(isotigsf, 'r'):
        #Handle lines with only a '\n'
        if l == '\n':
            pass
        #Handle lines for isogroup:
        elif l[0:9] == '>isogroup':
            igroup = l.strip().split()[0].strip(">")
        #Handle lines containing all contigs:
        elif l.strip().split()[0] == 'Contig':
            l2 = l.strip().split()
            contigs = map(lambda x: "contig" + x, l2[2:-1])
            isogroups[igroup] = contigs
            for contig in contigs:
                if contig not in contig_orientation:
                    contig_orientation[contig] = '+'
                    contig_to_isogroup[contig] = igroup
                else:
                    raise exceptions.FatalError(
                        'Contig %s in %s more than once' % (contig, contigf))
        #Handle lines containing contig orientation info:
        elif l[0:6] == 'isotig':
            l2 = l[l.find(" ") + 1:l.rfind(" ") - 1]
            l3 = [l2[i:i + 6] for i in range(0, len(l2), 6)]
            for i in range(len(l3)):
                if l3[i][0] == '<':
                    # contig is in reverse orientation
                    contig = isogroups[igroup][i]
                    contig_orientation[contig] = '-'
    #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation)

    #Now parse readstatus:
    inf = open(readstatusf, 'r')
    inf.readline()  # discard the header line
    for l in inf:
        l2 = l.strip().split('\t')
        #Determine whether this read was assembled
        if len(l2) == 8:
            contig = l2[2]
            # Note that there are some built in limits to the number of contigs that can be in an isogroup:
            # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/
            # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file.
            if contig in contig_to_isogroup:
                readcounts[contig_to_isogroup[contig]] += 1
            else:
                readcounts['ExceedsThreshold'] += 1
    #print self.params['sample'], target, "Parse read status"

    #Finally, output all of this information appropriately:
    countsf = open(
        os.path.join(self.params['finished_dir'], "isogroup_read_counts.tsv"), 'a')
    sample = self.params['sample']
    #First write out readcounts: sample \t target \t isogroup \t readcount
    for isogroup in readcounts:
        countsf.write('\t'.join(
            [sample, target, isogroup, str(readcounts[isogroup])]) + '\n')
    countsf.close()
    #print self.params['sample'], target, "Wrote readcounts"

    #Next write the contigs in proper order and orientation:
    ncontigs = 0
    nisogroups = 0
    for isogroup in isogroups:
        nisogroups += 1
        for contig in isogroups[isogroup]:
            ncontigs += 1
            seqrec = contig_idx[contig]
            #print self.params['sample'], target, seqrec
            if contig_orientation[contig] == '-':
                seqrec.seq = seqrec.seq.reverse_complement()
            #print self.params['sample'], target, seqrec
            seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig
            #print self.params['sample'], target, seqrec
            SeqIO.write(seqrec, outf, "fasta")
    ## TODO: add support for the ExceedsThreshold contigs
    logger.info(
        "Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups"
        % (self.params['sample'], target, self.params['iteration'], ncontigs, nisogroups))
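# The 454IsotigsLayout.txt parsing above expects, per isogroup, roughly this
# structure (a schematic only; real column widths, counts, and extra rows such
# as length information vary and are ignored by the parser):
#
#   >isogroup00001  ...
#      Contig : 00001  00002  00003  ...
#   isotig00001  >>>>>>  <<<<<<  >>>>>>  ...
#
# i.e. a '>isogroup' header, a 'Contig' row whose middle tokens are bare contig
# numbers (stored with a "contig" prefix), and one 'isotig' row per transcript
# made of fixed six-character fields, where a leading '<' marks a contig used
# in reverse-complement orientation.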
def run_bowtie2(self):
    """
    Builds idx and runs bowtie2 -I 0 -X 1500 --local

    Expects params:
        sample, target, reference, working_dir, PE1 and PE2 and/or SE
    """
    #Check for necessary params:
    if not ('sample' in self.params and 'reference' in self.params
            and 'working_dir' in self.params
            and (('PE1' in self.params and 'PE2' in self.params)
                 or 'SE' in self.params)):
        raise exceptions.FatalError('Missing params in run_bowtie2.')
    #Check for necessary files:
    if os.path.exists(self.params['reference']) is False:
        raise exceptions.FatalError("Missing reference file for mapping")
    if 'PE1' in self.params and 'PE2' in self.params:
        if not (os.path.exists(self.params['PE1'])
                and os.path.exists(self.params['PE2'])):
            raise exceptions.FatalError(
                "One or both PE files can not be found for mapping.")
    if 'SE' in self.params:
        if not os.path.exists(self.params['SE']):
            raise exceptions.FatalError("SE file cannot be found.")

    #Make idx directory
    try:
        working_dir = self.params['working_dir']
        idx_dir = os.path.realpath(os.path.join(working_dir, 'idx'))
        os.mkdir(idx_dir)
    except Exception as exc:
        txt = "Sample: %s Error creating working directory." % (
            self.params['sample']) + '\n\t' + str(exc)
        raise exceptions.FatalError(txt)

    #Check whether to log to a temporary file, or default to os.devnull
    if 'verbose' in self.params:
        out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
    else:
        out = open(os.devnull, 'w')

    #Set up a path to the index
    base = os.path.join(idx_dir, 'idx')

    #Build index.
    #The idea is to map against the finished contigs and in-progress
    # contigs, thereby ensuring that the -k parameter (or best map)
    # is respected properly, and avoiding the situation where reads which
    # were mapped to a now finished target might later be mapped to an
    # in-progress target.
    fin_outf = os.path.join(self.params['finished_dir'], 'contigs.fasta')
    args = ['bowtie2-build', '-f']
    if os.path.exists(fin_outf) and os.path.getsize(fin_outf) > 0:
        args.append(','.join((fin_outf, self.params['reference'])))
    else:
        args.append(self.params['reference'])
    args.append(base)
    logger.info("Sample: %s Calling bowtie2-build." % self.params['sample'])
    logger.info(" ".join(args))
    try:
        ret = subprocess.call(args, stdout=out, stderr=out)
    except Exception as exc:
        txt = ("Sample %s: Unhandled error running bowtie2-build" %
               self.params['sample']) + '\n\t' + str(exc)
        # make sure that out is closed before throwing the exception
        out.close()
        raise exceptions.FatalError(txt)
    if ret != 0:
        out.close()
        raise exceptions.FatalError(
            "Sample: %s Error creating bowtie2 index, check log file." %
            self.params['sample'])

    #Do bowtie2 mapping:
    n_bowtieprocs = int(round(max(
        float(self.params['nprocs']) / len(self.params['Samples']), 1)))
    args = ['bowtie2', '-I', '0', '-X', '1500', '--no-unal']
    #Tune the sensitivity so that on the first iteration the mapper is
    # very sensitive. On later iterations the mapper is very specific.
    if self.params['iteration'] == 0 and self.params['sloppymapping']:
        args.append("--very-sensitive-local")
    else:
        args += ["--very-fast-local", "--mp", "12", "--rdg", "12,6",
                 "--rfg", "12,6"]
    args += ['-p', str(n_bowtieprocs), '-x', base]
    if self.params['bowtie2_k'] > 1:
        args += ['-k', str(self.params['bowtie2_k'])]
    if self.params['format'] == 'fasta':
        args += ['-f']
    if 'PE1' in self.params and 'PE2' in self.params:
        args += ['-1', self.params['PE1'], '-2', self.params['PE2']]
    if 'SE' in self.params:
        args += ['-U', self.params['SE']]
    args += ['-S', os.path.join(working_dir, 'mapping.sam')]
    logger.info("Sample: %s Calling bowtie2 mapper" % self.params['sample'])
    logger.info(" ".join(args))
    try:
        ret = subprocess.call(args, stdout=out, stderr=out)
    except Exception as exc:
        txt = ("Sample %s: Unhandled error running bowtie2 mapping" %
               self.params['sample']) + '\n\t' + str(exc)
        raise exceptions.FatalError(txt)
    finally:
        out.close()
    if ret != 0:
        raise exceptions.FatalError(
            "Sample %s: Bowtie2 mapping returned an error, check log file." %
            self.params['sample'])

    #Extract the SAM to a dict
    self.params['mapping_dict'] = self.SAM_to_dict(
        os.path.join(working_dir, 'mapping.sam'))
    #Clean up intermediary files:
    os.remove(os.path.join(working_dir, 'mapping.sam'))
    os.system("rm -rf %s" % idx_dir)
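# Roughly equivalent shell commands for the bowtie2 calls above (illustrative
# only; paths are hypothetical, and the sensitivity flags depend on the
# iteration and the 'sloppymapping' setting):
#
#   bowtie2-build -f <finished_dir>/contigs.fasta,<reference> <working_dir>/idx/idx
#   bowtie2 -I 0 -X 1500 --no-unal --very-sensitive-local -p <n_bowtieprocs> \
#       -x <working_dir>/idx/idx -1 <PE1> -2 <PE2> [-U <SE>] \
#       -S <working_dir>/mapping.sam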
def splitreads(self):
    """ Split reads and then kick off assemblies once the reads are split
        for a target, use safe_targets for names """
    self.params['iteration'] += 1

    #Write out statistics for any/all targets which failed to recruit reads:
    for target in self.params['summary_stats'].keys():
        # print "Target", target
        if target not in self.params['mapping_dict']:
            writeTargetStats(
                finished_dir=self.params['finished_dir'],
                sample=self.params['sample'],
                target=target,
                targetLength=self.params['summary_stats'][target]['targetLength'],
                status='NoReads',
                iteration=self.params['iteration'],
                readcount=0,
                num_contigs=0,
                contig_length=0)
            del self.params['summary_stats'][target]

    checker_params = {}
    for k in self.params:
        checker_params[k] = self.params[k]
    del checker_params['mapping_dict']
    checker_params['targets'] = {}
    iteration = self.params['iteration']

    #Open previously created indexes:
    if 'PE1' in self.params and 'PE2' in self.params:
        idx_PE1 = SeqIO.index_db(
            os.path.join(self.params['working_dir'], "PE1.idx"),
            key_function=keyfunction(self.params['sra']))
        idx_PE2 = SeqIO.index_db(
            os.path.join(self.params['working_dir'], "PE2.idx"),
            key_function=keyfunction(self.params['sra']))
    if 'SE' in self.params:
        idx_SE = SeqIO.index_db(
            os.path.join(self.params['working_dir'], "SE.idx"),
            key_function=keyfunction(self.params['sra']))
    if 'readcounts' not in checker_params:
        checker_params['readcounts'] = {}
    # if 'contigcounts' not in checker_params:
    #     checker_params['contigcounts'] = {}
    statsf = open(os.path.join(self.params['finished_dir'], 'mapping_stats.tsv'), 'a')
    for target in self.params['mapping_dict']:
        startT = time.time()
        # logger.info("Running splitreads for Sample: %s target: %s" % (self.params['sample'], target))
        target_dir = os.path.join(self.params['working_dir'],
                                  self.params['safe_targets'][target])
        if target not in checker_params['readcounts']:
            checker_params['readcounts'][target] = Counter()
        # if target not in checker_params['contigcounts']:
        #     checker_params['contigcounts'] = Counter()
        if os.path.exists(target_dir):
            os.system("rm -rf %s" % target_dir)
        os.mkdir(target_dir)

        reads = self.params['mapping_dict'][target]
        #Track how many total reads were added for this cycle
        checker_params['readcounts'][target][iteration] = len(reads)
        statsf.write('\t'.join(
            [self.params['sample'], target, str(iteration), str(len(reads))]) + '\n')

        SEs = PEs = 0
        if 'PE1' in self.params and 'PE2' in self.params:
            outf_PE1 = open(os.path.join(target_dir, "PE1." + self.params['format']), 'w')
            outf_PE2 = open(os.path.join(target_dir, "PE2." + self.params['format']), 'w')
        if 'SE' in self.params:
            outf_SE = open(os.path.join(target_dir, "SE." + self.params['format']), 'w')

        for readID in reads:
            if self.params['subsample'] < 1 and randint(0, 100) > self.params['subsample'] * 100:
                continue
            if 'PE1' in self.params and readID in idx_PE1:
                # read1 = idx_PE1[readID]
                # read2 = idx_PE2[readID]
                read1 = idx_PE1.get(readID, None)
                read2 = idx_PE2.get(readID, None)
                if read2 is None:
                    raise exceptions.FatalError(
                        "ERROR: ReadID %s was found in PE1 file but not PE2" % readID)
                new_readID = readID.replace(":", "_") + ":0:0:0:0#0/"
                read1.id = read1.name = new_readID + "1"
                read2.id = read2.name = new_readID + "2"
                SeqIO.write(read1, outf_PE1, self.params['format'])
                SeqIO.write(read2, outf_PE2, self.params['format'])
                PEs += 1
            elif 'SE' in self.params and readID in idx_SE:
                read1 = idx_SE[readID]
                read1.id = read1.name = readID.replace(":", "_") + ":0:0:0:0#0/"
                SeqIO.write(read1, outf_SE, self.params['format'])
                SEs += 1
        if 'PE1' in self.params and 'PE2' in self.params:
            outf_PE1.close()
            outf_PE2.close()
        if 'SE' in self.params:
            outf_SE.close()

        #Build assembly job:
        assembly_params = {}
        assembly_params['target'] = target
        assembly_params['target_dir'] = target_dir
        assembly_params['iteration'] = iteration
        assembly_params['last_assembly'] = False
        assembler_keys = [
            'assembler', 'sample', 'verbose', 'format', 'assemblytimeout',
            'map_against_reads', 'urt', 'numcycles', 'cdna', 'rip',
            'only-assembler'
        ]
        for k in assembler_keys:
            assembly_params[k] = self.params[k]
        # note that readcounts is a Counter, so no key errors can occur
        cur_reads = checker_params['readcounts'][target][iteration]
        previous_reads = checker_params['readcounts'][target][iteration - 1]
        #Turn off URT in situations where this will be the last iteration due to readcounts:
        if (cur_reads <= previous_reads and iteration > 2) or iteration >= self.params['numcycles']:
            logger.info(
                "Sample: %s target: %s iteration: %s Setting last_assembly to True"
                % (self.params['sample'], target, self.params['iteration']))
            assembly_params['last_assembly'] = True

        #Properly handle the case where no reads ended up mapping for the PE or SE inputs:
        if PEs > 0:
            assembly_params['assembly_PE1'] = os.path.join(
                target_dir, "PE1." + self.params['format'])
            assembly_params['assembly_PE2'] = os.path.join(
                target_dir, "PE2." + self.params['format'])
        if SEs > 0:
            assembly_params['assembly_SE'] = os.path.join(
                target_dir, "SE." + self.params['format'])

        #All reads have been written at this point, add an assembly to the queue:
        logger.info(
            "Sample: %s target: %s iteration: %s Split %s reads in %s seconds"
            % (self.params['sample'], target, self.params['iteration'],
               len(reads), time.time() - startT))

        #Only add an assembly job and AssemblyChecker target if there are >0 reads:
        if PEs + SEs > 0:
            checker_params['targets'][target_dir] = False
            self.submit(Assembler.to_job(assembly_params))

    statsf.close()

    logger.info("------------------------------------")
    logger.info("| Sample: %s Iteration %s of numcycles %s" % (
        checker_params['sample'], checker_params['iteration'],
        checker_params['numcycles']))
    logger.info("------------------------------------")

    if 'PE1' in self.params and 'PE2' in self.params:
        idx_PE1.close()
        idx_PE2.close()
        del idx_PE1
        del idx_PE2
    if 'SE' in self.params:
        idx_SE.close()
        del idx_SE

    #Kick off a job which checks if all assemblies are done, and if not adds a copy of itself to the job queue
    if len(checker_params['targets']) > 0:
        # checker = AssemblyChecker(checker_params)
        self.submit(AssemblyChecker.to_job(checker_params))
    else:
        logger.info("Sample: %s No reads mapped, no more work to do." %
                    checker_params['sample'])
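# Illustrative sketch of what one pass over a single target produces (names and
# paths are hypothetical; assumes FASTQ paired-end input):
#
#   mapping_dict = {'targetA': {'read_0001': 1, 'read_0007': 1, ...}, ...}
#
#   <working_dir>/<safe_target>/PE1.fastq   recruited reads for targetA, mate 1
#   <working_dir>/<safe_target>/PE2.fastq   recruited reads for targetA, mate 2
#   <working_dir>/<safe_target>/SE.fastq    recruited single-end reads (if any)
#
# plus one Assembler job whose params include 'assembly_PE1'/'assembly_PE2'
# (and/or 'assembly_SE') pointing at those files, with 'last_assembly' set
# when read counts stop growing or numcycles is reached, and finally one
# AssemblyChecker job that waits for all submitted assemblies to finish.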
def run_blat(self):
    #Check for necessary params:
    if not ('sample' in self.params and 'reference' in self.params
            and 'working_dir' in self.params
            and (('PE1' in self.params and 'PE2' in self.params)
                 or 'SE' in self.params)):
        raise exceptions.FatalError('Missing self.params in run_blat.')
    #Check for necessary files:
    if os.path.exists(self.params['reference']) is False:
        raise exceptions.FatalError("Missing reference file for mapping")
    if 'PE1' in self.params and 'PE2' in self.params:
        if not (os.path.exists(self.params['PE1'])
                and os.path.exists(self.params['PE2'])):
            raise exceptions.FatalError(
                "One or both PE files can not be found for mapping.")
    if 'SE' in self.params:
        if not os.path.exists(self.params['SE']):
            raise exceptions.FatalError("SE file cannot be found.")

    #Blat doesn't need an index
    working_dir = self.params['working_dir']

    #Check whether to log to a temporary file, or default to os.devnull
    if 'verbose' in self.params:
        out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
    else:
        out = open(os.devnull, 'w')

    #Build a temporary txt file with all of the reads:
    allreads_outf = open(os.path.join(working_dir, 'reads.txt'), 'w')
    if 'PE1' in self.params and 'PE2' in self.params:
        allreads_outf.write(self.params['PE1'] + '\n')
        allreads_outf.write(self.params['PE2'] + '\n')
    if 'SE' in self.params:
        allreads_outf.write(self.params['SE'] + '\n')
    allreads_outf.close()

    #Do blat mapping
    args = ['blat', self.params['reference'],
            os.path.join(working_dir, 'reads.txt')]
    if self.params['format'] == 'fastq':
        args.append('-fastq')
    if self.params['fastmap']:
        args.append('-fastMap')
    #Some new experimental params to increase specificity after the first iteration:
    if self.params['maskrepeats']:
        args.append("-mask=lower")
    if self.params['iteration'] > 0 or not self.params['sloppymapping']:
        args.append("-minIdentity=98")
        args.append("-minScore=40")
    args.append(os.path.join(working_dir, 'mapping.psl'))

    logger.info("Sample: %s Calling blat mapper" % self.params['sample'])
    logger.debug(" ".join(args))
    try:
        ret = subprocess.call(args, stdout=out, stderr=out)
    except Exception as exc:
        txt = ("Sample %s: Unhandled error running blat mapping, check log file."
               % self.params['sample']) + '\n\t' + str(exc)
        raise exceptions.FatalError(txt)
    finally:
        out.close()
    if ret != 0:
        raise exceptions.FatalError(
            'Sample: %s Error running blat mapping, check log file. \n\t %s'
            % (self.params['sample'], " ".join(args)))

    #Extract the PSL to a dict
    self.params['mapping_dict'] = self.PSL_to_dict(
        os.path.join(working_dir, 'mapping.psl'))

    #Cleanup
    os.remove(os.path.join(working_dir, 'mapping.psl'))
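# Roughly equivalent shell command for the blat call above (illustrative only;
# paths are hypothetical, and the optional flags depend on format, fastmap,
# maskrepeats, sloppymapping, and the iteration):
#
#   blat <reference> <working_dir>/reads.txt [-fastq] [-fastMap] \
#       [-mask=lower] [-minIdentity=98 -minScore=40] <working_dir>/mapping.psl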