def main(): with Timer('100 status updates'): for i in xrange(100): grace.status(str(i)) grace.status('') with Timer('100 parallel processes'): @parallel_for(xrange(100)) def loop(i): pass with Timer('100 parallel processes updating statuses'): @parallel_for(xrange(100)) def loop(i): grace.status(str(i)) grace.status('') with Timer('Nested processes'): @parallel_for(xrange(3)) def _(item): print item @parallel_for(xrange(3)) def _(item2): print item, item2
def make_plot(self, plots_name, plot_names, iterator, maximum, color='0,0,0', scale_type='log', windowing='maximum'):
    """Write an .igv track file and optionally start converting it to .tdf.

    The file is written to self.prefix + plots_name + '.igv'. It holds a
    '#track' line, a tab-separated column header, and one row per item
    produced by `iterator`.

    Parameters:
        plots_name  -- suffix used for the output filename and status text.
        plot_names  -- column names; each is prefixed with self.label_prefix.
        iterator    -- yields (name, pos, depths); each becomes the row
                       name, pos, pos+1, 'F', followed by the depths.
        maximum     -- upper bound written into viewLimits.
        color, scale_type, windowing -- values copied into the '#track' line.

    If self.genome is set, any previously started igvtools processes are
    waited for, then 'igvtools toTDF' is started on the new file and the
    (process, filename) pair is appended to self.processes.
    """
    grace.status('Write ' + plots_name)
    filename = self.prefix + plots_name + '.igv'

    # Guard against an empty plot list, which would otherwise divide by zero.
    height = max(10, int(100.0 / math.sqrt(max(1, len(plot_names)))))

    track_fields = {
        'maximum': maximum,
        'scale_type': scale_type,
        'windowing': windowing,
        'height': height,
        'color': color,
    }

    try:
        # The context manager closes the file even if the iterator raises.
        with open(filename, 'wb') as f:
            print >> f, '#track viewLimits=0:%(maximum)f autoScale=off scaleType=%(scale_type)s windowingFunction=%(windowing)s maxHeightPixels=200:%(height)d:1 color=%(color)s' % track_fields
            print >> f, '\t'.join(
                ['Chromosome', 'Start', 'End', 'Feature'] +
                [self.label_prefix + item for item in plot_names]
            )
            for name, pos, depths in iterator:
                print >> f, '\t'.join(
                    [name, str(pos), str(pos + 1), 'F'] +
                    [str(item) for item in depths]
                )
    finally:
        grace.status('')

    if self.genome:
        # One igvtools process at a time
        self.wait_for_igv()
        p = io.run([
            'igvtools', 'toTDF',
            filename,
            self.prefix + plots_name + '.tdf',
            self.genome,
            '-f', 'max,mean'
        ], stdin=None, stdout=None)
        self.processes.append((p, filename))
def unmill(rast, mill_points):
    """Return `rast` raised by shifted, lowered copies of itself.

    The result starts as a copy of `rast`. For every (ox, oy, oheight) in
    `mill_points`, result[y, x] is raised to at least
    rast[y - oy, x - ox] - oheight. Cells outside `rast` are taken from a
    zero-filled border.

    Parameters:
        rast        -- 2-D numpy array.
        mill_points -- sequence of (x offset, y offset, height) triples.

    Returns a new array with the shape and dtype of `rast`.
    """
    # Sorted by y offset, as in the original, so consecutive reads touch
    # nearby rows of the padded array.
    mill_points = sorted(mill_points, key=lambda item: item[1])

    result = rast.copy()
    if not mill_points:
        # Nothing to apply; max() of an empty sequence would raise.
        return result

    # Pad by the largest offset magnitude so negative offsets larger than
    # the largest positive offset still index inside the padded array.
    padx = max(abs(item[0]) for item in mill_points)
    pady = max(abs(item[1]) for item in mill_points)

    sy, sx = rast.shape
    padded = numpy.zeros((sy + pady * 2, sx + padx * 2), rast.dtype)
    padded[pady:pady + sy, padx:padx + sx] = rast

    try:
        for y in xrange(sy):
            row = result[y]  # view into result, updated in place below
            grace.status('%d' % (sy - y))
            for ox, oy, oheight in mill_points:
                numpy.maximum(
                    row,
                    padded[pady + y - oy, padx - ox:padx + sx - ox] - oheight,
                    row)
    finally:
        grace.status('')

    return result
def reader(working_dirs, references, use_reference, annotations=None):
    """Yield a Calls object for each position of each reference sequence.

    Parameters:
        working_dirs  -- directories passed one at a time to evidence_reader.
        references    -- iterable of (name, sequence) pairs.
        use_reference -- when true, reference_reader(sequence) is used as
                         the first per-position reader.
        annotations   -- optional mapping consulted with .get(sequence, []);
                         defaults to no annotations.
                         NOTE(review): the lookup key is the sequence itself
                         rather than its name -- confirm against callers.

    For each position i, two Calls are yielded, first with
    is_insertion=True and then False. Each carries the next item from every
    reader and the features whose nofuzzy_start <= i < nofuzzy_end. The
    feature walk assumes features are ordered by start -- TODO confirm.

    Raises grace.Error if any reader still has data once the sequence has
    been consumed.
    """
    if annotations is None:
        annotations = {}

    for name, sequence in references:
        features = annotations.get(sequence, [])

        if use_reference:
            readers = [reference_reader(sequence)]
        else:
            readers = []
        readers.extend(evidence_reader(working_dir, name) for working_dir in working_dirs)

        active_features = []
        feature_pos = 0

        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status('%s %s' % (name, grace.pretty_number(i)))

            # Drop features that ended at or before i, then admit those that
            # start at or before i.
            active_features = [item for item in active_features if item.location.nofuzzy_end > i]
            while feature_pos < len(features) and \
                  features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1

            for is_insertion in (True, False):
                yield Calls(name, i, is_insertion, [item.next() for item in readers], active_features)

        # Named so the loop variable does not shadow this function.
        for evidence in readers:
            for item in evidence:
                raise grace.Error('Unexpected extra data in evidence file')

    grace.status('')
def guess_quality_offset(*filenames):
    """Guess the quality-score ASCII offset (33 or 64) of sequence files.

    Up to the first 100001 records of each file are examined. Records that
    carry no quality string (2-tuples) and records with an empty quality
    string are ignored. Returns 33 when no usable record is found or when
    the lowest quality character is below 59, otherwise 64.
    """
    grace.status('Guessing quality offset')
    try:
        min_value = chr(255)
        any_reads = False
        for filename in filenames:
            for i, item in enumerate(read_sequences(filename, qualities=True)):
                if i > 100000:
                    break
                if len(item) == 2:
                    continue
                if not item[2]:
                    # Zero-length read: min('') would raise ValueError.
                    continue
                any_reads = True
                min_value = min(min_value, min(item[2]))

        if not any_reads:
            return 33

        low = ord(min_value)
        if low < 59:
            return 33  # Sanger and Illumina 1.8+
        return 64  # Illumina pre 1.8
    finally:
        grace.status('')
def main(): #print dir(hello) import sys print sys.modules['__main__'].__file__ a = nesoni.future(func,'a',[]) b = nesoni.future(func,'b',[]) c = nesoni.future(func,'c',[a,b]) d = nesoni.future(func,'d',[a,b]) c() d() #print legion.coordinator().get_cores() #legion.coordinator().job(hello) with Timer('100 status updates'): for i in xrange(100): grace.status(str(i)) grace.status('') with Timer('100 parallel threads'): @nesoni.thread_for(xrange(100)) def loop(i): pass with Timer('100 parallel processes'): nesoni.parallel_for(xrange(100))(do_nothing) with Timer('100 parallel processes updating statuses'): nesoni.parallel_for(xrange(100))(do_status)
def main(): #print dir(hello) import sys print sys.modules['__main__'].__file__ a = nesoni.future(func, 'a', []) b = nesoni.future(func, 'b', []) c = nesoni.future(func, 'c', [a, b]) d = nesoni.future(func, 'd', [a, b]) c() d() #print legion.coordinator().get_cores() #legion.coordinator().job(hello) with Timer('100 status updates'): for i in xrange(100): grace.status(str(i)) grace.status('') with Timer('100 parallel threads'): @nesoni.thread_for(xrange(100)) def loop(i): pass with Timer('100 parallel processes'): nesoni.parallel_for(xrange(100))(do_nothing) with Timer('100 parallel processes updating statuses'): nesoni.parallel_for(xrange(100))(do_status)
def reader(working_dirs, references, use_reference, annotations=None):
    """Generate Calls objects position by position over each reference.

    Parameters:
        working_dirs  -- each is handed to evidence_reader(working_dir, name).
        references    -- iterable of (name, sequence) pairs.
        use_reference -- if true, reference_reader(sequence) supplies the
                         first entry of every Calls item list.
        annotations   -- optional mapping read with .get(sequence, []);
                         None means no annotations.
                         NOTE(review): keyed by the sequence object, not the
                         name -- verify this is what callers provide.

    Two Calls are yielded per position (is_insertion True, then False).
    Both share the list of features active at that position, i.e. those
    with nofuzzy_start <= i < nofuzzy_end. Features are consumed in list
    order, so they are presumably sorted by start -- TODO confirm.

    Raises grace.Error when a reader yields more items than the sequence
    has positions.
    """
    # A None default avoids sharing one dict object between calls.
    if annotations is None:
        annotations = {}

    for name, sequence in references:
        features = annotations.get(sequence, [])

        if use_reference:
            readers = [reference_reader(sequence)]
        else:
            readers = []
        readers.extend(evidence_reader(working_dir, name) for working_dir in working_dirs)

        active_features = []
        feature_pos = 0

        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status("%s %s" % (name, grace.pretty_number(i)))

            active_features = [item for item in active_features if item.location.nofuzzy_end > i]
            while feature_pos < len(features) and features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1

            for is_insertion in (True, False):
                yield Calls(name, i, is_insertion, [item.next() for item in readers], active_features)

        # Any leftover item means the evidence is longer than the sequence.
        # The loop variable is not called `reader`, to avoid shadowing this
        # function.
        for source in readers:
            for item in source:
                raise grace.Error("Unexpected extra data in evidence file")

    grace.status("")
def run(self):
    """Write a uniform random sample of at most self.n sequences to stdout.

    Sequences are read from every file in self.filenames and kept with
    reservoir sampling: the first self.n fill the reservoir, and record
    number `seen` afterwards replaces a random slot with probability
    self.n / seen. A summary line goes to stderr. Records are written with
    io.write_fastq, passing the quality string when the record has one.
    """
    seqs = []
    seen = 0
    for filename in self.filenames:
        for seq in io.read_sequences(filename, qualities=True):
            seen += 1
            if seen % 100000 == 0:
                grace.status('Scanned ' + grace.pretty_number(seen))

            if len(seqs) < self.n:
                seqs.append(seq)
            elif random.random() * seen < self.n:
                # Replace with probability n/seen. The previous comparison
                # was inverted and favoured late records.
                seqs[random.randrange(self.n)] = seq
    grace.status('')

    print >> sys.stderr, 'Sampled', grace.pretty_number(
        len(seqs)), 'of', grace.pretty_number(seen), 'sequences'

    if not seqs:
        return

    # Records are (name, seq) or (name, seq, qual). Decide per record
    # instead of testing len(seqs[0]) for truthiness, which is always true.
    for record in seqs:
        if len(record) == 3:
            name, seq, qual = record
            io.write_fastq(sys.stdout, name, seq, qual)
        else:
            name, seq = record
            io.write_fastq(sys.stdout, name, seq)
def main(): with Timer("100 status updates"): for i in xrange(100): grace.status(str(i)) grace.status("") with Timer("100 parallel processes"): @parallel_for(xrange(100)) def loop(i): pass with Timer("100 parallel processes updating statuses"): @parallel_for(xrange(100)) def loop(i): grace.status(str(i)) grace.status("") with Timer("Nested processes"): @parallel_for(xrange(3)) def _(item): print item @parallel_for(xrange(3)) def _(item2): print item, item2
def unmill(rast, mill_points):
    """Raise `rast` by the maximum of its shifted, lowered copies.

    The output starts as a copy of `rast`. Each (ox, oy, oheight) triple in
    `mill_points` raises result[y, x] to at least
    rast[y - oy, x - ox] - oheight. Positions outside `rast` read as 0
    because the working array is zero-padded.

    Parameters:
        rast        -- 2-D numpy array.
        mill_points -- sequence of (x offset, y offset, height) triples.

    Returns a new array with the shape and dtype of `rast`; `rast` is not
    modified.
    """
    result = rast.copy()

    # Ordered by y offset, as before.
    points = sorted(mill_points, key=lambda item: item[1])
    if not points:
        # No offsets to apply, and max() below would fail on an empty list.
        return result

    # The border must cover the largest offset in either direction, hence
    # abs(): max() alone under-pads when negative offsets are larger.
    padx = max(abs(item[0]) for item in points)
    pady = max(abs(item[1]) for item in points)

    sy, sx = rast.shape
    padded = numpy.zeros((sy + pady * 2, sx + padx * 2), rast.dtype)
    padded[pady:pady + sy, padx:padx + sx] = rast

    try:
        for y in xrange(sy):
            row = result[y]  # in-place target for numpy.maximum
            grace.status('%d' % (sy - y))
            for ox, oy, oheight in points:
                shifted = padded[pady + y - oy, padx - ox:padx + sx - ox]
                numpy.maximum(row, shifted - oheight, row)
    finally:
        # Clear the countdown even if a row fails.
        grace.status('')

    return result
def wait_for_igv(self):
    """Wait for every pending igvtools process recorded in self.processes.

    Processes are popped from the end of self.processes. After each one
    finishes successfully, its input file is deleted when self.delete_igv
    is set.

    Raises AssertionError if a process exits with a non-zero status.
    """
    while self.processes:
        p, filename = self.processes.pop()
        grace.status('igvtools processing ' + filename)
        try:
            # Wait outside of an assert statement so the wait still happens
            # when Python runs with -O, which strips asserts.
            returncode = p.wait()
        finally:
            grace.status('')
        if returncode != 0:
            raise AssertionError('igvtools tile failed')
        if self.delete_igv:
            os.unlink(filename)
def cut_raster(self, raster):
    """Cut `raster` in successive passes of decreasing cut_z.

    The raster is first passed through unmill() with the configured bit
    (ball or end mill of radius self.res_bit_radius), and that result is
    passed through unmill() again with a ball mill. cut_z then steps down
    by self.res_cutting_depth per pass. Each pass calls self.cut_inmask on
    the cells of the unmilled raster at or below cut_z, and the loop stops
    after the first pass at or below the raster minimum.
    NOTE(review): the meaning of the cut_inmask keyword arguments is not
    visible here -- confirm against cut_inmask.
    """
    if self.bit_ball:
        bit = ball_mill(self.res_bit_radius)
    else:
        bit = end_mill(self.res_bit_radius)

    print 'Unmilling'
    inraster = unmill(raster, bit)
    print 'Again'
    inagain = unmill(inraster, ball_mill(self.res_bit_radius))
    print 'Done'

    # Score is larger where the second unmilling changed the surface less;
    # the z argument is not used.
    def point_score(x,y,z):
        return 1.0 / ((inagain[y,x]-inraster[y,x])+self.res_bit_radius)

    spin = 0.0
    cut_z = 0
    min_z = numpy.minimum.reduce(raster.flatten())
    while True:
        cut_z -= self.res_cutting_depth
        grace.status('%d %d %f' % (cut_z, min_z, spin))

        # Cells of the unmilled surface at or below this pass's depth.
        inmask = inraster <= cut_z
        self.cut_inmask(cut_z, inmask,
            #in_first =self.res_bit_radius/3.0+self.res_finishing_clearance,
            #out_first=self.res_bit_radius/3.0,
            #in_step =self.res_bit_radius,
            #out_step =self.res_bit_radius/3.0,
            in_first = self.res_finishing_clearance + self.res_cutting_step*spin,
            out_first = 0.0,
            in_step = self.res_cutting_step,
            out_step = 0.0,
            point_score=point_score)

        if self.finish:
            infinish_mask = inraster <= cut_z
            infinish_mask_lower = inraster <= (cut_z-self.res_cutting_depth)
            # First call: only the cells that are in this pass's mask but not
            # in the mask one cutting depth lower.
            self.cut_inmask(cut_z,
                infinish_mask & ~infinish_mask_lower,
                #erode(infinish_mask_lower,self.res_horizontal_step * 1.5),
                #in_first =self.res_horizontal_step*2,
                #out_first=self.res_horizontal_step,
                #in_step =self.res_horizontal_step*2,
                #out_step =self.res_horizontal_step,
                in_first =self.res_horizontal_step,
                out_first=0.0,
                in_step =self.res_horizontal_step,
                out_step =0.0,
                point_score=point_score,
                down_cut=False
            )
            # Second call: the whole mask for this depth, using cut_inmask's
            # defaults for everything except down_cut.
            self.cut_inmask(cut_z, infinish_mask, down_cut=False)

        if cut_z <= min_z:
            break

        # Step spin by GOLDEN, wrapped into [0, 1), so each pass uses a
        # different in_first offset (res_cutting_step*spin above).
        # presumably GOLDEN is a golden-ratio constant -- verify at its definition
        spin = (spin-GOLDEN)%1.0
    grace.status('')
def eat(f):
    """Forward SAM lines from `f` to sam_eater.

    Lines starting with '@' are forwarded only until sam_header_sent[0] has
    been set. Every other line is counted in n_seen[0] and forwarded.
    sam_header_sent[0] is set once `f` is exhausted.
    """
    for line in f:
        is_header = line.startswith('@')
        if is_header and sam_header_sent[0]:
            # A header has already been written from an earlier stream.
            continue
        if not is_header:
            n_seen[0] += 1
            if n_seen[0] % 100000 == 0:
                grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
        sam_eater.write_raw(line)
    sam_header_sent[0] = True
def build_shrimp_mmap(self, cs=False):
    """Run gmapper with --save on the reference FASTA.

    `cs` chooses the 'gmapper-cs' program and a 'reference-cs' object
    name; otherwise the '-ls' variants are used.
    """
    if cs:
        suffix = '-cs'
    else:
        suffix = '-ls'

    grace.status('Building SHRiMP mmap')

    # Built in argument order so the helper methods are called in the same
    # sequence as before.
    command = ['gmapper' + suffix, '--save']
    command.append(self.object_filename('reference' + suffix))
    command.append(self.reference_fasta_filename())
    io.execute(command)

    grace.status('')
def eat(process):
    """Forward SAM output of `process` to sam_eater and check its exit.

    Lines starting with '@' are forwarded only while sam_header_sent[0] is
    still unset. Every other line is counted in n_seen[0] and forwarded.
    When the stream ends the process is waited for, and
    sam_header_sent[0] is set.

    Raises AssertionError if the process exits with a non-zero status.
    """
    for line in process.stdout:
        if line.startswith('@'):
            if sam_header_sent[0]:
                continue
        else:
            n_seen[0] += 1
            if n_seen[0] % 100000 == 0:
                grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
        sam_eater.write_raw(line)

    # Wait unconditionally. Inside an assert statement the wait would be
    # skipped when Python runs with -O.
    returncode = process.wait()
    if returncode != 0:
        raise AssertionError('shrimp failed')
    sam_header_sent[0] = True
def run(self):
    """Call peaks from the spans of all input files and write them as GFF3.

    self._load_bam is mapped over self.filenames with legion.parallel_imap;
    each result is a mapping from (rname, strand) to a list of (start, end)
    spans, and these are pooled across files. For each key a depth profile
    is built and handed to self._find_spans. Every returned span that is
    still non-empty after removing self.lap from its end is written to
    self.prefix + '.gff' as a peak annotation. The peak count is recorded
    with self.log.datum.
    """
    spans = collections.defaultdict(list)
    for item in legion.parallel_imap(self._load_bam, self.filenames):
        for key, value in item.items():
            spans[key].extend(value)

    grace.status('Calling peaks')
    try:
        n = 0
        # The context manager closes the file even if peak calling raises.
        with open(self.prefix + '.gff', 'wb') as f:
            annotation.write_gff3_header(f)

            for (rname, strand), span_list in spans.items():
                if not span_list:
                    # No spans for this key; max() below would raise.
                    continue

                # Difference array: +1 at each start, -1 at each end, then a
                # running sum gives the depth at every position.
                depth = [0.0] * (1 + max(item[1] for item in span_list))
                for start, end in span_list:
                    depth[start] += 1.0
                    depth[end] -= 1.0
                for i in xrange(1, len(depth)):
                    depth[i] += depth[i - 1]

                for start, end in self._find_spans(depth):
                    if end - self.lap - start <= 0:
                        continue

                    n += 1
                    peak_id = 'peak%d' % n  # avoids shadowing builtin id()

                    ann = annotation.Annotation()
                    ann.source = 'nesoni'
                    ann.type = self.type
                    ann.seqid = rname
                    ann.start = start
                    ann.end = end - self.lap
                    ann.strand = strand
                    ann.score = None
                    ann.phase = None
                    ann.attr = {
                        'id': peak_id,
                        'color': '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                    print >> f, ann.as_gff()
                    f.flush()

        self.log.datum('-', 'called peaks', n)
    finally:
        grace.status('')
def setup(self):
    """Open every working directory and load its depths.

    Sets self.sample_names, self.workspaces, self.depths, self.any_pairs,
    self.chromosome_names, self.lengths and an empty self.processes list.
    Reference lengths are taken from the first workspace.
    """
    grace.status('Load depths')

    # Sample name = last path component of each working directory.
    names = []
    for dirname in self.working_dirs:
        names.append(os.path.basename(dirname))
    self.sample_names = names

    opened = []
    for dirname in self.working_dirs:
        opened.append(working_directory.Working(dirname, must_exist=True))
    self.workspaces = opened

    loaded = []
    for workspace in self.workspaces:
        loaded.append(workspace.get_depths())
    self.depths = loaded

    self.any_pairs = any(item.param['any_pairs'] for item in self.workspaces)

    grace.status('')

    reference = self.workspaces[0].get_reference()
    lengths = reference.get_lengths()
    self.chromosome_names = [pair[0] for pair in lengths]
    self.lengths = dict(lengths)

    self.processes = []
def read_userplot(filename):
    """Read a userplot file.

    Returns (headers, is_multiplot, data):
        headers      -- the leading lines that start with '#', unparsed.
        is_multiplot -- True when there is at least one header line.
        data         -- one tuple of ints per remaining line, split on
                        whitespace.

    Raises ValueError if a data field is not an integer.
    """
    grace.status('Load ' + filename)
    try:
        # The context manager closes the file even if reading fails.
        with open(filename, 'rb') as f:
            lines = f.readlines()

        n = 0
        while n < len(lines) and lines[n].startswith('#'):
            n += 1
        headers = lines[:n]
        is_multiplot = (n > 0)

        data = [
            tuple([int(item) for item in line.strip().split()])
            for line in lines[n:]
        ]
    finally:
        grace.status('')
    return headers, is_multiplot, data
def guess_quality_offset(filename):
    """Guess the quality-score ASCII offset (33 or 64) of a sequence file.

    Up to the first 100001 records are examined. Returns 33 as soon as a
    record without a quality string is seen (not FASTQ), and also when no
    quality character is found at all. Otherwise returns 33 if the lowest
    quality character is below 59, else 64.
    """
    grace.status('Guessing quality offset')
    try:
        min_value = None
        for i, item in enumerate(read_sequences(filename, qualities=True)):
            if len(item) == 2:
                return 33  # Not fastq

            # Skip empty quality strings: min('') would raise ValueError.
            if item[2]:
                lowest = min(item[2])
                if min_value is None or lowest < min_value:
                    min_value = lowest

            if i >= 100000:
                break

        if min_value is None:
            # Nothing to go on (empty file or only zero-length reads). Use
            # the same default as the non-FASTQ case instead of reporting 64.
            return 33

        low = ord(min_value)
        if low < 59:
            return 33  # Sanger and Illumina 1.8+
        return 64  # Illumina pre 1.8
    finally:
        grace.status('')
def run(self):
    """Set up the working directory and produce name-sorted alignments.

    The reference and the snp_cost parameter are stored in the workspace
    at self.output_dir. If self.input is not already BAM, it is piped
    through 'samtools view -S -b -' into a temporary temp.bam. The BAM is
    then sorted by name with sam.sort_bam into the 'alignments' prefix.
    The temporary file is removed afterwards, whether or not the sort
    succeeded.
    """
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost=self.snp_cost)

    bam_prefix = io.abspath(self.output_dir, "alignments")

    temp_filename = None
    try:
        if sam.is_bam(self.input):
            sort_input_filename = self.input
        else:
            temp_filename = io.abspath(self.output_dir, "temp.bam")
            sort_input_filename = temp_filename

            writer = io.Pipe_writer(temp_filename, ["samtools", "view", "-S", "-b", "-"])
            try:
                # Copy the input in 1 MiB chunks.
                with open(self.input, "rb") as f:
                    while True:
                        data = f.read(1 << 20)
                        if not data:
                            break
                        writer.write(data)
            finally:
                writer.close()

        grace.status("Sort")
        try:
            sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)
        finally:
            grace.status("")
    finally:
        # Do not leave the intermediate BAM behind when a step fails.
        if temp_filename is not None and os.path.exists(temp_filename):
            os.unlink(temp_filename)
def read_userplot(filename):
    """Load a userplot file and return (headers, is_multiplot, data).

    The run of lines at the top of the file that begin with '#' is returned
    as `headers`, with line endings intact. `is_multiplot` reports whether
    any such line was present. Each following line becomes a tuple of the
    integers found on it, split on whitespace.

    Raises ValueError when a data field cannot be parsed as an integer.
    """
    grace.status('Load ' + filename)
    try:
        with open(filename, 'rb') as f:
            lines = f.readlines()

        # Count the leading header lines.
        n = 0
        while n < len(lines) and lines[n].startswith('#'):
            n += 1

        headers = lines[:n]
        is_multiplot = (n > 0)
        data = [
            tuple([int(item) for item in line.strip().split()])
            for line in lines[n:]
        ]
    finally:
        # Clear the status line on the error path too.
        grace.status('')
    return headers, is_multiplot, data
def _make_inner(action):
    """Run `action` and save its state, trading cores around the run.

    A timestamp is taken from the coordinator and asserted to be later
    than LOCAL.time. If the action needs more than one core, the single
    held core is traded for that many, and traded back afterwards. When
    LOCAL.abort_make is set and the action's shell name does not match
    LOCAL.do_selection, grace.Error is raised instead of running. The
    status text is set to the shell name for the run, then restored.
    """
    timestamp = coordinator().time()
    assert timestamp > LOCAL.time, 'Time running in reverse.'

    cores = action.cores_required()
    needs_extra_cores = cores > 1
    if needs_extra_cores:
        coordinator().trade_cores(1, cores)

    try:
        banner = '\n' + action.describe() + '\n'
        config.write_colored_text(sys.stderr, banner)

        if LOCAL.abort_make:
            selected = selection.matches(LOCAL.do_selection, [action.shell_name()])
            if not selected:
                raise grace.Error('%s would be run. Stopping here.' % action.ident())

        # grace.status returns a value that is handed back to it afterwards.
        previous_status = grace.status(action.shell_name())
        try:
            _run_and_save_state(action, timestamp)
        finally:
            grace.status(previous_status)
    finally:
        if needs_extra_cores:
            coordinator().trade_cores(cores, 1)
def _make_inner(action):
    """Run `action` and save its state, trading cores around the run.

    A timestamp is taken from the coordinator and asserted to be later
    than LOCAL.time. If the action needs more than one core, the single
    held core is traded for that many, and traded back afterwards. When
    LOCAL.abort_make is set, grace.Error is raised instead of running.
    The status text is set to the action's ident for the run, then
    cleared.
    """
    timestamp = coordinator().time()
    assert timestamp > LOCAL.time, 'Time running in reverse.'

    cores = action.cores_required()
    multi_core = cores > 1
    if multi_core:
        coordinator().trade_cores(1, cores)

    try:
        description = action.describe()
        config.write_colored_text(sys.stderr, '\n' + description + '\n')

        if LOCAL.abort_make:
            message = '%s would be run. Stopping here.' % action.ident()
            raise grace.Error(message)

        grace.status(action.ident())
        try:
            _run_and_save_state(action, timestamp)
        finally:
            grace.status('')
    finally:
        # Return the extra cores whether or not the action succeeded.
        if multi_core:
            coordinator().trade_cores(cores, 1)
def run(self):
    """Import alignments into the working directory, sorted by name.

    Stores the reference and snp_cost in the workspace at self.output_dir.
    Input that is not already BAM is converted via
    'samtools view -S -b -' into temp.bam. The BAM is then name-sorted
    with sam.sort_bam to the 'alignments' prefix. The temporary BAM is
    deleted afterwards, including when the sort fails.
    """
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost = self.snp_cost)

    bam_prefix = io.abspath(self.output_dir, 'alignments')

    temp_filename = None
    try:
        if sam.is_bam(self.input):
            sort_input_filename = self.input
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename

            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            try:
                # Stream the input through samtools in 1 MiB chunks. Both
                # the file and the pipe are closed even if a write fails.
                with open(self.input, 'rb') as f:
                    while True:
                        data = f.read(1<<20)
                        if not data:
                            break
                        writer.write(data)
            finally:
                writer.close()

        grace.status('Sort')
        try:
            sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)
        finally:
            grace.status('')
    finally:
        if temp_filename is not None and os.path.exists(temp_filename):
            os.unlink(temp_filename)
def run(self):
    """Import alignments into the working directory, sorted by name.

    Stores the reference and snp_cost in the workspace at self.output_dir.
    Input that is not already BAM is converted via
    'samtools view -S -b -' into temp.bam. The BAM is then name-sorted by
    running 'samtools sort -n' with the 'alignments' output prefix. The
    temporary BAM is deleted afterwards, including when the sort fails.
    """
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost = self.snp_cost)

    bam_prefix = io.abspath(self.output_dir, 'alignments')

    temp_filename = None
    try:
        if sam.is_bam(self.input):
            sort_input_filename = self.input
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename

            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            try:
                # Copy the input in 1 MiB chunks; the with-block and the
                # finally clause release both handles on any error.
                with open(self.input, 'rb') as f:
                    while True:
                        data = f.read(1<<20)
                        if not data:
                            break
                        writer.write(data)
            finally:
                writer.close()

        grace.status('Sort')
        try:
            io.execute([
                'samtools', 'sort', '-n', sort_input_filename, bam_prefix
            ])
        finally:
            grace.status('')
    finally:
        if temp_filename is not None and os.path.exists(temp_filename):
            os.unlink(temp_filename)
def find_maximum_depth(self):
    """Record the largest ambiguous depth over all chromosomes and samples.

    Sets self.maximum (never below 1) and self.norm_maximum (never below
    1.0, each sample's value scaled by self.norm_mult). With
    self.strand_specific set, the two strands are considered separately;
    otherwise their sum is used.
    """
    grace.status('Finding maximum depth')

    best = 1
    best_normalized = 1.0

    # The return value is not used below; the call is kept as it was.
    self.normalizer()

    for name in self.chromosome_names:
        for index, depth in enumerate(self.depths):
            grace.status('Finding maximum depth %s %s' % (name, self.sample_names[index]))

            if self.strand_specific:
                forward_max = depth[name].ambiguous_depths[0].maximum()
                reverse_max = depth[name].ambiguous_depths[1].maximum()
                this = max(forward_max, reverse_max)
            else:
                pooled = depth[name].ambiguous_depths[0] + depth[name].ambiguous_depths[1]
                this = pooled.maximum()

            best = max(best, this)
            best_normalized = max(best_normalized, self.norm_mult[index] * this)

    self.maximum = best
    self.norm_maximum = best_normalized
    grace.status('')
def calculate_norm_mult(self):
    """Compute per-sample normalization multipliers into self.norm_mult.

    Each sample's total is the sum, over every sequence in self.lengths,
    of the totals of its two depth tracks. Samples with a zero total get a
    multiplier of 1.0. Every other sample gets geomean / total, where
    geomean is the geometric mean of the non-zero totals.
    """
    grace.status('Calculating normalization')
    try:
        totals = [0] * len(self.working_dirs)
        for i in xrange(len(self.workspaces)):
            for name in self.lengths:
                sample_depths = self.depths[i][name].depths
                totals[i] += sample_depths[0].total() + sample_depths[1].total()
    finally:
        grace.status('')

    nonzero = [item for item in totals if item]
    if nonzero:
        geomean = math.exp(sum(math.log(item) for item in nonzero) / len(nonzero))
    else:
        # All samples are empty. Every multiplier will be 1.0 anyway, and
        # dividing by len(nonzero) would raise ZeroDivisionError.
        geomean = 1.0

    self.norm_mult = [1.0 if not item else geomean / item for item in totals]
def run(self):
    """Split each input table into one .userplot file per chromosome/column.

    For each file in self.files, records are read with io.Table_reader.
    Whenever the 'Chromosome' value changes, one output file is opened in
    the workspace for every heading after the fourth. Each record then
    contributes one line per such column. Records of a chromosome must be
    consecutive single positions starting at 0.

    Raises AssertionError when a record's Start/End are not the expected
    position.
    """
    working = io.Workspace(self.output_dir, must_exist=False)

    for filename in self.files:
        reader = io.Table_reader(filename)
        name = os.path.splitext(os.path.split(filename)[1])[0]

        rname = None
        files = []
        pos = 0
        try:
            for record in reader:
                if record['Chromosome'] != rname:
                    # New chromosome: close the previous set of outputs.
                    while files:
                        files.pop().close()

                    rname = record['Chromosome']
                    grace.status('Convert ' + name + ' ' + rname)

                    # Opened one at a time, so a failure part-way through
                    # still leaves earlier handles tracked for cleanup.
                    for item in reader.headings[4:]:
                        files.append(open(working / (
                            name + '-' +
                            grace.filesystem_friendly_name(rname) + '-' +
                            grace.filesystem_friendly_name(item) + '.userplot'
                        ), 'wb'))
                    pos = 0

                assert int(record['Start']) == pos and int(record['End']) == pos + 1

                for val, f in zip(record.values()[4:], files):
                    print >> f, val

                pos += 1
        finally:
            # Runs on success and on error, so handles are never leaked and
            # the status line is always cleared.
            for item in files:
                item.close()
            grace.status('')
def run(self):
    """Sample at most self.n sequences uniformly and write them to stdout.

    All files in self.filenames are scanned once. The first self.n records
    fill the reservoir. After that, record number `seen` replaces a
    randomly chosen slot with probability self.n / seen, which keeps the
    sample uniform over everything read so far. A summary is printed to
    stderr, and each kept record is written with io.write_fastq, with its
    quality string if it has one.
    """
    seqs = [ ]
    seen = 0
    for filename in self.filenames:
        for seq in io.read_sequences(filename, qualities=True):
            seen += 1
            if seen % 100000 == 0:
                grace.status('Scanned '+grace.pretty_number(seen))

            if len(seqs) < self.n:
                seqs.append(seq)
            elif random.random() * seen < self.n:
                # Fixed comparison: the old test (n <= random * seen)
                # replaced with probability 1 - n/seen, biasing the sample
                # towards the end of the input.
                seqs[ random.randrange(self.n) ] = seq
    grace.status('')

    print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'

    if not seqs:
        return

    # A record has a quality string only when it is a 3-tuple. The old
    # check used len(seqs[0]) as a flag, which is never zero.
    for record in seqs:
        if len(record) == 3:
            name, seq, qual = record
            io.write_fastq(sys.stdout, name, seq, qual)
        else:
            name, seq = record
            io.write_fastq(sys.stdout, name, seq)
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):
    """Count fragment alignments per feature and write a tab-separated table.

    Parameters:
        min_score      -- fragments whose best score is below this are
                          discarded (None disables the check).
        min_size, max_size -- optional bounds on the aligned span length.
        filter_mode    -- 'poly', 'mono' or 'existing'; selects the BAM file
                          name inside working directories and whether only
                          top-scoring / unique alignments are counted.
        equalize       -- when true, counts are subsampled so that every
                          sample has the same total.
        types          -- comma-separated feature types to count.
        locii          -- optional comma-separated locus_tag filter.
        qualifiers     -- comma-separated attribute names copied to the
                          output (replaced by the first line of the merge
                          file when one is given).
        use_strand     -- 'pool', 'forward', 'reverse' or 'both'.
        merge_filename -- optional file merging sequence names into genes.
        limit          -- optional maximum number of fragments per sample.
        output_prefix  -- the table is written to output_prefix + '.txt'.
        filenames      -- annotation files and BAM files / working dirs.
        log            -- object providing log(text) and datum(...).

    Raises AssertionError for invalid options or inconsistent inputs.
    """
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')
    qualifiers = qualifiers.split(',')
    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse', 'both'), "Can't understand --strand specification."

    # NOTE(review): these names are not used in this function; the import
    # is kept as it was.
    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        # First line gives qualifiers
        # remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
        with open(merge_filename, 'rU') as merge_file:
            qualifiers = merge_file.readline().rstrip('\n').split('\t')
            for line in merge_file:
                parts = line.rstrip('\n').split('\t')
                for name in parts[len(qualifiers) + 1:]:
                    assert name not in merge, 'Duplicate feature name in merge file'
                    merge[name] = parts[len(qualifiers)]
                    merge_qualifiers[name] = parts[:len(qualifiers)]

    genes = {}          # reference name -> gene index
    feature_names = {}  # feature_name -> number of occurrences
    features = []

    # Collect reference lengths from the @SQ header lines of every BAM file.
    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line:
                continue
            parts = line.split('\t')
            if parts[0] != '@SQ':
                continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'):
                    name = part[3:]
                if part.startswith('LN:'):
                    length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types:
                    continue

                if (locii is not None and
                        ('locus_tag' not in feature.attr or
                         feature.attr['locus_tag'].lower() not in locii)):
                    continue

                feat = Feature(n_samples)
                feat.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    feat.name = feature.type + ':' + feat.name

                # Disambiguate repeated names with a /N suffix.
                feature_names[feat.name] = feature_names.get(feat.name, 0) + 1
                if feature_names[feat.name] > 1:
                    feat.name += '/%d' % feature_names[feat.name]

                feat.qualifiers = [feature.attr.get(item, '') for item in qualifiers]
                feat.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1, feat))
                features.append(feat)
    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')

        name_feature = {}  # (merged)name -> feature
        for name in chromosome_length:
            # Use this sequence's own length. The earlier code read a stale
            # `length` variable left over from header parsing.
            this_length = chromosome_length[name]

            merged_name = merge.get(name, name)
            if merged_name not in name_feature:
                feat = Feature(n_samples)
                feat.name = merged_name
                feat.length = this_length
                feat.qualifiers = merge_qualifiers.get(name, ('',) * len(qualifiers))
                name_feature[merged_name] = feat
                features.append(feat)
            else:
                feat = name_feature[merged_name]
                feat.length = max(feat.length, this_length)
            genes[name].insert(Span_entry(0, this_length, 1, feat))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [sum(al.get_AS() for al in item) for item in fragment_alignments]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                # best_score is already >= min_score at this point.
                cutoff = best_score
            else:
                cutoff = min_score
            if cutoff is not None:
                fragment_alignments = [
                    item
                    for item, score in zip(fragment_alignments, fragment_scores)
                    if score >= cutoff
                ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                span_length = end - start
                if min_size is not None and span_length < min_size:
                    continue
                if max_size is not None and span_length > max_size:
                    continue

                rname = alignments[0].rname
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits:
                        continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                # Features hit by the same alignment share a fragment.
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]:
                                continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1

            # Features hit by different alignments of one fragment are
            # ambiguous with each other.
            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k:
                            continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]:
                                    continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits):
                n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit:
                break

        grace.status('')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5:
            continue
        strandedness.append((n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    if strandedness:
        strandedness = sum(strandedness) / len(strandedness)
        log.log('Strand specificity: %.0f%%\n'
                '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
                '   Average over all features with at least 5 hits.)\n'
                % strandedness)
    else:
        # No feature reached 5 hits, so the average is undefined. It used
        # to raise ZeroDivisionError here.
        log.log('Strand specificity: not estimated, no feature has at least 5 hits.\n')

    # Each getter maps a feature to (name, counts, common, ambiguous) for one
    # output row. They read their own argument instead of relying on an
    # enclosing loop variable that happens to be called `feature`.
    if use_strand == 'pool':
        getters = [
            lambda feat: (
                feat.name,
                add_lists(feat.count[1], feat.count[-1]),
                add_defdicts(feat.common[(1, 1)],
                             feat.common[(1, -1)],
                             feat.common[(-1, 1)],
                             feat.common[(-1, -1)]),
                add_defdicts(feat.ambiguous[(1, 1)],
                             feat.ambiguous[(1, -1)],
                             feat.ambiguous[(-1, 1)],
                             feat.ambiguous[(-1, -1)]))
        ]
    elif use_strand == 'forward':
        getters = [
            lambda feat: (feat.name, feat.count[1],
                          feat.common[(1, 1)], feat.ambiguous[(1, 1)])
        ]
    elif use_strand == 'reverse':
        getters = [
            lambda feat: (feat.name, feat.count[-1],
                          feat.common[(-1, -1)], feat.ambiguous[(-1, -1)])
        ]
    elif use_strand == 'both':
        getters = [
            lambda feat: (feat.name, feat.count[1],
                          feat.common[(1, 1)], feat.ambiguous[(1, 1)]),
            lambda feat: (feat.name + 'r', feat.count[-1],
                          feat.common[(-1, -1)], feat.ambiguous[(-1, -1)])
        ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    # NOTE(review): a sample with zero total hits makes the divisions below
    # raise ZeroDivisionError, as before -- confirm the desired behaviour.
    if equalize:
        min_hits = min(total_hits)
        p = [float(min_hits) / item for item in total_hits]
        total_hits = [min_hits] * n_samples

    # The context manager closes the table even if a row fails to format.
    with open(output_prefix + '.txt', 'wb') as f:
        print >> f, tab_encode(
            ['Feature'] +
            titles +
            ['RPKM ' + item for item in titles] +
            ['Length'] +
            qualifiers +
            ['On same fragment'] +
            (['Ambiguous alignment'] if expect_multiple_alignments else [])
        )

        for feature in features:
            for getter in getters:
                feature_name, count, common, ambiguous = getter(feature)

                if equalize:
                    count = [subsample(count[i], p[i]) for i in xrange(n_samples)]

                rpkm = [
                    count[i] * 1.0e9 / feature.length / total_hits[i]
                    for i in xrange(n_samples)
                ]

                common_str = ' '.join(
                    '%dx%s' % (item[1], item[0])
                    for item in sorted(common.items(), key=lambda item: item[1], reverse=True))
                ambiguous_str = ' '.join(
                    '%dx%s' % (item[1], item[0])
                    for item in sorted(ambiguous.items(), key=lambda item: item[1], reverse=True))

                print >> f, tab_encode(
                    [feature_name] +
                    [str(item) for item in count] +
                    ['%.2f' % item for item in rpkm] +
                    [str(feature.length)] +
                    list(feature.qualifiers) +
                    [common_str] +
                    ([ambiguous_str] if expect_multiple_alignments else [])
                )
def pastiche(args):
    """Lay contig sequence over each reference sequence to build a "pastiche".

    args is a command line argument list: optional --mask and --min-leftover
    options, then the output directory, then reference filenames and a
    'contigs' section of contig filenames (split up by grace.execute).

    For every reference sequence, each contig file is aligned with nucmer and
    the alignments are read back from show-aligns.  Two slots are kept per
    reference base: slot 2*i holds the sequence placed on base i and slot
    2*i+1 holds sequence inserted after base i.  Each slot keeps the contig
    substring from the highest scoring alignment that covers it.

    Writes pastiche.fa (one record per reference) and leftovers.fa (contig
    stretches longer than --min-leftover that no winning alignment used) into
    the output workspace.

    Returns 1 after printing USAGE when fewer than four arguments are given;
    otherwise falls off the end and returns None.
    """
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    # Bare arguments are references, arguments in the 'contigs' section are contigs.
    ref_filenames = []
    contig_filenames = []
    grace.execute(args, {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    # Contigs are keyed by the first word of their name.
    contigs = dict([(name.split()[0], seq) for filename in contig_filenames
                    for name, seq in io.read_sequences(filename)])

    # "Directed" contigs: name+'+' is the contig as given, name+'-' its reverse complement.
    dir_contigs = {}
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    # Per-base flags recording which directed contig bases ended up in the output.
    dir_contigs_used = {}
    for name in dir_contigs:
        dir_contigs_used[name] = [False] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            # The reference is always written under the name 'ref' for nucmer.
            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            # Two slots per reference base: even = the base itself (default 'N'),
            # odd = insertion following the base (default empty).
            scores = [-1] * (len(ref_seq) * 2)
            strings = ['N', ''] * (len(ref_seq))
            contexts = [None for i in xrange(len(ref_seq) * 2)]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            # With --mask, uncovered bases show the lower-cased reference instead of 'N'.
            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                # Claim a slot only if this alignment scores strictly higher
                # than whatever currently occupies it.
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix', temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch', '9',
                    '--mincluster', '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename
                ])

                for contig_name, contig_seq in io.read_sequences(
                        contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    # NOTE(review): stderr is piped but never read in this
                    # block -- confirm the helper cannot block on a full pipe.
                    p = run([
                        'show-aligns', temp_prefix + '.delta', 'ref',
                        contig_name
                    ], stderr=subprocess.PIPE)

                    alignments = []

                    while True:
                        line = p.stdout.readline()
                        if not line: break
                        # NOTE(review): the marker strings must match the
                        # show-aligns output exactly, including spacing -- confirm.
                        if not line.startswith('-- BEGIN'):
                            continue

                        # Coordinates are taken from fixed word positions of the BEGIN line.
                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = []
                        al_query = []

                        # The alignment body comes as blank-line separated blocks;
                        # line 0 of a block is the reference row, line 1 the query row.
                        while True:
                            block = []
                            end = False
                            while True:
                                # NOTE(review): at EOF readline() returns '' which
                                # matches neither test below, so a truncated stream
                                # looks like it would loop forever -- confirm.
                                line = p.stdout.readline()
                                if line.startswith('-- END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end: break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        # Normalise so the reference always runs forwards.
                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        # A backwards query means the reverse-complemented contig
                        # was aligned; convert to coordinates on that strand.
                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1  #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        # Score is simply the number of identical alignment columns.
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            # Each pass handles one reference base, then any
                            # run of '.' in the reference row that follows it.
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                # Query gap: offer an empty string for this base.
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[
                                    dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(
                                    al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower(
                            ) == dir_contigs[dir_contig_name][
                                query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos,
                                query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end
                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            # Mark every contig base that made it into the output.
            for context in contexts:
                if context is None: continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        # A base counts as used if either strand used it; the '-' flags are
        # reversed to line up with '+' coordinates.
        used = [
            (a or b)
            for a, b in zip(dir_contigs_used[name + '+'],
                            dir_contigs_used[name + '-'][::-1])
        ]

        # Emit each unused run [i, j) that is longer than min_leftover.
        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            # Position j is either a used base or the end, so skip past it.
            i = j + 1

    leftover_f.close()

    # Remove the temporary nucmer input and output.
    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
def improve(comment, constrainer, scorer, start_x, ftol=1e-4, xtol=1e-6, initial_accuracy=0.001, monitor=lambda x, y: None): pool_size = legion.coordinator().get_cores() worker_futs = [ legion.coordinator().new_future() for i in xrange(pool_size) ] reply_futs = [] workers = [legion.future(worker, scorer, fut) for fut in worker_futs] last_t = 0.0 try: best = start_x c_score = constrainer(best) if c_score: best_score = (c_score, 0.0) else: best_score = (0.0, scorer(best)) n_good = 0 n_real = 0 i = 0 jobs = [] pool_size = int(len(best) * 5) #5 print len(best), 'parameters, pool size', pool_size currents = [(best, best_score)] done = False while not done or reply_futs: t = time.time() if t > last_t + 20.0: def rep(x): if x[0]: return 'C%.6f' % x[0] return '%.6f' % x[1] grace.status( '%s %s %d %d %d %d %s' % (rep(best_score), rep(max(item[1] for item in currents)), len(currents), n_good, n_real, i, comment)) if best_score[0] == 0: monitor(best, [item[0] for item in currents]) last_t = time.time() have_score = False if not done and worker_futs: new = make_update([item[0] for item in currents], initial_accuracy, len(currents) < pool_size) c_score = constrainer(new) if c_score: have_score = True new_score = (c_score, 0.0) else: reply_fut = legion.coordinator().new_future() worker_fut = worker_futs.pop(0) legion.coordinator().deliver_future( worker_fut, (new, reply_fut)) reply_futs.append((new, reply_fut)) if not have_score: if not reply_futs or (not done and worker_futs): continue new, reply_fut = reply_futs.pop(0) new_score, worker_fut = legion.coordinator().get_future( reply_fut) new_score = (0.0, new_score) worker_futs.append(worker_fut) if new_score[0] == 0.0: n_real += 1 l = sorted(item[1][1] for item in currents) if pool_size < len(l): c = l[pool_size] else: c = 1e30 cutoff = (best_score[0], c) if new_score <= cutoff: currents = [item for item in currents if item[1] <= cutoff] currents.append((new, new_score)) n_good += 1 if new_score < best_score: best_score = 
new_score best = new if len(currents) >= pool_size and best_score[0] == 0.0: xspan = 0.0 for i in xrange(len(start_x)): xspan = max( xspan, max(item[0][i] for item in currents) - min(item[0][i] for item in currents)) fspan = (max(item[1] for item in currents)[1] - best_score[1]) if xspan < xtol or (n_good >= 5000 and fspan < ftol): done = True i += 1 grace.status('') print '%s %.5f\n' % (comment, best_score[1]) finally: #pool.terminate() pass while worker_futs: fut = worker_futs.pop(0) legion.coordinator().deliver_future(fut, None) for item in workers: item() return best
def run(self):
    """
    Clip poly(A) tail and adaptor from reads, which are expected to look like:

    <sequence> <poly-A> <adaptor> <anything>

    Reads self.filenames (qualities required).  Writes the clipped reads to
    self.prefix+'.fastq.gz' and one line per read describing the clip to
    self.prefix+'.clips.gz', then logs summary statistics via self.log.

    Reads whose clip point falls before self.length are discarded.
    """
    # Quality characters are compared directly; the offset of 33 is applied here.
    clip_quality = chr(33+self.clip_quality)
    #ignore_quality = chr(33+self.ignore_quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                # "Good quality" sequence ends at the first low quality base
                #good_quality_end = 0
                #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                #    good_quality_end += 1

                # The code in use instead picks the prefix with the best running
                # score: +1 per base at or above clip_quality, -9 per base below.
                goodness_score = 0
                best_goodness_score = 0
                good_quality_end = 0
                i = 0
                while True:
                    if goodness_score > best_goodness_score:
                        best_goodness_score = goodness_score
                        good_quality_end = i

                    if i >= len(seq):
                        break

                    if qual[i] >= clip_quality:
                        goodness_score += 1
                    else:
                        goodness_score -= 9
                    i += 1

                # Starting one below min_score means a tail is only called
                # if it scores at least self.min_score.
                best_score = self.min_score-1
                best_a_start = good_quality_end
                best_a_end = good_quality_end
                best_adaptor_bases = 0
                best_aonly_score = 0
                best_aonly_start = good_quality_end
                best_aonly_end = good_quality_end

                # Consider each possible start position for the poly(A)
                for a_start in xrange(len(seq)):
                    # A start preceded by an A is skipped: the earlier start
                    # covers it with at least as good a score.
                    if a_start and seq[a_start-1] == 'A': continue

                    # Consider each possible end position for the poly(A)
                    a_end = a_start
                    aonly_score = 0
                    while True:
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        # The poly(A) should be followed by adaptor,
                        ## at least until the end of good quality sequence.
                        # However if there is evidence of the adaptor beyond
                        # the end of good quality, we still want to know that,
                        # and count it towards the number of adaptor bases present.
                        score = aonly_score
                        adaptor_bases = 0
                        i = a_end
                        # Give up once even a full adaptor match could not
                        # bring the score back above the best so far.
                        abort_score = best_score-len(self.adaptor)
                        abort_i = min(len(seq), a_end+len(self.adaptor))
                        while score >= abort_score:
                            #if (score > best_score and
                            #    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                            if score > best_score:
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases

                            if i >= abort_i:
                                break

                            # Adaptor scoring: +1 per match, -4 per mismatch.
                            if seq[i] == self.adaptor[i-a_end]:
                                score += 1
                                adaptor_bases += 1
                            else:
                                score -= 4
                            i += 1

                        #if a_end >= len(seq): break

                        # Modified 2018-03-21
                        # poly(A) tail only within good quality region.
                        #if a_end >= good_quality_end: break
                        #if qual[a_end] >= ignore_quality:
                        #    if seq[a_end] == 'A':
                        #        aonly_score += 1
                        #    else:
                        #        aonly_score -= 4
                        #        if aonly_score <= 0: break

                        if a_end >= len(seq): break

                        # poly(A) scoring: +1 per A, -4 per non-A.
                        if seq[a_end] == 'A':
                            aonly_score += 1
                        else: #if qual[a_end] >= ignore_quality:
                            aonly_score -= 4
                        #else:
                        #    aonly_score -= 1

                        a_end += 1

                # 2018-03-21
                # Look for tail starting after good quality,
                # however don't call a tail if starts after good quality
                if best_a_start > good_quality_end:
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_score = 0

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    print name
                    print ''.join(
                        ('C' if item<clip_quality else ' ')
                        for item in qual )
                    print '-' * good_quality_end
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                    #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                # Keep the read, truncated at the poly(A) start, only if
                # enough sequence remains.
                if a_start >= self.length:
                    # Only reads that actually lost bases count as clipped.
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n%10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

                # Option to do a quick subsample
                # NOTE(review): this break only leaves the per-file loop, so
                # with several input files each later file still seems to
                # contribute one read -- confirm this is intended.
                if self.only and self.only <= n:
                    break

    grace.status('')

    self.log.datum(self.sample,'reads',n)
    if n:
        self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
    self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
    self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
    if n_clipped:
        self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
def run(self):
    """Call peaks from pooled alignment spans and write them as GFF3.

    Spans are loaded from self.filenames in parallel via self._load_bam and
    pooled by (reference name, strand).  For each pool a depth profile is
    built, optionally reduced by self.crosstalk times the opposite strand's
    depth, and self._find_spans(depth) is asked for peak spans.  Each span
    still longer than zero after trimming self.lap from its end is written to
    self.prefix+'.gff' with id 'peakN'.

    The number of peaks is logged via self.log.
    """
    assert self.what in ('fragment','5prime','3prime'), 'Unknown option for --what.'

    spans = collections.defaultdict(list)
    for item in legion.parallel_imap(self._load_bam, self.filenames):
        for key,value in item.items():
            spans[key].extend(value)

    grace.status('Calling peaks')

    n = 0

    # The with-block guarantees the GFF file is closed even if peak calling
    # raises part way through.
    with open(self.prefix+'.gff', 'wb') as f:
        annotation.write_gff3_header(f)

        for (rname, strand), span_list in spans.items():
            # Build the depth profile as a difference array: +1 where a span
            # starts, -1 where it ends, then take the running sum.
            depth = [ 0.0 ] * (1+max( item[1] for item in span_list ))
            for start, end in span_list:
                depth[start] += 1.0
                depth[end] -= 1.0

            # Subtract a fraction of the opposite strand's depth.  Spans that
            # lie beyond this strand's profile are ignored.
            if self.crosstalk and strand and (rname,-strand) in spans:
                for start, end in spans[(rname,-strand)]:
                    if start < len(depth): depth[start] -= self.crosstalk
                    if end < len(depth): depth[end] += self.crosstalk

            for i in xrange(1,len(depth)):
                depth[i] += depth[i-1]

            # Crosstalk subtraction can push depth negative; clamp at zero.
            if self.crosstalk:
                for i in xrange(len(depth)):
                    depth[i] = max(0.0,depth[i])

            for start, end in self._find_spans(depth):
                if end-self.lap-start <= 0: continue

                n += 1
                peak_id = 'peak%d' % n

                ann = annotation.Annotation()
                ann.source = 'nesoni'
                ann.type = self.type
                ann.seqid = rname
                ann.start = start
                ann.end = end - self.lap
                ann.strand = strand
                ann.score = None
                ann.phase = None
                ann.attr = {
                    'id' : peak_id,
                    'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                print >> f, ann.as_gff()
                # Flushed per record so partial output is visible while running.
                f.flush()

    self.log.datum('-','called peaks',n)

    grace.status('')
def bam_iter_fragments(filename, status_text='Processing'): reader = Bam_reader(filename) n = 0 n_ambiguous = 0 for read_name, alignment_iter in itertools.groupby(reader, lambda read: read.qname): if n % 100000 == 0: grace.status(status_text + ' fragment %s' % grace.pretty_number(n)) n += 1 unpaired = [ ] first = [ ] second = [ ] unmapped = [ ] for al in alignment_iter: if al.flag&FLAG_UNMAPPED: unmapped.append(al) elif not al.flag&FLAG_PAIRED or al.flag&FLAG_MATE_UNMAPPED: unpaired.append((al,)) elif al.flag&FLAG_FIRST: first.append(al) elif al.flag&FLAG_SECOND: second.append(al) else: assert False, 'Read in pair that is neither first nor second' pairs = [ ] unused = set(first + second) second_index = { } for al in second: key = (al.rname, al.pos) if key not in second_index: second_index[key] = [ ] second_index[key].append(al) for al1 in first: key = (al1.get_mrnm(), al1.mpos) for al2 in second_index.get(key, ()): if al2.get_mrnm() != al1.rname or \ al2.mpos != al1.pos: continue if al1 not in unused or al2 not in unused: # pfh says: I am displeased that the pairing is sometimes ambiguous n_ambiguous += 1 continue pairs.append( (al1, al2) ) unused.remove(al1) unused.remove(al2) if unused: print unused assert not unused, 'Alignment pairing not even pretending to make sense. Is the BAM file sorted by read name?' yield read_name, pairs + unpaired, unmapped grace.status('') if n_ambiguous: print >> sys.stderr print >> sys.stderr, 'The alignment pairing was unclear %s times, and alignments were paired arbitrarily.' % grace.pretty_number(n_ambiguous) print >> sys.stderr, 'Blame the SAM format.' print >> sys.stderr
def run(self):
    """Align reads to the reference with SHRiMP 2, producing a name-sorted BAM.

    Each entry of self.reads, self.pairs and self.interleaved becomes one
    SHRiMP run.  The SAM output of all runs is streamed into a single
    temporary BAM file (header lines are taken from the first run only),
    which is then sorted by read name into <output_dir>/alignments and
    deleted.  SHRiMP's stderr goes to <output_dir>/shrimp_log.txt.
    """
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # Each read set is (list of filenames, is_paired).
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    #Create working directory

    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    # NOTE(review): reference_filename is not used again in this method.
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Options added to every run unless the user already gave the flag.
    # A value of None means the flag takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }

    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None

    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # NOTE(review): cutoff is computed but not used again in this method.
    cutoff = '55%'  #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

    #Run shrimp

    # NOTE(review): bam_filename and bam_sorted_prefix are not used again here.
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

    temp_filename = io.abspath(self.output_dir, 'temp.bam')

    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')

    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them (Python 2
    # has no nonlocal).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Forward SAM lines from one SHRiMP run to the BAM writer, dropping
        # header lines if an earlier run already supplied a header.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]: continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' %
                                 grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip '-p' and '-I' along with their values, and the valueless
        # '--half-paired' flag.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        # The first record of each file decides whether qualities are present.
        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3  #A little ugly
            for filename in filenames)
        if has_qualities:
            options.append('--fastq')

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

        # default_options is shared between iterations; these two entries
        # are overwritten for every read set.
        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str(
                io.guess_quality_offset(*filenames))

        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',', '_'),
            workspace.name.replace(',', '_'))
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status('')

        full_param = reference.shrimp_command(self.cs,
                                              options + reads_parameters)

        print >> sys.stderr, 'Running', ' '.join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()

    sam_eater.close()

    grace.status('Sort')

    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #])

    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

    os.unlink(temp_filename)

    grace.status('')
def make_ambiguity_bigwig_by_readname(prefix, bam_filenames, stop_after=None, subsample=1):
    """Write prefix+'.bw', a bigWig track of alignment ambiguity.

    Alignments in each BAM file are grouped by consecutive query name and
    only the top-scoring (AS) mapped, non-supplementary alignments of every
    subsample-th group are used.  Each alignment adds coverage weighted by
    1/(number of top alignments) to a total pile; groups with exactly one
    top alignment also add to an unambiguous pile.  The track value is
    max(0, total-unambiguous) / max(total, 1).

    stop_after, if given, stops reading a file once more than that many
    groups have been used.  Chromosome names and sizes come from the header
    of the first BAM file.  The intermediate .bedgraph and -chrom.sizes
    files are deleted after wigToBigWig has run.
    """
    sizes_filename = prefix+"-chrom.sizes"
    bedgraph_filename = prefix+".bedgraph"

    sequences = sam.parsed_bam_headers(bam_filenames[0])["SQ"]

    with open(sizes_filename,"wb") as sizes_file:
        for entry in sequences:
            sizes_file.write("{}\t{}\n".format(entry["SN"],entry["LN"]))

    chrom_names = [ entry["SN"] for entry in sequences ]
    chrom_sizes = [ int(entry["LN"]) for entry in sequences ]

    # One coverage accumulator per chromosome for each of the two piles.
    unambiguous = { }
    for chrom, size in zip(chrom_names,chrom_sizes):
        unambiguous[chrom] = Piler(size)
    total = { }
    for chrom, size in zip(chrom_names,chrom_sizes):
        total[chrom] = Piler(size)

    old = grace.status("Ambiguity bigwig")

    for filename in bam_filenames:
        reader = sam.Bam_reader(filename)
        n = 0

        # phase cycles through 0..subsample-1; only groups at phase 0 are used.
        phase = subsample-1
        for _, group in itertools.groupby(reader, lambda item: item.query_name):
            phase = (phase + 1) % subsample
            if phase != 0:
                continue

            mapped = [ al for al in group
                       if not (al.is_unmapped or al.is_supplementary) ]
            if not mapped:
                continue

            # Only use top scoring alignments
            scores = [ al.get_AS() for al in mapped ]
            top_score = max(scores)
            best = [ al for al, score in zip(mapped,scores) if score >= top_score ]

            for al in best:
                #TODO fixme when blocks available
                coverage = fragment_coverage([al])
                coverage = scale_spanner(1.0/len(best), coverage)
                total[al.reference_name].add(coverage)
                if len(best) == 1:
                    unambiguous[al.reference_name].add(coverage)

            n += 1
            if stop_after is not None and n > stop_after:
                break
            if n % 1000000 == 0:
                grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n))

        reader.close()

    # Carry both piles through pile() at once by putting the unambiguous
    # depth in the real part and the total depth in the imaginary part.
    ambiguities = [ ]
    for chrom in chrom_names[:len(total)]:
        u = unambiguous[chrom].get()
        t = map_spanner(lambda x: x*1j, total[chrom].get())
        combined = pile([u,t],initial=0.0)
        ambiguities.append(
            map_spanner(lambda x: max(0.0,x.imag-x.real)/max(x.imag,1.0), combined))

    bedgraph(bedgraph_filename, zip(chrom_names, list(ambiguities)))
    subprocess.check_call([
        "wigToBigWig",bedgraph_filename,sizes_filename,prefix+".bw"])
    os.unlink(bedgraph_filename)
    os.unlink(sizes_filename)

    grace.status(old)
def read_unstranded_userplot(filename):
    """Read a userplot file and return one depth value per row.

    For a multiplot each row contributes item[1]+item[2] (presumably the two
    strands -- confirm against read_userplot); otherwise item[0].
    """
    headers, is_multiplot, data = read_userplot(filename)
    if is_multiplot:
        return [ item[1]+item[2] for item in data ]
    else:
        return [ item[0] for item in data ]


def write_unstranded_userplot(filename, array):
    """Write each value of array to filename as a '%f' formatted line."""
    # The with-block closes the file even if a value cannot be converted.
    with open(filename,'wb') as f:
        for x in array:
            f.write( '%f\n' % float(x) )


def write_normalized_userplot(userplot, factor, filename):
    """Write a userplot to filename with its depths multiplied by factor.

    userplot is the (headers, is_multiplot, data) triple returned by
    read_userplot.  Header lines are copied unchanged.  In a multiplot the
    first column of each row is written as is and the remaining columns are
    scaled; otherwise the single column is scaled.

    The first parameter used to be a Python 2 tuple parameter; it is now
    unpacked explicitly, which accepts exactly the same positional calls.
    """
    headers, is_multiplot, data = userplot

    grace.status('Write '+filename)
    try:
        with open(filename, 'wb') as f:
            for line in headers:
                f.write(line)
            for item in data:
                if is_multiplot:
                    print >> f, item[0], ' '.join([ str(depth * factor) for depth in item[1:] ])
                else:
                    print >> f, item[0] * factor
    finally:
        # Clear the status line whether or not the write succeeded.
        grace.status('')


def normalize_files(dirnames, prefix, min_depth):
    """Load the '<prefix>-depth.userplot' file from each directory.

    NOTE(review): as visible here the function only builds contents and data
    and returns nothing, and min_depth is unused -- the rest of the body may
    be missing from this file.
    """
    contents = [ read_userplot(os.path.join(item,prefix+'-depth.userplot')) for item in dirnames ]
    data = [ item[2] for item in contents ]
def run(self):
    """Aggregate per-sample tail-length pickles into count and tail tables.

    Each file in self.pickles holds (name, tags, features), where every
    feature has a .hits list of (tail_length, adaptor_bases) pairs.  Hits
    with adaptor_bases >= self.adaptor are tallied by tail length, and tails
    of length >= self.tail count as poly(A) tails.

    Written into the workspace: features-with-data.gff, counts.csv,
    read_count.csv, tail_count.csv, tail.csv, tail_sd.csv,
    tail_quantile_*.csv and pooled.csv.  Summary figures go to self.log.

    NOTE(review): the pickles are loaded with pickle.load, so they must come
    from a trusted source.
    """
    assert len(self.pickles) > 0, "No samples to count."

    work = self.get_workspace()

    data = [ ]
    names = [ ]
    sample_tags = [ ]

    old = grace.status("Loading pickles")

    max_length = 1
    for i, item in enumerate(self.pickles):
        grace.status("Loading "+os.path.basename(item))
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

        # max() raises ValueError when the sample has no hits at all; in
        # that case max_length is left unchanged.
        try:
            max_length = max(max_length, max(
                item[0] #tail_length
                for feature in datum
                for item in feature.hits
                ) + 1)
        except ValueError:
            pass

        # The first sample's features double as the annotation records.
        if i == 0:
            annotations = datum

    grace.status(old)

    self.log.log("Maximum tail length %d\n" % max_length)

    # Replace each feature's raw hit list with a total count and a histogram
    # of tail lengths.
    for i in xrange(len(names)):
        n_alignments = 0
        for feature in data[i]:
            feature.total_count = len(feature.hits)
            feature.tail_counts = [ 0 ] * max_length

            n_alignments += feature.total_count

            # Only hits with enough adaptor evidence enter the histogram,
            # though all hits count towards total_count.
            for tail_length, adaptor_bases in feature.hits:
                if adaptor_bases >= self.adaptor:
                    feature.tail_counts[tail_length] += 1

            del feature.hits

        self.log.datum(names[i], 'Alignments to features', n_alignments)

    counts = [ ]  # [feature][sample](total_count, [taillength])

    # Every sample must list the same number of features, in the same order.
    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [ (item.total_count, item.tail_counts) for item in row ]
        counts.append(this_counts)

    n_features = len(counts)
    n_samples = len(data)

    sample_n = [ [0]*n_samples for i in xrange(n_features) ]  # [feature][sample] Total count
    sample_n_tail = [ [0]*n_samples for i in xrange(n_features) ]  # [feature][sample] Polya count
    sample_prop = [ [None]*n_samples for i in xrange(n_features) ]  # [feature][sample] Proportion of reads with tail (deprecated)
    sample_tail = [ [None]*n_samples for i in xrange(n_features) ]  # [feature][sample] Mean tail length in each sample
    sample_sd_tail = [ [None]*n_samples for i in xrange(n_features) ]  # [feature][sample] Std dev tail length in each sample
    # [feature][sample] Sum of tail lengths over reads with a tail
    sample_total_tail = [ [0]*n_samples for i in xrange(n_features) ]

    # quantile (percent) -> [feature][sample] tail length at that quantile
    sample_quantile_tail = collections.OrderedDict(
        (item, [ [None]*n_samples for i in xrange(n_features) ])
        for item in [25,50,75,100]
        )

    overall_n = [ 0 ]*n_features  # [feature] Overall count
    overall_prop = [ None ]*n_features  # [feature] Overall proportion with tail
    overall_tail = [ None ]*n_features  # [feature] Overall mean tail length
    overall_n_tail = [ 0 ]*n_features  # [feature] Overall polya count
    for i, row in enumerate(counts):
        for j, (this_this_n, item) in enumerate(row):
            # item is the tail length histogram for this feature and sample.
            sample_n[i][j] = this_this_n
            sample_n_tail[i][j] = sum(item[self.tail:])
            sample_total_tail[i][j] = sum( item[k]*k for k in xrange(self.tail,max_length) )

            if sample_n[i][j] >= 1:
                sample_prop[i][j] = float(sample_n_tail[i][j])/sample_n[i][j]

            if sample_n_tail[i][j] >= 1:
                sample_tail[i][j] = float(sample_total_tail[i][j])/sample_n_tail[i][j]

                # Walk up the histogram until the requested share of tailed
                # reads has been passed; k is then the quantile.
                for quantile in sample_quantile_tail:
                    counter = sample_n_tail[i][j] * quantile / 100.0
                    for k in xrange(self.tail, max_length):
                        counter -= item[k]
                        if counter <= 0: break
                    sample_quantile_tail[quantile][i][j] = k

            # Sample standard deviation (n-1 denominator) needs two reads.
            if sample_n_tail[i][j] >= 2:
                sample_sd_tail[i][j] = math.sqrt(
                    float(sum( item[k]*((k-sample_tail[i][j])**2) for k in xrange(self.tail,max_length) ))
                    / (sample_n_tail[i][j]-1)
                    )

        overall_n[i] = sum(sample_n[i])
        overall_n_tail[i] = sum(sample_n_tail[i])
        if overall_n[i] >= 1:
            overall_prop[i] = float(sum(sample_n_tail[i]))/overall_n[i]
        if overall_n_tail[i] >= 1:
            overall_tail[i] = float(sum(sample_total_tail[i]))/overall_n_tail[i]

    # Per-sample summaries across all features.
    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_total_tail )
        this_n = sum( item[i] for item in sample_n_tail )
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_n_tail )
        this_n = sum( item[i] for item in sample_n )
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    with open(work/'features-with-data.gff','wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            # NOTE(review): these truthiness tests also turn a genuine 0.0
            # into 'NA' (e.g. a feature with reads but no tails), unlike the
            # CSV output which goes through str_na -- confirm this is intended.
            item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'

            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                # a is the mean tail length scaled to 0..1 over the range
                # self.tail..max_length; it drives an RGB colour ramp.
                a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
            #item.attr['color'] = ...
            print >> f, item.as_gff()

    comments = [ '#Counts' ] + [
        '#sampleTags='+','.join(tags)
        for tags in sample_tags
        ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
        ]

    # Optional annotation columns are emitted only if some feature has them.
    have_biotype = any("Biotype" in item.attr for item in annotations)
    have_parent = any("Parent" in item.attr for item in annotations)
    have_relation = any("Relation" in item.attr for item in annotations)
    have_antisense = any("Antisense_parent" in item.attr for item in annotations)

    def counts_iter():
        # One row per feature; tuple keys are (group, column) pairs.
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(n_samples):
                row[('Count',names[j])] = '%d' % sample_n[i][j]

            row[('Annotation','Length')] = annotations[i].end - annotations[i].start
            row[('Annotation','gene')] = annotations[i].attr.get('Name','')
            row[('Annotation','product')] = annotations[i].attr.get('Product','')
            if have_biotype:
                row[('Annotation','biotype')] = annotations[i].attr.get('Biotype','')
            if have_parent:
                row[('Annotation','parent')] = annotations[i].attr.get('Parent','')
            if have_relation:
                row[('Annotation','relation')] = annotations[i].attr.get('Relation','')

            if have_antisense:
                row[('Annotation','antisense_gene')] = annotations[i].attr.get('Antisense_name','')
                row[('Annotation','antisense_product')] = annotations[i].attr.get('Antisense_product','')
                row[('Annotation','antisense_biotype')] = annotations[i].attr.get('Antisense_biotype','')
                row[('Annotation','antisense_parent')] = annotations[i].attr.get('Antisense_parent','')

            row[('Annotation','chromosome')] = str(annotations[i].seqid)
            row[('Annotation','strand')] = str(annotations[i].strand)
            # start is reported one higher than stored.
            row[('Annotation','start')] = str(annotations[i].start+1)
            row[('Annotation','end')] = str(annotations[i].end)

            row[('Annotation','reads')] = str(overall_n[i])
            row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation','mean-tail')] = str_na(overall_tail[i])
            row[('Annotation','proportion-with-tail')] = str_na(overall_prop[i])
            for j in xrange(n_samples):
                row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(n_samples):
                row[('Tail',names[j])] = str_na(sample_tail[i][j])
            for j in xrange(n_samples):
                row[('Tail_sd',names[j])] = str_na(sample_sd_tail[i][j])

            for quantile in sample_quantile_tail:
                for j in xrange(n_samples):
                    row[('Tail_quantile_%d'%quantile,names[j])] = str_na(sample_quantile_tail[quantile][i][j])

            for j in xrange(len(names)):
                row[('Proportion',names[j])] = str_na(sample_prop[i][j])
            yield row
    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def write_csv_matrix(filename, matrix):
        # Write a [feature][sample] matrix as CSV with one column per sample.
        def emitter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row["Feature"] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[names[j]] = str_na(matrix[i][j])
                yield row
        io.write_csv(filename, emitter())

    write_csv_matrix(work/'read_count.csv', sample_n)
    write_csv_matrix(work/'tail_count.csv', sample_n_tail)
    write_csv_matrix(work/'tail.csv', sample_tail)
    write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
    for quantile in sample_quantile_tail:
        write_csv_matrix(work/('tail_quantile_%d.csv'%quantile), sample_quantile_tail[quantile])

    #def raw_columns():
    #    for i in xrange(n_samples):
    #        row = collections.OrderedDict()
    #        row['Sample'] = names[i]
    #        for j in xrange(max_length):
    #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
    #        yield row
    #io.write_csv(work/'raw-columns.csv', raw_columns())
    #
    ##Somewhat inefficient
    #def raw():
    #    for i in xrange(n_features):
    #        row = collections.OrderedDict()
    #        row['Feature'] = annotations[i].get_id()
    #        for j in xrange(n_samples):
    #            for k in xrange(max_length):
    #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
    #        yield row
    #io.write_csv(work/'raw.csv', raw())

    def pooled():
        # Tail length histogram per feature, summed over all samples.
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str( sum( counts[i][k][1][j] for k in xrange(n_samples) ) )
            yield row
    io.write_csv(work/'pooled.csv', pooled())
def recombination(args): grace.expect_no_further_options(args) if len(args) != 2: print >> sys.stderr, USAGE raise grace.Help_shown() working_dir, seq_name = args references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa'))) depth = { } prefixes = { } suffixes = { } for name in references: depth[name] = numpy.zeros(len(references[name]), 'int64') prefixes[name] = [ [] for base in references[name] ] suffixes[name] = [ [] for base in references[name] ] def register_divergence(hit): if not hit.query_forward: hit = hit.reversed() margin = 20 if hit.target_end - hit.target_start < 20: return False depth[hit.target_name][hit.target_start : hit.target_end] += 1 any = False if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq): suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] ) any = True if hit.query_start >= margin: # and hit.target_start > 0: prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] ) any = True return any n = 0 for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir): # Skip reads containing Ns if 'N' in read_seq: continue for line in hits: register_divergence(alignment_from_shrimp(line, references, read_name, read_seq)) n += 1 #if n > 100000: # break if n%10000 == 0: grace.status('Processing read %s' % grace.pretty_number(n)) grace.status('') def show_items(items): original_length = len(items) cut = 0 while len(items) > 80: cut += 1 items = [ item for item in items if item[0] >= cut ] for item in items: print item[1] if len(items) < original_length: print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1) def score(items): if not items: return 1.0 return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2) def summarize_prefixes(seqs, pad): seqs = sorted(seqs, key=lambda seq: seq[::-1]) cut = 100 while True: items = [ ] for (seq, iterator) in 
itertools.groupby(seqs, key = lambda x: x[-cut:]): ss = list(iterator) anylong = any( item != seq for item in ss ) n = len(ss) items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) ) if score(items) >= 1.0/20: break cut -= 1 show_items(items) def summarize_suffixes(seqs, pad): seqs = sorted(seqs) cut = 100 while True: items = [ ] for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]): ss = list(iterator) anylong = any( item != seq for item in ss ) n = len(ss) items.append( (n, ('%'+str(pad)+'s')%('%d x '%n) + seq + ('...' if anylong else '')) ) if score(items) >= 1.0/20: break cut -= 1 show_items(items) print 'Position Depth Changed prefixes Changed suffixes' print ' Count % of depth Count % of depth' for i in xrange(len(references[seq_name])): print '%8d %10d %9d %11s %9d %11s' % ( i+1, depth[seq_name][i], len(prefixes[seq_name][i]), '%.3f%%' % (len(prefixes[seq_name][i])*100.0/depth[seq_name][i]) if prefixes[seq_name][i] else '', len(suffixes[seq_name][i]), '%.3f%%' % (len(suffixes[seq_name][i])*100.0/depth[seq_name][i]) if suffixes[seq_name][i] else '') #summarize_suffixes(suffixes[name][i], references[name][i+1:], references[name], suffix_depth[name][i]) print print 'Details' print for i in xrange(len(references[seq_name])): print '%-80s*' % ('Base %d' % (i+1)) print pad_slice(references[seq_name], i-80,i+1+80) summarize_prefixes(prefixes[seq_name][i], 80) summarize_suffixes(suffixes[seq_name][i], 81) print
def run(self):
    """
    Clip poly(A) tail and adaptor from reads.

    Reads are expected to look like:

    <sequence> <poly-A> <adaptor> <anything>

    For every read in self.filenames the best-scoring poly(A) run followed
    by self.adaptor is located.  One line per read is written to
    <prefix>.clips.gz describing what was found.  Reads whose poly(A) start
    is greater than self.length are written to <prefix>.fastq.gz truncated
    at the poly(A) start; the rest are counted as discarded.  Summary
    counts are recorded with self.log.datum.
    """
    # Quality thresholds as characters, so they can be compared directly
    # with the characters of the quality string (offset 33).
    clip_quality = chr(33+self.clip_quality)
    ignore_quality = chr(33+self.ignore_quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0                # reads seen
        n_discarded = 0      # reads not written out
        n_clipped = 0        # reads written out shorter than they came in
        total_before = 0     # total input bases
        total_clipped = 0    # total kept bases, over reads counted in n_clipped

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                # "Good quality" sequence ends at the first low quality base
                #good_quality_end = 0
                #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                #    good_quality_end += 1

                # The commented-out rule above is replaced by a scored one:
                # +1 for each base at or above clip_quality, -9 for each base
                # below it.  good_quality_end is the prefix length with the
                # highest running score (the earliest one, on ties).
                goodness_score = 0
                best_goodness_score = 0
                good_quality_end = 0
                i = 0
                while True:
                    if goodness_score > best_goodness_score:
                        best_goodness_score = goodness_score
                        good_quality_end = i

                    if i >= len(seq):
                        break

                    if qual[i] >= clip_quality:
                        goodness_score += 1
                    else:
                        goodness_score -= 9
                    i += 1

                # If nothing scores above zero, the poly(A) is an empty run
                # at the end of the good quality region.
                best_score = 0
                best_a_start = good_quality_end
                best_a_end = good_quality_end
                best_adaptor_bases = 0

                best_aonly_score = 0
                best_aonly_start = good_quality_end
                best_aonly_end = good_quality_end

                # Consider each possible start position for the poly(A)
                for a_start in xrange(good_quality_end):
                    # A run starting just after an 'A' is skipped
                    if a_start and seq[a_start-1] == 'A': continue

                    # Consider each possible end position for the poly(A)
                    a_end = a_start
                    aonly_score = 0
                    while True:
                        # Best run of As on its own, without looking at adaptor
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        # The poly(A) should be followed by adaptor,
                        # at least until the end of good quality sequence.
                        # However if there is evidence of the adaptor beyond
                        # the end of good quality, we still want to know that,
                        # and count it towards the number of adaptor bases present.
                        score = aonly_score
                        adaptor_bases = 0
                        i = a_end
                        while True:
                            # Only accept a score once the adaptor comparison
                            # has reached the end of good quality sequence or
                            # covered the whole adaptor.
                            if (score > best_score and
                                (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases

                            if i >= a_end+len(self.adaptor) or i >= len(seq):
                                break

                            # Bases below ignore_quality neither add nor subtract.
                            # Otherwise +1 for an adaptor match, -4 for a mismatch.
                            if qual[i] >= ignore_quality:
                                if seq[i] == self.adaptor[i-a_end]:
                                    score += 1
                                    adaptor_bases += 1
                                else:
                                    score -= 4
                            i += 1

                        #if a_end >= len(seq): break

                        # poly(A) tail only within good quality region.
                        if a_end >= good_quality_end: break

                        # Extend the run by one base: +1 for an A, -4 otherwise,
                        # ignoring bases below ignore_quality.
                        # NOTE(review): nesting of the "<= 0" test was
                        # reconstructed from flattened source -- confirm.
                        if qual[a_end] >= ignore_quality:
                            if seq[a_end] == 'A':
                                aonly_score += 1
                            else:
                                aonly_score -= 4
                                if aonly_score <= 0: break
                        a_end += 1

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    # Show the read with its quality classes (I = below
                    # ignore_quality, C = below clip_quality), the good
                    # quality extent, and the chosen poly(A) + adaptor.
                    print name
                    print ''.join( 'I' if item<ignore_quality else ('C' if item<clip_quality else ' ') for item in qual )
                    print '-' * good_quality_end
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                    #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                # Keep the read only if more than self.length bases precede
                # the poly(A).
                # NOTE(review): the FASTQ writes are reconstructed as applying
                # to every kept read, not just clipped ones -- confirm.
                if a_start > self.length:
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n%10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

    grace.status('')

    self.log.datum(self.sample,'reads',n)
    if n:
        self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
    self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
    self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
    if n_clipped:
        # total_clipped sums the kept (post-clip) lengths of clipped reads
        self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
def run(self):
    """ Compare the evidence files of two working directories.

    For every reference position the insertion evidence and the
    substitution evidence of the two directories are tested with
    significance().  Positions with a p-value at or below self.cutoff are
    written as tab separated lines to <prefix>.txt, after a short header
    giving the cutoff and the depth needed to call a substitution.

    Returns 0.
    """
    title1 = self.title1
    title2 = self.title2

    working1 = working_directory.Working(self.working_dir1)
    working2 = working_directory.Working(self.working_dir2)

    cutoff = self.cutoff

    sequence_names = [ name for name, length in working1.get_reference().get_lengths() ]

    # Titles default to the names of the working directories
    if title1 is None:
        title1 = working1.name
    if title2 is None:
        title2 = working2.name

    # Smallest depth at which n reads of 'A' against n reads of 'T'
    # is no longer above the cutoff
    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    # "with" so the report is closed even if a comparison raises
    with open(self.prefix + '.txt', 'wb') as f:
        print >> f, '%g\tsignificance cutoff' % cutoff
        print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

        print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (
            title1, title2, title1, title2)

        for sequence_name in sequence_names:
            evidence_name = grace.filesystem_friendly_name(sequence_name) + '-evidence.txt'
            filename1 = working1 / evidence_name
            filename2 = working2 / evidence_name

            for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
                # Both directories must describe the same reference
                assert pos1 == pos2 and ref1 == ref2

                if pos1 % 1000 == 0:
                    grace.status('Testing %s %d' % (sequence_name, pos1))

                # Insertions before this position
                dec_ins1 = io.decode_evidence(ins1)
                dec_ins2 = io.decode_evidence(ins2)
                if dec_ins1 and dec_ins2:
                    # Reuse the decoded evidence rather than decoding again
                    sig = significance(dec_ins1, dec_ins2, cutoff)
                    if sig is not None and sig <= cutoff:
                        print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (
                            sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2)
                        f.flush()

                # Substitutions/deletions at this position
                dec_sub1 = io.decode_evidence(sub1)
                dec_sub2 = io.decode_evidence(sub2)
                if dec_sub1 and dec_sub2:
                    sig = significance(dec_sub1, dec_sub2, cutoff)
                    if sig is not None and sig <= cutoff:
                        # Classify using the first entry of each decoded evidence
                        if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                            what = 'deletion'
                        elif dec_sub1[0][0] != dec_sub2[0][0]:
                            what = 'substitution'
                        else:
                            what = 'different mix'
                        print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (
                            sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2)
                        f.flush()

    grace.status('')
    return 0
def main(args):
    """ Align reads to references with SHRiMP 1, in batches run in parallel.

    Command line options --solid, --verbose, --threshold, --stride, --cpus
    and --batch-size are parsed from args.  The leading arguments give the
    output directory followed by reference files; "reads"/"pairs"/
    "shrimp-options" sections give read files and extra SHRiMP options.

    Written into the output directory: reference.fa, config.txt,
    shrimp_hits.txt.gz and unmapped.fa.gz.

    Returns 1 (after printing USAGE) if no output directory was given,
    0 on success.  Several problems are reported by assertion.
    """
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = []
    reads_filenames = []   # list of lists: [file] for single reads, [file1, file2] for pairs

    # The threshold is handed to SHRiMP as given (-h).  For config.txt a
    # percentage is stored as a negative fraction, anything else as an int.
    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        # Arguments before any section: output directory, then references
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        # Each file is its own single-ended read set
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)] for filename in args])

    def pairs_command(args):
        # Exactly two files forming one paired read set
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        # Passed through to the SHRiMP command line unchanged
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # Executable name depends on --solid
    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    # Concatenate all references into a single FASTA file, counting as we go
    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    # Record the settings used; also what iter_reads() below is given
    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    # Files to delete if we do not finish.  Names are removed from the set
    # as each file is completed or cleaned up.
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try:  #Cleanup temporary files
        # Batch counter, in a list so do_shrimp can update it
        N = [0]

        def do_shrimp(read_set):
            # Start SHRiMP on one batch of reads without waiting for it.
            # Returns a function that waits for it and collects the output.
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,
                                    'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            # NOTE(review): the command is a shell string with no quoting, so
            # paths or options containing spaces or shell metacharacters
            # will not survive -- confirm this is acceptable.
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command)
            #print 'SHRiMP %d running' % my_number

            def finalize():
                # Wait for this batch's SHRiMP, then append its hits and
                # unmapped reads to the combined outputs in read order.
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                # NOTE(review): append is reconstructed as applying only to
                # lines starting with '>' -- confirm.
                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number

            return finalize

        # Finalizers of running SHRiMPs, oldest first
        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            # Gather the next batch: reads until batch_size bases are reached
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            # At most max_shrimps at once: finish the oldest before starting
            # another, which also keeps output in batch order.
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        # Success: the outputs are complete, so keep them
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        # Whatever is still listed is incomplete or temporary
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
def run(self):
    """ Call peaks from the spans loaded from self.filenames.

    Spans returned by self._load_bam are pooled per (reference name,
    strand).  For each group a running depth and running totals of the AN
    and AG values of the spans are built, self._find_spans is applied to
    the depth, and each span it returns that is wider than self.lap is
    written to <prefix>.gff as an annotation.  The number of peaks is
    recorded with self.log.datum.
    """
    spans = collections.defaultdict(list)

    #for item in legion.parallel_imap(self._load_bam, self.filenames):
    #    for key,value in item.items():
    for filename in self.filenames:
        for key,value in self._load_bam(filename).items():
            spans[key].extend(value)

    grace.status('Calling peaks')

    n = 0

    # "with" so the GFF file is closed even if peak calling raises
    with open(self.prefix+'.gff', 'wb') as f:
        annotation.write_gff3_header(f)

        for (rname, strand), span_list in spans.items():
            # Nothing to call (and max() of nothing would raise)
            if not span_list:
                continue

            length = 1+max( item[1] for item in span_list )
            depth = [ 0.0 ] * length
            AN_total = [ 0.0 ] * length
            AG_total = [ 0.0 ] * length

            # Record each span as +value at its start and -value at its end ...
            for start, end, AN, AG in span_list:
                depth[start] += 1.0
                depth[end] -= 1.0
                AN_total[start] += AN
                AN_total[end] -= AN
                AG_total[start] += AG
                AG_total[end] -= AG

            # ... so that a running sum gives the value covering each position
            for i in xrange(1,length):
                depth[i] += depth[i-1]
                AN_total[i] += AN_total[i-1]
                AG_total[i] += AG_total[i-1]

            for start, end in self._find_spans(depth):
                if end-self.lap-start <= 0: continue

                n += 1
                peak_id = 'peak%d' % n

                # Position at which depth and totals are reported
                mid = start+self.lap//2

                ann = annotation.Annotation()
                ann.source = 'tailtools'
                ann.type = self.type
                ann.seqid = rname
                ann.start = start
                ann.end = end - self.lap
                assert ann.end == ann.start+1
                ann.strand = strand
                ann.score = None
                ann.phase = None
                ann.attr = {
                    'id' : peak_id,
                    'n' : str(depth[mid]),
                    'mean_tail' : str(AN_total[mid]/depth[mid]),
                    'mean_genomic' : str(AG_total[mid]/depth[mid]),
                    'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                    }
                print >> f, ann.as_gff()
                f.flush()

    self.log.datum('-','called peaks',n)

    grace.status('')
def run(self):
    """ Quality- and adaptor-clip reads, write the survivors, log statistics.

    Input comes from self.reads (single files), self.pairs (pairs of
    files) and self.interleaved (interleaved pair files).  Each read is:

      1. reduced to its longest stretch, between trim_start and trim_end,
         with no base below the quality cutoff (and, if clip_ambiguous, no
         base other than A, C, G, T),
      2. clipped of any adaptor matched at its start and at its end,
      3. rejected if it ends up shorter than self.length, or, if
         self.homopolymers is set, if it consists of a single base.

    Fragments with one surviving read go to the reads output, fragments
    with two go to the interleaved output, and rejected reads optionally
    go to the rejects output.  Counts are recorded with self.log.

    Raises grace.Error for an unknown adaptor set name.
    """
    log = self.log

    log_name = os.path.split(self.prefix)[1]

    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    adaptor_set = self.adaptors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    # One iterator per input, each yielding fragments: tuples of one or two
    # (name, seq, qual) reads.
    iterators = []
    filenames = []
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(
            itertools.izip(io.read_sequences(filename, qualities=True)))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(
            itertools.izip(
                io.read_sequences(pair_filenames[0], qualities=True),
                io.read_sequences(pair_filenames[1], qualities=True)))

    for filename in self.interleaved:
        # append, not extend: extend would add the filename one character
        # at a time, breaking the quality offset guess below
        filenames.append(filename)
        any_paired = True
        iterators.append(
            deinterleave(io.read_sequences(filename, qualities=True)))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = ['read-1', 'read-2'] if any_paired else ['read']

    assert iterators, 'Nothing to clip'

    if qoffset is None:
        guesses = [io.guess_quality_offset(filename) for filename in filenames]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    # Compared directly against characters of the quality string
    quality_cutoff_char = chr(qoffset + quality_cutoff)

    # Collect the requested adaptors, each also as its reverse complement
    adaptor_seqs = []
    adaptor_names = []
    if adaptor_set and adaptor_set.lower() != 'none':
        for item in adaptor_set.split(','):
            item = item.strip().lower() + ' '
            found = False
            for line in ADAPTORS.strip().split('\n'):
                if line.startswith('#'):
                    continue
                if not line.lower().startswith(item):
                    continue
                found = True
                name, seq = line.rsplit(None, 1)
                seq = seq.replace('U', 'T')

                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)
            if not found:
                raise grace.Error('Unknown adaptor set: ' + item)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    # Per read-in-fragment: clipped length -> list of match details
    start_clips = [collections.defaultdict(list) for i in xrange(fragment_reads)]
    end_clips = [collections.defaultdict(list) for i in xrange(fragment_reads)]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0
    total_in_length = [0] * fragment_reads

    n_out = [0] * fragment_reads
    n_q_clipped = [0] * fragment_reads
    n_a_clipped = [0] * fragment_reads
    n_homopolymers = [0] * fragment_reads
    total_out_length = [0] * fragment_reads

    f_paired = None
    f_reject = None
    f_single = io.open_possibly_compressed_writer(
        self.reads_output_filenames()[0])
    try:
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

                    # Longest run of acceptable bases within
                    # [trim_start, len(seq)-trim_end).  The scan starts at
                    # trim_start so that a bad base before it cannot pull
                    # the run start back into the trimmed region.
                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(trim_start, len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    # The run still open at the end of the scan
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    # Adaptor at the start of the read
                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    # Adaptor at the end of the read, found as a match at
                    # the start of the reverse complement
                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) - match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    # Rejects are written unclipped, with the reason appended
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq, qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq), qual[::-1])
                                     for name, seq, qual in graduates]

                    # A pair that lost one read is written as a single read
                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')
    finally:
        # Close whatever was opened, even if clipping raised
        if f_reject is not None:
            f_reject.close()
        if f_paired is not None:
            f_paired.close()
        f_single.close()

    def summarize_clips(name, location, clips):
        # Log the number of adaptor clips, then for each clipped length how
        # often it occurred, its average errors and its commonest adaptors.
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips) + 1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')

            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)

    n_in_total = n_in_single + n_in_paired

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])

        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        label = read_in_fragment_names[i]

        log.datum(log_name, label + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, label + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, label + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, label + ' kept', n_out[i])
        # No average to report (and nothing to divide by) for empty input
        if n_in_total:
            log.datum(log_name, label + ' average input length',
                      float(total_in_length[i]) / n_in_total)
        if n_out[i]:
            log.datum(log_name, label + ' average output length',
                      float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
def run(self):
    """ Compare the evidence files of two working directories.

    For every reference position the insertion evidence and the
    substitution evidence of the two directories are tested with
    significance().  Positions with a p-value at or below self.cutoff are
    written as tab separated lines to <prefix>.txt, after a short header
    giving the cutoff and the depth needed to call a substitution.

    Returns 0.
    """
    title1 = self.title1
    title2 = self.title2

    working1 = working_directory.Working(self.working_dir1)
    working2 = working_directory.Working(self.working_dir2)

    cutoff = self.cutoff

    sequence_names = [ name for name, length in working1.get_reference().get_lengths() ]

    # Titles default to the names of the working directories
    if title1 is None:
        title1 = working1.name
    if title2 is None:
        title2 = working2.name

    # Smallest depth at which n reads of 'A' against n reads of 'T'
    # is no longer above the cutoff
    n = 1
    while significance([('A',n)],[('T',n)],1.0) > cutoff:
        n += 1

    # "with" so the report is closed even if a comparison raises
    with open(self.prefix + '.txt','wb') as f:
        print >> f, '%g\tsignificance cutoff' % cutoff
        print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

        print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (title1, title2, title1, title2)

        for sequence_name in sequence_names:
            evidence_name = grace.filesystem_friendly_name(sequence_name) + '-evidence.txt'
            filename1 = working1/evidence_name
            filename2 = working2/evidence_name

            for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
                # Both directories must describe the same reference
                assert pos1 == pos2 and ref1 == ref2

                if pos1 % 1000 == 0:
                    grace.status('Testing %s %d' % (sequence_name, pos1))

                # Insertions before this position
                dec_ins1 = io.decode_evidence(ins1)
                dec_ins2 = io.decode_evidence(ins2)
                if dec_ins1 and dec_ins2:
                    # Reuse the decoded evidence rather than decoding again
                    sig = significance(dec_ins1, dec_ins2, cutoff)
                    if sig is not None and sig <= cutoff:
                        print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2)
                        f.flush()

                # Substitutions/deletions at this position
                dec_sub1 = io.decode_evidence(sub1)
                dec_sub2 = io.decode_evidence(sub2)
                if dec_sub1 and dec_sub2:
                    sig = significance(dec_sub1, dec_sub2, cutoff)
                    if sig is not None and sig <= cutoff:
                        # Classify using the first entry of each decoded evidence
                        if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                            what = 'deletion'
                        elif dec_sub1[0][0] != dec_sub2[0][0]:
                            what = 'substitution'
                        else:
                            what = 'different mix'
                        print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2)
                        f.flush()

    grace.status('')
    return 0