def reversed(self):
    # Return a copy with the query sequence reverse complemented
    # and the query coordinates flipped accordingly.
    result = copy.copy(self)
    result.query_seq = bio.reverse_complement(result.query_seq)
    result.query_start, result.query_end = (
        len(self.query_seq)-result.query_end,
        len(self.query_seq)-result.query_start)
    result.query_forward = not result.query_forward
    return result
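# Hedged sketch (not in the original source): checks the coordinate
# arithmetic used by reversed() above. With zero-based half-open
# coordinates, [start, end) on the forward strand corresponds to
# [len-end, len-start) on the reverse complement.
def _demo_coordinate_flip():
    seq = 'ACGTACGTAC' # length 10
    start, end = 2, 7
    flipped = bio.reverse_complement(seq)[len(seq)-end:len(seq)-start]
    assert flipped == bio.reverse_complement(seq[start:end])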
def get_interpeak_seq(peaks):
    start = min(item.transcription_stop for item in peaks)
    end = max(item.transcription_stop for item in peaks)
    if end-start > self.max_seq:
        return ''
    if peaks[0].strand >= 0:
        return chromosomes[peaks[0].seqid][start:end]
    else:
        return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])
def expected_depth(name, seq, depths, ambig_depths):
    med = numpy.median(depths)
    sane = numpy.arange(len(depths))[
        (depths > med*0.5) &
        (depths < med*2.0) &
        (depths*2.0 >= ambig_depths)]
    #print 'median', med, 'using', len(sane)

    if len(sane) < 100: # was sum(sane); sane holds positions, so len() gives the count
        warn('Skipping depth correction on ' + name)
        return numpy.array([ numpy.average(depths) ] * len(depths))

    buckets = { }
    radius = 2 # examine 5-mers
    n = radius*2+1
    sseqq = seq[len(seq)-radius:] + seq + seq[:radius]
    for i in sane:
        s = sseqq[i:i+n]
        if s not in buckets:
            buckets[s] = [ ]
        buckets[s].append(depths[i])

    # Pool with reverse complement
    new_buckets = { }
    for kmer in buckets:
        rc = bio.reverse_complement(kmer)
        new_buckets[kmer] = buckets[kmer] + buckets.get(rc, [ ])
    buckets = new_buckets

    for key in buckets:
        buckets[key] = numpy.average(buckets[key])

    prediction = numpy.zeros(len(seq), 'float')
    for i in xrange(len(seq)):
        s = sseqq[i:i+n]
        prediction[i] = buckets.get(s, 0.0)

    # selection of radii from 8 to 4096
    # TODO: make this configurable, or perhaps just larger
    radii = [ int(2**(0.5*i)) for i in xrange(3*2, 12*2+1) ]

    prediction_windower = windower(prediction, radii[-1])

    a = numpy.arange(len(seq)) / float(len(seq))
    predictors = numpy.transpose(
        [
            numpy.ones(len(seq), 'float'),
            numpy.cos(a * (2.0*numpy.pi)),
            numpy.sin(a * (2.0*numpy.pi)),
        ] +
        [ use_windower(prediction_windower, radius) for radius in radii ]
        )

    x = linalg.lstsq(predictors[sane], depths[sane])[0]
    #print x
    prediction = numpy.sum(predictors * x[None,:], 1)

    return prediction
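# windower and use_windower are assumed helpers, not defined in this
# section. A minimal sketch of one plausible implementation, inferred only
# from how expected_depth() calls them: windower precomputes a cumulative
# sum once, and use_windower turns it into a centered moving average for a
# given radius. max_radius is accepted for signature compatibility but is
# unused in this sketch. This is not the original implementation.
def windower(values, max_radius):
    return numpy.concatenate(([0.0], numpy.cumsum(values)))

def use_windower(cumulative, radius):
    n = len(cumulative) - 1
    starts = numpy.maximum(numpy.arange(n) - radius, 0)
    ends = numpy.minimum(numpy.arange(n) + radius + 1, n)
    # Sum over each window divided by its (edge-truncated) width.
    return (cumulative[ends] - cumulative[starts]) / (ends - starts)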
def get_prepeak_seq(gene, peaks):
    if gene.strand >= 0:
        start = gene.utr_pos
        end = min(item.transcription_stop for item in peaks)
        if end-start > self.max_seq:
            return ''
        return chromosomes[gene.seqid][start:end]
    else:
        start = max(item.transcription_stop for item in peaks)
        end = gene.utr_pos
        if end-start > self.max_seq:
            return ''
        return bio.reverse_complement(chromosomes[gene.seqid][start:end])
def get_seq(self, seq_dict):
    seq = seq_dict[self.seqid]
    if self.end <= 0 or self.start >= len(seq):
        extract = 'N' * (self.end-self.start)
    else:
        extract = seq[max(self.start, 0):min(self.end, len(seq))]
        if self.start < 0:
            extract = 'N' * -self.start + extract
        if self.end > len(seq):
            extract = extract + 'N' * (self.end-len(seq))
    if self.strand < 0:
        extract = bio.reverse_complement(extract)
    return extract
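# Hedged usage sketch for get_seq() above. _DemoFeature is hypothetical and
# assumes get_seq is visible at module scope (in the original it is
# presumably a method of a feature/annotation class). Coordinates are
# zero-based half-open; positions outside the sequence are padded with 'N',
# and minus-strand features are reverse complemented.
class _DemoFeature(object):
    get_seq = get_seq # reuse the function above as a method

    def __init__(self, seqid, start, end, strand):
        self.seqid = seqid
        self.start = start
        self.end = end
        self.strand = strand

# _DemoFeature('chr1', -2, 3, 1).get_seq({'chr1': 'ACGT'}) == 'NNACG'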
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = [ ]
    contig_filenames = [ ]
    grace.execute(args, {
        'contigs' : lambda args: contig_filenames.extend(args)
        }, lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([
        (name.split()[0], seq)
        for filename in contig_filenames
        for name, seq in io.read_sequences(filename)
        ])
    dir_contigs = { }
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = { }
    for name in dir_contigs:
        dir_contigs_used[name] = [ False ] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [ -1 ] * (len(ref_seq)*2)
            strings = [ 'N', '' ] * (len(ref_seq))
            contexts = [ None for i in xrange(len(ref_seq)*2) ]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i*2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix', temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch', '9',
                    '--mincluster', '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename])

                for contig_name, contig_seq in io.read_sequences(contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run(['show-aligns', temp_prefix + '.delta', 'ref', contig_name],
                            stderr=subprocess.PIPE)

                    alignments = [ ] # (collected but unused)

                    while True:
                        line = p.stdout.readline()
                        if not line:
                            break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = [ ]
                        al_query = [ ]

                        while True:
                            block = [ ]
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('-- END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)
                            if end:
                                break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq)+1-query_start
                            query_end = len(contig_seq)+1-query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1 #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos*2, dir_contig_name, query_pos, query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower()
                                put(ref_pos*2, dir_contig_name, query_pos, query_pos+1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower()
                            put(ref_pos*2+1, dir_contig_name, query_pos, query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end

                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)
            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None:
                    continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [ (a or b)
                 for a, b in zip(dir_contigs_used[name+'+'],
                                 dir_contigs_used[name+'-'][::-1]) ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j-i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i+1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])
            i = j+1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
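# Hedged usage sketch: pastiche() parses its own argument list in the same
# shape as the command line. The filenames below are hypothetical, and
# 'contigs:' is assumed to open the section that grace.execute routes to
# the 'contigs' handler above, with preceding filenames treated as
# references.
#
#     pastiche(['output_dir', 'reference.fa', 'contigs:', 'assembly.fa'])
#
# Requires the MUMmer tools 'nucmer' and 'show-aligns' on the PATH.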
def expected_depth(name, seq, depths, ambig_depths, radius=2):
    import numpy
    from numpy import linalg

    med = numpy.median(depths)
    sane = numpy.arange(len(depths))[
        (depths > med*0.5) &
        (depths < med*2.0) &
        (depths*2.0 >= ambig_depths)]
    #print 'median', med, 'using', len(sane)

    if len(sane) < 100: # was sum(sane); sane holds positions, so len() gives the count
        warn('Skipping depth correction on ' + name)
        return numpy.array([ numpy.average(depths) ] * len(depths))

    buckets = { }
    #radius = 2 # examine 5-mers
    n = radius*2+1
    sseqq = seq[len(seq)-radius:] + seq + seq[:radius]
    for i in sane:
        s = sseqq[i:i+n]
        if s not in buckets:
            buckets[s] = [ ]
        buckets[s].append(depths[i])

    # Pool with reverse complement
    new_buckets = { }
    for kmer in buckets:
        rc = bio.reverse_complement(kmer)
        new_buckets[kmer] = buckets[kmer] + buckets.get(rc, [ ])
    buckets = new_buckets

    buckets_individual = buckets.copy()

    for key in buckets:
        buckets[key] = numpy.average(buckets[key])

    avg_depth = numpy.average(depths[sane])
    listing = [
        (key, numpy.log(value)-numpy.log(avg_depth))
        for key, value in buckets.items()
        if key <= bio.reverse_complement(key)
        ]
    listing.sort(key=lambda x: abs(x[1]), reverse=True)
    print 'Top k-mer log2 fold change'
    for key, value in listing[:10]:
        print key, '% .2f' % (value / numpy.log(2.0)), '(%d)' % len(buckets_individual[key])
    print

    prediction = numpy.zeros(len(seq), 'float')
    for i in xrange(len(seq)):
        s = sseqq[i:i+n]
        prediction[i] = buckets.get(s, 0.0)

    # selection of radii from 8 to 4096
    # TODO: make this configurable, or perhaps just larger
    radii = [ int(2**(0.5*i)) for i in xrange(3*2, 12*2+1) ]

    prediction_windower = windower(numpy.log(prediction), radii[-1])

    a = numpy.arange(len(seq)) / float(len(seq))
    predictors = numpy.transpose(
        [
            numpy.ones(len(seq), 'float'),
            numpy.cos(a * (2.0*numpy.pi)),
            numpy.sin(a * (2.0*numpy.pi)),
        ] +
        [ use_windower(prediction_windower, radius) for radius in radii ]
        )

    x = linalg.lstsq(predictors[sane], numpy.log(depths[sane]))[0]
    #print x
    prediction = numpy.sum(predictors * x[None,:], 1)

    change = numpy.median(numpy.abs(numpy.log(depths) - prediction))
    print 'Median log2 fold error:', change / numpy.log(2.0)
    print
    print

    return numpy.exp(prediction)
def run(self):
    seqs = env.load_ref(self.reference).seqs

    result = [ ]
    errors = [ ]
    with open(self.csv_file, "rU") as f:
        reader = csv.reader(f)
        headings = reader.next()
        headings = [ item.lower() for item in headings ]
        assert "id" in headings
        assert "primer" in headings
        id_col = headings.index("id")
        primer_col = headings.index("primer")
        for row in reader:
            if len(row) == 0 or (not row[id_col].strip() and not row[primer_col].strip()):
                continue
            id = row[id_col].strip()
            assert " " not in id, "ID contains space: " + id
            primer = row[primer_col].strip().upper()
            assert len(primer) > self.skip, "Primer too short: " + id
            # was: assert [ char in "ACGT" for char in primer ]
            # (a non-empty list is always true, so the check never fired)
            assert all(char in "ACGT" for char in primer), "Primer not ACGT: " + id
            primer = primer[self.skip:]
            rprimer = bio.reverse_complement(primer)

            hits = [ ]
            for seq_name in seqs:
                for match in re.finditer(primer, seqs[seq_name], re.IGNORECASE):
                    hits.append((seq_name, 1, match.start(), match.start()+self.length))
                for match in re.finditer(rprimer, seqs[seq_name], re.IGNORECASE):
                    hits.append((seq_name, -1, match.end()-self.length, match.end()))

            if len(hits) > 100:
                raise config.Error("Many many hits for " + id + ".")
            if not hits:
                errors.append("No hits for " + id + ".")
                continue
            if len(hits) > 1:
                self.log.log("Warning: %d hits for %s.\n" % (len(hits), id))

            for i, hit in enumerate(hits):
                hit_name = id
                if len(hits) > 1:
                    hit_name += "-%dof%d" % (i+1, len(hits))
                result.append(annotation.Annotation(
                    seqid = hit[0],
                    source = "tail-tools",
                    type = "region",
                    start = hit[2],
                    end = hit[3],
                    strand = hit[1],
                    attr = dict(ID=hit_name, Primer=primer)
                    ))

    if errors:
        raise config.Error("\n".join(errors))

    annotation.write_gff3(self.prefix + ".gff", result)
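# Hedged sketch of the expected CSV layout, inferred from the heading
# checks above (column order is free; heading matching is lowercased).
# Primer names and sequences here are hypothetical.
#
#     id,primer
#     gene1_fwd,ACGTACGTACGTACGT
#     gene1_rev,TTGCATTGCAGGTACC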
def run(self):
    workspace = self.get_workspace()

    read_length = 100

    left = rand_seq(read_length-1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[:1]:
            break
    left += flank

    right = rand_seq(read_length-1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[-1:]:
            break
    right = flank + right

    i = 0
    variants_used = [ ]

    with open(workspace/'reads.fq', 'wb') as f:
        # was "for i, variant in enumerate(self.variants):", which clobbered
        # the read counter i and could produce duplicate read names
        for variant in self.variants:
            if 'x' in variant:
                variant, count = variant.split('x')
                count = int(count)
            else:
                count = 10
            variants_used.append((variant, count))
            seq = left + variant + right
            for j in xrange(count):
                pos = len(variant) + random.randrange(read_length - len(variant))
                read = seq[pos:pos+read_length]
                if random.randrange(2):
                    read = bio.reverse_complement(read)
                i += 1
                io.write_fastq(f, 'read_%s_%d' % (variant, i), read, chr(64+30)*len(read))

    reference = left + self.ref + right
    primary_variant = left + variants_used[0][0] + right

    with open(workspace/'reference.fa', 'wb') as f:
        io.write_fasta(f, 'chr1', reference)

    legion.remake_needed()

    self.analysis(
        workspace/'sample',
        workspace/'reference.fa',
        reads = [ workspace/'reads.fq' ],
        ).run()

    self.freebayes(
        workspace/'freebayes',
        workspace/'sample',
        ).run()

    self.vcf_filter(
        workspace/'filtered',
        workspace/'freebayes.vcf',
        ).run()

    Vcf_patch(
        workspace/'patch',
        workspace/('sample', 'reference'),
        workspace/'filtered.vcf',
        ).run()

    patched = io.read_sequences(workspace/('patch', 'sample.fa')).next()[1]

    masked = io.read_sequences(workspace/('sample', 'consensus_masked.fa')).next()[1].upper()

    with open(workspace/'freebayes.vcf', 'rU') as f:
        reader = vcf.Reader(f)
        raw_count = len(list(reader))

    with open(workspace/'filtered.vcf', 'rU') as f:
        reader = vcf.Reader(f)
        filtered_count = len(list(reader)) # was re-opening the file and discarding this reader

    with open(workspace/('sample', 'report.txt'), 'rb') as f:
        nesoni_count = len(f.readlines()) - 1

    self.log.log('\n')
    self.log.datum(workspace.name, 'changes found by "nesoni consensus:"', nesoni_count)
    self.log.datum(workspace.name, 'is correctly patched by "nesoni consensus:"', masked == primary_variant)
    self.log.log('\n')
    self.log.datum(workspace.name, 'raw variants', raw_count)
    self.log.datum(workspace.name, 'variants after filtering', filtered_count)
    self.log.datum(workspace.name, 'is correctly patched by VCF pipeline', patched == primary_variant)
    self.log.log('\n')
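# Hedged note on self.variants, inferred from the parsing above: each item
# is a variant sequence, optionally suffixed with 'x' and a read count.
# For example (hypothetical values):
#
#     ['A', 'Tx5', 'x10']
#
# gives 10 reads of variant 'A' (the default count), 5 reads of 'T', and
# 10 reads of the empty variant, i.e. a deletion.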
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)

    if len(args) < 2:
        print USAGE
        return 1

    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]

    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

        scaffold = [ ]
        for item in args:
            scaffold.append(('contig', int(item)))
            scaffold.append(('gap', None))

        if not circular:
            scaffold = scaffold[:-1]

        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append((name, scaffold))

    grace.execute(args, [scaffold])

    custom_scaffolds = (len(scaffolds) != 0)

    sequences = dict(
        (a.split()[0], b.upper())
        for a, b in io.read_sequences(os.path.join(graph_dir, '454AllContigs.fna')))

    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1, len(sequence_names)+1)))

    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1, len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'

    links = collections.defaultdict(list)

    for line in open(os.path.join(graph_dir, '454ContigGraph.txt'), 'rU'):
        parts = line.rstrip('\n').split('\t')

        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])

        if parts[0] == 'C':
            name1 = 'contig%05d' % int(parts[1])
            dir1 = { "3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = { "5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            links[ sequence_ids[name1] * dir1 ].append((depth, sequence_ids[name2] * dir2))
            links[ sequence_ids[name2] * -dir2 ].append((depth, sequence_ids[name1] * -dir1))

        if parts[0] == 'S' and not custom_scaffolds:
            name = 'scaffold%05d' % int(parts[2])
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a, b = component.split(':')
                if a == 'gap':
                    scaffold.append(('gap', int(b)))
                else:
                    strand = { '+': +1, '-': -1 }[b]
                    scaffold.append(('contig', sequence_ids['contig%05d' % int(a)] * strand))
            scaffolds.append((name, scaffold))

    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #
    #    paths[(source,dest)] = path
    #
    #    if len(contexts[dest]) > max_filler_length: continue
    #
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )

    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next

    # Use links, in order of depth of coverage, to construct paths between contigs.
    # Thus: paths have maximum minimum depth,
    # and subsections of paths also have this property.

    todo = [ ]
    for i in contexts:
        for depth_link, right in links[i]:
            todo.append((depth_link, i, right))
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]:
            continue

        sources = [ (left, right) ]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()

        destinations = [ right ]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()

        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]:
                    continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next

    workspace = io.Workspace(output_dir)

    scaffold_f = workspace.open('scaffolds.fa', 'wb')

    #comments = [ ]
    features = [ ]

    used = set()

    previous_total = 0

    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[(j+1) % len(scaffold)] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'

                gap_start = len(result)

                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        k = path_source_dest[k][right[1]]

                    n_filled += 1

                    if item[1] is not None and max(n, item[1]) > min(n, item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1], n, i+1)
                else:
                    n_failed += 1

                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])

                gap_end = len(result)

                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append('%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                    ))

        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)

    scaffold_f.close()

    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >> gff_f, '##gff-version 3'
    #for comment in comments:
    #    print >> gff_f, comment
    for feature in features:
        print >> gff_f, feature
    gff_f.close()

    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()

    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap':
            continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end ' % name ] = -scaffold[0][1]

    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
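# Hedged toy example of the path construction above (the graph and the
# _demo function are hypothetical; the contig-length gate on sources and
# destinations is omitted for brevity; uses the collections import from
# the surrounding code). Processing links in decreasing depth order and
# never overwriting an existing source->dest entry yields, for each pair,
# a path whose minimum link depth is maximal.
def _demo_max_min_depth_paths():
    links = { 1: [ (10, 2), (3, 3) ], 2: [ (5, 3) ] }
    path_source_dest = collections.defaultdict(dict)
    path_dest_source = collections.defaultdict(dict)
    todo = [ (depth, left, right)
             for left in links
             for depth, right in links[left] ]
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]:
            continue
        sources = [ (left, right) ] + path_dest_source[left].items()
        destinations = [ right ] + path_source_dest[right].keys()
        for source, next in sources:
            for dest in destinations:
                if dest not in path_source_dest[source]:
                    path_source_dest[source][dest] = next
                    path_dest_source[dest][source] = next
    # 1 reaches 3 via 2 (minimum depth 5) rather than directly (depth 3).
    assert path_source_dest[1][3] == 2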
def run(self):
    log = self.log

    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)

    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    iterators = [ ]
    filenames = [ ]
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)
            ))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)
            ))

    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)
            ))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]

    assert iterators, 'Nothing to clip'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    quality_cutoff_char = chr(qoffset + quality_cutoff)

    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)

    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if self.adaptor_clip:
        if self.adaptor_file:
            adaptor_iter = io.read_sequences(self.adaptor_file)
        else:
            adaptor_iter = ADAPTORS
        for name, seq in adaptor_iter:
            seq = seq.upper().replace('U', 'T')
            adaptor_seqs.append(seq)
            adaptor_names.append(name)
            adaptor_seqs.append(bio.reverse_complement(seq))
            adaptor_names.append(name)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
        f_paired = map(io.open_possibly_compressed_writer, names)
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0

    total_in_length = [ 0 ] * fragment_reads

    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads

    #log.attach(open(prefix + '_log.txt', 'wb'))

    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))

            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1

            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)

                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]

                # Find the longest run of good-quality, unambiguous bases.
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start

                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]

                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append((name, seq, qual, 'quality'))
                    continue

                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[:len(clipped_seq)-match[0]]
                    clipped_qual = clipped_qual[:len(clipped_qual)-match[0]]
                    end_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append((name, seq, qual, 'homopolymer'))
                    continue

                graduates.append((name, clipped_seq, clipped_qual))
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)

            if output_rejects:
                for name, seq, qual, reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)

            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                        ]

                if len(graduates) == 1:
                    n_single += 1
                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1
                    # Write the pair to an interleaved file or separate l/r files
                    for (lr, (name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr % len(f_paired)], name, seq, qual)

    grace.status('')

    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        map(lambda f: f.close(), f_paired)
    f_single.close()

    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips)+1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        prefix = read_in_fragment_names[i]

        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single+n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
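# Standalone sketch of the quality-clip scan used in run() above: it keeps
# the longest run of positions whose quality character is at or above the
# cutoff and (optionally) whose base is unambiguous. _longest_good_run is
# a hypothetical helper added for clarity; it mirrors the inline loop.
def _longest_good_run(seq, qual, cutoff_char, trim_start=0, trim_end=0, clip_ambiguous=True):
    start = trim_start
    best_start = 0
    best_len = 0
    for j in xrange(len(seq) - trim_end):
        if qual[j] < cutoff_char or (clip_ambiguous and seq[j] not in 'ACGT'):
            if best_len < j - start:
                best_start = start
                best_len = j - start
            start = j + 1
    j = len(seq) - trim_end
    if best_len < j - start:
        best_start = start
        best_len = j - start
    return best_start, best_len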
def run(self):
    adaptor = self.adaptor.upper()
    name = self.name or os.path.basename(self.prefix)

    headers = sam.bam_headers(self.input)
    writer = sam.Bam_writer(self.prefix + "_temp.bam", headers)

    n_kept = 0
    n_unaligned = 0
    n_discarded = 0
    n_multi = 0

    for i, al in enumerate(sam.Bam_reader(self.input)):
        if al.flag & sam.FLAG_UNMAPPED:
            writer.write(al)
            n_unaligned += 1
            continue

        reverse = al.flag & sam.FLAG_REVERSE
        if reverse:
            read_bases = bio.reverse_complement(al.seq)
            cigar = cigar_parts(al.cigar)[::-1]
        else:
            read_bases = al.seq.upper()
            cigar = cigar_parts(al.cigar)

        # Length of the soft-clipped tail of the read. (This was also
        # named n_unaligned, clobbering the unmapped-read counter above.)
        n_soft_clipped = 0
        if cigar and cigar[-1][1] == "S":
            n_soft_clipped = cigar[-1][0]
        n_aligned = len(read_bases) - n_soft_clipped

        seq_unaligned = read_bases[n_aligned:]
        seq_aligned = read_bases[:n_aligned]

        AN, AD = a_adaptor_count(seq_unaligned, adaptor)
        AG = a_count(seq_aligned[::-1])

        if AN: al.extra.append("AN:i:%d" % AN)
        if AD: al.extra.append("AD:i:%d" % AD) # was "% AN"
        if AG: al.extra.append("AG:i:%d" % AG) # was "% AN"
        if AN >= 4:
            al.extra.append("AA:i:1")

        if n_aligned - AG < self.min_genomic:
            al.flag = al.flag | sam.FLAG_UNMAPPED
            n_discarded += 1
        else:
            n_kept += 1
            NH = 1
            for item in al.extra:
                if item.startswith("NH:i:"):
                    NH = int(item[5:])
            if NH > 1:
                n_multi += 1

        if i % 10000 == 0:
            print al.rname, al.pos
            print cigar
            print " "*(len(seq_aligned)-AG) + "="*AG
            print seq_aligned #[::-1]
            print seq_unaligned
            print "="*AN + "D"*AD

        writer.write(al)

    self.log.datum(name, "reads", n_unaligned + n_kept + n_discarded)
    self.log.datum(name, "did not align", n_unaligned)
    self.log.datum(name, "short non-A alignments discarded", n_discarded)
    self.log.datum(name, "alignments kept", n_kept)
    self.log.datum(name, "multimappers", n_multi)

    writer.close()
    sam.sort_and_index_bam(self.prefix + "_temp.bam", self.prefix)
    os.unlink(self.prefix + "_temp.bam")
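# a_count and a_adaptor_count are assumed helpers, not defined in this
# section. A plausible sketch inferred from the call sites above, not the
# original implementation: a_count counts leading 'A's (applied to
# seq_aligned[::-1] it measures the poly(A) run at the 3' end of the
# alignment), and a_adaptor_count returns (number of leading 'A's, number
# of adaptor bases matched immediately after them). Exact matching only;
# the original may tolerate mismatches.
def a_count(seq):
    n = 0
    while n < len(seq) and seq[n] == 'A':
        n += 1
    return n

def a_adaptor_count(seq, adaptor):
    a_n = a_count(seq)
    rest = seq[a_n:]
    d = 0
    while d < len(rest) and d < len(adaptor) and rest[d] == adaptor[d]:
        d += 1
    return a_n, d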
def run(self):
    log = self.log

    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)

    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    adaptor_set = self.adaptors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    iterators = [ ]
    filenames = [ ]
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)
            ))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)
            ))

    for filename in self.interleaved:
        filenames.append(filename) # was filenames.extend(filename), which appended single characters
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)
            ))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]

    assert iterators, 'Nothing to clip'

    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    quality_cutoff_char = chr(qoffset + quality_cutoff)

    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)

    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if adaptor_set and adaptor_set.lower() != 'none':
        for item in adaptor_set.split(','):
            item = item.strip().lower() + ' '
            any = False
            for line in ADAPTORS.strip().split('\n'):
                if line.startswith('#'):
                    continue
                if not line.lower().startswith(item):
                    continue
                any = True
                name, seq = line.rsplit(None, 1)
                seq = seq.replace('U', 'T')

                #if seq in adaptor_seqs: print 'Dup', name
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)
            if not any:
                raise grace.Error('Unknown adaptor set: ' + item)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        f_paired = io.open_possibly_compressed_writer(self.interleaved_output_filenames()[0])
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0

    total_in_length = [ 0 ] * fragment_reads

    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads

    #log.attach(open(prefix + '_log.txt', 'wb'))

    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))

            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1

            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                name = name.split()[0]
                seq = seq.upper()
                total_in_length[i] += len(seq)

                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start

                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]

                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append((name, seq, qual, 'quality'))
                    continue

                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[:len(clipped_seq)-match[0]]
                    clipped_qual = clipped_qual[:len(clipped_qual)-match[0]]
                    end_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append((name, seq, qual, 'homopolymer'))
                    continue

                graduates.append((name, clipped_seq, clipped_qual))
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)

            if output_rejects:
                for name, seq, qual, reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)

            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                        ]

                if len(graduates) == 1:
                    this_f = f_single
                    n_single += 1
                else:
                    assert len(graduates) == 2
                    this_f = f_paired
                    n_paired += 1

                for name, seq, qual in graduates:
                    write_sequence(this_f, name, seq, qual)

    grace.status('')

    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        f_paired.close()
    f_single.close()

    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips)+1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        prefix = read_in_fragment_names[i]

        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single+n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)