def filter_low_qual_seqs(gz_filename1, gz_filename2, phred_offset, phred_cutoff):
    """
    Takes a pair of BowTie-style files (ex: .aligned.composite.gz) and
    retains only the read pairs in which every base of BOTH reads has
    phred >= <phred_cutoff>.

    The two inputs are read in lockstep through BowTieReader (iteration
    stops at the end of the shorter one). A pair is written only when both
    reads pass, so record i of one output pairs with record i of the other.

    gz_filename1, gz_filename2 -- paths of the two input files
    phred_offset -- subtracted from ord(<quality char>) to get the phred
                    score; must be >= 0
    phred_cutoff -- minimum phred score every base must reach; must be >= 0

    Outputs:
      <gz_filename1>.phred<cutoff>_passed
      <gz_filename2>.phred<cutoff>_passed
      <gz_filename1>.phred<cutoff>_passed.log  (elapsed time and counts)
    """
    # NOTE(review): a second definition of filter_low_qual_seqs appears
    # later in this file; being later, it is the one the module-level name
    # ends up bound to.
    assert phred_offset >= 0
    assert phred_cutoff >= 0

    def passes(qual):
        # True when every quality character decodes to phred >= cutoff
        return all(ord(x) - phred_offset >= phred_cutoff for x in qual)

    bad = 0
    good = 0
    start_t = time.time()
    # single pre-formatted argument: prints the same text whether print is
    # the Python 2 statement or a function
    print("{0} {1}".format(gz_filename1, gz_filename2))
    out_suffix = ".phred{0}_passed".format(phred_cutoff)
    f1 = BowTieWriter(gz_filename1 + out_suffix, 'w')
    f2 = BowTieWriter(gz_filename2 + out_suffix, 'w')
    try:
        for r1, r2 in itertools.izip(BowTieReader(gz_filename1, False),
                                     BowTieReader(gz_filename2, False)):
            if passes(r1['qual']) and passes(r2['qual']):
                good += 1
                f1.write(r1)
                f2.write(r2)
            else:
                bad += 1
    finally:
        # the writers used to be left open; close them so the output is
        # flushed even when reading fails part-way through
        f1.close()
        f2.close()

    with open(gz_filename1 + out_suffix + ".log", 'w') as f:
        f.write("Running filter_low_qual_seq took {0} secs\n".format(time.time() - start_t))
        f.write("Input: " + gz_filename1 + ',' + gz_filename2 + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')
def writeout(clusters, output_filename):
    """
    Write every cluster in <clusters> to <output_filename> through
    BowTieWriter, one record per cluster.

    clusters -- dict of cluster id --> dict with the keys 'seq', 'qual',
                'abun' and 'cycle'

    Record fields:
      ID      cluster id
      seq     c['seq']
      qual    one character per position: chr(round(qual/abun + 33)),
              with qual/abun computed elementwise
              (presumably summed quality over abundance, i.e. a mean
              phred+33 score -- confirm against whatever fills <clusters>)
      strand  always '+'
      ref     str(c['cycle'])
      offset  max(c['abun'])
    """
    writer = BowTieWriter(output_filename)
    for cluster_id, cluster in clusters.iteritems():
        per_base = np.array(cluster['qual']) / np.array(cluster['abun'])
        qual_string = "".join([chr(int(round(q + 33))) for q in per_base])
        record = {
            'ID': cluster_id,
            'seq': cluster['seq'],
            'qual': qual_string,
            'strand': '+',
            'ref': str(cluster['cycle']),
            'offset': max(cluster['abun']),
        }
        writer.write(record)
    writer.close()
def filter_low_qual_seqs(gz_filename1, gz_filename2, phred_offset, phred_cutoff):
    """
    Takes a pair of BowTie-style files (ex: .aligned.composite.gz) and
    retains only the read pairs in which every base of BOTH reads has
    phred >= <phred_cutoff>.

    The two inputs are read in lockstep through BowTieReader (iteration
    stops at the end of the shorter one). A pair is written only when both
    reads pass, so record i of one output pairs with record i of the other.

    gz_filename1, gz_filename2 -- paths of the two input files
    phred_offset -- subtracted from ord(<quality char>) to get the phred
                    score; must be >= 0
    phred_cutoff -- minimum phred score every base must reach; must be >= 0

    Outputs:
      <gz_filename1>.phred<cutoff>_passed
      <gz_filename2>.phred<cutoff>_passed
      <gz_filename1>.phred<cutoff>_passed.log  (elapsed time and counts)
    """
    # NOTE(review): an earlier definition of filter_low_qual_seqs exists
    # above in this file; this later one replaces it when the module loads.
    assert phred_offset >= 0
    assert phred_cutoff >= 0

    def passes(qual):
        # True when every quality character decodes to phred >= cutoff
        return all(ord(x) - phred_offset >= phred_cutoff for x in qual)

    bad = 0
    good = 0
    start_t = time.time()
    # single pre-formatted argument: prints the same text whether print is
    # the Python 2 statement or a function
    print("{0} {1}".format(gz_filename1, gz_filename2))
    out_suffix = ".phred{0}_passed".format(phred_cutoff)
    f1 = BowTieWriter(gz_filename1 + out_suffix, 'w')
    f2 = BowTieWriter(gz_filename2 + out_suffix, 'w')
    try:
        for r1, r2 in itertools.izip(BowTieReader(gz_filename1, False),
                                     BowTieReader(gz_filename2, False)):
            if passes(r1['qual']) and passes(r2['qual']):
                good += 1
                f1.write(r1)
                f2.write(r2)
            else:
                bad += 1
    finally:
        # the writers used to be left open; close them so the output is
        # flushed even when reading fails part-way through
        f1.close()
        f2.close()

    with open(gz_filename1 + out_suffix + ".log", 'w') as f:
        f.write("Running filter_low_qual_seq took {0} secs\n".format(time.time() - start_t))
        f.write("Input: " + gz_filename1 + ',' + gz_filename2 + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')
def remove_high_expected_error_PE(file1, file2, max_expected_error, phred_offset=33):
    """
    Remove all read pairs where the expected error (sum of err probs from
    phred scores) of either read exceeds <max_expected_error>.

    The two inputs are read in lockstep through BowTieReader. A pair is
    kept only when BOTH reads are within the limit; otherwise both reads
    go to the "bad" outputs, so the pairing is preserved in every output.

    file1, file2       -- paths of the two input files; both must exist
    max_expected_error -- largest per-read expected error still accepted
    phred_offset       -- subtracted from ord(<quality char>) to get the
                          phred score; defaults to 33, the value that used
                          to be hard-coded

    Outputs (the four read files are gzipped at the end):
      <file1>.experror_good, <file2>.experror_good
      <file1>.experror_bad,  <file2>.experror_bad
      <file1>.experror.log   (elapsed time, counts and fractions)
    """
    # NOTE(review): a second definition of remove_high_expected_error_PE
    # appears later in this file; being later, it is the one the
    # module-level name ends up bound to.
    assert os.path.exists(file1) and os.path.exists(file2)

    def expected_error(qual):
        # a phred score Q stands for an error probability of 10**(-Q/10)
        return sum(10 ** -((ord(x) - phred_offset) / 10.) for x in qual)

    # clear the outputs of a previous run (presumably so the gzip calls
    # at the end do not run into an already existing .gz)
    os.system("rm {0}.experror_*".format(file1))
    os.system("rm {0}.experror_*".format(file2))

    hgood1 = BowTieWriter(file1 + '.experror_good')
    hgood2 = BowTieWriter(file2 + '.experror_good')
    hbad1 = BowTieWriter(file1 + '.experror_bad')
    hbad2 = BowTieWriter(file2 + '.experror_bad')
    hlog = open(file1 + '.experror.log', 'w')

    start_t = time.time()
    good, bad = 0, 0
    try:
        for r1, r2 in itertools.izip(BowTieReader(file1, False), BowTieReader(file2, False)):
            if expected_error(r1['qual']) <= max_expected_error and \
               expected_error(r2['qual']) <= max_expected_error:
                hgood1.write(r1)
                hgood2.write(r2)
                good += 1
            else:
                hbad1.write(r1)
                hbad2.write(r2)
                bad += 1

        total = good + bad
        # empty input: there is nothing to divide by, report 0.00 instead
        # of dying with ZeroDivisionError
        frac_bad = bad * 1. / total if total else 0.
        frac_good = good * 1. / total if total else 0.
        hlog.write("Expected error filtering took {0} sec.\n".format(time.time() - start_t))
        hlog.write("Max allowed expected error: {0}\n".format(max_expected_error))
        hlog.write("# of original reads: {0}\n".format(total))
        hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, frac_bad))
        hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good, frac_good))
    finally:
        # close everything even when reading/writing fails part-way
        for handle in (hgood1, hgood2, hbad1, hbad2, hlog):
            handle.close()

    os.system("gzip " + hgood1.f.name)
    os.system("gzip " + hgood2.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)
def remove_high_expected_error_PE(file1, file2, max_expected_error, phred_offset=33):
    """
    Remove all read pairs where the expected error (sum of err probs from
    phred scores) of either read exceeds <max_expected_error>.

    The two inputs are read in lockstep through BowTieReader. A pair is
    kept only when BOTH reads are within the limit; otherwise both reads
    go to the "bad" outputs, so the pairing is preserved in every output.

    file1, file2       -- paths of the two input files; both must exist
    max_expected_error -- largest per-read expected error still accepted
    phred_offset       -- subtracted from ord(<quality char>) to get the
                          phred score; defaults to 33, the value that used
                          to be hard-coded

    Outputs (the four read files are gzipped at the end):
      <file1>.experror_good, <file2>.experror_good
      <file1>.experror_bad,  <file2>.experror_bad
      <file1>.experror.log   (elapsed time, counts and fractions)
    """
    # NOTE(review): an earlier definition of remove_high_expected_error_PE
    # exists above in this file; this later one replaces it when the
    # module loads.
    assert os.path.exists(file1) and os.path.exists(file2)

    def expected_error(qual):
        # a phred score Q stands for an error probability of 10**(-Q/10)
        return sum(10 ** -((ord(x) - phred_offset) / 10.) for x in qual)

    # clear the outputs of a previous run (presumably so the gzip calls
    # at the end do not run into an already existing .gz)
    os.system("rm {0}.experror_*".format(file1))
    os.system("rm {0}.experror_*".format(file2))

    hgood1 = BowTieWriter(file1 + '.experror_good')
    hgood2 = BowTieWriter(file2 + '.experror_good')
    hbad1 = BowTieWriter(file1 + '.experror_bad')
    hbad2 = BowTieWriter(file2 + '.experror_bad')
    hlog = open(file1 + '.experror.log', 'w')

    start_t = time.time()
    good, bad = 0, 0
    try:
        for r1, r2 in itertools.izip(BowTieReader(file1, False), BowTieReader(file2, False)):
            if expected_error(r1['qual']) <= max_expected_error and \
               expected_error(r2['qual']) <= max_expected_error:
                hgood1.write(r1)
                hgood2.write(r2)
                good += 1
            else:
                hbad1.write(r1)
                hbad2.write(r2)
                bad += 1

        total = good + bad
        # empty input: there is nothing to divide by, report 0.00 instead
        # of dying with ZeroDivisionError
        frac_bad = bad * 1. / total if total else 0.
        frac_good = good * 1. / total if total else 0.
        hlog.write("Expected error filtering took {0} sec.\n".format(time.time() - start_t))
        hlog.write("Max allowed expected error: {0}\n".format(max_expected_error))
        hlog.write("# of original reads: {0}\n".format(total))
        hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, frac_bad))
        hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good, frac_good))
    finally:
        # close everything even when reading/writing fails part-way
        for handle in (hgood1, hgood2, hbad1, hbad2, hlog):
            handle.close()

    os.system("gzip " + hgood1.f.name)
    os.system("gzip " + hgood2.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)
def detect_primers_PE(input1, input2, output_prefix, f_primer, r_primer,
                      min_match_len, max_mm, max_de, max_in):
    """
    NOTE: this is for paired end reads that comes in two separate files
    ex: DS19342_CTTGTA_L006_R1_001.fastq.gz and DS19342_CTTGTA_L006_R2_001.fastq.gz

    Given a pair of reads from input1, input2:
    1. Detect that F primer exists in one read and R primer in the other
       (first with match_primer_len; if that does not resolve the pair,
       again with PrimerMatch, which is also given the indel limits)
    2. If both reads pass primer detection, strip the primers and output
    3. Otherwise, write the untouched pair to the bad files

    input1, input2 -- the two FASTQ files, read in lockstep
    output_prefix  -- prefix of every output file
    f_primer, r_primer -- forward / reverse primer
    min_match_len  -- minimum primer match length
    max_mm         -- max mismatches
    max_de, max_in -- max deletions / insertions (PrimerMatch only)

    Output (all gzipped at the end, except the log):
    <output_prefix>.{F|R}primer_good   reads with the primer removed
    <output_prefix>.primer_bad.{1|2}   unresolved pairs, as FASTQ
    <output_prefix>.primer.verbose     ID, match length and misses of the
                                       reads resolved through PrimerMatch
    <output_prefix>.primer.log
    """
    def process_primer(r, match_len, is_reverse):
        # get record into miscBowTie.BowTieReader format
        # strip away primers from seq & qual, properly rev comp!
        r['offset'] = match_len
        r['seq'] = r['seq'][match_len:]
        r['qual'] = r['qual'][match_len:]
        r['ref'] = 'NA'
        if is_reverse:
            r['seq'] = Seq(r['seq']).reverse_complement().tostring()
            r['qual'] = r['qual'][::-1]

    def fuzzy_match(pm, seq):
        # run the PrimerMatch object on seq; True when it found the primer
        pm.make_suffix(seq)
        pm.match(min_match_len, max_mm, max_in, max_de)
        return pm.match_result is not None

    def write_good(f_read, f_len, r_read, r_len):
        # strip the primers and emit the pair, F-primer read first
        process_primer(f_read, f_len, False)
        Fgood.write(f_read)
        process_primer(r_read, r_len, False)
        Rgood.write(r_read)

    def write_verbose(r, pm):
        # record how a PrimerMatch-resolved read was matched
        hverbose.write("{0}\t{1}\t{2}\n".format(
            r['ID'], pm.match_result.match_len, pm.match_result.miss))

    os.system("rm {0}.*primer_*".format(output_prefix))

    Fgood = BowTieWriter(output_prefix + '.Fprimer_good')
    Rgood = BowTieWriter(output_prefix + '.Rprimer_good')
    hbad1 = FastqWriter(output_prefix + '.primer_bad.1')
    hbad2 = FastqWriter(output_prefix + '.primer_bad.2')
    hverbose = open(output_prefix + '.primer.verbose', 'w')
    hlog = open(output_prefix + '.primer.log', 'w')

    start_t = time.time()
    good, bad = 0, 0

    pmF = PrimerMatch(f_primer)
    pmR = PrimerMatch(r_primer)

    try:
        for r1, r2 in itertools.izip(FastqReader(input1), FastqReader(input2)):
            # NOTE: in the case of PE reads
            #       regardless of whether we're matching for F or R primer
            #       they would all appear at the 5' end of the read
            #       which is why we call match_primer_len with is_reverse = False
            match_f_len1, _ = match_primer_len(r1['seq'], f_primer, max_mm, min_match_len, False)
            match_r_len1, _ = match_primer_len(r1['seq'], r_primer, max_mm, min_match_len, False)
            match_f_len2, _ = match_primer_len(r2['seq'], f_primer, max_mm, min_match_len, False)
            match_r_len2, _ = match_primer_len(r2['seq'], r_primer, max_mm, min_match_len, False)

            if match_f_len1 > 0 and match_r_len2 > 0:
                # case 1, read 1 is F, read 2 is R
                write_good(r1, match_f_len1, r2, match_r_len2)
                good += 1
            elif match_f_len2 > 0 and match_r_len1 > 0:
                # case 2, read 1 is R, read 2 is F
                write_good(r2, match_f_len2, r1, match_r_len1)
                good += 1
            else:
                # fall back to PrimerMatch; which orientation gets tried
                # on read 2 depends on what read 1 matched
                f_read = r_read = None
                if fuzzy_match(pmF, r1['seq']):
                    if fuzzy_match(pmR, r2['seq']):
                        # case 1, read 1 is F, read 2 is R
                        f_read, r_read = r1, r2
                elif fuzzy_match(pmR, r1['seq']) and fuzzy_match(pmF, r2['seq']):
                    # case 2, read 1 is R, read 2 is F
                    f_read, r_read = r2, r1

                if f_read is None:
                    # case 3: unresolved, bad read pair
                    # EVERY unresolved pair ends up here; previously some
                    # failure paths of the second orientation wrote the
                    # pair nowhere and left it out of the counts
                    hbad1.write(r1)
                    hbad2.write(r2)
                    bad += 1
                else:
                    write_good(f_read, pmF.match_result.match_len,
                               r_read, pmR.match_result.match_len)
                    write_verbose(f_read, pmF)
                    write_verbose(r_read, pmR)
                    good += 1

        total = good + bad
        # empty input: there is nothing to divide by, report 0.00 instead
        # of dying with ZeroDivisionError
        frac_bad = bad * 1. / total if total else 0.
        frac_good = good * 1. / total if total else 0.
        hlog.write("Input 1: {0}\nInput 2: {1}\n".format(input1, input2))
        hlog.write("F primer: {0}\nR primer: {1}\n".format(f_primer, r_primer))
        hlog.write("Min match len: {0}\n".format(min_match_len))
        hlog.write("Max mismatch: {0}\n".format(max_mm))
        hlog.write("Max deletion: {0}\n".format(max_de))
        hlog.write("Max insertion: {0}\n".format(max_in))
        hlog.write("Primer detection and removal took {0} sec.\n".format(time.time() - start_t))
        hlog.write("# of original reads: {0}\n".format(total))
        hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, frac_bad))
        hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good, frac_good))
    finally:
        # close everything even when reading/writing fails part-way
        for handle in (Fgood, Rgood, hbad1, hbad2, hlog, hverbose):
            handle.close()

    os.system("gzip " + Fgood.f.name)
    os.system("gzip " + Rgood.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)
    os.system("gzip " + hverbose.name)