def writeout(clusters, output_filename):
    """Write every cluster as a BowTie-style record.

    Per-position quality is the abundance-weighted average: the summed
    quals are divided element-wise by the abundance counts, then encoded
    as Phred+33 characters.
    """
    writer = BowTieWriter(output_filename)
    for cluster_id, info in clusters.items():
        avg_qual = np.array(info['qual']) / np.array(info['abun'])
        encoded = "".join(chr(int(round(q + 33))) for q in avg_qual)
        record = {
            'ID': cluster_id,
            'seq': info['seq'],
            'qual': encoded,
            'strand': '+',
            'ref': str(info['cycle']),
            'offset': max(info['abun']),
        }
        writer.write(record)
    writer.close()
# Exemplo n.º 2  (scrape artifact — commented out so the file stays parseable)
# 0
def main(fq1, fq2, output_prefix, abundance_filename):
    """Find overlaps for paired-end reads and write alignment logs.

    Parameters:
      fq1, fq2 -- paired FASTQ filenames, read in lockstep.
      output_prefix -- prefix for all output files.
      abundance_filename -- optional tab-delimited "<id><TAB><count>" file;
        when None every read has abundance 1.

    Writes <output_prefix>.overlap.aligned / .1.unaligned / .2.unaligned,
    plus .overlap.log (raw read counts) and .overlap.log_expanded
    (abundance-weighted counts).
    """
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        abundance = {}
        with open(abundance_filename) as f:
            for line in f:
                _id, _count = line.strip().split('\t')
                abundance[_id] = int(_count)

    matchf = BowTieWriter(output_prefix + '.overlap.aligned')
    unf1 = FastqWriter(output_prefix + '.overlap.1.unaligned')
    unf2 = FastqWriter(output_prefix + '.overlap.2.unaligned')

    total = 0
    total_expanded = 0
    aligned = 0
    aligned_expanded = 0
    for r1, r2 in FastqReaderPaired(fq1, fq2):
        # Strip the "/1" or "/2" pair suffix. BUGFIX: when the ID carries
        # no '/', find() returns -1 and the old slice silently dropped the
        # last character of the ID.
        slash = r1['ID'].find('/')
        realid = r1['ID'][:slash] if slash != -1 else r1['ID']
        total += 1
        total_expanded += abundance[realid]
        if find_overlap(r1, r2, matchf, unf1, unf2):  # overlap found
            aligned += 1
            aligned_expanded += abundance[realid]

    # The raw and expanded logs share the exact same bowtie-style format.
    _write_overlap_log(output_prefix + '.overlap.log', total, aligned)
    _write_overlap_log(output_prefix + '.overlap.log_expanded',
                       total_expanded, aligned_expanded)

    matchf.close()
    unf1.close()
    unf2.close()


def _write_overlap_log(filename, total, aligned):
    """Write one bowtie-style alignment summary log to <filename>."""
    # BUGFIX: guard against ZeroDivisionError when the input had no reads.
    p = aligned * 100. / total if total > 0 else 0.
    with open(filename, 'w') as f:
        f.write("# reads processed: {0}\n".format(total))
        f.write(
            "# reads with at least one reported alignment: {0} ({1:.2f}%)\n".
            format(aligned, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(
            total - aligned, 100 - p))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".
                format(aligned))
def main(fq1, fq2, output_prefix, abundance_filename):
    """Overlap paired-end reads from fq1/fq2 and write bowtie-style logs.

    abundance_filename is an optional tab-delimited id/count table; when
    None, each read counts as 1 in the abundance-expanded log.

    Writes <output_prefix>.overlap.aligned / .1.unaligned / .2.unaligned
    plus .overlap.log and .overlap.log_expanded summaries.
    """
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        abundance = {}
        with open(abundance_filename) as f:
            for line in f:
                _id, _count = line.strip().split('\t')
                abundance[_id] = int(_count)

    matchf = BowTieWriter(output_prefix + '.overlap.aligned')
    unf1 = FastqWriter(output_prefix + '.overlap.1.unaligned')
    unf2 = FastqWriter(output_prefix + '.overlap.2.unaligned')

    total = 0
    total_expanded = 0
    aligned = 0
    aligned_expanded = 0
    for r1, r2 in FastqReaderPaired(fq1, fq2):
        # BUGFIX: when the ID carries no '/' pair suffix, find() returns -1
        # and the old slice silently dropped the ID's last character.
        slash = r1['ID'].find('/')
        realid = r1['ID'][:slash] if slash != -1 else r1['ID']
        total += 1
        total_expanded += abundance[realid]
        if find_overlap(r1, r2, matchf, unf1, unf2):  # overlap found
            aligned += 1
            aligned_expanded += abundance[realid]

    with open(output_prefix + '.overlap.log', 'w') as f:
        # BUGFIX: avoid ZeroDivisionError when the input had no reads.
        p = aligned * 100. / total if total > 0 else 0.
        f.write("# reads processed: {0}\n".format(total))
        f.write("# reads with at least one reported alignment: {0} ({1:.2f}%)\n".format(aligned, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(total - aligned, 100 - p))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".format(aligned))

    with open(output_prefix + '.overlap.log_expanded', 'w') as f:
        # BUGFIX: same zero-division guard for the abundance-weighted log.
        p = aligned_expanded * 100. / total_expanded if total_expanded > 0 else 0.
        f.write("# reads processed: {0}\n".format(total_expanded))
        f.write("# reads with at least one reported alignment: {0} ({1:.2f}%)\n".format(aligned_expanded, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(total_expanded - aligned_expanded, 100 - p))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".format(aligned_expanded))

    matchf.close()
    unf1.close()
    unf2.close()
def filter_low_qual_seqs(gz_filename1, gz_filename2, phred_offset, phred_cutoff):
    """
    Takes a BowTie-style gzipped file (ex: .aligned.composite.gz)
    and retain only seqs that have every base phred >= <cutoff>

    Outputs: .phred<cutoff>_passed for both files
    """
    assert phred_offset >= 0
    assert phred_cutoff >= 0
    bad = 0
    good = 0
    start_t = time.time()
    print gz_filename1, gz_filename2
    f1 = BowTieWriter(gz_filename1 + ".phred{0}_passed".format(phred_cutoff), 'w')
    f2 = BowTieWriter(gz_filename2 + ".phred{0}_passed".format(phred_cutoff), 'w')
    for r1, r2 in itertools.izip(BowTieReader(gz_filename1,False), BowTieReader(gz_filename2,False)):
        if all(ord(x)-phred_offset >= phred_cutoff for x in r1['qual']) and \
        all(ord(x)-phred_offset >= phred_cutoff for x in r2['qual']):
            good += 1
            f1.write(r1)
            f2.write(r2)
        else:
            bad += 1

    with open(gz_filename1 + ".phred{0}_passed.log".format(phred_cutoff), 'w') as f:
        f.write("Running filter_low_qual_seq took {0} secs\n".format(time.time()-start_t))
        f.write("Input: " + gz_filename1 + ',' + gz_filename2 + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')
# Exemplo n.º 5  (scrape artifact — commented out so the file stays parseable)
# 0
def split_fasta_by_otu(fasta_filename, bowtie_gz_filename, otu_filename,
                       output_dir):
    """Split fasta + bowtie records into one subdirectory per OTU.

    For each cluster listed in otu_filename, a subdirectory
    <output_dir>/<cluster_id> is created (if absent) and filled with that
    cluster's <cluster_id>.fasta and gzipped <cluster_id>.bowtie records.
    """
    seq_to_cluster = {}
    cluster_ids = set()
    with open(otu_filename) as otu_f:
        for line in otu_f:
            fields = line.strip().split()
            cluster_id = fields[0]
            cluster_ids.add(cluster_id)
            subdir = os.path.join(output_dir, cluster_id)
            if not os.path.exists(subdir):
                os.mkdir(subdir)
            for seq_id in fields[1:]:
                seq_to_cluster[seq_id] = cluster_id
    sys.stderr.write("finished reading %s\n" % otu_filename)

    fasta_handles = {}
    bowtie_handles = {}
    for cluster_id in cluster_ids:
        subdir = os.path.join(output_dir, cluster_id)
        fasta_handles[cluster_id] = open(
            os.path.join(subdir, cluster_id + '.fasta'), 'w')
        bowtie_handles[cluster_id] = BowTieWriter(
            os.path.join(subdir, cluster_id + '.bowtie'), mode='w')

    # route each fasta record to its cluster; unassigned seqs are skipped
    for rec in SeqIO.parse(open(fasta_filename), 'fasta'):
        cluster_id = seq_to_cluster.get(rec.id)
        if cluster_id is None:
            continue
        fasta_handles[cluster_id].write(">{0}\n{1}\n".format(rec.id, rec.seq))

    # route each bowtie record likewise (ID may carry whitespace-separated
    # extras, so match on the first token only)
    for rec in BowTieReader(bowtie_gz_filename, False):
        key = rec['ID'].split()[0]
        if key in seq_to_cluster:
            bowtie_handles[seq_to_cluster[key]].write(rec)

    for handle in fasta_handles.values():
        handle.close()

    for handle in bowtie_handles.values():
        handle.close()
        os.system("gzip " + handle.f.name)
def filter_low_qual_seqs(gz_filename1, gz_filename2, phred_offset,
                         phred_cutoff):
    """
    Takes a BowTie-style gzipped file (ex: .aligned.composite.gz)
    and retain only seqs that have every base phred >= <cutoff>

    Outputs: .phred<cutoff>_passed for both files
    """
    assert phred_offset >= 0
    assert phred_cutoff >= 0
    bad = 0
    good = 0
    start_t = time.time()
    print gz_filename1, gz_filename2
    f1 = BowTieWriter(gz_filename1 + ".phred{0}_passed".format(phred_cutoff),
                      'w')
    f2 = BowTieWriter(gz_filename2 + ".phred{0}_passed".format(phred_cutoff),
                      'w')
    for r1, r2 in itertools.izip(BowTieReader(gz_filename1, False),
                                 BowTieReader(gz_filename2, False)):
        if all(ord(x)-phred_offset >= phred_cutoff for x in r1['qual']) and \
        all(ord(x)-phred_offset >= phred_cutoff for x in r2['qual']):
            good += 1
            f1.write(r1)
            f2.write(r2)
        else:
            bad += 1

    with open(gz_filename1 + ".phred{0}_passed.log".format(phred_cutoff),
              'w') as f:
        f.write(
            "Running filter_low_qual_seq took {0} secs\n".format(time.time() -
                                                                 start_t))
        f.write("Input: " + gz_filename1 + ',' + gz_filename2 + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')
# Exemplo n.º 7  (scrape artifact — commented out so the file stays parseable)
# 0
def detect_primers_PE(input1, input2, output_prefix, f_primer, r_primer, min_match_len, max_mm, max_de, max_in):
    """
    NOTE: this is for paired end reads that comes in two separate files
    ex: DS19342_CTTGTA_L006_R1_001.fastq.gz and DS19342_CTTGTA_L006_R2_001.fastq.gz

    Given a pair of reads from input1, input2:
    1. Detect that F primer exists in one read and R primer in the other
    2. If both reads pass primer detection, output
    3. Otherwise, discard

    Output:  <output_prefix>.{F|R}primer_good
             <output_prefix>.primer.bad
             <output_prefix>.primer.log
    """
    def process_primer(r, match_len, is_reverse):
        # get record into miscBowTie.BowTieReader format
        # strip away primers from seq & qual, properly rev comp!
        r['offset'] = match_len
        r['seq'] = r['seq'][match_len:]
        r['qual'] = r['qual'][match_len:]
        r['ref'] = 'NA'
        if is_reverse:
            r['seq'] = Seq(r['seq']).reverse_complement().tostring()
            r['qual'] = r['qual'][::-1]

    os.system("rm {0}.*primer_*".format(output_prefix))
    Fgood = BowTieWriter(output_prefix + '.Fprimer_good')
    Rgood = BowTieWriter(output_prefix + '.Rprimer_good')
    hbad1 = FastqWriter(output_prefix + '.primer_bad.1')
    hbad2 = FastqWriter(output_prefix + '.primer_bad.2')
    hverbose = open(output_prefix + '.primer.verbose', 'w')
    hlog = open(output_prefix + '.primer.log', 'w')
    start_t = time.time()
    good, bad = 0,0

    pmF = PrimerMatch(f_primer)
    pmR = PrimerMatch(r_primer)

    for r1, r2 in itertools.izip(FastqReader(input1), FastqReader(input2)):
        # NOTE: in the case of PE reads
        #       regardless of whether we're matching for F or R primer
        #       they would all appear at the 5' end of the read
        #       which is why we call match_primer_len with is_reverse = False
        match_f_len1, mmf1 = match_primer_len(r1['seq'], f_primer, max_mm, min_match_len, False)
        match_r_len1, mmr1 = match_primer_len(r1['seq'], r_primer, max_mm, min_match_len, False)
        match_f_len2, mmf2 = match_primer_len(r2['seq'], f_primer, max_mm, min_match_len, False)
        match_r_len2, mmr2 = match_primer_len(r2['seq'], r_primer, max_mm, min_match_len, False)
        if match_f_len1 > 0 and match_r_len2 > 0:
            # case 1, read 1 is F, read 2 is R
            good += 1
            process_primer(r1, match_f_len1, False)
            Fgood.write(r1)
            process_primer(r2, match_r_len2, False)
            Rgood.write(r2)
        elif match_f_len2 > 0 and match_r_len1 > 0:
            # case 2, read 1 is R, read 2 is F
            good += 1
            process_primer(r2, match_f_len2, False)
            Fgood.write(r2)
            process_primer(r1, match_r_len1, False)
            Rgood.write(r1)
        else:
            # exact matching failed; retry with the suffix-based matcher
            pmF.make_suffix(r1['seq'])
            pmF.match(min_match_len, max_mm, max_in, max_de)
            if pmF.match_result is not None:
                pmR.make_suffix(r2['seq'])
                pmR.match(min_match_len, max_mm, max_in, max_de)
                if pmR.match_result is not None:  # case 1, read 1 is F, read 2 is R
                    good += 1
                    process_primer(r1, pmF.match_result.match_len, False)
                    Fgood.write(r1)
                    hverbose.write("{0}\t{1}\t{2}\n".format(r1['ID'], pmF.match_result.match_len, pmF.match_result.miss))
                    process_primer(r2, pmR.match_result.match_len, False)
                    Rgood.write(r2)
                    hverbose.write("{0}\t{1}\t{2}\n".format(r2['ID'], pmR.match_result.match_len, pmR.match_result.miss))
                else:
                    hbad1.write(r1)
                    hbad2.write(r2)
                    bad += 1
            else:
                pmR.make_suffix(r1['seq'])
                pmR.match(min_match_len, max_mm, max_in, max_de)
                if pmR.match_result is not None:
                    pmF.make_suffix(r2['seq'])
                    pmF.match(min_match_len, max_mm, max_in, max_de)
                    if pmF.match_result is not None:
                        good += 1
                        # case 2, read 1 is R, read 2 is F
                        process_primer(r2, pmF.match_result.match_len, False)
                        hverbose.write("{0}\t{1}\t{2}\n".format(r2['ID'], pmF.match_result.match_len, pmF.match_result.miss))
                        Fgood.write(r2)
                        process_primer(r1, pmR.match_result.match_len, False)
                        Rgood.write(r1)
                        hverbose.write("{0}\t{1}\t{2}\n".format(r1['ID'], pmR.match_result.match_len, pmR.match_result.miss))
                    else:
                        # case 3: unresolved, bad read pair
                        hbad1.write(r1)
                        hbad2.write(r2)
                        bad += 1
                else:
                    # BUGFIX: this branch was missing -- pairs where neither
                    # primer fuzzily matched read 1 were silently dropped:
                    # never counted and never written to the bad-read files,
                    # which skewed the log statistics.
                    hbad1.write(r1)
                    hbad2.write(r2)
                    bad += 1

    total = good + bad
    hlog.write("Input 1: {0}\nInput 2: {1}\n".format(input1, input2))
    hlog.write("F primer: {0}\nR primer: {1}\n".format(f_primer, r_primer))
    hlog.write("Min match len: {0}\n".format(min_match_len))
    hlog.write("Max mismatch: {0}\n".format(max_mm))
    hlog.write("Max deletion: {0}\n".format(max_de))
    hlog.write("Max insertion: {0}\n".format(max_in))
    hlog.write("Primer detection and removal took {0} sec.\n".format(time.time()-start_t))
    hlog.write("# of original reads: {0}\n".format(total))
    # BUGFIX: guard against ZeroDivisionError when the inputs are empty
    hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, bad*1./total if total else 0.))
    hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good, good*1./total if total else 0.))

    Fgood.close()
    Rgood.close()
    hbad1.close()
    hbad2.close()
    hlog.close()
    hverbose.close()
    os.system("gzip " + Fgood.f.name)
    os.system("gzip " + Rgood.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)
    os.system("gzip " + hverbose.name)
# Exemplo n.º 8  (scrape artifact — commented out so the file stays parseable)
# 0
			qual += chr(int(-10*log10(e)+33))
		seq += r2['seq'][N-delta:]
		qual += r2['qual'][N-delta:]
	return seq, qual, N-delta

if __name__ == "__main__":
    # Driver: compose each aligned read pair into one composite read.
    from miscBowTie import BowTieReader, BowTieWriter
    from cPickle import *
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--input", dest="input", help="Input bowtie aligned file (gzipped)")
    parser.add_option("--output", dest="output", help="Output composite read filename")

    options, args = parser.parse_args()

    reader = BowTieReader(options.input, is_paired=True)
    print >> sys.stderr, "calculating base frequencies"
    # base frequencies are expensive to compute, so reuse a cached pickle
    base_freq_pickle = options.input + ".base_freq.pickle"
    if os.path.exists(base_freq_pickle):
        # BUGFIX: use a with-block so the pickle file handle is not leaked
        with open(base_freq_pickle) as f:
            base_freq = load(f)
    else:
        base_freq = reader.get_base_frequency()
        # consistency: reuse base_freq_pickle instead of rebuilding the path
        with open(base_freq_pickle, 'w') as f:
            dump(base_freq, f)
    print >> sys.stderr, "reading bowtie aligned file..."
    writer = BowTieWriter(options.output)
    for r1, r2 in reader:
        seq, qual, overlap = compose2(r1, r2, base_freq)
        writer.write_composite(r1, r2, seq, qual, overlap)
    # BUGFIX: close the writer so buffered composite output is flushed
    writer.close()
def remove_high_expected_error_PE(file1, file2, max_expected_error):
    """
    Remove all reads where the expected error (sum of err probs from phred
    scores, Phred+33 encoding assumed) exceeds <max_expected_error>.

    Writes <fileN>.experror_good / .experror_bad (then gzips them) and a
    summary log to <file1>.experror.log.
    """
    def exp_error(qual):
        # expected number of errors: sum of 10^(-Q/10) over Phred+33 quals
        return sum(10**-((ord(x)-33)/10.) for x in qual)

    assert os.path.exists(file1) and os.path.exists(file2)
    os.system("rm {0}.experror_*".format(file1))
    os.system("rm {0}.experror_*".format(file2))
    hgood1 = BowTieWriter(file1 + '.experror_good')
    hgood2 = BowTieWriter(file2 + '.experror_good')
    hbad1 = BowTieWriter(file1 + '.experror_bad')
    hbad2 = BowTieWriter(file2 + '.experror_bad')
    hlog = open(file1 + '.experror.log', 'w')
    start_t = time.time()
    good, bad = 0, 0
    for r1, r2 in itertools.izip(BowTieReader(file1, False), BowTieReader(file2, False)):
        # keep the pair only if BOTH mates pass the expected-error cutoff
        if exp_error(r1['qual']) <= max_expected_error and \
        exp_error(r2['qual']) <= max_expected_error:
            hgood1.write(r1)
            hgood2.write(r2)
            good += 1
        else:
            hbad1.write(r1)
            hbad2.write(r2)
            bad += 1
    total = good + bad
    hlog.write("Expected error filtering took {0} sec.\n".format(time.time()-start_t))
    hlog.write("Max allowed expected error: {0}\n".format(max_expected_error))
    hlog.write("# of original reads: {0}\n".format(total))
    # BUGFIX: guard against ZeroDivisionError when both inputs are empty
    hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, bad*1./total if total else 0.))
    hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good, good*1./total if total else 0.))

    hgood1.close()
    hgood2.close()
    hbad1.close()
    hbad2.close()
    hlog.close()
    os.system("gzip " + hgood1.f.name)
    os.system("gzip " + hgood2.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)
# Exemplo n.º 10  (scrape artifact — commented out so the file stays parseable)
# 0
def remove_high_expected_error_PE(file1, file2, max_expected_error):
    """
    Remove all reads where the expected error (sum of err probs from phred
    scores, Phred+33 encoding assumed) exceeds <max_expected_error>.

    Writes <fileN>.experror_good / .experror_bad (then gzips them) and a
    summary log to <file1>.experror.log.
    """
    assert os.path.exists(file1) and os.path.exists(file2)
    os.system("rm {0}.experror_*".format(file1))
    os.system("rm {0}.experror_*".format(file2))
    hgood1 = BowTieWriter(file1 + '.experror_good')
    hgood2 = BowTieWriter(file2 + '.experror_good')
    hbad1 = BowTieWriter(file1 + '.experror_bad')
    hbad2 = BowTieWriter(file2 + '.experror_bad')
    hlog = open(file1 + '.experror.log', 'w')
    start_t = time.time()
    good, bad = 0, 0
    for r1, r2 in itertools.izip(BowTieReader(file1, False),
                                 BowTieReader(file2, False)):
        # keep the pair only if BOTH mates pass the expected-error cutoff;
        # 10^(-(Q)/10) is the error probability of one Phred+33 base
        if sum(10**-((ord(x)-33)/10.) for x in r1['qual']) <= max_expected_error and \
        sum(10**-((ord(x)-33)/10.) for x in r2['qual']) <= max_expected_error:
            hgood1.write(r1)
            hgood2.write(r2)
            good += 1
        else:
            hbad1.write(r1)
            hbad2.write(r2)
            bad += 1
    total = good + bad
    hlog.write("Expected error filtering took {0} sec.\n".format(time.time() -
                                                                 start_t))
    hlog.write("Max allowed expected error: {0}\n".format(max_expected_error))
    hlog.write("# of original reads: {0}\n".format(total))
    # BUGFIX: guard against ZeroDivisionError when both inputs are empty
    hlog.write("# of reads removed: {0} ({1:.2f})\n".format(
        bad, bad * 1. / total if total else 0.))
    hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(
        good, good * 1. / total if total else 0.))

    hgood1.close()
    hgood2.close()
    hbad1.close()
    hbad2.close()
    hlog.close()
    os.system("gzip " + hgood1.f.name)
    os.system("gzip " + hgood2.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)