def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend each read's barcode to the read, rewriting ``seqfile`` in place.

    Reads in ``seqfile`` are matched to records in ``bcfile`` by the first
    whitespace-delimited token of the FASTQ title.  Barcode records are
    consumed strictly in order and non-matching ones are skipped, so reads
    shared by both files must appear in the same order.  The merged records
    are written to ``seqfile + '.tmp'``, which is then renamed over
    ``seqfile``.

    Once the barcode file has no matching record left, the current and all
    remaining reads are dropped from the output.  I/O errors propagate to
    the caller instead of being treated as end of input, so a failed read
    never replaces ``seqfile`` with a truncated file.

    Args:
        seqfile: Path of the FASTQ file to rewrite.
        bcfile: Path of the FASTQ file holding the barcode reads.
        rc: If true, the barcode sequence is reverse-complemented and its
            quality string reversed before being prepended.
        text: Optional suffix; when non-empty, ``'.' + text`` is appended to
            every output read ID.
    """
    tmpfile = seqfile + '.tmp'
    with open(seqfile) as seqh, open(bcfile) as bch, \
            open(tmpfile, 'w') as tmph:
        barcodes = FastqGeneralIterator(bch)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            # Advance through the barcode records until one carries this
            # read's ID; the for/else detects running out without a match.
            for h2, s2, q2 in barcodes:
                if h2.split()[0] == h1:
                    break
            else:
                # No barcode left for this read: stop rather than emit the
                # read with a stale, non-matching barcode.
                break
            if rc:
                s2 = Seq(s2, generic_dna).reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1 + '.' + text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmpfile, seqfile)
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend each read's barcode to the read, rewriting ``seqfile`` in place.

    Reads in ``seqfile`` are matched to records in ``bcfile`` by the first
    whitespace-delimited token of the FASTQ title.  Barcode records are
    consumed strictly in order and non-matching ones are skipped, so reads
    shared by both files must appear in the same order.  The merged records
    are written to ``seqfile + '.tmp'``, which is then renamed over
    ``seqfile``.

    Once the barcode file has no matching record left, the current and all
    remaining reads are dropped from the output.  I/O errors propagate to
    the caller instead of being treated as end of input, so a failed read
    never replaces ``seqfile`` with a truncated file.

    Args:
        seqfile: Path of the FASTQ file to rewrite.
        bcfile: Path of the FASTQ file holding the barcode reads.
        rc: If true, the barcode sequence is reverse-complemented and its
            quality string reversed before being prepended.
        text: Optional suffix; when non-empty, ``'.' + text`` is appended to
            every output read ID.
    """
    tmpfile = seqfile + '.tmp'
    with open(seqfile) as seqh, open(bcfile) as bch, \
            open(tmpfile, 'w') as tmph:
        barcodes = FastqGeneralIterator(bch)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            # Advance through the barcode records until one carries this
            # read's ID; the for/else detects running out without a match.
            for h2, s2, q2 in barcodes:
                if h2.split()[0] == h1:
                    break
            else:
                # No barcode left for this read: stop rather than emit the
                # read with a stale, non-matching barcode.
                break
            if rc:
                s2 = Seq(s2, generic_dna).reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1 + '.' + text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmpfile, seqfile)
def main(args):
    """Command-line entry point: prepend index reads to sequence reads.

    Parses ``-i/--index``, ``-s/--seq`` and ``-o/--output`` from the command
    line, then writes one FASTQ record per sequence read in which the
    matching index read's sequence and quality are prepended to the read's
    own.  Records are matched on the first whitespace-delimited token of the
    FASTQ title; index records are consumed in order and non-matching ones
    are skipped.  Output goes to ``<output>.tmp`` and is renamed to
    ``<output>`` when finished.

    Once the index file has no matching record left, the current and all
    remaining sequence reads are dropped.  I/O errors propagate instead of
    being treated as end of input.

    Args:
        args: Unused; ``OptionParser.parse_args()`` reads ``sys.argv``.

    Returns:
        0 on success.  Invalid or missing options end the program through
        ``parser.error``.
    """
    # ``__doc__`` is None when the module has no docstring; avoid a TypeError.
    usage = ("usage: %prog [options] -i <input index file> "
             "-s <input seq file> -o <output merge file>" + (__doc__ or ''))
    parser = OptionParser(usage)
    parser.add_option("-i", "--index", dest="index", default=None,
                      help="Input index fastq file.")
    parser.add_option("-s", "--seq", dest="seq", default=None,
                      help="Input seq fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None,
                      help="Output barcode file.")

    (opts, args) = parser.parse_args()
    if not (opts.index and os.path.isfile(opts.index) and
            opts.seq and os.path.isfile(opts.seq) and opts.output):
        parser.error("Missing input and/or output")

    tmp_output = opts.output + '.tmp'
    with open(opts.seq) as seqh, open(opts.index) as idxh, \
            open(tmp_output, 'w') as outh:
        index_records = FastqGeneralIterator(idxh)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            # Skip index records until one carries this read's ID; the
            # for/else detects running out without a match.
            for h2, s2, q2 in index_records:
                if h2.split()[0] == h1:
                    break
            else:
                # No index record left for this read: stop rather than
                # write it with a stale, non-matching index.
                break
            outh.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmp_output, opts.output)
    return 0
def main(args):
    """Command-line entry point: prepend index reads to sequence reads.

    Parses ``-i/--index``, ``-s/--seq`` and ``-o/--output`` from the command
    line, then writes one FASTQ record per sequence read in which the
    matching index read's sequence and quality are prepended to the read's
    own.  Records are matched on the first whitespace-delimited token of the
    FASTQ title; index records are consumed in order and non-matching ones
    are skipped.  Output goes to ``<output>.tmp`` and is renamed to
    ``<output>`` when finished.

    Once the index file has no matching record left, the current and all
    remaining sequence reads are dropped.  I/O errors propagate instead of
    being treated as end of input.

    Args:
        args: Unused; ``OptionParser.parse_args()`` reads ``sys.argv``.

    Returns:
        0 on success.  Invalid or missing options end the program through
        ``parser.error``.
    """
    # ``__doc__`` is None when the module has no docstring; avoid a TypeError.
    usage = ("usage: %prog [options] -i <input index file> "
             "-s <input seq file> -o <output merge file>" + (__doc__ or ''))
    parser = OptionParser(usage)
    parser.add_option("-i", "--index", dest="index", default=None,
                      help="Input index fastq file.")
    parser.add_option("-s", "--seq", dest="seq", default=None,
                      help="Input seq fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None,
                      help="Output barcode file.")

    (opts, args) = parser.parse_args()
    if not (opts.index and os.path.isfile(opts.index) and
            opts.seq and os.path.isfile(opts.seq) and opts.output):
        parser.error("Missing input and/or output")

    tmp_output = opts.output + '.tmp'
    with open(opts.seq) as seqh, open(opts.index) as idxh, \
            open(tmp_output, 'w') as outh:
        index_records = FastqGeneralIterator(idxh)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            # Skip index records until one carries this read's ID; the
            # for/else detects running out without a match.
            for h2, s2, q2 in index_records:
                if h2.split()[0] == h1:
                    break
            else:
                # No index record left for this read: stop rather than
                # write it with a stale, non-matching index.
                break
            outh.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmp_output, opts.output)
    return 0
def main():
    """Split a FASTQ file, or a pair of FASTQ files, into numbered chunks.

    Reads the module-level ``paired`` flag and ``arguments`` mapping.  In
    single-end mode the records of ``arguments['<file.fastq>']`` are written
    to ``<output>_NNN.fq``; in paired mode the records of
    ``arguments['<file.fastq>']`` and ``arguments['<read2.fastq>']`` are
    written in lockstep to ``<output>_R1_NNN.fq`` and ``<output>_R2_NNN.fq``.
    A new numbered file is started whenever the current one already holds
    ``arguments['--number']`` records.  All input and output handles are
    closed on exit, including when an error occurs.

    Raises:
        StopIteration: In paired mode, if the read-2 file has fewer records
            than the read-1 file.
    """
    prefix = arguments['--output']
    # NOTE(review): the comparison below assumes arguments['--number'] has
    # already been converted to an int where ``arguments`` is built - confirm.
    chunk_size = arguments['--number']
    record = "@%s\n%s\n+\n%s\n"
    count = 0
    fnum = 1
    if not paired:
        handle = open("%s_%03d.fq" % (prefix, fnum), "w")
        try:
            with open(arguments['<file.fastq>'], "r") as reads:
                for t, s, q in FastqGeneralIterator(reads):
                    if count >= chunk_size:
                        # Current chunk is full: roll over to the next file.
                        handle.close()
                        count = 0
                        fnum += 1
                        handle = open("%s_%03d.fq" % (prefix, fnum), "w")
                    handle.write(record % (t, s, q))
                    count += 1
        finally:
            handle.close()
    else:
        # going to assume all reads are in both files and skip error checking
        h1 = open("%s_R1_%03d.fq" % (prefix, fnum), "w")
        h2 = open("%s_R2_%03d.fq" % (prefix, fnum), "w")
        try:
            with open(arguments['<read2.fastq>'], "r") as reads2, \
                    open(arguments['<file.fastq>'], "r") as reads1:
                r2_gen = FastqGeneralIterator(reads2)
                for t, s, q in FastqGeneralIterator(reads1):
                    if count >= chunk_size:
                        # Roll both mates over together so chunk numbers
                        # stay aligned between R1 and R2.
                        h1.close()
                        h2.close()
                        count = 0
                        fnum += 1
                        h1 = open("%s_R1_%03d.fq" % (prefix, fnum), "w")
                        h2 = open("%s_R2_%03d.fq" % (prefix, fnum), "w")
                    h1.write(record % (t, s, q))
                    h2.write(record % next(r2_gen))
                    count += 1
        finally:
            h1.close()
            h2.close()
def distribute_reads(readfiles, read_hit_dict, single=True):
    """Write every read (or read pair) that has a hit to each of its targets.

    Read IDs are the first whitespace-delimited token of the FASTQ title.
    With one file, each read whose ID is a key of ``read_hit_dict`` is passed
    to ``write_single_seqs`` once per target.  With two files, the files are
    read in lockstep and a pair is passed to ``write_paired_seqs`` once per
    target of the first mate's ID, or of the second mate's ID when the first
    has no entry.  Any other number of files writes nothing.

    Args:
        readfiles: Sequence of one or two FASTQ file paths.
        read_hit_dict: Mapping of read ID to an iterable of targets.
        single: Unused; kept so existing callers keep working.

    Raises:
        StopIteration: If the second file has fewer records than the first.
    """
    with open(readfiles[0]) as handle1:
        iterator1 = FastqGeneralIterator(handle1)
        if len(readfiles) == 1:
            for ID1_long, Seq1, Qual1 in iterator1:
                ID1 = ID1_long.split()[0]
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_single_seqs(target, ID1, Seq1)
        elif len(readfiles) == 2:
            with open(readfiles[1]) as handle2:
                iterator2 = FastqGeneralIterator(handle2)
                for ID1_long, Seq1, Qual1 in iterator1:
                    ID2_long, Seq2, Qual2 = next(iterator2)
                    ID1 = ID1_long.split()[0]
                    ID2 = ID2_long.split()[0]
                    # The first mate's hits take precedence; the second
                    # mate's are used only when the first has no entry.
                    if ID1 in read_hit_dict:
                        targets = read_hit_dict[ID1]
                    elif ID2 in read_hit_dict:
                        targets = read_hit_dict[ID2]
                    else:
                        continue
                    for target in targets:
                        write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
def lookup_index_cycles(index_fn):
    """Return the sequence length of the first record in a gzipped FASTQ.

    Args:
        index_fn: Path of the gzip-compressed FASTQ file of index reads.

    Returns:
        Length of the first record's sequence string.

    Raises:
        StopIteration: If the file contains no FASTQ records.
    """
    # Read from the path that was passed in; the previous code ignored
    # ``index_fn`` and always opened the global ``args.index_read_file``.
    with gzip.open(index_fn) as handle:
        name, seq, qual = next(FastqGeneralIterator(handle))
    return len(seq)
def distribute_reads(readfiles, read_hit_dict, single=True):
    """Write every read (or read pair) that has a hit to each of its targets.

    Read IDs are the first whitespace-delimited token of the FASTQ title.
    With one file, each read whose ID is a key of ``read_hit_dict`` is passed
    to ``write_single_seqs`` once per target.  With two files, the files are
    read in lockstep and a pair is passed to ``write_paired_seqs`` once per
    target of the first mate's ID, or of the second mate's ID when the first
    has no entry.  Any other number of files writes nothing.

    Args:
        readfiles: Sequence of one or two FASTQ file paths.
        read_hit_dict: Mapping of read ID to an iterable of targets.
        single: Unused; kept so existing callers keep working.

    Raises:
        StopIteration: If the second file has fewer records than the first.
    """
    with open(readfiles[0]) as handle1:
        iterator1 = FastqGeneralIterator(handle1)
        if len(readfiles) == 1:
            for ID1_long, Seq1, Qual1 in iterator1:
                ID1 = ID1_long.split()[0]
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_single_seqs(target, ID1, Seq1)
        elif len(readfiles) == 2:
            with open(readfiles[1]) as handle2:
                iterator2 = FastqGeneralIterator(handle2)
                for ID1_long, Seq1, Qual1 in iterator1:
                    ID2_long, Seq2, Qual2 = next(iterator2)
                    ID1 = ID1_long.split()[0]
                    ID2 = ID2_long.split()[0]
                    # The first mate's hits take precedence; the second
                    # mate's are used only when the first has no entry.
                    if ID1 in read_hit_dict:
                        targets = read_hit_dict[ID1]
                    elif ID2 in read_hit_dict:
                        targets = read_hit_dict[ID2]
                    else:
                        continue
                    for target in targets:
                        write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
def stitch_seqs(outfile, file1, file2, blen):
    """Join paired FASTQ records into single records separated by a spacer.

    Records are paired by position (titles are not compared).  Each output
    record is: the first read, then ``blen`` 'N' bases with quality '!',
    then the reverse complement of the second read with its quality string
    reversed.  The output title is the first whitespace-delimited token of
    the first read's title.

    Output stops at the end of the shorter input; an empty input produces an
    empty output file.  I/O errors propagate instead of being treated as end
    of input.

    Args:
        outfile: Path of the FASTQ file to write.
        file1: Path of the first-read FASTQ file.
        file2: Path of the second-read FASTQ file.
        blen: Number of spacer positions inserted between the two reads.
    """
    bseq = 'N' * blen
    bqual = '!' * blen
    with open(file1) as inh1, open(file2) as inh2, \
            open(outfile, 'w') as outh:
        mates = FastqGeneralIterator(inh2)
        for head1, seq1, qual1 in FastqGeneralIterator(inh1):
            mate = next(mates, None)
            if mate is None:
                # Second file ran out first: stop, as the original loop did.
                break
            seq2 = Seq(mate[1], generic_dna)
            outh.write("@%s\n%s%s%s\n+\n%s%s%s\n" % (
                head1.split()[0], seq1, bseq, str(seq2.reverse_complement()),
                qual1, bqual, mate[2][::-1]))
def stitch_seqs(outfile, file1, file2, blen):
    """Join paired FASTQ records into single records separated by a spacer.

    Records are paired by position (titles are not compared).  Each output
    record is: the first read, then ``blen`` 'N' bases with quality '!',
    then the reverse complement of the second read with its quality string
    reversed.  The output title is the first whitespace-delimited token of
    the first read's title.

    Output stops at the end of the shorter input; an empty input produces an
    empty output file.  I/O errors propagate instead of being treated as end
    of input.

    Args:
        outfile: Path of the FASTQ file to write.
        file1: Path of the first-read FASTQ file.
        file2: Path of the second-read FASTQ file.
        blen: Number of spacer positions inserted between the two reads.
    """
    bseq = 'N' * blen
    bqual = '!' * blen
    with open(file1) as inh1, open(file2) as inh2, \
            open(outfile, 'w') as outh:
        mates = FastqGeneralIterator(inh2)
        for head1, seq1, qual1 in FastqGeneralIterator(inh1):
            mate = next(mates, None)
            if mate is None:
                # Second file ran out first: stop, as the original loop did.
                break
            seq2 = Seq(mate[1], generic_dna)
            outh.write("@%s\n%s%s%s\n+\n%s%s%s\n" % (
                head1.split()[0], seq1, bseq, str(seq2.reverse_complement()),
                qual1, bqual, mate[2][::-1]))
def parse_2fastq_parallel(file1, file2):
    """Parse two FASTQ files in lockstep, yielding one tuple per record pair.

    Yields ``(name, seq1, seq2, qual1, qual2)`` where ``name`` is the first
    whitespace-delimited token of the title from ``file1``.  Doesn't check
    that the read names in the two files match.

    Args:
        file1: Path of the first FASTQ file.
        file2: Path of the second FASTQ file.

    Raises:
        DeepseqError: If one file runs out of records before the other; the
            message names the first unpaired read.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # Bio is the biopython package
    with open(file1) as INFILE1, open(file2) as INFILE2:
        generator1 = FastqGeneralIterator(INFILE1)
        generator2 = FastqGeneralIterator(INFILE2)
        while True:
            record1 = next(generator1, None)
            record2 = next(generator2, None)
            if record1 is None and record2 is None:
                # Both files ended together.  Use a plain return: raising
                # StopIteration inside a generator is a RuntimeError under
                # PEP 479.
                return
            if record1 is None or record2 is None:
                # Report the record that has no partner in the other file.
                unpaired = record2 if record1 is None else record1
                raise DeepseqError(
                    "One file finished but the other one didn't! Read name %s"
                    % unpaired[0].split()[0])
            name1, seq1, qual1 = record1
            _name2, seq2, qual2 = record2
            yield (name1.split()[0], seq1, seq2, qual1, qual2)
# Fragment of a script: walks the read-1 FASTQ in lockstep with the index
# reads (and optional read-2 / read-3 files) and picks an output index for
# each record.  All inputs are opened with gzip.open.
# NOTE(review): iterator2 and iterator3 are tested with "is not None" in the
# loop below, so they are presumably initialised to None before this point;
# iteratori, indexes, read_index_length and read3_index_out_fh are also
# defined outside the visible source - confirm.
if (args.read2_file != None):
    iterator2 = FastqGeneralIterator(gzip.open(args.read2_file))
if (args.read3_file != None):
    iterator3 = FastqGeneralIterator(gzip.open(args.read3_file))
# Choose the index-matching function: the fuzzy matcher when any mismatches
# are allowed, the strict matcher otherwise.
func = None
if (args.mismatches > 0):
    func = make_fuzzy_match(args.mismatches, args.n_penalty)
else:
    func = make_strict_match(read_index_length, indexes)
for rname1, seq1, qual1 in FastqGeneralIterator(gzip.open(args.read1_file)):
    # NOTE(review): .next() exists only on Python 2 iterators; the builtin
    # next(iterator) works on both Python 2.6+ and Python 3.
    rnamei, seqi, quali = iteratori.next()
    out_index = func(seqi, indexes)
    # Index reads the matcher cannot assign are routed to the 'unknown' key.
    if (out_index is None):
        out_index = 'unknown'
    if (iterator2 is not None):
        rname2, seq2, qual2 = iterator2.next()
    if (iterator3 is not None):
        rname3, seq3, qual3 = iterator3.next()
        # Writes "@<title>:<sequence>" as a single line; qual3 is not
        # written here.
        read3_index_out_fh[out_index].write('@' + rname3 + ":" + seq3 + "\n")
# Trim and length-filter paired FASTQ reads.
# Command line: <read1.fastq> <read2.fastq> <read1 output> <read2 output>
#               <min_length> <max_length>
# FastqGeneralIterator yields plain (title, sequence, quality) string tuples.
read1_iter = FastqGeneralIterator(open(sys.argv[1]))
read2_iter = FastqGeneralIterator(open(sys.argv[2]))

read1_out = open(sys.argv[3], 'w')
read2_out = open(sys.argv[4], 'w')

min_length = int(sys.argv[5])
max_length = int(sys.argv[6])

# Number of leading positions removed from every read and quality string.
leading_trim = 5

pairs_discarded = 0

for read1_id, read1_seq, read1_qual in read1_iter:
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists only on Python 2.
    read2_id, read2_seq, read2_qual = next(read2_iter)

    read1_seq = read1_seq[leading_trim:]
    read1_qual = read1_qual[leading_trim:]
    read2_seq = read2_seq[leading_trim:]
    read2_qual = read2_qual[leading_trim:]

    # Drop the whole pair when either trimmed mate is shorter than the
    # minimum length.
    if len(read1_seq) < min_length or len(read2_seq) < min_length:
        pairs_discarded += 1
        continue

    # Cap read 1 (sequence and quality together) at the maximum length.
    if len(read1_seq) > max_length:
        read1_seq = read1_seq[:max_length]
        read1_qual = read1_qual[:max_length]
# Fragment: the `if` that this `else` belongs to, and the end of the final
# write call, are outside the visible source.
else:
    handle1 = open(fastq1)
# The second file is opened through gzip when its name ends in ".gz".
if fastq2.endswith(".gz"):
    sys.stderr.write("Decompressing %s\n" % fastq2)
    handle2 = gzip.open(fastq2)
else:
    handle2 = open(fastq2)
sys.stderr.write("Interlacing paired FASTQ files to stdout...\n")
out_handle = sys.stdout
iter1 = FastqGeneralIterator(handle1)
iter2 = FastqGeneralIterator(handle2)
# Walk both files in lockstep, driven by the first file.
for title1, seq1, qual1 in iter1:
    try:
        # NOTE(review): .next() exists only on Python 2 iterators; the
        # builtin next(iter2) works on both Python 2.6+ and Python 3.
        title2, seq2, qual2 = iter2.next()
    except StopIteration:
        sys_exit("More records in %s than %s, e.g. %s" % (fastq1, fastq2, title1))
    # NOTE(review): split(None, 1) returns a single element when the title
    # has no whitespace, which makes this two-name unpacking raise
    # ValueError - confirm every title carries a description.
    id1, descr1 = title1.split(None, 1)
    id2, descr2 = title2.split(None, 1)
    if id1 == id2:
        # Add the /1 and /2, preserve any description after the ID
        if descr1:
            descr1 = " " + descr1
        if descr2:
            descr2 = " " + descr2
        out_handle.write("@%s/1%s\n%s\n+\n%s\n@%s/2%s\n%s\n+\n%s\n" % (id1, descr1, seq1, qual1, id2, descr2, seq2, qual2))
    # IDs that already end in /1 and /2 and agree on everything before it.
    elif id1.endswith("/1") and id2.endswith("/2") and id1[:-2] == id2[:-2]:
        out_handle.write("@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n"
#!/usr/bin/python
"""Flatten paired FASTQ reads into one tab-separated line per pair.

Usage: <script> <read1.fastq> <read2.fastq>

Records are paired by position.  Each line written to ``<read1.fastq>.tmp``
holds: key, read-1 title, read-1 sequence, read-1 quality, read-2 title,
read-2 sequence, read-2 quality.  Every read-1 title is also printed to
standard output.
"""
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.QualityIO import FastqGeneralIterator

# Context managers guarantee the output is flushed and every handle is
# closed, even if the read-2 file runs out early.
with open(sys.argv[1]) as read1_handle, \
        open(sys.argv[2]) as read2_handle, \
        open(sys.argv[1] + ".tmp", "w") as kv_out_file:
    read1_iter = FastqGeneralIterator(read1_handle)
    read2_iter = FastqGeneralIterator(read2_handle)

    for read1 in read1_iter:
        # Builtin next() works on Python 2.6+ and Python 3; the iterator's
        # .next() method exists only on Python 2.
        read2 = next(read2_iter)
        print(read1[0])
        # strip off the /1 in read1 - this will be the key for the MR data
        # file.  The last two characters are removed unconditionally.
        read_id = read1[0][:-2]
        kv_out_file.write("\t".join([
            read_id, read1[0], read1[1], read1[2],
            read2[0], read2[1], read2[2]
        ]) + "\n")