def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Cut every read of a FASTQ file into fixed-length tiles.

    For each read, windows of ``length`` bases are taken at positions
    0, ``offset``, ``2 * offset``, ...  The k-th window of every read is
    written to the k-th output file.  Output files are created lazily via
    ``_open_file(outbase, out_idx, gz, quiet)``, which yields the open
    handle, a temporary filename and the final filename; temporary files
    are renamed to their final names only after all reads were processed.

    Arguments:
        fname   -- path of the input FASTQ file
        outbase -- base name handed to ``_open_file`` for each output
        length  -- number of bases per tile
        offset  -- distance between the starts of consecutive tiles
        gz      -- passed through to ``_open_file``
        quiet   -- passed through to ``_open_file`` and ``FASTQ.fetch``

    Raises:
        ValueError -- if ``offset`` is not positive (the window would never
                      advance and the loop would not terminate).
    """
    if offset <= 0:
        raise ValueError(
            "offset must be a positive integer (got %r)" % (offset,))

    fastq = FASTQ(fname)
    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            # NOTE(review): the strict '<' means a window that ends exactly
            # on the last base of the read is not emitted.  Kept as-is to
            # preserve existing output; confirm whether '<=' was intended.
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                tile = read.subseq(
                    pos, pos + length,
                    comment="#tile:%s,%s" % (pos, pos + length))
                tile.write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Always release the handles, even when a read fails mid-stream.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: publish the finished files.
    for tmp, final_name in fnames:
        os.rename(tmp, final_name)
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Cut every read of a FASTQ file into fixed-length tiles.

    For each read, windows of ``length`` bases are taken at positions
    0, ``offset``, ``2 * offset``, ...  The k-th window of every read is
    written to the k-th output file.  Output files are created lazily via
    ``_open_file(outbase, out_idx, gz, quiet)``, which yields the open
    handle, a temporary filename and the final filename; temporary files
    are renamed to their final names only after all reads were processed.

    Arguments:
        fname   -- path of the input FASTQ file
        outbase -- base name handed to ``_open_file`` for each output
        length  -- number of bases per tile
        offset  -- distance between the starts of consecutive tiles
        gz      -- passed through to ``_open_file``
        quiet   -- passed through to ``_open_file`` and ``FASTQ.fetch``

    Raises:
        ValueError -- if ``offset`` is not positive (the window would never
                      advance and the loop would not terminate).
    """
    if offset <= 0:
        raise ValueError(
            "offset must be a positive integer (got %r)" % (offset,))

    fastq = FASTQ(fname)
    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            # NOTE(review): the strict '<' means a window that ends exactly
            # on the last base of the read is not emitted.  Kept as-is to
            # preserve existing output; confirm whether '<=' was intended.
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                tile = read.subseq(
                    pos, pos + length,
                    comment="#tile:%s,%s" % (pos, pos + length))
                tile.write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Always release the handles, even when a read fails mid-stream.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: publish the finished files.
    for tmp, final_name in fnames:
        os.rename(tmp, final_name)
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Split an interleaved FASTQ file into one file per mate.

    Consecutive reads that share a name are treated as belonging together:
    the first goes to ``<out_template>.1.fastq``, the second to
    ``<out_template>.2.fastq`` and so on (``.fastq.gz`` when ``gz`` is
    true).  The first output file is always created; further files are
    opened the first time they are needed.

    Arguments:
        combined_fname -- path of the interleaved FASTQ input
        out_template   -- prefix used to build the output filenames
        gz             -- write gzip-compressed output when true
    """
    def _open_out(idx):
        # Single place that knows the output naming scheme.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, idx), 'w')
        return open('%s.%s.fastq' % (out_template, idx), 'w')

    outs = []
    fq = None
    try:
        outs.append(_open_out(1))
        fq = FASTQ(combined_fname)

        outidx = 1
        last_read = None
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                # Same fragment as the previous record: next mate file.
                outidx += 1
                if len(outs) < outidx:
                    outs.append(_open_out(outidx))
                read.write(outs[outidx - 1])
            else:
                # New fragment: start again at the first mate file.
                outidx = 1
                read.write(outs[0])
            last_read = read
    finally:
        # Release every handle even if reading or writing failed.
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
def testSplitUnpaired(self):
    # Split test.fastq into two chunks while ignoring pairing, then check
    # that both chunk files exist and each holds the reads foo, bar, baz.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')

    ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

    chunk_paths = ['%s.%s.fastq' % (templ, n) for n in (1, 2)]
    for path in chunk_paths:
        self.assertTrue(os.path.exists(path))

    # Open both chunks before reading either, as the checks below expect.
    readers = [FASTQ(path) for path in chunk_paths]
    for reader in readers:
        names = [rec.name for rec in reader.fetch(quiet=True)]
        self.assertEqual(names, ['foo', 'bar', 'baz'])

    for reader in readers:
        reader.close()

    # Remove the generated chunk files.
    for path in chunk_paths:
        os.unlink(path)
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Split an interleaved FASTQ file into one file per mate.

    Consecutive reads that share a name are treated as belonging together:
    the first goes to ``<out_template>.1.fastq``, the second to
    ``<out_template>.2.fastq`` and so on (``.fastq.gz`` when ``gz`` is
    true).  The first output file is always created; further files are
    opened the first time they are needed.

    Arguments:
        combined_fname -- path of the interleaved FASTQ input
        out_template   -- prefix used to build the output filenames
        gz             -- write gzip-compressed output when true
    """
    def _open_out(idx):
        # Single place that knows the output naming scheme.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, idx), 'w')
        return open('%s.%s.fastq' % (out_template, idx), 'w')

    outs = []
    fq = None
    try:
        outs.append(_open_out(1))
        fq = FASTQ(combined_fname)

        outidx = 1
        last_read = None
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                # Same fragment as the previous record: next mate file.
                outidx += 1
                if len(outs) < outidx:
                    outs.append(_open_out(outidx))
                read.write(outs[outidx - 1])
            else:
                # New fragment: start again at the first mate file.
                outidx = 1
                read.write(outs[0])
            last_read = read
    finally:
        # Release every handle even if reading or writing failed.
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    """Distribute the reads of a FASTQ file round-robin over several files.

    Output files are named ``<outbase>.<n>.fastq`` (``.fastq.gz`` when
    ``gz`` is true) for n = 1..chunks.  Data is first written to a hidden
    ``.tmp.<basename>`` file in the same directory and renamed to the final
    name once every read has been written.

    When the input is paired (``fastq.is_paired``) and ``ignore_pairs`` is
    false, the target file only advances when the read name changes, so
    consecutive reads with the same name land in the same file.

    Arguments:
        fname        -- path of the input FASTQ file
        outbase      -- prefix for the output filenames
        chunks       -- number of output files (must be >= 1)
        ignore_pairs -- treat every read independently
        gz           -- write gzip-compressed output when true
        count_fname  -- accepted for interface compatibility; not used here
        quiet        -- suppress the 'Output file' messages on stderr and
                        pass ``quiet`` on to ``FASTQ.fetch``

    Raises:
        ValueError -- if ``chunks`` is smaller than 1.
    """
    if chunks < 1:
        raise ValueError(
            "chunks must be at least 1 (got %r)" % (chunks,))

    fastq = FASTQ(fname)
    is_paired = False if ignore_pairs else fastq.is_paired

    suffix = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    try:
        for idx in range(chunks):
            fn = '%s.%s.%s' % (outbase, idx + 1, suffix)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            if not quiet:
                sys.stderr.write('Output file: %s\n' % fn)

            outs.append(opener(tmp, 'w'))

        # Start one past the end so the first read wraps around to file 0.
        i = chunks
        last_name = None

        for read in fastq.fetch(quiet=quiet):
            if not is_paired or read.name != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = read.name
            read.write(outs[i])
    finally:
        # Release every handle even if reading or writing failed.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: publish the finished files.
    for tmp, final_name in fnames:
        os.rename(tmp, final_name)
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    """Distribute the reads of a FASTQ file round-robin over several files.

    Output files are named ``<outbase>.<n>.fastq`` (``.fastq.gz`` when
    ``gz`` is true) for n = 1..chunks.  Data is first written to a hidden
    ``.tmp.<basename>`` file in the same directory and renamed to the final
    name once every read has been written.

    When the input is paired (``fastq.is_paired``) and ``ignore_pairs`` is
    false, the target file only advances when the read name changes, so
    consecutive reads with the same name land in the same file.

    Arguments:
        fname        -- path of the input FASTQ file
        outbase      -- prefix for the output filenames
        chunks       -- number of output files (must be >= 1)
        ignore_pairs -- treat every read independently
        gz           -- write gzip-compressed output when true
        count_fname  -- accepted for interface compatibility; not used here
        quiet        -- suppress the 'Output file' messages on stderr and
                        pass ``quiet`` on to ``FASTQ.fetch``

    Raises:
        ValueError -- if ``chunks`` is smaller than 1.
    """
    if chunks < 1:
        raise ValueError(
            "chunks must be at least 1 (got %r)" % (chunks,))

    fastq = FASTQ(fname)
    is_paired = False if ignore_pairs else fastq.is_paired

    suffix = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    try:
        for idx in range(chunks):
            fn = '%s.%s.%s' % (outbase, idx + 1, suffix)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            if not quiet:
                sys.stderr.write('Output file: %s\n' % fn)

            outs.append(opener(tmp, 'w'))

        # Start one past the end so the first read wraps around to file 0.
        i = chunks
        last_name = None

        for read in fastq.fetch(quiet=quiet):
            if not is_paired or read.name != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = read.name
            read.write(outs[i])
    finally:
        # Release every handle even if reading or writing failed.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: publish the finished files.
    for tmp, final_name in fnames:
        os.rename(tmp, final_name)
def testSplitThree(self):
    # Split test.fastq into three chunks while ignoring pairing, then check
    # that every chunk file exists and holds the expected full read names.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')

    ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

    chunk_paths = ['%s.%s.fastq' % (templ, n) for n in (1, 2, 3)]
    for path in chunk_paths:
        self.assertTrue(os.path.exists(path))

    # Expected content of chunk 1, 2 and 3 respectively.
    expected = [
        ['foo /1', 'bar /2'],
        ['foo /2', 'baz /1'],
        ['bar /1', 'baz /2'],
    ]

    # Open all chunks before reading any of them.
    readers = [FASTQ(path) for path in chunk_paths]
    for reader, wanted in zip(readers, expected):
        names = [rec.fullname for rec in reader.fetch(quiet=True)]
        self.assertEqual(names, wanted)

    for reader in readers:
        reader.close()

    # Remove the generated chunk files.
    for path in chunk_paths:
        os.unlink(path)
def testSplitThree(self):
    # Split test.fastq into three chunks while ignoring pairing, then check
    # that every chunk file exists and holds the expected full read names.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')

    ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

    chunk_paths = ['%s.%s.fastq' % (templ, n) for n in (1, 2, 3)]
    for path in chunk_paths:
        self.assertTrue(os.path.exists(path))

    # Expected content of chunk 1, 2 and 3 respectively.
    expected = [
        ['foo /1', 'bar /2'],
        ['foo /2', 'baz /1'],
        ['bar /1', 'baz /2'],
    ]

    # Open all chunks before reading any of them.
    readers = [FASTQ(path) for path in chunk_paths]
    for reader, wanted in zip(readers, expected):
        names = [rec.fullname for rec in reader.fetch(quiet=True)]
        self.assertEqual(names, wanted)

    for reader in readers:
        reader.close()

    # Remove the generated chunk files.
    for path in chunk_paths:
        os.unlink(path)
def testSplitUnpaired(self):
    # Split test.fastq into two chunks while ignoring pairing, then check
    # that both chunk files exist and each holds the reads foo, bar, baz.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')

    ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

    chunk_paths = ['%s.%s.fastq' % (templ, n) for n in (1, 2)]
    for path in chunk_paths:
        self.assertTrue(os.path.exists(path))

    # Open both chunks before reading either, as the checks below expect.
    readers = [FASTQ(path) for path in chunk_paths]
    for reader in readers:
        names = [rec.name for rec in reader.fetch(quiet=True)]
        self.assertEqual(names, ['foo', 'bar', 'baz'])

    for reader in readers:
        reader.close()

    # Remove the generated chunk files.
    for path in chunk_paths:
        os.unlink(path)
if os.path.exists(arg): usage("File %s exists!" % arg) outname1 = arg elif not outname2: if os.path.exists(arg): usage("File %s exists!" % arg) outname2 = arg if not fqname1 or not fqname2 or not outname1 or not outname2: usage() fq1 = FASTQ(fqname1) fq2 = FASTQ(fqname2) if gz: out1 = gzip.open(outname1, 'w') out2 = gzip.open(outname2, 'w') else: out1 = open(outname1, 'w') out2 = open(outname2, 'w') total1, total2, matched = find_fastq_pairs(fq1, fq2, out1, out2) print "Totals: %s, %s" % (total1, total2) print "Proper pairs: %s" % matched fq1.close() fq2.close() out1.close() out2.close()
_d_file = None if discard_fname: _d_file = open(discard_fname, 'w') def _callback(name): _d_file.write('%s\n' % name[1:]) discard = _callback fq = FASTQ(fname) chain = FASTQReader(fq, veryverbose) for config in filters_config: if verbose: sys.stderr.write(config[0].__name__) sys.stderr.write('\t%s\n' % '\t'.join([str(x) for x in config[1:]])) clazz = config[0] opts = config[1:] if clazz == QualFilter: chain = clazz(chain, *opts, verbose=veryverbose, discard=discard, illumina=illumina) else: chain = clazz(chain, *opts, verbose=veryverbose, discard=discard) fastq_filter(chain) if _d_file: _d_file.close() fq.close()
# All four filenames are required; bail out with the usage text otherwise.
if not (fqname1 and fqname2 and outname1 and outname2):
    usage()

# Refuse to clobber existing outputs unless overwriting was requested.
if not force:
    for fname in (outname1, outname2):
        if os.path.exists(fname):
            usage("File %s exists!" % fname)

fq1 = FASTQ(fqname1)
fq2 = FASTQ(fqname2)

# Pick the writer once instead of branching per file.
_open_out = gzip.open if gz else open
out1 = _open_out(outname1, 'w')
out2 = _open_out(outname2, 'w')

paired, discard_1, discard_2 = find_fastq_pairs(fq1, fq2, out1, out2, tmpdir)

# Single-argument parenthesised form prints the same text on Python 2.
print("Proper pairs: %s" % paired)
print("Discarded 1 : %s" % discard_1)
print("Discarded 2 : %s" % discard_2)

fq1.close()
fq2.close()
out1.close()
out2.close()
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads that occur in both FASTQ inputs to out1/out2.

    Each input is first sorted with ``ngsutils.fastq.sort.fastq_sort`` into
    a gzip-compressed temporary file (created in ``tmpdir`` or, when that is
    not given, in the directory of the respective input).  The two sorted
    files are then walked in lock-step: reads with equal names are written
    to ``out1`` / ``out2``; a read whose name compares lower than the
    current read of the other file has no partner and is dropped.

    Arguments:
        fq1, fq2   -- FASTQ inputs (must expose ``fname``)
        out1, out2 -- open handles receiving the paired reads
        tmpdir     -- directory for temporary files (optional)
        quiet      -- passed to ``fetch`` of the first sorted file

    Returns:
        (pairs, discarded_1, discarded_2) -- the number of pairs written and
        the number of unpaired reads dropped from each input, including the
        reads left over once the other input is exhausted.
    """
    tmp_fnames = []
    fq_tmps = []

    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        # Sort each input by read name into its own temporary gzip file.
        for fq in (fq1, fq2):
            workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(
                delete=False, prefix='.tmp', suffix='.gz', dir=workdir)
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(
                        fq, out=tmp_out, tmpdir=workdir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmps.append(FASTQ(tmp_fnames[0]))
        fq_tmps.append(FASTQ(tmp_fnames[1]))

        reader1 = fq_tmps[0].fetch(quiet=quiet)
        reader2 = fq_tmps[1].fetch(quiet=True)

        # next(it, None) yields None at the end instead of raising, so an
        # empty input no longer aborts with StopIteration.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count before advancing so the final pair is included.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)

        # Whatever is left in either file has no partner.
        while read1 is not None:
            discarded_1 += 1
            read1 = next(reader1, None)
        while read2 is not None:
            discarded_2 += 1
            read2 = next(reader2, None)
    finally:
        # Clean up readers and temporary files even on failure.
        for fq_tmp in fq_tmps:
            fq_tmp.close()
        for tmp_fname in tmp_fnames:
            if os.path.exists(tmp_fname):
                os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads that occur in both FASTQ inputs to out1/out2.

    Each input is first sorted with ``ngsutils.fastq.sort.fastq_sort`` into
    a gzip-compressed temporary file (created in ``tmpdir`` or, when that is
    not given, in the directory of the respective input).  The two sorted
    files are then walked in lock-step: reads with equal names are written
    to ``out1`` / ``out2``; a read whose name compares lower than the
    current read of the other file has no partner and is dropped.

    Arguments:
        fq1, fq2   -- FASTQ inputs (must expose ``fname``)
        out1, out2 -- open handles receiving the paired reads
        tmpdir     -- directory for temporary files (optional)
        quiet      -- passed to ``fetch`` of the first sorted file

    Returns:
        (pairs, discarded_1, discarded_2) -- the number of pairs written and
        the number of unpaired reads dropped from each input, including the
        reads left over once the other input is exhausted.
    """
    tmp_fnames = []
    fq_tmps = []

    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        # Sort each input by read name into its own temporary gzip file.
        for fq in (fq1, fq2):
            workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(
                delete=False, prefix='.tmp', suffix='.gz', dir=workdir)
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(
                        fq, out=tmp_out, tmpdir=workdir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmps.append(FASTQ(tmp_fnames[0]))
        fq_tmps.append(FASTQ(tmp_fnames[1]))

        reader1 = fq_tmps[0].fetch(quiet=quiet)
        reader2 = fq_tmps[1].fetch(quiet=True)

        # next(it, None) yields None at the end instead of raising, so an
        # empty input no longer aborts with StopIteration.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count before advancing so the final pair is included.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)

        # Whatever is left in either file has no partner.
        while read1 is not None:
            discarded_1 += 1
            read1 = next(reader1, None)
        while read2 is not None:
            discarded_2 += 1
            read2 = next(reader2, None)
    finally:
        # Clean up readers and temporary files even on failure.
        for fq_tmp in fq_tmps:
            fq_tmp.close()
        for tmp_fname in tmp_fnames:
            if os.path.exists(tmp_fname):
                os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
for read in fastq.fetch(quiet=quiet): seq = ngsutils.support.revcomp(read.seq) qual = read.qual[::-1] read.clone(seq=seq, qual=qual).write(out) def usage(): print __doc__ print "Usage: fastqutils revcomp filename.fastq{.gz}" sys.exit(1) if __name__ == '__main__': fname = None for arg in sys.argv[1:]: if arg == '-h': usage() if not fname and os.path.exists(arg): fname = arg if not fname: usage() fq = FASTQ(fname) fastq_revcomp(fq) fq.close()
# Command-line handling: the first non-flag argument is the output BAM,
# the following existing files are the read-1 and (optional) read-2 FASTQ.
outname = None
read1_fname = None
read2_fname = None
force = False

for arg in sys.argv[1:]:
    if arg == "-f":
        force = True
    elif not outname:
        # -f must precede the output name to allow overwriting it.
        if not force and os.path.exists(arg):
            usage("Output file exists! (Use -f to force overwriting): %s" % arg)
        outname = arg
    elif not read1_fname and os.path.exists(arg):
        read1_fname = arg
    elif not read2_fname and os.path.exists(arg):
        read2_fname = arg

if not outname or not read1_fname:
    usage()

read1 = FASTQ(read1_fname)
read2 = FASTQ(read2_fname) if read2_fname else None

bam = pysam.Samfile(outname, "wb")
export_bam(bam, read1, read2)
bam.close()

read1.close()
# read2 is None for single-end input; calling close() on it would raise.
if read2 is not None:
    read2.close()
# Command-line handling: the first non-flag argument is the output BAM,
# the following existing files are the read-1 and (optional) read-2 FASTQ.
# NOTE(review): 'outname' is read below but not initialised in this
# span -- presumably it is set to None just before; confirm.
read1_fname = None
read2_fname = None
force = False

for arg in sys.argv[1:]:
    if arg == '-f':
        force = True
    elif not outname:
        # -f must precede the output name to allow overwriting it.
        if not force and os.path.exists(arg):
            usage('Output file exists! (Use -f to force overwriting): %s' % arg)
        outname = arg
    elif not read1_fname and os.path.exists(arg):
        read1_fname = arg
    elif not read2_fname and os.path.exists(arg):
        read2_fname = arg

if not outname or not read1_fname:
    usage()

read1 = FASTQ(read1_fname)
read2 = FASTQ(read2_fname) if read2_fname else None

bam = pysam.Samfile(outname, 'wb')
export_bam(bam, read1, read2)
bam.close()

read1.close()
# read2 is None for single-end input; calling close() on it would raise.
if read2 is not None:
    read2.close()