def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Split every read of a FASTQ file into fixed-length tiles.

    Tile number k of a read covers bases [k * offset, k * offset + length)
    and is written to the k-th output file, so each output file collects the
    tiles that start at the same position. Output files are created lazily
    through _open_file(), which hands back a file object together with a
    temporary name and a final name; the temporary files are renamed to
    their final names only after every read has been processed.

    Arguments:
        fname   -- path of the input FASTQ file
        outbase -- base name handed to _open_file() for each output file
        length  -- number of bases in each tile
        offset  -- distance between the start of consecutive tiles (> 0)
        gz      -- forwarded to _open_file()
        quiet   -- forwarded to _open_file() and to FASTQ.fetch()

    Raises:
        ValueError -- if offset is not positive (the tiling loop would never
                      advance and would keep opening new output files)
    """
    if offset < 1:
        raise ValueError('offset must be a positive integer: %s' % offset)

    fastq = FASTQ(fname)
    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            read_len = len(read.seq)
            out_idx = 0
            pos = 0

            # "<=" so that a tile ending exactly on the last base is kept;
            # a read of exactly `length` bases yields one tile.
            while pos + length <= read_len:
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                tile = read.subseq(pos, pos + length, comment="#tile:%s,%s" % (pos, pos + length))
                tile.write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Always release the handles, even when reading or writing failed.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: publish the finished files.
    for tmp, fname in fnames:
        os.rename(tmp, fname)
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Split a combined FASTQ file into one file per read of each fragment.

    Consecutive reads that share a name are treated as belonging together:
    the first goes to '<out_template>.1.fastq', the next one with the same
    name to '<out_template>.2.fastq', and so on. Additional output files are
    opened the first time they are needed. With gz=True the outputs are
    gzip-compressed and carry a '.gz' suffix.
    """
    def _open_output(number):
        # Output files are numbered from 1.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, number), 'w')
        return open('%s.%s.fastq' % (out_template, number), 'w')

    # The first output always exists, even for an empty input.
    outs = [_open_output(1)]

    outidx = 1
    previous = None
    fq = FASTQ(combined_fname)

    for read in fq.fetch():
        if previous and previous.name == read.name:
            # Same name as the preceding read: move on to the next output.
            outidx += 1
            if len(outs) < outidx:
                outs.append(_open_output(outidx))
        else:
            # New name: start again at the first output.
            outidx = 1

        read.write(outs[outidx - 1])
        previous = read

    fq.close()

    for handle in outs:
        handle.close()
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    """Distribute the reads of a FASTQ file round-robin over `chunks` files.

    Outputs are named '<outbase>.<n>.fastq' (plus '.gz' when gz=True), with
    n counted from 1. Each one is written under a hidden '.tmp.<basename>'
    name in the same directory and renamed once all reads are written.

    Unless ignore_pairs is set, the input's is_paired attribute decides
    whether consecutive reads with the same name stay in the same output.
    count_fname is accepted but not used by this function.
    """
    fastq = FASTQ(fname)

    # is_paired is only consulted when pairing has not been disabled.
    is_paired = False if ignore_pairs else fastq.is_paired

    extension = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    for chunk_no in xrange(1, chunks + 1):
        fn = '%s.%s.%s' % (outbase, chunk_no, extension)
        tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
        fnames.append((tmp, fn))

        if not quiet:
            sys.stderr.write('Output file: %s\n' % fn)

        outs.append(opener(tmp, 'w'))

    # Start past the end so the first read wraps around to output 0.
    i = chunks
    last_name = None

    for read in fastq.fetch(quiet=quiet):
        # Advance for every read when unpaired, otherwise only on a new name.
        if not is_paired or read.name != last_name:
            i += 1

        if i >= len(outs):
            i = 0

        last_name = read.name
        read.write(outs[i])

    for handle in outs:
        handle.close()

    fastq.close()

    for tmp, fname in fnames:
        os.rename(tmp, fname)
def testFQRead(self):
    # Convert a single in-memory FASTQ record with fastq_convertqual() and
    # check that name and sequence survive while the quality string is
    # rewritten to the expected characters.
    source = StringIO.StringIO('''\
@foo
ACGTacgtACGT
+
CDEFGHIJKLMN
''')
    converted = StringIO.StringIO('')

    ngsutils.fastq.convertqual.fastq_convertqual(FASTQ(fileobj=source), out=converted, quiet=True)

    # Rewind so the converted record can be parsed back in.
    converted.seek(0)
    reparsed = FASTQ(fileobj=converted)
    first = reparsed.fetch().next()

    self.assertEqual(first.name, 'foo')
    self.assertEqual(first.seq, 'ACGTacgtACGT')
    self.assertEqual(first.qual, "$%&'()*+,-./")
def assert_fastq_contains(self, base, args):
    """Assert that each FASTQ file holds exactly the expected reads.

    Arguments:
        base -- filename template; 'base % tag' gives the file to check
        args -- dict mapping tag -> (names, seqs, quals), each a
                whitespace-separated string. When seqs is empty/None only
                the read names are checked.

    Fails if a file contains a read whose name is not expected, if the
    sequence/quality of a read differs from the expected values, or if the
    number of expected-name reads differs from the number of expected names.
    """
    for tag in args:
        valid = args[tag][0].split()

        # Optional per-read (sequence, quality) expectations, keyed by name.
        seq_qual = {}
        if args[tag][1]:
            for n, s, q in zip(valid, args[tag][1].split(), args[tag][2].split()):
                seq_qual[n] = (s, q)

        fq = FASTQ(base % tag)
        count = 0
        try:
            for read in fq.fetch():
                if read.name not in valid:
                    self.fail('extra read in %s: %s' % (tag, read.name))

                count += 1
                if seq_qual:
                    self.assertEqual(seq_qual[read.name], (read.seq, read.qual))
        finally:
            # Release the file even when an assertion above fails.
            fq.close()

        self.assertEqual(count, len(valid))
def testSplitUnpaired(self):
    # Split test.fastq into two chunks with pairing ignored and check that
    # the reads alternate between the outputs. The generated files are
    # removed in a finally block so a failing assertion leaves nothing behind.
    fname = os.path.join(os.path.dirname(__file__), 'test.fastq')
    templ = os.path.join(os.path.dirname(__file__), 'test_templ')
    out_fnames = ['%s.%s.fastq' % (templ, n) for n in (1, 2)]

    try:
        ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

        for out_fname in out_fnames:
            self.assertTrue(os.path.exists(out_fname))

        for out_fname in out_fnames:
            fq = FASTQ(out_fname)
            try:
                names = [x.name for x in fq.fetch(quiet=True)]
            finally:
                fq.close()
            self.assertEqual(names, ['foo', 'bar', 'baz'])
    finally:
        for out_fname in out_fnames:
            # The file may be missing if the split itself failed.
            if os.path.exists(out_fname):
                os.unlink(out_fname)
# Command-line handling: [-f] <output.bam> <read1.fastq> [<read2.fastq>]
outname = None
read1_fname = None
read2_fname = None
force = False

for arg in sys.argv[1:]:
    if arg == "-f":
        force = True
    elif not outname:
        # The first non-flag argument is the output file; refuse to clobber
        # it unless -f was given earlier on the command line.
        if not force and os.path.exists(arg):
            usage("Output file exists! (Use -f to force overwriting): %s" % arg)
        outname = arg
    elif not read1_fname and os.path.exists(arg):
        read1_fname = arg
    elif not read2_fname and os.path.exists(arg):
        read2_fname = arg

if not outname or not read1_fname:
    usage()

read1 = FASTQ(read1_fname)
# The second FASTQ file is optional.
read2 = FASTQ(read2_fname) if read2_fname else None

bam = pysam.Samfile(outname, "wb")
export_bam(bam, read1, read2)
bam.close()

read1.close()
# read2 is None when only one input was given; closing it unconditionally
# raised AttributeError after the output had been written.
if read2 is not None:
    read2.close()
if not os.path.exists(arg): usage("File %s doesn't exist!" % arg) fqname2 = arg elif not outname1: if os.path.exists(arg): usage("File %s exists!" % arg) outname1 = arg elif not outname2: if os.path.exists(arg): usage("File %s exists!" % arg) outname2 = arg if not fqname1 or not fqname2 or not outname1 or not outname2: usage() fq1 = FASTQ(fqname1) fq2 = FASTQ(fqname2) if gz: out1 = gzip.open(outname1, 'w') out2 = gzip.open(outname2, 'w') else: out1 = open(outname1, 'w') out2 = open(outname2, 'w') total1, total2, matched = find_fastq_pairs(fq1, fq2, out1, out2) print "Totals: %s, %s" % (total1, total2) print "Proper pairs: %s" % matched fq1.close()
fname = arg if not fname or not filters_config: usage() discard = None _d_file = None if discard_fname: _d_file = open(discard_fname, 'w') def _callback(name): _d_file.write('%s\n' % name[1:]) discard = _callback fq = FASTQ(fname) chain = FASTQReader(fq, veryverbose) for config in filters_config: if verbose: sys.stderr.write(config[0].__name__) sys.stderr.write('\t%s\n' % '\t'.join([str(x) for x in config[1:]])) clazz = config[0] opts = config[1:] if clazz == QualFilter: chain = clazz(chain, *opts, verbose=veryverbose, discard=discard, illumina=illumina) else: chain = clazz(chain, *opts, verbose=veryverbose, discard=discard)
def testSplitThree(self):
    # Split test.fastq into three chunks with pairing ignored and check the
    # round-robin assignment of the full read names. The generated files are
    # removed in a finally block so a failing assertion leaves nothing behind.
    fname = os.path.join(os.path.dirname(__file__), 'test.fastq')
    templ = os.path.join(os.path.dirname(__file__), 'test_templ')

    expected = [
        ('%s.1.fastq' % templ, ['foo /1', 'bar /2']),
        ('%s.2.fastq' % templ, ['foo /2', 'baz /1']),
        ('%s.3.fastq' % templ, ['bar /1', 'baz /2']),
    ]

    try:
        ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

        for out_fname, _ in expected:
            self.assertTrue(os.path.exists(out_fname))

        for out_fname, fullnames in expected:
            fq = FASTQ(out_fname)
            try:
                found = [x.fullname for x in fq.fetch(quiet=True)]
            finally:
                fq.close()
            self.assertEqual(found, fullnames)
    finally:
        for out_fname, _ in expected:
            # The file may be missing if the split itself failed.
            if os.path.exists(out_fname):
                os.unlink(out_fname)
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads whose names occur in both FASTQ inputs.

    Each input is first run through ngsutils.fastq.sort.fastq_sort() into a
    gzip-compressed temporary file. The two temporary files are then walked
    in lock-step, comparing read names: equal names are written to out1 and
    out2, otherwise the read with the smaller name is dropped. (This relies
    on fastq_sort() ordering reads by name -- presumably it does; confirm
    against that module.)

    Arguments:
        fq1, fq2   -- FASTQ inputs; their .fname is used to pick the
                      temporary directory when tmpdir is not given
        out1, out2 -- writable file objects receiving the matched reads
        tmpdir     -- directory for temporary files (default: the directory
                      of the corresponding input file)
        quiet      -- forwarded to fetch() for the first sorted file

    Returns:
        (pairs, discarded_1, discarded_2): number of matched pairs written
        and number of reads dropped from each input.
    """
    tmp_fnames = []
    opened = []
    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        for fq in (fq1, fq2):
            sort_dir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(delete=False, prefix='.tmp', suffix='.gz', dir=sort_dir)
            # Remember the name right away so it is removed on any failure.
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(fq, out=tmp_out, tmpdir=sort_dir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmp1 = FASTQ(tmp_fnames[0])
        opened.append(fq_tmp1)
        fq_tmp2 = FASTQ(tmp_fnames[1])
        opened.append(fq_tmp2)

        reader1 = fq_tmp1.fetch(quiet=quiet)
        reader2 = fq_tmp2.fetch(quiet=True)

        # A None sentinel marks exhaustion, so empty inputs are handled and
        # the counters are updated before the loop can end.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count the pair as soon as it is written; the last pair of
                # the files used to be written but left uncounted.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)

        # Whatever is left in either file has no partner.
        while read1 is not None:
            discarded_1 += 1
            read1 = next(reader1, None)
        while read2 is not None:
            discarded_2 += 1
            read2 = next(reader2, None)
    finally:
        for fq_tmp in opened:
            fq_tmp.close()
        for tmp_fname in tmp_fnames:
            if os.path.exists(tmp_fname):
                os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
# NOTE(review): whitespace-collapsed excerpt of the "fastqutils merge" command-line module.
# It opens with the last visible statement of a function whose header lies outside this
# excerpt (a read.clone(...).write(out) call). It then defines usage(), which prints the
# module docstring followed by the usage text and exits with status 1, and the __main__
# entry point: '-slash' sets split_slashes, every other argument that names an existing
# path is collected as an input file (arguments that do not exist are silently ignored),
# fewer than two inputs triggers usage(), and each opened FASTQ is closed once
# fastq_merge() has returned.
read.clone(name=name, comment=comment).write(out) def usage(): print __doc__ print """Usage: fastqutils merge {-slash} file1.fastq{.gz} file2.fastq{.gz} ... -slash Split the read name at a '/' (Illumina paired format) """ sys.exit(1) if __name__ == '__main__': fnames = [] split_slashes = False for arg in sys.argv[1:]: if arg == '-slash': split_slashes = True elif os.path.exists(arg): fnames.append(arg) if len(fnames) < 2: usage() fastqs = [FASTQ(x) for x in fnames] fastq_merge(fastqs, split_slashes) for fq in fastqs: fq.close()
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads whose names occur in both FASTQ inputs.

    Each input is first run through ngsutils.fastq.sort.fastq_sort() into a
    gzip-compressed temporary file. The two temporary files are then walked
    in lock-step, comparing read names: equal names are written to out1 and
    out2, otherwise the read with the smaller name is dropped. (This relies
    on fastq_sort() ordering reads by name -- presumably it does; confirm
    against that module.)

    Arguments:
        fq1, fq2   -- FASTQ inputs; their .fname is used to pick the
                      temporary directory when tmpdir is not given
        out1, out2 -- writable file objects receiving the matched reads
        tmpdir     -- directory for temporary files (default: the directory
                      of the corresponding input file)
        quiet      -- forwarded to fetch() for the first sorted file

    Returns:
        (pairs, discarded_1, discarded_2): number of matched pairs written
        and number of reads dropped from each input.
    """
    tmp_fnames = []
    opened = []
    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        for fq in (fq1, fq2):
            sort_dir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(
                delete=False, prefix='.tmp', suffix='.gz', dir=sort_dir)
            # Remember the name right away so it is removed on any failure.
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(
                        fq, out=tmp_out, tmpdir=sort_dir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmp1 = FASTQ(tmp_fnames[0])
        opened.append(fq_tmp1)
        fq_tmp2 = FASTQ(tmp_fnames[1])
        opened.append(fq_tmp2)

        reader1 = fq_tmp1.fetch(quiet=quiet)
        reader2 = fq_tmp2.fetch(quiet=True)

        # A None sentinel marks exhaustion, so empty inputs are handled and
        # the counters are updated before the loop can end.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count the pair as soon as it is written; the last pair of
                # the files used to be written but left uncounted.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)

        # Whatever is left in either file has no partner.
        while read1 is not None:
            discarded_1 += 1
            read1 = next(reader1, None)
        while read2 is not None:
            discarded_2 += 1
            read2 = next(reader2, None)
    finally:
        for fq_tmp in opened:
            fq_tmp.close()
        for tmp_fname in tmp_fnames:
            if os.path.exists(tmp_fname):
                os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
usage("File %s doesn't exist!" % arg) fqname2 = arg elif not outname1: outname1 = arg elif not outname2: outname2 = arg if not fqname1 or not fqname2 or not outname1 or not outname2: usage() if not force: for fname in [outname1, outname2]: if os.path.exists(fname): usage("File %s exists!" % fname) fq1 = FASTQ(fqname1) fq2 = FASTQ(fqname2) if gz: out1 = gzip.open(outname1, 'w') out2 = gzip.open(outname2, 'w') else: out1 = open(outname1, 'w') out2 = open(outname2, 'w') paired, discard_1, discard_2 = find_fastq_pairs(fq1, fq2, out1, out2, tmpdir) print "Proper pairs: %s" % paired print "Discarded 1 : %s" % discard_1 print "Discarded 2 : %s" % discard_2
# NOTE(review): whitespace-collapsed excerpt of the "fastqutils names" command-line module.
# It opens with a loop whose enclosing function header lies outside this excerpt: for every
# read it writes the read name to `out`, followed by a space and the comment when
# include_comment is set and the read has a comment. It then defines usage(), which prints
# the module docstring and a usage line and exits with status 1, and the __main__ entry
# point: '-comment' sets include_comment, any other argument naming an existing path becomes
# the input file (a later one replaces an earlier one; non-existent paths are silently
# ignored), a missing input triggers usage(), and the FASTQ is closed after export_names().
for read in fastq.fetch(quiet=quiet): if include_comment: out.write('%s%s%s\n' % (read.name, ' ' if read.comment else '', read.comment)) else: out.write('%s\n' % read.name) def usage(): print __doc__ print "Usage: fastqutils names {-comment} filename.fastq{.gz}" sys.exit(1) if __name__ == '__main__': fname = None include_comment = False for arg in sys.argv[1:]: if arg == '-comment': include_comment = True elif os.path.exists(arg): fname = arg if not fname: usage() fq = FASTQ(fname) export_names(fq, include_comment) fq.close()