def fastq_sort(fastq, byname=True, bysequence=False, tmpdir=None, chunksize=100000, out=sys.stdout, quiet=False):
    """Externally merge-sort a FASTQ file into *out*.

    Reads are buffered in chunks of *chunksize*, each chunk is sorted and
    written to a temporary file by ``_write_tmp``, and the chunks are then
    k-way merged in sorted order.

    Sort key: the read sequence when *bysequence* is True, otherwise the
    read name when *byname* is True (the default).  If both flags are set,
    *bysequence* wins — the original code appended the read under BOTH
    keys, silently duplicating every read in the output.

    :param fastq:      FASTQ source providing ``fetch(quiet)`` over reads
    :param byname:     sort by read name (default)
    :param bysequence: sort by read sequence (takes precedence over byname)
    :param tmpdir:     unused here; kept for interface compatibility
    :param chunksize:  number of reads held in memory per chunk
    :param out:        file-like object the sorted FASTQ is written to
    :param quiet:      passed through to ``fastq.fetch``
    """
    tmpfiles = []
    chunk = []
    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        if bysequence:
            chunk.append((read.seq, read))
        elif byname:
            chunk.append((read.name, read))
        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk))
            chunk = []
    if chunk:
        tmpfiles.append(_write_tmp(chunk))

    sys.stderr.write('Merging chunks...\n')

    # buf[i] holds the next (key, i, read) from chunk i, or None when the
    # slot needs a refill; skip[i] marks an exhausted chunk.
    buf = [None] * len(tmpfiles)
    skip = [False] * len(tmpfiles)
    eta = ETA(count)
    j = 0
    writing = True
    while writing:
        j += 1
        eta.print_status(j)
        # NOTE(review): the merge iterates tmpfiles directly as file objects,
        # so _write_tmp presumably returns open, rewound handles — confirm.
        for i, fobj in enumerate(tmpfiles):
            if buf[i] is None and not skip[i]:
                try:
                    read = fastq_read_file(fobj)
                    if bysequence:
                        buf[i] = (read.seq, i, read)
                    else:
                        buf[i] = (read.name, i, read)
                except Exception:
                    # Chunk exhausted (EOF) — stop polling this file.
                    # Was a bare except:, which also trapped KeyboardInterrupt.
                    buf[i] = None
                    skip[i] = True
        # Emit the smallest pending read.  Filtering out None first avoids
        # the Python-3 TypeError from sorting None alongside tuples, and
        # min() is O(k) versus the original full sort per emitted read.
        pending = [t for t in buf if t is not None]
        writing = False
        if pending:
            sorter, i, read = min(pending)
            read.write(out)
            buf[i] = None
            writing = True
    eta.done()
def fastq_sort(fastq, bysequence=False, tmpdir=None, tmpprefix='.tmp', chunksize=100000, nogz=False, out=sys.stdout, quiet=False):
    """Externally merge-sort a FASTQ file into *out*.

    Reads are buffered in chunks of *chunksize*, each chunk is sorted and
    written to a temporary file by ``_write_tmp`` (gzip-compressed unless
    *nogz*), and the chunks are then k-way merged in sorted order.  The
    temporary files are always closed and unlinked, even if the merge fails.

    :param fastq:      FASTQ source providing ``fetch(quiet)`` over reads
    :param bysequence: sort by read sequence instead of read name
    :param tmpdir:     directory for temp files (passed to ``_write_tmp``)
    :param tmpprefix:  temp filename prefix (passed to ``_write_tmp``)
    :param chunksize:  number of reads held in memory per chunk
    :param nogz:       write/read temp chunks uncompressed
    :param out:        file-like object the sorted FASTQ is written to
    :param quiet:      passed through to ``fastq.fetch``
    """
    tmpfiles = []
    chunk = []
    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        if bysequence:
            chunk.append((read.seq, read))
        else:
            chunk.append((read.name, read))
        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
            chunk = []
    if chunk:
        tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

    sys.stderr.write('\nMerging chunks...\n')
    sys.stderr.flush()

    # buf[i] holds the next (key, i, read) from chunk i, or None when the
    # slot needs a refill; skip[i] marks an exhausted chunk.
    buf = [None] * len(tmpfiles)
    skip = [False] * len(tmpfiles)
    eta = ETA(count)

    opener = open if nogz else gzip.open
    tmpfobjs = [opener(x) for x in tmpfiles]
    try:
        j = 0
        writing = True
        while writing:
            j += 1
            eta.print_status(j)
            for i, fobj in enumerate(tmpfobjs):
                if buf[i] is None and not skip[i]:
                    try:
                        read = fastq_read_file(fobj)
                        if bysequence:
                            buf[i] = (read.seq, i, read)
                        else:
                            buf[i] = (read.name, i, read)
                    except Exception:
                        # Chunk exhausted (EOF) — stop polling this file.
                        # Was a bare except:, which also trapped KeyboardInterrupt.
                        buf[i] = None
                        skip[i] = True
            # Emit the smallest pending read.  Filtering out None first
            # avoids the Python-3 TypeError from sorting None alongside
            # tuples, and min() is O(k) versus a full sort per read.
            pending = [t for t in buf if t is not None]
            writing = False
            if pending:
                sorter, i, read = min(pending)
                read.write(out)
                buf[i] = None
                writing = True
        eta.done()
    finally:
        # Original only cleaned up on success, leaking temp files on error.
        for fobj in tmpfobjs:
            fobj.close()
        for tmpfile in tmpfiles:
            os.unlink(tmpfile)
def fastq_sort(fastq, bysequence=False, tmpdir=None, tmpprefix='.tmp', chunksize=100000, nogz=False, out=sys.stdout, quiet=False):
    """Externally merge-sort a FASTQ file into *out*.

    Reads are collected into sorted chunks of *chunksize* which are
    spilled to temporary files via ``_write_tmp`` (gzip-compressed unless
    *nogz*), then merged back in order.  Temp files are closed and removed
    in a ``finally`` block so they are cleaned up even on failure.

    :param fastq:      FASTQ source providing ``fetch(quiet)`` over reads
    :param bysequence: sort by read sequence instead of read name
    :param tmpdir:     directory for temp files (passed to ``_write_tmp``)
    :param tmpprefix:  temp filename prefix (passed to ``_write_tmp``)
    :param chunksize:  number of reads held in memory per chunk
    :param nogz:       write/read temp chunks uncompressed
    :param out:        file-like object the sorted FASTQ is written to
    :param quiet:      passed through to ``fastq.fetch``
    """
    tmpfiles = []
    chunk = []
    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        key = read.seq if bysequence else read.name
        chunk.append((key, read))
        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
            chunk = []
    if chunk:
        tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

    sys.stderr.write('\nMerging chunks...\n')
    sys.stderr.flush()

    # heads[i] is the next (key, i, read) from chunk i, or None when that
    # slot must be refilled; exhausted[i] marks a fully-consumed chunk.
    heads = [None] * len(tmpfiles)
    exhausted = [False] * len(tmpfiles)
    eta = ETA(count)

    opener = open if nogz else gzip.open
    tmpfobjs = [opener(x) for x in tmpfiles]
    try:
        emitted = 0
        while True:
            emitted += 1
            eta.print_status(emitted)
            for i, fobj in enumerate(tmpfobjs):
                if heads[i] is None and not exhausted[i]:
                    try:
                        read = fastq_read_file(fobj)
                        key = read.seq if bysequence else read.name
                        heads[i] = (key, i, read)
                    except Exception:
                        # EOF on this chunk — stop polling it.  The
                        # original bare except: also trapped KeyboardInterrupt.
                        heads[i] = None
                        exhausted[i] = True
            # Emit the smallest pending read.  Filtering out None avoids
            # the Python-3 TypeError of sorting None against tuples, and
            # min() is O(k) versus the original per-read full sort.
            pending = [t for t in heads if t is not None]
            if not pending:
                break
            key, i, read = min(pending)
            read.write(out)
            heads[i] = None
        eta.done()
    finally:
        # Original only cleaned up on success, leaking temp files on error.
        for fobj in tmpfobjs:
            fobj.close()
        for tmpfile in tmpfiles:
            os.unlink(tmpfile)