def writeKmersAndCounts(z, xs, nm=None):
    """Write a combined k-mer/count stream into the container z.

    The stream xs is handed to demuxKmersAndCounts together with an open
    scratch file; the stream that call returns is written with writeKmers
    under the k-mer entry name, and the scratch file is afterwards added to
    z under the counts entry name.

    Args:
        z: container object; must provide add_file(name, path) and is also
            passed to writeKmers.
        xs: iterable passed to demuxKmersAndCounts (presumably
            (kmer, count) pairs -- confirm against demuxKmersAndCounts).
        nm: optional entry-name prefix.  With None the entries are called
            'kmers' and 'counts'; otherwise nm + '-kmers' and
            nm + '-counts'.
    """
    if nm is None:
        xNm, cNm = 'kmers', 'counts'
    else:
        xNm, cNm = nm + '-kmers', nm + '-counts'

    t = tmpfile()
    try:
        with open(t, 'w') as f:
            ys = demuxKmersAndCounts(xs, f)
            writeKmers(z, ys, xNm)
        # The scratch file is closed by now, so add_file sees all of it.
        z.add_file(cNm, t)
    finally:
        # Clean up the scratch file even if one of the steps above raised.
        if os.path.exists(t):
            os.remove(t)
def test_rwVector():
    """Round-trip a vector of seeded random integers through a casket."""
    N = 65536
    random.seed(17)
    expected = [int(0.5 + 10 * random.expovariate(0.5)) for _ in xrange(N)]

    with autoremove():
        path = tmpfile()

        with casket(path, 'w') as z:
            writeVector(z, expected, 'quux')

        with casket(path, 'r') as z:
            actual = readVectorList(z, 'quux')

        assert len(expected) == len(actual)
        for i, want in enumerate(expected):
            assert want == actual[i]
def test_kmersList():
    """Round-trip a sorted list of seeded random 25-mers through a casket."""
    K = 25
    M = (1 << (2 * K)) - 1
    N = 65536
    random.seed(17)
    expected = sorted(random.randint(0, M) for _ in xrange(N))

    with autoremove():
        path = tmpfile()

        with casket(path, 'w') as z:
            # The writer receives a copy of the list, not the list itself.
            writeKmersList(z, list(expected))

        with casket(path, 'r') as z:
            actual = list(readKmers(z))

        assert len(expected) == len(actual)
        for i, (want, got) in enumerate(zip(expected, actual)):
            assert want == got, '%d\t%s\t%s' % (i, render(K, want), render(K, got))
def mergeNinto(K, xss, hist, acgt, z, nm = None):
    """Merge several k-mer/count streams into z, summing counts of equal k-mers.

    Each input in xss is wrapped in a _kmerRadixBlockStream and the wrappers
    are placed in a heap.  All blocks sharing the radix of the heap's front
    element are accumulated into one dictionary, which is then emitted in
    sorted k-mer order before moving on to the next radix.

    Args:
        K: k-mer length, passed through to _kmerRadixBlockStream.
        xss: sequence of input streams, one per source to merge.
        hist: dict updated in place; hist[c] is incremented once for every
            emitted k-mer whose merged count is c.
        acgt: 4-element list updated in place; the merged count of every
            emitted k-mer is added to the slot selected by the k-mer's low
            two bits (presumably its final base -- confirm the encoding).
        z: output container providing add_stream(name) and
            add_file(name, path).
        nm: optional entry-name prefix.  With None the entries are called
            'kmers' and 'counts'; otherwise nm + '-kmers' and
            nm + '-counts'.
    """
    if nm is None:
        knm = 'kmers'
        cnm = 'counts'
    else:
        knm = nm + '-kmers'
        cnm = nm + '-counts'

    # Counts are staged in a scratch file while k-mers stream straight into z.
    t = tmpfile()
    try:
        with z.add_stream(knm) as kf, open(t, 'w') as cf:
            with kmerWriter(kf) as kx, countsWriter(cf) as cx:
                ss = [_kmerRadixBlockStream(K, xs) for xs in xss]
                q = heap(ss)
                while len(q) > 0:
                    v = q.front()
                    r = v.radix
                    ys = {}
                    # Drain every stream whose current block has radix r.
                    while len(q) > 0 and v.radix == r:
                        for (x, c) in v.xs:
                            ys[x] = c + ys.get(x, 0)
                        v.next()
                        if v.done():
                            q.pop()
                            if len(q) > 0:
                                v = q.front()
                        else:
                            # v moved to its next block: restore heap order.
                            q.modifyfront()
                            v = q.front()
                    for (x, c) in sorted(ys.items()):
                        hist[c] = 1 + hist.get(c, 0)
                        acgt[x&3] += c
                        kx.append(x)
                        cx.append(c)
        z.add_file(cnm, t)
    finally:
        # Remove the scratch counts file even when the merge fails.
        if os.path.exists(t):
            os.remove(t)
def __init__(self, z, zfn, comp):
    """Remember the constructor arguments and open a scratch file for writing.

    z, zfn and comp are stored unchanged on the instance.  A temporary file
    name with the suffix '.szf' is obtained from tmpfile(), kept as
    self.tfn, and opened in text write mode as self.tf.
    """
    self.z, self.zfn, self.comp = z, zfn, comp

    scratch = tmpfile('.szf')
    self.tfn = scratch
    self.tf = open(scratch, 'w')
def _appendFastqRecords(path, records):
    """Append each 4-field record to the file at path, one field per line."""
    with open(path, 'a') as out:
        for rd in records:
            out.write('%s\n%s\n%s\n%s\n' % (rd[0], rd[1], rd[2], rd[3]))


def _flushPairCache(paths, cached):
    """Append the cached mates of one bait to its two temp files, then empty the cache."""
    _appendFastqRecords(paths[0], cached[0])
    _appendFastqRecords(paths[1], cached[1])
    cached[0] = []
    cached[1] = []


def _collectBaitHits(K, seq, idx, anti, hits):
    """Add the baits sharing a k-mer with seq to hits; return True if seq hits anti."""
    for x in kmersList(K, seq):
        if x in anti:
            return True
        hits.update(idx.get(x, ()))
    return False


def main(argv):
    """Sort paired reads into per-bait FASTQ files inside a zip archive.

    Options read from the docopt result:
        <baits>   FASTA file; every record is a bait, indexed by its 25-mers.
        -U        optional FASTA file; a read pair containing any of its
                  25-mers is discarded.
        -p        paired-input mode (the only mode implemented).
        <input>   input files, grouped into (file1, file2) pairs by pairs().
        <output>  path of the zip archive to write.

    For every bait that received at least one pair, the archive gets two
    members named '<bait name, whitespace replaced by "/">/<input file>'.
    A histogram of "number of baits hit" versus "number of read pairs" is
    printed to stdout as tab-separated lines.

    Raises:
        NotImplementedError: when -p is not given.
    """
    opts = docopt.docopt(__doc__, argv)

    K = 25

    # idx: k-mer -> sorted list of indices (into nms) of baits containing it.
    nms = []
    idx = {}
    with openFile(opts['<baits>']) as baits:
        for (nm, seq) in readFasta(baits):
            n = len(nms)
            nms.append(nm)
            for x in kmersList(K, seq, True):
                idx.setdefault(x, set()).add(n)
    for x in idx:
        idx[x] = sorted(idx[x])

    # anti: k-mers that disqualify a read pair outright.
    anti = set()
    if opts['-U']:
        with openFile(opts['-U']) as f:
            for (nm, seq) in readFasta(f):
                anti.update(kmersList(K, seq, True))

    if not opts['-p']:
        # A string cannot be raised as an exception; use a real exception type.
        raise NotImplementedError("not implemented")

    hist = {}
    # The first pair creates the archive; later pairs must append to it,
    # otherwise each pair would overwrite the members written before it.
    zipMode = 'w'
    for (fn1, fn2) in pairs(opts['<input>']):
        tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq')) for _ in nms]
        cache = [[[], []] for _ in nms]
        counts = [0] * len(nms)

        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                hits = set()
                if (_collectBaitHits(K, fq1[1], idx, anti, hits)
                        or _collectBaitHits(K, fq2[1], idx, anti, hits)):
                    continue

                n = len(hits)
                hist[n] = 1 + hist.get(n, 0)
                for i in hits:
                    counts[i] += 1
                    cache[i][0].append(fq1)
                    cache[i][1].append(fq2)
                    # Spill to disk in batches to bound memory use.
                    if len(cache[i][0]) >= 1024:
                        _flushPairCache(tmps[i], cache[i])

        for paths, cached in zip(tmps, cache):
            if len(cached[0]) > 0:
                _flushPairCache(paths, cached)

        with zipfile.ZipFile(opts['<output>'], zipMode, zipfile.ZIP_DEFLATED) as z:
            for nm, paths, cnt in zip(nms, tmps, counts):
                if cnt > 0:
                    pth = '/'.join(nm.split())
                    z.write(paths[0], pth + '/' + fn1)
                    os.remove(paths[0])
                    z.write(paths[1], pth + '/' + fn2)
                    os.remove(paths[1])
        zipMode = 'a'

    for (n, f) in sorted(hist.items()):
        print('%d\t%d' % (n, f))
def _spillAccumulator(z, buf, tmps):
    """Write buf's k-mers and counts to z under the next 'tmps-N' name and record that name."""
    fn = 'tmps-%d' % (len(tmps), )
    tmps.append(fn)
    writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)


def _tallyCountsThrough(xcs, h):
    """Yield the items of xcs unchanged while adding each item's count to histogram h."""
    for xc in xcs:
        # Assumes each item is a (kmer, count) pair -- TODO confirm against
        # readKmersAndCounts.
        c = xc[1]
        h[c] = 1 + h.get(c, 0)
        yield xc


def main(argv):
    """Count the k-mers of the input reads and write them to a k-mer container.

    Options read from the docopt result:
        <k>       k-mer length.
        <input>   input read files, passed to reads().
        <output>  path of the container to write.
        -v        verbose flag, passed to reads().
        -m        buffer budget in MiB (default 32); the accumulator is
                  spilled once it reports at least half of that, checked
                  every 1024 reads.
        -D, -S    when -D is given, a k-mer is accumulated only if
                  sub(S, D, kmer) is true (presumably deterministic
                  subsampling -- confirm against sub).
        -C        FASTA file; when given (and -D is not), a read is
                  accumulated only if it shares a k-mer with that file.

    Spilled buffers go to a temporary casket and are merged at the end.
    The output metadata records K, the entry names, the count histogram,
    the relative frequencies of the low two bits over all input k-mers, and
    the number of reads.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']
    K = int(opts['<k>'])
    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])

    # Memoised outcomes of sub(); both sets are reset when they grow large.
    cacheYes = set()
    cacheNo = set()

    B = opts['-C']
    if B is not None:
        xs = set()
        for (nm, seq) in readFasta(openFile(B)):
            xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            nr = 0
            for itm in reads(opts['<input>'], K=K, pairs=False, reads=False, kmers=True, both=True, verbose=verbose):
                xs = itm.kmers[0]
                for x in xs:
                    acgt[x & 3] += 1

                if d is not None:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                    if len(cacheYes) > 1000000:
                        cacheYes = set()
                    if len(cacheNo) > 1000000:
                        cacheNo = set()
                elif B is not None:
                    if any(x in B for x in xs):
                        buf.addList(xs)
                else:
                    buf.addList(xs)

                nr += 1
                if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                    _spillAccumulator(z, buf, tmps)
                    buf.clear()

            # Once anything has been spilled, the remainder is spilled too so
            # that the final output is produced purely from the temp casket.
            if len(tmps) and len(buf):
                _spillAccumulator(z, buf, tmps)
                buf = []

        with zotk.kmers(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                for c in buf.countsOnly():
                    h[c] = 1 + h.get(c, 0)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
            elif len(tmps) == 1:
                with casket(tmpnm, 'r') as z0:
                    # Tally the histogram while copying; without this the
                    # stored histogram would stay empty in this branch.
                    xcs = _tallyCountsThrough(readKmersAndCounts(z0, tmps[0]), h)
                    writeKmersAndCounts(z, xcs)
            else:
                with casket(tmpnm, 'r') as z0:
                    xss = [readKmersAndCounts(z0, t) for t in tmps]
                    # NOTE(review): called with four arguments (no acgt
                    # list); confirm this matches the signature of the
                    # mergeNinto that is in scope for this module.
                    mergeNinto(K, xss, h, z)

            total = float(sum(acgt))
            if total > 0:
                acgt = [c / total for c in acgt]
            else:
                # No k-mers were seen: report zero frequencies, not a crash.
                acgt = [0.0 for _ in acgt]

            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        # Remove the temp casket on every exit path.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
def _agreedK(K, zs):
    """Return the K shared by K (if not None) and every container in zs; exit(1) on a mismatch."""
    for z0 in zs:
        K0 = z0.meta['K']
        if K is None:
            K = K0
        elif K0 != K:
            sys.stderr.write("mismatched K\n")
            sys.exit(1)
    return K


def _asFrequencies(acgt):
    """Return acgt divided by its sum, or all zeros when the sum is zero."""
    total = float(sum(acgt))
    if total == 0:
        return [0.0 for _ in acgt]
    return [c / total for c in acgt]


def main(argv):
    """Merge k-mer count containers into a single output container.

    Options read from the docopt result:
        <input>   input containers; pairs() groups them into items of one
                  or two paths each.
        <output>  path of the container to write.

    With a single item the input(s) are streamed (and, for two files,
    merged) straight into the output.  With several items each one is first
    copied or pair-merged into a temporary casket, and the staged streams
    are then combined with mergeNinto.  All inputs must have the same K;
    otherwise "mismatched K" is written to stderr and the process exits
    with status 1.  The output metadata records K, the entry names, the
    count histogram and the normalised acgt frequencies.
    """
    opts = docopt.docopt(__doc__, argv)

    out = opts['<output>']
    px = list(pairs(opts['<input>']))

    if len(px) == 1:
        ix = px[0]
        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            if len(ix) == 1:
                with kmers(ix[0], 'r') as z0:
                    K = _agreedK(None, [z0])
                    zs = hist(readKmersAndCounts(z0), h, acgt)
                    # Write the stream returned by hist(), as the two-file
                    # branch does; writing the raw input bypasses hist().
                    writeKmersAndCounts(z, zs)
            else:
                with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                    K = _agreedK(None, [z0, z1])
                    xs = readKmersAndCounts(z0)
                    ys = readKmersAndCounts(z1)
                    zs = hist(merge(xs, ys), h, acgt)
                    writeKmersAndCounts(z, zs)
            # Same metadata keys as the multi-item path below.
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = _asFrequencies(acgt)
        return

    K = None
    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        # Stage one stream per input item in the temp casket.
        with casket(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                if len(ix) == 1:
                    with kmers(ix[0], 'r') as z0:
                        K = _agreedK(K, [z0])
                        xs = readKmersAndCounts(z0)
                        writeKmersAndCounts(z, xs, nm)
                else:
                    with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                        K = _agreedK(K, [z0, z1])
                        xs = readKmersAndCounts(z0)
                        ys = readKmersAndCounts(z1)
                        writeKmersAndCounts(z, merge(xs, ys), nm)

        assert K is not None

        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, acgt, z)
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = _asFrequencies(acgt)
    finally:
        # Also runs on sys.exit() from a K mismatch, so the casket never leaks.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)