def main(argv):
    """Sample the counted k-mers of one container into a new container.

    Command-line options are parsed by docopt from the module docstring:

    * ``<input>`` / ``<output>`` -- source and destination containers.
    * ``-P`` -- sampling parameter handed to the sampler (default 0.01).
    * ``-D`` -- when given, ``sampleD`` is used instead of ``sampleR``.
    * ``-S`` -- seed; it seeds ``random`` for ``sampleR`` and is passed
      directly to ``sampleD`` (where it defaults to 0).

    The output metadata is a copy of the input metadata without the
    'kmers' and 'counts' entries, plus a 'hist' entry holding the dict that
    was handed to the sampler.
    """
    opts = docopt.docopt(__doc__, argv)

    prob = float(opts['-P']) if opts['-P'] is not None else 0.01
    src_name = opts['<input>']
    dst_name = opts['<output>']
    use_sample_d = opts['-D'] is not None

    with container(dst_name, 'w') as dst:
        histogram = {}
        with container(src_name, 'r') as src:
            K = src.meta['K']

            # Start from the input's metadata, minus the two entries removed
            # here (presumably re-created by writeKmersAndCounts -- confirm).
            dst.meta = src.meta.copy()
            del dst.meta['kmers']
            del dst.meta['counts']

            counted = readKmersAndCounts(src)
            if use_sample_d:
                seed = long(opts['-S']) if opts['-S'] is not None else 0
                sampled = sampleD(prob, seed, counted, histogram)
            else:
                # Only touch the global random state when a seed was given.
                if opts['-S'] is not None:
                    random.seed(long(opts['-S']))
                sampled = sampleR(prob, counted, histogram)
            writeKmersAndCounts(K, sampled, dst)
        dst.meta['hist'] = histogram
def test_merg2_0():
    """Check merge2 against a dictionary-based reference merge.

    Two sorted lists of (k-mer, count) pairs are written to one container
    under the names 'xs' and 'ys', merged with ``merge2`` into a second
    container under 'zs', and then compared with:

    * the per-k-mer sum of counts computed with a plain dict, and
    * the histogram (count -> number of k-mers) derived from that sum,
      which must equal the dict ``merge2`` filled in.

    Both temporary containers are removed afterwards, whether or not the
    assertions hold.
    """
    import os  # local import: only needed here for temp-file cleanup

    K = 27
    M = (1 << (2 * K)) - 1  # all-ones mask of 2*K bits
    N = 100000
    random.seed(17)
    xs = sorted([(random.randint(0, M), pois(10)) for i in xrange(N)])
    ys = sorted([(random.randint(0, M), pois(10)) for i in xrange(N)])

    nm0 = None
    nm1 = None
    try:
        nm0 = tmpfile()
        with container(nm0, 'w') as z:
            writeKmersAndCounts(K, xs, z, 'xs')
            writeKmersAndCounts(K, ys, z, 'ys')

        nm1 = tmpfile()
        h = {}
        with container(nm0, 'r') as z0, container(nm1, 'w') as z:
            merge2(z, K,
                   readKmersAndCounts(z0, 'xs'),
                   readKmersAndCounts(z0, 'ys'),
                   h, 'zs')

        with container(nm1, 'r') as z:
            zs = list(readKmersAndCounts(z, 'zs'))
    finally:
        # Do not leave the scratch containers behind, even on failure.
        for nm in (nm0, nm1):
            if nm is not None and os.path.exists(nm):
                os.remove(nm)

    # Reference merge: sum the counts of equal k-mers across both inputs.
    totals = {}
    for (x, c) in xs + ys:
        totals[x] = c + totals.get(x, 0)
    ws = sorted(totals.items())

    assert len(ws) == len(zs)
    for (expected, actual) in zip(ws, zs):
        assert expected == actual

    # Reference histogram: how many k-mers ended up with each count.
    h1 = {}
    for (_, c) in ws:
        h1[c] = 1 + h1.get(c, 0)

    got = sorted(h.items())
    want = sorted(h1.items())
    assert len(got) == len(want)
    for (g, w) in zip(got, want):
        assert g == w
def test_std_0():
    """Round-trip (k-mer, count) pairs through a container.

    N pairs are generated from a fixed seed, written under the name
    'wibble' with ``std.writeKmersAndCounts`` and read back with
    ``std.readKmersAndCounts``; the result must match the input pair for
    pair, in order.  The temporary container is removed at the end.
    """
    K = 27
    N = 100000
    M = (1 << (2 * K)) - 1  # all-ones mask of 2*K bits

    random.seed(17)
    written = []
    for _ in xrange(N):
        written.append((random.randint(0, M), pois(10)))

    path = tmpfile()
    with container.container(path, 'w') as dst:
        std.writeKmersAndCounts(K, written, dst, 'wibble')
    with container.container(path, 'r') as src:
        recovered = list(std.readKmersAndCounts(src, 'wibble'))

    assert len(recovered) == N
    for (expected, actual) in zip(written, recovered):
        assert expected == actual

    os.remove(path)
def main(argv): opts = docopt.docopt(__doc__, argv) inp = opts['<input>'] with container(inp, 'r') as z: K = z.meta['K'] if 'kmers' not in z.meta: print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % ( inp, ) return if 'counts' in z.meta: xs = readKmersAndCounts(z) for (x, c) in xs: print '%s\t%d' % (render(K, x), c) else: xs = readKmers(z) for x in xs: print render(K, x)
def main(argv):
    """Project an input container onto the k-mers of a reference container.

    ``<ref>``, ``<input>`` and ``<output>`` are parsed by docopt from the
    module docstring.  The reference k-mers are loaded into an
    ``array.array('L')``.  The input must have the same K as the reference;
    otherwise a message is written to standard error and the process exits
    with status 1 before the output is created.

    If the input metadata has a 'counts' entry the (k-mer, count) stream is
    passed through ``project2`` and written with counts; otherwise the
    k-mer stream is passed through ``project1`` and written without counts.
    """
    opts = docopt.docopt(__doc__, argv)

    with container(opts['<ref>'], 'r') as z:
        K = z.meta['K']
        # Materialise the reference while its container is still open.
        xs = array.array('L', readKmers(z))

    with container(opts['<input>'], 'r') as z0:
        K0 = z0.meta['K']
        if K0 != K:
            print >> sys.stderr, "mismatched K (%d)" % (K0, )
            sys.exit(1)

        with container(opts['<output>'], 'w') as z:
            if 'counts' in z0.meta:
                ys = readKmersAndCounts(z0)
                writeKmersAndCounts(K, project2(xs, ys), z)
            else:
                ys = readKmers(z0)
                writeKmers(K, project1(xs, ys), z)
def main(argv):
    """Write a copy of a counted k-mer container passed through ``trim``.

    ``<input>``, ``<output>`` and ``-c`` are parsed by docopt from the
    module docstring.  ``-c`` is the cutoff handed to ``trim``; when it is
    absent or 0, the cutoff is obtained from ``infer(K, hist)`` using the
    input's 'hist' metadata and reported on standard error.

    The output metadata is a copy of the input metadata without the
    'kmers' and 'counts' entries.
    """
    opts = docopt.docopt(__doc__, argv)

    src_name = opts['<input>']
    dst_name = opts['<output>']
    cutoff = int(opts['-c']) if opts['-c'] is not None else 0

    with container(src_name, 'r') as src:
        K = src.meta['K']
        histogram = src.meta['hist']

        # A cutoff of 0 (explicit or defaulted) means "work it out".
        if cutoff == 0:
            cutoff = infer(K, histogram)
            print >> sys.stderr, 'inferred cutoff:', cutoff

        counted = readKmersAndCounts(src)
        with container(dst_name, 'w') as dst:
            dst.meta = src.meta.copy()
            del dst.meta['kmers']
            del dst.meta['counts']
            writeKmersAndCounts(K, trim(counted, cutoff), dst)
def _requireSameK(K, found):
    """Exit with status 1 (after a message on stderr) unless found == K."""
    if found != K:
        print >> sys.stderr, "mismatched K"
        sys.exit(1)


def _baseFrequencies(acgt):
    """Scale the four tallies in acgt so they sum to 1.

    Returns all zeros when nothing was tallied, instead of dividing by zero.
    """
    n = float(sum(acgt))
    if n == 0:
        return [0.0 for c in acgt]
    return [c / n for c in acgt]


def main(argv):
    """Merge several counted k-mer containers into one output container.

    ``<input>`` (a list) and ``<output>`` are parsed by docopt from the
    module docstring.  The inputs are grouped by ``pairs`` into groups of
    one or two names.  All inputs must share the same K; on a mismatch a
    message is written to standard error and the process exits with
    status 1.

    With a single group, the (merged) stream is written straight to the
    output.  Otherwise each group is first merged into a named section of a
    temporary container, and those sections are then merged into the
    output.  In both cases the stream that is written is the one returned
    by ``hist(..., h, acgt)``, and the output metadata receives 'hist' (h)
    and 'acgt' (the tallies in acgt scaled to sum to 1).

    The temporary container is removed even if merging fails.
    """
    opts = docopt.docopt(__doc__, argv)

    out = opts['<output>']
    px = list(pairs(opts['<input>']))

    if len(px) == 1:
        # One group only: no temporary container is needed.
        ix = px[0]
        with container(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with container(ix[0], 'r') as z0:
                K = z0.meta['K']
                xs = readKmersAndCounts(z0)
                if len(ix) == 1:
                    # Write the stream returned by hist (not the raw input),
                    # as every other path does; presumably h and acgt are
                    # only filled as that stream is consumed -- confirm.
                    writeKmersAndCounts(K, hist(xs, h, acgt), z)
                else:
                    with container(ix[1], 'r') as z1:
                        _requireSameK(K, z1.meta['K'])
                        ys = readKmersAndCounts(z1)
                        zs = hist(merge(xs, ys), h, acgt)
                        writeKmersAndCounts(K, zs, z)
            z.meta['hist'] = h
            z.meta['acgt'] = _baseFrequencies(acgt)
        return

    K = None
    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        # Pass 1: merge each group into its own section of the temp file.
        with container(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                with container(ix[0], 'r') as z0:
                    if K is None:
                        K = z0.meta['K']
                    else:
                        _requireSameK(K, z0.meta['K'])
                    xs = readKmersAndCounts(z0)
                    if len(ix) == 1:
                        writeKmersAndCounts(K, xs, z, nm)
                    else:
                        with container(ix[1], 'r') as z1:
                            _requireSameK(K, z1.meta['K'])
                            ys = readKmersAndCounts(z1)
                            writeKmersAndCounts(K, merge(xs, ys), z, nm)

        assert K is not None

        # Pass 2: chain-merge all sections into the output.
        with container(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with container(tmpnm, 'r') as z0:
                zs = None
                for fn in tmps:
                    xs = readKmersAndCounts(z0, fn)
                    if zs is None:
                        zs = xs
                    else:
                        zs = merge(zs, xs)
                zs = hist(zs, h, acgt)
                writeKmersAndCounts(K, zs, z)
            z.meta['hist'] = h
            z.meta['acgt'] = _baseFrequencies(acgt)
    finally:
        # Also reached on sys.exit() and on errors, so the scratch file
        # never outlives the command.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
def _spillBuffer(K, buf, tmpnm, tmps):
    """Append buf's k-mers to the temp container as a new named section."""
    section = 'tmps-%d' % (len(tmps), )
    tmps.append(section)
    with container(tmpnm, 'a') as z:
        writeKmersAndCounts(K, mkPairs(buf.kmers()), z, section)


def _readsPerSecond(reads, elapsed):
    """Return reads / elapsed, or 0.0 when no time has elapsed."""
    if elapsed <= 0:
        return 0.0
    return reads / elapsed


def _acgtFrequencies(acgt):
    """Scale the four tallies in acgt so they sum to 1.

    Returns all zeros when nothing was tallied, instead of dividing by zero.
    """
    total = float(sum(acgt))
    if total == 0:
        return [0.0 for c in acgt]
    return [c / total for c in acgt]


def main(argv):
    """Count the k-mers of the input read files into an output container.

    Options are parsed by docopt from the module docstring:

    * ``<k>`` -- k-mer length K.
    * ``<input>`` -- list of files, each read through ``mkParser``.
    * ``<output>`` -- container that receives the k-mers and counts.
    * ``-m`` -- buffer budget in units of 1024*1024 (default 32); the
      in-memory accumulator is spilled to a temporary container whenever
      8 times the number of buffered k-mers reaches the budget.
    * ``-D`` -- when given, only k-mers accepted by ``sub(S, d, x)`` are
      counted; accept/reject decisions are cached in two sets that are
      reset once they exceed 1000000 entries.
    * ``-S`` -- seed passed to ``sub`` (default 0).

    Spilled sections are merged pairwise with ``merge2`` until at most two
    remain, and the result is written to the output.  The output metadata
    receives 'hist', 'acgt' (tallies of ``x & 3`` over the counted k-mers,
    scaled to sum to 1) and 'reads' (number of reads processed).  Progress
    is reported on standard error every 1024*1024 reads.

    Every temporary container is removed before returning, also on error.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['<k>'])
    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])

    cacheYes = set([])
    cacheNo = set([])

    buf = KmerAccumulator()
    n = 0               # k-mers buffered since the last spill
    tmps = []           # section names currently held in the temp container
    acgt = [0, 0, 0, 0]

    PN = 1024 * 1024    # progress interval; a power of two so & can test it
    nr = 0

    tmpnm = tmpfile('.pmc')
    scratch = [tmpnm]   # temp containers still on disk
    try:
        # Create an empty container so later spills can open it in 'a' mode.
        with container(tmpnm, 'w') as z:
            pass

        t0 = time.time()
        for fn in opts['<input>']:
            for rds in mkParser(fn):
                for (nm, seq) in rds:
                    nr += 1
                    if (nr & (PN - 1)) == 0:
                        t1 = time.time()
                        rate = _readsPerSecond(PN, t1 - t0)
                        print >> sys.stderr, 'reads processed:', nr, rate, 'reads/second'
                        t0 = t1

                    xs = kmersList(K, seq, True)
                    if d is None:
                        buf.addList(xs)
                        for x in xs:
                            acgt[x & 3] += 1
                            n += 1
                    else:
                        for x in xs:
                            if x in cacheNo:
                                continue
                            if x not in cacheYes:
                                if not sub(S, d, x):
                                    cacheNo.add(x)
                                    continue
                                cacheYes.add(x)
                            buf.add(x)
                            acgt[x & 3] += 1
                            n += 1
                        # Keep the decision caches bounded.
                        if len(cacheYes) > 1000000:
                            cacheYes = set([])
                        if len(cacheNo) > 1000000:
                            cacheNo = set([])

                    if 8 * n >= Z:
                        _spillBuffer(K, buf, tmpnm, tmps)
                        buf.clear()
                        n = 0

        t1 = time.time()
        rate = _readsPerSecond(nr % PN, t1 - t0)
        print >> sys.stderr, 'reads processed:', nr, rate, 'reads/second'

        # If anything was spilled, the remainder must be spilled as well so
        # that it takes part in the merge below.
        if len(tmps) and len(buf):
            _spillBuffer(K, buf, tmpnm, tmps)
            # The accumulator is not used again once tmps is non-empty.
            buf = None

        # Halve the number of sections per round until at most two remain.
        while len(tmps) > 2:
            tmpnm2 = tmpfile('.pmc')
            scratch.append(tmpnm2)
            tmps2 = []
            with container(tmpnm, 'r') as z0, container(tmpnm2, 'w') as z:
                for p in pairs(tmps):
                    section = 'tmps-%d' % (len(tmps2), )
                    tmps2.append(section)
                    if len(p) == 1:
                        # Odd one out: copy it through unchanged.
                        writeKmersAndCounts(K, readKmersAndCounts(z0, p[0]), z, section)
                        continue
                    h = {}  # histogram of an intermediate merge; discarded
                    merge2(z, K,
                           readKmersAndCounts(z0, p[0]),
                           readKmersAndCounts(z0, p[1]),
                           h, section)
            os.remove(tmpnm)
            scratch.remove(tmpnm)
            tmpnm = tmpnm2
            tmps = tmps2

        with container(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                # Nothing was spilled: everything is still in memory.
                zs = hist(mkPairs(buf.kmers()), h)
                writeKmersAndCounts(K, zs, z)
            elif len(tmps) == 1:
                with container(tmpnm, 'r') as z0:
                    writeKmersAndCounts(K, hist(readKmersAndCounts(z0, tmps[0]), h), z)
            else:
                assert len(tmps) == 2
                with container(tmpnm, 'r') as z0:
                    merge2(z, K,
                           readKmersAndCounts(z0, tmps[0]),
                           readKmersAndCounts(z0, tmps[1]),
                           h)
            z.meta['hist'] = h
            z.meta['acgt'] = _acgtFrequencies(acgt)
            z.meta['reads'] = nr
    finally:
        # Remove whatever temp containers are still around, including the
        # last one, which would otherwise be left on disk.
        for path in scratch:
            if os.path.exists(path):
                os.remove(path)