def main(argv):
    """Project the k-mers of <input> onto the k-mer set stored in <ref>.

    The reference k-mers are loaded into an in-memory ``array`` of unsigned
    longs.  The input is then streamed through ``project2`` (when it carries
    counts) or ``project1`` (k-mers only) and the result is written to
    <output>.  Exits with status 1 if the two files were built with
    different values of K.

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    # Load the whole reference set up front; the handle is not needed later.
    with kmers(opts['<ref>'], 'r') as ref:
        K = ref.meta['K']
        xs = array.array('L', readKmers(ref))

    with kmers(opts['<input>'], 'r') as z0:
        K0 = z0.meta['K']
        if K0 != K:
            sys.stderr.write("mismatched K (%d)" % (K0, ) + "\n")
            sys.exit(1)

        with kmers(opts['<output>'], 'w') as z:
            z.meta['K'] = K
            if 'counts' in z0.meta:
                ys = readKmersAndCounts(z0)
                writeKmersAndCounts(z, project2(xs, ys))
                z.meta['kmers'] = 'kmers'
                z.meta['counts'] = 'counts'
            else:
                ys = readKmers(z0)
                writeKmers(z, project1(xs, ys))
                z.meta['kmers'] = 'kmers'
            # NOTE(review): the histogram is copied verbatim from the input,
            # not recomputed for the projected set -- confirm this is intended.
            z.meta['hist'] = z0.meta['hist']
def main(argv):
    """Write a trimmed copy of a k-mer/count file.

    ``-c`` supplies the cutoff handed to ``trim``; when it is absent or 0 the
    cutoff is obtained from ``infer(K, hist)`` and reported on stderr.
    ``-C`` supplies the optional third argument of ``trim``; values that are
    not positive are treated as "not given" (``None``).

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    src = opts['<input>']
    dst = opts['<output>']

    cutoff = int(opts['-c']) if opts['-c'] is not None else 0

    ceiling = None
    if opts['-C'] is not None:
        requested = int(opts['-C'])
        if requested > 0:
            ceiling = requested

    with kmers(src, 'r') as z:
        K = z.meta['K']
        h = z.meta['hist']
        if cutoff == 0:
            cutoff = infer(K, h)
            sys.stderr.write(' '.join(['inferred cutoff:', str(cutoff)]) + '\n')

        pairs_in = readKmersAndCounts(z)
        with kmers(dst, 'w') as w:
            # Start from the input's metadata, minus the two entries that are
            # re-established once the payload has been written.
            w.meta = z.meta.copy()
            for stale in ('kmers', 'counts'):
                del w.meta[stale]

            writeKmersAndCounts(w, trim(pairs_in, cutoff, ceiling))

            for (key, value) in (('K', K),
                                 ('kmers', 'kmers'),
                                 ('counts', 'counts'),
                                 ('hist', h)):
                w.meta[key] = value
def main(argv):
    """Write a sampled copy of a k-mer/count file.

    ``-P`` sets the value passed as ``p`` to the sampler (default 0.01).
    When ``-D`` is ``None`` the stream goes through ``sampleR`` and, if
    ``-S`` was given, the ``random`` module is seeded with it first;
    otherwise the stream goes through ``sampleD`` with the seed passed
    explicitly.  The histogram dict filled by the sampler is stored in the
    output metadata.

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    p = float(opts['-P']) if opts['-P'] is not None else 0.01
    src = opts['<input>']
    dst = opts['<output>']

    with kmers(dst, 'w') as z:
        h = {}
        with kmers(src, 'r') as z0:
            K = z0.meta['K']
            # Inherit the input's metadata; the payload entries are re-added
            # after writing.
            z.meta = z0.meta.copy()
            for stale in ('kmers', 'counts'):
                del z.meta[stale]

            stream = readKmersAndCounts(z0)

            seeded = bool(opts['-S'])
            seed = long(opts['-S']) if seeded else 0

            if opts['-D'] is not None:
                sampled = sampleD(p, seed, stream, h)
            else:
                if seeded:
                    random.seed(seed)
                sampled = sampleR(p, stream, h)
            writeKmersAndCounts(z, sampled)

        for (key, value) in (('K', K),
                             ('kmers', 'kmers'),
                             ('counts', 'counts'),
                             ('hist', h)):
            z.meta[key] = value
def main(argv):
    """Print the contents of a k-mer file to stdout, one k-mer per line.

    If the file's metadata has a 'counts' entry each line is
    ``<rendered k-mer>TAB<count>``; otherwise only the rendered k-mer is
    printed.  A file without a 'kmers' entry produces a message on stderr
    and nothing on stdout.

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    path = opts['<input>']

    with kmers(path, 'r') as z:
        K = z.meta['K']

        if 'kmers' not in z.meta:
            sys.stderr.write(
                'cannot dump "%s" as it contains no k-mers' % (path,) + '\n')
            return

        if 'counts' not in z.meta:
            for x in readKmers(z):
                print(render(K, x))
            return

        for (x, c) in readKmersAndCounts(z):
            print('%s\t%d' % (render(K, x), c))
def _flag_against_reference(opts, K, J, M):
    """Compare every input's per-context extension counts with the -r file."""
    with kmers(opts['-r'], 'r') as z:
        ref = list(group(K, J, 0, readKmersAndCounts(z)))

    width = M + 1
    for fn in opts['<input>']:
        with kmers(fn, 'r') as z:
            # Both group streams are walked in step; `i` is the cursor into
            # the reference groups and only ever moves forward.
            i = 0
            for (yCtx, _, yGrp) in group(K, J, 0, readKmersAndCounts(z)):
                while i < len(ref) and ref[i][0] < yCtx:
                    i += 1
                # Every sample context is required to exist in the reference.
                assert i < len(ref)
                assert ref[i][0] == yCtx

                # Reference counts, indexed by the low bits of the k-mer.
                gt = float(sum([c for (x, c) in ref[i][2]]))
                gx = [0] * width
                for (x, c) in ref[i][2]:
                    gx[x & M] = c

                # Sample counts, indexed the same way.
                st = sum([c for (x, c) in yGrp])
                sx = [0] * width
                for (x, c) in yGrp:
                    sx[x & M] = c

                ss = []
                b = 0
                for j in range(width):
                    p = float(gx[j]) / gt
                    v = 0.0
                    if 0.0 < p < 1.0:
                        # presumably a log binomial tail probability -- see
                        # logBinGe; values below -10 mark extension j.
                        v = logBinGe(p, st, sx[j])
                        if v < -10:
                            b |= 1 << j
                    ss.append('%3.2g' % (v,))
                if b > 0:
                    print('%s\t%s\t%s' % (render(J, yCtx), fasta(b), '\t'.join(ss)))
                i += 1


def _flag_between_samples(opts, K, J, M):
    """Walk all inputs in step and print contexts where the samples differ."""
    N = len(opts['<input>'])
    width = M + 1

    # One Group cursor per input, numbered from 1; the heap brings the
    # cursor with the smallest current context to the front.
    h = heap.heap()
    for (idx, fn) in enumerate(opts['<input>'], 1):
        (_, xs) = kfset.read(fn)
        h.push(Group(K, J, idx, xs))

    while len(h) > 0:
        # Collect the current group of every input positioned on the same
        # context, advancing each cursor as it is consumed.
        xfs = []
        g = h.pop()
        gy = g.this()[0]
        xfs.append(g.this())
        g.next()
        if g.valid():
            h.push(g)
        for pending in h.xs:
            assert pending.valid()
        while len(h) > 0 and h.front().this()[0] == gy:
            g = h.pop()
            xfs.append(g.this())
            g.next()
            if g.valid():
                h.push(g)
            for pending in h.xs:
                assert pending.valid()

        # gc: counts pooled over all inputs; ds: per-input counts.
        ds = []
        gc = [0] * width
        for (_, n, xc) in xfs:
            d = [0] * width
            for (x, c) in xc:
                j = x & M
                gc[j] += c
                d[j] = c
            ds.append((n, d))

        res = ['*'] * N
        seen = set()
        gt = float(sum(gc))
        for (n, d) in ds:
            t = sum(d)
            # Four extensions are packed into each bitmask handed to fasta().
            b = [0] * (width // 4)
            for e in range(width):
                p = float(gc[e]) / gt
                if 0.0 < p < 1.0:
                    v = logBinGe(p, t, d[e])
                    if v > -10:
                        b[e >> 2] |= 1 << (e & 3)
            res[n - 1] = ''.join([fasta(b0) for b0 in b])
            seen.add(res[n - 1])

        # Only contexts where at least two inputs disagree are reported.
        if len(seen) > 1:
            print('%s\t%s' % (render(J, gy), '\t'.join(res)))


def main(argv):
    """Report (K-1)-mer contexts whose following-base counts look unusual.

    With ``-r`` each input is compared against the reference file, and
    a line is printed for every context in which some extension scores
    below -10.  Without ``-r`` all inputs are scanned together against their
    pooled counts, and a context is printed when the inputs do not all
    receive the same code.

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    K = getK(opts['<input>'])
    J = K - 1
    # Mask selecting the bits of a k-mer that lie beyond its J-mer context.
    M = (1 << (2 * (K - J))) - 1

    if opts['-r'] is not None:
        _flag_against_reference(opts, K, J, M)
        return

    _flag_between_samples(opts, K, J, M)
def main(argv):
    """Count the k-mers of the input reads and write them to <output>.

    K-mers are accumulated in memory; whenever the accumulator reports at
    least half of the ``-m`` budget (in MiB, default 32) it is spilled to a
    temporary casket, and the spills are merged at the end.  ``-D``/``-S``
    keep only the k-mers accepted by ``sub(S, d, x)``; ``-C`` names a FASTA
    file and keeps only reads sharing at least one k-mer with it.

    The output metadata records K, the count histogram, the frequencies of
    the four values of ``x & 3`` over all k-mers seen, and the number of
    reads.

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']
    K = int(opts['<k>'])
    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])
    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])

    # Memo of sub() decisions; reset once it passes CACHE_LIMIT entries.
    CACHE_LIMIT = 1000000
    cacheYes = set()
    cacheNo = set()

    B = opts['-C']
    if B is not None:
        wanted = set()
        for (nm, seq) in readFasta(openFile(B)):
            wanted |= set(kmersList(K, seq, True))
        B = wanted

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    def spill(z):
        # Write the accumulator's current contents as the next named spill.
        fn = 'tmps-%d' % (len(tmps), )
        tmps.append(fn)
        writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)

    def tally(pairs, hist):
        # Pass (k-mer, count) pairs through unchanged while building the
        # count histogram.
        for (x, c) in pairs:
            hist[c] = 1 + hist.get(c, 0)
            yield (x, c)

    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            nr = 0
            for itm in reads(opts['<input>'], K=K, pairs=False, reads=False,
                             kmers=True, both=True, verbose=verbose):
                xs = itm.kmers[0]
                for x in xs:
                    acgt[x & 3] += 1

                if d is not None:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                    if len(cacheYes) > CACHE_LIMIT:
                        cacheYes = set()
                    if len(cacheNo) > CACHE_LIMIT:
                        cacheNo = set()
                elif B is not None:
                    if any(x in B for x in xs):
                        buf.addList(xs)
                else:
                    buf.addList(xs)

                nr += 1
                # The memory check is only made every 1024 reads.
                if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                    spill(z)
                    buf.clear()

            # If anything was spilled, the remainder must be spilled too so
            # that the merge below sees every k-mer.
            if len(tmps) and len(buf):
                spill(z)
                buf = None  # no longer needed; let it be reclaimed

        with zotk.kmers(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                for c in buf.countsOnly():
                    h[c] = 1 + h.get(c, 0)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
            elif len(tmps) == 1:
                # A single spill is copied straight through; the histogram is
                # tallied on the way (previously it was left empty here).
                # assumes writeKmersAndCounts accepts any iterable of pairs
                with casket(tmpnm, 'r') as z0:
                    writeKmersAndCounts(
                        z, tally(readKmersAndCounts(z0, tmps[0]), h))
            else:
                with casket(tmpnm, 'r') as z0:
                    xss = [readKmersAndCounts(z0, t) for t in tmps]
                    mergeNinto(K, xss, h, z)

            total = float(sum(acgt))
            if total > 0:
                acgt = [c / total for c in acgt]
            else:
                # No k-mers at all: avoid dividing by zero.
                acgt = [0.0, 0.0, 0.0, 0.0]

            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        # Remove the spill file even when counting fails part-way.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
def main(argv):
    """Merge one or more k-mer/count files into <output>.

    ``pairs(opts['<input>'])`` yields groups of one or two file names.  A
    single group is merged directly into the output.  Otherwise each group
    is first written (merged, if it is a pair) to a named entry of a
    temporary casket, and the entries are then combined with ``mergeNinto``.
    All inputs must share the same K; a mismatch prints "mismatched K" on
    stderr and exits with status 1.

    argv -- command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    out = opts['<output>']
    px = list(pairs(opts['<input>']))

    def agree(K, z):
        # Return the K in force after seeing file `z`, exiting on mismatch.
        k = z.meta['K']
        if K is None:
            return k
        if k != K:
            sys.stderr.write("mismatched K" + "\n")
            sys.exit(1)
        return K

    def finish(z, K, h, acgt):
        # Record the standard metadata, with acgt converted to frequencies.
        total = float(sum(acgt))
        if total > 0:
            freqs = [c / total for c in acgt]
        else:
            # No k-mers at all: avoid dividing by zero.
            freqs = [0.0, 0.0, 0.0, 0.0]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = freqs

    if len(px) == 1:
        # One group only: no temporary storage is needed.
        ix = px[0]
        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            if len(ix) == 1:
                with kmers(ix[0], 'r') as z0:
                    K = agree(None, z0)
                    # Write the stream returned by hist(), as the two-file
                    # case does, so that h and acgt are tallied for the
                    # records that are written.
                    zs = hist(readKmersAndCounts(z0), h, acgt)
                    writeKmersAndCounts(z, zs)
            else:
                with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                    K = agree(None, z0)
                    agree(K, z1)
                    xs = readKmersAndCounts(z0)
                    ys = readKmersAndCounts(z1)
                    zs = hist(merge(xs, ys), h, acgt)
                    writeKmersAndCounts(z, zs)
            # K/kmers/counts are now recorded here as well, matching the
            # multi-group path below.
            finish(z, K, h, acgt)
        return

    K = None
    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                if len(ix) == 1:
                    with kmers(ix[0], 'r') as z0:
                        K = agree(K, z0)
                        writeKmersAndCounts(z, readKmersAndCounts(z0), nm)
                else:
                    with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                        K = agree(K, z0)
                        agree(K, z1)
                        xs = readKmersAndCounts(z0)
                        ys = readKmersAndCounts(z1)
                        writeKmersAndCounts(z, merge(xs, ys), nm)

        assert K is not None

        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, acgt, z)
            finish(z, K, h, acgt)
    finally:
        # Remove the temporary casket even on sys.exit() or an exception.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)