Пример #1
0
def main(argv):
    """Project the k-mers of <input> onto the reference k-mers of <ref>.

    The k-mers of <ref> are loaded into memory.  The k-mers of <input> are
    then streamed through ``project2`` (when the input metadata has a
    'counts' entry) or ``project1`` (k-mers only) together with the reference
    array, and whatever those helpers yield is written to <output>.

    Exits with status 1 if <ref> and <input> were built with different K.
    """
    opts = docopt.docopt(__doc__, argv)

    with kmers(opts['<ref>'], 'r') as z:
        K = z.meta['K']
        xs = array.array('L', readKmers(z))

    with kmers(opts['<input>'], 'r') as z0:
        K0 = z0.meta['K']
        if K0 != K:
            print >> sys.stderr, "mismatched K (%d)" % (K0, )
            sys.exit(1)

        with kmers(opts['<output>'], 'w') as z:
            z.meta['K'] = K
            if 'counts' in z0.meta:
                ys = readKmersAndCounts(z0)
                writeKmersAndCounts(z, project2(xs, ys))
                z.meta['kmers'] = 'kmers'
                z.meta['counts'] = 'counts'
            else:
                ys = readKmers(z0)
                writeKmers(z, project1(xs, ys))
                z.meta['kmers'] = 'kmers'
            # Carry the input histogram over only when there is one: an
            # unconditional lookup would raise KeyError here, after all of
            # the k-mer data has already been written.
            if 'hist' in z0.meta:
                z.meta['hist'] = z0.meta['hist']
Пример #2
0
def main(argv):
    """Write a sample of the counted k-mers of <input> to <output>.

    -P sets the value handed to the sampler as its first argument (default
    0.01) and -S an integer seed.  Without -D the stream goes through
    ``sampleR`` and the ``random`` module is seeded only when -S is given;
    with -D it goes through ``sampleD``, which receives the seed directly
    (0 when -S is absent).  The dict passed to the sampler is stored as the
    output's 'hist' metadata.
    """
    opts = docopt.docopt(__doc__, argv)

    prob = 0.01 if opts['-P'] is None else float(opts['-P'])
    src = opts['<input>']
    dst = opts['<output>']
    with kmers(dst, 'w') as z:
        h = {}
        with kmers(src, 'r') as z0:
            K = z0.meta['K']
            # Start from a copy of the input metadata, minus the two stream
            # entries; they are put back once the new streams are written.
            z.meta = z0.meta.copy()
            for key in ('kmers', 'counts'):
                del z.meta[key]
            xs = readKmersAndCounts(z0)
            seeded = bool(opts['-S'])
            seed = long(opts['-S']) if seeded else 0
            if opts['-D'] is None:
                if seeded:
                    random.seed(seed)
                picked = sampleR(prob, xs, h)
            else:
                picked = sampleD(prob, seed, xs, h)
            writeKmersAndCounts(z, picked)
        for (key, val) in (('K', K), ('kmers', 'kmers'),
                           ('counts', 'counts'), ('hist', h)):
            z.meta[key] = val
Пример #3
0
def main(argv):
    """Pass the counted k-mers of <input> through ``trim`` into <output>.

    -c supplies the first cutoff given to ``trim``; when it is absent or 0
    the cutoff is obtained from ``infer`` on the input's K and histogram and
    reported on stderr.  -C supplies the second cutoff, which is only passed
    on when positive (``None`` otherwise).  The output keeps the input's
    metadata, including its original 'hist'.
    """
    opts = docopt.docopt(__doc__, argv)

    src = opts['<input>']
    dst = opts['<output>']

    cutoff = int(opts['-c']) if opts['-c'] is not None else 0

    limit = None
    if opts['-C'] is not None:
        requested = int(opts['-C'])
        limit = requested if requested > 0 else None

    with kmers(src, 'r') as z:
        K = z.meta['K']
        h = z.meta['hist']
        if cutoff == 0:
            cutoff = infer(K, h)
            print >> sys.stderr, 'inferred cutoff:', cutoff
        xs = readKmersAndCounts(z)
        with kmers(dst, 'w') as w:
            # Copy the metadata but drop the stream entries until the new
            # streams have actually been written.
            w.meta = z.meta.copy()
            for key in ('kmers', 'counts'):
                del w.meta[key]
            writeKmersAndCounts(w, trim(xs, cutoff, limit))
            for (key, val) in (('K', K), ('kmers', 'kmers'),
                               ('counts', 'counts'), ('hist', h)):
                w.meta[key] = val
Пример #4
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        if opts['-K']:
            K = int(opts['-K'])

        buildIndex(K, opts['<input>'], opts['<alleles>'])

        return

    idx = index(opts['<alleles>'])

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K0 = z.meta['K']
            if K0 != idx.K:
                print >> sys.stderr, 'Input "%d" has different k to index' % (
                    inp, )
                sys.exit(1)
            xs = readKmers(z)

            cs = [idx.lens[i] for i in xrange(len(idx.lens))]

            for x in xs:
                for j in idx[x]:
                    cs[j] -= 1

            for j in xrange(len(idx.lens)):
                assert cs[j] >= 0
                if cs[j] == 0:
                    print '%s\t%d\t%s' % (inp, j, idx.names[j])
Пример #5
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            itms = z.meta.items()
            itms.sort()
            for (k, v) in itms:
                print k, v
Пример #6
0
def getK(ins):
    """Return the K shared by all the k-mer containers named in ``ins``.

    Returns ``None`` when ``ins`` is empty.  Raises ``MismatchedK`` with the
    first K seen and the offending K as soon as a container disagrees.
    """
    shared = None
    for path in ins:
        with kmers(path, 'r') as z:
            current = z.meta['K']
            if shared is None:
                # First container fixes the value the rest must match.
                shared = current
                continue
            if shared != current:
                raise MismatchedK(shared, current)
    return shared
Пример #7
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            if 'hist' in z.meta:
                h = z.meta['hist'].items()
                h = [(int(f), c) for (f, c) in h]
                h.sort()
                for (f,c) in h:
                    print '%s\t%d\t%d' % (inp, f, c)
Пример #8
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K

            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)

            seen = bitvec(S.count())
            for i in xrange(S.count()):
                if seen[i]:
                    continue

                x = S.select(i)
                xb = rc(K, x)
                xp = succ(K, S, xb)
                if xp == 1:
                    # x isn't the start of a contig
                    continue

                pth = [x]
                seen[i] = 1
                xn = succ(K, S, x)
                while len(xn) == 1:
                    if seen[xn[0]] == 1:
                        break
                    x = S.select(xn[0])
                    pth.append(x)
                    seen[xn[0]] = 1
                    xb = rc(K, x)
                    j = S.rank(xb)
                    seen[j] = 1
                    xn = succ(K, S, x)

                if len(pth)+K-1 < L:
                    continue

                s = [render(K, pth[0])]
                for j in xrange(1, len(pth)):
                    s.append("ACGT"[pth[j]&3])

                print '>contig_%d\n%s' % (i, ''.join(s))
Пример #9
0
def main(argv):
    """Expand the k-mers of <ref> and write the sorted result to <output>.

    The reference k-mers are passed to ``hamming`` (-H d), ``levenshtein``
    (-L d) or, with neither option, ``ksnp``.  Every group the chosen helper
    yields is concatenated into one list, which is sorted and written with
    the reference's K.
    """
    opts = docopt.docopt(__doc__, argv)

    with kmers(opts['<ref>'], 'r') as z:
        K = z.meta['K']
        src = readKmers(z)

        if opts['-H'] is not None:
            groups = hamming(K, int(opts['-H']), src)
        elif opts['-L'] is not None:
            groups = levenshtein(K, int(opts['-L']), src)
        else:
            groups = ksnp(K, src)

        # Drain the groups while the reference container is still open.
        merged = []
        for grp in groups:
            merged.extend(grp)
    merged.sort()
    with kmers(opts['<output>'], 'w') as z:
        writeKmers(z, merged)
        z.meta['kmers'] = 'kmers'
        z.meta['K'] = K
Пример #10
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    nms = [str(i+1) for i in xrange(len(probesMTB))]
    probes = probesMTB

    if opts['-p'] is not None:
        nms = []
        probes = []
        bad = False
        with open(opts['-p']) as f:
            i = 0
            ln = 0
            for l in f:
                ln += 1
                if l[0] == '#':
                    continue
                i += 1
                t = l.split()
                if len(t) == 1:
                    nms.append(str(i))
                    probes.append(probe(t[0]))
                elif len(t) == 2:
                    nms.append(t[0])
                    probes.append(probe(t[1]))
                else:
                    bad = True
                    print >> sys.stderr, '%s line %d, badly formatted.' % (opts['-p'], i)
        if bad:
            sys.exit(1)

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K = z.meta['K']
            xs = readKmers(z)
            xs = sparse(2*K, array.array('L', xs))

        res = []
        for i in xrange(len(probes)):
            if findProbe(probes[i], K, xs):
                res.append('1')
            else:
                res.append('0')
        if opts['-l']:
            for i in xrange(len(nms)):
                print '%s\t%s\t%s' % (inp, nms[i], res[i])
        else:
            print inp + '\t' + ''.join(res)
Пример #11
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    inp = opts['<input>']
    with kmers(inp, 'r') as z:
        K = z.meta['K']
        if 'kmers' not in z.meta:
            print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % (inp,)
            return
        if 'counts' in z.meta:
            xs = readKmersAndCounts(z)
            for (x, c) in xs:
                print '%s\t%d' % (render(K, x), c)
        else:
            xs = readKmers(z)
            for x in xs:
                print render(K, x)
Пример #12
0
    def prep(self, K, fn):
        """Load the k-mers of ``fn`` shortened to K.

        Each stored k-mer is shifted right by ``2 * (fK - K)`` bits, where fK
        is the K the file was built with.

        Returns, when ``self.vec`` is true, an ``array('I')`` of
        ``1 << (2 * K)`` entries in which entry ``y`` is the sum of the
        counts of all k-mers shortening to ``y``.  Otherwise returns an
        ``array('L')`` of the shortened k-mers with adjacent duplicates
        collapsed.

        Raises ``MismatchedK`` if the file's K is smaller than ``K``.
        """
        with kmers(fn, 'r') as z:
            fK = z.meta['K']
            if fK < K:
                raise MismatchedK(K, fK)
            S = 2 * (fK - K)

            if self.vec:
                # This branch needs (k-mer, count) pairs.  It used to iterate
                # readKmers(z), which yields bare k-mers elsewhere in this
                # code base, so the tuple unpacking could not succeed.
                v = array.array('I', [0]) * (1 << (2 * K))
                for (x, c) in readKmersAndCounts(z):
                    v[x >> S] += c
                return v

            v = array.array('L', [])
            for x in readKmers(z):
                y = x >> S
                # Only the previous element is checked, so this relies on
                # equal shortened values arriving next to each other.
                if len(v) == 0 or v[-1] != y:
                    v.append(y)
            return v
Пример #13
0
def main(argv):
    """Build a gene k-mer index (-X) or query samples against one.

    With -X the FASTA files in <input> are indexed with K fixed at 27 and
    the index is written to the <genes> casket; the function then returns.

    Otherwise the index is loaded from <genes> and each <input> is read as a
    k-mer container.  For every indexed sequence whose best candidate passes
    the final threshold, one tab-separated line is printed: sequence number,
    sequence length, length of the reconstructed string, kd, c2, pv,
    sequence name and the reconstructed string.
    """
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        S = []
        N = 0
        # Tally of the low two bits of every collected k-mer.
        qacgt = [0, 0, 0, 0]
        # Pass 1: collect the k-mers of every sequence at least K long.
        # NOTE(review): the meaning of the third argument (True) to
        # kmersWithPos is not visible here -- confirm.
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x, p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x & 3] += 1
                        N += 1
        S.sort()
        # Turn the tallies into relative frequencies.
        qacgt = [float(c) / float(N) for c in qacgt]
        S = sparse(2 * K, array.array('L', uniq(S)))
        lens = []
        nms = []
        seqs = []
        n = 0
        # tmp[r] collects (sequence number, position) for the k-mer of rank r.
        tmp = [[] for i in xrange(S.count())]
        # Pass 2: record the sequences and where each index k-mer occurs.
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm, )
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x, p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1
        # Flatten tmp: T[r]..T[r+1] delimits the entries of U (sequence
        # numbers) and V (positions, stored signed) for the k-mer of rank r.
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with casket(gfn, 'w') as z:
            meta = {}
            meta['K'] = K
            meta['lens'] = lens
            meta['qacgt'] = qacgt
            meta['nms'] = nms
            meta['seqs'] = seqs

            z.add_content('__meta__', json.dumps(meta))
            write64(z, S.xs, 'S')
            write32(z, T, 'T')
            write32(z, U, 'U')
            write32s(z, V, 'V')

        return

    print >> sys.stderr, "loading..."

    # Load the index written by the -X branch above.
    gfn = opts['<genes>']
    with casket(gfn, 'r') as z:
        mf = z.open('__meta__')
        meta = json.load(mf)
        K = meta['K']
        lens = meta['lens']
        qacgt = meta['qacgt']
        nms = meta['nms']
        seqs = meta['seqs']

        S = read64(z, 'S')
        S = sparse(2 * K, S)
        T = read32(z, 'T')
        U = read32(z, 'U')
        V = read32s(z, 'V')

    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        # One entry per index k-mer, handed to resolveAll; below, L[i] is
        # used as a prefix length and Y[i] as the matching sample k-mer.
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with kmers(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2 * K, X)

        # Dot product of the index and sample 'acgt' frequency vectors.
        g = sum([qp * sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        # nm[j] is null(g, M, j) for each prefix length j in 0..K.
        nm = [null(g, M, j) for j in range(0, K + 1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K + 1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [
            array.array('L', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # the length of the lcp for each position of each query.
        Q = [
            array.array('B', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # Scatter the per-k-mer results onto the positions of each query,
        # keeping the longest prefix seen for a position.
        for i in xrange(S.count()):
            for j in xrange(T[i], T[i + 1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                # A positive p is decoded as position p - 1; any other value
                # as position -(p + 1) with the k-mer reverse complemented.
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries

            # Per-base threshold used when choosing output symbols below.
            qc = math.log(K * 0.05 / float(lens[i] - K + 1) / 2)

            # Link up "de Bruijn" sequences
            # m masks the low 2K-2 bits: py is the previous k-mer without its
            # top two bits, y the current k-mer without its low two bits.
            m = (1 << (2 * K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j - 1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                # Fragments whose summed score exceeds the threshold are
                # dropped.
                if q > math.log(0.05 / len(js)):
                    continue
                # Negated length so that sorting puts the longest first.
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                # NOTE(review): jR comes from xrange(lens[i] - K + 1), so it
                # can be at most lens[i] - K and this test looks unreachable
                # -- confirm whether lens[i] - K was intended.
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                # Follow unique successors in the sample for at most 100
                # steps, stopping at the first k-mer that starts a kept
                # fragment.
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                # sxs is the list of (k-mer, prefix length) pairs for this
                # candidate; (0, 0) fills positions with no fragment.
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        # NOTE(review): the literal 27 matches the K used by
                        # the -X branch; it is used as an index into nm,
                        # which has K + 1 entries -- confirm K is always 27.
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        if fj < len(fs) - 1:
                            nxt = fs[fj + 1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))
                # Accumulate, per output position and per two-bit symbol,
                # the score of every k-mer covering that position.
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x & 3] += p
                        x >>= 2
                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    # Bit k of b is set when symbol k scores below qc; the
                    # mask is turned into an output character by fasta().
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    # A position with an (almost) zero total marks the
                    # whole candidate as 'inf'.
                    if ssj > -1e-300:
                        inf = True
                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq) / float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2 * math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            # Smallest pv first; only the best candidate is reported.
            res.sort()
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                ed = 0
                # Dividing by log(10) converts a natural log to base 10.
                pv = res[0][0] / math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (
                    i, lens[i], len(a), kd, c2, pv, nms[i], a)
            sys.stdout.flush()
Пример #14
0
def main(argv):
    """Count the k-mers of the reads in <input> and write them to <output>.

    Options read here:
      <k>   k-mer length.
      -m N  budget Z = N * 2**20 (default 32 * 2**20); the accumulator is
            spilled to a temporary casket once ``buf.mem()`` reaches half
            of it (checked every 1024 reads).
      -D d  keep only k-mers for which ``sub(S, d, x)`` is true, with S
            taken from -S (default 0); decisions are cached.
      -C f  FASTA file; only reads sharing at least one k-mer with it are
            counted.
      -v    passed to ``reads`` as ``verbose``.

    The output metadata records K, the count histogram, the relative
    frequencies of the low two bits of every k-mer read, and the number of
    reads.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        cacheYes = set([])
        cacheNo = set([])

    B = opts['-C']
    if B is not None:
        xs = set([])
        for (nm, seq) in readFasta(openFile(B)):
            xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    with casket(tmpnm, 'w') as z:
        nr = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         pairs=False,
                         reads=False,
                         kmers=True,
                         both=True,
                         verbose=verbose):
            xs = itm.kmers[0]
            # acgt covers every k-mer read, including ones filtered below.
            for x in xs:
                acgt[x & 3] += 1
            if d is not None:
                for x in xs:
                    if x in cacheNo:
                        continue
                    if x not in cacheYes:
                        if not sub(S, d, x):
                            cacheNo.add(x)
                            continue
                        cacheYes.add(x)
                    buf.add(x)
                # Keep the decision caches from growing without bound.
                if len(cacheYes) > 1000000:
                    cacheYes = set([])
                if len(cacheNo) > 1000000:
                    cacheNo = set([])
            elif B is not None:
                if any(x in B for x in xs):
                    buf.addList(xs)
            else:
                buf.addList(xs)

            nr += 1
            if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                buf.clear()

        # If anything was spilled, spill the remainder too so that the merge
        # below only has to look at the temporary casket.
        if len(tmps) and len(buf):
            fn = 'tmps-%d' % (len(tmps), )
            tmps.append(fn)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
            buf = []

    with zotk.kmers(out, 'w') as z:
        h = {}

        def tally(xcs):
            # Pass (k-mer, count) pairs through while building the histogram.
            for (x, c) in xcs:
                h[c] = 1 + h.get(c, 0)
                yield (x, c)

        if len(tmps) == 0:
            for c in buf.countsOnly():
                h[c] = 1 + h.get(c, 0)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
        elif len(tmps) == 1:
            # A single spill is copied straight across.  The histogram has to
            # be tallied on the way; it used to be stored empty in this case.
            with casket(tmpnm, 'r') as z0:
                writeKmersAndCounts(
                    z, tally(readKmersAndCounts(z0, tmps[0])))
        else:
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, z)
        n = float(sum(acgt))
        acgt = [c / n for c in acgt]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = acgt
        z.meta['reads'] = nr
    os.remove(tmpnm)
Пример #15
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    fns = opts['<input>']

    p = None
    if opts['-p'] is not None:
        p = float(opts['-p'])

    if len(fns) == 1 and isFasta(fns[0]):
        K = 25
        seqs = []
        with openFile(fns[0]) as f:
            for (nm, seq) in readFasta(f):
                xs = set(basics.kmers(K, seq, True))
                xs = list(xs)
                xs.sort()
                xs = array.array('L', xs)
                seqs.append((nm.split()[0], xs))
        Z = 1
        if opts['-a']:
            Z = len(seqs)

        print len(seqs)

        for i in xrange(Z):
            xnm = seqs[i][0]
            xs = seqs[i][1]
            for j in xrange(i + 1, len(seqs)):
                ynm = seqs[j][0]
                ys = seqs[j][1]
                (isec, union, d) = jaccard(xs, ys)
                if p is None:
                    print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % (
                        xnm, ynm, len(xs), len(ys), isec, union, d)
                else:
                    pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10)
                    q05 = quantBeta(0.05, isec + 1, (union - isec) + 1)
                    q95 = quantBeta(0.95, isec + 1, (union - isec) + 1)
                    print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % (
                        xnm, ynm, len(xs), len(ys), isec, union, d, d - q05,
                        q95 - d, pv)
                sys.stdout.flush()

        return

    Z = 1
    if opts['-a']:
        Z = len(fns)

    for i in xrange(Z):
        with kmers(fns[i], 'r') as z0:
            xK = z0.meta['K']
            xs = array.array('L', readKmers(z0))
            for j in xrange(i + 1, len(fns)):
                with kmers(fns[j], 'r') as z1:
                    yK = z1.meta['K']
                    ys = array.array('L', readKmers(z1))
                    if xK != yK:
                        print >> sys.stderr, 'mismatched K:', fns[j]
                        sys.exit(1)
                    (isec, union, d) = jaccard(xs, ys)
                    if p is None:
                        print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % (
                            fns[i], fns[j], len(xs), len(ys), isec, union, d)
                    else:
                        pv = logIx(p, isec + 1,
                                   (union - isec) + 1) / math.log(10)
                        q05 = quantBeta(0.05, isec + 1, (union - isec) + 1)
                        q95 = quantBeta(0.95, isec + 1, (union - isec) + 1)
                        print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % (
                            fns[i], fns[j], len(xs), len(ys), isec, union, d,
                            d - q05, q95 - d, pv)
                    sys.stdout.flush()
Пример #16
0
def main(argv):
    """Compare, per shared context, the k-mer counts of the inputs.

    K is taken from the inputs via ``getK`` and J = K - 1, so the mask M
    selects the low 2 * (K - J) = 2 bits of a k-mer.  ``group``/``Group``
    yield triples whose first element is the context printed with
    ``render(J, ...)`` and whose last element is a list of (k-mer, count)
    pairs.

    With -r, every input is compared against the reference container: for
    each context a line is printed when at least one of the M + 1 slots has
    a ``logBinGe`` value below -10.  Without -r, all inputs are merged in
    parallel and a line is printed for every context on which the inputs
    disagree.
    """
    opts = docopt.docopt(__doc__, argv)

    K = getK(opts['<input>'])
    J = K - 1
    M = (1 << (2 * (K - J))) - 1

    if opts['-r'] is not None:
        # The reference groups are held in memory as a list.
        with kmers(opts['-r'], 'r') as z:
            xs = list(group(K, J, 0, readKmersAndCounts(z)))

        for fn in opts['<input>']:
            with kmers(fn, 'r') as z:
                samXs = readKmersAndCounts(z)
                i = 0
                for (yCtx, _, yGrp) in group(K, J, 0, samXs):
                    # Advance through the reference to the same context; the
                    # asserts require every sample context to be present.
                    while i < len(xs) and xs[i][0] < yCtx:
                        i += 1
                    assert i < len(xs)
                    assert xs[i][0] == yCtx
                    # gx/gt: reference counts per slot and their total;
                    # sx/st: the same for the sample.
                    gt = float(sum([c for (x,c) in xs[i][2]]))
                    gx = [0 for j in xrange(M+1)]
                    for (x,c) in xs[i][2]:
                        gx[x&M] = c
                    st = sum([c for (x,c) in yGrp])
                    sx = [0 for j in xrange(M+1)]
                    for (x,c) in yGrp:
                        sx[x&M] = c
                    ss = []
                    b = 0
                    for j in xrange(M+1):
                        p = float(gx[j])/gt
                        v = 0.0
                        # Slots with reference proportion 0 or 1 are not
                        # tested and report 0.0.
                        if 0.0 < p and p < 1.0:
                            v = logBinGe(p, st, sx[j])
                            if v < -10:
                                b |= 1 << j
                        ss.append('%3.2g' % (v,))
                    if b > 0:
                        print '%s\t%s\t%s' % (render(J, yCtx), fasta(b), '\t'.join(ss))
                    i += 1
        return

    # Parse files in parallel to get global distribution

    N = len(opts['<input>'])
    h = heap.heap()
    i = 0
    # One Group cursor per input, numbered from 1.
    for fn in opts['<input>']:
        (_, xs) = kfset.read(fn)
        i += 1
        h.push(Group(K, J, i, xs))

    while len(h) > 0:
        # Pop every cursor currently positioned on the same context gy,
        # collecting its group and re-queueing it if it has more to give.
        xfs = []
        g = h.pop()
        gy = g.this()[0]
        xfs.append(g.this())
        g.next()
        if g.valid():
            h.push(g)
        for x in h.xs:
            assert x.valid()
        while len(h) > 0 and h.front().this()[0] == gy:
            g = h.pop()
            xfs.append(g.this())
            g.next()
            if g.valid():
                h.push(g)
            for i in xrange(len(h.xs)):
                assert h.xs[i].valid()

        # gc: counts per slot summed over all inputs having this context;
        # ds: (input number, per-slot counts) for each such input.
        ds = []
        gc = [0 for i in xrange(M+1)]
        for (_, n, xc) in xfs:
            t = sum([c for (x,c) in xc])
            d = [0 for i in xrange(M+1)]
            for (x,c) in xc:
                j = x & M
                gc[j] += c
                d[j] = c
            ds.append((n, d))

        # Inputs that lack the context keep the '*' placeholder.
        res = ['*' for i in xrange(N)]
        seen = set([])
        gt = float(sum(gc))
        for (n, d) in ds:
            t = sum(d)
            # One 4-bit mask per group of four slots.
            b = [0 for i in xrange((M+1)/4)]
            for i in xrange(M+1):
                p = float(gc[i])/gt
                if 0.0 < p and p < 1.0:
                    #vL = logBinLe(p, t, d[i])
                    #vG = logBinGe(p, t, d[i])
                    #v = min(vL, vG)
                    v = logBinGe(p, t, d[i])
                    if v > -10:
                        w = i >> 2
                        j = i & 3
                        b[w] |= 1 << j
            res[n-1] = ''.join([fasta(b0) for b0 in b])
            seen.add(res[n-1])
        # Only contexts on which the inputs produced different strings are
        # reported.
        if len(seen) > 1:
            print '%s\t%s' % (render(J, gy), '\t'.join(res))
Пример #17
0
def main(argv):
    """Merge counted k-mer containers into <output>.

    ``pairs`` groups the <input> names into tuples of one or two files.
    When there is only one tuple its file(s) are merged straight into the
    output.  Otherwise every tuple is first merged into an entry of a
    temporary casket and the entries are then combined with ``mergeNinto``.

    The output metadata records K, the count histogram and the 'acgt'
    frequencies.  Exits with status 1 if the inputs disagree on K.
    """
    opts = docopt.docopt(__doc__, argv)

    K = None

    out = opts['<output>']

    px = list(pairs(opts['<input>']))
    if len(px) == 1:
        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            ix = px[0]
            if len(ix) == 1:
                with kmers(ix[0], 'r') as z0:
                    K = z0.meta['K']
                    xs = readKmersAndCounts(z0)
                    zs = hist(xs, h, acgt)
                    # Write the stream returned by hist(), as the two-file
                    # branch does; writing xs bypassed the tallying.
                    writeKmersAndCounts(z, zs)
            else:
                with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                    K = z0.meta['K']
                    K1 = z1.meta['K']
                    if K1 != K:
                        print >> sys.stderr, "mismatched K"
                        sys.exit(1)
                    xs = readKmersAndCounts(z0)
                    ys = readKmersAndCounts(z1)
                    zs = hist(merge(xs, ys), h, acgt)
                    writeKmersAndCounts(z, zs)
            n = float(sum(acgt))
            acgt = [c/n for c in acgt]
            # Same metadata as the multi-input path below; K and the stream
            # entries used to be left out here.
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
        return

    tmps = []
    tmpnm = tmpfile('.pmc')
    with casket(tmpnm, 'w') as z:
        for ix in px:
            nm = 'tmp-' + str(len(tmps))
            tmps.append(nm)
            if len(ix) == 1:
                with kmers(ix[0], 'r') as z0:
                    if K is None:
                        K = z0.meta['K']
                    else:
                        K0 = z0.meta['K']
                        if K0 != K:
                            print >> sys.stderr, "mismatched K"
                            sys.exit(1)
                    xs = readKmersAndCounts(z0)
                    writeKmersAndCounts(z, xs, nm)
            else:
                with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                    if K is None:
                        K = z0.meta['K']
                    # Check both files every time: the second file of the
                    # very first pair used to escape the comparison.
                    for zi in (z0, z1):
                        if zi.meta['K'] != K:
                            print >> sys.stderr, "mismatched K"
                            sys.exit(1)
                    xs = readKmersAndCounts(z0)
                    ys = readKmersAndCounts(z1)
                    writeKmersAndCounts(z, merge(xs, ys), nm)

    assert K is not None

    with kmers(out, 'w') as z:
        h = {}
        acgt = [0, 0, 0, 0]
        with casket(tmpnm, 'r') as z0:
            xss = [readKmersAndCounts(z0, t) for t in tmps]
            mergeNinto(K, xss, h, acgt, z)
        n = float(sum(acgt))
        acgt = [c/n for c in acgt]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = acgt

    os.remove(tmpnm)