예제 #1
0
def writeKmersAndCounts(z, xs, nm=None):
    """Write a combined k-mer/count stream into the container ``z``.

    ``xs`` is handed to ``demuxKmersAndCounts`` together with an open
    scratch file; the stream it returns is passed on to ``writeKmers``,
    and the scratch file is afterwards added to ``z`` as the counts entry.

    Parameters:
        z:  container object providing ``add_file(name, path)``.
        xs: the items to write (presumably (k-mer, count) pairs --
            confirm against ``demuxKmersAndCounts``).
        nm: optional name prefix.  Entries are called ``<nm>-kmers`` and
            ``<nm>-counts``, or plain ``kmers`` / ``counts`` when omitted.
    """
    if nm is None:
        xNm = 'kmers'
        cNm = 'counts'
    else:
        xNm = nm + '-kmers'
        cNm = nm + '-counts'
    t = tmpfile()
    try:
        with open(t, 'w') as f:
            ys = demuxKmersAndCounts(xs, f)
            writeKmers(z, ys, xNm)
        # The scratch file is closed here, before it is handed to add_file.
        z.add_file(cNm, t)
    finally:
        # Remove the scratch file even when writing failed part-way, so a
        # failed run does not leave temporary files behind.
        if os.path.exists(t):
            os.remove(t)
예제 #2
0
def test_rwVector():
    """Round-trip a vector of random integers through a casket file.

    The vector is written under the name 'quux', read back as a list, and
    compared element by element with what was written.
    """
    count = 65536

    # Fixed seed keeps the generated vector reproducible between runs.
    random.seed(17)
    expected = [int(0.5 + 10 * random.expovariate(0.5))
                for _ in xrange(count)]

    with autoremove():
        path = tmpfile()

        with casket(path, 'w') as out:
            writeVector(out, expected, 'quux')

        with casket(path, 'r') as inp:
            actual = readVectorList(inp, 'quux')

        assert len(expected) == len(actual)
        for pos, want in enumerate(expected):
            assert want == actual[pos]
예제 #3
0
def test_kmersList():
    """Round-trip a sorted list of random 25-mers through a casket file.

    The values are drawn from [0, 4**K - 1], sorted, written with
    ``writeKmersList`` and read back with ``readKmers``; the two sequences
    must match element for element.
    """
    K = 25
    largest = (1 << (2 * K)) - 1
    count = 65536

    # Fixed seed keeps the generated k-mers reproducible between runs.
    random.seed(17)
    expected = sorted(random.randint(0, largest) for _ in xrange(count))

    with autoremove():
        path = tmpfile()

        with casket(path, 'w') as out:
            # Give the writer its own copy of the list.
            writeKmersList(out, list(expected))

        with casket(path, 'r') as inp:
            actual = list(readKmers(inp))

        assert len(expected) == len(actual)
        for pos, want in enumerate(expected):
            got = actual[pos]
            assert want == got, '%d\t%s\t%s' % (
                pos, render(K, want), render(K, got))
예제 #4
0
def mergeNinto(K, xss, hist, acgt, z, nm = None):
    """Merge several k-mer/count streams and write the result into ``z``.

    Each input stream is wrapped in a ``_kmerRadixBlockStream`` and the
    wrappers are kept in a heap.  All blocks at the front of the heap that
    share the same ``radix`` are combined by summing the counts of equal
    k-mers; the combined block is then emitted in sorted k-mer order.

    Parameters:
        K:    k-mer length, passed through to ``_kmerRadixBlockStream``.
        xss:  sequence of input streams to merge.
        hist: dict updated in place; ``hist[c]`` is incremented once for
              every emitted k-mer whose merged count is ``c``.
        acgt: 4-element list updated in place; ``acgt[x & 3]`` grows by
              the merged count of each emitted k-mer ``x`` (the low two
              bits presumably encode the last base -- confirm).
        z:    output container providing ``add_stream`` and ``add_file``.
        nm:   optional name prefix.  Entries are called ``<nm>-kmers`` and
              ``<nm>-counts``, or plain ``kmers`` / ``counts`` when omitted.
    """
    if nm is None:
        knm = 'kmers'
        cnm = 'counts'
    else:
        knm = nm + '-kmers'
        cnm = nm + '-counts'

    # K-mers go straight into the container; counts are collected in a
    # scratch file that is added to the container afterwards.
    t = tmpfile()
    try:
        with z.add_stream(knm) as kf, open(t, 'w') as cf:
            with kmerWriter(kf) as kx, countsWriter(cf) as cx:
                ss = [_kmerRadixBlockStream(K, xs) for xs in xss]
                q = heap(ss)
                while len(q) > 0:
                    v = q.front()
                    r = v.radix
                    ys = {}
                    # Drain every stream whose current block has radix r.
                    while len(q) > 0 and v.radix == r:
                        for (x, c) in v.xs:
                            ys[x] = c + ys.get(x, 0)
                        v.next()
                        if v.done():
                            # Exhausted stream: drop it from the heap.
                            q.pop()
                            if len(q) > 0:
                                v = q.front()
                        else:
                            # The stream advanced: restore the heap order
                            # before looking at the new front.
                            q.modifyfront()
                            v = q.front()
                    for (x, c) in sorted(ys.items()):
                        hist[c] = 1 + hist.get(c, 0)
                        acgt[x&3] += c
                        kx.append(x)
                        cx.append(c)
        z.add_file(cnm, t)
    finally:
        # Remove the scratch counts file even when the merge failed, so a
        # failed run does not leave temporary files behind.
        if os.path.exists(t):
            os.remove(t)
예제 #5
0
 def __init__(self, z, zfn, comp):
     """Store the constructor arguments and open a scratch file.

     ``z``, ``zfn`` and ``comp`` are kept unchanged as attributes of the
     same names.  A temporary file with the suffix '.szf' is created via
     ``tmpfile``; its name is stored in ``self.tfn`` and the handle,
     opened for writing, in ``self.tf``.
     """
     self.z, self.zfn, self.comp = z, zfn, comp
     scratch = tmpfile('.szf')
     self.tfn = scratch
     self.tf = open(scratch, 'w')
예제 #6
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    K = 25

    nms = []
    idx = {}
    for (nm, seq) in readFasta(openFile(opts['<baits>'])):
        n = len(nms)
        nms.append(nm)
        for x in kmersList(K, seq, True):
            if x not in idx:
                idx[x] = set([])
            idx[x].add(n)

    for x in idx.keys():
        idx[x] = list(idx[x])
        idx[x].sort()

    anti = set([])
    if opts['-U']:
        with openFile(opts['-U']) as f:
            for (nm, seq) in readFasta(f):
                for x in kmersList(K, seq, True):
                    anti.add(x)

    rn = 0
    if opts['-p']:

        hist = {}
        for (fn1, fn2) in pairs(opts['<input>']):
            tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq'))
                    for i in xrange(len(nms))]
            cache = [[[], []] for i in xrange(len(nms))]
            counts = [0 for i in xrange(len(nms))]
            with openFile(fn1) as f1, openFile(fn2) as f2:
                for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                    hits = set([])
                    pushup = False
                    for x in kmersList(K, fq1[1]):
                        if x in anti:
                            pushup = True
                            break
                        for i in idx.get(x, []):
                            hits.add(i)
                    for x in kmersList(K, fq2[1]):
                        if x in anti:
                            pushup = True
                            break
                        for i in idx.get(x, []):
                            hits.add(i)
                    if pushup:
                        continue
                    n = len(hits)
                    hist[n] = 1 + hist.get(n, 0)
                    for i in hits:
                        counts[i] += 1
                        cache[i][0].append(fq1)
                        cache[i][1].append(fq2)
                        if len(cache[i][0]) >= 1024:
                            with open(tmps[i][0], 'a') as f:
                                for rd in cache[i][0]:
                                    print >> f, rd[0]
                                    print >> f, rd[1]
                                    print >> f, rd[2]
                                    print >> f, rd[3]
                            with open(tmps[i][1], 'a') as f:
                                for rd in cache[i][1]:
                                    print >> f, rd[0]
                                    print >> f, rd[1]
                                    print >> f, rd[2]
                                    print >> f, rd[3]
                            cache[i][0] = []
                            cache[i][1] = []
            for i in xrange(len(cache)):
                if len(cache[i][0]) > 0:
                    with open(tmps[i][0], 'a') as f:
                        for rd in cache[i][0]:
                            print >> f, rd[0]
                            print >> f, rd[1]
                            print >> f, rd[2]
                            print >> f, rd[3]
                    with open(tmps[i][1], 'a') as f:
                        for rd in cache[i][1]:
                            print >> f, rd[0]
                            print >> f, rd[1]
                            print >> f, rd[2]
                            print >> f, rd[3]
                    cache[i][0] = []
                    cache[i][1] = []
            with zipfile.ZipFile(opts['<output>'], 'w',
                                 zipfile.ZIP_DEFLATED) as z:
                for i in xrange(len(nms)):
                    if counts[i] > 0:
                        pth = '/'.join(nms[i].split())
                        z.write(tmps[i][0], pth + '/' + fn1)
                        os.remove(tmps[i][0])
                        z.write(tmps[i][1], pth + '/' + fn2)
                        os.remove(tmps[i][1])
        hist = hist.items()
        hist.sort()
        for (n, f) in hist:
            print '%d\t%d' % (n, f)
    else:
        raise "not implemented"
예제 #7
0
파일: kmerize.py 프로젝트: drtconway/zotmer
def main(argv):
    """Count the k-mers of the input reads and write them to ``<output>``.

    K-mers are accumulated in a ``KmerAccumulator2``.  Whenever the
    accumulator reports at least half of the memory budget in use (checked
    every 1024 items), its contents are flushed as a numbered entry into a
    temporary casket; at the end the flushed entries are merged into the
    output, or the accumulator is written directly if nothing was flushed.

    Options read from the docopt result:
        <k>       k-mer length.
        <input>   inputs, passed to ``reads``.
        <output>  name of the k-mer file to create.
        -v        verbose flag, passed to ``reads``.
        -m N      memory budget is N * 1024 * 1024 (default 32), in the
                  units ``buf.mem()`` reports (presumably bytes).
        -D d      keep only the k-mers accepted by ``sub(S, d, x)``
                  (presumably deterministic subsampling -- confirm).
        -S s      integer passed to ``sub`` as ``S`` (default 0); only
                  read when -D is given.
        -C file   FASTA file; only items sharing at least one k-mer with
                  its sequences are counted.  Ignored when -D is given.

    Metadata stored in the output: K, the entry names, the count histogram,
    the ``acgt`` fractions and the number of items read.

    Parameters:
        argv: command-line arguments, parsed by docopt against the module
              docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        # Remember recent sub() decisions so repeated k-mers are cheap.
        cacheYes = set([])
        cacheNo = set([])

    B = opts['-C']
    if B is not None:
        xs = set([])
        for (nm, seq) in readFasta(openFile(B)):
            xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            nr = 0
            for itm in reads(opts['<input>'],
                             K=K,
                             pairs=False,
                             reads=False,
                             kmers=True,
                             both=True,
                             verbose=verbose):
                xs = itm.kmers[0]
                # acgt is tallied over every k-mer seen, before filtering.
                for x in xs:
                    acgt[x & 3] += 1
                if d is not None:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                    # Bound the memory used by the decision caches.
                    if len(cacheYes) > 1000000:
                        cacheYes = set([])
                    if len(cacheNo) > 1000000:
                        cacheNo = set([])
                elif B is not None:
                    if any(x in B for x in xs):
                        buf.addList(xs)
                else:
                    buf.addList(xs)

                nr += 1
                if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                    fn = 'tmps-%d' % (len(tmps), )
                    tmps.append(fn)
                    writeKmersAndCounts2(z, buf.kmersOnly(),
                                         buf.countsOnly(), fn)
                    buf.clear()

            # If anything was flushed already, flush the remainder too so
            # that the merge below sees all of the data.
            if len(tmps) and len(buf):
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(),
                                     buf.countsOnly(), fn)
                buf = []

        with zotk.kmers(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                for c in buf.countsOnly():
                    h[c] = 1 + h.get(c, 0)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
            elif len(tmps) == 1:
                # NOTE(review): this branch never fills h, so the stored
                # histogram is empty when exactly one entry was flushed --
                # confirm whether that is intended.
                with casket(tmpnm, 'r') as z0:
                    writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
            else:
                with casket(tmpnm, 'r') as z0:
                    xss = [readKmersAndCounts(z0, t) for t in tmps]
                    mergeNinto(K, xss, h, z)
            total = float(sum(acgt))
            if total > 0:
                acgt = [c / total for c in acgt]
            else:
                # No k-mers at all: avoid dividing by zero.
                acgt = [0.0, 0.0, 0.0, 0.0]
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        # Remove the temporary casket even when the run failed.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
예제 #8
0
def _checkedK(K, z):
    """Return the K stored in ``z``; exit with status 1 if it differs from a non-None ``K``."""
    k = z.meta['K']
    if K is not None and k != K:
        print >> sys.stderr, "mismatched K"
        sys.exit(1)
    return k


def _acgtFractions(acgt):
    """Return ``acgt`` scaled to sum to 1, or all zeros when its sum is zero."""
    total = float(sum(acgt))
    if total == 0:
        return [0.0 for c in acgt]
    return [c / total for c in acgt]


def main(argv):
    """Merge one or more k-mer files into the single file ``<output>``.

    ``pairs(<input>)`` groups the inputs into tuples of one or two file
    names.  With a single group the file(s) are merged straight into the
    output.  With several groups each group is first merged into an entry
    of a temporary casket and the entries are then combined with
    ``mergeNinto``.  All inputs must record the same K; otherwise
    "mismatched K" is printed to stderr and the program exits with
    status 1.

    Metadata stored in the output: K, the entry names, the count histogram
    and the ``acgt`` fractions.

    Parameters:
        argv: command-line arguments, parsed by docopt against the module
              docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    K = None

    out = opts['<output>']

    px = list(pairs(opts['<input>']))
    if len(px) == 1:
        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            ix = px[0]
            if len(ix) == 1:
                with kmers(ix[0], 'r') as z0:
                    K = _checkedK(None, z0)
                    xs = readKmersAndCounts(z0)
                    zs = hist(xs, h, acgt)
                    # Write the stream that went through hist(), as the
                    # two-file branch does, so h and acgt get filled.
                    writeKmersAndCounts(z, zs)
            else:
                with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                    K = _checkedK(None, z0)
                    _checkedK(K, z1)
                    xs = readKmersAndCounts(z0)
                    ys = readKmersAndCounts(z1)
                    zs = hist(merge(xs, ys), h, acgt)
                    writeKmersAndCounts(z, zs)
            # Record the same metadata as the multi-group path below.
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = _acgtFractions(acgt)
        return

    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                if len(ix) == 1:
                    with kmers(ix[0], 'r') as z0:
                        K = _checkedK(K, z0)
                        xs = readKmersAndCounts(z0)
                        writeKmersAndCounts(z, xs, nm)
                else:
                    with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                        # Check both files, including for the very first
                        # group, where K is not yet established.
                        K = _checkedK(K, z0)
                        _checkedK(K, z1)
                        xs = readKmersAndCounts(z0)
                        ys = readKmersAndCounts(z1)
                        writeKmersAndCounts(z, merge(xs, ys), nm)

        assert K is not None

        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, acgt, z)
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = _acgtFractions(acgt)
    finally:
        # Remove the temporary casket even on error or sys.exit().
        if os.path.exists(tmpnm):
            os.remove(tmpnm)