Пример #1
0
def prepareBedFileGeneTx(geneListFn, refGeneFn, outFn):
    """
    Write one BED-like line per exon for a selected set of transcripts.

    `geneListFn` names a whitespace-separated file whose first column
    is a gene name and whose second column is a transcript identifier.

    `refGeneFn` names a whitespace-separated annotation table that is
    read positionally: column 1 is the transcript identifier, column 2
    the chromosome, column 3 the strand, columns 9 and 10 the
    comma-separated exon starts and ends, and column 12 the gene name
    (presumably the UCSC refGene layout - TODO confirm).

    For every annotation row whose transcript was requested, each exon
    is written to `outFn` as

        chrom <TAB> start <TAB> end <TAB> gene/NN <TAB> strand

    where NN is the 1-based exon number, counted in reverse order of
    the table for rows on the '-' strand. Requested transcripts that
    never show up in the annotation are reported on stderr.

    Blank lines in either input are ignored.
    """
    transcripts = {}
    with openFile(geneListFn) as f:
        for l in f:
            t = l.split()
            if len(t) == 0:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            transcripts[t[1]] = t[0]

    found = set()
    with openFile(refGeneFn) as f, openFile(outFn, 'w') as out:
        for l in f:
            t = l.split()
            if len(t) == 0:
                continue
            tx = t[1]
            if tx not in transcripts:
                continue
            ch = t[2]
            st = t[3]
            g = t[12]
            found.add(tx)
            # The start/end lists end with a trailing comma, hence the
            # filter on empty fields.
            ss = [int(s) for s in t[9].split(',') if len(s) > 0]
            ee = [int(e) for e in t[10].split(',') if len(e) > 0]
            # Materialize the pairs so they can be reversed regardless
            # of whether zip() returns a list or an iterator.
            exs = list(zip(ss, ee))
            if st == '-':
                exs.reverse()
            for (j, (s, e)) in enumerate(exs, 1):
                print >> out, '%s\t%d\t%d\t%s/%02d\t%s' % (ch, s, e, g, j, st)

    for tx in sorted(set(transcripts.keys()) - found):
        print >> sys.stderr, 'transcipt not found: %s\t(%s)' % (
            tx, transcripts[tx])
Пример #2
0
    def download(self, acc, path):
        """
        Fetch the record for accession `acc` and store it at `path`.

        A GET request is sent to `self.base` with the query parameters
        db=nuccore, rettype=`self.rettype`, retmode=text and id=`acc`.
        HTTP error statuses raise via `raise_for_status()`.

        Responses that announce a Content-Length of at most 64 MiB are
        written in a single call; anything else (larger, or of unknown
        length) is streamed to the file chunk by chunk.
        """
        print >> sys.stderr, 'downloading %s -> %s' % (acc, path)

        qry = {}
        qry['db'] = 'nuccore'
        qry['rettype'] = self.rettype
        qry['retmode'] = 'text'
        qry['id'] = acc

        # Max size to slurp: 64 MiB. (The last factor used to be written
        # as 0124, which Python 2 reads as octal 84, giving ~5.25 MiB.)
        MB64 = 64 * 1024 * 1024

        with requests.get(self.base, params=qry, stream=True) as r:

            # deal with HTTP errors
            r.raise_for_status()

            # write "small" files in one go.
            if 'Content-Length' in r.headers and int(
                    r.headers['Content-Length']) <= MB64:
                with openFile(path, 'w') as f:
                    f.write(r.content)
                return

            # grab "big" files a piece at a time.
            with openFile(path, 'w') as f:
                for chk in r.iter_content(chunk_size=None):
                    f.write(chk)
Пример #3
0
def indexBedFiles(bedFn, sf):
    """
    Generate one record per line of the BED-like file `bedFn`.

    Each input line supplies, in order: chromosome, start, end, a
    'gene/exon' label and the strand. Lines are grouped by chromosome;
    chromosomes are visited in sorted order and, within a chromosome,
    entries are visited in sorted tuple order.

    `sf` is indexed by chromosome name to obtain the sequence, from
    which `seq[start:end]` is taken, upper-cased, and passed through
    `revComp` for '-' strand entries.

    Yields dicts with the keys 'gene', 'exon', 'chr', 'st', 'en',
    'strand' and 'seq'. Progress is reported on stderr when the
    module-level `verbose` flag is set.
    """
    global verbose

    byChrom = {}
    with openFile(bedFn) as bed:
        for line in bed:
            fields = line.split()
            chrom = fields[0]
            begin = int(fields[1])
            end = int(fields[2])
            label = fields[3]
            strand = fields[4]
            (gene, exon) = label.split('/')
            byChrom.setdefault(chrom, []).append(
                (begin, end, strand, gene, exon))

    for chrom in sorted(byChrom):
        if verbose:
            print >> sys.stderr, 'processing %s' % (chrom, )

        chromSeq = sf[chrom]
        for (begin, end, strand, gene, exon) in sorted(byChrom[chrom]):
            bases = chromSeq[begin:end].upper()
            if strand == '-':
                bases = revComp(bases)
            yield {
                'gene': gene,
                'exon': exon,
                'chr': chrom,
                'st': begin,
                'en': end,
                'strand': strand,
                'seq': bases,
            }
Пример #4
0
def buildIndex(K, inputs, output):
    """
    Create a new k-mer index and write it to `output`.

    Every FASTA file named in `inputs` is read. For each sequence the
    `K`-length k-mers (extracted with the both-strands flag set) are
    sorted and de-duplicated, and the union over all sequences forms
    the k-mer dictionary. The index maps each k-mer to the numbers of
    the sequences containing it (numbering from 0, in input order).

    Stored in the container:
      * the k-mers themselves (via `writeKmers`),
      * 'offsets'  - for each k-mer rank, where its postings start,
      * 'postings' - the sequence numbers, grouped by k-mer rank,
      * 'lens'     - the number of distinct k-mers per sequence,
      * meta 'names' - the sequence names, indexed by sequence number.
    """
    records = []
    for path in inputs:
        with openFile(path) as fasta:
            records.extend(readFasta(fasta))

    names = []
    lens = array.array('I')
    pooled = []
    for (pos, (name, seq)) in enumerate(records):
        names.append(name)
        own = sorted(kmers(K, seq, True))
        uniq(own)
        # Swap the raw sequence for its k-mer list; only the k-mers are
        # needed from here on.
        records[pos] = [name, own]
        lens.append(len(own))
        pooled += own
    pooled.sort()
    uniq(pooled)
    S = sparse(2 * K, pooled)

    # Pass 1: count how many sequences contain each k-mer (by rank).
    T = array.array('I', [0]) * (S.count() + 1)
    for (_, own) in records:
        for x in own:
            T[S.rank(x)] += 1

    # Turn the counts into start offsets (exclusive prefix sums).
    total = 0
    for r in xrange(len(T)):
        count = T[r]
        T[r] = total
        total += count

    # Pass 2: drop each sequence number into the next free slot of its
    # k-mer's posting range. 'H' stores each number as an unsigned
    # short.
    cursor = list(T)
    U = array.array('H', [0]) * total
    for (seqNum, (_, own)) in enumerate(records):
        for x in own:
            r = S.rank(x)
            U[cursor[r]] = seqNum
            cursor[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        z.meta['T'] = write32(z, T, 'offsets')
        z.meta['U'] = write16(z, U, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = names
Пример #5
0
    def __getitem__(self, acc):
        """
        Return an opened file for accession `acc`.

        The location comes from `self.makePath`. If no regular file
        exists there yet, the containing directory is created when
        missing and the record is fetched with `self.download` first.
        """
        target = self.makePath(acc, self.compression)

        if os.path.isfile(target):
            return openFile(target)

        (folder, _leaf) = self.makePathComponents(acc, self.compression)
        if not os.path.isdir(folder):
            os.makedirs(folder)
        self.download(acc, target)

        return openFile(target)
Пример #6
0
def main(argv):
    """
    Command-line entry point: sort reads into per-bait caches.

    The bait sequences in `<sequences>` are broken into `-k` length
    k-mers, each remembered with the numbers of the baits containing
    it. Every item produced by `reads(...)` for `<input>` is then added
    to the `ReadCache` of each bait it shares at least one k-mer with.
    All caches are ended once the input is exhausted.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    B = int(opts['-b'])
    paired = opts['-p']
    verbose = opts['-v']
    Z = opts['-z']

    names = []
    seqs = []
    baits = {}
    with openFile(opts['<sequences>']) as fasta:
        for (seqNum, (name, seq)) in enumerate(readFasta(fasta)):
            names.append(name)
            seqs.append(seq)
            for x in kmersList(K, seq, True):
                owners = baits.get(x)
                if owners is None:
                    owners = baits[x] = set([])
                owners.add(seqNum)

    caches = [ReadCache(opts['-P'], name, paired, B, Z) for name in names]

    # Tallies of reads seen / reads that hit a bait; they are counted
    # but not reported anywhere in this function.
    numReads = 0
    numHitReads = 0
    source = reads(opts['<input>'],
                   reads=True,
                   kmers=True,
                   fwdOnly=True,
                   paired=paired,
                   verbose=verbose)
    for itm in source:
        numReads += 1

        hits = set([])
        for fwd in itm.kmers:
            for x in fwd:
                if x in baits:
                    hits |= baits[x]

        for seqNum in hits:
            caches[seqNum].add(itm.reads)

        if len(hits) > 0:
            numHitReads += 1

    for cache in caches:
        cache.end()
Пример #7
0
 def flush(self):
     """
     Append every buffered read to its file and empty the buffers.

     For each of the `self.N` slots that holds at least one read, the
     file `self.names[slot]` is opened in append mode and the first
     four fields of every buffered read are written, one per line.
     The slot's buffer is then replaced by an empty list.
     """
     for slot in range(self.N):
         pending = self.buffers[slot]
         if len(pending) > 0:
             with openFile(self.names[slot], 'a') as sink:
                 for rd in pending:
                     for part in (0, 1, 2, 3):
                         print >> sink, rd[part]
             self.buffers[slot] = []
Пример #8
0
 def __getitem__(self, acc):
     """
     Return the sequence for accession `acc`, caching the last one.

     The accession is normalized with `normalizeAccession` and the
     first record of `<home>/<acc>.fa` (or `<home>/<acc>.fa.gz` when
     the uncompressed file does not exist) is returned. The most
     recently loaded sequence is kept in `self.prevSeq`, keyed by its
     normalized accession in `self.prevAcc`, so repeated lookups of
     the same accession do not touch the file system.
     """
     if acc == self.prevAcc:
         return self.prevSeq

     # The cache key is the *normalized* accession, so the comparison
     # has to be repeated after normalizing; otherwise any spelling
     # that normalization changes would reload the file on every call.
     acc = normalizeAccession(acc)
     if acc == self.prevAcc:
         return self.prevSeq

     pth = self.home + '/' + acc + '.fa'
     if not os.path.exists(pth):
         pth = pth + '.gz'
     with openFile(pth) as f:
         # Only the first record of the file is used.
         # NOTE(review): if the file holds no records the previously
         # cached sequence is returned unchanged - confirm that this
         # cannot happen with the files in `self.home`.
         for (nm, seq) in readFasta(f):
             self.prevAcc = acc
             self.prevSeq = seq
             break
     return self.prevSeq
Пример #9
0
    def __getitem__(self, acc):
        """
        Return the sequence for accession `acc`, caching the last one.

        `acc` must be a key of `hgvs.refSeq2Hg19`; otherwise a message
        is printed on stderr and the assertion fails. The mapped name
        selects `<home>/<name>.fa.gz`, whose first record becomes the
        cached (and returned) sequence.
        """
        if acc == self.prevAcc:
            return self.prevSeq

        known = acc in hgvs.refSeq2Hg19
        if not known:
            print >> sys.stderr, "accession %s not available." % (acc)
        assert known

        fastaPath = self.home + "/" + hgvs.refSeq2Hg19[acc] + ".fa.gz"
        with openFile(fastaPath) as fasta:
            # Only the first record is of interest.
            for (name, seq) in readFasta(fasta):
                self.prevAcc = acc
                self.prevSeq = seq
                break
        return self.prevSeq
Пример #10
0
def parseFiles(K, paired, fns, verbose):
    """
    Yield, per read (or read pair), the list of its `K`-length k-mers.

    Unpaired mode: every file in `fns` is parsed with `readFastq` and
    one k-mer list is yielded per read.

    Paired mode: `fns` is consumed two files at a time (via `pairs`)
    and the two files are read in lock-step (via `both`). One list is
    yielded per pair: the k-mers of the first read followed by the
    result of applying `rc` to each k-mer of the second read.

    When `verbose` is true a progress line is printed on stderr every
    2**18 reads (or pairs).
    """
    # Bit mask: (rn & M) == 0 holds once every 2**18 reads.
    M = (1 << 18) - 1
    rn = 0

    if not paired:
        for fn in fns:
            with openFile(fn) as f:
                # Iterate over the reads of the file; without this loop
                # `fq1` would be undefined in this branch.
                for fq1 in readFastq(f):
                    rn += 1
                    if verbose and (rn & M) == 0:
                        print >> sys.stderr, 'reads processed: %d' % (rn,)
                    xs = kmersList(K, fq1[1], False)
                    yield xs
        return

    for (fn1, fn2) in pairs(fns):
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                rn += 1
                if verbose and (rn & M) == 0:
                    print >> sys.stderr, 'read pairs processed: %d' % (rn,)
                xs = kmersList(K, fq1[1], False) + [rc(K, x) for x in kmersList(K, fq2[1], False)]
                yield xs
Пример #11
0
    def __getitem__(self, acc):
        """
        Return the sequence for accession `acc`, caching the last one.

        Accessions found in `refSeq2Hg19` are translated through that
        table; any other accession is used as the file stem unchanged.
        The first record of `<home>/<stem>.fa.gz` is cached and
        returned.
        """
        if acc == self.prevAcc:
            return self.prevSeq

        stem = refSeq2Hg19[acc] if acc in refSeq2Hg19 else acc

        with openFile(self.home + "/" + stem + ".fa.gz") as fasta:
            # Only the first record is of interest.
            for (name, seq) in readFasta(fasta):
                self.prevAcc = acc
                self.prevSeq = seq
                break
        return self.prevSeq
Пример #12
0
    def __getitem__(self, acc):
        """
        Return the sequence for accession `acc`, caching the last one.

        Accessions found in `refSeq2Hg19` are translated through that
        table; any other accession is used as the file stem unchanged.
        `<home>/<stem>.fa` is read when it exists, `<home>/<stem>.fa.gz`
        otherwise, and the first record is cached and returned.
        """
        if acc == self.prevAcc:
            return self.prevSeq

        stem = refSeq2Hg19[acc] if acc in refSeq2Hg19 else acc

        plain = self.home + '/' + stem + '.fa'
        fastaPath = plain if os.path.exists(plain) else plain + '.gz'

        with openFile(fastaPath) as fasta:
            # Only the first record is of interest.
            for (name, seq) in readFasta(fasta):
                self.prevAcc = acc
                self.prevSeq = seq
                break
        return self.prevSeq
Пример #13
0
def main(argv):
    """
    Command-line entry point: summarize k-mer counts of FASTA files.

    For every file in `<input>` and every sequence in it, the `-k`
    length k-mers accepted by `sub(S, P, x)` are counted. Each
    sequence's counts are condensed with `summarize(counts, C, Q)` and
    tagged with the sequence name; the counts of all sequences of a
    file are also added together and summarized as the file's 'global'
    entry. The list of per-file results is dumped to stdout as YAML.

    With `-s` the both-strands flag passed to `kmersList` is False,
    otherwise it is True.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    C = int(opts['-c'])
    Q = int(opts['-q'])
    S = int(opts['-S'])
    P = float(opts['-p'])

    verbose = opts['-v']

    both = not opts['-s']

    res = []
    for fn in opts['<input>']:
        contigs = []
        fileCounts = {}
        with openFile(fn) as fasta:
            for (name, seq) in readFasta(fasta):
                seqCounts = {}
                for x in kmersList(K, seq, both):
                    if sub(S, P, x):
                        seqCounts[x] = 1 + seqCounts.get(x, 0)

                summary = summarize(seqCounts, C, Q)
                summary['name'] = name
                contigs.append(summary)

                # Fold this sequence's counts into the per-file totals.
                for (x, c) in seqCounts.items():
                    fileCounts[x] = c + fileCounts.get(x, 0)

        res.append({
            'file': fn,
            'contigs': contigs,
            'global': summarize(fileCounts, C, Q),
        })

    yaml.safe_dump(res, sys.stdout)
Пример #14
0
def main(argv):
    """
    Command-line entry point: print `-N` randomly generated variants.

    Variants are placed in the zones read from the `<regions>` BED
    file. Each zone receives a share of the `-N` variants drawn at
    random with probability proportional to its length; the variants
    themselves come from `genVar`. One line is printed per variant
    (its string form, preceded by extra columns when `-T` is given).

    NOTE: the output depends on the order in which random numbers are
    consumed, so statements that draw (`zgen.gen()`, `genVar`) must not
    be reordered if runs with a fixed `-S` seed are to stay stable.
    """
    opts = docopt.docopt(__doc__, argv)

    # Directory handed to SequenceFactory; defaults to the current one.
    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    # Parse -t: comma separated entries of the form "name" (weight 1.0)
    # or "weight:name".
    T = []
    for t0 in opts['-t'].split(','):
        t1 = t0.split(':')
        if len(t1) == 1:
            T.append((1.0, t1[0]))
        elif len(t1) == 2:
            T.append((float(t1[0]), t1[1]))
        else:
            print >> sys.stderr, "unexpected variant category descriptor:", t0
            sys.exit(1)
    Tgen = MultiGen(T)

    N = int(opts['-N'])

    D = float(opts['-D'])
    I = float(opts['-I'])
    U = float(opts['-U'])

    # genVar receives the reciprocals of -D/-I/-U, keyed by variant
    # kind. NOTE(review): a value of 0 for any of them raises
    # ZeroDivisionError here.
    Ds = {}
    Ds['del'] = 1.0 / D
    Ds['ins'] = 1.0 / I
    Ds['dup'] = 1.0 / U

    verbose = opts['-v']

    if opts['-S'] is not None:
        random.seed(int(opts['-S']))

    with openFile(opts['<regions>']) as f:
        zones = readBED(f)

    # zones maps a sequence name to a list of (start, end, label)
    # tuples. Build one (length, (name, index)) entry per zone and
    # track the widest name / label for progress formatting.
    zps = []
    t = 0
    maxC = 0
    maxM = 0
    for c in zones.keys():
        maxC = max(maxC, len(c))
        for i in range(len(zones[c])):
            (s, e, m) = zones[c][i]
            maxM = max(maxM, len(m))
            l = e - s + 1
            zps.append((l, (c, i)))
            t += l
    # Normalize lengths into probabilities.
    # NOTE(review): t is 0 when there are no zones, which makes this
    # division fail.
    zps = [(float(l) / float(t), c) for (l, c) in zps]
    zgen = MultiGen(zps)
    # Draw N zones and count how many variants each zone receives.
    zcs = dict([(c, 0) for (p, c) in zps])
    for n in xrange(N):
        c = zgen.gen()
        zcs[c] += 1
    # Keep only zones that were drawn at least once, ordered by
    # (name, zone index).
    zcs = [(c, n) for (c, n) in zcs.items() if n > 0]
    zcs.sort()

    prog = None
    progFmt = None
    if verbose:
        prog = tqdm(total=N, unit='vars')
        progFmt = '%-' + str(maxC) + 's : %-' + str(maxM) + 's'
    curC = None
    curSeq = None
    prevM = None
    for ((c, i), n) in zcs:
        (s, e, m) = zones[c][i]
        # Zones are sorted by name, so each sequence is fetched once.
        if curC != c:
            curC = c
            if prog is not None:
                prog.set_description(c.ljust(maxC, ' '))
                prog.update(0)
            curSeq = sf[c]
        assert s < len(curSeq)
        assert e <= len(curSeq)
        if prog is not None:
            #prog.set_description(progFmt % (c, m))
            prog.update(n)
        # With -V, print a comment line whenever the zone label changes.
        if opts['-V'] and m != prevM:
            prevM = m
            print '# %s : %s' % (c, m)
        for j in xrange(n):
            v = genVar(c, curSeq, s, e, Tgen, Ds)
            # The output line is assembled as parallel lists of format
            # specifiers and values.
            fmts = []
            vals = []
            if opts['-T']:
                # Extra columns: accession, range start, range end,
                # size, the upper-cased reference slice over the
                # variant's range, and the variant's sequence ('*' when
                # it has none).
                v.setSequenceFactory(sf)
                wt = curSeq[v.range()[0]:v.range()[1]].upper()
                mut = v.sequence()
                if mut is None:
                    mut = '*'
                fmts += ['%s', '%d', '%d', '%d', '%s', '%s']
                vals += [
                    v.accession(),
                    v.range()[0],
                    v.range()[1],
                    v.size(), wt, mut
                ]
            fmts += ['%s']
            vals += [str(v)]
            print '\t'.join(fmts) % tuple(vals)
Пример #15
0
def main(argv):
    """
    Command-line entry point for the exon index tool.

    Modes, checked in this order:
      * `-P`: build a BED file from a gene list and an annotation
        table (`-t` selects the per-transcript variant), then return.
      * `-X`: write the records produced by `indexBedFiles` to
        `<index>` as a multi-document YAML file, then return.
      * otherwise: load `<index>` and run the "double-layer index"
        experiment below, which returns when it is done.

    NOTE(review): because the experiment sits in an `if True:` block
    that ends with `return`, everything after it in this function is
    currently unreachable.
    """
    global verbose

    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    # Directory handed to SequenceFactory; defaults to the current one.
    genomeDir = '.'
    if opts['-g']:
        genomeDir = opts['-g']
    sf = SequenceFactory(genomeDir)

    if opts['-P']:
        if opts['-t']:
            prepareBedFileGeneTx(opts['<gene-list>'], opts['<refgene>'],
                                 opts['<bedfile>'])
        else:
            prepareBedFileGene(opts['<gene-list>'], opts['<refgene>'],
                               opts['<bedfile>'])
        return

    if opts['-X']:
        with openFile(opts['<index>'], 'w') as out:
            yaml.safe_dump_all(indexBedFiles(opts['<must-have>'], sf),
                               out,
                               default_flow_style=False)
        return

    K = int(opts['-k'])
    minGeneReads = int(opts['-M'])
    minExonReads = int(opts['-m'])
    minGeneRate = float(opts['-R'])
    minExonRate = float(opts['-r'])
    # -Z and -z are "min:max" pairs.
    (minGeneCount, maxGeneCount) = map(int, opts['-Z'].split(':'))
    (minExonCount, maxExonCount) = map(int, opts['-z'].split(':'))

    # BaseLoader leaves every scalar as a string (no type resolution).
    with openFile(opts['<index>']) as f:
        ref = list(yaml.load_all(f, Loader=yaml.BaseLoader))

    if True:
        # Test the double-layer index
        idx = ExonIndex(K, ref)

        # acc: hash -> list of (compressed read 1, compressed read 2)
        # toc: hash -> second element of the readHash() result
        acc = {}
        toc = {}
        rn = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         paired=True,
                         reads=True,
                         kmers=False,
                         both=True,
                         verbose=verbose):
            rn += 1
            (lhsFwd, lhsRev) = kmersLists(K, itm.reads[0][1])
            (rhsFwd, rhsRev) = kmersLists(K, itm.reads[1][1])
            # First orientation: left read forward, right read reversed.
            xs0 = lhsFwd + rhsRev
            rh0 = idx.readHash(xs0)
            if rh0 is not None:
                (h0, ys0) = rh0
                if h0 not in acc:
                    acc[h0] = []
                    toc[h0] = ys0
                acc[h0].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))

            # Second orientation: left read reversed, right read forward.
            xs1 = lhsRev + rhsFwd
            rh1 = idx.readHash(xs1)
            if rh1 is not None:
                (h1, ys1) = rh1
                if h1 not in acc:
                    acc[h1] = []
                    toc[h1] = ys1
                acc[h1].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))

        nx = 0
        for h in sorted(acc.keys()):
            # NOTE(review): acc[h] is built above as a list, so calling
            # .items() on it raises AttributeError as soon as any read
            # was recorded - this reporting loop looks unfinished.
            for (x, c) in sorted(acc[h].items()):
                nx += 1
                if c <= 1:
                    continue
                print '%016x\t%s\t%d' % (h, render(K, x), c)

        print >> sys.stderr, 'nx =', nx
        return

    # ------------------------------------------------------------------
    # Everything below is unreachable while the block above returns.
    # ------------------------------------------------------------------

    if False:
        # Position index
        idx = {}
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                p -= 1
                if x not in idx:
                    idx[x] = []
                idx[x].append((i, p))

    if True:
        # Exon tuple index
        # idx: k-mer -> sorted tuple of the ref entries containing it
        # lens[i]: number of k-mer occurrences enumerated for ref[i]
        idx = {}
        lens = [0 for i in range(len(ref))]
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(i)
                lens[i] += 1
        for x in idx.iterkeys():
            idx[x] = tuple(sorted(idx[x]))

    if opts['-T']:
        # Aliasing report.
        # 'aliasing-within': k-mers shared by more than one indexed
        # entry, each listed with the gene/exon labels that share it.
        ak = {}
        for x in sorted(idx.iterkeys()):
            if len(idx[x]) == 1:
                continue
            xStr = render(K, x)
            ak[xStr] = []
            for i in idx[x]:
                itm = ref[i]
                k = '%s/%s' % (itm['gene'], itm['exon'])
                ak[xStr].append(k)
            ak[xStr].sort()
        rep = {}
        rep['aliasing-within'] = ak
        chrs = set([])
        for i in range(len(ref)):
            itm = ref[i]
            chrs.add(itm['chr'])
        # counts[i]: occurrences, over the scanned chromosomes, of
        # k-mers that belong to ref[i].
        counts = [0 for i in range(len(ref))]
        for ch in sorted(chrs):
            if verbose:
                print >> sys.stderr, 'processing %s' % (ch, )
            seq = sf[ch]
            for (x, p) in kmersWithPos(K, seq, True):
                if x not in idx:
                    continue
                for i in idx[x]:
                    counts[i] += 1
        # 'aliasing-genomic': entries whose genomic count differs from
        # the number of k-mer occurrences that were indexed for them.
        gk = {}
        for i in range(len(ref)):
            if lens[i] == counts[i]:
                continue
            itm = ref[i]
            k = '%s/%s' % (itm['gene'], itm['exon'])
            gk[k] = {'indexed': lens[i], 'genomic': counts[i]}
        rep['aliasing-genomic'] = gk
        yaml.safe_dump(rep, sys.stdout, default_flow_style=False)
        return

    # acc: key (sorted tuple of hit keys) -> [number of reads, summed
    # hit values]
    acc = {}
    rn = 0
    hitStats = Summary()
    # NOTE(review): a hit count of 1000 or more indexes past the end of
    # this list.
    hitHist = [0 for i in range(1000)]
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        (lhsFwd, lhsRev) = kmersWithPosLists(K, itm.reads[0][1])
        (rhsFwd, rhsRev) = kmersWithPosLists(K, itm.reads[1][1])
        # Both orientations of the pair are scored independently.
        (hits0, hitCount0) = recHits(idx, lhsFwd + rhsRev)
        (hits1, hitCount1) = recHits(idx, lhsRev + rhsFwd)
        if len(hits0) > 0:
            k = tuple(sorted(hits0.keys()))
            v = sum(hits0.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount0)
            hitHist[hitCount0] += 1

        if len(hits1) > 0:
            k = tuple(sorted(hits1.keys()))
            v = sum(hits1.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount1)
            hitHist[hitCount1] += 1

    if verbose:
        print >> sys.stderr, 'total read hits: %d' % (len(hitStats), )
        print >> sys.stderr, 'total hits per read: %g (%g)' % (hitStats.mean(),
                                                               hitStats.sd())
        print >> sys.stderr, 'total reads: %d' % (rn, )
        for i in range(len(hitHist)):
            if hitHist[i] > 0:
                print >> sys.stderr, '\t%d\t%d' % (i, hitHist[i])

    def gex(s):
        # Render a tuple of ref indices as 'gene/exon|gene/exon|...'.
        r = []
        for n in s:
            itm = ref[n]
            r.append('%s/%s' % (itm['gene'], itm['exon']))
        return '|'.join(r)

    def fmtKey(k):
        # Returns (number of elements in k, set of gene names touched,
        # printable '--' joined form of the key).
        nex = len(k)
        gx = set([])
        kStrParts = []
        for s in k:
            kStrParts.append(gex(s))
            gx |= set([ref[i]['gene'] for i in s])
        kStr = '--'.join(sorted(kStrParts))
        return (nex, gx, kStr)

    # Aggregate the read / k-mer totals per group of genes.
    gxCounts = {}
    for k in acc.keys():
        gx = set([])
        ex = set([])
        for s in k:
            gx |= set([ref[i]['gene'] for i in s])
            ex |= set(s)
        gx = tuple(sorted(gx))
        if gx not in gxCounts:
            gxCounts[gx] = [0, 0]
        gxCounts[gx][0] += acc[k][0]
        gxCounts[gx][1] += acc[k][1]

    hdr = ['numReads', 'numKmers', 'kmersPerRead']
    hdr += ['ggNumReads', 'ggNumKmers', 'ggKmersPerRead']
    hdr += ['numExons', 'numGenes', 'geneGroup', 'exonGroup']
    print '\t'.join(hdr)
    for k in acc.keys():
        (nex, gx, kStr) = fmtKey(k)
        gx = tuple(sorted(gx))
        if len(gx) < minGeneCount or len(gx) > maxGeneCount:
            continue
        # NOTE(review): `ex` is not recomputed in this loop; it still
        # holds the set left over from the last iteration of the
        # aggregation loop above (and is undefined if acc is empty),
        # so this filter does not look at the current key.
        if len(ex) < minExonCount or len(ex) > maxExonCount:
            continue
        if gxCounts[gx][0] < minGeneReads:
            continue
        if acc[k][0] < minExonReads:
            continue
        gxRate = float(gxCounts[gx][1]) / float(gxCounts[gx][0])
        if gxRate < minGeneRate:
            continue
        exRate = float(acc[k][1]) / float(acc[k][0])
        if exRate < minExonRate:
            continue
        gxStr = ':'.join(gx)

        print '%d\t%d\t%g\t%d\t%d\t%g\t%d\t%d\t%s\t%s' % (
            acc[k][0], acc[k][1], exRate, gxCounts[gx][0], gxCounts[gx][1],
            gxRate, nex, len(gx), gxStr, kStr)
Пример #16
0
def main(argv):
    """
    Command-line entry point: count the `<k>`-mers of the input reads.

    K-mers are accumulated in memory (`KmerAccumulator2`). Every 1024
    reads the accumulator's memory use is checked and, once it reaches
    half of the budget (`-m` MiB, default 32), its contents are spilled
    as a numbered run into a temporary casket. At the end the runs are
    merged (or the in-memory counts written directly when nothing was
    spilled) into `<output>`, together with metadata: K, the count
    histogram, the base composition and the number of reads.

    Selection of what gets counted:
      * `-D d` (with optional `-S` seed): only k-mers accepted by
        `sub(S, d, x)` are counted; the verdicts are memoized in two
        bounded caches.
      * `-C fasta`: only reads sharing at least one k-mer with the
        given sequences are counted (all of their k-mers).
      * otherwise every k-mer of every read is counted.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    # Memory budget in bytes.
    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    # Tally of the two low bits of every k-mer seen.
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        cacheYes = set([])
        cacheNo = set([])

    B = opts['-C']
    if B is not None:
        xs = set([])
        # Use a context manager so the FASTA file is closed again.
        with openFile(B) as f:
            for (nm, seq) in readFasta(f):
                xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    with casket(tmpnm, 'w') as z:
        nr = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         pairs=False,
                         reads=False,
                         kmers=True,
                         both=True,
                         verbose=verbose):
            xs = itm.kmers[0]
            for x in xs:
                acgt[x & 3] += 1
            if d is not None:
                for x in xs:
                    if x in cacheNo:
                        continue
                    if x not in cacheYes:
                        if not sub(S, d, x):
                            cacheNo.add(x)
                            continue
                        cacheYes.add(x)
                    buf.add(x)
                # Keep the memo caches from growing without bound.
                if len(cacheYes) > 1000000:
                    cacheYes = set([])
                if len(cacheNo) > 1000000:
                    cacheNo = set([])
            elif B is not None:
                if any(x in B for x in xs):
                    buf.addList(xs)
            else:
                buf.addList(xs)

            nr += 1
            # Only look at memory use every 1024 reads.
            if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                buf.clear()

        # If anything was spilled, spill the remainder too so that the
        # merge below sees every k-mer.
        if len(tmps) and len(buf):
            fn = 'tmps-%d' % (len(tmps), )
            tmps.append(fn)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
            buf = []

    with zotk.kmers(out, 'w') as z:
        h = {}
        if len(tmps) == 0:
            for c in buf.countsOnly():
                h[c] = 1 + h.get(c, 0)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
        elif len(tmps) == 1:
            # NOTE(review): the histogram `h` is not filled in on this
            # path, so 'hist' is written empty - confirm intended.
            with casket(tmpnm, 'r') as z0:
                writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
        else:
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, z)
        # Convert the base tallies to fractions; with no k-mers at all
        # report zeros instead of dividing by zero.
        n = float(sum(acgt))
        if n > 0:
            acgt = [c / n for c in acgt]
        else:
            acgt = [0.0 for c in acgt]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = acgt
        z.meta['reads'] = nr
    os.remove(tmpnm)
Пример #17
0
def main(argv):
    """
    Command-line entry point for the duplication finder.

    The reference sequences in `<sequences>` are indexed by their
    `-k` length k-mers (K must be even); each k-mer is also split into
    its first and last J = K/2 bases. Reads from `<input>` that share a
    k-mer with a reference are collected per reference, and for each
    reference the k-mers seen at least 10 times are kept.

    As the code stands, the per-reference loop prints a coverage map of
    the reference's k-mers ('.' = seen, 'X' = not seen) and the paths
    traced across each uncovered stretch, then moves on.

    NOTE(review): the unconditional `continue` after the path tracing
    makes the remainder of the loop body (the duplication calling and
    the tabular report) unreachable.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return

    minCov = int(opts['-m'])

    verbose = opts['-v']

    # A K-mer x is split into two J-mers: the leading one is x >> S,
    # the trailing one is x & Mj (two bits per base).
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1

    names = []
    seqs = {}
    # bait: k-mer -> set of reference numbers containing it
    bait = {}
    # Per reference: leading J-mer -> trailing J-mers, and the reverse.
    wtFst = []
    wtLst = []
    # Per reference: J-mer -> list of positions
    posIdx = []
    # Per reference: reads that hit it
    rds = []
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)

                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))

                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)

                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)

            wtFst.append(wf)
            wtLst.append(wl)

            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)

            # Run the duplication search on the reference against
            # itself and warn about anything it reports.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)

            rds.append([])

    N = len(names)

    L = None
    # X[n]: k-mer -> count, over the reads that hit reference n
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)

        xs = itm.kmers[0]
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        # A read that hits a reference contributes all of its k-mers to
        # that reference's counts.
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)

    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only k-mers seen at least 10 times (fixed threshold).
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c

        seq = seqs[names[n]]

        rngs = []
        st = None
        en = None
        inside = False
        # Coverage map of the reference: '.' for k-mers that were kept,
        # 'X' for those that were not.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect the uncovered stretches as (last kept k-mer before,
        # first kept k-mer after); either end is None when the stretch
        # touches the start or end of the reference.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))

        pthr = Pather(K, xs)

        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): unconditional - nothing below this line in the
        # loop body is ever executed.
        continue

        # Split the kept k-mers into leading / trailing J-mers, as was
        # done for the reference above.
        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj

            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)

            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)

        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                # Rebuild the three K-mers from their J-mer halves.
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]

                dd = pp[2] - pp[0]

                # Unless -a is given, only lengths divisible by 3 are
                # reported.
                if not opts['-a'] and dd % 3 != 0:
                    continue

                if opts['-s']:
                    # Require that both halves can be interpolated and
                    # agree away from their J-base ends.
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)

                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue

                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue

                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]

                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)

                # All three k-mers must reach the minimum coverage.
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue

                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                w = ccb / m

                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])

                vn += 1

                # The report row is assembled as parallel lists of
                # column names, format specifiers and values.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]

                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]

                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]

                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]

                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]

                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]

                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]

                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
Пример #18
0
    def next(self):
        """
        Return the next `Reads` item (Python 2 iterator protocol).

        `self.files` is consumed in groups of `self.N` files that are
        read in lock-step, one record from each per call. When a group
        is exhausted the next group is opened; `StopIteration` is
        raised once fewer than `self.N` files remain.

        The returned object carries `reads` (the list of records, when
        `self.reads` is set) and `kmers` (one entry per record, when
        `self.kmers` is set).
        """
        self.readNum += 1
        # self.M is used as a bit mask: the bar is advanced each time
        # the low bits of readNum wrap around to zero.
        if (self.readNum & self.M) == 0 and self.progress is not None:
            self.progress.update(self.M)

        while True:
            if self.currParsers is None:
                # No group of files is open: advance to the next group.

                if self.currFilesInd is None:
                    self.currFilesInd = 0
                else:
                    self.currFilesInd += self.N

                if self.progress is not None:
                    self.progress.update(self.readNum & self.M)

                # Not enough files left for a complete group.
                if self.currFilesInd + (self.N - 1) >= len(self.files):
                    raise StopIteration

                # A progress bar labelled with the group's file names
                # is only created in verbose mode.
                if self.verbose:
                    pfx = ' & '.join([
                        basename(self.files[i])
                        for i in range(self.currFilesInd, self.currFilesInd +
                                       self.N)
                    ])
                    self.progress = tqdm(unit=' reads', unit_scale=True)
                    self.progress.set_postfix(reading=pfx, refresh=True)

                # One parser per file, chosen by isFasta(). The opened
                # files are handed to the parsers; nothing in this
                # method closes them explicitly.
                self.currParsers = []
                for i in range(self.currFilesInd, self.currFilesInd + self.N):
                    fn = self.files[i]
                    f = openFile(fn)
                    if isFasta(fn):
                        self.currParsers.append(readFasta(f))
                    else:
                        self.currParsers.append(readFastq(f))

            # Pull one record from every parser of the group.
            self.currReads = []
            try:
                for p in self.currParsers:
                    self.currReads.append(p.next())
            except StopIteration:
                # A parser ran dry. If an earlier parser of the group
                # still produced a record, the files differ in length.
                if len(self.currReads) != 0:
                    print >> sys.stderr, 'warning: files had unequal length'
                self.currParsers = None
                if self.progress is not None:
                    self.progress.close()
                    self.progress = None
                # Loop around to open the next group of files.
                continue

            if self.kmers:
                # fwdOnly takes precedence over both; the remaining
                # case requires self.separate and keeps the result of
                # kmersLists() as-is.
                self.currKmers = []
                for rd in self.currReads:
                    if self.fwdOnly:
                        self.currKmers.append(kmersList(self.K, rd[1], False))
                    elif self.both:
                        self.currKmers.append(kmersList(self.K, rd[1], True))
                    else:
                        assert self.separate
                        self.currKmers.append(kmersLists(self.K, rd[1]))

            res = Reads()

            if self.reads:
                res.reads = self.currReads
            if self.kmers:
                res.kmers = self.currKmers
            return res
Пример #19
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['-k'])

    C = int(opts['-C'])

    L = int(opts['-L'])

    raw = opts['-r']

    S = int(opts['-S'])

    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    with openFile(opts['<regions>']) as f:
        R = readBED(f)

    refTbl = {}
    refIdx = {}
    zoneIdx = {}
    for (acc, zones) in R.items():
        accSeq = sf[acc]
        for (s, e, nm) in zones:
            zoneIdx[nm] = (acc, s, e)
            seq = accSeq[s - 1:e]
            if nm not in refTbl:
                refTbl[nm] = {}
            for (x, p) in kmersWithPosList(K, seq, False):
                p -= 1
                p += s
                refTbl[nm][p] = x
                if x not in refIdx:
                    refIdx[x] = []
                refIdx[x].append((nm, p))

    acc = {}
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     verbose=verbose):
        rdL = itm.reads[0]
        zL = len(rdL)
        (fwdL, revL) = kmersWithPosLists(K, rdL[1])
        fwdLHits = hits(refIdx, K, fwdL, acc)
        revLHits = hits(refIdx, K, revL, acc)

        rdR = itm.reads[1]
        zR = len(rdR)
        (fwdR, revR) = kmersWithPosLists(K, rdR[1])
        fwdRHits = hits(refIdx, K, fwdR, acc)
        revRHits = hits(refIdx, K, revR, acc)

    killZ = set([])
    for z in acc.keys():
        killP = set([])
        for p in acc[z].keys():
            killX = set([])
            vv = {}
            for x in acc[z][p].keys():
                y = x >> 2
                if y not in vv:
                    vv[y] = []
                vv[y].append((x, acc[z][p][x]))
            for vs in vv.values():
                vt = V * sum([c for (x, c) in vs])
                for (x, c) in vs:
                    if c < vt or c < C:
                        killX.add(x)
            for x in killX:
                del acc[z][p][x]
            if len(acc[z][p]) == 0:
                killP.add(p)
        for p in killP:
            del acc[z][p]
        if len(acc[z]) == 0:
            killZ.add(z)
    for z in killZ:
        del acc[z]

    if raw:
        print '\t'.join(['chrom', 'pos', 'side', 'label', 'anchor', 'insSeq'])
    else:
        print '\t'.join([
            'chrom', 'after', 'before', 'label', 'rhsShift', 'lhsShift',
            'lhsAnc', 'rhsAnc', 'lhsSeq', 'rhsSeq'
        ])

    for z in sorted(acc.keys()):
        (ch, st, en) = zoneIdx[z]

        Z = acc[z]
        ref = refTbl[z]
        aft = dict(forwardSpurs(K, ref, Z))
        bef = dict(reverseSpurs(K, ref, Z))

        scoredAft = {}
        for p in sorted(aft.keys()):
            if p + K - 1 == en:
                continue

            for spur in aft[p]:

                if len(spur) < L:
                    continue

                if raw:
                    (xs, cs) = zip(*spur)
                    seq = renderPath(K, xs)
                    anc = seq[:K]
                    ins = seq[K:]
                    print '%s\t%d\t%s\t%s\t%s\t%s\t%s' % (
                        ch, p + K - 1, 'after', z, anc, ins, ','.join(
                            map(str, cs)))
                    continue

                for (q, xcs, v) in shiftForwardSpur(ref, Z, S, p, spur):
                    q += K - 1
                    if q not in scoredAft:
                        scoredAft[q] = []
                    (xs, cs) = zip(*xcs)
                    seq = renderPath(K, xs)
                    anc = seq[:K]
                    ins = seq[K:]
                    scoredAft[q].append((v, anc, ins, cs))

        scoredBef = {}
        for p in sorted(bef.keys()):
            if p == st:
                continue

            for spur in bef[p]:
                if len(spur) < L:
                    continue

                if raw:
                    (xs, cs) = zip(*spur)
                    seq = renderPath(K, xs)
                    anc = seq[-K:]
                    ins = seq[:-K]
                    print '%s\t%d\t%s\t%s\t%s\t%s\t%s' % (
                        ch, p, 'before', z, anc, ins, ','.join(map(str, cs)))
                    continue

                for (q, xcs, v) in shiftReverseSpur(ref, Z, S, p, spur):
                    if q not in scoredBef:
                        scoredBef[q] = []
                    (xs, cs) = zip(*xcs)
                    seq = renderPath(K, xs)
                    anc = seq[-K:]
                    ins = seq[:-K]
                    scoredBef[q].append((v, anc, ins, cs))

        for p0 in sorted(scoredAft.keys()):
            p1 = p0 + 1
            if p1 not in scoredBef:
                continue
            for (aftV, aftAnc, aftIns, aftCov) in scoredAft[p0]:
                for (befV, befAnc, befIns, befCov) in scoredBef[p1]:
                    if befAnc in aftIns or aftAnc in befIns:
                        continue
                    v = aftV + befV
                    print '%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                        ch, p0, p1, z, aftV, befV, aftAnc, befAnc, aftIns,
                        befIns)
Пример #20
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    random.seed(17)

    K = int(opts['-k'])
    S = 2*(K-3)

    frameAnchors = {}
    knownStops = {}
    sequences = {}
    seqKmers = {}

    if opts['-r']:
        with openFile(opts['-r']) as f:
            for (nm,seq) in readFasta(f):
                sequences[nm] = seq
                # trim polyA tails
                seq = re.sub('AAAAAA*$', '', seq)
                seqKmers[nm] = set([])
                for (x,p1) in kmersWithPosList(K, seq, False):
                    seqKmers[nm].add(x)
                    p = p1 - 1
                    w = p % 3
                    if x not in frameAnchors:
                        frameAnchors[x] = set([])
                    frameAnchors[x].add((nm,p))
                    y = x & 63
                    if w == 0 and y in stops:
                        if x not in knownStops:
                            knownStops[x] = set([])
                        knownStops[x].add(nm)

    rn = 0
    res = {}
    for fn in opts['<input>']:
        with openFile(fn) as f:
            for rd in readFastq(f):
                L = len(rd[1])
                rn += 1
                fwdAndRev = kmersWithPosLists(K, rd[1])
                frames = {}
                possibleStops = {}
                for i in range(2):
                    #print i, sorted([p for (x,p) in fwdAndRev[i]])
                    for (x,p) in fwdAndRev[i]:
                        if x in frameAnchors:
                            for (nm,q) in frameAnchors[x]:
                                o = (q - p)
                                k = (nm, o, i)
                                frames[k] = 1 + frames.get(k, 0)
                if len(frames) == 0:
                    continue
                n = sum(frames.values())
                probs = []
                for ((nm, off, strnd), cnt) in sorted(frames.items()):
                    probs.append((float(cnt)/float(n), cnt, off, strnd, nm))
                v = random.random()
                for (pv, cnt, off, strnd, nm) in probs:
                    if v < pv:
                        #print rd[1]
                        #print proj(strnd, sequences[nm][off:off+len(rd[1])])
                        #print codons(off % 3, rd[1]), off
                        for (x,p) in fwdAndRev[strnd]:
                            if (p + off + K - 3) % 3 == 0 and (x & 63) in stops:
                                if nm not in res:
                                    res[nm] = {}
                                if x not in res[nm]:
                                    res[nm][x] = 0
                                res[nm][x] += 1
                        break
                    v -= pv
    for (nm,stps) in res.iteritems():
        for (x,c) in stps.iteritems():
            (d,y) = nearest3(K, seqKmers[nm], x)
            if x in knownStops:
                k = 'known'
            else:
                k = 'novel'
            print '%s\t%s\t%d\t%d\t%s\t%s' % (k, render(K, x), c, d, render(K, y), nm)
Пример #21
0
def outputFile(nm):
    """Return a writable stream for `nm`.

    A name of None selects standard output; any other name is opened for
    writing through `openFile`.
    """
    if nm is not None:
        return openFile(nm, 'w')
    return sys.stdout
Пример #22
0
def main(argv):
    """Generate simulated read pairs, a fraction of them carrying variants.

    Zones come from the -b BED file, or default to the whole of every
    sequence in `refSeq2Hg19`. N read pairs are shared out over the zones
    with probability proportional to zone length. For each zone a
    wild-type and a mutant `ReadMaker` are prepared; each pair is taken
    from the mutant maker with probability -V, otherwise from the
    wild-type one. Pairs are written to <output-prefix>_1.fastq and
    <output-prefix>_2.fastq (with a '.gz' suffix when -z is given).

    Exits with status 1 if any two requested variants overlap.

    Parameters:
        argv: command line arguments handed to docopt.
    """
    opts = docopt.docopt(__doc__, argv)

    sf = SequenceFactory(opts['-g'])

    I = int(opts['-I'])
    D = float(opts['-d'])
    E = float(opts['-e'])
    L = int(opts['-L'])
    N = int(opts['-N'])
    V = float(opts['-V'])

    M = None
    if opts['-M'] is not None:
        M = float(opts['-M'])
        # compute the 99% quantile
        # NOTE(review): W is not used below; the computation is kept so an
        # out-of-range -M still fails here exactly as it did before.
        W = int(math.log1p(-0.99) / math.log1p(-M))

    S = None
    if opts['-S'] is not None:
        S = int(opts['-S'])
        random.seed(S)

    if opts['-b']:
        # Close the BED file once parsed instead of leaking the handle.
        with openFile(opts['-b']) as f:
            zones = readBED(f)
    else:
        zones = {}
        for ch in refSeq2Hg19.values():
            ch = normalizeAccession(ch)
            s = sf[ch]
            zones.setdefault(ch, []).append((1, len(s), ch))

    # Background (population) variants: accession -> sorted [(variant, p)].
    popVars = {}
    if opts['-m'] is not None:
        with openFile(opts['-m']) as f:
            for l in f:
                t = l.split()
                p = float(t[0])
                v = makeHGVS(t[1], sf)
                a = normalizeAccession(v.accession())
                popVars.setdefault(a, []).append((v, p))
    for vs in popVars.values():
        vs.sort()

    # t: total zone length; chMax: widest name, for the progress label.
    t = 0
    chMax = 0
    for ch in zones.keys():
        chMax = max(chMax, len(ch))
        for (s, e, _n) in zones[ch]:
            t += e - s + 1

    if opts['-v']:
        print >> sys.stderr, 'mean coverage = %g' % (float(N * L) / float(t), )

    # Allocate the N pairs to zones, weighted by zone length.
    zoneCounts = {}
    zoneProbs = []
    for ch in zones.keys():
        zoneCounts[ch] = {}
        for zone in zones[ch]:
            zoneCounts[ch][zone] = 0
            (s, e, _n) = zone
            l = e - s + 1
            zoneProbs.append((float(l) / float(t), (ch, zone)))
    zgen = MultiGen(zoneProbs)
    for _i in xrange(N):
        (ch, z) = zgen.gen()
        zoneCounts[ch][z] = zoneCounts[ch].get(z, 0) + 1

    vStrs = opts['<variant>']
    if opts['-f'] is not None:
        with openFile(opts['-f']) as f:
            for l in f:
                vStrs.append(l.strip())

    allVars = {}
    for s in vStrs:
        v = makeHGVS(s, sf)
        if v is None:
            # Was "..., (s, )": the comma printed the raw format string
            # followed by a tuple instead of the formatted message.
            print >> sys.stderr, 'unable to parse variant: %s' % (s, )
            continue
        if v.anonymous():
            # Anonymous variants get a random sequence of their own size.
            n = v.size()
            seq = ''.join(
                [random.choice(['A', 'C', 'G', 'T']) for i in range(n)])
            v.setSequence(seq)
        a = normalizeAccession(v.accession())
        allVars.setdefault(a, []).append(v)

    numOverlaps = 0
    for xs in allVars.values():
        xs.sort()
        for i in range(len(xs)):
            for j in range(i + 1, len(xs)):
                if xs[i].overlaps(xs[j]):
                    print >> sys.stderr, "variants overlap: %s <> %s" % (str(
                        xs[i]), str(xs[j]))
                    numOverlaps += 1
    if numOverlaps > 0:
        sys.exit(1)

    prog = None
    if opts['-v']:
        prog = tqdm(total=N, unit='pairs')

    egen = GeomVarSource(E)

    fasta = False

    if opts['-l']:
        # Nothing in this function writes to the log. Keep creating or
        # truncating the file as before, but do not leak the open handle.
        with open(opts['-l'], 'w'):
            pass

    pfx = opts['<output-prefix>']
    sfx = ''
    if opts['-z']:
        sfx = '.gz'
    with openFile(pfx + '_1.fastq' + sfx,
                  'w') as out1, openFile(pfx + '_2.fastq' + sfx, 'w') as out2:
        for ch in zones.keys():
            if prog is not None:
                prog.set_description(ch.ljust(chMax, ' '))
                prog.update(0)
            chVars = []
            if ch in allVars:
                chVars = allVars[ch]

            # Wild type gets only the background variants; the mutant
            # also gets the requested variants for this accession.
            wtVars = applyBackgroundVariants(ch, [], popVars)
            mutVars = applyBackgroundVariants(ch, chVars, popVars)

            for zone in zones[ch]:
                wtMaker = ReadMaker(chrom=ch,
                                    zone=zone,
                                    L=L,
                                    I=I,
                                    D=D,
                                    variants=wtVars,
                                    fasta=fasta,
                                    egen=egen,
                                    sf=sf)
                wtMaker.prepareAllele()

                mutMaker = ReadMaker(chrom=ch,
                                     zone=zone,
                                     L=L,
                                     I=I,
                                     D=D,
                                     variants=mutVars,
                                     fasta=fasta,
                                     egen=egen,
                                     sf=sf)
                mutMaker.prepareAllele()

                for i in xrange(zoneCounts[ch][zone]):
                    if prog is not None:
                        prog.update(1)
                    u = random.random()
                    if u > V:
                        (rd1, rd2) = wtMaker.makeReadFromZone()
                    else:
                        (rd1, rd2) = mutMaker.makeReadFromZone()
                    print >> out1, rd1
                    print >> out2, rd2

    if prog is not None:
        # Public API instead of calling __exit__ by hand.
        prog.close()
Пример #23
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['-k'])

    D = int(opts['-D'])

    Q = int(opts['-C'])

    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])

        variants = opts['<variant>']
        if opts['-f']:
            with openFile(opts['-f']) as f:
                variants += f.read().split()

        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)

        chk = None
        if opts['-T']:
            chk = {}

        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                if chk is not None:
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['wtSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('wt', str(v)))
                    if r['mutSeq'] is None:
                        continue
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['mutSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('mut', str(v)))

        if chk is not None:
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return

        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)

        return

    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']

    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))

    if verbose:
        print >> sys.stderr, "loading index."

    with open(opts['<index>']) as f:
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)

    NV = len(hgvsVars)

    combineStrands = True
    if opts['-s']:
        combineStrands = False

    cap = capture(K, reads=capt, kmers=True, verbose=verbose)

    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        assert n0 == n

    if verbose:
        print >> sys.stderr, "done."

    rn = 0
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])

    if capt:
        cap.saveReads(zipname)

    scorer = Scorer(K)

    globHist = {}

    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1

    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']

            mx = cap.capKmers[n]

            nr = cap.capReadCounts[n]

            if 'kmers' in fmt:
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)

            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']

            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []

            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)

            mutSeq = itm['mutSeq']
            mutZ = v.size()

            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]

            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]

            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq,
                              wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())

            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1

            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes

            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes

            if True:
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]

                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]

                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX

            hdrs = ['n']
            fmts = ['%d']
            outs = [n]

            wtAllele = ((wtRes['covMin'] > Q) and
                        (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and
                         (mutRes['hamming'] < 4)) and (mutVaf > V)
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]

            hdrs += ['res']
            fmts += ['%s']
            outs += [res]

            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]

            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]

            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]

            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]

            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]

            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]

            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]

            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]

            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()
Пример #24
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    fns = opts['<input>']

    p = None
    if opts['-p'] is not None:
        p = float(opts['-p'])

    if len(fns) == 1 and isFasta(fns[0]):
        K = 25
        seqs = []
        with openFile(fns[0]) as f:
            for (nm, seq) in readFasta(f):
                xs = set(basics.kmers(K, seq, True))
                xs = list(xs)
                xs.sort()
                xs = array.array('L', xs)
                seqs.append((nm.split()[0], xs))
        Z = 1
        if opts['-a']:
            Z = len(seqs)

        print len(seqs)

        for i in xrange(Z):
            xnm = seqs[i][0]
            xs = seqs[i][1]
            for j in xrange(i + 1, len(seqs)):
                ynm = seqs[j][0]
                ys = seqs[j][1]
                (isec, union, d) = jaccard(xs, ys)
                if p is None:
                    print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % (
                        xnm, ynm, len(xs), len(ys), isec, union, d)
                else:
                    pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10)
                    q05 = quantBeta(0.05, isec + 1, (union - isec) + 1)
                    q95 = quantBeta(0.95, isec + 1, (union - isec) + 1)
                    print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % (
                        xnm, ynm, len(xs), len(ys), isec, union, d, d - q05,
                        q95 - d, pv)
                sys.stdout.flush()

        return

    Z = 1
    if opts['-a']:
        Z = len(fns)

    for i in xrange(Z):
        with kmers(fns[i], 'r') as z0:
            xK = z0.meta['K']
            xs = array.array('L', readKmers(z0))
            for j in xrange(i + 1, len(fns)):
                with kmers(fns[j], 'r') as z1:
                    yK = z1.meta['K']
                    ys = array.array('L', readKmers(z1))
                    if xK != yK:
                        print >> sys.stderr, 'mismatched K:', fns[j]
                        sys.exit(1)
                    (isec, union, d) = jaccard(xs, ys)
                    if p is None:
                        print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % (
                            fns[i], fns[j], len(xs), len(ys), isec, union, d)
                    else:
                        pv = logIx(p, isec + 1,
                                   (union - isec) + 1) / math.log(10)
                        q05 = quantBeta(0.05, isec + 1, (union - isec) + 1)
                        q95 = quantBeta(0.95, isec + 1, (union - isec) + 1)
                        print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % (
                            fns[i], fns[j], len(xs), len(ys), isec, union, d,
                            d - q05, q95 - d, pv)
                    sys.stdout.flush()
Пример #25
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    K = 25

    nms = []
    idx = {}
    for (nm, seq) in readFasta(openFile(opts['<baits>'])):
        n = len(nms)
        nms.append(nm)
        for x in kmersList(K, seq, True):
            if x not in idx:
                idx[x] = set([])
            idx[x].add(n)

    for x in idx.keys():
        idx[x] = list(idx[x])
        idx[x].sort()

    anti = set([])
    if opts['-U']:
        with openFile(opts['-U']) as f:
            for (nm, seq) in readFasta(f):
                for x in kmersList(K, seq, True):
                    anti.add(x)

    rn = 0
    if opts['-p']:

        hist = {}
        for (fn1, fn2) in pairs(opts['<input>']):
            tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq'))
                    for i in xrange(len(nms))]
            cache = [[[], []] for i in xrange(len(nms))]
            counts = [0 for i in xrange(len(nms))]
            with openFile(fn1) as f1, openFile(fn2) as f2:
                for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                    hits = set([])
                    pushup = False
                    for x in kmersList(K, fq1[1]):
                        if x in anti:
                            pushup = True
                            break
                        for i in idx.get(x, []):
                            hits.add(i)
                    for x in kmersList(K, fq2[1]):
                        if x in anti:
                            pushup = True
                            break
                        for i in idx.get(x, []):
                            hits.add(i)
                    if pushup:
                        continue
                    n = len(hits)
                    hist[n] = 1 + hist.get(n, 0)
                    for i in hits:
                        counts[i] += 1
                        cache[i][0].append(fq1)
                        cache[i][1].append(fq2)
                        if len(cache[i][0]) >= 1024:
                            with open(tmps[i][0], 'a') as f:
                                for rd in cache[i][0]:
                                    print >> f, rd[0]
                                    print >> f, rd[1]
                                    print >> f, rd[2]
                                    print >> f, rd[3]
                            with open(tmps[i][1], 'a') as f:
                                for rd in cache[i][1]:
                                    print >> f, rd[0]
                                    print >> f, rd[1]
                                    print >> f, rd[2]
                                    print >> f, rd[3]
                            cache[i][0] = []
                            cache[i][1] = []
            for i in xrange(len(cache)):
                if len(cache[i][0]) > 0:
                    with open(tmps[i][0], 'a') as f:
                        for rd in cache[i][0]:
                            print >> f, rd[0]
                            print >> f, rd[1]
                            print >> f, rd[2]
                            print >> f, rd[3]
                    with open(tmps[i][1], 'a') as f:
                        for rd in cache[i][1]:
                            print >> f, rd[0]
                            print >> f, rd[1]
                            print >> f, rd[2]
                            print >> f, rd[3]
                    cache[i][0] = []
                    cache[i][1] = []
            with zipfile.ZipFile(opts['<output>'], 'w',
                                 zipfile.ZIP_DEFLATED) as z:
                for i in xrange(len(nms)):
                    if counts[i] > 0:
                        pth = '/'.join(nms[i].split())
                        z.write(tmps[i][0], pth + '/' + fn1)
                        os.remove(tmps[i][0])
                        z.write(tmps[i][1], pth + '/' + fn2)
                        os.remove(tmps[i][1])
        hist = hist.items()
        hist.sort()
        for (n, f) in hist:
            print '%d\t%d' % (n, f)
    else:
        raise "not implemented"
Пример #26
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    M = (1 << (2*K)) - 1

    paired = True
    if opts['-s']:
        paired = False

    p = float(opts['-p'])
    T = int(M * p)

    if opts['-r']:
        refs = []
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                refs += kmersList(K, seq, False)
        refs = set(refs)

        kill = set([])
        for x in refs:
            y = rc(K, x)
            if y in refs:
                kill.add(x)
                kill.add(y)
        print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs))

        refs -= set(kill)

        fwd = {}
        rev = {}
        for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
            fn = 0
            for x in xs:
                if x in refs:
                    fn += 1

            ys = [rc(K, x) for x in xs]
            rn = 0
            for y in ys:
                if y in refs:
                    rn += 1
            
            if fn + rn == 0:
                continue

            q = float(fn) / float(fn + rn)
            if random.random() < q:
                for x in xs:
                    fwd[x] = 1 + fwd.get(x, 0)
            else:
                for y in ys:
                    rev[y] = 1 + rev.get(y, 0)

        for (x,xc) in fwd.iteritems():
            y = rc(K, x)
            yc = 0
            if y in rev:
                yc = rev[y]
                del rev[y]
            print '%d\t%d' % (xc, yc)

        for (y,yc) in rev.iteritems():
            print '%d\t%d' % (0, yc)

        return

    kx = {}
    for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
        for x in xs:
            if x in kx:
                kx[x] += 1
                continue
            y = rc(K, x)
            z = murmer(min(x, y), 17)
            if (z & M) > T:
                continue
            kx[x] = 1

    for x in kx.keys():
        y = rc(K, x)
        if x > y:
            continue
        xc = kx[x]
        yc = kx.get(y, 0)
        if murmer(x, 17) >= murmer(y, 17):
            (a, b) = (x, y)
            (ac, bc) = (xc, yc)
        else:
            (a, b) = (y, x)
            (ac, bc) = (yc, xc)
        #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc)
        print '%d\t%d' % (ac, bc)
Пример #27
0
def prepareBedFileGene(geneListFn, refGeneFn, outFn):
    """Write a BED-style file of merged exon intervals for a list of genes.

    geneListFn -- path of a whitespace-separated text file; the first field
                  of every non-blank line is taken as a gene name.
    refGeneFn  -- path of a whitespace-separated annotation table.  The
                  fields used are [1] transcript, [2] chromosome, [3] strand,
                  [9] comma-separated exon starts, [10] comma-separated exon
                  ends and [12] gene name (presumably the UCSC refGene
                  layout -- confirm against the caller's data).
    outFn      -- path of the output file.

    For every requested gene, the exons of all its transcripts are pooled,
    overlapping or touching intervals are merged, and one line per merged
    exon is written as: chrom, start, end, "<gene>/<NN>", strand.  Exons are
    numbered from 1; for '-' strand genes the numbering runs from the
    highest coordinate downwards.

    Diagnostics go to stderr for transcripts that disagree with the first
    chromosome/strand seen for their gene (those transcripts are ignored)
    and for requested genes without any usable transcript.
    """
    # gene name -> set of (start, end) exons; replaced by the merged, sorted
    # list once the annotation has been read.
    genes = {}
    with openFile(geneListFn) as f:
        for l in f:
            t = l.split()
            if not t:
                # Blank line: nothing to register (would otherwise fail on t[0]).
                continue
            genes[t[0]] = set()

    # gene name -> (chromosome, strand) of the first transcript accepted.
    strands = {}
    with openFile(refGeneFn) as f:
        for l in f:
            t = l.split()
            if not t:
                continue
            tx = t[1]
            ch = t[2]
            st = t[3]
            g = t[12]

            # Ignore chr6_cox_hap2 and friends.
            if '_' in ch:
                continue

            if g not in genes:
                continue

            if g in strands:
                if (ch, st) != strands[g]:
                    sys.stderr.write(
                        'gene tx seen in multiple chromosomes: %s [%s] (%s%s, %s%s)' % (
                            g, tx, strands[g][0], strands[g][1], ch, st) + '\n')
                    continue
            else:
                strands[g] = (ch, st)

            # The lists end with a trailing comma, hence the empty-string filter.
            ss = [int(s) for s in t[9].split(',') if len(s) > 0]
            ee = [int(e) for e in t[10].split(',') if len(e) > 0]
            genes[g].update(zip(ss, ee))

    for g in sorted(genes):
        if not genes[g]:
            sys.stderr.write('no annotated transcripts for gene: %s' % (
                g, ) + '\n')
            continue

        # Sweep the exons in coordinate order, extending the last merged
        # interval whenever the next exon overlaps or touches it.  Starts are
        # visited in ascending order, so only the end can grow.
        merged = []
        for (s, e) in sorted(genes[g]):
            if merged and merged[-1][1] >= s:
                merged[-1] = (merged[-1][0], max(merged[-1][1], e))
            else:
                merged.append((s, e))
        genes[g] = merged

    with openFile(outFn, 'w') as out:
        for g in sorted(genes):
            exs = genes[g]
            if not exs:
                continue

            (ch, st) = strands[g]
            # Number exons in transcription order on the reverse strand.
            if st == '-':
                exs = exs[::-1]

            for (j, (s, e)) in enumerate(exs, 1):
                out.write('%s\t%d\t%d\t%s/%02d\t%s' % (ch, s, e, g, j, st) + '\n')