def kmersWithPosLists(K, seq):
    """Split the positioned k-mers of ``seq`` into two zero-based lists.

    ``kmersWithPosList(K, seq, True)`` is iterated once.  Pairs whose
    position is positive go into the first list with the position
    decremented by one; all other pairs go into the second list with the
    position increased by ``len(seq) - K + 1``.  Presumably positive
    positions are forward-strand hits and non-positive ones are
    reverse-complement hits -- confirm against ``kmersWithPosList``.

    Returns a ``(first, second)`` tuple of lists of ``(kmer, position)``.
    """
    # Number of k-mer start positions in seq; used to rebase the
    # non-positive positions.
    numStarts = len(seq) - K + 1
    forward = []
    reverse = []
    for (kmer, pos) in kmersWithPosList(K, seq, True):
        if pos <= 0:
            reverse.append((kmer, numStarts + pos))
            continue
        forward.append((kmer, pos - 1))
    return (forward, reverse)
def remapReads(K, L, rds, v):
    """Print a per-position tally of k-mers that the reads place on ``v``.

    The k-mers of ``v.context(2*L)[1]`` are indexed by position, every
    read in ``rds`` (element ``[1]`` of each record is passed on) is run
    through ``locate`` against that index, and each ``(kmer, position)``
    pair it yields is counted.  One tab-separated line
    ``position, rendered k-mer, count`` is printed per pair, ordered by
    position and then by k-mer value.
    """
    context = v.context(2*L)

    # k-mer -> list of positions at which it occurs in the context.
    index = {}
    for (kmer, pos) in kmersWithPosList(K, context[1], False):
        index.setdefault(kmer, []).append(pos)

    # position -> {k-mer -> number of times locate() reported it there}
    tally = {}
    for read in rds:
        for (kmer, pos) in locate(K, index, read[1]):
            atPos = tally.setdefault(pos, {})
            atPos[kmer] = atPos.get(kmer, 0) + 1

    for pos in sorted(tally):
        atPos = tally[pos]
        for kmer in sorted(atPos):
            print('%d\t%s\t%d' % (pos, render(K, kmer), atPos[kmer]))
def main(argv):
    """Count stop-codon k-mers observed in reads, per reference sequence.

    Steps:

    1. If ``-r`` is given, read the reference FASTA, strip a trailing
       poly-A run from each sequence, and index every k-mer by
       ``(name, zero-based position)``.  K-mers whose low six bits are in
       the module-level ``stops`` set and whose trailing triplet is in
       frame 0 are recorded as known stops.
    2. For every read in every ``<input>`` FASTQ file, collect votes for
       ``(name, offset, strand)`` placements from k-mers shared with the
       reference, pick one placement at random with probability
       proportional to its votes, and count each in-frame stop k-mer of
       the read under the chosen reference name.
    3. Print one tab-separated line per counted k-mer:
       ``known|novel, k-mer, count, distance, nearest k-mer, name`` where
       distance and nearest k-mer come from ``nearest3``.

    Args:
        argv: argument list handed to ``docopt`` together with the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    # Fixed seed so the random placement choice below is reproducible.
    random.seed(17)
    K = int(opts['-k'])

    frameAnchors = {}   # k-mer -> set of (reference name, position)
    knownStops = {}     # k-mer -> set of reference names
    seqKmers = {}       # reference name -> set of its k-mers
    if opts['-r']:
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                # trim polyA tails
                seq = re.sub('AAAAAA*$', '', seq)
                kmerSet = set([])
                seqKmers[nm] = kmerSet
                for (x, p1) in kmersWithPosList(K, seq, False):
                    kmerSet.add(x)
                    p = p1 - 1
                    frameAnchors.setdefault(x, set([])).add((nm, p))
                    # Test the frame of the trailing triplet (the bits
                    # compared against `stops`), exactly as the read-side
                    # test below does.  Testing `p % 3` instead only
                    # agrees with it when K is a multiple of 3.
                    if (p + K - 3) % 3 == 0 and (x & 63) in stops:
                        knownStops.setdefault(x, set([])).add(nm)

    res = {}            # reference name -> {stop k-mer -> count}
    for fn in opts['<input>']:
        with openFile(fn) as f:
            for rd in readFastq(f):
                fwdAndRev = kmersWithPosLists(K, rd[1])

                # Vote for every (reference, offset, strand) placement
                # implied by a k-mer shared with the reference.
                frames = {}
                for i in range(2):
                    for (x, p) in fwdAndRev[i]:
                        if x in frameAnchors:
                            for (nm, q) in frameAnchors[x]:
                                k = (nm, q - p, i)
                                frames[k] = 1 + frames.get(k, 0)
                if not frames:
                    continue

                n = sum(frames.values())
                probs = []
                for ((nm, off, strnd), cnt) in sorted(frames.items()):
                    probs.append(
                        (float(cnt)/float(n), cnt, off, strnd, nm))

                # Roulette-wheel selection: walk the placements,
                # subtracting each probability until the draw falls
                # inside one of them.
                v = random.random()
                for (pv, cnt, off, strnd, nm) in probs:
                    if v < pv:
                        for (x, p) in fwdAndRev[strnd]:
                            inFrame = (p + off + K - 3) % 3 == 0
                            if inFrame and (x & 63) in stops:
                                perSeq = res.setdefault(nm, {})
                                perSeq[x] = perSeq.get(x, 0) + 1
                        break
                    v -= pv

    for (nm, stps) in res.iteritems():
        for (x, c) in stps.iteritems():
            (d, y) = nearest3(K, seqKmers[nm], x)
            # NOTE(review): this checks whether x is a known stop in ANY
            # reference sequence, not specifically in `nm` -- confirm
            # that is intended.
            if x in knownStops:
                k = 'known'
            else:
                k = 'novel'
            print('%s\t%s\t%d\t%d\t%s\t%s'
                  % (k, render(K, x), c, d, render(K, y), nm))
def main(argv):
    # Entry point: parse options with docopt, index the k-mers of the
    # sequences in <sequences>, collect reads from <input> that share a
    # k-mer with any of them, then report per sequence.
    #
    # NOTE(review): the indentation of this function was reconstructed
    # from a whitespace-collapsed source.  In particular the placement of
    # the bare `continue` after the path-tracing loop is a best guess; as
    # laid out here it makes the rest of the per-sequence loop body
    # unreachable.  Verify against the original file.
    opts = docopt.docopt(__doc__, argv)
    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return
    minCov = int(opts['-m'])
    verbose = opts['-v']
    # Each K-mer is split into a high half (x >> S) and a low half
    # (x & Mj) of 2*J bits each -- presumably its first and last J-mers
    # at two bits per base.
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1
    names = []
    seqs = {}
    bait = {}       # K-mer -> set of sequence indices containing it
    wtFst = []      # per sequence: high half -> set of low halves
    wtLst = []      # per sequence: low half -> set of high halves
    posIdx = []     # per sequence: J-mer -> list of positions
    rds = []        # per sequence: reads that hit it
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)
                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))
                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)
                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)
            wtFst.append(wf)
            wtLst.append(wl)
            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)
            # Run findDup on the sequence against itself and warn about
            # anything it reports.
            # NOTE(review): "dumplication" in the message below looks
            # like a typo for "duplication".
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)
            rds.append([])
    N = len(names)
    L = None
    # X[n]: K-mer -> count over all reads that hit sequence n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)
        xs = itm.kmers[0]
        # Sequences sharing at least one K-mer with this read.
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        # Credit every K-mer of the read to every sequence it hit.
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)
    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only K-mers seen at least 10 times (hard-coded threshold).
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c
        seq = seqs[names[n]]
        rngs = []
        st = None
        en = None
        inside = False
        # Print a map of the sequence: '.' where its K-mer is retained in
        # xs, 'X' where it is not.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (st, en) pairs: the last retained K-mer before a run of
        # missing K-mers and the first retained K-mer after it.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        # A run still open at the end of the sequence has en == None.
        if inside:
            rngs.append((st, en))
        pthr = Pather(K, xs)
        for (x,y) in rngs:
            # Skip runs that touch either end of the sequence.
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): with this `continue` at loop level, everything
        # below in this loop body never runs (placement reconstructed --
        # confirm).
        continue
        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj
            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)
            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)
        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]
                dd = pp[2] - pp[0]
                # Unless -a is given, keep only lengths that are a
                # multiple of three.
                if not opts['-a'] and dd % 3 != 0:
                    continue
                if opts['-s']:
                    # With -s, both junction paths must exist and agree
                    # once J characters are trimmed from each end.
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)
                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue
                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue
                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]
                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)
                # All three junction K-mers must reach the -m coverage.
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue
                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                # NOTE(review): m is 0.0 when cab and ccd are both 0,
                # which is only possible if -m is <= 0.
                w = ccb / m
                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])
                vn += 1
                # Build the header, format and value lists in parallel so
                # the columns stay aligned.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]
                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]
                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]
                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]
                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]
                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]
                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]
                # Emit the header once, before the first data row.
                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
def findAnchors(K, seq, mx, isLhs, D):
    """Find candidate anchor k-mers from ``mx`` along ``seq``.

    Args:
        K: k-mer length passed to the k-mer helpers.
        seq: sequence whose k-mers are scanned.
        mx: mapping of k-mer -> count (iterated with ``iteritems``).
        isLhs: selects the scan direction and the position convention of
            the result (see Returns).
        D: largest ``ham`` distance at which a seed k-mer is accepted as
            a match for a k-mer of ``seq``.

    Returns:
        A sorted list of ``(kmer, position)`` pairs.  When ``isLhs`` is
        true each position is reported as ``len(seq) - K - p`` for
        zero-based ``p``; otherwise ``p`` itself is reported.  The list
        is empty when no k-mer of ``seq`` has a positive count in ``mx``
        (this case previously returned a ``set``, unlike every other
        path).
    """
    xps = kmersWithPosList(K, seq, False)
    xps = [(x, p - 1) for (x, p) in xps]

    # Find the highest coverage k-mer that intersects.
    xc = 0
    for (x, p) in xps:
        if x in mx and mx[x] >= xc:
            xc = mx[x]
    if xc == 0:
        return []

    # Seeds should be in the same order of magnitude
    # as the highest-coverage seed.
    threshold = int(math.exp(math.log(xc) - 1.5))
    xs = {}
    for (x, c) in mx.iteritems():
        if c < threshold:
            continue
        xs[x] = c
    ys = xs.keys()

    # zs[p]: seeds within distance D of the k-mer at position p.  Only
    # positions whose own k-mer is itself a seed receive candidates.
    zs = {}
    for (x, p) in xps:
        zs[p] = set([])
        if x not in xs:
            continue
        for y in ys:
            if ham(x, y) > D:
                continue
            zs[p].add(y)

    e = set([])
    res = set([])
    if isLhs:
        # Scan forwards: adding (s, p) drops any candidate at p - 1 for
        # which debruijn(K, t, s) holds.
        for (x, p) in xps:
            ss = zs.get(p, e)
            tt = zs.get(p - 1, e)
            for s in ss:
                res.add((s, p))
                for prev in tt:
                    if debruijn(K, prev, s):
                        res.discard((prev, p - 1))
    else:
        # Scan backwards: adding (s, p) drops any candidate at p + 1 for
        # which debruijn(K, s, t) holds.
        for (x, p) in xps[::-1]:
            ss = zs.get(p, e)
            tt = zs.get(p + 1, e)
            for s in ss:
                res.add((s, p))
                for nxt in tt:
                    if debruijn(K, s, nxt):
                        res.discard((nxt, p + 1))

    if isLhs:
        l = len(seq) - K
        res = [(x, l - p) for (x, p) in res]
    else:
        res = list(res)
    res.sort()
    return res
def main(argv):
    """Report k-mer "spurs" that branch off reference regions.

    Steps:

    1. Read the BED-style ``<regions>`` file and index every k-mer of
       each named zone by position (``refTbl``) and by k-mer value
       (``refIdx``).
    2. Stream the paired reads from ``<input>`` and pass the
       forward/reverse k-mers of both mates to ``hits``, which is
       expected to accumulate into ``acc`` (zone -> position -> k-mer ->
       count).
    3. Drop k-mers whose count is below ``-C`` or below the fraction
       ``-V`` of the total count of the k-mers sharing the same
       ``x >> 2`` value at that position.
    4. For each remaining zone, compute forward and reverse spurs of at
       least ``-L`` elements.  With ``-r`` they are printed raw;
       otherwise they are shifted (``-S``) and only "after"/"before"
       spurs that meet at adjacent positions are printed.

    Args:
        argv: argument list handed to ``docopt`` together with the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']
    K = int(opts['-k'])
    C = int(opts['-C'])
    L = int(opts['-L'])
    raw = opts['-r']
    S = int(opts['-S'])
    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    with openFile(opts['<regions>']) as f:
        R = readBED(f)

    refTbl = {}     # zone name -> {position -> k-mer}
    refIdx = {}     # k-mer -> list of (zone name, position)
    zoneIdx = {}    # zone name -> (accession, start, end)
    for (chrom, zones) in R.items():
        chromSeq = sf[chrom]
        for (s, e, nm) in zones:
            zoneIdx[nm] = (chrom, s, e)
            seq = chromSeq[s - 1:e]
            if nm not in refTbl:
                refTbl[nm] = {}
            for (x, p) in kmersWithPosList(K, seq, False):
                # Convert the position within the zone into a position
                # on the accession.
                p += s - 1
                refTbl[nm][p] = x
                refIdx.setdefault(x, []).append((nm, p))

    acc = {}
    for itm in reads(opts['<input>'], K=K, paired=True, reads=True,
                     kmers=False, verbose=verbose):
        # hits() is called for its effect on acc; its return value is
        # not used.
        (fwdL, revL) = kmersWithPosLists(K, itm.reads[0][1])
        hits(refIdx, K, fwdL, acc)
        hits(refIdx, K, revL, acc)
        (fwdR, revR) = kmersWithPosLists(K, itm.reads[1][1])
        hits(refIdx, K, fwdR, acc)
        hits(refIdx, K, revR, acc)

    # Prune low-support k-mers, then positions and zones left empty.
    killZ = set([])
    for z in acc.keys():
        killP = set([])
        for p in acc[z].keys():
            killX = set([])
            # Group k-mers that share x >> 2 -- presumably alternatives
            # differing only in their last base.
            vv = {}
            for x in acc[z][p].keys():
                vv.setdefault(x >> 2, []).append((x, acc[z][p][x]))
            for vs in vv.values():
                vt = V * sum([c for (x, c) in vs])
                for (x, c) in vs:
                    if c < vt or c < C:
                        killX.add(x)
            for x in killX:
                del acc[z][p][x]
            if len(acc[z][p]) == 0:
                killP.add(p)
        for p in killP:
            del acc[z][p]
        if len(acc[z]) == 0:
            killZ.add(z)
    for z in killZ:
        del acc[z]

    if raw:
        # Raw rows carry seven fields; the last is the comma-separated
        # coverage list, which previously had no header column.
        print('\t'.join(
            ['chrom', 'pos', 'side', 'label', 'anchor', 'insSeq', 'cov']))
    else:
        print('\t'.join([
            'chrom', 'after', 'before', 'label', 'rhsShift', 'lhsShift',
            'lhsAnc', 'rhsAnc', 'lhsSeq', 'rhsSeq'
        ]))

    for z in sorted(acc.keys()):
        (ch, st, en) = zoneIdx[z]
        Z = acc[z]
        ref = refTbl[z]
        aft = dict(forwardSpurs(K, ref, Z))
        bef = dict(reverseSpurs(K, ref, Z))

        scoredAft = {}
        for p in sorted(aft.keys()):
            # Skip spurs leaving from the last base of the zone.
            if p + K - 1 == en:
                continue
            for spur in aft[p]:
                if len(spur) < L:
                    continue
                if raw:
                    (xs, cs) = zip(*spur)
                    seq = renderPath(K, xs)
                    anc = seq[:K]
                    ins = seq[K:]
                    print('%s\t%d\t%s\t%s\t%s\t%s\t%s' % (
                        ch, p + K - 1, 'after', z, anc, ins,
                        ','.join(map(str, cs))))
                    continue
                for (q, xcs, v) in shiftForwardSpur(ref, Z, S, p, spur):
                    q += K - 1
                    (xs, cs) = zip(*xcs)
                    seq = renderPath(K, xs)
                    anc = seq[:K]
                    ins = seq[K:]
                    scoredAft.setdefault(q, []).append((v, anc, ins, cs))

        scoredBef = {}
        for p in sorted(bef.keys()):
            # Skip spurs arriving at the first base of the zone.
            if p == st:
                continue
            for spur in bef[p]:
                if len(spur) < L:
                    continue
                if raw:
                    (xs, cs) = zip(*spur)
                    seq = renderPath(K, xs)
                    anc = seq[-K:]
                    ins = seq[:-K]
                    print('%s\t%d\t%s\t%s\t%s\t%s\t%s' % (
                        ch, p, 'before', z, anc, ins,
                        ','.join(map(str, cs))))
                    continue
                for (q, xcs, v) in shiftReverseSpur(ref, Z, S, p, spur):
                    (xs, cs) = zip(*xcs)
                    seq = renderPath(K, xs)
                    anc = seq[-K:]
                    ins = seq[:-K]
                    scoredBef.setdefault(q, []).append((v, anc, ins, cs))

        # Pair each "after" spur with "before" spurs at the next position.
        for p0 in sorted(scoredAft.keys()):
            p1 = p0 + 1
            if p1 not in scoredBef:
                continue
            for (aftV, aftAnc, aftIns, aftCov) in scoredAft[p0]:
                for (befV, befAnc, befIns, befCov) in scoredBef[p1]:
                    # Drop pairs where one side's anchor occurs inside
                    # the other side's inserted sequence.
                    if befAnc in aftIns or aftAnc in befIns:
                        continue
                    print('%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                        ch, p0, p1, z, aftV, befV, aftAnc, befAnc,
                        aftIns, befIns))
def main(argv):
    """Prepare, index, or query the exon k-mer index, depending on options.

    Modes, in the order they are tested:

    * ``-P``: write a BED file for the gene list (``-t`` selects the
      per-transcript variant) and return.
    * ``-X``: dump the index built by ``indexBedFiles`` as a multi-document
      YAML file to ``<index>`` and return.
    * otherwise: load ``<index>`` and process the paired reads in
      ``<input>``.

    NOTE(review): the first ``if True:`` block below is marked as a test
    of the double-layer index and returns unconditionally, so everything
    after it (the ``-T`` aliasing report and the exon-group table) is
    currently unreachable.

    Args:
        argv: argument list handed to ``docopt`` together with the module
            docstring.
    """
    global verbose

    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']

    genomeDir = '.'
    if opts['-g']:
        genomeDir = opts['-g']
    sf = SequenceFactory(genomeDir)

    if opts['-P']:
        if opts['-t']:
            prepareBedFileGeneTx(opts['<gene-list>'], opts['<refgene>'],
                                 opts['<bedfile>'])
        else:
            prepareBedFileGene(opts['<gene-list>'], opts['<refgene>'],
                               opts['<bedfile>'])
        return

    if opts['-X']:
        with openFile(opts['<index>'], 'w') as out:
            yaml.safe_dump_all(indexBedFiles(opts['<must-have>'], sf), out,
                               default_flow_style=False)
        return

    K = int(opts['-k'])
    minGeneReads = int(opts['-M'])
    minExonReads = int(opts['-m'])
    minGeneRate = float(opts['-R'])
    minExonRate = float(opts['-r'])
    (minGeneCount, maxGeneCount) = map(int, opts['-Z'].split(':'))
    (minExonCount, maxExonCount) = map(int, opts['-z'].split(':'))

    with openFile(opts['<index>']) as f:
        # BaseLoader builds only plain strings, lists and dicts.
        ref = list(yaml.load_all(f, Loader=yaml.BaseLoader))

    if True:
        # Test the double-layer index
        idx = ExonIndex(K, ref)
        acc = {}
        toc = {}
        rn = 0
        for itm in reads(opts['<input>'], K=K, paired=True, reads=True,
                         kmers=False, both=True, verbose=verbose):
            rn += 1
            (lhsFwd, lhsRev) = kmersLists(K, itm.reads[0][1])
            (rhsFwd, rhsRev) = kmersLists(K, itm.reads[1][1])

            xs0 = lhsFwd + rhsRev
            rh0 = idx.readHash(xs0)
            if rh0 is not None:
                (h0, ys0) = rh0
                if h0 not in acc:
                    acc[h0] = []
                    toc[h0] = ys0
                acc[h0].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))

            xs1 = lhsRev + rhsFwd
            rh1 = idx.readHash(xs1)
            if rh1 is not None:
                (h1, ys1) = rh1
                if h1 not in acc:
                    acc[h1] = []
                    toc[h1] = ys1
                acc[h1].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))

        nx = 0
        for h in sorted(acc.keys()):
            # NOTE(review): acc[h] is a list of read pairs here, so
            # .items() raises AttributeError as soon as any read hashes.
            # Left untouched because the intended data shape is not
            # visible from this function.
            for (x, c) in sorted(acc[h].items()):
                nx += 1
                if c <= 1:
                    continue
                print('%016x\t%s\t%d' % (h, render(K, x), c))
        sys.stderr.write('nx = %s\n' % (nx, ))
        return

    if False:
        # Position index
        idx = {}
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                p -= 1
                if x not in idx:
                    idx[x] = []
                idx[x].append((i, p))

    if True:
        # Exon tuple index
        idx = {}
        lens = [0 for i in range(len(ref))]
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(i)
                lens[i] += 1
        # Freeze each set into a sorted tuple so it is hashable and has a
        # fixed order.
        for x in idx.iterkeys():
            idx[x] = tuple(sorted(idx[x]))

    if opts['-T']:
        # Report k-mers shared between index entries ...
        ak = {}
        for x in sorted(idx.iterkeys()):
            if len(idx[x]) == 1:
                continue
            xStr = render(K, x)
            ak[xStr] = []
            for i in idx[x]:
                itm = ref[i]
                k = '%s/%s' % (itm['gene'], itm['exon'])
                ak[xStr].append(k)
            ak[xStr].sort()
        rep = {}
        rep['aliasing-within'] = ak

        # ... and entries whose k-mers occur a different number of times
        # in the chromosome sequences than in the index.
        chrs = set([])
        for i in range(len(ref)):
            itm = ref[i]
            chrs.add(itm['chr'])
        counts = [0 for i in range(len(ref))]
        for ch in sorted(chrs):
            if verbose:
                sys.stderr.write('processing %s\n' % (ch, ))
            seq = sf[ch]
            for (x, p) in kmersWithPos(K, seq, True):
                if x not in idx:
                    continue
                for i in idx[x]:
                    counts[i] += 1
        gk = {}
        for i in range(len(ref)):
            if lens[i] == counts[i]:
                continue
            itm = ref[i]
            k = '%s/%s' % (itm['gene'], itm['exon'])
            gk[k] = {'indexed': lens[i], 'genomic': counts[i]}
        rep['aliasing-genomic'] = gk
        yaml.safe_dump(rep, sys.stdout, default_flow_style=False)
        return

    # acc: sorted tuple of hit keys -> [number of reads, summed hit values]
    acc = {}
    rn = 0
    hitStats = Summary()
    # NOTE(review): fixed-size histogram; a hit count of 1000 or more
    # would raise IndexError.
    hitHist = [0 for i in range(1000)]
    for itm in reads(opts['<input>'], K=K, paired=True, reads=True,
                     kmers=False, both=True, verbose=verbose):
        rn += 1
        (lhsFwd, lhsRev) = kmersWithPosLists(K, itm.reads[0][1])
        (rhsFwd, rhsRev) = kmersWithPosLists(K, itm.reads[1][1])
        (hits0, hitCount0) = recHits(idx, lhsFwd + rhsRev)
        (hits1, hitCount1) = recHits(idx, lhsRev + rhsFwd)
        if len(hits0) > 0:
            k = tuple(sorted(hits0.keys()))
            v = sum(hits0.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount0)
            hitHist[hitCount0] += 1
        if len(hits1) > 0:
            k = tuple(sorted(hits1.keys()))
            v = sum(hits1.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount1)
            hitHist[hitCount1] += 1

    if verbose:
        sys.stderr.write('total read hits: %d\n' % (len(hitStats), ))
        sys.stderr.write('total hits per read: %g (%g)\n'
                         % (hitStats.mean(), hitStats.sd()))
        sys.stderr.write('total reads: %d\n' % (rn, ))
        for i in range(len(hitHist)):
            if hitHist[i] > 0:
                sys.stderr.write('\t%d\t%d\n' % (i, hitHist[i]))

    def gex(s):
        # Render one group of index entries as 'gene/exon|gene/exon|...'.
        r = []
        for n in s:
            itm = ref[n]
            r.append('%s/%s' % (itm['gene'], itm['exon']))
        return '|'.join(r)

    def fmtKey(k):
        # Return (number of groups, set of gene names, printable key).
        nex = len(k)
        gx = set([])
        kStrParts = []
        for s in k:
            kStrParts.append(gex(s))
            gx |= set([ref[i]['gene'] for i in s])
        kStr = '--'.join(sorted(kStrParts))
        return (nex, gx, kStr)

    # Aggregate read and k-mer counts per gene group.
    gxCounts = {}
    for k in acc.keys():
        gx = set([])
        for s in k:
            gx |= set([ref[i]['gene'] for i in s])
        gx = tuple(sorted(gx))
        if gx not in gxCounts:
            gxCounts[gx] = [0, 0]
        gxCounts[gx][0] += acc[k][0]
        gxCounts[gx][1] += acc[k][1]

    hdr = ['numReads', 'numKmers', 'kmersPerRead']
    hdr += ['ggNumReads', 'ggNumKmers', 'ggKmersPerRead']
    hdr += ['numExons', 'numGenes', 'geneGroup', 'exonGroup']
    print('\t'.join(hdr))

    for k in acc.keys():
        (nex, gx, kStr) = fmtKey(k)
        gx = tuple(sorted(gx))
        # Distinct index entries covered by THIS key.  The exon-count
        # filter previously reused the value left over from the last
        # iteration of the aggregation loop above.
        ex = set([])
        for s in k:
            ex |= set(s)
        if len(gx) < minGeneCount or len(gx) > maxGeneCount:
            continue
        if len(ex) < minExonCount or len(ex) > maxExonCount:
            continue
        if gxCounts[gx][0] < minGeneReads:
            continue
        if acc[k][0] < minExonReads:
            continue
        gxRate = float(gxCounts[gx][1]) / float(gxCounts[gx][0])
        if gxRate < minGeneRate:
            continue
        exRate = float(acc[k][1]) / float(acc[k][0])
        if exRate < minExonRate:
            continue
        gxStr = ':'.join(gx)
        print('%d\t%d\t%g\t%d\t%d\t%g\t%d\t%d\t%s\t%s' % (
            acc[k][0], acc[k][1], exRate, gxCounts[gx][0], gxCounts[gx][1],
            gxRate, nex, len(gx), gxStr, kStr))