def _tallyAnchoredRead(tally, hits, seq):
    """Count one read orientation under its offset-normalised anchor pattern.

    `hits` is a non-empty set of (k-mer, position) pairs.  Positions are
    rebased so that the smallest becomes 0; the rebased set is the pattern
    key, and (smallest position, seq) is counted beneath it.
    """
    first = min([p for (_, p) in hits])
    pattern = frozenset([(x, p - first) for (x, p) in hits])
    perPattern = tally.setdefault(pattern, {})
    key = (first, seq)
    perPattern[key] = perPattern.get(key, 0) + 1


def showAnchoredReads(K, anks, rds):
    """Print reads aligned under the anchor k-mers they contain.

    K    -- k-mer length.
    anks -- dict mapping anchor k-mer -> label string.
    rds  -- iterable of read records; element [1] is used as the sequence
            (presumably FASTQ-style tuples -- confirm against the caller).

    Reads are grouped by the set of anchors they contain and the anchors'
    relative spacing.  Both orientations of a read are considered; the
    reverse orientation is shown as the reverse complement.  For every
    group a header line with the rendered anchors is printed, followed by
    one line per distinct (offset, sequence) with its count, padded so that
    the anchors line up, then a blank line.
    """
    anchors = set(anks)
    tally = {}
    for rd in rds:
        (fwdKmers, revKmers) = kmersWithPosLists(K, rd[1])
        fwdHits = set([(x, p) for (x, p) in fwdKmers if x in anchors])
        revHits = set([(y, p) for (y, p) in revKmers if y in anchors])
        if len(fwdHits) > 0:
            _tallyAnchoredRead(tally, fwdHits, rd[1])
        if len(revHits) > 0:
            _tallyAnchoredRead(tally, revHits, revComp(rd[1]))

    for pattern in sorted(tally.keys()):
        counts = tally[pattern]
        # Largest anchor offset in the group: every row is padded up to it.
        widest = max([p for (p, _) in counts.keys()])
        label = ','.join(sorted([anks[x] for (x, _) in pattern]))
        header = [label, '\t\t', (widest - 1) * ' ']
        prev = None
        for (p, x) in sorted([(p1, x1) for (x1, p1) in pattern]):
            if prev is not None:
                # Gap between the end of the previous anchor and this one.
                header.append((p - prev - K) * ' ')
            header.append(render(K, x))
            prev = p
        print(''.join(header))
        rows = sorted([(p0, c, seq) for ((p0, seq), c) in counts.items()])
        for (p0, c, seq) in rows:
            print('%d\t%d\t%s%s' % (p0, c, (widest - p0) * ' ', seq))
        print('')
def renderPath(K, xs):
    """Render a path of consecutive k-mers as one string.

    The first k-mer is rendered in full via `render`; every later k-mer
    contributes only the base encoded in its two low-order bits.  An empty
    path renders as the empty string.
    """
    if len(xs) == 0:
        return ''
    head = render(K, xs[0])
    tail = ['ACGT'[x & 3] for x in xs[1:]]
    return head + ''.join(tail)
def test_fixed_path_kmers_1():
    """fixed_path_kmers should keep the true k-mer at every path position.

    Simulates N noisy copies of a fixed sequence (per-base substitution
    rate e), counts their k-mers, and checks that for each position of the
    error-free path the true k-mer is among the returned candidates and no
    other candidate has a strictly higher count.
    """
    random.seed(17)
    K = 25
    D = 3
    N = 100
    e = 0.01
    # For each base, the three bases it can be substituted with.
    alts = dict((b, [c for c in 'ACGT' if c != b]) for b in 'ACGT')
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    xs = kmersList(K, seq)

    X = {}
    for _ in range(N):
        noisy = []
        for b in seq:
            if random.random() < e:
                b = random.choice(alts[b])
            noisy.append(b)
        for y in kmersList(K, ''.join(noisy)):
            X[y] = X.get(y, 0) + 1

    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for (i, truth) in enumerate(xs):
        candidates = Y[i]
        assert truth in candidates
        heavier = [(render(K, y), X[y]) for y in candidates
                   if y != truth and X[y] > X[truth]]
        assert len(heavier) == 0
def renderPath(K, xs):
    """Return the sequence spelled by the k-mer path `xs`.

    The result is the full rendering of the first k-mer followed by one
    base (taken from the low two bits) for each subsequent k-mer; an empty
    path gives ''.
    """
    if len(xs) == 0:
        return ''
    pieces = [render(K, xs[0])]
    pieces.extend("ACGT"[x & 3] for x in xs[1:])
    return ''.join(pieces)
def main(argv):
    """Dump the k-mers of a container to stdout as text.

    Prints one rendered k-mer per line, followed by a tab and its count
    when the container's metadata has a 'counts' entry.  If the metadata
    has no 'kmers' entry a message is written to stderr and nothing is
    dumped.
    """
    opts = docopt.docopt(__doc__, argv)
    inp = opts['<input>']
    with kmers(inp, 'r') as z:
        K = z.meta['K']
        if 'kmers' not in z.meta:
            print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % (inp,)
            return
        if 'counts' not in z.meta:
            for x in readKmers(z):
                print(render(K, x))
            return
        for (x, c) in readKmersAndCounts(z):
            print('%s\t%d' % (render(K, x), c))
def trace1(self, pth, xe, n):
    """Yield every extension of `pth` that ends at `xe`, depth first.

    pth -- non-empty list of k-mers forming the path so far.
    xe  -- the target k-mer.
    n   -- maximum path length; longer paths are abandoned.

    A path is yielded whenever its last k-mer equals `xe`; the search then
    still continues through that k-mer.  Each visited path is also traced
    to stdout as "<length> <last k-mer>".
    """
    if len(pth) > n:
        return
    tip = pth[-1]
    if tip == xe:
        yield pth
    print('%d %s' % (len(pth), render(self.K, tip)))
    for nxt in self.succ(tip):
        for found in self.trace1(pth + [nxt], xe, n):
            yield found
def main(argv):
    """Print the non-branching paths of each input k-mer set as FASTA contigs.

    For every input container the k-mers are loaded into a sparse set and
    walked: starting from a k-mer, the walk follows successors for as long
    as there is exactly one and it has not been visited.  Paths spelling at
    least L bases (option -l, default 2*K) are printed as
    ">contig_<rank of first k-mer>".

    Fix: the original start test was `succ(...) == 1`, comparing a sequence
    with an int.  That is always false, so walks began at arbitrary interior
    k-mers and contigs were fragmented.  A k-mer is now skipped as a start
    only when it has exactly one predecessor and that predecessor has
    exactly one successor, i.e. when a walk from upstream will absorb it.
    A second sweep then starts from whatever is still unvisited, so closed
    cycles, which have no start, are still emitted.

    Assumes, as the original code did, that `succ` returns the ranks (in S)
    of the successors and that S is closed under reverse complement.
    """
    opts = docopt.docopt(__doc__, argv)
    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])
    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K
            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)
            N = S.count()
            seen = bitvec(N)
            # Sweep 0 starts only at genuine contig starts; sweep 1 picks
            # up anything left over (cycles).
            for sweep in (False, True):
                for i in xrange(N):
                    if seen[i]:
                        continue
                    x = S.select(i)
                    if not sweep:
                        # Successors of the reverse complement are the
                        # reverse complements of x's predecessors.
                        xp = succ(K, S, rc(K, x))
                        if len(xp) == 1:
                            prev = rc(K, S.select(xp[0]))
                            if len(succ(K, S, prev)) == 1:
                                # x isn't the start of a contig
                                continue
                    pth = [x]
                    seen[i] = 1
                    xn = succ(K, S, x)
                    while len(xn) == 1:
                        if seen[xn[0]] == 1:
                            break
                        x = S.select(xn[0])
                        pth.append(x)
                        seen[xn[0]] = 1
                        # Mark the opposite strand as visited too, so the
                        # same contig is not reported in both orientations.
                        xb = rc(K, x)
                        j = S.rank(xb)
                        seen[j] = 1
                        xn = succ(K, S, x)
                    # A path of n k-mers spells n + K - 1 bases.
                    if len(pth)+K-1 < L:
                        continue
                    s = [render(K, pth[0])]
                    for y in pth[1:]:
                        s.append("ACGT"[y&3])
                    print('>contig_%d\n%s' % (i, ''.join(s)))
def test_kmersList():
    """Round-trip a sorted list of random k-mers through a casket file.

    Writes 65536 sorted random 25-mers with writeKmersList, reads them back
    with readKmers and checks they come back unchanged and in order.
    """
    K = 25
    M = (1 << (2 * K)) - 1
    N = 65536
    random.seed(17)
    xs = sorted(random.randint(0, M) for _ in xrange(N))
    with autoremove():
        t = tmpfile()
        with casket(t, 'w') as z:
            # Hand the writer its own copy of the list.
            writeKmersList(z, list(xs))
        with casket(t, 'r') as z:
            ys = list(readKmers(z))
        assert len(xs) == len(ys)
        for (i, (x, y)) in enumerate(zip(xs, ys)):
            assert x == y, '%d\t%s\t%s' % (i, render(K, x), render(K, y))
def computeBias(K, zs, verbose=False):
    """Summarise strand bias over a k-mer count table.

    K       -- k-mer length.
    zs      -- dict mapping k-mer -> count.
    verbose -- when true, print one line per examined pair.

    Each k-mer is paired with its reverse complement; a pair is examined
    only from the member that is not greater than its reverse complement,
    so it is visited at most once.  The per-pair statistic is the larger
    count divided by the sum of both counts (0.5 when both are zero).
    Returns (mean, variance) of the statistic as reported by `summarizer`.
    """
    stats = summarizer()
    for (x, xc) in zs.iteritems():
        y = rc(K, x)
        if y < x:
            continue
        yc = zs.get(y, 0)
        major = max(xc, yc)
        minor = min(xc, yc)
        total = major + minor
        v = float(major) / float(total) if total > 0 else 0.5
        if verbose:
            print('%s\t%s\t%d\t%d\t%g' % (render(K, x), render(K, y), xc, yc, v))
        stats.add(v)
    return (stats.mean(), stats.var())
def remapReads(K, L, rds, v):
    """Print, per position in a variant's context, the read k-mers placed there.

    K   -- k-mer length.
    L   -- half-width parameter; the context is requested as v.context(2*L).
    rds -- iterable of read records; element [1] is used as the sequence.
    v   -- variant object providing context(); element [1] of the result is
           used as the context sequence.

    The context's k-mers are indexed by position, each read is placed with
    `locate`, and one line "position<TAB>k-mer<TAB>count" is printed in
    (position, k-mer) order.
    """
    ctx = v.context(2*L)
    idx = {}
    for (x, p) in kmersWithPosList(K, ctx[1], False):
        idx.setdefault(x, []).append(p)

    res = {}
    for fq in rds:
        for (x, p) in locate(K, idx, fq[1]):
            atPos = res.setdefault(p, {})
            atPos[x] = atPos.get(x, 0) + 1

    for p in sorted(res):
        atPos = res[p]
        for y in sorted(atPos):
            print('%d\t%s\t%d' % (p, render(K, y), atPos[y]))
def main(argv):
    """Report k-mers in reads that end in an in-frame stop codon.

    Reference sequences (option -r) are indexed: every k-mer is recorded
    with the (name, 0-based position) pairs at which it occurs, and k-mers
    at a position divisible by three whose last three bases are a stop
    codon are recorded as known stops.

    Each read's forward and reverse k-mers vote for (reference, offset,
    strand) frames through the index.  One frame is drawn at random with
    probability proportional to its votes (the generator is seeded with 17,
    so runs are repeatable), and the read's k-mers on that strand which end
    in a stop codon in that frame are counted per reference.

    Output, one line per (reference, k-mer): known/novel, the k-mer, its
    count, and the distance and k-mer returned by `nearest3` against the
    reference's own k-mers, then the reference name.
    """
    opts = docopt.docopt(__doc__, argv)
    random.seed(17)
    K = int(opts['-k'])
    S = 2*(K-3)

    frameAnchors = {}
    knownStops = {}
    sequences = {}
    seqKmers = {}
    if opts['-r']:
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                sequences[nm] = seq
                # trim polyA tails
                seq = re.sub('AAAAAA*$', '', seq)
                refKmers = seqKmers[nm] = set()
                for (x, p1) in kmersWithPosList(K, seq, False):
                    refKmers.add(x)
                    p = p1 - 1
                    frameAnchors.setdefault(x, set()).add((nm, p))
                    # The low 6 bits are the k-mer's last three bases.
                    if p % 3 == 0 and (x & 63) in stops:
                        knownStops.setdefault(x, set()).add(nm)

    rn = 0
    res = {}
    for fn in opts['<input>']:
        with openFile(fn) as f:
            for rd in readFastq(f):
                L = len(rd[1])
                rn += 1
                fwdAndRev = kmersWithPosLists(K, rd[1])
                frames = {}
                possibleStops = {}
                for strand in range(2):
                    for (x, p) in fwdAndRev[strand]:
                        for (nm, q) in frameAnchors.get(x, ()):
                            key = (nm, q - p, strand)
                            frames[key] = frames.get(key, 0) + 1
                if len(frames) == 0:
                    continue

                total = float(sum(frames.values()))
                probs = [(float(cnt)/total, cnt, off, strnd, nm)
                         for ((nm, off, strnd), cnt) in sorted(frames.items())]

                # Pick a frame by walking the cumulative distribution.
                v = random.random()
                for (pv, cnt, off, strnd, nm) in probs:
                    if v >= pv:
                        v -= pv
                        continue
                    for (x, p) in fwdAndRev[strnd]:
                        if (p + off + K - 3) % 3 != 0:
                            continue
                        if (x & 63) not in stops:
                            continue
                        perRef = res.setdefault(nm, {})
                        perRef[x] = perRef.get(x, 0) + 1
                    break

    for (nm, stps) in res.iteritems():
        for (x, c) in stps.iteritems():
            (d, y) = nearest3(K, seqKmers[nm], x)
            k = 'known' if x in knownStops else 'novel'
            print('%s\t%s\t%d\t%d\t%s\t%s' % (k, render(K, x), c, d, render(K, y), nm))
def main(argv):
    """Compare, per (K-1)-base context, the final-base counts between k-mer sets.

    K-mers are grouped by their first J = K-1 bases; within a group the
    counts of the possible final bases are compared with `logBinGe`
    (presumably the log of a binomial upper-tail probability -- confirm
    against its definition).

    With -r: each input is compared against the reference container given
    by -r, and a line is printed for every context in which at least one
    final base scores below -10.

    Without -r: all inputs are merged in parallel through a heap, the
    pooled counts form the expected distribution, and a context is printed
    when the inputs do not all produce the same set of final bases.
    """
    opts = docopt.docopt(__doc__, argv)
    K = getK(opts['<input>'])
    J = K - 1
    # Mask for the 2*(K-J) low bits; with J = K-1 that is the final base.
    M = (1 << (2 * (K - J))) - 1
    if opts['-r'] is not None:
        with kmers(opts['-r'], 'r') as z:
            xs = list(group(K, J, 0, readKmersAndCounts(z)))
        for fn in opts['<input>']:
            with kmers(fn, 'r') as z:
                samXs = readKmersAndCounts(z)
                i = 0
                for (yCtx, _, yGrp) in group(K, J, 0, samXs):
                    # Advance the reference cursor to this context; every
                    # sample context must also exist in the reference.
                    while i < len(xs) and xs[i][0] < yCtx:
                        i += 1
                    assert i < len(xs)
                    assert xs[i][0] == yCtx
                    # Reference counts per final base, and their total.
                    gt = float(sum([c for (x,c) in xs[i][2]]))
                    gx = [0 for j in xrange(M+1)]
                    for (x,c) in xs[i][2]:
                        gx[x&M] = c
                    # Sample counts per final base, and their total.
                    st = sum([c for (x,c) in yGrp])
                    sx = [0 for j in xrange(M+1)]
                    for (x,c) in yGrp:
                        sx[x&M] = c
                    ss = []
                    b = 0
                    for j in xrange(M+1):
                        p = float(gx[j])/gt
                        v = 0.0
                        # Only test bases whose reference frequency is
                        # strictly between 0 and 1.
                        if 0.0 < p and p < 1.0:
                            v = logBinGe(p, st, sx[j])
                            if v < -10:
                                b |= 1 << j
                        ss.append('%3.2g' % (v,))
                    if b > 0:
                        print '%s\t%s\t%s' % (render(J, yCtx), fasta(b), '\t'.join(ss))
                    i += 1
        return
    # Parse files in parallel to get global distribution
    N = len(opts['<input>'])
    h = heap.heap()
    i = 0
    for fn in opts['<input>']:
        (_, xs) = kfset.read(fn)
        i += 1
        # Inputs are numbered from 1; the number is used below to place
        # each input's result at res[n-1].
        h.push(Group(K, J, i, xs))
    while len(h) > 0:
        # Pull from the heap every input whose current group has the same
        # context as the front one.
        xfs = []
        g = h.pop()
        gy = g.this()[0]
        xfs.append(g.this())
        g.next()
        if g.valid():
            h.push(g)
        for x in h.xs:
            assert x.valid()
        while len(h) > 0 and h.front().this()[0] == gy:
            g = h.pop()
            xfs.append(g.this())
            g.next()
            if g.valid():
                h.push(g)
            for i in xrange(len(h.xs)):
                assert h.xs[i].valid()
        # gc accumulates the pooled count per final base; ds keeps each
        # input's own counts.
        ds = []
        gc = [0 for i in xrange(M+1)]
        for (_, n, xc) in xfs:
            t = sum([c for (x,c) in xc])
            d = [0 for i in xrange(M+1)]
            for (x,c) in xc:
                j = x & M
                gc[j] += c
                d[j] = c
            ds.append((n, d))
        # '*' marks inputs that have no k-mers in this context.
        res = ['*' for i in xrange(N)]
        seen = set([])
        gt = float(sum(gc))
        for (n, d) in ds:
            t = sum(d)
            # One 4-bit base set per group of four final-base values.
            b = [0 for i in xrange((M+1)/4)]
            for i in xrange(M+1):
                p = float(gc[i])/gt
                if 0.0 < p and p < 1.0:
                    #vL = logBinLe(p, t, d[i])
                    #vG = logBinGe(p, t, d[i])
                    #v = min(vL, vG)
                    v = logBinGe(p, t, d[i])
                    if v > -10:
                        w = i >> 2
                        j = i & 3
                        b[w] |= 1 << j
            res[n-1] = ''.join([fasta(b0) for b0 in b])
            seen.add(res[n-1])
        # Only report contexts on which the inputs disagree.
        if len(seen) > 1:
            print '%s\t%s' % (render(J, gy), '\t'.join(res))
def main(argv):
    """Look for tandem duplications in reads captured by bait sequences.

    Each bait sequence from <sequences> is indexed by its K-mers, and each
    K-mer is split into its first and last J = K/2 bases.  Reads sharing
    any K-mer with a bait contribute all their K-mers to that bait's count
    table.  Candidate duplications are quadruples (a, b, c, d) of J-mers
    from `findDup`, giving the junction K-mers ab, cb and cd.

    K must be even; otherwise a message is printed to stderr and the
    function returns.
    """
    opts = docopt.docopt(__doc__, argv)
    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return
    minCov = int(opts['-m'])
    verbose = opts['-v']
    J = K // 2
    # x >> S is the first J-mer of a K-mer and x & Mj is the last one.
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1
    names = []
    seqs = {}
    bait = {}       # K-mer -> set of bait indices containing it
    wtFst = []      # per bait: first J-mer -> set of last J-mers
    wtLst = []      # per bait: last J-mer -> set of first J-mers
    posIdx = []     # per bait: J-mer -> list of positions
    rds = []        # per bait: captured reads
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)
                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))
                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)
                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)
            wtFst.append(wf)
            wtLst.append(wl)
            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)
            # Run the detector on the bait against itself: any hit is a
            # duplication-like pattern already present in the bait.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)
            rds.append([])
    N = len(names)
    L = None
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)
        xs = itm.kmers[0]
        # Baits sharing at least one K-mer with this read.
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        # A captured read contributes all of its K-mers to each bait it hit.
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)
    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only K-mers seen at least 10 times.
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c
        seq = seqs[names[n]]
        rngs = []
        st = None
        en = None
        inside = False
        # Coverage map of the bait: '.' where its K-mer was kept, 'X'
        # where it was not.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (last covered K-mer before a gap, first covered K-mer
        # after it) for every uncovered stretch of the bait.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))
        pthr = Pather(K, xs)
        for (x,y) in rngs:
            # Gaps touching either end of the bait have no anchor on one side.
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): the nesting of this `continue` could not be
        # recovered from the whitespace-mangled source.  It is placed at
        # loop level here, which makes the rest of this loop body
        # unreachable (i.e. only the gap tracing above runs).  If it
        # belongs inside the `for p` loop it is a no-op and the duplication
        # report below is live.  Confirm against version control.
        continue
        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj
            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)
            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)
        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]
                dd = pp[2] - pp[0]
                # Unless -a is given, keep only lengths divisible by three.
                if not opts['-a'] and dd % 3 != 0:
                    continue
                if opts['-s']:
                    # Require both copies to be spelled by the observed
                    # K-mers and to agree away from their ends.
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)
                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue
                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue
                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]
                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)
                # All three junction K-mers must reach the minimum coverage.
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue
                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                # Reported as 'vaf': coverage of the novel junction cb
                # relative to the mean of the two flanking junctions.
                w = ccb / m
                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])
                vn += 1
                # Build the output row as parallel lists of column names,
                # format codes and values.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]
                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]
                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]
                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]
                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]
                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]
                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]
                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
def main(argv):
    """Index HGVS variants, or call them as wt / mut from paired reads.

    With -X: parse the variants given on the command line (and in the file
    given by -f), build an index record for each with `makeIndexedVariant`
    and write the list as YAML to <index>.  If -T is also given, instead
    count how often each variant's wild-type and mutant k-mers occur in the
    reference sequences and dump that report to stdout.

    Otherwise: load <index>, use every variant's flanks and alleles as
    capture bait, stream the read pairs through the capture, then search
    each variant's captured k-mers for wild-type and mutant allele paths,
    score them, and write one tab-separated row per variant to the -o
    output.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']
    K = int(opts['-k'])
    D = int(opts['-D'])
    Q = int(opts['-C'])     # minimum k-mer count / coverage threshold
    V = float(opts['-V'])   # minimum allele frequency for a call
    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)
    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])
        variants = opts['<variant>']
        if opts['-f']:
            with openFile(opts['-f']) as f:
                variants += f.read().split()
        # Group the parsed variants by accession.
        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)
        # chk maps k-mer -> set of ('wt' | 'mut', variant); only built for -T.
        chk = None
        if opts['-T']:
            chk = {}
        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                    if chk is not None:
                        # K-mers overlapping the allele: the allele plus
                        # K-1 bases of flank on either side.
                        xs = kmersList(
                            K, ''.join([
                                r['lhsFlank'][-(K - 1):], r['wtSeq'],
                                r['rhsFlank'][:K - 1]
                            ]), True)
                        for x in xs:
                            if x not in chk:
                                chk[x] = set([])
                            chk[x].add(('wt', str(v)))
                        if r['mutSeq'] is None:
                            continue
                        xs = kmersList(
                            K, ''.join([
                                r['lhsFlank'][-(K - 1):], r['mutSeq'],
                                r['rhsFlank'][:K - 1]
                            ]), True)
                        for x in xs:
                            if x not in chk:
                                chk[x] = set([])
                            chk[x].add(('mut', str(v)))
        if chk is not None:
            # Count occurrences of the variant k-mers across every
            # accession in refSeq2Hg19.
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            # res[variant][allele][count] = number of k-mers with that
            # combined count; a k-mer and its reverse complement are
            # handled once.
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return
        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)
        return
    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']
    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))
    if verbose:
        print >> sys.stderr, "loading index."
    with open(opts['<index>']) as f:
        # NOTE(review): the index is written above with safe_dump, but is
        # read back with FullLoader; consider yaml.safe_load if the index
        # file can come from an untrusted source.
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)
    NV = len(hgvsVars)
    combineStrands = True
    if opts['-s']:
        combineStrands = False
    cap = capture(K, reads=capt, kmers=True, verbose=verbose)
    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        # Bait is the wild-type haplotype and, if there is one, the mutant
        # haplotype, joined by an 'N'.
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        # Bait numbers must line up with index positions.
        assert n0 == n
    if verbose:
        print >> sys.stderr, "done."
    rn = 0
    for itm in reads(opts['<input>'], K=K, paired=True, reads=True, kmers=False, both=True, verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])
    if capt:
        cap.saveReads(zipname)
    scorer = Scorer(K)
    # Histogram of captured k-mer counts (>= Q) over all variants.
    globHist = {}
    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1
    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']
            mx = cap.capKmers[n]
            nr = cap.capReadCounts[n]
            if 'kmers' in fmt:
                # NOTE(review): this dump goes to stdout, not to `out`.
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)
            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']
            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []
            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)
            mutSeq = itm['mutSeq']
            mutZ = v.size()
            # Coverage quantiles over the k-mers with count >= Q.
            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]
            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]
            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq, wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())
            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1
            # Start from an empty "no allele found" result and keep the
            # best-scoring candidate path for each allele.
            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes
            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes
            if True:
                # Coverage along each chosen path; an empty path counts as
                # a single zero.
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]
                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]
                # Allele fractions are taken relative to the largest of 1,
                # the summed medians and the 90th-percentile coverage.
                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX
            hdrs = ['n']
            fmts = ['%d']
            outs = [n]
            # An allele is called when its path clears the coverage
            # threshold, is within Hamming distance 3 and exceeds V.
            wtAllele = ((wtRes['covMin'] > Q) and (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and (mutRes['hamming'] < 4)) and (mutVaf > V)
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]
            hdrs += ['res']
            fmts += ['%s']
            outs += [res]
            # Optional columns are selected by the -F format list.
            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]
            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]
            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]
            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]
            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]
            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]
            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]
            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]
            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()
def main(argv):
    """Prepare, index, or query an exon k-mer index with paired reads.

    -P  write a BED file for a gene list (per transcript with -t), return.
    -X  index the <must-have> BED files and dump the records as YAML to
        <index>, then return.
    Otherwise the index is loaded and reads are matched against it.

    NOTE(review): as written, the "double-layer index" test block below
    returns unconditionally, so everything after it (the exon tuple index,
    the -T aliasing report and the exon-group table) is unreachable.
    """
    global verbose
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']
    genomeDir = '.'
    if opts['-g']:
        genomeDir = opts['-g']
    sf = SequenceFactory(genomeDir)
    if opts['-P']:
        if opts['-t']:
            prepareBedFileGeneTx(opts['<gene-list>'], opts['<refgene>'], opts['<bedfile>'])
        else:
            prepareBedFileGene(opts['<gene-list>'], opts['<refgene>'], opts['<bedfile>'])
        return
    if opts['-X']:
        with openFile(opts['<index>'], 'w') as out:
            yaml.safe_dump_all(indexBedFiles(opts['<must-have>'], sf), out, default_flow_style=False)
        return
    K = int(opts['-k'])
    minGeneReads = int(opts['-M'])
    minExonReads = int(opts['-m'])
    minGeneRate = float(opts['-R'])
    minExonRate = float(opts['-r'])
    # -Z and -z are given as "min:max".
    (minGeneCount, maxGeneCount) = map(int, opts['-Z'].split(':'))
    (minExonCount, maxExonCount) = map(int, opts['-z'].split(':'))
    with openFile(opts['<index>']) as f:
        ref = list(yaml.load_all(f, Loader=yaml.BaseLoader))
    if True:
        # Test the double-layer index
        idx = ExonIndex(K, ref)
        acc = {}    # read hash -> list of (compressed read 1, compressed read 2)
        toc = {}    # read hash -> second element returned by readHash
        rn = 0
        for itm in reads(opts['<input>'], K=K, paired=True, reads=True, kmers=False, both=True, verbose=verbose):
            rn += 1
            (lhsFwd, lhsRev) = kmersLists(K, itm.reads[0][1])
            (rhsFwd, rhsRev) = kmersLists(K, itm.reads[1][1])
            # The pair is hashed in both orientations: read 1 forward with
            # read 2 reversed, and the other way round.
            xs0 = lhsFwd + rhsRev
            rh0 = idx.readHash(xs0)
            if rh0 is not None:
                (h0, ys0) = rh0
                if h0 not in acc:
                    acc[h0] = []
                    toc[h0] = ys0
                acc[h0].append((compressRead(itm.reads[0][1]), compressRead(itm.reads[1][1])))
            xs1 = lhsRev + rhsFwd
            rh1 = idx.readHash(xs1)
            if rh1 is not None:
                (h1, ys1) = rh1
                if h1 not in acc:
                    acc[h1] = []
                    toc[h1] = ys1
                acc[h1].append((compressRead(itm.reads[0][1]), compressRead(itm.reads[1][1])))
        nx = 0
        for h in sorted(acc.keys()):
            # NOTE(review): acc[h] is a list (see the appends above), so
            # .items() raises AttributeError as soon as any read was
            # hashed.  This loop looks left over from a version where
            # acc[h] was a k-mer -> count dict.
            for (x, c) in sorted(acc[h].items()):
                nx += 1
                if c <= 1:
                    continue
                print '%016x\t%s\t%d' % (h, render(K, x), c)
        print >> sys.stderr, 'nx =', nx
        return
    if False:
        # Position index
        idx = {}
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                p -= 1
                if x not in idx:
                    idx[x] = []
                idx[x].append((i, p))
    if True:
        # Exon tuple index
        # idx maps k-mer -> sorted tuple of the ref entries containing it;
        # lens[i] counts the k-mers of entry i.
        idx = {}
        lens = [0 for i in range(len(ref))]
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(i)
                lens[i] += 1
        for x in idx.iterkeys():
            idx[x] = tuple(sorted(idx[x]))
    if opts['-T']:
        # Aliasing report: k-mers shared between indexed entries, and
        # entries whose k-mers occur a different number of times on their
        # chromosomes than in the index.
        ak = {}
        for x in sorted(idx.iterkeys()):
            if len(idx[x]) == 1:
                continue
            xStr = render(K, x)
            ak[xStr] = []
            for i in idx[x]:
                itm = ref[i]
                k = '%s/%s' % (itm['gene'], itm['exon'])
                ak[xStr].append(k)
            ak[xStr].sort()
        rep = {}
        rep['aliasing-within'] = ak
        chrs = set([])
        for i in range(len(ref)):
            itm = ref[i]
            chrs.add(itm['chr'])
        counts = [0 for i in range(len(ref))]
        for ch in sorted(chrs):
            if verbose:
                print >> sys.stderr, 'processing %s' % (ch, )
            seq = sf[ch]
            for (x, p) in kmersWithPos(K, seq, True):
                if x not in idx:
                    continue
                for i in idx[x]:
                    counts[i] += 1
        gk = {}
        for i in range(len(ref)):
            if lens[i] == counts[i]:
                continue
            itm = ref[i]
            k = '%s/%s' % (itm['gene'], itm['exon'])
            gk[k] = {'indexed': lens[i], 'genomic': counts[i]}
        rep['aliasing-genomic'] = gk
        yaml.safe_dump(rep, sys.stdout, default_flow_style=False)
        return
    # acc maps a sorted tuple of hit keys -> [number of reads, summed hit values].
    acc = {}
    rn = 0
    hitStats = Summary()
    # NOTE(review): fixed at 1000 bins; a hit count of 1000 or more would
    # raise IndexError below.
    hitHist = [0 for i in range(1000)]
    for itm in reads(opts['<input>'], K=K, paired=True, reads=True, kmers=False, both=True, verbose=verbose):
        rn += 1
        (lhsFwd, lhsRev) = kmersWithPosLists(K, itm.reads[0][1])
        (rhsFwd, rhsRev) = kmersWithPosLists(K, itm.reads[1][1])
        # Evaluate both orientations of the pair.
        (hits0, hitCount0) = recHits(idx, lhsFwd + rhsRev)
        (hits1, hitCount1) = recHits(idx, lhsRev + rhsFwd)
        if len(hits0) > 0:
            k = tuple(sorted(hits0.keys()))
            v = sum(hits0.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount0)
            hitHist[hitCount0] += 1
        if len(hits1) > 0:
            k = tuple(sorted(hits1.keys()))
            v = sum(hits1.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount1)
            hitHist[hitCount1] += 1
    if verbose:
        print >> sys.stderr, 'total read hits: %d' % (len(hitStats), )
        print >> sys.stderr, 'total hits per read: %g (%g)' % (hitStats.mean(), hitStats.sd())
        print >> sys.stderr, 'total reads: %d' % (rn, )
        for i in range(len(hitHist)):
            if hitHist[i] > 0:
                print >> sys.stderr, '\t%d\t%d' % (i, hitHist[i])

    def gex(s):
        # Render a tuple of ref indices as "gene/exon|gene/exon|...".
        r = []
        for n in s:
            itm = ref[n]
            r.append('%s/%s' % (itm['gene'], itm['exon']))
        return '|'.join(r)

    def fmtKey(k):
        # Returns (number of elements in the key, set of genes they touch,
        # the elements rendered with gex, sorted and joined by '--').
        nex = len(k)
        gx = set([])
        kStrParts = []
        for s in k:
            kStrParts.append(gex(s))
            gx |= set([ref[i]['gene'] for i in s])
        kStr = '--'.join(sorted(kStrParts))
        return (nex, gx, kStr)

    # Aggregate read and hit totals per gene group.
    gxCounts = {}
    for k in acc.keys():
        gx = set([])
        ex = set([])
        for s in k:
            gx |= set([ref[i]['gene'] for i in s])
            ex |= set(s)
        gx = tuple(sorted(gx))
        if gx not in gxCounts:
            gxCounts[gx] = [0, 0]
        gxCounts[gx][0] += acc[k][0]
        gxCounts[gx][1] += acc[k][1]
    hdr = ['numReads', 'numKmers', 'kmersPerRead']
    hdr += ['ggNumReads', 'ggNumKmers', 'ggKmersPerRead']
    hdr += ['numExons', 'numGenes', 'geneGroup', 'exonGroup']
    print '\t'.join(hdr)
    for k in acc.keys():
        (nex, gx, kStr) = fmtKey(k)
        gx = tuple(sorted(gx))
        if len(gx) < minGeneCount or len(gx) > maxGeneCount:
            continue
        # NOTE(review): `ex` is not recomputed in this loop; it still holds
        # the exon set of the last key seen by the gxCounts loop above, so
        # this filter does not depend on the current key.  It probably
        # should use `nex` or a per-key exon set -- confirm intent.
        if len(ex) < minExonCount or len(ex) > maxExonCount:
            continue
        if gxCounts[gx][0] < minGeneReads:
            continue
        if acc[k][0] < minExonReads:
            continue
        gxRate = float(gxCounts[gx][1]) / float(gxCounts[gx][0])
        if gxRate < minGeneRate:
            continue
        exRate = float(acc[k][1]) / float(acc[k][0])
        if exRate < minExonRate:
            continue
        gxStr = ':'.join(gx)
        print '%d\t%d\t%g\t%d\t%d\t%g\t%d\t%d\t%s\t%s' % (
            acc[k][0], acc[k][1], exRate, gxCounts[gx][0], gxCounts[gx][1],
            gxRate, nex, len(gx), gxStr, kStr)