def test_fixed_path_kmers_1():
    """Run fixed_path_kmers over k-mer counts gathered from noisy copies of a sequence.

    100 copies of the reference are generated with a 1% per-base substitution
    rate (seeded RNG), their k-mers are tallied, and the result is required to
    contain each reference k-mer at its position with no alternative at that
    position having a strictly higher count.
    """
    random.seed(17)
    K = 25
    D = 3
    numCopies = 100
    errRate = 0.01
    alts = {'A':['C','G','T'], 'C':['A','G','T'], 'G':['A','C','T'], 'T':['A','C','G']}
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    refKmers = kmersList(K, seq)

    counts = {}
    for _ in range(numCopies):
        noisy = []
        for base in seq:
            # One random() draw per base, and a choice() draw only on a
            # substitution, so the random stream is consumed in a fixed order.
            if random.random() < errRate:
                base = random.choice(alts[base])
            noisy.append(base)
        for y in kmersList(K, ''.join(noisy)):
            counts[y] = counts.get(y, 0) + 1

    Y = fixed_path_kmers(K, D, counts, refKmers)
    assert Y is not None
    assert len(Y) == len(refKmers)
    for (ref, candidates) in zip(refKmers, Y):
        assert ref in candidates
        better = [(render(K, y), counts[y])
                  for y in candidates
                  if y != ref and counts[y] > counts[ref]]
        assert len(better) == 0
def test_junction_kmers():
    """junction_kmers between the first and last k-mer must reproduce the full k-mer list."""
    K = 25
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    xs = kmersList(K, seq)
    # The 50-base sequence is expected to give K+1 k-mers.
    assert len(xs) == K+1
    first = xs[0]
    last = xs[-1]
    ys = junction_kmers(K, first, last)
    assert xs == ys
def addReadAndKmers(self, rd):
    """Record a single read against every bait it shares a k-mer with.

    rd[1] is passed to kmersList as the sequence. For each bait index hit,
    the read counter is bumped; the read is added to capReads when
    self.reads is set, and the read's k-mers are tallied into capKmers when
    self.kmers is set. Reads hitting no bait are ignored.
    """
    readKmers = kmersList(self.K, rd[1], True)

    hitList = []
    for x in readKmers:
        if x in self.baits:
            hitList += self.baits[x]
    if not hitList:
        return
    hits = set(hitList)

    for n in hits:
        self.capReadCounts[n] += 1

    if self.reads:
        for n in hits:
            # In-place += extends the stored list with the items of rd.
            self.capReads[n][0] += rd

    if self.kmers:
        for n in hits:
            tally = self.capKmers[n]
            for x in readKmers:
                tally[x] = tally.get(x, 0) + 1
def findFairlySimpleAllele(K, D, lhs, mid, rhs, mx):
    """Yield at most one allele result built from per-position consensus k-mers.

    The allele is assembled with consAllele from the flanks and mid, its
    k-mers are passed to fixed_path_kmers together with D and the counts mx,
    and consensus_kmer picks one k-mer (with its coverage) per position.
    Nothing is yielded when fixed_path_kmers returns None.
    """
    flank = K - 1
    xs = kmersList(K, consAllele(lhs, mid, rhs, flank, flank))
    Y = fixed_path_kmers(K, D, mx, xs)
    if Y is None:
        return

    picks = [consensus_kmer(K, Y[i], mx) for i in range(len(xs))]
    ancPath = [y for (y, c) in picks]
    cov = [c for (y, c) in picks]

    yield {
        'ancPath': ancPath,
        'allele': renderPath(K, ancPath),
        'covMin': min(cov),
        'path': ancPath,
        'lhsPos': 0,
        'rhsPos': 0,
    }
def score(self, res, lhs, seq, rhs):
    """Annotate res in place with Hamming-based scores for its k-mer path.

    Adds the keys 'hammingProfile', 'hammingCdf', 'ksDist', 'hamming' and
    'binom' to res. When seq is None the expected allele is a run of 'N'
    as long as res['allele'] and every path k-mer is counted at distance 0;
    otherwise path k-mers are compared pairwise with the k-mers of the
    expected allele.
    """
    K = self.K
    path = res['path']
    profile = [0] * (K + 1)

    if seq is None:
        filler = len(res['allele']) * 'N'
        allele = consAllele(lhs, filler, rhs, (K - 1) + res['lhsPos'], (K - 1) + res['rhsPos'])
        profile[0] += len(path)
    else:
        allele = consAllele(lhs, seq, rhs, (K - 1) + res['lhsPos'], (K - 1) + res['rhsPos'])
        expected = kmersList(K, allele)
        distances = [ham(x, y) for (x, y) in zip(path, expected)]
        for d in distances:
            profile[d] += 1

    cdf = counts2cdf(profile)
    res['hammingProfile'] = profile
    res['hammingCdf'] = cdf
    res['ksDist'] = ksDistance2(cdf, self.nullCdf)[0]
    res['hamming'] = hamming(allele, renderPath(K, path))

    # Product over distances of binModel[d] ** count[d]; 'binom' is its complement.
    prod = 1.0
    for (d, cnt) in enumerate(profile):
        prod *= math.pow(self.binModel[d], cnt)
    res['binom'] = 1.0 - prod
def test_fixed_path_kmers_2():
    """fixed_path_kmers must map each k-mer of seq0 onto the k-mer of seq1 at the same position.

    Only the k-mers of seq1 (which differs from seq0 at two bases) are
    present in the count table, each with count 1.
    """
    random.seed(17)
    K = 25
    D = 3
    seq0 = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    seq1 = 'TACTTGCACTGGGAGGCCCAGCGGCTTTTCAGTGTCACAGGTATTAGGAG'
    xs = kmersList(K, seq0)
    ys = kmersList(K, seq1)
    X = {y: 1 for y in ys}

    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for (i, candidates) in enumerate(Y):
        assert len(candidates) == 1
        assert ys[i] in candidates
def test_interpolate0Exacxt():
    """interpolate from a k-mer to itself with length 1 must return just that k-mer."""
    K = 25
    seq = "GGAGTTTCCAAGAGAAAATTTAGAGTTTGGGAAGGTACTAGGATCAGGTGCTTTTGGAAAAGTGATGAAC"
    x = kmersList(K, seq, False)[0]
    known = {x: 1}
    p = interpolate(K, known, x, x, 1)
    assert p is not None
    assert len(p) == 1
    assert p[0] == x
def main(argv):
    """Route reads to per-bait ReadCache objects according to shared k-mers.

    Bait sequences are read from <sequences>; every k-mer (both strands
    requested from kmersList) is indexed to the set of baits containing it.
    Each item from reads() is then added to the cache of every bait that
    shares a k-mer with it, and all caches are ended at the close.
    """
    opts = docopt.docopt(__doc__, argv)
    K = int(opts['-k'])
    B = int(opts['-b'])
    paired = opts['-p']
    verbose = opts['-v']
    Z = opts['-z']

    names = []
    seqs = []
    baits = {}
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs.append(seq)
            for x in kmersList(K, seq, True):
                baits.setdefault(x, set()).add(n)

    caches = [ReadCache(opts['-P'], nm, paired, B, Z) for nm in names]

    nr = 0
    nh = 0
    # NOTE(review): unlike the other reads() call sites, no K= is passed
    # here -- confirm reads() has a suitable default k-mer length.
    for itm in reads(opts['<input>'], reads=True, kmers=True, fwdOnly=True, paired=paired, verbose=verbose):
        nr += 1
        hits = set()
        for fwd in itm.kmers:
            for x in fwd:
                if x in baits:
                    hits |= baits[x]
        for n in hits:
            caches[n].add(itm.reads)
        if hits:
            nh += 1

    for cache in caches:
        cache.end()
def findSimpleAllele(K, lhs, mid, rhs, mx):
    """Return a result dict for the allele lhs+mid+rhs taken exactly as given.

    'path' holds the k-mers of the allele built by consAllele with K-1 on
    each side, and 'covMin' is the smallest count in mx over those k-mers
    (k-mers absent from mx count as 0).
    """
    flank = K - 1
    path = kmersList(K, consAllele(lhs, mid, rhs, flank, flank))
    return {
        'allele': mid,
        'covMin': min(mx.get(x, 0) for x in path),
        'path': path,
        'lhsPos': 0,
        'rhsPos': 0,
    }
def test_interpolate3Exacxt():
    """interpolate across a whole sequence must reproduce its k-mer list in order.

    The length argument is the number of k-mers on the expected path, in
    line with the sibling tests, which pass 1 for a one-k-mer path and 2 for
    a two-k-mer path. The previous version passed slice(len(ks)+1), a slice
    object rather than an integer length.
    """
    K = 25
    seq = "GGAGTTTCCAAGAGAAAATTTAGAGTTTGGGAAGGTACTAGGATCAGGTGCTTTTGGAAAAGTGATGAAC"
    ks = kmersList(K, seq, False)
    x = ks[0]
    y = ks[-1]
    xs = {z: 1 for z in ks}
    p = interpolate(K, xs, x, y, len(ks))
    assert p is not None
    assert len(p) == len(ks)
    for (expected, got) in zip(ks, p):
        assert expected == got
def test_interpolate1Exacxt():
    """interpolate between two adjacent k-mers with length 2 must return exactly that pair."""
    K = 25
    seq = "GGAGTTTCCAAGAGAAAATTTAGAGTTTGGGAAGGTACTAGGATCAGGTGCTTTTGGAAAAGTGATGAAC"
    ks = kmersList(K, seq, False)[:2]
    (x, y) = (ks[0], ks[1])
    known = {z: 1 for z in ks}
    p = interpolate(K, known, x, y, 2)
    assert p is not None
    assert len(p) == 2
    assert p[0] == x
    assert p[1] == y
def test_fixed_path_kmers_0():
    """With only the reference k-mers available, each position must resolve to exactly its own k-mer."""
    K = 25
    D = 3
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    xs = kmersList(K, seq)
    X = dict.fromkeys(xs, 1)

    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for (expected, candidates) in zip(xs, Y):
        assert len(candidates) == 1
        assert candidates[0] == expected
def parseFiles(K, paired, fns, verbose):
    """Yield one list of k-mers per read (unpaired) or per read pair (paired).

    K       -- k-mer length passed to kmersList.
    paired  -- when false, every file in fns is read on its own; when true,
               files are taken two at a time via pairs(fns).
    fns     -- FASTQ file names.
    verbose -- when true, a progress line is written to stderr every
               2**18 reads (or read pairs).

    For a pair, the yielded list is the k-mers of the first read followed by
    the reverse complements (rc) of the k-mers of the second read.
    """
    M = (1 << 18) - 1
    rn = 0
    if not paired:
        for fn in fns:
            with openFile(fn) as f:
                # The previous version never iterated the file and used an
                # unbound fq1 here; walk the FASTQ records explicitly.
                for fq1 in readFastq(f):
                    rn += 1
                    if verbose and (rn & M) == 0:
                        print >> sys.stderr, 'reads processed: %d' % (rn,)
                    xs = kmersList(K, fq1[1], False)
                    yield xs
        return
    for (fn1, fn2) in pairs(fns):
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                rn += 1
                if verbose and (rn & M) == 0:
                    print >> sys.stderr, 'read pairs processed: %d' % (rn,)
                xs = kmersList(K, fq1[1], False) + [rc(K, x) for x in kmersList(K, fq2[1], False)]
                yield xs
def addBait(self, nm, seq, bothStrands=True):
    """Register a bait sequence and return its index.

    The bait's k-mers (from kmersList with the bothStrands flag) are indexed
    to the bait number, and empty per-bait slots are appended for the read
    count, the captured reads and the captured k-mer tallies.
    """
    n = len(self.names)
    self.names.append(nm)
    self.nameIdx[nm] = n

    for x in kmersList(self.K, seq, bothStrands):
        owners = self.baits.get(x)
        if owners is None:
            self.baits[x] = [n]
        elif owners[-1] != n:
            # Only the last entry is checked, which avoids repeating this
            # bait's index when a k-mer recurs within the same bait.
            owners.append(n)

    self.capReadCounts.append(0)
    self.capReads.append([[], []])
    self.capKmers.append({})
    return n
def addReadPairAndKmers(self, lhs, rhs):
    """Record a read pair against every bait either mate shares a k-mer with.

    lhs[1] and rhs[1] are passed to kmersList as the sequences. For each bait
    hit, the read counter is bumped once per pair; the mates are added to the
    two capReads slots when self.reads is set, and the k-mers of both mates
    are tallied into capKmers when self.kmers is set.
    """
    xs = kmersList(self.K, lhs[1], True)
    ys = kmersList(self.K, rhs[1], True)
    mates = (xs, ys)

    hitList = []
    for kms in mates:
        for km in kms:
            if km in self.baits:
                hitList += self.baits[km]
    if not hitList:
        return
    hits = set(hitList)

    for n in hits:
        self.capReadCounts[n] += 1

    if self.reads:
        for n in hits:
            # In-place += extends each stored list with the items of the mate.
            self.capReads[n][0] += lhs
            self.capReads[n][1] += rhs

    if self.kmers:
        for n in hits:
            tally = self.capKmers[n]
            for kms in mates:
                for km in kms:
                    tally[km] = tally.get(km, 0) + 1
def addRead(self, rd):
    """Record a single read (without k-mer tallies) against every bait it hits.

    Asserts that read capture (self.reads) is enabled. rd[1] is passed to
    kmersList as the sequence.
    """
    assert self.reads

    hitList = []
    for x in kmersList(self.K, rd[1], True):
        if x in self.baits:
            hitList += self.baits[x]
    if not hitList:
        return

    for n in set(hitList):
        self.capReadCounts[n] += 1
        # In-place += extends the stored list with the items of rd.
        self.capReads[n][0] += rd
def __init__(self, K, refList):
    """Build a two-level k-mer index over the sequences in refList.

    Each item of refList is indexed by its 'seq' entry. idxUpper maps every
    k-mer to a signature (sig) of the sorted tuple of item indices containing
    it; idxLower maps each signature back to that tuple. exonLengths[i]
    counts the k-mers contributed by item i.
    """
    self.K = K
    self.exonLengths = [0] * len(refList)

    members = {}
    for (i, itm) in enumerate(refList):
        for x in kmersList(self.K, itm['seq'], False):
            members.setdefault(x, set()).add(i)
            self.exonLengths[i] += 1

    self.idxUpper = {}
    self.idxLower = {}
    for x in members:
        k = tuple(sorted(members[x]))
        h = sig(k)
        # The first tuple seen for a signature is kept.
        if h not in self.idxLower:
            self.idxLower[h] = k
        self.idxUpper[x] = h
def main(argv):
    """Summarise sub-sampled k-mer counts per contig and per file as YAML on stdout.

    For every FASTA file in <input>, the k-mers of each contig that pass
    sub(S, P, x) are counted; summarize() is applied to each contig's counts
    and to the counts accumulated over the whole file.
    """
    opts = docopt.docopt(__doc__, argv)
    K = int(opts['-k'])
    C = int(opts['-c'])
    Q = int(opts['-q'])
    S = int(opts['-S'])
    P = float(opts['-p'])
    verbose = opts['-v']
    # -s turns off the both-strands flag passed to kmersList.
    both = not opts['-s']

    res = []
    for fn in opts['<input>']:
        contigs = []
        glob = {}
        with openFile(fn) as f:
            for (nm, seq) in readFasta(f):
                scaff = {}
                for x in kmersList(K, seq, both):
                    if sub(S, P, x):
                        scaff[x] = scaff.get(x, 0) + 1
                summary = summarize(scaff, C, Q)
                summary['name'] = nm
                contigs.append(summary)
                for (x, c) in scaff.items():
                    glob[x] = glob.get(x, 0) + c
        fres = {}
        fres['file'] = fn
        fres['contigs'] = contigs
        fres['global'] = summarize(glob, C, Q)
        res.append(fres)

    yaml.safe_dump(res, sys.stdout)
def main(argv):
    """Print tab-separated pairs of counts for k-mers and their reverse complements.

    Two modes, selected by -r:

    * With -r, reference k-mers are loaded (excluding any whose reverse
      complement is also a reference k-mer); each read's k-mer list is
      assigned at random to the forward or reverse table, weighted by how
      many of its k-mers (or their reverse complements) are reference
      k-mers, and forward/reverse counts are printed per k-mer.
    * Without -r, k-mers are sub-sampled by hashing the smaller of the k-mer
      and its reverse complement, and one line is printed per sampled pair.
    """
    opts = docopt.docopt(__doc__, argv)
    K = int(opts['-k'])
    # Mask covering 2*K bits.
    M = (1 << (2*K)) - 1
    paired = True
    if opts['-s']:
        paired = False
    p = float(opts['-p'])
    # Hash threshold: a k-mer pair is sampled when (hash & M) <= T.
    T = int(M * p)
    if opts['-r']:
        refs = []
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                refs += kmersList(K, seq, False)
        refs = set(refs)
        # Drop reference k-mers whose reverse complement is also in the
        # reference set.
        kill = set([])
        for x in refs:
            y = rc(K, x)
            if y in refs:
                kill.add(x)
                kill.add(y)
        print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs))
        refs -= set(kill)
        fwd = {}
        rev = {}
        for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
            # fn: k-mers of the read found in refs as given;
            # rn: k-mers found in refs after reverse complementing.
            fn = 0
            for x in xs:
                if x in refs:
                    fn += 1
            ys = [rc(K, x) for x in xs]
            rn = 0
            for y in ys:
                if y in refs:
                    rn += 1
            if fn + rn == 0:
                continue
            # Assign the read to the forward table with probability
            # fn / (fn + rn), otherwise to the reverse table.
            q = float(fn) / float(fn + rn)
            if random.random() < q:
                for x in xs:
                    fwd[x] = 1 + fwd.get(x, 0)
            else:
                for y in ys:
                    rev[y] = 1 + rev.get(y, 0)
        for (x,xc) in fwd.iteritems():
            y = rc(K, x)
            yc = 0
            if y in rev:
                yc = rev[y]
                # Remove so the entry is not printed again by the loop below.
                del rev[y]
            print '%d\t%d' % (xc, yc)
        for (y,yc) in rev.iteritems():
            print '%d\t%d' % (0, yc)
        return
    kx = {}
    for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
        for x in xs:
            if x in kx:
                kx[x] += 1
                continue
            y = rc(K, x)
            # Hash the smaller of x and rc(x) so both strands of a k-mer get
            # the same sampling decision.
            z = murmer(min(x, y), 17)
            if (z & M) > T:
                continue
            kx[x] = 1
    for x in kx.keys():
        y = rc(K, x)
        # Each pair is reported from its smaller member only.
        # NOTE(review): a k-mer whose smaller reverse complement was never
        # observed is skipped here and so never printed -- confirm intended.
        if x > y:
            continue
        xc = kx[x]
        yc = kx.get(y, 0)
        # Column order is decided by comparing the hashes of the two members.
        if murmer(x, 17) >= murmer(y, 17):
            (a, b) = (x, y)
            (ac, bc) = (xc, yc)
        else:
            (a, b) = (y, x)
            (ac, bc) = (yc, xc)
        #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc)
        print '%d\t%d' % (ac, bc)
def main(argv):
    """Count k-mers from the input reads and write them to <output>.

    K-mers are accumulated in a KmerAccumulator2; when its reported memory
    reaches half of the budget (-m, in MiB, default 32) the contents are
    written as a numbered entry into a temporary casket file. The spilled
    entries are merged (or the in-memory buffer is written directly) into the
    output, along with metadata: K, count histogram, base composition and the
    number of reads.

    Optional filters: -D/-S sub-sample individual k-mers via sub(S, d, x);
    -C keeps only reads that contain a k-mer from the given FASTA file.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']
    K = int(opts['<k>'])
    out = opts['<output>']
    # Memory budget in bytes; -m gives it in MiB.
    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])
    buf = KmerAccumulator2(K)
    n = 0
    tmps = []
    # Tally of (x & 3) over all k-mers seen, i.e. of the low two bits.
    acgt = [0, 0, 0, 0]
    m = 0
    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])
    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])
    # Memoised accept/reject decisions of sub(); both are reset once they
    # grow past a million entries.
    cacheYes = set([])
    cacheNo = set([])
    B = opts['-C']
    if B is not None:
        # Replace the file name with the set of k-mers it contains.
        # NOTE(review): the handle returned by openFile(B) is never closed.
        xs = set([])
        for (nm, seq) in readFasta(openFile(B)):
            xs |= set(kmersList(K, seq, True))
        B = xs
    tmpnm = tmpfile('.pmc')
    with casket(tmpnm, 'w') as z:
        nr = 0
        for itm in reads(opts['<input>'], K=K, pairs=False, reads=False, kmers=True, both=True, verbose=verbose):
            xs = itm.kmers[0]
            for x in xs:
                acgt[x & 3] += 1
            if d is not None:
                # Per-k-mer sub-sampling.
                for x in xs:
                    if x in cacheNo:
                        continue
                    if x not in cacheYes:
                        if not sub(S, d, x):
                            cacheNo.add(x)
                            continue
                        cacheYes.add(x)
                    buf.add(x)
                    m += 1
                    n += 1
                if len(cacheYes) > 1000000:
                    cacheYes = set([])
                if len(cacheNo) > 1000000:
                    cacheNo = set([])
            elif B is not None:
                # Whole-read filter: keep the read if any k-mer is in B.
                found = False
                for x in xs:
                    if x in B:
                        found = True
                        break
                if found:
                    buf.addList(xs)
                    for x in xs:
                        m += 1
                        n += 1
            else:
                buf.addList(xs)
                for x in xs:
                    m += 1
                    n += 1
            nr += 1
            # The memory check is only made every 1024 reads.
            if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                buf.clear()
                n = 0
        # If anything was spilled, spill the remainder too so that the merge
        # below sees every k-mer.
        if len(tmps) and len(buf):
            fn = 'tmps-%d' % (len(tmps), )
            tmps.append(fn)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
            buf = []
    with zotk.kmers(out, 'w') as z:
        h = {}
        if len(tmps) == 0:
            # Nothing spilled: build the histogram and write straight from
            # the in-memory buffer.
            for c in buf.countsOnly():
                h[c] = 1 + h.get(c, 0)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
        elif len(tmps) == 1:
            # NOTE(review): h is not filled in on this path, so the 'hist'
            # metadata is written empty -- confirm this is intended.
            with casket(tmpnm, 'r') as z0:
                writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
        else:
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, z)
        # NOTE(review): divides by zero when no k-mers were seen.
        n = float(sum(acgt))
        acgt = [c / n for c in acgt]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = acgt
        z.meta['reads'] = nr
    os.remove(tmpnm)
def next(self):
    """Return the next Reads item, advancing through the input files N at a time.

    Files are opened in groups of self.N and read in lock-step, one record
    from each parser per call. When a group is exhausted, the next group is
    opened. The returned Reads object carries .reads and/or .kmers depending
    on self.reads and self.kmers.

    Raises StopIteration when fewer than self.N files remain.
    """
    self.readNum += 1
    # Progress is reported in batches: only when the low bits of readNum
    # (masked by self.M) wrap to zero.
    if (self.readNum & self.M) == 0 and self.progress is not None:
        self.progress.update(self.M)
    while True:
        if self.currParsers is None:
            # No open group of files: move to the first/next group.
            if self.currFilesInd is None:
                self.currFilesInd = 0
            else:
                self.currFilesInd += self.N
            if self.progress is not None:
                self.progress.update(self.readNum & self.M)
            if self.currFilesInd + (self.N - 1) >= len(self.files):
                raise StopIteration
            if self.verbose:
                pfx = ' & '.join([ basename(self.files[i]) for i in range(self.currFilesInd, self.currFilesInd + self.N) ])
                self.progress = tqdm(unit=' reads', unit_scale=True)
                self.progress.set_postfix(reading=pfx, refresh=True)
            self.currParsers = []
            for i in range(self.currFilesInd, self.currFilesInd + self.N):
                fn = self.files[i]
                # NOTE(review): the handle is not closed explicitly anywhere
                # in this method.
                f = openFile(fn)
                if isFasta(fn):
                    self.currParsers.append(readFasta(f))
                else:
                    self.currParsers.append(readFastq(f))
        self.currReads = []
        try:
            for p in self.currParsers:
                self.currReads.append(p.next())
        except StopIteration:
            # A parser ran dry. If an earlier parser in the group still
            # produced a record, the files differ in length.
            if len(self.currReads) != 0:
                print >> sys.stderr, 'warning: files had unequal length'
            self.currParsers = None
            if self.progress is not None:
                self.progress.close()
                self.progress = None
            # Loop round to open the next group of files.
            continue
        if self.kmers:
            self.currKmers = []
            for rd in self.currReads:
                # rd[1] is passed to the k-mer functions as the sequence.
                if self.fwdOnly:
                    self.currKmers.append(kmersList(self.K, rd[1], False))
                elif self.both:
                    self.currKmers.append(kmersList(self.K, rd[1], True))
                else:
                    assert self.separate
                    self.currKmers.append(kmersLists(self.K, rd[1]))
        res = Reads()
        if self.reads:
            res.reads = self.currReads
        if self.kmers:
            res.kmers = self.currKmers
        return res
def main(argv):
    """Build an HGVS variant index (-X) or genotype indexed variants against reads.

    With -X: parse the given variants, build an indexed record for each with
    makeIndexedVariant and dump the list as YAML to <index>. With -T as well,
    instead report (as YAML on stdout) how often the wild-type and mutant
    k-mers of each variant occur in the reference sequences.

    Without -X: load the index, use each variant's flanks plus wild-type and
    mutant sequence as a bait, capture k-mers from the paired input reads,
    search for wild-type and mutant alleles among the captured k-mers, score
    them, and write one tab-separated line per variant to the -o output.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']
    K = int(opts['-k'])
    D = int(opts['-D'])
    Q = int(opts['-C'])
    V = float(opts['-V'])
    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)
    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])
        variants = opts['<variant>']
        if opts['-f']:
            # Additional variants, whitespace-separated, from a file.
            with openFile(opts['-f']) as f:
                variants += f.read().split()
        # Group parsed variants by accession.
        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)
        # chk maps each wild-type/mutant k-mer to the set of
        # (allele kind, variant string) pairs it came from; only with -T.
        chk = None
        if opts['-T']:
            chk = {}
        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                    # NOTE(review): the original layout was lost; this check
                    # is assumed to be nested under "r is not None" because
                    # it indexes into r -- confirm.
                    if chk is not None:
                        xs = kmersList( K, ''.join([ r['lhsFlank'][-(K - 1):], r['wtSeq'], r['rhsFlank'][:K - 1] ]), True)
                        for x in xs:
                            if x not in chk:
                                chk[x] = set([])
                            chk[x].add(('wt', str(v)))
                        if r['mutSeq'] is None:
                            continue
                        xs = kmersList( K, ''.join([ r['lhsFlank'][-(K - 1):], r['mutSeq'], r['rhsFlank'][:K - 1] ]), True)
                        for x in xs:
                            if x not in chk:
                                chk[x] = set([])
                            chk[x].add(('mut', str(v)))
        if chk is not None:
            # Count occurrences of every checked k-mer across the sequences
            # named by refSeq2Hg19.
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            # res[variant][allele kind][combined count] = number of k-mers.
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                # Handle each k-mer / reverse-complement pair once.
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return
        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)
        return
    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']
    # Optional output columns requested via -F (comma separated).
    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))
    if verbose:
        print >> sys.stderr, "loading index."
    with open(opts['<index>']) as f:
        # NOTE(review): the index is written above with yaml.safe_dump;
        # consider whether yaml.safe_load would suffice for reading it.
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)
    NV = len(hgvsVars)
    combineStrands = True
    if opts['-s']:
        combineStrands = False
    cap = capture(K, reads=capt, kmers=True, verbose=verbose)
    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        # Bait = wild-type allele, and (if there is a mutant sequence) an 'N'
        # separator followed by the mutant allele.
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        # Bait indices must line up with positions in hgvsVars.
        assert n0 == n
    if verbose:
        print >> sys.stderr, "done."
    rn = 0
    for itm in reads(opts['<input>'], K=K, paired=True, reads=True, kmers=False, both=True, verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])
    if capt:
        cap.saveReads(zipname)
    scorer = Scorer(K)
    # Histogram of captured k-mer counts (>= Q) over all variants.
    globHist = {}
    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1
    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']
            mx = cap.capKmers[n]
            nr = cap.capReadCounts[n]
            if 'kmers' in fmt:
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)
            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']
            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []
            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)
            mutSeq = itm['mutSeq']
            mutZ = v.size()
            # Coverage quantiles over k-mers with count >= Q.
            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]
            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]
            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq, wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())
            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1
            # Default (no allele found) result, replaced by any candidate
            # that isBetter() prefers.
            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes
            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes
            if True:
                # Coverage statistics along the chosen wild-type path.
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]
                # Same for the chosen mutant path.
                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]
                # Denominator for the allele fractions: never below 1, nor
                # below the 90th-percentile coverage.
                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX
            hdrs = ['n']
            fmts = ['%d']
            outs = [n]
            # An allele is called when its minimum coverage exceeds Q, its
            # Hamming distance is under 4 and its fraction exceeds V.
            wtAllele = ((wtRes['covMin'] > Q) and (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and (mutRes['hamming'] < 4)) and (mutVaf > V)
            # Bit 0 = wild type called, bit 1 = mutant called.
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]
            hdrs += ['res']
            fmts += ['%s']
            outs += [res]
            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]
            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]
            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]
            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]
            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]
            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]
            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]
            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]
            # The header line is emitted once, before the first data line.
            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()
def main(argv): opts = docopt.docopt(__doc__, argv) K = 25 nms = [] idx = {} for (nm, seq) in readFasta(openFile(opts['<baits>'])): n = len(nms) nms.append(nm) for x in kmersList(K, seq, True): if x not in idx: idx[x] = set([]) idx[x].add(n) for x in idx.keys(): idx[x] = list(idx[x]) idx[x].sort() anti = set([]) if opts['-U']: with openFile(opts['-U']) as f: for (nm, seq) in readFasta(f): for x in kmersList(K, seq, True): anti.add(x) rn = 0 if opts['-p']: hist = {} for (fn1, fn2) in pairs(opts['<input>']): tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq')) for i in xrange(len(nms))] cache = [[[], []] for i in xrange(len(nms))] counts = [0 for i in xrange(len(nms))] with openFile(fn1) as f1, openFile(fn2) as f2: for fq1, fq2 in both(readFastq(f1), readFastq(f2)): hits = set([]) pushup = False for x in kmersList(K, fq1[1]): if x in anti: pushup = True break for i in idx.get(x, []): hits.add(i) for x in kmersList(K, fq2[1]): if x in anti: pushup = True break for i in idx.get(x, []): hits.add(i) if pushup: continue n = len(hits) hist[n] = 1 + hist.get(n, 0) for i in hits: counts[i] += 1 cache[i][0].append(fq1) cache[i][1].append(fq2) if len(cache[i][0]) >= 1024: with open(tmps[i][0], 'a') as f: for rd in cache[i][0]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] with open(tmps[i][1], 'a') as f: for rd in cache[i][1]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] cache[i][0] = [] cache[i][1] = [] for i in xrange(len(cache)): if len(cache[i][0]) > 0: with open(tmps[i][0], 'a') as f: for rd in cache[i][0]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] with open(tmps[i][1], 'a') as f: for rd in cache[i][1]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] cache[i][0] = [] cache[i][1] = [] with zipfile.ZipFile(opts['<output>'], 'w', zipfile.ZIP_DEFLATED) as z: for i in xrange(len(nms)): if counts[i] > 0: pth = '/'.join(nms[i].split()) z.write(tmps[i][0], pth + '/' + fn1) 
os.remove(tmps[i][0]) z.write(tmps[i][1], pth + '/' + fn2) os.remove(tmps[i][1]) hist = hist.items() hist.sort() for (n, f) in hist: print '%d\t%d' % (n, f) else: raise "not implemented"
def main(argv):
    """Look for tandem duplications in bait sequences using captured k-mers.

    Each bait k-mer (K must be even) is split into two halves of J = K/2
    bases. Half-to-half adjacency tables are built for the bait sequences
    and for the k-mers observed in reads that hit the bait; findDup and
    positions are then used to propose duplications, which are reported as
    tab-separated lines with an HGVS-style name.
    """
    opts = docopt.docopt(__doc__, argv)
    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return
    minCov = int(opts['-m'])
    verbose = opts['-v']
    # J bases per half; S is the bit shift that isolates the leading half
    # and Mj masks the trailing half.
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1
    names = []
    seqs = {}
    bait = {}
    # Per-bait tables: leading half -> trailing halves, and the converse.
    wtFst = []
    wtLst = []
    # Per-bait table: J-mer -> list of positions.
    posIdx = []
    rds = []
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)
                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))
                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)
                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)
            wtFst.append(wf)
            wtLst.append(wl)
            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)
            # Run the duplication search on the bait against itself and warn
            # about any hit, since it comes from the bait alone.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)
            rds.append([])
    N = len(names)
    L = None
    # X[n]: k-mer counts over all reads that hit bait n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)
        xs = itm.kmers[0]
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)
    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only k-mers seen at least 10 times.
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c
        seq = seqs[names[n]]
        rngs = []
        st = None
        en = None
        inside = False
        # Print a '.'/'X' map of which bait k-mers are / are not covered.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (last covered k-mer before a gap, first covered k-mer
        # after it) for every run of uncovered bait k-mers.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))
        pthr = Pather(K, xs)
        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): the original layout was lost; this continue is
        # assumed to sit at the per-bait loop level, which makes the rest of
        # the loop body below unreachable -- confirm.
        continue
        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj
            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)
            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)
        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                # The three K-mers assembled from the half k-mers a, b, c, d.
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]
                dd = pp[2] - pp[0]
                # Unless -a is given, only lengths divisible by 3 are kept.
                if not opts['-a'] and dd % 3 != 0:
                    continue
                if opts['-s']:
                    # Require a path of observed k-mers on each side and that
                    # the two paths agree once J k-mers are trimmed per end.
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)
                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue
                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue
                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]
                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue
                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                # Reported as 'vaf': junction coverage relative to the mean
                # coverage of the two outer k-mers.
                w = ccb / m
                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])
                vn += 1
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]
                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]
                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]
                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]
                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]
                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]
                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]
                # The header line is emitted once, before the first data line.
                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)