def prepareBedFileGeneTx(geneListFn, refGeneFn, outFn):
    """
    Write a BED file of exons for a chosen set of transcripts.

    `geneListFn` is read as whitespace-separated lines whose first column
    is a gene name and whose second column is a transcript identifier.
    `refGeneFn` is read as whitespace-separated lines where column 1 is the
    transcript, column 2 the chromosome, column 3 the strand, columns 9 and
    10 the comma-separated exon starts and ends, and column 12 the gene
    name (all 0-based column numbers).

    For each refGene row whose transcript is in the gene list, one line per
    exon is written to `outFn`:

        chrom <TAB> start <TAB> end <TAB> gene/NN <TAB> strand

    where NN is the two-digit exon number, counted from 1 in transcript
    order (i.e. the exon list is reversed for '-' strand transcripts).

    Transcripts from the gene list that never appear in `refGeneFn` are
    reported on stderr.
    """
    # transcript -> gene name, as given in the gene list.
    transcripts = {}
    with openFile(geneListFn) as f:
        for l in f:
            t = l.split()
            transcripts[t[1]] = t[0]

    # transcript -> gene name, for the transcripts seen in refGene.
    found = {}
    with openFile(refGeneFn) as f, openFile(outFn, 'w') as out:
        for l in f:
            t = l.split()
            tx = t[1]
            ch = t[2]
            st = t[3]
            g = t[12]
            if tx not in transcripts:
                continue
            found[tx] = g
            # The start/end columns carry a trailing comma, hence the
            # filter on empty fragments.
            ss = [int(s) for s in t[9].split(',') if len(s) > 0]
            ee = [int(e) for e in t[10].split(',') if len(e) > 0]
            # Materialise the pairs so they can be reversed regardless of
            # whether zip() returns a list or an iterator.
            exs = list(zip(ss, ee))
            if st == '-':
                exs.reverse()
            for (j, (s, e)) in enumerate(exs, 1):
                print >> out, '%s\t%d\t%d\t%s/%02d\t%s' % (ch, s, e, g, j, st)

    for tx in sorted(set(transcripts.keys()) - set(found.keys())):
        print >> sys.stderr, 'transcript not found: %s\t(%s)' % (
            tx, transcripts[tx])
def download(self, acc, path):
    """
    Download the record for accession `acc` and store it at `path`.

    The request goes to `self.base` with the query parameters
    db=nuccore, rettype=`self.rettype`, retmode=text and id=`acc`.
    HTTP error statuses are raised by `raise_for_status()`.

    Responses that declare a Content-Length of at most 64 MiB are written
    in a single call; anything else (larger, or of undeclared length) is
    streamed to the file a chunk at a time.
    """
    print >> sys.stderr, 'downloading %s -> %s' % (acc, path)

    qry = {}
    qry['db'] = 'nuccore'
    qry['rettype'] = self.rettype
    qry['retmode'] = 'text'
    qry['id'] = acc

    # Max size to slurp.
    # (This used to read `64 * 1024 * 0124`; `0124` is an octal literal
    # equal to 84, which made the limit ~5.25 MiB instead of 64 MiB.)
    MB64 = 64 * 1024 * 1024

    with requests.get(self.base, params=qry, stream=True) as r:
        # deal with HTTP errors
        r.raise_for_status()

        # write "small" files in one go.
        if 'Content-Length' in r.headers and int(
                r.headers['Content-Length']) <= MB64:
            with openFile(path, 'w') as f:
                f.write(r.content)
            return

        # grab "big" files a piece at a time.
        with openFile(path, 'w') as f:
            for chk in r.iter_content(chunk_size=None):
                f.write(chk)
def indexBedFiles(bedFn, sf):
    """
    Generate one record per exon listed in the BED file `bedFn`.

    Each BED line supplies chromosome, start, end, a `gene/exon` label and
    a strand.  Chromosomes are visited in sorted order and, within each
    chromosome, the exons in sorted tuple order.  The exon sequence is the
    upper-cased slice `sf[chrom][start:end]`, passed through `revComp` for
    '-' strand exons.

    Yields dicts with the keys 'gene', 'exon', 'chr', 'st', 'en',
    'strand' and 'seq'.
    """
    global verbose

    # chromosome -> list of (start, end, strand, gene, exon)
    exonsByChrom = {}
    with openFile(bedFn) as bed:
        for line in bed:
            fields = line.split()
            chrom = fields[0]
            begin = int(fields[1])
            end = int(fields[2])
            label = fields[3]
            strand = fields[4]
            if chrom not in exonsByChrom:
                exonsByChrom[chrom] = []
            (gene, exon) = label.split('/')
            exonsByChrom[chrom].append((begin, end, strand, gene, exon))

    for chrom in sorted(exonsByChrom.keys()):
        if verbose:
            print >> sys.stderr, 'processing %s' % (chrom, )
        chromSeq = sf[chrom]
        exons = exonsByChrom[chrom]
        exons.sort()
        for (begin, end, strand, gene, exon) in exons:
            exonSeq = chromSeq[begin:end].upper()
            if strand == '-':
                exonSeq = revComp(exonSeq)
            rec = {}
            rec['gene'] = gene
            rec['exon'] = exon
            rec['chr'] = chrom
            rec['st'] = begin
            rec['en'] = end
            rec['strand'] = strand
            rec['seq'] = exonSeq
            yield rec
def buildIndex(K, inputs, output):
    """
    Create a new k-mer index.

    Every FASTA file named in `inputs` is read, and for each sequence the
    `K`-length k-mers (both strands) are extracted, sorted and
    de-duplicated.  The union of all k-mers is stored together with a
    postings table mapping each k-mer to the numbers (counting from 0) of
    the sequences that contain it.  The sequence names are stored under
    the `names` metadata key so a sequence number can be turned back into
    a name.

    The index is written to the container named by `output`.
    """
    records = []
    for path in inputs:
        with openFile(path) as fh:
            records.extend(readFasta(fh))

    allKmers = []
    names = []
    sizes = array.array('I')
    for (seqNum, (name, seq)) in enumerate(records):
        names.append(name)
        own = sorted(kmers(K, seq, True))
        uniq(own)
        # Replace the raw sequence with its k-mer list.
        records[seqNum] = [name, own]
        sizes.append(len(own))
        allKmers.extend(own)
    allKmers.sort()
    uniq(allKmers)
    kmerSet = sparse(2 * K, allKmers)

    # First pass: count postings per k-mer rank.
    offsets = array.array('I', [0]) * (kmerSet.count() + 1)
    for (_name, own) in records:
        for x in own:
            offsets[kmerSet.rank(x)] += 1

    # Turn the counts into starting offsets (exclusive prefix sums).
    total = 0
    for j in xrange(len(offsets)):
        cnt = offsets[j]
        offsets[j] = total
        total += cnt

    # Second pass: fill in the postings using a moving cursor per rank.
    cursor = list(offsets)
    postings = array.array('H', [0]) * total
    for (seqNum, (_name, own)) in enumerate(records):
        for x in own:
            r = kmerSet.rank(x)
            postings[cursor[r]] = seqNum
            cursor[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, kmerSet.xs, z)
        z.meta['T'] = write32(z, offsets, 'offsets')
        z.meta['U'] = write16(z, postings, 'postings')
        z.meta['lens'] = write32(z, sizes, 'lens')
        z.meta['names'] = names
def __getitem__(self, acc):
    """
    Return an open file for accession `acc`, downloading it first if the
    local copy does not exist yet.
    """
    target = self.makePath(acc, self.compression)
    if os.path.isfile(target):
        return openFile(target)

    # Not present locally: make sure the directory exists, then fetch.
    (directory, _leaf) = self.makePathComponents(acc, self.compression)
    if not os.path.isdir(directory):
        os.makedirs(directory)
    self.download(acc, target)
    return openFile(target)
def main(argv):
    """
    Capture the reads that share a k-mer with any of the bait sequences.

    The bait sequences come from the FASTA file `<sequences>`; every read
    from `<input>` is added to the read cache of each bait it shares at
    least one k-mer with.  One `ReadCache` is created per bait sequence.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    B = int(opts['-b'])
    paired = opts['-p']
    verbose = opts['-v']
    Z = opts['-z']

    # k-mer -> set of bait sequence numbers containing it.
    names = []
    seqs = []
    baits = {}
    with openFile(opts['<sequences>']) as fasta:
        for (name, seq) in readFasta(fasta):
            seqNum = len(names)
            names.append(name)
            seqs.append(seq)
            for x in kmersList(K, seq, True):
                if x not in baits:
                    baits[x] = set([])
                baits[x].add(seqNum)

    N = len(names)
    caches = [ReadCache(opts['-P'], name, paired, B, Z) for name in names]

    numReads = 0
    numHits = 0
    for itm in reads(opts['<input>'], reads=True, kmers=True, fwdOnly=True,
                     paired=paired, verbose=verbose):
        numReads += 1
        matched = set([])
        for fwd in itm.kmers:
            for x in fwd:
                if x in baits:
                    matched |= baits[x]
        for seqNum in matched:
            caches[seqNum].add(itm.reads)
        if len(matched) > 0:
            numHits += 1

    for cache in caches:
        cache.end()
def flush(self):
    """
    Append every buffered read to its output file and empty the buffers.

    For each of the `self.N` slots with pending reads, the first four
    fields of each read are written, one per line, to the file
    `self.names[slot]` opened in append mode.
    """
    for slot in range(self.N):
        pending = self.buffers[slot]
        if len(pending) > 0:
            with openFile(self.names[slot], 'a') as sink:
                for rec in pending:
                    for fld in (0, 1, 2, 3):
                        print >> sink, rec[fld]
            self.buffers[slot] = []
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    The accession is normalized with `normalizeAccession` and the first
    record of `<home>/<normalized>.fa` (or `.fa.gz` when the plain file
    does not exist) is loaded.  The most recently loaded sequence is
    cached, keyed by its normalized accession.
    """
    if acc != self.prevAcc:
        norm = normalizeAccession(acc)
        # The cache key is the *normalized* accession, so compare again
        # after normalizing; otherwise every lookup through an alias
        # would re-read the whole file.
        if norm != self.prevAcc:
            pth = self.home + '/' + norm + '.fa'
            if not os.path.exists(pth):
                pth = pth + '.gz'
            with openFile(pth) as f:
                # Only the first record of the file is used.
                for (nm, seq) in readFasta(f):
                    self.prevAcc = norm
                    self.prevSeq = seq
                    break
    return self.prevSeq
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    `acc` must be a key of `hgvs.refSeq2Hg19`; the mapped name selects the
    file `<home>/<name>.fa.gz`, whose first record is loaded.  The most
    recently loaded sequence is cached.
    """
    if acc == self.prevAcc:
        return self.prevSeq

    table = hgvs.refSeq2Hg19
    if acc not in table:
        print >> sys.stderr, "accession %s not available." % (acc)
    assert acc in table

    fastaName = self.home + "/" + table[acc] + ".fa.gz"
    with openFile(fastaName) as fasta:
        # Only the first record of the file is used.
        for (_name, sequence) in readFasta(fasta):
            self.prevAcc = acc
            self.prevSeq = sequence
            break
    return self.prevSeq
def parseFiles(K, paired, fns, verbose):
    """
    Generate the list of `K`-mers for each read (or read pair) in `fns`.

    Unpaired: every file in `fns` is read as FASTQ and the k-mers of each
    read's sequence are yielded.

    Paired: the files are taken two at a time (via `pairs`) and read in
    lock-step; for each pair, the k-mers of the first read are followed by
    the reverse complements (`rc`) of the k-mers of the second read.

    When `verbose` is true, a progress line is written to stderr every
    2**18 reads (or read pairs).
    """
    M = (1 << 18) - 1
    rn = 0
    if not paired:
        for fn in fns:
            with openFile(fn) as f:
                # The original body used `fq1` without ever iterating over
                # the file, which raised NameError on the first file.
                for fq1 in readFastq(f):
                    rn += 1
                    if verbose and (rn & M) == 0:
                        print >> sys.stderr, 'reads processed: %d' % (rn,)
                    xs = kmersList(K, fq1[1], False)
                    yield xs
        return

    for (fn1, fn2) in pairs(fns):
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                rn += 1
                if verbose and (rn & M) == 0:
                    print >> sys.stderr, 'read pairs processed: %d' % (rn,)
                xs = kmersList(K, fq1[1], False) + [
                    rc(K, x) for x in kmersList(K, fq2[1], False)
                ]
                yield xs
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    Accessions found in `refSeq2Hg19` are translated through it; any other
    accession is used as is.  The first record of `<home>/<name>.fa.gz` is
    loaded, and the most recently loaded sequence is cached.
    """
    if acc == self.prevAcc:
        return self.prevSeq

    fileStem = refSeq2Hg19[acc] if acc in refSeq2Hg19 else acc
    with openFile(self.home + "/" + fileStem + ".fa.gz") as fasta:
        # Only the first record of the file is used.
        for (_name, sequence) in readFasta(fasta):
            self.prevAcc = acc
            self.prevSeq = sequence
            break
    return self.prevSeq
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    Accessions found in `refSeq2Hg19` are translated through it; any other
    accession is used as is.  The sequence is the first record of
    `<home>/<name>.fa`, or of `<home>/<name>.fa.gz` when the uncompressed
    file does not exist.  The most recently loaded sequence is cached.
    """
    if acc == self.prevAcc:
        return self.prevSeq

    fileStem = refSeq2Hg19[acc] if acc in refSeq2Hg19 else acc
    fastaPath = self.home + '/' + fileStem + '.fa'
    if not os.path.exists(fastaPath):
        fastaPath += '.gz'

    with openFile(fastaPath) as fasta:
        # Only the first record of the file is used.
        for (_name, sequence) in readFasta(fasta):
            self.prevAcc = acc
            self.prevSeq = sequence
            break
    return self.prevSeq
def main(argv):
    """
    Summarize the k-mer content of each contig in the input FASTA files.

    For every input file, the (sub-sampled) k-mers of each contig are
    counted and summarized with `summarize`, and the per-contig counts are
    accumulated into a whole-file summary.  The combined result is dumped
    to stdout as YAML.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    C = int(opts['-c'])
    Q = int(opts['-q'])
    S = int(opts['-S'])
    P = float(opts['-p'])
    verbose = opts['-v']
    both = not opts['-s']

    results = []
    for fn in opts['<input>']:
        fileRes = {}
        fileRes['file'] = fn
        fileRes['contigs'] = []

        fileCounts = {}
        ncontig = 0
        with openFile(fn) as fasta:
            for (name, seq) in readFasta(fasta):
                ncontig += 1

                # Count the k-mers of this contig that pass sub-sampling.
                contigCounts = {}
                for x in kmersList(K, seq, both):
                    if sub(S, P, x):
                        contigCounts[x] = 1 + contigCounts.get(x, 0)

                contigSummary = summarize(contigCounts, C, Q)
                contigSummary['name'] = name
                fileRes['contigs'].append(contigSummary)

                for (x, cnt) in contigCounts.items():
                    fileCounts[x] = cnt + fileCounts.get(x, 0)

        fileRes['global'] = summarize(fileCounts, C, Q)
        results.append(fileRes)

    yaml.safe_dump(results, sys.stdout)
def main(argv):
    """
    Generate `-N` random variants inside the regions of a BED file.

    Regions are read from `<regions>`; each variant is produced by
    `genVar` for a region drawn with probability proportional to the
    region's length.  One line per variant is printed to stdout: the
    variant's string form, preceded (with `-T`) by accession, range, size,
    wild-type sequence and mutant sequence.
    """
    opts = docopt.docopt(__doc__, argv)

    # Directory handed to SequenceFactory; defaults to the current one.
    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    # -t is a comma separated list of `type` or `weight:type` items; a
    # bare type gets weight 1.0.
    T = []
    for t0 in opts['-t'].split(','):
        t1 = t0.split(':')
        if len(t1) == 1:
            T.append((1.0, t1[0]))
        elif len(t1) == 2:
            T.append((float(t1[0]), t1[1]))
        else:
            print >> sys.stderr, "unexpected variant category descriptor:", t0
            sys.exit(1)
    # presumably a weighted random chooser over (weight, item) pairs --
    # TODO confirm against MultiGen.
    Tgen = MultiGen(T)

    N = int(opts['-N'])
    D = float(opts['-D'])
    I = float(opts['-I'])
    U = float(opts['-U'])
    # Reciprocals of the -D/-I/-U values, keyed by variant type; passed
    # through to genVar.
    Ds = {}
    Ds['del'] = 1.0 / D
    Ds['ins'] = 1.0 / I
    Ds['dup'] = 1.0 / U

    verbose = opts['-v']

    if opts['-S'] is not None:
        random.seed(int(opts['-S']))

    with openFile(opts['<regions>']) as f:
        zones = readBED(f)

    # Collect (length, (chrom, zone index)) for every zone, the total
    # length, and the widest chromosome name / zone label (for progress
    # display).
    zps = []
    t = 0
    maxC = 0
    maxM = 0
    for c in zones.keys():
        maxC = max(maxC, len(c))
        for i in range(len(zones[c])):
            (s, e, m) = zones[c][i]
            maxM = max(maxM, len(m))
            l = e - s + 1
            zps.append((l, (c, i)))
            t += l
    # Convert lengths to fractions of the total length.
    zps = [(float(l) / float(t), c) for (l, c) in zps]
    zgen = MultiGen(zps)

    # Draw the zone for each of the N variants up front, then keep only
    # the zones that were drawn at least once, sorted by (chrom, index).
    zcs = dict([(c, 0) for (p, c) in zps])
    for n in xrange(N):
        c = zgen.gen()
        zcs[c] += 1
    zcs = [(c, n) for (c, n) in zcs.items() if n > 0]
    zcs.sort()

    prog = None
    progFmt = None
    if verbose:
        prog = tqdm(total=N, unit='vars')
        progFmt = '%-' + str(maxC) + 's : %-' + str(maxM) + 's'

    curC = None
    curSeq = None
    prevM = None
    for ((c, i), n) in zcs:
        (s, e, m) = zones[c][i]
        # Fetch the chromosome sequence only when the chromosome changes.
        if curC != c:
            curC = c
            if prog is not None:
                prog.set_description(c.ljust(maxC, ' '))
                prog.update(0)
            curSeq = sf[c]
        assert s < len(curSeq)
        assert e <= len(curSeq)
        if prog is not None:
            #prog.set_description(progFmt % (c, m))
            prog.update(n)
        # With -V, announce each new zone label with a comment line.
        if opts['-V'] and m != prevM:
            prevM = m
            print '# %s : %s' % (c, m)
        for j in xrange(n):
            v = genVar(c, curSeq, s, e, Tgen, Ds)
            # The output row is assembled as parallel lists of format
            # specifiers and values.
            fmts = []
            vals = []
            if opts['-T']:
                v.setSequenceFactory(sf)
                wt = curSeq[v.range()[0]:v.range()[1]].upper()
                mut = v.sequence()
                if mut is None:
                    mut = '*'
                fmts += ['%s', '%d', '%d', '%d', '%s', '%s']
                vals += [
                    v.accession(),
                    v.range()[0],
                    v.range()[1],
                    v.size(), wt, mut
                ]
            fmts += ['%s']
            vals += [str(v)]
            print '\t'.join(fmts) % tuple(vals)
def main(argv):
    """
    Entry point for the exon index tool.

    Modes, in the order they are tested:

    * `-P`: prepare a BED file from a gene list and a refGene file, then
      return.
    * `-X`: build the exon index (a YAML stream) from a BED file, then
      return.
    * otherwise: load the index and run the "double-layer index" test
      block, which returns.

    NOTE(review): because the first `if True:` block ends in `return`,
    everything after it in this function is currently unreachable.
    """
    global verbose
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']

    # Directory handed to SequenceFactory; defaults to the current one.
    genomeDir = '.'
    if opts['-g']:
        genomeDir = opts['-g']
    sf = SequenceFactory(genomeDir)

    if opts['-P']:
        if opts['-t']:
            prepareBedFileGeneTx(opts['<gene-list>'], opts['<refgene>'],
                                 opts['<bedfile>'])
        else:
            prepareBedFileGene(opts['<gene-list>'], opts['<refgene>'],
                               opts['<bedfile>'])
        return

    if opts['-X']:
        with openFile(opts['<index>'], 'w') as out:
            yaml.safe_dump_all(indexBedFiles(opts['<must-have>'], sf),
                               out,
                               default_flow_style=False)
        return

    K = int(opts['-k'])
    minGeneReads = int(opts['-M'])
    minExonReads = int(opts['-m'])
    minGeneRate = float(opts['-R'])
    minExonRate = float(opts['-r'])
    # -Z and -z are `min:max` pairs.
    (minGeneCount, maxGeneCount) = map(int, opts['-Z'].split(':'))
    (minExonCount, maxExonCount) = map(int, opts['-z'].split(':'))

    # BaseLoader leaves every scalar in the index as a string.
    with openFile(opts['<index>']) as f:
        ref = list(yaml.load_all(f, Loader=yaml.BaseLoader))

    if True:
        # Test the double-layer index
        idx = ExonIndex(K, ref)
        acc = {}
        toc = {}
        rn = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         paired=True,
                         reads=True,
                         kmers=False,
                         both=True,
                         verbose=verbose):
            rn += 1
            (lhsFwd, lhsRev) = kmersLists(K, itm.reads[0][1])
            (rhsFwd, rhsRev) = kmersLists(K, itm.reads[1][1])
            # Try both orientations of the pair: first read forward with
            # second read reversed, and vice versa.
            xs0 = lhsFwd + rhsRev
            rh0 = idx.readHash(xs0)
            if rh0 is not None:
                (h0, ys0) = rh0
                if h0 not in acc:
                    acc[h0] = []
                    toc[h0] = ys0
                acc[h0].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))
            xs1 = lhsRev + rhsFwd
            rh1 = idx.readHash(xs1)
            if rh1 is not None:
                (h1, ys1) = rh1
                if h1 not in acc:
                    acc[h1] = []
                    toc[h1] = ys1
                acc[h1].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))
        nx = 0
        # NOTE(review): acc[h] is a list (created as [] and appended to
        # above), so `.items()` will raise AttributeError as soon as any
        # read produced a hash -- verify what was intended here.
        for h in sorted(acc.keys()):
            for (x, c) in sorted(acc[h].items()):
                nx += 1
                if c <= 1:
                    continue
                print '%016x\t%s\t%d' % (h, render(K, x), c)
        print >> sys.stderr, 'nx =', nx
        return

    if False:
        # Position index
        idx = {}
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                p -= 1
                if x not in idx:
                    idx[x] = []
                idx[x].append((i, p))

    if True:
        # Exon tuple index
        # idx maps each k-mer to the sorted tuple of index entries whose
        # sequence contains it; lens[i] counts the k-mers of entry i.
        idx = {}
        lens = [0 for i in range(len(ref))]
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(i)
                lens[i] += 1
        for x in idx.iterkeys():
            idx[x] = tuple(sorted(idx[x]))

    if opts['-T']:
        # Report k-mers shared between index entries ...
        ak = {}
        for x in sorted(idx.iterkeys()):
            if len(idx[x]) == 1:
                continue
            xStr = render(K, x)
            ak[xStr] = []
            for i in idx[x]:
                itm = ref[i]
                k = '%s/%s' % (itm['gene'], itm['exon'])
                ak[xStr].append(k)
            ak[xStr].sort()
        rep = {}
        rep['aliasing-within'] = ak

        # ... and entries whose k-mers occur in the chromosome sequences a
        # different number of times than in the index.
        chrs = set([])
        for i in range(len(ref)):
            itm = ref[i]
            chrs.add(itm['chr'])
        counts = [0 for i in range(len(ref))]
        for ch in sorted(chrs):
            if verbose:
                print >> sys.stderr, 'processing %s' % (ch, )
            seq = sf[ch]
            for (x, p) in kmersWithPos(K, seq, True):
                if x not in idx:
                    continue
                for i in idx[x]:
                    counts[i] += 1
        gk = {}
        for i in range(len(ref)):
            if lens[i] == counts[i]:
                continue
            itm = ref[i]
            k = '%s/%s' % (itm['gene'], itm['exon'])
            gk[k] = {'indexed': lens[i], 'genomic': counts[i]}
        rep['aliasing-genomic'] = gk
        yaml.safe_dump(rep, sys.stdout, default_flow_style=False)
        return

    # acc maps a key (sorted tuple of hit keys) to [number of reads,
    # summed hit values].
    acc = {}
    rn = 0
    hitStats = Summary()
    # Histogram of hit counts; indexing assumes hit counts below 1000.
    hitHist = [0 for i in range(1000)]
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        (lhsFwd, lhsRev) = kmersWithPosLists(K, itm.reads[0][1])
        (rhsFwd, rhsRev) = kmersWithPosLists(K, itm.reads[1][1])
        (hits0, hitCount0) = recHits(idx, lhsFwd + rhsRev)
        (hits1, hitCount1) = recHits(idx, lhsRev + rhsFwd)
        if len(hits0) > 0:
            k = tuple(sorted(hits0.keys()))
            v = sum(hits0.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount0)
            hitHist[hitCount0] += 1
        if len(hits1) > 0:
            k = tuple(sorted(hits1.keys()))
            v = sum(hits1.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount1)
            hitHist[hitCount1] += 1

    if verbose:
        print >> sys.stderr, 'total read hits: %d' % (len(hitStats), )
        print >> sys.stderr, 'total hits per read: %g (%g)' % (
            hitStats.mean(), hitStats.sd())
        print >> sys.stderr, 'total reads: %d' % (rn, )
        for i in range(len(hitHist)):
            if hitHist[i] > 0:
                print >> sys.stderr, '\t%d\t%d' % (i, hitHist[i])

    # Render a tuple of index entry numbers as 'gene/exon|gene/exon|...'.
    def gex(s):
        r = []
        for n in s:
            itm = ref[n]
            r.append('%s/%s' % (itm['gene'], itm['exon']))
        return '|'.join(r)

    # For an acc key, return (number of parts, set of genes, display
    # string of the parts joined with '--').
    def fmtKey(k):
        nex = len(k)
        gx = set([])
        kStrParts = []
        for s in k:
            kStrParts.append(gex(s))
            gx |= set([ref[i]['gene'] for i in s])
        kStr = '--'.join(sorted(kStrParts))
        return (nex, gx, kStr)

    # Aggregate the read / k-mer totals per gene group.
    gxCounts = {}
    for k in acc.keys():
        gx = set([])
        ex = set([])
        for s in k:
            gx |= set([ref[i]['gene'] for i in s])
            ex |= set(s)
        gx = tuple(sorted(gx))
        if gx not in gxCounts:
            gxCounts[gx] = [0, 0]
        gxCounts[gx][0] += acc[k][0]
        gxCounts[gx][1] += acc[k][1]

    hdr = ['numReads', 'numKmers', 'kmersPerRead']
    hdr += ['ggNumReads', 'ggNumKmers', 'ggKmersPerRead']
    hdr += ['numExons', 'numGenes', 'geneGroup', 'exonGroup']
    print '\t'.join(hdr)
    for k in acc.keys():
        (nex, gx, kStr) = fmtKey(k)
        gx = tuple(sorted(gx))
        if len(gx) < minGeneCount or len(gx) > maxGeneCount:
            continue
        # NOTE(review): `ex` is not recomputed for this key; it still
        # holds the value left over from the last iteration of the
        # gxCounts loop above -- confirm whether that is intended.
        if len(ex) < minExonCount or len(ex) > maxExonCount:
            continue
        if gxCounts[gx][0] < minGeneReads:
            continue
        if acc[k][0] < minExonReads:
            continue
        gxRate = float(gxCounts[gx][1]) / float(gxCounts[gx][0])
        if gxRate < minGeneRate:
            continue
        exRate = float(acc[k][1]) / float(acc[k][0])
        if exRate < minExonRate:
            continue
        gxStr = ':'.join(gx)
        print '%d\t%d\t%g\t%d\t%d\t%g\t%d\t%d\t%s\t%s' % (
            acc[k][0], acc[k][1], exRate, gxCounts[gx][0], gxCounts[gx][1],
            gxRate, nex, len(gx), gxStr, kStr)
def main(argv):
    """
    Count the k-mers of the input reads and write them to `<output>`.

    K-mers are accumulated in memory; whenever the accumulator grows past
    half of the memory budget (`-m`, in MiB, default 32) it is spilled to
    a temporary casket file.  At the end the in-memory counts, the single
    spill, or the merge of all spills is written to the output, together
    with metadata: K, the count histogram, the distribution of the low
    two bits of the k-mers (`acgt`) and the number of reads.

    Optional filters: `-D` (with seed `-S`) keeps only k-mers accepted by
    `sub`; `-C` keeps only reads sharing a k-mer with the given FASTA file.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']

    K = int(opts['<k>'])
    out = opts['<output>']

    # Memory budget in bytes.
    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    n = 0
    tmps = []
    acgt = [0, 0, 0, 0]
    m = 0

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])

    # Memo of k-mers already accepted / rejected by sub(); both are reset
    # when they exceed a million entries (see below).
    cacheYes = set([])
    cacheNo = set([])

    B = opts['-C']
    if B is not None:
        # Replace the file name with the set of k-mers it contains.
        # NOTE(review): the file opened here is never explicitly closed.
        xs = set([])
        for (nm, seq) in readFasta(openFile(B)):
            xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    with casket(tmpnm, 'w') as z:
        nr = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         pairs=False,
                         reads=False,
                         kmers=True,
                         both=True,
                         verbose=verbose):
            xs = itm.kmers[0]
            # Tally the low two bits of every k-mer.
            for x in xs:
                acgt[x & 3] += 1
            if d is not None:
                # Sub-sampling mode: keep only k-mers accepted by sub().
                for x in xs:
                    if x in cacheNo:
                        continue
                    if x not in cacheYes:
                        if not sub(S, d, x):
                            cacheNo.add(x)
                            continue
                        cacheYes.add(x)
                    buf.add(x)
                    m += 1
                    n += 1
                if len(cacheYes) > 1000000:
                    cacheYes = set([])
                if len(cacheNo) > 1000000:
                    cacheNo = set([])
            elif B is not None:
                # Bait mode: keep the whole read if any k-mer is a bait.
                found = False
                for x in xs:
                    if x in B:
                        found = True
                        break
                if found:
                    buf.addList(xs)
                    for x in xs:
                        m += 1
                        n += 1
            else:
                buf.addList(xs)
                for x in xs:
                    m += 1
                    n += 1

            nr += 1
            # Check the memory use only every 1024 reads.
            if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                buf.clear()
                n = 0

        # If anything was spilled, spill the remainder too so that the
        # final result comes from the temporary files alone.
        if len(tmps) and len(buf):
            fn = 'tmps-%d' % (len(tmps), )
            tmps.append(fn)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
            buf = []

    with zotk.kmers(out, 'w') as z:
        h = {}
        if len(tmps) == 0:
            # Everything fitted in memory: build the histogram here.
            for c in buf.countsOnly():
                h[c] = 1 + h.get(c, 0)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
        elif len(tmps) == 1:
            # NOTE(review): `h` is not filled in on this path, so the
            # stored histogram is empty -- confirm that is acceptable.
            with casket(tmpnm, 'r') as z0:
                writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
        else:
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, z)
        # Normalise the tallies to fractions; this divides by zero when
        # no k-mers were seen at all.
        n = float(sum(acgt))
        acgt = [c / n for c in acgt]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = acgt
        z.meta['reads'] = nr

    os.remove(tmpnm)
def main(argv):
    """
    Look for duplications in the bait sequences using k-mer evidence
    from the reads.

    K must be even: each K-mer is split into a leading and a trailing
    half (J = K/2 bases each) and candidate duplications are searched for
    with `findDup` over the maps between those halves.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return

    minCov = int(opts['-m'])

    verbose = opts['-v']

    J = K // 2
    # Shift that isolates the leading J bases (2 bits per base) and mask
    # that isolates the trailing J bases.
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1

    names = []
    seqs = {}
    bait = {}       # K-mer -> set of sequence numbers containing it
    wtFst = []      # per sequence: leading half -> set of trailing halves
    wtLst = []      # per sequence: trailing half -> set of leading halves
    posIdx = []     # per sequence: J-mer -> list of positions
    rds = []        # per sequence: captured reads
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)
                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))
                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)
                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)
            wtFst.append(wf)
            wtLst.append(wl)

            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)

            # Run the duplication search on the reference against itself
            # and warn about anything it reports.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)
            rds.append([])

    N = len(names)

    L = None
    # X[n] counts the k-mers of all reads that hit sequence n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)
        xs = itm.kmers[0]
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)

    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only the k-mers seen at least 10 times.
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c

        seq = seqs[names[n]]
        rngs = []
        st = None
        en = None
        inside = False
        # Print a coverage map of the reference: '.' where the reference
        # k-mer is present in xs, 'X' where it is missing.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (last present k-mer before a gap, first present k-mer
        # after it) for each run of missing reference k-mers.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))

        # Print the paths found across each gap that has both ends.
        pthr = Pather(K, xs)
        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)

        # NOTE(review): this unconditional `continue` makes the rest of
        # the loop body below unreachable -- confirm it is intended.
        continue

        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj
            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)
            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)

        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]
                dd = pp[2] - pp[0]

                # Unless -a is given, only report lengths that are a
                # multiple of three.
                if not opts['-a'] and dd % 3 != 0:
                    continue

                if opts['-s']:
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)

                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue

                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue

                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]

                # Coverage of the three junction k-mers.
                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue
                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                w = ccb / m

                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])

                vn += 1

                # The output row is assembled as parallel lists of
                # headers, format specifiers and values.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]

                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]

                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]

                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]

                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]

                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]

                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]

                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
def next(self):
    """
    Return the next item: one read from each of the `self.N` files that
    are currently being read in parallel.

    Files are consumed in groups of `self.N`; when a group is exhausted
    the next group is opened.  The returned `Reads` object carries the
    reads themselves (if `self.reads`) and/or their k-mers (if
    `self.kmers`).

    Raises StopIteration when there is no complete group of files left.
    """
    self.readNum += 1
    # Report progress in steps of self.M reads.
    if (self.readNum & self.M) == 0 and self.progress is not None:
        self.progress.update(self.M)
    while True:
        if self.currParsers is None:
            # No files open: advance to the next group of N files.
            if self.currFilesInd is None:
                self.currFilesInd = 0
            else:
                self.currFilesInd += self.N
                if self.progress is not None:
                    self.progress.update(self.readNum & self.M)
            if self.currFilesInd + (self.N - 1) >= len(self.files):
                raise StopIteration
            if self.verbose:
                pfx = ' & '.join([
                    basename(self.files[i]) for i in range(
                        self.currFilesInd, self.currFilesInd + self.N)
                ])
                self.progress = tqdm(unit=' reads', unit_scale=True)
                self.progress.set_postfix(reading=pfx, refresh=True)
            # The parser is chosen per file from the file name.
            self.currParsers = []
            for i in range(self.currFilesInd, self.currFilesInd + self.N):
                fn = self.files[i]
                f = openFile(fn)
                if isFasta(fn):
                    self.currParsers.append(readFasta(f))
                else:
                    self.currParsers.append(readFastq(f))
        self.currReads = []
        try:
            for p in self.currParsers:
                self.currReads.append(p.next())
        except StopIteration:
            # A parser ran dry.  If an earlier parser of the group still
            # produced a read, the files did not have the same length.
            if len(self.currReads) != 0:
                print >> sys.stderr, 'warning: files had unequal length'
            self.currParsers = None
            if self.progress is not None:
                self.progress.close()
                self.progress = None
            # Go round again to open the next group of files.
            continue
        if self.kmers:
            self.currKmers = []
            for rd in self.currReads:
                if self.fwdOnly:
                    self.currKmers.append(kmersList(self.K, rd[1], False))
                elif self.both:
                    self.currKmers.append(kmersList(self.K, rd[1], True))
                else:
                    assert self.separate
                    self.currKmers.append(kmersLists(self.K, rd[1]))
        res = Reads()
        if self.reads:
            res.reads = self.currReads
        if self.kmers:
            res.kmers = self.currKmers
        return res
def main(argv):
    """
    Find "spurs" -- k-mer paths branching off the reference -- inside the
    regions of a BED file, and report candidate insertion points.

    With `-r` every spur is printed as it is found ('after' / 'before'
    rows).  Otherwise forward and reverse spurs are shifted
    (`shiftForwardSpur` / `shiftReverseSpur`) and a row is printed for
    each pair of a forward spur ending at position p and a reverse spur
    starting at p + 1.
    """
    opts = docopt.docopt(__doc__, argv)
    verbose = opts['-v']

    K = int(opts['-k'])
    C = int(opts['-C'])
    L = int(opts['-L'])
    raw = opts['-r']
    S = int(opts['-S'])
    V = float(opts['-V'])

    # Directory handed to SequenceFactory; defaults to the current one.
    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    with openFile(opts['<regions>']) as f:
        R = readBED(f)

    refTbl = {}     # zone name -> {position -> reference k-mer}
    refIdx = {}     # reference k-mer -> list of (zone name, position)
    zoneIdx = {}    # zone name -> (accession, start, end)
    for (acc, zones) in R.items():
        accSeq = sf[acc]
        for (s, e, nm) in zones:
            zoneIdx[nm] = (acc, s, e)
            seq = accSeq[s - 1:e]
            if nm not in refTbl:
                refTbl[nm] = {}
            for (x, p) in kmersWithPosList(K, seq, False):
                # Convert the position within the slice into a position
                # relative to the zone start.
                p -= 1
                p += s
                refTbl[nm][p] = x
                if x not in refIdx:
                    refIdx[x] = []
                refIdx[x].append((nm, p))

    # acc is filled in by hits(); it is used below as
    # acc[zone][position][k-mer] -> count.
    acc = {}
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     verbose=verbose):
        rdL = itm.reads[0]
        zL = len(rdL)
        (fwdL, revL) = kmersWithPosLists(K, rdL[1])
        fwdLHits = hits(refIdx, K, fwdL, acc)
        revLHits = hits(refIdx, K, revL, acc)

        rdR = itm.reads[1]
        zR = len(rdR)
        (fwdR, revR) = kmersWithPosLists(K, rdR[1])
        fwdRHits = hits(refIdx, K, fwdR, acc)
        revRHits = hits(refIdx, K, revR, acc)

    # Prune weak evidence.  K-mers at a position are grouped by `x >> 2`
    # (i.e. ignoring their last two bits); a k-mer is dropped when its
    # count is below C or below the fraction V of its group's total.
    # Positions and zones left empty are dropped as well.
    killZ = set([])
    for z in acc.keys():
        killP = set([])
        for p in acc[z].keys():
            killX = set([])
            vv = {}
            for x in acc[z][p].keys():
                y = x >> 2
                if y not in vv:
                    vv[y] = []
                vv[y].append((x, acc[z][p][x]))
            for vs in vv.values():
                vt = V * sum([c for (x, c) in vs])
                for (x, c) in vs:
                    if c < vt or c < C:
                        killX.add(x)
            for x in killX:
                del acc[z][p][x]
            if len(acc[z][p]) == 0:
                killP.add(p)
        for p in killP:
            del acc[z][p]
        if len(acc[z]) == 0:
            killZ.add(z)
    for z in killZ:
        del acc[z]

    # NOTE(review): the raw header names 6 columns, but the raw rows
    # printed below have 7 fields (the last is the comma separated
    # counts) -- confirm.
    if raw:
        print '\t'.join(['chrom', 'pos', 'side', 'label', 'anchor', 'insSeq'])
    else:
        print '\t'.join([
            'chrom', 'after', 'before', 'label', 'rhsShift', 'lhsShift',
            'lhsAnc', 'rhsAnc', 'lhsSeq', 'rhsSeq'
        ])

    for z in sorted(acc.keys()):
        (ch, st, en) = zoneIdx[z]
        Z = acc[z]
        ref = refTbl[z]

        aft = dict(forwardSpurs(K, ref, Z))
        bef = dict(reverseSpurs(K, ref, Z))

        # Forward spurs: the first K bases of the rendered path are the
        # anchor, the rest is the inserted sequence.
        scoredAft = {}
        for p in sorted(aft.keys()):
            # Skip spurs leaving from the very end of the zone.
            if p + K - 1 == en:
                continue
            for spur in aft[p]:
                if len(spur) < L:
                    continue
                if raw:
                    (xs, cs) = zip(*spur)
                    seq = renderPath(K, xs)
                    anc = seq[:K]
                    ins = seq[K:]
                    print '%s\t%d\t%s\t%s\t%s\t%s\t%s' % (
                        ch, p + K - 1, 'after', z, anc, ins, ','.join(
                            map(str, cs)))
                    continue
                for (q, xcs, v) in shiftForwardSpur(ref, Z, S, p, spur):
                    q += K - 1
                    if q not in scoredAft:
                        scoredAft[q] = []
                    (xs, cs) = zip(*xcs)
                    seq = renderPath(K, xs)
                    anc = seq[:K]
                    ins = seq[K:]
                    scoredAft[q].append((v, anc, ins, cs))

        # Reverse spurs: the last K bases of the rendered path are the
        # anchor, the rest is the inserted sequence.
        scoredBef = {}
        for p in sorted(bef.keys()):
            # Skip spurs arriving at the very start of the zone.
            if p == st:
                continue
            for spur in bef[p]:
                if len(spur) < L:
                    continue
                if raw:
                    (xs, cs) = zip(*spur)
                    seq = renderPath(K, xs)
                    anc = seq[-K:]
                    ins = seq[:-K]
                    print '%s\t%d\t%s\t%s\t%s\t%s\t%s' % (
                        ch, p, 'before', z, anc, ins, ','.join(map(str, cs)))
                    continue
                for (q, xcs, v) in shiftReverseSpur(ref, Z, S, p, spur):
                    if q not in scoredBef:
                        scoredBef[q] = []
                    (xs, cs) = zip(*xcs)
                    seq = renderPath(K, xs)
                    anc = seq[-K:]
                    ins = seq[:-K]
                    scoredBef[q].append((v, anc, ins, cs))

        # Pair each forward spur at p0 with each reverse spur at p0 + 1,
        # skipping pairs where one side's anchor occurs inside the other
        # side's inserted sequence.
        for p0 in sorted(scoredAft.keys()):
            p1 = p0 + 1
            if p1 not in scoredBef:
                continue
            for (aftV, aftAnc, aftIns, aftCov) in scoredAft[p0]:
                for (befV, befAnc, befIns, befCov) in scoredBef[p1]:
                    if befAnc in aftIns or aftAnc in befIns:
                        continue
                    v = aftV + befV
                    print '%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                        ch, p0, p1, z, aftV, befV, aftAnc, befAnc, aftIns,
                        befIns)
def main(argv):
    """
    Count in-frame stop codons seen in the reads, relative to the reading
    frames of the reference sequences given with `-r`.

    Each read is anchored to a reference frame through shared k-mers; a
    k-mer of the read whose last three bases are a stop codon (its low 6
    bits are in `stops`) and which lies in frame is counted.  The result
    is printed as one line per (reference, k-mer), labelled 'known' or
    'novel'.
    """
    opts = docopt.docopt(__doc__, argv)

    # Fixed seed: the frame sampling below is reproducible.
    random.seed(17)

    K = int(opts['-k'])
    S = 2*(K-3)

    frameAnchors = {}   # k-mer -> set of (reference name, 0-based position)
    knownStops = {}     # in-frame reference k-mer ending in a stop -> names
    sequences = {}      # reference name -> untrimmed sequence
    seqKmers = {}       # reference name -> set of its k-mers
    if opts['-r']:
        with openFile(opts['-r']) as f:
            for (nm,seq) in readFasta(f):
                sequences[nm] = seq
                # trim polyA tails
                seq = re.sub('AAAAAA*$', '', seq)
                seqKmers[nm] = set([])
                for (x,p1) in kmersWithPosList(K, seq, False):
                    seqKmers[nm].add(x)
                    p = p1 - 1
                    w = p % 3
                    if x not in frameAnchors:
                        frameAnchors[x] = set([])
                    frameAnchors[x].add((nm,p))
                    y = x & 63
                    if w == 0 and y in stops:
                        if x not in knownStops:
                            knownStops[x] = set([])
                        knownStops[x].add(nm)

    rn = 0
    res = {}
    for fn in opts['<input>']:
        with openFile(fn) as f:
            for rd in readFastq(f):
                L = len(rd[1])
                rn += 1
                fwdAndRev = kmersWithPosLists(K, rd[1])
                # Vote for (reference, offset, strand) using every k-mer
                # the read shares with a reference.
                frames = {}
                possibleStops = {}
                for i in range(2):
                    #print i, sorted([p for (x,p) in fwdAndRev[i]])
                    for (x,p) in fwdAndRev[i]:
                        if x in frameAnchors:
                            for (nm,q) in frameAnchors[x]:
                                o = (q - p)
                                k = (nm, o, i)
                                frames[k] = 1 + frames.get(k, 0)
                if len(frames) == 0:
                    continue
                n = sum(frames.values())
                probs = []
                for ((nm, off, strnd), cnt) in sorted(frames.items()):
                    probs.append((float(cnt)/float(n), cnt, off, strnd, nm))
                # Pick one candidate frame at random, with probability
                # proportional to its number of votes.
                v = random.random()
                for (pv, cnt, off, strnd, nm) in probs:
                    if v < pv:
                        #print rd[1]
                        #print proj(strnd, sequences[nm][off:off+len(rd[1])])
                        #print codons(off % 3, rd[1]), off
                        for (x,p) in fwdAndRev[strnd]:
                            if (p + off + K - 3) % 3 == 0 and (x & 63) in stops:
                                if nm not in res:
                                    res[nm] = {}
                                if x not in res[nm]:
                                    res[nm][x] = 0
                                res[nm][x] += 1
                        break
                    v -= pv
    for (nm,stps) in res.iteritems():
        for (x,c) in stps.iteritems():
            (d,y) = nearest3(K, seqKmers[nm], x)
            if x in knownStops:
                k = 'known'
            else:
                k = 'novel'
            print '%s\t%s\t%d\t%d\t%s\t%s' % (k, render(K, x), c, d, render(K, y), nm)
def outputFile(nm):
    """
    Return the stream to write output to: `sys.stdout` when `nm` is None,
    otherwise the file `nm` opened for writing with `openFile`.
    """
    return sys.stdout if nm is None else openFile(nm, 'w')
def main(argv):
    """
    Simulate paired-end reads, optionally carrying the given variants.

    Reads are drawn from the zones of the BED file given with `-b` (or,
    without it, from the whole of every sequence named in `refSeq2Hg19`).
    The `-N` read pairs are distributed over the zones with probability
    proportional to zone length.  Each pair is made from the mutant
    allele (the requested variants plus background variants from `-m`)
    with probability `-V`, and from the wild-type allele (background
    variants only) otherwise.

    Output goes to `<output-prefix>_1.fastq` and `<output-prefix>_2.fastq`
    (with a `.gz` suffix when `-z` is given).  The program exits with
    status 1 if any two requested variants overlap.
    """
    opts = docopt.docopt(__doc__, argv)

    sf = SequenceFactory(opts['-g'])

    I = int(opts['-I'])
    D = float(opts['-d'])
    E = float(opts['-e'])
    L = int(opts['-L'])
    N = int(opts['-N'])
    V = float(opts['-V'])

    M = None
    if opts['-M'] is not None:
        M = float(opts['-M'])
        # compute the 99% quantile
        # (W is not used further; the computation is kept because it
        # rejects values of -M for which the logarithm is undefined.)
        W = int(math.log1p(-0.99) / math.log1p(-M))

    S = None
    if opts['-S'] is not None:
        S = int(opts['-S'])
        random.seed(S)

    if opts['-b']:
        # Close the BED file once it has been read.
        with openFile(opts['-b']) as f:
            zones = readBED(f)
    else:
        # No BED file: one zone per sequence, covering all of it.
        zones = {}
        for ch in refSeq2Hg19.values():
            ch = normalizeAccession(ch)
            s = sf[ch]
            v = (1, len(s), ch)
            if ch not in zones:
                zones[ch] = []
            zones[ch].append(v)

    # Background variants: lines of `<probability> <variant>`, grouped by
    # normalized accession.
    popVars = {}
    if opts['-m'] is not None:
        with openFile(opts['-m']) as f:
            for l in f:
                t = l.split()
                p = float(t[0])
                v = makeHGVS(t[1], sf)
                a = normalizeAccession(v.accession())
                if a not in popVars:
                    popVars[a] = []
                popVars[a].append((v, p))
        for ch in popVars.keys():
            popVars[ch].sort()

    # Total zone length, and the widest chromosome name (for the progress
    # display).
    t = 0
    chMax = 0
    for ch in zones.keys():
        chMax = max(chMax, len(ch))
        for zone in zones[ch]:
            (s, e, _n) = zone
            l = e - s + 1
            t += l

    if opts['-v']:
        print >> sys.stderr, 'mean coverage = %g' % (float(N * L) /
                                                     float(t), )

    # Decide up front how many read pairs each zone gets.
    zoneCounts = {}
    zoneProbs = []
    for ch in zones.keys():
        zoneCounts[ch] = {}
        for zone in zones[ch]:
            zoneCounts[ch][zone] = 0
            (s, e, _n) = zone
            l = e - s + 1
            zoneProbs.append((float(l) / float(t), (ch, zone)))
    zgen = MultiGen(zoneProbs)
    for n in xrange(N):
        (ch, z) = zgen.gen()
        if z not in zoneCounts[ch]:
            zoneCounts[ch][z] = 0
        zoneCounts[ch][z] += 1

    # Variants come from the command line and, optionally, from a file.
    vStrs = opts['<variant>']
    if opts['-f'] is not None:
        with openFile(opts['-f']) as f:
            for l in f:
                s = l.strip()
                vStrs.append(s)

    allVars = {}
    for s in vStrs:
        v = makeHGVS(s, sf)
        if v is None:
            # Fixed: this used a comma where the `%` operator was meant,
            # so the raw format string and a tuple were printed.
            print >> sys.stderr, 'unable to parse variant: %s' % (s, )
            continue
        if v.anonymous():
            # The variant has no explicit sequence: make up a random one.
            n = v.size()
            seq = ''.join(
                [random.choice(['A', 'C', 'G', 'T']) for i in range(n)])
            v.setSequence(seq)
        a = normalizeAccession(v.accession())
        if a not in allVars:
            allVars[a] = []
        allVars[a].append(v)

    # Refuse to continue if any two variants on a sequence overlap.
    numOverlaps = 0
    for xs in allVars.values():
        xs.sort()
        for i in range(len(xs)):
            for j in range(i + 1, len(xs)):
                if xs[i].overlaps(xs[j]):
                    print >> sys.stderr, "variants overlap: %s <> %s" % (
                        str(xs[i]), str(xs[j]))
                    numOverlaps += 1
    if numOverlaps > 0:
        sys.exit(1)

    prog = None
    if opts['-v']:
        prog = tqdm(total=N, unit='pairs')

    egen = GeomVarSource(E)

    fasta = False

    # The log file is created (or truncated) but nothing in this function
    # writes to it.
    logfile = None
    if opts['-l']:
        logfile = open(opts['-l'], 'w')

    pfx = opts['<output-prefix>']
    sfx = ''
    if opts['-z']:
        sfx = '.gz'

    with openFile(pfx + '_1.fastq' + sfx, 'w') as out1, openFile(
            pfx + '_2.fastq' + sfx, 'w') as out2:
        for ch in zones.keys():
            if prog is not None:
                prog.set_description(ch.ljust(chMax, ' '))
                prog.update(0)

            chVars = []
            if ch in allVars:
                chVars = allVars[ch]

            wtVars = applyBackgroundVariants(ch, [], popVars)
            mutVars = applyBackgroundVariants(ch, chVars, popVars)

            for zone in zones[ch]:
                wtMaker = ReadMaker(chrom=ch,
                                    zone=zone,
                                    L=L,
                                    I=I,
                                    D=D,
                                    variants=wtVars,
                                    fasta=fasta,
                                    egen=egen,
                                    sf=sf)
                wtMaker.prepareAllele()
                mutMaker = ReadMaker(chrom=ch,
                                     zone=zone,
                                     L=L,
                                     I=I,
                                     D=D,
                                     variants=mutVars,
                                     fasta=fasta,
                                     egen=egen,
                                     sf=sf)
                mutMaker.prepareAllele()
                for i in xrange(zoneCounts[ch][zone]):
                    if prog is not None:
                        prog.update(1)
                    # Mutant allele with probability V, wild type
                    # otherwise.
                    u = random.random()
                    if u > V:
                        (rd1, rd2) = wtMaker.makeReadFromZone()
                    else:
                        (rd1, rd2) = mutMaker.makeReadFromZone()
                    print >> out1, rd1
                    print >> out2, rd2

    if prog is not None:
        prog.__exit__(None, None, None)

    if logfile is not None:
        logfile.close()
def main(argv): opts = docopt.docopt(__doc__, argv) verbose = opts['-v'] K = int(opts['-k']) D = int(opts['-D']) Q = int(opts['-C']) V = float(opts['-V']) d = "." if opts['-g']: d = opts['-g'] sf = SequenceFactory(d) if opts['-X']: Wcap = int(opts['-w']) Wval = int(opts['-W']) variants = opts['<variant>'] if opts['-f']: with openFile(opts['-f']) as f: variants += f.read().split() vx = {} for v in variants: x = makeHGVS(v) if x is None: print >> sys.stderr, "unable to parse %s" % (v, ) continue x.setSequenceFactory(sf) acc = x.accession() if acc not in vx: vx[acc] = [] vx[acc].append(x) chk = None if opts['-T']: chk = {} rs = [] for (acc, vs) in vx.iteritems(): for v in vs: r = makeIndexedVariant(v, K, Wcap, Wval) if r is not None: rs.append(r) if chk is not None: xs = kmersList( K, ''.join([ r['lhsFlank'][-(K - 1):], r['wtSeq'], r['rhsFlank'][:K - 1] ]), True) for x in xs: if x not in chk: chk[x] = set([]) chk[x].add(('wt', str(v))) if r['mutSeq'] is None: continue xs = kmersList( K, ''.join([ r['lhsFlank'][-(K - 1):], r['mutSeq'], r['rhsFlank'][:K - 1] ]), True) for x in xs: if x not in chk: chk[x] = set([]) chk[x].add(('mut', str(v))) if chk is not None: counts = dict([(x, 0) for x in chk.keys()]) for acc in refSeq2Hg19.keys(): if verbose: print >> sys.stderr, 'scanning', acc seq = sf[acc] for x in kmers(K, seq): if x in counts: counts[x] += 1 res = {} seen = set([]) for x in counts.keys(): y = rc(K, x) z = min(x, y) if z in seen: continue seen.add(z) c = counts[x] + counts[y] for (a, v) in chk[x]: if v not in res: res[v] = {} if a not in res[v]: res[v][a] = {} if c not in res[v][a]: res[v][a][c] = 0 res[v][a][c] += 1 yaml.safe_dump(res, sys.stdout, default_flow_style=False) return with open(opts['<index>'], 'w') as f: yaml.safe_dump(rs, f, default_flow_style=False) return capt = False zipname = None if opts['-c']: capt = True zipname = opts['-c'] fmt = set([]) if opts['-F']: fmt = set(opts['-F'].split(',')) if verbose: print >> sys.stderr, "loading index." 
with open(opts['<index>']) as f: hgvsVars = yaml.load(f, Loader=yaml.FullLoader) NV = len(hgvsVars) combineStrands = True if opts['-s']: combineStrands = False cap = capture(K, reads=capt, kmers=True, verbose=verbose) for n in range(NV): itm = hgvsVars[n] h = itm['hgvs'] v = makeHGVS(h) itm['var'] = v lhs = itm['lhsFlank'] rhs = itm['rhsFlank'] wt = itm['wtSeq'] mut = itm['mutSeq'] bait = [lhs, wt, rhs] if mut is not None: bait += ['N'] bait += [lhs, mut, rhs] bait = ''.join(bait) n0 = cap.addBait(h, bait) assert n0 == n if verbose: print >> sys.stderr, "done." rn = 0 for itm in reads(opts['<input>'], K=K, paired=True, reads=True, kmers=False, both=True, verbose=verbose): rn += 1 cap.addReadPairAndKmers(itm.reads[0], itm.reads[1]) if capt: cap.saveReads(zipname) scorer = Scorer(K) globHist = {} for n in range(NV): mx = cap.capKmers[n] for c in mx.itervalues(): if c < Q: continue if c not in globHist: globHist[c] = 0 globHist[c] += 1 with outputFile(opts['-o']) as out: hdrShown = False for n in range(NV): itm = hgvsVars[n] v = itm['var'] h = itm['hgvs'] mx = cap.capKmers[n] nr = cap.capReadCounts[n] if 'kmers' in fmt: for (x, c) in mx.iteritems(): print '%d\t%s\t%d' % (n, render(K, x), c) lhsFlank = itm['lhsFlank'] rhsFlank = itm['rhsFlank'] alleles = {} alleles['wt'] = [] alleles['mut'] = [] wtSeq = itm['wtSeq'] wtZ = len(wtSeq) mutSeq = itm['mutSeq'] mutZ = v.size() cs = [c for (x, c) in mx.iteritems() if c >= Q] cs.sort() nk = len(cs) if nk == 0: cs = [0] q10 = cs[1 * len(cs) // 10] q50 = cs[5 * len(cs) // 10] q90 = cs[9 * len(cs) // 10] af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq, wtZ, mutZ) finders = [] if not v.anonymous(): finders.append(af.definiteAlleles()) else: finders.append(af.bridgingAlleles()) j = 0 for (t, a) in cat(finders): assert t == 'wt' or t == 'mut' alleles[t].append(a) j += 1 wtRes = {} wtRes['covMin'] = 0 wtRes['binom'] = 1.0 wtRes['ksDist'] = 0.0 wtRes['hamming'] = 0 wtRes['path'] = [] for pthRes in alleles['wt']: 
scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank) if isBetter(pthRes, wtRes): wtRes = pthRes mutRes = {} mutRes['covMin'] = 0 mutRes['binom'] = 1.0 mutRes['ksDist'] = 0.0 mutRes['hamming'] = 0 mutRes['path'] = [] for pthRes in alleles['mut']: scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank) if isBetter(pthRes, mutRes): mutRes = pthRes if True: wtXs = [mx.get(x, 0) for x in wtRes['path']] if len(wtXs) == 0: wtXs = [0] wtXs.sort() wtCount = sum(wtXs) wtLen = len(wtXs) wtMean = float(wtCount) / float(wtLen) wtMedian = wtXs[wtLen // 2] mutXs = [mx.get(x, 0) for x in mutRes['path']] if len(mutXs) == 0: mutXs = [0] mutXs.sort() mutCount = sum(mutXs) mutLen = len(mutXs) mutMean = float(mutCount) / float(mutLen) mutMedian = mutXs[mutLen // 2] totX = max([1.0, float(wtMedian + mutMedian), float(q90)]) wtVaf = wtMedian / totX mutVaf = mutMedian / totX hdrs = ['n'] fmts = ['%d'] outs = [n] wtAllele = ((wtRes['covMin'] > Q) and (wtRes['hamming'] < 4)) and (wtVaf > V) mutAllele = ((mutRes['covMin'] > Q) and (mutRes['hamming'] < 4)) and (mutVaf > V) resV = 1 * wtAllele + 2 * mutAllele res = ['null', 'wt', 'mut', 'wt/mut'][resV] hdrs += ['res'] fmts += ['%s'] outs += [res] if 'rds' in fmt: hdrs += ['numReads'] fmts += ['%d'] outs += [nr] hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90'] fmts += ['%d', '%d', '%d', '%d'] outs += [nk, q10, q50, q90] hdrs += ['wtMin', 'mutMin'] fmts += ['%d', '%d'] outs += [wtRes['covMin'], mutRes['covMin']] hdrs += ['wtHam', 'mutHam'] fmts += ['%d', '%d'] outs += [wtRes['hamming'], mutRes['hamming']] if 'ks' in fmt: hdrs += ['wtD', 'mutD'] fmts += ['%g', '%g'] outs += [wtRes['ksDist'], mutRes['ksDist']] if 'binom' in fmt: hdrs += ['wtQ', 'mutQ'] fmts += ['%g', '%g'] outs += [wtRes['binom'], mutRes['binom']] if 'vaf' in fmt: hdrs += ['wtVaf', 'mutVaf'] fmts += ['%g', '%g'] outs += [wtVaf, mutVaf] hdrs += ['hgvs'] fmts += ['%s'] outs += [h] if not hdrShown: hdrShown = True print >> out, '\t'.join(hdrs) print >> out, '\t'.join(fmts) % tuple(outs) 
out.flush()
def main(argv): opts = docopt.docopt(__doc__, argv) fns = opts['<input>'] p = None if opts['-p'] is not None: p = float(opts['-p']) if len(fns) == 1 and isFasta(fns[0]): K = 25 seqs = [] with openFile(fns[0]) as f: for (nm, seq) in readFasta(f): xs = set(basics.kmers(K, seq, True)) xs = list(xs) xs.sort() xs = array.array('L', xs) seqs.append((nm.split()[0], xs)) Z = 1 if opts['-a']: Z = len(seqs) print len(seqs) for i in xrange(Z): xnm = seqs[i][0] xs = seqs[i][1] for j in xrange(i + 1, len(seqs)): ynm = seqs[j][0] ys = seqs[j][1] (isec, union, d) = jaccard(xs, ys) if p is None: print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % ( xnm, ynm, len(xs), len(ys), isec, union, d) else: pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10) q05 = quantBeta(0.05, isec + 1, (union - isec) + 1) q95 = quantBeta(0.95, isec + 1, (union - isec) + 1) print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % ( xnm, ynm, len(xs), len(ys), isec, union, d, d - q05, q95 - d, pv) sys.stdout.flush() return Z = 1 if opts['-a']: Z = len(fns) for i in xrange(Z): with kmers(fns[i], 'r') as z0: xK = z0.meta['K'] xs = array.array('L', readKmers(z0)) for j in xrange(i + 1, len(fns)): with kmers(fns[j], 'r') as z1: yK = z1.meta['K'] ys = array.array('L', readKmers(z1)) if xK != yK: print >> sys.stderr, 'mismatched K:', fns[j] sys.exit(1) (isec, union, d) = jaccard(xs, ys) if p is None: print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % ( fns[i], fns[j], len(xs), len(ys), isec, union, d) else: pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10) q05 = quantBeta(0.05, isec + 1, (union - isec) + 1) q95 = quantBeta(0.95, isec + 1, (union - isec) + 1) print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % ( fns[i], fns[j], len(xs), len(ys), isec, union, d, d - q05, q95 - d, pv) sys.stdout.flush()
def main(argv): opts = docopt.docopt(__doc__, argv) K = 25 nms = [] idx = {} for (nm, seq) in readFasta(openFile(opts['<baits>'])): n = len(nms) nms.append(nm) for x in kmersList(K, seq, True): if x not in idx: idx[x] = set([]) idx[x].add(n) for x in idx.keys(): idx[x] = list(idx[x]) idx[x].sort() anti = set([]) if opts['-U']: with openFile(opts['-U']) as f: for (nm, seq) in readFasta(f): for x in kmersList(K, seq, True): anti.add(x) rn = 0 if opts['-p']: hist = {} for (fn1, fn2) in pairs(opts['<input>']): tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq')) for i in xrange(len(nms))] cache = [[[], []] for i in xrange(len(nms))] counts = [0 for i in xrange(len(nms))] with openFile(fn1) as f1, openFile(fn2) as f2: for fq1, fq2 in both(readFastq(f1), readFastq(f2)): hits = set([]) pushup = False for x in kmersList(K, fq1[1]): if x in anti: pushup = True break for i in idx.get(x, []): hits.add(i) for x in kmersList(K, fq2[1]): if x in anti: pushup = True break for i in idx.get(x, []): hits.add(i) if pushup: continue n = len(hits) hist[n] = 1 + hist.get(n, 0) for i in hits: counts[i] += 1 cache[i][0].append(fq1) cache[i][1].append(fq2) if len(cache[i][0]) >= 1024: with open(tmps[i][0], 'a') as f: for rd in cache[i][0]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] with open(tmps[i][1], 'a') as f: for rd in cache[i][1]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] cache[i][0] = [] cache[i][1] = [] for i in xrange(len(cache)): if len(cache[i][0]) > 0: with open(tmps[i][0], 'a') as f: for rd in cache[i][0]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] with open(tmps[i][1], 'a') as f: for rd in cache[i][1]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] cache[i][0] = [] cache[i][1] = [] with zipfile.ZipFile(opts['<output>'], 'w', zipfile.ZIP_DEFLATED) as z: for i in xrange(len(nms)): if counts[i] > 0: pth = '/'.join(nms[i].split()) z.write(tmps[i][0], pth + '/' + fn1) 
os.remove(tmps[i][0]) z.write(tmps[i][1], pth + '/' + fn2) os.remove(tmps[i][1]) hist = hist.items() hist.sort() for (n, f) in hist: print '%d\t%d' % (n, f) else: raise "not implemented"
def main(argv): opts = docopt.docopt(__doc__, argv) K = int(opts['-k']) M = (1 << (2*K)) - 1 paired = True if opts['-s']: paired = False p = float(opts['-p']) T = int(M * p) if opts['-r']: refs = [] with openFile(opts['-r']) as f: for (nm, seq) in readFasta(f): refs += kmersList(K, seq, False) refs = set(refs) kill = set([]) for x in refs: y = rc(K, x) if y in refs: kill.add(x) kill.add(y) print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs)) refs -= set(kill) fwd = {} rev = {} for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']): fn = 0 for x in xs: if x in refs: fn += 1 ys = [rc(K, x) for x in xs] rn = 0 for y in ys: if y in refs: rn += 1 if fn + rn == 0: continue q = float(fn) / float(fn + rn) if random.random() < q: for x in xs: fwd[x] = 1 + fwd.get(x, 0) else: for y in ys: rev[y] = 1 + rev.get(y, 0) for (x,xc) in fwd.iteritems(): y = rc(K, x) yc = 0 if y in rev: yc = rev[y] del rev[y] print '%d\t%d' % (xc, yc) for (y,yc) in rev.iteritems(): print '%d\t%d' % (0, yc) return kx = {} for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']): for x in xs: if x in kx: kx[x] += 1 continue y = rc(K, x) z = murmer(min(x, y), 17) if (z & M) > T: continue kx[x] = 1 for x in kx.keys(): y = rc(K, x) if x > y: continue xc = kx[x] yc = kx.get(y, 0) if murmer(x, 17) >= murmer(y, 17): (a, b) = (x, y) (ac, bc) = (xc, yc) else: (a, b) = (y, x) (ac, bc) = (yc, xc) #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc) print '%d\t%d' % (ac, bc)
def prepareBedFileGene(geneListFn, refGeneFn, outFn):
    """Write a BED-style exon file for the genes named in geneListFn.

    geneListFn -- text file; the first whitespace-separated field of each
                  line is a gene name.
    refGeneFn  -- whitespace-separated annotation table; fields 1, 2, 3,
                  9, 10 and 12 are read as transcript, chromosome, strand,
                  comma-separated exon starts, comma-separated exon ends
                  and gene name.
    outFn      -- output file; one line per exon:
                  chrom, start, end, "<gene>/<NN>", strand.

    Exons from all transcripts of a gene are pooled, and overlapping or
    abutting exons are merged.  Exons are numbered from 1 in ascending
    coordinate order, or descending order for genes on the '-' strand.
    Chromosome names containing '_' are ignored.  A transcript whose
    (chromosome, strand) differs from the first one seen for its gene is
    reported on stderr and skipped; genes with no usable transcript are
    reported on stderr and omitted.
    """
    exonsByGene = {}
    with openFile(geneListFn) as f:
        for line in f:
            exonsByGene[line.split()[0]] = set([])

    # gene -> (chromosome, strand) of the first transcript seen.
    placement = {}
    with openFile(refGeneFn) as f:
        for line in f:
            cols = line.split()
            (tx, ch, st, g) = (cols[1], cols[2], cols[3], cols[12])

            # Ignore chr6_cox_hap2 and friends.
            if '_' in ch or g not in exonsByGene:
                continue

            if g not in placement:
                placement[g] = (ch, st)
            elif placement[g] != (ch, st):
                print >> sys.stderr, 'gene tx seen in multiple chromosomes: %s [%s] (%s%s, %s%s)' % (
                    g, tx, placement[g][0], placement[g][1], ch, st)
                continue

            starts = [int(s) for s in cols[9].split(',') if len(s) > 0]
            ends = [int(e) for e in cols[10].split(',') if len(e) > 0]
            exonsByGene[g].update(zip(starts, ends))

    geneNames = sorted(exonsByGene.keys())

    for g in geneNames:
        if len(exonsByGene[g]) == 0:
            print >> sys.stderr, 'no annotated transcripts for gene: %s' % (
                g, )
            continue
        ordered = sorted(exonsByGene[g])
        merged = [ordered[0]]
        for (s, e) in ordered[1:]:
            (prevS, prevE) = merged[-1]
            if prevE >= s:
                # Overlaps or abuts the previous exon: extend it.
                merged[-1] = (min(prevS, s), max(prevE, e))
            else:
                merged.append((s, e))
        if len(merged) != len(ordered):
            exonsByGene[g] = merged

    with openFile(outFn, 'w') as out:
        for g in geneNames:
            if len(exonsByGene[g]) == 0:
                continue
            (ch, st) = placement[g]
            exons = sorted(exonsByGene[g])
            if st == '-':
                exons.reverse()
            for (i, (s, e)) in enumerate(exons):
                print >> out, '%s\t%d\t%d\t%s/%02d\t%s' % (ch, s, e, g,
                                                           i + 1, st)