def get_regions(self):
    total = 0
    for chrom, chrom_len in self.chrom_lens:
        total += (chrom_len / self.binsize)
        if chrom_len % self.binsize != 0:
            total += 1

    eta = ETA(total)
    pos_acc = 0

    for chrom, chrom_len in self.chrom_lens:
        pos = -1
        for bin in xrange(0, chrom_len, self.binsize):
            if pos > -1:
                eta.print_status(pos_acc, extra='%s:%s[+]' % (chrom, bin))
                yield (chrom, [pos], [bin], '+', [chrom, pos, bin, '+'], None)
                if self.stranded:
                    eta.print_status(pos_acc, extra='%s:%s[-]' % (chrom, bin))
                    yield (chrom, [pos], [bin], '-', [chrom, pos, bin, '-'], None)
            pos = bin
            pos_acc += 1

        # final (possibly partial) bin runs to the end of the chromosome
        eta.print_status(pos_acc, extra='%s:%s[+]' % (chrom, chrom_len))
        yield (chrom, [pos], [chrom_len], '+', [chrom, pos, chrom_len, '+'], None)
        if self.stranded:
            eta.print_status(pos_acc, extra='%s:%s[-]' % (chrom, chrom_len))
            yield (chrom, [pos], [chrom_len], '-', [chrom, pos, chrom_len, '-'], None)

    eta.done()
def Loop(TotalPosts):
    cnt = 0
    Bal = True
    PageNum = 0
    MakeURL()
    eta = ETA(TotalPosts / 320)
    outfile = open(str(total[0]) + str(total[1]) + '.txt', 'w+')
    while Bal:
        Bal = APIConnection(PageNum + 1)
        time.sleep(RateLimit / 1000)
        if cnt > TotalPosts:
            Bal = False
        cnt += 320
        eta.print_status(PageNum)
        PageNum += 1
    eta.done()

    print('Parsing XML')
    for each in XMList:
        root = ET.fromstring(each.content)
        for e in root.findall('post'):
            outfile.write('https://e621.net/post/show/' + e.find('id').text + '\n')
        print(' ' + str(cnt) + ' ' + str(TotalPosts))
    outfile.close()

    # use float math so the percentage isn't truncated by integer division
    pct = 100.0 * cnt / TotalPosts
    print('DL ' + str(cnt) + ' Posts or ' + str(pct) + '%')
def gzip_reader(fname, quiet=False, callback=None, done_callback=None):
    if fname == '-':
        f = sys.stdin
    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
        f = gzip.open(os.path.expanduser(fname))
    else:
        f = open(os.path.expanduser(fname))

    if quiet or fname == '-':
        eta = None
    else:
        eta = ETA(os.stat(fname).st_size, fileobj=f)

    for line in f:
        if eta:
            if callback:
                extra = callback()
            else:
                extra = ''
            eta.print_status(extra=extra)
        yield line
        if done_callback and done_callback():
            break

    if f != sys.stdin:
        f.close()
    if eta:
        eta.done()
def _gen1():
    if not self.quiet:
        eta = ETA(self.regions.total)
    else:
        eta = None

    count = 0
    for region in self.regions:
        working_chrom = None
        if region.chrom in self.bam.references:
            working_chrom = region.chrom
        elif region.chrom[0:3] == 'chr':
            if region.chrom[3:] in self.bam.references:
                working_chrom = region.chrom[3:]

        if not working_chrom:
            continue

        # for troubleshooting
        self.cur_chrom = region.chrom
        self.cur_start = region.start
        self.cur_end = region.end

        laststart = 0
        for read in self.bam.fetch(working_chrom, region.start, region.end):
            if read.pos != laststart:
                count += 1
                laststart = read.pos

            if eta:
                eta.print_status(count, extra='%s/%s %s:%s' % (count, self.regions.total, self.bam.references[read.tid], read.pos))

            yield read

    if eta:
        eta.done()
def bam_extract(inbam, outbam, bedfile, nostrand=False, quiet=False):
    bed = BedFile(bedfile)
    if not quiet:
        eta = ETA(os.stat(bedfile).st_size, fileobj=bed)
    else:
        eta = None

    passed = 0

    for region in bed:
        if eta:
            eta.print_status(extra="extracted:%s" % (passed))

        if region.chrom not in inbam.references:
            continue

        if not nostrand:
            strand = region.strand
        else:
            strand = None

        for read in bam_extract_reads(inbam, region.chrom, region.start, region.end, strand):
            outbam.write(read)
            passed += 1

    if not quiet:
        eta.done()

    sys.stderr.write("%s extracted\n" % (passed,))
def bam_iter(bam, quiet=False, show_ref_pos=False, callback=None):
    '''
    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
    '''
    if not quiet and bam.filename:
        eta = ETA(os.stat(bam.filename).st_size)
    else:
        eta = None

    if os.path.exists('%s.bai' % bam.filename):
        # This is an indexed file, so it is ref sorted...
        # Meaning that we should show chrom:pos, instead of read names
        show_ref_pos = True

    for read in bam:
        pos = bam.tell()
        bgz_offset = pos >> 16

        if not quiet and eta:
            if callback:
                eta.print_status(bgz_offset, extra=callback(read))
            elif show_ref_pos:
                if read.tid > -1:
                    eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
                else:
                    eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
            else:
                eta.print_status(bgz_offset, extra='%s' % read.qname)

        yield read

    if eta:
        eta.done()
def _scheduleJobs(self, jobs):
    '''
    Strategy for running jobs: tighten the wait (down to the configured
    minimum) after a success, and back off (double the wait) after a failure.
    '''
    time_interval = self.time_interval
    with open(self.log_file, 'a') as logFile:
        n_err = 0
        eta = ETA(len(jobs))
        for i in xrange(len(jobs)):
            eta.print_status(i)
            time1 = time.time()
            result = self._doJob(jobs[i], logFile)
            if result is not None:
                self._saveLog(logFile, jobs[i], success=True)
                self._collect(result)
                if self.result_file is not None:
                    self._saveResult()
                time_interval = time_interval / 4
                if time_interval < self.time_interval:
                    time_interval = self.time_interval
            else:
                self._saveLog(logFile, jobs[i], success=False)
                n_err += 1
                time_interval = time_interval * 2
                if n_err == self.error_times:
                    break
            # sleep for the remainder of the adaptive interval
            time_span = time.time() - time1
            if time_span < time_interval:
                time.sleep(time_interval - time_span)
        eta.done()
def get_regions(self):
    eta = ETA(self.gtf.fsize(), fileobj=self.gtf)
    for gene in self.gtf.genes:
        eta.print_status(extra=gene.gene_name)
        starts = []
        ends = []

        # just include all regions - don't worry about transcripts and exons
        # the regions encompass all exons anyway...
        for num, start, end, const, names in gene.regions:
            starts.append(start)
            ends.append(end)

        out = [gene.gene_id, gene.gene_name]
        if self.has_isoform:
            out.append(gene.attributes['isoform_id'] if 'isoform_id' in gene.attributes else '')
        if self.has_biotype:
            out.append(gene.attributes['gene_biotype'] if 'gene_biotype' in gene.attributes else '')

        out.extend([gene.chrom, gene.strand, gene.start, gene.end])

        yield (gene.chrom, starts, ends, gene.strand, out, None)
    eta.done()
def fastq_sort(fastq, byname=True, bysequence=False, tmpdir=None, chunksize=100000, out=sys.stdout, quiet=False):
    tmpfiles = []
    chunk = []

    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        if byname:
            chunk.append((read.name, read))
        if bysequence:
            chunk.append((read.seq, read))
        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk))
            chunk = []
    if chunk:
        tmpfiles.append(_write_tmp(chunk))

    sys.stderr.write('Merging chunks...\n')

    buf = [None, ] * len(tmpfiles)
    skip = [False, ] * len(tmpfiles)

    eta = ETA(count)
    j = 0
    writing = True
    while writing:
        j += 1
        eta.print_status(j)
        for i, fobj in enumerate(tmpfiles):
            if not buf[i] and not skip[i]:
                try:
                    read = fastq_read_file(fobj)
                    if byname:
                        buf[i] = (read.name, i, read)
                    if bysequence:
                        buf[i] = (read.seq, i, read)
                except:
                    buf[i] = None
                    skip[i] = True

        sorted_list = buf[:]
        sorted_list.sort()
        writing = False

        for tup in sorted_list:
            if not tup:
                continue
            sorter, i, read = tup
            read.write(out)
            buf[i] = None
            writing = True
            break
    eta.done()
def get_regions(self):
    eta = ETA(self.bed.length, fileobj=self.bed)
    for region in self.bed:
        eta.print_status(extra='%s:%s-%s[%s]' % (region.chrom, region.start, region.end, region.strand))
        yield (region.chrom, [region.start], [region.end], region.strand, [region.chrom, region.start, region.end, region.name, region.score, region.strand], None)
    eta.done()
class Callback(object):
    def __init__(self, total):
        self.i = 0
        self.eta = ETA(total)

    def __call__(self, result=None):
        self.i += 1
        self.eta.print_status(self.i, extra=result if result else '')

    def done(self):
        self.eta.done()
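# Usage sketch for the Callback class above: wiring it into a
# multiprocessing.Pool so each completed task advances the shared ETA.
# `process_item` and `items` are hypothetical stand-ins, not part of the
# original code.
import multiprocessing

def process_item(item):
    return 'done:%s' % item  # hypothetical worker; its return value becomes the ETA "extra" text

if __name__ == '__main__':
    items = range(100)
    cb = Callback(len(items))
    pool = multiprocessing.Pool()
    for item in items:
        # apply_async invokes the callback with the worker's result,
        # which matches Callback.__call__(result=None)
        pool.apply_async(process_item, (item,), callback=cb)
    pool.close()
    pool.join()
    cb.done()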
def gtf_junctions(gtf, refname, fragment_size, min_size, max_exons=5, known=False, out=sys.stdout, quiet=False, scramble=False, retain_introns=False):
    ref = pysam.Fastafile(refname)

    references = []
    with open('%s.fai' % refname) as f:
        for line in f:
            cols = line.split('\t')
            references.append(cols[0])

    if not quiet:
        eta = ETA(gtf.fsize(), fileobj=gtf)
    else:
        eta = None

    exporter = JunctionExporter(ref, fragment_size, min_size, max_exons, out, scramble)

    for gene in gtf.genes:
        if gene.chrom not in references:
            continue

        if eta:
            eta.print_status(extra='%s:%s %s' % (gene.chrom, gene.start, gene.gene_name))

        if known:
            for txpt in gene.transcripts:
                last = None
                for exon in txpt.exons:
                    if last:
                        exporter.export(gene.chrom, [last, exon])
                    last = exon
        else:
            exons = set()
            for txpt in gene.transcripts:
                for exon in txpt.exons:
                    exons.add(exon)

            exons = list(exons)
            exons.sort()

            if retain_introns:
                exporter.export_retained_introns(gene.chrom, exons, gene.strand)

            if scramble:
                # We can just pretend the transcript is repeated
                # and then let the set take care of removing the duplicates
                exons = exons * 2

            exporter.export(gene.chrom, exons)

    if eta:
        eta.done()

    ref.close()
def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int):
    """Computes optimal sequence of player opcodes to reproduce audio data."""
    dlen = len(data)
    # TODO: avoid temporarily doubling memory footprint to concatenate
    data = numpy.concatenate(
        [data, numpy.zeros(lookahead_steps, dtype=numpy.float32)])

    voltage = -1.0
    position = -1.0

    # Pre-warm cache so we don't skew ETA during encoding
    for i in range(2048):
        _, _ = opcodes.candidate_opcodes(frame_horizon(i, lookahead_steps),
                                         lookahead_steps)

    total_err = 0.0
    frame_offset = 0
    eta = ETA(total=1000)
    i = 0
    last_updated = 0
    opcode_counts = collections.defaultdict(int)

    while i < dlen:
        if (i - last_updated) > int(dlen / 1000):
            eta.print_status()
            last_updated = i

        candidate_opcodes, voltages = opcodes.candidate_opcodes(
            frame_horizon(frame_offset, lookahead_steps), lookahead_steps)
        opcode_idx = lookahead(step, position, data, i, voltage * voltages)
        opcode = candidate_opcodes[opcode_idx][0]
        opcode_counts[opcode] += 1
        yield opcode

        position, voltage, new_error, i = evolve(opcode, position, voltage,
                                                 step, data, i)
        total_err += new_error
        frame_offset = (frame_offset + 1) % 2048

    for _ in range(frame_offset % 2048, 2047):
        yield opcodes.Opcode.NOTICK_6
    yield opcodes.Opcode.EXIT
    eta.done()

    print("Total error %f" % total_err)
    print("Opcodes used:")
    for v, k in sorted(list(opcode_counts.items()), key=lambda kv: kv[1],
                       reverse=True):
        print("%s: %d" % (v, k))
def output(self, partitions):
    for pn, part in enumerate(partitions):
        d = os.path.abspath(os.path.join(self.dst, self.name % pn))
        if os.path.isfile(d):
            logging.warning('Archive already exists, overwriting: ' + d)
        logging.info('Creating archive %s...' % (self.name % pn))
        eta = ETA(part.size, min_ms_between_updates=500)
        done = 0
        with tarfile.open(d, self.mode) as tar:
            for fn, size, estsize in part.filelist:
                try:
                    tar.add(os.path.join(self.srcbase, fn), fn)
                except Exception as ex:
                    logging.error(ex)
                # estsize is per-file; keep a running total for the ETA position
                done += estsize
                eta.print_status(done)
        eta.done()
def output(self, partitions):
    for pn, part in enumerate(partitions):
        d = os.path.abspath(os.path.join(self.dst, self.name % pn))
        if os.path.isfile(d):
            logging.warning('Archive already exists, overwriting: ' + d)
        logging.info('Creating archive %s...' % (self.name % pn))
        eta = ETA(part.size, min_ms_between_updates=500)
        done = 0
        with zipfile.ZipFile(d, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            for fn, size, estsize in part.filelist:
                try:
                    zipf.write(os.path.join(self.srcbase, fn), fn)
                except Exception as ex:
                    logging.error(ex)
                # estsize is per-file; keep a running total for the ETA position
                done += estsize
                eta.print_status(done)
        eta.done()
def gzip_aware_reader(fname, callback=None):
    if fname[-3:] == '.gz':
        f = gzip.open(fname)
    else:
        f = open(fname)

    eta = ETA(os.stat(fname).st_size, fileobj=f)
    for line in f:
        extra = None
        if callback:
            extra = callback()
        eta.print_status(extra=extra)
        yield line

    f.close()
    eta.done()
def fetch(self, quiet=False):
    if self.fname and not quiet:
        eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
    else:
        eta = None

    while True:
        try:
            read = fastq_read_file(self.fileobj)
            if eta:
                eta.print_status(extra=read.name)
            yield read
        except:
            break

    if eta:
        eta.done()
def fetch(self, quiet=False):
    name = ''
    comment = ''
    seq = ''

    if not quiet and self.fname and self.fname != '-':
        eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
    else:
        eta = None

    for line in self.fileobj:
        line = line.strip()
        if not line:
            continue
        if line[0] == '#':
            continue
        if line[0] == '>':
            if name and seq:
                if eta:
                    eta.print_status(extra=name)
                yield FASTARead(name, comment, seq)

            spl = re.split(r'[ \t]', line[1:], maxsplit=1)
            name = spl[0]
            if len(spl) > 1:
                comment = spl[1]
            else:
                comment = ''
            seq = ''
        else:
            if self.qual:
                seq = seq + ' ' + line
            else:
                seq += line

    if name and seq:
        if eta:
            eta.print_status(extra=name)
        yield FASTARead(name, comment, seq)

    if eta:
        eta.done()
def _repeatreader(fname):
    with ngsutils.support.ngs_utils.gzip_opener(fname) as repeat_f:
        eta = ETA(os.stat(fname).st_size, fileobj=repeat_f)

        # skip the three header lines
        repeat_f.next()
        repeat_f.next()
        repeat_f.next()

        for line in repeat_f:
            cols = line.strip().split()
            chrom = cols[4]
            start = int(cols[5]) - 1
            end = int(cols[6])
            strand = '+' if cols[8] == '+' else '-'
            family = cols[10]
            member = cols[9]

            eta.print_status(extra='%s|%s %s:%s-%s[%s]' % (family, member, chrom, start, end, strand))
            yield (family, member, chrom, start, end, strand)
        eta.done()
def output(self, partitions):
    for pn, part in enumerate(partitions):
        d = os.path.abspath(os.path.join(self.dst, self.name % pn))
        logging.info('Copying to %s' % d)
        eta = ETA(part.size, min_ms_between_updates=500)
        done = 0
        for fn, size, estsize in part.filelist:
            src = os.path.join(self.srcbase, fn)
            dst = os.path.join(d, fn)
            try:
                if os.path.isdir(src):
                    os.makedirs(dst, exist_ok=True)
                else:
                    os.makedirs(os.path.dirname(dst), exist_ok=True)
                    shutil.copy2(src, dst)
            except Exception as ex:
                logging.error(ex)
                continue
            # estsize is per-file; keep a running total for the ETA position
            done += estsize
            eta.print_status(done)
        eta.done()
def gtf_junctions(gtf, refname, fragment_size, min_size, max_exons=5, known=False, out=sys.stdout, quiet=False):
    ref = pysam.Fastafile(refname)

    references = []
    with open("%s.fai" % refname) as f:
        for line in f:
            cols = line.split("\t")
            references.append(cols[0])

    if not quiet:
        eta = ETA(gtf.fsize(), fileobj=gtf)
    else:
        eta = None

    exporter = JunctionExporter(ref, fragment_size, min_size, max_exons, out)

    for gene in gtf.genes:
        if gene.chrom not in references:
            continue

        if eta:
            eta.print_status(extra="%s:%s %s" % (gene.chrom, gene.start, gene.gene_name))

        if known:
            for txpt in gene.transcripts:
                last = None
                for exon in txpt.exons:
                    if last:
                        exporter.export(gene.chrom, [last, exon])
                    last = exon
        else:
            exons = set()
            for txpt in gene.transcripts:
                for exon in txpt.exons:
                    exons.add(exon)

            exons = list(exons)
            exons.sort()
            exporter.export(gene.chrom, exons)

    if eta:
        eta.done()

    ref.close()
def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
    if not quiet and bam.filename:
        eta = ETA(os.stat(bam.filename).st_size)
    else:
        eta = None

    for pileup in bam.pileup(mask=mask):
        pos = bam.tell()
        bgz_offset = pos >> 16

        if eta:
            if callback:
                eta.print_status(bgz_offset, extra=callback(pileup))
            else:
                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))

        yield pileup

    if eta:
        eta.done()
def fetch(self, quiet=False, callback=None):
    if self.fname and not quiet:
        eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
    else:
        eta = None

    while True:
        try:
            read = fastq_read_file(self.fileobj)
            if eta:
                if callback:
                    eta.print_status(extra=callback())
                else:
                    eta.print_status(extra=read.name)
            yield read
        except StopIteration:
            break

    if eta:
        eta.done()
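# Hedged usage sketch for the callback hook above: a small stateful callable
# that reports how many reads passed a filter while fetch() drives the ETA
# display. The `fastq` reader instance and the length cutoff are assumptions
# for illustration, not part of the original code.
class PassCounter(object):
    def __init__(self):
        self.passed = 0

    def __call__(self):
        # whatever this returns is shown as the ETA "extra" text
        return 'passed:%s' % self.passed

counter = PassCounter()
for read in fastq.fetch(callback=counter):
    if len(read.seq) >= 50:  # hypothetical filter
        counter.passed += 1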
def run(predict, test_size, n_iter=100, n_burnin=10, resample=None):
    p_tot = LogR(0)
    eta = ETA(n_iter)
    eta.print_status(0, extra='starting...')
    for i in range(n_iter):
        print polya.timestamp(), "iteration %u/%u" % (i + 1, n_iter)
        polya.resample()
        if resample:
            resample()
        p = predict()
        pplx = float(p ** (-1. / test_size))
        print polya.timestamp(), 'perplexity =', pplx
        if i < n_burnin:
            eta.print_status(i + 1, extra="burning in (%.1f)..." % pplx)
        else:
            p_tot += p
            pplx = float((p_tot / (i + 1 - n_burnin)) ** (-1. / test_size))
            eta.print_status(i + 1, extra="perplexity %.1f" % pplx)
    eta.done()
    p_avg = p_tot / (n_iter - n_burnin)
    pplx = float(p_avg ** (-1. / test_size))
    print '---\nfinal perplexity =', pplx
    print >>sys.stderr, 'Perplexity:', pplx
    return p_avg
def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
    if fileobj:
        f = fileobj
    elif fname == '-':
        f = sys.stdin
    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
        f = gzip.open(os.path.expanduser(fname))
    else:
        f = open(os.path.expanduser(fname))

    if quiet or fname == '-':
        eta = None
    else:
        eta = ETA(os.stat(fname).st_size, fileobj=f)

    for line in f:
        if eta:
            if callback:
                extra = callback()
            else:
                extra = ''
            eta.print_status(extra=extra)
        yield line
        if done_callback and done_callback():
            break

    # don't close stdin or a caller-provided file object
    if not fileobj and f != sys.stdin:
        f.close()
    if eta:
        eta.done()
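# Minimal usage sketch for gzip_reader; the input path is hypothetical. The
# callback surfaces a running record count in the ETA line as the (possibly
# gzipped) file streams through.
records = [0]  # mutable so the closure can update it

def _progress():
    return 'records:%s' % records[0]

for line in gzip_reader('input.txt.gz', callback=_progress):
    if line.strip():
        records[0] += 1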
eta = ETA(20)  # one list file per iteration below
for i in range(20):
    eta.print_status(i)
    with open(list_dir + 'list_' + str(i)) as list_file:
        list_reader = csv.reader(list_file, delimiter='\t')
        for row in list_reader:
            article_id = str(row[0])
            article_dir = sub_dir + row[0] + '.article'
            with open(article_dir, 'r') as article_file:
                article = json.load(article_file)
            article_id = article['id']

            if article["social_counts"]["nujij_comments"] > 0:
                comment_dir = sub_dir + row[0] + '.comment'
                with open(comment_dir, 'r') as comment_file:
                    comment_dict = json.load(comment_file)
                comment_count = len(comment_dict['comments'])
            else:
                comment_count = 0

            for section in article['sections']:
                if 'canonical' in section:
                    if section['style'] is not None:
                        article_date_df = article_date_df.append(pd.DataFrame({'article': [str(article['id'])], 'date': [article['published_at']], 'section': [section['style']], 'comment_count': [comment_count]}, index=[article_id]))
                    else:
                        article_date_df = article_date_df.append(pd.DataFrame({'article': [str(article['id'])], 'date': [article['published_at']], 'section': [section['name']], 'comment_count': [comment_count]}, index=[article_id]))

#article_date_df['section_counts'] = article_date_df.groupby(['section']).transform('count')
eta.done()

print article_date_df['section'].value_counts()
print article_date_df.shape

with open('section-count.csv', 'w') as csvfile:
    article_date_df.to_csv(csvfile, encoding='utf-8')
def get_regions(self):
    eta = ETA(self.gtf.fsize(), fileobj=self.gtf)
    for gene in self.gtf.genes:
        eta.print_status(extra=gene.gene_name)
        starts = []
        ends = []
        const_spans = []

        geneout = [gene.gene_id, gene.gene_name]
        if self.has_isoform:
            geneout.append(gene.attributes['isoform_id'] if 'isoform_id' in gene.attributes else '')
        if self.has_biotype:
            geneout.append(gene.attributes['gene_biotype'] if 'gene_biotype' in gene.attributes else '')

        geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

        was_last_const = False
        for num, start, end, const, names in gene.regions:
            starts.append(start)
            ends.append(end)

            # assemble a list of lists with contiguous spans of constant regions
            # this will let us count junction-spanning reads that cover two
            # constant exons/regions
            if const:
                if not was_last_const:
                    const_spans.append([])
                const_spans[-1].append((start, end))
                was_last_const = True
            else:
                was_last_const = False

        def callback(bam, common_count, common_reads, common_cols):
            # gather constant reads
            const_count = 0
            for span in const_spans:
                starts = []
                ends = []
                for start, end in span:
                    starts.append(start)
                    ends.append(end)

                count, reads = _fetch_reads(
                    bam, gene.chrom,
                    gene.strand if self.stranded else None,
                    starts, ends, self.multiple, False,
                    self.whitelist, self.blacklist, self.uniq_only,
                    self.library_type)
                const_count += count

            # find counts for each region
            for num, start, end, const, names in gene.regions:
                count, reads = _fetch_reads(
                    bam, gene.chrom,
                    gene.strand if self.stranded else None,
                    [start], [end], self.multiple, False,
                    self.whitelist, self.blacklist, self.uniq_only,
                    self.library_type)
                excl_count, excl_reads = _fetch_reads_excluding(
                    bam, gene.chrom,
                    gene.strand if self.stranded else None,
                    start, end, self.multiple,
                    self.whitelist, self.blacklist, self.library_type)

                # remove reads that exclude this region
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count = count - 1

                # find reads that *aren't* in this region
                other_reads = 0
                for read in common_reads:
                    if read not in reads and read not in excl_reads:
                        other_reads += 1

                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                if len(common_reads) > 0:
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                cols = common_cols[:]
                cols.append(start)
                cols.append(end)
                cols.append(const_count)
                cols.append(num)
                cols.append('const' if const else 'alt')
                cols.append(count)
                cols.append(excl_count)
                cols.append(incl_pct)
                cols.append(excl_pct)
                cols.append(altindex)
                yield cols

        yield (gene.chrom, starts, ends, gene.strand, geneout, callback)
    eta.done()
def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
    '''
    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
    '''
    if os.path.exists('%s.bai' % bam.filename):
        # This is an indexed file, so it is ref sorted...
        # Meaning that we should show chrom:pos, instead of read names
        show_ref_pos = True

    eta = None

    if not ref:
        if not quiet and bam.filename:
            eta = ETA(os.stat(bam.filename).st_size)

        for read in bam:
            pos = bam.tell()
            bgz_offset = pos >> 16

            if not quiet and eta:
                if callback:
                    eta.print_status(bgz_offset, extra=callback(read))
                elif show_ref_pos:
                    if read.tid > -1:
                        eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
                    else:
                        eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
                else:
                    eta.print_status(bgz_offset, extra='%s' % read.qname)

            yield read
    else:
        working_chrom = None
        if ref in bam.references:
            working_chrom = ref
        elif ref[0:3] == 'chr':
            # compensate for Ensembl vs UCSC ref naming
            if ref[3:] in bam.references:
                working_chrom = ref[3:]

        if not working_chrom:
            raise ValueError('Missing reference: %s' % ref)

        tid = bam.gettid(working_chrom)

        if not start:
            start = 0
        if not end:
            end = bam.lengths[tid]

        if not quiet and bam.filename:
            eta = ETA(end - start)

        for read in bam.fetch(working_chrom, start, end):
            if not quiet and eta:
                if callback:
                    eta.print_status(read.pos - start, extra=callback(read))
                else:
                    eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))

            yield read

    if eta:
        eta.done()
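# Usage sketch for region-restricted iteration (the file name is
# hypothetical; bam_open is the helper already referenced in the doctest
# above): with ref/start/end given, progress is reported by genomic position
# (read.pos - start) rather than by BGZF file offset.
bam = bam_open('sample.bam')
for read in bam_iter(bam, ref='chr1', start=1000000, end=2000000):
    pass  # process each aligned read here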
def scanpaths(self, paths, prefix=None):
    prefix = prefix or os.path.join(*os.path.commonprefix(tuple(map(splitpath, map(os.path.abspath, paths)))))
    fl = []
    estsize = 0
    ignored = []
    logging.info("Scanning files...")
    for path in paths:
        if os.path.isfile(path):
            try:
                relfn = os.path.relpath(path, prefix)
                if self.ffilter(relfn, prefix):
                    filesize = os.path.getsize(path)
                    fl.append((os.path.relpath(path, prefix), filesize, filesize))
                    estsize += min(self.samplesize, filesize)
                else:
                    ignored.append(path)
            except Exception as ex:
                logging.error(ex)
        else:
            for root, dirs, files in os.walk(path):
                for name in files:
                    fn = os.path.join(root, name)
                    relfn = os.path.relpath(fn, prefix)
                    try:
                        if self.ffilter(relfn, prefix):
                            filesize = os.path.getsize(fn)
                            fl.append((relfn, filesize, filesize))
                            estsize += min(self.samplesize, filesize)
                        else:
                            ignored.append((relfn, os.path.getsize(fn)))
                    except Exception as ex:
                        logging.error(ex)
                        # file access error -> ignore
                        ignored.append((relfn, 0))
                for name in dirs:
                    fn = os.path.join(root, name)
                    # not ignoring empty dirs
                    if not os.listdir(fn):
                        fl.append((os.path.relpath(fn + '/', prefix), 0, 0))

    # estimate compressed size
    if callable(self.compressfunc):
        logging.info("Calculating estimated compressed size...")
        eta = ETA(estsize, min_ms_between_updates=500)
        estcurrent = 0
        for k, v in enumerate(fl):
            filename, size, size2 = v
            fn = os.path.join(prefix, filename)
            try:
                fl[k] = (filename, size, self.estcompresssize(fn, size))
            except Exception as ex:
                logging.exception("Can't access " + fn)
            estcurrent += min(self.samplesize, size)
            eta.print_status(estcurrent)
        eta.done()

    if self.totalsizelim:
        filtered = []
        sizesum = 0
        maxfilesize = 0
        for k, v in sorted(enumerate(fl), key=lambda x: x[1][2]):
            filename, origsize, size = v
            if sizesum + size > self.totalsizelim:
                ignored.append(fl[k][:2])
                if not maxfilesize:
                    maxfilesize = origsize
            else:
                filtered.append(fl[k])
                sizesum += size
        fl = filtered
        if maxfilesize:
            logging.info("Max file size is " + sizeof_fmt(maxfilesize))

    return fl, ignored
def __init__(self, filename=None, cache_enabled=True, quiet=False, fileobj=None):
    if not filename and not fileobj:
        raise ValueError('Must pass either a filename or a fileobj')

    if fileobj:
        fobj = fileobj
        cache_enabled = False
        eta = None
    else:
        fobj = gzip_aware_open(filename)
        eta = ETA(os.stat(filename).st_size, fileobj=fobj)
        cachefile = os.path.join(os.path.dirname(filename), '.%s.cache' % os.path.basename(filename))

    self._genes = {}
    self._pos = 0
    self._gene_bins = {}
    self._gene_names = {}
    self._gene_ids = {}
    warned = False

    if cache_enabled and os.path.exists(cachefile):
        self._load_cache(cachefile)

    if not self._genes:
        if not quiet:
            sys.stderr.write('Reading GTF file... (%s) \n' % filename)

        for linenum, line in enumerate(fobj):
            try:
                idx = line.find('#')
                if idx > -1:
                    if idx == 0:
                        continue
                    line = line[:idx]  # strip trailing comment

                chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip().split('\t')
                source = symbols[source]
                start = int(start) - 1  # Note: 1-based
                end = int(end)
                attributes = {}
                for key, val in [x.split(' ', 1) for x in [x.strip() for x in quoted_split(attrs, ';')] if x and ' ' in x]:
                    if val[0] == '"' and val[-1] == '"':
                        val = val[1:-1]
                    attributes[key] = val

                gid = None
                if 'isoform_id' in attributes:
                    gid = attributes['isoform_id']
                elif 'gene_name' in attributes:
                    # use gene_name if we have it.
                    gid = attributes['gene_name']
                # elif 'tss_id' in attributes:  # iGenomes GTF files... are strange. use gene_name first.
                #     gid = attributes['tss_id']
                elif 'gene_id' in attributes:
                    gid = attributes['gene_id']
                    if not warned and not quiet:
                        sys.stderr.write('\nGTF file potentially missing isoform annotation! Each transcript may be treated separately. (%s)\n' % gid)
                        sys.stderr.write('%s\n\n' % (str(attributes)))
                        warned = True
                else:
                    if not warned and not quiet:
                        sys.stderr.write('\nNot a valid GTF file! Maybe GFF?\n')
                        sys.stderr.write('%s\n\n' % (str(attributes)))
                        warned = True

                    # fall back to GFF-style "key=value" attributes
                    first_key = None
                    attributes = {}
                    for key, val in [x.split('=', 1) for x in [x.strip() for x in quoted_split(attrs, ';')] if x and '=' in x]:
                        if not first_key:
                            first_key = key
                        if val[0] == '"' and val[-1] == '"':
                            val = val[1:-1]
                        attributes[key] = val

                    if not attributes:
                        gid = 'id_%s' % linenum
                        if not warned and not quiet:
                            sys.stderr.write('\nGTF file missing annotations! Using line numbers as IDs\n')
                            warned = True
                    else:
                        gid = attributes[first_key]
                        if not warned and not quiet:
                            sys.stderr.write('\nGTF file missing annotations (gene_id, transcript_id)! Assuming GFF? Taking first attribute as ID (%s=%s)\n' % (first_key, gid))
                            sys.stderr.write('%s\n\n' % (str(attributes)))
                            warned = True

                if eta:
                    eta.print_status(extra=gid)
            except:
                import traceback
                sys.stderr.write('Error parsing line:\n%s\n' % line)
                traceback.print_exc()
                sys.exit(1)

            if gid not in self._genes or chrom != self._genes[gid].chrom:
                self._genes[gid] = _GTFGene(gid, chrom, source, **attributes)
                if 'gene_name' in attributes:
                    gene_name = attributes['gene_name']
                    if gene_name not in self._gene_names:
                        self._gene_names[gene_name] = [gid]
                    else:
                        self._gene_names[gene_name].append(gid)

                    # GFF-style lines may lack gene_id entirely
                    if 'gene_id' in attributes and gid != attributes['gene_id']:
                        self._gene_ids[attributes['gene_id']] = gid

            self._genes[gid].add_feature(attributes['transcript_id'] if 'transcript_id' in attributes else gid, feature, start, end, strand)

        if eta:
            eta.done()

        if filename and fobj != sys.stdin:
            fobj.close()

        for gid in self._genes:
            gene = self._genes[gid]
            start_bin = gene.start / GTF.__binsize
            end_bin = gene.end / GTF.__binsize
            for bin in xrange(start_bin, end_bin + 1):
                if not (gene.chrom, bin) in self._gene_bins:
                    self._gene_bins[(gene.chrom, bin)] = [gid]
                else:
                    self._gene_bins[(gene.chrom, bin)].append(gid)

        if cache_enabled:
            try:
                self._write_cache(cachefile)
            except Exception, e:
                sys.stderr.write("Error saving cache: %s!\n" % str(e))
                # do nothing if we can't write the cache.
def __init__(self, filename=None, cache_enabled=True, quiet=False, fileobj=None):
    if not filename and not fileobj:
        raise ValueError('Must pass either a filename or a fileobj')

    if fileobj:
        fobj = fileobj
        cache_enabled = False
        eta = None
    else:
        fobj = gzip_aware_open(filename)
        eta = ETA(os.stat(filename).st_size, fileobj=fobj)
        cachefile = os.path.join(os.path.dirname(filename), '.%s.cache' % os.path.basename(filename))

    self._genes = {}
    self._pos = 0
    self._gene_bins = {}
    self._gene_names = {}
    self._gene_ids = {}
    warned = False

    if cache_enabled and os.path.exists(cachefile):
        self._load_cache(cachefile)

    if not self._genes:
        if not quiet:
            sys.stderr.write('Reading GTF file... (%s) \n' % filename)

        for line in fobj:
            try:
                idx = line.find('#')
                if idx > -1:
                    if idx == 0:
                        continue
                    line = line[:idx]  # strip trailing comment

                chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip().split('\t')
                source = symbols[source]
                start = int(start) - 1  # Note: 1-based
                end = int(end)
                attributes = {}
                for key, val in [x.split(' ', 1) for x in [x.strip() for x in attrs.split(';')] if x]:
                    if val[0] == '"' and val[-1] == '"':
                        val = val[1:-1]
                    attributes[key] = val

                gid = None
                if 'isoform_id' in attributes:
                    gid = attributes['isoform_id']
                elif 'gene_name' in attributes:
                    # use gene_name if we have it.
                    gid = attributes['gene_name']
                # elif 'tss_id' in attributes:  # iGenomes GTF files... are strange. use gene_name first.
                #     gid = attributes['tss_id']
                else:
                    gid = attributes['gene_id']
                    if not warned and not quiet:
                        sys.stderr.write('\nGTF file potentially missing isoform annotation! Each transcript may be treated separately. (%s)\n' % gid)
                        sys.stderr.write('%s\n\n' % (str(attributes)))
                        warned = True

                if eta:
                    eta.print_status(extra=gid)
            except:
                import traceback
                sys.stderr.write('Error parsing line:\n%s\n' % line)
                traceback.print_exc()
                sys.exit(1)

            if gid not in self._genes or chrom != self._genes[gid].chrom:
                self._genes[gid] = _GTFGene(gid, chrom, source, **attributes)
                if 'gene_name' in attributes:
                    gene_name = attributes['gene_name']
                    if gene_name not in self._gene_names:
                        self._gene_names[gene_name] = [gid]
                    else:
                        self._gene_names[gene_name].append(gid)

                    if gid != attributes['gene_id']:
                        self._gene_ids[attributes['gene_id']] = gid

            self._genes[gid].add_feature(attributes['transcript_id'], feature, start, end, strand)

        if eta:
            eta.done()

        if filename and fobj != sys.stdin:
            fobj.close()

        for gid in self._genes:
            gene = self._genes[gid]
            start_bin = gene.start / GTF.__binsize
            end_bin = gene.end / GTF.__binsize
            for bin in xrange(start_bin, end_bin + 1):
                if not (gene.chrom, bin) in self._gene_bins:
                    self._gene_bins[(gene.chrom, bin)] = [gid]
                else:
                    self._gene_bins[(gene.chrom, bin)].append(gid)

        if cache_enabled:
            try:
                self._write_cache(cachefile)
            except Exception, e:
                sys.stderr.write("Error saving cache: %s!\n" % str(e))
                # do nothing if we can't write the cache.
def refiso_junctions(fname, refname, fragment_size=46, min_size=50, out=sys.stdout, max_exons=3):
    '''
    Given a refiso file and a reference genome, produce a FASTA file
    representing all possible unique splice junctions within an isoform.

    fragment_size - the maximum amount from each side of a splice to include
    min_size      - the minimum length of a junction
    max_exons     - the maximum number of exons to include in a junction (for small IG exons)
    '''
    refiso = RefIso(fname)
    ref = pysam.Fastafile(refname)

    references = []
    with open('%s.fai' % refname) as f:
        for line in f:
            cols = line.split('\t')
            references.append(cols[0])

    def _extend_junction(seq, name, chrom, exons, counter=1):
        if counter >= max_exons:
            return
        start, end = exons[0]
        frag_end = end
        if end - start > fragment_size:
            frag_end = start + fragment_size
        seq5 = ref.fetch(chrom, start, frag_end)
        newname = '%s,%s-%s' % (name, start, frag_end)
        newseq = seq + seq5
        if len(newseq) >= min_size:
            yield newname, newseq
            return
        elif len(exons) > 1 and counter + 1 < max_exons:
            for i in xrange(1, len(exons)):
                for nn_name, nn_seq in _extend_junction(newseq, newname, chrom, exons[i:], counter + 1):
                    yield nn_name, nn_seq

    eta = ETA(refiso.fsize(), fileobj=refiso)
    junctions = set()
    for gene in refiso.genes:
        if not gene.chrom in references:
            continue
        for txpt in gene.transcripts:
            exons = zip(txpt.exon_starts, txpt.exon_ends)
            if len(exons) > 1000 or gene.name == 'abParts':
                # skip IG hyper / Ab regions
                continue
            for i, (start, end) in enumerate(exons):
                eta.print_status(extra='%s:%s %s #%s' % (gene.chrom, gene.tx_start, gene.name, i))
                if i == len(exons) - 1:
                    # can't splice the last exon
                    continue
                frag_start = start
                if end - start > fragment_size:
                    frag_start = end - fragment_size
                seq3 = ref.fetch(gene.chrom, frag_start, end)
                for j in xrange(len(exons) - i - 1):
                    for name, seq in _extend_junction(seq3, '%s:%s-%s' % (gene.chrom, frag_start, end), gene.chrom, exons[j + i + 1:]):
                        if not name in junctions:
                            junctions.add(name)
                            out.write('>%s\n%s\n' % (name, seq))
    eta.done()
def contexts(text, M, verbose=None):
    if verbose:
        eta = ETA(len(text))
    for i in range(len(text)):
        yield tuple(text[max(0, i - M):i]), text[i]
        if verbose:
            eta.print_status(i, extra=verbose)
    if verbose:
        eta.done()
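# Hedged example for the contexts() generator above: counting order-M
# context/next-symbol pairs over an illustrative token list. The `verbose`
# string is echoed as the ETA "extra" text on each update.
from collections import defaultdict

counts = defaultdict(int)
text = list('abracadabra')
for ctx, nxt in contexts(text, M=2, verbose='counting'):
    counts[(ctx, nxt)] += 1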
def fastq_sort(fastq, bysequence=False, tmpdir=None, tmpprefix='.tmp', chunksize=100000, nogz=False, out=sys.stdout, quiet=False):
    tmpfiles = []
    chunk = []

    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        if bysequence:
            chunk.append((read.seq, read))
        else:
            chunk.append((read.name, read))
        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
            chunk = []
    if chunk:
        tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

    sys.stderr.write('\nMerging chunks...\n')
    sys.stderr.flush()

    buf = [None, ] * len(tmpfiles)
    skip = [False, ] * len(tmpfiles)

    eta = ETA(count)
    j = 0
    writing = True

    if nogz:
        tmpfobjs = [open(x) for x in tmpfiles]
    else:
        tmpfobjs = [gzip.open(x) for x in tmpfiles]

    while writing:
        j += 1
        eta.print_status(j)
        for i, fobj in enumerate(tmpfobjs):
            if not buf[i] and not skip[i]:
                try:
                    read = fastq_read_file(fobj)
                    if bysequence:
                        buf[i] = (read.seq, i, read)
                    else:
                        buf[i] = (read.name, i, read)
                except:
                    buf[i] = None
                    skip[i] = True

        sorted_list = buf[:]
        sorted_list.sort()
        writing = False

        for tup in sorted_list:
            if not tup:
                continue
            sorter, i, read = tup
            read.write(out)
            buf[i] = None
            writing = True
            break
    eta.done()

    for fobj in tmpfobjs:
        fobj.close()

    for tmpfile in tmpfiles:
        os.unlink(tmpfile)