def callback(bam, common_count, common_reads, common_cols):
    """Yield one output row per region of the enclosing gene.

    Closure over *gene*, *const_spans*, and *self* from the enclosing scope.
    For each region, fetches reads that fall inside it and reads whose
    alignment skips over it, then yields the common columns extended with
    per-region counts and ratios.
    """
    # Tally reads in the constant (always-included) spans once up front.
    const_count = 0
    for span in const_spans:
        span_starts = [s for s, e in span]
        span_ends = [e for s, e in span]
        n, _ = _fetch_reads(
            bam, gene.chrom, gene.strand if self.stranded else None,
            span_starts, span_ends, self.multiple, False,
            self.whitelist, self.blacklist, self.uniq_only,
            self.library_type)
        const_count += n

    # Per-region counts.
    for num, start, end, const, names in gene.regions:
        count, reads = _fetch_reads(
            bam, gene.chrom, gene.strand if self.stranded else None,
            [start], [end], self.multiple, False, self.whitelist,
            self.blacklist, self.uniq_only, self.library_type)
        excl_count, excl_reads = _fetch_reads_excluding(
            bam, gene.chrom, gene.strand if self.stranded else None,
            start, end, self.multiple, self.whitelist, self.blacklist,
            self.library_type)

        # A read that skips this region cancels out of the inclusive count.
        for read in excl_reads:
            if read in reads:
                reads.remove(read)
                count -= 1

        # Common reads that land in neither the inclusive nor exclusive set.
        other_reads = sum(
            1 for read in common_reads
            if read not in reads and read not in excl_reads)

        altindex = float(count - excl_count) / other_reads if other_reads > 0 else ''

        if common_reads:
            incl_pct = float(count) / len(common_reads)
            excl_pct = float(excl_count) / len(common_reads)
        else:
            incl_pct = ''
            excl_pct = ''

        yield common_cols + [
            start, end, const_count, num,
            'const' if const else 'alt',
            count, excl_count, incl_pct, excl_pct, altindex,
        ]
def count(self, bam, library_type, coverage=False, uniq_only=False,
          fpkm=False, norm='', multiple='complete', whitelist=None,
          blacklist=None, out=sys.stdout, quiet=False, start_only=False):
    '''
    Count reads per repeatmasker family/member and write a tab-delimited report.

    This is a separate count implementation because for repeat families,
    we need to combine the counts from multiple regions in the genome,
    so the usual chrom, starts, ends loop breaks down.

    bam          - open BAM object (must expose .references and .filename)
    library_type - library orientation; 'FR'/'RF' imply stranded counting
    norm         - '', 'all', or 'mapped'; anything else is rejected
    fpkm         - also emit an RPKM column (requires a norm method)

    Writes results to *out*; exits the process on unsupported option combos.
    '''
    stranded = library_type in ['FR', 'RF']

    if coverage:
        sys.stderr.write('Coverage calculations not supported with repeatmasker family models\n')
        sys.exit(1)

    if norm and norm not in ['all', 'mapped']:
        sys.stderr.write('Normalization "%s" not supported with repeatmasker family models\n' % norm)
        sys.exit(1)

    # Bug fix: the RPKM column below divides by norm_val, which stays None
    # when no normalization method is selected, raising TypeError mid-output.
    # Fail early with a clear message, matching the validations above.
    if fpkm and not norm:
        sys.stderr.write('RPKM calculation requires normalization (all/mapped) with repeatmasker family models\n')
        sys.exit(1)

    repeats = {}  # (family, member) -> {'count': int, 'size': int}; member '*' is the family total
    total_count = 0.0

    for family, member, chrom, start, end, strand in _repeatreader(self.fname):
        if not (family, member) in repeats:
            repeats[(family, member)] = {'count': 0, 'size': 0}
        if not (family, '*') in repeats:
            repeats[(family, '*')] = {'count': 0, 'size': 0}

        # Skip annotations on contigs absent from this BAM; the entries
        # above are still created so the family appears with zero counts.
        if not chrom in bam.references:
            continue

        size = end - start
        repeats[(family, '*')]['size'] += size
        repeats[(family, member)]['size'] += size

        count, reads = _fetch_reads(bam, chrom, strand if stranded else None,
                                    [start], [end], multiple, False,
                                    whitelist, blacklist,
                                    library_type=library_type)
        repeats[(family, '*')]['count'] += count
        repeats[(family, member)]['count'] += count
        total_count += count

    sys.stderr.write('Calculating normalization...')
    norm_val = None
    norm_val_orig = None
    if norm == 'all':
        norm_val_orig = _find_mapped_count(bam, whitelist, blacklist)
    elif norm == 'mapped':
        norm_val_orig = total_count

    if norm_val_orig:
        norm_val = float(norm_val_orig) / 1000000  # counts-per-million factor
    sys.stderr.write('\n')

    # Metadata header lines.
    out.write('## input%s%s\n' % (' ' if bam.filename else '', bam.filename))
    out.write('## model %s %s\n' % (self.get_name(), self.get_source()))
    out.write('## library %s\n' % library_type)
    out.write('## multiple %s\n' % multiple)
    if norm_val:
        out.write('## norm %s %s\n' % (norm, float(norm_val_orig)))
        out.write('## CPM-factor %s\n' % norm_val)

    out.write('#')
    out.write('\t'.join(self.get_headers()))
    out.write('\tlength\tcount')
    if norm_val:
        out.write('\tcount (CPM)')
    if fpkm:
        out.write('\tRPKM')
    out.write('\n')

    sortedkeys = sorted(repeats)

    for class_level in [True, False]:
        for k in sortedkeys:
            if class_level and k[1] != '*':
                # Do class level counts first
                continue
            elif not class_level and k[1] == '*':
                continue

            cols = [k[0], k[1], repeats[k]['size'], repeats[k]['count']]
            if norm_val:
                cols.append(repeats[k]['count'] / norm_val)
            if fpkm:
                # Bug fix: families whose contigs are all missing from the
                # BAM have size 0; avoid ZeroDivisionError, report 0 RPKM.
                if repeats[k]['size']:
                    cols.append(repeats[k]['count'] / (repeats[k]['size'] / 1000.0) / norm_val)
                else:
                    cols.append(0)
            out.write('%s\n' % '\t'.join([str(x) for x in cols]))
def callback(bam, common_count, common_reads, common_cols):
    """Generate per-region result rows for the enclosing gene.

    Uses *gene*, *const_spans*, and *self* from the enclosing scope. Emits
    the shared columns plus region coordinates, constant-span count, and
    inclusive/exclusive read statistics for every region of the gene.
    """
    # Sum up reads that land in the constant spans.
    const_count = 0
    for span in const_spans:
        starts = []
        ends = []
        for s, e in span:
            starts.append(s)
            ends.append(e)
        spanned, _ = _fetch_reads(
            bam, gene.chrom, gene.strand if self.stranded else None,
            starts, ends, self.multiple, False, self.whitelist,
            self.blacklist, self.uniq_only, self.library_type)
        const_count += spanned

    # Now compute counts for each individual region.
    for num, start, end, const, names in gene.regions:
        count, reads = _fetch_reads(
            bam, gene.chrom, gene.strand if self.stranded else None,
            [start], [end], self.multiple, False, self.whitelist,
            self.blacklist, self.uniq_only, self.library_type)
        excl_count, excl_reads = _fetch_reads_excluding(
            bam, gene.chrom, gene.strand if self.stranded else None,
            start, end, self.multiple, self.whitelist, self.blacklist,
            self.library_type)

        # Drop reads from the inclusive set that also skip this region.
        for read in excl_reads:
            if read in reads:
                reads.remove(read)
                count = count - 1

        # How many common reads are in neither set for this region?
        other_reads = 0
        for read in common_reads:
            if read not in reads and read not in excl_reads:
                other_reads += 1

        if other_reads > 0:
            altindex = float(count - excl_count) / other_reads
        else:
            altindex = ''

        total_common = len(common_reads)
        if total_common > 0:
            incl_pct = float(count) / total_common
            excl_pct = float(excl_count) / total_common
        else:
            incl_pct = ''
            excl_pct = ''

        row = common_cols[:]
        row.extend([start, end, const_count, num,
                    'const' if const else 'alt',
                    count, excl_count, incl_pct, excl_pct, altindex])
        yield row
def count(self, bam, library_type, coverage=False, uniq_only=False,
          fpkm=False, norm='', multiple='complete', whitelist=None,
          blacklist=None, out=sys.stdout, quiet=False, start_only=False):
    '''
    Count reads per repeatmasker family/member and write a tab-delimited report.

    This is a separate count implementation because for repeat families,
    we need to combine the counts from multiple regions in the genome,
    so the usual chrom, starts, ends loop breaks down.

    bam          - open BAM object (must expose .references and .filename)
    library_type - library orientation; 'FR'/'RF' imply stranded counting
    norm         - '', 'all', or 'mapped'; anything else is rejected
    fpkm         - also emit an RPKM column (requires a norm method)

    Writes results to *out*; exits the process on unsupported option combos.
    '''
    stranded = library_type in ['FR', 'RF']

    if coverage:
        sys.stderr.write('Coverage calculations not supported with repeatmasker family models\n')
        sys.exit(1)

    if norm and norm not in ['all', 'mapped']:
        sys.stderr.write('Normalization "%s" not supported with repeatmasker family models\n' % norm)
        sys.exit(1)

    # Bug fix: the RPKM column below divides by norm_val, which stays None
    # when no normalization method is selected, raising TypeError mid-output.
    # Fail early with a clear message, matching the validations above.
    if fpkm and not norm:
        sys.stderr.write('RPKM calculation requires normalization (all/mapped) with repeatmasker family models\n')
        sys.exit(1)

    repeats = {}  # (family, member) -> {'count': int, 'size': int}; member '*' is the family total
    total_count = 0.0

    for family, member, chrom, start, end, strand in _repeatreader(self.fname):
        if not (family, member) in repeats:
            repeats[(family, member)] = {'count': 0, 'size': 0}
        if not (family, '*') in repeats:
            repeats[(family, '*')] = {'count': 0, 'size': 0}

        # Skip annotations on contigs absent from this BAM; the entries
        # above are still created so the family appears with zero counts.
        if not chrom in bam.references:
            continue

        size = end - start
        repeats[(family, '*')]['size'] += size
        repeats[(family, member)]['size'] += size

        count, reads = _fetch_reads(bam, chrom, strand if stranded else None,
                                    [start], [end], multiple, False,
                                    whitelist, blacklist,
                                    library_type=library_type)
        repeats[(family, '*')]['count'] += count
        repeats[(family, member)]['count'] += count
        total_count += count

    sys.stderr.write('Calculating normalization...')
    norm_val = None
    norm_val_orig = None
    if norm == 'all':
        norm_val_orig = _find_mapped_count(bam, whitelist, blacklist)
    elif norm == 'mapped':
        norm_val_orig = total_count

    if norm_val_orig:
        norm_val = float(norm_val_orig) / 1000000  # counts-per-million factor
    sys.stderr.write('\n')

    # Metadata header lines.
    out.write('## input%s%s\n' % (' ' if bam.filename else '', bam.filename))
    out.write('## model %s %s\n' % (self.get_name(), self.get_source()))
    out.write('## library %s\n' % library_type)
    out.write('## multiple %s\n' % multiple)
    if norm_val:
        out.write('## norm %s %s\n' % (norm, float(norm_val_orig)))
        out.write('## CPM-factor %s\n' % norm_val)

    out.write('#')
    out.write('\t'.join(self.get_headers()))
    out.write('\tlength\tcount')
    if norm_val:
        out.write('\tcount (CPM)')
    if fpkm:
        out.write('\tRPKM')
    out.write('\n')

    sortedkeys = sorted(repeats)

    for class_level in [True, False]:
        for k in sortedkeys:
            if class_level and k[1] != '*':
                # Do class level counts first
                continue
            elif not class_level and k[1] == '*':
                continue

            cols = [k[0], k[1], repeats[k]['size'], repeats[k]['count']]
            if norm_val:
                cols.append(repeats[k]['count'] / norm_val)
            if fpkm:
                # Bug fix: families whose contigs are all missing from the
                # BAM have size 0; avoid ZeroDivisionError, report 0 RPKM.
                if repeats[k]['size']:
                    cols.append(repeats[k]['count'] / (repeats[k]['size'] / 1000.0) / norm_val)
                else:
                    cols.append(0)
            out.write('%s\n' % '\t'.join([str(x) for x in cols]))