Example #1
0
            def callback(bam, common_count, common_reads, common_cols):
                """Yield one output row (list of columns) per region of *gene*.

                Closure variables: const_spans, gene, and self (stranded,
                multiple, whitelist, blacklist, uniq_only, library_type).
                Each yielded row extends common_cols with per-region counts,
                inclusion/exclusion percentages, and the alt-index.
                """
                # Gather reads falling in the constant (always-included) spans.
                const_count = 0
                for span in const_spans:
                    # Split (start, end) pairs into parallel lists for _fetch_reads.
                    starts = [s for s, e in span]
                    ends = [e for s, e in span]

                    count, reads = _fetch_reads(bam, gene.chrom, gene.strand if self.stranded else None, starts, ends, self.multiple, False, self.whitelist, self.blacklist, self.uniq_only, self.library_type)
                    const_count += count

                # Find counts for each region.
                for num, start, end, const, names in gene.regions:
                    count, reads = _fetch_reads(bam, gene.chrom, gene.strand if self.stranded else None, [start], [end], self.multiple, False, self.whitelist, self.blacklist, self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(bam, gene.chrom, gene.strand if self.stranded else None, start, end, self.multiple, self.whitelist, self.blacklist, self.library_type)

                    # Remove reads that exclude (splice over) this region.
                    for read in excl_reads:
                        if read in reads:
                            reads.remove(read)
                            count -= 1

                    # Count reads that *aren't* in this region (neither
                    # including nor excluding it).
                    other_reads = 0
                    for read in common_reads:
                        if read not in reads and read not in excl_reads:
                            other_reads += 1

                    # alt-index: net inclusion vs. exclusion, normalized by the
                    # other reads; blank when there is nothing to divide by.
                    if other_reads > 0:
                        altindex = float(count - excl_count) / other_reads
                    else:
                        altindex = ''

                    # Percentages relative to all common reads; blank when the
                    # gene has no common reads at all.
                    if common_reads:
                        incl_pct = float(count) / len(common_reads)
                        excl_pct = float(excl_count) / len(common_reads)
                    else:
                        incl_pct = ''
                        excl_pct = ''

                    cols = common_cols[:]
                    cols.append(start)
                    cols.append(end)
                    cols.append(const_count)
                    cols.append(num)
                    cols.append('const' if const else 'alt')
                    cols.append(count)
                    cols.append(excl_count)
                    cols.append(incl_pct)
                    cols.append(excl_pct)
                    cols.append(altindex)
                    yield cols
Example #2
0
File: models.py  Project: xuwei684/ngsutils
    def count(self,
              bam,
              library_type,
              coverage=False,
              uniq_only=False,
              fpkm=False,
              norm='',
              multiple='complete',
              whitelist=None,
              blacklist=None,
              out=sys.stdout,
              quiet=False,
              start_only=False):
        """Count reads per repeat family/member and write a table to *out*.

        This is a separate count implementation because for repeat families,
        we need to combine the counts from multiple regions in the genome,
        so the usual chrom, starts, ends loop breaks down.

        Exits with status 1 when *coverage* is requested or *norm* is not
        one of '', 'all', 'mapped' (not supported for repeat models).
        """
        stranded = library_type in ['FR', 'RF']

        if coverage:
            sys.stderr.write(
                'Coverage calculations not supported with repeatmasker family models\n'
            )
            sys.exit(1)

        if norm and norm not in ['all', 'mapped']:
            sys.stderr.write(
                'Normalization "%s" not supported with repeatmasker family models\n'
                % norm)
            sys.exit(1)

        # Accumulate per-(family, member) counts/sizes; the (family, '*')
        # entry aggregates the whole family (class-level totals).
        repeats = {}
        total_count = 0.0
        for family, member, chrom, start, end, strand in _repeatreader(
                self.fname):
            if (family, member) not in repeats:
                repeats[(family, member)] = {'count': 0, 'size': 0}

            if (family, '*') not in repeats:
                repeats[(family, '*')] = {'count': 0, 'size': 0}

            # Skip annotations on chromosomes absent from this BAM file
            # (entries above are still created, so they appear with zeros).
            if chrom not in bam.references:
                continue

            size = end - start
            repeats[(family, '*')]['size'] += size
            repeats[(family, member)]['size'] += size

            count, reads = _fetch_reads(bam,
                                        chrom,
                                        strand if stranded else None, [start],
                                        [end],
                                        multiple,
                                        False,
                                        whitelist,
                                        blacklist,
                                        library_type=library_type)
            repeats[(family, '*')]['count'] += count
            repeats[(family, member)]['count'] += count

            total_count += count

        sys.stderr.write('Calculating normalization...')

        norm_val = None
        norm_val_orig = None

        if norm == 'all':
            norm_val_orig = _find_mapped_count(bam, whitelist, blacklist)
        elif norm == 'mapped':
            norm_val_orig = total_count

        if norm_val_orig:
            # Counts-per-million scaling factor.
            norm_val = float(norm_val_orig) / 1000000

        sys.stderr.write('\n')

        out.write('## input%s%s\n' %
                  (' ' if bam.filename else '', bam.filename))
        out.write('## model %s %s\n' % (self.get_name(), self.get_source()))
        out.write('## library %s\n' % library_type)
        out.write('## multiple %s\n' % multiple)
        if norm_val:
            out.write('## norm %s %s\n' % (norm, float(norm_val_orig)))
            out.write('## CPM-factor %s\n' % norm_val)

        out.write('#')
        out.write('\t'.join(self.get_headers()))
        out.write('\tlength\tcount')
        if norm_val:
            out.write('\tcount (CPM)')
            if fpkm:
                out.write('\tRPKM')

        out.write('\n')

        sortedkeys = sorted(repeats)

        # Emit class-level ('*') rows first, then member-level rows.
        for class_level in [True, False]:
            for k in sortedkeys:
                if class_level and k[1] != '*':  # Do class level counts first
                    continue
                elif not class_level and k[1] == '*':
                    continue

                cols = [k[0], k[1], repeats[k]['size'], repeats[k]['count']]

                if norm_val:
                    cols.append(repeats[k]['count'] / norm_val)
                    if fpkm:
                        cols.append(repeats[k]['count'] /
                                    (repeats[k]['size'] / 1000.0) / norm_val)

                out.write('%s\n' % '\t'.join([str(x) for x in cols]))
Example #3
0
File: models.py  Project: xuwei684/ngsutils
            def callback(bam, common_count, common_reads, common_cols):
                """Yield one output row (list of columns) per region of *gene*.

                Closure variables: const_spans, gene, and self (stranded,
                multiple, whitelist, blacklist, uniq_only, library_type).
                Each yielded row extends common_cols with per-region counts,
                inclusion/exclusion percentages, and the alt-index.
                """
                # Gather reads falling in the constant (always-included) spans.
                const_count = 0
                for span in const_spans:
                    # Split (start, end) pairs into parallel lists for _fetch_reads.
                    starts = [s for s, e in span]
                    ends = [e for s, e in span]

                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, starts, ends,
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    const_count += count

                # Find counts for each region.
                for num, start, end, const, names in gene.regions:
                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, [start], [end],
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, start, end,
                        self.multiple, self.whitelist, self.blacklist,
                        self.library_type)

                    # Remove reads that exclude (splice over) this region.
                    for read in excl_reads:
                        if read in reads:
                            reads.remove(read)
                            count -= 1

                    # Count reads that *aren't* in this region (neither
                    # including nor excluding it).
                    other_reads = 0
                    for read in common_reads:
                        if read not in reads and read not in excl_reads:
                            other_reads += 1

                    # alt-index: net inclusion vs. exclusion, normalized by the
                    # other reads; blank when there is nothing to divide by.
                    if other_reads > 0:
                        altindex = float(count - excl_count) / other_reads
                    else:
                        altindex = ''

                    # Percentages relative to all common reads; blank when the
                    # gene has no common reads at all.
                    if common_reads:
                        incl_pct = float(count) / len(common_reads)
                        excl_pct = float(excl_count) / len(common_reads)
                    else:
                        incl_pct = ''
                        excl_pct = ''

                    cols = common_cols[:]
                    cols.append(start)
                    cols.append(end)
                    cols.append(const_count)
                    cols.append(num)
                    cols.append('const' if const else 'alt')
                    cols.append(count)
                    cols.append(excl_count)
                    cols.append(incl_pct)
                    cols.append(excl_pct)
                    cols.append(altindex)
                    yield cols
Example #4
0
    def count(self, bam, library_type, coverage=False, uniq_only=False, fpkm=False, norm='', multiple='complete', whitelist=None, blacklist=None, out=sys.stdout, quiet=False, start_only=False):
        """Count reads per repeat family/member and write a table to *out*.

        This is a separate count implementation because for repeat families,
        we need to combine the counts from multiple regions in the genome,
        so the usual chrom, starts, ends loop breaks down.

        Exits with status 1 when *coverage* is requested or *norm* is not
        one of '', 'all', 'mapped' (not supported for repeat models).
        """
        stranded = library_type in ['FR', 'RF']

        if coverage:
            sys.stderr.write('Coverage calculations not supported with repeatmasker family models\n')
            sys.exit(1)

        if norm and norm not in ['all', 'mapped']:
            sys.stderr.write('Normalization "%s" not supported with repeatmasker family models\n' % norm)
            sys.exit(1)

        # Accumulate per-(family, member) counts/sizes; the (family, '*')
        # entry aggregates the whole family (class-level totals).
        repeats = {}
        total_count = 0.0
        for family, member, chrom, start, end, strand in _repeatreader(self.fname):
            if (family, member) not in repeats:
                repeats[(family, member)] = {'count': 0, 'size': 0}

            if (family, '*') not in repeats:
                repeats[(family, '*')] = {'count': 0, 'size': 0}

            # Skip annotations on chromosomes absent from this BAM file
            # (entries above are still created, so they appear with zeros).
            if chrom not in bam.references:
                continue

            size = end - start
            repeats[(family, '*')]['size'] += size
            repeats[(family, member)]['size'] += size

            count, reads = _fetch_reads(bam, chrom, strand if stranded else None, [start], [end], multiple, False, whitelist, blacklist, library_type=library_type)
            repeats[(family, '*')]['count'] += count
            repeats[(family, member)]['count'] += count

            total_count += count

        sys.stderr.write('Calculating normalization...')

        norm_val = None
        norm_val_orig = None

        if norm == 'all':
            norm_val_orig = _find_mapped_count(bam, whitelist, blacklist)
        elif norm == 'mapped':
            norm_val_orig = total_count

        if norm_val_orig:
            # Counts-per-million scaling factor.
            norm_val = float(norm_val_orig) / 1000000

        sys.stderr.write('\n')

        out.write('## input%s%s\n' % (' ' if bam.filename else '', bam.filename))
        out.write('## model %s %s\n' % (self.get_name(), self.get_source()))
        out.write('## library %s\n' % library_type)
        out.write('## multiple %s\n' % multiple)
        if norm_val:
            out.write('## norm %s %s\n' % (norm, float(norm_val_orig)))
            out.write('## CPM-factor %s\n' % norm_val)

        out.write('#')
        out.write('\t'.join(self.get_headers()))
        out.write('\tlength\tcount')
        if norm_val:
            out.write('\tcount (CPM)')
            if fpkm:
                out.write('\tRPKM')

        out.write('\n')

        sortedkeys = sorted(repeats)

        # Emit class-level ('*') rows first, then member-level rows.
        for class_level in [True, False]:
            for k in sortedkeys:
                if class_level and k[1] != '*':  # Do class level counts first
                    continue
                elif not class_level and k[1] == '*':
                    continue

                cols = [k[0], k[1], repeats[k]['size'], repeats[k]['count']]

                if norm_val:
                    cols.append(repeats[k]['count'] / norm_val)
                    if fpkm:
                        cols.append(repeats[k]['count'] / (repeats[k]['size'] / 1000.0) / norm_val)

                out.write('%s\n' % '\t'.join([str(x) for x in cols]))