예제 #1
0
파일: models.py 프로젝트: xuwei684/ngsutils
class GTFModel(Model):
    """Model whose regions are the genes of a GTF annotation file.

    Each gene contributes one entry made of all of its regions; the optional
    ``isoform_id`` and ``gene_biotype`` columns are reported only when the
    first gene in the file carries those attributes.
    """

    def __init__(self, fname):
        """Open *fname* as a GTF and probe the first gene for optional columns.

        Sets ``self.gtf``, ``self.has_isoform`` and ``self.has_biotype``.
        A file that yields no genes leaves both flags ``False``.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # Peek at the first gene to decide which attributes to return.
        # The builtin next() works on Python 2.6+ and Python 3 (the .next()
        # method only exists on Python 2), and the None default keeps an
        # annotation without genes from leaking StopIteration out of here.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the path of the GTF file this model was built from."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, ``'gtf'``."""
        return 'gtf'

    def get_headers(self):
        """Return the column names matching the per-gene columns of get_regions()."""
        out = ['gene_id', 'gene_name']
        if self.has_isoform:
            out.append('isoform_id')
        if self.has_biotype:
            out.append('gene_biotype')
        out.extend(['chrom', 'strand', 'txstart', 'txend'])
        return out

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, None)`` for each gene.

        ``starts`` and ``ends`` are parallel lists with one entry per region of
        the gene; ``columns`` follows the order of get_headers().  The final
        element is always ``None`` for this model.
        """
        # ETA is defined elsewhere; presumably a progress reporter driven by
        # the file size and the GTF object's position -- TODO confirm.
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            # Just include all regions - don't worry about transcripts and
            # exons; the regions encompass all exons anyway.
            starts = []
            ends = []
            for _num, start, end, _const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name]
            # The flags come from the first gene only, so later genes may lack
            # the attribute; fall back to an empty string for those.
            if self.has_isoform:
                out.append(gene.attributes['isoform_id']
                           if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                out.append(gene.attributes['gene_biotype']
                           if 'gene_biotype' in gene.attributes else '')
            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)
        eta.done()
예제 #2
0
class GTFModel(Model):
    """Model whose regions are the genes of a GTF annotation file.

    Each gene contributes one entry made of all of its regions; the optional
    ``isoform_id`` and ``gene_biotype`` columns are reported only when the
    first gene in the file carries those attributes.
    """

    def __init__(self, fname):
        """Open *fname* as a GTF and probe the first gene for optional columns.

        Sets ``self.gtf``, ``self.has_isoform`` and ``self.has_biotype``.
        A file that yields no genes leaves both flags ``False``.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # Peek at the first gene to decide which attributes to return.
        # The builtin next() works on Python 2.6+ and Python 3 (the .next()
        # method only exists on Python 2), and the None default keeps an
        # annotation without genes from leaking StopIteration out of here.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the path of the GTF file this model was built from."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, ``'gtf'``."""
        return 'gtf'

    def get_headers(self):
        """Return the column names matching the per-gene columns of get_regions()."""
        out = ['gene_id', 'gene_name']
        if self.has_isoform:
            out.append('isoform_id')
        if self.has_biotype:
            out.append('gene_biotype')
        out.extend(['chrom', 'strand', 'txstart', 'txend'])
        return out

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, None)`` for each gene.

        ``starts`` and ``ends`` are parallel lists with one entry per region of
        the gene; ``columns`` follows the order of get_headers().  The final
        element is always ``None`` for this model.
        """
        # ETA is defined elsewhere; presumably a progress reporter driven by
        # the file size and the GTF object's position -- TODO confirm.
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            # Just include all regions - don't worry about transcripts and
            # exons; the regions encompass all exons anyway.
            starts = []
            ends = []
            for _num, start, end, _const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name]
            # The flags come from the first gene only, so later genes may lack
            # the attribute; fall back to an empty string for those.
            if self.has_isoform:
                out.append(gene.attributes['isoform_id']
                           if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                out.append(gene.attributes['gene_biotype']
                           if 'gene_biotype' in gene.attributes else '')
            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)
        eta.done()
예제 #3
0
파일: models.py 프로젝트: xuwei684/ngsutils
class ExonModel(Model):
    """Model that reports per-region (exon-level) counts for each GTF gene.

    get_regions() yields one entry per gene together with a callback that
    produces one extra row per region of that gene (see get_postheaders()).
    The callback reads counting options stored on the instance by count().
    """

    def __init__(self, fname):
        """Open *fname* as a GTF and probe the first gene for optional columns.

        Sets ``self.gtf``, ``self.has_isoform`` and ``self.has_biotype``.
        A file that yields no genes leaves both flags ``False``.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # The builtin next() works on Python 2.6+ and Python 3 (the .next()
        # method only exists on Python 2), and the None default keeps an
        # annotation without genes from leaking StopIteration out of here.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the path of the GTF file this model was built from."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, ``'exon'``."""
        return 'exon'

    def get_headers(self):
        """Return the column names matching the per-gene columns of get_regions()."""
        out = ['gene_id', 'gene_name']
        if self.has_isoform:
            out.append('isoform_id')
        if self.has_biotype:
            out.append('gene_biotype')
        out.extend(['chrom', 'strand', 'txstart', 'txend'])

        return out

    def get_postheaders(self):
        """Return the names of the per-region columns appended by the callback."""
        return 'regionstart regionend const_count region_num const_alt count excl_count incl_pct excl_pct alt-index'.split()

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, callback)`` per gene.

        ``starts`` and ``ends`` are parallel lists with one entry per region of
        the gene; ``columns`` follows the order of get_headers(); ``callback``
        is the per-gene row generator built by _region_callback().
        """
        # ETA is defined elsewhere; presumably a progress reporter driven by
        # the file size and the GTF object's position -- TODO confirm.
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            geneout = [gene.gene_id, gene.gene_name]
            # The flags come from the first gene only, so later genes may lack
            # the attribute; fall back to an empty string for those.
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id']
                               if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')
            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            # Assemble a list of lists with contiguous spans of constant
            # regions.  This lets us count junction-spanning reads that cover
            # two constant exons/regions.
            was_last_const = False
            for _num, start, end, const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            yield (gene.chrom, starts, ends, gene.strand, geneout,
                   self._region_callback(gene, const_spans))
        eta.done()

    def _region_callback(self, gene, const_spans):
        """Return the row-generating callback bound to one specific gene.

        Building the callback through this factory binds *gene* and
        *const_spans* at creation time.  A closure defined directly inside the
        get_regions() loop would instead look the loop variables up when it
        runs, and would see a later gene if the consumer had already advanced
        the generator.
        """

        def callback(bam, common_count, common_reads, common_cols):
            """Yield one row per region: *common_cols* plus the post-header columns.

            *common_count* is accepted for the callback signature but is not
            used here.
            """
            strand = gene.strand if self.stranded else None

            # Gather constant reads, one fetch per contiguous constant span.
            const_count = 0
            for span in const_spans:
                span_starts = [s for s, _e in span]
                span_ends = [e for _s, e in span]

                count, _reads = _fetch_reads(
                    bam, gene.chrom, strand, span_starts, span_ends,
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                const_count += count

            # Find counts for each region.
            for num, start, end, const, _names in gene.regions:
                count, reads = _fetch_reads(
                    bam, gene.chrom, strand, [start], [end],
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                excl_count, excl_reads = _fetch_reads_excluding(
                    bam, gene.chrom, strand, start, end,
                    self.multiple, self.whitelist, self.blacklist,
                    self.library_type)

                # Remove reads that exclude this region.
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count -= 1

                # Find common reads that neither include nor exclude this region.
                other_reads = 0
                for read in common_reads:
                    if read not in reads and read not in excl_reads:
                        other_reads += 1

                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                # Despite the column names these are fractions of the common
                # reads; they are not multiplied by 100.
                if len(common_reads) > 0:
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                # Same order as get_postheaders().
                cols = common_cols[:]
                cols.extend([
                    start,
                    end,
                    const_count,
                    num,
                    'const' if const else 'alt',
                    count,
                    excl_count,
                    incl_pct,
                    excl_pct,
                    altindex,
                ])
                yield cols

        return callback

    def count(self,
              bam,
              library_type,
              coverage=False,
              uniq_only=False,
              fpkm=False,
              norm='',
              multiple='complete',
              whitelist=None,
              blacklist=None,
              out=sys.stdout,
              quiet=False,
              start_only=False):
        """Store the counting options on the instance, then run Model.count().

        The per-gene callbacks read ``uniq_only``, ``multiple``, ``whitelist``,
        ``blacklist``, ``library_type`` and ``stranded`` from ``self``, so they
        are recorded here before delegating.  ``stranded`` is true when
        *library_type* is ``'FR'`` or ``'RF'``.
        """
        self.uniq_only = uniq_only
        self.multiple = multiple
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.library_type = library_type

        self.stranded = library_type in ['FR', 'RF']

        Model.count(self, bam, library_type, coverage, uniq_only, fpkm, norm,
                    multiple, whitelist, blacklist, out, quiet, start_only)
예제 #4
0
class ExonModel(Model):
    """Model that reports per-region (exon-level) counts for each GTF gene.

    get_regions() yields one entry per gene together with a callback that
    produces one extra row per region of that gene (see get_postheaders()).
    The callback reads counting options stored on the instance by count().
    """

    def __init__(self, fname):
        """Open *fname* as a GTF and probe the first gene for optional columns.

        Sets ``self.gtf``, ``self.has_isoform`` and ``self.has_biotype``.
        A file that yields no genes leaves both flags ``False``.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # The builtin next() works on Python 2.6+ and Python 3 (the .next()
        # method only exists on Python 2), and the None default keeps an
        # annotation without genes from leaking StopIteration out of here.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the path of the GTF file this model was built from."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, ``'exon'``."""
        return 'exon'

    def get_headers(self):
        """Return the column names matching the per-gene columns of get_regions()."""
        out = ['gene_id', 'gene_name']
        if self.has_isoform:
            out.append('isoform_id')
        if self.has_biotype:
            out.append('gene_biotype')
        out.extend(['chrom', 'strand', 'txstart', 'txend'])

        return out

    def get_postheaders(self):
        """Return the names of the per-region columns appended by the callback."""
        return 'regionstart regionend const_count region_num const_alt count excl_count incl_pct excl_pct alt-index'.split()

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, callback)`` per gene.

        ``starts`` and ``ends`` are parallel lists with one entry per region of
        the gene; ``columns`` follows the order of get_headers(); ``callback``
        is the per-gene row generator built by _region_callback().
        """
        # ETA is defined elsewhere; presumably a progress reporter driven by
        # the file size and the GTF object's position -- TODO confirm.
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            geneout = [gene.gene_id, gene.gene_name]
            # The flags come from the first gene only, so later genes may lack
            # the attribute; fall back to an empty string for those.
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id']
                               if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')
            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            # Assemble a list of lists with contiguous spans of constant
            # regions.  This lets us count junction-spanning reads that cover
            # two constant exons/regions.
            was_last_const = False
            for _num, start, end, const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            yield (gene.chrom, starts, ends, gene.strand, geneout,
                   self._region_callback(gene, const_spans))
        eta.done()

    def _region_callback(self, gene, const_spans):
        """Return the row-generating callback bound to one specific gene.

        Building the callback through this factory binds *gene* and
        *const_spans* at creation time.  A closure defined directly inside the
        get_regions() loop would instead look the loop variables up when it
        runs, and would see a later gene if the consumer had already advanced
        the generator.
        """

        def callback(bam, common_count, common_reads, common_cols):
            """Yield one row per region: *common_cols* plus the post-header columns.

            *common_count* is accepted for the callback signature but is not
            used here.
            """
            strand = gene.strand if self.stranded else None

            # Gather constant reads, one fetch per contiguous constant span.
            const_count = 0
            for span in const_spans:
                span_starts = [s for s, _e in span]
                span_ends = [e for _s, e in span]

                count, _reads = _fetch_reads(
                    bam, gene.chrom, strand, span_starts, span_ends,
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                const_count += count

            # Find counts for each region.
            for num, start, end, const, _names in gene.regions:
                count, reads = _fetch_reads(
                    bam, gene.chrom, strand, [start], [end],
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                excl_count, excl_reads = _fetch_reads_excluding(
                    bam, gene.chrom, strand, start, end,
                    self.multiple, self.whitelist, self.blacklist,
                    self.library_type)

                # Remove reads that exclude this region.
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count -= 1

                # Find common reads that neither include nor exclude this region.
                other_reads = 0
                for read in common_reads:
                    if read not in reads and read not in excl_reads:
                        other_reads += 1

                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                # Despite the column names these are fractions of the common
                # reads; they are not multiplied by 100.
                if len(common_reads) > 0:
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                # Same order as get_postheaders().
                cols = common_cols[:]
                cols.extend([
                    start,
                    end,
                    const_count,
                    num,
                    'const' if const else 'alt',
                    count,
                    excl_count,
                    incl_pct,
                    excl_pct,
                    altindex,
                ])
                yield cols

        return callback

    def count(self, bam, library_type, coverage=False, uniq_only=False, fpkm=False, norm='', multiple='complete', whitelist=None, blacklist=None, out=sys.stdout, quiet=False, start_only=False):
        """Store the counting options on the instance, then run Model.count().

        The per-gene callbacks read ``uniq_only``, ``multiple``, ``whitelist``,
        ``blacklist``, ``library_type`` and ``stranded`` from ``self``, so they
        are recorded here before delegating.  ``stranded`` is true when
        *library_type* is ``'FR'`` or ``'RF'``.
        """
        self.uniq_only = uniq_only
        self.multiple = multiple
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.library_type = library_type

        self.stranded = library_type in ['FR', 'RF']

        Model.count(self, bam, library_type, coverage, uniq_only, fpkm, norm, multiple, whitelist, blacklist, out, quiet, start_only)
예제 #5
0
파일: models.py 프로젝트: erlevy/ngsutils
    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, callback)`` per gene.

        A fresh GTF reader is opened on ``self.fname`` for every call.
        ``starts``/``ends`` are parallel lists with one entry per region of the
        gene, ``columns`` is ``[gene_name, gene_id, isoform_id, chrom, strand,
        start, end]`` and ``callback`` is a generator function producing one
        row per region.

        The callback reads ``stranded``, ``multiple``, ``whitelist``,
        ``blacklist``, ``uniq_only`` and ``library_type`` from ``self``; none
        of them is set in this method.
        """
        annotation = GTF(self.fname)
        # ETA is defined elsewhere; presumably a progress reporter -- TODO confirm.
        progress = ETA(annotation.fsize(), fileobj=annotation)

        for gene in annotation.genes:
            progress.print_status(extra=gene.gene_name)

            region_starts = []
            region_ends = []
            # Runs of back-to-back constant regions, each a list of
            # (start, end) pairs, so that junction-spanning reads covering two
            # constant regions can be counted together.
            const_runs = []
            in_const_run = False
            for _num, r_start, r_end, is_const, _names in gene.regions:
                region_starts.append(r_start)
                region_ends.append(r_end)

                if not is_const:
                    in_const_run = False
                    continue
                if not in_const_run:
                    const_runs.append([])
                    in_const_run = True
                const_runs[-1].append((r_start, r_end))

            # NOTE(review): callback looks up ``gene`` and ``const_runs`` when
            # it runs, not when it is defined; this looks like it relies on the
            # consumer running it before requesting the next gene -- confirm
            # against the caller.
            def callback(bam, common_count, common_reads, common_cols):
                """Yield *common_cols* extended with the per-region columns.

                *common_count* is accepted but not used.
                """
                # Total reads over every run of constant regions.
                const_count = 0
                for run in const_runs:
                    run_starts = [lo for lo, _hi in run]
                    run_ends = [hi for _lo, hi in run]
                    hits, _ = _fetch_reads(bam, gene.chrom, gene.strand if self.stranded else None, run_starts, run_ends, self.multiple, False, self.whitelist, self.blacklist, self.uniq_only, self.library_type)
                    const_count += hits

                # One output row per region of the gene.
                for num, r_start, r_end, is_const, _names in gene.regions:
                    incl_count, incl_reads = _fetch_reads(bam, gene.chrom, gene.strand if self.stranded else None, [r_start], [r_end], self.multiple, False, self.whitelist, self.blacklist, self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(bam, gene.chrom, gene.strand if self.stranded else None, r_start, r_end, self.multiple, self.whitelist, self.blacklist, self.library_type)

                    # A read that excludes the region must not also be
                    # counted as including it.
                    for read in excl_reads:
                        if read in incl_reads:
                            incl_reads.remove(read)
                            incl_count -= 1

                    # Common reads that neither include nor exclude the region.
                    other_reads = sum(
                        1 for read in common_reads
                        if not (read in incl_reads or read in excl_reads))

                    altindex = (float(incl_count - excl_count) / other_reads
                                if other_reads > 0 else '')

                    # Fractions of the common reads; blank when there are none.
                    incl_pct = ''
                    excl_pct = ''
                    if len(common_reads) > 0:
                        incl_pct = float(incl_count) / len(common_reads)
                        excl_pct = float(excl_count) / len(common_reads)

                    row = common_cols[:]
                    row.extend([
                        r_start,
                        r_end,
                        const_count,
                        num,
                        'const' if is_const else 'alt',
                        incl_count,
                        excl_count,
                        incl_pct,
                        excl_pct,
                        altindex,
                    ])
                    yield row

            yield (
                gene.chrom,
                region_starts,
                region_ends,
                gene.strand,
                [gene.gene_name, gene.gene_id, gene.isoform_id, gene.chrom, gene.strand, gene.start, gene.end],
                callback,
            )
        progress.done()