예제 #1
0
    def testFind(self):
        src = StringIO.StringIO('''\
chr1|test|gene|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1";
chr1|test|gene|2001|2100|0|+|.|gene_id "foo2"; transcript_id "bar2";
chr1|test|gene|3001|3100|0|+|.|gene_id "foo3"; transcript_id "bar3";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual(['foo1', 'foo2', 'foo3'], [x.gene_id for x in genes])

        found = False
        for gene in gtf.find('chr1', 1900, 2200):
            found = True
            self.assertEqual(gene.gid, 'foo2')
        self.assertTrue(found)

        found = False
        for gene in gtf.find('chr1', 1900, 2000):
            found = True
            self.assertEqual(gene.gid, 'foo2')
        self.assertTrue(found)

        found = False
        for gene in gtf.find('chr1', 2050, 2200):
            found = True
            self.assertEqual(gene.gid, 'foo2')
        self.assertTrue(found)
예제 #2
0
    def testGTFIso(self):
        src = StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1201|1300|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1001|1100|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual(len(genes), 1)
        self.assertEqual(['foo1'], [g.gene_id for g in genes])
        self.assertEqual(['iso1'], [
            g.gid for g in genes
        ])  # gid is the actual unique id (should be constant for isoforms)
        transcripts = list(genes[0].transcripts)
        self.assertEqual(['bar1', 'bar2'],
                         [t.transcript_id for t in transcripts])
        self.assertEqual(len(transcripts), 2)
        self.assertEqual(list(genes[0].regions),
                         [(1, 1000, 1100, True, 'bar1,bar2'),
                          (2, 1200, 1300, False, 'bar1'),
                          (3, 1400, 1500, True, 'bar1,bar2')])
예제 #3
0
    def testJunctionsIsoforms(self):
        gtf = GTF(fileobj=StringIO.StringIO('''\
test1|test|exon|10|20|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|30|40|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|90|100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|10|20|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
test1|test|exon|50|70|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
test1|test|exon|90|100|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
'''.replace('|', '\t')),
                  quiet=True)

        valid = '''\
>test1:16-20,29-33
ATGCGCGC
>test1:16-20,49-53
ATGCCTGA
>test1:16-20,89-93
ATGCTCGA
>test1:36-40,49-53
GATCCTGA
>test1:36-40,89-93
GATCTCGA
>test1:66-70,89-93
ATCGTCGA
'''
        out = StringIO.StringIO('')
        ngsutils.gtf.junctions.gtf_junctions(gtf,
                                             fa,
                                             fragment_size=4,
                                             min_size=8,
                                             out=out,
                                             quiet=True)
        self.assertEqual(out.getvalue(), valid)
예제 #4
0
파일: models.py 프로젝트: xuwei684/ngsutils
    def __init__(self, fname):
        """Load the GTF file `fname` and note which optional attributes it uses.

        The first gene yielded by the GTF is inspected to set two flags:
        `has_isoform` (an 'isoform_id' attribute is present) and
        `has_biotype` (a 'gene_biotype' attribute is present).  If the GTF
        yields no genes at all, both flags are False.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # next() with a default keeps a bare StopIteration from escaping the
        # constructor when the annotation contains no genes.
        first_gene = next(self.gtf.genes, None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes
예제 #5
0
    def testGTF(self):
        src = StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|exon|1201|1300|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|CDS|1051|1247|0|+|1|gene_id "foo"; transcript_id "bar";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|stop_codon|1248|1250|0|+|.|gene_id "foo"; transcript_id "bar";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        for gene in gtf.genes:
            self.assertEqual(gene.gene_id, 'foo')
            for transcript in gene.transcripts:
                self.assertEqual(transcript.transcript_id, 'bar')
                self.assertEqual(list(transcript.exons), [(1000, 1100),
                                                          (1200, 1300)])
예제 #6
0
    def testGeneOnly(self):
        src = StringIO.StringIO('''\
chr1|test|gene|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1";
chr1|test|gene|2001|2100|0|+|.|gene_id "foo2"; transcript_id "bar2";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual(['foo1', 'foo2'], [x.gene_id for x in genes])
        # gene start / end
        self.assertEqual((1000, 1100), (genes[0].start, genes[0].end))
        t = list(genes[0].transcripts)[0]
        # apply start / end to transcript and as an exon
        self.assertEqual((1000, 1100), (t.start, t.end))
        self.assertEqual((1000, 1100), (t.exons[0]))

        t = list(genes[1].transcripts)[0]
        # apply start / end to transcript and as an exon
        self.assertEqual((2000, 2100), (t.start, t.end))
        self.assertEqual((2000, 2100), (t.exons[0]))
예제 #7
0
    def testGTFNoIso(self):
        src = StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1201|1300|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1401|1500|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo"; transcript_id "bar1";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|exon|1401|1500|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo"; transcript_id "bar2";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo"; transcript_id "bar2";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual('foo', genes[0].gene_id)
        transcripts = list(genes[0].transcripts)
        self.assertEqual(len(transcripts), 2)
        self.assertEqual(list(genes[0].regions),
                         [(1, 1000, 1100, True, 'bar1,bar2'),
                          (2, 1200, 1300, False, 'bar1'),
                          (3, 1400, 1500, True, 'bar1,bar2')])
예제 #8
0
        elif arg in ['-frag', '-min']:
            last = arg
        elif arg == '-scramble':
            scramble = True
        elif arg == '-retain-introns':
            retain_introns = True
        elif arg == '-known':
            known = True
        elif arg == '-h':
            usage()
        elif gtf is None and os.path.exists(arg):
            gtf = arg
        elif fasta is None and os.path.exists(arg):
            if not os.path.exists('%s.fai' % arg):
                usage('Missing .fai FASTA index for file: %s' % arg)
            fasta = arg

    if not gtf or not fasta:
        usage()

    if known and scramble:
        usage("You can not use both -known and -scramble at the same time!")

    gtf_junctions(GTF(gtf),
                  fasta,
                  frag_size,
                  min_size,
                  known=known,
                  scramble=scramble,
                  retain_introns=retain_introns)
예제 #9
0
            last = None
        elif arg == '-noheader':
            header = False
        elif arg == '-gene_id':
            gene_id = True
        elif arg == '-transcript_id':
            transcript_id = True
        elif arg == '-gene_name':
            gene_name = True
        elif arg == '-gene_location':
            gene_location = True
        elif arg in ['-ref', '-pos']:
            last = arg
        elif not gtffile and os.path.exists(arg):
            gtffile = arg
        elif not infile and os.path.exists(arg):
            infile = arg
        else:
            print 'Unknown argument: %s' % arg

    if not gtffile:
        usage('Missing GTF file')
    if not infile:
        usage('Missing input file')
    if not (gene_name or gene_location or gene_id or transcript_id):
        usage('Missing outputs - nothing to annotate')

    gtf = GTF(gtffile)
    gtf_annotate(gtf, infile, ref_col, pos_col, gene_name, gene_location,
                 gene_id, transcript_id, header)
예제 #10
0
            maxtx = max(size, maxtx)
            maxintron = max(intron_size, maxintron)
            maxcoding = max(cds, maxcoding)
            maxutr5 = max(utr5, maxutr5)
            maxutr3 = max(utr3, maxutr3)

        cols.append(maxtx)
        cols.append(maxintron)
        cols.append(maxcoding)
        cols.append(maxutr5)
        cols.append(maxutr3)

        out.write('%s\n' % '\t'.join([str(x) for x in cols]))


if __name__ == '__main__':
    # Command line: exactly one argument, the path of an existing GTF file.
    # '-h', a second path, or a path that does not exist all go to usage().
    fname = None
    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        if fname or not os.path.exists(arg):
            usage()
        else:
            fname = arg

    if not fname:
        usage()

    gtf = GTF(fname)
    gtf_genesize(gtf)
예제 #11
0
파일: tobed.py 프로젝트: xuwei684/ngsutils
            genes, exons, introns, regions, tss, tlss, txs, tlxs, utr_5, utr_3,
            junc_5, junc_3, promoter
    ]:
        if arg:
            i += 1

    if i == 0:
        usage('You must select one [type] to export.')
    elif i > 1:
        usage('You must select *only one* [type] to export.')
    elif not filename:
        usage('Missing input file')
    elif promoter and not (promoter_down or promoter_up):
        usage('You must specify a valid promoter length!')

    gtf = GTF(filename)

    if genes:
        gtf_genes_tobed(gtf)
    elif exons:
        gtf_exons_tobed(gtf)
    elif introns:
        gtf_introns_tobed(gtf)
    elif regions:
        gtf_regions_tobed(gtf)
    elif tss:
        gtf_tss_tobed(gtf)
    elif tlss:
        gtf_tlss_tobed(gtf)
    elif txs:
        gtf_txs_tobed(gtf)
예제 #12
0
 def testQuery(self):
     # Query chr1:1000-2000 with caching disabled and check the string form
     # of the first gene returned.
     genes = list(ngsutils.gtf.query.gtf_query(GTF(fname, cache_enabled=False), 'chr1', 1000, 2000))
     # assertEqual replaces the deprecated alias assertEquals.
     self.assertEqual(str(genes[0]), 'foo1(iso1) chr1:1000-2500[+]')
예제 #13
0
import unittest
import StringIO

import ngsutils.gtf.genesize
from ngsutils.gtf import GTF

# Shared fixture, parsed once at import time.  The literal uses '|' as the
# column separator and is converted to tabs before parsing.  It holds records
# for gene_ids foo1 and foo2 (both tagged isoform_id "iso1") and foo3 (tagged
# isoform_id "iso2").
gtf = GTF(fileobj=StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1201|1300|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1001|1100|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|2001|2100|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|exon|2201|2300|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|exon|2401|2500|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|CDS|2051|2447|0|+|1|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|start_codon|2051|2053|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|stop_codon|2448|2450|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
'''.replace('|', '\t')),
          quiet=True)


class GTFGeneSizeTest(unittest.TestCase):
    """Tests for ngsutils.gtf.genesize using the module-level GTF fixture."""

    def testGeneSize(self):
        # Run the gene-size report into an in-memory buffer instead of the
        # default output.
        buf = StringIO.StringIO('')
        ngsutils.gtf.genesize.gtf_genesize(gtf, out=buf)
예제 #14
0
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=None, show_all=False, fillin_stats=True):
    """Write a tab-delimited statistics report for one or more BAM files to stdout.

    One BamStats object is built per input file and the files are reported
    side by side, two columns per file.  Sections, in order: read totals,
    flag distribution, template length (only when the first file has
    template-length counts), per-tag average / max / distribution, reference
    counts and, when a GTF file is given, mapping-region counts with CPM.
    The tag, reference and region sections take their keys from the first
    file.

    Arguments:
        infiles -- sequence of BAM file names; each is opened with bam_open()
        gtf_file -- optional GTF file name; when given it is loaded with
                    GTF() and the "Mapping regions" section is written
        region, delim, show_all -- passed through unchanged to BamStats
        tags -- list passed through to BamStats (a new empty list when omitted)
        fillin_stats -- when true, integer values missing between two
                        consecutive distribution rows are written out as
                        zero-count rows
    """
    if tags is None:
        # A fresh list per call; a literal [] default would be shared by
        # every call that omits the argument.
        tags = []

    gtf = GTF(gtf_file) if gtf_file else None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    # Header row: one file name per two-column group.
    sys.stdout.write('\t')
    for fname in infiles:
        sys.stdout.write('%s\t\t' % fname)
    sys.stdout.write('\n')

    for label, attr in (('Reads', 'total'), ('Mapped', 'mapped'), ('Unmapped', 'unmapped')):
        sys.stdout.write('%s:\t' % label)
        for stat in stats:
            sys.stdout.write('%s\t\t' % getattr(stat, attr))
        sys.stdout.write('\n')

    sys.stdout.write('\nFlag distribution\n')
    # Only flags seen in at least one file are listed; maxsize is the widest
    # description among them and is used to pad the label column.
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        sys.stdout.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            count = stat.flag_counts.counts[flag]
            # A file with no reads reports 0% instead of dividing by zero.
            pct = float(count) * 100 / stat.total if stat.total else 0.0
            sys.stdout.write('\t%s\t%0.2f%%' % (count, pct))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    if stats[0].tlen_counts:
        sys.stdout.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            sys.stdout.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    # tag -> list of that tag's bins, one entry per file (same order as stats).
    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = []
        for stat in stats:
            stat_tags[tag].append(stat.tagbins[tag])

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        # Each value is followed by an empty pad column so that every file
        # still occupies two columns.  (The original guarded the pad with a
        # condition that was always true, so it is written unconditionally.)
        sys.stdout.write("Ave %s:" % tag)
        for tagbin in stat_tags[tag]:
            sys.stdout.write('\t%s\t' % tagbin.mean)
        sys.stdout.write('\n')

        sys.stdout.write("Max %s:" % tag)
        for tagbin in stat_tags[tag]:
            sys.stdout.write('\t%s\t' % tagbin.max)
        sys.stdout.write('\n')

        sys.stdout.write('%s distribution:\n' % tag)

        # Merge the per-file distribution generators into rows keyed by value.
        # gen_vals[i] holds the pending tuple for file i (None once consumed);
        # last_pcts[i] is the last percentage written for file i and is
        # repeated on rows where that file has no entry.
        gens = [stat.distribution_gen(tag) for stat in stats]
        gen_vals = [None] * len(stats)
        last_pcts = [0.0] * len(stats)

        good = True
        last = None

        while good:
            good = False
            for i, gen in enumerate(gens):
                if not gen_vals[i]:
                    try:
                        gen_vals[i] = next(gen)
                    except StopIteration:
                        pass

            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                # Every generator is exhausted.
                break

            # The next row is the smallest pending value for ascending tags
            # and the largest for descending ones.
            if asc:
                minval = min(vals)
            else:
                minval = max(vals)

            # Fill in integer values skipped since the previous row.  The
            # isinstance test (rather than truthiness) also fills gaps that
            # follow a value of 0.
            if fillin_stats and isinstance(last, int):
                step = 1 if asc else -1
                last += step
                while (last < minval) if asc else (last > minval):
                    sys.stdout.write('%s' % last)
                    for pct in last_pcts:
                        sys.stdout.write('\t0\t%s' % pct)
                    sys.stdout.write('\n')
                    last += step

            last = minval
            sys.stdout.write(str(minval))

            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    sys.stdout.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    gen_vals[i] = None
                    good = True
                else:
                    sys.stdout.write('\t0\t%s' % (last_pcts[i]))
            sys.stdout.write('\n')
        sys.stdout.write('\n')

    sys.stdout.write('Reference counts')
    for stat in stats:
        sys.stdout.write('\tcount\t')
    sys.stdout.write('\n')
    for k in sorted(stats[0].refs):
        sys.stdout.write('%s' % k)
        for stat in stats:
            sys.stdout.write('\t%s\t' % stat.refs[k])
        sys.stdout.write('\n')

    if gtf_file:
        sys.stdout.write('Mapping regions')
        for stat in stats:
            sys.stdout.write('\tcount\tCPM')
        sys.stdout.write('\n')
        for k in sorted(stats[0].regiontagger.counts):
            sys.stdout.write('%s' % k)
            for stat in stats:
                count = stat.regiontagger.counts[k]
                # Counts per million mapped reads: count / (mapped / 1e6).
                # A file with no mapped reads reports 0.0.
                cpm = float(count) * 1000000 / stat.mapped if stat.mapped else 0.0
                sys.stdout.write('\t%s\t%s' % (count, cpm))
            sys.stdout.write('\n')