예제 #1
0
    def test_merging_stop_codons_4(self):
        """
        Situation:
            * 1 stop codon split across two exons

        The scenario is checked on the positive strand first and then on the
        negative strand.
        """
        def check(interval_rows, exon_rows, cds_rows,
                  expected_cdses, expected_utrs):
            # Convert the raw rows, call the function under test and compare
            # its two results as plain lists.
            intervals = list_to_intervals(interval_rows)
            exons = list_to_intervals(exon_rows)
            cdses = list_to_intervals(cds_rows)
            result = segment._get_non_cds_exons(cdses, exons, intervals)
            new_cdses, utrs = result
            new_cdses = intervals_to_list(new_cdses)
            utrs = intervals_to_list(utrs)
            self.assertEqual(expected_cdses, new_cdses)
            self.assertEqual(expected_utrs, utrs)

        # Positive strand: stop codon pieces at 40 and 60-61.
        check(
            interval_rows=[
                ['1', '.', 'transcript', '20', '70', '.', '+', '.', '.'],
                ['1', '.', 'stop_codon', '40', '40', '.', '+', '.', '.'],
                ['1', '.', 'stop_codon', '60', '61', '.', '+', '.', '.'],
            ],
            exon_rows=[
                ['1', '.', 'exon', '20', '40', '.', '+', '.', '.'],
                ['1', '.', 'exon', '60', '70', '.', '+', '.', '.'],
            ],
            cds_rows=[
                ['1', '.', 'CDS', '30', '39', '.', '+', '.', '.'],
            ],
            expected_cdses=[
                ['1', '.', 'CDS', '30', '40', '.', '+', '.', '.'],
                ['1', '.', 'CDS', '60', '61', '.', '+', '.', '.'],
            ],
            expected_utrs=[
                ['1', '.', 'UTR5', '20', '29', '.', '+', '.', '.'],
                ['1', '.', 'UTR3', '62', '70', '.', '+', '.', '.'],
            ],
        )

        # Negative strand: stop codon pieces at 39-40 and 60.
        check(
            interval_rows=[
                ['1', '.', 'transcript', '20', '80', '.', '-', '.', '.'],
                ['1', '.', 'stop_codon', '39', '40', '.', '-', '.', '.'],
                ['1', '.', 'stop_codon', '60', '60', '.', '-', '.', '.'],
            ],
            exon_rows=[
                ['1', '.', 'exon', '20', '40', '.', '-', '.', '.'],
                ['1', '.', 'exon', '60', '80', '.', '-', '.', '.'],
            ],
            cds_rows=[
                ['1', '.', 'CDS', '61', '65', '.', '-', '.', '.'],
            ],
            expected_cdses=[
                ['1', '.', 'CDS', '60', '65', '.', '-', '.', '.'],
                ['1', '.', 'CDS', '39', '40', '.', '-', '.', '.'],
            ],
            expected_utrs=[
                ['1', '.', 'UTR3', '20', '38', '.', '-', '.', '.'],
                ['1', '.', 'UTR5', '66', '80', '.', '-', '.', '.'],
            ],
        )
예제 #2
0
    def test_1(self):
        """
        Situation:
            * no stop codons
            * 1 "empty" exon before the first CDS
            * 1 "empty" exon after the last CDS
            * 1 exon shared by UTR5 and CDS
            * 1 exon shared by UTR3 and CDS

        The same coordinates are then re-run with the strand reversed.
        """
        transcript = list_to_intervals([
            # a single transcript row is all this test needs
            ['1', '.', 'transcript', '20', '90', '.', '+', '.', '.'],
        ])
        exon_intervals = list_to_intervals([
            ['1', '.', 'exon', '20', '30', '.', '+', '.', '.'],
            ['1', '.', 'exon', '40', '50', '.', '+', '.', '.'],
            ['1', '.', 'exon', '60', '70', '.', '+', '.', '.'],
            ['1', '.', 'exon', '80', '90', '.', '+', '.', '.'],
        ])
        cds_intervals = list_to_intervals([
            ['1', '.', 'CDS', '45', '50', '.', '+', '.', '.'],
            ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
        ])

        expected_cdses = [
            ['1', '.', 'CDS', '45', '50', '.', '+', '.', '.'],
            ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
        ]
        expected_utrs = [
            ['1', '.', 'UTR5', '20', '30', '.', '+', '.', '.'],
            ['1', '.', 'UTR5', '40', '44', '.', '+', '.', '.'],
            ['1', '.', 'UTR3', '66', '70', '.', '+', '.', '.'],
            ['1', '.', 'UTR3', '80', '90', '.', '+', '.', '.'],
        ]
        result = segment._get_non_cds_exons(
            cds_intervals, exon_intervals, transcript)
        got_cdses = intervals_to_list(result[0])
        got_utrs = intervals_to_list(result[1])
        self.assertEqual(expected_cdses, got_cdses)
        self.assertEqual(expected_utrs, got_utrs)

        # Negative strand: same coordinates, strand flipped on every input.
        transcript = reverse_strand(transcript)
        exon_intervals = reverse_strand(exon_intervals)
        cds_intervals = reverse_strand(cds_intervals)

        expected_cdses = reverse_strand(expected_cdses)
        # The UTR5/UTR3 labels are swapped relative to the positive strand.
        expected_utrs = [
            ['1', '.', 'UTR3', '20', '30', '.', '-', '.', '.'],
            ['1', '.', 'UTR3', '40', '44', '.', '-', '.', '.'],
            ['1', '.', 'UTR5', '66', '70', '.', '-', '.', '.'],
            ['1', '.', 'UTR5', '80', '90', '.', '-', '.', '.'],
        ]

        result = segment._get_non_cds_exons(
            cds_intervals, exon_intervals, transcript)
        got_cdses = intervals_to_list(result[0])
        got_utrs = intervals_to_list(result[1])
        self.assertEqual(expected_cdses, got_cdses)
        self.assertEqual(expected_utrs, got_utrs)
예제 #3
0
    def test_merging_stop_codons_1(self):
        """
        Situation:
            * stop codon and CDS completely overlap

        Checked on the positive strand first and then on the negative strand.
        No UTRs are expected in either case.
        """
        def check(interval_rows, exon_rows, cds_rows,
                  expected_cdses, expected_utrs):
            # Convert the raw rows, call the function under test and compare
            # its two results as plain lists.
            intervals = list_to_intervals(interval_rows)
            exons = list_to_intervals(exon_rows)
            cdses = list_to_intervals(cds_rows)
            result = segment._get_non_cds_exons(cdses, exons, intervals)
            new_cdses, utrs = result
            new_cdses = intervals_to_list(new_cdses)
            utrs = intervals_to_list(utrs)
            self.assertEqual(expected_cdses, new_cdses)
            self.assertEqual(expected_utrs, utrs)

        # Positive strand: stop codon 60-62 coincides with the last CDS.
        check(
            interval_rows=[
                ['1', '.', 'transcript', '20', '62', '.', '+', '.', '.'],
                ['1', '.', 'stop_codon', '60', '62', '.', '+', '.', '.'],
            ],
            exon_rows=[
                ['1', '.', 'exon', '20', '40', '.', '+', '.', '.'],
                ['1', '.', 'exon', '60', '62', '.', '+', '.', '.'],
            ],
            cds_rows=[
                ['1', '.', 'CDS', '20', '40', '.', '+', '.', '.'],
                ['1', '.', 'CDS', '60', '62', '.', '+', '.', '.'],
            ],
            expected_cdses=[
                ['1', '.', 'CDS', '20', '40', '.', '+', '.', '.'],
                ['1', '.', 'CDS', '60', '62', '.', '+', '.', '.'],
            ],
            expected_utrs=[],
        )

        # Negative strand: stop codon 20-22 coincides with the first CDS.
        check(
            interval_rows=[
                ['1', '.', 'transcript', '20', '80', '.', '-', '.', '.'],
                ['1', '.', 'stop_codon', '20', '22', '.', '-', '.', '.'],
            ],
            # NOTE(review): these exon rows are on '+' while the transcript
            # and CDS rows are on '-' - looks unintended, confirm before
            # changing.
            exon_rows=[
                ['1', '.', 'exon', '20', '22', '.', '+', '.', '.'],
                ['1', '.', 'exon', '60', '80', '.', '+', '.', '.'],
            ],
            cds_rows=[
                ['1', '.', 'CDS', '20', '22', '.', '-', '.', '.'],
                ['1', '.', 'CDS', '60', '80', '.', '-', '.', '.'],
            ],
            expected_cdses=[
                ['1', '.', 'CDS', '20', '22', '.', '-', '.', '.'],
                ['1', '.', 'CDS', '60', '80', '.', '-', '.', '.'],
            ],
            expected_utrs=[],
        )
예제 #4
0
    def test_merging_stop_codons_3(self):
        """
        Situation:
            * 1 stop codon given on the same exon as the CDS, but inside CDS!

        Checked on the positive strand first and then on the negative strand.
        """
        def check(interval_rows, exon_rows, cds_rows,
                  expected_cdses, expected_utrs):
            # Convert the raw rows, call the function under test and compare
            # its two results as plain lists.
            intervals = list_to_intervals(interval_rows)
            exons = list_to_intervals(exon_rows)
            cdses = list_to_intervals(cds_rows)
            result = segment._get_non_cds_exons(cdses, exons, intervals)
            new_cdses, utrs = result
            new_cdses = intervals_to_list(new_cdses)
            utrs = intervals_to_list(utrs)
            self.assertEqual(expected_cdses, new_cdses)
            self.assertEqual(expected_utrs, utrs)

        # Positive strand: stop codon 63-65 lies within CDS 60-65.
        check(
            interval_rows=[
                ['1', '.', 'transcript', '60', '70', '.', '+', '.', '.'],
                ['1', '.', 'stop_codon', '63', '65', '.', '+', '.', '.'],
            ],
            exon_rows=[
                ['1', '.', 'exon', '60', '70', '.', '+', '.', '.'],
            ],
            cds_rows=[
                ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
            ],
            expected_cdses=[
                ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
            ],
            expected_utrs=[
                ['1', '.', 'UTR3', '66', '70', '.', '+', '.', '.'],
            ],
        )

        # Negative strand: stop codon 65-67 lies within CDS 65-70.
        check(
            interval_rows=[
                ['1', '.', 'transcript', '60', '70', '.', '-', '.', '.'],
                ['1', '.', 'stop_codon', '65', '67', '.', '-', '.', '.'],
            ],
            exon_rows=[
                ['1', '.', 'exon', '60', '70', '.', '-', '.', '.'],
            ],
            cds_rows=[
                ['1', '.', 'CDS', '65', '70', '.', '-', '.', '.'],
            ],
            expected_cdses=[
                ['1', '.', 'CDS', '65', '70', '.', '-', '.', '.'],
            ],
            expected_utrs=[
                ['1', '.', 'UTR3', '60', '64', '.', '-', '.', '.'],
            ],
        )
예제 #5
0
    def test_negative_strand(self):
        """
        Run ``rnamaps.run`` on a negative-strand copy of the annotation.

        The whole read lies in a single segment of a single transcript, and
        that segment borders on the (downstream) intergenic region.
        """
        # Replace the strand field (index 6) of every annotation row by '-'.
        positive_rows = intervals_to_list(self.gtf_data)
        negative_rows = [row[:6] + ['-'] + row[7:] for row in positive_rows]
        gtf_neg = make_file_from_list(negative_rows)

        # Each segment is (qname, flag, refname, pos, mapq, cigar, tags).
        read = ('name2:rbc:CCCC', 16, 0, 549, 255, [(0, 30)], {'NH': 1})
        bam = make_bam_file({
            'chromosomes': [('1', 1000)],
            'segments': [read],
        })

        # NOTE(review): the header row has 5 columns, the data rows have 4 -
        # confirm this matches what make_list_from_file yields for the output.
        header = ['RNAmap', 'type', 'position', 'all', 'explicit']
        expected = [
            header,
            ['CDS-intergenic', '20', '0.5', '0'],
            ['intergenic-CDS', '-80', '0.5', '0'],
        ]

        rnamaps.run(
            bam, gtf_neg, self.out, self.strange, self.cross_tr,
            mismatches=1, implicit_handling='split',
        )
        produced = make_list_from_file(self.out)
        self.assertEqual(expected, produced)
예제 #6
0
    def test_all_good(self):
        """
        Run ``segment.get_regions`` on a one-gene annotation and compare the
        file it returns, row by row, with the expected regions.
        """
        # Attribute strings that occur more than once.
        in_tr = 'gene_id "G2"; transcript_id "T3";'
        out_tr = 'gene_id "G2";transcript_id "T3"; biotype ".";'
        no_gene = 'gene_id "."; transcript_id ".";'

        gtf_in_data = list_to_intervals([
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', in_tr],
            ['1', '.', 'exon', '400', '430', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', in_tr],
            ['1', '.', 'exon', '470', '500', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', in_tr],
        ])
        gtf_in_file = make_file_from_list(intervals_to_list(gtf_in_data))

        # Only the name of a closed temporary file is needed; delete=False
        # keeps the file on disk after it is closed.
        with tempfile.NamedTemporaryFile(mode='w+', delete=False) as handle:
            gtf_out_name = handle.name

        chromosome_sizes = [
            ['1', '2000'],
            ['MT', '500'],
        ]
        genome_file = make_file_from_list(chromosome_sizes, bedtool=False)

        regions_file = segment.get_regions(
            gtf_in_file, gtf_out_name, genome_file)
        region_rows = make_list_from_file(regions_file, fields_separator='\t')
        gtf_out_data = list_to_intervals(region_rows)

        expected = list_to_intervals([
            ['1', '.', 'intergenic', '1', '399', '.', '+', '.', no_gene],
            ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', no_gene],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', out_tr],
            ['1', '.', 'UTR5', '400', '409', '.', '+', '.',
             'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'gene', '400', '500', '.', '+', '.',
             'gene_id "G2"; biotype "[.]";'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', out_tr],
            ['1', '.', 'intron', '431', '469', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; biotype ".";'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', out_tr],
            ['1', '.', 'UTR3', '491', '500', '.', '+', '.',
             'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', no_gene],
        ])

        self.assertEqual(expected, gtf_out_data)
예제 #7
0
    def setUp(self):
        """
        Build the shared positive-strand annotation (genes G1 and G2 on
        chromosome '1') and obtain temporary file names for the outputs.
        """
        warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

        # Attribute strings shared by several rows.
        no_gene = 'gene_id "."; transcript_id ".";'
        t1 = 'gene_id "G1"; transcript_id "T1";'
        t2 = 'gene_id "G1"; transcript_id "T2";'
        t3 = 'gene_id "G2"; transcript_id "T3";'

        rows = [
            ['1', '.', 'intergenic', '1', '99', '.', '+', '.', no_gene],

            # Gene #1:
            ['1', '.', 'gene', '100', '499', '.', '+', '.', 'gene_id "G1";'],
            # Transcript #1
            ['1', '.', 'transcript', '100', '249', '.', '+', '.', t1],
            ['1', '.', 'UTR5', '100', '149', '.', '+', '.',
             t1 + ' exon_number "1";'],
            ['1', '.', 'intron', '150', '199', '.', '+', '.', t1],
            ['1', '.', 'CDS', '200', '229', '.', '+', '.',
             t1 + ' exon_number "2";'],
            ['1', '.', 'intron', '230', '239', '.', '+', '.', t1],
            ['1', '.', 'UTR3', '240', '249', '.', '+', '.',
             t1 + ' exon_number "3";'],
            # Transcript #2
            ['1', '.', 'transcript', '240', '499', '.', '+', '.', t2],
            ['1', '.', 'CDS', '240', '299', '.', '+', '.',
             t2 + ' exon_number "1";'],
            ['1', '.', 'intron', '300', '399', '.', '+', '.', t2],
            ['1', '.', 'CDS', '400', '499', '.', '+', '.',
             t2 + ' exon_number "2";'],

            # intergenic
            ['1', '.', 'intergenic', '500', '599', '.', '+', '.', no_gene],

            # Gene #2:
            ['1', '.', 'gene', '600', '999', '.', '+', '.', 'gene_id "G2";'],
            # Transcript #3
            ['1', '.', 'transcript', '600', '799', '.', '+', '.', t3],
            ['1', '.', 'CDS', '600', '649', '.', '+', '.',
             t3 + ' exon_number "1";'],
            ['1', '.', 'intron', '650', '749', '.', '+', '.', t3],
            ['1', '.', 'CDS', '750', '799', '.', '+', '.',
             t3 + ' exon_number "2";'],
        ]
        self.gtf_data = list_to_intervals(rows)
        self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))

        # File names handed to the code under test by the individual tests.
        self.strange = get_temp_file_name()
        self.cross_tr = get_temp_file_name()
        self.out = get_temp_file_name()
예제 #8
0
    def setUp(self):
        """
        Build a tiny annotation (positions 1-9 on chromosome '1') with one
        gene, a coding transcript T1 and an ncRNA transcript T2.
        """
        warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

        rows = [
            ['1', '.', 'intergenic', '1', '2', '.', '+', '.',
             'gene_id "."; transcript_id ".";'],

            # Gene #1:
            ['1', '.', 'gene', '3', '7', '.', '+', '.', 'gene_id "G1";'],

            # Transcript #1
            ['1', '.', 'transcript', '3', '6', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1";'],
            ['1', '.', 'CDS', '3', '3', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
            ['1', '.', 'intron', '4', '6', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1";'],
            ['1', '.', 'UTR3', '5', '6', '.', '+', '.',
             'gene_id "G1"; transcript_id "T1"; exon_number "3";'],

            # Transcript #2
            ['1', '.', 'transcript', '4', '7', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2";'],
            ['1', '.', 'ncRNA', '4', '5', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
            ['1', '.', 'intron', '6', '6', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2";'],
            ['1', '.', 'ncRNA', '7', '7', '.', '+', '.',
             'gene_id "G1"; transcript_id "T2"; exon_number "2";'],

            # intergenic
            ['1', '.', 'intergenic', '8', '9', '.', '+', '.',
             'gene_id "."; transcript_id ".";'],
        ]
        self.gtf_data = list_to_intervals(rows)

        plain_rows = intervals_to_list(self.gtf_data)
        self.gtf = make_file_from_list(plain_rows)
예제 #9
0
    def test_all_good(self):
        """
        * the second gene has no 'gene' interval in the input, but one is
          expected in the output
        * the last interval is on chromosome 2 and is not expected in the
          output
        """
        # Attribute strings shared by several rows.
        t1 = 'gene_id "G1"; transcript_id "T1";'
        t2 = 'gene_id "G1"; transcript_id "T2";'
        t3 = 'gene_id "G2"; transcript_id "T3";'

        gtf_data = list_to_intervals([
            # index 0: gene G1
            ['1', '.', 'gene', '100', '300', '.', '+', '.', 'gene_id "G1";'],
            # indices 1-3: transcript T1
            ['1', '.', 'transcript', '100', '250', '.', '+', '.', t1],
            ['1', '.', 'exon', '100', '150', '.', '+', '.',
             t1 + ' exon_number "1";'],
            ['1', '.', 'exon', '200', '250', '.', '+', '.',
             t1 + ' exon_number "2";'],
            # indices 4-6: transcript T2
            ['1', '.', 'transcript', '150', '300', '.', '+', '.', t2],
            ['1', '.', 'exon', '150', '200', '.', '+', '.',
             t2 + ' exon_number "1";'],
            ['1', '.', 'exon', '250', '300', '.', '+', '.',
             t2 + ' exon_number "2";'],
            # indices 7-11: transcript T3 of gene G2 (no 'gene' row given)
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', t3],
            ['1', '.', 'exon', '400', '430', '.', '+', '.',
             t3 + ' exon_number "1"'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', t3],
            ['1', '.', 'exon', '470', '500', '.', '+', '.',
             t3 + ' exon_number "2"'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', t3],
            # last index: row on chromosome '2'
            ['2', '.', 'CDS', '470', '490', '.', '+', '.',
             'gene_id "G3"; transcript_id "T4";'],
        ])
        gtf = make_file_from_list(intervals_to_list(gtf_data))

        # Exactly two genes must come back; unpacking fails otherwise.
        genes = list(segment._get_gene_content(
            gtf, ['1', 'MT'], report_progress=True))
        gene1, gene2 = genes

        expected1 = {
            'gene': gtf_data[0],
            'T1': gtf_data[1:4],
            'T2': gtf_data[4:7],
        }

        # The 'gene' interval expected for G2, which the input does not have.
        extra_gene = create_interval_from_list(
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'])
        expected2 = {
            'gene': extra_gene,
            'T3': gtf_data[7:-1],
        }

        self.assertEqual(gene1, expected1)
        self.assertEqual(gene2, expected2)
예제 #10
0
    def test_all_good(self):
        """
        Run ``segment.get_segments`` on a one-gene annotation, compare the
        written output row by row with the expected segments, and check that
        two extra files exist next to the output file.
        """
        # Attribute strings that occur more than once.
        in_tr = 'gene_id "G2"; transcript_id "T3";'
        out_tr = 'gene_id "G2";transcript_id "T3"; biotype ".";'
        no_gene = 'gene_id "."; transcript_id ".";'

        gtf_in_data = list_to_intervals([
            ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', in_tr],
            ['1', '.', 'exon', '400', '430', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', in_tr],
            ['1', '.', 'exon', '470', '500', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', in_tr],
        ])
        gtf_in_file = make_file_from_list(intervals_to_list(gtf_in_data))

        gtf_out = get_temp_file_name()

        chromosome_sizes = [
            ['1', '2000'],
            ['MT', '500'],
        ]
        genome_file = make_file_from_list(chromosome_sizes, bedtool=False)

        segment.get_segments(gtf_in_file, gtf_out, genome_file)
        written_rows = make_list_from_file(gtf_out, fields_separator='\t')
        gtf_out_data = list_to_intervals(written_rows)

        expected = list_to_intervals([
            ['1', '.', 'intergenic', '1', '399', '.', '+', '.', no_gene],
            ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', no_gene],
            ['1', '.', 'transcript', '400', '500', '.', '+', '.', out_tr],
            ['1', '.', 'UTR5', '400', '409', '.', '+', '.',
             'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'gene', '400', '500', '.', '+', '.',
             'gene_id "G2"; biotype "[.]";'],
            ['1', '.', 'CDS', '410', '430', '.', '+', '.', out_tr],
            ['1', '.', 'intron', '431', '469', '.', '+', '.',
             'gene_id "G2"; transcript_id "T3"; biotype ".";'],
            ['1', '.', 'CDS', '470', '490', '.', '+', '.', out_tr],
            ['1', '.', 'UTR3', '491', '500', '.', '+', '.',
             'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
            ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', no_gene],
            ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', no_gene],
        ])

        self.assertEqual(expected, gtf_out_data)

        # Both side files must exist in the directory of the output file.
        out_dir = os.path.dirname(os.path.abspath(gtf_out))
        for file_name in (region.REGIONS_FILE, 'landmarks.bed.gz'):
            side_file = os.path.join(out_dir, file_name)
            self.assertTrue(os.path.isfile(side_file))
예제 #11
0
    def test_no_transcript_interval(self):
        """
        If no transcript interval is given, it is determined by the function.
        This is also the case where no CDS is given - all exons turn into
        ncRNA.
        """
        exon_rows = [
            ['1', '.', 'exon', '1', '30', '.', '+', '.', 'exon_number "1";'],
            ['1', '.', 'exon', '60', '100', '.', '+', '.', 'exon_number "2";'],
        ]
        exon_intervals = list_to_intervals(exon_rows)

        # Expected: a transcript spanning both exons, the intron between them,
        # and each exon relabelled as ncRNA with its attributes kept.
        expected_rows = [
            ['1', '.', 'transcript', '1', '100', '.', '+', '.', ''],
            ['1', '.', 'intron', '31', '59', '.', '+', '.', ''],
            ['1', '.', 'ncRNA', '1', '30', '.', '+', '.', 'exon_number "1";'],
            ['1', '.', 'ncRNA', '60', '100', '.', '+', '.', 'exon_number "2";'],
        ]

        processed = segment._process_transcript_group(exon_intervals)
        produced_rows = intervals_to_list(processed)
        self.assertEqual(produced_rows, expected_rows)
예제 #12
0
    def setUp(self):
        """
        Build the shared annotation (genes G1 and G2 on '+', gene G3 on '-',
        plus intergenic rows on both strands) and obtain temporary file names
        for the outputs.
        """
        warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

        rows = [
            ['1', '.', 'intergenic', '1', '799', '.', '-', '.',
             attrs(tid='.', iid='interN00000')],
            ['1', '.', 'intergenic', '1', '99', '.', '+', '.',
             attrs(tid='.', iid='interP00000')],

            # Gene #1:
            ['1', '.', 'gene', '100', '499', '.', '+', '.',
             attrs('G1', bio='.')],
            # Transcript #1
            ['1', '.', 'transcript', '100', '249', '.', '+', '.',
             attrs('G1', 'T1', bio='.')],
            ['1', '.', 'UTR5', '100', '149', '.', '+', '.',
             attrs('G1', 'T1', 1, bio='.')],
            ['1', '.', 'intron', '150', '199', '.', '+', '.',
             attrs('G1', 'T1', bio='.')],
            ['1', '.', 'CDS', '200', '229', '.', '+', '.',
             attrs('G1', 'T1', 2, bio='.')],
            ['1', '.', 'intron', '230', '239', '.', '+', '.',
             attrs('G1', 'T1', bio='.')],
            ['1', '.', 'UTR3', '240', '249', '.', '+', '.',
             attrs('G1', 'T1', 3, bio='.')],
            # Transcript #2
            ['1', '.', 'transcript', '240', '499', '.', '+', '.',
             attrs('G1', 'T2', bio='.')],
            ['1', '.', 'CDS', '240', '299', '.', '+', '.',
             attrs('G1', 'T2', 1, bio='.')],
            # NOTE(review): this intron sits between the T2 CDS rows but is
            # tagged 'T1' - possibly a typo for 'T2'; confirm before changing.
            ['1', '.', 'intron', '300', '399', '.', '+', '.',
             attrs('G1', 'T1', bio='.')],
            ['1', '.', 'CDS', '400', '499', '.', '+', '.',
             attrs('G1', 'T2', 2, bio='.')],

            # intergenic
            ['1', '.', 'intergenic', '500', '599', '.', '+', '.',
             attrs(tid='.', iid='interP00001')],

            # Gene #2:
            ['1', '.', 'gene', '600', '799', '.', '+', '.',
             attrs('G2', bio='.')],
            # Transcript #3
            ['1', '.', 'transcript', '600', '799', '.', '+', '.',
             attrs('G2', 'T3', bio='.')],
            ['1', '.', 'CDS', '600', '649', '.', '+', '.',
             attrs('G2', 'T3', 1, bio='.')],
            ['1', '.', 'intron', '650', '749', '.', '+', '.',
             attrs('G2', 'T3', bio='.')],
            ['1', '.', 'CDS', '750', '799', '.', '+', '.',
             attrs('G2', 'T3', 2, bio='.')],
            ['1', '.', 'intergenic', '800', '999', '.', '+', '.',
             attrs(tid='.', iid='interP00002')],

            # Gene #3 (negative strand):
            ['1', '.', 'gene', '800', '899', '.', '-', '.',
             attrs('G3', bio='.')],
            # Transcript #4
            ['1', '.', 'transcript', '800', '899', '.', '-', '.',
             attrs('G3', 'T4', bio='.')],
            ['1', '.', 'CDS', '800', '899', '.', '-', '.',
             attrs('G3', 'T4', 1, bio='.')],
            ['1', '.', 'intergenic', '900', '999', '.', '-', '.',
             attrs(tid='.', iid='interN00001')],
        ]
        self.gtf_data = list_to_intervals(rows)

        plain_rows = intervals_to_list(self.gtf_data)
        self.gtf = make_file_from_list(plain_rows, extension='gtf')

        # File names handed to the code under test by the individual tests.
        self.strange = get_temp_file_name(extension='bam')
        self.cross_tr = get_temp_file_name(extension='tsv')
        self.out = get_temp_file_name(extension='tsv')