def test_add_biotype_attribute1(self):
    """
    Check the ``biotype`` attribute produced by ``_add_biotype_attribute``.

    The gene interval is expected to carry ``'A, B, G'``, every interval of
    ``transcript1`` is expected to carry ``'A'`` and every interval of
    ``transcript2`` is expected to carry ``'B'``.
    """
    gene_content = {
        'gene': create_interval_from_list(
            ['1', '.', 'gene', '1', '200', '.', '+', '.', 'gene_biotype "G";']
        ),
        'transcript1': list_to_intervals([
            ['1', '.', 'CDS', '1', '5', '.', '+', '.', 'gene_biotype "G"; transcript_biotype "A";'],
            ['1', '.', 'ncRNA', '1', '5', '.', '+', '.', 'gene_biotype "G"; transcript_biotype "A";'],
            ['1', '.', 'intron', '1', '5', '.', '+', '.', '.'],
        ]),
        'transcript2': list_to_intervals([
            ['1', '.', 'ncRNA', '1', '5', '.', '+', '.', 'gene_biotype "G"; transcript_biotype "B";'],
            ['1', '.', 'intron', '1', '5', '.', '+', '.', '.'],
        ]),
    }

    out = segment._add_biotype_attribute(gene_content)

    # Looking entries up by key (instead of branching on whatever keys happen
    # to be present) makes the test fail when an entry is missing, rather
    # than silently skipping its assertions.
    for key in ('gene', 'transcript1', 'transcript2'):
        self.assertIn(key, out)

    # In this case this is a single interval, not a list of intervals:
    self.assertEqual(out['gene'].attrs['biotype'], 'A, B, G')

    expected_biotypes = {'transcript1': 'A', 'transcript2': 'B'}
    for transcript_id, biotype in sorted(expected_biotypes.items()):
        for interval in out[transcript_id]:
            self.assertEqual(interval.attrs['biotype'], biotype)
def test_all_good(self):
    """
    Run ``segment.get_regions`` on a one-gene annotation and check the output.

    The input describes gene ``G2`` with a single transcript ``T3`` (two
    exons, two CDS); the genome file lists chromosomes ``1`` and ``MT``.
    The value returned by ``get_regions`` is read with
    ``make_list_from_file`` and compared with the expected intervals.
    """
    annotation = list_to_intervals([
        ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['1', '.', 'exon', '400', '430', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['1', '.', 'exon', '470', '500', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
    ])
    annotation_file = make_file_from_list(intervals_to_list(annotation))

    # Only the name of a closed, persistent temporary file is needed.
    with tempfile.NamedTemporaryFile(mode='w+', delete=False) as out_handle:
        out_path = out_handle.name

    chromosome_sizes = [
        ['1', '2000'],
        ['MT', '500'],
    ]
    genome_file = make_file_from_list(chromosome_sizes, bedtool=False)

    regions_file = segment.get_regions(annotation_file, out_path, genome_file)
    gtf_out_data = list_to_intervals(
        make_list_from_file(regions_file, fields_separator='\t'))

    # Attribute column shared by all intergenic rows.
    no_gene = 'gene_id "."; transcript_id ".";'
    expected = list_to_intervals([
        ['1', '.', 'intergenic', '1', '399', '.', '+', '.', no_gene],
        ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', no_gene],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', 'gene_id "G2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'UTR5', '400', '409', '.', '+', '.', 'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2"; biotype "[.]";'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', 'gene_id "G2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'intron', '431', '469', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; biotype ".";'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', 'gene_id "G2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'UTR3', '491', '500', '.', '+', '.', 'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', no_gene],
        ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', no_gene],
        ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', no_gene],
    ])

    self.assertEqual(expected, gtf_out_data)
def test_1(self):
    """
    Situation:

        * no stop codons
        * 1 "empty" exon before first cds
        * 1 "empty" exon after last cds
        * 1 exons shared by UTR5 and CDS
        * 1 exons shared by UTR3 and CDS
    """
    def split(cds_part, exon_part, transcript_part):
        """Call ``_get_non_cds_exons`` and convert both results to lists."""
        found_cdses, found_utrs = segment._get_non_cds_exons(
            cds_part, exon_part, transcript_part)
        return intervals_to_list(found_cdses), intervals_to_list(found_utrs)

    transcript = list_to_intervals([
        # Only the transcript interval itself is supplied for this test.
        ['1', '.', 'transcript', '20', '90', '.', '+', '.', '.'],
    ])
    exons = list_to_intervals([
        ['1', '.', 'exon', '20', '30', '.', '+', '.', '.'],
        ['1', '.', 'exon', '40', '50', '.', '+', '.', '.'],
        ['1', '.', 'exon', '60', '70', '.', '+', '.', '.'],
        ['1', '.', 'exon', '80', '90', '.', '+', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '45', '50', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
    ])

    expected_cdses = [
        ['1', '.', 'CDS', '45', '50', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
    ]
    expected_utrs = [
        ['1', '.', 'UTR5', '20', '30', '.', '+', '.', '.'],
        ['1', '.', 'UTR5', '40', '44', '.', '+', '.', '.'],
        ['1', '.', 'UTR3', '66', '70', '.', '+', '.', '.'],
        ['1', '.', 'UTR3', '80', '90', '.', '+', '.', '.'],
    ]

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)

    # Same scenario on the negative strand: UTR5 and UTR3 swap sides.
    transcript = reverse_strand(transcript)
    exons = reverse_strand(exons)
    cdses = reverse_strand(cdses)
    expected_cdses = reverse_strand(expected_cdses)
    expected_utrs = [
        ['1', '.', 'UTR3', '20', '30', '.', '-', '.', '.'],
        ['1', '.', 'UTR3', '40', '44', '.', '-', '.', '.'],
        ['1', '.', 'UTR5', '66', '70', '.', '-', '.', '.'],
        ['1', '.', 'UTR5', '80', '90', '.', '-', '.', '.'],
    ]

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)
def test_get_introns(self):
    """
    Check ``_get_introns`` on three exons separated by two gaps.

    The expected result holds one interval per gap (11-19 and 31-39) with
    the attribute column given in the expectation below.
    """
    exon_rows = [
        ['1', '.', 'exon', '1', '10', '.', '+', '.', 'transcript_id "42"; exon_number "1"'],
        ['1', '.', 'exon', '20', '30', '.', '+', '.', 'gene_name "42"; '],
        ['1', '.', 'exon', '40', '50', '.', '+', '.', 'gene_id "FHIT"; useless_data "3"'],
    ]
    gap_rows = [
        ['1', '.', 'exon', '11', '19', '.', '+', '.', 'transcript_id "42";'],
        ['1', '.', 'exon', '31', '39', '.', '+', '.', 'gene_id "FHIT";'],
    ]

    exons = list_to_intervals(exon_rows)
    expected = list_to_intervals(gap_rows)

    introns = segment._get_introns(exons)
    self.assertEqual(introns, expected)
def test_complement(self):
    """
    Check ``_complement`` for the ``+`` strand on three chromosomes.

    The expected intergenic rows are numbered consecutively with IDs
    ``interP00000`` .. ``interP00004``.
    """
    chromosome_sizes = [
        ['1', '2000'],
        ['2', '1000'],
        ['MT', '500'],
    ]
    genome_file = make_file_from_list(chromosome_sizes, bedtool=False)

    genes = list_to_intervals([
        ['1', '.', 'gene1', '200', '400', '.', '+', '.', '.'],
        ['1', '.', 'gene2', '300', '600', '.', '+', '.', '.'],
        ['1', '.', 'gene3', '200', '500', '.', '+', '.', '.'],
        ['2', '.', 'gene4', '100', '200', '.', '+', '.', '.'],
        ['2', '.', 'gene5', '100', '300', '.', '-', '.', '.'],
    ])

    complement_file = segment._complement(genes, genome_file, '+')
    complement = make_list_from_file(complement_file, fields_separator='\t')

    empty_col8 = 'ID "inter%s"; gene_id "."; transcript_id ".";'
    # (chromosome, start, end) of every expected intergenic region, in order.
    gaps = [
        ('1', '1', '199'),
        ('1', '601', '2000'),
        ('2', '1', '99'),
        ('2', '201', '1000'),
        ('MT', '1', '500'),
    ]
    expected = [
        [chrom, '.', 'intergenic', start, end, '.', '+', '.',
         empty_col8 % 'P{:05d}'.format(index)]
        for index, (chrom, start, end) in enumerate(gaps)
    ]

    self.assertEqual(complement, expected)
def test_all_good(self):
    """
    Run ``segment.get_segments`` on a one-gene annotation and check results.

    Besides comparing the written segmentation with the expected intervals,
    the test checks that ``region.REGIONS_FILE`` and ``landmarks.bed.gz``
    exist in the directory of the output file.
    """
    annotation = list_to_intervals([
        ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['1', '.', 'exon', '400', '430', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['1', '.', 'exon', '470', '500', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
    ])
    annotation_file = make_file_from_list(intervals_to_list(annotation))
    gtf_out = get_temp_file_name()
    chromosome_sizes = [
        ['1', '2000'],
        ['MT', '500'],
    ]
    genome_file = make_file_from_list(chromosome_sizes, bedtool=False)

    segment.get_segments(annotation_file, gtf_out, genome_file)
    gtf_out_data = list_to_intervals(
        make_list_from_file(gtf_out, fields_separator='\t'))

    # Attribute column shared by all intergenic rows.
    no_gene = 'gene_id "."; transcript_id ".";'
    expected = list_to_intervals([
        ['1', '.', 'intergenic', '1', '399', '.', '+', '.', no_gene],
        ['1', '.', 'intergenic', '1', '2000', '.', '-', '.', no_gene],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', 'gene_id "G2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'UTR5', '400', '409', '.', '+', '.', 'gene_id "G2";exon_number "1";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2"; biotype "[.]";'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', 'gene_id "G2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'intron', '431', '469', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; biotype ".";'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', 'gene_id "G2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'UTR3', '491', '500', '.', '+', '.', 'gene_id "G2";exon_number "2";transcript_id "T3"; biotype ".";'],
        ['1', '.', 'intergenic', '501', '2000', '.', '+', '.', no_gene],
        ['MT', '.', 'intergenic', '1', '500', '.', '+', '.', no_gene],
        ['MT', '.', 'intergenic', '1', '500', '.', '-', '.', no_gene],
    ])
    self.assertEqual(expected, gtf_out_data)

    # Additional files are expected next to the segmentation output.
    out_dir = os.path.dirname(os.path.abspath(gtf_out))
    for file_name in (region.REGIONS_FILE, 'landmarks.bed.gz'):
        self.assertTrue(os.path.isfile(os.path.join(out_dir, file_name)))
def test_no_exons(self):
    """
    Fail if no exons are given.

    A group holding only a transcript interval must make
    ``_process_transcript_group`` raise ``AssertionError``.
    """
    transcript_only = list_to_intervals([
        ['1', '.', 'transcript', '1', '100', '.', '+', '.', '.'],
    ])

    self.assertRaises(
        AssertionError, segment._process_transcript_group, transcript_only)
def test_filter_col8(self):
    """
    Check ``_filter_col8`` with its default keys and with explicit ``keys``.
    """
    rows = [
        ['1', '.', 'CDS', '1', '2', '.', '+', '.',
         'gene_name "B"; transcript_id "A"; key42 "A"; key43: "?";'],
    ]
    interval = list_to_intervals(rows)[0]

    # Without ``keys`` only gene_name and transcript_id are expected to stay.
    default_result = segment._filter_col8(interval)
    self.assertEqual(default_result, 'gene_name "B"; transcript_id "A";')

    # With explicit ``keys`` only the requested attributes are expected.
    selected_result = segment._filter_col8(interval, keys=['gene_name', 'key42'])
    self.assertEqual(selected_result, 'gene_name "B"; key42 "A";')
def setUp(self):
    """
    Build the segmentation fixture and reserve temporary file names.

    Sets ``self.gtf_data`` (intervals), ``self.gtf`` (file made from them)
    and the temporary names ``self.strange``, ``self.cross_tr``, ``self.out``.
    """
    warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

    def row(feature, start, end, col8):
        """Return one ``+`` strand row on chromosome ``1``."""
        return ['1', '.', feature, start, end, '.', '+', '.', col8]

    self.gtf_data = list_to_intervals([
        row('intergenic', '1', '99', 'gene_id "."; transcript_id ".";'),
        # Gene #1:
        row('gene', '100', '499', 'gene_id "G1";'),
        # Transcript #1
        row('transcript', '100', '249', 'gene_id "G1"; transcript_id "T1";'),
        row('UTR5', '100', '149', 'gene_id "G1"; transcript_id "T1"; exon_number "1";'),
        row('intron', '150', '199', 'gene_id "G1"; transcript_id "T1";'),
        row('CDS', '200', '229', 'gene_id "G1"; transcript_id "T1"; exon_number "2";'),
        row('intron', '230', '239', 'gene_id "G1"; transcript_id "T1";'),
        row('UTR3', '240', '249', 'gene_id "G1"; transcript_id "T1"; exon_number "3";'),
        # Transcript #2
        row('transcript', '240', '499', 'gene_id "G1"; transcript_id "T2";'),
        row('CDS', '240', '299', 'gene_id "G1"; transcript_id "T2"; exon_number "1";'),
        row('intron', '300', '399', 'gene_id "G1"; transcript_id "T2";'),
        row('CDS', '400', '499', 'gene_id "G1"; transcript_id "T2"; exon_number "2";'),
        # intergenic
        row('intergenic', '500', '599', 'gene_id "."; transcript_id ".";'),
        # Gene #2:
        row('gene', '600', '999', 'gene_id "G2";'),
        # Transcript #3
        row('transcript', '600', '799', 'gene_id "G2"; transcript_id "T3";'),
        row('CDS', '600', '649', 'gene_id "G2"; transcript_id "T3"; exon_number "1";'),
        row('intron', '650', '749', 'gene_id "G2"; transcript_id "T3";'),
        row('CDS', '750', '799', 'gene_id "G2"; transcript_id "T3"; exon_number "2";'),
    ])
    self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))

    # Temporary file names used by the tests of this class.
    self.strange = get_temp_file_name()
    self.cross_tr = get_temp_file_name()
    self.out = get_temp_file_name()
def test_check_consistency_fail3(self):
    """
    Unallowed order of types.

    UTR3 is placed before CDS on the ``+`` strand, so
    ``_check_consistency`` is expected to raise ``AssertionError``.
    """
    wrong_order = list_to_intervals([
        ['1', '.', 'transcript', '1', '100', '.', '+', '.', '.'],
        ['1', '.', 'UTR3', '1', '49', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '50', '100', '.', '+', '.', '.'],
    ])

    self.assertRaises(AssertionError, segment._check_consistency, wrong_order)
def test_check_consistency_fail2(self):
    """
    Overlapping intervals.

    UTR5 ends at 50 and CDS starts at 50, so ``_check_consistency`` is
    expected to raise ``AssertionError``.
    """
    overlapping = list_to_intervals([
        ['1', '.', 'transcript', '1', '100', '.', '+', '.', '.'],
        ['1', '.', 'UTR5', '1', '50', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '50', '100', '.', '+', '.', '.'],
    ])

    self.assertRaises(AssertionError, segment._check_consistency, overlapping)
def test_check_consistency_pass(self):  # pylint: disable=no-self-use
    """
    Feed ``_check_consistency`` a contiguous, non-overlapping transcript.

    The test has no explicit assertion: it succeeds when the call returns
    without raising ``AssertionError``.
    """
    rows = [
        ['1', '.', 'transcript', '1', '100', '.', '+', '.', '.'],
        ['1', '.', 'UTR5', '1', '9', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '10', '49', '.', '+', '.', '.'],
        ['1', '.', 'intron', '50', '59', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '60', '89', '.', '+', '.', '.'],
        ['1', '.', 'UTR3', '90', '100', '.', '+', '.', '.'],
    ]
    consistent = list_to_intervals(rows)

    # If no AssertionError is raised, this is a success:
    segment._check_consistency(consistent)
def setUp(self):
    """
    Build a small segmentation fixture on positions 1-9 of chromosome 1.

    Sets ``self.gtf_data`` (intervals) and ``self.gtf`` (file made from them).
    """
    warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

    def row(feature, start, end, col8):
        """Return one ``+`` strand row on chromosome ``1``."""
        return ['1', '.', feature, start, end, '.', '+', '.', col8]

    self.gtf_data = list_to_intervals([
        row('intergenic', '1', '2', 'gene_id "."; transcript_id ".";'),
        # Gene #1:
        row('gene', '3', '7', 'gene_id "G1";'),
        # Transcript #1
        row('transcript', '3', '6', 'gene_id "G1"; transcript_id "T1";'),
        row('CDS', '3', '3', 'gene_id "G1"; transcript_id "T1"; exon_number "2";'),
        # NOTE(review): this intron (4-6) overlaps the UTR3 below (5-6) -
        # confirm that the overlap is intended.
        row('intron', '4', '6', 'gene_id "G1"; transcript_id "T1";'),
        row('UTR3', '5', '6', 'gene_id "G1"; transcript_id "T1"; exon_number "3";'),
        # Transcript #2
        row('transcript', '4', '7', 'gene_id "G1"; transcript_id "T2";'),
        row('ncRNA', '4', '5', 'gene_id "G1"; transcript_id "T2"; exon_number "1";'),
        row('intron', '6', '6', 'gene_id "G1"; transcript_id "T2";'),
        row('ncRNA', '7', '7', 'gene_id "G1"; transcript_id "T2"; exon_number "2";'),
        # intergenic
        row('intergenic', '8', '9', 'gene_id "."; transcript_id ".";'),
    ])
    self.gtf = make_file_from_list(intervals_to_list(self.gtf_data))
def test_all_good(self):
    """
    Check the per-gene groups produced by ``_get_gene_content``.

    * second gene has no 'gene' interval - but it is present in output as
      it should
    * last interval is on chromosome 2, but it is not in the output
    """
    gtf_data = list_to_intervals([
        ['1', '.', 'gene', '100', '300', '.', '+', '.', 'gene_id "G1";'],
        ['1', '.', 'transcript', '100', '250', '.', '+', '.', 'gene_id "G1"; transcript_id "T1";'],
        ['1', '.', 'exon', '100', '150', '.', '+', '.', 'gene_id "G1"; transcript_id "T1"; exon_number "1";'],
        ['1', '.', 'exon', '200', '250', '.', '+', '.', 'gene_id "G1"; transcript_id "T1"; exon_number "2";'],
        ['1', '.', 'transcript', '150', '300', '.', '+', '.', 'gene_id "G1"; transcript_id "T2";'],
        ['1', '.', 'exon', '150', '200', '.', '+', '.', 'gene_id "G1"; transcript_id "T2"; exon_number "1";'],
        ['1', '.', 'exon', '250', '300', '.', '+', '.', 'gene_id "G1"; transcript_id "T2"; exon_number "2";'],
        ['1', '.', 'transcript', '400', '500', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['1', '.', 'exon', '400', '430', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; exon_number "1"'],
        ['1', '.', 'CDS', '410', '430', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['1', '.', 'exon', '470', '500', '.', '+', '.', 'gene_id "G2"; transcript_id "T3"; exon_number "2"'],
        ['1', '.', 'CDS', '470', '490', '.', '+', '.', 'gene_id "G2"; transcript_id "T3";'],
        ['2', '.', 'CDS', '470', '490', '.', '+', '.', 'gene_id "G3"; transcript_id "T4";'],
    ])
    gtf = make_file_from_list(intervals_to_list(gtf_data))

    # Exactly two genes are expected; unpacking fails on any other count.
    gene1, gene2 = segment._get_gene_content(
        gtf, ['1', 'MT'], report_progress=True)

    expected1 = {
        'gene': gtf_data[0],
        'T1': gtf_data[1:4],
        'T2': gtf_data[4:7],
    }

    # The input has no 'gene' row for G2, so the expected one is built here.
    extra_gene = create_interval_from_list(
        ['1', '.', 'gene', '400', '500', '.', '+', '.', 'gene_id "G2";'])
    expected2 = {
        'gene': extra_gene,
        'T3': gtf_data[7:-1],
    }

    self.assertEqual(gene1, expected1)
    self.assertEqual(gene2, expected2)
def test_merging_stop_codons_4(self):
    """
    Situation:

        * 1 stop codon split in two exons
    """
    def split(cds_part, exon_part, transcript_part):
        """Call ``_get_non_cds_exons`` and convert both results to lists."""
        found_cdses, found_utrs = segment._get_non_cds_exons(
            cds_part, exon_part, transcript_part)
        return intervals_to_list(found_cdses), intervals_to_list(found_utrs)

    # Positive strand:
    transcript = list_to_intervals([
        # Only the transcript and the two stop codon parts are supplied.
        ['1', '.', 'transcript', '20', '70', '.', '+', '.', '.'],
        ['1', '.', 'stop_codon', '40', '40', '.', '+', '.', '.'],
        ['1', '.', 'stop_codon', '60', '61', '.', '+', '.', '.'],
    ])
    exons = list_to_intervals([
        ['1', '.', 'exon', '20', '40', '.', '+', '.', '.'],
        ['1', '.', 'exon', '60', '70', '.', '+', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '30', '39', '.', '+', '.', '.'],
    ])
    expected_cdses = [
        ['1', '.', 'CDS', '30', '40', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '60', '61', '.', '+', '.', '.'],
    ]
    expected_utrs = [
        ['1', '.', 'UTR5', '20', '29', '.', '+', '.', '.'],
        ['1', '.', 'UTR3', '62', '70', '.', '+', '.', '.'],
    ]

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)

    # Negative strand:
    transcript = list_to_intervals([
        # Only the transcript and the two stop codon parts are supplied.
        ['1', '.', 'transcript', '20', '80', '.', '-', '.', '.'],
        ['1', '.', 'stop_codon', '39', '40', '.', '-', '.', '.'],
        ['1', '.', 'stop_codon', '60', '60', '.', '-', '.', '.'],
    ])
    exons = list_to_intervals([
        ['1', '.', 'exon', '20', '40', '.', '-', '.', '.'],
        ['1', '.', 'exon', '60', '80', '.', '-', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '61', '65', '.', '-', '.', '.'],
    ])
    expected_cdses = [
        ['1', '.', 'CDS', '60', '65', '.', '-', '.', '.'],
        ['1', '.', 'CDS', '39', '40', '.', '-', '.', '.'],
    ]
    expected_utrs = [
        ['1', '.', 'UTR3', '20', '38', '.', '-', '.', '.'],
        ['1', '.', 'UTR5', '66', '80', '.', '-', '.', '.'],
    ]

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)
def test_fail_validating(self, print_mock):
    """
    Fail on validation.

    Mock the print function to suppress the actual printing during test.
    The mock is expected to have been called 8 times once
    ``_process_transcript_group`` has raised ``AssertionError``.
    """
    group = list_to_intervals([
        ['1', '.', 'transcript', '1', '200', '.', '+', '.', 'transcript_id "42";'],
        ['1', '.', 'exon', '1', '30', '.', '+', '.', 'exon_number "1";'],
        ['1', '.', 'exon', '60', '100', '.', '+', '.', 'exon_number "2";'],
    ])

    self.assertRaises(
        AssertionError, segment._process_transcript_group, group)

    # Checked after the raising call, where the count is final.
    self.assertEqual(print_mock.call_count, 8)
def test_merging_stop_codons_1(self):
    """
    Situation:

        * stop codon and CDS completely overlap
    """
    def split(cds_part, exon_part, transcript_part):
        """Call ``_get_non_cds_exons`` and convert both results to lists."""
        found_cdses, found_utrs = segment._get_non_cds_exons(
            cds_part, exon_part, transcript_part)
        return intervals_to_list(found_cdses), intervals_to_list(found_utrs)

    # Positive strand:
    transcript = list_to_intervals([
        # Only the transcript and the stop codon are supplied.
        ['1', '.', 'transcript', '20', '62', '.', '+', '.', '.'],
        ['1', '.', 'stop_codon', '60', '62', '.', '+', '.', '.'],
    ])
    exons = list_to_intervals([
        ['1', '.', 'exon', '20', '40', '.', '+', '.', '.'],
        ['1', '.', 'exon', '60', '62', '.', '+', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '20', '40', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '60', '62', '.', '+', '.', '.'],
    ])
    expected_cdses = [
        ['1', '.', 'CDS', '20', '40', '.', '+', '.', '.'],
        ['1', '.', 'CDS', '60', '62', '.', '+', '.', '.'],
    ]
    expected_utrs = []

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)

    # Negative strand:
    transcript = list_to_intervals([
        # Only the transcript and the stop codon are supplied.
        ['1', '.', 'transcript', '20', '80', '.', '-', '.', '.'],
        ['1', '.', 'stop_codon', '20', '22', '.', '-', '.', '.'],
    ])
    # NOTE(review): these exons are on the '+' strand while the transcript
    # and the CDS rows are on '-' - confirm whether that is intended.
    exons = list_to_intervals([
        ['1', '.', 'exon', '20', '22', '.', '+', '.', '.'],
        ['1', '.', 'exon', '60', '80', '.', '+', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '20', '22', '.', '-', '.', '.'],
        ['1', '.', 'CDS', '60', '80', '.', '-', '.', '.'],
    ])
    expected_cdses = [
        ['1', '.', 'CDS', '20', '22', '.', '-', '.', '.'],
        ['1', '.', 'CDS', '60', '80', '.', '-', '.', '.'],
    ]
    expected_utrs = []

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)
def test_merging_stop_codons_3(self):
    """
    Situation:

        * 1 stop codon given on same exon as CDS, but inside CDS!
    """
    def split(cds_part, exon_part, transcript_part):
        """Call ``_get_non_cds_exons`` and convert both results to lists."""
        found_cdses, found_utrs = segment._get_non_cds_exons(
            cds_part, exon_part, transcript_part)
        return intervals_to_list(found_cdses), intervals_to_list(found_utrs)

    # Positive strand:
    transcript = list_to_intervals([
        # Only the transcript and the stop codon are supplied.
        ['1', '.', 'transcript', '60', '70', '.', '+', '.', '.'],
        ['1', '.', 'stop_codon', '63', '65', '.', '+', '.', '.'],
    ])
    exons = list_to_intervals([
        ['1', '.', 'exon', '60', '70', '.', '+', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
    ])
    expected_cdses = [
        ['1', '.', 'CDS', '60', '65', '.', '+', '.', '.'],
    ]
    expected_utrs = [
        ['1', '.', 'UTR3', '66', '70', '.', '+', '.', '.'],
    ]

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)

    # Negative strand:
    transcript = list_to_intervals([
        # Only the transcript and the stop codon are supplied.
        ['1', '.', 'transcript', '60', '70', '.', '-', '.', '.'],
        ['1', '.', 'stop_codon', '65', '67', '.', '-', '.', '.'],
    ])
    exons = list_to_intervals([
        ['1', '.', 'exon', '60', '70', '.', '-', '.', '.'],
    ])
    cdses = list_to_intervals([
        ['1', '.', 'CDS', '65', '70', '.', '-', '.', '.'],
    ])
    expected_cdses = [
        ['1', '.', 'CDS', '65', '70', '.', '-', '.', '.'],
    ]
    expected_utrs = [
        ['1', '.', 'UTR3', '60', '64', '.', '-', '.', '.'],
    ]

    actual_cdses, actual_utrs = split(cdses, exons, transcript)
    self.assertEqual(expected_cdses, actual_cdses)
    self.assertEqual(expected_utrs, actual_utrs)
def test_no_transcript_interval(self):
    """
    If no transcript interval is given, it is determined by the function.

    Also this is the case if no CDS are given - all exons turn to ncRNA.
    """
    exon_rows = [
        ['1', '.', 'exon', '1', '30', '.', '+', '.', 'exon_number "1";'],
        ['1', '.', 'exon', '60', '100', '.', '+', '.', 'exon_number "2";'],
    ]
    expected = [
        ['1', '.', 'transcript', '1', '100', '.', '+', '.', ''],
        ['1', '.', 'intron', '31', '59', '.', '+', '.', ''],
        ['1', '.', 'ncRNA', '1', '30', '.', '+', '.', 'exon_number "1";'],
        ['1', '.', 'ncRNA', '60', '100', '.', '+', '.', 'exon_number "2";'],
    ]

    processed = segment._process_transcript_group(list_to_intervals(exon_rows))
    self.assertEqual(intervals_to_list(processed), expected)
def setUp(self):
    """
    Build a two-strand segmentation fixture and reserve temporary file names.

    Sets ``self.gtf_data`` (intervals), ``self.gtf`` (file made from them,
    extension ``gtf``) and the temporary names ``self.strange`` (``bam``),
    ``self.cross_tr`` (``tsv``) and ``self.out`` (``tsv``).
    """
    warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

    def row(feature, start, end, strand, col8):
        """Return one row on chromosome ``1`` for the given strand."""
        return ['1', '.', feature, start, end, '.', strand, '.', col8]

    self.gtf_data = list_to_intervals([
        row('intergenic', '1', '799', '-', attrs(tid='.', iid='interN00000')),
        row('intergenic', '1', '99', '+', attrs(tid='.', iid='interP00000')),
        # Gene #1:
        row('gene', '100', '499', '+', attrs('G1', bio='.')),
        # Transcript #1
        row('transcript', '100', '249', '+', attrs('G1', 'T1', bio='.')),
        row('UTR5', '100', '149', '+', attrs('G1', 'T1', 1, bio='.')),
        row('intron', '150', '199', '+', attrs('G1', 'T1', bio='.')),
        row('CDS', '200', '229', '+', attrs('G1', 'T1', 2, bio='.')),
        row('intron', '230', '239', '+', attrs('G1', 'T1', bio='.')),
        row('UTR3', '240', '249', '+', attrs('G1', 'T1', 3, bio='.')),
        # Transcript #2
        row('transcript', '240', '499', '+', attrs('G1', 'T2', bio='.')),
        row('CDS', '240', '299', '+', attrs('G1', 'T2', 1, bio='.')),
        # NOTE(review): transcript id is 'T1' although this intron sits
        # among the T2 rows - confirm that this is intended.
        row('intron', '300', '399', '+', attrs('G1', 'T1', bio='.')),
        row('CDS', '400', '499', '+', attrs('G1', 'T2', 2, bio='.')),
        # intergenic
        row('intergenic', '500', '599', '+', attrs(tid='.', iid='interP00001')),
        # Gene #2:
        row('gene', '600', '799', '+', attrs('G2', bio='.')),
        # Transcript #3
        row('transcript', '600', '799', '+', attrs('G2', 'T3', bio='.')),
        row('CDS', '600', '649', '+', attrs('G2', 'T3', 1, bio='.')),
        row('intron', '650', '749', '+', attrs('G2', 'T3', bio='.')),
        row('CDS', '750', '799', '+', attrs('G2', 'T3', 2, bio='.')),
        row('intergenic', '800', '999', '+', attrs(tid='.', iid='interP00002')),
        # Gene #3:
        row('gene', '800', '899', '-', attrs('G3', bio='.')),
        # Transcript #4
        row('transcript', '800', '899', '-', attrs('G3', 'T4', bio='.')),
        row('CDS', '800', '899', '-', attrs('G3', 'T4', 1, bio='.')),
        row('intergenic', '900', '999', '-', attrs(tid='.', iid='interN00001')),
    ])
    self.gtf = make_file_from_list(
        intervals_to_list(self.gtf_data), extension='gtf')

    # Temporary file names used by the tests of this class.
    self.strange = get_temp_file_name(extension='bam')
    self.cross_tr = get_temp_file_name(extension='tsv')
    self.out = get_temp_file_name(extension='tsv')