def test_wrong_cds(self):
    # A six-exon, plus-strand transcript receives one extra CDS feature that
    # covers a single position (48051996). Finalizing it must emit a record
    # at WARNING level or above on the "null" logger; the object returned by
    # trim_coding (max_length=50) is then checked for its new boundaries.
    exon_coords = [
        (47631264, 47631416),
        (47704590, 47704669),
        (47762671, 47762742),
        (47893062, 47893093),
        (47895572, 47895655),
        (48051942, 48051999),
    ]
    cds_fields = [
        "15",
        "protein_coding",
        "CDS",
        "48051996",
        "48051996",
        ".",
        "+",
        "0",
        "ID=ENST00000560636.cds1;Parent=ENST00000560636",
    ]

    model = Transcript()
    model.chrom = "15"
    model.source = "protein_coding"
    model.start = 47631264
    model.end = 48051999
    model.strand = "+"
    model.add_exons(exon_coords)
    model.id = "ENST00000560636"
    model.parent = "ENSG00000137872"
    model.add_exon(GffLine("\t".join(cds_fields)))

    null_logger = Mikado.utilities.log_utils.create_null_logger()
    model.logger = null_logger
    with self.assertLogs("null", level="WARNING"):
        model.finalize()

    result = trim_coding(model, null_logger, max_length=50)
    self.assertEqual(result.start, 47631366)
    self.assertEqual(result.end, 48051992)
def test_coding_positive(self):
    # Build a two-exon, plus-strand transcript whose CDS features use the
    # same coordinates as its exons, then compare its "gff" and "gtf"
    # renderings (introns included) against fixed expected text.
    # NOTE(review): in this view the records inside the expected-text
    # literals are separated by single spaces; presumably they were line
    # breaks in the original file -- confirm before re-wrapping them.
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "+"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
    tr.id = "test1"
    tr.parent = "gene1"

    gff = tr.format("gff", with_introns=True)
    # Remove unittest's cap on diff length so a mismatch is shown in full.
    self.maxDiff = None
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tID=test1;Parent=gene1;Name=test1 Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tID=test1.CDS1;Parent=test1 Chr1\tMikado\texon\t101\t300\t.\t+\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t1\tID=test1.CDS2;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tID=test1.exon2;Parent=test1"""
    # The failure message carries both the produced and the expected text.
    self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))

    gtf = tr.format("gtf", with_introns=True)
    # The last field before the attributes on the second CDS record is 2
    # here, while the GFF expectation above has 1 for the same feature.
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1"; Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t2\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_non_coding_negative(self):
    # Build a two-exon, minus-strand transcript with no CDS features,
    # finalize it, and compare its "gff" and "gtf" renderings (introns
    # included) against fixed expected text.
    # NOTE(review): in this view the records inside the expected-text
    # literals are separated by single spaces; presumably they were line
    # breaks in the original file -- confirm before re-wrapping them.
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "-"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.id = "test1"
    tr.parent = "gene1"
    tr.finalize()

    gff = tr.format("gff", with_introns=True)
    # Remove unittest's cap on diff length so a mismatch is shown in full.
    self.maxDiff = None
    # The expected parent record uses the "transcript" feature type and has
    # no Name attribute.
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1 Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
    # The failure message carries both the produced and the expected text.
    self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))

    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_coding_negative(self):
    # Build a two-exon, minus-strand transcript whose CDS features use the
    # same coordinates as its exons, then compare its "gff" and "gtf"
    # renderings (introns included) against fixed expected text.
    # NOTE(review): in this view the records inside the expected-text
    # literals are separated by single spaces; presumably they were line
    # breaks in the original file -- confirm before re-wrapping them.
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "-"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
    tr.id = "test1"
    tr.parent = "gene1"
    # Phase 0, 0 because the first CDS exon is 300bp

    gff = tr.format("gff", with_introns=True)
    # Remove unittest's cap on diff length so a mismatch is shown in full.
    self.maxDiff = None
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1 Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1 Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
    # On failure the message also lists the string form of every entry in
    # tr.internal_orfs, after the produced and expected text.
    self.assertEqual(
        gff, res,
        "++++\n\n" + "\n+++\n".join(
            [gff, res, ",\t".join([str(_) for _ in tr.internal_orfs])]))

    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1"; Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_intronMatch(self):
    # A five-exon coding transcript (G2.1) is compared with the fixture
    # transcript self.t1: both must report is_coding, and
    # MonosublocusHolder.is_intersecting must be true both with its default
    # settings and with cds_only=True.
    exon_coords = [(101, 500), (601, 700), (1001, 1320),
                   (1451, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1320),
                  (1451, 1460), (1501, 1510)]

    other = Transcript()
    other.chrom = "Chr1"
    other.strand = "+"
    other.score = 1
    other.id = "G2.1"
    other.parent = "G2"
    other.start = 101
    other.end = 1600
    other.add_exons(exon_coords, "exon")
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    self.assertTrue(self.t1.is_coding)
    self.assertTrue(other.is_coding)

    whole_transcript = MonosublocusHolder.is_intersecting(
        self.t1, other, logger=self.logger)
    self.assertTrue(whole_transcript)

    coding_only = MonosublocusHolder.is_intersecting(
        self.t1, other, cds_only=True, logger=self.logger)
    self.assertTrue(coding_only)
def test_non_coding_negative(self):
    # Build a two-exon, minus-strand transcript with no CDS features,
    # finalize it, and compare its "gff" and "gtf" renderings (introns
    # included) against fixed expected text.
    # NOTE(review): in this view the records inside the expected-text
    # literals are separated by single spaces; presumably they were line
    # breaks in the original file -- confirm before re-wrapping them.
    tr = Transcript()
    tr.chrom = "Chr1"
    tr.start = 101
    tr.end = 2000
    tr.strand = "-"
    tr.add_exons([(101, 300), (1701, 2000)])
    tr.id = "test1"
    tr.parent = "gene1"
    tr.finalize()

    gff = tr.format("gff", with_introns=True)
    # Remove unittest's cap on diff length so a mismatch is shown in full.
    self.maxDiff = None
    # The expected parent record uses the "transcript" feature type and has
    # no Name attribute.
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1 Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1 Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1 Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
    # The failure message carries both the produced and the expected text.
    self.assertEqual(gff, res, "++++\n\n"+"\n+++\n".join([gff, res]))

    gtf = tr.format("gtf", with_introns=True)
    res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
    self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
def test_non_redundant_as(self):
    # First add a coding transcript (G2.1) to the fixture locus and check
    # that the locus then holds two transcripts. A second candidate (G3.1)
    # with different exon boundaries must be reported by
    # is_alternative_splicing as (True, "j") and, once added, bring the
    # locus to three transcripts.
    first = Transcript()
    first.chrom = "Chr1"
    first.strand = "+"
    first.score = 20
    first.id = "G2.1"
    first.parent = "G2"
    first.start = 101
    first.end = 1600
    first_exons = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    first_cds = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
    first.add_exons(first_exons, "exon")
    first.add_exons(first_cds, "CDS")
    first.finalize()

    self.locus.add_transcript_to_locus(first)
    self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)

    second = Transcript()
    second.chrom = "Chr1"
    second.strand = "+"
    second.score = 20
    second.id = "G3.1"
    second.parent = "G3"
    second.start = 201
    second.end = 1630
    second_exons = [(201, 500), (601, 670), (1031, 1300),
                    (1401, 1460), (1501, 1630)]
    second_cds = [(401, 500), (601, 670), (1031, 1300), (1401, 1440)]
    second.add_exons(second_exons, "exon")
    second.add_exons(second_cds, "CDS")
    second.logger = self.logger
    second.finalize()

    # Only the first two items of the returned sequence are compared.
    verdict = self.locus.is_alternative_splicing(second)[:2]
    self.assertEqual(verdict, (True, "j"))

    self.locus.add_transcript_to_locus(second)
    self.assertEqual(len(self.locus.transcripts), 3, self.locus.transcripts)
def test_noIntronOverlap(self):
    # With the CDS stripped from the fixture transcript, a two-exon
    # transcript spanning 1250-2000 must not be reported as intersecting it.
    self.t1.strip_cds()

    other = Transcript()
    other.chrom = "Chr1"
    other.strand = "+"
    other.score = 1
    other.id = "G2.1"
    other.parent = "G2"
    other.start = 1250
    other.end = 2000
    exon_coords = [(1250, 1560), (1800, 2000)]
    other.add_exons(exon_coords)
    other.finalize()

    intersects = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(intersects)
def test_no_overlap(self):
    # A coding transcript spanning 1600-2000 must not be reported as
    # intersecting the fixture transcript self.t1.
    exon_coords = [(1600, 1700), (1801, 2000)]
    cds_coords = [(1661, 1700), (1801, 1850)]

    other = Transcript()
    other.chrom = "Chr1"
    other.strand = "+"
    other.score = 1
    other.id = "G2.1"
    other.parent = "G2"
    other.start = 1600
    other.end = 2000
    other.add_exons(exon_coords)
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    intersects = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertFalse(intersects)
def test_same_id(self):
    # A coding transcript with id "G1.1" is compared with the fixture
    # transcript self.t1; is_intersecting is expected to be false.
    exon_coords = [(1250, 1560), (1801, 2000)]
    cds_coords = [(1401, 1560), (1801, 1850)]

    twin = Transcript()
    twin.chrom = "Chr1"
    twin.strand = "+"
    twin.score = 1
    twin.id = "G1.1"
    twin.parent = "G1"
    twin.start = 1250
    twin.end = 2000
    twin.add_exons(exon_coords)
    twin.add_exons(cds_coords, "CDS")
    twin.finalize()

    # This fails because they have the same ID
    intersects = MonosublocusHolder.is_intersecting(self.t1, twin)
    self.assertFalse(intersects)
def test_intronOverlap(self):
    # With the CDS stripped from the fixture transcript, a four-exon
    # transcript spanning 101-1470 must be reported as intersecting it.
    self.t1.strip_cds()

    other = Transcript()
    other.chrom = "Chr1"
    other.strand = "+"
    other.score = 1
    other.id = "G2.1"
    other.parent = "G2"
    other.start = 101
    other.end = 1470
    exon_coords = [(101, 510), (601, 700), (960, 1350), (1420, 1470)]
    other.add_exons(exon_coords)
    other.finalize()

    intersects = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertTrue(intersects)
def test_creation_from_transcript(self):
    # Construct a Gene from a single transcript and check that the gene
    # contains the transcript (by id and by object), mirrors its chrom,
    # start, end and strand, and returns the very same object on lookup.
    source = Transcript()
    source.chrom = "Chr1"
    source.start = 100
    source.end = 200
    source.strand = "+"
    source.id = "test"
    source.parent = "parent"

    gene = Gene(source)

    self.assertIn(source.id, gene, gene.keys())
    self.assertIn(source, gene, gene.keys())
    for attribute in ("chrom", "start", "end", "strand"):
        self.assertEqual(getattr(source, attribute), getattr(gene, attribute))
    self.assertIs(source, gene[source.id])
def test_not_intersecting(self):
    # This one is contained and should be rejected
    coords = [(601, 700), (1001, 1300), (1401, 1420)]

    contained = Transcript()
    contained.chrom = "Chr1"
    contained.strand = "+"
    contained.score = 20
    contained.id = "G1.1"
    contained.parent = "G1"
    contained.start = 601
    contained.end = 1420
    # Exon and CDS features use the same coordinates.
    contained.add_exons([(601, 700), (1001, 1300), (1401, 1420)], "exon")
    contained.add_exons(coords, "CDS")
    contained.finalize()

    # Only the first two items of the returned sequence are compared.
    verdict = self.locus.is_alternative_splicing(contained)[:2]
    self.assertEqual(verdict, (False, "c"))
def testCdsOverlap(self):
    # A five-exon coding transcript (G2.1) spanning 101-1600 must be
    # reported as intersecting the fixture transcript self.t1.
    exon_coords = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

    other = Transcript()
    other.chrom = "Chr1"
    other.strand = "+"
    other.score = 1
    other.id = "G2.1"
    other.parent = "G2"
    other.start = 101
    other.end = 1600
    other.add_exons(exon_coords, "exon")
    other.add_exons(cds_coords, "CDS")
    other.finalize()

    intersects = MonosublocusHolder.is_intersecting(self.t1, other)
    self.assertTrue(intersects)
def test_lowscore(self):
    # Add a coding transcript with score 1 to the fixture locus and check
    # that the locus then holds two transcripts.
    exon_coords = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

    candidate = Transcript()
    candidate.chrom = "Chr1"
    candidate.strand = "+"
    candidate.score = 1
    candidate.id = "G2.1"
    candidate.parent = "G2"
    candidate.start = 101
    candidate.end = 1600
    candidate.add_exons(exon_coords, "exon")
    candidate.add_exons(cds_coords, "CDS")
    candidate.finalize()

    self.locus.add_transcript_to_locus(candidate)
    self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)
def test_only_CDS_overlap(self):
    # A two-exon coding transcript spanning 1250-2000 is checked against
    # the fixture transcript twice: once with its initial CDS, and once
    # after the CDS is stripped and replaced by one starting at 1461.
    # is_intersecting is expected to be false both times.
    candidate = Transcript()
    candidate.chrom = "Chr1"
    candidate.strand = "+"
    candidate.score = 1
    candidate.id = "G2.1"
    candidate.parent = "G2"
    candidate.start = 1250
    candidate.end = 2000
    candidate.add_exons([(1250, 1560), (1801, 2000)])
    initial_cds = [(1401, 1560), (1801, 1850)]
    candidate.add_exons(initial_cds, "CDS")
    candidate.finalize()

    first_check = MonosublocusHolder.is_intersecting(self.t1, candidate)
    self.assertFalse(first_check)

    # Swap the CDS; the finalized flag is cleared by hand before the new
    # features are added, and finalize() is not called again here.
    candidate.strip_cds()
    candidate.finalized = False
    replacement_cds = [(1461, 1560), (1801, 1850)]
    candidate.add_exons(replacement_cds, "CDS")

    # No CDS overlap this time
    second_check = MonosublocusHolder.is_intersecting(self.t1, candidate)
    self.assertFalse(second_check)
def test_noCDSOverlap(self):
    # The fixture transcript gets a new CDS ending at 1100, while the second
    # transcript's CDS starts at 1201. Both must be coding, the overlap of
    # their CDS spans must be at most 0, and is_intersecting must be true
    # by default but false when cds_only=True.
    self.t1.strip_cds()
    self.assertEqual(self.t1.combined_cds_introns, set())
    self.t1.finalized = False
    new_cds = [(401, 500), (601, 700), (1001, 1100)]
    self.t1.add_exons(new_cds, "CDS")
    self.t1.finalize()

    other = Transcript()
    other.logger = self.logger
    other.chrom = "Chr1"
    other.strand = "+"
    other.score = 1
    other.id = "G2.1"
    other.parent = "G2"
    other.start = 101
    other.end = 1470
    other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
    other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
    other.finalize()

    self.assertTrue(self.t1.is_coding)
    self.assertTrue(other.is_coding)

    fixture_span = (self.t1.combined_cds_start, self.t1.combined_cds_end)
    other_span = (other.combined_cds_start, other.combined_cds_end)
    # The two spans double as the failure message.
    self.assertGreaterEqual(
        0,
        overlap(fixture_span, other_span),
        [fixture_span, other_span])

    whole_transcript = MonosublocusHolder.is_intersecting(
        self.t1, other, logger=self.logger)
    self.assertTrue(whole_transcript)

    coding_only = MonosublocusHolder.is_intersecting(
        self.t1, other, cds_only=True, logger=self.logger)
    self.assertFalse(coding_only)
def test_valid_as(self):
    # A coding transcript (G2.1) with score 20 must be reported by
    # is_alternative_splicing as (True, "J"); once added, the fixture locus
    # must hold two transcripts.
    exon_coords = [(101, 500), (601, 700), (1001, 1300),
                   (1401, 1460), (1501, 1600)]
    cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

    candidate = Transcript()
    candidate.chrom = "Chr1"
    candidate.strand = "+"
    candidate.score = 20
    candidate.id = "G2.1"
    candidate.parent = "G2"
    candidate.start = 101
    candidate.end = 1600
    candidate.add_exons(exon_coords, "exon")
    candidate.add_exons(cds_coords, "CDS")
    candidate.finalize()

    # Only the first two items of the returned sequence are compared.
    verdict = self.locus.is_alternative_splicing(candidate)[:2]
    self.assertEqual(verdict, (True, "J"))

    self.locus.add_transcript_to_locus(candidate)
    self.assertEqual(len(self.locus.transcripts), 2, self.locus.transcripts)