Пример #1
0
    def test_wrong_cds(self):

        transcript = Transcript()
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999

        exons = [(47631264, 47631416), (47704590, 47704669),
                 (47762671, 47762742), (47893062, 47893093),
                 (47895572, 47895655), (48051942, 48051999)]

        transcript.strand = "+"
        transcript.add_exons(exons)
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"
        cds_line = "\t".join([
            "15", "protein_coding", "CDS", "48051996", "48051996", ".", "+",
            "0", "ID=ENST00000560636.cds1;Parent=ENST00000560636"
        ])
        cds_line = GffLine(cds_line)
        transcript.add_exon(cds_line)
        logger = Mikado.utilities.log_utils.create_null_logger()
        transcript.logger = logger
        with self.assertLogs("null", level="WARNING"):
            transcript.finalize()

        trimmed = trim_coding(transcript, logger, max_length=50)
        self.assertEqual(trimmed.start, 47631366)
        self.assertEqual(trimmed.end, 48051992)
Пример #2
0
    def test_intronMatch(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600

        t2.add_exons([(101, 500), (601, 700), (1001, 1320), (1451, 1460),
                      (1501, 1600)], "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1320), (1451, 1460),
                      (1501, 1510)], "CDS")
        t2.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(t2.is_coding)

        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, t2,
                                               logger=self.logger))
        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1,
                                               t2,
                                               cds_only=True,
                                               logger=self.logger))
Пример #3
0
    def test_coding_positive(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "+"
        tr.add_exons([(101, 300),
                      (1701, 2000)])
        tr.add_exons([(101, 300),
                      (1701, 2000)], features="CDS")
        tr.id = "test1"
        tr.parent = "gene1"

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tID=test1;Parent=gene1;Name=test1
Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tID=test1.CDS1;Parent=test1
Chr1\tMikado\texon\t101\t300\t.\t+\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t1\tID=test1.CDS2;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff,
                         res,
                         "++++\n\n" + "\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";
Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t2\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res,
                         "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #4
0
    def test_non_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300), (1701, 2000)])
        tr.id = "test1"
        tr.parent = "gene1"
        tr.finalize()

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #5
0
    def test_noncoding(self):

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.source = "test"
        transcript.start = 10000
        transcript.end = 20000

        exons = [(10000, 11500), (12000, 13000), (15000, 18000),
                 (19000, 20000)]

        transcript.add_exons(exons)
        transcript.strand = "+"
        transcript.finalize()

        logger = Mikado.utilities.log_utils.create_null_logger("correct_cds2")

        copied = transcript.deepcopy()

        trimmed = trim_noncoding(copied, max_length=50)
        self.assertEqual(trimmed.start, 11450)
        self.assertEqual(trimmed.end, 19050)

        copied = transcript.deepcopy()

        trimmed = trim_noncoding(copied, max_length=200)
        self.assertEqual(trimmed.start, 11300)
        self.assertEqual(trimmed.end, 19200)
Пример #6
0
    def test_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300), (1701, 2000)])
        tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
        tr.id = "test1"
        tr.parent = "gene1"

        # Phase 0, 0 because the first CDS exon is 300bp
        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(
            gff, res, "++++\n\n" + "\n+++\n".join(
                [gff, res, ",\t".join([str(_) for _ in tr.internal_orfs])]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #7
0
    def test_non_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300),
                      (1701, 2000)])
        tr.id = "test1"
        tr.parent = "gene1"
        tr.finalize()

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff,
                         res,
                         "++++\n\n"+"\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res,
                         "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #8
0
    def test_non_redundant_as(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 20
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600

        t2.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                      (1501, 1600)], "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
                     "CDS")
        t2.finalize()

        self.locus.add_transcript_to_locus(t2)
        self.assertEqual(len(self.locus.transcripts), 2,
                         self.locus.transcripts)

        t3 = Transcript()
        t3.chrom = "Chr1"
        t3.strand = "+"
        t3.score = 20
        t3.id = "G3.1"
        t3.parent = "G3"
        t3.start = 201
        t3.end = 1630

        t3.add_exons([(201, 500), (601, 670), (1031, 1300), (1401, 1460),
                      (1501, 1630)], "exon")
        t3.add_exons([(401, 500), (601, 670), (1031, 1300), (1401, 1440)],
                     "CDS")
        t3.logger = self.logger
        t3.finalize()

        self.assertEqual(
            self.locus.is_alternative_splicing(t3)[:2], (True, "j"))
        self.locus.add_transcript_to_locus(t3)
        self.assertEqual(len(self.locus.transcripts), 3,
                         self.locus.transcripts)
Пример #9
0
    def test_no_overlap(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 1600
        t2.end = 2000
        t2.add_exons([(1600, 1700), (1801, 2000)])
        t2.add_exons([(1661, 1700), (1801, 1850)], "CDS")
        t2.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))
Пример #10
0
    def test_noIntronOverlap(self):

        self.t1.strip_cds()
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 1250
        t2.end = 2000
        t2.add_exons([(1250, 1560), (1800, 2000)])
        t2.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))
Пример #11
0
    def test_intronOverlap(self):

        self.t1.strip_cds()
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1470
        t2.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])

        t2.finalize()
        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, t2))
Пример #12
0
    def test_same_id(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G1.1"
        t2.parent = "G1"
        t2.start = 1250
        t2.end = 2000
        t2.add_exons([(1250, 1560), (1801, 2000)])
        t2.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        t2.finalize()
        # This fails because they have the same ID
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))
Пример #13
0
    def test_not_intersecting(self):

        # This one is contained and should be rejected
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 20
        t2.id = "G1.1"
        t2.parent = "G1"
        t2.start = 601
        t2.end = 1420
        t2.add_exons([(601, 700), (1001, 1300), (1401, 1420)], "exon")
        t2.add_exons([(601, 700), (1001, 1300), (1401, 1420)], "CDS")
        t2.finalize()

        self.assertEqual(
            self.locus.is_alternative_splicing(t2)[:2], (False, "c"))
Пример #14
0
    def test_creation_from_transcript(self):

        t = Transcript()
        t.chrom = "Chr1"
        t.start = 100
        t.end = 200
        t.strand = "+"
        t.id = "test"
        t.parent = "parent"
        gene = Gene(t)
        self.assertIn(t.id, gene, gene.keys())
        self.assertIn(t, gene, gene.keys())
        self.assertEqual(t.chrom, gene.chrom)
        self.assertEqual(t.start, gene.start)
        self.assertEqual(t.end, gene.end)
        self.assertEqual(t.strand, gene.strand)
        self.assertIs(t, gene[t.id])
Пример #15
0
    def test_creation_from_transcript(self):

        t = Transcript()
        t.chrom = "Chr1"
        t.start = 100
        t.end = 200
        t.strand = "+"
        t.id = "test"
        t.parent = "parent"
        gene = Gene(t)
        self.assertIn(t.id, gene, gene.keys())
        self.assertIn(t, gene, gene.keys())
        self.assertEqual(t.chrom, gene.chrom)
        self.assertEqual(t.start, gene.start)
        self.assertEqual(t.end, gene.end)
        self.assertEqual(t.strand, gene.strand)
        self.assertIs(t, gene[t.id])
Пример #16
0
    def testCdsOverlap(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600

        t2.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                      (1501, 1600)], "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
                     "CDS")
        t2.finalize()

        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, t2))
Пример #17
0
    def test_lowscore(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600

        t2.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                      (1501, 1600)], "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
                     "CDS")
        t2.finalize()

        self.locus.add_transcript_to_locus(t2)
        self.assertEqual(len(self.locus.transcripts), 2,
                         self.locus.transcripts)
Пример #18
0
    def test_only_CDS_overlap(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 1250
        t2.end = 2000
        t2.add_exons([(1250, 1560), (1801, 2000)])
        t2.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        t2.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))

        t2.strip_cds()
        t2.finalized = False
        t2.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        # No CDS overlap this time
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))
Пример #19
0
    def test_noCDSOverlap(self):

        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        t2 = Transcript()
        t2.logger = self.logger
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1470
        t2.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        t2.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        t2.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(t2.is_coding)

        self.assertGreaterEqual(
            0,
            overlap((self.t1.combined_cds_start, self.t1.combined_cds_end),
                    (t2.combined_cds_start, t2.combined_cds_end)),
            [(self.t1.combined_cds_start, self.t1.combined_cds_end),
             (t2.combined_cds_start, t2.combined_cds_end)])

        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, t2,
                                               logger=self.logger))
        self.assertFalse(
            MonosublocusHolder.is_intersecting(self.t1,
                                               t2,
                                               cds_only=True,
                                               logger=self.logger))
Пример #20
0
    def test_valid_as(self):

        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 20
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600

        t2.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                      (1501, 1600)], "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
                     "CDS")
        t2.finalize()

        self.assertEqual(
            self.locus.is_alternative_splicing(t2)[:2], (True, "J"))

        self.locus.add_transcript_to_locus(t2)
        self.assertEqual(len(self.locus.transcripts), 2,
                         self.locus.transcripts)
Пример #21
0
    def test_correct_cds(self):

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.source = "test"
        transcript.start = 10000
        transcript.end = 20000

        exons = [(10000, 11500), (12000, 13000), (15000, 18000),
                 (19000, 20000)]

        cds = [
            (11400, 11500),  # 101
            (12000, 13000),  # 1001 ==> 1102
            (15000, 17998)
        ]  # 2998 == > 3090 (y)

        transcript.add_exons(exons)
        transcript.add_exons(cds, features="CDS")

        transcript.strand = "+"
        transcript.finalize()

        logger = Mikado.utilities.log_utils.create_null_logger("correct_cds")

        copied = transcript.deepcopy()

        trimmed = trim_coding(copied, logger, max_length=50)
        self.assertEqual(trimmed.start, 11400)
        self.assertEqual(trimmed.end, 19050)

        copied = transcript.deepcopy()
        self.assertEqual(copied.start, 10000)
        trimmed = trim_coding(copied, logger, max_length=200)
        self.assertEqual(trimmed.start, 11300)
        self.assertEqual(trimmed.end, 19200)