Пример #1
0
class TestRetrieval(unittest.TestCase):
    """Tests for the ORF retrieval helpers, run against a one-exon transcript."""

    def setUp(self):
        """Create the fixture transcript and attach a checked, tweaked configuration."""
        fixture = Transcript()
        self.tr = fixture
        fixture.chrom = "Chr1"
        fixture.start = 101
        fixture.end = 2000
        fixture.strand = None
        fixture.add_exons([(101, 2000)])
        fixture.id = "test1"
        fixture.parent = "gene1"
        fixture.finalize()

        # The configuration file lives next to this test module.
        here = os.path.dirname(__file__)
        conf = to_json(os.path.join(here, "configuration.yaml"))

        # Check the chimera-splitting settings read from the file.
        self.assertTrue(conf["pick"]["chimera_split"]["blast_check"])
        self.assertTrue(conf["pick"]["chimera_split"]["execute"])
        leniency = conf["pick"]["chimera_split"]["blast_params"]["leniency"]
        self.assertEqual(leniency, "LENIENT")

        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50

        fixture.json_conf = conf

    def test_load_pos_and_neg(self):
        """Load one plus-strand and one minus-strand ORF and check what is kept.

        Both ORFs are expected back from ``find_overlapping_cds``; after
        ``load_orfs`` the transcript must report one internal ORF, a combined
        CDS start of 201 and a combined CDS length of 90.
        """
        plus_orf = BED12(transcriptomic=True)
        plus_orf.chrom = self.tr.id
        plus_orf.start = 0
        plus_orf.end = self.tr.cdna_length - 1
        plus_orf.strand = "+"
        plus_orf.name = "first"
        plus_orf.thick_start = 101
        plus_orf.thick_end = 190
        self.assertFalse(plus_orf.invalid)

        minus_orf = plus_orf.copy()
        minus_orf.strand = "-"
        minus_orf.thick_start = 1
        minus_orf.thick_end = 87
        minus_orf.name = "second"
        self.assertFalse(minus_orf.invalid)

        # The call must log at least one record on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            kept = retrieval.find_overlapping_cds(
                self.tr, [plus_orf, minus_orf])

        self.assertEqual(
            len(kept), 2, self.tr.json_conf["pick"]["orf_loading"])
        self.assertEqual(
            kept, [plus_orf, minus_orf], [orf.name for orf in kept])

        retrieval.load_orfs(self.tr, [plus_orf, minus_orf])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(
            self.tr.combined_cds_start, 201, self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)

    def test_connect(self):
        """Connect the transcript to its database and build an inspector on its engine."""
        retrieval._connect_to_db(self.tr)
        # The inspector is not examined further; only its construction is exercised.
        inspector = reflection.Inspector.from_engine(self.tr.engine)
Пример #2
0
class ExternalTester(unittest.TestCase):
    """Check that external scores are carried over by ``Transcript.deepcopy``."""

    def setUp(self):
        """Create a six-exon, plus-strand transcript on chromosome 15."""
        fixture = Transcript()
        self.transcript = fixture
        fixture.chrom = "15"
        fixture.source = "protein_coding"
        fixture.start = 47631264
        fixture.end = 48051999

        fixture.strand = "+"
        fixture.add_exons([
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ])
        fixture.id = "ENST00000560636"
        fixture.parent = "ENSG00000137872"

    def test_copying(self):
        """Scores set on the original must be readable on it and on its deep copy."""
        expected = (("test", 0), ("test1", 1))

        self.transcript.external_scores.update({"test": 0, "test1": 1})
        for key, value in expected:
            self.assertEqual(
                getattr(self.transcript.external_scores, key), value)

        duplicate = self.transcript.deepcopy()
        for key, value in expected:
            self.assertEqual(getattr(duplicate.external_scores, key), value)
Пример #3
0
    def test_intronMatch(self):
        """Check a five-exon coding transcript against ``self.t1``.

        Both transcripts must be coding, and they must intersect both by
        default and when ``cds_only=True``.
        """
        exon_coords = [(101, 500), (601, 700), (1001, 1320), (1451, 1460),
                       (1501, 1600)]
        cds_coords = [(401, 500), (601, 700), (1001, 1320), (1451, 1460),
                      (1501, 1510)]

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1600

        other.add_exons(exon_coords, "exon")
        other.add_exons(cds_coords, "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        whole_transcript = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(whole_transcript)

        coding_only = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertTrue(coding_only)
Пример #4
0
    def test_regression(self):

        sequence = """TC
CTCACAGTTACTATAAGCTCGTCT
ATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA
GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG
ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT
TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT
TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA
TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG
AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA
GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG
CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA
AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC
CGTTGACTATCTCGCCTGA"""

        record = SeqRecord.SeqRecord(Seq.Seq(sub("\n", "", sequence)), id="class_Chr1.1006.0")
        index = {record.id: record}

        line = "\t".join(
            ['class_Chr1.1006.0',
             '0',
             '619',
             'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)',
             '0',
             '+',
             '2',
             '617',
             '0',
             '1',
             '619',
             '0'])

        # Now we are going back to find the start codon
        bed_line = bed12.BED12(line, transcriptomic=True, fasta_index=index, max_regression=0.2)
        self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
        self.assertEqual(bed_line.phase, 0)
        # Start codon in frame found at location 27
        self.assertEqual(bed_line.thick_start, 27)
        self.assertTrue(bed_line.has_start_codon)
        self.assertFalse(bed_line.has_stop_codon)

        lines = """Chr1	CLASS	transcript	3442811	3443785	1000	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0";
Chr1	CLASS	exon	3442811	3442999	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443099	3443169	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443252	3443329	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443417	3443493	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443582	3443785	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""

        lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]

        transcript = Transcript(lines[0])
        transcript.add_exons(lines[1:])
        transcript.finalize()
        transcript.load_orfs([bed_line])
        self.assertTrue(transcript.is_coding)
        self.assertTrue(transcript.has_start_codon)
        self.assertFalse(transcript.has_stop_codon)
        self.assertEqual(transcript.selected_cds_end, transcript.start)
        self.assertEqual(transcript.selected_cds_start, transcript.end - 26)
Пример #5
0
    def test_wrong_cds(self):
        """Finalize a transcript carrying a one-base CDS, then trim it.

        ``finalize`` must log at WARNING level or above on the "null" logger;
        ``trim_coding`` with ``max_length=50`` must then return the expected
        start and end coordinates.
        """
        tr = Transcript()
        tr.chrom = "15"
        tr.source = "protein_coding"
        tr.start = 47631264
        tr.end = 48051999

        tr.strand = "+"
        tr.add_exons([
            (47631264, 47631416), (47704590, 47704669),
            (47762671, 47762742), (47893062, 47893093),
            (47895572, 47895655), (48051942, 48051999),
        ])
        tr.id = "ENST00000560636"
        tr.parent = "ENSG00000137872"

        # A CDS feature spanning the single base 48051996.
        cds_fields = [
            "15", "protein_coding", "CDS", "48051996", "48051996", ".", "+",
            "0", "ID=ENST00000560636.cds1;Parent=ENST00000560636",
        ]
        tr.add_exon(GffLine("\t".join(cds_fields)))

        null_logger = Mikado.utilities.log_utils.create_null_logger()
        tr.logger = null_logger
        with self.assertLogs("null", level="WARNING"):
            tr.finalize()

        result = trim_coding(tr, null_logger, max_length=50)
        self.assertEqual(result.start, 47631366)
        self.assertEqual(result.end, 48051992)
Пример #6
0
class ExternalTester(unittest.TestCase):
    """Verify that ``external_scores`` values are preserved by a deep copy."""

    def setUp(self):
        """Prepare a plus-strand transcript with six exons as the shared fixture."""
        exon_coords = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        self.transcript = Transcript()
        self.transcript.chrom = "15"
        self.transcript.source = "protein_coding"
        self.transcript.start = 47631264
        self.transcript.end = 48051999
        self.transcript.strand = "+"
        self.transcript.add_exons(exon_coords)
        self.transcript.id = "ENST00000560636"
        self.transcript.parent = "ENSG00000137872"

    def test_copying(self):
        """Update the scores, then read them from the original and from a deep copy."""
        self.transcript.external_scores.update({"test": 0, "test1": 1})

        # First on the object that was updated ...
        self.assertEqual(self.transcript.external_scores.test, 0)
        self.assertEqual(self.transcript.external_scores.test1, 1)

        # ... then on a deep copy of it.
        clone = self.transcript.deepcopy()
        self.assertEqual(clone.external_scores.test, 0)
        self.assertEqual(clone.external_scores.test1, 1)
Пример #7
0
    def test_non_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300),
                      (1701, 2000)])
        tr.id = "test1"
        tr.parent = "gene1"
        tr.finalize()

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff,
                         res,
                         "++++\n\n"+"\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res,
                         "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #8
0
    def test_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300), (1701, 2000)])
        tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
        tr.id = "test1"
        tr.parent = "gene1"

        # Phase 0, 0 because the first CDS exon is 300bp
        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(
            gff, res, "++++\n\n" + "\n+++\n".join(
                [gff, res, ",\t".join([str(_) for _ in tr.internal_orfs])]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #9
0
    def test_non_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300), (1701, 2000)])
        tr.id = "test1"
        tr.parent = "gene1"
        tr.finalize()

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #10
0
class TestRetrieval(unittest.TestCase):
    """Exercise ORF loading and the database connection on a single-exon transcript."""

    def setUp(self):
        """Build the shared transcript and load the configuration stored beside this module."""
        self.tr = Transcript()
        self.tr.chrom = "Chr1"
        self.tr.start = 101
        self.tr.end = 2000
        self.tr.strand = None
        self.tr.add_exons([(101, 2000)])
        self.tr.id = "test1"
        self.tr.parent = "gene1"
        self.tr.finalize()

        config_path = os.path.join(
            os.path.dirname(__file__), "configuration.yaml")
        settings = to_json(config_path)

        # The fixture configuration must enable the chimera split with a
        # lenient BLAST check.
        self.assertTrue(settings["pick"]["chimera_split"]["blast_check"])
        self.assertTrue(settings["pick"]["chimera_split"]["execute"])
        self.assertEqual(
            settings["pick"]["chimera_split"]["blast_params"]["leniency"],
            "LENIENT")

        settings["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50

        self.tr.json_conf = settings

    def test_load_pos_and_neg(self):
        """Offer a "+" ORF and a "-" ORF to the transcript and check the outcome.

        ``find_overlapping_cds`` must return both ORFs in order; ``load_orfs``
        must leave one internal ORF, with combined CDS start 201 and length 90.
        """
        forward = BED12(transcriptomic=True)
        forward.chrom = self.tr.id
        forward.start = 0
        forward.end = self.tr.cdna_length - 1
        forward.strand = "+"
        forward.name = "first"
        forward.thick_start = 101
        forward.thick_end = 190
        self.assertFalse(forward.invalid)

        # The second ORF is a copy of the first, moved to the other strand.
        reverse = forward.copy()
        reverse.strand = "-"
        reverse.thick_start = 1
        reverse.thick_end = 87
        reverse.name = "second"
        self.assertFalse(reverse.invalid)

        with self.assertLogs("null", "DEBUG"):
            survivors = retrieval.find_overlapping_cds(
                self.tr, [forward, reverse])

        orf_settings = self.tr.json_conf["pick"]["orf_loading"]
        self.assertEqual(len(survivors), 2, orf_settings)
        self.assertEqual(
            survivors,
            [forward, reverse],
            [orf.name for orf in survivors])

        retrieval.load_orfs(self.tr, [forward, reverse])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(
            self.tr.combined_cds_start, 201, self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)

    def test_connect(self):
        """Open the database connection and create an inspector from the transcript engine."""
        retrieval._connect_to_db(self.tr)
        # Nothing is asserted on the inspector; creating it is the whole check.
        db_inspector = reflection.Inspector.from_engine(self.tr.engine)
Пример #11
0
    def test_coding_positive(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "+"
        tr.add_exons([(101, 300),
                      (1701, 2000)])
        tr.add_exons([(101, 300),
                      (1701, 2000)], features="CDS")
        tr.id = "test1"
        tr.parent = "gene1"

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tID=test1;Parent=gene1;Name=test1
Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tID=test1.CDS1;Parent=test1
Chr1\tMikado\texon\t101\t300\t.\t+\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t1\tID=test1.CDS2;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff,
                         res,
                         "++++\n\n" + "\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";
Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t2\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res,
                         "++++\n\n" + "\n+++\n".join([gtf, res]))
Пример #12
0
    def test_noncoding(self):
        """Trim a four-exon non-coding transcript with two ``max_length`` values.

        Each run works on a fresh deep copy and checks the start and end of
        the transcript returned by ``trim_noncoding``.
        """
        original = Transcript()
        original.chrom = "Chr1"
        original.source = "test"
        original.start = 10000
        original.end = 20000

        original.add_exons(
            [(10000, 11500), (12000, 13000), (15000, 18000), (19000, 20000)])
        original.strand = "+"
        original.finalize()

        # NOTE(review): this logger is never passed to trim_noncoding. The call
        # is kept in case creating it matters elsewhere - confirm before removing.
        logger = Mikado.utilities.log_utils.create_null_logger("correct_cds2")

        # (max_length, expected start, expected end)
        cases = ((50, 11450, 19050),
                 (200, 11300, 19200))
        for max_length, expected_start, expected_end in cases:
            working_copy = original.deepcopy()

            result = trim_noncoding(working_copy, max_length=max_length)
            self.assertEqual(result.start, expected_start)
            self.assertEqual(result.end, expected_end)
Пример #13
0
    def test_regression(self):

        sequence = """TC
CTCACAGTTACTATAAGCTCGTCT
ATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA
GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG
ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT
TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT
TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA
TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG
AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA
GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG
CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA
AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC
CGTTGACTATCTCGCCTGA"""

        record = SeqRecord.SeqRecord(Seq.Seq(sub("\n", "", sequence)),
                                     id="class_Chr1.1006.0")
        index = {record.id: record}

        line = "\t".join([
            'class_Chr1.1006.0', '0', '619',
            'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)',
            '0', '+', '2', '617', '0', '1', '619', '0'
        ])

        # Now we are going back to find the start codon
        bed_line = bed12.BED12(line,
                               transcriptomic=True,
                               fasta_index=index,
                               max_regression=0.2)
        self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
        self.assertEqual(bed_line.phase, 0)
        # Start codon in frame found at location 27
        self.assertEqual(bed_line.thick_start, 27)
        self.assertTrue(bed_line.has_start_codon)
        self.assertFalse(bed_line.has_stop_codon)

        lines = """Chr1	CLASS	transcript	3442811	3443785	1000	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0";
Chr1	CLASS	exon	3442811	3442999	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443099	3443169	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443252	3443329	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443417	3443493	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443582	3443785	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""

        lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]

        transcript = Transcript(lines[0])
        transcript.add_exons(lines[1:])
        transcript.finalize()
        transcript.load_orfs([bed_line])
        self.assertTrue(transcript.is_coding)
        self.assertTrue(transcript.has_start_codon)
        self.assertFalse(transcript.has_stop_codon)
        self.assertEqual(transcript.selected_cds_end, transcript.start)
        self.assertEqual(transcript.selected_cds_start, transcript.end - 26)
Пример #14
0
    def test_mixed_strands(self):
        """No retained intron must be reported when the two transcripts are on opposite strands."""
        plus = Transcript()
        plus.chrom = 1
        plus.strand = "+"
        plus.id = "t1"
        plus.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
        # CDS segment lengths: 300, 200, 100 and 30 bp.
        plus_cds = [(201, 500), (801, 1000), (1201, 1300), (1501, 1530)]
        plus.add_exons(plus_cds, features="CDS")
        plus.finalize()

        minus = Transcript()
        minus.chrom = 1
        minus.strand = "-"
        minus.id = "t2"
        minus.add_exons([(601, 1000), (1201, 1300), (1501, 1800)])
        # CDS segment lengths: 30, 100 and 230 bp.
        minus_cds = [(1501, 1530), (1201, 1300), (771, 1000)]
        minus.add_exons(minus_cds, features="CDS")
        minus.finalize()

        # An unstranded superlocus is needed to hold both transcripts.
        locus = Superlocus(plus, json_conf=self.my_json, stranded=False)
        locus.add_transcript_to_locus(minus)

        locus.find_retained_introns(minus)

        self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
Пример #15
0
    def test_noIntronOverlap(self):
        """After stripping the CDS from ``self.t1``, a two-exon transcript must not intersect it."""
        self.t1.strip_cds()

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 1250
        other.end = 2000
        other.add_exons([(1250, 1560), (1800, 2000)])
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(intersecting)
Пример #16
0
    def test_not_retained_neg(self):
        """A false retained intron must not be reported (both transcripts on the minus strand)."""
        reference = Transcript()
        reference.chrom = 1
        reference.strand = "-"
        reference.id = "t1"
        reference.add_exons(
            [(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
        # CDS segment lengths: 300, 200, 100 and 30 bp.
        reference_cds = [(201, 500), (801, 1000), (1201, 1300), (1501, 1530)]
        reference.add_exons(reference_cds, features="CDS")
        reference.finalize()

        candidate = Transcript()
        candidate.chrom = 1
        candidate.strand = "-"
        candidate.id = "t2"
        candidate.add_exons([(301, 1000), (1201, 1300), (1501, 1800)])
        # CDS segment lengths: 30, 100 and 530 bp (471-1000 inclusive).
        candidate_cds = [(1501, 1530), (1201, 1300), (471, 1000)]
        candidate.add_exons(candidate_cds, features="CDS")
        candidate.finalize()

        locus = Superlocus(reference, json_conf=self.my_json)
        locus.add_transcript_to_locus(candidate)

        locus.find_retained_introns(candidate)

        self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
Пример #17
0
    def test_exon_switching_pos(self):
        """An exon switch must be treated as a NON-retained intron (plus strand).

        The two transcripts differ only in their last exon and last CDS segment.
        """
        shared_exons = [(101, 500), (801, 1000), (1201, 1300)]
        shared_cds = [(201, 500), (801, 1000), (1201, 1300)]  # 300, 200, 100 bp

        reference = Transcript()
        reference.chrom = 1
        reference.strand = "+"
        reference.id = "t1"
        reference.add_exons(shared_exons + [(2501, 2800)])
        # The last CDS segment is 30 bp long.
        reference.add_exons(shared_cds + [(2501, 2530)], features="CDS")
        reference.finalize()

        candidate = Transcript()
        candidate.chrom = 1
        candidate.strand = "+"
        candidate.id = "t2"
        candidate.add_exons(shared_exons + [(1501, 1800)])
        # The last CDS segment is 30 bp long.
        candidate.add_exons(shared_cds + [(1501, 1530)], features="CDS")
        candidate.finalize()

        locus = Superlocus(reference, json_conf=self.my_json)
        locus.add_transcript_to_locus(candidate)

        locus.find_retained_introns(candidate)

        self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
Пример #18
0
    def test_no_overlap(self):
        """A coding transcript spanning 1600-2000 must not intersect ``self.t1``."""
        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 1600
        other.end = 2000

        exon_coords = [(1600, 1700), (1801, 2000)]
        cds_coords = [(1661, 1700), (1801, 1850)]
        other.add_exons(exon_coords)
        other.add_exons(cds_coords, "CDS")
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(intersecting)
Пример #19
0
    def test_intronOverlap(self):
        """After stripping the CDS from ``self.t1``, a four-exon transcript must intersect it."""
        self.t1.strip_cds()

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1470
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])

        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertTrue(intersecting)
Пример #20
0
    def test_same_id(self):
        """A transcript with ID "G1.1" must be reported as not intersecting ``self.t1``."""
        exon_coords = [(1250, 1560), (1801, 2000)]
        cds_coords = [(1401, 1560), (1801, 1850)]

        twin = Transcript()
        twin.chrom = "Chr1"
        twin.strand = "+"
        twin.score = 1
        twin.id = "G1.1"
        twin.parent = "G1"
        twin.start = 1250
        twin.end = 2000
        twin.add_exons(exon_coords)
        twin.add_exons(cds_coords, "CDS")
        twin.finalize()

        # Per the original author's note, the check is negative because both
        # transcripts carry the same ID (``self.t1`` is defined outside this
        # method - verify its ID there).
        intersecting = MonosublocusHolder.is_intersecting(self.t1, twin)
        self.assertFalse(intersecting)
Пример #21
0
    def test_not_intersecting(self):
        """A transcript expected to be rejected as alternative splicing with code "c"."""
        # The same three intervals are used as exons and as CDS segments.
        coords = [(601, 700), (1001, 1300), (1401, 1420)]

        # Per the original author's note, this transcript is contained and
        # should be rejected.
        contained = Transcript()
        contained.chrom = "Chr1"
        contained.strand = "+"
        contained.score = 20
        contained.id = "G1.1"
        contained.parent = "G1"
        contained.start = 601
        contained.end = 1420
        contained.add_exons(list(coords), "exon")
        contained.add_exons(list(coords), "CDS")
        contained.finalize()

        # Only the first two items of the result are compared.
        verdict = self.locus.is_alternative_splicing(contained)[:2]
        self.assertEqual(verdict, (False, "c"))
Пример #22
0
    def testCdsOverlap(self):
        """A five-exon coding transcript spanning 101-1600 must intersect ``self.t1``."""
        exon_coords = [(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                       (1501, 1600)]
        cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1600

        other.add_exons(exon_coords, "exon")
        other.add_exons(cds_coords, "CDS")
        other.finalize()

        intersecting = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertTrue(intersecting)
Пример #23
0
    def test_lowscore(self):
        """Add a transcript with score 1 to ``self.locus`` and expect two transcripts in it."""
        exon_coords = [(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                       (1501, 1600)]
        cds_coords = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

        low_scoring = Transcript()
        low_scoring.chrom = "Chr1"
        low_scoring.strand = "+"
        low_scoring.score = 1
        low_scoring.id = "G2.1"
        low_scoring.parent = "G2"
        low_scoring.start = 101
        low_scoring.end = 1600

        low_scoring.add_exons(exon_coords, "exon")
        low_scoring.add_exons(cds_coords, "CDS")
        low_scoring.finalize()

        self.locus.add_transcript_to_locus(low_scoring)
        # On failure, show the transcripts currently held by the locus.
        self.assertEqual(
            len(self.locus.transcripts), 2, self.locus.transcripts)
Пример #24
0
    def test_real_retained_pos_noCDS(self):
        """A real retained intron must be reported even when the transcript has no CDS."""
        coding = Transcript()
        coding.chrom = 1
        coding.strand = "+"
        coding.id = "t1"
        coding.add_exons([(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
        # CDS segment lengths: 300, 200, 100 and 30 bp.
        coding_cds = [(201, 500), (801, 1000), (1201, 1300), (1501, 1530)]
        coding.add_exons(coding_cds, features="CDS")
        coding.finalize()

        # The second transcript is deliberately given exons only, no CDS.
        noncoding = Transcript()
        noncoding.chrom = 1
        noncoding.strand = "+"
        noncoding.id = "t2"
        noncoding.add_exons([(101, 500), (801, 1000), (1201, 1600)])
        noncoding.finalize()

        locus = Superlocus(coding, json_conf=self.my_json)
        locus.add_transcript_to_locus(noncoding)

        locus.find_retained_introns(noncoding)

        # The last exon of t2 is the one expected to be flagged.
        flagged = locus.transcripts["t2"].retained_introns
        self.assertEqual(flagged, ((1201, 1600), ))
Пример #25
0
    def test_caseNegative(self):
        """Convert a coding minus-strand transcript to BED12 and check every field."""
        block_sizes = [200, 200, 400, 500]
        block_starts = [0, 300, 700, 2400]

        coding = Transcript()
        coding.chrom = "Chr1"
        coding.start = 101
        coding.end = 3000
        coding.strand = "-"
        coding.id = "test1"
        coding.add_exons([(101, 300), (401, 600), (801, 1200), (2501, 3000)])

        # CDS lengths: 180 + 400 + 200 = 780, a multiple of three.
        coding.add_exons([(421, 600), (801, 1200), (2501, 2700)],
                         features="CDS")
        with self.assertLogs("null", "DEBUG"):
            coding.finalize()
        self.assertTrue(coding.is_coding)

        bed = coding.as_bed12()
        self.assertEqual(bed.thick_start, coding.combined_cds_end)
        self.assertEqual(bed.thick_end, coding.combined_cds_start)
        self.assertEqual(len(bed.block_sizes), coding.exon_num)
        self.assertEqual(bed.block_sizes, [200, 200, 400, 500],
                         bed.block_sizes)
        self.assertEqual(bed.strand, "-")
        self.assertEqual(bed.block_starts, [0, 300, 700, 2400],
                         bed.block_starts)

        self.assertEqual(coding.format("bed12"), str(bed))

        # Rebuild the expected BED12 line field by field.
        bed_text = str(bed)
        expected_fields = [
            "Chr1", 100, 3000, coding.id, 0, coding.strand,
            bed.thick_start - 1, bed.thick_end,
            0, 4,
            ",".join(map(str, block_sizes)),
            ",".join(map(str, block_starts)),
        ]
        self.assertEqual(bed_text, "\t".join(map(str, expected_fields)))
Пример #26
0
    def test_noCDSOverlap(self):
        """Two coding transcripts must intersect overall but not when ``cds_only=True``.

        ``self.t1`` first gets a new CDS; the ``overlap`` of the two combined
        CDS spans is then required to be at most zero.
        """
        # Replace the CDS of the shared transcript.
        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        other = Transcript()
        other.logger = self.logger
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1470
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        t1_cds_span = (self.t1.combined_cds_start, self.t1.combined_cds_end)
        other_cds_span = (other.combined_cds_start, other.combined_cds_end)
        # On failure, show both CDS spans.
        self.assertGreaterEqual(
            0,
            overlap(t1_cds_span, other_cds_span),
            [t1_cds_span, other_cds_span])

        whole_transcript = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(whole_transcript)

        coding_only = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertFalse(coding_only)
Пример #27
0
    def test_valid_as(self):
        """A candidate flagged as alternative splicing can be added to the locus.

        ``is_alternative_splicing`` must begin with ``(True, "J")`` for the
        candidate; after adding it the locus must hold two transcripts.
        """
        candidate = Transcript()
        candidate.chrom, candidate.strand, candidate.score = "Chr1", "+", 20
        candidate.id, candidate.parent = "G2.1", "G2"
        candidate.start, candidate.end = 101, 1600

        exon_spans = [(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                      (1501, 1600)]
        coding_spans = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
        candidate.add_exons(exon_spans, "exon")
        candidate.add_exons(coding_spans, "CDS")
        candidate.finalize()

        verdict = self.locus.is_alternative_splicing(candidate)
        self.assertEqual(verdict[:2], (True, "J"))

        self.locus.add_transcript_to_locus(candidate)
        self.assertEqual(len(self.locus.transcripts), 2,
                         self.locus.transcripts)
Пример #28
0
    def test_correct_cds(self):
        """Check the boundaries returned by ``trim_coding`` for two limits.

        A four-exon, plus-strand coding transcript is copied and trimmed with
        ``max_length=50`` and then ``max_length=200``; the start and end of
        each trimmed copy are compared with fixed expected coordinates.
        """
        model = Transcript()
        model.chrom = "Chr1"
        model.source = "test"
        model.start = 10000
        model.end = 20000

        exon_spans = [(10000, 11500), (12000, 13000), (15000, 18000),
                      (19000, 20000)]
        # Segment lengths: 101, 1001 and 2999 bases, i.e. 4101 in total.
        coding_spans = [(11400, 11500), (12000, 13000), (15000, 17998)]

        model.add_exons(exon_spans)
        model.add_exons(coding_spans, features="CDS")
        model.strand = "+"
        model.finalize()

        null_logger = Mikado.utilities.log_utils.create_null_logger(
            "correct_cds")

        # Each trim works on a fresh copy, leaving ``model`` untouched.
        short_trim = trim_coding(model.deepcopy(), null_logger, max_length=50)
        self.assertEqual(short_trim.start, 11400)
        self.assertEqual(short_trim.end, 19050)

        fresh_copy = model.deepcopy()
        self.assertEqual(fresh_copy.start, 10000)
        long_trim = trim_coding(fresh_copy, null_logger, max_length=200)
        self.assertEqual(long_trim.start, 11300)
        self.assertEqual(long_trim.end, 19200)
Пример #29
0
    def test_non_redundant_as(self):
        """Two successive alternative-splicing candidates are both accepted.

        The first candidate is added directly (the locus must then hold two
        transcripts). The second must be reported by
        ``is_alternative_splicing`` as ``(True, "j")`` and, once added, bring
        the locus to three transcripts.
        """
        first = Transcript()
        first.chrom, first.strand, first.score = "Chr1", "+", 20
        first.id, first.parent = "G2.1", "G2"
        first.start, first.end = 101, 1600

        first_exons = [(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                       (1501, 1600)]
        first_cds = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
        first.add_exons(first_exons, "exon")
        first.add_exons(first_cds, "CDS")
        first.finalize()

        self.locus.add_transcript_to_locus(first)
        self.assertEqual(len(self.locus.transcripts), 2,
                         self.locus.transcripts)

        second = Transcript()
        second.chrom, second.strand, second.score = "Chr1", "+", 20
        second.id, second.parent = "G3.1", "G3"
        second.start, second.end = 201, 1630

        second_exons = [(201, 500), (601, 670), (1031, 1300), (1401, 1460),
                        (1501, 1630)]
        second_cds = [(401, 500), (601, 670), (1031, 1300), (1401, 1440)]
        second.add_exons(second_exons, "exon")
        second.add_exons(second_cds, "CDS")
        second.logger = self.logger
        second.finalize()

        verdict = self.locus.is_alternative_splicing(second)
        self.assertEqual(verdict[:2], (True, "j"))
        self.locus.add_transcript_to_locus(second)
        self.assertEqual(len(self.locus.transcripts), 3,
                         self.locus.transcripts)
Пример #30
0
    def test_only_CDS_overlap(self):
        """``is_intersecting`` must be False for both CDS annotations of ``G2.1``.

        The check is run once with a CDS starting at 1401 and once after the
        CDS has been replaced by one starting at 1461.
        """
        other = Transcript()
        other.chrom, other.strand, other.score = "Chr1", "+", 1
        other.id, other.parent = "G2.1", "G2"
        other.start, other.end = 1250, 2000
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()
        first_check = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(first_check)

        # Second round: no CDS overlap this time.
        # NOTE(review): the transcript is not re-finalized after the CDS is
        # replaced - confirm this is intended.
        other.strip_cds()
        other.finalized = False
        other.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        second_check = MonosublocusHolder.is_intersecting(self.t1, other)
        self.assertFalse(second_check)
Пример #31
0
class WrongLoadedOrf(unittest.TestCase):
    """ORF-loading checks with invalid or non-transcriptomic BED12 records.

    Every test works on the two-exon, plus-strand transcript ``test1``
    created in ``setUp``.
    """

    def setUp(self):
        """Create and finalize the transcript the ORFs are loaded onto."""
        self.tr = Transcript()
        self.tr.start = 101
        self.tr.end = 1000
        self.tr.chrom = "Chr1"
        self.tr.strand = "+"
        self.tr.id = "test1"
        self.tr.add_exons([(101, 400), (701, 1000)])
        self.tr.finalize()

    def test_load_invalid_length(self):
        """An ORF ending past the cDNA length must log a "Wrong ORF" message."""
        too_long = BED12(transcriptomic=True)
        too_long.chrom = self.tr.id
        self.assertTrue(too_long.transcriptomic)
        # NOTE(review): ``name`` is left unset; the assertion on ``id`` below
        # suggests it then mirrors ``chrom`` - confirm against BED12.
        too_long.start = 0
        too_long.strand = "+"
        too_long.end = self.tr.cdna_length + 10
        too_long.thick_start = 101
        too_long.thick_end = 190
        self.assertEqual(too_long.chrom, too_long.id, too_long.id)

        with self.assertLogs("null", "WARNING") as captured:
            retrieval.load_orfs(self.tr, [too_long])

        expected = "Wrong ORF for {}:".format(self.tr.id)
        self.assertTrue(any(expected in record for record in captured.output),
                        captured.output)

    def test_load_invalid_multiple(self):
        """Loading a valid and an invalid ORF leaves one internal ORF."""
        good = BED12(transcriptomic=True)
        good.chrom = self.tr.id
        good.name = "valid"
        good.start = 0
        good.end = self.tr.cdna_length - 1
        good.strand = "+"
        good.thick_start = 101
        good.thick_end = 190

        bad = good.copy()
        bad.name = "invalid"
        bad.thick_start = 1
        bad.thick_end = 89
        bad.phase = 0

        self.assertTrue(bad.invalid)
        self.assertFalse(good.invalid, good.invalid_reason)

        # assertLogs also requires that something is logged on "null".
        with self.assertLogs("null", "DEBUG"):
            retrieval.load_orfs(self.tr, [good, bad])

        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_filter_non_transcriptomic(self):
        """``find_overlapping_cds`` drops the non-transcriptomic record."""
        good = BED12(transcriptomic=True)
        good.chrom = self.tr.id
        good.name = "valid"
        good.start = 0
        good.end = self.tr.cdna_length - 1
        good.strand = "+"
        good.thick_start = 101
        good.thick_end = 190

        genomic = good.copy()
        genomic.name = "non-transcriptomic"
        genomic.transcriptomic = False

        kept = retrieval.find_overlapping_cds(self.tr, [genomic, good])
        self.assertEqual(kept, [good])
Пример #32
0
class PhaseChecker(unittest.TestCase):
    """Phase check on a ten-exon, minus-strand coding model parsed from GFF3.

    ``setUp`` builds the transcript from the embedded GFF3 lines and stores
    the expected phase of each CDS segment in ``correct_phases``. These
    differ from the phases written in the GFF3 text, so the test presumably
    verifies that finalization recomputes them - confirm.
    """

    logger = create_default_logger("pcheck")
    logger.setLevel("DEBUG")

    def setUp(self):
        """Parse the GFF3 fixture into ``self.transcript`` (not finalized)."""

        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	mRNA	40282	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;Name=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;aed=0.0;note=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;confidence=High;has_start=True;has_stop=True;original_stop=True;protein_rank=P1;transcript_rank=T2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	40282	40933	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	three_prime_UTR	40282	40720	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.three_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	40721	40933	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41018	41111	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41018	41111	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41227	41468	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41227	41468	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41673	41831	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41673	41831	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41946	42820	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41946	42820	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	42905	42913	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	42905	42913	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45373	45496	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45373	45496	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45600	45651	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45600	45651	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45726	45726	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45726	45726	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45875	45893	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45875	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	five_prime_UTR	45894	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.five_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2"""

        # Normalise any whitespace run to a single tab before parsing, so the
        # fixture does not depend on how the literal above is indented.
        lines = [GffLine("\t".join(_.split())) for _ in lines.split("\n") if _]
        # First record is the mRNA; the remaining ones are its features.
        self.transcript = Transcript(lines[0], logger=self.logger)
        self.transcript.add_exons(lines[1:])
        # Expected phase for each CDS segment, keyed by (start, end).
        self.correct_phases = {(40721, 40933): 2,
                               (41018, 41111): 0,
                               (41227, 41468): 2,
                               (41673, 41831): 2,
                               (41946, 42820): 1,
                               (42905, 42913): 1,
                               (45373, 45496): 2,
                               (45600, 45651): 0,
                               (45726, 45726): 2,
                               (45875, 45893): 0}

    # unittest.skip is documented as skip(reason); used bare, the decorated
    # function itself is passed as the "reason", so give an explicit one.
    @unittest.skip("unconditionally skipped in the original suite; "
                   "no reason recorded")
    def test_check_phases(self):
        """Compare the phases of the first internal ORF with ``correct_phases``."""
        self.transcript.finalize()
        # Map (start, end) -> phase for the CDS segments of the first ORF.
        phases = {segment[1]: segment[2]
                  for segment in self.transcript.internal_orfs[0]
                  if segment[0] == "CDS"}
        self.assertEqual(self.transcript.combined_cds_start, 45893)

        self.assertEqual(phases.keys(),
                         self.correct_phases.keys(),
                         list(zip(sorted(phases.keys()),
                                  sorted(self.correct_phases.keys()))))

        # On mismatch, fail on the first differing segment (highest
        # coordinates first) so the message names the offending segment.
        if self.correct_phases != phases:
            for key in sorted(phases.keys(), reverse=True):
                self.assertEqual(phases[key], self.correct_phases[key],
                                 (key, phases[key], self.correct_phases[key]))

        self.assertEqual(self.correct_phases,
                         phases,
                         (self.correct_phases, phases))
Пример #33
0
class MonoHolderTester(unittest.TestCase):
    """Checks of ``MonosublocusHolder.is_intersecting``.

    Each test compares the four-exon coding transcript ``G1.1`` built in
    ``setUp`` with a second transcript created inside the test.
    """

    logger = create_default_logger("MonoHolderTester")

    @staticmethod
    def _bare_transcript(start, end, tid="G2.1", parent="G2", logger=None):
        """Return an exon-less "+"-strand Chr1 transcript with score 1.

        The logger, when given, is attached before any other attribute.
        """
        transcript = Transcript()
        if logger is not None:
            transcript.logger = logger
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.score = 1
        transcript.id = tid
        transcript.parent = parent
        transcript.start = start
        transcript.end = end
        return transcript

    def setUp(self):
        """Build the reference transcript ``self.t1`` and an empty ``conf``."""
        self.conf = {}

        reference = Transcript()
        reference.chrom = "Chr1"
        reference.strand = "+"
        reference.score = 20
        reference.id = "G1.1"
        reference.parent = "G1"
        reference.start = 101
        reference.end = 1500

        exon_spans = [(101, 500), (601, 700), (1001, 1300), (1401, 1500)]
        coding_spans = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]
        reference.add_exons(exon_spans, "exon")
        reference.add_exons(coding_spans, "CDS")
        reference.finalize()
        self.t1 = reference

    def testCdsOverlap(self):
        """A transcript with the same CDS and one extra exon intersects."""
        other = self._bare_transcript(101, 1600)
        other.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                         (1501, 1600)], "exon")
        other.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
                        "CDS")
        other.finalize()

        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_intronMatch(self):
        """Intersection is reported both by default and with ``cds_only``."""
        other = self._bare_transcript(101, 1600)
        other.add_exons([(101, 500), (601, 700), (1001, 1320), (1451, 1460),
                         (1501, 1600)], "exon")
        other.add_exons([(401, 500), (601, 700), (1001, 1320), (1451, 1460),
                         (1501, 1510)], "CDS")
        other.finalize()

        for transcript in (self.t1, other):
            self.assertTrue(transcript.is_coding)

        whole_transcript = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(whole_transcript)

        coding_only = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertTrue(coding_only)

    def test_intronOverlap(self):
        """Non-coding transcripts on the same span intersect."""
        self.t1.strip_cds()
        other = self._bare_transcript(101, 1470)
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])

        other.finalize()
        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_noIntronOverlap(self):
        """A non-coding transcript starting at 1250 does not intersect."""
        self.t1.strip_cds()
        other = self._bare_transcript(1250, 2000)
        other.add_exons([(1250, 1560), (1800, 2000)])
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_noCDSOverlap(self):
        """Disjoint CDS spans: intersecting by default, not with ``cds_only``."""
        # Replace the CDS of the reference with one ending at 1100.
        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        other = self._bare_transcript(101, 1470, logger=self.logger)
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        other.finalize()

        for transcript in (self.t1, other):
            self.assertTrue(transcript.is_coding)

        # Both CDS spans, also used as the failure message.
        cds_spans = [(self.t1.combined_cds_start, self.t1.combined_cds_end),
                     (other.combined_cds_start, other.combined_cds_end)]
        self.assertGreaterEqual(0, overlap(*cds_spans), cds_spans)

        whole_transcript = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(whole_transcript)

        coding_only = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertFalse(coding_only)

    def test_only_CDS_overlap(self):
        """No intersection, whether or not the CDS spans overlap."""
        other = self._bare_transcript(1250, 2000)
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

        # Second round: no CDS overlap this time.
        # NOTE(review): the transcript is not re-finalized after the CDS is
        # replaced - confirm this is intended.
        other.strip_cds()
        other.finalized = False
        other.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_no_overlap(self):
        """A transcript starting after the reference end does not intersect."""
        other = self._bare_transcript(1600, 2000)
        other.add_exons([(1600, 1700), (1801, 2000)])
        other.add_exons([(1661, 1700), (1801, 1850)], "CDS")
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_same_id(self):
        """A transcript reusing the reference ID is reported as non-intersecting."""
        other = self._bare_transcript(1250, 2000, tid="G1.1", parent="G1")
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()
        # This fails because they have the same ID
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))
Пример #34
0
    def test_add_two_partials(self):
        """Pad a reference transcript using two partial models.

        With ``only_reference_update`` and padding enabled, both partial
        models must be absent from the locus after
        ``finalize_alternative_splicing``, and the end of the reference must
        equal the end of the second partial model.
        """
        logger = create_null_logger("test_add_two_partials")
        logger.setLevel("INFO")
        configuration = load_and_validate_config(None)
        configuration.reference.genome = self.fai
        configuration.pick.alternative_splicing.only_confirmed_introns = False
        configuration.pick.run_options.only_reference_update = True

        # Reference model, per the TAIR10 annotation of AT5G66670.2:
        # single exon 26611258-26612889 on the minus strand of Chr5, with
        # CDS 26611474-26612700 (3' UTR 26611258-26611473, 5' UTR
        # 26612701-26612889).
        ref = Transcript(is_reference=True)
        ref.chrom = "Chr5"
        ref.strand = "-"
        ref.id = "AT5G66670.2"
        ref.add_exons([(26611258, 26612889)])
        ref.add_exons([(26611474, 26612700)], features=["CDS"])
        ref.finalize()
        self.assertTrue(ref.is_coding)

        # First partial model: adds an exon at 26611116-26611157.
        template1 = Transcript(is_reference=False)
        template1.chrom = ref.chrom
        template1.strand = ref.strand
        template1.id = ref.id + "_frag1"
        template1.add_exons(((26611116, 26611157), (26611258, 26612670)))
        template1.add_exons(((26611474, 26612670), ), features=["CDS"])
        template1.finalize()
        self.assertTrue(template1.is_coding)

        # Second partial model: adds an exon at 26613007-26613403.
        template2 = Transcript(is_reference=False)
        template2.chrom = ref.chrom
        template2.strand = ref.strand
        template2.id = ref.id + "_frag2"
        template2.add_exons(((26611574, 26612889), (26613007, 26613403)))
        template2.add_exons(((26611574, 26612700), ), features=["CDS"])
        template2.finalize()
        self.assertTrue(template2.is_coding)

        logger.setLevel("INFO")
        configuration.pick.alternative_splicing.pad = True
        locus = Locus(ref, configuration=configuration, logger=logger)
        for template in (template1, template2):
            locus.add_transcript_to_locus(template)
        # Only the second template is checked for membership at this stage;
        # the equivalent check on the first was disabled in the original.
        self.assertIn(template2.id, locus)

        locus.finalize_alternative_splicing(check_requirements=False)
        self.assertTrue(locus._finalized)
        self.assertNotIn(template1.id, locus, "\n" + str(locus))
        self.assertNotIn(template2.id, locus, "\n" + str(locus))

        # Failure message: ends, then starts, of all transcripts involved.
        ends = (locus[ref.id].end, ref.end, template2.end, template1.end)
        starts = (locus[ref.id].start, ref.start, template2.start,
                  template1.start)
        self.assertEqual(locus[ref.id].end, template2.end, (ends, starts))
Пример #35
0
class TestMetricsEndDistances(unittest.TestCase):
    """Checks of ``end_distance_from_junction`` and ``end_distance_from_tes``.

    A twelve-exon transcript is given several CDS layouts on each strand and
    the two metrics are compared with values derived from the exon
    coordinates.
    """

    logger = create_default_logger("End")
    logger.setLevel("ERROR")

    # CDS segments (whole exons) shared by every layout used in the tests.
    _SHARED_CDS = (
        (1301, 2000),  # 700 % 3 == 1
        (3501, 5000),  # 1500 % 3 == 0
        (5501, 6000),  # 500 % 3 == 2
        (6201, 7000),  # 800 % 3 == 2
        (7301, 7700),  # 400 % 3 == 1
        (8201, 9000),  # 800 % 3 == 2
    )

    @classmethod
    def _layout(cls, head, tail):
        """Return ``head`` + the shared CDS segments + ``tail`` as a new list."""
        return list(head) + list(cls._SHARED_CDS) + list(tail)

    def setUp(self):
        """Create the exon-only transcript; strand and CDS are set per test."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.start = 101
        self.tr.end = 10000
        exon_spans = [(101, 300),
                      (501, 800),
                      (1001, 1200),
                      (1301, 2000),
                      (3501, 5000),
                      (5501, 6000),
                      (6201, 7000),
                      (7301, 7700),
                      (8201, 9000),
                      (9101, 9300),
                      (9501, 9700),
                      (9801, 10000)]
        self.tr.add_exons(exon_spans)
        self.tr.id = "test1"
        self.tr.parent = "test1.gene"

    def test_end_positive(self):
        """Plus strand: CDS ending in the third-last, second-last, last exon."""
        self.tr.strand = "+"

        # --- CDS ends at 9130, two junctions before the transcript end.
        cds = self._layout([(1161, 1200)],   # 40 % 3 == 1
                           [(9101, 9130)])   # 30 % 3 == 0
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 9130)

        to_last_junction = (9300 - 9131 + 1) + (9700 - 9501 + 1)
        last_exon = 10000 - 9801 + 1
        self.assertEqual(self.tr.end_distance_from_junction,
                         to_last_junction)
        self.assertEqual(self.tr.end_distance_from_tes,
                         to_last_junction + last_exon)

        # --- CDS ends at 9690, inside the second-last exon.
        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False
        cds = self._layout([(1161, 1200)],   # 40 % 3 == 1
                           [(9101, 9300),    # 200 % 3 == 2
                            (9501, 9690)])   # 190 % 3 == 1
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 9690)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)

        to_last_junction = 9700 - 9691 + 1
        self.assertEqual(self.tr.end_distance_from_junction,
                         to_last_junction)
        self.assertEqual(self.tr.end_distance_from_tes,
                         to_last_junction + last_exon)

        # --- After stripping, both CDS ends are None.
        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end,
                         self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end,
                         None,
                         self.tr.combined_cds_end)

        # --- CDS ends at 9820, inside the last exon.
        self.tr.finalized = False
        cds = self._layout([(1161, 1200)],   # 40 % 3 == 1
                           [(9101, 9300),    # 200 % 3 == 2
                            (9501, 9700),    # 200 % 3 == 2
                            (9801, 9820)])   # 20 % 3 == 2
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 9820)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes, 180)
        self.assertEqual(self.tr.end_distance_from_junction, 0)

    def test_end_negative(self):
        """Minus strand: CDS ending in the third, second and first exon."""
        self.tr.strand = "-"

        # The exons are the twelve added in setUp.
        first_exon = 300 - 101 + 1

        # --- CDS ends at 1161 (lowest coordinate), in the third exon.
        cds = self._layout([(1161, 1200)],   # 40 % 3 == 1
                           [(9101, 9130)])   # 30 % 3 == 0
        self.assertEqual(sum(end - start + 1 for start, end in cds) % 3, 0)

        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertTrue(self.tr.is_coding)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 1161)

        expected_junction = (1161 - 1001) + (800 - 501 + 1)
        self.assertEqual(self.tr.end_distance_from_junction,
                         expected_junction,
                         (self.tr.end_distance_from_junction,
                          expected_junction))
        expected_tes = self.tr.end_distance_from_junction + first_exon
        self.assertEqual(self.tr.end_distance_from_tes,
                         expected_tes,
                         (self.tr.end_distance_from_tes, expected_tes))

        # --- CDS ends at 721, in the second exon.
        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False
        cds = self._layout([(721, 800),      # 80 % 3 == 2
                            (1001, 1200)],   # 200 % 3 == 2
                           [(9101, 9130)])   # 30 % 3 == 0
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 721)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)

        expected_junction = 721 - 501
        self.assertEqual(self.tr.end_distance_from_junction,
                         expected_junction,
                         (self.tr.end_distance_from_junction,
                          expected_junction))
        expected_tes = self.tr.end_distance_from_junction + first_exon
        self.assertEqual(self.tr.end_distance_from_tes,
                         expected_tes,
                         (self.tr.end_distance_from_tes, expected_tes))

        # --- After stripping, both CDS ends are None.
        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end,
                         self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end,
                         None,
                         self.tr.combined_cds_end)

        # --- CDS ends at 161, in the first exon.
        self.tr.finalized = False
        cds = self._layout([(161, 300),      # 140 % 3 == 2
                            (501, 800),      # 300 % 3 == 0
                            (1001, 1200)],   # 200 % 3 == 2
                           [(9101, 9130)])   # 30 % 3 == 0

        # Sum of the per-segment remainders must itself be a multiple of 3.
        remainders = sum((end - start + 1) % 3 for start, end in cds)
        self.assertEqual(remainders % 3, 0)
        self.tr.logger = self.logger
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 161)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes, 60)
        self.assertEqual(self.tr.end_distance_from_junction, 0)
Пример #36
0
class WrongLoadedOrf(unittest.TestCase):
    """Check how ``retrieval`` deals with ORFs that should not be loaded.

    Every test runs against the same two-exon, plus-strand transcript that
    ``setUp`` builds and finalizes.
    """

    def setUp(self):
        """Create and finalize the transcript ``test1`` shared by the tests."""
        transcript = Transcript()
        transcript.start = 101
        transcript.end = 1000
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.id = "test1"
        transcript.add_exons([(101, 400), (701, 1000)])
        transcript.finalize()
        self.tr = transcript

    def _transcript_wide_orf(self, name):
        """Return a plus-strand transcriptomic BED12 placed on ``self.tr``.

        The record runs from 0 to ``cdna_length - 1`` on the transcript, has
        its thick region set to 101-190 and carries ``name`` as its name.
        """
        orf = BED12(transcriptomic=True)
        orf.chrom = self.tr.id
        orf.name = name
        orf.start = 0
        orf.end = self.tr.cdna_length - 1
        orf.strand = "+"
        orf.thick_start = 101
        orf.thick_end = 190
        return orf

    def test_load_invalid_length(self):
        """Loading an ORF that overshoots the cDNA logs a "Wrong ORF" warning."""
        overlong = BED12(transcriptomic=True)
        overlong.chrom = self.tr.id
        self.assertTrue(overlong.transcriptomic)
        overlong.start = 0
        overlong.strand = "+"
        # The end is deliberately placed 10 bases past the transcript cDNA.
        overlong.end = self.tr.cdna_length + 10
        overlong.thick_start = 101
        overlong.thick_end = 190
        self.assertEqual(overlong.chrom, overlong.id, overlong.id)

        with self.assertLogs("null", "WARNING") as captured:
            retrieval.load_orfs(self.tr, [overlong])

        expected = "Wrong ORF for {}:".format(self.tr.id)
        self.assertTrue(
            any(expected in record for record in captured.output),
            captured.output)

    def test_load_invalid_multiple(self):
        """Only the valid ORF is kept when a valid and an invalid one are loaded."""
        good = self._transcript_wide_orf("valid")

        # Same record, but with a thick region (1-89, phase 0) that the
        # BED12 object itself reports as invalid.
        bad = good.copy()
        bad.name = "invalid"
        bad.thick_start = 1
        bad.thick_end = 89
        bad.phase = 0

        self.assertTrue(bad.invalid)
        self.assertFalse(good.invalid, good.invalid_reason)

        # assertLogs also requires that something is logged on "null".
        with self.assertLogs("null", "DEBUG"):
            retrieval.load_orfs(self.tr, [good, bad])

        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_filter_non_transcriptomic(self):
        """``find_overlapping_cds`` discards ORFs flagged as non-transcriptomic."""
        kept = self._transcript_wide_orf("valid")

        dropped = kept.copy()
        dropped.name = "non-transcriptomic"
        dropped.transcriptomic = False

        survivors = retrieval.find_overlapping_cds(self.tr, [dropped, kept])
        self.assertEqual(survivors, [kept])