Python TranscriptChecker示例，Mikado.transcripts.transcriptchecker.TranscriptChecker Python示例

示例#1

0

显示文件

文件： test_transcript_checker.py 项目： Jamure/Mikado

 def test_check_strand_not_reversed(self):
     """A strand-specific checker must leave the "-" strand of the model untouched."""
     self.model.strand = "-"
     checker = TranscriptChecker(
         self.model,
         self.model_fasta,
         strand_specific=True,
     )
     checker.check_strand()
     # The strand is kept as given; the test additionally expects the
     # reverse-strand attribute and the suspicious-splicing flag to be set.
     self.assertEqual(checker.strand, "-")
     self.assertTrue(checker.attributes["canonical_on_reverse_strand"])
     self.assertTrue(checker.suspicious_splicing)

示例#2

0

显示文件

文件： test_transcript_checker.py 项目： Shabhonam/Mikado

    def test_check_reverse_strand(self):
        """Without strand specificity, a model set to "-" is expected to end up on "+"."""
        self.model.strand = "-"
        checker = TranscriptChecker(self.model, self.model_fasta, strand_specific=False)
        checker.check_strand()
        self.assertEqual(checker.strand, "+")

示例#3

0

显示文件

文件： test_transcript_checker.py 项目： Shabhonam/Mikado

 def test_check_strand_not_reversed(self):
     """With strand_specific=True the "-" strand set on the model must survive check_strand."""
     self.model.strand = "-"
     checker = TranscriptChecker(self.model, self.model_fasta, strand_specific=True)
     checker.check_strand()
     # Strand unchanged, but the splicing is expected to be flagged.
     self.assertEqual(checker.strand, "-")
     self.assertTrue(checker.attributes["canonical_on_reverse_strand"])
     self.assertTrue(checker.suspicious_splicing)

示例#4

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_rev_complement(self):
        """Check rev_complement on an upper-case and on a mixed-case sequence."""
        # (input, expected reverse complement); "N" and letter case are
        # expected to be carried through.
        cases = (
            ("AGTCGTGCAGNGTCGAAGTGCAACAGTGC", "GCACTGTTGCACTTCGACNCTGCACGACT"),
            ("agtcGTGCAGNGTCGAAGTGCAACAgtgc", "gcacTGTTGCACTTCGACNCTGCACgact"),
        )
        for sequence, expected in cases:
            self.assertEqual(TranscriptChecker.rev_complement(sequence),
                             expected)

示例#5

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_reference_not_flipped(self):
        """A coding "-" model checked with is_reference=True keeps its strand and CDS."""
        exon_coords = [(9930, 10172), (10620, 12665), (12797, 13235)]
        cds_coords = [(10638, 12665), (12797, 13003)]

        reference = Transcript()
        reference.chrom, reference.start, reference.end = "Chr5", 9930, 13235
        reference.id, reference.parent, reference.strand = "AT5G01030.1", "AT5G01030", "-"
        reference.add_exons(exon_coords)
        reference.add_exons(cds_coords, features="CDS")
        reference.finalize()

        # Genomic slice covering the transcript (0-based start, inclusive end).
        genomic = self.fasta["Chr5"][reference.start - 1:reference.end]
        checker = TranscriptChecker(reference, genomic, is_reference=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")
        self.assertGreater(checker.combined_cds_length, 0)

示例#6

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_check_reverse_strand(self):
        """A non-strand-specific checker built on a "-" model must report "+" after the check."""
        self.model.strand = "-"
        checker = TranscriptChecker(self.model, self.model_fasta, strand_specific=False)
        checker.logger = create_default_logger("test_check_reverse_strand", "DEBUG")

        # Confirm the flags before running the strand check.
        self.assertFalse(checker.strand_specific)
        self.assertFalse(checker.is_reference)
        self.assertFalse(checker.lenient)

        checker.check_strand()
        self.assertEqual(checker.strand, "+")

示例#7

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_sequence_reversed(self):
        """reverse_strand must flip the strand and reverse-complement the stored sequence."""
        transcript = Transcript()
        transcript.chrom, transcript.start, transcript.end, transcript.strand = "Chr5", 1001, 1500, "+"
        transcript.add_exon((1001, 1500))
        transcript.id, transcript.parent = "foo.1", "foo"
        transcript.finalize()

        genomic = self.fasta.fetch("Chr5", 1001 - 1, 1500)
        self.assertEqual(len(genomic), len(transcript))

        checker = TranscriptChecker(transcript, genomic, strand_specific=True)
        checker.reverse_strand()
        # Drop the first line of the FASTA output and join the remaining lines.
        sequence_lines = checker.fasta.split("\n")[1:]
        flipped = "".join(sequence_lines)
        self.assertEqual(checker.strand, "-")
        self.assertEqual(flipped, TranscriptChecker.rev_complement(genomic))

示例#8

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_negative(self):
        """A multiexonic "-" model keeps its strand; a "+" copy is expected to be moved to "-"."""
        exon_coords = [
            (26575364, 26575410), (26575495, 26575620),
            (26575711, 26575797), (26575885, 26575944),
            (26576035, 26576134), (26576261, 26577069),
            (26577163, 26577288), (26577378, 26577449),
            (26577856, 26578163),
        ]
        base = Transcript()
        base.chrom, base.start, base.end, base.strand = "Chr5", 26575364, 26578163, "-"
        base.id, base.parent = "cufflinks_star_at.23553.1", "cufflinks_star_at.23553"
        base.add_exons(exon_coords)
        base.finalize()

        genomic = self.fasta.fetch(base.chrom, base.start - 1, base.end)

        # Case 1: the strand is already "-" and must stay that way.
        checker = TranscriptChecker(base.copy(), genomic, strand_specific=False)
        self.assertEqual(checker.strand, "-")
        self.assertEqual(str(checker.fasta_seq.seq), genomic)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")

        # Case 2: same model with the strand set to "+"; both settings of
        # strand_specific are expected to yield "-".
        flipped = base.copy()
        flipped.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                checker = TranscriptChecker(flipped.copy(), genomic, strand_specific=ss)
                checker.check_strand()
                self.assertEqual(checker.strand, "-")

示例#9

0

显示文件

文件： test_transcript_checker.py 项目： Shabhonam/Mikado

    def test_translation_table(self):
        """The translation table must map the code points of A<->T and C<->G."""
        expected = {
            ord("A"): ord("T"),
            ord("C"): ord("G"),
            ord("G"): ord("C"),
            ord("T"): ord("A"),
        }
        self.assertEqual(TranscriptChecker.get_translation_table(), expected)

示例#10

0

显示文件

文件： test_transcript_checker.py 项目： Shabhonam/Mikado

    def test_init(self):
        """A new checker must expose the expected cDNA length, exons and sequence."""
        checker = TranscriptChecker(self.model, self.model_fasta)
        self.assertEqual(checker.cdna_length, 1718)

        expected_exons = sorted((exon.start, exon.end) for exon in self.exons)
        self.assertEqual(sorted(checker.exons), expected_exons)

        self.assertEqual(checker.fasta_seq, self.model_fasta)

示例#11

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_monoexonic_cds(self):
        """A monoexonic coding model must keep its strand and CDS on either strand.

        For each strand, the fixture model is cut down to its first exon, a CDS
        starting two bases into that exon is added, and a non-strand-specific
        checker (strip_faulty_cds=False) is run on it.
        """
        # Reference exon line noted by the original author (GFF3 columns):
        # Chr5 tair10 exon 26584797 26584879 . + . ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
        for strand in ("+", "-"):
            with self.subTest(strand=strand):
                model = self.model.copy()
                model.unfinalize()
                # sorted() returns a new list, so exons can be removed from the
                # model while looping. A plain loop avoids building a throwaway
                # list just for the side effect.
                for extra_exon in sorted(model.exons[1:]):
                    model.remove_exon(extra_exon)
                exon = model.exons[0]
                model.add_exon((exon[0] + 2, exon[1]), feature="CDS")
                model.strand = strand
                model.finalize()
                self.assertTrue(model.is_coding)
                fasta = self.fasta[model.chrom][model.start - 1:model.end]
                tcheck = TranscriptChecker(model.copy(),
                                           fasta,
                                           strip_faulty_cds=False,
                                           strand_specific=False)
                tcheck.check_strand()
                self.assertTrue(tcheck.is_coding)
                self.assertEqual(tcheck.strand, strand)

示例#12

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_codon_finder_negative_2(self):
        gtf_lines = """Chr5	TAIR10	mRNA	5335	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5697	5766	.	-	0	transcript_id "AT5G01015.1"; gene_id "AT5G01015";;
Chr5	TAIR10	exon	5697	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5338	5576	.	-	1	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	exon	5335	5576	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";"""

        gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")]
        t = Transcript(gtf_lines[0])
        t.add_exons(gtf_lines[1:])
        t.finalize()

        seq = self.genome[t.chrom][t.start - 1:t.end]
        logger = create_default_logger("test_codon_finder_negative_2",
                                       level="WARNING")
        self.assertTrue(t.has_start_codon)
        self.assertTrue(t.has_stop_codon)
        tc = TranscriptChecker(t, seq, logger=logger)
        tc.finalize()
        tc.check_orf()
        self.assertTrue(tc.is_coding)
        self.assertIn("has_stop_codon", tc.attributes)
        self.assertIn("has_start_codon", tc.attributes)
        self.assertFalse(tc.has_stop_codon)
        self.assertFalse(tc.has_start_codon)

示例#13

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_codon_finder_negative_3(self):

        gtf_lines = """Chr5	TAIR10	mRNA	5335	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5697	5769	.	-	0	transcript_id "AT5G01015.1"; gene_id "AT5G01015";;
Chr5	TAIR10	exon	5697	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5335	5576	.	-	1	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	exon	5335	5576	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";"""

        gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")]
        t = Transcript(gtf_lines[0])
        t.add_exons(gtf_lines[1:])
        t.finalize()

        seq = self.genome[t.chrom][t.start - 1:t.end]
        correct_seq = "".join(
            """ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG
ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG
GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG
TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC
CAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT
TCTTGCTCCATTTGA""".split("\n"))
        logger = create_default_logger("test_codon_finder_negative_3",
                                       level="WARNING")
        tc = TranscriptChecker(t, seq, logger=logger)
        tc.finalize()
        correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1)
        self.assertEqual(correct_length, len(correct_seq),
                         (correct_length, len(correct_seq)))
        self.assertEqual(tc.cdna_length, correct_length,
                         (correct_length, tc.cdna_length))
        self.assertEqual(len(tc.cdna), tc.cdna_length)
        self.assertEqual(correct_seq, tc.cdna)

        tc.check_orf()
        tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds())
        self.assertEqual(1, len(tc_orfs))
        self.assertTrue(tc_orfs[0].has_stop_codon,
                        (tc_orfs[0], tc_orfs[0].stop_codon))
        self.assertTrue(tc_orfs[0].has_start_codon,
                        (tc_orfs[0], tc_orfs[0].start_codon))

        self.assertTrue(tc.is_coding)
        self.assertIn("has_stop_codon", tc.attributes)
        self.assertIn("has_start_codon", tc.attributes)
        self.assertTrue(tc.has_start_codon, tc.cdna)
        self.assertTrue(tc.has_stop_codon, tc.cdna)

示例#14

0

显示文件

文件： test_transcript_checker.py 项目： Shabhonam/Mikado

    def test_negative(self):

        gtf_lines = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Chr5	Cufflinks	exon	26575364	26575410	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575495	26575620	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575711	26575797	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575885	26575944	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576035	26576134	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576261	26577069	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577163	26577288	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577378	26577449	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577856	26578163	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";"""

        gtf_lines = [GtfLine(line) for line in gtf_lines.split("\n")]

        self.assertEqual(len([_ for _ in gtf_lines if _.header]), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start -
                                                 1:transcript.end]

        tr_neg = transcript.copy()
        tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        tr_neg = transcript.copy()
        tr_neg.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(tr_neg.copy(),
                                             fasta_seq,
                                             strand_specific=ss)
                tchecker.check_strand()
                if ss:
                    self.assertEqual(tchecker.strand, "+")
                    self.assertTrue(tchecker.suspicious_splicing)
                else:
                    self.assertEqual(tchecker.strand, "-")

示例#15

0

显示文件

文件： test_transcript_checker.py 项目： Jamure/Mikado

    def test_negative(self):

        gtf_lines = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Chr5	Cufflinks	exon	26575364	26575410	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575495	26575620	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575711	26575797	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575885	26575944	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576035	26576134	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576261	26577069	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577163	26577288	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577378	26577449	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577856	26578163	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";"""

        gtf_lines = [GtfLine(line) for line in gtf_lines.split("\n")]

        self.assertEqual(len([_ for _ in gtf_lines if _.header]), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

        tr_neg = transcript.copy()
        tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        tr_neg = transcript.copy()
        tr_neg.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(tr_neg.copy(), fasta_seq, strand_specific=ss)
                tchecker.check_strand()
                if ss:
                    self.assertEqual(tchecker.strand, "+")
                    self.assertTrue(tchecker.suspicious_splicing)
                else:
                    self.assertEqual(tchecker.strand, "-")

示例#16

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_reverse_with_cds_negative(self):
        """Strand handling for a coding "-" model, with and without CDS stripping.

        As annotated, the model keeps strand and CDS. Once its strand is set
        to "+", strip_faulty_cds=True is expected to yield a non-coding "-"
        transcript, and strip_faulty_cds=False to raise InvalidTranscript.
        """
        # Intron-flanked segments shared by the exon and the CDS definitions.
        shared = [(4552, 4679), (4335, 4467),
                  (4102, 4258), (3927, 4005), (3762, 3802),
                  (3543, 3659), (3303, 3383), (2872, 2934),
                  (2748, 2799), (2435, 2509), (1914, 1961),
                  (1745, 1780), (1572, 1646)]
        exon_coords = [(4765, 5043)] + shared + [(1251, 1459)]
        cds_coords = [(4765, 4924)] + shared + [(1388, 1459)]

        model = Transcript()
        model.chrom, model.start, model.end = "Chr5", 1251, 5043
        model.id, model.parent, model.strand = "AT5G01010.1", "AT5G01010", "-"
        model.add_exons(exon_coords)
        model.add_exons(cds_coords, features="CDS")
        model.finalize()
        genomic = self.fasta["Chr5"][model.start - 1:model.end]

        # Annotated strand: nothing should change.
        checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")
        self.assertGreater(checker.combined_cds_length, 0)

        # Strand set to "+" with stripping enabled.
        model.unfinalize()
        model.strand = "+"
        checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")
        self.assertFalse(checker.is_coding)

        # With stripping disabled the strand check must raise instead.
        checker = TranscriptChecker(model, genomic, strip_faulty_cds=False)
        with self.assertRaises(InvalidTranscript):
            checker.check_strand()

示例#17

0

显示文件

文件： checking.py 项目： aseetharam/mikado

def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      is_reference=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT",
                                                                      "AC")),
                      force_keep_cds=False,
                      logger=None):
    """Build, finalize and validate a TranscriptChecker from a dictionary of lines.

    The function never lets a validation failure escape: every error other
    than KeyboardInterrupt is logged and turned into a ``None`` return value.

    :param lines: datastore describing the transcript. Keys read here are
        "tid", "chrom", "strand", "attributes", "parent", "features" and,
        optionally, "source". "features" maps a feature name to an iterable of
        (start, end) or (start, end, phase) lists/tuples.
    :type lines: dict

    :param fasta_seq: genomic sequence of the transcript

    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int

    :param lenient: forwarded unchanged to TranscriptChecker.
    :type lenient: bool

    :param is_reference: boolean. Per the original documentation, if set the
        transcript's strand will not be checked. Forwarded to TranscriptChecker.
    :type is_reference: bool

    :param strand_specific: forwarded unchanged to TranscriptChecker.
    :type strand_specific: bool

    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]

    :param force_keep_cds: boolean. Per the original documentation, if set to
        true, coding transcripts that would be flipped are instead excluded;
        the flag is intended to mirror strip_cds. Forwarded to TranscriptChecker.
    :type force_keep_cds: bool

    :param logger: optional logger to use during processing. A null logger is
        created when it is omitted.

    :returns: the validated checker, or None when the transcript ID is missing
        or any step of construction/validation fails.
    :rtype: (None|TranscriptChecker)
    """

    if logger is None:
        logger = create_null_logger()

    if "tid" not in lines:
        logger.error("Lines datastore lacks the transcript ID. Exiting.")
        return None

    try:
        logger.debug("Starting with %s", lines["tid"])
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # The coordinates may arrive in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Explicit check rather than ``assert``: assertions are stripped when
        # Python runs with -O. AssertionError is kept as the exception type so
        # the handler below logs the same message as before.
        if lines["tid"] is None:
            raise AssertionError(lines)
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature, records in lines["features"].items():
            coords, phases = [], []
            for feat in records:
                # Each record must be a 2- or 3-element list/tuple.
                if not (isinstance(feat, (list, tuple))
                        and 2 <= len(feat) <= 3):
                    raise exceptions.InvalidTranscript("Invalid feature")
                coords.append((feat[0], feat[1]))
                # A missing or unrecognised phase is stored as None, so that
                # phases and coords always have the same length.
                if len(feat) == 3 and feat[2] in (0, 1, 2, None):
                    phases.append(feat[2])
                else:
                    phases.append(None)
            transcript_line.add_exons(coords, features=feature, phases=phases)

        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            force_keep_cds=force_keep_cds,
            is_reference=is_reference,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
        transcript_object.check_orf()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript as exc:
        logger.info(
            "Discarded generically invalid transcript %s, exception: %s",
            lines["tid"], exc)
        transcript_object = None
    except AssertionError as exc:
        logger.info("Validation failed on %s, assertion failure: %s",
                    lines["tid"], exc)
        transcript_object = None
    except KeyboardInterrupt:
        # Re-raise the original exception so its traceback is preserved.
        raise
    except Exception as exc:
        # Deliberate best-effort boundary: log the failure and discard the
        # transcript instead of stopping the caller.
        logger.exception(exc)
        transcript_object = None

    return transcript_object

示例#18

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_codon_finder_negative_strip_cds(self):
        gtf_lines = """Chr5	TAIR10	mRNA	5335	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5697	5769	.	-	0	transcript_id "AT5G01015.1"; gene_id "AT5G01015";;
Chr5	TAIR10	exon	5697	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5335	5576	.	-	1	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	exon	5335	5576	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";"""

        gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")]
        t = Transcript(gtf_lines[0])
        t.add_exons(gtf_lines[1:])
        t.finalize()

        seq = str(self.genome[t.chrom][t.start - 1:t.end])
        # Basically insert an internal stop codon. This will make the ORF tests fail, leading to the ORF being stripped
        seq = seq[:72] + Bio.Seq.reverse_complement("TAG") + seq[75:]
        correct_seq = "".join(
            """ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG
ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG
GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG
TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC
TAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT
TCTTGCTCCATTTGA""".split("\n"))
        logger = create_default_logger("test_codon_finder_negative_3",
                                       level="WARNING")

        with self.assertRaises(InvalidTranscript):
            for lenient in (False, True):
                tc = TranscriptChecker(t,
                                       seq,
                                       logger=logger,
                                       lenient=lenient,
                                       strip_faulty_cds=False)
                tc.finalize()
                tc.check_orf()

        for lenient in (False, True):
            tc = TranscriptChecker(t,
                                   seq,
                                   logger=logger,
                                   lenient=lenient,
                                   strip_faulty_cds=True)
            tc.finalize()
            correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1)
            self.assertEqual(correct_length, len(correct_seq),
                             (correct_length, len(correct_seq)))
            self.assertEqual(tc.cdna_length, correct_length,
                             (correct_length, tc.cdna_length))
            self.assertEqual(len(tc.cdna), tc.cdna_length)
            self.assertEqual(correct_seq, tc.cdna)

            tc.check_orf()
            self.assertFalse(tc.is_coding)
            tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds())
            self.assertEqual(1, len(tc_orfs))
            self.assertFalse(tc_orfs[0].has_stop_codon,
                             (tc_orfs[0], tc_orfs[0].stop_codon))
            self.assertFalse(tc_orfs[0].has_start_codon,
                             (tc_orfs[0], tc_orfs[0].start_codon))

            self.assertFalse(tc.is_coding)
            self.assertNotIn("has_stop_codon", tc.attributes)
            self.assertNotIn("has_start_codon", tc.attributes)
            self.assertFalse(tc.has_start_codon, tc.cdna)
            self.assertFalse(tc.has_stop_codon, tc.cdna)

示例#19

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_reverse_with_cds_positive(self):
        """Strand handling for a coding "+" model, with and without CDS stripping.

        As annotated, the model keeps strand and CDS. Once its strand is set
        to "-", strip_faulty_cds=True is expected to yield a non-coding "+"
        transcript, and strip_faulty_cds=False to raise InvalidTranscript.
        """
        exon_coords = [(9930, 10172), (10620, 12665), (12797, 13235)]
        cds_coords = [(10638, 12665), (12797, 13003)]

        model = Transcript()
        model.chrom, model.start, model.end = "Chr5", 9930, 13235
        model.id, model.parent, model.strand = "AT5G01030.1", "AT5G01030", "+"
        model.add_exons(exon_coords)
        model.add_exons(cds_coords, features="CDS")
        model.finalize()
        genomic = self.fasta["Chr5"][model.start - 1:model.end]

        # Annotated strand: nothing should change.
        checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "+")
        self.assertGreater(checker.combined_cds_length, 0)

        # Strand set to "-" with stripping enabled.
        model.unfinalize()
        model.strand = "-"
        checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "+")
        self.assertFalse(checker.is_coding)

        # With stripping disabled the strand check must raise instead.
        checker = TranscriptChecker(model, genomic, strip_faulty_cds=False)
        with self.assertRaises(InvalidTranscript):
            checker.check_strand()

示例#20

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_init(self):
        """Exercise TranscriptChecker construction with valid and invalid input.

        Covers: rejection of a missing sequence and of malformed
        canonical_splices, the basic properties of a finalized checker, the
        accepted sequence container types, and construction from a GTF/GFF3
        line.
        """
        # A missing sequence must be rejected.
        with self.assertRaises(ValueError):
            TranscriptChecker(self.model, None)

        # So must anything that is not a collection of splice pairs.
        for wrong_splices in ["AGGT", None, 100]:
            with self.assertRaises(ValueError):
                TranscriptChecker(self.model,
                                  self.model_fasta,
                                  canonical_splices=wrong_splices)

        tcheck = TranscriptChecker(self.model, self.model_fasta)
        tcheck.finalize()
        self.assertEqual(tcheck.cdna_length, 1718)
        self.assertEqual(
            sorted(tcheck.exons),
            sorted([(exon[0], exon[1]) for exon in self.model.exons]))
        self.assertEqual(str(tcheck.fasta_seq.seq), self.model_fasta,
                         (type(tcheck.fasta_seq), type(self.model_fasta),
                          len(tcheck.fasta_seq), len(self.model_fasta)))

        # The sequence may be given as Bio.Seq.Seq, str or pyfaidx.Sequence.
        with self.subTest(initializer=Bio.Seq.Seq):
            _ = TranscriptChecker(self.model,
                                  Bio.Seq.Seq(str(self.model_fasta)))

        with self.subTest(initializer=str):
            _ = TranscriptChecker(self.model, str(self.model_fasta))

        with self.subTest(initializer=pyfaidx.Sequence):
            _ = TranscriptChecker(
                self.model,
                pyfaidx.Sequence(seq=str(self.model_fasta), name=tcheck.id))

        # Now check initializing with a GFF/GTF line
        for out_format in ["gtf", "gff3"]:
            with self.subTest(out_format=out_format):
                line = self.model.format(out_format).split("\n")[0]
                try:
                    TranscriptChecker(line, self.model_fasta)
                except ValueError as exc:
                    # Report the offending line, keeping the original error
                    # attached as the cause.
                    raise ValueError(line) from exc

示例#21

0

显示文件

文件： test_transcript_checker.py 项目： Shabhonam/Mikado

    def test_monoexonic(self):
        """A monoexonic model loses its strand unless the checker is strand-specific.

        For both the "+" model and a "-" copy, strand_specific=False is
        expected to give a None strand and strand_specific=True to keep the
        strand of the model.
        """
        first_exon = self.gff_lines[1]
        transcript_line = self.gff_lines[0]
        # Shrink the transcript so that it ends with its first exon.
        transcript_line.end = first_exon.end
        positive = Transcript(transcript_line)
        positive.add_exon(first_exon)
        positive.finalize()
        genomic = self.fasta[positive.chrom][positive.start - 1:positive.end]

        # "+" model, not strand-specific: strand is dropped.
        checker = TranscriptChecker(positive.copy(), genomic, strand_specific=False)
        checker.check_strand()
        self.assertIsNone(checker.strand)

        # "+" model, strand-specific: strand is kept.
        checker = TranscriptChecker(positive.copy(), genomic, strand_specific=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "+")

        negative = positive.copy()
        negative.strand = "-"

        # "-" model, not strand-specific: strand is dropped.
        checker = TranscriptChecker(negative.copy(), genomic, strand_specific=False)
        checker.check_strand()
        self.assertIsNone(checker.strand)

        # "-" model, strand-specific: strand is kept.
        checker = TranscriptChecker(negative.copy(), genomic, strand_specific=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")

示例#22

0

显示文件

def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT",
                                                                      "AC")),
                      logger=None):
    """Function to create the checker.

    A bare Transcript is populated from ``lines``, wrapped in a
    TranscriptChecker, finalized and strand-checked. Any failure while doing
    so is logged and results in ``None`` being returned instead of an
    exception reaching the caller (KeyboardInterrupt excepted).

    :param lines: all the exon lines for an object. The keys read here are
        "tid", "chrom", "strand", "attributes", "parent", "features" and,
        optionally, "source". "features" maps a feature name to the value
        handed to ``add_exons`` for that feature.
    :type lines: dict

    :param fasta_seq: genomic sequence of the transcript

    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int

    :param lenient: forwarded unchanged to TranscriptChecker.
    :type lenient: bool
    :param strand_specific: forwarded unchanged to TranscriptChecker.
    :type strand_specific: bool

    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: (list[tuple]|tuple[tuple])

    :param logger: optional logger to use during processing. When None, a
        null logger named "checker" is created.

    :return: the checked transcript, or None if it had to be discarded.
    :rtype: (None|TranscriptChecker)
    """

    if logger is None:
        logger = create_null_logger("checker")

    logger.debug("Starting with %s", lines["tid"])

    try:
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # start/end may arrive in either order
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Explicit check instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let an ID-less transcript through.
        if lines["tid"] is None:
            raise ValueError(
                "Missing transcript ID in: {}".format(lines))
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            transcript_line.add_exons(lines["features"][feature],
                                      features=feature)
        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript:
        logger.info("Discarded generically invalid transcript %s",
                    lines["tid"])
        transcript_object = None
    except KeyboardInterrupt:
        # Re-raise the original interrupt rather than a fresh instance, so
        # that its traceback is preserved.
        raise
    except Exception as exc:
        # Best effort: anything else is logged and the transcript discarded.
        logger.exception(exc)
        transcript_object = None

    logger.debug("Finished with %s", lines["tid"])

    return transcript_object

示例#23

0

显示文件

文件： test_transcript_checker.py 项目： Jamure/Mikado

    def test_monoexonic(self):
        """Verify strand assignment for a transcript made of one exon.

        For each of the two strands, the checker is expected to report no
        strand (``None``) when run as not strand-specific, and to report the
        model's own strand when run as strand-specific.
        """
        single_exon = self.gff_lines[1]
        header_line = self.gff_lines[0]
        # Shrink the transcript so that it spans just the first exon.
        header_line.end = single_exon.end
        model = Transcript(header_line)
        model.add_exon(single_exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1: model.end]

        template = model
        for strand in ("+", "-"):
            if strand == "-":
                # The minus-strand template is derived from the original.
                template = model.copy()
                template.strand = "-"

            unstranded = TranscriptChecker(template.copy(),
                                           fasta,
                                           strand_specific=False)
            unstranded.check_strand()
            self.assertIsNone(unstranded.strand)

            stranded = TranscriptChecker(template.copy(),
                                         fasta,
                                         strand_specific=True)
            stranded.check_strand()
            self.assertEqual(stranded.strand, strand)

示例#24

0

显示文件

文件： checking.py 项目： Jamure/Mikado

def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"),
                                         ("GC", "AG"),
                                         ("AT", "AC")),
                      logger=None):
    """Function to create the checker.

    Builds a Transcript from the values in ``lines``, wraps it in a
    TranscriptChecker, then finalizes it and checks its strand. Transcripts
    that fail at any step are logged and discarded: the function returns
    ``None`` for them rather than raising (KeyboardInterrupt excepted).

    :param lines: all the exon lines for an object. This function reads the
        keys "tid", "chrom", "strand", "attributes", "parent", "features"
        and, if present, "source". Each key of "features" is a feature name
        whose value is passed to ``add_exons``.
    :type lines: dict

    :param fasta_seq: genomic sequence of the transcript

    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int

    :param lenient: passed through to TranscriptChecker.
    :type lenient: bool
    :param strand_specific: passed through to TranscriptChecker.
    :type strand_specific: bool

    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: (list[tuple]|tuple[tuple])

    :param logger: optional logger to use during processing. A null logger
        named "checker" is created when this is None.

    :return: the checked transcript, or None when it was discarded.
    :rtype: (None|TranscriptChecker)
    """

    if logger is None:
        logger = create_null_logger("checker")

    logger.debug("Starting with %s", lines["tid"])

    try:
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # Tolerate start and end being supplied in reverse order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Raise explicitly instead of using ``assert``, which is removed when
        # Python is run with -O and would then silently accept a None ID.
        if lines["tid"] is None:
            raise ValueError(
                "Missing transcript ID in: {}".format(lines))
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            transcript_line.add_exons(lines["features"][feature],
                                      features=feature)
        transcript_object = TranscriptChecker(transcript_line,
                                              fasta_seq,
                                              lenient=lenient,
                                              strand_specific=strand_specific,
                                              canonical_splices=canonical_splices,
                                              logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
    except exceptions.IncorrectStrandError:
        logger.info("Discarded %s because of incorrect fusions of splice junctions",
                    lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript:
        logger.info("Discarded generically invalid transcript %s",
                    lines["tid"])
        transcript_object = None
    except KeyboardInterrupt:
        # Bare raise propagates the original interrupt with its traceback
        # instead of replacing it with a new exception instance.
        raise
    except Exception as exc:
        # Deliberate best effort: log the failure and discard the transcript.
        logger.exception(exc)
        transcript_object = None

    logger.debug("Finished with %s", lines["tid"])

    return transcript_object

示例#25

0

显示文件

文件： test_transcript_checker.py 项目： wook2014/mikado

    def test_monoexonic(self):
        """Check strand handling once the model is reduced to one exon.

        Every exon but the first (in sorted order) is removed from the
        fixture model. The test then expects a ``None`` strand from a checker
        that is not strand-specific, and the model's own strand ("+" or "-")
        from a strand-specific one.
        """
        self.model.unfinalize()
        surplus_exons = sorted(self.model.exons)[1:]
        for surplus in surplus_exons:
            self.model.remove_exon(surplus)

        self.model.finalize()
        chrom_seq = self.fasta[self.model.chrom]
        fasta = chrom_seq[self.model.start - 1:self.model.end]

        def strand_after_check(template, stranded):
            # Each check gets its own copy of the template transcript.
            checker = TranscriptChecker(template.copy(),
                                        fasta,
                                        strand_specific=stranded)
            checker.check_strand()
            return checker.strand

        self.assertIsNone(strand_after_check(self.model, False))
        self.assertEqual(strand_after_check(self.model, True), "+")

        neg = self.model.copy()
        neg.strand = "-"

        self.assertIsNone(strand_after_check(neg, False))
        self.assertEqual(strand_after_check(neg, True), "-")

示例#26

0

显示文件

文件： test_transcript_checker.py 项目： Jamure/Mikado

 def test_check_reverse_strand(self):
     """Expect a "-" model to be reported as "+" when not strand-specific."""
     self.model.strand = "-"
     checker = TranscriptChecker(
         self.model,
         self.model_fasta,
         strand_specific=False)
     checker.check_strand()
     observed = checker.strand
     self.assertEqual(observed, "+")

示例#27

0

显示文件

文件： test_transcript_checker.py 项目： Jamure/Mikado

    def test_rev_complement(self):
        """Check rev_complement on an upper-case sequence containing an N."""
        forward = "AGTCGTGCAGNGTCGAAGTGCAACAGTGC"
        expected = "GCACTGTTGCACTTCGACNCTGCACGACT"

        observed = TranscriptChecker.rev_complement(forward)
        self.assertEqual(observed, expected)

示例#28

0

显示文件

文件： test_transcript_checker.py 项目： Jamure/Mikado

    def test_translation_table(self):
        """Check the table maps the code points of A<->T and C<->G."""
        base_pairs = (("A", "T"), ("C", "G"), ("G", "C"), ("T", "A"))
        # Same mapping as {65: 84, 67: 71, 71: 67, 84: 65}, keyed by ordinal.
        expected = {ord(base): ord(partner) for base, partner in base_pairs}

        self.assertEqual(TranscriptChecker.get_translation_table(), expected)