def test_check_strand_not_reversed(self):
    """A strand-specific "-" model must keep its strand and be flagged.

    The checker is expected to leave the strand untouched, record the
    ``canonical_on_reverse_strand`` attribute and mark the splicing as
    suspicious.
    """
    self.model.strand = "-"
    checker = TranscriptChecker(
        self.model,
        self.model_fasta,
        strand_specific=True,
    )
    checker.check_strand()

    self.assertEqual(checker.strand, "-")
    self.assertTrue(checker.attributes["canonical_on_reverse_strand"])
    self.assertTrue(checker.suspicious_splicing)
def test_check_reverse_strand(self):
    """A non-strand-specific model set to "-" is expected to come out as "+"."""
    self.model.strand = "-"
    checker = TranscriptChecker(
        self.model,
        self.model_fasta,
        strand_specific=False,
    )
    checker.check_strand()

    self.assertEqual(checker.strand, "+")
def test_rev_complement(self):
    """Reverse-complementing keeps ``N`` and the case of each base."""
    cases = (
        # (input sequence, expected reverse complement)
        ("AGTCGTGCAGNGTCGAAGTGCAACAGTGC", "GCACTGTTGCACTTCGACNCTGCACGACT"),
        ("agtcGTGCAGNGTCGAAGTGCAACAgtgc", "gcacTGTTGCACTTCGACNCTGCACgact"),
    )
    for sequence, expected in cases:
        self.assertEqual(TranscriptChecker.rev_complement(sequence), expected)
def test_reference_not_flipped(self):
    """A reference coding model on "-" keeps both its strand and its CDS."""
    exons = [(9930, 10172), (10620, 12665), (12797, 13235)]
    coding = [(10638, 12665), (12797, 13003)]

    model = Transcript()
    model.chrom = "Chr5"
    model.start = 9930
    model.end = 13235
    model.id = "AT5G01030.1"
    model.parent = "AT5G01030"
    model.strand = "-"
    model.add_exons(exons)
    model.add_exons(coding, features="CDS")
    model.finalize()

    # Genomic slice covering the model (0-based start, 1-based inclusive end).
    genomic = self.fasta["Chr5"][model.start - 1:model.end]
    checker = TranscriptChecker(model, genomic, is_reference=True)
    checker.check_strand()

    self.assertEqual(checker.strand, "-")
    self.assertGreater(checker.combined_cds_length, 0)
def test_check_reverse_strand(self):
    """With default flags off, a "-" model is expected to be flipped to "+".

    The checker gets a DEBUG logger, and the three flags that could prevent
    the flip are asserted to be falsy before the strand is checked.
    """
    self.model.strand = "-"
    checker = TranscriptChecker(
        self.model,
        self.model_fasta,
        strand_specific=False,
    )
    checker.logger = create_default_logger("test_check_reverse_strand", "DEBUG")

    for flag in ("strand_specific", "is_reference", "lenient"):
        self.assertFalse(getattr(checker, flag))

    checker.check_strand()
    self.assertEqual(checker.strand, "+")
def test_sequence_reversed(self):
    """``reverse_strand`` flips the strand and reverse-complements the FASTA."""
    transcript = Transcript()
    transcript.chrom = "Chr5"
    transcript.start = 1001
    transcript.end = 1500
    transcript.strand = "+"
    transcript.add_exon((1001, 1500))
    transcript.id = "foo.1"
    transcript.parent = "foo"
    transcript.finalize()

    genomic = self.fasta.fetch("Chr5", 1001 - 1, 1500)
    self.assertEqual(len(genomic), len(transcript))

    checker = TranscriptChecker(transcript, genomic, strand_specific=True)
    checker.reverse_strand()

    # Drop the first line of the FASTA record and glue the remaining rows.
    _, *sequence_rows = checker.fasta.split("\n")
    flipped = "".join(sequence_rows)

    self.assertEqual(checker.strand, "-")
    self.assertEqual(flipped, TranscriptChecker.rev_complement(genomic))
def test_negative(self):
    """A multiexonic "-" transcript ends up on "-" whatever strand it declares.

    The model is first checked as annotated, then relabelled "+" and checked
    again with and without strand specificity.
    """
    exons = [
        (26575364, 26575410),
        (26575495, 26575620),
        (26575711, 26575797),
        (26575885, 26575944),
        (26576035, 26576134),
        (26576261, 26577069),
        (26577163, 26577288),
        (26577378, 26577449),
        (26577856, 26578163),
    ]

    transcript = Transcript()
    transcript.chrom = "Chr5"
    transcript.start = 26575364
    transcript.end = 26578163
    transcript.strand = "-"
    transcript.id = "cufflinks_star_at.23553.1"
    transcript.parent = "cufflinks_star_at.23553"
    transcript.add_exons(exons)
    transcript.finalize()

    fasta_seq = self.fasta.fetch(
        transcript.chrom, transcript.start - 1, transcript.end)

    # As annotated: the strand must survive both construction and the check.
    checker = TranscriptChecker(transcript.copy(), fasta_seq, strand_specific=False)
    self.assertEqual(checker.strand, "-")
    self.assertEqual(str(checker.fasta_seq.seq), fasta_seq)
    checker.check_strand()
    self.assertEqual(checker.strand, "-")

    # Relabelled on the opposite strand.
    mislabelled = transcript.copy()
    mislabelled.strand = "+"
    for ss in (False, True):
        with self.subTest(ss=ss):
            checker = TranscriptChecker(
                mislabelled.copy(), fasta_seq, strand_specific=ss)
            checker.check_strand()
            self.assertEqual(checker.strand, "-")
def test_translation_table(self):
    """The translation table maps each base ordinal to its complement's."""
    # str.maketrans("ACGT", "TGCA") == {65: 84, 67: 71, 71: 67, 84: 65}
    expected = str.maketrans("ACGT", "TGCA")
    self.assertEqual(TranscriptChecker.get_translation_table(), expected)
def test_init(self):
    """A freshly built checker exposes the model's length, exons and sequence."""
    checker = TranscriptChecker(self.model, self.model_fasta)
    expected_exons = sorted((exon.start, exon.end) for exon in self.exons)

    self.assertEqual(checker.cdna_length, 1718)
    self.assertEqual(sorted(checker.exons), expected_exons)
    self.assertEqual(checker.fasta_seq, self.model_fasta)
def test_monoexonic_cds(self):
    """A monoexonic *coding* model keeps its strand and CDS on either strand.

    The shared model is reduced to its first exon, given a CDS starting two
    bases into that exon, and checked without strand specificity.
    """
    # Chr5 tair10 exon 26584797 26584879 . + . ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
    for strand in ("+", "-"):
        with self.subTest(strand=strand):
            model = self.model.copy()
            model.unfinalize()
            # Plain loop: the calls are made for their side effect only.
            # sorted() yields a new list, so removing while looping is safe.
            for exon in sorted(model.exons[1:]):
                model.remove_exon(exon)
            exon = model.exons[0]
            model.add_exon((exon[0] + 2, exon[1]), feature="CDS")
            model.strand = strand
            model.finalize()
            self.assertTrue(model.is_coding)

            fasta = self.fasta[model.chrom][model.start - 1:model.end]
            tcheck = TranscriptChecker(
                model.copy(),
                fasta,
                strip_faulty_cds=False,
                strand_specific=False,
            )
            tcheck.check_strand()

            self.assertTrue(tcheck.is_coding)
            self.assertEqual(tcheck.strand, strand)
def test_codon_finder_negative_2(self):
    """ORF checking on a "-" model whose CDS is trimmed by 3 bp at both ends.

    The raw transcript reports both codons as present; after ``check_orf``
    the checker is still coding and carries both codon attributes, but both
    flags are expected to be false.
    """
    # One GTF record per entry, fields separated by explicit tabs so that
    # the fixture does not depend on invisible whitespace in the source.
    # NOTE(review): tab delimiters follow the GTF format - confirm that
    # GtfLine splits on tabs.
    records = (
        'Chr5\tTAIR10\tmRNA\t5335\t5769\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\tCDS\t5697\t5766\t.\t-\t0\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";;',
        'Chr5\tTAIR10\texon\t5697\t5769\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\tCDS\t5338\t5576\t.\t-\t1\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\texon\t5335\t5576\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
    )
    gtf_lines = [GtfLine(_) for _ in "\n".join(records).split("\n")]

    t = Transcript(gtf_lines[0])
    t.add_exons(gtf_lines[1:])
    t.finalize()
    seq = self.genome[t.chrom][t.start - 1:t.end]
    logger = create_default_logger("test_codon_finder_negative_2", level="WARNING")

    self.assertTrue(t.has_start_codon)
    self.assertTrue(t.has_stop_codon)

    tc = TranscriptChecker(t, seq, logger=logger)
    tc.finalize()
    tc.check_orf()

    self.assertTrue(tc.is_coding)
    self.assertIn("has_stop_codon", tc.attributes)
    self.assertIn("has_start_codon", tc.attributes)
    self.assertFalse(tc.has_stop_codon)
    self.assertFalse(tc.has_start_codon)
def test_codon_finder_negative_3(self):
    """ORF checking on a "-" model whose CDS spans both exons completely.

    The checker's cDNA must match the expected 315 bp sequence, exactly one
    ORF must be found, and both start and stop codon must be reported.
    """
    # One GTF record per entry, fields separated by explicit tabs so that
    # the fixture does not depend on invisible whitespace in the source.
    # NOTE(review): tab delimiters follow the GTF format - confirm that
    # GtfLine splits on tabs.
    records = (
        'Chr5\tTAIR10\tmRNA\t5335\t5769\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\tCDS\t5697\t5769\t.\t-\t0\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";;',
        'Chr5\tTAIR10\texon\t5697\t5769\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\tCDS\t5335\t5576\t.\t-\t1\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\texon\t5335\t5576\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
    )
    gtf_lines = [GtfLine(_) for _ in "\n".join(records).split("\n")]

    t = Transcript(gtf_lines[0])
    t.add_exons(gtf_lines[1:])
    t.finalize()
    seq = self.genome[t.chrom][t.start - 1:t.end]

    # Expected cDNA, written as 60-column rows and joined without separators
    # (5 * 60 + 15 = 315 bases, matching correct_length below).
    correct_seq = "".join((
        "ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG",
        "ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG",
        "GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG",
        "TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC",
        "CAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT",
        "TCTTGCTCCATTTGA",
    ))

    logger = create_default_logger("test_codon_finder_negative_3", level="WARNING")
    tc = TranscriptChecker(t, seq, logger=logger)
    tc.finalize()

    # Summed length of the two exons.
    correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1)
    self.assertEqual(correct_length, len(correct_seq),
                     (correct_length, len(correct_seq)))
    self.assertEqual(tc.cdna_length, correct_length,
                     (correct_length, tc.cdna_length))
    self.assertEqual(len(tc.cdna), tc.cdna_length)
    self.assertEqual(correct_seq, tc.cdna)

    tc.check_orf()
    tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds())
    self.assertEqual(1, len(tc_orfs))
    self.assertTrue(tc_orfs[0].has_stop_codon,
                    (tc_orfs[0], tc_orfs[0].stop_codon))
    self.assertTrue(tc_orfs[0].has_start_codon,
                    (tc_orfs[0], tc_orfs[0].start_codon))
    self.assertTrue(tc.is_coding)
    self.assertIn("has_stop_codon", tc.attributes)
    self.assertIn("has_start_codon", tc.attributes)
    self.assertTrue(tc.has_start_codon, tc.cdna)
    self.assertTrue(tc.has_stop_codon, tc.cdna)
def test_negative(self):
    """Strand checking of a nine-exon "-" transcript parsed from GTF.

    As annotated, the model stays on "-". Relabelled "+", it is kept on "+"
    and flagged as suspicious when strand-specific, and flipped back to "-"
    otherwise.
    """
    # Attributes shared by every record of the fixture.
    shared = ('gene_id "cufflinks_star_at.23553";'
              'transcript_id "cufflinks_star_at.23553.1";')
    exon_coordinates = (
        (26575364, 26575410),
        (26575495, 26575620),
        (26575711, 26575797),
        (26575885, 26575944),
        (26576035, 26576134),
        (26576261, 26577069),
        (26577163, 26577288),
        (26577378, 26577449),
        (26577856, 26578163),
    )

    # Records are assembled with explicit tab separators so that the fixture
    # does not depend on invisible whitespace in the source.
    # NOTE(review): tab delimiters follow the GTF format - confirm that
    # GtfLine splits on tabs.
    records = ["\t".join((
        "Chr5", "Cufflinks", "transcript", "26575364", "26578163",
        "1000", "-", ".",
        shared + 'exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";'
                 'frac "0.732092";cov "81.895309";conf_lo "2.679403";',
    ))]
    records.extend(
        "\t".join(("Chr5", "Cufflinks", "exon", str(start), str(end),
                   ".", "-", ".", shared))
        for start, end in exon_coordinates
    )

    gtf_lines = [GtfLine(line) for line in "\n".join(records).split("\n")]
    # None of the records may have been parsed as a header line.
    self.assertEqual(len([_ for _ in gtf_lines if _.header]), 0)

    transcript = Transcript(gtf_lines[0])
    transcript.add_exons(gtf_lines[1:])
    transcript.finalize()
    fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

    tr_neg = transcript.copy()
    tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
    self.assertEqual(tchecker.strand, "-")
    self.assertEqual(tchecker.fasta_seq, fasta_seq)
    tchecker.check_strand()
    self.assertEqual(tchecker.strand, "-")

    tr_neg = transcript.copy()
    tr_neg.strand = "+"
    for ss in (False, True):
        with self.subTest(ss=ss):
            tchecker = TranscriptChecker(tr_neg.copy(), fasta_seq, strand_specific=ss)
            tchecker.check_strand()
            if ss:
                self.assertEqual(tchecker.strand, "+")
                self.assertTrue(tchecker.suspicious_splicing)
            else:
                self.assertEqual(tchecker.strand, "-")
def test_reverse_with_cds_negative(self):
    """Flipping a coding "-" model relabelled "+" strips or rejects its CDS.

    With ``strip_faulty_cds=True`` the model is flipped back to "-" and
    loses its CDS; with ``strip_faulty_cds=False`` the check must raise
    ``InvalidTranscript``.
    """
    exons = [
        (4765, 5043), (4552, 4679), (4335, 4467), (4102, 4258),
        (3927, 4005), (3762, 3802), (3543, 3659), (3303, 3383),
        (2872, 2934), (2748, 2799), (2435, 2509), (1914, 1961),
        (1745, 1780), (1572, 1646), (1251, 1459),
    ]
    # The CDS differs from the exons only in its two terminal segments.
    coding = [(4765, 4924)] + exons[1:-1] + [(1388, 1459)]

    model = Transcript()
    model.chrom = "Chr5"
    model.start = 1251
    model.end = 5043
    model.id = "AT5G01010.1"
    model.parent = "AT5G01010"
    model.strand = "-"
    model.add_exons(exons)
    model.add_exons(coding, features="CDS")
    model.finalize()
    genomic = self.fasta["Chr5"][model.start - 1:model.end]

    # Correctly annotated: strand and CDS are both retained.
    checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
    checker.check_strand()
    self.assertEqual(checker.strand, "-")
    self.assertGreater(checker.combined_cds_length, 0)

    # Same model, declared on the wrong strand.
    model.unfinalize()
    model.strand = "+"
    checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
    checker.check_strand()
    self.assertEqual(checker.strand, "-")
    self.assertFalse(checker.is_coding)

    # Check that if we want to keep the CDS, this will raise an error
    checker = TranscriptChecker(model, genomic, strip_faulty_cds=False)
    with self.assertRaises(InvalidTranscript):
        checker.check_strand()
def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      is_reference=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"),
                                         ("GC", "AG"),
                                         ("AT", "AC")),
                      force_keep_cds=False,
                      logger=None):
    """Function to create the checker.

    Builds a Transcript from the ``lines`` datastore, wraps it in a
    TranscriptChecker, then finalizes it and checks its strand and ORF.
    Any failure other than a KeyboardInterrupt is logged and turned into a
    ``None`` return value.

    :param lines: all the exon lines for an object. The keys read here are
        "tid", "chrom", "strand", "attributes", "parent", "features" and,
        optionally, "source".
    :type lines: dict
    :param fasta_seq: genomic sequence of the transcript
    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int
    :param lenient: forwarded unchanged to TranscriptChecker.
    :type lenient: bool
    :param strand_specific: forwarded unchanged to TranscriptChecker.
    :type strand_specific: bool
    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]
    :param force_keep_cds: boolean. If set to true, coding transcripts that would be flipped are
        instead excluded. The intention is that this flag will mirror strip_cds.
    :type force_keep_cds: bool
    :param logger: optional logger to use during processing.
    :param is_reference: boolean. If set, the transcript's strand will not be checked.

    :rtype: (None|TranscriptChecker)
    """
    if logger is None:
        logger = create_null_logger()

    if "tid" not in lines:
        logger.error("Lines datastore lacks the transcript ID. Exiting.")
        return None

    try:
        logger.debug("Starting with %s", lines["tid"])
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # Tolerate callers passing the coordinates in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Raised explicitly (not via ``assert``) so the check also runs under
        # ``python -O``; it is reported by the AssertionError handler below.
        if lines["tid"] is None:
            raise AssertionError(lines)
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            coords, phases = [], []
            for feat in lines["features"][feature]:
                # Each entry must be (start, end) or (start, end, phase).
                if not (isinstance(feat, (list, tuple)) and 2 <= len(feat) <= 3):
                    raise exceptions.InvalidTranscript("Invalid feature")
                coords.append((feat[0], feat[1]))
                # Unrecognised phases are stored as None, so the two lists
                # always grow in lockstep.
                if len(feat) == 3 and feat[2] in (0, 1, 2, None):
                    phases.append(feat[2])
                else:
                    phases.append(None)
            transcript_line.add_exons(coords, features=feature, phases=phases)

        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            force_keep_cds=force_keep_cds,
            is_reference=is_reference,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
        transcript_object.check_orf()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript as exc:
        logger.info(
            "Discarded generically invalid transcript %s, exception: %s",
            lines["tid"], exc)
        transcript_object = None
    except AssertionError as exc:
        logger.info("Validation failed on %s, assertion failure: %s",
                    lines["tid"], exc)
        transcript_object = None
    except KeyboardInterrupt:
        # Re-raise the in-flight interrupt, keeping its traceback.
        raise
    except Exception as exc:
        # Deliberate best-effort boundary: log the failure, return None.
        logger.exception(exc)
        transcript_object = None

    return transcript_object
def test_codon_finder_negative_strip_cds(self):
    """An internal stop codon makes the ORF invalid: raise or strip the CDS.

    With ``strip_faulty_cds=False`` the ORF check must raise
    ``InvalidTranscript``; with ``strip_faulty_cds=True`` the model must
    survive as a non-coding transcript without codon attributes.
    """
    # One GTF record per entry, fields separated by explicit tabs so that
    # the fixture does not depend on invisible whitespace in the source.
    # NOTE(review): tab delimiters follow the GTF format - confirm that
    # GtfLine splits on tabs.
    records = (
        'Chr5\tTAIR10\tmRNA\t5335\t5769\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\tCDS\t5697\t5769\t.\t-\t0\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";;',
        'Chr5\tTAIR10\texon\t5697\t5769\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\tCDS\t5335\t5576\t.\t-\t1\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
        'Chr5\tTAIR10\texon\t5335\t5576\t.\t-\t.\t'
        'transcript_id "AT5G01015.1"; gene_id "AT5G01015";',
    )
    gtf_lines = [GtfLine(_) for _ in "\n".join(records).split("\n")]

    t = Transcript(gtf_lines[0])
    t.add_exons(gtf_lines[1:])
    t.finalize()
    seq = str(self.genome[t.chrom][t.start - 1:t.end])
    # Basically insert an internal stop codon. This will make the ORF tests fail,
    # leading to the ORF being stripped
    seq = seq[:72] + Bio.Seq.reverse_complement("TAG") + seq[75:]

    # Expected cDNA, written as 60-column rows and joined without separators
    # (5 * 60 + 15 = 315 bases); row five starts with the inserted TAG.
    correct_seq = "".join((
        "ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG",
        "ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG",
        "GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG",
        "TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC",
        "TAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT",
        "TCTTGCTCCATTTGA",
    ))

    logger = create_default_logger("test_codon_finder_negative_3", level="WARNING")

    # NOTE(review): the loop sits inside assertRaises, so the first raise
    # leaves the block and ``lenient=True`` is never exercised here. Confirm
    # whether it is also expected to raise before moving the loop outside.
    with self.assertRaises(InvalidTranscript):
        for lenient in (False, True):
            tc = TranscriptChecker(t, seq, logger=logger, lenient=lenient,
                                   strip_faulty_cds=False)
            tc.finalize()
            tc.check_orf()

    for lenient in (False, True):
        tc = TranscriptChecker(t, seq, logger=logger, lenient=lenient,
                               strip_faulty_cds=True)
        tc.finalize()

        # Summed length of the two exons.
        correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1)
        self.assertEqual(correct_length, len(correct_seq),
                         (correct_length, len(correct_seq)))
        self.assertEqual(tc.cdna_length, correct_length,
                         (correct_length, tc.cdna_length))
        self.assertEqual(len(tc.cdna), tc.cdna_length)
        self.assertEqual(correct_seq, tc.cdna)

        tc.check_orf()
        self.assertFalse(tc.is_coding)
        tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds())
        self.assertEqual(1, len(tc_orfs))
        self.assertFalse(tc_orfs[0].has_stop_codon,
                         (tc_orfs[0], tc_orfs[0].stop_codon))
        self.assertFalse(tc_orfs[0].has_start_codon,
                         (tc_orfs[0], tc_orfs[0].start_codon))
        self.assertFalse(tc.is_coding)
        self.assertNotIn("has_stop_codon", tc.attributes)
        self.assertNotIn("has_start_codon", tc.attributes)
        self.assertFalse(tc.has_start_codon, tc.cdna)
        self.assertFalse(tc.has_stop_codon, tc.cdna)
def test_reverse_with_cds_positive(self):
    """Flipping a coding "+" model relabelled "-" strips or rejects its CDS.

    With ``strip_faulty_cds=True`` the model is flipped back to "+" and
    loses its CDS; with ``strip_faulty_cds=False`` the check must raise
    ``InvalidTranscript``.
    """
    exons = [(9930, 10172), (10620, 12665), (12797, 13235)]
    coding = [(10638, 12665), (12797, 13003)]

    model = Transcript()
    model.chrom = "Chr5"
    model.start = 9930
    model.end = 13235
    model.id = "AT5G01030.1"
    model.parent = "AT5G01030"
    model.strand = "+"
    model.add_exons(exons)
    model.add_exons(coding, features="CDS")
    model.finalize()
    genomic = self.fasta["Chr5"][model.start - 1:model.end]

    # Correctly annotated: strand and CDS are both retained.
    checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
    checker.check_strand()
    self.assertEqual(checker.strand, "+")
    self.assertGreater(checker.combined_cds_length, 0)

    # Same model, declared on the wrong strand.
    model.unfinalize()
    model.strand = "-"
    checker = TranscriptChecker(model, genomic, strip_faulty_cds=True)
    checker.check_strand()
    self.assertEqual(checker.strand, "+")
    self.assertFalse(checker.is_coding)

    # Keeping the faulty CDS is not allowed: the check has to fail.
    checker = TranscriptChecker(model, genomic, strip_faulty_cds=False)
    with self.assertRaises(InvalidTranscript):
        checker.check_strand()
def test_init(self):
    """Constructor validation and the accepted ways of building a checker.

    Covers: rejection of a missing sequence and of malformed canonical
    splices, the basic properties of a finalized checker, the sequence
    types accepted as input, and construction from a GTF/GFF3 line.
    """
    with self.assertRaises(ValueError):
        TranscriptChecker(self.model, None)

    for wrong_splices in ["AGGT", None, 100]:
        with self.assertRaises(ValueError):
            TranscriptChecker(self.model, self.model_fasta,
                              canonical_splices=wrong_splices)

    tcheck = TranscriptChecker(self.model, self.model_fasta)
    tcheck.finalize()
    self.assertEqual(tcheck.cdna_length, 1718)
    self.assertEqual(
        sorted(tcheck.exons),
        sorted([(exon[0], exon[1]) for exon in self.model.exons]))
    self.assertEqual(
        str(tcheck.fasta_seq.seq), self.model_fasta,
        (type(tcheck.fasta_seq), type(self.model_fasta),
         len(tcheck.fasta_seq), len(self.model_fasta)))

    # The sequence may be supplied in several container types.
    with self.subTest(initializer=Bio.Seq.Seq):
        _ = TranscriptChecker(self.model, Bio.Seq.Seq(str(self.model_fasta)))
    with self.subTest(initializer=str):
        _ = TranscriptChecker(self.model, str(self.model_fasta))
    with self.subTest(initializer=pyfaidx.Sequence):
        _ = TranscriptChecker(
            self.model,
            pyfaidx.Sequence(seq=str(self.model_fasta), name=tcheck.id))

    # Now check initializing with a GFF/GTF line
    for out_format in ["gtf", "gff3"]:
        with self.subTest(out_format=out_format):
            line = self.model.format(out_format).split("\n")[0]
            try:
                tcheck = TranscriptChecker(line, self.model_fasta)
            except ValueError as exc:
                # Report the offending line, but keep the original error
                # attached as the cause instead of discarding it.
                raise ValueError(line) from exc
def test_monoexonic(self):
    """A monoexonic model keeps its strand only when strand-specific.

    Without strand specificity the strand is expected to be reset to None,
    both for the model as parsed and for a copy relabelled "-".
    """
    first_exon = self.gff_lines[1]
    parent_line = self.gff_lines[0]
    # Shrink the transcript line so that it spans the first exon only.
    parent_line.end = first_exon.end

    model = Transcript(parent_line)
    model.add_exon(first_exon)
    model.finalize()
    fasta = self.fasta[model.chrom][model.start - 1:model.end]

    def checked_strand(template, strand_specific):
        """Run the strand check on a copy of ``template``; return the strand."""
        checker = TranscriptChecker(
            template.copy(), fasta, strand_specific=strand_specific)
        checker.check_strand()
        return checker.strand

    self.assertIsNone(checked_strand(model, False))
    self.assertEqual(checked_strand(model, True), "+")

    neg = model.copy()
    neg.strand = "-"
    self.assertIsNone(checked_strand(neg, False))
    self.assertEqual(checked_strand(neg, True), "-")
def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"),
                                         ("GC", "AG"),
                                         ("AT", "AC")),
                      logger=None):
    """Function to create the checker.

    Builds a Transcript from the ``lines`` datastore, wraps it in a
    TranscriptChecker, then finalizes it and checks its strand. Any failure
    other than a KeyboardInterrupt is logged and turned into a ``None``
    return value.

    :param lines: all the exon lines for an object. The keys read here are
        "tid", "chrom", "strand", "attributes", "parent", "features" and,
        optionally, "source".
    :type lines: dict
    :param fasta_seq: genomic sequence of the transcript
    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int
    :param lenient: forwarded unchanged to TranscriptChecker.
    :type lenient: bool
    :param strand_specific: forwarded unchanged to TranscriptChecker.
    :type strand_specific: bool
    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]
    :param logger: optional logger to use during processing.

    :rtype: (None|TranscriptChecker)
    """
    if logger is None:
        logger = create_null_logger("checker")

    logger.debug("Starting with %s", lines["tid"])

    try:
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # Tolerate callers passing the coordinates in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Raised explicitly (not via ``assert``) so the check also runs under
        # ``python -O``; it is logged by the generic handler below.
        if lines["tid"] is None:
            raise AssertionError(lines)
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            transcript_line.add_exons(lines["features"][feature],
                                      features=feature)

        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript:
        logger.info("Discarded generically invalid transcript %s",
                    lines["tid"])
        transcript_object = None
    except KeyboardInterrupt:
        # Re-raise the in-flight interrupt, keeping its traceback.
        raise
    except Exception as exc:
        # Deliberate best-effort boundary: log the failure, return None.
        logger.exception(exc)
        transcript_object = None

    logger.debug("Finished with %s", lines["tid"])
    return transcript_object
def test_monoexonic(self):
    """Strand handling for a single-exon model, on both strands.

    For each strand the non-strand-specific check must clear the strand
    (None) and the strand-specific one must keep it.
    """
    exon = self.gff_lines[1]
    transcript_line = self.gff_lines[0]
    # Shrink the transcript line so that it spans the first exon only.
    transcript_line.end = exon.end

    model = Transcript(transcript_line)
    model.add_exon(exon)
    model.finalize()
    fasta = self.fasta[model.chrom][model.start - 1: model.end]

    template = model
    for kept_strand in ("+", "-"):
        if kept_strand == "-":
            # The second round works on a copy relabelled to the minus strand.
            template = model.copy()
            template.strand = "-"
        for strand_specific in (False, True):
            tcheck = TranscriptChecker(
                template.copy(), fasta, strand_specific=strand_specific)
            tcheck.check_strand()
            if strand_specific:
                self.assertEqual(tcheck.strand, kept_strand)
            else:
                self.assertIsNone(tcheck.strand)
def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"),
                                         ("GC", "AG"),
                                         ("AT", "AC")),
                      logger=None):
    """Function to create the checker.

    A Transcript is assembled from the ``lines`` datastore and handed to a
    TranscriptChecker, which is then finalized and strand-checked. Errors
    other than a KeyboardInterrupt are logged and reported to the caller as
    a ``None`` result.

    :param lines: all the exon lines for an object. The keys read here are
        "tid", "chrom", "strand", "attributes", "parent", "features" and,
        optionally, "source".
    :type lines: dict
    :param fasta_seq: genomic sequence of the transcript
    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int
    :param lenient: forwarded unchanged to TranscriptChecker.
    :type lenient: bool
    :param strand_specific: forwarded unchanged to TranscriptChecker.
    :type strand_specific: bool
    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]
    :param logger: optional logger to use during processing.

    :rtype: (None|TranscriptChecker)
    """
    if logger is None:
        logger = create_null_logger("checker")

    tid = lines["tid"]
    logger.debug("Starting with %s", tid)

    try:
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # The coordinates are accepted in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Explicit raise rather than ``assert``: the validation must not
        # vanish under ``python -O``. The generic handler below logs it.
        if tid is None:
            raise AssertionError(lines)
        transcript_line.id = tid
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            transcript_line.add_exons(lines["features"][feature],
                                      features=feature)

        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", tid)
        transcript_object.finalize()
        transcript_object.check_strand()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            tid)
        transcript_object = None
    except exceptions.InvalidTranscript:
        logger.info("Discarded generically invalid transcript %s", tid)
        transcript_object = None
    except KeyboardInterrupt:
        # Bare raise keeps the original interrupt and its traceback.
        raise
    except Exception as exc:
        # Deliberate best-effort boundary: log the failure, return None.
        logger.exception(exc)
        transcript_object = None

    logger.debug("Finished with %s", tid)
    return transcript_object
def test_monoexonic(self):
    """Strand handling once the shared model is cut down to one exon.

    Without strand specificity the strand must be cleared (None); with it,
    the declared strand must be kept, for "+" and "-" alike.
    """
    # Drop every exon but the first one (in sorted order).
    self.model.unfinalize()
    surplus = sorted(self.model.exons)[1:]
    for exon in surplus:
        self.model.remove_exon(exon)
    self.model.finalize()

    fasta = self.fasta[self.model.chrom][self.model.start - 1:self.model.end]

    def strand_after_check(template, strand_specific):
        """Check a copy of ``template`` and return the resulting strand."""
        checker = TranscriptChecker(
            template.copy(), fasta, strand_specific=strand_specific)
        checker.check_strand()
        return checker.strand

    self.assertIsNone(strand_after_check(self.model, False))
    self.assertEqual(strand_after_check(self.model, True), "+")

    neg = self.model.copy()
    neg.strand = "-"
    self.assertIsNone(strand_after_check(neg, False))
    self.assertEqual(strand_after_check(neg, True), "-")
def test_rev_complement(self):
    """The reverse complement of an upper-case sequence keeps ``N`` as is."""
    forward = "AGTCGTGCAGNGTCGAAGTGCAACAGTGC"
    expected = "GCACTGTTGCACTTCGACNCTGCACGACT"

    observed = TranscriptChecker.rev_complement(forward)
    self.assertEqual(observed, expected)
def test_translation_table(self):
    """The translation table pairs the ordinals of complementary bases."""
    pairs = (("A", "T"), ("C", "G"), ("G", "C"), ("T", "A"))
    # Same mapping as {65: 84, 67: 71, 71: 67, 84: 65}.
    expected = {ord(base): ord(complement) for base, complement in pairs}

    self.assertEqual(TranscriptChecker.get_translation_table(), expected)