def construct_gene_object(ensembl, transcript_id):
    """Build a Transcript for a single Ensembl transcript ID.

    Args:
        ensembl: object used to query Ensembl. It must provide the
            get_genomic_seq_for_transcript, get_cds_seq_for_transcript,
            get_cds_ranges_for_transcript and get_exon_ranges_for_transcript
            methods called below.
        transcript_id: Ensembl transcript ID string, passed unchanged to
            every query and to the Transcript constructor.

    Returns:
        a Transcript holding the exon and CDS ranges, the CDS sequence and
        the genomic sequence retrieved for transcript_id.

    Raises:
        Nothing is raised directly in this function; any error from the
        Ensembl queries or from the Transcript methods propagates to the
        caller (presumably a ValueError when the sequences disagree -
        TODO confirm against Transcript).
    """
    # The genomic sequence is requested with this many extra bases, and the
    # same number is handed to the Transcript as the offset.
    flank = 10

    # sequences first ...
    chrom, start, end, strand, gdna = ensembl.get_genomic_seq_for_transcript(
        transcript_id, expand=flank)
    coding_sequence = ensembl.get_cds_seq_for_transcript(transcript_id)

    # ... then the coordinates of the coding regions and exons
    coding_ranges = ensembl.get_cds_ranges_for_transcript(transcript_id)
    exons = ensembl.get_exon_ranges_for_transcript(transcript_id)

    # assemble the Transcript: coordinates before sequences
    tx = Transcript(transcript_id, chrom, start, end, strand)
    tx.set_exons(exons, coding_ranges)
    tx.set_cds(coding_ranges)
    tx.add_cds_sequence(coding_sequence)
    tx.add_genomic_sequence(gdna, offset=flank)

    return tx
def construct_gene_object(ensembl, transcript_id):
    """Assemble a Transcript object from data requested via `ensembl`.

    Args:
        ensembl: request object exposing the four
            get_*_for_transcript methods used below.
        transcript_id: string identifying the Ensembl transcript; it is
            forwarded to each request and used as the Transcript name.

    Returns:
        a Transcript with exons, CDS coordinates, CDS sequence and genomic
        sequence set, in that order.

    Raises:
        Errors from the requests or from the Transcript setters are not
        caught here and reach the caller (the original documentation
        mentions a ValueError on mismatching CDS sequences - verify
        against Transcript).
    """
    padding = 10  # used both for the request and as the sequence offset

    # the request returns the transcript location together with its sequence
    (chrom, start, end, strand, genomic_seq) = (
        ensembl.get_genomic_seq_for_transcript(transcript_id, expand=padding)
    )
    cds_seq = ensembl.get_cds_seq_for_transcript(transcript_id)
    cds_coords = ensembl.get_cds_ranges_for_transcript(transcript_id)
    exon_coords = ensembl.get_exon_ranges_for_transcript(transcript_id)

    result = Transcript(transcript_id, chrom, start, end, strand)

    # coordinates are set before any sequence is attached
    result.set_exons(exon_coords, cds_coords)
    result.set_cds(cds_coords)

    result.add_cds_sequence(cds_seq)
    result.add_genomic_sequence(genomic_seq, offset=padding)
    return result
async def construct_gene_object(ensembl, transcript_id):
    """Asynchronously build a Transcript for one Ensembl transcript ID.

    The four lookups are awaited together through asyncio.gather, and the
    Transcript is populated once all of them have returned.

    Args:
        ensembl: request object, passed as the first argument to each of
            the lookup functions called below.
        transcript_id: string for an Ensembl transcript ID.

    Returns:
        a Transcript with exons, CDS coordinates, CDS sequence and genomic
        sequence set.

    Raises:
        Errors from the lookups or from the Transcript methods are not
        caught here (the original documentation mentions a ValueError on
        mismatching CDS sequences - verify against Transcript).
    """
    flank = 10  # bases requested around the transcript, reused as the offset

    # gather returns results in argument order
    region, coding_sequence, coding_ranges, exon_ranges = await asyncio.gather(
        get_genomic_seq_for_transcript(ensembl, transcript_id, expand=flank),
        get_cds_seq_for_transcript(ensembl, transcript_id),
        get_cds_ranges_for_transcript(ensembl, transcript_id),
        get_exon_ranges_for_transcript(ensembl, transcript_id),
    )
    chrom, start, end, strand, genomic_sequence = region

    # coordinates first, then the sequences
    tx = Transcript(transcript_id, chrom, start, end, strand)
    tx.set_exons(exon_ranges, coding_ranges)
    tx.set_cds(coding_ranges)
    tx.add_cds_sequence(coding_sequence)
    tx.add_genomic_sequence(genomic_sequence, offset=flank)

    return tx
def test_add_genomic_sequencE_without_cds_coords(self):
    """ check that adding gDNA before the CDS coords are set raises ValueError
    """
    gdna = 'CGTAGACTGTACGCATCGATT'

    tx = Transcript("a", "1", 10, 20, "+")
    tx.set_exons([(10, 20)], [(10, 20)])

    # only the exons are known at this point, so this must fail
    self.assertRaises(ValueError, tx.add_genomic_sequence, gdna, offset=5)

    # after the CDS coords are set, the same call goes through
    tx.set_cds([(10, 20)])
    tx.add_genomic_sequence(gdna, offset=5)
def test_add_genomic_sequencE_without_cds_coords(self):
    """ gDNA cannot be added until the CDS coords have been set
    """
    coords = (10, 20)
    sequence = 'CGTAGACTGTACGCATCGATT'

    transcript = Transcript("a", "1", 10, 20, "+")
    transcript.set_exons([coords], [coords])

    # no CDS coords yet: a ValueError is expected
    with self.assertRaises(ValueError):
        transcript.add_genomic_sequence(sequence, offset=5)

    # with CDS coords in place the identical call must not raise
    transcript.set_cds([coords])
    transcript.add_genomic_sequence(sequence, offset=5)
def test___add__not_overlapping(self):
    ''' check the genomic sequence length after adding two transcripts
    whose coordinates do not overlap
    '''
    # first transcript: 10-50, with 40 bases of sequence
    first = Transcript("a", "1", 10, 50, "+")
    first.set_exons([(10, 50)], [(10, 50)])
    first.set_cds([(10, 50)])
    first.add_genomic_sequence('N' * 40)

    # second transcript: 60-80, with 20 bases of sequence
    second = Transcript("b", "1", 60, 80, "+")
    second.set_exons([(60, 80)], [(60, 80)])
    second.set_cds([(60, 80)])
    second.add_genomic_sequence('N' * 20)

    merged = first + second
    merged_sequence = merged.get_genomic_sequence()
    self.assertEqual(len(merged_sequence), 70)
def construct_gene(self):
    """ build the small "TEST" transcript on chrom 1, "+" strand

    Returns:
        a Transcript spanning 0-70 with a single exon/CDS range (5, 58),
        with its CDS and genomic sequences attached.
    """
    # kept as two separate lists, one for the exons and one for the CDS
    exon_ranges = [(5, 58)]
    coding_ranges = [(5, 58)]

    coding_sequence = "ATGTGGGCTCCACCAGCAGCAATCATGGGATGGGCCCACCAAGAAGGTGGGTAA"
    genomic_sequence = "GGGGGATGTGGGCTCCACCAGCAGCAATCATGGGATGGGCCCACCAAGAAGGTGGGTAACCAGGCCCC"

    gene = Transcript("TEST", "1", 0, 70, "+")

    # coordinates go in before the sequences
    gene.set_exons(exon_ranges, coding_ranges)
    gene.set_cds(coding_ranges)
    gene.add_cds_sequence(coding_sequence)
    gene.add_genomic_sequence(genomic_sequence)

    return gene
def test___add__cds_length_fixed(self):
    """ merging two transcripts must not raise, even with fixed CDS coords
    """
    coding_sequence = 'ACTGTACGCAT'

    # transcript "a" covers 10-20
    a = Transcript("a", "1", 10, 20, "+")
    a.set_exons([(10, 20)], [(10, 20)])
    a.set_cds([(10, 20)])
    a.add_cds_sequence(coding_sequence)
    a.add_genomic_sequence('CGTAGACTGTACGCATCGATT', offset=5)

    # transcript "b" covers 0-10
    b = Transcript("b", "1", 0, 10, "+")
    b.set_exons([(0, 10)], [(0, 10)])
    b.set_cds([(0, 10)])
    b.add_cds_sequence(coding_sequence)
    b.add_genomic_sequence('CGTAGACTGTACGCATCGTAG', offset=5)

    # The addition itself is the test: there is no assertion on the result.
    # Before tx.cpp was fixed to adjust an exon coordinate simultaneously,
    # this line gave an error.
    merged = a + b
def set_transcript(self):
    """ build the reference Transcript for ENST00000242577

    Returns:
        a Transcript on chrom 12, "+" strand, with three exon ranges, two
        CDS ranges, the CDS sequence, and a genomic sequence that is added
        with offset=10.
    """
    transcript = Transcript("ENST00000242577", '12', 120933859, 120936296, "+")

    exons = [
        (120933859, 120934019),
        (120934219, 120934356),
        (120935876, 120936296),
    ]
    coding = [
        (120934225, 120934356),
        (120935876, 120936013),
    ]
    transcript.set_exons(exons, coding)
    transcript.set_cds(coding)

    # adjacent string literals are joined into one sequence
    coding_sequence = (
        "ATGTGCGACCGAAAGGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTC"
        "GGTGGAGTGCGCTACTCAGGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATC"
        "AAGAAGGAATTTGACAAGAAGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTA"
        "GTTATGTGACACATGAAACCAAACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCT"
        "GTTCAAATCTGGTTAA"
    )
    genomic_sequence = (
        "GGGCGGGGCCGGCGGGAGCAGGGCGGGGCCTGAGCACTAGGCGGCGGCGGCTGGCGTGGG"
        "GCTGCTTAGATGCGCCACGGTTTCGGTAGCGACGGTATCTCTAGCCGGGCCTGAGCTGTGCTAGCA"
        "CCTCCCCCAGGAGACCGTTGCAGTCGGCCAGCCCCCTTCTCCACGGTGAGAAACTCGGGGGGCCAG"
        "GGGGTGTCCTCGCTGCCTTATTTCGCCCCACTCCGGACTTAGCCCTCCGCGTAGCCCGCGCTTCCT"
        "GAGAAGTGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGGCAGTTAGTGCCTGG"
        "GGGGCGCGGCCCAACTCAACCCCTTACCCCAGGCCTTGCCCACTAGGTAACCATGTGCGACCGAAA"
        "GGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTCGGTGGAGTGCGCTACTCA"
        "GGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATCAAGAAGGTGAGGATGGGCGC"
        "GGGGGCCGATACGCAGCCGGGAGCAGGGGGTTCCTTCCCCCCGATCCTGCTTTCCTAAGGGCGCCT"
        "GACAGGTCCCGGGAATACTGCTGGCGGCTTGGGGCGTAGAAGCTTCCAGAAAGGACGCAGATGCAT"
        "TTTGCGCTCCTGTGGAGAAGACCAGACCCCCGGCGTCCGAAGTTTTTTTTTTTTTTTTTTTAATTA"
        "CCCAGCTCCGCGGGGGGAAAGCGCCACCTAGCAACGGTATCTAAGATCAGGGAGCAGCGGTTCCCC"
        "CTTCTGTGTGGTTCCTGCGCCGAGGATCCATCTGGGTGTTCCGGAGGGGGGAGCTGCGTGGGTGTT"
        "TCCAGCCGGGCCGGGAGGAGATCTTGCCAGCCTTCCAGTGGGGAGTTGAGGGAAGGTGGTGGGTGG"
        "TGGCGGGGCTGGGGGCTGGGGTAGGGGCTTGGTAAATGGCAGTCTAGAAAGCCGGCAGGACTGCCA"
        "ACTTCTCGAGCAGTGTTTGCTGGAAGGGAAGAAAGCTGGCAGCCTAAGCCGTGGGAGGGTTCCAGT"
        "CGAGAATGGGAAGATGAAAGACTTCAGATGGAACAGAAATAAATGCCTTTTTTGACAAACGCAGCA"
        "GTGCGTGCCTCTAGCTTGCAAGAGCGTTACTCCCCTTCATAGCTTTAAAAGGTTTTCGCACTGCGT"
        "GCAGTTAGAGTAGCTAAATCTTGTGTGACGCTCCACAAACACTTGTAAGAATTTTGCAGAGAAAGA"
        "TAACCGTTGCCACCCAATGCCCCCCACAGGCATTCTACTCCCCAGTACCTCTTAGGGTGGGAGAAA"
        "TGGTGAAGAGTTGTTCCTACAACTTGCTAACCTAGTGGACAGGGTAGTAGATTAGCATCATCCGGA"
        "TAGATGTGAAGAGGACGGCTGTTTGGATAATAATTAAGGATAAAATTTGGCCAGTTGACAGATTCT"
        "GTTTCCAGCAGTTTTTACAGCAACAGTGGAGTGCTTCAGTATTGTGTTCCTGTAAATTTAATTTTG"
        "ATCCGCAATCATTTGGTATACAATGCTGTTTGAAGTTTTGTCCTATTGGAAAAGTCTTGTGTTGCA"
        "GGGGTGCAGTTAAGATCTTTGTGATGAGGAATGGGATGGGCTAATTTTTTGCCGTTTTCTTGGAAT"
        "TGGGGGCATGGCAAATACAGTAGGGTAGTTTAGTTCTCTACACAGAACATGATAAACTACACCTGT"
        "TGATGTCACCGTCTGTCAATGAATATTATAGAAGGTATGAAGGTGTAATTACCATAATAACAAAAC"
        "ACCCTGTCTTTAGGGCTGACCTTTCGTCCTTTGACCTCCTCAGCCTCCATTCCCATCTTCGCTCAG"
        "ACTGCAAGTATGTTTGTATTAATGTACTATGTAGGCGGCTTGGAGCTGGGGAACATTCTTTCATTC"
        "TAAGAATTTGCAGATGCTGACGTTCCTCCTTTCTGCCCCTACAGGCTCTGGCTTATCCAAGAGGCA"
        "AACACTGACCTCTGGTAATTAAAATCCTAGTTCTTTTCTTTTGTCTTTTCCAGGAATTTGACAAGA"
        "AGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTAGTTATGTGACACATGAAACCA"
        "AACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCTGTTCAAATCTGGTTAAAAGCATG"
        "GACTGTGCCACACACCCAGTGATCCATCCAAAAACAAGGACTGCAGCCTAAATTCCAAATACCAGA"
        "GACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAG"
        "GGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTA"
        "TTTTCTATTCCATACTTCTGCCCACGTTGTTTTCTCTCAAAATCCATTCCTTTAAAAAATAAATCT"
        "GATGCAGATGTGTATGTGTGTG"
    )

    transcript.add_cds_sequence(coding_sequence)
    transcript.add_genomic_sequence(genomic_sequence, offset=10)

    return transcript
def set_transcript(self):
    """ create the expected Transcript for the known gene ENST00000242577

    Returns:
        a Transcript (chrom 12, "+" strand) carrying its exon and CDS
        ranges, its CDS sequence, and its genomic sequence added with
        offset=10.
    """
    # sequences: adjacent literals are concatenated inside the parentheses
    cds_sequence = (
        "ATGTGCGACCGAAAGGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTC"
        "GGTGGAGTGCGCTACTCAGGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATC"
        "AAGAAGGAATTTGACAAGAAGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTA"
        "GTTATGTGACACATGAAACCAAACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCT"
        "GTTCAAATCTGGTTAA"
    )
    gdna = (
        "GGGCGGGGCCGGCGGGAGCAGGGCGGGGCCTGAGCACTAGGCGGCGGCGGCTGGCGTGGG"
        "GCTGCTTAGATGCGCCACGGTTTCGGTAGCGACGGTATCTCTAGCCGGGCCTGAGCTGTGCTAGCA"
        "CCTCCCCCAGGAGACCGTTGCAGTCGGCCAGCCCCCTTCTCCACGGTGAGAAACTCGGGGGGCCAG"
        "GGGGTGTCCTCGCTGCCTTATTTCGCCCCACTCCGGACTTAGCCCTCCGCGTAGCCCGCGCTTCCT"
        "GAGAAGTGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGGCAGTTAGTGCCTGG"
        "GGGGCGCGGCCCAACTCAACCCCTTACCCCAGGCCTTGCCCACTAGGTAACCATGTGCGACCGAAA"
        "GGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTCGGTGGAGTGCGCTACTCA"
        "GGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATCAAGAAGGTGAGGATGGGCGC"
        "GGGGGCCGATACGCAGCCGGGAGCAGGGGGTTCCTTCCCCCCGATCCTGCTTTCCTAAGGGCGCCT"
        "GACAGGTCCCGGGAATACTGCTGGCGGCTTGGGGCGTAGAAGCTTCCAGAAAGGACGCAGATGCAT"
        "TTTGCGCTCCTGTGGAGAAGACCAGACCCCCGGCGTCCGAAGTTTTTTTTTTTTTTTTTTTAATTA"
        "CCCAGCTCCGCGGGGGGAAAGCGCCACCTAGCAACGGTATCTAAGATCAGGGAGCAGCGGTTCCCC"
        "CTTCTGTGTGGTTCCTGCGCCGAGGATCCATCTGGGTGTTCCGGAGGGGGGAGCTGCGTGGGTGTT"
        "TCCAGCCGGGCCGGGAGGAGATCTTGCCAGCCTTCCAGTGGGGAGTTGAGGGAAGGTGGTGGGTGG"
        "TGGCGGGGCTGGGGGCTGGGGTAGGGGCTTGGTAAATGGCAGTCTAGAAAGCCGGCAGGACTGCCA"
        "ACTTCTCGAGCAGTGTTTGCTGGAAGGGAAGAAAGCTGGCAGCCTAAGCCGTGGGAGGGTTCCAGT"
        "CGAGAATGGGAAGATGAAAGACTTCAGATGGAACAGAAATAAATGCCTTTTTTGACAAACGCAGCA"
        "GTGCGTGCCTCTAGCTTGCAAGAGCGTTACTCCCCTTCATAGCTTTAAAAGGTTTTCGCACTGCGT"
        "GCAGTTAGAGTAGCTAAATCTTGTGTGACGCTCCACAAACACTTGTAAGAATTTTGCAGAGAAAGA"
        "TAACCGTTGCCACCCAATGCCCCCCACAGGCATTCTACTCCCCAGTACCTCTTAGGGTGGGAGAAA"
        "TGGTGAAGAGTTGTTCCTACAACTTGCTAACCTAGTGGACAGGGTAGTAGATTAGCATCATCCGGA"
        "TAGATGTGAAGAGGACGGCTGTTTGGATAATAATTAAGGATAAAATTTGGCCAGTTGACAGATTCT"
        "GTTTCCAGCAGTTTTTACAGCAACAGTGGAGTGCTTCAGTATTGTGTTCCTGTAAATTTAATTTTG"
        "ATCCGCAATCATTTGGTATACAATGCTGTTTGAAGTTTTGTCCTATTGGAAAAGTCTTGTGTTGCA"
        "GGGGTGCAGTTAAGATCTTTGTGATGAGGAATGGGATGGGCTAATTTTTTGCCGTTTTCTTGGAAT"
        "TGGGGGCATGGCAAATACAGTAGGGTAGTTTAGTTCTCTACACAGAACATGATAAACTACACCTGT"
        "TGATGTCACCGTCTGTCAATGAATATTATAGAAGGTATGAAGGTGTAATTACCATAATAACAAAAC"
        "ACCCTGTCTTTAGGGCTGACCTTTCGTCCTTTGACCTCCTCAGCCTCCATTCCCATCTTCGCTCAG"
        "ACTGCAAGTATGTTTGTATTAATGTACTATGTAGGCGGCTTGGAGCTGGGGAACATTCTTTCATTC"
        "TAAGAATTTGCAGATGCTGACGTTCCTCCTTTCTGCCCCTACAGGCTCTGGCTTATCCAAGAGGCA"
        "AACACTGACCTCTGGTAATTAAAATCCTAGTTCTTTTCTTTTGTCTTTTCCAGGAATTTGACAAGA"
        "AGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTAGTTATGTGACACATGAAACCA"
        "AACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCTGTTCAAATCTGGTTAAAAGCATG"
        "GACTGTGCCACACACCCAGTGATCCATCCAAAAACAAGGACTGCAGCCTAAATTCCAAATACCAGA"
        "GACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAG"
        "GGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTA"
        "TTTTCTATTCCATACTTCTGCCCACGTTGTTTTCTCTCAAAATCCATTCCTTTAAAAAATAAATCT"
        "GATGCAGATGTGTATGTGTGTG"
    )

    # coordinates of the exons and of the coding regions
    exon_coords = [(120933859, 120934019), (120934219, 120934356),
        (120935876, 120936296)]
    cds_coords = [(120934225, 120934356), (120935876, 120936013)]

    known = Transcript("ENST00000242577", '12', 120933859, 120936296, "+")
    known.set_exons(exon_coords, cds_coords)
    known.set_cds(cds_coords)
    known.add_cds_sequence(cds_sequence)
    known.add_genomic_sequence(gdna, offset=10)

    return known