示例#1
0
def construct_gene_object(ensembl, transcript_id):
    """ build a Transcript object for a single Ensembl transcript
    
    Args:
        ensembl: EnsemblRequest object used to request data from ensembl
        transcript_id: string for an Ensembl transcript ID
    
    Returns:
        a Transcript object, holding the transcript coordinates, the exon and
        CDS ranges, plus the CDS and genomic sequences.
    
    Raises:
        ValueError if CDS from genomic sequence given gene coordinates and CDS
        retrieved from Ensembl do not match (presumably raised while the
        sequences are added to the Transcript; not visible from here).
    """
    
    # the same value is used to expand the genomic request and as the offset
    # when the genomic sequence is stored on the transcript
    flank = 10
    
    # sequences for the identified transcript
    region = ensembl.get_genomic_seq_for_transcript(transcript_id, expand=flank)
    chrom, start, end, strand, gdna = region
    cds_seq = ensembl.get_cds_seq_for_transcript(transcript_id)
    
    # coordinates of the CDS and the exons
    cds_coords = ensembl.get_cds_ranges_for_transcript(transcript_id)
    exon_coords = ensembl.get_exon_ranges_for_transcript(transcript_id)
    
    # coordinates go in first, then the sequences
    tx = Transcript(transcript_id, chrom, start, end, strand)
    tx.set_exons(exon_coords, cds_coords)
    tx.set_cds(cds_coords)
    tx.add_cds_sequence(cds_seq)
    tx.add_genomic_sequence(gdna, offset=flank)
    
    return tx
示例#2
0
def construct_gene_object(ensembl, transcript_id):
    """ assemble a Transcript object from data held in the ensembl databases

    Args:
        ensembl: EnsemblRequest object to request data from ensembl
        transcript_id: string for an Ensembl transcript ID

    Returns:
        a Transcript object, populated with coordinates, exon and CDS ranges,
        the CDS sequence and the genomic sequence.

    Raises:
        ValueError if CDS from genomic sequence given gene coordinates and CDS
        retrieved from Ensembl do not match (presumably raised by the
        Transcript sequence setters; not visible from here).
    """

    # one value drives both the request expansion and the storage offset
    padding = 10

    # fetch the sequences first, then the CDS and exon coordinates
    located = ensembl.get_genomic_seq_for_transcript(transcript_id,
                                                     expand=padding)
    (chrom, start, end, strand, genomic_sequence) = located
    coding_sequence = ensembl.get_cds_seq_for_transcript(transcript_id)
    coding_ranges = ensembl.get_cds_ranges_for_transcript(transcript_id)
    exons = ensembl.get_exon_ranges_for_transcript(transcript_id)

    # create the Transcript and load the coordinates before the sequences
    result = Transcript(transcript_id, chrom, start, end, strand)
    result.set_exons(exons, coding_ranges)
    result.set_cds(coding_ranges)

    result.add_cds_sequence(coding_sequence)
    result.add_genomic_sequence(genomic_sequence, offset=padding)

    return result
示例#3
0
async def construct_gene_object(ensembl, transcript_id):
    """ build a Transcript object for a single Ensembl transcript

    The four Ensembl lookups are awaited together through asyncio.gather().

    Args:
        ensembl: EnsemblRequest object to request data from ensembl
        transcript_id: string for an Ensembl transcript ID

    Returns:
        a Transcript object, holding the transcript coordinates, the exon and
        CDS ranges, plus the CDS and genomic sequences.

    Raises:
        ValueError if CDS from genomic sequence given gene coordinates and CDS
        retrieved from Ensembl do not match (presumably raised while the
        sequences are added to the Transcript; not visible from here).
    """
    # the same value expands the genomic request and offsets the stored gDNA
    flank = 10

    # gather() returns results in argument order
    region, cds_seq, cds, exons = await asyncio.gather(
        get_genomic_seq_for_transcript(ensembl, transcript_id, expand=flank),
        get_cds_seq_for_transcript(ensembl, transcript_id),
        get_cds_ranges_for_transcript(ensembl, transcript_id),
        get_exon_ranges_for_transcript(ensembl, transcript_id),
    )
    chrom, start, end, strand, genomic = region

    # coordinates go in first, then the sequences
    tx = Transcript(transcript_id, chrom, start, end, strand)
    tx.set_exons(exons, cds)
    tx.set_cds(cds)
    tx.add_cds_sequence(cds_seq)
    tx.add_genomic_sequence(genomic, offset=flank)

    return tx
示例#4
0
    def test_add_genomic_sequencE_without_cds_coords(self):
        """ check that adding gDNA fails until the CDS coords have been set
        """

        gdna = 'CGTAGACTGTACGCATCGATT'

        tx = Transcript("a", "1", 10, 20, "+")
        tx.set_exons([(10, 20)], [(10, 20)])

        # no CDS coordinates yet, so adding the genomic sequence must raise
        with self.assertRaises(ValueError):
            tx.add_genomic_sequence(gdna, offset=5)

        # with the CDS coordinates in place the same call should go through
        tx.set_cds([(10, 20)])
        tx.add_genomic_sequence(gdna, offset=5)
 def test_add_genomic_sequencE_without_cds_coords(self):
     """ gDNA can only be added after the CDS coords have been defined
     """
     
     sequence = 'CGTAGACTGTACGCATCGATT'
     transcript = Transcript("a", "1", 10, 20, "+")
     transcript.set_exons([(10, 20)], [(10, 20)])
     
     # exons are set but the CDS is not, so this has to raise
     with self.assertRaises(ValueError):
         transcript.add_genomic_sequence(sequence, offset=5)
     
     # after set_cds() the identical call is expected to succeed
     transcript.set_cds([(10, 20)])
     transcript.add_genomic_sequence(sequence, offset=5)
示例#6
0
 def test___add__not_overlapping(self):
     ''' check __add__() for two transcripts that do not overlap
     '''
     
     upstream = Transcript("a", "1", 10, 50, "+")
     downstream = Transcript("b", "1", 60, 80, "+")
     
     # first transcript: one exon, CDS over the same range
     upstream.set_exons([(10, 50)], [(10, 50)])
     upstream.set_cds([(10, 50)])
     upstream.add_genomic_sequence('N' * 40)
     
     # second transcript: one exon, CDS over the same range
     downstream.set_exons([(60, 80)], [(60, 80)])
     downstream.set_cds([(60, 80)])
     downstream.add_genomic_sequence('N' * 20)
     
     merged = upstream + downstream
     self.assertEqual(len(merged.get_genomic_sequence()), 70)
示例#7
0
 def construct_gene(self):
     """ build a small single-exon Transcript on the + strand for testing
     """
     
     # the exon and the CDS span the same range
     exon_ranges = [(5, 58)]
     cds_ranges = [(5, 58)]
     
     gene = Transcript("TEST", "1", 0, 70, "+")
     gene.set_exons(exon_ranges, cds_ranges)
     gene.set_cds(cds_ranges)
     
     cds_seq = "ATGTGGGCTCCACCAGCAGCAATCATGGGATGGGCCCACCAAGAAGGTGGGTAA"
     gdna_seq = "GGGGGATGTGGGCTCCACCAGCAGCAATCATGGGATGGGCCCACCAAGAAGGTGGGTAACCAGGCCCC"
     gene.add_cds_sequence(cds_seq)
     gene.add_genomic_sequence(gdna_seq)
     
     return gene
示例#8
0
 def test___add__cds_length_fixed(self):
     """ check that transcripts can be merged, even with fixed CDS coords
     """
     
     cds_seq = 'ACTGTACGCAT'
     
     first = Transcript("a", "1", 10, 20, "+")
     first.set_exons([(10, 20)], [(10, 20)])
     first.set_cds([(10, 20)])
     first.add_cds_sequence(cds_seq)
     first.add_genomic_sequence('CGTAGACTGTACGCATCGATT', offset=5)
     
     second = Transcript("b", "1", 0, 10, "+")
     second.set_exons([(0, 10)], [(0, 10)])
     second.set_cds([(0, 10)])
     second.add_cds_sequence(cds_seq)
     second.add_genomic_sequence('CGTAGACTGTACGCATCGTAG', offset=5)
     
     # the merge itself is the check: without a fix to tx.cpp to adjust an
     # exon coordinate simultaneously, the line below would give an error.
     _ = first + second
示例#9
0
 def set_transcript(self):
     """ build the reference Transcript for a known gene (ENST00000242577)
     """
     
     # three exons, with the CDS covering part of the second and third
     exons = [
         (120933859, 120934019),
         (120934219, 120934356),
         (120935876, 120936296),
     ]
     coding = [(120934225, 120934356), (120935876, 120936013)]
     
     transcript = Transcript("ENST00000242577", '12', 120933859, 120936296, "+")
     transcript.set_exons(exons, coding)
     transcript.set_cds(coding)
     
     cds_seq = (
         "ATGTGCGACCGAAAGGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTC"
         "GGTGGAGTGCGCTACTCAGGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATC"
         "AAGAAGGAATTTGACAAGAAGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTA"
         "GTTATGTGACACATGAAACCAAACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCT"
         "GTTCAAATCTGGTTAA"
     )
     
     # genomic sequence, stored below with an offset of 10
     gdna = (
         "GGGCGGGGCCGGCGGGAGCAGGGCGGGGCCTGAGCACTAGGCGGCGGCGGCTGGCGTGGG"
         "GCTGCTTAGATGCGCCACGGTTTCGGTAGCGACGGTATCTCTAGCCGGGCCTGAGCTGTGCTAGCA"
         "CCTCCCCCAGGAGACCGTTGCAGTCGGCCAGCCCCCTTCTCCACGGTGAGAAACTCGGGGGGCCAG"
         "GGGGTGTCCTCGCTGCCTTATTTCGCCCCACTCCGGACTTAGCCCTCCGCGTAGCCCGCGCTTCCT"
         "GAGAAGTGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGGCAGTTAGTGCCTGG"
         "GGGGCGCGGCCCAACTCAACCCCTTACCCCAGGCCTTGCCCACTAGGTAACCATGTGCGACCGAAA"
         "GGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTCGGTGGAGTGCGCTACTCA"
         "GGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATCAAGAAGGTGAGGATGGGCGC"
         "GGGGGCCGATACGCAGCCGGGAGCAGGGGGTTCCTTCCCCCCGATCCTGCTTTCCTAAGGGCGCCT"
         "GACAGGTCCCGGGAATACTGCTGGCGGCTTGGGGCGTAGAAGCTTCCAGAAAGGACGCAGATGCAT"
         "TTTGCGCTCCTGTGGAGAAGACCAGACCCCCGGCGTCCGAAGTTTTTTTTTTTTTTTTTTTAATTA"
         "CCCAGCTCCGCGGGGGGAAAGCGCCACCTAGCAACGGTATCTAAGATCAGGGAGCAGCGGTTCCCC"
         "CTTCTGTGTGGTTCCTGCGCCGAGGATCCATCTGGGTGTTCCGGAGGGGGGAGCTGCGTGGGTGTT"
         "TCCAGCCGGGCCGGGAGGAGATCTTGCCAGCCTTCCAGTGGGGAGTTGAGGGAAGGTGGTGGGTGG"
         "TGGCGGGGCTGGGGGCTGGGGTAGGGGCTTGGTAAATGGCAGTCTAGAAAGCCGGCAGGACTGCCA"
         "ACTTCTCGAGCAGTGTTTGCTGGAAGGGAAGAAAGCTGGCAGCCTAAGCCGTGGGAGGGTTCCAGT"
         "CGAGAATGGGAAGATGAAAGACTTCAGATGGAACAGAAATAAATGCCTTTTTTGACAAACGCAGCA"
         "GTGCGTGCCTCTAGCTTGCAAGAGCGTTACTCCCCTTCATAGCTTTAAAAGGTTTTCGCACTGCGT"
         "GCAGTTAGAGTAGCTAAATCTTGTGTGACGCTCCACAAACACTTGTAAGAATTTTGCAGAGAAAGA"
         "TAACCGTTGCCACCCAATGCCCCCCACAGGCATTCTACTCCCCAGTACCTCTTAGGGTGGGAGAAA"
         "TGGTGAAGAGTTGTTCCTACAACTTGCTAACCTAGTGGACAGGGTAGTAGATTAGCATCATCCGGA"
         "TAGATGTGAAGAGGACGGCTGTTTGGATAATAATTAAGGATAAAATTTGGCCAGTTGACAGATTCT"
         "GTTTCCAGCAGTTTTTACAGCAACAGTGGAGTGCTTCAGTATTGTGTTCCTGTAAATTTAATTTTG"
         "ATCCGCAATCATTTGGTATACAATGCTGTTTGAAGTTTTGTCCTATTGGAAAAGTCTTGTGTTGCA"
         "GGGGTGCAGTTAAGATCTTTGTGATGAGGAATGGGATGGGCTAATTTTTTGCCGTTTTCTTGGAAT"
         "TGGGGGCATGGCAAATACAGTAGGGTAGTTTAGTTCTCTACACAGAACATGATAAACTACACCTGT"
         "TGATGTCACCGTCTGTCAATGAATATTATAGAAGGTATGAAGGTGTAATTACCATAATAACAAAAC"
         "ACCCTGTCTTTAGGGCTGACCTTTCGTCCTTTGACCTCCTCAGCCTCCATTCCCATCTTCGCTCAG"
         "ACTGCAAGTATGTTTGTATTAATGTACTATGTAGGCGGCTTGGAGCTGGGGAACATTCTTTCATTC"
         "TAAGAATTTGCAGATGCTGACGTTCCTCCTTTCTGCCCCTACAGGCTCTGGCTTATCCAAGAGGCA"
         "AACACTGACCTCTGGTAATTAAAATCCTAGTTCTTTTCTTTTGTCTTTTCCAGGAATTTGACAAGA"
         "AGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTAGTTATGTGACACATGAAACCA"
         "AACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCTGTTCAAATCTGGTTAAAAGCATG"
         "GACTGTGCCACACACCCAGTGATCCATCCAAAAACAAGGACTGCAGCCTAAATTCCAAATACCAGA"
         "GACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAG"
         "GGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTA"
         "TTTTCTATTCCATACTTCTGCCCACGTTGTTTTCTCTCAAAATCCATTCCTTTAAAAAATAAATCT"
         "GATGCAGATGTGTATGTGTGTG"
     )
     
     transcript.add_cds_sequence(cds_seq)
     transcript.add_genomic_sequence(gdna, offset=10)
     
     return transcript
示例#10
0
    def set_transcript(self):
        """ build the expected Transcript for a known gene (ENST00000242577)
        """

        # three exons; the CDS covers part of the second and third exon
        exon_coords = [
            (120933859, 120934019),
            (120934219, 120934356),
            (120935876, 120936296),
        ]
        cds_coords = [(120934225, 120934356), (120935876, 120936013)]

        tx = Transcript("ENST00000242577", '12', 120933859, 120936296, "+")
        tx.set_exons(exon_coords, cds_coords)
        tx.set_cds(cds_coords)

        coding_sequence = (
            "ATGTGCGACCGAAAGGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTC"
            "GGTGGAGTGCGCTACTCAGGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATC"
            "AAGAAGGAATTTGACAAGAAGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTA"
            "GTTATGTGACACATGAAACCAAACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCT"
            "GTTCAAATCTGGTTAA"
        )

        # genomic sequence, added below with an offset of 10
        genomic_sequence = (
            "GGGCGGGGCCGGCGGGAGCAGGGCGGGGCCTGAGCACTAGGCGGCGGCGGCTGGCGTGGG"
            "GCTGCTTAGATGCGCCACGGTTTCGGTAGCGACGGTATCTCTAGCCGGGCCTGAGCTGTGCTAGCA"
            "CCTCCCCCAGGAGACCGTTGCAGTCGGCCAGCCCCCTTCTCCACGGTGAGAAACTCGGGGGGCCAG"
            "GGGGTGTCCTCGCTGCCTTATTTCGCCCCACTCCGGACTTAGCCCTCCGCGTAGCCCGCGCTTCCT"
            "GAGAAGTGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGGCAGTTAGTGCCTGG"
            "GGGGCGCGGCCCAACTCAACCCCTTACCCCAGGCCTTGCCCACTAGGTAACCATGTGCGACCGAAA"
            "GGCCGTGATCAAAAATGCGGACATGTCGGAAGAGATGCAACAGGACTCGGTGGAGTGCGCTACTCA"
            "GGCGCTGGAGAAATACAACATAGAGAAGGACATTGCGGCTCATATCAAGAAGGTGAGGATGGGCGC"
            "GGGGGCCGATACGCAGCCGGGAGCAGGGGGTTCCTTCCCCCCGATCCTGCTTTCCTAAGGGCGCCT"
            "GACAGGTCCCGGGAATACTGCTGGCGGCTTGGGGCGTAGAAGCTTCCAGAAAGGACGCAGATGCAT"
            "TTTGCGCTCCTGTGGAGAAGACCAGACCCCCGGCGTCCGAAGTTTTTTTTTTTTTTTTTTTAATTA"
            "CCCAGCTCCGCGGGGGGAAAGCGCCACCTAGCAACGGTATCTAAGATCAGGGAGCAGCGGTTCCCC"
            "CTTCTGTGTGGTTCCTGCGCCGAGGATCCATCTGGGTGTTCCGGAGGGGGGAGCTGCGTGGGTGTT"
            "TCCAGCCGGGCCGGGAGGAGATCTTGCCAGCCTTCCAGTGGGGAGTTGAGGGAAGGTGGTGGGTGG"
            "TGGCGGGGCTGGGGGCTGGGGTAGGGGCTTGGTAAATGGCAGTCTAGAAAGCCGGCAGGACTGCCA"
            "ACTTCTCGAGCAGTGTTTGCTGGAAGGGAAGAAAGCTGGCAGCCTAAGCCGTGGGAGGGTTCCAGT"
            "CGAGAATGGGAAGATGAAAGACTTCAGATGGAACAGAAATAAATGCCTTTTTTGACAAACGCAGCA"
            "GTGCGTGCCTCTAGCTTGCAAGAGCGTTACTCCCCTTCATAGCTTTAAAAGGTTTTCGCACTGCGT"
            "GCAGTTAGAGTAGCTAAATCTTGTGTGACGCTCCACAAACACTTGTAAGAATTTTGCAGAGAAAGA"
            "TAACCGTTGCCACCCAATGCCCCCCACAGGCATTCTACTCCCCAGTACCTCTTAGGGTGGGAGAAA"
            "TGGTGAAGAGTTGTTCCTACAACTTGCTAACCTAGTGGACAGGGTAGTAGATTAGCATCATCCGGA"
            "TAGATGTGAAGAGGACGGCTGTTTGGATAATAATTAAGGATAAAATTTGGCCAGTTGACAGATTCT"
            "GTTTCCAGCAGTTTTTACAGCAACAGTGGAGTGCTTCAGTATTGTGTTCCTGTAAATTTAATTTTG"
            "ATCCGCAATCATTTGGTATACAATGCTGTTTGAAGTTTTGTCCTATTGGAAAAGTCTTGTGTTGCA"
            "GGGGTGCAGTTAAGATCTTTGTGATGAGGAATGGGATGGGCTAATTTTTTGCCGTTTTCTTGGAAT"
            "TGGGGGCATGGCAAATACAGTAGGGTAGTTTAGTTCTCTACACAGAACATGATAAACTACACCTGT"
            "TGATGTCACCGTCTGTCAATGAATATTATAGAAGGTATGAAGGTGTAATTACCATAATAACAAAAC"
            "ACCCTGTCTTTAGGGCTGACCTTTCGTCCTTTGACCTCCTCAGCCTCCATTCCCATCTTCGCTCAG"
            "ACTGCAAGTATGTTTGTATTAATGTACTATGTAGGCGGCTTGGAGCTGGGGAACATTCTTTCATTC"
            "TAAGAATTTGCAGATGCTGACGTTCCTCCTTTCTGCCCCTACAGGCTCTGGCTTATCCAAGAGGCA"
            "AACACTGACCTCTGGTAATTAAAATCCTAGTTCTTTTCTTTTGTCTTTTCCAGGAATTTGACAAGA"
            "AGTACAATCCCACCTGGCATTGCATCGTGGGGAGGAACTTCGGTAGTTATGTGACACATGAAACCA"
            "AACACTTCATCTACTTCTACCTGGGCCAAGTGGCCATTCTTCTGTTCAAATCTGGTTAAAAGCATG"
            "GACTGTGCCACACACCCAGTGATCCATCCAAAAACAAGGACTGCAGCCTAAATTCCAAATACCAGA"
            "GACTGAAATTTTCAGCCTTGCTAAGGGAACATCTCGATGTTTGAACCTTTGTTGTGTTTTGTACAG"
            "GGCATTCTCTGTACTAGTTTGTCGTGGTTATAAAACAATTAGCAGAATAGCCTACATTTGTATTTA"
            "TTTTCTATTCCATACTTCTGCCCACGTTGTTTTCTCTCAAAATCCATTCCTTTAAAAAATAAATCT"
            "GATGCAGATGTGTATGTGTGTG"
        )

        tx.add_cds_sequence(coding_sequence)
        tx.add_genomic_sequence(genomic_sequence, offset=10)

        return tx