def test_mitochondrial_MTND5_translation_from_cdna():
    # Translating the coding sequence of MT-ND5-201 with mitochondrial=True
    # should end in a stop codon and reproduce the transcript's annotated
    # protein sequence.
    transcript = ensembl_grch38.transcripts_by_name("MT-ND5-201")[0]
    translation = translate_cdna(
        transcript.coding_sequence,
        first_codon_is_start=True,
        mitochondrial=True)
    protein, has_stop_codon = translation
    assert has_stop_codon
    eq_(protein, transcript.protein_sequence)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Deletes the second codon of TP53-001 while keeping enough context
    # (10 nt) to reach into the 5' UTR. TP53 is on the negative strand, so
    # the variant is written on the genome as the reverse complement of the
    # deleted codon: 'CTC'>''
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides, i.e. t.sequence[193-10:193+13]:
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon  <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of 6th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand():
    # Inserts 'CCC' right after the start codon of TP53-001. On the reverse
    # complement that means inserting "GGG" between "CTC_CAT".
    insertion = Variant("17", 7676589, "CTC", "CTCGGG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sanity check the transcript sequence around the start codon with 10
    # context nucleotides: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    start_codon_offset = 190
    flanking_sequence = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(flanking_sequence, "GGTCACTGCCATGGAGGAGCCGC")

    # The above is the cDNA sequence from the transcript, whereas the
    # reverse complement genomic sequence is:
    #   GCGGCTCCTC_CAT_GGCAGTGACC
    # Request 10 nucleotides of context on each side of the insertion point.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGGAGCCGC")
    eq_(observed_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see the second codon (and no nucleotides from the
    # start codon). In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 3 context nucleotides on each side:
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion():
    # Insert nucleotide "T" after the second codon of TP53-001 with enough
    # context (10 nt) to include nucleotides from the 5' UTR. TP53 is on the
    # negative strand, so the variant is given as its reverse complement:
    # 'CTC'>'CTCA'
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 10 context nucleotides:
    #   TGCC  last 4 nt of 5' UTR
    #   ATG   start codon (translates to M)
    #   GAG   2nd codon (translates to E)
    #         <---- insertion variant occurs between these two codons
    #   GAG   3rd codon
    #   CCG   4th codon
    #   CAG   5th codon
    #   T     first nt of 6th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Deletes the second codon of TP53-001 while keeping enough context
    # (10 nt) to reach into the 5' UTR. TP53 is on the negative strand, so
    # the variant is written on the genome as the reverse complement of the
    # deleted codon: 'CTC'>''
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides, i.e. t.sequence[193-10:193+13]:
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon  <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of 6th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see part of the start codon. The result therefore
    # shouldn't "contain" the start codon but does "overlap" it. In the
    # reverse complement this variant becomes CTC>CTCA
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 5 context nucleotides on each side:
    #   TG   last two nt of start codon
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of 4th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see part of the start codon. The result therefore
    # shouldn't "contain" the start codon but does "overlap" it. In the
    # reverse complement this variant becomes CTC>CTCA
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 5 context nucleotides on each side:
    #   TG   last two nt of start codon
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of 4th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion():
    # Insert nucleotide "T" after the second codon of TP53-001 with enough
    # context (10 nt) to include nucleotides from the 5' UTR. TP53 is on the
    # negative strand, so the variant is given as its reverse complement:
    # 'CTC'>'CTCA'
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 10 context nucleotides:
    #   TGCC  last 4 nt of 5' UTR
    #   ATG   start codon (translates to M)
    #   GAG   2nd codon (translates to E)
    #         <---- insertion variant occurs between these two codons
    #   GAG   3rd codon
    #   CCG   4th codon
    #   CAG   5th codon
    #   T     first nt of 6th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see the second codon (and no nucleotides from the
    # start codon). In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 3 context nucleotides on each side:
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3)
    eq_(observed_key, ReferenceCodingSequenceKey(**expected_fields))
def test_TP53_translation_from_cdna():
    # Translating the coding sequence of TP53-001 should end in a stop codon
    # and reproduce the transcript's annotated protein sequence.
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    translation = translate_cdna(
        transcript.coding_sequence,
        first_codon_is_start=True)
    protein, has_stop_codon = translation
    assert has_stop_codon
    eq_(protein, transcript.protein_sequence)
def test_interbase_range_for_brca2_utr_insertion():
    # T>TC insertion after the 6th nucleotide of BRCA2-001's 5' UTR
    insertion = Variant("13", 32315479, "T", "TC", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]
    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=insertion,
        transcript=transcript)
    print(affected_range)
    # an insertion is expected to map to the empty interbase range (6, 6)
    expected_range = (6, 6)
    eq_(affected_range, expected_range)
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # Insert nucleotide "T" before the start codon of TP53-001; no coding
    # sequence key is expected for such a variant.
    insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)
    assert observed_key is None, \
        "Expected result to be None when variant before start codon"
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # Insert nucleotide "T" before the start codon of TP53-001; no coding
    # sequence key is expected for such a variant.
    insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)
    assert observed_key is None, \
        "Expected result to be None when variant before start codon"
def test_interbase_range_for_brca2_utr_substitution():
    # rs769125639 is a simple T>A substitution in the 6th nucleotide of
    # BRCA2-001's 5' UTR
    substitution = Variant("13", 32315479, "T", "A", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]
    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=substitution,
        transcript=transcript)
    print(affected_range)
    # the single substituted base is expected at interbase range (5, 6)
    expected_range = (5, 6)
    eq_(affected_range, expected_range)
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # Insert nucleotide "C" in the middle of the start codon of TP53-001,
    # keeping only 1 nucleotide of context. In the reverse complement this
    # becomes 'T'>'TG'. No coding sequence key is expected for such a variant.
    insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)
    assert observed_key is None, \
        "Expected result to be None when variant affects start codon"
def test_interbase_range_for_brca2_utr_substitution():
    # rs769125639 is a simple T>A substitution in the 6th nucleotide of
    # BRCA2-001's 5' UTR
    substitution = Variant("13", 32315479, "T", "A", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]
    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=substitution,
        transcript=transcript)
    print(affected_range)
    # the single substituted base is expected at interbase range (5, 6)
    expected_range = (5, 6)
    eq_(affected_range, expected_range)
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # Insert nucleotide "C" in the middle of the start codon of TP53-001,
    # keeping only 1 nucleotide of context. In the reverse complement this
    # becomes 'T'>'TG'. No coding sequence key is expected for such a variant.
    insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)
    assert observed_key is None, \
        "Expected result to be None when variant affects start codon"
def test_sequence_key_for_variant_on_transcript_deletion():
    # Delete the 6th nucleotide of BRCA2-001's 5' UTR
    deletion = Variant("13", 32315479, "T", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    # Sanity check the first 50 characters of BRCA2-001
    expected_prefix = "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG"
    transcript_prefix = transcript.sequence[:50]
    eq_(transcript_prefix, expected_prefix)
    print(transcript_prefix)

    # With context_size=10 we expect only the 5 nucleotides that exist
    # before the variant, and 10 nucleotides after it.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="+",
        sequence_before_variant_locus=transcript_prefix[:5],
        sequence_at_variant_locus="T",
        sequence_after_variant_locus=transcript_prefix[6:16])
    eq_(observed_key, expected_key)
def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand():
    # Delete the start codon of TP53-001, which in reverse complement means
    # deleting the sequence "CAT"
    deletion = Variant("17", 7676592, "CAT", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sanity check the transcript sequence around the start codon with 10
    # context nucleotides: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    start_codon_offset = 190
    flanking_sequence = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(flanking_sequence, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the deleted codon.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="GGTCACTGCC",
        sequence_at_variant_locus="ATG",
        sequence_after_variant_locus="GAGGAGCCGC")
    eq_(observed_key, expected_key)
def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand():
    # Replace the start codon of TP53-001 with 'CCC'; since this is on the
    # reverse strand the variant becomes "CAT">"GGG"
    substitution = Variant("17", 7676592, "CAT", "GGG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sanity check the transcript sequence around the start codon with 10
    # context nucleotides: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    start_codon_offset = 190
    flanking_sequence = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(flanking_sequence, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the replaced codon.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=substitution,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="GGTCACTGCC",
        sequence_at_variant_locus="ATG",
        sequence_after_variant_locus="GAGGAGCCGC")
    eq_(observed_key, expected_key)
def test_sequence_key_for_variant_on_transcript_deletion():
    # Delete the 6th nucleotide of BRCA2-001's 5' UTR
    deletion = Variant("13", 32315479, "T", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    # Sanity check the first 50 characters of BRCA2-001
    expected_prefix = "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG"
    transcript_prefix = transcript.sequence[:50]
    eq_(transcript_prefix, expected_prefix)
    print(transcript_prefix)

    # With context_size=10 we expect only the 5 nucleotides that exist
    # before the variant, and 10 nucleotides after it.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="+",
        sequence_before_variant_locus=transcript_prefix[:5],
        sequence_at_variant_locus="T",
        sequence_after_variant_locus=transcript_prefix[6:16])
    eq_(observed_key, expected_key)
def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand():
    # Replace the start codon of TP53-001 with 'CCC'; since this is on the
    # reverse strand the variant becomes "CAT">"GGG"
    substitution = Variant("17", 7676592, "CAT", "GGG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sanity check the transcript sequence around the start codon with 10
    # context nucleotides: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    start_codon_offset = 190
    flanking_sequence = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(flanking_sequence, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the replaced codon.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=substitution,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="GGTCACTGCC",
        sequence_at_variant_locus="ATG",
        sequence_after_variant_locus="GAGGAGCCGC")
    eq_(observed_key, expected_key)
def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand():
    # Delete the start codon of TP53-001, which in reverse complement means
    # deleting the sequence "CAT"
    deletion = Variant("17", 7676592, "CAT", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sanity check the transcript sequence around the start codon with 10
    # context nucleotides: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    start_codon_offset = 190
    flanking_sequence = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(flanking_sequence, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the deleted codon.
    observed_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    expected_key = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="GGTCACTGCC",
        sequence_at_variant_locus="ATG",
        sequence_after_variant_locus="GAGGAGCCGC")
    eq_(observed_key, expected_key)
def test_TP53_translation_from_cdna():
    # Translating the coding sequence of TP53-001 should end in a stop codon
    # and reproduce the transcript's annotated protein sequence.
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    translation = translate_cdna(
        transcript.coding_sequence,
        first_codon_is_start=True)
    protein, has_stop_codon = translation
    assert has_stop_codon
    eq_(protein, transcript.protein_sequence)
def test_sequence_key_with_reading_frame_substitution_on_negative_strand():
    # Replace the second codon of TP53-001 with 'CCC' (given to Variant as
    # 'CTC'>'GGG').
    substitution = Variant("17", 7676589, "CTC", "GGG", ensembl_grch38)
    variants = VariantCollection([substitution])
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides, i.e. t.sequence[193-10:193+13]:
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon  <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of 6th codon

    # First call without a transcript ID whitelist to see if we get back
    # multiple contexts.
    contexts_any_transcript = reference_contexts_for_variants(
        variants=variants,
        context_size=10,
        transcript_id_whitelist=None)
    n_variant_keys = len(contexts_any_transcript)
    assert n_variant_keys == 1, \
        "Dictionary should have only one variant but got %d keys" % (
            n_variant_keys,)
    contexts_for_variant = contexts_any_transcript[substitution]
    assert len(contexts_for_variant) > 1, \
        "Expected multiple reference contexts for %s but got %d: %s" % (
            substitution, len(contexts_for_variant), contexts_for_variant)

    # Now restrict the whitelist to the single TP53-001 transcript.
    contexts_one_transcript = reference_contexts_for_variants(
        variants=variants,
        context_size=10,
        transcript_id_whitelist={transcript.id})
    # still only expect one variant key
    eq_(len(contexts_one_transcript), 1)
    # since we limited the transcript ID whitelist, we only expect a single
    # reference context in the result
    whitelisted_contexts = contexts_one_transcript[substitution]
    eq_(len(whitelisted_contexts), 1)

    expected_context = ReferenceContext(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
        variant=substitution,
        transcripts=[transcript])
    eq_(whitelisted_contexts[0], expected_context)
def test_sequence_key_with_reading_frame_substitution_on_negative_strand():
    # Replace the second codon of TP53-001 with 'CCC' (given to Variant as
    # 'CTC'>'GGG').
    substitution = Variant("17", 7676589, "CTC", "GGG", ensembl_grch38)
    variants = VariantCollection([substitution])
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides, i.e. t.sequence[193-10:193+13]:
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon  <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of 6th codon

    # First call without a transcript ID whitelist to see if we get back
    # multiple contexts.
    contexts_any_transcript = reference_contexts_for_variants(
        variants=variants,
        context_size=10,
        transcript_id_whitelist=None)
    n_variant_keys = len(contexts_any_transcript)
    assert n_variant_keys == 1, \
        "Dictionary should have only one variant but got %d keys" % (
            n_variant_keys,)
    contexts_for_variant = contexts_any_transcript[substitution]
    assert len(contexts_for_variant) > 1, \
        "Expected multiple reference contexts for %s but got %d: %s" % (
            substitution, len(contexts_for_variant), contexts_for_variant)

    # Now restrict the whitelist to the single TP53-001 transcript.
    contexts_one_transcript = reference_contexts_for_variants(
        variants=variants,
        context_size=10,
        transcript_id_whitelist={transcript.id})
    # still only expect one variant key
    eq_(len(contexts_one_transcript), 1)
    # since we limited the transcript ID whitelist, we only expect a single
    # reference context in the result
    whitelisted_contexts = contexts_one_transcript[substitution]
    eq_(len(whitelisted_contexts), 1)

    expected_context = ReferenceContext(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
        variant=substitution,
        transcripts=[transcript])
    eq_(whitelisted_contexts[0], expected_context)
def test_protein_protein_sequence():
    # The protein sequence reported for EGFR-001 should equal the expected
    # EGFR_001_protein_sequence value.
    egfr_001 = ensembl_grch38.transcripts_by_name("EGFR-001")[0]
    observed_sequence = egfr_001.protein_sequence
    eq_(observed_sequence, EGFR_001_protein_sequence)
def test_protein_id():
    # EGFR-001 should report the Ensembl protein ID ENSP00000275493
    egfr_001 = ensembl_grch38.transcripts_by_name("EGFR-001")[0]
    observed_protein_id = egfr_001.protein_id
    eq_(observed_protein_id, "ENSP00000275493")