def test_sequence_key_with_reading_frame_insertion():
    # TP53 is on the negative strand, so inserting "T" after the second codon
    # of TP53-001 is expressed on the forward strand as 'CTC'>'CTCA'.
    # With 10 nt of context the key reaches into the 5' UTR.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the boundary of the 2nd/3rd codons:
    #   TGCC  last 4 nt of the 5' UTR
    #   ATG   start codon (translates to M)
    #   GAG   2nd codon (translates to E)
    #         <---- the insertion falls between these two codons
    #   GAG   3rd codon
    #   CCG   4th codon
    #   CAG   5th codon
    #   T     first nt of the 6th codon
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME",
    )

    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10,
    )
    eq_(result, expected)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Remove the second codon of TP53-001 while keeping enough context to
    # include nucleotides from the 5' UTR. TP53 is on the negative strand,
    # so the forward-strand form of the deletion is 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides (t.sequence[193-10:193+13]):
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of the 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon  <---- the deleted codon
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of the 6th codon
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
    )

    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10,
    )
    eq_(result, expected)
def test_reference_sequence_key_hash_and_equality_different_objects():
    # Two keys that agree on every field except the strand must be
    # distinguishable through !=, str(), repr() and hash().
    shared_fields = dict(
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    minus_strand_key = ReferenceCodingSequenceKey(strand="-", **shared_fields)
    plus_strand_key = ReferenceCodingSequenceKey(strand="+", **shared_fields)

    assert minus_strand_key != plus_strand_key
    assert str(minus_strand_key) != str(plus_strand_key)
    assert repr(minus_strand_key) != repr(plus_strand_key)
    assert hash(minus_strand_key) != hash(plus_strand_key)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is a TP53 isoform that appears to have no untranslated
    # regions, so its sequence begins directly with the coding sequence:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # The variant under test is chr17 7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check: the predicted protein effect is the E>K substitution.
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M",
    )
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=tp53_201,
        context_size=3,
    )
    eq_(expected, observed)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Same insertion of "T" after the second codon of TP53-001 (forward
    # strand: CTC>CTCA), but with only 3 nt of context on each side, so the
    # key covers the second codon and none of the start codon.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the boundary of the 2nd/3rd codons:
    #   GAG  2nd codon (translates to E)
    #        <---- the insertion falls between these two codons
    #   GAG  3rd codon
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )

    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3,
    )
    eq_(result, expected)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Same insertion of "T" after the second codon of TP53-001 (forward
    # strand: CTC>CTCA), with 5 nt of context on each side. That window
    # includes only part of the start codon, so the key should "overlap"
    # the start codon without "containing" it.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the boundary of the 2nd/3rd codons:
    #   TG   last two nt of the start codon
    #   GAG  2nd codon (translates to E)
    #        <---- the insertion falls between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of the 4th codon
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )

    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5,
    )
    eq_(result, expected)
def test_sequence_key_with_reading_frame_insertion():
    # Inserting "T" after the second codon of TP53-001. Because TP53 is on
    # the negative strand the variant is given as its reverse complement,
    # 'CTC'>'CTCA'. The 10 nt context window includes 5' UTR nucleotides.
    frameshift_variant = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Layout of the transcript around the 2nd/3rd codon boundary:
    #   last 4 nt of 5' UTR:     TGCC
    #   start codon:             ATG (translates to M)
    #   2nd codon:               GAG (translates to E)
    #   <---- insertion occurs between the 2nd and 3rd codons
    #   3rd codon:               GAG
    #   4th codon:               CCG
    #   5th codon:               CAG
    #   first nt of 6th codon:   T
    expected_key = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME",
    )

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=frameshift_variant,
        transcript=tp53_transcript,
        context_size=10,
    )
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Deleting the second codon of TP53-001 with a context window that
    # includes 5' UTR nucleotides. TP53 is on the negative strand, so the
    # variant is given as its reverse complement, 'CTC'>''.
    codon_deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # t.sequence[193-10:193+13] == 'CACTGCCATGGAGGAGCCGCAGT', i.e.
    #   last 7 nt of 5' UTR:     CACTGCC
    #   start codon:             ATG (translates to M)
    #   2nd codon:               GAG  <---- variant occurs here
    #   3rd codon:               GAG
    #   4th codon:               CCG
    #   5th codon:               CAG
    #   first nt of 6th codon:   T
    expected_key = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
    )

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=codon_deletion,
        transcript=tp53_transcript,
        context_size=10,
    )
    eq_(actual_key, expected_key)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is an isoform of TP53 that seems to lack untranslated
    # regions, so the transcript sequence is:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # We assume the variant chr17 7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    point_variant = Variant("chr17", 7676591, "C", "T", "GRCh38")
    isoform = point_variant.ensembl.transcripts_by_name("TP53-201")[0]

    # Confirm the annotated effect before checking the sequence key.
    effect = point_variant.effect_on_transcript(isoform)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    expected_key = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M",
    )
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=point_variant,
        transcript=isoform,
        context_size=3,
    )
    eq_(expected_key, actual_key)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Inserting "T" after the second codon of TP53-001 (reverse complement:
    # CTC>CTCA) with a 3 nt context on each side, which is only enough to
    # see the second codon and no nucleotides of the start codon.
    frameshift_variant = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Layout of the transcript around the 2nd/3rd codon boundary:
    #   2nd codon:  GAG (translates to E)
    #   <---- insertion occurs between the 2nd and 3rd codons
    #   3rd codon:  GAG
    expected_key = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=frameshift_variant,
        transcript=tp53_transcript,
        context_size=3,
    )
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Inserting "T" after the second codon of TP53-001 (reverse complement:
    # CTC>CTCA) with a 5 nt context on each side. Only part of the start
    # codon is visible, so the key must not "contain" the start codon but
    # must "overlap" it.
    frameshift_variant = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Layout of the transcript around the 2nd/3rd codon boundary:
    #   last two nt of start codon:   TG
    #   2nd codon:                    GAG (translates to E)
    #   <---- insertion occurs between the 2nd and 3rd codons
    #   3rd codon:                    GAG
    #   first two nt of 4th codon:    CC
    expected_key = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=frameshift_variant,
        transcript=tp53_transcript,
        context_size=5,
    )
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # Insert nucleotide "T" before the start codon of TP53-001; no
    # reference coding sequence key should be produced for it.
    insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1,
    )
    assert key is None, \
        "Expected result to be None when variant before start codon"
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # A "T" inserted before the start codon of TP53-001 is expected to
    # yield None rather than a reference coding sequence key.
    upstream_insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    outcome = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=upstream_insertion,
        transcript=tp53_transcript,
        context_size=1,
    )
    assert outcome is None, \
        "Expected result to be None when variant before start codon"
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # Insert nucleotide "C" in the middle of the start codon of TP53-001,
    # keeping only 1 nucleotide of context. On the forward strand (reverse
    # complement) the variant reads 'T'>'TG'.
    insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1,
    )
    assert key is None, \
        "Expected result to be None when variant affects start codon"
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # A "C" inserted in the middle of the TP53-001 start codon (given as
    # its reverse complement, 'T'>'TG'), with a single nucleotide of
    # context, is expected to yield None.
    start_codon_insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    outcome = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=start_codon_insertion,
        transcript=tp53_transcript,
        context_size=1,
    )
    assert outcome is None, \
        "Expected result to be None when variant affects start codon"
def test_reference_sequence_key_hash_and_equality_same_objects():
    # Two independently constructed keys with identical field values must
    # agree under ==, str(), repr() and hash().
    field_values = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    first_key = ReferenceCodingSequenceKey(**field_values)
    second_key = ReferenceCodingSequenceKey(**field_values)

    eq_(first_key, second_key)
    eq_(str(first_key), str(second_key))
    eq_(repr(first_key), repr(second_key))
    eq_(hash(first_key), hash(second_key))
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        n_bad_nucleotides_at_start=0,
        mismatches=0,
        reference_context_size=3):
    """
    Build a VariantSequence, a ReferenceContext and the expected
    VariantSequenceInReadingFrame for a substitution in TP53-201.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we pretend were
        detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides expected to be trimmed from the beginning of
        the variant sequence while matching to a reference context.

    mismatches : int
        Expected number of nucleotide mismatches in the result.

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (variant_sequence, reference_context, expected).
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so the sequence is:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # The assumed variant is chr17 7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check on the annotated protein effect.
    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"
    cdna_suffix = "AGGAGCCGCAGTCAGAT"

    # The transcript is on the negative strand, so the genomic sequence is
    # the reverse complement of the cDNA: prefix and suffix trade places.
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # The variant sequence is supported by two reads: one spanning it
    # fully and another missing the last nucleotide.
    full_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")
    partial_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")

    # Only the prefix and allele are checked against the expected TP53-201
    # sequence; the suffix varies with the cdna_prefix argument.
    for read in (full_read, partial_read):
        eq_(read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[full_read, partial_read])
    assert isinstance(variant_sequence, VariantSequence)

    n_prefix_kept = len(cdna_prefix) - n_bad_nucleotides_at_start

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=reference_context_size)
    assert isinstance(key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    trimmed_cdna_prefix = cdna_prefix[-n_prefix_kept:]
    expected = VariantSequenceInReadingFrame(
        cdna_sequence=trimmed_cdna_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=n_prefix_kept % 3,
        variant_cdna_interval_start=n_prefix_kept,
        variant_cdna_interval_end=n_prefix_kept + 1,
        reference_cdna_sequence_before_variant="ATG"[-n_prefix_kept:],
        number_mismatches=mismatches)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        cdna_suffix="AGGAGCCGCAGTCAGAT",
        n_bad_nucleotides_at_start=0,
        mismatches_before_variant=0,
        mismatches_after_variant=14,  # the read is that much longer than the reference (17 vs 3)
        reference_context_size=3):
    """
    Build a VariantSequence, a ReferenceContext and the expected
    VariantSequenceInReadingFrame for a substitution in TP53-201.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we pretend were
        detected from RNA-seq reads.

    cdna_suffix : str
        Transcript nucleotides after the variant that we pretend were
        detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides expected to be trimmed from the beginning of
        the variant sequence while matching to a reference context.

    mismatches_before_variant : int
        Expected number of nucleotide mismatches in the result before the
        variant locus.

    mismatches_after_variant : int
        Expected number of nucleotide mismatches in the result after the
        variant locus.

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (variant_sequence, reference_context, expected).
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so the sequence is:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # The assumed variant is chr17 7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check on the annotated protein effect.
    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"

    # The transcript is on the negative strand, so the genomic sequence is
    # the reverse complement of the cDNA: prefix and suffix trade places.
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # The hard-coded genomic prefix below only applies when the caller
    # kept this particular cDNA suffix.
    check_read_prefix = cdna_suffix == "AGGAGCCGCAGTCAGAT"

    # The variant sequence is supported by two reads: one spanning it
    # fully and another missing the last nucleotide.
    full_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")
    partial_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")

    # The read suffix varies with cdna_prefix, so only the prefix and the
    # allele are checked.
    for read in (full_read, partial_read):
        if check_read_prefix:
            eq_(read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[full_read, partial_read])
    assert isinstance(variant_sequence, VariantSequence)

    n_prefix_kept = len(cdna_prefix) - n_bad_nucleotides_at_start

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=reference_context_size)
    assert isinstance(key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    trimmed_cdna_prefix = cdna_prefix[-n_prefix_kept:]
    reference_after_variant = "AGGAGCCGCAGTCAGAT"[:reference_context_size]
    expected = VariantSequenceInReadingFrame(
        cdna_sequence=trimmed_cdna_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=n_prefix_kept % 3,
        variant_cdna_interval_start=n_prefix_kept,
        variant_cdna_interval_end=n_prefix_kept + 1,
        reference_cdna_sequence_before_variant="ATG"[-n_prefix_kept:],
        reference_cdna_sequence_after_variant=reference_after_variant,
        number_mismatches_before_variant=mismatches_before_variant,
        number_mismatches_after_variant=mismatches_after_variant)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected