def test_reference_coding_sequence_key_around_TP53_201_variant():
    """
    Build a ReferenceCodingSequenceKey around a substitution in TP53-201
    and compare it with a hand-written expectation.

    TP53-201 is an isoform of TP53 which seems to lack untranslated
    regions, so its coding sequence starts at the first exon
    (chr17 7,676,594 - 7,676,521):

        ATG|GAG|GAG|CCG|CAG|TCA|GAT...
        -M-|-E-|-E-|-P-|-Q-|-S-|-D-

    The variant under test is chr17 7,676,591 C>T, which changes
    GAG (E) > AAG (K).
    """
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = variant.ensembl.transcripts_by_name("TP53-201")[0]

    # sanity check: the variant must be the E>K substitution described above
    predicted_effect = variant.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    # with a context size of 3 the key covers the start codon, the
    # reference nucleotide at the variant locus and the 3 nucleotides
    # following it
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=tp53_201,
        context_size=3)
    eq_(expected_key, actual_key)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    """
    The reference coding sequence key for a TP53-201 substitution should
    match the manually derived key for a 3 nucleotide context.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # we're assuming a variant
    # chr17 7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    matching_transcripts = variant.ensembl.transcripts_by_name("TP53-201")
    transcript = matching_transcripts[0]

    # confirm that the annotation agrees with the codon table above
    # before checking anything about the sequence key
    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    for attribute_name, amino_acid in (("aa_ref", "E"), ("aa_alt", "K")):
        eq_(getattr(effect, attribute_name), amino_acid)

    key_from_hand = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M",
    )
    key_from_transcript = \
        ReferenceCodingSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=3,
        )
    eq_(key_from_hand, key_from_transcript)
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        cdna_suffix="AGGAGCCGCAGTCAGAT",
        n_bad_nucleotides_at_start=0,
        mismatches_before_variant=0,
        mismatches_after_variant=14,  # the read is that much longer than the reference (17 vs 3)
        reference_context_size=3):
    """
    Create the inputs and the expected result for tests which match a
    variant sequence against the reference context of a TP53-201 variant.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    cdna_suffix : str
        Transcript nucleotides after the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference
        context.

    mismatches_before_variant : int
        Expected number of nucleotide mismatches in the result before
        the variant locus.

    mismatches_after_variant : int
        Expected number of nucleotide mismatches in the result after
        the variant locus.

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext,
    VariantSequenceInReadingFrame) where the last element is the
    expected result of matching the first two.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # we're assuming a variant
    # chr17 7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    effect = variant.effect_on_transcript(transcript)

    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"

    # genomic DNA is the reverse complement of the cDNA
    # for TP53-201 since it's on the negative strand
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # variant sequence supported by two reads
    # one fully spanning the variant sequence
    # and another missing the last nucleotide
    fully_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")

    # testing the prefix and allele to make sure they have the expected
    # TP53-201 sequence but the suffix might change depending on what's
    # passed in as cdna_prefix
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(fully_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(fully_overlapping_read.allele, "T")

    partially_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(partially_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(partially_overlapping_read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[fully_overlapping_read, partially_overlapping_read])
    assert isinstance(variant_sequence, VariantSequence)

    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start

    def last_nucleotides(sequence):
        # sequence[-0:] is the *whole* sequence rather than an empty
        # string, so the case where every prefix nucleotide gets trimmed
        # has to be handled explicitly
        return sequence[-prefix_length:] if prefix_length else ""

    reference_coding_sequence_key = \
        ReferenceCodingSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=reference_context_size)
    assert isinstance(
        reference_coding_sequence_key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=reference_coding_sequence_key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    expected = VariantSequenceInReadingFrame(
        cdna_sequence=last_nucleotides(cdna_prefix) + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=last_nucleotides("ATG"),
        reference_cdna_sequence_after_variant=(
            "AGGAGCCGCAGTCAGAT"[:reference_context_size]),
        number_mismatches_before_variant=mismatches_before_variant,
        number_mismatches_after_variant=mismatches_after_variant)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        n_bad_nucleotides_at_start=0,
        mismatches=0,
        reference_context_size=3):
    """
    Create the inputs and the expected result for tests which match a
    variant sequence against the reference context of a TP53-201 variant.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference
        context.

    mismatches : int
        Expected number of nucleotide mismatches in the result

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext,
    VariantSequenceInReadingFrame) where the last element is the
    expected result of matching the first two.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # we're assuming a variant
    # chr17 7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    effect = variant.effect_on_transcript(transcript)

    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"
    cdna_suffix = "AGGAGCCGCAGTCAGAT"

    # genomic DNA is the reverse complement of the cDNA
    # for TP53-201 since it's on the negative strand
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # variant sequence supported by two reads
    # one fully spanning the variant sequence
    # and another missing the last nucleotide
    fully_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")

    # testing the prefix and allele to make sure they have the expected
    # TP53-201 sequence but the suffix might change depending on what's
    # passed in as cdna_prefix
    eq_(fully_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(fully_overlapping_read.allele, "T")

    partially_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")
    eq_(partially_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(partially_overlapping_read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[fully_overlapping_read, partially_overlapping_read])
    assert isinstance(variant_sequence, VariantSequence)

    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start

    # sequence[-0:] is the *whole* sequence rather than an empty string,
    # so the case where every prefix nucleotide gets trimmed has to be
    # handled explicitly
    if prefix_length:
        expected_cdna_prefix = cdna_prefix[-prefix_length:]
        expected_reference_prefix = "ATG"[-prefix_length:]
    else:
        expected_cdna_prefix = ""
        expected_reference_prefix = ""

    reference_coding_sequence_key = \
        ReferenceCodingSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=reference_context_size)
    assert isinstance(
        reference_coding_sequence_key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=reference_coding_sequence_key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    expected = VariantSequenceInReadingFrame(
        cdna_sequence=expected_cdna_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=expected_reference_prefix,
        number_mismatches=mismatches)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected