def test_sequence_key_with_reading_frame_insertion():
    # On the transcript this variant inserts a "T" right after the second
    # codon of TP53-001, and the requested context reaches into the 5' UTR.
    # TP53 is on the negative strand, so the variant is written as the
    # reverse complement of the transcript change: 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # With 10 nucleotides of context on each side, the TP53 sequence around
    # the boundary of the 2nd/3rd codons splits into:
    #   before the insertion point
    #       TGCC  last 4 nt of the 5' UTR
    #       ATG   start codon (translates to M)
    #       GAG   2nd codon (translates to E)
    #   after the insertion point
    #       GAG   3rd codon
    #       CCG   4th codon
    #       CAG   5th codon
    #       T     first nt of the 6th codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # This variant removes the whole second codon of TP53-001 and the
    # requested context is wide enough to include 5' UTR nucleotides.
    # TP53 is on the negative strand, so the deletion is written as the
    # reverse complement of the transcript codon: 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides, i.e. t.sequence[193-10:193+13]:
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of the 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon    <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of the 6th codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_reference_sequence_key_hash_and_equality_different_objects():
    # Every field except the strand is identical between the two keys
    shared_fields = dict(
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    minus_key = ReferenceCodingSequenceKey(strand="-", **shared_fields)
    plus_key = ReferenceCodingSequenceKey(strand="+", **shared_fields)

    # A difference in strand alone has to show up in the comparison,
    # in both string forms and in the hash
    assert minus_key != plus_key
    for view in (str, repr, hash):
        assert view(minus_key) != view(plus_key)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so its first exon (chr17 7,676,594 - 7,676,521) begins
    # directly with the coding sequence:
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The variant under test is chr17 7,676,591 C>T, which changes the
    # codon GAG (E) into AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check: the predicted effect should be an E>K substitution
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=tp53_201,
        context_size=3)
    eq_(expected_key, actual_key)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Same insertion of a "T" after the second codon of TP53-001 as in the
    # wider-context tests, but here only enough context is kept to see the
    # second codon, so no nucleotide of the start codon is included.
    # In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # With 3 context nucleotides on each side (6 in total), the sequence
    # around the boundary of the 2nd/3rd codons is:
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Same insertion of a "T" after the second codon of TP53-001, but with
    # just enough context to reach part of the start codon. The resulting
    # key therefore should not "contain" the start codon, yet it does
    # "overlap" it. In the reverse complement this variant becomes
    # CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # With 5 context nucleotides on each side, the sequence around the
    # boundary of the 2nd/3rd codons is:
    #   TG   last two nt of the start codon
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of the 4th codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_insertion():
    # On the transcript this variant inserts a "T" right after the second
    # codon of TP53-001, and the requested context reaches into the 5' UTR.
    # TP53 is on the negative strand, so the variant is written as the
    # reverse complement of the transcript change: 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # With 10 nucleotides of context on each side, the TP53 sequence around
    # the boundary of the 2nd/3rd codons splits into:
    #   before the insertion point
    #       TGCC  last 4 nt of the 5' UTR
    #       ATG   start codon (translates to M)
    #       GAG   2nd codon (translates to E)
    #   after the insertion point
    #       GAG   3rd codon
    #       CCG   4th codon
    #       CAG   5th codon
    #       T     first nt of the 6th codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # This variant removes the whole second codon of TP53-001 and the
    # requested context is wide enough to include 5' UTR nucleotides.
    # TP53 is on the negative strand, so the deletion is written as the
    # reverse complement of the transcript codon: 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides, i.e. t.sequence[193-10:193+13]:
    #   'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   CACTGCC  last 7 nt of the 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon    <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of the 6th codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so its first exon (chr17 7,676,594 - 7,676,521) begins
    # directly with the coding sequence:
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The variant under test is chr17 7,676,591 C>T, which changes the
    # codon GAG (E) into AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check: the predicted effect should be an E>K substitution
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)

    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=tp53_201,
        context_size=3)
    eq_(expected_key, actual_key)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Same insertion of a "T" after the second codon of TP53-001 as in the
    # wider-context tests, but here only enough context is kept to see the
    # second codon, so no nucleotide of the start codon is included.
    # In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # With 3 context nucleotides on each side (6 in total), the sequence
    # around the boundary of the 2nd/3rd codons is:
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(
):
    # Same insertion of a "T" after the second codon of TP53-001, but with
    # just enough context to reach part of the start codon. The resulting
    # key therefore should not "contain" the start codon, yet it does
    # "overlap" it. In the reverse complement this variant becomes
    # CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # With 5 context nucleotides on each side, the sequence around the
    # boundary of the 2nd/3rd codons is:
    #   TG   last two nt of the start codon
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of the 4th codon
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(actual_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # Single-nucleotide insertion placed before the start codon of
    # TP53-001; no reference coding sequence key should be produced for it.
    insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)

    failure_message = "Expected result to be None when variant before start codon"
    assert key is None, failure_message
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # Single-nucleotide insertion placed before the start codon of
    # TP53-001; no reference coding sequence key should be produced for it.
    insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)

    failure_message = "Expected result to be None when variant before start codon"
    assert key is None, failure_message
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # Insertion of a "C" in the middle of the start codon of TP53-001 with
    # a single nucleotide of context. In the reverse complement this is
    # written as 'T'>'TG'. A variant which affects the start codon should
    # not yield a reference coding sequence key.
    insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)

    failure_message = "Expected result to be None when variant affects start codon"
    assert key is None, failure_message
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # Insertion of a "C" in the middle of the start codon of TP53-001 with
    # a single nucleotide of context. In the reverse complement this is
    # written as 'T'>'TG'. A variant which affects the start codon should
    # not yield a reference coding sequence key.
    insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)

    failure_message = "Expected result to be None when variant affects start codon"
    assert key is None, failure_message
def test_reference_sequence_key_hash_and_equality_same_objects():
    # Two distinct instances built from exactly the same field values
    fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    first_key = ReferenceCodingSequenceKey(**fields)
    second_key = ReferenceCodingSequenceKey(**fields)

    # They have to compare equal directly, through both of their string
    # forms and through their hashes
    eq_(first_key, second_key)
    for view in (str, repr, hash):
        eq_(view(first_key), view(second_key))
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        n_bad_nucleotides_at_start=0,
        mismatches=0,
        reference_context_size=3):
    """
    Assemble a variant sequence, a reference context and an expected
    in-frame variant sequence for a substitution on the TP53-201
    transcript.

    Parameters
    ----------
    cdna_prefix : str
        Nucleotides preceding the variant on the transcript, standing in
        for sequence that was detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        How many nucleotides are expected to be trimmed from the start of
        the variant sequence while matching it to a reference context.

    mismatches : int
        Number of nucleotide mismatches the expected result should report.

    reference_context_size : int
        How many nucleotides before the variant locus are matched against
        a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext,
    VariantSequenceInReadingFrame); the last element is the expected
    value constructed by this helper.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so its first exon (chr17 7,676,594 - 7,676,521) begins
    # directly with the coding sequence:
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The assumed variant is chr17 7,676,591 C>T, which changes the
    # codon GAG (E) into AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check: the predicted effect should be an E>K substitution
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    alt_on_transcript = "A"
    suffix_on_transcript = "AGGAGCCGCAGTCAGAT"

    # TP53 is on the negative strand, so the genomic DNA is the reverse
    # complement of the cDNA: what follows the variant on the transcript
    # precedes it on the genome and vice versa.
    genomic_prefix = reverse_complement_dna(suffix_on_transcript)
    genomic_alt = reverse_complement_dna(alt_on_transcript)
    genomic_suffix = reverse_complement_dna(cdna_prefix)

    # The variant sequence is supported by two reads: one spanning it
    # fully and one which is missing the last nucleotide.
    read_specs = (
        ("full-overlap", genomic_suffix),
        ("partial-overlap", genomic_suffix[:-1]),
    )
    supporting_reads = []
    for read_name, read_suffix in read_specs:
        read = AlleleRead(
            prefix=genomic_prefix,
            allele=genomic_alt,
            suffix=read_suffix,
            name=read_name)
        # Only prefix and allele are checked against the expected TP53-201
        # sequence; the suffix depends on what was passed as cdna_prefix.
        eq_(read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(read.allele, "T")
        supporting_reads.append(read)

    variant_sequence = VariantSequence(
        prefix=genomic_prefix,
        alt=genomic_alt,
        suffix=genomic_suffix,
        reads=supporting_reads)
    assert isinstance(variant_sequence, VariantSequence)

    # Number of prefix nucleotides expected to remain after trimming
    n_kept = len(cdna_prefix) - n_bad_nucleotides_at_start

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=tp53_201,
        context_size=reference_context_size)
    assert isinstance(key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=key,
        variant=snv,
        transcripts=[tp53_201])
    assert isinstance(reference_context, ReferenceContext)

    trimmed_cdna = cdna_prefix[-n_kept:] + alt_on_transcript
    expected = VariantSequenceInReadingFrame(
        cdna_sequence=trimmed_cdna + suffix_on_transcript,
        offset_to_first_complete_codon=n_kept % 3,
        variant_cdna_interval_start=n_kept,
        variant_cdna_interval_end=n_kept + 1,
        reference_cdna_sequence_before_variant="ATG"[-n_kept:],
        number_mismatches=mismatches)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
# Example #18
# 0
def make_inputs_for_tp53_201_variant(
    cdna_prefix="ATG",
    cdna_suffix="AGGAGCCGCAGTCAGAT",
    n_bad_nucleotides_at_start=0,
    mismatches_before_variant=0,
    mismatches_after_variant=14,  # the read is that much longer than the reference (17 vs 3)
    reference_context_size=3):
    """
    Assemble a variant sequence, a reference context and an expected
    in-frame variant sequence for a substitution on the TP53-201
    transcript.

    Parameters
    ----------
    cdna_prefix : str
        Nucleotides preceding the variant on the transcript, standing in
        for sequence that was detected from RNA-seq reads.

    cdna_suffix : str
        Nucleotides following the variant on the transcript, standing in
        for sequence that was detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        How many nucleotides are expected to be trimmed from the start of
        the variant sequence while matching it to a reference context.

    mismatches_before_variant : int
        Number of nucleotide mismatches before the variant locus which
        the expected result should report.

    mismatches_after_variant : int
        Number of nucleotide mismatches after the variant locus which
        the expected result should report.

    reference_context_size : int
        How many nucleotides before the variant locus are matched against
        a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext,
    VariantSequenceInReadingFrame); the last element is the expected
    value constructed by this helper.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so its first exon (chr17 7,676,594 - 7,676,521) begins
    # directly with the coding sequence:
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The assumed variant is chr17 7,676,591 C>T, which changes the
    # codon GAG (E) into AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check: the predicted effect should be an E>K substitution
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    alt_on_transcript = "A"

    # TP53 is on the negative strand, so the genomic DNA is the reverse
    # complement of the cDNA: what follows the variant on the transcript
    # precedes it on the genome and vice versa.
    genomic_prefix = reverse_complement_dna(cdna_suffix)
    genomic_alt = reverse_complement_dna(alt_on_transcript)
    genomic_suffix = reverse_complement_dna(cdna_prefix)

    # The read prefix only has a known expected value when the caller
    # kept the default transcript suffix.
    uses_default_suffix = cdna_suffix == "AGGAGCCGCAGTCAGAT"

    # The variant sequence is supported by two reads: one spanning it
    # fully and one which is missing the last nucleotide.
    read_specs = (
        ("full-overlap", genomic_suffix),
        ("partial-overlap", genomic_suffix[:-1]),
    )
    supporting_reads = []
    for read_name, read_suffix in read_specs:
        read = AlleleRead(
            prefix=genomic_prefix,
            allele=genomic_alt,
            suffix=read_suffix,
            name=read_name)
        # Only prefix and allele are checked against the expected TP53-201
        # sequence; the suffix depends on what was passed as cdna_prefix.
        if uses_default_suffix:
            eq_(read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(read.allele, "T")
        supporting_reads.append(read)

    variant_sequence = VariantSequence(
        prefix=genomic_prefix,
        alt=genomic_alt,
        suffix=genomic_suffix,
        reads=supporting_reads)
    assert isinstance(variant_sequence, VariantSequence)

    # Number of prefix nucleotides expected to remain after trimming
    n_kept = len(cdna_prefix) - n_bad_nucleotides_at_start

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=tp53_201,
        context_size=reference_context_size)
    assert isinstance(key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=key,
        variant=snv,
        transcripts=[tp53_201])
    assert isinstance(reference_context, ReferenceContext)

    trimmed_cdna = cdna_prefix[-n_kept:] + alt_on_transcript
    reference_after = "AGGAGCCGCAGTCAGAT"[:reference_context_size]
    expected = VariantSequenceInReadingFrame(
        cdna_sequence=trimmed_cdna + cdna_suffix,
        offset_to_first_complete_codon=n_kept % 3,
        variant_cdna_interval_start=n_kept,
        variant_cdna_interval_end=n_kept + 1,
        reference_cdna_sequence_before_variant="ATG"[-n_kept:],
        reference_cdna_sequence_after_variant=reference_after,
        number_mismatches_before_variant=mismatches_before_variant,
        number_mismatches_after_variant=mismatches_after_variant)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected