예제 #1
0
def test_serialization():
    """Round-trip individual Variant objects through pickle and JSON."""
    test_cases = [
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=ensembl77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    compared_fields = ("contig", "ref", "alt", "start", "end")
    for original in test_cases:
        # Requesting effects makes the variant's ensembl object open a SQL
        # connection, after which that object can no longer be serialized.
        # Doing it first verifies that serialization never tries to dump the
        # ensembl object directly.
        original.effects()

        # Pickle round trip: the copy must compare equal as a whole and
        # field by field.
        from_pickle = pickle.loads(pickle.dumps(original))
        assert original == from_pickle
        for field in compared_fields:
            assert getattr(original, field) == getattr(from_pickle, field)

        # JSON round trip.
        from_json = Variant.from_json(original.to_json())
        assert original == from_json
예제 #2
0
def test_serialization():
    """Round-trip a VariantCollection (with metadata) through pickle and JSON."""
    members = [
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    original = VariantCollection(members)
    original.metadata[original[0]] = {"a": "b"}
    original.metadata[original[2]] = {"bar": 2}

    # Requesting effects makes the variants' ensembl objects open a SQL
    # connection, after which those objects can no longer be serialized.
    # Doing it first verifies that serialization never tries to dump the
    # ensembl objects directly.
    original.effects()

    def check_round_trip(restored):
        # The restored collection, its first variant and that variant's
        # metadata must all match the original.
        eq_(original, restored)
        eq_(restored[0], original[0])
        eq_(restored.metadata[original[0]], original.metadata[original[0]])

    # Pickle round trip.
    check_round_trip(pickle.loads(pickle.dumps(original)))

    # JSON round trip.
    check_round_trip(VariantCollection.from_json(original.to_json()))
예제 #3
0
def test_serialization():
    """Round-trip Variant objects through pickle and JSON, field by field."""
    candidates = [
        Variant(1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    field_names = (
        "contig",
        "ref",
        "alt",
        "start",
        "end",
        "original_ref",
        "original_alt",
        "original_start",
    )
    for original in candidates:
        # Requesting effects makes the variant's ensembl object open a SQL
        # connection, after which that object can no longer be serialized.
        # Doing it first verifies that serialization never tries to dump the
        # ensembl object directly.
        original.effects()

        # Pickle round trip.
        unpickled = pickle.loads(pickle.dumps(original))
        eq_(original, unpickled)
        for field_name in field_names:
            eq_(getattr(original, field_name), getattr(unpickled, field_name))

        # JSON round trip.
        from_json = Variant.from_json(original.to_json())
        eq_(original, from_json)
예제 #4
0
def test_drop_duplicates():
    """A VariantCollection keeps only one copy of equal variants."""
    release = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=release)
    # Separate object that is equal to `first`.
    first_duplicate = Variant("1", 3000, "A", "G", ensembl=release)
    second = Variant("2", 10, "G", "T", ensembl=release)
    # The same object twice, an equal copy, and one distinct variant.
    inputs = [first, first, first_duplicate, second]
    deduplicated = VariantCollection(variants=inputs)
    assert len(deduplicated) == 2
예제 #5
0
def test_multiple_alleles_per_line():
    """A VCF line with two alternate alleles yields two separate variants."""
    loaded = load_vcf(data_path("multiallelic.vcf"))
    assert len(loaded) == 2, "Expected 2 variants but got %s" % loaded
    # Same locus and reference allele, one variant per alternate allele.
    expected = {
        Variant(1, 1431105, "A", alt_allele, genome="GRCh37")
        for alt_allele in ("C", "G")
    }
    eq_(set(loaded), expected)
예제 #6
0
def test_contig_name_normalization():
    """Contig names are normalized only when normalize_contig_names is set."""
    # Integer contig: becomes the string "1" when normalized, stays 1 otherwise.
    normalized_int = Variant(1, 1, "A", "G", normalize_contig_names=True)
    eq_(normalized_int.contig, "1")
    raw_int = Variant(1, 1, "A", "G", normalize_contig_names=False)
    eq_(raw_int.contig, 1)

    # Lowercase "chrm": the final letter is uppercased when normalized
    # (UCSC name conversion is switched off in both cases).
    normalized_mito = Variant(
        "chrm", 1, "A", "G",
        normalize_contig_names=True,
        convert_ucsc_contig_names=False)
    eq_(normalized_mito.contig, "chrM")
    raw_mito = Variant(
        "chrm", 1, "A", "G",
        normalize_contig_names=False,
        convert_ucsc_contig_names=False)
    eq_(raw_mito.contig, "chrm")
예제 #7
0
def test_silent_stop_codons():
    """Yield one Silent-effect check per (transcript, C>T variant) pair."""
    # (transcript ID, contig, position) of each C>T variant under test.
    loci = [
        ("ENST00000290524", 1, 151314663),
        ("ENST00000368725", 1, 153409535),
        ("ENST00000353479", 10, 105791994),
    ]
    # Build every variant up front, before the first check is yielded.
    variants_by_transcript = {
        transcript_id: Variant(
            contig, start=position, ref="C", alt="T", genome=ensembl_grch37)
        for transcript_id, contig, position in loci
    }
    for transcript_id in variants_by_transcript:
        variant = variants_by_transcript[transcript_id]
        yield (expect_effect, variant, transcript_id, Silent)
예제 #8
0
def test_snv_transition_transversion():
    """Check is_snv / is_transition / is_transversion for ref allele C."""
    def variant_with_alt(alt):
        # All cases share the locus and the reference allele "C".
        return Variant(1, start=100, ref="C", alt=alt)

    # C>C changes nothing, so it is not an SNV.
    unchanged = variant_with_alt("C")
    assert not unchanged.is_snv

    # C>T is an SNV and a transition.
    c_to_t = variant_with_alt("T")
    assert c_to_t.is_snv
    assert c_to_t.is_transition
    assert not c_to_t.is_transversion

    # C>A is an SNV and a transversion.
    c_to_a = variant_with_alt("A")
    assert c_to_a.is_snv
    assert not c_to_a.is_transition
    assert c_to_a.is_transversion
예제 #9
0
def test_maf():
    """Compare the loaded TCGA OV MAF variants against a known list."""
    # (contig, position, ref, alt) for each expected variant, in file order.
    expected_loci = [
        (1, 1650797, "A", "G"),
        (1, 23836447, "C", "A"),
        (1, 231401797, "A", "C"),
        (11, 124617502, "C", "G"),
    ]
    expected_variants = [
        Variant(contig, position, ref, alt, ensembl)
        for contig, position, ref, alt in expected_loci
    ]
    eq_(len(tcga_ov_variants), len(expected_variants))
    for expected_variant, maf_variant in zip(expected_variants, tcga_ov_variants):
        eq_(expected_variant, maf_variant)
        # The gene symbol recorded in the MAF metadata must be among the
        # genes reported for the variant.
        hugo_symbol = tcga_ov_variants.metadata[maf_variant]['Hugo_Symbol']
        symbol_found = any(
            gene.name == hugo_symbol for gene in maf_variant.genes)
        assert symbol_found, \
            "Expected gene name %s but got %s" % (hugo_symbol, maf_variant.genes)
예제 #10
0
def test_mhc_predictor_error():
    """predict_epitopes returns no predictions when the MHC predictor raises."""
    mouse_genome = EnsemblRelease(species="mouse")
    transcript = mouse_genome.transcripts_by_name("Wdr13-001")[0]

    fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[transcript])

    # Stand-in predictor whose every prediction call fails; vaxrank is
    # expected to cope with that rather than crash.
    class AlwaysFailingPredictor:
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    predictions = predict_epitopes(
        mhc_predictor=AlwaysFailingPredictor(),
        protein_fragment=fragment,
        genome=mouse_genome)

    eq_(0, len(predictions))
예제 #11
0
def variants_from_csv(csv_file, sample_id=None, reference=None):
    """Build a varcode VariantCollection from a CSV file of variants.

    Args:
        csv_file: CSV file (anything ``pd.read_csv`` accepts) containing the
            columns CHROM, POS, REF and ALT. A ``sample_id`` column is
            optional and only used for filtering.
        sample_id: if given and the file has a ``sample_id`` column, keep only
            the rows of that sample and drop rows that repeat the same
            (CHROM, POS, REF, ALT) combination.
        reference: reference genome used for variant calling. Currently
            unused: every variant is created with
            ``pyensembl.ensembl_grch38``.

    Returns:
        A ``varcode`` VariantCollection with one Variant per remaining row.
    """

    from pyensembl import ensembl_grch38
    import varcode
    from varcode import Variant
    df = pd.read_csv(csv_file)
    # Identity comparison: `!= None` can be overridden by the argument's type.
    if sample_id is not None and 'sample_id' in df.columns:
        df = df[df.sample_id == sample_id]
        # CHROM is part of the key so that variants sharing POS/REF/ALT on
        # different contigs are not collapsed into one.
        df = df.drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'])
    variants = [
        Variant(contig=row.CHROM,
                start=row.POS,
                ref=row.REF,
                alt=row.ALT,
                ensembl=ensembl_grch38)
        for _, row in df.iterrows()
    ]
    return varcode.variant_collection.VariantCollection(variants)
def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand():
    """ReferenceSequenceKey for an insertion on the negative-strand TP53-001."""
    # Insert 'CCC' after the start codon of TP53-001; on the reverse
    # complement that means inserting "GGG" between "CTC_CAT".
    insertion = Variant("17", 7676589, "CTC", "CTCGGG", grch38)
    transcript = grch38.transcripts_by_name("TP53-001")[0]

    # Sanity-check the transcript sequence around the start codon, with 10
    # nucleotides of context before it:
    #   'GGTCACTGCC_ATG_GAGGAGCCGC'
    start_codon_offset = 190
    context_window = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(context_window, "GGTCACTGCCATGGAGGAGCCGC")

    # That is the cDNA sequence from the transcript; the corresponding
    # reverse-complement genomic sequence is:
    #    GCGGCTCCTC_CAT_GGCAGTGACC

    expected = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the variant.
    actual = ReferenceSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)
    eq_(actual, expected)
예제 #13
0
def test_frameshift_near_start_of_BRCA1_001():
    """An inserted genomic "A" near the start of BRCA1-001 is a FrameShift."""
    #
    # Insertion of genomic "A" after the second codon of the coding sequence.
    #
    # Transcript: BRCA1-001 (ENST00000357654)
    # Manually annotated using Ensembl release 85
    #
    # Original mRNA coding sequence:
    #   ATG GAT TTA TCT GCT CTT CGC GTT GAA GAA GTA CAA
    #   -M- -D- -L- -S- -A- -L- -R- -V- -E- -E- -V- -Q-
    #
    # After variant:
    #   ATG GAT TTT ATC TGC TCT TCG CGT TGA
    #   -M- -D- -F- -I- -C- -S- -S- -R-  *
    insertion_position = 43124096 - 6
    insertion = Variant(
        "17",
        insertion_position,
        ref="",
        alt="A",
        ensembl=ensembl_grch38)
    expect_effect(
        insertion,
        transcript_id="ENST00000357654",
        effect_class=FrameShift,
        modifies_coding_sequence=True,
        modifies_protein_sequence=True,
        aa_alt="FICSSR")
예제 #14
0
def test_multiple_variant_forms():
    """
    A patient's variants may mix VCF, MAF and VariantCollection sources;
    all of them must show up in the loaded result.
    """
    vcf_dir, cohort = None, None
    try:
        vcf_dir, cohort = make_cohort([FILE_FORMAT_1])
        patient = cohort[0]
        # The MAF path is appended twice on purpose: listing the same file
        # twice should have no effect.
        for _ in range(2):
            patient.variants.append(data_path(MAF_FILE))
        in_memory_variant = Variant(
            start=1000000, ref="A", alt="T", contig=1, ensembl=75)
        patient.variants.append(VariantCollection([in_memory_variant]))

        cohort_variants = cohort.load_variants(patients=[patient])

        def count_starting_at(position):
            # Number of this patient's loaded variants starting at `position`.
            matches = cohort_variants[patient.id].filter(
                lambda v: v.start == position)
            return len(matches)

        # The VariantCollection was included.
        eq_(count_starting_at(1000000), 1)

        # The VCF was included.
        eq_(count_starting_at(53513530), 1)

        # The MAF was included.
        eq_(count_starting_at(1650797), 1)

        # A non-existent variant is not included.
        eq_(count_starting_at(1650798), 0)
    finally:
        # Always remove the temporary VCF directory and the cohort caches.
        if vcf_dir is not None and path.exists(vcf_dir):
            rmtree(vcf_dir)
        if cohort is not None:
            cohort.clear_caches()
예제 #15
0
def test_allele_count_dataframe():
    """allele_counts_dataframe reports per-variant ref/alt/other read counts."""
    variant = Variant("test_contig", 50, "C", "G")

    # Two reads carrying the reference allele and one carrying the alternate.
    ref_reads = [
        AlleleRead(prefix="AAA", allele="C", suffix="TTT", name="C1"),
        AlleleRead(prefix="AAC", allele="C", suffix="TTA", name="C2"),
    ]
    alt_reads = [
        AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1"),
    ]
    read_evidence = ReadEvidence(
        trimmed_base1_start=50,
        trimmed_ref="C",
        trimmed_alt="G",
        ref_reads=ref_reads,
        alt_reads=alt_reads,
        other_reads=[])

    df = allele_counts_dataframe([(variant, read_evidence)])
    assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df, )
    only_row = df.iloc[0]
    eq_(only_row.num_ref_reads, 2)
    eq_(only_row.num_alt_reads, 1)
    eq_(only_row.num_other_reads, 0)
예제 #16
0
def test_most_common_nucleotides_for_chr12_deletion():
    """Consensus of reads supporting a chr12 deletion matches most reads."""
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    # Deletion of the whole reference allele starting at this base-1 position.
    deletion = Variant(
        chromosome,
        70091490,
        "TTGTAGATGCTGCCTCTCC",
        "",
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=deletion)
    result = most_common_nucleotides(variant_reads)
    consensus_sequence, chosen_counts, other_counts = result
    print(chosen_counts)
    print(other_counts)

    # One count per consensus position in each array.
    n_consensus = len(consensus_sequence)
    eq_(len(chosen_counts), n_consensus)
    eq_(len(other_counts), n_consensus)
    assert other_counts.sum() < chosen_counts.sum(), \
        "Counts for alternate nucleotides should not exceed the chosen sequence"

    # A read "matches" when its full sequence occurs inside the consensus.
    number_matching_reads = sum(
        (read.prefix + read.allele + read.suffix) in consensus_sequence
        for read in variant_reads)
    n_reads = len(variant_reads)
    fraction_matching_reads = number_matching_reads / float(n_reads)
    print("Fraction matching reads is %d/%d = %f" % (
        number_matching_reads, n_reads, fraction_matching_reads))
    assert fraction_matching_reads > 0.5, \
        "Expected majority of reads to match consensus sequence"
def test_reference_coding_sequence_key_around_TP53_201_variant():
    """Coding sequence key for a substitution in the 2nd codon of TP53-201."""
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-

    # The variant under test is chr17 7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # The predicted effect must be an E>K substitution.
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    # With 3 nucleotides of context the key covers the start codon, the
    # mutated nucleotide and the 3 nucleotides after it.
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M")
    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv, transcript=tp53_201, context_size=3)
    eq_(expected, actual)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    """5 nt of context overlaps, but does not contain, the start codon."""
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see part of the start codon. The result therefore
    # should not "contain" the start codon but does "overlap" it. On the
    # reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons, with
    # 5 context nucleotides on each side:
    #   last two nt of start codon: TG
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    #   first two nt of 4th codon: CC
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")

    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=5)
    eq_(actual, expected)
예제 #19
0
def test_group_unique_sequences():
    """Grouping reads by unique sequence yields fewer groups than reads."""
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    snv = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=snv)
    n_reads = len(variant_reads)
    print("%d variant reads: %s" % (n_reads, variant_reads))

    groups = group_unique_sequences(
        variant_reads,
        max_prefix_size=30,
        max_suffix_size=30)
    n_groups = len(groups)
    print("%d unique sequences: %s" % (n_groups, groups))

    # Some reads are redundant, so the number of unique entries is expected
    # to be smaller than the total number of read partitions.
    assert n_reads > n_groups
예제 #20
0
def test_locus_reads_snv():
    """
    Check the LocusRead produced for the SNV T>G at base-1 position 4,
    where the reference sequence is assumed to be "ACCTTG".
    """
    # reference sequence = "ACCTTG"
    contig_name = "chromosome"
    snv = Variant(
        contig_name,
        4,
        ref="T",
        alt="G",
        normalize_contig_name=False)

    # A single read carrying the alternate allele.
    pysam_read = make_read(seq="ACCGTG", cigar="6M", mdtag="3G2")
    samfile = DummySamFile(reads=[pysam_read])

    read_generator = locus_read_generator(
        samfile=samfile,
        chromosome=contig_name,
        base1_position_before_variant=snv.start - 1,
        base1_position_after_variant=snv.start + 1)
    reads = list(read_generator)
    print(reads)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)

    # Every read base aligns to a reference position; the variant base sits
    # between read offsets 2 and 4.
    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=2,
        base0_read_position_after_variant=4)
    assert_equal_fields(reads[0], expected)
예제 #21
0
def test_locus_reads_substitution_shorter():
    """LocusRead for a CC>G substitution that shortens the sequence."""
    # CC>G substitution at the 2nd and 3rd nucleotides of the reference
    # sequence "ACCTTG"; the alignment is interpreted as a C>G variant
    # followed by the deletion of a C.
    contig_name = "chromosome"
    substitution = Variant(
        contig_name,
        2,
        ref="CC",
        alt="G",
        normalize_contig_name=False)
    print(substitution)

    pysam_read = make_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")
    samfile = DummySamFile(reads=[pysam_read])

    read_generator = locus_read_generator(
        samfile=samfile,
        chromosome=contig_name,
        base1_position_before_variant=1,
        base1_position_after_variant=4)
    reads = list(read_generator)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)
    print(reads)

    # Reference position 2 is skipped because of the deleted C.
    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=2)
    assert_equal_fields(reads[0], expected)
예제 #22
0
def test_locus_reads_substitution_longer():
    """LocusRead for a C>GG substitution that lengthens the sequence."""
    # C>GG substitution at the second nucleotide of the reference sequence
    # "ACCTTG"; the alignment is interpreted as a C>G variant followed by
    # the insertion of another G.
    contig_name = "chromosome"
    substitution = Variant(
        contig_name,
        2,
        ref="C",
        alt="GG",
        normalize_contig_name=False)
    print(substitution)

    pysam_read = make_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4")
    samfile = DummySamFile(reads=[pysam_read])

    read_generator = locus_read_generator(
        samfile=samfile,
        chromosome=contig_name,
        base1_position_before_variant=1,
        base1_position_after_variant=3)
    reads = list(read_generator)
    print(reads)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)

    # The inserted G has no reference position, hence the None entry.
    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, None, 2, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=3)
    assert_equal_fields(reads[0], expected)
예제 #23
0
def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position,
                                 dna_ref, dna_alt, aa_pos, aa_alt):
    """Assert that a DNA change causes the expected amino acid substitution.

    Builds the variant, looks up its effect on `ensembl_transcript_id` and
    checks that the effect is a Substitution placing `aa_alt` at the base-1
    protein position `aa_pos`.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)

    # Index the transcript-level mutation effects by transcript ID.
    effects_by_transcript_id = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            effects_by_transcript_id[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in effects_by_transcript_id, \
        "%s not found in %s" % (ensembl_transcript_id, effects_by_transcript_id)
    effect = effects_by_transcript_id[ensembl_transcript_id]

    # Exonic splice site mutations carry an alternate effect, which is what
    # gets checked against dbNSFP (since that database seemed to ignore
    # exonic splicing mutations).
    if isinstance(effect, ExonicSpliceSite):
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    # The effect's offset is base-0 whereas `aa_pos` is base-1.
    observed_offset = effect.aa_mutation_start_offset
    observed_aa = effect.mutant_protein_sequence[observed_offset]
    position_and_residue_match = (
        observed_offset + 1 == aa_pos and observed_aa == aa_alt)
    assert position_and_residue_match, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
예제 #24
0
def test_locus_reads_deletion():
    """
    Check the LocusRead produced for the deletion TT>T at base-1 position 4,
    where the reference sequence is assumed to be "ACCTTG".
    """
    # Normalization turns this variant into the deletion of "T" at base-1
    # position 5.
    deletion = Variant("1", 4, ref="TT", alt="T")
    pysam_read = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")

    contig_name = "chromosome"
    samfile = MockAlignmentFile(references={contig_name}, reads=[pysam_read])
    collector = ReadCollector()
    reads = collector.get_locus_reads(
        samfile, contig_name, deletion.start - 1, deletion.start)
    print(reads)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)

    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 5],
        quality_scores=pysam_read.query_qualities,
        # The missing nucleotide would have come after the 4th nucleotide
        # of the read, so the read interval is empty.
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=4,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=5)
    assert_equal_fields(reads[0], expected)
def test_sequence_key_with_reading_frame_insertion():
    """Coding sequence key for an insertion whose context reaches the 5' UTR."""
    # Insert nucleotide "T" after the second codon of TP53-001; the
    # surrounding context includes nucleotides from the 5' UTR. Since TP53 is
    # on the negative strand we have to take the reverse complement of the
    # variant, which turns it into 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons, with
    # 10 context nucleotides:
    #   last 4 nt of 5' UTR: TGCC
    #   start codon: ATG (translates to M)
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    #   4th codon: CCG
    #   5th codon: CAG
    #   first nt of 6th codon: T
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")

    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=10)
    eq_(actual, expected)
def test_partitioned_read_sequences_deletion():
    """
    Check that a read is partitioned correctly for the deletion TT>T at
    base-1 position 4, where the reference sequence is assumed to be
    "ACCTTG".
    """
    # reference sequence = "ACCTTG"
    contig_name = "chromosome"
    deletion = Variant(
        contig_name, 4, "TT", "T", grch38, normalize_contig_name=False)

    pysam_read = make_pysam_read(
        seq="ACCTG",
        cigar="4M1D1M",
        mdtag="4^T1")
    samfile = MockAlignmentFile(
        references=(contig_name,),
        reads=[pysam_read])

    collector = ReadCollector()
    supporting_reads = collector.allele_reads_supporting_variant(
        alignment_file=samfile,
        variant=deletion)
    print(supporting_reads)
    assert len(supporting_reads) == 1

    # The deleted base leaves an empty allele between prefix and suffix.
    expected = AlleleRead(
        name=pysam_read.qname,
        prefix="ACCT",
        allele="",
        suffix="G")
    eq_(supporting_reads[0], expected)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    """3 nt of context around the insertion does not reach the start codon."""
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see the second codon (and no nucleotides from the
    # start codon). On the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons, with
    # 3 context nucleotides on each side:
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")

    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=3)
    eq_(actual, expected)
예제 #28
0
def test_locus_reads_substitution_shorter():
    """LocusRead intervals for a CC>G substitution that shortens the read."""
    # CC>G substitution at the 2nd and 3rd nucleotides of the reference
    # sequence "ACCTTG"; the alignment is interpreted as a C>G variant
    # followed by the deletion of a C.
    substitution = Variant("1", 2, ref="CC", alt="G")
    print(substitution)
    pysam_read = make_pysam_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")

    contig_name = "chromosome"
    samfile = MockAlignmentFile(references={contig_name}, reads=[pysam_read])
    collector = ReadCollector()
    reads = collector.get_locus_reads(samfile, contig_name, 1, 3)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)
    print(reads)

    # One read base (offset 1) covers two reference bases (offsets 1-2);
    # reference position 2 is skipped because of the deleted C.
    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        read_base0_start_inclusive=1,
        read_base0_end_exclusive=2,
        reference_base0_start_inclusive=1,
        reference_base0_end_exclusive=3)
    assert_equal_fields(reads[0], expected)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    """Coding sequence key for a codon deletion with 5' UTR in the context."""
    # Delete the second codon of TP53-001; the surrounding context includes
    # nucleotides from the 5' UTR. Since TP53 is on the negative strand we
    # have to take the reverse complement of the variant, which turns it
    # into 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the second codon with 10 context nucleotides:
    # In [51]: t.sequence[193-10:193+13]
    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # which splits into the following parts:
    #  last 7 nt of 5' UTR: CACTGCC
    #  start codon: ATG (translates to M)
    #  2nd codon: GAG    <---- variant occurs here
    #  3rd codon: GAG
    #  4th codon: CCG
    #  5th codon: CAG
    #  first nt of 6th codon: T
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M")

    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion, transcript=transcript, context_size=10)
    eq_(actual, expected)
예제 #30
0
def test_locus_reads_insertion():
    """
    Check the LocusRead produced for the insertion T>TG at base-1
    position 4, where the reference sequence is assumed to be "ACCTTG"
    and the variant sequence is "ACCTGTG".
    """
    insertion = Variant("1", 4, ref="T", alt="TG")
    pysam_read = make_pysam_read(seq="ACCTGTG", cigar="4M1I2M", mdtag="6")

    contig_name = "chromosome"
    samfile = MockAlignmentFile(references={contig_name}, reads=[pysam_read])
    collector = ReadCollector()
    reads = collector.get_locus_reads(
        samfile, contig_name, insertion.start, insertion.start)
    print(reads)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)
    actual = reads[0]

    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        # The inserted nucleotide has no corresponding reference position.
        reference_positions=[0, 1, 2, 3, None, 4, 5],
        quality_scores=pysam_read.query_qualities,
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=5,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=4)
    print("Actual: %s" % (actual, ))
    print("Expected: %s" % (expected, ))
    assert_equal_fields(actual, expected)