def test_serialization():
    """Round-trip single Variant objects through pickle and JSON."""
    test_cases = (
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=ensembl77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    )
    compared_fields = ("contig", "ref", "alt", "start", "end")

    for original in test_cases:
        # Computing effects makes the variant's ensembl object open a SQL
        # connection, after which that ensembl object can no longer be
        # serialized. Calling it first verifies that serialization never
        # tries to dump the ensembl object directly.
        original.effects()

        # Pickle round trip.
        reconstituted = pickle.loads(pickle.dumps(original))
        assert original == reconstituted
        for field in compared_fields:
            assert getattr(original, field) == getattr(reconstituted, field)

        # JSON round trip.
        reconstituted = Variant.from_json(original.to_json())
        assert original == reconstituted
def test_serialization():
    """Round-trip a VariantCollection and its metadata through pickle and JSON."""
    original = VariantCollection([
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ])
    original.metadata[original[0]] = {"a": "b"}
    original.metadata[original[2]] = {"bar": 2}

    def check_round_trip(candidate):
        # The collection, its first variant, and that variant's metadata
        # must all survive the round trip.
        eq_(original, candidate)
        eq_(candidate[0], original[0])
        eq_(candidate.metadata[original[0]],
            original.metadata[original[0]])

    # Computing effects makes the variants' ensembl objects open a SQL
    # connection, which renders them non-serializable. Doing this first
    # checks that we never try to serialize the ensembl object directly.
    original.effects()

    # Pickle round trip.
    check_round_trip(pickle.loads(pickle.dumps(original)))

    # JSON round trip.
    check_round_trip(VariantCollection.from_json(original.to_json()))
def test_serialization():
    """Round-trip Variant objects (including original_* fields) via pickle/JSON."""
    test_cases = (
        Variant(1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    )
    compared_fields = (
        "contig",
        "ref",
        "alt",
        "start",
        "end",
        "original_ref",
        "original_alt",
        "original_start",
    )

    for original in test_cases:
        # Computing effects makes the variant's ensembl object open a SQL
        # connection, after which it is non-serializable. Calling it first
        # verifies that we do not attempt to serialize that object directly.
        original.effects()

        # Pickle round trip.
        reconstituted = pickle.loads(pickle.dumps(original))
        eq_(original, reconstituted)
        for field in compared_fields:
            eq_(getattr(original, field), getattr(reconstituted, field))

        # JSON round trip.
        reconstituted = Variant.from_json(original.to_json())
        eq_(original, reconstituted)
def test_drop_duplicates():
    """A VariantCollection built from repeated variants keeps one of each."""
    release = EnsemblRelease(78)

    def on_release(contig, start, ref, alt):
        return Variant(contig, start, ref, alt, ensembl=release)

    first = on_release("1", 3000, "A", "G")
    # Equal to `first` but a distinct object.
    first_again = on_release("1", 3000, "A", "G")
    second = on_release("2", 10, "G", "T")

    # `first` appears twice by identity and once more by equality.
    deduplicated = VariantCollection(
        variants=[first, first, first_again, second])
    assert len(deduplicated) == 2
def test_multiple_alleles_per_line():
    """Loading multiallelic.vcf yields one variant per alternate allele."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants

    # Same locus and reference allele, two different alternate alleles.
    expected = set(
        Variant(1, 1431105, "A", alt_allele, genome="GRCh37")
        for alt_allele in ("C", "G"))
    eq_(set(variants), expected)
def test_contig_name_normalization():
    """Check Variant.contig with contig-name normalization on and off."""
    # (contig passed in, extra constructor options, expected .contig)
    cases = [
        (1, dict(normalize_contig_names=True), "1"),
        (1, dict(normalize_contig_names=False), 1),
        # uppercase
        ("chrm",
         dict(normalize_contig_names=True, convert_ucsc_contig_names=False),
         "chrM"),
        ("chrm",
         dict(normalize_contig_names=False, convert_ucsc_contig_names=False),
         "chrm"),
    ]
    for contig, options, expected_contig in cases:
        eq_(Variant(contig, 1, "A", "G", **options).contig, expected_contig)
def test_silent_stop_codons():
    """Yield one expect_effect check per C>T variant expected to be Silent.

    Each yielded tuple is (expect_effect, variant, transcript_id, Silent);
    presumably a generator-test runner calls the first item with the rest.
    """
    # (transcript ID, contig, start) for each C>T variant on GRCh37.
    loci = [
        ("ENST00000290524", 1, 151314663),
        ("ENST00000368725", 1, 153409535),
        ("ENST00000353479", 10, 105791994),
    ]
    silent_stop_codon_variants = dict(
        (transcript_id,
         Variant(contig, start=start, ref="C", alt="T",
                 genome=ensembl_grch37))
        for transcript_id, contig, start in loci)

    for transcript_id, variant in silent_stop_codon_variants.items():
        yield (expect_effect, variant, transcript_id, Silent)
def test_snv_transition_transversion():
    """Check is_snv / is_transition / is_transversion on C>C, C>T and C>A."""
    def variant_with_alt(alt):
        return Variant(1, start=100, ref="C", alt=alt)

    # Identical ref and alt is not an SNV.
    assert not variant_with_alt("C").is_snv

    # C>T is expected to be a transition, C>A a transversion.
    for alt, expect_transition in (("T", True), ("A", False)):
        snv = variant_with_alt(alt)
        assert snv.is_snv
        assert bool(snv.is_transition) == expect_transition
        assert bool(snv.is_transversion) != expect_transition
def test_maf():
    """Compare tcga_ov_variants with the expected variants and gene names."""
    expected_loci = [
        (1, 1650797, "A", "G"),
        (1, 23836447, "C", "A"),
        (1, 231401797, "A", "C"),
        (11, 124617502, "C", "G"),
    ]
    expected_tcga_ov_variants = [
        Variant(contig, start, ref, alt, ensembl)
        for contig, start, ref, alt in expected_loci
    ]

    eq_(len(tcga_ov_variants), len(expected_tcga_ov_variants))
    for v_expect, v_maf in zip(expected_tcga_ov_variants, tcga_ov_variants):
        eq_(v_expect, v_maf)
        # The Hugo_Symbol stored in the collection's metadata must name one
        # of the genes the variant reports.
        gene_name = tcga_ov_variants.metadata[v_maf]['Hugo_Symbol']
        found = any(gene.name == gene_name for gene in v_maf.genes)
        assert found, \
            "Expected gene name %s but got %s" % (gene_name, v_maf.genes)
def test_mhc_predictor_error():
    """predict_epitopes returns nothing when the MHC predictor always raises."""

    class FakeMHCPredictor:
        # Throws an error for each prediction, so we can make sure vaxrank
        # doesn't fall down.
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    genome = EnsemblRelease(species="mouse")
    wdr13_transcript = genome.transcripts_by_name("Wdr13-001")[0]

    fragment_fields = dict(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[wdr13_transcript],
    )
    protein_fragment = MutantProteinFragment(**fragment_fields)

    epitope_predictions = predict_epitopes(
        mhc_predictor=FakeMHCPredictor(),
        protein_fragment=protein_fragment,
        genome=genome)
    eq_(0, len(epitope_predictions))
def variants_from_csv(csv_file, sample_id=None, reference=None):
    """Load variants from a csv file into a varcode VariantCollection.

    Args:
        csv_file: anything accepted by pandas.read_csv. The columns CHROM,
            POS, REF and ALT are required; a sample_id column is optional.
        sample_id: if provided and the file has a sample_id column, keep
            only the rows for this id
        reference: ref genome used for variant calling. Currently unused:
            every variant is created against ensembl_grch38.

    Returns:
        A VariantCollection with one Variant per distinct
        (CHROM, POS, REF, ALT) row.
    """
    from pyensembl import ensembl_grch38
    import varcode
    from varcode import Variant

    df = pd.read_csv(csv_file)
    if sample_id is not None and 'sample_id' in df.columns:
        df = df[df.sample_id == sample_id]
    # CHROM has to be part of the key; otherwise variants sharing a
    # position and alleles on different contigs are dropped as duplicates.
    df = df.drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'])
    variants = [
        Variant(contig=r.CHROM, start=r.POS, ref=r.REF, alt=r.ALT,
                ensembl=ensembl_grch38)
        for _, r in df.iterrows()
    ]
    return varcode.variant_collection.VariantCollection(variants)
def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand():
    """ReferenceSequenceKey for an insertion on a reverse-strand transcript."""
    # Insert 'CCC' after the start codon of TP53-001, which on the reverse
    # complement means inserting "GGG" between "CTC_CAT".
    variant = Variant("17", 7676589, "CTC", "CTCGGG", grch38)
    transcript = grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the start codon with 10 context nucleotides:
    #   In [51]: t.sequence[190-10:190+13]
    #   Out[51]: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    window_start, window_end = 190 - 10, 190 + 13
    eq_(transcript.sequence[window_start:window_end],
        "GGTCACTGCCATGGAGGAGCCGC")

    # The above is the cDNA sequence from the transcript, whereas the
    # reverse complement genomic sequence is:
    #   GCGGCTCCTC_CAT_GGCAGTGACC

    # Ask for 10 nucleotides of context on each side of the variant.
    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=10)
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGGAGCCGC",
    )
    eq_(sequence_key, ReferenceSequenceKey(**expected_fields))
def test_frameshift_near_start_of_BRCA1_001():
    """Insertion near the start of BRCA1-001 is reported as a FrameShift."""
    #
    # Insertion of genomic "A" after second codon of coding sequence.
    #
    # Transcript: BRCA1-001 (ENST00000357654)
    # Manually annotated using Ensembl release 85
    #
    # Original mRNA coding sequence:
    #   ATG GAT TTA TCT GCT CTT CGC GTT GAA GAA GTA CAA
    #   -M- -D- -L- -S- -A- -L- -R- -V- -E- -E- -V- -Q-
    #
    # After variant:
    #   ATG GAT TTT ATC TGC TCT TCG CGT TGA
    #   -M- -D- -F- -I- -C- -S- -S- -R-  *
    insertion_start = 43124096 - 6
    variant = Variant(
        "17", insertion_start, ref="", alt="A", ensembl=ensembl_grch38)
    expectations = dict(
        transcript_id="ENST00000357654",
        effect_class=FrameShift,
        modifies_coding_sequence=True,
        modifies_protein_sequence=True,
        aa_alt="FICSSR",
    )
    expect_effect(variant, **expectations)
def test_multiple_variant_forms():
    """
    Load VCF, MAF and VariantCollection together.
    """
    vcf_dir, cohort = None, None
    try:
        vcf_dir, cohort = make_cohort([FILE_FORMAT_1])
        patient = cohort[0]

        # The MAF is appended twice on purpose: listing the same file a
        # second time should have no effect.
        for _ in range(2):
            patient.variants.append(data_path(MAF_FILE))

        in_memory_variant = Variant(
            start=1000000, ref="A", alt="T", contig=1, ensembl=75)
        patient.variants.append(VariantCollection([in_memory_variant]))

        cohort_variants = cohort.load_variants(patients=[patient])

        def count_starting_at(position):
            # Number of the patient's loaded variants with this start.
            matches = cohort_variants[patient.id].filter(
                lambda v: v.start == position)
            return len(matches)

        # Make sure the VariantCollection was included.
        eq_(count_starting_at(1000000), 1)
        # Make sure the VCF was included.
        eq_(count_starting_at(53513530), 1)
        # Make sure the MAF was included.
        eq_(count_starting_at(1650797), 1)
        # Make sure a non-existent variant is not included.
        eq_(count_starting_at(1650798), 0)
    finally:
        # Clean up whatever was created before any failure.
        if vcf_dir is not None and path.exists(vcf_dir):
            rmtree(vcf_dir)
        if cohort is not None:
            cohort.clear_caches()
def test_allele_count_dataframe():
    """allele_counts_dataframe reports ref/alt/other read counts per variant."""
    variant = Variant("test_contig", 50, "C", "G")

    # Two reads carrying the reference allele, one carrying the alternate.
    ref_reads = [
        AlleleRead(prefix=prefix, allele="C", suffix=suffix, name=name)
        for prefix, suffix, name in (
            ("AAA", "TTT", "C1"),
            ("AAC", "TTA", "C2"),
        )
    ]
    alt_reads = [
        AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1")
    ]
    read_evidence = ReadEvidence(
        trimmed_base1_start=50,
        trimmed_ref="C",
        trimmed_alt="G",
        ref_reads=ref_reads,
        alt_reads=alt_reads,
        other_reads=[])

    df = allele_counts_dataframe([(variant, read_evidence)])
    assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df, )

    only_row = df.iloc[0]
    eq_(only_row.num_ref_reads, 2)
    eq_(only_row.num_alt_reads, 1)
    eq_(only_row.num_other_reads, 0)
def test_most_common_nucleotides_for_chr12_deletion():
    """Consensus from most_common_nucleotides should match most variant reads."""
    chromosome = "chr12"
    variant = Variant(
        chromosome,
        70091490,
        "TTGTAGATGCTGCCTCTCC",
        "",
        ensembl=ensembl_grch38)
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    variant_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=variant)

    consensus_sequence, chosen_counts, other_counts = most_common_nucleotides(
        variant_reads)
    print(chosen_counts)
    print(other_counts)

    # One count per consensus position in both count vectors.
    n_consensus = len(consensus_sequence)
    eq_(len(chosen_counts), n_consensus)
    eq_(len(other_counts), n_consensus)
    assert other_counts.sum() < chosen_counts.sum(), \
        "Counts for alternate nucleotides should not exceed the chosen sequence"

    # A read "matches" when its full sequence occurs inside the consensus.
    number_matching_reads = sum(
        (r.prefix + r.allele + r.suffix) in consensus_sequence
        for r in variant_reads)
    total_reads = len(variant_reads)
    fraction_matching_reads = number_matching_reads / float(total_reads)
    print("Fraction matching reads is %d/%d = %f" % (
        number_matching_reads, total_reads, fraction_matching_reads))
    assert fraction_matching_reads > 0.5, \
        "Expected majority of reads to match consensus sequence"
def test_reference_coding_sequence_key_around_TP53_201_variant():
    """ReferenceCodingSequenceKey around a substitution in TP53-201."""
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so the sequence is:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # We're assuming a variant chr17. 7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity-check the predicted effect before looking at the sequence key.
    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=3)
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M",
    )
    eq_(ReferenceCodingSequenceKey(**expected_fields), actual)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    """With 5 nt of context the key overlaps but does not contain the start codon."""
    # Insert nucleotide "T" after the second codon of TP53-001, but keep
    # only enough context to see part of the start codon: the result
    # shouldn't "contain" the start codon but does "overlap" it. In the
    # reverse complement this variant becomes CTC>CTCA.
    variant = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons
    # with 5 context nucleotides:
    #   last two nt of start codon: TG
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    #   first two nt of 4th codon: CC
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=5)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    eq_(result, ReferenceCodingSequenceKey(**expected_fields))
def test_group_unique_sequences():
    """Grouping variant reads by unique sequence yields fewer groups than reads."""
    chromosome = "chr12"
    variant = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")

    variant_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=variant)
    n_reads = len(variant_reads)
    print("%d variant reads: %s" % (n_reads, variant_reads))

    groups = group_unique_sequences(
        variant_reads,
        max_prefix_size=30,
        max_suffix_size=30)
    n_groups = len(groups)
    print("%d unique sequences: %s" % (n_groups, groups))

    # Some reads are redundant, so the number of unique entries should be
    # smaller than the total number of read partitions.
    assert n_reads > n_groups
def test_locus_reads_snv():
    """
    test_locus_reads_snv : Test that locus_read_generator yields the
    expected LocusRead for chr1:4 T>G where the sequence for chr1 is
    assumed to be "ACCTTG"
    """
    # chr1_seq = "ACCTTG"
    contig = "chromosome"
    variant = Variant(
        contig, 4, ref="T", alt="G", normalize_contig_name=False)

    pysam_read = make_read(seq="ACCGTG", cigar="6M", mdtag="3G2")
    samfile = DummySamFile(reads=[pysam_read])

    read_iterator = locus_read_generator(
        samfile=samfile,
        chromosome=contig,
        base1_position_before_variant=variant.start - 1,
        base1_position_after_variant=variant.start + 1)
    reads = list(read_iterator)
    print(reads)

    n_reads = len(reads)
    assert n_reads == 1, \
        "Expected to get back one read but instead got %d" % (n_reads,)

    expected_fields = dict(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=2,
        base0_read_position_after_variant=4,
    )
    assert_equal_fields(reads[0], LocusRead(**expected_fields))
def test_locus_reads_substitution_shorter():
    """LocusRead for a CC>G substitution aligned as a mismatch plus deletion."""
    # Test CC>G substitution at the 2nd and 3rd nucleotides of reference
    # sequence "ACCTTG", for which the alignment is interpreted as a C>G
    # variant followed by the deletion of a C.
    contig = "chromosome"
    variant = Variant(
        contig, 2, ref="CC", alt="G", normalize_contig_name=False)
    print(variant)

    pysam_read = make_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")
    samfile = DummySamFile(reads=[pysam_read])

    read_iterator = locus_read_generator(
        samfile=samfile,
        chromosome=contig,
        base1_position_before_variant=1,
        base1_position_after_variant=4)
    reads = list(read_iterator)

    n_reads = len(reads)
    assert n_reads == 1, \
        "Expected to get back one read but instead got %d" % (n_reads,)
    print(reads)

    expected_fields = dict(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        # Reference position 2 has no read base (it is the deleted C).
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=2,
    )
    assert_equal_fields(reads[0], LocusRead(**expected_fields))
def test_locus_reads_substitution_longer():
    """LocusRead for a C>GG substitution aligned as a mismatch plus insertion."""
    # Test C>GG substitution at the second nucleotide of reference sequence
    # "ACCTTG"; the alignment is interpreted as a C>G variant followed by
    # an insertion of another G.
    contig = "chromosome"
    variant = Variant(
        contig, 2, ref="C", alt="GG", normalize_contig_name=False)
    print(variant)

    pysam_read = make_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4")
    samfile = DummySamFile(reads=[pysam_read])

    read_iterator = locus_read_generator(
        samfile=samfile,
        chromosome=contig,
        base1_position_before_variant=1,
        base1_position_after_variant=3)
    reads = list(read_iterator)
    print(reads)

    n_reads = len(reads)
    assert n_reads == 1, \
        "Expected to get back one read but instead got %d" % (n_reads,)

    expected_fields = dict(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        # The inserted base has no reference position, hence the None.
        reference_positions=[0, 1, None, 2, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=3,
    )
    assert_equal_fields(reads[0], LocusRead(**expected_fields))
def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position,
                                 dna_ref, dna_alt, aa_pos, aa_alt):
    """Assert that a DNA variant gives the expected amino acid substitution.

    Builds a Variant from (chrom, dna_position, dna_ref, dna_alt), picks its
    effect on `ensembl_transcript_id` and asserts that the effect is a
    Substitution whose mutant protein has `aa_alt` at position `aa_pos`
    (`aa_pos` is compared against `aa_mutation_start_offset + 1`).
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)

    # Index transcript-level effects by transcript ID; a later effect for
    # the same transcript replaces an earlier one.
    transcript_id_dict = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            transcript_id_dict[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in transcript_id_dict, \
        "%s not found in %s" % (ensembl_transcript_id, transcript_id_dict)
    effect = transcript_id_dict[ensembl_transcript_id]

    if isinstance(effect, ExonicSpliceSite):
        # Exonic splice site mutations carry with them an alternate effect,
        # which is what we check against dbNSFP (since that database seemed
        # to ignore exonic splicing mutations).
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    effect_aa_pos = effect.aa_mutation_start_offset
    effect_aa_alt = effect.mutant_protein_sequence[effect_aa_pos]
    position_matches = (effect_aa_pos + 1 == aa_pos)
    residue_matches = (effect_aa_alt == aa_alt)
    assert position_matches and residue_matches, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
def test_locus_reads_deletion():
    """
    test_locus_reads_deletion : Test that ReadCollector.get_locus_reads
    returns the expected LocusRead for chr1:4 TT>T where the sequence for
    chr1 is assumed to be "ACCTTG"
    """
    # Normalization of this variant will turn it into the deletion of
    # "T" at base-1 position 5.
    variant = Variant("1", 4, ref="TT", alt="T")
    pysam_read = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    samfile = MockAlignmentFile(
        references={"chromosome"},
        reads=[pysam_read])

    collector = ReadCollector()
    reads = collector.get_locus_reads(
        samfile,
        "chromosome",
        variant.start - 1,
        variant.start)
    print(reads)

    n_reads = len(reads)
    assert n_reads == 1, \
        "Expected to get back one read but instead got %d" % (n_reads,)

    expected_fields = dict(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 5],
        quality_scores=pysam_read.query_qualities,
        # The missing base would have gone after the 4th nucleotide in the
        # read, so the read interval is empty.
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=4,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=5,
    )
    assert_equal_fields(reads[0], LocusRead(**expected_fields))
def test_sequence_key_with_reading_frame_insertion():
    """Coding sequence key for a TP53-001 insertion with 5' UTR in the context."""
    # Insert nucleotide "T" after the second codon of TP53-001; the
    # surrounding context includes nucleotides from the 5' UTR. Since TP53
    # is on the negative strand we have to take the reverse complement of
    # the variant, which turns it into 'CTC'>'CTCA'.
    variant = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons
    # with 10 context nucleotides:
    #   last 4 nt of 5' UTR: TGCC
    #   start codon: ATG (translates to M)
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    #   4th codon: CCG
    #   5th codon: CAG
    #   first nt of 6th codon: T
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME",
    )
    eq_(result, ReferenceCodingSequenceKey(**expected_fields))
def test_partitioned_read_sequences_deletion():
    """
    test_partitioned_read_sequences_deletion : Test that read gets correctly
    partitioned for chr1:4 TT>T where the sequence for chr1 is assumed to
    be "ACCTTG"
    """
    # chr1_seq = "ACCTTG"
    chromosome = "chromosome"
    variant = Variant(
        chromosome, 4, "TT", "T", grch38, normalize_contig_name=False)

    pysam_read = make_pysam_read(
        seq="ACCTG",
        cigar="4M1D1M",
        mdtag="4^T1")
    samfile = MockAlignmentFile(
        references=(chromosome,),
        reads=[pysam_read])

    collector = ReadCollector()
    variant_reads = collector.allele_reads_supporting_variant(
        alignment_file=samfile,
        variant=variant)
    print(variant_reads)
    assert len(variant_reads) == 1

    # The expected allele is the empty string.
    expected = AlleleRead(
        name=pysam_read.qname,
        prefix="ACCT",
        allele="",
        suffix="G")
    eq_(variant_reads[0], expected)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    """With 3 nt of context the key neither contains nor overlaps the start codon."""
    # Insert nucleotide "T" after the second codon of TP53-001, but keep
    # only enough context to see the second codon (and no nucleotides from
    # the start). In the reverse complement this variant becomes CTC>CTCA.
    variant = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons
    # with 3 context nucleotides:
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=3)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    eq_(result, ReferenceCodingSequenceKey(**expected_fields))
def test_locus_reads_substitution_shorter():
    """LocusRead from ReadCollector for a CC>G substitution."""
    # Test CC>G substitution at the 2nd and 3rd nucleotides of reference
    # sequence "ACCTTG", for which the alignment is interpreted as a C>G
    # variant followed by the deletion of a C.
    variant = Variant("1", 2, ref="CC", alt="G")
    print(variant)

    pysam_read = make_pysam_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")
    samfile = MockAlignmentFile(
        references={"chromosome"},
        reads=[pysam_read])

    collector = ReadCollector()
    reads = collector.get_locus_reads(samfile, "chromosome", 1, 3)

    n_reads = len(reads)
    assert n_reads == 1, \
        "Expected to get back one read but instead got %d" % (n_reads,)
    print(reads)

    expected_fields = dict(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        # Reference position 2 has no read base (it is the deleted C).
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        read_base0_start_inclusive=1,
        read_base0_end_exclusive=2,
        reference_base0_start_inclusive=1,
        reference_base0_end_exclusive=3,
    )
    assert_equal_fields(reads[0], LocusRead(**expected_fields))
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    """Coding sequence key for deleting TP53-001's 2nd codon, with 5' UTR context."""
    # Delete the second codon of TP53-001; the surrounding context includes
    # nucleotides from the 5' UTR. Since TP53 is on the negative strand we
    # have to take the reverse complement of the variant, which turns it
    # into 'CTC'>''.
    variant = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the second codon with 10 context nucleotides:
    #   In [51]: t.sequence[193-10:193+13]
    #   Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # Which can be split into the following parts:
    #   last 7 nt of 5' UTR: CACTGCC
    #   start codon: ATG (translates to M)
    #   2nd codon: GAG <---- variant occurs here
    #   3rd codon: GAG
    #   4th codon: CCG
    #   5th codon: CAG
    #   first nt of 6th codon: T
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
    )
    eq_(result, ReferenceCodingSequenceKey(**expected_fields))
def test_locus_reads_insertion():
    """
    test_locus_reads_insertion : Test that ReadCollector.get_locus_reads
    returns the expected LocusRead for chr1:4 T>TG where the sequence for
    chr1 is assumed to be "ACCTTG" and the variant sequence is "ACCTGTG"
    """
    variant = Variant("1", 4, ref="T", alt="TG")
    pysam_read = make_pysam_read(seq="ACCTGTG", cigar="4M1I2M", mdtag="6")
    samfile = MockAlignmentFile(
        references={"chromosome"},
        reads=[pysam_read])

    collector = ReadCollector()
    reads = collector.get_locus_reads(
        samfile,
        "chromosome",
        variant.start,
        variant.start)
    print(reads)

    n_reads = len(reads)
    assert n_reads == 1, \
        "Expected to get back one read but instead got %d" % (n_reads,)
    read = reads[0]

    expected_fields = dict(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        # Expect the inserted nucleotide to be missing a corresponding
        # ref position.
        reference_positions=[0, 1, 2, 3, None, 4, 5],
        quality_scores=pysam_read.query_qualities,
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=5,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=4,
    )
    expected = LocusRead(**expected_fields)
    print("Actual: %s" % (read, ))
    print("Expected: %s" % (expected, ))
    assert_equal_fields(read, expected)