def test_reference_sequence_repository_register_genomic_ranges():
    """Registering several ranges at once yields one region per range."""
    ranges = [
        GenomicRange('X', 1, 100, '+'),
        GenomicRange('X', 500, 800, '+')
    ]

    repository = ReferenceSequenceRepository()
    repository.register_genomic_ranges(ranges)

    assert repository.region_count == len(ranges)
def test_genomic_range_contains(a_range, b_range, b_contains_a):
    """Check `in` between two same-chromosome, same-strand ranges.

    `a_range` and `b_range` are unpacked positionally into the
    `GenomicRange` constructor between the chromosome and the strand.
    """

    def make_range(bounds):
        return GenomicRange('X', *bounds, '+')

    a = make_range(a_range)
    b = make_range(b_range)

    # A range is expected to contain itself
    assert a in a

    if b_contains_a:
        assert a in b
    else:
        assert a not in b

    assert b not in a
def test_reference_sequence_get_subsequence():
    """Extract the first three nucleotides of a six-nucleotide reference."""
    full_range = GenomicRange('X', 100, 105, '+')
    sub_range = GenomicRange('X', 100, 102, '+')

    # Reference sequence spanning the full range
    reference = ReferenceSequence('AAACCC', full_range)

    # Subsequence restricted to the subrange
    extracted = reference.get_subsequence(sub_range)

    # Both the range and the nucleotides must match the subrange
    assert extracted.genomic_range == sub_range
    assert extracted.sequence == 'AAA'
def test_oligo_compute_mutations(targetons, mutator, pam_protection):
    """Compute mutations for every target segment of an oligonucleotide template.

    When `pam_protection` is set, the DEL1 and SNV mutators are additionally
    checked: apart from the newly introduced nucleotide (SNV only), mutated
    sequences must consist solely of `DUMMY_PAM_PROTECTION_NT`.
    """
    codons = CodonTable.load(get_data_file_path(CODON_TABLE_FP))

    # The reference sequence is the concatenation of all targeton sequences
    ref_seq = ''.join(targetons)
    ref_range = GenomicRange('X', 1, sum(map(len, targetons)), '+')
    pam_ref_seq = PamProtectedReferenceSequence(ref_seq, ref_range, ref_seq)

    adaptor_5 = 'AAAAAA'
    adaptor_3 = 'AAAAAA'

    # One segment per targeton, each carrying only the mutator under test
    segments = [
        TargetonOligoSegment(get_targeton(seq, pam_protection), {mutator})
        for seq in targetons
    ]

    template = OligoTemplate(
        TRANSCRIPT_INFO, pam_ref_seq, set(), set(), adaptor_5, adaptor_3, segments)

    for _, target_segment in template.target_segments:
        mutations = target_segment.compute_mutations(codons)[mutator].mutations

        # Sequence content is only checked for PAM-protected targetons
        if not pam_protection:
            continue

        if mutator == TargetonMutator.DEL1:
            for m in mutations:
                assert set(m.sequence) == {DUMMY_PAM_PROTECTION_NT}
        elif mutator == TargetonMutator.SNV:
            for m in mutations:
                assert set(m.sequence) - {m.new} == {DUMMY_PAM_PROTECTION_NT}
def test_reference_sequence_init(seq, start, end, valid):
    """Construct a reference sequence, expecting `ValueError` for invalid cases."""
    gr = GenomicRange('X', start, end, '+')

    # Only invalid parameter sets are expected to raise
    expectation = nullcontext() if valid else pytest.raises(ValueError)

    with expectation:
        ReferenceSequence(seq, gr)
def test_utr_repository_get_transcript_infos():
    """Look up transcript information for the reference ranges in a UTR repository."""
    repository = UTRRepository(UTR_RANGES)
    transcript_infos = repository.get_transcript_infos(REF_RANGES)

    # Exactly one range is expected to map to a transcript
    assert len(transcript_infos) == 1

    expected_range = GenomicRange('chrX', 800, 1200, '+')
    expected_info = TranscriptInfo('G1', 'T1')
    assert transcript_infos[expected_range] == expected_info
def test_cds_context_repository_compute_cds_contexts(start, end, exp_ext_5, exp_ext_3):
    """Compute the CDS context of a single target range and verify it.

    `exp_ext_5` and `exp_ext_3` are either `None` (no extension expected) or
    a pair that is unpacked positionally into a `GenomicRange` on the same
    chromosome and strand as the target.
    """
    chromosome, strand = 'X', '+'

    cds_ranges = PyRanges(df=CDS_RANGES_DF)
    gr = GenomicRange(chromosome, start, end, strand)

    # Single-row table describing the target range
    target_df = pd.DataFrame.from_records(
        [gr.as_pyrange()], columns=PYRANGES_FIELDS)
    target_ranges = PyRanges(df=target_df)
    target_ranges.is_const = False

    # Set up the repository and compute the contexts
    repository = CDSContextRepository(cds_ranges)
    repository.register_target_ranges(target_ranges)
    repository.compute_cds_contexts()

    # One context per registered target
    assert len(repository._target_cds_contexts) == 1
    exon_info, (ext_5, ext_3) = repository._target_cds_contexts[gr]

    # Exon information
    assert isinstance(exon_info, ExonInfo)
    assert exon_info.gene_id == GID
    assert exon_info.transcript_id == TID

    # CDS extensions, 5' first and 3' second
    for extension, expected in ((ext_5, exp_ext_5), (ext_3, exp_ext_3)):
        if expected is None:
            assert extension is None
        else:
            assert extension == GenomicRange(chromosome, *expected, strand)

    # Accessor methods must agree with the stored context
    assert repository.get_cds_extensions(gr) == (ext_5, ext_3)
    assert repository.get_exon_info(gr) == exon_info
    assert repository.get_transcript_info(gr) == exon_info.transcript_info
def test_reference_sequence_repository_get_genomic_range_subsequence():
    """Retrieve both five-nucleotide halves of a ten-nucleotide registered range."""
    a, b = 'AAAAA', 'GGGGG'
    sequence = a + b

    genomic_range = GenomicRange('X', 1023, 1032, '+')
    rsr = ReferenceSequenceRepository()
    rsr.register_genomic_range(genomic_range)

    # Store the sequence directly in the repository's internal table
    key = (genomic_range.start, genomic_range.end)
    rsr._sequences[genomic_range.chromosome][key] = sequence

    # Both bounds of the requested subrange are part of the result
    first_half = rsr.get_genomic_range_subsequence(genomic_range, 1023, 1027)
    second_half = rsr.get_genomic_range_subsequence(genomic_range, 1028, 1032)
    assert first_half == a
    assert second_half == b
    assert a + b == sequence
def test_compute_pam_protected_sequence(seq, pos, ref, alt, ppseq, valid):
    """Apply a single PAM variant to a reference sequence starting at position 100.

    Invalid cases are expected to raise; valid cases must leave the reference
    sequence untouched and produce `ppseq` as the PAM-protected sequence.
    """
    chromosome = 'X'
    start = 100

    # The end coordinate is inclusive, hence the `- 1`
    gr = GenomicRange(chromosome, start, start + len(seq) - 1, '+')
    variant = PamVariant(GenomicPosition(chromosome, pos), ref, alt)
    ref_seq = ReferenceSequence(seq, gr)

    expectation = nullcontext() if valid else pytest.raises(Exception)
    with expectation:
        pam_ref_seq = compute_pam_protected_sequence(ref_seq, {variant})

    if valid:
        assert pam_ref_seq.sequence == ref_seq.sequence
        assert pam_ref_seq.pam_protected_sequence == ppseq
def test_snv_table_get_snvs_triplet(triplet, alt_triplets, strand, mut_types_plus, mut_types_minus):
    """Check the amino acid changes and mutation types of all SNVs of one triplet.

    The translation function and the expected mutation types are selected
    according to `strand`.
    """
    gr = GenomicRange('X', 10, 12, strand)

    if strand == '+':
        translate = codon_table.translate
        mut_types = mut_types_plus
    else:
        translate = codon_table.translate_rc
        mut_types = mut_types_minus

    # SNV metadata, ordered by position and alternative nucleotide
    snv_meta = snv_table.get_snvs(strand, triplet, gr, 0, 0, reset_index=True)
    snv_meta = snv_meta.sort_values(['pos', 'alt']).reset_index(drop=True)

    # All SNVs share the same reference amino acid
    ref_aas = snv_meta.ref_aa.unique().astype('string')
    assert len(ref_aas) == 1
    assert ref_aas[0] == translate(triplet)

    # Alternative amino acids and mutation types
    expected_alt_aas = np.array([translate(t) for t in alt_triplets])
    assert np.array_equal(snv_meta.alt_aa, expected_alt_aas)
    assert np.array_equal(snv_meta.mut_type, mut_types)
def test_reference_sequence_repository_register_get_sequence():
    """Register a sequence and read it back by coordinates and by range."""
    chromosome, start, end = 'X', 1, 100
    seq = 'AACCGGTT'

    # Repository with the range registered
    gr = GenomicRange(chromosome, start, end, '+')
    repository = ReferenceSequenceRepository()
    repository.register_genomic_range(gr)

    # Attach the sequence to the registered coordinates
    repository.register_sequence(chromosome, start, end, seq)

    # Both retrieval paths must return the registered sequence
    assert repository.get_sequence(chromosome, start, end) == seq
    assert repository.get_genomic_range_sequence(gr) == seq
def test_get_inframe_mutations(seq, pre, suf, strand, exp_pos, exp_ref, exp_mseq):
    """Check the metadata table produced by `CDSTargeton.get_inframe_mutations`."""
    # CDS targeton starting at position 10 with the given prefix and suffix
    gr = GenomicRange('X', 10, 10 + len(seq) - 1, strand)
    targeton = CDSTargeton(PamProtectedReferenceSequence(seq, gr, seq), pre, suf)

    # Mutation metadata table
    df = targeton.get_inframe_mutations().df

    # Positions, reference sequences and mutated sequences
    assert np.array_equal(df.mut_position, np.array(exp_pos))

    ref = df.ref.astype('string').to_numpy()
    assert np.array_equal(ref, np.array(exp_ref))

    mseq = df.mseq.astype('string').to_numpy()
    assert np.array_equal(mseq, np.array(exp_mseq))

    # No new nucleotides are expected, and the variant type is a deletion
    assert df.new.isna().all()
    assert df.var_type.unique()[0] == del_var_type
def test_get_2del_mutations(offset, seq, exp_pos, exp_ref, exp_mseq, cds):
    """Check the metadata table of the deletion method selected by `offset`.

    The targeton is a `CDSTargeton` (prefix 'AA', suffix 'A') when `cds` is
    truthy and a plain `Targeton` otherwise.
    """
    gr = GenomicRange('X', 10, 10 + len(seq) - 1, '+')

    if cds:
        targeton = CDSTargeton(PamProtectedReferenceSequence(seq, gr, seq), 'AA', 'A')
    else:
        targeton = Targeton(PamProtectedReferenceSequence(seq, gr, seq))

    # The method to call is looked up by offset
    compute = getattr(targeton, del_offset_method[offset])
    df = compute().df

    # Positions, reference sequences and mutated sequences
    assert np.array_equal(df.mut_position, np.array(exp_pos))

    ref = df.ref.astype('string').to_numpy()
    assert np.array_equal(ref, np.array(exp_ref))

    mseq = df.mseq.astype('string').to_numpy()
    assert np.array_equal(mseq, np.array(exp_mseq))

    # No new nucleotides are expected, and the variant type is a deletion
    assert df.new.isna().all()
    assert df.var_type.unique()[0] == del_var_type
def test_get_snvre_aa_mutations(aa, strand, seq, exp_mseq):
    """Check the codon substitutions generated for the amino acid `aa`."""
    # CDS targeton without prefix or suffix
    gr = GenomicRange('X', 10, 10 + len(seq) - 1, strand)
    targeton = CDSTargeton(PamProtectedReferenceSequence(seq, gr, seq), '', '')

    # The substitution method is looked up by amino acid
    compute = getattr(targeton, CONST_CODON_METHODS[aa])
    df = compute(aux_tables=aux).df

    assert df.alt_aa.unique()[0] == aa
    assert np.array_equal(df.mseq.to_numpy(), np.array(exp_mseq))
    assert df.var_type.unique()[0] == sub_var_type

    # Reference triplets are those whose offset (3 * index) was mutated
    mutated_positions = df.mut_position.unique()
    expected_ref = np.array([
        triplet
        for i, triplet in enumerate(seq2triplets(seq))
        if 3 * i in mutated_positions
    ])
    assert np.array_equal(df.ref.astype('string').to_numpy(), expected_ref)
def test_reference_subsequence(ref_fp):
    """Fetch a range from a FASTA file and check its leading subsequence.

    The first `offset` nucleotides retrieved through
    `get_genomic_range_subsequence` must equal the matching prefix of the
    full range sequence.
    """
    ref = ReferenceSequenceRepository()
    fasta_file = get_fasta_file(ref_fp)

    chromosome = 'X'
    strand = '+'
    start = 41341615
    end = 41341635
    offset = 10

    ref_range = GenomicRange(chromosome, start, end, strand)
    ref.register_genomic_range(ref_range)
    ref.fetch_sequences(fasta_file)
    seq = ref.get_genomic_range_sequence(ref_range)

    # The end coordinate is inclusive (see the 1023-1027 -> five nucleotides
    # case in test_reference_sequence_repository_get_genomic_range_subsequence),
    # so the first `offset` nucleotides end at start + offset - 1.
    pre = ref.get_genomic_range_subsequence(ref_range, start, start + offset - 1)

    assert len(pre) == offset
    assert pre == seq[:offset]
def cds_seq_to_genomic_range(cds_seq, strand, plen, slen, offset=10):
    """Return the chromosome X range left after trimming both ends of `cds_seq`.

    The start is `offset + plen`; the end is
    `offset + len(cds_seq) - slen - 1`.
    `plen` and `slen` are presumably prefix and suffix lengths
    (TODO confirm against callers).
    """
    return GenomicRange(
        'X',
        offset + plen,
        offset + len(cds_seq) - slen - 1,
        strand)
# organisation for which payment is received. If you are interested in using the Software commercially, please contact
# [email protected]. Contact details are: [email protected] quoting reference Valiant-software.
#############################

import pandas as pd
from pyranges import PyRanges
import pytest
from valiant.models.base import GenomicRange
from valiant.models.exon import ExonInfo, CDSContextRepository

# Column names of the range tables built below
PYRANGES_FIELDS = ['Chromosome', 'Strand', 'Start', 'End']

# Gene and transcript identifiers shared by all CDS records
GID = 'GENE_ID_001'
TID = 'TRANSCRIPT_ID_001'

# The same range as a plain tuple, as a GenomicRange and as a one-row table
RANGE = ('X', '+', 100, 120)
GR = GenomicRange('X', 100, 120, '+')
RANGES = pd.DataFrame.from_records([RANGE], columns=PYRANGES_FIELDS)

# (chromosome, strand, start, end, gene ID, transcript ID, frame, exon index)
CDS_RANGES = [
    ('X', '+', 100, 120, GID, TID, 0, 0),
    ('X', '+', 200, 207, GID, TID, 2, 1),
    ('X', '+', 300, 304, GID, TID, 1, 2)
]

# Same records with the start coordinate decremented by one
# (presumably to match the PyRanges start convention; TODO confirm)
CDS_RANGES_DF = pd.DataFrame.from_records(
    [
        (chromosome, strand, start - 1, *rest)
        for chromosome, strand, start, *rest in CDS_RANGES
    ],
    columns=[*PYRANGES_FIELDS, 'gene_id', 'transcript_id', 'frame', 'exon_index'])
def test_genomic_range_get_from_5_prime(strand, n, exp_start, exp_end):
    """Check the subrange returned by `get_from_5_prime(n)` for the range X:100-200."""
    chromosome = 'X'
    full_range = GenomicRange(chromosome, 100, 200, strand)

    subrange = full_range.get_from_5_prime(n)
    expected = GenomicRange(chromosome, exp_start, exp_end, strand)

    assert subrange == expected
def test_genomic_range_region():
    """The region string has the form `<chromosome>:<start>-<end>`."""
    gr = GenomicRange('X', 1, 10, '+')
    assert gr.region == 'X:1-10'
def test_genomic_range_length():
    """Range length counts both bounds: positions 5 to 10 span six nucleotides."""
    gr = GenomicRange('X', 5, 10, '+')
    assert len(gr) == 6
def test_genomic_range_init(chromosome, start, end, strand, valid):
    """Construct a genomic range, expecting `ValueError` for invalid cases."""
    # Only invalid parameter sets are expected to raise
    expectation = nullcontext() if valid else pytest.raises(ValueError)

    with expectation:
        GenomicRange(chromosome, start, end, strand)
def test_genomic_range_as_unstranded_pyrange(chromosome, start, end, exp_start, strand):
    """The unstranded representation is a `(chromosome, start, end)` triple.

    The expected start is supplied separately from the constructor start;
    the end is expected to be returned unchanged.
    """
    gr = GenomicRange(chromosome, start, end, strand)
    unstranded = gr.as_unstranded_pyrange()

    assert unstranded == (chromosome, exp_start, end)
def test_genomic_range_contains_position(pos, exp):
    """Check `contains_position` against the range X:100-200."""
    target = GenomicRange('X', 100, 200, '+')
    result = target.contains_position(pos)

    assert result == exp
chromosome, exp_start, exp_end, strand)


@pytest.mark.parametrize('strand,n,exp_start,exp_end', [
    ('-', 3, 100, 102),
    ('+', 3, 198, 200)
])
def test_genomic_range_get_from_3_prime(strand, n, exp_start, exp_end):
    """Check the subrange returned by `get_from_3_prime(n)` for the range X:100-200.

    Per the parametrised cases, three nucleotides from the 3' end are
    100-102 on the minus strand and 198-200 on the plus strand.
    """
    chromosome = 'X'
    gr = GenomicRange(chromosome, 100, 200, strand)
    assert gr.get_from_3_prime(n) == GenomicRange(
        chromosome, exp_start, exp_end, strand)


@pytest.mark.parametrize('parent,child,exp_range,valid', [
    (GenomicRange('X', 100, 200, '+'), GenomicRange('X', 150, 160, '+'), (50, 61), True),
    (GenomicRange('X', 100, 200, '+'), GenomicRange('X', 20, 160, '+'), None, False),
    (GenomicRange('X', 100, 200, '+'), GenomicRange('X', 150, 160, '-'), None, False)
])
def test_genomic_range_get_relative_subrange(parent, child, exp_range, valid):
    """Check `parent.get_relative_subrange(child)` for valid and invalid children.

    The valid case maps 150-160 within 100-200 to `(50, 61)`. The invalid
    cases (a child starting before the parent, a child on the opposite
    strand) are expected to raise `ValueError`.
    """
    with pytest.raises(ValueError) if not valid else nullcontext():
        assert exp_range == parent.get_relative_subrange(child)


@pytest.mark.parametrize('pos,exp', [
    (GenomicPosition('X', 100), True),
    (GenomicPosition('X', 200), True),
    (GenomicPosition('X', 150), True),
    (GenomicPosition('X', 300), False),
    (GenomicPosition('Y', 100), False)
def get_pam_protected_sequence(seq, pam_protection, chromosome='X', strand='+', pos=1):
    """Build a PAM-protected reference sequence for `seq` starting at `pos`.

    When `pam_protection` is truthy, the PAM-protected sequence is obtained
    from `get_dummy_pam_protected(seq)`; otherwise it is `seq` itself.
    """
    # The end coordinate is inclusive and must follow the start position,
    # so that the range spans len(seq) nucleotides for any value of `pos`
    # (the previous end, len(seq), was only correct for pos == 1).
    gr = GenomicRange(chromosome, pos, pos + len(seq) - 1, strand)
    ref_seq = ReferenceSequence(seq, gr)

    pam_seq = get_dummy_pam_protected(seq) if pam_protection else seq
    return PamProtectedReferenceSequence.from_reference_sequence(ref_seq, pam_seq)