Exemplo n.º 1
0
def test_reference_sequence_repository_register_genomic_ranges():
    """Registering several ranges at once sets the region count accordingly.

    Two ranges on chromosome X are registered in a fresh repository, whose
    ``region_count`` must then equal the number of ranges passed in.
    """
    repository = ReferenceSequenceRepository()
    ranges = [
        GenomicRange('X', first, last, '+')
        for first, last in ((1, 100), (500, 800))
    ]

    repository.register_genomic_ranges(ranges)

    assert repository.region_count == len(ranges)
Exemplo n.º 2
0
def test_genomic_range_contains(a_range, b_range, b_contains_a):
    """Check the ``in`` operator between two ranges on the same strand.

    ``a_range`` and ``b_range`` are unpacked into the positional arguments
    that sit between chromosome and strand in the ``GenomicRange``
    constructor; ``b_contains_a`` states whether ``a in b`` is expected.
    """
    def build(bounds):
        return GenomicRange('X', *bounds, '+')

    a = build(a_range)
    b = build(b_range)

    # A range is always contained in itself
    assert a in a

    # Containment of a in b must match the expectation...
    if b_contains_a:
        assert a in b
    else:
        assert a not in b

    # ...and must never hold in the opposite direction
    assert b not in a
Exemplo n.º 3
0
def test_reference_sequence_get_subsequence():
    """A subsequence carries both the requested range and the matching bases."""
    full_range = GenomicRange('X', 100, 105, '+')
    sub_range = GenomicRange('X', 100, 102, '+')

    # Reference sequence spanning the full range
    reference = ReferenceSequence('AAACCC', full_range)

    # Subsequence restricted to the first three positions
    extracted = reference.get_subsequence(sub_range)

    assert extracted.genomic_range == sub_range
    assert extracted.sequence == 'AAA'
Exemplo n.º 4
0
def test_oligo_compute_mutations(targetons, mutator, pam_protection):
    """Compute mutations for each target segment of an oligonucleotide template.

    The targeton sequences are concatenated into one reference sequence
    starting at position 1 on chromosome X, and each targeton becomes a
    segment mutated with ``mutator`` only. When ``pam_protection`` is set,
    DEL1 mutated sequences must consist solely of the dummy PAM protection
    nucleotide, and so must SNV mutated sequences once the new nucleotide
    is set aside.
    """
    table = CodonTable.load(get_data_file_path(CODON_TABLE_FP))

    full_seq = ''.join(targetons)
    total_length = sum(len(seq) for seq in targetons)
    pam_ref_seq = PamProtectedReferenceSequence(
        full_seq, GenomicRange('X', 1, total_length, '+'), full_seq)

    adaptor_5 = 'AAAAAA'
    adaptor_3 = 'AAAAAA'

    # One segment per targeton, each carrying the single mutator under test
    segments = [
        TargetonOligoSegment(get_targeton(seq, pam_protection), {mutator})
        for seq in targetons
    ]

    template = OligoTemplate(
        TRANSCRIPT_INFO, pam_ref_seq, set(), set(),
        adaptor_5, adaptor_3, segments)

    protected_only = {DUMMY_PAM_PROTECTION_NT}
    for _, segment in template.target_segments:
        collection = segment.compute_mutations(table)[mutator]

        # Sequence content is only constrained when PAM protection is on
        if not pam_protection:
            continue

        if mutator == TargetonMutator.DEL1:
            assert all(
                set(m.sequence) == protected_only
                for m in collection.mutations)
        elif mutator == TargetonMutator.SNV:
            assert all(
                set(m.sequence) - {m.new} == protected_only
                for m in collection.mutations)
Exemplo n.º 5
0
def test_reference_sequence_init(seq, start, end, valid):
    """Construct a reference sequence, expecting ``ValueError`` when invalid."""
    gr = GenomicRange('X', start, end, '+')

    # Only invalid combinations are expected to raise
    expectation = nullcontext() if valid else pytest.raises(ValueError)

    with expectation:
        ReferenceSequence(seq, gr)
Exemplo n.º 6
0
def test_utr_repository_get_transcript_infos():
    """Transcript information is returned keyed by genomic range.

    A single entry is expected, mapping the chrX:800-1200 (+) range to the
    G1/T1 transcript.
    """
    repository = UTRRepository(UTR_RANGES)
    transcript_infos = repository.get_transcript_infos(REF_RANGES)

    assert len(transcript_infos) == 1

    expected_range = GenomicRange('chrX', 800, 1200, '+')
    expected_info = TranscriptInfo('G1', 'T1')
    assert transcript_infos[expected_range] == expected_info
Exemplo n.º 7
0
def test_cds_context_repository_compute_cds_contexts(start, end, exp_ext_5, exp_ext_3):
    """Compute the CDS context of one target range and check its contents.

    ``exp_ext_5`` and ``exp_ext_3`` are either ``None`` (no extension
    expected) or the bounds of the expected extension range on the same
    chromosome and strand as the target.
    """
    cds_ranges = PyRanges(df=CDS_RANGES_DF)

    chromosome, strand = 'X', '+'
    gr = GenomicRange(chromosome, start, end, strand)

    # Single-row table holding the target range
    target_df = pd.DataFrame.from_records(
        [gr.as_pyrange()], columns=PYRANGES_FIELDS)
    target_ranges = PyRanges(df=target_df)
    target_ranges.is_const = False

    # Set up the repository and compute the contexts
    ccr = CDSContextRepository(cds_ranges)
    ccr.register_target_ranges(target_ranges)
    ccr.compute_cds_contexts()

    # Exactly one context is expected, keyed by the target range
    assert len(ccr._target_cds_contexts) == 1
    exon_info, (ext_5, ext_3) = ccr._target_cds_contexts[gr]

    # Exon information
    assert isinstance(exon_info, ExonInfo)
    assert exon_info.gene_id == GID
    assert exon_info.transcript_id == TID

    # CDS extensions, 5' first and then 3'
    for actual, expected_bounds in ((ext_5, exp_ext_5), (ext_3, exp_ext_3)):
        if expected_bounds is None:
            assert actual is None
        else:
            assert actual == GenomicRange(chromosome, *expected_bounds, strand)

    # Accessors must agree with the stored context
    assert ccr.get_cds_extensions(gr) == (ext_5, ext_3)
    assert ccr.get_exon_info(gr) == exon_info
    assert ccr.get_transcript_info(gr) == exon_info.transcript_info
Exemplo n.º 8
0
def test_reference_sequence_repository_get_genomic_range_subsequence():
    """Subsequences of a registered range are sliced by genomic coordinates.

    A ten-base sequence is stored for X:1023-1032; positions 1023-1027 must
    return its first five bases and 1028-1032 its last five.
    """
    a, b = 'AAAAA', 'GGGGG'
    sequence = a + b
    genomic_range = GenomicRange('X', 1023, 1032, '+')

    rsr = ReferenceSequenceRepository()
    rsr.register_genomic_range(genomic_range)

    # Write the sequence straight into the repository's private store
    bounds = (genomic_range.start, genomic_range.end)
    rsr._sequences[genomic_range.chromosome][bounds] = sequence

    first_half = rsr.get_genomic_range_subsequence(genomic_range, 1023, 1027)
    assert first_half == a

    second_half = rsr.get_genomic_range_subsequence(genomic_range, 1028, 1032)
    assert second_half == b

    # Fixture consistency check; holds by construction of `sequence`
    assert a + b == sequence
Exemplo n.º 9
0
def test_compute_pam_protected_sequence(seq, pos, ref, alt, ppseq, valid):
    """Apply a single PAM variant to a reference sequence starting at X:100.

    Invalid cases must raise; valid ones must leave the reference sequence
    untouched and yield ``ppseq`` as the PAM-protected sequence.
    """
    chromosome = 'X'
    start = 100
    end = start + len(seq) - 1

    variant = PamVariant(GenomicPosition(chromosome, pos), ref, alt)
    reference = ReferenceSequence(
        seq, GenomicRange(chromosome, start, end, '+'))

    if not valid:
        # Any exception type is accepted for invalid input
        with pytest.raises(Exception):
            compute_pam_protected_sequence(reference, {variant})
        return

    protected = compute_pam_protected_sequence(reference, {variant})
    assert protected.sequence == reference.sequence
    assert protected.pam_protected_sequence == ppseq
Exemplo n.º 10
0
def test_snv_table_get_snvs_triplet(triplet, alt_triplets, strand,
                                    mut_types_plus, mut_types_minus):
    """Check the SNV metadata generated for a single triplet at X:10-12.

    The translation function and the expected mutation types are chosen
    according to ``strand``.
    """
    gr = GenomicRange('X', 10, 12, strand)

    # Strand-specific translation and expectations
    if strand == '+':
        translate, expected_types = codon_table.translate, mut_types_plus
    else:
        translate, expected_types = codon_table.translate_rc, mut_types_minus

    # Retrieve the SNV metadata in a deterministic row order
    snvs = snv_table.get_snvs(strand, triplet, gr, 0, 0, reset_index=True)
    snvs = snvs.sort_values(['pos', 'alt']).reset_index(drop=True)

    # All rows must share one reference amino acid
    ref_aas = snvs.ref_aa.unique().astype('string')
    assert len(ref_aas) == 1
    assert ref_aas[0] == translate(triplet)

    # Alternative amino acids and mutation types
    expected_alt_aas = np.array([translate(alt) for alt in alt_triplets])
    assert np.array_equal(snvs.alt_aa, expected_alt_aas)
    assert np.array_equal(snvs.mut_type, expected_types)
Exemplo n.º 11
0
def test_reference_sequence_repository_register_get_sequence():
    """A registered sequence can be read back by coordinates or by range."""
    chromosome, start, end = 'X', 1, 100
    seq = 'AACCGGTT'
    gr = GenomicRange(chromosome, start, end, '+')

    # Register the range first, then the sequence
    repository = ReferenceSequenceRepository()
    repository.register_genomic_range(gr)
    repository.register_sequence(chromosome, start, end, seq)

    # Both retrieval paths must return the stored sequence
    by_coordinates = repository.get_sequence(chromosome, start, end)
    assert by_coordinates == seq

    by_range = repository.get_genomic_range_sequence(gr)
    assert by_range == seq
Exemplo n.º 12
0
def test_get_inframe_mutations(seq, pre, suf, strand, exp_pos, exp_ref,
                               exp_mseq):
    """Check the metadata of in-frame mutations for a CDS targeton.

    The targeton starts at position 10 on chromosome X, spans ``seq`` and
    is built with ``pre`` and ``suf`` as its extra constructor arguments.
    """
    # Build the CDS targeton
    last = 10 + len(seq) - 1
    gr = GenomicRange('X', 10, last, strand)
    targeton = CDSTargeton(PamProtectedReferenceSequence(seq, gr, seq), pre, suf)

    # Generate the in-frame mutations
    df = targeton.get_inframe_mutations().df

    # Positions, reference sequences and mutated sequences
    assert np.array_equal(df.mut_position, np.array(exp_pos))

    refs = df.ref.astype('string').to_numpy()
    assert np.array_equal(refs, np.array(exp_ref))

    mseqs = df.mseq.astype('string').to_numpy()
    assert np.array_equal(mseqs, np.array(exp_mseq))

    # No new sequence is set, and the first variant type is the deletion type
    assert df.new.isna().all()
    assert df.var_type.unique()[0] == del_var_type
Exemplo n.º 13
0
def test_get_2del_mutations(offset, seq, exp_pos, exp_ref, exp_mseq, cds):
    """Check the metadata of the deletions generated for a given offset.

    The targeton starts at position 10 on chromosome X (plus strand); it is
    a CDS targeton when ``cds`` is set and a plain targeton otherwise. The
    method to call is looked up in ``del_offset_method`` by ``offset``.
    """
    # Build the targeton
    gr = GenomicRange('X', 10, 10 + len(seq) - 1, '+')
    pam_seq = PamProtectedReferenceSequence(seq, gr, seq)
    if cds:
        targeton = CDSTargeton(pam_seq, 'AA', 'A')
    else:
        targeton = Targeton(pam_seq)

    # Generate the deletions through the offset-specific method
    generate = getattr(targeton, del_offset_method[offset])
    df = generate().df

    # Positions, reference sequences and mutated sequences
    assert np.array_equal(df.mut_position, np.array(exp_pos))

    refs = df.ref.astype('string').to_numpy()
    assert np.array_equal(refs, np.array(exp_ref))

    mseqs = df.mseq.astype('string').to_numpy()
    assert np.array_equal(mseqs, np.array(exp_mseq))

    # No new sequence is set, and the first variant type is the deletion type
    assert df.new.isna().all()
    assert df.var_type.unique()[0] == del_var_type
Exemplo n.º 14
0
def test_get_snvre_aa_mutations(aa, strand, seq, exp_mseq):
    """Check the codon substitutions generated for a target amino acid.

    The method to call is looked up in ``CONST_CODON_METHODS`` by ``aa``.
    Reference values must be the triplets of ``seq`` whose offset appears
    among the mutation positions.
    """
    # Build a CDS targeton with empty extension arguments
    gr = GenomicRange('X', 10, 10 + len(seq) - 1, strand)
    targeton = CDSTargeton(PamProtectedReferenceSequence(seq, gr, seq), '', '')

    # Generate the codon substitution mutations
    generate = getattr(targeton, CONST_CODON_METHODS[aa])
    df = generate(aux_tables=aux).df

    # Amino acid, mutated sequences and variant type
    assert df.alt_aa.unique()[0] == aa
    assert np.array_equal(df.mseq.to_numpy(), np.array(exp_mseq))
    assert df.var_type.unique()[0] == sub_var_type

    # Reference triplets, restricted to the positions that were mutated
    mutated_positions = df.mut_position.unique()
    expected_refs = np.array([
        triplet
        for index, triplet in enumerate(seq2triplets(seq))
        if 3 * index in mutated_positions
    ])
    assert np.array_equal(df.ref.astype('string').to_numpy(), expected_refs)
Exemplo n.º 15
0
def test_reference_subsequence(ref_fp):
    """Fetch a range from a FASTA file and compare a leading subsequence.

    The range X:41341615-41341635 (+) is registered and fetched; the
    subsequence requested from its start must equal the first ``offset``
    characters of the full sequence.
    """
    repository = ReferenceSequenceRepository()
    fasta_file = get_fasta_file(ref_fp)

    chromosome, strand = 'X', '+'
    start, end = 41341615, 41341635
    ref_range = GenomicRange(chromosome, start, end, strand)

    # Register the range and load its sequence from the FASTA file
    repository.register_genomic_range(ref_range)
    repository.fetch_sequences(fasta_file)
    seq = repository.get_genomic_range_sequence(ref_range)

    # NOTE(review): if the end coordinate passed here is inclusive, the
    # request spans offset + 1 positions while the expectation below keeps
    # only `offset` characters -- confirm against the repository semantics.
    offset = 10
    pre = repository.get_genomic_range_subsequence(
        ref_range, start, start + offset)

    # Debug output, shown by pytest when the test fails
    for item in (seq, pre, len(pre)):
        print(item)

    assert pre == seq[:offset]
Exemplo n.º 16
0
def cds_seq_to_genomic_range(cds_seq, strand, plen, slen, offset=10):
    """Return the chromosome X range derived from a sequence and two lengths.

    The range starts at ``offset + plen`` and ends at
    ``offset + len(cds_seq) - slen - 1``.

    NOTE(review): ``plen`` and ``slen`` are presumably the lengths of a
    prefix and a suffix of ``cds_seq`` to exclude -- confirm with callers.
    """
    return GenomicRange(
        'X',
        offset + plen,
        offset + len(cds_seq) - slen - 1,
        strand)
Exemplo n.º 17
0
# organisation for which payment is received. If you are interested in using the Software commercially, please contact
# [email protected]. Contact details are: [email protected] quoting reference Valiant-software.
#############################

import pandas as pd
from pyranges import PyRanges
import pytest
from valiant.models.base import GenomicRange
from valiant.models.exon import ExonInfo, CDSContextRepository

# Column names shared by the range tables below
PYRANGES_FIELDS = ['Chromosome', 'Strand', 'Start', 'End']

# Gene and transcript identifiers attached to every CDS range
GID = 'GENE_ID_001'
TID = 'TRANSCRIPT_ID_001'

# The same range as a raw tuple (in PYRANGES_FIELDS order) and as an object
RANGE = ('X', '+', 100, 120)
GR = GenomicRange('X', 100, 120, '+')

# Single-row table holding RANGE
RANGES = pd.DataFrame.from_records([RANGE], columns=PYRANGES_FIELDS)

# (chromosome, strand, start, end, gene ID, transcript ID, frame, exon index)
CDS_RANGES = [
    ('X', '+', 100, 120, GID, TID, 0, 0),
    ('X', '+', 200, 207, GID, TID, 2, 1),
    ('X', '+', 300, 304, GID, TID, 1, 2),
]

# Same records as CDS_RANGES with every start position decremented by one
CDS_RANGES_DF = pd.DataFrame.from_records(
    [
        (chrom, strand, start - 1, *rest)
        for chrom, strand, start, *rest in CDS_RANGES
    ],
    columns=[*PYRANGES_FIELDS, 'gene_id', 'transcript_id', 'frame', 'exon_index'])
Exemplo n.º 18
0
def test_genomic_range_get_from_5_prime(strand, n, exp_start, exp_end):
    """``get_from_5_prime(n)`` on X:100-200 returns the expected range."""
    chromosome = 'X'
    source = GenomicRange(chromosome, 100, 200, strand)

    actual = source.get_from_5_prime(n)
    expected = GenomicRange(chromosome, exp_start, exp_end, strand)

    assert actual == expected
Exemplo n.º 19
0
def test_genomic_range_region():
    """The region string of X:1-10 (+) is ``'X:1-10'``."""
    gr = GenomicRange('X', 1, 10, '+')
    assert gr.region == 'X:1-10'
Exemplo n.º 20
0
def test_genomic_range_length():
    """A range from 5 to 10 has length 6, i.e. both ends are counted."""
    gr = GenomicRange('X', 5, 10, '+')
    assert len(gr) == 6
Exemplo n.º 21
0
def test_genomic_range_init(chromosome, start, end, strand, valid):
    """Construct a genomic range, expecting ``ValueError`` when invalid."""
    # Only invalid argument combinations are expected to raise
    expectation = nullcontext() if valid else pytest.raises(ValueError)

    with expectation:
        GenomicRange(chromosome, start, end, strand)
Exemplo n.º 22
0
def test_genomic_range_as_unstranded_pyrange(chromosome, start, end, exp_start, strand):
    """The unstranded tuple is ``(chromosome, exp_start, end)``."""
    gr = GenomicRange(chromosome, start, end, strand)
    unstranded = gr.as_unstranded_pyrange()
    assert unstranded == (chromosome, exp_start, end)
Exemplo n.º 23
0
def test_genomic_range_contains_position(pos, exp):
    """``contains_position`` on X:100-200 (+) returns ``exp`` for ``pos``."""
    target = GenomicRange('X', 100, 200, '+')
    result = target.contains_position(pos)
    assert result == exp
Exemplo n.º 24
0
        chromosome, exp_start, exp_end, strand)


@pytest.mark.parametrize(
    'strand,n,exp_start,exp_end',
    [
        ('-', 3, 100, 102),
        ('+', 3, 198, 200),
    ],
)
def test_genomic_range_get_from_3_prime(strand, n, exp_start, exp_end):
    """``get_from_3_prime(n)`` on X:100-200 returns the expected range."""
    chromosome = 'X'
    source = GenomicRange(chromosome, 100, 200, strand)

    actual = source.get_from_3_prime(n)
    expected = GenomicRange(chromosome, exp_start, exp_end, strand)

    assert actual == expected


@pytest.mark.parametrize(
    'parent,child,exp_range,valid',
    [
        # Child fully inside the parent
        (GenomicRange('X', 100, 200, '+'), GenomicRange('X', 150, 160, '+'), (50, 61), True),
        # Child starting before the parent
        (GenomicRange('X', 100, 200, '+'), GenomicRange('X', 20, 160, '+'), None, False),
        # Child on the opposite strand
        (GenomicRange('X', 100, 200, '+'), GenomicRange('X', 150, 160, '-'), None, False),
    ],
)
def test_genomic_range_get_relative_subrange(parent, child, exp_range, valid):
    """Relative subrange of a child range, or ``ValueError`` when invalid."""
    if valid:
        assert exp_range == parent.get_relative_subrange(child)
    else:
        with pytest.raises(ValueError):
            parent.get_relative_subrange(child)


@pytest.mark.parametrize('pos,exp', [
    (GenomicPosition('X', 100), True),
    (GenomicPosition('X', 200), True),
    (GenomicPosition('X', 150), True),
    (GenomicPosition('X', 300), False),
    (GenomicPosition('Y', 100), False)
Exemplo n.º 25
0
def get_pam_protected_sequence(seq, pam_protection, chromosome='X', strand='+', pos=1):
    """Build a PAM-protected reference sequence for ``seq``.

    Args:
        seq: Reference nucleotide sequence.
        pam_protection: When truthy, the PAM-protected sequence is the one
            returned by ``get_dummy_pam_protected(seq)``; otherwise it is
            ``seq`` itself.
        chromosome: Chromosome of the genomic range.
        strand: Strand of the genomic range.
        pos: Start position of the genomic range.

    Returns:
        The ``PamProtectedReferenceSequence`` built from the reference
        sequence spanning ``pos`` to ``pos + len(seq) - 1``.
    """
    # Range ends are inclusive, so the end has to be derived from the start
    # position; using len(seq) alone is only correct when pos == 1.
    gr = GenomicRange(chromosome, pos, pos + len(seq) - 1, strand)
    ref_seq = ReferenceSequence(seq, gr)

    protected_seq = get_dummy_pam_protected(seq) if pam_protection else seq
    return PamProtectedReferenceSequence.from_reference_sequence(
        ref_seq, protected_seq)