Exemplo n.º 1
0
def variant_queryable():
    """Build a `VariantIntervalQueryable` backed by the test VCF.

    The queryable holds two (variants, interval) entries: two chr1
    variants (the second carrying the ``q10`` filter) paired with
    chr1:10-20, and one chr2 variant paired with chr2:110-200.
    """
    source_vcf = MultiSampleVCF(vcf_file)

    chr1_entry = (
        [
            Variant('chr1', 12, 'A', 'T'),
            Variant('chr1', 18, 'A', 'C', filter='q10'),
        ],
        Interval('chr1', 10, 20),
    )
    chr2_entry = (
        [Variant('chr2', 120, 'AT', 'AAAT')],
        Interval('chr2', 110, 200),
    )
    return VariantIntervalQueryable(source_vcf, [chr1_entry, chr2_entry])
Exemplo n.º 2
0
def test_MultiSampleVCF__regions_from_variants(multi_sample_vcf):
    """Check the intervals `_regions_from_variants` derives from variants.

    The two nearby chr1 variants are expected to share one region, while
    the distant chr1 variant and the chr10 variant get their own.
    """
    query = [
        Variant('chr1', 4, 'T', 'C'),
        Variant('chr1', 25, 'AACG', 'GA'),
        Variant('chr1', 55525, 'AACG', 'GA'),
        Variant('chr10', 55525, 'AACG', 'GA'),
    ]
    observed = multi_sample_vcf._regions_from_variants(query)

    expected = {
        Interval('chr1', 3, 25),
        Interval('chr1', 55524, 55525),
        Interval('chr10', 55524, 55525),
    }
    # Compared as sets: the order of the returned regions is not checked.
    assert set(observed) == expected
Exemplo n.º 3
0
    def get_variants(self, variants: Iterable[Union[str, Variant]],
                     regions=None, variant_gap=150) -> List[Variant]:
        """Returns list of variants from vcf file. Lets you use vcf file as dict.

        # Arguments:
            variants: iterable of variants, given either as variant objects
              or as strings that are parsed with `Variant.from_str`.
            regions: list of regions to seek for variants.
              Automatically generated from variants if not given
              (or if empty).
            variant_gap: forwarded to `_regions_from_variants` when the
              regions have to be generated.

        # Returns
           List aligned with `variants`: for each requested variant the
           matching variant fetched from the vcf file, or `None` when no
           match was found in the searched regions.
        """
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses as variant ids.
        variants = [
            Variant.from_str(v) if isinstance(v, str) else v
            for v in variants
        ]
        regions = regions or self._regions_from_variants(
            variants, variant_gap=variant_gap)

        # Index every fetched variant by itself so that the requested
        # variants can be looked up by equality/hash below.
        variant_map = {}
        for r in regions:
            for v in self.fetch_variants(r):
                variant_map[v] = v

        return [variant_map.get(v) for v in variants]
Exemplo n.º 4
0
 def _variants_from_cyvcf2(self, cy_variant):
     """Yield one `Variant` per ALT allele of a cyvcf2 record.

     Alleles containing ``N`` or ``*`` are reported with a warning and
     skipped instead of being yielded.
     """
     # A record without ALT (the original note mentions deletions) is
     # handled as having a single empty ALT allele.
     alleles = cy_variant.ALT or ['']
     # One REF may come with several ALT alleles; each becomes a variant.
     for allele in alleles:
         candidate = Variant.from_cyvcf_and_given_alt(cy_variant, allele)
         is_undefined = 'N' in allele or '*' in allele
         if not is_undefined:
             yield candidate
         else:
             logging.warning(
                 'Undefined variant %s are not supported: Skip'
                 % str(candidate))
Exemplo n.º 5
0
def test_MultiSampleVCF_get_variant(multi_sample_vcf):
    """`get_variant` accepts a string id or a `Variant`; a miss raises KeyError."""

    def check_is_chr1_4_T_to_C(found):
        # Shared expectations for both lookup styles below.
        assert found.chrom == 'chr1'
        assert found.pos == 4
        assert found.ref == 'T'
        assert found.alt == 'C'

    # Lookup by string id.
    check_is_chr1_4_T_to_C(multi_sample_vcf.get_variant("chr1:4:T>C"))

    # Lookup by variant object.
    check_is_chr1_4_T_to_C(
        multi_sample_vcf.get_variant(Variant('chr1', 4, 'T', 'C')))

    # Same position but a different reference allele must not match.
    with pytest.raises(KeyError):
        multi_sample_vcf.get_variant("chr1:4:A>C")
Exemplo n.º 6
0
    def _get_sample_variants(self, variants, sample, phase):
        """Select the variants carried by one sample on one phase.

        Each `cyvcf2.Variant` whose genotype entry for the sample and
        phase is truthy is converted to `kipoiseq.dataclasses.Variant`.

        Args:
          variants: List of `cyvcf2.Variant`, Variants of interest
          sample: `str`, Sample for which to filter genotypes
          phase: `0` or `1`, Phase for which to filter genotypes

        Returns:
          List of `kipoiseq.dataclasses.Variant`
        """
        column = self._sample_indices[sample]
        selected = []
        for record in variants:
            # NOTE(review): any truthy genotype value passes this filter;
            # confirm how missing calls are encoded by the VCF reader.
            if record.genotypes[column][phase]:
                selected.append(Variant.from_cyvcf(record))
        return selected
Exemplo n.º 7
0
    def get_variant(self, variant: Union[Variant, str]) -> Variant:
        """Returns variant from vcf file. Lets you use vcf file as dict.

        # Arguments:
            variant: variant object or variant id as string
              (parsed with `Variant.from_str`).

        # Returns
            Variant object fetched from the vcf file whose `ref` and `alt`
            match the requested variant.

        # Raises
            KeyError: if none of the variants fetched at that position has
              a matching `ref` and `alt`.

        # Example
            ```python
              >>> MultiSampleVCF(vcf_path).get_variant("chr1:4:T>C")
            ```
        """
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses as variant ids.
        if isinstance(variant, str):
            variant = Variant.from_str(variant)

        # Only the interval from pos - 1 to pos is queried; the candidates
        # found there are then matched on ref/alt.
        candidates = self.fetch_variants(
            Interval(variant.chrom, variant.pos - 1, variant.pos))
        for v in candidates:
            if v.ref == variant.ref and v.alt == variant.alt:
                return v
        raise KeyError('Variant %s not found in vcf file.' % str(variant))
Exemplo n.º 8
0
import pytest
from conftest import vcf_file, gtf_file, example_intervals_bed
import pyranges
from kipoiseq.dataclasses import Interval, Variant
from kipoiseq.extractors.vcf import MultiSampleVCF
from kipoiseq.extractors.vcf_matching import variants_to_pyranges, \
    pyranges_to_intervals, intervals_to_pyranges, BaseVariantMatcher, \
    SingleVariantMatcher, MultiVariantsMatcher, VariantFetcher

# Module-level fixtures shared by the tests in this file.

# Two chr1 intervals, one on each strand.
intervals = [
    Interval('chr1', 1, 10, strand='+'),
    Interval('chr1', 23, 30, strand='-')
]

# Three chr1 variants: a single-base substitution, one whose alt is longer
# than its ref, and one whose alt is shorter than its ref.
variants = [
    Variant('chr1', 4, 'T', 'C'),
    Variant('chr1', 5, 'A', 'GA'),
    Variant('chr1', 25, 'AACG', 'GA')
]

# PyRanges with three chr1 ranges; the first two repeat the coordinates and
# strands of `intervals`, the third (5-50) has strand '.'.
pr = pyranges.PyRanges(chromosomes='chr1',
                       starts=[1, 23, 5],
                       ends=[10, 30, 50],
                       strands=['+', '-', '.'])


class VariantFetcherProxy(VariantFetcher):
    def __init__(self, variant_fetcher: VariantFetcher):
        self.variant_fetcher = variant_fetcher

    def fetch_variants(
Exemplo n.º 9
0
 def __next__(self):
     """Return the parent iterator's next record converted with `Variant.from_cyvcf`."""
     raw_record = super().__next__()
     return Variant.from_cyvcf(raw_record)
Exemplo n.º 10
0
def test_extract(variant_seq_extractor):
    """Check `extract` output for a series of chr1 intervals and anchors.

    Covers fixed-length extraction (result length equals interval
    length), extraction with ``fixed_len=False``, and an extractor built
    on a strand-aware reference extractor.
    """
    variants = [Variant.from_cyvcf(record) for record in VCF(vcf_file)]

    # (start, end, extra Interval kwargs, anchor, expected sequence)
    fixed_len_cases = [
        (2, 9, {}, 5, 'CGAACGT'),
        (2, 9, {'strand': '-'}, 5, 'ACGTTCG'),
        (4, 14, {}, 7, 'AACGTAACGT'),
        (4, 14, {}, 4, 'GAACGTAACG'),
        (2, 5, {}, 3, 'GCG'),
        (24, 34, {}, 27, 'TGATAACGTA'),
        (25, 35, {}, 34, 'TGATAACGTA'),
        (34, 44, {}, 37, 'AACGTAACGT'),
        (34, 44, {}, 100, 'AACGTAACGT'),
    ]
    for start, end, extra, anchor, expected in fixed_len_cases:
        interval = Interval('chr1', start, end, **extra)
        seq = variant_seq_extractor.extract(interval, variants, anchor=anchor)
        assert len(seq) == interval.end - interval.start
        assert seq == expected

    # (start, end, expected sequence); all on '+' with anchor=10 and no
    # length check because fixed_len is disabled.
    variable_len_cases = [
        (5, 11, 'ACGTAA'),
        (0, 3, 'ACG'),
    ]
    for start, end, expected in variable_len_cases:
        interval = Interval('chr1', start, end, strand='+')
        seq = variant_seq_extractor.extract(
            interval, variants, anchor=10, fixed_len=False)
        assert seq == expected

    # Same query as the last case, but through an extractor constructed
    # here on top of a reference extractor created with use_strand=True.
    interval = Interval('chr1', 0, 3, strand='+')
    strand_aware_reference = FastaStringExtractor(fasta_file, use_strand=True)
    extractor = VariantSeqExtractor(reference_sequence=strand_aware_reference)
    seq = extractor.extract(interval, variants, anchor=10, fixed_len=False)
    assert seq == 'ACG'
Exemplo n.º 11
0
 def fetch_variants(self, interval, sample_id=None):
     """Yield `Variant` objects for the records found in `interval`.

     When `sample_id` is given, a variant is yielded only if
     `self.has_variant(variant, sample_id)` is truthy; otherwise every
     variant in the interval is yielded.
     """
     records = self(self._region(interval))
     for record in records:
         variant = Variant.from_cyvcf(record)
         # Skip variants the requested sample does not have.
         if sample_id is not None and not self.has_variant(variant, sample_id):
             continue
         yield variant
Exemplo n.º 12
0
def test_variant():
    """Exercise `Variant`: defaults, copy, str/hash round trip, read-only
    fields, writable fields, pos coercion and construction from cyvcf2."""
    v = Variant("chr1", 10, 'C', 'T')

    # Core coordinates and alleles; start is pos - 1.
    assert v.start == 9
    assert (v.chrom, v.pos, v.ref, v.alt) == ('chr1', 10, 'C', 'T')

    # Defaults of the optional fields.
    assert isinstance(v.info, dict)
    assert len(v.info) == 0
    assert v.qual == 0
    assert v.filter == 'PASS'

    # info is writable in place.
    v.info['test'] = 10
    assert v.info['test'] == 10
    assert isinstance(str(v), str)

    # A copy must not observe later changes made to the original.
    snapshot = v.copy()
    v.info['test'] = 20
    assert snapshot.info['test'] == 10
    v.__repr__()

    # __str__ / from_str round trip preserves equality ...
    assert v == Variant.from_str(str(v))

    # ... and the hash.
    assert isinstance(hash(v), int)
    assert hash(v) == hash(Variant.from_str(str(v)))

    # Fixed arguments: assignment must raise AttributeError.
    read_only = [('chrom', 'asd'), ('pos', 10), ('ref', 'asd'), ('alt', 'asd')]
    for attribute, new_value in read_only:
        with pytest.raises(AttributeError):
            setattr(v, attribute, new_value)

    # Non-fixed arguments: assignment is allowed.
    v.id = 'asd'
    v.qual = 10
    v.filter = 'asd'
    v.source = 2

    # A string position is converted to int.
    assert isinstance(Variant("chr1", '10', 'C', 'T').pos, int)

    # Construction from the first cyvcf2 record keeps it as the source.
    reader = cyvcf2.VCF('tests/data/test.vcf.gz')
    first_record = list(reader)[0]

    converted = Variant.from_cyvcf(first_record)
    assert isinstance(converted.source, cyvcf2.Variant)