示例#1
0
    def test_haplotypes(self):
        # A set mixing a haplotype with loose variants is rebuilt from its own
        # iterator; the copy, and a set grouping the same four edits into two
        # haplotypes, are then expected to produce empty diffs.
        #
        # REF      TGG   TT    G-
        #          2093  2099  3200
        # varset1  CCC   GG
        #                CC    GG
        #          ccc  tt_gg,tt_cc  g_gg
        ccc = factory.from_edit('chr24', 2093, 'TGG', 'CCC')
        tt_gg = factory.from_edit('chr24', 2098, 'TT', 'GG')
        tt_cc = factory.from_edit('chr24', 2098, 'TT', 'CC')
        g_gg = factory.from_edit('chr24', 3200, 'G', 'GG')

        mixed = VariantSet.from_variants(
            [variant.Haplotype.from_variants([ccc, tt_gg]), tt_cc, g_gg])
        rebuilt = VariantSet.from_variants(mixed.iter_vrt())

        hits = list(rebuilt.find_vrt('chr24', 2090, 2095, expand=True))
        self.assertEqual(len(hits), 2)
        self.assertEqual(rebuilt.nof_unit_vrt(), 8)
        self.assertEqual(mixed.diff(rebuilt).nof_unit_vrt(), 0)
        leftovers = list(rebuilt.diff_vrt(mixed).iter_vrt())
        self.assertEqual(len(leftovers), 0)

        # Same edits, this time both pairs wrapped in haplotypes.
        paired = VariantSet.from_variants([
            variant.Haplotype.from_variants([ccc, tt_gg]),
            variant.Haplotype.from_variants([tt_cc, g_gg]),
        ])

        self.assertEqual(paired.diff(rebuilt).nof_unit_vrt(), 0)
        self.assertEqual(rebuilt.diff(paired).nof_unit_vrt(), 0)
示例#2
0
    def test_diff_vrt(self):
        # diff() of two sets built from plain edit tuples: four unit variants
        # are expected to remain, the first being G>C at 1206 and the last
        # C>G at 10044.
        #
        # REF    G     TTGG         C
        #        1206   2093        10044
        # s1     C      CCC         T
        #        G      CCC         G
        #                           C
        #              2092         10044
        # s2           TT           T
        #              AC           T
        edits_left = [
            ('chr24', 1206, 'G', 'C'),
            ('chr24', 2093, 'TG', 'CC'),
            ('chr24', 2095, 'G', 'C'),
            ('chr24', 10044, 'C', 'T'),
            ('chr24', 10044, 'C', 'G'),
        ]
        edits_right = [
            ('chr24', 2092, 'TT', 'AC'),
            ('chr24', 10044, 'C', 'T'),
        ]

        left = VariantSet.from_variants([
            factory.from_edit(chrom, start, ref, alt)
            for chrom, start, ref, alt in edits_left
        ])
        right = VariantSet.from_variants([
            factory.from_edit(chrom, start, ref, alt)
            for chrom, start, ref, alt in edits_right
        ])

        remaining = left.diff(right)
        self.assertEqual(remaining.nof_unit_vrt(), 4)

        # Exactly four variants must come out; only the outer two are checked.
        first, _second, _third, last = list(remaining.iter_vrt())

        self.assertEqual([first.start, first.ref, first.alt],
                         [1206, 'G', 'C'])
        self.assertEqual([last.start, last.ref, last.alt],
                         [10044, 'C', 'G'])
示例#3
0
    def test_VariantSet_cmp2(self):
        # A set rebuilt from the same variants in reverse order must have an
        # empty diff against the original and a comm() as large as the
        # original, whichever side comm() is called on.
        source = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example3.vcf'))
        in_order = list(source.iter_vrt())
        mirrored = VariantSet.from_variants(reversed(in_order))

        self.assertEqual(source.diff(mirrored).nof_unit_vrt(), 0)

        shared = source.comm(mirrored)
        self.assertEqual(shared.nof_unit_vrt(), source.nof_unit_vrt())
        self.assertEqual(mirrored.comm(source).nof_unit_vrt(),
                         shared.nof_unit_vrt())
示例#4
0
    def test_to_vcf_no_ref(self):
        # Variants created without reference bases are written to VCF with a
        # reference supplied to to_vcf(), read back from the same buffer, and
        # both are expected to be common to the original and re-read sets.
        deletion = variant.Del("chr24", 23, 24)
        snp = variant.SNP("chr24", 1206, "C")
        written = VariantSet.from_variants([deletion, snp])

        stream = io.StringIO()
        written.to_vcf(stream, reference=self.chr24)
        stream.seek(0)  # rewind so from_vcf reads from the beginning

        reread = VariantSet.from_vcf(stream)
        self.assertEqual(written.comm(reread).nof_unit_vrt(), 2)
示例#5
0
    def test_many_chroms_shuffled(self):
        # Re-inserting the variants of example3.vcf in an interleaved order
        # must not affect comm()/diff() in either direction.
        original = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example3.vcf'))
        ordered = list(original.iter_vrt())

        # Fixed permutation of the first eight variants.
        permutation = (0, 7, 1, 6, 2, 5, 3, 4)
        shuffled = VariantSet.from_variants(
            [ordered[idx] for idx in permutation])

        self.assertEqual(original.comm(shuffled).nof_unit_vrt(),
                         original.nof_unit_vrt())
        self.assertEqual(original.diff(shuffled).nof_unit_vrt(), 0)

        self.assertEqual(shuffled.comm(original).nof_unit_vrt(),
                         shuffled.nof_unit_vrt())
        self.assertEqual(shuffled.diff(original).nof_unit_vrt(), 0)
示例#6
0
    def test_diff_of_ambig_indel(self):
        # Two deletions written at different positions inside the same T-run;
        # with default arguments diff() currently reports one variant.
        #
        #     9945
        #     CTTTTTCAT
        # s1  CTT--TCAT
        # s2  C--TTTCAT
        right_placed = factory.from_edit('chr24', 9947, 'TT', 'T')
        left_placed = factory.from_edit('chr24', 9945, 'CTT', 'C')
        s1 = VariantSet.from_variants([right_placed])
        s2 = VariantSet.from_variants([left_placed])

        # Disabled check kept for reference (TODO fix/reconsider):
        #     diff1 = s1.diff(s2, match_partial=False).iter_vrt()
        #     self.assertEqual(len(list(diff1)), 0)
        remaining = list(s1.diff(s2).iter_vrt())
        self.assertEqual(len(remaining), 1)  # TODO fix/reconsider
示例#7
0
 def test_ambig_difference_snp_in_locus(self):
     # With match_ambig=True, an insertion built by a normalizing factory is
     # expected to share nothing with a set holding a SNP and a different
     # insertion at position 10044.
     #
     #         10043
     # Ref     TC-ACA--G
     # v1 s1         CA
     # v1 s2    G
     # v2 s2     T
     norm_factory = variant.VariantFactory(reference=self.chr24,
                                           normindel=True)
     ins_ca = norm_factory.from_edit('chr24', 10047, 'A', 'ACA')
     s1 = VariantSet.from_variants([ins_ca])

     snp = norm_factory.from_edit('chr24', 10044, 'C', 'G')
     ins_t = norm_factory.from_edit('chr24', 10044, 'C', 'CT')
     s2 = VariantSet.from_variants([snp, ins_t])

     shared = list(s1.comm(s2, match_ambig=True).iter_vrt())
     self.assertEqual(len(shared), 0)
示例#8
0
    def test_from_vcf_with_attr(self):
        # INFO values parsed from example1.vcf (parse_info=True) must be
        # reachable through attrib['info'] on variants returned by find_vrt.
        vset = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example1.vcf'), parse_info=True)

        def non_null(start, end):
            # Variants in the chr24 window, skipping variant.Null instances.
            return [found for found in vset.find_vrt('chr24', start, end)
                    if not found.is_variant_instance(variant.Null)]

        hits = list(vset.find_vrt('chr24', 150, 160))
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0].attrib['info']['AF'], 1.0)

        # Multiallelic locus: every non-Null variant carries AF == 0.5.
        hits = list(vset.find_vrt('chr24', 20, 30))
        self.assertEqual(len(hits), 2)
        for found in hits:
            if found.is_variant_instance(variant.Null):
                continue
            self.assertEqual(found.attrib['info']['AF'], 0.5)

        # Absent INFO field: the lookup is expected to raise ValueError.
        # NOTE(review): the earlier comment here spoke of "None/KeyError"
        # cases -- confirm ValueError is the intended exception.
        hits = non_null(450, 460)
        self.assertEqual(len(hits), 1)
        lone = hits[0]
        with self.assertRaises(ValueError):
            lone.attrib['info']['Randomfields']

        # The STR entry is expected to compare equal to True.
        hits = non_null(4750, 4760)
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0].attrib['info']['STR'], True)
示例#9
0
    def test_match_with_haplotypes(self):
        # match() of a haplotype against a set of two haplotypes: a single
        # key is expected, mapping to hits whose .base is edit-equal to the
        # earliest variant of the first stored haplotype.
        #
        # REF      TGG   TT    G|
        #          2093  2099  3200
        # varset1  CCC   GG
        #                CC    GG
        #          r1   r2,r3   r4
        def earliest(haplotype):
            # Member variant with the smallest start coordinate.
            return sorted(haplotype.variants, key=lambda o: o.start)[0]

        r1 = factory.from_edit('chr24', 2093, 'TGG', 'CCC')
        r2 = factory.from_edit('chr24', 2098, 'TT', 'GG')
        r3 = factory.from_edit('chr24', 2098, 'TT', 'CC')
        r4 = factory.from_edit('chr24', 3200, 'GG', 'G')

        stored_a = Haplotype.from_variants([r1, r2])
        stored_first = earliest(stored_a)
        stored_b = Haplotype.from_variants([r3, r4])
        s1 = VariantSet.from_variants([stored_a, stored_b])

        query = Haplotype.from_variants([r1, r4])
        query_first = earliest(query)

        match = s1.match(query)
        self.assertEqual(len(match), 1)

        found = {key: [hit.base for hit in hits]
                 for key, hits in match.items()}
        wanted = {query_first.key: [stored_first]}
        self.assertEqual(len(found), len(wanted))
        self.assertEqual(list(found), list(wanted))
        for key, bases in found.items():
            self.assertTrue(
                all(got.edit_equal(exp)
                    for got, exp in zip(bases, wanted[key])))
示例#10
0
    def test_variant_cluster2(self):
        # Three adjacent SNPs are present in both sets; the second set also
        # holds an insertion at the last SNP position. comm() with
        # match_partial=False is expected to return the three SNPs.
        def make_snps():
            # Fresh SNP objects for every call so the sets share no instance.
            return [
                variant.SNP('chr1', position, alt)
                for position, alt in ((13366968, 'A'),
                                      (13366969, 'T'),
                                      (13366970, 'G'))
            ]

        snps_only = VariantSet.from_variants(make_snps())
        with_insertion = VariantSet.from_variants(
            make_snps() + [variant.Ins('chr1', 13366970, 'TTT')])

        shared = list(
            snps_only.comm(with_insertion, match_partial=False).iter_vrt())
        self.assertEqual(len(shared), 3)
示例#11
0
    def test_asterisk_variant(self):
        # Loading example_with_asterisk.vcf.gz with INFO parsing must succeed,
        # and the chr1 window 995507-995515 must yield three variants.
        vcf_path = pkg_file('genomvar.test',
                            'data/example_with_asterisk.vcf.gz')
        vset = VariantSet.from_vcf(vcf_path, parse_info=True)

        in_window = list(vset.find_vrt('chr1', 995507, 995515))
        self.assertEqual(len(in_window), 3)
示例#12
0
    def test_from_variants_vcf(self):
        # Round trip: variants loaded from example1.vcf are put into a new
        # VariantSet, written with to_vcf() using an explicit INFO spec, and
        # read back with VCFReader. The spec is supplied once as a list of
        # tuples and once as the dtype taken from the source set.
        vs0 = varset.VariantSet.from_vcf(pkg_file('genomvar.test',
                                                  'data/example1.vcf'),
                                         parse_info=True)
        variants1 = sorted(vs0.iter_vrt(), key=lambda v: v.key)
        vs = VariantSet.from_variants(variants1)
        _desc = 'Test for multinumber field'
        info_spec_tuples = [('DP4', 4, 'Integer', _desc),
                            ('NSV', 1, 'Integer')]
        info_spec_dict = vs0.dtype['info']
        expected_header = \
            '##INFO=<ID=DP4,Number=4,Type=Integer,Description="{}">'\
            .format(_desc)
        for info_spec in (info_spec_tuples, info_spec_dict):
            # The context manager closes (and thereby removes) the temporary
            # file at the end of every iteration instead of leaving the open
            # handle to the garbage collector.
            with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
                with open(tf.name, 'wt') as fh:
                    vs.to_vcf(fh, info_spec=info_spec)

                # The DP4 header line must appear verbatim in the output.
                with open(tf.name, 'rt') as fh:
                    self.assertIn(expected_header, fh.read().splitlines())

                variants2 = sorted(
                    VCFReader(tf.name).iter_vrt(parse_info=True),
                    key=lambda v: v.key)
                self.assertEqual(len(variants1), len(variants2))
                # Compared while the file still exists, in case attributes
                # are only resolved on access.
                for v1, v2 in zip(variants1, variants2):
                    self.assertTrue(v1.edit_equal(v2))
                    self.assertEqual(v1.attrib['info']['NSV'],
                                     v2.attrib['info']['NSV'])
示例#13
0
    def test_from_vcf_to_records(self):
        # to_records() field layout for a set loaded with INFO and sample
        # parsing, in both the nested and the flattened form.
        vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
        vs = VariantSet.from_vcf(vcf_path,
                                 parse_info=True,
                                 parse_samples=True)

        self.assertEqual(vs._samples, ['SAMP1'])

        # Leading columns shared by both layouts.
        core_fields = ['chrom', 'start', 'end', 'ref', 'alt', 'vartype',
                       'phase_group']

        # Nested dtype: INFO and sample data live in sub-records.
        nested = vs.to_records(nested=True)
        self.assertEqual(list(nested.dtype.fields),
                         core_fields + ['info', 'SAMPLES'])
        self.assertEqual(
            list(nested['info'].dtype.fields),
            ['NSV', 'AF', 'DP4', 'ECNT', 'pl', 'mt', 'RECN', 'STR'])
        self.assertEqual(list(nested['SAMPLES'].dtype.fields), ['SAMP1'])
        self.assertEqual(list(nested['SAMPLES']['SAMP1'].dtype.fields),
                         ['GT'])

        # Flat dtype: sub-record names are joined with underscores.
        flat = vs.to_records(nested=False)
        self.assertEqual(
            list(flat.dtype.fields),
            core_fields + [
                'info_NSV', 'info_AF', 'info_DP4', 'info_ECNT', 'info_pl',
                'info_mt', 'info_RECN', 'info_STR', 'SAMPLES_SAMP1_GT'
            ])
示例#14
0
    def test_from_variants_to_vcf_with_sampdata(self):
        # Round trip with sample data: variants read from example3.vcf
        # (parse_samples=True) are written with an explicit FORMAT spec for
        # sample SAMP1 and read back; edits and AD values must be preserved.
        vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
        variants1 = sorted(VCFReader(vcf_path).iter_vrt(parse_samples=True),
                           key=lambda v: v.key)
        vs = VariantSet.from_variants(variants1)

        # The context manager closes (and thereby removes) the temporary
        # file deterministically instead of leaving the open handle to the
        # garbage collector.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(
                    fh,
                    format_spec=[RESERVED_FORMAT.GT,
                                 ('AD', 'R', 'Integer', '')],
                    samples=['SAMP1'])

            # The AD header line must appear verbatim in the output.
            with open(tf.name, 'rt') as fh:
                self.assertIn(
                    '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="">',
                    fh.read().splitlines())

            variants2 = sorted(
                VCFReader(tf.name).iter_vrt(parse_samples=True),
                key=lambda v: v.key)
            self.assertEqual(len(variants1), len(variants2))
            # Compared while the file still exists, in case attributes are
            # only resolved on access.
            for v1, v2 in zip(variants1, variants2):
                self.assertTrue(v1.edit_equal(v2))
                self.assertEqual(v1.attrib['samples']['SAMP1']['AD'],
                                 v2.attrib['samples']['SAMP1']['AD'])
示例#15
0
    def test_sort_chroms(self):
        # sort_chroms() without arguments is expected to give chr23, chr24;
        # with a key ranking chr24 lowest the order must flip.
        def chr24_first(chrom):
            # Rank 1 for chr24, rank 2 for every other chromosome.
            if chrom == 'chr24':
                return 1
            return 2

        vs = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example2.vcf.gz'))

        vs.sort_chroms()
        self.assertEqual(list(vs.get_chroms()), ['chr23', 'chr24'])

        vs.sort_chroms(key=chr24_first)
        self.assertEqual(list(vs.get_chroms()), ['chr24', 'chr23'])
示例#16
0
 def test_empty_vcf(self):
     # Only the leading '#' lines of example1.vcf are copied into a buffer;
     # a set loaded from that buffer must report zero unit variants.
     header_only = io.StringIO()
     with open(pkg_file('genomvar.test', 'data/example1.vcf')) as fh:
         # takewhile stops at the first line that does not start with '#'.
         header_only.writelines(
             itertools.takewhile(lambda line: line.startswith('#'), fh))
     header_only.seek(0)

     loaded = VariantSet.from_vcf(header_only)
     self.assertEqual(loaded.nof_unit_vrt(), 0)
示例#17
0
    def test_mnp_com_split(self):
        # The same two-base change given as one AG>TT edit and as two
        # single-base edits: comm() must start with a variant at 22 with ref
        # 'AG', and diff() must come back empty.
        #
        #                           23
        #    TTCACTTAGCATAATGTCTTCAAGATT
        # v1                       TT -single
        # v2                       TT -splitted
        joined = VariantSet.from_variants(
            [factory.from_edit('chr24', 22, 'AG', 'TT')])
        first_base = factory.from_edit('chr24', 22, 'A', 'T')
        second_base = factory.from_edit('chr24', 23, 'G', 'T')
        split = VariantSet.from_variants([first_base, second_base])

        shared = list(joined.comm(split).iter_vrt())
        head = shared[0]
        self.assertEqual([head.start, head.ref], [22, 'AG'])

        only_in_joined = list(joined.diff(split).iter_vrt())
        self.assertFalse(only_in_joined)
示例#18
0
    def test_strip_order_dependent_Ambig(self):
        # Two CA insertions written at different positions, both built by a
        # normalizing factory: diff() must be empty with match_ambig=True and
        # hold one variant with match_ambig=False.
        #
        #    10043
        # R  T--CA--CAG
        # v1 TCACA--CAG
        # v2 T--CACACAG
        reference_path = pkg_file('genomvar.test', 'data/chr24.fna')
        norm_factory = variant.VariantFactory(reference=reference_path,
                                              normindel=True)
        left_ins = norm_factory.from_edit('chr24', 10043, 'T', 'TCA')
        right_ins = norm_factory.from_edit('chr24', 10045, 'A', 'ACA')
        s1 = VariantSet.from_variants([left_ins])
        s2 = VariantSet.from_variants([right_ins])

        ambig_diff = list(s1.diff(s2, match_ambig=True).iter_vrt())
        self.assertEqual(len(ambig_diff), 0)

        strict_diff = list(s1.diff(s2, match_ambig=False).iter_vrt())
        self.assertEqual(len(strict_diff), 1)
示例#19
0
 def test_sv_types(self):
     # Loading example4.vcf.gz must give 100 unit variants and emit more
     # than one warning; the last recorded warning is expected to mention
     # 'Structural'.
     with warnings.catch_warnings(record=True) as wrn:
         # The filter has to be installed before the code under test runs:
         # set afterwards, it has no effect on what was recorded, and
         # whatever filters were already active (possibly once-per-location
         # suppression) would decide which warnings end up in ``wrn``.
         warnings.simplefilter('always')
         vs = VariantSet.from_vcf(
             pkg_file('genomvar.test', 'data/example4.vcf.gz'))
         self.assertEqual(vs.nof_unit_vrt(), 100)
         self.assertGreater(len(wrn), 1)
         self.assertIn('Structural', str(wrn[-1].message))
示例#20
0
    def test_from_vcf_with_samples(self):
        # With parse_samples=True the genotype of SAMP1 must be available on
        # the variants found in the chr24 window 1200-1210.
        vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
        vset = VariantSet.from_vcf(vcf_path, parse_samples=True)

        in_window = list(vset.find_vrt('chr24', 1200, 1210))
        self.assertEqual(len(in_window), 2)

        first, _second = in_window
        self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (1, 0))
示例#21
0
 def test_ambig_difference_different_ambig(self):
     # Each set holds one CA insertion and one CA deletion, written at
     # opposite ends of the repeat; with match_ambig=True the diff is
     # expected to be empty.
     #
     #         10043
     # Ref     T--CA--CA--G
     # v1 s1   T--CA--CACAG ins CA right
     # v2 s1   T------CA--G del CA left
     # v1 s2   TCACA--CA--G ins CA left
     # v2 s2   T--CA------G del CA right
     norm_factory = variant.VariantFactory(reference=self.chr24,
                                           normindel=True)

     ins_right = norm_factory.from_edit('chr24', 10047, 'A', 'ACA')
     del_left = norm_factory.from_edit('chr24', 10043, 'TCA', 'T')
     s1 = VariantSet.from_variants([ins_right, del_left])

     ins_left = norm_factory.from_edit('chr24', 10043, 'T', 'TCA')
     del_right = norm_factory.from_edit('chr24', 10045, 'ACA', 'A')
     s2 = VariantSet.from_variants([ins_left, del_right])

     remaining = list(s1.diff(s2, match_ambig=True).iter_vrt())
     self.assertEqual(len(remaining), 0)
示例#22
0
    def test_from_variants(self):
        # from_variants() on variants streamed from a file-backed set, plus
        # the ValueError expected when a reference is given and a variant
        # does not fit it.
        vfset = VariantSetFromFile(
            pkg_file('genomvar.test', 'data/example1.vcf'))
        vset = VariantSet.from_variants(list(vfset.iter_vrt()))

        in_window = list(vset.find_vrt('chr24', 1200, 1210))
        self.assertEqual(len(in_window), 2)

        # First case: position beyond the reference bounds.
        # Second case: chromosome not present in the reference.
        for chrom, position in (('chr24', 10000000), ('chr2', 10)):
            with self.assertRaises(ValueError):
                VariantSet.from_variants(
                    list(vfset.iter_vrt()) +
                    [variant.SNP(chrom, position, 'C')],
                    reference=self.chr24)
示例#23
0
 def test_from_string_buffer(self):
     # from_vcf() must accept an in-memory text buffer: example1.vcf is
     # copied into a StringIO and two windows on chr24 are checked.
     vcf_text = io.StringIO()
     with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as fh:
         vcf_text.writelines(fh)
     vcf_text.seek(0)  # rewind before handing the buffer to the parser

     vs = VariantSet.from_vcf(vcf_text)

     single = list(vs.find_vrt('chr24', 150, 160))
     self.assertEqual(len(single), 1)
     pair = list(vs.find_vrt('chr24', 20, 30))
     self.assertEqual(len(pair), 2)
示例#24
0
    def test_in_the_middle(self):
        # Removing the middle base of a three-base edit: by default diff()
        # must leave the two flanking C>G changes; with match_partial=False a
        # single variant is expected.
        #
        #      2
        #    TTCACTTAGCAT
        # v1   GGG
        # v2    G
        three_bases = VariantSet.from_variants(
            [factory.from_edit('chr24', 2, 'CAC', 'GGG')])
        middle_base = VariantSet.from_variants(
            [factory.from_edit('chr24', 3, 'A', 'G')])

        flanks = list(three_bases.diff(middle_base).iter_vrt())
        self.assertEqual(len(flanks), 2)
        left, right = flanks
        self.assertEqual([left.start, left.ref, left.alt], [2, 'C', 'G'])
        self.assertEqual([right.start, right.ref, right.alt], [4, 'C', 'G'])

        whole = list(
            three_bases.diff(middle_base, match_partial=False).iter_vrt())
        self.assertEqual(len(whole), 1)
示例#25
0
    def test_from_vcf_with_info(self):
        # With parse_info=True the first variant in the chr24 window
        # 1200-1210 must have alt 'C' and INFO field NSV equal to 1.
        vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
        vset = VariantSet.from_vcf(vcf_path, parse_info=True)

        in_window = list(vset.find_vrt('chr24', 1200, 1210))
        self.assertEqual(len(in_window), 2)

        first, _second = in_window
        self.assertEqual(first.alt, 'C')
        self.assertEqual(first.attrib['info']['NSV'], 1)
示例#26
0
    def test_from_vcf_problematic(self):
        # example5.vcf.gz: the single variant found on chromosome '1' in the
        # window 160109406-160109409 must expose INFO field PH as ('.',).
        vcf_path = pkg_file('genomvar.test', 'data/example5.vcf.gz')
        vset = VariantSet.from_vcf(vcf_path, parse_info=True)

        found = list(vset.find_vrt('1', 160109406, 160109409))
        self.assertEqual(len(found), 1)

        only = found[0]
        self.assertEqual(only.attrib['info']['PH'], ('.', ))
示例#27
0
    def test_VariantSet_cmp(self):
        # diff()/comm() between example1.vcf (with INFO and samples parsed)
        # and example2.vcf.gz, including the diff_vrt() variant of diff().
        s1 = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example1.vcf'),
            parse_info=True, parse_samples=True)
        s2 = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example2.vcf.gz'))

        self.assertEqual(s1.diff(s2).nof_unit_vrt(), 14)

        # Now same diff but without loading in memory
        streamed_units = sum(
            vrt.nof_unit_vrt() for vrt in s1.diff_vrt(s2).iter_vrt())
        self.assertEqual(streamed_units, 14)

        comm = s1.comm(s2)
        self.assertEqual(len(list(comm.iter_vrt())), 4)

        # Attributes of the left-hand set must be carried into the result.
        by_key = sorted(comm.iter_vrt(), key=lambda v: v.key)
        lowest, _next = by_key[:2]
        self.assertEqual(lowest.attrib['info']['NSV'], 1)
        self.assertEqual(lowest.attrib['samples']['SAMP1']['GT'], (0, 1))

        self.assertEqual(s2.comm(s1).nof_unit_vrt(), comm.nof_unit_vrt())
示例#28
0
    def test_ovlp(self):
        # ovlp() against a set holding one insertion: an edit at the next
        # position must give no hits, while an identical insertion must give
        # one hit with match_ambig=False.
        #
        #                        23
        # TTCACTTAGCATAATGTCTTCAAG|ATT
        #                         G
        #                          C <- not interfering
        stored = factory.from_edit(chrom='chr24', start=23, ref='G', alt='GG')
        s1 = VariantSet.from_variants([stored])

        neighbour = factory.from_edit(chrom='chr24', start=24, ref='A',
                                      alt='C')
        self.assertEqual(len(s1.ovlp(neighbour)), 0)

        same_edit = factory.from_edit(chrom='chr24', start=23, ref='G',
                                      alt='GG')
        self.assertEqual(len(s1.ovlp(same_edit, match_ambig=False)), 1)
示例#29
0
    def test_match2(self):
        # Number of match() hits for plain and ambiguous deletions queried
        # against a set holding a plain deletion, then against a set holding
        # an ambiguous one, with and without match_ambig.
        #
        #       1
        # R     TCACAG
        # del1  T--CAG
        # del2  TCA--G
        del1 = variant.Del('chrom', 1, 3)
        adel1 = variant.AmbigDel('chrom', (1, 1), (5, 3), 'CA', '')
        del2 = variant.Del('chrom', 2, 4)
        adel2 = variant.AmbigDel('chrom', (1, 2), (5, 4), 'AC', '')

        # Each case: (query, extra keyword arguments, expected hit count).
        vs = VariantSet.from_variants([del1])
        for query, options, expected in (
                (adel1, {}, 1),
                (adel2, {}, 0),
                (adel2, {'match_ambig': True}, 1),
        ):
            self.assertEqual(len(vs.match(query, **options)), expected)

        vs2 = VariantSet.from_variants([adel1])
        for query, options, expected in (
                (adel1, {}, 1),
                (del1, {}, 1),
                (adel2, {}, 0),
                (adel2, {'match_ambig': True}, 1),
                (del2, {'match_ambig': True}, 1),
                (del2, {'match_ambig': False}, 0),
        ):
            self.assertEqual(len(vs2.match(query, **options)), expected)
示例#30
0
    def test_find_vrt(self):
        # An indexed file-backed set and an in-memory set built from the same
        # VCF must report the same number of unit variants, for chr24 alone
        # and over the whole file.
        def unit_total(variants):
            # Sum of nof_unit_vrt() over the given variants.
            return sum(v.nof_unit_vrt() for v in variants)

        vcf_path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
        from_file = VariantSetFromFile(vcf_path, index=True)
        in_memory = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example2.vcf.gz'))

        self.assertEqual(unit_total(from_file.find_vrt('chr24')),
                         unit_total(in_memory.find_vrt('chr24')))

        self.assertEqual(unit_total(from_file.iter_vrt()),
                         unit_total(in_memory.iter_vrt()))