def test_from_vcf_to_records(self):
    # Verify the field names produced by ``to_records`` in both the
    # nested and the flat layout for a set loaded with INFO and samples.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    vs = VariantSet.from_vcf(vcf_path, parse_info=True, parse_samples=True)
    self.assertEqual(vs._samples, ['SAMP1'])

    # Nested layout: 'info' and 'SAMPLES' are themselves structured fields.
    nested = vs.to_records(nested=True)
    expected_nested = [
        'chrom', 'start', 'end', 'ref', 'alt', 'vartype',
        'phase_group', 'info', 'SAMPLES',
    ]
    self.assertEqual(list(nested.dtype.fields), expected_nested)

    expected_info = ['NSV', 'AF', 'DP4', 'ECNT', 'pl', 'mt', 'RECN', 'STR']
    self.assertEqual(list(nested['info'].dtype.fields), expected_info)

    sample_fields = nested['SAMPLES']
    self.assertEqual(list(sample_fields.dtype.fields), ['SAMP1'])
    self.assertEqual(list(sample_fields['SAMP1'].dtype.fields), ['GT'])

    # Flat layout: sub-fields appear as prefixed top-level names.
    flat = vs.to_records(nested=False)
    expected_flat = [
        'chrom', 'start', 'end', 'ref', 'alt', 'vartype',
        'phase_group', 'info_NSV', 'info_AF', 'info_DP4', 'info_ECNT',
        'info_pl', 'info_mt', 'info_RECN', 'info_STR', 'SAMPLES_SAMP1_GT',
    ]
    self.assertEqual(list(flat.dtype.fields), expected_flat)
def test_asterisk_variant(self):
    # Load the 'example_with_asterisk' fixture and count the variants
    # reported for the chr1:995507-995515 window.
    vcf_path = pkg_file('genomvar.test', 'data/example_with_asterisk.vcf.gz')
    vset = VariantSet.from_vcf(vcf_path, parse_info=True)
    hits = list(vset.find_vrt('chr1', 995507, 995515))
    self.assertEqual(len(hits), 3)
def test_from_vcf_with_attr(self):
    # INFO values are read back through ``attrib['info']`` after loading
    # with ``parse_info=True``.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    s = VariantSet.from_vcf(vcf_path, parse_info=True)

    def non_null(start, end):
        # Variants on chr24 in [start, end) that are not Null instances.
        return [
            o for o in s.find_vrt('chr24', start, end)
            if not o.is_variant_instance(variant.Null)
        ]

    found = list(s.find_vrt('chr24', 150, 160))
    self.assertEqual(len(found), 1)
    vrt = found[0]
    self.assertEqual(vrt.attrib['info']['AF'], 1.0)

    # Check multiallelic locus
    found = list(s.find_vrt('chr24', 20, 30))
    self.assertEqual(len(found), 2)
    for vrt in found:
        if vrt.is_variant_instance(variant.Null):
            continue
        self.assertEqual(vrt.attrib['info']['AF'], 0.5)

    # Check None/KeyError cases (".", field absent...): looking up an
    # INFO field that does not exist is expected to raise ValueError.
    found = non_null(450, 460)
    self.assertEqual(len(found), 1)
    vrt = found[0]
    with self.assertRaises(ValueError):
        vrt.attrib['info']['Randomfields']

    found = non_null(4750, 4760)
    self.assertEqual(len(found), 1)
    vrt = found[0]
    self.assertEqual(vrt.attrib['info']['STR'], True)
def test_empty_vcf(self):
    # Feed ``from_vcf`` only the leading '#' lines of example1.vcf and
    # expect a set with no variants.
    with open(pkg_file('genomvar.test', 'data/example1.vcf')) as fh:
        leading_comments = itertools.takewhile(
            lambda text: text.startswith('#'), fh)
        buf = io.StringIO(''.join(leading_comments))
    vs = VariantSet.from_vcf(buf)
    self.assertEqual(vs.nof_unit_vrt(), 0)
def test_sv_types(self):
    # Loading example4.vcf.gz is expected to emit several warnings, the
    # last of which mentions 'Structural', while still yielding 100 unit
    # variants.
    with warnings.catch_warnings(record=True) as wrn:
        # The filter has to be installed BEFORE the code under test runs.
        # Set afterwards it has no effect on what was recorded: warnings
        # suppressed by the filters active at call time (e.g. repeats
        # from the same location) would never reach ``wrn``.
        warnings.simplefilter('always')
        vs = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example4.vcf.gz'))
        self.assertEqual(vs.nof_unit_vrt(), 100)
        self.assertGreater(len(wrn), 1)
        self.assertIn('Structural', str(wrn[-1].message))
def test_sort_chroms(self):
    # ``sort_chroms`` reorders the chromosomes reported by ``get_chroms``,
    # both with its default ordering and with a caller-supplied key.
    vcf_path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
    vs = VariantSet.from_vcf(vcf_path)

    vs.sort_chroms()
    self.assertEqual(list(vs.get_chroms()), ['chr23', 'chr24'])

    def chr24_first(c):
        # Rank chr24 ahead of every other chromosome.
        if c == 'chr24':
            return 1
        return 2

    vs.sort_chroms(key=chr24_first)
    self.assertEqual(list(vs.get_chroms()), ['chr24', 'chr23'])
def test_from_string_buffer(self):
    # ``from_vcf`` accepts an in-memory text buffer holding the full
    # content of example1.vcf.
    with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as fh:
        buf = io.StringIO(fh.read())
    vs = VariantSet.from_vcf(buf)

    in_150_160 = list(vs.find_vrt('chr24', 150, 160))
    self.assertEqual(len(in_150_160), 1)
    in_20_30 = list(vs.find_vrt('chr24', 20, 30))
    self.assertEqual(len(in_20_30), 2)
def test_from_vcf_with_samples(self):
    # With ``parse_samples=True`` the genotype of SAMP1 is available
    # under ``attrib['samples']``.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    vset = VariantSet.from_vcf(vcf_path, parse_samples=True)
    hits = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)
    first, _ = hits
    self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (1, 0))
def test_from_vcf_problematic(self):
    # In example5.vcf.gz the single variant found at 1:160109406-160109409
    # must expose INFO field 'PH' as the one-element tuple ('.',).
    vcf_path = pkg_file('genomvar.test', 'data/example5.vcf.gz')
    vset = VariantSet.from_vcf(vcf_path, parse_info=True)
    hits = list(vset.find_vrt('1', 160109406, 160109409))
    self.assertEqual(len(hits), 1)
    only = hits[0]
    self.assertEqual(only.attrib['info']['PH'], ('.', ))
def test_VariantSet_cmp2(self):
    # A set rebuilt from the same variants in reverse order must have an
    # empty difference with, and a full intersection with, the original.
    s1 = VariantSet.from_vcf(pkg_file('genomvar.test', 'data/example3.vcf'))
    backwards = reversed(list(s1.iter_vrt()))
    s2 = VariantSet.from_variants(backwards)

    self.assertEqual(s1.diff(s2).nof_unit_vrt(), 0)

    common = s1.comm(s2)
    self.assertEqual(common.nof_unit_vrt(), s1.nof_unit_vrt())
    # The intersection size does not depend on operand order.
    self.assertEqual(s2.comm(s1).nof_unit_vrt(), common.nof_unit_vrt())
def test_from_vcf_with_info(self):
    # With ``parse_info=True`` the INFO field 'NSV' is readable from the
    # first variant found in chr24:1200-1210.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    vset = VariantSet.from_vcf(vcf_path, parse_info=True)
    hits = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)
    first, _ = hits
    self.assertEqual(first.alt, 'C')
    self.assertEqual(first.attrib['info']['NSV'], 1)
def test_VariantSet_cmp(self):
    # Compare example1 (loaded with INFO and samples) against example2:
    # difference size, intersection size, and attributes carried over
    # into the intersection.
    s1 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example1.vcf'),
        parse_info=True, parse_samples=True)
    s2 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    self.assertEqual(s1.diff(s2).nof_unit_vrt(), 14)

    # Now same diff but without loading in memory
    streamed_total = sum(
        vrt.nof_unit_vrt() for vrt in s1.diff_vrt(s2).iter_vrt())
    self.assertEqual(streamed_total, 14)

    comm = s1.comm(s2)
    self.assertEqual(len(list(comm.iter_vrt())), 4)

    by_key = sorted(comm.iter_vrt(), key=lambda v: v.key)
    first, _ = by_key[:2]
    self.assertEqual(first.attrib['info']['NSV'], 1)
    self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (0, 1))

    self.assertEqual(s2.comm(s1).nof_unit_vrt(), comm.nof_unit_vrt())
def test_to_vcf_no_ref(self):
    # Round trip: variants built in code are written with ``to_vcf``
    # (using ``self.chr24`` as reference), read back, and must all be
    # common to both sets.
    deletion = variant.Del("chr24", 23, 24)
    snp = variant.SNP("chr24", 1206, "C")
    vs1 = VariantSet.from_variants([deletion, snp])

    buf = io.StringIO()
    vs1.to_vcf(buf, reference=self.chr24)
    buf.seek(0)

    vs2 = VariantSet.from_vcf(buf)
    shared = vs1.comm(vs2)
    self.assertEqual(shared.nof_unit_vrt(), 2)
def test_many_chroms_shuffled(self):
    # A set built from the same variants in a permuted order must be
    # equivalent to the original under ``comm`` and ``diff`` in both
    # directions.
    vs1 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example3.vcf'))
    ordered = list(vs1.iter_vrt())
    permutation = (0, 7, 1, 6, 2, 5, 3, 4)
    shuffled = [ordered[idx] for idx in permutation]
    vs2 = VariantSet.from_variants(shuffled)

    self.assertEqual(vs1.comm(vs2).nof_unit_vrt(), vs1.nof_unit_vrt())
    self.assertEqual(vs1.diff(vs2).nof_unit_vrt(), 0)
    self.assertEqual(vs2.comm(vs1).nof_unit_vrt(), vs2.nof_unit_vrt())
    self.assertEqual(vs2.diff(vs1).nof_unit_vrt(), 0)
def test_find_vrt(self):
    # The indexed file-backed set and the in-memory set built from the
    # same file must report the same number of unit variants, both for
    # chr24 alone and over the whole file.
    vcf_path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
    ivfs = VariantSetFromFile(vcf_path, index=True)
    vs = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    def unit_total(variants):
        # Sum of unit-variant counts over an iterable of variants.
        return sum(v.nof_unit_vrt() for v in variants)

    self.assertEqual(unit_total(ivfs.find_vrt('chr24')),
                     unit_total(vs.find_vrt('chr24')))
    self.assertEqual(unit_total(ivfs.iter_vrt()),
                     unit_total(vs.iter_vrt()))
def test_match(self):
    # Layout sketch kept from the original author (column alignment
    # reconstructed -- confirm against the fixture):
    #   REF     TGG    TT
    #           2093   2099
    #   vs1     CCC    GG
    #   vs2            CG
    #           r1     r2,r3
    vs1 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example1.vcf'))
    edit_to_CG = factory.from_edit('chr24', 2098, 'TT', 'CG')
    edit_to_CC = factory.from_edit('chr24', 2098, 'TT', 'CC')

    # TT>CG matches once by default but not when partial matches are
    # disabled.
    self.assertEqual(len(vs1.match(edit_to_CG)), 1)
    self.assertEqual(len(vs1.match(edit_to_CG, match_partial=False)), 0)

    # TT>CC matches nothing in either mode.
    self.assertEqual(len(vs1.match(edit_to_CC)), 0)
    self.assertEqual(len(vs1.match(edit_to_CC, match_partial=False)), 0)
def test_from_vcf(self):
    # Load example1.vcf with a reference and ``normindel=True``.
    # NOTE(review): another method in this file is also named
    # ``test_from_vcf``; if both live in the same class the later
    # definition replaces the earlier one and only one runs -- confirm.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    vset = VariantSet.from_vcf(vcf_path, reference=self.chr24,
                               normindel=True)

    hits = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)

    # Test presence of null operation
    hits = list(vset.find_vrt('chr24', 20, 25))
    self.assertEqual(len(hits), 2)
    for found in hits:
        if found.is_variant_instance(variant.Null):
            continue
        self.assertEqual(found.attrib['id'], '1')
def test_drop_duplicates(self):
    # Build a VCF in memory where every non-'#' line of example1.vcf
    # appears twice, then check that ``drop_duplicates`` halves the set
    # and reports the removed half.
    buf = io.StringIO()
    with open(pkg_file('genomvar.test', 'data/example1.vcf')) as fh:
        for text in fh:
            repeats = 1 if text.startswith('#') else 2
            buf.write(text * repeats)
    buf.seek(0)

    vs = VariantSet.from_vcf(buf, parse_info=True, parse_samples=True)
    vs2, dropped = vs.drop_duplicates(return_dropped=True)

    self.assertEqual(vs.nof_unit_vrt() / 2, vs2.nof_unit_vrt())
    dropped_total = sum(v.nof_unit_vrt() for v in dropped)
    self.assertEqual(vs2.nof_unit_vrt(), dropped_total)
def test_from_to_vcf(self):
    # Round trip through a file on disk: variants read from example1.vcf
    # are written with ``to_vcf`` and read back; the header must keep
    # the DP4 INFO declaration and every variant must keep its edit and
    # its 'NSV' INFO value.
    fl = pkg_file('genomvar.test', 'data/example1.vcf')
    variants1 = sorted(VCFReader(fl).iter_vrt(parse_info=True),
                       key=lambda v: v.key)
    vs = VariantSet.from_vcf(fl, parse_info=True)

    # Use the temporary file as a context manager so its handle is
    # closed (and the file removed) deterministically, even when an
    # assertion below fails, instead of leaking it until garbage
    # collection.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(fh)

        with open(tf.name, 'rt') as fh:
            header_and_body = fh.read().splitlines()
        self.assertIn(
            '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Test for multinumber field">',
            header_and_body)

        variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                           key=lambda v: v.key)
        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['info']['NSV'],
                             v2.attrib['info']['NSV'])
def test_from_vcf(self):
    # Load example1.vcf with default options and check counts, variant
    # classes and the per-variant attributes taken from the VCF row.
    # NOTE(review): another method in this file is also named
    # ``test_from_vcf``; if both live in the same class the later
    # definition replaces the earlier one and only one runs -- confirm.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    vset = VariantSet.from_vcf(vcf_path)
    self.assertEqual(vset.nof_unit_vrt(), 18)
    self.assertEqual(list(vset.chroms), ['chr24'])

    hits = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)
    snp, _ = hits
    self.assertTrue(snp.is_variant_instance(variant.SNP))
    notation = snp.attrib['vcf_notation']
    self.assertEqual(notation['ref'], 'G')
    self.assertEqual(notation['start'], 1206)
    self.assertEqual(notation['row'], 4)
    self.assertEqual(snp.attrib['id'], '5')
    self.assertEqual(snp.attrib['filter'], 'PASS')
    self.assertEqual(snp.attrib['qual'], 100)

    hits = list(vset.find_vrt('chr24', 154, 156))
    self.assertEqual(len(hits), 1)
    insertion = hits[0]
    self.assertTrue(insertion.is_variant_instance(variant.Ins))

    hits = list(vset.find_vrt('chr24', 20, 25))
    self.assertEqual(len(hits), 2)

    everything = list(vset.iter_vrt())
    self.assertEqual(len(everything), 16)
def test_find_vrt_chrom_only(self):
    # ``find_vrt`` called with only a chromosome name returns the four
    # chr24 variants of example2.vcf.gz.
    vcf_path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
    s1 = VariantSet.from_vcf(vcf_path, parse_info=True, parse_samples=True)
    on_chr24 = list(s1.find_vrt('chr24'))
    self.assertEqual(len(on_chr24), 4)
def test_VariantSet_no_common(self):
    # example1.vcf and example_gnomad_1.vcf.gz are expected to share no
    # variants, so their intersection is empty.
    s1 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example1.vcf'))
    s2 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example_gnomad_1.vcf.gz'))
    shared = s1.comm(s2)
    self.assertEqual(shared.nof_unit_vrt(), 0)
def test_from_bcf(self):
    # ``from_vcf`` is given a .bcf path and must yield eight variants.
    bcf_path = pkg_file('genomvar.test', 'data/example3.bcf')
    vset = VariantSet.from_vcf(bcf_path)
    all_variants = list(vset.iter_vrt())
    self.assertEqual(len(all_variants), 8)
def test_random_sample(self):
    # ``sample(5)`` must return a collection of length five.
    vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')
    vs = VariantSet.from_vcf(vcf_path)
    picked = vs.sample(5)
    self.assertEqual(len(picked), 5)