def test_haplotypes(self):
    # A set holding a haplotype is rebuilt from its own iterator and then
    # compared against the original and against a set made of two
    # haplotypes.
    #
    # Edits used (values exactly as passed to factory.from_edit):
    #   name    v1      v2      v3      v4
    #   start   2093    2098    2098    3200
    #   ref     TGG     TT      TT      G
    #   alt     CCC     GG      CC      GG
    edits = (
        ('chr24', 2093, 'TGG', 'CCC'),
        ('chr24', 2098, 'TT', 'GG'),
        ('chr24', 2098, 'TT', 'CC'),
        ('chr24', 3200, 'G', 'GG'),
    )
    v1, v2, v3, v4 = [factory.from_edit(*edit) for edit in edits]

    # One haplotype (v1 + v2) next to two free-standing variants.
    mixed = VariantSet.from_variants(
        [variant.Haplotype.from_variants([v1, v2]), v3, v4])
    # Same content, rebuilt from the iterator of the first set.
    rebuilt = VariantSet.from_variants(mixed.iter_vrt())

    hits = list(rebuilt.find_vrt('chr24', 2090, 2095, expand=True))
    self.assertEqual(len(hits), 2)
    self.assertEqual(rebuilt.nof_unit_vrt(), 8)
    self.assertEqual(mixed.diff(rebuilt).nof_unit_vrt(), 0)
    leftovers = list(rebuilt.diff_vrt(mixed).iter_vrt())
    self.assertEqual(len(leftovers), 0)

    # Two haplotypes covering the same four edits.
    paired = VariantSet.from_variants([
        variant.Haplotype.from_variants([v1, v2]),
        variant.Haplotype.from_variants([v3, v4]),
    ])
    self.assertEqual(paired.diff(rebuilt).nof_unit_vrt(), 0)
    self.assertEqual(rebuilt.diff(paired).nof_unit_vrt(), 0)
def test_diff_vrt(self):
    # Diff two sets built from raw edit tuples.
    #   s1: edits at 1206, 2093, 2095 and two alternatives at 10044
    #   s2: an edit at 2092 and the C>T alternative at 10044
    # The difference is expected to hold four unit variants; the first
    # is the edit at 1206 and the last is the 10044 C>G alternative.
    left_edits = [
        ('chr24', 1206, 'G', 'C'),
        ('chr24', 2093, 'TG', 'CC'),
        ('chr24', 2095, 'G', 'C'),
        ('chr24', 10044, 'C', 'T'),
        ('chr24', 10044, 'C', 'G'),
    ]
    right_edits = [
        ('chr24', 2092, 'TT', 'AC'),
        ('chr24', 10044, 'C', 'T'),
    ]
    s1 = VariantSet.from_variants(
        [factory.from_edit(*edit) for edit in left_edits])
    s2 = VariantSet.from_variants(
        [factory.from_edit(*edit) for edit in right_edits])

    diff = s1.diff(s2)
    self.assertEqual(diff.nof_unit_vrt(), 4)

    # Unpacking also enforces that exactly four variants are iterated.
    first, _, _, last = list(diff.iter_vrt())
    self.assertEqual([first.start, first.ref, first.alt], [1206, 'G', 'C'])
    self.assertEqual([last.start, last.ref, last.alt], [10044, 'C', 'G'])
def test_VariantSet_cmp2(self):
    # A set and a copy built from the same variants in reversed order
    # must have an empty difference and a full intersection, whichever
    # side the comparison starts from.
    path = pkg_file('genomvar.test', 'data/example3.vcf')
    s1 = VariantSet.from_vcf(path)
    backwards = list(s1.iter_vrt())
    backwards.reverse()
    s2 = VariantSet.from_variants(iter(backwards))

    self.assertEqual(s1.diff(s2).nof_unit_vrt(), 0)

    comm = s1.comm(s2)
    shared = comm.nof_unit_vrt()
    self.assertEqual(shared, s1.nof_unit_vrt())
    self.assertEqual(s2.comm(s1).nof_unit_vrt(), comm.nof_unit_vrt())
def test_to_vcf_no_ref(self):
    # Write a Del and a SNP to an in-memory VCF (passing self.chr24 as
    # the reference) and read them back; both must be common to the
    # original and the re-read set.
    written = VariantSet.from_variants([
        variant.Del("chr24", 23, 24),
        variant.SNP("chr24", 1206, "C"),
    ])
    stream = io.StringIO()
    written.to_vcf(stream, reference=self.chr24)
    stream.seek(0)  # rewind so the reader starts at the header
    reread = VariantSet.from_vcf(stream)
    common = written.comm(reread)
    self.assertEqual(common.nof_unit_vrt(), 2)
def test_many_chroms_shuffled(self):
    # Rebuild a set from its variants taken in an interleaved order and
    # check that comm/diff see both sets as identical in both directions.
    vs1 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example3.vcf'))
    in_order = list(vs1.iter_vrt())
    pick_order = (0, 7, 1, 6, 2, 5, 3, 4)
    vs2 = VariantSet.from_variants([in_order[idx] for idx in pick_order])

    for left, right in ((vs1, vs2), (vs2, vs1)):
        self.assertEqual(left.comm(right).nof_unit_vrt(),
                         left.nof_unit_vrt())
        self.assertEqual(left.diff(right).nof_unit_vrt(), 0)
def test_diff_of_ambig_indel(self):
    # Two deletions described at different positions of a T run:
    #
    #         9945
    #         CTTTTTCAT
    #   s1    CTT--TCAT
    #   s2    C--TTTCAT
    deletion_a = factory.from_edit('chr24', 9947, 'TT', 'T')
    deletion_b = factory.from_edit('chr24', 9945, 'CTT', 'C')
    s1 = VariantSet.from_variants([deletion_a])
    s2 = VariantSet.from_variants([deletion_b])

    # TODO fix/reconsider -- disabled check:
    # diff1 = s1.diff(s2, match_partial=False).iter_vrt()
    # self.assertEqual(len(list(diff1)), 0)

    remaining = list(s1.diff(s2).iter_vrt())
    self.assertEqual(len(remaining), 1)  # TODO fix/reconsider
def test_ambig_difference_snp_in_locus(self):
    # An insertion of CA (given at 10047) in one set versus a C>G edit
    # and a C>CT edit at 10044 in the other: with match_ambig=True the
    # two sets are expected to have nothing in common.
    #
    #         10043
    #   Ref   TC-ACA--G
    fac = variant.VariantFactory(reference=self.chr24, normindel=True)

    insertion = fac.from_edit('chr24', 10047, 'A', 'ACA')
    s1 = VariantSet.from_variants([insertion])

    snp = fac.from_edit('chr24', 10044, 'C', 'G')
    short_ins = fac.from_edit('chr24', 10044, 'C', 'CT')
    s2 = VariantSet.from_variants([snp, short_ins])

    shared = list(s1.comm(s2, match_ambig=True).iter_vrt())
    self.assertEqual(len(shared), 0)
def test_from_vcf_with_attr(self):
    # INFO attributes parsed from example1.vcf must be reachable through
    # vrt.attrib['info'] for single-allele, multiallelic, missing-field
    # and flag-like entries.
    s = VariantSet.from_vcf(pkg_file('genomvar.test', 'data/example1.vcf'),
                            parse_info=True)

    def non_null(start, end):
        # Variants found in the window, leaving out variant.Null objects.
        return [found for found in s.find_vrt('chr24', start, end)
                if not found.is_variant_instance(variant.Null)]

    found = list(s.find_vrt('chr24', 150, 160))
    self.assertEqual(len(found), 1)
    self.assertEqual(found[0].attrib['info']['AF'], 1.0)

    # Check multiallelic locus
    found = list(s.find_vrt('chr24', 20, 30))
    self.assertEqual(len(found), 2)
    for allele in found:
        if allele.is_variant_instance(variant.Null):
            continue
        self.assertEqual(allele.attrib['info']['AF'], 0.5)

    # Check None/KeyError cases (".", field absent...)
    found = non_null(450, 460)
    self.assertEqual(len(found), 1)
    with self.assertRaises(ValueError):
        found[0].attrib['info']['Randomfields']

    found = non_null(4750, 4760)
    self.assertEqual(len(found), 1)
    self.assertEqual(found[0].attrib['info']['STR'], True)
def test_match_with_haplotypes(self):
    # Match a haplotype (r1 + r4) against a set of two haplotypes
    # (r1 + r2 and r3 + r4).  A single match is expected, keyed by the
    # left-most variant of the query haplotype and pointing at the
    # left-most variant of the first haplotype in the set.
    #
    # Edits used (values exactly as passed to factory.from_edit):
    #   name    r1      r2      r3      r4
    #   start   2093    2098    2098    3200
    #   ref     TGG     TT      TT      GG
    #   alt     CCC     GG      CC      G
    r1 = factory.from_edit('chr24', 2093, 'TGG', 'CCC')
    r2 = factory.from_edit('chr24', 2098, 'TT', 'GG')
    r3 = factory.from_edit('chr24', 2098, 'TT', 'CC')
    r4 = factory.from_edit('chr24', 3200, 'GG', 'G')

    def leftmost(haplotype):
        # First variant of the haplotype when ordered by start.
        return sorted(haplotype.variants, key=lambda vrt: vrt.start)[0]

    hap1 = Haplotype.from_variants([r1, r2])
    in_set = leftmost(hap1)
    hap2 = Haplotype.from_variants([r3, r4])
    s1 = VariantSet.from_variants([hap1, hap2])

    query = Haplotype.from_variants([r1, r4])
    in_query = leftmost(query)

    match = s1.match(query)
    self.assertEqual(len(match), 1)

    found = {}
    for key, matched in match.items():
        found[key] = [item.base for item in matched]
    expected = {in_query.key: [in_set]}

    self.assertEqual(len(found), len(expected))
    self.assertEqual(list(found), list(expected))
    for key in found:
        for got, wanted in zip(found[key], expected[key]):
            self.assertTrue(got.edit_equal(wanted))
def test_variant_cluster2(self):
    # Three adjacent SNPs in one set; the same SNPs plus an insertion at
    # the last SNP position in the other.  With match_partial=False the
    # intersection is expected to hold three variants.
    def adjacent_snps():
        # Fresh objects per call so the two sets share no instances.
        return [
            variant.SNP('chr1', 13366968, 'A'),
            variant.SNP('chr1', 13366969, 'T'),
            variant.SNP('chr1', 13366970, 'G'),
        ]

    vs1 = VariantSet.from_variants(adjacent_snps())
    vs2 = VariantSet.from_variants(
        adjacent_snps() + [variant.Ins('chr1', 13366970, 'TTT')])

    shared = list(vs1.comm(vs2, match_partial=False).iter_vrt())
    self.assertEqual(len(shared), 3)
def test_asterisk_variant(self):
    # Load the asterisk example VCF with INFO parsing and expect three
    # variants in the chr1 window 995507-995515.
    path = pkg_file('genomvar.test', 'data/example_with_asterisk.vcf.gz')
    vset = VariantSet.from_vcf(path, parse_info=True)
    in_window = list(vset.find_vrt('chr1', 995507, 995515))
    self.assertEqual(len(in_window), 3)
def test_from_variants_vcf(self):
    # Round-trip variants carrying INFO through VariantSet.to_vcf and
    # VCFReader.  The INFO spec is given once as a list of tuples and
    # once as the 'info' dtype of the source set; in both cases the DP4
    # header line must be written and every variant must come back with
    # the same edit and the same NSV value.
    vs0 = varset.VariantSet.from_vcf(pkg_file('genomvar.test',
                                              'data/example1.vcf'),
                                     parse_info=True)
    variants1 = sorted(vs0.iter_vrt(), key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    _desc = 'Test for multinumber field'
    info_spec_tuples = [('DP4', 4, 'Integer', _desc),
                        ('NSV', 1, 'Integer')]
    info_spec_dict = vs0.dtype['info']
    expected_header = (
        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="{}">'
        .format(_desc))

    for info_spec in (info_spec_tuples, info_spec_dict):
        # The context manager closes the temporary file and removes it
        # once this iteration has finished reading it back, so no
        # handle or file outlives the iteration.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(fh, info_spec=info_spec)

            with open(tf.name, 'rt') as fh:
                self.assertIn(expected_header, fh.read().splitlines())

            # Materialise the variants while the file still exists.
            variants2 = sorted(
                VCFReader(tf.name).iter_vrt(parse_info=True),
                key=lambda v: v.key)

        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['info']['NSV'],
                             v2.attrib['info']['NSV'])
def test_from_vcf_to_records(self):
    # to_records must expose the same columns either as nested dtypes
    # ('info', 'SAMPLES') or flattened with underscore-joined names.
    vs = VariantSet.from_vcf(pkg_file('genomvar.test', 'data/example1.vcf'),
                             parse_info=True,
                             parse_samples=True)
    self.assertEqual(vs._samples, ['SAMP1'])

    core_fields = ['chrom', 'start', 'end', 'ref', 'alt', 'vartype',
                   'phase_group']
    info_fields = ['NSV', 'AF', 'DP4', 'ECNT', 'pl', 'mt', 'RECN', 'STR']

    # Test nested dtype
    nested = vs.to_records(nested=True)
    self.assertEqual(list(nested.dtype.fields),
                     core_fields + ['info', 'SAMPLES'])
    self.assertEqual(list(nested['info'].dtype.fields), info_fields)
    self.assertEqual(list(nested['SAMPLES'].dtype.fields), ['SAMP1'])
    self.assertEqual(list(nested['SAMPLES']['SAMP1'].dtype.fields),
                     ['GT'])

    # Test not nested
    flat = vs.to_records(nested=False)
    flat_expected = (core_fields
                     + ['info_' + name for name in info_fields]
                     + ['SAMPLES_SAMP1_GT'])
    self.assertEqual(list(flat.dtype.fields), flat_expected)
def test_from_variants_to_vcf_with_sampdata(self):
    # Round-trip variants carrying sample data through to_vcf and
    # VCFReader: the AD FORMAT header must be written and every variant
    # must come back with the same edit and the same SAMP1 AD value.
    vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
    variants1 = sorted(VCFReader(vcf_path).iter_vrt(parse_samples=True),
                       key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    # The context manager closes the temporary file and removes it once
    # it has been read back, so neither the handle nor the file is left
    # behind.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(
                fh,
                format_spec=[RESERVED_FORMAT.GT,
                             ('AD', 'R', 'Integer', '')],
                samples=['SAMP1'])

        with open(tf.name, 'rt') as fh:
            self.assertIn(
                '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="">',
                fh.read().splitlines())

        # Materialise the variants while the file still exists.
        variants2 = sorted(
            VCFReader(tf.name).iter_vrt(parse_samples=True),
            key=lambda v: v.key)

    self.assertEqual(len(variants1), len(variants2))
    for v1, v2 in zip(variants1, variants2):
        self.assertTrue(v1.edit_equal(v2))
        self.assertEqual(v1.attrib['samples']['SAMP1']['AD'],
                         v2.attrib['samples']['SAMP1']['AD'])
def test_sort_chroms(self):
    # sort_chroms() without arguments is expected to give chr23, chr24;
    # a key that ranks chr24 first must flip that order.
    vs = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    vs.sort_chroms()
    self.assertEqual(list(vs.get_chroms()), ['chr23', 'chr24'])

    def chr24_first(chrom):
        # Rank chr24 ahead of every other chromosome.
        if chrom == 'chr24':
            return 1
        return 2

    vs.sort_chroms(key=chr24_first)
    self.assertEqual(list(vs.get_chroms()), ['chr24', 'chr23'])
def test_empty_vcf(self):
    # A VCF consisting only of the leading '#' lines of example1.vcf
    # must load as a set without unit variants.
    def is_header(line):
        return line.startswith('#')

    header_only = io.StringIO()
    with open(pkg_file('genomvar.test', 'data/example1.vcf')) as fh:
        # Copy lines up to (not including) the first non-'#' line.
        header_only.writelines(itertools.takewhile(is_header, fh))
    header_only.seek(0)

    vs = VariantSet.from_vcf(header_only)
    self.assertEqual(vs.nof_unit_vrt(), 0)
def test_mnp_com_split(self):
    # One AG>TT edit at 22 compared with the same change given as two
    # single-base edits (A>T at 22, G>T at 23).  The intersection is
    # expected to start with the two-base variant and the difference is
    # expected to be empty.
    joined = factory.from_edit('chr24', 22, 'AG', 'TT')
    vset1 = VariantSet.from_variants([joined])

    halves = [
        factory.from_edit('chr24', 22, 'A', 'T'),
        factory.from_edit('chr24', 23, 'G', 'T'),
    ]
    vset2 = VariantSet.from_variants(halves)

    common = list(vset1.comm(vset2).iter_vrt())
    head = common[0]
    self.assertEqual([head.start, head.ref], [22, 'AG'])

    self.assertEqual(list(vset1.diff(vset2).iter_vrt()), [])
def test_strip_order_dependent_Ambig(self):
    # The same CA insertion written at two positions of a CA repeat:
    #
    #         10043
    #   R     T--CA--CAG
    #   v1    TCACA--CAG
    #   v2    T--CACACAG
    #
    # With match_ambig=True the difference is expected to be empty,
    # with match_ambig=False one variant is expected to remain.
    reference_path = pkg_file('genomvar.test', 'data/chr24.fna')
    norm_factory = variant.VariantFactory(reference=reference_path,
                                          normindel=True)
    left_form = norm_factory.from_edit('chr24', 10043, 'T', 'TCA')
    right_form = norm_factory.from_edit('chr24', 10045, 'A', 'ACA')
    s1 = VariantSet.from_variants([left_form])
    s2 = VariantSet.from_variants([right_form])

    for match_ambig, expected_left in ((True, 0), (False, 1)):
        diff = s1.diff(s2, match_ambig=match_ambig)
        self.assertEqual(len(list(diff.iter_vrt())), expected_left)
def test_sv_types(self):
    # Loading example4.vcf.gz is expected to give 100 unit variants and
    # to emit more than one warning, the last of which mentions
    # 'Structural'.
    with warnings.catch_warnings(record=True) as wrn:
        # The 'always' filter has to be installed before the code that
        # warns runs; otherwise what gets recorded depends on whichever
        # filters are active outside this test and on the once-per-
        # location warning registry.
        warnings.simplefilter('always')
        vs = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example4.vcf.gz'))
        self.assertEqual(vs.nof_unit_vrt(), 100)
        self.assertGreater(len(wrn), 1)
        self.assertIn('Structural', str(wrn[-1].message))
def test_from_vcf_with_samples(self):
    # With parse_samples=True the genotype of SAMP1 must be available on
    # the first of the two variants found in chr24:1200-1210.
    path = pkg_file('genomvar.test', 'data/example1.vcf')
    vset = VariantSet.from_vcf(path, parse_samples=True)

    found = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(found), 2)

    first = found[0]
    genotype = first.attrib['samples']['SAMP1']['GT']
    self.assertEqual(genotype, (1, 0))
def test_ambig_difference_different_ambig(self):
    # An insertion and a deletion of CA, each written at the opposite
    # end of the repeat in the two sets:
    #
    #            10043
    #   Ref      T--CA--CA--G
    #   v1 s1    T--CA--CACAG   ins CA right
    #   v2 s1    T------CA--G   del CA left
    #   v1 s2    TCACA--CA--G   ins CA left
    #   v2 s2    T--CA------G   del CA right
    #
    # With match_ambig=True nothing is expected to remain in the diff.
    fac = variant.VariantFactory(reference=self.chr24, normindel=True)

    ins_right = fac.from_edit('chr24', 10047, 'A', 'ACA')
    del_left = fac.from_edit('chr24', 10043, 'TCA', 'T')
    s1 = VariantSet.from_variants([ins_right, del_left])

    ins_left = fac.from_edit('chr24', 10043, 'T', 'TCA')
    del_right = fac.from_edit('chr24', 10045, 'ACA', 'A')
    s2 = VariantSet.from_variants([ins_left, del_right])

    remaining = list(s1.diff(s2, match_ambig=True).iter_vrt())
    self.assertEqual(len(remaining), 0)
def test_from_variants(self):
    # Build an in-memory set from the variants of a file-backed set,
    # then check that from_variants raises ValueError when a reference
    # is given and a variant does not fit it.
    vfset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf'))
    vset = VariantSet.from_variants(list(vfset.iter_vrt()))

    found = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(found), 2)

    offending_snps = (
        ('chr24', 10000000, 'C'),  # out of reference bounds
        ('chr2', 10, 'C'),         # chromosome not in reference
    )
    for snp_args in offending_snps:
        with self.assertRaises(ValueError):
            VariantSet.from_variants(
                list(vfset.iter_vrt()) + [variant.SNP(*snp_args)],
                reference=self.chr24)
def test_from_string_buffer(self):
    # from_vcf must accept an in-memory text buffer holding the content
    # of example1.vcf.
    stream = io.StringIO()
    with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as fh:
        stream.writelines(fh)  # copy the file line by line
    stream.seek(0)

    vs = VariantSet.from_vcf(stream)
    for (start, end), expected in (((150, 160), 1), ((20, 30), 2)):
        found = list(vs.find_vrt('chr24', start, end))
        self.assertEqual(len(found), expected)
def test_in_the_middle(self):
    # A three-base edit minus a single-base edit hitting its middle:
    #
    #         2
    #       TTCACTTAGCAT
    #   v1    GGG
    #   v2     G
    #
    # By default the two flanking C>G changes are expected to remain;
    # with match_partial=False one variant is expected to remain.
    wide = factory.from_edit('chr24', 2, 'CAC', 'GGG')
    narrow = factory.from_edit('chr24', 3, 'A', 'G')
    s1 = VariantSet.from_variants([wide])
    s2 = VariantSet.from_variants([narrow])

    flanks = list(s1.diff(s2).iter_vrt())
    self.assertEqual(len(flanks), 2)
    left, right = flanks
    self.assertEqual([left.start, left.ref, left.alt], [2, 'C', 'G'])
    self.assertEqual([right.start, right.ref, right.alt], [4, 'C', 'G'])

    whole = list(s1.diff(s2, match_partial=False).iter_vrt())
    self.assertEqual(len(whole), 1)
def test_from_vcf_with_info(self):
    # With parse_info=True the first of the two variants found in
    # chr24:1200-1210 must have alt 'C' and INFO field NSV equal to 1.
    path = pkg_file('genomvar.test', 'data/example1.vcf')
    vset = VariantSet.from_vcf(path, parse_info=True)

    found = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(found), 2)

    first = found[0]
    self.assertEqual(first.alt, 'C')
    self.assertEqual(first.attrib['info']['NSV'], 1)
def test_from_vcf_problematic(self):
    # example5.vcf.gz: the single variant found in 1:160109406-160109409
    # is expected to carry the INFO field PH as the tuple ('.',).
    path = pkg_file('genomvar.test', 'data/example5.vcf.gz')
    vset = VariantSet.from_vcf(path, parse_info=True)

    found = list(vset.find_vrt('1', 160109406, 160109409))
    self.assertEqual(len(found), 1)

    only = found[0]
    self.assertEqual(only.attrib['info']['PH'], ('.', ))
def test_VariantSet_cmp(self):
    # Compare example1.vcf (with INFO and samples parsed) against
    # example2.vcf.gz: sizes of diff and comm, attributes carried by
    # the common variants, and symmetry of comm.
    s1 = VariantSet.from_vcf(pkg_file('genomvar.test', 'data/example1.vcf'),
                             parse_info=True,
                             parse_samples=True)
    s2 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    self.assertEqual(s1.diff(s2).nof_unit_vrt(), 14)

    # Now same diff but without loading in memory
    streamed_units = sum(vrt.nof_unit_vrt()
                         for vrt in s1.diff_vrt(s2).iter_vrt())
    self.assertEqual(streamed_units, 14)

    comm = s1.comm(s2)
    self.assertEqual(len(list(comm.iter_vrt())), 4)

    # Variant with the smallest key among the common ones.
    by_key = sorted(comm.iter_vrt(), key=lambda vrt: vrt.key)
    lowest = by_key[0]
    self.assertEqual(lowest.attrib['info']['NSV'], 1)
    self.assertEqual(lowest.attrib['samples']['SAMP1']['GT'], (0, 1))

    self.assertEqual(s2.comm(s1).nof_unit_vrt(), comm.nof_unit_vrt())
def test_ovlp(self):
    # Overlap queries against a set holding a single G>GG edit at 23:
    #
    #                                  23
    #          TTCACTTAGCATAATGTCTTCAAG|ATT
    #                                  G
    #                                   C <- not interfering
    #
    # The A>C edit at 24 is expected not to overlap; an identical
    # G>GG edit queried with match_ambig=False is expected to overlap.
    stored = factory.from_edit(chrom='chr24', start=23, ref='G', alt='GG')
    s1 = VariantSet.from_variants([stored])

    neighbour = factory.from_edit(chrom='chr24', start=24, ref='A',
                                  alt='C')
    self.assertEqual(len(s1.ovlp(neighbour)), 0)

    same_edit = factory.from_edit(chrom='chr24', start=23, ref='G',
                                  alt='GG')
    hits = s1.ovlp(same_edit, match_ambig=False)
    self.assertEqual(len(hits), 1)
def test_match2(self):
    # Matching of plain and ambiguous deletions in a CA repeat:
    #
    #          1
    #   R      TCACAG
    #   del1   T--CAG
    #   del2   TCA--G
    #
    # Expected number of matches is checked for a set holding the plain
    # deletion and for a set holding its ambiguous counterpart.
    del1 = variant.Del('chrom', 1, 3)
    adel1 = variant.AmbigDel('chrom', (1, 1), (5, 3), 'CA', '')
    del2 = variant.Del('chrom', 2, 4)
    adel2 = variant.AmbigDel('chrom', (1, 2), (5, 4), 'AC', '')

    plain_set = VariantSet.from_variants([del1])
    self.assertEqual(len(plain_set.match(adel1)), 1)
    self.assertEqual(len(plain_set.match(adel2)), 0)
    self.assertEqual(len(plain_set.match(adel2, match_ambig=True)), 1)

    ambig_set = VariantSet.from_variants([adel1])
    # (query, extra keyword arguments, expected number of matches)
    cases = (
        (adel1, {}, 1),
        (del1, {}, 1),
        (adel2, {}, 0),
        (adel2, {'match_ambig': True}, 1),
        (del2, {'match_ambig': True}, 1),
        (del2, {'match_ambig': False}, 0),
    )
    for query, options, expected in cases:
        self.assertEqual(len(ambig_set.match(query, **options)), expected)
def test_find_vrt(self):
    # An indexed file-backed set and an in-memory set loaded from the
    # same VCF must report the same number of unit variants, both for
    # a whole-chromosome query and for full iteration.
    path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
    ivfs = VariantSetFromFile(path, index=True)
    vs = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    def unit_total(variants):
        # Sum of unit variants over everything yielded.
        return sum(vrt.nof_unit_vrt() for vrt in variants)

    self.assertEqual(unit_total(ivfs.find_vrt('chr24')),
                     unit_total(vs.find_vrt('chr24')))
    self.assertEqual(unit_total(ivfs.iter_vrt()),
                     unit_total(vs.iter_vrt()))