def test_haplotypes(self):
    # A set holding a haplotype plus loose variants is rebuilt from its own
    # iter_vrt() output and compared against the source set and against a
    # set where the same four edits are grouped into two haplotypes.
    #
    # REF      TGG   TT    G-
    #          2093  2099  3200
    # varset1  CCC   GG
    #                CC    GG
    #          v1   v2,v3  v4
    edits = [
        ('chr24', 2093, 'TGG', 'CCC'),  # v1
        ('chr24', 2098, 'TT', 'GG'),    # v2
        ('chr24', 2098, 'TT', 'CC'),    # v3
        ('chr24', 3200, 'G', 'GG'),     # v4
    ]
    v1, v2, v3, v4 = [factory.from_edit(*edit) for edit in edits]

    original = VariantSet.from_variants(
        [variant.Haplotype.from_variants([v1, v2]), v3, v4])
    rebuilt = VariantSet.from_variants(original.iter_vrt())

    found = list(rebuilt.find_vrt('chr24', 2090, 2095, expand=True))
    self.assertEqual(len(found), 2)
    self.assertEqual(rebuilt.nof_unit_vrt(), 8)
    self.assertEqual(original.diff(rebuilt).nof_unit_vrt(), 0)
    leftover = list(rebuilt.diff_vrt(original).iter_vrt())
    self.assertEqual(len(leftover), 0)

    # Same edits, but grouped as two haplotypes.
    first_hap = variant.Haplotype.from_variants([v1, v2])
    second_hap = variant.Haplotype.from_variants([v3, v4])
    regrouped = VariantSet.from_variants([first_hap, second_hap])
    self.assertEqual(regrouped.diff(rebuilt).nof_unit_vrt(), 0)
    self.assertEqual(rebuilt.diff(regrouped).nof_unit_vrt(), 0)
def test_diff_vrt(self):
    # s1.diff(s2) is expected to hold four unit variants; the first and the
    # last of them are checked explicitly.
    #
    # REF      G     TTGG     C
    #          1206  2093     10044
    # s1       C     CCC      T
    #          G     CCC      G
    #                         C
    #                2092     10044
    # s2             TT       T
    #                AC       T
    def build(edits):
        # One factory call per (chrom, start, ref, alt) tuple.
        return VariantSet.from_variants(
            [factory.from_edit(*edit) for edit in edits])

    s1 = build([
        ('chr24', 1206, 'G', 'C'),
        ('chr24', 2093, 'TG', 'CC'),
        ('chr24', 2095, 'G', 'C'),
        ('chr24', 10044, 'C', 'T'),
        ('chr24', 10044, 'C', 'G'),
    ])
    s2 = build([
        ('chr24', 2092, 'TT', 'AC'),
        ('chr24', 10044, 'C', 'T'),
    ])

    difference = s1.diff(s2)
    self.assertEqual(difference.nof_unit_vrt(), 4)

    # Unpacking enforces that exactly four variants are yielded.
    head, _second, _third, tail = list(difference.iter_vrt())
    self.assertEqual([head.start, head.ref, head.alt], [1206, 'G', 'C'])
    self.assertEqual([tail.start, tail.ref, tail.alt], [10044, 'C', 'G'])
def test_diff_of_ambig_indel(self):
    # Two deletions placed at different positions of the same T-run; with
    # default options the difference currently holds one variant.
    #
    #         9945
    #         CTTTTTCAT
    # s1      CTT--TCAT
    # s2      C--TTTCAT
    right_del = factory.from_edit('chr24', 9947, 'TT', 'T')
    left_del = factory.from_edit('chr24', 9945, 'CTT', 'C')
    s1 = VariantSet.from_variants([right_del])
    s2 = VariantSet.from_variants([left_del])

    # TODO fix/reconsider -- disabled check kept for reference:
    # diff1 = s1.diff(s2, match_partial=False).iter_vrt()
    # self.assertEqual(len(list(diff1)), 0)

    remaining = list(s1.diff(s2).iter_vrt())
    self.assertEqual(len(remaining), 1)  # TODO fix/reconsider
def test_ambig_difference_snp_in_locus(self):
    # An insertion in s1 versus a substitution and an insertion in s2 at
    # position 10044: nothing is expected in common even with
    # match_ambig=True.
    #
    #         10043
    # Ref     TC-ACA--G
    # v1 s1         CA
    # v1 s2    G
    # v2 s2     T
    norm_factory = variant.VariantFactory(reference=self.chr24,
                                          normindel=True)
    ins_ca = norm_factory.from_edit('chr24', 10047, 'A', 'ACA')
    s1 = VariantSet.from_variants([ins_ca])

    subst = norm_factory.from_edit('chr24', 10044, 'C', 'G')
    ins_t = norm_factory.from_edit('chr24', 10044, 'C', 'CT')
    s2 = VariantSet.from_variants([subst, ins_t])

    shared = list(s1.comm(s2, match_ambig=True).iter_vrt())
    self.assertEqual(len(shared), 0)
def test_match_with_haplotypes(self):
    # Matching a haplotype against a set of haplotypes: the single reported
    # match is expected to be keyed by the leftmost variant of the query and
    # to point at the leftmost variant of the first stored haplotype.
    #
    # REF      TGG   TT    G|
    #          2093  2099  3200
    # varset1  CCC   GG
    #                CC    GG
    #          r1   r2,r3  r4
    def leftmost(haplotype):
        # Variant of the haplotype with the smallest start coordinate.
        return sorted(haplotype.variants, key=lambda o: o.start)[0]

    r1 = factory.from_edit('chr24', 2093, 'TGG', 'CCC')
    r2 = factory.from_edit('chr24', 2098, 'TT', 'GG')
    r3 = factory.from_edit('chr24', 2098, 'TT', 'CC')
    r4 = factory.from_edit('chr24', 3200, 'GG', 'G')

    hap1 = Haplotype.from_variants([r1, r2])
    ccc1 = leftmost(hap1)
    hap2 = Haplotype.from_variants([r3, r4])
    s1 = VariantSet.from_variants([hap1, hap2])

    query = Haplotype.from_variants([r1, r4])
    ccc2 = leftmost(query)

    match = s1.match(query)
    self.assertEqual(len(match), 1)

    observed = {}
    for key, hits in match.items():
        observed[key] = [hit.base for hit in hits]
    expected = {ccc2.key: [ccc1]}

    self.assertEqual(len(observed), len(expected))
    self.assertEqual(list(observed.keys()), list(expected.keys()))
    for key in observed:
        pairs = zip(observed[key], expected[key])
        self.assertTrue(all([got.edit_equal(want) for got, want in pairs]))
def test_variant_cluster2(self):
    # Three adjacent SNPs are shared by both sets; the second set carries an
    # extra insertion.  With match_partial=False exactly the three SNPs are
    # expected in common.
    def snp_cluster():
        # Fresh SNP objects for each set, as in separate constructions.
        return [
            variant.SNP('chr1', 13366968, 'A'),
            variant.SNP('chr1', 13366969, 'T'),
            variant.SNP('chr1', 13366970, 'G'),
        ]

    vs1 = VariantSet.from_variants(snp_cluster())
    vs2 = VariantSet.from_variants(
        snp_cluster() + [variant.Ins('chr1', 13366970, 'TTT')])

    common = vs1.comm(vs2, match_partial=False)
    self.assertEqual(len(list(common.iter_vrt())), 3)
def test_from_variants_vcf(self):
    # Round trip: variants parsed from example1.vcf are written with
    # to_vcf() under two INFO specifications (explicit tuples, and the spec
    # read from the parsed set's dtype), then read back with VCFReader and
    # compared to the originals.
    vs0 = varset.VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example1.vcf'), parse_info=True)
    variants1 = sorted(vs0.iter_vrt(), key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    _desc = 'Test for multinumber field'
    expected_header = \
        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="{}">'\
        .format(_desc)
    info_spec_tuples = [('DP4', 4, 'Integer', _desc), ('NSV', 1, 'Integer')]
    info_spec_dict = vs0.dtype['info']

    for info_spec in (info_spec_tuples, info_spec_dict):
        # The context manager closes the temporary file on every exit
        # path, including a failing assertion, so no handle is leaked per
        # iteration.  The file is re-opened by name, as before.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(fh, info_spec=info_spec)

            with open(tf.name, 'rt') as fh:
                self.assertIn(expected_header, fh.read().splitlines())

            variants2 = sorted(
                VCFReader(tf.name).iter_vrt(parse_info=True),
                key=lambda v: v.key)
            # Length check first: zip() below would silently truncate.
            self.assertEqual(len(variants1), len(variants2))
            for v1, v2 in zip(variants1, variants2):
                self.assertTrue(v1.edit_equal(v2))
                self.assertEqual(v1.attrib['info']['NSV'],
                                 v2.attrib['info']['NSV'])
def test_from_variants_to_vcf_with_sampdata(self):
    # Round trip with sample data: variants from example3.vcf are written
    # with an explicit FORMAT spec for sample SAMP1, read back, and the AD
    # values of every variant are compared with the originals.
    vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
    variants1 = sorted(VCFReader(vcf_path).iter_vrt(parse_samples=True),
                       key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    expected_header = \
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="">'

    # The context manager closes the temporary file on every exit path,
    # including a failing assertion.  The file is re-opened by name, as
    # before.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(
                fh,
                format_spec=[RESERVED_FORMAT.GT,
                             ('AD', 'R', 'Integer', '')],
                samples=['SAMP1'])

        with open(tf.name, 'rt') as fh:
            self.assertIn(expected_header, fh.read().splitlines())

        variants2 = sorted(
            VCFReader(tf.name).iter_vrt(parse_samples=True),
            key=lambda v: v.key)
        # Length check first: zip() below would silently truncate.
        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['samples']['SAMP1']['AD'],
                             v2.attrib['samples']['SAMP1']['AD'])
def test_mnp_com_split(self):
    # A two-base substitution given as one edit (vset1) versus the same
    # change given as two single-base edits (vset2).
    #
    #                        23
    #    TTCACTTAGCATAATGTCTTCAAGATT
    # v1                       TT     -single
    # v2                       TT     -splitted
    single = factory.from_edit('chr24', 22, 'AG', 'TT')
    split = [
        factory.from_edit('chr24', 22, 'A', 'T'),
        factory.from_edit('chr24', 23, 'G', 'T'),
    ]
    vset1 = VariantSet.from_variants([single])
    vset2 = VariantSet.from_variants(split)

    shared = list(vset1.comm(vset2).iter_vrt())
    first = shared[0]
    self.assertEqual([first.start, first.ref], [22, 'AG'])

    only_in_first = list(vset1.diff(vset2).iter_vrt())
    self.assertFalse(only_in_first)
def test_strip_order_dependent_Ambig(self):
    # The same CA insertion written at two positions of a CA repeat: the
    # difference is expected to be empty with match_ambig=True and to hold
    # one variant with match_ambig=False.
    #
    #         10043
    # R       T--CA--CAG
    # v1      TCACA--CAG
    # v2      T--CACACAG
    reference_path = pkg_file('genomvar.test', 'data/chr24.fna')
    norm_factory = variant.VariantFactory(reference=reference_path,
                                          normindel=True)
    left_ins = norm_factory.from_edit('chr24', 10043, 'T', 'TCA')
    right_ins = norm_factory.from_edit('chr24', 10045, 'A', 'ACA')
    s1 = VariantSet.from_variants([left_ins])
    s2 = VariantSet.from_variants([right_ins])

    with_ambig = list(s1.diff(s2, match_ambig=True).iter_vrt())
    self.assertEqual(len(with_ambig), 0)

    without_ambig = list(s1.diff(s2, match_ambig=False).iter_vrt())
    self.assertEqual(len(without_ambig), 1)
def test_ambig_difference_different_ambig(self):
    # Each set holds one insertion and one deletion of CA, written at
    # opposite ends of the repeat; with match_ambig=True the difference is
    # expected to be empty.
    #
    #         10043
    # Ref     T--CA--CA--G
    # v1 s1   T--CA--CACAG   ins CA right
    # v2 s1   T------CA--G   del CA left
    # v1 s2   TCACA--CA--G   ins CA left
    # v2 s2   T--CA------G   del CA right
    norm_factory = variant.VariantFactory(reference=self.chr24,
                                          normindel=True)

    ins_right = norm_factory.from_edit('chr24', 10047, 'A', 'ACA')
    del_left = norm_factory.from_edit('chr24', 10043, 'TCA', 'T')
    s1 = VariantSet.from_variants([ins_right, del_left])

    ins_left = norm_factory.from_edit('chr24', 10043, 'T', 'TCA')
    del_right = norm_factory.from_edit('chr24', 10045, 'ACA', 'A')
    s2 = VariantSet.from_variants([ins_left, del_right])

    remaining = list(s1.diff(s2, match_ambig=True).iter_vrt())
    self.assertEqual(len(remaining), 0)
def test_from_variants(self):
    # Builds a VariantSet from the variants of example1.vcf, checks a
    # region lookup, then checks that from_variants() raises ValueError
    # when a reference is given and a variant does not fit it.
    vfset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf'))
    vset = VariantSet.from_variants(list(vfset.iter_vrt()))

    in_region = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(in_region), 2)

    offending = (
        ('chr24', 10000000),  # out of reference bounds
        ('chr2', 10),         # chromosome not in reference
    )
    for chrom, position in offending:
        with self.assertRaises(ValueError):
            VariantSet.from_variants(
                list(vfset.iter_vrt())
                + [variant.SNP(chrom, position, 'C')],
                reference=self.chr24)
def test_in_the_middle(self):
    # A three-base substitution minus a SNP at its middle base: with
    # default options the two flanking single-base changes remain; with
    # match_partial=False one variant remains.
    #
    #      2
    #    TTCACTTAGCAT
    # v1   GGG
    # v2    G
    s1 = VariantSet.from_variants(
        [factory.from_edit('chr24', 2, 'CAC', 'GGG')])
    s2 = VariantSet.from_variants(
        [factory.from_edit('chr24', 3, 'A', 'G')])

    remaining = list(s1.diff(s2).iter_vrt())
    self.assertEqual(len(remaining), 2)
    left, right = remaining
    self.assertEqual([left.start, left.ref, left.alt], [2, 'C', 'G'])
    self.assertEqual([right.start, right.ref, right.alt], [4, 'C', 'G'])

    strict = list(s1.diff(s2, match_partial=False).iter_vrt())
    self.assertEqual(len(strict), 1)
def test_VariantSet_cmp2(self):
    # A set rebuilt from its own variants in reversed order is compared
    # with the original: empty diff, and comm of full size both ways.
    source = pkg_file('genomvar.test', 'data/example3.vcf')
    s1 = VariantSet.from_vcf(source)
    backwards = reversed(list(s1.iter_vrt()))
    s2 = VariantSet.from_variants(backwards)

    self.assertEqual(s1.diff(s2).nof_unit_vrt(), 0)

    forward_common = s1.comm(s2)
    self.assertEqual(forward_common.nof_unit_vrt(), s1.nof_unit_vrt())
    self.assertEqual(s2.comm(s1).nof_unit_vrt(),
                     forward_common.nof_unit_vrt())
def test_to_vcf_no_ref(self):
    # Variants constructed without reference bases are written to an
    # in-memory VCF with the reference supplied to to_vcf(), parsed back,
    # and both are expected in common with the original set.
    deletion = variant.Del("chr24", 23, 24)
    snp = variant.SNP("chr24", 1206, "C")
    vs1 = VariantSet.from_variants([deletion, snp])

    stream = io.StringIO()
    vs1.to_vcf(stream, reference=self.chr24)
    stream.seek(0)  # rewind so from_vcf reads from the start

    vs2 = VariantSet.from_vcf(stream)
    shared = vs1.comm(vs2)
    self.assertEqual(shared.nof_unit_vrt(), 2)
def test_many_chroms_shuffled(self):
    # The variants of example3.vcf are re-ordered with a fixed permutation
    # and loaded into a second set; comm/diff are checked in both
    # directions.
    vs1 = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example3.vcf'))
    in_order = list(vs1.iter_vrt())

    permutation = (0, 7, 1, 6, 2, 5, 3, 4)
    shuffled = [in_order[index] for index in permutation]
    vs2 = VariantSet.from_variants(shuffled)

    for left, right in ((vs1, vs2), (vs2, vs1)):
        self.assertEqual(left.comm(right).nof_unit_vrt(),
                         left.nof_unit_vrt())
        self.assertEqual(left.diff(right).nof_unit_vrt(), 0)
def test_ovlp(self):
    # ovlp() of a set holding one insertion: a substitution at the next
    # position yields nothing, an identical insertion yields one hit.
    #
    #                        23
    # TTCACTTAGCATAATGTCTTCAAG|ATT
    #                         G
    #                          C <- not interfering
    stored = factory.from_edit(chrom='chr24', start=23, ref='G', alt='GG')
    s1 = VariantSet.from_variants([stored])

    neighbour = factory.from_edit(chrom='chr24', start=24, ref='A',
                                  alt='C')
    hits = s1.ovlp(neighbour)
    self.assertEqual(len(hits), 0)

    same_ins = factory.from_edit(chrom='chr24', start=23, ref='G',
                                 alt='GG')
    hits = s1.ovlp(same_ins, match_ambig=False)
    self.assertEqual(len(hits), 1)
def test_match2(self):
    # match() between plain and ambiguous deletions, with and without
    # match_ambig; each case lists the query, the keyword arguments and
    # the expected number of matches.
    #
    #      1
    # R    TCACAG
    # del1 T--CAG
    # del2 TCA--G
    del1 = variant.Del('chrom', 1, 3)
    adel1 = variant.AmbigDel('chrom', (1, 1), (5, 3), 'CA', '')
    del2 = variant.Del('chrom', 2, 4)
    adel2 = variant.AmbigDel('chrom', (1, 2), (5, 4), 'AC', '')

    # Set holding the plain deletion.
    vs = VariantSet.from_variants([del1])
    plain_cases = [
        (adel1, {}, 1),
        (adel2, {}, 0),
        (adel2, {'match_ambig': True}, 1),
    ]
    for query, options, expected in plain_cases:
        self.assertEqual(len(vs.match(query, **options)), expected)

    # Set holding the ambiguous deletion.
    vs2 = VariantSet.from_variants([adel1])
    ambig_cases = [
        (adel1, {}, 1),
        (del1, {}, 1),
        (adel2, {}, 0),
        (adel2, {'match_ambig': True}, 1),
        (del2, {'match_ambig': True}, 1),
        (del2, {'match_ambig': False}, 0),
    ]
    for query, options, expected in ambig_cases:
        self.assertEqual(len(vs2.match(query, **options)), expected)
def test_variant_cluster(self):
    # Three adjacent SNPs are diffed against an unrelated deletion plus
    # insertion, and against a single MNP spelling the same three bases,
    # with match_partial switched on and off.
    snps = [
        variant.SNP('chr1', 13366968, 'A'),
        variant.SNP('chr1', 13366969, 'T'),
        variant.SNP('chr1', 13366970, 'G'),
    ]
    vs1 = VariantSet.from_variants(snps)

    indels = [
        variant.Del('chr1', 13366967, 13366969),
        variant.Ins('chr1', 13366971, 'TG'),
    ]
    vs2 = VariantSet.from_variants(indels)
    self.assertEqual(len(list(vs1.diff(vs2).iter_vrt())), 3)

    vs3 = VariantSet.from_variants([variant.MNP("chr1", 13366968, 'ATG')])
    cases = [
        # (left, right, match_partial, expected size of left - right)
        (vs1, vs3, True, 0),
        (vs3, vs1, True, 0),
        (vs1, vs3, False, 3),
        (vs3, vs1, False, 1),
    ]
    for left, right, partial, expected in cases:
        remaining = list(
            left.diff(right, match_partial=partial).iter_vrt())
        self.assertEqual(len(remaining), expected)
def test_from_variants_with_attributes(self):
    # Attributes parsed with parse_info=True are expected to be reachable
    # both on variants returned by find_vrt() and in to_records() output.
    vcf_reader = VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
    parsed = list(vcf_reader.iter_vrt(parse_info=True))
    vset = VariantSet.from_variants(parsed)

    in_region = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(in_region), 2)

    first, second = in_region[0], in_region[1]
    self.assertEqual(first.attrib['info']['NSV'], 1)
    self.assertEqual(first.attrib['id'], '5')
    self.assertEqual(second.attrib['id'], None)

    records = vset.to_records()
    self.assertEqual(records[0]['attrib']['info']['NSV'], 2)
def test_consistency_of_diff_and_com(self):
    # comm() is expected to give the same count in both directions, and
    # for each set: |common| + |set minus other| == |set|, counted in unit
    # variants with match_ambig=True.
    #
    # REF      G     TTGG         CACAGTTC---CA-C
    #          1206   2093        10044
    # s1       C      CCC         T        CC    G
    #          C      CCC         G        CC
    #                 2092        10044
    # s2              TT          T        CC
    #                 AC          T              G
    vfac = VariantFactory(reference=self.chr24, normindel=True)
    variants1 = [('chr24', 1206, 'G', 'C'),
                 ('chr24', 2093, 'TG', 'CC'),
                 ('chr24', 2095, 'G', 'C'),
                 ('chr24', 10044, 'C', 'T'),
                 ('chr24', 10044, 'C', 'G'),
                 ('chr24', 10051, 'C', 'CCC')]
    variants2 = [('chr24', 2092, 'TT', 'AC'),
                 ('chr24', 10044, 'C', 'T'),
                 ('chr24', 10052, 'C', 'CCC'),
                 ('chr24', 10053, 'A', 'AG')]
    s1 = VariantSet.from_variants([vfac.from_edit(*v) for v in variants1])
    s2 = VariantSet.from_variants([vfac.from_edit(*v) for v in variants2])

    com1 = s1.comm(s2, match_ambig=True)
    com2 = s2.comm(s1, match_ambig=True)
    s1_s2 = s1.diff(s2, match_ambig=True)
    s2_s1 = s2.diff(s1, match_ambig=True)

    nof = {
        'com1': com1.nof_unit_vrt(),
        # Fixed: this was computed from com1, which made the symmetry
        # assertion below vacuous and left com2 unused.
        'com2': com2.nof_unit_vrt(),
        's1': s1.nof_unit_vrt(),
        's2': s2.nof_unit_vrt(),
        's1_s2': s1_s2.nof_unit_vrt(),
        's2_s1': s2_s1.nof_unit_vrt(),
    }

    self.assertEqual(nof['com1'], 3)
    self.assertEqual(nof['com2'], nof['com1'])
    self.assertEqual(nof['com1'] + nof['s1_s2'], nof['s1'])
    self.assertEqual(nof['com1'] + nof['s2_s1'], nof['s2'])
def test_from_variants_to_records(self):
    # to_records() on a set holding two indels and a two-variant haplotype
    # is expected to return four records with a fixed field order.
    norm_factory = variant.VariantFactory(reference=self.chr24,
                                          normindel=True)
    haplotype = Haplotype.from_variants([
        norm_factory.from_edit('chr24', 1207, 'G', 'C'),
        norm_factory.from_edit('chr24', 1207, 'G', 'T'),
    ])
    insertion = norm_factory.from_edit('chr24', 10043, 'T', 'TCA')
    deletion = norm_factory.from_edit('chr24', 10045, 'ACA', 'A')
    vs = VariantSet.from_variants([insertion, deletion, haplotype])

    records = vs.to_records()
    self.assertEqual(records.shape, (4, ))

    expected_fields = [
        'chrom', 'start', 'end', 'ref', 'alt', 'vartype', 'phase_group',
        'attrib'
    ]
    self.assertEqual(list(records.dtype.fields), expected_fields)
def test_from_variants_to_vcf_with_info(self):
    # to_vcf() with an explicit INFO spec: malformed specs must raise
    # ValueError mentioning 'INFO spec'; a valid spec must produce the
    # matching header line and a file that reads back to the same variants.
    variants1 = sorted(
        VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
        .iter_vrt(parse_info=True),
        key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    # Test Invalid specs
    invalid_specs = [('NSV', ),
                     ('NSV', 1, 'Integedr'),
                     ('NSV', 'C', 'Integer', 'Number of Simple Variants')]
    _buf = io.StringIO()
    for spec in invalid_specs:
        with self.assertRaises(ValueError) as cm:
            vs.to_vcf(_buf, info_spec=[spec])
        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn('INFO spec', cm.exception.args[0])

    expected_header = ('##INFO=<ID=NSV,Number=1,Type=Integer,'
                       'Description="Number of Simple Variants">')

    # The context manager closes the temporary file on every exit path,
    # including a failing assertion.  The file is re-opened by name, as
    # before.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(fh, info_spec=[
                ('NSV', 1, 'Integer', 'Number of Simple Variants'),
                ('AF', 'A', 'Float', '', 'source', 'version')
            ])

        with open(tf.name, 'rt') as fh:
            self.assertIn(expected_header, fh.read().splitlines())

        variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                           key=lambda v: v.key)
        # Length check first: zip() below would silently truncate.
        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['info']['NSV'],
                             v2.attrib['info']['NSV'])
def test_diff_del(self):
    # Two sets built from the same deletion object: empty difference, one
    # variant in common.
    deletion = variant.Del(chrom="chr1", start=6751613, end=6751627)
    vs1 = VariantSet.from_variants([deletion])
    vs2 = VariantSet.from_variants([deletion])

    only_first = list(vs1.diff(vs2).iter_vrt())
    self.assertEqual(len(only_first), 0)

    shared = list(vs1.comm(vs2).iter_vrt())
    self.assertEqual(len(shared), 1)
def test_diff_ins(self):
    # Two sets built from the same insertion object: empty difference, one
    # variant in common.
    insertion = variant.Ins(chrom="chr1", start=6751613, alt='AGTC')
    vs1 = VariantSet.from_variants([insertion])
    vs2 = VariantSet.from_variants([insertion])

    only_first = list(vs1.diff(vs2).iter_vrt())
    self.assertEqual(len(only_first), 0)

    shared = list(vs1.comm(vs2).iter_vrt())
    self.assertEqual(len(shared), 1)