Пример #1
0
 def test_cmp_vrt_iter_vrt2(self):
     # The diff of example1 against the gnomAD example is expected to hold
     # as many variants as example1 itself.
     vs1 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example1.vcf'))
     vs2 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example_gnomad_1.vcf.gz'))

     expected_count = len(list(vs1.iter_vrt()))
     only_in_first = list(vs1.diff_vrt(vs2).iter_vrt())
     self.assertEqual(len(only_in_first), expected_count)
Пример #2
0
 def test_cmp_vrt_iter_same(self):
     # Comparing a set with itself: the number of common variants must
     # equal the number of variants returned by an unrestricted find_vrt().
     path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
     vs = VariantSetFromFile(path)

     n_total = len(list(vs.find_vrt()))
     n_common = len(list(vs.comm_vrt(vs).iter_vrt()))
     self.assertEqual(n_common, n_total)
Пример #3
0
 def test_wrong_chrom_name_in_ref(self):
     # Open example1.vcf.gz together with the chr25.fasta reference and
     # check that a chr24 region query still yields two variants.
     # NOTE(review): presumably chr25.fasta does not contain the chr24
     # contig used by the VCF -- confirm against the fixture.
     ref = Reference(pkg_file(__name__, 'data/chr25.fasta'))
     try:
         vset = VariantSetFromFile(pkg_file('genomvar.test',
                                            'data/example1.vcf.gz'),
                                   reference=ref,
                                   index=True)
         found = list(vset.find_vrt(rgn='chr24:1200-1210'))
         self.assertEqual(len(found), 2)
     finally:
         # Release the reference even when construction or the assertion
         # above fails; otherwise the handle would stay open.
         ref.close()
Пример #4
0
 def test_diff_callback(self):
     # Compare example3.vcf with itself using a callback that collects the
     # VCF row of every matching variant; the value stored under 'cmp' must
     # start with the variant's own row, and 8 variants are expected.
     s1 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example3.vcf'))
     s2 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example3.vcf'))

     def matched_rows(matches):
         return [v.attrib['vcf_notation']['row'] for v in matches]

     # Count explicitly instead of relying on the loop variable of
     # enumerate(): with an empty iteration that name would be unbound and
     # the test would fail with NameError rather than an assertion error.
     count = 0
     for vrt in s1.comm_vrt(s2).iter_vrt(callback=matched_rows):
         self.assertEqual(vrt.attrib['vcf_notation']['row'],
                          vrt.attrib['cmp'][0])
         count += 1
     self.assertEqual(count, 8)
Пример #5
0
 def test_cmp_stream(self):
     # Total number of unit variants found only in example1 (relative to
     # example2) is expected to be 14.
     s1 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example1.vcf'))
     s2 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example2.vcf.gz'))

     only_in_first = s1.diff_vrt(s2).iter_vrt()
     unit_total = sum(vrt.nof_unit_vrt() for vrt in only_in_first)
     self.assertEqual(unit_total, 14)
Пример #6
0
    def test_index_errors(self):
        # Two failure modes for example1.vcf: asking for an index raises
        # NoIndexFoundError, and a region query on a set built without an
        # index raises ValueError mentioning that an index is required.
        vcf_path = pkg_file('genomvar.test', 'data/example1.vcf')

        with self.assertRaises(NoIndexFoundError):
            VariantSetFromFile(vcf_path, reference=self.chr24, index=True)

        unindexed = VariantSetFromFile(vcf_path, reference=self.chr24)
        with self.assertRaises(ValueError) as caught:
            list(unindexed.find_vrt('chr1', 1, 100))

        message = caught.exception.args[0].lower()
        self.assertIn('index is required', message)
Пример #7
0
    def test_find_vrt(self):
        # The indexed file-backed set and the VariantSet built from the same
        # VCF must report equal unit-variant totals, both for a chr24 query
        # and for full iteration.
        def unit_total(variants):
            return sum(v.nof_unit_vrt() for v in variants)

        ivfs = VariantSetFromFile(
            pkg_file('genomvar.test', 'data/example2.vcf.gz'),
            index=True)
        vs = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example2.vcf.gz'))

        from_file = unit_total(ivfs.find_vrt('chr24'))
        in_memory = unit_total(vs.find_vrt('chr24'))
        self.assertEqual(from_file, in_memory)

        from_file = unit_total(ivfs.iter_vrt())
        in_memory = unit_total(vs.iter_vrt())
        self.assertEqual(from_file, in_memory)
Пример #8
0
    def test_class(self):
        # Exercises find_vrt on example1.vcf.gz opened with INFO parsing,
        # the chr24 reference and the SAMP1 sample.
        path = pkg_file('genomvar.test', 'data/example1.vcf.gz')
        vset = VariantSetFromFile(path,
                                  parse_info=True,
                                  reference=self.chr24,
                                  parse_samples='SAMP1')

        # INFO values are attached to the variants returned by find_vrt
        hits = list(vset.find_vrt('chr24', 1200, 1210))
        self.assertEqual(len(hits), 2)
        self.assertEqual(hits[0].attrib['info']['NSV'], 1)
        self.assertEqual(hits[1].attrib['info']['RECN'], 19)

        # Multiallelic case: both returned variants carry AF of 0.5
        hits = list(vset.find_vrt('chr24', 20, 30))
        self.assertEqual(len(hits), 2)
        for hit in hits:
            self.assertEqual(hit.attrib['info']['AF'], 0.5)

        # Precision of find_vrt: each narrow window holds exactly one variant
        for start, end in ((2095, 2096), (2098, 2100)):
            window = list(vset.find_vrt('chr24', start, end))
            self.assertEqual(len(window), 1)

        # Unrestricted query returns all 16 variants; it is issued twice
        for _ in range(2):
            self.assertEqual(len(list(vset.find_vrt())), 16)
Пример #9
0
 def test_complex_info_example(self):
     # Looks at variants with alt 'C' in chr1:69090-69091 of the gnomAD
     # example and checks the prefix of the MutPred_Top5features INFO value;
     # the test fails if no such non-None value was seen at all.
     path = pkg_file('genomvar.test', 'data/example_gnomad_1.vcf.gz')
     vset = VariantSetFromFile(path, parse_info=True, index=True)

     saw_features = False
     for vrt in vset.find_vrt(rgn='chr1:69090-69091'):
         if vrt.alt == 'C':
             features = vrt.attrib['info']['MutPred_Top5features']
             if features is not None:
                 saw_features = True
                 self.assertTrue(
                     features.startswith('Loss of sheet (P = 0.0817)| L'))
     self.assertTrue(saw_features)
Пример #10
0
 def test_cmp_vrt_region_multisample2(self):
     # Common variants of the two 1000 Genomes examples within one region:
     # each yielded object must expose `attrib`, and the region must not be
     # empty.
     vs1 = VariantSetFromFile(pkg_file('genomvar.test',
                                       'data/example_1000genomes_1.vcf.gz'),
                              parse_samples=True,
                              index=True)
     vs2 = VariantSetFromFile(pkg_file('genomvar.test',
                                       'data/example_1000genomes_2.vcf.gz'),
                              parse_samples=True,
                              index=True)
     region = '7:152134922-152436005'

     # No list is accumulated here: it would be overwritten below anyway.
     for vrt in vs2.comm_vrt(vs1).region(rgn=region):
         self.assertTrue(hasattr(vrt, 'attrib'),
                         msg='No attrib on {}'.format(vrt))

     # A fresh comparison over the same region must yield something
     comm = list(vs2.comm_vrt(vs1).region(rgn=region))
     self.assertGreater(len(comm), 0)
Пример #11
0
 def test_cmp_vrt_iter_vrt(self):
     # example1 vs example2 with samples parsed: 4 common variants, each
     # with a truthy 'samples' attribute, and 12 variants only in example1.
     vs1 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example1.vcf.gz'),
         parse_samples=True)
     vs2 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example2.vcf.gz'),
         parse_samples=True)

     n_common = 0
     for vrt in vs1.comm_vrt(vs2).iter_vrt():
         n_common += 1
         self.assertTrue(vrt.attrib['samples'],
                         msg='Vrt {} has no samples'.format(vrt))
     self.assertEqual(n_common, 4)

     only_in_first = list(vs1.diff_vrt(vs2).iter_vrt())
     self.assertEqual(len(only_in_first), 12)
Пример #12
0
    def test_match(self):
        # Layout of the region used by the first check:
        # REF      TGG   TT
        #          2093  2099
        # vs1      CCC   GG
        # vrt            CG
        #          r1   r2,r3
        path = pkg_file('genomvar.test', 'data/example1.vcf.gz')
        vs1 = VariantSetFromFile(path, index=True)

        # TT -> CG edit at 2098 is expected to have exactly one match
        two_base_edit = factory.from_edit('chr24', 2098, 'TT', 'CG')
        matches = vs1.match(two_base_edit)
        self.assertEqual(len(matches), 1)

        # Insertion (AG -> AGG at 22) is expected to have exactly one match
        insertion = factory.from_edit('chr24', 22, 'AG', 'AGG')
        matches = vs1.match(insertion)
        self.assertEqual(len(matches), 1)
Пример #13
0
 def test_cmp_vrt_region(self):
     # Common variants of example1 and example2 in chr24:10040-10050: two
     # are expected, and the first one must carry the parsed INFO and
     # sample values checked below.
     vs1 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example1.vcf.gz'),
         parse_samples=True,
         parse_info=True,
         index=True)
     vs2 = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example2.vcf.gz'),
         parse_samples='SAMP1',
         parse_info=True,
         index=True)

     common = list(vs1.comm_vrt(vs2).region(rgn='chr24:10040-10050'))
     self.assertEqual(len(common), 2)

     first = common[0]
     self.assertEqual(first.attrib['info']['AF'], 1.0)
     self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (0, 1))
Пример #14
0
    def test_from_variants(self):
        # Build an in-memory VariantSet from the variants of example1.vcf and
        # check a region query, then check that from_variants rejects
        # variants that do not fit the chr24 reference.
        source_path = pkg_file('genomvar.test', 'data/example1.vcf')
        vfset = VariantSetFromFile(source_path)

        in_memory = VariantSet.from_variants(list(vfset.iter_vrt()))
        hits = list(in_memory.find_vrt('chr24', 1200, 1210))
        self.assertEqual(len(hits), 2)

        # First pair: position out of reference bounds.
        # Second pair: chromosome not present in the reference.
        for chrom, position in (('chr24', 10000000), ('chr2', 10)):
            with self.assertRaises(ValueError):
                VariantSet.from_variants(
                    list(vfset.iter_vrt()) +
                    [variant.SNP(chrom, position, 'C')],
                    reference=self.chr24)
Пример #15
0
    def test_unsorted_VCF_input(self):
        # Write a copy of example1.vcf with its record lines in reverse order
        # and check that diffing against it raises UnsortedVariantFileError.
        header = []
        records = []
        with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as fh:
            for line in fh:
                if line.startswith('#'):
                    header.append(line)
                else:
                    records.append(line)

        # The temporary file is written through its own handle and kept
        # inside a `with` so that it is closed (and removed) even when the
        # assertion fails, instead of lingering until garbage collection.
        with tempfile.NamedTemporaryFile(mode='wt', suffix='.vcf') as tf:
            tf.writelines(header)
            tf.writelines(reversed(records))
            # Make the content visible to readers opening the file by name
            tf.flush()

            vs1 = VariantSetFromFile(pkg_file('genomvar.test',
                                              'data/example1.vcf'))
            vs2 = VariantSetFromFile(tf.name)
            with self.assertRaises(UnsortedVariantFileError):
                list(vs1.diff_vrt(vs2).iter_vrt())
Пример #16
0
    def test_differently_sorted_chroms(self):
        # Rewrite example3.vcf with its chromosomes in the order
        # chr1, chr10, chr2 and check that diffing the original against the
        # rewritten file raises DifferentlySortedChromsError.
        source = pkg_file('genomvar.test', 'data/example3.vcf')
        s1 = VariantSetFromFile(source)

        header = []
        by_chrom = {}
        with open(source) as fh:
            for line in fh:
                if line.startswith('#'):
                    header.append(line)
                else:
                    # First whitespace-delimited field is the chromosome
                    chrom = line.split(maxsplit=1)[0]
                    by_chrom.setdefault(chrom, []).append(line)

        # The temporary file is written through its own handle and kept
        # inside a `with` so that it is closed (and removed) even when the
        # assertion fails, instead of lingering until garbage collection.
        with tempfile.NamedTemporaryFile(mode='wt', suffix='.vcf') as tf:
            tf.writelines(header)
            for chrom in ('chr1', 'chr10', 'chr2'):
                tf.writelines(by_chrom[chrom])
            # Make the content visible to readers opening the file by name
            tf.flush()

            s2 = VariantSetFromFile(tf.name)
            with self.assertRaises(DifferentlySortedChromsError):
                list(s1.diff_vrt(s2).iter_vrt())
Пример #17
0
    def test_find_vrt2(self):
        # Region-string queries on the indexed example1.vcf.gz: checks the
        # number of hits and the coordinates (and, for the last region, the
        # base variant types) of the returned variants.
        path = pkg_file('genomvar.test', 'data/example1.vcf.gz')
        vset = VariantSetFromFile(path, reference=self.chr24, index=True)

        def query(region):
            return list(vset.find_vrt(rgn=region))

        self.assertEqual(len(query('chr24:1200-1210')), 2)
        first, second = query('chr24:1200-1210')
        for vrt in (first, second):
            self.assertEqual([vrt.start, vrt.end], [1206, 1207])

        self.assertEqual(len(query('chr24:3200-3205')), 1)
        single = query('chr24:3200-3205')[0]
        self.assertEqual([single.start, single.end], [3201, 3202])

        self.assertEqual(len(query('chr24:20-30')), 2)
        first, second = query('chr24:20-30')
        self.assertEqual([first.start, first.end, type(first.base)],
                         [23, 24, variant.Del])
        self.assertEqual([second.start, second.end, type(second.base)],
                         [24, 25, variant.Ins])
Пример #18
0
def _cmp_vcf(f1, f2, out, match_partial=False, chunk=1000):
    """
    Writes comparison of two VCF files to a specified file handle.

    A VCF header is written first, followed by one row per compared
    variant.  Each row gets INFO fields telling which input contains the
    variant (``whichVCF``) and the line number(s) it originates from
    (``ln`` and, for variants present in both files, ``ln2``).

    Parameters
    ----------
    f1, f2
        Input VCF files; each is passed to ``_isindexed`` and
        ``VariantSetFromFile``.  A warning is issued for an input that is
        not indexed.
    out
        Object with a ``write`` method receiving the header and the rows.
    match_partial, chunk
        Accepted for interface compatibility; not used in this function.

    Returns
    -------
    dict
        Maps 0 (first only), 1 (second only) and 2 (both) to the summed
        ``nof_unit_vrt()`` of the variants in that category.  Variants
        skipped while writing (see below) are still counted.

    Notes
    -----
    If the writer raises ``ValueError`` for a ``Haplotype`` or ``Asterisk``
    variant, that variant is skipped; any other ``ValueError`` propagates.
    On ``BrokenPipeError`` while writing the process exits with status 1.
    """
    # Local import: keeps this block self-contained; sys.exit replaces the
    # site-injected ``exit`` helper, which is meant for interactive use.
    import sys

    # NOTE(review): 'numberin' below looks like a typo, but the text is
    # emitted in the output header, so it is left untouched here.
    info = [('VT', 1, 'String', 'Variant type'),
            ('whichVCF', 1, 'String',
             'Which input VCF contains the variant; first, second or both'),
            ('ln', 1, 'Integer',
             'Line number in input VCF variant originating from'),
            ('ln2', '.', 'Integer',
             'If whichVCF is both indicates line numberin the second file')]

    writer = VCFWriter(info_spec=info)
    out.write(writer.get_header())

    # Both inputs are opened the same way; indexing only affects the warning
    variant_sets = []
    for path in (f1, f2):
        if not _isindexed(path):
            warnings.warn(
                '{} not indexed; may impact performance.'.format(path))
        variant_sets.append(VariantSetFromFile(path))
    vs1, vs2 = variant_sets

    _which = {0: 'first', 1: 'second', 2: 'both'}
    nof_vrt = {i: 0 for i in _which}

    def matched_rows(matches):
        # Rows (in the other file) of the variants matching the current one
        return [v.attrib['vcf_notation']['row'] for v in matches]

    comparison = vs1._cmp_vrt(vs2, action='all')
    for which, vrt in comparison.iter_vrt(callback=matched_rows):
        nof_vrt[which] += vrt.nof_unit_vrt()

        # The variant's own row belongs to the second file only when it is
        # exclusive to it; otherwise it refers to the first file.
        origin = vs2 if which == 1 else vs1
        lineno = (vrt.attrib['vcf_notation']['row']
                  + origin.vcfreader.header_len + 1)

        lineno2 = None
        if which == 2:
            # Lines of the matching variants in the second file
            lineno2 = [
                vs2.vcfreader.header_len + n + 1 for n in vrt.attrib['cmp']
            ]

        vrt.attrib['info'] = {'whichVCF': _which[which],
                              'ln': lineno,
                              'ln2': lineno2}
        try:
            row = writer.get_row(vrt)
        except ValueError:
            if vrt.is_variant_instance(variant.Haplotype) \
                    or vrt.is_variant_instance(variant.Asterisk):
                continue
            # Bare raise keeps the original traceback intact
            raise

        try:
            out.write(str(row) + '\n')
        except BrokenPipeError:
            sys.exit(1)
    return nof_vrt
Пример #19
0
 def test_no_index(self):
     # Requesting an index for example3.vcf must raise NoIndexFoundError.
     vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
     with self.assertRaises(NoIndexFoundError):
         VariantSetFromFile(vcf_path, index=True)
Пример #20
0
 def test_find_nonexistent_chrom(self):
     # Querying 'chr24' on the 1000 Genomes example must give an empty
     # result rather than an error.
     vset = VariantSetFromFile(
         pkg_file('genomvar.test', 'data/example_1000genomes_1.vcf.gz'),
         index=True)
     hits = list(vset.find_vrt('chr24'))
     self.assertEqual(hits, [])
Пример #21
0
 def test_ctg_len_without_ref(self):
     # Without a reference, `chroms` of the indexed example1 set must be
     # exactly {'chr24'}.
     path = pkg_file('genomvar.test', 'data/example1.vcf.gz')
     vset = VariantSetFromFile(path, parse_samples='SAMP1', index=True)
     self.assertEqual(vset.chroms, {'chr24'})