示例#1
0
 def test_load_burden(self):
     """Check what load_burden appends to the list it is given.

     Two burden files (``B`` and ``BM``) are loaded into fresh lists and
     compared with the expected ``(name, [regions])`` tuples; a missing
     path and a file of the wrong kind must raise.
     """
     # burden file where every entry has a single region
     loaded = []
     load_burden(B, loaded)
     expected = [
         ('CDS1', ['FM211187:3910-3951']),
         ('CDS2', ['FM211187:4006-4057']),
     ]
     self.assertEqual(loaded, expected)

     # burden file whose last entry spans two regions
     loaded = []
     load_burden(BM, loaded)
     expected = [
         ('CDS1', ['FM211187:3910-3951']),
         ('CDS2', ['FM211187:4006-4057']),
         ('CDS3', ['FM211187:3910-3951', 'FM211187:4006-4057']),
     ]
     self.assertEqual(loaded, expected)

     # no file found
     with self.assertRaises(FileNotFoundError):
         load_burden('nope', [])

     # not the right file
     with self.assertRaises(ValueError):
         load_burden(P, [])
示例#2
0
 def test_read_variant_vcf(self):
     """Exercise read_variant on VCF input.

     Scenarios, in order: a plain read of the first variant, a call with
     an empty sample set, a burden read, the "uncompressed" flag,
     non-VCF variant types, reading until the end of the file, and
     passing file handles that are not VCF readers.
     """
     p = pd.read_csv(P, index_col=0, sep='\t')['binary']
     infile = VariantFile(VCF)
     # release the VCF handle even when an assertion below fails
     self.addCleanup(infile.close)

     t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     # abs() is applied element-wise *before* max(): taking abs() of the
     # maximum would let negative deviations hide behind a zero maximum
     self.assertEqual(np.abs(k - np.zeros(50)).max(), 0.0)
     self.assertEqual(var_name, 'FM211187_16_G_A')
     self.assertEqual(kstrains, [])
     self.assertEqual(nkstrains,
                      sorted(['sample_%d' % x for x in range(1, 51)]))
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)

     # not providing samples
     t = read_variant(infile, p, 'vcf', False, [], False, set(), [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertIsNone(k)
     self.assertIsNone(var_name)
     self.assertIsNone(kstrains)
     self.assertIsNone(nkstrains)
     self.assertIsNone(af)

     # providing burden
     burden_regions = deque([])
     load_burden(B, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf', True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
     self.assertEqual(var_name, 'CDS1')
     self.assertEqual(kstrains, [])
     self.assertEqual(
         nkstrains,
         ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)

     # uncompressed option - no effect
     t = read_variant(infile, p.head(5), 'vcf', False, [], True,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(np.abs(k - np.array([0, 1, 0, 0, 0])).max() < 1E-7)
     self.assertEqual(var_name, 'FM211187_3982_C_A')
     self.assertEqual(kstrains, ['sample_2'])
     self.assertEqual(nkstrains,
                      ['sample_1', 'sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.2)
     self.assertEqual(missing, 0.0)

     # different type
     # (the calls are expected to raise, so their result is not bound;
     # ``t`` keeps the value of the last successful read above)
     with self.assertRaises(AttributeError):
         read_variant(infile, p.head(5), 'kmers', False, [], True,
                      p.head(5).index, [])
     with self.assertRaises(AttributeError):
         read_variant(infile, p.head(5), 'Rtab', False, [], False,
                      p.head(5).index, [])

     # read until exhaustion
     while not t[0]:
         t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertIsNone(k)
     self.assertIsNone(var_name)
     self.assertIsNone(kstrains)
     self.assertIsNone(nkstrains)
     self.assertIsNone(af)
     self.assertIsNone(missing)

     # different file
     # plain file objects are opened in ``with`` blocks so they are
     # closed once the expected failure has been observed
     with gzip.open(KMER) as kmer_handle:
         with self.assertRaises(AttributeError):
             read_variant(kmer_handle, p.head(5), 'vcf', False, [], False,
                          p.head(5).index, [])
     with open(PRES) as pres_handle:
         with self.assertRaises(AttributeError):
             read_variant(pres_handle, p.head(5), 'vcf', False, [], False,
                          p.head(5).index, [])
示例#3
0
 def test_read_variant_vcf(self):
     """Exercise read_variant on VCF input, including regressions.

     Scenarios, in order: a plain read of the first variant, a call with
     an empty sample set, burden reads (single- and multi-region
     entries), the "uncompressed" flag, non-VCF variant types, reading
     until the end of the file, file handles that are not VCF readers,
     and the regression checks for issues #90 and #120.
     """
     first_five = ['sample_1', 'sample_2', 'sample_3',
                   'sample_4', 'sample_5']

     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCF)
     # every VCF handle opened below is registered for cleanup so it is
     # released even when an assertion fails
     self.addCleanup(infile.close)
     t = read_variant(infile, p, 'vcf',
                      False, [], False,
                      p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     # abs() is applied element-wise *before* max(): taking abs() of the
     # maximum would let negative deviations hide behind a zero maximum
     self.assertEqual(np.abs(k - np.zeros(50)).max(), 0.0)
     self.assertEqual(var_name,
                      'FM211187_16_G_A')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains,
                      sorted(['sample_%d' % x
                              for x in range(1, 51)]))
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)

     # not providing samples
     t = read_variant(infile, p, 'vcf',
                      False, [], False,
                      set(), [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertIsNone(k)
     self.assertIsNone(var_name)
     self.assertIsNone(kstrains)
     self.assertIsNone(nkstrains)
     self.assertIsNone(af)

     # providing burden
     burden_regions = deque([])
     load_burden(B, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(
         np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
     self.assertEqual(var_name,
                      'CDS1')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains, first_five)
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)

     # providing burden with a multi-region entry
     burden_regions = deque([])
     load_burden(BM, burden_regions)
     # last one has multiple regions
     burden_regions.reverse()
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(
         np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
     self.assertEqual(var_name,
                      'CDS3')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains, first_five)
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0)

     # uncompressed option - no effect
     infile = VariantFile(VCF)
     self.addCleanup(infile.close)
     t = read_variant(infile, p.head(5), 'vcf',
                      False, [], True,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(
         np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
     self.assertEqual(var_name,
                      'FM211187_16_G_A')
     self.assertEqual(kstrains,
                      [])
     self.assertEqual(nkstrains, first_five)
     self.assertEqual(af, 0.0)
     self.assertEqual(missing, 0.0)

     # different type
     # (the calls are expected to raise, so their result is not bound;
     # ``t`` keeps the value of the last successful read above)
     with self.assertRaises(AttributeError):
         read_variant(infile, p.head(5), 'kmers',
                      False, [], True,
                      p.head(5).index, [])
     with self.assertRaises(AttributeError):
         read_variant(infile, p.head(5), 'Rtab',
                      False, [], False,
                      p.head(5).index, [])

     # read until exhaustion
     while not t[0]:
         t = read_variant(infile, p, 'vcf',
                          False, [], False,
                          p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, True)
     self.assertIsNone(k)
     self.assertIsNone(var_name)
     self.assertIsNone(kstrains)
     self.assertIsNone(nkstrains)
     self.assertIsNone(af)
     self.assertIsNone(missing)

     # different file
     # plain file objects are opened in ``with`` blocks so they are
     # closed once the expected failure has been observed
     with gzip.open(KMER) as kmer_handle:
         with self.assertRaises(AttributeError):
             read_variant(kmer_handle, p.head(5), 'vcf',
                          False, [], False,
                          p.head(5).index, [])
     with open(PRES) as pres_handle:
         with self.assertRaises(AttributeError):
             read_variant(pres_handle, p.head(5), 'vcf',
                          False, [], False,
                          p.head(5).index, [])

     # burden with missing genotypes in last read variant
     # issue #90
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCFMISSING)
     self.addCleanup(infile.close)
     burden_regions = deque([])
     load_burden(BMISSING, burden_regions)
     t = read_variant(infile, p.head(5), 'vcf',
                      True, burden_regions, False,
                      p.head(5).index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(eof, False)
     self.assertTrue(
         np.abs(k - np.array([1, 1, 0, 0, 0])).max() < 1E-7)
     self.assertEqual(var_name,
                      'CDS1')
     self.assertEqual(kstrains,
                      ['sample_1', 'sample_2'])
     self.assertEqual(nkstrains,
                      ['sample_3', 'sample_4', 'sample_5'])
     self.assertEqual(af, 0.4)
     self.assertEqual(missing, 0)

     # check that missing variants are properly missed
     # issue #120
     p = pd.read_csv(P,
                     index_col=0,
                     sep='\t')['binary']
     infile = VariantFile(VCFMISSING)
     self.addCleanup(infile.close)
     variant = next(infile)
     # recompute the missing fraction of the first record by hand:
     # count every genotype entry of the phenotyped samples and how
     # many of those entries are absent
     total = 0
     n_missing = 0
     for sample, call in variant.samples.items():
         if sample not in p.index:
             continue
         for haplotype in call.get('GT', [None]):
             if haplotype is None or haplotype == '.':
                 n_missing += 1
             total += 1
     pysam_missing = n_missing / float(total)
     # reopen the file so read_variant sees the same first record
     infile = VariantFile(VCFMISSING)
     self.addCleanup(infile.close)
     t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
     eof, k, var_name, kstrains, nkstrains, af, missing = t
     self.assertEqual(pysam_missing, missing)