def test_load_burden(self):
    """Check the region tuples ``load_burden`` appends for each fixture.

    Loads the ``B`` and ``BM`` burden fixtures into fresh lists and compares
    the result with the expected ``(name, [regions])`` tuples, then checks
    the two failure modes the test expects: a path that does not exist and
    a file of the wrong kind (``P``).
    """
    single_region = [
        ('CDS1', ['FM211187:3910-3951']),
        ('CDS2', ['FM211187:4006-4057']),
    ]
    # BM holds the same two entries plus one spanning both regions
    multi_region = single_region + [
        ('CDS3', ['FM211187:3910-3951', 'FM211187:4006-4057']),
    ]

    # load_burden fills the list it is handed, so start from an empty one
    loaded = []
    load_burden(B, loaded)
    self.assertEqual(loaded, single_region)

    loaded = []
    load_burden(BM, loaded)
    self.assertEqual(loaded, multi_region)

    # no file found
    with self.assertRaises(FileNotFoundError):
        load_burden('nope', [])

    # not the right file
    with self.assertRaises(ValueError):
        load_burden(P, [])
def test_read_variant_vcf(self):
    """Exercise ``read_variant`` on VCF input through successive reads.

    The same open VCF handle is read repeatedly, so the order of the
    sections below matters: each call consumes the next record(s).
    Sections covered: a plain read, a read with no samples supplied, a
    burden read, the ``uncompressed`` flag, wrong variant types, reading
    to end of file, and handing in non-VCF file objects.
    """
    # NOTE(review): a second ``test_read_variant_vcf`` appears later in this
    # file; if both sit in the same class the later definition replaces
    # this one and this body never runs -- confirm, then rename or remove.
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCF)
    # make sure the handle is released even when an assertion fails
    self.addCleanup(infile.close)

    t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    # element-wise abs *before* max, so negative deviations are caught too
    self.assertEqual(np.abs(k - np.zeros(50)).max(), 0.0)
    self.assertEqual(var_name, 'FM211187_16_G_A')
    self.assertEqual(kstrains, [])
    self.assertEqual(nkstrains,
                     sorted(['sample_%d' % x for x in range(1, 51)]))
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0.0)

    # not providing samples
    t = read_variant(infile, p, 'vcf', False, [], False, set(), [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertEqual(k, None)
    self.assertEqual(var_name, None)
    self.assertEqual(kstrains, None)
    self.assertEqual(nkstrains, None)
    self.assertEqual(af, None)

    # providing burden
    burden_regions = deque([])
    load_burden(B, burden_regions)
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions,
                     False, p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
    self.assertEqual(var_name, 'CDS1')
    self.assertEqual(kstrains, [])
    self.assertEqual(
        nkstrains,
        ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0)

    # uncompressed option - no effect
    t = read_variant(infile, p.head(5), 'vcf', False, [], True,
                     p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(np.abs(k - np.array([0, 1, 0, 0, 0])).max() < 1E-7)
    self.assertEqual(var_name, 'FM211187_3982_C_A')
    self.assertEqual(kstrains, ['sample_2'])
    self.assertEqual(nkstrains,
                     ['sample_1', 'sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.2)
    self.assertEqual(missing, 0.0)

    # different type
    # (results are not bound: the calls are expected to raise, and ``t``
    # must keep the last successful read for the exhaustion loop below)
    with self.assertRaises(AttributeError):
        read_variant(infile, p.head(5), 'kmers', False, [], True,
                     p.head(5).index, [])
    with self.assertRaises(AttributeError):
        read_variant(infile, p.head(5), 'Rtab', False, [], False,
                     p.head(5).index, [])

    # read until exhaustion
    while not t[0]:
        t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, True)
    self.assertEqual(k, None)
    self.assertEqual(var_name, None)
    self.assertEqual(kstrains, None)
    self.assertEqual(nkstrains, None)
    self.assertEqual(af, None)
    self.assertEqual(missing, None)

    # different file
    with gzip.open(KMER) as infile:
        with self.assertRaises(AttributeError):
            read_variant(infile, p.head(5), 'vcf', False, [], False,
                         p.head(5).index, [])
    with open(PRES) as infile:
        with self.assertRaises(AttributeError):
            read_variant(infile, p.head(5), 'vcf', False, [], False,
                         p.head(5).index, [])
def test_read_variant_vcf(self):
    """Exercise ``read_variant`` on VCF input through successive reads.

    Reads on a given handle are sequential, so the order of the sections
    below matters. Sections covered: a plain read, a read with no samples
    supplied, burden reads (single- and multi-region), the
    ``uncompressed`` flag on a freshly opened file, wrong variant types,
    reading to end of file, non-VCF file objects, and two regression
    checks on the missing-genotype fixture (issues #90 and #120).
    """
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCF)
    # every handle opened here is registered for cleanup so it is released
    # even when an assertion fails part-way through
    self.addCleanup(infile.close)

    t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    # element-wise abs *before* max, so negative deviations are caught too
    self.assertEqual(np.abs(k - np.zeros(50)).max(), 0.0)
    self.assertEqual(var_name, 'FM211187_16_G_A')
    self.assertEqual(kstrains, [])
    self.assertEqual(nkstrains,
                     sorted(['sample_%d' % x for x in range(1, 51)]))
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0.0)

    # not providing samples
    t = read_variant(infile, p, 'vcf', False, [], False, set(), [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertEqual(k, None)
    self.assertEqual(var_name, None)
    self.assertEqual(kstrains, None)
    self.assertEqual(nkstrains, None)
    self.assertEqual(af, None)

    # providing burden
    burden_regions = deque([])
    load_burden(B, burden_regions)
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions,
                     False, p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
    self.assertEqual(var_name, 'CDS1')
    self.assertEqual(kstrains, [])
    self.assertEqual(
        nkstrains,
        ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0)

    # providing burden
    burden_regions = deque([])
    load_burden(BM, burden_regions)
    # last one has multiple regions
    burden_regions.reverse()
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions,
                     False, p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
    self.assertEqual(var_name, 'CDS3')
    self.assertEqual(kstrains, [])
    self.assertEqual(
        nkstrains,
        ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0)

    # uncompressed option - no effect
    # (the file is reopened, so the first record is read again)
    infile = VariantFile(VCF)
    self.addCleanup(infile.close)
    t = read_variant(infile, p.head(5), 'vcf', False, [], True,
                     p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(np.abs(k - np.array([0, 0, 0, 0, 0])).max() < 1E-7)
    self.assertEqual(var_name, 'FM211187_16_G_A')
    self.assertEqual(kstrains, [])
    self.assertEqual(
        nkstrains,
        ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0.0)

    # different type
    # (results are not bound: the calls are expected to raise, and ``t``
    # must keep the last successful read for the exhaustion loop below)
    with self.assertRaises(AttributeError):
        read_variant(infile, p.head(5), 'kmers', False, [], True,
                     p.head(5).index, [])
    with self.assertRaises(AttributeError):
        read_variant(infile, p.head(5), 'Rtab', False, [], False,
                     p.head(5).index, [])

    # read until exhaustion
    while not t[0]:
        t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, True)
    self.assertEqual(k, None)
    self.assertEqual(var_name, None)
    self.assertEqual(kstrains, None)
    self.assertEqual(nkstrains, None)
    self.assertEqual(af, None)
    self.assertEqual(missing, None)

    # different file
    with gzip.open(KMER) as infile:
        with self.assertRaises(AttributeError):
            read_variant(infile, p.head(5), 'vcf', False, [], False,
                         p.head(5).index, [])
    with open(PRES) as infile:
        with self.assertRaises(AttributeError):
            read_variant(infile, p.head(5), 'vcf', False, [], False,
                         p.head(5).index, [])

    # burden with missing genotypes in last read variant
    # issue #90
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCFMISSING)
    self.addCleanup(infile.close)
    burden_regions = deque([])
    load_burden(BMISSING, burden_regions)
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions,
                     False, p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(np.abs(k - np.array([1, 1, 0, 0, 0])).max() < 1E-7)
    self.assertEqual(var_name, 'CDS1')
    self.assertEqual(kstrains, ['sample_1', 'sample_2'])
    self.assertEqual(nkstrains, ['sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.4)
    self.assertEqual(missing, 0)

    # check that missing variants are properly missed
    # issue #120
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCFMISSING)
    self.addCleanup(infile.close)
    variant = next(infile)
    # Recompute the missing fraction straight from the first record,
    # counting every haplotype call of the samples present in ``p``.
    n_total = 0
    n_missing = 0
    for sample, call in variant.samples.items():
        if sample not in p.index:
            continue
        for haplotype in call.get('GT', [None]):
            if haplotype is None or haplotype == '.':
                n_missing += 1
            n_total += 1
    pysam_missing = n_missing / float(n_total)

    # reopen so read_variant sees the same first record
    infile = VariantFile(VCFMISSING)
    self.addCleanup(infile.close)
    t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(pysam_missing, missing)