def test_calc_missing_gt_rates(self):
    """Check called/missing genotype counts and rates."""
    # An empty genotype matrix must give empty results, as counts or rates.
    empty_vars = {'/calls/GT': numpy.array([])}
    for as_rates in (False, True):
        called = calc_called_gt(empty_vars, rates=as_rates)
        assert called.shape[0] == 0

    # The in-memory copy and the HDF5 file must give the same rates.
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mem_vars = VariationsArrays()
    mem_vars.put_chunks(h5_vars.iterate_chunks(kept_fields=['/calls/GT']))
    mem_rates = calc_missing_gt(mem_vars)
    h5_rates = calc_missing_gt(h5_vars)
    assert mem_rates.shape == (943,)
    assert numpy.allclose(mem_rates, h5_rates)
    assert numpy.min(mem_rates) == 0
    assert numpy.all(mem_rates <= 1)

    # Hand-made genotypes with known called counts.
    genotypes = numpy.array([[[0, 0], [0, 0]],
                             [[0, 0], [-1, -1]],
                             [[0, 0], [-1, -1]],
                             [[-1, -1], [-1, -1]]])
    toy_vars = {'/calls/GT': genotypes}
    called_counts = numpy.array([2, 1, 1, 0])
    n_called = calc_called_gt(toy_vars, rates=False)
    assert numpy.all(n_called == called_counts)
    n_missing = calc_missing_gt(toy_vars, rates=False)
    assert numpy.all(n_missing == 2 - called_counts)

    missing_rates = numpy.array([0, 0.5, 0.5, 1])
    called_rate = calc_called_gt(toy_vars)
    assert numpy.allclose(called_rate, 1 - missing_rates)
    missing_rate = calc_missing_gt(toy_vars)
    assert numpy.allclose(missing_rate, missing_rates)
def test_calculate_hwe(self):
    """Exercise calc_hwe_chi2_test on empty, hand-made and file data."""
    # Empty input gives an empty result.
    empty = numpy.array([])
    empty_vars = VariationsArrays()
    empty_vars['/calls/GT'] = empty
    empty_vars['/variations/alt'] = empty
    res = calc_hwe_chi2_test(empty_vars, min_num_genotypes=0,
                             chunk_size=None)
    assert res.shape[0] == 0

    # Two hand-made variations that share the same expected result row.
    toy_vars = VariationsArrays()
    genotypes = numpy.array([
        [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
         [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]],
        [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1],
         [0, 0], [0, 0], [1, 0], [1, 1], [0, 0]],
    ])
    toy_vars['/calls/GT'] = genotypes
    toy_vars._create_matrix('/variations/alt', shape=(1, 1),
                            dtype=numpy.int16, fillvalue=0)
    expected_row = [1.25825397e+01, 1.85240619e-03]
    expected = numpy.array([expected_row, expected_row])
    res = calc_hwe_chi2_test(toy_vars, min_num_genotypes=0, chunk_size=None)
    assert numpy.allclose(res, expected)

    # Processing the file in one go or by chunks must give the same values.
    h5_path = join(TEST_DATA_DIR, 'ril.hdf5')
    h5_vars = VariationsH5(h5_path, mode='r')
    unchunked = calc_hwe_chi2_test(h5_vars, chunk_size=None)
    h5_vars = VariationsH5(h5_path, mode='r')
    chunked = calc_hwe_chi2_test(h5_vars)
    assert numpy.allclose(unchunked, chunked, equal_nan=True)
def test_calc_obs_het(self):
    """Check calc_obs_het on empty, file and hand-made data."""
    # Empty input gives an empty result.
    gts = numpy.array([])
    dps = numpy.array([])
    varis = {'/calls/GT': gts, '/calls/DP': dps}
    het = calc_obs_het(varis, min_num_genotypes=0)
    assert het.shape[0] == 0

    # The HDF5 file and its in-memory copy must agree.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
    het_array = calc_obs_het(snps, min_num_genotypes=0)
    assert numpy.all(het_array == het_h5)

    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = numpy.array([[5, 12, 10, 10], [10, 10, 10, 10]])
    varis = {'/calls/GT': gts, '/calls/DP': dps}
    het = calc_obs_het(varis, min_num_genotypes=0)
    assert numpy.allclose(het, [0.5, 0])

    # numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
    het = calc_obs_het(varis, min_num_genotypes=10)
    assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
    assert numpy.allclose(het, [1, 0])

    het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
    assert numpy.allclose(het, [0, 0])

    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
    assert numpy.allclose(het, [0.5, 0])
def test_merge_with_depth(self):
    """Check that merged variations carry the depths of both inputs."""

    def make_source(depths, samples):
        # One-variation mock source; only the depths and samples differ.
        source = MockList([{'chrom': '1', 'pos': 1, 'ref': b'A',
                            'alt': [b'T'],
                            'gts': numpy.array([[0, 0], [1, 1]]),
                            'dp': numpy.array(depths)}])
        source.samples = samples
        return source

    low_dp = make_source([1, 1], ['a', 'b'])
    high_dp = make_source([20, 20], ['c', 'd'])

    mock_merger = MockMerger(gt_shape=(4, 2))
    merged = VarMerger._merge_vars(mock_merger, low_dp[0], high_dp[0])
    expected = {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                'gts': [[0, 0], [1, 1], [0, 0], [1, 1]],
                'dp': [1, 1, 20, 20]}
    self.var_is_equal(expected, merged)

    # merge the same var with depth
    h5_path = join(TEST_DATA_DIR, 'format_def.h5')
    left = VariationsH5(h5_path, "r")
    right = VariationsH5(h5_path, "r")
    merger = VarMerger(left, right, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False, ignore_non_matching=True)
    out_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    expected_depth = numpy.array([1, 8, 5, 1, 8, 5], dtype=numpy.int16)

    first_var = list(merger.variations)[0]
    depth_item = first_var[8][1]
    assert depth_item[0] == b'DP'
    assert numpy.all(depth_item[1] == expected_depth)

    out_vars.put_vars(merger)
    assert '/calls/DP' in out_vars.keys()
    assert numpy.all(out_vars['/calls/DP'][0] == expected_depth)
def test_calc_obs_het_sample(self):
    """Check calc_obs_het_by_sample on file, hand-made and empty data."""
    # The HDF5 file and its in-memory copy must agree.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het_by_sample(hdf5)
    het_array = calc_obs_het_by_sample(snps)
    assert numpy.all(het_array == het_h5)

    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1]]])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    # numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
    assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

    # Empty input gives an empty result.
    gts = numpy.array([])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    assert het.shape[0] == 0

    snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    # These two calls only check that the depth filters do not raise.
    calc_obs_het_by_sample(snps, min_call_dp=3)
    calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
    # Default chunking and no chunking must agree.
    het_0 = calc_obs_het_by_sample(snps)
    het = calc_obs_het_by_sample(snps, chunk_size=None)
    assert numpy.allclose(het_0, het)
def test_iterate_wins(self):
    """Variations gathered window by window must keep the same positions."""
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_wins(win_size=1000000)
    hd5_2 = VariationsArrays()
    hd5_2.put_chunks(wins)
    # The comparison result used to be discarded, so the test could never
    # fail; it is now asserted. Both sides are read with [:] so that plain
    # arrays are compared.
    assert numpy.all(hd5['/variations/pos'][:] == hd5_2['/variations/pos'][:])
def test_delete_item_from_variationArray(self):
    """A field deleted from a VariationsArrays must leave its keys."""
    # The context manager closes the VCF even when an assertion fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        del snps['/calls/GT']
        assert '/calls/GT' not in snps.keys()
def test_put_vars_arrays_from_vcf(self):
    """Load a VCF into a VariationsArrays and check some stored calls."""
    # The context manager closes the VCF even when an assertion fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        assert snps['/calls/GT'].shape == (5, 3, 2)
        assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
        expected = numpy.array([48, 48, 43], dtype=numpy.int16)
        assert numpy.all(snps['/calls/GQ'][0, :] == expected)
def test_calc_distrib_for_sample(self):
    """Check the per-sample distributions of the '/calls/DP' field."""
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mem_vars = VariationsArrays()
    mem_vars.put_chunks(h5_vars.iterate_chunks())

    # One sample: file, unchunked memory and chunked memory must agree.
    from_h5, _ = calc_field_distrib_for_a_sample(h5_vars, field='/calls/DP',
                                                 sample='1_17_1_gbs',
                                                 n_bins=15)
    assert from_h5.shape == (15,)

    unchunked, _ = calc_field_distrib_for_a_sample(mem_vars,
                                                   field='/calls/DP',
                                                   n_bins=15,
                                                   sample='1_17_1_gbs',
                                                   chunk_size=None)
    assert numpy.all(from_h5 == unchunked)

    chunked, _ = calc_field_distrib_for_a_sample(mem_vars, field='/calls/DP',
                                                 n_bins=15,
                                                 sample='1_17_1_gbs',
                                                 chunk_size=50)
    assert numpy.all(chunked == unchunked)

    # Hand-made data set with three samples.
    toy_vars = VariationsArrays()
    toy_vars['/calls/DP'] = numpy.array([[10, 5, 15], [0, 15, 10]])
    toy_vars['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                         [[0, 0], [0, 1], [1, 1]]])
    toy_vars.samples = list(range(3))
    all_depths = [10, 5, 15, 0, 15, 10]

    # Expected 16-bin histogram rows, reused by the three checks below.
    row_sample0 = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    row_sample1 = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    row_sample2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]
    row_empty = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    # No mask.
    counts, _ = calc_field_distribs_per_sample(toy_vars, field='/calls/DP',
                                               n_bins=16)
    expected = [row_sample0, row_sample1, row_sample2]
    assert numpy.all(expected == counts)
    assert numpy.all(calc_depth(toy_vars) == all_depths)

    # Calls selected with call_is_het.
    counts, _ = calc_field_distribs_per_sample(toy_vars, field='/calls/DP',
                                               n_bins=16,
                                               mask_field='/calls/GT',
                                               mask_func=call_is_het)
    expected = [row_empty, row_sample1, row_empty]
    assert numpy.all(expected == counts)
    assert numpy.all(calc_depth(toy_vars) == all_depths)

    # Calls selected with call_is_hom.
    counts, _ = calc_field_distribs_per_sample(toy_vars, field='/calls/DP',
                                               n_bins=16,
                                               mask_field='/calls/GT',
                                               mask_func=call_is_hom)
    expected = [row_sample0, row_empty, row_sample2]
    assert numpy.all(expected == counts)
    assert numpy.all(calc_depth(toy_vars) == all_depths)
def test_ignore_non_matching(self):
    """With ignore_non_matching=True the merge yields one variation."""
    first_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    second_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merge_options = {'max_field_lens': {'alt': 3},
                     'ignore_complex_overlaps': True,
                     'check_ref_matches': False,
                     'ignore_non_matching': True}
    merger = VarMerger(first_h5, second_h5, **merge_options)

    merged = VariationsArrays(ignore_overflows=True,
                              ignore_undefined_fields=True)
    merged.put_vars(merger)
    assert merged.num_variations == 1
def test_field_filter(self):
    """FieldFilter must keep only GT_FIELD, inside or outside a pipeline."""
    pipeline = Pipeline()
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    keep_gt = FieldFilter(kept_fields=[GT_FIELD])
    pipeline.append(keep_gt)

    piped_out = VariationsArrays()
    pipeline.run(h5_vars, piped_out)

    # check same result with no pipeline
    direct_vars = keep_gt(h5_vars)[FLT_VARS]
    assert numpy.allclose(piped_out['/calls/GT'], direct_vars['/calls/GT'])
    assert list(piped_out.keys()) == [GT_FIELD]
    assert list(direct_vars.keys()) == [GT_FIELD]
def test_min_mac(self):
    """MacFilter must report the same histogram with or without a pipeline."""
    pipeline = Pipeline()
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mac_filter = MacFilter(min_mac=10, max_mac=30, do_histogram=True)
    pipeline.append(mac_filter, id_='filter1')

    piped_out = VariationsArrays()
    piped_result = pipeline.run(h5_vars, piped_out)

    # check same result with no pipeline
    direct_result = mac_filter(h5_vars)
    for stat in ('counts', 'edges'):
        assert numpy.allclose(piped_result['filter1'][stat],
                              direct_result[stat])

    # Nothing passes the filter in either case.
    assert not piped_out.keys()
    assert direct_result[FLT_VARS]['/calls/GT'].shape[0] == 0
def test_mat012(self):
    """Check the gts_as_mat012 property on a small genotype matrix."""
    genotypes = numpy.array([
        [[0, 0], [0, 1], [2, 2], [-1, 3]],
        [[0, 0], [0, 0], [1, 1], [2, 2]],
        [[-1, -1], [-1, -1], [-1, -1], [-1, -1]],
    ])
    variations = VariationsArrays()
    variations[GT_FIELD] = genotypes

    expected = [[0, 1, 2, -1],
                [0, 0, 2, 2],
                [-1, -1, -1, -1]]
    assert numpy.allclose(variations.gts_as_mat012, expected, equal_nan=True)
def test_pca(self):
    """Run do_pca on the test file and on a three-sample toy data set."""
    # On the real file we only check that the call completes.
    do_pca(VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r'))

    # Four identical variations: samples 'a' and 'b' share every genotype,
    # sample 'c' carries a different one.
    genotypes = numpy.array([[[0, 0], [0, 0], [1, 1]]] * 4)
    variations = VariationsArrays()
    variations[GT_FIELD] = genotypes
    variations.samples = ['a', 'b', 'c']

    projections = do_pca(variations)['projections']
    # One projection row per sample.
    assert projections.shape[0] == genotypes.shape[1]
    assert numpy.allclose(projections[0], projections[1])
    assert not numpy.allclose(projections[0], projections[2])
def test_chunk_pairs(self):
    """Check which chunk pairs iterate_chunk_pairs yields for a max_dist."""
    positions = numpy.array([5, 7, 8, 10, 11, 12])
    chroms = numpy.array(['c1'] * 6)
    variations = VariationsArrays()
    variations[POS_FIELD] = positions
    variations[CHROM_FIELD] = chroms

    def first_positions(max_dist):
        # Identify every yielded pair by the first position of each chunk.
        found = []
        for pair in variations.iterate_chunk_pairs(max_dist=max_dist,
                                                   chunk_size=2):
            found.append((pair['chunk1'][POS_FIELD][0],
                          pair['chunk2'][POS_FIELD][0]))
        return found

    assert first_positions(3) == [(5, 5), (5, 8), (8, 8), (8, 11), (11, 11)]
    assert first_positions(4) == [(5, 5), (5, 8), (5, 11), (8, 8), (8, 11),
                                  (11, 11)]
def test_sort_variations(self):
    """Variations read from a CSV must come out sorted by chrom and pos."""
    var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
                b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325}}
    # The context manager closes the file even when an assertion fails.
    with open(join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv'), 'rb') as fhand:
        parser = CSVParser(fhand, var_info, first_sample_column=1,
                           sep=b'\t', max_field_lens={'alt': 3},
                           max_field_str_lens={'chrom': 10, 'alt': 10})
        variations = VariationsArrays(ignore_overflows=True,
                                      ignore_undefined_fields=True)
        variations.put_vars(parser)
        sorted_vars = VariationsArrays()
        sort_variations(variations, sorted_vars)
        exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
        exp_pos = [325, 346, 345]
        assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
        assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
def test_calc_dp_for_sample(self):
    """Check the shape of the hom/het depth histograms for one sample."""
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mem_vars = VariationsArrays()
    mem_vars.put_chunks(h5_vars.iterate_chunks())

    counts, _ = calc_call_dp_distrib_for_a_sample(h5_vars,
                                                  sample='1_17_1_gbs',
                                                  n_bins=15)
    for call_kind in ('hom', 'het'):
        assert counts[call_kind].shape == (15,)

    # NOTE(review): this bare return makes every check below unreachable.
    # Confirm whether the chunk-size comparisons were disabled on purpose.
    return

    unchunked, _ = calc_call_dp_distrib_for_a_sample(h5_vars,
                                                     sample='1_17_1_gbs',
                                                     n_bins=15,
                                                     chunk_size=None)
    assert numpy.all(counts['hom'] == unchunked['hom'])
    assert numpy.all(counts['het'] == unchunked['het'])

    chunked, _ = calc_call_dp_distrib_for_a_sample(h5_vars,
                                                   sample='1_17_1_gbs',
                                                   n_bins=15,
                                                   chunk_size=50)
    assert numpy.all(counts['hom'] == chunked['hom'])
    assert numpy.all(counts['het'] == chunked['het'])
def test_by_chunks(self):
    """Load a VCF with the default chunking and with one-variation chunks."""
    vcf_path = join(TEST_DATA_DIR, 'format_def.vcf')

    # Each load uses a context manager so the file is closed even if
    # put_vars raises.
    with open(vcf_path, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, n_threads=None)
        snps = VariationsArrays()
        snps.put_vars(vcf_parser)

    with open(vcf_path, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, n_threads=None)
        snps = VariationsArrays(vars_in_chunk=1)
        snps.put_vars(vcf_parser)
def test_calc_snp_density(self):
    """Check calc_snp_density on the test file and on hand-made positions."""
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mem_vars = VariationsArrays()
    mem_vars.put_chunks(h5_vars.iterate_chunks())
    density_h5 = list(calc_snp_density(h5_vars, 1000))
    density_array = list(calc_snp_density(mem_vars, 1000))
    assert density_array == density_h5

    def density(chroms, positions):
        # Build a minimal variations mapping and use a window of 11.
        variations = {'/variations/chrom': numpy.array(chroms),
                      '/variations/pos': numpy.array(positions)}
        return list(calc_snp_density(variations, 11))

    positions = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]

    # Every variation in the same chromosome.
    one_chrom = ['ch'] * 14
    assert density(one_chrom, positions) == [6, 7, 7, 7, 7, 7, 6,
                                             1, 1, 1, 1, 1, 2, 2]

    # Three chromosomes.
    three_chroms = ['ch'] * 3 + ['c2'] * 10 + ['c3']
    assert density(three_chroms, positions) == [3, 3, 3, 4, 4, 4, 4,
                                                1, 1, 1, 1, 1, 1, 1]

    # The first three variations each sit in a chromosome of their own.
    many_chroms = ['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3']
    assert density(many_chroms, positions) == [1, 1, 1, 4, 4, 4, 4,
                                               1, 1, 1, 1, 1, 1, 1]

    # Empty and single-variation inputs.
    assert density([], []) == []
    assert density([1], [1]) == [1]
def test_pca(self):
    """Check do_pca on the test file and on a tiny hand-made data set."""
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    # Only checks that the call completes on real data.
    do_pca(h5_vars)

    # The same variation repeated four times: the first two samples are
    # identical and the third one differs.
    one_variation = [[0, 0], [0, 0], [1, 1]]
    genotypes = numpy.array([one_variation for _ in range(4)])

    toy_vars = VariationsArrays()
    toy_vars[GT_FIELD] = genotypes
    toy_vars.samples = ['a', 'b', 'c']

    pca_result = do_pca(toy_vars)
    projections = pca_result['projections']
    n_samples = genotypes.shape[1]
    assert projections.shape[0] == n_samples
    assert numpy.allclose(projections[0], projections[1])
    assert not numpy.allclose(projections[0], projections[2])
def test_nei_dist(self):
    """Check the unbiased Nei distance between two populations."""

    def build_case(genotypes):
        # Four samples split in two populations of two samples each.
        variations = VariationsArrays()
        variations[GT_FIELD] = numpy.array(genotypes)
        variations.samples = [1, 2, 3, 4]
        return variations, [[1, 2], [3, 4]]

    all_missing_var = [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]
    informative_gts = [[[1, 1], [5, 2], [2, 2], [3, 2]],
                       [[1, 1], [1, 2], [2, 2], [2, 1]],
                       all_missing_var]
    expected_dist = 0.3726315908494797

    # Unchunked and one-variation chunks must give the same distance.
    variations, pops = build_case(informative_gts)
    dists = _calc_pop_pairwise_unbiased_nei_dists(variations,
                                                  populations=pops,
                                                  min_num_genotypes=1)
    assert math.isclose(dists[0], expected_dist)
    dists = _calc_pop_pairwise_unbiased_nei_dists(variations,
                                                  populations=pops,
                                                  min_num_genotypes=1,
                                                  chunk_size=1)
    assert math.isclose(dists[0], expected_dist)

    # all missing
    variations, pops = build_case([all_missing_var])
    dists = _calc_pop_pairwise_unbiased_nei_dists(variations,
                                                  populations=pops,
                                                  min_num_genotypes=1)
    assert math.isnan(dists[0])

    # min_num_genotypes
    variations, pops = build_case(informative_gts)
    dists = _calc_pop_pairwise_unbiased_nei_dists(variations,
                                                  populations=pops,
                                                  min_num_genotypes=1)
    assert math.isclose(dists[0], expected_dist)
    # With the default min_num_genotypes the distance is NaN.
    dists = _calc_pop_pairwise_unbiased_nei_dists(variations,
                                                  populations=pops,
                                                  chunk_size=1)
    assert math.isnan(dists[0])
def test_write_meta_header(self):
    """Every written meta/header line must exist in the source VCF header."""
    fnames = ['format_def_without_info.vcf',
              'format_def_without_filter.vcf',
              'format_without_flt_info_qual.vcf']
    for fname in fnames:
        vcf_path = join(TEST_DATA_DIR, fname)

        # Header lines of the input, read with a context manager so the
        # handle is closed even on error.
        with open(vcf_path, 'rb') as vcf_fhand:
            header_lines = [line for line in vcf_fhand
                            if line.startswith(b'#')]

        with open(vcf_path, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                            pre_read_max_size=10000)
            var_array = VariationsArrays(ignore_undefined_fields=True)
            var_array.put_vars(vcf)

            with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                _write_vcf_header(var_array, tmp_fhand)
                # Flush so the lines can be read back through the path.
                tmp_fhand.flush()
                with open(tmp_fhand.name, 'rb') as retmp_fhand:
                    for line in retmp_fhand:
                        assert line in header_lines
def test_samples(self):
    """Samples can be set and read on arrays and on an HDF5 file."""
    gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                       [[0, 0], [0, 0], [1, 1], [2, 2]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4]
    assert varis.samples == [1, 2, 3, 4]

    # With another file
    # The temporary file is closed right away; only its name is used.
    tmp_fhand = NamedTemporaryFile()
    path = tmp_fhand.name
    tmp_fhand.close()

    # The context manager closes the VCF even if loading raises.
    vcf_path = join(TEST_DATA_DIR, 'phylome.sample.vcf')
    with open(vcf_path, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, pre_read_max_size=1000)
        h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
        h5.put_vars(vcf_parser)

    # Rename the first sample and write the list back to the file.
    samples = h5.samples
    samples[0] = '0'
    h5.samples = samples
def test_samples(self):
    """Samples can be set and read on arrays and on an HDF5 file."""
    gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                       [[0, 0], [0, 0], [1, 1], [2, 2]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4]
    assert varis.samples == [1, 2, 3, 4]

    # With another file
    # The temporary file is closed right away; only its name is used.
    tmp_fhand = NamedTemporaryFile()
    path = tmp_fhand.name
    tmp_fhand.close()

    # The context manager closes the VCF even if loading raises.
    vcf_path = join(TEST_DATA_DIR, 'phylome.sample.vcf')
    with open(vcf_path, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand)
        h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
        h5.put_vars(vcf_parser)

    # Rename the first sample and write the list back to the file.
    samples = h5.samples
    samples[0] = '0'
    h5.samples = samples
def test_kosman_pairwise_by_chunk(self):
    """Check calc_pairwise_distance with and without chunking."""
    sample_a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                            [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    sample_b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                            [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    sample_c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    sample_d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)

    # Stack the samples, then swap the first two axes.
    by_sample = numpy.stack((sample_a, sample_b, sample_c, sample_d), axis=0)
    genotypes = numpy.transpose(by_sample, axes=(1, 0, 2))
    genotypes = genotypes.astype(numpy.int16)
    variations = VariationsArrays()
    variations['/calls/GT'] = genotypes

    expected = [0.33333333, 0.75, 0.75, 0.45, 0.45, 0.]
    dists = calc_pairwise_distance(variations, chunk_size=None,
                                   min_num_snps=1)
    assert numpy.allclose(dists, expected)

    dists = calc_pairwise_distance(variations, chunk_size=2)
    assert numpy.allclose(dists, expected)

    # With min_num_snps=11 five of the distances are NaN.
    dists = calc_pairwise_distance(variations, chunk_size=None,
                                   min_num_snps=11)
    assert numpy.sum(numpy.isnan(dists)) == 5

    # With all missing
    missing_a = numpy.full(shape=(10, 2), fill_value=-1, dtype=numpy.int16)
    missing_b = numpy.full(shape=(10, 2), fill_value=-1, dtype=numpy.int16)
    by_sample = numpy.stack((missing_a, missing_b), axis=0)
    genotypes = numpy.transpose(by_sample, axes=(1, 0, 2))
    genotypes = genotypes.astype(numpy.int16)
    variations = VariationsArrays()
    variations['/calls/GT'] = genotypes
    dists = calc_pairwise_distance(variations)
    assert numpy.isnan(dists[0])

    # With missing in some chunks only
    variations['/calls/GT'][:5, 0, :] = 1
    variations['/calls/GT'][:5, 1, :] = 0
    assert calc_pairwise_distance(variations)[0] == 1
    assert calc_pairwise_distance(variations, chunk_size=3)[0] == 1
def test_merge_variations(self):
    """Merge two variation files in both orders and compare every field
    against the stored expected results."""
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                              b'NA00002', b'NA00003']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            # NaNs are removed from both sides before comparing floats.
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            try:
                assert numpy.all(expected_h5[field][:] == result)
            except AssertionError:
                # Print the mismatch, then re-raise so the test fails
                # instead of silently passing.
                print(field)
                print(expected_h5[field][:])
                print(result)
                raise

    # Change the order
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'NA00001', b'NA00002', b'NA00003', b'TS-1',
                              b'TS-11', b'TS-21']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            assert numpy.all(expected_h5[field][:] == result)
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None, reverse=False): if filtered_vars is None: filtered_vars = VariationsArrays() samples = variations.samples try: dtype = sample_cols.dtype is_bool = numpy.issubdtype(dtype, numpy.dtype(bool)) except AttributeError: item = first(iter(sample_cols)) is_bool = isinstance(item, bool) if not is_bool: sample_cols = [idx in sample_cols for idx in range(len(samples))] if 'shape' not in dir(sample_cols): sample_cols = numpy.array(sample_cols, dtype=numpy.bool) if reverse: sample_cols = numpy.logical_not(sample_cols) for path in variations.keys(): matrix = variations[path] if is_dataset(matrix): matrix = matrix[:] if 'calls' in path: flt_data = matrix[:, sample_cols] # flt_data = numpy.compress(sample_cols, , axis=1) filtered_vars[path] = flt_data else: filtered_vars[path] = matrix filtered_vars.metadata = variations.metadata kept_samples = [ samples[idx] for idx, keep in enumerate(sample_cols) if keep ] filtered_vars.samples = kept_samples return filtered_vars
def test_write_meta_header(self):
    """Every written meta/header line must exist in the source VCF header."""
    fnames = [
        'format_def_without_info.vcf',
        'format_def_without_filter.vcf',
        'format_without_flt_info_qual.vcf'
    ]
    for fname in fnames:
        vcf_path = join(TEST_DATA_DIR, fname)

        # Header lines of the input, read with a context manager so the
        # handle is closed even on error.
        with open(vcf_path, 'rb') as vcf_fhand:
            header_lines = [
                line for line in vcf_fhand if line.startswith(b'#')
            ]

        with open(vcf_path, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            var_array = VariationsArrays(ignore_undefined_fields=True)
            var_array.put_vars(vcf)

            with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                _write_vcf_header(var_array, tmp_fhand)
                # Flush so the lines can be read back through the path.
                tmp_fhand.flush()
                with open(tmp_fhand.name, 'rb') as retmp_fhand:
                    for line in retmp_fhand:
                        assert line in header_lines
def test_excel(self):
    """Smoke test: write_excel must accept growing sets of fields."""
    genotypes = numpy.array([
        [[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
        [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
        [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
        [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
    ])
    variations = VariationsArrays()
    variations[GT_FIELD] = genotypes
    n_samples = genotypes.shape[1]
    variations.samples = list(range(n_samples))

    # Genotypes only.
    xlsx_fhand = NamedTemporaryFile(suffix='.xlsx')
    write_excel(variations, xlsx_fhand)

    # chrom pos
    variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
    variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
    xlsx_fhand = NamedTemporaryFile(suffix='.xlsx')
    write_excel(variations, xlsx_fhand)

    # REF, ALT
    # The previous temporary file is written again here.
    variations[REF_FIELD] = numpy.array(['A', 'A', 'A', 'A'])
    variations[ALT_FIELD] = numpy.array([['T'], ['T'], ['T'], ['T']])
    write_excel(variations, xlsx_fhand)
def test_expected_het(self):
    """Check the biased and unbiased expected heterozygosity."""
    hom_ref = [0, 0]
    hom_alt = [1, 1]
    genotypes = [
        [hom_ref, hom_ref, hom_ref, hom_alt, hom_alt, hom_alt, [1, 0]],
        [hom_ref, hom_ref, hom_ref, hom_alt, hom_alt, hom_alt, hom_alt],
        [hom_ref, hom_ref, hom_ref, hom_alt, hom_alt, hom_alt, hom_alt],
    ]
    variations = VariationsArrays()
    variations['/calls/GT'] = numpy.array(genotypes)

    biased = calc_expected_het(variations, min_num_genotypes=0)
    assert numpy.allclose(biased, [0.5, 0.48979592, 0.48979592])

    unbiased = calc_unbias_expected_het(variations, min_num_genotypes=0)
    assert numpy.allclose(unbiased, [0.53846154, 0.52747253, 0.52747253])
def test_vcf_detect_fields(self):
    """Check the metadata obtained with kept_fields and ignored_fields."""
    vcf_path = join(TEST_DATA_DIR, 'format_def.vcf')
    # Both handles are closed by the context manager even when an
    # assertion fails.
    with open(vcf_path, 'rb') as vcf_fhand, \
            open(vcf_path, 'rb') as vcf_fhand2:
        vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
        vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])

        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        metadata = snps.metadata

        snps2 = VariationsArrays(ignore_undefined_fields=True)
        snps2.put_vars(vcf2)
        metadata2 = snps2.metadata

        assert '/calls/HQ' in metadata.keys()
        assert '/variations/qual' not in metadata2.keys()
def test_maf(self):
    """Check calc_maf and calc_mac on empty, hand-made and file data."""
    # Empty input gives empty results, chunked or not.
    gts = numpy.array([])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    mafs = calc_maf(varis, chunk_size=None)
    assert mafs.shape == (0,)
    mafs = calc_maf(varis)
    assert mafs.shape == (0,)

    mafs = calc_mac(varis, chunk_size=None)
    assert mafs.shape == (0,)
    mafs = calc_mac(varis)
    assert mafs.shape == (0,)

    # The last variation has every genotype missing.
    gts = numpy.array([[[0], [0], [0], [0]],
                       [[0], [0], [1], [1]],
                       [[0], [0], [0], [1]],
                       [[-1], [-1], [-1], [-1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    # numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
    mafs = calc_maf(varis, min_num_genotypes=1)
    assert numpy.allclose(mafs, numpy.array([1., 0.5, 0.75, numpy.nan]),
                          equal_nan=True)

    macs = calc_mac(varis, min_num_genotypes=1)
    assert numpy.allclose(macs, numpy.array([4, 2, 3, numpy.nan]),
                          equal_nan=True)

    varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    mafs = calc_maf(varis)
    assert numpy.all(mafs[numpy.logical_not(numpy.isnan(mafs))] >= 0.5)
    assert numpy.all(mafs[numpy.logical_not(numpy.isnan(mafs))] <= 1)
    assert mafs.shape == (943,)

    macs = calc_mac(varis)
    # assert macs.shape == (943,)
    min_mac = varis['/calls/GT'].shape[1] / 2
    max_mac = varis['/calls/GT'].shape[1]
    # The counts are checked where the MAF is not NaN.
    assert numpy.all(macs[numpy.logical_not(numpy.isnan(mafs))] >= min_mac)
    assert numpy.all(macs[numpy.logical_not(numpy.isnan(mafs))] <= max_mac)
def test_low_dp_gt(self):
    """LowDPGTsToMissingSetter gives the same GTs with or without pipeline."""
    pipeline = Pipeline()
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    setter = LowDPGTsToMissingSetter(min_dp=5)
    pipeline.append(setter)

    piped_out = VariationsArrays()
    pipeline.run(h5_vars, piped_out)

    # check same result with no pipeline
    direct_vars = setter(h5_vars)[FLT_VARS]
    assert numpy.allclose(piped_out['/calls/GT'], direct_vars['/calls/GT'])
def test_biallelic(self):
    """NonBiallelicFilter gives the same GTs with or without a pipeline."""
    pipeline = Pipeline()
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    biallelic_filter = NonBiallelicFilter()
    pipeline.append(biallelic_filter)

    piped_out = VariationsArrays()
    pipeline.run(h5_vars, piped_out)

    # check same result with no pipeline
    direct_vars = biallelic_filter(h5_vars)[FLT_VARS]
    assert numpy.allclose(piped_out['/calls/GT'], direct_vars['/calls/GT'])
def test_fieldpath(self):
    """Annotate with IsVariableAnnotator, then filter on that info field."""
    pipeline = Pipeline()
    annot_id = 'test'
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    info_field = '/variations/info/{}'.format(annot_id)

    # First step: annotate using two samples.
    pipeline.append(IsVariableAnnotator(annot_id=annot_id,
                                        samples=['1_14_1_gbs',
                                                 '1_17_1_gbs']))
    # Second step: filter on the annotated field with value=0.
    pipeline.append(FieldValueFilter(field_path=info_field, value=0))

    filtered = VariationsArrays()
    pipeline.run(h5_vars, filtered)
    assert filtered.num_variations == 484
def test_set_to_missing(self):
    """Check copy_setting_gts_to_missing on file and hand-made genotypes."""
    orig_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    # The result is not called 'vars' so the builtin is not shadowed.
    noisy_vars = copy_setting_gts_to_missing(orig_vars,
                                             gt_rate_to_missing=0.9)
    orig_gts = orig_vars[GT_FIELD][...]
    noise_gts = noisy_vars[GT_FIELD]
    assert orig_gts.shape == noise_gts.shape
    mask_different_gts = orig_gts != noise_gts
    expected_num_gts_set_to_missing = int(
        round(numpy.sum(orig_gts != MISSING_INT) * 0.9))
    assert expected_num_gts_set_to_missing == mask_different_gts.sum()
    # None of the changed items was missing in the original.
    assert not numpy.sum(orig_gts[mask_different_gts] == MISSING_INT)

    # The context manager closes the VCF even if loading raises.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)

    # A fixed seed makes the expected genotypes below reproducible.
    numpy.random.seed(1)
    gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                       [[0, 0], [0, 1], [-1, 0]],
                       [[-1, 2], [2, 1], [-1, 2]],
                       [[0, 0], [-1, 0], [1, 0]],
                       [[0, 1], [-1, 2], [1, 1]]])
    expected_gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                                [[-1, -1], [0, 1], [-1, 0]],
                                [[-1, 2], [2, 1], [-1, 2]],
                                [[0, 0], [-1, 0], [-1, -1]],
                                [[-1, -1], [-1, 2], [-1, -1]]])
    del snps[GT_FIELD]
    snps[GT_FIELD] = gts
    noisy_vars = copy_setting_gts_to_missing(snps, gt_rate_to_missing=0.5)
    noise_gts = noisy_vars[GT_FIELD]
    assert numpy.all(noise_gts == expected_gts)
def test_kosman_pairwise_between_pops_by_chunk(self):
    """Check the pairwise distances between the samples of two pops."""
    a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                     [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                     [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    # Stack the samples, then swap the first two axes.
    gts = numpy.stack((a, b, c, d), axis=0)
    gts = numpy.transpose(gts, axes=(1, 0, 2)).astype(numpy.int16)
    variations = VariationsArrays()
    variations.samples = [1, 2, 3, 4]
    variations['/calls/GT'] = gts

    # Every sample against every sample.
    expected = [[0., 0.33333333, 0.75, 0.75],
                [0.33333333, 0., 0.45, 0.45],
                [0.75, 0.45, 0., 0.],
                [0.75, 0.45, 0., 0.]]
    distance = calc_pairwise_distances_between_pops(
        variations, chunk_size=None, min_num_snps=1,
        pop1_samples=[1, 2, 3, 4], pop2_samples=[1, 2, 3, 4])
    assert numpy.allclose(distance, expected)

    # One sample against all of them.
    expected = [[0., 0.33333333, 0.75, 0.75]]
    distance = calc_pairwise_distances_between_pops(
        variations, chunk_size=None, min_num_snps=1,
        pop1_samples=[1], pop2_samples=[1, 2, 3, 4])
    assert numpy.allclose(distance, expected)

    # Two disjoint populations.
    expected = [[0.75, 0.75], [0.45, 0.45]]
    distance = calc_pairwise_distances_between_pops(
        variations, chunk_size=None, min_num_snps=1,
        pop1_samples=[1, 2], pop2_samples=[3, 4])
    # This case computed both values but never compared them.
    assert numpy.allclose(distance, expected)
def test_filter_samples(self):
    """SampleFilter gives the same GTs with or without a pipeline."""
    pipeline = Pipeline()
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    # Select the first 20 samples of the file.
    kept_samples = h5_vars.samples[:20]
    sample_filter = SampleFilter(kept_samples)
    pipeline.append(sample_filter)

    piped_out = VariationsArrays()
    pipeline.run(h5_vars, piped_out)

    # check same result with no pipeline
    direct_vars = sample_filter(h5_vars)[FLT_VARS]
    assert numpy.allclose(piped_out['/calls/GT'], direct_vars['/calls/GT'])
def test_calc_maf_depth_distribs_per_sample(self):
    """Check calc_maf_depth_distribs_per_sample on empty, toy and file data."""
    # With empty AO/RO matrices both returned values are None.
    empty_vars = VariationsArrays()
    empty_vars['/calls/AO'] = numpy.array([])
    empty_vars['/calls/RO'] = numpy.array([])
    distribs, bins = calc_maf_depth_distribs_per_sample(empty_vars,
                                                        chunk_size=None)
    assert distribs is None
    assert bins is None

    # One variation, three samples.
    toy_vars = VariationsArrays()
    toy_vars['/calls/AO'] = numpy.array([[[0, 0], [0, 0], [15, -1]]])
    toy_vars['/calls/RO'] = numpy.array([[10, 5, 15]])
    toy_vars.samples = list(range(3))
    distribs, _ = calc_maf_depth_distribs_per_sample(toy_vars, n_bins=4,
                                                     min_depth=6,
                                                     chunk_size=None)
    expected = [[0, 0, 0, 1],
                [0, 0, 0, 0],
                [0, 0, 1, 0]]
    assert numpy.all(distribs == expected)

    # Unchunked and default chunking must agree on the real file.
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    unchunked, _ = calc_maf_depth_distribs_per_sample(h5_vars, min_depth=6,
                                                      chunk_size=None)
    chunked, _ = calc_maf_depth_distribs_per_sample(h5_vars, min_depth=6)
    assert numpy.all(unchunked == chunked)
def test_calc_called_gts_distribution_per_depth(self):
    """Check calc_called_gts_distrib_per_depth on file and in-memory data."""
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    depth_range = range(30)

    # Results must not depend on the chunk size used.
    chunked, _ = calc_called_gts_distrib_per_depth(ril_h5,
                                                   depths=depth_range,
                                                   chunk_size=10)
    assert chunked[1, 1] == 1
    unchunked, _ = calc_called_gts_distrib_per_depth(ril_h5,
                                                     depths=depth_range,
                                                     chunk_size=None)
    assert numpy.all(chunked == unchunked)

    # Hand-made case: one variation, ten samples.
    small_vars = VariationsArrays()
    small_vars['/calls/GT'] = numpy.array(
        [[[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
          [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]])
    small_vars['/calls/DP'] = numpy.array(
        [[10, 5, 15, 7, 10, 0, 0, 25, 20, 10]])
    small_vars.samples = list(range(10))
    dist, _ = calc_called_gts_distrib_per_depth(small_vars,
                                                depths=[0, 5, 10, 30])
    expected = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    assert numpy.all(dist == expected)
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None, reverse=False): if filtered_vars is None: filtered_vars = VariationsArrays() samples = variations.samples try: dtype = sample_cols.dtype is_bool = numpy.issubdtype(dtype, numpy.bool) except AttributeError: item = first(iter(sample_cols)) is_bool = isinstance(item, bool) if not is_bool: sample_cols = [idx in sample_cols for idx in range(len(samples))] if 'shape' not in dir(sample_cols): sample_cols = numpy.array(sample_cols, dtype=numpy.bool) if reverse: sample_cols = numpy.logical_not(sample_cols) for path in variations.keys(): matrix = variations[path] if is_dataset(matrix): matrix = matrix[:] if 'calls' in path: flt_data = matrix[:, sample_cols] # flt_data = numpy.compress(sample_cols, , axis=1) filtered_vars[path] = flt_data else: filtered_vars[path] = matrix filtered_vars.metadata = variations.metadata kept_samples = [samples[idx] for idx, keep in enumerate(sample_cols) if keep] filtered_vars.samples = kept_samples return filtered_vars
def test_snp_qual(self):
    """SNPQualFilter must behave the same inside and outside a Pipeline."""
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    qual_filter = SNPQualFilter(min_qual=100, max_qual=50000,
                                do_histogram=True)

    pipeline = Pipeline()
    pipeline.append(qual_filter)
    piped_vars = VariationsArrays()
    piped_result = pipeline.run(ril_h5, piped_vars)

    # Run the filter on its own to get the reference result.
    direct_result = qual_filter(ril_h5)

    # The pipeline stores the result of this filter under the key '0'.
    for stat in ('counts', 'edges'):
        assert numpy.allclose(piped_result['0'][stat], direct_result[stat])
    assert numpy.allclose(piped_vars['/calls/GT'],
                          direct_result[FLT_VARS]['/calls/GT'])
def test_kosman_pairwise(self):
    """Check the Kosman distances between every pair of four samples."""
    sample_a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0],
                            [0, 1], [0, 1], [0, 1], [0, 0], [0, 0],
                            [0, 1]])
    sample_b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1],
                            [0, 1], [1, 0], [1, 0], [1, 0], [0, 1],
                            [1, 2]])
    sample_c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    sample_d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)

    # Stack per sample and then swap the first two axes.
    per_sample = numpy.stack((sample_a, sample_b, sample_c, sample_d),
                             axis=0)
    gts = numpy.transpose(per_sample, axes=(1, 0, 2)).astype(numpy.int16)

    varis = VariationsArrays()
    varis[GT_FIELD] = gts

    calculator = _IndiPairwiseCalculator()
    abs_dist, n_snps = calculator.calc_dist(varis, method='kosman')
    distance = abs_dist / n_snps

    expected = [0.33333333, 0.75, 0.75, 0.5, 0.5, 0.]
    assert numpy.allclose(distance, expected)
def test_calc_r2_windows(self):
    """Check _calc_r2 on raw genotypes and calc_r2_windows with window 10."""
    gts = numpy.array([[[0, 0], [1, 1], [0, 0]],
                       [[0, 0], [1, 1], [0, 0]],
                       [[1, 1], [0, 0], [1, 1]],
                       [[0, 0], [0, 1], [-1, -1]]])

    variations = VariationsArrays()
    variations['/variations/chrom'] = numpy.array([b'chr1'] * 4)
    variations['/variations/pos'] = numpy.array([1, 4, 6, 20])
    variations['/calls/GT'] = gts

    expected_r2 = [1.0, 1.0000002, 1.0, 1.0000002, 1.0, 1.0]
    assert numpy.allclose(_calc_r2(gts), expected_r2)

    win_chrom, _win_pos, win_r2 = calc_r2_windows(variations, 10)
    expected_win_r2 = [1.0000002384185933, numpy.nan]
    assert numpy.allclose(win_r2, expected_win_r2, equal_nan=True)
    assert numpy.all(win_chrom == b'chr1')
def test_empty_pop(self):
    """Check 'dest' population distances when one population lacks data."""
    missing = (-1, -1)
    # Genotypes of samples 6-11, shared by every row that has its first
    # five samples missing.
    second_pop_gts = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), missing]
    first_pop_missing_row = [missing] * 5 + second_pop_gts

    samples = list(range(1, 12))
    pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

    # First population missing in only one of the three variations.
    gts = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + second_pop_gts,
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + second_pop_gts,
        first_pop_missing_row,
    ]
    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.array(gts)
    snps.samples = samples
    dists = calc_pop_distance(snps, populations=pops, method='dest',
                              min_num_genotypes=0)
    assert numpy.allclose(dists, [0.65490196])

    # First population missing in every variation.
    gts = [list(first_pop_missing_row) for _ in range(3)]
    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.array(gts)
    snps.samples = samples
    dists = calc_pop_distance(snps, populations=pops, method='dest',
                              min_num_genotypes=0)
    assert numpy.isnan(dists[0])
def test_vcf_detect_fields(self):
    """Check the metadata obtained when parsing with kept/ignored fields.

    The same VCF is parsed twice, once with kept_fields and once with
    ignored_fields set to '/variations/qual', and the metadata of the
    resulting VariationsArrays is inspected.
    """
    vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
    # The with statement guarantees both handles are closed even when a
    # parse error or a failed assertion interrupts the test.
    with open(vcf_fpath, 'rb') as vcf_fhand, \
            open(vcf_fpath, 'rb') as vcf_fhand2:
        vcf = VCFParser(vcf_fhand, pre_read_max_size=1000,
                        kept_fields=['/variations/qual'])
        vcf2 = VCFParser(vcf_fhand2, pre_read_max_size=1000,
                         ignored_fields=['/variations/qual'])

        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        metadata = snps.metadata

        snps2 = VariationsArrays(ignore_undefined_fields=True)
        snps2.put_vars(vcf2)
        metadata2 = snps2.metadata

        assert '/calls/HQ' in metadata.keys()
        assert '/variations/qual' not in metadata2.keys()
def test_put_vars_from_csv(self):
    """Load two CSV genotype files and check the stored variations.

    The IUPAC-coded file is loaded into a temporary VariationsH5 and the
    two-letter-coded file into a VariationsArrays; both must hold the same
    chromosomes, positions, alleles and genotypes.
    """
    expected_chroms = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
    expected_alleles = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
    expected_pos = [331954, 681961, 1511764]
    # Two accepted genotype codings: either one may match each call.
    exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [1, 0]]])
    exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                        [[0, 0], [0, 0], [-1, -1]],
                        [[1, 1], [1, 1], [0, 1]]])

    def make_var_info():
        # A fresh dict per parser, so that the parsers share no state.
        return {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}

    def check_loaded(snps):
        # Compare the stored variations with the expected values.
        assert list(snps['/variations/chrom'][:]) == expected_chroms
        alleles = list(zip(snps['/variations/ref'],
                           [alts[0] for alts in snps['/variations/alt']]))
        for als, aexp in zip(alleles, expected_alleles):
            # Ref and alt are compared as sets, so their order is ignored.
            assert set(als) == set(aexp)
        assert list(snps['/variations/pos'][:]) == expected_pos
        for gts, exp_gts1, exp_gts2 in zip(snps['/calls/GT'][:],
                                           exp1, exp2):
            for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                assert set(gt) == set(ex1) or set(gt) == set(ex2)

    # IUPAC coded genotypes stored in an HDF5 file.
    iupac_fpath = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
    with open(iupac_fpath, 'rb') as fhand_ex:
        parser = CSVParser(fhand_ex, make_var_info(),
                           first_sample_column=3, first_gt_column=3,
                           sep=b'\t',
                           gt_splitter=create_iupac_allele_splitter(),
                           max_field_lens={'alt': 1},
                           max_field_str_lens={'alt': 1, 'chrom': 20,
                                               'ref': 1})
        # Only the unique path is needed: the temporary file itself is
        # released so that VariationsH5 can create its own file there.
        tmp_fhand = NamedTemporaryFile(suffix='.h5')
        h5_fpath = tmp_fhand.name
        tmp_fhand.close()
        try:
            h5 = VariationsH5(h5_fpath, mode='w', ignore_overflows=True,
                              ignore_undefined_fields=True)
            h5.put_vars(parser)
            check_loaded(h5)
        finally:
            if os.path.exists(h5_fpath):
                os.remove(h5_fpath)

    # Two-letter coded genotypes stored in memory.
    two_letter_fpath = join(TEST_DATA_DIR, 'csv',
                            'two_letter_coding_ex3.txt')
    with open(two_letter_fpath, 'rb') as fhand_ex:
        parser = CSVParser(fhand_ex, make_var_info(),
                           first_sample_column=3, first_gt_column=3,
                           sep=b'\t', max_field_lens={'alt': 1},
                           max_field_str_lens={'alt': 1, 'chrom': 20,
                                               'ref': 1})
        arrays = VariationsArrays(ignore_overflows=True,
                                  ignore_undefined_fields=True)
        arrays.put_vars(parser)
        check_loaded(arrays)
def test_create_hdf5_with_chunks(self):
    """Check put_chunks with whole, field-restricted and sampled chunks."""
    # Copy every field into a new HDF5 file.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Copy only the genotypes.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random sampling of the variations.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    _, prob = scipy.stats.ttest_ind(hdf5['/variations/pos'][:],
                                    hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # abs() is required: without it any under-sampling, even an empty
    # result, would satisfy the check.
    sampled_rate = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_rate - 0.2) < 0.1

    # The first sampled SNP must match the SNP found at the same
    # chromosome and position in the source file.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A sample rate of zero must yield no variations.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # Smoke check with a very low sample rate: it only has to run.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
def stats_missing_rate_from_hdf5_memory():
    """Run the missing genotype calculator on an in-memory copy of the GTs.

    Only the '/calls/GT' field of the performance HDF5 file is copied into
    a VariationsArrays, which is then handed to calc_stat_by_chunk.
    """
    h5_path = join(TEST_DATA_DIR, 'performance', 'inca_torvum_all_snps.h5')
    on_disk = VariationsH5(h5_path, mode='r')

    in_memory = VariationsArrays()
    gt_chunks = on_disk.iterate_chunks(kept_fields=['/calls/GT'])
    in_memory.put_chunks(gt_chunks)

    calculator = _MissingGTCalculator()
    calc_stat_by_chunk(in_memory, calculator)
def test_write_vcf(self):
    """Write variations back to VCF and compare with the expected files.

    Two cases are covered: variations loaded into a VariationsArrays with
    all the fields available, and variations loaded into a VariationsH5
    from a VCF without INFO fields.
    """
    # With all fields available
    vcf_fpath = join(TEST_DATA_DIR, 'format_def_exp.vcf')
    max_field_lens = {'CALLS': {b'GT': 1, b'HQ': 2, b'DP': 1, b'GQ': 1},
                      'FILTER': 1,
                      'INFO': {b'AA': 1, b'AF': 2, b'DP': 1, b'DB': 1,
                               b'NS': 1, b'H2': 1},
                      'alt': 2}
    max_field_str_lens = {'INFO': {b'AA': 1}, 'alt': 5, 'chrom': 2,
                          'ref': 4, 'id': 10, 'FILTER': 0}
    variations = VariationsArrays(ignore_undefined_fields=True)
    # The input handle is closed even if parsing fails.
    with open(vcf_fpath, 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                        pre_read_max_size=10000)
        variations.put_vars(vcf, max_field_lens=max_field_lens,
                            max_field_str_lens=max_field_str_lens)

    with NamedTemporaryFile(mode='wb') as out_fhand:
        write_vcf(variations, out_fhand, vcf_format='VCFv4.0')
        with open(vcf_fpath, 'r') as exp_fhand:
            exp_lines = list(exp_fhand)
        # Called before the written file is reopened by name below.
        out_fhand.seek(0)
        with open(out_fhand.name) as refhand:
            for line in refhand:
                # NOTE(review): a line that is not among the expected ones
                # is only printed, so this comparison cannot fail the
                # test. Confirm whether that is intended.
                try:
                    assert line in exp_lines
                except AssertionError:
                    print('aa', line)

    # With missing info in variations
    # Only a unique path is needed; VariationsH5 creates the file itself.
    tmp_fhand = NamedTemporaryFile()
    out_fpath = tmp_fhand.name
    tmp_fhand.close()
    try:
        max_field_lens = {'INFO': {},
                          'CALLS': {b'GQ': 1, b'GT': 1, b'HQ': 2,
                                    b'DP': 1},
                          'FILTER': 1, 'alt': 2}
        max_field_str_lens = {'ref': 4, 'INFO': {}, 'id': 10, 'FILTER': 0,
                              'alt': 5, 'chrom': 2}
        in_fpath = join(TEST_DATA_DIR, 'format_def_without_info.vcf')
        with open(in_fpath, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            h5_without_info = VariationsH5(fpath=out_fpath, mode='w',
                                           ignore_undefined_fields=True)
            h5_without_info.put_vars(vcf, max_field_lens=max_field_lens,
                                     max_field_str_lens=max_field_str_lens)

        with NamedTemporaryFile(mode='wb') as out_fhand:
            write_vcf(h5_without_info, out_fhand, vcf_format='VCFv4.0')
            vcf_fpath = join(TEST_DATA_DIR,
                             'format_def_without_info_exp.vcf')
            with open(vcf_fpath, 'r') as exp_fhand:
                exp_lines = list(exp_fhand)
            out_fhand.seek(0)
            with open(out_fhand.name) as refhand:
                for line in refhand:
                    # NOTE(review): as above, mismatches are only printed.
                    try:
                        assert line in exp_lines
                    except AssertionError:
                        print(line)
    finally:
        # Do not leave the temporary HDF5 file behind.
        if os.path.exists(out_fpath):
            os.remove(out_fpath)