def test_calc_obs_het(self):
    """Exercise ``calc_obs_het`` on empty, fixture and hand-built data.

    The hand-built case pins the expected per-variation values for the
    ``min_num_genotypes``, ``min_call_dp`` and ``max_call_dp`` options.
    """
    # Empty arrays must produce an empty result.
    gts = numpy.array([])
    dps = numpy.array([])
    varis = {'/calls/GT': gts, '/calls/DP': dps}
    het = calc_obs_het(varis, min_num_genotypes=0)
    assert het.shape[0] == 0

    # The HDF5-backed container and its in-memory copy must agree.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
    het_array = calc_obs_het(snps, min_num_genotypes=0)
    assert numpy.all(het_array == het_h5)

    # Two variations, four calls each, with one depth value per call.
    # NOTE(review): -1 presumably encodes a missing allele -- confirm.
    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = numpy.array([[5, 12, 10, 10],
                       [10, 10, 10, 10]])
    varis = {'/calls/GT': gts, '/calls/DP': dps}

    het = calc_obs_het(varis, min_num_genotypes=0)
    assert numpy.allclose(het, [0.5, 0])

    # Asking for more genotypes than are available is expected to give NaN.
    # numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
    het = calc_obs_het(varis, min_num_genotypes=10)
    assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

    # The depth thresholds change the expected value of the first variation.
    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
    assert numpy.allclose(het, [1, 0])

    het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
    assert numpy.allclose(het, [0, 0])

    het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
    assert numpy.allclose(het, [0.5, 0])
def test_calc_missing_gt_rates(self):
    """Check ``calc_called_gt`` / ``calc_missing_gt`` as counts and as rates."""
    # With no genotypes at all, both output modes must return nothing.
    empty_vars = {'/calls/GT': numpy.array([])}
    for as_rates in (False, True):
        called = calc_called_gt(empty_vars, rates=as_rates)
        assert called.shape[0] == 0

    # The in-memory copy and the HDF5 file must report the same rates.
    on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(on_disk.iterate_chunks(kept_fields=['/calls/GT']))
    memory_rates = calc_missing_gt(in_memory)
    disk_rates = calc_missing_gt(on_disk)
    assert memory_rates.shape == (943,)
    assert numpy.allclose(memory_rates, disk_rates)
    assert numpy.min(memory_rates) == 0
    assert numpy.all(memory_rates <= 1)

    # Four variations with two calls each.
    toy_vars = {
        '/calls/GT': numpy.array([
            [[0, 0], [0, 0]],
            [[0, 0], [-1, -1]],
            [[0, 0], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ]),
    }

    # Absolute counts: called and missing must add up to the two calls.
    called_counts = numpy.array([2, 1, 1, 0])
    assert numpy.all(calc_called_gt(toy_vars, rates=False) == called_counts)
    assert numpy.all(
        calc_missing_gt(toy_vars, rates=False) == 2 - called_counts
    )

    # Rates (the default output): called and missing are complementary.
    missing_rates = numpy.array([0, 0.5, 0.5, 1])
    assert numpy.allclose(calc_called_gt(toy_vars), 1 - missing_rates)
    assert numpy.allclose(calc_missing_gt(toy_vars), missing_rates)
def test_calc_obs_het_sample(self):
    """Exercise ``calc_obs_het_by_sample`` on fixture, toy and empty data."""
    # The HDF5-backed container and its in-memory copy must agree.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
    het_h5 = calc_obs_het_by_sample(hdf5)
    het_array = calc_obs_het_by_sample(snps)
    assert numpy.all(het_array == het_h5)

    # Three variations, four samples; one expected value per sample.
    gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, -1], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1]]])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    # The last sample is expected to yield NaN. numpy.nan is used because
    # the numpy.NaN alias was removed in NumPy 2.0.
    assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

    # Empty input must produce an empty result.
    gts = numpy.array([])
    varis = {'/calls/GT': gts}
    het = calc_obs_het_by_sample(varis, chunk_size=None)
    assert het.shape[0] == 0

    # Smoke-run the depth filters, then check that the default call and
    # chunk_size=None return the same values.
    snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    calc_obs_het_by_sample(snps, min_call_dp=3)
    calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
    het_0 = calc_obs_het_by_sample(snps)
    het = calc_obs_het_by_sample(snps, chunk_size=None)
    assert numpy.allclose(het_0, het)
def test_iterate_chroms(self):
    """Rebuilding the variations from ``iterate_chroms`` keeps the positions."""
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_chroms()
    hd5_2 = VariationsArrays()
    # iterate_chroms yields pairs; only the second item of each is stored.
    hd5_2.put_chunks([win for _, win in wins])
    # The comparison result used to be discarded, so this test could never
    # fail. Both sides are sliced so that plain arrays are compared.
    assert numpy.all(hd5['/variations/pos'][:] == hd5_2['/variations/pos'][:])
def test_iterate_wins(self):
    """Rebuilding the variations from ``iterate_wins`` keeps the positions."""
    fpath = join(TEST_DATA_DIR, 'ril.hdf5')
    hd5 = VariationsH5(fpath, mode='r')
    wins = hd5.iterate_wins(win_size=1000000)
    hd5_2 = VariationsArrays()
    hd5_2.put_chunks(wins)
    # The comparison result used to be discarded, so this test could never
    # fail. Both sides are sliced so that plain arrays are compared.
    assert numpy.all(hd5['/variations/pos'][:] == hd5_2['/variations/pos'][:])
def sample_variations(in_vars, sample_rate, out_vars=None, chunk_size=None):
    """Store the chunks of ``in_vars``, sampled at ``sample_rate``, in ``out_vars``.

    ``sample_rate`` is forwarded to ``in_vars.iterate_chunks`` as its
    ``random_sample_rate`` argument and ``chunk_size`` as ``chunk_size``.
    When ``out_vars`` is None a new ``VariationsArrays`` is created.

    Returns the container that received the chunks.
    """
    destination = VariationsArrays() if out_vars is None else out_vars
    sampled_chunks = in_vars.iterate_chunks(
        chunk_size=chunk_size,
        random_sample_rate=sample_rate,
    )
    destination.put_chunks(sampled_chunks)
    return destination
def test_calc_distrib_for_sample(self):
    """Check the single-sample and per-sample field distribution helpers."""
    on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(on_disk.iterate_chunks())

    # One sample: the HDF5 file and the in-memory copy must agree, both
    # with chunk_size=None and with chunk_size=50.
    from_disk, _ = calc_field_distrib_for_a_sample(
        on_disk, field='/calls/DP', sample='1_17_1_gbs', n_bins=15)
    assert from_disk.shape == (15,)
    unchunked, _ = calc_field_distrib_for_a_sample(
        in_memory, field='/calls/DP', n_bins=15, sample='1_17_1_gbs',
        chunk_size=None)
    assert numpy.all(from_disk == unchunked)
    chunked, _ = calc_field_distrib_for_a_sample(
        in_memory, field='/calls/DP', n_bins=15, sample='1_17_1_gbs',
        chunk_size=50)
    assert numpy.all(chunked == unchunked)

    # Hand-built container: two variations, three samples.
    toy = VariationsArrays()
    toy['/calls/DP'] = numpy.array([[10, 5, 15], [0, 15, 10]])
    toy['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                    [[0, 0], [0, 1], [1, 1]]])
    toy.samples = list(range(3))

    def counts_row(*filled_bins):
        # A 16-bin row of zeros with a count of one in each listed bin.
        row = [0] * 16
        for bin_index in filled_bins:
            row[bin_index] = 1
        return row

    def check_per_sample(expected_rows, **mask_kwargs):
        # Compare the per-sample distribution with the expected rows, then
        # check what calc_depth returns for the container afterwards.
        counts, _ = calc_field_distribs_per_sample(
            toy, field='/calls/DP', n_bins=16, **mask_kwargs)
        assert numpy.all(expected_rows == counts)
        assert numpy.all(calc_depth(toy) == [10, 5, 15, 0, 15, 10])

    # No mask.
    check_per_sample(
        [counts_row(0, 10), counts_row(5, 15), counts_row(10, 15)])
    # Masked with call_is_het.
    check_per_sample(
        [counts_row(), counts_row(5, 15), counts_row()],
        mask_field='/calls/GT', mask_func=call_is_het)
    # Masked with call_is_hom.
    check_per_sample(
        [counts_row(0, 10), counts_row(), counts_row(10, 15)],
        mask_field='/calls/GT', mask_func=call_is_hom)
def copy_setting_gts_to_missing(in_vars, gt_rate_to_missing, out_vars=None,
                                chunk_size=None):
    """Copy ``in_vars`` into ``out_vars``, transforming every chunk on the way.

    Each chunk from ``in_vars.iterate_chunks`` is passed through
    ``_set_gts_to_missing`` together with ``gt_rate_to_missing`` before it
    is stored. When ``out_vars`` is None a new ``VariationsArrays`` is
    created.

    Returns the container that received the chunks.
    """
    destination = VariationsArrays() if out_vars is None else out_vars
    source_chunks = in_vars.iterate_chunks(chunk_size=chunk_size)
    # Lazy: each chunk is transformed only when put_chunks consumes it.
    transformed_chunks = (
        _set_gts_to_missing(source_chunk, gt_rate_to_missing)
        for source_chunk in source_chunks
    )
    destination.put_chunks(transformed_chunks)
    return destination
def test_annotator_all_samples(self):
    """Check the info field and metadata that ``IsVariableAnnotator`` adds."""
    annot_id = 'test'
    info_path = '/variations/info/{}'.format(annot_id)

    on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(on_disk.iterate_chunks())

    annotate = IsVariableAnnotator(annot_id=annot_id)
    annotated = annotate(in_memory)[ANNOTATED_VARS]

    # The new field must be described in the metadata ...
    field_meta = annotated.metadata[info_path]
    assert field_meta['Type'] == 'Integer'
    assert field_meta['Number'] == 1
    # ... and must be present in the container itself.
    assert info_path in annotated.keys()
    assert annotated[info_path][3] == TRUE_INT
def test_calc_dp_for_sample(self):
    """Check the shape of the hom/het depth distributions for one sample."""
    on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(on_disk.iterate_chunks())

    reference, _ = calc_call_dp_distrib_for_a_sample(
        on_disk, sample='1_17_1_gbs', n_bins=15)
    for call_kind in ('hom', 'het'):
        assert reference[call_kind].shape == (15,)

    # NOTE(review): this bare return makes every check below unreachable, so
    # the chunk_size comparisons are never run. Confirm whether that is
    # deliberate before removing the return.
    return

    for chunk_size in (None, 50):
        other, _ = calc_call_dp_distrib_for_a_sample(
            on_disk, sample='1_17_1_gbs', n_bins=15, chunk_size=chunk_size)
        assert numpy.all(reference['hom'] == other['hom'])
        assert numpy.all(reference['het'] == other['het'])
def test_calc_snp_density(self):
    """Check ``calc_snp_density`` on the fixture and on hand-built positions."""
    # The HDF5 file and its in-memory copy must give the same densities.
    on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_memory = VariationsArrays()
    in_memory.put_chunks(on_disk.iterate_chunks())
    from_disk = list(calc_snp_density(on_disk, 1000))
    from_memory = list(calc_snp_density(in_memory, 1000))
    assert from_memory == from_disk

    def density_for(chroms, positions):
        # Densities, with a window argument of 11, for a minimal
        # chrom/pos mapping.
        variations = {'/variations/chrom': numpy.array(chroms),
                      '/variations/pos': numpy.array(positions)}
        return list(calc_snp_density(variations, 11))

    # The same fourteen positions are reused with three chromosome layouts.
    shared_pos = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]

    # Everything on a single chromosome.
    assert density_for(['ch'] * 14, shared_pos) == [
        6, 7, 7, 7, 7, 7, 6, 1, 1, 1, 1, 1, 2, 2]

    # Three chromosomes.
    assert density_for(['ch'] * 3 + ['c2'] * 10 + ['c3'], shared_pos) == [
        3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]

    # Several chromosomes that hold a single variation each.
    assert density_for(['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3'],
                       shared_pos) == [
        1, 1, 1, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]

    # Edge cases: no variations at all, and exactly one.
    assert density_for([], []) == []
    assert density_for([1], [1]) == [1]
def stats_missing_rate_from_hdf5_memory():
    """Run ``_MissingGTCalculator`` over an in-memory copy of the performance file.

    Only the '/calls/GT' field is loaded from the HDF5 file. The result of
    ``calc_stat_by_chunk`` is discarded.
    """
    h5_path = join(TEST_DATA_DIR, 'performance', 'inca_torvum_all_snps.h5')
    on_disk = VariationsH5(h5_path, mode='r')

    in_memory = VariationsArrays()
    gt_chunks = on_disk.iterate_chunks(kept_fields=['/calls/GT'])
    in_memory.put_chunks(gt_chunks)

    calc_stat_by_chunk(in_memory, _MissingGTCalculator())
def test_create_hdf5_with_chunks(self):
    """Check ``put_chunks`` for full, field-restricted and sampled copies.

    Covers writing into a fresh HDF5 file, writing into an in-memory
    container with random sampling, and feeding empty chunk sources.
    """
    # Full copy into a new HDF5 file: every call field must be present.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    # Closing deletes the temporary file; only its unique path is reused.
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Copy restricted to the genotype field.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random sampling at a rate of 0.2 into an in-memory container.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    # The sampled positions should not differ significantly from the full
    # set of positions.
    _, prob = ttest_ind(hdf5['/variations/pos'][:],
                        hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # The check must be two-sided: without abs() any fraction below 0.3,
    # even an empty sample, would have passed.
    sampled_fraction = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_fraction - 0.2) < 0.1

    # The first sampled variation must match the same position in the
    # source file.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A sampling rate of zero must keep nothing.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # Smoke test: a very low sampling rate must not raise.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
def test_create_hdf5_with_chunks(self):
    """Check ``put_chunks`` for full, field-restricted and sampled copies.

    Covers writing into a fresh HDF5 file, writing into an in-memory
    container with random sampling, and feeding empty chunk sources.
    """
    # Full copy into a new HDF5 file: every call field must be present.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    # Closing deletes the temporary file; only its unique path is reused.
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks())
        assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Copy restricted to the genotype field.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    out_fhand = NamedTemporaryFile(suffix='.hdf5')
    out_fpath = out_fhand.name
    out_fhand.close()
    hdf5_2 = VariationsH5(out_fpath, 'w')
    try:
        hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        assert list(hdf5_2['calls'].keys()) == ['GT']
        assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
    finally:
        os.remove(out_fpath)

    # Random sampling at a rate of 0.2 into an in-memory container.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
    # The sampled positions should not differ significantly from the full
    # set of positions.
    _, prob = scipy.stats.ttest_ind(hdf5['/variations/pos'][:],
                                    hdf5_2['/variations/pos'][:])
    assert prob > 0.05
    # The check must be two-sided: without abs() any fraction below 0.3,
    # even an empty sample, would have passed.
    sampled_fraction = hdf5_2.num_variations / hdf5.num_variations
    assert abs(sampled_fraction - 0.2) < 0.1

    # The first sampled variation must match the same position in the
    # source file.
    chrom = hdf5_2['/variations/chrom'][0]
    pos = hdf5_2['/variations/pos'][0]
    index = PosIndex(hdf5)
    idx = index.index_pos(chrom, pos)
    old_snp = hdf5['/calls/GT'][idx]
    new_snp = hdf5_2['/calls/GT'][0]
    assert numpy.all(old_snp == new_snp)

    # putting empty chunks
    hdf5_2.put_chunks(None)
    hdf5_2.put_chunks([])
    chunk = hdf5.get_chunk(slice(1000, None))
    hdf5_2.put_chunks([chunk])
    old_snp = hdf5['/calls/DP'][idx]
    new_snp = hdf5_2['/calls/DP'][0]
    assert numpy.all(old_snp == new_snp)

    # A sampling rate of zero must keep nothing.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    hdf5_2 = VariationsArrays()
    hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
    assert hdf5_2.num_variations == 0

    # Smoke test: a very low sampling rate must not raise.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    hdf5_3 = VariationsArrays()
    hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))