def test_calc_obs_het_in_memory(self):
    """Check calc_obs_het on variations backed by in-memory NumPy arrays.

    The fixture holds two variations and four samples (genotype array of
    shape (2, 4, 2)) together with a per-call depth matrix.  Observed
    heterozygosity is computed with ``min_num_genotypes=0`` under several
    depth thresholds for het calls and compared with the expected value
    for each variation.
    """
    genotypes = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                          [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    depths = np.array([[5, 12, 10, 10],
                       [10, 10, 10, 10]])

    variations = Variations(samples=np.array(['a', 'b', 'c', 'd']))
    variations[GT_FIELD] = genotypes
    variations[DP_FIELD] = depths

    # (extra keyword arguments for calc_obs_het, expected het per variation)
    # NOTE: the min_num_genotypes=10 case (expected NaN for both
    # variations) is currently disabled in this test.
    cases = (
        ({}, [0.5, 0]),
        ({'min_call_dp_for_het_call': 10}, [1, 0]),
        ({'max_call_dp_for_het_call': 11}, [0, 0]),
        ({'min_call_dp_for_het_call': 5}, [0.5, 0]),
    )
    for extra_kwargs, expected in cases:
        observed = calc_obs_het(variations, min_num_genotypes=0,
                                **extra_kwargs)
        self.assertTrue(np.allclose(observed, expected))
def test_calc_obs_het2(self):
    """Check calc_obs_het on dask-backed variations, including the NaN case.

    The fixture holds two variations and four samples (genotype array of
    shape (2, 4, 2)) with a per-call depth matrix, both wrapped with
    ``da.from_array``.  Every result is passed through ``compute`` before
    being compared with the expected per-variation values.

    Cases covered:
      * no depth filter;
      * ``min_num_genotypes=10``, for which NaN is expected for both
        variations;
      * minimum / maximum call depth thresholds for het calls.
    """
    gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                    [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = np.array([[5, 12, 10, 10],
                    [10, 10, 10, 10]])
    # One sample name per column of the genotype array: '0', '1', ...
    samples = np.array([str(i) for i in range(gts.shape[1])])

    variations = Variations(samples=da.array(samples))
    variations[GT_FIELD] = da.from_array(gts)
    variations[DP_FIELD] = da.from_array(dps)

    het = calc_obs_het(variations, min_num_genotypes=0)
    het = compute(het)
    self.assertTrue(np.allclose(het, [0.5, 0]))

    het = calc_obs_het(variations, min_num_genotypes=10)
    het = compute(het)
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    self.assertTrue(np.allclose(het, [np.nan, np.nan], equal_nan=True))

    het = calc_obs_het(variations, min_num_genotypes=0,
                       min_call_dp_for_het_call=10)
    het = compute(het)
    self.assertTrue(np.allclose(het, [1, 0]))

    het = calc_obs_het(variations, min_num_genotypes=0,
                       max_call_dp_for_het_call=11)
    het = compute(het)
    self.assertTrue(np.allclose(het, [0, 0]))

    het = calc_obs_het(variations, min_num_genotypes=0,
                       min_call_dp_for_het_call=5)
    het = compute(het)
    self.assertTrue(np.allclose(het, [0.5, 0]))
def test_calc_obs_het(self):
    """Check calc_obs_het on dask-backed variations after a filter step.

    The fixture holds two variations and four samples (genotype array of
    shape (2, 4, 2)) with a per-call depth matrix, both wrapped with
    ``da.from_array``.  The variations are first passed through
    ``remove_low_call_rate_vars`` and the lazy result of ``calc_obs_het``
    is materialised with ``.compute()`` before each comparison.
    """
    genotypes = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                          [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    depths = np.array([[5, 12, 10, 10],
                       [10, 10, 10, 10]])

    variations = Variations(samples=da.array(['a', 'b', 'c', 'd']))
    variations[GT_FIELD] = da.from_array(genotypes)
    variations[DP_FIELD] = da.from_array(depths)

    # Running the filter first yields variations whose dask arrays have
    # unknown shapes, which is the situation this test targets.
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    # (extra keyword arguments for calc_obs_het, expected het per variation)
    # NOTE: the min_num_genotypes=10 case (expected NaN for both
    # variations) is currently disabled in this test.
    cases = (
        ({}, [0.5, 0]),
        ({'min_call_dp_for_het_call': 10}, [1, 0]),
        ({'max_call_dp_for_het_call': 11}, [0, 0]),
        ({'min_call_dp_for_het_call': 5}, [0.5, 0]),
    )
    for extra_kwargs, expected in cases:
        lazy_het = calc_obs_het(variations, min_num_genotypes=0,
                                **extra_kwargs)
        self.assertTrue(np.allclose(lazy_het.compute(), expected))
def filter_by_obs_heterocigosis(
        variations, max_allowable_het=None, min_allowable_het=None,
        min_call_dp_for_het_call=None, max_call_dp_for_het_call=None,
        filter_id='obs_het',
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        calc_histogram=False, n_bins=DEF_NUM_BINS, limits=None):
    """Filter variations by their observed heterozygosity.

    The observed heterozygosity of every variation is obtained from
    ``calc_obs_het`` and handed to ``_select_vars`` together with the
    allowed minimum and maximum.

    Args:
        variations: variations object to filter.
        max_allowable_het: upper bound passed to ``_select_vars`` as
            ``max_allowable``; ``None`` leaves it unset.
        min_allowable_het: lower bound passed to ``_select_vars`` as
            ``min_allowable``; ``None`` leaves it unset.
        min_call_dp_for_het_call: forwarded to ``calc_obs_het``.
        max_call_dp_for_het_call: forwarded to ``calc_obs_het``.
        filter_id: identifier stored under ``FLT_ID`` in the result.
        min_num_genotypes: forwarded to ``calc_obs_het``.
        calc_histogram: when true, a histogram of the observed
            heterozygosity is added to the stats.
        n_bins: number of histogram bins.
        limits: histogram range; defaults to ``(0, 1)`` when ``None``.
            Only used when ``calc_histogram`` is true.

    Returns:
        A dict with the keys ``FLT_VARS`` (the selected variations),
        ``FLT_ID`` (``filter_id``) and ``FLT_STATS``.  The stats always
        carry a ``'limits'`` list holding whichever of the min/max
        allowable thresholds were given (min first) -- note this is the
        filter thresholds, not the histogram range -- and, when
        ``calc_histogram`` is true, the histogram under ``COUNT`` and
        ``BIN_EDGES``.
    """
    obs_het = calc_obs_het(
        variations,
        min_num_genotypes=min_num_genotypes,
        min_call_dp_for_het_call=min_call_dp_for_het_call,
        max_call_dp_for_het_call=max_call_dp_for_het_call,
    )
    selection = _select_vars(
        variations,
        obs_het,
        min_allowable=min_allowable_het,
        max_allowable=max_allowable_het,
    )
    stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, 1) if limits is None else limits
        counts, bin_edges = va.histogram(obs_het, n_bins=n_bins,
                                         limits=hist_range)
        stats[COUNT] = counts
        stats[BIN_EDGES] = bin_edges

    # Record the thresholds that were actually applied, lower bound first.
    stats['limits'] = [
        threshold
        for threshold in (min_allowable_het, max_allowable_het)
        if threshold is not None
    ]

    return {
        FLT_VARS: selection[FLT_VARS],
        FLT_ID: filter_id,
        FLT_STATS: stats,
    }