def test_filter_chi2_gt_sample_sets(self):
    """Check Chi2GtFreqs2SampleSetsFilter on two three-sample sets.

    The fixture holds four variations over six samples: in the first two
    every sample is 0/0, in the last two the genotypes go from 0/0 to 1/1
    across the samples.  With min_pval=0.05 the test expects only the
    first two variations to be kept, and it also checks that the
    selection report and the discarded variations are returned on request.
    """
    all_ref_row = [[0, 0]] * 6
    split_row = [[0, 0], [0, 0], [0, 1], [1, 1], [1, 1], [1, 1]]
    gts = numpy.array([all_ref_row, all_ref_row, split_row, split_row])

    variations = VariationsArrays()
    variations[GT_FIELD] = gts
    variations.samples = [1, 2, 3, 4, 5, 6]

    first_set = [1, 2, 3]
    second_set = [4, 5, 6]
    shared_kwargs = {'min_pval': 0.05, 'n_bins': 2}

    # First run: ask for the selection report as well.
    chi2_filter = Chi2GtFreqs2SampleSetsFilter(first_set, second_set,
                                               report_selection=True,
                                               **shared_kwargs)
    result = chi2_filter(variations)

    assert list(result[COUNTS]) == [2, 2]
    assert numpy.all(result[FLT_VARS][GT_FIELD] == gts[:2, ...])

    stats = result[FLT_STATS]
    assert stats[N_KEPT] == 2
    assert stats[TOT] == 4
    assert stats[N_FILTERED_OUT] == 2
    assert result[SELECTED_VARS].shape

    # Second run: ask for the discarded variations instead.
    chi2_filter = Chi2GtFreqs2SampleSetsFilter(first_set, second_set,
                                               return_discarded=True,
                                               **shared_kwargs)
    result = chi2_filter(variations)
    assert result[DISCARDED_VARS].num_variations
def test_filter_or(self):
    """Check that OrFilter over two ObsHetFilter instances keeps everything.

    One filter sets no heterozygosity threshold and the other requires
    min_het=0.2; the test expects their OR combination to keep all four
    variations of the fixture.
    """
    gts = numpy.array([
        [[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
        [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
        [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]],
    ])
    variations = VariationsArrays()
    variations[GT_FIELD] = gts
    variations.samples = [1, 2, 3, 4, 5]

    het_filters = [
        ObsHetFilter(min_num_genotypes=0),
        ObsHetFilter(min_het=0.2, min_num_genotypes=0),
    ]
    result = OrFilter(het_filters)(variations)

    assert numpy.all(result[FLT_VARS][GT_FIELD] == gts)

    stats = result[FLT_STATS]
    assert stats[N_KEPT] == 4
    assert stats[TOT] == 4
    assert stats[N_FILTERED_OUT] == 0
def test_filter_obs_het(self):
    """Check ObsHetFilter against a small in-memory genotype matrix.

    The live part of the test covers: no heterozygosity threshold (all
    four variations kept), min_het=0.2 (variations 0, 2 and 3 kept),
    min_num_genotypes=10 (nothing kept) and the same call with
    keep_missing=True (everything kept).

    NOTE(review): the bare ``return`` below ends the test early, so the
    max_het checks and every check against 'ril.hdf5' never run.  It
    looks like a debugging leftover -- confirm whether those assertions
    still hold and then remove the ``return`` (or delete the dead code).
    """
    gts = numpy.array([
        [[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
        [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
        [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]],
    ])
    variations = VariationsArrays()
    variations[GT_FIELD] = gts
    variations.samples = [1, 2, 3, 4, 5]

    def check_stats(result, n_kept, n_filtered_out):
        # The fixture always holds four variations in total.
        stats = result[FLT_STATS]
        assert stats[N_KEPT] == n_kept
        assert stats[TOT] == 4
        assert stats[N_FILTERED_OUT] == n_filtered_out

    # No het thresholds: nothing is removed, selection report requested.
    obs_het_filter = ObsHetFilter(min_num_genotypes=0, report_selection=True)
    result = obs_het_filter(variations)
    assert numpy.all(result[FLT_VARS][GT_FIELD] == gts)
    check_stats(result, 4, 0)
    assert result[SELECTED_VARS].shape

    # min_het=0.2 drops the second variation, which has no het call.
    result = ObsHetFilter(min_het=0.2, min_num_genotypes=0)(variations)
    assert numpy.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
    check_stats(result, 3, 1)

    # Only five samples are available, fewer than min_num_genotypes=10.
    result = ObsHetFilter(min_het=0.2, min_num_genotypes=10)(variations)
    check_stats(result, 0, 4)

    # Same thresholds, but keep_missing=True is expected to keep them all.
    result = ObsHetFilter(min_het=0.2, min_num_genotypes=10,
                          keep_missing=True)(variations)
    check_stats(result, 4, 0)

    # NOTE(review): everything below this return is unreachable.
    return

    result = ObsHetFilter(max_het=0.1, min_num_genotypes=0)(variations)
    assert numpy.all(result[FLT_VARS][GT_FIELD] == gts[[1]])

    result = ObsHetFilter(min_het=0.2, max_het=0.3,
                          min_num_genotypes=0)(variations)
    assert numpy.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])

    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

    result = ObsHetFilter(min_het=0.6, max_het=0.9)(hdf5)
    gt_counts = Counter(result[FLT_VARS][GT_FIELD].flat)
    assert gt_counts == {}

    result = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5)(hdf5)
    gt_counts = Counter(result[FLT_VARS][GT_FIELD].flat)
    assert gt_counts == {0: 978, -1: 910, 1: 774, 2: 92}

    result = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5,
                          n_bins=3, range_=(0, 1))(hdf5)
    gt_counts = Counter(result[FLT_VARS][GT_FIELD].flat)
    assert gt_counts == {0: 978, -1: 910, 1: 774, 2: 92}
    assert numpy.all(result[COUNTS] == [391, 14, 10])
    assert numpy.all(result[EDGES] == [0, 1 / 3, 2 / 3, 1])

    first_samples = hdf5.samples[:50]
    result = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5,
                          n_bins=3, range_=(0, 1),
                          samples=first_samples)(hdf5)
    gt_counts = Counter(result[FLT_VARS][GT_FIELD].flat)
    assert sum(result[COUNTS]) == sum([339, 14, 6])