示例#1
0
    def test_filter_by_call_rate(self):
        """Chain two call-rate filters and check stats and kept variations."""
        variations = create_dask_variations()
        pipeline_futures = {}

        # No filter_id is passed to the first filter; the assertions below
        # expect its stats under the 'call_rate' key.
        first_pass = remove_low_call_rate_vars(variations, min_call_rate=0.5)
        _add_task_to_pipeline(pipeline_futures, first_pass)

        # The second filter consumes the output of the first one and reports
        # its stats under its own id.
        second_pass = remove_low_call_rate_vars(first_pass[FLT_VARS],
                                                min_call_rate=0.5,
                                                filter_id='call_rate2')
        _add_task_to_pipeline(pipeline_futures, second_pass)

        processed = compute(pipeline_futures, store_variation_to_memory=True)

        # (filter id, expected kept, expected filtered out)
        expected_counts = (('call_rate', 5, 2), ('call_rate2', 5, 0))
        for filter_id, n_kept, n_filtered_out in expected_counts:
            stats = processed[FLT_STATS][filter_id]
            self.assertEqual(stats[N_KEPT], n_kept)
            self.assertEqual(stats[N_FILTERED_OUT], n_filtered_out)

        kept_vars = processed[FLT_VARS]
        self.assertEqual(kept_vars[GT_FIELD].shape, (5, 3, 2))
        # Samples and metadata of the result are compared with the input ones.
        samples_match = kept_vars.samples == variations.samples.compute()
        self.assertTrue(np.all(samples_match))
        self.assertEqual(kept_vars.metadata, variations.metadata)
示例#2
0
    def test_filter_by_call_rate_in_memory(self):
        """Chain two call-rate filters on variations already held in memory."""
        lazy_variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        # Materialize the variations before any filter is applied.
        computed = compute({'vars': lazy_variations},
                           store_variation_to_memory=True)
        variations = computed['vars']
        pipeline_futures = {}

        first_pass = remove_low_call_rate_vars(variations, min_call_rate=0.5)
        _add_task_to_pipeline(pipeline_futures, first_pass)

        second_pass = remove_low_call_rate_vars(first_pass[FLT_VARS],
                                                min_call_rate=0.5,
                                                filter_id='call_rate2')
        _add_task_to_pipeline(pipeline_futures, second_pass)

        # The pipeline dict is inspected directly; compute() is not called
        # on it in this test.
        processed = pipeline_futures

        # (filter id, expected kept, expected filtered out)
        expected_counts = (('call_rate', 5, 2), ('call_rate2', 5, 0))
        for filter_id, n_kept, n_filtered_out in expected_counts:
            stats = processed[FLT_STATS][filter_id]
            self.assertEqual(stats[N_KEPT], n_kept)
            self.assertEqual(stats[N_FILTERED_OUT], n_filtered_out)

        kept_vars = processed[FLT_VARS]
        self.assertEqual(kept_vars[GT_FIELD].shape, (5, 3, 2))
        samples_match = kept_vars.samples == variations.samples
        self.assertTrue(np.all(samples_match))
        self.assertEqual(kept_vars.metadata, variations.metadata)
示例#3
0
 def test_save_to_zarr(self):
     """Store variations in a zarr directory, reload them and compare fields."""
     source_path = TEST_DATA_DIR / 'test.zarr'
     variations = load_zarr(source_path, num_vars_per_chunk=2)
     # Filtering with a threshold of 0 is done to get a variation whose
     # dask arrays have unknown shapes.
     variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]
     with TemporaryDirectory() as tmp_dir:
         store_path = Path(tmp_dir)
         store_task = prepare_zarr_storage(variations, store_path)
         dask.compute(store_task, scheduler='sync')
         reloaded = load_zarr(store_path)
         samples_match = (variations.samples.compute()
                          == reloaded.samples.compute())
         self.assertTrue(np.all(samples_match))
         for field in VARIATION_FIELDS + CALL_FIELDS:
             # The QUAL field is left out of the comparison.
             if field == QUAL_FIELD:
                 continue
             lazy_expected = variations[field]
             # Fields absent from the source variations are skipped.
             if lazy_expected is None:
                 continue
             expected = lazy_expected.compute()
             actual = reloaded[field].compute()
             try:
                 self.assertTrue(np.all(expected == actual))
             except AssertionError:
                 # Print both arrays row by row to help locate the
                 # mismatch, then let the failure propagate.
                 for row in range(expected.shape[0]):
                     print(row, expected[row, ...], actual[row, ...])
                 raise
示例#4
0
    def test_save_to_hdf5(self):
        """Store variations in an HDF5 file, reload them and compare fields."""
        source_path = TEST_DATA_DIR / 'test.h5'
        variations = load_hdf5(source_path)
        # Filtering with a threshold of 0 is done to get a variation whose
        # dask arrays have unknown shapes.
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        with NamedTemporaryFile(suffix='.h5') as tmp_file:
            store_path = Path(tmp_file.name)
            store_task = prepare_hdf5_storage(variations, store_path)
            dask.compute(store_task)
            reloaded = load_hdf5(store_path)

            self.assertEqual(variations.metadata, reloaded.metadata)
            samples_match = (variations.samples.compute()
                             == reloaded.samples.compute())
            self.assertTrue(np.all(samples_match))

            for field in VARIATION_FIELDS + CALL_FIELDS:
                # The QUAL field is left out of the comparison.
                if field == QUAL_FIELD:
                    continue
                lazy_expected = variations[field]
                # Fields absent from the source variations are skipped.
                if lazy_expected is None:
                    continue
                expected = lazy_expected.compute()
                actual = reloaded[field].compute()
                self.assertTrue(np.all(expected == actual))
示例#5
0
    def test_calc_obs_het(self):
        """Check calc_obs_het with and without call-depth thresholds."""
        variations = Variations(samples=da.array(['a', 'b', 'c', 'd']))
        genotypes = np.array([
            [[0, 0], [0, 1], [0, -1], [-1, -1]],
            [[0, 0], [0, 0], [0, -1], [-1, -1]],
        ])
        depths = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
        variations[GT_FIELD] = da.from_array(genotypes)
        variations[DP_FIELD] = da.from_array(depths)
        # Filtering with a threshold of 0 is done to get a variation whose
        # dask arrays have unknown shapes.
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        # Each case: (extra keyword arguments, expected result per variation).
        # A min_num_genotypes=10 case expecting NaN for both variations is
        # currently disabled.
        cases = (
            ({}, [0.5, 0]),
            ({'min_call_dp_for_het_call': 10}, [1, 0]),
            ({'max_call_dp_for_het_call': 11}, [0, 0]),
            ({'min_call_dp_for_het_call': 5}, [0.5, 0]),
        )
        for depth_kwargs, expected in cases:
            het = calc_obs_het(variations, min_num_genotypes=0,
                               **depth_kwargs)
            self.assertTrue(np.allclose(het.compute(), expected))
示例#6
0
    def test_filter_by_call_rate_twice(self):
        """Run a second call-rate filter on the output of a first one."""
        variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        pipeline_futures = {}

        # A min_call_rate above 1 is not a meaningful rate; it is used here
        # so that the first filter removes everything.
        first_pass = remove_low_call_rate_vars(variations, min_call_rate=1.1)
        pipeline_futures.update(first_pass)

        second_pass = remove_low_call_rate_vars(first_pass[FLT_VARS],
                                                min_call_rate=0.5,
                                                filter_id='call_rate2')
        pipeline_futures.update(second_pass)

        processed = compute(pipeline_futures, store_variation_to_memory=True)
        expected_stats = {'n_kept': 0, 'n_filtered_out': 0}
        self.assertEqual(processed[FLT_STATS], expected_stats)
示例#7
0
    def test_filter_and_hist_by_call_rate(self):
        """Check the histogram stats produced by the call-rate filter."""
        variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        pipeline_futures = {}

        task = remove_low_call_rate_vars(variations,
                                         min_call_rate=0.5,
                                         calc_histogram=True)
        _add_task_to_pipeline(pipeline_futures, task)
        processed = compute(pipeline_futures, store_variation_to_memory=True)

        # The counts are expected to have DEF_NUM_BINS items and the edges
        # one more.
        n_counts = len(processed[FLT_STATS]['call_rate'][COUNT])
        self.assertEqual(n_counts, DEF_NUM_BINS)
        n_edges = len(processed[FLT_STATS]['call_rate'][BIN_EDGES])
        self.assertEqual(n_edges, DEF_NUM_BINS + 1)
        # The threshold used by the filter is reported back as a limit.
        limits = processed[FLT_STATS]['call_rate']['limits']
        self.assertEqual(limits, [0.5])
示例#8
0
    def test_filter_and_hist_by_call_rate_in_memory(self):
        """Check the call-rate histogram stats for in-memory variations."""
        lazy_variations = create_dask_variations()
        # Materialize the variations before the filter is applied.
        computed = compute({'vars': lazy_variations},
                           store_variation_to_memory=True)
        variations = computed['vars']
        pipeline_futures = {}

        task = remove_low_call_rate_vars(variations,
                                         min_call_rate=0.5,
                                         calc_histogram=True)
        _add_task_to_pipeline(pipeline_futures, task)
        # The pipeline dict is inspected directly; compute() is not called
        # on it in this test.
        processed = pipeline_futures

        n_counts = len(processed[FLT_STATS]['call_rate'][COUNT])
        self.assertEqual(n_counts, DEF_NUM_BINS)
        n_edges = len(processed[FLT_STATS]['call_rate'][BIN_EDGES])
        self.assertEqual(n_edges, DEF_NUM_BINS + 1)
        limits = processed[FLT_STATS]['call_rate']['limits']
        self.assertEqual(limits, [0.5])
示例#9
0
    def test_calc_mac(self):
        """Check calc_mac on a small genotype array that has missing calls."""
        variations = Variations(samples=da.array(['aa', 'bb']))

        genotypes = np.array([
            [[0, 0], [0, 0]],
            [[0, 2], [1, -1]],
            [[0, 0], [1, 1]],
            [[-1, -1], [-1, -1]],
        ])
        variations[GT_FIELD] = da.from_array(genotypes)
        # Filtering with a threshold of 0 is done to get a variation whose
        # dask arrays have unknown shapes.
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        mac_task = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
        observed_macs = compute(mac_task)
        expected_macs = [2, 1, 1, math.nan]
        for observed, expected in zip(observed_macs, expected_macs):
            if math.isnan(observed):
                # NaN never compares equal, so it is matched explicitly.
                self.assertTrue(math.isnan(expected))
            else:
                self.assertAlmostEqual(observed, expected, places=2)
示例#10
0
    def test_gts_to_012mat(self):
        """Check gts_as_mat012 on filtered, freshly loaded and in-memory data."""
        zarr_path = TEST_DATA_DIR / 'test.zarr'
        expected = [
            [-1, 0, 2],
            [-1, 0, 2],
            [-1, 0, 2],
            [1, -1, 0],
            [-1, -1, -1],
            [-1, 1, -1],
            [-1, 1, 2],
        ]

        # Variations that went through a call-rate filter with threshold 0.
        filtered = remove_low_call_rate_vars(load_zarr(zarr_path),
                                             min_call_rate=0)[FLT_VARS]
        mat012 = va.gts_as_mat012(filtered[GT_FIELD])
        self.assertTrue(np.allclose(expected, mat012.compute()))

        # Variations as returned by load_zarr.
        loaded = load_zarr(zarr_path)
        mat012 = va.gts_as_mat012(loaded[GT_FIELD])
        self.assertTrue(np.allclose(expected, mat012.compute()))

        # Variations materialized in memory: the result is compared as is,
        # without calling compute() on it.
        in_memory = compute({'vars': load_zarr(zarr_path)},
                            store_variation_to_memory=True)['vars']
        mat012 = va.gts_as_mat012(in_memory[GT_FIELD])
        self.assertTrue(np.allclose(expected, mat012))
示例#11
0
    def test_calc_maf_by_allele_count(self):
        """Check calc_maf_by_allele_count on hand-written GT, RO and AO data."""
        variations = Variations(samples=da.array(['aa', 'bb']))
        variations[GT_FIELD] = da.from_array([
            [[-1, 1], [2, 1]],
            [[-1, -1], [-1, 2]],
            [[1, -1], [1, 1]],
        ])
        ref_counts = np.array([[-1, 8], [-1, -1], [6, 4]])
        variations[RO_FIELD] = da.from_array(ref_counts)
        alt_counts = np.array([
            [[1, 4], [2, 1]],
            [[-1, -1], [3, 3]],
            [[1, 4], [5, 1]],
        ])
        variations[AO_FIELD] = da.from_array(alt_counts)
        # Filtering with a threshold of 0 is done to get a variation whose
        # dask arrays have unknown shapes.
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        maf_task = calc_maf_by_allele_count(variations, min_num_genotypes=0)
        observed_mafs = compute(maf_task)

        expected_mafs = [0.5, 0.5, 0.47619048]
        for observed, expected in zip(observed_mafs, expected_mafs):
            self.assertAlmostEqual(observed, expected, places=2)
示例#12
0
def _create_dask_variations():
    """Load the test zarr data and return it after a call-rate filter of 0."""
    loaded = load_zarr(TEST_DATA_DIR / 'test.zarr')
    filter_result = remove_low_call_rate_vars(loaded, min_call_rate=0)
    return filter_result[FLT_VARS]
示例#13
0
 def test_iterate_chunk_pairs(self):
     """Check that every item yielded by iterate_chunk_pairs has length 2."""
     variations = load_zarr(TEST_DATA_DIR / 'test.zarr', num_vars_per_chunk=1)
     # The variations are passed through a call-rate filter with threshold 0
     # before being iterated.
     variations = remove_low_call_rate_vars(variations, min_call_rate=0)[FLT_VARS]
     for pair in iterate_chunk_pairs(variations, max_distance=100000):
         # assertEqual is required here: assertTrue(len(pair), 2) would take
         # the 2 as the failure message and only check that the length is
         # non-zero.
         self.assertEqual(len(pair), 2)
示例#14
0
def create_non_materialized_snp_filtered_variations():
    """Return the dask test variations after a call-rate filter of 0."""
    filter_result = remove_low_call_rate_vars(create_dask_variations(),
                                              min_call_rate=0)
    return filter_result[FLT_VARS]
示例#15
0
def _create_empty_dask_variations():
    """Return the variations left by a call-rate filter with threshold 1.1."""
    loaded = _load_one_dask()
    # NOTE(review): a min_call_rate above 1 presumably filters out every
    # variation, which would match the "empty" in the name - confirm.
    filter_result = remove_low_call_rate_vars(loaded, min_call_rate=1.1)
    return filter_result[FLT_VARS]