예제 #1
0
def filter_by_maf_by_allele_count(
        variations,
        max_allowable_maf=None,
        min_allowable_maf=None,
        filter_id='filter_by_maf_by_allele_count',
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        calc_histogram=False,
        n_bins=DEF_NUM_BINS,
        limits=None):
    """Filter variations using the MAF computed from allele counts.

    The per-variation MAF values come from ``calc_maf_by_allele_count``
    and are handed, together with the allowed bounds, to ``_select_vars``,
    which performs the actual selection.

    Args:
        variations: The variations to filter.
        max_allowable_maf: Upper MAF threshold forwarded to
            ``_select_vars``; ``None`` means no threshold is given.
        min_allowable_maf: Lower MAF threshold forwarded to
            ``_select_vars``; ``None`` means no threshold is given.
        filter_id: Identifier stored under ``FLT_ID`` in the returned dict.
        min_num_genotypes: Forwarded to ``calc_maf_by_allele_count``.
        calc_histogram: When true, a histogram of the MAF values is added
            to the filter statistics.
        n_bins: Number of histogram bins.
        limits: Range used for the histogram; defaults to ``(0, 1)``.

    Returns:
        A dict with the keys ``FLT_VARS`` (the selected variations),
        ``FLT_ID`` (``filter_id``) and ``FLT_STATS`` (the statistics
        produced by ``_select_vars``). When ``calc_histogram`` is set the
        statistics also hold ``COUNT``, ``BIN_EDGES`` and ``'limits'``,
        the latter being the list of the thresholds that are not ``None``
        (minimum first, then maximum).
    """
    mafs = calc_maf_by_allele_count(
        variations, min_num_genotypes=min_num_genotypes)
    selection = _select_vars(
        variations, mafs, min_allowable_maf, max_allowable_maf)

    if calc_histogram:
        histogram_range = (0, 1) if limits is None else limits
        counts, bin_edges = va.histogram(
            mafs, n_bins=n_bins, limits=histogram_range)

        stats = selection[FLT_STATS]
        stats[COUNT] = counts
        stats[BIN_EDGES] = bin_edges
        # Only the thresholds actually supplied by the caller are reported.
        stats['limits'] = [
            threshold
            for threshold in (min_allowable_maf, max_allowable_maf)
            if threshold is not None
        ]

    filtered_vars = selection[FLT_VARS]
    return {
        FLT_VARS: filtered_vars,
        FLT_ID: filter_id,
        FLT_STATS: selection[FLT_STATS],
    }
예제 #2
0
    def test_calc_maf_by_allele_count_in_memory(self):
        # Checks calc_maf_by_allele_count on variations backed by in-memory
        # numpy arrays: two samples and three variations.
        genotypes = np.array([
            [[-1, 1], [2, 1]],
            [[-1, -1], [-1, 2]],
            [[1, -1], [1, 1]],
        ])
        ref_observations = np.array([[-1, 8], [-1, -1], [6, 4]])
        alt_observations = np.array([
            [[1, 4], [2, 1]],
            [[-1, -1], [3, 3]],
            [[1, 4], [5, 1]],
        ])

        variations = Variations(samples=np.array(['aa', 'bb']))
        variations[GT_FIELD] = genotypes
        variations[RO_FIELD] = ref_observations
        variations[AO_FIELD] = alt_observations

        mafs = calc_maf_by_allele_count(variations, min_num_genotypes=0)

        expected_mafs = [0.5, 0.5, 0.47619048]
        # Compared pairwise to two decimal places.
        for observed, wanted in zip(mafs, expected_mafs):
            self.assertAlmostEqual(observed, wanted, places=2)
예제 #3
0
    def test_calc_maf_by_allele_count(self):
        # Checks calc_maf_by_allele_count on dask-backed variations; the
        # returned value is resolved with compute() before being compared.
        gt_values = [[[-1, 1], [2, 1]],
                     [[-1, -1], [-1, 2]],
                     [[1, -1], [1, 1]]]
        ro_values = np.array([[-1, 8], [-1, -1], [6, 4]])
        ao_values = np.array(
            [[[1, 4], [2, 1]], [[-1, -1], [3, 3]], [[1, 4], [5, 1]]])

        variations = Variations(samples=da.array(['aa', 'bb']))
        variations[GT_FIELD] = da.from_array(gt_values)
        variations[RO_FIELD] = da.from_array(ro_values)
        variations[AO_FIELD] = da.from_array(ao_values)

        # Passing the variations through this filter yields a variations
        # object whose dask arrays have unknown shapes.
        filtered = remove_low_call_rate_vars(variations, 0)
        variations = filtered[FLT_VARS]

        pending_mafs = calc_maf_by_allele_count(variations,
                                                min_num_genotypes=0)
        mafs = compute(pending_mafs)

        expected_mafs = [0.5, 0.5, 0.47619048]
        # Compared pairwise to two decimal places.
        for observed, wanted in zip(mafs, expected_mafs):
            self.assertAlmostEqual(observed, wanted, places=2)
예제 #4
0
 def test_calc_maf_by_allele_count_empty_vars(self):
     # With the variations built by _create_empty_dask_variations the
     # computed result must have shape (0,).
     empty_variations = _create_empty_dask_variations()
     pending_mafs = calc_maf_by_allele_count(empty_variations)
     computed_mafs = compute(pending_mafs)
     self.assertEqual(computed_mafs.shape, (0, ))