Example #1
    def test_kosman_missing_in_memory(self):
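        # The a/b and c/d pairs differ only at sites where at least one
        # genotype is missing, so _kosman should yield identical per-SNP
        # values for both pairs (missing calls presumably do not contribute).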
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        distance_ab = _kosman(vars1, vars2)

        c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                      [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                      [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        distance_cd = _kosman(vars1, vars2)

        assert np.all(distance_ab == distance_cd)
Example #2
    def test_empty_pop(self):
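        # Dest (Jost) distances between two populations: the first dataset
        # gives a finite value, while in the second one every genotype of the
        # first population is missing, so the distance ends up as NaN.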
        missing = (-1, -1)
        gts = [
            [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2), (3, 4), (2, 2),
             (2, 4), (4, 4), (-1, -1)],
            [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2), (3, 4), (2, 2),
             (2, 4), (4, 4), (-1, -1)],
            [
                missing, missing, missing, missing, missing, (3, 2), (3, 4),
                (2, 2), (2, 4), (4, 4), (-1, -1)
            ],
        ]
        dps = [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0],
               [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0],
               [0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0]]
        samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

        variations = Variations()
        variations.samples = da.from_array(np.array(samples))
        variations[GT_FIELD] = da.from_array(np.array(gts))
        variations[DP_FIELD] = da.from_array(np.array(dps))

        dists = calc_dset_pop_distance(variations,
                                       max_alleles=5,
                                       silence_runtime_warnings=True,
                                       populations=pops,
                                       min_num_genotypes=0)
        assert np.allclose(dists, [0.65490196])

        gts = [
            [
                missing, missing, missing, missing, missing, (3, 2), (3, 4),
                (2, 2), (2, 4), (4, 4), (-1, -1)
            ],
            [
                missing, missing, missing, missing, missing, (3, 2), (3, 4),
                (2, 2), (2, 4), (4, 4), (-1, -1)
            ],
            [
                missing, missing, missing, missing, missing, (3, 2), (3, 4),
                (2, 2), (2, 4), (4, 4), (-1, -1)
            ],
        ]
        dps = [[0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0],
               [0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0],
               [0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0]]
        samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

        variations = Variations()
        variations.samples = da.from_array(np.array(samples))
        variations[GT_FIELD] = da.from_array(np.array(gts))
        variations[DP_FIELD] = da.from_array(np.array(dps))
        dists = calc_dset_pop_distance(variations,
                                       max_alleles=5,
                                       silence_runtime_warnings=True,
                                       populations=pops,
                                       min_num_genotypes=0)
        assert np.isnan(dists[0])
Example #3
    def test_calc_missing_memory(self):
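        # calc_missing_gt with rates=False counts missing genotype calls per
        # variant; a half-called genotype such as [0, -1] apparently
        # contributes 0.5 to the count.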
        variations = Variations()
        gts = np.array([[[0, 0], [0, 0]], [[0, 0], [-1, -1]],
                        [[0, 0], [-1, -1]], [[-1, -1], [-1, -1]]])
        samples = [str(i) for i in range(gts.shape[1])]
        variations.samples = np.array(samples)
        variations[GT_FIELD] = gts

        result = calc_missing_gt(variations, rates=False)

        expected = np.array([2, 1, 1, 0])
        assert np.all(result == 2 - expected)

        gts = np.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, -1]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
                        [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                        [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        samples = [str(i) for i in range(gts.shape[1])]
        variations = Variations()
        variations.samples = np.array(samples)
        variations[GT_FIELD] = gts
        result = calc_missing_gt(variations, rates=False)
        #         result = compute(task)
        expected = np.array([0.5, 1, 2, 4])
        assert np.all(result == expected)
Example #4
    def test_nei_dist_in_memory(self):
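        # Unbiased Nei distance between populations [1, 2] and [3, 4]; a
        # dataset where every call is missing yields NaN.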

        gts = np.array([[[1, 1], [5, 2], [2, 2], [3, 2]],
                        [[1, 1], [1, 2], [2, 2], [2, 1]],
                        [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations = Variations()
        variations.samples = np.array([1, 2, 3, 4])
        variations[GT_FIELD] = gts

        pops = [[1, 2], [3, 4]]
        dists = calc_pop_pairwise_unbiased_nei_dists(
            variations,
            max_alleles=6,
            populations=pops,
            silence_runtime_warnings=True,
            min_num_genotypes=1)
        assert math.isclose(dists[0], 0.3726315908494797)

        # all missing
        gts = np.array([[[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations = Variations()
        variations.samples = np.array([1, 2, 3, 4])
        variations[GT_FIELD] = gts

        pops = [[1, 2], [3, 4]]
        dists = calc_pop_pairwise_unbiased_nei_dists(
            variations,
            max_alleles=1,
            populations=pops,
            silence_runtime_warnings=True,
            min_num_genotypes=1)
        assert math.isnan(dists[0])

        # min_num_genotypes
        gts = np.array([[[1, 1], [5, 2], [2, 2], [3, 2]],
                        [[1, 1], [1, 2], [2, 2], [2, 1]],
                        [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])

        variations = Variations()
        variations.samples = np.array([1, 2, 3, 4])
        variations[GT_FIELD] = gts
        pops = [[1, 2], [3, 4]]
        dists = calc_pop_pairwise_unbiased_nei_dists(
            variations,
            max_alleles=6,
            populations=pops,
            silence_runtime_warnings=True,
            min_num_genotypes=1)
        assert math.isclose(dists[0], 0.3726315908494797)
Example #5
    def test_kosman_2_indis(self):
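        # Kosman distances computed lazily with dask: a vs b gives 1/3,
        # identical individuals give 0, and b vs d gives 0.45.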
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)
        distance = distance_ab.sum() / distance_ab.shape[0]

        assert distance == 1 / 3

        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0

        variations = Variations()
        gts = np.stack((b, d), axis=1)
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0.45
Example #6
    def test_calc_obs_het2(self):
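        # Observed heterozygosity per variant; min/max_call_dp_for_het_call
        # appear to exclude calls whose depth (DP) falls outside the given
        # range from the calculation.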

        gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                        [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations = Variations(samples=da.array(samples))
        variations[GT_FIELD] = da.from_array(gts)
        variations[DP_FIELD] = da.from_array(dps)

        het = calc_obs_het(variations, min_num_genotypes=0)
        het = compute(het)
        assert np.allclose(het, [0.5, 0])
        het = calc_obs_het(variations, min_num_genotypes=10)
        het = compute(het)
        assert np.allclose(het, [np.NaN, np.NaN], equal_nan=True)

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=10)
        het = compute(het)
        assert np.allclose(het, [1, 0])

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           max_call_dp_for_het_call=11)
        het = compute(het)
        assert np.allclose(het, [0, 0])

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=5)
        het = compute(het)
        assert np.allclose(het, [0.5, 0])
Example #7
    def test_dest_jost_distance_in_memory(self):
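        # Dest (Jost) distance between the two populations; with
        # min_num_genotypes=6 neither population reaches six called genotypes
        # (population 1 has five samples and sample 11 is missing), so every
        # distance is NaN.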
        gts = [[(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2), (3, 4), (2, 2),
                (2, 4), (4, 4), (-1, -1)],
               [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2), (3, 4), (2, 2),
                (2, 4), (4, 4), (-1, -1)]]
        samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
        dps = [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20],
               [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]]
        variations = Variations()
        variations.samples = np.array(samples)
        variations[GT_FIELD] = np.array(gts)
        variations[DP_FIELD] = np.array(dps)

        dists = calc_dset_pop_distance(variations,
                                       max_alleles=5,
                                       silence_runtime_warnings=True,
                                       populations=pops,
                                       min_num_genotypes=0)
        assert np.allclose(dists, [0.65490196])

        dists = calc_dset_pop_distance(variations,
                                       max_alleles=5,
                                       silence_runtime_warnings=True,
                                       populations=pops,
                                       min_num_genotypes=6)
        assert np.all(np.isnan(dists))
Example #8
    def test_calc_obs_het(self):
        variations = Variations(samples=da.array(['a', 'b', 'c', 'd']))
        gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                        [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
        variations[GT_FIELD] = da.from_array(gts)
        variations[DP_FIELD] = da.from_array(dps)
        # with this step we create a variation with dask arrays of unknown shapes
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        het = calc_obs_het(variations, min_num_genotypes=0)
        self.assertTrue(np.allclose(het.compute(), [0.5, 0]))

        #         het = calc_obs_het(variations, min_num_genotypes=10)
        #         assert np.allclose(het, [np.NaN, np.NaN], equal_nan=True)

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=10)
        self.assertTrue(np.allclose(het.compute(), [1, 0]))
        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           max_call_dp_for_het_call=11)
        self.assertTrue(np.allclose(het.compute(), [0, 0]))

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=5)
        self.assertTrue(np.allclose(het.compute(), [0.5, 0]))
Example #9
def load_hdf5(path, fields=None):
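    # Load an HDF5 variations store as dask arrays, keeping only the requested
    # fields (every field when `fields` is empty) and re-chunking along the
    # variations axis.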
    if fields is None:
        fields = []
    store = h5py.File(str(path), mode='r')
    samples = store['samples']
    variations = Variations(
        samples=da.from_array(samples, chunks=samples.shape))
    metadata = {}
    for group_name, group in (store.items()):
        if isinstance(group, Group):
            for array_name, dataset in group.items():
                path = f'{group_name}/{array_name}'
                path = ZARR_VARIATION_FIELD_MAPPING[path]
                if fields and path not in fields:
                    continue
                if dataset.attrs:
                    metadata[path] = dict(dataset.attrs.items())

                chunks = list(dataset.shape)
                chunks[0] = DEF_CHUNK_SIZE
                chunks = tuple(chunks)

                variations[path] = da.from_array(dataset, chunks=chunks)

    variations.metadata = metadata
    return variations
Example #10
def _filter_samples(variations, desired_samples, reverse=False):
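    # Keep (or, with reverse=True, drop) the requested samples by slicing the
    # sample axis of every array in the call group; the remaining arrays are
    # passed through unchanged.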
    desired_samples = _str_list_to_byte_list(desired_samples)

    orig_sample_names = va.make_sure_array_is_in_memory(variations.samples)

    if reverse:
        desired_samples = [
            sample for sample in orig_sample_names
            if sample not in desired_samples
        ]

    orig_sample_names = list(orig_sample_names)
    sample_cols = np.array(
        [orig_sample_names.index(sample) for sample in desired_samples])
    sample_cols = list(sample_cols)

    new_variations = Variations(samples=np.array(desired_samples),
                                metadata=variations.metadata)
    for field, array in variations._arrays.items():
        if PUBLIC_CALL_GROUP in field:
            with dask.config.set(
                    **{'array.slicing.split_large_chunks': False}):
                array = array[:, sample_cols]
        new_variations[field] = array
    return {FLT_VARS: new_variations}
Example #11
    def test_calc_obs_het_in_memory(self):
        variations = Variations(samples=np.array(['a', 'b', 'c', 'd']))
        gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                        [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
        variations[GT_FIELD] = gts
        variations[DP_FIELD] = dps

        het = calc_obs_het(variations, min_num_genotypes=0)
        self.assertTrue(np.allclose(het, [0.5, 0]))

        #         het = calc_obs_het(variations, min_num_genotypes=10)
        #         assert np.allclose(het, [np.NaN, np.NaN], equal_nan=True)

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=10)
        self.assertTrue(np.allclose(het, [1, 0]))
        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           max_call_dp_for_het_call=11)
        self.assertTrue(np.allclose(het, [0, 0]))

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=5)
        self.assertTrue(np.allclose(het, [0.5, 0]))
Example #12
    def test_kosman_2_indis_in_memory(self):
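        # In-memory counterpart of the dask-based Kosman test: the same
        # genotype pairs must give distances of 1/3, 0 and 0.45.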
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()

        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        distance_ab = _kosman(vars1, vars2)

        va.make_sure_array_is_in_memory(distance_ab)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 1 / 3

        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        distance_ab = _kosman(vars1, vars2)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0

        variations = Variations()
        gts = np.stack((b, d), axis=1)
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        distance_ab = _kosman(vars1, vars2)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0.45
Example #13
    def test_calc_mac2(self):
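        # calc_mac on haploid genotypes: the expected values follow the major
        # allele count, and the all-missing variant is NaN because
        # min_num_genotypes=1 is not reached.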
        gts = np.array([[[0], [0], [0], [0]], [[0], [0], [1], [1]],
                        [[0], [0], [0], [1]], [[-1], [-1], [-1], [-1]]])
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations = Variations(samples=da.array(samples))
        variations[GT_FIELD] = da.from_array(gts)

        result = calc_mac(variations, max_alleles=3, min_num_genotypes=1)
        macs = compute(result)
        assert np.allclose(macs, np.array([4, 2, 3, np.NaN]), equal_nan=True)
Example #14
    def _create_fake_variations_and_regions(self):
        variations = Variations(samples=da.array(['aa', 'bb']))
        poss = np.array(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
        chroms = np.array([
            'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1',
            'chr1', 'chr1', 'chr2', 'chr2', 'chr2', 'chr2', 'chr2', 'chr2',
            'chr2', 'chr2', 'chr2', 'chr2'
        ])
        variations[CHROM_FIELD] = da.from_array(chroms)
        variations[POS_FIELD] = da.from_array(poss)
        regions = [('chr1', 4, 6), ('chr2', )]
        return variations, regions
Example #15
    def test_calc_mac_in_memory(self):
        variations = Variations(samples=np.array(['aa', 'bb']))

        gts = np.array([[[0, 0], [0, 0]], [[0, 2], [1, -1]], [[0, 0], [1, 1]],
                        [[-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts
        macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
        expected = [2, 1, 1, math.nan]
        for a, b in zip(macs, expected):
            if math.isnan(a):
                self.assertTrue(math.isnan(b))
                continue
            self.assertAlmostEqual(a, b, places=2)
Example #16
    def test_unavailable_shape(self):
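        # num_variations is available for in-memory arrays, but an empty
        # dask-backed Variations raises NotMaterializedError, presumably
        # because its shape is not known until it is computed.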
        variations = Variations()
        variations.samples = ['1', '2', '3']
        gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
        variations[GT_FIELD] = gts
        assert variations.num_variations == 3

        variations = _create_empty_dask_variations()
        try:
            variations.num_variations
            self.fail('NotMaterializedError expected')
        except NotMaterializedError:
            pass
Example #17
    def test_non_variable_filter(self):
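        # keep_variable_variations discards monomorphic and all-missing
        # variants, so two of the four variants survive the filter.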
        variations = Variations(samples=da.array(['aa', 'bb']))

        gts = np.array([[[0, 0], [0, 0]], [[0, 2], [1, -1]], [[0, 0], [1, 1]],
                        [[-1, -1], [-1, -1]]])
        variations[GT_FIELD] = da.from_array(gts)

        task = keep_variable_variations(variations, max_alleles=3)

        result = compute(task, store_variation_to_memory=True)

        filtered_vars = result[FLT_VARS]
        self.assertEqual(filtered_vars.num_variations, 2)
        self.assertEqual(result[FLT_STATS], {'n_kept': 2, 'n_filtered_out': 2})
Example #18
    def test_calc_maf_by_allele_count_in_memory(self):
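        # MAF estimated from the RO/AO read counts rather than from the
        # genotypes themselves.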
        variations = Variations(samples=np.array(['aa', 'bb']))
        variations[GT_FIELD] = np.array([[[-1, 1], [2, 1]],
                                         [[-1, -1], [-1, 2]],
                                         [[1, -1], [1, 1]]])
        variations[RO_FIELD] = np.array([[-1, 8], [-1, -1], [6, 4]])
        variations[AO_FIELD] = np.array([[[1, 4], [2, 1]], [[-1, -1], [3, 3]],
                                         [[1, 4], [5, 1]]])

        result = calc_maf_by_allele_count(variations, min_num_genotypes=0)

        expected = [0.5, 0.5, 0.47619048]
        for a, b in zip(result, expected):
            self.assertAlmostEqual(a, b, places=2)
Example #19
    def xtest_do_pca(self):
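        # The xtest_ prefix keeps unittest from collecting this test; it runs
        # a PCA on real data from test.zarr and then on a small fake dataset
        # where samples 'a' and 'b' are identical and 'c' differs.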
        variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        do_pca(variations)

        gts = np.array([[[0, 0], [0, 0], [1, 1]], [[0, 0], [0, 0], [1, 1]],
                        [[0, 0], [0, 0], [1, 1]], [[0, 0], [0, 0], [1, 1]]])
        variations = Variations()
        variations.samples = da.from_array(np.array(['a', 'b', 'c']))
        variations[GT_FIELD] = da.from_array(gts)

        res = do_pca(variations)
        projs = res['projections']
        assert projs.shape[0] == gts.shape[1]
        assert np.allclose(projs[0], projs[1])
        assert not np.allclose(projs[0], projs[2])
Example #20
    def test_kosman_missing(self):
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)

        c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                      [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                      [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_cd = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)

        assert np.all(distance_ab == distance_cd)
Example #21
    def test_iterate_chunks(self):
        # in memory
        variations = Variations()
        variations.samples = ['1', '2', '3']
        gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
        variations[GT_FIELD] = gts
        for index, chunk in enumerate(variations.iterate_chunks(chunk_size=1)):
            assert np.all(chunk[GT_FIELD] == variations[GT_FIELD][index, :])
            assert np.all(chunk.samples == variations.samples)

        # in disk
        variations = load_zarr((TEST_DATA_DIR / 'test.zarr'),
                               num_vars_per_chunk=1)
        chunks = list(variations.iterate_chunks())
        self.assertEqual(len(chunks), 7)
Example #22
    def test_allele_freq_in_memory(self):
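        # Per-variant allele frequencies for up to three alleles; partially
        # missing calls contribute only their called alleles, hence
        # 4/6, 1/6, 1/6 for the last variant.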

        gts = np.array([[[0, 0], [1, 1], [0, -1], [-1, -1]],
                        [[0, -1], [0, 0], [0, -1], [-1, -1]],
                        [[0, 1], [0, 2], [0, 0], [-1, -1]]])
        samples = ['1', '2', '3', '4']
        variations = Variations(samples=np.array(samples))
        variations[GT_FIELD] = gts
        variations[ALT_FIELD] = np.zeros((3, 2))

        allele_freq = calc_allele_freq(variations,
                                       max_alleles=3,
                                       min_num_genotypes=0)
        expected = np.array([[0.6, 0.4, 0], [1, 0, 0], [4 / 6, 1 / 6, 1 / 6]])
        assert np.allclose(allele_freq, expected)
Example #23
    def test_do_pca_in_memory(self):
        variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        variations = compute({'vars': variations},
                             store_variation_to_memory=True)['vars']
        do_pca(variations)

        gts = np.array([[[0, 0], [0, 0], [1, 1]], [[0, 0], [0, 0], [1, 1]],
                        [[0, 0], [0, 0], [1, 1]], [[0, 0], [0, 0], [1, 1]]])
        variations = Variations()
        variations.samples = da.from_array(np.array(['a', 'b', 'c']))
        variations[GT_FIELD] = da.from_array(gts)

        res = do_pca(variations)
        projs = res['projections']
        assert projs.shape[0] == gts.shape[1]
        assert np.allclose(projs[0], projs[1])
        assert not np.allclose(projs[0], projs[2])
Example #24
    def test_kosman_pairwise_in_memory(self):
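        # Pairwise Kosman distances among the four individuals, one value per
        # pair (six pairs for four samples).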
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 2]])
        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        gts = np.stack((a, b, c, d), axis=0)
        gts = np.transpose(gts, axes=(1, 0, 2)).astype(np.int16)

        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts
        distances, samples = calc_kosman_dist(variations)
        expected = [0.33333333, 0.75, 0.75, 0.5, 0.5, 0.]
        assert np.allclose(distances, expected)
Example #25
    def test_filter_obs_het_in_mem(self):
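        # Filtering by observed heterozygosity on an in-memory Variations:
        # min/max_allowable_het bound the per-variant heterozygosity, and
        # min_num_genotypes=10 can never be met with five samples, so in that
        # case every variant is filtered out.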
        variations = Variations()
        gts = np.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                        [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations.samples = da.from_array([1, 2, 3, 4, 5])
        variations[GT_FIELD] = da.from_array(gts)
        variations = compute({'vars': variations},
                             store_variation_to_memory=True)['vars']

        filtered = filter_by_obs_heterocigosis(variations, min_num_genotypes=0)
        #         filtered = compute(task, store_variation_to_memory=True)
        assert np.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0

        filtered = filter_by_obs_heterocigosis(variations,
                                               min_allowable_het=0.2,
                                               min_num_genotypes=0)
        #         filtered = compute(task, store_variation_to_memory=True)
        assert np.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
        assert filtered[FLT_STATS][N_KEPT] == 3
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 1

        filtered = filter_by_obs_heterocigosis(variations,
                                               min_allowable_het=0.2,
                                               min_num_genotypes=10)
        #         filtered = compute(task, store_variation_to_memory=True,
        #                            silence_runtime_warnings=True)
        assert filtered[FLT_STATS][N_KEPT] == 0
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 4

        filtered = filter_by_obs_heterocigosis(variations,
                                               max_allowable_het=0.1,
                                               min_num_genotypes=0)
        #         filtered = compute(task, store_variation_to_memory=True)
        assert np.all(filtered[FLT_VARS][GT_FIELD] == gts[[1]])

        filtered = filter_by_obs_heterocigosis(variations,
                                               min_allowable_het=0.2,
                                               max_allowable_het=0.3,
                                               min_num_genotypes=0)
        #         filtered = compute(task, store_variation_to_memory=True)
        assert np.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
Example #26
    def test_calc_mac(self):
        variations = Variations(samples=da.array(['aa', 'bb']))

        gts = np.array([[[0, 0], [0, 0]], [[0, 2], [1, -1]], [[0, 0], [1, 1]],
                        [[-1, -1], [-1, -1]]])
        variations[GT_FIELD] = da.from_array(gts)
        # with this step we create a variation with dask arrays of unknown
        # shapes
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
        result = compute(macs)
        expected = [2, 1, 1, math.nan]
        for a, b in zip(result, expected):
            if math.isnan(a):
                self.assertTrue(math.isnan(b))
                continue
            self.assertAlmostEqual(a, b, places=2)
Example #27
def compute(data,
            store_variation_to_memory=False,
            silence_runtime_warnings=False):
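    # Compute every dask value found in `data` (a bare dask array or Delayed
    # is computed directly), writing the results back in place; a Variations
    # instance is materialized into memory only when store_variation_to_memory
    # is True, otherwise it is dropped from the result.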
    if isinstance(data, (Delayed, da.Array)):
        with warnings.catch_warnings():
            if silence_runtime_warnings:
                warnings.filterwarnings("ignore", category=RuntimeWarning)
            return data.compute()

    res = _collect_cargo_to_compute(
        data, store_variation_to_memory=store_variation_to_memory)
    darrays_to_compute, orig_keys, orig_dicts, variation_info = res

    in_memory_variations = None

    with warnings.catch_warnings():
        if silence_runtime_warnings:
            warnings.filterwarnings("ignore", category=RuntimeWarning)
        computed_darrays = dask.compute(*darrays_to_compute)

    for idx, computed_darray in enumerate(computed_darrays):
        key = orig_keys[idx]
        dict_in_which_the_result_was_stored = orig_dicts[idx]
        if (isinstance(dict_in_which_the_result_was_stored, Variations)
                and store_variation_to_memory):
            if in_memory_variations is None:
                in_memory_variations = Variations(
                    metadata=variation_info['metadata'])
            if key == 'samples':
                in_memory_variations.samples = computed_darray
            else:
                in_memory_variations[key] = computed_darray
        else:
            dict_in_which_the_result_was_stored[key] = computed_darray

    if variation_info['key']:
        if store_variation_to_memory:
            data[variation_info['key']] = in_memory_variations
        else:
            del data[variation_info['key']]

    return data
Example #28
    def test_calc_maf_by_allele_count(self):
        variations = Variations(samples=da.array(['aa', 'bb']))
        variations[GT_FIELD] = da.from_array([[[-1, 1], [2, 1]],
                                              [[-1, -1], [-1, 2]],
                                              [[1, -1], [1, 1]]])
        variations[RO_FIELD] = da.from_array(
            np.array([[-1, 8], [-1, -1], [6, 4]]))
        variations[AO_FIELD] = da.from_array(
            np.array([[[1, 4], [2, 1]], [[-1, -1], [3, 3]], [[1, 4], [5, 1]]]))
        # with this step we create a variation with dask arrays of unknown
        # shapes
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        future_result = calc_maf_by_allele_count(variations,
                                                 min_num_genotypes=0)
        result = compute(future_result)

        expected = [0.5, 0.5, 0.47619048]
        for a, b in zip(result, expected):
            self.assertAlmostEqual(a, b, places=2)
Example #29
def load_zarr(path, num_vars_per_chunk=DEFAULT_VARIATION_NUM_IN_CHUNK):
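    # Open a zarr group read-only and wrap every known field in a dask array
    # chunked num_vars_per_chunk variations at a time; zarr paths without a
    # mapping are skipped.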
    z_object = zarr.open_group(str(path), mode='r')
    variations = Variations(samples=da.from_zarr(z_object.samples))
    metadata = {}
    for group_name, group in (z_object.groups()):
        for array_name, array in group.arrays():
            zarr_field = f'{group_name}/{array_name}'
            try:
                field = ZARR_VARIATION_FIELD_MAPPING[zarr_field]
            except KeyError:
                continue
            if array.attrs:
                metadata[field] = dict(array.attrs.items())

            chunks = (num_vars_per_chunk,) + array.shape[1:]
            # chunks = None
            variations[field] = da.from_zarr(array, chunks=chunks)
    variations.metadata = metadata

    return variations
Example #30
def load_hdf5(path):
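    # Variant of load_hdf5 that loads every mapped field, chunking 600
    # variations at a time.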
    store = h5py.File(str(path), mode='r')
    samples = store['samples']
    variations = Variations(samples=da.from_array(samples,
                                                  chunks=samples.shape))
    metadata = {}
    for group_name, group in (store.items()):
        if isinstance(group, Group):
            for array_name, dataset in group.items():
                path = f'{group_name}/{array_name}'
                path = ZARR_VARIATION_FIELD_MAPPING[path]
                if dataset.attrs:
                    metadata[path] = dict(dataset.attrs.items())
                chunks = [600]
                if dataset.ndim > 1:
                    chunks.append(dataset.shape[1])
                if dataset.ndim > 2:
                    chunks.append(dataset.shape[2])
                variations[path] = da.from_array(dataset, chunks=tuple(chunks))

    variations.metadata = metadata
    return variations