def test_dest_jost_distance_in_memory(self):
    """Jost distance between two populations with in-memory arrays."""
    genotype_rows = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2), (3, 4), (2, 2),
         (2, 4), (4, 4), (-1, -1)],
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2), (3, 4), (2, 2),
         (2, 4), (4, 4), (-1, -1)],
    ]
    sample_names = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    pop_groups = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
    depth_rows = [[20] * 11, [20] * 11]

    variations = Variations()
    variations.samples = np.array(sample_names)
    variations[GT_FIELD] = np.array(genotype_rows)
    variations[DP_FIELD] = np.array(depth_rows)

    # With no minimum genotype requirement a finite distance is produced.
    dists = calc_dset_pop_distance(variations, max_alleles=5,
                                   silence_runtime_warnings=True,
                                   populations=pop_groups,
                                   min_num_genotypes=0)
    assert np.allclose(dists, [0.65490196])

    # Requiring six genotypes per population leaves no usable variations.
    dists = calc_dset_pop_distance(variations, max_alleles=5,
                                   silence_runtime_warnings=True,
                                   populations=pop_groups,
                                   min_num_genotypes=6)
    assert np.all(np.isnan(dists))
def load_hdf5(path, fields=None):
    """Load a Variations object backed by dask arrays from an HDF5 file.

    Parameters
    ----------
    path : path-like
        Location of the HDF5 file.
    fields : sequence of str, optional
        When given, only these variation fields are loaded.

    Returns
    -------
    Variations
        Variations whose arrays read lazily from the HDF5 datasets.
    """
    if fields is None:
        fields = []
    store = h5py.File(str(path), mode='r')
    samples = store['samples']
    variations = Variations(
        samples=da.from_array(samples, chunks=samples.shape))
    metadata = {}
    for group_name, group in store.items():
        if not isinstance(group, Group):
            continue
        for array_name, dataset in group.items():
            # Do not reuse the 'path' parameter name for the dataset path.
            hdf5_path = f'{group_name}/{array_name}'
            try:
                field = ZARR_VARIATION_FIELD_MAPPING[hdf5_path]
            except KeyError:
                # Unknown datasets are skipped, matching load_zarr.
                continue
            if fields and field not in fields:
                continue
            if dataset.attrs:
                metadata[field] = dict(dataset.attrs.items())
            # Chunk only along the variations (first) axis.
            chunks = list(dataset.shape)
            chunks[0] = DEF_CHUNK_SIZE
            variations[field] = da.from_array(dataset, chunks=tuple(chunks))
    variations.metadata = metadata
    return variations
def test_unavailable_shape(self):
    """num_variations works in memory and raises when not materialized."""
    # In-memory arrays expose the number of variations directly.
    variations = Variations()
    variations.samples = ['1', '2', '3']
    gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    variations[GT_FIELD] = gts
    assert variations.num_variations == 3

    # A dask-backed Variations without a materialized shape must raise.
    variations = _create_empty_dask_variations()
    with self.assertRaises(NotMaterializedError):
        variations.num_variations
def test_iterate_chunks(self):
    """Chunk iteration over in-memory and on-disk variations."""
    # In-memory case: one-variation chunks must match row slices.
    variations = Variations()
    variations.samples = ['1', '2', '3']
    gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    variations[GT_FIELD] = gts
    for row_idx, chunk in enumerate(
            variations.iterate_chunks(chunk_size=1)):
        assert np.all(chunk[GT_FIELD] == variations[GT_FIELD][row_idx, :])
        assert np.all(chunk.samples == variations.samples)

    # On-disk case: the zarr test file yields seven one-variation chunks.
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr',
                           num_vars_per_chunk=1)
    self.assertEqual(len(list(variations.iterate_chunks())), 7)
def xtest_do_pca(self):
    """PCA on dask-backed variations (disabled: note the 'x' prefix)."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    do_pca(variations)

    gts = np.array([[[0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [1, 1]]])
    variations = Variations()
    variations.samples = da.from_array(np.array(['a', 'b', 'c']))
    variations[GT_FIELD] = da.from_array(gts)
    result = do_pca(variations)
    projections = result['projections']
    # One projection per sample; identical samples project together.
    assert projections.shape[0] == gts.shape[1]
    assert np.allclose(projections[0], projections[1])
    assert not np.allclose(projections[0], projections[2])
def _filter_samples(variations, desired_samples, reverse=False):
    """Return a Variations restricted to (or purged of) the given samples.

    Parameters
    ----------
    variations : Variations
    desired_samples : sequence of str
        Sample names to keep (or to drop when ``reverse`` is True).
    reverse : bool
        When True, keep every sample NOT listed in ``desired_samples``.

    Returns
    -------
    dict
        ``{FLT_VARS: new_variations}`` with every call-level field sliced
        down to the selected sample columns.
    """
    desired_samples = _str_list_to_byte_list(desired_samples)
    orig_sample_names = va.make_sure_array_is_in_memory(variations.samples)
    if reverse:
        desired_samples = [sample for sample in orig_sample_names
                           if sample not in desired_samples]
    orig_sample_names = list(orig_sample_names)
    # Column indices of the selected samples, in the requested order.
    # (Plain list comprehension; the original built a numpy array only to
    # convert it straight back to a list.)
    sample_cols = [orig_sample_names.index(sample)
                   for sample in desired_samples]
    new_variations = Variations(samples=np.array(desired_samples),
                                metadata=variations.metadata)
    for field, array in variations._arrays.items():
        if PUBLIC_CALL_GROUP in field:
            # Column fancy-indexing can create large chunks; silence the
            # dask warning while slicing.
            with dask.config.set(
                    **{'array.slicing.split_large_chunks': False}):
                array = array[:, sample_cols]
        new_variations[field] = array
    return {FLT_VARS: new_variations}
def test_calc_obs_het(self):
    """Observed heterozygosity on dask arrays with depth thresholds."""
    variations = Variations(samples=da.array(['a', 'b', 'c', 'd']))
    gt_array = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                         [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dp_array = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
    variations[GT_FIELD] = da.from_array(gt_array)
    variations[DP_FIELD] = da.from_array(dp_array)
    # Filtering first leaves dask arrays with unknown chunk shapes.
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    observed = calc_obs_het(variations, min_num_genotypes=0)
    self.assertTrue(np.allclose(observed.compute(), [0.5, 0]))

    # Depth thresholds decide which calls count as heterozygous.
    observed = calc_obs_het(variations, min_num_genotypes=0,
                            min_call_dp_for_het_call=10)
    self.assertTrue(np.allclose(observed.compute(), [1, 0]))

    observed = calc_obs_het(variations, min_num_genotypes=0,
                            max_call_dp_for_het_call=11)
    self.assertTrue(np.allclose(observed.compute(), [0, 0]))

    observed = calc_obs_het(variations, min_num_genotypes=0,
                            min_call_dp_for_het_call=5)
    self.assertTrue(np.allclose(observed.compute(), [0.5, 0]))
def test_calc_obs_het_in_memory(self):
    """Observed heterozygosity on plain numpy arrays."""
    variations = Variations(samples=np.array(['a', 'b', 'c', 'd']))
    gt_array = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                         [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dp_array = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
    variations[GT_FIELD] = gt_array
    variations[DP_FIELD] = dp_array

    observed = calc_obs_het(variations, min_num_genotypes=0)
    self.assertTrue(np.allclose(observed, [0.5, 0]))

    # Depth thresholds decide which calls count as heterozygous.
    observed = calc_obs_het(variations, min_num_genotypes=0,
                            min_call_dp_for_het_call=10)
    self.assertTrue(np.allclose(observed, [1, 0]))

    observed = calc_obs_het(variations, min_num_genotypes=0,
                            max_call_dp_for_het_call=11)
    self.assertTrue(np.allclose(observed, [0, 0]))

    observed = calc_obs_het(variations, min_num_genotypes=0,
                            min_call_dp_for_het_call=5)
    self.assertTrue(np.allclose(observed, [0.5, 0]))
def test_calc_obs_het2(self):
    """Observed heterozygosity through the lazy compute() helper."""
    gt_array = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                         [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dp_array = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
    sample_names = np.array([str(i) for i in range(gt_array.shape[1])])
    variations = Variations(samples=da.array(sample_names))
    variations[GT_FIELD] = da.from_array(gt_array)
    variations[DP_FIELD] = da.from_array(dp_array)

    observed = compute(calc_obs_het(variations, min_num_genotypes=0))
    assert np.allclose(observed, [0.5, 0])

    # Too few genotypes: the result is NaN for every variation.
    observed = compute(calc_obs_het(variations, min_num_genotypes=10))
    assert np.allclose(observed, [np.NaN, np.NaN], equal_nan=True)

    # Depth thresholds decide which calls count as heterozygous.
    observed = compute(calc_obs_het(variations, min_num_genotypes=0,
                                    min_call_dp_for_het_call=10))
    assert np.allclose(observed, [1, 0])

    observed = compute(calc_obs_het(variations, min_num_genotypes=0,
                                    max_call_dp_for_het_call=11))
    assert np.allclose(observed, [0, 0])

    observed = compute(calc_obs_het(variations, min_num_genotypes=0,
                                    min_call_dp_for_het_call=5))
    assert np.allclose(observed, [0.5, 0])
def test_do_pca_in_memory(self):
    """PCA after materializing zarr-backed variations to memory."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    do_pca(variations)

    gts = np.array([[[0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [1, 1]]])
    variations = Variations()
    variations.samples = da.from_array(np.array(['a', 'b', 'c']))
    variations[GT_FIELD] = da.from_array(gts)
    result = do_pca(variations)
    projections = result['projections']
    # One projection per sample; identical samples project together.
    assert projections.shape[0] == gts.shape[1]
    assert np.allclose(projections[0], projections[1])
    assert not np.allclose(projections[0], projections[2])
def test_kosman_pairwise_in_memory(self):
    """Pairwise Kosman distances for four individuals, in memory."""
    indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 2]])
    indi_c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    indi_d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    gts = np.stack((indi_a, indi_b, indi_c, indi_d), axis=0)
    # Reorder to (variations, samples, ploidy).
    gts = np.transpose(gts, axes=(1, 0, 2)).astype(np.int16)

    variations = Variations()
    variations.samples = np.array([str(i) for i in range(gts.shape[1])])
    variations[GT_FIELD] = gts

    distances, samples = calc_kosman_dist(variations)
    # Condensed pair order: ab, ac, ad, bc, bd, cd.
    expected = [0.33333333, 0.75, 0.75, 0.5, 0.5, 0.]
    assert np.allclose(distances, expected)
def test_filter_obs_het_in_mem(self):
    """Filtering variations by observed heterozygosity, in memory."""
    variations = Variations()
    gts = np.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                    [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
    variations.samples = da.from_array([1, 2, 3, 4, 5])
    variations[GT_FIELD] = da.from_array(gts)
    # Materialize so the filter works on in-memory arrays.
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']

    # No heterozygosity limits: everything is kept.
    result = filter_by_obs_heterocigosis(variations, min_num_genotypes=0)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts)
    assert result[FLT_STATS][N_KEPT] == 4
    assert result[FLT_STATS][N_FILTERED_OUT] == 0

    # A lower bound drops the variation with no heterozygous calls.
    result = filter_by_obs_heterocigosis(variations,
                                         min_allowable_het=0.2,
                                         min_num_genotypes=0)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
    assert result[FLT_STATS][N_KEPT] == 3
    assert result[FLT_STATS][N_FILTERED_OUT] == 1

    # An unreachable genotype-count requirement filters everything out.
    result = filter_by_obs_heterocigosis(variations,
                                         min_allowable_het=0.2,
                                         min_num_genotypes=10)
    assert result[FLT_STATS][N_KEPT] == 0
    assert result[FLT_STATS][N_FILTERED_OUT] == 4

    # An upper bound keeps only the fully homozygous variation.
    result = filter_by_obs_heterocigosis(variations,
                                         max_allowable_het=0.1,
                                         min_num_genotypes=0)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[1]])

    # Both bounds together.
    result = filter_by_obs_heterocigosis(variations,
                                         min_allowable_het=0.2,
                                         max_allowable_het=0.3,
                                         min_num_genotypes=0)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
def test_nei_dist_in_memory(self):
    """Unbiased Nei distance between two populations, in memory."""
    gts = np.array([[[1, 1], [5, 2], [2, 2], [3, 2]],
                    [[1, 1], [1, 2], [2, 2], [2, 1]],
                    [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations = Variations()
    variations.samples = np.array([1, 2, 3, 4])
    variations[GT_FIELD] = gts
    pop_groups = [[1, 2], [3, 4]]
    dists = calc_pop_pairwise_unbiased_nei_dists(
        variations, max_alleles=6, populations=pop_groups,
        silence_runtime_warnings=True, min_num_genotypes=1)
    assert math.isclose(dists[0], 0.3726315908494797)

    # With every genotype missing the distance is undefined.
    gts = np.array([[[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations = Variations()
    variations.samples = np.array([1, 2, 3, 4])
    variations[GT_FIELD] = gts
    pop_groups = [[1, 2], [3, 4]]
    dists = calc_pop_pairwise_unbiased_nei_dists(
        variations, max_alleles=1, populations=pop_groups,
        silence_runtime_warnings=True, min_num_genotypes=1)
    assert math.isnan(dists[0])

    # Same data as the first case, exercising min_num_genotypes.
    gts = np.array([[[1, 1], [5, 2], [2, 2], [3, 2]],
                    [[1, 1], [1, 2], [2, 2], [2, 1]],
                    [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations = Variations()
    variations.samples = np.array([1, 2, 3, 4])
    variations[GT_FIELD] = gts
    pop_groups = [[1, 2], [3, 4]]
    dists = calc_pop_pairwise_unbiased_nei_dists(
        variations, max_alleles=6, populations=pop_groups,
        silence_runtime_warnings=True, min_num_genotypes=1)
    assert math.isclose(dists[0], 0.3726315908494797)
def test_kosman_2_indis(self):
    """Kosman distance between two individuals on dask arrays."""
    def pairwise_kosman_distance(gts):
        # Build a two-sample Variations and compare sample '0' vs '1'.
        variations = Variations()
        sample_names = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(sample_names)
        variations[GT_FIELD] = da.from_array(gts)
        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_comparisons = _kosman(vars1, vars2)
        snp_comparisons = compute(snp_comparisons,
                                  silence_runtime_warnings=True)
        return snp_comparisons.sum() / snp_comparisons.shape[0]

    indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    assert pairwise_kosman_distance(
        np.stack((indi_a, indi_b), axis=1)) == 1 / 3

    # Two identical all-homozygous individuals are at distance zero.
    indi_c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    indi_d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    assert pairwise_kosman_distance(
        np.stack((indi_c, indi_d), axis=1)) == 0

    assert pairwise_kosman_distance(
        np.stack((indi_b, indi_d), axis=1)) == 0.45
def compute(data, store_variation_to_memory=False, silence_runtime_warnings=False):
    """Materialize the dask-backed values held in ``data``.

    ``data`` may be a bare Delayed/dask array (it is simply computed and
    returned) or a dict that can contain dask arrays and, optionally, a
    Variations object whose fields are dask arrays.

    Parameters
    ----------
    data : Delayed, dask.array.Array or dict
    store_variation_to_memory : bool
        When True, the Variations found in ``data`` is replaced by a new
        in-memory Variations holding the computed arrays; when False, the
        Variations entry is removed from ``data`` after computing.
    silence_runtime_warnings : bool
        Suppress RuntimeWarning while the dask graphs execute.

    Returns
    -------
    The computed value (for a bare Delayed/array) or ``data`` mutated in
    place with every dask value replaced by its computed counterpart.
    """
    # Trivial case: a single lazy object, not a cargo dict.
    if isinstance(data, (Delayed, da.Array)):
        with warnings.catch_warnings():
            if silence_runtime_warnings:
                warnings.filterwarnings("ignore", category=RuntimeWarning)
            return data.compute()

    # Collect every dask array in the cargo, remembering where each one
    # came from so results can be written back to the right place.
    res = _collect_cargo_to_compute(
        data, store_variation_to_memory=store_variation_to_memory)
    darrays_to_compute, orig_keys, orig_dicts, variation_info = res
    in_memory_variations = None

    # Compute all graphs in a single dask.compute call so shared
    # intermediate results are evaluated only once.
    with warnings.catch_warnings():
        if silence_runtime_warnings:
            warnings.filterwarnings("ignore", category=RuntimeWarning)
        computed_darrays = dask.compute(*darrays_to_compute)

    # Write each computed array back to its original container.
    for idx, computed_darray in enumerate(computed_darrays):
        key = orig_keys[idx]
        dict_in_which_the_result_was_stored = orig_dicts[idx]
        if (isinstance(dict_in_which_the_result_was_stored, Variations)
                and store_variation_to_memory):
            # Lazily create the in-memory Variations on first use.
            if in_memory_variations is None:
                in_memory_variations = Variations(
                    metadata=variation_info['metadata'])
            if key == 'samples':
                in_memory_variations.samples = computed_darray
            else:
                in_memory_variations[key] = computed_darray
        else:
            dict_in_which_the_result_was_stored[key] = computed_darray

    # Swap in the materialized Variations, or drop the lazy one.
    if variation_info['key']:
        if store_variation_to_memory:
            data[variation_info['key']] = in_memory_variations
        else:
            del data[variation_info['key']]
    return data
def load_zarr(path, num_vars_per_chunk=DEFAULT_VARIATION_NUM_IN_CHUNK):
    """Load a Variations object lazily from a zarr group on disk.

    Parameters
    ----------
    path : path-like
        Location of the zarr group.
    num_vars_per_chunk : int
        Chunk length along the variations (first) axis of every array.

    Returns
    -------
    Variations
        Variations whose arrays read lazily from the zarr store.
    """
    z_object = zarr.open_group(str(path), mode='r')
    variations = Variations(samples=da.from_zarr(z_object.samples))
    metadata = {}
    for group_name, group in (z_object.groups()):
        for array_name, array in group.arrays():
            zarr_field = f'{group_name}/{array_name}'
            try:
                field = ZARR_VARIATION_FIELD_MAPPING[zarr_field]
            except KeyError:
                # Arrays with no known field mapping are skipped.
                continue
            if array.attrs:
                metadata[field] = dict(array.attrs.items())
            # Chunk only along the variations axis; keep trailing dims whole.
            chunks = (num_vars_per_chunk,) + array.shape[1:]
            # chunks = None
            variations[field] = da.from_zarr(array, chunks=chunks)
    variations.metadata = metadata
    return variations
def test_calc_mac2(self):
    """Major allele counts with a minimum genotype requirement."""
    gts = np.array([[[0], [0], [0], [0]],
                    [[0], [0], [1], [1]],
                    [[0], [0], [0], [1]],
                    [[-1], [-1], [-1], [-1]]])
    sample_names = np.array([str(i) for i in range(gts.shape[1])])
    variations = Variations(samples=da.array(sample_names))
    variations[GT_FIELD] = da.from_array(gts)

    lazy_macs = calc_mac(variations, max_alleles=3, min_num_genotypes=1)
    macs = compute(lazy_macs)
    # The all-missing variation yields NaN.
    assert np.allclose(macs, np.array([4, 2, 3, np.NaN]), equal_nan=True)
def test_kosman_2_indis_in_memory(self):
    """Kosman distance between two individuals on numpy arrays."""
    def snp_comparisons(gts):
        # Build a two-sample Variations and compare sample '0' vs '1'.
        variations = Variations()
        variations.samples = np.array(
            [str(i) for i in range(gts.shape[1])])
        variations[GT_FIELD] = gts
        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        return _kosman(vars1, vars2)

    indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distances = snp_comparisons(np.stack((indi_a, indi_b), axis=1))
    va.make_sure_array_is_in_memory(distances)
    assert distances.sum() / distances.shape[0] == 1 / 3

    # Two identical all-homozygous individuals are at distance zero.
    indi_c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    indi_d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    distances = snp_comparisons(np.stack((indi_c, indi_d), axis=1))
    assert distances.sum() / distances.shape[0] == 0

    distances = snp_comparisons(np.stack((indi_b, indi_d), axis=1))
    assert distances.sum() / distances.shape[0] == 0.45
def test_calc_missing_memory(self):
    """Missing genotype counts on in-memory arrays."""
    variations = Variations()
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 0], [-1, -1]],
                    [[0, 0], [-1, -1]],
                    [[-1, -1], [-1, -1]]])
    variations.samples = np.array([str(i) for i in range(gts.shape[1])])
    variations[GT_FIELD] = gts
    counts = calc_missing_gt(variations, rates=False)
    # Two samples per variation: missing = 2 minus the called ones.
    called = np.array([2, 1, 1, 0])
    assert np.all(counts == 2 - called)

    gts = np.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, -1]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
                    [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                    [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations = Variations()
    variations.samples = np.array([str(i) for i in range(gts.shape[1])])
    variations[GT_FIELD] = gts
    counts = calc_missing_gt(variations, rates=False)
    # Half-missing calls count as 0.5.
    expected = np.array([0.5, 1, 2, 4])
    assert np.all(counts == expected)
def test_kosman_missing_in_memory(self):
    """Missing calls are ignored by the Kosman comparison."""
    def snp_comparisons(gts):
        # Build a two-sample Variations and compare sample '0' vs '1'.
        variations = Variations()
        variations.samples = np.array(
            [str(i) for i in range(gts.shape[1])])
        variations[GT_FIELD] = gts
        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        return _kosman(vars1, vars2)

    indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distances_ab = snp_comparisons(np.stack((indi_a, indi_b), axis=1))

    # Same genotypes but with the first two calls missing in both
    # individuals; as missing positions are skipped, the per-SNP
    # comparisons must be unchanged.
    indi_c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distances_cd = snp_comparisons(np.stack((indi_c, indi_d), axis=1))
    assert np.all(distances_ab == distances_cd)
def test_empty_pop(self):
    """Jost distance when one population is partially or fully missing."""
    missing = (-1, -1)
    # First population (samples 1-5) has calls only in the first two
    # variations; the third variation is missing for it.
    gts = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2), (3, 4), (2, 2),
         (2, 4), (4, 4), (-1, -1)],
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2), (3, 4), (2, 2),
         (2, 4), (4, 4), (-1, -1)],
        [
            missing, missing, missing, missing, missing, (3, 2), (3, 4),
            (2, 2), (2, 4), (4, 4), (-1, -1)
        ],
    ]
    dps = [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0],
           [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0],
           [0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0]]
    samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    # Population 1: samples 1-5; population 2: samples 6-11.
    pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
    variations = Variations()
    variations.samples = da.from_array(np.array(samples))
    variations[GT_FIELD] = da.from_array(np.array(gts))
    variations[DP_FIELD] = da.from_array(np.array(dps))
    dists = calc_dset_pop_distance(variations, max_alleles=5,
                                   silence_runtime_warnings=True,
                                   populations=pops,
                                   min_num_genotypes=0)
    # A partially-covered population still yields a finite distance.
    assert np.allclose(dists, [0.65490196])

    # Now the first population has no calls at any variation.
    gts = [
        [
            missing, missing, missing, missing, missing, (3, 2), (3, 4),
            (2, 2), (2, 4), (4, 4), (-1, -1)
        ],
        [
            missing, missing, missing, missing, missing, (3, 2), (3, 4),
            (2, 2), (2, 4), (4, 4), (-1, -1)
        ],
        [
            missing, missing, missing, missing, missing, (3, 2), (3, 4),
            (2, 2), (2, 4), (4, 4), (-1, -1)
        ],
    ]
    dps = [[0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0],
           [0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0],
           [0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 0]]
    samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
    variations = Variations()
    variations.samples = da.from_array(np.array(samples))
    variations[GT_FIELD] = da.from_array(np.array(gts))
    variations[DP_FIELD] = da.from_array(np.array(dps))
    dists = calc_dset_pop_distance(variations, max_alleles=5,
                                   silence_runtime_warnings=True,
                                   populations=pops,
                                   min_num_genotypes=0)
    # An entirely empty population makes the distance undefined.
    assert np.isnan(dists[0])
def load_hdf5(path, num_vars_per_chunk=600):
    """Load a Variations object backed by dask arrays from an HDF5 file.

    Parameters
    ----------
    path : path-like
        Location of the HDF5 file.
    num_vars_per_chunk : int
        Chunk length along the variations (first) axis; defaults to the
        previous hard-coded value of 600.

    Returns
    -------
    Variations
        Variations whose arrays read lazily from the HDF5 datasets.
    """
    store = h5py.File(str(path), mode='r')
    samples = store['samples']
    variations = Variations(
        samples=da.from_array(samples, chunks=samples.shape))
    metadata = {}
    for group_name, group in store.items():
        if not isinstance(group, Group):
            continue
        for array_name, dataset in group.items():
            hdf5_path = f'{group_name}/{array_name}'
            try:
                field = ZARR_VARIATION_FIELD_MAPPING[hdf5_path]
            except KeyError:
                # Unknown datasets are skipped, matching load_zarr.
                continue
            if dataset.attrs:
                metadata[field] = dict(dataset.attrs.items())
            # Chunk only along the variations axis; keep every trailing
            # dimension whole (works for any rank, not just up to 3).
            chunks = (num_vars_per_chunk,) + dataset.shape[1:]
            variations[field] = da.from_array(dataset, chunks=chunks)
    variations.metadata = metadata
    return variations
def test_calc_mac_in_memory(self):
    """Major allele counts on plain numpy arrays."""
    variations = Variations(samples=np.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = gts

    macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
    expected = [2, 1, 1, math.nan]
    for observed, wanted in zip(macs, expected):
        if math.isnan(observed):
            # The all-missing variation must be NaN on both sides.
            self.assertTrue(math.isnan(wanted))
            continue
        self.assertAlmostEqual(observed, wanted, places=2)
def _create_fake_variations_and_regions(self):
    """Build 20 fake positions (ten on chr1, ten on chr2) plus regions."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    positions = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
    chroms = np.array(['chr1'] * 10 + ['chr2'] * 10)
    variations[CHROM_FIELD] = da.from_array(chroms)
    variations[POS_FIELD] = da.from_array(positions)
    # One bounded region inside chr1 plus the whole of chr2.
    regions = [('chr1', 4, 6), ('chr2', )]
    return variations, regions
def test_non_variable_filter(self):
    """keep_variable_variations drops invariant and all-missing rows."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = da.from_array(gts)

    task = keep_variable_variations(variations, max_alleles=3)
    result = compute(task, store_variation_to_memory=True)
    # Rows 0 (invariant) and 3 (all missing) are filtered out.
    self.assertEqual(result[FLT_VARS].num_variations, 2)
    self.assertEqual(result[FLT_STATS], {'n_kept': 2, 'n_filtered_out': 2})
def test_calc_maf_by_allele_count_in_memory(self):
    """MAF from reference/alternative observation counts, in memory."""
    variations = Variations(samples=np.array(['aa', 'bb']))
    variations[GT_FIELD] = np.array([[[-1, 1], [2, 1]],
                                     [[-1, -1], [-1, 2]],
                                     [[1, -1], [1, 1]]])
    variations[RO_FIELD] = np.array([[-1, 8], [-1, -1], [6, 4]])
    variations[AO_FIELD] = np.array([[[1, 4], [2, 1]],
                                     [[-1, -1], [3, 3]],
                                     [[1, 4], [5, 1]]])

    mafs = calc_maf_by_allele_count(variations, min_num_genotypes=0)
    expected = [0.5, 0.5, 0.47619048]
    for observed, wanted in zip(mafs, expected):
        self.assertAlmostEqual(observed, wanted, places=2)
def test_basic_operations(self):
    """Basic Variations construction, validation and round-tripping."""
    variations = Variations()
    self.assertEqual(variations.num_variations, 0)
    self.assertEqual(variations.num_samples, 0)

    gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    # Call data cannot be added before samples are known.
    with self.assertRaises(ValueError):
        variations[GT_FIELD] = gts

    variations.samples = ['1', '2', '3']
    self.assertEqual(variations.num_samples, 3)

    # Samples may only be set once.
    with self.assertRaises(RuntimeError):
        variations.samples = ['1', '2', '3']

    # Per-variation (non-call) data.
    chroms = np.array(['chr1', 'chr2', 'chr3'])
    variations[CHROM_FIELD] = chroms

    # Matrices with the wrong row count or sample column count fail.
    with self.assertRaises(ValueError) as ctx:
        variations[GT_FIELD] = np.array([[1, 2, 3]])
    self.assertIn('Introduced matrix shape', str(ctx.exception))
    with self.assertRaises(ValueError) as ctx:
        variations[GT_FIELD] = np.array([[1, 2], [1, 2], [1, 2]])
    self.assertIn('not fit with num samples', str(ctx.exception))

    # A well-shaped genotype matrix is stored and read back unchanged.
    gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    variations[GT_FIELD] = gts
    self.assertTrue(np.array_equal(gts, variations[GT_FIELD]))
    self.assertEqual(variations.num_variations, 3)
def test_allele_freq_in_memory(self):
    """Per-variation allele frequencies on plain numpy arrays."""
    gts = np.array([[[0, 0], [1, 1], [0, -1], [-1, -1]],
                    [[0, -1], [0, 0], [0, -1], [-1, -1]],
                    [[0, 1], [0, 2], [0, 0], [-1, -1]]])
    variations = Variations(samples=np.array(['1', '2', '3', '4']))
    variations[GT_FIELD] = gts
    variations[ALT_FIELD] = np.zeros((3, 2))

    freqs = calc_allele_freq(variations, max_alleles=3,
                             min_num_genotypes=0)
    # Frequencies are computed over called alleles only.
    expected = np.array([[0.6, 0.4, 0],
                         [1, 0, 0],
                         [4 / 6, 1 / 6, 1 / 6]])
    assert np.allclose(freqs, expected)
def test_calc_mac(self):
    """Major allele counts on dask arrays with unknown chunk shapes."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = da.from_array(gts)
    # Filtering first leaves dask arrays with unknown chunk shapes.
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    lazy_macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
    macs = compute(lazy_macs)
    expected = [2, 1, 1, math.nan]
    for observed, wanted in zip(macs, expected):
        if math.isnan(observed):
            # The all-missing variation must be NaN on both sides.
            self.assertTrue(math.isnan(wanted))
            continue
        self.assertAlmostEqual(observed, wanted, places=2)
def test_calc_maf_by_allele_count(self):
    """MAF from observation counts on dask arrays."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    variations[GT_FIELD] = da.from_array([[[-1, 1], [2, 1]],
                                          [[-1, -1], [-1, 2]],
                                          [[1, -1], [1, 1]]])
    variations[RO_FIELD] = da.from_array(
        np.array([[-1, 8], [-1, -1], [6, 4]]))
    variations[AO_FIELD] = da.from_array(
        np.array([[[1, 4], [2, 1]], [[-1, -1], [3, 3]],
                  [[1, 4], [5, 1]]]))
    # Filtering first leaves dask arrays with unknown chunk shapes.
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    lazy_mafs = calc_maf_by_allele_count(variations, min_num_genotypes=0)
    mafs = compute(lazy_mafs)
    expected = [0.5, 0.5, 0.47619048]
    for observed, wanted in zip(mafs, expected):
        self.assertAlmostEqual(observed, wanted, places=2)