def test_divergence__missing_calls():
    """Divergence involving a cohort whose calls are all missing is NaN.

    A single variant is built in which the first sample has calls ``[0, 0]``
    and the other two samples have only ``-1`` calls. The last two samples
    are both assigned to cohort 1, so that cohort has no usable calls.
    """
    genotype_calls = [
        [[0, 0], [-1, -1], [-1, -1]],  # all of cohort 1 calls are missing
    ]
    ds = get_dataset(genotype_calls)

    cohort_of_sample = np.array([0, 1, 1])
    ds["sample_cohort"] = xr.DataArray(cohort_of_sample, dims="samples")

    ds = divergence(ds)

    # Whatever is selected by index [0, 1] must be NaN throughout.
    selected = ds["stat_divergence"].values[0, 1]
    np.testing.assert_equal(selected, np.nan)
def test_divergence__windowed(sample_size, n_cohorts, chunks):
    """Check windowed ``stat_divergence`` against the tree sequence's divergence.

    A tree sequence is simulated with a fixed seed, converted to a dataset,
    split into ``n_cohorts`` cohorts and windowed into blocks of a fixed number
    of variants. The off-diagonal (between-cohort) entries of the resulting
    ``stat_divergence`` array are compared with ``ts.divergence`` evaluated on
    equivalent genomic windows.

    Parameters
    ----------
    sample_size
        Number of samples passed to ``msprime.simulate``.
    n_cohorts
        Number of cohorts passed to ``add_cohorts``.
    chunks
        Chunking argument forwarded to ``ts_to_dataset``.
    """
    # Single source of truth for the window size, used both for the dataset
    # windows and for deriving the matching tskit window boundaries.
    window_size = 25

    ts = msprime.simulate(sample_size, length=200, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)
    ds = divergence(ds)
    div = ds["stat_divergence"].values
    # test off-diagonal entries, by replacing diagonal with NaNs
    # (index with n_cohorts rather than a literal 2 so every diagonal entry
    # is masked regardless of how many cohorts the test is run with)
    cohort_idx = np.arange(n_cohorts)
    div[:, cohort_idx, cohort_idx] = np.nan

    # Calculate divergence using tskit windows
    # Find the variant positions so we can have windows with a fixed number of variants
    positions = ts.tables.sites.position
    windows = np.concatenate(
        ([0], positions[::window_size][1:], [ts.sequence_length])
    )
    n_windows = len(windows) - 1
    ts_div = np.full([n_windows, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_div[:, i, j] = ts.divergence(
            [subsets[i], subsets[j]], windows=windows, span_normalise=False
        )
        # The expected matrix is filled symmetrically from the (i, j) value.
        ts_div[:, j, i] = ts_div[:, i, j]
    np.testing.assert_allclose(div, ts_div)
def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, chunks):
    """Compare windowed ``stat_divergence`` with a scikit-allel moving statistic.

    A tree sequence from ``simulate_ts`` is converted to a dataset, split into
    cohorts, windowed and passed through ``divergence``. The same quantity is
    then rebuilt with scikit-allel by summing the mean pairwise difference
    between two allele-count arrays over fixed-size windows.

    Parameters
    ----------
    sample_size
        Number of samples passed to ``simulate_ts``.
    n_cohorts
        Number of cohorts passed to ``add_cohorts``.
    chunks
        Chunking argument forwarded to ``ts_to_dataset``.
    """
    # Single source of truth for the window size, shared by the dataset
    # windowing and the scikit-allel moving statistic.
    window_size = 25

    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)
    ds = divergence(ds)
    div = ds["stat_divergence"].values
    # test off-diagonal entries, by replacing diagonal with NaNs
    # (index with n_cohorts rather than a literal 2 so every diagonal entry
    # is masked regardless of how many cohorts the test is run with)
    cohort_idx = np.arange(n_cohorts)
    div[:, cohort_idx, cohort_idx] = np.nan

    # Calculate divergence using scikit-allel moving_statistic
    # (Don't use windowed_divergence, since it treats the last window differently)
    # NOTE(review): the two sample groups below are hand-picked slices and are
    # not derived from `subsets`; confirm they match the cohort split made by
    # add_cohorts -- a mismatch could be related to the TODO further down.
    ds1 = count_variant_alleles(
        ts_to_dataset(ts, samples=ts.samples()[:1])  # type: ignore[no-untyped-call]
    )
    ds2 = count_variant_alleles(
        ts_to_dataset(ts, samples=ts.samples()[1:])  # type: ignore[no-untyped-call]
    )
    ac1 = ds1["variant_allele_count"].values
    ac2 = ds2["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=window_size)
    # TODO: investigate why numbers are different
    np.testing.assert_allclose(
        div[:-1], ska_div
    )  # scikit-allel has final window missing
def test_divergence(sample_size, n_cohorts, chunks):
    """Check ``stat_divergence`` summed over axis 0 against the tree sequence.

    The diagonal of the summed matrix is compared with ``ts.diversity`` of each
    cohort, and the off-diagonal entries with ``ts.divergence`` of each pair of
    cohorts (evaluated in both argument orders).

    Parameters
    ----------
    sample_size
        Number of samples passed to ``msprime.simulate``.
    n_cohorts
        Number of cohorts passed to ``add_cohorts``.
    chunks
        Chunking argument forwarded to ``ts_to_dataset``.
    """
    ts = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, cohorts = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = divergence(ds)

    # skipna=False keeps NaNs in the total instead of silently dropping them.
    summed = ds.stat_divergence.sum(axis=0, skipna=False)
    observed = summed.values

    # entries on the diagonal are diversity values
    for k in range(n_cohorts):
        expected_diversity = ts.diversity([cohorts[k]], span_normalise=False)
        np.testing.assert_allclose(observed[k, k], expected_diversity)

    # test off-diagonal entries, by replacing diagonal with NaNs
    np.fill_diagonal(observed, np.nan)
    expected = np.full([n_cohorts, n_cohorts], np.nan)
    for a, b in itertools.combinations(range(n_cohorts), 2):
        expected[a, b] = ts.divergence([cohorts[a], cohorts[b]], span_normalise=False)
        expected[b, a] = ts.divergence([cohorts[b], cohorts[a]], span_normalise=False)
    np.testing.assert_allclose(observed, expected)