def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, chunks):
    """Compare windowed divergence against a scikit-allel moving statistic.

    Divergence is computed over windows of 25 variants, and the entry for
    cohorts 0 and 1 is compared with the windowed sum of scikit-allel's
    per-variant mean pairwise difference between two groups of samples.

    Args:
        sample_size: Forwarded to ``simulate_ts``.
        n_cohorts: Forwarded to ``add_cohorts``. Only the (0, 1) cohort pair
            is checked, so at least two cohorts are required.
        chunks: Forwarded to ``ts_to_dataset``.
    """
    window_size = 25

    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)
    ds = divergence(ds)

    # scikit-allel yields one between-group value per window, so select the
    # single off-diagonal entry (cohort 0 vs cohort 1) instead of comparing the
    # full cohorts x cohorts matrix, whose shape cannot match a 1-D result.
    div = ds["stat_divergence"].values[:, 0, 1]

    # Calculate divergence using scikit-allel moving_statistic
    # (Don't use windowed_divergence, since it treats the last window differently)
    # NOTE(review): the two groups below are samples[:1] and samples[1:], while
    # the cohorts are assigned by add_cohorts (its returned `subsets` is unused
    # here). If those partitions differ, that would account for the mismatch in
    # the TODO below -- confirm against add_cohorts.
    ds1 = count_variant_alleles(ts_to_dataset(ts, samples=ts.samples()[:1]))  # type: ignore[no-untyped-call]
    ds2 = count_variant_alleles(ts_to_dataset(ts, samples=ts.samples()[1:]))  # type: ignore[no-untyped-call]
    ac1 = ds1["variant_allele_count"].values
    ac2 = ds2["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=window_size)

    # TODO: investigate why numbers are different
    # scikit-allel has final window missing
    np.testing.assert_allclose(div[:-1], ska_div)
def test_diversity__windowed(sample_size):
    """Check windowed diversity against tskit and scikit-allel.

    Diversity for cohort ``co_0`` is computed over windows of 25 variants and
    compared with two references: the tree sequence's own ``diversity`` over
    matching windows, and a scikit-allel moving sum of the per-variant mean
    pairwise difference.
    """
    window_size = 25
    ts = simulate_ts(sample_size, length=200)

    cohort_ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    cohort_ds, _ = add_cohorts(cohort_ds, ts, cohort_key_names=["cohorts"])  # type: ignore[no-untyped-call]
    cohort_ds = diversity(window(cohort_ds, size=window_size))
    observed = cohort_ds["stat_diversity"].sel(cohorts="co_0").compute()

    # Reference 1: tskit windows.
    # Window edges sit on every 25th variant position, so that each tskit
    # window holds a fixed number of variants.
    site_positions = ts.tables.sites.position
    inner_edges = site_positions[::window_size][1:]
    window_edges = np.concatenate(([0], inner_edges, [ts.sequence_length]))
    expected_tskit = ts.diversity(windows=window_edges, span_normalise=False)
    np.testing.assert_allclose(observed, expected_tskit)

    # Reference 2: scikit-allel moving_statistic.
    # (windowed_diversity is avoided because it treats the last window differently.)
    counts_ds = count_variant_alleles(ts_to_dataset(ts))  # type: ignore[no-untyped-call]
    allele_counts = counts_ds["variant_allele_count"].values
    per_variant_mpd = allel.mean_pairwise_difference(allele_counts, fill=0)
    expected_allel = allel.moving_statistic(per_variant_mpd, np.sum, size=window_size)

    # scikit-allel has the final window missing, so drop it from our result too.
    np.testing.assert_allclose(observed[:-1], expected_allel)