def test_Fst__windowed(sample_size, n_cohorts, chunks):
    """Check windowed Fst against tskit (Nei) and scikit-allel (Hudson).

    The dataset is windowed into fixed-size groups of variants. The "Nei"
    estimator is compared with ``ts.Fst`` evaluated over matching windows,
    and the "Hudson" estimator is compared pair-by-pair with
    ``allel.moving_hudson_fst``.
    """
    window_size = 25
    # Values can be very close to zero, and the default value of atol isn't
    # appropriate for that.
    atol = 1e-8

    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)

    nei_ds = Fst(ds, estimator="Nei")
    observed_nei = nei_ds["stat_Fst"].values

    # Calculate Fst using tskit windows.
    # Window edges are placed at variant positions so that every window
    # holds a fixed number of variants.
    site_positions = ts.tables.sites.position
    inner_edges = site_positions[window_size::window_size]
    windows = np.concatenate(([0], inner_edges, [ts.sequence_length]))
    n_windows = len(windows) - 1

    expected_nei = np.full((n_windows, n_cohorts, n_cohorts), np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        pair_fst = ts.Fst(
            [subsets[i], subsets[j]], windows=windows, span_normalise=False
        )
        # Fill both triangles so the expected matrix is symmetric.
        expected_nei[:, i, j] = pair_fst
        expected_nei[:, j, i] = expected_nei[:, i, j]

    np.testing.assert_allclose(observed_nei, expected_nei, atol=atol)

    # scikit-allel
    hudson_ds = Fst(ds, estimator="Hudson")
    for i, j in itertools.combinations(range(n_cohorts), 2):
        observed_hudson = (
            hudson_ds["stat_Fst"].sel(cohorts_0=f"co_{i}", cohorts_1=f"co_{j}").values
        )

        ac_i = hudson_ds.cohort_allele_count.values[:, i, :]
        ac_j = hudson_ds.cohort_allele_count.values[:, j, :]
        ska_fst = allel.moving_hudson_fst(ac_i, ac_j, size=window_size)

        # scikit-allel has final window missing, so drop ours before comparing.
        np.testing.assert_allclose(observed_hudson[:-1], ska_fst, atol=atol)
def test_Fst__Nei(sample_size, n_cohorts):
    """Compare the "Nei" Fst estimator with ``ts.Fst`` over a single window.

    All variants are placed in one window, so the result has a single
    leading entry holding an ``n_cohorts`` x ``n_cohorts`` matrix.
    """
    # NOTE(review): a later function in this file reuses this name; if both
    # live in the same module this definition is shadowed -- confirm.
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)
    ds, subsets = add_cohorts(ds, ts, n_cohorts)

    # A window as wide as the whole dataset gives exactly one window.
    total_variants = ds.dims["variants"]
    ds = window_by_variant(ds, size=total_variants)

    observed = Fst(ds, estimator="Nei").stat_Fst.values

    # Only off-diagonal cohort pairs are filled in; the rest stays NaN.
    expected = np.full((1, n_cohorts, n_cohorts), np.nan)
    for a, b in itertools.combinations(range(n_cohorts), 2):
        expected[0, a, b] = ts.Fst([subsets[a], subsets[b]])
        expected[0, b, a] = expected[0, a, b]

    np.testing.assert_allclose(observed, expected)
def test_Fst__Hudson(sample_size):
    """Compare the "Hudson" Fst estimator with scikit-allel for two cohorts.

    All variants are placed in one window, and the reference value is the
    ratio of the summed numerators to the summed denominators returned by
    ``hudson_fst``.
    """
    # NOTE(review): a later function in this file reuses this name; if both
    # live in the same module this definition is shadowed -- confirm.

    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2

    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)
    ds, _ = add_cohorts(ds, ts, n_cohorts)

    # A window as wide as the whole dataset gives exactly one window.
    total_variants = ds.dims["variants"]
    ds = window_by_variant(ds, size=total_variants)
    ds = Fst(ds, estimator="Hudson")
    observed = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    first_cohort_ac = ds.cohort_allele_count.values[:, 0, :]
    second_cohort_ac = ds.cohort_allele_count.values[:, 1, :]
    numerators, denominators = hudson_fst(first_cohort_ac, second_cohort_ac)
    expected = np.sum(numerators) / np.sum(denominators)

    np.testing.assert_allclose(observed, expected)
def test_Fst__Nei(sample_size, n_cohorts):
    """Compare the "Nei" Fst estimator with ``ts.Fst`` over a single window.

    The tree sequence comes from ``msprime.simulate`` with a fixed seed. All
    variants are placed in one window, so the result has a single leading
    entry holding an ``n_cohorts`` x ``n_cohorts`` matrix.
    """
    ts = msprime.simulate(
        sample_size,
        length=100,
        mutation_rate=0.05,
        random_seed=42,
    )
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]

    # A window as wide as the whole dataset gives exactly one window.
    total_variants = ds.dims["variants"]
    ds = window(ds, size=total_variants)

    observed = Fst(ds, estimator="Nei").stat_Fst.values

    # Only off-diagonal cohort pairs are filled in; the rest stays NaN.
    expected = np.full((1, n_cohorts, n_cohorts), np.nan)
    for a, b in itertools.combinations(range(n_cohorts), 2):
        expected[0, a, b] = ts.Fst([subsets[a], subsets[b]])
        expected[0, b, a] = expected[0, a, b]

    np.testing.assert_allclose(observed, expected)
def test_Fst__Hudson(sample_size):
    """Compare the "Hudson" Fst estimator with scikit-allel for two cohorts.

    The tree sequence comes from ``msprime.simulate`` with a fixed seed. All
    variants are placed in one window, and the reference value is the ratio
    of the summed numerators to the summed denominators returned by
    ``hudson_fst``.
    """
    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2

    ts = msprime.simulate(
        sample_size,
        length=100,
        mutation_rate=0.05,
        random_seed=42,
    )
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]

    # A window as wide as the whole dataset gives exactly one window.
    total_variants = ds.dims["variants"]
    ds = window(ds, size=total_variants)
    ds = Fst(ds, estimator="Hudson")
    observed = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    first_cohort_ac = ds.cohort_allele_count.values[:, 0, :]
    second_cohort_ac = ds.cohort_allele_count.values[:, 1, :]
    numerators, denominators = hudson_fst(first_cohort_ac, second_cohort_ac)
    expected = np.sum(numerators) / np.sum(denominators)

    np.testing.assert_allclose(observed, expected)
def test_Fst__unknown_estimator():
    """Fst must raise ``ValueError`` when given an unrecognised estimator."""
    # NOTE(review): a later function in this file reuses this name; if both
    # live in the same module this definition is shadowed -- confirm.
    expected_message = "Estimator 'Unknown' is not a known estimator"

    tree_sequence = simulate_ts(2)
    ds = ts_to_dataset(tree_sequence)  # type: ignore[no-untyped-call]

    with pytest.raises(ValueError, match=expected_message):
        Fst(ds, estimator="Unknown")
def test_Fst__unknown_estimator():
    """Fst must raise ``ValueError`` when given an unrecognised estimator.

    The input dataset is built from a two-sample ``msprime.simulate`` run
    with a fixed seed.
    """
    expected_message = "Estimator 'Unknown' is not a known estimator"

    tree_sequence = msprime.simulate(
        2,
        length=100,
        mutation_rate=0.05,
        random_seed=42,
    )
    ds = ts_to_dataset(tree_sequence)  # type: ignore[no-untyped-call]

    with pytest.raises(ValueError, match=expected_message):
        Fst(ds, estimator="Unknown")