Пример #1
0
def test_Fst__windowed(sample_size, n_cohorts, chunks):
    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    fst_ds = Fst(ds, estimator="Nei")
    fst = fst_ds["stat_Fst"].values

    # Calculate Fst using tskit windows
    # Find the variant positions so we can have windows with a fixed number of variants
    positions = ts.tables.sites.position
    windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length]))
    n_windows = len(windows) - 1
    ts_fst = np.full([n_windows, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_fst[:, i, j] = ts.Fst([subsets[i], subsets[j]],
                                 windows=windows,
                                 span_normalise=False)
        ts_fst[:, j, i] = ts_fst[:, i, j]

    # We can values close to zero, and the default value of atol isn't
    # appropriate for this.
    atol = 1e-8
    np.testing.assert_allclose(fst, ts_fst, atol=atol)

    # scikit-allel
    fst_ds = Fst(ds, estimator="Hudson")
    for i, j in itertools.combinations(range(n_cohorts), 2):
        fst = fst_ds["stat_Fst"].sel(cohorts_0=f"co_{i}",
                                     cohorts_1=f"co_{j}").values

        ac_i = fst_ds.cohort_allele_count.values[:, i, :]
        ac_j = fst_ds.cohort_allele_count.values[:, j, :]
        ska_fst = allel.moving_hudson_fst(ac_i, ac_j, size=25)

        np.testing.assert_allclose(
            fst[:-1], ska_fst,
            atol=atol)  # scikit-allel has final window missing
Пример #2
0
def test_Fst__Nei(sample_size, n_cohorts):
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)
    ds, subsets = add_cohorts(ds, ts, n_cohorts)
    n_variants = ds.dims["variants"]
    ds = window_by_variant(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Nei")
    fst = ds.stat_Fst.values

    ts_fst = np.full([1, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_fst[0, i, j] = ts.Fst([subsets[i], subsets[j]])
        ts_fst[0, j, i] = ts_fst[0, i, j]
    np.testing.assert_allclose(fst, ts_fst)
Пример #3
0
def test_Fst__Hudson(sample_size):
    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)
    ds, subsets = add_cohorts(ds, ts, n_cohorts)
    n_variants = ds.dims["variants"]
    ds = window_by_variant(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Hudson")
    fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    ac1 = ds.cohort_allele_count.values[:, 0, :]
    ac2 = ds.cohort_allele_count.values[:, 1, :]
    num, den = hudson_fst(ac1, ac2)
    ska_fst = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(fst, ska_fst)
Пример #4
0
def test_Fst__Nei(sample_size, n_cohorts):
    ts = msprime.simulate(sample_size,
                          length=100,
                          mutation_rate=0.05,
                          random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    n_variants = ds.dims["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Nei")
    fst = ds.stat_Fst.values

    ts_fst = np.full([1, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_fst[0, i, j] = ts.Fst([subsets[i], subsets[j]])
        ts_fst[0, j, i] = ts_fst[0, i, j]
    np.testing.assert_allclose(fst, ts_fst)
Пример #5
0
def test_Fst__Hudson(sample_size):
    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2
    ts = msprime.simulate(sample_size,
                          length=100,
                          mutation_rate=0.05,
                          random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    n_variants = ds.dims["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Hudson")
    fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    ac1 = ds.cohort_allele_count.values[:, 0, :]
    ac2 = ds.cohort_allele_count.values[:, 1, :]
    num, den = hudson_fst(ac1, ac2)
    ska_fst = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(fst, ska_fst)
Пример #6
0
def test_Fst__unknown_estimator():
    ts = simulate_ts(2)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    with pytest.raises(ValueError,
                       match="Estimator 'Unknown' is not a known estimator"):
        Fst(ds, estimator="Unknown")
Пример #7
0
def test_Fst__unknown_estimator():
    ts = msprime.simulate(2, length=100, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    with pytest.raises(ValueError,
                       match="Estimator 'Unknown' is not a known estimator"):
        Fst(ds, estimator="Unknown")