示例#1
0
def test_window():
    """Window a 10-variant dataset with size 2 / step 2 and check the result.

    Five windows on contig 0 are expected, covering variant index ranges
    [0, 2), [2, 4), ... [8, 10). Windowing the already-windowed dataset a
    second time is expected to surface ``MergeWarning``.
    """
    dataset = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
    assert not has_windows(dataset)

    dataset = window(dataset, 2, 2)
    assert has_windows(dataset)

    expectations = (
        (window_contig, [0, 0, 0, 0, 0]),
        (window_start, [0, 2, 4, 6, 8]),
        (window_stop, [2, 4, 6, 8, 10]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(dataset[variable].values, expected)

    # NOTE(review): pytest.raises only sees a warning class when warnings are
    # escalated to errors -- presumably done in the pytest config; confirm.
    with pytest.raises(MergeWarning):
        window(dataset, 2, 2)
示例#2
0
def test_observed_heterozygosity__scikit_allel_comparison(
        n_variant, n_sample, missing_pct, window_size, seed):
    """Compare windowed observed heterozygosity against scikit-allel.

    A diploid dataset is simulated, every sample is assigned to cohort 0,
    and the dataset is windowed with ``window_size``. The per-window
    statistic is then compared with a scikit-allel moving sum of the
    per-variant observed heterozygosity.

    The arguments are presumably supplied by pytest parametrization (the
    decorator is not visible here).
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    ds["sample_cohort"] = (
        ["samples"],
        np.zeros(n_sample, int),
    )
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values
    # The moving statistic below runs along the variants axis, so a ragged
    # final window exists exactly when the number of variants (not samples)
    # is not a multiple of the window size.
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[:-1]
    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )
    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
示例#3
0
def test_pbs__windowed(sample_size, n_cohorts, cohorts, cohort_indexes,
                       chunks):
    """Check windowed PBS values against scikit-allel for each cohort triple.

    Triples that are not listed in ``cohort_indexes`` (when it is given) are
    expected to be all-NaN; every other triple is compared with
    ``allel.pbs`` using the same window size.
    """
    window_size = 25
    cohort_dims = ("cohorts_0", "cohorts_1", "cohorts_2")

    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(
        ds, ts, n_cohorts,
        cohort_key_names=list(cohort_dims))  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)
    ds = pbs(ds, cohorts=cohorts)

    # scikit-allel
    for triple in itertools.combinations(range(n_cohorts), 3):
        selector = {
            dim: f"co_{index}" for dim, index in zip(cohort_dims, triple)
        }
        stat_pbs = ds["stat_pbs"].sel(**selector).values

        if cohort_indexes is not None and triple not in cohort_indexes:
            # triples that were not requested must be entirely NaN
            np.testing.assert_array_equal(stat_pbs,
                                          np.full_like(stat_pbs, np.nan))
            continue

        allele_counts = ds.cohort_allele_count.values
        ac_i, ac_j, ac_k = (allele_counts[:, index, :] for index in triple)
        ska_pbs_value = allel.pbs(ac_i, ac_j, ac_k, window_size=window_size)

        # scikit-allel has final window missing
        np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)
示例#4
0
def test_pbs(sample_size, n_cohorts):
    """Compare single-window PBS with scikit-allel for every cohort triple.

    The whole dataset is treated as one window (window size equals the
    number of variants), and the same window size is passed to
    ``allel.pbs`` for the reference value.
    """
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(
        ds,
        ts,
        n_cohorts,
        cohort_key_names=["cohorts_0", "cohorts_1",
                          "cohorts_2"])  # type: ignore[no-untyped-call]
    # Use Dataset.sizes for the name -> length mapping; mapping-style access
    # on Dataset.dims is deprecated in recent xarray releases.
    n_variants = ds.sizes["variants"]
    ds = window(ds, size=n_variants)  # single window

    ds = pbs(ds)

    # scikit-allel
    for i, j, k in itertools.combinations(range(n_cohorts), 3):
        stat_pbs = (ds["stat_pbs"].sel(cohorts_0=f"co_{i}",
                                       cohorts_1=f"co_{j}",
                                       cohorts_2=f"co_{k}").values)

        ac_i = ds.cohort_allele_count.values[:, i, :]
        ac_j = ds.cohort_allele_count.values[:, j, :]
        ac_k = ds.cohort_allele_count.values[:, k, :]

        ska_pbs_value = allel.pbs(ac_i, ac_j, ac_k, window_size=n_variants)

        np.testing.assert_allclose(stat_pbs, ska_pbs_value)
示例#5
0
def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts,
                                                      chunks):
    """Compare windowed divergence with a scikit-allel moving statistic.

    The reference value is a moving sum (window of 25 variants) of
    ``allel.mean_pairwise_difference_between`` computed from allele counts
    of the first sample versus all remaining samples.

    NOTE(review): the TODO below records that the two results are known to
    differ; this test is presumably marked as an expected failure by a
    decorator that is not visible here -- confirm.
    """
    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(ds, ts,
                        n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    ds = divergence(ds)
    div = ds["stat_divergence"].values
    # test off-diagonal entries, by replacing diagonal with NaNs
    # (mask the full n_cohorts diagonal rather than a hard-coded 2x2 one)
    diagonal = np.arange(n_cohorts)
    div[:, diagonal, diagonal] = np.nan

    # Calculate divergence using scikit-allel moving_statistic
    # (Don't use windowed_divergence, since it treats the last window differently)
    ds1 = count_variant_alleles(ts_to_dataset(
        ts, samples=ts.samples()[:1]))  # type: ignore[no-untyped-call]
    ds2 = count_variant_alleles(ts_to_dataset(
        ts, samples=ts.samples()[1:]))  # type: ignore[no-untyped-call]
    ac1 = ds1["variant_allele_count"].values
    ac2 = ds2["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=25)
    # TODO: investigate why numbers are different
    np.testing.assert_allclose(
        div[:-1], ska_div)  # scikit-allel has final window missing
示例#6
0
def test_diversity__windowed(sample_size):
    """Check windowed diversity for cohort ``co_0`` against tskit and scikit-allel.

    Windows hold 25 variants each. The tskit reference uses genomic
    breakpoints placed at every 25th site position; the scikit-allel
    reference is a moving sum of the mean pairwise difference.
    """
    window_size = 25

    ts = simulate_ts(sample_size, length=200)
    cohort_ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    cohort_ds, _ = add_cohorts(
        cohort_ds, ts,
        cohort_key_names=["cohorts"])  # type: ignore[no-untyped-call]
    cohort_ds = diversity(window(cohort_ds, size=window_size))
    div = cohort_ds["stat_diversity"].sel(cohorts="co_0").compute()

    # tskit reference: derive breakpoints from the site positions so that
    # each window holds a fixed number of variants
    site_positions = ts.tables.sites.position
    interior_breaks = site_positions[::window_size][1:]
    breakpoints = np.concatenate(
        ([0], interior_breaks, [ts.sequence_length]))
    ts_div = ts.diversity(windows=breakpoints, span_normalise=False)
    np.testing.assert_allclose(div, ts_div)

    # scikit-allel reference via moving_statistic
    # (windowed_diversity is avoided: it treats the last window differently)
    counts_ds = count_variant_alleles(
        ts_to_dataset(ts))  # type: ignore[no-untyped-call]
    allele_counts = counts_ds["variant_allele_count"].values
    pairwise_diff = allel.mean_pairwise_difference(allele_counts, fill=0)
    ska_div = allel.moving_statistic(pairwise_diff, np.sum, size=window_size)
    # scikit-allel has final window missing
    np.testing.assert_allclose(div[:-1], ska_div)
示例#7
0
def test_divergence__windowed(sample_size, n_cohorts, chunks):
    """Compare windowed pairwise cohort divergence with tskit.

    Windows hold 25 variants each. The tskit reference is computed per
    cohort pair over genomic breakpoints placed at every 25th site
    position, and is left as NaN on the diagonal; the diagonal of the
    computed statistic is therefore overwritten with NaN before comparing.
    """
    ts = msprime.simulate(sample_size,
                          length=200,
                          mutation_rate=0.05,
                          random_seed=42)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    ds = divergence(ds)
    div = ds["stat_divergence"].values
    # test off-diagonal entries, by replacing diagonal with NaNs
    # (mask the full n_cohorts diagonal so it matches ts_div for any
    # number of cohorts, not just two)
    diagonal = np.arange(n_cohorts)
    div[:, diagonal, diagonal] = np.nan

    # Calculate divergence using tskit windows
    # Find the variant positions so we can have windows with a fixed number of variants
    positions = ts.tables.sites.position
    windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length]))
    n_windows = len(windows) - 1
    ts_div = np.full([n_windows, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_div[:, i, j] = ts.divergence([subsets[i], subsets[j]],
                                        windows=windows,
                                        span_normalise=False)
        # the reference matrix is filled symmetrically
        ts_div[:, j, i] = ts_div[:, i, j]
    np.testing.assert_allclose(div, ts_div)
示例#8
0
def test_window__default_step():
    """Window a 10-variant dataset with size 2 and no explicit step.

    The expected windows are the same five non-overlapping ranges on
    contig 0 that an explicit step of 2 would give.
    """
    dataset = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
    assert not has_windows(dataset)

    dataset = window(dataset, 2)
    assert has_windows(dataset)

    expectations = (
        (window_contig, [0, 0, 0, 0, 0]),
        (window_start, [0, 2, 4, 6, 8]),
        (window_stop, [2, 4, 6, 8, 10]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(dataset[variable].values, expected)
示例#9
0
def test_observed_heterozygosity__windowed(chunks):
    """Check windowed observed heterozygosity on a hand-written tetraploid set.

    Four variants, six samples and ploidy 4 are used, with negative allele
    values marking missing calls. Samples are assigned pairwise to cohorts
    0, 1 and 2, and variants are windowed in pairs. The expected array
    presumably has one row per window and one column per cohort.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=4,
        n_sample=6,
        n_ploidy=4,
    )

    # One entry per variant; within a variant, one row per sample.
    variant_0 = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [0, 0, 0, 0],
        [1, 1, 1, 1],
    ]
    variant_1 = [
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 1],
        [0, 0, 1, 1],
        [1, 0, 1, 0],
        [0, 1, 0, 1],
    ]
    variant_2 = [
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 1, 2, 3],
        [0, 1, 2, 3],
        [0, 1, 2, 3],
        [4, 5, 6, 7],
    ]
    variant_3 = [
        [0, 0, -1, -1],
        [0, 1, -1, -1],
        [0, 0, 1, 1],
        [-1, -1, -1, -1],
        [0, -1, -1, -1],
        [-1, -1, -1, -1],
    ]
    genotypes = da.asarray([variant_0, variant_1, variant_2, variant_3])
    ds["call_genotype"] = (
        ["variants", "samples", "ploidy"],
        genotypes.rechunk(chunks),
    )
    # keep the mask consistent with the replaced genotype calls
    ds.call_genotype_mask.values = ds.call_genotype < 0

    cohort_ids = da.asarray([0, 0, 1, 1, 2, 2])
    ds["sample_cohort"] = (["samples"], cohort_ids.rechunk(chunks[1]))

    ds = window(ds, size=2)
    ho = observed_heterozygosity(ds)["stat_observed_heterozygosity"]

    expected = np.array([
        [1 / 4, 2 / 3, 2 / 3],
        [1 / 2, 5 / 3, np.nan],
    ])
    np.testing.assert_almost_equal(ho, expected)
示例#10
0
def test_window__multiple_contigs(n_variant, n_contig, window_contigs_exp,
                                  window_starts_exp, window_stops_exp):
    """Window a multi-contig dataset (size 2, step 2) and check each variable.

    The expected contig, start and stop arrays are passed in as arguments,
    presumably via pytest parametrization.
    """
    simulated = simulate_genotype_call_dataset(n_variant=n_variant,
                                               n_sample=1,
                                               n_contig=n_contig)
    windowed = window(simulated, 2, 2)

    expectations = (
        (window_contig, window_contigs_exp),
        (window_start, window_starts_exp),
        (window_stop, window_stops_exp),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)
示例#11
0
def test_Tajimas_D(sample_size):
    """Compare single-window Tajima's D with the tskit value.

    The whole dataset is treated as one window by using the number of
    variants as the window size.
    """
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(
        ds, ts, cohort_key_names=None)  # type: ignore[no-untyped-call]
    # Use Dataset.sizes for the name -> length mapping; mapping-style access
    # on Dataset.dims is deprecated in recent xarray releases.
    n_variants = ds.sizes["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Tajimas_D(ds)
    d = ds.stat_Tajimas_D.compute()
    ts_d = ts.Tajimas_D()
    np.testing.assert_allclose(d, ts_d)
示例#12
0
def test_Tajimas_D(sample_size):
    """Compare single-window Tajima's D with tskit on an msprime simulation.

    The tree sequence is simulated with a fixed random seed, and the whole
    dataset is treated as one window by using the number of variants as the
    window size.
    """
    ts = msprime.simulate(sample_size,
                          length=100,
                          mutation_rate=0.05,
                          random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(
        ds, ts, cohort_key_names=None)  # type: ignore[no-untyped-call]
    # Use Dataset.sizes for the name -> length mapping; mapping-style access
    # on Dataset.dims is deprecated in recent xarray releases.
    n_variants = ds.sizes["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Tajimas_D(ds)
    d = ds.stat_Tajimas_D.compute()
    ts_d = ts.Tajimas_D()
    np.testing.assert_allclose(d, ts_d)
示例#13
0
def test_Fst__Nei(sample_size, n_cohorts):
    """Compare single-window Fst (Nei estimator) with tskit for cohort pairs.

    The reference array is NaN on the diagonal and filled symmetrically
    with ``ts.Fst`` for every pair of cohorts.
    """
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    # Use Dataset.sizes for the name -> length mapping; mapping-style access
    # on Dataset.dims is deprecated in recent xarray releases.
    n_variants = ds.sizes["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Nei")
    fst = ds.stat_Fst.values

    # leading axis of length 1 corresponds to the single window
    ts_fst = np.full([1, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_fst[0, i, j] = ts.Fst([subsets[i], subsets[j]])
        ts_fst[0, j, i] = ts_fst[0, i, j]
    np.testing.assert_allclose(fst, ts_fst)
示例#14
0
def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, cohorts,
                 cohort_indexes, chunks):
    """Check the windowed Garud H statistics against scikit-allel per cohort.

    Samples are split into ``n_cohorts`` groups with ``np.array_split`` and
    variants are windowed in threes. Cohorts not listed in
    ``cohort_indexes`` (when it is given) are expected to be all-NaN; the
    others are compared with ``allel.moving_garud_h``.
    """
    window_size = 3

    ds = simulate_genotype_call_dataset(n_variant=n_variants,
                                        n_sample=n_samples,
                                        n_contig=n_contigs)
    ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))

    # label every sample with the index of the split it falls into
    sample_groups = np.array_split(ds.samples.values, n_cohorts)
    sample_cohorts = np.concatenate([
        np.full_like(group, cohort_id)
        for cohort_id, group in enumerate(sample_groups)
    ])
    ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
    cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    ds = ds.assign_coords(
        {"cohorts": cohort_names})  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)

    gh = Garud_H(ds, cohorts=cohorts)
    h1 = gh.stat_Garud_h1.values
    h12 = gh.stat_Garud_h12.values
    h123 = gh.stat_Garud_h123.values
    h2_h1 = gh.stat_Garud_h2_h1.values

    # scikit-allel
    for c in range(n_cohorts):
        # same order as the tuple returned by moving_garud_h is indexed below
        computed = (h1[:, c], h12[:, c], h123[:, c], h2_h1[:, c])

        if cohort_indexes is not None and c not in cohort_indexes:
            # cohorts that were not computed should be nan
            for column in computed:
                np.testing.assert_array_equal(column,
                                              np.full_like(column, np.nan))
            continue

        gt = ds.call_genotype.values[:, sample_cohorts == c, :]
        haplotypes = allel.GenotypeArray(gt).to_haplotypes()
        ska_h = allel.moving_garud_h(haplotypes, size=window_size)

        for position, column in enumerate(computed):
            np.testing.assert_allclose(column, ska_h[position])
示例#15
0
def test_Fst__Hudson(sample_size):
    """Compare single-window Fst (Hudson estimator) with scikit-allel.

    The reference value is the ratio of the summed numerators to the summed
    denominators returned by ``hudson_fst`` for the two cohorts.
    """
    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(ds, ts,
                        n_cohorts)  # type: ignore[no-untyped-call]
    # Use Dataset.sizes for the name -> length mapping; mapping-style access
    # on Dataset.dims is deprecated in recent xarray releases.
    n_variants = ds.sizes["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Hudson")
    fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    ac1 = ds.cohort_allele_count.values[:, 0, :]
    ac2 = ds.cohort_allele_count.values[:, 1, :]
    num, den = hudson_fst(ac1, ac2)
    ska_fst = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(fst, ska_fst)
示例#16
0
def test_Fst__windowed(sample_size, n_cohorts, chunks):
    """Check windowed Fst against tskit (Nei) and scikit-allel (Hudson).

    Windows hold 25 variants each. The tskit reference is computed per
    cohort pair over genomic breakpoints placed at every 25th site
    position; the scikit-allel reference uses ``moving_hudson_fst`` with
    the same window size.
    """
    window_size = 25
    # We can get values close to zero, and the default value of atol isn't
    # appropriate for this.
    atol = 1e-8

    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=window_size)

    # --- Nei estimator versus tskit ---
    nei_fst = Fst(ds, estimator="Nei")["stat_Fst"].values

    # Find the variant positions so we can have windows with a fixed number
    # of variants
    site_positions = ts.tables.sites.position
    breakpoints = np.concatenate(
        ([0], site_positions[::window_size][1:], [ts.sequence_length]))
    n_windows = len(breakpoints) - 1
    ts_fst = np.full([n_windows, n_cohorts, n_cohorts], np.nan)
    for i, j in itertools.combinations(range(n_cohorts), 2):
        ts_fst[:, i, j] = ts.Fst([subsets[i], subsets[j]],
                                 windows=breakpoints,
                                 span_normalise=False)
        ts_fst[:, j, i] = ts_fst[:, i, j]
    np.testing.assert_allclose(nei_fst, ts_fst, atol=atol)

    # --- Hudson estimator versus scikit-allel ---
    hudson_ds = Fst(ds, estimator="Hudson")
    for i, j in itertools.combinations(range(n_cohorts), 2):
        pair_fst = hudson_ds["stat_Fst"].sel(cohorts_0=f"co_{i}",
                                             cohorts_1=f"co_{j}").values

        ac_i = hudson_ds.cohort_allele_count.values[:, i, :]
        ac_j = hudson_ds.cohort_allele_count.values[:, j, :]
        ska_fst = allel.moving_hudson_fst(ac_i, ac_j, size=window_size)

        # scikit-allel has final window missing
        np.testing.assert_allclose(pair_fst[:-1], ska_fst, atol=atol)