예제 #1
0
def test_pca__raise_on_missing_data(sample_dataset, sentinel):
    """PCA must raise ``ValueError`` when allele counts hold a missing-value sentinel.

    Every allele count that is not equal to 1 is replaced by ``sentinel``
    before the dataset is handed to ``pca.pca``.
    """
    counts = sample_dataset["call_alternate_allele_count"]
    # Keep the entries equal to 1; everything else becomes the sentinel.
    masked_counts = counts.where(counts == 1, sentinel)
    corrupted = sample_dataset.assign(call_alternate_allele_count=masked_counts)
    expected_message = "Input data cannot contain missing values"
    with pytest.raises(ValueError, match=expected_message):
        pca.pca(corrupted, n_components=2)
예제 #2
0
def test_pca__default_allele_counts_with_index(sample_dataset):
    """Run PCA on an indexed dataset that lacks precomputed allele counts.

    ``call_alternate_allele_count`` is dropped and the ``variants`` dimension
    is indexed by ``(variant_contig, variant_position)``; the test passes if
    the resulting PCA dataset can be computed without error.
    """
    # NOTE(review): presumably pca.pca derives the allele counts itself when
    # the variable is absent -- confirm against the pca implementation.
    without_counts = sample_dataset.drop_vars("call_alternate_allele_count")
    indexed = without_counts.set_index(
        {"variants": ("variant_contig", "variant_position")}
    )
    result = pca.pca(indexed, n_components=2, merge=False)
    result.compute()
예제 #3
0
def test_pca__lazy_evaluation(shape, chunks, algorithm):
    """Ensure that all new variables produced by PCA are backed by lazy arrays.

    A dataset of the given ``shape`` is simulated with the parametrized
    ``chunks`` and decomposed with ``algorithm``; every variable in the
    (unmerged) result must hold a ``da.Array``.
    """
    if algorithm == "tsqr" and all(c > 0 for c in chunks):
        # Report this combination as skipped rather than returning silently,
        # which would be shown as a passing test that verified nothing.
        # NOTE(review): presumably tsqr cannot handle chunking along every
        # dimension -- confirm against the pca implementation.
        pytest.skip("tsqr is not tested when every chunk size is positive")
    ds = simulate_dataset(*shape, chunks=chunks)  # type: ignore[misc]
    ds = pca.pca(ds, n_components=2, algorithm=algorithm, merge=False)
    for v in ds:
        assert isinstance(ds[v].data, da.Array)
예제 #4
0
def test_pca__array_backend(backend, algorithm):
    """Ensure that PCA succeeds regardless of the array backend of its input.

    The allele counts of a small simulated dataset are re-wrapped with
    ``backend.asarray`` before running PCA, and every output variable is
    then computed.
    """
    count_var = "call_alternate_allele_count"
    ds = simulate_dataset(25, 5)
    # Swap the underlying array for one produced by the backend under test.
    converted = backend.asarray(ds[count_var])
    ds[count_var] = ds[count_var].copy(data=converted)
    result = pca.pca(ds, n_components=2, algorithm=algorithm, merge=False)
    for name in result:
        result[name].compute()
예제 #5
0
def test_pca__tsqr_allel_comparison(shape, chunks, n_components):
    """Validate the chunked, non-random implementation against scikit-allel.

    The same simulated allele counts are decomposed with
    ``pca.pca(algorithm="tsqr")`` and with the ``allel_pca`` helper
    (non-randomized); both projections must be float32 before the results
    are compared by ``validate_allel_comparison``.
    """
    simulated = simulate_dataset(*shape, chunks=chunks)  # type: ignore[misc]
    chunked_result = pca.pca(simulated, n_components=n_components, algorithm="tsqr")
    # The reference implementation receives the counts as a float32 array.
    counts = simulated["call_alternate_allele_count"].values.astype("float32")
    reference_result = allel_pca(
        counts,
        n_components=n_components,
        scaler="patterson",
        randomized=False,
    )
    for result in (chunked_result, reference_result):
        assert result["sample_pca_projection"].values.dtype == np.float32
    validate_allel_comparison(chunked_result, reference_result)
예제 #6
0
def test_pca__stability(stability_test_result, chunks, algorithm):
    """Ensure that PCA results are stable across algorithms and chunkings.

    ``stability_test_result`` supplies the dataset shape together with the
    expected result. A fresh dataset of that shape is simulated with the
    parametrized ``chunks`` and decomposed with ``algorithm``; the outcome
    must agree with the expected result to within ``atol=1e-5``, so that
    sign flips caused by a change in chunking are detected.
    """
    if algorithm == "tsqr" and all(c > 0 for c in chunks):
        # Report this combination as skipped rather than returning silently,
        # which would be shown as a passing test that verified nothing.
        # NOTE(review): presumably tsqr cannot handle chunking along every
        # dimension -- confirm against the pca implementation.
        pytest.skip("tsqr is not tested when every chunk size is positive")
    shape, expected = stability_test_result
    ds = simulate_dataset(*shape, chunks=chunks, n_cohort=3)  # type: ignore[misc]
    actual = pca.pca(
        ds, n_components=2, algorithm=algorithm, n_iter=6, random_state=0, merge=False
    )
    # Results are expected to change slightly with chunking, but they
    # will change drastically (far more than 1e-5) if a sign flip occurs
    xr.testing.assert_allclose(expected, actual, atol=1e-5)
예제 #7
0
def test_pca__randomized_allel_comparison(shape, chunks, n_components):
    """Validate the chunked, randomized implementation against scikit-allel.

    Randomized validation requires more data, more structure, and fewer
    components for the results to be equal within the same tolerance as the
    deterministic SVD, hence the simulated dataset uses ``n_cohort=3``.
    """
    simulated = simulate_dataset(*shape, chunks=chunks, n_cohort=3)  # type: ignore[misc]
    chunked_result = pca.pca(
        simulated,
        n_components=n_components,
        algorithm="randomized",
        n_iter=5,
        random_state=0,
    )
    # The reference implementation receives the counts as a float32 array.
    counts = simulated["call_alternate_allele_count"].values.astype("float32")
    reference_result = allel_pca(
        counts,
        n_components=n_components,
        scaler="patterson",
        randomized=True,
        iterated_power=5,
        random_state=0,
    )
    for result in (chunked_result, reference_result):
        assert result["sample_pca_projection"].values.dtype == np.float32
    validate_allel_comparison(chunked_result, reference_result)
예제 #8
0
def stability_test_result(request):
    """Return ``(shape, result)`` where ``result`` is a tsqr PCA baseline.

    The shape is read from ``request.param``; the dataset is simulated with
    ``chunks=(-1, -1)`` and ``n_cohort=3`` and decomposed into two components.
    """
    # NOTE(review): request.param suggests this is a parametrized pytest
    # fixture whose decorator is outside the visible source -- confirm.
    dims = request.param
    unchunked = simulate_dataset(*dims, chunks=(-1, -1), n_cohort=3)  # type: ignore[misc]
    return dims, pca.pca(unchunked, n_components=2, algorithm="tsqr", merge=False)
예제 #9
0
def test_pca__default_allele_counts(sample_dataset):
    """Run PCA on a dataset that lacks precomputed allele counts.

    ``call_alternate_allele_count`` is dropped from the sample dataset; the
    test passes if the resulting PCA dataset can be computed without error.
    """
    # NOTE(review): presumably pca.pca derives the allele counts itself when
    # the variable is absent -- confirm against the pca implementation.
    without_counts = sample_dataset.drop_vars("call_alternate_allele_count")
    result = pca.pca(without_counts, n_components=2, merge=False)
    result.compute()