Exemplo n.º 1
0
def Garud_H(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    cohorts: Optional[Sequence[Union[int, str]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015).

    By default, values of this statistic are calculated across all variants.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    cohorts
        The cohorts to compute statistics for, specified as a sequence of
        cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `stat_Garud_h1` (windows, cohorts): Garud H1 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h1_spec`.

    - `stat_Garud_h12` (windows, cohorts): Garud H12 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h12_spec`.

    - `stat_Garud_h123` (windows, cohorts): Garud H123 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h123_spec`.

    - `stat_Garud_h2_h1` (windows, cohorts): Garud H2/H1 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h2_h1_spec`.

    Raises
    ------
    NotImplementedError
        If the dataset is not diploid.

    Warnings
    --------
    This function is currently only implemented for diploid datasets.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3, step=3)

    >>> gh = sg.Garud_H(ds)
    >>> gh["stat_Garud_h1"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.25 , 0.375],
        [0.375, 0.375]])
    >>> gh["stat_Garud_h12"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.375, 0.625],
        [0.625, 0.625]])
    >>> gh["stat_Garud_h123"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.625, 1.   ],
        [1.   , 1.   ]])
    >>> gh["stat_Garud_h2_h1"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.75      , 0.33333333],
        [0.33333333, 0.33333333]])
    """

    if ds.dims["ploidy"] != 2:
        raise NotImplementedError(
            "Garud H only implemented for diploid genotypes")

    if not has_windows(ds):
        raise ValueError("Dataset must be windowed for Garud_H")

    variables.validate(ds, {call_genotype: variables.call_genotype_spec})

    gt = ds[call_genotype]

    # convert sample cohorts to haplotype layout
    sc = ds.sample_cohort.values
    hsc = np.stack((sc, sc), axis=1).ravel()  # TODO: assumes diploid
    n_cohorts = sc.max() + 1  # 0-based indexing
    cohorts = cohorts or range(n_cohorts)
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts", None))

    gh = window_statistic(
        gt,
        lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts, ct),
        ds.window_start.values,
        ds.window_stop.values,
        dtype=np.float64,
        # first chunks dimension is windows, computed in window_statistic
        chunks=(-1, n_cohorts, N_GARUD_H_STATS),
    )
    n_windows = ds.window_start.shape[0]
    assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
    new_ds = create_dataset({
        variables.stat_Garud_h1: (
            ("windows", "cohorts"),
            gh[:, :, 0],
        ),
        variables.stat_Garud_h12: (
            ("windows", "cohorts"),
            gh[:, :, 1],
        ),
        variables.stat_Garud_h123: (
            ("windows", "cohorts"),
            gh[:, :, 2],
        ),
        variables.stat_Garud_h2_h1: (
            ("windows", "cohorts"),
            gh[:, :, 3],
        ),
    })

    return conditional_merge_datasets(ds, new_ds, merge)
Exemplo n.º 2
0
def pbs(
    ds: Dataset,
    *,
    stat_Fst: Hashable = variables.stat_Fst,
    cohorts: Optional[Sequence[Union[Tuple[int, int, int],
                                     Tuple[str, str, str]]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the population branching statistic (PBS) between cohort triples.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_Fst
        Fst variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_Fst_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`Fst`.
    cohorts
        The cohort triples to compute statistics for, specified as a sequence of
        tuples of cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the PBS value between cohort triples, as defined by
    :data:`sgkit.variables.stat_pbs_spec`.
    Shape (variants, cohorts, cohorts, cohorts), or
    (windows, cohorts, cohorts, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=6)

    >>> # Divide samples into three named cohorts
    >>> n_cohorts = 3
    >>> sample_cohort = np.repeat(range(n_cohorts), ds.dims["samples"] // n_cohorts)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
    >>> cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})

    >>> # Divide into two windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
    array([ 0.      , -0.160898])
    """

    ds = define_variable_if_absent(ds, variables.stat_Fst, stat_Fst, Fst)
    variables.validate(ds, {stat_Fst: variables.stat_Fst_spec})

    fst = ds[variables.stat_Fst]
    fst = fst.clip(min=0, max=(1 - np.finfo(float).epsneg))

    t = -np.log(1 - fst)
    n_cohorts = ds.dims["cohorts"]
    n_windows = ds.dims["windows"]
    assert_array_shape(t, n_windows, n_cohorts, n_cohorts)

    # calculate PBS triples
    t = da.asarray(t)
    shape = (t.chunks[0], n_cohorts, n_cohorts, n_cohorts)

    cohorts = cohorts or list(itertools.combinations(range(n_cohorts),
                                                     3))  # type: ignore
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None))

    p = da.map_blocks(lambda t: _pbs_cohorts(t, ct),
                      t,
                      chunks=shape,
                      new_axis=3,
                      dtype=np.float64)
    assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts)

    new_ds = create_dataset({
        variables.stat_pbs:
        (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)
    })
    return conditional_merge_datasets(ds, new_ds, merge)
Exemplo n.º 3
0
def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values corresponding to
    the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
            [[2, 2],
            [3, 1]],
    <BLANKLINE>
            [[2, 2],
            [2, 2]],
    <BLANKLINE>
            [[1, 3],
            [1, 3]],
    <BLANKLINE>
            [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort])
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    C = da.empty(n_cohorts, dtype=np.uint8)

    G = da.asarray(ds.call_genotype)
    shape = (G.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32)
    assert_array_shape(
        AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2]
    )

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset(
        {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
Exemplo n.º 4
0
def divergence(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute divergence between pairs of cohorts.

    The entry at (i, j) is the divergence between for cohort i and cohort j,
    except for the case where i and j are the same, in which case the entry
    is the diversity for cohort i.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the divergence value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_divergence_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5       , 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
        [[0.66666667, 0.5       ],
            [0.5       , 0.5       ]],
    <BLANKLINE>
        [[0.66666667, 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
        [[0.5       , 0.375     ],
            [0.375     , 0.5       ]],
    <BLANKLINE>
        [[0.5       , 0.625     ],
            [0.625     , 0.5       ]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1.83333333, 1.5       ],
            [1.5       , 1.83333333]],
    <BLANKLINE>
        [[1.        , 1.        ],
            [1.        , 1.        ]]])
    """

    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    ac = ds[cohort_allele_count]

    n_variants = ds.dims["variants"]
    n_cohorts = ds.dims["cohorts"]
    ac = da.asarray(ac)
    shape = (ac.chunks[0], n_cohorts, n_cohorts)
    d = da.map_blocks(_divergence, ac, chunks=shape, dtype=np.float64)
    assert_array_shape(d, n_variants, n_cohorts, n_cohorts)

    if has_windows(ds):
        div = window_statistic(
            d,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=d.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("windows", "cohorts_0", "cohorts_1"),
                div,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("variants", "cohorts_0", "cohorts_1"),
                d,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)
Exemplo n.º 5
0
def test_assert_array_shape():
    x = da.zeros((10, 10))
    assert_array_shape(x, 10, 10)
    with pytest.raises(AssertionError):
        assert_array_shape(x, 9, 10)