def convert_probability_to_call(
    ds: Dataset,
    call_genotype_probability: str = variables.call_genotype_probability,
    threshold: float = 0.9,
    merge: bool = True,
) -> Dataset:
    """
    Turn genotype probabilities into hard genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype probabilities, such as from
        :func:`sgkit.io.bgen.read_bgen`.
    call_genotype_probability
        Name of the genotype probability variable to convert, as defined by
        :data:`sgkit.variables.call_genotype_probability_spec`.
    threshold
        Probability threshold in [0, 1] that must be met or exceeded by at
        least one genotype probability in order for any calls to be made --
        all values will be -1 (missing) otherwise. Default value is 0.9.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `call_genotype` (variants, samples, ploidy): Converted hard calls.
      Defined by :data:`sgkit.variables.call_genotype_spec`.

    - `call_genotype_mask` (variants, samples, ploidy): Mask for converted
      hard calls (True wherever the call is negative).
      Defined by :data:`sgkit.variables.call_genotype_mask_spec`.

    Raises
    ------
    ValueError
        If ``threshold`` is not within [0, 1].
    NotImplementedError
        If the ``genotypes`` dimension does not have exactly 3 entries.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not (0 <= threshold <= 1):
        raise ValueError(f"Threshold must be float in [0, 1], not {threshold}.")

    variables.validate(
        ds, {call_genotype_probability: variables.call_genotype_probability_spec}
    )

    n_genotypes = ds.dims["genotypes"]
    if n_genotypes != 3:
        raise NotImplementedError(
            f"Hard call conversion only supported for diploid, biallelic genotypes; "
            f"num genotypes in provided probabilities array = {n_genotypes}."
        )

    probs = da.asarray(ds[call_genotype_probability])
    # The conversion kernel needs every genotype probability of a call in
    # a single chunk, so collapse any chunking along the last axis.
    if len(probs.chunks[2]) > 1:
        probs = probs.rechunk((None, None, -1))

    # Placeholder array passed through to the conversion kernel.
    ploidy_template = da.empty(2, dtype=np.uint8)
    calls = _convert_probability_to_call(probs, ploidy_template, threshold)

    call_dims = ("variants", "samples", "ploidy")
    new_ds = create_dataset(
        {
            variables.call_genotype: (call_dims, calls),
            variables.call_genotype_mask: (call_dims, calls < 0),
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def count_genotypes(
    ds: Dataset,
    dim: Dimension,
    call_genotype: Hashable = variables.call_genotype,
    call_genotype_mask: Hashable = variables.call_genotype_mask,
    merge: bool = True,
) -> Dataset:
    """Count heterozygous, homozygous and non-reference calls along ``dim``.

    Parameters
    ----------
    ds
        Dataset containing genotype calls and their mask.
    dim
        Dimension to sum over. The output variable names are prefixed with
        ``_swap(dim)`` minus its final character (e.g. ``dim="samples"``
        yields ``variant_n_het``).
    call_genotype
        Name of the genotype call variable, validated against
        :data:`sgkit.variables.call_genotype_spec`.
    call_genotype_mask
        Name of the genotype mask variable, validated against
        :data:`sgkit.variables.call_genotype_mask_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.

    Returns
    -------
    A dataset with ``<prefix>_n_het``, ``<prefix>_n_hom_ref``,
    ``<prefix>_n_hom_alt`` and ``<prefix>_n_non_ref``. Calls with any masked
    allele contribute to none of the counts.
    """
    variables.validate(
        ds,
        {
            call_genotype_mask: variables.call_genotype_mask_spec,
            call_genotype: variables.call_genotype_spec,
        },
    )
    prefix = _swap(dim)[:-1]

    any_missing = ds[call_genotype_mask].any(dim="ploidy")
    genotypes = ds[call_genotype]

    is_hom_ref = (genotypes == 0).all(dim="ploidy")
    # Homozygous alternate: every allele is non-reference and equals the first.
    is_hom_alt = ((genotypes > 0) & (genotypes[..., 0] == genotypes)).all(
        dim="ploidy"
    )
    is_non_ref = (genotypes > 0).any(dim="ploidy")
    is_het = ~(is_hom_alt | is_hom_ref)

    flags_by_suffix = {
        "n_het": is_het,
        "n_hom_ref": is_hom_ref,
        "n_hom_alt": is_hom_alt,
        "n_non_ref": is_non_ref,
    }
    # Zero out any call with a missing allele before summing; without this a
    # partially missing call would be counted as heterozygous.
    new_ds = create_dataset(
        {
            f"{prefix}_{suffix}": xr.where(any_missing, False, flags).sum(dim=dim)  # type: ignore[no-untyped-call]
            for suffix, flags in flags_by_suffix.items()
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def call_rate(ds: Dataset, dim: Dimension, call_genotype_mask: Hashable) -> Dataset:
    """Count called genotypes along ``dim`` and express them as a rate.

    A call counts as "called" when none of its ploidy entries is masked.

    Parameters
    ----------
    ds
        Dataset containing the genotype mask.
    dim
        Dimension to aggregate over.
    call_genotype_mask
        Name of the boolean genotype mask variable in ``ds``.

    Returns
    -------
    A dataset with ``<prefix>_n_called`` and ``<prefix>_call_rate``, where
    ``<prefix>`` is ``_swap(dim)`` without its final character and the rate
    is the called count divided by the size of ``dim``.
    """
    prefix = _swap(dim)[:-1]
    has_missing = ds[call_genotype_mask].any(dim="ploidy")
    n_called = (~has_missing).sum(dim=dim)
    rate = n_called / ds.dims[dim]
    return create_dataset(
        {
            f"{prefix}_n_called": n_called,
            f"{prefix}_call_rate": rate,
        }
    )
def count_variant_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute allele count from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.variant_allele_count_spec`
    of allele counts with shape (variants, alleles) and values corresponding
    to the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_variant_alleles(ds)["variant_allele_count"].values # doctest: +SKIP
    array([[2, 2],
           [1, 3],
           [2, 2],
           [4, 0]], dtype=uint64)
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    # Collapse the per-sample counts into a per-variant total.
    per_variant = ds[call_allele_count].sum(dim="samples")
    new_ds = create_dataset({variables.variant_allele_count: per_variant})
    return conditional_merge_datasets(ds, new_ds, merge)
def infer_call_genotype_fill(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Infer which genotype call entries are fill (non-allele) values.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.

    Returns
    -------
    A dataset containing ``variables.call_genotype_fill``: a boolean array
    shaped like the genotype calls. When the genotype variable carries a
    truthy ``mixed_ploidy`` attribute, entries less than -1 are flagged;
    otherwise every entry is False.
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
    genotypes = ds[call_genotype]
    # Read the flag from the variable named by the caller, not from the
    # default-named variable, so custom variable names are honoured.
    mixed_ploidy = genotypes.attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        call_genotype_fill = genotypes < -1
    else:
        call_genotype_fill = xr.full_like(genotypes, False, "b1")
    new_ds = create_dataset({variables.call_genotype_fill: call_genotype_fill})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
def infer_call_ploidy(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    call_genotype_non_allele: Hashable = variables.call_genotype_non_allele,
    merge: bool = True,
) -> Dataset:
    """Infer the ploidy of each call genotype based on the number of
    non-allele values in each call genotype.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    call_genotype_non_allele
        Input variable name holding call_genotype_non_allele as defined by
        :data:`sgkit.variables.call_genotype_non_allele_spec`.
        If the variable is not present in ``ds``, it will be computed
        assuming that allele values less than -1 are non-alleles in mixed
        ploidy datasets, or that no non-alleles are present in fixed ploidy
        datasets.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_ploidy_spec`.
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_genotype_non_allele,
        call_genotype_non_allele,
        infer_non_alleles,
    )
    # Use the caller-specified genotype variable throughout; the default
    # name may be absent (or a different array) when a custom name is given.
    genotypes = ds[call_genotype]
    mixed_ploidy = genotypes.attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        # Ploidy of a call = number of entries that are real alleles.
        call_ploidy = (~ds[call_genotype_non_allele]).sum(axis=-1)  # type: ignore[operator]
    else:
        # Fixed ploidy: every call has as many alleles as the last axis holds.
        ploidy = genotypes.shape[-1]
        call_ploidy = xr.full_like(genotypes[..., 0], ploidy)

    new_ds = create_dataset({variables.call_ploidy: call_ploidy})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
def infer_sample_ploidy(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    call_ploidy: Hashable = variables.call_ploidy,
    merge: bool = True,
) -> Dataset:
    """Infer the ploidy of each sample across all variants based on the
    number of non-allele values in call genotypes.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    call_ploidy
        Input variable name holding call_ploidy as defined by
        :data:`sgkit.variables.call_ploidy_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`infer_call_ploidy`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.sample_ploidy_spec`.
    For mixed-ploidy data, a sample whose call ploidy differs from that of
    the first entry along the leading axis is assigned -1.
    """
    ds = define_variable_if_absent(
        ds, variables.call_ploidy, call_ploidy, infer_call_ploidy
    )
    # Use the caller-specified genotype variable rather than the default name.
    genotypes = ds[call_genotype]
    mixed_ploidy = genotypes.attrs.get("mixed_ploidy", False)
    if mixed_ploidy:
        first_ploidy = ds[call_ploidy][0, :]
        # Compare every entry against the first one and reduce over the last
        # axis to flag samples whose ploidy never changes.
        sample_ploidy_fixed = (first_ploidy == ds[call_ploidy]).all(axis=-1)
        sample_ploidy = xr.where(sample_ploidy_fixed, first_ploidy, -1)  # type: ignore[no-untyped-call]
    else:
        ploidy = genotypes.shape[-1]
        sample_ploidy = xr.full_like(ds[call_ploidy][0, ...], ploidy)

    new_ds = create_dataset({variables.sample_ploidy: sample_ploidy})
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
def _window_per_contig(
    ds: Dataset,
    variant_contig: Hashable,
    merge: bool,
    windowing_fn: Callable[..., Any],
    *args: Any,
    **kwargs: Any,
) -> Dataset:
    """Build windows separately within each contig and combine the results.

    Parameters
    ----------
    ds
        Dataset with a ``variants`` dimension and a ``contigs`` attribute.
    variant_contig
        Name of the variable in ``ds`` holding the contig index of each
        variant.
    merge
        If True, merge the input dataset and the computed output variables
        into a single dataset, otherwise return only the computed output
        variables.
    windowing_fn
        Callable invoked once per contig as
        ``windowing_fn(start, stop, *args, **kwargs)`` where ``start`` and
        ``stop`` bound the contig's variants; it must return a
        ``(starts, stops)`` pair of arrays.
    *args, **kwargs
        Forwarded unchanged to ``windowing_fn``.

    Returns
    -------
    A dataset with ``window_contig``, ``window_start`` and ``window_stop``
    variables along the ``windows`` dimension.
    """
    n_variants = ds.dims["variants"]
    n_contigs = len(ds.attrs["contigs"])
    contig_ids = np.arange(n_contigs)

    # Look up the variable named by the caller instead of a hard-coded key.
    contig_of_variant = ds[variant_contig]
    # searchsorted needs sorted input, so this relies on variants being
    # ordered by contig index.
    contig_starts = np.searchsorted(contig_of_variant.values, contig_ids)
    contig_bounds = np.append(contig_starts, [n_variants], axis=0)  # type: ignore[no-untyped-call]

    contig_window_contigs = []
    contig_window_starts = []
    contig_window_stops = []
    for i in range(n_contigs):
        starts, stops = windowing_fn(
            contig_bounds[i], contig_bounds[i + 1], *args, **kwargs
        )
        contig_window_starts.append(starts)
        contig_window_stops.append(stops)
        # Tag every window of this contig with the contig's index.
        contig_window_contigs.append(np.full_like(starts, i))

    window_contigs = np.concatenate(contig_window_contigs)  # type: ignore[no-untyped-call]
    window_starts = np.concatenate(contig_window_starts)  # type: ignore[no-untyped-call]
    window_stops = np.concatenate(contig_window_stops)  # type: ignore[no-untyped-call]

    new_ds = create_dataset(
        {
            window_contig: ("windows", window_contigs),
            window_start: ("windows", window_starts),
            window_stop: ("windows", window_stops),
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def allele_frequency(
    ds: Dataset,
    call_genotype_mask: Hashable,
    variant_allele_count: Hashable,
) -> Dataset:
    """Compute per-variant allele totals and allele frequencies.

    Parameters
    ----------
    ds
        Dataset containing the genotype mask and, optionally, precomputed
        variant allele counts.
    call_genotype_mask
        Name of the genotype mask variable; it is stacked over
        ``samples`` and ``ploidy`` to count unmasked calls per variant.
    variant_allele_count
        Name of the variant allele count variable. If present in ``ds`` it
        is validated and used; otherwise counts are computed with
        :func:`count_variant_alleles` and included in the result.

    Returns
    -------
    A dataset containing ``variant_allele_total`` and
    ``variant_allele_frequency``, plus ``variant_allele_count`` when it had
    to be computed here.
    """
    data_vars: Dict[Hashable, Any] = {}

    if variant_allele_count not in ds:
        # Counts are missing: derive them and hand them back to the caller.
        AC = count_variant_alleles(ds, merge=False)[variables.variant_allele_count]
        data_vars[variables.variant_allele_count] = AC
    else:
        variables.validate(
            ds, {variant_allele_count: variables.variant_allele_count_spec}
        )
        AC = ds[variant_allele_count]

    # Flatten samples x ploidy into one axis and count the unmasked entries.
    flat_mask = ds[call_genotype_mask].stack(calls=("samples", "ploidy"))
    AN = (~flat_mask).sum(dim="calls")
    assert AN.shape == (ds.dims["variants"],)

    data_vars[variables.variant_allele_total] = AN
    data_vars[variables.variant_allele_frequency] = AC / AN
    return create_dataset(data_vars)
def diversity(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute diversity from cohort allele counts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the diversity values, as defined by
    :data:`sgkit.variables.stat_diversity_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along
    the samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5       , 0.66666667],
           [0.66666667, 0.5       ],
           [0.66666667, 0.66666667],
           [0.5       , 0.5       ],
           [0.5       , 0.5       ]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.83333333, 1.83333333],
           [1.        , 1.        ]])
    """
    ds = define_variable_if_absent(
        ds, variables.cohort_allele_count, cohort_allele_count, count_cohort_alleles
    )
    variables.validate(ds, {cohort_allele_count: variables.cohort_allele_count_spec})

    counts = ds[cohort_allele_count]
    totals = counts.sum(axis=2)
    # Number of allele pairs overall, and of pairs sharing the same allele.
    total_pairs = totals * (totals - 1) / 2
    same_pairs = (counts * (counts - 1) / 2).sum(axis=2)
    differing_pairs = total_pairs - same_pairs
    # Mask zero pair counts (they become NaN) to avoid dividing by zero.
    safe_total_pairs = total_pairs.where(total_pairs != 0)
    pi = differing_pairs / safe_total_pairs

    if has_windows(ds):
        leading_dim = "windows"
        values = window_statistic(
            pi,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=pi.dtype,
            axis=0,
        )
    else:
        leading_dim = "variants"
        values = pi

    new_ds = create_dataset(
        {variables.stat_diversity: ((leading_dim, "cohorts"), values)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def divergence(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute divergence between pairs of cohorts.

    The entry at (i, j) is the divergence between cohort i and cohort j,
    except for the case where i and j are the same, in which case the entry
    is the diversity for cohort i.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the divergence value between pairs of cohorts, as
    defined by :data:`sgkit.variables.stat_divergence_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if
    windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along
    the samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5       , 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
           [[0.66666667, 0.5       ],
            [0.5       , 0.5       ]],
    <BLANKLINE>
           [[0.66666667, 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
           [[0.5       , 0.375     ],
            [0.375     , 0.5       ]],
    <BLANKLINE>
           [[0.5       , 0.625     ],
            [0.625     , 0.5       ]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1.83333333, 1.5       ],
            [1.5       , 1.83333333]],
    <BLANKLINE>
           [[1.        , 1.        ],
            [1.        , 1.        ]]])
    """
    ds = define_variable_if_absent(
        ds, variables.cohort_allele_count, cohort_allele_count, count_cohort_alleles
    )
    variables.validate(ds, {cohort_allele_count: variables.cohort_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_cohorts = ds.dims["cohorts"]
    counts = da.asarray(ds[cohort_allele_count])

    # Keep the variants chunking; each block yields a cohorts x cohorts
    # matrix per variant.
    out_chunks = (counts.chunks[0], n_cohorts, n_cohorts)
    per_variant = da.map_blocks(
        _divergence, counts, chunks=out_chunks, dtype=np.float64
    )
    assert_array_shape(per_variant, n_variants, n_cohorts, n_cohorts)

    if has_windows(ds):
        leading_dim = "windows"
        values = window_statistic(
            per_variant,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=per_variant.dtype,
            axis=0,
        )
    else:
        leading_dim = "variants"
        values = per_variant

    new_ds = create_dataset(
        {
            variables.stat_divergence: (
                (leading_dim, "cohorts_0", "cohorts_1"),
                values,
            )
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def identity_by_state(
    ds: Dataset,
    *,
    call_allele_frequency: Hashable = variables.call_allele_frequency,
    merge: bool = True,
) -> Dataset:
    """Compute identity by state (IBS) probabilities between
    all pairs of samples.

    The IBS probability between a pair of individuals is the
    probability that a randomly drawn allele from the first individual
    is identical in state with a randomly drawn allele from the second
    individual at a single random locus.

    Parameters
    ----------
    ds
        Dataset containing call genotype alleles.
    call_allele_frequency
        Input variable name holding call_allele_frequency as defined by
        :data:`sgkit.variables.call_allele_frequency_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`call_allele_frequencies`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_identity_by_state_spec`
    which is a matrix of pairwise IBS probabilities among all samples.
    The dimensions are named ``samples_0`` and ``samples_1``.

    Raises
    ------
    NotImplementedError
        If the variable holding call_allele_frequency is chunked along the
        samples dimension.

    Warnings
    --------
    This method does not currently support datasets that are chunked along
    the samples dimension.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=2, n_sample=3, seed=2)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2
    variants
    0         0/0  1/1  1/0
    1         1/1  1/1  1/0
    >>> sg.identity_by_state(ds)["stat_identity_by_state"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1. , 0.5, 0.5],
           [0.5, 1. , 0.5],
           [0.5, 0.5, 0.5]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_allele_frequency,
        call_allele_frequency,
        call_allele_frequencies,
    )
    variables.validate(
        ds, {call_allele_frequency: variables.call_allele_frequency_spec}
    )

    freqs = da.asarray(ds[call_allele_frequency])
    samples_chunks = freqs.chunks[1]
    if len(samples_chunks) > 1:
        raise NotImplementedError(
            "identity_by_state does not support chunking in the samples dimension"
        )

    # NaN frequencies contribute nothing to the numerator.
    freqs_filled = da.where(da.isnan(freqs), 0.0, freqs)
    matches = da.einsum("ixj,iyj->xy", freqs_filled, freqs_filled)

    # Sum of frequencies over alleles, ignoring NaN, then paired up per
    # sample pair and summed over variants for the denominator.
    called = da.nansum(freqs, axis=-1)
    pair_counts = da.einsum("ix,iy->xy", called, called)
    # A zero count becomes NaN so the division yields NaN, not an error.
    safe_counts = da.where(pair_counts == 0, np.nan, pair_counts)

    ibs = matches / safe_counts
    new_ds = create_dataset(
        {variables.stat_identity_by_state: (("samples_0", "samples_1"), ibs)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def Weir_Goudet_beta(
    ds: Dataset,
    *,
    stat_identity_by_state: Hashable = variables.stat_identity_by_state,
    merge: bool = True,
) -> Dataset:
    """Estimate pairwise beta between all pairs of samples as described
    in Weir and Goudet 2017 [1].

    Beta is the kinship scaled by the average kinship of all pairs of
    individuals in the dataset such that the non-diagonal (non-self) values
    sum to zero.

    Beta may be corrected to more accurately reflect pedigree based kinship
    estimates using the formula
    :math:`\\hat{\\beta}^c=\\frac{\\hat{\\beta}-\\hat{\\beta}_0}{1-\\hat{\\beta}_0}`
    where :math:`\\hat{\\beta}_0` is the estimated beta between samples which are
    known to be unrelated [1].

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_identity_by_state
        Input variable name holding stat_identity_by_state as defined
        by :data:`sgkit.variables.stat_identity_by_state_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`identity_by_state`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_Weir_Goudet_beta_spec`
    which is a matrix of estimated pairwise kinship relative to the average
    kinship of all pairs of individuals in the dataset.
    The dimensions are named ``samples_0`` and ``samples_1``.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=3, n_sample=3, n_allele=10, seed=3)
    >>> # sample 2 "inherits" alleles from samples 0 and 1
    >>> ds.call_genotype.data[:, 2, 0] = ds.call_genotype.data[:, 0, 0]
    >>> ds.call_genotype.data[:, 2, 1] = ds.call_genotype.data[:, 1, 0]
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2
    variants
    0         7/1  8/6  7/8
    1         9/5  3/6  9/3
    2         8/8  8/3  8/8
    >>> # estimate beta
    >>> ds = sg.Weir_Goudet_beta(ds).compute()
    >>> ds.stat_Weir_Goudet_beta.values # doctest: +NORMALIZE_WHITESPACE
    array([[ 0.5 , -0.25,  0.25],
           [-0.25,  0.25,  0.  ],
           [ 0.25,  0.  ,  0.5 ]])
    >>> # correct beta assuming least related samples are unrelated
    >>> beta = ds.stat_Weir_Goudet_beta
    >>> beta0 = beta.min()
    >>> beta_corrected = (beta - beta0) / (1 - beta0)
    >>> beta_corrected.values # doctest: +NORMALIZE_WHITESPACE
    array([[0.6, 0. , 0.4],
           [0. , 0.4, 0.2],
           [0.4, 0.2, 0.6]])

    References
    ----------
    [1] - Bruce, S. Weir, and Jérôme Goudet 2017.
    "A Unified Characterization of Population Structure and Relatedness."
    Genetics 206 (4): 2085-2103.
    """
    ds = define_variable_if_absent(
        ds, variables.stat_identity_by_state, stat_identity_by_state, identity_by_state
    )
    variables.validate(
        ds, {stat_identity_by_state: variables.stat_identity_by_state_spec}
    )
    ibs = ds[stat_identity_by_state].data

    # Average matching = mean of the non-NaN entries strictly below the
    # diagonal, i.e. each distinct non-self pair counted once.
    lower_sum = da.nansum(da.tril(ibs, -1))
    lower_count = da.nansum(da.tril(~da.isnan(ibs), -1))
    mean_matching = lower_sum / lower_count

    beta = (ibs - mean_matching) / (1 - mean_matching)
    new_ds = create_dataset(
        {variables.stat_Weir_Goudet_beta: (("samples_0", "samples_1"), beta)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def hardy_weinberg_test(
    ds: Dataset,
    *,
    genotype_counts: Optional[Hashable] = None,
    ploidy: Optional[int] = None,
    alleles: Optional[int] = None,
    merge: bool = True,
) -> Dataset:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    ds
        Dataset containing genotype calls or precomputed genotype counts.
    genotype_counts
        Name of variable containing precomputed genotype counts, by default
        None. If not provided, these counts will be computed automatically
        from genotype calls. If present, must correspond to an (`N`, 3) array
        where `N` is equal to the number of variants and the 3 columns contain
        heterozygous, homozygous reference, and homozygous alternate counts
        (in that order) across all samples for a variant.
    ploidy
        Genotype ploidy, defaults to ``ploidy`` dimension of provided dataset.
        If the `ploidy` dimension is not present, then this value must be set
        explicitly. Currently HWE calculations are only supported for diploid
        datasets, i.e. ``ploidy`` must equal 2.
    alleles
        Genotype allele count, defaults to ``alleles`` dimension of provided
        dataset. If the `alleles` dimension is not present, then this value
        must be set explicitly. Currently HWE calculations are only supported
        for biallelic datasets, i.e. ``alleles`` must equal 2.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset containing ``variant_hwe_p_value`` along the ``variants``
    dimension: P values from the HWE test for each variant as float in
    [0, 1].

    Raises
    ------
    ValueError
        If ``ploidy`` or ``alleles`` is neither given nor available as a
        dataset dimension.
    NotImplementedError
        If ploidy of provided dataset != 2
    NotImplementedError
        If maximum number of alleles in provided dataset != 2

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
      “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
      Human Genetics 76 (5): 887–93.
    """
    # Fall back to the dataset dimension when no explicit value is given.
    n_ploidy = ploidy or ds.dims.get("ploidy")
    if not n_ploidy:
        raise ValueError(
            "`ploidy` parameter must be set when not present as dataset dimension."
        )
    if n_ploidy != 2:
        raise NotImplementedError("HWE test only implemented for diploid genotypes")

    n_alleles = alleles or ds.dims.get("alleles")
    if not n_alleles:
        raise ValueError(
            "`alleles` parameter must be set when not present as dataset dimension."
        )
    if n_alleles != 2:
        raise NotImplementedError("HWE test only implemented for biallelic genotypes")

    if genotype_counts is None:
        # No precomputed counts: derive them from the genotype calls.
        ds = count_genotypes(ds, dim="samples")
        count_names = ["variant_n_het", "variant_n_hom_ref", "variant_n_hom_alt"]
        obs = [da.asarray(ds[name]) for name in count_names]
    else:
        variables.validate(ds, {genotype_counts: variables.genotype_counts_spec})
        # Transpose so that iterating yields one array per count column.
        obs = list(da.asarray(ds[genotype_counts]).T)

    p_values = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs)
    new_ds = create_dataset({variables.variant_hwe_p_value: ("variants", p_values)})
    return conditional_merge_datasets(ds, new_ds, merge)
def count_call_alleles(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Compute per sample allele counts from genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_count_spec`
    of allele counts with shape (variants, samples, alleles) and values
    corresponding to the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})

    n_alleles = ds.dims["alleles"]
    genotypes = da.asarray(ds[call_genotype])
    # Output keeps the variants/samples chunking; the ploidy axis is replaced
    # by a single-chunk alleles axis.
    out_chunks = (genotypes.chunks[0], genotypes.chunks[1], n_alleles)
    # Template array handed to the counting kernel alongside the genotypes.
    allele_template = da.empty(n_alleles, dtype=np.uint8)

    counts = da.map_blocks(
        count_alleles,
        genotypes,
        allele_template,
        chunks=out_chunks,
        drop_axis=2,
        new_axis=2,
    )
    new_ds = create_dataset(
        {variables.call_allele_count: (("variants", "samples", "alleles"), counts)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def individual_heterozygosity(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per call individual heterozygosity.

    Individual heterozygosity is the probability that two alleles
    drawn at random without replacement, from an individual at a
    given site, are not identical in state. Therefore, individual
    heterozygosity is defined for diploid and polyploid calls but
    will return nan in the case of haploid calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_heterozygosity_spec`
    of per genotype observed heterozygosity with shape (variants, samples)
    containing values within the interval [0, 1] or nan if ploidy < 2.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.individual_heterozygosity(ds)["call_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1., 1.],
           [1., 0.],
           [1., 1.],
           [0., 0.]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    # Index by the caller-supplied name; attribute access on the default
    # name would ignore a custom ``call_allele_count`` argument.
    AC = da.asarray(ds[call_allele_count])
    K = AC.sum(axis=-1)
    # use nan denominator to avoid divide by zero with K - 1
    K2 = da.where(K > 1, K, np.nan)
    AF = AC / K2[..., None]
    # 1 - sum(p^2) is the with-replacement estimate; K / (K - 1) converts
    # it to sampling without replacement.
    HI = (1 - da.sum(AF ** 2, axis=-1)) * (K / (K2 - 1))

    new_ds = create_dataset(
        {variables.call_heterozygosity: (("variants", "samples"), HI)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def call_allele_frequencies(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute per sample allele frequencies from genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_frequency_spec`
    of allele frequencies with shape (variants, samples, alleles) and values
    corresponding to the frequency of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.call_allele_frequencies(ds)["call_allele_frequency"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5, 0.5],
            [0.5, 0.5]],
    <BLANKLINE>
           [[0.5, 0.5],
            [0. , 1. ]],
    <BLANKLINE>
           [[0.5, 0.5],
            [0.5, 0.5]],
    <BLANKLINE>
           [[1. , 0. ],
            [1. , 0. ]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    counts = ds[call_allele_count]
    totals = counts.sum(dim="alleles")
    # Calls with no counted alleles get a NaN denominator (and thus NaN
    # frequencies) instead of triggering a division by zero.
    safe_totals = xr.where(totals > 0, totals, np.nan)  # type: ignore[no-untyped-call]
    frequencies = counts / safe_totals

    new_ds = create_dataset({variables.call_allele_frequency: frequencies})
    return conditional_merge_datasets(ds, new_ds, merge)
def window(
    ds: Dataset,
    size: int,
    step: Optional[int] = None,
    merge: bool = True,
) -> Dataset:
    """Add fixed-size windowing information to a dataset.

    Windows are defined over the ``variants`` dimension, and are
    used by some downstream functions to calculate statistics for
    each window. Windows are built independently within each contig, using
    the ``variant_contig`` variable and the ``contigs`` attribute of ``ds``.

    Parameters
    ----------
    ds
        Genotype call dataset.
    size
        The window size (number of variants).
    step
        The distance (number of variants) between start positions of windows.
        Defaults to ``size``.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - ``window_contig`` (windows): The index of the contig each window
      belongs to.
    - :data:`sgkit.variables.window_start_spec` (windows):
      The index values of window start positions.
    - :data:`sgkit.variables.window_stop_spec` (windows):
      The index values of window stop positions.
    """
    step = step or size
    # The per-contig bookkeeping is identical to the generic helper, so
    # delegate instead of duplicating it; _get_windows receives
    # (start, stop, size, step) for every contig.
    return _window_per_contig(ds, "variant_contig", merge, _get_windows, size, step)
def pbs(
    ds: Dataset,
    *,
    stat_Fst: Hashable = variables.stat_Fst,
    cohorts: Optional[
        Sequence[Union[Tuple[int, int, int], Tuple[str, str, str]]]
    ] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the population branching statistic (PBS) between cohort triples.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_Fst
        Fst variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_Fst_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`Fst`.
    cohorts
        The cohort triples to compute statistics for, specified as a sequence of
        tuples of cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the PBS value between cohort triples, as defined by
    :data:`sgkit.variables.stat_pbs_spec`.
    Shape (variants, cohorts, cohorts, cohorts), or
    (windows, cohorts, cohorts, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=6)

    >>> # Divide samples into three named cohorts
    >>> n_cohorts = 3
    >>> sample_cohort = np.repeat(range(n_cohorts), ds.dims["samples"] // n_cohorts)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
    >>> cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})

    >>> # Divide into two windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
    array([ 0.      , -0.160898])
    """
    ds = define_variable_if_absent(ds, variables.stat_Fst, stat_Fst, Fst)
    variables.validate(ds, {stat_Fst: variables.stat_Fst_spec})

    # Read the variable under the caller-supplied name (the one just
    # validated), not the default name, so custom variable names are honoured.
    fst = ds[stat_Fst]
    # Clip into [0, 1) so that the log transform below stays finite.
    fst = fst.clip(min=0, max=(1 - np.finfo(float).epsneg))

    t = -np.log(1 - fst)
    n_cohorts = ds.dims["cohorts"]
    # NOTE: the leading output dimension is taken from ds.dims["windows"].
    n_windows = ds.dims["windows"]
    assert_array_shape(t, n_windows, n_cohorts, n_cohorts)

    # calculate PBS triples
    t = da.asarray(t)
    shape = (t.chunks[0], n_cohorts, n_cohorts, n_cohorts)

    cohorts = cohorts or list(itertools.combinations(range(n_cohorts), 3))  # type: ignore
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None))

    p = da.map_blocks(
        lambda t: _pbs_cohorts(t, ct), t, chunks=shape, new_axis=3, dtype=np.float64
    )
    assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts)

    new_ds = create_dataset(
        {variables.stat_pbs: (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def Fst(
    ds: Dataset,
    *,
    estimator: Optional[str] = None,
    stat_divergence: Hashable = variables.stat_divergence,
    merge: bool = True,
) -> Dataset:
    """Compute Fst between pairs of cohorts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    estimator
        Determines the formula to use for computing Fst.
        If None (the default), or ``Hudson``, Fst is calculated
        using the method of Hudson (1992) elaborated by Bhatia et al. (2013),
        (the same estimator as scikit-allel).
        Other supported estimators include ``Nei`` (1986), (the same estimator
        as tskit).
    stat_divergence
        Divergence variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_divergence_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`divergence`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Fst value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_Fst_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if
    windowing information is available.

    Raises
    ------
    ValueError
        If ``estimator`` is given but is not one of the known estimators.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
           [[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
           [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
           [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
           [[        nan,  0.2       ],
            [ 0.2       ,         nan]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.22222222],
            [-0.22222222,         nan]],
    <BLANKLINE>
           [[        nan,  0.        ],
            [ 0.        ,         nan]]])
    """
    known_estimators = {"Hudson": _Fst_Hudson, "Nei": _Fst_Nei}
    if estimator is not None and estimator not in known_estimators:
        raise ValueError(
            f"Estimator '{estimator}' is not a known estimator: {known_estimators.keys()}"
        )
    estimator = estimator or "Hudson"
    ds = define_variable_if_absent(
        ds, variables.stat_divergence, stat_divergence, divergence
    )
    variables.validate(ds, {stat_divergence: variables.stat_divergence_spec})

    n_cohorts = ds.dims["cohorts"]
    # Look the divergence up by the caller-supplied (and just validated) name
    # rather than the hard-coded ``stat_divergence`` attribute, so custom
    # variable names are honoured.
    gs = da.asarray(ds[stat_divergence])
    shape = (gs.chunks[0], n_cohorts, n_cohorts)
    fst = da.map_blocks(known_estimators[estimator], gs, chunks=shape, dtype=np.float64)
    # TODO: reinstate assert (first dim could be either variants or windows)
    # assert_array_shape(fst, n_windows, n_cohorts, n_cohorts)
    new_ds = create_dataset(
        {variables.stat_Fst: (("windows", "cohorts_0", "cohorts_1"), fst)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def Tajimas_D(
    ds: Dataset,
    *,
    variant_allele_count: Hashable = variables.variant_allele_count,
    stat_diversity: Hashable = variables.stat_diversity,
    merge: bool = True,
) -> Dataset:
    """Compute Tajimas' D for a genotype call dataset.

    Values are calculated per variant unless the dataset carries windowing
    information, in which case they are calculated per window. To compute
    values in windows, call :func:`window_by_position` or
    :func:`window_by_variant` before calling this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    variant_allele_count
        Variant allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.variant_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_variant_alleles`.
    stat_diversity
        Diversity variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_diversity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`diversity`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Tajimas' D value, as defined by
    :data:`sgkit.variables.stat_Tajimas_D_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information
    is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.88883234, 2.18459998],
           [2.18459998, 0.88883234],
           [2.18459998, 2.18459998],
           [0.88883234, 0.88883234],
           [0.88883234, 0.88883234]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[2.40517586, 2.40517586],
           [1.10393559, 1.10393559]])
    """
    ds = define_variable_if_absent(
        ds, variables.variant_allele_count, variant_allele_count, count_variant_alleles
    )
    ds = define_variable_if_absent(
        ds, variables.stat_diversity, stat_diversity, diversity
    )
    variables.validate(
        ds,
        {
            variant_allele_count: variables.variant_allele_count_spec,
            stat_diversity: variables.stat_diversity_spec,
        },
    )

    windowed = has_windows(ds)
    allele_counts = da.asarray(ds[variant_allele_count])

    # Segregating sites, following the tskit definition: number of observed
    # alleles minus one. For biallelic data this counts non-monomorphic sites.
    seg_sites = (allele_counts > 0).sum(axis=1) - 1
    if windowed:
        # Aggregate the per-variant counts into one total per window.
        seg_sites = window_statistic(
            seg_sites,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=seg_sites.dtype,
            axis=0,
        )

    # The number of sampled chromosomes is assumed constant across variants.
    # The counts are unsigned integers, so the sum is promoted to float
    # because everything below is floating-point arithmetic.
    n = allele_counts.sum(axis=1, dtype="float").max()

    # (n-1)th harmonic number and its squared-term counterpart
    k = da.arange(1, n)
    a1 = (1 / k).sum()
    a2 = (1 / (k**2)).sum()

    # Watterson's theta (absolute value)
    theta = seg_sites / a1

    # Both theta estimates are usually divided by the number of (accessible)
    # bases, but here the absolute difference is what is wanted.
    diversity_values = ds[stat_diversity]
    numerator = diversity_values - theta[:, np.newaxis]

    # Denominator: the standard deviation of the numerator
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    stdev = da.sqrt((e1 * seg_sites) + (e2 * seg_sites * (seg_sites - 1)))

    # Division by zero is left to IEEE semantics: the result is -inf, nan or
    # +inf depending on the numerator, and currently raises a RuntimeWarning.
    tajimas_d = numerator / stdev[:, np.newaxis]

    leading_dim = "windows" if windowed else "variants"
    new_ds = create_dataset(
        {variables.stat_Tajimas_D: ([leading_dim, "cohorts"], tajimas_d.data)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def observed_heterozygosity(
    ds: Dataset,
    *,
    call_heterozygosity: Hashable = variables.call_heterozygosity,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort observed heterozygosity.

    The observed heterozygosity of a cohort is the mean of individual
    heterozygosity values among all samples of that cohort as described
    in :func:`individual_heterozygosity`. Calls with a nan value for
    individual heterozygosity are ignored when calculating the cohort
    mean.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or
    :func:`window_by_variant` before calling this function. Windowed values
    are the sum of the per-variant values falling inside each window.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_heterozygosity
        Input variable name holding call_heterozygosity as defined by
        :data:`sgkit.variables.call_heterozygosity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`individual_heterozygosity`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_observed_heterozygosity_spec`
    of per cohort observed heterozygosity with shape (variants, cohorts), or
    (windows, cohorts) if windowing information is available. Per-variant
    values lie within the interval [0, 1] or are nan.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5, 1. ],
           [1. , 0.5],
           [0. , 1. ],
           [0.5, 0.5],
           [0.5, 0.5]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.5, 2.5],
           [1. , 1. ]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_heterozygosity,
        call_heterozygosity,
        individual_heterozygosity,
    )
    variables.validate(ds, {call_heterozygosity: variables.call_heterozygosity_spec})

    heterozygosity = da.asarray(ds[call_heterozygosity])
    cohort_of_sample = da.asarray(ds[sample_cohort])
    # Cohort indexes are 0-based, so the largest index plus one is the count.
    n_cohorts = cohort_of_sample.max().compute() + 1
    # Placeholder of length n_cohorts handed to the per-block kernel.
    cohort_placeholder = da.zeros(n_cohorts, dtype=np.uint8)

    per_variant = da.map_blocks(
        _cohort_observed_heterozygosity,
        heterozygosity,
        cohort_of_sample,
        cohort_placeholder,
        chunks=(heterozygosity.chunks[0], n_cohorts),
        drop_axis=1,
        new_axis=1,
        dtype=np.float64,
    )

    if has_windows(ds):
        dims = ("windows", "cohorts")
        values = window_statistic(
            per_variant,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=per_variant.dtype,
            axis=0,
        )
    else:
        dims = ("variants", "cohorts")
        values = per_variant

    new_ds = create_dataset({variables.stat_observed_heterozygosity: (dims, values)})
    return conditional_merge_datasets(ds, new_ds, merge)
def pc_relate(ds: xr.Dataset,
              *,
              maf: float = 0.01,
              call_genotype: Hashable = variables.call_genotype,
              call_genotype_mask: Hashable = variables.call_genotype_mask,
              sample_pcs: Hashable = variables.sample_pcs,
              merge: bool = True) -> xr.Dataset:
    """Compute PC-Relate as described in Conomos, et al. 2016 [1].

    This method computes the kinship coefficient matrix. The kinship coefficient for
    a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
    a random allele selected from ``i`` and a random allele selected from ``j`` at
    a locus are IBD. Several of the most common family relationships and their
    corresponding kinship coefficient:

    +--------------------------------------------------+---------------------+
    | Relationship                                     | Kinship coefficient |
    +==================================================+=====================+
    | Individual-self                                  |                 1/2 |
    +--------------------------------------------------+---------------------+
    | full sister/full brother                         |                 1/4 |
    +--------------------------------------------------+---------------------+
    | mother/father/daughter/son                       |                 1/4 |
    +--------------------------------------------------+---------------------+
    | grandmother/grandfather/granddaughter/grandson   |                 1/8 |
    +--------------------------------------------------+---------------------+
    | aunt/uncle/niece/nephew                          |                 1/8 |
    +--------------------------------------------------+---------------------+
    | first cousin                                     |                1/16 |
    +--------------------------------------------------+---------------------+
    | half-sister/half-brother                         |                 1/8 |
    +--------------------------------------------------+---------------------+

    Parameters
    ----------
    ds
        Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC)

        - genotype calls: (SxVxD)
        - genotype calls mask: (SxVxD)
        - sample PCs: (PCxS)
    maf
        individual minor allele frequency filter. If an individual's estimated
        individual-specific minor allele frequency at a SNP is less than this value,
        that SNP will be excluded from the analysis for that individual.
        The default value is 0.01. Must be strictly between 0.0 and 1.0, i.e. in
        the open interval (0.0, 1.0).
    call_genotype
        Input variable name holding call_genotype.
        Defined by :data:`sgkit.variables.call_genotype_spec`.
    call_genotype_mask
        Input variable name holding call_genotype_mask.
        Defined by :data:`sgkit.variables.call_genotype_mask_spec`
    sample_pcs
        Input variable name holding sample_pcs.
        Defined by :data:`sgkit.variables.sample_pcs_spec`
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.
    This version is compatible with the R implementation of PC Relate
    method from the GENESIS package version 2.18.0.

    Returns
    -------
    Dataset containing (S = num samples):

    :data:`sgkit.variables.pc_relate_phi_spec`: (S,S) ArrayLike
    pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].

    References
    ----------
    [1] - Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
    "Model-Free Estimation of Recent Genetic Relatedness."
    American Journal of Human Genetics 98 (1): 127–48.

    Raises
    ------
    ValueError
        If ploidy of provided dataset != 2
    ValueError
        If maximum number of alleles in provided dataset != 2
    ValueError
        Input dataset is missing any of the required variables
    ValueError
        If maf is not in (0.0, 1.0)
    """
    # Reject the boundary values as well: maf must lie strictly inside (0, 1).
    if maf <= 0.0 or maf >= 1.0:
        raise ValueError("MAF must be between (0.0, 1.0)")
    # The ploidy/alleles checks only apply when those dimensions exist.
    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
        raise ValueError("PC Relate only works for diploid genotypes")
    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
        raise ValueError("PC Relate only works for biallelic genotypes")
    variables.validate(
        ds,
        {
            call_genotype: variables.call_genotype_spec,
            call_genotype_mask: variables.call_genotype_mask_spec,
            sample_pcs: variables.sample_pcs_spec,
        },
    )

    call_g, call_g_mask = _collapse_ploidy(ds, call_genotype, call_genotype_mask)
    imputed_call_g = _impute_genotype_call_with_variant_mean(
        call_g, call_g_mask)

    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length _s_ vector of 1s, and β = (β1,...,βD)^T
    # is a length D vector of regression coefficients for each of the PCs
    pcs = ds[sample_pcs]
    # Prepend a row of ones so the regression includes the intercept term.
    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs],
                          axis=0)
    # Note: dask qr decomp requires no chunking in one dimension, and because number of
    # components should be smaller than number of samples in most cases, we disable
    # chunking on components
    pcsi = pcsi.T.rechunk((None, -1))

    q, r = da.linalg.qr(pcsi)
    # mu, eq: 3
    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
    mu = pcsi.dot(half_beta).T
    # phi, eq: 4
    # Mask entries whose estimated frequency falls outside (maf, 1 - maf),
    # together with the entries flagged by the genotype mask.
    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
    mu_mask = da.ma.masked_array(mu, mask=mask)
    variance = mu_mask * (1.0 - mu_mask)
    # Masked entries are filled with 0 so they contribute nothing below.
    variance = da.ma.filled(variance, fill_value=0.0)
    stddev = da.sqrt(variance)
    centered_af = call_g / 2 - mu_mask
    centered_af = da.ma.filled(centered_af, fill_value=0.0)
    # NOTE: gramian could be a performance bottleneck, and we could explore
    #       performance improvements like (or maybe sth else):
    #       * calculating only the pairs we are interested in
    #       * using an optimized einsum.
    assert centered_af.shape == call_g.shape
    assert stddev.shape == call_g.shape
    phi = gramian(centered_af) / gramian(stddev)
    # NOTE: phi is of shape (S x S), S = num samples
    assert phi.shape == (call_g.shape[1], ) * 2
    new_ds = create_dataset(
        {variables.pc_relate_phi: (("sample_x", "sample_y"), phi)})
    return conditional_merge_datasets(ds, new_ds, merge)
def Tajimas_D(
    ds: Dataset,
    *,
    variant_allele_count: Hashable = variables.variant_allele_count,
    stat_diversity: Hashable = variables.stat_diversity,
    merge: bool = True,
) -> Dataset:
    """Compute Tajimas' D for a genotype call dataset.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    variant_allele_count
        Variant allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.variant_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_variant_alleles`.
    stat_diversity
        Diversity variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_diversity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`diversity`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Tajimas' D value, as defined by
    :data:`sgkit.variables.stat_Tajimas_D_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information
    is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[-3.35891429, -2.96698697],
           [-2.96698697, -3.35891429],
           [-2.96698697, -2.96698697],
           [-3.35891429, -3.35891429],
           [-3.35891429, -3.35891429]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[-0.22349574, -0.22349574],
           [-2.18313233, -2.18313233]])
    """
    ds = define_variable_if_absent(
        ds, variables.variant_allele_count, variant_allele_count, count_variant_alleles
    )
    ds = define_variable_if_absent(
        ds, variables.stat_diversity, stat_diversity, diversity
    )
    variables.validate(
        ds,
        {
            variant_allele_count: variables.variant_allele_count_spec,
            stat_diversity: variables.stat_diversity_spec,
        },
    )

    allele_counts = ds[variant_allele_count]

    # Total number of segregating sites: variants showing more than one allele.
    seg_sites = ((allele_counts > 0).sum(axis=1) > 1).sum()

    # The number of sampled chromosomes is assumed constant across variants.
    # The counts are unsigned integers, so the sum is promoted to float
    # because everything below is floating-point arithmetic.
    n = allele_counts.sum(axis=1, dtype="float").max()

    # (n-1)th harmonic number and its squared-term counterpart
    k = da.arange(1, n)
    a1 = (1 / k).sum()
    a2 = (1 / (k**2)).sum()

    # Watterson's theta (absolute value)
    theta = seg_sites / a1

    # Both theta estimates are usually divided by the number of (accessible)
    # bases, but here the absolute difference is what is wanted.
    numerator = ds[stat_diversity] - theta

    # Denominator: the standard deviation of the numerator
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    stdev = np.sqrt((e1 * seg_sites) + (e2 * seg_sites * (seg_sites - 1)))

    # A zero standard deviation yields nan instead of a division by zero.
    tajimas_d = np.nan if stdev == 0 else numerator / stdev

    new_ds = create_dataset({variables.stat_Tajimas_D: tajimas_d})
    return conditional_merge_datasets(ds, new_ds, merge)
def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values
    corresponding to the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
           [[2, 2],
            [3, 1]],
    <BLANKLINE>
           [[2, 2],
            [2, 2]],
    <BLANKLINE>
           [[1, 3],
            [1, 3]],
    <BLANKLINE>
           [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort])
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    # Placeholder of length n_cohorts handed to the per-block kernel.
    C = da.empty(n_cohorts, dtype=np.uint8)

    # The output keeps the variants chunking of the array that is actually
    # mapped over (the allele counts). Taking it from ``ds.call_genotype``
    # would fail for datasets that only carry precomputed allele counts, and
    # would be wrong whenever the two variables are chunked differently.
    shape = (AC.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32)
    assert_array_shape(
        AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2]
    )

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset(
        {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def Garud_H(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    cohorts: Optional[Sequence[Union[int, str]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015).

    Values are computed per window, so the dataset must carry windowing
    information: call :func:`window` before calling this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    cohorts
        The cohorts to compute statistics for, specified as a sequence of
        cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `stat_Garud_h1` (windows, cohorts): Garud H1 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h1_spec`.

    - `stat_Garud_h12` (windows, cohorts): Garud H12 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h12_spec`.

    - `stat_Garud_h123` (windows, cohorts): Garud H123 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h123_spec`.

    - `stat_Garud_h2_h1` (windows, cohorts): Garud H2/H1 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h2_h1_spec`.

    Raises
    ------
    NotImplementedError
        If the dataset is not diploid.
    ValueError
        If the dataset is not windowed.

    Warnings
    --------
    This function is currently only implemented for diploid datasets.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3, step=3)

    >>> gh = sg.Garud_H(ds)
    >>> gh["stat_Garud_h1"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.25 , 0.375],
           [0.375, 0.375]])
    >>> gh["stat_Garud_h12"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.375, 0.625],
           [0.625, 0.625]])
    >>> gh["stat_Garud_h123"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.625, 1.   ],
           [1.   , 1.   ]])
    >>> gh["stat_Garud_h2_h1"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.75      , 0.33333333],
           [0.33333333, 0.33333333]])
    """
    if ds.dims["ploidy"] != 2:
        raise NotImplementedError("Garud H only implemented for diploid genotypes")

    if not has_windows(ds):
        raise ValueError("Dataset must be windowed for Garud_H")

    variables.validate(ds, {call_genotype: variables.call_genotype_spec})

    genotypes = ds[call_genotype]

    # Convert sample cohorts to haplotype layout: each sample's cohort is
    # repeated once per haplotype (two, since only diploids get this far).
    sc = ds.sample_cohort.values
    hsc = np.stack((sc, sc), axis=1).ravel()  # TODO: assumes diploid
    n_cohorts = sc.max() + 1  # 0-based indexing
    cohorts = cohorts or range(n_cohorts)
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts", None))

    def _window_kernel(block):
        # Statistics of a single window for every requested cohort.
        return _Garud_h_cohorts(block, hsc, n_cohorts, ct)

    gh = window_statistic(
        genotypes,
        _window_kernel,
        ds.window_start.values,
        ds.window_stop.values,
        dtype=np.float64,
        # first chunks dimension is windows, computed in window_statistic
        chunks=(-1, n_cohorts, N_GARUD_H_STATS),
    )
    n_windows = ds.window_start.shape[0]
    assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)

    # The last axis of ``gh`` holds the statistics in this fixed order.
    stat_names = (
        variables.stat_Garud_h1,
        variables.stat_Garud_h12,
        variables.stat_Garud_h123,
        variables.stat_Garud_h2_h1,
    )
    new_ds = create_dataset(
        {
            name: (("windows", "cohorts"), gh[:, :, position])
            for position, name in enumerate(stat_names)
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def cohort_allele_frequencies(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute allele frequencies for each cohort.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    cohort_allele_count
        Input variable name holding cohort_allele_count as defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_frequency_spec`
    of allele frequencies with shape (variants, cohorts, alleles) and values
    corresponding to the frequency of non-missing occurrences of each allele.
    Cohorts without any counted allele at a variant yield NaN frequencies.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.cohort_allele_frequencies(ds)["cohort_allele_frequency"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.75, 0.25],
            [0.5 , 0.5 ]],
    <BLANKLINE>
           [[0.5 , 0.5 ],
            [0.75, 0.25]],
    <BLANKLINE>
           [[0.5 , 0.5 ],
            [0.5 , 0.5 ]],
    <BLANKLINE>
           [[0.25, 0.75],
            [0.25, 0.75]],
    <BLANKLINE>
           [[0.75, 0.25],
            [0.25, 0.75]]])
    """
    ds = define_variable_if_absent(
        ds, variables.cohort_allele_count, cohort_allele_count, count_cohort_alleles
    )
    variables.validate(ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    AC = ds[cohort_allele_count]
    K = AC.sum(dim="alleles")
    # Avoid dividing by zero: a cohort with no counted alleles at a variant
    # gets a NaN denominator, so its frequencies are NaN without evaluating
    # 0 / 0.
    AF = AC / xr.where(K > 0, K, np.nan)  # type: ignore[no-untyped-call]
    new_ds = create_dataset({variables.cohort_allele_frequency: AF})
    return conditional_merge_datasets(ds, new_ds, merge)