def per_base(x, windows, is_accessible=None, fill=np.nan): """Calculate the per-base value of a windowed statistic. Parameters ---------- x : array_like, shape (n_windows,) The statistic to average per-base. windows : array_like, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions using 1-based coordinates. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional Use this value where there are no accessible bases in a window. Returns ------- y : ndarray, float, shape (n_windows,) The input array divided by the number of (accessible) bases in each window. n_bases : ndarray, int, shape (n_windows,) The number of (accessible) bases in each window """ # calculate window sizes if is_accessible is None: # N.B., window stops are included n_bases = np.diff(windows, axis=1).reshape(-1) + 1 else: n_bases = np.array( [np.count_nonzero(is_accessible[i - 1:j]) for i, j in windows]) # deal with multidimensional x if x.ndim == 1: pass elif x.ndim == 2: n_bases = n_bases[:, None] else: raise NotImplementedError('only arrays of 1 or 2 dimensions supported') # calculate density per-base with ignore_invalid(): y = np.where(n_bases > 0, x / n_bases, fill) # restore to 1-dimensional if n_bases.ndim > 1: n_bases = n_bases.reshape(-1) return y, n_bases
def per_base(x, windows, is_accessible=None, fill=np.nan): """Calculate the per-base value of a windowed statistic. Parameters ---------- x : array_like, shape (n_windows,) The statistic to average per-base. windows : array_like, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions using 1-based coordinates. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional Use this value where there are no accessible bases in a window. Returns ------- y : ndarray, float, shape (n_windows,) The input array divided by the number of (accessible) bases in each window. n_bases : ndarray, int, shape (n_windows,) The number of (accessible) bases in each window """ # calculate window sizes if is_accessible is None: # N.B., window stops are included n_bases = np.diff(windows, axis=1).reshape(-1) + 1 else: n_bases = np.array([np.count_nonzero(is_accessible[i-1:j]) for i, j in windows]) # deal with multidimensional x if x.ndim == 1: pass elif x.ndim == 2: n_bases = n_bases[:, None] else: raise NotImplementedError('only arrays of 1 or 2 dimensions supported') # calculate density per-base with ignore_invalid(): y = np.where(n_bases > 0, x / n_bases, fill) # restore to 1-dimensional if n_bases.ndim > 1: n_bases = n_bases.reshape(-1) return y, n_bases
def inbreeding_coefficient(g, fill=np.nan): """Calculate the inbreeding coefficient for each variant. Parameters ---------- g : array_like, int, shape (n_variants, n_samples, ploidy) Genotype array. fill : float, optional Use this value for variants where the expected heterozygosity is zero. Returns ------- f : ndarray, float, shape (n_variants,) Inbreeding coefficient. Notes ----- The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is the observed heterozygosity and *He* is the expected heterozygosity. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]], ... [[0, 0], [0, 1], [1, 1]], ... [[0, 0], [1, 1], [2, 2]], ... [[1, 1], [1, 2], [-1, -1]]]) >>> allel.stats.inbreeding_coefficient(g) array([ nan, 0.33333333, 1. , -0.33333333]) """ # check inputs if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'): g = GenotypeArray(g, copy=False) # calculate observed and expected heterozygosity ho = heterozygosity_observed(g) af = g.count_alleles().to_frequencies() he = heterozygosity_expected(af, ploidy=g.shape[-1], fill=0) # calculate inbreeding coefficient, accounting for variants with no # expected heterozygosity with ignore_invalid(): f = np.where(he > 0, 1 - (ho / he), fill) return f
def refimpl(f, ploidy, fill=0): """Limited reference implementation for testing purposes.""" # check allele frequencies sum to 1 af_sum = np.sum(f, axis=1) # assume three alleles p = f[:, 0] q = f[:, 1] r = f[:, 2] out = 1 - p**ploidy - q**ploidy - r**ploidy with ignore_invalid(): out[(af_sum < 1) | np.isnan(af_sum)] = fill return out
def refimpl(af, ploidy, fill=0): """Limited reference implementation for testing purposes.""" # check allele frequencies sum to 1 af_sum = np.sum(af, axis=1) # assume three alleles p = af[:, 0] q = af[:, 1] r = af[:, 2] out = 1 - p**ploidy - q**ploidy - r**ploidy with ignore_invalid(): out[(af_sum < 1) | np.isnan(af_sum)] = fill return out
def heterozygosity_expected(af, ploidy, fill=np.nan): """Calculate the expected rate of heterozygosity for each variant under Hardy-Weinberg equilibrium. Parameters ---------- af : array_like, float, shape (n_variants, n_alleles) Allele frequencies array. ploidy : int Sample ploidy. fill : float, optional Use this value for variants where allele frequencies do not sum to 1. Returns ------- he : ndarray, float, shape (n_variants,) Expected heterozygosity Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]], ... [[0, 0], [0, 1], [1, 1]], ... [[0, 0], [1, 1], [2, 2]], ... [[1, 1], [1, 2], [-1, -1]]]) >>> af = g.count_alleles().to_frequencies() >>> allel.stats.heterozygosity_expected(af, ploidy=2) array([ 0. , 0.5 , 0.66666667, 0.375 ]) """ # check inputs af = asarray_ndim(af, 2) # calculate expected heterozygosity out = 1 - np.sum(np.power(af, ploidy), axis=1) # fill values where allele frequencies could not be calculated af_sum = np.sum(af, axis=1) with ignore_invalid(): out[(af_sum < 1) | np.isnan(af_sum)] = fill return out
def heterozygosity_observed(g, fill=np.nan): """Calculate the rate of observed heterozygosity for each variant. Parameters ---------- g : array_like, int, shape (n_variants, n_samples, ploidy) Genotype array. fill : float, optional Use this value for variants where all calls are missing. Returns ------- ho : ndarray, float, shape (n_variants,) Observed heterozygosity Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]], ... [[0, 0], [0, 1], [1, 1]], ... [[0, 0], [1, 1], [2, 2]], ... [[1, 1], [1, 2], [-1, -1]]]) >>> allel.stats.heterozygosity_observed(g) array([ 0. , 0.33333333, 0. , 0.5 ]) """ # check inputs if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'): g = GenotypeArray(g, copy=False) # count hets n_het = np.asarray(g.count_het(axis=1)) n_called = np.asarray(g.count_called(axis=1)) # calculate rate of observed heterozygosity, accounting for variants # where all calls are missing with ignore_invalid(): ho = np.where(n_called > 0, n_het / n_called, fill) return ho
def heterozygosity_expected(af, ploidy, fill=np.nan): """Calculate the expected rate of heterozygosity for each variant under Hardy-Weinberg equilibrium. Parameters ---------- af : array_like, float, shape (n_variants, n_alleles) Allele frequencies array. fill : float, optional Use this value for variants where allele frequencies do not sum to 1. Returns ------- he : ndarray, float, shape (n_variants,) Expected heterozygosity Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]], ... [[0, 0], [0, 1], [1, 1]], ... [[0, 0], [1, 1], [2, 2]], ... [[1, 1], [1, 2], [-1, -1]]]) >>> af = g.count_alleles().to_frequencies() >>> allel.stats.heterozygosity_expected(af, ploidy=2) array([ 0. , 0.5 , 0.66666667, 0.375 ]) """ # check inputs af = asarray_ndim(af, 2) # calculate expected heterozygosity out = 1 - np.sum(np.power(af, ploidy), axis=1) # fill values where allele frequencies could not be calculated af_sum = np.sum(af, axis=1) with ignore_invalid(): out[(af_sum < 1) | np.isnan(af_sum)] = fill return out
def mean_pairwise_difference(ac, an=None, fill=np.nan): """Calculate for each variant the mean number of pairwise differences between chromosomes sampled from within a single population. Parameters ---------- ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. an : array_like, int, shape (n_variants,), optional Allele numbers. If not provided, will be calculated from `ac`. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- mpd : ndarray, float, shape (n_variants,) Notes ----- The values returned by this function can be summed over a genome region and divided by the number of accessible bases to estimate nucleotide diversity, a.k.a. *pi*. Examples -------- >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1]]) >>> ac = h.count_alleles() >>> allel.stats.mean_pairwise_difference(ac) array([ 0. , 0.5 , 0.66666667, 0.5 , 0. , 0.83333333, 0.83333333, 1. ]) See Also -------- sequence_diversity, windowed_diversity """ # This function calculates the mean number of pairwise differences # between haplotypes within a single population, generalising to any number # of alleles. # check inputs ac = asarray_ndim(ac, 2) # total number of haplotypes if an is None: an = np.sum(ac, axis=1) else: an = asarray_ndim(an, 1) check_dim0_aligned(ac, an) # total number of pairwise comparisons for each variant: # (an choose 2) n_pairs = an * (an - 1) / 2 # number of pairwise comparisons where there is no difference: # sum of (ac choose 2) for each allele (i.e., number of ways to # choose the same allele twice) n_same = np.sum(ac * (ac - 1) / 2, axis=1) # number of pairwise differences n_diff = n_pairs - n_same # mean number of pairwise differences, accounting for cases where # there are no pairs with ignore_invalid(): mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill) return mpd
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None, fill=np.nan): """Calculate for each variant the mean number of pairwise differences between chromosomes sampled from two different populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. an1 : array_like, int, shape (n_variants,), optional Allele numbers for the first population. If not provided, will be calculated from `ac1`. an2 : array_like, int, shape (n_variants,), optional Allele numbers for the second population. If not provided, will be calculated from `ac2`. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- mpd : ndarray, float, shape (n_variants,) Notes ----- The values returned by this function can be summed over a genome region and divided by the number of accessible bases to estimate nucleotide divergence between two populations, a.k.a. *Dxy*. Examples -------- >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> allel.stats.mean_pairwise_difference_between(ac1, ac2) array([ 0. , 0.5 , 1. , 0.5 , 0. , 1. , 0.75, nan]) See Also -------- sequence_divergence, windowed_divergence """ # This function calculates the mean number of pairwise differences # between haplotypes from two different populations, generalising to any # number of alleles. # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # total number of haplotypes sampled from each population if an1 is None: an1 = np.sum(ac1, axis=1) else: an1 = asarray_ndim(an1, 1) check_dim0_aligned(ac1, an1) if an2 is None: an2 = np.sum(ac2, axis=1) else: an2 = asarray_ndim(an2, 1) check_dim0_aligned(ac2, an2) # total number of pairwise comparisons for each variant n_pairs = an1 * an2 # number of pairwise comparisons where there is no difference: # sum of (ac1 * ac2) for each allele (i.e., number of ways to # choose the same allele twice) n_same = np.sum(ac1 * ac2, axis=1) # number of pairwise differences n_diff = n_pairs - n_same # mean number of pairwise differences, accounting for cases where # there are no pairs with ignore_invalid(): mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill) return mpd
def mean_pairwise_difference(ac, an=None, fill=np.nan): """Calculate for each variant the mean number of pairwise differences between chromosomes sampled from within a single population. Parameters ---------- ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. an : array_like, int, shape (n_variants,), optional Allele numbers. If not provided, will be calculated from `ac`. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- mpd : ndarray, float, shape (n_variants,) Notes ----- The values returned by this function can be summed over a genome region and divided by the number of accessible bases to estimate nucleotide diversity, a.k.a. *pi*. Examples -------- >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1]]) >>> ac = h.count_alleles() >>> allel.mean_pairwise_difference(ac) array([0. , 0.5 , 0.66666667, 0.5 , 0. , 0.83333333, 0.83333333, 1. ]) See Also -------- sequence_diversity, windowed_diversity """ # This function calculates the mean number of pairwise differences # between haplotypes within a single population, generalising to any number # of alleles. # check inputs ac = asarray_ndim(ac, 2) # total number of haplotypes if an is None: an = np.sum(ac, axis=1) else: an = asarray_ndim(an, 1) check_dim0_aligned(ac, an) # total number of pairwise comparisons for each variant: # (an choose 2) n_pairs = an * (an - 1) / 2 # number of pairwise comparisons where there is no difference: # sum of (ac choose 2) for each allele (i.e., number of ways to # choose the same allele twice) n_same = np.sum(ac * (ac - 1) / 2, axis=1) # number of pairwise differences n_diff = n_pairs - n_same # mean number of pairwise differences, accounting for cases where # there are no pairs with ignore_invalid(): mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill) return mpd
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None, fill=np.nan): """Calculate for each variant the mean number of pairwise differences between chromosomes sampled from two different populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. an1 : array_like, int, shape (n_variants,), optional Allele numbers for the first population. If not provided, will be calculated from `ac1`. an2 : array_like, int, shape (n_variants,), optional Allele numbers for the second population. If not provided, will be calculated from `ac2`. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- mpd : ndarray, float, shape (n_variants,) Notes ----- The values returned by this function can be summed over a genome region and divided by the number of accessible bases to estimate nucleotide divergence between two populations, a.k.a. *Dxy*. Examples -------- >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> allel.mean_pairwise_difference_between(ac1, ac2) array([0. , 0.5 , 1. , 0.5 , 0. , 1. , 0.75, nan]) See Also -------- sequence_divergence, windowed_divergence """ # This function calculates the mean number of pairwise differences # between haplotypes from two different populations, generalising to any # number of alleles. # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # total number of haplotypes sampled from each population if an1 is None: an1 = np.sum(ac1, axis=1) else: an1 = asarray_ndim(an1, 1) check_dim0_aligned(ac1, an1) if an2 is None: an2 = np.sum(ac2, axis=1) else: an2 = asarray_ndim(an2, 1) check_dim0_aligned(ac2, an2) # total number of pairwise comparisons for each variant n_pairs = an1 * an2 # number of pairwise comparisons where there is no difference: # sum of (ac1 * ac2) for each allele (i.e., number of ways to # choose the same allele twice) n_same = np.sum(ac1 * ac2, axis=1) # number of pairwise differences n_diff = n_pairs - n_same # mean number of pairwise differences, accounting for cases where # there are no pairs with ignore_invalid(): mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill) return mpd