def locate_fixed_differences(ac1, ac2): """Locate variants with no shared alleles between two populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. Returns ------- loc : ndarray, bool, shape (n_variants,) See Also -------- allel.stats.diversity.windowed_df Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]], ... [[0, 1], [0, 1], [0, 1], [0, 1]], ... [[0, 1], [0, 1], [1, 1], [1, 1]], ... [[0, 0], [0, 0], [1, 1], [2, 2]], ... [[0, 0], [-1, -1], [1, 1], [-1, -1]]]) >>> ac1 = g.count_alleles(subpop=[0, 1]) >>> ac2 = g.count_alleles(subpop=[2, 3]) >>> loc_df = allel.locate_fixed_differences(ac1, ac2) >>> loc_df array([ True, False, False, True, True]) """ # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # stack allele counts for convenience pac = np.dstack([ac1, ac2]) # count numbers of alleles called in each population pan = np.sum(pac, axis=1) # count the numbers of populations with each allele npa = np.sum(pac > 0, axis=2) # locate variants with allele calls in both populations non_missing = np.all(pan > 0, axis=1) # locate variants where all alleles are only found in a single population no_shared_alleles = np.all(npa <= 1, axis=1) return non_missing & no_shared_alleles
def locate_private_alleles(*acs): """Locate alleles that are found only in a single population. Parameters ---------- *acs : array_like, int, shape (n_variants, n_alleles) Allele counts arrays from each population. Returns ------- loc : ndarray, bool, shape (n_variants, n_alleles) Boolean array where elements are True if allele is private to a single population. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]], ... [[0, 1], [0, 1], [0, 1], [0, 1]], ... [[0, 1], [0, 1], [1, 1], [1, 1]], ... [[0, 0], [0, 0], [1, 1], [2, 2]], ... [[0, 0], [-1, -1], [1, 1], [-1, -1]]]) >>> ac1 = g.count_alleles(subpop=[0, 1]) >>> ac2 = g.count_alleles(subpop=[2]) >>> ac3 = g.count_alleles(subpop=[3]) >>> loc_private_alleles = allel.locate_private_alleles(ac1, ac2, ac3) >>> loc_private_alleles array([[ True, False, False], [False, False, False], [ True, False, False], [ True, True, True], [ True, True, False]]) >>> loc_private_variants = np.any(loc_private_alleles, axis=1) >>> loc_private_variants array([ True, False, True, True, True]) """ # check inputs acs = [asarray_ndim(ac, 2) for ac in acs] check_dim0_aligned(*acs) acs = ensure_dim1_aligned(*acs) # stack allele counts for convenience pac = np.dstack(acs) # count the numbers of populations with each allele npa = np.sum(pac > 0, axis=2) # locate alleles found only in a single population loc_pa = npa == 1 return loc_pa
def hudson_fst(ac1, ac2, fill=np.nan): """Calculate the numerator and denominator for Fst estimation using the method of Hudson (1992) elaborated by Bhatia et al. (2013). Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- num : ndarray, float, shape (n_variants,) Divergence between the two populations minus average of diversity within each population. den : ndarray, float, shape (n_variants,) Divergence between the two populations. Examples -------- Calculate numerator and denominator for Fst estimation:: >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]], ... [[0, 1], [0, 1], [0, 1], [0, 1]], ... [[0, 0], [0, 0], [0, 0], [0, 0]], ... [[0, 1], [1, 2], [1, 1], [2, 2]], ... [[0, 0], [1, 1], [0, 1], [-1, -1]]]) >>> subpops = [[0, 1], [2, 3]] >>> ac1 = g.count_alleles(subpop=subpops[0]) >>> ac2 = g.count_alleles(subpop=subpops[1]) >>> num, den = allel.hudson_fst(ac1, ac2) >>> num array([ 1. , -0.16666667, 0. , -0.125 , -0.33333333]) >>> den array([1. , 0.5 , 0. , 0.625, 0.5 ]) Estimate Fst for each variant individually:: >>> fst = num / den >>> fst array([ 1. , -0.33333333, nan, -0.2 , -0.66666667]) Estimate Fst averaging over variants:: >>> fst = np.sum(num) / np.sum(den) >>> fst 0.1428571428571429 """ # flake8: noqa # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # calculate these once only an1 = np.sum(ac1, axis=1) an2 = np.sum(ac2, axis=1) # calculate average diversity (a.k.a. heterozygosity) within each # population within = (mean_pairwise_difference(ac1, an1, fill=fill) + mean_pairwise_difference(ac2, an2, fill=fill)) / 2 # calculate divergence (a.k.a. heterozygosity) between each population between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill) # define numerator and denominator for Fst calculations num = between - within den = between return num, den
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None, fill=np.nan): """Calculate for each variant the mean number of pairwise differences between chromosomes sampled from two different populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. an1 : array_like, int, shape (n_variants,), optional Allele numbers for the first population. If not provided, will be calculated from `ac1`. an2 : array_like, int, shape (n_variants,), optional Allele numbers for the second population. If not provided, will be calculated from `ac2`. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- mpd : ndarray, float, shape (n_variants,) Notes ----- The values returned by this function can be summed over a genome region and divided by the number of accessible bases to estimate nucleotide divergence between two populations, a.k.a. *Dxy*. Examples -------- >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> allel.stats.mean_pairwise_difference_between(ac1, ac2) array([ 0. , 0.5 , 1. , 0.5 , 0. , 1. , 0.75, nan]) See Also -------- sequence_divergence, windowed_divergence """ # This function calculates the mean number of pairwise differences # between haplotypes from two different populations, generalising to any # number of alleles. # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # total number of haplotypes sampled from each population if an1 is None: an1 = np.sum(ac1, axis=1) else: an1 = asarray_ndim(an1, 1) check_dim0_aligned(ac1, an1) if an2 is None: an2 = np.sum(ac2, axis=1) else: an2 = asarray_ndim(an2, 1) check_dim0_aligned(ac2, an2) # total number of pairwise comparisons for each variant n_pairs = an1 * an2 # number of pairwise comparisons where there is no difference: # sum of (ac1 * ac2) for each allele (i.e., number of ways to # choose the same allele twice) n_same = np.sum(ac1 * ac2, axis=1) # number of pairwise differences n_diff = n_pairs - n_same # mean number of pairwise differences, accounting for cases where # there are no pairs with ignore_invalid(): mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill) return mpd
def hudson_fst(ac1, ac2, fill=np.nan): """Calculate the numerator and denominator for Fst estimation using the method of Hudson (1992) elaborated by Bhatia et al. (2013). Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- num : ndarray, float, shape (n_variants,) Divergence between the two populations minus average of diversity within each population. den : ndarray, float, shape (n_variants,) Divergence between the two populations. Examples -------- Calculate numerator and denominator for Fst estimation:: >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]], ... [[0, 1], [0, 1], [0, 1], [0, 1]], ... [[0, 0], [0, 0], [0, 0], [0, 0]], ... [[0, 1], [1, 2], [1, 1], [2, 2]], ... [[0, 0], [1, 1], [0, 1], [-1, -1]]]) >>> subpops = [[0, 1], [2, 3]] >>> ac1 = g.count_alleles(subpop=subpops[0]) >>> ac2 = g.count_alleles(subpop=subpops[1]) >>> num, den = allel.stats.hudson_fst(ac1, ac2) >>> num array([ 1. , -0.16666667, 0. , -0.125 , -0.33333333]) >>> den array([ 1. , 0.5 , 0. , 0.625, 0.5 ]) Estimate Fst for each variant individually:: >>> fst = num / den >>> fst array([ 1. , -0.33333333, nan, -0.2 , -0.66666667]) Estimate Fst averaging over variants:: >>> fst = np.sum(num) / np.sum(den) >>> fst 0.1428571428571429 """ # flake8: noqa # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # calculate these once only an1 = np.sum(ac1, axis=1) an2 = np.sum(ac2, axis=1) # calculate average diversity (a.k.a. heterozygosity) within each # population within = (mean_pairwise_difference(ac1, an1, fill=fill) + mean_pairwise_difference(ac2, an2, fill=fill)) / 2 # calculate divergence (a.k.a. heterozygosity) between each population between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill) # define numerator and denominator for Fst calculations num = between - within den = between return num, den
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None, fill=np.nan): """Calculate for each variant the mean number of pairwise differences between chromosomes sampled from two different populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. an1 : array_like, int, shape (n_variants,), optional Allele numbers for the first population. If not provided, will be calculated from `ac1`. an2 : array_like, int, shape (n_variants,), optional Allele numbers for the second population. If not provided, will be calculated from `ac2`. fill : float Use this value where there are no pairs to compare (e.g., all allele calls are missing). Returns ------- mpd : ndarray, float, shape (n_variants,) Notes ----- The values returned by this function can be summed over a genome region and divided by the number of accessible bases to estimate nucleotide divergence between two populations, a.k.a. *Dxy*. Examples -------- >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> allel.mean_pairwise_difference_between(ac1, ac2) array([0. , 0.5 , 1. , 0.5 , 0. , 1. , 0.75, nan]) See Also -------- sequence_divergence, windowed_divergence """ # This function calculates the mean number of pairwise differences # between haplotypes from two different populations, generalising to any # number of alleles. # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # total number of haplotypes sampled from each population if an1 is None: an1 = np.sum(ac1, axis=1) else: an1 = asarray_ndim(an1, 1) check_dim0_aligned(ac1, an1) if an2 is None: an2 = np.sum(ac2, axis=1) else: an2 = asarray_ndim(an2, 1) check_dim0_aligned(ac2, an2) # total number of pairwise comparisons for each variant n_pairs = an1 * an2 # number of pairwise comparisons where there is no difference: # sum of (ac1 * ac2) for each allele (i.e., number of ways to # choose the same allele twice) n_same = np.sum(ac1 * ac2, axis=1) # number of pairwise differences n_diff = n_pairs - n_same # mean number of pairwise differences, accounting for cases where # there are no pairs with ignore_invalid(): mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill) return mpd