def test_sequence_divergence(self): from allel.stats import sequence_divergence pos = [2, 4, 8] ac1 = AlleleCountsArray([[2, 0], [2, 0], [2, 0]]) ac2 = AlleleCountsArray([[0, 2], [0, 2], [0, 2]]) # all variants e = 3 / 7 a = sequence_divergence(pos, ac1, ac2) eq(e, a) # start/stop e = 2 / 6 a = sequence_divergence(pos, ac1, ac2, start=0, stop=5) eq(e, a) # start/stop, an provided an1 = ac1.sum(axis=1) an2 = ac2.sum(axis=1) e = 2 / 6 a = sequence_divergence(pos, ac1, ac2, start=0, stop=5, an1=an1, an2=an2) eq(e, a)
def patterson_f2(aca, acb): """Unbiased estimator for F2(A, B), the branch length between populations A and B. Parameters ---------- aca : array_like, int, shape (n_variants, 2) Allele counts for population A. acb : array_like, int, shape (n_variants, 2) Allele counts for population B. Returns ------- f2 : ndarray, float, shape (n_variants,) Notes ----- See Patterson (2012), Appendix A. """ # check inputs aca = AlleleCountsArray(aca, copy=False) assert aca.shape[1] == 2, 'only biallelic variants supported' acb = AlleleCountsArray(acb, copy=False) assert acb.shape[1] == 2, 'only biallelic variants supported' check_dim0_aligned(aca, acb) # compute allele numbers sa = aca.sum(axis=1) sb = acb.sum(axis=1) # compute heterozygosities ha = h_hat(aca) hb = h_hat(acb) # compute sample frequencies for the alternate allele a = aca.to_frequencies()[:, 1] b = acb.to_frequencies()[:, 1] # compute estimator x = ((a - b)**2) - (ha / sa) - (hb / sb) return x
def patterson_f2(aca, acb): """Unbiased estimator for F2(A, B), the branch length between populations A and B. Parameters ---------- aca : array_like, int, shape (n_variants, 2) Allele counts for population A. acb : array_like, int, shape (n_variants, 2) Allele counts for population B. Returns ------- f2 : ndarray, float, shape (n_variants,) Notes ----- See Patterson (2012), Appendix A. """ # check inputs aca = AlleleCountsArray(aca, copy=False) assert aca.shape[1] == 2, 'only biallelic variants supported' acb = AlleleCountsArray(acb, copy=False) assert acb.shape[1] == 2, 'only biallelic variants supported' check_dim0_aligned(aca, acb) # compute allele numbers sa = aca.sum(axis=1) sb = acb.sum(axis=1) # compute heterozygosities ha = h_hat(aca) hb = h_hat(acb) # compute sample frequencies for the alternate allele a = aca.to_frequencies()[:, 1] b = acb.to_frequencies()[:, 1] # compute estimator x = ((a - b) ** 2) - (ha / sa) - (hb / sb) return x
def patterson_f3(acc, aca, acb): """Unbiased estimator for F3(C; A, B), the three-population test for admixture in population C. Parameters ---------- acc : array_like, int, shape (n_variants, 2) Allele counts for the test population (C). aca : array_like, int, shape (n_variants, 2) Allele counts for the first source population (A). acb : array_like, int, shape (n_variants, 2) Allele counts for the second source population (B). Returns ------- T : ndarray, float, shape (n_variants,) Un-normalized f3 estimates per variant. B : ndarray, float, shape (n_variants,) Estimates for heterozygosity in population C. Notes ----- See Patterson (2012), main text and Appendix A. For un-normalized f3 statistics, ignore the `B` return value. To compute the f3* statistic, which is normalized by heterozygosity in population C to remove numerical dependence on the allele frequency spectrum, compute ``np.sum(T) / np.sum(B)``. """ # check inputs aca = AlleleCountsArray(aca, copy=False) assert aca.shape[1] == 2, 'only biallelic variants supported' acb = AlleleCountsArray(acb, copy=False) assert acb.shape[1] == 2, 'only biallelic variants supported' acc = AlleleCountsArray(acc, copy=False) assert acc.shape[1] == 2, 'only biallelic variants supported' check_dim0_aligned(aca, acb, acc) # compute allele number and heterozygosity in test population sc = acc.sum(axis=1) hc = h_hat(acc) # compute sample frequencies for the alternate allele a = aca.to_frequencies()[:, 1] b = acb.to_frequencies()[:, 1] c = acc.to_frequencies()[:, 1] # compute estimator T = ((c - a) * (c - b)) - (hc / sc) B = 2 * hc return T, B
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan): """Calculate the value of Tajima's D in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- D : ndarray, float, shape (n_windows,) Tajima's D. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> D, windows, counts = allel.stats.windowed_tajima_d( ... pos, ac, size=10, start=1, stop=31 ... ) >>> D array([ 0.59158014, 2.93397641, 6.12372436]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> counts array([3, 4, 2]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # calculate constants a1 = np.sum(1 / np.arange(1, n)) a2 = np.sum(1 / (np.arange(1, n) ** 2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) e1 = c1 / a1 e2 = c2 / (a1 ** 2 + a2) # locate segregating variants is_seg = ac.is_segregating() # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # define statistic to compute for each window # noinspection PyPep8Naming def statistic(w_is_seg, w_mpd): S = np.count_nonzero(w_is_seg) pi = np.sum(w_mpd) d = pi - (S / a1) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) wD = d / d_stdev return wD D, windows, counts = windowed_statistic( pos, values=(is_seg, mpd), statistic=statistic, size=size, start=start, stop=stop, step=step, windows=windows, fill=fill, ) return D, windows, counts
def tajima_d(ac, pos=None, start=None, stop=None): """Calculate the value of Tajima's D over a given region. Parameters ---------- ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. pos : array_like, int, shape (n_items,), optional Variant positions, using 1-based coordinates, in ascending order. start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). Returns ------- D : float Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> allel.stats.tajima_d(ac) 3.1445848780213814 >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> allel.stats.tajima_d(ac, pos=pos, start=7, stop=25) 3.8779735196179366 """ # check inputs if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if pos is not None and (start is not None or stop is not None): if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) loc = pos.locate_range(start, stop) ac = ac[loc] # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # count segregating variants S = ac.count_segregating() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate Watterson's theta (absolute value) theta_hat_w_abs = S / a1 # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # calculate theta_hat pi (sum differences over variants) theta_hat_pi_abs = np.sum(mpd) # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference d = theta_hat_pi_abs - theta_hat_w_abs # calculate the denominator (standard deviation) a2 = np.sum(1 / (np.arange(1, n) ** 2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) e1 = c1 / a1 e2 = c2 / (a1 ** 2 + a2) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) # finally calculate Tajima's D D = d / d_stdev return D
def windowed_watterson_theta( pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan ): """Calculate the value of Watterson's estimator in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- theta_hat_w : ndarray, float, shape (n_windows,) Watterson's estimator (theta hat per base). windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w, windows, n_bases, counts = allel.stats.windowed_watterson_theta( ... pos, ac, size=10, start=1, stop=31 ... ) >>> theta_hat_w array([ 0.10909091, 0.16363636, 0.04958678]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # flake8: noqa # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # locate segregating variants is_seg = ac.is_segregating() # count segregating variants in windows S, windows, counts = windowed_statistic( pos, is_seg, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0 ) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # absolute value of Watterson's theta theta_hat_w_abs = S / a1 # theta per base theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows, is_accessible=is_accessible, fill=fill) return theta_hat_w, windows, n_bases, counts
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None): """Calculate the value of Watterson's estimator over a given region. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- theta_hat_w : float Watterson's estimator (theta hat per base). Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w = allel.stats.watterson_theta(pos, ac, start=1, stop=31) >>> theta_hat_w 0.10557184750733138 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # count segregating variants S = ac.count_segregating() # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate absolute value theta_hat_w_abs = S / a1 # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1 : stop]) theta_hat_w = theta_hat_w_abs / n_bases return theta_hat_w
def tajima_d(ac, pos=None, start=None, stop=None, min_sites=3): """Calculate the value of Tajima's D over a given region. Parameters ---------- ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. pos : array_like, int, shape (n_items,), optional Variant positions, using 1-based coordinates, in ascending order. start : int, optional The position at which to start (1-based). Defaults to the first position. stop : int, optional The position at which to stop (1-based). Defaults to the last position. min_sites : int, optional Minimum number of segregating sites for which to calculate a value. If there are fewer, np.nan is returned. Defaults to 3. Returns ------- D : float Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> allel.tajima_d(ac) 3.1445848780213814 >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> allel.tajima_d(ac, pos=pos, start=7, stop=25) 3.8779735196179366 """ # check inputs if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if pos is not None and (start is not None or stop is not None): if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) loc = pos.locate_range(start, stop) ac = ac[loc] # count segregating variants S = ac.count_segregating() if S < min_sites: return np.nan # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate Watterson's theta (absolute value) theta_hat_w_abs = S / a1 # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # calculate theta_hat pi (sum differences over variants) theta_hat_pi_abs = np.sum(mpd) # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference d = theta_hat_pi_abs - theta_hat_w_abs # calculate the denominator (standard deviation) a2 = np.sum(1 / (np.arange(1, n)**2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2)) e1 = c1 / a1 e2 = c2 / (a1**2 + a2) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) # finally calculate Tajima's D D = d / d_stdev return D
def windowed_watterson_theta(pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Calculate the value of Watterson's estimator in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- theta_hat_w : ndarray, float, shape (n_windows,) Watterson's estimator (theta hat per base). windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta( ... pos, ac, size=10, start=1, stop=31 ... ) >>> theta_hat_w array([0.10909091, 0.16363636, 0.04958678]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # flake8: noqa # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # locate segregating variants is_seg = ac.is_segregating() # count segregating variants in windows S, windows, counts = windowed_statistic(pos, is_seg, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # absolute value of Watterson's theta theta_hat_w_abs = S / a1 # theta per base theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows, is_accessible=is_accessible, fill=fill) return theta_hat_w, windows, n_bases, counts
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None): """Calculate the value of Watterson's estimator over a given region. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. start : int, optional The position at which to start (1-based). Defaults to the first position. stop : int, optional The position at which to stop (1-based). Defaults to the last position. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- theta_hat_w : float Watterson's estimator (theta hat per base). Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w = allel.watterson_theta(pos, ac, start=1, stop=31) >>> theta_hat_w 0.10557184750733138 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # count segregating variants S = ac.count_segregating() # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate absolute value theta_hat_w_abs = S / a1 # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1:stop]) theta_hat_w = theta_hat_w_abs / n_bases return theta_hat_w
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, min_sites=3): """Calculate the value of Tajima's D in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. min_sites : int, optional Minimum number of segregating sites for which to calculate a value. If there are fewer, np.nan is returned. Defaults to 3. Returns ------- D : ndarray, float, shape (n_windows,) Tajima's D. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27] >>> D, windows, counts = allel.windowed_tajima_d(pos, ac, size=20, step=10, start=1, stop=31) >>> D array([1.36521524, 4.22566622]) >>> windows array([[ 1, 20], [11, 31]]) >>> counts array([6, 6]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # calculate constants a1 = np.sum(1 / np.arange(1, n)) a2 = np.sum(1 / (np.arange(1, n)**2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2)) e1 = c1 / a1 e2 = c2 / (a1**2 + a2) # locate segregating variants is_seg = ac.is_segregating() # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # define statistic to compute for each window # noinspection PyPep8Naming def statistic(w_is_seg, w_mpd): S = np.count_nonzero(w_is_seg) if S < min_sites: return np.nan pi = np.sum(w_mpd) d = pi - (S / a1) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) wD = d / d_stdev return wD D, windows, counts = windowed_statistic(pos, values=(is_seg, mpd), statistic=statistic, size=size, start=start, stop=stop, step=step, windows=windows, fill=np.nan) return D, windows, counts