Exemplo n.º 1
0
    def test_sequence_divergence(self):
        """Exercise ``sequence_divergence`` on two populations whose allele
        counts are opposite at every variant: without a region, with a
        start/stop region, and with allele numbers passed in explicitly.
        """
        from allel.stats import sequence_divergence

        positions = [2, 4, 8]
        counts_first = AlleleCountsArray([
            [2, 0],
            [2, 0],
            [2, 0],
        ])
        counts_second = AlleleCountsArray([
            [0, 2],
            [0, 2],
            [0, 2],
        ])

        # no start/stop given
        expected_all = 3 / 7
        observed_all = sequence_divergence(positions, counts_first,
                                           counts_second)
        eq(expected_all, observed_all)

        # region limited by start/stop
        region = dict(start=0, stop=5)
        expected_region = 2 / 6
        observed_region = sequence_divergence(positions, counts_first,
                                              counts_second, **region)
        eq(expected_region, observed_region)

        # same region, with per-variant allele numbers supplied by the caller
        totals_first = counts_first.sum(axis=1)
        totals_second = counts_second.sum(axis=1)
        observed_with_an = sequence_divergence(positions, counts_first,
                                               counts_second,
                                               an1=totals_first,
                                               an2=totals_second,
                                               **region)
        eq(expected_region, observed_with_an)
Exemplo n.º 2
0
def patterson_f3(acc, aca, acb):
    """Compute the unbiased estimator for F3(C; A, B), the three-population
    test for admixture in population C.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).

    Returns
    -------
    T : ndarray, float, shape (n_variants,)
        Un-normalized f3 estimates per variant.
    B : ndarray, float, shape (n_variants,)
        Estimates for heterozygosity in population C (twice the value
        returned by ``h_hat`` for C).

    Raises
    ------
    AssertionError
        If any input does not have exactly two allele columns.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    The `B` return value can be ignored when only un-normalized f3
    statistics are wanted. The normalized f3* statistic is obtained as
    ``np.sum(T) / np.sum(B)``.

    """

    def _as_biallelic(counts):
        # wrap (copy=False) and insist on exactly two allele columns
        wrapped = AlleleCountsArray(counts, copy=False)
        assert wrapped.shape[1] == 2, 'only biallelic variants supported'
        return wrapped

    # validate inputs in the order A, B, C, then check they line up
    aca = _as_biallelic(aca)
    acb = _as_biallelic(acb)
    acc = _as_biallelic(acc)
    check_dim0_aligned(aca, acb, acc)

    # allele number and heterozygosity for the test population
    n_c = acc.sum(axis=1)
    het_c = h_hat(acc)

    # alternate-allele sample frequencies (column 1) in each population
    freq_a = aca.to_frequencies()[:, 1]
    freq_b = acb.to_frequencies()[:, 1]
    freq_c = acc.to_frequencies()[:, 1]

    # per-variant estimator with the heterozygosity correction term
    product = (freq_c - freq_a) * (freq_c - freq_b)
    f3 = product - (het_c / n_c)
    het_scale = 2 * het_c

    return f3, het_scale
Exemplo n.º 3
0
    def test_slice_types(self):
        """Check which indexing operations keep the AlleleCountsArray type
        and which fall back to plain numpy objects.
        """
        counts = AlleleCountsArray(allele_counts_data, dtype='u1')

        # a row slice is still an AlleleCountsArray
        row_slice = counts[1:]
        assert_is_instance(row_slice, AlleleCountsArray)

        # these selections must all come back as plain ndarrays
        ndarray_keys = (
            (slice(None), slice(1, None)),  # col slice, i.e. [:, 1:]
            0,                              # row index, i.e. [0]
            (slice(None), 0),               # col index, i.e. [:, 0]
        )
        for key in ndarray_keys:
            selected = counts[key]
            assert_is_instance(selected, np.ndarray)
            assert_not_is_instance(selected, AlleleCountsArray)

        # a single item is a numpy scalar of the array's dtype
        item = counts[0, 0]
        assert_is_instance(item, np.uint8)
        assert_not_is_instance(item, AlleleCountsArray)
0
def patterson_d(aca, acb, acc, acd):
    """Compute the unbiased estimator for D(A, B; C, D), the normalised
    four-population test for admixture between (A or B) and (C or D), also
    known as the "ABBA BABA" test.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Numerator (un-normalised f4 estimates).
    den : ndarray, float, shape (n_variants,)
        Denominator.

    Raises
    ------
    AssertionError
        If any input does not have exactly two allele columns.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    The `den` return value can be ignored when only un-normalized f4
    statistics are wanted.

    """

    def _as_biallelic(counts):
        # wrap (copy=False) and insist on exactly two allele columns
        wrapped = AlleleCountsArray(counts, copy=False)
        assert wrapped.shape[1] == 2, 'only biallelic variants supported'
        return wrapped

    # validate inputs in the order A, B, C, D, then check they line up
    aca = _as_biallelic(aca)
    acb = _as_biallelic(acb)
    acc = _as_biallelic(acc)
    acd = _as_biallelic(acd)
    check_dim0_aligned(aca, acb, acc, acd)

    # alternate-allele sample frequencies (column 1) in each population
    freq_a = aca.to_frequencies()[:, 1]
    freq_b = acb.to_frequencies()[:, 1]
    freq_c = acc.to_frequencies()[:, 1]
    freq_d = acd.to_frequencies()[:, 1]

    # numerator: product of the two frequency differences
    f4 = (freq_a - freq_b) * (freq_c - freq_d)

    # denominator: product of the two pairwise terms
    term_ab = freq_a + freq_b - (2 * freq_a * freq_b)
    term_cd = freq_c + freq_d - (2 * freq_c * freq_d)
    scale = term_ab * term_cd

    return f4, scale
Exemplo n.º 5
0
def patterson_f3(acc, aca, acb):
    """Compute the unbiased estimator for F3(C; A, B), the three-population
    test for admixture in population C.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).

    Returns
    -------
    T : ndarray, float, shape (n_variants,)
        Un-normalized f3 estimates per variant.
    B : ndarray, float, shape (n_variants,)
        Estimates for heterozygosity in population C (twice the value
        returned by ``h_hat`` for C).

    Raises
    ------
    AssertionError
        If any input does not have exactly two allele columns.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    The `B` return value can be ignored when only un-normalized f3
    statistics are wanted. The normalized f3* statistic is obtained as
    ``np.sum(T) / np.sum(B)``.

    """

    def _as_biallelic(counts):
        # wrap (copy=False) and insist on exactly two allele columns
        wrapped = AlleleCountsArray(counts, copy=False)
        assert wrapped.shape[1] == 2, 'only biallelic variants supported'
        return wrapped

    # validate inputs in the order A, B, C, then check they line up
    aca = _as_biallelic(aca)
    acb = _as_biallelic(acb)
    acc = _as_biallelic(acc)
    check_dim0_aligned(aca, acb, acc)

    # allele number and heterozygosity for the test population
    n_c = acc.sum(axis=1)
    het_c = h_hat(acc)

    # alternate-allele sample frequencies (column 1) in each population
    freq_a = aca.to_frequencies()[:, 1]
    freq_b = acb.to_frequencies()[:, 1]
    freq_c = acc.to_frequencies()[:, 1]

    # per-variant estimator with the heterozygosity correction term
    product = (freq_c - freq_a) * (freq_c - freq_b)
    f3 = product - (het_c / n_c)
    het_scale = 2 * het_c

    return f3, het_scale
Exemplo n.º 6
0
    def test_constructor(self):
        """Check that AlleleCountsArray rejects missing or unsuitable data
        with TypeError and accepts valid data with an explicit dtype.
        """

        # calling with no data argument at all
        with self.assertRaises(TypeError):
            # noinspection PyArgumentList
            AlleleCountsArray()

        # each of these inputs must be rejected
        rejected_inputs = (
            'foo bar',              # wrong dtype
            [4., 5., 3.7],          # wrong dtype
            [1, 2, 3],              # wrong dimensions
            diploid_genotype_data,  # wrong dimensions
        )
        for bad_data in rejected_inputs:
            with self.assertRaises(TypeError):
                AlleleCountsArray(bad_data)

        # valid data with an explicit dtype keeps values and dtype
        counts = AlleleCountsArray(allele_counts_data, dtype='u1')
        aeq(allele_counts_data, counts)
        eq(np.uint8, counts.dtype)
0
def patterson_f2(aca, acb):
    """Compute the unbiased estimator for F2(A, B), the branch length
    between populations A and B.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.

    Returns
    -------
    f2 : ndarray, float, shape (n_variants,)
        Per-variant f2 estimates.

    Raises
    ------
    AssertionError
        If either input does not have exactly two allele columns.

    Notes
    -----
    See Patterson (2012), Appendix A.

    """

    def _as_biallelic(counts):
        # wrap (copy=False) and insist on exactly two allele columns
        wrapped = AlleleCountsArray(counts, copy=False)
        assert wrapped.shape[1] == 2, 'only biallelic variants supported'
        return wrapped

    # validate inputs, then check they line up
    aca = _as_biallelic(aca)
    acb = _as_biallelic(acb)
    check_dim0_aligned(aca, acb)

    # allele numbers per variant
    n_a = aca.sum(axis=1)
    n_b = acb.sum(axis=1)

    # heterozygosity estimates
    het_a = h_hat(aca)
    het_b = h_hat(acb)

    # alternate-allele sample frequencies (column 1)
    freq_a = aca.to_frequencies()[:, 1]
    freq_b = acb.to_frequencies()[:, 1]

    # squared frequency difference minus one correction term per population
    squared_diff = (freq_a - freq_b) ** 2
    f2 = squared_diff - (het_a / n_a) - (het_b / n_b)

    return f2
Exemplo n.º 8
0
def patterson_f2(aca, acb):
    """Compute the unbiased estimator for F2(A, B), the branch length
    between populations A and B.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.

    Returns
    -------
    f2 : ndarray, float, shape (n_variants,)
        Per-variant f2 estimates.

    Raises
    ------
    AssertionError
        If either input does not have exactly two allele columns.

    Notes
    -----
    See Patterson (2012), Appendix A.

    """

    def _as_biallelic(counts):
        # wrap (copy=False) and insist on exactly two allele columns
        wrapped = AlleleCountsArray(counts, copy=False)
        assert wrapped.shape[1] == 2, 'only biallelic variants supported'
        return wrapped

    # validate inputs, then check they line up
    aca = _as_biallelic(aca)
    acb = _as_biallelic(acb)
    check_dim0_aligned(aca, acb)

    # allele numbers per variant
    n_a = aca.sum(axis=1)
    n_b = acb.sum(axis=1)

    # heterozygosity estimates
    het_a = h_hat(aca)
    het_b = h_hat(acb)

    # alternate-allele sample frequencies (column 1)
    freq_a = aca.to_frequencies()[:, 1]
    freq_b = acb.to_frequencies()[:, 1]

    # squared frequency difference minus one correction term per population
    squared_diff = (freq_a - freq_b) ** 2
    f2 = squared_diff - (het_a / n_a) - (het_b / n_b)

    return f2
Exemplo n.º 9
0
 def setup_instance(self, data):
     """Build and return an AlleleCountsArray from ``data``."""
     instance = AlleleCountsArray(data)
     return instance
Exemplo n.º 10
0
def patterson_d(aca, acb, acc, acd):
    """Compute the unbiased estimator for D(A, B; C, D), the normalised
    four-population test for admixture between (A or B) and (C or D), also
    known as the "ABBA BABA" test.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Numerator (un-normalised f4 estimates).
    den : ndarray, float, shape (n_variants,)
        Denominator.

    Raises
    ------
    AssertionError
        If any input does not have exactly two allele columns.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    The `den` return value can be ignored when only un-normalized f4
    statistics are wanted.

    """

    def _as_biallelic(counts):
        # wrap (copy=False) and insist on exactly two allele columns
        wrapped = AlleleCountsArray(counts, copy=False)
        assert wrapped.shape[1] == 2, 'only biallelic variants supported'
        return wrapped

    # validate inputs in the order A, B, C, D, then check they line up
    aca = _as_biallelic(aca)
    acb = _as_biallelic(acb)
    acc = _as_biallelic(acc)
    acd = _as_biallelic(acd)
    check_dim0_aligned(aca, acb, acc, acd)

    # alternate-allele sample frequencies (column 1) in each population
    freq_a = aca.to_frequencies()[:, 1]
    freq_b = acb.to_frequencies()[:, 1]
    freq_c = acc.to_frequencies()[:, 1]
    freq_d = acd.to_frequencies()[:, 1]

    # numerator: product of the two frequency differences
    f4 = (freq_a - freq_b) * (freq_c - freq_d)

    # denominator: product of the two pairwise terms
    term_ab = freq_a + freq_b - (2 * freq_a * freq_b)
    term_cd = freq_c + freq_d - (2 * freq_c * freq_d)
    scale = term_ab * term_cd

    return f4, scale
Exemplo n.º 11
0
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan):
    """Compute Tajima's D in windows along a single chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        Fill value handed on to ``windowed_statistic``.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> D, windows, counts = allel.stats.windowed_tajima_d(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> D
    array([ 0.59158014,  2.93397641,  6.12372436])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> counts
    array([3, 4, 2])

    """

    # coerce inputs to the expected wrapper types where necessary
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # the largest allele number is taken as the number of chromosomes
    # sampled, on the assumption that it is constant across variants
    n = ac.sum(axis=1).max()

    # constants of the variance term; they depend only on n
    ks = np.arange(1, n)
    harmonic = np.sum(1 / ks)
    harmonic_sq = np.sum(1 / (ks ** 2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / harmonic)
    c2 = b2 - ((n + 2) / (harmonic * n)) + (harmonic_sq / (harmonic ** 2))
    var_linear = c1 / harmonic
    var_quadratic = c2 / (harmonic ** 2 + harmonic_sq)

    # per-variant inputs to the windowed computation
    seg_mask = ac.is_segregating()
    pairwise = mean_pairwise_difference(ac, fill=0)

    def _tajima_in_window(w_seg_mask, w_pairwise):
        # D for one window: (pi - S / a1) over its estimated std deviation
        n_seg = np.count_nonzero(w_seg_mask)
        diff = np.sum(w_pairwise) - (n_seg / harmonic)
        spread = np.sqrt(
            (var_linear * n_seg) + (var_quadratic * n_seg * (n_seg - 1))
        )
        return diff / spread

    D, windows, counts = windowed_statistic(
        pos,
        values=(seg_mask, pairwise),
        statistic=_tajima_in_window,
        size=size,
        start=start,
        stop=stop,
        step=step,
        windows=windows,
        fill=fill,
    )

    return D, windows, counts
Exemplo n.º 12
0
def tajima_d(ac, pos=None, start=None, stop=None):
    """Calculate the value of Tajima's D over a given region.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    pos : array_like, int, shape (n_items,), optional
        Variant positions, using 1-based coordinates, in ascending order.
        Only used when `start` and/or `stop` is given.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).

    Returns
    -------
    D : float
        Tajima's D, or ``np.nan`` when there are no segregating variants
        (which includes the case of no variants at all).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> allel.stats.tajima_d(ac)
    3.1445848780213814
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> allel.stats.tajima_d(ac, pos=pos, start=7, stop=25)
    3.8779735196179366

    """

    # check inputs
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # deal with subregion
    if pos is not None and (start is not None or stop is not None):
        if not isinstance(pos, SortedIndex):
            pos = SortedIndex(pos, copy=False)
        loc = pos.locate_range(start, stop)
        ac = ac[loc]

    # count segregating variants
    S = ac.count_segregating()

    # With no segregating variants both the numerator and the standard
    # deviation below are zero, so D is undefined. Returning early also
    # avoids calling .max() on a zero-size array when there are no variants.
    if S == 0:
        return np.nan

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = np.sum(1 / np.arange(1, n))

    # calculate Watterson's theta (absolute value)
    theta_hat_w_abs = S / a1

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # calculate theta_hat pi (sum differences over variants)
    theta_hat_pi_abs = np.sum(mpd)

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference
    d = theta_hat_pi_abs - theta_hat_w_abs

    # calculate the denominator (standard deviation)
    a2 = np.sum(1 / (np.arange(1, n) ** 2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)
    d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))

    # finally calculate Tajima's D
    D = d / d_stdev

    return D
Exemplo n.º 13
0
def windowed_watterson_theta(
    pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Compute Watterson's estimator in windows along a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.stats.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([ 0.10909091,  0.16363636,  0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # coerce inputs to the expected types where necessary
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # number of segregating variants per window (0 is the fill value here)
    seg_mask = ac.is_segregating()
    n_seg, windows, counts = windowed_statistic(
        pos,
        seg_mask,
        statistic=np.count_nonzero,
        size=size,
        start=start,
        stop=stop,
        step=step,
        windows=windows,
        fill=0,
    )

    # the largest allele number is taken as the number of chromosomes
    # sampled, on the assumption that it is constant across variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n))

    # absolute theta per window, then scaled to a per-base value
    theta_abs = n_seg / harmonic
    theta_hat_w, n_bases = per_base(
        theta_abs,
        windows=windows,
        is_accessible=is_accessible,
        fill=fill,
    )

    return theta_hat_w, windows, n_bases, counts
Exemplo n.º 14
0
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Compute Watterson's estimator over a given region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). If not given, the first
        position in the (selected) region is used.
    stop : int, optional
        The position at which to stop (1-based). If not given, the last
        position in the (selected) region is used.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.stats.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # coerce inputs to the expected types where necessary
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict positions and counts to the requested region, if any
    region_requested = not (start is None and stop is None)
    if region_requested:
        selection = pos.locate_range(start, stop)
        pos, ac = pos[selection], ac[selection]

    # fall back to the first/last remaining position for missing bounds
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # number of segregating variants
    n_seg = ac.count_segregating()

    # the largest allele number is taken as the number of chromosomes
    # sampled, on the assumption that it is constant across variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n))

    # absolute value of the estimator
    theta_abs = n_seg / harmonic

    # number of bases to normalise by: accessible ones if a mask was given,
    # otherwise the full inclusive span start..stop
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return theta_abs / n_bases
Exemplo n.º 15
0
def tajima_d(ac, pos=None, start=None, stop=None, min_sites=3):
    """Compute Tajima's D over a given region.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    pos : array_like, int, shape (n_items,), optional
        Variant positions, using 1-based coordinates, in ascending order.
    start : int, optional
        The position at which to start (1-based). Defaults to the first position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last position.
    min_sites : int, optional
        Minimum number of segregating sites for which to calculate a value. If
        there are fewer, np.nan is returned. Defaults to 3.

    Returns
    -------
    D : float

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> allel.tajima_d(ac)
    3.1445848780213814
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> allel.tajima_d(ac, pos=pos, start=7, stop=25)
    3.8779735196179366

    """

    # wrap raw counts unless the object already offers the needed method
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # narrow to the requested region; needs positions plus at least one bound
    region_requested = not (start is None and stop is None)
    if pos is not None and region_requested:
        if not isinstance(pos, SortedIndex):
            pos = SortedIndex(pos, copy=False)
        ac = ac[pos.locate_range(start, stop)]

    # too few segregating variants: no value is calculated
    n_seg = ac.count_segregating()
    if n_seg < min_sites:
        return np.nan

    # the largest allele number is taken as the number of chromosomes
    # sampled, on the assumption that it is constant across variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number and its squared-term counterpart
    ks = np.arange(1, n)
    harmonic = np.sum(1 / ks)

    # Watterson's theta (absolute value)
    theta_w = n_seg / harmonic

    # theta_hat pi: mean pairwise differences summed over variants
    theta_pi = np.sum(mean_pairwise_difference(ac, fill=0))

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here the absolute difference is wanted
    numerator = theta_pi - theta_w

    # standard deviation of the numerator
    harmonic_sq = np.sum(1 / (ks**2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / harmonic)
    c2 = b2 - ((n + 2) / (harmonic * n)) + (harmonic_sq / (harmonic**2))
    var_linear = c1 / harmonic
    var_quadratic = c2 / (harmonic**2 + harmonic_sq)
    spread = np.sqrt(
        (var_linear * n_seg) + (var_quadratic * n_seg * (n_seg - 1))
    )

    return numerator / spread
Exemplo n.º 16
0
def windowed_watterson_theta(pos,
                             ac,
                             size=None,
                             start=None,
                             stop=None,
                             step=None,
                             windows=None,
                             is_accessible=None,
                             fill=np.nan):
    """Compute Watterson's estimator in windows along a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([0.10909091, 0.16363636, 0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # coerce inputs to the expected types where necessary
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # number of segregating variants per window (0 is the fill value here)
    seg_mask = ac.is_segregating()
    window_options = dict(size=size, start=start, stop=stop, step=step,
                          windows=windows)
    n_seg, windows, counts = windowed_statistic(
        pos, seg_mask, statistic=np.count_nonzero, fill=0, **window_options
    )

    # the largest allele number is taken as the number of chromosomes
    # sampled, on the assumption that it is constant across variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n))

    # absolute theta per window, then scaled to a per-base value
    theta_abs = n_seg / harmonic
    theta_hat_w, n_bases = per_base(
        theta_abs, windows=windows, is_accessible=is_accessible, fill=fill
    )

    return theta_hat_w, windows, n_bases, counts
Exemplo n.º 17
0
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate Watterson's theta, per base, for a single genomic region.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Positions of the variants (1-based coordinates), sorted ascending.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts, one row per variant.
    start : int, optional
        First position of the region (1-based). When omitted, the first
        variant position is used.
    stop : int, optional
        Last position of the region (1-based). When omitted, the last
        variant position is used.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Accessibility flag for every position of the chromosome/contig. When
        given, only accessible positions inside the region are counted as
        bases.

    Returns
    -------
    theta_hat_w : float
        Watterson's estimator divided by the number of bases in the region.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # coerce the inputs to the container types used below
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict to the requested region as soon as either bound is supplied
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos, ac = pos[region], ac[region]

    # a missing bound falls back to the outermost remaining variant position
    first = pos[0] if start is None else start
    last = pos[-1] if stop is None else stop

    # number of segregating variants in the region
    n_seg = ac.count_segregating()

    # the sample size is taken to be the largest per-variant allele total,
    # i.e. it is assumed constant across variants
    n_chrom = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n_chrom))

    # estimator for the region as a whole
    theta_abs = n_seg / harmonic

    # number of bases the estimate is spread over
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[first - 1:last])
    else:
        n_bases = last - first + 1

    return theta_abs / n_bases
Exemplo n.º 18
0
def windowed_tajima_d(pos,
                      ac,
                      size=None,
                      start=None,
                      stop=None,
                      step=None,
                      windows=None,
                      min_sites=3):
    """Compute Tajima's D for each window along one chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Positions of the variants (1-based coordinates), sorted ascending.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts, one row per variant.
    size : int, optional
        Window size in bases.
    start : int, optional
        Position (1-based) at which windowing starts.
    stop : int, optional
        Position (1-based) at which windowing stops.
    step : int, optional
        Distance between the start positions of consecutive windows. When
        omitted it equals the window size, i.e., windows do not overlap.
    windows : array_like, int, shape (n_windows, 2), optional
        Explicit (window_start, window_stop) pairs in 1-based coordinates.
        Takes precedence over size/start/stop/step.
    min_sites : int, optional
        Smallest number of segregating sites a window must contain for a
        value to be computed; windows with fewer yield np.nan. Defaults to 3.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D per window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as (window_start, window_stop) pairs in 1-based
        coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27]
    >>> D, windows, counts = allel.windowed_tajima_d(
    ...     pos, ac, size=20, step=10, start=1, stop=31
    ... )
    >>> D
    array([1.36521524, 4.22566622])
    >>> windows
    array([[ 1, 20],
           [11, 31]])
    >>> counts
    array([6, 6])

    """

    # coerce the inputs to the container types used below
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # the sample size is taken to be the largest per-variant allele total,
    # i.e. it is assumed constant across variants
    n = ac.sum(axis=1).max()

    # constants of the statistic; they depend on the sample size only, so
    # they are computed once and shared by every window
    k = np.arange(1, n)
    a1 = np.sum(1 / k)
    a2 = np.sum(1 / (k**2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)

    # per-variant inputs to the windowed computation
    seg_flags = ac.is_segregating()
    pairwise_diff = mean_pairwise_difference(ac, fill=0)

    def _window_d(seg_in_window, diff_in_window):
        # Tajima's D for one window, or NaN when too few sites segregate
        n_seg = np.count_nonzero(seg_in_window)
        if n_seg < min_sites:
            return np.nan
        pi = np.sum(diff_in_window)
        numerator = pi - (n_seg / a1)
        denominator = np.sqrt((e1 * n_seg) + (e2 * n_seg * (n_seg - 1)))
        return numerator / denominator

    tajima_d, windows, counts = windowed_statistic(
        pos,
        values=(seg_flags, pairwise_diff),
        statistic=_window_d,
        size=size,
        start=start,
        stop=stop,
        step=step,
        windows=windows,
        fill=np.nan,
    )

    return tajima_d, windows, counts
Exemplo n.º 19
0
def pbs(ac1,
        ac2,
        ac3,
        window_size,
        window_start=0,
        window_stop=None,
        window_step=None,
        normed=True):
    r"""Compute the population branching statistic (PBS) which performs a comparison
    of allele frequencies between three populations to detect genome regions that are
    unusually differentiated in one population relative to the other two populations.

    Parameters
    ----------
    ac1 : array_like, int
        Allele counts from the first population.
    ac2 : array_like, int
        Allele counts from the second population.
    ac3 : array_like, int
        Allele counts from the third population.
    window_size : int
        The window size (number of variants) within which to compute PBS values.
    window_start : int, optional
        The variant index at which to start windowed calculations.
    window_stop : int, optional
        The variant index at which to stop windowed calculations.
    window_step : int, optional
        The number of variants between start positions of windows. If not given, defaults
        to the window size, i.e., non-overlapping windows.
    normed : bool, optional
        If True (default), use the normalised version of PBS, also known as PBSn1 [2]_.
        Otherwise, use the PBS statistic as originally defined in [1]_.

    Returns
    -------
    pbs : ndarray, float
        Windowed PBS values.

    Notes
    -----
    The F\ :sub:`ST` calculations use Hudson's estimator.

    Each windowed F\ :sub:`ST` value is clipped to the interval [0, 0.99999]
    before the ``-log(1 - fst)`` transform is applied.

    References
    ----------
    .. [1] Yi et al., "Sequencing of Fifty Human Exomes Reveals Adaptation to High
       Altitude", Science, 329(5987): 75–78, 2 July 2010.
    .. [2] Malaspinas et al., "A genomic history of Aboriginal Australia", Nature. volume
       538, pages 207–214, 13 October 2016.

    """
    # NOTE: the docstring is a raw string because the reST markup above
    # contains a backslash-space, which is an invalid escape sequence in a
    # regular string literal.

    # normalise and check inputs
    ac1 = AlleleCountsArray(ac1)
    ac2 = AlleleCountsArray(ac2)
    ac3 = AlleleCountsArray(ac3)
    check_dim0_aligned(ac1, ac2, ac3)

    # the same windowing is used for every pairwise comparison
    window_kwargs = dict(size=window_size,
                         start=window_start,
                         stop=window_stop,
                         step=window_step)

    # compute windowed fst for each pair of populations
    fst12 = moving_hudson_fst(ac1, ac2, **window_kwargs)
    fst13 = moving_hudson_fst(ac1, ac3, **window_kwargs)
    fst23 = moving_hudson_fst(ac2, ac3, **window_kwargs)

    # clip fst values in place to avoid infinite if fst is 1
    for fst in (fst12, fst13, fst23):
        np.clip(fst, a_min=0, a_max=0.99999, out=fst)

    # compute fst transform
    t12 = -np.log(1 - fst12)
    t13 = -np.log(1 - fst13)
    t23 = -np.log(1 - fst23)

    # compute pbs
    ret = (t12 + t13 - t23) / 2

    if normed:
        # compute pbs normalising constant
        norm = 1 + (t12 + t13 + t23) / 2
        ret = ret / norm

    return ret