예제 #1
0
    def calc(self):
        """
        Compute summary statistics for the two populations held by this
        object and store each one as an attribute on ``self``.

        Reads ``self.pop1_seqs``, ``self.pop2_seqs``, ``self.combined_seqs``,
        ``self.state_alphabet`` and ``self.ignore_uncertain``.

        Sets the following attributes:

        - ``average_number_of_pairwise_differences``
        - ``average_number_of_pairwise_differences_between``
        - ``average_number_of_pairwise_differences_within``
        - ``average_number_of_pairwise_differences_net``
        - ``num_segregating_sites``
        - ``wattersons_theta``
        - ``wakeleys_psi``
        - ``tajimas_d``

        Returns
        -------
        None

        Notes
        -----
        No guards are applied to the arithmetic: the divisions by the
        within-population pair counts, by ``d_x``, ``d_y`` and ``k``, and the
        square roots of the variance terms will raise if an operand is zero
        or negative, respectively.
        """
        # Only the summed and the squared difference counts are needed here;
        # the middle value returned by _count_differences is discarded.
        diffs_x, _, sq_diff_x = _count_differences(
            self.pop1_seqs, self.state_alphabet, self.ignore_uncertain
        )
        diffs_y, _, sq_diff_y = _count_differences(
            self.pop2_seqs, self.state_alphabet, self.ignore_uncertain
        )

        # Number of unordered sequence pairs within each population,
        # evaluated once and reused for both the mean and the variance.
        pairs_x = probability.binomial_coefficient(len(self.pop1_seqs), 2)
        pairs_y = probability.binomial_coefficient(len(self.pop2_seqs), 2)

        # Mean pairwise differences within (d_x, d_y) and between (d_xy)
        # the populations.
        d_x = diffs_x / pairs_x
        d_y = diffs_y / pairs_y
        d_xy = self._average_number_of_pairwise_differences_between_populations()

        # Variances of pairwise differences: E[X^2] - (E[X])^2.
        s2_x = (float(sq_diff_x) / pairs_x) - (d_x ** 2)
        s2_y = (float(sq_diff_y) / pairs_y) - (d_y ** 2)
        s2_xy = self._variance_of_pairwise_differences_between_populations(d_xy)

        # Sample sizes and the n * (n - 1) weights used by Wakeley's psi.
        n = len(self.combined_seqs)
        n_x = float(len(self.pop1_seqs))
        n_y = float(len(self.pop2_seqs))
        a = float(n * (n - 1))
        ax = float(n_x * (n_x - 1))
        ay = float(n_y * (n_y - 1))

        # Mean pairwise differences over the pooled sample.
        k = _average_number_of_pairwise_differences(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain
        )

        # Hickerson 2006: pi #
        self.average_number_of_pairwise_differences = k

        # Hickerson 2006: pi_b #
        self.average_number_of_pairwise_differences_between = d_xy

        # Hickerson 2006: pi_w #
        self.average_number_of_pairwise_differences_within = d_x + d_y

        # Hickerson 2006: pi_net #
        self.average_number_of_pairwise_differences_net = d_xy - (d_x + d_y)

        # Hickerson 2006: S #
        self.num_segregating_sites = _num_segregating_sites(
            self.combined_seqs, self.state_alphabet, self.ignore_uncertain
        )

        # Hickerson 2006: theta #
        # a1 is the harmonic sum 1 + 1/2 + ... + 1/(n - 1).
        a1 = sum(1.0 / i for i in range(1, n))
        self.wattersons_theta = float(self.num_segregating_sites) / a1

        # Wakeley 1996 #
        self.wakeleys_psi = (float(1) / (a)) * (
            ax * (math.sqrt(s2_x) / d_x)
            + ay * (math.sqrt(s2_y) / d_y)
            + (2 * n_x * n_y * math.sqrt(s2_xy) / k)
        )

        # Tajima's D #
        self.tajimas_d = _tajimas_d(
            n, self.average_number_of_pairwise_differences, self.num_segregating_sites
        )
예제 #2
0
def expected_tmrca(n_genes, pop_size=None, n_to_coalesce=2):
    """
    Expected (mean) waiting time for ``n_to_coalesce`` genes to coalesce in a
    sample of ``n_genes`` genes.

    The value is the reciprocal of
    ``probability.binomial_coefficient(n_genes, n_to_coalesce)``, multiplied
    by ``pop_size`` whenever ``pop_size`` is not ``None``.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals. If ``None`` (the default), the
        unscaled expectation is returned. Note that only ``None`` disables
        the scaling: a ``pop_size`` of 0 yields an expectation of 0.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.

    Returns
    -------
    k : float
        The expected waiting time (in continuous time) for ``n_to_coalesce``
        genes to coalesce out of a sample of ``n_genes``, scaled by
        ``pop_size`` if one was given.

    """
    n_combinations = probability.binomial_coefficient(n_genes, n_to_coalesce)
    expected_wait = 1.0 / n_combinations
    if pop_size is None:
        return expected_wait
    return expected_wait * pop_size
예제 #3
0
def time_to_coalescence(n_genes,
        pop_size=None,
        n_to_coalesce=2,
        rng=None):
    r"""
    A random draw from the "Kingman distribution" (continuous time version):
    waiting time until ``n_to_coalesce`` of ``n_genes`` gene lineages coalesce
    in a continuous-time Wright-Fisher population of ``pop_size`` genes.

    The draw is an exponentially-distributed random number with rate
    $\binom{n\_genes}{n\_to\_coalesce}$ (and hence expectation
    $1 / \binom{n\_genes}{n\_to\_coalesce}$), obtained from
    ``rng.expovariate`` and then multiplied by ``pop_size``.

    ``pop_size`` is the effective *haploid* population size; i.e., number of
    genes in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals. If ``pop_size`` is 1 or 0
    or None, no rescaling takes place and time is in haploid population
    units; i.e. where 1 unit of time equals 2N generations for a diploid
    population of size N, or N generations for a haploid population of size
    N. Otherwise time is in generations.

    Parameters
    ----------
    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : `Random`
        The random number generator instance to use. Falls back to
        ``GLOBAL_RNG`` when ``None``.

    Returns
    -------
    k : float
        A randomly-generated waiting time (in continuous time) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes`` in a
        population of ``pop_size`` genes.
    """
    generator = GLOBAL_RNG if rng is None else rng
    # Any falsy pop_size (None or 0) means "leave the draw unscaled".
    scale = pop_size if pop_size else 1.0
    coalescence_rate = probability.binomial_coefficient(n_genes, n_to_coalesce)
    return generator.expovariate(coalescence_rate) * scale
예제 #4
0
def _average_number_of_pairwise_differences(char_sequences, state_alphabet, ignore_uncertain=True):
    r"""
    Returns $k$ (Tajima 1983; Wakeley 1996), the average number of pairwise
    differences for a set of sequences:

        k = \frac{\sum \sum k_{ij}}{\binom{n}{2}}

    where $k_{ij}$ is the number of pairwise differences between the
    $i$th and $j$th sequence, and $n$ is the number of DNA sequences
    sampled.

    The numerator is the first value returned by ``_count_differences`` and
    the denominator is ``probability.binomial_coefficient(n, 2)`` with
    ``n = len(char_sequences)``.
    """
    difference_counts = _count_differences(char_sequences, state_alphabet, ignore_uncertain)
    # Only the summed count is used; the remaining two values are ignored.
    total_differences, _unused_mean, _unused_squares = difference_counts
    n_pairs = probability.binomial_coefficient(len(char_sequences), 2)
    return total_differences / n_pairs
예제 #5
0
def discrete_time_to_coalescence(n_genes,
                                 pop_size=None,
                                 n_to_coalesce=2,
                                 rng=None):
    """
    A random draw from the "Kingman distribution" (discrete time version):
    waiting time, in whole generations, until ``n_to_coalesce`` of ``n_genes``
    gene lineages coalesce in a discrete-time Wright-Fisher population of
    ``pop_size`` genes.

    The per-generation probability of a coalescence event is taken to be
    ``binomial_coefficient(n_genes, n_to_coalesce) / pop_size`` and the
    waiting time is drawn from a geometric distribution with that success
    probability via ``probability.geometric_rv``. The probability is passed
    on unclamped, so it exceeds 1 when the number of lineage combinations is
    larger than ``pop_size``; handling of that case is left to
    ``probability.geometric_rv``.

    Parameters
    ----------

    n_genes : integer
        The number of genes in the sample.
    pop_size : integer
        The effective *haploid* population size; i.e., number of genes in the
        population: 2 * N in a diploid population of N individuals, or N in a
        haploid population of N individuals. Required: a time measured in
        generations is undefined without it.
    n_to_coalesce : integer
        The waiting time that will be returned will be the waiting time for
        this number of genes in the sample to coalesce.
    rng : `Random`
        The random number generator instance. Falls back to ``GLOBAL_RNG``
        when ``None``.

    Returns
    -------
    k : integer
        A randomly-generated waiting time (in discrete generations) for
        ``n_to_coalesce`` genes to coalesce out of a sample of ``n_genes`` in a
        population of ``pop_size`` genes.

    Raises
    ------
    ValueError
        If ``pop_size`` is ``None`` or 0.

    """
    if not pop_size:
        # Previously this path failed with an incidental TypeError
        # (None / int) or with a zero success probability.
        raise ValueError(
            "pop_size must be a positive number of genes for a "
            "discrete-time coalescence draw, got %r" % (pop_size,)
        )
    if rng is None:
        rng = GLOBAL_RNG
    n_combinations = probability.binomial_coefficient(n_genes, n_to_coalesce)
    # Chance that a coalescence happens in any one generation; float() keeps
    # this a true division under Python 2 integer semantics as well.
    p = float(n_combinations) / pop_size
    # NOTE(review): assumes probability.geometric_rv accepts the generator as
    # its second positional argument -- confirm against its definition.
    # The geometric draw is already a count of generations, so it is returned
    # without any further rescaling by pop_size.
    return probability.geometric_rv(p, rng)