Пример #1
0
def test_distributions_poisson_random_sample():
	d = PoissonDistribution(1)

	x = numpy.array([0, 1, 2, 2, 0])

	assert_array_almost_equal(d.sample(5, random_state=5), x)
	assert_raises(AssertionError, assert_array_almost_equal, d.sample(5), x)
Пример #2
0
def test_distributions_poisson_random_sample():
	d = PoissonDistribution(1)

	x = numpy.array([0, 1, 2, 2, 0])

	assert_array_almost_equal(d.sample(5, random_state=5), x)
	assert_raises(AssertionError, assert_array_almost_equal, d.sample(5), x)
def test_poisson():
    d = PoissonDistribution(5)

    assert_almost_equal(d.log_probability(5), -1.7403021806115442)
    assert_almost_equal(d.log_probability(10), -4.0100334487345126)
    assert_almost_equal(d.log_probability(1), -3.3905620875658995)
    assert_equal(d.log_probability(-1), float("-inf"))

    d = PoissonDistribution(0)

    assert_equal(d.log_probability(1), float("-inf"))
    assert_equal(d.log_probability(7), float("-inf"))

    d.fit([1, 6, 4, 9, 1])
    assert_equal(d.parameters[0], 4.2)

    d.fit([1, 6, 4, 9, 1], weights=[0, 0, 0, 1, 0])
    assert_equal(d.parameters[0], 9)

    d.fit([1, 6, 4, 9, 1], weights=[1, 0, 0, 1, 0])
    assert_equal(d.parameters[0], 5)

    assert_almost_equal(d.log_probability(5), -1.7403021806115442)
    assert_almost_equal(d.log_probability(10), -4.0100334487345126)
    assert_almost_equal(d.log_probability(1), -3.3905620875658995)
    assert_equal(d.log_probability(-1), float("-inf"))

    e = pickle.loads(pickle.dumps(d))
    assert_equal(e.name, "PoissonDistribution")
    assert_equal(e.parameters[0], 5)
Пример #4
0
def roh_poissonhmm(gv,
                   pos,
                   phet_roh=0.001,
                   phet_nonroh=(0.0025, 0.01),
                   transition=1e-3,
                   window_size=1000,
                   min_roh=0,
                   is_accessible=None,
                   contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Poisson HMM model. The chromosome is divided into
    equally accessible windows of specified size, then the number of hets observed in each is used
    to fit a Poisson HMM. Note this is much faster than `roh_mhmm`, but at the cost of some
    resolution.

    The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one
    or more probabilities of observing a het in a non-ROH, as this probability may not be
    constant across the genome (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos: array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh: float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh: tuple of floats, optional
        One or more probabilites of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity within
        the population, but also on mutation rate and genotype error rate.
    transition: float, optional
        Probability of moving between states. This is based on windows, so a larger window size may
        call for a larger transitional probability
    window_size: integer, optional
        Window size (equally accessible bases) to consider as a potential ROH. Setting this window
        too small may result in spurious ROH calls, while too large will result in a lack of
        resolution.
    min_roh: integer, optional
        Minimum size (bp) to condsider as a ROH. Will depend on contig size and recombination rate.
    is_accessible: array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether accessible
        or not. Although optional, highly recommended so invariant sites are distinguishable from
        sites where variation is inaccessible
    contig_size: integer, optional
        If is_accessible is not available, use this to specify the size of the contig, and assume
        all sites are accessible.


    Returns
    -------
    df_roh: DataFrame
        Data frame where each row describes a run of homozygosity. Columns are 'start',
        'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive.
    froh: float
        Proportion of genome in a ROH.

    Notes
    -----
    This function requires `pomegranate` (>= 0.9.0) to be installed.

    """

    from pomegranate import HiddenMarkovModel, PoissonDistribution

    # equally accessbile windows
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessibile argument is not provided, you must provide contig_size"
            )
        is_accessible = np.ones((contig_size, ), dtype="bool")
    else:
        contig_size = is_accessible.size

    eqw = equally_accessible_windows(is_accessible, window_size)

    ishet = GenotypeVector(gv).is_het()
    counts, wins, records = windowed_statistic(pos, ishet, np.sum, windows=eqw)

    # heterozygote probabilities
    het_px = np.concatenate([(phet_roh, ), phet_nonroh])

    # start probabilities (all equal)
    start_prob = np.repeat(1 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    dists = [PoissonDistribution(x * window_size) for x in het_px]

    model = HiddenMarkovModel.from_matrix(
        transition_probabilities=transition_mx,
        distributions=dists,
        starts=start_prob)

    prediction = np.array(model.predict(counts[:, None]))

    df_blocks = tabulate_state_blocks(prediction,
                                      states=list(range(len(het_px))))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

    # adapt the dataframe for ROH
    df_roh["start"] = df_roh.start_ridx.apply(lambda y: eqw[y, 0])
    df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: eqw[y, 1])
    df_roh["length"] = df_roh.stop - df_roh.start

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH
    froh = df_roh.length.sum() / contig_size

    return df_roh[["start", "stop", "length", "is_marginal"]], froh
Пример #5
0
def test_poisson():
	d = PoissonDistribution(5)

	assert_almost_equal(d.log_probability(5), -1.7403021806115442)
	assert_almost_equal(d.log_probability(10), -4.0100334487345126)
	assert_almost_equal(d.log_probability(1), -3.3905620875658995)
	assert_equal(d.log_probability(-1), float("-inf"))

	d = PoissonDistribution(0)

	assert_equal(d.log_probability(1), float("-inf"))
	assert_equal(d.log_probability(7), float("-inf"))

	d.fit([1, 6, 4, 9, 1])
	assert_equal(d.parameters[0], 4.2)

	d.fit([1, 6, 4, 9, 1], weights=[0, 0, 0, 1, 0])
	assert_equal(d.parameters[0], 9)

	d.fit([1, 6, 4, 9, 1], weights=[1, 0, 0, 1, 0])
	assert_equal(d.parameters[0], 5)

	assert_almost_equal(d.log_probability(5), -1.7403021806115442)
	assert_almost_equal(d.log_probability(10), -4.0100334487345126)
	assert_almost_equal(d.log_probability(1), -3.3905620875658995)
	assert_equal(d.log_probability(-1), float("-inf"))

	e = pickle.loads(pickle.dumps(d))
	assert_equal(e.name, "PoissonDistribution")
	assert_equal(e.parameters[0], 5)