예제 #1
0
def multinomial_bootstrap_ci(count_vec, uncovered_estimator,
                             alpha=0.05, bootstraps=1000,
                             random_state=0):
    """
    Bootstrap confidence intervals for coverage-scaled proportions

    The counts are passed through `closure` and used as the probabilities
    of a multinomial distribution, from which `bootstraps` count vectors
    with the same total as `count_vec` are drawn.  For every replicate the
    unobserved probability is estimated with `uncovered_estimator`, and
    the closed replicate is scaled by one minus that estimate.  The bounds
    returned are the `alpha/2` and `1 - alpha/2` percentiles taken over
    the replicates.

    Parameters
    ----------
    count_vec : np.array
       Count vector
    uncovered_estimator : function
       Discovery estimator (aka unobserved probability estimator),
       applied along the last axis of the replicate matrix
    alpha : float
       Significance value; the interval spans the `alpha/2` and
       `1 - alpha/2` quantiles
    bootstraps : int
       Number of bootstrap replicates to draw
    random_state : np.random.RandomState or int
       Passed to `check_random_state` to obtain the random generator

    Returns
    -------
    LB_p : np.array
       Lower bounds in confidence intervals for composition
    UB_p : np.array
       Upper bounds in confidence intervals for composition
    LB_cover : float
       Lower bound for discovery probability
    UB_cover : float
       Upper bound for discovery probability
    """
    rng = check_random_state(random_state)

    # Redraw count vectors with the observed total and observed proportions
    total = count_vec.sum()
    observed_p = closure(count_vec)
    replicates = rng.multinomial(total, observed_p, size=bootstraps)

    # One unobserved-probability estimate per replicate, turned into a
    # column so that it scales every entry of the matching row
    replicate_p = closure(replicates)
    unobserved = np.apply_along_axis(uncovered_estimator, -1, replicates)
    observed_mass = np.atleast_2d(1 - unobserved).T
    scaled_p = replicate_p * observed_mass

    # Percentile ranks (on the 0-100 scale) of the two interval tails
    lower_q = alpha/2 * 100
    upper_q = (1-alpha/2) * 100

    LB_p = np.percentile(scaled_p, lower_q, axis=0)
    UB_p = np.percentile(scaled_p, upper_q, axis=0)
    LB_cover = np.percentile(unobserved, lower_q)
    UB_cover = np.percentile(unobserved, upper_q)
    return LB_p, UB_p, LB_cover, UB_cover
예제 #2
0
def variation_distance(p1, p2):
    """
    Total variation distance between two compositions

    Both inputs are passed through `closure` first; the result is half the
    sum of the absolute element-wise differences between the two closed
    vectors.

    Parameters
    ----------
    p1 : np.array
       composition vector
    p2 : np.array
       composition vector

    Returns
    -------
    float :
       Total variation distance of probability

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures
    """
    closed_first = closure(p1)
    closed_second = closure(p2)
    gap = abs(closed_first - closed_second)
    return 0.5 * gap.sum()
예제 #3
0
def coverage_correction(count_vec, uncovered_estimator):
    """
    Scale closed counts by the estimated observed probability mass

    The counts are passed through `closure`, and the result is multiplied
    by one minus the unobserved probability that `uncovered_estimator`
    returns when applied along the last axis of `count_vec`.

    Parameters
    ----------
    count_vec : np.array
       Count vector
    uncovered_estimator : function
       Discovery estimator (aka unobserved probability estimator)

    Returns
    -------
    np.array
       Corrected proportions, with length-one axes squeezed out
    """
    proportions = closure(count_vec)
    unobserved = np.apply_along_axis(uncovered_estimator, -1, count_vec)
    # Column layout so each estimate scales its own row of proportions
    observed_mass = np.atleast_2d(1 - unobserved).T
    return np.squeeze(proportions * observed_mass)
예제 #4
0
# Paths to the input BIOM table and its metadata file.
# (`meta_file` is not read anywhere in this fragment.)
data_dir = "../data/tick/meshnick_tech_reps"
biom_file = "%s/373_otu_table.biom" % data_dir
meta_file = "%s/meta.txt" % data_dir

# Load the table and turn it into a dense, transposed numpy array.
# NOTE(review): this goes through the private `_get_sparse_data` accessor,
# and `mat` is only referenced by the commented-out line inside the loop.
table = biom.load_table(biom_file)
mat = np.array(table._get_sparse_data().todense()).T

# Randomly sample simplex
# For every sequencing depth, draw `num_dists` random compositions, sample
# counts from each, and record the total variation distance to the true
# composition for two estimates: the closed counts and the output of
# `coverage_replacement` with the `robbins` estimator.
num_dists = 10000
num_species = 1000
depths=[300, 3000, 30000]
# One row per simulated composition, one column per depth.
relative_tvd = np.zeros((num_dists, len(depths)))
robbins_tvd = np.zeros((num_dists, len(depths)))
for u, depth in enumerate(depths):
    for i in range(num_dists):
        # True composition: closure of -log(uniform) draws
        pvals = closure(-np.log(np.random.rand(num_species)))
        # pvals = closure(mat[i, :])

        # A single multinomial draw of `depth` counts from the composition
        samp_table = np.random.multinomial(n=depth, pvals=pvals)

        # The sample is promoted to 2-D before the call; presumably
        # `coverage_replacement` expects a samples-by-species matrix
        # -- TODO confirm against its definition
        cx1 = coverage_replacement(np.atleast_2d(samp_table),
                                   uncovered_estimator=robbins)
        relative_tvd[i, u] = variation_distance(closure(samp_table),  pvals)
        robbins_tvd[i, u] = variation_distance(cx1, pvals)

fig, axes = plt.subplots(1, 3, figsize=(15, 4.5))
for u in range(len(depths)):
    axes[u].hist(relative_tvd[:, u], 20, label='Relative', alpha=0.5, color='b')
    axes[u].hist(robbins_tvd[:, u], 20, label='Robbins', alpha=0.5, color='r')
    axes[u].set_title('Depth=%d' % depths[u])
    if u == 0:
예제 #5
0
# Second abundance profile: the first entry is 20, the remaining N-1 are 10.
# `x`, `N`, `ind`, `width`, `ax3`, `fig3` and `fname` are defined before
# this fragment.
y = np.array([20] + [10]*(N-1))
# Left panel: the two profiles as side-by-side bars (second series is
# shifted right by `width`).
ax3[0].bar(ind, x, width, color='r', label='Time point 1')
ax3[0].bar(ind+width, y, width, color='b', label='Time point 2')
ax3[0].set_xticks([])
ax3[0].set_title('Species 1 doubles')
ax3[0].set_ylabel('Abundances')
ax3[0].legend()
# x = np.array([15] + [10]*(N-1))
# y = np.array([15] + [5]*(N-1))
# ax3[1].bar(ind, x, width, color='r')
# ax3[1].bar(ind+width, y, width, color='b')
# ax3[1].set_xticks([])
# ax3[1].set_title('Every species halves, except species 1')
# ax3[1].set_ylabel('Abundances')

# Right panel: the same two profiles after being passed through closure().
x = closure(np.array([10] + [10]*(N-1)))
y = closure(np.array([20] + [10]*(N-1)))
ax3[1].bar(ind, x, width, color='r')
ax3[1].bar(ind+width, y, width, color='b')
ax3[1].set_title('Proportions for both scenarios')
ax3[1].set_ylabel('Proportions')
ax3[1].set_xlabel('Species')
# Tick labels "1".."10" placed at the second bar series' positions.
# NOTE(review): the labels are hard-coded to ten entries while the bar
# count comes from N -- confirm N == 10 at this point.
plt.xticks(ind+width, map(str, range(1, 11)))
fig3.savefig(fname)


# Logratio difference plot
# Output path and bar layout for the next figure.
fname = '%s/logratio_diff.png' % (res_dir)
N = 10
ind = np.arange(N)  # the x locations for the groups
width = 0.4         # the width of the bars
예제 #6
0
# Simulation settings: number of species per profile and number of
# multinomial samples drawn at each depth.
num_species = 10000
num_samps = 100
# Unnormalized abundance profiles keyed by a label; each one is passed
# through closure() before sampling below.
# NOTE(review): `1 / num_species` relies on true division (Python 3 or
# `from __future__ import division`); with integer division it would be 0.
pdf_dict = {
    'Geometric': np.random.geometric(1 / num_species, size=num_species),
    'Uniform': np.random.uniform(5000, 15000, size=num_species)
}

# Ten evenly spaced sampling depths; `disp_depths`, `u` and `v` are not
# used within this fragment.
depths = np.linspace(2000, 20000, 10)
disp_depths = np.linspace(2000, 20000, 4)
u, v = 0, 0
fig, axes = plt.subplots(2, 2, figsize=(8, 8), sharey=True)
for pdf, pval in pdf_dict.items():
    # Accumulators for this profile; they are not filled within the
    # visible part of the loop.
    mult_mean = []
    robbins_mean = []
    pvals = closure(pval)

    for depth in depths:

        # `num_samps` independent multinomial draws stacked into a
        # samples-by-species array.
        # NOTE(review): `depth` is a float produced by np.linspace --
        # confirm np.random.multinomial accepts a non-integer `n` in the
        # numpy version in use.
        samp_table = np.array([
            np.random.multinomial(n=depth, pvals=pvals)
            for i in range(num_samps)
        ])

        # Two alternative transforms of the same sampled counts
        # (helpers are defined outside this fragment).
        mrsamp_table = multiplicative_replacement(samp_table, delta=10**-8)
        rrsamp_table = coverage_replacement(samp_table,
                                            uncovered_estimator=robbins)

        # Get both mean distortion and variance
        # The true composition is repeated once per sample so that it
        # lines up row-for-row with the sample tables.
        truth = np.tile(pvals, (num_samps, 1))
        mr_msd = mean_sq_distance(mrsamp_table, truth)
예제 #7
0
"""
# from stats import robbins_variance, mvn_ellipsoid
from __future__ import division
import numpy as np
from stats import multinomial_bootstrap_ci
from composition import closure
from skbio.diversity.alpha import robbins
from stats import coverage_correction
import matplotlib.pyplot as plt

# Seed the global generator so the simulated sample is reproducible.
np.random.seed(0)
# `num_samps` and `num_rarefactions` are not used within this fragment.
num_samps = 100
num_rarefactions = 10
num_species = 100

# True composition: closure of geometric draws, one value per species.
pvals = closure(np.random.geometric(1/10000, size=num_species))
# depth = np.random.geometric(1/2000)+2000
depth = 1000
# A single multinomial sample of `depth` counts from the true composition.
samp_table = np.random.multinomial(n=depth, pvals=pvals)
# Point estimate and bootstrap percentile bounds, both computed with the
# `robbins` estimator on the same sample.
corrected_pvals = coverage_correction(samp_table, robbins)
LB_p, UB_p, LB_cover, UB_cover = multinomial_bootstrap_ci(samp_table,
                                                          robbins,
                                                          alpha=0.01,
                                                          bootstraps=10000,
                                                          random_state=0)
fig = plt.figure()
# `errorbar` interprets `yerr` as non-negative distances below (row 0) and
# above (row 1) each point, not as absolute bounds, so the percentile
# bounds are converted to offsets from the plotted estimate.  Offsets are
# floored at zero because a percentile bound is not guaranteed to bracket
# the point estimate, and negative error values are not meaningful here.
asymmetric_error = [np.maximum(corrected_pvals - LB_p, 0),
                    np.maximum(UB_p - corrected_pvals, 0)]

plt.errorbar(np.arange(num_species), corrected_pvals, yerr=asymmetric_error,
             fmt='o', label='Estimated Proportions')
plt.plot(np.arange(num_species), pvals, 'or', label='True Proportions')
예제 #8
0
# Second abundance profile: the first entry is 20, the remaining N-1 are 10.
# `x`, `N`, `ind`, `width`, `ax3`, `fig3` and `fname` are defined before
# this fragment.
y = np.array([20] + [10] * (N - 1))
# Left panel: the two profiles as side-by-side bars (second series is
# shifted right by `width`).
ax3[0].bar(ind, x, width, color='r', label='Time point 1')
ax3[0].bar(ind + width, y, width, color='b', label='Time point 2')
ax3[0].set_xticks([])
ax3[0].set_title('Species 1 doubles')
ax3[0].set_ylabel('Abundances')
ax3[0].legend()
# x = np.array([15] + [10]*(N-1))
# y = np.array([15] + [5]*(N-1))
# ax3[1].bar(ind, x, width, color='r')
# ax3[1].bar(ind+width, y, width, color='b')
# ax3[1].set_xticks([])
# ax3[1].set_title('Every species halves, except species 1')
# ax3[1].set_ylabel('Abundances')

# Right panel: the same two profiles after being passed through closure().
x = closure(np.array([10] + [10] * (N - 1)))
y = closure(np.array([20] + [10] * (N - 1)))
ax3[1].bar(ind, x, width, color='r')
ax3[1].bar(ind + width, y, width, color='b')
ax3[1].set_title('Proportions for both scenarios')
ax3[1].set_ylabel('Proportions')
ax3[1].set_xlabel('Species')
# Tick labels "1".."10" placed at the second bar series' positions.
# NOTE(review): the labels are hard-coded to ten entries while the bar
# count comes from N -- confirm N == 10 at this point.
plt.xticks(ind + width, map(str, range(1, 11)))
fig3.savefig(fname)

# Logratio difference plot
# Output path, bar layout and a fresh figure for the next plot.
fname = '%s/logratio_diff.png' % (res_dir)
N = 10
ind = np.arange(N)  # the x locations for the groups
width = 0.4  # the width of the bars
fig, ax = plt.subplots()