def multinomial_bootstrap_ci(count_vec, uncovered_estimator,
                             alpha=0.05, bootstraps=1000, random_state=0):
    """ Bootstrap confidence intervals for coverage-corrected proportions

    The observed counts are normalized with `closure` and used as the
    probability vector of a multinomial distribution.  `bootstraps` count
    vectors with the same total count as `count_vec` are drawn from it.
    For every draw the unobserved probability is estimated with
    `uncovered_estimator` and the relative proportions of the draw are
    scaled by one minus that estimate.  The returned bounds are the
    `alpha/2` and `1 - alpha/2` percentiles over all draws.

    Parameters
    ----------
    count_vec : np.array
       Count vector
    uncovered_estimator : function
       Discovery estimator (aka unobserved probability estimator).
       It is applied to each resampled count vector along the last axis.
    alpha : float
       Significance value; the interval covers the `alpha/2` to
       `1 - alpha/2` quantiles
    bootstraps : int
       Number of bootstraps to perform
    random_state : np.random.RandomState or int
       Seed or generator, passed through `check_random_state`

    Returns
    -------
    LB_p : np.array
       Lower bounds in confidence intervals for composition
    UB_p : np.array
       Upper bounds in confidence intervals for composition
    LB_cover : float
       Lower bound for discovery probability
    UB_cover : float
       Upper bound for discovery probability
    """
    rng = check_random_state(random_state)
    total_count = count_vec.sum()
    observed_p = closure(count_vec)

    # One resampled count vector per row.
    resampled = rng.multinomial(total_count, observed_p, size=bootstraps)
    resampled_rel_p = closure(resampled)

    # Estimated unobserved probability of every resample, reshaped into a
    # column so that it scales each row of the relative proportions.
    unobserved = np.apply_along_axis(uncovered_estimator, -1, resampled)
    covered_column = np.atleast_2d(1 - unobserved).T
    corrected = resampled_rel_p * covered_column

    lower_q = alpha/2 * 100
    upper_q = (1-alpha/2) * 100
    return (np.percentile(corrected, lower_q, axis=0),
            np.percentile(corrected, upper_q, axis=0),
            np.percentile(unobserved, lower_q),
            np.percentile(unobserved, upper_q))
def variation_distance(p1, p2):
    """ Total variation distance between two compositions

    Both inputs are first passed through `closure`; the distance is half
    of the summed absolute differences of the resulting vectors.

    Parameters
    ----------
    p1 : np.array
       composition vector
    p2 : np.array
       composition vector

    Returns
    -------
    float : Total variation distance of probability

    References
    ----------
    .. http://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures
    """
    first = closure(p1)
    second = closure(p2)
    difference = first - second
    return 0.5*abs(difference).sum()
def coverage_correction(count_vec, uncovered_estimator):
    """ Scales relative proportions by the estimated covered probability

    The counts are normalized with `closure`, the unobserved probability
    is estimated along the last axis with `uncovered_estimator`, and the
    proportions are multiplied by one minus that estimate.

    Parameters
    ----------
    count_vec : np.array
       Count vector
    uncovered_estimator : function
       Discovery estimator (aka unobserved probability estimator)

    Returns
    -------
    np.array
       Corrected proportions, with length-one axes removed
    """
    relative = closure(count_vec)
    unobserved = np.apply_along_axis(uncovered_estimator, -1, count_vec)
    # Column layout so each row of proportions gets its own scale factor.
    covered_column = np.atleast_2d(1 - unobserved).T
    corrected = relative * covered_column
    return np.squeeze(corrected)
# Simulation: compare the total variation distance (TVD) to the true
# proportions for plain relative abundances versus the output of
# coverage_replacement with the robbins estimator, at several depths.

# Input locations of the BIOM table and the metadata file.
data_dir = "../data/tick/meshnick_tech_reps"
biom_file = "%s/373_otu_table.biom" % data_dir
meta_file = "%s/meta.txt" % data_dir
table = biom.load_table(biom_file)
# Dense, transposed copy of the table's sparse data.  It is only referenced
# by the commented-out alternative inside the loop below.
# NOTE(review): _get_sparse_data is a private biom method -- confirm it is
# still available in the biom version in use.
mat = np.array(table._get_sparse_data().todense()).T
# Randomly sample simplex
num_dists = 10000
num_species = 1000
depths=[300, 3000, 30000]
# One row per simulated distribution, one column per depth.
relative_tvd = np.zeros((num_dists, len(depths)))
robbins_tvd = np.zeros((num_dists, len(depths)))
for u, depth in enumerate(depths):
    for i in range(num_dists):
        # True proportions: -log of uniform draws passed through closure.
        pvals = closure(-np.log(np.random.rand(num_species)))
        # pvals = closure(mat[i, :])
        # Draw one count vector of `depth` counts from the true proportions.
        samp_table = np.random.multinomial(n=depth, pvals=pvals)
        cx1 = coverage_replacement(np.atleast_2d(samp_table), uncovered_estimator=robbins)
        # Distance of each estimate from the true proportions.
        relative_tvd[i, u] = variation_distance(closure(samp_table), pvals)
        robbins_tvd[i, u] = variation_distance(cx1, pvals)
# One panel per depth with both TVD histograms (20 bins) overlaid.
fig, axes = plt.subplots(1, 3, figsize=(15, 4.5))
for u in range(len(depths)):
    axes[u].hist(relative_tvd[:, u], 20, label='Relative', alpha=0.5, color='b')
    axes[u].hist(robbins_tvd[:, u], 20, label='Robbins', alpha=0.5, color='r')
    axes[u].set_title('Depth=%d' % depths[u])
    if u == 0:
# Second series for the first panel: the first entry is 20, the remaining
# N-1 entries are 10.  (x, N, ind, width, ax3, fig3 and fname are defined
# before this point.)
y = np.array([20] + [10]*(N-1))
# First panel: raw abundances of both series as side-by-side bars.
ax3[0].bar(ind, x, width, color='r', label='Time point 1')
ax3[0].bar(ind+width, y, width, color='b', label='Time point 2')
ax3[0].set_xticks([])
ax3[0].set_title('Species 1 doubles')
ax3[0].set_ylabel('Abundances')
ax3[0].legend()
# Disabled alternative scenario, kept for reference:
# x = np.array([15] + [10]*(N-1))
# y = np.array([15] + [5]*(N-1))
# ax3[1].bar(ind, x, width, color='r')
# ax3[1].bar(ind+width, y, width, color='b')
# ax3[1].set_xticks([])
# ax3[1].set_title('Every species halves, except species 1')
# ax3[1].set_ylabel('Abundances')
# Second panel: the same two count vectors after closure.
x = closure(np.array([10] + [10]*(N-1)))
y = closure(np.array([20] + [10]*(N-1)))
ax3[1].bar(ind, x, width, color='r')
ax3[1].bar(ind+width, y, width, color='b')
ax3[1].set_title('Proportions for both scenarios')
ax3[1].set_ylabel('Proportions')
ax3[1].set_xlabel('Species')
# Tick labels are hard-coded as '1'..'10', independent of N.
# NOTE(review): on Python 3 map() returns an iterator rather than a list --
# confirm the matplotlib version in use accepts it as tick labels.
plt.xticks(ind+width, map(str, range(1, 11)))
fig3.savefig(fname)
# Logratio difference plot
fname = '%s/logratio_diff.png' % (res_dir)
N = 10
ind = np.arange(N) # the x locations for the groups
width = 0.4 # the width of the bars
# Simulation: draw multinomial samples from two kinds of underlying
# abundance profiles and process them with multiplicative_replacement and
# with coverage_replacement (robbins estimator).
num_species = 10000
num_samps = 100
# Unnormalized abundance profiles, keyed by the name of the distribution
# they were drawn from.
# NOTE(review): `1 / num_species` needs true division (Python 3 or
# `from __future__ import division`); with integer division it is 0 --
# confirm for the interpreter in use.
pdf_dict = {
    'Geometric': np.random.geometric(1 / num_species, size=num_species),
    'Uniform': np.random.uniform(5000, 15000, size=num_species)
}
# 10 evenly spaced depths; disp_depths holds 4 evenly spaced depths over
# the same range and is not used in the visible code.
depths = np.linspace(2000, 20000, 10)
disp_depths = np.linspace(2000, 20000, 4)
# presumably subplot row/column counters -- not used in the visible code
u, v = 0, 0
fig, axes = plt.subplots(2, 2, figsize=(8, 8), sharey=True)
for pdf, pval in pdf_dict.items():
    mult_mean = []
    robbins_mean = []
    pvals = closure(pval)
    for depth in depths:
        # num_samps independent count vectors, one per row.
        # NOTE(review): depth comes from np.linspace and is a float --
        # confirm np.random.multinomial accepts a non-integer n here.
        samp_table = np.array([
            np.random.multinomial(n=depth, pvals=pvals)
            for i in range(num_samps)
        ])
        mrsamp_table = multiplicative_replacement(samp_table, delta=10**-8)
        rrsamp_table = coverage_replacement(samp_table, uncovered_estimator=robbins)
        # Get both mean distortion and variance
        # The true proportions are repeated once per sample row so they
        # line up with the rows of the processed tables.
        truth = np.tile(pvals, (num_samps, 1))
        mr_msd = mean_sq_distance(mrsamp_table, truth)
""" # from stats import robbins_variance, mvn_ellipsoid from __future__ import division import numpy as np from stats import multinomial_bootstrap_ci from composition import closure from skbio.diversity.alpha import robbins from stats import coverage_correction import matplotlib.pyplot as plt np.random.seed(0) num_samps = 100 num_rarefactions = 10 num_species = 100 pvals = closure(np.random.geometric(1/10000, size=num_species)) # depth = np.random.geometric(1/2000)+2000 depth = 1000 samp_table = np.random.multinomial(n=depth, pvals=pvals) corrected_pvals = coverage_correction(samp_table, robbins) LB_p, UB_p, LB_cover, UB_cover = multinomial_bootstrap_ci(samp_table, robbins, alpha=0.01, bootstraps=10000, random_state=0) fig = plt.figure() asymmetric_error = [LB_p, UB_p] plt.errorbar(np.arange(num_species), corrected_pvals, yerr=asymmetric_error, fmt='o', label='Estimated Proportions') plt.plot(np.arange(num_species), pvals, 'or', label='True Proportions')
# Second series for the first panel: the first entry is 20, the remaining
# N - 1 entries are 10.  (x, N, ind, width, ax3, fig3 and fname are defined
# before this point.)
y = np.array([20] + [10] * (N - 1))
# First panel: raw abundances of both series as side-by-side bars.
ax3[0].bar(ind, x, width, color='r', label='Time point 1')
ax3[0].bar(ind + width, y, width, color='b', label='Time point 2')
ax3[0].set_xticks([])
ax3[0].set_title('Species 1 doubles')
ax3[0].set_ylabel('Abundances')
ax3[0].legend()
# Disabled alternative scenario, kept for reference:
# x = np.array([15] + [10]*(N-1))
# y = np.array([15] + [5]*(N-1))
# ax3[1].bar(ind, x, width, color='r')
# ax3[1].bar(ind+width, y, width, color='b')
# ax3[1].set_xticks([])
# ax3[1].set_title('Every species halves, except species 1')
# ax3[1].set_ylabel('Abundances')
# Second panel: the same two count vectors after closure.
x = closure(np.array([10] + [10] * (N - 1)))
y = closure(np.array([20] + [10] * (N - 1)))
ax3[1].bar(ind, x, width, color='r')
ax3[1].bar(ind + width, y, width, color='b')
ax3[1].set_title('Proportions for both scenarios')
ax3[1].set_ylabel('Proportions')
ax3[1].set_xlabel('Species')
# Tick labels are hard-coded as '1'..'10', independent of N.
# NOTE(review): on Python 3 map() returns an iterator rather than a list --
# confirm the matplotlib version in use accepts it as tick labels.
plt.xticks(ind + width, map(str, range(1, 11)))
fig3.savefig(fname)
# Logratio difference plot
fname = '%s/logratio_diff.png' % (res_dir)
N = 10
ind = np.arange(N)  # the x locations for the groups
width = 0.4  # the width of the bars
# Fresh figure and axes for the log-ratio plot.
fig, ax = plt.subplots()