def chao1(counts, bias_corrected=True):
    r"""Calculate chao1 richness estimator.

    Uses the bias-corrected version unless `bias_corrected` is ``False``
    *and* the sample contains both singletons and doubletons.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        Indicates whether or not to use the bias-corrected version of the
        equation. If ``False`` *and* there are both singletons and
        doubletons, the uncorrected version will be used. The
        biased-corrected version will be used otherwise.

    Returns
    -------
    double
        Computed chao1 richness estimator.

    See Also
    --------
    chao1_ci

    Notes
    -----
    The uncorrected version is based on Equation 6 in [1]_:

    .. math::

       chao1=S_{obs}+\frac{F_1^2}{2F_2}

    where :math:`F_1` and :math:`F_2` are the count of singletons and
    doubletons, respectively.

    The bias-corrected version is defined as

    .. math::

       chao1=S_{obs}+\frac{F_1(F_1-1)}{2(F_2+1)}

    References
    ----------
    .. [1] Chao, A. 1984. Non-parametric estimation of the number of classes
       in a population. Scandinavian Journal of Statistics 11, 265-270.

    """
    counts = _validate_counts_vector(counts)
    observed, singletons, doubletons = osd(counts)

    use_uncorrected = (not bias_corrected) and singletons and doubletons
    if use_uncorrected:
        # Equation 6: S_obs + F1^2 / (2 * F2)
        return observed + singletons ** 2 / (2 * doubletons)
    # Bias-corrected form: S_obs + F1 * (F1 - 1) / (2 * (F2 + 1))
    return observed + singletons * (singletons - 1) / (2 * (doubletons + 1))
def chao1(counts, bias_corrected=True):
    r"""Calculate chao1 richness estimator.

    Uses the bias-corrected version unless `bias_corrected` is ``False``
    *and* there are both singletons and doubletons.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        Indicates whether or not to use the bias-corrected version of the
        equation. If ``False`` *and* there are both singletons and
        doubletons, the uncorrected version will be used. The
        biased-corrected version will be used otherwise.

    Returns
    -------
    double
        Computed chao1 richness estimator.

    See Also
    --------
    chao1_ci

    Notes
    -----
    The uncorrected version is based on Equation 6 in [1]_:

    .. math::

       chao1=S_{obs}+\frac{F_1^2}{2F_2}

    where :math:`F_1` and :math:`F_2` are the count of singletons and
    doubletons, respectively.

    The bias-corrected version is defined as

    .. math::

       chao1=S_{obs}+\frac{F_1(F_1-1)}{2(F_2+1)}

    References
    ----------
    .. [1] Chao, A. 1984. Non-parametric estimation of the number of classes
       in a population. Scandinavian Journal of Statistics 11, 265-270.

    """
    counts = _validate_counts_vector(counts)
    obs, f1, f2 = osd(counts)

    # Fall through to the bias-corrected form whenever it was requested or
    # either singletons (F1) or doubletons (F2) are absent.
    if bias_corrected or f1 == 0 or f2 == 0:
        return obs + f1 * (f1 - 1) / (2 * (f2 + 1))

    # Uncorrected form (Equation 6 in Chao 1984).
    return obs + f1 ** 2 / (2 * f2)
def chao1_ci(counts, bias_corrected=True, zscore=1.96):
    """Calculate chao1 confidence interval.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        Indicates whether or not to use the bias-corrected version of the
        equation. If ``False`` *and* there are both singletons and
        doubletons, the uncorrected version will be used. The
        biased-corrected version will be used otherwise.
    zscore : scalar, optional
        Score to use for confidence. Default of 1.96 is for a 95%
        confidence interval.

    Returns
    -------
    tuple
        chao1 confidence interval as ``(lower_bound, upper_bound)``.

    See Also
    --------
    chao1

    Notes
    -----
    The implementation here is based on the equations in the EstimateS
    manual [1]_. Different equations are employed to calculate the chao1
    variance and confidence interval depending on `bias_corrected` and the
    presence/absence of singletons and/or doubletons.

    Specifically, the following EstimateS equations are used:

    1. No singletons, Equation 14.
    2. Singletons but no doubletons, Equations 7, 13.
    3. Singletons and doubletons, ``bias_corrected=True``, Equations 6, 13.
    4. Singletons and doubletons, ``bias_corrected=False``, Equations 5, 13.

    References
    ----------
    .. [1] http://viceroy.eeb.uconn.edu/estimates/

    """
    counts = _validate_counts_vector(counts)
    observed, singletons, _ = osd(counts)

    # Without singletons the variance-based interval is undefined; use the
    # dedicated no-singletons equation (EstimateS Equation 14) instead.
    if not singletons:
        total = counts.sum()
        return _chao_confidence_no_singletons(total, observed, zscore)

    estimate = chao1(counts, bias_corrected)
    variance = _chao1_var(counts, bias_corrected)
    return _chao_confidence_with_singletons(estimate, observed, variance,
                                            zscore)
def lladser_ci(counts, r, alpha=0.95, f=10, ci_type='ULCL'):
    """Calculate single CI of the conditional uncovered probability.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    r : int
        Number of new colors that are required for the next prediction.
    alpha : float, optional
        Desired confidence level.
    f : float, optional
        Ratio between upper and lower bound.
    ci_type : {'ULCL', 'ULCU', 'U', 'L'}
        Type of confidence interval. If ``'ULCL'``, upper and lower bounds
        with conservative lower bound. If ``'ULCU'``, upper and lower bounds
        with conservative upper bound. If ``'U'``, upper bound only, lower
        bound fixed to 0.0. If ``'L'``, lower bound only, upper bound fixed
        to 1.0.

    Returns
    -------
    tuple
        Confidence interval as ``(lower_bound, upper_bound)``.

    See Also
    --------
    lladser_pe

    Notes
    -----
    This function is just a wrapper around the full CI estimator described
    in Theorem 2 (iii) in [1]_, intended to be called for a single best CI
    estimate on a complete sample.

    References
    ----------
    .. [1] Lladser, Gouet, and Reeder, "Extrapolation of Urn Models via
       Poissonization: Accurate Measurements of the Microbial Unknown" PLoS
       2011.

    """
    counts = _validate_counts_vector(counts)

    # Expand counts into one observation per individual and randomize the
    # observation order before feeding the series estimator.
    observations = _expand_counts(counts)
    np.random.shuffle(observations)

    try:
        # Only the final CI in the series is the best estimate for the
        # complete sample.
        return list(_lladser_ci_series(observations, r, alpha, f,
                                       ci_type))[-1]
    except IndexError:
        # Series was empty: no CI could be computed.
        return (np.nan, np.nan)
def lladser_pe(counts, r=10):
    """Calculate single point estimate of conditional uncovered probability.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    r : int, optional
        Number of new colors that are required for the next prediction.

    Returns
    -------
    double
        Single point estimate of the conditional uncovered probability. May
        be ``np.nan`` if a point estimate could not be computed.

    See Also
    --------
    lladser_ci

    Notes
    -----
    This function is just a wrapper around the full point estimator
    described in Theorem 2 (i) in [1]_, intended to be called for a single
    best estimate on a complete sample. This function is not guaranteed to
    return estimated uncovered probabilities less than 1 if the coverage is
    too low.

    References
    ----------
    .. [1] Lladser, Gouet, and Reeder, "Extrapolation of Urn Models via
       Poissonization: Accurate Measurements of the Microbial Unknown" PLoS
       2011.

    """
    counts = _validate_counts_vector(counts)

    # Expand counts into one observation per individual and randomize the
    # observation order before feeding the point estimator.
    observations = _expand_counts(counts)
    np.random.shuffle(observations)

    try:
        # Keep only the estimate itself ([0]) from the last entry in the
        # series -- the best estimate for the complete sample.
        return list(_lladser_point_estimates(observations, r))[-1][0]
    except IndexError:
        # No point estimate could be computed.
        return np.nan
def gini_index(data, method='rectangles'):
    r"""Calculate the Gini index.

    The Gini index is defined as

    .. math::

       G=\frac{A}{A+B}

    where :math:`A` is the area between :math:`y=x` and the Lorenz curve
    and :math:`B` is the area under the Lorenz curve. Simplifies to
    :math:`1-2B` since :math:`A+B=0.5`.

    Parameters
    ----------
    data : 1-D array_like
        Vector of counts, abundances, proportions, etc. All entries must be
        non-negative.
    method : {'rectangles', 'trapezoids'}
        Method for calculating the area under the Lorenz curve. If
        ``'rectangles'``, connects the Lorenz curve points by lines
        parallel to the x axis. This is the correct method (in our opinion)
        though ``'trapezoids'`` might be desirable in some circumstances.
        If ``'trapezoids'``, connects the Lorenz curve points by linear
        segments between them. Basically assumes that the given sampling is
        accurate and that more features of given data would fall on linear
        gradients between the values of this data.

    Returns
    -------
    double
        Gini index.

    Raises
    ------
    ValueError
        If `method` isn't one of the supported methods for calculating the
        area under the curve.

    Notes
    -----
    The Gini index was introduced in [1]_. The formula for
    ``method='rectangles'`` is

    .. math::

       dx\sum_{i=1}^n h_i

    The formula for ``method='trapezoids'`` is

    .. math::

       dx(\frac{h_0+h_n}{2}+\sum_{i=1}^{n-1} h_i)

    References
    ----------
    .. [1] Gini, C. (1912). "Variability and Mutability", C. Cuppini,
       Bologna, 156 pages. Reprinted in Memorie di metodologica statistica
       (Ed. Pizetti E, Salvemini, T). Rome: Libreria Eredi Virgilio Veschi
       (1955).

    """
    # Suppress cast to int because this method supports ints and floats.
    data = _validate_counts_vector(data, suppress_cast=True)

    # B = area under the Lorenz curve; G simplifies to 1 - 2B.
    area_under_curve = _lorenz_curve_integrator(_lorenz_curve(data), method)
    return 1 - 2 * area_under_curve
def ace(counts, rare_threshold=10):
    r"""Calculate the ACE metric (Abundance-based Coverage Estimator).

    The ACE metric is defined as:

    .. math::

       S_{ace}=S_{abund}+\frac{S_{rare}}{C_{ace}}+
       \frac{F_1}{C_{ace}}\gamma^2_{ace}

    where :math:`S_{abund}` is the number of abundant OTUs (with more than
    `rare_threshold` individuals) when all samples are pooled,
    :math:`S_{rare}` is the number of rare OTUs (with less than or equal to
    `rare_threshold` individuals) when all samples are pooled,
    :math:`C_{ace}` is the sample abundance coverage estimator,
    :math:`F_1` is the frequency of singletons, and :math:`\gamma^2_{ace}`
    is the estimated coefficient of variation for rare OTUs.

    The estimated coefficient of variation is defined as (assuming
    `rare_threshold` is 10, the default):

    .. math::

       \gamma^2_{ace}=max\left[\frac{S_{rare}}{C_{ace}}
       \frac{\sum^{10}_{i=1}{{i\left(i-1\right)}}F_i}
       {\left(N_{rare}\right)\left(N_{rare}-1\right)} -1,0\right]

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    rare_threshold : int, optional
        Threshold at which an OTU containing as many or fewer individuals
        will be considered rare.

    Returns
    -------
    double
        Computed ACE metric.

    Raises
    ------
    ValueError
        If every rare OTU is a singleton.

    Notes
    -----
    ACE was first introduced in [1]_ and [2]_. The implementation here is
    based on the description given in the EstimateS manual [3]_.

    If no rare OTUs exist, returns the number of abundant OTUs.

    The default value of 10 for `rare_threshold` is based on [4]_.

    If `counts` contains zeros, indicating OTUs which are known to exist in
    the environment but did not appear in the sample, they will be ignored
    for the purpose of calculating the number of rare OTUs.

    References
    ----------
    .. [1] Chao, A. & S.-M Lee. 1992 Estimating the number of classes via
       sample coverage. Journal of the American Statistical Association 87,
       210-217.
    .. [2] Chao, A., M.-C. Ma, & M. C. K. Yang. 1993. Stopping rules and
       estimation for recapture debugging with unequal failure rates.
       Biometrika 80, 193-201.
    .. [3] http://viceroy.eeb.uconn.edu/estimates/
    .. [4] Chao, A., W.-H. Hwang, Y.-C. Chen, and C.-Y. Kuo. 2000.
       Estimating the number of shared species in two communities.
       Statistica Sinica 10:227-246.

    """
    counts = _validate_counts_vector(counts)

    # minlength guarantees every frequency index up to rare_threshold
    # exists. Without it, np.bincount returns an array of length
    # max(counts) + 1, so freq_counts[1] raises IndexError when the
    # largest count is 0 (e.g. an all-zero or empty vector). The padded
    # zero frequencies contribute nothing to the rare-OTU sums.
    freq_counts = np.bincount(counts, minlength=rare_threshold + 1)
    s_rare = _otus_rare(freq_counts, rare_threshold)
    singles = freq_counts[1]

    if singles > 0 and singles == s_rare:
        raise ValueError("The only rare OTUs are singletons, so the ACE "
                         "metric is undefined. EstimateS suggests using "
                         "bias-corrected Chao1 instead.")

    s_abun = _otus_abundant(freq_counts, rare_threshold)
    if s_rare == 0:
        # No rare OTUs: ACE reduces to the abundant-OTU count.
        return s_abun

    n_rare = _number_rare(freq_counts, rare_threshold)
    # Sample abundance coverage estimator (C_ace).
    c_ace = 1 - singles / n_rare

    # Estimated squared coefficient of variation for rare OTUs, clamped
    # at zero per the EstimateS definition.
    top = s_rare * _number_rare(freq_counts, rare_threshold, gamma=True)
    bottom = c_ace * n_rare * (n_rare - 1)
    gamma_ace = max((top / bottom) - 1, 0)

    return s_abun + (s_rare / c_ace) + ((singles / c_ace) * gamma_ace)