Exemplo n.º 1
0
def test_bootstrap_vectorized_3samp(method, axis):
    def statistic(*data, axis=0):
        # an arbitrary, vectorized statistic
        return sum((sample.mean(axis) for sample in data))

    def statistic_1d(*data):
        # the same statistic, not vectorized
        for sample in data:
            assert sample.ndim == 1
        return statistic(*data, axis=0)

    np.random.seed(0)
    x = np.random.rand(4, 5)
    y = np.random.rand(4, 5)
    z = np.random.rand(4, 5)
    res1 = bootstrap((x, y, z),
                     statistic,
                     vectorized=True,
                     axis=axis,
                     n_resamples=100,
                     method=method,
                     random_state=0)
    res2 = bootstrap((x, y, z),
                     statistic_1d,
                     vectorized=False,
                     axis=axis,
                     n_resamples=100,
                     method=method,
                     random_state=0)
    assert_allclose(res1.confidence_interval, res2.confidence_interval)
    assert_allclose(res1.standard_error, res2.standard_error)
Exemplo n.º 2
0
def test_bootstrap_vectorized_1samp(method, axis):
    def statistic(x, axis=0):
        # an arbitrary, vectorized statistic
        return x.mean(axis=axis)

    def statistic_1d(x):
        # the same statistic, not vectorized
        assert x.ndim == 1
        return statistic(x, axis=0)

    np.random.seed(0)
    x = np.random.rand(4, 5)
    res1 = bootstrap((x, ),
                     statistic,
                     vectorized=True,
                     axis=axis,
                     n_resamples=100,
                     batch=None,
                     method=method,
                     random_state=0)
    res2 = bootstrap((x, ),
                     statistic_1d,
                     vectorized=False,
                     axis=axis,
                     n_resamples=100,
                     batch=10,
                     method=method,
                     random_state=0)
    assert_allclose(res1.confidence_interval, res2.confidence_interval)
    assert_allclose(res1.standard_error, res2.standard_error)
Exemplo n.º 3
0
def test_bootstrap_batch(method, axis):
    # for one-sample statistics, batch size shouldn't affect the result
    np.random.seed(0)

    x = np.random.rand(10, 11, 12)
    res1 = bootstrap((x,), np.mean, batch=None, method=method,
                     random_state=0, axis=axis, n_resamples=100)
    res2 = bootstrap((x,), np.mean, batch=10, method=method,
                     random_state=0, axis=axis, n_resamples=100)

    assert_equal(res2.confidence_interval.low, res1.confidence_interval.low)
    assert_equal(res2.confidence_interval.high, res1.confidence_interval.high)
    assert_equal(res2.standard_error, res1.standard_error)
Exemplo n.º 4
0
def test_bootstrap_vectorized(method, axis, paired):
    # test that paired is vectorized as expected: when samples are tiled,
    # CI and standard_error of each axis-slice is the same as those of the
    # original 1d sample

    if not paired and method == 'BCa':
        # should re-assess when BCa is extended
        pytest.xfail(reason="BCa currently for 1-sample statistics only")
    np.random.seed(0)

    def my_statistic(x, y, z, axis=-1):
        return x.mean(axis=axis) + y.mean(axis=axis) + z.mean(axis=axis)

    shape = 10, 11, 12
    n_samples = shape[axis]

    x = np.random.rand(n_samples)
    y = np.random.rand(n_samples)
    z = np.random.rand(n_samples)
    res1 = bootstrap((x, y, z),
                     my_statistic,
                     paired=paired,
                     method=method,
                     random_state=0,
                     axis=0,
                     n_resamples=100)

    reshape = [1, 1, 1]
    reshape[axis] = n_samples
    x = np.broadcast_to(x.reshape(reshape), shape)
    y = np.broadcast_to(y.reshape(reshape), shape)
    z = np.broadcast_to(z.reshape(reshape), shape)
    res2 = bootstrap((x, y, z),
                     my_statistic,
                     paired=paired,
                     method=method,
                     random_state=0,
                     axis=axis,
                     n_resamples=100)

    assert_allclose(res2.confidence_interval.low, res1.confidence_interval.low)
    assert_allclose(res2.confidence_interval.high,
                    res1.confidence_interval.high)
    assert_allclose(res2.standard_error, res1.standard_error)

    result_shape = list(shape)
    result_shape.pop(axis)

    assert_equal(res2.confidence_interval.low.shape, result_shape)
    assert_equal(res2.confidence_interval.high.shape, result_shape)
    assert_equal(res2.standard_error.shape, result_shape)
Exemplo n.º 5
0
def test_bootstrap_degenerate(method):
    data = 35 * [10000.]
    if method == "BCa":
        with np.errstate(invalid='ignore'):
            with pytest.warns(BootstrapDegenerateDistributionWarning):
                res = bootstrap([
                    data,
                ], np.mean, method=method)
                assert_equal(res.confidence_interval, (np.nan, np.nan))
    else:
        res = bootstrap([
            data,
        ], np.mean, method=method)
        assert_equal(res.confidence_interval, (10000., 10000.))
    assert_equal(res.standard_error, 0)
Exemplo n.º 6
0
def test_bootstrap_against_itself_1samp(method, expected):
    # The expected values in this test were generated using bootstrap
    # to check for unintended changes in behavior. The test also makes sure
    # that bootstrap works with multi-sample statistics and that the
    # `axis` argument works as expected / function is vectorized.
    np.random.seed(0)

    n = 100  # size of sample
    n_resamples = 999  # number of bootstrap resamples used to form each CI
    confidence_level = 0.9

    # The true mean is 5
    dist = stats.norm(loc=5, scale=1)
    stat_true = dist.mean()

    # Do the same thing 2000 times. (The code is fully vectorized.)
    n_replications = 2000
    data = dist.rvs(size=(n_replications, n))
    res = bootstrap((data,),
                    statistic=np.mean,
                    confidence_level=confidence_level,
                    n_resamples=n_resamples,
                    batch=50,
                    method=method,
                    axis=-1)
    ci = res.confidence_interval

    # ci contains vectors of lower and upper confidence interval bounds
    ci_contains_true = np.sum((ci[0] < stat_true) & (stat_true < ci[1]))
    assert ci_contains_true == expected

    # ci_contains_true is not inconsistent with confidence_level
    pvalue = stats.binomtest(ci_contains_true, n_replications,
                             confidence_level).pvalue
    assert pvalue > 0.1
Exemplo n.º 7
0
def test_vector_valued_statistic(method):
    # Generate 95% confidence interval around MLE of normal distribution
    # parameters. Repeat 100 times, each time on sample of size 100.
    # Check that confidence interval contains true parameters ~95 times.
    # Confidence intervals are estimated and stochastic; a test failure
    # does not necessarily indicate that something is wrong. More important
    # than values of `counts` below is that the shapes of the outputs are
    # correct.

    rng = np.random.default_rng(2196847219)
    params = 1, 0.5
    sample = stats.norm.rvs(*params, size=(100, 100), random_state=rng)

    def statistic(data):
        return stats.norm.fit(data)

    res = bootstrap((sample, ),
                    statistic,
                    method=method,
                    axis=-1,
                    vectorized=False)

    counts = np.sum((res.confidence_interval.low.T < params)
                    & (res.confidence_interval.high.T > params),
                    axis=0)
    assert np.all(counts >= 90)
    assert np.all(counts <= 100)
    assert res.confidence_interval.low.shape == (2, 100)
    assert res.confidence_interval.high.shape == (2, 100)
    assert res.standard_error.shape == (2, 100)
Exemplo n.º 8
0
def test_bootstrap_against_R(method, expected):
    # Compare against R's "boot" library
    # library(boot)

    # stat <- function (x, a) {
    #     mean(x[a])
    # }

    # x <- c(10, 12, 12.5, 12.5, 13.9, 15, 21, 22,
    #        23, 34, 50, 81, 89, 121, 134, 213)

    # # Use a large value so we get a few significant digits for the CI.
    # n = 1000000
    # bootresult = boot(x, stat, n)
    # result <- boot.ci(bootresult)
    # print(result)
    x = np.array([
        10, 12, 12.5, 12.5, 13.9, 15, 21, 22, 23, 34, 50, 81, 89, 121, 134, 213
    ])
    res = bootstrap((x, ),
                    np.mean,
                    n_resamples=1000000,
                    method=method,
                    random_state=0)
    assert_allclose(res.confidence_interval, expected, rtol=0.005)
Exemplo n.º 9
0
def _grad_conf_int(forecasts, p_value) -> tuple[float, float]:
    """Return a bootstrap confidence interval for the median of `forecasts`.

    The interval is computed by ``stats.bootstrap`` at confidence level
    ``1 - p_value`` with ``random_state=0``, so repeated calls on the same
    data give the same bounds.

    Returns
    -------
    (low, high) : the lower and upper bounds of the interval.
    """
    result = stats.bootstrap(
        (forecasts, ),  # bootstrap expects a sequence of samples
        np.median,
        confidence_level=(1 - p_value),
        random_state=0,
    )
    bounds = result.confidence_interval
    return bounds.low, bounds.high
Exemplo n.º 10
0
def species_accumulation(x, max_steps, n_iter=100):
    """Bootstrap ``stats.rarefaction_extrapolation`` over `x`.

    The result of ``stats.bootstrap`` is extended with two entries:

    - ``'steps'``: the integers ``1 .. max_steps - 1``;
    - ``'interpolated'``: a boolean array marking the steps that are smaller
      than ``x.sum()``.

    NOTE(review): ``stats`` here exposes ``bootstrap(x, fn=..., n_iter=...)``
    and ``rarefaction_extrapolation``, so it is a project module rather than
    ``scipy.stats`` — confirm against the file's imports.
    """
    steps = np.arange(1, max_steps)
    interpolated = steps < x.sum()

    estimator = partial(stats.rarefaction_extrapolation, max_steps=max_steps)
    accumulation = stats.bootstrap(x, fn=estimator, n_iter=n_iter)

    accumulation['interpolated'] = interpolated
    accumulation['steps'] = steps
    return accumulation
Exemplo n.º 11
0
def check_sample_var(sample, popvar):
    """Assert that `popvar` lies within a bootstrap CI of the sample variance.

    A 99.5% confidence interval for the ``ddof=1`` variance of `sample` is
    bootstrapped, and `popvar` must fall inside it (bounds inclusive).
    This used to be a chi-squared test for variance, but there were too
    many false positives.
    """
    def unbiased_var(x, axis):
        return x.var(ddof=1, axis=axis)

    interval = stats.bootstrap(
        (sample, ),
        unbiased_var,
        confidence_level=0.995,
    ).confidence_interval
    assert interval.low <= popvar <= interval.high
Exemplo n.º 12
0
def table_bootpack(table, bin_size, n_bootstraps, seed=8472):
    """Build a new table whose ``corr`` entries are replaced by boot packs.

    Records are grouped by the shape of their ``corr`` attribute. For each
    group the ``corr`` values are stacked along a new leading axis, and a
    ``data.BootPack`` is built from ``stats.mean`` of the stack and
    ``stats.bootstrap`` of the binned stack. Each record of the group is
    then re-emitted with its ``corr`` replaced by one element of that pack
    and with ``bin_size`` / ``n_bootstraps`` recorded on it.

    NOTE(review): ``data``, ``misc`` and ``stats`` are project modules;
    iterating a ``BootPack`` is assumed to yield one item per stacked
    record — confirm.
    """
    result = data.Table()
    groups = misc.sorted_groupby(table, key=lambda r: r.corr.shape)
    for _shape, rows in groups:
        rows = data.Table(rows)
        corrs = numpy.stack(rows['corr'], axis=0)
        central = stats.mean(corrs)
        resampled = stats.bootstrap(stats.bin_(corrs, bin_size),
                                    n_bootstraps, seed=seed)
        bootpacks = data.BootPack(central, resampled)
        result.extend(
            data.Record(record, corr=bootpack, bin_size=bin_size,
                        n_bootstraps=n_bootstraps)
            for record, bootpack in zip(rows, bootpacks))
    return result
def lower_ci_bound_on_raw_rewards(
    actions_with_scores_list: List[ActionsWithScores],
    debug: bool = False
) -> Union[AggregatedScores, List[Tuple[PlayerAction, float, float, float]]]:
    """
    Aggregates scores as the lower CI bound of the mean of the raw rewards.

    The rewards of every action are pooled across all permutations (no
    per-permutation averages are computed first), and the aggregated score
    of an action is the lower bound of the percentile bootstrap confidence
    interval of their mean. An action with a single reward gets that reward
    as its score. This requires MctsPlayerOptions.save_rewards to be True.

    If every node is fully simulated, the result of
    _average_ucb_for_fully_simulated_trees() is returned instead.

    If debug is True, each entry is (action, score, ci_low, ci_high)
    instead of (action, score).

    WARNING: This is very slow.
    """
    # pylint: disable=too-many-branches
    if are_all_nodes_fully_simulated(actions_with_scores_list):
        return _average_ucb_for_fully_simulated_trees(actions_with_scores_list)

    # Pool the rewards of each action across all permutations.
    rewards_by_action = defaultdict(list)
    for actions_with_scores in actions_with_scores_list:
        for action, score in actions_with_scores.items():
            if score.fully_simulated:
                # A fully simulated node contributes its score n times.
                pooled = [score.score for _ in range(score.n)]
            else:
                pooled = score.rewards
            rewards_by_action[action].extend(pooled)

    actions_and_scores = []
    for action, rewards in rewards_by_action.items():
        if len(rewards) == 1:
            # A single reward cannot be bootstrapped; use it for both limits.
            low = high = rewards[0]
        else:
            interval = bootstrap((rewards, ),
                                 np.mean,
                                 method='percentile',
                                 n_resamples=1000).confidence_interval
            low, high = interval.low, interval.high
        if debug:
            actions_and_scores.append((action, low, low, high))
        else:
            actions_and_scores.append((action, low))
    # noinspection PyUnreachableCode
    if __debug__:
        ranked = sorted(actions_and_scores, key=lambda x: x[1], reverse=True)
        logging.debug("MctsPlayer: Lower CI bounds on raw rewards:\n%s",
                      pprint.pformat(ranked))
    return actions_and_scores
Exemplo n.º 14
0
def test_bootstrap_against_theory(method):
    # based on https://www.statology.org/confidence-intervals-python/
    data = stats.norm.rvs(loc=5, scale=2, size=5000, random_state=0)
    alpha = 0.95
    dist = stats.t(df=len(data)-1, loc=np.mean(data), scale=stats.sem(data))
    expected_interval = dist.interval(alpha=alpha)
    expected_se = dist.std()

    res = bootstrap((data,), np.mean, n_resamples=5000,
                    confidence_level=alpha, method=method,
                    random_state=0)
    assert_allclose(res.confidence_interval, expected_interval, rtol=5e-4)
    assert_allclose(res.standard_error, expected_se, atol=3e-4)
Exemplo n.º 15
0
def test_bootstrap_gh15678(method):
    # Check that gh-15678 is fixed: when statistic function returned a Python
    # float, method="BCa" failed when trying to add a dimension to the float
    rng = np.random.default_rng(354645618886684)
    dist = stats.norm(loc=2, scale=4)
    data = dist.rvs(size=100, random_state=rng)
    data = (data, )
    res = bootstrap(data,
                    stats.skew,
                    method=method,
                    n_resamples=100,
                    random_state=np.random.default_rng(9563))
    # this always worked because np.apply_along_axis returns NumPy data type
    ref = bootstrap(data,
                    stats.skew,
                    method=method,
                    n_resamples=100,
                    random_state=np.random.default_rng(9563),
                    vectorized=False)
    assert_allclose(res.confidence_interval, ref.confidence_interval)
    assert_allclose(res.standard_error, ref.standard_error)
    assert isinstance(res.standard_error, np.float64)
Exemplo n.º 16
0
def test_bootstrap_paired(method):
    # test that `paired` works as expected
    np.random.seed(0)
    n = 100
    x = np.random.rand(n)
    y = np.random.rand(n)

    def my_statistic(x, y, axis=-1):
        return ((x-y)**2).mean(axis=axis)

    def my_paired_statistic(i, axis=-1):
        a = x[i]
        b = y[i]
        res = my_statistic(a, b)
        return res

    i = np.arange(len(x))

    res1 = bootstrap((i,), my_paired_statistic, random_state=0)
    res2 = bootstrap((x, y), my_statistic, paired=True, random_state=0)

    assert_allclose(res1.confidence_interval, res2.confidence_interval)
    assert_allclose(res1.standard_error, res2.standard_error)
Exemplo n.º 17
0
def compute_CI(data, metric=np.mean, confidence_level=0.95, axis=-1, n_resamples=999, eps=1e-8, **kwargs):
    """
    Bootstrap a confidence interval of `metric` along `axis` of `data`.

    data: np.array of shape (timesteps, sample_size) (second dim. is the number of runs for ex.)
    metric: statistic passed to scipy.stats.bootstrap
    eps: constant added to every element of `data` before resampling
    kwargs: forwarded unchanged to scipy.stats.bootstrap

    Returns:
    --------
    ci_l : np.array of shape (timesteps,)
    ci_u: np.array of shape (timesteps,)
    """
    from scipy.stats import bootstrap

    shifted = data + eps
    interval = bootstrap(
        (shifted,),
        metric,
        confidence_level=confidence_level,
        axis=axis,
        n_resamples=n_resamples,
        **kwargs,
    ).confidence_interval
    # the interval is a (low, high) pair
    return interval[0], interval[1]
Exemplo n.º 18
0
def test_bootstrap_against_itself_2samp(method, expected):
    # The expected values in this test were generated using bootstrap
    # to check for unintended changes in behavior. The test also makes sure
    # that bootstrap works with multi-sample statistics and that the
    # `axis` argument works as expected / function is vectorized.
    np.random.seed(0)

    n1 = 100  # size of sample 1
    n2 = 120  # size of sample 2
    n_resamples = 999  # number of bootstrap resamples used to form each CI
    confidence_level = 0.9

    # The statistic we're interested in is the difference in means
    def my_stat(data1, data2, axis=-1):
        mean1 = np.mean(data1, axis=axis)
        mean2 = np.mean(data2, axis=axis)
        return mean1 - mean2

    # The true difference in the means is -0.1
    dist1 = stats.norm(loc=0, scale=1)
    dist2 = stats.norm(loc=0.1, scale=1)
    stat_true = dist1.mean() - dist2.mean()

    # Do the same thing 1000 times. (The code is fully vectorized.)
    n_replications = 1000
    data1 = dist1.rvs(size=(n_replications, n1))
    data2 = dist2.rvs(size=(n_replications, n2))
    res = bootstrap((data1, data2),
                    statistic=my_stat,
                    confidence_level=confidence_level,
                    n_resamples=n_resamples,
                    batch=50,
                    method=method,
                    axis=-1)
    ci = res.confidence_interval

    # ci contains vectors of lower and upper confidence interval bounds
    ci_contains_true = np.sum((ci[0] < stat_true) & (stat_true < ci[1]))
    assert ci_contains_true == expected

    # ci_contains_true is not inconsistent with confidence_level
    pvalue = stats.binomtest(ci_contains_true, n_replications,
                             confidence_level).pvalue
    assert pvalue > 0.1
Exemplo n.º 19
0
    def genStatsFunction(
        self,
        fcn: Callable,
        fcnkwargs: dict[str, Any] = None,
    ) -> None:
        """
        A wrapper function to generate statistics via a generic function.

        Stores `fcn` and `fcnkwargs` on the instance, then fills `self.nums`
        by calling `self.statsFunctionWrapper` on the variable's nums (once
        for scalar variables, once per sequence index for 1-D variables).
        `self.vals` is a copy of `self.nums`, mapped through
        `self.var.nummap` when a nummap is present. When `self.bootstrap`
        is set, 'basic' bootstrap confidence bounds are stored alongside in
        `confidence_interval_{low,high}_{nums,vals}`.

        Variables that are neither scalar nor 1-D are left untouched.

        Parameters
        ----------
        fcn : Callable
            The function used to generate the desired statistics.
        fcnkwargs : dict[str, Any]
            The keyword arguments for the function. None is treated as an
            empty dict.
        """
        self.fcn = fcn
        if fcnkwargs is None:
            fcnkwargs = dict()
        self.fcnkwargs = fcnkwargs
        if self.bootstrap:
            # Number of bootstrap resamples used for every interval below.
            self.bootstrap_n = order_stat_TI_n(self.bootstrap_k,
                                               p=0.5,
                                               c=self.conf)

        # Scalar Variables
        if self.var.isscalar:
            # Calculate nums and confidence interval for each point in the sequence
            self.nums = self.statsFunctionWrapper(self.var.nums)
            if self.bootstrap:
                # Switch to method='Bca' once https://github.com/scipy/scipy/issues/15883 resolved
                res = bootstrap((np.array(self.var.nums), ),
                                self.statsFunctionWrapper,
                                confidence_level=self.conf,
                                n_resamples=self.bootstrap_n,
                                random_state=self.seed,
                                method='basic')
                self.confidence_interval_low_nums = res.confidence_interval.low
                self.confidence_interval_high_nums = res.confidence_interval.high

            # Calculate the corresponding vals based on the nummap
            self.vals = copy(self.nums)
            if self.bootstrap:
                self.confidence_interval_low_vals = copy(
                    self.confidence_interval_low_nums)
                self.confidence_interval_high_vals = copy(
                    self.confidence_interval_high_nums)
            if self.var.nummap is not None:
                self.vals = [self.var.nummap[num] for num in self.nums]
                if self.bootstrap:
                    self.confidence_interval_low_vals = \
                        [self.var.nummap[num] for num in self.confidence_interval_low_nums]
                    self.confidence_interval_high_vals = \
                        [self.var.nummap[num] for num in self.confidence_interval_high_nums]

        # 1-D Variables
        elif self.var.maxdim == 1:
            nums_list = get_list(self.var.nums)
            npoints = max(len(x) for x in nums_list)
            if self.bootstrap:
                confidence_interval_low_nums = []
                confidence_interval_high_nums = []

            # Calculate nums and confidence interval for each point in the sequence
            nums = []
            for i in range(npoints):
                # Sequences shorter than i + 1 do not contribute at index i.
                numsatidx = np.array([x[i] for x in nums_list if len(x) > i])
                nums.append(self.statsFunctionWrapper(numsatidx))
                if self.bootstrap:
                    # Switch to Bca once https://github.com/scipy/scipy/issues/15883 resolved
                    res = bootstrap((numsatidx, ),
                                    self.statsFunctionWrapper,
                                    confidence_level=self.conf,
                                    n_resamples=self.bootstrap_n,
                                    random_state=self.seed,
                                    method='basic')
                    confidence_interval_low_nums.append(
                        res.confidence_interval.low)
                    confidence_interval_high_nums.append(
                        res.confidence_interval.high)
            self.nums = nums
            if self.bootstrap:
                self.confidence_interval_low_nums = confidence_interval_low_nums
                self.confidence_interval_high_nums = confidence_interval_high_nums

            # Calculate the corresponding vals based on the nummap
            self.vals = copy(self.nums)
            if self.bootstrap:
                self.confidence_interval_low_vals = copy(
                    self.confidence_interval_low_nums)
                self.confidence_interval_high_vals = copy(
                    self.confidence_interval_high_nums)
            if self.var.nummap is not None:
                self.vals = [[self.var.nummap[x] for x in y]
                             for y in self.nums]
                if self.bootstrap:
                    self.confidence_interval_low_vals \
                        = [[self.var.nummap[x] for x in y]
                           for y in self.confidence_interval_low_nums]
                    # The mapped upper bounds belong in the *high* vals;
                    # they used to overwrite the low vals instead.
                    self.confidence_interval_high_vals \
                        = [[self.var.nummap[x] for x in y]
                           for y in self.confidence_interval_high_nums]

        else:
            # Suppress warning since this will become valid when Var is split
            # warn('VarStat only available for scalar or 1-D data')
            pass
Exemplo n.º 20
0
    sm_l.append(float(sm[key]))
    c_l.append(float(cython[key]))
    r_l.append(float(r[key]))
  
# Convert the lists filled by the append calls above into NumPy arrays so
# they can be subtracted element-wise.
gl_l = np.asarray(gl_l)
sm_l = np.asarray(sm_l)
c_l = np.asarray(c_l)
r_l = np.asarray(r_l)

# Absolute deviation of each implementation's values from `gl_l`
# (presumably the reference values — TODO confirm where gl_l is filled).
cython_errors = np.asarray(np.abs(c_l - gl_l), dtype=float)
sm_errors = np.asarray(np.abs(sm_l - gl_l), dtype=float)
r_errors = np.asarray(np.abs(r_l - gl_l), dtype=float)
#%%
import scipy.stats as stats
# Confidence level passed to the bootstrap call and shown in the report.
CI = .99
# Bootstrap confidence interval for the mean of the Cython absolute errors.
res_cython = stats.bootstrap((cython_errors,), np.mean, confidence_level=CI)
ci_cython = res_cython.confidence_interval
             
print(f"""             Mean Absolute Error CI\n
        Cython {CI*100}% Confidence Interval:
        ---------------------------------
             Lower     |     Upper
          -----------------------------
           {ci_cython.low:.3e}   |    {ci_cython.high:.3e}
      """)
      

res_sm = stats.bootstrap((sm_errors,), np.mean, confidence_level=CI)
ci_sm = res_sm.confidence_interval        
print(f"""\n\n
        Statsmodels {CI*100}% Confidence Interval:
Exemplo n.º 21
0
def test_bootstrap_iv():
    """Check the input validation of `bootstrap`.

    Every invalid call listed below must raise a ValueError whose message
    matches the given pattern. The cases are checked in order.
    """
    def statistic(x, y, axis):
        # two-sample statistic, used for the BCa restriction case
        return np.mean(x, axis) - np.mean(y, axis)

    valid_data = ([1, 2, 3],)

    # (message pattern, positional arguments, keyword arguments)
    cases = [
        ("`data` must be a sequence of samples.",
         (1, np.mean), {}),
        ("`data` must contain at least one sample.",
         (tuple(), np.mean), {}),
        ("each sample in `data` must contain two or more observations...",
         (([1, 2, 3], [1]), np.mean), {}),
        ("When `paired is True`, all samples must have the same length ",
         (([1, 2, 3], [1, 2, 3, 4]), np.mean), {'paired': True}),
        ("`vectorized` must be `True` or `False`.",
         (1, np.mean), {'vectorized': 'ekki'}),
        ("`axis` must be an integer.",
         (valid_data, np.mean), {'axis': 1.5}),
        ("could not convert string to float",
         (valid_data, np.mean), {'confidence_level': 'ni'}),
        ("`n_resamples` must be a positive integer.",
         (valid_data, np.mean), {'n_resamples': -1000}),
        ("`n_resamples` must be a positive integer.",
         (valid_data, np.mean), {'n_resamples': 1000.5}),
        ("`batch` must be a positive integer or None.",
         (valid_data, np.mean), {'batch': -1000}),
        ("`batch` must be a positive integer or None.",
         (valid_data, np.mean), {'batch': 1000.5}),
        ("`method` must be in",
         (valid_data, np.mean), {'method': 'ekki'}),
        ("`method = 'BCa' is only available for one-sample statistics",
         (([.1, .2, .3], [.1, .2, .3]), statistic), {'method': 'BCa'}),
        ("'herring' cannot be used to seed a",
         (valid_data, np.mean), {'random_state': 'herring'}),
    ]

    for message, args, kwargs in cases:
        with pytest.raises(ValueError, match=message):
            bootstrap(*args, **kwargs)
def _lower_ci_bound(ucbs: List[Tuple[float, int]]) -> float:
    """Return the lower bootstrap CI bound of the mean of the ``q / n`` scores.

    Each ``(q, n)`` pair in `ucbs` is reduced to the score ``q / n``. A
    single score is returned as is; otherwise the lower bound of the
    percentile bootstrap confidence interval of the mean score is returned.
    """
    scores = [total / count for total, count in ucbs]
    if len(scores) != 1:
        interval = bootstrap((scores, ), np.mean,
                             method='percentile').confidence_interval
        return interval.low
    # One score cannot be resampled; it is its own bound.
    return scores[0]
Exemplo n.º 23
0
def diversity(x,
              method=None,
              CI=False,
              conf=0.95,
              n_iter=1000,
              n_jobs=1,
              seed=None,
              disable_pb=False,
              **kwargs):
    r"""
    Wrapper for various bias-corrected richness functions

    Parameters
    ----------
    x : array-like, with shape (number of species)
        An array representing the abundances (observed
        counts) for each individual species. Converted to an
        int64 array before use.
    method : str (default = None)
        One estimator of:
            - 'chao1'
            - 'egghe_proot'
            - 'jackknife'
            - 'minsample'
            - 'empirical' (same as None)
        Matched case-insensitively against `ESTIMATORS`.
    CI : bool (default = False)
        Whether to compute confidence intervals (see Note).
    conf : float (default = 0.95)
        Passed to the estimator only for the 'jackknife' method
        with `CI` enabled.
    n_iter, n_jobs, seed, disable_pb
        Passed to `stats.bootstrap` when `CI` is enabled and the
        method is not 'jackknife'.
    **kwargs : additional parameters passed to selected method

    Note
    ----
    If `CI` is True, a bootstrap procedure will be called on the
    specified method to compute the confidence intervals around
    the central estimate etc. For the Jackknife procedure, the
    CI is calculated analytically and no bootstrap values will
    be included in the returned dict.

    Returns
    -------
    Consult the documentation of selected method.

    Raises
    ------
    ValueError
        If `x` contains negative counts, sums to zero or less, or
        `method` is not a known estimator.
    """

    x = np.array(x, dtype=np.int64)

    if np.any(x < 0):
        raise ValueError("Elements of `x` should be strictly non-negative")

    if x.sum() <= 0:
        raise ValueError("`x` appears to be empty")

    if method is None:
        method = "empirical"
    elif method.lower() not in ESTIMATORS:
        raise ValueError(f"Unknown estimation method `{method}`.")
    method = method.lower()

    estimator = ESTIMATORS[method]

    if not CI:
        return estimator(x, **kwargs)

    if method == 'jackknife':
        # the jackknife estimator computes its own interval
        return estimator(x, CI=CI, conf=conf, **kwargs)

    return stats.bootstrap(x,
                           fn=partial(estimator, **kwargs),
                           n_iter=n_iter,
                           n_jobs=n_jobs,
                           seed=seed,
                           disable_pb=disable_pb)