Пример #1
0
def bootstrap_one_branch(sc,
                         data,
                         stat_fn=np.mean,
                         num_samples=10000,
                         seed_start=None,
                         threshold_quantile=None,
                         summary_quantiles=mabs.DEFAULT_QUANTILES):
    """Bootstrap a statistic for a single branch and summarize the result.

    The data are resampled ``num_samples`` times; ``stat_fn`` is
    evaluated on every resample, and the resulting distribution of
    ``stat_fn`` outputs is condensed into summary statistics.

    Args:
        sc: The spark context
        data: The observations, given as a list, a 1D numpy array, or
            a pandas Series
        stat_fn: A callable that takes a one-dimensional ndarray and
            reduces each resampled population either to a single
            scalar (as the default ``np.mean`` does, which bootstraps
            means) or to a dict of scalars.
        num_samples: How many bootstrap iterations to run
        seed_start: An int used to seed numpy's RNG. It must be unique
            within this set of calculations.
        threshold_quantile (float, optional): If given, a quantile
            above which outliers are discarded, e.g. ``0.9999``.
        summary_quantiles (list, optional): The quantiles that define
            the confidence bands on the branch statistics. Adjust
            these when applying Bonferroni corrections.

    Returns:
        The output of ``mabs.summarize_one_branch_samples`` applied to
        the bootstrap samples with ``summary_quantiles``.
    """
    # Stage 1: draw the bootstrap replicates of ``stat_fn``.
    bootstrap_draws = get_bootstrap_samples(
        sc, data, stat_fn, num_samples, seed_start, threshold_quantile
    )

    # Stage 2: collapse the replicates into summary statistics.
    branch_summary = mabs.summarize_one_branch_samples(
        bootstrap_draws, summary_quantiles
    )
    return branch_summary
Пример #2
0
def test_summarize_one_branch_samples():
    """Summarizing a Series yields the requested quantiles plus the mean."""
    # 1001 evenly spaced points on [0, 1].
    uniform_grid = pd.Series(np.linspace(0, 1, 1001))

    summary = mabs.summarize_one_branch_samples(uniform_grid,
                                                [0.05, 0.31, 0.95])

    # Three quantile entries and one "mean" entry.
    assert summary.shape == (4, )

    expected_by_label = {
        "0.05": 0.05,
        "0.31": 0.31,
        "0.95": 0.95,
        "mean": 0.5,
    }
    for label, expected in expected_by_label.items():
        assert summary[label] == pytest.approx(expected)
Пример #3
0
def test_summarize_one_branch_vs_samples():
    """Summary from aggregates must agree with the summary from samples.

    The summary computed directly from the aggregate counts by
    ``mabsb.summarize_one_branch_from_agg`` is compared, entry by
    entry, with the summary of 100000 samples obtained from
    ``mabsb.get_samples``; each pair must match to within 0.001.
    """
    aggregates = pd.Series([30, 80],
                           index=["num_conversions", "num_enrollments"])

    from_aggregates = mabsb.summarize_one_branch_from_agg(
        aggregates, quantiles=[0.5, 0.41])

    # get_samples takes a frame, so turn the Series into a one-row frame.
    one_row_frame = aggregates.to_frame().T
    draws = mabsb.get_samples(one_row_frame, "num_enrollments",
                              "num_conversions", 100000)

    # Summarize the first column of the returned samples.
    first_column = draws.iloc[:, 0]
    from_draws = mabs.summarize_one_branch_samples(first_column,
                                                   quantiles=[0.5, 0.41])

    for label in ("0.5", "0.41", "mean"):
        assert from_aggregates[label] == pytest.approx(from_draws[label],
                                                       abs=0.001)
Пример #4
0
def test_summarize_one_branch_samples_batch():
    """Summarizing a DataFrame gives one row of statistics per column."""
    # Column "a" is 1001 evenly spaced points on [0, 1]; "b" is "a" + 1.
    base = pd.Series(np.linspace(0, 1, 1001))
    frame = pd.DataFrame({"a": base, "b": base + 1})

    summary = mabs.summarize_one_branch_samples(
        frame, quantiles=[0.05, 0.31, 0.95])

    # Two input columns, each with three quantiles and a mean.
    assert summary.shape == (2, 4)

    expected = {
        "a": {"0.05": 0.05, "0.31": 0.31, "0.95": 0.95, "mean": 0.5},
        "b": {"0.05": 1.05, "0.31": 1.31, "0.95": 1.95, "mean": 1.5},
    }
    for column, stats in expected.items():
        for stat_label, value in stats.items():
            assert summary.loc[column, stat_label] == pytest.approx(value)
Пример #5
0
def test_summarize_one_branch_samples_batch():
    """Check the batch (DataFrame) form of ``summarize_one_branch_samples``.

    Two columns are summarized at once: 'a' holds 1001 evenly spaced
    points on [0, 1] and 'b' holds the same points shifted up by one.
    The result is indexed by column name, with one entry per requested
    quantile plus a 'mean' entry.
    """
    grid = pd.Series(np.linspace(0, 1, 1001))
    shifted = grid + 1
    samples = pd.DataFrame({'a': grid, 'b': shifted})

    stats = mabs.summarize_one_branch_samples(samples,
                                              quantiles=[0.05, 0.31, 0.95])
    assert stats.shape == (2, 4)

    # (row label, column label, expected value), in the order checked.
    checks = (
        ('a', '0.05', 0.05),
        ('a', '0.31', 0.31),
        ('a', '0.95', 0.95),
        ('a', 'mean', 0.5),
        ('b', '0.05', 1.05),
        ('b', '0.31', 1.31),
        ('b', '0.95', 1.95),
        ('b', 'mean', 1.5),
    )
    for row, col, want in checks:
        assert stats.loc[row, col] == pytest.approx(want)
Пример #6
0
def bootstrap_one_branch(
    data,
    stat_fn=bb_mean,
    num_samples=10000,
    seed_start=None,
    threshold_quantile=None,
    summary_quantiles=mabs.DEFAULT_QUANTILES,
    sc=None,
):
    """Bootstrap ``stat_fn`` for a single branch and summarize the output.

    ``stat_fn`` is evaluated on ``num_samples`` resamples of ``data``,
    and the results are condensed into summary statistics.

    Args:
        data: The observations, given as a list, a 1D numpy array, or
            a pandas Series
        stat_fn (callable, optional): A function that does one of:

            * reduce each resampled population to a scalar (as the
              default, ``bb_mean``, does), or
            * reduce each resampled population to a dict of scalars
              (e.g. the func returned by ``make_bb_quantile_closure``
              when given multiple quantiles).

            Either way, the function must take two parameters:

            * a one-dimensional ndarray or pandas Series of values,
            * an identically shaped object of weights for these values

        num_samples: How many bootstrap iterations to run
        seed_start: An int used to seed numpy's RNG. It must be unique
            within this set of calculations.
        threshold_quantile (float, optional): If given, a quantile
            above which outliers are discarded, e.g. ``0.9999``.
        summary_quantiles (list, optional): The quantiles that define
            the confidence bands on the branch statistics. Adjust
            these when applying Bonferroni corrections.
        sc (optional): The Spark context, if available

    Returns:
        The output of ``mabs.summarize_one_branch_samples`` applied to
        the bootstrap samples with ``summary_quantiles``.
    """
    # Generate the bootstrap replicates and summarize them in one step.
    return mabs.summarize_one_branch_samples(
        get_bootstrap_samples(
            data, stat_fn, num_samples, seed_start, threshold_quantile, sc
        ),
        summary_quantiles,
    )