def bootstrap_one_branch(
    sc,
    data,
    stat_fn=np.mean,
    num_samples=10000,
    seed_start=None,
    threshold_quantile=None,
    summary_quantiles=mabs.DEFAULT_QUANTILES,
):
    """Summarize the bootstrap distribution of ``stat_fn`` for a single branch.

    The data are resampled ``num_samples`` times, ``stat_fn`` is evaluated
    on every resample, and the distribution of those outputs is condensed
    into summary statistics.

    Args:
        sc: The spark context.
        data: The observations, given as a list, 1D numpy array, or
            pandas Series.
        stat_fn: The statistic to bootstrap. It must accept a
            one-dimensional ndarray and aggregate it either to a scalar
            (the default ``np.mean`` bootstraps means) or to a dict of
            scalars.
        num_samples: How many bootstrap iterations to run.
        seed_start: An int used to seed numpy's RNG. It must be unique
            within this set of calculations.
        threshold_quantile (float, optional): If given, the quantile above
            which outliers are discarded, e.g. ``0.9999``.
        summary_quantiles (list, optional): The quantiles that define the
            confidence bands on the branch statistics. Adjust these when
            applying Bonferroni corrections.

    Returns:
        Whatever ``mabs.summarize_one_branch_samples`` produces for the
        bootstrap samples and ``summary_quantiles``.
    """
    # Arguments are forwarded positionally, in the order the sampler expects.
    return mabs.summarize_one_branch_samples(
        get_bootstrap_samples(
            sc, data, stat_fn, num_samples, seed_start, threshold_quantile
        ),
        summary_quantiles,
    )
def test_summarize_one_branch_samples():
    """Check quantiles and mean of a uniform grid on [0, 1] given as a Series."""
    uniform_grid = pd.Series(np.linspace(0, 1, 1001))
    summary = mabs.summarize_one_branch_samples(uniform_grid, [0.05, 0.31, 0.95])

    # Three requested quantiles plus the mean.
    assert summary.shape == (4, )

    expected_by_label = {
        "0.05": 0.05,
        "0.31": 0.31,
        "0.95": 0.95,
        "mean": 0.5,
    }
    for label, expected in expected_by_label.items():
        assert summary[label] == pytest.approx(expected)
def test_summarize_one_branch_vs_samples():
    """Check that the summary from aggregates agrees with one built from samples."""
    quantiles = (0.5, 0.41)
    num_samples = 100000
    tolerance = 0.001

    agg = pd.Series([30, 80], index=["num_conversions", "num_enrollments"])

    # Summary computed directly from the aggregated counts.
    from_agg = mabsb.summarize_one_branch_from_agg(agg, quantiles=list(quantiles))

    # Summary computed from samples drawn for the same single-row aggregate.
    drawn = mabsb.get_samples(
        agg.to_frame().T, "num_enrollments", "num_conversions", num_samples
    )
    from_samples = mabs.summarize_one_branch_samples(
        drawn.iloc[:, 0], quantiles=list(quantiles)
    )

    for label in ("0.5", "0.41", "mean"):
        assert from_agg[label] == pytest.approx(from_samples[label], abs=tolerance)
def test_summarize_one_branch_samples_batch():
    """Check that a two-column DataFrame yields one summary row per column."""
    uniform_grid = pd.Series(np.linspace(0, 1, 1001))
    batch = pd.DataFrame({"a": uniform_grid, "b": uniform_grid + 1})

    summary = mabs.summarize_one_branch_samples(batch, quantiles=[0.05, 0.31, 0.95])

    # Two input columns; three quantiles plus the mean for each.
    assert summary.shape == (2, 4)

    # Column "b" is column "a" shifted up by one, and so are its statistics.
    expected = {
        "a": {"0.05": 0.05, "0.31": 0.31, "0.95": 0.95, "mean": 0.5},
        "b": {"0.05": 1.05, "0.31": 1.31, "0.95": 1.95, "mean": 1.5},
    }
    for column, stats in expected.items():
        for label, value in stats.items():
            assert summary.loc[column, label] == pytest.approx(value)
def test_summarize_one_branch_samples_batch():
    """Check the per-column summaries produced for a DataFrame of samples."""
    base = pd.Series(np.linspace(0, 1, 1001))
    frame = pd.DataFrame({'a': base, 'b': base + 1})
    result = mabs.summarize_one_branch_samples(frame, quantiles=[0.05, 0.31, 0.95])

    # One row per input column; three quantiles and the mean as columns.
    assert result.shape == (2, 4)

    # (row label, column label, expected value)
    checks = [
        ('a', '0.05', 0.05),
        ('a', '0.31', 0.31),
        ('a', '0.95', 0.95),
        ('a', 'mean', 0.5),
        ('b', '0.05', 1.05),
        ('b', '0.31', 1.31),
        ('b', '0.95', 1.95),
        ('b', 'mean', 1.5),
    ]
    for row, col, want in checks:
        assert result.loc[row, col] == pytest.approx(want)
def bootstrap_one_branch(
    data,
    stat_fn=bb_mean,
    num_samples=10000,
    seed_start=None,
    threshold_quantile=None,
    summary_quantiles=mabs.DEFAULT_QUANTILES,
    sc=None,
):
    """Summarize the bootstrap distribution of ``stat_fn`` for a single branch.

    ``stat_fn`` is evaluated on ``num_samples`` resamples of ``data`` and
    the resulting values are condensed into summary statistics.

    Args:
        data: The observations, given as a list, 1D numpy array, or
            pandas Series.
        stat_fn (callable, optional): The statistic to bootstrap. It must
            accept two parameters:

            * a one-dimensional ndarray or pandas Series of values,
            * an identically shaped object of weights for these values

            and aggregate each resampled population either to

            * a scalar (e.g. the default, ``bb_mean``), or
            * a dict of scalars (e.g. the func returned by
              ``make_bb_quantile_closure`` when given multiple
              quantiles).
        num_samples: How many bootstrap iterations to run.
        seed_start: An int used to seed numpy's RNG. It must be unique
            within this set of calculations.
        threshold_quantile (float, optional): If given, the quantile above
            which outliers are discarded, e.g. ``0.9999``.
        summary_quantiles (list, optional): The quantiles that define the
            confidence bands on the branch statistics. Adjust these when
            applying Bonferroni corrections.
        sc (optional): The Spark context, if available.

    Returns:
        Whatever ``mabs.summarize_one_branch_samples`` produces for the
        bootstrap samples and ``summary_quantiles``.
    """
    # Arguments are forwarded positionally, in the order the sampler expects.
    return mabs.summarize_one_branch_samples(
        get_bootstrap_samples(
            data, stat_fn, num_samples, seed_start, threshold_quantile, sc
        ),
        summary_quantiles,
    )