Exemplo n.º 1
0
def compute_ci(data,
               outcome,
               estimates,
               ci_method="percentile",
               alpha=0.05,
               n_cores=1):
    """Compute confidence intervals for bootstrap estimates.

    Parts of the code of the subfunctions of this function are taken from
    Daniel Saxton's resample library, as found on
    https://github.com/dsaxton/resample/ .

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the data calculating statistic of interest.
            Needs to return a pandas Series.
        estimates (pandas.DataFrame): DataFrame of estimates in the bootstrap samples.
        ci_method (str): method of choice for confidence interval computation.
        alpha (float): significance level of choice.
        n_cores (int): number of jobs for parallelization.

    Returns:
        cis (pandas.DataFrame): DataFrame where k'th row contains CI for k'th parameter.

    """
    check_inputs(data=data, alpha=alpha, ci_method=ci_method)

    # Dispatch to the module-level helper implementing the requested method,
    # e.g. ci_method="percentile" resolves to _ci_percentile.
    ci_func = globals()["_ci_" + ci_method]
    raw_cis = ci_func(data, outcome, estimates, alpha, n_cores)

    return pd.DataFrame(
        raw_cis,
        index=estimates.columns.tolist(),
        columns=["lower_ci", "upper_ci"],
    )
def test_check_inputs_cluster_by(setup):
    """check_inputs raises if cluster_by is not a column of the data."""
    bad_column = "this is not a column name of df"

    with pytest.raises(ValueError) as error:
        check_inputs(data=setup["df"], cluster_by=bad_column)

    assert (
        str(error.value)
        == "Input 'cluster_by' must be None or a column name of 'data'."
    )
def test_check_inputs_data():
    """check_inputs raises TypeError when data is not a pandas object."""
    not_a_frame = "this is not a data frame"

    with pytest.raises(TypeError) as error:
        check_inputs(data=not_a_frame)

    assert str(error.value) == "Data must be a pandas.DataFrame or pandas.Series."
def test_check_inputs_ci_level(setup):
    """check_inputs rejects a ci_level outside the unit interval."""
    invalid_level = 666

    with pytest.raises(ValueError) as error:
        check_inputs(data=setup["df"], ci_level=invalid_level)

    assert str(error.value) == "Input 'ci_level' must be in [0,1]."
Exemplo n.º 5
0
def test_check_inputs_ci_method(setup, expected):
    """check_inputs rejects an unknown ci_method value."""
    invalid_method = 4

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=setup["df"], ci_method=invalid_method)

    msg = (
        "ci_method must be 'percentile', 'bc',"
        f" 'bca', 't', 'basic' or 'normal', '{invalid_method}'"
        f" was supplied"
    )
    assert str(excinfo.value) == msg
def get_bootstrap_outcomes(
    data,
    outcome,
    outcome_kwargs=None,
    cluster_by=None,
    seed=None,
    n_draws=1000,
    n_cores=1,
    error_handling="continue",
    batch_evaluator=joblib_batch_evaluator,
):
    """Draw bootstrap samples and calculate outcomes.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the dataset calculating statistic of interest.
            Needs to return array-like object or pd.Series.
        outcome_kwargs (dict): Additional keyword arguments that are passed on
            to ``outcome``.
        cluster_by (str): column name of the variable to cluster by.
        seed (int): Random seed.
        n_draws (int): number of draws, only relevant if seeds is None.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.

    Returns:
        estimates (pandas.DataFrame): Outcomes for different bootstrap samples. The
            columns are the index of the result of ``outcome``.

    """

    check_inputs(data=data, cluster_by=cluster_by)

    if outcome_kwargs is not None:
        # outcome_kwargs is a dict of keyword arguments, so it must be
        # unpacked with ``**``. The previous ``*outcome_kwargs`` would have
        # passed the dict's *keys* as positional arguments to ``outcome``.
        outcome = partial(outcome, **outcome_kwargs)

    indices = get_bootstrap_indices(
        data=data,
        cluster_by=cluster_by,
        seed=seed,
        n_draws=n_draws,
    )

    estimates = _get_bootstrap_outcomes_from_indices(
        indices=indices,
        data=data,
        outcome=outcome,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=batch_evaluator,
    )

    return estimates
def get_bootstrap_outcomes(
    data,
    outcome,
    cluster_by=None,
    rng=None,
    n_draws=1000,
    n_cores=1,
    error_handling="continue",
    batch_evaluator="joblib",
):
    """Draw bootstrap samples and evaluate the outcome function on each.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the dataset calculating statistic of interest.
            Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
        cluster_by (str): column name of the variable to cluster by.
        rng (numpy.random.Generator): A random number generator.
        n_draws (int): number of bootstrap draws.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.

    Returns:
        estimates (list):  List of pytrees of estimated bootstrap outcomes.
    """
    check_inputs(data=data, cluster_by=cluster_by)

    # Resolve a string name like "joblib" into an actual evaluator callable.
    evaluator = process_batch_evaluator(batch_evaluator)

    bootstrap_indices = get_bootstrap_indices(
        data=data,
        rng=rng,
        cluster_by=cluster_by,
        n_draws=n_draws,
    )

    return _get_bootstrap_outcomes_from_indices(
        indices=bootstrap_indices,
        data=data,
        outcome=outcome,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=evaluator,
    )
Exemplo n.º 8
0
def calculate_ci(
    base_outcome,
    estimates,
    ci_method="percentile",
    ci_level=0.95,
):
    """Compute confidence interval of bootstrap estimates.

    Parts of the code of the subfunctions of this function are taken from
    Daniel Saxton's resample library, as found on
    https://github.com/dsaxton/resample/


    Args:
        base_outcome (list): List of flat base outcomes, i.e. the outcome
            statistic(s) evaluated on the original data set.
        estimates (np.ndarray): Array of estimates computed on the bootstrapped
            samples.
        ci_method (str): Method of choice for computing confidence intervals.
            The default is "percentile".
        ci_level (float): Confidence level for the calculation of confidence
            intervals. The default is 0.95.

    Returns:
        np.ndarray: 1d array of the lower confidence interval, where the k'th entry
            contains the lower confidence interval for the k'th parameter.
        np.ndarray: 1d array of the upper confidence interval, where the k'th entry
            contains the upper confidence interval for the k'th parameter.

    Raises:
        ValueError: If ``ci_method`` has no implementation in this function.
    """
    check_inputs(ci_method=ci_method, ci_level=ci_level, skipdata=True)

    alpha = 1 - ci_level

    if ci_method == "percentile":
        cis = _ci_percentile(estimates, alpha)
    elif ci_method == "bc":
        cis = _ci_bc(estimates, base_outcome, alpha)
    elif ci_method == "t":
        cis = _ci_t(estimates, base_outcome, alpha)
    elif ci_method == "basic":
        cis = _ci_basic(estimates, base_outcome, alpha)
    elif ci_method == "normal":
        cis = _ci_normal(estimates, base_outcome, alpha)
    else:
        # Guard against methods that pass check_inputs but have no branch
        # here (previously this fell through and raised an opaque
        # UnboundLocalError on ``cis``).
        raise ValueError(f"Invalid ci_method: {ci_method}")

    return cis[:, 0], cis[:, 1]
Exemplo n.º 9
0
def bootstrap_from_outcomes(
    data, outcome, bootstrap_outcomes, ci_method="percentile", alpha=0.05, n_cores=1
):
    """Set up results table containing mean, standard deviation and confidence interval
    for each estimated parameter.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the data calculating statistic of interest.
            Needs to return a pandas Series.
        bootstrap_outcomes (pandas.DataFrame): DataFrame of bootstrap_outcomes in the
            bootstrap samples.
        ci_method (str): method of choice for confidence interval computation.
        n_cores (int): number of jobs for parallelization.
        alpha (float): significance level of choice.

    Returns:
        out (dict): Dictionary with entries:
            - "summary" (pandas.DataFrame): table with mean, std, lower_ci and
              upper_ci per estimated parameter.
            - "cov" (pandas.DataFrame): covariance matrix of the bootstrap
              outcomes.
            - "outcomes" (pandas.DataFrame): the bootstrap outcomes passed in.

    """

    check_inputs(data=data, ci_method=ci_method, alpha=alpha)

    summary = pd.DataFrame(bootstrap_outcomes.mean(axis=0), columns=["mean"])

    summary["std"] = bootstrap_outcomes.std(axis=0)

    cis = compute_ci(data, outcome, bootstrap_outcomes, ci_method, alpha, n_cores)
    summary["lower_ci"] = cis["lower_ci"]
    summary["upper_ci"] = cis["upper_ci"]

    cov = bootstrap_outcomes.cov()

    out = {"summary": summary, "cov": cov, "outcomes": bootstrap_outcomes}

    return out
Exemplo n.º 10
0
        seeds (numpy.array): array of seeds for bootstrap samples, default is none.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.

    Returns:
        results (pandas.DataFrame): DataFrame where k'th row contains mean estimate,
        standard error, and confidence interval of k'th parameter.

    """

    check_inputs(data, cluster_by, ci_method, alpha)

    estimates = get_bootstrap_outcomes(
        data=data,
        outcome=outcome,
        outcome_kwargs=outcome_kwargs,
        cluster_by=cluster_by,
        seed=seed,
        n_draws=n_draws,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=batch_evaluator,
    )

    out = bootstrap_from_outcomes(data, outcome, estimates, ci_method, alpha, n_cores)
Exemplo n.º 11
0
def test_check_inputs_alpha(setup, expected):
    """check_inputs rejects an alpha outside the unit interval."""
    invalid_alpha = 666

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=setup["df"], alpha=invalid_alpha)

    assert str(excinfo.value) == "Input 'alpha' must be in [0,1]."
Exemplo n.º 12
0
def test_check_inputs_cluster_by(setup, expected):
    """check_inputs rejects a cluster_by that is not a column of the data."""
    bad_column = "this is not a column name of df"

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=setup["df"], cluster_by=bad_column)

    expected_msg = "Input 'cluster_by' must be None or a column name of DataFrame."
    assert str(excinfo.value) == expected_msg
Exemplo n.º 13
0
def test_check_inputs_data(setup, expected):
    """check_inputs rejects data that is not a DataFrame."""
    not_a_frame = "this is not a data frame"

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=not_a_frame)

    assert str(excinfo.value) == "Input 'data' must be DataFrame."
Exemplo n.º 14
0
            Generator instance then that instance is used.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.

    Returns:
        BootstrapResult: A BootstrapResult object storing information on summary
            statistics, the covariance matrix, and estimated boostrap outcomes.
    """
    if callable(outcome):

        check_inputs(data=data, cluster_by=cluster_by)

        if outcome_kwargs is not None:
            outcome = functools.partial(outcome, **outcome_kwargs)
    else:
        raise ValueError("outcome must be a callable.")

    if existing_result is None:
        base_outcome = outcome(data)
        existing_outcomes = []
    elif isinstance(existing_result, BootstrapResult):
        base_outcome = existing_result.base_outcome
        existing_outcomes = existing_result.outcomes
    else:
        raise ValueError("existing_result must be None or a BootstrapResult.")