예제 #1
0
def test_clustering_leaves_households_intact(data):
    """Check that a clustered draw keeps every sampled household complete.

    For each household that appears in the bootstrap sample, the set of ``id``
    values in the sample must equal the set of ``id`` values that household has
    in the original ``data``.
    """
    np.random.seed(1234)
    draws = get_bootstrap_indices(data, cluster_by="hh", n_draws=1)
    resampled = data.iloc[draws[0]]

    for hh in resampled["hh"].unique():
        # Rows belonging to this household in the full data and in the draw.
        in_data = data["hh"] == hh
        ids_in_data = set(data[in_data]["id"].unique())

        in_sample = resampled["hh"] == hh
        ids_in_sample = set(resampled[in_sample]["id"].unique())

        assert ids_in_data == ids_in_sample
def get_bootstrap_outcomes(
    data,
    outcome,
    outcome_kwargs=None,
    cluster_by=None,
    seed=None,
    n_draws=1000,
    n_cores=1,
    error_handling="continue",
    batch_evaluator=joblib_batch_evaluator,
):
    """Draw bootstrap samples and calculate outcomes.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the dataset calculating statistic of interest.
            Needs to return array-like object or pd.Series.
        outcome_kwargs (dict): Additional keyword arguments for ``outcome``. If not
            None, they are bound to ``outcome`` by name via ``functools.partial``
            before any bootstrap sample is evaluated.
        cluster_by (str): column name of the variable to cluster by.
        seed (int): Random seed, forwarded to ``get_bootstrap_indices``.
        n_draws (int): number of bootstrap draws.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.
            Passed on unchanged to ``_get_bootstrap_outcomes_from_indices``.

    Returns:
        estimates (pandas.DataFrame): Outcomes for different bootstrap samples. The
            columns are the index of the result of ``outcome``.

    """

    check_inputs(data=data, cluster_by=cluster_by)

    if outcome_kwargs is not None:
        # Bind by keyword: unpacking with a single ``*`` would pass the dict's
        # keys as leading positional arguments, displacing the dataset.
        outcome = partial(outcome, **outcome_kwargs)

    indices = get_bootstrap_indices(
        data=data,
        cluster_by=cluster_by,
        seed=seed,
        n_draws=n_draws,
    )

    estimates = _get_bootstrap_outcomes_from_indices(
        indices=indices,
        data=data,
        outcome=outcome,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=batch_evaluator,
    )

    return estimates
def get_bootstrap_outcomes(
    data,
    outcome,
    cluster_by=None,
    rng=None,
    n_draws=1000,
    n_cores=1,
    error_handling="continue",
    batch_evaluator="joblib",
):
    """Evaluate ``outcome`` on bootstrap samples drawn from ``data``.

    The inputs are validated, the batch evaluator is resolved, bootstrap indices
    are drawn and finally ``outcome`` is evaluated for the drawn indices.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the dataset calculating statistic of interest.
            Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
        cluster_by (str): column name of the variable to cluster by.
        rng (numpy.random.Generator): A random number generator.
        n_draws (int): number of bootstrap draws.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.

    Returns:
        estimates (list):  List of pytrees of estimated bootstrap outcomes.
    """
    check_inputs(data=data, cluster_by=cluster_by)
    evaluator = process_batch_evaluator(batch_evaluator)

    drawn_indices = get_bootstrap_indices(
        data=data,
        rng=rng,
        cluster_by=cluster_by,
        n_draws=n_draws,
    )

    return _get_bootstrap_outcomes_from_indices(
        indices=drawn_indices,
        data=data,
        outcome=outcome,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=evaluator,
    )
예제 #4
0
def test_get_bootstrap_indices_radomization_works_with_clustering(data):
    """Two clustered draws from one generator must not cover the same index set."""
    generator = get_rng(seed=12345)
    draws = get_bootstrap_indices(data, cluster_by="hh", n_draws=2, rng=generator)
    first_draw = set(draws[0])
    second_draw = set(draws[1])
    assert first_draw != second_draw
예제 #5
0
def test_get_bootstrap_indices_randomization_works_without_clustering(data):
    """Two unclustered draws must not cover the same index set."""
    np.random.seed(1234)
    draws = get_bootstrap_indices(data, n_draws=2)
    first_draw = set(draws[0])
    second_draw = set(draws[1])
    assert first_draw != second_draw