    def test_bootstrap_input_array_samples_several(self, sample_array_type):
        res = bootstrap(sample_array_type,
                        num_iter=100,
                        resample_size=100,
                        metrics={"mean": np.mean})

        # if all means are the same there could be two issues: not actually calculating the mean from the sample
        # OR not actually sampling with replacement
        counter = create_count_of_sampled_items(res)
        assert len(counter) > 1

    def test_bootstrap_input_list_has_at_least_one_duplicate(
            self, sample_list_type):
        res = bootstrap(sample_list_type,
                        num_iter=100,
                        resample_size=100,
                        metrics={"mean": np.mean})
        counter = create_count_of_sampled_items(res)
        assert any(count > 1 for count in counter.values())

        # Check whether the bootstrap means lie around the true mean of the given 'sample'. This cannot be asserted
        # without fixing the seed, which I do not want to do in this implementation, so the result is only printed:
        # fewer than 15 of the 100 bootstrap means should be off from the true mean (50) by 5 or more.
        outliers = [metric for metric in res if abs(metric['mean'] - 50) >= 5]
        print(len(outliers) < 15)
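
For reference, the property these tests rely on (a resample drawn with replacement almost surely repeats some items, and the resulting bootstrap means vary rather than collapse to a single value) can be checked directly with plain numpy. A minimal standalone sketch, independent of the project's `bootstrap` and `create_count_of_sampled_items` helpers; the `sample` array below is only illustrative:

import numpy as np
from collections import Counter

rng = np.random.default_rng()
sample = np.arange(100)  # stand-in for the test fixture

# One bootstrap resample: same size as the sample, drawn with replacement.
resample = rng.choice(sample, size=sample.size, replace=True)
counts = Counter(resample.tolist())

# A size-100 resample of 100 distinct items drawn with replacement repeats at least one item
# with probability 1 - 100!/100**100, i.e. essentially always.
print(any(count > 1 for count in counts.values()))

# The resample means scatter around the sample mean instead of collapsing to one value.
means = [rng.choice(sample, size=sample.size, replace=True).mean() for _ in range(100)]
print(len(set(means)) > 1)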
Example No. 3
import os
import sys

# Add the project root to sys.path so that the `src` package resolves when this script is run directly.
SRC_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(SRC_ROOT)

from src.bootstrap import bootstrap, get_application
from src.settings import settings

bootstrap()
application = get_application()

if __name__ == '__main__':
    application.run(debug=settings.DEBUG,
                    host=settings.HOST,
                    port=settings.PORT,
                    threaded=False)
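
For orientation only, a purely hypothetical sketch of what src/bootstrap.py and src/settings.py might contain for a runner like this; the Flask app factory, the environment-variable settings, and all names except bootstrap, get_application, and settings are assumptions not confirmed by the excerpt:

# src/settings.py (hypothetical)
import os


class Settings:
    DEBUG = os.environ.get("DEBUG", "0") == "1"
    HOST = os.environ.get("HOST", "127.0.0.1")
    PORT = int(os.environ.get("PORT", "8000"))


settings = Settings()


# src/bootstrap.py (hypothetical)
from flask import Flask

_app = None


def bootstrap():
    # One-time initialisation: build the application object, load config, set up logging, etc.
    global _app
    _app = Flask(__name__)


def get_application():
    return _app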
Example No. 4
def main():
    '''
    Main function to be called
    '''
    return bootstrap()
Example No. 5
    def test_main(self):
        '''
        placeholder test
        '''
        assert bootstrap() == 'foo'
Example No. 6
# Imports used by this example; the project-local helpers (bootstrap, get_sample,
# get_population_and_metrics, confidence_interval, ConfidenceInterval, MetricResult)
# are assumed to be importable in the original module.
from typing import Any, Callable, Dict, Iterable, List, Tuple

from tqdm import tqdm


def simulation(
    population_function,
    metric_functions: Dict[str, Any],
    num_sample_draws: int,
    conf_interval: Callable[[Iterable[Any], float],
                            ConfidenceInterval] = confidence_interval,
    pop_size: int = 100000,
    sample_size: int = 1000,
    resample_size: int = 1000,
    num_bootstraps: int = 1000,
    coverage: float = 0.95
) -> Tuple[List[Dict[str, MetricResult]], Dict[str, Any]]:
    """
    Simulates several sample draws from the actual population to see if about 90% of the time the measured metrics
    actually lie within the calculated confidence interval given by the bootstrapping procedure.

    Args:
        population_function: the function that returns a population of size pop_size
        metric_functions: a dict mapping metric names to functions that take an iterable as input and return a
                          single value as output
        num_sample_draws: how many samples (drawn without replacement) to simulate from the population
        conf_interval: the function that calculates the confidence interval from the bootstrap results; there are
                       several ways of calculating a confidence interval.
        pop_size: size of the true population
        sample_size: size of the sample for which we simulate
        resample_size: size of the bootstrap resamples (it is recommended to use the sample size or at least 50% of it)
        num_bootstraps: how often the bootstrap resamples should be drawn
        coverage: the coverage of the confidence interval

    Returns:
        A tuple (sim_res, metrics):
            `sim_res` is a list of dictionaries of MetricResult objects, one per sample draw, giving the bootstrap
            confidence interval for each metric and whether the population metric lies within it.
            `metrics` is a dictionary of the population metrics.
    """
    sim_res = []
    # The population could also be passed in as an argument, so that several simulations can run on the same
    # population. A seed cannot be set for the distribution sampling here.
    pop, metrics = get_population_and_metrics(
        population_function,
        pop_size=pop_size,
        metric_functions=metric_functions)

    for _ in tqdm(range(num_sample_draws)):
        sample = get_sample(pop, sample_size=sample_size)
        res = bootstrap(sample,
                        num_iter=num_bootstraps,
                        resample_size=resample_size,
                        metrics=metric_functions)

        bs_res = _calculate_metric_for_bootstrap_sample(
            res,
            conf_interval=conf_interval,
            coverage=coverage,
            metrics=metrics)
        # for metric_name, _ in metric_functions.items():
        #     bs_metric_res = [single_res[metric_name] for single_res in res]
        #     conf = conf_interval(bs_metric_res, coverage)
        #     pop_metric_in_conf = True if conf.lower_bound <= metrics[metric_name] <= conf.upper_bound else False
        #
        #     bs_res[metric_name] = MetricResult(metric_confidence=conf, pop_metric_in_conf=pop_metric_in_conf)

        sim_res.append(bs_res)

    return sim_res, metrics
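
A minimal usage sketch, assuming `simulation` is importable from this module, that `population_function` is called with the desired population size, and that `MetricResult` exposes a `pop_metric_in_conf` field as in the commented-out block above; the normal population and all parameter values are only illustrative:

import numpy as np


def normal_population(size: int):
    # Hypothetical population generator: `size` draws from N(50, 10).
    return np.random.normal(loc=50, scale=10, size=size)


sim_res, pop_metrics = simulation(
    population_function=normal_population,
    metric_functions={"mean": np.mean},
    num_sample_draws=20,
    pop_size=10_000,
    sample_size=500,
    resample_size=500,
    num_bootstraps=200,
    coverage=0.95,
)

# Fraction of draws whose bootstrap confidence interval contains the true population mean;
# with coverage=0.95 this should land near 0.95.
hit_rate = np.mean([draw["mean"].pop_metric_in_conf for draw in sim_res])
print(hit_rate)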
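
The `conf_interval` argument is pluggable. A minimal sketch of the percentile method, assuming a `ConfidenceInterval` container with `lower_bound` and `upper_bound` fields as referenced in the commented-out block above; the project's own `confidence_interval` implementation may differ:

from typing import Any, Iterable, NamedTuple

import numpy as np


class ConfidenceInterval(NamedTuple):
    lower_bound: float
    upper_bound: float


def percentile_confidence_interval(values: Iterable[Any], coverage: float) -> ConfidenceInterval:
    # Percentile method: take the empirical quantiles of the bootstrap metric values.
    values = np.asarray(list(values), dtype=float)
    alpha = (1.0 - coverage) / 2.0
    lower, upper = np.quantile(values, [alpha, 1.0 - alpha])
    return ConfidenceInterval(lower_bound=float(lower), upper_bound=float(upper))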