示例#1
0
def _determine_quantile_bias_corrected_point_estimate(
    bootstraps: np.ndarray,
    quantile_pct: float,
    quantile_statistic_interpolation_method: str,
    sample_quantile: np.ndarray,
) -> Number:
    """Return a bootstrap point estimate for one quantile, bias-corrected when the bias is material.

    The requested quantile is computed along axis 1 of "bootstraps" (one value per row).  The mean and the
    standard deviation of those per-row quantiles serve as the bootstrap point estimate and its standard error,
    and the bias is estimated as the bootstrap point estimate minus "sample_quantile".

    Args:
        bootstraps: Array of bootstrap resamples; the quantile is reduced along axis 1.
        quantile_pct: Quantile level, passed as "q" to "numpy_quantile".
        quantile_statistic_interpolation_method: Interpolation method, passed as "method" to "numpy_quantile".
        sample_quantile: The same quantile computed on the original (non-resampled) data.

    Returns:
        The mean of the bootstrap quantiles when the magnitude of the estimated bias is at most 0.25 standard
        errors (and the standard error is positive); otherwise that mean minus the estimated bias.
    """
    bootstrap_quantiles: Union[np.ndarray, Number] = numpy_quantile(
        bootstraps,
        q=quantile_pct,
        axis=1,
        method=quantile_statistic_interpolation_method,
    )
    bootstrap_quantile_point_estimate: Number = np.mean(bootstrap_quantiles)
    bootstrap_quantile_standard_error: Number = np.std(bootstrap_quantiles)
    bootstrap_quantile_bias: Number = (
        bootstrap_quantile_point_estimate - sample_quantile
    )

    # |Bias| / Standard Error > 0.25 is a rule of thumb for when to apply bias correction.
    # See:
    # Efron, B., & Tibshirani, R. J. (1993). Estimates of bias. An Introduction to the Bootstrap (pp. 128).
    #         Springer Science and Business Media Dordrecht. DOI 10.1007/978-1-4899-4541-9
    #
    # The magnitude of the bias is what matters: comparing the signed ratio would treat every negative bias
    # (bootstrap estimate below the sample quantile) as negligible, no matter how large it is.
    # The standard-error check comes first so that the division is never evaluated for a zero standard error.
    bias_is_negligible: bool = (
        bootstrap_quantile_standard_error > 0
        and abs(bootstrap_quantile_bias) / bootstrap_quantile_standard_error
        <= 0.25
    )
    if bias_is_negligible:
        return bootstrap_quantile_point_estimate

    # NOTE(review): point_estimate - (point_estimate - sample_quantile) is algebraically "sample_quantile";
    # confirm that this is the intended form of the correction.
    return bootstrap_quantile_point_estimate - bootstrap_quantile_bias
示例#2
0
def compute_quantiles(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    quantile_statistic_interpolation_method: str,
) -> NumericRangeEstimationResult:
    """Estimate a value range directly from the sample quantiles of "metric_values".

    The range is two-sided: "false_positive_rate" is split evenly between the tails, so the lower bound is the
    (false_positive_rate / 2) quantile and the upper bound is the (1 - false_positive_rate / 2) quantile, both
    taken along axis 0.

    Args:
        metric_values: Observed values from which the quantiles and the histogram are computed.
        false_positive_rate: Total probability mass left outside the returned range.
        quantile_statistic_interpolation_method: Interpolation method, passed as "method" to "numpy_quantile".

    Returns:
        A "NumericRangeEstimationResult" holding the histogram bin counts of "metric_values" (using
        "NUM_HISTOGRAM_BINS" bins) and the [lower, upper] quantile pair as a NumPy array.
    """
    tail_probability = false_positive_rate / 2

    # Lower bound first, then upper bound; the order fixes the layout of "value_range".
    range_bounds = [
        numpy_quantile(
            a=metric_values,
            q=quantile_level,
            axis=0,
            method=quantile_statistic_interpolation_method,
        )
        for quantile_level in (tail_probability, 1.0 - tail_probability)
    ]

    # Only the bin counts are reported; the bin edges are discarded.
    bin_counts, _bin_edges = np.histogram(a=metric_values, bins=NUM_HISTOGRAM_BINS)

    return NumericRangeEstimationResult(
        estimation_histogram=bin_counts,
        value_range=np.asarray(range_bounds),
    )
示例#3
0
def compute_bootstrap_quantiles_point_estimate(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    quantile_statistic_interpolation_method: str,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> NumericRangeEstimationResult:
    """
    Estimate the lower and upper quantiles of "metric_values" using the bootstrap, with bias correction.

    ML Flow Experiment: parameter_builders_bootstrap/bootstrap_quantiles
    ML Flow Experiment ID: 4129654509298109

    An internal implementation of the "bootstrap" estimator method, returning a point estimate for a population
    parameter of interest (lower and upper quantiles in this case). See
    https://en.wikipedia.org/wiki/Bootstrapping_(statistics) for an introduction to "bootstrapping" in statistics.

    The methods implemented here can be found in:
    Efron, B., & Tibshirani, R. J. (1993). Estimates of bias. An Introduction to the Bootstrap (pp. 124-130).
        Springer Science and Business Media Dordrecht. DOI 10.1007/978-1-4899-4541-9

    This implementation is sub-par compared to the one available from the "SciPy" standard library
    ("https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html"), in that it does not handle
    multi-dimensional statistics. "scipy.stats.bootstrap" is vectorized, thus having the ability to accept a
    multi-dimensional statistic function and process all dimensions.

    Unfortunately, as of March 4th, 2022, the SciPy implementation has two issues: 1) it only returns a confidence
    interval and not a point estimate for the population parameter of interest, which is what we require for our use
    cases. 2) It cannot handle multi-dimensional statistics and correct for bias simultaneously. You must either use
    one feature or the other.

    This implementation could only be replaced by "scipy.stats.bootstrap" if Great Expectations drops support for
    Python 3.6, thereby enabling us to use a more up-to-date version of the "scipy" Python package (the currently used
    version does not have "bootstrap"). Also, as discussed above, two contributions would need to be made to the SciPy
    package to enable 1) bias correction for multi-dimensional statistics and 2) a return value of a point estimate for
    the population parameter of interest (lower and upper quantiles in this case).

    Additional future direction could include developing enhancements to bootstrapped estimator based on theory
    presented in "http://dido.econ.yale.edu/~dwka/pub/p1001.pdf":
    @article{Andrews2000a,
        added-at = {2008-04-25T10:38:44.000+0200},
        author = {Andrews, Donald W. K. and Buchinsky, Moshe},
        biburl = {https://www.bibsonomy.org/bibtex/28e2f0a58cdb95e39659921f989a17bdd/smicha},
        day = 01,
        interhash = {778746398daa9ba63bdd95391f1efd37},
        intrahash = {8e2f0a58cdb95e39659921f989a17bdd},
        journal = {Econometrica},
        keywords = {imported},
        month = Jan,
        note = {doi: 10.1111/1468-0262.00092},
        number = 1,
        pages = {23--51},
        timestamp = {2008-04-25T10:38:52.000+0200},
        title = {A Three-step Method for Choosing the Number of Bootstrap Repetitions},
        url = {http://www.blackwell-synergy.com/doi/abs/10.1111/1468-0262.00092},
        volume = 68,
        year = 2000
    }
    The article outlines a three-step minimax procedure that relies on the Central Limit Theorem (C.L.T.) along with the
    bootstrap sampling technique (see https://en.wikipedia.org/wiki/Bootstrapping_(statistics) for background) for
    computing the stopping criterion, expressed as the optimal number of bootstrap samples, needed to achieve a maximum
    probability that the value of the statistic of interest will be minimally deviating from its actual (ideal) value.

    Args:
        metric_values: NumPy array of observed values to resample from (with replacement).
        false_positive_rate: Total probability mass left outside the estimated range; it is split evenly between
            the lower and the upper tail.
        quantile_statistic_interpolation_method: Interpolation method, passed as "method" to "numpy_quantile".
        n_resamples: Number of bootstrap resamples to draw; each resample has "metric_values.size" elements.
        random_seed: When not None (including 0), seeds a dedicated PCG64-backed generator so that the resampling
            is reproducible.  When None, the resamples are drawn with "np.random.choice" (NumPy global random state).

    Returns:
        A "NumericRangeEstimationResult" holding the histogram bin counts of all bootstrap values (flattened, using
        "NUM_HISTOGRAM_BINS" bins) and the [lower, upper] bias-corrected quantile point estimates as a list.
    """
    lower_quantile_pct: float = false_positive_rate / 2
    upper_quantile_pct: float = 1.0 - false_positive_rate / 2

    # Quantiles of the original sample; they are the reference against which the bootstrap bias is measured.
    sample_lower_quantile: np.ndarray = numpy_quantile(
        a=metric_values,
        q=lower_quantile_pct,
        method=quantile_statistic_interpolation_method,
    )
    sample_upper_quantile: np.ndarray = numpy_quantile(
        a=metric_values,
        q=upper_quantile_pct,
        method=quantile_statistic_interpolation_method,
    )

    # One row per resample, each row being as large as the original sample.
    bootstraps_shape = (n_resamples, metric_values.size)

    bootstraps: np.ndarray
    # Compare against None explicitly: 0 is a valid seed and must not fall through to the unseeded branch.
    if random_seed is not None:
        random_state: np.random.Generator = np.random.Generator(
            np.random.PCG64(random_seed)
        )
        bootstraps = random_state.choice(metric_values, size=bootstraps_shape)
    else:
        bootstraps = np.random.choice(metric_values, size=bootstraps_shape)

    # Both bounds are estimated from the same set of resamples.
    lower_quantile_bias_corrected_point_estimate: Number = (
        _determine_quantile_bias_corrected_point_estimate(
            bootstraps=bootstraps,
            quantile_pct=lower_quantile_pct,
            quantile_statistic_interpolation_method=quantile_statistic_interpolation_method,
            sample_quantile=sample_lower_quantile,
        )
    )

    upper_quantile_bias_corrected_point_estimate: Number = (
        _determine_quantile_bias_corrected_point_estimate(
            bootstraps=bootstraps,
            quantile_pct=upper_quantile_pct,
            quantile_statistic_interpolation_method=quantile_statistic_interpolation_method,
            sample_quantile=sample_upper_quantile,
        )
    )

    return NumericRangeEstimationResult(
        estimation_histogram=np.histogram(
            a=bootstraps.flatten(), bins=NUM_HISTOGRAM_BINS
        )[0],
        value_range=[
            lower_quantile_bias_corrected_point_estimate,
            upper_quantile_bias_corrected_point_estimate,
        ],
    )