예제 #1
0
 def full_conditioning_rule_stopped_obs(distribution: Distribution,
                                        data: Obs,
                                        threshold: Sequence = None):
     """
     Unconditioned score adjusted with survival / cdf terms evaluated at the thresholds.
     :param distribution: object exposing `logsf` and `logcdf`.
     :param data: observations, forwarded untouched to `ConditioningMethod.no_conditioning`.
     :param threshold: sequence of thresholds; the last entry goes through `logsf`, all the others through `logcdf`.
     :return: no_conditioning(distribution, data) - logsf(threshold[-1]) + sum(logcdf(threshold[:-1]))
     :raises ValueError: if no threshold is given.
     """
     if threshold is None:
         raise ValueError("This metric requires a input threshold.")
     base_score = ConditioningMethod.no_conditioning(distribution, data)
     log_sf_last = distribution.logsf(threshold[-1])
     log_cdf_previous = np.sum(distribution.logcdf(threshold[:-1]))
     # NOTE(review): the two threshold terms enter with opposite signs
     # (-logsf, +sum logcdf) -- confirm this is the intended conditioning term.
     return base_score - log_sf_last + log_cdf_previous
예제 #2
0
def qq_l2_distance(distribution: Distribution, data: Obs):
    """
    QQ-plot-like metric: mean squared gap between the theoretical quantiles and the empirical ones,
    i.e. how far the (theoretical quantiles, empirical quantiles) curve sits from the x=y line.
    Introduced by Varty, Z., Tawn, J. A., Atkinson, P. M., & Bierman, S. (2021).
    Inference for extreme earthquake magnitudes accounting for a time-varying measurement process.
    arXiv preprint arXiv:2102.00884.
    :param distribution: object exposing `inverse_cdf`.
    :param data: observations whose empirical quantiles are computed with `np.quantile`.
    :return: mean over 100 probability levels of (theoretical quantile - empirical quantile)^2
    """
    # 100 evenly spaced probability levels, kept strictly inside (0, 1).
    probabilities = np.linspace(1e-4, 1.0 - 1e-4, 100)
    observed = np.quantile(data, probabilities)
    gap = distribution.inverse_cdf(probabilities) - observed
    return np.mean(np.square(gap))
예제 #3
0
def pp_l2_distance(distribution: Distribution, data: Obs):
    """
    PP-plot-like metric: weighted mean squared gap between the probability levels and the empirical
    quantiles of the fitted cdf evaluated on the data.
    Introduced by Varty, Z., Tawn, J. A., Atkinson, P. M., & Bierman, S. (2021).
    Inference for extreme earthquake magnitudes accounting for a time-varying measurement process.
    arXiv preprint arXiv:2102.00884.
    :param distribution: object exposing `cdf`.
    :param data: observations; must support `len`.
    :return: mean over 100 probability levels of weight * (level - empirical quantile of cdf(data))^2
    """
    # 100 evenly spaced probability levels, kept strictly inside (0, 1).
    probabilities = np.linspace(1e-4, 1.0 - 1e-4, 100)
    transformed_quantiles = np.quantile(distribution.cdf(data), probabilities)
    # NOTE(review): the sample size enters as sqrt(len(data)) *inside* the outer
    # square root, i.e. the scale is sqrt(p(1-p)) / n**0.25 -- confirm against
    # the reference paper whether sqrt(p(1-p)/n) was intended.
    spread = np.sqrt(probabilities * (1 - probabilities) / np.sqrt(len(data)))
    weights = 1 / spread
    return np.mean(weights * (probabilities - transformed_quantiles)**2)
예제 #4
0
def Brier_score(distribution: Distribution,
                data: Obs,
                threshold: float = None):
    """
    Brier score: mean squared error between the forecast exceedance probability and the observed
    exceedance indicator.
    :param distribution: object exposing `sf` (survival function).
    :param data: observations; must support elementwise `>=` and `.astype`.
    :param threshold: the tail we are interested in predicting correctly.
    :return: mean of (P(Y>=u)-1_{Y>=u})^2
    :raises ValueError: if no threshold is given.
    """
    if threshold is None:
        raise ValueError("This metric requires a input threshold.")
    forecast = distribution.sf(threshold)
    # 1.0 where the observation reaches the threshold, 0.0 elsewhere.
    exceeded = (data >= threshold).astype(float)
    squared_error = (forecast - exceeded)**2
    return np.mean(squared_error)
예제 #5
0
def crps(distribution: Distribution, data: Obs):
    r"""
    Continuous Ranked Probability Score: integrated squared distance between the forecast cumulative
    distribution function F and the empirical cumulative distribution function of the observations.
    :param distribution: object exposing `cdf`.
    :param data: observations y_1..y_n used to build the empirical step function.
    :return: \int_{-\infty}^{\infty}(F(t)-mean_i H(t, y_i))^2dt with H the Heaviside step function equal to
    0. for t<y, 1/2 for t=y and 1 for t>y.
    """
    # The docstring is a raw string: it contains LaTeX backslashes (\int, \infty)
    # that would otherwise be parsed as invalid escape sequences.
    from scipy import integrate

    def heaviside(t, y):
        return np.where(t < y, 0.0, np.where(t == y, 0.5, 1.0))

    def squared_gap(t):
        # Averaging the step functions over the observations yields the
        # empirical cdf evaluated at t.
        return (distribution.cdf(t) - np.mean(heaviside(t, data)))**2

    # quad returns (value, absolute error estimate); only the value is reported.
    value, _abs_error = integrate.quad(squared_gap, a=-np.inf, b=np.inf)
    return value
예제 #6
0
def quantile_score(distribution: Distribution,
                   data: Obs,
                   quantile: float = None):
    """
    Quantile score: asymmetric, probability-weighted penalty on the gap between the observations and
    the predicted quantile, averaged over the observations.
    :param distribution: object exposing `inverse_cdf`.
    :param data: observations compared with the predicted quantile.
    :param quantile: quantile of interest.
    :return: mean of q*(y-F^{-1}(q)) if y>=F^{-1}(q), (1-q)*(F^{-1}(q)-y) otherwise
    :raises ValueError: if the quantile is missing or outside [0, 1].
    """
    if quantile is None:
        raise ValueError("This metric requires a input quantile.")
    if (quantile < 0) or (quantile > 1):
        raise ValueError("The quantile should be between 0 and 1.")

    residuals = data - distribution.inverse_cdf(quantile)
    # Non-negative residuals are weighted by q, negative ones by (q - 1),
    # so every term of the penalty is non-negative.
    penalties = np.where(residuals >= 0,
                         residuals * quantile,
                         residuals * (quantile - 1))
    return np.mean(penalties)
예제 #7
0
def get_quantiles_and_confidence_intervals(
    fit: Distribution,
    data: Union[pd.DataFrame, np.array, pd.Series],
    ci_confidence=0.99,
):
    """
    Compute theoretical and empirical quantiles on a grid of 100 probability levels between 0.01 and
    0.99, together with a confidence interval for each theoretical quantile.
    :param fit: fitted distribution exposing `inverse_cdf`.
    :param data: observations; wrapped in a `pd.Series` to compute the empirical quantiles.
    :param ci_confidence: confidence level handed to the `Profiler` as `inference_confidence`.
    :return: (theoretical quantiles, empirical quantiles, lower bounds, upper bounds); the two bounds are
    `pd.Series` indexed by probability level.
    """
    profiler = Profiler(fit, data, inference_confidence=ci_confidence)
    probabilities = np.linspace(0.01, 0.99, 100)
    empirical = pd.Series(data).quantile(probabilities)
    theoretical = fit.inverse_cdf(probabilities)
    # One interval per level; the metric handed to the profiler is the
    # quantile at that level of whatever distribution the profiler passes in.
    intervals = [
        profiler.confidence_interval(
            lambda candidate: candidate.inverse_cdf(probability))
        for probability in probabilities
    ]
    bounds = pd.DataFrame(intervals,
                          columns=["lower_bound", "upper_bound"],
                          index=probabilities)
    return theoretical, empirical, bounds["lower_bound"], bounds["upper_bound"]
예제 #8
0
def _gpd_parameter_label(parameter, symbol, trailing_separator=True):
    """Format one parameter of the fit (e.g. `gpd_fit.loc`) as LaTeX-flavoured text for the figure title."""
    value = parameter()
    # Exact type comparison (not isinstance) is kept on purpose: subclasses of
    # Parameter are described through their `param_dict`.
    if type(value) is not Parameter:
        entries = {
            r"$\{}_{}$".format(symbol, name):
            round(parameter.param_dict[name], 2)
            for name in parameter.param_dict
        }
        return "".join(f"{label} = {rounded}, "
                       for label, rounded in entries.items())
    text = r"$\{}$=".format(symbol) + str(round(value, 2))
    return text + ", " if trailing_separator else text


def qq_plot_gpd(
    data: pd.DataFrame,
    gpd_fit: Distribution,
    path_to_figure: str,
    threshold: Union[List, float, int, str] = "",
    ci_confidence=0.99,
    figure_name="qq_plot_gpd",
):
    """
    Draw the QQ plot of the data against a fitted GPD, save it as a PNG and return the plotted values.
    :param data: observations (exceedances) compared with the fit.
    :param gpd_fit: fitted distribution exposing `loc`, `scale`, `shape`, `params` and `flattened_params`.
    :param path_to_figure: directory in which `<figure_name>.png` is written.
    :param threshold: threshold(s) shown in the title. A string is displayed as is, any other sized object
    is displayed as a tuple, anything else through `str`. An empty string omits the threshold from the title.
    :param ci_confidence: confidence level of the plotted confidence band.
    :param figure_name: file name (without extension) of the saved figure.
    :return: DataFrame with columns "realized", "theoretical", "lower_bound" and "upper_bound".
    """
    # When the two parameter collections have the same length, use the regular
    # confidence-interval routine; otherwise fall back to the uniform-scale one.
    id_obs = len(gpd_fit.flattened_params) == len(gpd_fit.params)
    if id_obs:
        compute_quantiles = get_quantiles_and_confidence_intervals
    else:
        compute_quantiles = get_quantiles_and_confidence_intervals_uniform_scale
    theoretical, empirical, lower_bound, upper_bound = compute_quantiles(
        gpd_fit, data, ci_confidence)
    n = len(data)

    # One title line per parameter; only the very last separator is trimmed.
    text_title = (_gpd_parameter_label(gpd_fit.loc, "mu") + "\n" +
                  _gpd_parameter_label(gpd_fit.scale, "sigma") + "\n" +
                  _gpd_parameter_label(
                      gpd_fit.shape, "xi", trailing_separator=False))
    if text_title.endswith(", "):
        text_title = text_title[:-2]

    # A str has __len__ too: handle it first so it is not exploded into a
    # tuple of characters (the default "" used to be rendered as "()").
    if isinstance(threshold, str):
        threshold_text = threshold
    elif hasattr(threshold, "__len__"):
        threshold_text = str(tuple(threshold))
    else:
        threshold_text = str(threshold)
    threshold_clause = (f" over threshold {threshold_text}"
                        if threshold_text else "")

    plt.scatter(theoretical, empirical, s=5, marker="x", color="navy")
    plt.plot(theoretical, theoretical, label="$x=y$", color="navy")
    if id_obs:
        # Band drawn horizontally, between the bounds at each empirical quantile.
        plt.fill_betweenx(y=empirical,
                          x1=lower_bound,
                          x2=upper_bound,
                          alpha=0.2,
                          color="navy")
    else:
        # Band drawn vertically, between the bounds at each theoretical quantile.
        plt.fill_between(theoretical,
                         lower_bound,
                         upper_bound,
                         alpha=0.2,
                         color="navy")
    plt.legend()
    plt.title("QQ Plot of Exceedances" + threshold_clause +
              " vs GPD distribution with parameters:\n" + text_title)
    plt.xlabel(f"Theoretical quantiles ({n} observations)")
    plt.ylabel("Empirical quantiles")
    plt.tight_layout()
    plt.savefig(f"{path_to_figure}/{figure_name}.png")
    # Clear the current figure so later plots start from a blank canvas.
    plt.clf()
    to_concat = pd.DataFrame(
        [theoretical, lower_bound, upper_bound],
        columns=empirical.index,
        index=["theoretical", "lower_bound", "upper_bound"],
    ).T
    return pd.concat([empirical.rename("realized"), to_concat], axis=1)
예제 #9
0
def log_likelihood(distribution: Distribution, data: Obs):
    """
    Log-likelihood of the data under the distribution.
    :param distribution: object exposing `logpdf`.
    :param data: observations.
    :return: sum of the log-densities of the observations.
    """
    log_densities = distribution.logpdf(data)
    return np.sum(log_densities)
예제 #10
0
def likelihood(distribution: Distribution, data: Obs):
    """
    Likelihood of the data under the distribution.
    :param distribution: object exposing `pdf`.
    :param data: observations.
    :return: product of the densities of the observations.
    """
    densities = distribution.pdf(data)
    return np.prod(densities)
예제 #11
0
 def excluding_last_obs_rule(distribution: Distribution, data: Obs):
     """
     Unconditioned score with the log-density of the last observation removed.
     :param distribution: object exposing `logpdf`.
     :param data: observations; the last one is read through `.iloc[-1]`.
     :return: no_conditioning(distribution, data) - logpdf(last observation)
     """
     full_score = ConditioningMethod.no_conditioning(distribution, data)
     last_log_density = distribution.logpdf(data.iloc[-1])
     return full_score - last_log_density
예제 #12
0
 def full_conditioning_rule_stopped_obs(data: pd.Series,
                                        distribution: Distribution,
                                        threshold: Sequence = None):
     """
     Conditioning term built from the thresholds only.
     :param data: observations; not used by this rule, kept for a uniform signature.
     :param distribution: object exposing `logsf` and `logcdf`.
     :param threshold: sequence of thresholds; the last entry goes through `logsf`, all the others through `logcdf`.
     :return: logsf(threshold[-1]) + sum(logcdf(threshold[:-1]))
     :raises ValueError: if no threshold is given.
     """
     # Fail early with an explicit message instead of the opaque TypeError
     # raised when indexing None.
     if threshold is None:
         raise ValueError("This metric requires a input threshold.")
     log_sf_last = distribution.logsf(threshold[-1])
     log_cdf_previous = np.sum(distribution.logcdf(threshold[:-1]))
     return log_sf_last + log_cdf_previous
예제 #13
0
 def partial_conditioning_rule_stopped_obs(data: pd.Series,
                                           distribution: Distribution,
                                           threshold: Sequence = None):
     """
     Conditioning term built from the last threshold only.
     :param data: observations; not used by this rule, kept for a uniform signature.
     :param distribution: object exposing `logsf`.
     :param threshold: sequence of thresholds; only the last entry is used.
     :return: logsf(threshold[-1])
     :raises ValueError: if no threshold is given.
     """
     # Fail early with an explicit message instead of the opaque TypeError
     # raised when indexing None.
     if threshold is None:
         raise ValueError("This metric requires a input threshold.")
     return distribution.logsf(threshold[-1])
예제 #14
0
 def excluding_last_obs_rule(data: pd.Series, distribution: Distribution):
     """
     Term associated with the last observation.
     :param data: observations; either an object carrying an `rclass` attribute or one supporting `.iloc`.
     :param distribution: object exposing `logpdf`.
     :return: `FloatVector(data[-1])` when `data` has an `rclass` attribute, logpdf(data.iloc[-1]) otherwise.
     """
     # NOTE(review): the `rclass` branch wraps the last observation itself, not
     # its log-density -- confirm this asymmetry is intended.
     if hasattr(data, "rclass"):
         return FloatVector(data[-1])
     last_observation = data.iloc[-1]
     return distribution.logpdf(last_observation)