def full_conditioning_rule_stopped_obs(distribution: Distribution,
                                       data: Obs,
                                       threshold: Sequence = None):
    """
    Combine the unconditioned score with terms driven by the thresholds.

    The returned value is
    ``ConditioningMethod.no_conditioning(distribution, data)``, minus the
    log-survival function at the last threshold, plus the sum of the log-cdf
    over every threshold except the last.

    :param distribution: object exposing ``logsf`` and ``logcdf``.
    :param data: observations, forwarded untouched to
        ``ConditioningMethod.no_conditioning``.
    :param threshold: sequence of thresholds; the last entry is handled
        separately from the preceding ones.
    :return: the combined score.
    :raises ValueError: if ``threshold`` is ``None``.
    """
    if threshold is None:
        raise ValueError("This metric requires a input threshold.")

    unconditioned = ConditioningMethod.no_conditioning(distribution, data)
    last_threshold_term = distribution.logsf(threshold[-1])
    earlier_thresholds_term = np.sum(distribution.logcdf(threshold[:-1]))
    return unconditioned - last_threshold_term + earlier_thresholds_term
def qq_l2_distance(distribution: Distribution, data: Obs):
    """
    QQ-plot-like metric: mean squared gap between the theoretical and the
    empirical quantiles, i.e. how far the (theoretical quantiles, empirical
    quantiles) curve sits from the x=y line.

    The comparison uses 100 evenly spaced probability levels between 1e-4
    and 1 - 1e-4.

    Introduced by Varty, Z., Tawn, J. A., Atkinson, P. M., & Bierman, S.
    (2021). Inference for extreme earthquake magnitudes accounting for a
    time-varying measurement process. arXiv preprint arXiv:2102.00884.

    :param distribution: object exposing ``inverse_cdf``.
    :param data: observations whose empirical quantiles are compared.
    :return: mean of the squared quantile differences.
    """
    probabilities = np.linspace(1e-4, 1.0 - 1e-4, 100)
    sample_quantiles = np.quantile(data, probabilities)
    model_quantiles = distribution.inverse_cdf(probabilities)
    gaps = model_quantiles - sample_quantiles
    return np.mean(gaps**2)
def pp_l2_distance(distribution: Distribution, data: Obs):
    """
    PP-plot-like metric: weighted mean squared gap between the probability
    levels and the empirical quantiles of the fitted cdf evaluated on the
    data, i.e. how far the (theoretical cdf, empirical cdf) curve sits from
    the x=y line.

    The comparison uses 100 evenly spaced probability levels ``p`` between
    1e-4 and 1 - 1e-4; each squared gap is weighted by
    ``1 / sqrt(p * (1 - p) / sqrt(len(data)))``.

    Introduced by Varty, Z., Tawn, J. A., Atkinson, P. M., & Bierman, S.
    (2021). Inference for extreme earthquake magnitudes accounting for a
    time-varying measurement process. arXiv preprint arXiv:2102.00884.

    :param distribution: object exposing ``cdf``.
    :param data: observations pushed through the fitted cdf.
    :return: weighted mean of the squared differences.
    """
    probabilities = np.linspace(1e-4, 1.0 - 1e-4, 100)
    fitted_cdf_values = distribution.cdf(data)
    empirical_cdf = np.quantile(fitted_cdf_values, probabilities)
    # NOTE(review): the sample size enters through a nested square root;
    # confirm against the reference that this is the intended weighting.
    spread = np.sqrt(probabilities * (1 - probabilities) / np.sqrt(len(data)))
    weights = 1 / spread
    return np.mean(weights * (probabilities - empirical_cdf)**2)
def Brier_score(distribution: Distribution,
                data: Obs,
                threshold: float = None):
    """
    Brier score: mean squared error between the forecast probability of
    reaching the threshold and the observed indicator of that event.

    :param distribution: object exposing ``sf`` (survival function).
    :param data: observations; must support ``>=`` comparison returning an
        object with ``astype``.
    :param threshold: the tail we are interested in predicting correctly.
    :return: mean of (P(Y>=u)-1_{Y>=u})^2
    :raises ValueError: if ``threshold`` is ``None``.
    """
    if threshold is None:
        raise ValueError("This metric requires a input threshold.")

    forecast_probability = distribution.sf(threshold)
    exceeded = (data >= threshold).astype(float)
    squared_errors = (forecast_probability - exceeded)**2
    return np.mean(squared_errors)
def crps(distribution: Distribution, data: Obs):
    r"""
    Continuous Ranked Probability Score: evaluates the proximity between the
    forecast cumulative distribution function F and the empirical one built
    from the observations.

    The docstring is a raw string so that the LaTeX backslashes below are not
    parsed as (invalid) escape sequences.

    :param distribution: object exposing ``cdf``, callable on a scalar.
    :param data: observations y_1, ..., y_n.
    :return: \int_{-\infty}^{\infty}(F(t)-\frac{1}{n}\sum_i H(t-y_i))^2 dt
        with H the Heaviside function, equal to 0 for t<y, 1/2 for t=y and
        1 for t>y, evaluated numerically with ``scipy.integrate.quad``.
    """
    from scipy import integrate

    def heaviside(t, y):
        # Step function with the half-value convention at the jump.
        return np.where(t < y, 0.0, np.where(t == y, 0.5, 1.0))

    def squared_gap(t):
        # Averaging the steps over the observations yields the empirical cdf.
        return (distribution.cdf(t) - np.mean(heaviside(t, data)))**2

    integral = integrate.quad(squared_gap, a=-np.inf, b=np.inf)
    # quad returns (value, error estimate); only the value is reported.
    return integral[0]
def quantile_score(distribution: Distribution,
                   data: Obs,
                   quantile: float = None):
    """
    Quantile score: probability weighted score evaluating the difference
    between the observations and the predicted quantile, averaged over the
    data.

    :param distribution: object exposing ``inverse_cdf``.
    :param data: observations compared to the predicted quantile.
    :param quantile: quantile of interest.
    :return: mean over the data of q*(y-F^{-1}(q)) if y>=F^{-1}(q),
        (1-q)*(F^{-1}(q)-y) otherwise
    :raises ValueError: if ``quantile`` is ``None`` or lies outside [0, 1].
    """
    if quantile is None:
        raise ValueError("This metric requires a input quantile.")
    if quantile < 0 or quantile > 1:
        raise ValueError("The quantile should be between 0 and 1.")

    residuals = data - distribution.inverse_cdf(quantile)
    # Asymmetric weighting of the residuals around the predicted quantile.
    weighted = np.where(residuals >= 0,
                        residuals * quantile,
                        residuals * (quantile - 1))
    return np.mean(weighted)
def get_quantiles_and_confidence_intervals(
    fit: Distribution,
    data: Union[pd.DataFrame, np.array, pd.Series],
    ci_confidence=0.99,
):
    """
    Compute theoretical and empirical quantiles together with a confidence
    interval for each theoretical quantile.

    Quantiles are evaluated at 100 evenly spaced levels between 0.01 and
    0.99. For every level, a ``Profiler`` built from the fit and the data is
    asked for the confidence interval of ``inverse_cdf(level)``.

    :param fit: fitted distribution exposing ``inverse_cdf``.
    :param data: observations; wrapped in a ``pd.Series`` to compute the
        empirical quantiles.
    :param ci_confidence: passed to ``Profiler`` as ``inference_confidence``.
    :return: tuple ``(theoretical, empirical, lower_bound, upper_bound)``;
        the two bounds are Series indexed by level. Each interval returned
        by the profiler is expected to be a (lower, upper) pair.
    """
    profiler = Profiler(fit, data, inference_confidence=ci_confidence)
    levels = np.linspace(0.01, 0.99, 100)
    empirical = pd.Series(data).quantile(levels)
    theoretical = fit.inverse_cdf(levels)

    def quantile_interval(level):
        # The metric handed to the profiler is the quantile at this level.
        return profiler.confidence_interval(
            lambda candidate: candidate.inverse_cdf(level))

    bounds = pd.DataFrame(
        [quantile_interval(level) for level in levels],
        columns=["lower_bound", "upper_bound"],
        index=levels,
    )
    return theoretical, empirical, bounds["lower_bound"], bounds["upper_bound"]
def qq_plot_gpd(
    data: pd.DataFrame,
    gpd_fit: Distribution,
    path_to_figure: str,
    threshold: Union[List, float, int, str] = "",
    ci_confidence=0.99,
    figure_name="qq_plot_gpd",
):
    """
    Draw a QQ plot of the data against a fitted GPD, save it as a PNG and
    return the plotted values.

    The title lists the loc, scale and shape parameters of the fit. The
    figure is written to ``{path_to_figure}/{figure_name}.png`` and the
    current matplotlib figure is cleared afterwards.

    :param data: observations (exceedances) to compare with the fit.
    :param gpd_fit: fitted distribution exposing ``loc``, ``scale``,
        ``shape``, ``params`` and ``flattened_params``.
    :param path_to_figure: directory in which the figure is saved.
    :param threshold: threshold shown in the title; anything with a
        ``__len__`` is displayed as a tuple.
    :param ci_confidence: confidence level forwarded to the quantile helper.
    :param figure_name: file name of the figure, without extension.
    :return: DataFrame with columns ``realized``, ``theoretical``,
        ``lower_bound`` and ``upper_bound``.
    """

    def describe(parameter, indexed_label, plain_label, plain_suffix):
        # Title fragment for one parameter: either one "label = value, "
        # entry per item of its param_dict, or a single rounded value.
        if type(parameter()) is not Parameter:
            rounded = {
                indexed_label.format(name): round(parameter.param_dict[name], 2)
                for name in parameter.param_dict
            }
            return "".join(f"{label} = {value}, "
                           for label, value in rounded.items())
        return plain_label + str(round(parameter(), 2)) + plain_suffix

    # Selects which quantile helper is used and how the band is drawn.
    id_obs = len(gpd_fit.flattened_params) == len(gpd_fit.params)
    if id_obs:
        quantile_values = get_quantiles_and_confidence_intervals(
            gpd_fit, data, ci_confidence)
    else:
        quantile_values = get_quantiles_and_confidence_intervals_uniform_scale(
            gpd_fit, data, ci_confidence)
    theoretical, empirical, lower_bound, upper_bound = quantile_values
    n = len(data)

    text_title = (describe(gpd_fit.loc, r"$\mu_{}$", r"$\mu$=", ", ") + "\n" +
                  describe(gpd_fit.scale, r"$\sigma_{}$", r"$\sigma$=", ", ") +
                  "\n" + describe(gpd_fit.shape, r"$\xi_{}$", r"$\xi$=", ""))
    if text_title.endswith(", "):
        text_title = text_title[:-2]

    if hasattr(threshold, "__len__"):
        threshold_text = str(tuple(threshold))
    else:
        threshold_text = str(threshold)

    plt.scatter(theoretical, empirical, s=5, marker="x", color="navy")
    plt.plot(theoretical, theoretical, label="$x=y$", color="navy")
    if id_obs:
        # Band spans the bounds horizontally, at the empirical quantiles.
        plt.fill_betweenx(y=empirical,
                          x1=lower_bound,
                          x2=upper_bound,
                          alpha=0.2,
                          color="navy")
    else:
        # Band spans the bounds vertically, at the theoretical quantiles.
        plt.fill_between(theoretical,
                         lower_bound,
                         upper_bound,
                         alpha=0.2,
                         color="navy")
    plt.legend()
    plt.title(f"QQ Plot of Exceedances over threshold {threshold_text}"
              f" vs GPD distribution with parameters:\n{text_title}")
    plt.xlabel(f"Theoretical quantiles ({n} observations)")
    plt.ylabel("Empirical quantiles")
    plt.tight_layout()
    plt.savefig(f"{path_to_figure}/{figure_name}.png")
    plt.clf()

    to_concat = pd.DataFrame(
        [theoretical, lower_bound, upper_bound],
        columns=empirical.index,
        index=["theoretical", "lower_bound", "upper_bound"],
    ).T
    realized = empirical.rename("realized")
    return pd.concat([realized, to_concat], axis=1)
def log_likelihood(distribution: Distribution, data: Obs):
    """
    Log-likelihood of the data: sum of the log-densities of the observations.

    :param distribution: object exposing ``logpdf``.
    :param data: observations at which the log-density is evaluated.
    :return: ``np.sum`` of ``distribution.logpdf(data)``.
    """
    log_densities = distribution.logpdf(data)
    return np.sum(log_densities)
def likelihood(distribution: Distribution, data: Obs):
    """
    Likelihood of the data: product of the densities of the observations.

    :param distribution: object exposing ``pdf``.
    :param data: observations at which the density is evaluated.
    :return: ``np.prod`` of ``distribution.pdf(data)``.
    """
    densities = distribution.pdf(data)
    return np.prod(densities)
def excluding_last_obs_rule(distribution: Distribution, data: Obs):
    """
    Unconditioned score with the contribution of the last observation
    removed.

    :param distribution: object exposing ``logpdf``.
    :param data: observations; the last one is read through ``.iloc[-1]``.
    :return: ``ConditioningMethod.no_conditioning(distribution, data)`` minus
        the log-density of the last observation.
    """
    unconditioned = ConditioningMethod.no_conditioning(distribution, data)
    last_observation = data.iloc[-1]
    return unconditioned - distribution.logpdf(last_observation)
def full_conditioning_rule_stopped_obs(data: pd.Series,
                                       distribution: Distribution,
                                       threshold: Sequence = None):
    """
    Conditioning term built from every threshold: log-survival function at
    the last threshold plus the sum of the log-cdf over all earlier ones.

    :param data: unused here; kept so the signature matches the other rules.
    :param distribution: object exposing ``logsf`` and ``logcdf``.
    :param threshold: sequence of thresholds; the last entry is handled
        separately from the preceding ones.
    :return: ``logsf(threshold[-1]) + sum(logcdf(threshold[:-1]))``.
    :raises ValueError: if ``threshold`` is ``None``.
    """
    # The default is None, which cannot be indexed: fail with an explicit
    # message instead of an opaque TypeError.
    if threshold is None:
        raise ValueError("This conditioning rule requires an input threshold.")

    last_threshold_term = distribution.logsf(threshold[-1])
    earlier_thresholds_term = np.sum(distribution.logcdf(threshold[:-1]))
    return last_threshold_term + earlier_thresholds_term
def partial_conditioning_rule_stopped_obs(data: pd.Series,
                                          distribution: Distribution,
                                          threshold: Sequence = None):
    """
    Conditioning term built from the last threshold only: the log-survival
    function evaluated at that threshold.

    :param data: unused here; kept so the signature matches the other rules.
    :param distribution: object exposing ``logsf``.
    :param threshold: sequence of thresholds; only the last entry is used.
    :return: ``distribution.logsf(threshold[-1])``.
    :raises ValueError: if ``threshold`` is ``None``.
    """
    # The default is None, which cannot be indexed: fail with an explicit
    # message instead of an opaque TypeError.
    if threshold is None:
        raise ValueError("This conditioning rule requires an input threshold.")

    return distribution.logsf(threshold[-1])
def excluding_last_obs_rule(data: pd.Series, distribution: Distribution):
    """
    Term associated with the last observation.

    For regular data this is the log-density of the last observation. When
    the data carries an ``rclass`` attribute (presumably an R object exposed
    through rpy2 -- confirm against callers), the last element is wrapped in
    a ``FloatVector`` and returned as is, without evaluating the density.

    :param data: observations; read through ``.iloc[-1]`` unless it has an
        ``rclass`` attribute, in which case ``data[-1]`` is used.
    :param distribution: object exposing ``logpdf``.
    :return: ``distribution.logpdf`` of the last observation, or
        ``FloatVector(data[-1])`` for objects with an ``rclass`` attribute.
    """
    is_r_object = hasattr(data, "rclass")
    if not is_r_object:
        return distribution.logpdf(data.iloc[-1])
    return FloatVector(data[-1])