def eva_model_pot(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with peaks-over-threshold extremes extracted.

    Parameters
    ----------
    battery_wl_preprocessed
        Data handed to ``EVA`` as its ``data`` argument.

    Returns
    -------
    EVA
        Model on which ``get_extremes`` has been called with method "POT",
        high extremes, threshold 1.35 and declustering window "24H".
    """
    pot_settings = {
        "method": "POT",
        "extremes_type": "high",
        "threshold": 1.35,
        "r": "24H",
    }
    model = EVA(data=battery_wl_preprocessed)
    model.get_extremes(**pot_settings)
    return model
def eva_model_bm(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with block-maxima extremes extracted.

    Parameters
    ----------
    battery_wl_preprocessed
        Data handed to ``EVA`` as its ``data`` argument.

    Returns
    -------
    EVA
        Model on which ``get_extremes`` has been called with method "BM",
        high extremes, block size "365.2425D" and ``errors="raise"``.
    """
    bm_settings = {
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "raise",
    }
    model = EVA(data=battery_wl_preprocessed)
    model.get_extremes(**bm_settings)
    return model
def _calculate_return_value(
    args: typing.Tuple[
        pd.Series,  # ts (time series)
        float,  # return_period
        typing.Union[str, pd.Timedelta],  # return_period_size
        float,  # threshold
        typing.Union[str, pd.Timedelta],  # r
        str,  # extremes_type
        typing.Union[str, scipy.stats.rv_continuous],  # distribution
        str,  # distribution_name
        typing.Optional[float],  # alpha
        int,  # n_samples
    ],
) -> typing.Dict[str, typing.Union[str, typing.Optional[float]]]:
    """
    Fit a POT model for one threshold/distribution pair and get a return value.

    All inputs arrive packed in a single tuple so the function can be called
    with one positional argument.

    Parameters
    ----------
    args : tuple
        In order: ``ts``, ``return_period``, ``return_period_size``,
        ``threshold``, ``r``, ``extremes_type``, ``distribution``,
        ``distribution_name``, ``alpha``, ``n_samples``.

    Returns
    -------
    dict
        Keys "distribution_name" and "threshold" (echoed from the input) and
        "rv", "cil", "ciu" as returned by ``EVA.get_return_value``.
    """
    (
        ts,
        return_period,
        return_period_size,
        threshold,
        r,
        extremes_type,
        distribution,
        distribution_name,
        alpha,
        n_samples,
    ) = args

    # Extract extremes and fit the requested distribution
    model = EVA(data=ts)
    model.get_extremes(
        method="POT",
        extremes_type=extremes_type,
        threshold=threshold,
        r=r,
    )
    model.fit_model(model="MLE", distribution=distribution)

    # Arguments shared by every `get_return_value` call below
    return_value_kwargs = {
        "return_period": return_period,
        "return_period_size": return_period_size,
        "alpha": alpha,
    }

    # TODO - this is a hack to avoid spawning nested subprocesses
    # The sample count is raised in steps of 10 until it reaches `n_samples`.
    # NOTE(review): presumably each call then only has a few new samples to
    # draw - confirm against EVA.get_return_value.
    staged_samples = n_samples % 10
    while staged_samples < n_samples:
        staged_samples += 10
        model.get_return_value(n_samples=staged_samples, **return_value_kwargs)

    rv, cil, ciu = model.get_return_value(
        n_samples=n_samples,
        **return_value_kwargs,
    )

    return {
        "distribution_name": distribution_name,
        "threshold": threshold,
        "rv": rv,
        "cil": cil,
        "ciu": ciu,
    }
def test_from_extremes(self):
    """Check the keyword defaults that ``EVA.from_extremes`` derives for BM and POT."""

    def make_extremes(series_index):
        # Values 0..99 on the given index
        return pd.Series(
            data=np.arange(100),
            index=series_index,
            name="water level [m]",
        )

    index = pd.date_range(start="2000", end="2050", periods=100)
    eva_model = EVA.from_extremes(
        extremes=make_extremes(index),
        method="BM",
        extremes_type="high",
    )
    bm_kwargs = eva_model.extremes_kwargs
    assert eva_model.extremes_method == "BM"
    assert eva_model.extremes_type == "high"
    assert bm_kwargs["errors"] == "ignore"
    assert bm_kwargs["min_last_block"] is None

    # Expected block size is the mean spacing between consecutive extremes
    index_span = index.max() - index.min()
    expected_block_size = (index_span / (len(index) - 1)).to_numpy().astype(float)
    actual_block_size = bm_kwargs["block_size"].to_numpy().astype(float)
    assert np.isclose(expected_block_size, actual_block_size, rtol=0, atol=1e-6)

    # Test default POT arguments
    for extremes_type, expected_threshold in (("high", 0), ("low", 99)):
        eva_model = EVA.from_extremes(
            extremes=make_extremes(
                pd.date_range(start="2000", end="2050", periods=100)
            ),
            method="POT",
            extremes_type=extremes_type,
        )
        assert np.isclose(
            eva_model.extremes_kwargs["threshold"],
            expected_threshold,
            rtol=0,
            atol=1e-6,
        )
        assert eva_model.extremes_kwargs["r"] == pd.to_timedelta("24H")
def eva_model_bm_emcee(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with block-maxima extremes and an "Emcee" fit.

    Parameters
    ----------
    battery_wl_preprocessed
        Data handed to ``EVA`` as its ``data`` argument.

    Returns
    -------
    EVA
        Model with "BM" extremes (high, block size "365.2425D",
        ``errors="raise"``) on which ``fit_model`` has been called with
        "Emcee", 10 walkers and 100 samples.
    """
    bm_settings = {
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "raise",
    }
    model = EVA(data=battery_wl_preprocessed)
    model.get_extremes(**bm_settings)
    model.fit_model("Emcee", n_walkers=10, n_samples=100)
    return model
def eva_model_bm_mle(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with block-maxima extremes and an "MLE" fit.

    Parameters
    ----------
    battery_wl_preprocessed
        Data handed to ``EVA`` as its ``data`` argument.

    Returns
    -------
    EVA
        Model with "BM" extremes (high, block size "365.2425D",
        ``errors="raise"``) on which ``fit_model("MLE")`` has been called.
    """
    eva_model = EVA(data=battery_wl_preprocessed)
    eva_model.get_extremes(
        method="BM",
        extremes_type="high",
        # Explicit day count instead of "1Y": year-unit timedelta strings are
        # ambiguous and rejected by current pandas timedelta parsing, and
        # 365.2425 days is the length pandas historically assigned to "Y".
        # This also matches the block size used by the other BM model
        # builders in this file.
        block_size="365.2425D",
        errors="raise",
    )
    eva_model.fit_model("MLE")
    return eva_model
def test_init_errors(self):
    """Check the errors and warnings raised by the ``EVA`` constructor for bad ``data``."""

    def year_index(*years):
        return pd.DatetimeIndex(list(years))

    # `data` is not a Series
    with pytest.raises(
        TypeError, match=r"invalid type.*`data` argument.*pandas.Series"
    ):
        EVA(data=1)

    # Numeric strings trigger a warning and end up as numbers
    with pytest.warns(RuntimeWarning, match=r"`data`.*not numeric.*converting"):
        numeric_strings = pd.Series(
            data=["1", "2", "3"],
            index=year_index("2020", "2021", "2022"),
        )
        eva_model = EVA(data=numeric_strings)
    assert np.allclose(eva_model.data.values, [1, 2, 3])

    # Non-numeric strings trigger the same warning and then a TypeError
    with pytest.warns(
        RuntimeWarning, match=r"`data`.*not numeric.*converting"
    ), pytest.raises(TypeError, match=r"invalid dtype.*`data` argument.*numeric"):
        letters = pd.Series(
            data=["a", "b", "c"],
            index=year_index("2020", "2021", "2022"),
        )
        EVA(data=letters)

    # Index is not date-time
    with pytest.raises(TypeError, match=r"index of `data`.*date-time.*not"):
        EVA(data=pd.Series(data=[1, 2, 3], index=["2020", "2021", "2022"]))

    # Unsorted index triggers a warning and ends up sorted
    with pytest.warns(RuntimeWarning, match=r"index is not sorted.*sorting"):
        descending = pd.Series(
            data=[1, 2, 3],
            index=year_index("2022", "2021", "2020"),
        )
        eva_model = EVA(data=descending)
    assert np.allclose(eva_model.data.index.year.values, [2020, 2021, 2022])

    # Null values trigger a warning and are dropped
    with pytest.warns(
        RuntimeWarning, match=r"Null values found.*removing invalid"
    ):
        with_gap = pd.Series(
            data=[1, 2, np.nan, 3],
            index=year_index("2020", "2021", "2022", "2023"),
        )
        eva_model = EVA(data=with_gap)
    assert np.allclose(eva_model.data.values, [1, 2, 3])
    assert np.allclose(eva_model.data.index.year.values, [2020, 2021, 2023])
def eva_model(battery_wl_preprocessed) -> EVA:
    """
    Build a bare EVA model with no extremes extracted and no model fitted.

    Parameters
    ----------
    battery_wl_preprocessed
        Data handed to ``EVA`` as its ``data`` argument.

    Returns
    -------
    EVA
        Freshly constructed model.
    """
    model = EVA(data=battery_wl_preprocessed)
    return model
def test_set_extremes_errors(self):
    """Check that ``EVA.set_extremes`` rejects invalid extremes and keyword arguments."""
    eva_model = EVA(
        data=pd.Series(
            data=np.arange(100),
            index=pd.date_range(start="2000", end="2050", periods=100),
            name="water level [m]",
        )
    )

    def year_index(*years):
        return pd.DatetimeIndex(list(years))

    def matching_extremes():
        # Three yearly values named after the model's data; used by every
        # keyword-argument case below
        return pd.Series(
            data=[1, 2, 3],
            index=year_index("2020", "2021", "2022"),
            name=eva_model.data.name,
        )

    # Test invalid `extremes`
    # Each entry: expected exception, message pattern, factory for `extremes`.
    # Factories are invoked inside the `pytest.raises` block.
    invalid_extremes_cases = (
        (
            TypeError,
            r"invalid type.*must be pandas.Series",
            lambda: [1, 2, 3],
        ),
        (
            TypeError,
            r"invalid index.*must be date-time",
            lambda: pd.Series(data=[1, 2, 3], index=[1, 2, 3]),
        ),
        (
            TypeError,
            r"`extremes` must have numeric values",
            lambda: pd.Series(
                data=["a", "b", "c"],
                index=year_index("2020", "2021", "2022"),
            ),
        ),
        (
            ValueError,
            "name doesn't match",
            lambda: pd.Series(
                data=[1, 2, 3],
                index=year_index("2020", "2021", "2022"),
                name="different name",
            ),
        ),
        (
            ValueError,
            ".+time range must fit within.+",
            lambda: pd.Series(
                data=[1, 2, 3],
                index=year_index("1990", "2021", "2022"),
            ),
        ),
    )
    for error_type, pattern, build_extremes in invalid_extremes_cases:
        with pytest.raises(error_type, match=pattern):
            eva_model.set_extremes(build_extremes())

    # Each entry: expected exception, message pattern, keyword arguments
    invalid_kwargs_cases = (
        # Test invalid general kwargs
        (
            ValueError,
            r"`method` must be either.+",
            {"method": "wrong method"},
        ),
        (
            ValueError,
            r"`extremes_type` must be either.+",
            {"method": "BM", "extremes_type": "wrong type"},
        ),
        # Test invalid BM kwargs
        (
            ValueError,
            r"`block_size` must be a positive.+",
            {"method": "BM", "extremes_type": "high", "block_size": "-1D"},
        ),
        (
            ValueError,
            r"invalid value.+`errors` argument",
            {"method": "BM", "extremes_type": "high", "errors": "wrong errors"},
        ),
        (
            ValueError,
            r"`min_last_block` must be a number.+",
            {"method": "BM", "extremes_type": "high", "min_last_block": 2.0},
        ),
        # Test invalid POT kwargs
        (
            ValueError,
            r"invalid `threshold` value",
            {"method": "POT", "extremes_type": "high", "threshold": 2},
        ),
        (
            ValueError,
            r"`r` must be a positive.+",
            {"method": "POT", "extremes_type": "high", "r": "-1D"},
        ),
        # Test unrecognized arguments
        (
            TypeError,
            r"unrecognized arguments.+",
            {"method": "BM", "extremes_type": "high", "unrecognized_argument": 1},
        ),
    )
    for error_type, pattern, bad_kwargs in invalid_kwargs_cases:
        with pytest.raises(error_type, match=pattern):
            eva_model.set_extremes(matching_extremes(), **bad_kwargs)
def plot_aic_scores(
    ts: pd.Series,
    thresholds=None,
    r: typing.Union[str, pd.Timedelta] = "24H",
    extremes_type: str = "high",
    distributions: typing.Optional[
        typing.List[typing.Union[str, scipy.stats.rv_continuous]]
    ] = None,
    ax: typing.Optional[plt.Axes] = None,
    figsize: tuple = (8, 5),
) -> plt.Axes:
    """
    Plot AIC scores for each distribution and threshold.

    Used to investigate which distribution better explains data variance for each
    threshold value. Does NOT indicate which threshold value is better because
    it will always have the same shape - logarithmic curve.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    thresholds : array-like, optional
        An array of thresholds for which the AIC plot is plotted.
        If None (default), plots AIC for 100 equally-spaced thresholds
        between 90th (10th if extremes_type='low') percentile
        and 10th largest (smallest if extremes_type='low') value in the series.
    r : pandas.Timedelta or value convertible to timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
        See pandas.to_timedelta for more information.
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    distributions : list, optional
        List of distributions for which the AIC curves are plotted.
        By default these are "genpareto" and "expon".
        A distribution must be either a name of distribution from scipy.stats
        or a subclass of scipy.stats.rv_continuous.
        See https://docs.scipy.org/doc/scipy/reference/stats.html
    ax : matplotlib.axes._axes.Axes, optional
        If provided, then the plot is drawn on this axes.
        If None (default), new figure and axes are created
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).

    Returns
    -------
    plt.Axes
        Axes object.

    """
    # Get default `thresholds`
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Get default `distributions`
    if distributions is None:
        distributions = [
            "genpareto",
            "expon",
        ]

    # Strings are used as-is, distribution objects contribute their `name`
    distribution_names: typing.List[str] = [
        distribution if isinstance(distribution, str) else distribution.name
        for distribution in distributions
    ]

    # Calculate AIC values
    model = EVA(data=ts)
    records = []
    for distribution, distribution_name in zip(distributions, distribution_names):
        for threshold in thresholds:
            model.get_extremes(
                method="POT",
                extremes_type=extremes_type,
                threshold=threshold,
                r=r,
            )
            model.fit_model(model="MLE", distribution=distribution)
            records.append(
                {
                    "distribution_name": distribution_name,
                    "threshold": threshold,
                    "aic": model.AIC,
                }
            )
    results = pd.DataFrame(data=records).sort_values("threshold", ascending=True)

    with plt.rc_context(rc=pyextremes_rc):
        if ax is None:
            _, ax = plt.subplots(figsize=figsize, dpi=96)
            ax.grid(False)

        # Wrap around the color cycle so that passing more distributions
        # than there are cycle colors does not raise IndexError
        colors = pyextremes_rc["axes.prop_cycle"].by_key()["color"]
        for i, (distribution_name, df) in enumerate(
            results.groupby("distribution_name")
        ):
            ax.plot(
                df.loc[:, "threshold"],
                df.loc[:, "aic"],
                color=colors[i % len(colors)],
                lw=2,
                ls="-",
                label=distribution_name,
                zorder=(i + 3) * 5,
            )

        # Plot legend
        ax.legend(frameon=True, framealpha=0.9)

        # Label axes
        ax.set_xlabel("Threshold")
        ax.set_ylabel("AIC Score")

        return ax
def plot_return_value_stability(
    ts: pd.Series,
    return_period,
    return_period_size: typing.Union[str, pd.Timedelta] = "365.2425D",
    thresholds=None,
    r: typing.Union[str, pd.Timedelta] = "24H",
    extremes_type: str = "high",
    distributions: typing.Optional[
        typing.List[typing.Union[str, scipy.stats.rv_continuous]]
    ] = None,
    alpha: typing.Optional[float] = None,
    n_samples: int = 100,
    figsize: tuple = (8, 5),
) -> tuple:  # pragma: no cover
    """
    Plot return value stability plot for given threshold values.

    The return value stability plot shows return values for given return period
    for given thresholds.
    The purpose of this plot is to investigate stability and sensitivity of the
    Generalized Pareto Distribution model to threshold value.
    Threshold value selection should still be guided by the mean residual life plot
    and the parameter stability plot. This plot should be used as additional check.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    return_period : number
        Return period.
        Given as a multiple of `return_period_size`.
    return_period_size : str or pandas.Timedelta, optional
        Size of return period (default='365.2425D').
        If set to '30D', then a return period of 12
        would be roughly equivalent to a 1 year return period (360 days).
    thresholds : array-like, optional
        An array of thresholds for which the return value stability plot is plotted.
        If None (default), plots return values for 100 equally-spaced thresholds
        between 90th (10th if extremes_type='low') percentile
        and 10th largest (smallest if extremes_type='low') value in the series.
    r : str or pandas.Timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    distributions : list, optional
        List of distributions for which the return value curves are plotted.
        By default these are "genpareto" and "expon".
        A distribution must be either a name of distribution from scipy.stats
        or a subclass of scipy.stats.rv_continuous.
        See https://docs.scipy.org/doc/scipy/reference/stats.html
    alpha : float, optional
        Confidence interval width in the range (0, 1).
        If None (default), then confidence interval is not shown.
    n_samples : int, optional
        Number of bootstrap samples used to estimate
        confidence interval bounds (default=100).
        Ignored if `alpha` is None.
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).

    Returns
    -------
    figure : matplotlib.figure.Figure
        Figure object.
    axes : matplotlib.axes._axes.Axes
        Axes object.

    """
    # Get default `thresholds`
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Get default `distributions`
    if distributions is None:
        distributions = [
            "genpareto",
            "expon",
        ]

    # Legend labels: strings are used as-is, distribution objects contribute
    # their `name` (same convention as `plot_aic_scores`)
    distribution_names: typing.List[str] = [
        distribution if isinstance(distribution, str) else distribution.name
        for distribution in distributions
    ]

    # Instantiate model
    model = EVA(data=ts)

    # Calculate return values for each threshold and distribution.
    # Results are stored per position in `distributions` rather than keyed by
    # the distribution itself, so a repeated entry gets its own curve instead
    # of being appended onto an earlier one.
    return_values: typing.List[typing.List[float]] = []
    ci_lower: typing.List[typing.List[float]] = []
    ci_upper: typing.List[typing.List[float]] = []
    for distribution in distributions:
        rv_curve: typing.List[float] = []
        cil_curve: typing.List[float] = []
        ciu_curve: typing.List[float] = []
        for threshold in thresholds:
            model.get_extremes(
                method="POT",
                extremes_type=extremes_type,
                threshold=threshold,
                r=r,
            )
            model.fit_model(
                model="MLE",
                distribution=distribution,
            )
            rv, cil, ciu = model.get_return_value(
                return_period=return_period,
                return_period_size=return_period_size,
                alpha=alpha,
                n_samples=n_samples,
            )
            rv_curve.append(rv)
            cil_curve.append(cil)
            ciu_curve.append(ciu)
        return_values.append(rv_curve)
        ci_lower.append(cil_curve)
        ci_upper.append(ciu_curve)

    with plt.rc_context(rc=pyextremes_rc):
        # Create figure and axes
        fig, ax = plt.subplots(figsize=figsize, dpi=96)
        ax.grid(False)

        # Wrap around the color cycle so that passing more distributions
        # than there are cycle colors does not raise IndexError
        colors = pyextremes_rc["axes.prop_cycle"].by_key()["color"]

        # Plot central estimate of return values
        for i, distribution_name in enumerate(distribution_names):
            color = colors[i % len(colors)]
            ax.plot(
                thresholds,
                return_values[i],
                color=color,
                lw=2,
                ls="-",
                label=distribution_name,
                zorder=(i + 3) * 5,
            )

            # Plot confidence bounds
            if alpha is not None:
                for ci in (ci_lower[i], ci_upper[i]):
                    ax.plot(
                        thresholds,
                        ci,
                        color=color,
                        lw=1,
                        ls="--",
                        zorder=(i + 2) * 5,
                    )
                ax.fill_between(
                    thresholds,
                    ci_lower[i],
                    ci_upper[i],
                    facecolor=color,
                    edgecolor="None",
                    alpha=0.25,
                    zorder=(i + 1) * 5,
                )

        # Plot legend
        ax.legend(
            frameon=True,
            framealpha=0.9,
        )

        # Label axes
        ax.set_xlabel("Threshold")
        ax.set_ylabel("Return value")

        return fig, ax