def _calculate_return_value(
    args: typing.Tuple[pd.Series,  # ts (time series)
                       float,  # return_period
                       typing.Union[str, pd.Timedelta],  # return_period_size
                       float,  # threshold
                       typing.Union[str, pd.Timedelta],  # r
                       str,  # extremes_type
                       typing.Union[str,
                                    scipy.stats.rv_continuous],  # distribution
                       str,  # distribution_name
                       typing.Optional[float],  # alpha
                       int,  # n_samples
                       ],
) -> typing.Dict[str, typing.Union[str, typing.Optional[float]]]:
    """
    Fit a POT/MLE model for one threshold and distribution and get a return value.

    Parameters
    ----------
    args : tuple
        All inputs packed into a single tuple, in this order:
        ts, return_period, return_period_size, threshold, r, extremes_type,
        distribution, distribution_name, alpha, n_samples.

    Returns
    -------
    dict
        Dictionary with keys "distribution_name", "threshold", "rv", "cil"
        and "ciu", where the last three are the values returned by the
        final `get_return_value` call.

    """
    (ts, return_period, return_period_size, threshold, r, extremes_type,
     distribution, distribution_name, alpha, n_samples) = args

    # Extract peaks-over-threshold extremes and fit the distribution via MLE
    eva = EVA(data=ts)
    eva.get_extremes(
        method="POT",
        extremes_type=extremes_type,
        threshold=threshold,
        r=r,
    )
    eva.fit_model(model="MLE", distribution=distribution)

    # Arguments shared by every `get_return_value` call below
    shared_kwargs = {
        "return_period": return_period,
        "return_period_size": return_period_size,
        "alpha": alpha,
    }

    # TODO - this is a hack to avoid spawning nested subprocesses
    # The return value is requested with a sample count growing in steps
    # of 10 until it reaches `n_samples`; intermediate results are discarded.
    warmup_samples = n_samples % 10
    while warmup_samples < n_samples:
        warmup_samples += 10
        eva.get_return_value(n_samples=warmup_samples, **shared_kwargs)

    # Final call with the full sample count - this is the reported result
    return_value, lower_bound, upper_bound = eva.get_return_value(
        n_samples=n_samples,
        **shared_kwargs,
    )
    return {
        "distribution_name": distribution_name,
        "threshold": threshold,
        "rv": return_value,
        "cil": lower_bound,
        "ciu": upper_bound,
    }
# Example #2
# 0
def eva_model_pot_mle(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with POT extremes fitted using the "MLE" model.

    Extremes are extracted with method "POT" for high values, using
    a threshold of 1.35 and r="24H".
    """
    pot_settings = {
        "method": "POT",
        "extremes_type": "high",
        "threshold": 1.35,
        "r": "24H",
    }
    fitted = EVA(data=battery_wl_preprocessed)
    fitted.get_extremes(**pot_settings)
    fitted.fit_model("MLE")
    return fitted
# Example #3
# 0
def eva_model_bm_emcee(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with BM extremes fitted using the "Emcee" model.

    Extremes are extracted with method "BM" for high values, using
    a block size of "365.2425D" and errors="raise". The model is fitted
    with n_walkers=10 and n_samples=100.
    """
    bm_settings = {
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "raise",
    }
    fitted = EVA(data=battery_wl_preprocessed)
    fitted.get_extremes(**bm_settings)
    fitted.fit_model("Emcee", n_walkers=10, n_samples=100)
    return fitted
# Example #4
# 0
def eva_model_bm_mle(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with BM extremes fitted using the "MLE" model.

    Extremes are extracted with method "BM" for high values, using
    a block size of "365.2425D" and errors="raise".
    """
    bm_settings = {
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "raise",
    }
    fitted = EVA(data=battery_wl_preprocessed)
    fitted.get_extremes(**bm_settings)
    fitted.fit_model("MLE")
    return fitted
def plot_aic_scores(
        ts: pd.Series,
        thresholds=None,
        r: typing.Union[str, pd.Timedelta] = "24H",
        extremes_type: str = "high",
        distributions: typing.Optional[typing.List[typing.Union[
            str, scipy.stats.rv_continuous]]] = None,
        ax: typing.Optional[plt.Axes] = None,
        figsize: tuple = (8, 5),
) -> plt.Axes:
    """
    Plot AIC scores for each distribution and threshold.

    Used to investigate which distribution better explains data variance for each
    threshold value. Does NOT indicate which threshold value is better because
    it will always have the same shape - logarithmic curve.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    thresholds : array-like, optional
        An array of thresholds for which the AIC plot is plotted.
        If None (default), plots AIC for 100 equally-spaced thresholds
        between 90th (10th if extremes_type='low') percentile
        and 10th largest (smallest if extremes_type='low') value in the series.
    r : pandas.Timedelta or value convertible to timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
        See pandas.to_timedelta for more information.
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    distributions : list, optional
        List of distributions for which the AIC curves are plotted.
        By default these are "genpareto" and "expon".
        A distribution must be either a name of distribution from scipy.stats
        or a subclass of scipy.stats.rv_continuous.
        See https://docs.scipy.org/doc/scipy/reference/stats.html
    ax : matplotlib.axes._axes.Axes, optional
        If provided, then the plot is drawn on this axes.
        If None (default), new figure and axes are created
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).
        Used only when a new figure is created (`ax` is None).

    Returns
    -------
    plt.Axes
        Axes object.

    """
    # Get default `thresholds`
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Get default `distributions`
    if distributions is None:
        distributions = [
            "genpareto",
            "expon",
        ]

    # Distributions given as objects are labelled by their `name` attribute
    distribution_names: typing.List[str] = [
        distribution if isinstance(distribution, str) else distribution.name
        for distribution in distributions
    ]

    # Calculate AIC values
    model = EVA(data=ts)
    results = []
    for distribution, distribution_name in zip(distributions,
                                               distribution_names):
        for threshold in thresholds:
            model.get_extremes(
                method="POT",
                extremes_type=extremes_type,
                threshold=threshold,
                r=r,
            )
            model.fit_model(model="MLE", distribution=distribution)
            results.append({
                "distribution_name": distribution_name,
                "threshold": threshold,
                "aic": model.AIC,
            })
    results = pd.DataFrame(data=results).sort_values("threshold",
                                                     ascending=True)

    with plt.rc_context(rc=pyextremes_rc):
        if ax is None:
            _, ax = plt.subplots(figsize=figsize, dpi=96)
            ax.grid(False)

        # Look the color cycle up once; the index wraps around so that
        # having more distributions than colors does not raise IndexError
        colors = pyextremes_rc["axes.prop_cycle"].by_key()["color"]

        for i, (distribution_name,
                df) in enumerate(results.groupby("distribution_name")):
            ax.plot(
                df.loc[:, "threshold"],
                df.loc[:, "aic"],
                color=colors[i % len(colors)],
                lw=2,
                ls="-",
                label=distribution_name,
                zorder=(i + 3) * 5,
            )

        # Plot legend
        ax.legend(frameon=True, framealpha=0.9)

        # Label axes
        ax.set_xlabel("Threshold")
        ax.set_ylabel("AIC Score")

        return ax
# Example #6
# 0
def plot_return_value_stability(
        ts: pd.Series,
        return_period,
        return_period_size: typing.Union[str, pd.Timedelta] = "365.2425D",
        thresholds=None,
        r: typing.Union[str, pd.Timedelta] = "24H",
        extremes_type: str = "high",
        distributions: typing.Optional[typing.List[typing.Union[
            str, scipy.stats.rv_continuous]]] = None,
        alpha: typing.Optional[float] = None,
        n_samples: int = 100,
        figsize: tuple = (8, 5),
) -> tuple:  # pragma: no cover
    """
    Plot return value stability plot for given threshold values.

    The return value stability plot shows return values for given return period
    for given thresholds.
    The purpose of this plot is to investigate stability and sensitivity of the
    Generalized Pareto Distribution model to threshold value.
    Threshold value selection should still be guided by the mean residual life plot
    and the parameter stability plot. This plot should be used as additional check.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    return_period : number
        Return period.
        Given as a multiple of `return_period_size`.
    return_period_size : str or pandas.Timedelta, optional
        Size of return period (default='365.2425D').
        If set to '30D', then a return period of 12
        would be roughly equivalent to a 1 year return period (360 days).
    thresholds : array-like, optional
        An array of thresholds for which the return values are plotted.
        If None (default), plots return values for 100 equally-spaced thresholds
        between 90th (10th if extremes_type='low') percentile
        and 10th largest (smallest if extremes_type='low') value in the series.
    r : str or pandas.Timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    distributions : list, optional
        List of distributions for which the return value curves are plotted.
        By default these are "genpareto" and "expon".
        A distribution must be either a name of distribution from scipy.stats
        or a subclass of scipy.stats.rv_continuous.
        See https://docs.scipy.org/doc/scipy/reference/stats.html
    alpha : float, optional
        Confidence interval width in the range (0, 1).
        If None (default), then confidence interval is not shown.
    n_samples : int, optional
        Number of bootstrap samples used to estimate
        confidence interval bounds (default=100).
        Ignored if `alpha` is None.
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).

    Returns
    -------
    figure : matplotlib.figure.Figure
        Figure object.
    axes : matplotlib.axes._axes.Axes
        Axes object.

    """
    # Get default `thresholds`
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Get default `distributions`
    if distributions is None:
        distributions = [
            "genpareto",
            "expon",
        ]

    # Legend labels: distributions given as objects are labelled by their
    # `name` attribute (same convention as `plot_aic_scores`)
    distribution_names: typing.List[str] = [
        distribution if isinstance(distribution, str) else distribution.name
        for distribution in distributions
    ]

    # Instantiate model
    model = EVA(data=ts)

    # Calculate return values for each threshold and distribution.
    # Results are stored per position in `distributions` (not keyed by the
    # distribution itself), so repeated entries each keep their own curve
    # with exactly one value per threshold.
    return_values: typing.List[typing.List[float]] = []
    ci_lower: typing.List[typing.List[typing.Optional[float]]] = []
    ci_upper: typing.List[typing.List[typing.Optional[float]]] = []
    for distribution in distributions:
        rv_curve = []
        cil_curve = []
        ciu_curve = []
        for threshold in thresholds:
            model.get_extremes(
                method="POT",
                extremes_type=extremes_type,
                threshold=threshold,
                r=r,
            )
            model.fit_model(
                model="MLE",
                distribution=distribution,
            )
            rv, cil, ciu = model.get_return_value(
                return_period=return_period,
                return_period_size=return_period_size,
                alpha=alpha,
                n_samples=n_samples,
            )
            rv_curve.append(rv)
            cil_curve.append(cil)
            ciu_curve.append(ciu)
        return_values.append(rv_curve)
        ci_lower.append(cil_curve)
        ci_upper.append(ciu_curve)

    with plt.rc_context(rc=pyextremes_rc):
        # Create figure and axes
        fig, ax = plt.subplots(figsize=figsize, dpi=96)
        ax.grid(False)

        # Look the color cycle up once; the index wraps around so that
        # having more distributions than colors does not raise IndexError
        colors = pyextremes_rc["axes.prop_cycle"].by_key()["color"]

        # Plot central estimate of return values
        for i, distribution_name in enumerate(distribution_names):
            color = colors[i % len(colors)]
            ax.plot(
                thresholds,
                return_values[i],
                color=color,
                lw=2,
                ls="-",
                label=distribution_name,
                zorder=(i + 3) * 5,
            )

            # Plot confidence bounds
            if alpha is not None:
                for ci in [ci_lower[i], ci_upper[i]]:
                    ax.plot(
                        thresholds,
                        ci,
                        color=color,
                        lw=1,
                        ls="--",
                        zorder=(i + 2) * 5,
                    )
                ax.fill_between(
                    thresholds,
                    ci_lower[i],
                    ci_upper[i],
                    facecolor=color,
                    edgecolor="None",
                    alpha=0.25,
                    zorder=(i + 1) * 5,
                )

        # Plot legend
        ax.legend(
            frameon=True,
            framealpha=0.9,
        )

        # Label axes
        ax.set_xlabel("Threshold")
        ax.set_ylabel("Return value")

        return fig, ax