Example #1
def mad(a, c=Gaussian.ppf(3 / 4.), axis=0, center=np.median):  # c ~ 0.6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')
    if not a.size:
        center = 0.0
    elif callable(center):
        center = np.apply_over_axes(center, a, axis)
    else:
        center = float_like(center, "center")

    return np.median((np.abs(a - center)) / c, axis=axis)
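
# A minimal usage sketch, not part of the snippet above: assuming `mad` is the
# statsmodels.robust.scale.mad shown here, this checks the documented formula
# mad = median(abs(a - center)) / c on a small array.
import numpy as np
from scipy import stats
from statsmodels.robust.scale import mad

x = np.array([1.0, 2.0, 4.0, 8.0, 16.0])
c = stats.norm.ppf(3 / 4.0)                       # ~0.6745, the default constant
manual = np.median(np.abs(x - np.median(x))) / c
assert np.allclose(mad(x), manual)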
Example #2
def test_float_like(floating):
    assert isinstance(float_like(floating, "floating"), float)
    assert isinstance(float_like(floating, "floating", optional=True), float)
    assert float_like(None, "floating", optional=True) is None
    if isinstance(floating, (int, np.integer, float, np.inexact)):
        assert isinstance(float_like(floating, "floating", strict=True), float)
        assert float_like(None, "floating", optional=True, strict=True) is None
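
# Hedged sketch of the validator exercised by this test: `float_like` is assumed
# to be statsmodels.tools.validation.float_like, which coerces scalar numeric
# input to a Python float and rejects non-numeric input with a TypeError.
import pytest
from statsmodels.tools.validation import float_like

assert float_like(3, "x") == 3.0
assert float_like(None, "x", optional=True) is None
with pytest.raises(TypeError):
    float_like("not a number", "x")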
Example #3
    def __init__(
        self,
        index: Union[Sequence[Hashable], pd.Index],
        *,
        period: Optional[Union[float, int]] = None,
        constant: bool = False,
        order: int = 0,
        seasonal: bool = False,
        fourier: int = 0,
        additional_terms: Sequence[DeterministicTerm] = (),
        drop: bool = False,
    ):
        if not isinstance(index, pd.Index):
            index = pd.Index(index)
        self._index = index
        self._deterministic_terms: List[DeterministicTerm] = []
        self._extendable = False
        self._index_freq = None
        self._validate_index()
        period = float_like(period, "period", optional=True)
        self._constant = constant = bool_like(constant, "constant")
        self._order = required_int_like(order, "order")
        self._seasonal = seasonal = bool_like(seasonal, "seasonal")
        self._fourier = required_int_like(fourier, "fourier")
        additional_terms = tuple(additional_terms)
        self._cached_in_sample = None
        self._drop = bool_like(drop, "drop")
        self._additional_terms = additional_terms
        if constant or order:
            self._deterministic_terms.append(TimeTrend(constant, order))
        if seasonal and fourier:
            raise ValueError(
                "seasonal and fourier cannot both be initialized through the "
                "constructor since these will be necessarily perfectly "
                "collinear. Instead, you can pass additional components "
                "using the additional_terms input."
            )
        if (seasonal or fourier) and period is None:
            self._period = period = freq_to_period(self._index_freq)
        if seasonal:
            period = required_int_like(period, "period")
            self._deterministic_terms.append(Seasonality(period))
        elif fourier:
            period = float_like(period, "period")
            assert period is not None
            self._deterministic_terms.append(Fourier(period, order=fourier))
        for term in additional_terms:
            if not isinstance(term, DeterministicTerm):
                raise TypeError(
                    "All additional terms must be instances of subsclasses "
                    "of DeterministicTerm")
            if term not in self._deterministic_terms:
                self._deterministic_terms.append(term)
            else:
                raise ValueError(
                    "One or more terms in additional_terms has been added "
                    "through the parameters of the constructor. Terms must "
                    "be unique.")
        self._period = period
        self._retain_cols: Optional[List[Hashable]] = None
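
# Hedged usage sketch for the class this constructor appears to define (assumed
# to be statsmodels.tsa.deterministic.DeterministicProcess): a constant, a linear
# trend and monthly seasonal dummies built from a PeriodIndex.
import pandas as pd
from statsmodels.tsa.deterministic import DeterministicProcess

index = pd.period_range("2000-01", periods=24, freq="M")
dp = DeterministicProcess(index, constant=True, order=1, seasonal=True)
in_sample = dp.in_sample()            # const, trend and 11 seasonal dummy columns
out_of_sample = dp.out_of_sample(6)   # the same terms extended 6 periods ahead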
Example #4
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution.  Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.

    Returns
    -------
    The normalized interquartile range
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")

    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    elif a.size == 0:
        return np.nan
    else:
        quantiles = np.quantile(a, [0.25, 0.75], axis=axis)
        return np.squeeze(np.diff(quantiles, axis=0) / c)
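
# Hedged check of the normalization described above: with the default constant,
# the scaled IQR of standard normal draws should be close to 1 (the function is
# assumed to be statsmodels.robust.scale.iqr).
import numpy as np
from statsmodels.robust.scale import iqr

z = np.random.default_rng(0).standard_normal(100_000)
print(iqr(z))   # ~1.0, i.e. a consistent estimate of the standard deviation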
Example #5
def mad(a, c=Gaussian.ppf(3 / 4.0), axis=0, center=np.median):
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately 0.6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")
    if not a.size:
        center_val = 0.0
    elif callable(center):
        if axis is not None:
            center_val = np.apply_over_axes(center, a, axis)
        else:
            center_val = center(a.ravel())
    else:
        center_val = float_like(center, "center")
    err = (np.abs(a - center_val)) / c
    if not err.size:
        if axis is None or err.ndim == 1:
            return np.nan
        else:
            shape = list(err.shape)
            shape.pop(axis)
            return np.empty(shape)
    return np.median(err, axis=axis)
Example #6
def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
    """
    Computes the Qn robust estimator of scale

    The Qn scale estimator is a more efficient alternative to the MAD.
    The Qn scale estimator of an array a of length n is defined as
    c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
    the Qn estimator is the k-th order statistic of the absolute differences
    of the array. The optional constant is used to normalize the estimate
    as explained below. The implementation follows the algorithm described
    in Croux and Rousseeuw (1992).

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. The default value is used to get consistent
        estimates of the standard deviation at the normal distribution.
    axis : int, optional
        The default is 0.

    Returns
    -------
    {float, ndarray}
        The Qn robust estimator of scale
    """
    a = array_like(a,
                   "a",
                   ndim=None,
                   dtype=np.float64,
                   contiguous=True,
                   order="C")
    c = float_like(c, "c")
    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    elif a.size == 0:
        return np.nan
    else:
        out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
        if out.ndim == 0:
            return float(out)
        return out
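
# Hedged sketch: with the default constant, qn_scale should estimate the standard
# deviation consistently at the normal distribution and shrug off a few gross
# outliers (the function is assumed to be statsmodels.robust.scale.qn_scale).
import numpy as np
from statsmodels.robust.scale import qn_scale

rng = np.random.default_rng(0)
z = 2.0 * rng.standard_normal(5_000)
z[:10] = 1e6                       # a handful of outliers barely move the estimate
print(qn_scale(z), np.std(z))      # ~2.0 versus a std blown up by the outliers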
Example #7
def iqr(a,
        c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4),
        axis=0,
        center=np.median):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution.  Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    The normalized interquartile range
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')

    if a.size == 0:
        return np.nan

    if callable(center):
        center = np.apply_over_axes(center, a, axis)
    else:
        # use the user-supplied float center instead of silently replacing it
        center = float_like(center, "center")
    quantiles = np.quantile(a - center, [0.25, 0.75], axis=axis)
    return np.squeeze(np.diff(quantiles, axis=0) / c)
Example #8
    def __init__(
        self,
        data: Union[np.ndarray, pd.Series, pd.DataFrame],
        stats: Optional[Sequence[str]] = None,
        *,
        numeric: bool = True,
        categorical: bool = True,
        alpha: float = 0.05,
        use_t: bool = False,
        percentiles: Sequence[Union[int, float]] = PERCENTILES,
        ntop: int = 5,
    ):
        data_arr = data
        if not isinstance(data, (pd.Series, pd.DataFrame)):
            data_arr = array_like(data, "data", maxdim=2)
        if data_arr.ndim == 1:
            data = pd.Series(data)
        numeric = bool_like(numeric, "numeric")
        categorical = bool_like(categorical, "categorical")
        include = []
        col_types = ""
        if numeric:
            include.append(np.number)
            col_types = "numeric"
        if categorical:
            include.append("category")
            col_types += "and " if col_types != "" else ""
            col_types += "categorical"
        if not numeric and not categorical:
            raise ValueError(
                "At least one of numeric and categorical must be True"
            )
        self._data = pd.DataFrame(data).select_dtypes(include)
        if self._data.shape[1] == 0:
            raise ValueError(
                f"Selecting {col_types} results in an empty DataFrame"
            )
        self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
        self._is_cat_like = [
            is_categorical_dtype(dt) for dt in self._data.dtypes
        ]

        if stats is not None:
            undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
            if undef:
                raise ValueError(
                    f"{', '.join(undef)} are not known statistics"
                )
        self._stats = (
            list(DEFAULT_STATISTICS) if stats is None else list(stats)
        )
        self._ntop = int_like(ntop, "ntop")
        self._compute_top = "top" in self._stats
        self._compute_freq = "freq" in self._stats
        if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
            raise ValueError("top must be a non-negative integer")

        self._compute_perc = "percentiles" in self._stats
        self._percentiles = array_like(
            percentiles, "percentiles", maxdim=1, dtype="d"
        )
        self._percentiles = np.sort(self._percentiles)
        if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
            raise ValueError("percentiles must be distinct")
        if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
            raise ValueError("percentiles must be strictly between 0 and 100")

        # Expand special stats
        replacements = {
            "mode": ["mode", "mode_freq"],
            "ci": ["upper_ci", "lower_ci"],
            "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
            "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
            "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
            "percentiles": [f"{i}%" for i in percentiles],
        }

        for key in replacements:
            if key in self._stats:
                idx = self._stats.index(key)
                self._stats = (
                    self._stats[:idx]
                    + replacements[key]
                    + self._stats[idx + 1 :]
                )

        self._alpha = float_like(alpha, "alpha")
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1")
        self._use_t = bool_like(use_t, "use_t")
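
# Hedged usage sketch for the class this constructor seems to belong to (assumed
# to be statsmodels.stats.descriptivestats.Description), mixing a numeric and a
# categorical column.
import numpy as np
import pandas as pd
from statsmodels.stats.descriptivestats import Description

df = pd.DataFrame({
    "x": np.random.default_rng(0).standard_normal(100),
    "grp": pd.Categorical(["a", "b", "c", "d"] * 25),
})
desc = Description(df, numeric=True, categorical=True, ntop=3)
print(desc.frame)      # one column of statistics per selected variable
print(desc.summary())  # the same information as a formatted summary table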
Example #9
def test_not_float_like(not_floating):
    with pytest.raises(TypeError):
        float_like(not_floating, "floating")
Example #10
    def fit(self,
            smoothing_level=None,
            smoothing_slope=None,
            smoothing_seasonal=None,
            damping_slope=None,
            optimized=True,
            use_boxcox=False,
            remove_bias=False,
            use_basinhopping=False,
            start_params=None,
            initial_level=None,
            initial_slope=None,
            use_brute=True):
        """
        Fit the model

        Parameters
        ----------
        smoothing_level : float, optional
            The alpha value of the simple exponential smoothing. If given,
            this value is used rather than estimated.
        smoothing_slope : float, optional
            The beta value of Holt's trend method. If given, this value is
            used rather than estimated.
        smoothing_seasonal : float, optional
            The gamma value of the Holt-Winters seasonal method. If given,
            this value is used rather than estimated.
        damping_slope : float, optional
            The phi value of the damped method. If given, this value is
            used rather than estimated.
        optimized : bool, optional
            Estimate model parameters by maximizing the log-likelihood.
        use_boxcox : {True, False, 'log', float}, optional
            Should the Box-Cox transform be applied to the data first? If 'log'
            then apply the log. If float then use lambda equal to float.
        remove_bias : bool, optional
            Remove bias from forecast values and fitted values by enforcing
            that the average residual is equal to zero.
        use_basinhopping : bool, optional
            Use the basin hopping optimizer to find optimal parameter values.
        start_params : ndarray, optional
            Starting values to use when optimizing the fit. If not provided,
            starting values are determined using a combination of grid search
            and reasonable values based on the initial values of the data.
        initial_level : float, optional
            Value to use when initializing the fitted level.
        initial_slope : float, optional
            Value to use when initializing the fitted slope.
        use_brute : bool, optional
            Search for good starting values using a brute force (grid)
            optimizer. If False, a naive set of starting values is used.

        Returns
        -------
        results : HoltWintersResults class
            See statsmodels.tsa.holtwinters.HoltWintersResults

        Notes
        -----
        This is a full implementation of the Holt-Winters exponential smoothing
        as per [1]. This includes all the unstable methods as well as the
        stable methods. The implementation of the library covers the
        functionality of the R library as much as possible whilst still
        being Pythonic.

        References
        ----------
        [1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles
            and practice. OTexts, 2014.
        """
        # Rename variables to alpha, beta, etc. to follow the usual
        # mathematical notation
        alpha = float_like(smoothing_level, 'smoothing_level', True)
        beta = float_like(smoothing_slope, 'smoothing_slope', True)
        gamma = float_like(smoothing_seasonal, 'smoothing_seasonal', True)
        phi = float_like(damping_slope, 'damping_slope', True)
        l0 = self._l0 = float_like(initial_level, 'initial_level', True)
        b0 = self._b0 = float_like(initial_slope, 'initial_slope', True)
        if start_params is not None:
            start_params = array_like(start_params,
                                      'start_params',
                                      contiguous=True)
        data = self._data
        damped = self.damped
        seasoning = self.seasoning
        trending = self.trending
        trend = self.trend
        seasonal = self.seasonal
        m = self.seasonal_periods
        opt = None
        phi = phi if damped else 1.0
        if use_boxcox == 'log':
            lamda = 0.0
            y = boxcox(data, lamda)
        elif isinstance(use_boxcox, float):
            lamda = use_boxcox
            y = boxcox(data, lamda)
        elif use_boxcox:
            y, lamda = boxcox(data)
        else:
            lamda = None
            y = data.squeeze()
        self._y = y
        lvls = np.zeros(self.nobs)
        b = np.zeros(self.nobs)
        s = np.zeros(self.nobs + m - 1)
        p = np.zeros(6 + m)
        max_seen = np.finfo(np.double).max
        l0, b0, s0 = self.initial_values()

        xi = np.zeros_like(p, dtype=bool)
        if optimized:
            init_alpha = alpha if alpha is not None else 0.5 / max(m, 1)
            init_beta = beta
            if beta is None and trending:
                init_beta = 0.1 * init_alpha
            init_gamma = None
            init_phi = phi if phi is not None else 0.99
            # Selection of functions to optimize for appropriate parameters
            if seasoning:
                init_gamma = (gamma if gamma is not None
                              else 0.05 * (1 - init_alpha))
                xi = np.array([
                    alpha is None, trending and beta is None, gamma is None,
                    initial_level is None, trending and initial_slope is None,
                    phi is None and damped
                ] + [True] * m)
                func = SMOOTHERS[(seasonal, trend)]
            elif trending:
                xi = np.array([
                    alpha is None, beta is None, False, initial_level is None,
                    initial_slope is None, phi is None and damped
                ] + [False] * m)
                func = SMOOTHERS[(None, trend)]
            else:
                xi = np.array([
                    alpha is None, False, False, initial_level is None, False,
                    False
                ] + [False] * m)
                func = SMOOTHERS[(None, None)]
            p[:] = [init_alpha, init_beta, init_gamma, l0, b0, init_phi] + s0
            if np.any(xi):
                # txi [alpha, beta, gamma, l0, b0, phi, s0,..,s_(m-1)]
                # Have a quick look in the region for a good starting place for alpha etc.
                # using guesstimates for the levels
                txi = xi & np.array([True, True, True, False, False, True] +
                                    [False] * m)
                txi = txi.astype(bool)
                bounds = ([(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, None),
                           (0.0, None), (0.0, 1.0)] + [
                               (None, None),
                           ] * m)
                args = (txi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs,
                        max_seen)
                if start_params is None and np.any(txi) and use_brute:
                    _bounds = [bnd for bnd, flag in zip(bounds, txi) if flag]
                    res = brute(func,
                                _bounds,
                                args,
                                Ns=20,
                                full_output=True,
                                finish=None)
                    p[txi], max_seen, _, _ = res
                else:
                    if start_params is not None:
                        if len(start_params) != xi.sum():
                            msg = 'start_params must have {0} values but ' \
                                  'has {1} instead'
                            nxi, nsp = int(xi.sum()), len(start_params)
                            raise ValueError(msg.format(nxi, nsp))
                        p[xi] = start_params
                    args = (xi.astype(np.uint8), p, y, lvls, b, s, m,
                            self.nobs, max_seen)
                    max_seen = func(np.ascontiguousarray(p[xi]), *args)
                # alpha, beta, gamma, l0, b0, phi = p[:6]
                # s0 = p[6:]
                # bounds = np.array([(0.0,1.0),(0.0,1.0),(0.0,1.0),(0.0,None),
                # (0.0,None),(0.8,1.0)] + [(None,None),]*m)
                args = (xi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs,
                        max_seen)
                if use_basinhopping:
                    # Take a deeper look at the local minimum we are in to
                    # find the best parameters, hopping around to try to
                    # escape it if necessary.
                    _bounds = [bnd for bnd, flag in zip(bounds, xi) if flag]
                    res = basinhopping(func,
                                       p[xi],
                                       minimizer_kwargs={
                                           'args': args,
                                           'bounds': _bounds
                                       },
                                       stepsize=0.01)
                    success = res.lowest_optimization_result.success
                else:
                    # Take a deeper look at the local minimum we are in to
                    # find the best parameters
                    _bounds = [bnd for bnd, flag in zip(bounds, xi) if flag]
                    lb, ub = np.asarray(_bounds).T.astype(float)
                    initial_p = p[xi]

                    # Ensure strictly inbounds
                    loc = initial_p <= lb
                    upper = ub[loc].copy()
                    upper[~np.isfinite(upper)] = 100.0
                    eps = 1e-4
                    initial_p[loc] = lb[loc] + eps * (upper - lb[loc])

                    loc = initial_p >= ub
                    lower = lb[loc].copy()
                    lower[~np.isfinite(lower)] = -100.0
                    eps = 1e-4
                    initial_p[loc] = ub[loc] - eps * (ub[loc] - lower)

                    res = minimize(func, initial_p, args=args, bounds=_bounds)
                    success = res.success

                if not success:
                    from warnings import warn
                    from statsmodels.tools.sm_exceptions import ConvergenceWarning
                    warn("Optimization failed to converge. Check mle_retvals.",
                         ConvergenceWarning)
                p[xi] = res.x
                opt = res
            else:
                from warnings import warn
                from statsmodels.tools.sm_exceptions import EstimationWarning
                message = "Model has no free parameters to estimate. Set " \
                          "optimized=False to suppress this warning"
                warn(message, EstimationWarning)

            [alpha, beta, gamma, l0, b0, phi] = p[:6]
            s0 = p[6:]

        hwfit = self._predict(h=0,
                              smoothing_level=alpha,
                              smoothing_slope=beta,
                              smoothing_seasonal=gamma,
                              damping_slope=phi,
                              initial_level=l0,
                              initial_slope=b0,
                              initial_seasons=s0,
                              use_boxcox=use_boxcox,
                              remove_bias=remove_bias,
                              is_optimized=xi)
        hwfit._results.mle_retvals = opt
        return hwfit
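
# Hedged usage sketch for this fit method; the class is assumed to be the classic
# statsmodels.tsa.holtwinters.ExponentialSmoothing, and the keyword names
# (smoothing_slope, use_brute, ...) follow the older signature shown above.
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

rng = np.random.default_rng(0)
y = pd.Series(10 + 0.5 * np.arange(60) + rng.normal(0, 0.5, 60))
model = ExponentialSmoothing(y, trend="add")
fixed = model.fit(smoothing_level=0.6, smoothing_slope=0.2, optimized=False)
auto = model.fit(optimized=True, use_brute=True)
print(fixed.params["smoothing_level"], auto.params["smoothing_level"])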
Example #11
    def fit(
            self,
            kernel="gau",
            bw="normal_reference",
            fft=True,
            weights=None,
            gridsize=None,
            adjust=1,
            cut=3,
            clip=(-np.inf, np.inf),
    ):
        """
        Attach the density estimate to the KDEUnivariate class.

        Parameters
        ----------
        kernel : str
            The Kernel to be used. Choices are:

            - "biw" for biweight
            - "cos" for cosine
            - "epa" for Epanechnikov
            - "gau" for Gaussian.
            - "tri" for triangular
            - "triw" for triweight
            - "uni" for uniform

        bw : str, float, callable
            The bandwidth to use. Choices are:

            - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
              `min(std(x),IQR/1.34)`
            - "silverman" - .9 * A * nobs ** (-1/5.), where A is
              `min(std(x),IQR/1.34)`
            - "normal_reference" - C * A * nobs ** (-1/5.), where C is
              calculated from the kernel. Equivalent (up to 2 dp) to the
              "scott" bandwidth for gaussian kernels. See bandwidths.py
            - If a float is given, its value is used as the bandwidth.
            - If a callable is given, its return value is used.
              The callable should take exactly two parameters, i.e.,
              fn(x, kern), and return a float, where:

              * x - the clipped input data
              * kern - the kernel instance used

        fft : bool
            Whether or not to use FFT. FFT implementation is more
            computationally efficient. However, only the Gaussian kernel
            is implemented. If FFT is False, then a 'nobs' x 'gridsize'
            intermediate array is created.
        gridsize : int
            If gridsize is None, max(len(x), 50) is used.
        cut : float
            Defines the length of the grid past the lowest and highest values
            of x so that the kernel goes to zero. The end points are
            -/+ cut*bw*{min(x) or max(x)}
        adjust : float
            An adjustment factor for the bw. Bandwidth becomes bw * adjust.

        Returns
        -------
        KDEUnivariate
            The instance fit.
        """
        if isinstance(bw, str):
            self.bw_method = bw
        else:
            self.bw_method = "user-given"
            if not callable(bw):
                bw = float_like(bw, "bw")

        endog = self.endog

        if fft:
            if kernel != "gau":
                msg = "Only gaussian kernel is available for fft"
                raise NotImplementedError(msg)
            if weights is not None:
                msg = "Weights are not implemented for fft"
                raise NotImplementedError(msg)
            density, grid, bw = kdensityfft(
                endog,
                kernel=kernel,
                bw=bw,
                adjust=adjust,
                weights=weights,
                gridsize=gridsize,
                clip=clip,
                cut=cut,
            )
        else:
            density, grid, bw = kdensity(
                endog,
                kernel=kernel,
                bw=bw,
                adjust=adjust,
                weights=weights,
                gridsize=gridsize,
                clip=clip,
                cut=cut,
            )
        self.density = density
        self.support = grid
        self.bw = bw
        self.kernel = kernel_switch[kernel](h=bw)  # we instantiate twice,
        # should this be passed to funcs?
        # put here to ensure empty cache after re-fit with new options
        self.kernel.weights = weights
        if weights is not None:
            self.kernel.weights /= weights.sum()
        self._cache = {}
        return self
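
# Hedged usage sketch (the class is assumed to be
# statsmodels.nonparametric.kde.KDEUnivariate): fit via the FFT-based Gaussian
# path, then refit with an explicit float bandwidth, which is validated by the
# float_like(bw, "bw") call above.
import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(-2, 1, 500), rng.normal(3, 0.5, 500)])
kde = KDEUnivariate(x)
kde.fit(kernel="gau", bw="normal_reference", fft=True)
support, density = kde.support, kde.density     # grid and estimated density
kde.fit(bw=0.25)                                # user-supplied float bandwidth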
Example #12
def kdensityfft(
        x,
        kernel="gau",
        bw="normal_reference",
        weights=None,
        gridsize=None,
        adjust=1,
        clip=(-np.inf, np.inf),
        cut=3,
        retgrid=True,
):
    """
    Rosenblatt-Parzen univariate kernel density estimator

    Parameters
    ----------
    x : array_like
        The variable for which the density estimate is desired.
    kernel : str
        ONLY GAUSSIAN IS CURRENTLY IMPLEMENTED.
        "bi" for biweight
        "cos" for cosine
        "epa" for Epanechnikov, default
        "epa2" for alternative Epanechnikov
        "gau" for Gaussian.
        "par" for Parzen
        "rect" for rectangular
        "tri" for triangular
    bw : str, float, callable
        The bandwidth to use. Choices are:

        - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "silverman" - .9 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "normal_reference" - C * A * nobs ** (-1/5.), where C is
          calculated from the kernel. Equivalent (up to 2 dp) to the
          "scott" bandwidth for gaussian kernels. See bandwidths.py
        - If a float is given, its value is used as the bandwidth.
        - If a callable is given, its return value is used.
          The callable should take exactly two parameters, i.e.,
          fn(x, kern), and return a float, where:

          * x - the clipped input data
          * kern - the kernel instance used

    weights : array or None
        WEIGHTS ARE NOT CURRENTLY IMPLEMENTED.
        Optional weights. If the x value is clipped, then this weight is
        also dropped.
    gridsize : int
        If gridsize is None, max(len(x), 512) is used. Note that the provided
        number is rounded up to the next highest power of 2.
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.
    clip : tuple
        Observations in x that are outside of the range given by clip are
        dropped. The number of observations in x is then shortened.
    cut : float
        Defines the length of the grid past the lowest and highest values of x
        so that the kernel goes to zero. The end points are
        -/+ cut*bw*{x.min() or x.max()}
    retgrid : bool
        Whether or not to return the grid over which the density is estimated.

    Returns
    -------
    density : ndarray
        The densities estimated at the grid points.
    grid : ndarray, optional
        The grid points at which the density is estimated.

    Notes
    -----
    Only the default kernel is implemented. Weights are not implemented yet.
    This follows Silverman (1982) with changes suggested by Jones and Lotwick
    (1984). However, the discretization step is replaced by linear binning
    of Fan and Marron (1994). This should be extended to accept the parts
    that are dependent only on the data to speed things up for
    cross-validation.

    References
    ----------
    Fan, J. and J.S. Marron. (1994) `Fast implementations of nonparametric
        curve estimators`. Journal of Computational and Graphical Statistics.
        3.1, 35-56.
    Jones, M.C. and H.W. Lotwick. (1984) `Remark AS R50: A Remark on Algorithm
        AS 176. Kernel Density Estimation Using the Fast Fourier Transform`.
        Journal of the Royal Statistical Society. Series C. 33.1, 120-2.
    Silverman, B.W. (1982) `Algorithm AS 176. Kernel density estimation using
        the Fast Fourier Transform`. Journal of the Royal Statistical Society.
        Series C. 31.2, 93-9.
    """
    x = np.asarray(x)
    # will not work for two columns.
    x = x[np.logical_and(x > clip[0], x < clip[1])]

    # Get kernel object corresponding to selection
    kern = kernel_switch[kernel]()

    if callable(bw):
        bw = float(bw(x, kern))
        # user passed a callable custom bandwidth function
    elif isinstance(bw, str):
        # if bw is None, select optimal bandwidth for kernel
        bw = bandwidths.select_bandwidth(x, bw, kern)
        # will cross-val fit this pattern?
    else:
        bw = float_like(bw, "bw")

    bw *= adjust

    nobs = len(x)  # after trim

    # 1 Make grid and discretize the data
    if gridsize is None:
        gridsize = np.max((nobs, 512.0))
    gridsize = 2**np.ceil(np.log2(gridsize))  # round to next power of 2

    a = np.min(x) - cut * bw
    b = np.max(x) + cut * bw
    grid, delta = np.linspace(a, b, int(gridsize), retstep=True)
    RANGE = b - a

    # TODO: Fix this?
    # This is the Silverman binning function, but I believe it's buggy (SS)
    # weighting according to Silverman
    #    count = counts(x,grid)
    #    binned = np.zeros_like(grid)    #xi_{k} in Silverman
    #    j = 0
    #    for k in range(int(gridsize-1)):
    #        if count[k]>0: # there are points of x in the grid here
    #            Xingrid = x[j:j+count[k]] # get all these points
    #            # get weights at grid[k],grid[k+1]
    #            binned[k] += np.sum(grid[k+1]-Xingrid)
    #            binned[k+1] += np.sum(Xingrid-grid[k])
    #            j += count[k]
    #    binned /= (nobs)*delta**2 # normalize binned to sum to 1/delta

    # NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING
    binned = fast_linbin(x, a, b, gridsize) / (delta * nobs)

    # step 2 compute FFT of the weights, using Munro (1976) FFT convention
    y = forrt(binned)

    # step 3 and 4 for optimal bw compute zstar and the density estimate f
    # do not have to redo the above if just changing bw, ie., for cross val

    # NOTE: silverman_transform is the closed form solution of the FFT of the
    # gaussian kernel. Not yet sure how to generalize it.
    zstar = silverman_transform(bw, gridsize, RANGE) * y
    # 3.49 in Silverman
    # 3.50 w Gaussian kernel
    f = revrt(zstar)
    if retgrid:
        return f, grid, bw
    else:
        return f, bw
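
# Hedged comparison sketch: for the Gaussian kernel, the FFT-based estimator above
# and the direct estimator should give very similar answers on the same data
# (both functions are assumed to live in statsmodels.nonparametric.kde).
import numpy as np
from statsmodels.nonparametric.kde import kdensity, kdensityfft

x = np.random.default_rng(0).standard_normal(1_000)
f_fft, grid_fft, bw = kdensityfft(x, kernel="gau", bw="normal_reference")
f_direct, grid_direct, _ = kdensity(x, kernel="gau", bw=bw)
print(bw, f_fft.max(), f_direct.max())   # peak heights should roughly agree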
Example #13
    def __init__(self, data, ncomp=None, standardize=True, demean=True,
                 normalize=True, gls=False, weights=None, method='svd',
                 missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
                 max_em_iter=100):
        self._index = None
        self._columns = []
        if isinstance(data, pd.DataFrame):
            self._index = data.index
            self._columns = data.columns

        self.data = array_like(data, "data", ndim=2)
        # Store inputs
        self._gls = bool_like(gls, "gls")
        self._normalize = bool_like(normalize, "normalize")
        self._tol = float_like(tol, "tol")
        if not 0 < self._tol < 1:
            raise ValueError('tol must be strictly between 0 and 1')
        self._max_iter = int_like(max_iter, "max_iter")
        self._max_em_iter = int_like(max_em_iter, "max_em_iter")
        self._tol_em = float_like(tol_em, "tol_em")

        # Prepare data
        self._standardize = bool_like(standardize, "standardize")
        self._demean = bool_like(demean, "demean")

        self._nobs, self._nvar = self.data.shape
        weights = array_like(weights, "weights", maxdim=1, optional=True)
        if weights is None:
            weights = np.ones(self._nvar)
        else:
            weights = np.array(weights).flatten()
            if weights.shape[0] != self._nvar:
                raise ValueError('weights should have nvar elements')
            weights = weights / np.sqrt((weights ** 2.0).mean())
        self.weights = weights

        # Check ncomp against maximum
        min_dim = min(self._nobs, self._nvar)
        self._ncomp = min_dim if ncomp is None else ncomp
        if self._ncomp > min_dim:
            import warnings

            warn = 'The requested number of components is more than can be ' \
                   'computed from data. The maximum number of components is ' \
                   'the minimum of the number of observations or variables'
            warnings.warn(warn, ValueWarning)
            self._ncomp = min_dim

        self._method = method
        # Workaround to avoid instance methods in __dict__
        if self._method not in ('eig', 'svd', 'nipals'):
            raise ValueError('method {0} is not known.'.format(method))

        self.rows = np.arange(self._nobs)
        self.cols = np.arange(self._nvar)
        # Handle missing
        self._missing = string_like(missing, "missing", optional=True)
        self._adjusted_data = self.data
        self._adjust_missing()

        # Update size
        self._nobs, self._nvar = self._adjusted_data.shape
        if self._ncomp == np.min(self.data.shape):
            self._ncomp = np.min(self._adjusted_data.shape)
        elif self._ncomp > np.min(self._adjusted_data.shape):
            raise ValueError('When adjusting for missing values, user '
                             'provided ncomp must be no larger than the '
                             'smallest dimension of the '
                             'missing-value-adjusted data size.')

        # Attributes and internal values
        self._tss = 0.0
        self._ess = None
        self.transformed_data = None
        self._mu = None
        self._sigma = None
        self._ess_indiv = None
        self._tss_indiv = None
        self.scores = self.factors = None
        self.loadings = None
        self.coeff = None
        self.eigenvals = None
        self.eigenvecs = None
        self.projection = None
        self.rsquare = None
        self.ic = None

        # Prepare data
        self.transformed_data = self._prepare_data()
        # Perform the PCA
        self._pca()
        if gls:
            self._compute_gls_weights()
            self.transformed_data = self._prepare_data()
            self._pca()

        # Final calculations
        self._compute_rsquare_and_ic()
        if self._index is not None:
            self._to_pandas()
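
# Hedged usage sketch for this constructor (assumed to be
# statsmodels.multivariate.pca.PCA): extract two components from standardized
# data and inspect the variance explained.
import numpy as np
from statsmodels.multivariate.pca import PCA

data = np.random.default_rng(0).standard_normal((200, 10))
pc = PCA(data, ncomp=2, standardize=True, demean=True, method="svd")
print(pc.factors.shape, pc.loadings.shape)   # component scores and loadings
print(pc.rsquare)                            # cumulative R^2 for 0, 1 and 2 components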
Example #14
    def forecast(self, steps: int = 1, theta: float = 2) -> pd.Series:
        r"""
        Forecast the model for a given theta

        Parameters
        ----------
        steps : int
            The number of steps ahead to compute the forecast components.
        theta : float
            The theta value to use when computing the weight to combine
            the trend and the SES forecasts.

        Returns
        -------
        Series
            A Series containing the forecasts

        Notes
        -----
        The forecast is computed as

        .. math::

           \hat{X}_{T+h|T} = \frac{\theta-1}{\theta} b_0
                             \left[h - 1 + \frac{1}{\alpha}
                             - \frac{(1-\alpha)^T}{\alpha} \right]
                             + \tilde{X}_{T+h|T}

        where :math:`\tilde{X}_{T+h|T}` is the SES forecast of the endogenous
        variable using the parameter :math:`\alpha`. :math:`b_0` is the
        slope of a time trend line fitted to X using the terms 0, 1, ..., T-1.

        This expression follows from [1]_ and [2]_ when the combination
        weights are restricted to be (theta-1)/theta and 1/theta. This nests
        the original implementation when theta=2 and the two weights are both
        1/2.

        References
        ----------
        .. [1] Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method.
           International Journal of Forecasting, 19(2), 287-290.
        .. [2] Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos,
           F. (2015). The optimized theta method. arXiv preprint
           arXiv:1503.03529.
        """

        steps = int_like(steps, "steps")
        if steps < 1:
            raise ValueError("steps must be a positive integer")
        theta = float_like(theta, "theta")
        if theta < 1:
            raise ValueError("theta must be a float >= 1")
        thresh = 4.0 / np.finfo(np.double).eps
        trend_weight = (theta - 1) / theta if theta < thresh else 1.0
        comp = self.forecast_components(steps=steps)
        fcast = trend_weight * comp.trend + np.asarray(comp.ses)
        # Re-seasonalize if needed
        if self.model.deseasonalize:
            seasonal = np.asarray(comp.seasonal)
            if self.model.method.startswith("mul"):
                fcast *= seasonal
            else:
                fcast += seasonal
        fcast.name = "forecast"

        return fcast
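
# Hedged usage sketch for this forecast method; the surrounding results class is
# assumed to come from statsmodels.tsa.forecasting.theta.ThetaModel.
import numpy as np
import pandas as pd
from statsmodels.tsa.forecasting.theta import ThetaModel

idx = pd.period_range("2000-01", periods=60, freq="M")
seasonal = 5 * np.sin(2 * np.pi * np.arange(60) / 12)
y = pd.Series(100 + np.arange(60) + seasonal, index=idx)
res = ThetaModel(y, period=12).fit()
print(res.forecast(steps=12, theta=2))   # SES/trend combination, re-seasonalized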
Example #15
    def __init__(self, endog, trend=False, damped_trend=False, seasonal=None,
                 initialization_method='estimated', initial_level=None,
                 initial_trend=None, initial_seasonal=None, bounds=None,
                 concentrate_scale=True, dates=None, freq=None,
                 missing='none'):
        # Model definition
        self.trend = bool_like(trend, 'trend')
        self.damped_trend = bool_like(damped_trend, 'damped_trend')
        self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True)
        self.seasonal = self.seasonal_periods is not None
        self.initialization_method = string_like(
            initialization_method, 'initialization_method').lower()
        self.concentrate_scale = bool_like(concentrate_scale,
                                           'concentrate_scale')

        # TODO: add validation for bounds (e.g. have all bounds, upper > lower)
        # TODO: add `bounds_method` argument to choose between "usual" and
        # "admissible" as in Hyndman et al. (2008)
        self.bounds = bounds
        if self.bounds is None:
            self.bounds = [(1e-4, 1-1e-4)] * 3 + [(0.8, 0.98)]

        # Validation
        if self.seasonal_periods == 1:
            raise ValueError('Cannot have a seasonal period of 1.')

        if self.seasonal and self.seasonal_periods is None:
            raise NotImplementedError('Unable to detect season automatically;'
                                      ' please specify `seasonal_periods`.')

        if self.initialization_method not in ['concentrated', 'estimated',
                                              'simple', 'heuristic', 'known']:
            raise ValueError('Invalid initialization method "%s".'
                             % initialization_method)

        if self.initialization_method == 'known':
            if initial_level is None:
                raise ValueError('`initial_level` argument must be provided'
                                 ' when initialization method is set to'
                                 ' "known".')
            if initial_trend is None and self.trend:
                raise ValueError('`initial_trend` argument must be provided'
                                 ' for models with a trend component when'
                                 ' initialization method is set to "known".')
            if initial_seasonal is None and self.seasonal:
                raise ValueError('`initial_seasonal` argument must be provided'
                                 ' for models with a seasonal component when'
                                 ' initialization method is set to "known".')

        # Initialize the state space model
        if not self.seasonal or self.seasonal_periods is None:
            self._seasonal_periods = 0
        else:
            self._seasonal_periods = self.seasonal_periods

        k_states = 2 + int(self.trend) + self._seasonal_periods
        k_posdef = 1

        init = ss_init.Initialization(k_states, 'known',
                                      constant=[0] * k_states)
        super(ExponentialSmoothing, self).__init__(
            endog, k_states=k_states, k_posdef=k_posdef,
            initialization=init, dates=dates, freq=freq, missing=missing)

        # Concentrate the scale out of the likelihood function
        if self.concentrate_scale:
            self.ssm.filter_concentrated = True

        # Setup fixed elements of the system matrices
        # Observation error
        self.ssm['design', 0, 0] = 1.
        self.ssm['selection', 0, 0] = 1.
        self.ssm['state_cov', 0, 0] = 1.

        # Level
        self.ssm['design', 0, 1] = 1.
        self.ssm['transition', 1, 1] = 1.

        # Trend
        if self.trend:
            self.ssm['transition', 1:3, 2] = 1.

        # Seasonal
        if self.seasonal:
            k = 2 + int(self.trend)
            self.ssm['design', 0, k] = 1.
            self.ssm['transition', k, -1] = 1.
            self.ssm['transition', k + 1:k_states, k:k_states - 1] = (
                np.eye(self.seasonal_periods - 1))

        # Initialization of the states
        if self.initialization_method != 'known':
            msg = ('Cannot give `%%s` argument when initialization is "%s"'
                   % initialization_method)
            if initial_level is not None:
                raise ValueError(msg % 'initial_level')
            if initial_trend is not None:
                raise ValueError(msg % 'initial_trend')
            if initial_seasonal is not None:
                raise ValueError(msg % 'initial_seasonal')

        if self.initialization_method == 'simple':
            initial_level, initial_trend, initial_seasonal = (
                es_init._initialization_simple(
                    self.endog[:, 0], trend='add' if self.trend else None,
                    seasonal='add' if self.seasonal else None,
                    seasonal_periods=self.seasonal_periods))
        elif self.initialization_method == 'heuristic':
            initial_level, initial_trend, initial_seasonal = (
                es_init._initialization_heuristic(
                    self.endog[:, 0], trend='add' if self.trend else None,
                    seasonal='add' if self.seasonal else None,
                    seasonal_periods=self.seasonal_periods))
        elif self.initialization_method == 'known':
            initial_level = float_like(initial_level, 'initial_level')
            if self.trend:
                initial_trend = float_like(initial_trend, 'initial_trend')
            if self.seasonal:
                initial_seasonal = array_like(initial_seasonal,
                                              'initial_seasonal')

                if len(initial_seasonal) == self.seasonal_periods - 1:
                    initial_seasonal = np.r_[initial_seasonal,
                                             0 - np.sum(initial_seasonal)]

                if len(initial_seasonal) != self.seasonal_periods:
                    raise ValueError(
                        'Invalid length of initial seasonal values. Must be'
                        ' one of s or s-1, where s is the number of seasonal'
                        ' periods.')

        # Note that the simple and heuristic methods of computing initial
        # seasonal factors return estimated seasonal factors associated with
        # the first t = 1, 2, ..., `n_seasons` observations. To use these as
        # the initial state, we lag them by `n_seasons`. This yields, for
        # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0.
        # As described above, the state vector in this model should have
        # seasonal factors ordered L0, L1, L2, L3, and as a result we need to
        # reverse the order of the computed initial seasonal factors from
        # these methods.
        methods = ['simple', 'heuristic']
        if (self.initialization_method in methods
                and initial_seasonal is not None):
            initial_seasonal = initial_seasonal[::-1]

        self._initial_level = initial_level
        self._initial_trend = initial_trend
        self._initial_seasonal = initial_seasonal
        self._initial_state = None

        # Initialize now if possible (if we have a damped trend, then
        # initialization will depend on the phi parameter, and so has to be
        # done at each `update`)
        methods = ['simple', 'heuristic', 'known']
        if not self.damped_trend and self.initialization_method in methods:
            self._initialize_constant_statespace(initial_level, initial_trend,
                                                 initial_seasonal)

        # Save keys for kwarg initialization
        self._init_keys += ['trend', 'damped_trend', 'seasonal',
                            'initialization_method', 'initial_level',
                            'initial_trend', 'initial_seasonal', 'bounds',
                            'concentrate_scale', 'dates', 'freq', 'missing']
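
# Hedged usage sketch for this state space model (assumed to be
# statsmodels.tsa.statespace.exponential_smoothing.ExponentialSmoothing); passing
# initialization_method="known" would route initial_level and friends through the
# float_like/array_like checks above.
import numpy as np
from statsmodels.tsa.statespace.exponential_smoothing import ExponentialSmoothing

y = 10 + np.cumsum(np.random.default_rng(0).normal(size=80))
mod = ExponentialSmoothing(y, trend=True, initialization_method="estimated")
res = mod.fit(disp=False)
print(res.params)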
Example #16
    def __init__(self, period: float, order: int):
        super().__init__(order)
        self._period = float_like(period, "period")
        if 2 * self._order > self._period:
            raise ValueError("2 * order must be <= period")
Example #17
def kdensity(
        x,
        kernel="gau",
        bw="normal_reference",
        weights=None,
        gridsize=None,
        adjust=1,
        clip=(-np.inf, np.inf),
        cut=3,
        retgrid=True,
):
    """
    Rosenblatt-Parzen univariate kernel density estimator.

    Parameters
    ----------
    x : array_like
        The variable for which the density estimate is desired.
    kernel : str
        The Kernel to be used. Choices are
        - "biw" for biweight
        - "cos" for cosine
        - "epa" for Epanechnikov
        - "gau" for Gaussian.
        - "tri" for triangular
        - "triw" for triweight
        - "uni" for uniform
    bw : str, float, callable
        The bandwidth to use. Choices are:

        - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "silverman" - .9 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "normal_reference" - C * A * nobs ** (-1/5.), where C is
          calculated from the kernel. Equivalent (up to 2 dp) to the
          "scott" bandwidth for gaussian kernels. See bandwidths.py
        - If a float is given, its value is used as the bandwidth.
        - If a callable is given, its return value is used.
          The callable should take exactly two parameters, i.e.,
          fn(x, kern), and return a float, where:

          * x - the clipped input data
          * kern - the kernel instance used

    weights : array or None
        Optional weights. If the x value is clipped, then this weight is
        also dropped.
    gridsize : int
        If gridsize is None, max(len(x), 50) is used.
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.
    clip : tuple
        Observations in x that are outside of the range given by clip are
        dropped. The number of observations in x is then shortened.
    cut : float
        Defines the length of the grid past the lowest and highest values of x
        so that the kernel goes to zero. The end points are
        -/+ cut*bw*{min(x) or max(x)}
    retgrid : bool
        Whether or not to return the grid over which the density is estimated.

    Returns
    -------
    density : ndarray
        The densities estimated at the grid points.
    grid : ndarray, optional
        The grid points at which the density is estimated.

    Notes
    -----
    Creates an intermediate (`gridsize` x `nobs`) array. Use FFT for a more
    computationally efficient version.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    clip_x = np.logical_and(x > clip[0], x < clip[1])
    x = x[clip_x]

    nobs = len(x)  # after trim

    if gridsize is None:
        gridsize = max(nobs, 50)  # do not need to resize if no FFT

    # handle weights
    if weights is None:
        weights = np.ones(nobs)
        q = nobs
    else:
        # ensure weights is a numpy array
        weights = np.asarray(weights)

        if len(weights) != len(clip_x):
            msg = "The length of the weights must be the same as the given x."
            raise ValueError(msg)
        weights = weights[clip_x.squeeze()]
        q = weights.sum()

    # Get kernel object corresponding to selection
    kern = kernel_switch[kernel]()

    if callable(bw):
        bw = float(bw(x, kern))
        # user passed a callable custom bandwidth function
    elif isinstance(bw, str):
        bw = bandwidths.select_bandwidth(x, bw, kern)
        # will cross-val fit this pattern?
    else:
        bw = float_like(bw, "bw")

    bw *= adjust

    a = np.min(x, axis=0) - cut * bw
    b = np.max(x, axis=0) + cut * bw
    grid = np.linspace(a, b, gridsize)

    k = (x.T -
         grid[:, None]) / bw  # uses broadcasting to make a gridsize x nobs

    # set kernel bandwidth
    kern.seth(bw)

    # truncate to domain
    if (kern.domain
            is not None):  # will not work for piecewise kernels like parzen
        z_lo, z_high = kern.domain
        domain_mask = (k < z_lo) | (k > z_high)
        k = kern(k)  # estimate density
        k[domain_mask] = 0
    else:
        k = kern(k)  # estimate density

    k[k < 0] = 0  # get rid of any negative values, do we need this?

    dens = np.dot(k, weights) / (q * bw)

    if retgrid:
        return dens, grid, bw
    else:
        return dens, bw
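
# Hedged sketch of the weights path, which the FFT-based variant earlier in this
# section does not implement (the function is assumed to be
# statsmodels.nonparametric.kde.kdensity).
import numpy as np
from statsmodels.nonparametric.kde import kdensity

rng = np.random.default_rng(0)
x = rng.standard_normal(500)
w = rng.uniform(0.5, 1.5, size=500)        # one weight per observation
dens, grid, bw = kdensity(x, kernel="gau", bw="normal_reference", weights=w)
print(bw, dens.max())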