Example #1
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution.  Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.

    Returns
    -------
    The normalized interquartile range
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")

    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    elif a.size == 0:
        return np.nan
    else:
        quantiles = np.quantile(a, [0.25, 0.75], axis=axis)
        return np.squeeze(np.diff(quantiles, axis=0) / c)
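
A minimal usage sketch of this estimator; the statsmodels.robust.scale import path is an assumption, and for normal data the normalized IQR should approximate the standard deviation:

import numpy as np
from statsmodels.robust.scale import iqr  # assumed import path for the function above

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=2.0, size=10_000)
print(iqr(x))  # close to the true standard deviation of 2.0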
Example #2
def anderson_statistic(x, dist='norm', fit=True, params=(), axis=0):
    """
    Calculate the Anderson-Darling a2 statistic.

    Parameters
    ----------
    x : array_like
        The data to test.
    dist : {'norm', callable}
        The assumed distribution of the data under the null hypothesis.
    fit : bool
        If True, then the distribution parameters are estimated.
        Currently only supported for 1-d data x, except when dist='norm'.
    params : tuple
        The distribution parameters used when fit is False.
    axis : int
        If dist is 'norm' or fit is False, the data can be n-dimensional
        and axis specifies the axis along which the statistic is computed.

    Returns
    -------
    {float, ndarray}
        The Anderson-Darling statistic.
    """
    x = array_like(x, 'x', ndim=None)
    fit = bool_like(fit, 'fit')
    axis = int_like(axis, 'axis')
    y = np.sort(x, axis=axis)
    nobs = y.shape[axis]
    if fit:
        if dist == 'norm':
            xbar = np.expand_dims(np.mean(x, axis=axis), axis)
            s = np.expand_dims(np.std(x, ddof=1, axis=axis), axis)
            w = (y - xbar) / s
            z = stats.norm.cdf(w)
        elif callable(dist):
            params = dist.fit(x)
            z = dist.cdf(y, *params)
        else:
            raise ValueError("dist must be 'norm' or a Callable")
    else:
        if callable(dist):
            z = dist.cdf(y, *params)
        else:
            raise ValueError('if fit is false, then dist must be callable')

    i = np.arange(1, nobs + 1)
    sl1 = [None] * x.ndim
    sl1[axis] = slice(None)
    sl1 = tuple(sl1)
    sl2 = [slice(None)] * x.ndim
    sl2[axis] = slice(None, None, -1)
    sl2 = tuple(sl2)
    s = np.sum((2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])),
               axis=axis)
    a2 = -nobs - s
    return a2
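
A quick sanity check on simulated normal data; the statsmodels.stats.diagnostic import path is an assumption:

import numpy as np
from statsmodels.stats.diagnostic import anderson_statistic  # assumed import path

rng = np.random.default_rng(0)
x = rng.normal(size=500)
a2 = anderson_statistic(x, dist='norm', fit=True)
print(a2)  # small values are consistent with normality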
Example #3
def mad(a, c=Gaussian.ppf(3 / 4.), axis=0, center=np.median):
    # c \approx .6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')
    if callable(center) and a.size:
        center = np.apply_over_axes(center, a, axis)
    elif not callable(center):
        # a float center is used as-is instead of being silently dropped
        center = float_like(center, "center")
    else:
        center = 0.0

    return np.median((np.abs(a - center)) / c, axis=axis)
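
A short sketch contrasting mad with the sample standard deviation on contaminated data (import path assumed to be statsmodels.robust.scale):

import numpy as np
from statsmodels.robust.scale import mad  # assumed import path

x = np.array([1.0, 2.0, 3.0, 4.0, 100.0])  # one gross outlier
print(np.std(x, ddof=1))  # inflated by the outlier
print(mad(x))             # robust scale estimate, barely affected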
Example #4
def runstest_1samp(x, cutoff='mean', correction=True):
    '''Use a runs test on data discretized as binary above/below the cutoff

    Parameters
    ----------
    x : array_like
        data, numeric
    cutoff : {'mean', 'median'} or number
        This specifies the cutoff to split the data into large and small
        values.
    correction : bool
        Following the SAS manual, for sample sizes below 50 the test
        statistic is corrected by 0.5. This can be turned off with
        correction=False; it was included to match R's tseries package,
        which does not use any correction.

    Returns
    -------
    z_stat : float
        test statistic, asymptotically normally distributed
    p-value : float
        p-value; reject the null hypothesis if it is below a type I error
        level, alpha.

    '''

    x = array_like(x, "x")
    if cutoff == 'mean':
        cutoff = np.mean(x)
    elif cutoff == 'median':
        cutoff = np.median(x)
    else:
        cutoff = float(cutoff)
    xindicator = (x >= cutoff).astype(int)
    return Runs(xindicator).runs_test(correction=correction)
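
A usage sketch; the statsmodels.sandbox.stats.runs import path is an assumption:

import numpy as np
from statsmodels.sandbox.stats.runs import runstest_1samp  # assumed import path

rng = np.random.default_rng(0)
x = rng.normal(size=100)
z_stat, pvalue = runstest_1samp(x, cutoff='mean', correction=True)
print(z_stat, pvalue)  # a large p-value is consistent with randomness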
Example #5
    def __init__(self,
                 endog,
                 exog,
                 window=None,
                 *,
                 weights=None,
                 min_nobs=None,
                 missing="drop",
                 expanding=False):
        # Call Model.__init__ twice to use const detection in first pass
        # But to not drop in the second pass
        missing = string_like(missing,
                              "missing",
                              options=("drop", "raise", "skip"))
        temp_msng = "drop" if missing != "raise" else "raise"
        Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
        k_const = self.k_constant
        const_idx = self.data.const_idx
        Model.__init__(self, endog, exog, missing="none", hasconst=False)
        self.k_constant = k_const
        self.data.const_idx = const_idx
        self._y = array_like(endog, "endog")
        nobs = self._y.shape[0]
        self._x = array_like(exog, "exog", ndim=2, shape=(nobs, None))
        window = int_like(window, "window", optional=True)
        weights = array_like(weights, "weights", optional=True, shape=(nobs, ))
        self._window = window if window is not None else self._y.shape[0]
        self._weighted = weights is not None
        self._weights = np.ones(nobs) if weights is None else weights
        w12 = np.sqrt(self._weights)
        self._wy = w12 * self._y
        self._wx = w12[:, None] * self._x

        min_nobs = int_like(min_nobs, "min_nobs", optional=True)
        self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
        if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
            raise ValueError("min_nobs must be larger than the number of "
                             "regressors in the model and less than window")

        self._expanding = expanding

        self._is_nan = np.zeros_like(self._y, dtype=bool)
        self._has_nan = self._find_nans()
        self.const_idx = self.data.const_idx
        self._skip_missing = missing == "skip"
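
This __init__ matches the rolling regression models in statsmodels; a usage sketch assuming the RollingOLS class from statsmodels.regression.rolling:

import numpy as np
from statsmodels.regression.rolling import RollingOLS  # assumed class/import path
from statsmodels.tools import add_constant

rng = np.random.default_rng(0)
exog = add_constant(rng.normal(size=(500, 2)))
endog = exog @ np.array([1.0, 0.5, -0.3]) + rng.normal(size=500)
res = RollingOLS(endog, exog, window=60).fit()
print(res.params.shape)  # one row of coefficients per window end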
Example #6
def test_mvmean_2indep(data1, data2):
    """Hotellings test for multivariate mean in two independent samples

    The null hypothesis is that both samples have the same mean.
    The alternative hypothesis is that means differ.

    Parameters
    ----------
    data1 : array_like
        first sample data with observations in rows and variables in columns
    data2 : array_like
        second sample data with observations in rows and variables in columns

    Returns
    -------
    results : instance of a results class with attributes
        statistic, pvalue, t2 and df
    """
    x1 = array_like(data1, "x1", ndim=2)
    x2 = array_like(data2, "x2", ndim=2)
    nobs1, k_vars = x1.shape
    nobs2, k_vars2 = x2.shape
    if k_vars2 != k_vars:
        msg = "both samples need to have the same number of columns"
        raise ValueError(msg)
    mean1 = x1.mean(0)
    mean2 = x2.mean(0)
    cov1 = np.cov(x1, rowvar=False, ddof=1)
    cov2 = np.cov(x2, rowvar=False, ddof=1)
    nobs_t = nobs1 + nobs2
    combined_cov = ((nobs1 - 1) * cov1 + (nobs2 - 1) * cov2) / (nobs_t - 2)
    diff = mean1 - mean2
    t2 = (nobs1 * nobs2) / nobs_t * diff @ np.linalg.solve(combined_cov, diff)
    factor = ((nobs_t - 2) * k_vars) / (nobs_t - k_vars - 1)
    statistic = t2 / factor
    df = (k_vars, nobs_t - 1 - k_vars)
    pvalue = stats.f.sf(statistic, df[0], df[1])
    return HolderTuple(statistic=statistic,
                       pvalue=pvalue,
                       df=df,
                       t2=t2,
                       distr="F")
Example #7
def isestimable(c, d):
    """
    True if (Q, P) contrast `c` is estimable for (N, P) design `d`.

    From a Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
    the contrast `C` is estimable by looking at the rank of ``vstack([C,D])``
    and verifying it is the same as the rank of `D`.

    Parameters
    ----------
    c : array_like
        A contrast matrix with shape (Q, P). If 1 dimensional assume shape is
        (1, P).
    d : array_like
        The design matrix, (N, P).

    Returns
    -------
    bool
        True if the contrast `c` is estimable on design `d`.

    Examples
    --------
    >>> d = np.array([[1, 1, 1, 0, 0, 0],
    ...               [0, 0, 0, 1, 1, 1],
    ...               [1, 1, 1, 1, 1, 1]]).T
    >>> isestimable([1, 0, 0], d)
    False
    >>> isestimable([1, -1, 0], d)
    True
    """
    c = array_like(c, 'c', maxdim=2)
    d = array_like(d, 'd', ndim=2)
    c = c[None, :] if c.ndim == 1 else c
    if c.shape[1] != d.shape[1]:
        raise ValueError('Contrast should have %d columns' % d.shape[1])
    new = np.vstack([c, d])
    if np.linalg.matrix_rank(new) != np.linalg.matrix_rank(d):
        return False
    return True
Example #8
 def __init__(self,
              endog,
              exog,
              window=None,
              weights=None,
              min_nobs=None,
              missing='drop'):
     # Call Model.__init__ twice to use const detection in first pass
     # But to not drop in the second pass
     missing = string_like(missing,
                           'missing',
                           options=('drop', 'raise', 'skip'))
     temp_msng = 'drop' if missing != 'raise' else 'raise'
     Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
     k_const = self.k_constant
     const_idx = self.data.const_idx
     Model.__init__(self, endog, exog, missing='none', hasconst=False)
     self.k_constant = k_const
     self.data.const_idx = const_idx
     self._y = array_like(endog, 'endog')
     nobs = self._y.shape[0]
      self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
     window = int_like(window, 'window', optional=True)
     weights = array_like(weights, 'weights', optional=True, shape=(nobs, ))
     self._window = window if window is not None else self._y.shape[0]
     self._weighted = weights is not None
     self._weights = np.ones(nobs) if weights is None else weights
     w12 = np.sqrt(self._weights)
     self._wy = w12 * self._y
     self._wx = w12[:, None] * self._x
      self._is_nan = np.zeros_like(self._y, dtype=bool)
     self._has_nan = self._find_nans()
     self.const_idx = self.data.const_idx
     self._skip_missing = missing == 'skip'
     min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
     self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
     if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
          raise ValueError('min_nobs must be at least the number of '
                           'regressors in the model and no larger than '
                           'window')
Example #9
 def __init__(self,
              endog,
              trend=None,
              damped=False,
              seasonal=None,
              seasonal_periods=None,
              dates=None,
              freq=None,
              missing='none'):
     super(ExponentialSmoothing, self).__init__(endog,
                                                None,
                                                dates,
                                                freq,
                                                missing=missing)
     self.endog = self.endog
     self._y = self._data = array_like(endog,
                                       'endog',
                                       contiguous=True,
                                       order='C')
     options = ("add", "mul", "additive", "multiplicative")
     trend = string_like(trend, 'trend', options=options, optional=True)
     if trend in ['additive', 'multiplicative']:
         trend = {'additive': 'add', 'multiplicative': 'mul'}[trend]
     self.trend = trend
     self.damped = bool_like(damped, 'damped')
     seasonal = string_like(seasonal,
                            'seasonal',
                            options=options,
                            optional=True)
     if seasonal in ['additive', 'multiplicative']:
         seasonal = {'additive': 'add', 'multiplicative': 'mul'}[seasonal]
     self.seasonal = seasonal
     self.trending = trend in ['mul', 'add']
     self.seasoning = seasonal in ['mul', 'add']
     if (self.trend == 'mul' or self.seasonal == 'mul') and \
             not np.all(self._data > 0.0):
          raise ValueError('endog must be strictly positive when using '
                           'multiplicative trend or seasonal components.')
     if self.damped and not self.trending:
         raise ValueError('Can only dampen the trend component')
     if self.seasoning:
         self.seasonal_periods = int_like(seasonal_periods,
                                          'seasonal_periods',
                                          optional=True)
         if seasonal_periods is None:
             self.seasonal_periods = freq_to_period(self._index_freq)
         if self.seasonal_periods <= 1:
             raise ValueError('seasonal_periods must be larger than 1.')
     else:
         self.seasonal_periods = 0
     self.nobs = len(self.endog)
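
A usage sketch assuming this is the Holt-Winters ExponentialSmoothing from statsmodels.tsa.holtwinters (note that newer releases rename some of the arguments shown above, e.g. damped to damped_trend):

import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing  # assumed import path

y = 10 + 0.5 * np.arange(48) + np.tile([0.0, 2.0, 4.0, 2.0], 12)  # trend + period-4 season
model = ExponentialSmoothing(y, trend='add', seasonal='add', seasonal_periods=4)
fit = model.fit()
print(fit.forecast(4))  # four steps ahead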
Example #10
def iqr(x1, x2, axis=0):
    """
    Interquartile range of error

    Parameters
    ----------
    x1 : array_like
       One of the inputs into the IQR calculation.
    x2 : array_like
       The other input into the IQR calculation.
    axis : {None, int}
       axis along which the summary statistic is calculated

    Returns
    -------
    iqr : {float, ndarray}
       Interquartile range along given axis.

    Notes
    -----
    If ``x1`` and ``x2`` have different shapes, then they must broadcast.
    """
    x1 = array_like(x1, 'x1', dtype=None, ndim=None)
    x2 = array_like(x2, 'x2', dtype=None, ndim=None)
    if axis is None:
        x1 = x1.ravel()
        x2 = x2.ravel()
        axis = 0
    xdiff = np.sort(x1 - x2, axis=axis)
    nobs = x1.shape[axis]
    idx = np.round((nobs - 1) * np.array([0.25, 0.75])).astype(int)
    sl = [slice(None)] * xdiff.ndim
    sl[axis] = idx
    iqr = np.diff(xdiff[tuple(sl)], axis=axis)
    iqr = np.squeeze(iqr)  # drop reduced dimension
    return iqr
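
This variant measures the spread of forecast errors; a sketch assuming it is the iqr from statsmodels.tools.eval_measures:

import numpy as np
from statsmodels.tools.eval_measures import iqr  # assumed import path for this variant

rng = np.random.default_rng(0)
actual = rng.normal(size=200)
forecast = actual + rng.normal(scale=0.1, size=200)
print(iqr(actual, forecast))  # interquartile range of the errors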
Example #11
def test_mvmean_2indep(data1, data2):
    """Hotellings test for multivariate mean in two samples

    Parameters
    ----------
    data1 : array_like
        first sample data with observations in rows and variables in columns
    data2 : array_like
        second sample data with observations in rows and variables in columns

    Returns
    -------
    results : instance of a results class with attributes
        statistic, pvalue, t2 and df
    """
    x1 = array_like(data1, "x1", ndim=2)
    x2 = array_like(data2, "x2", ndim=2)
    nobs_x, k_vars = x1.shape
    nobs_y, k_vars_y = x2.shape
    if k_vars_y != k_vars:
        raise ValueError("both samples need to have the same number of "
                         "columns")
    mean_x = x1.mean(0)
    mean_y = x2.mean(0)
    cov_x = np.cov(x1, rowvar=False, ddof=1)
    cov_y = np.cov(x2, rowvar=False, ddof=1)
    nobs_t = nobs_x + nobs_y
    combined_cov = ((nobs_x - 1) * cov_x + (nobs_y - 1) * cov_y) / (nobs_t - 2)
    diff = mean_x - mean_y
    t2 = (nobs_x * nobs_y) / nobs_t * diff @ (np.linalg.solve(combined_cov, diff))
    factor = ((nobs_t - 2) * k_vars) / (nobs_t - k_vars - 1)
    statistic = t2 / factor
    df = (k_vars, nobs_t - 1 - k_vars)
    pvalue = stats.f.sf(statistic, df[0], df[1])
    return HolderTuple(statistic=statistic,
                      pvalue=pvalue,
                      df=df,
                      t2=t2,
                      distr="F")
Example #12
    def __init__(self, x, kernel=None):
        x = array_like(x, "x", maxdim=2, contiguous=True)
        if x.ndim == 1:
            x = x[:, None]

        nobs, n_series = x.shape

        if kernel is None:
            kernel = kernels.Gaussian()  # no meaningful bandwidth yet

        if n_series > 1:
            if isinstance(kernel, kernels.CustomKernel):
                kernel = kernels.NdKernel(n_series, kernels=kernel)

        self.kernel = kernel
        self.n = n_series  #TODO change attribute
        self.x = x
Example #13
    def __init__(
        self,
        endog,
        *,
        period: Optional[int] = None,
        deseasonalize: bool = True,
        use_test: bool = True,
        method: str = "auto",
        difference: bool = False
    ) -> None:
        self._y = array_like(endog, "endog", ndim=1)
        if isinstance(endog, pd.DataFrame):
            self.endog_orig = endog.iloc[:, 0]
        else:
            self.endog_orig = endog
        self._period = int_like(period, "period", optional=True)
        self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
        self._use_test = (
            bool_like(use_test, "use_test") and self._deseasonalize
        )
        self._diff = bool_like(difference, "difference")
        self._method = string_like(
            method,
            "model",
            options=("auto", "additive", "multiplicative", "mul", "add"),
        )
        if self._period is None and self._deseasonalize:
            idx = getattr(endog, "index", None)
            pfreq = None
            if idx is not None:
                pfreq = getattr(idx, "freq", None)
                if pfreq is None:
                    pfreq = getattr(idx, "inferred_freq", None)
            if pfreq is not None:
                self._period = freq_to_period(pfreq)
            else:
                raise ValueError(
                    "You must specify a period or endog must be a "
                    "pandas object with a DatetimeIndex with "
                    "a freq not set to None"
                )

        self._has_seasonality = self._deseasonalize
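
This constructor resembles ThetaModel from statsmodels.tsa.forecasting.theta; a usage sketch under that assumption:

import numpy as np
import pandas as pd
from statsmodels.tsa.forecasting.theta import ThetaModel  # assumed class/import path

idx = pd.date_range("2000-01-31", periods=120, freq="M")
y = pd.Series(100 + np.arange(120) + 10 * np.sin(np.arange(120) * np.pi / 6), index=idx)
res = ThetaModel(y).fit()  # period inferred from the monthly DatetimeIndex
print(res.forecast(12))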
Example #14
def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
    """
    Computes the Qn robust estimator of scale

    The Qn scale estimator is a more efficient alternative to the MAD.
    The Qn scale estimator of an array a of length n is defined as
    c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
    the Qn estimator is the k-th order statistic of the absolute differences
    of the array. The optional constant is used to normalize the estimate
    as explained below. The implementation follows the algorithm described
    in Croux and Rousseeuw (1992).

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. The default value is used to get consistent
        estimates of the standard deviation at the normal distribution.
    axis : int, optional
        The default is 0.

    Returns
    -------
    {float, ndarray}
        The Qn robust estimator of scale
    """
    a = array_like(a,
                   "a",
                   ndim=None,
                   dtype=np.float64,
                   contiguous=True,
                   order="C")
    c = float_like(c, "c")
    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    elif a.size == 0:
        return np.nan
    else:
        out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
        if out.ndim == 0:
            return float(out)
        return out
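
A short sketch, assuming the statsmodels.robust.scale import path; for normal data the estimate should approach the true standard deviation:

import numpy as np
from statsmodels.robust.scale import qn_scale  # assumed import path

rng = np.random.default_rng(0)
x = rng.normal(scale=3.0, size=1000)
print(qn_scale(x))  # roughly 3.0 for normal data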
Example #15
def mad(a, c=Gaussian.ppf(3 / 4.0), axis=0, center=np.median):
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately 0.6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")
    if not a.size:
        center_val = 0.0
    elif callable(center):
        if axis is not None:
            center_val = np.apply_over_axes(center, a, axis)
        else:
            center_val = center(a.ravel())
    else:
        center_val = float_like(center, "center")
    err = (np.abs(a - center_val)) / c
    if not err.size:
        if axis is None or err.ndim == 1:
            return np.nan
        else:
            shape = list(err.shape)
            shape.pop(axis)
            return np.empty(shape)
    return np.median(err, axis=axis)
Example #16
def matrix_rank(m, tol=None, method="qr"):
    """
    Matrix rank calculation using QR or SVD

    Parameters
    ----------
    m : array_like
        A 2-d array-like object to test
    tol : float, optional
        The tolerance to use when testing the matrix rank. If not provided
        an appropriate value is selected.
    method : {"ip", "qr", "svd"}
        The method used. "ip" uses the inner-product of a normalized version
        of m and then computes the rank using NumPy's matrix_rank.
        "qr" uses a QR decomposition and is the default. "svd" defers to
        NumPy's matrix_rank.

    Returns
    -------
    int
        The rank of m.

    Notes
    -----
    When using a QR factorization, the rank is determined by the number of
    elements on the leading diagonal of the R matrix that are above tol
    in absolute value.
    """
    m = array_like(m, "m", ndim=2)
    if method == "ip":
        m = m[:, np.any(m != 0, axis=0)]
        m = m / np.sqrt((m**2).sum(0))
        m = m.T @ m
        return np.linalg.matrix_rank(m, tol=tol, hermitian=True)
    elif method == "qr":
        r, = scipy.linalg.qr(m, mode="r")
        abs_diag = np.abs(np.diag(r))
        if tol is None:
            tol = abs_diag[0] * m.shape[1] * np.finfo(float).eps
        return int((abs_diag > tol).sum())
    else:
        return np.linalg.matrix_rank(m, tol=tol)
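
Exercising the function above on a rank-deficient design (no import path assumed; this calls the definition shown here):

import numpy as np

m = np.column_stack([np.ones(10), np.arange(10.0), 2 * np.arange(10.0)])
print(matrix_rank(m, method="qr"))   # 2: the third column duplicates the second
print(matrix_rank(m, method="svd"))  # same rank via NumPy's SVD-based routine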
Example #17
def lpol2index(ar):
    """
    Remove zeros from lag polynomial

    Parameters
    ----------
    ar : array_like
        coefficients of lag polynomial

    Returns
    -------
    coeffs : array
        non-zero coefficients of lag polynomial
    index : array
        index (lags) of lag polynomial with non-zero elements
    """
    ar = array_like(ar, 'ar')
    index = np.nonzero(ar)[0]
    coeffs = ar[index]
    return coeffs, index
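
Exercising the function above on an AR lag polynomial with interior zeros:

import numpy as np

ar = np.array([1.0, 0.0, 0.0, -0.5])
coeffs, index = lpol2index(ar)
print(coeffs)  # [ 1.  -0.5]
print(index)   # [0 3]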
Example #18
def distance_indicators(x, epsilon=None, distance=1.5):
    """
    Calculate all pairwise threshold distance indicators for a time series

    Parameters
    ----------
    x : 1d array
        observations of time series for which heaviside distance indicators
        are calculated
    epsilon : scalar, optional
        the threshold distance to use in calculating the heaviside indicators
    distance : scalar, optional
        if epsilon is omitted, specifies the distance multiplier to use when
        computing it

    Returns
    -------
    indicators : 2d array
        matrix of distance threshold indicators

    Notes
    -----
    Since this can be a very large matrix, the result uses a compact boolean
    dtype to save space.

    """
    x = array_like(x, 'x')

    if epsilon is not None and epsilon <= 0:
        raise ValueError("Threshold distance must be positive if specified."
                         " Got epsilon of %f" % epsilon)
    if distance <= 0:
        raise ValueError("Threshold distance must be positive."
                         " Got distance multiplier %f" % distance)

    # TODO: add functionality to select epsilon optimally
    # TODO: and/or compute for a range of epsilons in [0.5*s, 2.0*s]?
    #      or [1.5*s, 2.0*s]?
    if epsilon is None:
        epsilon = distance * x.std(ddof=1)

    return np.abs(x[:, None] - x) < epsilon
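
A small sketch of the indicator matrix this produces (it underlies the BDS independence test in statsmodels):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=8)
ind = distance_indicators(x)   # epsilon defaults to 1.5 * std(x)
print(ind.shape, ind.dtype)    # (8, 8) bool; ind[i, j] is |x[i] - x[j]| < epsilon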
Example #19
    def test_1d(self, use_pandas):
        data = gen_data(1, use_pandas)
        a = array_like(data, "a")
        assert a.ndim == 1
        assert a.shape == (10, )
        assert type(a) is np.ndarray

        a = array_like(data, "a", ndim=1)
        assert a.ndim == 1
        a = array_like(data, "a", shape=(10, ))
        assert a.shape == (10, )
        a = array_like(data, "a", ndim=1, shape=(None, ))
        assert a.ndim == 1
        a = array_like(data, "a", ndim=2, shape=(10, 1))
        assert a.ndim == 2
        assert a.shape == (10, 1)

        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", shape=(5, ))
Example #20
def lpol2index(ar):
    """
    Remove zeros from lag polynomial

    Parameters
    ----------
    ar : array_like
        coefficients of lag polynomial

    Returns
    -------
    coeffs : ndarray
        non-zero coefficients of lag polynomial
    index : ndarray
        index (lags) of lag polynomial with non-zero elements
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", np.ComplexWarning)
        ar = array_like(ar, "ar")
    index = np.nonzero(ar)[0]
    coeffs = ar[index]
    return coeffs, index
Example #21
def iqr(a,
        c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4),
        axis=0,
        center=np.median):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution.  Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    The normalized interquartile range
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')

    if a.size == 0:
        return np.nan
    else:
        if callable(center):
            center = np.apply_over_axes(center, a, axis)
        else:
            # a float center is used as-is instead of being silently dropped
            center = float_like(center, "center")
        quantiles = np.quantile(a - center, [0.25, 0.75], axis=axis)
        return np.squeeze(np.diff(quantiles, axis=0) / c)
Example #22
    def test_right_squeeze_and_pad(self):
        data = np.empty((2, 1, 2))
        a = array_like(data, "a", ndim=3)
        assert a.shape == (2, 1, 2)
        data = np.empty((2))
        a = array_like(data, "a", ndim=3)
        assert a.shape == (2, 1, 1)
        data = np.empty((2, 1))
        a = array_like(data, "a", ndim=3)
        assert a.shape == (2, 1, 1)

        data = np.empty((2, 1, 1, 1))
        a = array_like(data, "a", ndim=3)
        assert a.shape == (2, 1, 1)

        data = np.empty((2, 1, 1, 2, 1, 1))
        with pytest.raises(ValueError):
            array_like(data, "a", ndim=3)
Example #23
 def test_dot(self, use_pandas):
     data = gen_data(2, use_pandas)
     a = array_like(data, "a")
     assert not isinstance(a.T.dot(data), array_like)
     assert not isinstance(a.T.dot(a), array_like)
Example #24
def seasonal_decompose(x, model="additive", filt=None, period=None,
                       two_sided=True, extrapolate_trend=0):
    """
    Seasonal decomposition using moving averages.

    Parameters
    ----------
    x : array_like
        Time series. If 2d, individual series are in columns. x must contain 2
        complete cycles.
    model : {"additive", "multiplicative"}, optional
        Type of seasonal component. Abbreviations are accepted.
    filt : array_like, optional
        The filter coefficients for filtering out the seasonal component.
        The concrete moving average method used in filtering is determined by
        two_sided.
    period : int, optional
        Period of the series. Must be used if x is not a pandas object or if
        the index of x does not have  a frequency. Overrides default
        periodicity of x if x is a pandas object with a timeseries index.
    two_sided : bool, optional
        The moving average method used in filtering.
        If True (default), a centered moving average is computed using the
        filt. If False, the filter coefficients are for past values only.
    extrapolate_trend : int or 'freq', optional
        If set to > 0, the trend resulting from the convolution is
        linear least-squares extrapolated on both ends (or the single one
        if two_sided is False) considering this many (+1) closest points.
        If set to 'freq', use `freq` closest points. Setting this parameter
        results in no NaN values in trend or resid components.

    Returns
    -------
    DecomposeResult
        An object with seasonal, trend, and resid attributes.

    See Also
    --------
    statsmodels.tsa.filters.bk_filter.bkfilter
    statsmodels.tsa.filters.cf_filter.cffilter
    statsmodels.tsa.filters.hp_filter.hpfilter
    statsmodels.tsa.filters.convolution_filter
    statsmodels.tsa.seasonal.STL

    Notes
    -----
    This is a naive decomposition. More sophisticated methods should
    be preferred.

    The additive model is Y[t] = T[t] + S[t] + e[t]

    The multiplicative model is Y[t] = T[t] * S[t] * e[t]

    The seasonal component is first removed by applying a convolution
    filter to the data. The average of this smoothed series for each
    period is the returned seasonal component.
    """
    pfreq = period
    pw = PandasWrapper(x)
    if period is None:
        pfreq = getattr(getattr(x, 'index', None), 'inferred_freq', None)

    x = array_like(x, 'x', maxdim=2)
    nobs = len(x)

    if not np.all(np.isfinite(x)):
        raise ValueError("This function does not handle missing values")
    if model.startswith('m'):
        if np.any(x <= 0):
            raise ValueError("Multiplicative seasonality is not appropriate "
                             "for zero and negative values")

    if period is None:
        if pfreq is not None:
            pfreq = freq_to_period(pfreq)
            period = pfreq
        else:
            raise ValueError("You must specify a period or x must be a "
                             "pandas object with a DatetimeIndex with "
                             "a freq not set to None")
    if x.shape[0] < 2 * pfreq:
        raise ValueError('x must have 2 complete cycles, which requires {0} '
                         'observations. x only has {1} '
                         'observation(s)'.format(2 * pfreq, x.shape[0]))

    if filt is None:
        if period % 2 == 0:  # split weights at ends
            filt = np.array([.5] + [1] * (period - 1) + [.5]) / period
        else:
            filt = np.repeat(1. / period, period)

    nsides = int(two_sided) + 1
    trend = convolution_filter(x, filt, nsides)

    if extrapolate_trend == 'freq':
        extrapolate_trend = period - 1

    if extrapolate_trend > 0:
        trend = _extrapolate_trend(trend, extrapolate_trend + 1)

    if model.startswith('m'):
        detrended = x / trend
    else:
        detrended = x - trend

    period_averages = seasonal_mean(detrended, period)

    if model.startswith('m'):
        period_averages /= np.mean(period_averages, axis=0)
    else:
        period_averages -= np.mean(period_averages, axis=0)

    seasonal = np.tile(period_averages.T, nobs // period + 1).T[:nobs]

    if model.startswith('m'):
        resid = x / seasonal / trend
    else:
        resid = detrended - seasonal

    results = []
    for s, name in zip((seasonal, trend, resid, x),
                       ('seasonal', 'trend', 'resid', None)):
        results.append(pw.wrap(s.squeeze(), columns=name))
    return DecomposeResult(seasonal=results[0], trend=results[1],
                           resid=results[2], observed=results[3])
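
A usage sketch with a synthetic monthly series (seasonal_decompose lives in statsmodels.tsa.seasonal):

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

idx = pd.date_range("2000-01-31", periods=48, freq="M")
y = pd.Series(10 + 0.1 * np.arange(48) + np.tile(np.arange(12.0), 4), index=idx)
res = seasonal_decompose(y, model="additive")
print(res.trend.dropna().head())  # NaNs at the ends unless extrapolate_trend is set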
Example #25
    def __init__(
        self,
        data: Union[np.ndarray, pd.Series, pd.DataFrame],
        stats: Union[Sequence[str], None] = None,
        *,
        numeric: bool = True,
        categorical: bool = True,
        alpha: float = 0.05,
        use_t: bool = False,
        percentiles: Sequence[Union[int, float]] = PERCENTILES,
        ntop: int = 5,
    ):
        data_arr = data
        if not isinstance(data, (pd.Series, pd.DataFrame)):
            data_arr = array_like(data, "data", maxdim=2)
        if data_arr.ndim == 1:
            data = pd.Series(data)
        numeric = bool_like(numeric, "numeric")
        categorical = bool_like(categorical, "categorical")
        include = []
        col_types = ""
        if numeric:
            include.append(np.number)
            col_types = "numeric"
        if categorical:
            include.append("category")
            col_types += "and " if col_types != "" else ""
            col_types += "categorical"
        if not numeric and not categorical:
            raise ValueError(
                "At least one of numeric and categorical must be True"
            )
        self._data = pd.DataFrame(data).select_dtypes(include)
        if self._data.shape[1] == 0:
            raise ValueError(
                f"Selecting {col_types} results in an empty DataFrame"
            )
        self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
        self._is_cat_like = [
            is_categorical_dtype(dt) for dt in self._data.dtypes
        ]

        if stats is not None:
            undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
            if undef:
                raise ValueError(
                    f"{', '.join(undef)} are not known statistics"
                )
        self._stats = (
            list(DEFAULT_STATISTICS) if stats is None else list(stats)
        )
        self._ntop = int_like(ntop, "ntop")
        self._compute_top = "top" in self._stats
        self._compute_freq = "freq" in self._stats
        if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
            raise ValueError("top must be a non-negative integer")

        self._compute_perc = "percentiles" in self._stats
        self._percentiles = array_like(
            percentiles, "percentiles", maxdim=1, dtype="d"
        )
        self._percentiles = np.sort(self._percentiles)
        if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
            raise ValueError("percentiles must be distinct")
        if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
            raise ValueError("percentiles must be strictly between 0 and 100")

        # Expand special stats
        replacements = {
            "mode": ["mode", "mode_freq"],
            "ci": ["upper_ci", "lower_ci"],
            "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
            "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
            "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
            "percentiles": [f"{i}%" for i in percentiles],
        }

        for key in replacements:
            if key in self._stats:
                idx = self._stats.index(key)
                self._stats = (
                    self._stats[:idx]
                    + replacements[key]
                    + self._stats[idx + 1 :]
                )

        self._alpha = float_like(alpha, "alpha")
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1")
        self._use_t = bool_like(use_t, "use_t")
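
A usage sketch, assuming this is the Description class from statsmodels.stats.descriptivestats; the stat names passed below are assumed to be among its documented defaults:

import numpy as np
import pandas as pd
from statsmodels.stats.descriptivestats import Description  # assumed class/import path

df = pd.DataFrame({"x": np.random.default_rng(0).normal(size=100)})
desc = Description(df, stats=["nobs", "mean", "std", "percentiles"])
print(desc.frame)  # one summary column per variable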
Example #26
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = 'forward',
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False
) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame,
                                                           DataFrame]:
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1), ] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags
Example #27
    def test_3d(self):
        data = gen_data(3, False)
        a = array_like(data, "a", ndim=3)
        assert a.shape == (5, 6, 7)
        assert a.ndim == 3
        assert type(a) is np.ndarray

        a = array_like(data, "a", ndim=3, shape=(5, None, 7))
        assert a.shape == (5, 6, 7)
        a = array_like(data, "a", ndim=3, shape=(None, None, 7))
        assert a.shape == (5, 6, 7)
        a = array_like(data, "a", ndim=5)
        assert a.shape == (5, 6, 7, 1, 1)
        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=3, shape=(10, ))
        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=3, shape=(None, None, 5))
        match = "a is required to have ndim 2 but has ndim 3"
        with pytest.raises(ValueError, match=match):
            array_like(data, "a", ndim=2)
        match = "a must have ndim <= 1"
        with pytest.raises(ValueError, match=match):
            array_like(data, "a", maxdim=1)
        match = "a must have ndim <= 2"
        with pytest.raises(ValueError, match=match):
            array_like(data, "a", maxdim=2)
Example #28
    def test_2d(self, use_pandas):
        data = gen_data(2, use_pandas)
        a = array_like(data, "a", ndim=2)
        assert a.ndim == 2
        assert a.shape == (20, 10)
        assert type(a) is np.ndarray

        a = array_like(data, "a", ndim=2)
        assert a.ndim == 2
        a = array_like(data, "a", ndim=2, shape=(20, None))
        assert a.shape == (20, 10)
        a = array_like(data, "a", ndim=2, shape=(20, ))
        assert a.shape == (20, 10)
        a = array_like(data, "a", ndim=2, shape=(None, 10))
        assert a.shape == (20, 10)

        a = array_like(data, "a", ndim=2, shape=(None, None))
        assert a.ndim == 2
        a = array_like(data, "a", ndim=3)
        assert a.ndim == 3
        assert a.shape == (20, 10, 1)

        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=2, shape=(10, ))
        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=2, shape=(20, 20))
        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=2, shape=(None, 20))
        match = "a is required to have ndim 1 but has ndim 2"
        with pytest.raises(ValueError, match=match):
            array_like(data, "a", ndim=1)
        match = "a must have ndim <= 1"
        with pytest.raises(ValueError, match=match):
            array_like(data, "a", maxdim=1)
Example #29
 def test_contiguous(self):
     x = np.arange(10)
     y = x[::2]
     a = array_like(y, "a", contiguous=True)
     assert not y.flags["C_CONTIGUOUS"]
     assert a.flags["C_CONTIGUOUS"]
Example #30
 def test_slice(self, use_pandas):
     data = gen_data(2, use_pandas)
     a = array_like(data, "a", ndim=2)
     assert type(a[1:]) is np.ndarray