def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0):
    """
    Normalized interquartile range of an array along an axis.

    Parameters
    ----------
    a : array_like
        Input data. Must have at least one dimension.
    c : float, optional
        Constant the raw interquartile range is divided by. The default is
        ``Gaussian.ppf(3/4) - Gaussian.ppf(1/4)`` (approximately 1.349),
        which makes the result a consistent estimate of the standard
        deviation for normally distributed data.
    axis : int, optional
        Axis along which the quartiles are computed. The default is 0.
        Can also be None.

    Returns
    -------
    {float, ndarray}
        The 75% quantile minus the 25% quantile, divided by ``c``, with
        singleton dimensions squeezed out. ``np.nan`` if ``a`` is empty.

    Raises
    ------
    ValueError
        If ``a`` is zero-dimensional.
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")

    # Guard clauses for degenerate inputs
    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    if a.size == 0:
        return np.nan

    # Row 0 holds the lower quartile, row 1 the upper quartile
    quartiles = np.quantile(a, [0.25, 0.75], axis=axis)
    spread = np.diff(quartiles, axis=0)
    return np.squeeze(spread / c)
def anderson_statistic(x, dist='norm', fit=True, params=(), axis=0):
    """
    Calculate the Anderson-Darling a2 statistic.

    Parameters
    ----------
    x : array_like
        The data to test.
    dist : {'norm', callable}
        The assumed distribution under the null of test statistic. When not
        'norm', the object must provide ``cdf`` (and ``fit`` if ``fit`` is
        True).
    fit : bool
        If True, then the distribution parameters are estimated.
        Currently only for 1d data x, except in case dist='norm'.
    params : tuple
        The optional distribution parameters if fit is False.
    axis : int
        If dist is 'norm' or fit is False, then data can be an n-dimensional
        and axis specifies the axis of a variable.

    Returns
    -------
    {float, ndarray}
        The Anderson-Darling statistic.

    Raises
    ------
    ValueError
        If ``dist`` is neither 'norm' nor callable when ``fit`` is True, or
        is not callable when ``fit`` is False.
    """
    x = array_like(x, 'x', ndim=None)
    fit = bool_like(fit, 'fit')
    axis = int_like(axis, 'axis')
    y = np.sort(x, axis=axis)
    nobs = y.shape[axis]
    if fit:
        if dist == 'norm':
            # Standardize with the sample mean and (ddof=1) standard deviation
            xbar = np.expand_dims(np.mean(x, axis=axis), axis)
            s = np.expand_dims(np.std(x, ddof=1, axis=axis), axis)
            w = (y - xbar) / s
            z = stats.norm.cdf(w)
        elif callable(dist):
            params = dist.fit(x)
            z = dist.cdf(y, *params)
        else:
            raise ValueError("dist must be 'norm' or a Callable")
    else:
        if callable(dist):
            z = dist.cdf(y, *params)
        else:
            raise ValueError('if fit is false, then dist must be callable')

    i = np.arange(1, nobs + 1)
    # sl1 reshapes ``i`` so it broadcasts along ``axis`` only
    sl1 = [None] * x.ndim
    sl1[axis] = slice(None)
    sl1 = tuple(sl1)
    # sl2 reverses ``z`` along ``axis``
    sl2 = [slice(None)] * x.ndim
    sl2[axis] = slice(None, None, -1)
    sl2 = tuple(sl2)
    s = np.sum((2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])),
               axis=axis)
    a2 = -nobs - s
    return a2
def mad(a, c=Gaussian.ppf(3 / 4.), axis=0, center=np.median):
    # c \approx .6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes; when ``axis`` is None the callable is applied
        to the flattened array. Otherwise, provide a float, which is used
        as the center directly.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')
    if not a.size:
        # Nothing to center; matches the historical behavior for empty input
        center_val = 0.0
    elif callable(center):
        if axis is not None:
            center_val = np.apply_over_axes(center, a, axis)
        else:
            # np.apply_over_axes cannot handle axis=None
            center_val = center(a.ravel())
    else:
        # A user-supplied numeric center must be honored, not replaced by 0
        center_val = float_like(center, 'center')

    return np.median((np.abs(a - center_val)) / c, axis=axis)
def runstest_1samp(x, cutoff='mean', correction=True):
    '''Runs test on data dichotomized at a cutoff

    Parameters
    ----------
    x : array_like
        Numeric data.
    cutoff : {'mean', 'median'} or number
        Threshold used to split the data. Observations greater than or
        equal to the threshold are coded 1, all others 0. The strings
        select the sample mean or the sample median of ``x``.
    correction : bool
        Passed through to ``Runs.runs_test``. Following the SAS manual, for
        samplesize below 50, the test statistic is corrected by 0.5. This
        can be turned off with correction=False, and was included to match
        R, tseries, which does not use any correction.

    Returns
    -------
    z_stat : float
        test statistic, asymptotically normally distributed
    p-value : float
        p-value, reject the null hypothesis if it is below an type 1 error
        level, alpha .
    '''
    x = array_like(x, "x")

    if cutoff == 'mean':
        threshold = np.mean(x)
    elif cutoff == 'median':
        threshold = np.median(x)
    else:
        threshold = float(cutoff)

    at_or_above = x >= threshold
    runs = Runs(at_or_above.astype(int))
    return runs.runs_test(correction=correction)
def __init__(self, endog, exog, window=None, *, weights=None, min_nobs=None, missing="drop", expanding=False):
    """
    Validate inputs and store the (weighted) data for a rolling regression.

    Parameters
    ----------
    endog : array_like
        Dependent variable.
    exog : array_like
        Regressors; validated as 2-d with as many rows as ``endog``.
    window : int, optional
        Window length. Defaults to the number of observations in ``endog``.
    weights : array_like, optional
        Weights with shape (nobs,). Defaults to ones. The data are scaled
        by the square root of the weights.
    min_nobs : int, optional
        Defaults to the number of columns of ``exog``. Must not be smaller
        than that number nor larger than ``window``.
    missing : {"drop", "raise", "skip"}
        Missing-value option. Anything other than "raise" is handled as
        "drop" during the first (constant-detecting) ``Model.__init__``
        pass; "skip" is recorded in ``self._skip_missing``.
    expanding : bool
        Stored as ``self._expanding``.

    Raises
    ------
    ValueError
        If ``min_nobs`` is outside the allowed range.
    """
    # Call Model.__init__ twice to use const detection in first pass
    # But to not drop in the second pass
    missing = string_like(missing, "missing", options=("drop", "raise", "skip"))
    temp_msng = "drop" if missing != "raise" else "raise"
    Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
    k_const = self.k_constant
    const_idx = self.data.const_idx
    Model.__init__(self, endog, exog, missing="none", hasconst=False)
    # Restore the constant information found in the first pass
    self.k_constant = k_const
    self.data.const_idx = const_idx

    self._y = array_like(endog, "endog")
    nobs = self._y.shape[0]
    # Validate under the argument's real name so error messages are accurate
    self._x = array_like(exog, "exog", ndim=2, shape=(nobs, None))
    window = int_like(window, "window", optional=True)
    weights = array_like(weights, "weights", optional=True, shape=(nobs, ))
    self._window = window if window is not None else self._y.shape[0]
    self._weighted = weights is not None
    self._weights = np.ones(nobs) if weights is None else weights
    w12 = np.sqrt(self._weights)
    self._wy = w12 * self._y
    self._wx = w12[:, None] * self._x

    min_nobs = int_like(min_nobs, "min_nobs", optional=True)
    self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
    if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
        raise ValueError("min_nobs must be larger than the number of "
                         "regressors in the model and less than window")

    self._expanding = expanding
    self._is_nan = np.zeros_like(self._y, dtype=bool)
    self._has_nan = self._find_nans()
    self.const_idx = self.data.const_idx
    self._skip_missing = missing == "skip"
def test_mvmean_2indep(data1, data2):
    """Hotellings test for multivariate mean in two independent samples

    The null hypothesis is that both samples have the same mean.
    The alternative hypothesis is that means differ.

    Parameters
    ----------
    data1 : array_like
        first sample data with observations in rows and variables in columns
    data2 : array_like
        second sample data with observations in rows and variables in columns

    Returns
    -------
    results : instance of a results class with attributes
        statistic, pvalue, t2 and df

    Raises
    ------
    ValueError
        If the two samples do not have the same number of columns.
    """
    x1 = array_like(data1, "x1", ndim=2)
    x2 = array_like(data2, "x2", ndim=2)
    (nobs1, k_vars), (nobs2, k_vars2) = x1.shape, x2.shape
    if k_vars != k_vars2:
        raise ValueError(
            "both samples need to have the same number of columns"
        )

    nobs_t = nobs1 + nobs2

    # Pooled covariance of the two samples
    cov1 = np.cov(x1, rowvar=False, ddof=1)
    cov2 = np.cov(x2, rowvar=False, ddof=1)
    combined_cov = ((nobs1 - 1) * cov1 + (nobs2 - 1) * cov2) / (nobs_t - 2)

    diff = x1.mean(0) - x2.mean(0)
    scale = (nobs1 * nobs2) / nobs_t
    t2 = scale * diff @ np.linalg.solve(combined_cov, diff)

    # Convert T^2 to an F-distributed statistic
    factor = ((nobs_t - 2) * k_vars) / (nobs_t - k_vars - 1)
    statistic = t2 / factor
    df = (k_vars, nobs_t - 1 - k_vars)
    pvalue = stats.f.sf(statistic, *df)
    return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, t2=t2,
                       distr="F")
def isestimable(c, d):
    """
    Check whether a (Q, P) contrast `c` is estimable for an (N, P) design `d`.

    The contrast is stacked on top of the design matrix; it is estimable
    exactly when the rank of ``vstack([c, d])`` equals the rank of `d`.

    Parameters
    ----------
    c : array_like
        A contrast matrix with shape (Q, P). If 1 dimensional assume shape is
        (1, P).
    d : array_like
        The design matrix, (N, P).

    Returns
    -------
    bool
        True if the contrast `c` is estimable on design `d`.

    Raises
    ------
    ValueError
        If `c` does not have as many columns as `d`.

    Examples
    --------
    >>> d = np.array([[1, 1, 1, 0, 0, 0],
    ...               [0, 0, 0, 1, 1, 1],
    ...               [1, 1, 1, 1, 1, 1]]).T
    >>> isestimable([1, 0, 0], d)
    False
    >>> isestimable([1, -1, 0], d)
    True
    """
    c = array_like(c, 'c', maxdim=2)
    d = array_like(d, 'd', ndim=2)
    if c.ndim == 1:
        # Promote a single contrast vector to a one-row matrix
        c = c[None, :]

    n_cols = d.shape[1]
    if c.shape[1] != n_cols:
        raise ValueError('Contrast should have %d columns' % n_cols)

    stacked_rank = np.linalg.matrix_rank(np.vstack([c, d]))
    design_rank = np.linalg.matrix_rank(d)
    return bool(stacked_rank == design_rank)
def __init__(self, endog, exog, window=None, weights=None, min_nobs=None, missing='drop'):
    """
    Validate inputs and store the (weighted) data for a rolling regression.

    Parameters
    ----------
    endog : array_like
        Dependent variable.
    exog : array_like
        Regressors; validated as 2-d with as many rows as ``endog``.
    window : int, optional
        Window length. Defaults to the number of observations in ``endog``.
    weights : array_like, optional
        Weights with shape (nobs,). Defaults to ones. The data are scaled
        by the square root of the weights.
    min_nobs : int, optional
        Defaults to the number of columns of ``exog``. Must not be smaller
        than that number nor larger than ``window``.
    missing : {'drop', 'raise', 'skip'}
        Missing-value option. Anything other than 'raise' is handled as
        'drop' during the first (constant-detecting) ``Model.__init__``
        pass; 'skip' is recorded in ``self._skip_missing``.

    Raises
    ------
    ValueError
        If ``min_nobs`` is outside the allowed range.
    """
    # Call Model.__init__ twice to use const detection in first pass
    # But to not drop in the second pass
    missing = string_like(missing, 'missing', options=('drop', 'raise', 'skip'))
    temp_msng = 'drop' if missing != 'raise' else 'raise'
    Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
    k_const = self.k_constant
    const_idx = self.data.const_idx
    Model.__init__(self, endog, exog, missing='none', hasconst=False)
    # Restore the constant information found in the first pass
    self.k_constant = k_const
    self.data.const_idx = const_idx

    self._y = array_like(endog, 'endog')
    nobs = self._y.shape[0]
    # Validate under the argument's real name so error messages are accurate
    self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
    window = int_like(window, 'window', optional=True)
    weights = array_like(weights, 'weights', optional=True, shape=(nobs, ))
    self._window = window if window is not None else self._y.shape[0]
    self._weighted = weights is not None
    self._weights = np.ones(nobs) if weights is None else weights
    w12 = np.sqrt(self._weights)
    self._wy = w12 * self._y
    self._wx = w12[:, None] * self._x

    # The ``np.bool`` alias was removed from NumPy; the builtin is equivalent
    self._is_nan = np.zeros_like(self._y, dtype=bool)
    self._has_nan = self._find_nans()
    self.const_idx = self.data.const_idx
    self._skip_missing = missing == 'skip'

    min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
    self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
    if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
        raise ValueError('min_nobs must be larger than the number of '
                         'regressors in the model and less than window')
def __init__(self, endog, trend=None, damped=False, seasonal=None, seasonal_periods=None, dates=None, freq=None, missing='none'):
    """
    Validate and store the configuration of an exponential smoothing model.

    Parameters
    ----------
    endog : array_like
        The series to model.
    trend : {"add", "mul", "additive", "multiplicative", None}, optional
        Type of trend component. Long names are normalized to "add"/"mul".
    damped : bool, optional
        Whether the trend component is damped. Requires a trend.
    seasonal : {"add", "mul", "additive", "multiplicative", None}, optional
        Type of seasonal component. Long names are normalized to
        "add"/"mul".
    seasonal_periods : int, optional
        Number of periods in a seasonal cycle. Only used when a seasonal
        component is requested; if None it is derived from the index
        frequency with ``freq_to_period``.
    dates, freq, missing
        Passed through to the parent class initializer.

    Raises
    ------
    ValueError
        If a multiplicative component is requested and ``endog`` is not
        strictly positive, if ``damped`` is True without a trend, or if
        the seasonal period is not larger than 1.
    """
    super(ExponentialSmoothing, self).__init__(endog, None, dates, freq,
                                               missing=missing)
    # NOTE(review): looks like a no-op self-assignment; kept in case
    # ``endog`` is a descriptor on a parent class -- confirm before removing.
    self.endog = self.endog
    self._y = self._data = array_like(endog, 'endog', contiguous=True,
                                      order='C')

    options = ("add", "mul", "additive", "multiplicative")
    long_to_short = {'additive': 'add', 'multiplicative': 'mul'}

    trend = string_like(trend, 'trend', options=options, optional=True)
    trend = long_to_short.get(trend, trend)
    self.trend = trend
    self.damped = bool_like(damped, 'damped')

    seasonal = string_like(seasonal, 'seasonal', options=options,
                           optional=True)
    seasonal = long_to_short.get(seasonal, seasonal)
    self.seasonal = seasonal

    self.trending = trend in ['mul', 'add']
    self.seasoning = seasonal in ['mul', 'add']

    if (self.trend == 'mul' or self.seasonal == 'mul') and \
            not np.all(self._data > 0.0):
        # Trailing space keeps the two literals from running together
        raise ValueError('endog must be strictly positive when using '
                         'multiplicative trend or seasonal components.')
    if self.damped and not self.trending:
        raise ValueError('Can only dampen the trend component')

    if self.seasoning:
        self.seasonal_periods = int_like(seasonal_periods,
                                         'seasonal_periods', optional=True)
        if seasonal_periods is None:
            self.seasonal_periods = freq_to_period(self._index_freq)
        if self.seasonal_periods <= 1:
            raise ValueError('seasonal_periods must be larger than 1.')
    else:
        self.seasonal_periods = 0
    self.nobs = len(self.endog)
def iqr(x1, x2, axis=0):
    """
    Interquartile range of error

    Parameters
    ----------
    x1 : array_like
        One of the inputs into the IQR calculation.
    x2 : array_like
        The other input into the IQR calculation.
    axis : {None, int}
        axis along which the summary statistic is calculated. If None, the
        difference ``x1 - x2`` is flattened first.

    Returns
    -------
    irq : {float, ndarray}
        Interquartile range along given axis.

    Notes
    -----
    If ``x1`` and ``x2`` have different shapes, then they must broadcast.

    The quartiles are taken as the order statistics of the sorted
    difference at positions ``round((nobs - 1) * q)`` for q in
    (0.25, 0.75); no interpolation is performed.
    """
    x1 = array_like(x1, 'x1', dtype=None, ndim=None)
    x2 = array_like(x2, 'x2', dtype=None, ndim=None)

    # Subtract before flattening so inputs of different shape can broadcast
    xdiff = x1 - x2
    if axis is None:
        xdiff = xdiff.ravel()
        axis = 0
    xdiff = np.sort(xdiff, axis=axis)

    # Use the broadcast result's length, not x1's, along the axis
    nobs = xdiff.shape[axis]
    idx = np.round((nobs - 1) * np.array([0.25, 0.75])).astype(int)
    sl = [slice(None)] * xdiff.ndim
    sl[axis] = idx
    out = np.diff(xdiff[tuple(sl)], axis=axis)
    out = np.squeeze(out)  # drop reduced dimension
    return out
def test_mvmean_2indep(data1, data2):
    """Hotellings test for multivariate mean in two samples

    Parameters
    ----------
    data1 : array_like
        first sample data with observations in rows and variables in columns
    data2 : array_like
        second sample data with observations in rows and variables in columns

    Returns
    -------
    results : instance of a results class with attributes
        statistic, pvalue, t2 and df

    Raises
    ------
    ValueError
        If the two samples do not have the same number of columns.
    """
    x1 = array_like(data1, "x1", ndim=2)
    x2 = array_like(data2, "x2", ndim=2)
    nobs_x, k_vars = x1.shape
    nobs_y, k_vars_y = x2.shape
    # Fail early and clearly instead of silently using the second sample's
    # column count for the degrees of freedom
    if k_vars_y != k_vars:
        msg = "both samples need to have the same number of columns"
        raise ValueError(msg)

    mean_x = x1.mean(0)
    mean_y = x2.mean(0)
    cov_x = np.cov(x1, rowvar=False, ddof=1)
    cov_y = np.cov(x2, rowvar=False, ddof=1)
    nobs_t = nobs_x + nobs_y
    # Pooled covariance of the two samples
    combined_cov = ((nobs_x - 1) * cov_x + (nobs_y - 1) * cov_y) / (nobs_t - 2)
    diff = mean_x - mean_y
    t2 = (nobs_x * nobs_y) / nobs_t * diff @ (np.linalg.solve(combined_cov, diff))
    # Convert T^2 to an F-distributed statistic
    factor = ((nobs_t - 2) * k_vars) / (nobs_t - k_vars - 1)
    statistic = t2 / factor
    df = (k_vars, nobs_t - 1 - k_vars)
    pvalue = stats.f.sf(statistic, df[0], df[1])
    return HolderTuple(statistic=statistic, pvalue=pvalue, df=df, t2=t2,
                       distr="F")
def __init__(self, x, kernel=None):
    """
    Store the data and the kernel.

    Parameters
    ----------
    x : array_like
        Data with at most two dimensions. 1-d input is turned into a
        single column.
    kernel : optional
        Kernel object. Defaults to ``kernels.Gaussian()``. When there is
        more than one column and the kernel is a ``kernels.CustomKernel``,
        it is wrapped in a ``kernels.NdKernel``.
    """
    x = array_like(x, "x", maxdim=2, contiguous=True)
    if x.ndim == 1:
        x = x[:, None]
    _, n_series = x.shape

    if kernel is None:
        kernel = kernels.Gaussian()  # no meaningful bandwidth yet

    needs_nd_wrapper = n_series > 1 and isinstance(kernel,
                                                   kernels.CustomKernel)
    if needs_nd_wrapper:
        kernel = kernels.NdKernel(n_series, kernels=kernel)

    self.kernel = kernel
    self.n = n_series  # TODO change attribute
    self.x = x
def __init__(
    self,
    endog,
    *,
    period: Optional[int] = None,
    deseasonalize: bool = True,
    use_test: bool = True,
    method: str = "auto",
    difference: bool = False
) -> None:
    """
    Validate and store the model configuration.

    Parameters
    ----------
    endog : array_like
        The series to model; validated as 1-d. If a DataFrame is given,
        its first column is kept as ``endog_orig``.
    period : int, optional
        Seasonal period. If None and ``deseasonalize`` is True, it is
        derived from the ``freq`` (or ``inferred_freq``) of ``endog.index``.
    deseasonalize : bool
        Whether to deseasonalize the data.
    use_test : bool
        Only effective when ``deseasonalize`` is True.
    method : {"auto", "additive", "multiplicative", "mul", "add"}
        Stored as ``self._method``.
    difference : bool
        Stored as ``self._diff``.

    Raises
    ------
    ValueError
        If a period is needed but was neither given nor can be read from
        the index of ``endog``.
    """
    self._y = array_like(endog, "endog", ndim=1)
    if isinstance(endog, pd.DataFrame):
        self.endog_orig = endog.iloc[:, 0]
    else:
        self.endog_orig = endog
    self._period = int_like(period, "period", optional=True)
    self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
    self._use_test = (
        bool_like(use_test, "use_test") and self._deseasonalize
    )
    self._diff = bool_like(difference, "difference")
    # Validate under the argument's real name so error messages are accurate
    self._method = string_like(
        method,
        "method",
        options=("auto", "additive", "multiplicative", "mul", "add"),
    )
    if self._period is None and self._deseasonalize:
        idx = getattr(endog, "index", None)
        pfreq = None
        if idx is not None:
            # Prefer an explicit freq, fall back to the inferred one
            pfreq = getattr(idx, "freq", None)
            if pfreq is None:
                pfreq = getattr(idx, "inferred_freq", None)
        if pfreq is not None:
            self._period = freq_to_period(pfreq)
        else:
            raise ValueError(
                "You must specify a period or endog must be a "
                "pandas object with a DatetimeIndex with "
                "a freq not set to None"
            )

    self._has_seasonality = self._deseasonalize
def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
    """
    Computes the Qn robust estimator of scale

    The Qn scale estimator is a more efficient alternative to the MAD.
    The Qn scale estimator of an array a of length n is defined as
    c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
    the Qn estimator is the k-th order statistic of the absolute differences
    of the array. The optional constant is used to normalize the estimate
    as explained below. The implementation follows the algorithm described
    in Croux and Rousseeuw (1992).

    Parameters
    ----------
    a : array_like
        Input array. Must have at least one dimension.
    c : float, optional
        The normalization constant. The default value is used to get
        consistent estimates of the standard deviation at the normal
        distribution.
    axis : int, optional
        Axis along which the estimator is applied. The default is 0.

    Returns
    -------
    {float, ndarray}
        The Qn robust estimator of scale. ``np.nan`` if ``a`` is empty.

    Raises
    ------
    ValueError
        If ``a`` is zero-dimensional.
    """
    a = array_like(
        a, "a", ndim=None, dtype=np.float64, contiguous=True, order="C"
    )
    c = float_like(c, "c")

    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    if a.size == 0:
        return np.nan

    result = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
    # Hand back a plain float for 1-d input
    return float(result) if result.ndim == 0 else result
def mad(a, c=Gaussian.ppf(3 / 4.0), axis=0, center=np.median):
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately 0.6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`. For empty input the result is
        NaN: a scalar when ``axis`` is None or ``a`` is 1-d, otherwise an
        array of NaN with ``axis`` removed from the shape.
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")
    if not a.size:
        center_val = 0.0
    elif callable(center):
        if axis is not None:
            center_val = np.apply_over_axes(center, a, axis)
        else:
            center_val = center(a.ravel())
    else:
        center_val = float_like(center, "center")
    err = (np.abs(a - center_val)) / c
    if not err.size:
        if axis is None or err.ndim == 1:
            return np.nan
        else:
            shape = list(err.shape)
            shape.pop(axis)
            # np.empty would expose uninitialized memory whenever the
            # reduced shape is non-empty, e.g. input shape (0, 3), axis=0
            return np.full(shape, np.nan)
    return np.median(err, axis=axis)
def matrix_rank(m, tol=None, method="qr"):
    """
    Matrix rank calculation using QR or SVD

    Parameters
    ----------
    m : array_like
        A 2-d array-like object to test
    tol : float, optional
        The tolerance to use when testing the matrix rank. If not provided
        an appropriate value is selected.
    method : {"ip", "qr", "svd"}
        The method used. "ip" uses the inner-product of a normalized version
        of m and then computes the rank using NumPy's matrix_rank.
        "qr" uses a QR decomposition and is the default. "svd" defers to
        NumPy's matrix_rank. Any other value is treated like "svd".

    Returns
    -------
    int
        The rank of m.

    Notes
    -----
    When using a QR factorization, the rank is determined by the number of
    elements on the leading diagonal of the R matrix that are above tol
    in absolute value. The default tolerance in that case is the absolute
    value of the first diagonal element times the number of columns times
    machine epsilon.
    """
    m = array_like(m, "m", ndim=2)

    if method == "ip":
        # Drop all-zero columns so the normalization cannot divide by zero
        keep = np.any(m != 0, axis=0)
        m = m[:, keep]
        m = m / np.sqrt((m**2).sum(0))
        gram = m.T @ m
        return np.linalg.matrix_rank(gram, tol=tol, hermitian=True)

    if method == "qr":
        r, = scipy.linalg.qr(m, mode="r")
        abs_diag = np.abs(np.diag(r))
        if tol is None:
            tol = abs_diag[0] * m.shape[1] * np.finfo(float).eps
        return int(np.count_nonzero(abs_diag > tol))

    return np.linalg.matrix_rank(m, tol=tol)
def lpol2index(ar):
    """
    Split a lag polynomial into its non-zero coefficients and their lags

    Parameters
    ----------
    ar : array_like
        coefficients of lag polynomial

    Returns
    -------
    coeffs : array
        non-zero coefficients of lag polynomial
    index : array
        index (lags) of lag polynomial with non-zero elements
    """
    ar = array_like(ar, 'ar')
    nonzero_lags = np.nonzero(ar)[0]
    return ar[nonzero_lags], nonzero_lags
def distance_indicators(x, epsilon=None, distance=1.5):
    """
    Calculate all pairwise threshold distance indicators for a time series

    Parameters
    ----------
    x : 1d array
        observations of time series for which heaviside distance indicators
        are calculated
    epsilon : scalar, optional
        the threshold distance to use in calculating the heaviside indicators
    distance : scalar, optional
        if epsilon is omitted, specifies the distance multiplier to use when
        computing it; the threshold is then ``distance * x.std(ddof=1)``

    Returns
    -------
    indicators : 2d array
        matrix of distance threshold indicators; entry (i, j) is True when
        ``abs(x[i] - x[j]) < epsilon``

    Raises
    ------
    ValueError
        If ``epsilon`` is given and not positive, or if ``distance`` is not
        positive.
    """
    x = array_like(x, 'x')

    epsilon_given = epsilon is not None
    if epsilon_given and epsilon <= 0:
        raise ValueError("Threshold distance must be positive if specified."
                         " Got epsilon of %f" % epsilon)
    if distance <= 0:
        raise ValueError("Threshold distance must be positive."
                         " Got distance multiplier %f" % distance)

    # TODO: add functionality to select epsilon optimally
    # TODO: and/or compute for a range of epsilons in [0.5*s, 2.0*s]?
    #       or [1.5*s, 2.0*s]?
    if not epsilon_given:
        epsilon = distance * x.std(ddof=1)

    pairwise_gap = np.abs(x[:, None] - x)
    return pairwise_gap < epsilon
def test_1d(self, use_pandas):
    """array_like on 1-d data: default call, ndim/shape constraints, errors."""
    data = gen_data(1, use_pandas)

    # Default call returns a plain 1-d ndarray
    result = array_like(data, "a")
    assert result.ndim == 1
    assert result.shape == (10, )
    assert type(result) is np.ndarray

    result = array_like(data, "a", ndim=1)
    assert result.ndim == 1

    result = array_like(data, "a", shape=(10, ))
    assert result.shape == (10, )

    result = array_like(data, "a", ndim=1, shape=(None, ))
    assert result.ndim == 1

    # Requesting ndim=2 pads a trailing axis
    result = array_like(data, "a", ndim=2, shape=(10, 1))
    assert result.ndim == 2
    assert result.shape == (10, 1)

    # A wrong length must be rejected
    with pytest.raises(ValueError, match="a is required to have shape"):
        array_like(data, "a", shape=(5, ))
def lpol2index(ar):
    """
    Remove zeros from lag polynomial

    Parameters
    ----------
    ar : array_like
        coefficients of lag polynomial

    Returns
    -------
    coeffs : ndarray
        non-zero coefficients of lag polynomial
    index : ndarray
        index (lags) of lag polynomial with non-zero elements
    """
    # ComplexWarning lives in ``np.exceptions`` on recent NumPy and was
    # removed from the top-level namespace in NumPy 2.0; older releases only
    # have ``np.ComplexWarning``.
    complex_warning = getattr(np, "exceptions", np).ComplexWarning
    with warnings.catch_warnings():
        # Silence the complex-to-real cast warning during validation
        warnings.simplefilter("ignore", complex_warning)
        ar = array_like(ar, "ar")
    index = np.nonzero(ar)[0]
    coeffs = ar[index]
    return coeffs, index
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0, center=np.median):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution.  Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes; when ``axis`` is None the callable is applied
        to the flattened array. Otherwise, provide a float, which is used
        as the center directly.

    Returns
    -------
    The normalized interquartile range. ``np.nan`` if ``a`` is empty.
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')

    if a.size == 0:
        return np.nan

    if callable(center):
        if axis is not None:
            center_val = np.apply_over_axes(center, a, axis)
        else:
            # np.apply_over_axes cannot handle axis=None
            center_val = center(a.ravel())
    else:
        # A user-supplied numeric center must be honored, not replaced by 0
        center_val = float_like(center, 'center')

    quantiles = np.quantile(a - center_val, [0.25, 0.75], axis=axis)
    return np.squeeze(np.diff(quantiles, axis=0) / c)
def test_right_squeeze_and_pad(self):
    """ndim=3 pads or squeezes trailing singleton axes; other shapes fail."""
    # (input shape, shape expected after array_like(..., ndim=3))
    cases = [
        ((2, 1, 2), (2, 1, 2)),
        ((2, ), (2, 1, 1)),
        ((2, 1), (2, 1, 1)),
        ((2, 1, 1, 1), (2, 1, 1)),
    ]
    for in_shape, expected in cases:
        data = np.empty(in_shape)
        a = array_like(data, "a", ndim=3)
        assert a.shape == expected

    # A non-singleton axis beyond the third cannot be squeezed away
    data = np.empty((2, 1, 1, 2, 1, 1))
    with pytest.raises(ValueError):
        array_like(data, "a", ndim=3)
def test_dot(self, use_pandas):
    """Dot products involving the converted array are not array_like instances."""
    data = gen_data(2, use_pandas)
    a = array_like(data, "a")
    transposed = a.T

    # NOTE(review): isinstance requires ``array_like`` to be a type here;
    # confirm it is not a plain function in this version of the module.
    with_input = transposed.dot(data)
    assert not isinstance(with_input, array_like)

    with_self = transposed.dot(a)
    assert not isinstance(with_self, array_like)
def seasonal_decompose(x, model="additive", filt=None, period=None, two_sided=True, extrapolate_trend=0):
    """
    Seasonal decomposition using moving averages.

    Parameters
    ----------
    x : array_like
        Time series. If 2d, individual series are in columns. x must contain 2
        complete cycles.
    model : {"additive", "multiplicative"}, optional
        Type of seasonal component. Abbreviations are accepted: any value
        starting with "m" selects the multiplicative model.
    filt : array_like, optional
        The filter coefficients for filtering out the seasonal component.
        The concrete moving average method used in filtering is determined by
        two_sided.
    period : int, optional
        Period of the series. Must be used if x is not a pandas object or if
        the index of x does not have  a frequency. Overrides default
        periodicity of x if x is a pandas object with a timeseries index.
    two_sided : bool, optional
        The moving average method used in filtering.
        If True (default), a centered moving average is computed using the
        filt. If False, the filter coefficients are for past values only.
    extrapolate_trend : int or 'freq', optional
        If set to > 0, the trend resulting from the convolution is
        linear least-squares extrapolated on both ends (or the single one
        if two_sided is False) considering this many (+1) closest points.
        If set to 'freq', use `freq` closest points. Setting this parameter
        results in no NaN values in trend or resid components.

    Returns
    -------
    DecomposeResult
        A object with seasonal, trend, and resid attributes.

    Raises
    ------
    ValueError
        If x contains non-finite values, if the multiplicative model is
        used with non-positive data, if no period is given and none can be
        inferred, or if x has fewer than two complete cycles.

    See Also
    --------
    statsmodels.tsa.filters.bk_filter.bkfilter
    statsmodels.tsa.filters.cf_filter.cffilter
    statsmodels.tsa.filters.hp_filter.hpfilter
    statsmodels.tsa.filters.convolution_filter
    statsmodels.tsa.seasonal.STL

    Notes
    -----
    This is a naive decomposition. More sophisticated methods should
    be preferred.

    The additive model is Y[t] = T[t] + S[t] + e[t]

    The multiplicative model is Y[t] = T[t] * S[t] * e[t]

    The seasonal component is first removed by applying a convolution
    filter to the data. The average of this smoothed series for each
    period is the returned seasonal component.
    """
    pfreq = period
    # The wrapper must see the original input, before conversion to ndarray
    pw = PandasWrapper(x)
    if period is None:
        index = getattr(x, 'index', None)
        pfreq = getattr(index, 'inferred_freq', None)

    x = array_like(x, 'x', maxdim=2)
    nobs = len(x)

    if not np.all(np.isfinite(x)):
        raise ValueError("This function does not handle missing values")

    multiplicative = model.startswith('m')
    if multiplicative and np.any(x <= 0):
        raise ValueError("Multiplicative seasonality is not appropriate "
                         "for zero and negative values")

    if period is None:
        if pfreq is None:
            raise ValueError("You must specify a period or x must be a "
                             "pandas object with a DatetimeIndex with "
                             "a freq not set to None")
        pfreq = freq_to_period(pfreq)
        period = pfreq

    min_obs = 2 * pfreq
    if x.shape[0] < min_obs:
        raise ValueError('x must have 2 complete cycles requires {0} '
                         'observations. x only has {1} '
                         'observation(s)'.format(min_obs, x.shape[0]))

    if filt is None:
        if period % 2 == 0:  # split weights at ends
            filt = np.array([.5] + [1] * (period - 1) + [.5]) / period
        else:
            filt = np.repeat(1. / period, period)

    nsides = int(two_sided) + 1
    trend = convolution_filter(x, filt, nsides)

    if extrapolate_trend == 'freq':
        extrapolate_trend = period - 1

    if extrapolate_trend > 0:
        trend = _extrapolate_trend(trend, extrapolate_trend + 1)

    detrended = x / trend if multiplicative else x - trend

    period_averages = seasonal_mean(detrended, period)

    # Normalize the seasonal pattern (in place, as the averages are reused)
    if multiplicative:
        period_averages /= np.mean(period_averages, axis=0)
    else:
        period_averages -= np.mean(period_averages, axis=0)

    # Repeat the one-period pattern and cut it to the sample length
    repeats = nobs // period + 1
    seasonal = np.tile(period_averages.T, repeats).T[:nobs]

    if multiplicative:
        resid = x / seasonal / trend
    else:
        resid = detrended - seasonal

    seasonal_out = pw.wrap(seasonal.squeeze(), columns='seasonal')
    trend_out = pw.wrap(trend.squeeze(), columns='trend')
    resid_out = pw.wrap(resid.squeeze(), columns='resid')
    observed_out = pw.wrap(x.squeeze(), columns=None)

    return DecomposeResult(seasonal=seasonal_out, trend=trend_out,
                           resid=resid_out, observed=observed_out)
def __init__(
    self,
    data: Union[np.ndarray, pd.Series, pd.DataFrame],
    stats: Sequence[str] = None,
    *,
    numeric: bool = True,
    categorical: bool = True,
    alpha: float = 0.05,
    use_t: bool = False,
    percentiles: Sequence[Union[int, float]] = PERCENTILES,
    ntop: int = 5,
):
    """
    Validate the inputs and decide which statistics will be computed.

    Parameters
    ----------
    data : {ndarray, Series, DataFrame}
        The data. Non-pandas input is validated as at most 2-d; 1-d input
        is turned into a Series.
    stats : sequence of str, optional
        Names of statistics to compute. Every name must be in
        ``DEFAULT_STATISTICS``; if None, all of them are used.
    numeric : bool
        Whether numeric columns are included.
    categorical : bool
        Whether category-dtype columns are included.
    alpha : float
        Must be strictly between 0 and 1.
    use_t : bool
        Stored as ``self._use_t``.
    percentiles : sequence of {int, float}
        Distinct values strictly between 0 and 100.
    ntop : int
        Number of "top"/"freq" entries expanded into the statistics list.

    Raises
    ------
    ValueError
        If both ``numeric`` and ``categorical`` are False, the column
        selection is empty, an unknown statistic is requested, ``ntop`` is
        not positive while "top" is requested for categorical columns, or
        ``percentiles``/``alpha`` are out of range.
    """
    data_arr = data
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data_arr = array_like(data, "data", maxdim=2)
    if data_arr.ndim == 1:
        data = pd.Series(data)
    numeric = bool_like(numeric, "numeric")
    categorical = bool_like(categorical, "categorical")
    include = []
    col_types = ""
    if numeric:
        include.append(np.number)
        col_types = "numeric"
    if categorical:
        include.append("category")
        # Leading space so the message reads "numeric and categorical"
        col_types += " and " if col_types != "" else ""
        col_types += "categorical"
    if not numeric and not categorical:
        raise ValueError(
            "At least one of numeric and categorical must be True"
        )
    self._data = pd.DataFrame(data).select_dtypes(include)
    if self._data.shape[1] == 0:
        # f-string: the placeholder was previously emitted literally
        raise ValueError(
            f"Selecting {col_types} results in an empty DataFrame"
        )
    self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
    self._is_cat_like = [
        is_categorical_dtype(dt) for dt in self._data.dtypes
    ]

    if stats is not None:
        undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
        if undef:
            raise ValueError(
                f"{', '.join(undef)} are not known statistics"
            )
    self._stats = (
        list(DEFAULT_STATISTICS) if stats is None else list(stats)
    )
    self._ntop = int_like(ntop, "ntop")
    self._compute_top = "top" in self._stats
    self._compute_freq = "freq" in self._stats
    # Only an error when there is at least one categorical column
    if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
        raise ValueError("top must be a non-negative integer")
    self._compute_perc = "percentiles" in self._stats
    self._percentiles = array_like(
        percentiles, "percentiles", maxdim=1, dtype="d"
    )
    self._percentiles = np.sort(self._percentiles)
    if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
        raise ValueError("percentiles must be distinct")
    if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
        raise ValueError("percentiles must be strictly between 0 and 100")

    # Expand special stats
    replacements = {
        "mode": ["mode", "mode_freq"],
        "ci": ["upper_ci", "lower_ci"],
        "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
        "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
        "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
        "percentiles": [f"{i}%" for i in percentiles],
    }

    for key in replacements:
        if key in self._stats:
            idx = self._stats.index(key)
            self._stats = (
                self._stats[:idx]
                + replacements[key]
                + self._stats[idx + 1:]
            )

    self._alpha = float_like(alpha, "alpha")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    self._use_t = bool_like(use_t, "use_t")
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = 'forward',
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False
) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame, DataFrame]:
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use. The untrimmed matrix has
        ``nobs + maxlag`` rows.

        * 'forward' : keep the first ``nobs`` rows, dropping the trailing
          ``maxlag`` rows.
        * 'backward' : drop the first ``maxlag`` rows.
        * 'both' : drop the first ``maxlag`` rows and keep rows only up to
          ``nobs``.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Raises
    ------
    ValueError
        If ``maxlag`` is not smaller than the number of observations, or
        if ``trim`` is 'none' or 'backward' while returning pandas objects.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    # Number of leading columns (the lag-0 block) split off for 'ex'/'sep'
    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")

    # Block j of columns holds x shifted down by j rows, zero-padded
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[
            maxlag - k:nobs + maxlag - k,
            nvar * (maxlag - k):nvar * (maxlag - k + 1),
        ] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        # The frame below is labelled with the *stringified* names, so the
        # original block must be addressed by those, not the raw labels
        # (which fail for e.g. integer columns or an unnamed Series).
        orig_labels = list(columns)
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[orig_labels]
            lags = lags.drop(orig_labels, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags
def test_3d(self):
    """array_like on 3-d data: shape constraints, padding and errors."""
    data = gen_data(3, False)
    full_shape = (5, 6, 7)

    a = array_like(data, "a", ndim=3)
    assert a.shape == full_shape
    assert a.ndim == 3
    assert type(a) is np.ndarray

    # None entries in ``shape`` act as wildcards
    for spec in ((5, None, 7), (None, None, 7)):
        a = array_like(data, "a", ndim=3, shape=spec)
        assert a.shape == full_shape

    # A larger ndim pads trailing singleton axes
    a = array_like(data, "a", ndim=5)
    assert a.shape == (5, 6, 7, 1, 1)

    # Incompatible shape requests
    for spec in ((10, ), (None, None, 5)):
        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=3, shape=spec)

    match = "a is required to have ndim 2 but has ndim 3"
    with pytest.raises(ValueError, match=match):
        array_like(data, "a", ndim=2)

    # maxdim smaller than the data's dimension
    for limit in (1, 2):
        match = "a must have ndim <= %d" % limit
        with pytest.raises(ValueError, match=match):
            array_like(data, "a", maxdim=limit)
def test_2d(self, use_pandas):
    """array_like on 2-d data: shape constraints, padding and errors."""
    data = gen_data(2, use_pandas)
    full_shape = (20, 10)

    a = array_like(data, "a", ndim=2)
    assert a.ndim == 2
    assert a.shape == full_shape
    assert type(a) is np.ndarray

    a = array_like(data, "a", ndim=2)
    assert a.ndim == 2

    # Compatible shape requests; None entries act as wildcards
    for spec in ((20, None), (20, ), (None, 10)):
        a = array_like(data, "a", ndim=2, shape=spec)
        assert a.shape == full_shape

    a = array_like(data, "a", ndim=2, shape=(None, None))
    assert a.ndim == 2

    # A larger ndim pads a trailing singleton axis
    a = array_like(data, "a", ndim=3)
    assert a.ndim == 3
    assert a.shape == (20, 10, 1)

    # Incompatible shape requests
    for spec in ((10, ), (20, 20), (None, 20)):
        with pytest.raises(ValueError, match="a is required to have shape"):
            array_like(data, "a", ndim=2, shape=spec)

    match = "a is required to have ndim 1 but has ndim 2"
    with pytest.raises(ValueError, match=match):
        array_like(data, "a", ndim=1)

    match = "a must have ndim <= 1"
    with pytest.raises(ValueError, match=match):
        array_like(data, "a", maxdim=1)
def test_contiguous(self):
    """contiguous=True returns a C-contiguous array for a strided view."""
    strided = np.arange(10)[::2]
    a = array_like(strided, "a", contiguous=True)
    # The input view itself is left non-contiguous
    assert not strided.flags["C_CONTIGUOUS"]
    assert a.flags["C_CONTIGUOUS"]
def test_slice(self, use_pandas):
    """Slicing the converted 2-d array yields a plain ndarray."""
    data = gen_data(2, use_pandas)
    a = array_like(data, "a", ndim=2)
    tail = a[1:]
    assert type(tail) is np.ndarray