class VARMAX(MLEModel):
    r"""
    Vector Autoregressive Moving Average with eXogenous regressors model

    Parameters
    ----------
    endog : array_like
        The observed time-series process :math:`y`, shaped nobs x k_endog.
    exog : array_like, optional
        Array of exogenous regressors, shaped nobs x k.
    order : iterable
        The (p,q) order of the model for the number of AR and MA parameters to
        use.
    trend : {'nc', 'c'}, optional
        Parameter controlling the deterministic trend polynomial.
        Can be specified as a string where 'c' indicates a constant intercept
        and 'nc' indicates no intercept term.
    error_cov_type : {'diagonal', 'unstructured'}, optional
        The structure of the covariance matrix of the error term, where
        "unstructured" puts no restrictions on the matrix and "diagonal"
        requires it to be a diagonal matrix (uncorrelated errors). Default is
        "unstructured".
    measurement_error : boolean, optional
        Whether or not to assume the endogenous observations `endog` were
        measured with error. Default is False.
    enforce_stationarity : boolean, optional
        Whether or not to transform the AR parameters to enforce stationarity
        in the autoregressive component of the model. Default is True.
    enforce_invertibility : boolean, optional
        Whether or not to transform the MA parameters to enforce invertibility
        in the moving average component of the model. Default is True.
    **kwargs
        Keyword arguments may be used to provide default values for state
        space matrices or for Kalman filtering options. See `Representation`,
        and `KalmanFilter` for more details.

    Attributes
    ----------
    order : iterable
        The (p,q) order of the model for the number of AR and MA parameters to
        use.
    trend : {'nc', 'c'}, optional
        Parameter controlling the deterministic trend polynomial.
        Can be specified as a string where 'c' indicates a constant intercept
        and 'nc' indicates no intercept term.
    error_cov_type : {'diagonal', 'unstructured'}, optional
        The structure of the covariance matrix of the error term, where
        "unstructured" puts no restrictions on the matrix and "diagonal"
        requires it to be a diagonal matrix (uncorrelated errors). Default is
        "unstructured".
    measurement_error : boolean, optional
        Whether or not to assume the endogenous observations `endog` were
        measured with error. Default is False.
    enforce_stationarity : boolean, optional
        Whether or not to transform the AR parameters to enforce stationarity
        in the autoregressive component of the model. Default is True.
    enforce_invertibility : boolean, optional
        Whether or not to transform the MA parameters to enforce invertibility
        in the moving average component of the model. Default is True.

    Notes
    -----
    Generically, the VARMAX model is specified (see for example chapter 18 of
    [1]_):

    .. math::

        y_t = \nu + A_1 y_{t-1} + \dots + A_p y_{t-p} + B x_t + \epsilon_t +
        M_1 \epsilon_{t-1} + \dots M_q \epsilon_{t-q}

    where :math:`\epsilon_t \sim N(0, \Omega)`, and where :math:`y_t` is a
    `k_endog x 1` vector. Additionally, this model allows considering the case
    where the variables are measured with error.

    Note that in the full VARMA(p,q) case there is a fundamental identification
    problem in that the coefficient matrices :math:`\{A_i, M_j\}` are not
    generally unique, meaning that for a given time series process there may
    be multiple sets of matrices that equivalently represent it. See Chapter 12
    of [1]_ for more information. Although this class can be used to estimate
    VARMA(p,q) models, a warning is issued to remind users that no steps have
    been taken to ensure identification in this case.

    References
    ----------
    .. [1] Lutkepohl, Helmut. 2007.
       New Introduction to Multiple Time Series Analysis.
       Berlin: Springer.

    """

    def __init__(self, endog, exog=None, order=(1, 0), trend='c',
                 error_cov_type='unstructured', measurement_error=False,
                 enforce_stationarity=True, enforce_invertibility=True,
                 **kwargs):

        # Model parameters
        self.error_cov_type = error_cov_type
        self.measurement_error = measurement_error
        self.enforce_stationarity = enforce_stationarity
        self.enforce_invertibility = enforce_invertibility

        # Save the given orders
        self.order = order
        self.trend = trend

        # Model orders
        self.k_ar = int(order[0])
        self.k_ma = int(order[1])
        self.k_trend = int(self.trend == 'c')

        # Check for valid model
        if trend not in ['c', 'nc']:
            raise ValueError('Invalid trend specification.')
        if error_cov_type not in ['diagonal', 'unstructured']:
            raise ValueError('Invalid error covariance matrix type'
                             ' specification.')
        if self.k_ar == 0 and self.k_ma == 0:
            raise ValueError('Invalid VARMAX(p,q) specification; at least one'
                             ' p,q must be greater than zero.')

        # Warn for VARMA model
        if self.k_ar > 0 and self.k_ma > 0:
            warn('Estimation of VARMA(p,q) models is not generically robust,'
                 ' due especially to identification issues.')

        # Exogenous data
        self.k_exog = 0
        if exog is not None:
            exog_is_using_pandas = _is_using_pandas(exog, None)
            if not exog_is_using_pandas:
                exog = np.asarray(exog)

            # Make sure we have 2-dimensional array
            if exog.ndim == 1:
                if not exog_is_using_pandas:
                    exog = exog[:, None]
                else:
                    exog = pd.DataFrame(exog)

            self.k_exog = exog.shape[1]

        # Note: at some point in the future might add state regression, as in
        # SARIMAX.
        self.mle_regression = self.k_exog > 0

        # We need to have an array or pandas at this point
        if not _is_using_pandas(endog, None):
            endog = np.asanyarray(endog)

        # Model order
        # Used internally in various places
        _min_k_ar = max(self.k_ar, 1)
        self._k_order = _min_k_ar + self.k_ma

        # Number of states
        k_endog = endog.shape[1]
        k_posdef = k_endog
        k_states = k_endog * self._k_order

        # By default, initialize as stationary
        kwargs.setdefault('initialization', 'stationary')

        # By default, use LU decomposition
        kwargs.setdefault('inversion_method', INVERT_UNIVARIATE | SOLVE_LU)

        # Initialize the state space model
        super(VARMAX, self).__init__(
            endog, exog=exog, k_states=k_states, k_posdef=k_posdef, **kwargs
        )

        # Initialize the parameters: number of free parameters per group, in
        # the order in which they are laid out in the parameter vector.
        self.parameters = OrderedDict()
        self.parameters['trend'] = self.k_endog * self.k_trend
        self.parameters['ar'] = self.k_endog**2 * self.k_ar
        self.parameters['ma'] = self.k_endog**2 * self.k_ma
        self.parameters['regression'] = self.k_endog * self.k_exog
        if self.error_cov_type == 'diagonal':
            self.parameters['state_cov'] = self.k_endog
        # These parameters fill in a lower-triangular matrix which is then
        # dotted with itself to get a positive definite matrix.
        elif self.error_cov_type == 'unstructured':
            self.parameters['state_cov'] = (
                int(self.k_endog * (self.k_endog + 1) / 2)
            )
        self.parameters['obs_cov'] = self.k_endog * self.measurement_error
        self.k_params = sum(self.parameters.values())

        # Initialize known elements of the state space matrices

        # If we have exog effects, then the state intercept needs to be
        # time-varying
        if self.k_exog > 0:
            self.ssm['state_intercept'] = np.zeros((self.k_states, self.nobs))

        # The design matrix is just an identity for the first k_endog states
        idx = np.diag_indices(self.k_endog)
        self.ssm[('design',) + idx] = 1

        # The transition matrix is described in four blocks, where the upper
        # left block is in companion form with the autoregressive coefficient
        # matrices (so it is shaped k_endog * k_ar x k_endog * k_ar) ...
        if self.k_ar > 0:
            idx = np.diag_indices((self.k_ar - 1) * self.k_endog)
            idx = idx[0] + self.k_endog, idx[1]
            self.ssm[('transition',) + idx] = 1
        # ... and the lower right block is in companion form with zeros as the
        # coefficient matrices (it is shaped k_endog * k_ma x k_endog * k_ma).
        # When k_ma is 0 or 1 the index arrays are empty and nothing is set.
        idx = np.diag_indices((self.k_ma - 1) * self.k_endog)
        idx = (idx[0] + (_min_k_ar + 1) * self.k_endog,
               idx[1] + _min_k_ar * self.k_endog)
        self.ssm[('transition',) + idx] = 1

        # The selection matrix is described in two blocks, where the upper
        # block selects the all k_posdef errors in the first k_endog rows
        # (the upper block is shaped k_endog * k_ar x k) and the lower block
        # also selects all k_posdef errors in the first k_endog rows (the lower
        # block is shaped k_endog * k_ma x k).
        idx = np.diag_indices(self.k_endog)
        self.ssm[('selection',) + idx] = 1
        idx = idx[0] + _min_k_ar * self.k_endog, idx[1]
        if self.k_ma > 0:
            self.ssm[('selection',) + idx] = 1

        # Cache some indices
        if self.trend == 'c' and self.k_exog == 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog]
        elif self.k_exog > 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog, :]
        if self.k_ar > 0:
            self._idx_transition = np.s_['transition', :k_endog, :]
        else:
            # Pure VMA: the first k_endog columns belong to the placeholder
            # AR block, so only the MA columns are written.
            self._idx_transition = np.s_['transition', :k_endog, k_endog:]
        if self.error_cov_type == 'diagonal':
            self._idx_state_cov = (
                ('state_cov',) + np.diag_indices(self.k_endog))
        elif self.error_cov_type == 'unstructured':
            self._idx_lower_state_cov = np.tril_indices(self.k_endog)
        if self.measurement_error:
            self._idx_obs_cov = ('obs_cov',) + np.diag_indices(self.k_endog)

        # Cache some slices
        def _slice(key, offset):
            # Slice of the parameter vector holding group `key`, plus the
            # offset at which the next group starts.
            length = self.parameters[key]
            param_slice = np.s_[offset:offset + length]
            offset += length
            return param_slice, offset

        offset = 0
        self._params_trend, offset = _slice('trend', offset)
        self._params_ar, offset = _slice('ar', offset)
        self._params_ma, offset = _slice('ma', offset)
        self._params_regression, offset = _slice('regression', offset)
        self._params_state_cov, offset = _slice('state_cov', offset)
        self._params_obs_cov, offset = _slice('obs_cov', offset)

    # NOTE: no docstring here on purpose; it is copied from MLEModel.filter
    # immediately after the definition.
    def filter(self, params, transformed=True, cov_type=None, return_ssm=False,
               **kwargs):
        params = np.array(params, ndmin=1)

        # Transform parameters if necessary
        if not transformed:
            params = self.transform_params(params)
            transformed = True

        # Get the state space output; always ask the parent for the raw
        # output so that it can be wrapped in VARMAX-specific results below.
        result = super(VARMAX, self).filter(params, transformed, cov_type,
                                            return_ssm=True, **kwargs)

        # Wrap in a results object
        if not return_ssm:
            result_kwargs = {}
            if cov_type is not None:
                result_kwargs['cov_type'] = cov_type
            result = VARMAXResultsWrapper(
                VARMAXResults(self, params, result, **result_kwargs)
            )

        return result
    filter.__doc__ = MLEModel.filter.__doc__

    @property
    def start_params(self):
        """
        Starting parameters for maximum likelihood estimation.

        Computed in three steps: (A) OLS of `endog` on `exog` (if any),
        (B) a VAR fit on the result for the trend / AR terms and error
        covariance, and (C) a VAR fit on the step-B residuals for the MA
        terms.

        Returns
        -------
        params : ndarray
            Array of length `k_params`, laid out as trend, AR, MA,
            regression, state covariance, observation covariance.

        Raises
        ------
        ValueError
            If `enforce_stationarity` (resp. `enforce_invertibility`) is True
            and the starting AR (resp. MA) coefficients fail the
            `is_invertible` check.
        """
        params = np.zeros(self.k_params, dtype=np.float64)

        # A. Run a multivariate regression to get beta estimates
        endog = self.endog.copy()
        exog = self.exog.copy() if self.k_exog > 0 else None

        # Although the Kalman filter can deal with missing values in endog,
        # conditional sum of squares cannot
        if np.any(np.isnan(endog)):
            # Drop every observation (row) with at least one missing value.
            # The mask is built once, from the unfiltered data, and applied to
            # both arrays so they stay aligned; an element-wise mask would
            # flatten the 2-dimensional `endog`.
            # (assumes endog is nobs x k_endog, as documented on the class)
            complete_rows = ~np.isnan(endog).any(axis=1)
            endog = endog[complete_rows]
            if exog is not None:
                exog = exog[complete_rows]

        # Regression effects via OLS
        exog_params = np.zeros(0)
        if self.k_exog > 0:
            exog_params = np.linalg.pinv(exog).dot(endog).T
            endog -= np.dot(exog, exog_params.T)

        # B. Run a VAR model on endog to get trend, AR parameters
        # (a VAR(1) is fitted even for pure VMA models so that residuals and
        # an error covariance estimate are available)
        k_ar = self.k_ar if self.k_ar > 0 else 1
        mod_ar = var_model.VAR(endog)
        res_ar = mod_ar.fit(maxlags=k_ar, ic=None, trend=self.trend)
        ar_params = np.array(res_ar.params.T)
        if self.trend == 'c':
            trend_params = ar_params[:, 0]
            if self.k_ar > 0:
                ar_params = ar_params[:, 1:].ravel()
            else:
                ar_params = []
        elif self.k_ar > 0:
            ar_params = ar_params.ravel()
        else:
            ar_params = []
        endog = res_ar.resid

        # Test for stationarity
        if self.k_ar > 0 and self.enforce_stationarity:
            coefficient_matrices = (
                ar_params.reshape(
                    self.k_endog * self.k_ar, self.k_endog
                ).T
            ).reshape(self.k_endog, self.k_endog, self.k_ar).T

            stationary = is_invertible([1] + list(-coefficient_matrices))

            if not stationary:
                raise ValueError(
                    'Non-stationary starting autoregressive'
                    ' parameters found with `enforce_stationarity`'
                    ' set to True.')

        # C. Run a VAR model on the residuals to get MA parameters
        ma_params = []
        if self.k_ma > 0:
            mod_ma = var_model.VAR(endog)
            res_ma = mod_ma.fit(maxlags=self.k_ma, ic=None, trend='nc')
            ma_params = np.array(res_ma.params.T).ravel()

            # Test for invertibility
            if self.enforce_invertibility:
                coefficient_matrices = (
                    ma_params.reshape(
                        self.k_endog * self.k_ma, self.k_endog
                    ).T
                ).reshape(self.k_endog, self.k_endog, self.k_ma).T

                invertible = is_invertible([1] + list(-coefficient_matrices))

                if not invertible:
                    raise ValueError(
                        'Non-invertible starting moving-average'
                        ' parameters found with `enforce_invertibility`'
                        ' set to True.')

        # 1. Intercept terms
        if self.trend == 'c':
            params[self._params_trend] = trend_params

        # 2. AR terms
        params[self._params_ar] = ar_params

        # 3. MA terms
        params[self._params_ma] = ma_params

        # 4. Regression terms
        if self.mle_regression:
            params[self._params_regression] = exog_params.ravel()

        # 5. State covariance terms
        if self.error_cov_type == 'diagonal':
            params[self._params_state_cov] = res_ar.sigma_u.diagonal()
        elif self.error_cov_type == 'unstructured':
            cov_factor = np.linalg.cholesky(res_ar.sigma_u)
            params[self._params_state_cov] = (
                cov_factor[self._idx_lower_state_cov].ravel())

        # 6. Measurement error variance terms
        if self.measurement_error:
            if self.k_ma > 0:
                params[self._params_obs_cov] = res_ma.sigma_u.diagonal()
            else:
                params[self._params_obs_cov] = res_ar.sigma_u.diagonal()

        return params

    @property
    def param_names(self):
        """
        Human-readable names for the parameters.

        Returns
        -------
        param_names : list of str
            One name per parameter, in the same order as the parameter
            vector (trend, AR, MA, regression, state covariance, measurement
            error variance).
        """
        param_names = []

        # 1. Intercept terms
        if self.trend == 'c':
            param_names += [
                'const.%s' % self.endog_names[i]
                for i in range(self.k_endog)
            ]

        # 2. AR terms
        param_names += [
            'L%d.%s.%s' % (i + 1, self.endog_names[k], self.endog_names[j])
            for j in range(self.k_endog)
            for i in range(self.k_ar)
            for k in range(self.k_endog)
        ]

        # 3. MA terms
        param_names += [
            'L%d.e(%s).%s' % (i + 1, self.endog_names[k], self.endog_names[j])
            for j in range(self.k_endog)
            for i in range(self.k_ma)
            for k in range(self.k_endog)
        ]

        # 4. Regression terms
        param_names += [
            'beta.%s.%s' % (self.exog_names[j], self.endog_names[i])
            for i in range(self.k_endog)
            for j in range(self.k_exog)
        ]

        # 5. State covariance terms
        if self.error_cov_type == 'diagonal':
            param_names += [
                'sigma2.%s' % self.endog_names[i]
                for i in range(self.k_endog)
            ]
        elif self.error_cov_type == 'unstructured':
            # Row-major walk over the lower triangle of the covariance factor
            param_names += [
                ('sqrt.var.%s' % self.endog_names[i] if i == j else
                 'sqrt.cov.%s.%s' % (self.endog_names[j], self.endog_names[i]))
                for i in range(self.k_endog)
                for j in range(i + 1)
            ]

        # 6. Measurement error variance terms
        if self.measurement_error:
            param_names += [
                'measurement_variance.%s' % self.endog_names[i]
                for i in range(self.k_endog)
            ]

        return param_names

    def transform_params(self, unconstrained):
        """
        Transform unconstrained parameters used by the optimizer to constrained
        parameters used in likelihood evaluation

        Parameters
        ----------
        unconstrained : array_like
            Array of unconstrained parameters used by the optimizer, to be
            transformed.

        Returns
        -------
        constrained : array_like
            Array of constrained parameters which may be used in likelihood
            evaluation.

        Notes
        -----
        Constrains the factor transition to be stationary and variances to be
        positive.
        """
        unconstrained = np.array(unconstrained, ndmin=1)
        constrained = np.zeros(unconstrained.shape, dtype=unconstrained.dtype)

        # 1. Intercept terms: nothing to do
        constrained[self._params_trend] = unconstrained[self._params_trend]

        # 2. AR terms: optionally force to be stationary
        if self.k_ar > 0 and self.enforce_stationarity:
            # Create the state covariance matrix
            if self.error_cov_type == 'diagonal':
                state_cov = np.diag(unconstrained[self._params_state_cov]**2)
            elif self.error_cov_type == 'unstructured':
                state_cov_lower = np.zeros(self.ssm['state_cov'].shape,
                                           dtype=unconstrained.dtype)
                state_cov_lower[self._idx_lower_state_cov] = (
                    unconstrained[self._params_state_cov])
                state_cov = np.dot(state_cov_lower, state_cov_lower.T)

            # Transform the parameters
            coefficients = unconstrained[self._params_ar].reshape(
                self.k_endog, self.k_endog * self.k_ar)
            coefficient_matrices, _ = (
                constrain_stationary_multivariate(coefficients, state_cov))
            constrained[self._params_ar] = coefficient_matrices.ravel()
        else:
            constrained[self._params_ar] = unconstrained[self._params_ar]

        # 3. MA terms: optionally force to be invertible
        if self.k_ma > 0 and self.enforce_invertibility:
            # Transform the parameters, using an identity variance matrix
            state_cov = np.eye(self.k_endog, dtype=unconstrained.dtype)
            coefficients = unconstrained[self._params_ma].reshape(
                self.k_endog, self.k_endog * self.k_ma)
            coefficient_matrices, _ = (
                constrain_stationary_multivariate(coefficients, state_cov))
            constrained[self._params_ma] = coefficient_matrices.ravel()
        else:
            constrained[self._params_ma] = unconstrained[self._params_ma]

        # 4. Regression terms: nothing to do
        constrained[self._params_regression] = (
            unconstrained[self._params_regression])

        # 5. State covariance terms
        # If we have variances, force them to be positive
        if self.error_cov_type == 'diagonal':
            constrained[self._params_state_cov] = (
                unconstrained[self._params_state_cov]**2)
        # Otherwise, nothing needs to be done
        elif self.error_cov_type == 'unstructured':
            constrained[self._params_state_cov] = (
                unconstrained[self._params_state_cov])

        # 6. Measurement error variance terms
        if self.measurement_error:
            # Force these to be positive
            constrained[self._params_obs_cov] = (
                unconstrained[self._params_obs_cov]**2)

        return constrained

    def untransform_params(self, constrained):
        """
        Transform constrained parameters used in likelihood evaluation
        to unconstrained parameters used by the optimizer.

        Parameters
        ----------
        constrained : array_like
            Array of constrained parameters used in likelihood evaluation, to
            be transformed.

        Returns
        -------
        unconstrained : array_like
            Array of unconstrained parameters used by the optimizer.
        """
        constrained = np.array(constrained, ndmin=1)
        unconstrained = np.zeros(constrained.shape, dtype=constrained.dtype)

        # 1. Intercept terms: nothing to do
        unconstrained[self._params_trend] = constrained[self._params_trend]

        # 2. AR terms: optionally were forced to be stationary
        if self.k_ar > 0 and self.enforce_stationarity:
            # Create the state covariance matrix
            if self.error_cov_type == 'diagonal':
                # Constrained diagonal terms are already variances
                state_cov = np.diag(constrained[self._params_state_cov])
            elif self.error_cov_type == 'unstructured':
                state_cov_lower = np.zeros(self.ssm['state_cov'].shape,
                                           dtype=constrained.dtype)
                state_cov_lower[self._idx_lower_state_cov] = (
                    constrained[self._params_state_cov])
                state_cov = np.dot(state_cov_lower, state_cov_lower.T)

            # Transform the parameters
            coefficients = constrained[self._params_ar].reshape(
                self.k_endog, self.k_endog * self.k_ar)
            unconstrained_matrices, _ = (
                unconstrain_stationary_multivariate(coefficients, state_cov))
            unconstrained[self._params_ar] = unconstrained_matrices.ravel()
        else:
            unconstrained[self._params_ar] = constrained[self._params_ar]

        # 3. MA terms: optionally were forced to be invertible
        if self.k_ma > 0 and self.enforce_invertibility:
            # Transform the parameters, using an identity variance matrix
            state_cov = np.eye(self.k_endog, dtype=constrained.dtype)
            coefficients = constrained[self._params_ma].reshape(
                self.k_endog, self.k_endog * self.k_ma)
            unconstrained_matrices, _ = (
                unconstrain_stationary_multivariate(coefficients, state_cov))
            unconstrained[self._params_ma] = unconstrained_matrices.ravel()
        else:
            unconstrained[self._params_ma] = constrained[self._params_ma]

        # 4. Regression terms: nothing to do
        unconstrained[self._params_regression] = (
            constrained[self._params_regression])

        # 5. State covariance terms
        # If we have variances, then these were forced to be positive
        if self.error_cov_type == 'diagonal':
            unconstrained[self._params_state_cov] = (
                constrained[self._params_state_cov]**0.5)
        # Otherwise, nothing needs to be done
        elif self.error_cov_type == 'unstructured':
            unconstrained[self._params_state_cov] = (
                constrained[self._params_state_cov])

        # 6. Measurement error variance terms
        if self.measurement_error:
            # These were forced to be positive
            unconstrained[self._params_obs_cov] = (
                constrained[self._params_obs_cov]**0.5)

        return unconstrained

    def update(self, params, *args, **kwargs):
        """
        Write the given parameters into the state space matrices.

        Parameters
        ----------
        params : array_like
            Array of parameters; it is first passed through the parent class
            `update`, and the returned array is what gets written to `ssm`.
        *args, **kwargs
            Forwarded unchanged to the parent class `update`.
        """
        params = super(VARMAX, self).update(params, *args, **kwargs)

        # 1. State intercept
        if self.mle_regression:
            exog_params = params[self._params_regression].reshape(
                self.k_endog, self.k_exog).T
            intercept = np.dot(self.exog, exog_params)
            if self.trend == 'c':
                intercept += params[self._params_trend]
            # Stored time-varying, one column per observation
            self.ssm[self._idx_state_intercept] = intercept.T
        elif self.trend == 'c':
            self.ssm[self._idx_state_intercept] = params[self._params_trend]

        # 2. Transition
        ar = params[self._params_ar].reshape(
            self.k_endog, self.k_endog * self.k_ar)
        ma = params[self._params_ma].reshape(
            self.k_endog, self.k_endog * self.k_ma)
        self.ssm[self._idx_transition] = np.c_[ar, ma]

        # 3. State covariance
        if self.error_cov_type == 'diagonal':
            self.ssm[self._idx_state_cov] = (
                params[self._params_state_cov]
            )
        elif self.error_cov_type == 'unstructured':
            # Rebuild the covariance from its lower-triangular factor
            state_cov_lower = np.zeros(self.ssm['state_cov'].shape,
                                       dtype=params.dtype)
            state_cov_lower[self._idx_lower_state_cov] = (
                params[self._params_state_cov])
            self.ssm['state_cov'] = np.dot(state_cov_lower, state_cov_lower.T)

        # 4. Observation covariance
        if self.measurement_error:
            self.ssm[self._idx_obs_cov] = params[self._params_obs_cov]
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The list is not
        modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', list of array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf] curves.
         - 'extra_quantiles', list of array. Extra quantile band.
           [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outlier curves in
           `data`.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load()

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape

    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # Work on a copy so the caller's `alpha` list is never mutated, and
    # always include the threshold and the 90% / 50% levels.
    requested = [] if alpha is None else list(alpha)
    alpha = sorted(set(requested) | {threshold, 0.9, 0.5}, reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is np.percentile's default; the explicit
    # `interpolation` keyword is deprecated in recent NumPy releases.
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find median, outliers curves
    if have_de_optim:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band):
        """Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            # Single-level band: no upper PDF bound
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        tasks = zip(range(dim),
                    itertools.repeat((pdf_band, pca, bounds, ks_gaussian)))

        pool = Pool()
        try:
            per_feature = pool.map(_min_max_band, tasks)
        finally:
            # Always release the worker processes, even if a task raised
            pool.terminate()
            pool.join()

        return list(zip(*per_feature))

    extra_alpha = [level for level in alpha
                   if level != 0.5 and level != 0.9 and level != threshold]
    extra_quantiles = [curve
                       for level in extra_alpha
                       for curve in _band_quantiles([level])]

    # Inverse transform from n-variate plot to dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5])
    hdr_50 = _band_quantiles([0.5])

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4, label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    # One dashed line per outlier, labelled with its identifier
    for outlier_label, outlier in zip(labels_outlier, outliers):
        ax.plot(xdata, outlier, ls='--', alpha=0.7,
                label=str(outlier_label))

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    # De-duplicate legend entries by label; when outliers are present the
    # legend lists only the outliers (and any extra quantiles).
    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
class VARMAX(MLEModel):
    r"""
    Vector Autoregressive Moving Average with eXogenous regressors model

    Parameters
    ----------
    endog : array_like
        The observed time-series process :math:`y`, shaped nobs x k_endog.
    exog : array_like, optional
        Array of exogenous regressors, shaped nobs x k.
    order : iterable
        The (p,q) order of the model for the number of AR and MA parameters to
        use.
    trend : {'nc', 'c'}, optional
        Parameter controlling the deterministic trend polynomial.
        Can be specified as a string where 'c' indicates a constant intercept
        and 'nc' indicates no intercept term.
    error_cov_type : {'diagonal', 'unstructured'}, optional
        The structure of the covariance matrix of the error term, where
        "unstructured" puts no restrictions on the matrix and "diagonal"
        requires it to be a diagonal matrix (uncorrelated errors). Default is
        "unstructured".
    measurement_error : boolean, optional
        Whether or not to assume the endogenous observations `endog` were
        measured with error. Default is False.
    enforce_stationarity : boolean, optional
        Whether or not to transform the AR parameters to enforce stationarity
        in the autoregressive component of the model. Default is True.
    enforce_invertibility : boolean, optional
        Whether or not to transform the MA parameters to enforce invertibility
        in the moving average component of the model. Default is True.
    **kwargs
        Keyword arguments may be used to provide default values for state space
        matrices or for Kalman filtering options. See `Representation`, and
        `KalmanFilter` for more details.

    Attributes
    ----------
    order : iterable
        The (p,q) order of the model for the number of AR and MA parameters to
        use.
    trend : {'nc', 'c'}, optional
        Parameter controlling the deterministic trend polynomial.
        Can be specified as a string where 'c' indicates a constant intercept
        and 'nc' indicates no intercept term.
    error_cov_type : {'diagonal', 'unstructured'}, optional
        The structure of the covariance matrix of the error term, where
        "unstructured" puts no restrictions on the matrix and "diagonal"
        requires it to be a diagonal matrix (uncorrelated errors). Default is
        "unstructured".
    measurement_error : boolean, optional
        Whether or not to assume the endogenous observations `endog` were
        measured with error. Default is False.
    enforce_stationarity : boolean, optional
        Whether or not to transform the AR parameters to enforce stationarity
        in the autoregressive component of the model. Default is True.
    enforce_invertibility : boolean, optional
        Whether or not to transform the MA parameters to enforce invertibility
        in the moving average component of the model. Default is True.

    Notes
    -----
    Generically, the VARMAX model is specified (see for example chapter 18 of
    [1]_):

    .. math::

        y_t = \nu + A_1 y_{t-1} + \dots + A_p y_{t-p} + B x_t + \epsilon_t +
        M_1 \epsilon_{t-1} + \dots + M_q \epsilon_{t-q}

    where :math:`\epsilon_t \sim N(0, \Omega)`, and where :math:`y_t` is a
    `k_endog x 1` vector. Additionally, this model allows considering the case
    where the variables are measured with error.

    Note that in the full VARMA(p,q) case there is a fundamental identification
    problem in that the coefficient matrices :math:`\{A_i, M_j\}` are not
    generally unique, meaning that for a given time series process there may
    be multiple sets of matrices that equivalently represent it. See Chapter 12
    of [1]_ for more information. Although this class can be used to estimate
    VARMA(p,q) models, a warning is issued to remind users that no steps have
    been taken to ensure identification in this case.

    References
    ----------
    .. [1] Lutkepohl, Helmut. 2007.
       New Introduction to Multiple Time Series Analysis.
       Berlin: Springer.

    """
    def __init__(self, endog, exog=None, order=(1, 0), trend='c',
                 error_cov_type='unstructured', measurement_error=False,
                 enforce_stationarity=True, enforce_invertibility=True,
                 **kwargs):

        # Model parameters
        self.error_cov_type = error_cov_type
        self.measurement_error = measurement_error
        self.enforce_stationarity = enforce_stationarity
        self.enforce_invertibility = enforce_invertibility

        # Save the given orders
        self.order = order
        self.trend = trend

        # Model orders
        self.k_ar = int(order[0])
        self.k_ma = int(order[1])
        self.k_trend = int(self.trend == 'c')

        # Check for valid model
        if trend not in ['c', 'nc']:
            raise ValueError('Invalid trend specification.')
        if error_cov_type not in ['diagonal', 'unstructured']:
            raise ValueError('Invalid error covariance matrix type'
                             ' specification.')
        if self.k_ar == 0 and self.k_ma == 0:
            raise ValueError('Invalid VARMAX(p,q) specification; at least one'
                             ' p,q must be greater than zero.')

        # Warn for VARMA model
        if self.k_ar > 0 and self.k_ma > 0:
            warn('Estimation of VARMA(p,q) models is not generically robust,'
                 ' due especially to identification issues.')

        # Exogenous data
        self.k_exog = 0
        if exog is not None:
            exog_is_using_pandas = _is_using_pandas(exog, None)
            if not exog_is_using_pandas:
                exog = np.asarray(exog)

            # Make sure we have 2-dimensional array
            if exog.ndim == 1:
                if not exog_is_using_pandas:
                    exog = exog[:, None]
                else:
                    exog = pd.DataFrame(exog)

            self.k_exog = exog.shape[1]

        # Note: at some point in the future might add state regression, as in
        # SARIMAX.
        self.mle_regression = self.k_exog > 0

        # We need to have an array or pandas at this point
        if not _is_using_pandas(endog, None):
            endog = np.asanyarray(endog)

        # Model order
        # Used internally in various places; the AR block always occupies at
        # least one lag in the state vector, even for a pure VMA model.
        _min_k_ar = max(self.k_ar, 1)
        self._k_order = _min_k_ar + self.k_ma

        # Number of states
        k_endog = endog.shape[1]
        k_posdef = k_endog
        k_states = k_endog * self._k_order

        # By default, initialize as stationary
        kwargs.setdefault('initialization', 'stationary')

        # By default, use LU decomposition
        kwargs.setdefault('inversion_method', INVERT_UNIVARIATE | SOLVE_LU)

        # Initialize the state space model
        super(VARMAX, self).__init__(
            endog, exog=exog, k_states=k_states, k_posdef=k_posdef, **kwargs
        )

        # Initialize the parameters: maps each parameter group to the number
        # of parameters it holds, in the order they appear in the flat
        # parameter vector.
        self.parameters = OrderedDict()
        self.parameters['trend'] = self.k_endog * self.k_trend
        self.parameters['ar'] = self.k_endog**2 * self.k_ar
        self.parameters['ma'] = self.k_endog**2 * self.k_ma
        self.parameters['regression'] = self.k_endog * self.k_exog
        if self.error_cov_type == 'diagonal':
            self.parameters['state_cov'] = self.k_endog
        # These parameters fill in a lower-triangular matrix which is then
        # dotted with itself to get a positive definite matrix.
        elif self.error_cov_type == 'unstructured':
            self.parameters['state_cov'] = (
                int(self.k_endog * (self.k_endog + 1) / 2)
            )
        self.parameters['obs_cov'] = self.k_endog * self.measurement_error
        self.k_params = sum(self.parameters.values())

        # Initialize known elements of the state space matrices

        # If we have exog effects, then the state intercept needs to be
        # time-varying
        if self.k_exog > 0:
            self.ssm['state_intercept'] = np.zeros((self.k_states, self.nobs))

        # The design matrix is just an identity for the first k_endog states
        idx = np.diag_indices(self.k_endog)
        self.ssm[('design',) + idx] = 1

        # The transition matrix is described in four blocks, where the upper
        # left block is in companion form with the autoregressive coefficient
        # matrices (so it is shaped k_endog * k_ar x k_endog * k_ar) ...
        if self.k_ar > 0:
            idx = np.diag_indices((self.k_ar - 1) * self.k_endog)
            idx = idx[0] + self.k_endog, idx[1]
            self.ssm[('transition',) + idx] = 1
        # ... and the lower right block is in companion form with zeros as the
        # coefficient matrices (it is shaped k_endog * k_ma x k_endog * k_ma).
        idx = np.diag_indices((self.k_ma - 1) * self.k_endog)
        idx = (idx[0] + (_min_k_ar + 1) * self.k_endog,
               idx[1] + _min_k_ar * self.k_endog)
        self.ssm[('transition',) + idx] = 1

        # The selection matrix is described in two blocks, where the upper
        # block selects the all k_posdef errors in the first k_endog rows
        # (the upper block is shaped k_endog * k_ar x k) and the lower block
        # also selects all k_posdef errors in the first k_endog rows (the lower
        # block is shaped k_endog * k_ma x k).
        idx = np.diag_indices(self.k_endog)
        self.ssm[('selection',) + idx] = 1
        idx = idx[0] + _min_k_ar * self.k_endog, idx[1]
        if self.k_ma > 0:
            self.ssm[('selection',) + idx] = 1

        # Cache some indices
        if self.trend == 'c' and self.k_exog == 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog]
        elif self.k_exog > 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog, :]
        if self.k_ar > 0:
            self._idx_transition = np.s_['transition', :k_endog, :]
        else:
            # Pure VMA: skip the (zero) placeholder AR block of columns
            self._idx_transition = np.s_['transition', :k_endog, k_endog:]
        if self.error_cov_type == 'diagonal':
            self._idx_state_cov = (
                ('state_cov',) + np.diag_indices(self.k_endog))
        elif self.error_cov_type == 'unstructured':
            self._idx_lower_state_cov = np.tril_indices(self.k_endog)
        if self.measurement_error:
            self._idx_obs_cov = ('obs_cov',) + np.diag_indices(self.k_endog)

        # Cache some slices: each parameter group occupies a contiguous slice
        # of the flat parameter vector, laid out in `self.parameters` order.
        def _slice(key, offset):
            length = self.parameters[key]
            param_slice = np.s_[offset:offset + length]
            offset += length
            return param_slice, offset

        offset = 0
        self._params_trend, offset = _slice('trend', offset)
        self._params_ar, offset = _slice('ar', offset)
        self._params_ma, offset = _slice('ma', offset)
        self._params_regression, offset = _slice('regression', offset)
        self._params_state_cov, offset = _slice('state_cov', offset)
        self._params_obs_cov, offset = _slice('obs_cov', offset)

    def filter(self, params, transformed=True, cov_type=None, return_ssm=False,
               **kwargs):
        # NOTE: the docstring is inherited from MLEModel.filter (see the
        # assignment right after this method).
        params = np.array(params, ndmin=1)

        # Transform parameters if necessary
        if not transformed:
            params = self.transform_params(params)
            transformed = True

        # Get the state space output (always the raw output; it is wrapped
        # below unless the caller explicitly asked for it)
        result = super(VARMAX, self).filter(params, transformed, cov_type,
                                            return_ssm=True, **kwargs)

        # Wrap in a results object
        if not return_ssm:
            result_kwargs = {}
            if cov_type is not None:
                result_kwargs['cov_type'] = cov_type
            result = VARMAXResultsWrapper(
                VARMAXResults(self, params, result, **result_kwargs)
            )

        return result
    filter.__doc__ = MLEModel.filter.__doc__

    @property
    def start_params(self):
        """
        Starting parameters for maximum likelihood estimation.

        Computed in three steps: (A) an OLS regression of `endog` on `exog`
        (if any), (B) a VAR fit on the result to obtain trend and AR
        parameters, and (C) a VAR fit on the residuals of (B) to obtain MA
        parameters.

        Returns
        -------
        params : ndarray
            Array of length `k_params`, laid out as trend, AR, MA, regression,
            state covariance and measurement error variance terms.

        Raises
        ------
        ValueError
            If the starting AR parameters are non-stationary while
            `enforce_stationarity` is True, or the starting MA parameters are
            non-invertible while `enforce_invertibility` is True.
        """
        params = np.zeros(self.k_params, dtype=np.float64)

        # A. Run a multivariate regression to get beta estimates
        endog = self.endog.copy()
        exog = self.exog.copy() if self.k_exog > 0 else None

        # Although the Kalman filter can deal with missing values in endog,
        # conditional sum of squares cannot. Drop every observation (row)
        # that has at least one missing value; a single row mask is used for
        # both arrays so that they stay 2-dimensional and aligned.
        if np.any(np.isnan(endog)):
            mask = ~np.any(np.isnan(endog), axis=1)
            endog = endog[mask]
            if exog is not None:
                exog = exog[mask]

        # Regression effects via OLS
        exog_params = np.zeros(0)
        if self.k_exog > 0:
            exog_params = np.linalg.pinv(exog).dot(endog).T
            endog -= np.dot(exog, exog_params.T)

        # B. Run a VAR model on endog to get trend, AR parameters
        ar_params = []
        k_ar = self.k_ar if self.k_ar > 0 else 1
        mod_ar = var_model.VAR(endog)
        res_ar = mod_ar.fit(maxlags=k_ar, ic=None, trend=self.trend)
        ar_params = np.array(res_ar.params.T)
        if self.trend == 'c':
            trend_params = ar_params[:, 0]
            if self.k_ar > 0:
                ar_params = ar_params[:, 1:].ravel()
            else:
                ar_params = []
        elif self.k_ar > 0:
            ar_params = ar_params.ravel()
        else:
            ar_params = []
        endog = res_ar.resid

        # Test for stationarity
        if self.k_ar > 0 and self.enforce_stationarity:
            coefficient_matrices = (
                ar_params.reshape(
                    self.k_endog * self.k_ar, self.k_endog
                ).T
            ).reshape(self.k_endog, self.k_endog, self.k_ar).T

            stationary = is_invertible([1] + list(-coefficient_matrices))

            if not stationary:
                raise ValueError('Non-stationary starting autoregressive'
                                 ' parameters found with `enforce_stationarity`'
                                 ' set to True.')

        # C. Run a VAR model on the residuals to get MA parameters
        ma_params = []
        if self.k_ma > 0:
            mod_ma = var_model.VAR(endog)
            res_ma = mod_ma.fit(maxlags=self.k_ma, ic=None, trend='nc')
            ma_params = np.array(res_ma.params.T).ravel()

            # Test for invertibility
            if self.enforce_invertibility:
                coefficient_matrices = (
                    ma_params.reshape(
                        self.k_endog * self.k_ma, self.k_endog
                    ).T
                ).reshape(self.k_endog, self.k_endog, self.k_ma).T

                invertible = is_invertible([1] + list(-coefficient_matrices))

                if not invertible:
                    raise ValueError('Non-invertible starting moving-average'
                                     ' parameters found with'
                                     ' `enforce_invertibility` set to True.')

        # 1. Intercept terms
        if self.trend == 'c':
            params[self._params_trend] = trend_params

        # 2. AR terms
        params[self._params_ar] = ar_params

        # 3. MA terms
        params[self._params_ma] = ma_params

        # 4. Regression terms
        if self.mle_regression:
            params[self._params_regression] = exog_params.ravel()

        # 5. State covariance terms
        if self.error_cov_type == 'diagonal':
            params[self._params_state_cov] = res_ar.sigma_u.diagonal()
        elif self.error_cov_type == 'unstructured':
            cov_factor = np.linalg.cholesky(res_ar.sigma_u)
            params[self._params_state_cov] = (
                cov_factor[self._idx_lower_state_cov].ravel())

        # 6. Measurement error variance terms
        if self.measurement_error:
            if self.k_ma > 0:
                params[self._params_obs_cov] = res_ma.sigma_u.diagonal()
            else:
                params[self._params_obs_cov] = res_ar.sigma_u.diagonal()

        return params

    @property
    def param_names(self):
        """
        Human-readable names for each parameter.

        Returns
        -------
        param_names : list of str
            Names in the same order as the flat parameter vector: trend, AR,
            MA, regression, state covariance and measurement error variance
            terms.
        """
        param_names = []

        # 1. Intercept terms
        if self.trend == 'c':
            param_names += [
                'const.%s' % self.endog_names[i]
                for i in range(self.k_endog)
            ]

        # 2. AR terms
        param_names += [
            'L%d.%s.%s' % (i+1, self.endog_names[k], self.endog_names[j])
            for j in range(self.k_endog)
            for i in range(self.k_ar)
            for k in range(self.k_endog)
        ]

        # 3. MA terms
        param_names += [
            'L%d.e(%s).%s' % (i+1, self.endog_names[k], self.endog_names[j])
            for j in range(self.k_endog)
            for i in range(self.k_ma)
            for k in range(self.k_endog)
        ]

        # 4. Regression terms
        param_names += [
            'beta.%s.%s' % (self.exog_names[j], self.endog_names[i])
            for i in range(self.k_endog)
            for j in range(self.k_exog)
        ]

        # 5. State covariance terms
        if self.error_cov_type == 'diagonal':
            param_names += [
                'sigma2.%s' % self.endog_names[i]
                for i in range(self.k_endog)
            ]
        elif self.error_cov_type == 'unstructured':
            param_names += [
                ('sqrt.var.%s' % self.endog_names[i] if i == j else
                 'sqrt.cov.%s.%s' % (self.endog_names[j], self.endog_names[i]))
                for i in range(self.k_endog)
                for j in range(i+1)
            ]

        # 6. Measurement error variance terms
        if self.measurement_error:
            param_names += [
                'measurement_variance.%s' % self.endog_names[i]
                for i in range(self.k_endog)
            ]

        return param_names

    def transform_params(self, unconstrained):
        """
        Transform unconstrained parameters used by the optimizer to constrained
        parameters used in likelihood evaluation

        Parameters
        ----------
        unconstrained : array_like
            Array of unconstrained parameters used by the optimizer, to be
            transformed.

        Returns
        -------
        constrained : array_like
            Array of constrained parameters which may be used in likelihood
            evaluation.

        Notes
        -----
        Constrains the autoregressive coefficient matrices to be stationary
        (if `enforce_stationarity`), the moving-average coefficient matrices
        to be invertible (if `enforce_invertibility`) and variances to be
        positive.
        """
        unconstrained = np.array(unconstrained, ndmin=1)
        constrained = np.zeros(unconstrained.shape, dtype=unconstrained.dtype)

        # 1. Intercept terms: nothing to do
        constrained[self._params_trend] = unconstrained[self._params_trend]

        # 2. AR terms: optionally force to be stationary
        if self.k_ar > 0 and self.enforce_stationarity:
            # Create the state covariance matrix
            if self.error_cov_type == 'diagonal':
                state_cov = np.diag(unconstrained[self._params_state_cov]**2)
            elif self.error_cov_type == 'unstructured':
                state_cov_lower = np.zeros(self.ssm['state_cov'].shape,
                                           dtype=unconstrained.dtype)
                state_cov_lower[self._idx_lower_state_cov] = (
                    unconstrained[self._params_state_cov])
                state_cov = np.dot(state_cov_lower, state_cov_lower.T)

            # Transform the parameters
            coefficients = unconstrained[self._params_ar].reshape(
                self.k_endog, self.k_endog * self.k_ar)
            coefficient_matrices, _ = (
                constrain_stationary_multivariate(coefficients, state_cov))
            constrained[self._params_ar] = coefficient_matrices.ravel()
        else:
            constrained[self._params_ar] = unconstrained[self._params_ar]

        # 3. MA terms: optionally force to be invertible
        if self.k_ma > 0 and self.enforce_invertibility:
            # Transform the parameters, using an identity variance matrix
            state_cov = np.eye(self.k_endog, dtype=unconstrained.dtype)
            coefficients = unconstrained[self._params_ma].reshape(
                self.k_endog, self.k_endog * self.k_ma)
            coefficient_matrices, _ = (
                constrain_stationary_multivariate(coefficients, state_cov))
            constrained[self._params_ma] = coefficient_matrices.ravel()
        else:
            constrained[self._params_ma] = unconstrained[self._params_ma]

        # 4. Regression terms: nothing to do
        constrained[self._params_regression] = (
            unconstrained[self._params_regression])

        # 5. State covariance terms
        # If we have variances, force them to be positive
        if self.error_cov_type == 'diagonal':
            constrained[self._params_state_cov] = (
                unconstrained[self._params_state_cov]**2)
        # Otherwise, nothing needs to be done
        elif self.error_cov_type == 'unstructured':
            constrained[self._params_state_cov] = (
                unconstrained[self._params_state_cov])

        # 6. Measurement error variance terms
        if self.measurement_error:
            # Force these to be positive
            constrained[self._params_obs_cov] = (
                unconstrained[self._params_obs_cov]**2)

        return constrained

    def untransform_params(self, constrained):
        """
        Transform constrained parameters used in likelihood evaluation
        to unconstrained parameters used by the optimizer.

        Parameters
        ----------
        constrained : array_like
            Array of constrained parameters used in likelihood evaluation, to
            be transformed.

        Returns
        -------
        unconstrained : array_like
            Array of unconstrained parameters used by the optimizer.
        """
        constrained = np.array(constrained, ndmin=1)
        unconstrained = np.zeros(constrained.shape, dtype=constrained.dtype)

        # 1. Intercept terms: nothing to do
        unconstrained[self._params_trend] = constrained[self._params_trend]

        # 2. AR terms: optionally were forced to be stationary
        if self.k_ar > 0 and self.enforce_stationarity:
            # Create the state covariance matrix
            if self.error_cov_type == 'diagonal':
                state_cov = np.diag(constrained[self._params_state_cov])
            elif self.error_cov_type == 'unstructured':
                state_cov_lower = np.zeros(self.ssm['state_cov'].shape,
                                           dtype=constrained.dtype)
                state_cov_lower[self._idx_lower_state_cov] = (
                    constrained[self._params_state_cov])
                state_cov = np.dot(state_cov_lower, state_cov_lower.T)

            # Transform the parameters
            coefficients = constrained[self._params_ar].reshape(
                self.k_endog, self.k_endog * self.k_ar)
            unconstrained_matrices, _ = (
                unconstrain_stationary_multivariate(coefficients, state_cov))
            unconstrained[self._params_ar] = unconstrained_matrices.ravel()
        else:
            unconstrained[self._params_ar] = constrained[self._params_ar]

        # 3. MA terms: optionally were forced to be invertible
        if self.k_ma > 0 and self.enforce_invertibility:
            # Transform the parameters, using an identity variance matrix
            state_cov = np.eye(self.k_endog, dtype=constrained.dtype)
            coefficients = constrained[self._params_ma].reshape(
                self.k_endog, self.k_endog * self.k_ma)
            unconstrained_matrices, _ = (
                unconstrain_stationary_multivariate(coefficients, state_cov))
            unconstrained[self._params_ma] = unconstrained_matrices.ravel()
        else:
            unconstrained[self._params_ma] = constrained[self._params_ma]

        # 4. Regression terms: nothing to do
        unconstrained[self._params_regression] = (
            constrained[self._params_regression])

        # 5. State covariance terms
        # If we have variances, then these were forced to be positive
        if self.error_cov_type == 'diagonal':
            unconstrained[self._params_state_cov] = (
                constrained[self._params_state_cov]**0.5)
        # Otherwise, nothing needs to be done
        elif self.error_cov_type == 'unstructured':
            unconstrained[self._params_state_cov] = (
                constrained[self._params_state_cov])

        # 6. Measurement error variance terms
        if self.measurement_error:
            # These were forced to be positive
            unconstrained[self._params_obs_cov] = (
                constrained[self._params_obs_cov]**0.5)

        return unconstrained

    def update(self, params, *args, **kwargs):
        """
        Write the given parameters into the state space matrices.

        The parameter vector is first passed through the parent class's
        `update` (presumably handling transformation of the parameters --
        confirm against `MLEModel.update`), then used to fill the state
        intercept, the first `k_endog` rows of the transition matrix, the
        state covariance and (if `measurement_error`) the diagonal of the
        observation covariance.

        Parameters
        ----------
        params : array_like
            Array of parameters, laid out as described by `param_names`.
        *args, **kwargs
            Passed through to the parent class's `update`.
        """
        params = super(VARMAX, self).update(params, *args, **kwargs)

        # 1. State intercept
        if self.mle_regression:
            exog_params = params[self._params_regression].reshape(
                self.k_endog, self.k_exog).T
            intercept = np.dot(self.exog, exog_params)
            if self.trend == 'c':
                intercept += params[self._params_trend]
            self.ssm[self._idx_state_intercept] = intercept.T
        elif self.trend == 'c':
            self.ssm[self._idx_state_intercept] = params[self._params_trend]

        # 2. Transition
        ar = params[self._params_ar].reshape(
            self.k_endog, self.k_endog * self.k_ar)
        ma = params[self._params_ma].reshape(
            self.k_endog, self.k_endog * self.k_ma)
        self.ssm[self._idx_transition] = np.c_[ar, ma]

        # 3. State covariance
        if self.error_cov_type == 'diagonal':
            self.ssm[self._idx_state_cov] = (
                params[self._params_state_cov]
            )
        elif self.error_cov_type == 'unstructured':
            # The parameters are the lower-triangular factor L; the state
            # covariance is L L', which is positive semi-definite.
            state_cov_lower = np.zeros(self.ssm['state_cov'].shape,
                                       dtype=params.dtype)
            state_cov_lower[self._idx_lower_state_cov] = (
                params[self._params_state_cov])
            self.ssm['state_cov'] = np.dot(state_cov_lower, state_cov_lower.T)

        # 4. Observation covariance
        if self.measurement_error:
            self.ssm[self._idx_obs_cov] = params[self._params_obs_cov]
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The sequence
        passed in is not modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', list of array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf] curves.
         - 'extra_quantiles', list of array. Extra quantile band.
           [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices in `data` of the outlier
           curves.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load()

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> fig, res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                                   labels=data.raw_data[:, 0].astype(int),
    ...                                   ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # A new list is built so that the caller's `alpha` is left untouched
    # (and so that any iterable, e.g. a tuple, is accepted).
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha = list(set(alpha).union([threshold, 0.9, 0.5]))
    alpha.sort(reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is np.percentile's default behavior
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find mean, outliers curves
    if have_de_optim:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band):
        """Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            # Single quantile value: no upper constraint on the PDF
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        pool = Pool()
        try:
            tasks = zip(range(dim), itertools.repeat((pdf_band, pca,
                                                      bounds, ks_gaussian)))
            band_quantiles = pool.map(_min_max_band, tasks)
        finally:
            # Always release the worker processes, even if the map fails
            pool.terminate()

        return list(zip(*band_quantiles))

    extra_alpha = [i for i in alpha
                   if 0.5 != i and 0.9 != i and threshold != i]
    extra_quantiles = [y for x in extra_alpha
                       for y in _band_quantiles([x])]

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5])
    hdr_50 = _band_quantiles([0.5])

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4, label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if extra_quantiles:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    if len(outliers) != 0:
        for outlier_label, outlier in zip(labels_outlier, outliers):
            ax.plot(xdata, outlier, ls='--', alpha=0.7,
                    label=str(outlier_label))

    legend_handles, legend_labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        legend_handles.append(p)
        legend_labels.append(label)

    # De-duplicate legend entries by label, keeping insertion order
    by_label = OrderedDict(zip(legend_labels, legend_handles))
    if len(outliers) != 0:
        # When outliers are present the legend only identifies them
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res