def _handle_constant(self, hasconst):
    """
    Detect a constant column in ``self.exog`` and record it on the instance.

    Sets two attributes:

    ``k_constant``
        1 if the design is treated as containing a constant, else 0.
    ``const_idx``
        Column index of the explicit constant, or None when there is no
        explicit constant column (including the implicit-constant case).

    Parameters
    ----------
    hasconst : {None, bool}
        False means "no constant": detection is skipped entirely.
        True means "there is a constant": ``k_constant`` is forced to 1
        even if no constant column can be located.
        None (or any other falsy value that is not ``False``) means
        "detect": explicit constant columns are searched first, then an
        implicit constant via a rank comparison.

    Raises
    ------
    MissingDataError
        If any column of ``self.exog`` contains NaN, ``inf`` or ``-inf``.
    """
    if hasconst is False or self.exog is None:
        self.k_constant = 0
        self.const_idx = None
        return

    exog_max = np.max(self.exog, axis=0)
    exog_min = np.min(self.exog, axis=0)
    # NaN propagates through both reductions and +inf surfaces in the max,
    # but -inf is only visible in the min, so both have to be checked.
    if not (np.isfinite(exog_max).all() and np.isfinite(exog_min).all()):
        raise MissingDataError('exog contains inf or nans')

    # a column is constant exactly when its max equals its min
    const_idx = np.where(exog_max == exog_min)[0].squeeze()
    self.k_constant = const_idx.size

    # True when no usable (non-zero) explicit constant column was found
    check_implicit = False
    if self.k_constant == 1:
        if self.exog[:, const_idx].mean() != 0:
            self.const_idx = int(const_idx)
        else:
            # we only have a zero column and no other constant
            check_implicit = True
    elif self.k_constant > 1:
        # more than one constant column: prefer a column of ones
        values = []  # means of the columns inspected so far
        for idx in const_idx:
            value = self.exog[:, idx].mean()
            if value == 1:
                self.k_constant = 1
                self.const_idx = int(idx)
                break
            values.append(value)
        else:
            # we did not break, so there is no column of ones
            pos = (np.array(values) != 0)
            if pos.any():
                # take the first nonzero constant column
                self.k_constant = 1
                self.const_idx = int(const_idx[pos.argmax()])
            else:
                # only zero columns
                check_implicit = True
    else:
        # no constant column at all
        check_implicit = True

    if hasconst:
        # Ensure k_constant is 1 any time hasconst is True, even if no
        # constant column was found.
        self.k_constant = 1
        if check_implicit:
            # No explicit column was located; make sure const_idx is
            # defined instead of leaving it unset or stale.
            self.const_idx = None
    elif check_implicit:
        # Implicit constant: adding a column of ones does not increase
        # the rank when ones already lie in the column space of exog.
        augmented_exog = np.column_stack(
            (np.ones(self.exog.shape[0]), self.exog))
        rank_augm = np.linalg.matrix_rank(augmented_exog)
        rank_orig = np.linalg.matrix_rank(self.exog)
        self.k_constant = int(rank_orig == rank_augm)
        self.const_idx = None
def acovf(x, unbiased=False, demean=True, fft=False, missing='none'):
    """
    Autocovariance for 1D

    Parameters
    ----------
    x : array
        Time series data. Must be 1d.
    unbiased : bool
        If True, then denominators is n-k, otherwise n
    demean : bool
        If True, then subtract the mean x from each element of x
    fft : bool
        If True, use FFT convolution.  This method should be preferred
        for long time series.
    missing : str
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how
        the NaNs are to be treated. 'none' performs no NaN check at all.

    Returns
    -------
    acovf : array
        autocovariance function, one value per lag from 0 to len(x) - 1

    Raises
    ------
    ValueError
        If x has more than one dimension after squeezing, or if `missing`
        is not one of the accepted options.
    MissingDataError
        If missing='raise' and x contains NaNs.

    Notes
    -----
    The input array is never modified; with missing='conservative' the NaNs
    are zeroed on a private copy.

    References
    -----------
    .. [*] Parzen, E., 1963. On spectral analysis with missing observations
           and amplitude modulation. Sankhya: The Indian Journal of
           Statistics, Series A, pp.383-392.
    """
    # squeeze collapses a single observation to a 0-d array, which has no
    # len(); atleast_1d restores the 1-d shape in that case only.
    x = np.atleast_1d(np.squeeze(np.asarray(x)))
    if x.ndim > 1:
        raise ValueError("x must be 1d. Got %d dims." % x.ndim)

    missing = missing.lower()
    if missing not in ['none', 'raise', 'conservative', 'drop']:
        raise ValueError("missing option %s not understood" % missing)
    if missing == 'none':
        deal_with_masked = False
    else:
        deal_with_masked = has_missing(x)
    if deal_with_masked:
        if missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")
        notmask_bool = ~np.isnan(x)  # True where an observation is present
        if missing == 'conservative':
            # Zero the NaNs on a copy: asarray/squeeze may alias the
            # caller's array, which must not be overwritten.
            x = x.copy()
            x[~notmask_bool] = 0
        else:  # 'drop'
            x = x[notmask_bool]  # copies non-missing
        notmask_int = notmask_bool.astype(int)

    if demean and deal_with_masked:
        # whether 'drop' or 'conservative': mean over observed values only
        xo = x - x.sum() / notmask_int.sum()
        if missing == 'conservative':
            # keep the missing positions at zero after demeaning
            xo[~notmask_bool] = 0
    elif demean:
        xo = x - x.mean()
    else:
        xo = x

    n = len(x)
    # d holds the divisor for every lag of the 'full' correlation
    if unbiased and deal_with_masked and missing == 'conservative':
        # number of observed pairs available at each lag
        d = np.correlate(notmask_int, notmask_int, 'full')
        # lags without any observed pair have a zero numerator as well;
        # avoid 0/0 producing NaN
        d[d == 0] = 1
    elif unbiased:
        xi = np.arange(1, n + 1)
        d = np.hstack((xi, xi[:-1][::-1]))
    elif deal_with_masked:
        # biased and NaNs given and ('drop' or 'conservative')
        d = notmask_int.sum() * np.ones(2 * n - 1)
    else:
        # biased and no NaNs or missing=='none'
        d = n * np.ones(2 * n - 1)

    if fft:
        nobs = len(xo)
        n = _next_regular(2 * nobs + 1)
        Frf = np.fft.fft(xo, n=n)
        acov = np.fft.ifft(Frf * np.conjugate(Frf))[:nobs] / d[nobs - 1:]
        acov = acov.real
    else:
        # keep the non-negative lags only
        acov = np.correlate(xo, xo, 'full')[n - 1:] / d[n - 1:]

    return acov
def handle_missing(cls, endog, exog, missing, **kwargs):
    """
    This returns a dictionary with keys endog, exog and the keys of
    kwargs. It preserves Nones.

    Parameters
    ----------
    endog, exog : array_like
        Model data. `exog` may be None.
    missing : str
        'raise' or 'drop'. Only consulted when missing rows are found.
    **kwargs
        Extra arrays to keep aligned with endog/exog. None and 0-d values
        are passed through as None. Arrays that are 1-d (possibly after
        squeezing) are treated row-wise; other 2-d arrays are _assumed_ to
        be symmetric and are handed to ``cls._drop_nans_2d``.
        The special key ``missing_idx`` is a boolean row mask of the rows
        already removed from endog/exog (by patsy); when given, endog and
        exog are not scanned again and the extra arrays are expected to
        have one row per entry of the mask.

    Returns
    -------
    arrays : dict
        endog, exog and every kwargs key mapped to the (possibly reduced)
        array or None.
    dropped : list of int
        Row indices that were flagged as missing; empty when nothing is
        missing.

    Raises
    ------
    MissingDataError
        If rows are missing and missing == 'raise'.
    ValueError
        For extra arrays with more than 2 dimensions, for a length mismatch
        between `missing_idx` and the extra arrays, or for an unknown
        `missing` option when rows are missing.

    Notes
    -----
    The `missing_idx` mask passed in is not modified.
    """
    none_array_names = []

    # patsy's already dropped NaNs in y/X
    missing_idx = kwargs.pop('missing_idx', None)

    if missing_idx is not None:
        # y, X already handled by patsy. add back in later.
        combined = ()
        combined_names = []
        if exog is None:
            none_array_names += ['exog']
    elif exog is not None:
        combined = (endog, exog)
        combined_names = ['endog', 'exog']
    else:
        combined = (endog,)
        combined_names = ['endog']
        none_array_names += ['exog']

    # deal with other arrays
    combined_2d = ()
    combined_2d_names = []
    for key, value_array in kwargs.items():
        if value_array is None or value_array.ndim == 0:
            none_array_names += [key]
        elif value_array.ndim == 1 or value_array.squeeze().ndim == 1:
            # 1d arrays, including column/row vectors
            combined += (np.asarray(value_array),)
            combined_names += [key]
        elif value_array.ndim == 2:
            # 2d arrays that are _assumed_ to be symmetric
            combined_2d += (np.asarray(value_array),)
            combined_2d_names += [key]
        else:
            raise ValueError("Arrays with more than 2 dimensions "
                             "aren't yet handled")

    if missing_idx is not None:
        nan_mask = missing_idx
        # Mask over the rows that survived patsy (i.e. over the rows of
        # endog/exog) marking rows that the extra arrays additionally
        # require to be dropped. Always indexed with the untouched
        # ``~missing_idx`` so that both contributions have equal length.
        updated_row_mask = None
        if combined:  # there were extra arrays not handled by patsy
            combined_nans = _nan_rows(*combined)
            if combined_nans.shape[0] != nan_mask.shape[0]:
                raise ValueError("Shape mismatch between endog/exog "
                                 "and extra arrays given to model.")
            # for going back and updating endog/exog
            updated_row_mask = combined_nans[~missing_idx]
            # out-of-place OR: leaves the caller's missing_idx untouched
            nan_mask = nan_mask | combined_nans
        if combined_2d:
            # unpack, as at every other _nan_rows call site
            combined_2d_nans = _nan_rows(*combined_2d)
            if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                raise ValueError("Shape mismatch between endog/exog "
                                 "and extra 2d arrays given to model.")
            extra_rows = combined_2d_nans[~missing_idx]
            if updated_row_mask is not None:
                updated_row_mask = updated_row_mask | extra_rows
            else:
                updated_row_mask = extra_rows
            nan_mask = nan_mask | combined_2d_nans
    else:
        nan_mask = _nan_rows(*combined)
        if combined_2d:
            nan_mask = _nan_rows(nan_mask[:, None], *combined_2d)

    if not np.any(nan_mask):  # no missing don't do anything
        arrays = dict(zip(combined_names, combined))
        arrays.update(zip(combined_2d_names, combined_2d))
        arrays.update(dict.fromkeys(none_array_names))

        if missing_idx is not None:
            arrays['endog'] = endog
            if exog is not None:
                arrays['exog'] = exog

        return arrays, []

    if missing == 'raise':
        raise MissingDataError("NaNs were encountered in the data")

    if missing != 'drop':
        raise ValueError("missing option %s not understood" % missing)

    keep_mask = ~nan_mask
    arrays = {name: cls._drop_nans(arr, keep_mask)
              for name, arr in zip(combined_names, combined)}

    if missing_idx is not None:
        if updated_row_mask is not None:
            # update endog/exog with the rows the extra arrays ruled out
            keep_rows = ~updated_row_mask
            endog = cls._drop_nans(endog, keep_rows)
            if exog is not None:
                exog = cls._drop_nans(exog, keep_rows)

        arrays['endog'] = endog
        if exog is not None:
            arrays['exog'] = exog

    for name, arr in zip(combined_2d_names, combined_2d):
        arrays[name] = cls._drop_nans_2d(arr, keep_mask)
    arrays.update(dict.fromkeys(none_array_names))

    return arrays, np.where(nan_mask)[0].tolist()
def acovf(x, unbiased=False, demean=True, fft=None, missing='none', nlag=None):
    """
    Autocovariance for 1D

    Parameters
    ----------
    x : array
        Time series data. Must be 1d.
    unbiased : bool
        If True, then denominators is n-k, otherwise n
    demean : bool
        If True, then subtract the mean x from each element of x
    fft : bool
        If True, use FFT convolution.  This method should be preferred
        for long time series. If left as None, a FutureWarning is issued
        and False is used.
    missing : str
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how
        the NaNs are to be treated. 'none' performs no NaN check at all.
    nlag : {int, None}
        Limit the number of autocovariances returned.  Size of returned
        array is nlag + 1.  Setting nlag when fft is False uses a simple,
        direct estimator of the autocovariances that only computes the first
        nlag + 1 values. This can be much faster when the time series is long
        and only a small number of autocovariances are needed.

    Returns
    -------
    acovf : array
        autocovariance function

    Raises
    ------
    ValueError
        If x has more than one dimension after squeezing, if `missing` is
        not one of the accepted options, or if nlag exceeds len(x) - 1.
    MissingDataError
        If missing='raise' and x contains NaNs.

    References
    -----------
    .. [*] Parzen, E., 1963. On spectral analysis with missing observations
           and amplitude modulation. Sankhya: The Indian Journal of
           Statistics, Series A, pp.383-392.
    """
    if fft is None:
        import warnings
        msg = 'fft=True will become the default in a future version of ' \
              'statsmodels. To suppress this warning, explicitly set ' \
              'fft=False.'
        warnings.warn(msg, FutureWarning)
        fft = False

    # squeeze collapses a single observation to a 0-d array, which has no
    # len(); atleast_1d restores the 1-d shape in that case only.
    x = np.atleast_1d(np.squeeze(np.asarray(x)))
    if x.ndim > 1:
        raise ValueError("x must be 1d. Got %d dims." % x.ndim)

    missing = missing.lower()
    if missing not in ['none', 'raise', 'conservative', 'drop']:
        raise ValueError("missing option %s not understood" % missing)
    if missing == 'none':
        deal_with_masked = False
    else:
        deal_with_masked = has_missing(x)
    if deal_with_masked:
        if missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")
        notmask_bool = ~np.isnan(x)  # True where an observation is present
        if missing == 'conservative':
            # Must copy for thread safety
            x = x.copy()
            x[~notmask_bool] = 0
        else:  # 'drop'
            x = x[notmask_bool]  # copies non-missing
        notmask_int = notmask_bool.astype(int)

    if demean and deal_with_masked:
        # whether 'drop' or 'conservative': mean over observed values only
        xo = x - x.sum() / notmask_int.sum()
        if missing == 'conservative':
            # keep the missing positions at zero after demeaning
            xo[~notmask_bool] = 0
    elif demean:
        xo = x - x.mean()
    else:
        xo = x

    n = len(x)
    lag_len = nlag
    if nlag is None:
        lag_len = n - 1
    elif nlag > n - 1:
        raise ValueError('nlag must be smaller than nobs - 1')

    if not fft and nlag is not None:
        # direct estimator: one dot product per requested lag
        acov = np.empty(lag_len + 1)
        acov[0] = xo.dot(xo)
        for i in range(lag_len):
            acov[i + 1] = xo[i + 1:].dot(xo[:-(i + 1)])
        if not deal_with_masked or missing == 'drop':
            if unbiased:
                acov /= (n - np.arange(lag_len + 1))
            else:
                acov /= n
        else:
            if unbiased:
                # number of observed pairs available at each lag
                divisor = np.empty(lag_len + 1, dtype=np.int64)
                divisor[0] = notmask_int.sum()
                for i in range(lag_len):
                    divisor[i + 1] = notmask_int[i + 1:].dot(
                        notmask_int[:-(i + 1)])
                # the numerator is zero as well where no pair is observed
                divisor[divisor == 0] = 1
                acov /= divisor
            else:  # biased, missing data but not 'drop'
                acov /= notmask_int.sum()
        return acov

    # d holds the divisor for every lag of the 'full' correlation
    if unbiased and deal_with_masked and missing == 'conservative':
        d = np.correlate(notmask_int, notmask_int, 'full')
        d[d == 0] = 1
    elif unbiased:
        xi = np.arange(1, n + 1)
        d = np.hstack((xi, xi[:-1][::-1]))
    elif deal_with_masked:
        # biased and NaNs given and ('drop' or 'conservative')
        d = notmask_int.sum() * np.ones(2 * n - 1)
    else:
        # biased and no NaNs or missing=='none'
        d = n * np.ones(2 * n - 1)

    if fft:
        nobs = len(xo)
        n = _next_regular(2 * nobs + 1)
        Frf = np.fft.fft(xo, n=n)
        acov = np.fft.ifft(Frf * np.conjugate(Frf))[:nobs] / d[nobs - 1:]
        acov = acov.real
    else:
        acov = np.correlate(xo, xo, 'full')[n - 1:] / d[n - 1:]

    if nlag is not None:
        # Copy to allow gc of full array rather than view
        return acov[:lag_len + 1].copy()
    return acov
def handle_missing(cls, endog, exog, missing, **kwargs):
    """
    Align endog, exog and the extra arrays in kwargs on rows without NaNs.

    Returns a ``(arrays, dropped)`` pair. ``arrays`` is a dictionary with
    keys endog, exog and the keys of kwargs; entries that were None (or
    0-d) stay None. ``dropped`` lists the row positions flagged as missing
    and is empty when no row is missing.

    When rows are missing, ``missing == 'raise'`` raises MissingDataError,
    ``missing == 'drop'`` removes the rows through ``cls._drop_nans`` /
    ``cls._drop_nans_2d``, and any other value raises ValueError.
    """
    row_names = ['endog']
    row_arrays = (endog,)
    null_names = []
    if exog is None:
        null_names.append('exog')
    else:
        row_names.append('exog')
        row_arrays += (exog,)

    # extra arrays: None/0-d pass through, vectors are handled row-wise,
    # remaining 2d arrays are _assumed_ to be symmetric
    square_names = []
    square_arrays = ()
    for key, value_array in iteritems(kwargs):
        if value_array is None or value_array.ndim == 0:
            null_names.append(key)
        elif value_array.ndim == 1 or value_array.squeeze().ndim == 1:
            row_arrays += (value_array,)
            row_names.append(key)
        elif value_array.ndim == 2:
            square_arrays += (value_array,)
            square_names.append(key)
        else:
            raise ValueError("Arrays with more than 2 dimensions "
                             "aren't yet handled")

    nan_mask = _nan_rows(*row_arrays)
    if square_arrays:
        nan_mask = _nan_rows(nan_mask[:, None], *square_arrays)

    if not np.any(nan_mask):
        # nothing is missing: hand everything back untouched
        result = dict(zip(row_names, row_arrays))
        result.update(zip(square_names, square_arrays))
        result.update(dict.fromkeys(null_names))
        return result, []

    if missing == 'raise':
        raise MissingDataError("NaNs were encountered in the data")
    if missing != 'drop':
        raise ValueError("missing option %s not understood" % missing)

    keep_rows = ~nan_mask
    result = {}
    for name, arr in zip(row_names, row_arrays):
        result[name] = cls._drop_nans(arr, keep_rows)
    for name, arr in zip(square_names, square_arrays):
        result[name] = cls._drop_nans_2d(arr, keep_rows)
    for name in null_names:
        result[name] = None
    return result, np.where(~keep_rows)[0].tolist()