예제 #1
0
    def _init_dates(self, dates, freq):
        """
        Attach a date index and its frequency to ``self.data``.

        Parameters
        ----------
        dates : array_like or None
            Candidate dates. When None, ``self.data.row_labels`` is used
            instead.
        freq : object or None
            Frequency specification, used as a key into ``_freq_to_pandas``.
            When falsy and dates are available, it is inferred with
            ``datetools._infer_freq``.

        Raises
        ------
        ValueError
            If ``self.data`` is a ``PandasData`` whose labels are numeric or
            cannot be converted by ``to_datetime``, or if frequency inference
            fails.
        NotImplementedError
            If the frequency is nanoseconds and pandas is older than 0.14.

        Notes
        -----
        Sets ``self.data.dates`` and ``self.data.freq``.
        """
        if dates is None:
            dates = self.data.row_labels

        if dates is not None:
            if (not datetools._is_datetime_index(dates) and
                    isinstance(self.data, data.PandasData)):
                try:
                    # Numeric labels are rejected through the same handler as
                    # a failed conversion so both report one message.
                    if is_numeric_dtype(dates):
                        raise ValueError
                    dates = to_datetime(dates)
                except ValueError:
                    raise ValueError("Given a pandas object and the index does "
                                     "not contain dates")
            if not freq:
                try:
                    freq = datetools._infer_freq(dates)
                except Exception:
                    # Any inference failure is reported uniformly, but
                    # KeyboardInterrupt / SystemExit are allowed to propagate.
                    raise ValueError("Frequency inference failed. Use `freq` "
                                     "keyword.")

            if isinstance(dates[0], datetime.datetime):
                dates = DatetimeIndex(dates)
            else:  # preserve PeriodIndex
                dates = PeriodIndex(dates)
        self.data.dates = dates
        self.data.freq = freq

        # Test for nanoseconds in early pandas versions
        if freq is not None and _freq_to_pandas[freq].freqstr == 'N':
            from distutils.version import LooseVersion
            from pandas import __version__ as pd_version
            if LooseVersion(pd_version) < '0.14':
                raise NotImplementedError('Nanosecond index not available in'
                                          ' Pandas < 0.14')
예제 #2
0
    def _init_dates(self, dates, freq):
        """
        Attach a date index and its frequency to ``self.data``.

        Parameters
        ----------
        dates : array_like or None
            Candidate dates. When None, ``self.data.row_labels`` is used
            instead.
        freq : object or None
            Frequency specification, used as a key into ``_freq_to_pandas``.
            When falsy and dates are available, it is inferred with
            ``datetools._infer_freq``.

        Raises
        ------
        ValueError
            If ``self.data`` is a ``PandasData`` whose labels are numeric or
            cannot be converted by ``to_datetime``, or if frequency inference
            fails.
        NotImplementedError
            If the frequency is nanoseconds and pandas is older than 0.14.

        Notes
        -----
        Sets ``self.data.dates`` and ``self.data.freq``.
        """
        if dates is None:
            dates = self.data.row_labels

        if dates is not None:
            needs_coercion = (not datetools._is_datetime_index(dates) and
                              isinstance(self.data, data.PandasData))
            if needs_coercion:
                try:
                    # Numeric labels are rejected through the same handler as
                    # a failed conversion so both report one message.
                    if is_numeric_dtype(dates):
                        raise ValueError
                    dates = to_datetime(dates)
                except ValueError:
                    raise ValueError("Given a pandas object and the index does "
                                     "not contain dates")
            if not freq:
                try:
                    freq = datetools._infer_freq(dates)
                except Exception:
                    # Any inference failure is reported uniformly, but
                    # KeyboardInterrupt / SystemExit are allowed to propagate.
                    raise ValueError("Frequency inference failed. Use `freq` "
                                     "keyword.")

            if isinstance(dates[0], datetime.datetime):
                dates = DatetimeIndex(dates)
            else:  # preserve PeriodIndex
                dates = PeriodIndex(dates)
        self.data.dates = dates
        self.data.freq = freq

        # Test for nanoseconds in early pandas versions
        if freq is not None and _freq_to_pandas[freq].freqstr == 'N':
            from distutils.version import LooseVersion
            from pandas import __version__ as pd_version
            if LooseVersion(pd_version) < '0.14':
                raise NotImplementedError('Nanosecond index not available in'
                                          ' Pandas < 0.14')
예제 #3
0
    def _init_dates(self, dates, freq):
        """
        Attach a date index and its frequency to ``self.data``.

        Parameters
        ----------
        dates : array_like or None
            Candidate dates. When None, ``self.data.row_labels`` is used
            instead.
        freq : object or None
            Frequency specification. When falsy and dates are available, it
            is inferred with ``datetools._infer_freq``.

        Raises
        ------
        ValueError
            If ``self.data`` is a ``PandasData`` whose labels are numeric or
            cannot be converted by ``to_datetime``, or if frequency inference
            fails.

        Notes
        -----
        Sets ``self.data.dates`` and ``self.data.freq``.
        """
        if dates is None:
            dates = self.data.row_labels

        if dates is not None:
            if (not datetools._is_datetime_index(dates) and
                    isinstance(self.data, data.PandasData)):
                try:
                    # Numeric labels are rejected through the same handler as
                    # a failed conversion so both report one message.
                    if is_numeric_dtype(dates):
                        raise ValueError
                    dates = to_datetime(dates)
                except ValueError:
                    raise ValueError("Given a pandas object and the index does "
                                     "not contain dates")
            if not freq:
                try:
                    freq = datetools._infer_freq(dates)
                except Exception:
                    # Any inference failure is reported uniformly, but
                    # KeyboardInterrupt / SystemExit are allowed to propagate.
                    raise ValueError("Frequency inference failed. Use `freq` "
                                     "keyword.")

            if isinstance(dates[0], datetime.datetime):
                dates = DatetimeIndex(dates)
            else:  # preserve PeriodIndex
                dates = PeriodIndex(dates)
        self.data.dates = dates
        self.data.freq = freq
예제 #4
0
    def _init_dates(self, dates, freq):
        """
        Attach a date index and its frequency to ``self.data``.

        Parameters
        ----------
        dates : array_like or None
            Candidate dates. When None, ``self.data.row_labels`` is used
            instead.
        freq : object or None
            Frequency specification. When falsy and dates are available, it
            is inferred with ``datetools._infer_freq``.

        Raises
        ------
        ValueError
            If ``self.data`` is a ``PandasData`` whose labels are numeric or
            cannot be converted by ``to_datetime``, or if frequency inference
            fails.

        Notes
        -----
        Sets ``self.data.dates`` and ``self.data.freq``.
        """
        if dates is None:
            dates = self.data.row_labels

        if dates is not None:
            if (not datetools._is_datetime_index(dates)
                    and isinstance(self.data, data.PandasData)):
                try:
                    # Numeric labels are rejected through the same handler as
                    # a failed conversion so both report one message.
                    if is_numeric_dtype(dates):
                        raise ValueError
                    dates = to_datetime(dates)
                except ValueError:
                    raise ValueError(
                        "Given a pandas object and the index does "
                        "not contain dates")
            if not freq:
                try:
                    freq = datetools._infer_freq(dates)
                except Exception:
                    # Any inference failure is reported uniformly, but
                    # KeyboardInterrupt / SystemExit are allowed to propagate.
                    raise ValueError("Frequency inference failed. Use `freq` "
                                     "keyword.")

            if isinstance(dates[0], datetime.datetime):
                dates = DatetimeIndex(dates)
            else:  # preserve PeriodIndex
                dates = PeriodIndex(dates)
        self.data.dates = dates
        self.data.freq = freq
예제 #5
0
    def _init_dates(self, dates=None, freq=None):
        """
        Initialize dates

        Parameters
        ----------
        dates : array_like, optional
            An array like object containing dates.
        freq : str, tuple, datetime.timedelta, DateOffset or None, optional
            A frequency specification for either `dates` or the row labels from
            the endog / exog data.

        Raises
        ------
        ValueError
            If `freq` is given without any index; if `dates` is given but
            cannot be coerced to a date index; if `dates` is a date index
            with no frequency and none can be inferred; if `freq` cannot be
            matched to, or is incompatible with, the index; or if `freq` is
            given but the index is not date-based.

        Warns
        -----
        ValueWarning
            If a frequency was inferred, if the index is unsupported, or if a
            date index has no frequency or is not monotonic.

        Notes
        -----
        Creates `self._index` and related attributes. `self._index` is always
        a Pandas index, and it is always Int64Index, DatetimeIndex, or
        PeriodIndex.

        If Pandas objects, endog / exog may have any type of index. If it is
        an Int64Index with values 0, 1, ..., nobs-1 or if it is (coerceable to)
        a DatetimeIndex or PeriodIndex *with an associated frequency*, then it
        is called a "supported" index. Otherwise it is called an "unsupported"
        index.

        Supported indexes are standardized (i.e. a list of date strings is
        converted to a DatetimeIndex) and the result is put in `self._index`.

        Unsupported indexes are ignored, and a supported Int64Index is
        generated and put in `self._index`. Warnings are issued in this case
        to alert the user if the returned index from some operation (e.g.
        forecasting) is different from the original data's index. However,
        whenever possible (e.g. purely in-sample prediction), the original
        index is returned.

        The benefit of supported indexes is that they allow *forecasting*, i.e.
        it is possible to extend them in a reasonable way. Thus every model
        must have an underlying supported index, even if it is just a generated
        Int64Index.

        """

        # Get our index from `dates` if available, otherwise from whatever
        # Pandas index we might have retrieved from endog, exog
        if dates is not None:
            index = dates
        else:
            index = self.data.row_labels

        # Sanity check that we do not have a `freq` without an index
        if index is None and freq is not None:
            raise ValueError('Frequency provided without associated index.')

        # If an index is available, see if it is a date-based index or if it
        # can be coerced to one. (If it cannot we'll fall back, below, to an
        # internal, 0, 1, ... nobs-1 integer index for modeling purposes)
        inferred_freq = False
        if index is not None:
            # Try to coerce to date-based index
            if not isinstance(index, (DatetimeIndex, PeriodIndex)):
                try:
                    # Only try to coerce non-numeric index types (string,
                    # list of date-times, etc.)
                    # Note that np.asarray(Float64Index([...])) yields an
                    # object dtype array in earlier versions of Pandas (and so
                    # will not have is_numeric_dtype == True), so explicitly
                    # check for it here. But note also that in very early
                    # Pandas (~0.12), Float64Index does not exist (and so the
                    # statsmodels compat makes it an empty tuple, so in that
                    # case also check if the first element is a float.
                    _index = np.asarray(index)
                    if (is_numeric_dtype(_index)
                            or isinstance(index, Float64Index)
                            or (Float64Index == tuple()
                                and isinstance(_index[0], float))):
                        raise ValueError('Numeric index given')
                    # If a non-index Pandas series was given, only keep its
                    # values (because we must have a pd.Index type, below, and
                    # pd.to_datetime will return a Series when passed
                    # non-list-like objects)
                    if isinstance(index, Series):
                        index = index.values
                    # All coercion is done via pd.to_datetime
                    # Note: date coercion via pd.to_datetime does not handle
                    # string versions of PeriodIndex objects most of the time.
                    _index = to_datetime(index)
                    # Older versions of Pandas can sometimes fail here and
                    # return a numpy array - check to make sure it's an index
                    if not isinstance(_index, Index):
                        raise ValueError('Could not coerce to date index')
                    index = _index
                except Exception:
                    # Catch `Exception` rather than everything so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    # Only want to actually raise an exception if `dates` was
                    # provided but cannot be coerced. If we got the index from
                    # the row_labels, we'll just ignore it and use the integer
                    # index below
                    if dates is not None:
                        raise ValueError('Non-date index index provided to'
                                         ' `dates` argument.')
            # Now, if we were given, or coerced, a date-based index, make sure
            # it has an associated frequency
            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                # If no frequency, try to get an inferred frequency
                if freq is None and index.freq is None:
                    freq = index.inferred_freq
                    # If we got an inferred frequency, alert the user
                    if freq is not None:
                        inferred_freq = True
                        warnings.warn(
                            'No frequency information was'
                            ' provided, so inferred frequency %s'
                            ' will be used.' % freq, ValueWarning)

                # Convert the passed freq to a pandas offset object
                if freq is not None:
                    freq = to_offset(freq)

                # Now, if no frequency information is available from the index
                # itself or from the `freq` argument, raise an exception
                if freq is None and index.freq is None:
                    # But again, only want to raise the exception if `dates`
                    # was provided.
                    if dates is not None:
                        raise ValueError('No frequency information was'
                                         ' provided with date index and no'
                                         ' frequency could be inferred.')
                # However, if the index itself has no frequency information but
                # the `freq` argument is available (or was inferred), construct
                # a new index with an associated frequency
                elif freq is not None and index.freq is None:
                    resampled_index = date_range(start=index[0],
                                                 end=index[-1],
                                                 freq=freq)
                    if not inferred_freq and not resampled_index.equals(index):
                        raise ValueError('The given frequency argument could'
                                         ' not be matched to the given index.')
                    index = resampled_index
                # Finally, if the index itself has a frequency and there was
                # also a given frequency, raise an exception if they are not
                # equal
                elif (freq is not None and not inferred_freq
                      and not (index.freq == freq)):
                    raise ValueError('The given frequency argument is'
                                     ' incompatible with the given index.')
            # Finally, raise an exception if we could not coerce to date-based
            # but we were given a frequency argument
            elif freq is not None:
                raise ValueError('Given index could not be coerced to dates'
                                 ' but `freq` argument was provided.')

        # Get attributes of the index
        has_index = index is not None
        date_index = isinstance(index, (DatetimeIndex, PeriodIndex))
        int_index = isinstance(index, Int64Index)
        range_index = isinstance(index, RangeIndex)
        has_freq = index.freq is not None if date_index else None
        increment = Index(range(self.endog.shape[0]))
        is_increment = index.equals(increment) if int_index else None
        is_monotonic = index.is_monotonic if date_index else None

        # Issue warnings for unsupported indexes
        if has_index and not (date_index or range_index or is_increment):
            warnings.warn(
                'An unsupported index was provided and will be'
                ' ignored when e.g. forecasting.', ValueWarning)
        if date_index and not has_freq:
            warnings.warn(
                'A date index has been provided, but it has no'
                ' associated frequency information and so will be'
                ' ignored when e.g. forecasting.', ValueWarning)
        if date_index and not is_monotonic:
            warnings.warn(
                'A date index has been provided, but it is not'
                ' monotonic and so will be ignored when e.g.'
                ' forecasting.', ValueWarning)

        # Construct the internal index
        index_generated = False
        valid_index = ((date_index and has_freq and is_monotonic)
                       or (int_index and is_increment) or range_index)

        if valid_index:
            _index = index
        else:
            _index = increment
            index_generated = True
        self._index = _index
        self._index_generated = index_generated
        self._index_none = index is None
        self._index_int64 = int_index and not range_index and not date_index
        self._index_dates = date_index and not index_generated
        self._index_freq = self._index.freq if self._index_dates else None
        self._index_inferred_freq = inferred_freq

        # For backwards compatibility, set data.dates, data.freq
        self.data.dates = self._index if self._index_dates else None
        self.data.freq = self._index.freqstr if self._index_dates else None
예제 #6
0
    def __init__(
        self,
        data: Union[np.ndarray, pd.Series, pd.DataFrame],
        stats: Sequence[str] = None,
        *,
        numeric: bool = True,
        categorical: bool = True,
        alpha: float = 0.05,
        use_t: bool = False,
        percentiles: Sequence[Union[int, float]] = PERCENTILES,
        ntop: int = 5,
    ):
        """
        Validate the inputs and record which statistics to compute.

        Parameters
        ----------
        data : array_like
            Data to describe. Non-pandas input is validated with
            ``array_like`` (at most 2-d); 1-d input is wrapped in a Series.
        stats : Sequence[str], optional
            Names of the statistics to compute. Every name must appear in
            ``DEFAULT_STATISTICS``. When None, all default statistics are
            used.
        numeric : bool
            Whether numeric columns are selected.
        categorical : bool
            Whether columns with "category" dtype are selected.
        alpha : float
            Must be strictly between 0 and 1. Stored as ``self._alpha``.
        use_t : bool
            Stored as ``self._use_t``.
        percentiles : Sequence[{int, float}]
            Distinct values strictly between 0 and 100.
        ntop : int
            Number of ``top_i`` / ``freq_i`` entries generated for the "top"
            and "freq" statistics.

        Raises
        ------
        ValueError
            If both ``numeric`` and ``categorical`` are False, if the
            selection leaves no columns, if ``stats`` contains unknown names,
            if "top" is requested with ``ntop <= 0`` while categorical
            columns are present, if ``percentiles`` are repeated or outside
            (0, 100), or if ``alpha`` is outside (0, 1).
        """
        data_arr = data
        if not isinstance(data, (pd.Series, pd.DataFrame)):
            data_arr = array_like(data, "data", maxdim=2)
        if data_arr.ndim == 1:
            data = pd.Series(data)
        numeric = bool_like(numeric, "numeric")
        categorical = bool_like(categorical, "categorical")
        include = []
        col_types = ""
        if numeric:
            include.append(np.number)
            col_types = "numeric"
        if categorical:
            include.append("category")
            # Leading space so the joined text reads "numeric and categorical"
            col_types += " and " if col_types != "" else ""
            col_types += "categorical"
        if not numeric and not categorical:
            raise ValueError(
                "At least one of numeric and categorical must be True"
            )
        self._data = pd.DataFrame(data).select_dtypes(include)
        if self._data.shape[1] == 0:
            # f-string so that the selected column types are interpolated
            raise ValueError(
                f"Selecting {col_types} results in an empty DataFrame"
            )
        self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
        self._is_cat_like = [
            is_categorical_dtype(dt) for dt in self._data.dtypes
        ]

        if stats is not None:
            undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
            if undef:
                raise ValueError(
                    f"{', '.join(undef)} are not known statistics"
                )
        self._stats = (
            list(DEFAULT_STATISTICS) if stats is None else list(stats)
        )
        self._ntop = int_like(ntop, "ntop")
        self._compute_top = "top" in self._stats
        self._compute_freq = "freq" in self._stats
        # Only enforce ntop when "top" is requested and at least one selected
        # column is categorical.
        if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
            raise ValueError("top must be a non-negative integer")

        self._compute_perc = "percentiles" in self._stats
        self._percentiles = array_like(
            percentiles, "percentiles", maxdim=1, dtype="d"
        )
        self._percentiles = np.sort(self._percentiles)
        if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
            raise ValueError("percentiles must be distinct")
        if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
            raise ValueError("percentiles must be strictly between 0 and 100")

        # Expand special stats
        replacements = {
            "mode": ["mode", "mode_freq"],
            "ci": ["upper_ci", "lower_ci"],
            "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
            "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
            "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
            "percentiles": [f"{i}%" for i in percentiles],
        }

        # Splice each expansion in at the position of its placeholder name
        for key in replacements:
            if key in self._stats:
                idx = self._stats.index(key)
                self._stats = (
                    self._stats[:idx]
                    + replacements[key]
                    + self._stats[idx + 1 :]
                )

        self._alpha = float_like(alpha, "alpha")
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1")
        self._use_t = bool_like(use_t, "use_t")
예제 #7
0
    def _init_dates(self, dates=None, freq=None):
        """
        Initialize dates

        Parameters
        ----------
        dates : array_like, optional
            An array like object containing dates.
        freq : str, tuple, datetime.timedelta, DateOffset or None, optional
            A frequency specification for either `dates` or the row labels from
            the endog / exog data.

        Raises
        ------
        ValueError
            If `freq` is given without any index; if `dates` is given but
            cannot be coerced to a date index; if `dates` is a date index
            with no frequency and none can be inferred; if `freq` cannot be
            matched to, or is incompatible with, the index; or if `freq` is
            given but the index is not date-based.

        Warns
        -----
        ValueWarning
            If a frequency was inferred, if the index is unsupported, or if a
            date index has no associated frequency.

        Notes
        -----
        Creates `self._index` and related attributes. `self._index` is always
        a Pandas index, and it is always Int64Index, DatetimeIndex, or
        PeriodIndex.

        If Pandas objects, endog / exog may have any type of index. If it is
        an Int64Index with values 0, 1, ..., nobs-1 or if it is (coerceable to)
        a DatetimeIndex or PeriodIndex *with an associated frequency*, then it
        is called a "supported" index. Otherwise it is called an "unsupported"
        index.

        Supported indexes are standardized (i.e. a list of date strings is
        converted to a DatetimeIndex) and the result is put in `self._index`.

        Unsupported indexes are ignored, and a supported Int64Index is
        generated and put in `self._index`. Warnings are issued in this case
        to alert the user if the returned index from some operation (e.g.
        forecasting) is different from the original data's index. However,
        whenever possible (e.g. purely in-sample prediction), the original
        index is returned.

        The benefit of supported indexes is that they allow *forecasting*, i.e.
        it is possible to extend them in a reasonable way. Thus every model
        must have an underlying supported index, even if it is just a generated
        Int64Index.

        """

        # Get our index from `dates` if available, otherwise from whatever
        # Pandas index we might have retrieved from endog, exog
        if dates is not None:
            index = dates
        else:
            index = self.data.row_labels

        # Sanity check that we don't have a `freq` without an index
        if index is None and freq is not None:
            raise ValueError('Frequency provided without associated index.')

        # If an index is available, see if it is a date-based index or if it
        # can be coerced to one. (If it can't we'll fall back, below, to an
        # internal, 0, 1, ... nobs-1 integer index for modeling purposes)
        inferred_freq = False
        if index is not None:
            # Try to coerce to date-based index
            if not isinstance(index, (DatetimeIndex, PeriodIndex)):
                try:
                    # Only try to coerce non-numeric index types (string,
                    # list of date-times, etc.)
                    # Note that np.asarray(Float64Index([...])) yields an
                    # object dtype array in earlier versions of Pandas (and so
                    # will not have is_numeric_dtype == True), so explicitly
                    # check for it here. But note also that in very early
                    # Pandas (~0.12), Float64Index doesn't exist (and so the
                    # Statsmodels compat makes it an empty tuple, so in that
                    # case also check if the first element is a float.
                    _index = np.asarray(index)
                    if (is_numeric_dtype(_index) or
                            isinstance(index, Float64Index) or
                            (Float64Index == tuple() and
                             isinstance(_index[0], float))):
                        raise ValueError('Numeric index given')
                    # If a non-index Pandas series was given, only keep its
                    # values (because we must have a pd.Index type, below, and
                    # pd.to_datetime will return a Series when passed
                    # non-list-like objects)
                    if isinstance(index, Series):
                        index = index.values
                    # All coercion is done via pd.to_datetime
                    # Note: date coercion via pd.to_datetime does not handle
                    # string versions of PeriodIndex objects most of the time.
                    _index = to_datetime(index)
                    # Older versions of Pandas can sometimes fail here and
                    # return a numpy array - check to make sure it's an index
                    if not isinstance(_index, Index):
                        raise ValueError('Could not coerce to date index')
                    index = _index
                except Exception:
                    # Catch `Exception` rather than everything so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    # Only want to actually raise an exception if `dates` was
                    # provided but can't be coerced. If we got the index from
                    # the row_labels, we'll just ignore it and use the integer
                    # index below
                    if dates is not None:
                        raise ValueError('Non-date index index provided to'
                                         ' `dates` argument.')
            # Now, if we were given, or coerced, a date-based index, make sure
            # it has an associated frequency
            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                # If no frequency, try to get an inferred frequency
                if freq is None and index.freq is None:
                    freq = index.inferred_freq
                    # If we got an inferred frequency, alert the user
                    if freq is not None:
                        inferred_freq = True
                        warnings.warn('No frequency information was'
                                      ' provided, so inferred frequency %s'
                                      ' will be used.'
                                      % freq, ValueWarning)

                # Convert the passed freq to a pandas offset object
                if freq is not None:
                    freq = to_offset(freq)

                # Now, if no frequency information is available from the index
                # itself or from the `freq` argument, raise an exception
                if freq is None and index.freq is None:
                    # But again, only want to raise the exception if `dates`
                    # was provided.
                    if dates is not None:
                        raise ValueError('No frequency information was'
                                         ' provided with date index and no'
                                         ' frequency could be inferred.')
                # However, if the index itself has no frequency information but
                # the `freq` argument is available (or was inferred), construct
                # a new index with an associated frequency
                elif freq is not None and index.freq is None:
                    resampled_index = date_range(
                        start=index[0], end=index[-1], freq=freq)
                    if not inferred_freq and not resampled_index.equals(index):
                        raise ValueError('The given frequency argument could'
                                         ' not be matched to the given index.')
                    index = resampled_index
                # Finally, if the index itself has a frequency and there was
                # also a given frequency, raise an exception if they are not
                # equal
                elif (freq is not None and not inferred_freq and
                        not (index.freq == freq)):
                    raise ValueError('The given frequency argument is'
                                     ' incompatible with the given index.')
            # Finally, raise an exception if we could not coerce to date-based
            # but we were given a frequency argument
            elif freq is not None:
                raise ValueError('Given index could not be coerced to dates'
                                 ' but `freq` argument was provided.')

        # Get attributes of the index
        has_index = index is not None
        date_index = isinstance(index, (DatetimeIndex, PeriodIndex))
        int_index = isinstance(index, Int64Index)
        range_index = isinstance(index, RangeIndex)
        has_freq = index.freq is not None if date_index else None
        increment = Index(range(self.endog.shape[0]))
        is_increment = index.equals(increment) if int_index else None

        # Issue warnings for unsupported indexes
        if has_index and not (date_index or range_index or is_increment):
            warnings.warn('An unsupported index was provided and will be'
                          ' ignored when e.g. forecasting.', ValueWarning)
        if date_index and not has_freq:
            warnings.warn('A date index has been provided, but it has no'
                          ' associated frequency information and so will be'
                          ' ignored when e.g. forecasting.', ValueWarning)

        # Construct the internal index
        index_generated = False

        if ((date_index and has_freq) or (int_index and is_increment) or
                range_index):
            _index = index
        else:
            _index = increment
            index_generated = True
        self._index = _index
        self._index_generated = index_generated
        self._index_none = index is None
        self._index_dates = date_index and not index_generated
        self._index_freq = self._index.freq if self._index_dates else None
        self._index_inferred_freq = inferred_freq

        # For backwards compatibility, set data.dates, data.freq
        self.data.dates = self._index if self._index_dates else None
        self.data.freq = self._index.freqstr if self._index_dates else None