def test_pandas_nodates_index():
    """A nanosecond-coercible index is accepted, with a frequency warning."""
    values = [988, 819, 964]
    labels = ['a', 'b', 'c']
    s = pd.Series(values, index=labels)
    # TODO: Remove this, this is now valid
    # npt.assert_raises(ValueError, TimeSeriesModel, s)

    # Integers passed to `pd.to_datetime` become a DatetimeIndex whose first
    # element is checked below to sit 100ns after the epoch, so this index
    # must not make the model raise.
    ns_index = pd.to_datetime([100, 101, 102])
    s = pd.Series([988, 819, 964], index=ns_index)

    first = ns_index[0]
    rendered = first.strftime('%Y-%m-%d %H:%M:%S.%f') + str(first.value)
    assert_equal(rendered, '1970-01-01 00:00:00.000000100')

    # The index carries no frequency, so construction is expected to warn.
    with pytest.warns(ValueWarning, match="No frequency information"):
        mod = TimeSeriesModel(s)

    # Unpacking also checks that four values are returned.
    _start, _end, _out_of_sample, _ = mod._get_prediction_index(0, 4)
    assert_equal(len(mod.data.predict_dates), 5)
def test_pandas_nodates_index():
    """Check that a nanosecond-coercible non-date index is accepted.

    The nanosecond part of the test only runs when `pd.to_offset('N')`
    succeeds; otherwise it is silently skipped (best-effort probe).
    """
    data = [988, 819, 964]
    dates = ['a', 'b', 'c']
    s = pd.Series(data, index=dates)

    # TODO: Remove this, this is now valid
    # npt.assert_raises(ValueError, TimeSeriesModel, s)

    # Test with a non-date index that doesn't raise an exception because it
    # can be coerced into a nanosecond DatetimeIndex
    # (This test doesn't make sense for Numpy < 1.7 since they don't have
    # nanosecond support)
    # (This test also doesn't make sense for Pandas < 0.14 since we don't
    # support nanosecond index in Pandas < 0.14)
    try:
        # Check for Numpy < 1.7
        pd.to_offset('N')
    except Exception:
        # Deliberate best-effort skip.  `Exception` (rather than a bare
        # `except:`) keeps KeyboardInterrupt / SystemExit propagating.
        pass
    else:
        data = [988, 819, 964]
        # index=pd.date_range('1970-01-01', periods=3, freq='QS')
        index = pd.to_datetime([100, 101, 102])
        s = pd.Series(data, index=index)

        actual_str = (index[0].strftime('%Y-%m-%d %H:%M:%S.%f') +
                      str(index[0].value))
        assert_equal(actual_str, '1970-01-01 00:00:00.000000100')

        mod = TimeSeriesModel(s)
        # Unpacking also checks that four values are returned.
        start, end, out_of_sample, _ = mod._get_prediction_index(0, 4)
        assert_equal(len(mod.data.predict_dates), 5)
def test_period_index():
    """A monthly PeriodIndex must be reported with frequency "M"."""
    # test 1285

    # `pd.period_range` replaces the deprecated (and later removed)
    # `pd.PeriodIndex(start=..., periods=..., freq=...)` constructor form.
    dates = pd.period_range(start="1/1/1990", periods=20, freq="M")
    x = np.arange(1, 21.)

    model = TimeSeriesModel(pd.Series(x, index=dates))
    assert_equal(model._index.freqstr, "M")
    model = TimeSeriesModel(pd.Series(x, index=dates))
    npt.assert_(model.data.freq == "M")
def test_period_index():
    """Models built on a monthly PeriodIndex report frequency "M"."""
    # test 1285
    # NOTE(review): `TimeSeries` and `PeriodIndex(start=...)` look like
    # legacy pandas API -- confirm the pandas version this test targets.
    from pandas import PeriodIndex, TimeSeries

    monthly = PeriodIndex(start="1/1/1990", periods=20, freq="M")
    values = np.arange(1, 21.)

    from_series = TimeSeriesModel(Series(values, index=monthly))
    npt.assert_(from_series.data.freq == "M")

    from_timeseries = TimeSeriesModel(TimeSeries(values, index=monthly))
    npt.assert_(from_timeseries.data.freq == "M")
def test_get_predict_start_end():
    """Equivalent start/end spellings must give the same prediction index."""
    full_index = pd.date_range(start='1970-01-01', end='1990-01-01',
                               freq='AS')
    # Only the first ten dates are in-sample.
    model = TimeSeriesModel(pd.Series(np.zeros(10), full_index[:10]))

    # The same start / end expressed as position, string, datetime and
    # index element.
    start_forms = [1, '1971-01-01', datetime(1971, 1, 1), full_index[1]]
    end_forms = [20, '1990-01-01', datetime(1990, 1, 1), full_index[-1]]
    combos = [(first, last) for first in start_forms for last in end_forms]

    expected = (1, 9, 11)
    for first, last in combos:
        observed = model._get_prediction_index(first, last)[:3]
        assert_equal(observed, expected)
def test_get_predict_start_end():
    """Equivalent start/end spellings must give the same prediction index."""
    # `pd.date_range` replaces the deprecated (and later removed)
    # `pd.DatetimeIndex(start=..., end=..., freq=...)` constructor form.
    index = pd.date_range(start='1970-01-01', end='1990-01-01', freq='AS')
    # Only the first ten dates are in-sample.
    endog = pd.Series(np.zeros(10), index[:10])
    model = TimeSeriesModel(endog)

    # The same start / end expressed as position, string, datetime and
    # index element.
    predict_starts = [1, '1971-01-01', datetime(1971, 1, 1), index[1]]
    predict_ends = [20, '1990-01-01', datetime(1990, 1, 1), index[-1]]

    desired = (1, 9, 11)
    for start in predict_starts:
        for end in predict_ends:
            assert_equal(model._get_prediction_index(start, end)[:3], desired)
def test_keyerror_start_date():
    """A start date before the first observation must raise KeyError."""
    values = np.arange(1, 36.)
    # dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    annual = pd.date_range("1972-4-30", "2006-4-30", freq="A-APR")
    model = TimeSeriesModel(pd.Series(values, index=annual))

    # "1970-4-30" precedes the first index entry (1972-04-30).
    with npt.assert_raises(KeyError):
        model._get_prediction_index("1970-4-30", None)
def test_keyerror_start_date():
    """A start date before the first observation must raise ValueError."""
    values = np.arange(1, 36.)

    # Only the index construction differs between the two pandas APIs.
    if _pandas_08x:
        from pandas import date_range
        # there's a bug in pandas up to 0.10.2 for YearBegin
        #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
        index = date_range("1972-4-30", "2006-4-30", freq="A-APR")
    else:
        from pandas import DateRange, datetools
        index = DateRange("1972-1-1", "2007-1-1", offset=datetools.yearEnd)

    model = TimeSeriesModel(Series(values, index=index))

    npt.assert_raises(ValueError, model._get_predict_start, "1970-4-30")
def test_keyerror_start_date():
    """A start date before the first observation must raise ValueError."""
    values = np.arange(1, 36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    annual = pd.date_range("1972-4-30", "2006-4-30", freq="A-APR")
    model = TimeSeriesModel(pd.Series(values, index=annual))

    # "1970-4-30" precedes the first index entry (1972-04-30).
    too_early = "1970-4-30"
    npt.assert_raises(ValueError, model._get_predict_start, too_early)
def test_predict_freq():
    """Predicted dates must have the same frequency as the model index."""
    values = np.arange(1,36.)

    if not _pandas_08x:
        # Path for the pandas API that still provides DateRange / datetools.
        from pandas import DateRange, datetools
        index = DateRange("1972-1-1", "2007-1-1", offset=datetools.yearEnd)
        model = TimeSeriesModel(Series(values, index=index))
        npt.assert_(model.data.freq == "A")

        # Return values are not needed; the calls are kept because
        # `_make_predict_dates` takes no arguments.
        model._get_predict_start("2006-12-31")
        model._get_predict_end("2016-12-31")
        model._make_predict_dates()

        predicted = model.data.predict_dates
        expected = DateRange("2006-12-31", "2016-12-31",
                             offset=datetools.yearEnd)
        npt.assert_array_equal(predicted, expected)
        return

    from pandas import date_range
    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    index = date_range("1972-4-30", "2006-4-30", freq="A-APR")
    model = TimeSeriesModel(Series(values, index=index))
    #npt.assert_(model.data.freq == "AS-APR")
    npt.assert_(model.data.freq == "A-APR")

    model._get_predict_start("2006-4-30")
    model._get_predict_end("2016-4-30")
    model._make_predict_dates()

    predicted = model.data.predict_dates
    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    expected = date_range("2006-4-30", "2016-4-30", freq="A-APR")
    npt.assert_equal(predicted, expected)
    #ptesting.assert_series_equal(predict_dates, expected_dates)
def test_predict_freq():
    """Predicted dates must have the same frequency as the model index."""
    values = np.arange(1,36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    annual = pd.date_range("1972-4-30", "2006-4-30", freq="A-APR")
    model = TimeSeriesModel(pd.Series(values, index=annual))
    #npt.assert_(model.data.freq == "AS-APR")
    assert_equal(model._index.freqstr, "A-APR")

    # Unpacking also checks that four values are returned.
    prediction_index = model._get_prediction_index("2006-4-30", "2016-4-30")
    _start, _end, _out_of_sample, _ = prediction_index

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    expected = pd.date_range("2006-4-30", "2016-4-30", freq="A-APR")
    assert_equal(model.data.predict_dates, expected)
def test_predict_freq():
    """Predicted dates must have the same frequency as the model index."""
    freq = "A-APR"
    observations = np.arange(1, 36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    in_sample = pd.date_range("1972-4-30", "2006-4-30", freq=freq)
    endog = pd.Series(observations, index=in_sample)
    model = TimeSeriesModel(endog)

    #npt.assert_(model.data.freq == "AS-APR")
    assert_equal(model._index.freqstr, freq)

    # Unpacking also checks that four values are returned.
    _first, _last, _extra, _ = model._get_prediction_index(
        "2006-4-30", "2016-4-30")
    predicted = model.data.predict_dates

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    wanted = pd.date_range("2006-4-30", "2016-4-30", freq=freq)
    assert_equal(predicted, wanted)
def test_predict_freq():
    """Predicted dates must have the same frequency as the model index."""
    values = np.arange(1,36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    annual = date_range("1972-4-30", "2006-4-30", freq="A-APR")
    model = TimeSeriesModel(Series(values, index=annual))
    #npt.assert_(model.data.freq == "AS-APR")
    npt.assert_(model.data.freq == "A-APR")

    # Return values are not needed; the calls are kept because
    # `_make_predict_dates` takes no arguments.
    model._get_predict_start("2006-4-30")
    model._get_predict_end("2016-4-30")
    model._make_predict_dates()

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    expected = date_range("2006-4-30", "2016-4-30", freq="A-APR")
    assert_equal(model.data.predict_dates, expected)
def test_pandas_nodates_index():
    """A nanosecond-coercible non-date index must be accepted."""
    observations = [988, 819, 964]
    s = pd.Series(observations, index=['a', 'b', 'c'])
    # TODO: Remove this, this is now valid
    # npt.assert_raises(ValueError, TimeSeriesModel, s)

    # Test with a non-date index that doesn't raise an exception because it
    # can be coerced into a nanosecond DatetimeIndex
    # index=pd.date_range('1970-01-01', periods=3, freq='QS')
    coerced = pd.to_datetime([100, 101, 102])
    s = pd.Series([988, 819, 964], index=coerced)

    # The first timestamp must sit 100ns after the epoch.
    head = coerced[0]
    formatted = head.strftime('%Y-%m-%d %H:%M:%S.%f')
    assert_equal(formatted + str(head.value),
                 '1970-01-01 00:00:00.000000100')

    mod = TimeSeriesModel(s)
    # Unpacking also checks that four values are returned.
    _start, _end, _out_of_sample, _ = mod._get_prediction_index(0, 4)
    n_predicted = len(mod.data.predict_dates)
    assert_equal(n_predicted, 5)
def test_pandas_nodates_index():
    """Check model construction on non-date pandas indexes.

    A string index must raise ValueError.  The nanosecond-index part only
    runs when the `_freq_to_pandas['N']` lookup succeeds; otherwise it is
    silently skipped (best-effort probe).
    """
    data = [988, 819, 964]
    dates = ['a', 'b', 'c']
    s = pd.Series(data, index=dates)

    npt.assert_raises(ValueError, TimeSeriesModel, s)

    # Test with a non-date index that doesn't raise an exception because it
    # can be coerced into a nanosecond DatetimeIndex
    # (This test doesn't make sense for Numpy < 1.7 since they don't have
    # nanosecond support)
    # (This test also doesn't make sense for Pandas < 0.14 since we don't
    # support nanosecond index in Pandas < 0.14)
    try:
        # Check for Numpy < 1.7
        _freq_to_pandas['N']
    except Exception:
        # Deliberate best-effort skip.  `Exception` (rather than a bare
        # `except:`) keeps KeyboardInterrupt / SystemExit propagating.
        pass
    else:
        data = [988, 819, 964]
        # index=pd.date_range('1970-01-01', periods=3, freq='QS')
        index = pd.to_datetime([100, 101, 102])
        s = pd.Series(data, index=index)

        # Alternate test for Pandas < 0.14
        from distutils.version import LooseVersion
        from pandas import __version__ as pd_version
        if LooseVersion(pd_version) < '0.14':
            # Qualified with `npt.` to match the other assert_raises call in
            # this test instead of relying on a separate bare import.
            npt.assert_raises(NotImplementedError, TimeSeriesModel, s)
        else:
            actual_str = (index[0].strftime('%Y-%m-%d %H:%M:%S.%f') +
                          str(index[0].value))
            assert_equal(actual_str, '1970-01-01 00:00:00.000000100')
            mod = TimeSeriesModel(s)
            start = mod._get_predict_start(0)
            # Unpacking also checks that two values are returned.
            end, out_of_sample = mod._get_predict_end(4)
            mod._make_predict_dates()
            assert_equal(len(mod.data.predict_dates), 5)
def test_pandas_dates():
    """The model must expose the dates of the series it was built from."""
    prices = [988, 819, 964]
    stamps = ['2016-01-01 12:00:00',
              '2016-02-01 12:00:00',
              '2016-03-01 12:00:00']

    # Reference series built from plain `to_datetime` conversion.
    reference = pd.Series(data=prices, index=pd.to_datetime(stamps),
                          name='price')

    # Model input: the same data in a frame whose index is built with an
    # explicit 'MS' frequency.
    frame_index = pd.DatetimeIndex(stamps, freq='MS')
    frame = pd.DataFrame(data={'price': prices}, index=frame_index)

    model = TimeSeriesModel(frame['price'])

    assert_equal(model.data.dates, reference.index)
def test_predict_freq():
    """Predicted dates must have the same frequency as the model index."""
    freq = "A-APR"
    observations = np.arange(1, 36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    in_sample = date_range("1972-4-30", "2006-4-30", freq=freq)
    endog = Series(observations, index=in_sample)
    model = TimeSeriesModel(endog)
    #npt.assert_(model.data.freq == "AS-APR")
    npt.assert_(model.data.freq == freq)

    # Return values are not needed; the calls are kept because
    # `_make_predict_dates` takes no arguments.
    model._get_predict_start("2006-4-30")
    model._get_predict_end("2016-4-30")
    model._make_predict_dates()
    predicted = model.data.predict_dates

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    wanted = date_range("2006-4-30", "2016-4-30", freq=freq)
    npt.assert_equal(predicted, wanted)
def __init__(self, endog=None, exog=None, order=None,
             seasonal_order=None, ar_order=None, diff=None, ma_order=None,
             seasonal_ar_order=None, seasonal_diff=None,
             seasonal_ma_order=None, seasonal_periods=None, trend=None,
             enforce_stationarity=None, enforce_invertibility=None,
             concentrate_scale=None, trend_offset=1, dates=None, freq=None,
             missing='none', validate_specification=True):
    """
    Validate and store an ARMA / seasonal / trend / data specification.

    Parameters
    ----------
    endog : array_like, optional
        Observed data. If None, a placeholder is built (all-NaN with the
        length of `exog`, or empty if `exog` is also None) and
        `self.endog` is set to None.
    exog : array_like, optional
        Exogenous data; standardized with `prepare_exog`. Trend columns,
        if any, are prepended to it.
    order : iterable, optional
        Three elements, unpacked into `ar_order`, `diff`, `ma_order`.
        Cannot be combined with any of those three arguments. Defaults
        to ``(0, 0, 0)``.
    seasonal_order : iterable, optional
        Four elements, unpacked into `seasonal_ar_order`,
        `seasonal_diff`, `seasonal_ma_order`, `seasonal_periods`. Cannot
        be combined with any of those four arguments. Defaults to
        ``(0, 0, 0, 0)``.
    ar_order, diff, ma_order : optional
        Individual components of `order`; any left as None become 0 when
        at least one of them is given.
    seasonal_ar_order, seasonal_diff, seasonal_ma_order, \
seasonal_periods : optional
        Individual components of `seasonal_order`; any left as None
        become 0 when at least one of them is given.
    trend : optional
        Trend specification, interpreted by `prepare_trend_spec`.
    enforce_stationarity, enforce_invertibility, concentrate_scale : optional
        Stored unchanged as attributes of the same name.
    trend_offset : int, optional
        Stored as an attribute and passed to `construct_trend_data`.
        Default is 1.
    dates, freq, missing : optional
        Forwarded to the underlying `TimeSeriesModel`.
    validate_specification : bool, optional
        If False, skips the checks on differencing, seasonal periodicity,
        duplicate lags, constant `exog` columns and univariate `endog`.
        The conflicting-argument and tuple-length checks always run.
        Default is True.

    Raises
    ------
    ValueError
        If conflicting order arguments are given, if `order` /
        `seasonal_order` have the wrong length, or if any of the checks
        enabled by `validate_specification` fails.
    """
    # Basic parameters
    self.enforce_stationarity = enforce_stationarity
    self.enforce_invertibility = enforce_invertibility
    self.concentrate_scale = concentrate_scale
    self.trend_offset = trend_offset

    # Validate that we were not given conflicting specifications
    has_order = order is not None
    has_specific_order = (ar_order is not None or diff is not None or
                          ma_order is not None)
    has_seasonal_order = seasonal_order is not None
    has_specific_seasonal_order = (seasonal_ar_order is not None or
                                   seasonal_diff is not None or
                                   seasonal_ma_order is not None or
                                   seasonal_periods is not None)
    # NOTE(review): `diff` / `seasonal_diff` also trigger these errors but
    # are not named in the messages below.
    if has_order and has_specific_order:
        raise ValueError('Cannot specify both `order` and either of'
                         ' `ar_order` or `ma_order`.')
    if has_seasonal_order and has_specific_seasonal_order:
        raise ValueError('Cannot specify both `seasonal_order` and any of'
                         ' `seasonal_ar_order`, `seasonal_ma_order`,'
                         ' or `seasonal_periods`.')

    # Compute `order`
    if has_specific_order:
        ar_order = 0 if ar_order is None else ar_order
        diff = 0 if diff is None else diff
        ma_order = 0 if ma_order is None else ma_order
        order = (ar_order, diff, ma_order)
    elif not has_order:
        order = (0, 0, 0)

    # Compute `seasonal_order`
    if has_specific_seasonal_order:
        seasonal_ar_order = (0 if seasonal_ar_order is None
                             else seasonal_ar_order)
        seasonal_diff = 0 if seasonal_diff is None else seasonal_diff
        seasonal_ma_order = (0 if seasonal_ma_order is None
                             else seasonal_ma_order)
        seasonal_periods = (0 if seasonal_periods is None
                            else seasonal_periods)
        seasonal_order = (seasonal_ar_order, seasonal_diff,
                          seasonal_ma_order, seasonal_periods)
    elif not has_seasonal_order:
        seasonal_order = (0, 0, 0, 0)

    # Validate shapes of `order`, `seasonal_order`
    if len(order) != 3:
        raise ValueError('`order` argument must be an iterable with three'
                         ' elements.')
    if len(seasonal_order) != 4:
        raise ValueError('`seasonal_order` argument must be an iterable'
                         ' with four elements.')

    # Validate differencing parameters
    # (done before the `int(...)` casts below, which would otherwise
    # silently truncate fractional values)
    if validate_specification:
        if order[1] < 0:
            raise ValueError('Cannot specify negative differencing.')
        if order[1] != int(order[1]):
            raise ValueError('Cannot specify fractional differencing.')
        if seasonal_order[1] < 0:
            raise ValueError('Cannot specify negative seasonal'
                             ' differencing.')
        if seasonal_order[1] != int(seasonal_order[1]):
            raise ValueError('Cannot specify fractional seasonal'
                             ' differencing.')
        if seasonal_order[3] < 0:
            raise ValueError('Cannot specify negative seasonal'
                             ' periodicity.')

    # Standardize to integers or lists of integers
    order = (
        standardize_lag_order(order[0], 'AR'),
        int(order[1]),
        standardize_lag_order(order[2], 'MA'))
    seasonal_order = (
        standardize_lag_order(seasonal_order[0], 'seasonal AR'),
        int(seasonal_order[1]),
        standardize_lag_order(seasonal_order[2], 'seasonal MA'),
        int(seasonal_order[3]))

    # Validate seasonals
    if validate_specification:
        if seasonal_order[3] == 1:
            raise ValueError('Seasonal periodicity must be greater'
                             ' than 1.')
        if ((seasonal_order[0] != 0 or seasonal_order[1] != 0 or
                seasonal_order[2] != 0) and seasonal_order[3] == 0):
            raise ValueError('Must include nonzero seasonal periodicity if'
                             ' including seasonal AR, MA, or'
                             ' differencing.')

    # Basic order
    self.order = order
    self.ar_order, self.diff, self.ma_order = order

    self.seasonal_order = seasonal_order
    (self.seasonal_ar_order, self.seasonal_diff, self.seasonal_ma_order,
     self.seasonal_periods) = seasonal_order

    # Lists of included lags
    # (a list order is used as given; an integer order `p` expands to the
    # lags 1..p)
    if isinstance(self.ar_order, list):
        self.ar_lags = self.ar_order
    else:
        self.ar_lags = np.arange(1, self.ar_order + 1).tolist()
    if isinstance(self.ma_order, list):
        self.ma_lags = self.ma_order
    else:
        self.ma_lags = np.arange(1, self.ma_order + 1).tolist()

    if isinstance(self.seasonal_ar_order, list):
        self.seasonal_ar_lags = self.seasonal_ar_order
    else:
        self.seasonal_ar_lags = (
            np.arange(1, self.seasonal_ar_order + 1).tolist())
    if isinstance(self.seasonal_ma_order, list):
        self.seasonal_ma_lags = self.seasonal_ma_order
    else:
        self.seasonal_ma_lags = (
            np.arange(1, self.seasonal_ma_order + 1).tolist())

    # Maximum lag orders
    # (taken as the last element of each lag list)
    self.max_ar_order = self.ar_lags[-1] if self.ar_lags else 0
    self.max_ma_order = self.ma_lags[-1] if self.ma_lags else 0

    self.max_seasonal_ar_order = (
        self.seasonal_ar_lags[-1] if self.seasonal_ar_lags else 0)
    self.max_seasonal_ma_order = (
        self.seasonal_ma_lags[-1] if self.seasonal_ma_lags else 0)

    self.max_reduced_ar_order = (
        self.max_ar_order +
        self.max_seasonal_ar_order * self.seasonal_periods)
    self.max_reduced_ma_order = (
        self.max_ma_order +
        self.max_seasonal_ma_order * self.seasonal_periods)

    # Check that we don't have duplicate AR or MA lags from the seasonal
    # component
    # (seasonal lags are multiplied by the seasonal period so that both
    # sets are compared on the same scale)
    ar_lags = set(self.ar_lags)
    seasonal_ar_lags = set(
        np.array(self.seasonal_ar_lags) * self.seasonal_periods)
    duplicate_ar_lags = ar_lags.intersection(seasonal_ar_lags)
    if validate_specification and len(duplicate_ar_lags) > 0:
        raise ValueError('Invalid model: autoregressive lag(s) %s are'
                         ' in both the seasonal and non-seasonal'
                         ' autoregressive components.'
                         % duplicate_ar_lags)

    ma_lags = set(self.ma_lags)
    seasonal_ma_lags = set(
        np.array(self.seasonal_ma_lags) * self.seasonal_periods)
    duplicate_ma_lags = ma_lags.intersection(seasonal_ma_lags)
    if validate_specification and len(duplicate_ma_lags) > 0:
        raise ValueError('Invalid model: moving average lag(s) %s are'
                         ' in both the seasonal and non-seasonal'
                         ' moving average components.'
                         % duplicate_ma_lags)

    # Handle trend
    self.trend = trend
    self.trend_poly, _ = prepare_trend_spec(trend)

    # Check for a constant column in the provided exog
    # (`exog_is_pandas` is recorded here, before `prepare_exog` replaces
    # `exog` further down)
    exog_is_pandas = _is_using_pandas(exog, None)
    if (validate_specification and exog is not None and
            len(self.trend_poly) > 0 and self.trend_poly[0] == 1):
        # Figure out if we have any constant columns
        x = np.asanyarray(exog)
        ptp0 = np.ptp(x, axis=0)
        col_is_const = ptp0 == 0
        # Only non-zero constants count; an all-zero column is not flagged.
        nz_const = col_is_const & (x[0] != 0)
        col_const = nz_const

        # If we already have a constant column, raise an error
        if np.any(col_const):
            raise ValueError('A constant trend was included in the model'
                             ' specification, but the `exog` data already'
                             ' contains a column of constants.')

    # This contains the included exponents of the trend polynomial,
    # where e.g. the constant term has exponent 0, a linear trend has
    # exponent 1, etc.
    self.trend_terms = np.where(self.trend_poly == 1)[0]
    # Trend order is either the degree of the trend polynomial, if all
    # exponents are included, or a list of included exponents. Here we need
    # to make a distinction between a degree zero polynomial (i.e. a
    # constant) and the zero polynomial (i.e. not even a constant). The
    # former has `trend_order = 0`, while the latter has
    # `trend_order = None`.
    self.k_trend = len(self.trend_terms)
    if len(self.trend_terms) == 0:
        self.trend_order = None
        self.trend_degree = None
    elif np.all(self.trend_terms == np.arange(len(self.trend_terms))):
        self.trend_order = self.trend_terms[-1]
        self.trend_degree = self.trend_terms[-1]
    else:
        self.trend_order = self.trend_terms
        self.trend_degree = self.trend_terms[-1]

    # Handle endog / exog
    # Standardize exog
    self.k_exog, exog = prepare_exog(exog)

    # Standardize endog (including creating a faux endog if necessary)
    faux_endog = endog is None
    if endog is None:
        endog = [] if exog is None else np.zeros(len(exog)) * np.nan

    # Add trend data into exog
    nobs = len(endog) if exog is None else len(exog)
    if self.trend_order is not None:
        # Add in the data
        trend_data = self.construct_trend_data(nobs, trend_offset)
        if exog is None:
            exog = trend_data
        elif exog_is_pandas:
            trend_data = pd.DataFrame(trend_data, index=exog.index,
                                      columns=self.construct_trend_names())
            exog = pd.concat([trend_data, exog], axis=1)
        else:
            exog = np.c_[trend_data, exog]

    # Create an underlying time series model, to handle endog / exog,
    # especially validating shapes, retrieving names, and potentially
    # providing us with a time series index
    self._model = TimeSeriesModel(endog, exog=exog, dates=dates, freq=freq,
                                  missing=missing)
    self.endog = None if faux_endog else self._model.endog
    self.exog = self._model.exog

    # Validate endog shape
    if (validate_specification and not faux_endog and
            self.endog.ndim > 1 and self.endog.shape[1] > 1):
        raise ValueError('SARIMAX models require univariate `endog`. Got'
                         ' shape %s.' % str(self.endog.shape))

    # None (rather than False) when no real endog was supplied.
    self._has_missing = (
        None if faux_endog else np.any(np.isnan(self.endog)))
def test_predict_freq():
    """Predicted dates must have the same frequency as the model index."""
    observations = np.arange(1, 36.)

    if _pandas_08x:
        from pandas import date_range
        freq = "A-APR"
        # there's a bug in pandas up to 0.10.2 for YearBegin
        #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
        in_sample = date_range("1972-4-30", "2006-4-30", freq=freq)
        endog = Series(observations, index=in_sample)
        model = TimeSeriesModel(endog)
        #npt.assert_(model.data.freq == "AS-APR")
        npt.assert_(model.data.freq == freq)

        # Return values are not needed; the calls are kept because
        # `_make_predict_dates` takes no arguments.
        model._get_predict_start("2006-4-30")
        model._get_predict_end("2016-4-30")
        model._make_predict_dates()

        #expected_dates = date_range("2006-12-31", "2016-12-31",
        #                            freq="AS-APR")
        wanted = date_range("2006-4-30", "2016-4-30", freq=freq)
        npt.assert_equal(model.data.predict_dates, wanted)
        #ptesting.assert_series_equal(predict_dates, expected_dates)
    else:
        # Path for the pandas API that still provides DateRange / datetools.
        from pandas import DateRange, datetools
        in_sample = DateRange("1972-1-1", "2007-1-1",
                              offset=datetools.yearEnd)
        endog = Series(observations, index=in_sample)
        model = TimeSeriesModel(endog)
        npt.assert_(model.data.freq == "A")

        model._get_predict_start("2006-12-31")
        model._get_predict_end("2016-12-31")
        model._make_predict_dates()

        wanted = DateRange("2006-12-31", "2016-12-31",
                           offset=datetools.yearEnd)
        npt.assert_array_equal(model.data.predict_dates, wanted)