def get_resid(df):
    """Return the STL residuals of the ``avg_time`` and ``num`` columns.

    Each column is decomposed independently with a robust STL fit that uses
    a period of 8 observations.

    Returns
    -------
    tuple
        ``(avg_time_residuals, num_residuals)``.
    """
    residuals = []
    for column in ('avg_time', 'num'):
        fitted = STL(df[column], period=8, robust=True).fit()
        residuals.append(fitted.resid)
    return residuals[0], residuals[1]
def test_period_detection(default_kwargs):
    """Seasonal fit with an index-inferred period must match the explicit one."""
    class_kwargs, _, _ = _to_class_kwargs(default_kwargs)
    explicit = STL(**class_kwargs).fit()

    # Remove the explicit period and wrap the data in a Series carrying a
    # date index, so the second model has no ``period`` argument at all.
    class_kwargs.pop('period')
    date_index = pd.date_range('1-1-1959', periods=348, freq='M')
    class_kwargs['endog'] = pd.Series(class_kwargs['endog'], index=date_index)
    inferred = STL(**class_kwargs).fit()

    assert_allclose(explicit.seasonal, inferred.seasonal)
def fit(self, *, inner_iter=None, outer_iter=None, fit_kwargs=None):
    """
    Estimate STL and forecasting model parameters.

    Parameters
    ----------\n%(fit_params)s
    fit_kwargs : Dict[str, Any]
        Any additional keyword arguments to pass to ``model``'s ``fit``
        method when estimating the model on the decomposed residuals.

    Returns
    -------
    STLForecastResults
        Results with forecasting methods.
    """
    if fit_kwargs is None:
        fit_kwargs = {}

    decomposer = STL(self._endog, **self._stl_kwargs)
    decomposition: DecomposeResult = decomposer.fit(
        inner_iter=inner_iter, outer_iter=outer_iter
    )

    # The forecasting model is estimated on the series with the seasonal
    # component removed, i.e. trend plus residual.
    deseasonalized = decomposition.trend + decomposition.resid
    model = self._model(deseasonalized, **self._model_kwargs)
    model_result = model.fit(**fit_kwargs)

    if not hasattr(model_result, "forecast"):
        raise AttributeError(
            "The model's result must expose a ``forecast`` method.")
    return STLForecastResults(
        decomposer, decomposition, model, model_result, self._endog
    )
def predict_past(self, df, freq_period, steps):
    """Rescale and STL-decompose ``df["y"]``, then return the first forecast.

    Steps performed:

    1. Fit a scaler on ``df["y"]`` (``PowerTransformer`` when the value range
       exceeds 100, ``IdentityTransformer`` otherwise), store it on
       ``self.scaler2`` and pickle it to ``<self.directory>/scaler_pred.sav``.
    2. Decompose the scaled values with STL, plot the decomposition and
       write the ``trend``, ``seasonal`` and ``residual`` columns into ``df``
       (the caller's frame is modified in place).
    3. Back-fill missing values, cache the three components on ``self`` and
       call ``self.make_prediction(steps)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"y"`` column.
    freq_period : int
        Base seasonal period; see the note on the STL call below.
    steps
        Forwarded unchanged to ``self.make_prediction``.

    Returns
    -------
    The first element of the prediction returned by ``make_prediction``.
    """
    scalerfile = self.directory + '/scaler_pred.sav'
    target = np.reshape(np.array(df["y"]), (-1, 1))

    # NOTE(review): the original guard was
    # ``not isfile(scalerfile) or isfile(scalerfile)``, which is always true,
    # so the scaler has always been refit and the "load from disk" branch was
    # unreachable. That effective behaviour is kept; confirm whether reusing
    # a previously saved scaler was actually intended.
    if (df["y"].max() - df["y"].min()) > 100:
        if self.verbose == 1:
            print("PowerTransformation scaler used")
        scaler = PowerTransformer()
    else:
        if self.verbose == 1:
            print("Identity scaler used")
        scaler = IdentityTransformer()
    self.scaler2 = scaler.fit(target)
    Y = self.scaler2.transform(target)
    # Context manager so the file handle is closed deterministically.
    with open(scalerfile, 'wb') as handle:
        pickle.dump(self.scaler2, handle)

    if freq_period % 2 == 0:
        freq_period = freq_period + 1
    # NOTE(review): after the odd-adjustment above, ``freq_period + 1`` is
    # always even. Kept as in the original; confirm this is intended.
    decomposition = STL(Y, period=freq_period + 1).fit()
    decomposition.plot()
    plt.show()

    df.loc[:, 'trend'] = decomposition.trend
    df.loc[:, 'seasonal'] = decomposition.seasonal
    df.loc[:, 'residual'] = decomposition.resid
    # ``bfill()`` is the non-deprecated spelling of ``fillna(method="bfill")``.
    df = df.bfill()

    self.trend = np.asarray(df.loc[:, 'trend'])
    self.seasonal = np.asarray(df.loc[:, 'seasonal'])
    self.residual = np.asarray(df.loc[:, 'residual'])

    prediction, _, _ = self.make_prediction(steps)
    return prediction[0]
def smad(ts, m=3.0, period=None, stl_seasonal=25, only_low_values=False,
         score=False):
    '''
    Seasonal-MAD anomaly detection.

    Input:
        ts: pd.Series with DateTimeIndex
        m: threshold forwarded to ``MAD.predict`` (described by the original
           author as a standard deviation)
        period: seasonal period of the series; only passed to STL when given
        stl_seasonal: STL ``seasonal`` parameter
        only_low_values: forwarded to ``MAD``; report anomalies only for
            low values
        score: if True return the decision function instead of the anomalies

    Output:
        ``MAD.decision_function`` of the residual when ``score`` is true,
        otherwise the entries of ``ts`` at the index flagged by
        ``MAD.predict``.
    '''
    stl_kwargs = {'seasonal': stl_seasonal}
    if period is not None:
        stl_kwargs['period'] = period
    decomposition = STL(ts, **stl_kwargs).fit()

    # Residual: the series minus its median and its seasonal component.
    residual = ts - np.nanmedian(ts) - decomposition.seasonal

    # Look for outliers with MAD.
    detector = MAD(only_low_values=only_low_values)
    detector.fit(residual)

    if score:
        return detector.decision_function(residual)
    flagged = detector.predict(residual, m=m).index
    return ts.loc[flagged]
def extract_climate_trend(self, df, trend='STL'):
    '''
    Extract the STL trend component of every column of ``df``.

    input_params:
        df: dataframe whose trends are to be extracted
            requirements for the dataframe:
            - dataframe index needs to be datetime,
            - datetime index should be sorted
            - should be a monthly resampling
        trend: only 'STL' is handled; any other value yields an empty
            dataframe

    returns:
        dataframe with one trend column per input column
    '''
    trends = pd.DataFrame()
    if trend != 'STL':
        return trends

    years = df.index.year
    # Number of years spanned by the index, bumped to the next odd number
    # when even, and used as the STL ``seasonal`` argument.
    span = years[-1] - years[0]
    if span % 2 == 0:
        span += 1

    for col in df:
        fitted = STL(df[col], period=12, seasonal=span, robust=True).fit()
        trends[col] = fitted.trend
    return trends
def decompostion_STL(series, period=None, title=''):
    """Fit a robust STL decomposition of ``series`` and show its plot.

    ``title`` is drawn in purple near the top-left corner of the figure.
    Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    from statsmodels.tsa.seasonal import STL

    fitted = STL(series, period=period, robust=True).fit()
    figure = fitted.plot()
    figure.text(0.1, 0.95, title, size=15, color='purple')
    plt.show()
def test_short_class(default_kwargs_short):
    """STL on the short fixture must match the stored 'short' reference."""
    class_kwargs, outer, inner = _to_class_kwargs(default_kwargs_short)
    fitted = STL(**class_kwargs).fit(outer_iter=outer, inner_iter=inner)
    reference = results.loc['short'].sort_index()

    assert_allclose(fitted.seasonal, reference.season)
    assert_allclose(fitted.trend, reference.trend)
    assert_allclose(fitted.weights, reference.rw)
def _decompose(self, ts): if self.dku_config.model == "multiplicative": self.parameters["endog"] = np.log(ts) stl = STL(**self.parameters) statsmodel_results = stl.fit() trend = np.exp(statsmodel_results.trend.values) seasonal = np.exp(statsmodel_results.seasonal.values) residuals = np.exp(statsmodel_results.resid.values) decomposition = _DecompositionResults(trend=trend, seasonal=seasonal, residuals=residuals) elif self.dku_config.model == "additive": self.parameters["endog"] = ts stl = STL(**self.parameters) statsmodel_results = stl.fit() decomposition = _DecompositionResults() decomposition.load(statsmodel_results) return decomposition
def test_pickle(default_kwargs):
    """A pickled-and-restored STL model must refit identically."""
    class_kwargs, _outer, _inner = _to_class_kwargs(default_kwargs)
    original = STL(**class_kwargs)
    original_fit = original.fit()

    restored = pickle.loads(pickle.dumps(original))
    restored_fit = restored.fit()

    assert_allclose(original_fit.trend, restored_fit.trend)
    assert_allclose(original_fit.seasonal, restored_fit.seasonal)
    assert original.config == restored.config
def test_ntjump_1_class(default_kwargs):
    """STL with ``ntjump`` set to 1 must match the 'ntjump-1' reference."""
    default_kwargs['ntjump'] = 1
    class_kwargs, outer, inner = _to_class_kwargs(default_kwargs)
    fitted = STL(**class_kwargs).fit(outer_iter=outer, inner_iter=inner)
    reference = results.loc['ntjump-1'].sort_index()

    assert_allclose(fitted.seasonal, reference.season)
    assert_allclose(fitted.trend, reference.trend)
    assert_allclose(fitted.weights, reference.rw)
def test_pandas(default_kwargs, robust):
    """With a Series as input, every result component must be a Series."""
    class_kwargs, _, _ = _to_class_kwargs(default_kwargs, robust)
    endog = pd.Series(class_kwargs['endog'], name='y')
    fitted = STL(endog=endog, period=class_kwargs['period']).fit()

    for component in ('trend', 'seasonal', 'resid', 'weights'):
        assert isinstance(getattr(fitted, component), pd.Series)
def plot_time_trend(df, name):
    """Build a four-row figure of the STL decomposition of ``df["Close"]``.

    Only rows whose index year is 2006 or later are decomposed (robust STL,
    ``period=250``, ``seasonal=21``). The rows show, top to bottom, the
    observed series, trend, seasonal component and residuals. All traces
    use ``HOSE_COLOR`` when ``name`` is ``"VN-INDEX"`` and ``HNX_COLOR``
    otherwise.

    Returns the figure created by ``make_subplots``.
    """
    marker_color = HOSE_COLOR if name == "VN-INDEX" else HNX_COLOR

    closes = df[df.index.year >= 2006]["Close"]
    res = STL(closes, period=250, seasonal=21, robust=True).fit()

    fig = make_subplots(shared_xaxes=True, rows=4, cols=1)

    # One trace per subplot row, in display order.
    panels = (
        (res.observed, "Orignal Index"),
        (res.trend, "Trend"),
        (res.seasonal, "Season"),
        (res.resid, "Resid"),
    )
    for row, (component, trace_name) in enumerate(panels, start=1):
        trace = go.Scatter(
            y=component,
            x=component.index,
            name=trace_name,
            showlegend=False,
            marker_color=marker_color,
        )
        fig.add_trace(trace, row=row, col=1)

    # Update y-axis titles, one per row.
    axis_titles = ("Orginal", "Trend", "Seasonal", "Residuals")
    for row, axis_title in enumerate(axis_titles, start=1):
        fig.update_yaxes(title_text=axis_title, row=row, col=1)

    fig.update_layout(title=f"Seasonal-Trend Decomposition of {name}",
                      height=500)
    return fig
def test_baseline_class(default_kwargs):
    """STL with default settings must match the 'baseline' reference."""
    class_kwargs, outer, inner = _to_class_kwargs(default_kwargs)
    fitted = STL(**class_kwargs).fit(outer_iter=outer, inner_iter=inner)
    reference = results.loc['baseline'].sort_index()

    assert_allclose(fitted.trend, reference.trend)
    assert_allclose(fitted.seasonal, reference.season)
    assert_allclose(fitted.weights, reference.rw)

    # The residual must equal the data minus the reference trend and season.
    expected_resid = class_kwargs['endog'] - reference.trend - reference.season
    assert_allclose(fitted.resid, expected_resid)
def testStationarity(df, keywords):
    """Print keywords whose STL seasonal component exceeds the 0.05 cut-off.

    For each keyword the series returned by ``readData`` is decomposed with
    ``STL(seasonal=13)`` and ``adfuller`` is run on the seasonal component.
    The keyword and the second element of the ``adfuller`` result (the
    p-value, per the statsmodels documentation) are printed when that value
    is greater than 0.05.
    """
    for keyword in keywords:
        seasonal = STL(readData(df, keyword), seasonal=13).fit().seasonal
        p_value = adfuller(seasonal)[1]
        if p_value > 0.05:
            print(keyword, p_value)
def STL_decomposition(df, column, year):
    """Decompose one year of ``column`` with a robust STL fit.

    Rows whose ``date_c`` falls in ``year`` are sorted by date, resampled to
    daily means, interpolated with ``method="time"`` and decomposed with
    ``period=29``. The mean of the trend is printed before the fitted
    result is returned.
    """
    in_year = df[(df.date_c.dt.year == year)]
    ordered = in_year.sort_values(by="date_c")[["date_c", column]]
    daily = ordered.resample("1D", on="date_c").mean()[[column]]
    series = daily.interpolate(method="time")[column]

    res = STL(series, period=29, robust=True).fit()
    print("Trend mean = {}".format(res.trend.mean()), flush=True)
    return res
def predict(data, hyperparams):
    """Return the indices of anomalies found in ``data``.

    When ``hyperparams['seasonality']`` is truthy, the second column of
    ``data`` (indexed by the first) is decomposed with STL using
    ``hyperparams['period']`` and ``anom_detect`` is evaluated on the
    residuals. Otherwise DBSCAN is fit on ``data`` with
    ``hyperparams['eps']`` / ``hyperparams['min_pts']`` and the positions
    labelled ``-1`` are returned.

    Returns
    -------
    list
        Indices of the detected anomalies.
    """
    if not hyperparams['seasonality']:
        db = DBSCAN(eps=hyperparams['eps'],
                    min_samples=hyperparams['min_pts']).fit(data)
        return np.argwhere(db.labels_ == -1).flatten().tolist()

    stl_data = pd.Series(data=list(data.iloc[:, 1]),
                         index=list(data.iloc[:, 0]))
    resids = STL(stl_data, period=hyperparams['period']).fit().resid.values
    residual_df = pd.DataFrame(data={'residuals': resids})
    anomalies = anom_detect().evaluate(residual_df, col_name='residuals')
    return list(anomalies.index)
def stl_decomposition(series, period=12):
    """
    Run a robust STL decomposition on a pandas Series object.

    Parameters
    ----------
    series : Series object
        The observations to be deseasonalised.
    period : int (optional)
        Length of the seasonal period in observations.

    Returns
    -------
    The result object returned by ``STL.fit()``.
    """
    decomposer = STL(series, period=period, robust=True)
    return decomposer.fit()
def fit_predict_ES(self, df, freq_period, steps=1):
    """Forecast ``df["y"]`` by exponentially smoothing its STL components.

    ``df["y"]`` is decomposed with STL and the ``trend``, ``seasonal`` and
    ``residual`` columns are written into ``df`` in place. Each component
    is then forecast ``steps`` ahead with an additive-seasonal
    ``ExponentialSmoothing`` model and the three forecasts are summed.
    ``freq_period`` is also stored on ``self.freq_period``.

    NOTE(review): STL is given ``freq_period + 1`` while the smoothing
    models use ``freq_period``; confirm the difference is intentional.
    """
    from statsmodels.tsa.holtwinters import ExponentialSmoothing

    self.freq_period = freq_period

    fitted = STL(df["y"], period=freq_period + 1).fit()
    df.loc[:, 'trend'] = fitted.trend
    df.loc[:, 'seasonal'] = fitted.seasonal
    df.loc[:, 'residual'] = fitted.resid

    def _forecast(column):
        # Fit an additive-seasonal smoother on one component and forecast.
        smoother = ExponentialSmoothing(df[column],
                                        seasonal_periods=freq_period,
                                        seasonal='add')
        return smoother.fit().forecast(steps)

    prediction_trend = _forecast("trend")
    prediction_res = _forecast("residual")
    prediction_sea = _forecast("seasonal")
    return prediction_trend + prediction_res + prediction_sea
def twitter_score(x, period=None, seasonal=45):
    '''
    Return the MAD decision-function scores of ``x`` after removing its
    median and its STL seasonal component.

    The input must be a Series with a time index. ``period`` is only passed
    to STL when it is given.
    '''
    stl_kwargs = {'seasonal': seasonal}
    if period is not None:
        stl_kwargs['period'] = period
    # Filter out the seasonal component.
    decomposition = STL(x, **stl_kwargs).fit()

    # Compute the residual.
    residual = x - np.nanmedian(x) - decomposition.seasonal

    # Score the residual with MAD.
    detector = MAD()
    detector.fit(residual)
    return detector.decision_function(residual)
def plot_stl(data: np.ndarray) -> DecomposeResult:
    """Decompose ``data`` with STL (period 12), plot it and return the fit.

    Three stacked axes sharing the x axis show the trend, seasonal and
    residual components. X ticks are placed every 12 observations and the
    figure is displayed with ``plt.show()``.
    """
    res = STL(data, period=12).fit()

    fig, axs = plt.subplots(3, sharex=True, figsize=(10, 10))
    panels = (
        ("Trend", res.trend, "blue"),
        ("Seasonal", res.seasonal, "royalblue"),
        ("Residual", res.resid, "darkblue"),
    )
    for axis, (title, component, color) in zip(axs, panels):
        axis.set_title(title)
        axis.plot(component, color=color)
        axis.grid()

    plt.xticks(np.arange(0, len(data), 12))
    plt.show()
    return res
def seasonal_esd(ts, periodicity=None, hybrid=False, max_anomalies=10,
                 alpha=0.05):
    """
    Compute the Seasonal Extreme Studentized Deviate of a time series.

    The series is decomposed with a robust STL fit. The residual handed to
    the generalized ESD test is computed here as:

        R = ts - seasonality - median(ts)

    ``hybrid`` is forwarded to ``generalized_esd`` (see Twitter's research
    paper for the difference).

    Note: STL needs a period. If ``periodicity`` is not given (or is
    falsy), 20% of the length of the time series is used.

    Args:
        ts (list or np.array): The timeseries to compute the ESD.
        periodicity (int): Number of time points for a season.
        hybrid (bool): See Twitter's research paper for difference.
        max_anomalies (int): The number of times the Grubbs' Test will be
            applied to the ts.
        alpha (float): The significance level.

    Returns:
        list int: The indices of the anomalies in the timeseries.

    Raises:
        ValueError: If ``max_anomalies`` is at least half the series length.
    """
    if max_anomalies >= len(ts) / 2:
        raise ValueError(
            'The maximum number of anomalies must be less than half the size of the time series.'
        )

    ts = np.array(ts)
    if periodicity:
        period = periodicity
    else:
        # Seasonality is 20% of the ts if not given.
        period = int(0.2 * len(ts))

    decomposition = STL(ts, period=period, robust=True).fit()
    residual = ts - decomposition.seasonal - np.median(ts)
    return generalized_esd(residual,
                           max_anomalies=max_anomalies,
                           alpha=alpha,
                           hybrid=hybrid)
def decompose(df, period=52):
    """Add STL ``season``, ``trend`` and ``resid`` columns to ``df``.

    ``df["Weekly_Sales"]`` is decomposed with the given ``period``. Each
    component is converted to a frame with a reset index before its column
    is assigned back, so the assignment aligns on that reset index.
    ``df`` is modified in place and also returned.
    """
    tsd = STL(df["Weekly_Sales"], period=period).fit()

    def _as_frame(component):
        # Component as a frame with a fresh index plus the DateTime column.
        frame = component.to_frame().reset_index()
        frame["DateTime"] = df["DateTime"]
        return frame

    resid = _as_frame(tsd.resid)
    trend = _as_frame(tsd.trend)
    season = _as_frame(tsd.seasonal)

    df.loc[:, "season"] = season["season"]
    df.loc[:, "trend"] = trend["trend"]
    df.loc[:, "resid"] = resid["resid"]
    return df
def singleDecomp(df, keyword):
    """Return the STL fit of the ``interest`` values matching ``keyword``.

    Rows whose ``keyword`` column contains ``keyword`` are selected and
    their ``interest`` values are placed on a ``freq='W'`` date range
    starting at '1-1-2017' before being decomposed with ``seasonal=13``.
    """
    matching = df[df['keyword'].str.contains(keyword)]
    interest = matching['interest'].tolist()
    weekly_index = pd.date_range('1-1-2017', periods=len(interest), freq='W')
    series = pd.Series(interest, index=weekly_index, name=keyword)
    return STL(series, seasonal=13).fit()
def twitter(x, period=None, seasonal=45):
    '''
    Return the entries of ``x`` that are flagged as anomalies.

    The input must be a Series with a DateTimeIndex. The series has its
    median and STL seasonal component removed, and the entries at the index
    returned by ``MAD.predict`` on that residual are selected from ``x``.
    ``period`` is only passed to STL when it is given.
    '''
    stl_kwargs = {'seasonal': seasonal}
    if period is not None:
        stl_kwargs['period'] = period
    # Filter out the seasonal component.
    decomposition = STL(x, **stl_kwargs).fit()

    # Compute the residual.
    residual = x - np.nanmedian(x) - decomposition.seasonal

    # Look for outliers with MAD.
    detector = MAD()
    detector.fit(residual)
    flagged = detector.predict(residual).index
    return x.loc[flagged]
def clean(series):
    """Detect outliers in ``series`` and replace them with a local median.

    The series is decomposed with a robust STL fit (``period=7``). When the
    seasonal strength is at least 0.6 the series is replaced by its
    deseasonalized version (trend + residual), so the returned values are
    deseasonalized in that case. A ``SuperSmoother`` fit against the
    positional index gives a smooth curve; points whose residual from that
    curve lies outside ``[Q1 - 5 * IQR, Q3 + 5 * IQR]`` are outliers. Each
    outlier is replaced by the NaN-ignoring median of its up-to-four
    nearest neighbours (positions ``i-2, i-1, i+1, i+2``).

    Parameters
    ----------
    series
        Series to clean. ``.loc``/``.iloc``/``.copy`` are used on it, so it
        is presumably a pandas Series -- TODO confirm against callers.

    Returns
    -------
    tuple
        ``(series_cleaned, outliers)`` where ``outliers`` is a list of
        booleans (one per observation), or ``None`` when the limits are
        degenerate (width not greater than 1e-14).
    """
    n_series = len(series)
    # Make the value passed as STL's ``seasonal`` argument odd.
    if n_series % 2 == 0:
        n_series = n_series - 1
    stl = STL(series, period=7, robust=True, seasonal=n_series)
    res = stl.fit()
    detrend = series - res.trend
    # Seasonal strength: 1 - Var(residual) / Var(detrended series).
    strength = 1 - np.var(res.resid) / np.var(detrend)
    if strength >= 0.6:
        series = res.trend + res.resid  # deseasonalized series
    # Smooth against the positional index 0..n-1.
    tt = np.arange(len(series))
    model = SuperSmoother()
    model.fit(tt, series)
    yfit = model.predict(tt)
    resid = series - yfit
    resid_q = np.quantile(resid, [0.25, 0.75])
    iqr = np.diff(resid_q)
    # Earlier, tighter variant kept for reference:
    # limits = resid_q + 3 * iqr * [-1, 1]
    # Lower limit is Q1 - 5 * IQR, upper limit is Q3 + 5 * IQR.
    limits = resid_q + 5 * iqr * [-1, 1]
    # Find residuals outside limits
    series_cleaned = series.copy()
    outliers = None
    # Skip detection entirely when the limits collapse to (almost) a point.
    if (limits[1] - limits[0]) > 1e-14:
        outliers = [
            a or b for a, b in zip((resid < limits[0]).to_numpy(), (
                resid > limits[1]).to_numpy())
        ]
        if any(outliers):
            # Blank the outliers first so they are ignored by the
            # neighbour medians computed below.
            series_cleaned.loc[outliers] = np.nan
            # Replace outliers
            id_outliers = [i for i, x in enumerate(outliers) if x]
            for ii in id_outliers:
                # Candidate neighbour positions, clipped to the valid range.
                xx = [ii - 2, ii - 1, ii + 1, ii + 2]
                xx = [x for x in xx if x < series_cleaned.shape[0] and x >= 0]
                assert (len(xx) > 0)
                # At least one neighbour must be a non-outlier value.
                assert (not np.isnan(series_cleaned.iloc[xx]).to_numpy().all())
                series_cleaned.iloc[ii] = np.nanmedian(
                    series_cleaned.iloc[xx].to_numpy().flatten())
    return series_cleaned, outliers
def nonStatKeywords(df, keywords):
    """Print keywords whose STL seasonal component exceeds the 0.05 cut-off.

    For each keyword, the ``interest`` values of the rows whose ``keyword``
    column contains it are placed on a ``freq='W'`` date range starting at
    '1-1-2017' and decomposed with ``STL(seasonal=13)``. ``adfuller`` is run
    on the seasonal component; the keyword and the second element of its
    result (the p-value, per the statsmodels documentation) are printed
    when that value is greater than 0.05.
    """
    for keyword in keywords:
        matching = df[df['keyword'].str.contains(keyword)]
        interest = matching['interest'].tolist()
        weekly_index = pd.date_range('1-1-2017', periods=len(interest),
                                     freq='W')
        series = pd.Series(interest, index=weekly_index, name=keyword)

        seasonal = STL(series, seasonal=13).fit().seasonal
        p_value = adfuller(seasonal)[1]
        if p_value > 0.05:
            print(keyword, p_value)
def stl_and_plot(file: str, seconds: int = 1_500_000):
    """Decompose the start of a CSV time series and save the plot.

    ``data/<file>`` is read, passed through ``linear_interpolation`` and
    standardized (zero mean, unit standard deviation over the whole
    series). The first ``seconds // interval`` samples are then decomposed
    with STL using a one-day period (``24 * 3600 // interval``) and
    ``seasonal=21``. The decomposition plot is saved to ``out/<file>.png``.
    """
    df = pd.read_csv(os.path.join('data', file))
    timestamps, values, interval = linear_interpolation(df.timestamp,
                                                        df.value)
    # Standardize before truncating, so the statistics use the full series.
    values = (values - values.mean()) / values.std()

    n_points = seconds // interval
    timestamps = timestamps[:n_points]
    values = values[:n_points]

    start_time = datetime.datetime.fromtimestamp(timestamps[0])
    end_time = datetime.datetime.fromtimestamp(timestamps[-1])
    index = pd.date_range(start=start_time,
                          end=end_time,
                          freq=pd.offsets.Second(interval))
    series = pd.Series(values, index=index, name=file)

    res = STL(series, period=24 * 3600 // interval, seasonal=21).fit()
    res.plot()
    plt.savefig(os.path.join('out', f'{file}.png'))
def compute_measures(self, var, window=None):
    """
    Compute summary measures for one column of the raw data.

    Always computed: spectral entropy (``SpecEnt``) and sample entropy
    (``SampEnt``) of column ``var``. For every entry ``name -> length`` of
    ``window`` the series is cut into consecutive chunks of ``length``
    samples (a trailing incomplete chunk is dropped) and these are added:

    * ``Stab<name>``: standard deviation of the chunk means
    * ``Lump<name>``: standard deviation of the chunk standard deviations
    * ``strength_seasonality<name>`` / ``strength_trend<name>``: computed
      from an STL decomposition with ``period=length`` as
      ``1 - var(resid) / var(observed - trend)`` and
      ``1 - var(resid) / var(observed - seasonal)`` respectively.

    The decomposition of each window is also plotted.

    :param var: column index into ``self.raw_data``
    :param window: dictionary mapping a window name to a window length;
        ``None`` (the default) means no windowed measures
    :return: dictionary of the computed measures
    :raises NameError: if the raw data is not loaded or ``var`` is not a
        valid column index
    """
    if self.raw_data is None:
        raise NameError("Raw data is not loaded")
    # Valid column indices stop at shape[1] - 1, so ``var == shape[1]`` has
    # to be rejected too (the original ``>`` let it through).
    if var >= self.raw_data.shape[1]:
        raise NameError("Invalid variable number")
    # The default ``None`` used to crash in the loop below; treat it as
    # "no windows requested".
    if window is None:
        window = {}

    data = self.raw_data[:, var]
    dvals = {}
    dvals['SpecEnt'] = spectral_entropy(data, sf=1)
    dvals['SampEnt'] = sample_entropy(data, order=2)

    for w, lw in window.items():
        print(w)
        print(lw)
        # Keep only the samples that fill whole windows, one window per row.
        length = int(data.shape[0] / lw)
        size = lw * length
        datac = data[:size].reshape(-1, lw)
        means = np.mean(datac, axis=1)
        stds = np.std(datac, axis=1)
        dvals[f'Stab{w}'] = np.std(means)
        dvals[f'Lump{w}'] = np.std(stds)

        # decompositions
        res = STL(data, period=lw).fit()
        resid_var = np.var(res.resid)
        dvals[f'strength_seasonality{w}'] = 1 - resid_var / np.var(
            res.observed - res.trend)
        dvals[f'strength_trend{w}'] = 1 - resid_var / np.var(
            res.observed - res.seasonal)
        res.plot()
    return dvals
def decompose(self, col=feature):
    """Plot the STL decomposition (``seasonal=7``) of ``self.df[col]``.

    The column is re-indexed onto a date range that runs from the first to
    the last date of its index at the frequency inferred by
    ``pd.infer_freq``. The resulting series is printed, decomposed and the
    decomposition plot is shown. Nothing is returned.
    """
    values = self.df[col]
    index = values.index
    freq = pd.infer_freq(index)

    dates = index.date
    regular_index = pd.date_range(dates[0],
                                  dates[len(values) - 1],
                                  periods=None,
                                  freq=freq)
    series = pd.Series(values, index=regular_index, name=col)
    print(series)

    fig = STL(series, seasonal=7).fit().plot()
    plt.show()