def inspect_seasonality(ts, freq=252):
    temp = rcParams['figure.figsize']
    rcParams['figure.figsize'] = 12, 8
    res = smt.seasonal_decompose(ts, model='additive', freq=freq)
    p = res.plot()
    rcParams['figure.figsize'] = temp
    return p
def decompose(x, method="STL", **kwargs):
    """Perform seasonal decomposition of the time series."""
    if method not in ["STL", "MA"]:
        raise ValueError("`method` must be either 'STL' or 'MA'.")
    if method == "STL":
        decomposition = STL(x, **kwargs).fit()
    else:
        decomposition = seasonal_decompose(x, **kwargs)
    return decomposition
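# A minimal usage sketch for `decompose`, assuming STL and seasonal_decompose
# come from statsmodels.tsa.seasonal as the function body implies; the monthly
# series below is synthetic illustration data, not from the original source.
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import STL, seasonal_decompose

idx = pd.date_range('2015-01-01', periods=120, freq='MS')
x = pd.Series(np.sin(2 * np.pi * np.arange(120) / 12) + 0.1 * np.random.randn(120),
              index=idx)
res_stl = decompose(x, method='STL', period=12)       # LOESS-based components
res_ma = decompose(x, method='MA', model='additive')  # classical moving-average split
print(res_stl.seasonal.head())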
def decomp_adjust(data, train_hours, test_hours, model):
    data.index = pd.to_datetime(data.index, utc=True)
    decomp = seasonal_decompose(data['total load actual'][0:train_hours],
                                model=model, freq=24)
    # tile the first day's seasonal pattern across the full train + test range
    seasonality = list(decomp.seasonal[:24]) * int((train_hours + test_hours) / 24)
    data['seasonality'] = seasonality
    data['seasonally decomposed'] = \
        data['total load actual'] - seasonality if model == "additive" \
        else data['total load actual'] / seasonality
def tsplot(TS, period=7, lags=None, figsize=(18, 20), style='bmh'):
    if not isinstance(TS, pd.Series):
        TS = pd.Series(TS)
    with plt.style.context(style):
        fig = plt.figure(figsize=figsize)
        # mpl.rcParams['font.family'] = 'Ubuntu Mono'
        layout = (6, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        dc_trend_ax = plt.subplot2grid(layout, (1, 0), colspan=2)
        dc_seasonal_ax = plt.subplot2grid(layout, (2, 0), colspan=2)
        dc_resid_ax = plt.subplot2grid(layout, (3, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (4, 0))
        pacf_ax = plt.subplot2grid(layout, (4, 1))
        qq_ax = plt.subplot2grid(layout, (5, 0))
        pp_ax = plt.subplot2grid(layout, (5, 1))

        TS.plot(ax=ts_ax)
        ts_ax.set_title('Time Series')

        # decompose once and reuse the components
        decomposition = smt.seasonal_decompose(TS, model='additive', period=period)
        decomposition.trend.plot(ax=dc_trend_ax)
        dc_trend_ax.set_title('[Decompose] Time Series Trend')
        decomposition.seasonal.plot(ax=dc_seasonal_ax)
        dc_seasonal_ax.set_title('[Decompose] Time Series Seasonal')
        decomposition.resid.plot(ax=dc_resid_ax)
        dc_resid_ax.set_title('[Decompose] Time Series Resid')

        smt.graphics.plot_acf(TS, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(TS, lags=lags, ax=pacf_ax, alpha=0.5)
        sm.qqplot(TS, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        stats.probplot(TS, sparams=(TS.mean(), TS.std()), plot=pp_ax)

        plt.tight_layout()
        plt.savefig('time_series_analysis.png')
        plt.show()
def scenario04(self, start='2020-01-01', end='2020-10-01', verbose=True, save=None):
    if not end:
        end = np.datetime64('today')
    self.config['scenario04'] = dict()
    self.config['scenario04']['freq'] = self.config['freq']

    fig = plt.figure(figsize=(20, 18))
    layout = (4, 1)
    ax00 = plt.subplot2grid(layout, (0, 0))
    ax01 = plt.subplot2grid(layout, (1, 0))
    ax02 = plt.subplot2grid(layout, (2, 0))
    ax03 = plt.subplot2grid(layout, (3, 0))

    for name in self.stock_names:
        x, y = self.datareader(name=name, start=start, end=end, verbose=verbose)
        result = smt.seasonal_decompose(y, model='additive',
                                        freq=self.config['scenario04']['freq'],
                                        two_sided=False)
        ax00.scatter(x, result.observed, marker='*')
        ax00.plot(x, result.observed, label=f'{name}')
        ax01.scatter(x, result.trend, marker='*')
        ax01.plot(x, result.trend, label=f'{name}')
        ax02.scatter(x, result.seasonal, marker='*')
        ax02.plot(x, result.seasonal, label=f'{name}')
        ax03.scatter(x, result.resid, marker='*')
        ax03.plot(x, result.resid, label=f'{name}')

    for ax in (ax00, ax01, ax02, ax03):
        ax.grid(True)
        ax.legend()
    plt.tight_layout()
    if save:
        plt.savefig('analysis04.png')
    plt.show()
def decompose_ts(ts, s=250, figsize=(20, 13)):
    decomposition = smt.seasonal_decompose(ts, freq=s)
    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

    fig, ax = plt.subplots(nrows=4, ncols=1, sharex=True, sharey=False, figsize=figsize)
    for axis, (series, title) in zip(ax, [(ts, 'Original'), (trend, 'Trend'),
                                          (seasonal, 'Seasonality'), (residual, 'Residuals')]):
        axis.plot(series)
        axis.set_title(title)
        axis.grid(True)

    return {"trend": trend, "seasonal": seasonal, "residual": residual}
dtw_matrix = utilities.DTW_distance_matrix(df.head(1000).drop('timestamp', axis=1), 1)
dtw_matrix = dtw_matrix.T + dtw_matrix  # symmetrize the triangular DTW matrix
linkage_matrix = linkage(ssd.squareform(dtw_matrix), method='weighted', metric='euclidean')
print(is_valid_linkage(linkage_matrix))

plt.figure(1)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
scipy.cluster.hierarchy.dendrogram(linkage_matrix, labels=columns, leaf_rotation=90.,
                                   leaf_font_size=12., show_contracted=True)
plt.show()

# same clustering on the returns series
dtw_matrix = utilities.DTW_distance_matrix(df_returns.head(1000).drop('timestamp', axis=1), 1)
dtw_matrix = dtw_matrix.T + dtw_matrix
linkage_matrix = linkage(ssd.squareform(dtw_matrix), method='weighted', metric='euclidean')
print(is_valid_linkage(linkage_matrix))

plt.figure(1)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
scipy.cluster.hierarchy.dendrogram(linkage_matrix, labels=columns, leaf_rotation=90.,
                                   leaf_font_size=12., show_contracted=True)
plt.show()

### Seasonality
df.reset_index(inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp')
smt.seasonal_decompose(df.GBPUSD_close, freq=60).plot()
plt.show()

#%% loading stock data
start = '2018-06-29'
end = '2020-09-29'
df = web.DataReader('005380', 'naver', start=start, end=end)
index = pd.to_numeric(pd.to_datetime(df.index)).values
series = df['Low'].values
ts = pd.DataFrame({'index': index, 'series': series})
ts = ts.set_index('index')
data = ts.values.astype(np.float64)
data = data.squeeze()

period = int(365 / 4)
# decompose once and reuse the components
decomposition = smt.seasonal_decompose(data, model='additive', period=period)
trend = decomposition.trend
seasonal = decomposition.seasonal
resid = decomposition.resid
tsplot(data, period=period)

#%% [Stationary]
stationary(data)
#stationary(trend)
#stationary(seasonal)
#stationary(resid)

#%% [Descriptive Statistics]
pd.DataFrame(data).describe()
#pd.DataFrame(trend).describe()
#pd.DataFrame(seasonal).describe()
# Now fit another OLS model.
data = (y.to_frame(name="y")
         .assign(Δy=lambda df: df.y.diff())
         .assign(LΔy=lambda df: df.Δy.shift()))
mod_stationary = smf.ols("Δy ~ LΔy", data=data.dropna())
res_stationary = mod_stationary.fit()
tsplot(res_stationary.resid, lags=24);

# ##### Seasonality
# We have a strong monthly seasonality.
smt.seasonal_decompose(y).plot()

# #### ARIMA
# ARIMA can handle all the problems identified above:
#
# - Multicollinearity
# - Autocorrelation
# - Non-stationarity
# - Seasonality
#
# **A**utoRegressive **I**ntegrated **M**oving **A**verage
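# A minimal sketch of fitting the ARIMA this note builds toward, assuming `y`
# is the monthly series decomposed above and smt is statsmodels.tsa.api
# (statsmodels exposes ARIMA through the more general SARIMAX class).
mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1))  # AR(1), one difference, MA(1)
res = mod.fit()
print(res.summary())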
        if i >= j:
            plt.figure(figsize=(10, 10))
            plt.xcorr(residuals[data.columns.values[i]],
                      residuals[data.columns.values[j]], maxlags=40)
            plt.suptitle(data.columns.values[i] + " x " + data.columns.values[j])
            plt.show()

# VAR(7) seems to filter the series well even with the seasonality.
# Checking a VAR(1) with deseasonalized data.

# applying an individual seasonal adjustment to each series
des_data = data.copy()  # copy so the original frame is not mutated
for i in data.columns:
    des_data[i] = data[i] - seasonal_decompose(data[i], freq=7).seasonal

# fitting a VAR(p) with deseasonalized data
model = VAR(des_data)

# getting VAR(1) residuals
fittedmodel = model.fit(maxlags=1)
residuals = fittedmodel.resid

for i in range(data.shape[1]):
    for j in range(data.shape[1]):
        if i >= j:
            plt.figure(figsize=(10, 10))
            plt.xcorr(residuals[data.columns.values[i]],
                      residuals[data.columns.values[j]], maxlags=40)
fc, confint = smodel.predict(n_periods=24, return_conf_int=True)
fc_df = pd.DataFrame(confint, columns=['lower', 'upper'])
fc_df['fc'] = fc
fc_df.index = pd.date_range(dd.index[-1], periods=24, freq='MS')

plt.plot(dd)
plt.plot(fc_df.fc)
plt.fill_between(fc_df.index, fc_df.lower, fc_df.upper, alpha=.15)
plt.title("Forecast of drug sales - SARIMA")

#%% 15. How to build SARIMAX Model with exogenous variable
from dateutil.parser import parse

sd = ts.seasonal_decompose(dd[-36:], model='multiplicative', extrapolate_trend='freq')
sd
dir(sd)
seasonal = sd.seasonal[-12:].to_frame()
seasonal['month'] = seasonal.index.month
seasonal
ss['month'] = ss.index.month
ss
# !!!
dfs = pd.merge(ss, seasonal, how='left', on='month')
dfs
dfs.index = ss.index
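# Hedged sketch of the SARIMAX fit this section is building toward. Assumptions
# not in the original: the merged frame `dfs` exposes the seasonal factor in a
# column named 'seasonal', `dd` is the monthly drug-sales series, and the
# (1,1,1)x(0,1,1,12) order is an illustrative choice, not the tuned model.
import statsmodels.api as sm

exog = dfs[['seasonal']]
sarimax_res = sm.tsa.SARIMAX(dd[-len(dfs):],  # align endog to the exog window
                             exog=exog, order=(1, 1, 1),
                             seasonal_order=(0, 1, 1, 12)).fit(disp=False)
print(sarimax_res.summary())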
def scenario05(self, start='2020-01-01', end='2020-10-01', verbose=True, save=None):
    if not end:
        end = np.datetime64('today')
    self.config['scenario05'] = dict()
    self.config['scenario05']['freq'] = self.config['freq']
    for name in self.stock_names:
        self.config['scenario05'][f'{name}'] = self.datareader(
            name=name, start=start, end=end, verbose=verbose)

    fig = plt.figure(figsize=(20, 18))
    layout = (4, 1)
    ax00 = plt.subplot2grid(layout, (0, 0))
    ax01 = plt.subplot2grid(layout, (1, 0))
    ax02 = plt.subplot2grid(layout, (2, 0))
    ax03 = plt.subplot2grid(layout, (3, 0))

    for name in self.stock_names:
        x, y = self.config['scenario05'][f'{name}']
        ax00.scatter(x, y, marker='*')
        ax00.plot(x, y, label=f'{name}')
        y = (y - y.mean()) / y.std()
        ax02.scatter(x, y, marker='*')
        ax02.plot(x, y, label=f'{name}')

        x, y = self.config['scenario05'][f'{name}']
        trend = smt.seasonal_decompose(y, model='additive',
                                       freq=self.config['scenario05']['freq'],
                                       two_sided=True).trend
        ndiff = int(np.isnan(trend).sum() / 2)  # NaNs padded symmetrically by the two-sided filter
        x = x[ndiff:-ndiff]
        y = trend[~np.isnan(trend)]
        ax01.scatter(x, y, marker='*')
        ax01.plot(x, y, label=f'{name}')
        y = (y - y.mean()) / y.std()
        ax03.scatter(x, y, marker='*')
        ax03.plot(x, y, label=f'{name}')

    for ax in (ax00, ax01, ax02, ax03):
        ax.grid(True)
        ax.legend()
    ax00.set_title('[Observed]')
    ax01.set_title('[Trend]')
    ax02.set_title('[Normalized observed]')
    ax03.set_title('[Normalized trend]')
    ax02.axhline(0, c='black', ls='--')
    ax03.axhline(0, c='black', ls='--')

    plt.tight_layout()
    if save:
        plt.savefig('analysis05.png')
    plt.show()
def scenario03(self, start='2020-01-01', end='2020-10-01', verbose=True, save=None):
    if not end:
        end = np.datetime64('today')
    time_range = np.arange(start, end, dtype='datetime64[D]')
    end_ = time_range[int(len(time_range) * self.config['ratio'])]

    self.config['scenario03'] = dict()
    self.config['scenario03']['freq'] = self.config['freq']
    self.config['scenario03']['start'] = start
    self.config['scenario03']['end'] = end
    self.config['scenario03']['end_'] = end_
    self.config['scenario03']['ks200'] = fdr.DataReader('KS200', start=start, end=end)['Close']
    self.config['scenario03']['ks200_'] = fdr.DataReader('KS200', start=start, end=end_)['Close']
    for name in self.stock_names:
        self.config['scenario03'][f'{name}_'] = self.datareader(
            name=name, start=start, end=end_, verbose=verbose)
        self.config['scenario03'][f'{name}'] = self.datareader(
            name=name, start=start, end=end, verbose=verbose)

    fig = plt.figure(figsize=(20, 35))
    layout = (8, 1)
    ax00 = plt.subplot2grid(layout, (0, 0))
    ax01 = plt.subplot2grid(layout, (1, 0))
    ax02 = plt.subplot2grid(layout, (2, 0))
    ax03 = plt.subplot2grid(layout, (3, 0))
    ax04 = plt.subplot2grid(layout, (4, 0))
    ax05 = plt.subplot2grid(layout, (5, 0))
    ax06 = plt.subplot2grid(layout, (6, 0), rowspan=2)

    # axes (0, 0): difference of normalized stock prices against the KOSPI200 index, until end_
    ks200_x = self.config['scenario03']['ks200_'].index.values
    ks200_y = self.config['scenario03']['ks200_'].values
    ks200_y = (ks200_y - ks200_y.mean()) / ks200_y.std()
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}_']
        y = (y - y.mean()) / y.std()
        y = y - ks200_y
        ax00.scatter(x, y, marker='*')
        ax00.plot(x, y, label=f'{name}')
    ax00.axhline(0, c='black', ls='--')
    ax00.axvline(start, c='grey', ls='--')
    ax00.axvline(end_, c='r', ls='--')
    ax00.set_title('[Past : Diff]')
    ax00.grid(True)
    ax00.legend()

    # axes (1, 0): normalized stock prices until end_
    ks200_x = self.config['scenario03']['ks200_'].index.values
    ks200_y = self.config['scenario03']['ks200_'].values
    ks200_y = (ks200_y - ks200_y.mean()) / ks200_y.std()
    ax01.scatter(ks200_x, ks200_y, marker='*', c='black')
    ax01.plot(ks200_x, ks200_y, label='KS200', c='black')
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}_']
        y = (y - y.mean()) / y.std()
        ax01.scatter(x, y, marker='*')
        ax01.plot(x, y, label=f'{name}')
    ax01.axhline(0, c='black', ls='--')
    ax01.axvline(start, c='grey', ls='--')
    ax01.axvline(end_, c='r', ls='--')
    ax01.set_title('[Past : Normalized]')
    ax01.grid(True)
    ax01.legend()

    # axes (2, 0): normalized trend of stock prices until end_
    ks200_x = self.config['scenario03']['ks200_'].index.values
    ks200_y = self.config['scenario03']['ks200_'].values
    trend = smt.seasonal_decompose(ks200_y, model='additive',
                                   freq=self.config['scenario03']['freq'],
                                   two_sided=True).trend
    ndiff = int(np.isnan(trend).sum() / 2)  # NaNs padded symmetrically by the two-sided filter
    ks200_x = ks200_x[ndiff:-ndiff]
    ks200_y = trend[~np.isnan(trend)]
    ks200_y = (ks200_y - ks200_y.mean()) / ks200_y.std()
    ax02.scatter(ks200_x, ks200_y, marker='*', c='black')
    ax02.plot(ks200_x, ks200_y, label='KS200', c='black')
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}_']
        trend = smt.seasonal_decompose(y, model='additive',
                                       freq=self.config['scenario03']['freq'],
                                       two_sided=True).trend
        ndiff = int(np.isnan(trend).sum() / 2)
        x = x[ndiff:-ndiff]
        y = trend[~np.isnan(trend)]
        y = (y - y.mean()) / y.std()
        ax02.scatter(x, y, marker='*')
        ax02.plot(x, y, label=f'{name}')
    ax02.axhline(0, c='black', ls='--')
    ax02.axvline(start, c='grey', ls='--')
    ax02.axvline(end_, c='r', ls='--')
    ax02.set_title('[Past : Normalized trend]')
    ax02.grid(True)
    ax02.legend()

    # axes (3, 0): difference of normalized stock prices against the KOSPI200 index
    ks200_x = self.config['scenario03']['ks200'].index.values
    ks200_y = self.config['scenario03']['ks200'].values
    ks200_y = (ks200_y - ks200_y.mean()) / ks200_y.std()
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}']
        y = (y - y.mean()) / y.std()
        y = y - ks200_y
        ax03.scatter(x, y, marker='*')
        ax03.plot(x, y, label=f'{name}')
    ax03.axhline(0, c='black', ls='--')
    ax03.axvline(start, c='grey', ls='--')
    ax03.axvline(end_, c='r', ls='--')
    ax03.axvline(end, c='black', ls='--')
    ax03.set_title('[Present : Diff]')
    ax03.grid(True)
    ax03.legend()

    # axes (4, 0): normalized stock prices
    ks200_x = self.config['scenario03']['ks200'].index.values
    ks200_y = self.config['scenario03']['ks200'].values
    ks200_y = (ks200_y - ks200_y.mean()) / ks200_y.std()
    ax04.scatter(ks200_x, ks200_y, marker='*', c='black')
    ax04.plot(ks200_x, ks200_y, label='KS200', c='black')
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}']
        y = (y - y.mean()) / y.std()
        ax04.scatter(x, y, marker='*')
        ax04.plot(x, y, label=f'{name}')
    ax04.axhline(0, c='black', ls='--')
    ax04.axvline(start, c='grey', ls='--')
    ax04.axvline(end_, c='r', ls='--')
    ax04.axvline(end, c='black', ls='--')
    ax04.set_title('[Present : Normalized]')
    ax04.grid(True)
    ax04.legend()

    # axes (5, 0): normalized trend of stock prices
    ks200_x = self.config['scenario03']['ks200'].index.values
    ks200_y = self.config['scenario03']['ks200'].values
    trend = smt.seasonal_decompose(ks200_y, model='additive',
                                   freq=self.config['scenario03']['freq'],
                                   two_sided=True).trend
    ndiff = int(np.isnan(trend).sum() / 2)
    ks200_x = ks200_x[ndiff:-ndiff]
    ks200_y = trend[~np.isnan(trend)]
    ks200_y = (ks200_y - ks200_y.mean()) / ks200_y.std()
    ax05.scatter(ks200_x, ks200_y, marker='*', c='black')
    ax05.plot(ks200_x, ks200_y, label='KS200', c='black')
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}']
        trend = smt.seasonal_decompose(y, model='additive',
                                       freq=self.config['scenario03']['freq'],
                                       two_sided=True).trend
        ndiff = int(np.isnan(trend).sum() / 2)
        x = x[ndiff:-ndiff]
        y = trend[~np.isnan(trend)]
        y = (y - y.mean()) / y.std()
        ax05.scatter(x, y, marker='*')
        ax05.plot(x, y, label=f'{name}')
    ax05.axhline(0, c='black', ls='--')
    ax05.axvline(start, c='grey', ls='--')
    ax05.axvline(end_, c='r', ls='--')
    ax05.axvline(end, c='black', ls='--')
    ax05.set_title('[Present : Normalized trend]')
    ax05.grid(True)
    ax05.legend()

    # axes (6, 0): raw stock prices
    for name in self.stock_names:
        x, y = self.config['scenario03'][f'{name}']
        ax06.scatter(x, y, marker='*')
        ax06.plot(x, y, label=f'{name}')
    ax06.axvline(start, c='grey', ls='--')
    ax06.axvline(end_, c='r', ls='--')
    ax06.axvline(end, c='black', ls='--')
    ax06.set_title('[Present : Origin]')
    ax06.grid(True)
    ax06.legend()

    plt.tight_layout()
    if save:
        plt.savefig('analysis03.png')
    plt.show()
def plot_decompose(df):
    sm.seasonal_decompose(df['rate']).plot()
    plt.show()
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import pandas_datareader as web
import statsmodels.tsa.api as tsa
import pandas as pd
import numpy as np
from numpy.linalg import LinAlgError

p = f'{os.getcwd()}/img/'

industrial_production = web.DataReader('IPGMFN', 'fred', '2000', '2020-12').squeeze()
print(type(industrial_production.head()))

components = tsa.seasonal_decompose(industrial_production, model='additive')
ts = industrial_production.to_frame('Original').assign(
    Trend=components.trend).assign(Seasonality=components.seasonal).assign(
    Residual=components.resid)

ts.plot(subplots=True, figsize=(14, 8))
plt.savefig(f'{p}1.png')  # save before show, otherwise an empty figure is written
plt.show()

# time series stationarity
industrial_production_log = np.log(industrial_production)
industrial_production_log_diff = industrial_production_log.diff(12).dropna()
# seasonal differencing => yoy instantaneous returns
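# Hedged follow-up sketch: an ADF unit-root test on the seasonally differenced
# log series (tsa.adfuller comes from the statsmodels.tsa.api import above; the
# 5% threshold is a conventional choice, not part of the original).
adf_stat, pvalue, *_ = tsa.adfuller(industrial_production_log_diff)
print(f'ADF statistic: {adf_stat:.3f}, p-value: {pvalue:.4f}')
if pvalue < 0.05:
    print('Unit-root null rejected: the differenced series looks stationary.')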
import pandas_datareader.data as web
import statsmodels.tsa.api as tsa

ind = web.DataReader('IPGMFN', 'fred', '1988', '2017-12').squeeze()
components = tsa.seasonal_decompose(ind, model='additive')
ts = (ind.to_frame('Original')
         .assign(Trend=components.trend)
         .assign(Seasonality=components.seasonal)
         .assign(Residual=components.resid))
ts.plot(subplots=True, figsize=(14, 8))

df = web.DataReader(name='SP500', data_source='fred', start=2009).squeeze().to_frame('close')
spx = web.DataReader('SP500', 'fred', 2009, 2020).squeeze().to_frame('close')
def preprocess_load_data_forec(dataframe, quarter_hour=True, short_term=True,
                               scaler=None, n_ahead=1, calendars=None):
    """Pre-process load data for forecasting: scale, split into train/test,
    de-seasonalize, and construct features.

    Expects a pandas DataFrame with a DatetimeIndex and a 'load' column
    containing the load data in MW with no missing values. Resolution is
    quarter-hourly if quarter_hour=True, otherwise assumed to be hourly.
    """
    # use GW for convenience and readability later; the standard-scaled values are also smaller
    dataframe = dataframe / 1000

    # split first so the scaler and the deseasonalizing step are fit on the train set only
    train_df_o, test_df_o = train_test_split(dataframe, test_size=0.2, shuffle=False)
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(np.array(train_df_o['load']).reshape(-1, 1))
    train_df = pd.DataFrame(
        {'load': scaler.transform(np.array(train_df_o['load']).reshape(-1, 1)).squeeze()},
        index=train_df_o.index)
    test_df = pd.DataFrame(
        {'load': scaler.transform(np.array(test_df_o['load']).reshape(-1, 1)).squeeze()},
        index=test_df_o.index)

    # deseasonalize: decompose, then model each seasonal component with Holt-Winters
    offset_train = pd.DataFrame(0, index=train_df.index, columns=['load'])
    offset_test = pd.DataFrame(0, index=test_df.index, columns=['load'])
    seasonal_periods = [24, 24 * 7]  # daily and weekly cycles, in hours
    freq = 'H'
    if quarter_hour:
        seasonal_periods = [p * 4 for p in seasonal_periods]
        freq = '15T'
    for p in seasonal_periods:
        decomp = seasonal_decompose(train_df, period=p)
        exp = ExponentialSmoothing(decomp.seasonal, seasonal_periods=p,
                                   seasonal='add', freq=freq).fit()
        train_pred = exp.predict(start=train_df.index[0], end=train_df.index[-1])
        test_pred = exp.predict(start=test_df.index[0], end=test_df.index[-1])
        train_df['load'] = train_df['load'] - train_pred
        test_df['load'] = test_df['load'] - test_pred
        offset_train['load'] = offset_train['load'] + train_pred
        offset_test['load'] = offset_test['load'] + test_pred

    # construct features
    train_df = construct_features(dataframe=train_df, offset=offset_train,
                                  short_term=short_term, quarter_hour=quarter_hour,
                                  n_ahead=n_ahead, calendars=calendars)
    test_df = construct_features(dataframe=test_df, offset=offset_test,
                                 short_term=short_term, quarter_hour=quarter_hour,
                                 n_ahead=n_ahead, calendars=calendars)
    return train_df, test_df, scaler
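# Hedged usage sketch for preprocess_load_data_forec. Assumptions: an hourly
# load frame with a DatetimeIndex and a 'load' column in MW; train_test_split
# and StandardScaler come from scikit-learn as the function body implies; and
# construct_features is defined elsewhere in this codebase. The series is synthetic.
import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=24 * 7 * 8, freq='H')  # eight weeks, hourly
load = pd.DataFrame(
    {'load': 50000 + 5000 * np.sin(2 * np.pi * idx.hour / 24)
             + 500 * np.random.randn(len(idx))},
    index=idx)
train_df, test_df, scaler = preprocess_load_data_forec(load, quarter_hour=False)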
def decompose(data):
    #print(tsa.seasonal_decompose(ts(data)).seasonal)
    decomposition = tsa.seasonal_decompose(ts(data))

    def _records(component):
        # fill NaNs from the convolution edges and serialize to a list of dicts
        return (component.fillna(0).reset_index(drop=True)
                         .reset_index().to_dict(orient="records"))

    trend = _records(decomposition.trend)
    seasonal = _records(decomposition.seasonal)
    resid = _records(decomposition.resid)
    return trend, seasonal, resid
def example_3():
    import pandas_datareader as pdr

    gs = pdr.data.DataReader("GS", data_source='yahoo', start='2006-01-01', end='2010-01-01')
    print(gs.head().round(2))
    print(gs.loc[pd.Timestamp('2006-01-01'):pd.Timestamp('2006-12-31')].head())
    print(gs.loc['2006'].head())

    #--------------------
    # Resampling.
    if True:
        print(gs.resample("5d").mean().head())
        print(gs.resample("W").agg(['mean', 'sum']).head())
        # You can up-sample to convert to a higher frequency. The new points are filled with NaNs.
        print(gs.resample("6h").mean().head())

    #--------------------
    # Rolling, expanding, exponentially weighted (EW).
    if False:
        gs.Close.plot(label='Raw')
        gs.Close.rolling(28).mean().plot(label='28D MA')
        gs.Close.expanding().mean().plot(label='Expanding Average')
        gs.Close.ewm(alpha=0.03).mean().plot(label='EWMA($\\alpha=.03$)')
        plt.legend(bbox_to_anchor=(1.25, .5))
        plt.tight_layout()
        plt.ylabel("Close ($)")
        sns.despine()

        # Each of .rolling, .expanding, and .ewm returns a deferred object, similar to a GroupBy.
        roll = gs.Close.rolling(30, center=True)
        m = roll.agg(['mean', 'std'])
        plt.figure()
        ax = m['mean'].plot()
        ax.fill_between(m.index, m['mean'] - m['std'], m['mean'] + m['std'], alpha=.25)
        plt.tight_layout()
        plt.ylabel("Close ($)")
        sns.despine()

    #--------------------
    # Grab bag.
    if False:
        # Offsets.
        # These are similar to dateutil.relativedelta, but work with arrays.
        print(gs.index + pd.DateOffset(months=3, days=-2))

        # Holiday calendars.
        from pandas.tseries.holiday import USColumbusDay
        print(USColumbusDay.dates('2015-01-01', '2020-01-01'))

        # Timezones.
        # tz naive -> tz aware -> converted to the desired timezone (UTC).
        print(gs.tz_localize('US/Eastern').tz_convert('UTC').head())

    #--------------------
    # Modeling time series.
    if True:
        from collections import namedtuple
        import statsmodels.formula.api as smf
        import statsmodels.tsa.api as smt
        import statsmodels.api as sm
        from modern_pandas_utils import download_timeseries

        def download_many(start, end):
            months = pd.period_range(start, end=end, freq='M')
            # We could easily parallelize this loop.
            for i, month in enumerate(months):
                download_timeseries(month)

        def time_to_datetime(df, columns):
            '''
            Combine all time items into datetimes.
            2014-01-01,1149.0 -> 2014-01-01T11:49:00
            '''
            def converter(col):
                timepart = (col.astype(str)
                            .str.replace(r'\.0$', '')  # NaNs force float dtype
                            .str.pad(4, fillchar='0'))
                return pd.to_datetime(df['fl_date'] + ' ' +
                                      timepart.str.slice(0, 2) + ':' +
                                      timepart.str.slice(2, 4),
                                      errors='coerce')
            df[columns] = df[columns].apply(converter)
            return df

        def unzip_one(fp):
            try:
                zf = zipfile.ZipFile(fp)
                csv = zf.extract(zf.filelist[0])
                return csv
            except zipfile.BadZipFile as ex:
                print('zipfile.BadZipFile raised in {}: {}.'.format(fp, ex))
                raise

        def read_one(fp):
            df = (pd.read_csv(fp, encoding='latin1')
                  .rename(columns=str.lower)
                  .drop('unnamed: 6', axis=1)
                  .pipe(time_to_datetime, ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time'])
                  .assign(fl_date=lambda x: pd.to_datetime(x['fl_date'])))
            return df

        store = './modern_pandas_data/ts.hdf5'
        if not os.path.exists(store):
            download_many('2000-01-01', '2016-01-01')
            zips = glob.glob(os.path.join('modern_pandas_data', 'timeseries', '*.zip'))
            csvs = [unzip_one(fp) for fp in zips]
            dfs = [read_one(fp) for fp in csvs]
            df = pd.concat(dfs, ignore_index=True)
            df['origin'] = df['origin'].astype('category')
            df.to_hdf(store, 'ts', format='table')
        else:
            df = pd.read_hdf(store, 'ts')
        with pd.option_context('display.max_rows', 100):
            print(df.dtypes)

        daily = df.fl_date.value_counts().sort_index()
        y = daily.resample('MS').mean()
        print(y.head())

        ax = y.plot()
        ax.set(ylabel='Average Monthly Flights')
        sns.despine()

        X = pd.concat([y.shift(i) for i in range(6)], axis=1,
                      keys=['y'] + ['L%s' % i for i in range(1, 6)]).dropna()
        print(X.head())

        mod_lagged = smf.ols('y ~ trend + L1 + L2 + L3 + L4 + L5',
                             data=X.assign(trend=np.arange(len(X))))
        res_lagged = mod_lagged.fit()
        res_lagged.summary()

        sns.heatmap(X.corr())

        ax = res_lagged.params.drop(['Intercept', 'trend']).plot.bar(rot=0)
        plt.ylabel('Coefficient')
        sns.despine()

        # Autocorrelation.
        # 'Results.resid' is a series of residuals: y - ŷ.
        mod_trend = sm.OLS.from_formula(
            'y ~ trend', data=y.to_frame(name='y').assign(trend=np.arange(len(y))))
        res_trend = mod_trend.fit()

        def tsplot(y, lags=None, figsize=(10, 8)):
            fig = plt.figure(figsize=figsize)
            layout = (2, 2)
            ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
            acf_ax = plt.subplot2grid(layout, (1, 0))
            pacf_ax = plt.subplot2grid(layout, (1, 1))
            y.plot(ax=ts_ax)
            smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
            smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
            [ax.set_xlim(1.5) for ax in [acf_ax, pacf_ax]]
            sns.despine()
            plt.tight_layout()
            return ts_ax, acf_ax, pacf_ax

        tsplot(res_trend.resid, lags=36)

        y.to_frame(name='y').assign(Δy=lambda x: x.y.diff()).plot(subplots=True)
        sns.despine()

        ADF = namedtuple("ADF", "adf pvalue usedlag nobs critical icbest")
        #ADF(*smt.adfuller(y))._asdict()
        ADF(*smt.adfuller(y.dropna()))._asdict()
        ADF(*smt.adfuller(y.diff().dropna()))._asdict()

        data = (y.to_frame(name='y')
                 .assign(Δy=lambda df: df.y.diff())
                 .assign(LΔy=lambda df: df.Δy.shift()))
        mod_stationary = smf.ols('Δy ~ LΔy', data=data.dropna())
        res_stationary = mod_stationary.fit()
        tsplot(res_stationary.resid, lags=24)

        # Seasonality.
        #smt.seasonal_decompose(y).plot()
        smt.seasonal_decompose(y.fillna(method='ffill')).plot()

        # ARIMA.
        mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1))
        res = mod.fit()
        tsplot(res.resid[2:], lags=24)
        res.summary()

        mod_seasonal = smt.SARIMAX(y, trend='c', order=(1, 1, 2),
                                   seasonal_order=(0, 1, 2, 12),
                                   simple_differencing=False)
        res_seasonal = mod_seasonal.fit()
        res_seasonal.summary()
        tsplot(res_seasonal.resid[12:], lags=24)

        # Forecasting.
        pred = res_seasonal.get_prediction(start='2001-03-01')
        pred_ci = pred.conf_int()
        plt.figure()
        ax = y.plot(label='observed')
        pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)
        ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1],
                        color='k', alpha=.2)
        ax.set_ylabel("Monthly Flights")
        plt.legend()
        sns.despine()

        pred_dy = res_seasonal.get_prediction(start='2002-03-01', dynamic='2013-01-01')
        pred_dy_ci = pred_dy.conf_int()
        plt.figure()
        ax = y.plot(label='observed')
        pred_dy.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_dy_ci.index, pred_dy_ci.iloc[:, 0], pred_dy_ci.iloc[:, 1],
                        color='k', alpha=.25)
        ax.set_ylabel("Monthly Flights")

        # Highlight the forecast area.
        ax.fill_betweenx(ax.get_ylim(), pd.Timestamp('2013-01-01'), y.index[-1],
                         alpha=.1, zorder=-1)
        ax.annotate('Dynamic $\\longrightarrow$', (pd.Timestamp('2013-02-01'), 550))
        plt.legend()
        sns.despine()
        plt.show()