示例#1
0
def inspect_seasonality(ts, freq=252):
    """Plot an additive seasonal decomposition of ``ts`` on a 12x8 figure.

    The global ``rcParams['figure.figsize']`` is overridden only while the
    plot is drawn and is restored afterwards, even when the decomposition or
    the plotting raises.

    Args:
        ts: Time series handed to ``smt.seasonal_decompose``.
        freq: Forwarded as the ``freq`` argument of ``seasonal_decompose``.

    Returns:
        The value returned by the decomposition result's ``plot()``.
    """
    saved_figsize = rcParams['figure.figsize']
    rcParams['figure.figsize'] = 12, 8
    try:
        res = smt.seasonal_decompose(ts, model='additive', freq=freq)
        return res.plot()
    finally:
        # Always hand the caller's figure size back, including on errors.
        rcParams['figure.figsize'] = saved_figsize
示例#2
0
def decompose(x, method="STL", **kwargs):
    """Decompose a time series with the selected routine.

    Args:
        x: Series passed unchanged to the selected routine.
        method: ``"STL"`` fits ``STL``; ``"MA"`` calls ``seasonal_decompose``.
        **kwargs: Forwarded to the selected routine.

    Returns:
        The fitted ``STL`` result or the ``seasonal_decompose`` result.

    Raises:
        ValueError: If ``method`` is neither ``"STL"`` nor ``"MA"``.
    """
    if method == "STL":
        return STL(x, **kwargs).fit()
    if method == "MA":
        return seasonal_decompose(x, **kwargs)
    raise ValueError("`method` must be either 'STL' or 'MA'.")
def decomp_adjust(data, train_hours, test_hours, model):
    """Add seasonality columns to ``data`` in place.

    The first ``train_hours`` rows of ``'total load actual'`` are decomposed
    with a period of 24. The first 24 seasonal values are then repeated
    ``int((train_hours + test_hours) / 24)`` times and stored in
    ``data['seasonality']``. ``data['seasonally decomposed']`` is the load
    minus that pattern for the additive model, or divided by it otherwise.

    Args:
        data: DataFrame with a ``'total load actual'`` column; its index is
            converted to UTC datetimes. Modified in place.
        train_hours: Number of leading rows used for the decomposition.
        test_hours: Number of rows after the training span.
        model: Decomposition model name forwarded to ``seasonal_decompose``.

    Returns:
        None.
    """
    data.index = pd.to_datetime(data.index, utc=True)

    training_load = data['total load actual'][0:train_hours]
    decomp = seasonal_decompose(training_load, model=model, freq=24)

    one_cycle = list(decomp.seasonal[:24])
    repeats = int((train_hours + test_hours) / 24)
    seasonality = one_cycle * repeats

    data['seasonality'] = seasonality
    if model == "additive":
        adjusted = data['total load actual'] - seasonality
    else:
        adjusted = data['total load actual'] / seasonality
    data['seasonally decomposed'] = adjusted
示例#4
0
def tsplot(TS, period=7, lags=None, figsize=(18, 20), style='bmh'):
    """Draw a diagnostic dashboard for a time series and save it to disk.

    The figure stacks the raw series, the trend / seasonal / residual parts of
    an additive decomposition, ACF and PACF plots, a QQ plot and a probability
    plot. It is written to ``time_series_analysis.png`` and then shown.

    Args:
        TS: The series; anything that is not a ``pd.Series`` is wrapped in one.
        period: Seasonal period passed to ``smt.seasonal_decompose``.
        lags: Forwarded to ``plot_acf`` and ``plot_pacf``.
        figsize: Figure size passed to ``plt.figure``.
        style: Matplotlib style context used while drawing.
    """
    if not isinstance(TS, pd.Series):
        TS = pd.Series(TS)

    # Decompose once and reuse the result for all three component panels.
    decomposition = smt.seasonal_decompose(TS, model='additive', period=period)

    with plt.style.context(style):
        plt.figure(figsize=figsize)

        layout = (6, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        dc_trend_ax = plt.subplot2grid(layout, (1, 0), colspan=2)
        dc_seasonal_ax = plt.subplot2grid(layout, (2, 0), colspan=2)
        dc_resid_ax = plt.subplot2grid(layout, (3, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (4, 0))
        pacf_ax = plt.subplot2grid(layout, (4, 1))
        qq_ax = plt.subplot2grid(layout, (5, 0))
        pp_ax = plt.subplot2grid(layout, (5, 1))

        TS.plot(ax=ts_ax)
        ts_ax.set_title('Time Series')
        decomposition.trend.plot(ax=dc_trend_ax)
        dc_trend_ax.set_title('[Decompose] Time Series Trend')
        decomposition.seasonal.plot(ax=dc_seasonal_ax)
        dc_seasonal_ax.set_title('[Decompose] Time Series Seasonal')
        decomposition.resid.plot(ax=dc_resid_ax)
        dc_resid_ax.set_title('[Decompose] Time Series Resid')
        smt.graphics.plot_acf(TS, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(TS, lags=lags, ax=pacf_ax, alpha=0.5)
        sm.qqplot(TS, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        stats.probplot(TS, sparams=(TS.mean(), TS.std()), plot=pp_ax)

        plt.tight_layout()
        plt.savefig('time_series_analysis.png')
        plt.show()
示例#5
0
    def scenario04(self,
                   start='2020-01-01',
                   end='2020-10-01',
                   verbose=True,
                   save=None):
        """Plot the additive decomposition of every stock in four panels.

        For each name in ``self.stock_names`` the series returned by
        ``self.datareader`` is decomposed (one-sided filter) and its observed,
        trend, seasonal and residual parts are drawn on panels 0-3.

        Args:
            start: Start date forwarded to ``self.datareader``.
            end: End date; a falsy value is replaced by today's date.
            verbose: Forwarded to ``self.datareader``.
            save: When truthy, the figure is written to ``analysis04.png``.
        """
        if not end:
            end = np.datetime64('today')

        self.config['scenario04'] = dict()
        settings = self.config['scenario04']
        settings['freq'] = self.config['freq']

        plt.figure(figsize=(20, 18))
        grid_shape = (4, 1)
        panels = [plt.subplot2grid(grid_shape, (row, 0)) for row in range(4)]
        components = ('observed', 'trend', 'seasonal', 'resid')

        for name in self.stock_names:
            x, y = self.datareader(name=name,
                                   start=start,
                                   end=end,
                                   verbose=verbose)
            result = smt.seasonal_decompose(y,
                                            model='additive',
                                            freq=settings['freq'],
                                            two_sided=False)
            # One panel per decomposition component, markers plus a line.
            for panel, component in zip(panels, components):
                values = getattr(result, component)
                panel.scatter(x, values, marker='*')
                panel.plot(x, values, label=f'{name}')

        for panel in panels:
            panel.grid(True)
        for panel in panels:
            panel.legend()

        plt.tight_layout()
        if save:
            plt.savefig('analysis04.png')
        plt.show()
示例#6
0
def decompose_ts(ts, s=250, figsize=(20,13)):
    """Decompose ``ts`` and plot the original with its three components.

    Args:
        ts: Series to decompose.
        s: Forwarded as the ``freq`` argument of ``smt.seasonal_decompose``.
        figsize: Size of the four-row figure.

    Returns:
        Dict with the ``"trend"``, ``"seasonal"`` and ``"residual"`` components.
    """
    parts = smt.seasonal_decompose(ts, freq=s)
    trend, seasonal, residual = parts.trend, parts.seasonal, parts.resid

    _fig, ax = plt.subplots(nrows=4, ncols=1, sharex=True, sharey=False, figsize=figsize)
    panels = (
        ('Original', ts),
        ('Trend', trend),
        ('Seasonality', seasonal),
        ('Residuals', residual),
    )
    for axis, (title, series) in zip(ax, panels):
        axis.plot(series)
        axis.set_title(title)
        axis.grid(True)

    return {"trend": trend, "seasonal": seasonal, "residual": residual}
示例#7
0
# Hierarchical clustering on DTW distances: one dendrogram for `df`, then one
# for `df_returns`. Only the first 1000 rows of each frame are used.
for frame in (df, df_returns):
    dtw_matrix = utilities.DTW_distance_matrix(frame.head(1000).drop('timestamp', axis=1), 1)
    # Add the transpose (presumably mirroring a triangular result - confirm
    # against DTW_distance_matrix) before condensing with squareform.
    dtw_matrix = dtw_matrix.T + dtw_matrix
    linkage_matrix = linkage(ssd.squareform(dtw_matrix), method='weighted', metric='euclidean')
    print(is_valid_linkage(linkage_matrix))

    plt.figure(1)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    scipy.cluster.hierarchy.dendrogram(linkage_matrix, labels=columns, leaf_rotation=90., leaf_font_size=12., show_contracted=True)
    plt.show()


### Seasonality
# Re-index the frame by parsed timestamps, then decompose the GBPUSD close.
df.reset_index(inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp')
gbpusd_decomposition = smt.seasonal_decompose(df.GBPUSD_close, freq=60)
gbpusd_decomposition.plot()
示例#8
0
        plt.show()


#%% loading stock data
start = '2018-06-29'
end = '2020-09-29'
df = web.DataReader('005380', 'naver', start=start, end=end)
# Numeric timestamps as the index, daily 'Low' prices as the values.
index = pd.to_numeric(pd.to_datetime(df.index)).values
series = df['Low'].values
ts = pd.DataFrame({'index': index, 'series': series})
ts = ts.set_index('index')
# Flatten to a 1-D float array for the statsmodels calls below.
data = ts.values.astype(np.float64)
data = data.squeeze()

period=int(365/4)
# Decompose once and read all three components from the same result
# instead of recomputing the identical decomposition three times.
decomposition = smt.seasonal_decompose(data, model='additive', period=period)
trend = decomposition.trend
seasonal = decomposition.seasonal
resid = decomposition.resid

tsplot(data, period=period)

#%% [Stationary]
stationary(data)
#stationary(trend)
#stationary(seasonal)
#stationary(resid)

#%% [Descriptive Statistics]
pd.DataFrame(data).describe()
#pd.DataFrame(trend).describe()
#pd.DataFrame(seasonal).describe()
示例#9
0
# Fit another OLS model, this time regressing the first difference of y
# (Δy) on its own one-step lag (LΔy).

data = y.to_frame(name="y")
data = data.assign(Δy=lambda frame: frame.y.diff())
data = data.assign(LΔy=lambda frame: frame.Δy.shift())

# Rows with NaN (introduced by diff/shift) are dropped before fitting.
mod_stationary = smf.ols("Δy ~ LΔy", data=data.dropna())
res_stationary = mod_stationary.fit()
tsplot(res_stationary.resid, lags=24)

# ##### Seasonality

# We have a strong monthly seasonality.

seasonal_parts = smt.seasonal_decompose(y)
seasonal_parts.plot()

# #### ARIMA

# ARIMA can handle all the problems specified:
#
# - Multicollinearity
# - Autocorrelation
# - Non-stationarity
# - Seasonality
#
# **A**utoRegressive  
# **I**ntegrated  
# **M**oving  
# **A**verage  
示例#10
0
        if i >= j:
            plt.figure(figsize=(10, 10))
            plt.xcorr(residuals[data.columns.values[i]],
                      residuals[data.columns.values[j]],
                      maxlags=40)
            plt.suptitle(data.columns.values[i] + " x " +
                         data.columns.values[j])
            plt.show()

# VAR(7) seems to filter the series well, even with the seasonality.

# Check a VAR(1) on deseasonalized data: each column has its own period-7
# seasonal component removed.
# Work on a copy: `des_data = data` would only alias the frame, so the loop
# below would silently overwrite the original observations.
des_data = data.copy()
for i in data.columns:
    des_data[i] = (data[i] - seasonal_decompose(data[i], freq=7).seasonal)

# fitting a VAR(p) with deseasonalized data
model = VAR(des_data)

# getting VAR(1) residuals
fittedmodel = model.fit(maxlags=1)
residuals = fittedmodel.resid

# Cross-correlation of the residuals for every column pair with i >= j.
for i in range(data.shape[1]):
    for j in range(data.shape[1]):
        if i >= j:
            plt.figure(figsize=(10, 10))
            plt.xcorr(residuals[data.columns.values[i]],
                      residuals[data.columns.values[j]],
                      maxlags=40)
示例#11
0
# Forecast 24 periods ahead with the fitted model and keep the confidence
# interval bounds alongside the point forecast.
fc, confint = smodel.predict(n_periods=24, return_conf_int=True)

fc_df = pd.DataFrame(confint, columns=['lower', 'upper'])
fc_df['fc'] = fc
# NOTE(review): the forecast index starts at the last observed date of `dd`,
# not at the following month-start - confirm this alignment is intended.
fc_df.index = pd.date_range(dd.index[-1], periods=24, freq='MS')

# Observed series, forecast, and a shaded band between the interval bounds.
plt.plot(dd)
plt.plot(fc_df.fc)
plt.fill_between(fc_df.index, fc_df.lower, fc_df.upper, alpha=.15)
plt.title("Forecast of drug sales - SARIMA")

#%% 15. How to build SARIMAX Model with exogenous variable
# NOTE(review): `dateutils.parser` looks like a typo for `dateutil.parser`;
# `parse` is not used in the visible code - confirm before relying on it.
from dateutils.parser import parse

# Multiplicative decomposition of the last 36 observations; `ts` is assumed to
# be a statsmodels module alias defined elsewhere - TODO confirm.
sd = ts.seasonal_decompose(dd[-36:],
                           model='multiplicative',
                           extrapolate_trend='freq')
sd
dir(sd)

# Keep the last 12 seasonal values and tag each with its calendar month.
seasonal = sd.seasonal[-12:].to_frame()
seasonal['month'] = seasonal.index.month
seasonal

# `ss` is defined outside this excerpt; it gets the same month key.
ss['month'] = ss.index.month
ss

# !!!
# Attach the seasonal value to every row of `ss` by month. Merging on a
# column does not keep `ss`'s index, so it is reassigned afterwards.
dfs = pd.merge(ss, seasonal, how='left', on='month')
dfs
dfs.index = ss.index
示例#12
0
    def scenario05(self,
                   start='2020-01-01',
                   end='2020-10-01',
                   verbose=True,
                   save=None):
        """Plot observed prices and their trend, raw and z-score normalized.

        Data for every name in ``self.stock_names`` is fetched through
        ``self.datareader`` and cached under ``self.config['scenario05']``.
        Four stacked panels show: observed, trend, normalized observed and
        normalized trend.

        Args:
            start: Start date forwarded to ``self.datareader``.
            end: End date; a falsy value is replaced by today's date.
            verbose: Forwarded to ``self.datareader``.
            save: When truthy, the figure is written to ``analysis05.png``.
        """
        if not end:
            end = np.datetime64('today')
        self.config['scenario05'] = dict()
        self.config['scenario05']['freq'] = self.config['freq']
        for name in self.stock_names:
            self.config['scenario05'][f'{name}'] = self.datareader(
                name=name, start=start, end=end, verbose=verbose)

        plt.figure(figsize=(20, 18))
        layout = (4, 1)
        ax00 = plt.subplot2grid(layout, (0, 0))
        ax01 = plt.subplot2grid(layout, (1, 0))
        ax02 = plt.subplot2grid(layout, (2, 0))
        ax03 = plt.subplot2grid(layout, (3, 0))

        for name in self.stock_names:
            x, y = self.config['scenario05'][f'{name}']
            ax00.scatter(x, y, marker='*')
            ax00.plot(x, y, label=f'{name}')

            normalized = (y - y.mean()) / y.std()
            ax02.scatter(x, normalized, marker='*')
            ax02.plot(x, normalized, label=f'{name}')

            trend = smt.seasonal_decompose(
                y,
                model='additive',
                freq=self.config['scenario05']['freq'],
                two_sided=True).trend
            # The two-sided filter leaves NaNs at both ends of the trend; trim
            # the same number of points from each end of x. An explicit stop
            # index is used because `x[ndiff:-ndiff]` is empty when ndiff == 0.
            ndiff = int(np.isnan(trend).sum() / 2)
            trend_x = x[ndiff:len(x) - ndiff]
            trend_y = trend[~np.isnan(trend)]

            ax01.scatter(trend_x, trend_y, marker='*')
            ax01.plot(trend_x, trend_y, label=f'{name}')

            normalized_trend = (trend_y - trend_y.mean()) / trend_y.std()
            ax03.scatter(trend_x, normalized_trend, marker='*')
            ax03.plot(trend_x, normalized_trend, label=f'{name}')

        ax00.grid(True)
        ax01.grid(True)
        ax02.grid(True)
        ax03.grid(True)
        ax00.legend()
        ax01.legend()
        ax02.legend()
        ax03.legend()
        ax00.set_title('[Observed]')
        ax01.set_title('[Trend]')
        ax02.set_title('[Normalized observed]')
        ax03.set_title('[Normalized trend]')
        ax02.axhline(0, c='black', ls='--')
        ax03.axhline(0, c='black', ls='--')
        plt.tight_layout()
        if save:
            plt.savefig('analysis05.png')
        plt.show()
示例#13
0
    def scenario03(self,
                   start='2020-01-01',
                   end='2020-10-01',
                   verbose=True,
                   save=None):
        """Compare every stock with the KS200 index before and after a cut-off.

        The cut-off date ``end_`` is the day found ``self.config['ratio']`` of
        the way through the daily range ``[start, end)``. KS200 closes (via
        ``fdr.DataReader``) and each stock (via ``self.datareader``) are loaded
        twice, up to ``end_`` ("past") and up to ``end`` ("present"), and
        cached under ``self.config['scenario03']``.

        Seven panels are drawn: for past and for present, the normalized
        difference to KS200, the normalized prices and the normalized trend;
        the last panel shows the raw present prices.

        Args:
            start: Start date of the analysis window.
            end: End date; a falsy value is replaced by today's date.
            verbose: Forwarded to ``self.datareader``.
            save: When truthy, the figure is written to ``analysis03.png``.
        """
        if not end:
            end = np.datetime64('today')
        time_range = np.arange(start, end, dtype='datetime64[D]')
        end_ = time_range[int(len(time_range) * self.config['ratio'])]

        self.config['scenario03'] = dict()
        cfg = self.config['scenario03']
        cfg['freq'] = self.config['freq']
        cfg['start'] = start
        cfg['end'] = end
        cfg['end_'] = end_
        cfg['ks200'] = fdr.DataReader('KS200', start=start, end=end)['Close']
        cfg['ks200_'] = fdr.DataReader('KS200', start=start, end=end_)['Close']
        for name in self.stock_names:
            cfg[f'{name}_'] = self.datareader(
                name=name, start=start, end=end_, verbose=verbose)
            cfg[f'{name}'] = self.datareader(
                name=name, start=start, end=end, verbose=verbose)

        def normalize(values):
            # Z-score: subtract the mean, divide by the standard deviation.
            return (values - values.mean()) / values.std()

        def trend_of(xs, ys):
            # Trend of ys with the NaN edges removed, plus the matching xs.
            trend = smt.seasonal_decompose(ys,
                                           model='additive',
                                           freq=cfg['freq'],
                                           two_sided=True).trend
            ndiff = int(np.isnan(trend).sum() / 2)
            # Explicit stop index: `xs[ndiff:-ndiff]` is empty when ndiff == 0.
            return xs[ndiff:len(xs) - ndiff], trend[~np.isnan(trend)]

        def draw(ax, xs, ys, label, **style):
            # Star markers plus a connecting line carrying the legend label.
            ax.scatter(xs, ys, marker='*', **style)
            ax.plot(xs, ys, label=label, **style)

        def finish(ax, title, zero_line=True, mark_end=False):
            # Reference lines (zero, start, cut-off, optional end) and cosmetics.
            if zero_line:
                ax.axhline(0, c='black', ls='--')
            ax.axvline(start, c='grey', ls='--')
            ax.axvline(end_, c='r', ls='--')
            if mark_end:
                ax.axvline(end, c='black', ls='--')
            ax.set_title(title)
            ax.grid(True)
            ax.legend()

        def draw_window(suffix, label, axes, mark_end):
            # Draw the diff / normalized / normalized-trend panels of one window.
            ax_diff, ax_norm, ax_trend = axes
            index_close = cfg[f'ks200{suffix}']
            ks200_x = index_close.index.values
            ks200_raw = index_close.values
            ks200_norm = normalize(ks200_raw)

            # Normalized stock price minus normalized KS200.
            for name in self.stock_names:
                x, y = cfg[f'{name}{suffix}']
                draw(ax_diff, x, normalize(y) - ks200_norm, f'{name}')
            finish(ax_diff, f'[{label} : Diff]', mark_end=mark_end)

            # Normalized prices, KS200 drawn first in black.
            draw(ax_norm, ks200_x, ks200_norm, 'KS200', c='black')
            for name in self.stock_names:
                x, y = cfg[f'{name}{suffix}']
                draw(ax_norm, x, normalize(y), f'{name}')
            finish(ax_norm, f'[{label} : Normalized]', mark_end=mark_end)

            # Normalized trend, KS200 drawn first in black.
            trend_x, trend_y = trend_of(ks200_x, ks200_raw)
            draw(ax_trend, trend_x, normalize(trend_y), 'KS200', c='black')
            for name in self.stock_names:
                x, y = cfg[f'{name}{suffix}']
                trend_x, trend_y = trend_of(x, y)
                draw(ax_trend, trend_x, normalize(trend_y), f'{name}')
            finish(ax_trend, f'[{label} : Normalized trend]', mark_end=mark_end)

        plt.figure(figsize=(20, 35))
        layout = (8, 1)
        ax00 = plt.subplot2grid(layout, (0, 0))
        ax01 = plt.subplot2grid(layout, (1, 0))
        ax02 = plt.subplot2grid(layout, (2, 0))
        ax03 = plt.subplot2grid(layout, (3, 0))
        ax04 = plt.subplot2grid(layout, (4, 0))
        ax05 = plt.subplot2grid(layout, (5, 0))
        ax06 = plt.subplot2grid(layout, (6, 0), rowspan=2)

        # axes (0..2, 0): data until the cut-off `end_`
        draw_window('_', 'Past', (ax00, ax01, ax02), mark_end=False)
        # axes (3..5, 0): data until `end`
        draw_window('', 'Present', (ax03, ax04, ax05), mark_end=True)

        # axes : (6, 0) > raw stock prices until `end`
        for name in self.stock_names:
            x, y = cfg[f'{name}']
            draw(ax06, x, y, f'{name}')
        finish(ax06, '[Present : Origin]', zero_line=False, mark_end=True)

        plt.tight_layout()
        if save:
            plt.savefig('analysis03.png')
        plt.show()
示例#14
0
def plot_decompose(df):
    """Plot the seasonal decomposition of ``df['rate']`` and show the figure."""
    rate_decomposition = sm.seasonal_decompose(df['rate'])
    rate_decomposition.plot()
    plt.show()
示例#15
0
import warnings
warnings.filterwarnings('ignore')

import pandas_datareader as web
import statsmodels.tsa.api as tsa
import pandas as pd
import numpy as np
from numpy.linalg import LinAlgError

import os

# Folder for saved figures, relative to the current working directory.
# (The original called the non-existent `os.geetcwd()` and never imported os.)
p = f'{os.getcwd()}/img/'

industrial_production = web.DataReader('IPGMFN', 'fred', '2000',
                                       '2020-12').squeeze()
print(type(industrial_production.head()))

components = tsa.seasonal_decompose(industrial_production, model='additive')

# One frame holding the original series next to its three components.
ts = industrial_production.to_frame('Original').assign(
    Trend=components.trend).assign(Seasonality=components.seasonal).assign(
        Residual=components.resid)

import matplotlib.pyplot as plt
ts.plot(subplots=True, figsize=(14, 8))
# Save before showing: once show() has handed the figure off, a later
# savefig() typically writes an empty image.
plt.savefig(f'{p}1.png')
plt.show()

# time series stationarity
industrial_production_log = np.log(industrial_production)
industrial_production_log_diff = industrial_production_log.diff(
    12).dropna()  # seasonal differencing => yoy instantaneous returns
import pandas_datareader.data as web
import statsmodels.tsa.api as tsa
ind = web.DataReader('IPGMFN', 'fred', '1988', '2017-12').squeeze()
components = tsa.seasonal_decompose(ind, model='additive')

ts = (ind.to_frame('Original').assign(Trend=components.trend).assign(
    Seasonality=components.seasonal).assign(Residual=components.resid))
ts.plot(subplots=True, figsize=(14, 8))

df = web.DataReader(name='SP500', data_source='fred',
                    start=2009).squeeze().to_frame('close')
spx = web.DataReader('SP500', 'fred', 2009, 2020).squeeze().to_frame('close')
def preprocess_load_data_forec(dataframe,
                               quarter_hour=True,
                               short_term=True,
                               scaler=None,
                               n_ahead=1,
                               calendars=None):
    """Scale, split, de-seasonalize and featurize load data for forecasting.

    Expects a pandas DataFrame with a DatetimeIndex and a ``load`` column
    holding the load in MW without missing values.

    Args:
        dataframe: The load data; divided by 1000 (MW -> GW) before use.
        quarter_hour: True for quarter-hourly data, False for hourly data.
        short_term: Forwarded to ``construct_features``.
        scaler: Fitted scaler to reuse; when None a ``StandardScaler`` is
            fitted on the training part.
        n_ahead: Forwarded to ``construct_features``.
        calendars: Forwarded to ``construct_features``.

    Returns:
        Tuple ``(train_df, test_df, scaler)``.
    """
    # Work in GW for readability; the standard-scaled values are smaller too.
    dataframe = dataframe / 1000

    # Split first so the scaler and the de-seasonalizing only see training data.
    train_df_o, test_df_o = train_test_split(dataframe,
                                             test_size=0.2,
                                             shuffle=False)
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(np.array(train_df_o['load']).reshape(-1, 1))

    def scaled_frame(raw):
        # Single-column frame of scaled loads on the same index as `raw`.
        column = np.array(raw['load']).reshape(-1, 1)
        return pd.DataFrame({'load': scaler.transform(column).squeeze()},
                            index=raw.index)

    train_df = scaled_frame(train_df_o)
    test_df = scaled_frame(test_df_o)

    # Running totals of the seasonal parts removed from each split.
    offset_train = pd.DataFrame(0, index=train_df.index, columns=['load'])
    offset_test = pd.DataFrame(0, index=test_df.index, columns=['load'])

    # Daily and weekly periods, expressed in samples of the data resolution.
    if quarter_hour:
        seasonal_periods = [24 * 4, 24 * 7 * 4]
        freq = '15T'
    else:
        seasonal_periods = [24, 24 * 7]
        freq = 'H'

    for period in seasonal_periods:
        # Decompose the training data and fit Holt-Winters on its seasonal part.
        decomp = seasonal_decompose(train_df, period=period)
        exp = ExponentialSmoothing(decomp.seasonal,
                                   seasonal_periods=period,
                                   seasonal='add',
                                   freq=freq).fit()

        train_pred = exp.predict(start=train_df.index[0],
                                 end=train_df.index[-1])
        test_pred = exp.predict(start=test_df.index[0],
                                end=test_df.index[-1])

        train_df['load'] = train_df['load'] - train_pred
        test_df['load'] = test_df['load'] - test_pred

        offset_train['load'] = offset_train['load'] + train_pred
        offset_test['load'] = offset_test['load'] + test_pred

    # Both splits get features built with the same options.
    feature_options = dict(short_term=short_term,
                           quarter_hour=quarter_hour,
                           n_ahead=n_ahead,
                           calendars=calendars)
    train_df = construct_features(dataframe=train_df,
                                  offset=offset_train,
                                  **feature_options)
    test_df = construct_features(dataframe=test_df,
                                 offset=offset_test,
                                 **feature_options)

    return train_df, test_df, scaler
示例#18
0
def decompose(data):
    """Decompose ``data`` and return each component as a list of records.

    ``ts(data)`` is decomposed once with ``tsa.seasonal_decompose``. Each
    component has its NaNs replaced by 0 and its index replaced by a 0-based
    counter, which is exposed as a column before the conversion to records.

    Args:
        data: Input handed to the ``ts`` helper defined elsewhere.

    Returns:
        Tuple ``(trend, seasonal, resid)`` of lists of dicts.
    """
    # Build the series and decompose it once instead of once per component.
    decomposition = tsa.seasonal_decompose(ts(data))

    def as_records(component):
        # NaN -> 0, positional index as a column, one dict per row.
        return (component.fillna(0)
                .reset_index(drop=True)
                .reset_index()
                .to_dict(orient="records"))

    trend = as_records(decomposition.trend)
    seasonal = as_records(decomposition.seasonal)
    resid = as_records(decomposition.resid)
    return trend, seasonal, resid
示例#19
0
def example_3():
	"""Walk through pandas time-series features and basic time-series modeling.

	Downloads Goldman Sachs ("GS") prices for the indexing, resampling and
	rolling-window demos, then builds a monthly series from flight data,
	fits OLS and SARIMAX models to it and plots forecasts. Sections are
	toggled with the literal ``if True:`` / ``if False:`` switches.
	"""
	import pandas_datareader as pdr

	gs = pdr.data.DataReader("GS", data_source='yahoo', start='2006-01-01', end='2010-01-01')
	print(gs.head().round(2))
	print(gs.loc[pd.Timestamp('2006-01-01'):pd.Timestamp('2006-12-31')].head())
	print(gs.loc['2006'].head())

	#--------------------
	# Resampling.
	if True:
		print(gs.resample("5d").mean().head())
		print(gs.resample("W").agg(['mean', 'sum']).head())

		# You can up-sample to convert to a higher frequency. The new points are filled with NaNs.
		print(gs.resample("6h").mean().head())

	#--------------------
	# Rolling, expanding, exponential weighted (EW).
	if False:
		gs.Close.plot(label='Raw')
		gs.Close.rolling(28).mean().plot(label='28D MA')
		gs.Close.expanding().mean().plot(label='Expanding Average')
		gs.Close.ewm(alpha=0.03).mean().plot(label='EWMA($\\alpha=.03$)')

		plt.legend(bbox_to_anchor=(1.25, .5))
		plt.tight_layout()
		plt.ylabel("Close ($)")
		sns.despine()

		# Each of .rolling, .expanding, and .ewm return a deferred object, similar to a GroupBy.
		roll = gs.Close.rolling(30, center=True)

		m = roll.agg(['mean', 'std'])
		plt.figure()
		ax = m['mean'].plot()
		ax.fill_between(m.index, m['mean'] - m['std'], m['mean'] + m['std'], alpha=.25)
		plt.tight_layout()
		plt.ylabel("Close ($)")
		sns.despine()

	#--------------------
	# Grab bag.
	if False:
		# Offsets.
		#	These are similar to dateutil.relativedelta, but work with arrays.
		print(gs.index + pd.DateOffset(months=3, days=-2))

		# Holiday calendars.
		from pandas.tseries.holiday import USColumbusDay
		print(USColumbusDay.dates('2015-01-01', '2020-01-01'))

		# Timezones.
		# tz-naive -> tz-aware -> converted to UTC.
		print(gs.tz_localize('US/Eastern').tz_convert('UTC').head())

	#--------------------
	# Modeling time series.
	if True:
		from collections import namedtuple
		import statsmodels.formula.api as smf
		import statsmodels.tsa.api as smt
		import statsmodels.api as sm
		from modern_pandas_utils import download_timeseries

		def download_many(start, end):
			# Download one file per month in [start, end].
			months = pd.period_range(start, end=end, freq='M')
			# We could easily parallelize this loop.
			for month in months:
				download_timeseries(month)

		def time_to_datetime(df, columns):
			'''
			Combine all time items into datetimes.
			2014-01-01,1149.0 -> 2014-01-01T11:49:00
			'''
			def converter(col):
				# Raw pattern with regex=True: the trailing ".0" must be
				# stripped as a regular expression, which is not the default
				# behaviour of str.replace in recent pandas.
				timepart = (col.astype(str)
					.str.replace(r'\.0$', '', regex=True)  # NaNs force float dtype
					.str.pad(4, fillchar='0'))
				return pd.to_datetime(df['fl_date'] + ' ' + timepart.str.slice(0, 2) + ':' + timepart.str.slice(2, 4), errors='coerce')
			df[columns] = df[columns].apply(converter)
			return df

		def unzip_one(fp):
			# Extract the first member of the archive and return its path.
			try:
				zf = zipfile.ZipFile(fp)
				csv = zf.extract(zf.filelist[0])
				return csv
			except zipfile.BadZipFile as ex:
				print('zipfile.BadZipFile raised in {}: {}.'.format(fp, ex))
				raise

		def read_one(fp):
			# Load one CSV, normalize column names and parse the time columns.
			df = (pd.read_csv(fp, encoding='latin1')
				.rename(columns=str.lower)
				.drop('unnamed: 6', axis=1)
				.pipe(time_to_datetime, ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time'])
				.assign(fl_date=lambda x: pd.to_datetime(x['fl_date'])))
			return df

		store = './modern_pandas_data/ts.hdf5'

		# Build the HDF5 cache on first use; afterwards read from it.
		if not os.path.exists(store):
			download_many('2000-01-01', '2016-01-01')

			zips = glob.glob(os.path.join('modern_pandas_data', 'timeseries', '*.zip'))
			csvs = [unzip_one(fp) for fp in zips]
			dfs = [read_one(fp) for fp in csvs]
			df = pd.concat(dfs, ignore_index=True)

			df['origin'] = df['origin'].astype('category')
			df.to_hdf(store, 'ts', format='table')
		else:
			df = pd.read_hdf(store, 'ts')

		with pd.option_context('display.max_rows', 100):
			print(df.dtypes)

		# Flights per day, then the monthly mean of those daily counts.
		daily = df.fl_date.value_counts().sort_index()
		y = daily.resample('MS').mean()
		print(y.head())

		ax = y.plot()
		ax.set(ylabel='Average Monthly Flights')
		sns.despine()

		# y together with its first five lags (columns y, L1..L5).
		X = (pd.concat([y.shift(i) for i in range(6)], axis=1, keys=['y'] + ['L%s' % i for i in range(1, 6)]).dropna())
		print(X.head())

		mod_lagged = smf.ols('y ~ trend + L1 + L2 + L3 + L4 + L5', data=X.assign(trend=np.arange(len(X))))
		res_lagged = mod_lagged.fit()
		res_lagged.summary()

		sns.heatmap(X.corr())

		ax = res_lagged.params.drop(['Intercept', 'trend']).plot.bar(rot=0)
		plt.ylabel('Coefficeint')
		sns.despine()

		# Autocorrelation.
		# 'Results.resid' is a series of residuals: y - ŷ.
		mod_trend = sm.OLS.from_formula('y ~ trend', data=y.to_frame(name='y').assign(trend=np.arange(len(y))))
		res_trend = mod_trend.fit()

		def tsplot(y, lags=None, figsize=(10, 8)):
			# Series on top, ACF and PACF side by side underneath.
			plt.figure(figsize=figsize)
			layout = (2, 2)
			ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
			acf_ax = plt.subplot2grid(layout, (1, 0))
			pacf_ax = plt.subplot2grid(layout, (1, 1))

			y.plot(ax=ts_ax)
			smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
			smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
			# Start both correlation plots at x = 1.5.
			for corr_ax in (acf_ax, pacf_ax):
				corr_ax.set_xlim(1.5)
			sns.despine()
			plt.tight_layout()
			return ts_ax, acf_ax, pacf_ax

		tsplot(res_trend.resid, lags=36)

		y.to_frame(name='y').assign(Δy=lambda x: x.y.diff()).plot(subplots=True)
		sns.despine()

		ADF = namedtuple("ADF", "adf pvalue usedlag nobs critical icbest")

		#ADF(*smt.adfuller(y))._asdict()
		ADF(*smt.adfuller(y.dropna()))._asdict()
		ADF(*smt.adfuller(y.diff().dropna()))._asdict()

		# Regress the first difference on its own lag.
		data = (y.to_frame(name='y').assign(Δy=lambda df: df.y.diff()).assign(LΔy=lambda df: df.Δy.shift()))
		mod_stationary = smf.ols('Δy ~ LΔy', data=data.dropna())
		res_stationary = mod_stationary.fit()

		tsplot(res_stationary.resid, lags=24)

		# Seasonality.
		#smt.seasonal_decompose(y).plot()
		# Forward-fill gaps first; ffill() replaces the deprecated
		# fillna(method='ffill') spelling.
		smt.seasonal_decompose(y.ffill()).plot()

		# ARIMA.
		mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1))
		res = mod.fit()
		tsplot(res.resid[2:], lags=24)

		res.summary()

		mod_seasonal = smt.SARIMAX(y, trend='c', order=(1, 1, 2), seasonal_order=(0, 1, 2, 12), simple_differencing=False)
		res_seasonal = mod_seasonal.fit()

		res_seasonal.summary()

		tsplot(res_seasonal.resid[12:], lags=24)

		# Forecasting.
		pred = res_seasonal.get_prediction(start='2001-03-01')
		pred_ci = pred.conf_int()

		plt.figure()
		ax = y.plot(label='observed')
		pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)
		ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=.2)
		ax.set_ylabel("Monthly Flights")
		plt.legend()
		sns.despine()

		# Dynamic prediction from 2013-01-01 onwards.
		pred_dy = res_seasonal.get_prediction(start='2002-03-01', dynamic='2013-01-01')
		pred_dy_ci = pred_dy.conf_int()

		plt.figure()
		ax = y.plot(label='observed')
		pred_dy.predicted_mean.plot(ax=ax, label='Forecast')
		ax.fill_between(pred_dy_ci.index, pred_dy_ci.iloc[:, 0], pred_dy_ci.iloc[:, 1], color='k', alpha=.25)
		ax.set_ylabel("Monthly Flights")

		# Highlight the forecast area.
		ax.fill_betweenx(ax.get_ylim(), pd.Timestamp('2013-01-01'), y.index[-1], alpha=.1, zorder=-1)
		ax.annotate('Dynamic $\\longrightarrow$', (pd.Timestamp('2013-02-01'), 550))

		plt.legend()
		sns.despine()

	plt.show()