def adf_test(x):
    """Run an Augmented Dickey-Fuller test on ``x`` and tabulate the outcome.

    The lag order is selected automatically by ``adfuller`` using the Akaike
    Information Criterion (``autolag='AIC'``).

    Returns a ``pd.Series`` with the test statistic, p-value, number of lags
    and number of observations, followed by one ``Critical Value(<level>)``
    entry for every critical value reported by the test.
    """
    outcome = smt.adfuller(x, autolag='AIC')
    summary = pd.Series(
        outcome[:4],
        index=['Test Statistic', 'p-value', '# of lags', '# of observations'],
    )
    for level, threshold in outcome[4].items():
        summary[f'Critical Value({level})'] = threshold
    return summary
def cointegrated(all_pairs, X_train):
    """Return the symbol pairs whose hedged spread passes an ADF test.

    Parameters
    ----------
    all_pairs : iterable of groups, each group being an iterable of
        two-element pairs ``(x_symbol, y_symbol)``.
    X_train : mapping/DataFrame indexed by ``str(symbol)`` that yields the
        training series of each symbol.

    Returns
    -------
    list of ``[x_symbol, y_symbol]`` lists, one per pair whose spread
    ``y - slope * x`` has an ADF statistic below the 1%, 5% or 10% critical
    value. The strictest level that is met is the one that gets reported.
    """
    # ADF critical-value key -> confidence label, strictest first.
    levels = (('1%', '99%'), ('5%', '95%'), ('10%', '90%'))
    found = []
    for group in all_pairs:
        for pair in group:
            x_prices = X_train[str(pair[0])]
            y_prices = X_train[str(pair[1])]
            # scipy's signature is linregress(x, y): regress Y on X so that
            # the slope is the hedge ratio used in the spread below.
            slope = linregress(x_prices, y_prices)[0]
            spread = y_prices - slope * x_prices
            # Second positional argument of adfuller is maxlag.
            cadf = adfuller(spread, 1)
            for level, confidence in levels:
                if cadf[0] < cadf[4][level]:
                    print(f'Pair Cointegrated at {confidence} Confidence Interval')
                    # Store the X and Y symbols as one two-element entry.
                    found.append([pair[0], pair[1]])
                    break
            else:
                print('Pair Not Cointegrated ')
    return found
def test_stationarity(timeseries, maxlag=2, regression='c', autolag=None,
                      window=None, plot=False, verbose=False):
    """Run an Augmented Dickey-Fuller unit-root test on a time series.

    A ``pd.DataFrame`` input is flattened with ``.values.ravel()`` before the
    test; any other input is handed to ``adfuller`` unchanged.

    Null hypothesis: the series has a unit root (is non-stationary).
    If p >= alpha (0.05) the null cannot be rejected and the series is
    reported as non-stationary; if p < alpha it is reported as stationary.

    Original source: http://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
    Function: http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html

    Parameters
    ----------
    timeseries : array-like, ``pd.Series`` or ``pd.DataFrame``.
    maxlag, regression, autolag : forwarded to ``adfuller``; a ``None``
        regression falls back to ``'c'``.
    window : rolling window used only for the optional plot (default 4).
    plot : when true, plot the series with its rolling mean and std.
    verbose : when true, print the parameters and the result table.

    Returns
    -------
    ``pd.Series`` with the test results, or ``None`` if the test raised.
    """
    source = timeseries
    if isinstance(timeseries, pd.DataFrame):
        print('modifying time series dataframe into an array to test')
        timeseries = timeseries.values.ravel()
    if regression is None:
        regression = 'c'
    if verbose:
        print('Running Augmented Dickey-Fuller test with paramters:')
        print('maxlag: {}'.format(maxlag))
        print('regression: {}'.format(regression))
        print('autolag: {}'.format(autolag))
    alpha = 0.05
    if plot:
        if window is None:
            window = 4
        # Rolling statistics need a pandas object; the flattened ndarray used
        # for the test has no ``rolling`` method.
        if isinstance(source, pd.Series):
            plot_data = source
        elif isinstance(source, pd.DataFrame) and source.shape[1] == 1:
            plot_data = source.iloc[:, 0]
        else:
            plot_data = pd.Series(timeseries)
        rolmean = plot_data.rolling(window=window, center=False).mean()
        rolstd = plot_data.rolling(window=window, center=False).std()
        plt.plot(plot_data, color='blue', label='Original')
        plt.plot(rolmean, color='red',
                 label='Rolling Mean ({})'.format(window))
        plt.plot(rolstd, color='black',
                 label='Rolling Std ({})'.format(window))
        plt.legend(loc='best')
        plt.title('Rolling Mean & Standard Deviation')
        plt.show(block=False)
    # Perform Augmented Dickey-Fuller test:
    try:
        dftest = smt.adfuller(timeseries, maxlag=maxlag,
                              regression=regression, autolag=autolag)
        dfoutput = pd.Series(dftest[0:4], index=[
            'Test Statistic',
            'p-value',
            '#Lags Used',
            'Number of Observations Used',
        ])
        for key, value in dftest[4].items():
            dfoutput['Critical Value (%s)' % key] = value
        if verbose:
            print('Results of Augmented Dickey-Fuller Test:')
            print(dfoutput)
        if dftest[1] >= alpha:
            print(' this series is non-stationary')
        else:
            print(' this series is stationary')
        return dfoutput
    except Exception as exc:
        # Best-effort contract: report the failure and return None, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        print('Augment Dickey-Fuller test gives an error')
        if verbose:
            print(' {}: {}'.format(type(exc).__name__, exc))
        return
def ARIMA_GARCH(self, data, dataType, dataSize):
    """Run the external ARMA-GARCH R script and load its forecasts.

    Steps:
      1. ADF test on ``data[0]``; the result is printed for information.
      2. ``./arima_garch.r`` is launched through ``Rscript`` with
         ``dataType``, ``dataSize``, ``len(data)`` and the differencing flag.
      3. ``forecasts_mean_<dataType>_<dataSize>.csv`` and
         ``forecasts_variance_<dataType>_<dataSize>.csv`` are read back.
      4. Zero means (from the second column on) are replaced by the previous
         column's value; NaN variances are replaced by the previous column's
         value, or by 1 in the first column.

    Returns
    -------
    (forecasts_mean, forecasts_variance) : the two corrected arrays.
    """
    # Performs an Augmented Dickey-Fuller test to check for stationarity.
    result = smt.adfuller(data[0])
    pvalue = result[1]
    if pvalue < 0.2:
        print('p-value = ' + str(pvalue) + ' The series is likely stationary.')
    else:
        print('p-value = ' + str(pvalue) +
              ' The series is likely NON-stationary.')
    # Differencing is switched off for both outcomes (the "Once" setting was
    # disabled in the original code), so the flag is constant.
    differentiation = "None"

    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))

    # The R program uses the rugarch package to fit the ARMA-GARCH model.
    nr_of_series = len(data)
    print("Starting subprocess")
    return_code = subprocess.call([
        "Rscript", "--vanilla", "./arima_garch.r",
        str(dataType), str(dataSize), str(nr_of_series), differentiation
    ])
    if return_code != 0:
        # The CSV files read below may be missing or stale in this case.
        print("Warning: arima_garch.r exited with status " + str(return_code))
    print("Ended subprocess")

    # Load the results written by the R program. str() keeps the file names
    # consistent with the arguments passed to the script above.
    suffix = str(dataType) + "_" + str(dataSize) + '.csv'
    forecasts_mean = genfromtxt('forecasts_mean_' + suffix,
                                skip_header=0, delimiter=' ', dtype=float)
    forecasts_variance = genfromtxt('forecasts_variance_' + suffix,
                                    skip_header=0, delimiter=' ', dtype=float)

    # Correcting for NaN's and 0's. Columns are visited left to right because
    # each replacement reads the (possibly already corrected) previous column.
    for mean_row, variance_row in zip(forecasts_mean, forecasts_variance):
        for col in range(len(mean_row)):
            if mean_row[col] == 0 and col > 0:
                mean_row[col] = mean_row[col - 1]
            if np.isnan(variance_row[col]):
                variance_row[col] = variance_row[col - 1] if col > 0 else 1
    return forecasts_mean, forecasts_variance
def analysis(self):
    """Fit the OLS model and compute residual / multicollinearity diagnostics.

    Source: http://necochan.com/2014/06/07/python-for-economist-6/

    The formula ``<criterion_column> ~ <explanatory columns joined by +>`` is
    fitted on ``self.data`` and stored in ``self.rm``; its summary is printed.
    The diagnostics are kept on the instance instead of being discarded:

    * ``self.adf_resid`` - ADF test on the residuals without constant/trend
      (H0: non-stationary).
    * ``self.acf_resid`` - autocorrelation function of the residuals.
    * ``self.vif`` - DataFrame of variance inflation factors for every
      regressor except the first exog column (VIF > 10 deserves attention).
    """
    eq = self.criterion_column + "~" + "+".join(self.explanatory_columns)
    self.rm = smf.ols(formula=eq, data=self.data).fit()
    print(self.rm.summary())

    resid = self.rm.resid
    # ADF test, H0: Non-stationary. Current statsmodels spells the
    # "no constant, no trend" option 'n'; older releases only know 'nc'.
    try:
        self.adf_resid = tsa.adfuller(resid, regression='n')
    except ValueError:
        self.adf_resid = tsa.adfuller(resid, regression='nc')

    autocorrelation_plot(resid)  # Show ACF of residuals
    self.acf_resid = tsa.acf(resid)  # Keep ACF of residuals

    # Checking multicollinearity by VIF; column 0 of exog is skipped to match
    # the exog_names[1:] index.
    exog = self.rm.model.exog
    self.vif = pd.DataFrame(
        [oti.variance_inflation_factor(exog, i)
         for i in range(1, exog.shape[1])],
        index=self.rm.model.exog_names[1:],
        columns=['VIF'],
    )
def has_unit_root(X, debug=True):
    """Return whether the ADF test fails to reject a unit root in ``X``.

    Null hypothesis: ``X`` has a unit root (is not stationary, might be trend
    stationary). The ADF test is one-sided: the null is rejected only when
    the statistic is more negative than the critical value. A positive
    statistic therefore never rejects, however large its magnitude, which a
    comparison of absolute values gets wrong.

    Parameters
    ----------
    X : series passed to ``adfuller``.
    debug : when true, print the statistic, p-value, lag and critical values.

    Returns
    -------
    True when the statistic is above the 5% critical value, i.e. the null
    hypothesis cannot be rejected.
    """
    adf_stat, p_value, used_lag, _nobs, critical_values, _icbest = \
        smt.adfuller(X)
    if debug:
        print(
            '-' * 10,
            ' ADF ',
            '-' * 10,
        )
        print(f'{adf_stat}, {p_value}, {used_lag}, {critical_values}')
    return adf_stat > critical_values['5%']
def unit_root(ser):
    """Print ADF unit-root verdicts for ``ser`` against tabulated thresholds.

    The ADF statistic of ``ser`` (``maxlag=40``) is compared with each row of
    a hard-coded critical-value table at the 1%, 5% and 10% levels. Rows are
    reported as "formula" 3, 2 and 1 (``3 - row index``).
    NOTE(review): the rows presumably correspond to the constant+trend,
    constant-only and no-constant Dickey-Fuller specifications - confirm.

    A statistic above the threshold means the unit-root null hypothesis
    cannot be rejected (series not stationary); otherwise it is rejected.
    Finally the critical values returned by ``adfuller`` are printed.
    """
    t_boundary = [
        {'1%': -3.96, '5%': -3.41, '10%': -3.12},
        {'1%': -3.43, '5%': -2.86, '10%': -2.57},
        {'1%': -2.58, '5%': -1.95, '10%': -1.61},
    ]
    # For reference, Student-t critical values (n = infinity):
    #   1%  -2.33
    #   5%  -1.65
    #   10% -1.28
    result = adfuller(ser, maxlag=40, store=False, regresults=False)
    # The verdict depends on the test statistic, not on the critical values
    # that adfuller returns alongside it.
    statistic = result[0]
    significance = (('1%', '0.01'), ('5%', '0.05'), ('10%', '0.10'))
    for i, row in enumerate(t_boundary):
        for key, level in significance:
            if statistic > row[key]:
                print("计算式%s,%s显著水平下t>临界值%s,不能拒绝原假设,存在单位根,时间序列数据不平稳"
                      % (3 - i, level, row[key]))
            else:
                print("计算式%s,%s显著水平下t<临界值%s,拒绝原假设,不存在单位根,时间序列数据平稳"
                      % (3 - i, level, row[key]))
    print(result[4])
def __adf(self, residuals: array):
    '''
    Run the ADF test on ``residuals`` and return ``(statistic, critical)``.

    ``critical`` is the dictionary of critical values, in the form:
    {'1%': -3.4304385694773387,
     '5%': -2.8615791461685034,
     '10%': -2.566790836162312}
    '''
    outcome = adfuller(residuals)
    statistic: float = outcome[0]
    thresholds: Dict[str, float] = outcome[4]
    return statistic, thresholds
def test_for_stationarity(y):
    """Run an ADF test on ``y`` (AIC lag selection), report it and return it.

    The first four results and every critical value are rounded to six
    decimals and written through ``cout``; the verdict from
    ``is_stationary`` is written as well.

    Returns the raw tuple produced by ``adfuller``.
    """
    cout("Results of Augmented Dickey-Fuller test:")
    dftest = smt.adfuller(y, autolag='AIC')
    # Build a concrete list: a one-shot map iterator combined with an
    # explicit index relies on pandas materialising it.
    rounded = [round(value, 6) for value in dftest[0:4]]
    dfoutput = pd.Series(
        rounded,
        index=['test statistic', 'p-value', '# of lags', '# of observations'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value ({})'.format(key)] = round(value, 6)
    cout(dfoutput)
    is_stat, desc = is_stationary(dftest)
    verdict = "TRUE " if is_stat else "false"
    cout(" Stationary? {0} '{1}'".format(verdict, desc))
    return dftest
def stationary(TS):
    """Print the Augmented Dickey-Fuller test results for ``TS``.

    Null hypothesis (H0): the series has a unit root, i.e. it is
    non-stationary and has some time-dependent structure. A p-value above
    the usual 0.05 significance level means H0 cannot be rejected.

    Alternative hypothesis (H1): the series has no unit root, i.e. it is
    stationary. A p-value of at most 0.05 rejects H0 in favour of H1.

    The statistic, the p-value and every critical value are printed;
    nothing is returned.
    """
    outcome = smt.adfuller(TS)
    report = [
        f'[ADF Statistic] : {outcome[0]}',
        f'[p-value] : {outcome[1]}',
    ]
    report.extend(
        f'[Critical Values {level} ] : {threshold}'
        for level, threshold in outcome[4].items()
    )
    for line in report:
        print(line)
def check_each_var_for_stationarity(time_df, autolag, verbose=0):
    """ADF-test every column of ``time_df`` and report whether all pass.

    Each column is tested with ``adfuller(..., autolag=autolag)`` at a 0.05
    significance level. With ``verbose >= 2`` a per-column summary is
    printed.

    Returns 1 when every column has a p-value below 0.05, otherwise 0.
    """
    alpha = 0.05
    all_stationary = 1
    for column in time_df.columns.tolist():
        adf = smt.adfuller(time_df[column].values, autolag=autolag)
        if verbose >= 2:
            # The printed summary works on values rounded to 4 decimals.
            statistic = round(adf[0], 4)
            p_value = round(adf[1], 4)
            n_lags = round(adf[2], 4)
            print(f' Augmented Dickey-Fuller Test on "{column}"', "\n ",
                  '-' * 47)
            print(f' Null Hypothesis: Data has unit root. Non-Stationary.')
            print(f' Significance Level = {alpha}')
            print(f' Test Statistic = {statistic}')
            print(f' No. Lags Chosen = {n_lags}')
            for level, threshold in adf[4].items():
                print(f' Critical value {adjust(level)} = {round(threshold, 3)}')
            if p_value <= alpha:
                print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
                print(f" => Series is Stationary.")
            else:
                print(
                    f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis."
                )
                print(f" => Series is Non-Stationary.")
        # A single failing column clears the flag for good.
        if not adf[1] < alpha:
            all_stationary = 0
    return all_stationary
def run(self):
    """Fit a model on the daily close series and print a forecast.

    Returns ``None`` immediately when ``self._args`` is empty. Otherwise the
    data from ``self._data_service`` is back-filled, its ``close`` column is
    reindexed onto a daily range between ``date_from`` and ``date_to`` and
    back-filled again. The series without its last ``days_to_predict``
    points is ADF-tested: a p-value above 0.01 selects order ``(1, 2, 1)``,
    otherwise ``(1, 0, 1)``. The model is fitted on the full series, its
    summary and the forecast for ``days_to_predict`` steps are printed.

    Returns the string ``'object'``.
    """
    if not self._args:
        return None
    data = self._data_service.get_data(self._args)
    # .bfill() replaces the deprecated fillna(method='bfill').
    original_data = data.bfill()
    ran = pd.date_range(self._args['date_from'],
                        self._args['date_to'],
                        freq='D')
    original_data = pd.Series(original_data['close'], index=ran).bfill()
    horizon = int(self._args['days_to_predict'])
    # Only the part before the forecast horizon is used for the ADF test.
    train_data = original_data[:len(original_data) - horizon]

    ADF = namedtuple('ADF', 'adf pvalue usedlag nobs critical icbest')
    stationarity_results = ADF(*smt.adfuller(train_data))._asdict()
    significance_level = 0.01
    # if the series are stationary, no need for an integrated order
    order = (1, 0, 1)
    if stationarity_results['pvalue'] > significance_level:
        order = (1, 2, 1)

    # NOTE: the model is fitted on the full series, hold-out included.
    result = self._model_fit(original_data, order)
    print(result.summary())
    forecast = result.forecast(steps=horizon)[0]
    print(forecast)
    return 'object'
def ad_fuller_test(timeseries: pd.Series):
    """Run the Augmented Dickey-Fuller test and return its headline numbers.

    Documentation:
    https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.adfuller.html#statsmodels.tsa.stattools.adfuller

    The test checks for a unit root in a univariate process in the presence
    of serial correlation.
    Null hypothesis: there is a unit root.
    Alternative hypothesis: there is no unit root, in other words the
    process is stationary.

    A series with a unit root shows no regression to the mean, whereas a
    stationary process does regress to the mean.

    Returns an ``AdFullerResult(statistic, pvalue)`` named tuple.
    """
    AdFullerResult = namedtuple('AdFullerResult', 'statistic pvalue')
    statistic, pvalue = adfuller(timeseries)[:2]
    return AdFullerResult(statistic, pvalue)
def adf_stationary_test(df: pd.DataFrame,
                        alpha: float = 0.05,
                        criterion: str = 'AIC') -> bool:
    """
    Decide whether ``df`` is stationary with the Augmented Dickey-Fuller
    (ADF) test from statsmodels, printing the result table on the way.
    Source: https://www.insightsbot.com/augmented-dickey-fuller-test-in-python/

    Parameters
    ----------
    df : The pd.DataFrame to test for stationarity.
         Currently must be univariate.
    alpha : Significance level, i.e. (1 - confidence level).
         The default is 0.05 for 95% confidence.
    criterion : Information criterion used to pick the lag automatically.
         The default is 'AIC' (Akaike information criterion).

    Returns
    -------
    stationary : True when the p-value is below ``alpha``, else False.
    """
    outcome = adfuller(df, autolag=criterion)
    labels = [
        ' ADF Test Statistic', ' P-Value', ' # Lags Used',
        ' # Observations Used'
    ]
    report = pd.Series(outcome[0:4], index=labels)
    # Append the critical values to the printed table.
    for level, threshold in outcome[4].items():
        report[f' Critical Value ({level})'] = threshold
    print(" - Augmented Dickey-Fuller Test Results:\n")
    print(report.to_string() + "\n")
    return bool(outcome[1] < alpha)
def get_cointegrated(all_pairs, training_df):
    """Return the symbol pairs whose hedged spread passes an ADF test.

    Parameters
    ----------
    all_pairs : iterable of two-element pairs ``(x_symbol, y_symbol)``.
    training_df : DataFrame/mapping indexed by ``str(symbol)``.

    Returns
    -------
    list of ``[x_symbol, y_symbol]`` lists for every pair whose spread
    ``y - slope * x`` has an ADF statistic below the 1%, 5% or 10% critical
    value. Pairs with a missing symbol, or whose test raises, are reported
    and skipped.
    """
    # ADF critical-value key -> confidence label, strictest first.
    levels = (('1%', '99%'), ('5%', '95%'), ('10%', '90%'))
    found = []
    for pair in all_pairs:
        try:
            x_prices = training_df[str(pair[0])]
            y_prices = training_df[str(pair[1])]
        except KeyError:
            print('Exception: Symbol not in Dataframe')
            continue
        try:
            # scipy's signature is linregress(x, y): regress Y on X so that
            # the slope is the hedge ratio used in the spread below.
            slope = linregress(x_prices, y_prices)[0]
            spread = y_prices - slope * x_prices
            cadf = adfuller(spread, 1)
        except Exception as exc:
            # Keep the best-effort behaviour (skip the pair) without blaming
            # a missing symbol for unrelated failures.
            print('Exception: {}: {}'.format(type(exc).__name__, exc))
            continue
        for level, confidence in levels:
            if cadf[0] < cadf[4][level]:
                print(f'Pair Cointegrated at {confidence} Confidence Interval')
                found.append([pair[0], pair[1]])
                break
        else:
            print('Pair Not Cointegrated ')
    return found
def example_3():
    """Demonstrate pandas time-series tooling and statsmodels modelling.

    Sections (toggled with the literal ``if True`` / ``if False`` switches):
    resampling of downloaded GS prices; rolling/expanding/EW statistics;
    offsets, holidays and time zones; and a modelling walkthrough on monthly
    flight counts (lagged OLS, ADF tests, seasonal decomposition, SARIMAX
    fitting and forecasting). Results are printed or plotted; nothing is
    returned.
    """
    import pandas_datareader as pdr

    gs = pdr.data.DataReader("GS", data_source='yahoo',
                             start='2006-01-01', end='2010-01-01')
    print(gs.head().round(2))

    print(gs.loc[pd.Timestamp('2006-01-01'):pd.Timestamp('2006-12-31')].head())
    print(gs.loc['2006'].head())

    #--------------------
    # Resampling.
    if True:
        print(gs.resample("5d").mean().head())
        print(gs.resample("W").agg(['mean', 'sum']).head())

        # You can up-sample to convert to a higher frequency.
        # The new points are filled with NaNs.
        print(gs.resample("6h").mean().head())

    #--------------------
    # Rolling, expanding, exponential weighted (EW).
    if False:
        gs.Close.plot(label='Raw')
        gs.Close.rolling(28).mean().plot(label='28D MA')
        gs.Close.expanding().mean().plot(label='Expanding Average')
        gs.Close.ewm(alpha=0.03).mean().plot(label='EWMA($\\alpha=.03$)')

        plt.legend(bbox_to_anchor=(1.25, .5))
        plt.tight_layout()
        plt.ylabel("Close ($)")
        sns.despine()

        # Each of .rolling, .expanding, and .ewm return a deferred object,
        # similar to a GroupBy.
        roll = gs.Close.rolling(30, center=True)

        m = roll.agg(['mean', 'std'])
        plt.figure()
        ax = m['mean'].plot()
        ax.fill_between(m.index, m['mean'] - m['std'], m['mean'] + m['std'],
                        alpha=.25)
        plt.tight_layout()
        plt.ylabel("Close ($)")
        sns.despine()

    #--------------------
    # Grab bag.
    if False:
        # Offsets.
        # These are similar to dateutil.relativedelta, but work with arrays.
        print(gs.index + pd.DateOffset(months=3, days=-2))

        # Holiday calendars.
        from pandas.tseries.holiday import USColumbusDay
        print(USColumbusDay.dates('2015-01-01', '2020-01-01'))

        # Timezones.
        # tz naive -> tz aware -> converted to UTC.
        print(gs.tz_localize('US/Eastern').tz_convert('UTC').head())

    #--------------------
    # Modeling time series.
    if True:
        from collections import namedtuple
        import statsmodels.formula.api as smf
        import statsmodels.tsa.api as smt
        import statsmodels.api as sm
        from modern_pandas_utils import download_timeseries

        def download_many(start, end):
            months = pd.period_range(start, end=end, freq='M')
            # We could easily parallelize this loop.
            for month in months:
                download_timeseries(month)

        def time_to_datetime(df, columns):
            '''
            Combine all time items into datetimes.

            2014-01-01,1149.0 -> 2014-01-01T11:49:00
            '''
            def converter(col):
                # NaNs force a float dtype, hence the trailing ".0" to strip.
                # regex=True is required on pandas versions whose str.replace
                # defaults to literal matching.
                timepart = (col.astype(str)
                            .str.replace(r'\.0$', '', regex=True)
                            .str.pad(4, fillchar='0'))
                return pd.to_datetime(df['fl_date'] + ' ' +
                                      timepart.str.slice(0, 2) + ':' +
                                      timepart.str.slice(2, 4),
                                      errors='coerce')

            df[columns] = df[columns].apply(converter)
            return df

        def unzip_one(fp):
            try:
                # The context manager closes the archive after extraction.
                with zipfile.ZipFile(fp) as zf:
                    return zf.extract(zf.filelist[0])
            except zipfile.BadZipFile as ex:
                print('zipfile.BadZipFile raised in {}: {}.'.format(fp, ex))
                raise

        def read_one(fp):
            df = (pd.read_csv(fp, encoding='latin1')
                  .rename(columns=str.lower)
                  .drop('unnamed: 6', axis=1)
                  .pipe(time_to_datetime,
                        ['dep_time', 'arr_time', 'crs_arr_time',
                         'crs_dep_time'])
                  .assign(fl_date=lambda x: pd.to_datetime(x['fl_date'])))
            return df

        store = './modern_pandas_data/ts.hdf5'

        if not os.path.exists(store):
            download_many('2000-01-01', '2016-01-01')

            zips = glob.glob(os.path.join('modern_pandas_data', 'timeseries',
                                          '*.zip'))
            csvs = [unzip_one(fp) for fp in zips]
            dfs = [read_one(fp) for fp in csvs]
            df = pd.concat(dfs, ignore_index=True)

            df['origin'] = df['origin'].astype('category')
            df.to_hdf(store, key='ts', format='table')
        else:
            df = pd.read_hdf(store, 'ts')

        with pd.option_context('display.max_rows', 100):
            print(df.dtypes)

        daily = df.fl_date.value_counts().sort_index()
        y = daily.resample('MS').mean()
        print(y.head())

        ax = y.plot()
        ax.set(ylabel='Average Monthly Flights')
        sns.despine()

        X = (pd.concat([y.shift(i) for i in range(6)], axis=1,
                       keys=['y'] + ['L%s' % i for i in range(1, 6)])
             .dropna())
        print(X.head())

        mod_lagged = smf.ols('y ~ trend + L1 + L2 + L3 + L4 + L5',
                             data=X.assign(trend=np.arange(len(X))))
        res_lagged = mod_lagged.fit()
        res_lagged.summary()

        sns.heatmap(X.corr())

        ax = res_lagged.params.drop(['Intercept', 'trend']).plot.bar(rot=0)
        plt.ylabel('Coefficeint')
        sns.despine()

        # Autocorrelation.
        # 'Results.resid' is a series of residuals: y - ŷ.
        mod_trend = sm.OLS.from_formula(
            'y ~ trend',
            data=y.to_frame(name='y').assign(trend=np.arange(len(y))))
        res_trend = mod_trend.fit()

        def tsplot(y, lags=None, figsize=(10, 8)):
            plt.figure(figsize=figsize)
            layout = (2, 2)
            ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
            acf_ax = plt.subplot2grid(layout, (1, 0))
            pacf_ax = plt.subplot2grid(layout, (1, 1))

            y.plot(ax=ts_ax)
            smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
            smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
            for lag_ax in (acf_ax, pacf_ax):
                lag_ax.set_xlim(1.5)
            sns.despine()
            plt.tight_layout()
            return ts_ax, acf_ax, pacf_ax

        tsplot(res_trend.resid, lags=36)

        y.to_frame(name='y').assign(Δy=lambda x: x.y.diff()).plot(subplots=True)
        sns.despine()

        ADF = namedtuple("ADF", "adf pvalue usedlag nobs critical icbest")

        # The two results below are not printed or stored.
        ADF(*smt.adfuller(y.dropna()))._asdict()
        ADF(*smt.adfuller(y.diff().dropna()))._asdict()

        data = (y.to_frame(name='y')
                .assign(Δy=lambda frame: frame.y.diff())
                .assign(LΔy=lambda frame: frame.Δy.shift()))
        mod_stationary = smf.ols('Δy ~ LΔy', data=data.dropna())
        res_stationary = mod_stationary.fit()

        tsplot(res_stationary.resid, lags=24)

        # Seasonality.
        # .ffill() replaces the deprecated fillna(method='ffill').
        smt.seasonal_decompose(y.ffill()).plot()

        # ARIMA.
        mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1))
        res = mod.fit()
        tsplot(res.resid[2:], lags=24)

        res.summary()

        mod_seasonal = smt.SARIMAX(y, trend='c', order=(1, 1, 2),
                                   seasonal_order=(0, 1, 2, 12),
                                   simple_differencing=False)
        res_seasonal = mod_seasonal.fit()

        res_seasonal.summary()

        tsplot(res_seasonal.resid[12:], lags=24)

        # Forecasting.
        pred = res_seasonal.get_prediction(start='2001-03-01')
        pred_ci = pred.conf_int()

        plt.figure()
        ax = y.plot(label='observed')
        pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)
        ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1],
                        color='k', alpha=.2)
        ax.set_ylabel("Monthly Flights")
        plt.legend()
        sns.despine()

        pred_dy = res_seasonal.get_prediction(start='2002-03-01',
                                              dynamic='2013-01-01')
        pred_dy_ci = pred_dy.conf_int()

        plt.figure()
        ax = y.plot(label='observed')
        pred_dy.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_dy_ci.index, pred_dy_ci.iloc[:, 0],
                        pred_dy_ci.iloc[:, 1], color='k', alpha=.25)
        ax.set_ylabel("Monthly Flights")

        # Highlight the forecast area.
        ax.fill_betweenx(ax.get_ylim(), pd.Timestamp('2013-01-01'),
                         y.index[-1], alpha=.1, zorder=-1)
        ax.annotate('Dynamic $\\longrightarrow$',
                    (pd.Timestamp('2013-02-01'), 550))

        plt.legend()
        sns.despine()

    plt.show()
def test_stationarity(timeseries):
    """Plot rolling statistics of ``timeseries`` and print its ADF test.

    The series is drawn together with its 12-period rolling mean and rolling
    standard deviation; afterwards the Dickey-Fuller results (AIC lag
    selection) are printed as a table including the critical values.
    """
    # Rolling statistics over a 12-observation window.
    window = timeseries.rolling(window=12, center=False)
    rolling_mean = window.mean()
    rolling_std = window.std()

    curves = (
        (timeseries, 'blue', 'Original'),
        (rolling_mean, 'red', 'Rolling Mean'),
        (rolling_std, 'black', 'Rolling Std'),
    )
    for curve, colour, caption in curves:
        plt.plot(curve, color=colour, label=caption)
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test.
    print('Results of Dickey-Fuller Test:')
    outcome = smt.adfuller(timeseries, autolag='AIC')
    table = pd.Series(
        outcome[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used',
               'Number of Observations Used'])
    for level, threshold in outcome[4].items():
        table['Critical Value (%s)' % level] = threshold
    print(table)


# Example session:
#
# >>> test_stationarity(ts)
# (variation in standard deviation is small, mean is clearly increasing
#  with time; p-value > 0.05, hence H0 (= TS is non-stationary) is not
#  rejected)
#
# Results of Dickey-Fuller Test:
# Test Statistic                    0.411488
# p-value                           0.981920
# #Lags Used                       14.000000
# Number of Observations Used    4370.000000
# Critical Value (10%)             -2.567122
# Interactive exploration left over from a REPL session (kept for reference):
#   plt.subplot
#   help(plt.subplot)
ax1 = plt.subplot()
# NOTE(review): the original continued with an unfinished `ax2 = ax1.`
# statement (a syntax error); presumably a twin axis was intended - confirm.

# Align the log-scale observations with the predictions and compute residuals.
df_merge = pd.merge(df_log, pred_df, left_index=True, right_index=True)
df_merge.columns = ["y", "yhat"]
df_merge["resid"] = df_merge["y"] - df_merge["yhat"]
df_merge[["y", "yhat"]].plot()

# Mean squared error over all rows (NaN residuals are skipped by the sum but
# still counted in the denominator, as before).
mse = (df_merge["resid"] ** 2).sum() / len(df_merge["resid"])
rmse = np.sqrt(mse)

# Back-transform from log scale to the original scale.
df_merge["x"] = np.exp(df_merge.iloc[:, 0])
df_merge["xhat"] = np.exp(df_merge.iloc[:, 1])
df_merge["x_resid"] = df_merge["x"] - df_merge["xhat"]

# Residual diagnostics: stationarity test and (partial) autocorrelation.
print(tsa.adfuller(df_merge["resid"]))
df_merge["resid"].plot()
tsa.graphics.plot_acf(df_merge["resid"])
tsa.graphics.plot_pacf(df_merge["resid"])

# prediction
pred = best["model"].predict(start=test_df.index[0],
                             end=test_df.index[-1],
                             dynamic=True)
plt.figure(figsize=(22, 10))
plt.plot(train_df.index, train_df, label="Train")
plt.plot(pred.index, pred, label="SARIMA", color="r")
plt.plot(test_df.index, test_df, label="Test", color="k")
plt.legend(loc="best", fontsize="xx-large")
plt.show()
# is the **Augmented Dickey-Fuller** test. # # The null hypothesis in the test # is that the data is non-stationary, # and therefore needs to be differenced. # # The alternate hypothesis is that is is stationary, # and therefore does not need to be differenced. # # This test is available in `smt.adfuller` in stastmodels. # + from collections import namedtuple ADF = namedtuple("ADF", "adf pvalue usedlag nobs critical icbest") ADF(*smt.adfuller(y))._asdict() # - # So here we failed to reject the null hypothesis. # Difference it and try again. ADF(*smt.adfuller(y.diff().dropna()))._asdict() # Now fit another OLS model. data = ( y.to_frame(name="y") .assign(Δy=lambda df: df.y.diff()) .assign(LΔy=lambda df: df.Δy.shift()) ) mod_stationary = smf.ols("Δy ~ LΔy", data=data.dropna())
Summaryで特筆すべきはDurbin-Watson比率です。これが2よりも十分に大きい時は負の系列相関。2より十分小さいときには正の系列相関が疑われます。経済時系列データを使った回帰分析では、系列相関が頻繁に生じますから、特に注意が必要です。 ちなみに、系列相関をはじめ、古典的な回帰モデルの診断手続きは経済企画庁[1988]が詳しいです。だいぶ昔のレポートですが、線形回帰モデルは古典的な方法ですので、その基本は変わっていません。http://www.esri.go.jp/jp/archive/bun/bun112/bun112a.pdf 誤差項に系列相関が残っている場合、トレンドも含めて、モデルに含まれていない要因が大きい影響を持っている可能性がありますので、思い当たる説明変数を加えてみたり、タイム・トレンドやラグ項を足したり、変分を取るなりして、コントロールしたほうがよいでしょう。 このような系列相関のチェックには、ADF検定によって誤差項の定常性を確認するのも有効だと思います。 """ # ADF test, H0: Non-stationary tsa.adfuller(rlt.resid,regression='nc') # Autocorrel plot of resid autocorrelation_plot(rlt.resid) # Show ACF of residuals ACF_resid=tsa.acf(rlt.resid) # Keep ACF of residuals """ 誤差項が定常であれば、モデル内の説明変数と被説明変数との間に安定した(一時的に外れても帰ってくるような)関係があることが保証されます。また、多くの経済変数はそもそも非定常ですので、残差が定常の場合、重要な要因がモデルから脱落している可能性も低くなります。 系列相関以外に大切なのは、多重共線性(マルチコリニアリティ)のチェックでしょう。これは、説明変数の間に強い相関がある場合に生じるもので、推定される係数の符号が反転してしまったりしますので厄介です。 以下のようにVIF統計量を計算して、10を大きく上回っていなければ、ひとまず安心と考えます。また、VIFを参照して機械的に判定しなくても、想定される符号と逆の符号を持った説明変数が現れれば、経験的にマルチコに気づくと思います。もっとも、マルチコの解決策は強相関している説明変数のどれかを取り除くくらいしか解決策がありません。 リッジ回帰など、パラメター空間を制約するやり方はそもそもパラメターの不偏性を犠牲にする上に、必ずしもマルチコを解消させる保障がないため、歪めますので、計量経済学では推奨されていません。
def test_stationarity(time_df, maxlag=31, regression='c', autolag='BIC',
                      window=None, plot=False, verbose=False, var_only=False):
    """Check unit-root stationarity of a series or of every VAR variable.

    Null hypothesis of the Augmented Dickey-Fuller test: the series has a
    unit root (is non-stationary). If p >= alpha (0.05) the series is
    treated as non-stationary; if p < alpha the null is rejected.

    Original source: http://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
    Function: http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html

    Parameters
    ----------
    time_df : series, single-column DataFrame or array-like (non-VAR mode),
        or a DataFrame with one column per variable (``var_only=True``).
        The input is deep-copied and never modified.
    maxlag, regression, autolag : forwarded to ``adfuller``. ``maxlag`` is
        lowered to 5 when the data has at most ``1.5 * maxlag`` rows and a
        ``None`` regression falls back to ``'c'``.
    window : rolling window for the optional plot (default 4).
    plot : plot the data with its rolling mean and standard deviation.
    verbose : print parameters and result tables.
    var_only : test every column and search for the differencing order.

    Returns
    -------
    * ``var_only=True``: the number of differences (0, 1 or 2) after which
      all variables are stationary, or 0 if two differences are not enough.
    * otherwise: ``True`` if the series is stationary as is or after one
      difference, else ``False``.
    * ``None`` when plotting was requested and failed.
    """
    time_df = copy.deepcopy(time_df)
    if len(time_df) <= int(1.5 * maxlag):
        maxlag = 5  # too few observations for the requested lag order
    if regression is None:
        regression = 'c'
    if verbose:
        print('\nRunning Augmented Dickey-Fuller test with paramters:')
        print(' maxlag: {}'.format(maxlag),
              'regression: {}'.format(regression),
              'autolag: {}'.format(autolag))
    alpha = 0.05
    # pandas view of the input, needed for rolling statistics and diff().
    series = _stationarity_as_pandas(time_df)
    if plot:
        try:
            if window is None:
                window = 4
            rolmean = series.rolling(window=window, center=False).mean()
            rolstd = series.rolling(window=window, center=False).std()
            plt.plot(series, color='blue', label='Original')
            plt.plot(rolmean, color='red',
                     label='Rolling Mean ({})'.format(window))
            plt.plot(rolstd, color='black',
                     label='Rolling Std ({})'.format(window))
            plt.legend(loc='best')
            plt.title('Rolling Mean & Standard Deviation')
            plt.show(block=False)
        except Exception:
            print('Data must have date-time as index to plot!')
            return

    if var_only:
        # In VAR models every variable must be stationary: test, and if any
        # variable fails, difference once more and test again (at most two
        # differences).
        messages = (
            'Data is already stationary',
            'Data is stationary after one differencing',
            'Data is stationary after two differencing',
        )
        for n_diffs, message in enumerate(messages):
            if check_each_var_for_stationarity(time_df, autolag, verbose):
                print(message)
                return n_diffs
            if n_diffs < len(messages) - 1:
                time_df = time_df.diff(1).dropna()
        print(
            'Alert! Data is not stationary even after two differencing. Continuing...'
        )
        return 0

    # In non-VAR models only the target variable is tested.
    dftest, dfoutput = _stationarity_adf(series, maxlag, regression, autolag)
    if verbose:
        print('Results of Augmented Dickey-Fuller Test:')
        pretty_print_table(dfoutput)
    if not dftest[1] >= alpha:
        print(' this series is stationary')
        return True

    print(
        ' this series is non-stationary. Trying test again after differencing...'
    )
    differenced = series.diff(1).dropna().values
    dftest, dfoutput = _stationarity_adf(differenced, maxlag, regression,
                                         autolag)
    if verbose:
        print(
            'After differencing=1, results of Augmented Dickey-Fuller Test:'
        )
        pretty_print_table(dfoutput)
    if dftest[1] >= alpha:
        print(' this series is not stationary')
        return False
    print(' this series is stationary')
    return True


def _stationarity_as_pandas(data):
    """Return ``data`` as a pandas object; a one-column frame becomes a Series."""
    if isinstance(data, pd.DataFrame):
        return data.iloc[:, 0] if data.shape[1] == 1 else data
    if isinstance(data, pd.Series):
        return data
    return pd.Series(data)


def _stationarity_adf(values, maxlag, regression, autolag):
    """Run ``adfuller`` and return ``(raw result tuple, labelled Series)``."""
    dftest = smt.adfuller(values, maxlag=maxlag, regression=regression,
                          autolag=autolag)
    dfoutput = pd.Series(dftest[0:4],
                         index=[
                             'Test Statistic',
                             'p-value',
                             '#Lags Used',
                             'Number of Observations Used',
                         ],
                         name='Dickey-Fuller Augmented Test')
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    return dftest, dfoutput
■ 推計結果の診断: 系列相関と多重共線性 Summaryで特筆すべきはDurbin-Watson比率です。これが2よりも十分に大きい時は負の系列相関。2より十分小さいときには正の系列相関が疑われます。経済時系列データを使った回帰分析では、系列相関が頻繁に生じますから、特に注意が必要です。 ちなみに、系列相関をはじめ、古典的な回帰モデルの診断手続きは経済企画庁[1988]が詳しいです。だいぶ昔のレポートですが、線形回帰モデルは古典的な方法ですので、その基本は変わっていません。http://www.esri.go.jp/jp/archive/bun/bun112/bun112a.pdf 誤差項に系列相関が残っている場合、トレンドも含めて、モデルに含まれていない要因が大きい影響を持っている可能性がありますので、思い当たる説明変数を加えてみたり、タイム・トレンドやラグ項を足したり、変分を取るなりして、コントロールしたほうがよいでしょう。 このような系列相関のチェックには、ADF検定によって誤差項の定常性を確認するのも有効だと思います。 """ # ADF test, H0: Non-stationary tsa.adfuller(rlt.resid, regression='nc') # Autocorrel plot of resid autocorrelation_plot(rlt.resid) # Show ACF of residuals ACF_resid = tsa.acf(rlt.resid) # Keep ACF of residuals """ 誤差項が定常であれば、モデル内の説明変数と被説明変数との間に安定した(一時的に外れても帰ってくるような)関係があることが保証されます。また、多くの経済変数はそもそも非定常ですので、残差が定常の場合、重要な要因がモデルから脱落している可能性も低くなります。 系列相関以外に大切なのは、多重共線性(マルチコリニアリティ)のチェックでしょう。これは、説明変数の間に強い相関がある場合に生じるもので、推定される係数の符号が反転してしまったりしますので厄介です。 以下のようにVIF統計量を計算して、10を大きく上回っていなければ、ひとまず安心と考えます。また、VIFを参照して機械的に判定しなくても、想定される符号と逆の符号を持った説明変数が現れれば、経験的にマルチコに気づくと思います。もっとも、マルチコの解決策は強相関している説明変数のどれかを取り除くくらいしか解決策がありません。 リッジ回帰など、パラメター空間を制約するやり方はそもそもパラメターの不偏性を犠牲にする上に、必ずしもマルチコを解消させる保障がないため、歪めますので、計量経済学では推奨されていません。 """