def setup_class(cls):
    """Build the no-constant ADF fixture: one fixed lag, no automatic lag search.

    Stores the plain result in ``cls.res1``, the reference statistic,
    p-value and critical values the tests compare against, and the result
    store of a second run made with ``store=True`` in ``cls.store``.
    """
    adf_options = dict(regression="nc", autolag=None, maxlag=1)

    cls.res1 = adfuller(cls.y, **adf_options)
    cls.teststat = -2.4511596
    # Stata does not return a p-value for noconstant;
    # this value is just taken from our results.
    cls.pvalue = 0.013747
    cls.critvalues = [-2.587, -1.950, -1.617]

    # With store=True the call returns four items; only the last is kept.
    _, _, _, cls.store = adfuller(cls.y, store=True, **adf_options)
def diff_nonstationary(x, alpha):
    """Return how many differencing passes make ``x`` pass the ADF test.

    The series counts as stationary once the augmented Dickey-Fuller
    p-value is at or below ``alpha``. A result of 0 means no differencing
    is needed.

    Parameters
    ----------
    x : input time series; must provide ``.diff()`` and ``.dropna()``
        (e.g. a pandas Series).
    alpha : significance level used both for the trend check and for the
        ADF p-value.

    Returns
    -------
    int
        Number of times ``x`` was differenced.
    """
    # Regress the series on 1..n; the first ADF test includes a trend term
    # only when that regression is significant at ``alpha``.
    time_index = pd.Series(range(1, len(x) + 1))
    has_trend = stats.linregress(time_index, x).pvalue < alpha
    pvalue = adfuller(x, regression='ct' if has_trend else 'c')[1]

    i = 0  # no differencing applied yet
    while pvalue > alpha:
        x = x.diff()
        # diff() leaves a leading NaN, which adfuller cannot handle.
        pvalue = adfuller(x.dropna(), regression='c')[1]
        i += 1
    return i
def cluster_vs_meta_granger_TM(c,X,M,Ml,lags=7,thresh=0.05):
    """Wald-test each series in ``M`` against the summed cluster signal.

    ``X[c].sum(0)`` is the cluster series. For every row of ``M`` a lagged
    design matrix is built from both series, an OLS model is fitted and a
    Wald test is run on the coefficient block selected by ``Restr``.

    Parameters: ``c`` indexes ``X``; ``M`` is iterated row by row; ``Ml``
    holds the label for each row of ``M``; ``lags`` is the lag count passed
    to the ADF test and used in the design matrix; ``thresh`` is the p-value
    cut-off.

    Returns a tuple ``(m, hits)`` where ``hits`` is the sorted list of
    ``(p-value, label)`` pairs whose Wald p-value is below ``thresh``.

    NOTE(review): ``m`` is whatever the last loop iteration left behind and
    is unbound if ``M`` is empty -- confirm callers never pass an empty ``M``.
    """
    # use the Toda Yamamoto method (environmental data is stationary, but clusters are not)
    x1 = X[c].sum(0)
    adf = stattools.adfuller(x1,maxlag=lags)
    # When the statistic is above the 5% critical value, keep the lag count
    # reported by the test (adf[2]) as extra lags; otherwise use none.
    if (adf[0] > adf[4]['5%']):
        m1 = adf[2]
    else:
        m1 = 0
    R = []
    for j,x2 in enumerate(M):
        # Restrict both series to positions where x2 is finite.
        have_values = np.isfinite(x2)
        xi = x1[have_values]
        x2i = x2[have_values]
        adf = stattools.adfuller(x2i,maxlag=lags)
        if (adf[0] > adf[4]['5%']):
            m2 = adf[2]
        else:
            m2 = 0
        m = max(m1,m2)
        # One shifted slice per lag: m1+lags slices of xi followed by
        # m2+lags slices of x2i; the offsets keep all slices the same length.
        y = [xi[i+max(0,m2-m1):len(xi)+i-(m1+lags)] for i in range(m1+lags)] + [x2i[i+max(0,m1-m2):len(xi)+i-(m2+lags)] for i in range(m2+lags)]
        y = np.array(y).T
        lm = linear_model.OLS(xi[max(m1,m2)+lags:],y)
        result = lm.fit()
        # Restriction matrix: identity rows from index m+lags onward.
        Restr = np.eye(y.shape[1])[m+lags:]
        wald = result.wald_test(Restr)
        if wald.pvalue < thresh:
            R.append((wald.pvalue,Ml[j]))
    return m,sorted(R)
def __init__(self):
    """Build the no-constant ADF fixture on ``self.y`` with one fixed lag.

    Keeps the plain result, the reference statistic, p-value and critical
    values, and the result store of a second run made with ``store=True``.
    """
    adf_options = dict(regression="nc", autolag=None, maxlag=1)

    self.res1 = adfuller(self.y, **adf_options)
    self.teststat = -2.4511596
    # Stata does not return a p-value for noconstant;
    # this value is just taken from our results.
    self.pvalue = 0.013747
    self.critvalues = [-2.587, -1.950, -1.617]

    # With store=True the call returns four items; only the last is kept.
    _, _, _, self.store = adfuller(self.y, store=True, **adf_options)
def ADF(self, v, crit='5%', max_d=6, reg='nc', autolag='AIC'):
    """Augmented Dickey-Fuller test on a residual vector or matrix.

    Parameters
    ----------
    v : ndarray
        Residuals. A 2-D input is tested column by column; anything
        without a second dimension is tested as a single series.
    crit : str
        Key of the critical value to compare against (e.g. ``'5%'``).
    max_d : int
        Passed to ``adfuller`` as ``maxlag``.
    reg : str
        Passed to ``adfuller`` as ``regression``.
    autolag : str
        Passed to ``adfuller`` as ``autolag``.

    Returns
    -------
    bool
        True if ``v`` passes the test. For a matrix every column's
        statistic must be strictly below the critical value; for a single
        series the statistic must not exceed it.
    """
    # Only the shape probe is guarded. Failures inside adfuller now surface
    # instead of being hidden by a retry on the whole matrix.
    try:
        n_cols = v.shape[1]
    except (AttributeError, IndexError):
        adf = adfuller(v, maxlag=max_d, regression=reg, autolag=autolag)
        return not adf[0] > adf[4][crit]

    for j in range(n_cols):
        adf = adfuller(v[:, j], maxlag=max_d, regression=reg, autolag=autolag)
        if not adf[0] < adf[4][crit]:
            # One failing column is enough to reject the whole matrix.
            return False
    return True
def test_adfuller_short_series(reset_randomstate):
    """Check adfuller on tiny samples: lag cap on 7 points, errors on 2 and 3."""
    # Seven observations: the stored result must report maxlag == 1.
    stored = adfuller(np.random.standard_normal(7), store=True)[-1]
    assert stored.maxlag == 1

    # Two observations (default regression) and three observations with
    # regression='ct' must both be rejected as too short.
    too_short = 'sample size is too short'
    for nobs, extra in ((2, {}), (3, {'regression': 'ct'})):
        sample = np.random.standard_normal(nobs)
        with pytest.raises(ValueError, match=too_short):
            adfuller(sample, **extra)
def testADFTest():
    """Print the ADF and Jarque-Bera results for 100 standard-normal draws."""
    import statsmodels.tsa.stattools as sts
    import statsmodels.stats.stattools as sss
    import numpy as np

    data = np.random.randn(100)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    # statsmodels.tsa.stattools.adfuller: unit-root test.
    print(sts.adfuller(data))
    # statsmodels.stats.stattools.jarque_bera: normality test.
    print(sss.jarque_bera(data))
def adftest(y, short_flag):
    '''Run the augmented Dickey-Fuller test on ``y`` and print the outcome.

    Four regression specifications are tried: constant ('c'), constant and
    trend ('ct'), constant with linear and quadratic trend ('ctt'), and no
    constant ('nc').

    When ``short_flag`` is truthy, one verdict per specification is printed:
    "stationary" if the test statistic is at or below the 5% critical value,
    "nonstationary" otherwise. When it is falsy, the raw ``adfuller`` result
    is printed for each specification instead.
    '''
    sep = 32 * '--'
    # Every print takes one argument so output is identical on Python 2 and 3.
    print("\n\t\tAugmented Dicky-Fuller test\n")
    if short_flag:
        stationarity = ["stationary", "nonstationary"]
        # Run all four tests first so a failure prints no partial table.
        verdicts = {}
        for spec in ('c', 'ct', 'ctt', 'nc'):
            test = adfuller(y, regression=spec)
            verdicts[spec] = stationarity[1 if test[0] > test[4]['5%'] else 0]
        print(sep)
        print("- constant only:\t\t\t\t{}".format(verdicts['c']))
        print("- constant and trend:\t\t\t\t{}".format(verdicts['ct']))
        print("- constant, and linear and quadratic trend:\t{}".format(verdicts['ctt']))
        print("\n- no constant, no trend:\t\t\t{}".format(verdicts['nc']))
        print(sep)
    else:
        reports = (
            ("- constant only\n{}", 'c'),
            ("- constant and trend\n{}", 'ct'),
            ("- constant, and linear and quadratic trend\n{}", 'ctt'),
            ("\n- no constant, no trend\n{}", 'nc'),
        )
        for template, spec in reports:
            print(template.format(adfuller(y, regression=spec)))
        print(sep)
def summarize_all(self):
    """Print a summary of the fitted regression and its residuals.

    Reads ``self.dependent``, ``self.independent`` (a list of regressor
    names) and ``self.result`` (the fitted model). Prints a header, the
    model equation, R squared, the confidence interval of each slope, the
    residual standard deviation, a mean-reversion verdict and the half
    life of the residuals.

    The verdict is 'likely' when the ADF statistic of the residuals is at
    or below the 5% critical value, 'unlikely' otherwise.
    """
    result = self.result
    params = result.params
    conf = result.conf_int()
    dependent = self.dependent
    independent = self.independent
    b = params[0]

    cadf = adfuller(result.resid)
    boolean = 'likely' if cadf[0] <= cadf[4]['5%'] else 'unlikely'

    if len(independent) == 1:
        # Single regressor: slope and interval are addressed by position.
        name = independent[0]
        regressors = name.upper()
        model = "%.4f * %s" % (params[1], name)
        interval = "[%.4f, %.4f]" % (conf.iloc[1, 0], conf.iloc[1, 1])
    else:
        # Several regressors: slopes and intervals are addressed by name.
        regressors = (', '.join(independent)).upper()
        model = ' + '.join(
            "%.4f * %s" % (params[name], name) for name in independent)
        interval = ' , '.join(
            "[%.4f, %.4f]" % (conf.loc[name, 0], conf.loc[name, 1])
            for name in independent)

    # print("") emits the blank line on both Python 2 and 3; a bare
    # ``print`` is a silent no-op on Python 3.
    print("")
    print("{:^40}".format("{} vs {}".format(dependent.upper(), regressors)))
    print("%20s %s = %s + %.4f" % ("Model:", dependent, model, b))
    print("%20s %.4f" % ("R square:", result.rsquared))
    print("%20s %s" % ("Confidence interval:", interval))
    print("%20s %.4f" % ("Model error:", result.resid.std()))
    print("%20s %s" % ("Mean reverting:", boolean))
    print("%20s %d" % ("Half life:", half_life(result.resid)))
def _adsf_score(series, times, window_length):
    """Return the ADF test statistic of each window of ``series``.

    One window of ``window_length`` items is taken at every offset in
    ``times``; the result list follows the order of ``times``.
    """
    return [
        ts.adfuller(series[offset:offset + window_length])[0]
        for offset in times
    ]
def dickeyfuller_fcn(data,maxlag):
    """Return the ADF p-value of ``data``, or NaN if the test fails.

    ``data`` is expected to be an array of values (per the original
    @FORMAT note); ``maxlag`` is forwarded to ``adfuller``.
    """
    try:
        df_fcn = adfuller(data,maxlag)
    except Exception:
        # Best effort: a failed test is reported as NaN. KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return np.nan
    return df_fcn[1]
def test_stationarity(self, timeseries, window, return_plot=False):
    """Run the ADF test on ``timeseries`` and return a labelled summary.

    The returned pandas Series holds the test statistic, p-value, lags
    used, number of observations and one entry per critical value.
    ``window`` is the rolling-window size used for the optional plot.

    NOTE(review): the source formatting was lost; the two trailing prints
    are assumed to belong to the ``return_plot`` branch -- confirm.
    """
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    if return_plot:
        # Determine rolling statistics
        rolmean = timeseries.rolling(window = window, center = False).mean()
        rolstd = timeseries.rolling(window = window, center = False).std()
        # Plot rolling statistics (the returned line handles are not used):
        orig = plt.plot(timeseries, color='blue', label='Original')
        mean = plt.plot(rolmean, color='red', label='Rolling Mean')
        std = plt.plot(rolstd, color='black', label='Rolling Std')
        plt.legend(loc='best')
        plt.title('Rolling Mean & Standard Deviation')
        plt.show(block=False)
        # Report the Dickey-Fuller results computed above:
        print('Results of Dickey-Fuller Test:')
        print(dfoutput)
    return dfoutput
def test_frame_timeseries_dickey_fuller_constant_trend_squared(self):
    """Frame ADF with regression 'ctt' must agree with statsmodels."""
    column, lag, spec = "logM", 1, "ctt"

    result = self.frame.timeseries_augmented_dickey_fuller_test(
        column, max_lag=lag, regression=spec)
    # Reference values: statistic first, p-value second.
    expected_stat, expected_p = smtsa.adfuller(
        self.pandaframe[column], maxlag=lag, regression=spec)[:2]

    self.assertAlmostEqual(result.p_value, expected_p, delta=0.0001)
    self.assertAlmostEqual(result.test_stat, expected_stat, delta=0.01)
def test_frame_timeseries_dickey_fuller_no_constant(self):
    """Frame ADF with regression 'nc' must agree with statsmodels."""
    column, lag, spec = "logM", 1, "nc"

    result = self.frame.timeseries_augmented_dickey_fuller_test(
        column, max_lag=lag, regression=spec)
    # Reference values: statistic first, p-value second.
    expected_stat, expected_p = smtsa.adfuller(
        self.pandaframe[column], maxlag=lag, regression=spec)[:2]

    self.assertAlmostEqual(result.p_value, expected_p, delta=0.0001)
    self.assertAlmostEqual(result.test_stat, expected_stat, delta=0.01)
def stationarity(timeseries):
    """Plot rolling statistics of ``timeseries`` and print its ADF summary.

    The series is drawn with its 12-period rolling mean and standard
    deviation, the figure is saved as ``stationarity.png``, and the ADF
    results (rounded to three decimals) are printed.

    Reads the module-level names ``headers``, ``data`` and ``index`` to
    build the plot title.
    """
    # Determine rolling statistics
    rol_mean = timeseries.rolling(window=12).mean()
    rol_std = timeseries.rolling(window=12).std()

    # Plot rolling statistics:
    fig, ax = plt.subplots()
    plt.grid(color='grey', which='major', axis='y', linestyle='--')
    plt.plot(timeseries, color='blue', label='Original', linewidth=1.25)
    plt.plot(rol_mean, color='red', label='Rolling Mean', linewidth=1.25)
    plt.plot(rol_std, color='black', label = 'Rolling Std', linewidth=1.25)
    plt.legend(loc='best')
    # Join the title parts into one string; passing the tuple itself would
    # render its repr, parentheses and quotes included.
    title_parts = (headers[1], data[index].iloc[0], '-', data[index].iloc[-1])
    plt.title(' '.join(str(part) for part in title_parts))
    # tick_params takes booleans; the strings "on"/"off" are both truthy.
    plt.tick_params(axis="both", which="both", bottom=True, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    fig.savefig('stationarity.png', bbox_inches="tight")

    # Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:\n')
    df_test = adfuller(timeseries, autolag='AIC')
    df_output = pd.Series(df_test[0:4], index=['Test Statistic','p-value','#Lags Used','No. of Observations Used'])
    for key,value in df_test[4].items():
        df_output['Critical Value (%s)'%key] = value
    print (df_output.round(3))
def cointegration_test(symbol, etf):
    """Two-step cointegration check: OLS fit, then ADF on the residuals.

    Regresses ``instPrices[symbol]`` on ``etfPrices`` (both module-level)
    and returns the ``adfuller`` result for the regression residuals.

    NOTE(review): ``etf`` is accepted but never used; the regressor is the
    whole of ``etfPrices`` -- confirm this is intended.
    """
    # Step 1: regress one variable on the other.
    regression = sm.OLS(instPrices[symbol], etfPrices).fit()
    # Step 2: take the residuals of that fit.
    residuals = regression.resid
    # Step 3: test the residuals for a unit root.
    return ts.adfuller(residuals)
def __init__(self):
    """Build the no-constant ADF fixture on ``self.x`` with four fixed lags."""
    adf_options = dict(regression="nc", autolag=None, maxlag=4)

    self.res1 = adfuller(self.x, **adf_options)
    self.teststat = 3.5227498
    # Stata does not return a p-value for noconstant.
    # Tau^max in MacKinnon (1994) is missing, so it is
    # assumed that its right-tail is well-behaved.
    self.pvalue = 0.99999
    self.critvalues = [-2.587, -1.950, -1.617]
def stationary(self):
    """Evaluate whether the current timeseries is stationary.

    Runs the ADF test on ``self._y_current`` and prints the statistic, the
    p-value and every critical value.

    Args:
        None

    Returns:
        state: True only if the ADF statistic is below the smallest
            critical value reported by the test.

    """
    result = adfuller(self._y_current)
    adf = result[0]
    critical_values = result[4]

    print('> Stationarity Test:')
    print(' ADF Statistic: {:.3f}'.format(adf))
    print(' p-value: {:.3f}'.format(result[1]))
    print(' Critical Values:')
    for level, threshold in critical_values.items():
        print('\t{}: {:.3f}'.format(level, threshold))

    return bool(adf < min(list(critical_values.values())))
def adfuller_json(ts, autolag="AIC"):
    """
    Run the Dickey-Fuller test and return the results as plain dicts.

    Params:
        ts - a 1d np.array
        autolag - autolag parameter for adfuller, AIC by default
    Output:
        res - dict with keys stat, pval, nlags, nobs
        crit - dict with one 'crit<level>' entry per critical value
    Prints a table of the results as a side effect.
    """
    r = adfuller(ts, autolag=autolag)
    headline = r[0:4]
    critical_values = r[4]

    res = dict(zip(('stat', 'pval', 'nlags', 'nobs'), headline))
    crit = {'crit{0}'.format(level): threshold
            for level, threshold in critical_values.items()}

    # Human-readable table: headline numbers first, then critical values.
    neatoutput = pd.Series(headline,
                           index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for level, threshold in critical_values.items():
        neatoutput['Critical Value ({0})'.format(level)] = threshold
    print(neatoutput)

    return res, crit
def is_stationary(x, p = 10):
    """Return True if ``x`` is stationary at the ``p`` percent level.

    Uses the ADF test with regression 'ctt'. The unit-root null is
    rejected, and the series reported as stationary, when the test
    statistic is at or below the critical value for the chosen level.

    Parameters
    ----------
    x : array-like series to test.
    p : significance level in percent; one of 1, 5 or 10.

    Raises
    ------
    ValueError
        If ``p`` is not 1, 5 or 10 (this previously returned None).
    """
    levels = {1: '1%', 5: '5%', 10: '10%'}
    if p not in levels:
        raise ValueError("p must be one of 1, 5 or 10, got %r" % (p,))

    x = np.array(x)
    result = ts.adfuller(x, regression='ctt')
    # Stationary means the statistic is MORE negative than the critical
    # value; the previous '>=' comparison returned the opposite answer.
    return bool(result[0] <= result[4][levels[p]])
def testStationarity(ts):
    """Run the ADF test on ``ts`` and return the results as a labelled Series."""
    dftest = adfuller(ts)
    # Attach a descriptive label to each value returned by the test.
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    dfoutput = pd.Series(dftest[0:4], index=labels)
    critical_values = dftest[4]
    for level in critical_values:
        dfoutput['Critical Value (%s)'%level] = critical_values[level]
    return dfoutput
def passes_dftest(data):
    """Store the ADF verdict for ``data[0][1][1]`` in ``data[0][1][0]``.

    The series is tested with maxlag 250, regression 'ctt' and autolag
    't-stat'. The flag is True when the test statistic is below 1.
    ``data`` is modified in place and also returned.

    NOTE(review): the statistic is compared with the constant 1, not with
    a critical value -- confirm the threshold is intended.
    """
    series = data[0][1][1]
    statistic = statmodel.adfuller(series, 250, 'ctt', 't-stat', False, False)[0]
    data[0][1][0] = bool(statistic < 1)
    return data
def runSingleMRTest(ticker):
    """Print mean-reversion diagnostics and a daily VaR for one ticker.

    Downloads one year of prices ending now, then prints in order: the ADF
    statistic, its p-value, the critical values, the Hurst value, the
    rounded half-life, and the 95% variance-covariance VaR.
    """
    now = datetime.now()
    try:
        year_ago = datetime(now.year - 1, now.month, now.day)
    except ValueError:
        # Today is 29 February, which the previous year lacks; use the 28th.
        year_ago = datetime(now.year - 1, now.month, now.day - 1)
    dr = DataReader(ticker, "yahoo", year_ago, now)
    ts = dr['Adj Close']

    # ADF test with a lag order value of 2.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    adf = ts_tool.adfuller(ts, 2)
    print(adf[0])  # adf test-statistic
    print(adf[1])  # p-value
    print(adf[4])  # 1/5/10 test statistic
    print(hurst(ts))

    # calculate half-life
    md = sm.OLS(ts.diff(), sm.add_constant(ts.shift()), missing='drop')
    mdf = md.fit()
    half_life = -np.log(2)/mdf.params[1]
    lookback = np.round(half_life)
    print(lookback)

    # calculate VaR using Variance-covariance
    c = 0.95
    rets = ts.pct_change()
    mu = np.mean(rets)
    sigma = np.std(rets)
    # The VaR returned is daily possible loss
    # to convert it to monthly, mu = mu * sqrt(20)
    # or annually, mu = mu * sqrt(250)
    alpha = norm.ppf(1-c, mu, sigma)
    print(-alpha)
def stationarity_test(ts):
    """Plot rolling statistics of ``ts`` and print an ADF-based verdict.

    Shows the series with its 12-period rolling mean and standard
    deviation, prints the ADF summary table, then prints one conclusion
    line per critical value: the non-stationarity hypothesis is reported
    as "Reject" when the test statistic is below that critical value and
    "Accept" otherwise.
    """
    # Determining rolling mean and variance.
    rol_mean = ts.rolling(window=12, center=False).mean()
    rol_std = ts.rolling(window=12, center=False).std()

    # Plot rolling statistics.
    plt.plot(ts, color='#da4264', label='Original')
    plt.plot(rol_mean, color='#391a5a', label='Rolling Mean')
    plt.plot(rol_std, color='#369acd', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean and Standard Deviation')
    plt.show()

    # Perform Dickey-Fuller test.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(ts, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

    print("\nConclusion:")
    for key, value in dftest[4].items():
        if dfoutput['Test Statistic'] < dfoutput['Critical Value (%s)' % key]:
            verdict = "Reject"
        else:
            verdict = "Accept"
        # One line per level, matching the old two-part print exactly.
        print("Non-stationary series: %s at %s level" % (verdict, key))
def test_stationary(self, securities_list, beg_date, end_date):
    """Check the stationarity of price series with the ADF test.

    ``securities_list`` may be a single security or a list of them; each
    must provide ``get_prices(beg_date, end_date)``. Every price series is
    printed, tested with a constant term and AIC lag selection, and reduced
    with ``BasicMR.simplify_adf_results`` (p-value, statistic, 5% critical
    value).

    Returns a list with one simplified result per security, in input
    order. A list is returned even for a single security.
    """
    if isinstance(securities_list, list):
        securities = securities_list
    else:
        securities = [securities_list]

    # Fetch every price series before running any test.
    prices = [security.get_prices(beg_date, end_date) for security in securities]

    results = []
    for price_series in prices:
        print(price_series)
        results.append(ts.adfuller(price_series, regression="c", autolag='AIC'))

    return [
        BasicMR.simplify_adf_results(outcome[1], outcome[0], float(outcome[4]['5%']))
        for outcome in results
    ]
def teste_estacionariedade(self, timeseries):
    '''
    Test the stationarity of a series with the adfuller test.

    :param: timeseries: time series, array
    :return: nothing; prints the test statistics and shows a plot
    '''
    # Rolling statistics are computed on the full series.
    timeseries = pd.DataFrame(timeseries)
    rolling = timeseries.rolling(window=12, center=False)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Dickey-Fuller test on the values after the first row.
    print('Results of Dickey-Fuller Test:')
    timeseries = timeseries[1:].values
    dftest = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    dfoutput = pd.Series(dftest[0:4], index=labels)
    for level, threshold in dftest[4].items():
        dfoutput['Critical Value (%s)'%level] = threshold
    print(dfoutput)

    # Plot the series with its rolling mean and standard deviation.
    plt.plot(timeseries, color='blue',label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
def is_stationary(ts, test_window):
    """Plot rolling statistics of ``ts`` and print its ADF summary.

    Meant for visual inspection only; nothing is returned.

    Parameters
    ----------
    ts : pandas Series (or DataFrame) to inspect.
    test_window : window size for the rolling mean and standard deviation.
    """
    # pd.rolling_mean / pd.rolling_std were removed from pandas; the
    # .rolling() accessor is the replacement.
    rol_mean = ts.rolling(window=test_window).mean()
    rol_std = ts.rolling(window=test_window).std()

    # Plot rolling statistics:
    plt.plot(ts, color="blue", label="Original")
    plt.plot(rol_mean, color="red", label="Rolling Mean")
    plt.plot(rol_std, color="black", label="Rolling Std")
    plt.legend(loc="best")
    plt.title("Rolling Mean & Standard Deviation")
    plt.show()

    # Perform the Dickey-Fuller test on the series that was passed in. The
    # old code referenced an undefined name ``timeseries`` here.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Results of Dickey-Fuller Test:")
    dftest = adfuller(ts, autolag="AIC")
    dfoutput = pd.Series(dftest[0:4], index=["Test Statistic", "p-value", "#Lags Used", "Number of Observations Used"])
    for key, value in dftest[4].items():
        dfoutput["Critical Value (%s)" % key] = value
    print(dfoutput)
def ADF(ticker,start,end):
    """Download prices for ``ticker``, print its ADF verdict, return the data.

    The test uses the 'Adj Close' column with maxlag 1. The most stringent
    level (1%, 5%, then 10%) whose critical value lies above the statistic
    is reported; if none does, the null hypothesis cannot be rejected.
    The downloaded frame is returned in every case.
    """
    print('ADF')
    stock = DataReader(ticker, "yahoo", start, end)
    result = ts.adfuller(stock['Adj Close'], 1)
    print(result)
    print('')

    statistic = result[0]
    critical_values = result[4]

    message = 'Cannot reject Null Hypothesis'
    for level in ('1%', '5%', '10%'):
        if statistic < critical_values[level]:
            message = 'Lesser than ' + level
            break

    print(message)
    print('-----------------------------------------')
    return stock
def test_stationarity(timeseries):
    """Plot 12-period rolling statistics of ``timeseries`` and print its ADF summary.

    Adapted from
    http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/
    """
    # pd.rolling_mean / pd.rolling_std were removed from pandas; the
    # .rolling() accessor is the replacement.
    rolmean = timeseries.rolling(window=12).mean()
    rolstd = timeseries.rolling(window=12).std()

    # Plot rolling statistics:
    plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue',label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Perform Dickey-Fuller test.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=[
            'Test Statistic',
            'p-value',
            '#Lags Used',
            'Number of Observations Used',
        ],
    )
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
def get_adf_test(self, x, p):
    """Return ``(passed, pvalue)`` for the ADF test on ``x``.

    ``passed`` is True when the ADF p-value is strictly below ``p``.
    """
    pvalue = ts.adfuller(x)[1]
    return bool(pvalue < p), pvalue
def check_stationarity(df):
    """Return the ADF p-value of every column of ``df``, sorted ascending.

    Each column is tested with regression 'ct'. The result is a DataFrame
    with columns 'ticker' (the column label) and 'adf' (the p-value).
    """
    rows = [
        [ticker, adfuller(prices, regression='ct')[1]]
        for ticker, prices in df.items()
    ]
    table = pd.DataFrame(rows, columns=['ticker', 'adf'])
    return table.sort_values('adf')
"gyr_phone_y_freq_0.9_Hz_ws_40", "pca_3", "pca_3_temp_mean_ws_120", "pca_4_temp_mean_ws_120", "acc_phone_z_freq_0.9_Hz_ws_40", "pca_5", "pca_4", "acc_phone_y_freq_0.9_Hz_ws_40", "gyr_phone_z_freq_0.5_Hz_ws_40", "gyr_phone_x_freq_0.1_Hz_ws_40" ] possible_feature_sets = [ basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features ] feature_names = [ 'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features' ] # Let us first study whether the time series is stationary and what the autocorrelations are. dftest = adfuller(dataset['acc_phone_x'], autolag='AIC') plt.Figure() autocorrelation_plot(dataset['acc_phone_x']) DataViz.save(plt) plt.show() # Now let us focus on the learning part. learner = TemporalRegressionAlgorithms() eval = RegressionEvaluation() # We repeat the experiment a number of times to get a bit more robust data as the initialization of e.g. the NN is random. repeats = 3
# Axis labels are Dutch: 'JAREN' = years, 'RUIS' = noise.
plt.xlabel('JAREN')
plt.ylabel('RUIS')
plt.show()
plt.rcParams.update({'figure.figsize': (10, 10)})
# Title is Dutch for 'Multiplicative Decomposition'.
result.plot().suptitle('Multiplicatieve Decompositie', fontsize=12)
plt.show()
# NOTE(review): len(y-11) equals len(y), so yshow is a copy of all of y;
# len(y)-11 may have been intended -- confirm.
yshow = y[:len(y-11)]
# Hold out the last 12 observations as the test set.
y, ytest = y[:(len(y)-12)], y[(len(y)-12):len(y)]
# bfill fills each missing value with the next valid observation after it.
# The incident data is said to have no missing values, so this step is a
# precaution only.
y = y.fillna(y.bfill())
# Augmented Dickey-Fuller test on the training part.
result = adfuller(y)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
# Stepwise seasonal model search (m=12) with differencing fixed at d=1, D=1.
stepwise_model = auto_arima(y, start_p=1, start_q=1, max_p=3, max_q=3, m=12, start_P=0, seasonal=True, d=1, D=1, trace=True, error_action='ignore', suppress_warnings=True, stepwise=True)
print(stepwise_model.aic())
def __init__(self):
    """Build the no-constant ADF fixture on ``self.y`` with one fixed lag."""
    adf_options = dict(regression="nc", autolag=None, maxlag=1)

    self.res1 = adfuller(self.y, **adf_options)
    self.teststat = -2.4511596
    # Stata does not return a p-value for noconstant;
    # this value is just taken from our results.
    self.pvalue = 0.013747
    self.critvalues = [-2.587, -1.950, -1.617]
if __name__ == "__main__":
    # Cointegrated-ADF check of WLL against AREX over calendar year 2012.
    start = datetime.datetime(2012, 1, 1)
    end = datetime.datetime(2013, 1, 1)

    arex = web.DataReader("AREX", "yahoo", start, end)
    wll = web.DataReader("WLL", "yahoo", start, end)

    # Adjusted closes of both tickers, aligned on the AREX index.
    df = pd.DataFrame(index=arex.index)
    for symbol, frame in (("AREX", arex), ("WLL", wll)):
        df[symbol] = frame["Adj Close"]

    # Price series plot, then scatter plot, of the two tickers.
    plot_price_series(df, "AREX", "WLL")
    plot_scatter_series(df, "AREX", "WLL")

    # Optimal hedge ratio "beta": slope of WLL regressed on AREX.
    res = ols(y=df['WLL'], x=df["AREX"])
    beta_hr = res.beta.x

    # Residuals of the linear combination, and their plot.
    df["res"] = df["WLL"] - beta_hr * df["AREX"]
    plot_residuals(df)

    # CADF test on the residuals.
    cadf = ts.adfuller(df["res"])
    pprint.pprint(cadf)
def __init__(self):
    """Build the constant-plus-trend ADF fixture on ``self.y`` with one fixed lag."""
    adf_options = dict(regression="ct", autolag=None, maxlag=1)

    self.res1 = adfuller(self.y, **adf_options)
    # Reference values the computed result is compared against.
    self.teststat = -4.425093
    self.pvalue = .00199633
    self.critvalues = [-4.006, -3.437, -3.137]
def __init__(self):
    """Build the constant-plus-trend ADF fixture on ``self.x`` with four fixed lags."""
    adf_options = dict(regression="ct", autolag=None, maxlag=4)

    self.res1 = adfuller(self.x, **adf_options)
    # Reference values the computed result is compared against.
    self.teststat = -1.8566374
    self.pvalue = .67682968
    self.critvalues = [-4.007, -3.437, -3.137]
df = web.DataReader(stock_code, data_source='yahoo', start=start_date, end=end_date) return df # stock_code='000001' #股票代码平安银行 # stock_code='600519' #股票代码贵州茅台 stock_code='600522' #股票代码中天科技 datalen=365 content='Close' data=get_stock_history(stock_code,datalen,content) print(data) subdata = data.iloc[:,:4] # print(subdata.shape) #平稳性检验 pvalue = stat.adfuller(subdata.values[:,3], 1)[1] print("指标 ",data.columns[3]," 单位根检验的p值为:",pvalue) #一阶差分并进行平稳性检验 subdata_diff1 = subdata.iloc[1:,:].values - subdata.iloc[:-1,:].values pvalue = stat.adfuller(subdata_diff1[:,3], 1)[1] print("指标 ",data.columns[3]," 单位根检验的p值为:",pvalue) # 模型阶数从1开始逐一增加 rows, cols = subdata_diff1.shape aicList = [] lmList = [] for p in range(1,11): baseData = None for i in range(p, rows):
#ADF Test Open_table_index = [] int_w = [] for i in range(len(arg_mix)): if arg_mix[i] != 0: w1, w2 = num_weight(table.w1.iloc[i], table.w2.iloc[i], Open_price[i, 0], Open_price[i, 1], maxi, capital) if Adf: if tick: ADF_spread = w1 * np.log( test_stock1[i,:(150+arg_test[i])] ) +\ w2 * np.log( test_stock2[i,:(150+arg_test[i])] ) else: #此處僅為表示tick與非tick算式一樣但想法上有本質上的不同 ADF_spread = w1 * np.log( test_stock1[i,:(150+arg_test[i])] ) +\ w2 * np.log( test_stock2[i,:(150+arg_test[i])] ) if adfuller(ADF_spread, regression='c')[1] <= 0.05: Open_table_index.append(i) int_w.append([w1, w2]) else: Open_table_index.append(i) int_w.append([w1, w2]) Open_table_index = np.array(Open_table_index) #delete useless table row table = table.iloc[Open_table_index, :] arg_test = arg_test[Open_table_index] arg_mix = arg_mix[Open_table_index] ClPos = ClPos[Open_table_index] ClPos_test = ClPos_test[Open_table_index] LongOrShort = LongOrShort[Open_table_index] Open_price = Open_price[Open_table_index]
def granger(array_X, array_Y, X_name, Y_name, para_set, path_to_output):
    """Scan segments of two series with Granger and ADF tests and write the results.

    Segment boundaries are multiples of ``para_set.step`` plus the final
    sample index. For every (start, end) pair whose length lies within
    ``[min_segment_len, max_segment_len]``, the 'YX' test is run according
    to ``para_set.test_mode`` ('standard', 'fast_version_1', '_2' or '_3').
    The 'XY' test is run only if the 'YX' p-value is below
    ``significant_thres``. ADF tests on the two columns then follow,
    depending on ``para_set.cal_stationary_separately``.

    P-values are stored in (n_step+1) x (n_step+1) arrays initialised to
    -2; a test skipped for an examined segment is recorded as -1.
    Everything is handed to ``output.output_causal``.

    Returns a tuple: total time, time spent in Granger tests, time spent in
    ADF tests, then segment count, prune counter and prune timer for the
    'YX' direction, and the same three for 'XY'.

    NOTE(review): the nesting below was reconstructed from source that had
    lost its indentation. In particular, the ``prune.update_bound`` call in
    'fast_version_3' is assumed to sit in the ``else`` branch -- confirm.
    NOTE(review): if ``test_mode`` matches none of the four modes,
    ``p_value_YX`` / ``p_value_XY`` are not assigned for the segment.
    """
    step = para_set.step
    lag = para_set.lag
    test_mode = para_set.test_mode
    significant_thres = para_set.significant_thres
    min_segment_len = para_set.min_segment_len
    max_segment_len = para_set.max_segment_len
    n_sample = len(array_X)
    print('sample size: ' + str(n_sample))
    # ------------------------------ initialization ------------------------------
    cnt_prune_YX = cnts_prune(0, 0, 0, 0)
    cnt_prune_XY = cnts_prune(0, 0, 0, 0)
    time_prune_XY = time_prune(0, 0, 0, 0)
    time_prune_YX = time_prune(0, 0, 0, 0)
    print(X_name)
    time1 = timeit.default_timer()
    time_granger = 0
    time_adf = 0
    # Column-wise concatenation in both orders (Y first / X first).
    array_YX = np.concatenate((array_Y, array_X), axis=1)
    array_XY = np.concatenate((array_X, array_Y), axis=1)
    n_step = int(n_sample / step - 1)
    # Boundaries: 0, step, 2*step, ... followed by the last sample index.
    list_segment_split = [step * i for i in range(n_step)]
    list_segment_split.append(n_sample - 1)
    start = 0
    end = 0
    total_cnt_segment_YX = 0
    total_cnt_segment_XY = 0
    total_cnt_segment_adf = 0
    total_cnt_segment_cal_adf = 0
    total_cnt_segment_examine_adf_Y = 0
    # Result grids indexed by (start boundary i, end boundary j); -2 marks
    # a cell that was never examined.
    array_results_YX = np.full((n_step + 1, n_step + 1), -2, dtype=float)
    array_results_XY = np.full((n_step + 1, n_step + 1), -2, dtype=float)
    array_adf_results_X = np.full((n_step + 1, n_step + 1), -2, dtype=float)
    array_adf_results_Y = np.full((n_step + 1, n_step + 1), -2, dtype=float)
    # get lagged data
    dta_YX, dtaown_YX, dtajoint_YX = parts.get_lagged_data(array_YX, lag, addconst=True, verbose=False)
    dta_XY, dtaown_XY, dtajoint_XY = parts.get_lagged_data(array_XY, lag, addconst=True, verbose=False)
    # make the data to the original length
    # dta_YX = np.concatenate((np.zeros((lag, np.shape(dta_YX)[1])), dta_YX), axis = 0)
    # dtaown_YX = np.concatenate((np.zeros((lag, np.shape(dtaown_YX)[1])), dtaown_YX), axis = 0)
    # dtajoint_YX = np.concatenate((np.zeros((lag, np.shape(dtajoint_YX)[1])), dtajoint_YX), axis = 0)
    # dta_XY = np.concatenate((np.zeros((lag, np.shape(dta_XY)[1])), dta_XY), axis = 0)
    # dtaown_XY = np.concatenate((np.zeros((lag, np.shape(dtaown_XY)[1])), dtaown_XY), axis = 0)
    # dtajoint_XY = np.concatenate((np.zeros((lag, np.shape(dtajoint_XY)[1])), dtajoint_XY), axis = 0)
    #
    # dtaown_YX[:lag,-1] = 1
    # dtajoint_YX[:lag,-1] = 1
    # dtaown_XY[:lag,-1] = 1
    # dtajoint_XY[:lag,-1] = 1
    # maintain a non_zero flag to update degree of freedom
    if_non_zero_columns_YX = np.zeros(np.shape(dtajoint_YX)[1])
    if_non_zero_columns_XY = np.zeros(np.shape(dtajoint_XY)[1])
    # begin loop: i selects the segment start, j the segment end
    for i in range(n_step):
        start = list_segment_split[i]
        print(str(start) + '/' + str(len(array_YX)))
        # State carried from one end boundary to the next is reset for
        # every new start boundary.
        reset_cnt_YX = -1
        res2down_YX = None
        res2djoint_YX = None
        res2down_ssr_upper_YX = 0
        res2down_ssr_lower_YX = 0
        res2djoint_ssr_upper_YX = 0
        res2djoint_ssr_lower_YX = 0
        res2djoint_df_resid_YX = 0
        reset_cnt_XY = -1
        res2down_XY = None
        res2djoint_XY = None
        res2down_ssr_upper_XY = 0
        res2down_ssr_lower_XY = 0
        res2djoint_ssr_upper_XY = 0
        res2djoint_ssr_lower_XY = 0
        res2djoint_df_resid_XY = 0
        for j in range(i + 1, n_step + 1):
            end = list_segment_split[j]
            dta_start = start
            # The lagged arrays are indexed up to end - lag.
            dta_end = end - lag
            # Segments outside the allowed length are skipped, but the
            # non-zero-column flags still absorb the newest step of rows.
            if (len(array_YX[start:end, :]) < min_segment_len or len(array_YX[start:end, :]) > max_segment_len):
                if_non_zero_columns_YX = np.logical_or(
                    np.sum(dtajoint_YX[dta_end - step:dta_end, :], axis=0) != 0, if_non_zero_columns_YX)
                if_non_zero_columns_XY = np.logical_or(
                    np.sum(dtajoint_XY[dta_end - step:dta_end, :], axis=0) != 0, if_non_zero_columns_XY)
                continue
            # ------------------------------ F test ------------------------------
            time3 = timeit.default_timer()
            if test_mode == 'standard':
                p_value_YX, res2down_YX, res2djoint_YX = granger_std.grangercausalitytests(
                    dta_YX[dta_start:dta_end], dtaown_YX[dta_start:dta_end], dtajoint_YX[dta_start:dta_end],
                    lag, addconst=True, verbose=False)
                # The XY test runs only when the YX test is significant.
                if p_value_YX < significant_thres:
                    p_value_XY, res2down_XY, res2djoint_XY = granger_std.grangercausalitytests(
                        dta_XY[dta_start:dta_end], dtaown_XY[dta_start:dta_end], dtajoint_XY[dta_start:dta_end],
                        lag, addconst=True, verbose=False)
                else:
                    p_value_XY = -1
            elif test_mode == 'fast_version_1':
                # only check F_upper
                p_value_YX, res2down_YX, res2djoint_YX, res2down_ssr_upper_YX, res2djoint_ssr_lower_YX, res2djoint_df_resid_YX, reset_cnt_YX, if_non_zero_columns_YX = prune.grangercausalitytests_check_F_upper(
                    dta_YX[dta_start:dta_end], dtaown_YX[dta_start:dta_end], dtajoint_YX[dta_start:dta_end],
                    lag, res2down_YX, res2djoint_YX, res2down_ssr_upper_YX, res2djoint_ssr_lower_YX,
                    res2djoint_df_resid_YX, if_non_zero_columns_YX, significant_thres, step, reset_cnt_YX,
                    addconst=True, verbose=False)
                # A YX p-value below zero is excluded as well as a
                # non-significant one.
                if p_value_YX < significant_thres and p_value_YX >= 0:
                    p_value_XY, res2down_XY, res2djoint_XY = granger_std.grangercausalitytests(
                        dta_XY[dta_start:dta_end], dtaown_XY[dta_start:dta_end], dtajoint_XY[dta_start:dta_end],
                        lag, addconst=True, verbose=False)
                else:
                    p_value_XY = -1
            elif test_mode == 'fast_version_2':
                # check F_upper then check F_lower
                total_cnt_segment_YX += 1
                p_value_YX, res2down_YX, res2djoint_YX, res2down_ssr_upper_YX, res2down_ssr_lower_YX, res2djoint_ssr_upper_YX, res2djoint_ssr_lower_YX, res2djoint_df_resid_YX, reset_cnt_YX, cnt_prune_YX, time_prune_YX, if_non_zero_columns_YX \
                    = prune.grangercausalitytests_check_F_upper_lower(dta_YX[dta_start:dta_end], dtaown_YX[dta_start:dta_end], dtajoint_YX[dta_start:dta_end],
                                                                      lag, res2down_YX, res2djoint_YX, res2down_ssr_upper_YX, res2down_ssr_lower_YX,
                                                                      res2djoint_ssr_upper_YX, res2djoint_ssr_lower_YX, res2djoint_df_resid_YX,
                                                                      if_non_zero_columns_YX, significant_thres, step, reset_cnt_YX,
                                                                      cnt_prune_YX, time_prune_YX, addconst=True, verbose=False)
                if p_value_YX < significant_thres and p_value_YX >= 0:
                    total_cnt_segment_XY += 1
                    p_value_XY, res2down_XY, res2djoint_XY = granger_std.grangercausalitytests(
                        dta_XY[dta_start:dta_end], dtaown_XY[dta_start:dta_end], dtajoint_XY[dta_start:dta_end],
                        lag, addconst=True, verbose=False)
                else:
                    p_value_XY = -1
            elif test_mode == 'fast_version_3':
                # check YX then check XY, both through the pruning test
                total_cnt_segment_YX += 1
                p_value_YX, res2down_YX, res2djoint_YX, res2down_ssr_upper_YX, res2down_ssr_lower_YX, res2djoint_ssr_upper_YX, res2djoint_ssr_lower_YX, res2djoint_df_resid_YX, reset_cnt_YX, cnt_prune_YX, time_prune_YX, if_non_zero_columns_YX \
                    = prune.grangercausalitytests_check_F_upper_lower(dta_YX[dta_start:dta_end], dtaown_YX[dta_start:dta_end], dtajoint_YX[dta_start:dta_end],
                                                                      lag, res2down_YX, res2djoint_YX, res2down_ssr_upper_YX, res2down_ssr_lower_YX,
                                                                      res2djoint_ssr_upper_YX, res2djoint_ssr_lower_YX, res2djoint_df_resid_YX,
                                                                      if_non_zero_columns_YX, significant_thres, step, reset_cnt_YX,
                                                                      cnt_prune_YX, time_prune_YX, addconst=True, verbose=False)
                if p_value_YX < significant_thres and p_value_YX >= 0:
                    total_cnt_segment_XY += 1
                    p_value_XY, res2down_XY, res2djoint_XY, res2down_ssr_upper_XY, res2down_ssr_lower_XY, res2djoint_ssr_upper_XY, res2djoint_ssr_lower_XY, res2djoint_df_resid_XY, reset_cnt_XY, cnt_prune_XY, time_prune_XY, if_non_zero_columns_XY \
                        = prune.grangercausalitytests_check_F_upper_lower(dta_XY[dta_start:dta_end], dtaown_XY[dta_start:dta_end], dtajoint_XY[dta_start:dta_end],
                                                                          lag, res2down_XY, res2djoint_XY, res2down_ssr_upper_XY, res2down_ssr_lower_XY,
                                                                          res2djoint_ssr_upper_XY, res2djoint_ssr_lower_XY, res2djoint_df_resid_XY,
                                                                          if_non_zero_columns_XY, significant_thres, step, reset_cnt_XY,
                                                                          cnt_prune_XY, time_prune_XY, addconst=True, verbose=False)
                else:
                    p_value_XY = -1
                    # XY test skipped: refresh the XY bounds from the last
                    # fitted XY models, when there are any.
                    if res2down_XY != None and res2djoint_XY != None:
                        res2down_ssr_upper_XY, res2down_ssr_lower_XY, res2djoint_ssr_upper_XY, res2djoint_ssr_lower_XY, res2djoint_df_resid_XY, if_non_zero_columns_XY = prune.update_bound(
                            dta_XY[dta_start:dta_end], dtaown_XY[dta_start:dta_end], dtajoint_XY[dta_start:dta_end],
                            res2down_XY, res2djoint_XY, res2down_ssr_upper_XY, res2down_ssr_lower_XY,
                            res2djoint_ssr_upper_XY, res2djoint_ssr_lower_XY, res2djoint_df_resid_XY,
                            if_non_zero_columns_XY, lag, step, addconst=True, verbose=False)
                        # Sanity check: a fitted SSR above its upper bound
                        # is reported.
                        if res2down_XY.ssr > res2down_ssr_upper_XY or res2djoint_XY.ssr > res2djoint_ssr_upper_XY:
                            print('error')
            array_results_YX[i, j] = p_value_YX
            array_results_XY[i, j] = p_value_XY
            time4 = timeit.default_timer()
            time_granger += (time4 - time3)
            # ------------------------------ stationary test ------------------------------
            time5 = timeit.default_timer()
            if para_set.cal_stationary_separately == 0:
                total_cnt_segment_adf += 1
                # ADF is attempted only when YX is significant and XY is
                # above the threshold.
                if p_value_YX < significant_thres and p_value_YX >= 0 and p_value_XY > significant_thres:
                    total_cnt_segment_examine_adf_Y += 1
                    # Column 1 of array_XY is tested first, column 0 only
                    # if that result is significant.
                    adfstat_Y, pvalue_Y, usedlag_Y, nobs_Y, critvalues_Y, icbest_Y = adfuller(
                        array_XY[start:end, 1], lag)
                    if pvalue_Y < significant_thres and pvalue_Y >= 0:
                        adfstat_X, pvalue_X, usedlag_X, nobs_X, critvalues_X, icbest_X = adfuller(
                            array_XY[start:end, 0], lag)
                        total_cnt_segment_cal_adf += 1
                    else:
                        pvalue_X = -1
                else:
                    pvalue_Y = -1
                    pvalue_X = -1
            else:
                # ADF is run regardless of the Granger outcome.
                total_cnt_segment_examine_adf_Y += 1
                adfstat_Y, pvalue_Y, usedlag_Y, nobs_Y, critvalues_Y, icbest_Y = adfuller(
                    array_XY[start:end, 1], lag)
                if pvalue_Y < significant_thres and pvalue_Y >= 0:
                    adfstat_X, pvalue_X, usedlag_X, nobs_X, critvalues_X, icbest_X = adfuller(
                        array_XY[start:end, 0], lag)
                    total_cnt_segment_cal_adf += 1
                else:
                    pvalue_X = -1
            array_adf_results_Y[i, j] = pvalue_Y
            array_adf_results_X[i, j] = pvalue_X
            time6 = timeit.default_timer()
            time_adf += (time6 - time5)
    time2 = timeit.default_timer()
    total_time = time2 - time1
    print('total time: ' + str(time2 - time1))
    # Bundle timings and counters for the output writer.
    time_set = [time1, time2, time_granger, time_adf]
    cnt_set = [
        total_cnt_segment_YX, cnt_prune_YX, time_prune_YX, total_cnt_segment_XY, cnt_prune_XY, time_prune_XY,
        total_cnt_segment_adf, total_cnt_segment_cal_adf, total_cnt_segment_examine_adf_Y
    ]
    output.output_causal(path_to_output, X_name, Y_name, time_set, cnt_set, array_results_YX, array_results_XY,
                         array_adf_results_X, array_adf_results_Y, list_segment_split, para_set)
    return total_time, time_granger, time_adf, total_cnt_segment_YX, cnt_prune_YX, time_prune_YX, total_cnt_segment_XY, cnt_prune_XY, time_prune_XY
def setup_class(cls):
    """Run the ADF test on ``cls.y`` and store the expected reference values.

    The test uses ``regression="ct"`` (constant plus linear trend) with a
    fixed lag order of 1 (``autolag=None`` disables automatic lag selection).
    """
    adf_options = {"regression": "ct", "autolag": None, "maxlag": 1}
    cls.res1 = adfuller(cls.y, **adf_options)

    # Expected statistic, p-value and critical values
    # (origin of the reference numbers is not shown here).
    cls.teststat, cls.pvalue = -4.425093, .00199633
    cls.critvalues = [-4.006, -3.437, -3.137]
# Calculate the Augmented Dickey-Fuller stationarity test for a time series.
from pandas import read_csv
from statsmodels.tsa.stattools import adfuller

# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0. Squeezing the single-column frame explicitly yields the same
# Series on both old and new pandas versions.
series = read_csv(
    'daily-total-female-births.csv',
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")
X = series.values

# adfuller returns (statistic, p-value, used lag, nobs, critical values, icbest)
# when called with its default autolag setting.
result = adfuller(X)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
# 3. Unit-root test: Dickey-Fuller test
# A unit-root test checks whether the series contains a unit root; if it does,
# the series is a non-stationary time series.
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import numpy as np

df = pd.read_csv("../production.csv")
data = np.array(df["production"].values)
result = adfuller(data)

# The first four entries of the result are scalars; the fifth is the dict of
# critical values keyed by significance level.
output = dict(zip(
    [
        'Test Statistic Value',
        'p-value',
        'Lags Used',
        'Number of Observations Used',
    ],
    result[:4],
))
critical_values = result[4]
output['Critical Value(1%)'] = critical_values['1%']
output['Critical Value(5%)'] = critical_values['5%']
output['Critical Value(10%)'] = critical_values['10%']
print(output)

# Returned values:
# Test Statistic Value: the test statistic
# p-value: probability associated with the t-statistic; it must be below the
#          chosen significance level before the null hypothesis can be rejected
#          (the closer to zero, the stronger the rejection).
#          If the p-value is close to 0.05, decide by comparing the
#          Test Statistic Value against the Critical Values instead.
# Lags Used: the lag order
# Number of Observations Used: number of data points used in the test
# Critical Value(1%, 5%, 10%): thresholds for rejecting the null hypothesis
#                              at the different significance levels
# The null hypothesis of the ADF test is that a unit root exists.
# If the statistic is below the 1% critical value, the null hypothesis is
# rejected very significantly and the data is considered stationary.
# Note: the ADF statistic is usually negative but can be positive; only a value
# below the 1% critical value counts as a highly significant rejection.
# Load the dataset and index it by month.
df = pd.read_csv('data/AirPassengers.csv')
print(df.dtypes)
df['Month'] = pd.to_datetime(df['Month'])
print(df.dtypes)
df.set_index('Month', inplace=True)
plt.plot(df['Passengers'])

# Is the data stationary?
# Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller

# adfuller expects a one-dimensional series, so pass the column explicitly
# instead of the whole DataFrame (which only works while it has one column).
adf, pvalue, usedlag_, nobs_, critical_values_, icbest_ = adfuller(df['Passengers'])
print("pvalue = ", pvalue, " if above 0.05, data is not stationary")
# Since data is not stationary, we may need SARIMA and not just ARIMA

# Calendar helper columns; 'year' must exist before it is read just below and
# used as the x-axis of the yearly boxplot.
df['year'] = [d.year for d in df.index]
df['month'] = [d.strftime('%b') for d in df.index]
years = df['year'].unique()

# Plot yearly and monthly values as boxplot
sns.boxplot(x='year', y='Passengers', data=df)
sns.boxplot(x='month', y='Passengers', data=df)

# Extract and plot trend, seasonal and residuals.
from statsmodels.tsa.seasonal import seasonal_decompose
def setup_class(cls):
    """Run the ADF test on ``cls.x`` and store the expected reference values.

    The test uses ``regression="ct"`` (constant plus linear trend) with a
    fixed lag order of 4 (``autolag=None`` disables automatic lag selection).
    """
    adf_options = {"regression": "ct", "autolag": None, "maxlag": 4}
    cls.res1 = adfuller(cls.x, **adf_options)

    # Expected statistic, p-value and critical values
    # (origin of the reference numbers is not shown here).
    cls.teststat, cls.pvalue = -1.8566374, .67682968
    cls.critvalues = [-4.007, -3.437, -3.137]
def get_engle_granger_two_step_cointegration_test(y, x):
    """Apply the two-step Engle & Granger cointegration test to series y and x.

    Uses a 2-step process to first estimate coefficients for the long-run
    relationship
        y_t = c + gamma * x_t + z_t
    and then the short-run relationship
        y_t - y_(t-1) = alpha * z_(t-1) + epsilon_t,
    with z the residuals of the first equation.

    Cointegration is then tested with a Dickey-Fuller test of phi = 1 versus
    phi < 1 in
        z_t = phi * z_(t-1) + eta_t.
    If the test implies phi < 1, the z series is concluded to be stationary,
    and thus the series y and x are concluded to be cointegrated.

    Parameters
    ----------
    y : pd.Series
        The first time series of the pair to analyse.
    x : pd.Series
        The second time series of the pair to analyse.

    Returns
    -------
    c : float
        The constant term in the long-run relationship
        y_t = c + gamma * x_t + z_t. This describes the static shift of y
        with respect to gamma * x.
    gamma : float
        The gamma term in the long-run relationship
        y_t = c + gamma * x_t + z_t. This describes the ratio between the
        const-shifted y and x.
    alpha : float
        The alpha term in the short-run relationship
        y_t - y_(t-1) = alpha * z_(t-1) + epsilon. This gives an indication of
        the strength of the error correction toward the long-run mean.
    z : pd.Series
        Series of residuals z_t from the long-run relationship, representing
        the value of the error correction term.
    adfstat : float
        The Dickey-Fuller test statistic for phi = 1 vs phi < 1. A more
        negative value implies the existence of stronger cointegration.
    pvalue : float
        The p-value corresponding to the Dickey-Fuller test statistic. A lower
        value implies stronger rejection of no-cointegration, thus stronger
        evidence of cointegration.
    """
    # Silence FutureWarning only while building the long-run model. The
    # context manager restores the caller's warning filters afterwards instead
    # of forcing FutureWarning back to "default" for the whole process.
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=FutureWarning)
        long_run_ols = sm.OLS(y, sm.add_constant(x))
    long_run_ols_fit = long_run_ols.fit()

    c, gamma = long_run_ols_fit.params
    z = long_run_ols_fit.resid

    # Drop the first observation: both the difference of y and the lagged
    # residual are undefined (NaN) there.
    short_run_ols = sm.OLS(y.diff().iloc[1:], z.shift().iloc[1:])
    short_run_ols_fit = short_run_ols.fit()
    # `params` is a label-indexed Series for pandas input, so select the single
    # coefficient by position explicitly (`params[0]` is deprecated there).
    alpha = short_run_ols_fit.params.iloc[0]

    # NOTE: The p-value returned by the adfuller function assumes we do not
    # estimate z first, but test stationarity of an unestimated series
    # directly. This assumption should have limited effect for high N, however.
    # Critical values taking this into account more accurately are provided in
    # e.g. McKinnon (1990) and Engle & Yoo (1987).
    # With autolag=None, adfuller returns five values (no icbest).
    adfstat, pvalue, _, _, _ = adfuller(z, maxlag=1, autolag=None)

    return c, gamma, alpha, z, adfstat, pvalue
def setup_class(cls):
    """Run the ADF test on ``cls.x`` and store the expected reference values.

    The test uses ``regression="c"`` (constant only) with a fixed lag order
    of 4 (``autolag=None`` disables automatic lag selection).
    """
    adf_options = {"regression": "c", "autolag": None, "maxlag": 4}
    cls.res1 = adfuller(cls.x, **adf_options)

    # Expected statistic, p-value and critical values
    # (origin of the reference numbers is not shown here).
    cls.teststat, cls.pvalue = .97505319, .99399563
    cls.critvalues = [-3.476, -2.883, -2.573]
# fii = mod.fit() # print(fii.summary()) # # plt.plot(par_time, par_avg, color='blue') # plt.plot(par_time, fii.predict(poly.fit_transform(par_time)), color='red') # plt.ylim([-700, 14000]) # plt.title('Trajectory of $cp_{t_k}^-$') # plt.xlabel('Time Steps') # plt.ylabel('$cp_{t_k}^-$') # plt.show() from statsmodels.tsa.stattools import adfuller par = plus_ccpp_t par_avg = [np.mean(x) for x in par if x != []] result = adfuller(par_avg) # print('ADF Statistic: %f' % result[0]) print('p-value: %.2f' % result[1]) # print('Critical Values:') # for key, value in result[4].items(): # print('\t%s: %.3f' % (key, value)) # pi_t is a quadratic function. polynomial regression par = pioo_t par_avg = [np.mean(x) for x in par if x != []] par_time = list(range(1, len(par_avg) + 1)) # plt.plot(par_avg) from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression poly = PolynomialFeatures(degree=2)
def __init__(self):
    """Run the ADF test on ``self.y`` and store the expected reference values.

    The test uses ``regression="c"`` (constant only) with a fixed lag order
    of 1 (``autolag=None`` disables automatic lag selection).
    """
    adf_options = {"regression": "c", "autolag": None, "maxlag": 1}
    self.res1 = adfuller(self.y, **adf_options)

    # Expected statistic, p-value and critical values
    # (origin of the reference numbers is not shown here).
    self.teststat, self.pvalue = -4.3346988, .00038661
    self.critvalues = [-3.476, -2.883, -2.573]
# Solutions from statsmodels.tsa.stattools import kpss kpss_stat, p_value, lags, crit = kpss(X_stationary) p_value > 0.05, p_value # Accept hypothesis of stationarity kpss_stat, p_value, lags, crit = kpss(X_non_stationary) p_value > 0.05, p_value # Reject hypothesis of stationarity kpss_stat, p_value, lags, crit = kpss(X_AR1) p_value > 0.05, p_value # Reject hypothesis of stationarity from statsmodels.tsa.stattools import adfuller adf, p_value, *_ = adfuller(X_stationary) p_value < 0.05, p_value # Reject hypothesis of a unit root, which indicates stationary
def __init__(self):
    """Run the ADF test on ``self.x`` and store the expected reference values.

    The test uses ``regression="c"`` (constant only) with a fixed lag order
    of 4 (``autolag=None`` disables automatic lag selection).
    """
    adf_options = {"regression": "c", "autolag": None, "maxlag": 4}
    self.res1 = adfuller(self.x, **adf_options)

    # Expected statistic, p-value and critical values
    # (origin of the reference numbers is not shown here).
    self.teststat, self.pvalue = .97505319, .99399563
    self.critvalues = [-3.476, -2.883, -2.573]
def test_adfuller_maxlag_too_large(reset_randomstate):
    """adfuller must raise ValueError when maxlag is too large for the sample."""
    sample = np.random.standard_normal(100)
    expected_message = 'maxlag must be less than'
    # 51 lags on 100 observations must be rejected with the message above.
    with pytest.raises(ValueError, match=expected_message):
        adfuller(sample, maxlag=51)
return pd.Series(diff) # invert differenced value def inverse_difference(history, yhat, interval=1): return yhat + history[-interval] # difference data months_in_year = 12 stationary = difference(X, months_in_year) stationary.index = dataPiece.index[months_in_year:] # check if stationary result = adfuller(stationary) print('ADF Statistic: %f' % result[0]) print('p-value: %f' % result[1]) print('Critical Values:') for key, value in result[4].items(): print('\t%s: %.3f' % (key, value)) # save stationary.to_csv('stationary.csv') # plot stationary.plot() plt.show()
def augDickeyFuller(series, lag):
    """Return the result of ``ts.adfuller`` for ``series``.

    ``lag`` is forwarded as the second positional argument of ``ts.adfuller``
    (presumably statsmodels' ``maxlag`` -- confirm against how ``ts`` is
    imported in this file). The result is returned unchanged.
    """
    return ts.adfuller(series, lag)
# (Disabled) remainder of an earlier helper-based stationarity report:
# dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
# for key, value in dftest[4].items():
#     dfoutput['Critical Value (%s)' % key] = value
# print dfoutput
# test_stationarity(volume)

# Rolling 12-period mean and standard deviation, plotted with the raw series.
ma = volume.rolling(window=12,center=False).mean()
msd = volume.rolling(window=12,center=False).std()
#msd = pd.rolling_std(volume,window=12,center=False)
plt.plot(volume,'blue')
plt.plot(ma,'green')
plt.plot(msd,'red')
plt.show()
# The moving average and moving deviation is increasing over year

# ADF test on the 'volume' column; the result tuple is
# (statistic, p-value, used lag, nobs, critical values, icbest).
adtestoutput = adfuller(volume.volume)
print('Test Statistic: %.6f' % adtestoutput[0])
print('p-value: %.6f' % adtestoutput[1])
# Lag count and observation count are integers, so print them as such.
print('#Lags Used: %d' % adtestoutput[2])
print('Number of Observations Used %d' % adtestoutput[3])
# Critical values in ascending significance level; the 10% label previously
# lacked its closing parenthesis.
print('Critical Value (1%%) %.6f' % adtestoutput[4]['1%'])
print('Critical Value (5%%) %.6f' % adtestoutput[4]['5%'])
print('Critical Value (10%%) %.6f' % adtestoutput[4]['10%'])

#####Task 3:Make a Time Series stationary – 5pts
#QA plot log
logvolume = np.log(volume)
logvolume.plot()
pyplot.show()
#QC log data plot
# Import the adfuller module from statsmodels
from statsmodels.tsa.stattools import adfuller

# Closing-price series used by all three tests below.
ho_close = HO['Close']
ng_close = NG['Close']

# ADF test on each price series; index 1 of the result is the p-value.
result_HO = adfuller(ho_close)
print("The p-value for the ADF test on HO is ", result_HO[1])

result_NG = adfuller(ng_close)
print("The p-value for the ADF test on NG is ", result_NG[1])

# ADF test on the spread between 7.25 times HO and NG.
spread = 7.25 * ho_close - ng_close
result_spread = adfuller(spread)
print("The p-value for the ADF test on the spread is ", result_spread[1])
# Test: # #### H<sub>0</sub>: β = 1 (This is a random walk) # #### H<sub>1</sub>: β < 1 (This is not a random walk) # <br> # Dickey-Fuller Test: # #### H<sub>0</sub>: β = 0 (This is a random walk) # #### H<sub>1</sub>: β < 0 (This is not a random walk) # ### Augmented Dickey-Fuller test # An augmented Dickey–Fuller test (ADF) tests the null hypothesis that a unit root is present in a time series sample. It is basically Dickey-Fuller test with more lagged changes on RHS. # In[ ]: # Augmented Dickey-Fuller test on volume of google and microsoft stocks adf = adfuller(microsoft["Volume"]) print("p-value of microsoft: {}".format(float(adf[1]))) adf = adfuller(google["Volume"]) print("p-value of google: {}".format(float(adf[1]))) # ##### As microsoft has p-value 0.0003201525 which is less than 0.05, null hypothesis is rejected and this is not a random walk. # ##### Now google has p-value 0.0000006510 which is more than 0.05, null hypothesis is rejected and this is not a random walk. # ### Generating a random walk # In[ ]: seed(42) rcParams['figure.figsize'] = 16, 6 random_walk = normal(loc=0, scale=0.01, size=1000)
%matplotlib inline
# Load the sales data and preview it.
df=pd.read_csv('time_series_data.csv')
df.head()
# Updating the header
df.columns=["Month","Sales"]
df.head()
df.describe()
# Index the frame by the 'Month' column.
df.set_index('Month',inplace=True)
from pylab import rcParams
rcParams['figure.figsize'] = 15, 7
df.plot()
# ADF unit-root test on the 'Sales' column; the result is stored but not
# inspected in this part of the file.
from statsmodels.tsa.stattools import adfuller
test_result=adfuller(df['Sales'])

#case4
#https://www.digitalocean.com/community/tutorials/a-guide-to-time-series-forecasting-with-arima-in-python-3
import warnings
import itertools
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Load the CO2 example dataset bundled with statsmodels.
data = sm.datasets.co2.load_pandas()
def testStationarity(ts, window, initialPlotDate='', finalPlotDate='',
                     saveImg=False, saveDir='', saveName='', saveFormat='pdf'):
    """Plot rolling statistics of ``ts`` and annotate them with ADF test results.

    Parameters
    ----------
    ts : pandas Series
        Time series to analyse; missing values are dropped before the rolling
        statistics and the test are computed.
    window : int
        Window size of the rolling mean and rolling standard deviation.
    initialPlotDate, finalPlotDate : optional
        Bounds of the plotted slice; when empty, the first / last index value
        of ``ts`` is used.
    saveImg : bool
        When true, the figure is written to disk.
    saveDir : str
        Target directory for the saved figure ('' means the current directory).
    saveName : str
        File name without extension; defaults to ``'<ts.name>_ADF'``.
    saveFormat : str
        File extension / format passed on through the file name.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes that were created.
    """
    import os  # local import: only needed to build the output path

    initialPlotDate = initialPlotDate if initialPlotDate else ts.index[0]
    finalPlotDate = finalPlotDate if finalPlotDate else ts.index[-1]

    # Determine rolling statistics on the series without missing values.
    clean = ts.dropna()
    rolmean = clean.rolling(window=window, center=False).mean()
    rolstd = clean.rolling(window=window, center=False).std()

    fig, ax = plt.subplots(figsize=(15, 10), nrows=1, ncols=1, sharex=True)

    # Plot rolling statistics.
    ax.plot(ts[initialPlotDate:finalPlotDate], color='blue', label='Original')
    ax.plot(rolmean[initialPlotDate:finalPlotDate], color='red',
            label='Rolling Mean')
    ax.plot(rolstd[initialPlotDate:finalPlotDate], color='black',
            label='Rolling Std')
    ax.legend(loc='best')
    ax.set_title('Rolling Mean & Standard Deviation')

    # Perform the Dickey-Fuller test and collect its output in one Series.
    dftest = adfuller(clean, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=[
        'Test Statistic', 'p-value', '#Lags Used',
        'Number of Observations Used'
    ])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value

    # Write the report below the axes; the field widths differ per row so
    # the numbers line up under the proportional figure font.
    plt.figtext(0.1, 0.010, 'Results of Dickey-Fuller Test:', size=14,
                verticalalignment='center')
    report_rows = [
        (-0.025, 'Test Statistic {:48.6f}', 'Test Statistic'),
        (-0.050, 'p-value {:58.6f}', 'p-value'),
        (-0.075, '#Lags Used {:51.6f}', '#Lags Used'),
        (-0.100, 'Number of Observations Used {:20.6f}',
         'Number of Observations Used'),
        (-0.125, 'Critical Value (1%) {:41.6f}', 'Critical Value (1%)'),
        (-0.150, 'Critical Value (5%) {:41.6f}', 'Critical Value (5%)'),
        (-0.175, 'Critical Value (10%) {:39.6f}', 'Critical Value (10%)'),
    ]
    for y_pos, template, label in report_rows:
        plt.figtext(0.1, y_pos, template.format(dfoutput[label]), size=14)

    if saveImg:
        # Fall back to the series' own name (the previous code referenced an
        # undefined variable `s` here and raised NameError).
        saveName = saveName if saveName else '{}_ADF'.format(ts.name)
        # os.path.join keeps an empty saveDir relative to the current
        # directory instead of producing a path at the filesystem root.
        target = os.path.join(saveDir, '{}.{}'.format(saveName, saveFormat))
        fig.savefig(target, bbox_inches='tight')

    return fig, ax
# Monthly mean resampling
data_m = file['Temp'].resample('MS').mean()

# Plot the resampled series
# print(data_m.head())
data_m.plot(figsize=(15, 5))
plt.show()

# The series shows annual periodicity but no trend component
rcParams['figure.figsize'] = (15, 5)
decomposition = sm.tsa.seasonal_decompose(data_m, model='additive')
fig = decomposition.plot()
plt.show()

# A p-value < 0.001 (a significant value) means the hypothesis of
# non-stationarity is rejected
result = adfuller(file.Temp, autolag='AIC')
print(f'p-value: {result[1]}')

# Define the parameters p, d and q; each takes the values 0 and 1
p = d = q = range(0, 2)

# Generate every combination of the (p, d, q) triplets
pdq = list(itertools.product(p, d, q))

# Generate every seasonal (P, D, Q, 12) combination (seasonal period of 12)
seasonal_pdq = [(*order, 12) for order in itertools.product(p, d, q)]

warnings.filterwarnings("ignore")

# Pair every ARIMA (p, d, q) triplet with every seasonal (P, D, Q, 12) tuple
parameters = list(itertools.product(pdq, seasonal_pdq))