def draw_acf_pacf(ts, lags=31):
    # type: (object, object) -> object
    """Show the ACF and PACF of ``ts`` stacked in one white-background figure.

    :param ts: data handed unchanged to ``plot_acf`` and ``plot_pacf``.
    :param lags: lag specification forwarded to both plots (default 31).
    """
    f = plt.figure(facecolor='white')
    ax1 = f.add_subplot(211)
    # Forward the caller's ``lags``; it used to be ignored in favour of a
    # hard-coded 31 (which is still the default, so default calls are unchanged).
    plot_acf(ts, lags=lags, ax=ax1)
    ax2 = f.add_subplot(212)
    plot_pacf(ts, lags=lags, ax=ax2)
    plt.show()
def draw_acf_pacf(ts, lags=1):
    """Show the ACF (top) and PACF (bottom) of ``ts`` in one white figure.

    :param ts: data handed unchanged to ``plot_acf`` and ``plot_pacf``.
    :param lags: accepted but not used by the body.
    """
    # NOTE(review): ``lags`` is ignored -- both plots always use 31 lags.
    # Confirm whether the parameter should be honoured (default is 1).
    figure = plt.figure(facecolor='white')
    for position, plotter in ((211, plot_acf), (212, plot_pacf)):
        plotter(ts, lags=31, ax=figure.add_subplot(position))
    plt.show()
def draw_ACFs(df):
    """Draw a five-row figure of residual diagnostics and show it.

    :param df: pandas.DataFrame indexed by the ``TimeSeriesDataFrameMap`` column keys.
    """
    columns = TimeSeriesDataFrameMap

    def caption(axis, text):
        # Put the caption just inside the top-right corner of the axes.
        axis.annotate(text, (1, 1), xytext=(-8, -8), ha='right', va='top',
                      size=14, xycoords='axes fraction',
                      textcoords='offset points')

    fig, axes = plt.subplots(nrows=5, figsize=(8, 12))
    fig.tight_layout()

    # NOTE(review): the top panel plots the squared residuals but is
    # captioned 'Returns' -- confirm which one is intended.
    axes[0].plot(df[columns.Square_residuals])
    caption(axes[0], 'Returns')

    # (plotting function, column key, caption) for rows 1..4, in order.
    panels = (
        (plot_acf, columns.Residuals, 'Residuals autocorrelation'),
        (plot_acf, columns.Abs_residuals, 'Absolute residuals autocorrelation'),
        (plot_acf, columns.Square_residuals, 'Square residuals autocorrelation'),
        (plot_pacf, columns.Square_residuals, 'Square residuals partial autocorrelation'),
    )
    for axis, (plotter, column, text) in zip(axes[1:], panels):
        plotter(df[column], axis, lags=10)
        caption(axis, text)

    plt.show()
def d_param(self, diff):
    '''Check whether the ACF/PACF at a given differencing step stay near zero.

    Selects the series for the requested differencing step (``self.val`` for
    0, ``self.diff1_val`` / ``self.diff2_val`` with NaNs dropped for 1 / 2),
    saves a side-by-side ACF/PACF plot (40 lags) to a JPEG in the current
    directory, and reports whether at least 65% of both the ACF and the PACF
    values lie within +/-THRESHOLD.

    :param diff: differencing step, one of 0, 1 or 2.
    :returns: truthy when both shares are >= 0.65, falsy otherwise.
    :raises InvalidParamError: for any other ``diff``.
    '''
    THRESHOLD = 0.08
    MIN_SHARE = 0.65

    # Pick the series and output file lazily so that only the attribute
    # needed for the requested step is touched.
    if diff == 0:
        series, figure_name = self.val, 'ACF_vs_PACF.jpg'
    elif diff == 1:
        series, figure_name = self.diff1_val.dropna(), 'ACF_vs_PACF_diff1.jpg'
    elif diff == 2:
        series, figure_name = self.diff2_val.dropna(), 'ACF_vs_PACF_diff2.jpg'
    else:
        raise InvalidParamError

    acf = tss.acf(series)
    pacf = tss.pacf(series)

    # ACF (left) and PACF (right) on a shared y axis.
    fig = plt.figure(figsize=(12, 8))
    try:
        ax1 = fig.add_subplot(121)
        plot_acf(series, lags=40, ax=ax1)
        ax2 = fig.add_subplot(122, sharey=ax1)
        plot_pacf(series, lags=40, ax=ax2)
        fig.savefig(figure_name)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    # Share of coefficients lying inside the accepted band.
    acf_percent = np.count_nonzero(np.abs(acf) <= THRESHOLD) / float(len(acf))
    pacf_percent = np.count_nonzero(np.abs(pacf) <= THRESHOLD) / float(len(pacf))
    return (acf_percent >= MIN_SHARE) and (pacf_percent >= MIN_SHARE)
def test_plot_pacf(close_figures):
    """Smoke test: ``plot_pacf`` runs with and without a confidence band.

    ``close_figures`` is requested but not used in the body (presumably a
    fixture that cleans up figures -- confirm where it is defined).
    """
    axis = plt.figure().add_subplot(111)
    process = tsp.ArmaProcess(np.r_[1., -0.9], np.r_[1., 0.9])
    rng = np.random.RandomState(1234)
    sample = process.generate_sample(100, distrvs=rng.standard_normal)
    for extra in ({}, {'alpha': None}):
        plot_pacf(sample, ax=axis, **extra)
def test_plot_pacf():
    """Smoke test: ``plot_pacf`` runs on the first 20 ``ArmaProcess.pacf`` values."""
    figure = plt.figure()
    axis = figure.add_subplot(111)
    process = tsp.ArmaProcess(np.r_[1., -0.9], np.r_[1., 0.9])
    values = process.pacf(20)[:20]
    for extra in ({}, {'alpha': None}):
        plot_pacf(values, ax=axis, **extra)
    plt.close(figure)
def plotds(xt, nlag=30, fig_size=(12, 10)):
    """Plot a series on top with its ACF and PACF side by side underneath.

    :param xt: the data; anything that is not a ``pd.Series`` is wrapped in one.
    :param nlag: number of lags shown in the ACF and PACF panels.
    :param fig_size: figure size passed to ``plt.figure``.
    :returns: None. The figure is left as the current pyplot figure and is
        not shown here.
    """
    if not isinstance(xt, pd.Series):
        xt = pd.Series(xt)

    plt.figure(figsize=fig_size)
    layout = (2, 2)

    # Assign axes: the series spans the top row, ACF/PACF share the bottom.
    ax_xt = plt.subplot2grid(layout, (0, 0), colspan=2)
    ax_acf = plt.subplot2grid(layout, (1, 0))
    ax_pacf = plt.subplot2grid(layout, (1, 1))

    # Plot graphs
    xt.plot(ax=ax_xt)
    ax_xt.set_title('Time Series')
    # Honour ``nlag``; it used to be ignored in favour of a hard-coded 50.
    plot_acf(xt, lags=nlag, ax=ax_acf)
    plot_pacf(xt, lags=nlag, ax=ax_pacf)
    plt.tight_layout()
    return None
def plot_acf_pacf(self, channel, lags=20):
    '''
    Show the ACF (top) and PACF (bottom) of one channel's "AVG CCV's" column.

    Input: channel and number of lags to include.
    Output: none returned; the figure is displayed with ``plt.show()``.
    '''
    # chan_filter's result is sorted by index in place before plotting.
    filtered = chan_filter(self.df, channel)
    filtered.sort_index(inplace=True)
    series = filtered["AVG CCV's"]

    canvas = plt.figure(figsize=(12, 8))
    # Keep working with the figure object handed back by plot_acf.
    canvas = plot_acf(series, lags=lags, ax=canvas.add_subplot(211))
    plot_pacf(series, lags=lags, ax=canvas.add_subplot(212))
    plt.show()
def test_plot_pacf_irregular():
    """Smoke test: ``plot_pacf`` runs with array lags and with ``zero=False``."""
    figure = plt.figure()
    axis = figure.add_subplot(111)
    process = tsp.ArmaProcess(np.r_[1., -0.9], np.r_[1., 0.9])
    rng = np.random.RandomState(1234)
    sample = process.generate_sample(100, distrvs=rng.standard_normal)

    variants = (
        {'lags': np.arange(1, 11)},
        {'lags': 10, 'zero': False},
        {'alpha': None, 'zero': False},
    )
    for options in variants:
        plot_pacf(sample, ax=axis, **options)
    plt.close(figure)
def test_plot_pacf_kwargs():
    """Check that line-style kwargs change the rendered PACF image.

    Renders the same sample three ways (defaults, ``ls='-'``, dash-dot
    ``vlines_kwargs``) to raw RGBA bytes and asserts all three differ.
    """
    process = tsp.ArmaProcess(np.r_[1., -0.9], np.r_[1., 0.9])
    rng = np.random.RandomState(1234)
    sample = process.generate_sample(100, distrvs=rng.standard_normal)

    def rendered(**plot_kwargs):
        # One fresh figure per variant, closed as soon as it is saved.
        figure = plt.figure()
        plot_pacf(sample, ax=figure.add_subplot(111), **plot_kwargs)
        sink = BytesIO()
        figure.savefig(sink, format='rgba')
        plt.close(figure)
        return sink.getvalue()

    plain = rendered()
    linestyle = rendered(ls='-')
    with_vlines = rendered(vlines_kwargs={'linestyles': 'dashdot'})

    assert_(plain != linestyle)
    assert_(with_vlines != plain)
    assert_(linestyle != with_vlines)
# artifacts in the plot. this works better
x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]
# make the plot
plt.plot(x,np.asarray(lcs),linewidth=1.0)
plt.show()
plt.clf()
# there's a definite linear, increasing trend...
# let's first try diffs
lcs_d1 = lcs.diff()
# the leading NaN from diff() is kept here so the series still lines up with x
plt.plot(x,np.asarray(lcs_d1),linewidth=1.0)
plt.show()
# trend is gone
# we could have used the signal package like this (with same result):
# import scipy.signal as sig
# lcs_dt = sig.detrend(lcs)
# plt.plot(x,lcs_dt,linewidth=2.0)
# plt.show()
# now plot the ACF of the transformed series
# (drop the leading NaN produced by diff(); with it the autocorrelations
# are all NaN and the plot is empty)
plt.figure()
st.plot_acf(lcs_d1.dropna())
plt.show()
plt.clf()
# and PACF
plt.figure()
st.plot_pacf(lcs_d1.dropna())
plt.show()
plt.clf()
# parenthesised single-argument form prints the same on Python 2 and 3
print("no autocorrelated structures")
plt.title('一阶差分') # plt.show() check = sm.tsa.stattools.adfuller(stock_diff) #平稳性检验 print(check) ''' result : (-8.940749717155187, 9.267799032663645e-15, 0, 149, {'1%': -3.4750180242954167, '5%': -2.8811408028842043, '10%': -2.577221358046935}, 561.3312278939167) 1%、%5、%10不同程度拒绝原假设的统计值和ADF Test result的比较,ADF Test result同时小于1%、5%、10%即说明非常好地拒绝该假设; P-value是否非常接近0 故通过平稳性检验; ''' acf = plot_acf(stock_diff, lags=20) plt.title('ACF') # acf.show() pacf = plot_pacf(stock_diff, lags=20) plt.title('PACF') # pacf.show() model = ARIMA(stock_train, order=(1, 1, 1), freq='W-MON') result = model.fit() #print(result.summary) pred = result.predict('20160829', '20181203', dynamic=True, typ='levels') plt.figure(figsize=(6, 6)) plt.xticks(rotation=45) plt.plot(pred) plt.plot(stock_train) plt.show()
#subtract regression line from loan count data to account for non-stationary data new_loan= [] item_count = 0 for item in loan_count_summary: new_loan.append(item-est.params[0]-est.params[1]*item_count) item_count +=1 loan_count_rev = pd.Series(new_loan, index = loan_count_summary.index) #create variables to plot regression line x = loan_count_summary.index y = est.params[0]+ est.params[1]*x #plot regression line with loan data and detrended data plt.plot(x, y, 'r-') loan_count_rev.plot() lcr = loan_count_summary.plot() lcr.set_xlabel('Time (1 unit is a Month)') lcr.set_ylabel('Loan Count') lcr.legend(['OLS Regression Line', 'Loan Count Detrended (Y-Regression Line)', 'Loan Count Raw Data']) plt.show() #plot auto-correlation and partial auto-correlation plot_acf(loan_count_rev) plt.show() plot_pacf(loan_count_rev) plt.show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.graphics.tsaplots as tsaplots

# header=1: the column names are taken from the second row of the file.
df = pd.read_csv('LoanStats3b.csv', header=1, low_memory=False)
# converts string to datetime object in pandas:
df['issue_d_format'] = pd.to_datetime(df['issue_d'])
dfts = df.set_index('issue_d_format')
# Group rows by a YYYYMM integer key and count the rows in each month.
year_month_summary = dfts.groupby(lambda x : x.year * 100 + x.month).count()
loan_count_summary = year_month_summary['issue_d']
# Both figures are created first and shown together by the single show().
tsaplots.plot_acf(loan_count_summary)
tsaplots.plot_pacf(loan_count_summary)
plt.show()
# Keep only the 'CO1 Comdty' column, drop missing values, sort by index.
df = df['CO1 Comdty']
df = df.dropna()
prices = df.sort_index()
# Explore the data a little
prices.plot()
prices.mean() # mean
prices.std() # standard deviation
prices.std() / prices.mean() # coefficient of variation
# the stock is very volatile, there are big jumps; the std. is rather large
# compared to the mean (high coefficient of variation)
tsaplots.plot_acf(prices, lags=36) # just terrible
tsaplots.plot_pacf(prices, lags=36) # very significant at lag=1
# this is a typical sign of an integrated process, so let's difference it
# (use returns instead of prices), which will hopefully yield a stationary process!
returns = prices.pct_change()
returns = returns[1:] # drop NA
returns.isnull().sum()
returns.plot()
returns.mean()
returns.std()
returns.std() / returns.mean()
# verify stationarity
adfuller(returns, regression="c")
## The printed p-value is close to 0,
## so the series has become stationary.
## Choose k = 1 (the first difference is stationary).

# ARIMA(p, k, q) => choose p and q
## Inspect everything on a 2 x 3 grid of subplots.
figure, axes = plt.subplots(2, 3, figsize=(15, 7))
axes[0, 0].plot(df.DEXKOUS)
axes[1, 0].plot(df.DEXKOUS.diff())
axes[0, 0].set_title('original series')
axes[1, 0].set_title('1st difference series')
plot_acf(df.DEXKOUS, axes[0, 1])
plot_pacf(df.DEXKOUS, axes[0, 2])
# diff() leaves a NaN in the first row; drop it before computing the
# ACF/PACF, otherwise the autocorrelations come out as NaN.
plot_acf(df.DEXKOUS.diff().dropna(), axes[1, 1])
plot_pacf(df.DEXKOUS.diff().dropna(), axes[1, 2])
plt.tight_layout()
plt.show()
## AR order: 3 down to 1
## MA order: 2 down to 0

# ARIMA forecasting model
## The ARIMA order is (3, 1, 2)
# freq='B' means business days only: exchange-rate data has no
# Saturday/Sunday observations, so those days are excluded.
model = ARIMA(
    df.DEXKOUS,
    order=(3, 1, 2),
    freq='B'
)
import statsmodels.api as sm
import warnings
from statsmodels.tsa.stattools import acovf, acf,pacf,pacf_yw,pacf_ols
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Non-stationary data set (monthly; index frequency set to month start).
df1 = pd.read_csv('./airline_passengers.csv',index_col='Month',parse_dates=True)
df1.index.freq = 'MS'
#df2 = pd.read_csv('statmodel/Data/DailyTotalFemaleBirths.csv', index_col='Data',parse_dates=True)
#df2.index.freq = 'D'
# Daily data set (index frequency set to daily).
df2 = pd.read_csv('statmodel/Data/DailyTotalFemaleBirths.csv',index_col='Date',parse_dates=True)
df2.index.freq = 'D'
#warnings.filterwarnings('ignore')
df1.plot()
plt.show()
# non-stationary
plot_acf(df1, lags=40)
# stationary, no seasonality here
plot_acf(df2,lags=40)
plt.show()
plot_pacf(df2,lags=40, title="Partial Auto Correlation")
plt.show()
# In[246]:

import statsmodels.graphics.tsaplots as tsplots

tsplots.plot_acf(returns, lags= 20)
plt.show()


# In[247]:

returns[1:10]


# In[248]:

tsplots.plot_pacf(returns, lags= 20)
plt.show()


# In[249]:

# bootstrap_plot lives in pandas.plotting in current pandas; fall back to
# the old pandas.tools.plotting location for very old installations.
try:
    from pandas.plotting import bootstrap_plot
except ImportError:
    from pandas.tools.plotting import bootstrap_plot

bootstrap_plot(returns, size = 50)
plt.show()


# Find out the following facts about the data set
# - Total number of data points
# - Number of positive returns
# - Number of negative returns
# - Average annualized returns
def PlotPacf(self, dataset, title, indices=None, nlags=40):
    """Plot and show the partial autocorrelation function of ``dataset``.

    :param dataset: object exposing the observations as ``.values``.
    :param title: plot title prefix; ``'PACF'`` is appended to it.
    :param indices: unused; kept so existing callers keep working.
    :param nlags: number of lags to display.
    """
    # plot_pacf computes the PACF itself, so it must be given the raw
    # observations. Previously the already-computed PACF values were passed
    # in, which plotted the PACF *of the PACF* and ignored ``nlags`` in the plot.
    plot_pacf(dataset.values, lags=nlags, title=title + 'PACF')
    plt.show()
from pandas import read_csv from statsmodels.graphics.tsaplots import plot_acf from statsmodels.graphics.tsaplots import plot_pacf # ACF = AR # PACF = MA from matplotlib import pyplot as pp thisdir = os.getcwd() print(thisdir) series = read_csv(r'data\stationary.csv', header=None, index_col=0, parse_dates=True, squeeze=True) pp.figure() pp.subplot(211) plot_acf(series, ax=pp.gca()) pp.subplot(212) plot_pacf(series, ax=pp.gca()) pp.tight_layout(pad=3.0) pp.show() #%% Testen ARIMA # Uit bovenstaande ACF en PACF from pandas import read_csv from sklearn.metrics import mean_squared_error from statsmodels.tsa.arima_model import ARIMA from math import sqrt # create a differenced series def difference(dataset, interval=1): diff = list() for i in range(interval, len(dataset)): value = dataset[i] - dataset[i - interval]
'''
Created on Dec 11, 2018

@author: snake91
'''
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

# 500 normal draws and their trailing 10-point moving average.
x = np.random.normal(size=500)
window = 10
# y[k] is the mean of the `window` values that precede position k + window.
y = [np.mean(x[i - window:i]) for i in range(window, len(x))]

# Overlay the raw values (from position `window` on) and the moving average.
plt.plot(x[window:len(x)])
plt.plot(y)

# ACF/PACF of the raw series, then of the moving average.
# Nothing here calls plt.show().
plot_acf(x, lags=10)
plot_pacf(x, lags=10)
plot_acf(y, lags=10)
plot_pacf(y, lags=10)
# stationory stationoryResult = adfuller(targetData) # print 'adf: ', stationoryResult[0] # print 'p-value: ', stationoryResult[1] # print 'Critical values: ', stationoryResult[4] # if stationoryResult[0]> stationoryResult[4]['5%']: # print 'Time Series is nonstationary' # else: # print 'Time Series is stationary' # acf, pacf f2 = plt.figure(facecolor='white') ax1 = f2.add_subplot(211) plot_acf(targetData, lags=40, ax=ax1) ax2 = f2.add_subplot(212) plot_pacf(targetData, lags=40, ax=ax2) # plt.show() # according to the problem, only use AR(p) p belongs to (30,35) comm = MPI.COMM_WORLD rank = comm.Get_rank() AR_mod = lambda x: ARMA(targetData, (x, 0)).fit(disp=0, method='mle') aic_list = AR_mod(rank).aic print "%i|%i|%i" % (rank, aic_list, rank) ''' # confirm the lag p_index = aic_list.index(min(aic_list)) AR = AR_mod(p_index)
def check_acf(x):
    """Draw the ACF, the PACF (25 lags each) and the raw values of ``x``.

    The three panels are stacked in one 12x6 figure, which is left as the
    current pyplot figure; nothing is returned or shown here.
    """
    fig, ax = plt.subplots(3, figsize=(12, 6))
    # The plotting functions draw on the given Axes. Their return value is
    # no longer written back into ``ax``, which used to replace the Axes
    # entries with the returned figure object.
    plot_acf(x, ax=ax[0], lags=25)
    plot_pacf(x, ax=ax[1], lags=25)
    ax[2].plot(x)
ϕ 2 = 0.3 (again, reverse the signs) Plot the PACF for simulated_data_2 using the plot_pacf function ''' # Import the modules for simulating data and for plotting the PACF from statsmodels.tsa.arima_process import ArmaProcess from statsmodels.graphics.tsaplots import plot_pacf # Simulate AR(1) with phi=+0.6 ma = np.array([1]) ar = np.array([1, -0.6]) AR_object = ArmaProcess(ar, ma) simulated_data_1 = AR_object.generate_sample(nsample=5000) # Plot PACF for AR(1) plot_pacf(simulated_data_1, lags=20) plt.show() # Simulate AR(2) with phi1=+0.6, phi2=+0.3 ma = np.array([1]) ar = np.array([1, -0.6, -0.3]) AR_object = ArmaProcess(ar, ma) simulated_data_2 = AR_object.generate_sample(nsample=5000) # Plot PACF for AR(2) plot_pacf(simulated_data_2, lags=20) plt.show()
# Legend and Labels
# Raw strings keep the TeX backslashes (\mu, \sim, \sigma, \hat) literal
# without relying on invalid escape sequences; the text is unchanged.
plt.legend([
    r"Normal Dist. Fit ($\mu \sim${0}, $\sigma=${1:.2f})".format(0, sigma),
    r'$\hat{e}_t$'
])
plt.xlabel('Value')
plt.ylabel('Frequency')

#%%
from statsmodels.graphics.tsaplots import plot_pacf

# PACF of the residual series with a 99% confidence band (alpha=0.01).
ax = plt.subplot(gs[2])
plot_pacf(et_hat_series, lags=50, alpha=0.01, ax=ax)
plt.title('')
plt.xlabel('Lags')
plt.ylabel('PACF')

#%%
from statsmodels.tsa.ar_model import AR

# Fit both AR variants and report whether each one is stable.
resultGetVectorAR = GetVectorAR(et_hat[None, :], maxlags=1, trend='c')
resultGetAR = AR(et_hat).fit(maxlag=3, trend='c', method='cmle')
print('Is AR({%d}) model stable: {%s}' % (resultGetAR.k_ar, str(IsStable(resultGetAR.roots))))
print(
    'Is VectorAR({%s}) model stable: {%s}' %
    (resultGetVectorAR['maxlags'], str(IsStable(resultGetVectorAR['roots']))))
print('NOTE THAT VECTOR_AR[1] IS *NOT* STABLE')
# Group rows by a YYYYMM integer key and count the rows in each month.
year_month_summary = dfts.groupby(lambda x: x.year * 100 + x.month).count()
loan_count_summary = year_month_summary['issue_d']
print(loan_count_summary)
# we're left with a data table of year+month x number of loans issued
plt.xlabel('2015 Issue Date (Month)')
plt.ylabel('Loans Issued')
loan_count_summary.plot()
plt.show()
# ACF
sag.plot_acf(loan_count_summary)
plt.show()
# PACF
sag.plot_pacf(loan_count_summary)
plt.show()
print(
    "There are autocorrelated structures in the data, specfically there seems to be Seasonality and a need to add an Auto Regressive term."
)
#output
#201501 2616
#201502 2588
#201503 3002
#201504 3067
#201505 3167
#201506 3494
#201507 3694
#201508 3729
def _plot_PACF(x):
    """Draw the partial autocorrelation plot of ``x`` with default settings."""
    # Imported lazily so statsmodels is only loaded when a plot is requested.
    from statsmodels.graphics import tsaplots

    tsaplots.plot_pacf(x)
## Using Returns df['returns'] = df.market_value.pct_change(1).mul(100) df = df.iloc[1:] sts.adfuller(df.returns) ## ACF and PACF for Returns ## ACF and PACF for Returns sgt.plot_acf(df.returns, lags=40, zero = False) plt.title("ACF FTSE Returns", size=24) plt.show() sgt.plot_pacf(df.returns, lags = 40, zero = False, method = ('ols')) plt.title("PACF FTSE Returns", size=24) plt.show() ## AR(1) for Returns ## AR(1) for Returns model_ret_ar_1 = ARMA(df.returns, order = (1,0)) results_ret_ar_1 = model_ret_ar_1.fit() results_ret_ar_1.summary() ## Higher-Lag AR Models for Returns ## Higher-Lag AR Models for Returns
# ## `for` loop for creating ACF and PACF plots # In[19]: from statsmodels.graphics.tsaplots import plot_acf from statsmodels.graphics.tsaplots import plot_pacf for i in range(0,len(Warehouse)): plot_acf(diff_warehouse(Warehouse[i]).Order_Demand) print '\n\n\n\n___________________________________________________________________________________________________________________________' print color.BOLD + '\n\n\t\t\t\t\t\t\t %s \n'% Warehouse[i] + color.END plt.show() plot_pacf(diff_warehouse(Warehouse[i]).Order_Demand) plt.show() # ## Method 2 - Auto Arima # In[20]: from pyramid.arima import auto_arima import plotly.plotly as py import plotly.graph_objs as go from plotly.offline import download_plotlyjs, init_notebook_mode, plot for i in range(0,len(Warehouse)): train = diff_warehouse(Warehouse[i]).iloc[0:int(len(diff_warehouse(Warehouse[i]))*0.7)]
kings_ma3_res = kings_ma3_res.dropna()
kings_ma3_res.head()
# Plot a histogram of the residuals.
plt.hist(kings_ma3_res)
plt.title('Histogram Residuals @MA3')
# (The bare string below is the author's note: a symmetrical histogram
# indicates a good model.)
''' if we r getting symmetrical hist then its good model that we did! '''
# Plot the ACF and PACF of the residuals.
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
plot_acf(kings_ma3_res, lags=20)
plot_pacf(kings_ma3_res, lags=19)
# Square the residuals (errors).
kings_ma3_se = pow(kings_ma3_res,2)
kings_ma3_se.head()
# Mean of the squared residuals (MSE).
kings_ma3_mse = (kings_ma3_se.sum())/len(kings_ma3_se)
print(kings_ma3_mse) #128.7527777777778
# Square root of the MSE (RMSE).
kings_ma3_rmse = sqrt(kings_ma3_mse)
print(kings_ma3_rmse) #11.346928120763689
# Another method to find the RMSE
kings_ma3 = kings.rolling(window=3).mean()
## 1. ARIMA ##
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA

series = pd.read_csv('C:/Users/Shinhyunjin/Dropbox/market-price.csv', header=0, index_col=0, squeeze=True)
series.plot()
plot_acf(series)
plot_pacf(series)
# Call show(); the bare attribute reference ``plt.show`` did nothing, so
# these figures were never displayed at this point.
plt.show()

# First difference (the leading NaN row is dropped with iloc[1:]).
diff_1 = series.diff(periods=1).iloc[1:]
diff_1.plot()
plot_acf(diff_1)
plot_pacf(diff_1)
plt.show()

# Modeling
model = ARIMA(series, order=(0, 1, 1))  # ARIMA(0,1,1)
model_fit = model.fit(trend='c', full_output=True, disp=1)
print(model_fit.summary())
# ACF and PACF plots of time series from pandas import read_csv from statsmodels.graphics.tsaplots import plot_acf from statsmodels.graphics.tsaplots import plot_pacf from matplotlib import pyplot series = read_csv('dataset.csv', header=None, index_col=0, parse_dates=True, squeeze=True) pyplot.figure() pyplot.subplot(211) plot_acf(series, ax=pyplot.gca()) pyplot.subplot(212) plot_pacf(series, ax=pyplot.gca()) pyplot.show()
from statsmodels.graphics.tsaplots import plot_acf plot_acf(data).show() #平稳性检测 from statsmodels.tsa.stattools import adfuller as ADF print(u'原始序列的ADF检验结果为:', ADF(data[u'销量'])) #返回值依次为adf、pvalue、usedlag、nobs、critical values、icbest、regresults、resstore #差分后的结果 D_data = data.diff().dropna() D_data.columns = [u'销量差分'] D_data.plot() #时序图 plt.show() plot_acf(D_data).show() #自相关图 from statsmodels.graphics.tsaplots import plot_pacf plot_pacf(D_data).show() #偏自相关图 print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分'])) #平稳性检测 #白噪声检验 from statsmodels.stats.diagnostic import acorr_ljungbox print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1)) #返回统计量和p值 from statsmodels.tsa.arima_model import ARIMA data[u'销量'] = data[u'销量'].astype(float) #定阶 pmax = int(len(D_data) / 10) #一般阶数不超过length/10 qmax = int(len(D_data) / 10) #一般阶数不超过length/10 bic_matrix = [] #bic矩阵 for p in range(pmax + 1): tmp = []
def programmer_6():
    """Pick ARIMA(p, 1, q) orders by minimum BIC and forecast the sales series.

    Reads ``data/arima_data.xls``, plots the ACF of the raw series and the
    ACF/PACF of its first difference, prints ADF and Ljung-Box test results,
    searches p and q in ``0..len(D_data)/10`` for the lowest BIC, then fits
    that model and calls ``forecast`` for ``forecastnum`` steps.

    Warnings explained (from the original notes):
      - "UserWarning: matplotlib is currently using a non-GUI backend, so
        cannot show the figure": plt.show() was called several times; the
        workaround is to use plt.subplot().
      - "RuntimeWarning: overflow encountered in exp": insufficient
        arithmetic precision.

    forecastnum --> number of days to forecast
    plot_acf().show() --> autocorrelation plot
    plot_pacf().show() --> partial autocorrelation plot
    """
    discfile = 'data/arima_data.xls'
    forecastnum = 5
    data = pd.read_excel(discfile, index_col=u'日期')

    fig = plt.figure(figsize=(8, 6))
    # First autocorrelation plot (raw series).
    ax1 = plt.subplot(411)
    fig = plot_acf(data, ax=ax1)

    # Stationarity test. ADF returns, in order: adf, pvalue, usedlag, nobs,
    # critical values, icbest, regresults, resstore.
    print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

    # First difference of the series.
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    # Time-series plot of the differenced data.
    D_data.plot()
    plt.show()

    # Second autocorrelation plot (differenced series).
    fig = plt.figure(figsize=(8, 6))
    ax2 = plt.subplot(412)
    fig = plot_acf(D_data, ax=ax2)
    # Partial autocorrelation plot.
    ax3 = plt.subplot(414)
    fig = plot_pacf(D_data, ax=ax3)
    plt.show()
    fig.clf()

    # Stationarity test on the differenced series.
    print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))
    # White-noise test; returns the statistic and the p-value.
    print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)
    # Order selection: orders normally do not exceed length / 10.
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []  # BIC matrix, rows = p, columns = q
    data.dropna(inplace=True)

    # Some (p, q) combinations fail to fit, so failures are recorded as None.
    # Warnings are turned into errors so that such fits are skipped as well.
    # NOTE(review): this filter is process-wide and stays active after the
    # function returns -- confirm whether that is intended.
    import warnings
    warnings.filterwarnings('error')
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            try:
                tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
            except Exception:
                # Warning subclasses Exception, so promoted warnings are
                # still caught; unlike the former bare ``except:``, Ctrl-C
                # and SystemExit now propagate.
                tmp.append(None)
        bic_matrix.append(tmp)

    # Locate the minimum: flatten with stack(), then idxmin gives (p, q).
    bic_matrix = pd.DataFrame(bic_matrix)
    p, q = bic_matrix.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))

    # Fit the selected model (the original note records ARIMA(0, 1, 1)).
    model = ARIMA(data, (p, 1, q)).fit()
    model.summary2()  # model report
    # 5-day forecast: prediction, standard error and confidence interval.
    model.forecast(forecastnum)
def display_pacf(series, lags=50):
    """Plot the partial autocorrelation of ``series`` for ``lags`` lags and show it."""
    # Function-scope imports, as in the rest of this helper's design.
    import matplotlib.pyplot as canvas
    from statsmodels.graphics import tsaplots

    tsaplots.plot_pacf(series, lags=lags)
    canvas.show()
#Perform Dickey-Fuller test: print 'Results of Dickey-Fuller Test:' dftest = adfuller(timeseries, autolag='AIC') dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used']) for key,value in dftest[4].items(): dfoutput['Critical Value (%s)'%key] = value print dfoutput from statsmodels.graphics.tsaplots import plot_acf, plot_pacf # regular diff diff0 = df.stack().diff(periods=4)[4:] diff0.plot(title='European Retail Trade Differenced') plot_acf(diff0, lags=30) plot_pacf(diff0, lags=30) test_stationarity(diff0) # additional diff diff1 = diff0.diff()[1:] diff1.plot(title='European Retail Trade Differenced Twice') plot_acf(diff1, lags=30) plot_pacf(diff1, lags=30) test_stationarity(diff1) import statsmodels.api as sm data = df.stack().values model = sm.tsa.statespace.SARIMAX(data, order=(0,1,1), seasonal_order=(0,1,1,4)) results = model.fit()
#自相关,和偏相关图不是截尾,也不是拖尾.p > 0.05. df.index = pd.to_datetime(df.index,format='%Y') #to_datetime df.plot() plt.show() #0 hypo: random walk with drift. result = adfuller(df['tavg']) print('adf test:',result[1]) #p>0.5,非平稳.一阶差分. chg = df.diff() chg = chg.dropna() fix,axes = plt.subplots(2,1) plot_acf(chg,lags=20,ax=axes[0]) plot_pacf(chg,lags=20,ax=axes[1]) plt.show() #AR(1). mod_ar1 = ARMA(chg,order=(1,0)) res_ar1 = mod_ar1.fit() print('AIC of AR(1):',res_ar1.aic) #AR(2). mod_ar2 = ARMA(chg,order=(2,0)) res_ar2 = mod_ar2.fit() print('AIC of AR(2):',res_ar2.aic) #ARMA(1,1). mod_arma11 = ARMA(chg,order=(1,1)) res_arma11 = mod_arma11.fit() print('AIC of ARMA(1,1):',res_arma11.aic)
X = series.values
X = X.astype('float32')
# First half is the training set, second half the test set.
train_size = int(len(X) * 0.50)
train, test = X[0:train_size], X[train_size:]
# walk-forward validation: one forecast per test point, then the true
# observation is appended to the history before the next step.
history = [x for x in train]
predictions = list()
for i in range(len(test)):
    # difference data
    months_in_year = 12
    diff = difference(history, months_in_year)
    # predict
    model = ARIMA(diff, order=(0, 0, 1))
    model_fit = model.fit(trend='nc', disp=0)
    yhat = model_fit.forecast()[0]
    yhat = inverse_difference(history, yhat, months_in_year)
    predictions.append(yhat)
    # observation
    obs = test[i]
    history.append(obs)
# errors: observed minus predicted, one per test point
residuals = [test[i] - predictions[i] for i in range(len(test))]
residuals = DataFrame(residuals)
print(residuals.describe())
# plot the ACF (top) and PACF (bottom) of the residuals
pyplot.figure()
pyplot.subplot(211)
plot_acf(residuals, ax=pyplot.gca())
pyplot.subplot(212)
plot_pacf(residuals, ax=pyplot.gca())
pyplot.show()
# Top-right panel (222): autocorrelation plot of y.
autocorrelation_plot(y, ax=plt.subplot(222), color='k')
plt.xlabel(u'Шаг')
plt.ylabel(u'АКФ')
plt.title('')
# Spectrum plot (bottom-left panel, log scale)
plt.subplot(223)
plt.plot(freq, np.log(spectrum), color='k')
plt.xlabel(u"Частота")
plt.ylabel(u"Амплитуда (log)")
#plt.title("Timeseries spectrum")
plt.grid(True)
# Bottom-right panel (224): either the PACF or the power spectrum.
if f_show_PACF:
    from statsmodels.graphics.tsaplots import plot_pacf
    plot_pacf(y, ax=plt.subplot(224), lags=pacf_lags)
    plt.xlabel(u'Шаг')
    plt.ylabel(u'ЧАКФ')
    plt.title('')
else:
    # Power spectrum plot
    plt.subplot(224)
    plt.plot(freq, power_spectrum, color='k')
    plt.xlabel(u"Частота")
    plt.ylabel(u"Мощность")
    #plt.title("Power spectrum")
    plt.grid(True)
plt.subplots_adjust(hspace=0.4, wspace=0.4)
from statsmodels.graphics.tsaplots import plot_acf plot_acf(data).show() #平稳性检测 from statsmodels.tsa.stattools import adfuller as ADF print( ADF(data[u'销量'])) #返回值依次为adf、pvalue、usedlag、nobs、critical values、icbest、regresults、resstore #差分后的结果 D_data = data.diff().dropna() D_data.columns = [u'销量差分'] D_data.plot() #时序图 plt.show() plot_acf(D_data).show() #自相关图 from statsmodels.graphics.tsaplots import plot_pacf plot_pacf(D_data).show() #偏自相关图 ADF(D_data[u'销量差分'])#平稳性检测 #白噪声检验 from statsmodels.stats.diagnostic import acorr_ljungbox acorr_ljungbox(D_data, lags=1) #返回统计量和p值 from statsmodels.tsa.arima_model import ARIMA #定阶 pmax = int(len(D_data)/10) #一般阶数不超过length/10 qmax = int(len(D_data)/10) #一般阶数不超过length/10 bic_matrix = [] #bic矩阵 for p in range(pmax+1): tmp = [] for q in range(qmax+1):
# We will perform statistical tests like KPSS and ADF to confirm our understanding. # # But first, let's plot ACF and PACF graphs. # In[29]: acf = plot_acf(series, lags=50, alpha=0.05) plt.title("ACF for Weighted Price", size=20) plt.show() # The above graph shows that effect barely detoriate over time, so past values affect the present ones. The more lags we include, the better our model will fit the dataset, now the risk is coefficients might predict the dataset too well, cause an overfitting. # In our model, we always try to include only those lags which have a direct effect on our present value. Hence, let's try PACF. # In[30]: plot_pacf(series, lags=50, alpha=0.05, method='ols') plt.title("PACF for Weighted Price", size=20) plt.show() # Coefficients values for lag>5 are statistically not significant and their impact on the model is minimal, except a few spikes at 8,11,22 and beyond. # <a id="subsection-four"></a> # # KPSS Test # # The KPSS test, short for, Kwiatkowski-Phillips-Schmidt-Shin (KPSS), is a type of Unit root test that tests for the stationarity of a given series around a deterministic trend. # # Here, the null hypothesis is that the series is **stationary**. # # That is, if p-value is < signif level (say 0.05), then the series is non-stationary and vice versa. # In[31]:
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

header = "Sodium..mg."
series = pd.read_csv('paleo.csv', header=0, index_col=0, parse_dates=True, squeeze=True)

# Create figure
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 8))

# Select the column by its label. The previous ``series[{header}]`` built a
# one-element *set* (template residue), which is not a valid indexer.
# Make ACF plot
plot_acf(series[header], lags=10, zero=False, ax=ax1)
# Make PACF plot
plot_pacf(series[header], lags=10, zero=False, ax=ax2)
plt.show()
# ## Partial Autocorrelation # # In general, a partial correlation is a conditional correlation. # # It is the correlation between two variables under the assumption that we know and take into account the values of some other set of variables. # # For instance, consider a regression context in which y = response variable and x1, x2, and x3 are predictor variables. The partial correlation between y and x3 is the correlation between the variables determined taking into account how both y and x3 are related to x1 and x2. # # Formally, this is relationship is defined as: # # ## $\frac{\text{Covariance}(y, x_3|x_1, x_2)}{\sqrt{\text{Variance}(y|x_1, x_2)\text{Variance}(x_3| x_1, x_2)}}$ # # Check out this [link](http://www.itl.nist.gov/div898/handbook/pmc/section4/pmc4463.htm) for full details on this. # We can then plot this relationship: # In[36]: result = plot_pacf(df["Seasonal First Difference"].dropna()) # ### Interpretation # # Typically a sharp drop after lag "k" suggests an AR-k model should be used. If there is a gradual decline, it suggests an MA model. # ### Final Thoughts on Autocorrelation and Partial Autocorrelation # # * Identification of an AR model is often best done with the PACF. # * For an AR model, the theoretical PACF “shuts off” past the order of the model. The phrase “shuts off” means that in theory the partial autocorrelations are equal to 0 beyond that point. Put another way, the number of non-zero partial autocorrelations gives the order of the AR model. By the “order of the model” we mean the most extreme lag of x that is used as a predictor. # # # * Identification of an MA model is often best done with the ACF rather than the PACF. # * For an MA model, the theoretical PACF does not shut off, but instead tapers toward 0 in some manner. A clearer pattern for an MA model is in the ACF. The ACF will have non-zero autocorrelations only at lags involved in the model. 
# _____ # ### Final ACF and PACF Plots # # We've run quite a few plots, so let's just quickly get our "final" ACF and PACF plots. These are the ones we will be referencing in the rest of the notebook below.
# calculating and printing root mean squared error rmse = np.sqrt(mean_squared_error(Y_test[1:], Y_predict_RandomWalk)) * 100 print(rf"The RMSE is {rmse:2.4f}%") # # ARIMA # ### Identifying hyperparameters # In[96]: # Plotting the PACF and ACF to identify AR and MA lags (as per Box-Jenkins identification) from statsmodels.graphics.tsaplots import plot_acf from statsmodels.graphics.tsaplots import plot_pacf plot_acf(Y_train, lags=100) plot_pacf(Y_train, lags=100) plt.show() # For MA(q), only the first q autocorrelations are nonzero, so the ACF should cut off after lag q. # For AR(p), the autocorrelations may decline gradually, but the PACF should cut off after lag p # ### Building and training the model # In[97]: #marking start time for model training start_time = time.time() #training the ARIMA model ARIMA_model = ARIMA(endog=Y_train, order=(0, 0, 0)).fit()
fig = plt.figure(figsize=(12, 8))
ax2 = fig.add_subplot(111)
# NOTE(review): diff(2) subtracts the value two rows earlier (a lag-2
# difference); it is not a twice-applied first difference -- confirm intent.
ts2 = ts.diff(2)
# The differenced series is drawn twice: via pandas and via pyplot.
ts2.plot(ax=ax2)
plt.plot(ts2)
plt.show()
# Suitable q and p values:
# choose a suitable ARIMA model, i.e. pick the p and q of the model.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
f = plt.figure(facecolor='white')
ax1 = f.add_subplot(211)
plot_acf(ts, lags=40, ax=ax1)
ax2 = f.add_subplot(212)
plot_pacf(ts, lags=40, ax=ax2)
plt.show()
# Forecast
import statsmodels.api as sm
#
arma_mod20 = sm.tsa.ARMA(ts, (10, 3)).fit()
print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)
pre = arma_mod20.predict('2018-08-16 17:50:00', '2018-08-16 18:10:00', dynamic=True)
print(pre)
fig, ax = plt.subplots(figsize=(12, 8))
def f_autocorr(pmi):
    """Plot the autocorrelation and partial autocorrelation of ``pmi``.

    Calls ``plot_acf`` and then ``plot_pacf`` on the same input, each with its
    default settings.

    Args:
        pmi: The series to analyse; passed unchanged to both plotting calls.

    Returns:
        A 2-tuple ``(acf_result, pacf_result)`` holding whatever ``plot_acf``
        and ``plot_pacf`` returned, in that order.
    """
    # Tuple elements are evaluated left to right, so the ACF is drawn first.
    return plot_acf(pmi), plot_pacf(pmi)
# Second-order differencing: difference the already first-differenced column
# once more, then run adf_check (defined elsewhere) on the non-null values.
first_diff = dataset['#Passengers First Diff']
dataset['#Passengers Second Diff'] = first_diff.diff()
dataset
adf_check(dataset['#Passengers Second Diff'].dropna())

# Seasonal differencing: each value minus the value 12 rows earlier.
dataset['Seasonal Difference'] = dataset['#Passengers'].diff(12)
dataset
adf_check(dataset['Seasonal Difference'].dropna())

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Drop the NaN rows created by differencing once, then reuse for both plots.
second_diff = dataset['#Passengers Second Diff'].dropna()
seasonal_diff = dataset['Seasonal Difference'].dropna()

plot_acf(second_diff, lags=28)  #q=0
plot_pacf(second_diff, lags=14)  #p=0
plot_acf(seasonal_diff, lags=12)
plot_pacf(seasonal_diff, lags=12)  #P=2

from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

# Seasonal ARIMA on the raw passenger counts with a seasonal period of 12.
model = sm.tsa.statespace.SARIMAX(
    dataset['#Passengers'],
    order=(1, 2, 1),
    seasonal_order=(2, 2, 0, 12),
)
results = model.fit()
print(results.summary())

# Dynamic prediction for positions 130 through 144, stored as a new column.
dataset['Forecast'] = results.predict(start=130, end=144, dynamic=True)
# Group the rows of dfts by calendar month: the lambda is applied to each
# index value and maps it to the integer year * 100 + month (e.g. 201501);
# count() then tallies the non-null entries of every column per group.
# NOTE(review): assumes dfts has a datetime-like index (the lambda reads
# .year and .month) — confirm.
year_month_summary = dfts.groupby(lambda x : x.year * 100 + x.month).count()
# Keep only the per-month count of the 'issue_d' column.
loan_count_summary = year_month_summary['issue_d']
print(loan_count_summary)
#we're left with a data table of year+month x # of loans issued

# The axis labels are set on the current axes before the series is drawn.
plt.xlabel('2015 Issue Date (Month)')
plt.ylabel('Loans Issued')
loan_count_summary.plot()
plt.show()

#ACF
sag.plot_acf(loan_count_summary)
plt.show()

#PACF
sag.plot_pacf(loan_count_summary)
plt.show()

print ("There are autocorrelated structures in the data, specfically there seems to be Seasonality and a need to add an Auto Regressive term.")

# Output of print(loan_count_summary) recorded by the original author:
#output
#201501 2616
#201502 2588
#201503 3002
#201504 3067
#201505 3167
#201506 3494
#201507 3694
#201508 3729
#201509 3873
#201510 4181
def _plot_correlation_panel(plot_func, data, ax, title, lags):
    """Draw one ACF/PACF panel on ``ax`` and apply the shared grid and labels."""
    plot_func(data, lags=lags, alpha=0.05, title=title, ax=ax)
    setgrid(ax, minor=True)
    ax.set_xlabel("lag amount")
    ax.set_ylabel("Correlation")


def timeseries_exploratory_plots(x, title="Plots", raw=True, change=True,
                                 change_type="adj", lag=1, autocorr=True,
                                 autocorr_changes=True, density=True,
                                 density_resolution=0.1, density_change=True,
                                 density_change_resolution=0.1, pacf=True,
                                 pacf_change=True, figsize=(10, 20), lags=50):
    """ Given a pandas series, it plots up to eight plots below each other.

    In drawing order (each one can be switched off individually):
    - the raw data
    - the changes between values ``lag`` steps apart
    - the density of the raw data
    - the density of the changes
    - the autocorrelation of the raw data
    - the autocorrelation of the changes
    - the partial autocorrelation of the raw data
    - the partial autocorrelation of the changes

    NOTE: that currently it REQUIRES the data to NOT have any missing/NAN
          values, otherwise it will plot things incorrectly, so make sure
          data being fed in is cleaned up.

    Args:
        x:           The series to explore.
        title:       (str) Title placed above the whole figure.
        raw:         (bool) create line plot of the raw data?
        change:      (bool) create line plot of the changes?
        change_type: (str) The type of changes to use. One of:
                     - "diff" for raw differences, using pandas `diff()`
                     - "pct" for percent changes, using pandas `pct_change()`
                     - "adj" to use the `adjusted_percent_change` function
        lag:         (int)(default = 1) How many timesteps to offset the
                     change/diff by.
        autocorr:    (bool) create autocorrelation plot of the raw data?
        autocorr_changes: (bool) create autocorrelation plot of the changes?
        density:     (bool) create plot of density?
        density_resolution: (float)(default=0.1) Resolution of the density
                     estimation.
        density_change: (bool) create plot of density for changes?
        density_change_resolution: (float)(default=0.1) Resolution of the
                     density estimation.
        pacf:        (bool) create partial auto correlation function plot
                     on the raw data?
        pacf_change: (bool) create partial auto correlation function plot
                     on the changes data?
        figsize:     (tuple) Size passed to `plt.subplots`.
        lags:        (int)(default = 50) Number of lags shown in every
                     ACF/PACF panel.

    Returns:
        The matplotlib figure holding all requested panels.

    Raises:
        ValueError: if every plot is switched off, or if `change_type` is
            not one of "adj", "pct" or "diff" while a changes-based plot
            is requested.
    """
    nplots = sum([raw, change, autocorr, autocorr_changes, density,
                  density_change, pacf, pacf_change])
    if nplots == 0:
        raise ValueError("at least one plot must be enabled")

    # squeeze=False keeps `axes` an array even when a single plot is
    # requested; otherwise plt.subplots returns a bare Axes and .flatten()
    # fails.
    fig, axes = plt.subplots(nplots, 1, figsize=figsize, squeeze=False)
    panels = iter(axes.flatten())
    fig.suptitle(title, fontsize=15)

    if raw:
        plot_lines([x], axtitle="raw data", xlabel="x", ylabel="raw value",
                   ax=next(panels), show=False, minorgrid=True)

    # Every changes-based panel needs the changes series, not only the line
    # plot and the ACF of changes.
    if change or autocorr_changes or density_change or pacf_change:
        if change_type == "adj":
            percent_changes = adjusted_percent_change(x, lag=lag, epsilon=0.1)
        elif change_type == "pct":
            percent_changes = pd.Series(x).pct_change(lag)
        elif change_type == "diff":
            percent_changes = pd.Series(x).diff(lag)
        else:
            raise ValueError(
                "change_type must be one of 'adj', 'pct' or 'diff', got %r"
                % (change_type,))

    if change:
        plot_lines([percent_changes], axtitle="Percent Change",
                   color_offset=1, ax=next(panels), xlabel="x",
                   ylabel="percent change", minorgrid=True)

    if density:
        plot_densities([x], ax=next(panels),
                       axtitle="Distribution of raw data",
                       resolution=density_resolution, minorgrid=True)

    if density_change:
        plot_densities([percent_changes], ax=next(panels),
                       axtitle="Distribution of changes",
                       resolution=density_change_resolution, minorgrid=True)

    if autocorr_changes or pacf_change:
        # Differencing leaves missing values at the start of the series.
        # Drop the first entry and back-fill the rest so the ACF and the
        # PACF of the changes are computed on the same NaN-free data.
        clean_changes = percent_changes[1:].bfill()

    # Autocorrelation function plot
    if autocorr:
        _plot_correlation_panel(plot_acf, x, next(panels),
                                "Autocorrelation plot", lags)

    # Autocorrelation function plot of the changes
    if autocorr_changes:
        _plot_correlation_panel(plot_acf, clean_changes, next(panels),
                                "Autocorrelation of Percent Changes plot",
                                lags)

    # partial Autocorrelation function plot
    if pacf:
        _plot_correlation_panel(plot_pacf, x, next(panels),
                                "Partial Autocorrelation Function plot", lags)

    # partial Autocorrelation function plot on changes data
    if pacf_change:
        _plot_correlation_panel(
            plot_pacf, clean_changes, next(panels),
            "Partial Autocorrelation Function plot on changes", lags)

    # Give enough spacing between subplots x-axes and titles of plots
    fig.tight_layout(pad=1.10, rect=[0, 0.03, 1, 0.95])
    return fig
# Load the price history; the first CSV column becomes a parsed date index.
df = pd.read_csv("data/^GSPC.csv", index_col=0, parse_dates=True)
df.head(10)

# checking for Autocorrelation
# Autocorrelation plots
# The plot is drawn for the "Adj Close" series — the same column every other
# step in this script analyses — rather than for the whole DataFrame, which
# would pool unrelated columns into one curve.
# NOTE(review): assumes autocorrelation_plot is pandas.plotting's, which
# expects a single series — confirm against the file's imports.
autocorrelation_plot(df["Adj Close"])
plt.savefig(PATH + "Autocorrelation plot")
plt.show()

# Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots
plot_acf(df["Adj Close"], lags=50)  # lag 50 days
plt.savefig(PATH + "Autocorrelation Function plot")
plt.show()

plot_pacf(df["Adj Close"], lags=50)
plt.savefig(PATH + "Partial Autocorrelation Function plot")
plt.show()

# If the time series is stationary, the ACF/PACF plots will show a quick drop-off in correlation after a
# small amount of lag between points.
# This data is non-stationary as a high number of previous observations are correlated with future values.

### Autoregressive model ###
# train / test split
# test - predictions of last 5 years of the Adj. Close price
test_length = 1265
X = df["Adj Close"].values
# NOTE(review): the training slice starts at index 1, so the first
# observation is in neither train nor test — confirm this is intended.
train, test = X[1:len(X) - test_length], X[len(X) - test_length:]
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Plot the raw series and its autocorrelation.
data.plot()
plt.show()

from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()

# Augmented Dickey-Fuller test on the raw 'value' column.
# print is called as a function: the statement form is a SyntaxError on
# Python 3, and the space-separated output is unchanged.
from statsmodels.tsa.stattools import adfuller as ADF
print('ADF test result:', ADF(data['value']))

# First-order difference; the leading NaN row is dropped and the single
# column is renamed.
D_data = data.diff().dropna()
D_data.columns = ['diff value']
D_data.plot()
plt.show()

# ACF and PACF of the differenced series.
plot_acf(D_data).show()

from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()

# ADF test on the differenced series.
print('diff seq ADF test result:', ADF(D_data['diff value']))

# Ljung-Box test on the differenced series at lag 1.
from statsmodels.stats.diagnostic import acorr_ljungbox
print('dff white noise test result:', acorr_ljungbox(D_data, lags = 1))

# Fit ARIMA with order (1, 1, 1) on the raw series.
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(data, (1,1,1)).fit()
# NOTE(review): the results of the next two calls are not stored or printed;
# outside an interactive session they are discarded — confirm intent.
model.summary2()
model.forecast(5*6)