def whitenoise_test(dataset, number):
    """Ljung-Box test (lag 1) on ``rentNumber`` and on its first difference.

    The last ``number`` rows of ``dataset`` are excluded.  For each of the two
    series one line is printed saying whether the p-value is below 0.05,
    followed by the p-value.  Nothing is returned.
    """
    trimmed = dataset.copy()
    trimmed = trimmed.iloc[:len(trimmed) - number]  # leave out the last rows

    # Imported locally, as in the original snippet.
    from statsmodels.stats.diagnostic import acorr_ljungbox

    def _report(series, below_msg, not_below_msg):
        # The nested unpacking expects exactly one statistic and one p-value.
        [[lb], [p]] = acorr_ljungbox(series, lags=1)
        print((below_msg if p < 0.05 else not_below_msg) % p)

    _report(trimmed['rentNumber'],
            u'原始序列为非白噪声序列,对应的p值为:%s',
            u'原始该序列为白噪声序列,对应的p值为:%s')
    _report(trimmed['rentNumber'].diff().dropna(),
            u'一阶差分序列为非白噪声序列,对应的p值为:%s',
            u'一阶差分该序列为白噪声序列,对应的p值为:%s')
def acorr_val(self):
    """Run a lag-1 Ljung-Box test on ``self.ts`` and return its p-value part.

    The statistic and p-value are also handed to ``pre_table`` as a single
    row under the headers ``lbvalue`` and ``pvalue``.
    """
    statistic, p_val = acorr_ljungbox(self.ts, lags=1)
    pre_table(['lbvalue', 'pvalue'], [[statistic, p_val]])
    return p_val
def diff_process(self):
    """Difference column 1 of ``self.df`` until the ADF p-value is below 0.05.

    Stores the lag-1 Ljung-Box result of the raw column in ``self.p_value``,
    the working series in ``self.diff_``, the latest ``adfuller`` result in
    ``self.ADF_value`` and the number of differencing passes in ``self.i``.
    The raw and the final series are then plotted one above the other.
    """
    raw_series = self.df.iloc[:, 1]

    # A p-value above 0.05 is read as white noise (no correlation over time).
    self.p_value = acorr_ljungbox(raw_series, lags=1)
    print('白噪声检验p值:', self.p_value[1], '\n')

    # Unit-root test: keep differencing while the p-value says non-stationary.
    self.diff_ = raw_series
    self.ADF_value = adfuller(self.diff_, autolag='AIC')
    self.i = 0
    while self.ADF_value[1] >= 0.05:
        self.diff_ = self.diff_.diff()  # one more difference
        self.diff_ = self.diff_.dropna()
        self.ADF_value = adfuller(self.diff_, autolag='AIC')
        # The result also carries the 1%/5%/10% critical values; a statistic
        # below all three together with p < 0.05 is read as stationary.
        print('ADF检验:', '\n', self.ADF_value, '\n')
        self.i += 1

    figure = plt.figure(figsize=(20, 6))
    top_axis = figure.add_subplot(211)  # raw data
    top_axis.plot(self.df.iloc[:, 1])
    bottom_axis = figure.add_subplot(212)  # series after differencing
    bottom_axis.plot(self.diff_)
    plt.show()
def get_best_log(ts, max_log=5, rule1=True, rule2=True):
    """Log-transform ``ts`` repeatedly until it passes both checks.

    :param ts: time-series data (Series)
    :param max_log: upper bound (exclusive) on the number of log passes
    :param rule1: whether the stationarity rule already holds
    :param rule2: whether the Ljung-Box rule already holds
    :return: ``(number of log passes, transformed series)``; ``(0, ts)`` when
        both rules are already true.  Falls through and returns ``None`` when
        no pass satisfies the checks.
    """
    if rule1 and rule2:
        return 0, ts

    for n_logs in range(1, max_log):
        ts = np.log(ts)
        # Ljung-Box test at lag 1.
        _lb_stat, lb_pvalue = acorr_ljungbox(ts, lags=1)
        # ADF unit-root test.
        adf_stat, adf_pvalue, _lag, _nobs, crit, _icbest = adfuller(ts)
        stationary = (adf_stat < crit['1%'] and adf_stat < crit['5%']
                      and adf_stat < crit['10%'] and adf_pvalue < 0.01)
        lb_rejects = (lb_pvalue < 0.05)
        within_cap = (n_logs < 5)
        if stationary and lb_rejects and within_cap:
            print('the best log n is :{0}'.format(n_logs))
            return n_logs, ts
def whitenoiseTest(data, lagnum=1):
    """Return True when the Ljung-Box test does not reject white noise.

    :param data: series to test
    :param lagnum: ``lags`` argument forwarded to ``acorr_ljungbox``.  It was
        previously ignored in favour of a hard-coded ``1``; the default keeps
        the old behaviour.
    :return: ``False`` if any returned p-value is below 0.05 (series is not
        white noise), ``True`` otherwise.
    """
    _lb, p = acorr_ljungbox(data, lags=lagnum)
    # Any p-value below 0.05 means autocorrelation was detected.
    return not (p < 0.05).any()
def whitenoise_test(ts):
    """Plot the Ljung-Box Q statistics and p-values of ``ts`` side by side.

    Uses the default ``lags`` of ``acorr_ljungbox`` and the ``ggplot`` style.
    Nothing is returned.
    """
    from statsmodels.stats.diagnostic import acorr_ljungbox
    q_stats, p_values = acorr_ljungbox(ts)

    # (values, legend label, y-axis label) for the left and right panel.
    panels = (
        (q_stats, 'Q统计量', 'Q'),
        (p_values, 'p值', 'P'),
    )
    with plt.style.context('ggplot'):
        figure = plt.figure(figsize=(10, 4))
        axes = figure.subplots(1, 2)
        for axis, (values, label, ylabel) in zip(axes, panels):
            axis.plot(values, label=label)
            axis.set_ylabel(ylabel)
            axis.set_title('收益率残差平方自相关性检验')
        for axis in axes:
            axis.legend()
        plt.tight_layout()
    return
def selectFFT(series, minAlpha=None):
    """Forward selection of FFT coefficients.

    In every pass each remaining coefficient is tried in turn; the one whose
    reconstruction leaves the smallest last-lag Ljung-Box statistic on the
    residual (and beats the best value so far) is kept.  The search stops
    when no candidate improves the criterion or, if ``minAlpha`` is given,
    when one of the two ``minAlpha`` conditions below is met.

    ``series`` is indexed as ``[row, 0]``, so it must be two-dimensional.

    Returns a dict with the reconstructed series (``series``), the selected
    coefficients (``fft``), the unused coefficients (``res``) and the final
    ``(statistic, p-value)`` pair (``crit``).
    """
    # 1) Initialise variables
    signal = series
    spectrum = np.fft.fft(signal, axis=0)
    # Candidate coefficients keyed by row index; only column 0 is used.
    fftRes = {idx: row[0] for idx, row in enumerate(spectrum)}
    fftOpt = np.zeros(signal.shape, dtype=complex)
    lags = int(12 * (signal.shape[0] / 100.)**.25)
    crit = None

    # 2) Search forward
    while True:
        best_key, crit_before = None, crit
        for candidate in fftRes.keys():
            # Switch the candidate on, score it, then switch it off again.
            fftOpt[candidate, 0] = fftRes[candidate]
            rebuilt = np.real(np.fft.ifft(fftOpt, axis=0))
            lb_result = sm3.acorr_ljungbox(signal - rebuilt, lags=lags)
            # Keep only the entries for the largest lag.
            score = lb_result[0][-1], lb_result[1][-1]
            if crit is None or score[0] < crit[0]:
                crit, best_key = score, candidate
            fftOpt[candidate, 0] = 0
        if best_key is None:
            break
        fftOpt[best_key, 0] = fftRes[best_key]
        del fftRes[best_key]
        if minAlpha is not None:
            if crit[1] > minAlpha:
                break
            if crit_before is not None and \
                    crit[0] / crit_before[0] > 1 - minAlpha:
                break

    rebuilt = np.real(np.fft.ifft(fftOpt, axis=0))
    return {'series': rebuilt, 'fft': fftOpt, 'res': fftRes, 'crit': crit}
def con_SARIMAX(y_series=None, season=7):
    '''
    Check whether a series qualifies for SARIMAX modelling.

    Two conditions must hold:

    * the series has at least 50 observations;
    * after a first difference followed by a seasonal difference of period
      ``season`` the series passes the ADF test (statistic below the 1%, 5%
      and 10% critical values and p-value < 0.01).

    The seasonal difference used to be taken from ``y_series`` again, which
    silently discarded the first difference; it is now applied to the
    already differenced series.  The Ljung-Box result was computed but
    deliberately not used in the decision, so that call has been removed.

    :param y_series: pandas Series to check
    :param season: period of the seasonal difference
    :return: True when both conditions hold, False otherwise
    '''
    enough_data = len(y_series) >= 50

    # First difference; diff() returns a new object, y_series is untouched.
    ts_diff = y_series.diff(1).dropna()
    # Seasonal difference on top of the first difference.
    ts_diff = ts_diff.diff(season).dropna()

    # ADF stationarity test.
    adf, pvalue, _usedlag, _nobs, critical_values, _icbest = adfuller(ts_diff)
    is_stationary = bool(adf < critical_values['1%']
                         and adf < critical_values['5%']
                         and adf < critical_values['10%']
                         and pvalue < 0.01)

    return enough_data and is_stationary
def test_acorr_ljung_box(self):
    """Compare lag-4 Ljung-Box / Box-Pierce results with R reference values."""
    # R commands that produced each reference dictionary:
    #> bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box")
    #> mkhtest(bt, "ljung_box_4", "chi2")
    ljung_box_4 = {'statistic': 5.23587172795227,
                   'pvalue': 0.263940335284713,
                   'parameters': (4,),
                   'distr': 'chi2'}

    #> bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce")
    #> mkhtest(bt, "ljung_box_bp_4", "chi2")
    ljung_box_bp_4 = {'statistic': 5.12462932741681,
                      'pvalue': 0.2747471266820692,
                      'parameters': (4,),
                      'distr': 'chi2'}

    # ddof correction for fitted parameters in ARMA(p,q): fitdf=p+q.
    # These two references are defined but not compared below.
    #> bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box", fitdf=2)
    #> mkhtest(bt, "ljung_box_4df2", "chi2")
    ljung_box_4df2 = {'statistic': 5.23587172795227,
                      'pvalue': 0.0729532930400377,
                      'parameters': (2,),
                      'distr': 'chi2'}

    #> bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce", fitdf=2)
    #> mkhtest(bt, "ljung_box_bp_4df2", "chi2")
    ljung_box_bp_4df2 = {'statistic': 5.12462932741681,
                         'pvalue': 0.0771260128929921,
                         'parameters': (2,),
                         'distr': 'chi2'}

    lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(self.res.resid, 4,
                                                   boxpierce=True)
    # Only the entry for the largest lag is checked.
    compare_t_est([lb[-1], lbpval[-1]], ljung_box_4, decimal=(13, 14))
    compare_t_est([bp[-1], bppval[-1]], ljung_box_bp_4, decimal=(13, 14))
def model_eval(dct_model, **kwargs):
    """
    __Description__:
        Attach evaluation statistics of a fitted model to ``dct_model``.

    __Parameters__:
        dct_model : dict holding the fitted result under the key 'result'.
        kwargs : extra options:
            - b_eval : boolean, whether the evaluation is performed
              (default True).

    __Return__:
        The string 'evaluation' when 'b_eval' is not a boolean or when the
        key 'result' is missing; otherwise None.  When the evaluation runs,
        ``dct_model['eval_stat']`` is set to a dict with the keys 'AIC',
        'BIC', 'SSE', 'MSE' and 'LjungBox test'.
    """
    b_eval = kwargs.get('b_eval', True)

    if not isinstance(b_eval, bool):
        print("'b_eval' parameter must be a boolean.")
        return 'evaluation'

    if 'result' not in dct_model:
        print("No SARIMAXResult present for the key 'result' in dct_model.")
        return 'evaluation'

    if not b_eval:
        return

    fitted = dct_model['result']
    dct_model['eval_stat'] = {
        'AIC': fitted.aic,
        'BIC': fitted.bic,
        'SSE': fitted.sse,
        'MSE': fitted.mse,
        # A single lag, derived from the log of the residual count.
        'LjungBox test': acorr_ljungbox(
            x=fitted.resid,
            lags=[int(log(fitted.resid.shape[0]))]),
    }
    return
def serial_correlation(variable, plot_name='autocorr_error.png'):
    """Save an autocorrelation plot of ``variable`` and print a Ljung-Box test.

    The plot is written to ``plot_name``.  The number of lags is the smaller
    of 10 and a fifth of the series length (rounded).
    """
    autocorrelation_plot(variable)
    plot.savefig(plot_name)
    plot.close()

    # Lag choice: see https://robjhyndman.com/hyndsight/ljung-box-test/
    n_lags = min(10, round(len(variable) / 5))
    ljung_box_result = acorr_ljungbox(variable, lags=n_lags)
    print(ljung_box_result)
def residue_test(residue):
    '''
    Inspect ARIMA residuals: are they normal with zero mean and constant
    variance, and free of autocorrelation?

    Shows a PACF plot (35 lags) and a Q-Q plot, prints the first 15 rows of
    an autocorrelation / Q-statistic table and finally the lag-1 Ljung-Box
    result.

    The lag column of the table used to be hard-coded as 1..35, which only
    works when ``ACF`` returns exactly 35 lags; it is now derived from the
    length of the returned Q statistics.

    :param residue: residual series (must provide ``.values``)
    '''
    residual_values = residue.values.squeeze()

    # Partial autocorrelation, drawn in the lower half of the figure.
    fig = plt.figure(figsize=(12, 8))
    ax2 = fig.add_subplot(212)
    plot_pacf(residual_values, lags=35, ax=ax2)
    plt.show()

    # Q-Q plot to judge normality of the residuals.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    qqplot(residue, line='q', ax=ax, fit=True)
    plt.show()

    # Q statistics over a range of lags.
    r1, q1, p1 = ACF(residual_values, qstat=True)
    lag_numbers = np.arange(1, len(q1) + 1)
    # r1[0] is skipped so the columns line up with q1/p1.
    tmp = np.c_[lag_numbers, r1[1:], q1, p1]
    table = pd.DataFrame(tmp, columns=['lag', 'AC', 'Q', 'Prob(>Q)'])
    print(table.set_index('lag')[:15])

    # White-noise test of the residuals.
    print('残差的白噪声检验结果为:', acorr_ljungbox(residue, lags=1))
def autocorr_test(_xdata, _ydata):
    """Print three autocorrelation diagnostics for ``_ydata``.

    Printed, in order: the pandas autocorrelation at lags 1-3, a
    Durbin-Watson style statistic, and the lag-1 Ljung-Box p-value.

    The Python 2 ``print`` statements were replaced by single-argument
    ``print(...)`` calls, which produce the same text under Python 2 and
    also run under Python 3.  The unused ``acf`` import was dropped.

    :param _xdata: index for the series
    :param _ydata: values; invalid entries (NaN/inf) are masked
    """
    import numpy as np
    import pandas as pd
    from statsmodels.stats.diagnostic import acorr_ljungbox

    # All statistics need a regularly spaced, continuous series (y only).
    # Author's notes:
    #   Durbin-Watson: computed correctly with missing data, but no
    #     significance level is available.
    #   ACF / Ljung-Box: crash on missing data.
    _ydata = np.ma.masked_invalid(_ydata)

    # pandas' autocorr tolerates NaN, unlike statsmodels' acf.
    pdf = pd.Series(_ydata, index=_xdata, copy=True)
    print("autocorrelation for first three lags: %s"
          % ([pdf.autocorr(i) for i in range(1, 4)],))

    # Durbin-Watson: squared successive differences over squared values.
    a = _ydata[:-1].astype('float')
    b = _ydata[1:].astype('float')
    _stat = np.nansum((b - a)**2) / np.nansum(_ydata**2)
    print("Durbin-Watson statistic (close to 2 if no autocorrelation): %s"
          % (_stat,))

    _stat, _pvalue = acorr_ljungbox(_ydata, lags=1, boxpierce=False)
    print("Ljung-Box p-value on lag 1 autocorrelation: %s" % (_pvalue,))
    print("")
def mix_model(time_series_diff, args, name):
    """Fit an ARIMA model, then a GARCH model on its residuals.

    time_series_diff: stationary time series after differencing.
    args: arguments parsed before; ``args.plot`` switches the plots on.
    name: the name of time_series_diff.

    Returns the fitted ARIMA model, its order, the fitted GARCH model and
    its order.  The Ljung-Box result on the ARIMA residuals is only logged;
    GARCH is fitted in either case.
    """
    arima_model_fit, arima_order = ARIMA_model(time_series_diff, args, name)
    residuals = arima_model_fit.resid

    if args.plot:
        plot_residual(residuals, name)

    # Are the ARIMA residuals white noise?
    _, pvalue = acorr_ljungbox(
        residuals,
        # auto_lag=True,
        model_df=sum(arima_order),
        return_df=False,
    )
    logger.debug("acorr_ljungbox: " + str(list(pvalue)))

    if args.plot:
        plot_pvalue(pvalue, "acorr_ljungbox")

    if np.any(pvalue < 0.05):
        logger.info("residual after fit still can not give white noises, we turn to use GARCH")
    else:
        logger.info("Although the residual does give good random values, we still turn to GARCH")

    garch_model_fit, garch_order = GARCH_model(arima_model_fit.resid, args, name)
    return arima_model_fit, arima_order, garch_model_fit, garch_order
def __ts_differencing(self):
    '''
    Find the differencing value ``self.d`` of the training series.

    ``self.ts_diff`` passes when the ADF statistic is below the 1%, 5% and
    10% critical values with p-value < 0.01 and the lag-1 Ljung-Box p-value
    is below ``self.p_value_shreshold``; ``self.isStationarity`` is then set
    to True.  Otherwise ``self.d`` is increased and the series re-differenced
    until ``self.max_diff_time`` is reached.
    '''
    while True:
        _lb_stat, lb_pvalue = acorr_ljungbox(self.ts_diff, lags=1)
        adf, adf_pvalue, _usedlag, _nobs, critical_values, _icbest = \
            adfuller(self.ts_diff)

        adf_ok = (adf < critical_values['1%']
                  and adf < critical_values['5%']
                  and adf < critical_values['10%']
                  and adf_pvalue < 0.01)
        lb_ok = (lb_pvalue < self.p_value_shreshold)

        if adf_ok and lb_ok:
            self.isStationarity = True
            break

        if self.d >= self.max_diff_time:
            break

        self.d += 1
        # NOTE(review): diff(self.d) is a lag-d difference (x[t] - x[t-d]),
        # not a d-th order difference; confirm which one is intended.
        self.ts_diff = self.ts_train.diff(self.d)
        self.diffs.append(self.ts_diff)
        # In-place, so the object just appended to self.diffs is cleaned too.
        self.ts_diff.dropna(inplace=True)
def _ljung_box_test(table, input_cols, lags=None):
    """Run the Ljung-Box test on each column in ``input_cols`` of ``table``.

    For every column a DataFrame with the lag numbers, test statistics and
    p-values is built, stored in the result under the column name and added
    to the markdown report.

    Returns ``{'result': result}`` where ``result`` maps each column name to
    its DataFrame and ``'_repr_brtc_'`` to the report built by ``rb``.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Ljung Box test Result""")
    for input_col in input_cols:
        lbvalue, pvalue = acorr_ljungbox(x=table[input_col], lags=lags)
        lb_res = dict()
        # One row per value returned by the test, numbered from 1.
        lb_res['lags'] = range(1, len(lbvalue) + 1)
        lb_res['test statistic'] = lbvalue
        lb_res['p-value based on chi-square distribution'] = pvalue
        lb_res = pd.DataFrame(lb_res)
        # NOTE(review): the literal below is reproduced exactly as it appears
        # in the source; it looks like a multi-line '|'-margin template whose
        # line breaks were lost - confirm against the original file.
        rb.addMD(
            strip_margin(""" | ## {input_col} test result | | {lb_res} """.format(input_col=input_col, lb_res=pandasDF2MD(lb_res, num_rows=lb_res.shape[0]))))
        result[input_col] = lb_res
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def ljung(label, series, names):
    """
    Write a LaTeX table of Ljung-Box results to ``latex/tables/<label>.txt``.

    Parameters
    ----------
    label : string
        Label in latex and name of txt file
    series : list of pandas.Series
        The first element of each series is skipped before testing.
    names : list of strings
        Names of each series in table
    """
    with open('latex/tables/{}.txt'.format(label), 'w') as b:
        # Doubled braces are literal braces for str.format; only the table
        # label is substituted.
        a = '''\\begin{{table}}[H] \\caption{{Ljung-Box Test}} \\label{{tab:{}}} \\centering \\begin{{tabular}}{{ | c | c | }} \\hline Series & P-value \\\\ \\hline \\hline'''.format(label)
        for i in range(len(series)):
            var = series[i][1:]
            # [1][39] takes index 39 of the second element of the result -
            # presumably the p-value at lag 40; this needs at least 40 lags
            # in the result (TODO confirm against the statsmodels version).
            a += '\n{0} & {1:.3e} \\\\'.format(names[i], dig.acorr_ljungbox(var)[1][39])
            # NOTE(review): indentation was lost in the source; this rule is
            # assumed to follow every row - confirm.
            a += '\n\\hline'
        a += '''\n\\end{tabular} \\end{table}'''
        b.write(a)
def eval_plot(X, Y, Y_hat, lags=None):
    """Show residual diagnostics and run a Ljung-Box test on the residuals.

    Four panels are drawn: normal probability plot, residuals against ``X``,
    histogram, and residuals in order.

    :param X: x-values for the scatter panel
    :param Y: observed values
    :param Y_hat: fitted values
    :param lags: lags for the Ljung-Box test; defaults to
        ``min(20, len(R) // 2)``.  Integer division is used so the default is
        always an int (``/`` gave a float on Python 3).
    :return: True if any p-value is below 0.05, False otherwise
    """
    R = np.array(Y) - np.array(Y_hat)

    f, ax = plt.subplots(2, 2)
    stats.probplot(R, plot=ax[0, 0])
    ax[0, 0].set_title('Normal Probability Plot of the Residuals')
    ax[0, 1].scatter(X, R)
    ax[0, 1].set_title('Residuals vs Fitted Values')
    ax[1, 0].hist(R)
    ax[1, 0].set_title('Histogram of the Residuals')
    ax[1, 1].plot(R)
    ax[1, 1].set_title('Residuals vs Order of the Data')
    plt.show()

    if lags is None:
        lags = min(20, len(R) // 2)
    (lb, p_values) = acorr_ljungbox(R, lags=lags, boxpierce=False)

    print('Ljung-Box Test')
    print(
        "H_0 (p>0.05) --> The data are independently distributed -- i.e. there's no auto correlations"
    )
    print(
        "H_a (p<0.05) --> The data are not independently distributed -- i.e. there is auto correlations"
    )
    print('p_values', p_values)

    has_autocorrelation = any(p < .05 for p in p_values)
    if has_autocorrelation:
        print(
            'PROBLEM! There appears to be information left in the residuals')
    else:
        print('There does not appear to be information left in the residuals')
    return has_autocorrelation
def is_white_noise(col, lags=LAGS, box_pierce=False):
    """Return True if any tested lag has a p-value above ``ALPHA``.

    :param col: series to test
    :param lags: ``lags`` argument forwarded to ``acorr_ljungbox``
    :param box_pierce: forwarded as ``boxpierce``.  With ``True`` the call
        returns four arrays, which used to break the two-name unpacking; the
        Box-Pierce p-values (fourth element) are now used in that case.
    :return: bool
    """
    # https://stats.stackexchange.com/questions/200267/interpreting-ljung-box-test-results-from-statsmodels-stats-diagnostic-acorr-lju
    test_result = diagnostic.acorr_ljungbox(col, lags=lags,
                                            boxpierce=box_pierce)
    pvals = test_result[3] if box_pierce else test_result[1]
    return any(val > ALPHA for val in pvals)
def ljung_box_test(self, output_folder, df_name):
    '''
    Apply the Ljung-Box (and Box-Pierce) test to the target variable of the
    dataframe held by this object and save the result as CSV.

    The file path is now built with ``os.path.join``, so the CSV lands inside
    ``output_folder`` whether or not the folder name ends with a separator
    (plain concatenation wrote next to the folder when it did not).

    :param output_folder: path to the output folder where the dataframe that
        contains the columns returned by the test will be saved; created if
        it does not exist
    :param df_name: file name (without extension) of the CSV
    :return: the dataframe created
    '''
    if self.service_name is not None and self.mohafaza is not None:
        print("testing for %s in %s" % (self.service_name, self.mohafaza))

    arr = sm.acorr_ljungbox(self.df[self.target_variable], boxpierce=True)
    df = pd.DataFrame({
        'lb': arr[0],
        'p-values': arr[1],
        'bpvalue': arr[2],
        'bpp-values': arr[3]
    })
    df.index.name = 'lag_nb'

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    df.to_csv(os.path.join(output_folder, df_name + '.csv'))

    if (df['p-values'] <= 0.05).all():
        print('all p-values for ljung box <= 0.05')
    if (df['bpp-values'] <= 0.05).all():
        print('all p-values for box pierce <= 0.05')
    print('-----------------------------------------------')
    return df
def get_best_log(ts, max_log=5, rule1=True, rule2=True):
    '''
    Log-transform ``ts`` repeatedly until it passes both checks.

    The two p-values used to be swapped: the Ljung-Box p-value was compared
    with 0.01 inside the ADF rule and the ADF p-value with 0.05 in the
    white-noise rule.  Each rule now uses the p-value of its own test.

    :param ts: time-series data, Series
    :param max_log: upper bound (exclusive) on the number of log passes, int
    :param rule1: whether the stationarity rule already holds, bool
    :param rule2: whether the white-noise rule already holds, bool
    :return: ``(number of log passes, transformed series)``; ``(0, ts)`` when
        both rules are already true.  Falls through and returns ``None`` when
        no pass satisfies the checks.
    '''
    if rule1 and rule2:
        # Nothing to do: hand back the series untouched.
        return 0, ts

    for i in range(1, max_log):
        ts = np.log(ts)
        # Ljung-Box test at lag 1.
        lbvalue, lb_pvalue = acorr_ljungbox(ts, lags=1)
        # ADF unit-root test.
        adf, adf_pvalue, usedlag, nobs, critical_values, icbest = adfuller(ts)
        # Stationarity rule.
        rule_1 = (adf < critical_values['1%'] and adf < critical_values['5%']
                  and adf < critical_values['10%'] and adf_pvalue < 0.01)
        # White-noise rule.
        rule_2 = (lb_pvalue < 0.05)
        rule_3 = (i < 5)
        if rule_1 and rule_2 and rule_3:
            print('The best log n is: {0}'.format(i))
            return i, ts
def arimaModelCheck():
    '''
    Model check: fit ARIMA(0,1,1) and test its residuals for white noise.

    Reads ``data/discdata_processed.xls``, leaves out the last five rows and
    prints whether any Ljung-Box p-value (12 lags) is below 0.05.

    :return: None
    '''
    source_path = 'data/discdata_processed.xls'
    n_lags = 12  # number of residual lags to test

    frame = pd.read_excel(source_path, index_col='COLLECTTIME')
    frame = frame.iloc[:len(frame) - 5]
    observed = frame['CWXT_DB:184:D:\\']

    # Build and train the ARIMA(0,1,1) model.
    from statsmodels.tsa.arima_model import ARIMA
    fitted = ARIMA(observed, (0, 1, 1)).fit()

    # In-sample prediction and residuals.
    predicted = fitted.predict(typ='levels')
    residuals = (predicted - observed).dropna()

    from statsmodels.stats.diagnostic import acorr_ljungbox
    _lb, p_values = acorr_ljungbox(residuals, lags=n_lags)

    # Any p-value below 0.05 counts as "not white noise".
    if (p_values < 0.05).sum() > 0:
        print(u'模型ARIMA(0,1,1)不符合白噪声检验')
    else:
        print(u'模型ARIMA(0,1,1)符合白噪声检验')
def is_white_noise(time_series):
    """Return False when the lag-1 Ljung-Box p-value is below 0.05, else True."""
    p_value = acorr_ljungbox(time_series.values, lags=1)[1]
    return not (p_value < 0.05)
def tsdiag(arimaResiduals, afcFags=25, lbLags=10, figsize=(10, 8), style='bmh'):
    """Three-panel residual diagnostics: series, ACF and Ljung-Box p-values.

    Two defects are fixed: passing a ``pd.Series`` used to raise NameError
    because the plotted variable was only assigned for non-Series input, and
    the Ljung-Box p-values were drawn at x = 0..lbLags-1 instead of at their
    lags 1..lbLags.

    :param arimaResiduals: residuals, a ``pd.Series`` or anything
        ``pd.Series`` accepts
    :param afcFags: number of lags for the ACF panel
    :param lbLags: number of lags (int) for the Ljung-Box panel
    :param figsize: figure size
    :param style: matplotlib style name
    :return: None
    """
    if isinstance(arimaResiduals, pd.Series):
        residual_series = arimaResiduals
    else:
        residual_series = pd.Series(arimaResiduals)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        layout = (3, 1)
        sr_ax = plt.subplot2grid(layout, (0, 0))
        acf_ax = plt.subplot2grid(layout, (1, 0))
        lb_ax = plt.subplot2grid(layout, (2, 0))

        # Residual series.
        sr_ax.plot(residual_series)
        sr_ax.set_title("Standardizede Residuals")
        sr_ax.set_xlabel("Time")

        # ACF plot.
        plot_acf(arimaResiduals, lags=afcFags, ax=acf_ax)

        # Ljung-Box p-values, one per lag, with the 0.05 line for reference.
        lb = acorr_ljungbox(arimaResiduals, lags=lbLags)
        lbPvalue = lb[1]
        lb_ax.scatter(np.arange(1, lbLags + 1), lbPvalue,
                      facecolors='none', edgecolors='b')
        lb_ax.set_ylim(-0.1, 1)
        lb_ax.axhline(y=0.05, linestyle='--')
        lb_ax.set_title("p values for Ljung-Box Statistic")
        lb_ax.set_ylabel("p values")
        lb_ax.set_xlabel("lags")
        plt.tight_layout()
    return
def testing(data):
    '''
    Print the ADF result and the lag-1 Ljung-Box result for ``data['volume']``.
    '''
    adf_result = ADF(data['volume'])
    print('原始序列的ADF平衡性检验的结果为:', adf_result)

    ljung_box_result = acorr_ljungbox(data['volume'], lags=1)
    print('原始序列的白噪声检验的结果为:', ljung_box_result)
def acorr_ljungbox_(timeseries):
    """
    :param timeseries: time series to analyse
    :return: first entry of the second element returned by the lag-1
        ``acorr_ljungbox`` call, i.e. the p-value used to judge whether the
        series is random
    """
    _statistic, p_values = acorr_ljungbox(timeseries, lags=1)[:2]
    return p_values[0]
def ljungbox(data, lags=12):
    """Print the Ljung-Box table and a one-line summary of its last row.

    The summary line shows the statistic rounded to four decimals, the number
    of rows in the table as ``df`` and the p-value of the last row.
    """
    blres = acorr_ljungbox(data, lags=lags, return_df=True)
    print(blres)
    print("Box-Ljung test")

    last_row = blres.tail(1)
    print(f"X-squared: {round(last_row['lb_stat'].values[0], 4)}", end=", ")
    print(f"df = {len(blres)}", end=", ")
    print(f"p-value: {last_row['lb_pvalue'].values[0]}")
def alles_ljung(residuals):
    '''
    Ljung-Box test of the residuals at lags 1, 5, 10, 15 and 20.

    Parameters
    ----------
    residuals : array of float
        The residuals after the fit of model+baseline+stellar_var.

    Returns
    -------
    isUncorrelated : bool
        True if, at every lag, the p-value exceeds all of the significance
        levels 0.15, 0.1, 0.05, 0.025 and 0.01; False otherwise.

    Outputs
    -------
    The statistics table and the conclusion are written with ``logprint``.

    Sources
    -------
    https://www.statology.org/ljung-box-test-python/
    https://www.statsmodels.org/dev/generated/statsmodels.stats.diagnostic.acorr_ljungbox.html?highlight=ljung
    '''
    logprint('Ljung-Box Test')
    logprint('--------------')
    logprint(
        'This tests the null hypothesis that there is no correlation among the residuals.'
    )

    df = acorr_ljungbox(residuals, lags=[1, 5, 10, 15, 20], return_df=True)
    df.reset_index(inplace=True)
    df = df.rename(columns={'index': 'lag'})

    logprint('Does the null hypotheses hold at a significance level of...')
    # (column name, significance level); True in a column means the null
    # hypothesis cannot be rejected at that level.
    levels = (
        ('0.15', 0.15),
        ('0.1', 0.10),
        ('0.05', 0.05),
        ('0.025', 0.025),
        ('0.01', 0.01),
    )
    for column, alpha in levels:
        df[column] = df['lb_pvalue'] > alpha

    isUncorrelated = all(all(df[column] == True) for column, _ in levels)

    logprint(df.to_string(index=False))
    if isUncorrelated:
        logprint('The null hypothesis cannot be rejected.')
        logprint('In simple words: your residuals look good.')
    else:
        logprint(
            'The null hypothesis is rejected at some significance levels.')
        logprint(
            'In simple words: there might still be some structure in your residuals.'
        )
    logprint('\n')
    return isUncorrelated
def test_ljungbox_errors():
    """Invalid ``model_df`` / ``period`` raise; omitting ``lags`` warns."""
    data = sunspots.load_pandas().data['SUNACTIVITY']

    # (expected message fragment, keyword arguments of the failing call)
    invalid_calls = (
        ("model_df must", dict(model_df=-1)),
        ("period must", dict(model_df=-1, period=1)),
        ("period must", dict(model_df=-1, period=-2)),
    )
    for pattern, call_kwargs in invalid_calls:
        with pytest.raises(ValueError, match=pattern):
            smsdia.acorr_ljungbox(data, **call_kwargs)

    with pytest.warns(FutureWarning, match="The default value of lags"):
        smsdia.acorr_ljungbox(data, return_df=False)
def random_test(close_price):
    """Lag-1 Ljung-Box test of ``close_price``; reports via ``output``.

    Emits one verdict line depending on whether the p-value is below 0.05,
    then the raw test result.
    """
    acorr_result = acorr_ljungbox(close_price, lags=1)
    p_value = acorr_result[1]

    verdict = ("第二步:随机性检验:非纯随机性" if p_value < 0.05
               else "第二步:随机性检验:纯随机性")
    output(verdict)
    output(f"白噪声检验结果:{acorr_result}\n")
def test_autocorr(df):
    """
    Print the Ljung-Box p-values (10 lags) of the squared-residuals column.

    :param df: pandas.DataFrame
    """
    squared_residuals = df[TimeSeriesDataFrameMap.Square_residuals]
    _lb, pvalue, _bp, _bp_pvalue = acorr_ljungbox(squared_residuals,
                                                  lags=10,
                                                  boxpierce=True)
    print('Ljung Box Test')
    print('Lag P-value')
    # Lags are numbered from 1.
    for lag_number, p in enumerate(pvalue, 1):
        print(lag_number, ' ', p)
def programmer_3():
    """Lag-1 Ljung-Box test of the disk-usage column and its first difference.

    Reads ``data/discdata_processed.xls``, leaves out the last five rows,
    prints one verdict per series and finally the statistic of the second
    test.
    """
    column = "CWXT_DB:184:D:\\"
    frame = pd.read_excel("data/discdata_processed.xls")
    frame = frame.iloc[:len(frame) - 5]

    def _report(series, below_msg, not_below_msg):
        # The nested unpacking expects exactly one statistic and one p-value.
        [[statistic], [p_value]] = acorr_ljungbox(series, lags=1)
        print((below_msg if p_value < 0.05 else not_below_msg) % p_value)
        return statistic

    _report(frame[column],
            u"原始序列为非白噪声序列,对应的p值为:%s",
            u"原始序列为白噪声序列,对应的p值为:%s")
    lb = _report(frame[column].diff().dropna(),
                 u"一阶差分序列为非白噪声序列,对应的p值为:%s",
                 u"一阶差分序列为白噪声序列,对应的p值为:%s")
    print(lb)
def test_acorr_ljung_box_big_default(self):
    """Default-lag Ljung-Box / Box-Pierce on the full residuals vs. R values."""
    # Test with big dataset and default lag; references produced in R with:
    #> bt = Box.test(residuals(fm), type = "Ljung-Box")
    #> mkhtest(bt, "ljung_box_none", "chi2")
    ljung_box_none = {'statistic': 51.03724531797195,
                      'pvalue': 0.11334744923390,
                      'distr': 'chi2'}

    #> bt = Box.test(residuals(fm), type = "Box-Pierce")
    #> mkhtest(bt, "ljung_box_bp_none", "chi2")
    ljung_box_bp_none = {'statistic': 45.12238537034000,
                         'pvalue': 0.26638168491464,
                         'distr': 'chi2'}

    lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(self.res.resid,
                                                   boxpierce=True)
    # Only the entry for the largest lag is compared.
    compare_t_est([lb[-1], lbpval[-1]], ljung_box_none, decimal=(13, 13))
    compare_t_est([bp[-1], bppval[-1]], ljung_box_bp_none, decimal=(13, 13))
def test_acorr_ljung_box_small_default(self):
    """Default-lag Ljung-Box / Box-Pierce on the first 30 residuals vs. R."""
    # Test with small dataset and default lag; references produced in R with:
    #> bt = Box.test(residuals(fm), type = "Ljung-Box")
    #> mkhtest(bt, "ljung_box_small", "chi2")
    ljung_box_small = {'statistic': 9.61503968281915,
                       'pvalue': 0.72507000996945,
                       'parameters': (0,),
                       'distr': 'chi2'}

    #> bt = Box.test(residuals(fm), type = "Box-Pierce")
    #> mkhtest(bt, "ljung_box_bp_small", "chi2")
    ljung_box_bp_small = {'statistic': 7.41692150864936,
                          'pvalue': 0.87940785887006,
                          'parameters': (0,),
                          'distr': 'chi2'}

    first_thirty = self.res.resid[:30]
    lb, lbpval, bp, bppval = smsdia.acorr_ljungbox(first_thirty,
                                                   boxpierce=True)
    # Only the entry for the largest lag is compared.
    compare_t_est([lb[-1], lbpval[-1]], ljung_box_small, decimal=(13, 13))
    compare_t_est([bp[-1], bppval[-1]], ljung_box_bp_small, decimal=(13, 13))
def programmer_5():
    """Fit ARIMA(0,1,1) and test its residuals for white noise (12 lags).

    Reads ``data/discdata_processed.xls``, leaves out the last five rows,
    prints the verdict and then the Ljung-Box statistics.
    """
    source_path = "data/discdata_processed.xls"
    n_lags = 12  # number of residual lags to test

    frame = pd.read_excel(source_path, index_col="COLLECTTIME")
    frame = frame.iloc[:len(frame) - 5]
    observed = frame["CWXT_DB:184:D:\\"]

    # Train, predict in-sample and take the residuals.
    fitted = ARIMA(observed, (0, 1, 1)).fit()
    predicted = fitted.predict(typ="levels")
    residuals = (predicted - observed).dropna()

    lb, p_values = acorr_ljungbox(residuals, lags=n_lags)
    # Any p-value below 0.05 counts as "not white noise".
    if (p_values < 0.05).sum() > 0:
        print(u"模型ARIMA(0,1,1)不符合白噪声检验")
    else:
        print(u"模型ARIMA(0,1,1)符合白噪声检验")
    print(lb)
#-*- coding: utf-8 -*- #白噪声检验 import pandas as pd #参数初始化 discfile = '../data/discdata_processed.xls' data = pd.read_excel(discfile) data = data.iloc[: len(data)-5] #不使用最后5个数据 #白噪声检测 from statsmodels.stats.diagnostic import acorr_ljungbox [[lb], [p]] = acorr_ljungbox(data['CWXT_DB:184:D:\\'], lags = 1) if p < 0.05: print(u'原始序列为非白噪声序列,对应的p值为:%s' %p) else: print(u'原始该序列为白噪声序列,对应的p值为:%s' %p) [[lb], [p]] = acorr_ljungbox(data['CWXT_DB:184:D:\\'].diff().dropna(), lags = 1) if p < 0.05: print(u'一阶差分序列为非白噪声序列,对应的p值为:%s' %p) else: print(u'一阶差分该序列为白噪声序列,对应的p值为:%s' %p)
def normal_stats(a_row, q):
    """Check independence and normality of transformed errors.

    Returns ``[l_pval, n_pval, upr, lwr]``:

    * ``l_pval`` - smallest Ljung-Box p-value over the tested lags (worst
      case; null hypothesis: data are independently distributed);
    * ``n_pval`` - Shapiro p-value (null hypothesis: data is normal);
    * ``upr`` / ``lwr`` - normal quantiles at ``q`` and ``1 - q`` for the
      sample mean and standard deviation.
    """
    _w, n_pval = sps.shapiro(a_row)

    loc = np.mean(a_row)
    scale = np.std(a_row)
    upr = sps.norm.ppf(q, loc, scale)
    lwr = sps.norm.ppf(1.0 - q, loc, scale)

    n_lags = int(12 * (len(a_row) / 100.0)**0.25)
    lb_pvalues = smd.acorr_ljungbox(a_row, lags=n_lags)[1]
    l_pval = np.min(lb_pvalues)

    return [l_pval, n_pval, upr, lwr]
# ADF result of the raw series.  Return values, in order: adf, pvalue,
# usedlag, nobs, critical values, icbest, regresults, resstore.
print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

# First difference
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']
D_data.plot()  # time-series plot
plt.show()
plot_acf(D_data).show()  # autocorrelation plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()  # partial autocorrelation plot
print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))  # stationarity test

# White-noise test; prints the statistic and the p-value
from statsmodels.stats.diagnostic import acorr_ljungbox
print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

from statsmodels.tsa.arima_model import ARIMA

# Order selection: orders are usually kept at or below length/10
pmax = int(len(D_data)/10)
qmax = int(len(D_data)/10)
bic_matrix = []  # BIC of every (p, q) combination
for p in range(pmax+1):
    tmp = []
    for q in range(qmax+1):
        # Some orders fail to fit; record None for those.  Only Exception is
        # caught so Ctrl-C / SystemExit still stop the search.
        try:
            tmp.append(ARIMA(data, (p,1,q)).fit().bic)
        except Exception:
            tmp.append(None)
    bic_matrix.append(tmp)
def programmer_6():
    """
    ARIMA workflow on ``data/arima_data.xls``: plots, tests, order selection
    by BIC and a five-step forecast.

    Notes on warnings seen by the original author:
        # UserWarning: matplotlib is currently using a non-GUI backend,
          so cannot show the figure
          (plt.show() was called several times; plt.subplot() is used
          instead)
        # RuntimeWarning: overflow encountered in exp
          (insufficient numeric precision)

    forecastnum --> number of days to forecast

    Changes: warnings are turned into errors only while the (p, q) grid is
    searched, instead of permanently for the whole process, and the search
    catches ``Exception`` rather than everything.
    """
    discfile = 'data/arima_data.xls'
    forecastnum = 5
    data = pd.read_excel(discfile, index_col=u'日期')

    fig = plt.figure(figsize=(8, 6))
    # First autocorrelation plot
    ax1 = plt.subplot(411)
    fig = plot_acf(data, ax=ax1)

    # Stationarity test.  Return values, in order: adf, pvalue, usedlag,
    # nobs, critical values, icbest, regresults, resstore.
    print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

    # First difference
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    # Time-series plot
    D_data.plot()
    plt.show()

    # Second autocorrelation plot
    fig = plt.figure(figsize=(8, 6))
    ax2 = plt.subplot(412)
    fig = plot_acf(D_data, ax=ax2)
    # Partial autocorrelation plot
    ax3 = plt.subplot(414)
    fig = plot_pacf(D_data, ax=ax3)
    plt.show()
    fig.clf()

    print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))  # stationarity test
    # White-noise test; prints the statistic and the p-value
    print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)

    # Order selection: orders are usually kept at or below length/10
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    data.dropna(inplace=True)

    import warnings
    # Fits that fail or merely warn are recorded as None.  The 'error'
    # filter is limited to this block so it does not leak to the caller.
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        for p in range(pmax + 1):
            tmp = []
            for q in range(qmax + 1):
                try:
                    tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
                except Exception:
                    tmp.append(None)
            bic_matrix.append(tmp)

    # Locate the minimum: flatten with stack(), then idxmin().
    bic_matrix = pd.DataFrame(bic_matrix)
    p, q = bic_matrix.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))

    model = ARIMA(data, (p, 1, q)).fit()  # fit with the selected order
    model.summary2()  # model report (result is not used)
    model.forecast(forecastnum)  # forecast (result is not used)
#Raw Spending Plot of monthly averages plt.plot(mcc2_ts_m) plt.xticks(mcc2_ts_m.index, ('08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06')) plt.savefig('C:/Users/bodhisattva_2/Dropbox/mcc2_ts_m_plot.png', bbox_inches='tight') plt.clf() #Generate and plot the autocorrelation function (ACF) over all available lags for monthly spending acf_mcc2_m = stattools.acf(mcc2_ts_m['trans_amt'].values, nlags=num_rows_ts_m) plt.axhline(y=0, xmin=0, xmax=1, color='k') plt.plot(acf_mcc2_m) plt.savefig('C:/Users/bodhisattva_2/Dropbox/mcc2_ts_m_acf_plot.png', bbox_inches='tight') plt.clf() #Given that the ACF showed fairly weak periodicity I decided to run a Ljung-Box test for white noise on monthly spending. #As I expected, I fail to reject null hypothesis of white noise process with series at the monthly level. diagnostic.acorr_ljungbox(mcc2_ts_m['trans_amt'].values, lags=num_rows_ts_m-1) #My next thought was that aggregating the data might be obscuring trends at the weekly level. I ran the same sequence of analyses #as I did for monthly spending. #Generate and plot the autocorrelation function (ACF) over 21 lags. This was chosen to make the plot clear, but a similar pattern held for #more lags. num_lags = 21 acf_mcc2 = stattools.acf(mcc2_ts['trans_amt'].values, nlags=num_lags) plt.plot(acf_mcc2) plt.axhline(y=0, xmin=0, xmax=1, color='k') plt.xticks(np.arange(num_lags)) plt.savefig('C:/Users/bodhisattva_2/Dropbox/mcc2_ts_acf_plot.png', bbox_inches='tight') #The ACF displays strong weekly periodicity and as one might expect, a Ljung-Box test rejects the null hypothesis of white noise process at the daily level. diagnostic.acorr_ljungbox(mcc2_ts['trans_amt'].values, lags=num_lags)
# ADF result of the raw series.  Return values, in order: adf, pvalue,
# usedlag, nobs, critical values, icbest, regresults, resstore.
print(ADF(data[u'销量']))

# First difference
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']
D_data.plot()  # time-series plot
plt.show()
plot_acf(D_data).show()  # autocorrelation plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()  # partial autocorrelation plot
ADF(D_data[u'销量差分'])  # stationarity test (result is not used)

# White-noise test: statistic and p-value (result is not used)
from statsmodels.stats.diagnostic import acorr_ljungbox
acorr_ljungbox(D_data, lags=1)

from statsmodels.tsa.arima_model import ARIMA

# Order selection: orders are usually kept at or below length/10
pmax = int(len(D_data)/10)
qmax = int(len(D_data)/10)
bic_matrix = []  # BIC of every (p, q) combination
for p in range(pmax+1):
    tmp = []
    for q in range(qmax+1):
        # Some orders fail to fit; record None for those.  Only Exception is
        # caught so Ctrl-C / SystemExit still stop the search.
        try:
            tmp.append(ARIMA(data, (p,1,q)).fit().bic)
        except Exception:
            tmp.append(None)
    bic_matrix.append(tmp)
# -*- coding: utf-8 -*-
# Model check: fit ARIMA(0,1,1) and test its residuals for white noise.
import pandas as pd

# Parameter initialisation
discfile = '../data/discdata_processed.xls'
lagnum = 12  # number of residual lags to test

data = pd.read_excel(discfile, index_col='COLLECTTIME')
data = data.iloc[: len(data) - 5]  # do not use the last 5 rows
xdata = data['CWXT_DB:184:D:\\']

from statsmodels.tsa.arima_model import ARIMA  # ARIMA(0,1,1) model

arima = ARIMA(xdata, (0, 1, 1)).fit()  # build and train the model
xdata_pred = arima.predict(typ='levels')  # in-sample prediction
# Two single-argument calls emit the same text as the former Python 2
# statement (header line, then the predictions) and also run on Python 3.
print("-------預測模型------------")
print(xdata_pred)
pred_error = (xdata_pred - xdata).dropna()  # residuals

from statsmodels.stats.diagnostic import acorr_ljungbox  # white-noise test

lb, p = acorr_ljungbox(pred_error, lags=lagnum)
h = (p < 0.05).sum()  # a p-value below 0.05 counts as not white noise
if h > 0:
    print(u'模型ARIMA(0,1,1)不符合白噪声检验')
else:
    print(u'模型ARIMA(0,1,1)符合白噪声检验')
#clicksPerDay.index.name = None
#encountersPerDay.index.name = None
# Reduce each frame to its count column and forward-fill missing days.
clicksPerDay = pd.Series(data=clicksPerDay["count_clicks"], index=clicksPerDay.index)
encountersPerDay = pd.Series(data=encountersPerDay["count_encounter"], index=encountersPerDay.index)
clicksPerDay = clicksPerDay.ffill()
encountersPerDay = encountersPerDay.ffill()

####################LJUNG-BOX####################
clicksPACF = stattools.pacf_ols(clicksPerDay, nlags=MAX_LAG)
encountersPACF = stattools.pacf_ols(encountersPerDay, nlags=MAX_LAG)

# Own implementation.
# NOTE(review): this is fed *partial* autocorrelations, while the
# statsmodels call below works from the raw series; that may be why the
# results differ - confirm what _math.ljungBox expects.
results = _math.ljungBox(encountersPACF, len(encountersPerDay), MAX_LAG)
lag, R, Q, p = zip(*results)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(np.asarray(Q[1:]))
print(np.asarray(p[1:]))

# statsmodels implementation
results = diagnostic.acorr_ljungbox(encountersPerDay, lags=MAX_LAG)
print(results)

# Own copy of the statsmodels implementation
results = _math.ljungBox2(encountersPerDay, maxlag=MAX_LAG)
print(results)

# They are not the same... I wonder why.
plt.rcParams['axes.unicode_minus'] = False
data.plot()
plt.show()

from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()

from statsmodels.tsa.stattools import adfuller as ADF
# The former Python 2 print statements are written as single-argument
# print(...) calls: same text on Python 2, valid on Python 3.
print('ADF test result: %s' % (ADF(data['value']),))

# First difference
D_data = data.diff().dropna()
D_data.columns = ['diff value']
D_data.plot()
plt.show()
plot_acf(D_data).show()

from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()
print('diff seq ADF test result: %s' % (ADF(D_data['diff value']),))

from statsmodels.stats.diagnostic import acorr_ljungbox
print('dff white noise test result: %s' % (acorr_ljungbox(D_data, lags = 1),))

from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(data, (1,1,1)).fit()
model.summary2()  # result is not used
model.forecast(5*6)  # result is not used