def stationarityTest(data):
    """Find the differencing order at which ``data`` passes the ADF test.

    The series is first-differenced repeatedly until the ADF p-value
    (element 1 of the ``ADF`` result) drops below 0.05.

    Args:
        data: pandas Series (needs ``diff`` and ``dropna``) accepted by ``ADF``.

    Returns:
        Tuple ``(diff, p_value)``: the number of first differences applied
        and the p-value of the last ADF test.
    """
    diff = 0
    series = data
    adf_data = ADF(series)
    while adf_data[1] >= 0.05:
        diff += 1
        # Difference the already-differenced series. `data.diff(diff)` would
        # compute the lag-`diff` difference x[t] - x[t-diff], which is not the
        # diff-th order difference this function reports.
        series = series.diff().dropna()
        adf_data = ADF(series)
    return (diff, adf_data[1])
def adf_diff(data: pd.DataFrame, plot: bool = False) -> int:
    """Pick a differencing order from ADF tests on 0, 1 and 2 differences.

    The original data, its first difference and its second difference (NaNs
    filled with 0.0) are each ADF-tested. The first level whose test
    statistic is below the smallest critical value is returned. When no
    level qualifies, the level indices are summed, so the result is 3.

    Args:
        data: the series to test.
        plot: when true, draw the three series on one figure.

    Returns:
        The selected level (0, 1 or 2), or 3 when none passed.
    """
    first_diff = data.diff(1).fillna(0.0)
    second_diff = data.diff(1).diff(1).fillna(0.0)

    adf_results = [ADF(data), ADF(first_diff), ADF(second_diff)]

    order = 0
    for level, result in enumerate(adf_results):
        statistic, _p_value, _lag, _nobs, critical, _icbest = result
        if statistic < min(critical.values()):
            order = level
            print('p={}\nadf={}'.format(level, result))
            break
        order += level

    if plot:
        plt.figure(figsize=(20, 5))
        curves = (
            (data, 'Original', 'blue'),
            (first_diff, 'Diff1', 'red'),
            (second_diff, 'Diff2', 'green'),
        )
        for curve, label, color in curves:
            plt.plot(curve, label=label, color=color)
        plt.legend(loc='best')
        # NOTE(review): `index` is not defined in this function; presumably a
        # module-level name -- confirm, otherwise this raises NameError.
        plt.title("{}".format(index))
        plt.show()
    return order
def practice1_saling_data_analysis():
    """Load the sales series, difference it until usable, and pick ARMA orders.

    Reads ``arima_data.csv`` indexed by ``date``, plots it, then applies
    first differences until both the ADF p-value and the lag-1 Ljung-Box
    p-value are below 0.05. The original and differenced data are passed
    to ``order_determination``. Nothing is returned.
    """
    data = pd.read_csv('L09-TimeSeriesPrediction/data/arima_data.csv', index_col='date')
    data.index = pd.to_datetime(data.index)  # turn the string index into a datetime index
    data.plot()
    plt.show()

    period = 0  # number of first differences applied so far
    D_data = data  # start from the original series
    ADF_p = ADF(D_data)[1]
    acorr_ljungbox_p = list(acorr_ljungbox(D_data, lags=1)[1])[0]

    # Keep differencing until both tests pass. The original condition used
    # `||`, which is not valid Python; `or` is the intended operator.
    while ADF_p >= 0.05 or acorr_ljungbox_p >= 0.05:
        period += 1
        # One more first difference of the current series. The original
        # `diff(periods=period)` applied a growing lag to data that was
        # already differenced, which is not a `period`-th order difference.
        D_data = D_data.diff().dropna()
        ADF_p = ADF(D_data)[1]
        acorr_ljungbox_p = list(acorr_ljungbox(D_data, lags=1)[1])[0]

    p, q = order_determination(data, D_data)
def diff(time_series, if_plot, name, if_diff):
    """Difference a series until it passes the ADF stationarity test.

    Args:
        time_series: pandas series; must support ``tolist``, ``diff`` and
            ``fillna``. It is deep-copied and never modified.
        if_plot: whether to plot the original and differenced series with
            ``plot_diff``.
        name: name of the series, forwarded to ``plot_diff``.
        if_diff: when false, return the copy immediately with a count of 0.

    Returns:
        Tuple ``(series, counts)``: the (possibly differenced) series and the
        number of first differences applied.
    """
    counts = 0  # how many times the series has been differenced
    copy_series = copy.deepcopy(time_series)

    if not if_diff:
        return copy_series, counts

    # Keep differencing while the ADF p-value is above 5%. The test is run
    # once per iteration and reused for both the check and the log line.
    adf_result = ADF(copy_series.tolist())
    while adf_result[1] > 0.05:
        logger.info("time " + str(counts) + " ADF test: " + str(adf_result))
        copy_series = copy_series.diff(1)
        copy_series = copy_series.fillna(0)  # the leading NaN becomes 0
        counts += 1
        adf_result = ADF(copy_series.tolist())
    logger.info("time " + str(counts) + " ADF test: " + str(adf_result))

    if if_plot:
        plot_diff(time_series, copy_series, counts, name)
    return copy_series, counts
def get_adf():
    """Print how many first differences make the disk series pass the ADF test.

    Reads ``discdata_processed.xls``, drops the last 5 rows, and differences
    column ``CWXT_DB:184:D:\\`` until the ADF p-value is at most 0.05.
    """
    infile = "../data/discdata_processed.xls"
    data = pd.read_excel(infile)
    data = data.iloc[:len(data) - 5]

    series = data["CWXT_DB:184:D:\\"]
    adf = ADF(series)
    diff = 0
    while adf[1] > 0.05:
        diff += 1
        # Difference the current series again; `.diff(diff)` on the raw column
        # would be a lag-`diff` difference, not the diff-th order difference
        # that the message reports.
        series = series.diff().dropna()
        adf = ADF(series)
    print("经过%d阶差分后归于平稳,p值为%s" % (diff, adf[1]))
def stationarity_test(dataset, number):
    """Print how many first differences make ``rentNumber`` pass the ADF test.

    Args:
        dataset: DataFrame with a ``rentNumber`` column; it is copied, not
            modified.
        number: count of trailing rows excluded from the test.
    """
    from statsmodels.tsa.stattools import adfuller as ADF

    data = dataset.copy()
    data = data.iloc[:len(data) - number]  # leave out the last `number` rows

    series = data['rentNumber']
    diff = 0
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
def arima_regression():
    """Run the ARIMA walkthrough on the sales data set.

    Reads ``arima_data.xls``, plots the series and its ACF/PACF, prints ADF
    and Ljung-Box results for the first difference, picks ``(p, q)`` by
    minimum BIC over a small grid with d fixed at 1, then fits the model and
    requests a forecast. Nothing is returned.
    """
    # Parameter initialisation.
    discfile = SRC_PATH + '/data/arima_data.xls'
    forecastnum = 5

    # Read the data, using the date column as the index.
    data = pd.read_excel(discfile, index_col=u'日期')

    # Time plot and autocorrelation plot of the original series.
    data.plot()
    plt.show()
    plot_acf(data).show()
    # print() calls replace the Python 2 print statements, which are syntax
    # errors under Python 3.
    print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

    # First difference.
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    D_data.plot()
    plt.show()
    print(data)
    print(D_data)
    plot_acf(D_data).show()   # autocorrelation plot
    plot_pacf(D_data).show()  # partial autocorrelation plot
    print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))  # stationarity test
    # White-noise test.
    print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)

    # Order selection; the orders are capped at length / 10.
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            try:
                tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
            except Exception:
                # Some (p, q) pairs fail to fit; leave a hole in the grid.
                # Catching Exception instead of a bare except lets
                # KeyboardInterrupt and SystemExit propagate.
                tmp.append(None)
        bic_matrix.append(tmp)
    bic_matrix = pd.DataFrame(bic_matrix)

    # stack() flattens the grid; idxmin() gives the (p, q) of the minimum.
    p, q = bic_matrix.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))

    model = ARIMA(data, (p, 1, q)).fit()
    model.summary2()             # model report (result is discarded)
    model.forecast(forecastnum)  # forecast `forecastnum` steps (result is discarded)
def check_steady(data):
    """Print the ADF result for ``data`` and whether it is stationary at 5%.

    Args:
        data: series accepted by statsmodels ``adfuller``.
    """
    from statsmodels.tsa.stattools import adfuller as ADF

    # Run the test once and reuse it; the original ran it three times.
    result = ADF(data)
    p_value = result[1]

    print(u'原始序列的ADF检验结果为:')
    # The result holds, in order: adf, pvalue, usedlag, nobs, critical
    # values, icbest.
    print('返回所有的信息:', result)
    print('返回P_VALUE:', p_value)
    print('---->>>>将计算获得的p-value与显著性水平数值0.05比较,大于该数值说明该序列不是平稳序列,反之是平稳序列!')
    if p_value < 0.05:
        print('----->>>>>STEADY!')
    else:
        print('----->>>>>NOT STEADY!')
def cal_d(df):
    """Return a differencing order (0, 1 or 2) for ``df``.

    Returns 0 when the ADF p-value of ``df.tmid`` or the lag-1 Ljung-Box
    value of ``df`` is below 0.05. Otherwise the first difference is tested:
    1 is returned when neither value is at or above 0.05, else 2.

    NOTE(review): the first check accepts when *either* test passes, while
    the check on the differenced data requires *both* -- confirm that the
    `or` in the first check is intended.
    """
    if ADF(df.tmid)[1] < 0.05 or acorr_ljungbox(df, lags=1)[1] < 0.05:
        return 0

    order = 1
    differenced = df.diff(periods=1, axis=0).dropna()
    # Stationarity test and white-noise test on the differenced data.
    while True:
        adf_fails = ADF(differenced.tmid)[1] >= 0.05
        if not (adf_fails or acorr_ljungbox(differenced, lags=1)[1] >= 0.05):
            return order
        differenced = differenced.diff(periods=1, axis=0).dropna()
        order += 1
        if order >= 2:
            return 2
def arima():
    """Run the ARIMA walkthrough on 49 random integers.

    Plots the series and its lag-3 difference, prints ADF results for both,
    picks ``(p, q)`` by minimum BIC over a small grid with d fixed at 1, then
    fits the model and requests a 5-step forecast. Nothing is returned.

    NOTE(review): ``random`` is not imported in this function; it is
    presumably imported at module level.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    test_data = [random.randint(1, 20) for i in range(49)]

    # Time plot.
    data = pd.Series(test_data)
    data.plot()
    plt.show()

    # Autocorrelation (import kept from the original; the plot call was
    # commented out there).
    from statsmodels.graphics.tsaplots import plot_acf

    # Stationarity test.
    from statsmodels.tsa.stattools import adfuller as ADF
    print('original ADF result is', ADF(data))

    # NOTE(review): this is a lag-3 difference, while the model below is
    # fitted on `data` with d=1 -- confirm this is intended.
    D_data = data.diff(3).dropna()
    D_data.plot()
    plt.show()
    print('Diffenciate ADF result is', ADF(D_data))

    from statsmodels.tsa.arima_model import ARIMA

    # Order selection.
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            try:
                tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
            except Exception:
                # A failed fit leaves a hole in the grid. Catching Exception
                # instead of a bare except lets KeyboardInterrupt propagate.
                tmp.append(None)
        bic_matrix.append(tmp)
    print(bic_matrix)

    # Flatten the grid and locate the minimum.
    bic_matrix = pd.DataFrame(bic_matrix)
    p, q = bic_matrix.stack().idxmin()
    print('BIC minimum p and q is', p, q)

    model = ARIMA(data, (p, 1, q)).fit()
    model.summary2()
    model.forecast(5)
def stationarity_test():
    """Print how many first differences make the disk series pass the ADF test.

    Reads ``discdata_processed.xls``, drops the last 5 rows, and differences
    column ``CWXT_DB:184:D:\\`` until the ADF p-value is below 0.05.
    """
    from statsmodels.tsa.stattools import adfuller as ADF

    discfile = './data/discdata_processed.xls'
    data = pd.read_excel(discfile)
    data = data.iloc[:len(data) - 5]

    series = data["CWXT_DB:184:D:\\"]
    diff = 0
    adf = ADF(series)
    while adf[1] >= 0.05:
        diff += 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print("原始序列经过{}阶差分后归于平稳,对应的p值为{}".format(diff, adf[1]))
def session_2():
    """Print how many first differences make the disk series pass the ADF test.

    Reads ``discdata_processed.csv``, leaves out the last 5 rows, and
    differences column ``CWXT_DB:184:D:\\`` until the ADF p-value is at most
    0.05.
    """
    data = pd.read_csv('discdata_processed.csv')

    # The last `predict_num` rows are not used here.
    predict_num = 5
    data = data.iloc[:len(data) - predict_num]

    # Stationarity test; adf[1] is the p-value.
    series = data['CWXT_DB:184:D:\\']
    diff = 0
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print('原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
def programmer_2():
    """Print how many first differences make the disk series pass the ADF test.

    Reads ``discdata_processed.xls``, leaves out the last 5 rows, and
    differences column ``CWXT_DB:184:D:\\`` until the ADF p-value is at most
    0.05.
    """
    discfile = "data/discdata_processed.xls"
    data = pd.read_excel(discfile)

    # Drop the last `predictnum` rows.
    predictnum = 5
    data = data.iloc[:len(data) - predictnum]

    # Stationarity test.
    series = data["CWXT_DB:184:D:\\"]
    diff = 0
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u"原始序列经过%s阶差分后归于平稳,p值为%s" % (diff, adf[1]))
def adf_test(ts):
    """Run the ADF test with AIC lag selection and return a labelled Series.

    Args:
        ts: series accepted by ``ADF``.

    Returns:
        pandas Series holding the first four result fields, followed by one
        ``Critical Value (<level>)`` entry per critical value.
    """
    result = ADF(ts, autolag='AIC')
    labels = [
        'Test Statistic',
        'p-value',
        'Lags Used',
        'Number of Observations Used',
    ]
    summary = pd.Series(result[0:4], index=labels)
    critical_values = result[4]
    for level in critical_values:
        summary['Critical Value (%s)' % level] = critical_values[level]
    return summary
def testStationarity(ts):
    """Run the ADF test on ``ts`` and print the result with field labels.

    Args:
        ts: series accepted by ``ADF``.
    """
    adf_result = ADF(ts)
    head_fields = adf_result[0:4]
    critical_values = adf_result[4]

    # Attach a readable label to each value of the raw result.
    report = pd.Series(
        head_fields,
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for level, threshold in critical_values.items():
        report['Critical Value (%s)' % level] = threshold
    print(report)
def evaluate(arima_inflow_data, arima_inflow_ground_truth, arima_outflow_data, arima_outflow_ground_truth):
    """Fit an ARIMA(0, 1, 0) per grid and score one-step forecasts.

    For every index ``i`` up to ``len(arima_outflow_ground_truth)``, the
    inflow and outflow histories are ADF-tested (results printed), fitted,
    and forecast one step ahead. The forecast errors feed both scores.

    Returns:
        Tuple ``(rmse, mean_abs_error)``.

    NOTE(review): the second value is a mean absolute error, although the
    original code stored it under a "mape" name -- confirm which is wanted.
    """
    errors = []
    for grid in range(len(arima_outflow_ground_truth)):
        # History and ground truth for this grid.
        inflow_series = pd.Series(arima_inflow_data[grid])
        inflow_truth = arima_inflow_ground_truth[grid]
        outflow_series = pd.Series(arima_outflow_data[grid])
        outflow_truth = arima_outflow_ground_truth[grid]

        # ADF diagnostics on the raw and first-differenced histories.
        print('original in ADF result is', ADF(inflow_series, 1))
        inflow_differenced = inflow_series.diff(1).dropna()
        print('Diffenciate in ADF result is', ADF(inflow_differenced, 1))
        print('original out ADF result is', ADF(outflow_series, 1))
        outflow_differenced = outflow_series.diff(1).dropna()
        print('Diffenciate out ADF result is', ADF(outflow_differenced, 1))

        # Fixed orders. Scores recorded in the original for (p, q):
        # (0, 0) 0.01600, (1, 0) 0.023, (1, 1) 0.026, (0, 1) 0.01964.
        order = (0, 1, 0)

        inflow_fit = ARIMA(inflow_series.values, order).fit()
        inflow_forecast, _stderr, _conf = inflow_fit.forecast(1)
        inflow_error = inflow_forecast - inflow_truth

        outflow_fit = ARIMA(outflow_series.values, order).fit()
        outflow_forecast, _stderr, _conf = outflow_fit.forecast(1)
        outflow_error = outflow_forecast - outflow_truth

        errors.extend((inflow_error, outflow_error))

    squared = np.square(np.array(errors))
    return np.mean(squared)**0.5, np.mean(np.abs(errors))
def testing(data):
    """Print the ADF result and the lag-1 Ljung-Box result for ``data['volume']``."""
    adf_label = '原始序列的ADF平衡性检验的结果为:'
    print(adf_label, ADF(data['volume']))

    white_noise_label = '原始序列的白噪声检验的结果为:'
    print(white_noise_label, acorr_ljungbox(data['volume'], lags=1))
def caculate_ADF(data):
    """Print the raw ADF result for ``data``.

    Assuming ``ADF`` is statsmodels ``adfuller`` (confirm against the file's
    imports), the result holds, in order: adf, pvalue, usedlag, nobs,
    critical values, icbest. A sample result kept from the original comment:

        -0.0, 0.95853208606005602, 8, 10,
        {'1%': -4.3315729999999997, '5%': -3.2329500000000002,
         '10%': -2.7486999999999999},
        -414.96637673426136
    """
    adf_result = ADF(data)
    print(u'原始序列的ADF检验结果为:', adf_result)
def stationarityTest():
    """Print how many first differences make the disk series pass the ADF test.

    Reads ``discdata_processed.xls``, leaves out the last 5 rows, and
    differences column ``CWXT_DB:184:D:\\`` until the ADF p-value is at most
    0.05.
    """
    from statsmodels.tsa.stattools import adfuller as ADF

    discfile = 'data/discdata_processed.xls'
    predictnum = 5

    data = pd.read_excel(discfile)
    data = data.iloc[:len(data) - predictnum]

    # Stationarity test.
    series = data['CWXT_DB:184:D:\\']
    diff = 0
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
def stationarityTest(data):
    """Return how many first differences make ``data['Y']`` pass the ADF test.

    Args:
        data: DataFrame (or mapping) whose ``'Y'`` entry is a pandas Series.

    Returns:
        The number of first differences applied before the ADF p-value fell
        below 0.05.
    """
    from statsmodels.tsa.stattools import adfuller as ADF

    k = 0
    xdata = data['Y']
    adf = ADF(xdata)  # stationarity test on the original series
    while adf[1] >= 0.05:
        k = k + 1
        # Iterated first difference: `xdata.diff(k)` on the raw series would
        # be a lag-k difference, not the k-th order difference reported.
        xdata = xdata.diff().dropna()
        adf = ADF(xdata)
    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (k, adf[1]))
    return k
def Model_Determination(data):
    """Determine the ARIMA(p, k, q) orders for the ``Open`` column of ``data``.

    The ``Open`` series is first-differenced until both the ADF p-value and
    the lag-1 Ljung-Box p-value are below 0.05; ``k`` is the number of
    differences applied. ``p`` and ``q`` come from ``order_determination``.

    Args:
        data: stock DataFrame with an ``Open`` column.

    Returns:
        Tuple ``(p, k, q)``.
    """
    p, k, q = 0, 0, 0

    # Start from the original series. D_data stays the full frame when no
    # differencing is needed, as in the original.
    D_data = data
    series = data['Open']
    ADF_p = ADF(series)[1]
    acorr_ljungbox_p = list(acorr_ljungbox(series, lags=1)[1])[0]

    # Difference until the series is stationary and not white noise.
    while (ADF_p >= 0.05) or (acorr_ljungbox_p >= 0.05):
        k += 1
        # Iterated first difference: `diff(periods=k)` on the raw column would
        # be a lag-k difference, not the k-th order difference that k denotes.
        series = series.diff().dropna()
        D_data = series
        ADF_p = ADF(D_data)[1]
        acorr_ljungbox_p = list(acorr_ljungbox(D_data, lags=1)[1])[0]

    p, q = order_determination(data, D_data, k)
    return p, k, q
def stability_test(retrun_series):
    """Report whether the return series is stationary at the 5% level.

    The ADF test statistic is compared with the 5% critical value: a
    statistic above it means the unit-root hypothesis is not rejected.

    Args:
        retrun_series: series accepted by ``ADF``.
    """
    statitstic = ADF(retrun_series)
    # Element 0 is the test statistic. The original read element 1 (the
    # p-value), which cannot be meaningfully compared with a critical value
    # of the statistic.
    t_s = statitstic[0]
    t_c = statitstic[4]["5%"]

    if t_s > t_c:
        output("平稳性检验:存在单位根,时间序列不平稳")
    else:
        output("平稳性检验:不存在单位根,时间序列平稳")

    output(f"ADF检验结果:{statitstic}\n")
def diff(timeseries):
    """Print ADF results for 0, 1 and 2 differences of ``timeseries`` and plot them.

    Args:
        timeseries: DataFrame with a ``value`` column.
    """
    first = timeseries.diff(1)
    second = first.diff(1)
    # NaNs are replaced only after both differences are taken.
    first = first.fillna(0)
    second = second.fillna(0)

    adf_original = ADF(timeseries['value'].tolist())
    adf_first = ADF(first['value'].tolist())
    adf_second = ADF(second['value'].tolist())

    reports = (
        ('timeseries_adf : ', adf_original),
        ('timeseries_diff1_adf : ', adf_first),
        ('timeseries_diff2_adf : ', adf_second),
    )
    for label, result in reports:
        print(label, result)

    plt.figure(figsize=(16, 12))
    curves = (
        (timeseries, 'Original', 'blue'),
        (first, 'Diff1', 'red'),
        (second, 'Diff2', 'purple'),
    )
    for curve, label, color in curves:
        plt.plot(curve, label=label, color=color)
    plt.legend(loc='best')
    plt.show()
def random_series():
    """Plot 100 uniform random samples and print their ADF and Ljung-Box results.

    The ACF and PACF plots of the samples are shown afterwards.
    """
    samples = np.random.rand(100)  # the random series under test
    plt.plot(samples)
    plt.show()

    adf_result = ADF(samples)
    print('ADF平衡性检验的结果为:', adf_result)
    white_noise_result = acorr_ljungbox(samples, lags=1)
    print('白噪声检验的结果为:', white_noise_result)

    for draw in (plot_acf, plot_pacf):
        draw(samples).show()
        plt.show()
def stability_test(close_price):
    """Report whether the close-price series is stationary at the 10% level.

    The ADF test statistic is compared with the 10% critical value: a
    statistic above it means the unit-root hypothesis is not rejected.

    Args:
        close_price: series accepted by ``ADF``.
    """
    statitstic = ADF(close_price)
    # Element 0 is the test statistic. The original read element 1 (the
    # p-value), which cannot be meaningfully compared with a critical value
    # of the statistic.
    t_s = statitstic[0]
    t_c = statitstic[4]["10%"]

    if t_s > t_c:
        output("第三步:平稳性检验:存在单位根,时间序列不平稳")
    else:
        output("第三步:平稳性检验:不存在单位根,时间序列平稳")

    output(f"ADF检验结果:{statitstic}\n")
def AdfTest(index_list):
    """Return 1 when ``index_list`` is judged stationary by the ADF test, else 0.

    The series counts as stationary when at most one critical value lies
    below the test statistic and the p-value is below 0.01.

    The ADF result holds, in order: adf, pvalue, usedlag, nobs, critical
    values, icbest.
    """
    result = ADF(index_list)
    statistic = result[0]
    p_value = result[1]
    critical_values = result[4]

    # Number of critical values lying below the test statistic.
    below = sum(1 for threshold in critical_values.values() if threshold < statistic)

    return 1 if below <= 1 and p_value < 0.01 else 0
def stationarityTest():
    """Print how many first differences make the disk series pass the ADF test.

    A stationarity check guards against spurious regression caused by
    stochastic or deterministic trends in the raw series; the unit-root
    (ADF) test is used here.

    Reads ``discdata_processed.xls``, leaves out the last 5 rows, and
    differences column ``CWXT_DB:184:D:\\`` until the ADF p-value is at most
    0.05.
    """
    # Parameter initialisation.
    discfile = "G:\\# Project\\数据集\\UsingDataSet\\Python数据分析与挖掘\\discdata_processed.xls"
    data = pd.read_excel(discfile)

    # Drop the last `predictnum` rows.
    predictnum = 5
    data = data.iloc[:len(data) - predictnum]

    # Stationarity test; adf[1] is the p-value.
    series = data["CWXT_DB:184:D:\\"]
    diff = 0
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u"原始序列经过%s阶差分后归于平稳,p值为%s" % (diff, adf[1]))
def stationarityTest():
    """Print how many first differences make the disk series pass the ADF test.

    A stationarity check guards against spurious regression caused by
    stochastic or deterministic trends in the raw series. The ADF unit-root
    test is used; a p-value of at most 0.05 ends the differencing loop.

    Reads ``discdata_processed.csv``, leaves out the last 5 rows, and
    differences column ``CWXT_DB:184:D:\\``.
    """
    from statsmodels.tsa.stattools import adfuller as ADF  # unit-root test

    discfile = 'data/discdata_processed.csv'
    predictnum = 5

    data = pd.read_csv(discfile)
    data = data.iloc[: len(data) - predictnum]

    # Stationarity test.
    series = data['CWXT_DB:184:D:\\']
    diff = 0
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Iterated first difference: `.diff(diff)` on the raw column would be
        # a lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
def test_parameters(sel_frame, target, params):
    """Print ADF / Ljung-Box diagnostics and the BIC-optimal (p, q) for ARIMA.

    Args:
        sel_frame: DataFrame with a ``createtime`` column, used as the index.
        target: name of the column to run the diagnostics on.
        params: sequence whose element 1 is the differencing order d used for
            every candidate model.

    NOTE(review): the candidate models are fitted on the whole re-indexed
    frame, not on the ``target`` column alone -- confirm this is intended.
    """
    from statsmodels.tsa.stattools import adfuller as ADF
    from statsmodels.stats.diagnostic import acorr_ljungbox

    indexed = sel_frame.set_index(['createtime'])
    series = indexed[target]

    # Stationarity test, then white-noise test (statistic and p-value).
    print(u'原始序列的ADF检验结果为(第一个返回值为adf,若小于1%5%10%均值则为平稳序列,d=0):', ADF(series))
    print(u'差分序列的白噪声检验结果为(p值):', acorr_ljungbox(series, lags=1))

    # Upper bound for p and q: series length / 100.
    max_order = int(len(series) / 100)

    def _bic_or_none(ar_order, ma_order):
        """Return the BIC of one fitted candidate; print the error and return None on failure."""
        try:
            return ARIMA(indexed, (ar_order, params[1], ma_order)).fit().bic
        except Exception as err:
            print(err)
            return None

    # BIC grid: rows are p, columns are q.
    grid = [
        [_bic_or_none(ar_order, ma_order) for ma_order in range(max_order + 1)]
        for ar_order in range(max_order + 1)
    ]
    bic_table = pd.DataFrame(grid)

    # stack() flattens the grid; idxmin() gives the position of the minimum.
    best_p, best_q = bic_table.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (best_p, best_q))
    plt.show()
def checkADF_d(y_ori, diffbegin, diffend):
    """Look for a differencing lag at which ``y_ori`` passes the ADF test.

    If the ``VVALUE`` data of ``y_ori`` already has an ADF p-value below 0.05,
    lag 0 is used. Otherwise each lag ``i`` in ``range(diffbegin, diffend)``
    is tried with ``y_ori.diff(i)``; the loop stops at the first lag that
    also passes the lag-1 Ljung-Box check.

    Args:
        y_ori: pandas object convertible with ``DataFrame(...)`` that exposes
            ``VVALUE`` -- presumably a Series named ``VVALUE``; confirm.
        diffbegin: first lag to try (inclusive).
        diffend: end of the lag range (exclusive).

    Returns:
        Tuple ``(d, y_check)``: the last lag examined and the last series
        that passed the ADF check.

    NOTE(review): ``y`` is not defined in this function; presumably a
    module-level name -- confirm (it may have been meant to be ``y_check``).
    NOTE(review): ``y_check`` is unbound at the return when no lag passes the
    ADF check, and ``d`` is unbound when the range is empty.
    NOTE(review): the source formatting was lost; the final ``else`` is
    attached here to ``if pvalue < 0.05`` -- confirm it was not a ``for``/``else``.
    """
    if ADF(DataFrame(y_ori)[u'VVALUE'])[1] < 0.05:
        pvalue = ADF(DataFrame(y_ori)[u'VVALUE'])[1]
        y_check = y_ori
        d = 0
        print('%s阶差分,pvalue:%s' % (0, pvalue))
        print(u'差分序列的ADF检验结果为', ADF(DataFrame(y)[u'VVALUE']))
    else:
        for i in range(diffbegin, diffend):  # caller-chosen lag range; 1 to 9 is suggested
            y_dif = y_ori.diff(i).dropna()
            y_dif.columns = [u'VVALUE_dif']
            pvalue = ADF(DataFrame(y_dif)[u'VVALUE'])[1]
            d = i
            if pvalue < 0.05:  # p clearly below 0.05: the differenced series is stationary
                print('%s阶差分,pvalue:%s' % (i, pvalue))
                print(u'差分序列的ADF检验结果为', ADF(DataFrame(y_dif)[u'VVALUE']))
                y_check = y_dif
                # Time plot of the adjusted series (original or differenced).
                y.plot()
                plt.show()
                # Ljung-Box returns the statistic and the p-value.
                if float(acorr_ljungbox(y, lags=1)[1]) < 0.05:
                    print(u'原序列的白噪声检验结果通过为:', acorr_ljungbox(y, lags=1))
                    break
                else:
                    print(u'原序列的白噪声检验结果:当前序列无法拒绝假设,失败为:', acorr_ljungbox(y, lags=1))
                    print(u'该差分下模型没有意义', acorr_ljungbox(y, lags=1))
                    # A p-value below 0.05 means a stationary, non-white-noise
                    # series; above 0.05 the series is white noise and carries
                    # no usable information.
                    continue
            else:
                print('%s阶差分不能满足要求,结束' % i)
    return d, y_check