def getLikelihood(endog, exog, order=None, n_forecasted_data=1):
    """Fit an ARIMA model to ``endog`` and return point forecasts.

    Parameters
    ----------
    endog : array-like
        Endogenous (target) series to model.
    exog : array-like
        Exogenous series; used only by the order-search objective ``objfunc``.
    order : tuple or None
        ARIMA (p, d, q) order. When None, a small brute-force grid search
        over ``objfunc`` chooses it.
    n_forecasted_data : int
        Number of forecast steps.

    Returns
    -------
    Array of forecast values (first element of ``ARIMA.forecast``'s tuple).
    """
    # Automatically determine the (p, d, q) order when not supplied.
    if order is None:
        from scipy.optimize import brute
        grid = (slice(1, 3, 1), slice(1, 3, 1), slice(0, 3, 1))
        print("############################################")
        print(endog)
        print("############################################")
        try:
            order = brute(objfunc, grid, args=(exog, endog), finish=None)
            order = order.astype(int)
        except Exception:
            # Fall back to a fixed order if the grid search fails.
            # (Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # still propagate.)
            order = [1, 1, 3]
    # Fit the model on the full endogenous series with the chosen order.
    print("*********************************************")
    print("Choose order of ", order)
    print("*********************************************")
    model = ARIMA(endog, order).fit(full_output=False, disp=False)
    # forecast() returns (forecast, stderr, conf_int); keep the forecasts only.
    x = model.forecast(n_forecasted_data)
    return x[0]
# fit and forecasting model model model1_x = ExponentialSmoothing(history1, seasonal_periods=7, seasonal='add', trend='add').fit() y1_x = model1_x.forecast(steps=7) # to predict one steps into the future model1_z = ExponentialSmoothing(history2, seasonal_periods=7, seasonal='add', trend='add').fit() y1_z = model1_z.forecast(steps=7) # to predict one steps into the future model2_x = ARIMA(history1, order=(0, 1, 1)).fit(disp=0) y2_x = model2_x.forecast(steps=7) model2_z = ARIMA(history2, order=(2, 0, 0)).fit(disp=0) y2_z = model2_z.forecast(steps=7) model3_x = sm.tsa.statespace.SARIMAX(history1, order=(1, 1, 1), seasonal_order=(1, 1, 0, 12), enforce_stationarity=False, enforce_invertibility=False).fit() y3_x = model3_x.forecast(steps=1) model3_z = sm.tsa.statespace.SARIMAX(history2, order=(1, 1, 1), seasonal_order=(1, 1, 0, 12), enforce_stationarity=False,
qmax = int(len(D_O3) / 10) #一般阶数不超过length/10 e_matrix = [] #评价矩阵 for p in range(pmax + 1): tmp = [] for q in range(qmax + 1): try: #存在部分报错,所以用try来跳过报错。 tmp.append(ARIMA(O3last2month, (p, 1, q)).fit().aic) except: tmp.append(None) e_matrix.append(tmp) e_matrix = pd.DataFrame(e_matrix) #从中可以找出最小值 p, q = e_matrix.stack().idxmin() #先用stack展平,然后用找出最小值位置。 print('AIC最小的p值和q值为:%s、%s' % (p, q)) model = ARIMA(O3last2month, (p, 1, q)).fit() model.summary2() #给出模型报告 print(model.forecast(5)) #作为期5天的预测,返回预测结果、标准误差、置信区间。 preO3 = model.forecast(1)[0] #PM2.5 from statsmodels.graphics.tsaplots import plot_acf plot_acf(data10.loc[:, 'PM2.5']) from statsmodels.stats.diagnostic import acorr_ljungbox print('白噪声-检验结果:', acorr_ljungbox(data1.loc[:, 'PM2.5'], lags=1)) from statsmodels.tsa.stattools import adfuller as ADF print('ADF-检验结果:', ADF(data10.loc[:, 'PM2.5'])) PM25last2month = data10.iloc[-60:, 2] from statsmodels.tsa.arima_model import ARIMA PM25last2month = PM25last2month.astype(float) pmax = int(len(PM25last2month) / 10) #一般阶数不超过length/10 qmax = int(len(PM25last2month) / 10) #一般阶数不超过length/10 e_matrix = [] #评价矩阵
print('差分序列的ADF检验结果为:', ADF(D_data['销量差分'])) # 平稳性检测 # 白噪声检验 from statsmodels.stats.diagnostic import acorr_ljungbox print('差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1)) # 返回统计量和p值 from statsmodels.tsa.arima_model import ARIMA # 定阶 data['销量'] = data['销量'].astype(float) pmax = int(len(D_data) / 10) # 一般阶数不超过length/10 qmax = int(len(D_data) / 10) # 一般阶数不超过length/10 bic_matrix = [] # BIC矩阵 for p in range(pmax + 1): tmp = [] for q in range(qmax + 1): try: # 存在部分报错,所以用try来跳过报错。 tmp.append(ARIMA(data, (p, 1, q)).fit().bic) except: tmp.append(None) bic_matrix.append(tmp) bic_matrix = pd.DataFrame(bic_matrix) # 从中可以找出最小值 p, q = bic_matrix.stack().idxmin() # 先用stack展平,然后用idxmin找出最小值位置。 print('BIC最小的p值和q值为:%s、%s' % (p, q)) model = ARIMA(data, (p, 1, q)).fit() # 建立ARIMA(0, 1, 1)模型 print('模型报告为:\n', model.summary2()) print('预测未来5天,其预测结果、标准误差、置信区间如下:\n', model.forecast(5))
def get_moving_average_growth_rate_and_prediction(input_filename, state_name='Karnataka'):
    """Build three report figures from an India COVID-19 CSV and save them.

    1) 5-day moving average of daily confirmed cases for the top-15 states,
    2) growth rate per state, 3) a 30-step ARIMA forecast for `state_name`.
    Figures are written under 'coronavirus_reports/' and 'static/'.
    Relies on module-level helpers calc_movingaverage / calc_growthRate.
    """
    matplotlib.use('Agg')  # headless backend: figures are saved, not shown
    india_covid_19 = pd.read_csv(input_filename)  # 1st problem
    india_covid_19['Date'] = pd.to_datetime(india_covid_19['Date'], dayfirst=True)
    all_state = list(india_covid_19['State/UnionTerritory'].unique())
    all_state.remove('Unassigned')
    latest = india_covid_19[india_covid_19['Date'] > '30-01-20']
    # NOTE(review): tuple-style column selection after groupby is deprecated
    # in newer pandas; this targets the pandas version it was written for.
    state_cases = latest.groupby('State/UnionTerritory')[
        'Confirmed', 'Deaths', 'Cured'].max().reset_index()
    # NOTE(review): Active = Confirmed - (Deaths - Cured); the usual formula
    # is Confirmed - Deaths - Cured — confirm the intended sign.
    latest['Active'] = latest['Confirmed'] - (latest['Deaths'] - latest['Cured'])
    state_cases = state_cases.sort_values('Confirmed', ascending=False).fillna(0)
    states = list(state_cases['State/UnionTerritory'][0:15])  # top 15 by confirmed
    states_confirmed = {}
    states_deaths = {}
    states_recovered = {}
    states_active = {}
    states_dates = {}
    for state in states:
        df = latest[latest['State/UnionTerritory'] == state].reset_index()
        k = []
        l = []
        m = []
        n = []
        # day-over-day differences (daily new counts) for each metric
        for i in range(1, len(df)):
            k.append(df['Confirmed'][i] - df['Confirmed'][i - 1])
            l.append(df['Deaths'][i] - df['Deaths'][i - 1])
            m.append(df['Cured'][i] - df['Cured'][i - 1])
            n.append(df['Active'][i] - df['Active'][i - 1])
        states_confirmed[state] = k
        states_deaths[state] = l
        states_recovered[state] = m
        states_active[state] = n
        date = list(df['Date'])
        states_dates[state] = date[1:]  # diffs start at the second date
    # ---- figure 1: 5-day moving average of confirmed cases ----------------
    fig = plt.figure(figsize=(25, 17))
    plt.suptitle('5-Day Moving Average of Confirmed Cases in Top 15 States',
                 fontsize=20, y=1.0)
    k = 0
    # NOTE(review): range(1, 15) draws 14 subplots, not 15 — confirm.
    for i in range(1, 15):
        ax = fig.add_subplot(5, 3, i)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%b'))
        ax.bar(states_dates[states[k]], states_confirmed[states[k]],
               label='Day wise Confirmed Cases ')
        moving_aves = calc_movingaverage(states_confirmed[states[k]], 5)
        ax.plot(states_dates[states[k]][:-2], moving_aves, color='red',
                label='Moving Average', linewidth=3)
        plt.title(states[k], fontsize=20)
        handles, labels = ax.get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper left')
        k = k + 1
    plt.tight_layout(pad=3.0)
    # First output
    moving_average_fig = fig
    filename = 'coronavirus_reports/' + datetime.date.today().strftime(
        "%Y-%m-%d") + '_00-00-00_' + 'coronavirus-MovingAverageGraph.png'
    moving_average_fig.savefig(filename)
    filename = 'static/' + str(datetime.date.today(
    )) + '_00-00-00_' + 'coronavirus-MovingAverageGraph.png'
    moving_average_fig.savefig(filename)
    # ---- figure 2: growth rate per state ----------------------------------
    fig = plt.figure(figsize=(25, 17))
    plt.suptitle('Growth Rate in Top 15 States', fontsize=20, y=1.0)
    k = 0
    for i in range(1, 15):
        ax = fig.add_subplot(5, 3, i)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%b'))
        #ax.bar(states_dates[states[k]],states_confirmed[states[k]],label = 'Day wise Confirmed Cases ')
        growth_rate = calc_growthRate(states_confirmed[states[k]])
        ax.plot_date(states_dates[states[k]][21:], growth_rate[20:],
                     color='#9370db', label='Growth Rate', linewidth=3,
                     linestyle='-')
        plt.title(states[k], fontsize=20)
        handles, labels = ax.get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper left')
        k = k + 1
    plt.tight_layout(pad=3.0)
    growth_rate_graph_fig = fig
    filename = 'coronavirus_reports/' + datetime.date.today().strftime(
        "%Y-%m-%d") + '_00-00-00_' + 'coronavirus-GrowthRateGraph.png'
    growth_rate_graph_fig.savefig(filename)
    filename = 'static/' + str(datetime.date.today(
    )) + '_00-00-00_' + 'coronavirus-GrowthRateGraph.png'
    growth_rate_graph_fig.savefig(filename)
    # ---- figure 3: ARIMA(5,1,0) 30-day forecast for `state_name` ----------
    k = india_covid_19[india_covid_19['State/UnionTerritory'] ==
                       state_name].iloc[:, [1, 8]]
    data = k.values  # NOTE(review): dead store, overwritten on the next line
    data = k
    arima = ARIMA(data['Confirmed'], order=(5, 1, 0))
    arima = arima.fit(trend='c', full_output=True, disp=True)
    forecast = arima.forecast(steps=30)
    pred = list(forecast[0])  # point forecasts only
    # build the 30 consecutive dates after the last observed date
    start_date = data['Date'].max()
    prediction_dates = []
    for i in range(30):
        date = start_date + datetime.timedelta(days=1)
        prediction_dates.append(date)
        start_date = date
    fig = plt.figure(figsize=(15, 10))
    plt.xlabel("Dates", fontsize=20)
    plt.ylabel('Total cases', fontsize=20)
    # NOTE(review): title says 15 days but 30 steps are forecast — confirm.
    plt.title("Predicted Values for the next 15 Days for "
              + state_name, fontsize=20)
    plt.plot_date(y=pred, x=prediction_dates, linestyle='dashed',
                  color='#ff9999', label='Predicted')
    plt.plot_date(y=data['Confirmed'], x=data['Date'], linestyle='-',
                  color='blue', label='Actual')
    plt.legend()
    prediction_fig = fig
    filename = 'coronavirus_reports/' + str(datetime.date.today(
    )) + '_00-00-00_' + 'coronavirus_Prediction_' + state_name + '.png'
    prediction_fig.savefig(filename)
    filename = 'static/' + str(datetime.date.today(
    )) + '_00-00-00_' + 'coronavirus_Prediction_' + state_name + '.png'
    prediction_fig.savefig(filename)
def make_forecast(self, data_df):
    """Fit an ARIMA model on ``data_df`` and return its point forecasts.

    The (p, d, q) order comes from ``self.best_model_order`` and the
    horizon from ``self.steps``. Only the forecast values — not the
    standard errors or confidence intervals — are returned, as a list.
    """
    fitted = ARIMA(data_df, order=self.best_model_order).fit(disp=False)
    point_forecasts = fitted.forecast(steps=self.steps)[0]
    return point_forecasts.tolist()
# 白噪声检验 from statsmodels.stats.diagnostic import acorr_ljungbox # 返回统计量和p值 print(u"差分序列的白噪声检验结果为:{}".format(acorr_ljungbox(D_data, lags=1))) from statsmodels.tsa.arima_model import ARIMA # 模型定阶 data[u'销量'] = data[u'销量'].astype(float) # 注意训练时序模型时要传进去的是float型 pmax = int(len(D_data) / 10) # 一般阶数不超过长度的十分之一 qmax = int(len(D_data) / 10) bic_mat = [] for p in range(pmax + 1): tmp = [] for q in range(qmax + 1): try: # 拟合原序列 # 人为观察出来用MA(1)模型拟合差分序列,即对1阶差分后的原数据进行ARIMA(p,1,q)模型 tmp.append(ARIMA(data, (p, 1, q)).fit().bic) except: tmp.append(None) bic_mat.append(tmp) bic_mat = pd.DataFrame(bic_mat) print(bic_mat) p, q = bic_mat.stack().idxmin() print(u"bic最小的p值和q值为:{},{}".format(p, q)) # 建立ARIMA(0,1,1)模型 model = ARIMA(data, (p, 1, q)).fit() print(model.summary2()) # 作为期5天的预测,返回预测结果,标准误差,置信区间 print(model.forecast(5))
plot_pacf(data['AvgNetFare'], lags=30)  # partial autocorrelation plot
plt.show()
len(data) - 30  # NOTE(review): no-op expression — presumably a notebook cell
# hold out the tail of the series as the test set
xTrain, xTest = data['AvgNetFare'][:406], data['AvgNetFare'][406:]
#len(xTest)
xTest
arima = ARIMA(xTrain, order=(10, 2, 1))
arima = arima.fit()
arima.summary()
pred = arima.forecast(steps=len(xTest))  # (values, stderr, conf_int)
print(mean_squared_error(xTest, pred[0]))  # MSE on the point forecasts
print(np.sqrt(mean_squared_error(xTest, pred[0])))  # RMSE
#pred
ax = arima.plot_predict(start='2019-05-12', end='2019-06-10')
ax.set_figheight(9)
ax.set_figwidth(19)
import itertools
"""Auto Arima"""
# fragment cut mid-call: auto_arima keyword arguments continue elsewhere
auto = auto_arima(xTrain,
                  start_p=0,
# @Author : Aries
# @Site :
# @File : arima_model_check.py
# @Software: PyCharm
# Model diagnostics: fit an ARIMA model and Ljung-Box-test its residuals.
import pandas as pd

# -- parameters --------------------------------------------------------------
discfile = u'拓展思考样本数据.xls'
lagnum = 12  # number of residual lags for the Ljung-Box test

data = pd.read_excel(discfile, index_col=u'日期')
xdata = data[u'日志类告警']
print(xdata)

from statsmodels.tsa.arima_model import ARIMA

# Build and train the model. The order actually fitted is (1, 1, 4);
# the original comments/messages mislabelled it as ARIMA(0,1,1).
arima = ARIMA(xdata.astype(float), (1, 1, 4)).fit()
xdata_pred = arima.predict(typ='levels')  # in-sample prediction on levels
print(xdata_pred)
print(arima.forecast(2))

pred_error = (xdata_pred - xdata).dropna()  # residuals

from statsmodels.stats.diagnostic import acorr_ljungbox  # white-noise test

lb, p = acorr_ljungbox(pred_error, lags=lagnum)
h = (p < 0.05).sum()  # p < 0.05 means the residuals are NOT white noise
if h > 0:
    print(u'模型ARIMA(1,1,4)不符合白噪声检验')
else:
    print(u'模型ARIMA(1,1,4)符合白噪声检验')
# print(data.head(3)) data['销量'] = data['销量'].astype(float) # print(data.head(3)) p_max = int(len(D_data) / 10) # 一般不超过len/10 q_max = int(len(D_data) / 10) # 一般不超过len/10 # print(p_max) bic_matrix = [] for p in range(p_max + 1): temp = [] for q in range(q_max + 1): try: temp.append(ARIMA(data, (p, 1, q)).fit().bic) except: temp.append(None) bic_matrix.append(temp) # print(bic_matrix) bic_df = pd.DataFrame(bic_matrix) # print(bic_df) p, q = bic_df.stack().idxmin() # 先用stack展平,再用idxmin找出最小值位置 print("BIC中p和q分别为: {p}、{q}".format(p=p, q=q)) model = ARIMA(data, (p, 1, q)).fit() # 建立模型 print('输出模型报告:', '\n', model.summary2()) print('输出预测5的结果:', '\n', model.forecast(5)) # 预测值、标准误差、置信区间 # print(model.summary.tables[1])
# BIC grid search over (p, q); `pmax`, `qmax`, `data` and `bic_matrix`
# are defined by earlier code outside this fragment.
for p in range(pmax+1):
    tmp = []
    for q in range(qmax+1):
        try:  # some fits raise; skip the failures
            tmp.append(ARIMA(data,(p,1,q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)
bic_matrix = pd.DataFrame(bic_matrix)  # so the minimum can be located
print(bic_matrix)
p,q = bic_matrix.stack().idxmin()  # flatten with stack, then the minimum position
print('bic最小的p和最小的q为: %s \ %s' %(p,q))
model = ARIMA(data,(p,1,q)).fit()  # build ARIMA(p, 1, q)
result = model.summary2()
print(result)
test = model.forecast(5)  # 5-day forecast: values, std errors, conf intervals
print(test)
def time_series(datas):
    """ARIMA pipeline driven by a front-end request payload ``datas``.

    Loads data via db_data/parse_datas, runs stationarity and white-noise
    tests, selects (p, q) (or takes them from the payload on a callback),
    validates the fit's residuals and forecasts the next periods.
    Returns a response dict for the front end.
    """
    from flask import request
    # request payload (previously taken straight from request.get_json())
    # datas = request.get_json()
    # print(datas)
    # data_input, callback_flag = parse_datas(datas)
    frame, callback_flag, db_map = db_data(datas)
    file_data, data, train_data, pred_data, step_month, date_next_list = parse_datas(
        frame, db_map)
    k = stationarityTest(data)
    if_black = whitenoiseTest(data)
    p, q = findOptimalpq(train_data, k)
    model_if_white, pred = arimaModelCheck(train_data, p, k, q)
    result, R2_score, assess = calErrors(pred_data, pred)
    # test data
    callback_flag = datas['callbackFlag']
    print('callback_flag', callback_flag)
    # parameters supplied by the front end
    input_p = datas['data']['selected_thisTime']['p']
    input_q = datas['data']['selected_thisTime']['q']
    # k = stationarityTest()  # stationarity test, returns the diff order k
    print('k', k)
    if k <= 5:  # by convention, difference at most 5 times
        # if_black = whitenoiseTest()  # white-noise test; 1 == not white noise
        if if_black == 1:  # non-white-noise series: information to extract
            if callback_flag == 0:
                p, q = findOptimalpq(train_data, k)  # computed optimal p, q
            else:
                p, q = input_p, input_q  # callback_flag == 1: p, q from the front end
            print('p', p)
            print('q', q)
            model_if_white, pred = arimaModelCheck(train_data, p, k, q)
            if model_if_white == 1:  # residuals are white noise: proceed
                result, R2_score, assess = calErrors(pred_data, pred)
                print(result)
                # model the full sample for out-of-sample prediction
                xdata = file_data['Y']
                # build and train the model
                arima = ARIMA(xdata, (p, k, q)).fit()
                # NOTE(review): `predictnum` is not defined in this function —
                # presumably a module-level constant; confirm.
                predict = arima.forecast(predictnum)[0]  # point forecasts only
                print('predict', predict)
                predict_list = []
                for i in range(len(predict)):
                    out_put = {"date": date_next_list[i], "Y": predict[i]}
                    predict_list.append(out_put)
                print("预测下5个月份/季度的数据", predict_list)
                if_callback = 0
            else:  # residuals not white noise: p, q must be re-tuned
                predict_list = []
                R2_score = ""
                assess = ""
                if_callback = 1  # tell the front end a callback is mandatory
    else:
        print("注意:该数据不适合建立时间序列模型!")
    # NOTE(review): when k > 5 (or if_black != 1), `predict_list` and
    # `if_callback` are never assigned and the dict below raises NameError —
    # confirm the intended behaviour for those paths.
    return_data = {
        "pass_data": {},
        "display_data": predict_list,
        "display_data_type": "",
        "model_assess": {
            "模型评分": R2_score,
            "模型评价": assess
        },
        "if_display": 0,
        "display_info": ["display_data", "model_assess"],
        "if_callback": if_callback,
        "args": {
            "list": ["p", "q"],
            "selected_thisTime": {
                "p": p,
                "q": q
            },
            "selected_lastTime": {},
            "args_display_type": "select",
            "args_info": "本次操作说明:需要调整参数p和q的值,步长为1, 取值范围[0,5]。其中,p是自回归(AR)的项数,用来获取自变量;q是移动平均(MA)的项数,为了使其光滑"
        },
        "return_data_instructions": "if_callback为0时,可继续下一步也可回调,请将该节点的结果data保存,并传给下一节点使用;if_callback为1时,表示强制回调。",
        "others": ""
    }
    print(return_data)
    return return_data
# we therefore use training length of 53 to train our model dtw_forecast = dtw_pred(test.cumulative_cases, 38, 30) simple_plot(test.cumulative_cases[-30:]) simple_plot(dtw_forecast) res = test.cumulative_cases[-30:] - dtw_forecast res = sum(res**2) # residual of 1.26E9, not bad! but not the best obviously simple_plot(test.cumulative_cases) ##################### final model summary #arima ON_cases = covid[covid.province == 'Ontario'].cumulative_cases arima_train = fast_log(ON_cases)[70:] pq_search(arima_train, 3, 2, 2, 3, 0.05) # min AIC is at 321 arima_model = ARIMA(arima_train, (0, 2, 3)).fit(disp=False) arima_forecast, se, conf = arima_model.forecast(30, alpha=0.05) # 30 days forecast arima_forecast = np.exp(arima_forecast) arima_forecast = pd.Series(arima_forecast) lower_forecast = np.exp(pd.Series(conf[:, 0])) upper_forecast = np.exp(pd.Series(conf[:, 1])) # exponential smoothing expsm_model = ExponentialSmoothing(ON_cases, trend='mul', seasonal=None, damped=True).fit() expsm_forecast = expsm_model.forecast(30) # dtw dtw_train = ON_cases.append(pd.Series([0] * 30)) dtw_forecast = dtw_pred(dtw_train, 38, 30) dtw_forecast = pd.Series(dtw_forecast)
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()  # partial autocorrelation plot
ADF(D_data[u'销量差分'])  # stationarity test (result discarded)
# white-noise test
from statsmodels.stats.diagnostic import acorr_ljungbox
acorr_ljungbox(D_data, lags=1)  # returns the statistic and p-value (discarded)
from statsmodels.tsa.arima_model import ARIMA
# order selection
pmax = int(len(D_data)/10)  # order usually at most length/10
qmax = int(len(D_data)/10)  # order usually at most length/10
bic_matrix = []  # BIC matrix
for p in range(pmax+1):
    tmp = []
    for q in range(qmax+1):
        try:  # some fits raise; skip the failures
            tmp.append(ARIMA(data, (p,1,q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)
bic_matrix = pd.DataFrame(bic_matrix)  # so the minimum can be located
p,q = bic_matrix.stack().idxmin()  # flatten with stack, then idxmin
print(u'BIC最小的p值和q值为:%s、%s' %(p,q))
# NOTE(review): the model below hard-codes order (0,1,1) and ignores the
# (p, q) just selected — confirm whether that is intended.
model = ARIMA(data, (0,1,1)).fit()  # build ARIMA(0, 1, 1)
model.summary()  # model report
model.forecast(5)  # 5-day forecast: values, std errors, conf intervals
def programmer_6():
    """ARIMA demo: ACF/PACF plots, ADF & Ljung-Box tests, BIC order search.

    Reads the sales series from data/arima_data.xls, differences it once,
    grid-searches (p, q) by BIC and forecasts ``forecastnum`` steps.

    Warning notes from the original author:
    - "matplotlib is currently using a non-GUI backend" was caused by
      repeated plt.show() calls; worked around with plt.subplot().
    - "RuntimeWarning: overflow encountered in exp": insufficient precision.
    forecastnum --> number of days to forecast
    plot_acf().show() --> autocorrelation plot
    plot_pacf().show() --> partial autocorrelation plot
    """
    discfile = 'data/arima_data.xls'
    forecastnum = 5
    data = pd.read_excel(discfile, index_col=u'日期')

    fig = plt.figure(figsize=(8, 6))
    # first autocorrelation plot (raw series)
    ax1 = plt.subplot(411)
    fig = plot_acf(data, ax=ax1)

    # stationarity test; returns adf, pvalue, usedlag, nobs, critical
    # values, icbest, regresults, resstore in order
    print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

    # first-order difference
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    D_data.plot()  # time-series plot of the differenced data
    plt.show()

    # second autocorrelation plot (differenced series)
    fig = plt.figure(figsize=(8, 6))
    ax2 = plt.subplot(412)
    fig = plot_acf(D_data, ax=ax2)
    # partial autocorrelation plot
    ax3 = plt.subplot(414)
    fig = plot_pacf(D_data, ax=ax3)
    plt.show()
    fig.clf()

    print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))  # stationarity test
    # white-noise test; returns the statistic and p-value
    print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)
    # order selection: p, q usually at most length/10
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []  # BIC matrix
    data.dropna(inplace=True)

    # Some (p, q) fits raise or warn; warnings are escalated to errors so
    # warning-producing fits are skipped too. Scoped with catch_warnings so
    # the 'error' filter no longer leaks into the whole process (the
    # original called warnings.filterwarnings('error') globally), and the
    # bare `except:` is narrowed to Exception.
    import warnings
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        for p in range(pmax + 1):
            tmp = []
            for q in range(qmax + 1):
                try:
                    tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
                except Exception:
                    tmp.append(None)
            bic_matrix.append(tmp)

    # locate the minimum BIC: flatten with stack, then idxmin
    bic_matrix = pd.DataFrame(bic_matrix)
    p, q = bic_matrix.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))
    model = ARIMA(data, (p, 1, q)).fit()  # build ARIMA(p, 1, q)
    model.summary2()  # model report
    # forecastnum-step forecast: values, std errors, confidence intervals
    model.forecast(forecastnum)
plot_pacf(D_data).show()  # partial autocorrelation plot
print(u'1阶差分序列的ADF检验结果为:',ADF(D_data[u'销量差分']))  # stationarity test
from statsmodels.stats.diagnostic import acorr_ljungbox
# white-noise test: returns the statistic and p-value
print(u'差分序列的白噪声检验结果为:',acorr_ljungbox(D_data,lags=1))
from statsmodels.tsa.arima_model import ARIMA
data[u'销量'] = data[u'销量'].astype(float)  # fitting requires float
pmax=int(len(D_data)/10)  # order usually at most length/10
qmax=int(len(D_data)/10)
bic_matrix=[]
for p in range(pmax+1):
    tmp=[]
    for q in range(qmax+1):
        try:  # some fits raise; record None and continue
            tmp.append(ARIMA(data,(p,1,q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)
bic_matrix=pd.DataFrame(bic_matrix)
print(bic_matrix)
p,q=bic_matrix.stack().idxmin()  # position of the minimum BIC
print(u'bic最小的P值和q值为:%s、%s'%(p,q))
model=ARIMA(data,(p,1,q)).fit()  # build ARIMA(p, 1, q)
model.summary2()  # model report (return value unused)
forecast=model.forecast(5)  # values, std errors, confidence intervals
print(forecast)
# 就结果来看,如果取显著性水平为0.05,那么相关系数与零没有显著差异,即为白噪声序列。 print("===========================Ljung-Box检验========================================") r_Ljung_Box,q_Ljung_Box,p_Ljung_Box = sm.tsa.acf(resid.values.squeeze(), qstat=True) data = np.c_[range(1,41), r_Ljung_Box[1:], q_Ljung_Box, p_Ljung_Box] table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"]) print(table.set_index('lag')) print("===========================Ljung-Box检验========================================") #6.数据预测 ###################################使用forecast预测数据######################################################################### print("##########使用forecast预测数据###################") # forecast返回值为有3个元素的元组(tuple),每个元素都是一个array, # 说明:forecast : array, stderr : array,conf_int : array2D predict_dta = arima.forecast(forecast_size) # 连续预测N个值 print(predict_dta) print("##########使用forecast预测数据###################") ###################################使用forecast预测数据######################################################################### ###################################使用plot_predict预测数据##################################################################### print("##########使用plot_predict预测数据###################") if d == 0: predict_dta2 = arima.predict(start = forecast_start_date, end = forecast_end_date,dynamic = False) else: predict_dta2 = arima.predict(start = forecast_start_date, end = forecast_end_date,dynamic = False,typ = forecast_typ) print(predict_dta2) xdata_pred2,ax = plt.subplots(figsize = fig_size ) ax = data_analysis.ix[1:].plot(ax=ax)
def gen_ohlcv(interval):
    """
    Generate OHLCV Chart for BTCUSD with predicted price overlay.

    :params interval: update the graph based on an interval
    """
    # hack to wrap interval around available data. OOS starts at 1500, df has a
    # total of 2274 rows after processing to wrap around 2274-1500 ~ 750. Reset
    # prediction data to empty df.
    interval = interval % 750
    print("interva is {}...".format(interval))
    # read data from source
    df = get_ohlcv_data(interval - 100, interval)
    # log returns of the close price
    df['log_ret'] = np.log(df.Close) - np.log(df.Close.shift(1))
    print("\ndata df loaded, starting prediction...\n")
    # online training and forecast on the last 60 log-returns
    model = ARIMA(df.tail(60)["log_ret"], order=(3, 1, 0), freq='D').fit(disp=0)
    pred = model.forecast()[0]
    print("\nprediction ended, writing to output df...")
    # save forecast to output dataframe. should be dB irl.
    # (stores the predicted log-return and the implied close price)
    next_dt = df.tail(1).index[0] + pd.Timedelta('1 day')
    df_pred.loc[next_dt] = [
        pred[0], (np.exp(pred) * df.tail(1).Close.values)[0]
    ]
    print("\nnext datetime is {}...".format(next_dt))
    # get index location of period.
    loc = df_pred.index.get_loc(next_dt) + 1
    print("\nloc is {}...".format(loc))
    # slices for the past N periods perdiction for plotting
    df_pred_plot = df_pred.iloc[slice(max(0, loc - 30),
                                      min(loc, len(df)))].sort_index()
    print("\n set pred df for plotting...\n", df_pred_plot)
    # plotting ohlc candlestick
    trace_ohlc = go.Candlestick(
        x=df.tail(50).index,
        open=df['Open'].tail(50),
        close=df['Close'].tail(50),
        high=df['High'].tail(50),
        low=df['Low'].tail(50),
        opacity=0.5,
        hoverinfo="skip",
        name="BTCUSD",
    )
    # plotting prediction line
    trace_line = go.Scatter(x=df_pred_plot.index,
                            y=df_pred_plot.pred_Close,
                            line_color='yellow',
                            mode="lines+markers",
                            name="Predicted Close")
    layout = go.Layout(
        plot_bgcolor=app_color["graph_bg"],
        paper_bgcolor=app_color["graph_bg"],
        font={"color": "#fff"},
        height=700,
        xaxis={
            "showline": False,
            "showgrid": False,
            "zeroline": False,
        },
        yaxis={
            "showgrid": True,
            "showline": True,
            "fixedrange": True,
            "zeroline": True,
            "gridcolor": app_color["graph_line"],
            "title": "Price (USD$)"
        },
    )
    return go.Figure(data=[trace_ohlc, trace_line], layout=layout)
# ifsuccess判断本次报警是否成功,1成功,0失败 ifsuccess = 0 # sensitivity代表本次报警的灵敏度 sensitivity = 0 length = len(light_data) middle = length // 2 # end即代表当前位置 end = middle - 54 # 历史窗口预测误差 history_f = [[] for i in range(length)] windows_error = [] while end <= (length - 1): windows_data = light_data[end - 50:end] p, q = ARI(windows_data) model = ARIMA(windows_data, (p, diff, q)).fit() f5 = model.forecast(5)[0] for i in range(5): history_f[end + i].append(f5[i]) if len(history_f[end]) == 5: x = np.mean(history_f[end]) er = x - light_data[end] if len(windows_error) < 50: windows_error.append(er) else: if er > np.max(windows_error): if time[end] >= anomaly_start and time[end] <= anomaly_end: ifsuccess = 1 sensitivity = (time[end] - t0) / t1 break else: windows_error.pop(0)
    # tail of a function defined before this fragment
    return (country_cases, country_daily_increase, country_daily_death, country_name)


country_cases, country_daily_increase, country_daily_death, country_name = country_visualizations(
    'Sri Lanka')
country_cases_df = pd.DataFrame(country_cases, columns=['Date', 'cases'])
country_daily_increase_df = pd.DataFrame(country_daily_increase,
                                         columns=['Date', 'cases'])
country_daily_death_df = pd.DataFrame(country_daily_death,
                                      columns=['Date', 'cases'])
# ARIMA(5,1,0) on cumulative cases, 30-step forecast
arima = ARIMA(country_cases_df['cases'], order=(5, 1, 0))
arima = arima.fit(trend='c', full_output=True, disp=True)
forecast = arima.forecast(steps=30)
pred = list(forecast[0])  # point forecasts only
# build the 30 consecutive dates after the last observed date
start_date = country_cases_df['Date'].iloc[-1]
prediction_dates = []
for i in range(30):
    date = start_date + timedelta(days=1)
    prediction_dates.append(date)
    start_date = date
fig = plt.figure()
#plt.xlabel("Dates",fontsize = 20)
plt.ylabel('Total cases', fontsize=20)
#plt.title("Predicted Total cases for the next 15 Days" , fontsize = 20)
# fragment cut mid-call: plot_date arguments continue outside this view
obj, = plt.plot_date(y=pred,
from statsmodels.tsa.stattools import adfuller as ADF

# Stationarity test on the raw 'high' series.
print(u'ADF:', ADF(data[u'high']))

# First-order difference to make the series stationary.
D_data = data.diff().dropna()
D_data.columns = [u'result']
D_data.plot()
plt.show()
plot_acf(D_data).show()  # autocorrelation plot
plt.show()

from statsmodels.graphics.tsaplots import plot_pacf

plot_pacf(D_data).show()  # partial autocorrelation plot
#print(u'ADF2:',ADF(D_data[u'result2']))
from statsmodels.stats.diagnostic import acorr_ljungbox
#print(u'result3:',acorr_ljungbox(D_data,lags=1))
from statsmodels.tsa.arima_model import ARIMA

# Order selection: p, q usually do not exceed length/10.
pmax = int(len(D_data)/10)
qmax = int(len(D_data)/10)
bic_matrix = []
# BUG FIX: the original loop read `for p in range(pmax)+1:` which raises
# TypeError (range + int is not defined); the intended loop is 0..pmax
# inclusive, matching the inner q loop.
for p in range(pmax + 1):
    tmp = []
    for q in range(qmax + 1):
        try:
            tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
        except Exception:  # some (p, q) fits fail; record None and continue
            tmp.append(None)
    bic_matrix.append(tmp)
bic_matrix = pd.DataFrame(bic_matrix)
p, q = bic_matrix.stack().idxmin()  # flatten, then locate the minimum BIC
model = ARIMA(data, (p, 1, q)).fit()
model.summary2()  # model report (value unused)
model.forecast(1)  # one-step forecast: values, std errors, conf intervals
                              # continuation of the auto_arima(...) call started
                              # above: seasonal search with m=4 periods, forcing
                              # d=1 / D=1, exhaustive (non-stepwise) search
                              start_q=0,
                              max_p=10,
                              max_q=10,
                              m=4,
                              start_P=0,
                              seasonal=True,
                              d=1,
                              D=1,
                              trace=True,
                              error_action="ignore",
                              suppress_warnings=True,
                              stepwise=False)
auto_arima_model.summary()
# SARIMAX(1, 1, 1)x(0, 1, 1, 12)
# AIC ==> 1348.728
# BIC ==> 1362.665

# For getting Fitted values for train data set we use
# predict_in_sample() function
auto_arima_model.predict_in_sample()

# For getting predictions for future we use predict() function
pred_test = pd.Series(auto_arima_model.predict(n_periods=12))
# Adding the index values of Test Data set to predictions of Auto Arima
pred_test.index = Test.index
MAPE(pred_test, Test.Sales)  # 12.72

from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(plastic.Sales, order=(1, 1, 0)).fit(transparams=True)
forecasterrors = model.forecast(steps=12)[0]  # it will give the next 12 values
def loop_train(dataset, i):
    """Fit ARIMA(0,1,1) on 'Adj Close' and return the one-step forecast.

    Side effect: the forecast value is also appended to
    ``dataset['Adj Close']`` under the index 2015-12-(12+i).
    NOTE(review): 12 + i overflows December for i > 19 — confirm i's range.

    Returns the forecast array (first element of ARIMA.forecast's tuple).
    """
    loop_train_model = ARIMA(dataset['Adj Close'], (0, 1, 1)).fit()
    # Forecast once and reuse the result — the original called forecast(1)
    # twice, repeating the whole prediction computation for the same value.
    step_forecast = loop_train_model.forecast(1)[0]
    dataset['Adj Close'].loc[datetime.datetime(2015, 12, 12 + i)] = step_forecast[0]
    return step_forecast
#存在部分报错,所以用try来跳过报错。 try: tmp.append(ARIMA(ts_log, (p, 1, q)).fit().bic) except: tmp.append(None) bic_matrix.append(tmp) #从中可以找出最小值 bic_matrix = pd.DataFrame(bic_matrix) #先用stack展平,然后用idxmin找出最小值位置。 p, q = bic_matrix.stack().idxmin() #print bic_matrix print(u'商店:%s,BIC最小的p值和q值为:%s、%s' % (a + 1, p, q)) #建立ARIMA(0, 1, 1)模型 model = ARIMA(ts_log, (p, 1, q)).fit() #作为期90天的预测,返回预测结果、标准误差、置信区间。 aaa = np.exp(model.forecast(90)[0]) t[a] = aaa #print t t1 = pd.DataFrame(np.array(t))[0] for i in range(1, 90): t1 = pd.concat([t1, pd.DataFrame(np.array(t))[i]], axis=0) t1 = t1.reset_index(drop=True) rng = pd.date_range('2017-01-01', '2017-03-31', freq='D') result = pd.DataFrame() result['date'] = rng result = pd.concat([ result, result, result, result, result, result, result, result, result, result, result, result, result, result, result ],
from db_tools import *
# 定阶 pmax = int(len(D_data) / 10) # 一般阶数不超过length/10 qmax = int(len(D_data) / 10) # 一般阶数不超过length/10 bic_matrix = [] # bic矩阵 for p in range(pmax + 1): tmp = [] for q in range(qmax + 1): try: # 存在部分报错,所以用try来跳过报错。 tmp.append(ARIMA(all_index, (p, 1, q)).fit().bic) except: tmp.append(None) bic_matrix.append(tmp) bic_matrix = pd.DataFrame(bic_matrix) # 从中可以找出最小值 p, q = bic_matrix.stack().idxmin() # 先用stack展平,然后用idxmin找出最小值位置。 print(u'BIC最小的p值和q值为:%s、%s' % (p, q)) model = ARIMA(all_index, (1, 1, 0)).fit() # 建立ARIMA(0, 1, 1)模型 model.summary2() # 给出一份模型报告 model.forecast(10)[0] # 作为期5天的预测,返回预测结果、标准误差、置信区间。 ax = all_index.plot() fig = model.predict('2017-10-01', '2018-05-01', dynamic=True) plt.show() # sql_ygjq = "select NVL(mon1,mon2) as m1,NVL(id1,id2) as id1,nvl(sum1,0) as sum1, \ # NVL(mon2,mon1) as m2,NVL(id2,id1)as id2,NVL(sum2,0) as sum2 from \ # (select t1.t_month as mon1,t1.aab001 as id1,t1.cnt as sum1, \ # t2.t_month as mon2,t2.aab001 as id2,t2.cnt as sum2 \ # from (select * from AB01_WITH_CD01_COUNT_RESULT where t_month = '201701')t1 \ # FULL join (select * from AB01_WITH_CD01_COUNT_RESULT where t_month = '201702')t2 \ # on t1.aab001=t2.aab001 )" # company = pd.read_sql_query(sql_zc, con=db)
plot_pacf(D_data).show() #偏自相关图 print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分'])) #平稳性检测 #白噪声检验 from statsmodels.stats.diagnostic import acorr_ljungbox print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1)) #返回统计量和p值 from statsmodels.tsa.arima_model import ARIMA data[u'销量'] = data[u'销量'].astype(float) #定阶 pmax = int(len(D_data) / 10) # 一般阶数不超过length/10 qmax = int(len(D_data) / 10) # 一般阶数不超过length/10 bic_matrix = [] #bic矩阵 for p in range(pmax + 1): tmp = [] for q in range(qmax + 1): try: #存在部分报错,所以用try来跳过报错。 tmp.append(ARIMA(data, (p, 1, q)).fit().bic) except: tmp.append(None) bic_matrix.append(tmp) bic_matrix = pd.DataFrame(bic_matrix) # 从中可以找出最小值 p, q = bic_matrix.stack().idxmin() # 先用stack展平,然后用idxmin找出最小值位置。 print(u'BIC最小的p值和q值为:%s、%s' % (p, q)) model = ARIMA(data, (p, 1, q)).fit() # 建立ARIMA(0, 1, 1)模型 model.summary2() # 给出一份模型报告 model.forecast(5) # 作为期5天的预测,返回预测结果、标准误差、置信区间。
# figure grid panels: ACF and residual density (notebook cell fragment;
# `fig`, `gs`, `series`, `residuals`, `arima` come from earlier cells)
ax2 = fig.add_subplot(gs[1,0])
plot_acf(series, ax=ax2, title='ACF')
ax3 = fig.add_subplot(gs[1,1])
sns.kdeplot(series, ax=ax3)
ax3.set_title('density')
plt.show()

# %%
check_residuals(residuals)

# %%
# 24-step forecast: point values, standard errors, confidence intervals;
# everything re-indexed to the test set for plotting
arima_forecast, se, conf = arima.forecast(24)
arima_forecast = pd.Series(arima_forecast, index=airpassengers_test.index)
lower_series = pd.Series(conf[:, 0], index=airpassengers_test.index)
upper_series = pd.Series(conf[:, 1], index=airpassengers_test.index)

# %%
# plot the training series, the forecast and the confidence band
plt.plot(airpassengers_season_diff_train, label='train')
plt.plot(arima_forecast, label='forecast')
plt.fill_between(lower_series.index,
                 lower_series,
                 upper_series,
                 color='k',
                 alpha=.15)
plt.legend()

# %%
def programmer_6():
    """ARIMA order selection and forecasting for the sales data.

    Reads data/arima_data.xls, checks stationarity (ADF) on the raw and
    first-differenced series, runs a white-noise (Ljung-Box) test, selects
    (p, q) by exhaustive BIC search with d fixed at 1, fits ARIMA(p, 1, q)
    and returns the ``forecastnum``-step forecast.

    Returns:
        tuple: (forecast values, standard errors, confidence intervals),
        as produced by ``ARIMAResults.forecast``.

    Notes on warnings seen when running this:
      * "UserWarning: matplotlib is currently using a non-GUI backend" --
        caused by calling plt.show() several times; mitigated by using
        plt.subplot().
      * "RuntimeWarning: overflow encountered in exp" -- insufficient
        numeric precision during some fits.
    """
    discfile = 'data/arima_data.xls'
    forecastnum = 5
    data = pd.read_excel(discfile, index_col=u'日期')

    fig = plt.figure(figsize=(8, 6))
    # First ACF plot: raw series.
    ax1 = plt.subplot(411)
    fig = plot_acf(data, ax=ax1)

    # Stationarity test on the raw series; returns adf, pvalue, usedlag,
    # nobs, critical values, icbest, regresults, resstore.
    print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

    # First difference towards stationarity.
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    D_data.plot()  # time-series plot of the differenced data
    plt.show()

    # Second ACF plot and the PACF, both on the differenced series.
    fig = plt.figure(figsize=(8, 6))
    ax2 = plt.subplot(412)
    fig = plot_acf(D_data, ax=ax2)
    ax3 = plt.subplot(414)
    fig = plot_pacf(D_data, ax=ax3)
    plt.show()
    fig.clf()

    print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))
    # White-noise test: statistic and p-value.
    print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)

    # Candidate order bounds: conventionally no more than length / 10.
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    data.dropna(inplace=True)

    # Some (p, q) fits raise, others only warn; promote warnings to errors
    # so both are skipped uniformly by the try/except below.
    import warnings
    warnings.filterwarnings('error')
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            try:
                tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
            except:
                tmp.append(None)
        bic_matrix.append(tmp)

    bic_matrix = pd.DataFrame(bic_matrix)
    # stack() flattens; idxmin() locates the (p, q) with the smallest BIC.
    p, q = bic_matrix.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))

    model = ARIMA(data, (p, 1, q)).fit()  # fit the BIC-selected model
    model.summary2()  # model report
    # BUG FIX: the forecast was computed and discarded (function returned
    # None); return it so callers can actually use the prediction.
    return model.forecast(forecastnum)
# NOTE(review): this chunk begins inside a model-selection loop whose
# opening `while`/`if` lies outside this view; the indentation below is a
# best-effort reconstruction of the original structure.
        print('模型ARIMA(%s,1, %s)不符合白噪音检验' % (p, q))
        print('在BIC矩阵中去掉[%s,%s]组合,重新进行计算' % (p, q))
        # Knock the failing (p, q) cell out of the BIC matrix and retry.
        matrix.iloc[p, q] = np.nan
        arimafail = arima
        continue
    else:
        # print(p,q)
        print('模型ARIMA(%s,%s)符合白噪声检验' % (p, q))
        break
'''
'''
# Step 5 (drive D) --------- model prediction
# NOTE(review): the original comment said "C盘" (drive C), but every path
# below refers to drive D -- presumably a copy/paste slip; confirm.
print('模型报告:summary():\n', arima.summary())
# forecast(5): point forecasts, standard errors, confidence intervals.
forecast_values, forecasts_standard_error, forecast_confidence_interval = arima.forecast(
    5)
# Put actuals and predictions side by side in one frame.
pre_data = pd.DataFrame(xtest_value)
pre_data.insert(1, 'CWXT_DB:184:D:\\_predict', forecast_values)
pre_data.rename(columns={
    'CWXT_DB:184:D:\\': '实际值',
    'CWXT_DB:184:D:\\_predict': '预测值'
}, inplace=True)
# Format every cell to two decimals before export.
result_d = pre_data.applymap(lambda x: '%.2f' % x)
result_d.to_excel('../my_data/pedictdata_D_BIC_ARMA.xlsx')

# Step 5 (drive D) --------- model evaluation
# Three accuracy statistics are used to judge the forecast: mean absolute
# error, root mean squared error and mean absolute percentage error.
result = pd.read_excel('../my_data/pedictdata_D_BIC_ARMA.xlsx', index_col='COLLECTTIME')
# Ljung-Box white-noise test on the first-differenced series.
print(u'1-diff series white noise test result: ', acorr_ljungbox(D_data, lags=1))

data[u'SALES_VOLUME'] = data[u'SALES_VOLUME'].astype(float)

# Candidate order bounds: conventionally no more than length / 10.
pmax = int(len(D_data) / 10)
qmax = int(len(D_data) / 10)

def _try_bic(p, q):
    """Fit ARIMA(p, 1, q) and return its BIC; None when the fit fails."""
    try:
        return ARIMA(data, (p, 1, q)).fit().bic
    except:
        return None

# BIC for every (p, q) candidate, framed so stack/idxmin work below.
bic_matrix = pd.DataFrame([[_try_bic(p, q) for q in range(qmax + 1)]
                           for p in range(pmax + 1)])

# stack() flattens the frame; idxmin() yields the (p, q) of the minimum BIC.
p, q = bic_matrix.stack().idxmin()
print(u'BIC minimal p-value and q-value is:%s、%s' % (p, q))

model = ARIMA(data, (p, 1, q)).fit()  # fit the BIC-selected model

# Model report.
print("************************************************************")
print(model.summary2())
print("************************************************************")
print()
print("************************************************************")
# forecastnum-step forecast: (values, stderr, confidence interval).
print(model.forecast(forecastnum))
# In[68]:

train.head()

# In[69]:

# Fit an ARMA(2, 2) model (d = 0) on the training series.
modelFit = ARIMA(train, order=(2, 0, 2)).fit()

# In[70]:

modelFit.summary()

# In[71]:

# 20-step out-of-sample forecast; forecast() returns (values, stderr, CI),
# element 0 is the point forecast.
forcastData = modelFit.forecast(steps=20)[0]
meanSquareError = mean_squared_error(test, forcastData)
print('MSE: %s' % meanSquareError)
rootMeanSquareError = np.sqrt(meanSquareError)
print('RMSE: %s' % rootMeanSquareError)

# In[72]:

# Overlay training data, held-out actuals and the forecast.
plt.figure(figsize=(12, 5))
for xs, ys, lbl in (
        (train.index.to_pydatetime(), train, 'training'),
        (test.index.to_pydatetime(), test, 'actual'),
        (test.index.to_pydatetime(), forcastData, 'forecast'),
):
    plt.plot(xs, ys, label=lbl)
plt.legend()

# In[73]:
class ModeDecomp(object):
    """Forecast a utilisation series by seasonal decomposition.

    The training slice is outlier-smoothed, decomposed into trend +
    seasonal + residual; the trend is modelled with ARIMA, and forecasts
    are assembled by adding the seasonal pattern (and residual-derived
    error bounds) back onto the trend forecast.

    Expects ``dataSet`` to be a DataFrame with a 'date' column and a
    target column named by ``type``; the final ``test_size`` rows are
    held out as the test set.
    """

    def __init__(self, dataSet, type, test_size=24):
        # Use the 'date' column as a DatetimeIndex.
        data = dataSet.set_index('date')
        data.index = pd.to_datetime(data.index)
        self.dataSet = data
        self.test_size = test_size
        self.train_size = len(self.dataSet) - self.test_size
        # self.mile_train = self.dataSet['mileage_utilization'][:len(self.dataSet) - test_size]
        # self.time_train = self.dataSet['time_utilization'][:len(self.dataSet) - test_size]
        # self.num_rain = self.dataSet['pick_up_freq'][:len(self.dataSet) - test_size]
        # Training slice of the target column, then smoothed in place.
        self.train = self.dataSet[type][:len(self.dataSet) - test_size]
        self.train = self._diff_smooth(self.train)
        # self.train = self._diff_smooth(self.time_train)
        # self.num_rain = self._diff_smooth(self.num_rain)
        self.test = self.dataSet[type][-test_size:]
        # self.test = self.dataSet['time_utilization'][-test_size:]

    # Smooth outliers in the series (mutates and returns it).
    def _diff_smooth(self, dataSet):
        """Replace runs of abrupt jumps with linear interpolation.

        Points whose first difference lies outside 1.5 IQR of the
        difference distribution are treated as outliers.
        """
        dif = dataSet.diff()  # first-difference series
        td = dif.describe()
        # Tukey fences: jumps beyond 1.5 IQR above/below the quartiles.
        high = td['75%'] + 1.5 * (td['75%'] - td['25%'])
        low = td['25%'] - 1.5 * (td['75%'] - td['25%'])
        # Indices whose change exceeds the fences.
        forbid_index = dif[(dif > high) | (dif < low)].index
        i = 0
        while i < len(forbid_index) - 1:
            n = 1  # length of the consecutive outlier run; usually 1
            start = forbid_index[i]  # first index of the run
            # Extend the run while successive outliers are exactly one
            # step apart (assumes 60-minute sampling -- TODO confirm).
            while forbid_index[i + n] == start + timedelta(minutes=60 * n):
                n += 1
                if (i + n) > len(forbid_index) - 1:
                    break
            i += n - 1
            end = forbid_index[i]  # last index of the run
            # Fill the run with values interpolated linearly between the
            # neighbours one hour before and one hour after it.
            try:
                value = np.linspace(dataSet[start - timedelta(minutes=60)],
                                    dataSet[end + timedelta(minutes=60)], n)
                dataSet[start:end] = value
            except:
                # Neighbour missing (run touches a series edge): skip.
                pass
            i += 1
        return dataSet

    def decomp(self, freq):
        """Decompose the smoothed training series (one-sided) into trend,
        seasonal and residual parts, and derive error bounds from the
        residuals' interquartile range."""
        decomposition = seasonal_decompose(self.train, freq=freq, two_sided=False)
        self.trend = decomposition.trend
        self.seasonal = decomposition.seasonal
        self.residual = decomposition.resid
        # decomposition.plot()
        # plt.show()
        d = self.residual.describe()
        delta = d['75%'] - d['25%']
        # Lower/upper error bounds: quartiles widened by one IQR.
        self.low_error, self.high_error = (d['25%'] - 1 * delta, d['75%'] + 1 * delta)

    def trend_model(self, order):
        """Fit an ARIMA model of the given order to the trend component
        (CSS estimation, no convergence output)."""
        self.trend.dropna(inplace=True)
        self.trend_model_ = ARIMA(self.trend, order).fit(disp=-1, method='css')
        # return self.trend_model_

    def predict_new(self):
        """Forecast ``test_size`` new points.

        :return: the prediction time index; the assembled series are left
            on ``self.final_pred`` / ``self.low_conf`` / ``self.high_conf``.
        """
        n = self.test_size
        # Hourly timestamps immediately after the last training point.
        self.pred_time_index = pd.date_range(start=self.train.index[-1], periods=n + 1, freq='60min')[1:]
        self.trend_pred = self.trend_model_.forecast(n)[0]  # point forecasts
        pred_time_index = self.add_season()
        return pred_time_index

    def add_season(self):
        '''
        Add the seasonal pattern and residual error bounds onto the
        forecast trend.
        '''
        self.train_season = self.seasonal[:self.train_size]
        values = []
        low_conf_values = []
        high_conf_values = []
        for i, t in enumerate(self.pred_time_index):
            trend_part = self.trend_pred[i]
            # Seasonal part: mean of training values at the same time of day.
            season_part = self.train_season[self.train_season.index.time == t.time()].mean()
            # trend + season, plus the residual-derived error bounds.
            predict = trend_part + season_part
            low_bound = trend_part + season_part + self.low_error
            high_bound = trend_part + season_part + self.high_error
            values.append(predict)
            low_conf_values.append(low_bound)
            high_conf_values.append(high_bound)
        self.final_pred = pd.Series(values, index=self.pred_time_index, name='predict')
        self.low_conf = pd.Series(low_conf_values, index=self.pred_time_index, name='low_conf')
        self.high_conf = pd.Series(high_conf_values, index=self.pred_time_index, name='high_conf')
        return self.pred_time_index
# Render the minus sign correctly alongside CJK-capable fonts.
plt.rcParams['axes.unicode_minus'] = False
# Time-series plot of the raw data.
data.plot()
plt.show()

# Autocorrelation of the raw series.
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()

# Stationarity (ADF) test on the raw series.
from statsmodels.tsa.stattools import adfuller as ADF
print 'ADF test result:', ADF(data['value'])

# First-difference the series, then repeat the diagnostics on it.
D_data = data.diff().dropna()
D_data.columns = ['diff value']
D_data.plot()
plt.show()
plot_acf(D_data).show()
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()
print 'diff seq ADF test result:', ADF(D_data['diff value'])

# Ljung-Box white-noise test on the differenced series.
from statsmodels.stats.diagnostic import acorr_ljungbox
print 'dff white noise test result:', acorr_ljungbox(D_data, lags = 1)

# Fit ARIMA(1, 1, 1) and produce a 30-step (5*6) forecast.
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(data, (1,1,1)).fit()
model.summary2()  # model report
model.forecast(5*6)  # returns (values, stderr, confidence interval)