plot_acf(D_TS1_1[:100], ax=ax1).show()   # ACF of the first-order differenced series
plot_pacf(D_TS1_1[:100], ax=ax2).show()  # PACF of the first-order differenced series
print(u'ADF test result for the differenced series:', ADF(D_TS1_1))

# White-noise test
statics, p_value = acorr_ljungbox(D_TS1_1, lags=1)
print(u'White-noise test for the differenced series: Q-statistic = %s  p-value = %s' % (statics, p_value))  # returns the test statistic and p-value

# ARIMA (autoregressive integrated moving average) model
# Order selection
p = 1
q = 1
arima1_1 = ARIMA(TS1_1, (1, 1, 1)).fit()  # fit an ARIMA(1,1,1) model; ARIMA(0,1,1) is another candidate
arima1_1.summary()

# Forecast values, forecast standard errors, forecast confidence intervals
pre_a1_1, pre_b1_1, pre_c1_1 = arima1_1.forecast(365)
plt.plot(pre_a1_1)
pre_b1_1

Date2 = pd.date_range('20020401', '20110708')
# pre_a1_1_cumsum = pre_a1_1.cumsum()
pre_D_TS1_1 = arima1_1.fittedvalues
plt.plot(D_TS1_1)
plt.plot(pre_D_TS1_1, color='red')
pre_D_TS1_1_cumsum = pre_D_TS1_1.cumsum()
pre_TS1_1 = pd.Series(TS1_1, index=Date2)
# %%
fig, (ax5, ax6) = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(airpassengers_season_diff_train.dropna(), ax5)
ax5.set_title('ACF of differenced season series')
plot_pacf(airpassengers_season_diff_train.dropna(), ax6)
ax6.set_title('PACF of differenced season series')
plt.show()

# %%
# Find the d parameter for ARIMA
find_d = ARIMA(airpassengers_season_diff_train.dropna(), order=(0, 0, 0)).fit()
find_d.summary()

# %%
arima = ARIMA(airpassengers_season_diff_train.dropna(), order=(1, 0, 1)).fit()
arima.summary()

# %% [markdown]
# The values under *coef* are the weights of the respective terms.
#
# AIC and BIC indicate how well the model fits the data and can be used to compare it
# against other models. The lower the AIC, the better the model.

# %% [markdown]
# ## Residuals
#
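# %% [markdown]
# A minimal sketch of that comparison (assuming the same `airpassengers_season_diff_train`
# series; the candidate orders below are illustrative, not from the original): fit a few
# orders and rank them by AIC, lower being better.

# %%
candidates = [(0, 0, 1), (1, 0, 0), (1, 0, 1), (2, 0, 1)]
aics = {order: ARIMA(airpassengers_season_diff_train.dropna(), order=order).fit().aic
        for order in candidates}
for order, aic in sorted(aics.items(), key=lambda kv: kv[1]):
    print(order, round(aic, 2))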
size = int(len(df_comp) * 0.8)
df = df_comp.iloc[:size]
df_test = df_comp.iloc[size:]

# review ACF and PACF (in reality it is more practical to run auto_arima than to check
# ACF/PACF manually, but this is for the sake of example)
# ----------
# not done here

# run ARIMAX model using S&P500 values as an exogenous factor to explain FTSE values
# ----------
model_arimax_111 = ARIMA(df.market_value, order=(1, 1, 1), exog=df.spx).fit()
print(model_arimax_111.summary())
print('----------')

# analyzing residuals
# ----------
df['residuals_model_arimax_111'] = model_arimax_111.resid.iloc[:]
sgt.plot_acf(df['residuals_model_arimax_111'][1:], zero=False, lags=40)
plt.title("ACF Of Residuals for ARIMAX(1,1,1)", size=20)
plt.show()
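# A hedged follow-up sketch: out-of-sample ARIMAX predictions need future values of the
# exogenous regressor, so the test-period spx series is passed explicitly (this call is
# an assumption built on the split above, not part of the original).
start, end = df_test.index[0], df_test.index[-1]
pred_arimax = model_arimax_111.predict(start=start, end=end, exog=df_test.spx)
plt.plot(pred_arimax)
plt.title("ARIMAX(1,1,1) out-of-sample predictions", size=20)
plt.show()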
data = fast_log(data)
test_stationarity(differencing(data, 2), window=7, cutoff=0.01)
# looks very bad in the beginning, the variance is wild

## removing the first 70 observations
data = data[70:]
test_stationarity(differencing(data, 2), window=7, cutoff=0.01)
# not too bad, let's try

train_data, test_data = train_test(data, 30, 'n')  # we try to predict about a month of data
plot_pacf(differencing(train_data, 2))  ## looks like AR(3)
plot_acf(differencing(test_data, 2))    ## looks like MA(0)

search = pq_search(train_data, 5, 2, 2, 5, 0.05)  # search for the optimal (p, q)
search[0]  # (p, d, q) = (3, 2, 1) is the one with the lowest AIC

arima_model = ARIMA(train_data, (3, 2, 1)).fit(disp=False)
print(arima_model.summary())

plt.rcParams.update({'font.size': 10})
plt.rc('xtick', labelsize=5)
arima_res_plot(arima_model)  # mean of residuals is almost 0, std is not bad
plt.savefig('cumulative_cases_arima_321_res.png', format='png', dpi=300)

plt.rc('xtick', labelsize=5)
arima_pred_plot(arima_model, test_data, 0.05, mode='exp')
# the predictions are not looking too good, SSE = 4.1E10
plt.title('Predicted vs actual Cumulative Cases \n ARIMA(3,2,1) Residual: 4.1E10')
plt.savefig('cumulative_cases_pred_arima_321.png', format='png', dpi=300)

sm.stats.acorr_ljungbox(arima_model.resid, lags=[10])  # call truncated in the original; residuals and lag choice are assumed here
from statsmodels.datasets.macrodata import load_pandas
from statsmodels.tsa.base.datetools import dates_from_range
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

plt.interactive(False)

# let's examine an ARIMA model of CPI
cpi = load_pandas().data['cpi']
dates = dates_from_range('1959q1', '2009q3')
cpi.index = dates

res = ARIMA(cpi, (1, 1, 1), freq='Q').fit()
print(res.summary())

# we can look at the series
cpi.diff().plot()

# maybe logs are better
log_cpi = np.log(cpi)

# check the ACF and PACF plots
acf, confint_acf = sm.tsa.acf(log_cpi.diff().values[1:], confint=95)
# center the confidence intervals about zero
#confint_acf -= confint_acf.mean(1)[:, None]
pacf = sm.tsa.pacf(log_cpi.diff().values[1:], method='ols')
# confidence interval is now an option to pacf
from scipy import stats
confint_pacf = stats.norm.ppf(1 - .025) * np.sqrt(1 / 202.)
def ts_arima(ts, p, d, q, start, end):
    arima = ARIMA(ts, order=(p, d, q)).fit(disp=-1, maxiter=100)
    print("Next five periods:", arima.forecast(5)[0])
    print(arima.summary())
    ts_predict_arima = arima.predict(start, end, dynamic=False)
    return ts_predict_arima
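# Hypothetical usage sketch (the series name and date range are assumptions, not from
# the original): fit an ARIMA(1,1,1) and return the prediction path for a window.
ts_pred = ts_arima(my_series, 1, 1, 1, start='2015', end='2020')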
##Use the median value to build the ARIMA model and decide the parameters
zillow_cd_median = pd.DataFrame(
    zillow_cd.drop(columns=[
        'RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName',
        'SizeRank'
    ]).median())
zillow_cd_median.index = pd.to_datetime(zillow_cd_median.index)

##Check the ACF and PACF; determine that the order (0,1,1) is proper for the model
plot_acf(zillow_cd_median)
plot_pacf(zillow_cd_median)

##Fit the ARIMA model and predict
model1 = ARIMA(zillow_cd_median.values, (0, 1, 1)).fit()
model1.summary()
output1 = pd.DataFrame(model1.forecast(t)[0])
zillow_cd_median = zillow_cd_median.append(output1)
zillow_cd_median.index = pd.date_range('8/1/2010', latest, freq='MS')

##Create the forecasting line trend chart for the median property value
plt.figure(figsize=(15, 4))
plt.margins(x=0)
p2 = plt.plot(zillow_cd_median)
plt.axvline(x=pre, color='r', linestyle='--')
plt.ylim(ymin=0)
plt.show()  # gently increasing trend for the prediction period from 2017 to 2019

##Use the same parameters to predict the property value of a specific area
##Here we could build a function for connecting the cleaned useful datasets airbnb_cd and zillow_cd into profit_ny
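##A hypothetical sketch of that connecting function (the join key and the shape of
##profit_ny are assumptions; the original does not specify them):
def build_profit_ny(airbnb_cd, zillow_cd):
    """Join the cleaned Airbnb and Zillow datasets on their shared region column."""
    return airbnb_cd.merge(zillow_cd, on='RegionName', how='inner')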
    return automodel

# In[132]:

forecasted_accuracy = pd.DataFrame()
Predicted = pd.DataFrame()

model = arimamodel(Train['y'])  # build the auto-ARIMA model
forecast_arima, conf_int = model.predict(n_periods=Test.shape[0] + 10,
                                         return_conf_int=True)
Predicted['preds'] = np.round(forecast_arima, 2)
Predicted = Predicted.reset_index()

# Get the model parameters/orders used by auto-ARIMA from the summary text
s = model.summary().tables[0].as_text()
start = s.find("Model:")
end = s.find(")")
end += 1  # include the closing parenthesis
Predicted['Parameters'] = s[start:end]  # store the model parameters/orders in the 'Parameters' column

# forecast
fc, conf = model.predict(len(Test), return_conf_int=True)
plt = plotarima(Train['y'], Test['y'], model, "Auto_ARIMA.pdf", fc, conf)
#Test = Test.reset_index()
forecasted_accuracy = pd.DataFrame(
    forecast_accuracy(forecast_arima[:len(Test)], Test['y']))

# merge Test, forecasted values, and accuracy metrics
Test = Test.reset_index(drop=True)
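# A hedged alternative to parsing the summary text: a fitted pmdarima model exposes its
# orders directly (assuming `model` is the pmdarima ARIMA returned by arimamodel above).
print('order:', model.order, 'seasonal order:', model.seasonal_order)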
def buildARIMA(data, p, q, d):
    # note the (p, q, d) argument order in the signature; the ARIMA order tuple is still (p, d, q)
    model = ARIMA(data, (p, d, q)).fit()
    print(model.summary())
    return model
def def_MA(timeseries, q=1, steps1=1):
    # pure moving-average model: ARIMA(0, 0, q)
    model = ARIMA(timeseries, (0, 0, q)).fit()
    summary = model.summary()
    forecast = model.forecast(steps=steps1)
    return summary, forecast
# Imports reconstructed for this excerpt (Config and its logger are defined elsewhere
# in the project):
import math
from datetime import datetime

import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (GridSearchCV, LeaveOneOut,
                                     cross_val_predict, train_test_split)
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.arima_model import ARIMA
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout


class Model(Config):
    def __init__(self, m, station, var, predictives):
        self.var = var
        self.station = station
        self.predictives = predictives
        if var in self.predictives:
            self.predictives.remove(var)
        self.alg = m['alg'](**m["args"])
        self.date = None
        self.param_grid = m['param_grid'] if 'param_grid' in m else None
        self.actual_holdout = None
        self.pred_holdout = None
        self.actual_train = None
        self.actual_test = None
        self.pred = None
        self.whole_pred = None
        self.se = None
        self.conf = None
        self.metrics = {}
        self.holdout_metrics = {}
        self.created = datetime.now()
        self.model_data = None
        self.lstm_history = None
        self.lstm_train_pred = None
        self.lstm_test_pred = None
        self.supervised_learn = None

    def set_props(self, alg, df):
        self.algorithm = alg
        self.start_time = df['Date'].min()
        self.end_time = df['Date'].max()
        self.n_records = df.shape[0]

    def get_meta(self):
        return dict(
            algorithm=self.algorithm,
            supervised_learning=self.supervised_learn,
            predictives=self.predictives,
            start_time=self.start_time,
            end_time=self.end_time,
            n_records=self.n_records,
            metrics=self.metrics,
            created=self.created,
        )

    def dataset_split(
            self,
            data,
            ratio=None,  # default added so callers may omit it; falls back to SPLIT_RATIO
            supervised_learning=Config.MODELLING_CONFIG["SUPERVISED_LEARNING"],
            shuffle_data=False):
        ratio = ratio or self.MODELLING_CONFIG["SPLIT_RATIO"]
        if supervised_learning == False:
            self.logger.info("  Initiate forecasting model train-test split ...")
            train_size = int(len(data) * Config.MODELLING_CONFIG["SPLIT_RATIO"])
            train, test = data.iloc[0:train_size], data.iloc[train_size:len(data)]
            self.logger.info("  Training dataset: {}, Testing dataset: {}".format(
                train.shape, test.shape))
        elif supervised_learning == True:
            self.logger.info("  Initiate supervised learning model train-test split ...")
            train, test = train_test_split(
                data,
                test_size=ratio,
                shuffle=shuffle_data,
                random_state=self.MODELLING_CONFIG["RANDOM_STATE"],
            )
            self.logger.info("  Training dataset: {}, Testing dataset: {}".format(
                train.shape, test.shape))
        return train, test

    def dl_univariate_data(self, dataset, start_index, end_index, history_size,
                           target_size):
        # the input is named `dataset` so it is no longer shadowed by the output list
        data = []
        labels = []
        start_index = start_index + history_size
        if end_index is None:
            end_index = len(dataset) - target_size
        for i in range(start_index, end_index):
            indices = range(i - history_size, i)
            # Reshape data from (history_size,) to (history_size, 1)
            data.append(np.reshape(dataset[indices], (history_size, 1)))
            labels.append(dataset[i + target_size])
        return np.array(data), np.array(labels)

    def multivariate_data(self, dataset, target, start_index, end_index,
                          history_size, target_size, step, single_step=False):
        # same shadowing fix as above: the input is `dataset`, the output list is `data`
        data = []
        labels = []
        start_index = start_index + history_size
        if end_index is None:
            end_index = len(dataset) - target_size
        for i in range(start_index, end_index):
            indices = range(i - history_size, i, step)
            data.append(dataset[indices])
            if single_step:
                labels.append(target[i + target_size])
            else:
                labels.append(target[i:i + target_size])
        return np.array(data), np.array(labels)

    @staticmethod
    def mean_absolute_percentage_error(y_true, y_pred):
        # the epsilon guards the denominator against zeros (it was misplaced in the
        # numerator in the original)
        mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-6))) * 100
        if type(mape) == pd.Series:
            mape = mape[0]
        return mape

    @staticmethod
    def root_mean_square_error(y_true, y_pred):
        rmse = math.sqrt(mean_squared_error(y_true, y_pred))
        return rmse

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        mae = mean_absolute_error(y_true, y_pred)
        return mae

    def evaluate(self, actual, pred):
        r2score = r2_score(actual, pred)
        MAPE = Model.mean_absolute_percentage_error(actual, pred)
        MAE = mean_absolute_error(actual, pred)
        rmse = Model.root_mean_square_error(actual, pred)
        metrics = dict(MAE=MAE, MAPE=MAPE, RMSE=rmse)  # R2_Score=r2score
        return metrics

    def predict(self, X_test):
        """Predict new cases."""
        if all(p in X_test for p in self.predictives):
            X_test = X_test[self.predictives].astype(float)
            X_test.fillna(method=self.MODELLING_CONFIG["STATUS_MISSING_FILL"],
                          inplace=True)
            if any(X_test.isnull().values.all(axis=0)):
                return [np.nan] * X_test.shape[0]
            preds = self.alg.predict(X_test.dropna())
            return preds
        else:
            return [np.nan] * X_test.shape[0]

    def regression_scalar(self, data):
        """Regression using linear algorithms."""
        df = data[self.predictives + [self.var, "Date"]]
        print(self.predictives)
        print(len(self.predictives))
        scaler = MinMaxScaler()
        scaler.fit(df.drop(columns=[self.var, "Date"]).values)
        train, test = self.dataset_split(df)
        self.date = test["Date"]
        X_train = train.drop(columns=[self.var, "Date"])
        X_test = test.drop(columns=[self.var, "Date"])
        y_train = train[[self.var]]
        self.actual = test[[self.var]].values.ravel()
        self.example = X_train.iloc[0].values
        # scaling
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        self.alg.fit(X_train, y_train.values.ravel())
        self.pred = self.alg.predict(X_test)
        self.metrics = self.evaluate(self.actual, self.pred)

    def regression_tree(self, data, metric_eval, cv_type):
        """Regression using tree-based algorithms."""
        df = data[self.predictives + [self.var, "Date"]]

        # Train/Test
        if metric_eval == "test":
            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                n_holdout = max(
                    1, int(df.shape[0] * self.MODELLING_CONFIG["HOLDOUT_PERCENT"]))
                holdout = df.iloc[-n_holdout:, ]
                X_holdout = holdout.drop(columns=[self.var, "Date"])
                self.actual_holdout = holdout[[self.var]].values.ravel()
                df = df.iloc[:-n_holdout, ]
            train, test = self.dataset_split(
                df, ratio=Config.MODELLING_CONFIG["SPLIT_RATIO"])
            self.date = test["Date"]
            X_train = train.drop(columns=[self.var, "Date"])
            X_test = test.drop(columns=[self.var, "Date"])
            y_train = train[[self.var]]
            self.actual = test[[self.var]].values.ravel()
            self.example = X_train.iloc[0].values
            if self.param_grid != None:
                #print("  Running Grid Search...")
                param_grid_1 = {
                    k: v
                    for k, v in self.param_grid.items()
                    if k in ["max_depth", "num_leaves", "n_estimators"]
                }
                n_folds = int(100 / (100 * self.MODELLING_CONFIG["SPLIT_RATIO"])) + 1
                grid_search_rf = GridSearchCV(estimator=self.alg,
                                              param_grid=param_grid_1,
                                              scoring='r2',
                                              cv=n_folds,
                                              n_jobs=8)
                grid_search_rf.fit(X_train, y_train.values.ravel())
                print('  Best Params: ', grid_search_rf.best_params_)
                print('  R2-Score: ', grid_search_rf.best_score_)
                self.alg = self.alg.set_params(**grid_search_rf.best_params_)
            self.alg.fit(X_train, y_train.values.ravel())
            self.pred = self.alg.predict(X_test)
            self.metrics = self.evaluate(self.actual, self.pred)
            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.pred_holdout = self.alg.predict(X_holdout)
                self.metrics_holdout = self.evaluate(self.actual_holdout,
                                                     self.pred_holdout)

        # Cross-validation
        elif (metric_eval == "cv"):
            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                n_holdout = int(df.shape[0] * self.MODELLING_CONFIG["HOLDOUT_PERCENT"])
                holdout = df.iloc[-n_holdout:, ]
                X_holdout = holdout.drop(columns=[self.var, "Date"])
                self.actual_holdout = holdout[[self.var]].values.ravel()
                df = df.iloc[:-n_holdout, ]
            X_train = df.drop(columns=[self.var, "Date"])
            y_train = df[[self.var]]
            self.actual = df[[self.var]].values.ravel()
            self.date = df["Date"]
            self.example = X_train.iloc[0].values
            fold = LeaveOneOut() if cv_type == "loo" else int(
                100 / (100 * self.MODELLING_CONFIG["SPLIT_RATIO"]))
            if self.param_grid != None:
                print("  Running Grid Search...")
                param_grid_1 = {
                    k: v
                    for k, v in self.param_grid.items()
                    if k in ["max_depth", "num_leaves", "n_estimators"]
                }
                n_folds = int(100 / (100 * self.MODELLING_CONFIG["SPLIT_RATIO"])) + 1
                grid_search_rf = GridSearchCV(estimator=self.alg,
                                              param_grid=param_grid_1,
                                              scoring='r2',
                                              cv=n_folds,
                                              n_jobs=8)
                grid_search_rf.fit(X_train, y_train.values.ravel())
                print('  Best Params: ', grid_search_rf.best_params_)
                print('  R2-Score: ', grid_search_rf.best_score_)
                self.alg = self.alg.set_params(**grid_search_rf.best_params_)
            self.alg.fit(X_train, y_train.values.ravel())
            self.pred = cross_val_predict(estimator=self.alg,
                                          X=X_train,
                                          y=y_train.values.ravel(),
                                          cv=fold,
                                          n_jobs=-1)
            self.metrics = self.evaluate(self.actual, self.pred)
            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.pred_holdout = self.alg.predict(X_holdout)
                self.metrics_holdout = self.evaluate(self.actual_holdout,
                                                     self.pred_holdout)

        # NOTE: this condition duplicates the "cv" branch above and is unreachable as
        # written; it is kept verbatim (modulo formatting) in case a different label
        # was intended.
        elif (metric_eval == "cv"):
            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.n_holdout = int(df.shape[0] *
                                     self.MODELLING_CONFIG["HOLDOUT_PERCENT"])
                holdout = df.iloc[-self.n_holdout:, ]
                X_holdout = holdout.drop(columns=[self.var, "Date"])
                self.actual_holdout = holdout[[self.var]].values.ravel()
                df = df.iloc[:-self.n_holdout, ]
            train, test = self.dataset_split(df)
            self.date = test["Date"]
            X_train = train.drop(columns=[self.var, "Date"])
            X_test = test.drop(columns=[self.var, "Date"])
            y_train = train[[self.var]]
            self.actual = test[[self.var]].values.ravel()
            if self.param_grid != None:
                print("  Running Grid Search...")
                param_grid_1 = {
                    k: v
                    for k, v in self.param_grid.items()
                    if k in ["max_depth", "num_leaves", "n_estimators"]
                }
                n_folds = int(100 / (100 * self.MODELLING_CONFIG["SPLIT_RATIO"])) + 1
                grid_search_rf = GridSearchCV(estimator=self.alg,
                                              param_grid=param_grid_1,
                                              scoring='r2',
                                              cv=n_folds,
                                              n_jobs=8)
                grid_search_rf.fit(X_train, y_train.values.ravel())
                ## Second pass for grid search on learning params
                print('  Best Params: ', grid_search_rf.best_params_)
                print('  R2-Score: ', grid_search_rf.best_score_)
                self.alg = self.alg.set_params(**grid_search_rf.best_params_)
            self.alg.fit(X_train, y_train.values.ravel())
            self.pred = self.alg.predict(X_test)
            self.metrics = self.evaluate(self.actual, self.pred)
            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.pred_holdout = self.alg.predict(X_holdout)
                self.metrics_holdout = self.evaluate(self.actual_holdout,
                                                     self.pred_holdout)

    def forecast_model(self, data,
                       seasonal=Config.MODELLING_CONFIG["SEASONAL_OPTION"]):
        df = data[self.predictives]
        train, test = self.dataset_split(
            df,
            self.MODELLING_CONFIG["SPLIT_RATIO"],
            supervised_learning=Config.MODELLING_CONFIG["SUPERVISED_LEARNING"])
        history = [x for x in train]
        prediction_list = list()
        if seasonal == True:
            if self.alg == 'SARIMA':
                train, test = np.log10(train), np.log10(test)
                self.alg = pm.auto_arima(train, start_p=1, d=0, start_q=1,
                                         max_p=5, max_d=2, max_q=5, m=7,
                                         start_P=0, D=0, start_Q=0, max_P=5,
                                         max_D=2, max_Q=5, seasonal=True,
                                         trace=True, error_action='ignore',
                                         suppress_warnings=True, stepwise=True)
                for data in range(len(test)):
                    # refit on the growing history (the original called fit(disp=1),
                    # but pmdarima's fit expects the series itself)
                    self.alg = self.alg.fit(history)
                    self.pred = self.alg.predict(n_periods=1)
                    prediction_list.append(self.pred)
                    self.actual_test = test[data]
                    history.append(self.actual_test)
            elif self.alg == 'HOLT_WINTER':
                self.alg = self.alg(
                    train,
                    seasonal_periods=Config.MODELLING_CONFIG["HOLT_WINTER_SEASON"],
                    trend=Config.MODELLING_CONFIG["HOLT_WINTER_TREND"],
                    seasonal=Config.MODELLING_CONFIG["HOLT_WINTER_SEASONAL"])
                self.pred = self.alg.forecast(len(test))
        elif seasonal == False:
            for data in range(len(test)):
                # walk-forward: refit on the growing history (the original refit on
                # the unchanged train set, so new observations were never used)
                self.alg = ARIMA(history,
                                 order=(Config.MODELLING_CONFIG['ARIMA_P'],
                                        Config.MODELLING_CONFIG['ARIMA_D'],
                                        Config.MODELLING_CONFIG['ARIMA_Q']))
                self.alg = self.alg.fit(disp=1)
                self.pred, self.se, self.conf = self.alg.forecast()
                prediction_list.append(self.pred)
                self.actual_test = test[data]
                history.append(self.actual_test)
        self.metrics = self.evaluate(self.actual_test, self.pred)

    def lstm_model(self, data):
        df = data[self.predictives]
        train, test = self.dataset_split(df, self.MODELLING_CONFIG["SPLIT_RATIO"])
        scaler = MinMaxScaler()
        scaler.fit(train)
        train = scaler.transform(train)
        test = scaler.transform(test)
        # create_dataset is assumed to be defined elsewhere in the project
        X_train, y_train = self.create_dataset(
            train, time_steps=Config.RNN_CONFIG["TIME_STEPS"])
        X_test, y_test = self.create_dataset(
            test, time_steps=Config.RNN_CONFIG["TIME_STEPS"])
        self.actual_train = y_train
        self.actual_test = y_test
        X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
        self.alg = keras.Sequential()
        self.alg.add(
            LSTM(units=Config.RNN_CONFIG["UNITS"],
                 input_shape=(X_train.shape[1], X_train.shape[2])))
        self.alg.add(Dropout(rate=Config.RNN_CONFIG["DROPOUT_RATE"]))
        self.alg.add(Dense(units=Config.RNN_CONFIG["DENSE_UNIT"]))
        self.alg.compile(loss=Config.RNN_CONFIG["LOSS_FUNC"],
                         optimizer=Config.RNN_CONFIG["OPTIMIZER"])
        self.lstm_history = self.alg.fit(
            X_train,
            y_train,
            epochs=Config.RNN_CONFIG["EPOCHS"],
            batch_size=Config.RNN_CONFIG["BATCH_SIZE"],
            validation_split=Config.RNN_CONFIG["VALIDATION_SPLIT"],
            shuffle=Config.RNN_CONFIG["SHUFFLE"],
            validation_data=(X_test, y_test),
            verbose=1,
        )
        self.logger.info(self.alg.summary())
        self.lstm_train_pred = self.alg.predict(X_train)
        self.lstm_test_pred = self.alg.predict(X_test)
        self.lstm_train_pred = scaler.inverse_transform(self.lstm_train_pred)
        y_train = scaler.inverse_transform([y_train])
        self.lstm_test_pred = scaler.inverse_transform(self.lstm_test_pred)
        y_test = scaler.inverse_transform([y_test])
        # evaluate the rescaled test predictions (the original referenced
        # self.pred[:0], an empty slice, which was an evident typo)
        self.metrics = self.evaluate(y_test[0], self.lstm_test_pred[:, 0])

    def feature_importance_plot(self):
        fig, ax = plt.subplots(figsize=(10, len(self.predictives) / 2))
        s = pd.Series(self.alg.feature_importances_, index=self.predictives)
        ax = s.sort_values(ascending=False).plot.barh()
        ax.invert_yaxis()
        patches = [
            mpatches.Patch(label="Test Size: {}".format(self.actual.shape[0]),
                           color='none')
        ]
        for alg, val in self.metrics.items():
            patches.append(
                mpatches.Patch(
                    label="{}: {:0.2f}".format(alg, val),
                    color='none',
                ))
        plt.legend(handles=patches, loc='lower right')
        return fig

    def residual_plot(self):
        fig = plt.figure(figsize=(10, 6))
        gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[3, 1])
        ax1 = fig.add_subplot(gs[0])
        residual = self.actual - self.pred
        sns.residplot(x=self.pred, y=residual, ax=ax1)
        ax1.set_ylabel("Residual")
        ax1.set_xlabel("Predict")
        ax1.set_title(self.station)
        ax2 = fig.add_subplot(gs[1], sharey=ax1)
        ax2.hist(residual, orientation="horizontal")
        ax2.set_xlabel('Residual Distribution')
        return fig
def def_ARIMA(timeseries, p, d, q, steps1):
    # full ARIMA(p, d, q) counterpart of def_MA above
    model = ARIMA(timeseries, (p, d, q)).fit()
    summary = model.summary()
    forecast = model.forecast(steps=steps1)
    return summary, forecast
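# A hypothetical usage sketch (the series name is an assumption; it also assumes the
# def_MA helper defined earlier is in scope):
summary_ma, fc_ma = def_MA(my_series, q=2, steps1=3)
summary_arima, fc_arima = def_ARIMA(my_series, 1, 1, 1, steps1=3)
print(fc_arima[0])  # the old-API forecast returns (values, standard errors, conf. intervals)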
dta0 = pd.read_csv('data.CSV', header=0)
dta0.index = pd.to_datetime(dta0['date'])
##data = pd.DataFrame()
##data['date'] = ['2008/1/11','2008/2/6','2008/3/17','2008/4/13','2008/5/17','2008/6/15','2008/7/1','2008/7/12','2008/8/10','2008/9/14','2008/10/12','2008/11/16','2008/12/13','2009/1/19','2009/2/16','2009/3/13','2009/4/18','2009/5/16','2009/6/20','2009/7/11','2009/8/15','2009/9/19','2009/10/16','2009/11/14','2009/12/11','2010/1/15','2010/2/20','2010/3/13','2010/4/17','2010/5/15','2010/6/12','2010/7/16','2010/8/14','2010/9/18','2010/10/16','2010/11/19','2010/12/24','2011/1/21','2011/2/18','2011/3/19','2011/4/17','2011/5/15','2011/6/18','2011/7/16','2011/8/20','2011/9/24','2011/10/22','2011/11/19','2011/12/24','2012/1/14']
##data['dy'] = [0.62,1.01,1.78,1.29,0.11,-0.35,-0.44,-0.3,-1.11,-1.78,-1.39,-0.94,-0.36,1.47,1.75,2.04,1.03,0.02,-0.59,-1.35,-2.14,-1.96,-1.46,-0.56,0.04,0.96,1.58,1.43,0.95,0.14,-0.3,-1.35,-1.6,-1.98,-1.58,-0.98,0.56,1.14,1.19,1.18,0.61,0.76,-0.66,-1.14,-1.35,-1.85,-0.95,-0.65,0.44,1.09]
##data.index = pd.to_datetime(data['date'])

p = 0
d = 1
q = 1
arima = ARIMA(dta0['H'].dropna(), (p, d, q)).fit()
print(arima.summary())
dta_pred = arima.predict(typ='levels')  # in-sample prediction

### fitted-vs-actual plot
##fig1 = plt.figure(figsize=(12,8))
##plt.plot(dta0['dy'], color='green')
##plt.plot(dta_pred, color='yellow')
##fig1.show()

# model forecast
forecast_ts = arima.forecast(10)
fore = pd.DataFrame()
fore['date'] = ['2021-01-24', '2021-01-25', '2021-01-26']
fore['result'] = pd.DataFrame(forecast_ts[0])
fore.index = pd.to_datetime(fore['date'])
from statsmodels.tsa.arima_model import ARIMA

# calculate errors
errors = model_full.predict(bluejet1) - bluejet1.flyers

# ACF plot to check the significance of the errors
tsa_plots.plot_acf(errors, lags=12)
# From the ACF plot we can see that lags of the errors have significant association,
# hence we can use the errors to forecast the errors for the next 12 time periods.

# Autoregression (p): from the ACF, applying the principle of parsimony, we can take p=1
model_AR = ARIMA(errors, order=(1, 0, 0)).fit(disp=0)
model_AR.summary()
pred_data["forecasted_errors"] = pd.Series(model_AR.forecast(12)[0])
pred_data["improved_forecast"] = pred_data.forecasted_flyers + pred_data.forecasted_errors

# decomposition
import pandas as pd
from pandas import read_csv
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose

series = read_csv('bluejet.csv', header=0, index_col=0)
series.reset_index(inplace=True)
series["date"] = pd.to_datetime(series["Month"])
series = series.set_index("date")
# ACF -> to identify q
plt.subplot(122)
plt.plot(lags_acf)
plt.axhline(y=0, linestyle="-", color="gray")
plt.axhline(y=-1.96 / np.sqrt(len(diff_mystock)), linestyle="--", color='red')
plt.axhline(y=1.96 / np.sqrt(len(diff_mystock)), linestyle="--", color='red')
plt.title("ACF")
plt.xlabel("Lags")
plt.ylabel("Correlation")

p = 0; q = 0; d = 0

# Build the ARIMA model
m1 = ARIMA(diff_mystock, order=(p, d, q)).fit(disp=0)
m1.summary()
plt.hist(m1.resid)
plt.title("ARIMA model residuals")

# Ljung-Box test to check the model's goodness of fit
# H0: residuals are independently distributed
# H1: residuals are not independently distributed
pvalue = sm.stats.acorr_ljungbox(m1.resid, lags=[1])[1][0]  # [1] is the p-value array; take its first element
if pvalue > 0.05:
    print("FTR H0. Residuals are independently distributed")
else:
    print("Reject H0. Residuals are not independently distributed")

# forecast for the next 12 months
f1 = m1.forecast(steps=12)
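# A hedged follow-up sketch: the old-API forecast returns (values, standard errors,
# confidence intervals), so the 12-month forecast above can be plotted with its band.
fc_vals, fc_se, fc_conf = f1
plt.plot(fc_vals)
plt.fill_between(range(len(fc_vals)), fc_conf[:, 0], fc_conf[:, 1], alpha=0.2)
plt.title("12-month forecast with 95% confidence interval")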
# imports added for this excerpt; sm.ols below is the formula interface
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from sqlalchemy import create_engine

engine = create_engine('mysql+mysqlconnector://viewer:@dadata.cba.edu:3306/ACS')
SELECT = """SELECT AVG(hhincome) AS hhincome, year, statefip
FROM ACS
GROUP BY year, statefip
ORDER BY year, statefip"""
data = pd.read_sql(SELECT, engine)

reg = sm.ols("np.log(hhincome) ~ year", data=data).fit()
print(reg.summary())

#%%
reg = sm.ols("np.log(hhincome) ~ year + C(statefip)", data=data).fit()
print(reg.summary())

#%%
from statsmodels.tsa.arima_model import ARIMA

y = data.loc[data['statefip'] == 31, ['hhincome', 'year']]
y.index = pd.to_datetime(y.year)
reg = ARIMA(y['hhincome'], order=(1, 1, 0)).fit()
print(reg.summary())
# split dataset (on straight data = prices)
# ----------
size = int(len(df_comp) * 0.8)
df = df_comp.iloc[:size]
df_test = df_comp.iloc[size:]

# -- creating returns column from train dataset
df['returns'] = df.market_value.pct_change(1) * 100

# review ACF and PACF (in reality it is more practical to run auto_arima than to check
# ACF/PACF manually, but this is for the sake of example)
# ----------
# not done here

# select ARIMA model (by looking at the PACF here) and iterating through more models
# ----------
model_arima_111 = ARIMA(df.market_value, order=(1, 1, 1)).fit()
print(model_arima_111.summary())
print('----------')
model_arima_511 = ARIMA(df.market_value, order=(5, 1, 1)).fit()
print(model_arima_511.summary())
print('----------')

# compare LLR results across models to see which model is best
# ----------
def LLR_test(mod_1, mod_2, DF=1):
    L1 = mod_1.fit().llf
    L2 = mod_2.fit().llf
    LR = (2 * (L2 - L1))
    p = chi2.sf(LR, DF).round(3)
    return p
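# A hedged usage sketch: LLR_test expects unfitted model objects (it calls .fit() itself),
# and ARIMA(1,1,1) is nested in ARIMA(5,1,1) with four fewer AR parameters, so DF=4.
p_llr = LLR_test(ARIMA(df.market_value, order=(1, 1, 1)),
                 ARIMA(df.market_value, order=(5, 1, 1)), DF=4)
print('LLR test p-value:', p_llr)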
pmax = int(len(D_arima_data) / 10)  # as a rule of thumb, the order should not exceed length/10
qmax = int(len(D_arima_data) / 10)  # as a rule of thumb, the order should not exceed length/10
bic_matrix = []  # BIC matrix
for p in range(pmax + 1):
    tmp = []
    for q in range(qmax + 1):
        try:  # some fits raise errors, so use try to skip them
            tmp.append(ARIMA(arima_data, (p, 1, q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)

# In[73]:

bic_matrix = pd.DataFrame(bic_matrix)  # the minimum can be located in this matrix
p, q = bic_matrix.stack().idxmin()  # flatten with stack, then locate the minimum with idxmin
print(u'The p and q with the smallest BIC are: {0}, {1}'.format(p, q))

# In[68]:

model = ARIMA(arima_data, (0, 1, 1)).fit()  # fit an ARIMA(0, 1, 1) model

# In[69]:

model.summary()  # print a model report

# In[70]:

model.forecast(5)  # 5-period-ahead forecast; returns forecasts, standard errors, and confidence intervals
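# In[74]:

# A small hedged follow-up: inspect the whole BIC surface to see how close the competing
# orders are (this assumes the bic_matrix DataFrame built above).
bic_df = bic_matrix.astype(float)
print(bic_df.round(1))
print('best (p, q):', bic_df.stack().idxmin(), ' BIC =', round(bic_df.stack().min(), 1))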
from statsmodels.datasets.macrodata import load_pandas  # import added; load_pandas is used below
from statsmodels.tsa.base.datetools import dates_from_range
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import statsmodels.api as sm

plt.interactive(False)

# let's examine an ARIMA model of CPI
cpi = load_pandas().data['cpi']
dates = dates_from_range('1959q1', '2009q3')
cpi.index = dates

res = ARIMA(cpi, (1, 1, 1), freq='Q').fit()
print(res.summary())

# we can look at the series
cpi.diff().plot()

# maybe logs are better
log_cpi = np.log(cpi)

# check the ACF and PACF plots
acf, confint_acf = sm.tsa.acf(log_cpi.diff().values[1:], confint=95)
# center the confidence intervals about zero
# TODO: demean? --> confint_acf -= confint_acf.mean(1)[:, None]
pacf = sm.tsa.pacf(log_cpi.diff().values[1:], method='ols')
# confidence interval is now an option to pacf
confint_pacf = stats.norm.ppf(1 - .025) * np.sqrt(1 / 202.)
# that helps us determine the number of AR terms
plot_acf(hourly_sentiment_series_diff2)
pyplot.show()
plot_pacf(hourly_sentiment_series_diff2)
pyplot.show()

# Depending on the ACF and PACF, create an ARMA/ARIMA model with AR and MA terms.
# This will infer the frequency, so make sure there are no gaps between datetimes.
ARMA1model_hourly_sentiment = ARIMA(hourly_sentiment_series,
                                    order=(5, 2, 1)).fit(transparams=False)

# If the p-value for an AR/MA coef is > 0.05, it's not significant enough to keep in
# the model. Might want to re-model using only significant terms.
print(ARMA1model_hourly_sentiment.summary())

# Predict the next 5 hours (5 time steps ahead), which is the test/holdout set
ARMA1predict_5hourly_sentiment = ARMA1model_hourly_sentiment.predict(
    '2/6/2019 7:00:00 PM', '2/6/2019 11:00:00 PM', typ='levels')
print('Forecast/predictions for 5 hours ahead ', ARMA1predict_5hourly_sentiment)

# Back-transform so we can compare de-diff'd predicted values with the
# de-diff'd/original actual values. This is done automatically when predicting
# (specify typ='levels'), so there is no need to manually de-diff.
# Nevertheless, let's demo how to de-transform 2 rounds of diffs using cumulative
# sums with the original data given.

# diff2 back to diff1
undiff1 = hourly_sentiment_series_diff2.cumsum().fillna(hourly_sentiment_series_diff2)
# undiff1 back to original data
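# A minimal, self-contained sketch of the same idea on toy data (not the sentiment
# series): undoing d=2 differencing with two cumulative sums recovers the original.
import numpy as np
x = np.array([3.0, 5.0, 4.0, 8.0, 9.0])
d2 = np.diff(x, n=2)                                 # twice-differenced series
d1 = np.concatenate(([x[1] - x[0]], d2)).cumsum()    # back to first differences
x_rec = np.concatenate(([x[0]], d1)).cumsum()        # back to the original levels
assert np.allclose(x_rec, x)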
plot_acf(dftaxi_day.response_variable, lags=52)

#%% Stationarity on entire dataset
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA

dftaxi_day = dftaxi_day[['response_variable']].astype(float)
model = ARMA(dftaxi_day, (1, 0)).fit()
model.summary()
# Matches autocorr(1), therefore a stationary dataset!

#%% Residuals for AR(1) using entire dataset
type(model.resid)
print(model.resid.plot())
print(plot_acf(model.resid, lags=50))

#%% ARIMA
arima_model = ARIMA(dftaxi_day, (28, 1, 1)).fit()
arima_model.summary()
print(arima_model.resid.plot())
print(plot_acf(arima_model.resid, lags=50))

#%% Predict
#arima_model.predict(1, 100).plot()
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax = train.plot(ax=ax)
fig = arima_model.plot_predict('2016-10-01', '2016-12-31', ax=ax, plot_insample=False)

###############################################################################
#%%
# then consider adding an MA term to the model. The lag at which the ACF cuts off is the indicated number of MA terms.
fig1 = sm.graphics.tsa.plot_acf(trainWTI['WTI'])
ax = fig1.add_subplot(111)
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
plt.show()

fig2 = sm.graphics.tsa.plot_pacf(trainWTI['WTI'])
ax = fig2.add_subplot(111)
ax.set_xlabel("Lag")
ax.set_ylabel("PACF")
plt.show()

# Parameter freq indicates that monthly statistics are used
arima_mod100 = ARIMA(trainWTI, (2, 0, 0), freq='M').fit()  # try (1,0,1)
print(arima_mod100.summary())

# Check assumptions:
# 1) The residuals are not serially correlated from one observation to the next.
#    The Durbin-Watson statistic is used to test for the presence of serial correlation among the residuals.
#    The value of the Durbin-Watson statistic ranges from 0 to 4.
#    As a general rule of thumb, the residuals are uncorrelated if the Durbin-Watson statistic is approximately 2.
#    A value close to 0 indicates strong positive correlation, while a value of 4 indicates strong negative correlation.
print("==================== Durbin-Watson =====================")
print(sm.stats.durbin_watson(arima_mod100.resid.values))
print("========================================================")

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax = arima_mod100.resid.plot(ax=ax)
ax.set_title("Residual series")
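# A small hedged helper reflecting the rule of thumb above (the 1.5/2.5 cutoffs are a
# common convention, not from the original):
dw = sm.stats.durbin_watson(arima_mod100.resid.values)
if dw < 1.5:
    print("DW = %.2f: positive serial correlation is likely" % dw)
elif dw > 2.5:
    print("DW = %.2f: negative serial correlation is likely" % dw)
else:
    print("DW = %.2f: residuals look serially uncorrelated" % dw)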
import math
from statistics import mean

plt.style.use('fivethirtyeight')

df = pd.read_excel("../00Daily/Australia.xlsx", squeeze=True, parse_dates=True)
df = df[["Date", "LocalTransmission"]]
df.set_index("Date", inplace=True)
df.dropna(inplace=True)
##df['Date'] = pd.to_datetime(df['Date'])
LocalTransmission = df['LocalTransmission'].astype('int32')
#print(df.head())
print(df.index)

result = ARIMA(df, order=(1, 1, 1)).fit(disp=False)
print(result.summary())
#print(result.params)

predictions = result.predict(start="2020-03-01", end="2020-05-01")
#accuracy = result.score()
print(predictions)
##accuracy = result.score()
#print(accuracy)

result.plot_predict(start="2020-03-01", end="2020-05-01")
plt.suptitle(
    'Prediction for positive cases in Australia \n Algorithm used: ARIMA',
    fontsize=12)
plt.show()

##def mean_forecast_error(y, yhat):
##    return y.sub(yhat).mean()
plot_acf(D_data).show()  # ACF plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()  # PACF plot

ADF(D_data[u'销量差分'])  # stationarity test on the differenced sales column

# white-noise test
from statsmodels.stats.diagnostic import acorr_ljungbox
acorr_ljungbox(D_data, lags=1)  # returns the test statistic and p-value

from statsmodels.tsa.arima_model import ARIMA

# order selection
pmax = int(len(D_data) / 10)  # as a rule of thumb, the order should not exceed length/10
qmax = int(len(D_data) / 10)  # as a rule of thumb, the order should not exceed length/10
bic_matrix = []  # BIC matrix
for p in range(pmax + 1):
    tmp = []
    for q in range(qmax + 1):
        try:  # some fits raise errors, so use try to skip them
            tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)

bic_matrix = pd.DataFrame(bic_matrix)  # the minimum can be located in this matrix
p, q = bic_matrix.stack().idxmin()  # flatten with stack, then locate the minimum with idxmin
print(u'The p and q with the smallest BIC are: %s, %s' % (p, q))

model = ARIMA(data, (0, 1, 1)).fit()  # fit an ARIMA(0, 1, 1) model
model.summary()  # print a model report
model.forecast(5)  # 5-day-ahead forecast; returns forecasts, standard errors, and confidence intervals
        if h > 0:
            print('Model ARIMA(%s,1,%s) fails the white-noise test' % (p, q))
            print('Removing the [%s,%s] combination from the BIC matrix and recomputing' % (p, q))
            matrix.iloc[p, q] = np.nan
            arimafail = arima
            continue
        else:
            # print(p, q)
            print('Model ARIMA(%s,%s) passes the white-noise test' % (p, q))
            break

# Step 5 (drive D) --------- model forecasting
print('Model report: summary():\n', arima.summary())
forecast_values, forecasts_standard_error, forecast_confidence_interval = arima.forecast(5)
pre_data = pd.DataFrame(xtest_value)
pre_data.insert(1, 'CWXT_DB:184:D:\\_predict', forecast_values)
pre_data.rename(columns={
    'CWXT_DB:184:D:\\': '实际值',
    'CWXT_DB:184:D:\\_predict': '预测值'
},
                inplace=True)
result_d = pre_data.applymap(lambda x: '%.2f' % x)
result_d.to_excel('../my_data/pedictdata_D_BIC_ARMA.xlsx')

# Step 5 (drive D) --------- model evaluation
# To judge how well the forecasting model performs, this chapter uses three statistics
# that measure forecast accuracy: mean absolute error, root mean squared error, and
# mean absolute percentage error.
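# A hedged sketch of the three accuracy measures named above (it assumes the renamed
# 实际值/预测值 columns of pre_data built earlier):
import numpy as np
actual = pre_data['实际值'].astype(float).values
pred = pre_data['预测值'].astype(float).values
mae = np.mean(np.abs(actual - pred))
rmse = np.sqrt(np.mean((actual - pred) ** 2))
mape = np.mean(np.abs((actual - pred) / actual)) * 100
print('MAE = %.4f  RMSE = %.4f  MAPE = %.2f%%' % (mae, rmse, mape))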
            print(u'Model ARIMA(%s,1,%s) fails the white-noise test' % (p, q))
            print('Removing the [%s,%s] combination from the AIC matrix and recomputing' % (p, q))
            aic_matrix.iloc[p, q] = np.nan
            arimafail = arima
            continue
        else:
            print(p, q)
            print(u'Model ARIMA(%s,1,%s) passes the white-noise test' % (p, q))
            break

# In[7]:

arima.summary()  # note: when p and q are both 0, the summary method raises an error

# In[8]:

forecast_values, forecasts_standard_error, forecast_confidence_interval = arima.forecast(5)
forecast_values
# arimaf = ARIMA(xdata2, (0,1,1)).fit()
# arimaf.forecast(5)[0]

# In[9]:

predictdata = pd.DataFrame(xtest_value)
predictdata.insert(1, 'CWXT_DB:184:C:\\_predict', forecast_values)
predictdata.rename(columns={
    'CWXT_DB:184:C:\\': u'实际值',
    'CWXT_DB:184:C:\\_predict': u'预测值'
},
                   inplace=True)
# # diff(diff(y)) = ARMA(p, q)
#
# Differencing of order _d_ is the same as applying the `diff` function _d_ times.
# Compared to an ARMA model, ARIMA models _do not rely on the underlying series being
# stationary._ The differencing operation can _convert_ the series to one that is stationary.
#
# Since ARIMA models automatically include differencing, we can use this on a broader
# set of data without assumptions of a constant mean.

# In[163]:

from statsmodels.tsa.arima_model import ARIMA

# We can see that this model in fact simplifies automatically to an ARMA model.
arima101 = ARIMA(store1_sales_data, (1, 0, 1)).fit()
arima101.summary()

# In[168]:

# Let's remove the moving-average component since it wasn't particularly useful before.
# Also, let's add the differencing parameter.
# Now this is equivalent to an AR(1) model on the differenced data.
arima110 = ARIMA(store1_sales_data, (1, 1, 0)).fit()
# Note the value of the coefficient.
arima110.summary()

# In[169]:

# We can compute the lag-1 autocorrelation of the differenced series and see if they match!
store1_sales_data.Sales.diff(1).autocorr(1)
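# In[170]:

# A minimal sketch of the stationarity claim above (toy data, not the store sales):
# second differencing removes a quadratic trend, leaving a constant series.
import pandas as pd
s = pd.Series([1.0, 4.0, 9.0, 16.0, 25.0])  # quadratic trend, n**2
print(s.diff().diff().dropna().tolist())     # [2.0, 2.0, 2.0] -- constant, hence stationary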