def get(self, request, *args, **kwargs):
    n_steps = int(self.request.query_params.get('nsteps', 10))
    last_date = MultivarientData.objects.latest('date').date + datetime.timedelta(days=30)
    data = read_frame(MultivarientData.objects.all())
    data['date'] = pd.to_datetime(data['date'])
    data = data.drop('id', axis=1)
    data = data.set_index('date')
    oildf = data['oil_price']
    date_index = pd.date_range(start=last_date, periods=n_steps, freq='M')
    df = pd.DataFrame()
    # Forecast oil prices first; the oil forecast is then used as the
    # exogenous regressor for the iron price model below.
    arima = SARIMAX(endog=oildf, order=(3, 0, 4), freq='M',
                    seasonal_order=(0, 1, 1, 6), trend='t',
                    enforce_stationarity=False,
                    enforce_invertibility=False).fit()
    df['oilpriceprediction'] = arima.predict(date_index.min(), date_index.max())
    arima = SARIMAX(endog=data['iron_price'], exog=data[['oil_price']],
                    order=(1, 0, 0), freq='M',
                    seasonal_order=(0, 1, 1, 6), trend='t',
                    enforce_stationarity=False,
                    enforce_invertibility=False).fit()
    df['ironpriceprediction'] = arima.predict(df.index.min(), df.index.max(),
                                              exog=df[['oilpriceprediction']])
    df['date'] = df.index
    oil_data = df[['date', 'oilpriceprediction']].values.tolist()
    iron_data = df[['date', 'ironpriceprediction']].values.tolist()
    return Response({'oil_data': oil_data, 'iron_data': iron_data})
def run_sarimax(language):
    create_predictions_folder()
    series = read_csv(os.path.join(DATA_FOLDER, language + CSV_FILE_SUFFIX),
                      header=0, parse_dates=[0], index_col=0,
                      squeeze=True, date_parser=parser)
    data = series.values.tolist()
    train, test = data[:-12], data[-12:]
    model_fit = SARIMAX(train, order=(2, 1, 4),
                        seasonal_order=(1, 1, 1, 12)).fit()
    dates_list = get_future_date_list()
    print(len(dates_list))
    test_pred = model_fit.predict(len(train) + 1, len(data), dynamic=True)
    future_pred = model_fit.predict(len(data), len(data) + 59, dynamic=True)
    pyplot.figure()
    pyplot.title("Predictions based on SARIMAX model : " + language + " repositories")
    pyplot.plot(series, label='Historical data')
    pyplot.plot(series.keys().tolist(),
                [None for i in range(len(train))] + test_pred.tolist(),
                label='Predictions - Test data')
    pyplot.plot(dates_list, future_pred, label='Predictions - 2019 to 2023')
    pyplot.legend()
    pyplot.savefig(os.path.join(PREDICTIONS_FOLDER, language + "_predictions_SARIMAX.png"))
    rmse = RMSE(test, test_pred)
    print('SARIMAX RMSE: %.3f' % rmse + " for " + language + " repos test set")
    write_to_csv([str(date_)[:-3] for date_ in dates_list], future_pred,
                 language, SARIMAX_)
    return future_pred
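The RMSE helper called above is not defined in this section; a minimal sketch of what it most likely computes (root mean squared error over the held-out window), assuming both arguments are equal-length numeric sequences:

from math import sqrt
from sklearn.metrics import mean_squared_error

def RMSE(actual, predicted):
    # Hypothetical reconstruction: root mean squared error between the
    # test values and the SARIMAX predictions.
    return sqrt(mean_squared_error(actual, predicted))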
def get(self, request, *args, **kwargs):
    first_date = str(MultivarientData.objects.earliest('date').date)
    last_date = str(MultivarientData.objects.latest('date').date - datetime.timedelta(days=250))
    start_date = self.request.query_params.get('startdate', first_date)
    end_date = self.request.query_params.get('enddate', last_date)
    if end_date > last_date:
        end_date = last_date
    date_valid = MultivarientData.objects.exclude(date__gt=end_date).exclude(date__lt=start_date)
    if not date_valid:
        start_date = first_date
        end_date = last_date
    data = read_frame(MultivarientData.objects.all())
    data['date'] = pd.to_datetime(data['date'])
    data = data.drop('id', axis=1)
    data = data.set_index('date')
    startdate = dat.strptime(start_date, '%Y-%m-%d')
    enddate = dat.strptime(end_date, '%Y-%m-%d')
    nextmonth = enddate + relativedelta.relativedelta(months=1)
    train, test = data[startdate:nextmonth], data[nextmonth:]
    oiltrain = train['oil_price']
    arima = SARIMAX(endog=oiltrain, order=(3, 0, 4), freq='M',
                    seasonal_order=(0, 1, 1, 6), trend='t',
                    enforce_stationarity=False,
                    enforce_invertibility=False).fit()
    test['oilpriceprediction'] = arima.predict(test.index.min(), test.index.max())
    arima = SARIMAX(endog=train['iron_price'], exog=train[['oil_price']],
                    order=(1, 0, 0), freq='M',
                    seasonal_order=(0, 1, 1, 6), trend='t',
                    enforce_stationarity=False,
                    enforce_invertibility=False).fit()
    test['ironpriceprediction'] = arima.predict(test.index.min(), test.index.max(),
                                                exog=test[['oilpriceprediction']])
    test['date'] = test.index.astype('str')
    ironactual_data = test[['date', 'iron_price']].values.tolist()
    ironpredicted_data = test[['date', 'ironpriceprediction']].values.tolist()
    oilactual_data = test[['date', 'oil_price']].values.tolist()
    oilpredicted_data = test[['date', 'oilpriceprediction']].values.tolist()
    ironmetrics = forecast_accuracy(test['iron_price'], test['ironpriceprediction'])
    oilmetrics = forecast_accuracy(test['oil_price'], test['oilpriceprediction'])
    return Response({
        'actual_irondata': ironactual_data,
        'predicted_irondata': ironpredicted_data,
        'actual_oildata': oilactual_data,
        'predicted_oildata': oilpredicted_data,
        'ironmape': ironmetrics.get('mape', 0) * 100,
        'oilmape': oilmetrics.get('mape', 0) * 100})
def sarimax(train, test):
    train_pred = pd.DataFrame(data=None, index=train.index, columns=train.columns)  # in-sample predictions on train set
    test_pred = pd.DataFrame(data=None, index=test.index, columns=test.columns)  # out-of-sample predictions on test set
    for (i, train_day, test_day) in [(i, dp.split(train, nsplits=7)[i], dp.split(test, nsplits=7)[i])
                                     for i in dp.split(train, nsplits=7)]:  # for each day of the week
        train_pred_day = pd.DataFrame(data=None, index=train_day.index, columns=train_day.columns)
        test_pred_day = pd.DataFrame(data=None, index=test_day.index, columns=test_day.columns)
        for hour in train_day:  # for each hour in a day
            train_day_hour = train_day[hour]  # train samples for this hour
            test_day_hour = test_day[hour]  # test samples for this hour
            model_train = SARIMAX(train_day_hour, order=(0, 1, 1),
                                  seasonal_order=(0, 1, 1, 7), trend='c',
                                  measurement_error=True).fit()  # train model
            # Workaround for a rolling day-ahead forecast: re-filter the trained
            # parameters over train+test so each prediction conditions on the
            # observed history without re-estimating the model.
            model_test = SARIMAX(pd.concat([train_day_hour, test_day_hour]),
                                 order=(0, 1, 1), seasonal_order=(0, 1, 1, 7),
                                 trend='c',
                                 measurement_error=True).filter(model_train.params)
            train_pred_day[hour] = model_test.predict(start=0, end=len(train_day) - 1)  # predict in-sample on train set
            test_pred_day[hour] = model_test.predict(start=len(train_day))  # predict out-of-sample on test set
        train_pred.iloc[i::7, :] = train_pred_day  # fill corresponding rows with in-sample predictions
        test_pred.iloc[i::7, :] = test_pred_day  # fill corresponding rows with out-of-sample predictions
    return train_pred, test_pred
def sarima_model(request):
    df = pd.read_csv('sales/data/IPN31152N.csv', index_col=0)
    df.index = pd.date_range(start='1972-01-01', end='2020-01-01', freq='M')
    train_df = df[df.index <= '2017-12-31']
    test_df = df[df.index > '2017-12-31']
    model1 = SARIMAX(train_df['IPN31152N'], order=(3, 1, 3),
                     seasonal_order=(0, 1, 1, 12)).fit()
    # Note: SARIMAX predictions are already in levels; the `typ`/`type`
    # keyword belongs to the old ARIMA results class, so it is dropped here.
    pred = model1.predict(start=len(train_df),
                          end=len(train_df) + len(test_df) - 1)
    df_pred = pd.DataFrame(pred)
    df_pred.columns = ['IPN31152N']
    results = {
        'test': [[time_unix(test_df.index[i]), test_df.iloc[i]['IPN31152N']]
                 for i in range(0, len(test_df))],
        'predict': [[time_unix(df_pred.index[i]), df_pred.iloc[i]['IPN31152N']]
                    for i in range(0, len(df_pred))]
    }
    re = {}
    re['2018'] = [round(measure_metric(test_df['IPN31152N'][:12].values,
                                       df_pred['IPN31152N'][:12]) * 100, 2)]
    re['2019'] = [round(measure_metric(test_df['IPN31152N'][-12:].values,
                                       df_pred['IPN31152N'][-12:]) * 100, 2)]
    context = {"data": json.dumps(results), 'mape': json.dumps(re)}
    return render(request, 'charts_model.html', context=context)
def testTime(request):
    try:
        df = pd.read_csv('MyApp/data/IPN31152N.csv', index_col=0)
        df.index = pd.date_range(start='1972-01-01', end='2020-01-01', freq='M')
        train_df = df[df.index <= '2017-12-31']
        test_df = df[df.index > '2017-12-31']
        model1 = SARIMAX(train_df['IPN31152N'], order=(3, 1, 1),
                         seasonal_order=(0, 1, 1, 12)).fit()
        # `type='levels'` dropped: that keyword belongs to the old ARIMA
        # results class, not SARIMAX.
        pred = model1.predict(start=len(train_df),
                              end=len(train_df) + len(test_df) - 1)
        df_pred = pd.DataFrame(pred)
        df_pred.columns = ['IPN31152N']
        results = {
            'test': [[time_unix(test_df.index[i]), test_df.iloc[i]['IPN31152N']]
                     for i in range(0, len(test_df) - 1)],
            'predict': [[time_unix(df_pred.index[i]), df_pred.iloc[i]['IPN31152N']]
                        for i in range(0, len(df_pred) - 1)]
        }
        # context = {"data": json.dumps(results), "aa": "hihi", "haha": "hahahaha"}
        return Response({'result': results})
    except ValueError as e:
        # return JsonResponse(e.args[0], status.HTTP_400_BAD_REQUEST)
        return Response(status=status.HTTP_400_BAD_REQUEST)
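time_unix is used here and in sarima_model above but is not defined in this section; a plausible minimal sketch, assuming the front-end charts expect JavaScript-style millisecond Unix timestamps:

import calendar

def time_unix(ts):
    # Hypothetical helper: convert a pandas Timestamp to a Unix timestamp
    # in milliseconds (the format JS charting libraries typically expect).
    return calendar.timegm(ts.timetuple()) * 1000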
def sarima(data, col, train, test, order_val, s_ord, tr, frequency):
    """
    data      - entire DataFrame
    col       - target column
    train     - train DataFrame
    test      - test DataFrame
    order_val - (p, d, q)
    s_ord     - (P, D, Q, s)
    tr        - str {'n', 'c', 't', 'ct'} or iterable, optional
    frequency - prefix for the saved plot filename
    """
    y_hat_avg = test.copy()
    fit1 = SARIMAX(train[col], order=order_val, seasonal_order=s_ord, trend=tr).fit()
    y_hat_avg['SARIMA'] = fit1.predict(start=train.index[-1],
                                       end=test.index[-1], dynamic=True)
    print('Rmse= ', rmse(test[col], y_hat_avg['SARIMA']))
    # print(y_hat_avg)
    plt.figure(figsize=(16, 8))
    plt.plot(train[col], label='Train')
    plt.plot(test[col], label='Test')
    plt.plot(y_hat_avg['SARIMA'], label='SARIMA')
    plt.legend(loc='best')
    plt.savefig(frequency + 'sarima.png')
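A hypothetical call for orientation; the DataFrame df, its 'sales' column, and the chosen orders are illustrative and not from the original repo:

# Assumed monthly data with a 'sales' column, split into train and test.
train, test = df.iloc[:-12], df.iloc[-12:]
sarima(df, 'sales', train, test,
       order_val=(1, 1, 1),    # (p, d, q)
       s_ord=(0, 1, 1, 12),    # (P, D, Q, s) for monthly seasonality
       tr='c',                 # constant trend term
       frequency='monthly_')   # filename prefix for the saved plot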
def sarima_models_for_27_zipcodes():
    # SARIMA orders after running grid search
    sarima_orders = [((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 1, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 1, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 1, 12)), ((1, 1, 1), (1, 1, 1, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 1, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 0), (1, 1, 0, 12))]
    # Training models based on the optimal SARIMA orders
    regions = zipcodes_top27()
    data = load_data_top_27()
    train, test = train_test_split(data, '2013-01-01', '2017-10-01')
    sarima_test_predictions = []
    sarima_models = []
    for i in range(len(regions)):
        model = SARIMAX(train.iloc[:, i], order=sarima_orders[i][0],
                        seasonal_order=sarima_orders[i][1],
                        enforce_invertibility=False,
                        enforce_stationarity=False).fit()
        # (typ='levels' dropped: that kwarg belongs to the old ARIMA results API)
        test_preds = model.predict(start=test.iloc[:, i].index[0],
                                   end=test.iloc[:, i].index[-1])
        sarima_test_predictions.append(test_preds)
        sarima_models.append(model)
    sns.set(font_scale=1)
    sns.set_style('white')
    pd.plotting.register_matplotlib_converters()
    fig, ax = plt.subplots(9, 3, figsize=(20, 18))
    i = 0
    for row in range(9):
        for col in range(3):
            err = round(np.sqrt(mse(test.iloc[:, i], sarima_test_predictions[i])), 0)
            test.iloc[:, i].plot(ax=ax[row][col], color='blue',
                                 label='Actual :' + str(regions[i]))
            sarima_test_predictions[i].plot(ax=ax[row][col], color='k',
                                            label='Preds, RMSE = ' + str(err))
            ax[row][col].legend(loc='upper left')
            i += 1
    return plt.show()
def arimax(self, gr, feat, param):
    # If there are no external features, there is no forecast result
    if self.ext is None:
        return pd.DataFrame(columns=['ds', 'y'])
    # Input monthly data
    df = self.df_m.copy()
    df = self.monthlyfeat(self.df_m, col=feat)
    df['y'] = self.valtogr(df) if gr else df['y']
    # Clean data: drop nulls from the growth calculation, fill 0 when no external data
    df = df.iloc[len([x for x in df['y'] if pd.isnull(x)]):, :]
    df = df.fillna(0).reset_index(drop=True)
    # Prepare data
    x = df['y'].values
    ex = df.iloc[:, 2:].values
    # Fit model1 with external regressors
    m1 = SARIMAX(x, exog=ex, order=(param['p'], param['d'], param['q']),
                 initialization='approximate_diffuse')
    m1 = m1.fit(disp=False)
    # Prepare external data over the forecast horizon; stop once the
    # external features are no longer available
    df_pred = pd.DataFrame(columns=['ds', 'y'])
    for i in self.dt_m:
        df_pred = df_pred.append({'ds': i}, ignore_index=True)
        df_pred = self.monthlyfeat(df_pred, col=feat)
        if np.isnan(list(df_pred.iloc[-1, 2:].values)).any():
            df_pred = df_pred.iloc[:-1, :]
            break
    # Forecast model1
    ex_pred = df_pred.iloc[:, 2:].values
    r1 = m1.predict(start=df.index[-1] + 1,
                    end=df.index[-1] + ex_pred.shape[0], exog=ex_pred)
    # Model2 (used when there are no external features for the remaining horizon)
    if len(r1) < self.fcst_pr:
        # Fit model2 without external regressors
        m2 = SARIMAX(x, order=(param['p'], param['d'], param['q']),
                     initialization='approximate_diffuse')
        m2 = m2.fit(disp=False)
        # Forecast model2
        r2 = m2.predict(start=df.index[-1] + ex_pred.shape[0] + 1,
                        end=df.index[-1] + self.fcst_pr)
    else:
        r2 = []
    # Summarize result
    r = list(r1) + list(r2)
    r = pd.DataFrame(zip(self.dt_m, r), columns=['ds', 'y'])
    r['y'] = self.grtoval(r, self.df_m) if gr else r['y']
    return self.correctzero(r)
def walk_forward_validation_single_run(model, train, test):
    fitted = SARIMAX(train.values, order=(model.ar, model.d, model.ma),
                     seasonal_order=(model.s_ar, model.s_d, model.s_ma,
                                     model.s_period),
                     trend='c').fit()
    # `typ` arg only exists for ARIMA, not the SARIMAX model
    predicted_vals = fitted.predict(model.d,
                                    train.shape[0] - model.d + test.shape[0] - 1)
    rmsse = get_rmsse(train, test, predicted_vals[-test.shape[0]:])
    return rmsse
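get_rmsse is not defined in this section; a minimal sketch of the RMSSE metric it presumably computes (forecast RMSE scaled by the in-sample RMSE of a naive one-step forecast, as popularized by the M5 competition), assuming array-like inputs:

import numpy as np

def get_rmsse(train, test, preds):
    # Hypothetical reconstruction of root mean squared scaled error:
    # scale the test RMSE by the naive (lag-1) in-sample RMSE.
    train, test, preds = np.asarray(train), np.asarray(test), np.asarray(preds)
    naive_mse = np.mean(np.diff(train) ** 2)
    return np.sqrt(np.mean((test - preds) ** 2) / naive_mse)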
def meta_grid_search(ts, TEST_SIZE=0.2, model_kws={}, verbose=True, return_kws=False):
    import pmdarima as pm
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    ## Train Test Split
    idx_split = get_train_test_split_index(ts, TEST_SIZE=TEST_SIZE)
    ts_train = ts.iloc[:idx_split].copy()
    ts_test = ts.iloc[idx_split:].copy()
    ## Combine default kwargs and model_kws
    model_kwargs = dict(start_p=0, start_q=0, start_P=0, start_Q=0,
                        max_p=5, max_q=6, max_P=5, max_Q=5, max_D=3,
                        suppress_warnings=True, stepwise=False, trace=False,
                        m=6, seasonal=True, with_intercept=True,
                        stationary=False)  # was misspelled 'stionarity' in the original
    for k, v in model_kws.items():
        model_kwargs[k] = v
    if verbose:
        print("pm.auto_arima args:")
        print(model_kwargs)
    model = pm.auto_arima(ts_train, **model_kwargs)
    display(model.summary())
    # Refit the selected orders with statsmodels SARIMAX. The original passed
    # **model.get_params(), but auto_arima's params include keys that the
    # SARIMAX constructor does not accept, so only the orders are forwarded.
    model_sarimax = SARIMAX(ts_train, order=model.order,
                            seasonal_order=model.seasonal_order).fit()
    preds = model_sarimax.predict(ts_test.index[0], ts_test.index[-1])
    res = get_model_metrics(ts_test, preds, ts_train)
    display(res)
    return model_sarimax
def arima(self, gr, param):
    # Input monthly data
    df = self.df_m.copy()
    df['y'] = self.valtogr(df) if gr else df['y']
    df = df.dropna().reset_index(drop=True)
    # Prepare training data
    x = df['y'].values
    # Fit model
    m = SARIMAX(x, order=(param['p'], param['d'], param['q']),
                initialization='approximate_diffuse')
    m = m.fit(disp=False)
    # Forecast
    r = m.predict(start=df.index[-1] + 1, end=df.index[-1] + self.fcst_pr)
    r = pd.DataFrame(zip(self.dt_m, r), columns=['ds', 'y'])
    r['y'] = self.grtoval(r, self.df_m) if gr else r['y']
    return self.correctzero(r)
def pipeline(data, cfg):
    if cfg['autoencoder']:
        # encoder, cfg = pre_training(data=avocado_data, cfg=cfg)
        autoencoder = Autoencoder(data, cfg)
        autoencoder.train()
        autoencoder.test()
    else:
        autoencoder = None
    # Extract data
    train_x, train_y, train_f = data.get_train_sequence()
    plt.figure()
    plt.plot(data.data)
    plt.show()
    # Fit model
    mc_model = MonteCarloNetwork(data, autoencoder, cfg)
    mc_model.train(train_x, train_y, train_f)
    # model = train_model(train_x, train_y, cfg)
    # Fit seasonal arima
    # sarimax = Sarimax(data.get, cfg)
    test_x, test_y, test_f = data.get_test_sequence()
    # Forecast on the last portion of the data set
    mse_test, pred_test = monte_carlo_dropout(mc_model, test_x, test_y, test_f)
    model_es = ExponentialSmoothing(data.train)
    model_es = model_es.fit()
    pred_es = model_es.predict(start=data.test.index[cfg['sequence_length']],
                               end=data.test.index[-1])
    model_arima = SARIMAX(data.train, order=(3, 1, 0))
    model_arima = model_arima.fit()
    pred_arima = model_arima.predict(start=data.test.index[cfg['sequence_length']],
                                     end=data.test.index[-1])
    pred_es = np.asarray(pred_es).reshape(test_y.shape)
    pred_arima = np.asarray(pred_arima).reshape(test_y.shape)
    print('======= Test Statistics =======')
    statistics(test_x, test_y, mse_test, pred_test)
    print('Exponential Smoothing:', mean_squared_error(test_y, pred_es))
    print('SARIMAX:', mean_squared_error(test_y, pred_arima))
    plt.figure()
    plt.plot(data.data)
    plt.show()
    plot_airpassengers(data.data, pred_test, mse_test, pred_es, pred_arima)
def get(self, request, *args, **kwargs):
    first_date = str(UnivarientData.objects.earliest('date').date)
    last_date = str(UnivarientData.objects.latest('date').date - datetime.timedelta(days=250))
    start_date = self.request.query_params.get('startdate', first_date)
    end_date = self.request.query_params.get('enddate', last_date)
    if end_date > last_date:
        end_date = last_date
    date_valid = UnivarientData.objects.exclude(date__gt=end_date).exclude(date__lt=start_date)
    if not date_valid:
        start_date = first_date
        end_date = last_date
    data = read_frame(UnivarientData.objects.all())
    data['date'] = pd.to_datetime(data['date'])
    data = data.drop('id', axis=1)
    data = data.set_index('date')
    startdate = dat.strptime(start_date, '%Y-%m-%d')
    enddate = dat.strptime(end_date, '%Y-%m-%d')
    nextmonth = enddate + relativedelta.relativedelta(months=1)
    train, test = data[startdate:nextmonth], data[nextmonth:]
    arima = SARIMAX(train, order=(1, 0, 2), freq='M',
                    seasonal_order=(1, 1, 2, 6), trend='t',
                    enforce_stationarity=False,
                    enforce_invertibility=False).fit()
    predict = arima.predict(test.index.min(), test.index.max())
    predictdata = pd.DataFrame(predict, index=test.index, columns=['predictprice'])
    metrics = forecast_accuracy(predictdata.values, test.values)
    predictdata['actual'] = test.values
    predictdata['date'] = predictdata.index.astype('str')
    actual_data = predictdata[['date', 'actual']].values.tolist()
    predicted_data = predictdata[['date', 'predictprice']].values.tolist()
    return Response({'actual_data': actual_data,
                     'predicted_data': predicted_data,
                     'mape': metrics.get('mape', 0) * 100})
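forecast_accuracy is called in several of these views but is not defined in this section; a minimal sketch, assuming it returns a dict of error metrics with 'mape' stored as a fraction (the callers multiply it by 100). Note the callers are not consistent about argument order, so a symmetric naming is used here only for illustration:

import numpy as np

def forecast_accuracy(forecast, actual):
    # Hypothetical reconstruction: dict of common error metrics.
    forecast = np.asarray(forecast, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    mape = np.mean(np.abs(forecast - actual) / np.abs(actual))  # fraction, not percent
    rmse = np.sqrt(np.mean((forecast - actual) ** 2))
    return {'mape': mape, 'rmse': rmse}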
class Sarimax:
    def __init__(self, df, cfg):
        self.series = df[cfg['target_feature']]
        self.model = SARIMAX(self.series, order=(3, 1, 0),
                             seasonal_order=(0, 0, 0, 12))

    def fit_model(self):
        # Fit model (replaces the model object with its fitted results)
        self.model = self.model.fit(disp=0)
        print(self.model.summary())

    def plot_autocorrelation(self):
        # Plot autocorrelation
        autocorrelation_plot(self.series)
        plt.show()

    def predict_arima(self, series):
        return self.model.predict(series)
def get(self, request, *args, **kwargs):
    n_steps = int(self.request.query_params.get('nsteps', 10))
    last_date = UnivarientData.objects.latest('date').date + datetime.timedelta(days=30)
    data = read_frame(UnivarientData.objects.all())
    data['date'] = pd.to_datetime(data['date'])
    data = data.drop('id', axis=1)
    data = data.set_index('date')
    arima = SARIMAX(data, order=(1, 0, 2), freq='M',
                    seasonal_order=(1, 2, 1, 6),
                    enforce_stationarity=False,
                    enforce_invertibility=False).fit()
    date_index = pd.date_range(start=last_date, periods=n_steps, freq='M')
    data = pd.DataFrame()
    data['prediction'] = arima.predict(date_index.min(), date_index.max())
    data['date'] = date_index
    predicted_data = data[['date', 'prediction']].values.tolist()
    return Response({'predicted_data': predicted_data})
def Auto_Arima(df, dirloc, filename):
    import itertools
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    p = d = q = range(0, 3)
    pdq = list(itertools.product(p, d, q))
    seas_decomp = []
    for x in pdq:
        x1 = (x[0], x[1], x[2], 12)
        seas_decomp.append(x1)
    print("Computing AIC of different seasonal ARIMA models.....\n")
    arima_order = []
    seas_order = []
    aic_val = []
    for params in pdq:
        for seas_par in seas_decomp:
            mod = SARIMAX(df, order=params, seasonal_order=seas_par,
                          enforce_stationarity=False,
                          enforce_invertibility=False, freq="MS").fit()
            arima_order.append(params)
            seas_order.append(seas_par)
            aic_val.append(round(mod.aic, 2))
            print("SARIMA: {} X {} | AIC = {}".format(params, seas_par, round(mod.aic, 2)))
    results = pd.DataFrame({"ARIMA Order": arima_order,
                            "Seasonal Order": seas_order,
                            "AIC Value": aic_val})
    results_sorted = results.sort_values(by="AIC Value", ascending=True)
    results_sorted = results_sorted.reset_index(drop=True)
    print("Selected SARIMA Order:", results_sorted.head(2))
    final_model = SARIMAX(df, order=results_sorted["ARIMA Order"][0],
                          seasonal_order=results_sorted["Seasonal Order"][0],  # was misspelled 'seasona_order'
                          enforce_stationarity=False,
                          enforce_invertibility=False, freq="MS").fit()
    print("Final Model Result Summary {}".format(final_model.summary()))
    print(results_sorted["ARIMA Order"][0])
    print(results_sorted["Seasonal Order"][0])
    predictions = final_model.predict(start=dt.datetime.strptime("2020-06-01", "%Y-%m-%d"),
                                      end=dt.datetime.strptime("2020-12-01", "%Y-%m-%d"))
    print("Average Monthly WTI Crude Oil Spot Price from June to Dec 2020:")
    print(predictions)
    with open(os.path.join(dirloc[:-5], outputfile), "a") as f:
        f.write("Simulation Result of SARIMA....\n")
        f.write(str(results_sorted))
        f.write("\n")
        f.write(str(predictions))
        # no explicit f.close() needed inside a `with` block
    return results_sorted
# Prepare the data
X = timeseriesgenerator  # placeholder left as in the original (undefined here)
y = ...                  # target left unspecified in the original
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# ========================= SARIMAX =========================
from statsmodels.tsa.statespace.sarimax import SARIMAX
mod = SARIMAX(data['ln_wpi'], trend='c', order=(1, 1, (1, 0, 0, 1)))
# SARIMAX is fit on the series passed to the constructor; .fit() takes no
# data argument (the original passed X_train here).
res = mod.fit()
print(res.summary())
pred = res.predict(start=len(X_train), end=len(X_train) + len(X_test) - 1)
plt.plot(X, y)
plt.plot(X_test, pred)

# ========================= XGBoost =========================
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)
pred = xgb.predict(X_test)
plt.plot()
def run_app():
    image = Image.open('Air_pol.JPG')
    st.image(image, use_column_width=True)
    no2 = Image.open('no2.JPG')
    st.sidebar.image(no2, use_column_width=True)
    df2 = df.copy()
    df2 = df2['Nitrogen_dioxide']
    train = df2[0:-30]
    test = df2[-30:]
    add_selectbox = st.sidebar.selectbox(
        "Select Forecasting Model",
        ("Simple Moving Average", "LSTM", "Triple Exponential Smoothing",
         "Seasonal ARIMA", "Gradient Boosting Regressor",
         "ML Model Comparison Table"))
    st.sidebar.info(
        'This application is developed by Siddhesh D. Munagekar to forecast '
        'Nitrogen dioxide concentration in air using multiple forecasting techniques')
    if add_selectbox == 'Simple Moving Average':
        df1 = df.Nitrogen_dioxide.copy()
        df1 = pd.DataFrame(df1)
        df1['SMA_20'] = df1.Nitrogen_dioxide.rolling(20, min_periods=1).mean()
        df1['SMA_10'] = df1.Nitrogen_dioxide.rolling(10, min_periods=1).mean()
        df1['SMA_3'] = df1.Nitrogen_dioxide.rolling(3, min_periods=1).mean()
        fig = plt.figure()
        df1.plot(figsize=(25, 15))
        plt.xlabel('Date', fontsize=20)
        plt.ylabel('Nitrogen dioxide', fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.title("Simple Moving Average for 20, 10 and 3 days", fontsize=30)
        plt.legend(labels=['Nitrogen dioxide', '20-days SMA', '10-days SMA',
                           '3-days SMA'],  # first label said 'Temperature' in the original
                   fontsize=22)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'], df1['SMA_20'])
        st.write("MAE for 20 days is {:,.2f}".format(mae))
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'], df1['SMA_10'])
        st.write("MAE for 10 days is {:,.2f}".format(mae))
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'], df1['SMA_3'])
        st.write("MAE for 3 days is {:,.2f}".format(mae))
    if add_selectbox == 'Triple Exponential Smoothing':
        train = pd.DataFrame(train)
        test = pd.DataFrame(test)
        pred = test.copy()
        fit1 = ExponentialSmoothing(np.asarray(train['Nitrogen_dioxide']),
                                    trend='add', seasonal_periods=7,
                                    seasonal='add').fit()
        pred['Holt_Winter'] = fit1.forecast(len(test))
        # Calculate KPIs
        mae = metrics.mean_absolute_error(test.Nitrogen_dioxide, pred.Holt_Winter)
        # Plot
        plt.figure(figsize=(16, 8))
        plt.plot(train['Nitrogen_dioxide'], label='Train')
        plt.plot(test['Nitrogen_dioxide'], label='Test')
        plt.plot(pred['Holt_Winter'], label='Holt_Winter (MAE={:.2f})'.format(mae))
        plt.title("Triple Exponential Smoothing", fontsize=30)
        plt.xlabel('Date', fontsize=20)
        plt.ylabel('Nitrogen dioxide', fontsize=20)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        st.write("MAE for 30 days is {:,.2f}".format(mae))
    ## Seasonal ARIMA
    if add_selectbox == 'Seasonal ARIMA':
        df3 = df.copy()
        # train = df3[0:-30]
        test = df3[-30:]
        # Note: `dynamic` is a predict() argument, not a SARIMAX constructor
        # argument (the original passed it to the constructor as well).
        model = SARIMAX(df3['Nitrogen_dioxide'], order=(0, 1, 0),
                        seasonal_order=(2, 1, 0, 30),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        results = model.fit()
        df3['predicted_test'] = results.predict(start=360, end=390, dynamic=True)
        seasonal_forecast = pd.DataFrame(results.forecast(len(test)))
        seasonal_forecast = seasonal_forecast.rename(
            {0: 'Seasonal forecast for 30 periods'}, axis=1)
        plt.figure(figsize=(16, 8))
        seasonal_forecast.plot(figsize=(25, 10), color='green')
        df3['Nitrogen_dioxide'].plot(figsize=(20, 10))
        df3['predicted_test'].plot(figsize=(20, 10))
        plt.ylabel("Nitrogen_dioxide", fontsize=20)
        plt.xlabel('Date', fontsize=20)
        plt.title("Seasonal Arima", fontsize=30)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        # Calculate KPI
        mae = metrics.mean_absolute_error(df3.Nitrogen_dioxide[360:],
                                          df3.predicted_test[360:])
        st.write("MAE of Seasonal Arima is {:.2f}".format(mae))
    if add_selectbox == 'ML Model Comparison Table':
        acc_table = {
            'Model': ['Linear Regression', 'Decision Tree', 'Random_forest',
                      'Gradient_Boosting'],
            'Train_score': [0.59, 0.70, 0.91, 0.83],
            'Test_score': [0.49, 0.46, 0.40, 0.50],
            'MAE_train': [4265.36, 3713.86, 2220.76, 2958.29],
            'MAE_test': [3053.96, 3116.57, 3161.29, 2726.36]
        }
        acc_table = pd.DataFrame(acc_table)
        acc_table = acc_table.sort_values(by='Test_score',
                                          ascending=False).reset_index(drop=True)
        st.table(acc_table)
    # Gradient Boosting
    if add_selectbox == 'Gradient Boosting Regressor':
        df55 = df.Nitrogen_dioxide.copy()
        df55 = pd.DataFrame(df55)
        dfML = pd.DataFrame()
        for i in range(7, 0, -1):
            dfML[['t-' + str(i)]] = df55.shift(i)
        dfML['t'] = df55.values
        df_ML = dfML[7:]
        # Split the data into dependent (target) and independent (feature) variables
        df_ML22 = df_ML.values
        # Lagged variables (features) and the original time series (target)
        X2 = df_ML22[:, 0:-1]  # all rows, every column except the last
        y2 = df_ML22[:, -1]    # all rows, last column, i.e. the 't' column
        traintarget_size = int(len(y2) * 0.70)
        train_target, test_target = y2[:traintarget_size], y2[traintarget_size:len(y2)]
        trainfeature_size = int(len(X2) * 0.70)
        train_feature, test_feature = X2[:trainfeature_size], X2[trainfeature_size:len(X2)]
        gbr = GradientBoostingRegressor(max_features=3, max_depth=2,
                                        learning_rate=0.1, n_estimators=100,
                                        subsample=0.8, random_state=50)
        gbr.fit(train_feature, train_target)
        gbr_train_70_30 = gbr.score(train_feature, train_target)
        gbr_test_70_30 = gbr.score(test_feature, test_target)
        plot_test_pred = gbr.predict(test_feature)
        plot_test_pred = pd.DataFrame(plot_test_pred)
        plot_test_pred = plot_test_pred.rename({0: 'Predicted_test'}, axis=1)
        plot_test_target = pd.DataFrame(test_target)
        plot_test_target = plot_test_target.rename({0: 'Actual_test'}, axis=1)
        gbr_test_plot = pd.concat([plot_test_target, plot_test_pred], axis=1)
        gbr_test_plot.plot(title='Gradient boosting Actual vs Predicted test of last 116 days')
        plt.grid()
        st.pyplot(use_column_width=True)
        st.write("Gradient boosting training score {:.2f}".format(round(gbr_train_70_30, 2)))
        st.write("Gradient boosting test score {:.2f}".format(round(gbr_test_70_30, 2)))
        if st.checkbox("Visualize for last 10 days"):
            gbr_test_plot[106:].plot(title='GBR Plot of last 10 days')
            st.pyplot(use_column_width=True)
    if add_selectbox == 'LSTM':
        data = df.copy()
        data = data.iloc[:, 7].values
        data = data.reshape(-1, 1)
        data = data.astype('float32')
        # Scale the data
        scalar = MinMaxScaler()
        data = scalar.fit_transform(data)
        train_lstm = data[:-30, :]
        test_lstm = data[-30:, :]

        # Build a 2D array for supervised learning
        def create_dataset(sequence, time_step):
            dataX = []
            dataY = []
            for i in range(len(sequence) - time_step - 1):
                a = sequence[i:(i + time_step), 0]
                dataX.append(a)
                dataY.append(sequence[i + time_step, 0])
            return np.array(dataX), np.array(dataY)

        time_step = 1
        # Apply the 2D array function to the train and test datasets
        train_X, train_Y = create_dataset(train_lstm, time_step)
        test_X, test_Y = create_dataset(test_lstm, time_step)
        train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
        test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))
        # Build the LSTM model: input layer plus stacked LSTM layers
        model = Sequential()
        model.add(LSTM(50, activation='relu', input_shape=(1, time_step),
                       return_sequences=True))
        model.add(LSTM(50, return_sequences=True))
        model.add(LSTM(50))
        model.add(Dropout(0.15))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        model.fit(train_X, train_Y, batch_size=4, epochs=50, verbose=2)
        # Make predictions
        train_predict = model.predict(train_X)
        test_predict = model.predict(test_X)
        # Invert predictions back to the original scale
        train_predict = scalar.inverse_transform(train_predict)
        train_Y = scalar.inverse_transform([train_Y])
        test_predict = scalar.inverse_transform(test_predict)
        test_Y = scalar.inverse_transform([test_Y])
        # Calculate mean absolute error
        train_score = mean_absolute_error(train_Y[0], train_predict[:, 0])
        test_score = mean_absolute_error(test_Y[0], test_predict[:, 0])
        # LSTM plot
        train_plot = np.empty_like(data)  # array with the same shape as `data`
        train_plot[:, :] = np.nan
        train_plot[time_step:len(train_predict) + time_step, :] = train_predict
        # Shift test predictions for plotting
        test_plot = np.empty_like(data)
        test_plot[:, :] = np.nan
        test_plot[len(train_predict) + (time_step * 2) + 1:len(data) - 1, :] = test_predict
        # Plot baseline and predictions
        plt.figure(figsize=(16, 8))
        plt.plot(train_plot)
        plt.plot(test_plot, color='green')
        plt.plot(scalar.inverse_transform(data), color='orange')
        plt.title("Long Short Term Memory Network with train, test and forecast",
                  fontsize=20)
        plt.ylabel("Nitrogen_dioxide", fontsize=20)
        plt.legend(labels=['Train plot', 'Test set', 'LSTM forecast'], fontsize=19)
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        st.write('Train Score: %.3f MAE' % (train_score))
        st.write('Test Score: %.3f MAE' % (test_score))
        if st.checkbox('Visualize forecasted chart for 10 future days'):
            test_predict = scalar.fit_transform(test_predict)
            time_step = 10
            x_input = test_predict[(len(test_predict) - time_step):].reshape(1, -1)
            # Convert to a list
            temp_input = list(x_input)
            # Arrange the list vertically
            temp_input = temp_input[0].tolist()
            # Demonstrate prediction for the next 10 days
            lst_output = []
            future_day = 10
            n_steps = 10
            i = 0
            # Forecast the next 10 days of output
            while (i < future_day):
                if (len(temp_input) > 10):
                    x_input = np.array(temp_input[1:])
                    print("{} day input {}".format(i, x_input))
                    x_input = x_input.reshape(1, -1)
                    # Convert to a 3D array for the LSTM
                    x_input = x_input.reshape(1, n_steps, 1)
                    # print(x_input)
                    ypred = model.predict(x_input, verbose=0)
                    print("{} day predicted output {}".format(i, ypred))
                    # Add the predicted output to the temp_input list
                    temp_input.extend(ypred[0].tolist())
                    temp_input = temp_input[1:]
                    # print(temp_input)
                    lst_output.extend(ypred.tolist())
                    i = i + 1
                else:
                    x_input = x_input.reshape((n_steps, 1, 1))
                    ypred = model.predict(x_input, verbose=0)
                    print("Predicted y of 0 day", ypred[0])
                    # Add the ypred value to temp_input (previous input)
                    temp_input.extend(ypred[0].tolist())
                    print(len(temp_input))
                    lst_output.extend(ypred.tolist())
                    i = i + 1
            # print(lst_output)
            previous_days1 = np.arange(len(data) - n_steps, len(data))
            predicted_future1 = np.arange(len(data), len(data) + future_day)
            lst_output = lst_output[:future_day]
            outputlist = data.tolist()
            outputlist.extend(lst_output)
            # data[len(data) - n_steps:]
            plt.plot(np.append(previous_days1, predicted_future1),
                     scalar.inverse_transform(outputlist[len(data) - n_steps:]))
            plt.plot(predicted_future1, scalar.inverse_transform(lst_output))
            plt.title("Forecast for 10 future days", fontsize=20)
            plt.legend(fontsize=19)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=8)
            plt.ylabel("Nitrogen dioxide")
            plt.show()
            st.pyplot(use_column_width=True)
def main(dataset):
    def plot_cf(ts, field):
        """NOTE: I did NOT write this function. It was taken from:
        http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/
        """
        lag_acf = acf(field, nlags=20)
        lag_pacf = pacf(field, nlags=20)
        # Plot ACF:
        plt.subplot(121)
        plt.plot(lag_acf)
        plt.axhline(y=0, linestyle='--', color='gray')
        plt.axhline(y=-1.96 / np.sqrt(len(df_diff)), linestyle='--', color='gray')
        plt.axhline(y=1.96 / np.sqrt(len(df_diff)), linestyle='--', color='gray')
        plt.title('Autocorrelation Function')
        # Plot PACF:
        plt.subplot(122)
        plt.plot(lag_pacf)
        plt.axhline(y=0, linestyle='--', color='gray')
        plt.axhline(y=-1.96 / np.sqrt(len(ts)), linestyle='--', color='gray')
        plt.axhline(y=1.96 / np.sqrt(len(ts)), linestyle='--', color='gray')
        plt.title('Partial Autocorrelation Function')
        plt.show()

    # The five dataset branches in the original were identical except for the
    # input CSV and output txt filenames, so they are collapsed into one block.
    if dataset not in (1, 2, 3, 4, 5):
        return
    df = pd.read_csv('q{}_train.csv'.format(dataset))
    df.Date = pd.to_datetime(df.Date)
    df.set_index("Date", inplace=True)
    ### Differenced signals ###
    df_diff = df - df.shift()
    df_diff.dropna(inplace=True)
    df_diff_2 = df - df.shift(52)
    df_diff_2.dropna(inplace=True)
    df_diff_3 = df_diff - df_diff.shift(52)
    df_diff_3.dropna(inplace=True)
    # plot_cf(df_diff, df_diff.activity)
    p, q = 0, 2
    arma_fit = SARIMAX(df, order=(p, 1, q),
                       seasonal_order=(1, 1, 1, 52)).fit(disp=-1)
    prediction = arma_fit.predict(start=525, end=525 + 103, dynamic=True)
    ### Plot the original data with the prediction ###
    plt.plot(df, color='blue')
    plt.plot(prediction, color='red')
    plt.title("Original Data with Predictions for Two Years")
    plt.show()
    # Put the predictions into a .txt file
    txt = open("Q{}_Daniel_March_24196320.txt".format(dataset), 'w')
    txt.write('"x"')
    for line in prediction:
        txt.write("\n" + str(line))
def arima():
    failedMonths = 0  # records months that could not be successfully trained on (prediction is zero)
    full_df = pd.read_csv('../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_mode.csv',
                          infer_datetime_format=True, parse_dates=True)
    full_df['originalCases'] = full_df['num_cases']  # preserve original case values as an additional feature
    by_state = full_df['sub_region_1'].unique()
    # Shift each state's data by `offset` and concatenate, to prevent
    # bleeding into other states' numbers
    offset = 14
    full_dataframe = pd.DataFrame()
    for region in by_state:
        temp = full_df.loc[(full_df['sub_region_1'] == region)]
        temp = temp.loc[(temp['date'] < '2020-11-20')]
        # Shift CDC data by the offset value
        cdc_dataframe = temp['num_cases'].shift(periods=offset, fill_value=0)
        mobility_dataframe = temp.drop(columns=['date', 'num_cases'])
        all_states = pd.concat([cdc_dataframe, mobility_dataframe], axis=1)
        all_states = all_states.loc[(all_states['num_cases'] > 0)]  # remove rows with zero cases
        full_dataframe = full_dataframe.append(all_states)  # build the new full data array
    # mobility_dataframe_truc = mobility_dataframe.drop(columns=['date'])
    # full_dataframe = pd.concat([cdc_dataframe_truc, mobility_dataframe_truc], axis=1)
    # full_dataframe['originalCases'] = cdc_dataframe['newAndPnew']  # preserve original case values as additional feature
    # full_dataframe_noDate = full_dataframe.drop(columns=['submission_date'])
    # full_dataframe_noDate = full_dataframe_noDate.loc[(full_dataframe_noDate['newAndPnew'] != 0)]  # remove rows with zero cases

    # Find the length of the shortest state dataframe
    minLength = np.inf
    for region in by_state:
        state_data = full_dataframe.loc[(full_dataframe['sub_region_1'] == region)]
        length = state_data.shape[0]
        if length < minLength:
            minLength = length
    stride = 10  # trains a new model every {stride} days
    percentErrors = []
    for t in range(3):  # (minLength - 90) // stride):
        # Linear mobility data
        linearTrainX = []
        linearTrainy = []
        linearTestX = []
        linearTesty = []
        # Logarithmic mobility data
        logTrainX = []
        logTrainy = []
        logTestX = []
        logTesty = []
        MLPTrainX = []
        for region in by_state[:3]:
            state_data = full_dataframe.loc[(full_dataframe['sub_region_1'] == region)].drop(
                columns=['sub_region_1', 'grocery_and_pharmacy_percent_change_from_baseline'])
            # Convert data to numpy
            linearData = state_data.to_numpy()
            logData = np.log(state_data + 1 - np.min(state_data.to_numpy())).to_numpy()
            timeTrain = np.arange(1, 61).reshape(-1, 1)
            timeTest = np.arange(61, 91).reshape(-1, 1)
            # Linear mobility data
            linearTrainX.append(linearData[t * stride:t * stride + 60, 1:])
            linearTrainy.append(linearData[t * stride:t * stride + 60, :1])
            linearTestX.append(linearData[t * stride + 60:t * stride + 90, 1:])
            linearTesty.append(linearData[t * stride + 60:t * stride + 90, :1])
            # Logarithmic mobility data
            logTrainX.append(logData[t * stride:t * stride + 60, 1:])
            logTrainy.append(logData[t * stride:t * stride + 60, :1])
            logTestX.append(logData[t * stride + 60:t * stride + 90, 1:])
            logTesty.append(logData[t * stride + 60:t * stride + 90, :1])
            MLPTrainXState = []
            for i, feature in enumerate(linearData[t * stride:t * stride + 60, 1:].T):
                # print("Feature:", i)
                # Fit ARIMA. Perform grid search to determine the ARIMA order:
                # stepwise_fit = auto_arima(feature, start_p=1, start_q=1,
                #                           max_p=3, max_q=3, m=7,
                #                           start_P=0, seasonal=True,
                #                           d=None, D=1, trace=True,
                #                           error_action='ignore',   # we don't want to know if an order does not work
                #                           suppress_warnings=True,  # we don't want convergence warnings
                #                           stepwise=True)           # set to stepwise
                # stepwise_fit.summary()
                predictArima = []
                arimaOrders = [(1, 0, 0), (1, 0, 1), (3, 0, 0), (1, 0, 0),
                               (0, 1, 1), (1, 0, 0), (2, 0, 0)]
                seasonalOrders = [(2, 1, 0, 7), (2, 1, 0, 7), (1, 1, 0, 7),
                                  (1, 1, 0, 7), (0, 1, 1, 7), (0, 1, 1, 7),
                                  (2, 1, 0, 7)]
                model = SARIMAX(feature, order=arimaOrders[i],
                                seasonal_order=seasonalOrders[i],
                                initialization='approximate_diffuse')
                result = model.fit(disp=False)
                if showPlot >= 2:
                    visualize_ARIMA(result, timeTrain, linearTrainX[:, i],
                                    timeTest, linearTestX[:, i])
                # (typ='levels' dropped: that kwarg belongs to the old ARIMA results API)
                predictArima.append(result.predict(61, 90))
                predictArima = np.mean(predictArima, axis=0)
                MLPTrainXState.append(predictArima)
            MLPTrainX.append(np.array(MLPTrainXState).T)
        MLPTrainX = np.array(MLPTrainX).reshape(-1, 6)
        linearTrainX = np.array(linearTrainX).reshape(-1, 6)
        linearTrainy = np.array(linearTrainy).reshape(-1, 1)
        linearTesty = np.array(linearTesty).reshape(-1, 1)
        # Use the "last known case value" as a bias term
        # (I completely made this up but it improved accuracy by ~5%)
        # bias1 = np.ones((30, 1))  # * linearTrainy[0]
        # bias2 = np.ones((30, 1))  # * linearTrainy[30]
        bias = np.ones((linearTrainX.shape[0], 1))  # np.vstack((bias1, bias2))
        linearTrainX = np.hstack((linearTrainX, bias))
        bias3 = np.ones((MLPTrainX.shape[0], 1))  # * linearTrainy[-1]
        MLPTrainX = np.hstack((MLPTrainX, bias3))
        failCounter = 0
        maxFail = 4
        while failCounter < maxFail:  # retrain if the prediction is zero
            model = Sequential()
            # model.add(BatchNormalization())
            model.add(Dense(10, input_dim=7, activation='relu'))
            # model.add(Dropout(0.15))
            model.add(Dense(30, activation='relu'))
            # model.add(Dropout(0.15))
            model.add(Dense(1, activation='relu'))
            model.compile(optimizer='adam', loss='mean_squared_error',
                          metrics=['accuracy'])
            model.fit(linearTrainX, linearTrainy, epochs=100, verbose=0)
            y_pred = model.predict(MLPTrainX)
            if np.sum(y_pred == 0) < 0.1 * MLPTrainX.shape[0]:
                break
            print("Prediction is zero. Retraining...")
            failCounter += 1
        if failCounter == maxFail:
            failedMonths += 1
            percentError = 1
            print("Could not train model on this data")
        if failCounter != maxFail:
            error = y_pred - linearTesty
            percentError = np.abs(error / linearTesty).T
            percentErrorsByState = []
            print(percentError.shape)
            for i in range(len(by_state)):
                percentErrorsByState.append(percentError[i * 30:(i + 1) * 30])
            percentErrorsByState = np.array(percentErrorsByState).reshape(51)
            print("Loss:", np.mean(percentError))
            # print("Percent Error:", percentError)
            percentErrors.append(percentErrorsByState)
            if showPlot >= 1 or np.mean(percentError) > 0.4:
                plt.plot(timeTrain, linearTrainy[0:60], label="Past")
                plt.plot(timeTest, linearTesty[0:30], label="True Future")
                plt.plot(timeTest, y_pred[0:30], label="Predicted Future")
                plt.plot(timeTest, MLPTrainX[0:30, -2], label="Predicted ARIMA (case only)")
                plt.legend()
                plt.show()
    print(np.array(percentErrors).shape)
    print("Failed Months:", failedMonths)
    print(np.mean(percentErrors, axis=1))
    plt.plot(np.mean(percentErrors, axis=1).flatten())
    plt.show()
    return
def sarima_detect(train_set, test_set, shoptype, categories, thresh=1, tol=0.4,
                  order=(0, 0, 0), seasonal_order=(0, 1, 0, 12)):
    global outputs
    outputs = output_dir + shoptype + '\\'
    dicky_test = pd.DataFrame()
    if not os.path.exists(outputs + '\\broken'):
        os.makedirs(outputs + '\\broken')
    if not os.path.exists(outputs + '\\good'):
        os.makedirs(outputs + '\\good')
    for cat in categories:
        x = np.log(train_set[cat] + 1)
        y = np.log(test_set[cat] + 1)
        ax = plt.gca()
        x.plot(title=cat, colormap='jet')
        y.plot()
        x.rolling(6).mean().plot()
        # x.interpolate(inplace=True)
        x.index = x.index.to_timestamp()
        # result_mul = seasonal_decompose(x, model='additive')
        # deseasonalized = x / result_mul.seasonal
        # results_AR = model.fit(disp=-1)
        # x_log_diff = x - x.shift()
        # x_log_diff.dropna(inplace=True)
        # plt.plot(results_AR.fittedvalues, color='red')
        # plt.title('RSS: %.4f' % sum((results_AR.fittedvalues - x_log_diff) ** 2))
        # deseasonalized.plot()
        sarima_mod = SARIMAX(x, trend='n', order=order,
                             seasonal_order=seasonal_order,
                             enforce_stationarity=False).fit()
        # print(sarima_mod.summary())
        forecast = sarima_mod.predict('2018-07-01', '2019-04-01')
        forecast.plot()
        # D_data = x.diff().dropna()
        # D_data.columns = [u'sales diff']
        # D_data.plot()
        # y.rolling(6).std().plot()
        ax.legend(["2017-2018", "2019", "rolling", "predicted"])
        y.index = y.index.to_timestamp()
        a = y.corr(forecast)
        diff = y - forecast
        b = diff.std()
        c = "{:.2%}".format(abs(diff[0]) / y[0])
        d = abs(y[0] - x[17]) / abs(forecast[0] - x[17])
        dftest = adfuller(x, autolag='AIC')
        dfoutput = pd.Series(dftest[0:4],
                             index=['Test Statistic', 'p-value', '#Lags Used',
                                    'Number of Observations Used'])
        for key, value in dftest[4].items():
            dfoutput['Critical Value (%s)' % key] = value
        # print(dfoutput)
        dfoutput['CAT'] = cat
        dicky_test = dicky_test.append(dfoutput.transpose(), ignore_index=True)
        dicky_test['if_unitroot'] = 0
        dicky_test.loc[dicky_test['Critical Value (10%)'] < dicky_test['Test Statistic'],
                       'if_unitroot'] = 1
        dicky_test.loc[dicky_test.CAT == cat, 'AIC'] = sarima_mod.aic
        dicky_test.loc[dicky_test.CAT == cat, 'BIC'] = sarima_mod.bic
        dicky_test.loc[dicky_test.CAT == cat, 'corr'] = a
        dicky_test.loc[dicky_test.CAT == cat, 'diff_std'] = b
        dicky_test.loc[dicky_test.CAT == cat, 'diff_fore201901'] = c
        dicky_test.loc[dicky_test.CAT == cat, 'diff_18-19'] = d
        if (x.max() * (1 + tol) < y.max()) or (x.min() > y.min() * (1 + tol)):
            dicky_test.loc[dicky_test.CAT == cat, 'extremum'] = 1
        else:
            dicky_test.loc[dicky_test.CAT == cat, 'extremum'] = 0
        if (x.max() * (1 + tol) < y.max()) or (x.min() > y.min() * (1 + tol)) or (d > thresh):
            dicky_test.loc[dicky_test.CAT == cat, 'TAG'] = 1
            plt.savefig(outputs + 'broken\\' + cat + '.png')
        else:
            dicky_test.loc[dicky_test.CAT == cat, 'TAG'] = 0
            plt.savefig(outputs + 'good\\' + cat + '.png')
        plt.show()
    return dicky_test
def baseline():
    showPlot = False
    np.set_printoptions(precision=3, suppress=True)
    mobility_dataframe = pd.read_csv('google_baseline_test.csv',
                                     infer_datetime_format=True, parse_dates=True)
    cdc_dataframe = pd.read_csv('cdc_baseline_test_movingAvg.csv',
                                infer_datetime_format=True, parse_dates=True)
    # ========================= FIND BEST OFFSET =========================
    bestLinearCorr = 0
    bestLogCorr = 0
    bestLinearOffset = -1
    bestLogOffset = -1
    bestLinearData = 0
    bestLogData = 0
    correlationScores = []
    correlationLogScores = []
    for offset in range(100):
        # Shift CDC data by the offset value
        cdc_dataframe_truc = cdc_dataframe.shift(periods=offset, fill_value=0)
        # Build the new full data array
        mobility_dataframe_truc = mobility_dataframe.drop(columns=['date'])
        full_dataframe = pd.concat([cdc_dataframe_truc, mobility_dataframe_truc], axis=1)
        full_dataframe['originalCases'] = cdc_dataframe['newAndPnew']  # preserve original case values as an additional feature
        full_dataframe_noDate = full_dataframe.drop(columns=['submission_date'])
        full_dataframe_noDate = full_dataframe_noDate.loc[(full_dataframe_noDate['newAndPnew'] != 0)]  # remove rows with zero cases
        # Compute linear and logarithmic correlations
        linearCorr = full_dataframe_noDate.corr()
        linearCorr = linearCorr.to_numpy()[0, 1:]  # only correlations between 'cases' and mobility data
        logData = np.log(full_dataframe_noDate + 1 - np.min(full_dataframe_noDate.to_numpy()))
        logCorr = logData.corr()
        logCorr = logCorr.to_numpy()[0, 1:]  # only correlations between 'cases' and mobility data
        print("Offset:", offset, "Correlation: ", linearCorr)
        print(" Log Correlation:", logCorr)
        # Save the best values
        if np.linalg.norm(linearCorr) > np.linalg.norm(bestLinearCorr):
            bestLinearCorr = linearCorr
            bestLinearOffset = offset
            bestLinearData = full_dataframe_noDate
        if np.linalg.norm(logCorr) > np.linalg.norm(bestLogCorr):
            bestLogCorr = logCorr
            bestLogOffset = offset
            bestLogData = logData
        correlationScores.append(np.linalg.norm(linearCorr))
        correlationLogScores.append(np.linalg.norm(logCorr))
    if showPlot:
        plt.plot(correlationScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Linear correlation vs. data offset")
        plt.show()
        plt.plot(correlationLogScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Logarithmic correlation vs. data offset")
        plt.show()
    # Plot data correlations
    # sns.pairplot(bestLinearData[['newAndPnew','retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline', 'parks_percent_change_from_baseline', 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline','originalCases']], diag_kind='kde')
    # plt.show()
    # sns.pairplot(bestLogData[['newAndPnew','retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline', 'parks_percent_change_from_baseline', 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline','originalCases']], diag_kind='kde')
    # plt.show()
    print("Best Full Correlation:", bestLinearCorr)
    print("Best Full Correlation Norm:", np.linalg.norm(bestLinearCorr))
    print("Best Full Offset:", bestLinearOffset)
    print("Best Log Correlation:", bestLogCorr)
    print("Best Log Correlation Norm:", np.linalg.norm(bestLogCorr))
    print("Best Log Offset:", bestLogOffset)
    # ========================= BEGIN MODEL FITTING =========================
    linearMSE = []
    logMSEAdj = []
    linearCasesMSE = []
    logCasesMSE = []
    logisticMSE = []
    dataNoise = []
    arimaMSE = []
    gaussMSE = []
    # Convert data to numpy
    linearCasesOnly = bestLinearData['originalCases'].to_numpy()
    logCasesOnly = np.log(linearCasesOnly + 1)
    bestLinearData = bestLinearData.to_numpy()
    bestLogData = bestLogData.to_numpy()
    stride = 3  # trains a new model every {stride} days
    maxEpoch = 100
    for t in range((min(bestLinearData.shape[0], bestLogData.shape[0]) - 90) // stride):
        print("Training model:", t)
        # Linear mobility data
        linearTrainX = bestLinearData[t * stride:t * stride + 60, 1:]
        linearTrainy = bestLinearData[t * stride:t * stride + 60, :1]
        linearTestX = bestLinearData[t * stride + 60:t * stride + 90, 1:]
        linearTesty = bestLinearData[t * stride + 60:t * stride + 90, :1]
        # Logarithmic mobility data
        logTrainX = bestLogData[t * stride:t * stride + 60, 1:]
        logTrainy = bestLogData[t * stride:t * stride + 60, :1]
        logTestX = bestLogData[t * stride + 60:t * stride + 90, 1:]
        logTesty = bestLogData[t * stride + 60:t * stride + 90, :1]
        # Cases-only data
        linearCasesTrainX = linearCasesOnly[t * stride:t * stride + 60]
        logCasesTrainX = logCasesOnly[t * stride:t * stride + 60]
        linearCasesTestX = linearCasesOnly[t * stride + 60:t * stride + 90]
        logCasesTestX = logCasesOnly[t * stride + 60:t * stride + 90]
        timeTrain = np.arange(1, 61).reshape(-1, 1)
        timeTest = np.arange(61, 91).reshape(-1, 1)
        # Uncomment to add time data to the mobility dataset
        # linearTrainX = np.hstack((linearTrainX, timeTrain))
        # logTrainX = np.hstack((logTrainX, timeTrain))
        # linearTestX = np.hstack((linearTestX, timeTest))
        # logTestX = np.hstack((logTestX, timeTest))
        # Fit linear model
        linear_model = RidgeCV(cv=3).fit(linearTrainX, linearTrainy)
        predict = linear_model.predict(linearTestX)
        linearMSE.append(np.abs(predict - linearTesty) / linearTesty)
        # Fit log model
        linear_model = RidgeCV(cv=3).fit(logTrainX, logTrainy)
        predict = linear_model.predict(logTestX)
        predictAdj = np.exp(predict) - 1 + np.min(full_dataframe_noDate.to_numpy())  # convert from log back to raw case numbers
        logMSEAdj.append(np.abs(predictAdj - linearTesty) / linearTesty)
        # Fit linear cases-only model
        cases_model = RidgeCV(cv=3).fit(timeTrain, linearCasesTrainX)
        if showPlot:
            visualize_cases(cases_model, timeTrain, linearCasesTrainX,
                            timeTest, linearCasesTestX)
        predict = cases_model.predict(timeTest)
        linearCasesMSE.append(np.abs(predict - linearCasesTestX) / linearCasesTestX)
        # Fit log cases-only model
        cases_model = RidgeCV(cv=3).fit(np.log(timeTrain), logCasesTrainX)
        if showPlot:
            visualize_cases(cases_model, np.log(timeTrain), logCasesTrainX,
                            np.log(timeTest), logCasesTestX)
        predict = cases_model.predict(np.log(timeTest))
        predictAdj = np.exp(predict) - 1  # convert from log back to raw case numbers
        logCasesMSE.append(np.abs(predictAdj - linearCasesTestX) / linearCasesTestX)
        # Fit logistic model
        logistic_model, cov = optimize.curve_fit(
            logisticDerivative, timeTrain.reshape(linearCasesTrainX.shape),
            linearCasesTrainX, p0=[4 * np.max(linearCasesTrainX), 60, 1 / 30],
            maxfev=10000,
            bounds=(np.array([1, 0, 0]), np.array([20000, np.Inf, np.Inf])))
        if showPlot:
            visualize_logistic(logistic_model, timeTrain, linearCasesTrainX,
                               timeTest, linearCasesTestX)
        predictLogistic = logisticDerivative(
            timeTest.reshape(linearCasesTestX.shape),
            logistic_model[0], logistic_model[1], logistic_model[2])
        logisticMSE.append(np.abs(predictLogistic - linearCasesTestX) / linearCasesTestX)
        predict = logisticDerivative(
            timeTrain.reshape(linearCasesTrainX.shape),
            logistic_model[0], logistic_model[1], logistic_model[2])
        dataNoise.append(np.mean(np.abs(predict - linearCasesTrainX) / linearCasesTrainX))
        # Fit stacking regressor. The original had a misplaced parenthesis that
        # nested the 'rf' tuple inside the 'svr' tuple; fixed here to three
        # proper (name, estimator) pairs.
        estimators = [('lr', RidgeCV()),
                      ('svr', LinearSVR(random_state=42)),
                      ('rf', RandomForestClassifier(n_estimators=10, random_state=42))]
        reg = StackingRegressor(estimators=estimators,
                                final_estimator=GaussianProcessRegressor(
                                    kernel=DotProduct() + WhiteKernel(),
                                    random_state=0))
        stacking_model = reg.fit(timeTrain, linearCasesTrainX)
        if showPlot:
            visualize_cases(stacking_model, timeTrain, linearCasesTrainX,
                            timeTest, linearCasesTestX)
        predict = stacking_model.predict(timeTest)
        linearCasesMSE.append(np.abs(predict - linearCasesTestX) / linearCasesTestX)
        # Fit ARIMA. Perform grid search to determine the ARIMA order:
        # stepwise_fit = auto_arima(linearCasesTrainX, start_p=1, start_q=1,
        #                           max_p=3, max_q=3, m=7,
        #                           start_P=0, seasonal=True,
        #                           d=None, D=1, trace=True,
        #                           error_action='ignore',   # we don't want to know if an order does not work
        #                           suppress_warnings=True,  # we don't want convergence warnings
        #                           stepwise=True)           # set to stepwise
        # stepwise_fit.summary()
        model = SARIMAX(linearCasesTrainX, order=(2, 0, 0), seasonal_order=(2, 1, 0, 7))
        result = model.fit(disp=False)
        if showPlot:
            visualize_ARIMA(result, timeTrain, linearCasesTrainX,
                            timeTest, linearCasesTestX)
        # (typ='levels' dropped from all SARIMAX predict calls below: that
        # kwarg belongs to the old ARIMA results API)
        predictArima = result.predict(61, 90)
        arimaMSE.append(np.abs(predictArima - linearCasesTestX) / linearCasesTestX)
        # Evaluate other models to use as input to the Gaussian process
        arima1 = SARIMAX(linearCasesTrainX, order=(2, 0, 0), seasonal_order=(2, 1, 0, 7)).fit(disp=False)
        arima2 = SARIMAX(linearCasesTrainX, order=(2, 0, 0), seasonal_order=(2, 1, 1, 7)).fit(disp=False)
        arima3 = SARIMAX(linearCasesTrainX, order=(1, 1, 0), seasonal_order=(1, 1, 1, 7)).fit(disp=False)
        arima4 = SARIMAX(linearCasesTrainX, order=(0, 1, 1), seasonal_order=(1, 1, 1, 7)).fit(disp=False)
        arima5 = SARIMAX(linearCasesTrainX, order=(0, 1, 1), seasonal_order=(2, 1, 0, 7)).fit(disp=False)
        predictLog = cases_model.predict(np.log(timeTrain))  # log model
        predictAdj = np.exp(predictLog) - 1  # convert from log back to raw case numbers
        predictLogistic = logisticDerivative(
            timeTrain.reshape(linearCasesTrainX.shape),
            logistic_model[0], logistic_model[1], logistic_model[2])  # logistic model
        predictArima1 = arima1.predict(1, 60)
        predictArima2 = arima2.predict(1, 60)
        predictArima3 = arima3.predict(1, 60)
        predictArima4 = arima4.predict(1, 60)
        predictArima5 = arima5.predict(1, 60)
        testLog = cases_model.predict(np.log(timeTest))  # log model
        testAdj = np.exp(testLog) - 1  # convert from log back to raw case numbers
        testLogistic = logisticDerivative(
            timeTest.reshape(linearCasesTestX.shape),
            logistic_model[0], logistic_model[1], logistic_model[2])  # logistic model
        testArima1 = arima1.predict(61, 90)
        testArima2 = arima2.predict(61, 90)
        testArima3 = arima3.predict(61, 90)
        testArima4 = arima4.predict(61, 90)
        testArima5 = arima5.predict(61, 90)
        # Fit the Gaussian process meta-learner
        gaussTrain = np.array([predictLogistic, predictArima1, predictArima2,
                               predictArima3, predictArima4, predictArima5]).T
        gaussTest = np.array([testLogistic, testArima1, testArima2,
                              testArima3, testArima4, testArima5]).T
        reg = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                       random_state=0)
        stacking_model = reg.fit(gaussTrain, linearCasesTrainX)
        predictTrain = stacking_model.predict(gaussTrain)
        predictTest = stacking_model.predict(gaussTest)
        if showPlot:
            visualize_gauss(np.hstack((predictTrain, predictTest)).T,
                            timeTrain, linearCasesTrainX, timeTest, linearCasesTestX)
        gaussMSE.append(np.abs(predictTest - linearCasesTestX) / linearCasesTestX)
    # Plot proof-of-concept graph
    if True:
        plt.plot(np.array(linearMSE).mean(axis=0), label='Mobility (linear, non-temporal)')
        plt.plot(np.array(logMSEAdj).mean(axis=0), label='Mobility (logarithmic, non-temporal)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()
    # Plot baseline graph
    # plt.plot(np.array(linearCasesMSE).mean(axis=0), label='Cases (linear, temporal)')  # not plotted because performance is terrible
    plt.plot(np.array(logCasesMSE).mean(axis=0), label='Cases (logarithmic temporal)')
    plt.plot(np.array(logisticMSE).mean(axis=0), label='Cases (logistic temporal)')
    plt.plot(np.array(arimaMSE).mean(axis=0), label='Cases (ARIMA)')
    plt.plot(np.array(gaussMSE).mean(axis=0), label='Cases (Gaussian Process meta)')
    plt.xlabel("Days in advance to predict")
    plt.ylabel("Percent deviation from true value")
    plt.legend(loc="upper left")
    plt.show()
    print("Average logistic Test error:", np.mean(dataNoise))
def ts_crossvalidation(X, param_grid, y=None, cv=5, model="ARIMA",
                       ignore_warnings=True):
    """
    Perform time-series cross validation using nested cross validation.
    Only ARIMA and SVR models are supported right now.

    :params
        X - prediction variable with time series
        param_grid - dictionary with name of the parameter and list of options
        y - target variable for SVR
        cv - number of folds to use in each validation
        model - model on which cross validation will be performed
        ignore_warnings - option to silence warnings
    """
    if ignore_warnings:
        # Ignore all warnings for nested cross validations
        warnings.filterwarnings("ignore")
    # Time series cross validation initialization
    tscv = TimeSeriesSplit(n_splits=cv)
    # Initialization of results
    grid_search_result = pd.DataFrame()
    # Getting possible combinations of parameters
    grid_list = param_grid_product(param_grid)

    if model == "ARIMA":
        # -----
        # ARIMA model cross validation
        # -----
        assert 'order' in param_grid, "No order parameters for ARIMA"
        # Loop over possible combinations of parameters
        for grid_dict in grid_list:
            order = grid_dict['order']
            # Set defaults for ARIMA parameters
            param_arima = {
                'seasonal_order': (0, 0, 0, 0),
                'freq': None,
                'enforce_stationarity': True,
                'enforce_invertibility': True
            }
            # Update ARIMA parameters if they were specified
            for key in param_arima:
                if key in grid_dict:
                    param_arima[key] = grid_dict[key]
            # Initialize error lists
            rmse_list = []
            aic_list = []
            # Nested cross validation run per each configuration
            for train_index, test_index in tscv.split(X):
                # Train and test initialization for this cross-validation step
                X_train, X_test = X[train_index], X[test_index]
                # Model initialization and training
                # (renamed from `model`, which shadowed the function parameter)
                try:
                    fitted = SARIMAX(
                        X_train,
                        freq=param_arima['freq'],
                        order=order,
                        seasonal_order=param_arima['seasonal_order'],
                        enforce_stationarity=param_arima['enforce_stationarity'],
                        enforce_invertibility=param_arima['enforce_invertibility']).fit(disp=0)
                    # Model test for this cross-validation step
                    pred = fitted.predict(X_test.index[0], X_test.index[-1])
                    error = np.sqrt(mean_squared_error(X_test, pred))
                    # Save results for this cross-validation step
                    rmse_list.append(error)
                    aic_list.append(fitted.aic)
                except Exception:
                    # On error continue to the next model evaluation
                    continue
            if not rmse_list:
                # No successful fits for this configuration
                # (np.mean([]) returns nan instead of raising, so the original
                # try/except here never triggered)
                continue
            # Consolidate metrics for this configuration using the mean
            total_error = np.mean(rmse_list)
            total_aic = np.mean(aic_list)
            # Save results on main DataFrame (DataFrame.append was removed in pandas 2.0)
            to_append = pd.DataFrame([{
                'name': 'ARIMA{}x{}'.format(order, param_arima['seasonal_order']),
                'AIC': total_aic,
                'RMSE': total_error
            }])
            grid_search_result = pd.concat([grid_search_result, to_append],
                                           sort=False, ignore_index=True)
    elif model == "SVR":
        # -----
        # SVR model cross validation
        # -----
        assert y is not None, "No target variable samples (y) for SVR"
        for grid_dict in grid_list:
            param_svr = {
                'C': 1,
                'kernel': 'rbf',
                'gamma': 'scale',
            }
            for key in param_svr:
                if key in grid_dict:
                    param_svr[key] = grid_dict[key]
            # Initialize confidence list and model
            confidence = None
            conf_list = []
            svr_rbf = SVR(kernel=param_svr['kernel'],
                          C=param_svr['C'],
                          gamma=param_svr['gamma'])
            for train_index, test_index in tscv.split(X):
                # Train and test initialization for this cross-validation step
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                try:
                    # Model training
                    svr_rbf.fit(X_train, y_train)
                    # Model test for this cross-validation step
                    svm_confidence = svr_rbf.score(X_test, y_test)
                    # Score must be a valid number in [0, 1], otherwise the
                    # model failed to converge
                    assert 0 <= svm_confidence <= 1
                    conf_list.append(svm_confidence)
                except Exception:
                    continue
            if not conf_list:
                continue
            # Consolidate confidence for this configuration using the mean
            confidence = np.mean(conf_list)
            # Save results on main DataFrame
            to_append = pd.DataFrame([{
                'kernel': param_svr['kernel'],
                'gamma': param_svr['gamma'],
                'C': param_svr['C'],
                'confidence': confidence
            }])
            grid_search_result = pd.concat([grid_search_result, to_append],
                                           sort=False, ignore_index=True)
    return grid_search_result
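# param_grid_product is referenced above but not shown in this file. The helper
# below is an assumed implementation that expands a param grid into the list of
# dicts (one per combination) that the loops above iterate over, followed by a
# hypothetical call on a monthly series:
import itertools

def param_grid_product(param_grid):
    """Expand {'order': [...], 'seasonal_order': [...]} into a list of dicts."""
    keys = list(param_grid.keys())
    combos = itertools.product(*(param_grid[k] for k in keys))
    return [dict(zip(keys, values)) for values in combos]

# result = ts_crossvalidation(
#     series,   # hypothetical pd.Series with a DatetimeIndex
#     param_grid={'order': [(1, 0, 0), (2, 0, 1)],
#                 'seasonal_order': [(0, 1, 1, 12)]},
#     cv=5,
#     model="ARIMA")
# print(result.sort_values('RMSE').head())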
def TIME_SERIES_ALGO(df, bool_stat):
    dict_rmse = dict()
    bool_log, df_log = log_transformation(df)
    col = df.columns[0]

    # 1.. NAIVE APPROACH
    # IN THIS APPROACH WE ASSIGN THE MOST RECENT TRAINING VALUE TO THE TEST DATAFRAME
    try:
        train, test = train_test_split(df)
        y_prd = np.asarray([train[col].iloc[-1]] * test.shape[0])  # .ix was removed from pandas; use .iloc
        rs_naive = sqrt(mean_squared_error(test[col].values, y_prd))
        print(rs_naive)
        dict_rmse["naive"] = rs_naive
        insert_into_database("NAIVE", rs_naive, "{}")
        if bool_log:
            # PERFORM THE SAME THING FOR LOG TRANSFORMED DATA
            train, test = train_test_split(df_log)
            y_prd = np.exp(np.asarray([train[col].iloc[-1]] * test.shape[0]))
            # compare in raw space (test is log transformed; assuming df_log = log(df))
            rs_naive_log = sqrt(mean_squared_error(np.exp(test[col].values), y_prd))
            print(rs_naive_log)
            dict_rmse["naive_log"] = rs_naive_log
            insert_into_database("NAIVE", rs_naive_log, "{}")
    except Exception as e:
        insert_into_database("NAIVE", None, e)
        print("error in modelling in naive approach,{}".format(e))

    # 2.. SIMPLE AVERAGE
    try:
        train, test = train_test_split(df)
        mean_forecast = train[col].mean()
        y_prd = np.asarray([mean_forecast] * test.shape[0])
        rs_mean = sqrt(mean_squared_error(test[col].values, y_prd))
        dict_rmse["simple_avg"] = rs_mean
        insert_into_database("SIMPLE_AVG", rs_mean, "{}")
        if bool_log:
            train, test = train_test_split(df_log)
            mean_forecast = train[col].mean()
            y_prd = np.exp(np.asarray([mean_forecast] * test.shape[0]))
            rs_mean = sqrt(mean_squared_error(np.exp(test[col].values), y_prd))
            dict_rmse["simple_avg_log"] = rs_mean
            insert_into_database("SIMPLE_AVG", rs_mean, "{}")
    except Exception as e:
        insert_into_database("SIMPLE_AVG", None, e)
        print("error in simple average,{}".format(e))

    # 3.. MOVING AVERAGE
    # IN PROGRESS, HAVE TO MODIFY IT...
    try:
        train, test = train_test_split(df)
        for i in range(25, 90):
            # rolling().mean() returns a mean for each row; we only want the
            # last row because only that one is used to forecast
            mean_moving = train[col].rolling(i).mean().iloc[-1]
            print(mean_moving)
            y_prd = np.asarray([mean_moving] * test.shape[0])
            rs_moving = sqrt(mean_squared_error(test[col].values, y_prd))
            insert_into_database("MVG_AVG", rs_moving, "{}")
    except Exception as e:
        insert_into_database("MVG_AVG", None, e)
        print("error in moving average,{}".format(e))
    try:
        if bool_log:
            for i in range(25, 90):
                train, test = train_test_split(df_log)
                mean_moving = train[col].rolling(i).mean().iloc[-1]
                y_prd = np.exp(np.array([mean_moving] * test.shape[0]))
                rs_moving_log = sqrt(mean_squared_error(np.exp(test[col].values), y_prd))
                insert_into_database("MVG_AVG", rs_moving_log, "{}")  # unified key (was "MVG_AVERAGE")
    except Exception as e:
        insert_into_database("MVG_AVG", None, e)
        print("error in log moving average model, {}".format(e))

    # 4.. SIMPLE EXPONENTIAL SMOOTHING
    try:
        train, test = train_test_split(df)
        # fit on the training split (the original fitted on the full df, leaking test data)
        fit2 = SimpleExpSmoothing(train[col]).fit(smoothing_level=0.6, optimized=False)
        y_prd = fit2.forecast(len(test))
        print(y_prd)
        rs_simple = sqrt(mean_squared_error(test.values, y_prd))
        dict_rmse["simple"] = rs_simple
        insert_into_database("SIMPLE_EXP", rs_simple, "{}")
    except Exception as e:
        print("error in simple exp without log,{}".format(e))
        insert_into_database("SIMPLE_EXP", None, e)
    try:
        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = SimpleExpSmoothing(train[col]).fit(smoothing_level=0.6, optimized=False)
            y_prd = np.exp(fit2.forecast(len(test)))
            rs_simple = sqrt(mean_squared_error(np.exp(test.values), y_prd))
            dict_rmse["simple_log"] = rs_simple
            insert_into_database("SIMPLE_EXP", rs_simple, "{}")
    except Exception as e:
        insert_into_database("SIMPLE_EXP", None, e)
        print("error in simple exponential smoothing log,{}".format(e))

    # HOLT LINEAR METHOD FOR FORECASTING
    try:
        train, test = train_test_split(df)
        # `damped` was renamed to `damped_trend` in statsmodels
        fit2 = Holt(train[col], exponential=True, damped_trend=False).fit()
        y_prd = fit2.predict(test.index.values[0], test.index.values[-1])
        rs_hotl = sqrt(mean_squared_error(test[col].values, y_prd))
        dict_rmse["rs_hotl"] = rs_hotl
        insert_into_database("HOLT_LINEAR", rs_hotl, "{}")
        if bool_log:
            train, test = train_test_split(df_log)  # was df; the log branch should use df_log
            fit2 = Holt(train[col], exponential=True, damped_trend=False).fit()
            y_prd = np.exp(fit2.predict(test.index.values[0], test.index.values[-1]))
            rs_hotl_log = sqrt(mean_squared_error(np.exp(test[col].values), y_prd))
            dict_rmse["rs_hotl_log"] = rs_hotl_log
            insert_into_database("HOLT_LINEAR", rs_hotl_log, "{}")
    except Exception as e:
        insert_into_database("HOLT_LINEAR", None, e)
        print("error in HOLT linear forecasting without damping,{}".format(e))
    try:
        train, test = train_test_split(df)
        fit2 = Holt(train[col], exponential=True, damped_trend=True).fit()
        y_prd = fit2.predict(test.index.values[0], test.index.values[-1])
        rs_holtld = sqrt(mean_squared_error(test[col].values, y_prd))
        dict_rmse["rs_holtld"] = rs_holtld
        insert_into_database("HOLT_LINEAR", rs_holtld, "{}")
        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = Holt(train[col], exponential=True, damped_trend=True).fit()
            y_prd = np.exp(fit2.predict(test.index.values[0], test.index.values[-1]))
            rs_holtld_log = sqrt(mean_squared_error(np.exp(test[col].values), y_prd))
            dict_rmse["rs_holtld_log"] = rs_holtld_log  # separate key (was overwriting rs_holtld)
            insert_into_database("HOLT_LINEAR", rs_holtld_log, "{}")
    except Exception as e:
        print("error in HOLT linear smoothing damped,{}".format(e))
        insert_into_database("HOLT_LINEAR", None, e)

    # HOLT WINTERS FORECASTING..
    try:
        train, test = train_test_split(df)
        # fit on train (the original fitted on test, which is a leak)
        fit2 = ExponentialSmoothing(train[col], trend="mul", seasonal="mul",
                                    seasonal_periods=12).fit()
        y_prd = fit2.predict(test.index.values[0], test.index.values[-1])
        rs_hlw = sqrt(mean_squared_error(test[col].values, y_prd))
        print(rs_hlw)
        dict_rmse["rs_hlw"] = rs_hlw
        insert_into_database("HOLT_WINTER", rs_hlw, "{}")
        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = ExponentialSmoothing(train[col], trend="add", seasonal="add",
                                        seasonal_periods=12).fit()
            y_prd = np.exp(fit2.predict(test.index.values[0], test.index.values[-1]))
            rs_hlw_log = sqrt(mean_squared_error(np.exp(test[col].values), y_prd))
            print(rs_hlw_log)
            dict_rmse["rs_hlw_log"] = rs_hlw_log
            insert_into_database("HOLT_WINTER", rs_hlw_log, "{}")
    except Exception as e:
        print("error in HOLT winter forecasting,{}".format(e))
        insert_into_database("HOLT_WINTER", None, e)

    # ARIMA MODEL....
    # try:
    #     rs = test_stationary(df, col)
    #     if rs:
    #         # Here we decide the order of differencing the time series
    #         df_diff = df - df.shift()
    #         df_diff.dropna(inplace=True)
    #         rs = test_stationary(df_diff, col)
    #         if rs:
    #             df_diff = df_diff - df_diff.shift()
    #         df_diff.dropna(inplace=True)
    #         train, test = train_test_split(df_diff)
    #         """The acf and pacf plots are used to calculate the parameters
    #         for the AR and MA models"""
    #         ar_list = get_params_p(train)
    #         ma_list = get_params_q(train)
    #         for i in ma_list:
    #             for j in ar_list:
    #                 try:
    #                     model = ARIMA(train, order=(j, 0, i)).fit()
    #                     y_prd = model.predict(start=test.index.values[0],
    #                                           end=test.index.values[-1])
    #                     rs = sqrt(mean_squared_error(test[col].values, y_prd))
    #                     insert_into_database("ARIMA", rs, "{}")
    #                 except Exception as e:
    #                     print("error while training arima,{}".format(e))
    #                     insert_into_database("ARIMA", None, e)
    # except Exception as e:
    #     print("error in arima model,{}".format(e))
    #     insert_into_database("ARIMA", None, e)

    # .. SARIMAX
    try:
        train, test = train_test_split(df)
        p = d = q = list(range(0, 2))
        non_seas = list(itertools.product(p, d, q))
        lis = [1, 3, 6, 12, 24, 56]
        for i in lis:
            sea_so = [(x[0], x[1], x[2], i) for x in itertools.product(p, d, q)]
            for j in non_seas:
                for k in sea_so:
                    try:
                        model = SARIMAX(train, order=j, seasonal_order=k,
                                        enforce_stationarity=False,
                                        enforce_invertibility=False).fit()
                        y_prd = model.predict(start=test.index.values[0],
                                              end=test.index.values[-1])
                        rs = sqrt(mean_squared_error(test.values, y_prd))
                        print(rs)
                        insert_into_database("SARIMAX", rs, "{}")
                    except Exception as e:
                        print("error while training the SARIMAX MODELS,{}".format(e))
                        insert_into_database("SARIMAX", None, e)
    except Exception as e:
        print("error in seasonal_arima,{}".format(e))
        insert_into_database("SARIMAX", None, e)

    # ..AUTO_ARIMA..
    try:
        train, test = train_test_split(df)
        model = auto_arima(train, start_p=1, start_q=1, start_P=1, start_Q=1,
                           max_p=5, max_q=5, max_P=5, max_Q=1, d=1, D=1,
                           seasonal=True)
        model = model.fit(train)
        y_prd = model.predict(n_periods=len(test))
        rs = sqrt(mean_squared_error(test.values, y_prd))
        print("results in auto_arima", rs)
        dict_rmse["auto_arima"] = rs
        insert_into_database("AUTO_ARIMA", rs, "{}")
    except Exception as e:
        print("error in auto_arima,{}".format(e))
        insert_into_database("AUTO_ARIMA", None, e)
    # return the collected RMSEs (the original built dict_rmse but never returned it)
    return dict_rmse
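# TIME_SERIES_ALGO relies on a train_test_split helper that is not shown in this
# snippet. For time series it must split chronologically (no shuffling), unlike
# sklearn.model_selection.train_test_split. A minimal sketch of the assumed
# helper, holding out the last 20% of rows as the test set:
def train_test_split(df, test_frac=0.2):
    """Chronological split: first (1 - test_frac) of rows train, rest test."""
    split_at = int(len(df) * (1 - test_frac))
    return df.iloc[:split_at], df.iloc[split_at:]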
def baseline(showPlot):
    np.set_printoptions(precision=3, suppress=True)
    full_df = pd.read_csv(
        '../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_lin_int.csv',
        infer_datetime_format=True,
        parse_dates=True)

    #=========================FIND BEST OFFSET========================================
    by_state = full_df['sub_region_1'].unique()
    bestLinearCorr = 0
    bestLogCorr = 0
    bestLinearOffset = -1
    bestLogOffset = -1
    bestLinearData = 0
    bestLogData = 0
    #min_all_states_lin_dim = 100
    #min_all_states_log_dim = 100
    correlationScores = []
    correlationLogScores = []
    for offset in range(30):
        #shift each state's data by offset and concatenate, to prevent bleeding
        #into other states' numbers
        full_dataframe = pd.DataFrame()
        min_dim = 100
        for region in by_state:
            temp = full_df.loc[(full_df['sub_region_1'] == region)]
            temp = temp.loc[(temp['date'] > '2020-05-01')]
            #Shift CDC data by offset value
            cdc_dataframe = temp['num_cases'].shift(periods=offset, fill_value=0)
            mobility_dataframe = temp.drop(columns=['date', 'sub_region_1', 'num_cases'])
            all_states = pd.concat([cdc_dataframe, mobility_dataframe], axis=1)
            all_states = all_states.loc[(all_states['num_cases'] > 0)]  #remove rows with zero cases
            full_dataframe = pd.concat([full_dataframe, all_states])  #DataFrame.append was removed in pandas 2.0
            '''if all_states.shape[0] < min_dim:
                min_dim = all_states.shape[0]'''
        #Compute linear and logarithmic correlations
        linearCorr = full_dataframe.corr()
        linearCorr = linearCorr.to_numpy()[0, 1:]  #Take only correlations between 'cases' and mobility data
        logData = np.log(full_dataframe + 1 - np.min(full_dataframe.to_numpy()))
        logCorr = logData.corr()
        logCorr = logCorr.to_numpy()[0, 1:]  #Take only correlations between 'cases' and mobility data
        #print("Offset:", offset, "Min_state_dim: ", min_dim)
        #print(" Log Correlation:", logCorr)
        #Save best values
        if np.linalg.norm(linearCorr) > np.linalg.norm(bestLinearCorr):
            bestLinearCorr = linearCorr
            bestLinearOffset = offset
            min_all_states_lin_dim = min_dim
            #bestLinearData = full_dataframe
        if np.linalg.norm(logCorr) > np.linalg.norm(bestLogCorr):
            bestLogCorr = logCorr
            bestLogOffset = offset
            min_all_states_log_dim = min_dim
            #bestLogData = logData
        correlationScores.append(np.linalg.norm(linearCorr))
        correlationLogScores.append(np.linalg.norm(logCorr))

    if showPlot:
        plt.plot(correlationScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Linear correlation vs. data offset")
        plt.show()
        plt.plot(correlationLogScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Logarithmic correlation vs. data offset")
        plt.show()
    print("Best Full Correlation:", bestLinearCorr)
    print("Best Full Correlation Norm:", np.linalg.norm(bestLinearCorr))
    print("Best Full Offset:", bestLinearOffset)
    print("Best Log Correlation:", bestLogCorr)
    print("Best Log Correlation Norm:", np.linalg.norm(bestLogCorr))
    print("Best Log Offset:", bestLogOffset)
    #num_models = (min(min_all_states_lin_dim, min_all_states_log_dim) - 111) // 3

    linearMSE_by_state = []
    logMSEAdj_by_state = []
    linearCasesMSE_by_state = []
    logCasesMSE_by_state = []
    logisticMSE_by_state = []
    dataNoise_by_state = []
    arimaMSE_by_state = []
    gaussMSE_by_state = []
    for s in range(len(by_state)):
        #=========================BEGIN MODEL FITTING========================================
        #Get the data for that state and shift it
        bestLinearData = pd.DataFrame()
        bestLogDf = pd.DataFrame()
        temp = full_df.loc[(full_df['sub_region_1'] == by_state[s])]
        temp = temp.loc[(temp['date'] < '2020-11-30')]
        #Shift CDC data by the best linear offset
        cdc_lin_dataframe = temp['num_cases'].shift(periods=bestLinearOffset, fill_value=0)
        mobility_lin_dataframe = temp.drop(columns=['date', 'sub_region_1', 'num_cases'])
        all_lin_states = pd.concat([cdc_lin_dataframe, mobility_lin_dataframe], axis=1)
        all_lin_states = all_lin_states.loc[(all_lin_states['num_cases'] > 0)]  #remove rows with zero cases
        bestLinearData = pd.concat([bestLinearData, all_lin_states])
        #Shift CDC data by the best log offset
        cdc_log_dataframe = temp['num_cases'].shift(periods=bestLogOffset, fill_value=0)
        mobility_log_dataframe = temp.drop(columns=['date', 'sub_region_1', 'num_cases'])
        all_log_states = pd.concat([cdc_log_dataframe, mobility_log_dataframe], axis=1)
        all_log_states = all_log_states.loc[(all_log_states['num_cases'] > 0)]  #remove rows with zero cases
        bestLogDf = pd.concat([bestLogDf, all_log_states])
        bestLogData = np.log(bestLogDf + 1 - np.min(bestLogDf.to_numpy()))

        linearMSE = []
        logMSEAdj = []
        linearCasesMSE = []
        logCasesMSE = []
        logisticMSE = []
        dataNoise = []
        arimaMSE = []
        gaussMSE = []
        #Convert data to numpy
        linearCasesOnly = bestLinearData['num_cases'].to_numpy()
        logCasesOnly = np.log(linearCasesOnly + 1)
        bestLinearData = bestLinearData.to_numpy()
        bestLogData = bestLogData.to_numpy()
        stride = 3  #trains a new model every {stride} days
        maxEpoch = 100
        for t in range((min(bestLinearData.shape[0], bestLogData.shape[0]) - 111) // stride):
            #print("Size of training:", range((min(bestLinearData.shape[0], bestLogData.shape[0]) - 111) // stride))
            print("Training model:", t)
            print("State:", by_state[s])
            #Linear Mobility Data
            linearTrainX = bestLinearData[t * stride:t * stride + 60, 1:]
            linearTrainy = bestLinearData[t * stride:t * stride + 60, :1]
            linearTestX = bestLinearData[t * stride + 60:t * stride + 111, 1:]
            linearTesty = bestLinearData[t * stride + 60:t * stride + 111, :1]
            #Logarithmic Mobility Data
            logTrainX = bestLogData[t * stride:t * stride + 60, 1:]
            logTrainy = bestLogData[t * stride:t * stride + 60, :1]
            logTestX = bestLogData[t * stride + 60:t * stride + 111, 1:]
            logTesty = bestLogData[t * stride + 60:t * stride + 111, :1]
            #Cases-only data
            linearCasesTrainX = linearCasesOnly[t * stride:t * stride + 60]
            logCasesTrainX = logCasesOnly[t * stride:t * stride + 60]
            linearCasesTestX = linearCasesOnly[t * stride + 60:t * stride + 111]
            logCasesTestX = logCasesOnly[t * stride + 60:t * stride + 111]
            timeTrain = np.arange(1, 61).reshape(-1, 1)
            timeTest = np.arange(61, 112).reshape(-1, 1)
            #Uncomment to add time data to mobility dataset
            #linearTrainX = np.hstack((linearTrainX, timeTrain))
            #logTrainX = np.hstack((logTrainX, timeTrain))
            #linearTestX = np.hstack((linearTestX, timeTest))
            #logTestX = np.hstack((logTestX, timeTest))

            #fit linear model
            linear_model = RidgeCV(cv=3).fit(linearTrainX, linearTrainy)
            predict = linear_model.predict(linearTestX)
            linearMSE.append(np.abs(predict - linearTesty) / linearTesty)
            #fit log model
            linear_model = RidgeCV(cv=3).fit(logTrainX, logTrainy)
            predict = linear_model.predict(logTestX)
            predictAdj = np.exp(predict) - 1 + np.min(full_dataframe.to_numpy())  #convert from log back to raw case number
            logMSEAdj.append(np.abs(predictAdj - linearTesty) / linearTesty)
            #fit linear cases only model
            cases_model = RidgeCV(cv=3).fit(timeTrain, linearCasesTrainX)
            if False:
                visualize_cases(cases_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)
            predict = cases_model.predict(timeTest)
            linearCasesMSE.append(np.abs(predict - linearCasesTestX) / linearCasesTestX)
            #fit log cases only model
            cases_model = RidgeCV(cv=3).fit(np.log(timeTrain), logCasesTrainX)
            if False:
                visualize_cases(cases_model, np.log(timeTrain), logCasesTrainX,
                                np.log(timeTest), logCasesTestX)
            predict = cases_model.predict(np.log(timeTest))
            predictAdj = np.exp(predict) - 1  #convert from log back to raw case number
            logCasesMSE.append(np.abs(predictAdj - linearCasesTestX) / linearCasesTestX)
            #fit logistic model
            logistic_model, cov = optimize.curve_fit(
                logisticDerivative,
                timeTrain.reshape(linearCasesTrainX.shape),
                linearCasesTrainX,
                p0=[4 * np.max(linearCasesTrainX), 60, 1 / 30],
                maxfev=10000,
                bounds=(np.array([1, 0, 0]), np.array([20000, np.inf, np.inf])))
            if False:
                visualize_logistic(logistic_model, timeTrain, linearCasesTrainX,
                                   timeTest, linearCasesTestX)
            predictLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            logisticMSE.append(np.abs(predictLogistic - linearCasesTestX) / linearCasesTestX)
            predict = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            dataNoise.append(np.mean(np.abs(predict - linearCasesTrainX) / linearCasesTrainX))
            #fit stacking regressor
            #(fixed: each (name, estimator) pair is its own tuple, and the random
            # forest must be a regressor in a StackingRegressor)
            estimators = [('lr', RidgeCV()),
                          ('svr', LinearSVR(random_state=42)),
                          ('rf', RandomForestRegressor(n_estimators=10, random_state=42))]
            reg = StackingRegressor(estimators=estimators,
                                    final_estimator=GaussianProcessRegressor(
                                        kernel=DotProduct() + WhiteKernel(),
                                        random_state=0))
            stacking_model = reg.fit(timeTrain, linearCasesTrainX)
            if False:
                visualize_cases(stacking_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)
            predict = stacking_model.predict(timeTest)
            linearCasesMSE.append(np.abs(predict - linearCasesTestX) / linearCasesTestX)
            #fit ARIMA
            #Perform grid search to determine ARIMA order
            '''stepwise_fit = auto_arima(linearCasesTrainX, start_p=1, start_q=1,
                                         max_p=3, max_q=3, m=7,
                                         start_P=0, seasonal=True,
                                         d=None, D=1, trace=True,
                                         error_action='ignore',   # we don't want to know if an order does not work
                                         suppress_warnings=True,  # we don't want convergence warnings
                                         stepwise=True)           # set to stepwise
            stepwise_fit.summary()'''
            model = SARIMAX(linearCasesTrainX,
                            initialization='approximate_diffuse',
                            order=(2, 0, 0),
                            seasonal_order=(2, 1, 0, 7))
            result = model.fit(disp=False)
            if True:
                visualize_ARIMA(result, timeTrain, linearCasesTrainX, timeTest,
                                linearCasesTestX)
            predictArima = result.predict(61, 111, typ='levels')
            arimaMSE.append(np.abs(predictArima - linearCasesTestX) / linearCasesTestX)
            #Evaluate other models to use as input to gaussian process
            arima1 = SARIMAX(linearCasesTrainX, initialization='approximate_diffuse',
                             order=(2, 0, 0), seasonal_order=(2, 1, 0, 7)).fit(disp=False)
            arima2 = SARIMAX(linearCasesTrainX, initialization='approximate_diffuse',
                             order=(2, 0, 0), seasonal_order=(2, 1, 1, 7)).fit(disp=False)
            arima3 = SARIMAX(linearCasesTrainX, initialization='approximate_diffuse',
                             order=(1, 1, 0), seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima4 = SARIMAX(linearCasesTrainX, initialization='approximate_diffuse',
                             order=(0, 1, 1), seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima5 = SARIMAX(linearCasesTrainX, initialization='approximate_diffuse',
                             order=(0, 1, 1), seasonal_order=(2, 1, 0, 7)).fit(disp=False)
            predictLog = cases_model.predict(np.log(timeTrain))  #Log model
            predictAdj = np.exp(predictLog) - 1  #convert from log back to raw case number
            predictLogistic = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])  #logistic model
            predictArima1 = arima1.predict(1, 60, typ='levels')
            predictArima2 = arima2.predict(1, 60, typ='levels')
            predictArima3 = arima3.predict(1, 60, typ='levels')
            predictArima4 = arima4.predict(1, 60, typ='levels')
            predictArima5 = arima5.predict(1, 60, typ='levels')
            testLog = cases_model.predict(np.log(timeTest))  #Log model
            testAdj = np.exp(testLog) - 1  #convert from log back to raw case number
            testLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])  #logistic model
            testArima1 = arima1.predict(61, 111, typ='levels')
            testArima2 = arima2.predict(61, 111, typ='levels')
            testArima3 = arima3.predict(61, 111, typ='levels')
            testArima4 = arima4.predict(61, 111, typ='levels')
            testArima5 = arima5.predict(61, 111, typ='levels')
            #fit gaussian process meta-learner
            gaussTrain = np.array([predictLogistic, predictArima1, predictArima2,
                                   predictArima3, predictArima4, predictArima5]).T
            gaussTest = np.array([testLogistic, testArima1, testArima2, testArima3,
                                  testArima4, testArima5]).T
            reg = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                           random_state=0)
            stacking_model = reg.fit(gaussTrain, linearCasesTrainX)
            predictTrain = stacking_model.predict(gaussTrain)
            predictTest = stacking_model.predict(gaussTest)
            if False:
                visualize_gauss(np.hstack((predictTrain, predictTest)).T,
                                timeTrain, linearCasesTrainX, timeTest,
                                linearCasesTestX)
            gaussMSE.append(np.abs(predictTest - linearCasesTestX) / linearCasesTestX)
        #Append to state totals
        linearMSE_by_state.append(np.reshape(np.array(linearMSE).mean(axis=0), (51)))
        logMSEAdj_by_state.append(np.reshape(np.array(logMSEAdj).mean(axis=0), (51)))
        linearCasesMSE_by_state.append(np.reshape(np.array(linearCasesMSE).mean(axis=0), (51)))
        logCasesMSE_by_state.append(np.reshape(np.array(logCasesMSE).mean(axis=0), (51)))
        logisticMSE_by_state.append(np.reshape(np.array(logisticMSE).mean(axis=0), (51)))
        dataNoise_by_state.append(np.mean(dataNoise))
        arimaMSE_by_state.append(np.reshape(np.array(arimaMSE).mean(axis=0), (51)))
        gaussMSE_by_state.append(np.reshape(np.array(gaussMSE).mean(axis=0), (51)))
        print("Average logistic Test error:", np.mean(dataNoise))

    #Plot proof-of-concept graph
    if showPlot:
        plt.plot(np.array(linearMSE_by_state).mean(axis=0), label='Mobility (linear, non-temporal)')
        plt.plot(np.array(logMSEAdj_by_state).mean(axis=0), label='Mobility (logarithmic, non-temporal)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()
        #Plot baseline graph
        #plt.plot(np.array(linearCasesMSE_by_state).mean(axis=0), label='Cases (linear, temporal)')  #Don't plot because performance is terrible
        plt.plot(np.array(logCasesMSE_by_state).mean(axis=0), label='Cases (logarithmic temporal)')
        plt.plot(np.array(logisticMSE_by_state).mean(axis=0), label='Cases (logistic temporal)')
        plt.plot(np.array(arimaMSE_by_state).mean(axis=0), label='Cases (ARIMA)')
        plt.plot(np.array(gaussMSE_by_state).mean(axis=0), label='Cases (Gaussian Process meta)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()
    print("Average logistic test error:", np.mean(dataNoise_by_state))
# best_model, models = best_sarima_model(train_data=log_transformed_train_data, p=range(3), q=range(3), P=range(3), Q=range(3))
# preds_best = np.exp(best_model.predict(start='2019-01-01', dynamic=True, typ='levels'))
# print(f'MAPE {np.round(mean_abs_pct_error(log_transformed_test_data, preds_best), 2)}')
agile_model = SARIMAX(endog=log_transformed_train_data,
                      order=(1, 1, 2),
                      seasonal_order=(1, 1, 2, 52),
                      enforce_invertibility=False).fit()
agile_model.summary()

# just to deactivate PyCharm warnings regarding NumPy
# noinspection PyTypeChecker
agile_model_pred = np.exp(
    agile_model.predict(start=test_first_date,
                        end=test_last_date,
                        dynamic=True,
                        typ='levels'))
print(f'MAPE {np.round(mean_abs_pct_error(test_data, agile_model_pred), 2)}%')
# print(f'MAE: {np.round(mean_absolute_error(test_data, agile_model_pred), 2)}')

# noinspection PyTypeChecker
agile_model_forecast = np.exp(agile_model.forecast(steps=2))
print(agile_model_forecast)


def plot_prediction(training_data, agile_model, agile_model_pred, original_data):
    # in-sample fit = observed values minus residuals (skip the first point, which has no lag)
    model_data = training_data.values[1:].reshape(-1) - agile_model.resid[1:]
    model_data = pd.concat((model_data, agile_model_pred))
    plt.figure(figsize=(16, 6))
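# mean_abs_pct_error is not defined in this snippet; below is a sketch of the
# assumed helper (MAPE as a percentage, matching the f-string above that
# appends '%'):
import numpy as np

def mean_abs_pct_error(actual, predicted):
    """Mean absolute percentage error, in percent."""
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100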
model = SARIMAX(series,
                order=(1, 1, 1),
                seasonal_order=(1, 1, 1, 52),
                trend='n',
                enforce_stationarity=False,
                enforce_invertibility=False).fit()
print("________________________")
print("MODEL SUMMARY")
print(model.summary().tables[1])
# Nice way to check that the residuals follow a Gaussian distribution
model.plot_diagnostics(figsize=(15, 12))
plt.show()

train_pred = model.predict()
train_pred_cpy = train_pred.copy()
print(train_pred_cpy)
print(type(train_pred_cpy))
print(type(series))
cdf_index = a_organic[0:train_size].index
#print(cdf_index)
#print(type(cdf_index))
#________________________________________________
#Comparing the FIT with the trained data
#________________________________________________
#I need to create here a data frame from the series
compare_frame = {
# In[162]:
loss_per_epoch = model.history.history['loss']
plt.plot(range(len(loss_per_epoch)), loss_per_epoch)

# In[163]:
first_eval_batch = scaled_train[-30:]

# In[164]:
first_eval_batch = first_eval_batch.reshape((1, n_input, n_features))

# In[165]:
model.predict(first_eval_batch)

# In[166]:
scaled_test[0]

# In[167]:
test_predictions = []
first_eval_batch = scaled_train[-n_input:]
current_batch = first_eval_batch.reshape((1, n_input, n_features))

# In[168]:
# roll the window forward by one step: drop the oldest value, append a new one
np.append(current_batch[:, 1:, :], [[[99]]], axis=1)
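# The cells above set up the classic rolling one-step forecast for a Keras
# sequence model: predict one step, append the prediction to the input window,
# drop the oldest value. A sketch of the loop these cells build toward
# (test_predictions, scaled_test, current_batch come from the cells above):
for _ in range(len(scaled_test)):
    # one-step-ahead prediction for the current window
    current_pred = model.predict(current_batch)[0]
    test_predictions.append(current_pred)
    # slide the window: drop the oldest step, append the new prediction
    current_batch = np.append(current_batch[:, 1:, :], [[current_pred]], axis=1)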
#fifth, run SARIMA train_model with the order determined by auto_arima
# ----------
# exog must be supplied at fit time as well, otherwise the exog passed to
# predict() below has no coefficients to use
train_model = SARIMAX(train['total'],
                      exog=train[['holiday']],
                      order=(0, 0, 1),
                      seasonal_order=(2, 0, 0, 7),
                      enforce_invertibility=False).fit()
print(train_model.summary())
# enforce_invertibility=False avoids a ValueError when the MA coefficients fall
# outside the invertibility region (i.e. when they are not kept below 1)

#sixth, test predictions vs test set
# ----------
start = len(train)
end = len(train) + len(test) - 1
predictions = train_model.predict(start, end,
                                  exog=test[['holiday']]).rename('SARIMAX predictions vs test')
test['total'].plot(legend=True)
predictions.plot(legend=True)
plt.show()

#seventh, evaluate the model on rmse error
# ----------
error = rmse(test['total'], predictions)
std = test['total'].std()
error_result = error / std * 100
print('rmse error as a percentage of the standard deviation:')
print(error_result)

#eighth, run forecast into the future with full dataset
# ----------
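# The snippet stops before the final step; a sketch of how that forecast might
# look, refitting on the assumed full dataset `df` (hypothetical name) and
# forecasting 30 days ahead. Future exog values must be provided for every
# forecast step:
# full_model = SARIMAX(df['total'],
#                      exog=df[['holiday']],
#                      order=(0, 0, 1),
#                      seasonal_order=(2, 0, 0, 7),
#                      enforce_invertibility=False).fit()
# future_holidays = pd.DataFrame({'holiday': [...]}, index=future_dates)  # hypothetical
# forecast = full_model.forecast(steps=30, exog=future_holidays)
# forecast.plot(legend=True)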