_HOLDOUT_DAYS = 30  # size of the hold-out window used by the statistical models


def _render_current_figure():
    """Send the active matplotlib figure to Streamlit, then release it."""
    fig = plt.gcf()
    # Passing the figure explicitly avoids Streamlit's deprecated implicit use
    # of the global pyplot figure; closing it stops figures piling up across
    # script reruns.
    st.pyplot(fig)
    plt.close(fig)


def _render_moving_average(df):
    """Plot 20/10/3-day simple moving averages of NO2 and report each MAE."""
    windows = (20, 10, 3)
    sma = df[['Nitrogen_dioxide']].copy()
    for window in windows:
        sma['SMA_{}'.format(window)] = (
            sma['Nitrogen_dioxide'].rolling(window, min_periods=1).mean())

    # DataFrame.plot opens its own figure, so no plt.figure() beforehand.
    sma.plot(figsize=(25, 15))
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('Nitrogen dioxide', fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.title("Simple Moving Average for 20, 10 and 3 days", fontsize=30)
    # The first plotted column is the raw NO2 series (it was previously
    # mislabelled as 'Temperature').
    plt.legend(
        labels=['Nitrogen dioxide', '20-days SMA', '10-days SMA',
                '3-days SMA'],
        fontsize=22)
    plt.grid()
    _render_current_figure()

    for window in windows:
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'],
                                          sma['SMA_{}'.format(window)])
        st.write("MAE for {} days is {:,.2f}".format(window, mae))


def _render_holt_winters(df):
    """Fit additive Holt-Winters on all but the last 30 points and plot it."""
    series = df['Nitrogen_dioxide']
    train, test = series[:-_HOLDOUT_DAYS], series[-_HOLDOUT_DAYS:]

    fitted = ExponentialSmoothing(np.asarray(train),
                                  trend='add',
                                  seasonal_periods=7,
                                  seasonal='add').fit()
    # Re-attach the hold-out index so the forecast lines up with `test`.
    forecast = pd.Series(fitted.forecast(len(test)), index=test.index)
    mae = metrics.mean_absolute_error(test, forecast)

    plt.figure(figsize=(16, 8))
    plt.plot(train, label='Train')
    plt.plot(test, label='Test')
    plt.plot(forecast, label='Holt_Winter (MAE={:.2f})'.format(mae))
    plt.title("Triple Exponential smoothing", fontsize=30)
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('Nitrogen dioxide', fontsize=20)
    plt.legend(fontsize=19)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.grid()
    _render_current_figure()
    st.write("MAE for 30 days is {:,.2f}".format(mae))


def _render_seasonal_arima(df):
    """Fit a seasonal ARIMA on the full series and plot fit plus forecast."""
    frame = df.copy()
    # NOTE(review): the in-sample prediction window is hard-coded to
    # positions 360..390; this only lines up if the data has about 391 rows.
    # Confirm against the dataset before deriving it from len(frame).
    pred_start, pred_end = 360, 390

    # `dynamic` is an argument of predict(), not of the model constructor.
    model = SARIMAX(frame['Nitrogen_dioxide'],
                    order=(0, 1, 0),
                    seasonal_order=(2, 1, 0, 30),
                    enforce_stationarity=False,
                    enforce_invertibility=False)
    results = model.fit()
    frame['predicted_test'] = results.predict(start=pred_start,
                                              end=pred_end,
                                              dynamic=True)
    # Name the forecast directly rather than renaming column 0, which only
    # works when the returned series happens to be unnamed.
    seasonal_forecast = pd.Series(
        results.forecast(_HOLDOUT_DAYS),
        name='Seasonal forecast for 30 periods').to_frame()

    ax = seasonal_forecast.plot(figsize=(20, 10), color='green')
    frame['Nitrogen_dioxide'].plot(ax=ax)
    frame['predicted_test'].plot(ax=ax)
    plt.ylabel("Nitrogen_dioxide", fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.title("Seasonal Arima", fontsize=30)
    plt.legend(fontsize=19)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.grid()
    _render_current_figure()

    mae = metrics.mean_absolute_error(
        frame['Nitrogen_dioxide'].iloc[pred_start:],
        frame['predicted_test'].iloc[pred_start:])
    st.write("MAE of Seasonal Arima is {:.2f}".format(mae))


def _render_model_comparison():
    """Show the fixed table of ML model scores, best test score first."""
    acc_table = pd.DataFrame({
        'Model': [
            'Linear Regression', 'Decision Tree', 'Random_forest',
            'Gradient_Boosting'
        ],
        'Train_score': [0.59, 0.70, 0.91, 0.83],
        'Test_score': [0.49, 0.46, 0.40, 0.50],
        'MAE_train': [4265.36, 3713.86, 2220.76, 2958.29],
        'MAE_test': [3053.96, 3116.57, 3161.29, 2726.36],
    })
    acc_table = acc_table.sort_values(
        by='Test_score', ascending=False).reset_index(drop=True)
    st.table(acc_table)


def _render_gradient_boosting(df):
    """Train a GBR on 7 lagged NO2 values (70/30 split) and plot the test fit."""
    n_lags = 7
    series = df['Nitrogen_dioxide']
    # Columns t-7 .. t-1 are the features, 't' is the target.
    lagged = pd.DataFrame({
        't-' + str(lag): series.shift(lag) for lag in range(n_lags, 0, -1)
    })
    lagged['t'] = series.values
    # The first n_lags rows contain NaN lags and are dropped.
    supervised = lagged[n_lags:].values
    features, target = supervised[:, :-1], supervised[:, -1]

    split = int(len(target) * 0.70)
    train_feature, test_feature = features[:split], features[split:]
    train_target, test_target = target[:split], target[split:]

    gbr = GradientBoostingRegressor(max_features=3,
                                    max_depth=2,
                                    learning_rate=0.1,
                                    n_estimators=100,
                                    subsample=0.8,
                                    random_state=50)
    gbr.fit(train_feature, train_target)
    train_score = gbr.score(train_feature, train_target)
    test_score = gbr.score(test_feature, test_target)

    comparison = pd.DataFrame({
        'Actual_test': test_target,
        'Predicted_test': gbr.predict(test_feature),
    })
    # The day count comes from the data instead of a hard-coded 116.
    comparison.plot(
        title='Gradient boosting Actual vs Predicted test of last {} days'.
        format(len(comparison)))
    plt.grid()
    _render_current_figure()
    st.write("Gradient boosting training score {:.2f}".format(train_score))
    st.write("Gradient boosting test score {:.2f}".format(test_score))

    if st.checkbox("Visualize for last 10 days"):
        comparison[-10:].plot(title=' GBR Plot of last 10 days')
        _render_current_figure()


def _make_supervised(sequence, time_step):
    """Turn a (n, 1) sequence into (windows, next-value) training pairs.

    Produces ``len(sequence) - time_step - 1`` samples; the plotting offsets
    in ``_render_lstm`` rely on exactly that count.
    """
    n_samples = len(sequence) - time_step - 1
    inputs = [sequence[i:i + time_step, 0] for i in range(n_samples)]
    targets = [sequence[i + time_step, 0] for i in range(n_samples)]
    return np.array(inputs), np.array(targets)


def _forecast_recursive(model, seed_window, horizon):
    """Roll the model forward ``horizon`` steps, feeding predictions back in.

    ``seed_window`` holds the most recent scaled values (one per model input
    feature). Returns a list of one-element lists in scaled units, ready for
    ``inverse_transform``.
    """
    window = [float(value) for value in seed_window]
    forecasts = []
    for _ in range(horizon):
        # Same (samples, 1, time_step) layout the model was trained on.
        step_input = np.array(window, dtype='float32').reshape(
            1, 1, len(window))
        next_value = float(model.predict(step_input, verbose=0)[0, 0])
        forecasts.append([next_value])
        window = window[1:] + [next_value]
    return forecasts


def _render_lstm(df):
    """Train a stacked LSTM on scaled data, plot its fit and optional forecast."""
    # NOTE(review): the series is selected by position (column 7); presumably
    # this is the Nitrogen_dioxide column - confirm against the data schema.
    data = df.iloc[:, 7].values.reshape(-1, 1).astype('float32')

    scalar = MinMaxScaler()
    data = scalar.fit_transform(data)
    train_lstm, test_lstm = data[:-30, :], data[-30:, :]

    time_step = 1
    train_X, train_Y = _make_supervised(train_lstm, time_step)
    test_X, test_Y = _make_supervised(test_lstm, time_step)
    # Keras LSTM layers expect (samples, timesteps, features).
    train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
    test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))

    model = Sequential()
    model.add(
        LSTM(50,
             activation='relu',
             input_shape=(1, time_step),
             return_sequences=True))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dropout(0.15))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.fit(train_X, train_Y, batch_size=4, epochs=50, verbose=2)

    # Bring predictions and targets back to the original units.
    train_predict = scalar.inverse_transform(model.predict(train_X))
    test_predict = scalar.inverse_transform(model.predict(test_X))
    train_Y = scalar.inverse_transform([train_Y])
    test_Y = scalar.inverse_transform([test_Y])

    # Mean absolute error, via the same `metrics` module the other models use.
    train_score = metrics.mean_absolute_error(train_Y[0], train_predict[:, 0])
    test_score = metrics.mean_absolute_error(test_Y[0], test_predict[:, 0])

    # NaN-filled canvases so each prediction run is drawn at its own offset.
    train_plot = np.full_like(data, np.nan)
    train_plot[time_step:len(train_predict) + time_step, :] = train_predict
    test_plot = np.full_like(data, np.nan)
    test_start = len(train_predict) + (time_step * 2) + 1
    test_plot[test_start:len(data) - 1, :] = test_predict

    plt.figure(figsize=(16, 8))
    plt.plot(train_plot)
    plt.plot(test_plot, color='green')
    plt.plot(scalar.inverse_transform(data), color='orange')
    plt.title(
        "Long Short Term Memory Network with train ,test and forecast",
        fontsize=20)
    plt.ylabel("Nitrogen_dioxide", fontsize=20)
    # Labels follow the plotting order above: train fit, test fit, raw data.
    plt.legend(labels=['Train prediction', 'Test prediction', 'Actual data'],
               fontsize=19)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    plt.grid()
    _render_current_figure()
    st.write('Train Score: %.3f MAE' % (train_score))
    st.write('Test Score: %.3f MAE' % (test_score))

    if st.checkbox('Visualize forecasted chart for 10 future days'):
        future_day = 10
        n_steps = 10  # number of trailing observed days shown for context

        # Roll forward from the last observed (scaled) values. The scaler is
        # deliberately not refitted, so inverse_transform below still maps
        # back to the original units.
        lst_output = _forecast_recursive(model, data[-time_step:, 0],
                                         future_day)

        previous_days = np.arange(len(data) - n_steps, len(data))
        predicted_future = np.arange(len(data), len(data) + future_day)
        recent_and_future = data[-n_steps:].tolist() + lst_output

        plt.figure(figsize=(16, 8))
        plt.plot(np.append(previous_days, predicted_future),
                 scalar.inverse_transform(recent_and_future),
                 label='Last 10 days and forecast')
        plt.plot(predicted_future,
                 scalar.inverse_transform(lst_output),
                 label='Forecast')
        plt.title("Forecast for 10 future days", fontsize=20)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=8)
        plt.ylabel("Nitrogen dioxide")
        _render_current_figure()


def run_app():
    """Render the NO2 forecasting Streamlit page.

    Shows the header and sidebar images, lets the user pick a forecasting
    model in the sidebar, and renders that model's chart and error metrics.
    Reads the module-level ``df`` and relies on names imported elsewhere in
    this file (``st``, ``plt``, ``Image``, ``metrics`` and the model classes).
    """
    image = Image.open('Air_pol.JPG')
    st.image(image, use_column_width=True)
    no2 = Image.open('no2.JPG')
    st.sidebar.image(no2, use_column_width=True)

    add_selectbox = st.sidebar.selectbox(
        "Select Forecasting Model",
        ("Simple Moving Average", "LSTM", "Triple Exponential Smoothing",
         "Seasonal ARIMA", "Gradient Boosting Regressor",
         "ML Model Comparison Table"))
    st.sidebar.info(
        'This application is developed by Siddhesh D. Munagekar to forecast Nitrogen dioxide concentration in air using multiple forecasting technique'
    )

    if add_selectbox == 'Simple Moving Average':
        _render_moving_average(df)
    elif add_selectbox == 'Triple Exponential Smoothing':
        _render_holt_winters(df)
    elif add_selectbox == 'Seasonal ARIMA':
        _render_seasonal_arima(df)
    elif add_selectbox == 'ML Model Comparison Table':
        _render_model_comparison()
    elif add_selectbox == 'Gradient Boosting Regressor':
        _render_gradient_boosting(df)
    elif add_selectbox == 'LSTM':
        _render_lstm(df)
# Scale both splits with the existing scaler. `scaler`, `train` and `test`
# are not defined in this section; they must come from earlier in the file.
scaled_train = scaler.transform(train)
scaled_test = scaler.transform(test)

# Each training sample is a window of the previous `n_input` scaled
# observations; the target is the observation that follows the window.
n_input = 30
n_features = 1
generator = TimeseriesGenerator(scaled_train,
                                scaled_train,
                                length=n_input,
                                batch_size=30)

# Single LSTM layer followed by two dense layers, trained on MSE.
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(n_input, n_features)))
model.add(Dense(50))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.summary()

# Model.fit accepts the generator directly; fit_generator is deprecated and
# has been removed from current Keras releases.
model.fit(generator, epochs=30)

# Notebook leftover: lists the metrics recorded during training. It only
# displays something when run interactively.
model.history.history.keys()
def arima(n_windows=3, n_states=3):
    """Evaluate an ARIMA-features + MLP case forecaster on sliding windows.

    For every window (60 training days followed by 30 test days, advanced by
    10 days per window) and every selected state:

    1. each feature column is forecast 30 days ahead with a seasonal ARIMA;
    2. an MLP is trained on the real features to predict the shifted case
       count, then applied to the ARIMA-forecast features;
    3. the absolute percentage error against the true test cases is stored.

    Prints progress and summary statistics and plots the mean error per
    window. Returns ``None``.

    Args:
        n_windows: maximum number of sliding windows to evaluate. Capped by
            the amount of data available for the selected states.
        n_states: number of states (in order of first appearance in the CSV)
            to include.

    Relies on ``showPlot`` and ``visualize_ARIMA`` being defined at module
    level elsewhere in this file.
    """
    train_days, test_days = 60, 30
    window_days = train_days + test_days
    stride = 10   # a new model is trained every `stride` days
    offset = 14   # days by which the case counts are shifted
    max_fail = 4  # MLP retraining attempts before a window is given up
    # One (order, seasonal_order) pair per feature column, by position.
    arima_orders = [(1, 0, 0), (1, 0, 1), (3, 0, 0), (1, 0, 0), (0, 1, 1),
                    (1, 0, 0), (2, 0, 0)]
    seasonal_orders = [(2, 1, 0, 7), (2, 1, 0, 7), (1, 1, 0, 7),
                       (1, 1, 0, 7), (0, 1, 1, 7), (0, 1, 1, 7),
                       (2, 1, 0, 7)]

    full_df = pd.read_csv(
        '../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_mode.csv',
        infer_datetime_format=True,
        parse_dates=True)
    # Preserve the unshifted case values as an additional feature.
    full_df['originalCases'] = full_df['num_cases']
    by_state = full_df['sub_region_1'].unique()

    # Shift each state separately so values never bleed across states.
    # Frames are collected and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and is quadratic when used in a loop.
    per_state_frames = []
    for region in by_state:
        temp = full_df.loc[full_df['sub_region_1'] == region]
        temp = temp.loc[temp['date'] < '2020-11-20']
        cdc = temp['num_cases'].shift(periods=offset, fill_value=0)
        mobility = temp.drop(columns=['date', 'num_cases'])
        shifted = pd.concat([cdc, mobility], axis=1)
        # Remove rows with zero cases (includes the shift padding).
        per_state_frames.append(shifted.loc[shifted['num_cases'] > 0])
    full_dataframe = pd.concat(per_state_frames)

    states = by_state[:n_states]

    # The shortest selected state limits how many full windows fit.
    rows_per_state = full_dataframe['sub_region_1'].value_counts()
    min_length = min(
        (int(rows_per_state.get(region, 0)) for region in states), default=0)
    available_windows = max(0, (min_length - window_days) // stride + 1)
    n_windows = min(n_windows, available_windows)

    # 1-based day labels, used for plotting only.
    time_train = np.arange(1, train_days + 1).reshape(-1, 1)
    time_test = np.arange(train_days + 1, window_days + 1).reshape(-1, 1)

    failed_months = 0  # windows for which the MLP never produced output
    percent_errors = []
    for t in range(n_windows):
        start = t * stride
        split = start + train_days
        stop = start + window_days

        train_X, train_y, test_y, arima_X = [], [], [], []
        for region in states:
            state_data = full_dataframe.loc[
                full_dataframe['sub_region_1'] == region].drop(columns=[
                    'sub_region_1',
                    'grocery_and_pharmacy_percent_change_from_baseline'
                ])
            values = state_data.to_numpy()
            # Column 0 is the shifted case count (target); the rest are
            # features.
            state_train_X = values[start:split, 1:]
            state_test_X = values[split:stop, 1:]
            train_X.append(state_train_X)
            train_y.append(values[start:split, :1])
            test_y.append(values[split:stop, :1])

            state_forecasts = []
            for i, feature in enumerate(state_train_X.T):
                model = SARIMAX(feature,
                                order=arima_orders[i],
                                seasonal_order=seasonal_orders[i],
                                initialization='approximate_diffuse')
                result = model.fit(disp=False)
                if showPlot >= 2:
                    # Use this state's own arrays; the accumulator lists
                    # cannot be indexed with [:, i].
                    visualize_ARIMA(result, time_train, state_train_X[:, i],
                                    time_test, state_test_X[:, i])
                # statsmodels indices are 0-based: the 60 training points
                # are 0..59, so the 30 test days are steps 60..89.
                state_forecasts.append(
                    np.asarray(
                        result.predict(train_days,
                                       window_days - 1,
                                       typ='levels')))
            arima_X.append(np.array(state_forecasts).T)

        arima_X = np.vstack(arima_X)
        train_X = np.vstack(train_X)
        train_y = np.vstack(train_y)
        test_y = np.vstack(test_y)

        # Append a constant bias column to both feature matrices.
        train_X = np.hstack((train_X, np.ones((train_X.shape[0], 1))))
        arima_X = np.hstack((arima_X, np.ones((arima_X.shape[0], 1))))

        fail_counter = 0
        while fail_counter < max_fail:
            # Retrain from scratch when the ReLU output collapses to zero.
            model = Sequential()
            model.add(
                Dense(10, input_dim=train_X.shape[1], activation='relu'))
            model.add(Dense(30, activation='relu'))
            model.add(Dense(1, activation='relu'))
            model.compile(optimizer='adam',
                          loss='mean_squared_error',
                          metrics=['accuracy'])
            model.fit(train_X, train_y, epochs=100, verbose=0)
            y_pred = model.predict(arima_X)
            if np.sum(y_pred == 0) < 0.1 * arima_X.shape[0]:
                break
            print("Prediction is zero. Retraining...")
            fail_counter += 1

        if fail_counter == max_fail:
            failed_months += 1
            # Record a 100% error for every test day, keeping the same
            # array shape as the success path.
            percent_error = np.ones((1, test_y.shape[0]))
            print("Could not train model on this data")
        else:
            percent_error = np.abs((y_pred - test_y) / test_y).T

        print(percent_error.shape)
        # NOTE(review): one mean error per state is assumed here, based on
        # the axis-1 average taken over this list below - confirm intent.
        # Rows were stacked state by state, test_days rows each.
        errors_by_state = percent_error.reshape(len(states),
                                                test_days).mean(axis=1)
        print("Loss:", np.mean(percent_error))
        percent_errors.append(errors_by_state)

        if showPlot >= 1 or np.mean(percent_error) > 0.4:
            # Only the first state's window is drawn.
            plt.plot(time_train, train_y[0:train_days], label="Past")
            plt.plot(time_test, test_y[0:test_days], label="True Future")
            plt.plot(time_test, y_pred[0:test_days],
                     label="Predicted Future")
            plt.plot(time_test,
                     arima_X[0:test_days, -2],
                     label="Predicted ARIMA (case only)")
            plt.legend()
            plt.show()

    print(np.array(percent_errors).shape)
    print("Failed Months:", failed_months)
    if not percent_errors:
        # No window fitted in the available data; nothing to summarise.
        return
    window_means = np.mean(percent_errors, axis=1)
    print(window_means)
    plt.plot(window_means.flatten())
    plt.show()
    return