예제 #1
0
def run_app():
    """Render the Streamlit page that forecasts nitrogen dioxide levels.

    Displays a header image, a sidebar image, a model selector and an info
    box, then runs exactly one branch depending on the selected entry:

    * "Simple Moving Average": 20/10/3-period rolling means plus their MAE.
    * "Triple Exponential Smoothing": Holt-Winters fit on all but the last
      30 rows, evaluated on the last 30 rows.
    * "Seasonal ARIMA": SARIMAX fit on the full series with an in-sample
      dynamic prediction and a 30-step forecast.
    * "ML Model Comparison Table": a static table of hard-coded scores.
    * "Gradient Boosting Regressor": regression on 7 lagged values with a
      70/30 chronological split.
    * "LSTM": stacked LSTM on the min-max scaled series, with an optional
      10-step forecast chart.

    The function takes no arguments and returns None; all output goes to
    Streamlit. It reads the frame ``df`` (with a ``Nitrogen_dioxide``
    column) and the libraries it uses from the enclosing module scope.
    """
    image = Image.open('Air_pol.JPG')

    st.image(image, use_column_width=True)
    no2 = Image.open('no2.JPG')
    st.sidebar.image(no2, use_column_width=True)

    # Hold out the last 30 observations; used by the Holt-Winters branch.
    df2 = df.copy()
    df2 = df2['Nitrogen_dioxide']
    train = df2[0:-30]
    test = df2[-30:]

    add_selectbox = st.sidebar.selectbox(
        "Select Forecasting Model",
        ("Simple Moving Average", "LSTM", "Triple Exponential Smoothing",
         "Seasonal ARIMA", "Gradient Boosting Regressor",
         "ML Model Comparison Table"))
    st.sidebar.info(
        'This application is developed by Siddhesh D. Munagekar to forecast Nitrogen dioxide concentration in air using multiple forecasting technique'
    )

    if add_selectbox == 'Simple Moving Average':
        df1 = df.Nitrogen_dioxide.copy()
        df1 = pd.DataFrame(df1)
        # min_periods=1 keeps the first rows defined (no leading NaN), which
        # the MAE computation below relies on.
        df1['SMA_20'] = df1.Nitrogen_dioxide.rolling(20, min_periods=1).mean()
        df1['SMA_10'] = df1.Nitrogen_dioxide.rolling(10, min_periods=1).mean()
        df1['SMA_3'] = df1.Nitrogen_dioxide.rolling(3, min_periods=1).mean()
        fig = plt.figure()

        df1.plot(figsize=(25, 15))
        plt.xlabel('Date', fontsize=20)
        plt.ylabel('Nitrogen dioxide', fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.title("Simple Moving  Average for 20, 10 and 3 days", fontsize=30)
        # The first plotted column is the raw Nitrogen_dioxide series, so it
        # must not be labelled as a temperature curve.
        plt.legend(
            labels=[
                'Nitrogen dioxide', '20-days SMA', '10-days SMA', '3-days SMA'
            ],
            fontsize=22)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'],
                                          df1['SMA_20'])
        st.write("MAE for 20 days is {:,.2f}".format(mae))
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'],
                                          df1['SMA_10'])
        st.write("MAE for 10 days is {:,.2f}".format(mae))
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'], df1['SMA_3'])
        st.write("MAE for 3 days is {:,.2f}".format(mae))

    if add_selectbox == 'Triple Exponential Smoothing':

        train = pd.DataFrame(train)
        test = pd.DataFrame(test)
        pred = test.copy()
        # Additive trend and additive seasonality with a period of 7.
        fit1 = ExponentialSmoothing(np.asarray(train['Nitrogen_dioxide']),
                                    trend='add',
                                    seasonal_periods=7,
                                    seasonal='add').fit()

        pred['Holt_Winter'] = fit1.forecast(len(test))
        # Calculate KPI's
        mae = metrics.mean_absolute_error(test.Nitrogen_dioxide,
                                          pred.Holt_Winter)

        # Plot
        plt.figure(figsize=(16, 8))
        plt.plot(train['Nitrogen_dioxide'], label='Train')
        plt.plot(test['Nitrogen_dioxide'], label='Test')
        plt.plot(pred['Holt_Winter'],
                 label='Holt_Winter (MAE={:.2f})'.format(mae))
        plt.title("Triple Exponential smoothing", fontsize=30)
        plt.xlabel('Date', fontsize=20)
        plt.ylabel('Nitrogen dioxide', fontsize=20)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        st.write("MAE for 30 days is {:,.2f}".format(mae))

    # Seasonal ARIMA
    if add_selectbox == 'Seasonal ARIMA':

        df3 = df.copy()
        #train = df3[0:-30]
        test = df3[-30:]

        # The model is fitted on the complete series (the train split above
        # is disabled), so the prediction below is in-sample.
        # NOTE(review): `dynamic=True` is passed to the constructor as well as
        # to predict(); confirm the constructor actually accepts it.
        model = SARIMAX(df3['Nitrogen_dioxide'],
                        order=(0, 1, 0),
                        seasonal_order=(2, 1, 0, 30),
                        enforce_stationarity=False,
                        enforce_invertibility=False,
                        dynamic=True)
        results = model.fit()

        # NOTE(review): 360/390 are hard-coded positions; presumably they
        # match the tail of the data set in use - confirm against `df`.
        df3['predicted_test'] = results.predict(start=360,
                                                end=390,
                                                dynamic=True)

        # Out-of-sample forecast, one step per held-out row.
        seasonal_forecast = pd.DataFrame(results.forecast(len(test)))
        seasonal_forecast = seasonal_forecast.rename(
            {0: 'Seasonal forecast for 30 periods'}, axis=1)

        plt.figure(figsize=(16, 8))
        seasonal_forecast.plot(figsize=(25, 10), color='green')
        df3['Nitrogen_dioxide'].plot(figsize=(20, 10))
        df3['predicted_test'].plot(figsize=(20, 10))
        plt.legend(fontsize=19)
        plt.ylabel("Nitrogen_dioxide", fontsize=20)
        plt.xlabel('Date', fontsize=20)
        plt.title("Seasonal Arima", fontsize=30)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)

        # Calculate KPI
        mae = metrics.mean_absolute_error(df3.Nitrogen_dioxide[360:],
                                          df3.predicted_test[360:])

        st.write("MAE of Seasonal Arima is  {:.2f}".format(mae))

    if add_selectbox == 'ML Model Comparison Table':

        # Static, pre-computed scores; nothing is trained in this branch.
        acc_table = {
            'Model': [
                'Linear Regression', 'Decision Tree', 'Random_forest',
                'Gradient_Boosting'
            ],
            'Train_score': [0.59, 0.70, 0.91, 0.83],
            'Test_score': [0.49, 0.46, 0.40, 0.50],
            'MAE_train': [4265.36, 3713.86, 2220.76, 2958.29],
            'MAE_test': [3053.96, 3116.57, 3161.29, 2726.36]
        }
        acc_table = pd.DataFrame(acc_table)
        acc_table = acc_table.sort_values(
            by='Test_score', ascending=False).reset_index(drop=True)
        st.table(acc_table)

    # Gradient Boosting
    if add_selectbox == 'Gradient Boosting Regressor':

        df55 = df.Nitrogen_dioxide.copy()
        df55 = pd.DataFrame(df55)
        # Build lag features t-7 ... t-1 next to the current value t.
        dfML = pd.DataFrame()
        for i in range(7, 0, -1):
            dfML[['t-' + str(i)]] = df55.shift(i)

        dfML['t'] = df55.values
        # The first 7 rows contain NaN lags produced by shift().
        df_ML = dfML[7:]
        # Split Data into dependent(target) and independent(features) variables

        df_ML22 = df_ML.values
        # Lagged variables (features) and original time series data (target)
        X2 = df_ML22[:, 0:-1]  # every column except the last one
        y2 = df_ML22[:, -1]  # last column only, i.e. the 't' column

        # Chronological 70/30 split (no shuffling).
        traintarget_size = int(len(y2) * 0.70)
        train_target, test_target = y2[:traintarget_size], y2[
            traintarget_size:len(y2)]
        trainfeature_size = int(len(X2) * 0.70)
        train_feature, test_feature = X2[:trainfeature_size], X2[
            trainfeature_size:len(X2)]

        gbr = GradientBoostingRegressor(max_features=3,
                                        max_depth=2,
                                        learning_rate=0.1,
                                        n_estimators=100,
                                        subsample=0.8,
                                        random_state=50)

        gbr.fit(train_feature, train_target)

        gbr_train_70_30 = gbr.score(train_feature, train_target)
        gbr_test_70_30 = gbr.score(test_feature, test_target)

        plot_test_pred = gbr.predict(test_feature)
        plot_test_pred = pd.DataFrame(plot_test_pred)
        plot_test_pred = plot_test_pred.rename({0: 'Predicted_test'}, axis=1)

        plot_test_target = pd.DataFrame(test_target)
        plot_test_target = plot_test_target.rename({0: 'Actual_test'}, axis=1)
        gbr_test_plot = pd.concat([plot_test_target, plot_test_pred], axis=1)

        # NOTE(review): the "116 days" in the title and the 106 offset below
        # are hard-coded; presumably they equal the test-split length of the
        # current data set - confirm.
        gbr_test_plot.plot(
            title='Gradient boosting Actual vs Predicted test of last 116 days'
        )
        plt.grid()
        st.pyplot(use_column_width=True)
        st.write("Gradient boosting training score {:.2f}".format(
            round(gbr_train_70_30, 2)))
        st.write("Gradient boosting test score {:.2f}".format(
            round(gbr_test_70_30, 2)))

        if st.checkbox("Visualize for last 10 days"):
            gbr_test_plot[106:].plot(title=' GBR Plot of last 10 days')
            st.pyplot(use_column_width=True)

    if add_selectbox == 'LSTM':

        data = df.copy()
        # NOTE(review): column is selected by position 7 here while every
        # other branch uses the 'Nitrogen_dioxide' label - confirm they match.
        data = data.iloc[:, 7].values
        data = data.reshape(-1, 1)
        data = data.astype('float32')

        # Scaling the data
        scalar = MinMaxScaler()
        data = scalar.fit_transform(data)

        train_lstm = data[:-30, :]
        test_lstm = data[-30:, :]

        # Building the 2D array for supervised learning
        def create_dataset(sequence, time_step):
            """Return (windows, next values) built from a column vector."""
            dataX = []
            dataY = []
            for i in range(len(sequence) - time_step - 1):
                a = sequence[i:(i + time_step), 0]
                dataX.append(a)
                dataY.append(sequence[i + time_step, 0])
            return np.array(dataX), np.array(dataY)

        time_step = 1
        # Apply the 2D array function to train and test datasets
        train_X, train_Y = create_dataset(train_lstm, time_step)
        test_X, test_Y = create_dataset(test_lstm, time_step)

        # Reshape to (samples, 1, time_step) as declared in input_shape below.
        train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
        test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))

        # Build the LSTM Model
        model = Sequential()
        # Adding the input layer and LSTM layer
        model.add(
            LSTM(50,
                 activation='relu',
                 input_shape=(1, time_step),
                 return_sequences=True))
        model.add(LSTM(50, return_sequences=True))
        model.add(LSTM(50))
        model.add(Dropout(0.15))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        model.fit(train_X, train_Y, batch_size=4, epochs=50, verbose=2)

        # Make predictions

        train_predict = model.predict(train_X)
        test_predict = model.predict(test_X)
        # inverting predictions
        train_predict = scalar.inverse_transform(train_predict)
        train_Y = scalar.inverse_transform([train_Y])
        test_predict = scalar.inverse_transform(test_predict)
        test_Y = scalar.inverse_transform([test_Y])
        # calculate mean absolute error
        train_score = mean_absolute_error(train_Y[0], train_predict[:, 0])

        test_score = mean_absolute_error(test_Y[0], test_predict[:, 0])

        # LSTM plot
        # NaN-filled arrays shaped like `data`, so that the predictions line
        # up with the positions they were made for.
        train_plot = np.empty_like(data)
        train_plot[:, :] = np.nan
        train_plot[time_step:len(train_predict) + time_step, :] = train_predict
        # shifting test predictions for plotting
        test_plot = np.empty_like(data)
        test_plot[:, :] = np.nan
        test_plot[len(train_predict) + (time_step * 2) + 1:len(data) -
                  1, :] = test_predict
        # plot baseline and predictions
        plt.figure(figsize=(16, 8))

        plt.plot(train_plot)
        plt.plot(test_plot, color='green')
        plt.plot(scalar.inverse_transform(data), color='orange')
        plt.title(
            "Long Short Term Memory Network with train ,test and forecast",
            fontsize=20)
        plt.ylabel("Nitrogen_dioxide", fontsize=20)
        # NOTE(review): lines are drawn as train predictions, test
        # predictions, actual series; confirm the label order is intended.
        plt.legend(labels=['Train plot', 'Test set', 'LSTM forecast'],
                   fontsize=19)
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        plt.grid()

        plt.show()
        st.pyplot(use_column_width=True)

        st.write('Train Score: %.3f MAE' % (train_score))
        st.write('Test Score: %.3f MAE' % (test_score))

        if st.checkbox('Visualize forecasted chart for 10 future days'):
            # Re-scale the predictions with the scaler fitted on the series.
            # Refitting here (fit_transform) would overwrite that fit and
            # break every inverse_transform of `data` used for the plot below.
            test_predict = scalar.transform(test_predict)
            time_step = 10
            x_input = test_predict[(len(test_predict) - time_step):].reshape(
                1, -1)
            # Converting it to list
            temp_input = list(x_input)
            # Arranging list vertically
            temp_input = temp_input[0].tolist()

            # demonstrate prediction for next 10 days

            lst_output = []
            future_day = 10
            n_steps = 10
            i = 0
            # Forecast next 10 days output
            # NOTE(review): the network was declared with input_shape
            # (1, 1); confirm it accepts the (1, n_steps, 1) input used in
            # the first branch below.
            while (i < future_day):

                if (len(temp_input) > 10):
                    # Slide the window: drop the oldest value, predict,
                    # append the prediction.
                    x_input = np.array(temp_input[1:])
                    print("{} day input {}".format(i, x_input))
                    x_input = x_input.reshape(1, -1)
                    # Converting to 3d array for lstm
                    x_input = x_input.reshape(1, n_steps, 1)
                    # print(x_input)
                    ypred = model.predict(x_input, verbose=0)
                    print("{} day predicted output {}".format(i, ypred))
                    # adding predicted output  to temp_input list
                    temp_input.extend(ypred[0].tolist())
                    temp_input = temp_input[1:]

                    # print(temp_input)
                    lst_output.extend(ypred.tolist())
                    i = i + 1
                else:
                    # First pass: the window holds exactly n_steps values and
                    # each one is fed as its own single-step sample.
                    x_input = x_input.reshape((n_steps, 1, 1))
                    ypred = model.predict(x_input, verbose=0)
                    print("Predicted y of 0 day", ypred[0])
                    # Adding ypred value in temp_input(previous input)
                    temp_input.extend(ypred[0].tolist())
                    print(len(temp_input))
                    lst_output.extend(ypred.tolist())
                    i = i + 1
                # print(lst_output)

            previous_days1 = np.arange(len(data) - n_steps, len(data))
            predicted_future1 = np.arange(len(data), len(data) + future_day)
            # More than future_day values may have been collected above.
            lst_output = lst_output[:future_day]
            outputlist = data.tolist()
            outputlist.extend(lst_output)
            #data[len(data) - n_steps:]

            plt.plot(
                np.append(previous_days1, predicted_future1),
                scalar.inverse_transform(outputlist[len(data) - n_steps:]))
            plt.plot(predicted_future1, scalar.inverse_transform(lst_output))
            plt.title("Forecast for 10 future days", fontsize=20)
            plt.legend(fontsize=19)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=8)
            plt.ylabel("Nitrogen dioxide")
            plt.show()
            st.pyplot(use_column_width=True)
예제 #2
0
# In[157]:

# n_input is used both as the generator window length and as the first
# dimension of the LSTM input shape; n_features is the second dimension.
n_features = 1
n_input = 30
generator = TimeseriesGenerator(
    scaled_train,  # data
    scaled_train,  # targets: the same array as the data
    batch_size=30,
    length=n_input,
)

# In[158]:

# Same three-layer stack, declared in one go instead of repeated add() calls.
model = Sequential([
    LSTM(100, input_shape=(n_input, n_features), activation='relu'),
    Dense(50),
    Dense(1),
])
model.compile(loss='mse', optimizer='adam')

# In[159]:

model.summary()

# In[160]:

model.fit_generator(generator, epochs=30)

# In[161]:

# Notebook cell output: the metric names recorded during training.
model.history.history.keys()

# In[162]:
예제 #3
0
def arima():
    """Forecast case counts per region from SARIMAX-extrapolated features.

    Workflow, as implemented below:

    1. Load the combined mobility/infection CSV and, per region, shift
       ``num_cases`` by 14 rows, drop rows whose shifted count is not
       positive and stack the regions into one frame.
    2. For each of 3 sliding windows (stride 10 rows) and each of the first
       3 regions, take 60 rows for training and the next 30 for testing,
       and extrapolate every feature column with its own SARIMAX model.
    3. Train a small dense network on the real training features and
       predict from the SARIMAX-extrapolated features, retrying up to 4
       times when too many predictions are exactly zero.
    4. Print and plot the absolute percentage errors.

    Takes no arguments and returns None. Reads ``showPlot`` and
    ``visualize_ARIMA`` from the enclosing module scope.
    """
    failedMonths = 0 #Records if any months could not be successfully trained on (pred is zero)

    full_df=pd.read_csv('../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_mode.csv', infer_datetime_format=True, parse_dates=True)
    full_df['originalCases'] = full_df['num_cases'] #preserve original case values as additional feature

    by_state=full_df['sub_region_1'].unique()
    

    #shift all states data by offset and concatenate in order to prevent bleeding into other states' numbers
    offset = 14
    # Collect the per-region frames and concatenate once: DataFrame.append
    # no longer exists in pandas 2.x and copied the whole frame on each call.
    region_frames = []
    for region in by_state:
        temp=full_df.loc[(full_df['sub_region_1']==region)]
        temp=temp.loc[(temp['date']<'2020-11-20')]
        #Shift CDC data by offset value
        cdc_dataframe=temp['num_cases'].shift(periods=offset,fill_value=0)
        mobility_dataframe=temp.drop(columns=['date', 'num_cases'])
        all_states=pd.concat([cdc_dataframe, mobility_dataframe],axis=1)
        all_states=all_states.loc[(all_states['num_cases']>0)] #remove rows with zero cases
        region_frames.append(all_states)
    # pd.concat rejects an empty list, so fall back to an empty frame.
    full_dataframe = pd.concat(region_frames) if region_frames else pd.DataFrame()

    #Build new full data array
    #mobility_dataframe_truc = mobility_dataframe.drop(columns=['date'])
    #full_dataframe = pd.concat([cdc_dataframe_truc, mobility_dataframe_truc], axis=1)
    #full_dataframe['originalCases'] = cdc_dataframe['newAndPnew'] #preserve original case values as additional feature
    #full_dataframe_noDate = full_dataframe.drop(columns=['submission_date'])
    #full_dataframe_noDate = full_dataframe_noDate.loc[(full_dataframe_noDate['newAndPnew']!=0)] #remove rows with zero cases

    #Find length of shortest state dataframe
    # Only needed by the disabled full-range loop bound further down.
    minLength = np.inf
    for region in by_state:
        state_data=full_dataframe.loc[(full_dataframe['sub_region_1']==region)]
        length = state_data.shape[0]
        if length < minLength:
            minLength = length

    # One (order, seasonal_order) pair per feature column, indexed by the
    # feature position; constant, so defined once outside the loops.
    arimaOrders = [(1,0,0),(1,0,1),(3,0,0),(1,0,0),(0,1,1),(1,0,0),(2,0,0)]
    seasonalOrders = [(2, 1, 0, 7), (2, 1, 0, 7), (1, 1, 0, 7), (1, 1, 0, 7),(0,1,1,7),(0,1,1,7),(2, 1, 0, 7)]

    stride = 10 #trains a new model every {stride} days
    percentErrors = []
    for t in range(3):#(minLength-90)//stride):
        #Linear Mobility Data
        linearTrainX = []
        linearTrainy = []
        linearTestX = []
        linearTesty = []

        #Logarithmic Mobility Data
        # Collected per state but not consumed anywhere later in this function.
        logTrainX = []
        logTrainy = []
        logTestX = []
        logTesty = []

        MLPTrainX = []

        for region in by_state[:3]:
            state_data=full_dataframe.loc[(full_dataframe['sub_region_1']==region)].drop(columns=['sub_region_1', 'grocery_and_pharmacy_percent_change_from_baseline'])
            #Convert data to numpy
            linearData = state_data.to_numpy()
            # Offset by the global minimum so the logarithm's argument is >= 1.
            logData = np.log(state_data+1-np.min(state_data.to_numpy())).to_numpy()

            # 1-based time axes used only for plotting.
            timeTrain = np.arange(1,61).reshape(-1, 1)
            timeTest = np.arange(61,91).reshape(-1, 1)
        
            #Linear Mobility Data
            # Column 0 is the target; the remaining columns are features.
            linearTrainX.append(linearData[t*stride:t*stride+60,1:])
            linearTrainy.append(linearData[t*stride:t*stride+60,:1])
            linearTestX.append(linearData[t*stride+60:t*stride+90,1:])
            linearTesty.append(linearData[t*stride+60:t*stride+90,:1])

            #Logarithmic Mobility Data
            logTrainX.append(logData[t*stride:t*stride+60,1:])
            logTrainy.append(logData[t*stride:t*stride+60,:1])
            logTestX.append(logData[t*stride+60:t*stride+90,1:])
            logTesty.append(logData[t*stride+60:t*stride+90,:1])

            
            MLPTrainXState = []
            for i,feature in enumerate(linearData[t*stride:t*stride+60,1:].T):
                #print("Feature:", i)
                #fit ARIMA
                #Perform grid search to determine ARIMA Order
                #stepwise_fit = auto_arima(feature, start_p = 1, start_q = 1, 
                #                max_p = 3, max_q = 3, m = 7, 
                #                start_P = 0, seasonal = True, 
                #                d = None, D = 1, trace = True, 
                #                error_action ='ignore',   # we don't want to know if an order does not work 
                #                suppress_warnings = True,  # we don't want convergence warnings 
                #                stepwise = True)           # set to stepwise 
                #stepwise_fit.summary() 
                #print("===============================================================================================")
                
                predictArima =[]

                model = SARIMAX(feature,  
                        order = arimaOrders[i],  
                        seasonal_order =seasonalOrders[i],
                        initialization='approximate_diffuse') 
        
                result = model.fit(disp=False) 
                if showPlot >=2 :
                    # linearTrainX / linearTestX are still Python lists at
                    # this point, so index the current state's array (the
                    # last one appended) before selecting feature column i.
                    visualize_ARIMA(result, timeTrain, linearTrainX[-1][:,i], timeTest, linearTestX[-1][:,i])

                # NOTE(review): `feature` holds 60 samples; confirm that
                # positions 61..90 (rather than 60..89) are the intended
                # forecast window.
                predictArima.append(result.predict(61, 90, typ = 'levels'))
                predictArima = np.mean(predictArima, axis=0)
                MLPTrainXState.append(predictArima)
            MLPTrainX.append(np.array(MLPTrainXState).T)
        # Stack the per-state blocks into 2-D arrays with 6 feature columns.
        MLPTrainX = np.array(MLPTrainX).reshape(-1,6)
        linearTrainX = np.array(linearTrainX).reshape(-1,6)
        linearTrainy = np.array(linearTrainy).reshape(-1,1)
        linearTesty = np.array(linearTesty).reshape(-1,1)

        #Use "Last known case value" as bias
        #(I completely made this up but it improved accuracy by ~5%)
        #bias1 = np.ones((30,1))#*linearTrainy[0]
        #bias2 = np.ones((30,1))#*linearTrainy[30]
        # As written, the bias is a constant column of ones appended as a
        # 7th input (the scaling by case values is disabled).
        bias = np.ones((linearTrainX.shape[0],1))#np.vstack((bias1, bias2))
        linearTrainX = np.hstack((linearTrainX, bias))

        bias3 = np.ones((MLPTrainX.shape[0],1))#*linearTrainy[-1]
        MLPTrainX = np.hstack((MLPTrainX, bias3))
        
        failCounter = 0
        maxFail = 4
        while failCounter < maxFail: #Retrain if prediction is zero
            model = Sequential()
            #model.add(BatchNormalization())
            model.add(Dense(10, input_dim=7, activation='relu'))
            #model.add(Dropout(0.15))
            model.add(Dense(30, activation='relu'))
            #model.add(Dropout(0.15))
            model.add(Dense(1, activation='relu'))

            model.compile(optimizer='adam',loss='mean_squared_error', metrics=['accuracy'])
            model.fit(linearTrainX, linearTrainy, epochs=100, verbose=0)

            # Train on the real features, predict from the extrapolated ones.
            y_pred = model.predict(MLPTrainX)
            # Accept the model when fewer than 10% of predictions are exactly 0.
            if np.sum(y_pred==0) < 0.1 * MLPTrainX.shape[0]:
                break
            print("Prediction is zero. Retraining...")
            failCounter += 1
            if failCounter == maxFail:
                failedMonths += 1
                # Sentinel that also forces the diagnostic plot below.
                percentError = 1
                print("Could not train model on this data")
        if failCounter != maxFail:
            error = y_pred-linearTesty
            percentError = np.abs(error/linearTesty).T
            percentErrorsByState = []
            print(percentError.shape)
            # NOTE(review): percentError is transposed to a single row, the
            # slices run along its first axis, and reshape(51) hard-codes the
            # region count while only by_state[:3] is used above - confirm
            # this per-state split does what is intended.
            for i in range(len(by_state)):
                percentErrorsByState.append(percentError[i*30:(i+1)*30])
            percentErrorsByState = np.array(percentErrorsByState).reshape(51)
            print("Loss:", np.mean(percentError))
            #print("Percent Error:",percentError)
            percentErrors.append(percentErrorsByState)

        if showPlot >= 1 or np.mean(percentError) > 0.4:
            # Only the first state's window (first 60 / 30 rows) is drawn.
            plt.plot(timeTrain, linearTrainy[0:60], label="Past")
            plt.plot(timeTest, linearTesty[0:30], label="True Future")
            plt.plot(timeTest, y_pred[0:30], label="Predicted Future")
            plt.plot(timeTest, MLPTrainX[0:30,-2], label="Predicted ARIMA (case only)")
            plt.legend()
            plt.show()
    print(np.array(percentErrors).shape)
    print("Failed Months:", failedMonths)
    print(np.mean(percentErrors, axis=1))
    plt.plot(np.mean(percentErrors, axis=1).flatten())
    plt.show()
    return