예제 #1
0
def predict_ind(data_ind, end_date):
    """Fit a weekly-seasonal SARIMA model and forecast up to ``end_date``.

    The model is SARIMAX(0, 1, 1)x(1, 1, 0, 52) fitted on the ``amount``
    column of ``data_ind``.

    :param data_ind: DataFrame with an ``amount`` column whose index labels
        are ``'%Y-%m-%d'`` strings (the CSV-loaded layout). For a frame whose
        index already holds datetimes (the Excel-loaded layout) the start
        would be ``data_ind.index[-1] + timedelta(days=1)`` instead.
    :param end_date: last date to forecast, passed straight to
        ``get_prediction`` as ``end``.
    :return: tuple ``(forecast_df, conf_int)`` - the point forecasts as a
        one-column DataFrame indexed by the prediction row labels, and the
        confidence intervals computed with ``alpha=0.05``.
    """
    # Build and train the model on the complete history.
    fitted_model = SARIMAX(endog=data_ind['amount'],
                           order=(0, 1, 1),
                           seasonal_order=(1, 1, 0, 52)).fit()

    # The forecast begins the day after the last observed (string) date.
    last_observed_day = datetime.strptime(data_ind.index[-1],
                                          '%Y-%m-%d').date()
    forecast_start = last_observed_day + timedelta(days=1)

    forecast = fitted_model.get_prediction(start=forecast_start,
                                           end=end_date,
                                           dynamic=True,
                                           full_results=True)

    # Point forecasts: first row of the state-space forecasts array.
    point_forecasts = forecast.prediction_results.forecasts[0]
    forecast_df = pd.DataFrame(point_forecasts, index=forecast.row_labels)
    conf_int = forecast.conf_int(alpha=0.05)

    return forecast_df, conf_int
예제 #2
0
def future_forecast(request):
    """Render the 2020 monthly forecast chart for the IPN31152N series.

    Reads ``sales/data/IPN31152N.csv``, relabels its rows with month-end
    dates from 1972-01 through 2019-12, fits a
    SARIMAX(3, 1, 3)x(0, 1, 1, 12) model on the observations up to
    2017-12-31 and predicts the twelve months of 2020.

    :param request: the incoming request object, forwarded to ``render``.
    :return: the response produced by ``render`` for
        ``charts_forecast.html``. The context holds two JSON strings:
        ``data`` (``[timestamp, value]`` pairs built with ``time_unix``) and
        ``result_changes`` (the rounded forecasts followed by their
        differences from the 2019 values).
    """
    df = pd.read_csv('sales/data/IPN31152N.csv', index_col=0)
    # The index assignment raises if the CSV row count does not match the
    # number of month ends in this range.
    df.index = pd.date_range(start='1972-01-01', end='2020-01-01', freq='M')

    train_df = df[df.index <= '2017-12-31']
    model1 = SARIMAX(train_df['IPN31152N'],
                     order=(3, 1, 3),
                     seasonal_order=(0, 1, 1, 12)).fit()
    pred = model1.get_prediction(start='2020-01-31', end='2020-12-31')
    forecast = pred.predicted_mean

    # Chart series: one [timestamp, value] pair per forecast month.
    results = {
        '2020': [[time_unix(stamp), value]
                 for stamp, value in zip(forecast.index, forecast)]
    }

    # Table data: rounded forecasts, then their change versus the same
    # month of 2019. Each number is wrapped in its own one-element list.
    rounded_forecast = [round(value, 2) for value in forecast]
    values_2019 = np.array(df[df.index.year == 2019]['IPN31152N'])
    changes = np.array(rounded_forecast) - values_2019
    result_changes = {
        '2020': [
            [[value] for value in rounded_forecast],
            [[round(change, 2)] for change in changes],
        ]
    }

    context = {
        "data": json.dumps(results),
        "result_changes": json.dumps(result_changes)
    }
    return render(request, 'charts_forecast.html', context=context)
예제 #3
0
def mod_sarima(train,
               test,
               dependent_var_col,
               trend,
               p,
               d,
               q,
               P,
               D,
               Q,
               S,
               is_log,
               outpath,
               name,
               xreg,
               plot_regressors,
               mle_regression=True,
               time_varying_regression=False,
               periodicity='daily'):
    """
    Train a SARIMAX model on ``train``, forecast over ``test`` and report it.

    trend, (p, d, q) and (P, D, Q, S) are the statsmodels SARIMAX arguments.

    Side effects: creates the folder ``outpath + name`` if needed, writes a
    diagnostics PNG, a forecast PNG and a plotly HTML file into it, shows the
    matplotlib figures, prints the error metrics and the model summary, and
    adds a ``Forecast`` column to BOTH ``train`` and ``test`` in place.

    :param train (Pandas Dataframe): train data
    :param test (Pandas Dataframe): test data; its index must be a
        DatetimeIndex (``.date`` is taken from it)
    :param dependent_var_col (str): name of the column of the objective variable
    :param trend (str): Parameter controlling the deterministic trend polynomial A(t)
    :param p (int): Autoregressive parameter
    :param d (int): Differencing parameter
    :param q (int): Moving Average parameter
    :param P (int): Seasonal Autoregressive parameter
    :param D (int): Seasonal Differencing parameter
    :param Q (int): Seasonal Moving Average parameter
    :param S (int): Lags for the seasonal
    :param is_log (bool): pass exactly ``True`` if the series is in logarithm;
        forecasts, actuals and intervals are then exponentiated before the
        metrics and plots are produced
    :param outpath (str): path where the results will be stored; it is
        concatenated directly with ``name``, so it must end with a separator
    :param name (str): name to use when saving the files returned by the model
    :param xreg (list): list of strings with names of columns in the
        test/train datasets to be used as regressors (may be empty)
    :param plot_regressors (bool): pass exactly ``True`` to plot the
        regressors on a secondary axis
    :param mle_regression (bool): forwarded to SARIMAX
    :param time_varying_regression (bool): forwarded to SARIMAX
    :param periodicity (str): when ``'daily'`` (case-insensitive) the plotted
        window is reindexed to a continuous daily range so missing dates
        appear as gaps
    :return: tuple ``(mae_error, rmse_error, mape, name, preds, conf_intervals)``:
        mae_error (float): Mean Absolute Error
        rmse_error (float): Root Mean Squared Error
        mape: result of ``mean_absolute_percentage_error``
        name (str): the ``name`` argument, unchanged
        preds (Pandas Series): forecasts indexed like ``test`` (exponentiated
            when ``is_log`` is True)
        conf_intervals (Pandas Dataframe): forecast intervals on the model
            scale, indexed by the test dates as ``str``
    """
    order_label = '({},{},{})S({},{},{}){}'.format(p, d, q, P, D, Q, S)
    print('Modelling \n', name, ' Forecast - SARIMAX ' + order_label)

    # path definition: create the report folder only when it is missing
    report_dir = str(outpath) + str(name)
    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)
        print('creating output folder in: \n', report_dir)
    report_output_path = report_dir + '/'

    has_regressors = len(xreg) > 0

    # fit the model; stationarity is only relaxed when regressors are used
    model_kwargs = dict(trend=trend,
                        order=(p, d, q),
                        seasonal_order=(P, D, Q, S),
                        time_varying_regression=time_varying_regression,
                        mle_regression=mle_regression)
    if has_regressors:
        model_kwargs.update(exog=train[xreg], enforce_stationarity=False)
    mod = SARIMAX(train[dependent_var_col], **model_kwargs).fit()

    # plot diagnostics: title and save the figure plot_diagnostics returns
    # (previously the title landed on a separate, empty figure)
    diagnostics_fig = mod.plot_diagnostics(figsize=(15, 9), lags=40)
    diagnostics_fig.suptitle('Plot diagnostics for ' + dependent_var_col +
                             ' Forecast - SARIMA ' + order_label)
    diagnostics_fig.savefig(report_output_path + 'diagnostics_' + name +
                            '.png')

    # predict with the model
    # The forecast is requested by integer position (right after the last
    # training row) so that broken time series are supported; the test index
    # is therefore reset on a copy.
    test_aux = test.copy(deep=True)

    # TODO: remove this parameter
    # NOTE(review): the regressors of the forecast horizon are scaled by 0.9
    # on the exp scale (i.e. shifted by log(0.9)); this changes every
    # forecast made with regressors. Kept because callers may rely on it.
    if has_regressors:
        test_aux[xreg] = np.exp(test_aux[xreg])
        test_aux[xreg] = test_aux[xreg] * 0.9
        test_aux[xreg] = np.log(test_aux[xreg])

    test_aux.reset_index(drop=True, inplace=True)

    forecast_start = len(train)
    forecast_end = forecast_start + len(test) - 1
    predict_kwargs = {'exog': test_aux[xreg]} if has_regressors else {}

    # one prediction object provides both the mean and the intervals
    prediction = mod.get_prediction(forecast_start,
                                    end=forecast_end,
                                    **predict_kwargs)
    predictions = prediction.predicted_mean
    # NOTE(review): alpha=0.5 gives a 50% interval; confirm that 0.05 was
    # not intended.
    conf_intervals = prediction.conf_int(alpha=0.5)

    predictions.index = test.index
    conf_intervals.index = test.index
    conf_intervals = pd.DataFrame(conf_intervals)

    # assign the predictions to the dataframes to be used later in the
    # plotting (this mutates the caller's train and test)
    test['Forecast'] = predictions
    train['Forecast'] = mod.fittedvalues

    # add the regressor columns to the frames that will be plotted
    columns = [dependent_var_col, 'Forecast']
    columns.append(xreg)
    columns = list(flatten(columns))
    train_plot = train[columns]
    # Merge while both frames still share test.index; merging after the
    # index is turned into strings depends on pandas converting the strings
    # back to datetimes, which newer versions no longer do.
    test_plot = pd.merge(test[columns],
                         conf_intervals,
                         left_index=True,
                         right_index=True)

    # the returned intervals are indexed by the test dates as strings
    # (no trimming of extreme interval values is applied)
    conf_intervals.index = conf_intervals.index.date
    conf_intervals.index = conf_intervals.index.map(str)

    # compute the metrics on the original scale, transforming the data back
    # from logarithm if the series is in that scale
    res_df = pd.concat([train_plot, test_plot])
    actual = test[dependent_var_col]
    preds = predictions
    if is_log is True:
        res_df['Forecast'] = np.exp(res_df['Forecast'])
        res_df[dependent_var_col] = np.exp(res_df[dependent_var_col])
        actual = np.exp(actual)
        preds = np.exp(predictions)

    mae_error = mean_absolute_error(actual, preds)
    rmse_error = np.sqrt(mean_squared_error(actual, preds))
    mape = mean_absolute_percentage_error(actual, preds)

    # Create a text box for the iteration results
    textstr = 'MAE:' + str(round(mae_error, 0)) + '\n' + 'MAPE:' + str(
        round(mape, 2))

    aux_res_df = res_df.tail(365)  # only plot the last 365 rows
    aux_res_df.index = pd.to_datetime(aux_res_df.index)
    # fill missing calendar days with NaN so gaps are not drawn as lines
    if str(periodicity).lower() == 'daily':
        aux_res_df = aux_res_df.reindex(pd.date_range(aux_res_df.index.min(),
                                                      aux_res_df.index.max()),
                                        fill_value=np.nan)

    # Upper and lower confidence intervals
    lower = aux_res_df['lower ' + str(dependent_var_col)]
    upper = aux_res_df['upper ' + str(dependent_var_col)]
    if is_log is True:
        lower = np.exp(lower)
        upper = np.exp(upper)

    # plot the figure with the prediction
    fig, ax = plt.subplots(figsize=(15, 10))
    plt.subplots_adjust(right=0.85, left=0.05, bottom=0.1)
    ax2 = ax.twinx()
    ax.plot(aux_res_df["Forecast"], color='darkred', label='Forecast')
    ax.plot(aux_res_df[dependent_var_col], color='darkblue', label='Real')
    regressors_plotted = plot_regressors is True and has_regressors
    if regressors_plotted:
        for i in xreg:
            ax2.plot(aux_res_df[i], color='grey', alpha=0.4, label=str(i))
    ax.plot(lower, color='darkgreen', label='Lower', alpha=0.5)
    ax.plot(upper, color='darkgreen', label='Upper', alpha=0.5)
    ax.fill_between(upper.dropna().index,
                    upper.dropna(),
                    lower.dropna(),
                    facecolor='darkgreen',
                    alpha=0.2,
                    interpolate=False)
    # vertical marker at the first forecast date
    ax.axvline(x=pd.to_datetime(test.index.min(), format='%Y-%m-%d'),
               color='grey',
               linestyle='--')
    ax.xaxis.set_major_locator(mticker.MultipleLocator(30))
    plt.gcf().autofmt_xdate()
    # generate a text box
    props = dict(boxstyle='round', facecolor='white')
    # place a text box in upper left in axes coords
    ax.text(0.05,
            0.95,
            textstr,
            transform=ax.transAxes,
            fontsize=14,
            verticalalignment='top',
            bbox=props)

    ax.legend(title='Forecast Legend',
              bbox_to_anchor=(1.05, 1),
              loc='upper left')
    # only draw the regressor legend when something was plotted on ax2
    if regressors_plotted:
        ax2.legend(title='Regressors',
                   bbox_to_anchor=(1.05, 0.7),
                   loc='center left')
    # set the title before saving so it is part of the stored image
    ax.set_title('SARIMAX Forecast of ' + name)
    first_forecast_day = pd.to_datetime(
        test.index.min()).strftime('%Y-%m-%d')
    plt.savefig(report_output_path + 'Forecast_' + name + '_' +
                first_forecast_day + '.png')
    plt.show()

    plt.close('all')

    # plotting the results in plotly
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=res_df.index,
                   y=res_df[dependent_var_col],
                   mode='lines',
                   name='Real'))
    fig.add_trace(
        go.Scatter(x=res_df.index,
                   y=res_df['Forecast'],
                   mode='lines+markers',
                   name='Fitted - Forecasted'))

    # vertical line at the first forecast date
    fig.add_shape(
        dict(type="line",
             x0=test.index.min(),
             y0=res_df[dependent_var_col].min(),
             x1=test.index.min(),
             y1=res_df[dependent_var_col].max(),
             line=dict(color="grey", width=1)))
    fig.update_xaxes(rangeslider_visible=True)
    # the x axis carries the dates and the y axis the dependent variable
    fig.update_layout(title=dependent_var_col + ' Forecast - SARIMA ' +
                      order_label,
                      xaxis_title='Date',
                      yaxis_title=dependent_var_col,
                      font=dict(family="Century gothic",
                                size=18,
                                color="darkgrey"))
    fig.write_html(report_output_path + name + '_forecast_SARIMA.html')

    print('MAE', mae_error)
    print('RMSE', rmse_error)
    print('MAPE', mape)
    print(mod.summary())

    return mae_error, rmse_error, mape, name, preds, conf_intervals
예제 #4
0
            continue

# Fit the final SARIMA model on the series `y`.
# plug in results with lowest AIC score
# NOTE(review): the orders below are hard-coded; presumably they come from
# a parameter search earlier in the script - confirm.
sarima_model = SARIMAX(y, order=(1,1,1), seasonal_order=(0,1,1,12))
# disp=False is passed to fit(); the name is rebound to the fitted results.
sarima_model = sarima_model.fit(disp=False)

# summary table of SARIMA
print("SARIMA summary table:")
# Only the second table of the summary is printed.
print(sarima_model.summary().tables[1])

# show plot diagnostics
sarima_model.plot_diagnostics(figsize=(15,12))
plt.show()

# Show predictions using one-step forecast
# Predictions start at 1998-01-01 with dynamic=False.
pred = sarima_model.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=False)
# Confidence intervals at conf_int()'s default level.
pred_ci = pred.conf_int()

# Plot the observations from 1990 onwards together with the predicted mean.
ax = y['1990':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One step ahead forecast', alpha=0.7)
# Shade the band between the first and second interval columns
# (presumably the lower and upper bounds).
ax.fill_between(pred_ci.index, 
                pred_ci.iloc[:, 0], 
                pred_ci.iloc[:, 1], color='k', alpha=0.2)
ax.set_xlabel('Date')
ax.set_ylabel('CO2 Levels')
plt.legend()
plt.show()

# Show predictions using dynamic forecast
# Same start date as above, but with dynamic=True.
pred_dynamic = sarima_model.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=True, full_results=True)
pred_dynamic_ci = pred_dynamic.conf_int()
# Keep the rows whose `year` is 2017 or later as `test` and report its shape.
test = df.loc[df.year >= 2017]
print(test.shape)

# ARIMA

# %%
# ARIMA parameters search
# d=0 is fixed for the search over train['diff']. The search starts at
# p=1, q=1 and is capped at p<=3, q<=3. (`start_q` was previously
# misspelled `start_1`, which is not an auto_arima parameter.)
results_arima = pm.auto_arima(
    train['diff'], d=0, start_p=1, start_q=1, max_p=3, max_q=3)
print(results_arima.summary())

#%%
# using the ARIMA model
# SARIMAX without a seasonal order is fitted on train['diff'].
# NOTE(review): order (3, 0, 2) is hard-coded; presumably chosen from the
# auto_arima search - confirm.
model_arima = SARIMAX(train['diff'], order=(3, 0, 2)).fit()
# prediction
# start=-50 with dynamic=True; presumably this covers the last 50
# observations of the sample - confirm. Only the predicted mean is kept.
prediction_arima = model_arima.get_prediction(
    start=-50, dynamic=True).predicted_mean
# forecasting
# Predicted mean of a 20-step forecast.
forecast_arima = model_arima.get_forecast(steps=20).predicted_mean

#%%
# model diagnostics
# Mean absolute value of the fitted model's residuals.
arima_residual = model_arima.resid
arima_mae = np.mean(np.abs(arima_residual))
print(arima_mae)

#%%
# pmdarima results
# Diagnostics plots of the model returned by the auto_arima search.
results_arima.plot_diagnostics()
plt.show()

# %%