Example #1
# If we are not starting a new recalibration but re-starting an old one, we import the
# existing files and print metrics
if not new_recalibration:
    # Import the existing forecast file
    forecast = pd.read_csv(forecast_file_path, index_col=0)
    forecast.index = pd.to_datetime(forecast.index)

    # Finding the dates still to be forecast by checking for NaN values
    forecast_dates = forecast[forecast.isna().any(axis=1)].index

    # If all the dates to be forecast have already been processed, we print the final
    # metrics; the recalibration loop then has no dates left to iterate over
    if len(forecast_dates) == 0:

        mae = np.mean(MAE(forecast.values.squeeze(), real_values.values))
        smape = np.mean(sMAPE(forecast.values.squeeze(),
                              real_values.values)) * 100
        print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(
            'Final metrics', smape, mae))

else:
    forecast_dates = forecast.index

model = DNN(experiment_id=experiment_id,
            path_hyperparameter_folder=path_hyperparameter_folder,
            nlayers=nlayers,
            dataset=dataset,
            years_test=years_test,
            shuffle_train=shuffle_train,
            data_augmentation=data_augmentation,
            calibration_window=calibration_window)
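
# A hedged aside (not part of the original snippet): one simple way for a caller to
# set the `new_recalibration` flag is to check whether the forecast file already
# exists. `forecast_file_path` is assumed to be defined as in the full function.
import os
new_recalibration = not os.path.isfile(forecast_file_path)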
Example #2
def evaluate_dnn_in_test_dataset(
        experiment_id,
        path_datasets_folder=os.path.join('.', 'datasets'),
        path_hyperparameter_folder=os.path.join('.', 'experimental_files'),
        path_recalibration_folder=os.path.join('.', 'experimental_files'),
        nlayers=2,
        dataset='PJM',
        years_test=2,
        shuffle_train=True,
        data_augmentation=0,
        calibration_window=4,
        new_recalibration=False,
        begin_test_date=None,
        end_test_date=None):
    """Function for easy evaluation of the DNN model in a test dataset using daily recalibration. 
    
    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a DNN model considering daily recalibration
    and an optimal set of hyperparameters. 
    
    Note that before using this function, a hyperparameter optimization run must be done using the
    :class:`hyperparameter_optimizer` function. Moreover, the hyperparameter optimization must be done
    using the same parameters: ``nlayers``, ``dataset``, ``shuffle_train``, 
    ``data_augmentation``, ``calibration_window``, and either the same ``years_test`` or the same
    ``begin_test_date``/``end_test_date``.
    
    An example on how to use this function is provided :ref:`here<dnnex2>`.

    Parameters
    ----------
    experiment_id : str
        Unique identifier to read the trials file. In particular, every hyperparameter optimization
        run has a unique identifier associated with it. See :class:`hyperparameter_optimizer` for
        further details
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet, the path where the datasets 
        are to be stored
    path_hyperparameter_folder : str, optional
        Path of the folder containing the trials file with the optimal hyperparameters
    path_recalibration_folder : str, optional
        Path to save the forecast of the test dataset
    nlayers : int, optional
        Number of hidden layers in the neural network
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, 
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically downloaded. If the name
        is different, a dataset in csv format should be placed in the ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if 
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``begin_test_date`` should either be a string with the 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``end_test_date`` should either be a string with the 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object       
    shuffle_train : bool, optional
        Boolean that selects whether the validation and training datasets were shuffled when
        performing the hyperparameter optimization. Note that it does not control whether
        shuffling is used for recalibration: during recalibration, the validation and
        training datasets are always shuffled.
    data_augmentation : bool, optional
        Boolean that selects whether a data augmentation technique for electricity price forecasting
        is employed
    calibration_window : int, optional
        Number of days used in the training/validation dataset for recalibration
    new_recalibration : bool, optional
        Boolean that selects whether a new recalibration is performed or the function re-starts an old one.
        To restart an old one, the .csv file with the forecast must exist in the 
        ``path_recalibration_folder`` folder 
    
    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to the folder ``path_recalibration_folder``
    """

    # Checking if the provided directory for recalibration exists and, if not, creating it
    if not os.path.exists(path_recalibration_folder):
        os.makedirs(path_recalibration_folder)

    # Defining training and testing data
    df_train, df_test = read_data(dataset=dataset,
                                  years_test=years_test,
                                  path=path_datasets_folder,
                                  begin_test_date=begin_test_date,
                                  end_test_date=end_test_date)

    # Defining unique name to save the forecast
    forecast_file_name = 'DNN_forecast_nl' + str(nlayers) + '_dat' + str(dataset) + \
                         '_YT' + str(years_test) + '_SFH' + str(shuffle_train) + \
                         '_DA' * data_augmentation + '_CW' + str(calibration_window) + \
                         '_' + str(experiment_id) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder,
                                      forecast_file_name)

    # Defining empty forecast array and the real values to be predicted in a more friendly format
    forecast = pd.DataFrame(index=df_test.index[::24],
                            columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values,
                               index=forecast.index,
                               columns=forecast.columns)

    # If we are not starting a new recalibration but re-starting an old one, we import the
    # existing files and print metrics
    if not new_recalibration:
        # Import the existing forecast file
        forecast = pd.read_csv(forecast_file_path, index_col=0)
        forecast.index = pd.to_datetime(forecast.index)

        # Finding the dates still to be forecast by checking for NaN values
        forecast_dates = forecast[forecast.isna().any(axis=1)].index

        # If all the dates to be forecast have already been processed, we print the final
        # metrics; the recalibration loop below then has no dates left to iterate over
        if len(forecast_dates) == 0:

            mae = np.mean(MAE(forecast.values.squeeze(), real_values.values))
            smape = np.mean(
                sMAPE(forecast.values.squeeze(), real_values.values)) * 100
            print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(
                'Final metrics', smape, mae))

    else:
        forecast_dates = forecast.index

    model = DNN(experiment_id=experiment_id,
                path_hyperparameter_folder=path_hyperparameter_folder,
                nlayers=nlayers,
                dataset=dataset,
                years_test=years_test,
                shuffle_train=shuffle_train,
                data_augmentation=data_augmentation,
                calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known
        data_available = pd.concat(
            [df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data
        data_available.loc[date:date + pd.Timedelta(hours=23),
                           'Price'] = np.NaN

        # Recalibrating the model with the most up-to-date available data and making a prediction
        # for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available,
                                                     next_day_date=date)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(
            MAE(forecast.loc[:date].values.squeeze(),
                real_values.loc[:date].values))
        smape = np.mean(
            sMAPE(forecast.loc[:date].values.squeeze(),
                  real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(
            str(date)[:10], smape, mae))

        # Saving forecast
        forecast.to_csv(forecast_file_path)

    return forecast
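
# A minimal usage sketch. Hedged: it assumes that a trials file produced beforehand by
# the hyperparameter_optimizer function with experiment_id '1' already exists in
# ./experimental_files; the identifier and settings below are illustrative.
predictions = evaluate_dnn_in_test_dataset(experiment_id='1',
                                           dataset='PJM',
                                           years_test=2,
                                           nlayers=2,
                                           shuffle_train=True,
                                           data_augmentation=0,
                                           calibration_window=4,
                                           new_recalibration=True)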
Example #3
def _hyperopt_objective(hyperparameters, trials, trials_file_path, max_evals, nlayers, dfTrain, dfTest, 
                        shuffle_train, dataset, data_augmentation, 
                        calibration_window, n_exogenous_inputs):
    """Function that defines the hyperparameter optimization objective/loss
    
    This function receives as input a set of hyperparameters, trains a DNN using them,
    and returns the performance of the DNN for the selected hyperparameters in a validation
    dataset

    Parameters
    ----------
    hyperparameters : dict
        A dictionary provided by hyperopt indicating whether each hyperparameter/feature is selected
    trials : hyperopt.Trials
        The trials object that stores the hyperparameter optimization runs
    trials_file_path : str
        The path to store the trials object
    max_evals : int
        Maximum number of iterations for hyperparameter optimization
    nlayers : int
        Number of layers in the DNN model
    dfTrain : pandas.DataFrame
        Dataframe containing the training data
    dfTest : pandas.DataFrame
        Dataframe containing the testing data
    shuffle_train : bool
        Boolean that selects whether the training and validation datasets are shuffled
    dataset : str
        Name of the dataset/market under study
    data_augmentation : bool
        Boolean that selects whether the data augmentation technique for electricity
        price forecasting is employed
    calibration_window : int
        Number of days used in the training dataset. If 0 is given, the calibration
        window is itself included as a hyperparameter to optimize
    n_exogenous_inputs : int
        Number of exogenous inputs in the dataset
    
    Returns
    -------
    dict
        A dictionary summarizing the result of the hyperparameter run
    """

    # Re-defining the training dataset based on the calibration window. The calibration window
    # can be given as an external parameter. If the value 0 is given, the calibration window
    # is included as a hyperparameter to optimize
    dfTrain_cw = dfTrain.loc[dfTrain.index[-1] - pd.Timedelta(weeks=52) * calibration_window +
                             pd.Timedelta(hours=1):]

    # Saving hyperoptimization state and printing message
    pc.dump(trials, open(trials_file_path, "wb"))
    if trials.losses()[0] is not None:

        MAEVal = trials.best_trial['result']['MAE Val']
        MAETest = trials.best_trial['result']['MAE Test']

        sMAPEVal = trials.best_trial['result']['sMAPE Val']
        sMAPETest = trials.best_trial['result']['sMAPE Test']
        
        print('\n\nTested {}/{} iterations.'.format(len(trials.losses()) - 1,
              max_evals))

        print('Best MAE - Validation Dataset')            
        print("  MAE: {:.1f} | sMAPE: {:.2f} %".format(MAEVal, sMAPEVal))
        print('\nBest MAE - Test Dataset')
        print("  MAE: {:.1f} | sMAPE: {:.2f} %".format(MAETest, sMAPETest))

    # Defining X,Y datasets
    Xtrain, Ytrain, Xval, Yval, Xtest, Ytest, indexTest = \
        _build_and_split_XYs(dfTrain=dfTrain_cw, dfTest=dfTest, features=hyperparameters,
                             shuffle_train=shuffle_train, hyperoptimization=True,
                             data_augmentation=data_augmentation, n_exogenous_inputs=n_exogenous_inputs)
    
    # If required, datasets are scaled
    if hyperparameters['scaleX'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        [Xtrain, Xval, Xtest], _ = scaling([Xtrain, Xval, Xtest], hyperparameters['scaleX'])

    if hyperparameters['scaleY'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        [Ytrain, Yval], scaler = scaling([Ytrain, Yval], hyperparameters['scaleY'])
    else:
        scaler = None

    neurons = [int(hyperparameters['neurons' + str(k)]) for k in range(1, nlayers + 1)
               if int(hyperparameters['neurons' + str(k)]) >= 50]
        
    np.random.seed(int(hyperparameters['seed']))

    # Initialize model
    forecaster = DNNModel(neurons=neurons, n_features=Xtrain.shape[-1],
                          dropout=hyperparameters['dropout'],
                          batch_normalization=hyperparameters['batch_normalization'],
                          lr=hyperparameters['lr'], verbose=False,
                          optimizer='adam', activation=hyperparameters['activation'],
                          epochs_early_stopping=20, scaler=scaler, loss='mae',
                          regularization=hyperparameters['reg']['val'],
                          lambda_reg=hyperparameters['reg']['lambda'],
                          initializer=hyperparameters['init'])

    forecaster.fit(Xtrain, Ytrain, Xval, Yval)

    Yp = forecaster.predict(Xval).squeeze()
    if hyperparameters['scaleY'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        Yval = scaler.inverse_transform(Yval)
        Yp = scaler.inverse_transform(Yp)

    mae_validation = np.mean(MAE(Yval, Yp))
    smape_validation = np.mean(sMAPE(Yval, Yp)) * 100

    # Predicting the test dataset and, if the prices were scaled, inverting the scaling
    Yp = forecaster.predict(Xtest).squeeze()
    if hyperparameters['scaleY'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        Yp = scaler.inverse_transform(Yp).squeeze()

    maeTest = np.mean(MAE(Ytest, Yp)) 
    smape_test = np.mean(sMAPE(Ytest, Yp)) * 100

    # The test dataset is returned for directly evaluating the models without recalibration
    # while performing hyperopt. However, the hyperparameter search is performed using a validation
    # dataset
    return_values = {'loss': mae_validation, 'MAE Val': mae_validation, 'MAE Test': maeTest,
                     'sMAPE Val': smape_validation, 'sMAPE Test': smape_test, 
                     'status': STATUS_OK}
                          
    return return_values
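
# A hedged sketch of how this objective can be wired into hyperopt's fmin. It assumes
# that `space` (the hyperopt search space), `dfTrain`, `dfTest`, and `trials_file_path`
# are defined elsewhere; the remaining argument values are illustrative, not the
# library's actual defaults.
from functools import partial

from hyperopt import Trials, fmin, tpe

trials = Trials()
objective = partial(_hyperopt_objective, trials=trials, trials_file_path=trials_file_path,
                    max_evals=1500, nlayers=2, dfTrain=dfTrain, dfTest=dfTest,
                    shuffle_train=True, dataset='PJM', data_augmentation=0,
                    calibration_window=4, n_exogenous_inputs=2)
fmin(objective, space=space, algo=tpe.suggest, max_evals=1500, trials=trials)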
Example #4
    # For simulation purposes, we assume that the available data is
    # the data up to current date where the prices of current date are not known
    data_available = pd.concat(
        [df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]], axis=0)

    # We set the real prices for current date to NaN in the dataframe of available data
    data_available.loc[date:date + pd.Timedelta(hours=23), 'Price'] = np.NaN

    # Recalibrating the model with the most up-to-date available data and making a prediction
    # for the next day
    Yp = model.recalibrate_and_forecast_next_day(
        df=data_available,
        next_day_date=date,
        calibration_window=calibration_window)
    # Saving the current prediction
    forecast.loc[date, :] = Yp

    # Computing metrics up-to-current-date
    mae = np.mean(
        MAE(forecast.loc[:date].values.squeeze(),
            real_values.loc[:date].values))
    smape = np.mean(
        sMAPE(forecast.loc[:date].values.squeeze(),
              real_values.loc[:date].values)) * 100

    # Printing information
    print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(
        str(date)[:10], smape, mae))

    # Saving forecast
    forecast.to_csv(forecast_file_path)
Example #5
def evaluate_lear_in_test_dataset(path_datasets_folder=os.path.join('.', 'datasets'), 
                                  path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                  dataset='PJM', years_test=2, calibration_window=364 * 3, 
                                  begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the LEAR model in a test dataset using daily recalibration. 
    
    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a LEAR model considering daily recalibration. 
    
    An example on how to use this function is provided :ref:`here<learex1>`.   

    Parameters
    ----------
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet,
        the path where the datasets are to be stored.
    
    path_recalibration_folder : str, optional
        Path to save the files of the experiment dataset.
    
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, 
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically downloaded. If the name
        is different, a dataset in csv format should be placed in the ``path_datasets_folder``.

    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if 
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    
    calibration_window : int, optional
        Number of days used in the training dataset for recalibration.
    
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``begin_test_date`` should either be a string with the following 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``end_test_date`` should either be a string with the following 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.       
    
    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to the folder ``path_recalibration_folder``.
    """

    # Checking if the provided directory for recalibration exists and, if not, creating it
    if not os.path.exists(path_recalibration_folder):
        os.makedirs(path_recalibration_folder)

    # Defining training and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test, path=path_datasets_folder,
                                  begin_test_date=begin_test_date, end_test_date=end_test_date)

    # Defining unique name to save the forecast
    forecast_file_name = 'LEAR_forecast' + '_dat' + str(dataset) + '_YT' + str(years_test) + \
                         '_CW' + str(calibration_window) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)


    # Defining empty forecast array and the real values to be predicted in a more friendly format
    forecast = pd.DataFrame(index=df_test.index[::24], columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    forecast_dates = forecast.index

    model = LEAR(calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known
        data_available = pd.concat([df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data
        data_available.loc[date:date + pd.Timedelta(hours=23), 'Price'] = np.NaN

        # Recalibrating the model with the most up-to-date available data and making a prediction
        # for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date, 
                                                     calibration_window=calibration_window)
        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(MAE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values)) 
        smape = np.mean(sMAPE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving forecast
        forecast.to_csv(forecast_file_path)

    return forecast
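
# A minimal usage sketch (hedged: the market and test dates below are illustrative):
predictions = evaluate_lear_in_test_dataset(dataset='PJM',
                                            calibration_window=364 * 3,
                                            begin_test_date='01/01/2016 00:00',
                                            end_test_date='01/02/2016 23:00')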
Example #6
# Building the same datasets with shape (ndays, n_prices/day) instead 
# of shape (nprices, 1) and display
fc_DNN_ensemble_2D = pd.DataFrame(fc_DNN_ensemble.values.reshape(-1, 24), 
                                  index=fc_DNN_ensemble.index[::24], 
                                  columns=['h' + str(hour) for hour in range(24)])
real_price_2D = pd.DataFrame(real_price.values.reshape(-1, 24), 
                             index=real_price.index[::24], 
                             columns=['h' + str(hour) for hour in range(24)])
fc_DNN_ensemble_2D.head()


# According to the paper, the sMAPE of the DNN ensemble for the NP market is 4.85%.
# Let's test the metric for different conditions

# Evaluating sMAPE when real price and forecasts are both dataframes
sMAPE(p_pred=fc_DNN_ensemble, p_real=real_price) * 100

# Evaluating sMAPE when real price and forecasts are both numpy arrays
sMAPE(p_pred=fc_DNN_ensemble.values, p_real=real_price.values) * 100

# Evaluating sMAPE when input values are of shape (ndays, n_prices/day) instead 
# of shape (nprices, 1)
# Dataframes
sMAPE(p_pred=fc_DNN_ensemble_2D, p_real=real_price_2D) * 100
# Numpy arrays
sMAPE(p_pred=fc_DNN_ensemble_2D.values, p_real=real_price_2D.values) * 100

# Evaluating sMAPE when input values are of shape (nprices,) 
# instead of shape (nprices, 1)
# Pandas Series
sMAPE(p_pred=fc_DNN_ensemble.loc[:, 'DNN Ensemble'],
      p_real=real_price.loc[:, 'Price']) * 100
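
# Hedged: MAE is assumed to accept the same p_real/p_pred keyword interface as sMAPE
# (elsewhere in these examples it is only called positionally). If so, the same
# shape/type combinations should hold for MAE as well, e.g.:
MAE(p_pred=fc_DNN_ensemble.values, p_real=real_price.values)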