# Imports assumed by the functions below (hedged: these import paths mirror the epftoolbox
# package structure implied by the code; ``pc`` is pickle. The private helper
# _build_and_split_XYs is defined elsewhere in the package and is not imported here.)
import os
import pickle as pc

import numpy as np
import pandas as pd
from hyperopt import STATUS_OK

from epftoolbox.data import read_data, scaling
from epftoolbox.evaluation import MAE, sMAPE
from epftoolbox.models import DNN, DNNModel, LEAR
def evaluate_dnn_in_test_dataset(experiment_id, path_datasets_folder=os.path.join('.', 'datasets'),
                                 path_hyperparameter_folder=os.path.join('.', 'experimental_files'),
                                 path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                 nlayers=2, dataset='PJM', years_test=2, shuffle_train=True,
                                 data_augmentation=0, calibration_window=4, new_recalibration=False,
                                 begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the DNN model in a test dataset using daily recalibration.

    The test dataset is defined by a market name and the test dates. The function generates the
    test and training datasets, and evaluates a DNN model considering daily recalibration and an
    optimal set of hyperparameters.

    Note that, before using this function, a hyperparameter optimization run must be performed
    using the :class:`hyperparameter_optimizer` function. Moreover, the hyperparameter
    optimization must be done using the same parameters: ``nlayers``, ``dataset``,
    ``shuffle_train``, ``data_augmentation``, ``calibration_window``, and either the same
    ``years_test`` or the same ``begin_test_date``/``end_test_date``.

    An example of how to use this function is provided :ref:`here<dnnex2>`.

    Parameters
    ----------
    experiment_id : str
        Unique identifier to read the trials file. In particular, every hyperparameter
        optimization run has a unique identifier associated with it. See
        :class:`hyperparameter_optimizer` for further details.
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet, the path where the
        datasets are to be stored.
    path_hyperparameter_folder : str, optional
        Path of the folder containing the trials file with the optimal hyperparameters.
    path_recalibration_folder : str, optional
        Path to save the forecast of the test dataset.
    nlayers : int, optional
        Number of hidden layers in the neural network.
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, i.e.
        ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically
        downloaded. If the name is different, a dataset in csv format should be placed in the
        ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if the
        arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using
        the ``years_test`` argument. ``begin_test_date`` should either be a string with the
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using
        the ``years_test`` argument. ``end_test_date`` should either be a string with the
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    shuffle_train : bool, optional
        Boolean that selects whether the validation and training datasets were shuffled when
        performing the hyperparameter optimization. Note that it does not select whether
        shuffling is used for recalibration, as during recalibration the validation and
        training datasets are always shuffled.
    data_augmentation : bool, optional
        Boolean that selects whether a data augmentation technique for electricity price
        forecasting is employed.
    calibration_window : int, optional
        Number of days used in the training/validation dataset for recalibration.
    new_recalibration : bool, optional
        Boolean that selects whether a new recalibration is performed or the function restarts
        an old one. To restart an old one, the .csv file with the forecast must exist in the
        ``path_recalibration_folder`` folder.

    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to the folder ``path_recalibration_folder``.
    """

    # Checking if the provided directory for recalibration exists and, if not, creating it
    if not os.path.exists(path_recalibration_folder):
        os.makedirs(path_recalibration_folder)

    # Defining the training and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test,
                                  path=path_datasets_folder,
                                  begin_test_date=begin_test_date, end_test_date=end_test_date)

    # Defining a unique name to save the forecast
    forecast_file_name = 'DNN_forecast_nl' + str(nlayers) + '_dat' + str(dataset) + \
                         '_YT' + str(years_test) + '_SFH' + str(shuffle_train) + \
                         '_DA' * data_augmentation + '_CW' + str(calibration_window) + \
                         '_' + str(experiment_id) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining an empty forecast array and the real values to be predicted in a friendlier format
    forecast = pd.DataFrame(index=df_test.index[::24], columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    # If we are not starting a new recalibration but restarting an old one, we import the
    # existing forecast file and print metrics
    if not new_recalibration:
        # Importing the existing forecast file
        forecast = pd.read_csv(forecast_file_path, index_col=0)
        forecast.index = pd.to_datetime(forecast.index)

        # Reading the dates still to be forecast by checking for NaN values
        forecast_dates = forecast[forecast.isna().any(axis=1)].index

        # If all the dates have already been forecast, we print the final metrics;
        # the recalibration loop below is then skipped
        if len(forecast_dates) == 0:
            mae = np.mean(MAE(forecast.values.squeeze(), real_values.values))
            smape = np.mean(sMAPE(forecast.values.squeeze(), real_values.values)) * 100
            print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format('Final metrics', smape, mae))
    else:
        forecast_dates = forecast.index

    model = DNN(experiment_id=experiment_id, path_hyperparameter_folder=path_hyperparameter_folder,
                nlayers=nlayers, dataset=dataset, years_test=years_test,
                shuffle_train=shuffle_train, data_augmentation=data_augmentation,
                calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # For simulation purposes, we assume that the available data is the data up to the
        # current date, where the prices of the current date are not known
        data_available = pd.concat([df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]],
                                   axis=0)

        # We set the real prices for the current date to NaN in the dataframe of available data
        data_available.loc[date:date + pd.Timedelta(hours=23), 'Price'] = np.NaN

        # Recalibrating the model with the most up-to-date available data and making a
        # prediction for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up to the current date
        mae = np.mean(MAE(forecast.loc[:date].values.squeeze(),
                          real_values.loc[:date].values))
        smape = np.mean(sMAPE(forecast.loc[:date].values.squeeze(),
                              real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving the forecast
        forecast.to_csv(forecast_file_path)

    return forecast
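# A minimal usage sketch (hedged: the experiment id, market, and calibration window below
# are illustrative, and a hyperparameter-optimization run with matching parameters is
# assumed to already exist in ``path_hyperparameter_folder``):
#
#     predictions = evaluate_dnn_in_test_dataset(experiment_id='1', nlayers=2, dataset='NP',
#                                                years_test=2, shuffle_train=True,
#                                                data_augmentation=0, calibration_window=4,
#                                                new_recalibration=True)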
def _hyperopt_objective(hyperparameters, trials, trials_file_path, max_evals, nlayers, dfTrain,
                        dfTest, shuffle_train, dataset, data_augmentation, calibration_window,
                        n_exogenous_inputs):
    """Function that defines the hyperparameter optimization objective/loss.

    This function receives as input a set of hyperparameters, trains a DNN using them, and
    returns the performance of the DNN for the selected hyperparameters in a validation dataset.

    Parameters
    ----------
    hyperparameters : dict
        A dictionary provided by hyperopt indicating whether each hyperparameter/feature is
        selected.
    trials : hyperopt.Trials
        The trials object that stores the hyperparameter optimization runs.
    trials_file_path : str
        The path to store the trials object.
    max_evals : int
        Maximum number of iterations for the hyperparameter optimization.
    nlayers : int
        Number of layers in the DNN model.
    dfTrain : pandas.DataFrame
        Dataframe containing the training data.
    dfTest : pandas.DataFrame
        Dataframe containing the testing data.
    shuffle_train : bool
        Boolean that selects whether the training and validation datasets are shuffled.
    dataset : str
        Name of the dataset/market under study.
    data_augmentation : bool
        Boolean that selects whether a data augmentation technique is employed.
    calibration_window : int
        Calibration window used for the training dataset. If 0, the calibration window is
        included as a hyperparameter to optimize.
    n_exogenous_inputs : int
        Number of exogenous inputs in the dataset.

    Returns
    -------
    dict
        A dictionary summarizing the result of the hyperparameter run.
    """

    # Re-defining the training dataset based on the calibration window. The calibration window
    # can be given as an external parameter. If the value 0 is given, the calibration window
    # is included as a hyperparameter to optimize
    dfTrain_cw = dfTrain.loc[dfTrain.index[-1] - pd.Timedelta(weeks=52) * calibration_window +
                             pd.Timedelta(hours=1):]

    # Saving the hyperoptimization state and printing a progress message
    pc.dump(trials, open(trials_file_path, "wb"))
    if trials.losses()[0] is not None:
        MAEVal = trials.best_trial['result']['MAE Val']
        MAETest = trials.best_trial['result']['MAE Test']
        sMAPEVal = trials.best_trial['result']['sMAPE Val']
        sMAPETest = trials.best_trial['result']['sMAPE Test']

        print('\n\nTested {}/{} iterations.'.format(len(trials.losses()) - 1, max_evals))
        print('Best MAE - Validation Dataset')
        print("  MAE: {:.1f} | sMAPE: {:.2f} %".format(MAEVal, sMAPEVal))
        print('\nBest MAE - Test Dataset')
        print("  MAE: {:.1f} | sMAPE: {:.2f} %".format(MAETest, sMAPETest))

    # Defining the X, Y datasets
    Xtrain, Ytrain, Xval, Yval, Xtest, Ytest, indexTest = \
        _build_and_split_XYs(dfTrain=dfTrain_cw, dfTest=dfTest, features=hyperparameters,
                             shuffle_train=shuffle_train, hyperoptimization=True,
                             data_augmentation=data_augmentation,
                             n_exogenous_inputs=n_exogenous_inputs)

    # If required, the datasets are scaled
    if hyperparameters['scaleX'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        [Xtrain, Xval, Xtest], _ = scaling([Xtrain, Xval, Xtest], hyperparameters['scaleX'])

    if hyperparameters['scaleY'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        [Ytrain, Yval], scaler = scaling([Ytrain, Yval], hyperparameters['scaleY'])
    else:
        scaler = None

    neurons = [int(hyperparameters['neurons' + str(k)]) for k in range(1, nlayers + 1)
               if int(hyperparameters['neurons' + str(k)]) >= 50]

    np.random.seed(int(hyperparameters['seed']))

    # Initializing the model
    forecaster = DNNModel(neurons=neurons, n_features=Xtrain.shape[-1],
                          dropout=hyperparameters['dropout'],
                          batch_normalization=hyperparameters['batch_normalization'],
                          lr=hyperparameters['lr'], verbose=False, optimizer='adam',
                          activation=hyperparameters['activation'],
                          epochs_early_stopping=20, scaler=scaler, loss='mae',
                          regularization=hyperparameters['reg']['val'],
                          lambda_reg=hyperparameters['reg']['lambda'],
                          initializer=hyperparameters['init'])

    forecaster.fit(Xtrain, Ytrain, Xval, Yval)

    # Evaluating on the validation dataset, inverse-transforming the predictions if scaled
    Yp = forecaster.predict(Xval).squeeze()
    if hyperparameters['scaleY'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        Yval = scaler.inverse_transform(Yval)
        Yp = scaler.inverse_transform(Yp)

    mae_validation = np.mean(MAE(Yval, Yp))
    smape_validation = np.mean(sMAPE(Yval, Yp)) * 100

    # Evaluating on the test dataset, inverse-transforming the predictions if scaled
    Yp = forecaster.predict(Xtest).squeeze()
    if hyperparameters['scaleY'] in ['Norm', 'Norm1', 'Std', 'Median', 'Invariant']:
        Yp = scaler.inverse_transform(Yp).squeeze()

    maeTest = np.mean(MAE(Ytest, Yp))
    smape_test = np.mean(sMAPE(Ytest, Yp)) * 100

    # The test dataset metrics are returned so that the models can be directly evaluated
    # without recalibration while performing hyperopt. However, the hyperparameter search
    # itself is performed using the validation dataset
    return_values = {'loss': mae_validation, 'MAE Val': mae_validation, 'MAE Test': maeTest,
                     'sMAPE Val': smape_validation, 'sMAPE Test': smape_test,
                     'status': STATUS_OK}

    return return_values
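# A hedged sketch of how this private objective is typically wired into hyperopt; the real
# search space is built by the hyperparameter optimizer and must cover every key read above
# ('scaleX', 'scaleY', 'neurons1'...'neurons<nlayers>', 'dropout', 'lr', 'seed', 'reg',
# 'init', 'activation', 'batch_normalization', plus the feature-selection flags):
#
#     from functools import partial
#     from hyperopt import fmin, tpe, Trials
#
#     trials = Trials()
#     objective = partial(_hyperopt_objective, trials=trials,
#                         trials_file_path=trials_file_path, max_evals=max_evals,
#                         nlayers=nlayers, dfTrain=dfTrain, dfTest=dfTest,
#                         shuffle_train=shuffle_train, dataset=dataset,
#                         data_augmentation=data_augmentation,
#                         calibration_window=calibration_window,
#                         n_exogenous_inputs=n_exogenous_inputs)
#     fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)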
def evaluate_lear_in_test_dataset(path_datasets_folder=os.path.join('.', 'datasets'),
                                  path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                  dataset='PJM', years_test=2, calibration_window=364 * 3,
                                  begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the LEAR model in a test dataset using daily recalibration.

    The test dataset is defined by a market name and the test dates. The function generates the
    test and training datasets, and evaluates a LEAR model considering daily recalibration.

    An example of how to use this function is provided :ref:`here<learex1>`.

    Parameters
    ----------
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet, the path where the
        datasets are to be stored.
    path_recalibration_folder : str, optional
        Path to save the files of the experiment.
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, i.e.
        ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically
        downloaded. If the name is different, a dataset in csv format should be placed in the
        ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if the
        arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    calibration_window : int, optional
        Number of days used in the training dataset for recalibration.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using
        the ``years_test`` argument. ``begin_test_date`` should either be a string with the
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using
        the ``years_test`` argument. ``end_test_date`` should either be a string with the
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.

    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to ``path_recalibration_folder``.
    """

    # Checking if the provided directory for recalibration exists and, if not, creating it
    if not os.path.exists(path_recalibration_folder):
        os.makedirs(path_recalibration_folder)

    # Defining the training and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test,
                                  path=path_datasets_folder,
                                  begin_test_date=begin_test_date, end_test_date=end_test_date)

    # Defining a unique name to save the forecast
    forecast_file_name = 'LEAR_forecast' + '_dat' + str(dataset) + '_YT' + str(years_test) + \
                         '_CW' + str(calibration_window) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining an empty forecast array and the real values to be predicted in a friendlier format
    forecast = pd.DataFrame(index=df_test.index[::24], columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    forecast_dates = forecast.index

    model = LEAR(calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # For simulation purposes, we assume that the available data is the data up to the
        # current date, where the prices of the current date are not known
        data_available = pd.concat([df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]],
                                   axis=0)

        # We set the real prices for the current date to NaN in the dataframe of available data
        data_available.loc[date:date + pd.Timedelta(hours=23), 'Price'] = np.NaN

        # Recalibrating the model with the most up-to-date available data and making a
        # prediction for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date,
                                                     calibration_window=calibration_window)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up to the current date
        mae = np.mean(MAE(forecast.loc[:date].values.squeeze(),
                          real_values.loc[:date].values))
        smape = np.mean(sMAPE(forecast.loc[:date].values.squeeze(),
                              real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving the forecast
        forecast.to_csv(forecast_file_path)

    return forecast
# Building the same datasets with shape (ndays, n_prices/day) instead
# of shape (nprices, 1), and displaying the result
fc_DNN_ensemble_2D = pd.DataFrame(fc_DNN_ensemble.values.reshape(-1, 24),
                                  index=fc_DNN_ensemble.index[::24],
                                  columns=['h' + str(hour) for hour in range(24)])
real_price_2D = pd.DataFrame(real_price.values.reshape(-1, 24),
                             index=real_price.index[::24],
                             columns=['h' + str(hour) for hour in range(24)])
fc_DNN_ensemble_2D.head()

# According to the paper, the sMAPE of the DNN ensemble for the NP market is 4.85%.
# Let's test the metric under different conditions

# Evaluating sMAPE when the real prices and the forecasts are both dataframes
sMAPE(p_pred=fc_DNN_ensemble, p_real=real_price) * 100

# Evaluating sMAPE when the real prices and the forecasts are both numpy arrays
sMAPE(p_pred=fc_DNN_ensemble.values, p_real=real_price.values) * 100

# Evaluating sMAPE when the input values are of shape (ndays, n_prices/day)
# instead of shape (nprices, 1)
# Dataframes
sMAPE(p_pred=fc_DNN_ensemble_2D, p_real=real_price_2D) * 100
# Numpy arrays
sMAPE(p_pred=fc_DNN_ensemble_2D.values, p_real=real_price_2D.values) * 100

# Evaluating sMAPE when the input values are of shape (nprices,)
# instead of shape (nprices, 1)
# Pandas Series
sMAPE(p_pred=fc_DNN_ensemble.loc[:, 'DNN Ensemble'],
      p_real=real_price.loc[:, 'Price']) * 100
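# For reference, a minimal sketch of the metric exercised above, assuming the standard
# symmetric-MAPE definition sMAPE = mean(2|p_real - p_pred| / (|p_real| + |p_pred|)).
# This is an illustrative re-implementation, not the library function itself.
import numpy as np

def smape_sketch(p_real, p_pred):
    # Flattening both inputs makes the sketch shape-agnostic, mirroring the behaviour
    # tested above for dataframes, arrays, and series
    p_real = np.asarray(p_real, dtype=float).ravel()
    p_pred = np.asarray(p_pred, dtype=float).ravel()
    return np.mean(2.0 * np.abs(p_real - p_pred) / (np.abs(p_real) + np.abs(p_pred)))

# Example: smape_sketch(real_price.values, fc_DNN_ensemble.values) * 100 should be close
# to the library results above, up to the exact definition used by sMAPE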