def __init__(self,
              n_feature: int,
              n_hidden: int,
              num_hidden_layer: int,
              n_output: int = 1,
              dropout_rate: float = 0.0):
     """
     :param n_feature: number of features for ANN input
     :param n_hidden: number of hidden neurons (first hidden layer)
     :param num_hidden_layer: number of hidden layers
     :param n_output: number of outputs
     :param dropout_rate: probability of element being zeroed in dropout layer
     """
     super(ANN, self).__init__()
     TrainHelper.init_pytorch_seeds()
     self.hidden_layer = nn.ModuleList()
     hidden_in = n_feature
     hidden_out = n_hidden
     for layer_num in range(num_hidden_layer):
         self.hidden_layer.append(
             nn.Linear(in_features=hidden_in, out_features=hidden_out))
         hidden_in = hidden_out
         hidden_out = int(hidden_in / 2)
     self.output_layer = nn.Linear(in_features=hidden_in,
                                   out_features=n_output)
     self.dropout = nn.Dropout(p=dropout_rate)
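
# The constructor above halves the hidden width with every additional layer. A minimal
# standalone sketch (illustrative sizes only, not taken from the original) of how the
# resulting layer shapes evolve:
n_feature, n_hidden, num_hidden_layer, n_output = 16, 32, 3, 1
hidden_in, hidden_out = n_feature, n_hidden
layer_shapes = []
for _ in range(num_hidden_layer):
    layer_shapes.append((hidden_in, hidden_out))
    hidden_in, hidden_out = hidden_out, int(hidden_out / 2)
layer_shapes.append((hidden_in, n_output))  # output layer
print(layer_shapes)  # [(16, 32), (32, 16), (16, 8), (8, 1)]
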
 def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
     """
      Deliver out-of-sample predictions (one step ahead if specified)
      :param test: test set
      :param train: train set
      :return: DataFrame with predictions
     """
     TrainHelper.init_pytorch_seeds()
     x_test = torch.tensor(data=self.x_scaler.transform(
         test.drop(self.target_column, axis=1)).astype(np.float32))
     if self.one_step_ahead:
         train_manip = train.copy()
         predict_lst = []
         # deep copy model as predict function should not change class model
         model = copy.deepcopy(self.model)
         for i in range(0, test.shape[0]):
             model.eval()
             # predict on cpu
             model.to(torch.device("cpu"))
             fc = model(x=x_test[i].view(1, -1)).item()
             train_manip = train_manip.append(test.iloc[[i]])
             self.update(train=train_manip, model=model)
             predict_lst.append(fc)
         predict = np.array(predict_lst).flatten()
     else:
         self.model.eval()
         # predict on cpu
         self.model.to(torch.device("cpu"))
         predict = self.model(x=x_test).data.numpy().flatten()
     predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
     return predictions
 def create_train_valid_sets(self, train: pd.DataFrame) -> tuple:
     """
      Create train and validation sets as well as a train DataLoader with batches
     :param train: train dataset
     :return: DataLoader with batches of train data as well as validation data
     """
     TrainHelper.init_pytorch_seeds()
     # create train and validation set
     valid_size = 0.2
     split_ind = int(train.shape[0] * (1 - valid_size))
     train_data = train.iloc[:split_ind]
     valid_data = train.iloc[split_ind:]
     # scale input data
     x_train = self.x_scaler.fit_transform(
         train_data.drop(self.target_column, axis=1))
     x_valid = self.x_scaler.transform(
         valid_data.drop(self.target_column, axis=1))
     # create train ready data
     x_train = torch.tensor(x_train.astype(np.float32))
     x_valid = torch.tensor(x_valid.astype(np.float32))
     y_train = torch.tensor(data=train_data[
         self.target_column].values.reshape(-1, 1).astype(np.float32))
     y_valid = torch.tensor(data=valid_data[
         self.target_column].values.reshape(-1, 1).astype(np.float32))
     train_loader = torch.utils.data.DataLoader(
         dataset=torch.utils.data.TensorDataset(x_train, y_train),
         batch_size=self.batch_size,
         shuffle=False,
         drop_last=False,
          # worker_init_fn must be a callable; seed each worker for reproducibility
          worker_init_fn=lambda worker_id: np.random.seed(0))
     return train_loader, x_valid, y_valid
 def create_train_valid_sequence_sets(self, train: pd.DataFrame) -> tuple:
     """
      Create sequenced train and validation sets as well as a train DataLoader with sequenced batches
     :param train: train data to use
     :return: DataLoader with batches of sequenced train data as well as sequenced validation data
     """
     TrainHelper.init_pytorch_seeds()
     # scale input data
     x_train_scaled = self.x_scaler.fit_transform(
         train.drop(self.target_column, axis=1))
     y_train_scaled = self.y_scaler.fit_transform(
         train[self.target_column].values.reshape(-1, 1))
     # create sequences
     x_seq_train, y_seq_train = self.create_sequences(
         data=np.hstack((x_train_scaled, y_train_scaled)))
     # split into train and validation set
     valid_size = 0.2
     split_ind = int(x_seq_train.shape[0] * (1 - valid_size))
     x_train = torch.tensor(x_seq_train[:split_ind, :, :].astype(
         np.float32))
     x_valid = torch.tensor(x_seq_train[split_ind:, :, :].astype(
         np.float32))
     y_train = torch.tensor(y_seq_train[:split_ind].reshape(-1, 1).astype(
         np.float32))
     y_valid = torch.tensor(y_seq_train[split_ind:].reshape(-1, 1).astype(
         np.float32))
     train_loader = torch.utils.data.DataLoader(
         dataset=torch.utils.data.TensorDataset(x_train, y_train),
         batch_size=self.batch_size,
         shuffle=False,
         drop_last=False)
     return train_loader, x_valid, y_valid
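
# create_sequences is called above but not shown in this snippet collection. A hedged
# sketch of the sliding-window logic it is assumed to implement (the scaled target is the
# last column of `data` and also serves as the label of the following step):
import numpy as np

def create_sequences_sketch(data: np.ndarray, seq_length: int = 7) -> tuple:
    xs, ys = [], []
    for i in range(data.shape[0] - seq_length):
        xs.append(data[i:i + seq_length, :])  # window of seq_length rows, all columns
        ys.append(data[i + seq_length, -1])   # next-step target (last column)
    return np.array(xs), np.array(ys)

x_seq, y_seq = create_sequences_sketch(data=np.random.rand(100, 5), seq_length=7)
print(x_seq.shape, y_seq.shape)  # (93, 7, 5) (93,)
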
 def run_pytorch_optim_loop(self, train_loader, x_valid, y_valid, model, checkpoint_name: str = 'train'):
     """
      Run the PyTorch optimization loop with early stopping and checkpointing
      :param train_loader: DataLoader with (sequenced) train batches
     :param x_valid: sequenced validation data
     :param y_valid: validation labels
     :param model: model to optimize
     :param checkpoint_name: save name for best checkpoints
     :return:
     """
     TrainHelper.init_pytorch_seeds()
     # name for checkpoint for temporary storing during optimization with early stopping
     # detailed timestamp to prevent interference with parallel running jobs using same directory
     checkpoint_name += '_' + datetime.datetime.now().strftime("%d-%b-%Y_%H-%M-%S-%f")
     min_valid_loss = 99999999
     epochs_wo_improvement_threshold = 0
     epochs_wo_improvement_total = 0
     # instantiate new optimizer to ensure independence of previous runs
     optimizer = torch.optim.Adam(params=model.parameters(), lr=self.learning_rate)
     # get device and shift model and data to it
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     model.to(device)
     x_valid, y_valid = x_valid.to(device), y_valid.to(device)
     for e in range(self.epochs):
         model.train()
         for (batch_x, batch_y) in train_loader:
             TrainHelper.init_pytorch_seeds()
             # copy data to device
             batch_x, batch_y = batch_x.to(device), batch_y.to(device)
              # gradients are accumulated, so they must be zeroed before each new step
             optimizer.zero_grad()
             y_pred = model(batch_x)
             loss_train = self.loss(y_pred, batch_y)
             loss_train.backward()
             optimizer.step()
         model.eval()
         y_pred_valid = model(x_valid)
         loss_valid = self.loss(y_pred_valid, y_valid).item()
         if loss_valid < min_valid_loss:
             min_valid_loss = loss_valid
             epochs_wo_improvement_threshold = 0
             epochs_wo_improvement_total = 0
             torch.save(model.state_dict(), 'Checkpoints/checkpoint_' + checkpoint_name + '.pt')
         elif (loss_valid - min_valid_loss) > self.min_val_loss_improvement:
             # Early Stopping with thresholds for counter incrementing and max_epochs
             epochs_wo_improvement_threshold += 1
             if epochs_wo_improvement_threshold > self.max_epochs_wo_improvement:
                 print('Early Stopping after epoch ' + str(e))
                 break
         elif loss_valid >= min_valid_loss:
              # Early stopping with doubled patience if the loss stagnates within the threshold
             epochs_wo_improvement_total += 1
             if epochs_wo_improvement_total > 2 * self.max_epochs_wo_improvement:
                 print('Early Stopping after epoch ' + str(e))
                 break
         if e % 100 == 0:
             print('Epoch ' + str(e) + ': valid loss = ' + str(loss_valid)
                   + ', min_valid_loss = ' + str(min_valid_loss))
     model.load_state_dict(state_dict=torch.load('Checkpoints/checkpoint_' + checkpoint_name + '.pt'))
     os.remove('Checkpoints/checkpoint_' + checkpoint_name + '.pt')
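
# Hedged sketch of the two-counter early-stopping rule used above, isolated as a pure
# function (parameter values are illustrative defaults, not the original configuration):
def early_stopping_step(loss_valid: float, min_valid_loss: float,
                        cnt_threshold: int, cnt_total: int,
                        min_improvement: float = 1000.0, patience: int = 100) -> tuple:
    if loss_valid < min_valid_loss:
        # new best: remember it and reset both counters
        return False, loss_valid, 0, 0
    if (loss_valid - min_valid_loss) > min_improvement:
        # clearly worse than the best seen so far
        cnt_threshold += 1
        return cnt_threshold > patience, min_valid_loss, cnt_threshold, cnt_total
    # not improving, but still within the tolerated deviation: doubled patience
    cnt_total += 1
    return cnt_total > 2 * patience, min_valid_loss, cnt_threshold, cnt_total
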
 def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
     """
     Train model
     :param train: train set
     :param cross_val_call: called to perform cross validation
      :return: dictionary with cross-validated scores (if specified)
     """
     TrainHelper.init_pytorch_seeds()
     cross_val_score_dict = {}
     if cross_val_call:
         cross_val_score_dict_ts, self.model = self.get_cross_val_score(
             train=train)
         cross_val_score_dict_shuf, self.model = self.get_cross_val_score(
             train=train, normal_cv=True)
         cross_val_score_dict = {
             **cross_val_score_dict_ts,
             **cross_val_score_dict_shuf
         }
     # create train and validation set
     train_loader, x_valid, y_valid = self.create_train_valid_sequence_sets(
         train=train)
     # run optim loop
     self.run_pytorch_optim_loop(train_loader=train_loader,
                                 x_valid=x_valid,
                                 y_valid=y_valid,
                                 model=self.model,
                                 checkpoint_name='lstm_train')
     return cross_val_score_dict
 def insample(self, train: pd.DataFrame) -> pd.DataFrame:
     """
      Deliver in-sample predictions
     :param train: train set
     :return: DataFrame with insample predictions
     """
     TrainHelper.init_pytorch_seeds()
     self.model.eval()
     # predict on cpu
     self.model.to(torch.device("cpu"))
     # scale
     x_train_scaled = self.x_scaler.transform(
         train.drop(self.target_column, axis=1))
     y_train_scaled = self.y_scaler.transform(
         train[self.target_column].values.reshape(-1, 1))
     # create sequences
     x_seq_train, _ = self.create_sequences(
         data=np.hstack((x_train_scaled, y_train_scaled)))
     x_train = torch.tensor(x_seq_train.astype(np.float32))
     # predict and transform back
     y_insample = self.y_scaler.inverse_transform(
         self.model(x_train).data.numpy())
     # insert dummy values for train samples before first full sequence
     y_insample = np.insert(y_insample, 0, self.seq_length * [-9999])
     insample = pd.DataFrame(data=y_insample,
                             index=train.index,
                             columns=['Insample'])
     return insample
 def __init__(self,
              target_column: str,
              seasonal_periods: int,
              one_step_ahead: bool,
              n_feature: int,
              lstm_hidden_dim: int = 10,
              lstm_num_layers: int = 1,
              seq_length: int = 7,
              n_output: int = 1,
              dropout_rate: float = 0.0,
              epochs: int = 5000,
              batch_size: int = 16,
              learning_rate: float = 1e-3,
              loss=nn.MSELoss(),
              min_val_loss_improvement: float = 1000,
              max_epochs_wo_improvement: int = 100):
     """
     :param target_column: target_column for prediction
     :param seasonal_periods: period of seasonality
     :param one_step_ahead: perform one step ahead prediction
     :param n_feature: number of features for ANN input
     :param lstm_hidden_dim: dimensionality of hidden layer
     :param lstm_num_layers: depth of lstm network
     :param seq_length: sequence length for input of lstm network
     :param n_output: number of outputs
     :param dropout_rate: probability of element being zeroed in dropout layer
     :param epochs: number of epochs
     :param batch_size: size of a batch
     :param learning_rate: learning rate for optimizer
     :param loss: loss function to use
      :param min_val_loss_improvement: minimum deviation of the validation loss from min_val_loss for an epoch to be counted for early stopping
     :param max_epochs_wo_improvement: maximum number of epochs without improvement before early stopping
     """
     super().__init__(target_column=target_column,
                      seasonal_periods=seasonal_periods,
                      name='LSTM',
                      one_step_ahead=one_step_ahead)
     TrainHelper.init_pytorch_seeds()
     self.model = LSTM(n_feature=n_feature,
                       lstm_hidden_dim=lstm_hidden_dim,
                       lstm_num_layers=lstm_num_layers,
                       n_output=n_output,
                       dropout_rate=dropout_rate)
     self.seq_length = seq_length
     self.optimizer = 'adam'
     self.learning_rate = learning_rate
     self.loss = loss
     self.x_scaler = sklearn.preprocessing.StandardScaler()
     self.y_scaler = sklearn.preprocessing.StandardScaler()
     self.batch_size = batch_size
     self.epochs = epochs
     self.min_val_loss_improvement = min_val_loss_improvement
     self.max_epochs_wo_improvement = max_epochs_wo_improvement
 def forward(self, x):
     """
     Feedforward path
     :param x: data to process
     :return: prediction value
     """
     TrainHelper.init_pytorch_seeds()
     for layer in self.hidden_layer:
         x = F.relu(layer(x))
         x = self.dropout(x)
     out = self.output_layer(x)
     return out
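
# Dropout is only active in training mode, which is why predict() and insample() call
# model.eval() before inference; a quick standalone check (illustrative, not from the original):
import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 4)
drop.train()
print(drop(x))  # some elements zeroed, survivors scaled by 1 / (1 - p) = 2
drop.eval()
print(drop(x))  # identity: tensor([[1., 1., 1., 1.]])
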
    def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
        """
         Deliver (back-transformed) out-of-sample predictions (one step ahead if specified)
        :param test: test set
        :param train: train set
        :return: DataFrame with predictions
        """
        if (self.power_transformer is not None) or self.log:
            test = TrainHelper.get_transformed_set(
                dataset=test,
                target_column=self.target_column,
                power_transformer=self.power_transformer,
                log=self.log,
                only_transform=True)
            train = TrainHelper.get_transformed_set(
                dataset=train,
                target_column=self.target_column,
                power_transformer=self.power_transformer,
                log=self.log)
        if self.one_step_ahead:
            train_manip = train.copy()[self.target_column]
            predict = []
            # deep copy model as predict function should not change class model
            model_results = copy.deepcopy(self.model_results)
            for ind in test.index:
                fc = model_results.forecast()
                predict.append(fc[ind])
                train_manip = train_manip.append(
                    pd.Series(data=test[self.target_column], index=[ind]))
                model_results = self.update(train=pd.DataFrame(
                    data=train_manip, columns=[self.target_column]))
        else:
            predict = self.model_results.predict(start=test.index[0],
                                                 end=test.index[-1])
        predictions = pd.DataFrame({'Prediction': predict}, index=test.index)

        if self.power_transformer is not None:
            predictions = pd.DataFrame(
                {
                    'Prediction':
                    self.power_transformer.inverse_transform(
                        predictions['Prediction'].values.reshape(-1,
                                                                 1)).flatten()
                },
                index=predictions.index)
        if self.log:
            if self.contains_zeros:
                predictions = predictions.apply(np.exp) + 1
            else:
                predictions = predictions.apply(np.exp)

        return predictions
 def forward(self, x):
     """
     Feedforward path
     :param x: data to process
     :return: prediction value
     """
     TrainHelper.init_pytorch_seeds()
     # input (batch x seq_length x input_size) (batch_first is set True)
     lstm_out, (hn, cn) = self.lstm(x.view(x.shape[0], x.shape[1], -1))
     # only take last output of sequence
     out = self.dropout(lstm_out[:, -1, :])
     out = self.output_layer(out)
     return out
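
# Hedged shape check for the batch_first LSTM call above (sizes are illustrative only):
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=5, hidden_size=10, num_layers=1, batch_first=True)
x = torch.randn(8, 7, 5)         # (batch, seq_length, input_size)
lstm_out, (hn, cn) = lstm(x)
print(lstm_out.shape)            # torch.Size([8, 7, 10]) - output for every time step
print(lstm_out[:, -1, :].shape)  # torch.Size([8, 10])    - last step fed to the output layer
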
 def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
     """
      Deliver out-of-sample predictions (one step ahead if specified)
      :param test: test set
      :param train: train set
      :return: DataFrame with predictions
     """
     TrainHelper.init_pytorch_seeds()
     x_train_scaled = self.x_scaler.transform(
         train.drop(self.target_column, axis=1))
     y_train_scaled = self.y_scaler.transform(
         train[self.target_column].values.reshape(-1, 1))
     x_test_scaled = self.x_scaler.transform(
         test.drop(self.target_column, axis=1))
     y_test_scaled = self.y_scaler.transform(
         test[self.target_column].values.reshape((-1, 1)))
     # add last elements of train to complete first test sequence
     x_test_full = np.vstack(
         (x_train_scaled[-self.seq_length:], x_test_scaled))
     y_test_full = np.vstack(
         (y_train_scaled[-self.seq_length:], y_test_scaled))
     # create test sequences
     x_seq_test, _ = self.create_sequences(data=np.hstack((x_test_full,
                                                           y_test_full)))
     if self.one_step_ahead:
         predict_lst = []
         train_manip = train.copy()
         # deep copy model as predict function should not change class model
         model = copy.deepcopy(self.model)
         for i in range(0, test.shape[0]):
             test_seq = x_seq_test[i].reshape(1, self.seq_length, -1)
             model.eval()
             # predict on cpu
             model.to(torch.device("cpu"))
             fc = self.y_scaler.inverse_transform(
                 model(x=torch.tensor(test_seq.astype(
                     np.float32))).data.numpy())
              train_manip = train_manip.append(test.iloc[[i]])
              # update on the extended train set including the new sample
              self.update(train=train_manip, model=model)
             predict_lst.append(fc)
         predict = np.array(predict_lst).flatten()
     else:
         # predict on cpu
         self.model.to(torch.device("cpu"))
         self.model.eval()
         predict = self.y_scaler.inverse_transform(
             self.model(x=torch.tensor(x_seq_test.astype(
                 np.float32))).data.numpy()).flatten()
     predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
     return predictions
 def update(self, train: pd.DataFrame, model):
     """
     Update existing model due to new samples
     :param train: train set with new samples
     :param model: model to update
     """
     TrainHelper.init_pytorch_seeds()
     train_loader, x_valid, y_valid = self.create_train_valid_sequence_sets(
         train=train)
     self.run_pytorch_optim_loop(train_loader=train_loader,
                                 x_valid=x_valid,
                                 y_valid=y_valid,
                                 model=model,
                                 checkpoint_name='lstm_update')
 def insample(self, train: pd.DataFrame) -> pd.DataFrame:
     """
      Deliver in-sample predictions
     :param train: train set
     :return: DataFrame with insample predictions
     """
     TrainHelper.init_pytorch_seeds()
     self.model.eval()
     # predict on cpu
     self.model.to(torch.device("cpu"))
     x_train = torch.tensor(data=self.x_scaler.transform(
         train.drop(self.target_column, axis=1)).astype(np.float32))
     insample = pd.DataFrame(data=self.model(x=x_train).data.numpy(),
                             index=train.index,
                             columns=['Insample'])
     return insample
 def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
     """
     Train Exponential Smoothing model
     :param train: train set
      :param cross_val_call: called to perform cross validation
      :return: dictionary with cross-validated scores (if specified)
      """
     cross_val_score_dict = {}
     if cross_val_call:
         cross_val_score_dict, self.model = self.get_cross_val_score(
             train=train)
     if (self.power_transformer is not None) or self.log:
         train = TrainHelper.get_transformed_set(
             dataset=train,
             target_column=self.target_column,
             power_transformer=self.power_transformer,
             log=self.log)
     if (0 in train[self.target_column].values) and (
             self.trend == 'mul' or self.seasonal == 'mul'):
          # multiplicative trend or seasonality only works with strictly positive data
          # only done if no transform was performed, as the values would otherwise need extensive correction
         train = train.copy()
         train[self.target_column] += 0.01
     self.model = statsmodels.tsa.api.ExponentialSmoothing(
         endog=train[self.target_column],
         trend=self.trend,
         damped=self.damped,
         seasonal=self.seasonal,
         seasonal_periods=self.seasonal_periods)
     self.model_results = self.model.fit(remove_bias=self.remove_bias,
                                         use_brute=self.use_brute)
     return cross_val_score_dict
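
# Hedged minimal usage of the statsmodels call above with synthetic data (parameter values
# are illustrative; the exact keyword names follow the statsmodels version used in this code):
import numpy as np
import pandas as pd
import statsmodels.tsa.api as tsa

y = pd.Series(10 + np.sin(np.arange(48) * 2 * np.pi / 12) + 0.1 * np.random.rand(48))
es_model = tsa.ExponentialSmoothing(endog=y, trend='add', damped=True,
                                    seasonal='add', seasonal_periods=12)
es_results = es_model.fit(remove_bias=False, use_brute=True)
print(es_results.forecast(steps=3))
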
 def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
     """
     Train (S)ARIMA(X) model
     :param train: train set
     :param cross_val_call: called to perform cross validation
      :return: dictionary with cross-validated scores (if specified)
     """
     cross_val_score_dict = {}
     if cross_val_call:
         cross_val_score_dict, self.model = self.get_cross_val_score(
             train=train)
     train_exog = None
     if (self.power_transformer is not None) or self.log:
         train = TrainHelper.get_transformed_set(
             dataset=train,
             target_column=self.target_column,
             power_transformer=self.power_transformer,
             log=self.log)
     if self.use_exog:
         train_exog = train.drop(labels=[self.target_column], axis=1)
         self.exog_cols_dropped = train_exog.columns[
             train_exog.isna().any()].tolist()
         PreparationHelper.drop_columns(train_exog, self.exog_cols_dropped)
         train_exog = train_exog.to_numpy(dtype=float)
     self.model.fit(y=train[self.target_column],
                    exogenous=train_exog,
                    trend=self.trend)
     return cross_val_score_dict
 def evaluate(self, train: pd.DataFrame, test: pd.DataFrame) -> dict:
     """
     Evaluate model against all implemented evaluation metrics and baseline methods.
     Deliver dictionary with evaluation metrics.
     :param train: train set
     :param test: test set
     :return: dictionary with evaluation metrics of model and all baseline methods
     """
     TrainHelper.init_pytorch_seeds()
     insample_rw, prediction_rw = SimpleBaselines.RandomWalk(one_step_ahead=self.one_step_ahead)\
         .get_insample_prediction(train=train, test=test, target_column=self.target_column)
     insample_seasrw, prediction_seasrw = SimpleBaselines.RandomWalk(one_step_ahead=self.one_step_ahead)\
         .get_insample_prediction(train=train, test=test, target_column=self.target_column,
                                  seasonal_periods=self.seasonal_periods)
     insample_ha, prediction_ha = SimpleBaselines.HistoricalAverage(one_step_ahead=self.one_step_ahead)\
         .get_insample_prediction(train=train, test=test, target_column=self.target_column)
     insample_model = self.insample(train=train)
     prediction_model = self.predict(test=test, train=train)
     rmse_train_rw, mape_train_rw, smape_train_rw = EvaluationHelper.get_all_eval_vals(
         actual=train[self.target_column], prediction=insample_rw['Insample'])
     rmse_test_rw, mape_test_rw, smape_test_rw = EvaluationHelper.get_all_eval_vals(
         actual=test[self.target_column], prediction=prediction_rw['Prediction'])
     rmse_train_seasrw, mape_train_seasrw, smape_train_seasrw = EvaluationHelper.get_all_eval_vals(
         actual=train[self.target_column], prediction=insample_seasrw['Insample'])
     rmse_test_seasrw, mape_test_seasrw, smape_test_seasrw = EvaluationHelper.get_all_eval_vals(
         actual=test[self.target_column], prediction=prediction_seasrw['Prediction'])
     rmse_train_ha, mape_train_ha, smape_train_ha = EvaluationHelper.get_all_eval_vals(
         actual=train[self.target_column], prediction=insample_ha['Insample'])
     rmse_test_ha, mape_test_ha, smape_test_ha = EvaluationHelper.get_all_eval_vals(
         actual=test[self.target_column], prediction=prediction_ha['Prediction'])
     rmse_train_model, mape_train_model, smape_train_model = EvaluationHelper.get_all_eval_vals(
         actual=train[self.target_column], prediction=insample_model['Insample'])
     rmse_test_model, mape_test_model, smape_test_model = EvaluationHelper.get_all_eval_vals(
         actual=test[self.target_column], prediction=prediction_model['Prediction'])
     return {'RMSE_Train_RW': rmse_train_rw, 'MAPE_Train_RW': mape_train_rw, 'sMAPE_Train_RW': smape_train_rw,
             'RMSE_Test_RW': rmse_test_rw, 'MAPE_Test_RW': mape_test_rw, 'sMAPE_Test_RW': smape_test_rw,
             'RMSE_Train_seasRW': rmse_train_seasrw, 'MAPE_Train_seasRW': mape_train_seasrw,
             'sMAPE_Train_seasRW': smape_train_seasrw,
             'RMSE_Test_seasRW': rmse_test_seasrw, 'MAPE_Test_seasRW': mape_test_seasrw,
             'sMAPE_Test_seasRW': smape_test_seasrw,
             'RMSE_Train_HA': rmse_train_ha, 'MAPE_Train_HA': mape_train_ha, 'sMAPE_Train_HA': smape_train_ha,
             'RMSE_Test_HA': rmse_test_ha, 'MAPE_Test_HA': mape_test_ha, 'sMAPE_Test_HA': smape_test_ha,
             'RMSE_Train': rmse_train_model, 'MAPE_Train': mape_train_model, 'sMAPE_Train': smape_train_model,
             'RMSE_Test': rmse_test_model, 'MAPE_Test': mape_test_model, 'sMAPE_Test': smape_test_model
             }
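
# EvaluationHelper.get_all_eval_vals is not shown in these snippets; a hedged sketch of the
# RMSE / MAPE / sMAPE triple it is assumed to return:
import numpy as np

def get_all_eval_vals_sketch(actual, prediction) -> tuple:
    actual = np.asarray(actual, dtype=float)
    prediction = np.asarray(prediction, dtype=float)
    rmse = np.sqrt(np.mean((actual - prediction) ** 2))
    mape = 100 * np.mean(np.abs((actual - prediction) / actual))
    smape = 100 * np.mean(2 * np.abs(prediction - actual)
                          / (np.abs(actual) + np.abs(prediction)))
    return rmse, mape, smape

print(get_all_eval_vals_sketch([100, 110, 120], [98, 112, 118]))
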
 def __init__(self,
              n_feature: int,
              lstm_hidden_dim: int,
              lstm_num_layers: int = 1,
              n_output: int = 1,
              dropout_rate: float = 0.0):
     """
     :param n_feature: number of features for ANN input
     :param lstm_hidden_dim: dimensionality of hidden layer
     :param lstm_num_layers: depth of lstm network
     :param n_output: number of outputs
     :param dropout_rate: probability of element being zeroed in dropout layer
     """
     super(LSTM, self).__init__()
     TrainHelper.init_pytorch_seeds()
     self.lstm = nn.LSTM(input_size=n_feature,
                         hidden_size=lstm_hidden_dim,
                         num_layers=lstm_num_layers,
                         batch_first=True,
                         dropout=dropout_rate)
     self.dropout = nn.Dropout(p=dropout_rate)
     self.output_layer = nn.Linear(in_features=lstm_hidden_dim,
                                   out_features=n_output)
def run_es_optim(target_column: str, split_perc: float, imputation: str):
    """
    Run whole ES optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'trend': ['add', None],
        'damp': [False, True],
        'seasonality': ['add', 'mul', None],
        'remove_bias': [False, True],
        'brute': [False, True],
        'osa': [True],
        'transf': [False, 'log', 'pw']
    }
    # random sample from parameter grid
    params_lst = sorted(list(
        sklearn.model_selection.ParameterSampler(
            param_distributions=param_grid,
            n_iter=int(
                1 * MixedHelper.get_product_len_dict(dictionary=param_grid)),
            random_state=np.random.RandomState(42))),
                        key=lambda d: (d['dataset'].name, d['imputation']))

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        tr = params_lst[i]['trend']
        damp = params_lst[i]['damp']
        season = params_lst[i]['seasonality']
        remo_bias = params_lst[i]['remove_bias']
        brute = params_lst[i]['brute']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                reset_index=True)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsES.ExponentialSmoothing(
                    target_column=target_column,
                    trend=tr,
                    damped=damp,
                    seasonal=season,
                    seasonal_periods=seasonal_periods,
                    remove_bias=remo_bias,
                    use_brute=brute,
                    one_step_ahead=one_step_ahead,
                    power_transf=power,
                    log=log)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset': dataset.name,
                'imputation':
                str('None' if imputation is None else imputation),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'trend': tr,
                'damped': damp,
                'seasonal': season,
                'seasonal_periods': seasonal_periods,
                'remove_bias': remo_bias,
                'use_brute': brute,
                'one_step_ahead': one_step_ahead,
                'power_transform': power,
                'log': log
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset': 'Failure',
                'imputation':
                str('None' if imputation is None else imputation),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'trend': tr,
                'damped': damp,
                'seasonal': season,
                'seasonal_periods': seasonal_periods,
                'remove_bias': remo_bias,
                'use_brute': brute,
                'one_step_ahead': one_step_ahead,
                'power_transform': power,
                'log': log
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='es',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
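
# Hedged sketch of the grid-sampling pattern above: with a finite grid and n_iter equal to
# the grid size, sklearn's ParameterSampler enumerates every combination without replacement
# in a reproducible shuffled order (toy grid for illustration):
import numpy as np
import sklearn.model_selection

toy_grid = {'trend': ['add', None], 'damp': [False, True], 'transf': [False, 'log', 'pw']}
toy_params = list(sklearn.model_selection.ParameterSampler(
    param_distributions=toy_grid, n_iter=2 * 2 * 3,
    random_state=np.random.RandomState(42)))
print(len(toy_params))  # 12 -> all combinations of the toy grid
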
    i = 1
    for target_column in result_file_dict.keys():
        print('++++++ Processing Dataset ' + str(i) + '/' + str(len(result_file_dict.keys())) + ' ++++++')
        i += 1
        # set standard values
        split_perc = 0.8
        company = 'General'
        doc_results = None
        result_file_str = result_file_dict[target_column]

        # read config file
        config = configparser.ConfigParser()
        config.read('Configs/dataset_specific_config.ini')
        # get optim parameters
        base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
            TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                        split_perc=split_perc)
        # set const hazard and scale window based on seasonal periods
        const_hazard = const_hazard_factor * seasonal_periods if const_hazard_user == 9999 else const_hazard_user
        scale_window = max(scale_window_minimum, int(scale_window_factor * seasonal_periods))
        max_samples = max_samples_factor * seasonal_periods if max_samples_factor is not None else max_samples_user
        # read result file config
        result_file = pd.read_csv(optim_results_dir + result_file_str, sep=';', decimal=',', index_col=False)
        result_file.drop('Unnamed: 0', axis=1, inplace=True)
        result_file.replace(to_replace='NaN', value=np.nan, inplace=True)
        result_file.drop(result_file.index[result_file['shuf_cv_rmse_std'].isna()], inplace=True)
        result_file.dropna(subset=[el for el in result_file.columns if 'cv' in el], inplace=True)
        result_file.drop(result_file.index[result_file['shuf_cv_rmse_std'] == 0], inplace=True)
        sort_col = 'shuf_cv_rmse_mean'
        sorted_results = result_file.sort_values(sort_col)
        top_config = sorted_results.head(1).iloc[0]
        dict_top_config = TrainHelper.read_config_info(top_config, seasonal_periods)
def run_xgb_optim(target_column: str, split_perc: float, imputation: str,
                  featureset: str):
    """
    Run whole XGB optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'learning_rate': [0.05, 0.1, 0.3],
        'max_depth': [3, 5, 10],
        'subsample': [0.3, 0.7, 1],
        'n_estimators': [10, 100, 1000],
        'gamma': [0, 1, 10],
        'alpha': [0, 0.1, 1, 10],
        'reg_lambda': [0, 0.1, 1, 10],
        'osa': [True]
    }

    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        learning_rate = params_lst[i]['learning_rate']
        max_depth = params_lst[i]['max_depth']
        subsample = params_lst[i]['subsample']
        n_estimators = params_lst[i]['n_estimators']
        gamma = params_lst[i]['gamma']
        alpha = params_lst[i]['alpha']
        reg_lambda = params_lst[i]['reg_lambda']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelXGBoost.XGBoostRegression(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    subsample=subsample,
                    n_estimators=n_estimators,
                    gamma=gamma,
                    alpha=alpha,
                    reg_lambda=reg_lambda,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset': dataset.name,
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'learning_rate': learning_rate,
                'max_depth': max_depth,
                'subsample': subsample,
                'n_estimators': n_estimators,
                'gamma': gamma,
                'alpha': alpha,
                'lambda': reg_lambda,
                'one_step_ahead': one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset': 'Failure',
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'learning_rate': learning_rate,
                'max_depth': max_depth,
                'subsample': subsample,
                'n_estimators': n_estimators,
                'gamma': gamma,
                'alpha': alpha,
                'lambda': reg_lambda,
                'one_step_ahead': one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='xgb',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
    def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
        """
        Deliver (back-transformed), if specified one step ahead, out-of-sample predictions
        :param test: test set
        :param train: train set
        :return: DataFrame with predictions, upper and lower confidence level
        """
        test_exog = None
        if (self.power_transformer is not None) or self.log:
            test = TrainHelper.get_transformed_set(
                dataset=test,
                target_column=self.target_column,
                power_transformer=self.power_transformer,
                log=self.log,
                only_transform=True)
        if self.use_exog:
            test_exog = test.drop(labels=[self.target_column], axis=1)
            PreparationHelper.drop_columns(test_exog, self.exog_cols_dropped)
            test_exog = test_exog.to_numpy(dtype=float)
        if self.one_step_ahead:
            predict = []
            conf_low = []
            conf_up = []
            # deep copy model as predict function should not change class model
            model = copy.deepcopy(self.model)
            for i in range(0, test.shape[0]):
                if self.use_exog:
                    fc, conf = model.predict(n_periods=1,
                                             exogenous=pd.DataFrame(
                                                 test_exog[i].reshape(1, -1)),
                                             return_conf_int=True,
                                             alpha=0.05)
                    model.update(test[self.target_column][i],
                                 exogenous=pd.DataFrame(test_exog[i].reshape(
                                     1, -1)))
                else:
                    fc, conf = model.predict(n_periods=1,
                                             return_conf_int=True,
                                             alpha=0.05)
                    model.update(test[self.target_column][i])
                predict.append(fc[0])
                conf_low.append(conf[0][0])
                conf_up.append(conf[0][1])
        else:
            predict, conf = self.model.predict(n_periods=test.shape[0],
                                               exogenous=test_exog,
                                               return_conf_int=True,
                                               alpha=0.05)
            conf_low = conf[:, 0]
            conf_up = conf[:, 1]
        predictions = pd.DataFrame(
            {
                'Prediction': predict,
                'LowerConf': conf_low,
                'UpperConf': conf_up
            },
            index=test.index)

        if self.power_transformer is not None:
            predictions = pd.DataFrame(
                {
                    'Prediction':
                    self.power_transformer.inverse_transform(
                        predictions['Prediction'].values.reshape(-1,
                                                                 1)).flatten(),
                    'LowerConf':
                    self.power_transformer.inverse_transform(
                        predictions['LowerConf'].values.reshape(-1,
                                                                1)).flatten(),
                    'UpperConf':
                    self.power_transformer.inverse_transform(
                        predictions['UpperConf'].values.reshape(-1,
                                                                1)).flatten()
                },
                index=predictions.index)
        if self.log:
            predict_backtr = np.exp(predictions['Prediction'])
            if self.contains_zeros:
                predict_backtr += 1
            lower_dist = (
                (predictions['Prediction'] - predictions['LowerConf']) /
                predictions['Prediction']) * predict_backtr
            upper_dist = (
                (predictions['UpperConf'] - predictions['Prediction']) /
                predictions['Prediction']) * predict_backtr
            predictions = pd.DataFrame(
                {
                    'Prediction': predict_backtr,
                    'LowerConf': predict_backtr - lower_dist,
                    'UpperConf': predict_backtr + upper_dist
                },
                index=predictions.index)
        return predictions
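
# Hedged numeric check of the log back-transform above: the confidence bounds keep their
# distance relative to the (log-scale) prediction (values are illustrative only):
import numpy as np

pred_log, low_log, up_log = 2.0, 1.8, 2.2
pred_backtr = np.exp(pred_log)                                         # ~7.389
lower = pred_backtr - ((pred_log - low_log) / pred_log) * pred_backtr
upper = pred_backtr + ((up_log - pred_log) / pred_log) * pred_backtr
print(round(pred_backtr, 3), round(lower, 3), round(upper, 3))         # 7.389 6.65 8.128
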
def run_regressions_optim(target_column: str, split_perc: float, algo: str):
    """
    Run whole multiple linear regression optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param algo: algo to use for optimization (['lasso', 'ridge', 'elasticnet', 'bayesridge', 'ard'])
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    multiple_nans_raw_set = config[target_column].getboolean(
        'multiple_nans_raw_set')
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    # parameters relevant for all algos
    param_grid = {
        'dataset': datasets,
        'imputation': ['mean', 'iterative', 'knn'],
        'featureset': ['full', 'cal', 'stat', 'none'],
        'dim_reduction': ['None', 'pca'],
        'normalize': [False, True],
        'osa': [True]
    }
    # parameters relevant for lasso, ridge and elasticnet
    if algo in ['lasso', 'ridge', 'elasticnet']:
        param_grid['alpha'] = [10**x for x in range(-5, 5)]
        if algo == 'elasticnet':
            param_grid['l1_ratio'] = np.arange(0.1, 1, 0.1)
        # random sample from parameter grid: all combis for lasso, ridge, elasticnet
        params_lst = TrainHelper.random_sample_parameter_grid(
            param_grid=param_grid, sample_share=1)
    # parameters relevant for bayesian ridge and ard regression
    else:
        param_grid['alpha_1'] = [10**x for x in range(-6, 1)]
        param_grid['alpha_2'] = [10**x for x in range(-6, -4)]
        param_grid['lambda_1'] = [10**x for x in range(-6, 1)]
        param_grid['lambda_2'] = [10**x for x in range(-6, 1)]
        # random sample from parameter grid: 0.2 share for bayesridge
        params_lst = TrainHelper.random_sample_parameter_grid(
            param_grid=param_grid, sample_share=0.2)
        if algo == 'ard':
            param_grid['threshold_lambda'] = [10**x for x in range(2, 6)]
            # random sample from parameter grid: 0.2 share for ard
            params_lst = TrainHelper.random_sample_parameter_grid(
                param_grid=param_grid, sample_share=0.2)
    # remove non-relevant featureset imputation combis
    if not multiple_nans_raw_set:
        params_lst_small = params_lst.copy()
        for param_set in params_lst:
            feat = param_set['featureset']
            imp = param_set['imputation']
            if (feat == 'cal' or feat == 'none') and (imp == 'iterative'
                                                      or imp == 'knn'):
                params_lst_small.remove(param_set)
        params_lst = params_lst_small

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        normalize = params_lst[i]['normalize']
        one_step_ahead = params_lst[i]['osa']
        l1_ratio = params_lst[i]['l1_ratio'] if 'l1_ratio' in params_lst[
            i] else None
        alpha = params_lst[i]['alpha'] if 'alpha' in params_lst[i] else None
        alpha_1 = params_lst[i]['alpha_1'] if 'alpha_1' in params_lst[
            i] else None
        alpha_2 = params_lst[i]['alpha_2'] if 'alpha_2' in params_lst[
            i] else None
        lambda_1 = params_lst[i]['lambda_1'] if 'lambda_1' in params_lst[
            i] else None
        lambda_2 = params_lst[i]['lambda_2'] if 'lambda_2' in params_lst[
            i] else None
        threshold_lambda = params_lst[i][
            'threshold_lambda'] if 'threshold_lambda' in params_lst[i] else None

        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsMLR.MultipleLinearRegression(
                    model_to_use=algo,
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    one_step_ahead=one_step_ahead,
                    normalize=normalize,
                    l1_ratio=l1_ratio,
                    alpha=alpha,
                    alpha_1=alpha_1,
                    alpha_2=alpha_2,
                    lambda_1=lambda_1,
                    lambda_2=lambda_2,
                    threshold_lambda=threshold_lambda)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset': dataset.name,
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'algo': model.name,
                'normalize': normalize,
                'alpha': alpha,
                'l1_ratio': l1_ratio,
                'alpha_1': alpha_1,
                'alpha_2': alpha_2,
                'lambda_1': lambda_1,
                'lambda_2': lambda_2,
                'threshold_lambda': threshold_lambda,
                'one_step_ahead': one_step_ahead,
                'fitted_coef': model.model.coef_,
                'fitted_intercept': model.model.intercept_
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'normalize':
                normalize,
                'alpha':
                alpha,
                'l1_ratio':
                l1_ratio,
                'alpha_1':
                alpha_1,
                'alpha_2':
                alpha_2,
                'lambda_1':
                lambda_1,
                'lambda_2':
                lambda_2,
                'threshold_lambda':
                threshold_lambda,
                'one_step_ahead':
                one_step_ahead,
                'fitted_coef':
                'failed',
                'fitted_intercept':
                'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=algo,
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
Example No. 24
    ])
    doc_results = pd.DataFrame(columns=columns)

    rmse_dict = {}
    rmse_ratio_dict = {}
    n_cps_detected_dict = {}
    n_refits_dict = {}
    rmse_base_dict = {}

    # iterate over all seasonal lengths
    for seas_len in seasons_list:
        print('+++++++++++ Seasonal Length ' + str(seas_len) + ' +++++++++++')
        # create base data
        season_length = seas_len
        X = TrainHelper.get_periodic_noisy_x(x_base=np.linspace(
            -0.5 * math.pi, 1.5 * math.pi, season_length),
                                             n_periods=n_periods)
        Y = TrainHelper.noisy_sin(X)
        data = pd.DataFrame(columns=['X', 'Y'])
        data['X'] = X
        data['Y'] = Y
        train_ind = int(0.6 * data.shape[0])
        train = data[0:train_ind]
        # Train offline base model
        target_column = 'Y'
        kernel = ExpSineSquared()
        alpha = 0.1
        n_restarts_optimizer = 10
        standardize = False
        normalize_y = True
        model_sine = ModelsGaussianProcessRegression.GaussianProcessRegression(
def run_sarimax_optim(target_column: str, split_perc: float, imputation: str,
                      featureset: str, univariate: bool):
    """
    Run whole (S)ARIMA(X) optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    :param univariate: whether to also run the univariate SARIMA version (without exogenous features)
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'p': [0, 1, 2, 3],
        'd': [0, 1],
        'q': [0, 1, 2, 3],
        'P': [0, 1, 2, 3],
        'D': [0, 1],
        'Q': [0, 1, 2, 3],
        'osa': [True],
        'transf': [False, 'log', 'pw'],
        'exog': [True],
        'wi': [True]
    }
    if univariate:
        param_grid['exog'] = [False, True]
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        p = params_lst[i]['p']
        d = params_lst[i]['d']
        q = params_lst[i]['q']
        P = params_lst[i]['P']
        D = params_lst[i]['D']
        Q = params_lst[i]['Q']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)
        use_exog = params_lst[i]['exog']
        with_interc = params_lst[i]['wi']
        order = [p, d, q]
        seasonal_order = [P, D, Q, seasonal_periods]

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsARIMA.ARIMA(target_column=target_column,
                                          order=order,
                                          seasonal_order=seasonal_order,
                                          one_step_ahead=one_step_ahead,
                                          power_transf=power,
                                          log=log,
                                          use_exog=use_exog,
                                          with_intercept=with_interc)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'order':
                order,
                'seasonal_order':
                seasonal_order,
                'one_step_ahead':
                one_step_ahead,
                'power_transform':
                power,
                'log_transform':
                log,
                'use_exog':
                use_exog,
                'with_intercept':
                with_interc
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'order':
                order,
                'seasonal_order':
                seasonal_order,
                'one_step_ahead':
                one_step_ahead,
                'power_transform':
                power,
                'log_transform':
                log,
                'use_exog':
                use_exog,
                'with_intercept':
                with_interc
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='sarima-x',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
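
# Illustrative usage sketch (not part of the original example): launching a single
# (S)ARIMA(X) optimization run. The target column and split share below are placeholder
# assumptions; real runs take them from the calling script's command line.
if __name__ == '__main__':
    run_sarimax_optim(target_column='sales',
                      split_perc=0.8,
                      imputation='mean',
                      featureset='full',
                      univariate=True)
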
Example No. 26
def run_gp_optim(company: str, target_column: str, split_perc: float,
                 imputation: str, featureset: str):
    """
    Run GPR offline optimization loop
    :param company: prefix for data in case company data is also used
    :param target_column: target column to use
    :param split_perc: share of train data
    :param imputation: imputation method
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                    split_perc=split_perc)

    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         company=company,
                                         target_column=target_column)

    # prepare parameter grid
    kernels = []
    base_kernels = [
        ConstantKernel(constant_value=1000, constant_value_bounds=(1e-5, 1e5)),
        Matern(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
        ExpSineSquared(length_scale=1.0,
                       periodicity=seasonal_periods,
                       length_scale_bounds=(1e-5, 1e5),
                       periodicity_bounds=(int(seasonal_periods * 0.8),
                                           int(seasonal_periods * 1.2))),
        RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
        RationalQuadratic(length_scale=1.0,
                          alpha=1.0,
                          length_scale_bounds=(1e-5, 1e5),
                          alpha_bounds=(1e-5, 1e5)),
        WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 1e5))
    ]
    TrainHelper.extend_kernel_combinations(kernels=kernels,
                                           base_kernels=base_kernels)
    param_grid = {
        'dataset': [datasets[0]],
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'kernel': kernels,
        'alpha': [1e-5, 1e-3, 1e-1, 1, 1e1, 1e3],
        'n_restarts_optimizer': [0, 5, 10],
        'standardize': [False, True],
        'norm_y': [False, True],
        'osa': [False]
    }
    # random sample from parameter grid
    sample_share = 0.1
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=sample_share)

    doc_results = None
    best_rmse = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        kernel = params_lst[i]['kernel']
        alpha = params_lst[i]['alpha']
        n_restarts_optimizer = params_lst[i]['n_restarts_optimizer']
        stand = params_lst[i]['standardize']
        norm_y = params_lst[i]['norm_y']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction can only be done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGaussianProcessRegression.GaussianProcessRegression(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    kernel=kernel,
                    alpha=alpha,
                    n_restarts_optimizer=n_restarts_optimizer,
                    one_step_ahead=one_step_ahead,
                    standardize=stand,
                    normalize_y=norm_y)
                cross_val_dict = model.train(train=train, cross_val_call=True)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel,
                'alpha':
                alpha,
                'n_restarts_optimizer':
                n_restarts_optimizer,
                'standardize':
                stand,
                'normalize_y':
                norm_y,
                'one_step_ahead':
                one_step_ahead,
                'optimized_kernel':
                model.model.kernel_
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel,
                'alpha':
                alpha,
                'n_restarts_optimizer':
                n_restarts_optimizer,
                'standardize':
                stand,
                'normalize_y':
                norm_y,
                'one_step_ahead':
                one_step_ahead,
                'optimized_kernel':
                'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=company +
                                 '-gp-sklearn_raw',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
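
# Illustrative usage sketch (not from the original file): a single offline GPR
# optimization run on company data. The company prefix, target column and split share
# are placeholder assumptions.
if __name__ == '__main__':
    run_gp_optim(company='CompanyA',
                 target_column='sales',
                 split_perc=0.8,
                 imputation='mean',
                 featureset='full')
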
Example No. 27
def run_gp_optim(target_column: str, split_perc: float, imputation: str,
                 featureset: str):
    """
    Run whole GPR optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    kernels = []
    base_kernels = [
        SquaredExponential(),
        Matern52(),
        White(),
        RationalQuadratic(),
        Polynomial()
    ]
    # iterate over a snapshot of the list so the Periodic kernels appended below
    # are not visited again in this loop
    for kern in list(base_kernels):
        if isinstance(kern, IsotropicStationary):
            base_kernels.append(Periodic(kern, period=seasonal_periods))
    TrainHelper.extend_kernel_combinations(kernels=kernels,
                                           base_kernels=base_kernels)
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'kernel': kernels,
        'mean_function': [None, gpflow.mean_functions.Constant()],
        'noise_variance': [0.01, 1, 10, 100],
        'optimizer': [gpflow.optimizers.Scipy()],
        'standardize_x': [False, True],
        'standardize_y': [False, True],
        'osa': [True]
    }
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        # deepcopy to prevent impact of previous optimizations
        kernel = gpflow.utilities.deepcopy(params_lst[i]['kernel'])
        mean_fct = gpflow.utilities.deepcopy(params_lst[i]['mean_function'])
        noise_var = params_lst[i]['noise_variance']
        optimizer = gpflow.utilities.deepcopy(params_lst[i]['optimizer'])
        stand_x = params_lst[i]['standardize_x']
        stand_y = params_lst[i]['standardize_y']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        kernel_string, mean_fct_string, optimizer_string = get_docresults_strings(
            kernel=kernel, mean_function=mean_fct, optimizer=optimizer)
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGPR.GaussianProcessRegressionGPFlow(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    kernel=kernel,
                    mean_function=mean_fct,
                    noise_variance=noise_var,
                    optimizer=optimizer,
                    standardize_x=stand_x,
                    standardize_y=stand_y,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel_string,
                'mean_function':
                mean_fct_string,
                'noise_variance':
                noise_var,
                'optimizer':
                optimizer_string,
                'standardize_x':
                stand_x,
                'standardize_y':
                stand_y,
                'one_step_ahead':
                one_step_ahead,
                'optim_mod_params':
                model.model.parameters
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel_string,
                'mean_function':
                mean_fct_string,
                'noise_variance':
                noise_var,
                'optimizer':
                optimizer_string,
                'standardize_x':
                stand_x,
                'standardize_y':
                stand_y,
                'one_step_ahead':
                one_step_ahead,
                'optim_mod_params':
                'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='gpr',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
Example No. 28


if __name__ == '__main__':
    target_column = str(sys.argv[1])
    split_perc = float(sys.argv[2])
    imputations = ['mean', 'iterative', 'knn']
    featuresets = ['full', 'cal', 'stat', 'none']
    imp_feat_combis = TrainHelper.get_imputation_featureset_combis(
        imputations=imputations,
        featuresets=featuresets,
        target_column=target_column)
    for (imputation, featureset) in imp_feat_combis:
        new_pid = os.fork()
        if new_pid == 0:
            run_gp_optim(target_column=target_column,
                         split_perc=split_perc,
                         imputation=imputation,
                         featureset=featureset)
            sys.exit()
        else:
            os.waitpid(new_pid, 0)
            print('finished run with ' + featureset + ' ' +
                  str('None' if imputation is None else imputation))
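
# The fork-based driver above runs each imputation/featureset combination in its own
# child process, presumably to keep the runs' state and memory isolated. A roughly
# equivalent, hedged sketch using multiprocessing (e.g. for platforms without os.fork)
# could look like this; argument handling mirrors the original driver.
import multiprocessing

if __name__ == '__main__':
    target_column = str(sys.argv[1])
    split_perc = float(sys.argv[2])
    imp_feat_combis = TrainHelper.get_imputation_featureset_combis(
        imputations=['mean', 'iterative', 'knn'],
        featuresets=['full', 'cal', 'stat', 'none'],
        target_column=target_column)
    for (imputation, featureset) in imp_feat_combis:
        proc = multiprocessing.Process(target=run_gp_optim,
                                       kwargs={'target_column': target_column,
                                               'split_perc': split_perc,
                                               'imputation': imputation,
                                               'featureset': featureset})
        proc.start()
        proc.join()
        print('finished run with ' + featureset + ' ' +
              str('None' if imputation is None else imputation))
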
Example No. 29
def run_evars_gpr(base_model: ModelsGaussianProcessRegression.GaussianProcessRegression,
                  data: pd.DataFrame, season_length: int, target_column: str, train_ind: int,
                  comparison_partners: bool = False, da: str = 'scaled', cpd: str = 'cf',
                  scale_thr: float = 0.1, scale_seasons: int = 2,
                  scale_window: int = None, scale_window_factor: float = 0.1, scale_window_minimum: int = 2,
                  const_hazard: int = None, const_hazard_factor: int = 2,
                  cf_r: float = 0.4, cf_order: int = 1, cf_smooth: int = 4, cf_thr_perc: int = 90,
                  append: str = 'no', max_samples: int = None, max_samples_factor: int = 10,
                  o_perc: float = 1.1, u_perc: float = 0.1, thr: float = 0.2, under_samp: bool = False,
                  rel_thr: float = 0.5, rel_coef: float = 1.5, verbose: bool = False):
    """
    Run EVARS-GPR algo
    :param base_model: base model fitted during offline phase
    :param data: data to use
    :param season_length: length of one season
    :param target_column: target column
    :param train_ind: index of last train sample
    :param comparison_partners: specify whether to include comparison partners in optimization loop
    :param da: data augmentation method
    :param cpd: change point detection method
    :param scale_thr: threshold for output scaling factor
    :param scale_seasons: number of seasons to consider for calculation of output scaling factor
    :param scale_window: number of samples prior to change point for calculation of output scaling factor
    :param scale_window_factor: scale window as a multiple of the season length
    :param scale_window_minimum: minimum of the scale window
    :param const_hazard: constant hazard value in case of bocpd
    :param const_hazard_factor: constant hazard value as a multiple of the season length
    :param cf_r: r value (forgetting factor) for changefinder
    :param cf_order: order of SDAR models for changefinder
    :param cf_smooth: smoothing constant for changefinder
    :param cf_thr_perc: percentile of offline anomaly scores to use for declaration of a change point
    :param append: specify whether to append original and scaled dataset for da or not
    :param max_samples: maximum samples to consider for data augmentation
    :param max_samples_factor: maximum samples to consider for data augmentation as a multiple of the season length
    :param o_perc: oversampling percentage for GN
    :param u_perc: undersampling percentage for GN
    :param thr: threshold for GN
    :param under_samp: specify whether to undersample for SMOGN
    :param rel_thr: relevance threshold for SMOGN
    :param rel_coef: relevance coefficient for SMOGN
    :param verbose: print debug info
    :return: list of detected change points, evars-gpr predictions, dictionary with predictions of comparison partners,
    number of refits
    """
    scale_window = max(scale_window_minimum, int(scale_window_factor * season_length)) \
        if scale_window is None else scale_window
    const_hazard = const_hazard_factor * season_length if const_hazard is None else const_hazard
    max_samples = max_samples_factor * season_length if max_samples is None else max_samples
    data = data.copy()
    data.reset_index(drop=True, inplace=True)
    train = data[:train_ind]
    # setup cpd
    y_deseas = data[target_column].diff(season_length).dropna().values
    y_train_deseas = y_deseas[:train_ind-season_length]
    if cpd == 'bocd':
        mean = np.mean(y_train_deseas)
        std = np.std(y_train_deseas)
        train_std = (y_train_deseas - mean) / std
        bc = bocd.BayesianOnlineChangePointDetection(bocd.ConstantHazard(const_hazard),
                                                     bocd.StudentT(mu=0, kappa=1, alpha=1, beta=1))
        for i, d_bocd_train in enumerate(train_std):
            bc.update(d_bocd_train)
    elif cpd == 'cf':
        scores = []
        cf = changefinder.ChangeFinder(r=cf_r, order=cf_order, smooth=cf_smooth)
        for i in y_train_deseas:
            scores.append(cf.update(i))
        cf_threshold = np.percentile(scores, cf_thr_perc)
        if verbose:
            print('CF_Scores_Train: threshold=' + str(cf_threshold)
                  + ', mean=' + str(np.mean(scores)) + ', max=' + str(np.max(scores))
                  + ', 70perc=' + str(np.percentile(scores, 70)) + ', 80perc=' + str(np.percentile(scores, 80))
                  + ', 90perc=' + str(np.percentile(scores, 90)) + ', 95perc=' + str(np.percentile(scores, 95))
                  )
    # online part
    test = data[train_ind:]
    y_train_deseas_manip = y_train_deseas.copy()
    rt_mle = np.empty(test[target_column].shape)
    predictions = None
    train_manip = train.copy()
    model = copy.deepcopy(base_model)
    # setup comparison partners
    if comparison_partners:
        model_cpd_retrain_full = copy.deepcopy(base_model)
        predictions_cpd_retrain_full = None
        model_cpd_moving_window_full = copy.deepcopy(base_model)
        predictions_cpd_moving_window_full = None
        predictions_cpd_scaled_full = None
    cp_detected = []
    output_scale_old = 1
    output_scale = 1
    n_refits = 0
    # iterate over whole test set
    for index in test.index:
        sample = test.loc[index]
        train_manip = train_manip.append(sample)
        # predict next target value
        prediction = model.predict(test=sample.to_frame().T, train=train_manip)
        if predictions is None:
            predictions = prediction.copy()
        else:
            predictions = predictions.append(prediction)
        # get predictions of comparison partners if specified
        if comparison_partners:
            prediction_cpd_retrain_full = model_cpd_retrain_full.predict(test=sample.to_frame().T, train=train_manip)
            prediction_cpd_moving_window_full = model_cpd_moving_window_full.predict(test=sample.to_frame().T,
                                                                                     train=train_manip)
            prediction_cpd_scaled_full = prediction.copy()
            prediction_cpd_scaled_full *= output_scale_old
            if predictions_cpd_retrain_full is None:
                predictions_cpd_retrain_full = prediction_cpd_retrain_full.copy()
                predictions_cpd_moving_window_full = prediction_cpd_moving_window_full.copy()
                predictions_cpd_scaled_full = prediction_cpd_scaled_full.copy()
            else:
                predictions_cpd_retrain_full = predictions_cpd_retrain_full.append(prediction_cpd_retrain_full)
                predictions_cpd_moving_window_full = \
                    predictions_cpd_moving_window_full.append(prediction_cpd_moving_window_full)
                predictions_cpd_scaled_full = predictions_cpd_scaled_full.append(prediction_cpd_scaled_full)
        # CPD
        change_point_detected = False
        y_deseas = sample[target_column] - data.loc[index-season_length][target_column]
        if cpd == 'bocd':
            d_bocd = (y_deseas - mean) / std
            bc.update(d_bocd)
            rt_mle_index = index-train_ind
            rt_mle[rt_mle_index] = bc.rt
            y_train_deseas_manip = np.append(y_train_deseas_manip, y_deseas)
            mean = np.mean(y_train_deseas_manip)
            std = np.std(y_train_deseas_manip)
            if rt_mle_index > 0 and (rt_mle[rt_mle_index] - rt_mle[rt_mle_index-1] < 0):
                change_point_detected = True
                curr_ind = rt_mle_index
        elif cpd == 'cf':
            score = cf.update(y_deseas)
            scores.append(score)
            if score >= cf_threshold:
                if verbose:
                    print('Anomaly Score ' + str(score) + ' > ' + 'threshold ' + str(cf_threshold))
                change_point_detected = True
                curr_ind = index - train_ind
        # Trigger remaining EVARS-GPR procedures if a change point is detected
        if change_point_detected:
            if verbose:
                print('CP Detected ' + str(curr_ind + train.shape[0]))
            cp_detected.append(curr_ind)
            try:
                # Calculate output scaling factor
                change_point_index = curr_ind + train.shape[0]
                mean_now = np.mean(data[change_point_index-scale_window+1:change_point_index+1][target_column])
                mean_prev_seas_1 = \
                    np.mean(data[change_point_index-season_length-scale_window+1:change_point_index-season_length+1]
                            [target_column])
                mean_prev_seas_2 = \
                    np.mean(data[change_point_index-2*season_length-scale_window+1:change_point_index-2*season_length+1]
                            [target_column])
                if scale_seasons == 1:
                    output_scale = mean_now / mean_prev_seas_1
                elif scale_seasons == 2:
                    output_scale = np.mean([mean_now / mean_prev_seas_1, mean_now / mean_prev_seas_2])
                if output_scale == 0:
                    raise Exception
                if verbose:
                    print('ScaleDiff=' + str(np.abs(output_scale - output_scale_old) / output_scale_old))
                # Check deviation to previous scale factor
                if np.abs(output_scale - output_scale_old) / output_scale_old > scale_thr:
                    n_refits += 1
                    if verbose:
                        print('try to retrain model: ' + str(change_point_index)
                              + ' , output_scale=' + str(output_scale))
                    if output_scale > 1:
                        focus = 'high'
                    else:
                        focus = 'low'
                    # augment data
                    train_samples = TrainHelper.get_augmented_data(data=data, target_column=target_column, da=da,
                                                                   change_point_index=curr_ind + train.shape[0],
                                                                   output_scale=output_scale,
                                                                   rel_coef=rel_coef, rel_thr=rel_thr,
                                                                   under_samp=under_samp, focus=focus,
                                                                   o_perc=o_perc, u_perc=u_perc, thr=thr,
                                                                   append=append, max_samples=max_samples)
                    # retrain current model
                    model = ModelsGaussianProcessRegression.GaussianProcessRegression(
                        target_column=base_model.target_column, seasonal_periods=base_model.seasonal_periods,
                        kernel=base_model.model.kernel_, alpha=base_model.model.alpha,
                        n_restarts_optimizer=base_model.model.n_restarts_optimizer,
                        standardize=base_model.standardize, normalize_y=base_model.model.normalize_y,
                        one_step_ahead=base_model.one_step_ahead)
                    model.train(train_samples, cross_val_call=False)
                    if comparison_partners:
                        train_data = data.copy()[:change_point_index+1]
                        # cpd Retrain
                        model_cpd_retrain_full = ModelsGaussianProcessRegression.GaussianProcessRegression(
                            target_column=base_model.target_column, seasonal_periods=base_model.seasonal_periods,
                            kernel=base_model.model.kernel_, alpha=base_model.model.alpha,
                            n_restarts_optimizer=base_model.model.n_restarts_optimizer,
                            standardize=base_model.standardize, normalize_y=base_model.model.normalize_y,
                            one_step_ahead=base_model.one_step_ahead)
                        model_cpd_retrain_full.train(train_data, cross_val_call=False)
                        # Moving Window
                        model_cpd_moving_window_full = ModelsGaussianProcessRegression.GaussianProcessRegression(
                            target_column=base_model.target_column, seasonal_periods=base_model.seasonal_periods,
                            kernel=base_model.model.kernel_, alpha=base_model.model.alpha,
                            n_restarts_optimizer=base_model.model.n_restarts_optimizer,
                            standardize=base_model.standardize, normalize_y=base_model.model.normalize_y,
                            one_step_ahead=base_model.one_step_ahead)
                        model_cpd_moving_window_full.train(train_data[-season_length:], cross_val_call=False)
                    # in case of a successful refit change output_scale_old
                    output_scale_old = output_scale
            except Exception as exc:
                print(exc)
    if comparison_partners:
        comparison_partners_dict = {'cpd_retrain_full': predictions_cpd_retrain_full,
                                    'cpd_cpd_moving_window_full': predictions_cpd_moving_window_full,
                                    'cpd_scaled_full': predictions_cpd_scaled_full
                                    }
    else:
        comparison_partners_dict = {}
    return cp_detected, predictions, comparison_partners_dict, n_refits
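
# Illustrative usage sketch for run_evars_gpr on synthetic seasonal data, mirroring the
# noisy-sine setup shown earlier. Season length, number of periods and the base-model
# settings below are assumptions for demonstration, not values from the original
# experiments.
season_length = 50
n_periods = 20
X = TrainHelper.get_periodic_noisy_x(
    x_base=np.linspace(-0.5 * math.pi, 1.5 * math.pi, season_length),
    n_periods=n_periods)
Y = TrainHelper.noisy_sin(X)
data = pd.DataFrame(columns=['X', 'Y'])
data['X'] = X
data['Y'] = Y
train_ind = int(0.6 * data.shape[0])
base_model = ModelsGaussianProcessRegression.GaussianProcessRegression(
    target_column='Y',
    seasonal_periods=season_length,
    kernel=ExpSineSquared(),
    alpha=0.1,
    n_restarts_optimizer=10,
    one_step_ahead=False,
    standardize=False,
    normalize_y=True)
base_model.train(train=data[:train_ind], cross_val_call=False)
cp_detected, evars_predictions, comparison_dict, n_refits = run_evars_gpr(
    base_model=base_model,
    data=data,
    season_length=season_length,
    target_column='Y',
    train_ind=train_ind,
    cpd='cf',
    da='scaled',
    verbose=True)
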
def run_ann_optim(target_column: str, split_perc: float, imputation: str,
                  featureset: str):
    """
    Run whole ANN optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'dropout_rate': [0.0, 0.5],
        'batch_size': [4, 8, 16, 32],
        'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1],
        'min_val_loss_improvement': [100, 1000],
        'max_epochs_wo_improvement': [20, 50, 100],
        'n_hidden': [10, 20, 50, 100],
        'num_hidden_layer': [1, 2, 3],
        'osa': [True]
    }
    # random samples from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.1)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        dropout_rate = params_lst[i]['dropout_rate']
        batch_size = params_lst[i]['batch_size']
        learning_rate = params_lst[i]['learning_rate']
        min_val_loss_improvement = params_lst[i]['min_val_loss_improvement']
        max_epochs_wo_improvement = params_lst[i]['max_epochs_wo_improvement']
        one_step_ahead = params_lst[i]['osa']
        n_hidden = params_lst[i]['n_hidden']
        num_hidden_layer = params_lst[i]['num_hidden_layer']

        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsANN.AnnRegression(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    one_step_ahead=one_step_ahead,
                    n_feature=train.shape[1] - 1,
                    n_hidden=n_hidden,
                    num_hidden_layer=num_hidden_layer,
                    dropout_rate=dropout_rate,
                    batch_size=batch_size,
                    learning_rate=learning_rate,
                    min_val_loss_improvement=min_val_loss_improvement,
                    max_epochs_wo_improvement=max_epochs_wo_improvement)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'dropout_rate':
                dropout_rate,
                'batch_size':
                batch_size,
                'learning_rate':
                learning_rate,
                'min_val_loss_improvement':
                min_val_loss_improvement,
                'max_epochs_wo_improvement':
                max_epochs_wo_improvement,
                'n_hidden':
                n_hidden,
                'num_hidden_layer':
                num_hidden_layer,
                'one_step_ahead':
                one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'dropout_rate':
                dropout_rate,
                'batch_size':
                batch_size,
                'learning_rate':
                learning_rate,
                'min_val_loss_improvement':
                min_val_loss_improvement,
                'max_epochs_wo_improvement':
                max_epochs_wo_improvement,
                'n_hidden':
                n_hidden,
                'num_hidden_layer':
                num_hidden_layer,
                'one_step_ahead':
                one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='ANN',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
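
# Illustrative usage sketch (not part of the original example): running the ANN
# optimization loop for each featureset with one imputation method. The target column
# and split share are placeholder assumptions.
if __name__ == '__main__':
    for featureset in ['full', 'cal', 'stat', 'none']:
        run_ann_optim(target_column='sales',
                      split_perc=0.8,
                      imputation='mean',
                      featureset=featureset)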