def __init__(self, n_feature: int, n_hidden: int, num_hidden_layer: int, n_output: int = 1,
             dropout_rate: float = 0.0):
    """
    :param n_feature: number of features for ANN input
    :param n_hidden: number of hidden neurons (first hidden layer)
    :param num_hidden_layer: number of hidden layers
    :param n_output: number of outputs
    :param dropout_rate: probability of element being zeroed in dropout layer
    """
    super(ANN, self).__init__()
    TrainHelper.init_pytorch_seeds()
    self.hidden_layer = nn.ModuleList()
    hidden_in = n_feature
    hidden_out = n_hidden
    for layer_num in range(num_hidden_layer):
        self.hidden_layer.append(nn.Linear(in_features=hidden_in, out_features=hidden_out))
        hidden_in = hidden_out
        hidden_out = int(hidden_in / 2)
    self.output_layer = nn.Linear(in_features=hidden_in, out_features=n_output)
    self.dropout = nn.Dropout(p=dropout_rate)
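# Illustration (sketch, not part of the original class): the loop above halves the neuron count
# with every additional hidden layer. With the hypothetical arguments n_feature=20, n_hidden=32,
# num_hidden_layer=3, the resulting architecture is
#   Linear(20, 32) -> Linear(32, 16) -> Linear(16, 8) -> Linear(8, n_output)
# with dropout applied in between, e.g.:
#
#   model = ANN(n_feature=20, n_hidden=32, num_hidden_layer=3, n_output=1, dropout_rate=0.1)
#   print(model)  # lists the three hidden Linear layers plus the output layer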
def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver (if specified, one step ahead) out-of-sample predictions
    :param test: test set
    :param train: train set
    :return: DataFrame with predictions
    """
    TrainHelper.init_pytorch_seeds()
    x_test = torch.tensor(data=self.x_scaler.transform(
        test.drop(self.target_column, axis=1)).astype(np.float32))
    if self.one_step_ahead:
        train_manip = train.copy()
        predict_lst = []
        # deep copy model as predict function should not change class model
        model = copy.deepcopy(self.model)
        for i in range(0, test.shape[0]):
            model.eval()
            # predict on cpu
            model.to(torch.device("cpu"))
            fc = model(x=x_test[i].view(1, -1)).item()
            train_manip = train_manip.append(test.iloc[[i]])
            self.update(train=train_manip, model=model)
            predict_lst.append(fc)
        predict = np.array(predict_lst).flatten()
    else:
        self.model.eval()
        # predict on cpu
        self.model.to(torch.device("cpu"))
        predict = self.model(x=x_test).data.numpy().flatten()
    predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
    return predictions
def create_train_valid_sets(self, train: pd.DataFrame) -> tuple:
    """
    Create train and validation sets as well as a DataLoader delivering the train data in batches
    :param train: train dataset
    :return: DataLoader with batches of train data as well as validation data
    """
    TrainHelper.init_pytorch_seeds()
    # create train and validation set
    valid_size = 0.2
    split_ind = int(train.shape[0] * (1 - valid_size))
    train_data = train.iloc[:split_ind]
    valid_data = train.iloc[split_ind:]
    # scale input data
    x_train = self.x_scaler.fit_transform(train_data.drop(self.target_column, axis=1))
    x_valid = self.x_scaler.transform(valid_data.drop(self.target_column, axis=1))
    # create train ready data
    x_train = torch.tensor(x_train.astype(np.float32))
    x_valid = torch.tensor(x_valid.astype(np.float32))
    y_train = torch.tensor(data=train_data[self.target_column].values.reshape(-1, 1).astype(np.float32))
    y_valid = torch.tensor(data=valid_data[self.target_column].values.reshape(-1, 1).astype(np.float32))
    train_loader = torch.utils.data.DataLoader(
        dataset=torch.utils.data.TensorDataset(x_train, y_train), batch_size=self.batch_size,
        shuffle=False, drop_last=False, worker_init_fn=np.random.seed(0))
    return train_loader, x_valid, y_valid
def create_train_valid_sequence_sets(self, train: pd.DataFrame) -> tuple:
    """
    Create sequenced train and validation sets as well as a DataLoader delivering sequenced train batches
    :param train: train data to use
    :return: DataLoader with batches of sequenced train data as well as sequenced validation data
    """
    TrainHelper.init_pytorch_seeds()
    # scale input data
    x_train_scaled = self.x_scaler.fit_transform(train.drop(self.target_column, axis=1))
    y_train_scaled = self.y_scaler.fit_transform(train[self.target_column].values.reshape(-1, 1))
    # create sequences
    x_seq_train, y_seq_train = self.create_sequences(data=np.hstack((x_train_scaled, y_train_scaled)))
    # split into train and validation set
    valid_size = 0.2
    split_ind = int(x_seq_train.shape[0] * (1 - valid_size))
    x_train = torch.tensor(x_seq_train[:split_ind, :, :].astype(np.float32))
    x_valid = torch.tensor(x_seq_train[split_ind:, :, :].astype(np.float32))
    y_train = torch.tensor(y_seq_train[:split_ind].reshape(-1, 1).astype(np.float32))
    y_valid = torch.tensor(y_seq_train[split_ind:].reshape(-1, 1).astype(np.float32))
    train_loader = torch.utils.data.DataLoader(
        dataset=torch.utils.data.TensorDataset(x_train, y_train), batch_size=self.batch_size,
        shuffle=False, drop_last=False)
    return train_loader, x_valid, y_valid
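# Sketch (assumption, not the original implementation) of the create_sequences interface used above:
# a sliding window of length self.seq_length over the hstacked, scaled data, returning a 3-D feature
# array plus the target value of the step following each window; whether the target column itself
# remains part of the input window is up to the actual implementation elsewhere in the class.
#
#   def create_sequences(self, data: np.ndarray) -> tuple:
#       xs, ys = [], []
#       for i in range(data.shape[0] - self.seq_length):
#           xs.append(data[i:i + self.seq_length, :])  # window of seq_length consecutive rows
#           ys.append(data[i + self.seq_length, -1])   # target (last column) of the following step
#       return np.array(xs), np.array(ys)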
def run_pytorch_optim_loop(self, train_loader, x_valid, y_valid, model, checkpoint_name: str = 'train'):
    """
    Run the PyTorch training loop with early stopping
    :param train_loader: DataLoader with sequenced train batches
    :param x_valid: sequenced validation data
    :param y_valid: validation labels
    :param model: model to optimize
    :param checkpoint_name: save name for best checkpoints
    """
    TrainHelper.init_pytorch_seeds()
    # name for checkpoint for temporary storing during optimization with early stopping
    # detailed timestamp to prevent interference with parallel running jobs using same directory
    checkpoint_name += '_' + datetime.datetime.now().strftime("%d-%b-%Y_%H-%M-%S-%f")
    min_valid_loss = 99999999
    epochs_wo_improvement_threshold = 0
    epochs_wo_improvement_total = 0
    # instantiate new optimizer to ensure independence of previous runs
    optimizer = torch.optim.Adam(params=model.parameters(), lr=self.learning_rate)
    # get device and shift model and data to it
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    x_valid, y_valid = x_valid.to(device), y_valid.to(device)
    for e in range(self.epochs):
        model.train()
        for (batch_x, batch_y) in train_loader:
            TrainHelper.init_pytorch_seeds()
            # copy data to device
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            # gradients are summed up so they need to be zeroed for new run
            optimizer.zero_grad()
            y_pred = model(batch_x)
            loss_train = self.loss(y_pred, batch_y)
            loss_train.backward()
            optimizer.step()
        model.eval()
        y_pred_valid = model(x_valid)
        loss_valid = self.loss(y_pred_valid, y_valid).item()
        if loss_valid < min_valid_loss:
            min_valid_loss = loss_valid
            epochs_wo_improvement_threshold = 0
            epochs_wo_improvement_total = 0
            torch.save(model.state_dict(), 'Checkpoints/checkpoint_' + checkpoint_name + '.pt')
        elif (loss_valid - min_valid_loss) > self.min_val_loss_improvement:
            # Early Stopping with thresholds for counter incrementing and max_epochs
            epochs_wo_improvement_threshold += 1
            if epochs_wo_improvement_threshold > self.max_epochs_wo_improvement:
                print('Early Stopping after epoch ' + str(e))
                break
        elif loss_valid >= min_valid_loss:
            # Early stopping if there is no improvement with a higher threshold
            epochs_wo_improvement_total += 1
            if epochs_wo_improvement_total > 2 * self.max_epochs_wo_improvement:
                print('Early Stopping after epoch ' + str(e))
                break
        if e % 100 == 0:
            print('Epoch ' + str(e) + ': valid loss = ' + str(loss_valid)
                  + ', min_valid_loss = ' + str(min_valid_loss))
    model.load_state_dict(state_dict=torch.load('Checkpoints/checkpoint_' + checkpoint_name + '.pt'))
    os.remove('Checkpoints/checkpoint_' + checkpoint_name + '.pt')
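# Reading of the early stopping scheme above, illustrated with the LSTM wrapper defaults
# (min_val_loss_improvement=1000, max_epochs_wo_improvement=100): since the last new minimum,
# training stops once more than 100 epochs exceeded the best validation loss by more than 1000,
# or once more than 200 epochs showed a smaller degradation without any new minimum; every new
# minimum resets both counters and overwrites the temporary checkpoint, which is restored (and
# the file removed) at the end of the loop.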
def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
    """
    Train model
    :param train: train set
    :param cross_val_call: called to perform cross validation
    :return: dictionary with cross validated scores (if specified)
    """
    TrainHelper.init_pytorch_seeds()
    cross_val_score_dict = {}
    if cross_val_call:
        cross_val_score_dict_ts, self.model = self.get_cross_val_score(train=train)
        cross_val_score_dict_shuf, self.model = self.get_cross_val_score(train=train, normal_cv=True)
        cross_val_score_dict = {**cross_val_score_dict_ts, **cross_val_score_dict_shuf}
    # create train and validation set
    train_loader, x_valid, y_valid = self.create_train_valid_sequence_sets(train=train)
    # run optim loop
    self.run_pytorch_optim_loop(train_loader=train_loader, x_valid=x_valid, y_valid=y_valid,
                                model=self.model, checkpoint_name='lstm_train')
    return cross_val_score_dict
def insample(self, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver insample predictions
    :param train: train set
    :return: DataFrame with insample predictions
    """
    TrainHelper.init_pytorch_seeds()
    self.model.eval()
    # predict on cpu
    self.model.to(torch.device("cpu"))
    # scale
    x_train_scaled = self.x_scaler.transform(train.drop(self.target_column, axis=1))
    y_train_scaled = self.y_scaler.transform(train[self.target_column].values.reshape(-1, 1))
    # create sequences
    x_seq_train, _ = self.create_sequences(data=np.hstack((x_train_scaled, y_train_scaled)))
    x_train = torch.tensor(x_seq_train.astype(np.float32))
    # predict and transform back
    y_insample = self.y_scaler.inverse_transform(self.model(x_train).data.numpy())
    # insert dummy values for train samples before first full sequence
    y_insample = np.insert(y_insample, 0, self.seq_length * [-9999])
    insample = pd.DataFrame(data=y_insample, index=train.index, columns=['Insample'])
    return insample
def __init__(self, target_column: str, seasonal_periods: int, one_step_ahead: bool, n_feature: int,
             lstm_hidden_dim: int = 10, lstm_num_layers: int = 1, seq_length: int = 7, n_output: int = 1,
             dropout_rate: float = 0.0, epochs: int = 5000, batch_size: int = 16,
             learning_rate: float = 1e-3, loss=nn.MSELoss(), min_val_loss_improvement: float = 1000,
             max_epochs_wo_improvement: int = 100):
    """
    :param target_column: target_column for prediction
    :param seasonal_periods: period of seasonality
    :param one_step_ahead: perform one step ahead prediction
    :param n_feature: number of features for LSTM input
    :param lstm_hidden_dim: dimensionality of hidden layer
    :param lstm_num_layers: depth of lstm network
    :param seq_length: sequence length for input of lstm network
    :param n_output: number of outputs
    :param dropout_rate: probability of element being zeroed in dropout layer
    :param epochs: number of epochs
    :param batch_size: size of a batch
    :param learning_rate: learning rate for optimizer
    :param loss: loss function to use
    :param min_val_loss_improvement: deviation validation loss to min_val_loss for being counted for early stopping
    :param max_epochs_wo_improvement: maximum number of epochs without improvement before early stopping
    """
    super().__init__(target_column=target_column, seasonal_periods=seasonal_periods, name='LSTM',
                     one_step_ahead=one_step_ahead)
    TrainHelper.init_pytorch_seeds()
    self.model = LSTM(n_feature=n_feature, lstm_hidden_dim=lstm_hidden_dim, lstm_num_layers=lstm_num_layers,
                      n_output=n_output, dropout_rate=dropout_rate)
    self.seq_length = seq_length
    self.optimizer = 'adam'
    self.learning_rate = learning_rate
    self.loss = loss
    self.x_scaler = sklearn.preprocessing.StandardScaler()
    self.y_scaler = sklearn.preprocessing.StandardScaler()
    self.batch_size = batch_size
    self.epochs = epochs
    self.min_val_loss_improvement = min_val_loss_improvement
    self.max_epochs_wo_improvement = max_epochs_wo_improvement
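# Minimal usage sketch (hypothetical class name, data and column names, not part of the original
# module): the wrapper is built with the number of input features of the prepared dataset and then
# trained and evaluated on a chronological train/test split.
#
#   train_df, test_df = dataset.iloc[:split_ind], dataset.iloc[split_ind:]
#   lstm_model = ModelsLSTM.LstmRegression(target_column='Sales', seasonal_periods=52,
#                                          one_step_ahead=False, n_feature=train_df.shape[1] - 1)
#   lstm_model.train(train=train_df)
#   predictions = lstm_model.predict(test=test_df, train=train_df)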
def forward(self, x):
    """
    Feedforward path
    :param x: data to process
    :return: prediction value
    """
    TrainHelper.init_pytorch_seeds()
    for layer in self.hidden_layer:
        x = F.relu(layer(x))
        x = self.dropout(x)
    out = self.output_layer(x)
    return out
def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver (back-transformed), if specified one step ahead, out-of-sample predictions
    :param test: test set
    :param train: train set
    :return: DataFrame with predictions
    """
    if (self.power_transformer is not None) or self.log:
        test = TrainHelper.get_transformed_set(dataset=test, target_column=self.target_column,
                                               power_transformer=self.power_transformer, log=self.log,
                                               only_transform=True)
        train = TrainHelper.get_transformed_set(dataset=train, target_column=self.target_column,
                                                power_transformer=self.power_transformer, log=self.log)
    if self.one_step_ahead:
        train_manip = train.copy()[self.target_column]
        predict = []
        # deep copy model as predict function should not change class model
        model_results = copy.deepcopy(self.model_results)
        for ind in test.index:
            fc = model_results.forecast()
            predict.append(fc[ind])
            train_manip = train_manip.append(pd.Series(data=test[self.target_column], index=[ind]))
            model_results = self.update(train=pd.DataFrame(data=train_manip, columns=[self.target_column]))
    else:
        predict = self.model_results.predict(start=test.index[0], end=test.index[-1])
    predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
    if self.power_transformer is not None:
        predictions = pd.DataFrame(
            {'Prediction': self.power_transformer.inverse_transform(
                predictions['Prediction'].values.reshape(-1, 1)).flatten()},
            index=predictions.index)
    if self.log:
        if self.contains_zeros:
            predictions = predictions.apply(np.exp) + 1
        else:
            predictions = predictions.apply(np.exp)
    return predictions
def forward(self, x):
    """
    Feedforward path
    :param x: data to process
    :return: prediction value
    """
    TrainHelper.init_pytorch_seeds()
    # input (batch x seq_length x input_size) (batch_first is set True)
    lstm_out, (hn, cn) = self.lstm(x.view(x.shape[0], x.shape[1], -1))
    # only take last output of sequence
    out = self.dropout(lstm_out[:, -1, :])
    out = self.output_layer(out)
    return out
def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver (if specified, one step ahead) out-of-sample predictions
    :param test: test set
    :param train: train set
    :return: DataFrame with predictions
    """
    TrainHelper.init_pytorch_seeds()
    x_train_scaled = self.x_scaler.transform(train.drop(self.target_column, axis=1))
    y_train_scaled = self.y_scaler.transform(train[self.target_column].values.reshape(-1, 1))
    x_test_scaled = self.x_scaler.transform(test.drop(self.target_column, axis=1))
    y_test_scaled = self.y_scaler.transform(test[self.target_column].values.reshape((-1, 1)))
    # add last elements of train to complete first test sequence
    x_test_full = np.vstack((x_train_scaled[-self.seq_length:], x_test_scaled))
    y_test_full = np.vstack((y_train_scaled[-self.seq_length:], y_test_scaled))
    # create test sequences
    x_seq_test, _ = self.create_sequences(data=np.hstack((x_test_full, y_test_full)))
    if self.one_step_ahead:
        predict_lst = []
        train_manip = train.copy()
        # deep copy model as predict function should not change class model
        model = copy.deepcopy(self.model)
        for i in range(0, test.shape[0]):
            test_seq = x_seq_test[i].reshape(1, self.seq_length, -1)
            model.eval()
            # predict on cpu
            model.to(torch.device("cpu"))
            fc = self.y_scaler.inverse_transform(
                model(x=torch.tensor(test_seq.astype(np.float32))).data.numpy())
            train_manip = train_manip.append(test.iloc[[i]])
            # retrain on the train set extended by the new observation
            self.update(train=train_manip, model=model)
            predict_lst.append(fc)
        predict = np.array(predict_lst).flatten()
    else:
        # predict on cpu
        self.model.to(torch.device("cpu"))
        self.model.eval()
        predict = self.y_scaler.inverse_transform(
            self.model(x=torch.tensor(x_seq_test.astype(np.float32))).data.numpy()).flatten()
    predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
    return predictions
def update(self, train: pd.DataFrame, model):
    """
    Update existing model due to new samples
    :param train: train set with new samples
    :param model: model to update
    """
    TrainHelper.init_pytorch_seeds()
    train_loader, x_valid, y_valid = self.create_train_valid_sequence_sets(train=train)
    self.run_pytorch_optim_loop(train_loader=train_loader, x_valid=x_valid, y_valid=y_valid, model=model,
                                checkpoint_name='lstm_update')
def insample(self, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver insample predictions
    :param train: train set
    :return: DataFrame with insample predictions
    """
    TrainHelper.init_pytorch_seeds()
    self.model.eval()
    # predict on cpu
    self.model.to(torch.device("cpu"))
    x_train = torch.tensor(data=self.x_scaler.transform(
        train.drop(self.target_column, axis=1)).astype(np.float32))
    insample = pd.DataFrame(data=self.model(x=x_train).data.numpy(), index=train.index, columns=['Insample'])
    return insample
def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
    """
    Train Exponential Smoothing model
    :param train: train set
    :param cross_val_call: called to perform cross validation
    :return: dictionary with cross validated scores (if specified)
    """
    cross_val_score_dict = {}
    if cross_val_call:
        cross_val_score_dict, self.model = self.get_cross_val_score(train=train)
    if (self.power_transformer is not None) or self.log:
        train = TrainHelper.get_transformed_set(dataset=train, target_column=self.target_column,
                                                power_transformer=self.power_transformer, log=self.log)
    if (0 in train[self.target_column].values) and (self.trend == 'mul' or self.seasonal == 'mul'):
        # multiplicative trend or seasonality only work with strictly positive data
        # only done if no transform was performed, otherwise values would need to be corrected a lot
        train = train.copy()
        train[self.target_column] += 0.01
    self.model = statsmodels.tsa.api.ExponentialSmoothing(
        endog=train[self.target_column], trend=self.trend, damped=self.damped, seasonal=self.seasonal,
        seasonal_periods=self.seasonal_periods)
    self.model_results = self.model.fit(remove_bias=self.remove_bias, use_brute=self.use_brute)
    return cross_val_score_dict
def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
    """
    Train (S)ARIMA(X) model
    :param train: train set
    :param cross_val_call: called to perform cross validation
    :return: dictionary with cross validated scores (if specified)
    """
    cross_val_score_dict = {}
    if cross_val_call:
        cross_val_score_dict, self.model = self.get_cross_val_score(train=train)
    train_exog = None
    if (self.power_transformer is not None) or self.log:
        train = TrainHelper.get_transformed_set(dataset=train, target_column=self.target_column,
                                                power_transformer=self.power_transformer, log=self.log)
    if self.use_exog:
        train_exog = train.drop(labels=[self.target_column], axis=1)
        self.exog_cols_dropped = train_exog.columns[train_exog.isna().any()].tolist()
        PreparationHelper.drop_columns(train_exog, self.exog_cols_dropped)
        train_exog = train_exog.to_numpy(dtype=float)
    self.model.fit(y=train[self.target_column], exogenous=train_exog, trend=self.trend)
    return cross_val_score_dict
def evaluate(self, train: pd.DataFrame, test: pd.DataFrame) -> dict:
    """
    Evaluate model against all implemented evaluation metrics and baseline methods.
    Deliver dictionary with evaluation metrics.
    :param train: train set
    :param test: test set
    :return: dictionary with evaluation metrics of model and all baseline methods
    """
    TrainHelper.init_pytorch_seeds()
    insample_rw, prediction_rw = SimpleBaselines.RandomWalk(one_step_ahead=self.one_step_ahead)\
        .get_insample_prediction(train=train, test=test, target_column=self.target_column)
    insample_seasrw, prediction_seasrw = SimpleBaselines.RandomWalk(one_step_ahead=self.one_step_ahead)\
        .get_insample_prediction(train=train, test=test, target_column=self.target_column,
                                 seasonal_periods=self.seasonal_periods)
    insample_ha, prediction_ha = SimpleBaselines.HistoricalAverage(one_step_ahead=self.one_step_ahead)\
        .get_insample_prediction(train=train, test=test, target_column=self.target_column)
    insample_model = self.insample(train=train)
    prediction_model = self.predict(test=test, train=train)
    rmse_train_rw, mape_train_rw, smape_train_rw = EvaluationHelper.get_all_eval_vals(
        actual=train[self.target_column], prediction=insample_rw['Insample'])
    rmse_test_rw, mape_test_rw, smape_test_rw = EvaluationHelper.get_all_eval_vals(
        actual=test[self.target_column], prediction=prediction_rw['Prediction'])
    rmse_train_seasrw, mape_train_seasrw, smape_train_seasrw = EvaluationHelper.get_all_eval_vals(
        actual=train[self.target_column], prediction=insample_seasrw['Insample'])
    rmse_test_seasrw, mape_test_seasrw, smape_test_seasrw = EvaluationHelper.get_all_eval_vals(
        actual=test[self.target_column], prediction=prediction_seasrw['Prediction'])
    rmse_train_ha, mape_train_ha, smape_train_ha = EvaluationHelper.get_all_eval_vals(
        actual=train[self.target_column], prediction=insample_ha['Insample'])
    rmse_test_ha, mape_test_ha, smape_test_ha = EvaluationHelper.get_all_eval_vals(
        actual=test[self.target_column], prediction=prediction_ha['Prediction'])
    rmse_train_model, mape_train_model, smape_train_model = EvaluationHelper.get_all_eval_vals(
        actual=train[self.target_column], prediction=insample_model['Insample'])
    rmse_test_model, mape_test_model, smape_test_model = EvaluationHelper.get_all_eval_vals(
        actual=test[self.target_column], prediction=prediction_model['Prediction'])
    return {'RMSE_Train_RW': rmse_train_rw, 'MAPE_Train_RW': mape_train_rw, 'sMAPE_Train_RW': smape_train_rw,
            'RMSE_Test_RW': rmse_test_rw, 'MAPE_Test_RW': mape_test_rw, 'sMAPE_Test_RW': smape_test_rw,
            'RMSE_Train_seasRW': rmse_train_seasrw, 'MAPE_Train_seasRW': mape_train_seasrw,
            'sMAPE_Train_seasRW': smape_train_seasrw,
            'RMSE_Test_seasRW': rmse_test_seasrw, 'MAPE_Test_seasRW': mape_test_seasrw,
            'sMAPE_Test_seasRW': smape_test_seasrw,
            'RMSE_Train_HA': rmse_train_ha, 'MAPE_Train_HA': mape_train_ha, 'sMAPE_Train_HA': smape_train_ha,
            'RMSE_Test_HA': rmse_test_ha, 'MAPE_Test_HA': mape_test_ha, 'sMAPE_Test_HA': smape_test_ha,
            'RMSE_Train': rmse_train_model, 'MAPE_Train': mape_train_model, 'sMAPE_Train': smape_train_model,
            'RMSE_Test': rmse_test_model, 'MAPE_Test': mape_test_model, 'sMAPE_Test': smape_test_model}
def __init__(self, n_feature: int, lstm_hidden_dim: int, lstm_num_layers: int = 1, n_output: int = 1,
             dropout_rate: float = 0.0):
    """
    :param n_feature: number of features for LSTM input
    :param lstm_hidden_dim: dimensionality of hidden layer
    :param lstm_num_layers: depth of lstm network
    :param n_output: number of outputs
    :param dropout_rate: probability of element being zeroed in dropout layer
    """
    super(LSTM, self).__init__()
    TrainHelper.init_pytorch_seeds()
    self.lstm = nn.LSTM(input_size=n_feature, hidden_size=lstm_hidden_dim, num_layers=lstm_num_layers,
                        batch_first=True, dropout=dropout_rate)
    self.dropout = nn.Dropout(p=dropout_rate)
    self.output_layer = nn.Linear(in_features=lstm_hidden_dim, out_features=n_output)
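# Shape sketch (hypothetical values, not part of the original module): with n_feature=5,
# lstm_hidden_dim=10 and a batch of 16 sequences of length 7, the forward pass shown earlier
# receives x of shape (16, 7, 5); nn.LSTM(batch_first=True) returns lstm_out of shape (16, 7, 10),
# the last time step lstm_out[:, -1, :] has shape (16, 10), and the output layer maps it to
# (16, n_output).
#
#   x = torch.randn(16, 7, 5)
#   out = LSTM(n_feature=5, lstm_hidden_dim=10)(x)  # -> torch.Size([16, 1])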
def run_es_optim(target_column: str, split_perc: float, imputation: str):
    """
    Run whole ES optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column,
                                                    split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'trend': ['add', None],
                  'damp': [False, True],
                  'seasonality': ['add', 'mul', None],
                  'remove_bias': [False, True],
                  'brute': [False, True],
                  'osa': [True],
                  'transf': [False, 'log', 'pw']}
    # random sample from parameter grid
    params_lst = sorted(list(sklearn.model_selection.ParameterSampler(
        param_distributions=param_grid,
        n_iter=int(1 * MixedHelper.get_product_len_dict(dictionary=param_grid)),
        random_state=np.random.RandomState(42))),
        key=lambda d: (d['dataset'].name, d['imputation']))
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        tr = params_lst[i]['trend']
        damp = params_lst[i]['damp']
        season = params_lst[i]['seasonality']
        remo_bias = params_lst[i]['remove_bias']
        brute = params_lst[i]['brute']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len,
                split_perc=split_perc, imputation=imputation, target_column=target_column, reset_index=True)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsES.ExponentialSmoothing(
                    target_column=target_column, trend=tr, damped=damp, seasonal=season,
                    seasonal_periods=seasonal_periods, remove_bias=remo_bias, use_brute=brute,
                    one_step_ahead=one_step_ahead, power_transf=power, log=log)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name,
                           'imputation': str('None' if imputation is None else imputation),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'trend': tr,
                           'damped': damp,
                           'seasonal': season,
                           'seasonal_periods': seasonal_periods,
                           'remove_bias': remo_bias,
                           'use_brute': brute,
                           'one_step_ahead': one_step_ahead,
                           'power_transform': power,
                           'log': log}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape,
                best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure',
                           'imputation': str('None' if imputation is None else imputation),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'trend': tr,
                           'damped': damp,
                           'seasonal': season,
                           'seasonal_periods': seasonal_periods,
                           'remove_bias': remo_bias,
                           'use_brute': brute,
                           'one_step_ahead': one_step_ahead,
                           'power_transform': power,
                           'log': log}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='es', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 imputations=param_grid['imputation'], split_perc=split_perc)
    print('Optimization Done. Saved Results.')
i = 1
for target_column in result_file_dict.keys():
    print('++++++ Processing Dataset ' + str(i) + '/' + str(len(result_file_dict.keys())) + ' ++++++')
    i += 1
    # set standard values
    split_perc = 0.8
    company = 'General'
    doc_results = None
    result_file_str = result_file_dict[target_column]
    # read config file
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, company=company,
                                                    target_column=target_column, split_perc=split_perc)
    # set const hazard and scale window based on seasonal periods
    const_hazard = const_hazard_factor * seasonal_periods if const_hazard_user == 9999 else const_hazard_user
    scale_window = max(scale_window_minimum, int(scale_window_factor * seasonal_periods))
    max_samples = max_samples_factor * seasonal_periods if max_samples_factor is not None else max_samples_user
    # read result file config
    result_file = pd.read_csv(optim_results_dir + result_file_str, sep=';', decimal=',', index_col=False)
    result_file.drop('Unnamed: 0', axis=1, inplace=True)
    result_file.replace(to_replace='NaN', value=np.nan, inplace=True)
    result_file.drop(result_file.index[result_file['shuf_cv_rmse_std'].isna()], inplace=True)
    result_file.dropna(subset=[el for el in result_file.columns if 'cv' in el], inplace=True)
    result_file.drop(result_file.index[result_file['shuf_cv_rmse_std'] == 0], inplace=True)
    sort_col = 'shuf_cv_rmse_mean'
    sorted_results = result_file.sort_values(sort_col)
    top_config = sorted_results.head(1).iloc[0]
    dict_top_config = TrainHelper.read_config_info(top_config, seasonal_periods)
def run_xgb_optim(target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run whole XGB optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column,
                                                    split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'learning_rate': [0.05, 0.1, 0.3],
                  'max_depth': [3, 5, 10],
                  'subsample': [0.3, 0.7, 1],
                  'n_estimators': [10, 100, 1000],
                  'gamma': [0, 1, 10],
                  'alpha': [0, 0.1, 1, 10],
                  'reg_lambda': [0, 0.1, 1, 10],
                  'osa': [True]}
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        learning_rate = params_lst[i]['learning_rate']
        max_depth = params_lst[i]['max_depth']
        subsample = params_lst[i]['subsample']
        n_estimators = params_lst[i]['n_estimators']
        gamma = params_lst[i]['gamma']
        alpha = params_lst[i]['alpha']
        reg_lambda = params_lst[i]['reg_lambda']
        one_step_ahead = params_lst[i]['osa']
        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset
                  + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len,
                split_perc=split_perc, imputation=imputation, target_column=target_column,
                dimensionality_reduction=dim_reduction, featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelXGBoost.XGBoostRegression(
                    target_column=target_column, seasonal_periods=seasonal_periods,
                    learning_rate=learning_rate, max_depth=max_depth, subsample=subsample,
                    n_estimators=n_estimators, gamma=gamma, alpha=alpha, reg_lambda=reg_lambda,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name,
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'learning_rate': learning_rate,
                           'max_depth': max_depth,
                           'subsample': subsample,
                           'n_estimators': n_estimators,
                           'gamma': gamma,
                           'alpha': alpha,
                           'lambda': reg_lambda,
                           'one_step_ahead': one_step_ahead}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape,
                best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure',
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'learning_rate': learning_rate,
                           'max_depth': max_depth,
                           'subsample': subsample,
                           'n_estimators': n_estimators,
                           'gamma': gamma,
                           'alpha': alpha,
                           'lambda': reg_lambda,
                           'one_step_ahead': one_step_ahead}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='xgb', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver (back-transformed), if specified one step ahead, out-of-sample predictions
    :param test: test set
    :param train: train set
    :return: DataFrame with predictions, upper and lower confidence level
    """
    test_exog = None
    if (self.power_transformer is not None) or self.log:
        test = TrainHelper.get_transformed_set(dataset=test, target_column=self.target_column,
                                               power_transformer=self.power_transformer, log=self.log,
                                               only_transform=True)
    if self.use_exog:
        test_exog = test.drop(labels=[self.target_column], axis=1)
        PreparationHelper.drop_columns(test_exog, self.exog_cols_dropped)
        test_exog = test_exog.to_numpy(dtype=float)
    if self.one_step_ahead:
        predict = []
        conf_low = []
        conf_up = []
        # deep copy model as predict function should not change class model
        model = copy.deepcopy(self.model)
        for i in range(0, test.shape[0]):
            if self.use_exog:
                fc, conf = model.predict(n_periods=1, exogenous=pd.DataFrame(test_exog[i].reshape(1, -1)),
                                         return_conf_int=True, alpha=0.05)
                model.update(test[self.target_column][i],
                             exogenous=pd.DataFrame(test_exog[i].reshape(1, -1)))
            else:
                fc, conf = model.predict(n_periods=1, return_conf_int=True, alpha=0.05)
                model.update(test[self.target_column][i])
            predict.append(fc[0])
            conf_low.append(conf[0][0])
            conf_up.append(conf[0][1])
    else:
        predict, conf = self.model.predict(n_periods=test.shape[0], exogenous=test_exog,
                                           return_conf_int=True, alpha=0.05)
        conf_low = conf[:, 0]
        conf_up = conf[:, 1]
    predictions = pd.DataFrame({'Prediction': predict, 'LowerConf': conf_low, 'UpperConf': conf_up},
                               index=test.index)
    if self.power_transformer is not None:
        predictions = pd.DataFrame(
            {'Prediction': self.power_transformer.inverse_transform(
                predictions['Prediction'].values.reshape(-1, 1)).flatten(),
             'LowerConf': self.power_transformer.inverse_transform(
                 predictions['LowerConf'].values.reshape(-1, 1)).flatten(),
             'UpperConf': self.power_transformer.inverse_transform(
                 predictions['UpperConf'].values.reshape(-1, 1)).flatten()},
            index=predictions.index)
    if self.log:
        predict_backtr = np.exp(predictions['Prediction'])
        if self.contains_zeros:
            predict_backtr += 1
        lower_dist = ((predictions['Prediction'] - predictions['LowerConf'])
                      / predictions['Prediction']) * predict_backtr
        upper_dist = ((predictions['UpperConf'] - predictions['Prediction'])
                      / predictions['Prediction']) * predict_backtr
        predictions = pd.DataFrame({'Prediction': predict_backtr,
                                    'LowerConf': predict_backtr - lower_dist,
                                    'UpperConf': predict_backtr + upper_dist},
                                   index=predictions.index)
    return predictions
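# Worked illustration of the log back-transform above (hypothetical numbers): the relative distances
# between prediction and confidence bounds on the log scale are re-applied to the back-transformed
# prediction. For Prediction=2.0, LowerConf=1.5, UpperConf=2.5 on the log scale,
# predict_backtr = exp(2.0) ~= 7.39, lower_dist = ((2.0 - 1.5) / 2.0) * 7.39 ~= 1.85, and the
# reported interval becomes roughly [5.54, 9.24] around 7.39.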
def run_regressions_optim(target_column: str, split_perc: float, algo: str):
    """
    Run whole multiple linear regression optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param algo: algo to use for optimization (['lasso', 'ridge', 'elasticnet', 'bayesridge', 'ard'])
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column,
                                                    split_perc=split_perc)
    multiple_nans_raw_set = config[target_column].getboolean('multiple_nans_raw_set')
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    # parameters relevant for all algos
    param_grid = {'dataset': datasets,
                  'imputation': ['mean', 'iterative', 'knn'],
                  'featureset': ['full', 'cal', 'stat', 'none'],
                  'dim_reduction': ['None', 'pca'],
                  'normalize': [False, True],
                  'osa': [True]}
    # parameters relevant for lasso, ridge and elasticnet
    if algo in ['lasso', 'ridge', 'elasticnet']:
        param_grid['alpha'] = [10**x for x in range(-5, 5)]
        if algo == 'elasticnet':
            param_grid['l1_ratio'] = np.arange(0.1, 1, 0.1)
        # random sample from parameter grid: all combis for lasso, ridge, elasticnet
        params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=1)
    # parameters relevant for bayesian ridge and ard regression
    else:
        param_grid['alpha_1'] = [10**x for x in range(-6, 1)]
        param_grid['alpha_2'] = [10**x for x in range(-6, -4)]
        param_grid['lambda_1'] = [10**x for x in range(-6, 1)]
        param_grid['lambda_2'] = [10**x for x in range(-6, 1)]
        # random sample from parameter grid: 0.2 share for bayesridge
        params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
        if algo == 'ard':
            param_grid['threshold_lambda'] = [10**x for x in range(2, 6)]
            # random sample from parameter grid: 0.2 share for ard
            params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    # remove non-relevant featureset imputation combis
    if not multiple_nans_raw_set:
        params_lst_small = params_lst.copy()
        for param_set in params_lst:
            feat = param_set['featureset']
            imp = param_set['imputation']
            if (feat == 'cal' or feat == 'none') and (imp == 'iterative' or imp == 'knn'):
                params_lst_small.remove(param_set)
        params_lst = params_lst_small
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        normalize = params_lst[i]['normalize']
        one_step_ahead = params_lst[i]['osa']
        l1_ratio = params_lst[i]['l1_ratio'] if 'l1_ratio' in params_lst[i] else None
        alpha = params_lst[i]['alpha'] if 'alpha' in params_lst[i] else None
        alpha_1 = params_lst[i]['alpha_1'] if 'alpha_1' in params_lst[i] else None
        alpha_2 = params_lst[i]['alpha_2'] if 'alpha_2' in params_lst[i] else None
        lambda_1 = params_lst[i]['lambda_1'] if 'lambda_1' in params_lst[i] else None
        lambda_2 = params_lst[i]['lambda_2'] if 'lambda_2' in params_lst[i] else None
        threshold_lambda = params_lst[i]['threshold_lambda'] if 'threshold_lambda' in params_lst[i] else None
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset
                  + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len,
                split_perc=split_perc, imputation=imputation, target_column=target_column,
                dimensionality_reduction=dim_reduction, featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsMLR.MultipleLinearRegression(
                    model_to_use=algo, target_column=target_column, seasonal_periods=seasonal_periods,
                    one_step_ahead=one_step_ahead, normalize=normalize, l1_ratio=l1_ratio, alpha=alpha,
                    alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2,
                    threshold_lambda=threshold_lambda)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name,
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'algo': model.name,
                           'normalize': normalize,
                           'alpha': alpha,
                           'l1_ratio': l1_ratio,
                           'alpha_1': alpha_1,
                           'alpha_2': alpha_2,
                           'lambda_1': lambda_1,
                           'lambda_2': lambda_2,
                           'threshold_lambda': threshold_lambda,
                           'one_step_ahead': one_step_ahead,
                           'fitted_coef': model.model.coef_,
                           'fitted_intercept': model.model.intercept_}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape,
                best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure',
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'algo': model.name,
                           'normalize': normalize,
                           'alpha': alpha,
                           'l1_ratio': l1_ratio,
                           'alpha_1': alpha_1,
                           'alpha_2': alpha_2,
                           'lambda_1': lambda_1,
                           'lambda_2': lambda_2,
                           'threshold_lambda': threshold_lambda,
                           'one_step_ahead': one_step_ahead,
                           'fitted_coef': 'failed',
                           'fitted_intercept': 'failed'}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=algo, target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
])
doc_results = pd.DataFrame(columns=columns)
rmse_dict = {}
rmse_ratio_dict = {}
n_cps_detected_dict = {}
n_refits_dict = {}
rmse_base_dict = {}
# iterate over all seasonal lengths
for seas_len in seasons_list:
    print('+++++++++++ Seasonal Length ' + str(seas_len) + ' +++++++++++')
    # create base data
    season_length = seas_len
    X = TrainHelper.get_periodic_noisy_x(x_base=np.linspace(-0.5 * math.pi, 1.5 * math.pi, season_length),
                                         n_periods=n_periods)
    Y = TrainHelper.noisy_sin(X)
    data = pd.DataFrame(columns=['X', 'Y'])
    data['X'] = X
    data['Y'] = Y
    train_ind = int(0.6 * data.shape[0])
    train = data[0:train_ind]
    # Train offline base model
    target_column = 'Y'
    kernel = ExpSineSquared()
    alpha = 0.1
    n_restarts_optimizer = 10
    standardize = False
    normalize_y = True
    model_sine = ModelsGaussianProcessRegression.GaussianProcessRegression(
def run_sarimax_optim(target_column: str, split_perc: float, imputation: str, featureset: str, univariate: bool):
    """
    Run whole (S)ARIMA(X) optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    :param univariate: use univariate version SARIMA as well
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column,
                                                    split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'p': [0, 1, 2, 3],
                  'd': [0, 1],
                  'q': [0, 1, 2, 3],
                  'P': [0, 1, 2, 3],
                  'D': [0, 1],
                  'Q': [0, 1, 2, 3],
                  'osa': [True],
                  'transf': [False, 'log', 'pw'],
                  'exog': [True],
                  'wi': [True]}
    if univariate:
        param_grid['exog'] = [False, True]
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        p = params_lst[i]['p']
        d = params_lst[i]['d']
        q = params_lst[i]['q']
        P = params_lst[i]['P']
        D = params_lst[i]['D']
        Q = params_lst[i]['Q']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)
        use_exog = params_lst[i]['exog']
        with_interc = params_lst[i]['wi']
        order = [p, d, q]
        seasonal_order = [P, D, Q, seasonal_periods]
        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset
                  + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len,
                split_perc=split_perc, imputation=imputation, target_column=target_column,
                dimensionality_reduction=dim_reduction, featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsARIMA.ARIMA(target_column=target_column, order=order,
                                          seasonal_order=seasonal_order, one_step_ahead=one_step_ahead,
                                          power_transf=power, log=log, use_exog=use_exog,
                                          with_intercept=with_interc)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name,
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'order': order,
                           'seasonal_order': seasonal_order,
                           'one_step_ahead': one_step_ahead,
                           'power_transform': power,
                           'log_transform': log,
                           'use_exog': use_exog,
                           'with_intercept': with_interc}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape,
                best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure',
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'order': order,
                           'seasonal_order': seasonal_order,
                           'one_step_ahead': one_step_ahead,
                           'power_transform': power,
                           'log_transform': log,
                           'use_exog': use_exog,
                           'with_intercept': with_interc}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='sarima-x', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
def run_gp_optim(company: str, target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run GPR offline optimization loop
    :param company: prefix for data in case company data is also used
    :param target_column: target column to use
    :param split_perc: share of train data
    :param imputation: imputation method
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, company=company,
                                                    target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, company=company, target_column=target_column)
    # prepare parameter grid
    kernels = []
    base_kernels = [ConstantKernel(constant_value=1000, constant_value_bounds=(1e-5, 1e5)),
                    Matern(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
                    ExpSineSquared(length_scale=1.0, periodicity=seasonal_periods,
                                   length_scale_bounds=(1e-5, 1e5),
                                   periodicity_bounds=(int(seasonal_periods * 0.8),
                                                       int(seasonal_periods * 1.2))),
                    RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
                    RationalQuadratic(length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-5, 1e5),
                                      alpha_bounds=(1e-5, 1e5)),
                    WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 1e5))]
    TrainHelper.extend_kernel_combinations(kernels=kernels, base_kernels=base_kernels)
    param_grid = {'dataset': [datasets[0]],
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'kernel': kernels,
                  'alpha': [1e-5, 1e-3, 1e-1, 1, 1e1, 1e3],
                  'n_restarts_optimizer': [0, 5, 10],
                  'standardize': [False, True],
                  'norm_y': [False, True],
                  'osa': [False]}
    # random sample from parameter grid
    sample_share = 0.1
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=sample_share)
    doc_results = None
    best_rmse = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        kernel = params_lst[i]['kernel']
        alpha = params_lst[i]['alpha']
        n_restarts_optimizer = params_lst[i]['n_restarts_optimizer']
        stand = params_lst[i]['standardize']
        norm_y = params_lst[i]['norm_y']
        one_step_ahead = params_lst[i]['osa']
        # dim_reduction can only be done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset
                  + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len,
                split_perc=split_perc, imputation=imputation, target_column=target_column,
                dimensionality_reduction=dim_reduction, featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGaussianProcessRegression.GaussianProcessRegression(
                    target_column=target_column, seasonal_periods=seasonal_periods, kernel=kernel,
                    alpha=alpha, n_restarts_optimizer=n_restarts_optimizer, one_step_ahead=one_step_ahead,
                    standardize=stand, normalize_y=norm_y)
                cross_val_dict = model.train(train=train, cross_val_call=True)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name,
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'kernel': kernel,
                           'alpha': alpha,
                           'n_restarts_optimizer': n_restarts_optimizer,
                           'standardize': stand,
                           'normalize_y': norm_y,
                           'one_step_ahead': one_step_ahead,
                           'optimized_kernel': model.model.kernel_}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict, best_rmse=best_rmse,
                                                    run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure',
                           'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len,
                           'test_len': test_len,
                           'split_perc': split_perc,
                           'kernel': kernel,
                           'alpha': alpha,
                           'n_restarts_optimizer': n_restarts_optimizer,
                           'standardize': stand,
                           'normalize_y': norm_y,
                           'one_step_ahead': one_step_ahead,
                           'optimized_kernel': 'failed'}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=company + '-gp-sklearn_raw', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
def run_gp_optim(target_column: str, split_perc: float, imputation: str, featureset: str): """ Run whole GPR optimization loop :param target_column: target variable for predictions :param split_perc: percentage of samples to use for train set :param imputation: imputation method for missing values :param featureset: featureset to use """ config = configparser.ConfigParser() config.read('Configs/dataset_specific_config.ini') # get optim parameters base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \ TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc) # load datasets datasets = TrainHelper.load_datasets(config=config, target_column=target_column) # prepare parameter grid kernels = [] base_kernels = [ SquaredExponential(), Matern52(), White(), RationalQuadratic(), Polynomial() ] for kern in base_kernels: if isinstance(kern, IsotropicStationary): base_kernels.append(Periodic(kern, period=seasonal_periods)) TrainHelper.extend_kernel_combinations(kernels=kernels, base_kernels=base_kernels) param_grid = { 'dataset': datasets, 'imputation': [imputation], 'featureset': [featureset], 'dim_reduction': ['None', 'pca'], 'kernel': kernels, 'mean_function': [None, gpflow.mean_functions.Constant()], 'noise_variance': [0.01, 1, 10, 100], 'optimizer': [gpflow.optimizers.Scipy()], 'standardize_x': [False, True], 'standardize_y': [False, True], 'osa': [True] } # random sample from parameter grid params_lst = TrainHelper.random_sample_parameter_grid( param_grid=param_grid, sample_share=0.2) doc_results = None best_rmse = 5000000.0 best_mape = 5000000.0 best_smape = 5000000.0 dataset_last_name = 'Dummy' imputation_last = 'Dummy' dim_reduction_last = 'Dummy' featureset_last = 'Dummy' for i in tqdm(range(len(params_lst))): warnings.simplefilter('ignore') dataset = params_lst[i]['dataset'] imputation = params_lst[i]['imputation'] featureset = params_lst[i]['featureset'] dim_reduction = None if params_lst[i][ 'dim_reduction'] == 'None' else params_lst[i]['dim_reduction'] # deepcopy to prevent impact of previous optimizations kernel = gpflow.utilities.deepcopy(params_lst[i]['kernel']) mean_fct = gpflow.utilities.deepcopy(params_lst[i]['mean_function']) noise_var = params_lst[i]['noise_variance'] optimizer = gpflow.utilities.deepcopy(params_lst[i]['optimizer']) stand_x = params_lst[i]['standardize_x'] stand_y = params_lst[i]['standardize_y'] one_step_ahead = params_lst[i]['osa'] # dim_reduction only done without NaNs if imputation is None and dim_reduction is not None: continue # dim_reduction does not make sense for few features if featureset == 'none' and dim_reduction is not None: continue if not ((dataset.name == dataset_last_name) and (imputation == imputation_last) and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)): if resample_weekly and 'weekly' not in dataset.name: dataset.name = dataset.name + '_weekly' print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' ' + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column) train_test_list = TrainHelper.get_ready_train_test_lst( dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len, split_perc=split_perc, imputation=imputation, target_column=target_column, dimensionality_reduction=dim_reduction, featureset=featureset) if dataset.name != dataset_last_name: best_rmse = 5000000.0 best_mape = 5000000.0 best_smape = 5000000.0 dataset_last_name = dataset.name 
imputation_last = imputation dim_reduction_last = dim_reduction featureset_last = featureset kernel_string, mean_fct_string, optimizer_string = get_docresults_strings( kernel=kernel, mean_function=mean_fct, optimizer=optimizer) sum_dict = None try: for train, test in train_test_list: model = ModelsGPR.GaussianProcessRegressionGPFlow( target_column=target_column, seasonal_periods=seasonal_periods, kernel=kernel, mean_function=mean_fct, noise_variance=noise_var, optimizer=optimizer, standardize_x=stand_x, standardize_y=stand_y, one_step_ahead=one_step_ahead) cross_val_dict = model.train(train=train, cross_val_call=False) eval_dict = model.evaluate(train=train, test=test) eval_dict.update(cross_val_dict) if sum_dict is None: sum_dict = eval_dict else: for k, v in eval_dict.items(): sum_dict[k] += v evaluation_dict = { k: v / len(train_test_list) for k, v in sum_dict.items() } params_dict = { 'dataset': dataset.name, 'featureset': featureset, 'imputation': str('None' if imputation is None else imputation), 'dim_reduction': str('None' if dim_reduction is None else dim_reduction), 'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc, 'kernel': kernel_string, 'mean_function': mean_fct_string, 'noise_variance': noise_var, 'optimizer': optimizer_string, 'standardize_x': stand_x, 'standardize_y': stand_y, 'one_step_ahead': one_step_ahead, 'optim_mod_params': model.model.parameters } save_dict = params_dict.copy() save_dict.update(evaluation_dict) if doc_results is None: doc_results = pd.DataFrame(columns=save_dict.keys()) doc_results = doc_results.append(save_dict, ignore_index=True) best_rmse, best_mape, best_smape = TrainHelper.print_best_vals( evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape, best_smape=best_smape, run_number=i) except KeyboardInterrupt: print('Got interrupted') break except Exception as exc: # print(exc) params_dict = { 'dataset': 'Failure', 'featureset': featureset, 'imputation': str('None' if imputation is None else imputation), 'dim_reduction': str('None' if dim_reduction is None else dim_reduction), 'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc, 'kernel': kernel_string, 'mean_function': mean_fct_string, 'noise_variance': noise_var, 'optimizer': optimizer_string, 'standardize_x': stand_x, 'standardize_y': stand_y, 'one_step_ahead': one_step_ahead, 'optim_mod_params': 'failed' } save_dict = params_dict.copy() save_dict.update(TrainHelper.get_failure_eval_dict()) if doc_results is None: doc_results = pd.DataFrame(columns=save_dict.keys()) doc_results = doc_results.append(save_dict, ignore_index=True) TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/', company_model_desc='gpr', target_column=target_column, seasonal_periods=seasonal_periods, datasets=datasets, featuresets=param_grid['featureset'], imputations=param_grid['imputation'], split_perc=split_perc) print('Optimization Done. Saved Results.')
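`TrainHelper.extend_kernel_combinations` is shared by both GPR variants but not shown here; since both sklearn and GPflow kernel objects overload `+` and `*`, a plausible sketch is to add the single base kernels plus their pairwise sums and products (the real helper may build different combinations):

from itertools import combinations

def extend_kernel_combinations_sketch(kernels: list, base_kernels: list) -> None:
    # hypothetical stand-in for TrainHelper.extend_kernel_combinations
    kernels.extend(base_kernels)  # single kernels
    for kern_1, kern_2 in combinations(base_kernels, 2):
        kernels.append(kern_1 + kern_2)  # additive combination
        kernels.append(kern_1 * kern_2)  # multiplicative combination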
if __name__ == '__main__': target_column = str(sys.argv[1]) split_perc = float(sys.argv[2]) imputations = ['mean', 'iterative', 'knn'] featuresets = ['full', 'cal', 'stat', 'none'] imp_feat_combis = TrainHelper.get_imputation_featureset_combis( imputations=imputations, featuresets=featuresets, target_column=target_column) for (imputation, featureset) in imp_feat_combis: new_pid = os.fork() if new_pid == 0: run_gp_optim(target_column=target_column, split_perc=split_perc, imputation=imputation, featureset=featureset) sys.exit() else: os.waitpid(new_pid, 0) print('finished run with ' + featureset + ' ' + str('None' if imputation is None else imputation))
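The fork-and-wait loop above runs every imputation/featureset combination in its own child process, so memory held by a finished run (e.g. TensorFlow graphs) is released before the next one starts. Stripped down to the pattern itself (with a hypothetical run_optim worker; os.fork is POSIX-only):

import os
import sys

def run_optim(featureset: str) -> None:
    # hypothetical worker standing in for run_gp_optim
    print('running ' + featureset)

if __name__ == '__main__':
    for featureset in ['full', 'cal', 'stat', 'none']:
        new_pid = os.fork()
        if new_pid == 0:
            # child process: do the work, then exit so it never re-enters the loop
            run_optim(featureset=featureset)
            sys.exit()
        else:
            # parent process: block until the child has finished
            os.waitpid(new_pid, 0)
            print('finished run with ' + featureset)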
def run_evars_gpr(base_model: ModelsGaussianProcessRegression.GaussianProcessRegression, data: pd.DataFrame, season_length: int, target_column: str, train_ind: int, comparison_partners: bool = False, da: str = 'scaled', cpd: str = 'cf', scale_thr: float = 0.1, scale_seasons: int = 2, scale_window: int = None, scale_window_factor: float = 0.1, scale_window_minimum: int = 2, const_hazard: int = None, const_hazard_factor: int = 2, cf_r: float = 0.4, cf_order: int = 1, cf_smooth: int = 4, cf_thr_perc: int = 90, append: str = 'no', max_samples: int = None, max_samples_factor: int = 10, o_perc: float = 1.1, u_perc: float = 0.1, thr: float = 0.2, under_samp: bool = False, rel_thr: float = 0.5, rel_coef: float = 1.5, verbose: bool = False): """ Run EVARS-GPR algo :param base_model: base model fitted during offline phase :param data: data to use :param season_length: length of one season :param target_column: target column :param train_ind: index of last train sample :param comparison_partners: specify whether to include comparison partners in optimization loop :param da: data augmentation method :param cpd: change point detection method :param scale_thr: threshold for output scaling factor :param scale_seasons: number of seasons to consider for calculation of output scaling factor :param scale_window: number of samples prior to change point for calculation of output scaling factor :param scale_window_factor: scale window as a multiple of the season length :param scale_window_minimum: minimum of the scale window :param const_hazard: constant hazard value in case of bocpd :param const_hazard_factor: constant hazard value as a multiple of the season length :param cf_r: r value (forgetting factor) for changefinder :param cf_order: order of SDAR models for changefinder :param cf_smooth: smoothing constant for changefinder :param cf_thr_perc: percentile of offline anomaly scores to use for declaration of a change point :param append: specify whether to append original and scaled dataset for da or not :param max_samples: maximum samples to consider for data augmentation :param max_samples_factor: maximum samples to consider for data augmentation as a multiple of the season length :param o_perc: oversampling percentage for GN :param u_perc: undersampling percentage for GN :param thr: threshold for GN :param under_samp: specify whether to undersample for SMOGN :param rel_thr: relevance threshold for SMOGN :param rel_coef: relevance coefficient for SMOGN :param verbose: print debug info :return: list of detected change points, evars-gpr predictions, dictionary with predictions of comparison partners, number of refits """ scale_window = max(scale_window_minimum, int(scale_window_factor * season_length)) \ if scale_window is None else scale_window const_hazard = const_hazard_factor * season_length if const_hazard is None else const_hazard max_samples = max_samples_factor * season_length if max_samples is None else max_samples data = data.copy() data.reset_index(drop=True, inplace=True) train = data[:train_ind] # setup cpd y_deseas = data[target_column].diff(season_length).dropna().values y_train_deseas = y_deseas[:train_ind-season_length] if cpd == 'bocd': mean = np.mean(y_train_deseas) std = np.std(y_train_deseas) train_std = (y_train_deseas - mean) / std bc = bocd.BayesianOnlineChangePointDetection(bocd.ConstantHazard(const_hazard), bocd.StudentT(mu=0, kappa=1, alpha=1, beta=1)) for i, d_bocd_train in enumerate(train_std): bc.update(d_bocd_train) elif cpd == 'cf': scores = [] cf = 
changefinder.ChangeFinder(r=cf_r, order=cf_order, smooth=cf_smooth) for i in y_train_deseas: scores.append(cf.update(i)) cf_threshold = np.percentile(scores, cf_thr_perc) if verbose: print('CF_Scores_Train: threshold=' + str(cf_threshold) + ', mean=' + str(np.mean(scores)) + ', max=' + str(np.max(scores)) + ', 70perc=' + str(np.percentile(scores, 70)) + ', 80perc=' + str(np.percentile(scores, 80)) + ', 90perc=' + str(np.percentile(scores, 90)) + ', 95perc=' + str(np.percentile(scores, 95)) ) # online part test = data[train_ind:] y_train_deseas_manip = y_train_deseas.copy() rt_mle = np.empty(test[target_column].shape) predictions = None train_manip = train.copy() model = copy.deepcopy(base_model) # setup comparison partners if comparison_partners: model_cpd_retrain_full = copy.deepcopy(base_model) predictions_cpd_retrain_full = None model_cpd_moving_window_full = copy.deepcopy(base_model) predictions_cpd_moving_window_full = None predictions_cpd_scaled_full = None cp_detected = [] output_scale_old = 1 output_scale = 1 n_refits = 0 # iterate over whole test set for index in test.index: sample = test.loc[index] train_manip = train_manip.append(sample) # predict next target value prediction = model.predict(test=sample.to_frame().T, train=train_manip) if predictions is None: predictions = prediction.copy() else: predictions = predictions.append(prediction) # get predictions of comparison partners if specified if comparison_partners: prediction_cpd_retrain_full = model_cpd_retrain_full.predict(test=sample.to_frame().T, train=train_manip) prediction_cpd_moving_window_full = model_cpd_moving_window_full.predict(test=sample.to_frame().T, train=train_manip) prediction_cpd_scaled_full = prediction.copy() prediction_cpd_scaled_full *= output_scale_old if predictions_cpd_retrain_full is None: predictions_cpd_retrain_full = prediction_cpd_retrain_full.copy() predictions_cpd_moving_window_full = prediction_cpd_moving_window_full.copy() predictions_cpd_scaled_full = prediction_cpd_scaled_full.copy() else: predictions_cpd_retrain_full = predictions_cpd_retrain_full.append(prediction_cpd_retrain_full) predictions_cpd_moving_window_full = \ predictions_cpd_moving_window_full.append(prediction_cpd_moving_window_full) predictions_cpd_scaled_full = predictions_cpd_scaled_full.append(prediction_cpd_scaled_full) # CPD change_point_detected = False y_deseas = sample[target_column] - data.loc[index-season_length][target_column] if cpd == 'bocd': d_bocd = (y_deseas - mean) / std bc.update(d_bocd) rt_mle_index = index-train_ind rt_mle[rt_mle_index] = bc.rt y_train_deseas_manip = np.append(y_train_deseas_manip, y_deseas) mean = np.mean(y_train_deseas_manip) std = np.std(y_train_deseas_manip) if rt_mle_index > 0 and (rt_mle[rt_mle_index] - rt_mle[rt_mle_index-1] < 0): change_point_detected = True curr_ind = rt_mle_index elif cpd == 'cf': score = cf.update(y_deseas) scores.append(score) if score >= cf_threshold: if verbose: print('Anomaly Score ' + str(score) + ' > ' + 'threshold ' + str(cf_threshold)) change_point_detected = True curr_ind = index - train_ind # Trigger remaining EVARS-GPR procedures if a change point is detected if change_point_detected: if verbose: print('CP Detected ' + str(curr_ind + train.shape[0])) cp_detected.append(curr_ind) try: # Calculate output scaling factor change_point_index = curr_ind + train.shape[0] mean_now = np.mean(data[change_point_index-scale_window+1:change_point_index+1][target_column]) mean_prev_seas_1 = \ 
np.mean(data[change_point_index-season_length-scale_window+1:change_point_index-season_length+1] [target_column]) mean_prev_seas_2 = \ np.mean(data[change_point_index-2*season_length-scale_window+1:change_point_index-2*season_length+1] [target_column]) if scale_seasons == 1: output_scale = mean_now / mean_prev_seas_1 elif scale_seasons == 2: output_scale = np.mean([mean_now / mean_prev_seas_1, mean_now / mean_prev_seas_2]) if output_scale == 0: raise Exception if verbose: print('ScaleDiff=' + str(np.abs(output_scale - output_scale_old) / output_scale_old)) # Check deviation to previous scale factor if np.abs(output_scale - output_scale_old) / output_scale_old > scale_thr: n_refits += 1 if verbose: print('try to retrain model: ' + str(change_point_index) + ' , output_scale=' + str(output_scale)) if output_scale > 1: focus = 'high' else: focus = 'low' # augment data train_samples = TrainHelper.get_augmented_data(data=data, target_column=target_column, da=da, change_point_index=curr_ind + train.shape[0], output_scale=output_scale, rel_coef=rel_coef, rel_thr=rel_thr, under_samp=under_samp, focus=focus, o_perc=o_perc, u_perc=u_perc, thr=thr, append=append, max_samples=max_samples) # retrain current model model = ModelsGaussianProcessRegression.GaussianProcessRegression( target_column=base_model.target_column, seasonal_periods=base_model.seasonal_periods, kernel=base_model.model.kernel_, alpha=base_model.model.alpha, n_restarts_optimizer=base_model.model.n_restarts_optimizer, standardize=base_model.standardize, normalize_y=base_model.model.normalize_y, one_step_ahead=base_model.one_step_ahead) model.train(train_samples, cross_val_call=False) if comparison_partners: train_data = data.copy()[:change_point_index+1] # cpd Retrain model_cpd_retrain_full = ModelsGaussianProcessRegression.GaussianProcessRegression( target_column=base_model.target_column, seasonal_periods=base_model.seasonal_periods, kernel=base_model.model.kernel_, alpha=base_model.model.alpha, n_restarts_optimizer=base_model.model.n_restarts_optimizer, standardize=base_model.standardize, normalize_y=base_model.model.normalize_y, one_step_ahead=base_model.one_step_ahead) model_cpd_retrain_full.train(train_data, cross_val_call=False) # Moving Window model_cpd_moving_window_full = ModelsGaussianProcessRegression.GaussianProcessRegression( target_column=base_model.target_column, seasonal_periods=base_model.seasonal_periods, kernel=base_model.model.kernel_, alpha=base_model.model.alpha, n_restarts_optimizer=base_model.model.n_restarts_optimizer, standardize=base_model.standardize, normalize_y=base_model.model.normalize_y, one_step_ahead=base_model.one_step_ahead) model_cpd_moving_window_full.train(train_data[-season_length:], cross_val_call=False) # in case of a successful refit change output_scale_old output_scale_old = output_scale except Exception as exc: print(exc) if comparison_partners: comparison_partners_dict = {'cpd_retrain_full': predictions_cpd_retrain_full, 'cpd_cpd_moving_window_full': predictions_cpd_moving_window_full, 'cpd_scaled_full': predictions_cpd_scaled_full } else: comparison_partners_dict = {} return cp_detected, predictions, comparison_partners_dict, n_refits
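The core reaction to a detected change point is the output scaling factor: the mean of the last scale_window samples before the change point divided by the mean of the corresponding window one (and, for scale_seasons=2, two) season lengths earlier. Isolated from the surrounding loop, the calculation above reads roughly as follows (a sketch with hypothetical argument names; it assumes at least scale_seasons complete seasons precede the change point):

import numpy as np
import pandas as pd

def output_scaling_factor(data: pd.DataFrame, target_column: str, change_point_index: int,
                          season_length: int, scale_window: int, scale_seasons: int = 2) -> float:
    # mirrors the output scale calculation in run_evars_gpr
    y = data[target_column]
    mean_now = np.mean(y.iloc[change_point_index - scale_window + 1:change_point_index + 1])
    mean_prev_seas_1 = np.mean(y.iloc[change_point_index - season_length - scale_window + 1:
                                      change_point_index - season_length + 1])
    if scale_seasons == 1:
        return mean_now / mean_prev_seas_1
    mean_prev_seas_2 = np.mean(y.iloc[change_point_index - 2 * season_length - scale_window + 1:
                                      change_point_index - 2 * season_length + 1])
    return float(np.mean([mean_now / mean_prev_seas_1, mean_now / mean_prev_seas_2]))

A refit with augmented data is only triggered when this factor deviates from the previously used one by more than scale_thr.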
def run_ann_optim(target_column: str, split_perc: float, imputation: str, featureset: str): """ Run whole ANN optimization loop :param target_column: target variable for predictions :param split_perc: percentage of samples to use for train set :param imputation: imputation method for missing values :param featureset: featureset to use """ config = configparser.ConfigParser() config.read('Configs/dataset_specific_config.ini') # get optim parameters base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \ TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc) # load datasets datasets = TrainHelper.load_datasets(config=config, target_column=target_column) # prepare parameter grid param_grid = { 'dataset': datasets, 'imputation': [imputation], 'featureset': [featureset], 'dim_reduction': ['None', 'pca'], 'dropout_rate': [0.0, 0.5], 'batch_size': [4, 8, 16, 32], 'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1], 'min_val_loss_improvement': [100, 1000], 'max_epochs_wo_improvement': [20, 50, 100], 'n_hidden': [10, 20, 50, 100], 'num_hidden_layer': [1, 2, 3], 'osa': [True] } # random samples from parameter grid params_lst = TrainHelper.random_sample_parameter_grid( param_grid=param_grid, sample_share=0.1) doc_results = None best_rmse = 5000000.0 best_mape = 5000000.0 best_smape = 5000000.0 dataset_last_name = 'Dummy' imputation_last = 'Dummy' dim_reduction_last = 'Dummy' featureset_last = 'Dummy' for i in tqdm(range(len(params_lst))): dataset = params_lst[i]['dataset'] imputation = params_lst[i]['imputation'] featureset = params_lst[i]['featureset'] dim_reduction = None if params_lst[i][ 'dim_reduction'] == 'None' else params_lst[i]['dim_reduction'] dropout_rate = params_lst[i]['dropout_rate'] batch_size = params_lst[i]['batch_size'] learning_rate = params_lst[i]['learning_rate'] min_val_loss_improvement = params_lst[i]['min_val_loss_improvement'] max_epochs_wo_improvement = params_lst[i]['max_epochs_wo_improvement'] one_step_ahead = params_lst[i]['osa'] n_hidden = params_lst[i]['n_hidden'] num_hidden_layer = params_lst[i]['num_hidden_layer'] # dim_reduction does not make sense for few features if featureset == 'none' and dim_reduction is not None: continue if not ((dataset.name == dataset_last_name) and (imputation == imputation_last) and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)): if resample_weekly and 'weekly' not in dataset.name: dataset.name = dataset.name + '_weekly' print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' ' + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column) train_test_list = TrainHelper.get_ready_train_test_lst( dataset=dataset, config=config, init_train_len=init_train_len, test_len=test_len, split_perc=split_perc, imputation=imputation, target_column=target_column, dimensionality_reduction=dim_reduction, featureset=featureset) if dataset.name != dataset_last_name: best_rmse = 5000000.0 best_mape = 5000000.0 best_smape = 5000000.0 dataset_last_name = dataset.name imputation_last = imputation dim_reduction_last = dim_reduction featureset_last = featureset sum_dict = None try: for train, test in train_test_list: model = ModelsANN.AnnRegression( target_column=target_column, seasonal_periods=seasonal_periods, one_step_ahead=one_step_ahead, n_feature=train.shape[1] - 1, n_hidden=n_hidden, num_hidden_layer=num_hidden_layer, dropout_rate=dropout_rate, batch_size=batch_size, 
learning_rate=learning_rate, min_val_loss_improvement=min_val_loss_improvement, max_epochs_wo_improvement=max_epochs_wo_improvement) cross_val_dict = model.train(train=train, cross_val_call=False) eval_dict = model.evaluate(train=train, test=test) eval_dict.update(cross_val_dict) if sum_dict is None: sum_dict = eval_dict else: for k, v in eval_dict.items(): sum_dict[k] += v evaluation_dict = { k: v / len(train_test_list) for k, v in sum_dict.items() } params_dict = { 'dataset': dataset.name, 'featureset': featureset, 'imputation': str('None' if imputation is None else imputation), 'dim_reduction': str('None' if dim_reduction is None else dim_reduction), 'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc, 'algo': model.name, 'dropout_rate': dropout_rate, 'batch_size': batch_size, 'learning_rate': learning_rate, 'min_val_loss_improvement': min_val_loss_improvement, 'max_epochs_wo_improvement': max_epochs_wo_improvement, 'n_hidden': n_hidden, 'num_hidden_layer': num_hidden_layer, 'one_step_ahead': one_step_ahead } save_dict = params_dict.copy() save_dict.update(evaluation_dict) if doc_results is None: doc_results = pd.DataFrame(columns=save_dict.keys()) doc_results = doc_results.append(save_dict, ignore_index=True) best_rmse, best_mape, best_smape = TrainHelper.print_best_vals( evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape, best_smape=best_smape, run_number=i) except KeyboardInterrupt: print('Got interrupted') break except Exception as exc: print(exc) params_dict = { 'dataset': 'Failure', 'featureset': featureset, 'imputation': str('None' if imputation is None else imputation), 'dim_reduction': str('None' if dim_reduction is None else dim_reduction), 'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc, 'algo': model.name, 'dropout_rate': dropout_rate, 'batch_size': batch_size, 'learning_rate': learning_rate, 'min_val_loss_improvement': min_val_loss_improvement, 'max_epochs_wo_improvement': max_epochs_wo_improvement, 'n_hidden': n_hidden, 'num_hidden_layer': num_hidden_layer, 'one_step_ahead': one_step_ahead } save_dict = params_dict.copy() save_dict.update(TrainHelper.get_failure_eval_dict()) if doc_results is None: doc_results = pd.DataFrame(columns=save_dict.keys()) doc_results = doc_results.append(save_dict, ignore_index=True) TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/', company_model_desc='ANN', target_column=target_column, seasonal_periods=seasonal_periods, datasets=datasets, featuresets=param_grid['featureset'], imputations=param_grid['imputation'], split_perc=split_perc) print('Optimization Done. Saved Results.')
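All three optimization loops (sklearn GPR, GPflow GPR, ANN) use the same accumulate-then-divide pattern to average the per-split evaluation dictionaries over the train/test list. Isolated, with a hypothetical metric key:

def average_eval_dicts(eval_dicts: list) -> dict:
    # same pattern as the sum_dict / evaluation_dict logic in the loops above
    sum_dict = None
    for eval_dict in eval_dicts:
        if sum_dict is None:
            sum_dict = eval_dict.copy()
        else:
            for k, v in eval_dict.items():
                sum_dict[k] += v
    return {k: v / len(eval_dicts) for k, v in sum_dict.items()}

# e.g. average_eval_dicts([{'RMSE_Test': 10.0}, {'RMSE_Test': 20.0}]) yields {'RMSE_Test': 15.0}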