    limit_train_batches=30,
    # val_check_interval=20,
    # limit_val_batches=1,
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
    callbacks=[lr_logger],
)

tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,
    loss=QuantileLoss(),
    log_interval=10,
    log_val_interval=1,
    reduce_on_plateau_patience=3,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# # find optimal learning rate
# # remove logging and artificial epoch size
# tft.hparams.log_interval = -1
# tft.hparams.log_val_interval = -1
# trainer.limit_train_batches = 1.0
# # run learning rate finder
# res = trainer.tuner.lr_find(
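# A minimal sketch of how the commented-out learning-rate search above is typically
# completed, assuming the PyTorch Lightning 1.x tuner API and the `train_dataloader`
# and `val_dataloader` objects created elsewhere in this script. It is kept commented
# out, as in the original; the min/max bounds are illustrative assumptions.
# res = trainer.tuner.lr_find(
#     tft,
#     train_dataloader=train_dataloader,
#     val_dataloaders=val_dataloader,
#     min_lr=1e-6,
#     max_lr=10.0,
# )
# print(f"Suggested learning rate: {res.suggestion()}")
# fig = res.plot(show=True, suggest=True)  # loss vs. learning rate, suggestion marked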
    max_epochs=1000,
    # min_epochs=100,
    gpus=0,
    weights_summary="top",
    gradient_clip_val=0.14578,
    limit_train_batches=30,
    # val_check_interval=20,
    # limit_val_batches=1,
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
    callbacks=[lr_logger, early_stop_callback],
)

tft = TemporalFusionTransformer.load_from_checkpoint(
    "/home/johnny/tempy/lightning_logs/version_36/checkpoints/epoch=157-step=4739.ckpt"
)
# previously: "/home/johnny/tempy/lightning_logs/version_31/checkpoints/epoch=20-step=83.ckpt"  # best_model_path
"""
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.02,
    hidden_size=12,
    attention_head_size=6,
    dropout=0.1,
    hidden_continuous_size=12,
    output_size=8,
    loss=QuantileLoss(),
    log_interval=10,
    # log_val_interval=1,
    reduce_on_plateau_patience=10,
)
"""
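# A sketch of the pattern hinted at by the "best_model_path" comment above: rather than
# hard-coding a checkpoint path, ask the trainer for the best checkpoint it saved. This
# assumes a ModelCheckpoint callback monitoring val_loss was active during training and
# that `val_dataloader` is defined elsewhere in this script.
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
quantile_predictions = best_tft.predict(val_dataloader, mode="quantiles")  # one column per quantile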
def train(
    self,
    max_epochs=25,
    hidden_size=16,
    lstm_layers=1,
    dropout=0.1,
    attention_head_size=4,
    reduce_on_plateau_patience=4,
    hidden_continuous_size=8,
    learning_rate=1e-3,
    gradient_clip_val=0.1,
):
    # configure network and trainer
    # create dataloaders for model
    batch_size = 128
    train_dataloader = self.intern_training.to_dataloader(
        train=True, batch_size=batch_size
    )
    val_dataloader = self._intern_validation.to_dataloader(
        train=False, batch_size=batch_size * 10
    )

    pl.seed_everything(42)
    early_stop_callback = EarlyStopping(
        monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min"
    )
    # lr_logger = LearningRateMonitor()
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        gpus=0,
        weights_summary=None,
        gradient_clip_val=gradient_clip_val,
        # limit_train_batches=30,  # comment in for training, running validation every 30 batches
        # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
        callbacks=[early_stop_callback],
    )

    self.model = TemporalFusionTransformer.from_dataset(
        self.intern_training,
        learning_rate=learning_rate,
        hidden_size=hidden_size,
        attention_head_size=attention_head_size,
        dropout=dropout,
        hidden_continuous_size=hidden_continuous_size,
        lstm_layers=lstm_layers,
        output_size=len(self.quantiles),  # 3 quantiles by default
        loss=QuantileLoss(self.quantiles),
        reduce_on_plateau_patience=reduce_on_plateau_patience,
    )

    # res = trainer.tuner.lr_find(
    #     self.model,
    #     train_dataloader=train_dataloader,
    #     val_dataloaders=val_dataloader,
    #     max_lr=10.0,
    #     min_lr=1e-6,
    # )
    # self.model = TemporalFusionTransformer.from_dataset(
    #     self.intern_training,
    #     learning_rate=res.suggestion(),  # using the suggested learning rate
    #     hidden_size=hidden_size,
    #     attention_head_size=attention_head_size,
    #     dropout=dropout,
    #     hidden_continuous_size=hidden_continuous_size,
    #     output_size=len(self.quantiles),  # 3 quantiles by default
    #     loss=QuantileLoss(self.quantiles),
    #     reduce_on_plateau_patience=reduce_on_plateau_patience,
    # )

    # fit network
    trainer.fit(
        self.model,
        train_dataloader=train_dataloader,
        val_dataloaders=val_dataloader,
    )
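# A hypothetical usage sketch for the train() method above. `wrapper` stands for an
# instance of whichever class owns that method (not shown here); it is assumed to expose
# `intern_training`, `_intern_validation`, `quantiles`, and, after training, `model`.
def train_and_predict(wrapper, **train_kwargs):
    wrapper.train(**train_kwargs)  # e.g. train_and_predict(wrapper, max_epochs=25, hidden_size=16)
    val_dataloader = wrapper._intern_validation.to_dataloader(train=False, batch_size=1280)
    # mode="quantiles" returns one prediction per quantile configured in QuantileLoss
    return wrapper.model.predict(val_dataloader, mode="quantiles")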
def objective(trial: optuna.Trial) -> float:
    # Filenames for each trial must be made unique in order to access each checkpoint.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(model_path, "trial_{}".format(trial.number), "{epoch}"),
        monitor="val_loss",
    )

    # The default logger in PyTorch Lightning writes event files to be consumed by
    # TensorBoard. We don't rely on it here, as that would require implementing several
    # abstract methods. Instead we set up a simple callback that saves metrics from each
    # validation step.
    metrics_callback = MetricsCallback()
    learning_rate_callback = LearningRateMonitor()
    logger = TensorBoardLogger(log_dir, name="optuna", version=trial.number)
    gradient_clip_val = trial.suggest_loguniform("gradient_clip_val", *gradient_clip_val_range)
    trainer = pl.Trainer(
        checkpoint_callback=checkpoint_callback,
        max_epochs=max_epochs,
        gradient_clip_val=gradient_clip_val,
        gpus=[0] if torch.cuda.is_available() else None,
        callbacks=[
            metrics_callback,
            learning_rate_callback,
            PyTorchLightningPruningCallback(trial, monitor="val_loss"),
        ],
        logger=logger,
        **trainer_kwargs,
    )

    # create model
    hidden_size = trial.suggest_int("hidden_size", *hidden_size_range, log=True)
    model = TemporalFusionTransformer.from_dataset(
        train_dataloader.dataset,
        dropout=trial.suggest_uniform("dropout", *dropout_range),
        hidden_size=hidden_size,
        hidden_continuous_size=trial.suggest_int(
            "hidden_continuous_size",
            hidden_continuous_size_range[0],
            min(hidden_continuous_size_range[1], hidden_size),
            log=True,
        ),
        attention_head_size=trial.suggest_int("attention_head_size", *attention_head_size_range),
        log_interval=-1,
        **kwargs,
    )

    # find a good learning rate
    if use_learning_rate_finder:
        lr_trainer = pl.Trainer(
            gradient_clip_val=gradient_clip_val,
            gpus=[0] if torch.cuda.is_available() else None,
            logger=False,
        )
        res = lr_trainer.tuner.lr_find(
            model,
            train_dataloader=train_dataloader,
            val_dataloaders=val_dataloader,
            early_stop_threshold=10000.0,
            min_lr=learning_rate_range[0],
            num_training=100,
            max_lr=learning_rate_range[1],
        )

        loss_finite = np.isfinite(res.results["loss"])
        lr_smoothed, loss_smoothed = sm.nonparametric.lowess(
            np.asarray(res.results["loss"])[loss_finite],
            np.asarray(res.results["lr"])[loss_finite],
            frac=1.0 / 10.0,
        )[10:-1].T
        optimal_idx = np.gradient(loss_smoothed).argmin()
        optimal_lr = lr_smoothed[optimal_idx]
        print(f"Using learning rate of {optimal_lr:.3g}")
        model.hparams.learning_rate = optimal_lr
    else:
        model.hparams.learning_rate = trial.suggest_loguniform(
            "learning_rate", *learning_rate_range
        )

    # fit
    trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)

    # report result
    return metrics_callback.metrics[-1]["val_loss"].item()
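# The objective above relies on a MetricsCallback that is defined elsewhere in the
# original code. A minimal sketch consistent with how it is read back
# (metrics_callback.metrics[-1]["val_loss"]) is given below, together with one possible
# way to drive the study; `n_trials` and the pruner choice are illustrative assumptions.
from pytorch_lightning import Callback


class MetricsCallback(Callback):
    """Collect callback metrics after every validation run."""

    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append(trainer.callback_metrics)


# study = optuna.create_study(
#     direction="minimize",
#     pruner=optuna.pruners.SuccessiveHalvingPruner(),
# )
# study.optimize(objective, n_trials=100)
# print("Best trial value:", study.best_trial.value, "params:", study.best_trial.params)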
    # of the gradient for recurrent neural networks
    gradient_clip_val=1e-3,
    limit_train_batches=30,
    # fast_dev_run=True,
    early_stop_callback=early_stop_callback,
    callbacks=[lr_logger],
)

tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.15,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=1,
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=8,  # set to <= hidden_size
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    log_interval=10,
    # reduce learning rate if no improvement in validation loss after x epochs
    # reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# find optimal learning rate
# res = trainer.lr_find(
#     tft,
#     train_dataloader=train_dataloader,
#     val_dataloaders=val_dataloader,
#     max_lr=10.0,
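# For reference: with output_size=7 and loss=QuantileLoss() as configured above, the
# library's default quantiles are used. The check below is a small hedged sketch; the
# exact defaults depend on the installed pytorch-forecasting version.
default_quantiles = QuantileLoss().quantiles  # typically [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
assert len(default_quantiles) == 7, "output_size above must match the number of quantiles"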