def test_integration(
    dataloaders_with_covariates,
    dataloaders_fixed_window_without_covariates,
    dataloaders_multi_target,
    tmp_path,
    gpus,
    dataloader,
):
    """Dispatch the requested dataloader fixture / loss combination to ``_integration``."""
    extra_kwargs = {}
    # map the parametrized name to a concrete fixture and loss-specific kwargs
    if dataloader == "with_covariates":
        dataloader = dataloaders_with_covariates
        extra_kwargs["backcast_loss_ratio"] = 0.5
    elif dataloader == "fixed_window_without_covariates":
        dataloader = dataloaders_fixed_window_without_covariates
    elif dataloader == "multi_target":
        dataloader = dataloaders_multi_target
        extra_kwargs["loss"] = QuantileLoss()
    elif dataloader == "quantiles":
        dataloader = dataloaders_with_covariates
        extra_kwargs["loss"] = QuantileLoss()
    elif dataloader == "implicit-quantiles":
        dataloader = dataloaders_with_covariates
        extra_kwargs["loss"] = ImplicitQuantileNetworkDistributionLoss()
    elif dataloader == "multivariate-quantiles":
        dataloader = dataloaders_with_covariates
        extra_kwargs["loss"] = MQF2DistributionLoss(
            prediction_length=dataloader["train"].dataset.max_prediction_length
        )
    else:
        raise ValueError(f"dataloader {dataloader} unknown")
    _integration(dataloader, tmp_path=tmp_path, gpus=gpus, **extra_kwargs)
def from_dataset(
    cls,
    dataset: TimeSeriesDataSet,
    allowed_encoder_known_variable_names: List[str] = None,
    **kwargs,
):
    """
    Create model from dataset.

    Args:
        dataset: timeseries dataset
        allowed_encoder_known_variable_names: List of known variables that are allowed in encoder, defaults to all
        **kwargs: additional arguments such as hyperparameters for model (see ``__init__()``)

    Returns:
        TemporalFusionTransformer
    """
    # dataset-derived defaults: maximum encoder length plus deduced output parameters
    defaults = {"max_encoder_length": dataset.max_encoder_length}
    defaults.update(cls.deduce_default_output_parameters(dataset, kwargs, QuantileLoss()))
    # explicit user arguments always win over deduced defaults
    defaults.update(kwargs)
    return super().from_dataset(
        dataset,
        allowed_encoder_known_variable_names=allowed_encoder_known_variable_names,
        **defaults,
    )
def from_dataset(
    cls,
    dataset: TimeSeriesDataSet,
    allowed_encoder_known_variable_names: List[str] = None,
    **kwargs,
):
    """
    Create model from dataset.

    Args:
        dataset: timeseries dataset
        allowed_encoder_known_variable_names: List of known variables that are allowed in encoder, defaults to all
        **kwargs: additional arguments such as hyperparameters for model (see ``__init__()``)

    Returns:
        TemporalFusionTransformer
    """
    # add maximum encoder length
    new_kwargs = dict(max_encoder_length=dataset.max_encoder_length)

    def _single_output_size(normalizer, loss):
        # one output per quantile, one per class for classification targets, otherwise a point forecast
        if isinstance(loss, QuantileLoss):
            return len(loss.quantiles)
        if isinstance(normalizer, NaNLabelEncoder):
            return len(normalizer.classes_)
        return 1

    loss = kwargs.get("loss", QuantileLoss())

    # handle multiple targets: wrap a single loss into a MultiLoss, one copy per target
    n_targets = len(dataset.target_names)
    new_kwargs["n_targets"] = n_targets
    if n_targets > 1:
        if not isinstance(loss, MultiLoss):
            loss = MultiLoss([deepcopy(loss)] * n_targets)
        new_kwargs["loss"] = loss

    # infer output size(s) unless the caller specified them explicitly
    if "output_size" not in kwargs:
        if isinstance(loss, MultiLoss):
            new_kwargs["output_size"] = [
                _single_output_size(normalizer, sub_loss)
                for normalizer, sub_loss in zip(dataset.target_normalizer.normalizers, loss.metrics)
            ]
        else:
            new_kwargs["output_size"] = _single_output_size(dataset.target_normalizer, loss)

    # explicit user arguments always win over deduced defaults
    new_kwargs.update(kwargs)
    return super().from_dataset(
        dataset,
        allowed_encoder_known_variable_names=allowed_encoder_known_variable_names,
        **new_kwargs,
    )
def run_train(cfg, trainset, validset) -> None:
    """Build trainer, dataloaders and a TFT model from ``cfg`` and fit it on the given datasets."""
    import multiprocessing

    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
    from pytorch_lightning.loggers import TensorBoardLogger

    from pytorch_forecasting.metrics import QuantileLoss
    from pytorch_forecasting.models import TemporalFusionTransformer

    # stop training, when loss metric does not improve on validation set
    early_stopping = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
    lr_logger = LearningRateMonitor()  # log the learning rate
    tb_logger = TensorBoardLogger("result/lightning_logs")  # log to tensorboard

    # create trainer
    trainer = pl.Trainer(
        callbacks=[lr_logger, early_stopping],
        logger=tb_logger,
        **cfg.get("trainer").params,
    )

    # initialise model
    model = TemporalFusionTransformer.from_dataset(
        trainset,
        loss=QuantileLoss(),
        **cfg.get("model").params,
    )
    print(model.size())  # 29.6k parameters in model

    # one dataloader worker per CPU core
    workers = multiprocessing.cpu_count()
    train_loader = trainset.to_dataloader(
        train=True, batch_size=cfg.get("trainset").batch_size, num_workers=workers
    )
    valid_loader = validset.to_dataloader(
        train=False, batch_size=cfg.get("validset").batch_size, num_workers=workers
    )

    # fit network
    trainer.fit(
        model,
        train_dataloader=train_loader,
        val_dataloaders=valid_loader,
    )
    return
def optimize_hyperparameters(
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
    model_path: str,
    max_epochs: int = 20,
    n_trials: int = 100,
    timeout: float = 3600 * 8.0,  # 8 hours
    gradient_clip_val_range: Tuple[float, float] = (0.01, 100.0),
    hidden_size_range: Tuple[int, int] = (16, 265),
    hidden_continuous_size_range: Tuple[int, int] = (8, 64),
    attention_head_size_range: Tuple[int, int] = (1, 4),
    dropout_range: Tuple[float, float] = (0.1, 0.3),
    learning_rate_range: Tuple[float, float] = (1e-5, 1.0),
    use_learning_rate_finder: bool = True,
    trainer_kwargs: Dict[str, Any] = None,
    log_dir: str = "lightning_logs",
    study: optuna.Study = None,
    verbose: Union[int, bool] = None,
    **kwargs,
) -> optuna.Study:
    """
    Optimize Temporal Fusion Transformer hyperparameters.

    Run hyperparameter optimization. Learning rate for is determined with
    the PyTorch Lightning learning rate finder.

    Args:
        train_dataloader (DataLoader): dataloader for training model
        val_dataloader (DataLoader): dataloader for validating model
        model_path (str): folder to which model checkpoints are saved
        max_epochs (int, optional): Maximum number of epochs to run training. Defaults to 20.
        n_trials (int, optional): Number of hyperparameter trials to run. Defaults to 100.
        timeout (float, optional): Time in seconds after which training is stopped regardless of number of epochs
            or validation metric. Defaults to 3600*8.0.
        gradient_clip_val_range (Tuple[float, float], optional): Minimum and maximum of ``gradient_clip_val``
            hyperparameter. Defaults to (0.01, 100.0).
        hidden_size_range (Tuple[int, int], optional): Minimum and maximum of ``hidden_size`` hyperparameter.
            Defaults to (16, 265).
        hidden_continuous_size_range (Tuple[int, int], optional): Minimum and maximum of ``hidden_continuous_size``
            hyperparameter. Defaults to (8, 64).
        attention_head_size_range (Tuple[int, int], optional): Minimum and maximum of ``attention_head_size``
            hyperparameter. Defaults to (1, 4).
        dropout_range (Tuple[float, float], optional): Minimum and maximum of ``dropout`` hyperparameter.
            Defaults to (0.1, 0.3).
        learning_rate_range (Tuple[float, float], optional): Learning rate range. Defaults to (1e-5, 1.0).
        use_learning_rate_finder (bool): If to use learning rate finder or optimize as part of hyperparameters.
            Defaults to True.
        trainer_kwargs (Dict[str, Any], optional): Additional arguments to the
            `PyTorch Lightning trainer <https://pytorch-lightning.readthedocs.io/en/latest/trainer.html>`_ such
            as ``limit_train_batches``. Defaults to {}.
        log_dir (str, optional): Folder into which to log results for tensorboard. Defaults to "lightning_logs".
        study (optuna.Study, optional): study to resume. Will create new study by default.
        verbose (Union[int, bool]): level of verbosity.
            * None: no change in verbosity level (equivalent to verbose=1 by optuna-set default).
            * 0 or False: log only warnings.
            * 1 or True: log pruning events.
            * 2: optuna logging level at debug level.
            Defaults to None.
        **kwargs: Additional arguments for the :py:class:`~TemporalFusionTransformer`.

    Returns:
        optuna.Study: optuna study results
    """
    assert isinstance(train_dataloader.dataset, TimeSeriesDataSet) and isinstance(
        val_dataloader.dataset, TimeSeriesDataSet
    ), "dataloaders must be built from timeseriesdataset"

    # avoid mutable default argument: resolve the None sentinel to a fresh dict per call
    if trainer_kwargs is None:
        trainer_kwargs = {}

    # translate the user-facing verbosity level to optuna's logging levels
    # (True/False hash to 1/0, so they hit the right keys)
    logging_level = {
        None: optuna.logging.get_verbosity(),
        0: optuna.logging.WARNING,
        1: optuna.logging.INFO,
        2: optuna.logging.DEBUG,
    }
    optuna_verbose = logging_level[verbose]
    optuna.logging.set_verbosity(optuna_verbose)

    loss = kwargs.get(
        "loss", QuantileLoss()
    )  # need a deepcopy of loss as it will otherwise propagate from one trial to the next

    # create objective function
    def objective(trial: optuna.Trial) -> float:
        # Filenames for each trial must be made unique in order to access each checkpoint.
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=os.path.join(model_path, "trial_{}".format(trial.number)), filename="{epoch}", monitor="val_loss"
        )

        # The default logger in PyTorch Lightning writes to event files to be consumed by
        # TensorBoard. We don't use any logger here as it requires us to implement several abstract
        # methods. Instead we setup a simple callback, that saves metrics from each validation step.
        metrics_callback = MetricsCallback()
        learning_rate_callback = LearningRateMonitor()
        logger = TensorBoardLogger(log_dir, name="optuna", version=trial.number)
        gradient_clip_val = trial.suggest_loguniform("gradient_clip_val", *gradient_clip_val_range)
        default_trainer_kwargs = dict(
            gpus=[0] if torch.cuda.is_available() else None,
            max_epochs=max_epochs,
            gradient_clip_val=gradient_clip_val,
            callbacks=[
                metrics_callback,
                learning_rate_callback,
                checkpoint_callback,
                PyTorchLightningPruningCallback(trial, monitor="val_loss"),
            ],
            logger=logger,
            # suppress progress bar / weights summary unless optuna logs at INFO or finer
            progress_bar_refresh_rate=[0, 1][optuna_verbose < optuna.logging.INFO],
            weights_summary=[None, "top"][optuna_verbose < optuna.logging.INFO],
        )
        # user-supplied trainer arguments override the defaults above
        default_trainer_kwargs.update(trainer_kwargs)
        trainer = pl.Trainer(**default_trainer_kwargs)

        # create model
        hidden_size = trial.suggest_int("hidden_size", *hidden_size_range, log=True)
        kwargs["loss"] = copy.deepcopy(loss)
        model = TemporalFusionTransformer.from_dataset(
            train_dataloader.dataset,
            dropout=trial.suggest_uniform("dropout", *dropout_range),
            hidden_size=hidden_size,
            # hidden_continuous_size must not exceed hidden_size
            hidden_continuous_size=trial.suggest_int(
                "hidden_continuous_size",
                hidden_continuous_size_range[0],
                min(hidden_continuous_size_range[1], hidden_size),
                log=True,
            ),
            attention_head_size=trial.suggest_int("attention_head_size", *attention_head_size_range),
            log_interval=-1,
            **kwargs,
        )
        # find good learning rate
        if use_learning_rate_finder:
            lr_trainer = pl.Trainer(
                gradient_clip_val=gradient_clip_val,
                gpus=[0] if torch.cuda.is_available() else None,
                logger=False,
                progress_bar_refresh_rate=0,
                weights_summary=None,
            )
            res = lr_trainer.tuner.lr_find(
                model,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloader,
                early_stop_threshold=10000,
                min_lr=learning_rate_range[0],
                num_training=100,
                max_lr=learning_rate_range[1],
            )

            loss_finite = np.isfinite(res.results["loss"])
            if loss_finite.sum() > 3:  # at least 3 valid values required for learning rate finder
                # smooth the loss curve and pick the lr at the steepest descent
                lr_smoothed, loss_smoothed = sm.nonparametric.lowess(
                    np.asarray(res.results["loss"])[loss_finite],
                    np.asarray(res.results["lr"])[loss_finite],
                    frac=1.0 / 10.0,
                )[min(loss_finite.sum() - 3, 10) : -1].T
                optimal_idx = np.gradient(loss_smoothed).argmin()
                optimal_lr = lr_smoothed[optimal_idx]
            else:
                optimal_idx = np.asarray(res.results["loss"]).argmin()
                optimal_lr = res.results["lr"][optimal_idx]
            optuna_logger.info(f"Using learning rate of {optimal_lr:.3g}")
            # add learning rate artificially (degenerate interval so optuna records the value)
            model.hparams.learning_rate = trial.suggest_uniform("learning_rate", optimal_lr, optimal_lr)
        else:
            model.hparams.learning_rate = trial.suggest_loguniform("learning_rate", *learning_rate_range)

        # fit
        trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)

        # report result
        return metrics_callback.metrics[-1]["val_loss"].item()

    # setup optuna and run
    pruner = optuna.pruners.SuccessiveHalvingPruner()
    if study is None:
        study = optuna.create_study(direction="minimize", pruner=pruner)
    study.optimize(objective, n_trials=n_trials, timeout=timeout)
    return study
def test_integration(multiple_dataloaders_with_covariates, tmp_path, gpus):
    """End-to-end smoke test: fit a TFT, reload it from checkpoint and predict."""
    train_dataloader = multiple_dataloaders_with_covariates["train"]
    val_dataloader = multiple_dataloaders_with_covariates["val"]
    stop_early = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min")

    # check training
    logger = TensorBoardLogger(tmp_path)
    checkpoint = ModelCheckpoint(filepath=tmp_path)
    trainer = pl.Trainer(
        checkpoint_callback=checkpoint,
        max_epochs=3,
        gpus=gpus,
        weights_summary="top",
        gradient_clip_val=0.1,
        callbacks=[stop_early],
        fast_dev_run=True,
        logger=logger,
    )

    # test monotone constraints automatically
    if "discount_in_percent" in train_dataloader.dataset.reals:
        monotone_constaints = {"discount_in_percent": +1}
        cuda_context = torch.backends.cudnn.flags(enabled=False)
    else:
        monotone_constaints = {}
        cuda_context = nullcontext()

    with cuda_context:
        # pick a loss matching the target normalizer(s)
        normalizer = train_dataloader.dataset.target_normalizer
        if isinstance(normalizer, NaNLabelEncoder):
            loss = CrossEntropy()
        elif isinstance(normalizer, MultiNormalizer):
            loss = MultiLoss(
                [
                    CrossEntropy() if isinstance(norm, NaNLabelEncoder) else QuantileLoss()
                    for norm in normalizer.normalizers
                ]
            )
        else:
            loss = QuantileLoss()
        net = TemporalFusionTransformer.from_dataset(
            train_dataloader.dataset,
            learning_rate=0.15,
            hidden_size=4,
            attention_head_size=1,
            dropout=0.2,
            hidden_continuous_size=2,
            loss=loss,
            log_interval=5,
            log_val_interval=1,
            log_gradient_flow=True,
            monotone_constaints=monotone_constaints,
        )
        net.size()
        try:
            trainer.fit(
                net,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloader,
            )
            # check loading
            net = TemporalFusionTransformer.load_from_checkpoint(checkpoint.best_model_path)
            # check prediction
            net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)
            # check prediction on gpu
            if not (isinstance(gpus, int) and gpus == 0):
                net.to("cuda")
                net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)
        finally:
            shutil.rmtree(tmp_path, ignore_errors=True)
def __init__(
    self,
    hidden_size: int = 16,
    lstm_layers: int = 1,
    dropout: float = 0.1,
    output_size: Union[int, List[int]] = 7,
    loss: MultiHorizonMetric = None,
    attention_head_size: int = 4,
    max_encoder_length: int = 10,
    static_categoricals: List[str] = [],
    static_reals: List[str] = [],
    time_varying_categoricals_encoder: List[str] = [],
    time_varying_categoricals_decoder: List[str] = [],
    categorical_groups: Dict[str, List[str]] = {},
    time_varying_reals_encoder: List[str] = [],
    time_varying_reals_decoder: List[str] = [],
    x_reals: List[str] = [],
    x_categoricals: List[str] = [],
    hidden_continuous_size: int = 8,
    hidden_continuous_sizes: Dict[str, int] = {},
    embedding_sizes: Dict[str, Tuple[int, int]] = {},
    embedding_paddings: List[str] = [],
    embedding_labels: Dict[str, np.ndarray] = {},
    learning_rate: float = 1e-3,
    log_interval: Union[int, float] = -1,
    log_val_interval: Union[int, float] = None,
    log_gradient_flow: bool = False,
    reduce_on_plateau_patience: int = 1000,
    monotone_constaints: Dict[str, int] = {},
    share_single_variable_networks: bool = False,
    logging_metrics: nn.ModuleList = None,
    **kwargs,
):
    """
    Temporal Fusion Transformer for forecasting timeseries - use its :py:meth:`~from_dataset` method if possible.

    Implementation of the article
    `Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting
    <https://arxiv.org/pdf/1912.09363.pdf>`_. The network outperforms DeepAR by Amazon by 36-69% in benchmarks.

    Enhancements compared to the original implementation (apart from capabilities added through base model
    such as monotone constraints):

    * static variables can be continuous
    * multiple categorical variables can be summarized with an EmbeddingBag
    * variable encoder and decoder length by sample
    * categorical embeddings are not transformed by variable selection network (because it is a redundant operation)
    * variable dimension in variable selection network are scaled up via linear interpolation to reduce number of
      parameters
    * non-linear variable processing in variable selection network can be shared among decoder and encoder
      (not shared by default)

    Tune its hyperparameters with
    :py:func:`~pytorch_forecasting.models.temporal_fusion_transformer.tuning.optimize_hyperparameters`.

    Args:
        hidden_size: hidden size of network which is its main hyperparameter and can range from 8 to 512
        lstm_layers: number of LSTM layers (2 is mostly optimal)
        dropout: dropout rate
        output_size: number of outputs (e.g. number of quantiles for QuantileLoss and one target or list
            of output sizes).
        loss: loss function taking prediction and targets
        attention_head_size: number of attention heads (4 is a good default)
        max_encoder_length: length to encode (can be far longer than the decoder length but does not have to be)
        static_categoricals: names of static categorical variables
        static_reals: names of static continuous variables
        time_varying_categoricals_encoder: names of categorical variables for encoder
        time_varying_categoricals_decoder: names of categorical variables for decoder
        time_varying_reals_encoder: names of continuous variables for encoder
        time_varying_reals_decoder: names of continuous variables for decoder
        categorical_groups: dictionary where values are list of categorical variables that are forming together a new
            categorical variable which is the key in the dictionary
        x_reals: order of continuous variables in tensor passed to forward function
        x_categoricals: order of categorical variables in tensor passed to forward function
        hidden_continuous_size: default for hidden size for processing continous variables (similar to categorical
            embedding size)
        hidden_continuous_sizes: dictionary mapping continuous input indices to sizes for variable selection
            (fallback to hidden_continuous_size if index is not in dictionary)
        embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and embedding
            size
        embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector
        embedding_labels: dictionary mapping (string) indices to list of categorical labels
        learning_rate: learning rate
        log_interval: log predictions every x batches, do not log if 0 or less, log interpretation if > 0. If < 1.0,
            will log multiple entries per batch. Defaults to -1.
        log_val_interval: frequency with which to log validation set metrics, defaults to log_interval
        log_gradient_flow: if to log gradient flow, this takes time and should be only done to diagnose training
            failures
        reduce_on_plateau_patience (int): patience after which learning rate is reduced by a factor of 10
        monotone_constaints (Dict[str, int]): dictionary of monotonicity constraints for continuous decoder
            variables mapping position (e.g. ``"0"`` for first position) to constraint (``-1`` for negative and
            ``+1`` for positive, larger numbers add more weight to the constraint vs. the loss but are usually
            not necessary). This constraint significantly slows down training. Defaults to {}.
        share_single_variable_networks (bool): if to share the single variable networks between the encoder and
            decoder. Defaults to False.
        logging_metrics (nn.ModuleList[LightningMetric]): list of metrics that are logged during training.
            Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]).
        **kwargs: additional arguments to :py:class:`~BaseModel`.
    """
    # module defaults cannot live in the signature (they would be shared across instances)
    if logging_metrics is None:
        logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()])
    if loss is None:
        loss = QuantileLoss()
    self.save_hyperparameters()
    # store loss function separately as it is a module
    assert isinstance(loss, LightningMetric), "Loss has to be a PyTorch Lightning `Metric`"
    super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)

    # processing inputs
    # embeddings
    self.input_embeddings = MultiEmbedding(
        embedding_sizes=self.hparams.embedding_sizes,
        categorical_groups=self.hparams.categorical_groups,
        embedding_paddings=self.hparams.embedding_paddings,
        x_categoricals=self.hparams.x_categoricals,
        max_embedding_size=self.hparams.hidden_size,
    )

    # continuous variable processing: one linear "prescaler" per real-valued input
    self.prescalers = nn.ModuleDict(
        {
            name: nn.Linear(1, self.hparams.hidden_continuous_sizes.get(name, self.hparams.hidden_continuous_size))
            for name in self.reals
        }
    )

    # variable selection
    # variable selection for static variables
    static_input_sizes = {
        name: self.hparams.embedding_sizes[name][1] for name in self.hparams.static_categoricals
    }
    static_input_sizes.update(
        {
            name: self.hparams.hidden_continuous_sizes.get(name, self.hparams.hidden_continuous_size)
            for name in self.hparams.static_reals
        }
    )
    self.static_variable_selection = VariableSelectionNetwork(
        input_sizes=static_input_sizes,
        hidden_size=self.hparams.hidden_size,
        input_embedding_flags={name: True for name in self.hparams.static_categoricals},
        dropout=self.hparams.dropout,
        prescalers=self.prescalers,
    )

    # variable selection for encoder and decoder
    encoder_input_sizes = {
        name: self.hparams.embedding_sizes[name][1] for name in self.hparams.time_varying_categoricals_encoder
    }
    encoder_input_sizes.update(
        {
            name: self.hparams.hidden_continuous_sizes.get(name, self.hparams.hidden_continuous_size)
            for name in self.hparams.time_varying_reals_encoder
        }
    )
    decoder_input_sizes = {
        name: self.hparams.embedding_sizes[name][1] for name in self.hparams.time_varying_categoricals_decoder
    }
    decoder_input_sizes.update(
        {
            name: self.hparams.hidden_continuous_sizes.get(name, self.hparams.hidden_continuous_size)
            for name in self.hparams.time_varying_reals_decoder
        }
    )

    # create single variable grns that are shared across decoder and encoder
    if self.hparams.share_single_variable_networks:
        self.shared_single_variable_grns = nn.ModuleDict()
        for name, input_size in encoder_input_sizes.items():
            self.shared_single_variable_grns[name] = GatedResidualNetwork(
                input_size,
                min(input_size, self.hparams.hidden_size),
                self.hparams.hidden_size,
                self.hparams.dropout,
            )
        # add decoder-only variables that the encoder did not cover
        for name, input_size in decoder_input_sizes.items():
            if name not in self.shared_single_variable_grns:
                self.shared_single_variable_grns[name] = GatedResidualNetwork(
                    input_size,
                    min(input_size, self.hparams.hidden_size),
                    self.hparams.hidden_size,
                    self.hparams.dropout,
                )

    self.encoder_variable_selection = VariableSelectionNetwork(
        input_sizes=encoder_input_sizes,
        hidden_size=self.hparams.hidden_size,
        input_embedding_flags={name: True for name in self.hparams.time_varying_categoricals_encoder},
        dropout=self.hparams.dropout,
        context_size=self.hparams.hidden_size,
        prescalers=self.prescalers,
        single_variable_grns={}
        if not self.hparams.share_single_variable_networks
        else self.shared_single_variable_grns,
    )

    self.decoder_variable_selection = VariableSelectionNetwork(
        input_sizes=decoder_input_sizes,
        hidden_size=self.hparams.hidden_size,
        input_embedding_flags={name: True for name in self.hparams.time_varying_categoricals_decoder},
        dropout=self.hparams.dropout,
        context_size=self.hparams.hidden_size,
        prescalers=self.prescalers,
        single_variable_grns={}
        if not self.hparams.share_single_variable_networks
        else self.shared_single_variable_grns,
    )

    # static encoders
    # for variable selection
    self.static_context_variable_selection = GatedResidualNetwork(
        input_size=self.hparams.hidden_size,
        hidden_size=self.hparams.hidden_size,
        output_size=self.hparams.hidden_size,
        dropout=self.hparams.dropout,
    )

    # for hidden state of the lstm
    self.static_context_initial_hidden_lstm = GatedResidualNetwork(
        input_size=self.hparams.hidden_size,
        hidden_size=self.hparams.hidden_size,
        output_size=self.hparams.hidden_size,
        dropout=self.hparams.dropout,
    )

    # for cell state of the lstm
    self.static_context_initial_cell_lstm = GatedResidualNetwork(
        input_size=self.hparams.hidden_size,
        hidden_size=self.hparams.hidden_size,
        output_size=self.hparams.hidden_size,
        dropout=self.hparams.dropout,
    )

    # for post lstm static enrichment
    self.static_context_enrichment = GatedResidualNetwork(
        self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.dropout
    )

    # lstm encoder (history) and decoder (future) for local processing
    self.lstm_encoder = LSTM(
        input_size=self.hparams.hidden_size,
        hidden_size=self.hparams.hidden_size,
        num_layers=self.hparams.lstm_layers,
        dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0,
        batch_first=True,
    )

    self.lstm_decoder = LSTM(
        input_size=self.hparams.hidden_size,
        hidden_size=self.hparams.hidden_size,
        num_layers=self.hparams.lstm_layers,
        dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0,
        batch_first=True,
    )

    # skip connection for lstm (gate and add-norm weights are shared between encoder and decoder)
    self.post_lstm_gate_encoder = GatedLinearUnit(self.hparams.hidden_size, dropout=self.hparams.dropout)
    self.post_lstm_gate_decoder = self.post_lstm_gate_encoder
    # self.post_lstm_gate_decoder = GatedLinearUnit(self.hparams.hidden_size, dropout=self.hparams.dropout)
    self.post_lstm_add_norm_encoder = AddNorm(self.hparams.hidden_size, trainable_add=False)
    # self.post_lstm_add_norm_decoder = AddNorm(self.hparams.hidden_size, trainable_add=True)
    self.post_lstm_add_norm_decoder = self.post_lstm_add_norm_encoder

    # static enrichment and processing past LSTM
    self.static_enrichment = GatedResidualNetwork(
        input_size=self.hparams.hidden_size,
        hidden_size=self.hparams.hidden_size,
        output_size=self.hparams.hidden_size,
        dropout=self.hparams.dropout,
        context_size=self.hparams.hidden_size,
    )

    # attention for long-range processing
    self.multihead_attn = InterpretableMultiHeadAttention(
        d_model=self.hparams.hidden_size, n_head=self.hparams.attention_head_size, dropout=self.hparams.dropout
    )
    self.post_attn_gate_norm = GateAddNorm(
        self.hparams.hidden_size, dropout=self.hparams.dropout, trainable_add=False
    )
    self.pos_wise_ff = GatedResidualNetwork(
        self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, dropout=self.hparams.dropout
    )

    # output processing -> no dropout at this late stage
    self.pre_output_gate_norm = GateAddNorm(self.hparams.hidden_size, dropout=None, trainable_add=False)

    if self.n_targets > 1:  # if to run with multiple targets
        self.output_layer = nn.ModuleList(
            [nn.Linear(self.hparams.hidden_size, output_size) for output_size in self.hparams.output_size]
        )
    else:
        self.output_layer = nn.Linear(self.hparams.hidden_size, self.hparams.output_size)
def test_integration(multiple_dataloaders_with_covariates, tmp_path, gpus):
    """End-to-end smoke test: fit a TFT, reload it from checkpoint, predict and check output shapes."""
    train_dataloader = multiple_dataloaders_with_covariates["train"]
    val_dataloader = multiple_dataloaders_with_covariates["val"]
    stop_early = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min")

    # check training
    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(
        max_epochs=2,
        gpus=gpus,
        weights_summary="top",
        gradient_clip_val=0.1,
        callbacks=[stop_early],
        checkpoint_callback=True,
        default_root_dir=tmp_path,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
    )

    # test monotone constraints automatically
    if "discount_in_percent" in train_dataloader.dataset.reals:
        monotone_constaints = {"discount_in_percent": +1}
        cuda_context = torch.backends.cudnn.flags(enabled=False)
    else:
        monotone_constaints = {}
        cuda_context = nullcontext()

    with cuda_context:
        # pick a loss matching the target normalizer(s)
        normalizer = train_dataloader.dataset.target_normalizer
        if isinstance(normalizer, NaNLabelEncoder):
            loss = CrossEntropy()
        elif isinstance(normalizer, MultiNormalizer):
            loss = MultiLoss(
                [
                    CrossEntropy() if isinstance(norm, NaNLabelEncoder) else QuantileLoss()
                    for norm in normalizer.normalizers
                ]
            )
        else:
            loss = QuantileLoss()
        net = TemporalFusionTransformer.from_dataset(
            train_dataloader.dataset,
            learning_rate=0.15,
            hidden_size=4,
            attention_head_size=1,
            dropout=0.2,
            hidden_continuous_size=2,
            loss=loss,
            log_interval=5,
            log_val_interval=1,
            log_gradient_flow=True,
            monotone_constaints=monotone_constaints,
        )
        net.size()
        try:
            trainer.fit(
                net,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloader,
            )
            # check loading
            net = TemporalFusionTransformer.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

            # check prediction
            predictions, x, index = net.predict(val_dataloader, return_index=True, return_x=True)
            pred_len = len(multiple_dataloaders_with_covariates["val"].dataset)

            # check that output is of correct shape
            def check(obj):
                if isinstance(obj, (tuple, list)):
                    for item in obj:
                        check(item)
                elif isinstance(obj, dict):
                    for item in obj.values():
                        check(item)
                else:
                    assert pred_len == obj.shape[0], "first dimension should be prediction length"

            check(predictions)
            check(x)
            check(index)

            # check prediction on gpu
            if not (isinstance(gpus, int) and gpus == 0):
                net.to("cuda")
                net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)
        finally:
            shutil.rmtree(tmp_path, ignore_errors=True)
def __init__(
    self,
    n_lags=60,
    n_forecasts=20,
    batch_size=None,
    epochs=100,
    patience_early_stopping=10,
    early_stop=True,
    learning_rate=3e-2,
    auto_lr_find=True,
    num_workers=3,
    loss_func="QuantileLoss",
    hidden_size=32,
    attention_head_size=1,
    hidden_continuous_size=8,
    dropout=0.1,
):
    """
    Args:
        n_lags: int, Number of time units that condition the predictions. Also known as 'lookback period'.
            Should be between 1-10 times the prediction length. Can be seen as equivalent for n_lags in NP
        n_forecasts: int - Number of time units that the model predicts
        batch_size: int, batch_size. If set to None, automatic batch size will be set
        epochs: int, number of epochs for training. Will be overwritten, if EarlyStopping is applied
        patience_early_stopping: int, patience parameter of EarlyStopping callback
        early_stop: bool, whether to use EarlyStopping callback
        learning_rate: float, learning rate for the model. Will be overwritten, if auto_lr_find is used
        auto_lr_find: bool, whether to use automatic learning rate finder
        num_workers: int, number of workers for DataLoaders
        loss_func: str, loss function taking prediction and targets, should be from MultiHorizonMetric class,
            defaults to QuantileLoss.
        hidden_size: int, hidden size of network which is its main hyperparameter and can range from 8 to 512
        attention_head_size: int, number of attention heads, lager values (up to 8) for large amount of data
        hidden_continuous_size: int, dictionary mapping continuous input indices to sizes for variable selection
        dropout: dropout in RNN layers, should be between 0 and 1.
    """
    self.batch_size = batch_size
    self.epochs = epochs
    self.patience_early_stopping = patience_early_stopping
    self.early_stop = early_stop
    self.learning_rate = learning_rate
    self.auto_lr_find = auto_lr_find
    # an explicit learning rate disables the automatic finder
    # NOTE(review): learning_rate defaults to 3e-2 (not None), so auto_lr_find is always
    # switched off unless the caller passes learning_rate=None explicitly — confirm intent
    if self.learning_rate is not None:
        self.auto_lr_find = False
    self.num_workers = num_workers
    self.context_length = n_lags
    self.prediction_length = n_forecasts
    self.hidden_size = hidden_size
    self.attention_head_size = attention_head_size
    self.hidden_continuous_size = hidden_continuous_size
    self.dropout = dropout
    self.loss_func = loss_func
    self.fitted = False
    self.freq = None

    # resolve a string loss name to a concrete loss module
    if isinstance(self.loss_func, str):
        name = self.loss_func.lower()
        if name in ["huber", "smoothl1", "smoothl1loss"]:
            self.loss_func = torch.nn.SmoothL1Loss()
        elif name in ["mae", "l1", "l1loss"]:
            self.loss_func = torch.nn.L1Loss()
        elif name in ["mse", "mseloss", "l2", "l2loss"]:
            self.loss_func = torch.nn.MSELoss()
        elif name in ["quantileloss"]:
            self.loss_func = QuantileLoss()
        else:
            raise NotImplementedError("Loss function {} name not defined".format(self.loss_func))
    elif callable(self.loss_func):
        pass
    elif hasattr(torch.nn.modules.loss, self.loss_func.__class__.__name__):
        pass
    else:
        raise NotImplementedError("Loss function {} not found".format(self.loss_func))

    self.metrics = metrics.MetricsCollection(
        metrics=[metrics.LossMetric(torch.nn.SmoothL1Loss()), metrics.MAE(), metrics.MSE()],
        value_metrics=[
            # metrics.ValueMetric("Loss"),
        ],
    )
    self.val_metrics = metrics.MetricsCollection([m.new() for m in self.metrics.batch_metrics])
def __init__(self, output_size: int = 7, loss: MultiHorizonMetric = None):
    """
    Initialize the model.

    Args:
        output_size: number of outputs (e.g. number of quantiles for QuantileLoss). Defaults to 7.
        loss: loss function taking prediction and targets; defaults to a fresh ``QuantileLoss()``.
    """
    # a default of ``QuantileLoss()`` in the signature would be evaluated once at definition
    # time and the same loss module would be shared across every instance — use a None sentinel
    if loss is None:
        loss = QuantileLoss()
    self.save_hyperparameters()
    # store loss function separately as it is a module
    super().__init__()
    self.loss = loss
# fast_dev_run=True, # logger=logger, # profiler=True, callbacks=[lr_logger, early_stop_callback], ) tft = TemporalFusionTransformer.from_dataset( training, learning_rate=0.03, hidden_size=16, attention_head_size=1, dropout=0.1, hidden_continuous_size=8, output_size=7, loss=QuantileLoss(), log_interval=10, log_val_interval=1, reduce_on_plateau_patience=3, ) print(f"Number of parameters in network: {tft.size()/1e3:.1f}k") # # find optimal learning rate # # remove logging and artificial epoch size # tft.hparams.log_interval = -1 # tft.hparams.log_val_interval = -1 # trainer.limit_train_batches = 1.0 # # run learning rate finder # res = trainer.tuner.lr_find( # tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 # )
    def __init__(
        self,
        activation_class: str = "ReLU",
        hidden_size: int = 300,
        n_hidden_layers: int = 3,
        dropout: float = 0.1,
        norm: bool = True,
        static_categoricals: List[str] = [],
        static_reals: List[str] = [],
        time_varying_categoricals_encoder: List[str] = [],
        time_varying_categoricals_decoder: List[str] = [],
        categorical_groups: Dict[str, List[str]] = {},
        time_varying_reals_encoder: List[str] = [],
        time_varying_reals_decoder: List[str] = [],
        embedding_sizes: Dict[str, Tuple[int, int]] = {},
        embedding_paddings: List[str] = [],
        embedding_labels: Dict[str, np.ndarray] = {},
        x_reals: List[str] = [],
        x_categoricals: List[str] = [],
        output_size: Union[int, List[int]] = 1,
        target: Union[str, List[str]] = None,
        loss: MultiHorizonMetric = None,
        logging_metrics: nn.ModuleList = None,
        **kwargs,
    ):
        """
        Initialize an MLP-based decoder model.

        NOTE(review): the mutable defaults (``[]``/``{}``) are recorded into
        hparams by ``save_hyperparameters()``; they appear to be treated as
        read-only here, but confirm no caller mutates them.

        Args:
            activation_class (str, optional): PyTorch activation class. Defaults to "ReLU".
            hidden_size (int, optional): hidden size of the MLP - the most important hyperparameter
                along with ``n_hidden_layers``. Defaults to 300.
            n_hidden_layers (int, optional): number of hidden layers - important hyperparameter. Defaults to 3.
            dropout (float, optional): dropout rate. Defaults to 0.1.
            norm (bool, optional): if to use normalization in the MLP. Defaults to True.
            static_categoricals: integer of positions of static categorical variables
            static_reals: integer of positions of static continuous variables
            time_varying_categoricals_encoder: integer of positions of categorical variables for encoder
            time_varying_categoricals_decoder: integer of positions of categorical variables for decoder
            time_varying_reals_encoder: integer of positions of continuous variables for encoder
            time_varying_reals_decoder: integer of positions of continuous variables for decoder
            categorical_groups: dictionary where values are list of categorical variables that are
                forming together a new categorical variable which is the key in the dictionary
            x_reals: order of continuous variables in tensor passed to forward function
            x_categoricals: order of categorical variables in tensor passed to forward function
            embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical
                classes and embedding size
            embedding_paddings: list of indices for embeddings which transform the zero's embedding
                to a zero vector
            embedding_labels: dictionary mapping (string) indices to list of categorical labels
            output_size (Union[int, List[int]], optional): number of outputs (e.g. number of quantiles
                for QuantileLoss and one target or list of output sizes).
            target (str, optional): Target variable or list of target variables. Defaults to None.
            loss (MultiHorizonMetric, optional): loss function taking prediction and targets.
                Defaults to QuantileLoss.
            logging_metrics (nn.ModuleList, optional): Metrics to log during training.
                Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]).
        """
        # Resolve the None sentinels before hyperparameters are captured.
        if loss is None:
            loss = QuantileLoss()
        if logging_metrics is None:
            logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()])
        self.save_hyperparameters()
        # store loss function separately as it is a module
        super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)

        # Only embed categoricals actually consumed by the decoder or used as statics.
        self.input_embeddings = MultiEmbedding(
            embedding_sizes={
                name: val
                for name, val in embedding_sizes.items()
                if name in self.decoder_variables + self.static_variables
            },
            embedding_paddings=embedding_paddings,
            categorical_groups=categorical_groups,
            x_categoricals=x_categoricals,
        )

        # define network
        # A list output_size means multiple targets: the MLP emits all of them
        # concatenated, hence the sum.
        if isinstance(self.hparams.output_size, int):
            mlp_output_size = self.hparams.output_size
        else:
            mlp_output_size = sum(self.hparams.output_size)

        # MLP input = decoder-visible continuous features + all embedding outputs.
        cont_size = len(self.decoder_reals_positions)
        cat_size = sum(self.input_embeddings.output_size.values())
        input_size = cont_size + cat_size

        self.mlp = FullyConnectedModule(
            dropout=dropout,
            norm=self.hparams.norm,
            activation_class=getattr(nn, self.hparams.activation_class),
            input_size=input_size,
            output_size=mlp_output_size,
            hidden_size=self.hparams.hidden_size,
            n_hidden_layers=self.hparams.n_hidden_layers,
        )
def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): new_kwargs = cls.deduce_default_output_parameters( dataset, kwargs, QuantileLoss()) kwargs.update(new_kwargs) return super().from_dataset(dataset, **kwargs)
                    return_decoder_lengths=True)
    finally:
        # always remove checkpoints/logs written during the integration run
        shutil.rmtree(tmp_path, ignore_errors=True)

    net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)


@pytest.mark.parametrize(
    "kwargs",
    [
        # default configuration
        {},
        # multi-target regression: one loss per target via MultiLoss
        dict(
            loss=MultiLoss([QuantileLoss(), MAE()]),
            data_loader_kwargs=dict(
                time_varying_unknown_reals=["volume", "discount"],
                target=["volume", "discount"],
            ),
        ),
        # categorical target trained with cross entropy
        dict(
            loss=CrossEntropy(),
            data_loader_kwargs=dict(target="agency", ),
        ),
    ],
)
def test_integration(data_with_covariates, tmp_path, gpus, kwargs):
    # Run the shared integration helper on data with a synthetic target column.
    _integration(data_with_covariates.assign(target=lambda x: x.volume), tmp_path, gpus, **kwargs)
# fast_dev_run=True, # logger=logger, # profiler=True, callbacks=[lr_logger], ) tft = TemporalFusionTransformer.from_dataset( training, learning_rate=0.1, hidden_size=32, attention_head_size=1, dropout=0.1, hidden_continuous_size=32, output_size=3, loss=QuantileLoss(quantiles=[0.1, 0.5, 0.9]), log_interval=10, log_val_interval=3, # reduce_on_plateau_patience=3, ) print(f"Number of parameters in network: {tft.size()/1e3:.1f}k") # # find optimal learning rate # tft.hparams.log_interval = -1 # tft.hparams.log_val_interval = -1 # trainer.limit_train_batches = 1.0 # res = trainer.tuner.lr_find( # tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 # ) # print(f"suggested learning rate: {res.suggestion()}")
    def train(
        self,
        max_epochs=25,
        hidden_size=16,
        lstm_layers=1,
        dropout=0.1,
        attention_head_size=4,
        reduce_on_plateau_patience=4,
        hidden_continuous_size=8,
        learning_rate=1e-3,
        gradient_clip_val=0.1,
    ):
        """Configure and fit a TemporalFusionTransformer on the internal datasets.

        Builds train/validation dataloaders from ``self.intern_training`` and
        ``self._intern_validation``, creates a CPU trainer with early stopping
        on ``val_loss``, constructs the model via ``from_dataset`` with a
        quantile loss over ``self.quantiles``, and fits it. The fitted model is
        stored in ``self.model``.

        Args:
            max_epochs: maximum number of training epochs.
            hidden_size: TFT hidden size (main capacity hyperparameter).
            lstm_layers: number of LSTM layers in the TFT.
            dropout: dropout rate.
            attention_head_size: number of attention heads.
            reduce_on_plateau_patience: patience for LR reduction on plateau.
            hidden_continuous_size: hidden size for continuous-variable processing.
            learning_rate: optimizer learning rate.
            gradient_clip_val: gradient clipping value for the trainer.
        """
        # configure network and trainer
        # create dataloaders for model
        batch_size = 128
        train_dataloader = self.intern_training.to_dataloader(
            train=True, batch_size=batch_size)
        # validation can use a much larger batch size (no gradient state kept)
        val_dataloader = self._intern_validation.to_dataloader(
            train=False, batch_size=batch_size * 10)

        pl.seed_everything(42)  # reproducible runs
        early_stop_callback = EarlyStopping(monitor="val_loss",
                                            min_delta=1e-4,
                                            patience=10,
                                            verbose=False,
                                            mode="min")
        # lr_logger = LearningRateMonitor()
        trainer = pl.Trainer(
            max_epochs=max_epochs,
            gpus=0,  # CPU training
            weights_summary=None,
            gradient_clip_val=gradient_clip_val,
            # limit_train_batches=30,  # comment in for training, running validation every 30 batches
            # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
            callbacks=[early_stop_callback],
        )

        self.model = TemporalFusionTransformer.from_dataset(
            self.intern_training,
            learning_rate=learning_rate,
            hidden_size=hidden_size,
            attention_head_size=attention_head_size,
            dropout=dropout,
            hidden_continuous_size=hidden_continuous_size,
            lstm_layers=lstm_layers,
            output_size=len(self.quantiles),  # 3 quantiles by default
            loss=QuantileLoss(self.quantiles),
            reduce_on_plateau_patience=reduce_on_plateau_patience,
        )

        # res = trainer.tuner.lr_find(
        #     self.model,
        #     train_dataloader=train_dataloader,
        #     val_dataloaders=val_dataloader,
        #     max_lr=10.0,
        #     min_lr=1e-6,
        # )
        # self.model = TemporalFusionTransformer.from_dataset(
        #     self.intern_training,
        #     learning_rate=res.suggestion(),  # using the suggested learning rate
        #     hidden_size=hidden_size,
        #     attention_head_size=attention_head_size,
        #     dropout=dropout,
        #     hidden_continuous_size=hidden_continuous_size,
        #     output_size=len(self.quantiles),  # 3 quantiles by default
        #     loss=QuantileLoss(self.quantiles),
        #     reduce_on_plateau_patience=reduce_on_plateau_patience,
        # )

        # fit network
        trainer.fit(
            self.model,
            train_dataloader=train_dataloader,
            val_dataloaders=val_dataloader,
        )