def __init__( self, input_size: int, output_size: int, hidden_size: int, n_hidden_layers: int, x_reals: List[str], x_categoricals: List[str], embedding_sizes: Dict[str, Tuple[int, int]], embedding_labels: Dict[str, List[str]], static_categoricals: List[str], static_reals: List[str], time_varying_categoricals_encoder: List[str], time_varying_categoricals_decoder: List[str], time_varying_reals_encoder: List[str], time_varying_reals_decoder: List[str], embedding_paddings: List[str], categorical_groups: Dict[str, List[str]], **kwargs, ): # saves arguments in signature to `.hparams` attribute, mandatory call - do not skip this self.save_hyperparameters() # pass additional arguments to BaseModel.__init__, mandatory call - do not skip this super().__init__(**kwargs) # create embedder - can be fed with x["encoder_cat"] or x["decoder_cat"] and will return # dictionary of category names mapped to embeddings self.input_embeddings = MultiEmbedding( embedding_sizes=self.hparams.embedding_sizes, categorical_groups=self.hparams.categorical_groups, embedding_paddings=self.hparams.embedding_paddings, x_categoricals=self.hparams.x_categoricals, max_embedding_size=self.hparams.hidden_size, ) # calculate the size of all concatenated embeddings + continous variables n_features = sum(embedding_size for classes_size, embedding_size in self.hparams. embedding_sizes.values()) + len(self.reals) # create network that will be fed with continious variables and embeddings self.network = FullyConnectedModule( input_size=self.hparams.input_size * n_features, output_size=self.hparams.output_size, hidden_size=self.hparams.hidden_size, n_hidden_layers=self.hparams.n_hidden_layers, )
def __init__( self, cell_type: str = "LSTM", hidden_size: int = 10, rnn_layers: int = 2, dropout: float = 0.1, static_categoricals: List[str] = [], static_reals: List[str] = [], time_varying_categoricals_encoder: List[str] = [], time_varying_categoricals_decoder: List[str] = [], categorical_groups: Dict[str, List[str]] = {}, time_varying_reals_encoder: List[str] = [], time_varying_reals_decoder: List[str] = [], embedding_sizes: Dict[str, Tuple[int, int]] = {}, embedding_paddings: List[str] = [], embedding_labels: Dict[str, np.ndarray] = {}, x_reals: List[str] = [], x_categoricals: List[str] = [], output_size: Union[int, List[int]] = 1, target: Union[str, List[str]] = None, target_lags: Dict[str, List[int]] = {}, loss: MultiHorizonMetric = None, logging_metrics: nn.ModuleList = None, **kwargs, ): """ Recurrent Network. Simple LSTM or GRU layer followed by output layer Args: cell_type (str, optional): Recurrent cell type ["LSTM", "GRU"]. Defaults to "LSTM". hidden_size (int, optional): hidden recurrent size - the most important hyperparameter along with ``rnn_layers``. Defaults to 10. rnn_layers (int, optional): Number of RNN layers - important hyperparameter. Defaults to 2. dropout (float, optional): Dropout in RNN layers. Defaults to 0.1. static_categoricals: integer of positions of static categorical variables static_reals: integer of positions of static continuous variables time_varying_categoricals_encoder: integer of positions of categorical variables for encoder time_varying_categoricals_decoder: integer of positions of categorical variables for decoder time_varying_reals_encoder: integer of positions of continuous variables for encoder time_varying_reals_decoder: integer of positions of continuous variables for decoder categorical_groups: dictionary where values are list of categorical variables that are forming together a new categorical variable which is the key in the dictionary x_reals: order of continuous variables in tensor passed to forward function x_categoricals: order of categorical variables in tensor passed to forward function embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and embedding size embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector embedding_labels: dictionary mapping (string) indices to list of categorical labels output_size (Union[int, List[int]], optional): number of outputs (e.g. number of quantiles for QuantileLoss and one target or list of output sizes). target (str, optional): Target variable or list of target variables. Defaults to None. target_lags (Dict[str, Dict[str, int]]): dictionary of target names mapped to list of time steps by which the variable should be lagged. Lags can be useful to indicate seasonality to the models. If you know the seasonalit(ies) of your data, add at least the target variables with the corresponding lags to improve performance. Defaults to no lags, i.e. an empty dictionary. loss (MultiHorizonMetric, optional): loss: loss function taking prediction and targets. logging_metrics (nn.ModuleList, optional): Metrics to log during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]). """ if loss is None: loss = MAE() if logging_metrics is None: logging_metrics = nn.ModuleList( [SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) self.save_hyperparameters() # store loss function separately as it is a module super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) self.embeddings = MultiEmbedding( embedding_sizes=embedding_sizes, embedding_paddings=embedding_paddings, categorical_groups=categorical_groups, x_categoricals=x_categoricals, ) lagged_target_names = [ l for lags in target_lags.values() for l in lags ] assert set(self.encoder_variables) - set( to_list(target) ) - set(lagged_target_names) == set(self.decoder_variables) - set( lagged_target_names ), "Encoder and decoder variables have to be the same apart from target variable" for targeti in to_list(target): assert ( targeti in time_varying_reals_encoder ), f"target {targeti} has to be real" # todo: remove this restriction assert ( isinstance(target, str) and isinstance(loss, MultiHorizonMetric) ) or ( isinstance(target, (list, tuple)) and isinstance(loss, MultiLoss) and len(loss) == len(target) ), "number of targets should be equivalent to number of loss metrics" rnn_class = get_rnn(cell_type) cont_size = len(self.reals) cat_size = sum( [size[1] for size in self.hparams.embedding_sizes.values()]) input_size = cont_size + cat_size self.rnn = rnn_class( input_size=input_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.rnn_layers, dropout=self.hparams.dropout if self.hparams.rnn_layers > 1 else 0, batch_first=True, ) # add linear layers for argument projects if isinstance(target, str): # single target self.output_projector = nn.Linear(self.hparams.hidden_size, self.hparams.output_size) assert not isinstance( self.loss, QuantileLoss ), "QuantileLoss does not work with recurrent network" else: # multi target self.output_projector = nn.ModuleList([ nn.Linear(self.hparams.hidden_size, size) for size in self.hparams.output_size ]) for l in self.loss: assert not isinstance( l, QuantileLoss ), "QuantileLoss does not work with recurrent network"
def __init__( self, hidden_size: int = 16, lstm_layers: int = 1, dropout: float = 0.1, output_size: Union[int, List[int]] = 7, loss: MultiHorizonMetric = None, attention_head_size: int = 4, max_encoder_length: int = 10, static_categoricals: List[str] = [], static_reals: List[str] = [], time_varying_categoricals_encoder: List[str] = [], time_varying_categoricals_decoder: List[str] = [], categorical_groups: Dict[str, List[str]] = {}, time_varying_reals_encoder: List[str] = [], time_varying_reals_decoder: List[str] = [], x_reals: List[str] = [], x_categoricals: List[str] = [], hidden_continuous_size: int = 8, hidden_continuous_sizes: Dict[str, int] = {}, embedding_sizes: Dict[str, Tuple[int, int]] = {}, embedding_paddings: List[str] = [], embedding_labels: Dict[str, np.ndarray] = {}, learning_rate: float = 1e-3, log_interval: Union[int, float] = -1, log_val_interval: Union[int, float] = None, log_gradient_flow: bool = False, reduce_on_plateau_patience: int = 1000, monotone_constaints: Dict[str, int] = {}, share_single_variable_networks: bool = False, logging_metrics: nn.ModuleList = None, **kwargs, ): """ Temporal Fusion Transformer for forecasting timeseries - use its :py:meth:`~from_dataset` method if possible. Implementation of the article `Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting <https://arxiv.org/pdf/1912.09363.pdf>`_. The network outperforms DeepAR by Amazon by 36-69% in benchmarks. Enhancements compared to the original implementation (apart from capabilities added through base model such as monotone constraints): * static variables can be continuous * multiple categorical variables can be summarized with an EmbeddingBag * variable encoder and decoder length by sample * categorical embeddings are not transformed by variable selection network (because it is a redundant operation) * variable dimension in variable selection network are scaled up via linear interpolation to reduce number of parameters * non-linear variable processing in variable selection network can be shared among decoder and encoder (not shared by default) Tune its hyperparameters with :py:func:`~pytorch_forecasting.models.temporal_fusion_transformer.tuning.optimize_hyperparameters`. Args: hidden_size: hidden size of network which is its main hyperparameter and can range from 8 to 512 lstm_layers: number of LSTM layers (2 is mostly optimal) dropout: dropout rate output_size: number of outputs (e.g. number of quantiles for QuantileLoss and one target or list of output sizes). loss: loss function taking prediction and targets attention_head_size: number of attention heads (4 is a good default) max_encoder_length: length to encode (can be far longer than the decoder length but does not have to be) static_categoricals: names of static categorical variables static_reals: names of static continuous variables time_varying_categoricals_encoder: names of categorical variables for encoder time_varying_categoricals_decoder: names of categorical variables for decoder time_varying_reals_encoder: names of continuous variables for encoder time_varying_reals_decoder: names of continuous variables for decoder categorical_groups: dictionary where values are list of categorical variables that are forming together a new categorical variable which is the key in the dictionary x_reals: order of continuous variables in tensor passed to forward function x_categoricals: order of categorical variables in tensor passed to forward function hidden_continuous_size: default for hidden size for processing continous variables (similar to categorical embedding size) hidden_continuous_sizes: dictionary mapping continuous input indices to sizes for variable selection (fallback to hidden_continuous_size if index is not in dictionary) embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and embedding size embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector embedding_labels: dictionary mapping (string) indices to list of categorical labels learning_rate: learning rate log_interval: log predictions every x batches, do not log if 0 or less, log interpretation if > 0. If < 1.0 , will log multiple entries per batch. Defaults to -1. log_val_interval: frequency with which to log validation set metrics, defaults to log_interval log_gradient_flow: if to log gradient flow, this takes time and should be only done to diagnose training failures reduce_on_plateau_patience (int): patience after which learning rate is reduced by a factor of 10 monotone_constaints (Dict[str, int]): dictionary of monotonicity constraints for continuous decoder variables mapping position (e.g. ``"0"`` for first position) to constraint (``-1`` for negative and ``+1`` for positive, larger numbers add more weight to the constraint vs. the loss but are usually not necessary). This constraint significantly slows down training. Defaults to {}. share_single_variable_networks (bool): if to share the single variable networks between the encoder and decoder. Defaults to False. logging_metrics (nn.ModuleList[LightningMetric]): list of metrics that are logged during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]). **kwargs: additional arguments to :py:class:`~BaseModel`. """ if logging_metrics is None: logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]) if loss is None: loss = QuantileLoss() self.save_hyperparameters() # store loss function separately as it is a module assert isinstance( loss, LightningMetric), "Loss has to be a PyTorch Lightning `Metric`" super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) # processing inputs # embeddings self.input_embeddings = MultiEmbedding( embedding_sizes=self.hparams.embedding_sizes, categorical_groups=self.hparams.categorical_groups, embedding_paddings=self.hparams.embedding_paddings, x_categoricals=self.hparams.x_categoricals, max_embedding_size=self.hparams.hidden_size, ) # continuous variable processing self.prescalers = nn.ModuleDict({ name: nn.Linear( 1, self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size)) for name in self.reals }) # variable selection # variable selection for static variables static_input_sizes = { name: self.hparams.embedding_sizes[name][1] for name in self.hparams.static_categoricals } static_input_sizes.update({ name: self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size) for name in self.hparams.static_reals }) self.static_variable_selection = VariableSelectionNetwork( input_sizes=static_input_sizes, hidden_size=self.hparams.hidden_size, input_embedding_flags={ name: True for name in self.hparams.static_categoricals }, dropout=self.hparams.dropout, prescalers=self.prescalers, ) # variable selection for encoder and decoder encoder_input_sizes = { name: self.hparams.embedding_sizes[name][1] for name in self.hparams.time_varying_categoricals_encoder } encoder_input_sizes.update({ name: self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size) for name in self.hparams.time_varying_reals_encoder }) decoder_input_sizes = { name: self.hparams.embedding_sizes[name][1] for name in self.hparams.time_varying_categoricals_decoder } decoder_input_sizes.update({ name: self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size) for name in self.hparams.time_varying_reals_decoder }) # create single variable grns that are shared across decoder and encoder if self.hparams.share_single_variable_networks: self.shared_single_variable_grns = nn.ModuleDict() for name, input_size in encoder_input_sizes.items(): self.shared_single_variable_grns[name] = GatedResidualNetwork( input_size, min(input_size, self.hparams.hidden_size), self.hparams.hidden_size, self.hparams.dropout, ) for name, input_size in decoder_input_sizes.items(): if name not in self.shared_single_variable_grns: self.shared_single_variable_grns[ name] = GatedResidualNetwork( input_size, min(input_size, self.hparams.hidden_size), self.hparams.hidden_size, self.hparams.dropout, ) self.encoder_variable_selection = VariableSelectionNetwork( input_sizes=encoder_input_sizes, hidden_size=self.hparams.hidden_size, input_embedding_flags={ name: True for name in self.hparams.time_varying_categoricals_encoder }, dropout=self.hparams.dropout, context_size=self.hparams.hidden_size, prescalers=self.prescalers, single_variable_grns={} if not self.hparams.share_single_variable_networks else self.shared_single_variable_grns, ) self.decoder_variable_selection = VariableSelectionNetwork( input_sizes=decoder_input_sizes, hidden_size=self.hparams.hidden_size, input_embedding_flags={ name: True for name in self.hparams.time_varying_categoricals_decoder }, dropout=self.hparams.dropout, context_size=self.hparams.hidden_size, prescalers=self.prescalers, single_variable_grns={} if not self.hparams.share_single_variable_networks else self.shared_single_variable_grns, ) # static encoders # for variable selection self.static_context_variable_selection = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, ) # for hidden state of the lstm self.static_context_initial_hidden_lstm = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, ) # for cell state of the lstm self.static_context_initial_cell_lstm = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, ) # for post lstm static enrichment self.static_context_enrichment = GatedResidualNetwork( self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.dropout) # lstm encoder (history) and decoder (future) for local processing self.lstm_encoder = LSTM( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.lstm_layers, dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0, batch_first=True, ) self.lstm_decoder = LSTM( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.lstm_layers, dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0, batch_first=True, ) # skip connection for lstm self.post_lstm_gate_encoder = GatedLinearUnit( self.hparams.hidden_size, dropout=self.hparams.dropout) self.post_lstm_gate_decoder = self.post_lstm_gate_encoder # self.post_lstm_gate_decoder = GatedLinearUnit(self.hparams.hidden_size, dropout=self.hparams.dropout) self.post_lstm_add_norm_encoder = AddNorm(self.hparams.hidden_size, trainable_add=False) # self.post_lstm_add_norm_decoder = AddNorm(self.hparams.hidden_size, trainable_add=True) self.post_lstm_add_norm_decoder = self.post_lstm_add_norm_encoder # static enrichment and processing past LSTM self.static_enrichment = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, context_size=self.hparams.hidden_size, ) # attention for long-range processing self.multihead_attn = InterpretableMultiHeadAttention( d_model=self.hparams.hidden_size, n_head=self.hparams.attention_head_size, dropout=self.hparams.dropout) self.post_attn_gate_norm = GateAddNorm(self.hparams.hidden_size, dropout=self.hparams.dropout, trainable_add=False) self.pos_wise_ff = GatedResidualNetwork(self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, dropout=self.hparams.dropout) # output processing -> no dropout at this late stage self.pre_output_gate_norm = GateAddNorm(self.hparams.hidden_size, dropout=None, trainable_add=False) if self.n_targets > 1: # if to run with multiple targets self.output_layer = nn.ModuleList([ nn.Linear(self.hparams.hidden_size, output_size) for output_size in self.hparams.output_size ]) else: self.output_layer = nn.Linear(self.hparams.hidden_size, self.hparams.output_size)
class TemporalFusionTransformer(BaseModelWithCovariates): def __init__( self, hidden_size: int = 16, lstm_layers: int = 1, dropout: float = 0.1, output_size: Union[int, List[int]] = 7, loss: MultiHorizonMetric = None, attention_head_size: int = 4, max_encoder_length: int = 10, static_categoricals: List[str] = [], static_reals: List[str] = [], time_varying_categoricals_encoder: List[str] = [], time_varying_categoricals_decoder: List[str] = [], categorical_groups: Dict[str, List[str]] = {}, time_varying_reals_encoder: List[str] = [], time_varying_reals_decoder: List[str] = [], x_reals: List[str] = [], x_categoricals: List[str] = [], hidden_continuous_size: int = 8, hidden_continuous_sizes: Dict[str, int] = {}, embedding_sizes: Dict[str, Tuple[int, int]] = {}, embedding_paddings: List[str] = [], embedding_labels: Dict[str, np.ndarray] = {}, learning_rate: float = 1e-3, log_interval: Union[int, float] = -1, log_val_interval: Union[int, float] = None, log_gradient_flow: bool = False, reduce_on_plateau_patience: int = 1000, monotone_constaints: Dict[str, int] = {}, share_single_variable_networks: bool = False, logging_metrics: nn.ModuleList = None, **kwargs, ): """ Temporal Fusion Transformer for forecasting timeseries - use its :py:meth:`~from_dataset` method if possible. Implementation of the article `Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting <https://arxiv.org/pdf/1912.09363.pdf>`_. The network outperforms DeepAR by Amazon by 36-69% in benchmarks. Enhancements compared to the original implementation (apart from capabilities added through base model such as monotone constraints): * static variables can be continuous * multiple categorical variables can be summarized with an EmbeddingBag * variable encoder and decoder length by sample * categorical embeddings are not transformed by variable selection network (because it is a redundant operation) * variable dimension in variable selection network are scaled up via linear interpolation to reduce number of parameters * non-linear variable processing in variable selection network can be shared among decoder and encoder (not shared by default) Tune its hyperparameters with :py:func:`~pytorch_forecasting.models.temporal_fusion_transformer.tuning.optimize_hyperparameters`. Args: hidden_size: hidden size of network which is its main hyperparameter and can range from 8 to 512 lstm_layers: number of LSTM layers (2 is mostly optimal) dropout: dropout rate output_size: number of outputs (e.g. number of quantiles for QuantileLoss and one target or list of output sizes). loss: loss function taking prediction and targets attention_head_size: number of attention heads (4 is a good default) max_encoder_length: length to encode (can be far longer than the decoder length but does not have to be) static_categoricals: names of static categorical variables static_reals: names of static continuous variables time_varying_categoricals_encoder: names of categorical variables for encoder time_varying_categoricals_decoder: names of categorical variables for decoder time_varying_reals_encoder: names of continuous variables for encoder time_varying_reals_decoder: names of continuous variables for decoder categorical_groups: dictionary where values are list of categorical variables that are forming together a new categorical variable which is the key in the dictionary x_reals: order of continuous variables in tensor passed to forward function x_categoricals: order of categorical variables in tensor passed to forward function hidden_continuous_size: default for hidden size for processing continous variables (similar to categorical embedding size) hidden_continuous_sizes: dictionary mapping continuous input indices to sizes for variable selection (fallback to hidden_continuous_size if index is not in dictionary) embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and embedding size embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector embedding_labels: dictionary mapping (string) indices to list of categorical labels learning_rate: learning rate log_interval: log predictions every x batches, do not log if 0 or less, log interpretation if > 0. If < 1.0 , will log multiple entries per batch. Defaults to -1. log_val_interval: frequency with which to log validation set metrics, defaults to log_interval log_gradient_flow: if to log gradient flow, this takes time and should be only done to diagnose training failures reduce_on_plateau_patience (int): patience after which learning rate is reduced by a factor of 10 monotone_constaints (Dict[str, int]): dictionary of monotonicity constraints for continuous decoder variables mapping position (e.g. ``"0"`` for first position) to constraint (``-1`` for negative and ``+1`` for positive, larger numbers add more weight to the constraint vs. the loss but are usually not necessary). This constraint significantly slows down training. Defaults to {}. share_single_variable_networks (bool): if to share the single variable networks between the encoder and decoder. Defaults to False. logging_metrics (nn.ModuleList[LightningMetric]): list of metrics that are logged during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]). **kwargs: additional arguments to :py:class:`~BaseModel`. """ if logging_metrics is None: logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]) if loss is None: loss = QuantileLoss() self.save_hyperparameters() # store loss function separately as it is a module assert isinstance( loss, LightningMetric), "Loss has to be a PyTorch Lightning `Metric`" super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) # processing inputs # embeddings self.input_embeddings = MultiEmbedding( embedding_sizes=self.hparams.embedding_sizes, categorical_groups=self.hparams.categorical_groups, embedding_paddings=self.hparams.embedding_paddings, x_categoricals=self.hparams.x_categoricals, max_embedding_size=self.hparams.hidden_size, ) # continuous variable processing self.prescalers = nn.ModuleDict({ name: nn.Linear( 1, self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size)) for name in self.reals }) # variable selection # variable selection for static variables static_input_sizes = { name: self.hparams.embedding_sizes[name][1] for name in self.hparams.static_categoricals } static_input_sizes.update({ name: self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size) for name in self.hparams.static_reals }) self.static_variable_selection = VariableSelectionNetwork( input_sizes=static_input_sizes, hidden_size=self.hparams.hidden_size, input_embedding_flags={ name: True for name in self.hparams.static_categoricals }, dropout=self.hparams.dropout, prescalers=self.prescalers, ) # variable selection for encoder and decoder encoder_input_sizes = { name: self.hparams.embedding_sizes[name][1] for name in self.hparams.time_varying_categoricals_encoder } encoder_input_sizes.update({ name: self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size) for name in self.hparams.time_varying_reals_encoder }) decoder_input_sizes = { name: self.hparams.embedding_sizes[name][1] for name in self.hparams.time_varying_categoricals_decoder } decoder_input_sizes.update({ name: self.hparams.hidden_continuous_sizes.get( name, self.hparams.hidden_continuous_size) for name in self.hparams.time_varying_reals_decoder }) # create single variable grns that are shared across decoder and encoder if self.hparams.share_single_variable_networks: self.shared_single_variable_grns = nn.ModuleDict() for name, input_size in encoder_input_sizes.items(): self.shared_single_variable_grns[name] = GatedResidualNetwork( input_size, min(input_size, self.hparams.hidden_size), self.hparams.hidden_size, self.hparams.dropout, ) for name, input_size in decoder_input_sizes.items(): if name not in self.shared_single_variable_grns: self.shared_single_variable_grns[ name] = GatedResidualNetwork( input_size, min(input_size, self.hparams.hidden_size), self.hparams.hidden_size, self.hparams.dropout, ) self.encoder_variable_selection = VariableSelectionNetwork( input_sizes=encoder_input_sizes, hidden_size=self.hparams.hidden_size, input_embedding_flags={ name: True for name in self.hparams.time_varying_categoricals_encoder }, dropout=self.hparams.dropout, context_size=self.hparams.hidden_size, prescalers=self.prescalers, single_variable_grns={} if not self.hparams.share_single_variable_networks else self.shared_single_variable_grns, ) self.decoder_variable_selection = VariableSelectionNetwork( input_sizes=decoder_input_sizes, hidden_size=self.hparams.hidden_size, input_embedding_flags={ name: True for name in self.hparams.time_varying_categoricals_decoder }, dropout=self.hparams.dropout, context_size=self.hparams.hidden_size, prescalers=self.prescalers, single_variable_grns={} if not self.hparams.share_single_variable_networks else self.shared_single_variable_grns, ) # static encoders # for variable selection self.static_context_variable_selection = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, ) # for hidden state of the lstm self.static_context_initial_hidden_lstm = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, ) # for cell state of the lstm self.static_context_initial_cell_lstm = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, ) # for post lstm static enrichment self.static_context_enrichment = GatedResidualNetwork( self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.dropout) # lstm encoder (history) and decoder (future) for local processing self.lstm_encoder = LSTM( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.lstm_layers, dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0, batch_first=True, ) self.lstm_decoder = LSTM( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.lstm_layers, dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0, batch_first=True, ) # skip connection for lstm self.post_lstm_gate_encoder = GatedLinearUnit( self.hparams.hidden_size, dropout=self.hparams.dropout) self.post_lstm_gate_decoder = self.post_lstm_gate_encoder # self.post_lstm_gate_decoder = GatedLinearUnit(self.hparams.hidden_size, dropout=self.hparams.dropout) self.post_lstm_add_norm_encoder = AddNorm(self.hparams.hidden_size, trainable_add=False) # self.post_lstm_add_norm_decoder = AddNorm(self.hparams.hidden_size, trainable_add=True) self.post_lstm_add_norm_decoder = self.post_lstm_add_norm_encoder # static enrichment and processing past LSTM self.static_enrichment = GatedResidualNetwork( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, context_size=self.hparams.hidden_size, ) # attention for long-range processing self.multihead_attn = InterpretableMultiHeadAttention( d_model=self.hparams.hidden_size, n_head=self.hparams.attention_head_size, dropout=self.hparams.dropout) self.post_attn_gate_norm = GateAddNorm(self.hparams.hidden_size, dropout=self.hparams.dropout, trainable_add=False) self.pos_wise_ff = GatedResidualNetwork(self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, dropout=self.hparams.dropout) # output processing -> no dropout at this late stage self.pre_output_gate_norm = GateAddNorm(self.hparams.hidden_size, dropout=None, trainable_add=False) if self.n_targets > 1: # if to run with multiple targets self.output_layer = nn.ModuleList([ nn.Linear(self.hparams.hidden_size, output_size) for output_size in self.hparams.output_size ]) else: self.output_layer = nn.Linear(self.hparams.hidden_size, self.hparams.output_size) @classmethod def from_dataset( cls, dataset: TimeSeriesDataSet, allowed_encoder_known_variable_names: List[str] = None, **kwargs, ): """ Create model from dataset. Args: dataset: timeseries dataset allowed_encoder_known_variable_names: List of known variables that are allowed in encoder, defaults to all **kwargs: additional arguments such as hyperparameters for model (see ``__init__()``) Returns: TemporalFusionTransformer """ # add maximum encoder length new_kwargs = dict(max_encoder_length=dataset.max_encoder_length) new_kwargs.update( cls.deduce_default_output_parameters(dataset, kwargs, QuantileLoss())) # update defaults new_kwargs.update(kwargs) # create class and return return super().from_dataset(dataset, allowed_encoder_known_variable_names= allowed_encoder_known_variable_names, **new_kwargs) def expand_static_context(self, context, timesteps): """ add time dimension to static context """ return context[:, None].expand(-1, timesteps, -1) def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: int): """ Returns causal mask to apply for self-attention layer. Args: self_attn_inputs: Inputs to self attention layer to determine mask shape """ # indices to which is attended attend_step = torch.arange(decoder_length, device=self.device) # indices for which is predicted predict_step = torch.arange(0, decoder_length, device=self.device)[:, None] # do not attend to steps to self or after prediction # todo: there is potential value in attending to future forecasts if they are made with knowledge currently # available # one possibility is here to use a second attention layer for future attention (assuming different effects # matter in the future than the past) # or alternatively using the same layer but allowing forward attention - i.e. only masking out non-available # data and self decoder_mask = attend_step >= predict_step # do not attend to steps where data is padded encoder_mask = create_mask(encoder_lengths.max(), encoder_lengths) # combine masks along attended time - first encoder and then decoder mask = torch.cat( ( encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1), decoder_mask.unsqueeze(0).expand(encoder_lengths.size(0), -1, -1), ), dim=2, ) return mask def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ input dimensions: n_samples x time x variables """ encoder_lengths = x["encoder_lengths"] decoder_lengths = x["decoder_lengths"] x_cat = torch.cat([x["encoder_cat"], x["decoder_cat"]], dim=1) # concatenate in time dimension x_cont = torch.cat([x["encoder_cont"], x["decoder_cont"]], dim=1) # concatenate in time dimension timesteps = x_cont.size(1) # encode + decode length max_encoder_length = int(encoder_lengths.max()) input_vectors = self.input_embeddings(x_cat) input_vectors.update({ name: x_cont[..., idx].unsqueeze(-1) for idx, name in enumerate(self.hparams.x_reals) if name in self.reals }) # Embedding and variable selection if len(self.static_variables) > 0: # static embeddings will be constant over entire batch static_embedding = { name: input_vectors[name][:, 0] for name in self.static_variables } static_embedding, static_variable_selection = self.static_variable_selection( static_embedding) else: static_embedding = torch.zeros( (x_cont.size(0), self.hparams.hidden_size), dtype=self.dtype, device=self.device) static_variable_selection = torch.zeros((x_cont.size(0), 0), dtype=self.dtype, device=self.device) static_context_variable_selection = self.expand_static_context( self.static_context_variable_selection(static_embedding), timesteps) embeddings_varying_encoder = { name: input_vectors[name][:, :max_encoder_length] for name in self.encoder_variables } embeddings_varying_encoder, encoder_sparse_weights = self.encoder_variable_selection( embeddings_varying_encoder, static_context_variable_selection[:, :max_encoder_length], ) embeddings_varying_decoder = { name: input_vectors[name][:, max_encoder_length:] for name in self.decoder_variables # select decoder } embeddings_varying_decoder, decoder_sparse_weights = self.decoder_variable_selection( embeddings_varying_decoder, static_context_variable_selection[:, max_encoder_length:], ) # LSTM # calculate initial state input_hidden = self.static_context_initial_hidden_lstm( static_embedding).expand(self.hparams.lstm_layers, -1, -1) input_cell = self.static_context_initial_cell_lstm( static_embedding).expand(self.hparams.lstm_layers, -1, -1) # run local encoder encoder_output, (hidden, cell) = self.lstm_encoder(embeddings_varying_encoder, (input_hidden, input_cell), lengths=encoder_lengths, enforce_sorted=False) # run local decoder decoder_output, _ = self.lstm_decoder( embeddings_varying_decoder, (hidden, cell), lengths=decoder_lengths, enforce_sorted=False, ) # skip connection over lstm lstm_output_encoder = self.post_lstm_gate_encoder(encoder_output) lstm_output_encoder = self.post_lstm_add_norm_encoder( lstm_output_encoder, embeddings_varying_encoder) lstm_output_decoder = self.post_lstm_gate_decoder(decoder_output) lstm_output_decoder = self.post_lstm_add_norm_decoder( lstm_output_decoder, embeddings_varying_decoder) lstm_output = torch.cat([lstm_output_encoder, lstm_output_decoder], dim=1) # static enrichment static_context_enrichment = self.static_context_enrichment( static_embedding) attn_input = self.static_enrichment( lstm_output, self.expand_static_context(static_context_enrichment, timesteps)) # Attention attn_output, attn_output_weights = self.multihead_attn( q=attn_input[:, max_encoder_length:], # query only for predictions k=attn_input, v=attn_input, mask=self.get_attention_mask(encoder_lengths=encoder_lengths, decoder_length=timesteps - max_encoder_length), ) # skip connection over attention attn_output = self.post_attn_gate_norm( attn_output, attn_input[:, max_encoder_length:]) output = self.pos_wise_ff(attn_output) # skip connection over temporal fusion decoder (not LSTM decoder despite the LSTM output contains # a skip from the variable selection network) output = self.pre_output_gate_norm(output, lstm_output[:, max_encoder_length:]) if self.n_targets > 1: # if to use multi-target architecture output = [ output_layer(output) for output_layer in self.output_layer ] else: output = self.output_layer(output) return dict( prediction=output, attention=attn_output_weights, static_variables=static_variable_selection, encoder_variables=encoder_sparse_weights, decoder_variables=decoder_sparse_weights, decoder_lengths=decoder_lengths, encoder_lengths=encoder_lengths, groups=x["groups"], decoder_time_idx=x["decoder_time_idx"], target_scale=x["target_scale"], ) def on_fit_end(self): if self.log_interval > 0: self.log_embeddings() def step(self, x, y, batch_idx): """ run at each step for training or validation """ # extract data and run model log, out = super().step(x, y, batch_idx) # calculate interpretations etc for latter logging if self.log_interval > 0: def detach(v): if isinstance(v, torch.Tensor): return v.detach() elif isinstance(v, (list, tuple)) and len(v) > 0 and isinstance( v[0], torch.Tensor): return [vp.detach() for vp in v] else: return v detached_output = { name: detach(out_part) for name, out_part in out.items() } interpretation = self.interpret_output( detached_output, reduction="sum", attention_prediction_horizon= 0, # attention only for first prediction horizon ) log["interpretation"] = interpretation return log, out def epoch_end(self, outputs): """ run at epoch end for training or validation """ if self.log_interval > 0: self.log_interpretation(outputs) def interpret_output( self, out: Dict[str, torch.Tensor], reduction: str = "none", attention_prediction_horizon: int = 0, attention_as_autocorrelation: bool = False, ) -> Dict[str, torch.Tensor]: """ interpret output of model Args: out: output as produced by ``forward()`` reduction: "none" for no averaging over batches, "sum" for summing attentions, "mean" for normalizing by encode lengths attention_prediction_horizon: which prediction horizon to use for attention attention_as_autocorrelation: if to record attention as autocorrelation - this should be set to true in case of ``reduction != "none"`` and differing prediction times of the samples. Defaults to False Returns: interpretations that can be plotted with ``plot_interpretation()`` """ # histogram of decode and encode lengths encoder_length_histogram = integer_histogram( out["encoder_lengths"], min=0, max=self.hparams.max_encoder_length) decoder_length_histogram = integer_histogram( out["decoder_lengths"], min=1, max=out["decoder_variables"].size(1)) # mask where decoder and encoder where not applied when averaging variable selection weights encoder_variables = out["encoder_variables"].squeeze(-2) encode_mask = create_mask(encoder_variables.size(1), out["encoder_lengths"]) encoder_variables = encoder_variables.masked_fill( encode_mask.unsqueeze(-1), 0.0).sum(dim=1) encoder_variables /= (out["encoder_lengths"].where( out["encoder_lengths"] > 0, torch.ones_like(out["encoder_lengths"])).unsqueeze(-1)) decoder_variables = out["decoder_variables"].squeeze(-2) decode_mask = create_mask(decoder_variables.size(1), out["decoder_lengths"]) decoder_variables = decoder_variables.masked_fill( decode_mask.unsqueeze(-1), 0.0).sum(dim=1) decoder_variables /= out["decoder_lengths"].unsqueeze(-1) # static variables need no masking static_variables = out["static_variables"].squeeze(1) # attention is batch x time x heads x time_to_attend # average over heads + only keep prediction attention and attention on observed timesteps attention = out["attention"][:, attention_prediction_horizon, :, : out["encoder_lengths"].max() + attention_prediction_horizon].mean(1) if reduction != "none": # if to average over batches static_variables = static_variables.sum(dim=0) encoder_variables = encoder_variables.sum(dim=0) decoder_variables = decoder_variables.sum(dim=0) # reorder attention or averaging for i in range( len(attention)): # very inefficient but does the trick if 0 < out["encoder_lengths"][i] < attention.size( 1) - attention_prediction_horizon - 1: relevant_attention = attention[ i, :out["encoder_lengths"][i] + attention_prediction_horizon].clone() if attention_as_autocorrelation: relevant_attention = autocorrelation( relevant_attention) attention[ i, -out["encoder_lengths"][i] - attention_prediction_horizon:] = relevant_attention attention[i, :attention.size(1) - out["encoder_lengths"][i] - attention_prediction_horizon] = 0.0 elif attention_as_autocorrelation: attention[i] = autocorrelation(attention[i]) attention = attention.sum(dim=0) if reduction == "mean": attention = attention / encoder_length_histogram[1:].flip( 0).cumsum(0).clamp(1) attention = attention / attention.sum(-1).unsqueeze( -1) # renormalize elif reduction == "sum": pass else: raise ValueError(f"Unknown reduction {reduction}") attention = torch.zeros( self.hparams.max_encoder_length + attention_prediction_horizon, device=self.device).scatter( dim=0, index=torch.arange( self.hparams.max_encoder_length + attention_prediction_horizon - attention.size(-1), self.hparams.max_encoder_length + attention_prediction_horizon, device=self.device, ), src=attention, ) else: attention = attention / attention.sum(-1).unsqueeze( -1) # renormalize interpretation = dict( attention=attention, static_variables=static_variables, encoder_variables=encoder_variables, decoder_variables=decoder_variables, encoder_length_histogram=encoder_length_histogram, decoder_length_histogram=decoder_length_histogram, ) return interpretation def plot_prediction( self, x: Dict[str, torch.Tensor], out: Dict[str, torch.Tensor], idx: int, plot_attention: bool = True, add_loss_to_title: bool = False, show_future_observed: bool = True, ax=None, ) -> plt.Figure: """ Plot actuals vs prediction and attention Args: x (Dict[str, torch.Tensor]): network input out (Dict[str, torch.Tensor]): network output idx (int): sample index plot_attention: if to plot attention on secondary axis add_loss_to_title: if to add loss to title. Default to False. show_future_observed: if to show actuals for future. Defaults to True. ax: matplotlib axes to plot on Returns: plt.Figure: matplotlib figure """ # plot prediction as normal fig = super().plot_prediction( x, out, idx=idx, add_loss_to_title=add_loss_to_title, show_future_observed=show_future_observed, ax=ax) # add attention on secondary axis if plot_attention: interpretation = self.interpret_output(out) for f in to_list(fig): ax = f.axes[0] ax2 = ax.twinx() ax2.set_ylabel("Attention") encoder_length = x["encoder_lengths"][idx] ax2.plot( torch.arange(-encoder_length, 0), interpretation["attention"][ idx, :encoder_length].detach().cpu(), alpha=0.2, color="k", ) f.tight_layout() return fig def plot_interpretation( self, interpretation: Dict[str, torch.Tensor]) -> Dict[str, plt.Figure]: """ Make figures that interpret model. * Attention * Variable selection weights / importances Args: interpretation: as obtained from ``interpret_output()`` Returns: dictionary of matplotlib figures """ figs = {} # attention fig, ax = plt.subplots() attention = interpretation["attention"].detach().cpu() attention = attention / attention.sum(-1).unsqueeze(-1) ax.plot( np.arange(-self.hparams.max_encoder_length, attention.size(0) - self.hparams.max_encoder_length), attention) ax.set_xlabel("Time index") ax.set_ylabel("Attention") ax.set_title("Attention") figs["attention"] = fig # variable selection def make_selection_plot(title, values, labels): fig, ax = plt.subplots(figsize=(7, len(values) * 0.25 + 2)) order = np.argsort(values) values = values / values.sum(-1).unsqueeze(-1) ax.barh(np.arange(len(values)), values[order] * 100, tick_label=np.asarray(labels)[order]) ax.set_title(title) ax.set_xlabel("Importance in %") plt.tight_layout() return fig figs["static_variables"] = make_selection_plot( "Static variables importance", interpretation["static_variables"].detach().cpu(), self.static_variables) figs["encoder_variables"] = make_selection_plot( "Encoder variables importance", interpretation["encoder_variables"].detach().cpu(), self.encoder_variables) figs["decoder_variables"] = make_selection_plot( "Decoder variables importance", interpretation["decoder_variables"].detach().cpu(), self.decoder_variables) return figs def log_interpretation(self, outputs): """ Log interpretation metrics to tensorboard. """ # extract interpretations interpretation = { # use padded_stack because decoder length histogram can be of different length name: padded_stack([x["interpretation"][name].detach() for x in outputs], side="right", value=0).sum(0) for name in outputs[0]["interpretation"].keys() } # normalize attention with length histogram squared to account for: 1. zeros in attention and # 2. higher attention due to less values attention_occurances = interpretation["encoder_length_histogram"][ 1:].flip(0).cumsum(0).float() attention_occurances = attention_occurances / attention_occurances.max( ) attention_occurances = torch.cat( [ attention_occurances, torch.ones( interpretation["attention"].size(0) - attention_occurances.size(0), dtype=attention_occurances.dtype, device=attention_occurances.device, ), ], dim=0, ) interpretation["attention"] = interpretation[ "attention"] / attention_occurances.pow(2).clamp(1.0) interpretation["attention"] = interpretation[ "attention"] / interpretation["attention"].sum() figs = self.plot_interpretation( interpretation) # make interpretation figures label = ["val", "train"][self.training] # log to tensorboard for name, fig in figs.items(): self.logger.experiment.add_figure( f"{label.capitalize()} {name} importance", fig, global_step=self.global_step) # log lengths of encoder/decoder for type in ["encoder", "decoder"]: fig, ax = plt.subplots() lengths = (padded_stack([ out["interpretation"][f"{type}_length_histogram"] for out in outputs ]).sum(0).detach().cpu()) if type == "decoder": start = 1 else: start = 0 ax.plot(torch.arange(start, start + len(lengths)), lengths) ax.set_xlabel(f"{type.capitalize()} length") ax.set_ylabel("Number of samples") ax.set_title( f"{type.capitalize()} length distribution in {label} epoch") self.logger.experiment.add_figure( f"{label.capitalize()} {type} length distribution", fig, global_step=self.global_step) def log_embeddings(self): """ Log embeddings to tensorboard """ for name, emb in self.input_embeddings.items(): labels = self.hparams.embedding_labels[name] self.logger.experiment.add_embedding( emb.weight.data.detach().cpu(), metadata=labels, tag=name, global_step=self.global_step)
def __init__( self, cell_type: str = "LSTM", hidden_size: int = 10, rnn_layers: int = 2, dropout: float = 0.1, static_categoricals: List[str] = [], static_reals: List[str] = [], time_varying_categoricals_encoder: List[str] = [], time_varying_categoricals_decoder: List[str] = [], categorical_groups: Dict[str, List[str]] = {}, time_varying_reals_encoder: List[str] = [], time_varying_reals_decoder: List[str] = [], embedding_sizes: Dict[str, Tuple[int, int]] = {}, embedding_paddings: List[str] = [], embedding_labels: Dict[str, np.ndarray] = {}, x_reals: List[str] = [], x_categoricals: List[str] = [], n_validation_samples: int = None, n_plotting_samples: int = None, target: Union[str, List[str]] = None, loss: MultiHorizonMetric = None, logging_metrics: nn.ModuleList = None, **kwargs, ): """ Args: cell_type (str, optional): Recurrent cell type ["LSTM", "GRU"]. Defaults to "LSTM". hidden_size (int, optional): hidden recurrent size - the most important hyperparameter along with ``rnn_layers``. Defaults to 10. rnn_layers (int, optional): Number of RNN layers - important hyperparameter. Defaults to 2. dropout (float, optional): Dropout in RNN layers. Defaults to 0.1. static_categoricals: integer of positions of static categorical variables static_reals: integer of positions of static continuous variables time_varying_categoricals_encoder: integer of positions of categorical variables for encoder time_varying_categoricals_decoder: integer of positions of categorical variables for decoder time_varying_reals_encoder: integer of positions of continuous variables for encoder time_varying_reals_decoder: integer of positions of continuous variables for decoder categorical_groups: dictionary where values are list of categorical variables that are forming together a new categorical variable which is the key in the dictionary x_reals: order of continuous variables in tensor passed to forward function x_categoricals: order of categorical variables in tensor passed to forward function embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and embedding size embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector embedding_labels: dictionary mapping (string) indices to list of categorical labels n_validation_samples (int, optional): Number of samples to use for calculating validation metrics. Defaults to None, i.e. no sampling at validation stage and using "mean" of distribution for logging metrics calculation. n_plotting_samples (int, optional): Number of samples to generate for plotting predictions during training. Defaults to ``n_validation_samples`` if not None or 100 otherwise. target (str, optional): Target variable or list of target variables. Defaults to None. loss (DistributionLoss, optional): Distribution loss function. Keep in mind that each distribution loss function might have specific requirements for target normalization. Defaults to :py:class:`~pytorch_forecasting.metrics.NormalDistributionLoss`. logging_metrics (nn.ModuleList, optional): Metrics to log during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]). """ # saves arguments in signature to `.hparams` attribute, mandatory call - do not skip this self.save_hyperparameters() # pass additional arguments to BaseModel.__init__, mandatory call - do not skip this super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) assert set(self.encoder_variables) - set(to_list(target)) == set( self.decoder_variables ), "Encoder and decoder variables have to be the same apart from target variable" for targeti in to_list(target): assert ( targeti in time_varying_reals_encoder ), f"target {targeti} has to be real" # todo: remove this restriction self.embeddings = MultiEmbedding( embedding_sizes=embedding_sizes, embedding_paddings=embedding_paddings, categorical_groups=categorical_groups, x_categoricals=x_categoricals, ) time_series_rnn = get_rnn(cell_type) cont_size = len(self.reals) cat_size = sum([size[1] for size in self.hparams.embedding_sizes.values()]) input_size = cont_size + cat_size self.rnn = time_series_rnn( input_size=input_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.rnn_layers, dropout=self.hparams.dropout if self.hparams.rnn_layers > 1 else 0, batch_first=True, ) # add linear layers for argument projects if isinstance(target, str): # single target self.output_projector = nn.Linear(self.hparams.hidden_size, 1) else: # multi target self.output_projector = nn.ModuleList([nn.Linear(self.hparams.hidden_size, 1) for _ in target])
def __init__( self, cell_type: str = "LSTM", hidden_size: int = 10, rnn_layers: int = 2, dropout: float = 0.1, static_categoricals: List[str] = [], static_reals: List[str] = [], time_varying_categoricals_encoder: List[str] = [], time_varying_categoricals_decoder: List[str] = [], categorical_groups: Dict[str, List[str]] = {}, time_varying_reals_encoder: List[str] = [], time_varying_reals_decoder: List[str] = [], embedding_sizes: Dict[str, Tuple[int, int]] = {}, embedding_paddings: List[str] = [], embedding_labels: Dict[str, np.ndarray] = {}, x_reals: List[str] = [], x_categoricals: List[str] = [], n_validation_samples: int = None, n_plotting_samples: int = None, target: Union[str, List[str]] = None, target_lags: Dict[str, List[int]] = {}, loss: DistributionLoss = None, logging_metrics: nn.ModuleList = None, **kwargs, ): """ DeepAR Network. The code is based on the article `DeepAR: Probabilistic forecasting with autoregressive recurrent networks <https://www.sciencedirect.com/science/article/pii/S0169207019301888>`_. Args: cell_type (str, optional): Recurrent cell type ["LSTM", "GRU"]. Defaults to "LSTM". hidden_size (int, optional): hidden recurrent size - the most important hyperparameter along with ``rnn_layers``. Defaults to 10. rnn_layers (int, optional): Number of RNN layers - important hyperparameter. Defaults to 2. dropout (float, optional): Dropout in RNN layers. Defaults to 0.1. static_categoricals: integer of positions of static categorical variables static_reals: integer of positions of static continuous variables time_varying_categoricals_encoder: integer of positions of categorical variables for encoder time_varying_categoricals_decoder: integer of positions of categorical variables for decoder time_varying_reals_encoder: integer of positions of continuous variables for encoder time_varying_reals_decoder: integer of positions of continuous variables for decoder categorical_groups: dictionary where values are list of categorical variables that are forming together a new categorical variable which is the key in the dictionary x_reals: order of continuous variables in tensor passed to forward function x_categoricals: order of categorical variables in tensor passed to forward function embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and embedding size embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector embedding_labels: dictionary mapping (string) indices to list of categorical labels n_validation_samples (int, optional): Number of samples to use for calculating validation metrics. Defaults to None, i.e. no sampling at validation stage and using "mean" of distribution for logging metrics calculation. n_plotting_samples (int, optional): Number of samples to generate for plotting predictions during training. Defaults to ``n_validation_samples`` if not None or 100 otherwise. target (str, optional): Target variable or list of target variables. Defaults to None. target_lags (Dict[str, Dict[str, int]]): dictionary of target names mapped to list of time steps by which the variable should be lagged. Lags can be useful to indicate seasonality to the models. If you know the seasonalit(ies) of your data, add at least the target variables with the corresponding lags to improve performance. Defaults to no lags, i.e. an empty dictionary. loss (DistributionLoss, optional): Distribution loss function. Keep in mind that each distribution loss function might have specific requirements for target normalization. Defaults to :py:class:`~pytorch_forecasting.metrics.NormalDistributionLoss`. logging_metrics (nn.ModuleList, optional): Metrics to log during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]). """ if loss is None: loss = NormalDistributionLoss() if logging_metrics is None: logging_metrics = nn.ModuleList( [SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) if n_plotting_samples is None: if n_validation_samples is None: n_plotting_samples = n_validation_samples else: n_plotting_samples = 100 self.save_hyperparameters() # store loss function separately as it is a module super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs) self.embeddings = MultiEmbedding( embedding_sizes=embedding_sizes, embedding_paddings=embedding_paddings, categorical_groups=categorical_groups, x_categoricals=x_categoricals, ) lagged_target_names = [ l for lags in target_lags.values() for l in lags ] assert set(self.encoder_variables) - set( to_list(target) ) - set(lagged_target_names) == set(self.decoder_variables) - set( lagged_target_names ), "Encoder and decoder variables have to be the same apart from target variable" for targeti in to_list(target): assert ( targeti in time_varying_reals_encoder ), f"target {targeti} has to be real" # todo: remove this restriction assert ( isinstance(target, str) and isinstance(loss, DistributionLoss) ) or ( isinstance(target, (list, tuple)) and isinstance(loss, MultiLoss) and len(loss) == len(target) ), "number of targets should be equivalent to number of loss metrics" rnn_class = get_rnn(cell_type) cont_size = len(self.reals) cat_size = sum( [size[1] for size in self.hparams.embedding_sizes.values()]) input_size = cont_size + cat_size self.rnn = rnn_class( input_size=input_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.rnn_layers, dropout=self.hparams.dropout if self.hparams.rnn_layers > 1 else 0, batch_first=True, ) # add linear layers for argument projects if isinstance(target, str): # single target self.distribution_projector = nn.Linear( self.hparams.hidden_size, len(self.loss.distribution_arguments)) else: # multi target self.distribution_projector = nn.ModuleList([ nn.Linear(self.hparams.hidden_size, len(args)) for args in self.loss.distribution_arguments ])