Example #1
def test_integration(
    dataloaders_with_covariates,
    dataloaders_fixed_window_without_covariates,
    dataloaders_multi_target,
    tmp_path,
    gpus,
    dataloader,
):
    kwargs = {}
    if dataloader == "with_covariates":
        dataloader = dataloaders_with_covariates
        kwargs["backcast_loss_ratio"] = 0.5
    elif dataloader == "fixed_window_without_covariates":
        dataloader = dataloaders_fixed_window_without_covariates
    elif dataloader == "multi_target":
        dataloader = dataloaders_multi_target
        kwargs["loss"] = QuantileLoss()
    elif dataloader == "quantiles":
        dataloader = dataloaders_with_covariates
        kwargs["loss"] = QuantileLoss()
    elif dataloader == "implicit-quantiles":
        dataloader = dataloaders_with_covariates
        kwargs["loss"] = ImplicitQuantileNetworkDistributionLoss()
    elif dataloader == "multivariate-quantiles":
        dataloader = dataloaders_with_covariates
        kwargs["loss"] = MQF2DistributionLoss(
            prediction_length=dataloader["train"].dataset.max_prediction_length
        )
    else:
        raise ValueError(f"dataloader {dataloader} unknown")
    _integration(dataloader, tmp_path=tmp_path, gpus=gpus, **kwargs)
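A minimal sketch (an assumption; the decorator is cut off in the snippet) of the pytest parametrization that feeds the string-valued `dataloader` argument above, with ids matching the branches handled in the test:

import pytest

@pytest.mark.parametrize(
    "dataloader",
    [
        "with_covariates",
        "fixed_window_without_covariates",
        "multi_target",
        "quantiles",
        "implicit-quantiles",
        "multivariate-quantiles",
    ],
)
def test_integration_sketch(dataloader):
    pass  # the real test above additionally receives the fixture dataloaders, tmp_path and gpus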
Example #2
    @classmethod
    def from_dataset(
        cls,
        dataset: TimeSeriesDataSet,
        allowed_encoder_known_variable_names: List[str] = None,
        **kwargs,
    ):
        """
        Create model from dataset.

        Args:
            dataset: timeseries dataset
            allowed_encoder_known_variable_names: List of known variables that are allowed in encoder, defaults to all
            **kwargs: additional arguments such as hyperparameters for model (see ``__init__()``)

        Returns:
            TemporalFusionTransformer
        """
        # add maximum encoder length
        new_kwargs = dict(max_encoder_length=dataset.max_encoder_length)
        new_kwargs.update(
            cls.deduce_default_output_parameters(dataset, kwargs, QuantileLoss())
        )

        # update defaults
        new_kwargs.update(kwargs)

        # create class and return
        return super().from_dataset(
            dataset,
            allowed_encoder_known_variable_names=allowed_encoder_known_variable_names,
            **new_kwargs,
        )
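For context, a typical call to this factory method; a sketch in which `training` stands in for a TimeSeriesDataSet built elsewhere (as in Examples #11 and #15):

model = TemporalFusionTransformer.from_dataset(
    training,  # a TimeSeriesDataSet
    hidden_size=16,
    loss=QuantileLoss(),  # output_size is then deduced from the loss's quantiles
)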
Example #3
    @classmethod
    def from_dataset(
        cls,
        dataset: TimeSeriesDataSet,
        allowed_encoder_known_variable_names: List[str] = None,
        **kwargs,
    ):
        """
        Create model from dataset.

        Args:
            dataset: timeseries dataset
            allowed_encoder_known_variable_names: List of known variables that are allowed in encoder, defaults to all
            **kwargs: additional arguments such as hyperparameters for model (see ``__init__()``)

        Returns:
            TemporalFusionTransformer
        """
        # add maximum encoder length
        new_kwargs = dict(max_encoder_length=dataset.max_encoder_length)

        # infer output size
        def get_output_size(normalizer, loss):
            if isinstance(loss, QuantileLoss):
                return len(loss.quantiles)
            elif isinstance(normalizer, NaNLabelEncoder):
                return len(normalizer.classes_)
            else:
                return 1

        loss = kwargs.get("loss", QuantileLoss())
        # handle multiple targets
        new_kwargs["n_targets"] = len(dataset.target_names)
        if new_kwargs["n_targets"] > 1:  # try to infer number of ouput sizes
            if not isinstance(loss, MultiLoss):
                loss = MultiLoss([deepcopy(loss)] * new_kwargs["n_targets"])
                new_kwargs["loss"] = loss
            if isinstance(loss, MultiLoss) and "output_size" not in kwargs:
                new_kwargs["output_size"] = [
                    get_output_size(normalizer, l) for normalizer, l in zip(
                        dataset.target_normalizer.normalizers, loss.metrics)
                ]
        elif "output_size" not in kwargs:
            new_kwargs["output_size"] = get_output_size(
                dataset.target_normalizer, loss)

        # update defaults
        new_kwargs.update(kwargs)

        # create class and return
        return super().from_dataset(
            dataset,
            allowed_encoder_known_variable_names=allowed_encoder_known_variable_names,
            **new_kwargs,
        )
Example #4
def run_train(cfg, trainset, validset) -> None:
    import multiprocessing
    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
    from pytorch_lightning.loggers import TensorBoardLogger
    from pytorch_forecasting.metrics import QuantileLoss
    from pytorch_forecasting.models import TemporalFusionTransformer

    # stop training, when loss metric does not improve on validation set
    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=1e-4,
        patience=10,
        verbose=False,
        mode="min"
    )
    lr_monitor = LearningRateMonitor()  # log the learning rate
    logger = TensorBoardLogger("result/lightning_logs")  # log to tensorboard
    # create trainer
    params = cfg.get("trainer").params
    trainer = pl.Trainer(
        callbacks=[lr_monitor, early_stop_callback],
        logger=logger,
        **params,
    )

    # initialise model
    params = cfg.get("model").params
    tft = TemporalFusionTransformer.from_dataset(
        trainset,
        loss=QuantileLoss(),
        **params,
    )
    print(tft.size())   # 29.6k parameters in model

    n_cores = multiprocessing.cpu_count()
    loader_trainset = trainset.to_dataloader(
        train=True, batch_size=cfg.get("trainset").batch_size, num_workers=n_cores
    )
    loader_validset = validset.to_dataloader(
        train=False, batch_size=cfg.get("validset").batch_size, num_workers=n_cores
    )

    # fit network
    trainer.fit(
        tft,
        train_dataloader=loader_trainset,
        val_dataloaders=loader_validset,
    )

    return
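run_train assumes a `cfg` object whose .get() returns entries exposing .params and .batch_size attributes. A minimal stand-in (an assumption; the real config class is not shown in the snippet):

from types import SimpleNamespace

cfg = {
    "trainer": SimpleNamespace(params=dict(max_epochs=30, gradient_clip_val=0.1)),
    "model": SimpleNamespace(params=dict(hidden_size=16, dropout=0.1)),
    "trainset": SimpleNamespace(batch_size=64),
    "validset": SimpleNamespace(batch_size=640),
}  # a plain dict suffices because run_train only calls cfg.get(...)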
Example #5
def optimize_hyperparameters(
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
    model_path: str,
    max_epochs: int = 20,
    n_trials: int = 100,
    timeout: float = 3600 * 8.0,  # 8 hours
    gradient_clip_val_range: Tuple[float, float] = (0.01, 100.0),
    hidden_size_range: Tuple[int, int] = (16, 265),
    hidden_continuous_size_range: Tuple[int, int] = (8, 64),
    attention_head_size_range: Tuple[int, int] = (1, 4),
    dropout_range: Tuple[float, float] = (0.1, 0.3),
    learning_rate_range: Tuple[float, float] = (1e-5, 1.0),
    use_learning_rate_finder: bool = True,
    trainer_kwargs: Dict[str, Any] = {},
    log_dir: str = "lightning_logs",
    study: optuna.Study = None,
    verbose: Union[int, bool] = None,
    **kwargs,
) -> optuna.Study:
    """
    Optimize Temporal Fusion Transformer hyperparameters.

    Run hyperparameter optimization. The learning rate is determined with
    the PyTorch Lightning learning rate finder.

    Args:
        train_dataloader (DataLoader): dataloader for training model
        val_dataloader (DataLoader): dataloader for validating model
        model_path (str): folder to which model checkpoints are saved
        max_epochs (int, optional): Maximum number of epochs to run training. Defaults to 20.
        n_trials (int, optional): Number of hyperparameter trials to run. Defaults to 100.
        timeout (float, optional): Time in seconds after which training is stopped regardless of number of epochs
            or validation metric. Defaults to 3600*8.0.
        gradient_clip_val_range (Tuple[float, float], optional): Minimum and maximum of ``gradient_clip_val``
            hyperparameter. Defaults to (0.01, 100.0).
        hidden_size_range (Tuple[int, int], optional): Minimum and maximum of ``hidden_size`` hyperparameter. Defaults
            to (16, 265).
        hidden_continuous_size_range (Tuple[int, int], optional):  Minimum and maximum of ``hidden_continuous_size``
            hyperparameter. Defaults to (8, 64).
        attention_head_size_range (Tuple[int, int], optional):  Minimum and maximum of ``attention_head_size``
            hyperparameter. Defaults to (1, 4).
        dropout_range (Tuple[float, float], optional):  Minimum and maximum of ``dropout`` hyperparameter. Defaults to
            (0.1, 0.3).
        learning_rate_range (Tuple[float, float], optional): Learning rate range. Defaults to (1e-5, 1.0).
        use_learning_rate_finder (bool): Whether to use the learning rate finder or to optimize the learning rate
            as part of the hyperparameters. Defaults to True.
        trainer_kwargs (Dict[str, Any], optional): Additional arguments to the
            `PyTorch Lightning trainer <https://pytorch-lightning.readthedocs.io/en/latest/trainer.html>`_ such
            as ``limit_train_batches``. Defaults to {}.
        log_dir (str, optional): Folder into which to log results for tensorboard. Defaults to "lightning_logs".
        study (optuna.Study, optional): study to resume. Will create new study by default.
        verbose (Union[int, bool]): level of verbosity.
            * None: no change in verbosity level (equivalent to verbose=1 by optuna-set default).
            * 0 or False: log only warnings.
            * 1 or True: log pruning events.
            * 2: optuna logging level at debug level.
            Defaults to None.

        **kwargs: Additional arguments for the :py:class:`~TemporalFusionTransformer`.

    Returns:
        optuna.Study: optuna study results
    """
    assert isinstance(train_dataloader.dataset, TimeSeriesDataSet) and isinstance(
        val_dataloader.dataset, TimeSeriesDataSet
    ), "dataloaders must be built from a TimeSeriesDataSet"

    logging_level = {
        None: optuna.logging.get_verbosity(),
        0: optuna.logging.WARNING,
        1: optuna.logging.INFO,
        2: optuna.logging.DEBUG,
    }
    optuna_verbose = logging_level[verbose]
    optuna.logging.set_verbosity(optuna_verbose)

    loss = kwargs.get(
        "loss", QuantileLoss()
    )  # need a deepcopy of loss as it will otherwise propagate from one trial to the next

    # create objective function
    def objective(trial: optuna.Trial) -> float:
        # Filenames for each trial must be made unique in order to access each checkpoint.
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=os.path.join(model_path, "trial_{}".format(trial.number)),
            filename="{epoch}",
            monitor="val_loss")

        # By default PyTorch Lightning writes event files for TensorBoard. In addition to the
        # TensorBoard logger below, a simple callback saves the metrics from each validation
        # step so the final value can be reported to optuna.
        metrics_callback = MetricsCallback()
        learning_rate_callback = LearningRateMonitor()
        logger = TensorBoardLogger(log_dir, name="optuna", version=trial.number)
        gradient_clip_val = trial.suggest_loguniform("gradient_clip_val",
                                                     *gradient_clip_val_range)
        default_trainer_kwargs = dict(
            gpus=[0] if torch.cuda.is_available() else None,
            max_epochs=max_epochs,
            gradient_clip_val=gradient_clip_val,
            callbacks=[
                metrics_callback,
                learning_rate_callback,
                checkpoint_callback,
                PyTorchLightningPruningCallback(trial, monitor="val_loss"),
            ],
            logger=logger,
            progress_bar_refresh_rate=[0, 1][optuna_verbose < optuna.logging.INFO],
            weights_summary=[None, "top"][optuna_verbose < optuna.logging.INFO],
        )
        default_trainer_kwargs.update(trainer_kwargs)
        trainer = pl.Trainer(**default_trainer_kwargs)

        # create model
        hidden_size = trial.suggest_int("hidden_size",
                                        *hidden_size_range,
                                        log=True)
        kwargs["loss"] = copy.deepcopy(loss)
        model = TemporalFusionTransformer.from_dataset(
            train_dataloader.dataset,
            dropout=trial.suggest_uniform("dropout", *dropout_range),
            hidden_size=hidden_size,
            hidden_continuous_size=trial.suggest_int(
                "hidden_continuous_size",
                hidden_continuous_size_range[0],
                min(hidden_continuous_size_range[1], hidden_size),
                log=True,
            ),
            attention_head_size=trial.suggest_int("attention_head_size",
                                                  *attention_head_size_range),
            log_interval=-1,
            **kwargs,
        )
        # find good learning rate
        if use_learning_rate_finder:
            lr_trainer = pl.Trainer(
                gradient_clip_val=gradient_clip_val,
                gpus=[0] if torch.cuda.is_available() else None,
                logger=False,
                progress_bar_refresh_rate=0,
                weights_summary=None,
            )
            res = lr_trainer.tuner.lr_find(
                model,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloader,
                early_stop_threshold=10000,
                min_lr=learning_rate_range[0],
                num_training=100,
                max_lr=learning_rate_range[1],
            )

            loss_finite = np.isfinite(res.results["loss"])
            if loss_finite.sum() > 3:  # at least 3 valid values required for learning rate finder
                lr_smoothed, loss_smoothed = sm.nonparametric.lowess(
                    np.asarray(res.results["loss"])[loss_finite],
                    np.asarray(res.results["lr"])[loss_finite],
                    frac=1.0 / 10.0,
                )[min(loss_finite.sum() - 3, 10):-1].T
                optimal_idx = np.gradient(loss_smoothed).argmin()
                optimal_lr = lr_smoothed[optimal_idx]
            else:
                optimal_idx = np.asarray(res.results["loss"]).argmin()
                optimal_lr = res.results["lr"][optimal_idx]
            optuna_logger.info(f"Using learning rate of {optimal_lr:.3g}")
            # add learning rate artificially
            model.hparams.learning_rate = trial.suggest_uniform(
                "learning_rate", optimal_lr, optimal_lr)
        else:
            model.hparams.learning_rate = trial.suggest_loguniform(
                "learning_rate", *learning_rate_range)

        # fit
        trainer.fit(model,
                    train_dataloader=train_dataloader,
                    val_dataloaders=val_dataloader)

        # report result
        return metrics_callback.metrics[-1]["val_loss"].item()

    # setup optuna and run
    pruner = optuna.pruners.SuccessiveHalvingPruner()
    if study is None:
        study = optuna.create_study(direction="minimize", pruner=pruner)
    study.optimize(objective, n_trials=n_trials, timeout=timeout)
    return study
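A typical invocation; a sketch assuming `train_dataloader` and `val_dataloader` were created with TimeSeriesDataSet.to_dataloader as in the other examples:

study = optimize_hyperparameters(
    train_dataloader,
    val_dataloader,
    model_path="optuna_test",
    n_trials=30,
    max_epochs=10,
    use_learning_rate_finder=False,  # optimize learning_rate as a regular trial parameter
)
print(study.best_trial.params)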
Example #6
def test_integration(multiple_dataloaders_with_covariates, tmp_path, gpus):
    train_dataloader = multiple_dataloaders_with_covariates["train"]
    val_dataloader = multiple_dataloaders_with_covariates["val"]
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=1e-4,
                                        patience=1,
                                        verbose=False,
                                        mode="min")

    # check training
    logger = TensorBoardLogger(tmp_path)
    checkpoint = ModelCheckpoint(filepath=tmp_path)
    trainer = pl.Trainer(
        checkpoint_callback=checkpoint,
        max_epochs=3,
        gpus=gpus,
        weights_summary="top",
        gradient_clip_val=0.1,
        callbacks=[early_stop_callback],
        fast_dev_run=True,
        logger=logger,
    )
    # test monotone constraints automatically
    if "discount_in_percent" in train_dataloader.dataset.reals:
        monotone_constaints = {"discount_in_percent": +1}
        cuda_context = torch.backends.cudnn.flags(enabled=False)
    else:
        monotone_constaints = {}
        cuda_context = nullcontext()

    with cuda_context:
        if isinstance(train_dataloader.dataset.target_normalizer,
                      NaNLabelEncoder):
            loss = CrossEntropy()
        elif isinstance(train_dataloader.dataset.target_normalizer,
                        MultiNormalizer):
            loss = MultiLoss([
                CrossEntropy()
                if isinstance(normalizer, NaNLabelEncoder) else QuantileLoss()
                for normalizer in
                train_dataloader.dataset.target_normalizer.normalizers
            ])
        else:
            loss = QuantileLoss()
        net = TemporalFusionTransformer.from_dataset(
            train_dataloader.dataset,
            learning_rate=0.15,
            hidden_size=4,
            attention_head_size=1,
            dropout=0.2,
            hidden_continuous_size=2,
            loss=loss,
            log_interval=5,
            log_val_interval=1,
            log_gradient_flow=True,
            monotone_constaints=monotone_constaints,
        )
        net.size()
        try:
            trainer.fit(
                net,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloader,
            )

            # check loading
            net = TemporalFusionTransformer.load_from_checkpoint(
                checkpoint.best_model_path)

            # check prediction
            net.predict(val_dataloader,
                        fast_dev_run=True,
                        return_index=True,
                        return_decoder_lengths=True)
            # check prediction on gpu
            if not (isinstance(gpus, int) and gpus == 0):
                net.to("cuda")
                net.predict(val_dataloader,
                            fast_dev_run=True,
                            return_index=True,
                            return_decoder_lengths=True)

        finally:
            shutil.rmtree(tmp_path, ignore_errors=True)
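The cuDNN context above exists because the monotonicity constraints are trained with a gradient penalty, which presumably needs a double backward pass that cuDNN's fused LSTM kernels do not support. The pattern in isolation, as a sketch:

import torch
from contextlib import nullcontext

monotone_constaints = {"discount_in_percent": +1}  # spelling follows the library's kwarg
cuda_context = (
    torch.backends.cudnn.flags(enabled=False) if monotone_constaints else nullcontext()
)
with cuda_context:
    pass  # trainer.fit(...) would run here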
Example #7
    def __init__(
        self,
        hidden_size: int = 16,
        lstm_layers: int = 1,
        dropout: float = 0.1,
        output_size: Union[int, List[int]] = 7,
        loss: MultiHorizonMetric = None,
        attention_head_size: int = 4,
        max_encoder_length: int = 10,
        static_categoricals: List[str] = [],
        static_reals: List[str] = [],
        time_varying_categoricals_encoder: List[str] = [],
        time_varying_categoricals_decoder: List[str] = [],
        categorical_groups: Dict[str, List[str]] = {},
        time_varying_reals_encoder: List[str] = [],
        time_varying_reals_decoder: List[str] = [],
        x_reals: List[str] = [],
        x_categoricals: List[str] = [],
        hidden_continuous_size: int = 8,
        hidden_continuous_sizes: Dict[str, int] = {},
        embedding_sizes: Dict[str, Tuple[int, int]] = {},
        embedding_paddings: List[str] = [],
        embedding_labels: Dict[str, np.ndarray] = {},
        learning_rate: float = 1e-3,
        log_interval: Union[int, float] = -1,
        log_val_interval: Union[int, float] = None,
        log_gradient_flow: bool = False,
        reduce_on_plateau_patience: int = 1000,
        monotone_constaints: Dict[str, int] = {},
        share_single_variable_networks: bool = False,
        logging_metrics: nn.ModuleList = None,
        **kwargs,
    ):
        """
        Temporal Fusion Transformer for forecasting timeseries - use its :py:meth:`~from_dataset` method if possible.

        Implementation of the article
        `Temporal Fusion Transformers for Interpretable Multi-horizon Time Series
        Forecasting <https://arxiv.org/pdf/1912.09363.pdf>`_. The network outperforms DeepAR by Amazon by 36-69%
        in benchmarks.

        Enhancements compared to the original implementation (apart from capabilities added through base model
        such as monotone constraints):

        * static variables can be continuous
        * multiple categorical variables can be summarized with an EmbeddingBag
        * variable encoder and decoder length by sample
        * categorical embeddings are not transformed by variable selection network (because it is a redundant operation)
        * variable dimensions in the variable selection network are scaled up via linear interpolation to reduce
          the number of parameters
        * non-linear variable processing in variable selection network can be shared among decoder and encoder
          (not shared by default)

        Tune its hyperparameters with
        :py:func:`~pytorch_forecasting.models.temporal_fusion_transformer.tuning.optimize_hyperparameters`.

        Args:

            hidden_size: hidden size of network which is its main hyperparameter and can range from 8 to 512
            lstm_layers: number of LSTM layers (2 is mostly optimal)
            dropout: dropout rate
            output_size: number of outputs (e.g. number of quantiles for QuantileLoss with a single target, or a list
                of output sizes for multiple targets).
            loss: loss function taking prediction and targets
            attention_head_size: number of attention heads (4 is a good default)
            max_encoder_length: length to encode (can be far longer than the decoder length but does not have to be)
            static_categoricals: names of static categorical variables
            static_reals: names of static continuous variables
            time_varying_categoricals_encoder: names of categorical variables for encoder
            time_varying_categoricals_decoder: names of categorical variables for decoder
            time_varying_reals_encoder: names of continuous variables for encoder
            time_varying_reals_decoder: names of continuous variables for decoder
            categorical_groups: dictionary where values
                are list of categorical variables that are forming together a new categorical
                variable which is the key in the dictionary
            x_reals: order of continuous variables in tensor passed to forward function
            x_categoricals: order of categorical variables in tensor passed to forward function
            hidden_continuous_size: default for hidden size for processing continuous variables (similar to categorical
                embedding size)
            hidden_continuous_sizes: dictionary mapping continuous input indices to sizes for variable selection
                (fallback to hidden_continuous_size if index is not in dictionary)
            embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and
                embedding size
            embedding_paddings: list of categorical variables for which the embedding of index zero is mapped to a
                zero vector
            embedding_labels: dictionary mapping (string) indices to list of categorical labels
            learning_rate: learning rate
            log_interval: log predictions every x batches, do not log if 0 or less, log interpretation if > 0. If < 1.0,
                multiple entries per batch will be logged. Defaults to -1.
            log_val_interval: frequency with which to log validation set metrics, defaults to log_interval
            log_gradient_flow: whether to log gradient flow; this takes time and should only be done to diagnose
                training failures
            reduce_on_plateau_patience (int): patience after which learning rate is reduced by a factor of 10
            monotone_constaints (Dict[str, int]): dictionary of monotonicity constraints for continuous decoder
                variables mapping
                position (e.g. ``"0"`` for first position) to constraint (``-1`` for negative and ``+1`` for positive,
                larger numbers add more weight to the constraint vs. the loss but are usually not necessary).
                This constraint significantly slows down training. Defaults to {}.
            share_single_variable_networks (bool): whether to share the single variable networks between the encoder
                and decoder. Defaults to False.
            logging_metrics (nn.ModuleList[LightningMetric]): list of metrics that are logged during training.
                Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]).
            **kwargs: additional arguments to :py:class:`~BaseModel`.
        """
        if logging_metrics is None:
            logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()])
        if loss is None:
            loss = QuantileLoss()
        self.save_hyperparameters()
        # store loss function separately as it is a module
        assert isinstance(
            loss,
            LightningMetric), "Loss has to be a PyTorch Lightning `Metric`"
        super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)

        # processing inputs
        # embeddings
        self.input_embeddings = MultiEmbedding(
            embedding_sizes=self.hparams.embedding_sizes,
            categorical_groups=self.hparams.categorical_groups,
            embedding_paddings=self.hparams.embedding_paddings,
            x_categoricals=self.hparams.x_categoricals,
            max_embedding_size=self.hparams.hidden_size,
        )

        # continuous variable processing
        self.prescalers = nn.ModuleDict({
            name: nn.Linear(
                1,
                self.hparams.hidden_continuous_sizes.get(
                    name, self.hparams.hidden_continuous_size))
            for name in self.reals
        })

        # variable selection
        # variable selection for static variables
        static_input_sizes = {
            name: self.hparams.embedding_sizes[name][1]
            for name in self.hparams.static_categoricals
        }
        static_input_sizes.update({
            name: self.hparams.hidden_continuous_sizes.get(
                name, self.hparams.hidden_continuous_size)
            for name in self.hparams.static_reals
        })
        self.static_variable_selection = VariableSelectionNetwork(
            input_sizes=static_input_sizes,
            hidden_size=self.hparams.hidden_size,
            input_embedding_flags={
                name: True
                for name in self.hparams.static_categoricals
            },
            dropout=self.hparams.dropout,
            prescalers=self.prescalers,
        )

        # variable selection for encoder and decoder
        encoder_input_sizes = {
            name: self.hparams.embedding_sizes[name][1]
            for name in self.hparams.time_varying_categoricals_encoder
        }
        encoder_input_sizes.update({
            name: self.hparams.hidden_continuous_sizes.get(
                name, self.hparams.hidden_continuous_size)
            for name in self.hparams.time_varying_reals_encoder
        })

        decoder_input_sizes = {
            name: self.hparams.embedding_sizes[name][1]
            for name in self.hparams.time_varying_categoricals_decoder
        }
        decoder_input_sizes.update({
            name: self.hparams.hidden_continuous_sizes.get(
                name, self.hparams.hidden_continuous_size)
            for name in self.hparams.time_varying_reals_decoder
        })

        # create single variable grns that are shared across decoder and encoder
        if self.hparams.share_single_variable_networks:
            self.shared_single_variable_grns = nn.ModuleDict()
            for name, input_size in encoder_input_sizes.items():
                self.shared_single_variable_grns[name] = GatedResidualNetwork(
                    input_size,
                    min(input_size, self.hparams.hidden_size),
                    self.hparams.hidden_size,
                    self.hparams.dropout,
                )
            for name, input_size in decoder_input_sizes.items():
                if name not in self.shared_single_variable_grns:
                    self.shared_single_variable_grns[name] = GatedResidualNetwork(
                        input_size,
                        min(input_size, self.hparams.hidden_size),
                        self.hparams.hidden_size,
                        self.hparams.dropout,
                    )

        self.encoder_variable_selection = VariableSelectionNetwork(
            input_sizes=encoder_input_sizes,
            hidden_size=self.hparams.hidden_size,
            input_embedding_flags={
                name: True
                for name in self.hparams.time_varying_categoricals_encoder
            },
            dropout=self.hparams.dropout,
            context_size=self.hparams.hidden_size,
            prescalers=self.prescalers,
            single_variable_grns=(
                {} if not self.hparams.share_single_variable_networks
                else self.shared_single_variable_grns
            ),
        )

        self.decoder_variable_selection = VariableSelectionNetwork(
            input_sizes=decoder_input_sizes,
            hidden_size=self.hparams.hidden_size,
            input_embedding_flags={
                name: True
                for name in self.hparams.time_varying_categoricals_decoder
            },
            dropout=self.hparams.dropout,
            context_size=self.hparams.hidden_size,
            prescalers=self.prescalers,
            single_variable_grns=(
                {} if not self.hparams.share_single_variable_networks
                else self.shared_single_variable_grns
            ),
        )

        # static encoders
        # for variable selection
        self.static_context_variable_selection = GatedResidualNetwork(
            input_size=self.hparams.hidden_size,
            hidden_size=self.hparams.hidden_size,
            output_size=self.hparams.hidden_size,
            dropout=self.hparams.dropout,
        )

        # for hidden state of the lstm
        self.static_context_initial_hidden_lstm = GatedResidualNetwork(
            input_size=self.hparams.hidden_size,
            hidden_size=self.hparams.hidden_size,
            output_size=self.hparams.hidden_size,
            dropout=self.hparams.dropout,
        )

        # for cell state of the lstm
        self.static_context_initial_cell_lstm = GatedResidualNetwork(
            input_size=self.hparams.hidden_size,
            hidden_size=self.hparams.hidden_size,
            output_size=self.hparams.hidden_size,
            dropout=self.hparams.dropout,
        )

        # for post lstm static enrichment
        self.static_context_enrichment = GatedResidualNetwork(
            self.hparams.hidden_size, self.hparams.hidden_size,
            self.hparams.hidden_size, self.hparams.dropout)

        # lstm encoder (history) and decoder (future) for local processing
        self.lstm_encoder = LSTM(
            input_size=self.hparams.hidden_size,
            hidden_size=self.hparams.hidden_size,
            num_layers=self.hparams.lstm_layers,
            dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0,
            batch_first=True,
        )

        self.lstm_decoder = LSTM(
            input_size=self.hparams.hidden_size,
            hidden_size=self.hparams.hidden_size,
            num_layers=self.hparams.lstm_layers,
            dropout=self.hparams.dropout if self.hparams.lstm_layers > 1 else 0,
            batch_first=True,
        )

        # skip connection for lstm
        self.post_lstm_gate_encoder = GatedLinearUnit(
            self.hparams.hidden_size, dropout=self.hparams.dropout)
        self.post_lstm_gate_decoder = self.post_lstm_gate_encoder
        # self.post_lstm_gate_decoder = GatedLinearUnit(self.hparams.hidden_size, dropout=self.hparams.dropout)
        self.post_lstm_add_norm_encoder = AddNorm(self.hparams.hidden_size,
                                                  trainable_add=False)
        # self.post_lstm_add_norm_decoder = AddNorm(self.hparams.hidden_size, trainable_add=True)
        self.post_lstm_add_norm_decoder = self.post_lstm_add_norm_encoder

        # static enrichment and processing past LSTM
        self.static_enrichment = GatedResidualNetwork(
            input_size=self.hparams.hidden_size,
            hidden_size=self.hparams.hidden_size,
            output_size=self.hparams.hidden_size,
            dropout=self.hparams.dropout,
            context_size=self.hparams.hidden_size,
        )

        # attention for long-range processing
        self.multihead_attn = InterpretableMultiHeadAttention(
            d_model=self.hparams.hidden_size,
            n_head=self.hparams.attention_head_size,
            dropout=self.hparams.dropout)
        self.post_attn_gate_norm = GateAddNorm(self.hparams.hidden_size,
                                               dropout=self.hparams.dropout,
                                               trainable_add=False)
        self.pos_wise_ff = GatedResidualNetwork(self.hparams.hidden_size,
                                                self.hparams.hidden_size,
                                                self.hparams.hidden_size,
                                                dropout=self.hparams.dropout)

        # output processing -> no dropout at this late stage
        self.pre_output_gate_norm = GateAddNorm(self.hparams.hidden_size,
                                                dropout=None,
                                                trainable_add=False)

        if self.n_targets > 1:  # if to run with multiple targets
            self.output_layer = nn.ModuleList([
                nn.Linear(self.hparams.hidden_size, output_size)
                for output_size in self.hparams.output_size
            ])
        else:
            self.output_layer = nn.Linear(self.hparams.hidden_size,
                                          self.hparams.output_size)
Example #8
def test_integration(multiple_dataloaders_with_covariates, tmp_path, gpus):
    train_dataloader = multiple_dataloaders_with_covariates["train"]
    val_dataloader = multiple_dataloaders_with_covariates["val"]
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min")

    # check training
    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(
        max_epochs=2,
        gpus=gpus,
        weights_summary="top",
        gradient_clip_val=0.1,
        callbacks=[early_stop_callback],
        checkpoint_callback=True,
        default_root_dir=tmp_path,
        limit_train_batches=2,
        limit_val_batches=2,
        logger=logger,
    )
    # test monotone constraints automatically
    if "discount_in_percent" in train_dataloader.dataset.reals:
        monotone_constaints = {"discount_in_percent": +1}
        cuda_context = torch.backends.cudnn.flags(enabled=False)
    else:
        monotone_constaints = {}
        cuda_context = nullcontext()

    with cuda_context:
        if isinstance(train_dataloader.dataset.target_normalizer, NaNLabelEncoder):
            loss = CrossEntropy()
        elif isinstance(train_dataloader.dataset.target_normalizer, MultiNormalizer):
            loss = MultiLoss(
                [
                    CrossEntropy() if isinstance(normalizer, NaNLabelEncoder) else QuantileLoss()
                    for normalizer in train_dataloader.dataset.target_normalizer.normalizers
                ]
            )
        else:
            loss = QuantileLoss()
        net = TemporalFusionTransformer.from_dataset(
            train_dataloader.dataset,
            learning_rate=0.15,
            hidden_size=4,
            attention_head_size=1,
            dropout=0.2,
            hidden_continuous_size=2,
            loss=loss,
            log_interval=5,
            log_val_interval=1,
            log_gradient_flow=True,
            monotone_constaints=monotone_constaints,
        )
        net.size()
        try:
            trainer.fit(
                net,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloader,
            )

            # check loading
            net = TemporalFusionTransformer.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

            # check prediction
            predictions, x, index = net.predict(val_dataloader, return_index=True, return_x=True)
            pred_len = len(multiple_dataloaders_with_covariates["val"].dataset)

            # check that output is of correct shape
            def check(x):
                if isinstance(x, (tuple, list)):
                    for xi in x:
                        check(xi)
                elif isinstance(x, dict):
                    for xi in x.values():
                        check(xi)
                else:
                    assert pred_len == x.shape[0], "first dimension should be prediction length"

            check(predictions)
            check(x)
            check(index)

            # check prediction on gpu
            if not (isinstance(gpus, int) and gpus == 0):
                net.to("cuda")
                net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)

        finally:
            shutil.rmtree(tmp_path, ignore_errors=True)
Example #9
    def __init__(
        self,
        n_lags=60,
        n_forecasts=20,
        batch_size=None,
        epochs=100,
        patience_early_stopping=10,
        early_stop=True,
        learning_rate=3e-2,
        auto_lr_find=True,
        num_workers=3,
        loss_func="QuantileLoss",
        hidden_size=32,
        attention_head_size=1,
        hidden_continuous_size=8,
        dropout=0.1,
    ):
        """
        Args:
            n_lags: int, number of time units that condition the predictions; also known as the 'lookback period'.
                Should be between 1 and 10 times the prediction length. Can be seen as the equivalent of n_lags in NP
            n_forecasts: int, number of time units that the model predicts
            batch_size: int, batch size; if set to None, an automatic batch size will be set
            epochs: int, number of epochs for training; training ends earlier if EarlyStopping triggers
            patience_early_stopping: int, patience parameter of the EarlyStopping callback
            early_stop: bool, whether to use the EarlyStopping callback
            learning_rate: float, learning rate for the model; will be overwritten if auto_lr_find is used
            auto_lr_find: bool, whether to use the automatic learning rate finder
            num_workers: int, number of workers for the DataLoaders
            loss_func: str, loss function taking prediction and targets; should be from the MultiHorizonMetric class,
                defaults to QuantileLoss.
            hidden_size: int, hidden size of the network, which is its main hyperparameter and can range from 8 to 512
            attention_head_size: int, number of attention heads; larger values (up to 8) for large amounts of data
            hidden_continuous_size: int, hidden size for processing continuous variables (similar to the categorical
                embedding size)
            dropout: dropout in RNN layers, should be between 0 and 1.
        """

        self.batch_size = batch_size

        self.epochs = epochs
        self.patience_early_stopping = patience_early_stopping
        self.early_stop = early_stop
        self.learning_rate = learning_rate
        self.auto_lr_find = auto_lr_find
        if self.learning_rate is not None:
            self.auto_lr_find = False
        self.num_workers = num_workers

        self.context_length = n_lags
        self.prediction_length = n_forecasts

        self.hidden_size = hidden_size
        self.attention_head_size = attention_head_size
        self.hidden_continuous_size = hidden_continuous_size
        self.dropout = dropout
        self.loss_func = loss_func

        self.fitted = False
        self.freq = None

        if isinstance(self.loss_func, str):
            if self.loss_func.lower() in ["huber", "smoothl1", "smoothl1loss"]:
                self.loss_func = torch.nn.SmoothL1Loss()
            elif self.loss_func.lower() in ["mae", "l1", "l1loss"]:
                self.loss_func = torch.nn.L1Loss()
            elif self.loss_func.lower() in ["mse", "mseloss", "l2", "l2loss"]:
                self.loss_func = torch.nn.MSELoss()
            elif self.loss_func.lower() in ["quantileloss"]:
                self.loss_func = QuantileLoss()
            else:
                raise NotImplementedError("Loss function {} name not defined".format(self.loss_func))
        elif callable(self.loss_func):
            pass
        elif hasattr(torch.nn.modules.loss, self.loss_func.__class__.__name__):
            pass
        else:
            raise NotImplementedError("Loss function {} not found".format(self.loss_func))

        self.metrics = metrics.MetricsCollection(
            metrics=[metrics.LossMetric(torch.nn.SmoothL1Loss()), metrics.MAE(), metrics.MSE()],
            value_metrics=[
                # metrics.ValueMetric("Loss"),
            ],
        )

        self.val_metrics = metrics.MetricsCollection([m.new() for m in self.metrics.batch_metrics])
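The string-to-loss dispatch above can be exercised on its own; a sketch with an illustrative helper name (resolve_loss is not part of the original):

import torch
from pytorch_forecasting.metrics import QuantileLoss

def resolve_loss(name: str):
    # map a loss name, case-insensitively, to a loss module
    name = name.lower()
    if name in ("huber", "smoothl1", "smoothl1loss"):
        return torch.nn.SmoothL1Loss()
    if name in ("mae", "l1", "l1loss"):
        return torch.nn.L1Loss()
    if name in ("mse", "mseloss", "l2", "l2loss"):
        return torch.nn.MSELoss()
    if name == "quantileloss":
        return QuantileLoss()
    raise NotImplementedError(f"Loss function {name} name not defined")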
Example #10
    def __init__(self, output_size: int = 7, loss: MultiHorizonMetric = QuantileLoss()):
        self.save_hyperparameters()
        super().__init__()
        self.loss = loss
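One caveat with this signature: the default loss=QuantileLoss() is constructed once at class-definition time, so every instance shares the same loss module. The other examples avoid this with a None default; a sketch:

from pytorch_forecasting.metrics import MultiHorizonMetric, QuantileLoss

def __init__(self, output_size: int = 7, loss: MultiHorizonMetric = None):
    if loss is None:
        loss = QuantileLoss()  # a fresh instance per model instead of one shared default
    ...  # remainder as in the example above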
Example #11
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
    callbacks=[lr_logger, early_stop_callback],
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,
    loss=QuantileLoss(),
    log_interval=10,
    log_val_interval=1,
    reduce_on_plateau_patience=3,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# # find optimal learning rate
# # remove logging and artificial epoch size
# tft.hparams.log_interval = -1
# tft.hparams.log_val_interval = -1
# trainer.limit_train_batches = 1.0
# # run learning rate finder
# res = trainer.tuner.lr_find(
#     tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2
# )
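The step that typically follows; a sketch assuming `train_dataloader` and `val_dataloader` are defined earlier in the script:

trainer.fit(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)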
Example #12
    def __init__(
        self,
        activation_class: str = "ReLU",
        hidden_size: int = 300,
        n_hidden_layers: int = 3,
        dropout: float = 0.1,
        norm: bool = True,
        static_categoricals: List[str] = [],
        static_reals: List[str] = [],
        time_varying_categoricals_encoder: List[str] = [],
        time_varying_categoricals_decoder: List[str] = [],
        categorical_groups: Dict[str, List[str]] = {},
        time_varying_reals_encoder: List[str] = [],
        time_varying_reals_decoder: List[str] = [],
        embedding_sizes: Dict[str, Tuple[int, int]] = {},
        embedding_paddings: List[str] = [],
        embedding_labels: Dict[str, np.ndarray] = {},
        x_reals: List[str] = [],
        x_categoricals: List[str] = [],
        output_size: Union[int, List[int]] = 1,
        target: Union[str, List[str]] = None,
        loss: MultiHorizonMetric = None,
        logging_metrics: nn.ModuleList = None,
        **kwargs,
    ):
        """
        Args:
            activation_class (str, optional): PyTorch activation class. Defaults to "ReLU".
            hidden_size (int, optional): hidden size - the most important hyperparameter along with
                ``n_hidden_layers``. Defaults to 300.
            n_hidden_layers (int, optional): Number of hidden layers - important hyperparameter. Defaults to 3.
            dropout (float, optional): Dropout. Defaults to 0.1.
            norm (bool, optional): whether to use normalization in the MLP. Defaults to True.
            static_categoricals: names of static categorical variables
            static_reals: names of static continuous variables
            time_varying_categoricals_encoder: names of categorical variables for the encoder
            time_varying_categoricals_decoder: names of categorical variables for the decoder
            time_varying_reals_encoder: names of continuous variables for the encoder
            time_varying_reals_decoder: names of continuous variables for the decoder
            categorical_groups: dictionary where values
                are list of categorical variables that are forming together a new categorical
                variable which is the key in the dictionary
            x_reals: order of continuous variables in tensor passed to forward function
            x_categoricals: order of categorical variables in tensor passed to forward function
            embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and
                embedding size
            embedding_paddings: list of categorical variables for which the embedding of index zero is mapped to a
                zero vector
            embedding_labels: dictionary mapping (string) indices to list of categorical labels
            output_size (Union[int, List[int]], optional): number of outputs (e.g. number of quantiles for
                QuantileLoss and one target or list of output sizes).
            target (str, optional): Target variable or list of target variables. Defaults to None.
            loss (MultiHorizonMetric, optional): loss: loss function taking prediction and targets.
                Defaults to QuantileLoss.
            logging_metrics (nn.ModuleList, optional): Metrics to log during training.
                Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]).
        """
        if loss is None:
            loss = QuantileLoss()
        if logging_metrics is None:
            logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()])
        self.save_hyperparameters()
        # store loss function separately as it is a module
        super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)

        self.input_embeddings = MultiEmbedding(
            embedding_sizes={
                name: val
                for name, val in embedding_sizes.items()
                if name in self.decoder_variables + self.static_variables
            },
            embedding_paddings=embedding_paddings,
            categorical_groups=categorical_groups,
            x_categoricals=x_categoricals,
        )
        # define network
        if isinstance(self.hparams.output_size, int):
            mlp_output_size = self.hparams.output_size
        else:
            mlp_output_size = sum(self.hparams.output_size)

        cont_size = len(self.decoder_reals_positions)
        cat_size = sum(self.input_embeddings.output_size.values())
        input_size = cont_size + cat_size

        self.mlp = FullyConnectedModule(
            dropout=dropout,
            norm=self.hparams.norm,
            activation_class=getattr(nn, self.hparams.activation_class),
            input_size=input_size,
            output_size=mlp_output_size,
            hidden_size=self.hparams.hidden_size,
            n_hidden_layers=self.hparams.n_hidden_layers,
        )
Example #13
    @classmethod
    def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs):
        new_kwargs = cls.deduce_default_output_parameters(dataset, kwargs, QuantileLoss())
        kwargs.update(new_kwargs)
        return super().from_dataset(dataset, **kwargs)
Example #14
@pytest.mark.parametrize(
    "kwargs",
    [
        {},
        dict(
            loss=MultiLoss([QuantileLoss(), MAE()]),
            data_loader_kwargs=dict(
                time_varying_unknown_reals=["volume", "discount"],
                target=["volume", "discount"],
            ),
        ),
        dict(
            loss=CrossEntropy(),
            data_loader_kwargs=dict(target="agency", ),
        ),
    ],
)
def test_integration(data_with_covariates, tmp_path, gpus, kwargs):
    _integration(data_with_covariates.assign(target=lambda x: x.volume),
                 tmp_path, gpus, **kwargs)
Example #15
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
    callbacks=[lr_logger],
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.1,
    hidden_size=32,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=32,
    output_size=3,
    loss=QuantileLoss(quantiles=[0.1, 0.5, 0.9]),
    log_interval=10,
    log_val_interval=3,
    # reduce_on_plateau_patience=3,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# # find optimal learning rate
# tft.hparams.log_interval = -1
# tft.hparams.log_val_interval = -1
# trainer.limit_train_batches = 1.0
# res = trainer.tuner.lr_find(
#     tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2
# )

# print(f"suggested learning rate: {res.suggestion()}")
Example #16
    def train(
        self,
        max_epochs=25,
        hidden_size=16,
        lstm_layers=1,
        dropout=0.1,
        attention_head_size=4,
        reduce_on_plateau_patience=4,
        hidden_continuous_size=8,
        learning_rate=1e-3,
        gradient_clip_val=0.1,
    ):
        # configure network and trainer
        # create dataloaders for model
        batch_size = 128
        train_dataloader = self.intern_training.to_dataloader(
            train=True, batch_size=batch_size)
        val_dataloader = self._intern_validation.to_dataloader(
            train=False, batch_size=batch_size * 10)

        pl.seed_everything(42)

        early_stop_callback = EarlyStopping(monitor="val_loss",
                                            min_delta=1e-4,
                                            patience=10,
                                            verbose=False,
                                            mode="min")
        # lr_logger = LearningRateMonitor()

        trainer = pl.Trainer(
            max_epochs=max_epochs,
            gpus=0,
            weights_summary=None,
            gradient_clip_val=gradient_clip_val,
            # limit_train_batches=30,  # comment in for training, running validation every 30 batches
            # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
            callbacks=[early_stop_callback],
        )

        self.model = TemporalFusionTransformer.from_dataset(
            self.intern_training,
            learning_rate=learning_rate,
            hidden_size=hidden_size,
            attention_head_size=attention_head_size,
            dropout=dropout,
            hidden_continuous_size=hidden_continuous_size,
            lstm_layers=lstm_layers,
            output_size=len(self.quantiles),  # 3 quantiles by default
            loss=QuantileLoss(self.quantiles),
            reduce_on_plateau_patience=reduce_on_plateau_patience,
        )

        # res = trainer.tuner.lr_find(
        #     self.model,
        #     train_dataloader=train_dataloader,
        #     val_dataloaders=val_dataloader,
        #     max_lr=10.0,
        #     min_lr=1e-6,
        # )

        # self.model = TemporalFusionTransformer.from_dataset(
        #     self.intern_training,
        #     learning_rate=res.suggestion(),  # using the suggested learning rate
        #     hidden_size=hidden_size,
        #     attention_head_size=attention_head_size,
        #     dropout=dropout,
        #     hidden_continuous_size=hidden_continuous_size,
        #     output_size=len(self.quantiles),  # 3 quantiles by default
        #     loss=QuantileLoss(self.quantiles),
        #     reduce_on_plateau_patience=reduce_on_plateau_patience,
        # )

        # fit network
        trainer.fit(
            self.model,
            train_dataloader=train_dataloader,
            val_dataloaders=val_dataloader,
        )
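A step that usually follows such a fit (a sketch, not part of the original snippet): reload the best checkpoint the trainer saved and predict with it, mirroring Examples #6 and #8.

best_model_path = trainer.checkpoint_callback.best_model_path  # assumes default checkpointing
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
predictions = best_tft.predict(val_dataloader)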