def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        # weight="weight",
        group_ids=["agency", "sku"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        **params  # fixture parametrization
    )

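    # build the validation set from the training dataset's parameters, predicting only time steps after the last training index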
    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=training.index.time.max() + 1)
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)

def dataloaders_with_covariates(data_with_covariates):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                          coerce_positive=False),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=training.index.time.max() + 1)
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)

def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]

    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
Example #4
def make_dataloaders(data_with_covariates, **kwargs):
    training_cutoff = "2016-09-01"
    max_encoder_length = 4
    max_prediction_length = 3

    kwargs.setdefault("target", "volume")
    kwargs.setdefault("group_ids", ["agency", "sku"])
    kwargs.setdefault("add_relative_time_idx", True)
    kwargs.setdefault("time_varying_unknown_reals", ["volume"])

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff].copy(),
        time_idx="time_idx",
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        **kwargs,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates.copy(),
        min_prediction_idx=training.index.time.max() + 1)
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=2,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=2,
                                              num_workers=0)
    test_dataloader = validation.to_dataloader(train=False,
                                               batch_size=1,
                                               num_workers=0)

    return dict(train=train_dataloader,
                val=val_dataloader,
                test=test_dataloader)
Example #5
    def test_model(model_path):
        """ Tests results of the given model on the dataset """
        PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
        FILE_PATH = 'data/preprocess_data.py'
        DATA_PATH = 'data/data.csv'

        # download the preprocessing script if it is missing
        # (PREPROCESS_URL and FILE_PATH were undefined here; values taken from save_time_series below)
        if not os.path.isfile(FILE_PATH):
            wget.download(PREPROCESS_URL, FILE_PATH)

        dataset = pd.read_csv(DATA_PATH)

        dataset['target'] = dataset['target'].astype(float)
        dataset['time_idx'] = dataset['time_idx'].astype(int)
        
        time_series = TimeSeriesDataSet.load('models/dataset_time_set')
        validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
        
        all_dataloader = validation.to_dataloader(train=False, num_workers=0)
        model = TemporalFusionTransformer.load_from_checkpoint(model_path)

        actuals = torch.cat([y[0] for (x, y) in iter(all_dataloader)])
        predictions = model.predict(all_dataloader)

        print(f'test mape is {((actuals - predictions).abs() / actuals).mean()}')

        print(f'max mape is {((actuals - predictions).abs() / actuals).max()}')

        res = (actuals - predictions).abs() / actuals
        print(f'99th percentile mape is {np.quantile(res, .99)}')
#         print("wynik", res)
        res = np.array([int(x) for x in res])
Example #6
    def transform_data(self, data, past_lags, index_label, target_label,
                       train_val_split):

        self.past_lags = past_lags
        self.oldest_lag = int(max(self.past_lags)) + 1
        self.index_label = index_label
        self.target_label = target_label

        # External train and validation sets
        X = data[[index_label]]
        y = data[[target_label]]

        self.training = (X.loc[:int(len(data) * train_val_split)],
                         y.loc[:int(len(data) * train_val_split)])
        self.validation = (X.loc[int(len(data) * train_val_split):],
                           y.loc[int(len(data) * train_val_split):])

        # internal train and validation sets; they use dataloaders to optimize the training routine
        # the time index could be epoch seconds (see the commented line below); the positional index is used instead
        # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        data["time_idx"] = data.index
        data['group_id'] = 'series'

        max_prediction_length = self.oldest_lag
        max_encoder_length = self.oldest_lag
        # training_cutoff = data["time_idx"].max() - max_prediction_length

        self.intern_training = TimeSeriesDataSet(
            data[:int(len(data) * train_val_split)],
            time_idx="time_idx",
            group_ids=["group_id"],
            target=self.target_label,
            min_encoder_length=0,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["group_id"],
            # time_varying_unknown_reals=[self.target_label],
            # the docs say that max_lag < max_encoder_length
            # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            # allow_missings=True
        )

        # create validation set (predict=True) which means to predict the last max_prediction_length points in time
        # for each series
        self._intern_validation = TimeSeriesDataSet.from_dataset(
            self.intern_training, data, predict=True, stop_randomization=True)

        # store the last input to use as encoder data to next predictions
        self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()
Example #7
    def save_time_series(self):
        """ Download preprocessing file and creates data in a format suited for temporal fusion """
        PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
        FILE_PATH = 'data/preprocess_data.py'
        DATA_PATH = 'data/data.csv'
        FEATURES = ['dose', 'time']
        GROUP_ID = 'series'
        
        # Data file already exists so we don't need to generate it
        if os.path.isfile(DATA_PATH):
            return
        
        # Preprocessing file already exists so we don't need to download it again
        if not os.path.isfile(FILE_PATH):
            wget.download(PREPROCESS_URL, FILE_PATH)
        
        os.system('python ' + FILE_PATH)
        
        dataset = pd.read_csv(DATA_PATH)

        n = dataset[GROUP_ID].astype(int).max()

        dataset['target'] = dataset['target'].astype(float)

        dataset['time_idx'] = dataset['time_idx'].astype(int)

        training = TimeSeriesDataSet(
            dataset[dataset[GROUP_ID].apply(lambda x: int(x) < int(n * 0.7))],
            time_idx='time_idx',
            target='target',
            group_ids=[GROUP_ID],
            min_encoder_length=20,  
            max_encoder_length=20,
            min_prediction_length=1,
            max_prediction_length=1,
            static_categoricals=[],
            static_reals=[],
            time_varying_known_categoricals=[],
            variable_groups={},
            time_varying_known_reals=['time_idx'],
            time_varying_unknown_categoricals=[],
            time_varying_unknown_reals=['target'] + FEATURES,
            add_relative_time_idx=True,
            add_target_scales=False,
            add_encoder_length=True,
            categorical_encoders={GROUP_ID: NaNLabelEncoder().fit(dataset.series)},
        )
        
        training.save(self.TIMESERIES_PATH)
Example #8
    def get_examples(self, year: int, encoder_length: dict,
                     prediction_length: dict) -> TimeSeriesDataSet:

        solar_df_filtered = self.solar_df[(self.solar_df['year'] == year)]
        group_length = 2 * encoder_length['max'] - 1
        num_groups = int(np.floor(solar_df_filtered.shape[0] / group_length))
        solar_df_filtered = solar_df_filtered[1:(num_groups * group_length +
                                                 1)]
        solar_df_filtered['group'] = np.repeat(np.arange(num_groups),
                                               group_length)
        examples = TimeSeriesDataSet(
            solar_df_filtered,
            group_ids=["group"],
            target="power",
            time_idx="time_idx",
            min_encoder_length=encoder_length['min'],
            max_encoder_length=encoder_length['max'],
            min_prediction_length=prediction_length['min'],
            max_prediction_length=prediction_length['max'],
            time_varying_unknown_reals=["power"],
            time_varying_known_reals=[
                "cloudcover_low", "cloudcover_mid", "cloudcover_high"
            ],
            time_varying_known_categoricals=["seasons"],
            allow_missings=True,
        )

        return examples
Example #9
def test_predict_dependency(model, dataloaders_with_covariates, data_with_covariates, kwargs):
    train_dataset = dataloaders_with_covariates["train"].dataset
    dataset = TimeSeriesDataSet.from_dataset(
        train_dataset, data_with_covariates[lambda x: x.agency == data_with_covariates.agency.iloc[0]], predict=True
    )
    model.predict_dependency(dataset, variable="discount", values=[0.1, 0.0], **kwargs)
    model.predict_dependency(dataset, variable="agency", values=data_with_covariates.agency.unique()[:2], **kwargs)

def test_prediction_with_dataloader_raw(data_with_covariates, tmp_path):
    # tests correct concatenation of raw output
    test_data = data_with_covariates.copy()
    np.random.seed(2)
    test_data = test_data.sample(frac=0.5)

    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        max_encoder_length=8,
        max_prediction_length=10,
        min_prediction_length=1,
        min_encoder_length=1,
        target="volume",
        group_ids=["agency", "sku"],
        constant_fill_strategy=dict(volume=0.0),
        allow_missing_timesteps=True,
        time_varying_unknown_reals=["volume"],
        time_varying_known_reals=["time_idx"],
        target_normalizer=GroupNormalizer(groups=["agency", "sku"]),
    )

    net = TemporalFusionTransformer.from_dataset(
        dataset,
        learning_rate=1e-6,
        hidden_size=4,
        attention_head_size=1,
        dropout=0.2,
        hidden_continuous_size=2,
        log_interval=1,
        log_val_interval=1,
        log_gradient_flow=True,
    )
    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(max_epochs=1, gradient_clip_val=1e-6, logger=logger)
    trainer.fit(net,
                train_dataloaders=dataset.to_dataloader(batch_size=4,
                                                        num_workers=0))

    # choose small batch size to provoke issue
    res = net.predict(dataset.to_dataloader(batch_size=2, num_workers=0),
                      mode="raw")
    # check that interpretation works
    net.interpret_output(res)["attention"]
    assert net.interpret_output(res.iget(
        slice(1)))["attention"].size() == torch.Size(
            (1, net.hparams.max_encoder_length))
Example #11
    def predict(self, data):
        """ Transforms data and predicts output based on train model 
        
            Parameters: self, list of protocols
            
            Return: list of results for each protocol based on train model
        """
        print(data)
        self.save_time_series()
        dataset = self.prepare_data(data)
        
        time_series = TimeSeriesDataSet.load(self.TIMESERIES_PATH)
        validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
        
        val_dataloader = validation.to_dataloader(train=False, num_workers=0)
    
        res = self.model.predict(val_dataloader)
#         print("wynik", res)
        res = np.array([int(x) for x in res])
        
        return res
Example #12
def test_dataset(test_data):
    training = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=5,
        max_prediction_length=2,
        randomize_length=None,
    )
    return training

    def _create_dataset(self, df, valid_p=0.2):
        df = df_utils.check_dataframe(df)
        df = self._handle_missing_data(df)
        df = df[["ds", "y"]]
        df["time_idx"] = range(df.shape[0])
        df["series"] = 0
        self.n_data = df.shape[0]
        self.set_auto_batch_epoch(self.n_data)

        training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

        training = TimeSeriesDataSet(
            df.iloc[:training_cutoff],
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"]),
            randomize_length=None,
            add_relative_time_idx=False,
            add_target_scales=False,
        )

        validation = TimeSeriesDataSet.from_dataset(
            training, df, min_prediction_idx=training_cutoff)
        train_dataloader = training.to_dataloader(train=True,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)
        val_dataloader = validation.to_dataloader(train=False,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)

        return training, train_dataloader, val_dataloader
Example #14
    def load_data(
        self,
        data: DataFrame,
        time_idx: Optional[str] = None,
        target: Optional[Union[str, List[str]]] = None,
        group_ids: Optional[List[str]] = None,
        parameters: Optional[Dict[str, Any]] = None,
        **time_series_dataset_kwargs: Any,
    ):
        if self.training:
            time_series_dataset = TimeSeriesDataSet(
                data,
                time_idx=time_idx,
                group_ids=group_ids,
                target=target,
                **time_series_dataset_kwargs)
            parameters = time_series_dataset.get_parameters()

            # Add some sample data so that we can recreate the `TimeSeriesDataSet` later on
            parameters["data_sample"] = data.iloc[[0]].to_dict()

            self.parameters = parameters
        else:
            if parameters is None:
                raise MisconfigurationException(
                    "Loading data for evaluation or inference requires parameters from the train data. Either "
                    "construct the train data at the same time as evaluation and inference or provide the train "
                    "`datamodule.parameters` to `from_data_frame` in the `parameters` argument."
                )
            parameters = copy(parameters)
            parameters.pop("data_sample")
            time_series_dataset = TimeSeriesDataSet.from_parameters(
                parameters,
                data,
                stop_randomization=True,
            )
        return time_series_dataset
Example #15
    def optimize(cls, dataset: TimeSeriesDataSet, num_steps: int, **kwargs):

        model = FullyConnectedModelWithCovariates.from_dataset(
            dataset, **kwargs)
        dataloader = dataset.to_dataloader()
        optimizer = torch.optim.Adam(model.parameters(),
                                     model.hparams.learning_rate)
        criteria = torch.nn.L1Loss()
        for step in range(num_steps):
            optimizer.zero_grad()
            x_train, y_train = next(iter(dataloader))
            # Forward pass
            y_pred = model(x_train)['prediction']
            # Compute Loss
            loss = criteria(y_pred, y_train[0])

            print('Step {}: train loss: {}'.format(step, loss.item()))

            # Backward pass
            loss.backward()
            optimizer.step()

        return model
Example #16
import numpy as np
import pandas as pd

# the cell's preamble was truncated; plausible reconstruction (the exact `value` column is an assumption)
test_data = pd.DataFrame(dict(
    value=np.random.rand(30) - 0.5,
    group=np.repeat(np.arange(3), 10),
    time_idx=np.tile(np.arange(10), 3),
))
test_data

# %%
from pytorch_forecasting import TimeSeriesDataSet

# create the dataset from the pandas dataframe
dataset = TimeSeriesDataSet(
    test_data,
    group_ids=["group"],
    target="value",
    time_idx="time_idx",
    min_encoder_length=5,
    max_encoder_length=5,
    min_prediction_length=2,
    max_prediction_length=2,
    time_varying_unknown_reals=["value"],
)

# %%
dataset.get_parameters()

# %% [markdown]
# Now, we take a look at the output of the dataloader. Its `x` will be fed to the model's forward method, which is why it is so important to understand it.

# %%
# convert the dataset to a dataloader
dataloader = dataset.to_dataloader(batch_size=4)
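# %% [markdown]
# A quick sanity check (a minimal sketch, not part of the original snippet): draw one batch from the dataloader and inspect it. `x` is a dictionary of tensors that is passed to the model's forward method, and `y` is a tuple of the target tensor and an optional weight; the exact keys depend on the dataset configuration.

# %%
x, y = next(iter(dataloader))
print(x.keys())  # encoder/decoder inputs, lengths, target scales, ...
print(y[0].shape)  # target: batch_size x prediction length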
Example #17
max_encoder_length = 150
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

context_length = max_encoder_length
prediction_length = max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    min_encoder_length=context_length,
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
    min_prediction_length=prediction_length,
    time_varying_unknown_reals=["value"],
    randomize_length=None,
    add_relative_time_idx=False,
    add_target_scales=False,
)

validation = TimeSeriesDataSet.from_dataset(training,
                                            data,
                                            min_prediction_idx=training_cutoff)
batch_size = 128
train_dataloader = training.to_dataloader(train=True,
                                          batch_size=batch_size,
                                          num_workers=2)
Example #18
class TFTWrapper(BaseWrapper):
    def __init__(self, quantiles):
        super().__init__(quantiles)
        self.intern_training = None
        self._intern_validation = None
        self.training = None
        self.validation = None
        self.model = None
        self.trainer = None
        self.oldest_lag = None
        self.last_period = None
        self.quantiles = quantiles

    def transform_data(self, data, past_lags, index_label, target_label,
                       train_val_split):

        self.past_lags = past_lags
        self.oldest_lag = int(max(self.past_lags)) + 1
        self.index_label = index_label
        self.target_label = target_label

        # External train and validation sets
        X = data[[index_label]]
        y = data[[target_label]]

        self.training = (X.loc[:int(len(data) * train_val_split)],
                         y.loc[:int(len(data) * train_val_split)])
        self.validation = (X.loc[int(len(data) * train_val_split):],
                           y.loc[int(len(data) * train_val_split):])

        # internal train and validation sets; they use dataloaders to optimize the training routine
        # the time index could be epoch seconds (see the commented line below); the positional index is used instead
        # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        data["time_idx"] = data.index
        data['group_id'] = 'series'

        max_prediction_length = self.oldest_lag
        max_encoder_length = self.oldest_lag
        # training_cutoff = data["time_idx"].max() - max_prediction_length

        self.intern_training = TimeSeriesDataSet(
            data[:int(len(data) * train_val_split)],
            time_idx="time_idx",
            group_ids=["group_id"],
            target=self.target_label,
            min_encoder_length=0,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["group_id"],
            # time_varying_unknown_reals=[self.target_label],
            # the docs say that max_lag < max_encoder_length
            # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            # allow_missings=True
        )

        # create validation set (predict=True) which means to predict the last max_prediction_length points in time
        # for each series
        self._intern_validation = TimeSeriesDataSet.from_dataset(
            self.intern_training, data, predict=True, stop_randomization=True)

        # store the last input to use as encoder data to next predictions
        self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()

    def train(
        self,
        max_epochs=25,
        hidden_size=16,
        lstm_layers=1,
        dropout=0.1,
        attention_head_size=4,
        reduce_on_plateau_patience=4,
        hidden_continuous_size=8,
        learning_rate=1e-3,
        gradient_clip_val=0.1,
    ):
        # configure network and trainer
        # create dataloaders for model
        batch_size = 128
        train_dataloader = self.intern_training.to_dataloader(
            train=True, batch_size=batch_size)
        val_dataloader = self._intern_validation.to_dataloader(
            train=False, batch_size=batch_size * 10)

        pl.seed_everything(42)

        early_stop_callback = EarlyStopping(monitor="val_loss",
                                            min_delta=1e-4,
                                            patience=10,
                                            verbose=False,
                                            mode="min")
        # lr_logger = LearningRateMonitor()

        trainer = pl.Trainer(
            max_epochs=max_epochs,
            gpus=0,
            weights_summary=None,
            gradient_clip_val=gradient_clip_val,
            # limit_train_batches=30,  # comment in for training, running validation every 30 batches
            # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
            callbacks=[early_stop_callback],
        )

        self.model = TemporalFusionTransformer.from_dataset(
            self.intern_training,
            learning_rate=learning_rate,
            hidden_size=hidden_size,
            attention_head_size=attention_head_size,
            dropout=dropout,
            hidden_continuous_size=hidden_continuous_size,
            lstm_layers=lstm_layers,
            output_size=len(self.quantiles),  # 3 quantiles by default
            loss=QuantileLoss(self.quantiles),
            reduce_on_plateau_patience=reduce_on_plateau_patience,
        )

        # res = trainer.tuner.lr_find(
        #     self.model,
        #     train_dataloader=train_dataloader,
        #     val_dataloaders=val_dataloader,
        #     max_lr=10.0,
        #     min_lr=1e-6,
        # )

        # self.model = TemporalFusionTransformer.from_dataset(
        #     self.intern_training,
        #     learning_rate=res.suggestion(), # using the suggested learning rate
        #     hidden_size=hidden_size,
        #     attention_head_size=attention_head_size,
        #     dropout=dropout,
        #     hidden_continuous_size=hidden_continuous_size,
        #     output_size=len(self.quantiles),  # 3 quantiles by default
        #     loss=QuantileLoss(self.quantiles),
        #     reduce_on_plateau_patience=reduce_on_plateau_patience,
        # )

        # fit network
        trainer.fit(
            self.model,
            train_dataloader=train_dataloader,
            val_dataloaders=val_dataloader,
        )

    def _auto_feed(self, X, future_steps, quantile=False):
        """
        Perform auto-feed over the X values to predict the future steps.
        """
        def append_new_data(cur_X, new_value, date_step):
            new_date = cur_X[self.index_label].iloc[-1] + date_step
            new_entry = {
                self.index_label: new_date,
                self.target_label: new_value,
                'time_idx': cur_X['time_idx'].iloc[-1] + 1,
                'group_id': 'series'
            }
            return cur_X.append(new_entry, ignore_index=True)

        # prediction or quantile mode
        mode = 'quantiles' if quantile else 'prediction'

        # interval between dates (last two dates in the dataset)
        cur_X = X.copy()
        date_step = cur_X[self.index_label].iloc[-1] - \
            cur_X[self.index_label].iloc[-2]

        y = []

        # if future_steps is less than or equal to the oldest lag, the model can predict it directly
        if future_steps <= self.oldest_lag:
            predict = self.model.predict(cur_X, mode=mode)[0].numpy().tolist()
            return predict[:future_steps]
        else:
            # shortcut the auto-feed by seeding it with the model's full (more reliable) multi-step prediction
            predict = self.model.predict(cur_X, mode=mode)[0].numpy().tolist()
            for new_value in predict:
                cur_X = append_new_data(cur_X, new_value, date_step)
            y = predict

        for _ in range(self.oldest_lag, future_steps):
            predict = self.model.predict(cur_X, mode=mode)[0][0]
            if quantile:
                y.append(predict.numpy().tolist())
                new_value = y[-1][1]  # get quantile 0.5 (the median)
            else:
                y.append(float(predict.numpy()))
                new_value = y[-1]

            cur_X = append_new_data(cur_X, new_value, date_step)

        return y

    def _verify_target_column(self, data):
        if self.target_label not in data.columns:
            data[self.target_label] = 0

    def predict(self, X, future_steps, history, quantile=False):
        predictions = []

        self._verify_target_column(X)

        for i in range(len(X)):
            X_temp = history.append(X.iloc[:i], ignore_index=True)
            time_idx = list(range(len(X_temp)))  # refactor to use the real time index
            time_idx = [
                idx + self.last_period["time_idx"].max() for idx in time_idx
            ]
            X_temp[self.index_label] = pd.to_datetime(X_temp[self.index_label])
            X_temp[self.index_label] = X_temp[self.index_label].dt.tz_localize(
                None)
            X_temp["time_idx"] = time_idx
            X_temp['group_id'] = 'series'

            y = self._auto_feed(X_temp, future_steps, quantile)
            predictions.append(y)

        return predictions

    def next(self, X, future_steps, quantile=False):

        self._verify_target_column(X)

        # pre-process the data
        X[self.index_label] = pd.to_datetime(X[self.index_label])
        X[self.index_label] = X[self.index_label].dt.tz_localize(None)
        X['group_id'] = 'series'

        temp_data = self.last_period.iloc[-(self.oldest_lag + 1):].copy()

        cur_X = temp_data.append(X, ignore_index=True)
        time_idx = list(range(len(cur_X)))  # refactor to use the real time index
        cur_X["time_idx"] = time_idx

        cur_X.index = list(range(len(cur_X)))

        y = self._auto_feed(cur_X, future_steps, quantile)

        return y
Example #19
    def predict(self, future_dataframe):
        """
        Predicts based on the future_dataframe. Should be called only after make_future_dataframe is called
        Args:
            future_dataframe: DataFrame from the make_future_dataframe function
        Returns:
            forecast dataframe
        """

        if self.fitted is False:
            log.warning("Model has not been fitted. Predictions will be random.")

        future_dataframe = future_dataframe.copy(deep=True)

        testing = TimeSeriesDataSet(
            future_dataframe,
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(future_dataframe.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_known_reals=["time_idx"],
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"], transformation="softplus", center=False),
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )

        new_raw_predictions, new_x = self.model.predict(testing, mode="raw", return_x=True)

        y_predicted = self.model.to_prediction(new_raw_predictions).detach().cpu()  # [0, : new_x["decoder_lengths"][0]]

        y_predicted = y_predicted.detach().numpy()

        def pad_with(vector, pad_width, iaxis, kwargs):
            pad_value = kwargs.get("padder", np.nan)
            vector[: pad_width[0]] = pad_value
            vector[-pad_width[1] :] = pad_value

        y_pred_padded = np.pad(y_predicted, self.prediction_length, pad_with)[
            self.prediction_length : -1, self.prediction_length : -self.prediction_length
        ]
        y_pred_padded = np.vstack([np.roll(y_pred_padded[:, i], i, axis=0) for i in range(y_pred_padded.shape[1])]).T

        result = pd.DataFrame(
            np.ones(shape=(len(future_dataframe), (2 + self.prediction_length))) * np.nan,
            columns=["ds", "y"] + [f"yhat{i}" for i in range(1, self.prediction_length + 1)],
        )
        result["ds"] = future_dataframe["ds"]

        result.loc[: len(future_dataframe) - (self.periods + 1), "y"] = (
            future_dataframe["y"].iloc[: len(future_dataframe) - (self.periods)].values
        )

        first_part = result.iloc[: self.context_length]
        second_part = result.iloc[self.context_length :]

        second_part.loc[:, [col for col in second_part.columns[2:]]] = y_pred_padded
        result = pd.concat([first_part, second_part])
        for i in range(1, self.prediction_length + 1):
            result[f"residual{i}"] = result[f"yhat{i}"] - result["y"]

        return result
Example #20
File: TFT.py  Project: NHQ/tempy
data["cumFoam"] = data["cumFoam"] + 1e6
training = TimeSeriesDataSet(
    data[lambda x: x.date <= 300],
    time_idx="date",
    group_ids=["symbol"],
    target="open",
    #["open", "high", "low", "volume"],
    allow_missings=True,
    #group_ids=["agency", "sku"],
    min_encoder_length=50,  # max_encoder_length // 2
    max_encoder_length=100,  #max_encoder_length,
    min_prediction_length=20,
    max_prediction_length=20,  #max_prediction_length,
    static_categoricals=["symbol"],
    #static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    #time_varying_known_categoricals=["special_days", "month"],
    #variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=meta["knownReals"],
    #time_varying_unknown_categoricals=[],#meta["features"],
    time_varying_unknown_reals=meta["unknownReals"],
    #target_normalizer=NaNLabelEncoder(add_nan=True),
    target_normalizer=None,
    #GroupNormalizer(
    #    groups=["symbol"], transformation="softplus", center=False
    #),  # use softplus with beta=1.0 and normalize by group
    #add_relative_time_idx=True,
    #add_target_scales=True,
    add_encoder_length=True,
    categorical_encoders={"symbol": NaNLabelEncoder(add_nan=True)})
validation = TimeSeriesDataSet.from_dataset(training,
Example #21
max_encoder_length = 60
max_prediction_length = 20

df_train_nbeats = df_train.copy()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats["group"] = 0

df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={
        "group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])
    },
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)
nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training,
                                                   df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True,
                                                        batch_size=batch_size,
                                                        num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False,
                                                        batch_size=batch_size,
Example #22
validation = data.series.sample(20)

max_encoder_length = 60
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: ~x.series.isin(validation)],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    static_categoricals=["static"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
    target_normalizer=GroupNormalizer(groups=["series"]),
    add_relative_time_idx=False,
    add_target_scales=True,
    randomize_length=None,
)

validation = TimeSeriesDataSet.from_dataset(
    training,
    data[lambda x: x.series.isin(validation)],
    # predict=True,
    stop_randomization=True,
)
Example #23
training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    min_encoder_length=max_encoder_length // 2,  # allow encoder lengths from max_encoder_length // 2 up to max_encoder_length
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    time_varying_known_categoricals=["special_days", "month"],
    variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], transformation="softplus", center=False
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)
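
# A typical continuation, sketched after the pattern of the other examples above
# (the predict=True validation split, batch size and num_workers values are
# assumptions, not part of the original snippet):
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)

batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)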