Example no. 1
def dataloaders_with_covariates(data_with_covariates):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                          coerce_positive=False),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=training.index.time.max() + 1)
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
Example no. 2
def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]

    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
Example no. 3
def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        # weight="weight",
        group_ids=["agency", "sku"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        **params  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=training.index.time.max() + 1)
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
Example no. 4
    def get_examples(self, year: int, encoder_length: dict,
                     prediction_length: dict) -> TimeSeriesDataSet:

        solar_df_filtered = self.solar_df[(self.solar_df['year'] == year)]
        group_length = 2 * encoder_length['max'] - 1
        num_groups = int(np.floor(solar_df_filtered.shape[0] / group_length))
        solar_df_filtered = solar_df_filtered[1:(num_groups * group_length +
                                                 1)]
        solar_df_filtered['group'] = np.repeat(np.arange(num_groups),
                                               group_length)
        examples = TimeSeriesDataSet(
            solar_df_filtered,
            group_ids=["group"],
            target="power",
            time_idx="time_idx",
            min_encoder_length=encoder_length['min'],
            max_encoder_length=encoder_length['max'],
            min_prediction_length=prediction_length['min'],
            max_prediction_length=prediction_length['max'],
            time_varying_unknown_reals=["power"],
            time_varying_known_reals=[
                "cloudcover_low", "cloudcover_mid", "cloudcover_high"
            ],
            time_varying_known_categoricals=["seasons"],
            allow_missings=True,
        )

        return examples
Example no. 5
def make_dataloaders(data_with_covariates, **kwargs):
    training_cutoff = "2016-09-01"
    max_encoder_length = 4
    max_prediction_length = 3

    kwargs.setdefault("target", "volume")
    kwargs.setdefault("group_ids", ["agency", "sku"])
    kwargs.setdefault("add_relative_time_idx", True)
    kwargs.setdefault("time_varying_unknown_reals", ["volume"])

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff].copy(),
        time_idx="time_idx",
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        **kwargs,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates.copy(),
        min_prediction_idx=training.index.time.max() + 1)
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=2,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=2,
                                              num_workers=0)
    test_dataloader = validation.to_dataloader(train=False,
                                               batch_size=1,
                                               num_workers=0)

    return dict(train=train_dataloader,
                val=val_dataloader,
                test=test_dataloader)
Example no. 6
    def transform_data(self, data, past_lags, index_label, target_label,
                       train_val_split):

        self.past_lags = past_lags
        self.oldest_lag = int(max(self.past_lags)) + 1
        self.index_label = index_label
        self.target_label = target_label

        # External train and validation sets
        X = data[[index_label]]
        y = data[[target_label]]

        self.training = (X.loc[:int(len(data) * train_val_split)],
                         y.loc[:int(len(data) * train_val_split)])
        self.validation = (X.loc[int(len(data) * train_val_split):],
                           y.loc[int(len(data) * train_val_split):])

        # internal train and validation sets; they use dataloaders to optimize the training routine
        # the time index could be epoch values (see the commented-out conversion below); here the DataFrame index is used instead
        # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        data["time_idx"] = data.index
        data['group_id'] = 'series'

        max_prediction_length = self.oldest_lag
        max_encoder_length = self.oldest_lag
        # training_cutoff = data["time_idx"].max() - max_prediction_length

        self.intern_training = TimeSeriesDataSet(
            data[:int(len(data) * train_val_split)],
            time_idx="time_idx",
            group_ids=["group_id"],
            target=self.target_label,
            min_encoder_length=0,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["group_id"],
            # time_varying_unknown_reals=[self.target_label],
            # the docs say that max_lag < max_encoder_length
            # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            # allow_missings=True
        )

        # create validation set (predict=True) which means to predict the last max_prediction_length points in time
        # for each series
        self._intern_validation = TimeSeriesDataSet.from_dataset(
            self.intern_training, data, predict=True, stop_randomization=True)

        # store the last input to use as encoder data to next predictions
        self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()
Example no. 7
def test_dataset(test_data):
    training = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=5,
        max_prediction_length=2,
        randomize_length=None,
    )
    return training
Example no. 8
    def save_time_series(self):
        """ Download preprocessing file and creates data in a format suited for temporal fusion """
        PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
        FILE_PATH = 'data/preprocess_data.py'
        DATA_PATH = 'data/data.csv'
        FEATURES = ['dose', 'time']
        GROUP_ID = 'series'
        
        # Data file already exists so we don't need to generate it
        if os.path.isfile(DATA_PATH):
            return
        
        # Download the preprocessing file only if it is not already present
        if not os.path.isfile(FILE_PATH):
            wget.download(PREPROCESS_URL, FILE_PATH)
        
        os.system('python ' + FILE_PATH)
        
        dataset = pd.read_csv(DATA_PATH)

        n = dataset[GROUP_ID].astype(int).max()

        dataset['target'] = dataset['target'].astype(float)

        dataset['time_idx'] = dataset['time_idx'].astype(int)

        training = TimeSeriesDataSet(
            dataset[dataset[GROUP_ID].apply(lambda x: int(x) < int(n * 0.7))],
            time_idx='time_idx',
            target='target',
            group_ids=[GROUP_ID],
            min_encoder_length=20,  
            max_encoder_length=20,
            min_prediction_length=1,
            max_prediction_length=1,
            static_categoricals=[],
            static_reals=[],
            time_varying_known_categoricals=[],
            variable_groups={},
            time_varying_known_reals=['time_idx'],
            time_varying_unknown_categoricals=[],
            time_varying_unknown_reals=['target'] + FEATURES,
            add_relative_time_idx=True,
            add_target_scales=False,
            add_encoder_length=True,
            categorical_encoders={GROUP_ID: NaNLabelEncoder().fit(dataset.series)},
        )
        
        training.save(self.TIMESERIES_PATH)
Example no. 9
def test_prediction_with_dataloder_raw(data_with_covariates, tmp_path):
    # tests correct concatenation of raw output
    test_data = data_with_covariates.copy()
    np.random.seed(2)
    test_data = test_data.sample(frac=0.5)

    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        max_encoder_length=8,
        max_prediction_length=10,
        min_prediction_length=1,
        min_encoder_length=1,
        target="volume",
        group_ids=["agency", "sku"],
        constant_fill_strategy=dict(volume=0.0),
        allow_missing_timesteps=True,
        time_varying_unknown_reals=["volume"],
        time_varying_known_reals=["time_idx"],
        target_normalizer=GroupNormalizer(groups=["agency", "sku"]),
    )

    net = TemporalFusionTransformer.from_dataset(
        dataset,
        learning_rate=1e-6,
        hidden_size=4,
        attention_head_size=1,
        dropout=0.2,
        hidden_continuous_size=2,
        log_interval=1,
        log_val_interval=1,
        log_gradient_flow=True,
    )
    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(max_epochs=1, gradient_clip_val=1e-6, logger=logger)
    trainer.fit(net,
                train_dataloaders=dataset.to_dataloader(batch_size=4,
                                                        num_workers=0))

    # choose small batch size to provoke issue
    res = net.predict(dataset.to_dataloader(batch_size=2, num_workers=0),
                      mode="raw")
    # check that interpretation works
    net.interpret_output(res)["attention"]
    assert net.interpret_output(res.iget(
        slice(1)))["attention"].size() == torch.Size(
            (1, net.hparams.max_encoder_length))
Example no. 10
    def _create_dataset(self, df, valid_p=0.2):
        df = df_utils.check_dataframe(df)
        df = self._handle_missing_data(df)
        df = df[["ds", "y"]]
        df["time_idx"] = range(df.shape[0])
        df["series"] = 0
        self.n_data = df.shape[0]
        self.set_auto_batch_epoch(self.n_data)

        training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

        training = TimeSeriesDataSet(
            df.iloc[:training_cutoff],
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"]),
            randomize_length=None,
            add_relative_time_idx=False,
            add_target_scales=False,
        )

        validation = TimeSeriesDataSet.from_dataset(
            training, df, min_prediction_idx=training_cutoff)
        train_dataloader = training.to_dataloader(train=True,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)
        val_dataloader = validation.to_dataloader(train=False,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)

        return training, train_dataloader, val_dataloader
Example no. 11
    def load_data(
        self,
        data: DataFrame,
        time_idx: Optional[str] = None,
        target: Optional[Union[str, List[str]]] = None,
        group_ids: Optional[List[str]] = None,
        parameters: Optional[Dict[str, Any]] = None,
        **time_series_dataset_kwargs: Any,
    ):
        if self.training:
            time_series_dataset = TimeSeriesDataSet(
                data,
                time_idx=time_idx,
                group_ids=group_ids,
                target=target,
                **time_series_dataset_kwargs)
            parameters = time_series_dataset.get_parameters()

            # Add some sample data so that we can recreate the `TimeSeriesDataSet` later on
            parameters["data_sample"] = data.iloc[[0]].to_dict()

            self.parameters = parameters
        else:
            if parameters is None:
                raise MisconfigurationException(
                    "Loading data for evaluation or inference requires parameters from the train data. Either "
                    "construct the train data at the same time as evaluation and inference or provide the train "
                    "`datamodule.parameters` to `from_data_frame` in the `parameters` argument."
                )
            parameters = copy(parameters)
            parameters.pop("data_sample")
            time_series_dataset = TimeSeriesDataSet.from_parameters(
                parameters,
                data,
                stop_randomization=True,
            )
        return time_series_dataset
Example no. 12
training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    min_encoder_length=max_encoder_length // 2,  # allow encoder lengths from max_encoder_length // 2 up to max_encoder_length
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    time_varying_known_categoricals=["special_days", "month"],
    variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], transformation="softplus", center=False
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)
Example no. 13
max_encoder_length = 150
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

context_length = max_encoder_length
prediction_length = max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    min_encoder_length=context_length,
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
    min_prediction_length=prediction_length,
    time_varying_unknown_reals=["value"],
    randomize_length=None,
    add_relative_time_idx=False,
    add_target_scales=False,
)

validation = TimeSeriesDataSet.from_dataset(training,
                                            data,
                                            min_prediction_idx=training_cutoff)
batch_size = 128
train_dataloader = training.to_dataloader(train=True,
                                          batch_size=batch_size,
                                          num_workers=2)
Example no. 14
    def predict(self, future_dataframe):
        """
        Predicts based on the future_dataframe. Should only be called after make_future_dataframe has been called.
        Args:
            future_dataframe: DataFrame from the make_future_dataframe function
        Returns:
            forecast dataframe
        """

        if self.fitted is False:
            log.warning("Model has not been fitted. Predictions will be random.")

        future_dataframe = future_dataframe.copy(deep=True)

        testing = TimeSeriesDataSet(
            future_dataframe,
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(future_dataframe.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_known_reals=["time_idx"],
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"], transformation="softplus", center=False),
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )

        new_raw_predictions, new_x = self.model.predict(testing, mode="raw", return_x=True)

        y_predicted = self.model.to_prediction(new_raw_predictions).detach().cpu()  # [0, : new_x["decoder_lengths"][0]]

        y_predicted = y_predicted.detach().numpy()

        def pad_with(vector, pad_width, iaxis, kwargs):
            pad_value = kwargs.get("padder", np.nan)
            vector[: pad_width[0]] = pad_value
            vector[-pad_width[1] :] = pad_value

        y_pred_padded = np.pad(y_predicted, self.prediction_length, pad_with)[
            self.prediction_length : -1, self.prediction_length : -self.prediction_length
        ]
        y_pred_padded = np.vstack([np.roll(y_pred_padded[:, i], i, axis=0) for i in range(y_pred_padded.shape[1])]).T

        result = pd.DataFrame(
            np.ones(shape=(len(future_dataframe), (2 + self.prediction_length))) * np.nan,
            columns=["ds", "y"] + [f"yhat{i}" for i in range(1, self.prediction_length + 1)],
        )
        result["ds"] = future_dataframe["ds"]

        result.loc[: len(future_dataframe) - (self.periods + 1), "y"] = (
            future_dataframe["y"].iloc[: len(future_dataframe) - (self.periods)].values
        )

        first_part = result.iloc[: self.context_length]
        second_part = result.iloc[self.context_length :]

        second_part.loc[:, [col for col in second_part.columns[2:]]] = y_pred_padded
        result = pd.concat([first_part, second_part])
        for i in range(1, self.prediction_length + 1):
            result[f"residual{i}"] = result[f"yhat{i}"] - result["y"]

        return result
Example no. 15
File: TFT.py Project: NHQ/tempy
data["cumFoam"] = data["cumFoam"] + 1e6
training = TimeSeriesDataSet(
    data[lambda x: x.date <= 300],
    time_idx="date",
    group_ids=["symbol"],
    target="open",
    #["open", "high", "low", "volume"],
    allow_missings=True,
    #group_ids=["agency", "sku"],
    min_encoder_length=50,  # max_encoder_length // 2; allow encoder lengths from 50 up to max_encoder_length
    max_encoder_length=100,  #max_encoder_length,
    min_prediction_length=20,
    max_prediction_length=20,  #max_prediction_length,
    static_categoricals=["symbol"],
    #static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    #time_varying_known_categoricals=["special_days", "month"],
    #variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=meta["knownReals"],
    #time_varying_unknown_categoricals=[],#meta["features"],
    time_varying_unknown_reals=meta["unknownReals"],
    #target_normalizer=NaNLabelEncoder(add_nan=True),
    target_normalizer=None,
    #GroupNormalizer(
    #    groups=["symbol"], transformation="softplus", center=False
    #),  # use softplus with beta=1.0 and normalize by group
    #add_relative_time_idx=True,
    #add_target_scales=True,
    add_encoder_length=True,
    categorical_encoders={"symbol": NaNLabelEncoder(add_nan=True)})
validation = TimeSeriesDataSet.from_dataset(training,
Example no. 16
max_encoder_length = 60
max_prediction_length = 20

df_train_nbeats = df_train.copy()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats["group"] = 0

df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={
        "group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])
    },
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)
nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training,
                                                   df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True,
                                                        batch_size=batch_size,
                                                        num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False,
                                                        batch_size=batch_size,
Example no. 17
validation = data.series.sample(20)

max_encoder_length = 60
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: ~x.series.isin(validation)],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    static_categoricals=["static"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
    target_normalizer=GroupNormalizer(groups=["series"]),
    add_relative_time_idx=False,
    add_target_scales=True,
    randomize_length=None,
)

validation = TimeSeriesDataSet.from_dataset(
    training,
    data[lambda x: x.series.isin(validation)],
    # predict=True,
    stop_randomization=True,
)
Example no. 18
import numpy as np
import pandas as pd

test_data = pd.DataFrame(
    dict(
        value=np.random.rand(30) - 0.5,
        # value=np.arange(30),
        group=np.repeat(np.arange(3), 10),
        time_idx=np.tile(np.arange(10), 3),
    ))
test_data

# %%
from pytorch_forecasting import TimeSeriesDataSet

# create the dataset from the pandas dataframe
dataset = TimeSeriesDataSet(
    test_data,
    group_ids=["group"],
    target="value",
    time_idx="time_idx",
    min_encoder_length=5,
    max_encoder_length=5,
    min_prediction_length=2,
    max_prediction_length=2,
    time_varying_unknown_reals=["value"],
)

# %%
dataset.get_parameters()

# %% [markdown]
# Now, we take a look at the output of the dataloader. Its `x` will be fed to the model's forward method, which is why it is so important to understand it.

# %%
# convert the dataset to a dataloader
dataloader = dataset.to_dataloader(batch_size=4)
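
# %% [markdown]
# As a quick sanity check (a sketch, not part of the original snippet), we can pull a single batch from the dataloader and inspect it. In pytorch-forecasting, `x` is a dictionary of encoder/decoder tensors and `y` is a tuple of target and weight; the exact set of keys may vary between versions.

# %%
# fetch one batch and inspect its structure
x, y = next(iter(dataloader))
print(x.keys())                  # e.g. encoder_cat, encoder_cont, encoder_target, decoder_cont, ...
print(x["encoder_cont"].shape)   # (batch_size, max_encoder_length, number of continuous features)
print(y[0].shape)                # decoder targets: (batch_size, max_prediction_length)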