Пример #1
0
    def prepare_data(self, unprepared_dataset: Optional[DatasetH] = None) -> DatasetH:
        """
        Build the dataset used to predict the period that has not been covered yet.

        - If `unprepared_dataset` is given, it is handed to `self.rmdl.get_dataset` to be prepared directly.
        - Otherwise `None` is handed over and the loader is left to obtain the dataset itself
          (presumably from the record -- confirm against the loader class).

        Keeping this step in its own method makes it easier to reuse the dataset.

        Args:
            unprepared_dataset (Optional[DatasetH]): a dataset that has not been prepared yet.

        Returns:
            DatasetH: the instance of DatasetH
        """
        hist_ref = self.hist_ref
        if hist_ref is None:
            # The historical dependency was not configured: infer it from the dataset.
            probe = unprepared_dataset if unprepared_dataset is not None else self.record.load_object("dataset")
            # A time-series dataset looks back `step_len - 1` ticks before the current one;
            # any other dataset is treated as using the current tick only (no history).
            hist_ref = probe.step_len - 1 if isinstance(probe, TSDatasetH) else 0

        # Start of the data to load: shifted back from the first new tick by `hist_ref`.
        load_start = get_date_by_shift(self.last_end, 1 - hist_ref, clip_shift=False, freq=self.freq)
        # Start of the "test" segment: the tick right after the last covered one.
        test_start = get_date_by_shift(self.last_end, 1, freq=self.freq)
        return self.rmdl.get_dataset(
            start_time=load_start,
            end_time=self.to_date,
            segments={"test": (test_start, self.to_date)},
            unprepared_dataset=unprepared_dataset,
        )
Пример #2
0
 def mask_future(s):
     """
     Mask future information in `s`.

     `s.name` is unpacked as a `(start, end)` pair. `end` is moved by
     `self.trunc_days - 1` through `get_date_by_shift(..., future=True)`, and every
     entry of `s` whose index lies in the closed range `[start, end]` is masked.

     NOTE(review): `s` is assumed to be a pandas Series (it is used through
     `.name`, `.index` and `.mask`) -- confirm against the caller.
     """
     window_start, window_end = s.name
     window_end = get_date_by_shift(trading_date=window_end, shift=self.trunc_days - 1, future=True)
     inside_window = (s.index >= window_start) & (s.index <= window_end)
     return s.mask(inside_window)
Пример #3
0
    def prepare_data(self) -> DatasetH:
        """
        Build the dataset used to predict the period that has not been covered yet.

        Keeping this step in its own method makes it easier to reuse the dataset.

        Returns:
            DatasetH: the instance of DatasetH
        """
        # Start of the data to load: shifted back from the first new tick by `self.hist_ref`,
        # so the historical dependency of the first new sample is available.
        load_start = get_date_by_shift(self.last_end, -self.hist_ref + 1, clip_shift=False, freq=self.freq)
        # Start of the "test" segment: the tick right after the last covered one.
        test_start = get_date_by_shift(self.last_end, 1, freq=self.freq)
        return self.rmdl.get_dataset(
            start_time=load_start,
            end_time=self.to_date,
            segments={"test": (test_start, self.to_date)},
        )
Пример #4
0
    def update(self, dataset: DatasetH = None):
        """
        Update the prediction in a recorder.

        The recorded model predicts on `dataset`; the new predictions are appended to
        `self.old_pred` (as a "score" column), sorted by index and saved back to the
        record as "pred.pkl". Nothing is done when `self.last_end` already reaches
        `self.to_date`.

        Args:
            dataset (DatasetH): the prepared dataset to predict on. If None, it is
                built with `self.prepare_data()`; passing one allows a dataset to be reused.
        """
        # FIXME: the problem below is not solved
        # The model dumped on GPU instances can not be loaded on CPU instance. Follow exception will raised
        # RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
        # https://github.com/pytorch/pytorch/issues/16797

        # Compare the end of the existing predictions itself with the target date.
        # Comparing the *next* tick (`last_end + 1`) with `>=` would also skip the update
        # when exactly one tick is missing, i.e. when that next tick equals `to_date`.
        if self.last_end >= self.to_date:
            self.logger.info(
                f"The prediction in {self.record.info['id']} are latest ({self.last_end}). No need to update to {self.to_date}."
            )
            return

        # load dataset
        if dataset is None:
            # For reusing the dataset
            dataset = self.prepare_data()

        # Load model
        model = self.rmdl.get_model()

        new_pred: pd.Series = model.predict(dataset)

        # Stack the old and the new predictions and keep the result ordered by index.
        cb_pred = pd.concat([self.old_pred, new_pred.to_frame("score")], axis=0)
        cb_pred = cb_pred.sort_index()

        self.record.save_objects(**{"pred.pkl": cb_pred})

        self.logger.info(f"Finish updating new {new_pred.shape[0]} predictions in {self.record.info['id']}.")
Пример #5
0
    def __init__(
        self,
        record: Recorder,
        to_date=None,
        from_date=None,
        hist_ref: Optional[int] = None,
        freq="day",
        fname="pred.pkl",
        loader_cls: type = RMDLoader,
    ):
        """
        Init PredUpdater.

        Expected behavior in following cases:
        - if `to_date` is greater than the max date in the calendar, the data will be updated to the latest date
        - if there are data before `from_date` or after `to_date`, only the data between `from_date` and `to_date` are affected.

        Args:
            record : Recorder
            to_date :
                update to prediction to the `to_date`
                if to_date is None:
                    data will updated to the latest date.
            from_date :
                the update will start from `from_date`
                if from_date is None:
                    the updating will occur on the next tick after the latest data in historical data
            hist_ref : int
                Sometimes, the dataset will have historical depends.
                Leave the problem to users to set the length of historical dependency
                If user doesn't specify this parameter, Updater will try to load dataset to automatically determine the hist_ref

                .. note::

                    the start_time is not included in the `hist_ref`; So the `hist_ref` will be `step_len - 1` in most cases

            freq : str
                the frequency passed to `D.calendar` to find the latest available date
            fname : str
                the name of the object loaded from `record` as the existing data
            loader_cls : type
                the class to load the model and dataset

        """
        # TODO: automate this hist_ref in the future.
        super().__init__(record=record)

        self.hist_ref = hist_ref
        self.freq = freq
        self.fname = fname
        self.rmdl = loader_cls(rec=record)

        latest_date = D.calendar(freq=freq)[-1]
        if to_date is None:
            to_date = latest_date
        to_date = pd.Timestamp(to_date)

        # Only warn when the requested date is really beyond the calendar. With `>=` the
        # warning was also emitted when `to_date` equals `latest_date`, which includes the
        # default `to_date=None` case, although nothing is clipped then.
        if to_date > latest_date:
            self.logger.warning(
                f"The given `to_date`({to_date}) is later than `latest_date`({latest_date}). So `to_date` is clipped to `latest_date`."
            )
            to_date = latest_date
        self.to_date = to_date

        # FIXME: it will raise error when running routine with delay trainer
        # should we use another prediction updater for delay trainer?
        self.old_data: pd.DataFrame = record.load_object(fname)
        if from_date is None:
            # dropna is for being compatible to some data with future information(e.g. label)
            # The recent label data should be updated together
            self.last_end = self.old_data.dropna().index.get_level_values("datetime").max()
        else:
            # `last_end` is the tick right before `from_date`, so the update starts at `from_date`.
            # NOTE(review): `freq` is not passed here, so the helper's default frequency is
            # used for this shift -- confirm this is intended for non-default `freq`.
            self.last_end = get_date_by_shift(from_date, -1, align="right")