def prepare_data(self, unprepared_dataset: Optional[DatasetH] = None) -> DatasetH:
    """
    Load dataset.

    - if `unprepared_dataset` is specified, then prepare that dataset directly
    - otherwise, load the dataset stored in the recorder and prepare it

    Separating this function from `update` makes it easier to reuse the dataset.

    Returns:
        DatasetH: the instance of DatasetH
    """
    # automatically determine the historical dependency if it is not specified
    if self.hist_ref is None:
        dataset: DatasetH = self.record.load_object("dataset") if unprepared_dataset is None else unprepared_dataset
        # Special treatment of historical dependencies
        if isinstance(dataset, TSDatasetH):
            hist_ref = dataset.step_len - 1
        else:
            hist_ref = 0  # if only the latest data is used, no historical buffer is required
    else:
        hist_ref = self.hist_ref

    start_time_buffer = get_date_by_shift(
        self.last_end, -hist_ref + 1, clip_shift=False, freq=self.freq  # pylint: disable=E1130
    )
    start_time = get_date_by_shift(self.last_end, 1, freq=self.freq)
    seg = {"test": (start_time, self.to_date)}
    return self.rmdl.get_dataset(
        start_time=start_time_buffer, end_time=self.to_date, segments=seg, unprepared_dataset=unprepared_dataset
    )
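
# Illustrative sketch (not part of the original class): how the buffered start date
# relates to `hist_ref`. Assuming a TSDatasetH with step_len = 20 and daily frequency,
# the sampler needs 19 extra historical bars before the first date to be predicted:
#
#   hist_ref = 20 - 1                                                  # = 19
#   start_time = get_date_by_shift(self.last_end, 1, freq="day")       # first date to predict
#   start_time_buffer = get_date_by_shift(
#       self.last_end, -19 + 1, clip_shift=False, freq="day"
#   )
#   # `start_time_buffer` lands 18 trading days before `self.last_end`, i.e. exactly
#   # 19 trading days before `start_time`; `start_time` itself is not counted in `hist_ref`.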
def mask_future(s):
    """mask future information"""
    # from qlib.utils import get_date_by_shift
    start, end = s.name
    end = get_date_by_shift(trading_date=end, shift=self.trunc_days - 1, future=True)
    return s.mask((s.index >= start) & (s.index <= end))
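
# Minimal illustration (assumed context, not from the original source): `mask_future`
# expects a pandas Series whose `.name` is a (start, end) tuple, e.g. when applied
# column-wise over segment boundaries. With self.trunc_days = 3, values that peek up to
# 2 trading days past `end` are masked as well:
#
#   s.name == (pd.Timestamp("2020-01-06"), pd.Timestamp("2020-01-10"))   # toy dates
#   # `end` is shifted forward by 2 trading days to 2020-01-14, so every value whose
#   # index falls in [2020-01-06, 2020-01-14] is replaced with NaN by `s.mask(...)`.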
def prepare_data(self) -> DatasetH:
    """
    Load dataset.

    Separating this function will make it easier to reuse the dataset.

    Returns:
        DatasetH: the instance of DatasetH
    """
    start_time_buffer = get_date_by_shift(self.last_end, -self.hist_ref + 1, clip_shift=False, freq=self.freq)
    start_time = get_date_by_shift(self.last_end, 1, freq=self.freq)
    seg = {"test": (start_time, self.to_date)}
    dataset = self.rmdl.get_dataset(start_time=start_time_buffer, end_time=self.to_date, segments=seg)
    return dataset
def update(self, dataset: DatasetH = None):
    """
    Update the prediction in a recorder.

    Args:
        dataset (DatasetH): the instance of DatasetH. If None, the dataset will be re-prepared from the recorder.
    """
    # FIXME: the problem below is not solved
    # A model dumped on a GPU instance cannot be loaded on a CPU-only instance. The following exception will be raised:
    # RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
    # https://github.com/pytorch/pytorch/issues/16797

    start_time = get_date_by_shift(self.last_end, 1, freq=self.freq)
    if start_time >= self.to_date:
        self.logger.info(
            f"The prediction in {self.record.info['id']} is already up to date ({start_time}). No need to update to {self.to_date}."
        )
        return

    # load dataset
    if dataset is None:
        # prepare the dataset here so it can also be reused by the caller
        dataset = self.prepare_data()

    # load model
    model = self.rmdl.get_model()

    new_pred: pd.Series = model.predict(dataset)

    # append the new predictions to the previously stored ones
    cb_pred = pd.concat([self.old_pred, new_pred.to_frame("score")], axis=0)
    cb_pred = cb_pred.sort_index()

    self.record.save_objects(**{"pred.pkl": cb_pred})

    self.logger.info(
        f"Finished updating {new_pred.shape[0]} new predictions in {self.record.info['id']}."
    )
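
# Sketch of a possible workaround for the GPU/CPU deserialization issue noted in the
# FIXME above (an assumption, not what this method currently does): when unpickling a
# torch model that was saved on a CUDA machine, `torch.load` accepts `map_location`
# to remap storages onto the CPU, e.g.:
#
#   import torch
#   model = torch.load("trained_model.bin", map_location=torch.device("cpu"))  # path is hypothetical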
def __init__(
    self,
    record: Recorder,
    to_date=None,
    from_date=None,
    hist_ref: Optional[int] = None,
    freq="day",
    fname="pred.pkl",
    loader_cls: type = RMDLoader,
):
    """
    Init PredUpdater.

    Expected behavior in the following cases:

    - if `to_date` is greater than the max date in the calendar, the data will be updated to the latest date
    - if there are data before `from_date` or after `to_date`, only the data between `from_date` and `to_date` are affected.

    Args:
        record : Recorder
        to_date :
            update the prediction to the `to_date`

            if to_date is None:

                data will be updated to the latest date.
        from_date :
            the update will start from `from_date`

            if from_date is None:

                the update will start from the next tick after the latest data in the historical data
        hist_ref : int
            Sometimes the dataset will have historical dependencies. Setting the length of the historical
            dependency is left to the user. If the user doesn't specify this parameter, the Updater will
            try to load the dataset and determine `hist_ref` automatically.

            .. note::

                the start_time is not included in the `hist_ref`, so the `hist_ref` will be `step_len - 1` in most cases

        loader_cls : type
            the class to load the model and dataset
    """
    # TODO: automate this hist_ref in the future.
    super().__init__(record=record)

    self.to_date = to_date
    self.hist_ref = hist_ref
    self.freq = freq
    self.fname = fname
    self.rmdl = loader_cls(rec=record)

    latest_date = D.calendar(freq=freq)[-1]
    if to_date is None:
        to_date = latest_date
    to_date = pd.Timestamp(to_date)

    if to_date >= latest_date:
        self.logger.warning(
            f"The given `to_date`({to_date}) is later than `latest_date`({latest_date}). So `to_date` is clipped to `latest_date`."
        )
        to_date = latest_date
    self.to_date = to_date

    # FIXME: it will raise an error when running the routine with a delay trainer
    # should we use another prediction updater for the delay trainer?
    self.old_data: pd.DataFrame = record.load_object(fname)
    if from_date is None:
        # dropna is for being compatible with data that contains future information (e.g. label)
        # The recent label data should be updated together
        self.last_end = self.old_data.dropna().index.get_level_values("datetime").max()
    else:
        self.last_end = get_date_by_shift(from_date, -1, align="right")
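
# Hypothetical usage sketch (recorder id and experiment name are made up, and the exact
# import path of PredUpdater is assumed): update the stored predictions of an existing
# recorder up to the latest calendar date.
#
#   from qlib.workflow import R
#
#   recorder = R.get_recorder(recorder_id="...", experiment_name="my_exp")
#   updater = PredUpdater(record=recorder, to_date=None, freq="day")
#   dataset = updater.prepare_data()   # prepared separately so it can be reused elsewhere
#   updater.update(dataset)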