def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None):
    """Get the trading date that lies `shift` calendar steps away from `trading_date`.

    e.g. : shift == 1,  return next trading date
           shift == -1, return previous trading date

    Parameters
    ----------
    trading_date : pandas.Timestamp
        current date (anything accepted by `pd.to_datetime`)
    shift : int
        number of calendar steps to move; negative values move backwards
    future : bool
        forwarded to `D.calendar`
    clip_shift: bool
        if True, an out-of-range target index is clipped to the first/last calendar entry;
        if False, an IndexError is raised instead
    freq : str
        forwarded to `D.calendar`
    align : Optional[str]
        When align is None, this function will raise ValueError if `trading_date` is not a trading date
        when align is "left"/"right", it will try to align to left/right nearest trading date before shifting
        when `trading_date` is not a trading date

    Returns
    -------
    The calendar entry at the shifted (and possibly clipped) position.

    Raises
    ------
    ValueError
        if `align` is None and `trading_date` is not in the calendar, or if `align` has an unsupported value
    IndexError
        if the shifted position is out of range and `clip_shift` is False
    """
    from qlib.data import D  # pylint: disable=C0415

    cal = D.calendar(future=future, freq=freq)
    trading_date = pd.to_datetime(trading_date)
    if align is None:
        # The calendar is sorted (bisect already relies on that), so the bisect result
        # doubles as the membership test; no need to materialise `list(cal)` and scan it.
        _index = bisect.bisect_left(cal, trading_date)
        if _index >= len(cal) or cal[_index] != trading_date:
            raise ValueError("{} is not trading day!".format(str(trading_date)))
    elif align == "left":
        # last calendar entry <= trading_date (-1 when trading_date precedes the calendar)
        _index = bisect.bisect_right(cal, trading_date) - 1
    elif align == "right":
        # first calendar entry >= trading_date (len(cal) when trading_date follows the calendar)
        _index = bisect.bisect_left(cal, trading_date)
    else:
        raise ValueError(f"align with value `{align}` is not supported")

    shift_index = _index + shift
    if shift_index < 0 or shift_index >= len(cal):
        if clip_shift:
            shift_index = np.clip(shift_index, 0, len(cal) - 1)
        else:
            raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range")
    return cal[shift_index]
def get_date_by_shift(trading_date, shift, future=False, clip_shift=True):
    """Get the trading date that lies `shift` calendar steps away from `trading_date`.

    e.g. : shift == 1,  return next trading date
           shift == -1, return previous trading date

    Parameters
    ----------
    trading_date : pandas.Timestamp
        current date (anything accepted by `pd.to_datetime`); must be a trading date
    shift : int
        number of calendar steps to move; negative values move backwards
    future : bool
        forwarded to `D.calendar`
    clip_shift: bool
        if True, an out-of-range target index is clipped to the first/last calendar entry;
        if False, an IndexError is raised instead

    Returns
    -------
    The calendar entry at the shifted (and possibly clipped) position.

    Raises
    ------
    ValueError
        if `trading_date` is not in the calendar
    IndexError
        if the shifted position is out of range and `clip_shift` is False
    """
    from qlib.data import D

    cal = D.calendar(future=future)
    # Normalise once and use the same value for both the membership test and the
    # bisect; previously the raw argument (possibly a string) was handed to bisect.
    _date = pd.to_datetime(trading_date)
    _index = bisect.bisect_left(cal, _date)
    # The calendar is sorted, so the bisect position is enough to test membership.
    if _index >= len(cal) or cal[_index] != _date:
        raise ValueError("{} is not trading day!".format(str(trading_date)))

    shift_index = _index + shift
    if shift_index < 0 or shift_index >= len(cal):
        if clip_shift:
            shift_index = np.clip(shift_index, 0, len(cal) - 1)
        else:
            raise IndexError(
                f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range"
            )
    return cal[shift_index]
def load_data(self):
    """Print the first two day-calendar entries of 2010-2017 and query csi300 features for that span."""
    period = {"start_time": '2010-01-01', "end_time": '2017-12-31', "freq": 'day'}

    # Show the first two calendar entries returned for the period.
    calendar_head = D.calendar(**period)[:2]
    print(calendar_head)

    universe = D.instruments('csi300')  # ['SH600570','SH600000']
    expressions = [
        '$close',
        '$volume',
        'Ref($close, 1)',
        'Mean($close, 3)',
        '$high-$low',
    ]
    data = D.features(universe, expressions, **period)
def test_update_label(self):
    """Train on a window ending `shift` calendar steps before the last entry, then check
    that `LabelUpdater` brings the stored labels up to the date of the updated predictions."""
    exp_name = "online_srv_test"
    shift = 10

    task = copy.deepcopy(CSI300_GBDT_TASK)
    task["record"] = {
        "class": "SignalRecord",
        "module_path": "qlib.workflow.record_temp",
        "kwargs": {"dataset": "<DATASET>", "model": "<MODEL>"},
    }

    # Leave `shift` calendar entries after the training window so there is something to update.
    latest_date = D.calendar()[-1 - shift]

    def days_before(n_days):
        return latest_date - pd.Timedelta(days=n_days)

    train_start, train_end = days_before(61), days_before(41)
    dataset_kwargs = task["dataset"]["kwargs"]
    dataset_kwargs["segments"] = {
        "train": (train_start, train_end),
        "valid": (days_before(40), days_before(21)),
        "test": (days_before(20), latest_date),
    }
    dataset_kwargs["handler"]["kwargs"] = {
        "start_time": train_start,
        "end_time": latest_date,
        "fit_start_time": train_start,
        "fit_end_time": train_end,
        "instruments": "csi300",
    }

    rec = task_train(task, exp_name)
    # The freshly trained prediction must be loadable before anything is updated.
    rec.load_object("pred.pkl")

    online_tool = OnlineToolR(exp_name)
    online_tool.reset_online_tag(rec)  # set to online model
    online_tool.update_online_pred()

    def latest_datetime(frame):
        return frame.index.get_level_values("datetime").max()

    new_pred = rec.load_object("pred.pkl")
    label = rec.load_object("label.pkl")
    label_date = latest_datetime(label.dropna())
    pred_date = latest_datetime(new_pred.dropna())

    # The prediction is updated, but the label is not updated.
    self.assertTrue(label_date < pred_date)

    # Update label now
    LabelUpdater(rec).update()
    new_label_date = latest_datetime(rec.load_object("label.pkl"))
    # make sure the label is updated now
    self.assertTrue(new_label_date == pred_date)
def set_end_time(self, end_time=None):
    """
    Reload `self.cals` with a new end time.

    Args:
        end_time: forwarded to `D.calendar`; None for use calendar's end time.
    """
    refreshed = D.calendar(future=self._future, end_time=end_time)
    self.cals = refreshed
def test_update_pred(self):
    """
    This test is for testing if it will raise error if the `to_date` is out of the boundary.
    It then tampers with two date ranges of the stored prediction and checks that an update
    restricted by `from_date`/`to_date` restores only the range inside that window.
    """
    exp_name = "online_srv_test"

    task = copy.deepcopy(CSI300_GBDT_TASK)
    task["record"] = ["qlib.workflow.record_temp.SignalRecord"]

    latest_date = D.calendar()[-1]

    def days_before(n_days):
        return latest_date - pd.Timedelta(days=n_days)

    train_start, train_end = days_before(61), days_before(41)
    dataset_kwargs = task["dataset"]["kwargs"]
    dataset_kwargs["segments"] = {
        "train": (train_start, train_end),
        "valid": (days_before(40), days_before(21)),
        "test": (days_before(20), latest_date),
    }
    dataset_kwargs["handler"]["kwargs"] = {
        "start_time": train_start,
        "end_time": latest_date,
        "fit_start_time": train_start,
        "fit_end_time": train_end,
        "instruments": "csi300",
    }

    rec = task_train(task, exp_name)
    # The freshly trained prediction must be loadable before anything is updated.
    rec.load_object("pred.pkl")

    online_tool = OnlineToolR(exp_name)
    online_tool.reset_online_tag(rec)  # set to online model
    # `to_date` lies beyond the calendar on purpose.
    online_tool.update_online_pred(to_date=latest_date + pd.Timedelta(days=10))

    good_pred = rec.load_object("pred.pkl")

    # Overwrite two date ranges with sentinel values and persist the tampered prediction.
    mod_range = slice(days_before(20), days_before(10))
    mod_range2 = slice(days_before(9), days_before(2))
    mod_pred = good_pred.copy()
    mod_pred.loc[mod_range] = -1
    mod_pred.loc[mod_range2] = -2
    rec.save_objects(**{"pred.pkl": mod_pred})

    # Re-run the update restricted to the first tampered range only.
    online_tool.update_online_pred(to_date=days_before(10), from_date=days_before(20))
    updated_pred = rec.load_object("pred.pkl")

    # the range inside [from_date, to_date] no longer holds the sentinel: it matches the good prediction again
    restored = updated_pred.loc[mod_range] == good_pred.loc[mod_range]
    self.assertTrue(restored.all().item())

    # the range outside the window still holds its sentinel value
    untouched = updated_pred.loc[mod_range2] == -2
    self.assertTrue(untouched.all().item())
def test_1_dump_calendars(self):
    """The calendar served by `D.calendar()` must contain exactly the dates listed in calendars/day.txt."""
    calendar_file = QLIB_DIR.joinpath("calendars", "day.txt")
    raw_dates = pd.read_csv(calendar_file, header=None).loc[:, 0].values
    ori_calendars = {pd.Timestamp(raw) for raw in raw_dates}
    res_calendars = set(D.calendar())

    only_in_file = ori_calendars - res_calendars
    only_in_result = res_calendars - ori_calendars
    assert len(only_in_file) == len(only_in_result) == 0, "dump calendars failed"
def __init__(self, record: Recorder, to_date=None, hist_ref: int = 0, freq="day", fname="pred.pkl"):
    """
    Init PredUpdater.

    Args:
        record : Recorder
        to_date :
            update to prediction to the `to_date`.
            If None, or later than the last calendar entry for `freq`, the last calendar entry is used.
        hist_ref : int
            Sometimes, the dataset will have historical depends.
            Leave the problem to users to set the length of historical dependency

            .. note::

                the start_time is not included in the hist_ref
        freq : str
            frequency passed to `D.calendar` when looking up the latest date
        fname : str
            name of the recorder object that holds the data to update
    """
    # TODO: automate this hist_ref in the future.
    super().__init__(record=record)

    self.hist_ref = hist_ref
    self.freq = freq
    self.fname = fname
    self.rmdl = RMDLoader(rec=record)

    latest_date = D.calendar(freq=freq)[-1]
    to_date = pd.Timestamp(latest_date if to_date is None else to_date)
    # Warn only when the caller really asked for a date beyond the calendar;
    # a defaulted or exactly-equal `to_date` is not "later than" the latest date.
    if to_date > latest_date:
        self.logger.warning(
            f"The given `to_date`({to_date}) is later than `latest_date`({latest_date}). So `to_date` is clipped to `latest_date`."
        )
        to_date = latest_date
    self.to_date = to_date

    # FIXME: it will raise error when running routine with delay trainer
    # should we use another prediction updater for delay trainer?
    self.old_data: pd.DataFrame = record.load_object(fname)

    # dropna is for being compatible to some data with future information(e.g. label)
    # The recent label data should be updated together
    self.last_end = self.old_data.dropna().index.get_level_values("datetime").max()
def test_update_pred(self):
    """
    This test is for testing if it will raise error if the `to_date` is out of the boundary.
    """
    exp_name = "online_srv_test"

    task = copy.deepcopy(CSI300_GBDT_TASK)
    task["record"] = {
        "class": "SignalRecord",
        "module_path": "qlib.workflow.record_temp",
        "kwargs": {"dataset": "<DATASET>", "model": "<MODEL>"},
    }

    latest_date = D.calendar()[-1]

    def days_before(n_days):
        return latest_date - pd.Timedelta(days=n_days)

    train_start, train_end = days_before(61), days_before(41)
    dataset_kwargs = task["dataset"]["kwargs"]
    dataset_kwargs["segments"] = {
        "train": (train_start, train_end),
        "valid": (days_before(40), days_before(21)),
        "test": (days_before(20), latest_date),
    }
    dataset_kwargs["handler"]["kwargs"] = {
        "start_time": train_start,
        "end_time": latest_date,
        "fit_start_time": train_start,
        "fit_end_time": train_end,
        "instruments": "csi300",
    }

    rec = task_train(task, exp_name)
    # The freshly trained prediction must be loadable before anything is updated.
    rec.load_object("pred.pkl")

    online_tool = OnlineToolR(exp_name)
    online_tool.reset_online_tag(rec)  # set to online model
    # Ask for an update that reaches ten days past the last calendar entry.
    beyond_calendar = latest_date + pd.Timedelta(days=10)
    online_tool.update_online_pred(to_date=beyond_calendar)
def transform_end_date(end_date=None, freq="day"):
    """Resolve `end_date` against the last entry of the trading calendar.

    If end_date is -1, None, or end_date is greater than the maximum trading day,
    a warning is logged and the last trading date is returned.
    Otherwise, `end_date` is returned unchanged.

    Parameters
    ----------
    end_date: str
        end trading date
    freq : str
        calendar frequency passed to `D.calendar`
    """
    from ..data import D

    last_date = D.calendar(freq=freq)[-1]

    use_last_date = (
        end_date is None
        or str(end_date) == "-1"
        or pd.Timestamp(last_date) < pd.Timestamp(end_date)
    )
    if not use_last_date:
        return end_date

    log.warning("\nInfo: the end_date in the configuration file is {}, "
                "so the default last date {} is used.".format(end_date, last_date))
    return last_date
def _gen_day_dataset(self, config, conf_type):
    """Build (or reuse) a pickled template dataset and dump one dataset pickle per sampled calendar time.

    Args:
        config: dataset config; its "path" entry is popped and used as the file-name prefix,
            the remainder is passed to `init_instance_by_config`.
        conf_type: when equal to "backtest" the handler is set up with its default init type,
            otherwise with `DataHandlerLP.IT_LS`.

    Raises:
        ValueError: if `config` has no "path" entry.
    """
    try:
        path = config.pop("path")
    except KeyError as e:
        raise ValueError("Must specify the path to save the dataset.") from e

    # NOTE(review): `path` is used as a plain string prefix (path + "tmp_dataset.pkl"),
    # so it presumably has to end with a path separator -- confirm with callers.
    if os.path.isfile(path + "tmp_dataset.pkl"):
        start = time.time()
        print_log("Dataset exists, load from disk.", __name__)
    else:
        start = time.time()
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        print_log("Generating dataset", __name__)
        self._prepare_calender_cache()
        dataset = init_instance_by_config(config)
        print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
        # Dump the template dataset (with dump_all=False) so it can be reloaded below.
        dataset.config(dump_all=False, recursive=True)
        dataset.to_pickle(path + "tmp_dataset.pkl")

    # Both branches end up loading the template from disk.
    with open(path + "tmp_dataset.pkl", "rb") as f:
        new_dataset = pkl.load(f)

    # Every 240th entry of the 1min calendar; presumably one timestamp per trading day -- TODO confirm.
    time_list = D.calendar(start_time=self.start_time, end_time=self.end_time, freq="1min")[::240]

    def generate_dataset(times):
        # Skip timestamps whose pickle already exists.
        if os.path.isfile(path + times.strftime("%Y-%m-%d") + ".pkl"):
            print("exist " + times.strftime("%Y-%m-%d"))
            return
        # NOTE(review): the collapsed source reads "return self._init_qlib(...)"; it is interpreted
        # here as a bare `return` followed by the init call -- confirm against the original file.
        self._init_qlib(self.qlib_conf)
        end_times = times + datetime.timedelta(days=1)
        # Restrict the handler to the one-day window starting at `times`.
        new_dataset.handler.config(**{"start_time": times, "end_time": end_times})
        if conf_type == "backtest":
            new_dataset.handler.setup_data()
        else:
            new_dataset.handler.setup_data(init_type=DataHandlerLP.IT_LS)
        new_dataset.config(dump_all=True, recursive=True)
        new_dataset.to_pickle(path + times.strftime("%Y-%m-%d") + ".pkl")

    # Fan the per-timestamp dumps out over 8 joblib jobs.
    Parallel(n_jobs=8)(delayed(generate_dataset)(times) for times in time_list)
def calendar_callback(self, cbody, task_uri):
    """Target function for the established process when the received task asks for calendar data.

    Call the data provider to acquire data and publish the calendar data.

    Args:
        cbody: request body; must contain "start_time", "end_time" and "freq",
            and may contain "future" (defaults to False).
        task_uri: passed through to `publish_message`.

    On success the calendar entries are published as strings with status code 0.
    On any exception the error is logged and a message with status code 1 and
    the exception text is published instead.
    """
    start_time = cbody["start_time"]
    end_time = cbody["end_time"]
    # The literal string "None" is treated as "no bound".
    if start_time == "None":
        start_time = None
    if end_time == "None":
        end_time = None
    freq = cbody["freq"]
    future = cbody.get("future", False)

    status_code = 0
    # Lazy %-args: the message is only formatted if the record is actually emitted.
    self.logger.debug("process calendar data at %f", time.time())
    try:
        calendar_result = D.calendar(start_time, end_time, freq, future)
        calendar_result = [str(c) for c in calendar_result]
        self.logger.debug("finish processing calendar data and publish message at %f", time.time())
        self.publish_message("calendar", calendar_result, status_code, task_uri)
    except Exception as e:
        # Top-level boundary of the worker: log with traceback and report the failure to the client.
        self.logger.exception("Error while processing request %.200s", e)
        self.publish_message("calendar", None, 1, task_uri, str(e))
def __init__(self, future=True, end_time=None):
    """Remember the `future` flag and load the calendar up to `end_time` into `self.cals`."""
    self._future = future
    calendar_kwargs = {"future": future, "end_time": end_time}
    self.cals = D.calendar(**calendar_kwargs)
def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
    """Initialise qlib on `self.qlib_data_1d_dir` and return its day calendar as a list."""
    import qlib
    from qlib.data import D

    qlib.init(provider_uri=self.qlib_data_1d_dir)
    day_calendar = D.calendar(freq="day")
    return list(day_calendar)
def __init__(
    self,
    record: Recorder,
    to_date=None,
    from_date=None,
    hist_ref: Optional[int] = None,
    freq="day",
    fname="pred.pkl",
    loader_cls: type = RMDLoader,
):
    """
    Init PredUpdater.

    Expected behavior in following cases:

    - if `to_date` is greater than the max date in the calendar, the data will be updated to the latest date
    - if there are data before `from_date` or after `to_date`, only the data between `from_date` and `to_date` are affected.

    Args:
        record : Recorder
        to_date :
            update to prediction to the `to_date`

            if to_date is None:

                data will updated to the latest date.
        from_date :
            the update will start from `from_date`

            if from_date is None:

                the updating will occur on the next tick after the latest data in historical data
        hist_ref : int
            Sometimes, the dataset will have historical depends.
            Leave the problem to users to set the length of historical dependency
            If user doesn't specify this parameter, Updater will try to load dataset to automatically determine the hist_ref

            .. note::

                the start_time is not included in the `hist_ref`; So the `hist_ref` will be `step_len - 1` in most cases
        freq : str
            calendar frequency used for the latest-date lookup and for shifting `from_date`
        fname : str
            name of the recorder object that holds the data to update
        loader_cls : type
            the class to load the model and dataset
    """
    # TODO: automate this hist_ref in the future.
    super().__init__(record=record)

    self.hist_ref = hist_ref
    self.freq = freq
    self.fname = fname
    self.rmdl = loader_cls(rec=record)

    latest_date = D.calendar(freq=freq)[-1]
    to_date = pd.Timestamp(latest_date if to_date is None else to_date)
    # Warn only when the caller really asked for a date beyond the calendar;
    # a defaulted or exactly-equal `to_date` is not "later than" the latest date.
    if to_date > latest_date:
        self.logger.warning(
            f"The given `to_date`({to_date}) is later than `latest_date`({latest_date}). So `to_date` is clipped to `latest_date`."
        )
        to_date = latest_date
    self.to_date = to_date

    # FIXME: it will raise error when running routine with delay trainer
    # should we use another prediction updater for delay trainer?
    self.old_data: pd.DataFrame = record.load_object(fname)
    if from_date is None:
        # dropna is for being compatible to some data with future information(e.g. label)
        # The recent label data should be updated together
        self.last_end = self.old_data.dropna().index.get_level_values("datetime").max()
    else:
        # `last_end` is the tick right before `from_date`; shift on the calendar of the
        # updater's own frequency so non-day updaters do not step on the day calendar.
        self.last_end = get_date_by_shift(from_date, -1, freq=freq, align="right")
import qlib
from qlib.data import D

if __name__ == '__main__':
    # Initialise qlib on the data directory before issuing any query.
    qlib.init(provider_uri='~/.qlib/qlib_data/cn_data')

    # Fetch the day calendar for 2019-2020 and show its first five entries.
    trading_days = D.calendar(start_time='2019-01-01', end_time='2020-12-31', freq='day')
    first_five = trading_days[:5]
    print(first_five)
def prepareTrainDataset(ifSavePortfolioIndex=False):
    """Write one CSV per fund into the train or the test dataset folder.

    Reads parameters from "config/config.ini", initialises qlib on 'data/bin', and for every
    listed instrument combines the fund's column of the sparse portfolio matrix with values
    derived from its '$AccumulativeNetAssetValue' series inside the analysed date window.
    Funds whose first valid day is not later than the window start go to the train folder,
    all others to the test folder.

    Args:
        ifSavePortfolioIndex: when true, the "FullElements" column of the sparse portfolio
            matrix is additionally saved to "data/dfPortfolioIndex.csv".
    """
    print(
        "------------------------ Begin to prepare train dataset... ------------------------"
    )

    # read config file
    cf = configparser.ConfigParser()
    cf.read("config/config.ini")

    # funds with at most this many rows are skipped
    minDaysRange = int(cf.get("Parameter", "minDaysRange"))

    # offset of days
    numberOfYears = int(cf.get("Parameter", "numberOfYears"))
    numberOfMonths = int(cf.get("Parameter", "numberOfMonths"))
    numberOfDays = int(cf.get("Parameter", "numberOfDays"))

    # qlib init
    qlib.init(provider_uri='data/bin')

    # use one fund be the standard of trading day
    calendar = D.calendar(freq='day')
    lastDay = calendar[-1]  # e.g. 2021-02-10 00:00:00
    firstDay = lastDay - DateOffset(years=numberOfYears, months=numberOfMonths, days=numberOfDays)  # e.g. 2018-02-10 00:00:00

    # exclude the influence of days without trading:
    # take the first/last calendar entries inside [firstDay, lastDay]
    calendarBetweenFirstDayAndLastDay = D.calendar(freq='day', start_time=firstDay, end_time=lastDay)
    firstDayToAnalyze = calendarBetweenFirstDayAndLastDay[0]
    lastDayToAnalyze = calendarBetweenFirstDayAndLastDay[-1]

    # get portfolio (generate the sparse matrix file first if it does not exist yet)
    pathOfDfSparsePortfolio = cf.get("Analyze", "pathOfDfSparsePortfolio")
    if not os.path.exists(pathOfDfSparsePortfolio):
        getSparseMatrixForPortfolioInAllFunds()
    dfSparsePortfolio = pd.read_csv(pathOfDfSparsePortfolio, index_col=0)

    if ifSavePortfolioIndex:
        dfPortfolioIndex = dfSparsePortfolio["FullElements"]
        dfPortfolioIndex.to_csv("data/dfPortfolioIndex.csv")

    folderToSaveTrainDataset = getFolderNameInConfig(
        "folderToSaveTrainDataset")  # the folder to save train dataset
    folderToSaveTestDataset = getFolderNameInConfig(
        "folderToSaveTestDataset")  # the folder to save test dataset

    # `count` only advances for funds that were written successfully (see the end of the try body)
    count = 0
    instruments = D.instruments(market='all')
    for file in D.list_instruments(instruments=instruments, as_list=True):
        fundCode = file.split("_")[0]  # e.g. 000001

        # progress output
        if count % 100 == 0:
            print("count = %s\tfundCode=%s" % (count, fundCode))

        try:
            # can't find portfolio for this fund -> skip it
            # NOTE(review): bare `except` swallows every exception here, not only a missing column
            try:
                dfSparsePortfolioForThisFund = dfSparsePortfolio[[fundCode]]
            except:
                continue

            # read file and remove empty line
            df = D.features([file], ['$AccumulativeNetAssetValue'], start_time=firstDayToAnalyze, end_time=lastDayToAnalyze)
            df.columns = ['AccumulativeNetAssetValue']
            #df = df.unstack(level=0)
            # second index level is presumably the datetime level -- TODO confirm
            df["datetime"] = df.index.levels[1]

            # reset the index
            df = df.dropna(axis=0, subset=['datetime']).reset_index(drop=True)

            # like http://fundf10.eastmoney.com/jjjz_010476.html, the return in 30 days is 26%, so the annualized return is too high
            if df.shape[0] <= minDaysRange:
                continue

            # count the days between first day and last day
            day = df['datetime']
            # TODO: how about fund 519858, which trade in 2018-01-28 (Sunday)
            firstDayInThisFund = day[day.first_valid_index()]  # e.g. 2018-02-12 00:00:00, 2018-02-10 is Saturday
            lastDayInThisFund = day[day.last_valid_index()]  # e.g. 2021-02-10 00:00:00

            # must have value in latest day
            if (lastDayInThisFund - lastDayToAnalyze).days != 0:
                continue

            # number of days between each row and the fund's last valid day
            df['daysDiffWithLastDay'] = df['datetime'].apply(
                lambda x: (lastDayInThisFund - x).days)

            # get the value in important days
            netValueInFirstDay = df[df['datetime'] == firstDayInThisFund][
                "AccumulativeNetAssetValue"].tolist()[0]  # e.g. 4.046

            # get train dataset which found more than 3 years
            # (i.e. the fund's first valid day is not later than the start of the analysed window)
            if (firstDayInThisFund - firstDayToAnalyze).days <= 0:
                # count the adjust factor, we can get the value in 3 years by adjustFactorToLatestDay * (value[0]/value[day])
                df["adjustFactorToLatestDay"] = df[
                    "AccumulativeNetAssetValue"] / netValueInFirstDay
                df = df[["daysDiffWithLastDay", "adjustFactorToLatestDay"]]

                # abandon the latest day, it's meaningless
                # NOTE(review): the code drops the row labelled 0 after the index reset, i.e. the
                # first row -- confirm that this is the row the comment above refers to
                df.reset_index(drop=True, inplace=True)
                df = df.T.drop(labels=0, axis=1).T
                # reset index to concat with dfSparsePortfolioForThisFund
                df.reset_index(drop=True, inplace=True)
                df = df.T

                # duplicate to concat with dfSparsePortfolioForThisFund
                # (one copy of the portfolio column per remaining column of df)
                dfSparsePortfolioForThisFund = pd.concat(
                    [dfSparsePortfolioForThisFund.T] * df.shape[1])
                # reset index to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = dfSparsePortfolioForThisFund.reset_index(
                    drop=True).T

                # stack the portfolio rows on top of the two derived rows
                dfDataset = pd.concat([dfSparsePortfolioForThisFund, df], axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTrainDataset, "%s.csv" % fundCode))
            else:
                # otherwise only the fund's first-day `daysDiffWithLastDay` is appended to its
                # portfolio column and the result goes to the test folder
                dfInFirstDay = df[df['datetime'] == firstDayInThisFund].reset_index(drop=True)
                dfInFirstDay = dfInFirstDay[["daysDiffWithLastDay"]].T
                dfInFirstDay[fundCode] = dfInFirstDay[0]
                dfDataset = pd.concat(
                    [dfSparsePortfolioForThisFund, dfInFirstDay[[fundCode]]], axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTestDataset, "%s.csv" % fundCode))

            count += 1
        except Exception as e:
            # best effort: report the failing fund and go on with the next one
            print("fundCode = %s\terror = %s" % (fundCode, e))
            continue

    print("------------------------ Done. ------------------------")