def __init__(
    self,
    provider_uri="~/.qlib/qlib_data/cn_data",
    region=REG_CN,
    task_url="mongodb://10.0.0.4:27017/",
    task_db_name="rolling_db",
    experiment_name="rolling_exp",
    task_pool="rolling_task",
    task_config=None,
    rolling_step=550,
    rolling_type=RollingGen.ROLL_SD,
):
    """Initialize qlib (with MongoDB-backed TaskManager config) and prepare a rolling-task generator.

    Args:
        provider_uri: qlib data provider uri.
        region: stock region passed to ``qlib.init``.
        task_url: MongoDB url used by the TaskManager.
        task_db_name: MongoDB database name used by the TaskManager.
        experiment_name: name of the experiment the tasks are recorded under.
        task_pool: task pool name (a MongoDB collection).
        task_config: task config(s) to roll; defaults to the CSI100 XGBoost + LGB configs.
        rolling_step: step size for rolling.
        rolling_type: rolling type passed to ``RollingGen``.
    """
    # Fall back to the default CSI100 task configs when the caller gives none.
    if task_config is None:
        task_config = [CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG]

    # TaskManager config: qlib.init wires the MongoDB connection in.
    mongo_settings = {"task_url": task_url, "task_db_name": task_db_name}
    qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_settings)

    self.experiment_name = experiment_name
    self.task_pool = task_pool
    self.task_config = task_config
    self.rolling_gen = RollingGen(step=rolling_step, rtype=rolling_type)
def __init__(
    self,
    provider_uri="~/.qlib/qlib_data/cn_data",
    region="cn",
    task_url="mongodb://10.0.0.4:27017/",
    task_db_name="rolling_db",
    rolling_step=550,
    tasks=None,
    add_tasks=None,
):
    """Initialize qlib and build an OnlineManager with one RollingStrategy per task.

    Args:
        provider_uri: qlib data provider uri.
        region: stock region passed to ``qlib.init``.
        task_url: your MongoDB url.
        task_db_name: database name.
        rolling_step: step size for rolling.
        tasks: initial task config(s); defaults to ``[task_xgboost_config]``.
        add_tasks: task config(s) added later; defaults to ``[task_lgb_config]``.
    """
    # Fix: the previous ``tasks=[...]``/``add_tasks=[...]`` defaults were mutable
    # default arguments, shared (and mutable) across every instance. Use a
    # None sentinel instead.
    if tasks is None:
        tasks = [task_xgboost_config]
    if add_tasks is None:
        add_tasks = [task_lgb_config]
    mongo_conf = {
        "task_url": task_url,  # your MongoDB url
        "task_db_name": task_db_name,  # database name
    }
    qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_conf)
    self.tasks = tasks
    self.add_tasks = add_tasks
    self.rolling_step = rolling_step
    strategies = []
    for task in tasks:
        # NOTE: Assumption: The model class can specify only one strategy
        name_id = task["model"]["class"]
        strategies.append(
            RollingStrategy(
                name_id,
                task,
                RollingGen(step=rolling_step, rtype=RollingGen.ROLL_SD),
            )
        )
    self.rolling_online_manager = OnlineManager(strategies)
def __init__(
    self,
    provider_uri="~/.qlib/qlib_data/cn_data",
    region="cn",
    trainer=None,  # you can choose from TrainerR, TrainerRM, DelayTrainerR, DelayTrainerRM
    task_url="mongodb://10.0.0.4:27017/",  # not necessary when using TrainerR or DelayTrainerR
    task_db_name="rolling_db",  # not necessary when using TrainerR or DelayTrainerR
    rolling_step=550,
    tasks=None,
    add_tasks=None,
):
    """Initialize qlib and build an OnlineManager (with an explicit trainer).

    Args:
        provider_uri: qlib data provider uri.
        region: stock region passed to ``qlib.init``.
        trainer: trainer instance; defaults to a fresh ``DelayTrainerRM()``.
        task_url: your MongoDB url (not needed for TrainerR / DelayTrainerR).
        task_db_name: database name (not needed for TrainerR / DelayTrainerR).
        rolling_step: step size for rolling.
        tasks: initial task config(s); defaults to the CSI100 XGBoost rolling config.
        add_tasks: task config(s) added later; defaults to the CSI100 LGB rolling config.
    """
    # Fix: ``trainer=DelayTrainerRM()`` as a default was evaluated once at
    # function-definition time, so a single trainer instance was silently
    # shared by every object (and constructed even when the caller passed
    # their own). Build it lazily instead.
    if trainer is None:
        trainer = DelayTrainerRM()
    if add_tasks is None:
        add_tasks = [CSI100_RECORD_LGB_TASK_CONFIG_ROLLING]
    if tasks is None:
        tasks = [CSI100_RECORD_XGBOOST_TASK_CONFIG_ROLLING]
    mongo_conf = {
        "task_url": task_url,  # your MongoDB url
        "task_db_name": task_db_name,  # database name
    }
    qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_conf)
    self.tasks = tasks
    self.add_tasks = add_tasks
    self.rolling_step = rolling_step
    strategies = []
    for task in tasks:
        # NOTE: Assumption: The model class can specify only one strategy
        name_id = task["model"]["class"]
        strategies.append(
            RollingStrategy(
                name_id,
                task,
                RollingGen(step=rolling_step, rtype=RollingGen.ROLL_SD),
            )
        )
    self.trainer = trainer
    self.rolling_online_manager = OnlineManager(strategies, trainer=self.trainer)
def __init__(
    self,
    provider_uri="~/.qlib/qlib_data/cn_data",
    region="cn",
    exp_name="rolling_exp",
    task_url="mongodb://10.0.0.4:27017/",
    task_db_name="rolling_db",
    task_pool="rolling_task",
    rolling_step=80,
    start_time="2018-09-10",
    end_time="2018-10-31",
    tasks=None,
):
    """
    Init OnlineManagerExample.

    Args:
        provider_uri (str, optional): the provider uri. Defaults to "~/.qlib/qlib_data/cn_data".
        region (str, optional): the stock region. Defaults to "cn".
        exp_name (str, optional): the experiment name. Defaults to "rolling_exp".
        task_url (str, optional): your MongoDB url. Defaults to "mongodb://10.0.0.4:27017/".
        task_db_name (str, optional): database name. Defaults to "rolling_db".
        task_pool (str, optional): the task pool name (a task pool is a collection in MongoDB). Defaults to "rolling_task".
        rolling_step (int, optional): the step for rolling. Defaults to 80.
        start_time (str, optional): the start time of simulating. Defaults to "2018-09-10".
        end_time (str, optional): the end time of simulating. Defaults to "2018-10-31".
        tasks (dict or list[dict]): a set of the task config waiting for rolling and training;
            defaults to ``[task_xgboost_config, task_lgb_config]``.
    """
    # Fix: the previous ``tasks=[task_xgboost_config, task_lgb_config]`` default
    # was a mutable default argument shared across all instances.
    if tasks is None:
        tasks = [task_xgboost_config, task_lgb_config]
    self.exp_name = exp_name
    self.task_pool = task_pool
    self.start_time = start_time
    self.end_time = end_time
    mongo_conf = {
        "task_url": task_url,
        "task_db_name": task_db_name,
    }
    qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_conf)
    # The rolling tasks generator. ds_extra_mod_func is None because we just
    # need to simulate to 2018-10-31 and needn't change the handler end time.
    self.rolling_gen = RollingGen(step=rolling_step, rtype=RollingGen.ROLL_SD, ds_extra_mod_func=None)
    # Also can be TrainerR, TrainerRM, DelayTrainerR
    self.trainer = DelayTrainerRM(self.exp_name, self.task_pool)
    self.rolling_online_manager = OnlineManager(
        RollingStrategy(exp_name, task_template=tasks, rolling_gen=self.rolling_gen),
        trainer=self.trainer,
        begin_time=self.start_time,
    )
    self.tasks = tasks
def add_strategy(self):
    """Reload the pickled OnlineManager, register one extra RollingStrategy per
    config in ``self.add_tasks``, then pickle the manager back to disk."""
    print("========== load ==========")
    self.rolling_online_manager = OnlineManager.load(self._ROLLING_MANAGER_PATH)
    print("========== add strategy ==========")
    # NOTE: Assumption: The model class can specify only one strategy
    new_strategies = [
        RollingStrategy(
            task["model"]["class"],
            task,
            RollingGen(step=self.rolling_step, rtype=RollingGen.ROLL_SD),
        )
        for task in self.add_tasks
    ]
    self.rolling_online_manager.add_strategy(strategies=new_strategies)
    print("========== dump ==========")
    self.rolling_online_manager.to_pickle(self._ROLLING_MANAGER_PATH)
def __init__(
    self,
    provider_uri="~/.qlib/qlib_data/cn_data",
    region=REG_CN,
    task_url="mongodb://10.0.0.4:27017/",
    task_db_name="rolling_db",
    experiment_name="rolling_exp",
    task_pool="rolling_task",
    task_config=None,
    rolling_step=550,
    rolling_type=RollingGen.ROLL_SD,
):
    """Initialize qlib (with MongoDB-backed TaskManager config) and prepare a rolling-task generator.

    Args:
        provider_uri: qlib data provider uri.
        region: stock region passed to ``qlib.init``.
        task_url: MongoDB url used by the TaskManager.
        task_db_name: MongoDB database name used by the TaskManager.
        experiment_name: name of the experiment the tasks are recorded under.
        task_pool: task pool name (a MongoDB collection).
        task_config: task config(s) to roll; defaults to ``[task_xgboost_config, task_lgb_config]``.
        rolling_step: step size for rolling.
        rolling_type: rolling type passed to ``RollingGen``.
    """
    # Fix: ``task_config=[task_xgboost_config, task_lgb_config]`` was a mutable
    # default argument, shared across all instances; use a None sentinel.
    if task_config is None:
        task_config = [task_xgboost_config, task_lgb_config]
    # TaskManager config
    mongo_conf = {
        "task_url": task_url,
        "task_db_name": task_db_name,
    }
    qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_conf)
    self.experiment_name = experiment_name
    self.task_pool = task_pool
    self.task_config = task_config
    self.rolling_gen = RollingGen(step=rolling_step, rtype=rolling_type)
def setup(self, trainer=TrainerR, trainer_kwargs=None):
    """
    after running this function `self.data_ic_df` will become set.
    Each col represents a data.
    Each row represents the Timestamp of performance of that data.

    For example,

    .. code-block:: python

                       2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08  ...
                       2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19  ...
        datetime                                                                                                ...
        2018-01-02       0.079782   0.115975   0.070866   0.028849  -0.081170   0.140380   0.063864   0.110987  ...
        2018-01-03       0.123386   0.107789   0.071037   0.045278  -0.060782   0.167446   0.089779   0.124476  ...
        2018-01-04       0.140775   0.097206   0.063702   0.042415  -0.078164   0.173218   0.098914   0.114389  ...
        2018-01-05       0.030320  -0.037209  -0.044536  -0.047267  -0.081888   0.045648   0.059947   0.047652  ...
        2018-01-08       0.107201   0.009219  -0.015995  -0.036594  -0.086633   0.108965   0.122164   0.108508  ...
        ...                   ...        ...        ...        ...        ...        ...        ...        ...  ...

    Args:
        trainer: trainer class used to train the proxy models (e.g. TrainerR).
        trainer_kwargs: extra keyword arguments forwarded to the trainer; defaults to ``{}``.
    """
    # Fix: ``trainer_kwargs={}`` was a mutable default argument (shared dict
    # across all calls); use a None sentinel instead.
    if trainer_kwargs is None:
        trainer_kwargs = {}

    # 1) prepare the prediction of proxy models
    perf_task_tpl = deepcopy(self.task_tpl)  # this task is supposed to contain no complicated objects
    trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs)
    # NOTE:
    # The handler is initialized for only once.
    if not trainer.has_worker():
        self.dh = init_task_handler(perf_task_tpl)
    else:
        self.dh = init_instance_by_config(perf_task_tpl["dataset"]["kwargs"]["handler"])

    seg = perf_task_tpl["dataset"]["kwargs"]["segments"]
    # We want to split the training time period into small segments.
    perf_task_tpl["dataset"]["kwargs"]["segments"] = {
        "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)),
        "test": (None, None),
    }
    # NOTE:
    # we play a trick here
    # treat the training segments as test to create the rolling tasks
    rg = RollingGen(step=self.step, test_key="train", train_key=None, task_copy_func=deepcopy_basic_type)
    gen_task = task_generator(perf_task_tpl, [rg])

    recorders = R.list_recorders(experiment_name=self.exp_name)
    if len(gen_task) == len(recorders):
        get_module_logger("Internal Data").info("the data has been initialized")
    else:
        # train new models
        assert 0 == len(recorders), "An empty experiment is required for setup `InternalData``"
        trainer.train(gen_task)

    # 2) extract the similarity matrix
    label_df = self.dh.fetch(col_set="label")
    recorders = R.list_recorders(experiment_name=self.exp_name)

    key_l = []
    ic_l = []
    for _, rec in tqdm(recorders.items(), desc="calc"):
        pred = rec.load_object("pred.pkl")
        task = rec.load_object("task")
        data_key = task["dataset"]["kwargs"]["segments"]["train"]
        key_l.append(data_key)
        # Defer the IC computation so all pairs run in one Parallel batch below.
        ic_l.append(delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0]))
    ic_l = Parallel(n_jobs=-1)(ic_l)

    self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l)))
    self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1)

    del self.dh  # handler is not useful now
def __init__(
    self,
    *,
    task_tpl: Union[dict, list],
    step: int,
    trunc_days: int = None,
    rolling_ext_days: int = 0,
    exp_name: Union[str, InternalData],
    segments: Union[Dict[Text, Tuple], float],
    hist_step_n: int = 10,
    task_mode: str = MetaTask.PROC_MODE_FULL,
    fill_method: str = "max",
):
    """
    A dataset for meta model.

    Parameters
    ----------
    task_tpl : Union[dict, list]
        Decide what tasks are used.

        - dict : the task template, the prepared task is generated with `step`, `trunc_days` and `RollingGen`
        - list : when list, use the list of tasks directly
                 the list is supposed to be sorted according to the timeline
    step : int
        the rolling step
    trunc_days: int
        days to be truncated based on the test start
    rolling_ext_days: int
        sometimes users want to train meta models for a longer test period but with smaller rolling steps
        for more task samples.
        the total length of test periods will be `step + rolling_ext_days`
    exp_name : Union[str, InternalData]
        Decide what meta_info are used for prediction.

        - str: the name of the experiment to store the performance of data
        - InternalData: a prepared internal data
    segments: Union[Dict[Text, Tuple], float]
        the segments to divide data
        both left and right are included
        if segments is a float:
            the float represents the percentage of data for training
    hist_step_n: int
        length of historical steps for the meta information
    task_mode : str
        Please refer to the docs of MetaTask
    fill_method : str
        how missing meta information is filled (see MetaTaskDS)
    """
    super().__init__(segments=segments)
    # exp_name may already be a prepared InternalData; otherwise build and
    # set up a fresh one from the task template.
    if isinstance(exp_name, InternalData):
        self.internal_data = exp_name
    else:
        self.internal_data = InternalData(task_tpl, step=step, exp_name=exp_name)
        self.internal_data.setup()
    self.task_tpl = deepcopy(task_tpl)  # FIXME: if the handler is shared, how to avoid the explosion of the memory.
    self.trunc_days = trunc_days
    self.hist_step_n = hist_step_n
    self.step = step

    if isinstance(task_tpl, dict):
        # A single template: expand it into rolling tasks.
        rg = RollingGen(
            step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type
        )  # NOTE: trunc_days is very important !!!!
        task_iter = rg(task_tpl)
        if rolling_ext_days > 0:
            # Extend each task's test segment by `rolling_ext_days` so the
            # total test length becomes `step + rolling_ext_days`.
            self.ta = TimeAdjuster(future=True)
            for t in task_iter:
                t["dataset"]["kwargs"]["segments"]["test"] = self.ta.shift(
                    t["dataset"]["kwargs"]["segments"]["test"], step=rolling_ext_days, rtype=RollingGen.ROLL_EX
                )
        if task_mode == MetaTask.PROC_MODE_FULL:
            # Only pre initializing the task when full task is req
            # initializing handler and share it.
            init_task_handler(task_tpl)
    else:
        # A list of tasks is used as-is (assumed sorted along the timeline).
        assert isinstance(task_tpl, list)
        task_iter = task_tpl

    self.task_list = []
    self.meta_task_l = []
    logger = get_module_logger("MetaDatasetDS")
    logger.info(f"Example task for training meta model: {task_iter[0]}")
    for t in tqdm(task_iter, desc="creating meta tasks"):
        try:
            self.meta_task_l.append(
                MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)
            )
            self.task_list.append(t)
        except ValueError as e:
            # Tasks whose meta information cannot be prepared are skipped, not fatal.
            logger.warning(f"ValueError: {e}")
    assert len(self.meta_task_l) > 0, "No meta tasks found. Please check the data and setting"
def __init__(
    self,
    provider_uri="~/.qlib/qlib_data/cn_data",
    region="cn",
    exp_name="rolling_exp",
    task_url="mongodb://10.0.0.4:27017/",  # not necessary when using TrainerR or DelayTrainerR
    task_db_name="rolling_db",  # not necessary when using TrainerR or DelayTrainerR
    task_pool="rolling_task",
    rolling_step=80,
    start_time="2018-09-10",
    end_time="2018-10-31",
    tasks=None,
    trainer="TrainerR",
):
    """
    Init OnlineManagerExample.

    Args:
        provider_uri (str, optional): the provider uri. Defaults to "~/.qlib/qlib_data/cn_data".
        region (str, optional): the stock region. Defaults to "cn".
        exp_name (str, optional): the experiment name. Defaults to "rolling_exp".
        task_url (str, optional): your MongoDB url. Defaults to "mongodb://10.0.0.4:27017/".
        task_db_name (str, optional): database name. Defaults to "rolling_db".
        task_pool (str, optional): the task pool name (a task pool is a collection in MongoDB). Defaults to "rolling_task".
        rolling_step (int, optional): the step for rolling. Defaults to 80.
        start_time (str, optional): the start time of simulating. Defaults to "2018-09-10".
        end_time (str, optional): the end time of simulating. Defaults to "2018-10-31".
        tasks (dict or list[dict]): a set of the task config waiting for rolling and training
        trainer (str, optional): trainer selector, "TrainerR" or "TrainerRM". Defaults to "TrainerR".

    Raises:
        NotImplementedError: if ``trainer`` is not one of the supported names.
    """
    if tasks is None:
        tasks = [CSI100_RECORD_XGBOOST_TASK_CONFIG_ONLINE, CSI100_RECORD_LGB_TASK_CONFIG_ONLINE]
    self.exp_name = exp_name
    self.task_pool = task_pool
    self.start_time = start_time
    self.end_time = end_time
    mongo_conf = {
        "task_url": task_url,
        "task_db_name": task_db_name,
    }
    qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_conf)
    # The rolling tasks generator. ds_extra_mod_func is None because we just
    # need to simulate to 2018-10-31 and needn't change the handler end time.
    self.rolling_gen = RollingGen(step=rolling_step, rtype=RollingGen.ROLL_SD, ds_extra_mod_func=None)
    if trainer == "TrainerRM":
        self.trainer = TrainerRM(self.exp_name, self.task_pool)
    elif trainer == "TrainerR":
        self.trainer = TrainerR(self.exp_name)
    else:
        # TODO: support all the trainers: TrainerR, TrainerRM, DelayTrainerR
        # Fix: the original message was an f-string with no placeholder and
        # did not say which input was rejected.
        raise NotImplementedError(f"This type of input is not supported: trainer={trainer!r}")
    self.rolling_online_manager = OnlineManager(
        RollingStrategy(exp_name, task_template=tasks, rolling_gen=self.rolling_gen),
        trainer=self.trainer,
        begin_time=self.start_time,
    )
    self.tasks = tasks
def create_rolling_tasks(self):
    """Expand the basic task config into a list of rolling tasks."""
    # trunc_days = horizon + 1: the last days should be truncated to avoid
    # information leakage.
    rolling_gen = RollingGen(step=self.step, trunc_days=self.horizon + 1)
    return task_generator(self.basic_task(), rolling_gen)