def main(seed, config_file="configs/config_alstm.yaml"):
    """Run one training pass of the configured model with the given random seed."""
    # Load the experiment configuration.
    with open(config_file) as f:
        config = yaml.safe_load(f)

    # Inject the seed into the model kwargs.
    # seed_suffix = "/seed1000" if "init" in config_file else f"/seed{seed}"
    seed_suffix = ""
    model_kwargs = config["task"]["model"]["kwargs"]
    model_kwargs["seed"] = seed
    model_kwargs["logdir"] = model_kwargs["logdir"] + seed_suffix

    # Initialize the qlib workflow backend.
    qlib.init(
        provider_uri=config["qlib_init"]["provider_uri"],
        region=config["qlib_init"]["region"],
    )
    dataset = init_instance_by_config(config["task"]["dataset"])
    model = init_instance_by_config(config["task"]["model"])

    # Train the model.
    model.fit(dataset)
def _exe_task(task_config: dict):
    """Execute one task: fit the model, persist artifacts, and generate records."""
    rec = R.get_recorder()
    # Instantiate model and dataset from their configs.
    model: Model = init_instance_by_config(task_config["model"])
    dataset: Dataset = init_instance_by_config(task_config["dataset"])
    # FIXME: resume reweighter after merging data selection
    # reweighter: Reweighter = task_config.get("reweighter", None)
    # auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter)
    model.fit(dataset)
    R.save_objects(**{"params.pkl": model})
    # The dataset is saved for online inference, so the concrete data must not be dumped.
    dataset.config(dump_all=False, recursive=True)
    R.save_objects(**{"dataset": dataset})
    # Substitute placeholders in the task config with the live objects.
    placeholder_value = {"<MODEL>": model, "<DATASET>": dataset}
    task_config = fill_placeholder(task_config, placeholder_value)
    # Generate records: prediction, backtest, and analysis.
    records = task_config.get("record", [])
    if isinstance(records, dict):
        # Normalize a single record dict into a list.
        records = [records]
    for record in records:
        # Some recorders need `model` and `dataset`; pass them via `try_kwargs` so
        # they are only supplied when the recorder's initializer accepts them.
        rec_obj = init_instance_by_config(
            record,
            recorder=rec,
            default_module="qlib.workflow.record_temp",
            try_kwargs={"model": model, "dataset": dataset},
        )
        rec_obj.generate()
def train_with_sigana(uri_path: str = None):
    """train model followed by SigAnaRecord

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    # Run the whole flow under one experiment.
    with R.start(experiment_name="workflow_with_sigana", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()

        # Generate and load the prediction signal.
        signal_rec = SignalRecord(model, dataset, recorder)
        signal_rec.generate()
        pred_score = signal_rec.load("pred.pkl")

        # Compute IC / Rank-IC from the generated signal.
        ana_rec = SigAnaRecord(recorder)
        ana_rec.generate()
        ic = ana_rec.load("ic.pkl")
        ric = ana_rec.load("ric.pkl")

        uri_path = R.get_uri()
    return pred_score, {"ic": ic, "ric": ric}, uri_path
def train_with_sigana():
    """train model followed by SigAnaRecord

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])
    with R.start(experiment_name="workflow_with_sigana"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)

        # SigAnaRecord generates the prediction itself, then IC / Rank-IC.
        recorder = R.get_recorder()
        ana_rec = SigAnaRecord(recorder, model=model, dataset=dataset)
        ana_rec.generate()
        ic = ana_rec.load(ana_rec.get_path("ic.pkl"))
        ric = ana_rec.load(ana_rec.get_path("ric.pkl"))
        pred_score = ana_rec.load("pred.pkl")

        # Also produce the signal-MSE record for the same recorder.
        mse_rec = SignalMseRecord(recorder)
        mse_rec.generate()

        uri_path = R.get_uri()
    return pred_score, {"ic": ic, "ric": ric}, uri_path
def dump_and_load_dataset(self):
    """dump and load dataset state on disk"""
    self._init_qlib()
    self._prepare_calender_cache()
    dataset = init_instance_by_config(self.task["dataset"])
    dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])

    # ---- dump datasets to disk ----
    dataset.to_pickle(path="dataset.pkl")
    dataset_backtest.to_pickle(path="dataset_backtest.pkl")
    del dataset, dataset_backtest

    # ---- reload datasets ----
    with open("dataset.pkl", "rb") as fp:
        dataset = pickle.load(fp)
    with open("dataset_backtest.pkl", "rb") as fp:
        dataset_backtest = pickle.load(fp)
    self._prepare_calender_cache()

    # ---- re-initialize the reloaded datasets ----
    dataset.init(init_type=DataHandlerLP.IT_LS)
    dataset_backtest.init()

    # ---- fetch data to verify the reload ----
    xtrain, xtest = dataset.prepare(["train", "test"])
    backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
    print(xtrain, xtest)
    print(backtest_train, backtest_test)
    del xtrain, xtest
    del backtest_train, backtest_test
def backtest(self):
    """Train the configured model and run a portfolio-analysis backtest on its signal."""
    self._init_qlib()
    model = init_instance_by_config(self.task["model"])
    dataset = init_instance_by_config(self.task["dataset"])
    self._train_model(model, dataset)

    # Trade the model signal with a top-k / drop-n strategy.
    self.port_analysis_config["strategy"] = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }
    self.port_analysis_config["backtest"]["benchmark"] = self.benchmark

    with R.start(experiment_name="backtest"):
        recorder = R.get_recorder()
        analysis = PortAnaRecord(
            recorder,
            self.port_analysis_config,
            risk_analysis_freq=["day", "30min", "5min"],
            indicator_analysis_freq=["day", "30min", "5min"],
            indicator_analysis_method="value_weighted",
        )
        analysis.generate()
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Finish task training with real model fitting and saving.

    Args:
        rec (Recorder): the recorder that will be resumed
        experiment_name (str): the name of the experiment

    Returns:
        Recorder: the model recorder
    """
    with R.start(experiment_name=experiment_name, recorder_id=rec.info["id"], resume=True):
        task_config = R.load_object("task")
        # Instantiate model and dataset from the stored task config.
        model: Model = init_instance_by_config(task_config["model"])
        dataset: Dataset = init_instance_by_config(task_config["dataset"])
        # Fit and persist the model.
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})
        # The dataset is saved for online inference, so concrete data is not dumped.
        dataset.config(dump_all=False, recursive=True)
        R.save_objects(**{"dataset": dataset})
        # Substitute placeholders with the live objects.
        placeholder_value = {"<MODEL>": model, "<DATASET>": dataset}
        task_config = fill_placeholder(task_config, placeholder_value)
        # Generate records: prediction, backtest, and analysis.
        records = task_config.get("record", [])
        if isinstance(records, dict):  # normalize a single record dict
            records = [records]
        for record in records:
            r = init_instance_by_config(record, recorder=rec)
            r.generate()
    return rec
def dump_and_load_dataset(self):
    """dump and load dataset state on disk"""
    self._init_qlib()
    self._prepare_calender_cache()
    dataset = init_instance_by_config(self.task["dataset"])
    dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])

    # ---- dump datasets to disk ----
    dataset.to_pickle(path="dataset.pkl")
    dataset_backtest.to_pickle(path="dataset_backtest.pkl")
    del dataset, dataset_backtest

    # ---- reload datasets ----
    with open("dataset.pkl", "rb") as fp:
        dataset = pickle.load(fp)
    with open("dataset_backtest.pkl", "rb") as fp:
        dataset_backtest = pickle.load(fp)
    self._prepare_calender_cache()

    # ---- re-initialize with a narrowed time window ----
    start, end = "2021-01-19 00:00:00", "2021-01-25 16:00:00"
    dataset.config(
        handler_kwargs={"start_time": start, "end_time": end},
        segments={"test": (start, end)},
    )
    dataset.setup_data(handler_kwargs={"init_type": DataHandlerLP.IT_LS})
    dataset_backtest.config(
        handler_kwargs={"start_time": start, "end_time": end},
        segments={"test": (start, end)},
    )
    dataset_backtest.setup_data(handler_kwargs={})

    # ---- fetch the test segment ----
    xtest = dataset.prepare("test")
    backtest_test = dataset_backtest.prepare("test")
    print(xtest, backtest_test)
    return
def train_multiseg():
    """Fit the GBDT task and generate multi-segment records for valid and test."""
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()
        multi_seg = MultiSegRecord(model, dataset, recorder)
        multi_seg.generate(dict(valid="valid", test="test"), True)
        uri = R.get_uri()
    return uri
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the task's model on `dataset` under one recorder and generate its records."""
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)
    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Route log output into the recorder's local directory.
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train the model; pass a checkpoint location only when fit() supports one.
        R.log_params(**flatten_dict(task_config))
        fit_args = inspect.getfullargspec(model.fit).args
        if "save_path" in fit_args:
            model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
        elif "save_dir" in fit_args:
            model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)

        # Persist the trained model.
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis.
        for record in task_config["record"]:
            record = record.copy()
            # SignalRecord additionally needs the model and dataset.
            if record["class"] == "SignalRecord":
                record["kwargs"].update({"model": model, "dataset": dataset, "recorder": recorder})
            else:
                record["kwargs"].update({"recorder": recorder})
            init_instance_by_config(record).generate()
def train_mse():
    """Fit the GBDT task and generate a signal-MSE record."""
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()
        mse_rec = SignalMseRecord(recorder, model=model, dataset=dataset)
        mse_rec.generate()
        uri = R.get_uri()
    return uri
def get_data(self):
    """use dataset to get highreq data"""
    self._init_qlib()
    self._prepare_calender_cache()

    # Prepare and show the main dataset's segments.
    dataset = init_instance_by_config(self.task["dataset"])
    xtrain, xtest = dataset.prepare(["train", "test"])
    print(xtrain, xtest)

    # Prepare and show the backtest dataset's segments.
    dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])
    backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
    print(backtest_train, backtest_test)
    return
def objective(trial):
    """Optuna objective: sample LightGBM hyper-parameters, fit on the global
    `dataset`, and return the best validation loss.

    Uses `trial.suggest_float` (with `log=True` for the L1/L2 penalties) instead
    of the deprecated `suggest_uniform` / `suggest_loguniform`; the sampled
    distributions and parameter names are unchanged.
    """
    task = {
        "model": {
            "class": "LGBModel",
            "module_path": "qlib.contrib.model.gbdt",
            "kwargs": {
                "loss": "mse",
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
                "learning_rate": trial.suggest_float("learning_rate", 0, 1),
                "subsample": trial.suggest_float("subsample", 0, 1),
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1e4, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 1e4, log=True),
                "max_depth": 10,
                "num_leaves": trial.suggest_int("num_leaves", 1, 1024),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 50),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            },
        },
    }
    evals_result = dict()
    model = init_instance_by_config(task["model"])
    # NOTE(review): `dataset` is presumably a module-level global prepared before
    # the study runs — confirm against the surrounding script.
    model.fit(dataset, evals_result=evals_result)
    # Best (minimum) validation metric observed during training.
    return min(evals_result["valid"])
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None, swap_level=True, freq="day"):
    """
    Parameters
    ----------
    config : Tuple[list, tuple, dict]
        Please refer to the doc of DLWParser
    filter_pipe :
        Filter pipe for the instruments
    swap_level :
        Whether to swap level of MultiIndex
    """
    if filter_pipe is not None:
        assert isinstance(filter_pipe, list), "The type of `filter_pipe` must be list."
        instantiated = []
        for flt_conf in filter_pipe:
            # Fall back to `filter_module` only when the filter config does not
            # carry its own "module_path".
            default_mod = None if "module_path" in flt_conf else filter_module
            instantiated.append(
                init_instance_by_config(flt_conf, default_mod, accept_types=BaseDFilter)
            )
        filter_pipe = instantiated
    self.filter_pipe = filter_pipe
    self.swap_level = swap_level
    self.freq = freq
    super().__init__(config)
def _gen_data(self, config, datasets=None):
    """Load a cached dataset from `config["path"]` or generate and cache it, then
    prepare the requested segments.

    Parameters
    ----------
    config : dict
        dataset config containing a "path" entry.
        NOTE: "path" is popped, so the caller's dict is mutated.
    datasets : list of str, optional
        segment names to prepare; defaults to ["train", "valid", "test"].

    Returns
    -------
    list
        prepared data, one entry per requested segment.

    Raises
    ------
    ValueError
        if `config` has no "path" entry.
    """
    # Avoid a mutable default argument (a shared list across calls).
    if datasets is None:
        datasets = ["train", "valid", "test"]
    try:
        path = config.pop("path")
    except KeyError as e:
        raise ValueError("Must specify the path to save the dataset.") from e

    if os.path.isfile(path):
        start = time.time()
        print_log("Dataset exists, load from disk.", __name__)
        with open(path, "rb") as f:
            data = pkl.load(f)
        # The pickle may hold either the prepared segments (a dict) or the dataset object.
        if isinstance(data, dict):
            res = [data[i] for i in datasets]
        else:
            res = data.prepare(datasets)
        print_log(f"Data loaded, time cost: {time.time() - start:.2f}", __name__)
    else:
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        print_log("Generating dataset", __name__)
        start_time = time.time()
        self._prepare_calender_cache()
        dataset = init_instance_by_config(config)
        # Dump the full state so the cache is self-contained for later loads.
        dataset.config(dump_all=True, recursive=True)
        dataset.to_pickle(path)
        res = dataset.prepare(datasets)
        print_log(f"Data generated, time cost: {(time.time() - start_time):.2f}", __name__)
    return res
def _gen_dataset(self, config):
    """Load the dataset pickle from `config["path"]` if present; otherwise build,
    prepare, and cache it on disk."""
    try:
        path = config.pop("path")
    except KeyError as e:
        raise ValueError("Must specify the path to save the dataset.") from e

    if os.path.isfile(path):
        # Cached on disk: just unpickle.
        start = time.time()
        print_log("Dataset exists, load from disk.", __name__)
        with open(path, "rb") as f:
            dataset = pkl.load(f)
        print_log(f"Data loaded, time cost: {time.time() - start:.2f}", __name__)
    else:
        start = time.time()
        parent_dir = os.path.dirname(path)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        print_log("Generating dataset", __name__)
        self._prepare_calender_cache()
        dataset = init_instance_by_config(config)
        print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
        dataset.prepare(["train", "valid", "test"])
        print_log(f"Dataset prepared, time cost: {time.time() - start:.2f}", __name__)
        # Dump the full state so the next run can load from disk.
        dataset.config(dump_all=True, recursive=True)
        dataset.to_pickle(path)
    return dataset
def basic_task(self):
    """For fast training rolling"""
    if self.model_type == "gbdt":
        conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
        # dump the processed data on to disk for later loading to speed up the processing
        h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon)
    elif self.model_type == "linear":
        conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml"
        h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon)
    else:
        raise AssertionError("Model type is not supported!")

    with conf_path.open("r") as f:
        conf = yaml.safe_load(f)

    # Re-target the label to the configured prediction horizon.
    conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
        "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
    ]
    task = conf["task"]

    # Cache the processed handler on disk so repeated rolling runs can reuse it.
    if not h_path.exists():
        handler = init_instance_by_config(task["dataset"]["kwargs"]["handler"])
        handler.to_pickle(h_path, dump_all=True)

    task["dataset"]["kwargs"]["handler"] = f"file://{h_path}"
    task["record"] = ["qlib.workflow.record_temp.SignalRecord"]
    return task
def train(uri_path: str = None):
    """train model

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    # Instantiate model and dataset.
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    # Exercise __repr__.
    print(dataset)
    print(R)
    with R.start(experiment_name="workflow", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        R.save_objects(trained_model=model)

        recorder = R.get_recorder()
        print(recorder)  # exercise __repr__
        print(recorder.get_local_dir())  # exercise get_local_dir
        rid = recorder.id

        # Generate and load the prediction signal.
        signal_rec = SignalRecord(model, dataset, recorder)
        signal_rec.generate()
        pred_score = signal_rec.load("pred.pkl")

        # IC / Rank-IC analysis on the generated signal.
        ana_rec = SigAnaRecord(recorder)
        ana_rec.generate()
        ic = ana_rec.load("ic.pkl")
        ric = ana_rec.load("ric.pkl")
    return pred_score, {"ic": ic, "ric": ric}, rid
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the task's model on `dataset` and generate its records."""
    print("")
    print("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
    print("dataset={:}".format(dataset))
    model = init_instance_by_config(task_config["model"])

    with R.start(experiment_name=experiment_name, recorder_name=recorder_name, uri=uri):
        # Send log output next to the recorder's artifacts.
        log_file = R.get_recorder().root_uri / "{:}.log".format(experiment_name)
        set_log_basic_config(log_file)

        # Train and persist the model.
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis.
        for record in task_config["record"]:
            record = record.copy()
            # SignalRecord additionally needs the model and dataset.
            if record["class"] == "SignalRecord":
                record["kwargs"].update({"model": model, "dataset": dataset, "recorder": recorder})
            else:
                record["kwargs"].update({"recorder": recorder})
            init_instance_by_config(record).generate()
def __init__(self, handler_config: dict, fetch_kwargs: dict = None, is_group=False):
    """
    Parameters
    ----------
    handler_config : dict
        handler_config will be used to describe the handlers

        .. code-block::

            <handler_config> := {
                "group_name1": <handler>
                "group_name2": <handler>
            }
            or
            <handler_config> := <handler>
            <handler> := DataHandler Instance | DataHandler Config

    fetch_kwargs : dict, optional
        fetch_kwargs will be used to describe the different arguments of fetch method,
        such as col_set, squeeze, data_key, etc.
        Defaults to no extra arguments. (Changed from a mutable `{}` default —
        a shared dict across all instances; `None` now also works when passed explicitly.)
    is_group: bool
        is_group will be used to describe whether the key of handler_config is group
    """
    from qlib.data.dataset.handler import DataHandler

    if is_group:
        # One handler per group name.
        self.handlers = {
            grp: init_instance_by_config(config, accept_types=DataHandler)
            for grp, config in handler_config.items()
        }
    else:
        self.handlers = init_instance_by_config(handler_config, accept_types=DataHandler)
    self.is_group = is_group
    # Default to fetching the raw column set; caller-supplied kwargs override it.
    self.fetch_kwargs = {"col_set": DataHandler.CS_RAW}
    if fetch_kwargs:
        self.fetch_kwargs.update(fetch_kwargs)
def collect_data(self):
    """Train the model and stream trade decisions from the backtest data generator."""
    self._init_qlib()
    model = init_instance_by_config(self.task["model"])
    dataset = init_instance_by_config(self.task["dataset"])
    self._train_model(model, dataset)

    executor_config = self.port_analysis_config["executor"]
    backtest_config = self.port_analysis_config["backtest"]
    backtest_config["benchmark"] = self.benchmark
    # Trade the model signal with a top-k / drop-n strategy.
    strategy_config = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }
    # `collect_data` here is the module-level generator, not this method.
    data_generator = collect_data(executor=executor_config, strategy=strategy_config, **backtest_config)
    for trade_decision in data_generator:
        print(trade_decision)
def train():
    """train model

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    # Instantiate model and dataset.
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])
    # Exercise __repr__.
    print(dataset)
    print(R)
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)

        recorder = R.get_recorder()
        print(recorder)  # exercise __repr__
        rid = recorder.id

        # Generate and load the prediction signal.
        signal_rec = SignalRecord(model, dataset, recorder)
        signal_rec.generate()
        pred_score = signal_rec.load()

        # IC / Rank-IC analysis on the generated signal.
        ana_rec = SigAnaRecord(recorder)
        ana_rec.generate()
        ic = ana_rec.load(ana_rec.get_path("ic.pkl"))
        ric = ana_rec.load(ana_rec.get_path("ric.pkl"))
    return pred_score, {"ic": ic, "ric": ric}, rid
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Finish task training with real model fitting and saving.

    Args:
        rec (Recorder): the recorder that will be resumed
        experiment_name (str): the name of the experiment

    Returns:
        Recorder: the model recorder
    """
    with R.start(experiment_name=experiment_name, recorder_id=rec.info["id"], resume=True):
        task_config = R.load_object("task")
        # Instantiate model and dataset from the stored task config.
        model: Model = init_instance_by_config(task_config["model"])
        dataset: Dataset = init_instance_by_config(task_config["dataset"])
        # Fit and persist the model.
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})
        # Saved for online inference, so the concrete data is not dumped.
        dataset.config(dump_all=False, recursive=True)
        R.save_objects(**{"dataset": dataset})
        # Generate records: prediction, backtest, and analysis.
        records = task_config.get("record", [])
        if isinstance(records, dict):  # normalize a single record dict
            records = [records]
        for record in records:
            cls, kwargs = get_cls_kwargs(record, default_module="qlib.workflow.record_temp")
            # SignalRecord additionally needs the model and dataset.
            if cls is SignalRecord:
                extra = {"model": model, "dataset": dataset, "recorder": rec}
            else:
                extra = {"recorder": rec}
            cls(**kwargs, **extra).generate()
    return rec
def task_train(task_config: dict, experiment_name):
    """
    task based training

    Parameters
    ----------
    task_config : dict
        A dict describes a task setting.
    experiment_name :
        name of the experiment the run is logged under
    """
    model = init_instance_by_config(task_config["model"])
    dataset = init_instance_by_config(task_config["dataset"])
    with R.start(experiment_name=experiment_name):
        # Train and persist the model.
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"params.pkl": model})
        # Generate records: prediction, backtest, and analysis.
        # NOTE: the record dicts are updated in place (no copy), as in the original.
        for record in task_config["record"]:
            if record["class"] == SignalRecord.__name__:
                record["kwargs"].update({"model": model, "dataset": dataset, "recorder": recorder})
            else:
                record["kwargs"].update({"recorder": recorder})
            init_instance_by_config(record).generate()
def get_feature_importance(self):
    """Fit the rolling LightGBM task and return feature importance keyed by column name."""
    # This must be LightGBM, because it needs to expose feature importance.
    rb = RollingBenchmark(model_type="gbdt")
    task = rb.basic_task()
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])
    model.fit(dataset)
    fi = model.get_feature_importance()

    # The model trains on a numpy array rather than the dataframe, so importances
    # come back keyed by positional names; map them back to column names.
    df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R)
    cols = df.columns
    fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()}
    return pd.Series(fi_named)
def _dump_pre_handler(self, path):
    """Build the Alpha158 pre-handler and pickle its full state to `path`."""
    handler_config = {
        "class": "Alpha158",
        "module_path": "qlib.contrib.data.handler",
        "kwargs": {
            "start_time": self.start_time,
            "end_time": self.end_time,
            "instruments": self.MARKET,
            # No extra processors: dump the unprocessed features.
            "infer_processors": [],
            "learn_processors": [],
        },
    }
    pre_handler = init_instance_by_config(handler_config)
    pre_handler.config(dump_all=True)
    pre_handler.to_pickle(path)
def init_vars(self, init_cash, position_dict, freq: str, benchmark_config: dict):
    """Initialize cash, the position object, and metric holders, then reset state."""
    self.init_cash = init_cash
    # Build the position object of the configured type.
    position_config = {
        "class": self._pos_type,
        "module_path": "qlib.backtest.position",
        "kwargs": {
            "cash": init_cash,
            "position_dict": position_dict,
        },
    }
    self.current_position: BasePosition = init_instance_by_config(position_config)
    self.portfolio_metrics = None
    self.hist_positions = {}
    self.reset(freq=freq, benchmark_config=benchmark_config)
def create_signal_from(
    obj: Union[Signal, Tuple[BaseModel, Dataset], List, Dict, Text, pd.Series, pd.DataFrame]
) -> Signal:
    """
    create signal from diverse information
    This method will choose the right method to create a signal based on `obj`
    Please refer to the code below.
    """
    if isinstance(obj, Signal):
        return obj
    if isinstance(obj, (tuple, list)):
        # (model, dataset) pair (or list) -> model-driven signal.
        return ModelSignal(*obj)
    if isinstance(obj, (dict, str)):
        # Config dict / config string -> instantiate via the config mechanism.
        return init_instance_by_config(obj)
    if isinstance(obj, (pd.DataFrame, pd.Series)):
        # Precomputed signal values.
        return SignalWCache(signal=obj)
    # Include the offending type in the message (the original raised with a
    # placeholder-free f-string, giving the caller no diagnostic information).
    raise NotImplementedError(f"This type of signal is not supported: {type(obj)!r}")
def main(xargs, exp_yaml):
    """Load the experiment yaml, build the dataset once, and run `xargs.times` experiments."""
    assert Path(exp_yaml).exists(), "{:} does not exist.".format(exp_yaml)
    with open(exp_yaml) as fp:
        config = yaml.safe_load(fp)
    config = update_gpu(config, xargs.gpu)
    # config = update_market(config, 'csi300')

    qlib.init(**config.get("qlib_init"))
    dataset_config = config.get("task").get("dataset")
    dataset = init_instance_by_config(dataset_config)
    pprint("args: {:}".format(xargs))
    pprint(dataset_config)
    pprint(dataset)

    # The dataset is shared across runs; only the recorder name changes per run.
    for irun in range(xargs.times):
        run_exp(
            config.get("task"),
            dataset,
            xargs.alg,
            "recorder-{:02d}-{:02d}".format(irun, xargs.times),
            xargs.save_dir,
        )
def dump_data_for_proxy_model(self):
    """
    Dump data for training meta model.
    The meta model will be trained upon the proxy forecasting model.
    This dataset is for the proxy forecasting model.
    """
    topk = 30
    fi = self.get_feature_importance()
    col_selected = fi.nlargest(topk)

    rb = RollingBenchmark(model_type=self.sim_task_model)
    task = rb.basic_task()
    dataset = init_instance_by_config(task["dataset"])
    prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
    feature_df = prep_ds["feature"]
    label_df = prep_ds["label"]

    # Keep only the top-k important features and z-score them per datetime group.
    feature_selected = feature_df.loc[:, col_selected.index]
    feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std()))
    feature_selected = feature_selected.fillna(0.0)

    df_all = pd.concat(
        {
            "label": label_df.reindex(feature_selected.index),
            "feature": feature_selected,
        },
        axis=1,
    )
    df_all.to_pickle(DIRNAME / "fea_label_df.pkl")

    # Dump in handler format to align with the DataHandlerLP interface.
    handler = DataHandlerLP(
        data_loader={
            "class": "qlib.data.dataset.loader.StaticDataLoader",
            "kwargs": {"config": DIRNAME / "fea_label_df.pkl"},
        }
    )
    handler.to_pickle(DIRNAME / "handler_proxy.pkl", dump_all=True)