def train():
    """Fit the LightGBM model and evaluate it on the test split.

    Returns
    -------
    pred_score : pandas.DataFrame
        Frame indexed like the test features, with a single ``"score"``
        column holding the first column of the model predictions.
    performance : dict
        ``"model_score"`` is the value of ``model.score`` on the full test
        split; ``"model_pearsonr"`` is the Pearson correlation between the
        flattened predictions and flattened labels after rows have been
        filtered by ``drop_nan_by_y_index``.
    """
    # Build the data handler and pull the six train/validate/test pieces.
    handler = QLibDataHandlerClose(**DATA_HANDLER_CONFIG)
    splits = handler.get_split_data(**TRAINER_CONFIG)
    x_train, y_train, x_validate, y_validate, x_test, y_test = splits

    # Fit on the training split, using the validation split during fitting.
    model = LGBModel(**MODEL_CONFIG)
    model.fit(x_train, y_train, x_validate, y_validate)

    # Predict on the complete test split and label the raw output with the
    # test index and the label column names.
    raw_prediction = model.predict(x_test)
    prediction_frame = pd.DataFrame(
        raw_prediction,
        index=x_test.index,
        columns=y_test.columns,
    )

    # Only the first prediction column is exposed as the score.
    pred_score = pd.DataFrame(index=prediction_frame.index)
    pred_score["score"] = prediction_frame.iloc[:, 0]

    # The model's own score is taken on the unfiltered test split.
    model_score = model.score(x_test, y_test)

    # Drop the rows whose labels contain NaN, then predict again so the
    # predictions line up row-for-row with the remaining labels.
    x_clean, y_clean, _unused = drop_nan_by_y_index(x_test, y_test)
    clean_prediction = model.predict(x_clean)
    flat_prediction = np.ravel(clean_prediction)
    flat_label = np.ravel(y_clean.values)
    model_pearsonr = pearsonr(flat_prediction, flat_label)[0]

    performance = {
        "model_score": model_score,
        "model_pearsonr": model_pearsonr,
    }
    return pred_score, performance
).get_split_data(**TRAINER_CONFIG) MODEL_CONFIG = { "loss": "mse", "colsample_bytree": 0.8879, "learning_rate": 0.0421, "subsample": 0.8789, "lambda_l1": 205.6999, "lambda_l2": 580.9768, "max_depth": 8, "num_leaves": 210, "num_threads": 20, } # use default model # custom Model, refer to: TODO: Model API url model = LGBModel(**MODEL_CONFIG) model.fit(x_train, y_train, x_validate, y_validate) _pred = model.predict(x_test) _pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns) # backtest requires pred_score pred_score = pd.DataFrame(index=_pred.index) pred_score["score"] = _pred.iloc(axis=1)[0] # save pred_score to file pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() pred_score_path.parent.mkdir(exist_ok=True, parents=True) pred_score.to_pickle(pred_score_path) ################################### # backtest