Example No. 1
import bisect
from typing import Optional

import numpy as np
import pandas as pd


def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None):
    """get trading date with shift bias will cur_date
        e.g. : shift == 1,  return next trading date
               shift == -1, return previous trading date
    ----------
    trading_date : pandas.Timestamp
        current date
    shift : int
    clip_shift: bool
    align : Optional[str]
        When align is None, this function will raise ValueError if `trading_date` is not a trading date
        when align is "left"/"right", it will try to align to left/right nearest trading date before shifting when `trading_date` is not a trading date

    """
    from qlib.data import D  # pylint: disable=C0415

    cal = D.calendar(future=future, freq=freq)
    trading_date = pd.to_datetime(trading_date)
    if align is None:
        if trading_date not in list(cal):
            raise ValueError("{} is not trading day!".format(str(trading_date)))
        _index = bisect.bisect_left(cal, trading_date)
    elif align == "left":
        _index = bisect.bisect_right(cal, trading_date) - 1
    elif align == "right":
        _index = bisect.bisect_left(cal, trading_date)
    else:
        raise ValueError(f"align with value `{align}` is not supported")
    shift_index = _index + shift
    if shift_index < 0 or shift_index >= len(cal):
        if clip_shift:
            shift_index = np.clip(shift_index, 0, len(cal) - 1)
        else:
            raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range")
    return cal[shift_index]
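A minimal usage sketch of the `align` behavior (a sketch, assuming qlib has been initialized against a data bundle such as the community cn_data; the dates are illustrative):

import pandas as pd
import qlib

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")

# next trading day after a trading Friday
print(get_date_by_shift(pd.Timestamp("2020-01-03"), shift=1))

# 2020-01-04 is a Saturday: align to the nearest trading day on the left,
# then shift one trading day back
print(get_date_by_shift(pd.Timestamp("2020-01-04"), shift=-1, align="left"))

# without align, a non-trading date raises ValueError
try:
    get_date_by_shift(pd.Timestamp("2020-01-04"), shift=-1)
except ValueError as err:
    print(err)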
Example No. 2
import bisect

import numpy as np
import pandas as pd


def get_date_by_shift(trading_date, shift, future=False, clip_shift=True):
    """get trading date with shift bias wil cur_date
        e.g. : shift == 1,  return next trading date
               shift == -1, return previous trading date
    ----------
    trading_date : pandas.Timestamp
        current date
    shift : int
    clip_shift: bool

    """
    from qlib.data import D

    cal = D.calendar(future=future)
    trading_date = pd.to_datetime(trading_date)
    if trading_date not in list(cal):
        raise ValueError("{} is not trading day!".format(str(trading_date)))
    # bisect on the converted Timestamp; the original bisected the raw argument,
    # which breaks when a date string is passed in
    _index = bisect.bisect_left(cal, trading_date)
    shift_index = _index + shift
    if shift_index < 0 or shift_index >= len(cal):
        if clip_shift:
            shift_index = np.clip(shift_index, 0, len(cal) - 1)
        else:
            raise IndexError(
                f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range"
            )
    return cal[shift_index]
Example No. 3
    def load_data(self):
        ret = D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2]
        print(ret)

        instruments = D.instruments('csi300')  # e.g. ['SH600570', 'SH600000']
        fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']
        data = D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day')
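Continuing from the snippet above, a quick way to inspect the result: `D.features` returns a DataFrame indexed by (instrument, datetime), and partial indexing on the first level selects one instrument's history (SH600000 is assumed to be present in the bundle):

print(data.head())

# select one instrument's history by the first index level
print(data.loc['SH600000'].head())

# the column names echo the field expressions passed to D.features
print(data.columns.tolist())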
Example No. 4
    def test_update_label(self):

        task = copy.deepcopy(CSI300_GBDT_TASK)

        task["record"] = {
            "class": "SignalRecord",
            "module_path": "qlib.workflow.record_temp",
            "kwargs": {
                "dataset": "<DATASET>",
                "model": "<MODEL>"
            },
        }

        exp_name = "online_srv_test"

        cal = D.calendar()
        shift = 10
        latest_date = cal[-1 - shift]

        train_start = latest_date - pd.Timedelta(days=61)
        train_end = latest_date - pd.Timedelta(days=41)
        task["dataset"]["kwargs"]["segments"] = {
            "train": (train_start, train_end),
            "valid": (latest_date - pd.Timedelta(days=40),
                      latest_date - pd.Timedelta(days=21)),
            "test": (latest_date - pd.Timedelta(days=20), latest_date),
        }

        task["dataset"]["kwargs"]["handler"]["kwargs"] = {
            "start_time": train_start,
            "end_time": latest_date,
            "fit_start_time": train_start,
            "fit_end_time": train_end,
            "instruments": "csi300",
        }

        rec = task_train(task, exp_name)

        pred = rec.load_object("pred.pkl")

        online_tool = OnlineToolR(exp_name)
        online_tool.reset_online_tag(rec)  # set to online model
        online_tool.update_online_pred()

        new_pred = rec.load_object("pred.pkl")
        label = rec.load_object("label.pkl")
        label_date = label.dropna().index.get_level_values("datetime").max()
        pred_date = new_pred.dropna().index.get_level_values("datetime").max()

        # The prediction is updated, but the label is not updated.
        self.assertTrue(label_date < pred_date)

        # Update label now
        lu = LabelUpdater(rec)
        lu.update()
        new_label = rec.load_object("label.pkl")
        new_label_date = new_label.index.get_level_values("datetime").max()
        self.assertEqual(new_label_date, pred_date)  # make sure the label is updated now
Example No. 5
    def set_end_time(self, end_time=None):
        """
        Set end time. None for use calendar's end time.

        Args:
            end_time
        """
        self.cals = D.calendar(future=self._future, end_time=end_time)
Example No. 6
    def test_update_pred(self):
        """
        This test is for testing if it will raise error if the `to_date` is out of the boundary.
        """
        task = copy.deepcopy(CSI300_GBDT_TASK)

        task["record"] = ["qlib.workflow.record_temp.SignalRecord"]

        exp_name = "online_srv_test"

        cal = D.calendar()
        latest_date = cal[-1]

        train_start = latest_date - pd.Timedelta(days=61)
        train_end = latest_date - pd.Timedelta(days=41)
        task["dataset"]["kwargs"]["segments"] = {
            "train": (train_start, train_end),
            "valid": (latest_date - pd.Timedelta(days=40), latest_date - pd.Timedelta(days=21)),
            "test": (latest_date - pd.Timedelta(days=20), latest_date),
        }

        task["dataset"]["kwargs"]["handler"]["kwargs"] = {
            "start_time": train_start,
            "end_time": latest_date,
            "fit_start_time": train_start,
            "fit_end_time": train_end,
            "instruments": "csi300",
        }

        rec = task_train(task, exp_name)

        pred = rec.load_object("pred.pkl")

        online_tool = OnlineToolR(exp_name)
        online_tool.reset_online_tag(rec)  # set to online model

        online_tool.update_online_pred(to_date=latest_date + pd.Timedelta(days=10))

        good_pred = rec.load_object("pred.pkl")

        mod_range = slice(latest_date - pd.Timedelta(days=20), latest_date - pd.Timedelta(days=10))
        mod_range2 = slice(latest_date - pd.Timedelta(days=9), latest_date - pd.Timedelta(days=2))
        mod_pred = good_pred.copy()

        mod_pred.loc[mod_range] = -1
        mod_pred.loc[mod_range2] = -2

        rec.save_objects(**{"pred.pkl": mod_pred})
        online_tool.update_online_pred(
            to_date=latest_date - pd.Timedelta(days=10), from_date=latest_date - pd.Timedelta(days=20)
        )

        updated_pred = rec.load_object("pred.pkl")

        # this range was regenerated by the ranged update, so it matches the good predictions again
        self.assertTrue((updated_pred.loc[mod_range] == good_pred.loc[mod_range]).all().item())
        # this range lies outside [from_date, to_date], so the modification is left untouched
        self.assertTrue((updated_pred.loc[mod_range2] == -2).all().item())
Example No. 7
    def test_1_dump_calendars(self):
        ori_calendars = set(
            map(
                pd.Timestamp,
                pd.read_csv(QLIB_DIR.joinpath("calendars", "day.txt"), header=None).loc[:, 0].values,
            )
        )
        res_calendars = set(D.calendar())
        assert len(ori_calendars - res_calendars) == len(res_calendars - ori_calendars) == 0, "dump calendars failed"
Example No. 8
    def __init__(self,
                 record: Recorder,
                 to_date=None,
                 hist_ref: int = 0,
                 freq="day",
                 fname="pred.pkl"):
        """
        Init PredUpdater.

        Args:
            record : Recorder
            to_date :
                update to prediction to the `to_date`
            hist_ref : int
                Sometimes, the dataset will have historical depends.
                Leave the problem to users to set the length of historical dependency

                .. note::

                    the start_time is not included in the hist_ref

        """
        # TODO: automate this hist_ref in the future.
        super().__init__(record=record)

        self.to_date = to_date
        self.hist_ref = hist_ref
        self.freq = freq
        self.fname = fname
        self.rmdl = RMDLoader(rec=record)

        latest_date = D.calendar(freq=freq)[-1]
        if to_date is None:
            to_date = latest_date
        to_date = pd.Timestamp(to_date)

        if to_date >= latest_date:
            self.logger.warning(
                f"The given `to_date`({to_date}) is later than `latest_date`({latest_date}). So `to_date` is clipped to `latest_date`."
            )
            to_date = latest_date
        self.to_date = to_date
        # FIXME: it will raise error when running routine with delay trainer
        # should we use another prediction updater for delay trainer?
        self.old_data: pd.DataFrame = record.load_object(fname)

        # dropna is for being compatible to some data with future information(e.g. label)
        # The recent label data should be updated together
        self.last_end = self.old_data.dropna().index.get_level_values("datetime").max()
Example No. 9
    def test_update_pred(self):
        """
        This test is for testing if it will raise error if the `to_date` is out of the boundary.
        """
        task = copy.deepcopy(CSI300_GBDT_TASK)

        task["record"] = {
            "class": "SignalRecord",
            "module_path": "qlib.workflow.record_temp",
            "kwargs": {
                "dataset": "<DATASET>",
                "model": "<MODEL>"
            },
        }

        exp_name = "online_srv_test"

        cal = D.calendar()
        latest_date = cal[-1]

        train_start = latest_date - pd.Timedelta(days=61)
        train_end = latest_date - pd.Timedelta(days=41)
        task["dataset"]["kwargs"]["segments"] = {
            "train": (train_start, train_end),
            "valid": (latest_date - pd.Timedelta(days=40),
                      latest_date - pd.Timedelta(days=21)),
            "test": (latest_date - pd.Timedelta(days=20), latest_date),
        }

        task["dataset"]["kwargs"]["handler"]["kwargs"] = {
            "start_time": train_start,
            "end_time": latest_date,
            "fit_start_time": train_start,
            "fit_end_time": train_end,
            "instruments": "csi300",
        }

        rec = task_train(task, exp_name)

        pred = rec.load_object("pred.pkl")

        online_tool = OnlineToolR(exp_name)
        online_tool.reset_online_tag(rec)  # set to online model

        online_tool.update_online_pred(to_date=latest_date + pd.Timedelta(days=10))
Example No. 10
def transform_end_date(end_date=None, freq="day"):
    """get previous trading date
    If end_date is -1, None, or end_date is greater than the maximum trading day, the last trading date is returned.
    Otherwise, returns the end_date
    ----------
    end_date: str
        end trading date
    date : pandas.Timestamp
        current date
    """
    from ..data import D

    last_date = D.calendar(freq=freq)[-1]
    if end_date is None or (str(end_date) == "-1") or (
            pd.Timestamp(last_date) < pd.Timestamp(end_date)):
        log.warning("\nInfo: the end_date in the configuration file is {}, "
                    "so the default last date {} is used.".format(
                        end_date, last_date))
        end_date = last_date
    return end_date
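A short sketch of the branches (assuming qlib has been initialized so that `D.calendar` works; the dates are illustrative):

from qlib.data import D

last_day = D.calendar(freq="day")[-1]

transform_end_date(None)          # -> last_day
transform_end_date("-1")          # -> last_day
transform_end_date("2999-01-01")  # beyond the calendar -> last_day (logs a warning)
transform_end_date("2020-06-01")  # within the calendar -> returned unchanged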
Example No. 11
    def _gen_day_dataset(self, config, conf_type):
        try:
            path = config.pop("path")
        except KeyError as e:
            raise ValueError("Must specify the path to save the dataset.") from e

        if os.path.isfile(path + "tmp_dataset.pkl"):
            start = time.time()
            print_log("Dataset exists, load from disk.", __name__)
        else:
            start = time.time()
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print_log("Generating dataset", __name__)
            self._prepare_calender_cache()
            dataset = init_instance_by_config(config)
            print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
            dataset.config(dump_all=False, recursive=True)
            dataset.to_pickle(path + "tmp_dataset.pkl")

        with open(path + "tmp_dataset.pkl", "rb") as f:
            new_dataset = pkl.load(f)

        time_list = D.calendar(start_time=self.start_time, end_time=self.end_time, freq="1min")[::240]

        def generate_dataset(times):
            if os.path.isfile(path + times.strftime("%Y-%m-%d") + ".pkl"):
                print("exist " + times.strftime("%Y-%m-%d"))
                return
            self._init_qlib(self.qlib_conf)
            end_times = times + datetime.timedelta(days=1)
            new_dataset.handler.config(**{"start_time": times, "end_time": end_times})
            if conf_type == "backtest":
                new_dataset.handler.setup_data()
            else:
                new_dataset.handler.setup_data(init_type=DataHandlerLP.IT_LS)
            new_dataset.config(dump_all=True, recursive=True)
            new_dataset.to_pickle(path + times.strftime("%Y-%m-%d") + ".pkl")

        Parallel(n_jobs=8)(delayed(generate_dataset)(times) for times in time_list)
Example No. 12
    def calendar_callback(self, cbody, task_uri):
        """Target function for the established process when the received task asks for calendar data.

        Call the data provider to acquire data and publish the calendar data.
        """

        start_time = cbody["start_time"]
        end_time = cbody["end_time"]
        if start_time == "None":
            start_time = None
        if end_time == "None":
            end_time = None
        freq = cbody["freq"]
        future = cbody.get("future", False)
        status_code = 0
        self.logger.debug("process calendar data at %f" % time.time())
        try:
            calendar_result = D.calendar(start_time, end_time, freq, future)
            calendar_result = [str(c) for c in calendar_result]
            self.logger.debug("finish processing calendar data and publish message at %f" % time.time())
            self.publish_message("calendar", calendar_result, status_code, task_uri)
        except Exception as e:
            self.logger.exception(f"Error while processing request %.200s" % e)
            self.publish_message("calendar", None, 1, task_uri, str(e))
Example No. 13
    def __init__(self, future=True, end_time=None):
        self._future = future
        self.cals = D.calendar(future=future, end_time=end_time)
Example No. 14
    def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
        import qlib
        from qlib.data import D

        qlib.init(provider_uri=self.qlib_data_1d_dir)
        return list(D.calendar(freq="day"))
Example No. 15
    def __init__(
        self,
        record: Recorder,
        to_date=None,
        from_date=None,
        hist_ref: Optional[int] = None,
        freq="day",
        fname="pred.pkl",
        loader_cls: type = RMDLoader,
    ):
        """
        Init PredUpdater.

        Expected behavior in following cases:
        - if `to_date` is greater than the max date in the calendar, the data will be updated to the latest date
        - if there are data before `from_date` or after `to_date`, only the data between `from_date` and `to_date` are affected.

        Args:
            record : Recorder
            to_date :
                update to prediction to the `to_date`
                if to_date is None:
                    data will updated to the latest date.
            from_date :
                the update will start from `from_date`
                if from_date is None:
                    the updating will occur on the next tick after the latest data in historical data
            hist_ref : int
                Sometimes, the dataset will have historical depends.
                Leave the problem to users to set the length of historical dependency
                If user doesn't specify this parameter, Updater will try to load dataset to automatically determine the hist_ref

                .. note::

                    the start_time is not included in the `hist_ref`; So the `hist_ref` will be `step_len - 1` in most cases

            loader_cls : type
                the class to load the model and dataset

        """
        # TODO: automate this hist_ref in the future.
        super().__init__(record=record)

        self.to_date = to_date
        self.hist_ref = hist_ref
        self.freq = freq
        self.fname = fname
        self.rmdl = loader_cls(rec=record)

        latest_date = D.calendar(freq=freq)[-1]
        if to_date is None:
            to_date = latest_date
        to_date = pd.Timestamp(to_date)

        if to_date >= latest_date:
            self.logger.warning(
                f"The given `to_date`({to_date}) is later than `latest_date`({latest_date}). So `to_date` is clipped to `latest_date`."
            )
            to_date = latest_date
        self.to_date = to_date

        # FIXME: it will raise error when running routine with delay trainer
        # should we use another prediction updater for delay trainer?
        self.old_data: pd.DataFrame = record.load_object(fname)
        if from_date is None:
            # dropna is for being compatible to some data with future information(e.g. label)
            # The recent label data should be updated together
            self.last_end = self.old_data.dropna().index.get_level_values("datetime").max()
        else:
            self.last_end = get_date_by_shift(from_date, -1, align="right")
Example No. 16
import qlib
from qlib.data import D
if __name__ == '__main__':
    qlib.init(provider_uri='~/.qlib/qlib_data/cn_data')
    print(
        D.calendar(start_time='2019-01-01', end_time='2020-12-31',
                   freq='day')[:5])
Example No. 17
def prepareTrainDataset(ifSavePortfolioIndex=False):
    print(
        "------------------------ Begin to prepare train dataset... ------------------------"
    )

    # read config file
    cf = configparser.ConfigParser()
    cf.read("config/config.ini")

    minDaysRange = int(cf.get("Parameter", "minDaysRange"))

    # offset of days
    numberOfYears = int(cf.get("Parameter", "numberOfYears"))
    numberOfMonths = int(cf.get("Parameter", "numberOfMonths"))
    numberOfDays = int(cf.get("Parameter", "numberOfDays"))

    # qlib init
    qlib.init(provider_uri='data/bin')

    # use the trading calendar as the standard for trading days
    calendar = D.calendar(freq='day')
    lastDay = calendar[-1]  # 2021-02-10 00:00:00
    firstDay = lastDay - DateOffset(years=numberOfYears,
                                    months=numberOfMonths,
                                    days=numberOfDays)  # 2018-02-10 00:00:00

    # exclude the influence of days without trading
    calendarBetweenFirstDayAndLastDay = D.calendar(freq='day',
                                                   start_time=firstDay,
                                                   end_time=lastDay)
    firstDayToAnalyze = calendarBetweenFirstDayAndLastDay[0]
    lastDayToAnalyze = calendarBetweenFirstDayAndLastDay[-1]

    # get portfolio
    pathOfDfSparsePortfolio = cf.get("Analyze", "pathOfDfSparsePortfolio")
    if not os.path.exists(pathOfDfSparsePortfolio):
        getSparseMatrixForPortfolioInAllFunds()
    dfSparsePortfolio = pd.read_csv(pathOfDfSparsePortfolio, index_col=0)

    if ifSavePortfolioIndex:
        dfPortfolioIndex = dfSparsePortfolio["FullElements"]
        dfPortfolioIndex.to_csv("data/dfPortfolioIndex.csv")

    folderToSaveTrainDataset = getFolderNameInConfig(
        "folderToSaveTrainDataset")  # the folder to save train dataset
    folderToSaveTestDataset = getFolderNameInConfig(
        "folderToSaveTestDataset")  # the folder to save test dataset

    count = 0
    instruments = D.instruments(market='all')
    for file in D.list_instruments(instruments=instruments, as_list=True):
        fundCode = file.split("_")[0]  # 000001

        if count % 100 == 0:
            print("count = %s\tfundCode=%s" % (count, fundCode))

        try:
            # skip this fund if no portfolio can be found for it
            try:
                dfSparsePortfolioForThisFund = dfSparsePortfolio[[fundCode]]
            except KeyError:
                continue

            # read file and remove empty line
            df = D.features([file], ['$AccumulativeNetAssetValue'],
                            start_time=firstDayToAnalyze,
                            end_time=lastDayToAnalyze)
            df.columns = ['AccumulativeNetAssetValue']
            #df = df.unstack(level=0)
            df["datetime"] = df.index.levels[1]

            # reset the index
            df = df.dropna(axis=0, subset=['datetime']).reset_index(drop=True)

            # e.g. http://fundf10.eastmoney.com/jjjz_010476.html: the 30-day return is 26%, so the annualized return would be unreasonably high
            if df.shape[0] <= minDaysRange:
                continue

            # count the days between first day and last day
            day = df['datetime']
            # TODO: how about fund 519858, which traded on 2018-01-28 (a Sunday)?
            firstDayInThisFund = day[day.first_valid_index()]  # 2018-02-12 00:00:00 (2018-02-10 is a Saturday)
            lastDayInThisFund = day[day.last_valid_index()]  # 2021-02-10 00:00:00

            # must have a value on the latest day
            if (lastDayInThisFund - lastDayToAnalyze).days != 0:
                continue

            df['daysDiffWithLastDay'] = df['datetime'].apply(
                lambda x: (lastDayInThisFund - x).days)

            # get the value in important days
            netValueInFirstDay = df[df['datetime'] == firstDayInThisFund][
                "AccumulativeNetAssetValue"].tolist()[0]  # 4.046

            # funds founded more than 3 years ago go into the train dataset
            if (firstDayInThisFund - firstDayToAnalyze).days <= 0:
                # compute the adjustment factor; the value over the 3 years can be recovered via adjustFactorToLatestDay * (value[0] / value[day])
                df["adjustFactorToLatestDay"] = df[
                    "AccumulativeNetAssetValue"] / netValueInFirstDay
                df = df[["daysDiffWithLastDay", "adjustFactorToLatestDay"]]

                # abandon the latest day, it's meaningless
                df.reset_index(drop=True, inplace=True)
                df = df.T.drop(labels=0, axis=1).T
                # reset index to concat with dfSparsePortfolioForThisFund
                df.reset_index(drop=True, inplace=True)
                df = df.T

                # duplicate to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = pd.concat(
                    [dfSparsePortfolioForThisFund.T] * df.shape[1])
                # reset index to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = dfSparsePortfolioForThisFund.reset_index(
                    drop=True).T

                dfDataset = pd.concat([dfSparsePortfolioForThisFund, df],
                                      axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTrainDataset,
                                 "%s.csv" % fundCode))
            else:
                dfInFirstDay = df[df['datetime'] ==
                                  firstDayInThisFund].reset_index(drop=True)
                dfInFirstDay = dfInFirstDay[["daysDiffWithLastDay"]].T
                dfInFirstDay[fundCode] = dfInFirstDay[0]
                dfDataset = pd.concat(
                    [dfSparsePortfolioForThisFund, dfInFirstDay[[fundCode]]],
                    axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTestDataset, "%s.csv" % fundCode))

            count += 1
        except Exception as e:
            print("fundCode = %s\terror = %s" % (fundCode, e))
            continue

    print("------------------------ Done. ------------------------")