Exemplo n.º 1
0
    def load_group_df(
        self,
        instruments,
        exprs: list,
        names: list,
        start_time: Union[str, pd.Timestamp] = None,
        end_time: Union[str, pd.Timestamp] = None,
        gp_name: str = None,
    ) -> pd.DataFrame:
        """Load one group of feature expressions into a DataFrame.

        Parameters
        ----------
        instruments :
            market name, instrument list, or ``None`` (falls back to "all").
        exprs : list
            expressions to evaluate via ``D.features``.
        names : list
            column names for the result (same length as ``exprs``).
        start_time, end_time :
            time range forwarded to ``D.features``.
        gp_name : str
            group name; used to look up the per-group freq / inst_processors.

        Returns
        -------
        pd.DataFrame
            the loaded features; index level order depends on ``self.swap_level``.
        """
        if instruments is None:
            warnings.warn("`instruments` is not set, will load all stocks")
            instruments = "all"
        if isinstance(instruments, str):
            instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
        elif self.filter_pipe is not None:
            warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

        # frequency may be configured per-group as a dict keyed by group name
        if isinstance(self.freq, dict):
            group_freq = self.freq[gp_name]
        else:
            group_freq = self.freq
        group_df = D.features(
            instruments,
            exprs,
            start_time,
            end_time,
            freq=group_freq,
            inst_processors=self.inst_processor.get(gp_name, []),
        )
        group_df.columns = names
        if self.swap_level:
            group_df = group_df.swaplevel().sort_index()  # NOTE: if swaplevel, return <datetime, instrument>
        return group_df
Exemplo n.º 2
0
    def load_data(self):
        """Smoke-test the data provider: print the first two trading days and
        load a handful of example features for the CSI300 universe."""
        first_two_days = D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2]
        print(first_two_days)

        # a market name such as 'csi300', or an explicit list like ['SH600570', 'SH600000']
        instruments = D.instruments('csi300')
        fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']
        data = D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day')
Exemplo n.º 3
0
 def test_2_dump_instruments(self):
     """Every symbol in the source csv dir must appear among dumped instruments."""
     # "sh600519.csv" -> "SH600519": drop the 4-char ".csv" suffix, upper-case
     ori_ins = set(
         map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
     res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
     # BUG FIX: the original asserted `len(ori_ins - res_ins) == len(ori_ins - res_ins) == 0`,
     # comparing the same expression against itself; compute the difference once.
     assert len(ori_ins - res_ins) == 0, "dump instruments failed"
Exemplo n.º 4
0
    def test_0_qlib_data(self):
        """Download the cn qlib dataset and sanity-check the loaded features."""
        GetData().qlib_data_cn(QLIB_DIR)
        features_df = D.features(D.instruments("csi300"), self.FIELDS)
        # column order must match the requested fields exactly
        self.assertListEqual(list(features_df.columns), self.FIELDS,
                             "get qlib data failed")
        # at least some rows must be fully populated
        self.assertFalse(features_df.dropna().empty, "get qlib data failed")
Exemplo n.º 5
0
    def test_handler_storage(self):
        """Compare random-fetch timings of the plain DataFrame-storage handler
        against the hash-storage handler on the same instrument universe.

        The two timed sections run the identical access pattern (100 single-stock
        fetches followed by 100 five-stock fetches), factored into ``_random_fetch``
        so the two measurements cannot drift apart.
        """
        # init data handler
        data_handler = TestHandler(**self.data_handler_kwargs)

        # init data handler with hashing storage
        data_handler_hs = TestHandler(**self.data_handler_kwargs,
                                      infer_processors=["HashStockFormat"])

        fetch_start_time = "2019-01-01"
        fetch_end_time = "2019-12-31"
        instruments = D.instruments(market=self.market)
        instruments = D.list_instruments(instruments=instruments,
                                         start_time=fetch_start_time,
                                         end_time=fetch_end_time,
                                         as_list=True)

        time_slice = slice(fetch_start_time, fetch_end_time)

        def _random_fetch(handler):
            # single stock: 100 fetches of one random instrument each
            for _ in range(100):
                random_index = np.random.randint(len(instruments), size=1)[0]
                handler.fetch(selector=(instruments[random_index], time_slice),
                              level=None)
            # multi stocks: 100 fetches of 5 random instruments each
            for _ in range(100):
                random_indexes = np.random.randint(len(instruments), size=5)
                fetch_stocks = [instruments[_index] for _index in random_indexes]
                handler.fetch(selector=(fetch_stocks, time_slice), level=None)

        with TimeInspector.logt("random fetch with DataFrame Storage"):
            _random_fetch(data_handler)

        with TimeInspector.logt("random fetch with HasingStock Storage"):
            _random_fetch(data_handler_hs)
Exemplo n.º 6
0
 def testClose(self):
     """Daily close-to-close returns of CSI300 stocks should stay within
     +/-10% at both the 10% and 90% percentiles."""
     returns = D.features(D.instruments("csi300"),
                          ["Ref($close, 1)/$close - 1"])
     stats = returns.describe(percentiles=np.arange(0.1, 1.0, 0.1))
     print(stats)
     for pct in ("90%", "10%"):
         self.assertLessEqual(abs(stats.loc[pct][0]), 0.1,
                              "Close value is abnormal")
Exemplo n.º 7
0
    def _get_old_data(self, qlib_data_dir: "str | Path"):
        """Load close/factor data from an existing qlib data directory.

        Initializes qlib on ``qlib_data_dir`` with all caches disabled (so the
        values reflect the bin files on disk) and returns a DataFrame holding
        ``$close/$factor`` and ``$adjclose/$close`` for all instruments,
        renamed to this object's ``_ori_close_field`` / ``_first_close_field``.

        NOTE(review): ``$close/$factor`` is presumably the un-adjusted close
        under qlib's adjustment convention — confirm before relying on it.
        """
        import qlib
        from qlib.data import D

        qlib_data_dir = str(Path(qlib_data_dir).expanduser().resolve())
        # caches disabled: we want values straight from the on-disk data
        qlib.init(provider_uri=qlib_data_dir, expression_cache=None, dataset_cache=None)
        df = D.features(D.instruments("all"), ["$close/$factor", "$adjclose/$close"])
        df.columns = [self._ori_close_field, self._first_close_field]
        return df
Exemplo n.º 8
0
    def _get_all_1d_data(self):
        """Load paused/volume/factor/close for all instruments at daily freq,
        returned flat (index reset) with the '$' prefixes stripped from columns."""
        import qlib
        from qlib.data import D

        qlib.init(provider_uri=self.qlib_data_1d_dir)
        df = D.features(D.instruments("all"), ["$paused", "$volume", "$factor", "$close"], freq="day")
        df.reset_index(inplace=True)
        df.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
        # "$close" -> "close" etc.; the renamed date/symbol columns are untouched
        df.columns = [col[1:] if col.startswith("$") else col for col in df.columns]
        return df
Exemplo n.º 9
0
def fill_1min_using_1d(
    data_1min_dir: "str | Path",
    qlib_data_1d_dir: "str | Path",
    max_workers: int = 16,
    date_field_name: str = "date",
    symbol_field_name: str = "symbol",
):
    """Use 1d data to fill in the missing symbols relative to 1min

    For every symbol present in the 1d qlib data but absent from the 1min csv
    directory, write a placeholder 1min csv: all value columns are NaN, only
    the symbol and ``paused_num`` are filled, over the symbol's 1d trading
    days expanded to a minute calendar.

    Parameters
    ----------
    data_1min_dir: str
        1min data dir
    qlib_data_1d_dir: str
        1d qlib data(bin data) dir, from: https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format
    max_workers: int
        ThreadPoolExecutor(max_workers), by default 16
    date_field_name: str
        date field name, by default date
    symbol_field_name: str
        symbol field name, by default symbol

    """
    data_1min_dir = Path(data_1min_dir).expanduser().resolve()
    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()

    # date range / symbol set actually present in the 1min csv files
    min_date, max_date = get_date_range(data_1min_dir, max_workers, date_field_name)
    symbols_1min = get_symbols(data_1min_dir)

    qlib.init(provider_uri=str(qlib_data_1d_dir))
    data_1d = D.features(D.instruments("all"), ["$close"], min_date, max_date, freq="day")

    # symbols known to the 1d data but without a 1min csv
    miss_symbols = set(data_1d.index.get_level_values(level="instrument").unique()) - set(symbols_1min)
    if not miss_symbols:
        logger.warning("More symbols in 1min than 1d, no padding required")
        return

    logger.info(f"miss_symbols  {len(miss_symbols)}: {miss_symbols}")
    # use an arbitrary existing 1min csv as the column template
    tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
    columns = tmp_df.columns
    # detect the symbol casing convention of existing files from the first valid row
    _si = tmp_df[symbol_field_name].first_valid_index()
    is_lower = tmp_df.loc[_si][symbol_field_name].islower()
    for symbol in tqdm(miss_symbols):
        if is_lower:
            symbol = symbol.lower()
        # 1d trading days for this symbol, expanded to a minute-level calendar
        index_1d = data_1d.loc(axis=0)[symbol.upper()].index
        index_1min = generate_minutes_calendar_from_daily(index_1d)
        index_1min.name = date_field_name
        # empty frame: every template column is NaN; the date lives in the index
        _df = pd.DataFrame(columns=columns, index=index_1min)
        if date_field_name in _df.columns:
            del _df[date_field_name]
        _df.reset_index(inplace=True)
        _df[symbol_field_name] = symbol
        # NOTE(review): padded rows carry paused_num == 0 — confirm the paused semantics
        _df["paused_num"] = 0
        _df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
Exemplo n.º 10
0
    def test_0_qlib_data(self):
        """Download the simplified cn 1d dataset and sanity-check the loaded features."""
        GetData().qlib_data(name="qlib_data_simple",
                            target_dir=QLIB_DIR,
                            region="cn",
                            interval="1d",
                            version="latest")
        features_df = D.features(D.instruments("csi300"), self.FIELDS)
        # columns must come back in exactly the requested order
        self.assertListEqual(list(features_df.columns), self.FIELDS,
                             "get qlib data failed")
        self.assertFalse(features_df.dropna().empty, "get qlib data failed")
Exemplo n.º 11
0
 def testClose(self):
     """Daily close-to-close returns of CSI300 stocks should be small: within
     +/-10% at the 80% percentile and +/-20% at the extremes."""
     returns = D.features(D.instruments('csi300'),
                          ['Ref($close, 1)/$close - 1'])
     stats = returns.describe(percentiles=np.arange(0.1, 0.9, 0.1))
     print(stats)
     self.assertLessEqual(abs(stats.loc["80%"][0]), 0.1,
                          "Close value is abnormal")
     self.assertLessEqual(abs(stats.loc["max"][0]), 0.2,
                          "Close value is abnormal")
     self.assertGreaterEqual(stats.loc["min"][0], -0.2,
                             "Close value is abnormal")
Exemplo n.º 12
0
    def test_setting(self):
        """ChangeInstrument evaluates an expression against another instrument.

        All the queries below are expected to pass; the last (excess-return)
        result is printed.
        """
        # plain close of the index SH000300, queried while loading SH600519
        res = D.features(["SH600519"], ["ChangeInstrument('SH000300', $close)"])

        # market return for "SH600519" — two equivalent spellings
        res = D.features(["SH600519"], ["ChangeInstrument('SH000300', Feature('close')/Ref(Feature('close'),1) -1)"])
        res = D.features(["SH600519"], ["ChangeInstrument('SH000300', $close/Ref($close,1) -1)"])
        # excess return: stock return minus market return
        res = D.features(
            ["SH600519"], ["($close/Ref($close,1) -1) - ChangeInstrument('SH000300', $close/Ref($close,1) -1)"]
        )
        print(res)
Exemplo n.º 13
0
def load_dataset(market='csi300'):
    """Load a 60-day price/volume feature matrix plus a 2-day-ahead vwap label.

    Parameters
    ----------
    market : str
        qlib market/universe name, by default 'csi300'.

    Returns
    -------
    (df, names, label_names) :
        df          - DataFrame with the feature columns ``names`` plus label columns
        names       - feature column names: OPEN0..OPEN59, ..., VOLUME0..VOLUME59
        label_names - label column names (LABEL0)
    """
    # features: current value + 59 lags; price fields are normalized by $close,
    # volume by $volume. The five price blocks were copy-pasted in the original;
    # build them in one loop so field/name ordering stays identical.
    fields = []
    names = []

    # NOTE: Ref($open, 0) != $open, so day 0 uses the bare field
    for price in ('open', 'high', 'low', 'close', 'vwap'):
        fields += ['$%s/$close' % price]
        fields += ['Ref($%s, %d)/$close' % (price, d) for d in range(1, 60)]
        names += ['%s%d' % (price.upper(), d) for d in range(60)]

    fields += ['$volume/$volume']  # 1
    fields += ['Ref($volume, %d)/$volume' % d for d in range(1, 60)]
    names += ['VOLUME%d' % d for d in range(60)]

    # labels: 2-day-ahead vwap return
    labels = ['Ref($vwap, -2)/Ref($vwap, -1)-1']
    label_names = ['LABEL0']

    ## load features
    print('loading features...')
    df = D.features(D.instruments(market), fields, start_time='2007-01-01')
    df.columns = names
    print('load features over')
    ## load labels
    if len(labels):
        print('loading labels...')
        # NOTE(review): labels are loaded for 'all' instruments and aligned onto
        # `df` by index; rows outside `market` are dropped by the assignment
        df_labels = D.features(D.instruments('all'), labels, start_time='2007-01-01')
        df_labels.columns = label_names
        df[label_names] = df_labels
        print('load labels over')

    return df, names, label_names
Exemplo n.º 14
0
    def load_group_df(self, instruments, exprs: list, names: list, start_time=None, end_time=None) -> pd.DataFrame:
        """Load one group of expressions, returning a <datetime, instrument> indexed DataFrame."""
        if instruments is None:
            warnings.warn("`instruments` is not set, will load all stocks")
            instruments = "all"
        if isinstance(instruments, str):
            instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
        elif self.filter_pipe is not None:
            warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

        group_df = D.features(instruments, exprs, start_time, end_time)
        group_df.columns = names
        # NOTE: always return <datetime, instrument>
        return group_df.swaplevel().sort_index()
Exemplo n.º 15
0
    def testCSI300(self):
        """The number of CSI300 constituents per day should be close to 300."""
        close_p = D.features(D.instruments("csi300"), ["$close"])
        by_day = close_p.groupby("datetime")
        size = by_day.size()
        cnt = by_day.count()["$close"]
        percentiles = np.arange(0.1, 1.0, 0.1)
        size_desc = size.describe(percentiles=percentiles)
        cnt_desc = cnt.describe(percentiles=percentiles)

        print(size_desc)
        print(cnt_desc)

        # rows per day: never more than ~305, and at least 290 on 80% of days
        self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
        self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")

        self.assertLessEqual(cnt_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
Exemplo n.º 16
0
def get_date_by_shift(trading_date, shift, future=False, clip_shift=True):
    """get trading date with shift bias will cur_date
        e.g. : shift == 1,  return next trading date
               shift == -1, return previous trading date

    Parameters
    ----------
    trading_date : pandas.Timestamp
        current date (must be a trading day)
    shift : int
        number of trading days to move; may be negative
    future : bool
        whether to use the future trading calendar
    clip_shift: bool
        clamp out-of-range results to the calendar bounds instead of raising

    Raises
    ------
    ValueError : `trading_date` is not a trading day
    IndexError : shifted index is out of range and `clip_shift` is False
    """
    from qlib.data import D

    cal = D.calendar(future=future)
    # BUG FIX: normalize once up front — the original only converted inside the
    # membership test and then bisected with the raw (possibly str) value,
    # which can mis-compare against the Timestamp calendar entries.
    trading_date = pd.to_datetime(trading_date)
    if trading_date not in list(cal):
        raise ValueError("{} is not trading day!".format(str(trading_date)))
    _index = bisect.bisect_left(cal, trading_date)
    shift_index = _index + shift
    if shift_index < 0 or shift_index >= len(cal):
        if clip_shift:
            # clamp to the first/last available trading day
            shift_index = np.clip(shift_index, 0, len(cal) - 1)
        else:
            raise IndexError(
                f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range"
            )
    return cal[shift_index]
Exemplo n.º 17
0
 def test_expr(self):
     """Check P() (point-in-time) expressions over a quarterly factor for sh600519.

     NOTE(review): the expected values come from a pinned PIT dataset; they
     will drift if the underlying financial data is refreshed.
     """
     fields = [
         "P(Mean($$roewa_q, 1))",
         "P($$roewa_q)",
         "P(Mean($$roewa_q, 2))",
         "P(Ref($$roewa_q, 1))",
         "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
     ]
     instruments = ["sh600519"]
     data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
     expect = """
                            P(Mean($$roewa_q, 1))  P($$roewa_q)  P(Mean($$roewa_q, 2))  P(Ref($$roewa_q, 1))  P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
     instrument datetime
     sh600519   2019-07-01               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-02               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-03               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-04               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-05               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-08               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-09               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-10               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-11               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-12               0.094737      0.094737               0.219691              0.344644                               0.219691
                2019-07-15               0.000000      0.000000               0.047369              0.094737                               0.047369
                2019-07-16               0.000000      0.000000               0.047369              0.094737                               0.047369
                2019-07-17               0.000000      0.000000               0.047369              0.094737                               0.047369
                2019-07-18               0.175322      0.175322               0.135029              0.094737                               0.135029
                2019-07-19               0.175322      0.175322               0.135029              0.094737                               0.135029
     """
     # only the last 15 rows are pinned (the window around the report publication)
     self.check_same(data.tail(15), expect)
Exemplo n.º 18
0
    def test_query(self):
        """Query two PIT factors daily; values should only change when a new report is published."""
        instruments = ["sh600519"]
        fields = ["P($$roewa_q)", "P($$yoyni_q)"]
        # Mao Tai published 2019Q2 report at 2019-07-13 & 2019-07-18
        # - http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index
        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
        res = """
               P($$roewa_q)  P($$yoyni_q)
        count    133.000000    133.000000
        mean       0.196412      0.277930
        std        0.097591      0.030262
        min        0.000000      0.243892
        25%        0.094737      0.243892
        50%        0.255220      0.304181
        75%        0.255220      0.305041
        max        0.344644      0.305041
        """
        self.check_same(data.describe(), res)

        # the last five days straddle the 2019-07-18 publication: values switch then
        res = """
                               P($$roewa_q)  P($$yoyni_q)
        instrument datetime
        sh600519   2019-07-15      0.000000      0.305041
                   2019-07-16      0.000000      0.305041
                   2019-07-17      0.000000      0.305041
                   2019-07-18      0.175322      0.252650
                   2019-07-19      0.175322      0.252650
        """
        self.check_same(data.tail(), res)
Exemplo n.º 19
0
 def test_pref_operator(self):
     """PRef pins a factor at a fixed report period; before that period is
     published the value is NaN (see the 201902 column through early 2018)."""
     instruments = ["sh600519"]
     fields = [
         "PRef($$roewa_q, 201902)",
         "PRef($$yoyni_q, 201801)",
         "P($$roewa_q)",
         "P($$roewa_q) / PRef($$roewa_q, 201801)",
     ]
     data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
     # NOTE(review): "except_data" — sic, this is the *expected* data
     except_data = """
                            PRef($$roewa_q, 201902)  PRef($$yoyni_q, 201801)  P($$roewa_q)  P($$roewa_q) / PRef($$roewa_q, 201801)
     instrument datetime                                                                                                          
     sh600519   2018-05-02                      NaN                 0.395075      0.088887                                1.000000
                2018-05-03                      NaN                 0.395075      0.088887                                1.000000
                2018-05-04                      NaN                 0.395075      0.088887                                1.000000
                2018-05-07                      NaN                 0.395075      0.088887                                1.000000
                2018-05-08                      NaN                 0.395075      0.088887                                1.000000
     ...                                        ...                      ...           ...                                     ...
                2019-07-15                 0.000000                 0.395075      0.000000                                0.000000
                2019-07-16                 0.000000                 0.395075      0.000000                                0.000000
                2019-07-17                 0.000000                 0.395075      0.000000                                0.000000
                2019-07-18                 0.175322                 0.395075      0.175322                                1.972414
                2019-07-19                 0.175322                 0.395075      0.175322                                1.972414
     
     [299 rows x 4 columns]
     """
     self.check_same(data, except_data)
Exemplo n.º 20
0
 def test_expr2(self):
     """PIT expressions compose with ordinary expressions ($close) and operators (Ref, Sum)."""
     instruments = ["sh600519"]
     fields = ["P($$roewa_q)", "P($$yoyni_q)"]
     fields += ["P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)"]
     fields += ["P(Sum($$yoyni_q, 4))"]
     fields += ["$close", "P($$roewa_q) * $close"]
     data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
     # NOTE(review): "except_data" — sic, this is the *expected* data
     except_data = """
                                    P($$roewa_q)  P($$yoyni_q)  P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)  P(Sum($$yoyni_q, 4))      $close  P($$roewa_q) * $close
     instrument datetime                                                                                                                                                       
     sh600519   2019-01-02      0.255220      0.243892                                           1.484224                           1.661578   63.595333              16.230801
                2019-01-03      0.255220      0.243892                                           1.484224                           1.661578   62.641907              15.987467
                2019-01-04      0.255220      0.243892                                           1.484224                           1.661578   63.915985              16.312637
                2019-01-07      0.255220      0.243892                                           1.484224                           1.661578   64.286530              16.407207
                2019-01-08      0.255220      0.243892                                           1.484224                           1.661578   64.212196              16.388237
     ...                             ...           ...                                                ...                                ...         ...                    ...
                2019-12-25      0.255819      0.219821                                           0.677052                           1.081693  122.150467              31.248409
                2019-12-26      0.255819      0.219821                                           0.677052                           1.081693  122.301315              31.286999
                2019-12-27      0.255819      0.219821                                           0.677052                           1.081693  125.307404              32.056015
                2019-12-30      0.255819      0.219821                                           0.677052                           1.081693  127.763992              32.684456
                2019-12-31      0.255819      0.219821                                           0.677052                           1.081693  127.462303              32.607277
     
     [244 rows x 6 columns]
     """
     self.check_same(data, except_data)
Exemplo n.º 21
0
 def test_2_dump_features(self):
     """Dump the features to qlib format, then verify loaded columns and content."""
     self.DUMP_DATA.dump_features(include_fields=self.FIELDS)
     features_df = D.features(self.STOCK_NAMES, self.QLIB_FIELDS)
     # stash one stock's slice for later tests in the suite
     TestDumpData.SIMPLE_DATA = features_df.loc(axis=0)[self.STOCK_NAMES[0], :]
     self.assertFalse(features_df.dropna().empty, "features data failed")
     self.assertListEqual(list(features_df.columns), self.QLIB_FIELDS,
                          "features columns failed")
Exemplo n.º 22
0
 def test_case(instruments, queries, note=None):
     """Run the expression queries against the instruments, print and return the result."""
     if note:
         print(note)
     print(f"checking {instruments} with queries {queries}")
     result = D.features(instruments, queries)
     print(result)
     return result
Exemplo n.º 23
0
 def clear_task(body):
     """Callback function when initialize rabbitmq."""
     task = pickle.loads(body)
     # rebuild the task's canonical URI from its type and arguments
     task_uri = D._uri(task["meta"]["type"], **task["args"])
     # delete task
     pop_ssids_from_redis(task_uri)
Exemplo n.º 24
0
def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None):
    """get trading date with shift bias will cur_date
        e.g. : shift == 1,  return next trading date
               shift == -1, return previous trading date

    Parameters
    ----------
    trading_date : pandas.Timestamp
        current date
    shift : int
        number of trading days to move; may be negative
    clip_shift: bool
        clamp out-of-range results to the calendar bounds instead of raising
    freq : str
        calendar frequency, by default "day"
    align : Optional[str]
        When align is None, this function will raise ValueError if `trading_date` is not a trading date
        when align is "left"/"right", it will try to align to left/right nearest trading date before shifting when `trading_date` is not a trading date

    Raises
    ------
    ValueError : `trading_date` is not a trading day (align is None), or `align` is invalid
    IndexError : shifted index is out of range and `clip_shift` is False
    """
    from qlib.data import D  # pylint: disable=C0415

    cal = D.calendar(future=future, freq=freq)
    trading_date = pd.to_datetime(trading_date)
    if align is None:
        # PERF: the original materialized the whole calendar (`list(cal)`) just for
        # a linear membership test; one bisect probe gives the same answer.
        _index = bisect.bisect_left(cal, trading_date)
        if _index >= len(cal) or cal[_index] != trading_date:
            raise ValueError("{} is not trading day!".format(str(trading_date)))
    elif align == "left":
        _index = bisect.bisect_right(cal, trading_date) - 1
    elif align == "right":
        _index = bisect.bisect_left(cal, trading_date)
    else:
        raise ValueError(f"align with value `{align}` is not supported")
    shift_index = _index + shift
    if shift_index < 0 or shift_index >= len(cal):
        if clip_shift:
            # clamp to the first/last available trading day
            shift_index = np.clip(shift_index, 0, len(cal) - 1)
        else:
            raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range")
    return cal[shift_index]
Exemplo n.º 25
0
    def task_callback(self, ch, method, properties, body):
        """Callback function when a published task is received.

        When a published task is received from rabbitmq,
        a new process will be established to attend to the task.
        `self.channel.basic_qos(prefetch_count=1)` is used to control the maximum concurrency of data processing process.
        """
        self.logger.debug("Receive task from queue at %f" % time.time())
        # NOTE(review): tasks arrive pickled — unsafe on an untrusted queue; confirm the broker is trusted
        tbody = pickle.loads(body)
        ttype = tbody["meta"]["type"]
        ssid = tbody["meta"]["ssid"]
        self.logger.info("receive %s task : '%.200s'" % (ttype, tbody))

        # task_uri canonically identifies the request, so identical requests share one computation
        task_uri = D._uri(ttype, **(tbody["args"]))
        self.logger.debug("check task  at %f" % time.time())
        qlen = add_to_task_l_and_check_qlen(task_uri, ssid)
        if qlen == 1:  # first to create the task queue
            # no task is running
            # here the data processes will not use the historical memory cache as before
            # acutally the memory cache is used for accelerate the inside of a
            # process

            self.logger.debug("start processing data at %f" % time.time())
            # In order to no longer clear the MemoryCache, a process has been created here.
            p = multiprocessing.Process(target=getattr(self, "%s_callback" % ttype), args=(tbody["args"], task_uri))
            p.start()
            p.join()
        else:
            # an identical task is already in flight; its result will be published to this ssid too
            self.logger.debug(f"There has already been the same task. Just append the ssid {ssid}.")

        # always ack — the task is either delegated to a process or attached to a running one
        ch.basic_ack(delivery_tag=method.delivery_tag)
Exemplo n.º 26
0
    def __init__(self,
                 record: Recorder,
                 to_date=None,
                 hist_ref: int = 0,
                 freq="day"):
        """
        Init PredUpdater.

        Args:
            record : Recorder
            to_date :
                update to prediction to the `to_date`; defaults to the last
                date of the trading calendar when not given
            hist_ref : int
                Sometimes, the dataset will have historical depends.
                Leave the problem to users to set the length of historical dependency

                .. note::

                    the start_time is not included in the hist_ref

        """
        # TODO: automate this hist_ref in the future.
        super().__init__(record=record)

        self.hist_ref = hist_ref
        self.freq = freq
        self.rmdl = RMDLoader(rec=record)

        # BUG FIX: compare to None with `is`, not `==` (PEP 8); also dropped the
        # redundant `self.to_date = to_date` that was immediately overwritten below.
        if to_date is None:
            to_date = D.calendar(freq=freq)[-1]
        self.to_date = pd.Timestamp(to_date)
        # last datetime already covered by the stored prediction
        self.old_pred = record.load_object("pred.pkl")
        self.last_end = self.old_pred.index.get_level_values("datetime").max()
Exemplo n.º 27
0
 def _compare(self, file_path: Path):
     """Compare one original csv against the qlib-loaded data for the same symbol.

     Returns one of NOT_IN_FEATURES / COMPARE_TRUE / COMPARE_FALSE / COMPARE_ERROR.
     """
     # BUG FIX: str.strip removes a *character set*, not a suffix — e.g.
     # "vsc.csv".strip(".csv") would also eat the surrounding 'v'/'s'/'c'
     # characters and corrupt the symbol. Remove the suffix properly.
     symbol = file_path.name
     if self.file_suffix and symbol.endswith(self.file_suffix):
         symbol = symbol[: -len(self.file_suffix)]
     if symbol.lower() not in self.qlib_symbols:
         return self.NOT_IN_FEATURES
     # qlib data ("$close" -> "close"; '$' only ever appears as the prefix)
     qlib_df = D.features([symbol], self.qlib_fields, freq=self.freq)
     qlib_df.rename(columns={_c: _c.strip("$")
                             for _c in qlib_df.columns},
                    inplace=True)
     # csv data
     origin_df = pd.read_csv(file_path)
     origin_df[self.date_field_name] = pd.to_datetime(
         origin_df[self.date_field_name])
     if self.symbol_field_name not in origin_df.columns:
         origin_df[self.symbol_field_name] = symbol
     origin_df.set_index([self.symbol_field_name, self.date_field_name],
                         inplace=True)
     origin_df.index.names = qlib_df.index.names
     try:
         compare = datacompy.Compare(
             origin_df,
             qlib_df,
             on_index=True,
             abs_tol=1e-08,  # Optional, defaults to 0
             rel_tol=1e-05,  # Optional, defaults to 0
             df1_name="Original",  # Optional, defaults to 'df1'
             df2_name="New",  # Optional, defaults to 'df2'
         )
         _r = compare.matches(ignore_extra_columns=True)
         return self.COMPARE_TRUE if _r else self.COMPARE_FALSE
     except Exception as e:
         logger.warning(f"{symbol} compare error: {e}")
         return self.COMPARE_ERROR
Exemplo n.º 28
0
    def instrument_callback(self, ibody, task_uri):
        """Target function for the established process when the received task asks for instrument data.

        Call the data provider to acquire data and publish the instrument data.
        """

        instruments = ibody["instruments"]
        start_time = ibody["start_time"]
        end_time = ibody["end_time"]
        # the client serializes missing bounds as the literal string "None"
        if start_time == "None":
            start_time = None
        if end_time == "None":
            end_time = None
        freq = ibody["freq"]
        as_list = ibody["as_list"]
        status_code = 0
        # TODO: add exceptions detection and modify status_code
        self.logger.debug("process instrument data at %f" % time.time())
        try:
            instrument_result = D.list_instruments(instruments, start_time, end_time, freq, as_list)
            if isinstance(instrument_result, dict):
                # make the (start, end) spans JSON-serializable for publishing
                instrument_result = {i: [(str(s), str(e)) for s, e in t] for i, t in instrument_result.items()}
            self.logger.debug("finish processing instrument data and publish message at %f" % time.time())
            self.publish_message("instrument", instrument_result, status_code, task_uri)
        except Exception as e:
            # BUG FIX: was an f-string (no placeholders) combined with %-formatting;
            # pass lazy %-args to the logger instead.
            self.logger.exception("Error while processing request %.200s", e)
            self.publish_message("instrument", None, 1, task_uri, str(e))
Exemplo n.º 29
0
    def test_no_exist_data(self):
        """Instruments without PIT data (sh601988) should yield NaN factor columns
        while regular fields ($close) still load."""
        fields = ["P($$roewa_q)", "P($$yoyni_q)", "$close"]
        data = D.features(["sh600519", "sh601988"],
                          fields,
                          start_time="2019-01-01",
                          end_time="2019-07-19",
                          freq="day")
        data[
            "$close"] = 1  # in case of different dataset gives different values
        expect = """
                               P($$roewa_q)  P($$yoyni_q)  $close
        instrument datetime
        sh600519   2019-01-02       0.25522      0.243892       1
                   2019-01-03       0.25522      0.243892       1
                   2019-01-04       0.25522      0.243892       1
                   2019-01-07       0.25522      0.243892       1
                   2019-01-08       0.25522      0.243892       1
        ...                             ...           ...     ...
        sh601988   2019-07-15           NaN           NaN       1
                   2019-07-16           NaN           NaN       1
                   2019-07-17           NaN           NaN       1
                   2019-07-18           NaN           NaN       1
                   2019-07-19           NaN           NaN       1

        [266 rows x 3 columns]
        """
        self.check_same(data, expect)
Exemplo n.º 30
0
    def test_exp_06(self):
        """Build scaled first-difference expressions over t-second resamples of the
        order-book levels (prices and sizes), then load them at 1min frequency."""
        t = 3

        def _diff_expr(name, index, method):
            # scaled first difference of a t-second resample of $<name><index>
            return f'2 * (TResample(${name}{index}, "{t}s", "{method}") - Ref(TResample(${name}{index}, "{t}s", "{method}"), 1)) / {t}'

        exprs = []
        names = []
        # price levels: last-value resample, normalized by the bid+ask level-1 sum
        for i in range(1, 11):
            for name in ["bid", "ask"]:
                exprs.append(
                    f"TResample({_diff_expr(name, i, 'last')}, '1min', 'mean') / {self.expr_sum_buy_ask_1}"
                )
                names.append(f"p_diff_{name}{i}_{t}s")

        # size levels: mean resample, normalized by total volume
        for i in range(1, 11):
            for name in ["asize", "bsize"]:
                exprs.append(
                    f"TResample({_diff_expr(name, i, 'mean')}, '1min', 'mean') / {self.total_volume}"
                )
                names.append(f"v_diff_{name}{i}_{t}s")

        df = D.features(self.stocks_list, fields=exprs, freq="ticks")
        df.columns = names
        print(df)