def test_setting(self):
    """Smoke-test ChangeInstrument expressions; every query below is expected to pass."""
    # Market-index close aligned onto "SH600519" rows.
    result = D.features(["SH600519"], ["ChangeInstrument('SH000300', $close)"])
    # Market return, spelled with the explicit Feature() operator.
    result = D.features(
        ["SH600519"], ["ChangeInstrument('SH000300', Feature('close')/Ref(Feature('close'),1) -1)"]
    )
    # Market return, spelled with the $-field shorthand.
    result = D.features(["SH600519"], ["ChangeInstrument('SH000300', $close/Ref($close,1) -1)"])
    # Excess return: the stock's return minus the market return.
    result = D.features(
        ["SH600519"],
        ["($close/Ref($close,1) -1) - ChangeInstrument('SH000300', $close/Ref($close,1) -1)"],
    )
    print(result)
def load_dataset(market='csi300'):
    """Build a 60-day price/volume feature set plus a next-day vwap-return label.

    Returns (df, feature_names, label_names).
    """
    fields = []
    names = []
    # Price fields are normalized by the current close; volume by the current
    # volume.  Day 0 uses the raw field (NOTE: Ref($open, 0) != $open), days
    # 1..59 use Ref(...).
    for field in ['open', 'high', 'low', 'close', 'vwap']:
        fields.append('$%s/$close' % field)
        fields.extend('Ref($%s, %d)/$close' % (field, d) for d in range(1, 60))
        names.extend('%s%d' % (field.upper(), d) for d in range(60))
    fields.append('$volume/$volume')  # constant 1, kept so columns stay aligned
    fields.extend('Ref($volume, %d)/$volume' % d for d in range(1, 60))
    names.extend('VOLUME%d' % d for d in range(60))

    # Label: vwap return realized two days ahead relative to one day ahead.
    labels = ['Ref($vwap, -2)/Ref($vwap, -1)-1']
    label_names = ['LABEL0']

    # Load features.
    print('loading features...')
    df = D.features(D.instruments(market), fields, start_time='2007-01-01')
    df.columns = names
    print('load features over')

    # Load labels (over the full universe so every instrument gets one).
    if len(labels):
        print('loading labels...')
        df_labels = D.features(D.instruments('all'), labels, start_time='2007-01-01')
        df_labels.columns = label_names
        df[label_names] = df_labels
        print('load labels over')
    return df, names, label_names
def _compare(self, file_path: Path):
    """Compare one source CSV against the features already dumped into qlib.

    Returns one of self.NOT_IN_FEATURES / COMPARE_TRUE / COMPARE_FALSE /
    COMPARE_ERROR.
    """
    # BUG FIX: str.strip(suffix) removes any run of the suffix's *characters*
    # from both ends (e.g. "cs600.csv".strip(".csv") -> "600"), corrupting
    # symbols whose names begin or end with those letters.  Remove the suffix
    # only when it actually terminates the file name.
    name = file_path.name
    if self.file_suffix and name.endswith(self.file_suffix):
        symbol = name[: -len(self.file_suffix)]
    else:
        symbol = name
    if symbol.lower() not in self.qlib_symbols:
        return self.NOT_IN_FEATURES
    # qlib data; drop the leading "$" so columns match the CSV's.
    qlib_df = D.features([symbol], self.qlib_fields, freq=self.freq)
    qlib_df.rename(columns={_c: _c.strip("$") for _c in qlib_df.columns}, inplace=True)
    # csv data, indexed the same way as the qlib frame.
    origin_df = pd.read_csv(file_path)
    origin_df[self.date_field_name] = pd.to_datetime(origin_df[self.date_field_name])
    if self.symbol_field_name not in origin_df.columns:
        origin_df[self.symbol_field_name] = symbol
    origin_df.set_index([self.symbol_field_name, self.date_field_name], inplace=True)
    origin_df.index.names = qlib_df.index.names
    try:
        compare = datacompy.Compare(
            origin_df,
            qlib_df,
            on_index=True,
            abs_tol=1e-08,  # Optional, defaults to 0
            rel_tol=1e-05,  # Optional, defaults to 0
            df1_name="Original",  # Optional, defaults to 'df1'
            df2_name="New",  # Optional, defaults to 'df2'
        )
        _r = compare.matches(ignore_extra_columns=True)
        return self.COMPARE_TRUE if _r else self.COMPARE_FALSE
    except Exception as e:
        logger.warning(f"{symbol} compare error: {e}")
        return self.COMPARE_ERROR
def load_data(self):
    """Sanity-check the data API: print two calendar days and fetch basic features."""
    first_days = D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2]
    print(first_days)
    universe = D.instruments('csi300')  # e.g. ['SH600570', 'SH600000']
    expressions = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']
    data = D.features(universe, expressions, start_time='2010-01-01', end_time='2017-12-31', freq='day')
def test_2_dump_features(self):
    """Dump features for all stocks, then verify columns and non-empty data."""
    self.DUMP_DATA.dump_features(include_fields=self.FIELDS)
    features = D.features(self.STOCK_NAMES, self.QLIB_FIELDS)
    # Cache one stock's slice for the single-file dump test that runs later.
    TestDumpData.SIMPLE_DATA = features.loc(axis=0)[self.STOCK_NAMES[0], :]
    self.assertFalse(features.dropna().empty, "features data failed")
    self.assertListEqual(list(features.columns), self.QLIB_FIELDS, "features columns failed")
def test_0_qlib_data(self):
    """Download the CN dataset and verify the requested fields load."""
    GetData().qlib_data_cn(QLIB_DIR)
    loaded = D.features(D.instruments("csi300"), self.FIELDS)
    self.assertListEqual(list(loaded.columns), self.FIELDS, "get qlib data failed")
    self.assertFalse(loaded.dropna().empty, "get qlib data failed")
def load_group_df(
    self,
    instruments,
    exprs: list,
    names: list,
    start_time: Union[str, pd.Timestamp] = None,
    end_time: Union[str, pd.Timestamp] = None,
    gp_name: str = None,
) -> pd.DataFrame:
    """Load one feature group as a DataFrame with columns renamed to *names*.

    A string *instruments* is resolved through D.instruments (honouring
    self.filter_pipe); a list is used as-is, in which case filter_pipe is
    ignored with a warning.  When self.swap_level is set, the result is
    re-indexed as <datetime, instrument>.
    """
    if instruments is None:
        warnings.warn("`instruments` is not set, will load all stocks")
        instruments = "all"
    if isinstance(instruments, str):
        instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
    elif self.filter_pipe is not None:
        warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

    # Per-group frequency may be configured as a dict keyed by group name.
    if isinstance(self.freq, dict):
        freq = self.freq[gp_name]
    else:
        freq = self.freq

    group_df = D.features(
        instruments, exprs, start_time, end_time, freq=freq, inst_processors=self.inst_processor.get(gp_name, [])
    )
    group_df.columns = names
    if self.swap_level:
        # NOTE: if swaplevel, return <datetime, instrument>
        group_df = group_df.swaplevel().sort_index()
    return group_df
def test_query(self):
    """Query point-in-time (PIT) fields and compare against known values.

    P($$field) reads the value of a quarterly-report field as it was known
    on each trading day, so values jump on report-publication dates.
    """
    instruments = ["sh600519"]
    fields = ["P($$roewa_q)", "P($$yoyni_q)"]
    # Mao Tai published 2019Q2 report at 2019-07-13 & 2019-07-18
    # - http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index
    data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
    res = """
           P($$roewa_q)  P($$yoyni_q)
    count    133.000000    133.000000
    mean       0.196412      0.277930
    std        0.097591      0.030262
    min        0.000000      0.243892
    25%        0.094737      0.243892
    50%        0.255220      0.304181
    75%        0.255220      0.305041
    max        0.344644      0.305041
    """
    self.check_same(data.describe(), res)

    res = """
                           P($$roewa_q)  P($$yoyni_q)
    instrument datetime
    sh600519   2019-07-15      0.000000      0.305041
               2019-07-16      0.000000      0.305041
               2019-07-17      0.000000      0.305041
               2019-07-18      0.175322      0.252650
               2019-07-19      0.175322      0.252650
    """
    self.check_same(data.tail(), res)
def test_no_exist_data(self):
    """PIT fields for an instrument without PIT data should come back as NaN."""
    fields = ["P($$roewa_q)", "P($$yoyni_q)", "$close"]
    data = D.features(["sh600519", "sh601988"], fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
    # Pin $close to a constant: different datasets give different close values,
    # and this test only cares about the PIT columns.
    data["$close"] = 1  # in case of different dataset gives different values
    expect = """
                           P($$roewa_q)  P($$yoyni_q)  $close
    instrument datetime
    sh600519   2019-01-02       0.25522      0.243892       1
               2019-01-03       0.25522      0.243892       1
               2019-01-04       0.25522      0.243892       1
               2019-01-07       0.25522      0.243892       1
               2019-01-08       0.25522      0.243892       1
    ...                             ...           ...     ...
    sh601988   2019-07-15           NaN           NaN       1
               2019-07-16           NaN           NaN       1
               2019-07-17           NaN           NaN       1
               2019-07-18           NaN           NaN       1
               2019-07-19           NaN           NaN       1

    [266 rows x 3 columns]
    """
    self.check_same(data, expect)
def test_exp_06(self):
    """3-second momentum of price/size at every book level, averaged per minute."""
    t = 3

    def _diff_expr(name, index, method):
        # First difference of the t-second resampled series, scaled to a per-second rate.
        return f'2 * (TResample(${name}{index}, "{t}s", "{method}") - Ref(TResample(${name}{index}, "{t}s", "{method}"), 1)) / {t}'

    price_sides = ["bid", "ask"]
    size_sides = ["asize", "bsize"]
    # Price features: last-in-bar sampling, normalized by the level-1 mid sum.
    exprs = [
        f"TResample({_diff_expr(name, i, 'last')}, '1min', 'mean') / {self.expr_sum_buy_ask_1}"
        for i in range(1, 11)
        for name in price_sides
    ]
    names = [f"p_diff_{name}{i}_{t}s" for i in range(1, 11) for name in price_sides]
    # Size features: mean-in-bar sampling, normalized by total volume.
    exprs += [
        f"TResample({_diff_expr(name, i, 'mean')}, '1min', 'mean') / {self.total_volume}"
        for i in range(1, 11)
        for name in size_sides
    ]
    names += [f"v_diff_{name}{i}_{t}s" for i in range(1, 11) for name in size_sides]

    df = D.features(self.stocks_list, fields=exprs, freq="ticks")
    df.columns = names
    print(df)
def test_pref_operator(self):
    """PRef($$field, period) pins a PIT field to one fixed report period.

    The 201902 column is NaN until that quarter's report is published
    (2019-07-18), while the 201801 column is constant throughout.
    """
    instruments = ["sh600519"]
    fields = [
        "PRef($$roewa_q, 201902)",
        "PRef($$yoyni_q, 201801)",
        "P($$roewa_q)",
        "P($$roewa_q) / PRef($$roewa_q, 201801)",
    ]
    data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
    except_data = """
                           PRef($$roewa_q, 201902)  PRef($$yoyni_q, 201801)  P($$roewa_q)  P($$roewa_q) / PRef($$roewa_q, 201801)
    instrument datetime
    sh600519   2018-05-02                      NaN                 0.395075      0.088887                                1.000000
               2018-05-03                      NaN                 0.395075      0.088887                                1.000000
               2018-05-04                      NaN                 0.395075      0.088887                                1.000000
               2018-05-07                      NaN                 0.395075      0.088887                                1.000000
               2018-05-08                      NaN                 0.395075      0.088887                                1.000000
    ...                                        ...                      ...           ...                                     ...
               2019-07-15                 0.000000                 0.395075      0.000000                                0.000000
               2019-07-16                 0.000000                 0.395075      0.000000                                0.000000
               2019-07-17                 0.000000                 0.395075      0.000000                                0.000000
               2019-07-18                 0.175322                 0.395075      0.175322                                1.972414
               2019-07-19                 0.175322                 0.395075      0.175322                                1.972414

    [299 rows x 4 columns]
    """
    self.check_same(data, except_data)
def test_case(instruments, queries, note=None):
    """Fetch *queries* for *instruments*, echoing what is being checked.

    Returns the resulting DataFrame so callers can make further assertions.
    """
    if note:
        print(note)
    print(f"checking {instruments} with queries {queries}")
    result = D.features(instruments, queries)
    print(result)
    return result
def test_expr2(self):
    """PIT fields composed inside P(...) expressions and mixed with daily fields."""
    instruments = ["sh600519"]
    fields = ["P($$roewa_q)", "P($$yoyni_q)"]
    # Expressions evaluated in the report-period dimension, then observed point-in-time.
    fields += ["P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)"]
    fields += ["P(Sum($$yoyni_q, 4))"]
    # PIT values may also be combined with ordinary daily fields.
    fields += ["$close", "P($$roewa_q) * $close"]
    data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
    except_data = """
                           P($$roewa_q)  P($$yoyni_q)  P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)  P(Sum($$yoyni_q, 4))      $close  P($$roewa_q) * $close
    instrument datetime
    sh600519   2019-01-02      0.255220      0.243892                                           1.484224                      1.661578   63.595333              16.230801
               2019-01-03      0.255220      0.243892                                           1.484224                      1.661578   62.641907              15.987467
               2019-01-04      0.255220      0.243892                                           1.484224                      1.661578   63.915985              16.312637
               2019-01-07      0.255220      0.243892                                           1.484224                      1.661578   64.286530              16.407207
               2019-01-08      0.255220      0.243892                                           1.484224                      1.661578   64.212196              16.388237
    ...                            ...           ...                                                ...                           ...         ...                    ...
               2019-12-25      0.255819      0.219821                                           0.677052                      1.081693  122.150467              31.248409
               2019-12-26      0.255819      0.219821                                           0.677052                      1.081693  122.301315              31.286999
               2019-12-27      0.255819      0.219821                                           0.677052                      1.081693  125.307404              32.056015
               2019-12-30      0.255819      0.219821                                           0.677052                      1.081693  127.763992              32.684456
               2019-12-31      0.255819      0.219821                                           0.677052                      1.081693  127.462303              32.607277

    [244 rows x 6 columns]
    """
    self.check_same(data, except_data)
def test_expr(self):
    """Operators applied to PIT series: Mean/Ref over report periods.

    Mean($$x, 1) should equal $$x, and Mean($$x, 2) should equal the average
    of the current and previous report period's value.
    """
    fields = [
        "P(Mean($$roewa_q, 1))",
        "P($$roewa_q)",
        "P(Mean($$roewa_q, 2))",
        "P(Ref($$roewa_q, 1))",
        "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
    ]
    instruments = ["sh600519"]
    data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
    expect = """
                           P(Mean($$roewa_q, 1))  P($$roewa_q)  P(Mean($$roewa_q, 2))  P(Ref($$roewa_q, 1))  P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
    instrument datetime
    sh600519   2019-07-01               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-02               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-03               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-04               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-05               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-08               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-09               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-10               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-11               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-12               0.094737      0.094737               0.219691              0.344644                               0.219691
               2019-07-15               0.000000      0.000000               0.047369              0.094737                               0.047369
               2019-07-16               0.000000      0.000000               0.047369              0.094737                               0.047369
               2019-07-17               0.000000      0.000000               0.047369              0.094737                               0.047369
               2019-07-18               0.175322      0.175322               0.135029              0.094737                               0.135029
               2019-07-19               0.175322      0.175322               0.135029              0.094737                               0.135029
    """
    self.check_same(data.tail(15), expect)
def testClose(self):
    """Daily close-to-close returns should stay small at the 10%/90% quantiles."""
    returns = D.features(D.instruments("csi300"), ["Ref($close, 1)/$close - 1"])
    stats = returns.describe(percentiles=np.arange(0.1, 1.0, 0.1))
    print(stats)
    self.assertLessEqual(abs(stats.loc["90%"][0]), 0.1, "Close value is abnormal")
    self.assertLessEqual(abs(stats.loc["10%"][0]), 0.1, "Close value is abnormal")
def test_3_dump_features_simple(self):
    """Dump a single stock's CSV and check it matches the full-dump slice."""
    stock = self.STOCK_NAMES[0]
    dumper = DumpData(csv_path=SOURCE_DIR.joinpath(f"{stock.lower()}.csv"), qlib_dir=QLIB_DIR)
    dumper.dump_features(include_fields=self.FIELDS, calendar_path=QLIB_DIR.joinpath("calendars", "day.txt"))
    single = D.features([stock], self.QLIB_FIELDS)
    # Must agree with the slice cached by the full-universe dump test.
    self.assertEqual(len(single), len(TestDumpData.SIMPLE_DATA), "dump features simple failed")
    self.assertTrue(np.isclose(single.dropna(), self.SIMPLE_DATA.dropna()).all(), "dump features simple failed")
def _get_old_data(self, qlib_data_dir: [str, Path]):
    """Load close/factor ratios from an existing qlib dataset (caches disabled)."""
    import qlib
    from qlib.data import D

    data_dir = str(Path(qlib_data_dir).expanduser().resolve())
    # Disable caches so we always read the on-disk data as-is.
    qlib.init(provider_uri=data_dir, expression_cache=None, dataset_cache=None)
    old_df = D.features(D.instruments("all"), ["$close/$factor", "$adjclose/$close"])
    old_df.columns = [self._ori_close_field, self._first_close_field]
    return old_df
def test_basic01(self):
    """Resample the best-ask price to 1-minute bars (last value in each bar)."""
    resampled = D.features(
        self.stocks_list,
        fields=["TResample($ask1, '1min', 'last')"],
        freq="ticks",
        start_time="20201230",
        end_time="20210101",
    )
    print(resampled)
def test_basic03(self):
    """Load a raw field from order-level data."""
    orders = D.features(
        self.stocks_list,
        fields=["$function_code"],
        freq="order",
        start_time="20201230",
        end_time="20210101",
    )
    print(orders)
def test_regiter_custom_ops(self):
    """Custom operators (Diff, Distance) should be usable inside expressions."""
    instruments = ["SH600000"]
    fields = ["Diff($close)", "Distance($close, Ref($close, 1))"]
    result = D.features(instruments, fields, start_time="2010-01-01", end_time="2017-12-31", freq="day")
    print(result)
def test_exp_09_trans(self):
    """Trade-intensity features from transaction-level data.

    For each direction code ("Gt"/"Lt" — presumably greater-than/less-than
    trade classification; confirm against expr7_3_init), the 3-second
    first-difference of the cumulative quantity is divided by 3 (a per-second
    rate) and averaged into 1-minute bars.
    """
    exprs = [
        f'TResample(Div(Sub(TResample({self.expr7_3_init("C", "Gt", "3")}, "3s", "last"), Ref(TResample({self.expr7_3_init("C", "Gt", "3")}, "3s","last"), 1)), 3), "1min", "mean")',
        f'TResample(Div(Sub(TResample({self.expr7_3_init("C", "Lt", "3")}, "3s", "last"), Ref(TResample({self.expr7_3_init("C", "Lt", "3")}, "3s","last"), 1)), 3), "1min", "mean")',
    ]
    names = ["ca_diff_intensity_3s_3s", "cb_diff_intensity_3s_3s"]
    df = D.features(self.stocks_list, fields=exprs, freq="transaction")
    df.columns = names
    print(df)
def test_basic(self):
    """Load raw two-level quote fields from tick data.

    NOTE: this data contains a lot of zeros in $askX and $bidX.
    """
    quotes = D.features(
        self.stocks_list,
        fields=["$ask1", "$ask2", "$bid1", "$bid2"],
        freq="ticks",
        start_time="20201230",
        end_time="20210101",
    )
    print(quotes)
def test_exp_05(self):
    """Accumulated spread features: price spread and size imbalance over all levels."""
    names = ["p_accspread", "v_accspread"]
    exprs = [
        # Price spread across the book, normalized by the level-1 mid sum.
        f"2 * Sub({ self.total_func('ask', 'last')}, {self.total_func('bid', 'last')})/{self.expr_sum_buy_ask_1}",
        # Size imbalance across the book, normalized by total volume.
        f"Sub({ self.total_func('asize', 'mean')}, {self.total_func('bsize', 'mean')})/{self.total_volume}",
    ]
    features = D.features(self.stocks_list, fields=exprs, freq="ticks")
    features.columns = names
    print(features)
def test_exp_04(self):
    """Average depth per side (10-level mean), normalized by total volume."""
    sides = ["asize", "bsize"]
    exprs = [f"(({ self.total_func(name, 'mean')}) / 10) / {self.total_volume}" for name in sides]
    names = [f"v_avg_{name}" for name in sides]
    df = D.features(self.stocks_list, fields=exprs, freq="ticks")
    df.columns = names
    print(df)
def test_exp_01(self):
    """Per-level book sizes resampled to 1 minute, normalized by total volume."""
    sides = ["asize", "bsize"]
    exprs = [
        f"TResample(${name}{i}, '1min', 'mean') / ({self.total_volume})"
        for name in sides
        for i in range(1, 11)
    ]
    names = [f"v_{name}_{i}" for name in sides for i in range(1, 11)]
    df = D.features(self.stocks_list, fields=exprs, freq="ticks")
    df.columns = names
    print(df)
def _get_all_1d_data(self):
    """Return the full 1d dataset (paused/volume/factor/close) as a flat frame."""
    import qlib
    from qlib.data import D

    qlib.init(provider_uri=self.qlib_data_1d_dir)
    daily = D.features(D.instruments("all"), ["$paused", "$volume", "$factor", "$close"], freq="day")
    daily.reset_index(inplace=True)
    daily.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
    # Strip the leading "$" from the field columns.
    daily.columns = [col[1:] if col.startswith("$") else col for col in daily.columns]
    return daily
def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"):
    """Estimate a structured-covariance risk model per trading day and dump it to disk.

    For each day from index T-1 onward, fits the model on the trailing T-day
    return window of that day's csi300 members and writes factor exposures,
    factor covariance and specific risk (volatility) pickles under
    riskdata_root/YYYYMMDD/.

    Parameters
    ----------
    riskdata_root : str
        output directory; one sub-directory per date is created.
    T : int
        look-back window length in trading days.
    start_time : str
        first date for which price data is loaded.
    """
    # <datetime, instrument> index defining daily csi300 membership.
    universe = D.features(D.instruments("csi300"), ["$close"], start_time=start_time).swaplevel().sort_index()

    # Wide close-price panel over the full universe: rows=dates, cols=instruments.
    price_all = (
        D.features(D.instruments("all"), ["$close"], start_time=start_time).squeeze().unstack(level="instrument")
    )

    # StructuredCovEstimator is a statistical risk model
    riskmodel = StructuredCovEstimator()

    for i in range(T - 1, len(price_all)):
        date = price_all.index[i]
        ref_date = price_all.index[i - T + 1]

        print(date)

        # Restrict the window to that day's index members.
        codes = universe.loc[date].index
        price = price_all.loc[ref_date:date, codes]

        # calculate return and remove extreme return
        ret = price.pct_change()
        ret.clip(ret.quantile(0.025), ret.quantile(0.975), axis=1, inplace=True)

        # run risk model
        F, cov_b, var_u = riskmodel.predict(ret, is_price=False, return_decomposed_components=True)

        # save risk data
        root = riskdata_root + "/" + date.strftime("%Y%m%d")
        os.makedirs(root, exist_ok=True)

        pd.DataFrame(F, index=codes).to_pickle(root + "/factor_exp.pkl")
        pd.DataFrame(cov_b).to_pickle(root + "/factor_cov.pkl")
        # for specific_risk we follow the convention to save volatility
        pd.Series(np.sqrt(var_u), index=codes).to_pickle(root + "/specific_risk.pkl")
def fill_1min_using_1d(
    data_1min_dir: [str, Path],
    qlib_data_1d_dir: [str, Path],
    max_workers: int = 16,
    date_field_name: str = "date",
    symbol_field_name: str = "symbol",
):
    """Use 1d data to fill in the missing symbols relative to 1min

    Parameters
    ----------
    data_1min_dir: str
        1min data dir
    qlib_data_1d_dir: str
        1d qlib data(bin data) dir, from: https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format
    max_workers: int
        ThreadPoolExecutor(max_workers), by default 16
    date_field_name: str
        date field name, by default date
    symbol_field_name: str
        symbol field name, by default symbol
    """
    data_1min_dir = Path(data_1min_dir).expanduser().resolve()
    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()

    # Date span covered by the existing 1min CSVs, and the symbols they cover.
    min_date, max_date = get_date_range(data_1min_dir, max_workers, date_field_name)
    symbols_1min = get_symbols(data_1min_dir)

    qlib.init(provider_uri=str(qlib_data_1d_dir))
    data_1d = D.features(D.instruments("all"), ["$close"], min_date, max_date, freq="day")
    # Symbols present in the 1d dataset but missing from the 1min directory.
    miss_symbols = set(data_1d.index.get_level_values(level="instrument").unique()) - set(symbols_1min)
    if not miss_symbols:
        # Reached when every 1d symbol already has a 1min file (i.e. 1min
        # covers at least as many symbols as 1d).
        logger.warning("More symbols in 1min than 1d, no padding required")
        return

    logger.info(f"miss_symbols {len(miss_symbols)}: {miss_symbols}")
    # Use an arbitrary existing 1min CSV as the column template, and detect
    # whether symbols in it are written in lower case.
    tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
    columns = tmp_df.columns
    _si = tmp_df[symbol_field_name].first_valid_index()
    is_lower = tmp_df.loc[_si][symbol_field_name].islower()
    for symbol in tqdm(miss_symbols):
        if is_lower:
            symbol = symbol.lower()

        # 1d trading days for this symbol, expanded to a per-minute calendar.
        index_1d = data_1d.loc(axis=0)[symbol.upper()].index
        index_1min = generate_minutes_calendar_from_daily(index_1d)

        index_1min.name = date_field_name
        # Empty frame with the template columns; all value columns stay NaN.
        _df = pd.DataFrame(columns=columns, index=index_1min)
        if date_field_name in _df.columns:
            del _df[date_field_name]
        _df.reset_index(inplace=True)
        _df[symbol_field_name] = symbol
        # NOTE(review): filled rows get paused_num = 0 — confirm this matches
        # the downstream convention for padded (non-traded) minutes.
        _df["paused_num"] = 0
        _df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
def test_exp_10(self):
    """Mid-quote change rate over several lags, averaged into 1-minute bars."""
    exprs = []
    names = []
    for i in [5, 10, 30, 60]:
        exprs.append(
            f'TResample(Ref(TResample($ask1 + $bid1, "1s", "ffill"), {-i}) / TResample($ask1 + $bid1, "1s", "ffill") - 1, "1min", "mean" )'
        )
        # BUG FIX: the original called names.append(f"..." for i in [...]),
        # which appended a *generator object* instead of the formatted string,
        # so the column labels were unusable generator reprs.
        names.append(f"lag_{i}_change_rate")
    df = D.features(self.stocks_list, fields=exprs, freq="ticks")
    df.columns = names
    print(df)
def test_0_qlib_data(self):
    """Download the simplified CN daily dataset and verify the fields load."""
    GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", version="latest")
    loaded = D.features(D.instruments("csi300"), self.FIELDS)
    self.assertListEqual(list(loaded.columns), self.FIELDS, "get qlib data failed")
    self.assertFalse(loaded.dropna().empty, "get qlib data failed")