def load_dataset(market='csi300'):
    # features
    fields = []
    names = []

    fields += ['$open/$close']  # NOTE: Ref($open, 0) != $open
    fields += ['Ref($open, %d)/$close' % d for d in range(1, 60)]
    names += ['OPEN%d' % d for d in range(60)]

    fields += ['$high/$close']
    fields += ['Ref($high, %d)/$close' % d for d in range(1, 60)]
    names += ['HIGH%d' % d for d in range(60)]

    fields += ['$low/$close']
    fields += ['Ref($low, %d)/$close' % d for d in range(1, 60)]
    names += ['LOW%d' % d for d in range(60)]

    fields += ['$close/$close']  # 1
    fields += ['Ref($close, %d)/$close' % d for d in range(1, 60)]
    names += ['CLOSE%d' % d for d in range(60)]

    fields += ['$vwap/$close']
    fields += ['Ref($vwap, %d)/$close' % d for d in range(1, 60)]
    names += ['VWAP%d' % d for d in range(60)]

    # fields += ['Log($volume/$volume)']  # 1
    # fields += ['Log(Ref($volume, %d)/$volume)' % d for d in range(1, 60)]
    # names += ['VOLUME%d' % d for d in range(60)]
    fields += ['$volume/$volume']  # 1
    fields += ['Ref($volume, %d)/$volume' % d for d in range(1, 60)]
    names += ['VOLUME%d' % d for d in range(60)]

    # labels
    labels = ['Ref($vwap, -2)/Ref($vwap, -1)-1']
    label_names = ['LABEL0']

    ## load features
    print('loading features...')
    df = D.features(D.instruments(market), fields, start_time='2007-01-01')
    df.columns = names
    print('load features over')

    ## load labels
    if len(labels):
        print('loading labels...')
        df_labels = D.features(D.instruments('all'), labels, start_time='2007-01-01')
        df_labels.columns = label_names
        df[label_names] = df_labels
        print('load labels over')

    return df, names, label_names
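# A minimal usage sketch for the loader above, assuming qlib has already been
# initialized against a local provider (the provider_uri below is a placeholder).
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")  # placeholder path

df, feature_names, label_names = load_dataset(market="csi300")
# 6 field families x 60 lags = 360 feature columns, plus one label column
print(df.shape, len(feature_names), label_names)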
def load_data(self):
    ret = D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2]
    print(ret)

    instruments = D.instruments('csi300')  # ['SH600570', 'SH600000']
    fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']
    data = D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day')
def load_group_df(
    self,
    instruments,
    exprs: list,
    names: list,
    start_time: Union[str, pd.Timestamp] = None,
    end_time: Union[str, pd.Timestamp] = None,
    gp_name: str = None,
) -> pd.DataFrame:
    if instruments is None:
        warnings.warn("`instruments` is not set, will load all stocks")
        instruments = "all"
    if isinstance(instruments, str):
        instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
    elif self.filter_pipe is not None:
        warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

    freq = self.freq[gp_name] if isinstance(self.freq, dict) else self.freq
    df = D.features(
        instruments, exprs, start_time, end_time, freq=freq, inst_processors=self.inst_processor.get(gp_name, [])
    )
    df.columns = names
    if self.swap_level:
        df = df.swaplevel().sort_index()  # NOTE: if swaplevel, return <datetime, instrument>
    return df
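# This load_group_df variant matches qlib's QlibDataLoader; a hedged sketch of
# driving it through the public interface (assumes qlib is already initialized).
from qlib.data.dataset.loader import QlibDataLoader

loader = QlibDataLoader(
    config={"feature": (["$close", "Ref($close, 1)/$close - 1"], ["CLOSE", "RET1"])},
    freq="day",  # a dict like {"feature": "day"} lets each group load at its own frequency
)
df = loader.load(instruments="csi300", start_time="2019-01-01", end_time="2019-12-31")
print(df.head())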
def test_2_dump_instruments(self):
    ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
    res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
    # check both directions of the set difference, not the same one twice
    assert len(ori_ins - res_ins) == len(res_ins - ori_ins) == 0, "dump instruments failed"
def test_0_qlib_data(self):
    GetData().qlib_data_cn(QLIB_DIR)
    df = D.features(D.instruments("csi300"), self.FIELDS)
    self.assertListEqual(list(df.columns), self.FIELDS, "get qlib data failed")
    self.assertFalse(df.dropna().empty, "get qlib data failed")
def test_handler_storage(self):
    # init data handler
    data_handler = TestHandler(**self.data_handler_kwargs)

    # init data handler with hashing storage
    data_handler_hs = TestHandler(**self.data_handler_kwargs, infer_processors=["HashStockFormat"])

    fetch_start_time = "2019-01-01"
    fetch_end_time = "2019-12-31"
    instruments = D.instruments(market=self.market)
    instruments = D.list_instruments(
        instruments=instruments, start_time=fetch_start_time, end_time=fetch_end_time, as_list=True
    )

    with TimeInspector.logt("random fetch with DataFrame Storage"):
        # single stock
        for i in range(100):
            random_index = np.random.randint(len(instruments), size=1)[0]
            fetch_stock = instruments[random_index]
            data_handler.fetch(selector=(fetch_stock, slice(fetch_start_time, fetch_end_time)), level=None)

        # multi stocks
        for i in range(100):
            random_indices = np.random.randint(len(instruments), size=5)
            fetch_stocks = [instruments[_index] for _index in random_indices]
            data_handler.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None)

    with TimeInspector.logt("random fetch with HashingStock Storage"):
        # single stock
        for i in range(100):
            random_index = np.random.randint(len(instruments), size=1)[0]
            fetch_stock = instruments[random_index]
            data_handler_hs.fetch(selector=(fetch_stock, slice(fetch_start_time, fetch_end_time)), level=None)

        # multi stocks
        for i in range(100):
            random_indices = np.random.randint(len(instruments), size=5)
            fetch_stocks = [instruments[_index] for _index in random_indices]
            data_handler_hs.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None)
def testClose(self):
    close_p = D.features(D.instruments("csi300"), ["Ref($close, 1)/$close - 1"])
    close_desc = close_p.describe(percentiles=np.arange(0.1, 1.0, 0.1))
    print(close_desc)
    self.assertLessEqual(abs(close_desc.loc["90%"][0]), 0.1, "Close value is abnormal")
    self.assertLessEqual(abs(close_desc.loc["10%"][0]), 0.1, "Close value is abnormal")
def _get_old_data(self, qlib_data_dir: Union[str, Path]):
    import qlib
    from qlib.data import D

    qlib_data_dir = str(Path(qlib_data_dir).expanduser().resolve())
    qlib.init(provider_uri=qlib_data_dir, expression_cache=None, dataset_cache=None)
    df = D.features(D.instruments("all"), ["$close/$factor", "$adjclose/$close"])
    df.columns = [self._ori_close_field, self._first_close_field]
    return df
def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"):
    universe = D.features(D.instruments("csi300"), ["$close"], start_time=start_time).swaplevel().sort_index()

    price_all = (
        D.features(D.instruments("all"), ["$close"], start_time=start_time).squeeze().unstack(level="instrument")
    )

    # StructuredCovEstimator is a statistical risk model
    riskmodel = StructuredCovEstimator()

    for i in range(T - 1, len(price_all)):
        date = price_all.index[i]
        ref_date = price_all.index[i - T + 1]
        print(date)

        codes = universe.loc[date].index
        price = price_all.loc[ref_date:date, codes]

        # calculate return and remove extreme return
        ret = price.pct_change()
        ret.clip(ret.quantile(0.025), ret.quantile(0.975), axis=1, inplace=True)

        # run risk model
        F, cov_b, var_u = riskmodel.predict(ret, is_price=False, return_decomposed_components=True)

        # save risk data
        root = riskdata_root + "/" + date.strftime("%Y%m%d")
        os.makedirs(root, exist_ok=True)

        pd.DataFrame(F, index=codes).to_pickle(root + "/factor_exp.pkl")
        pd.DataFrame(cov_b).to_pickle(root + "/factor_cov.pkl")
        # for specific_risk we follow the convention to save volatility
        pd.Series(np.sqrt(var_u), index=codes).to_pickle(root + "/specific_risk.pkl")
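# For reference, a minimal sketch of reading one day's risk data back from the
# layout written above; the helper name is hypothetical, not part of the source.
import pandas as pd

def load_risk_data(riskdata_root="./riskdata", date_str="20160104"):
    # hypothetical helper: mirrors the three files saved per trading day
    root = f"{riskdata_root}/{date_str}"
    factor_exp = pd.read_pickle(root + "/factor_exp.pkl")        # stocks x factors
    factor_cov = pd.read_pickle(root + "/factor_cov.pkl")        # factors x factors
    specific_risk = pd.read_pickle(root + "/specific_risk.pkl")  # per-stock volatility
    return factor_exp, factor_cov, specific_risk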
def _get_all_1d_data(self):
    import qlib
    from qlib.data import D

    qlib.init(provider_uri=self.qlib_data_1d_dir)
    df = D.features(D.instruments("all"), ["$paused", "$volume", "$factor", "$close"], freq="day")
    df.reset_index(inplace=True)
    df.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
    # strip the leading "$" from field columns
    df.columns = list(map(lambda x: x[1:] if x.startswith("$") else x, df.columns))
    return df
def fill_1min_using_1d(
    data_1min_dir: Union[str, Path],
    qlib_data_1d_dir: Union[str, Path],
    max_workers: int = 16,
    date_field_name: str = "date",
    symbol_field_name: str = "symbol",
):
    """Use 1d data to fill in the missing symbols relative to 1min

    Parameters
    ----------
    data_1min_dir: str
        1min data dir
    qlib_data_1d_dir: str
        1d qlib data(bin data) dir, from: https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format
    max_workers: int
        ThreadPoolExecutor(max_workers), by default 16
    date_field_name: str
        date field name, by default date
    symbol_field_name: str
        symbol field name, by default symbol
    """
    data_1min_dir = Path(data_1min_dir).expanduser().resolve()
    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()

    min_date, max_date = get_date_range(data_1min_dir, max_workers, date_field_name)
    symbols_1min = get_symbols(data_1min_dir)

    qlib.init(provider_uri=str(qlib_data_1d_dir))
    data_1d = D.features(D.instruments("all"), ["$close"], min_date, max_date, freq="day")
    miss_symbols = set(data_1d.index.get_level_values(level="instrument").unique()) - set(symbols_1min)
    if not miss_symbols:
        logger.warning("More symbols in 1min than 1d, no padding required")
        return

    logger.info(f"miss_symbols {len(miss_symbols)}: {miss_symbols}")
    # use the first 1min CSV as a template for columns and symbol casing
    tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
    columns = tmp_df.columns
    _si = tmp_df[symbol_field_name].first_valid_index()
    is_lower = tmp_df.loc[_si][symbol_field_name].islower()
    for symbol in tqdm(miss_symbols):
        if is_lower:
            symbol = symbol.lower()
        index_1d = data_1d.loc(axis=0)[symbol.upper()].index
        index_1min = generate_minutes_calendar_from_daily(index_1d)
        index_1min.name = date_field_name
        _df = pd.DataFrame(columns=columns, index=index_1min)
        if date_field_name in _df.columns:
            del _df[date_field_name]
        _df.reset_index(inplace=True)
        _df[symbol_field_name] = symbol
        _df["paused_num"] = 0
        _df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
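# A hedged invocation sketch for the function above; both directories are
# placeholders and must point at existing 1min CSVs and 1d qlib bin data.
fill_1min_using_1d(
    data_1min_dir="~/stock_data/1min_csv",         # placeholder
    qlib_data_1d_dir="~/.qlib/qlib_data/cn_data",  # placeholder
    max_workers=8,
)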
def testClose(self):
    close_p = D.features(D.instruments('csi300'), ['Ref($close, 1)/$close - 1'])
    close_desc = close_p.describe(percentiles=np.arange(0.1, 0.9, 0.1))
    print(close_desc)
    self.assertLessEqual(abs(close_desc.loc["80%"][0]), 0.1, "Close value is abnormal")
    self.assertLessEqual(abs(close_desc.loc["max"][0]), 0.2, "Close value is abnormal")
    self.assertGreaterEqual(close_desc.loc["min"][0], -0.2, "Close value is abnormal")
def test_0_qlib_data(self):
    GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", version="latest")
    df = D.features(D.instruments("csi300"), self.FIELDS)
    self.assertListEqual(list(df.columns), self.FIELDS, "get qlib data failed")
    self.assertFalse(df.dropna().empty, "get qlib data failed")
def load_group_df(self, instruments, exprs: list, names: list, start_time=None, end_time=None) -> pd.DataFrame:
    if instruments is None:
        warnings.warn("`instruments` is not set, will load all stocks")
        instruments = "all"
    if isinstance(instruments, str):
        instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
    elif self.filter_pipe is not None:
        warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

    df = D.features(instruments, exprs, start_time, end_time)
    df.columns = names
    df = df.swaplevel().sort_index()  # NOTE: always return <datetime, instrument>
    return df
def testCSI300(self):
    close_p = D.features(D.instruments("csi300"), ["$close"])
    size = close_p.groupby("datetime").size()
    cnt = close_p.groupby("datetime").count()["$close"]
    size_desc = size.describe(percentiles=np.arange(0.1, 1.0, 0.1))
    cnt_desc = cnt.describe(percentiles=np.arange(0.1, 1.0, 0.1))

    print(size_desc)
    print(cnt_desc)

    self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
    self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")

    self.assertLessEqual(cnt_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
def setUpClass(cls, enable_1d_type="simple", enable_1min=False) -> None:
    # use default data
    super().setUpClass(enable_1d_type, enable_1min)

    nameDFilter = NameDFilter(name_rule_re="SH600110")
    instruments = D.instruments("csi300", filter_pipe=[nameDFilter])
    start_time = "2005-01-04"
    end_time = "2005-12-31"
    freq = "day"

    instruments_d = DatasetD.get_instruments_d(instruments, freq)
    cls.instruments_d = instruments_d

    cal = Cal.calendar(start_time, end_time, freq)
    cls.cal = cal
    cls.start_time = cal[0]
    cls.end_time = cal[-1]
    cls.inst = list(instruments_d.keys())[0]
    cls.spans = list(instruments_d.values())[0]
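# A standalone sketch of the same filter mechanics outside the test class,
# assuming an initialized qlib provider; NameDFilter narrows an instrument
# pool by a regex on the instrument name.
from qlib.data import D
from qlib.data.filter import NameDFilter

name_filter = NameDFilter(name_rule_re="SH600110")
instruments = D.instruments("csi300", filter_pipe=[name_filter])
stock_list = D.list_instruments(instruments, start_time="2005-01-04", end_time="2005-12-31", as_list=True)
print(stock_list)  # e.g. ["SH600110"] if the stock was a CSI300 constituent in that span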
def _gen_stock_dataset(self, config, conf_type):
    try:
        path = config.pop("path")
    except KeyError as e:
        raise ValueError("Must specify the path to save the dataset.") from e

    if os.path.isfile(path + "tmp_dataset.pkl"):
        start = time.time()
        print_log("Dataset exists, load from disk.", __name__)
    else:
        start = time.time()
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        print_log("Generating dataset", __name__)
        self._prepare_calender_cache()
        dataset = init_instance_by_config(config)
        print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
        dataset.config(dump_all=False, recursive=True)
        dataset.to_pickle(path + "tmp_dataset.pkl")

    with open(path + "tmp_dataset.pkl", "rb") as f:
        new_dataset = pkl.load(f)

    instruments = D.instruments(market="all")
    stock_list = D.list_instruments(
        instruments=instruments, start_time=self.start_time, end_time=self.end_time, freq="1min", as_list=True
    )

    def generate_dataset(stock):
        if os.path.isfile(path + stock + ".pkl"):
            print("exist " + stock)
            return
        self._init_qlib(self.qlib_conf)
        new_dataset.handler.config(**{"instruments": [stock]})
        if conf_type == "backtest":
            new_dataset.handler.setup_data()
        else:
            new_dataset.handler.setup_data(init_type=DataHandlerLP.IT_LS)
        new_dataset.config(dump_all=True, recursive=True)
        new_dataset.to_pickle(path + stock + ".pkl")

    Parallel(n_jobs=32)(delayed(generate_dataset)(stock) for stock in stock_list)
def testCSI300(self):
    close_p = D.features(D.instruments('csi300'), ['$close'])
    size = close_p.groupby('datetime').size()
    cnt = close_p.groupby('datetime').count()
    size_desc = size.describe(percentiles=np.arange(0.1, 0.9, 0.1))
    cnt_desc = cnt.describe(percentiles=np.arange(0.1, 0.9, 0.1))

    print(size_desc)
    print(cnt_desc)

    # `size` is a Series, so its describe() is indexed directly;
    # `cnt` is a DataFrame, so its describe() keeps one column per field
    self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
    self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")

    self.assertLessEqual(cnt_desc.loc["max"][0], 305, "Excessive number of CSI300 constituent stocks")
    self.assertEqual(cnt_desc.loc["80%"][0], 300, "Insufficient number of CSI300 constituent stocks")
def generate_target_weight_position(self, score, current, trade_start_time, trade_end_time):
    trade_date = trade_start_time
    pre_date = get_pre_trading_date(trade_date, future=True)  # previous trade date

    # load risk data
    outs = self.get_risk_data(pre_date)
    if outs is None:
        self.logger.warning(f"no risk data for {pre_date:%Y-%m-%d}, skip optimization")
        return None
    factor_exp, factor_cov, specific_risk, universe, blacklist = outs

    # transform score
    # NOTE: for stocks missing score, we always assume they have the lowest score
    score = score.reindex(universe).fillna(score.min()).values

    # get current weight
    # NOTE: if a stock is not in universe, its current weight will be zero
    cur_weight = current.get_stock_weight_dict(only_stock=False)
    cur_weight = np.array([cur_weight.get(stock, 0) for stock in universe])
    assert all(cur_weight >= 0), "current weight has negative values"
    cur_weight = cur_weight / self.get_risk_degree(trade_date)  # sum of weight should be risk_degree
    if cur_weight.sum() > 1 and self.verbose:
        self.logger.warning(f"previous total holdings exceed risk degree (current: {cur_weight.sum()})")

    # load bench weight
    bench_weight = D.features(
        D.instruments("all"), [f"${self.market}_weight"], start_time=pre_date, end_time=pre_date
    ).squeeze()
    bench_weight.index = bench_weight.index.droplevel(level="datetime")
    bench_weight = bench_weight.reindex(universe).fillna(0).values

    # whether stock tradable
    # NOTE: currently we use last day volume to check whether tradable
    tradable = D.features(D.instruments("all"), ["$volume"], start_time=pre_date, end_time=pre_date).squeeze()
    tradable.index = tradable.index.droplevel(level="datetime")
    tradable = tradable.reindex(universe).gt(0).values
    mask_force_hold = ~tradable

    # mask force sell
    mask_force_sell = np.array([stock in blacklist for stock in universe], dtype=bool)

    # optimize
    weight = self.optimizer(
        r=score,
        F=factor_exp,
        cov_b=factor_cov,
        var_u=specific_risk**2,
        w0=cur_weight,
        wb=bench_weight,
        mfh=mask_force_hold,
        mfs=mask_force_sell,
    )

    target_weight_position = {stock: weight for stock, weight in zip(universe, weight) if weight > 0}

    if self.verbose:
        self.logger.info("trade date: {:%Y-%m-%d}".format(trade_date))
        self.logger.info("number of holding stocks: {}".format(len(target_weight_position)))
        self.logger.info("total holding weight: {:.6f}".format(weight.sum()))

    return target_weight_position
def prepareTrainDataset(ifSavePortfolioIndex=False):
    print("------------------------ Begin to prepare train dataset... ------------------------")

    # read config file
    cf = configparser.ConfigParser()
    cf.read("config/config.ini")
    minDaysRange = int(cf.get("Parameter", "minDaysRange"))

    # offset of days
    numberOfYears = int(cf.get("Parameter", "numberOfYears"))
    numberOfMonths = int(cf.get("Parameter", "numberOfMonths"))
    numberOfDays = int(cf.get("Parameter", "numberOfDays"))

    # qlib init
    qlib.init(provider_uri='data/bin')

    # use one fund as the standard of the trading-day calendar
    calendar = D.calendar(freq='day')
    lastDay = calendar[-1]  # 2021-02-10 00:00:00
    firstDay = lastDay - DateOffset(years=numberOfYears, months=numberOfMonths, days=numberOfDays)  # 2018-02-10 00:00:00

    # exclude the influence of days without trading
    calendarBetweenFirstDayAndLastDay = D.calendar(freq='day', start_time=firstDay, end_time=lastDay)
    firstDayToAnalyze = calendarBetweenFirstDayAndLastDay[0]
    lastDayToAnalyze = calendarBetweenFirstDayAndLastDay[-1]

    # get portfolio
    pathOfDfSparsePortfolio = cf.get("Analyze", "pathOfDfSparsePortfolio")
    if not os.path.exists(pathOfDfSparsePortfolio):
        getSparseMatrixForPortfolioInAllFunds()
    dfSparsePortfolio = pd.read_csv(pathOfDfSparsePortfolio, index_col=0)

    if ifSavePortfolioIndex:
        dfPortfolioIndex = dfSparsePortfolio["FullElements"]
        dfPortfolioIndex.to_csv("data/dfPortfolioIndex.csv")

    folderToSaveTrainDataset = getFolderNameInConfig("folderToSaveTrainDataset")  # the folder to save train dataset
    folderToSaveTestDataset = getFolderNameInConfig("folderToSaveTestDataset")  # the folder to save test dataset

    count = 0
    instruments = D.instruments(market='all')
    for file in D.list_instruments(instruments=instruments, as_list=True):
        fundCode = file.split("_")[0]  # 000001
        if count % 100 == 0:
            print("count = %s\tfundCode = %s" % (count, fundCode))

        try:
            # skip funds for which no portfolio can be found
            try:
                dfSparsePortfolioForThisFund = dfSparsePortfolio[[fundCode]]
            except KeyError:
                continue

            # read file and remove empty line
            df = D.features([file], ['$AccumulativeNetAssetValue'], start_time=firstDayToAnalyze, end_time=lastDayToAnalyze)
            df.columns = ['AccumulativeNetAssetValue']
            # df = df.unstack(level=0)
            df["datetime"] = df.index.get_level_values(1)  # per-row datetime values, not just the unique level

            # reset the index
            df = df.dropna(axis=0, subset=['datetime']).reset_index(drop=True)

            # like http://fundf10.eastmoney.com/jjjz_010476.html, the return in 30 days is 26%, so the annualized return is too high
            if df.shape[0] <= minDaysRange:
                continue

            # count the days between first day and last day
            day = df['datetime']
            # TODO: how about fund 519858, which traded on 2018-01-28 (a Sunday)?
            firstDayInThisFund = day[day.first_valid_index()]  # 2018-02-12 00:00:00, 2018-02-10 is Saturday
            lastDayInThisFund = day[day.last_valid_index()]  # 2021-02-10 00:00:00

            # must have value on the latest day
            if (lastDayInThisFund - lastDayToAnalyze).days != 0:
                continue

            df['daysDiffWithLastDay'] = df['datetime'].apply(lambda x: (lastDayInThisFund - x).days)

            # get the value on important days
            netValueInFirstDay = df[df['datetime'] == firstDayInThisFund]["AccumulativeNetAssetValue"].tolist()[0]  # 4.046

            # funds founded more than 3 years ago go to the train dataset
            if (firstDayInThisFund - firstDayToAnalyze).days <= 0:
                # compute the adjust factor: we can get the value in 3 years by adjustFactorToLatestDay * (value[0]/value[day])
                df["adjustFactorToLatestDay"] = df["AccumulativeNetAssetValue"] / netValueInFirstDay
                df = df[["daysDiffWithLastDay", "adjustFactorToLatestDay"]]

                # abandon the latest day, it's meaningless
                df.reset_index(drop=True, inplace=True)
                df = df.T.drop(labels=0, axis=1).T

                # reset index to concat with dfSparsePortfolioForThisFund
                df.reset_index(drop=True, inplace=True)
                df = df.T

                # duplicate to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = pd.concat([dfSparsePortfolioForThisFund.T] * df.shape[1])

                # reset index to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = dfSparsePortfolioForThisFund.reset_index(drop=True).T

                dfDataset = pd.concat([dfSparsePortfolioForThisFund, df], axis=0)
                dfDataset.to_csv(os.path.join(folderToSaveTrainDataset, "%s.csv" % fundCode))
            else:
                dfInFirstDay = df[df['datetime'] == firstDayInThisFund].reset_index(drop=True)
                dfInFirstDay = dfInFirstDay[["daysDiffWithLastDay"]].T
                dfInFirstDay[fundCode] = dfInFirstDay[0]
                dfDataset = pd.concat([dfSparsePortfolioForThisFund, dfInFirstDay[[fundCode]]], axis=0)
                dfDataset.to_csv(os.path.join(folderToSaveTestDataset, "%s.csv" % fundCode))

            count += 1
        except Exception as e:
            print("fundCode = %s\terror = %s" % (fundCode, e))
            continue

    print("------------------------ Done. ------------------------")
def test_1_dump_instruments(self):
    self.DUMP_DATA.dump_instruments()
    ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.iterdir()))
    res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
    # check both directions of the set difference, not the same one twice
    assert len(ori_ins - res_ins) == len(res_ins - ori_ins) == 0, "dump instruments failed"
def get_features(fields):
    qlib.init(provider_uri=TestAutoData.provider_uri, expression_cache=None, dataset_cache=None, joblib_backend="loky")
    return D.features(D.instruments("csi300"), fields)
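# Example call, passing plain qlib expression strings; a minimal sketch that
# assumes TestAutoData has already prepared the provider data.
df = get_features(["$close", "$volume", "Mean($close, 5)/$close"])
print(df.head())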