def __init__(self, start_t=date(2010, 1, 1), end_t=date(2016, 12, 31)):
    """Load market data for [start_t, end_t], with a local pickle cache.

    Reading from pkl files is handled by another class.  The cache file
    path should eventually be a hash of (start_t, end_t, FEATURES); for
    now it is derived from start/end only.  TODO: switch to a single
    pickle holding all data.
    """
    # BUG FIX: the cache key previously mixed start_t.isocalendar() (a
    # tuple such as (2010, 53, 5)) with end_t.isoformat(); use isoformat()
    # for both endpoints so the file name is consistent and readable.
    cache_file_path = os.path.join(
        "/tmp",
        f"TestTSDataGenerator_{start_t.isoformat()}_{end_t.isoformat()}.pkl")
    self.original_data: pd.DataFrame = None
    self.df_x_to_loop: pd.DataFrame = None
    self.df_y_to_loop: pd.DataFrame = None
    if os.path.isfile(cache_file_path):
        self.original_data = pd.read_pickle(cache_file_path, compression="gzip")
    else:
        # Cache miss: read from arctic, then persist for the next run.
        arctic_store = Arctic(get_mongo_admin_conn_str())
        lib_name = "jy_chn_equity_otvn_chunkstore"
        lib_chunk_store = arctic_store[lib_name]
        symbol_name = "mkt_data"
        self.original_data = lib_chunk_store.read(
            symbol_name,
            chunk_range=pd.date_range(start_t, end_t),
            filter_data=True,
            columns=self.FEATURES)
        self.original_data.to_pickle(cache_file_path, compression="gzip", protocol=4)
def save_to_chunkstore_per_symbol(self):
    """Rebuild the jy_equity_mkt_data chunkstore, one symbol per instrument.

    Pivots the close-price frame so each column is an instrument ("o"),
    then writes each column as its own daily-chunked symbol.
    """
    lib_name = "jy_equity_mkt_data"
    arctic_store = Arctic(get_mongo_admin_conn_str())
    arctic_store.delete_library(lib_name)
    arctic_store.initialize_library(lib_name, lib_type=CHUNK_STORE)
    lib_chunk_store = arctic_store[lib_name]
    close_prices = self.load_all_close_price()
    pivoted = close_prices.pivot_table(
        values="close_price", index="t", columns="o", aggfunc=np.mean)
    pivoted.index.rename("date", inplace=True)
    for count, symbol in enumerate(pivoted.columns, start=1):
        series = pivoted.loc[:, symbol].dropna(axis=0)
        lib_chunk_store.write(symbol, series, chunker=DateChunker(), chunk_size="D")
        # Progress report on every second symbol written.
        if count % 2 == 0:
            print(f"{count}:{symbol}")
def save_to_arctic(self):
    """Write one VersionStore symbol per instrument into jy_equity_closeprice.

    see https://github.com/manahl/arctic/blob/master/howtos/201507_demo_pydata.py
    """
    arctic_store = Arctic(get_mongo_admin_conn_str())
    closeprice_lib = arctic_store["jy_equity_closeprice"]
    close_prices = self.load_all_close_price()
    pivoted = close_prices.pivot_table(
        values="close_price", index="t", columns="o", aggfunc=np.mean)
    for count, symbol in enumerate(pivoted.columns, start=1):
        series = pivoted.loc[:, symbol].dropna(axis=0)
        closeprice_lib.write(symbol, series)
        print(f"{count}:{symbol}")
    print(closeprice_lib.list_symbols())
def show_chunk_store_info(self):
    """Print symbols, symbol info and chunk ranges of jy_otv_chunkstore."""
    store = Arctic(get_mongo_admin_conn_str())
    chunk_lib = store["jy_otv_chunkstore"]
    print("list_symbols")
    print(chunk_lib.list_symbols())
    print("get_info")
    print(chunk_lib.get_info("close_price"))
    print("chunk_ranges")
    print(list(chunk_lib.get_chunk_ranges("close_price")))
def read_all_data_from_arctic(self):
    """Benchmark: read every symbol in jy_equity_closeprice and report throughput.

    Fixes over the previous version:
    - list_symbols() is fetched once instead of twice (the second call was
      a redundant network round-trip just to print the count);
    - the elapsed time is measured once, so the printed Time and Rows/s
      are computed from the same value;
    - dated %-formatting replaced with an f-string; useless trailing
      `pass` removed.
    """
    arctic_store = Arctic(get_mongo_admin_conn_str())
    closeprice_lib = arctic_store["jy_equity_closeprice"]
    symbols = closeprice_lib.list_symbols()
    start = time.time()
    rows_read = 0
    for s in symbols:
        rows_read += len(closeprice_lib.read(s).data)
    elapsed = time.time() - start
    print(f"Symbols: {len(symbols)} Rows: {rows_read} "
          f"Time: {elapsed} Rows/s: {rows_read / elapsed}")
def convert_mkt_history_data(self):
    """Rebuild jy_chn_equity_otvn_chunkstore from the raw market history."""
    store = Arctic(get_mongo_admin_conn_str())
    lib_name = "jy_chn_equity_otvn_chunkstore"
    store.delete_library(lib_name)
    store.initialize_library(lib_name, lib_type=CHUNK_STORE)
    chunk_lib = store[lib_name]
    # Hard-coded date windows for now; a more elegant expression is possible.
    periods = [
        (date(1990, 1, 1), date(2000, 1, 15)),
        (date(2000, 1, 15), date(2010, 1, 15)),
        (date(2010, 1, 15), date(2020, 1, 1)),
    ]
    # Measured result: for daily-frequency data, chunk_size "M" gives the
    # best combined write/read efficiency.
    for idx, (period_start, period_end) in enumerate(periods):
        self._convert_period_equity_mkt_data_to_arctic(
            period_start, period_end, chunk_lib, "mkt_data", "M", idx == 0)
def save_to_arctic_v2(self):
    """Store the whole pivoted close-price frame as a single symbol.

    see https://github.com/manahl/arctic/blob/master/howtos/201507_demo_pydata.py
    """
    store = Arctic(get_mongo_admin_conn_str())
    closeprice_lib = store["jy_equity_closeprice_v2"]
    close_prices = self.load_all_close_price()
    pivoted = close_prices.pivot_table(
        values="close_price", index="t", columns="o", aggfunc=np.mean)
    closeprice_lib.write("close_price", pivoted)
    print(closeprice_lib.list_symbols())
def __init__(self, lib_name: str = _ARCTIC_BINARY_LIBRARY, mongo_db: str = "auto"):
    """One instance operates on exactly one library.

    mongo_db:
        "auto"     - choose between google and local automatically,
                     depending on whether the environment is colab
        "google"   - use the google-hosted mongo
        "intranet" - use the mongo in the machine room
    """
    # The mongo address used by arctic is hard-coded for the time being.
    if mongo_db == "google":
        conn_str = get_google_mongo_conn_str()
    elif mongo_db == "intranet":
        conn_str = get_intranet_mongo_conn_str()
    else:
        conn_str = get_mongo_admin_conn_str()
    self._store = Arctic(conn_str)
    if not self._store.library_exists(lib_name):
        self._store.initialize_library(lib_name, VERSION_STORE)
    self._lib = self._store[lib_name]
def save_to_arctic_tickstore(self):
    """Attempt to store close prices in a TickStore.

    NOTE(review): original is marked "not work" — kept for reference.
    """
    store = Arctic(get_mongo_admin_conn_str())
    store.delete_library("jy_otv_tickstore")
    store.initialize_library("jy_otv_tickstore", lib_type=TICK_STORE)
    tick_lib = store["jy_otv_tickstore"]
    close_prices = self.load_all_close_price()
    pivoted = close_prices.pivot_table(
        values="close_price", index="t", columns="o", aggfunc=np.mean)
    # TickStore requires a timezone-aware index.
    pivoted.index = pivoted.index.tz_localize("Asia/Shanghai")
    tick_lib.write("close_price", pivoted)
def read_from_chunkstore(self, start: date, end: date, cols: List[str]):
    """Read `cols` of the mkt_data symbol for [start, end] from the chunkstore.

    Benchmark notes (translated from the original comments):
      - For reference: 9,595,866 rows load in 0.48 sec from a local gzip pickle.
      - [5Y (2012/01/15 - 2017/12/15)] 3,906,128 rows, [ALL] 9,688,283 rows:
          chunk_size = Y : 16.74 secs (5Y), 38.89 secs (ALL)
          chunk_size = M : 16.83 secs (5Y), 39.73 secs (ALL)
          chunk_size = D : 23.75 secs (5Y), 74.94 secs (ALL)
      - Reading extra 'v' columns barely affects performance
        (two more columns add about 0.5 sec).
      - Comparison: reading 10Y (2000-2009, 3,187,574 rows) from mysql takes
        about 120 secs; writing those rows to arctic takes about 77 secs.
      - Earlier, incorrect measurements (chunk_size = M):
          9,595,866 rows in 11.68 sec (ALL); 6,361,499 rows in 8.19 sec (10Y);
          4,018,657 rows in 5.24 sec (5Y); 848,959 rows in 1.16 sec (1Y).
    """
    store = Arctic(get_mongo_admin_conn_str())
    chunk_lib = store["jy_chn_equity_otvn_chunkstore"]
    run_start = time.time()
    frame: pd.DataFrame = chunk_lib.read(
        "mkt_data",
        chunk_range=pd.date_range(start, end),
        filter_data=True,
        columns=cols)
    logger.info(f"total read time {time.time() - run_start} ")
    return frame
def save_to_chunkstore(self):
    """Write the full close-price frame into a fresh jy_otv_chunkstore."""
    store = Arctic(get_mongo_admin_conn_str())
    store.delete_library("jy_otv_chunkstore")
    store.initialize_library("jy_otv_chunkstore", lib_type=CHUNK_STORE)
    chunk_lib = store["jy_otv_chunkstore"]
    frame = self.load_all_close_price()
    # ChunkStore expects a sorted date index.
    frame.sort_index(axis=0, ascending=True, inplace=True)
    print(frame)
    write_start = time.time()
    chunk_lib.write("close_price", frame, chunk_size="M")
    print(f"total write time {time.time()-write_start} ")
def __init__(self):
    """Connect to mongo with generous (10-minute) client timeouts."""
    timeout_ms = 600 * 1000
    self.arctic_store = Arctic(
        get_mongo_admin_conn_str(),
        connectTimeoutMS=timeout_ms,
        serverSelectionTimeoutMS=timeout_ms)