def test_2_dump_instruments(self):
    ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
    res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
    # the two sets must match exactly: nothing missing, nothing extra
    assert len(ori_ins - res_ins) == len(res_ins - ori_ins) == 0, "dump instruments failed"

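# For context, a minimal sketch (not part of the tests) of the two result
# shapes D.list_instruments can return; the provider_uri below is a
# hypothetical placeholder for an already-dumped data directory.
#
#   import qlib
#   from qlib.data import D
#
#   qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")
#
#   # as_list=True yields a plain list of instrument codes, e.g. ["SH600000", ...]
#   codes = D.list_instruments(D.instruments("all"), as_list=True)
#
#   # without as_list, a dict mapping each code to its (start, end) trading spans
#   spans = D.list_instruments(D.instruments("all"))
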
def instrument_callback(self, ibody, task_uri):
    """Target function for the worker process when the received task asks
    for instrument data. Calls the data provider to acquire the data and
    publishes the result.
    """
    instruments = ibody["instruments"]
    start_time = ibody["start_time"]
    end_time = ibody["end_time"]
    if start_time == "None":
        start_time = None
    if end_time == "None":
        end_time = None
    freq = ibody["freq"]
    as_list = ibody["as_list"]
    status_code = 0
    # TODO: add exception detection and modify status_code
    self.logger.debug("process instrument data at %f" % time.time())
    try:
        instrument_result = D.list_instruments(instruments, start_time, end_time, freq, as_list)
        if isinstance(instrument_result, dict):
            # timestamps are not serializable, so stringify the (start, end) spans
            instrument_result = {i: [(str(s), str(e)) for s, e in t] for i, t in instrument_result.items()}
        self.logger.debug("finish processing instrument data and publish message at %f" % time.time())
        self.publish_message("instrument", instrument_result, status_code, task_uri)
    except Exception as e:
        self.logger.exception("Error while processing request %.200s" % e)
        self.publish_message("instrument", None, 1, task_uri, str(e))

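# A hedged sketch (an assumption, not the server's actual schema) of the task
# body instrument_callback expects: the field names mirror the lookups above,
# while publish_message and the transport belong to the surrounding class.
ibody = {
    "instruments": {"market": "all"},  # any qlib instrument config
    "start_time": "2019-01-01",
    "end_time": "None",  # the literal string "None" is normalized to None above
    "freq": "day",
    "as_list": False,  # dict result -> (start, end) spans get stringified
}
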
def test_handler_storage(self):
    # init data handler
    data_handler = TestHandler(**self.data_handler_kwargs)

    # init data handler with hashing storage
    data_handler_hs = TestHandler(**self.data_handler_kwargs, infer_processors=["HashStockFormat"])

    fetch_start_time = "2019-01-01"
    fetch_end_time = "2019-12-31"
    instruments = D.instruments(market=self.market)
    instruments = D.list_instruments(instruments=instruments, start_time=fetch_start_time, end_time=fetch_end_time, as_list=True)

    with TimeInspector.logt("random fetch with DataFrame Storage"):
        # single stock
        for i in range(100):
            random_index = np.random.randint(len(instruments), size=1)[0]
            fetch_stock = instruments[random_index]
            data_handler.fetch(selector=(fetch_stock, slice(fetch_start_time, fetch_end_time)), level=None)

        # multiple stocks
        for i in range(100):
            random_indices = np.random.randint(len(instruments), size=5)
            fetch_stocks = [instruments[_index] for _index in random_indices]
            data_handler.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None)

    with TimeInspector.logt("random fetch with HashingStock Storage"):
        # single stock
        for i in range(100):
            random_index = np.random.randint(len(instruments), size=1)[0]
            fetch_stock = instruments[random_index]
            data_handler_hs.fetch(selector=(fetch_stock, slice(fetch_start_time, fetch_end_time)), level=None)

        # multiple stocks
        for i in range(100):
            random_indices = np.random.randint(len(instruments), size=5)
            fetch_stocks = [instruments[_index] for _index in random_indices]
            data_handler_hs.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None)

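# For reference, a minimal sketch of the selector shape exercised above,
# assuming a handler instance like data_handler whose rows are indexed by
# (instrument, datetime); the instrument codes are hypothetical placeholders.
df_one = data_handler.fetch(
    selector=("SH600000", slice("2019-01-01", "2019-12-31")),  # one instrument
    level=None,  # level=None: the selector addresses the full MultiIndex at once
)
df_many = data_handler.fetch(
    selector=(["SH600000", "SH600004"], slice("2019-01-01", "2019-12-31")),  # several
    level=None,
)
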
def _gen_stock_dataset(self, config, conf_type):
    try:
        path = config.pop("path")
    except KeyError as e:
        raise ValueError("Must specify the path to save the dataset.") from e

    if os.path.isfile(path + "tmp_dataset.pkl"):
        start = time.time()
        print_log("Dataset exists, load from disk.", __name__)
    else:
        start = time.time()
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        print_log("Generating dataset", __name__)
        self._prepare_calender_cache()
        dataset = init_instance_by_config(config)
        print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
        dataset.config(dump_all=False, recursive=True)
        dataset.to_pickle(path + "tmp_dataset.pkl")

    with open(path + "tmp_dataset.pkl", "rb") as f:
        new_dataset = pkl.load(f)

    instruments = D.instruments(market="all")
    stock_list = D.list_instruments(
        instruments=instruments, start_time=self.start_time, end_time=self.end_time, freq="1min", as_list=True
    )

    def generate_dataset(stock):
        # skip stocks whose per-stock pickle has already been generated
        if os.path.isfile(path + stock + ".pkl"):
            print("exist " + stock)
            return
        self._init_qlib(self.qlib_conf)
        new_dataset.handler.config(**{"instruments": [stock]})
        if conf_type == "backtest":
            new_dataset.handler.setup_data()
        else:
            new_dataset.handler.setup_data(init_type=DataHandlerLP.IT_LS)
        new_dataset.config(dump_all=True, recursive=True)
        new_dataset.to_pickle(path + stock + ".pkl")

    Parallel(n_jobs=32)(delayed(generate_dataset)(stock) for stock in stock_list)

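# A hedged sketch of the kind of config this method consumes. Only "path" is
# certain (it is popped above); the class/kwargs layout follows qlib's
# init_instance_by_config convention, and the handler details are placeholders.
config = {
    "path": "./stock_datasets/",  # per-stock pickles are written here
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": {
            "class": "DataHandlerLP",  # placeholder handler class
            "module_path": "qlib.data.dataset.handler",
            "kwargs": {"instruments": "all"},
        },
        "segments": {"train": ("2020-01-01", "2020-12-31")},
    },
}
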
def prepareTrainDataset(ifSavePortfolioIndex=False):
    print("------------------------ Begin to prepare train dataset... ------------------------")

    # read config file
    cf = configparser.ConfigParser()
    cf.read("config/config.ini")
    minDaysRange = int(cf.get("Parameter", "minDaysRange"))  # offset of days
    numberOfYears = int(cf.get("Parameter", "numberOfYears"))
    numberOfMonths = int(cf.get("Parameter", "numberOfMonths"))
    numberOfDays = int(cf.get("Parameter", "numberOfDays"))

    # qlib init
    qlib.init(provider_uri='data/bin')

    # use one fund as the standard of the trading-day calendar
    calendar = D.calendar(freq='day')
    lastDay = calendar[-1]  # e.g. 2021-02-10 00:00:00
    firstDay = lastDay - DateOffset(years=numberOfYears, months=numberOfMonths, days=numberOfDays)  # e.g. 2018-02-10 00:00:00

    # exclude the influence of non-trading days
    calendarBetweenFirstDayAndLastDay = D.calendar(freq='day', start_time=firstDay, end_time=lastDay)
    firstDayToAnalyze = calendarBetweenFirstDayAndLastDay[0]
    lastDayToAnalyze = calendarBetweenFirstDayAndLastDay[-1]

    # get portfolio
    pathOfDfSparsePortfolio = cf.get("Analyze", "pathOfDfSparsePortfolio")
    if not os.path.exists(pathOfDfSparsePortfolio):
        getSparseMatrixForPortfolioInAllFunds()
    dfSparsePortfolio = pd.read_csv(pathOfDfSparsePortfolio, index_col=0)

    if ifSavePortfolioIndex:
        dfPortfolioIndex = dfSparsePortfolio["FullElements"]
        dfPortfolioIndex.to_csv("data/dfPortfolioIndex.csv")

    folderToSaveTrainDataset = getFolderNameInConfig("folderToSaveTrainDataset")  # the folder to save the train dataset
    folderToSaveTestDataset = getFolderNameInConfig("folderToSaveTestDataset")  # the folder to save the test dataset

    count = 0
    instruments = D.instruments(market='all')
    for file in D.list_instruments(instruments=instruments, as_list=True):
        fundCode = file.split("_")[0]  # e.g. 000001
        if count % 100 == 0:
            print("count = %s\tfundCode = %s" % (count, fundCode))
        try:
            # skip funds whose portfolio can't be found
            try:
                dfSparsePortfolioForThisFund = dfSparsePortfolio[[fundCode]]
            except KeyError:
                continue

            # read the file and remove empty lines
            df = D.features([file], ['$AccumulativeNetAssetValue'], start_time=firstDayToAnalyze, end_time=lastDayToAnalyze)
            df.columns = ['AccumulativeNetAssetValue']
            # df = df.unstack(level=0)
            df["datetime"] = df.index.levels[1]
            # reset the index
            df = df.dropna(axis=0, subset=['datetime']).reset_index(drop=True)

            # like http://fundf10.eastmoney.com/jjjz_010476.html, the return in 30 days is 26%,
            # so the annualized return of very young funds is misleadingly high
            if df.shape[0] <= minDaysRange:
                continue

            # count the days between the first day and the last day
            day = df['datetime']
            # TODO: how about fund 519858, which traded on 2018-01-28 (a Sunday)?
            firstDayInThisFund = day[day.first_valid_index()]  # e.g. 2018-02-12 00:00:00 (2018-02-10 is a Saturday)
            lastDayInThisFund = day[day.last_valid_index()]  # e.g. 2021-02-10 00:00:00

            # must have a value on the latest day
            if (lastDayInThisFund - lastDayToAnalyze).days != 0:
                continue

            df['daysDiffWithLastDay'] = df['datetime'].apply(lambda x: (lastDayInThisFund - x).days)

            # get the value on the important days
            netValueInFirstDay = df[df['datetime'] == firstDayInThisFund]["AccumulativeNetAssetValue"].tolist()[0]  # e.g. 4.046

            # funds founded earlier than the full analysis window go to the train dataset
            if (firstDayInThisFund - firstDayToAnalyze).days <= 0:
                # compute the adjust factor: the value N years back is adjustFactorToLatestDay * (value[0] / value[day])
                df["adjustFactorToLatestDay"] = df["AccumulativeNetAssetValue"] / netValueInFirstDay
                df = df[["daysDiffWithLastDay", "adjustFactorToLatestDay"]]

                # abandon the latest day, it's meaningless
                df.reset_index(drop=True, inplace=True)
                df = df.T.drop(labels=0, axis=1).T

                # reset index to concat with dfSparsePortfolioForThisFund
                df.reset_index(drop=True, inplace=True)
                df = df.T

                # duplicate rows to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = pd.concat([dfSparsePortfolioForThisFund.T] * df.shape[1])
                # reset index to concat with df
                dfSparsePortfolioForThisFund = dfSparsePortfolioForThisFund.reset_index(drop=True).T

                dfDataset = pd.concat([dfSparsePortfolioForThisFund, df], axis=0)
                dfDataset.to_csv(os.path.join(folderToSaveTrainDataset, "%s.csv" % fundCode))
            else:
                dfInFirstDay = df[df['datetime'] == firstDayInThisFund].reset_index(drop=True)
                dfInFirstDay = dfInFirstDay[["daysDiffWithLastDay"]].T
                dfInFirstDay[fundCode] = dfInFirstDay[0]
                dfDataset = pd.concat([dfSparsePortfolioForThisFund, dfInFirstDay[[fundCode]]], axis=0)
                dfDataset.to_csv(os.path.join(folderToSaveTestDataset, "%s.csv" % fundCode))

            count += 1
        except Exception as e:
            print("fundCode = %s\terror = %s" % (fundCode, e))
            continue

    print("------------------------ Done. ------------------------")

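# A minimal sketch of how prepareTrainDataset is meant to be driven; the
# config keys mirror the cf.get(...) lookups above, and the values are
# illustrative assumptions.
#
# config/config.ini:
#   [Parameter]
#   minDaysRange = 90
#   numberOfYears = 3
#   numberOfMonths = 0
#   numberOfDays = 0
#   [Analyze]
#   pathOfDfSparsePortfolio = data/dfSparsePortfolio.csv

if __name__ == "__main__":
    prepareTrainDataset(ifSavePortfolioIndex=True)
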
def test_1_dump_instruments(self):
    self.DUMP_DATA.dump_instruments()
    ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.iterdir()))
    res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
    # the two sets must match exactly: nothing missing, nothing extra
    assert len(ori_ins - res_ins) == len(res_ins - ori_ins) == 0, "dump instruments failed"