def check_dict(path=FACTOR_DICT_FILE_PATH):
    '''
    Check whether the on-disk factor-dictionary file matches the factor
    dictionary defined by the current module; if they differ, or the file
    does not exist yet, regenerate the dictionary file.

    Parameter
    ---------
    path: str, default FACTOR_DICT_FILE_PATH
        location of the factor-dictionary file
    '''
    try:
        # Bug fix: honor the ``path`` parameter instead of the hard-coded
        # FACTOR_DICT_FILE_PATH, so a caller-supplied path actually takes effect
        file_dict = load_pickle(path)
    except FileNotFoundError:
        # No dictionary file yet -> create it from scratch
        print('Dictionary file not found, initialization...')
        update_factordict()
        return
    # Bug fix: reuse the already-loaded ``file_dict`` instead of loading the
    # pickle a second time
    module_dict = gen_path_dict(get_factor_dict())
    if module_dict != file_dict:
        print('Updating dictionary file...')
        update_factordict()
def query(factor_name, time, codes=None, fillna=None):
    '''
    Handle an external request and fetch the data of the requested factor
    from the database.

    Parameter
    ---------
    factor_name: str
        name of the factor to query
    time: type that can be converted by pd.to_datetime or tuple of that
        a single value queries a cross section; a tuple (start_time, end_time)
        queries a time series
    codes: list, default None
        stock codes to query; None means all stocks
    fillna: int or float, default None
        value used to fill NA entries; None means no filling

    Return
    ------
    out: pd.DataFrame
        result data with time as index and stock codes as columns; None if
        no data matched the request
    '''
    factor_dict = load_pickle(FACTOR_DICT_FILE_PATH)
    if factor_dict is None:
        raise ValueError('Dictionary file needs initialization...')
    err_msg = ('Error, factor name "{pname}" is'.format(pname=factor_name) +
               ' not valid, valid names are {vnames}'.format(vnames=list(factor_dict.keys())))
    assert factor_name in factor_dict, err_msg
    connector = database.DBConnector(factor_dict[factor_name])
    result = connector.query(time, codes)
    if fillna is not None:
        result = result.fillna(fillna)
    return result
def get_universe(path=UNIVERSE_FILE_PATH):
    '''
    Return the stock universe corresponding to the current data.

    Parameter
    ---------
    path: str, default UNIVERSE_FILE_PATH
        location of the stored universe file

    Return
    ------
    out: list
        the universe of the current data, sorted
    '''
    # The pickle stores a tuple (universe, update_time); only the first
    # element is needed here
    return sorted(datatoolkits.load_pickle(path)[0])
def update_universe(path=UNIVERSE_FILE_PATH):
    '''
    Fetch the latest universe, compare it with the previously stored one and
    report any change, then persist the latest universe to the given file as
    a tuple ``(universe, update_time)``.

    Parameter
    ---------
    path: str, default UNIVERSE_FILE_PATH
        file in which the universe data is stored

    Return
    ------
    universe: list
        the latest universe

    Notes
    -----
    Do not call this function directly to obtain the universe: the result may
    be inconsistent with the universe of the factor data. To get the current
    universe use fmanger.factors.utils.get_universe instead.
    '''
    # Bug fix: ``__name__.split()`` splits on whitespace, which is a no-op for
    # a dotted module name; split on '.' to address the root-package logger
    logger = logging.getLogger(__name__.split('.')[0])
    new_universe = fdgetter.get_db_data(fdgetter.BASIC_SQLs['A_UNIVERSE'], cols=('code', ),
                                        add_stockcode=False)
    new_universe['code'] = new_universe.code.apply(datatoolkits.add_suffix)
    new_universe = new_universe.code.tolist()
    try:
        universe, _ = datatoolkits.load_pickle(path)
        nu_set = set(new_universe)
        ou_set = set(universe)
        if nu_set != ou_set:
            # sorted() makes the reported diff deterministic (set iteration
            # order is arbitrary)
            add_diff = sorted(nu_set.difference(ou_set))
            minus_diff = sorted(ou_set.difference(nu_set))
            msg = 'Warning: universe UPDATED, {drop} are DROPPED, {add} are ADDED'.\
                format(drop=minus_diff, add=add_diff)
            logger.info(msg)
            print(msg)
    except FileNotFoundError:
        # First run: no stored universe to compare against
        pass
    data = (new_universe, dt.datetime.now())
    datatoolkits.dump_pickle(data, path)
    return new_universe
def query(factor_name, time, codes=None, fillna=None):
    '''
    Handle an external request and fetch the data of the requested factor
    from the database.

    Parameter
    ---------
    factor_name: str
        name of the factor to query
    time: type that can be converted by pd.to_datetime or tuple of that
        a single value queries a cross section; a tuple (start_time, end_time)
        queries a time series
    codes: list, default None
        stock codes to query; None means all stocks
    fillna: int or float, default None (parameter to be deprecated)
        value used to fill NA entries; None means the database default is used

    Return
    ------
    out: pd.DataFrame
        result data with time as index and stock codes as columns; None if
        no data matched the request
    '''
    # If the machine changed, the factor dictionary must be refreshed first
    factor_dict = load_pickle(FACTOR_DICT_FILE_PATH)
    if factor_dict is None:
        raise ValueError('Dictionary file needs initialization...')
    err_msg = ('Error, factor name "{pname}" is'.format(pname=factor_name) +
               ' not valid, valid names are {vnames}'.format(vnames=sorted(factor_dict.keys())))
    assert factor_name in factor_dict, err_msg
    connector = database.DBConnector(factor_dict[factor_name])
    result = connector.query(time, codes)
    if result is None:
        return None
    universe = get_universe()
    if codes is None:
        # Reindex against the shared universe so cross sections queried
        # without an explicit code list always have the same width
        result = result.reindex(columns=universe)
    fill_value = connector.default_data if fillna is None else fillna
    if isinstance(fill_value, np.bytes_):
        fill_value = fill_value.decode('utf8')
    return result.fillna(fill_value)
formater.get_basicformater(param)} Notes ----- 函数会根据参数的形式推断需要使用的formater的方法,推断方法如下: 当字典值为tuple时会使用get_modformater,当字典值为str时,使用get_basicformater ''' out = dict() for col, format_type in format_set.items(): if isinstance(format_type, str): out[col] = formater.get_basicformater(format_type) else: f, p = format_type out[col] = formater.get_modformater(f, p) return out if __name__ == '__main__': test_data = datatoolkits.load_pickle( r"F:\GeneralLib\CONST_DATAS\htmltable.pickle") res = table_convertor.format_df(test_data.reset_index(), formater=trans2formater({ 'nav': 'pct2p', 'CSI700': ('pctnp', 4) }), order=['nav', 'index', 'CSI700']) # ret = datatoolkits.load_pickle(r"F:\GeneralLib\CONST_DATAS\sample_ret.pickle") # sr = sortino_ratio(ret.group_05.pct_change().dropna(), 0.04) # print(sr)
#!/usr/bin/env python # -*- coding:utf-8 """ Author: Hao Li Email: [email protected] Github: https://github.com/SAmmer0 Created: 2018/3/20 """ import pandas as pd from datatoolkits import load_pickle import dateshandle from tdtools.tradingcalendar import TradingCalendar TD_PATH = r"E:\GeneralLib\CONST_DATAS\tradingDays.pickle" td_data = load_pickle(TD_PATH) trading_times = (('09:30', '11:30'), ('13:00', '15:00')) sse_calendar = TradingCalendar(td_data, trading_times) # 交易日计数测试 start_time = '2017-01-01' end_time = '2018-03-02' mod_cnt = sse_calendar.count(start_time, end_time, 'both') old_cnt = dateshandle.tds_count(start_time, end_time) assert mod_cnt == old_cnt # 交易日区间测试 start_time = '2016-03-01' end_time = '2017-11-03' mod_tds = sse_calendar.get_tradingdays(start_time,