def get_factor_return_daily(factor_return_name, trading_days=None):
    """
    Fetch daily returns of one factor_return from the local csv database.

    @factor_return_name (str): factor name
    @trading_days (['%Y-%m-%d']): list of dates; None/[] means all dates
    :return: DataFrame indexed by date, columns [sec_id, group01-group10,
             factor]; None when the factor is unknown or any requested date
             is missing from the file
    """
    # Fixed: mutable default argument ([]) replaced with None sentinel.
    trading_days = trading_days or []
    if factor_return_name not in get_schema("factor_return"):
        # The requested factor_return does not exist in the local library.
        Logger.error(
            "{} is not in FACTOR_RETURN library".format(factor_return_name))
        return
    filepath = os.path.join(DB_FACTOR_RETURN_PATH,
                            "{}.csv".format(factor_return_name))
    df_info = open_csv_as_df(filepath, validate=True)
    if not trading_days:
        output = df_info.copy()
    else:
        output = df_info[df_info.date.isin(trading_days)]
        not_found_dates = set(trading_days) - set(output["date"].tolist())
        if not_found_dates:
            # NOTE(review): returns None (discarding the partial result) when
            # any requested date is absent — confirm this all-or-nothing
            # contract is intended by callers.
            Logger.warn(
                "Following dates are invalid: {}".format(not_found_dates))
            return
    output = output.set_index(['date'])
    return output
def get_index_contents(index_code, date="", approx=False, log=False):
    """
    Read the constituent list of an index on a single date.

    @index_code (str): index code; supports
        ['A', 'H', '000905.SH', '000300.SH', '000016.SH', 'HSI.HI']
    @date ('%Y-%m-%d'): single date
    @approx (Bool): snap the date left to the nearest trading day
    @log (Bool): whether to print logs
    :return (list): list of security codes
    :raises ValueError: empty date or unknown index code
    """
    if log:
        Logger.info(
            "Reading index contents of {} on {}".format(index_code, date),
            "green")
    if not date:
        Logger.error("Empty date")
        raise ValueError
    # approx covers report dates that fall on non-trading days when
    # updating indicator financial-statement data.
    if approx:
        date = get_nearest_trading_day(date=date,
                                       direction='left',
                                       self_included=True)
    if index_code in IDXCONT_AS_SQL:
        return get_index_contents_from_sql(index_code, date, log=log)
    if index_code in IDXCONT_AS_CSV:
        return get_index_contents_from_csv(index_code)
    Logger.error("Unrecognized index code: {}".format(index_code))
    raise ValueError
def update_calendar(start_date, end_date, log=False):
    """
    Refresh calendar data from Wind.

    Every update deletes all existing rows, then reloads the given range.
    @start_date ("%Y-%m-%d"): range start (expected: a month-start date)
    @end_date ("%Y-%m-%d"): range end (expected: a month-end date)
    @log (Bool): whether to print logs
    :raises ValueError: on load or write failure
    """
    Logger.info("Updating calendar ...", "green")
    # Fixed: removed dead line `max_existed_date = get_trading_days`, which
    # assigned the function object (never called) to an unused variable.
    with SqliteProxy(log=log) as proxy:
        proxy.connect(os.path.join(DB_CALENDAR_PATH, "calendar.db"))
        proxy.execute("DELETE FROM calendar")
        try:
            df = load_calendar_from_wind(start_date, end_date)
        except Exception:
            Logger.error("Error occurred when loading")
            raise ValueError
        try:
            proxy.write_from_dataframe(df, "calendar")
        except Exception:
            Logger.error(
                "Error occurred when writing dataframe into sqlite db")
            traceback.print_exc()
            raise ValueError
    if log:
        Logger.info("calendar was updated from {} to {}".format(
            start_date, end_date), color="green")
        Logger.info("------------------------------------------")
def get_index_weights(index_code, date=""):
    """
    Read constituent weights of an index on a single date.

    @index_code (str): supports ['000016.SH', '000300.SH', '000905.SH']
    @date (%Y-%m-%d): single date
    :return: {sec_id: weight}; empty dict when no rows exist for that date
    :raises ValueError: empty date or unsupported index code
    """
    if not date:
        Logger.error("Empty date")
        raise ValueError
    if index_code not in ['000016.SH', '000300.SH', '000905.SH']:
        Logger.error("Invalid index code: {}".format(index_code))
        # Fixed: previously execution fell through after logging and queried
        # the database with an unknown table name.
        raise ValueError
    dbpath = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=False) as proxy:
        proxy.connect(dbpath)
        query = "SELECT sec_id, weight FROM [{}] WHERE date = '{}' ".format(
            index_code, date)
        df = proxy.query_as_dataframe(query)
        if len(df) == 0:
            Logger.warn("Empty result when reading {} at {}".format(
                index_code, date))
            output = {}
        else:
            output = {
                df.at[i, 'sec_id']: df.at[i, 'weight']
                for i in range(len(df))
            }
    return output
def sqlize_db_industry(subdb):
    """
    SQL-ize an industry sub-database: convert its per-date json files into
    per-year sqlite tables.

    @subdb (str): sub-database name (also used as the target table name)
    """
    db_path = DB_PATH_LIB['industry']
    subdb_path = os.path.join(db_path, subdb)
    trading_days = listdir_advanced(subdb_path, 'json', strip_suffix=True)
    with SqliteProxy(log=False) as proxy:
        for year, dates in classify_dates_by_year(trading_days).items():
            path = os.path.join(db_path, '{}.db'.format(year))
            proxy.connect(path)
            if subdb not in proxy.list_tables:
                create_db_03(proxy, subdb)
            for date in dates:
                js = json2dict(
                    os.path.join(subdb_path, '{}.json'.format(date)))
                df = pd.DataFrame(list(js.items()),
                                  columns=['sec_id', 'industry'])
                df['date'] = date
                try:
                    # Fixed: previously every sub-database was written into
                    # the hard-coded "A_SWL1" table instead of `subdb`.
                    proxy.write_from_dataframe(df, subdb)
                except Exception:
                    Logger.error(
                        "Error occurred when sqlizing {} on {}.".format(
                            subdb, date))
                    traceback.print_exc()
def open_db_folder(db=""):
    """
    Open a database folder in Windows Explorer.

    @db (str): database name; empty string opens the root DB_PATH
    :raises ValueError: when `db` is not a known database
    """
    if not db:
        path = DB_PATH
    elif db in DB_PATH_LIB:
        path = DB_PATH_LIB[db]
    else:
        Logger.error("db not found: {}".format(db))
        # Fixed: previously fell through with `path` unbound -> NameError
        # on the Popen line below.
        raise ValueError
    subprocess.Popen(r'explorer "{}"'.format(path))
def get_secs_factor_on_multidays(factor, sec_ids=[], trading_days=[], log=False):
    """
    Read one factor over a range of dates from the local database.

    @factor (str): a single factor name
    @sec_ids (list): security codes; [] means the whole A-share universe
    @trading_days (["%Y-%m-%d"]): list of dates
    @log (Bool): whether to print logs
    :return: {date: DataFrame}, each frame holding sec_id plus the factor column
    """
    if log:
        Logger.info(
            "Reading {} from {} to {}".format(factor, trading_days[0],
                                              trading_days[-1]), "green")
    if factor not in get_schema("factor"):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError
    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    # The sec_id filter is date-independent, so build it once up front.
    if not sec_ids:
        conds = ""  # empty means the whole A-share universe
    elif len(sec_ids) == 1:
        conds = "AND sec_id = '{}'".format(sec_ids[0])
    else:
        conds = "AND sec_id IN {}".format(tuple(sec_ids))
    output = {}
    # One long-lived connection per year beats re-calling get_secs_factor
    # per date, hence the deliberately duplicated query logic.
    with SqliteProxy(log=log) as proxy:
        for year, date_list in classify_dates_by_year(trading_days).items():
            proxy.connect(os.path.join(DB_FACTOR, '{}.db'.format(year)))
            for date in date_list:
                query = "SELECT sec_id, {} FROM [{}] WHERE date = '{}' {}".format(
                    factor, factor, date, conds)
                try:
                    output[date] = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error("Error occurred when reading {} at {}".format(
                        factor, date))
                    traceback.print_exc()
                    raise ValueError
    return output
def load_secs_industry_sw_from_wind(index_code, date, level=1):
    """
    Load Shenwan industry classification from Wind for an index's constituents.

    @index_code (str): index code, e.g. "A" or "H"
    @date (%Y-%m-%d): single date
    @level (int): Shenwan industry level, default 1
    :return: (dict of str): security code -> industry name; {} when the
             universe is empty
    """
    universe = get_index_contents(index_code, date, log=False)
    if universe:
        return get_secs_industry_sw(sec_ids=universe,
                                    date=date,
                                    level=level,
                                    market=index_code)
    Logger.error("Empty universe at {}!".format(date))
    return {}
def load_secs_industry_gics_from_wind(index_code, date, level=1):
    """
    Load GICS industry classification from Wind for an index's constituents.

    @index_code (str): e.g. "H_GICSL1"
    @date (%Y-%m-%d): single date
    @level (int): GICS industry level, default 1
    :return: (dict of str): security code -> industry name; {} when the
             universe is empty
    """
    universe = get_index_contents(index_code, date)
    if not universe:
        Logger.error("Empty universe at {}!".format(date))
        return {}
    # Fixed: the `level` argument was previously ignored (level=1 hard-coded).
    output = get_secs_industry_gics(sec_ids=universe, level=level)
    return output
def update_indicators(indicators=[], trading_days=[], sec_ids=[], override=False, log=False):
    """
    Update several indicators on the given dates.

    @indicators (list): indicator names; [] means every indicator in the schema
    @trading_days ([%Y-%m-%d]): list of dates
    @sec_ids (list): security codes; [] means the whole universe
    @override (Bool): whether to overwrite existing records, default False
    @log (Bool): whether to print logs
    :raises ValueError: when trading_days is empty
    """
    if not trading_days:
        # Fixed: previously trading_days[0] raised a bare IndexError here;
        # fail with the same explicit error used elsewhere in this module.
        Logger.error("Empty date")
        raise ValueError
    SCHEMA = get_schema('indicator')
    if not indicators:
        indicators = list(SCHEMA.keys())
    start = trading_days[0]
    end = trading_days[-1]
    # Financial-report indicators only update on report days; time-series
    # indicators update on trading days.
    update_days_map = {
        "财报数据": set(get_report_days(start, end)),
        "时间序列": set(get_trading_days(start, end)),
    }
    for ind in indicators:
        if ind not in SCHEMA:
            Logger.error("Unrecognized indicator: {}".format(ind))
            continue
        # Intersect the requested dates with the valid dates for this type.
        itype = SCHEMA[ind]['type']
        update_days = [t for t in trading_days if t in update_days_map[itype]]
        if not update_days:
            Logger.warn("No valid days to update!")
        else:
            update_single_indicator(indicator=ind,
                                    trading_days=update_days,
                                    sec_ids=sec_ids,
                                    override=override,
                                    log=log)
def calculate_factor(factor, date):
    """
    Compute a factor's values from its underlying indicators.

    :param factor (str): factor name
    :param date (%Y-%m-%d): date
    :return: DataFrame of processed factor values
    :raises ValueError: missing formula, or unrecoverable missing data
    """
    # Fixed: getattr without a default raises AttributeError for a missing
    # formula, so the `is None` check below was unreachable; pass None.
    func = getattr(formula, "calculate_raw_{}".format(factor), None)
    if func is None:
        Logger.error("Formula not implemented: {}".format(factor))
        raise ValueError
    context, df_today, missing_flag = load_context(factor, date)
    last_day = get_previous_existed_day_in_table(date, DB_FACTOR, factor)
    if missing_flag == 1:
        # Too many values missing today: fall back to copying the most
        # recent stored record, if one exists.
        if last_day is None:
            Logger.error("当前日期数据缺失值太多,且之前没有可以复制的文件")
            raise ValueError
        Logger.warn("由于 {} 值缺失太多直接复制于 {}".format(date, last_day))
        try:
            df_last = get_secs_factor(factor,
                                      sec_ids=[],
                                      date=last_day,
                                      log=False)
        except Exception:
            traceback.print_exc()
            Logger.warn("无法提取 {} 上个记录日的数据".format(factor))
            raise ValueError
        value = df_today.merge(df_last,
                               how="left",
                               left_on='sec_id',
                               right_index=True)
        return value
    data_raw = func(context)
    # Post-processing: split off missing values, winsorize, standardize.
    data_final = statistical_process(data=data_raw,
                                     var=factor,
                                     winsor_LB=WINSORIZE_LB,
                                     winsor_UB=WINSORIZE_UB)
    return data_final
def get_index_contents_from_sql(index_code, date="", log=False):
    """
    Read an index's constituent codes for one date from the yearly sqlite db.

    @index_code (str): index code (also the table name)
    @date ('%Y-%m-%d'): single date; its year selects the db file
    @log (Bool): whether to print logs
    :return (list): security codes; [] when the query returns no rows
    """
    path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=log) as proxy:
        proxy.connect(path)
        query = "SELECT sec_id FROM [{}] WHERE date = '{}'".format(
            index_code, date)
        try:
            frame = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                index_code, date))
            traceback.print_exc()
            raise ValueError
        if frame.empty:
            Logger.warn("Empty result when reading {} at {}".format(
                index_code, date))
            return []
        return frame["sec_id"].tolist()
def update_index_std(index, cp=3, log=False):
    """
    Update index_std for every date present in `index` but missing from
    `index_std`; no trading_days argument is needed.

    @index <str>: index name (not the index_std name)
    @cp <int>: winsorize cutoff
    """
    source_days = get_unique_datelist_from_table("index", index)
    done_days = get_unique_datelist_from_table("index_std",
                                               "{}_std".format(index))
    update_days = sorted(set(source_days) - set(done_days))
    if not update_days:
        Logger.warn("All given dates has existed. No need to update!!")
        return
    output = process_ts_index(index, update_days, cp)
    if len(output) == 0:
        Logger.error("Fail to process {} on given dates".format(index))
    df2mysql(USER, PASSWORD, "index_std", index + '_std', output)
    # Free the large intermediates eagerly before returning.
    del output, source_days, update_days
    gc.collect()
    Logger.info("Updated successfully!!")
def update_industry(industry, trading_days=[], override=False, log=False):
    """
    Update one industry database from Wind.

    @industry (str): industry database name
    @trading_days (['%Y-%m-%d']): list of dates
    @override (Bool): whether to overwrite existing records, default False
    @log (Bool): whether to print logs
    """
    Logger.info("Updating industry {}".format(industry), "green")
    if industry not in get_schema('industry'):
        Logger.error("Unrecognized industry: {}".format(industry))
        return
    if len(trading_days) == 0:
        Logger.error("Empty date")
        raise ValueError
    stored_as_sql = industry in INDUSTRY_AS_SQL
    stored_as_json = industry in INDUSTRY_AS_JSON
    if not (stored_as_sql or stored_as_json):
        Logger.error("Unrecognized industry: {}".format(industry))
        raise ValueError
    if stored_as_sql:
        update_industry_to_sql(industry, trading_days, override, log)
    else:
        # Non-sql data is force-updated; the old file is backed up first.
        update_industry_to_json(industry, trading_days)
    if log:
        Logger.info("industry/{} is updated.".format(industry), color="green")
        Logger.info("------------------------------------------")
def update_industry_to_json(industry, trading_days):
    """
    Force-update a json-backed industry database for the latest given date.

    A backup of the current file is saved before writing, and the schema is
    refreshed manually (json files cannot go through update_schema()).

    @industry (str): industry database name
    @trading_days (['%Y-%m-%d']): list of dates; only the last one is used
    :raises ValueError: on load, write, or schema-update failure
    """
    # Fixed: `date` is now bound before the try block, so the except handler
    # can no longer hit a NameError while formatting its message (previously
    # possible when trading_days was empty).
    date = trading_days[-1] if trading_days else ""
    try:
        index_code, loader = INDEX_LOADER_MAP[industry]
        info = loader(index_code, date, level=1)
    except Exception:
        Logger.error("Error occurred when loading {} on {}".format(industry, date))
        raise ValueError
    try:
        path = os.path.join(DB_INDUSTRY, '{}.json'.format(industry))
        copy_to = os.path.join(DB_INDUSTRY, '{}_backup.json'.format(industry))
        shutil.copy(path, copy_to)  # keep a backup in case of corruption
        dict2json(info, path, log=False)
        Logger.info("{} on {} is updated successfully".format(industry, date))
    except Exception:
        Logger.error("Error occurred when writing {} on {}".format(industry, date))
        raise ValueError
    # json files are different from sql, cannot use update_schema();
    # therefore update schema information explicitly.
    try:
        now = datetime.now()
        schema = get_schema('industry')
        schema[industry]["begin date"] = ""
        schema[industry]["end date"] = now.strftime('%Y-%m-%d')
        schema[industry]['last update'] = now.strftime('%Y-%m-%d %H:%M:%S')
        save_schema(schema, 'industry')
        Logger.info("schema updated: {}".format(industry))
    except Exception:
        Logger.error("Error occurred when updating schema of {}".format(industry))
        traceback.print_exc()
        raise ValueError
def get_secs_IC(ic_code, trading_days=[], log=False):
    """
    Read one IC series over a range of dates from the local MySQL database.

    @ic_code (str): a single IC name; stocks_info covers the whole A-share market
    @trading_days (["%Y-%m-%d"]): list of dates
    @log (Bool): whether to print logs
    :return: DataFrame with columns date, sec_id, sec_name, is_st, is_trade, ...
    :raises ValueError: empty dates or query failure
    """
    # Fixed: the empty-date check now runs before the log line, which
    # indexed trading_days[0] and crashed on an empty list.
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    if log:
        Logger.info(
            "Reading {} from {} to {}".format(ic_code, trading_days[0],
                                              trading_days[-1]), "green")
    with MySQLProxy(log=log) as proxy:
        proxy.connect(USER, PASSWORD, 'index')
        # NOTE: one value needs `= '...'`; several values need a tuple.
        if len(trading_days) == 1:
            query = "SELECT * FROM {} WHERE date = '{}' ".format(
                ic_code, trading_days[0])
        else:
            query = "SELECT * FROM {} WHERE date in {}".format(
                ic_code, tuple(trading_days))
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            # Fixed: the message previously referenced the undefined name
            # `date`, turning every query failure into a NameError.
            Logger.error("Error occurred when reading {} at {}".format(
                ic_code, trading_days))
            traceback.print_exc()
            raise ValueError
    return df
def update_factors_return(factors_ret_to_update=[], trading_days=[], group_num=10, log=True):
    """
    Update factor_return data for the given dates.

    @factors_ret_to_update (<list>): factor list; [] means all in the schema
    @trading_days (<[%Y-%m-%d]>): list of dates
    @group_num (<int>): number of groups
    @log (<Bool>): whether to print logs
    """
    factor_return_schema = get_schema('factor_return')
    targets = (factors_ret_to_update
               if factors_ret_to_update else list(factor_return_schema.keys()))
    for factor_ret in targets:
        if factor_ret in factor_return_schema:
            update_single_factor_return(factor_ret, trading_days, group_num,
                                        log)
        else:
            Logger.error("Unrecognized factor return: {}".format(factor_ret))
def get_secs_industry(industry_code, sec_ids=[], date=""):
    """
    Get the industry classification of some securities on one date.

    @industry_code (str): sub-database name; supports
        ["A_SWL1", "H_SWL1", "H_GICSL1"]
    @sec_ids (list): security codes
    @date ("%Y-%m-%d"): single date
    :return: {sec_id: industry}; securities without a record are omitted
    :raises ValueError: unknown industry code
    """
    if not sec_ids:
        Logger.warn("Empty sec_ids when reading {} on {}!".format(industry_code, date))
        return {}
    if industry_code in INDUSTRY_AS_SQL:
        return get_secs_industry_from_sql(industry_code, sec_ids, date)
    if industry_code in INDUSTRY_AS_JSON:
        return get_secs_industry_from_json(industry_code, sec_ids)
    Logger.error("Unrecognized industry code: {}".format(industry_code))
    raise ValueError
def update_factors(factors=[], trading_days=[], override=False, log=False):
    """
    Update several factors on the given dates, ordered by factor level.

    @factors (<list>): factor names; [] means all factors in the schema
    @trading_days ([%Y-%m-%d]): list of dates
    @override (<Bool>): whether to overwrite existing records, default False
    @log (<Bool>): whether to print logs
    """
    SCHEMA = get_schema("factor")
    # Default run covers every factor, lowest level first so dependencies
    # are computed before the factors built on top of them.
    targets = factors or sorted(SCHEMA, key=lambda name: SCHEMA[name]["level"])
    for fac in targets:
        if fac not in SCHEMA:
            Logger.error("Unrecognized factor: {}".format(fac))
            continue
        update_single_factor(factor=fac,
                             trading_days=trading_days,
                             override=override,
                             log=log)
    Logger.info("------------------------------------------")
def get_secs_factor(factor, sec_ids=[], date="", log=False):
    """
    Read one factor's values on a single date from the local database.

    @factor (str): a single factor name
    @sec_ids (list): security codes; [] means the whole A-share universe
    @date ('%Y-%m-%d'): single date
    @log (Bool): whether to print logs
    :return: DataFrame with the factor column, indexed by sec_id
    """
    if log:
        Logger.info("Reading {} at {}".format(factor, date), "green")
    if factor not in get_schema("factor"):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError
    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError
    if not date:
        Logger.error("Empty date")
        raise ValueError
    # Restrict the query only when specific securities were requested.
    if not sec_ids:
        conds = ""
    elif len(sec_ids) == 1:
        conds = "AND sec_id = '{}'".format(sec_ids[0])
    else:
        conds = "AND sec_id IN {}".format(tuple(sec_ids))
    query = "SELECT sec_id, {} FROM [{}] WHERE date = '{}' {}".format(
        factor, factor, date, conds)
    with SqliteProxy(log=log) as proxy:
        proxy.connect(os.path.join(DB_FACTOR, '{}.db'.format(date[:4])))
        try:
            result = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                factor, date))
            traceback.print_exc()
            raise ValueError
    return result.sort_values(by=['sec_id']).set_index(['sec_id'])
def sqlize_db(db_name, subdb_list=[]):
    """SQL-ize a database: convert per-date csv files into yearly sqlite tables."""
    if not subdb_list:
        subdb_list = list(get_schema(db_name).keys())
    else:
        subdb_list = [s for s in subdb_list if s in get_schema(db_name)]
    db_path = os.path.join(DB_PATH, db_name)
    with SqliteProxy(log=False) as proxy:
        for subdb in subdb_list:
            Logger.info("SQLing {}/{}".format(db_name, subdb), "green")
            subdb_path = os.path.join(db_path, subdb)
            all_dates = listdir_advanced(subdb_path, 'csv', strip_suffix=True)
            for year, dates in classify_dates_by_year(all_dates).items():
                proxy.connect(os.path.join(db_path, '{}.db'.format(year)))
                if subdb not in proxy.list_tables:
                    # First appearance of this table in this year's db file.
                    creator = DB_CREATOR_MAP[db_name]
                    creator(proxy, subdb)
                for date in dates:
                    csv_file = os.path.join(subdb_path, '{}.csv'.format(date))
                    frame = pd.read_csv(csv_file)
                    frame['date'] = date
                    try:
                        proxy.write_from_dataframe(frame, subdb)
                    except Exception:
                        Logger.error(
                            "Error occurred when sqlizing {} on {}.".format(
                                subdb, date))
                        traceback.print_exc()
def update_index_contents_to_sql(index_code, trading_days, override, log=False):
    """Update one index's constituent tables in the per-year sqlite files.

    @index_code (str): index code (also the table name)
    @trading_days (['%Y-%m-%d']): dates to update
    @override (Bool): overwrite records that already exist for a date
    @log (Bool): whether to print logs
    Raises ValueError on any load or write failure.
    """
    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)
        for year, date_list in date_classfier.items():
            path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
            proxy.connect(path)
            if index_code not in proxy.list_tables:
                create_table(proxy, "index_contents", index_code)
            # Collect the dates already stored, to decide skip/overwrite below.
            query = "SELECT DISTINCT(date) FROM [{}]".format(index_code)
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())
            for date in date_list:
                if date in lookup and not override:
                    # Date already present and overwriting disabled: skip it.
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            index_code, date))
                    continue
                try:
                    loader = LOADER_MAP[index_code]
                    df = loader(index_code, date)
                    df['date'] = date
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        index_code, date))
                    raise ValueError
                if df is not None:  # data fetched from Wind successfully
                    try:
                        if date in lookup and override:
                            # Overwriting: drop the old rows for this date first.
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    index_code, date))
                        proxy.write_from_dataframe(df, index_code)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                index_code, date))
                        traceback.print_exc()
                        raise ValueError
                    Logger.info("{} on {} is updated successfully".format(
                        index_code, date))
                else:
                    # Wind fetch failed: abort the whole update.
                    Logger.error("Fail to fetch {} data on {}".format(
                        index_code, date))
                    raise ValueError
    update_schema('index_contents', index_code)
def generate_table_template(db, table_name):
    """Return the column template used to create table `table_name` in `db`.

    Each entry is a (column_name, sql_type, flag1, flag2) tuple consumed by
    the table-creation helper. Raises ValueError for unknown db/table names.
    """
    date_col = ("date", "CHAR(10)", False, False)
    sec_col = ("sec_id", "TEXT", False, False)
    if db in ("indicator", "factor"):
        # One value column named after the table itself.
        return [date_col, sec_col, (table_name, "REAL", False, True)]
    if db == "index_contents":
        if table_name == "A_SWL1":
            return [date_col, sec_col, ("sec_name", "TEXT", False, False)]
        if table_name in ('000016.SH', '000300.SH', '000905.SH'):
            return [
                date_col,
                sec_col,
                ("sec_name", "TEXT", False, False),
                ("weight", "REAL", False, False),
            ]
        Logger.error("Unrecognized table name: {}".format(table_name))
        raise ValueError
    if db == "industry":
        return [date_col, sec_col, ("industry", "TEXT", False, True)]
    Logger.error("Unrecognized db name: {}".format(db))
    raise ValueError
def get_index_contents_on_multidays(index_code, trading_days=[], log=False):
    """
    Read an index's constituent lists over several dates.

    @index_code (str): index code; supports
        ['A', '000905.SH', '000300.SH', '000016.SH']
    @trading_days (['%Y-%m-%d']): list of dates
    @log (Bool): whether to print logs
    :return: ({date: list}) date -> list of security codes
    :raises ValueError: empty dates, unknown index code, or query failure
    """
    if log:
        Logger.info(
            "Reading all {} records between trading_days ...".format(
                index_code), "green")
    if len(trading_days) == 0:
        Logger.error("Empty date")
        raise ValueError
    elif len(trading_days) == 1:
        date = trading_days[0]
        return {date: get_index_contents(index_code, date, log=False)}
    output = {}
    if index_code in IDXCONT_AS_SQL:
        with SqliteProxy(log=log) as proxy:
            for year, date_list in classify_dates_by_year(
                    trading_days).items():
                path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
                proxy.connect(path)
                # Fixed: a year containing exactly one date produced the
                # single-element tuple rendering "('d',)", which is invalid
                # SQL; use an equality condition in that case.
                if len(date_list) == 1:
                    cond = "= '{}'".format(date_list[0])
                else:
                    cond = "IN {}".format(tuple(date_list))
                query = "SELECT date, sec_id FROM [{}] WHERE date {}".format(
                    index_code, cond)
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                    traceback.print_exc()
                    raise ValueError
                if len(df) == 0:
                    Logger.warn(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                for date in date_list:
                    output[date] = df[df.date == date]['sec_id'].tolist()
    elif index_code in IDXCONT_AS_CSV:
        # csv-backed indices are static lists: same contents on every date.
        info = get_index_contents_from_csv(index_code)
        output = {date: info for date in trading_days}
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    return output
def get_secs_index_std(index_std, sec_ids=[], trading_days=[], log=False):
    """
    Read one index_std over a range of dates from the local MySQL database.

    @index_std (str): a single index_std name
    @sec_ids (list): security codes; [] means the whole A-share universe
    @trading_days (["%Y-%m-%d"]): list of dates
    @log (Bool): whether to print logs
    :return: DataFrame (date column coerced to str) with the index_std values
    :raises ValueError: bad arguments or query failure
    """
    # Fixed: validate before logging — the log line indexes trading_days[0]
    # and crashed on an empty list.
    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    if log:
        Logger.info("Reading {} from {} to {}".format(
            index_std, trading_days[0], trading_days[-1]), "green")
    # NOTE: one value needs "= '...'"; several values need a tuple.
    if len(trading_days) == 1:
        date_cond = "date = '{}'".format(trading_days[0])
    else:
        date_cond = "date in {}".format(tuple(trading_days))
    if len(sec_ids) == 0:
        sec_cond = ""  # whole A-share universe
    elif len(sec_ids) == 1:
        sec_cond = " AND sec_id = '{}'".format(sec_ids[0])
    else:
        sec_cond = " AND sec_id in {}".format(tuple(sec_ids))
    query = "SELECT * FROM {} WHERE {}{}".format(index_std, date_cond,
                                                 sec_cond)
    with MySQLProxy(log=log) as proxy:
        proxy.connect(USER, PASSWORD, "index_std")
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            # Fixed: the message previously referenced the undefined name
            # `inde`, turning every query failure into a NameError.
            Logger.error("Error occurred when reading {} ".format(index_std))
            traceback.print_exc()
            raise ValueError
    df['date'] = df['date'].apply(lambda x: str(x))
    return df
def load_single_indicator_on_single_day_from_wind(indicator, sec_ids, date, log=False):
    """Download one indicator for one date from Wind.

    @sec_ids <list>: security codes; empty means the whole A-share universe
    @indicator (str): indicator name; only a single indicator is supported
    @date ("%Y-%m-%d"): single date
    :return: DataFrame with columns ['sec_id', <indicator name>]
    Raises ValueError on an unknown indicator type or empty universe.
    """
    WindAPI.login(is_quiet=True)
    schema = SCHEMA[indicator]
    # NOTE(review): `options` aliases schema['kwargs'] — the tradeDate/rptDate
    # assignments below mutate the shared SCHEMA entry in place; confirm this
    # is intended (it persists across calls).
    options = schema['kwargs']
    if len(sec_ids) != 0:  # empty sec_ids means the whole A-share universe
        universe = sec_ids
    if schema["type"] == "时间序列":
        # Time-series indicators: query on the trading date itself.
        if len(sec_ids) == 0:
            universe = get_index_contents(index_code="A", date=date, log=log)
        if universe is None:
            Logger.error("Fail to fetch stock lists on {}".format(date))
            raise ValueError
        options["tradeDate"] = date.replace("-", "")
    elif schema["type"] == "财报数据":
        # Financial-report indicators: approx=True covers report dates that
        # fall on non-trading days.
        if len(sec_ids) == 0:
            universe = get_index_contents(index_code="A",
                                          date=date,
                                          approx=True,
                                          log=log)
        if universe is None:
            Logger.error("Fail to fetch stock lists on: {}".format(date))
            raise ValueError
        options["rptDate"] = date.replace("-", "")
    else:
        Logger.error("Unrecognized indicator type: {}".format(schema["type"]))
        raise ValueError
    response = WDServer.wss(codes=",".join(universe),
                            fields=SCHEMA[indicator]['field'],
                            options=options2str(options))
    WindAPI.test_error(response)
    # Reshape the Wind response (columns in Data, codes in Codes) into a
    # two-column frame: sec_id plus the indicator values.
    df = {field: response.Data[i] for i, field in enumerate(response.Fields)}
    df = pd.DataFrame(df, index=response.Codes).reset_index()
    df.columns = ["sec_id", indicator]
    return df
def update_index_contents_to_csv(index_code, trading_days, override):
    """
    Force-update a csv-backed index-contents file for the latest given date.

    A backup of the current file is saved before writing, and the schema is
    refreshed manually (csv files cannot go through update_schema()).

    @index_code (str): index code
    @trading_days (['%Y-%m-%d']): list of dates; only the last one is used
    @override: unused; kept for signature parity with the sql updater
    :raises ValueError: on load, write, or schema-update failure
    """
    try:
        date = trading_days[-1]
        # Fixed: `loader` was previously used one line before it was
        # assigned, raising NameError (reported as a load error) on every call.
        loader = LOADER_MAP[index_code]
        df = loader(index_code, date)
    except Exception:
        Logger.error("Error occurred when loading {}".format(index_code))
        raise ValueError
    try:
        path = os.path.join(DB_INDEX_CONTENTS, '{}.csv'.format(index_code))
        copy_to = os.path.join(DB_INDEX_CONTENTS,
                               '{}_backup.csv'.format(index_code))
        shutil.copy(path, copy_to)  # keep a backup in case of corruption
        df.to_csv(path, encoding="utf-8", index=False)
        Logger.info("{} on {} is updated successfully".format(
            index_code, date))
    except Exception:
        Logger.error("Error occurred when writing {}".format(index_code))
        traceback.print_exc()
        raise ValueError
    # csv files are different from sql, cannot use update_schema();
    # therefore update schema information explicitly.
    try:
        now = datetime.now()
        schema = get_schema('index_contents')
        schema[index_code]["begin date"] = ""
        schema[index_code]["end date"] = now.strftime('%Y-%m-%d')
        schema[index_code]['last update'] = now.strftime('%Y-%m-%d %H:%M:%S')
        save_schema(schema, 'index_contents')
        Logger.info("schema updated: {}".format(index_code))
    except Exception:
        Logger.error(
            "Error occurred when updating schema of {}".format(index_code))
        traceback.print_exc()
        raise ValueError
def update_index_contents(index_code, trading_days=[], override=False, log=False):
    """
    Update index_contents data from Wind.

    @index_code (str): index to update
    @trading_days (['%Y-%m-%d']): list of dates
    @override (Bool): whether to overwrite existing records, default False
    @log (Bool): whether to print logs
    :raises ValueError: empty dates or unknown index code
    """
    Logger.info("Updating index_contents {}".format(index_code), "green")
    if index_code not in get_schema('index_contents'):
        Logger.error("Unrecognized index: {}".format(index_code))
        return
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    if index_code in IDXCONT_AS_SQL:
        update_index_contents_to_sql(index_code, trading_days, override, log)
    elif index_code in IDXCONT_AS_CSV:
        # Non-sql data is force-updated; the old file is backed up first.
        # Fixed: the csv updater takes three positional arguments; `override`
        # was previously omitted, raising TypeError on every csv update.
        update_index_contents_to_csv(index_code, trading_days, override)
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    if log:
        Logger.info("index_content/{} is updated.".format(index_code),
                    color="green")
        Logger.info("------------------------------------------")
def get_single_index_daily_return(index_std, trading_days, cycle, groups=10, save_plot=True, save_daily=True, save_cum=True):
    """Back-test a single index_std and emit its per-period return table.

    @index_std <str>: index name
    @trading_days <list of date>: back-test window (datetime values)
    @cycle <int>: rebalancing period in days
    @groups <int>: number of groups, default 10
    @save_plot/@save_daily/@save_cum: whether to save the png / per-period
        csv / cumulative csv under PROJECT_FILES_PATH
    """
    if len(trading_days) == 0:
        # NOTE(review): logs the error but does not raise/return — execution
        # continues with an empty list; confirm whether this should abort.
        Logger.error("Empty date!!")
    # Pick every `cycle`-th date as a rebalancing point.
    trading_days = sorted(trading_days)
    selected_days = []
    for i in range(cycle, len(trading_days), cycle):
        selected_days.append(trading_days[i])
    selected_days = list(map(str, selected_days))
    # Fetch adjusted close prices and index_std for the whole window at once.
    df_close = get_secs_index(index='close',
                              sec_ids=[],
                              trading_days=selected_days)
    df_index = get_secs_index_std(index_std=index_std,
                                  sec_ids=[],
                                  trading_days=selected_days)
    df_daily = pd.DataFrame()
    for i in range(len(selected_days) - 1):
        date_now = selected_days[i]
        date_next = selected_days[i + 1]
        index_now = df_index[df_index.date == date_now]
        close_now = df_close[df_close.date == date_now].rename(
            columns={'close': 'close_now'})
        del close_now['date']
        close_next = df_close[df_close.date == date_next].rename(
            columns={'close': 'close_next'})
        del close_next['date']
        # Join factor value with this period's and next period's close.
        df_all = index_now.merge(close_now, how='inner', on=['sec_id'])
        df_all = df_all.merge(close_next, how='inner', on=['sec_id'])
        df_all = df_all.sort_values(by=[index_std])
        df_all['return_rate'] = df_all['close_next'] / df_all['close_now'] - 1
        # Grouping: leftovers from uneven division go to the last group;
        # the excess never exceeds the group count.
        # NOTE(review): `dist` is 0 when there are fewer rows than groups,
        # which would make range() raise ValueError — confirm inputs.
        df_all['group'] = np.nan
        group_index = df_all.columns.tolist().index('group')
        group = 1
        dist = int(df_all.shape[0] / groups)
        for j in range(0, df_all.shape[0], dist):
            if group < groups:
                df_all.iloc[j:j + dist, group_index] = group
            else:
                df_all.iloc[j:, group_index] = group
                break
            group += 1
        # Close-weighted mean return per group for this period.
        df_group = df_all.groupby([
            'group'
        ]).apply(lambda x: sum(x.return_rate *
                               (x.close_now / x.close_now.sum()))).to_frame()
        df_group = df_group.rename(columns={0: 'return_rate'})
        df_group = df_group.transpose()
        new_names = ['group{:0>2}'.format(i) for i in range(1, groups + 1)]
        df_group.columns = new_names
        df_group['date'] = date_now
        df_daily = df_daily.append(df_group)
    df_daily = df_daily.reset_index().drop(['index'], 1)
    # Spread between the top and bottom groups.
    df_daily['diff'] = df_daily['group{:0>2}'.format(
        groups)] - df_daily['group01']
    if save_daily:
        df_daily.to_csv(os.path.join(
            PROJECT_FILES_PATH, 'daily_return',
            "{}_cycle_{}_daily.csv".format(index_std, cycle)),
                        index=False,
                        encoding='utf-8')
    # Cumulative return under simple (non-compounding) interest.
    df_cum = df_daily.copy()
    for i in range(df_daily.shape[0]):
        if i == 0:
            df_cum.iloc[i, 0:groups] = 1
        else:
            df_cum.iloc[i, 0:groups] = df_cum.iloc[
                i - 1, 0:groups] + df_daily.iloc[i - 1, 0:groups]
    if save_cum:
        df_cum.to_csv(os.path.join(
            PROJECT_FILES_PATH, 'cum_return',
            "{}_cycle_{}_cum.csv".format(index_std, cycle)),
                      index=False,
                      encoding='utf-8')
    # Plot the bottom and top groups' cumulative curves.
    fig = plt.figure(figsize=(12, 6))
    plt.ylabel("单利累计收益")
    ax = plt.subplot(111)
    df_cum = df_cum.set_index('date')
    df_cum[['group01', 'group{:0>2}'.format(groups)]].plot(ax=ax)
    ax.set_title("{}单利累计收益图 cycle={}".format(index_std, cycle))
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    if save_plot:
        picfile = os.path.join(PROJECT_FILES_PATH, 'plot',
                               "{}_cycle_{}_cum.png".format(index_std, cycle))
        plt.savefig(picfile)
        print("plot is saved to: {}".format(picfile))
def update_single_indicator(indicator, sec_ids=[], trading_days=[], override=False, log=False):
    """
    Update one indicator's records on the given dates.

    @indicator (str): a single indicator name
    @sec_ids <list>: security codes; [] means the whole universe
    @trading_days ([%Y-%m-%d]): list of dates
    @override (Bool): whether to overwrite existing records, default False
    @log (Bool): whether to print logs
    :raises ValueError: unknown indicator, empty dates, or load/write failure
    """
    if log:
        Logger.info("Updating indicator {}".format(indicator), "green")
    if indicator not in get_schema('indicator'):
        Logger.error("Unrecognized indicator: {}".format(indicator))
        raise ValueError
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)
        for year, date_list in date_classfier.items():
            path = os.path.join(DB_INDICATOR, '{}.db'.format(year))
            proxy.connect(path)
            if indicator not in proxy.list_tables:
                create_table(proxy, "indicator", indicator)
            # Find which of the requested dates already have records.
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    indicator, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    indicator, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())
            for date in date_list:
                if date in lookup and not override:
                    # Date already present and overwriting disabled: skip it.
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            indicator, date))
                    continue
                try:
                    df = load_single_indicator_on_single_day_from_wind(
                        indicator=indicator, sec_ids=sec_ids, date=date)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        indicator, date))
                    raise ValueError
                if df is None:
                    # Wind fetch failed: abort the whole update.
                    Logger.error("Fail to fetch {} data on {}".format(
                        indicator, date))
                    raise ValueError
                if date in lookup and override:
                    # Fixed: the three delete cases were chained as
                    # `if` / `if` / `else`, so an empty sec_ids list also fell
                    # into the last branch and issued the invalid SQL
                    # "sec_id in ()"; the branches are now mutually exclusive.
                    if len(sec_ids) == 0:
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}'".format(
                                indicator, date))
                    elif len(sec_ids) == 1:
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}' and sec_id = '{}'"
                            .format(indicator, date, sec_ids[0]))
                    else:
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}' and sec_id in {}"
                            .format(indicator, date, tuple(sec_ids)))
                df['date'] = date
                try:
                    proxy.write_from_dataframe(df, indicator)
                except Exception:
                    Logger.error(
                        "Error occurred when writing {} on {}".format(
                            indicator, date))
                    traceback.print_exc()
                    raise ValueError
                if log:
                    Logger.info("{} on {} is updated successfully".format(
                        indicator, date))
    update_schema(db_name="indicator", sub_name=indicator)
    if log:
        Logger.info("indicator {} is updated.".format(indicator),
                    color="green")
        Logger.info("------------------------------------------")