def get_secs_name_from_sql(sec_ids=None, index_code="A"):
    """Fetch latest-date stock names from the index_contents sqlite DB.

    @sec_ids (list): securities to look up; None/empty means *all* stocks
        of ``index_code`` on the latest recorded date.
    @index_code (str): sub-database (table) name, e.g. "A".
    :return: {sec_id: sec_name}; requested ids absent from the table
        are silently omitted.
    """
    # FIX: was a mutable default argument (sec_ids=[]) shared across calls.
    sec_ids = [] if sec_ids is None else sec_ids
    last_date = get_schema('index_contents')[index_code]['end date']
    dbpath = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(last_date[:4]))
    with SqliteProxy(log=False) as proxy:
        proxy.connect(dbpath)
        if len(sec_ids) == 0:  # default: all stocks on the latest date
            query = "SELECT sec_id, sec_name FROM [{}] WHERE date = '{}'".format(
                index_code, last_date)
        elif len(sec_ids) == 1:
            # single id is special-cased: a 1-tuple renders with a trailing
            # comma, which breaks the SQL IN clause below
            query = "SELECT sec_id, sec_name FROM [{}] WHERE date = '{}' AND sec_id = '{}'".format(
                index_code, last_date, sec_ids[0])
        else:
            query = "SELECT sec_id, sec_name FROM [{}] WHERE date = '{}' AND sec_id in {}".format(
                index_code, last_date, tuple(sec_ids))
        df = proxy.query_as_dataframe(query).set_index("sec_id")
        if len(df) == 0:
            Logger.warn("Empty result for query contents from {} on {}".format(
                index_code, last_date))
        if len(sec_ids) == 0:
            # FIX: previously iterated the empty sec_ids and always returned
            # {} for the "all stocks" case despite querying the full table.
            output = df["sec_name"].to_dict()
        else:
            output = {
                sec: df.at[sec, "sec_name"]
                for sec in sec_ids if sec in df.index
            }
    return output
def get_index_weights(index_code, date=""):
    """Read constituent weights of an index on a single date.

    @index_code (str): index code, one of
        ['000016.SH', '000300.SH', '000905.SH'].
    @date (%Y-%m-%d): a single date (required).
    :return: {sec_id: weight}; empty dict when the query has no rows.
    :raises ValueError: on empty date or unsupported index code.
    """
    if not date:
        Logger.error("Empty date")
        raise ValueError
    if index_code not in ['000016.SH', '000300.SH', '000905.SH']:
        Logger.error("Invalid index code: {}".format(index_code))
        # FIX: previously only logged and fell through to query a
        # non-existent table; fail fast like the empty-date branch.
        raise ValueError
    dbpath = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=False) as proxy:
        proxy.connect(dbpath)
        query = "SELECT sec_id, weight FROM [{}] WHERE date = '{}' ".format(
            index_code, date)
        df = proxy.query_as_dataframe(query)
        if len(df) == 0:
            Logger.warn("Empty result when reading {} at {}".format(
                index_code, date))
            output = {}
        else:
            # rely on the default RangeIndex: row i <-> position i
            output = {
                df.at[i, 'sec_id']: df.at[i, 'weight']
                for i in range(len(df))
            }
    return output
def get_factor_return_daily(factor_return_name, trading_days=None):
    """Load daily returns of one factor_return series from the local CSV DB.

    @factor_return_name (str): factor name (must exist in the schema).
    @trading_days (['%Y-%m-%d']): dates to select; None/empty means all.
    :return: DataFrame indexed by date, columns [sec_id, group01-group10,
        factor]; None when the name is unknown or any requested date is
        missing from the file.
    """
    # FIX: was a mutable default argument (trading_days=[]).
    trading_days = [] if trading_days is None else trading_days
    if factor_return_name not in get_schema(
            "factor_return"):  # verify the series exists in the local library
        Logger.error(
            "{} is not in FACTOR_RETURN library".format(factor_return_name))
        return
    else:
        filepath = os.path.join(DB_FACTOR_RETURN_PATH,
                                "{}.csv".format(factor_return_name))
        df_info = open_csv_as_df(filepath, validate=True)
        if not trading_days:
            output = df_info.copy()
        else:
            output = df_info[df_info.date.isin(trading_days)]
            not_found_dates = set(trading_days) - set(output["date"].tolist())
            if not_found_dates:
                # any missing requested date aborts the whole read
                Logger.warn(
                    "Following dates are invalid: {}".format(not_found_dates))
                return
        output = output.set_index(['date'])
        return output
def update_index_contents_to_sql(index_code, trading_days, override,
                                 log=False):
    """Write index constituent records for the given dates into sqlite.

    Dates are grouped by year (one DB file per year). Existing dates are
    skipped unless ``override`` is True, in which case the old rows for
    that date are deleted first. The schema is refreshed on success.

    @index_code (str): index table name; must have an entry in LOADER_MAP.
    @trading_days (['%Y-%m-%d']): dates to update.
    @override (bool): overwrite existing records for a date.
    @log (bool): emit per-date progress logs.
    :raises ValueError: when loading or writing a date fails.
    """
    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)
        for year, date_list in date_classfier.items():
            path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
            proxy.connect(path)
            if index_code not in proxy.list_tables:
                create_table(proxy, "index_contents", index_code)
            # collect the dates already stored so we can skip/overwrite
            query = "SELECT DISTINCT(date) FROM [{}]".format(index_code)
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())
            for date in date_list:
                if date in lookup and not override:
                    # date already stored and overwrite disabled: skip
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            index_code, date))
                    continue
                try:
                    loader = LOADER_MAP[index_code]
                    df = loader(index_code, date)
                    df['date'] = date
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        index_code, date))
                    raise ValueError
                if df is not None:  # download from Wind succeeded
                    try:
                        if date in lookup and override:
                            # overwriting: remove the stale rows first
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    index_code, date))
                        proxy.write_from_dataframe(df, index_code)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                index_code, date))
                        traceback.print_exc()
                        raise ValueError
                    # FIX(consistency): guard the success log with `log`,
                    # matching update_single_indicator/update_single_factor.
                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            index_code, date))
                else:  # loader returned nothing: treat as fetch failure
                    Logger.error("Fail to fetch {} data on {}".format(
                        index_code, date))
                    raise ValueError
    update_schema('index_contents', index_code)
def get_index_contents_on_multidays(index_code, trading_days=None, log=False):
    """Read the full constituent list of an index on multiple dates.

    @index_code (str): index code, currently one of
        ['A', '000905.SH', '000300.SH', '000016.SH'].
    @trading_days (['%Y-%m-%d']): dates to read (must be non-empty).
    @log (bool): emit progress logs.
    :return: {date: [sec_id, ...]}.
    :raises ValueError: on empty dates, unknown index code, or query error.
    """
    # FIX: was a mutable default argument (trading_days=[]).
    trading_days = [] if trading_days is None else trading_days
    if log:
        Logger.info(
            "Reading all {} records between trading_days ...".format(
                index_code), "green")
    if len(trading_days) == 0:
        Logger.error("Empty date")
        raise ValueError
    elif len(trading_days) == 1:
        date = trading_days[0]
        return {date: get_index_contents(index_code, date, log=False)}
    output = {}
    if index_code in IDXCONT_AS_SQL:
        with SqliteProxy(log=log) as proxy:
            for year, date_list in classify_dates_by_year(
                    trading_days).items():
                path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
                proxy.connect(path)
                # FIX: a year bucket may hold a single date even when
                # trading_days has several (dates spanning years); a
                # 1-tuple renders as ('d',) whose trailing comma breaks
                # the SQL IN clause, so special-case it.
                if len(date_list) == 1:
                    query = "SELECT date, sec_id FROM [{}] WHERE date = '{}'".format(
                        index_code, date_list[0])
                else:
                    query = "SELECT date, sec_id FROM [{}] WHERE date IN {}".format(
                        index_code, tuple(date_list))
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                    traceback.print_exc()
                    raise ValueError
                if len(df) == 0:
                    Logger.warn(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                for date in date_list:
                    output[date] = df[df.date == date]['sec_id'].tolist()
    elif index_code in IDXCONT_AS_CSV:
        # CSV-backed contents are not dated: same list for every date
        info = get_index_contents_from_csv(index_code)
        output = {date: info for date in trading_days}
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    return output
def update_indicators(indicators=None,
                      trading_days=None,
                      sec_ids=None,
                      override=False,
                      log=False):
    """Update several indicators over a list of dates.

    @indicators (list): indicator names; None/empty means all in schema.
    @trading_days ([%Y-%m-%d]): dates to update (must be non-empty).
    @sec_ids (list): securities to update; None/empty means all.
    @override (bool): overwrite existing records (default False).
    @log (bool): emit progress logs.
    :raises ValueError: on empty trading_days.
    """
    # FIX: were mutable default arguments ([] shared across calls).
    indicators = [] if indicators is None else indicators
    trading_days = [] if trading_days is None else trading_days
    sec_ids = [] if sec_ids is None else sec_ids
    SCHEMA = get_schema('indicator')
    if not indicators:
        indicators = list(SCHEMA.keys())
    # FIX: guard before indexing — previously an empty list raised a bare
    # IndexError on trading_days[0].
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    start = trading_days[0]
    end = trading_days[-1]
    # each indicator type has its own calendar of valid update days
    update_days_map = {
        "财报数据": set(get_report_days(start, end)),
        "时间序列": set(get_trading_days(start, end)),
    }
    for ind in indicators:
        if ind in SCHEMA:
            # intersect the requested dates with the type's calendar
            itype = SCHEMA[ind]['type']
            update_days = [
                t for t in trading_days if t in update_days_map[itype]
            ]
            if not update_days:
                Logger.warn("No valid days to update!")
            else:
                update_single_indicator(indicator=ind,
                                        trading_days=update_days,
                                        sec_ids=sec_ids,
                                        override=override,
                                        log=log)
        else:
            Logger.error("Unrecognized indicator: {}".format(ind))
def calculate_factor(factor, date):
    """Compute a factor's values for one date from its indicators.

    :param factor (str): factor name; ``formula`` must expose a
        ``calculate_raw_<factor>`` function.
    :param date (%Y-%m-%d): date to compute.
    :return: DataFrame of processed factor values.
    :raises ValueError: when the formula is missing, or too many values
        are missing and no earlier record exists to copy from.
    """
    # FIX: 2-arg getattr raises AttributeError before the None check can
    # ever run; use the 3-arg form so the explicit error path works.
    func = getattr(formula, "calculate_raw_{}".format(factor), None)
    if func is None:
        Logger.error("Formula not implemented: {}".format(factor))
        raise ValueError
    context, df_today, missing_flag = load_context(factor, date)
    last_day = get_previous_existed_day_in_table(date, DB_FACTOR, factor)
    if missing_flag == 1:
        # too many missing values today: fall back to copying the most
        # recent stored record, if one exists
        if last_day is None:
            Logger.error("当前日期数据缺失值太多,且之前没有可以复制的文件")
            raise ValueError
        else:
            Logger.warn("由于 {} 值缺失太多直接复制于 {}".format(date, last_day))
            try:
                df_last = get_secs_factor(factor,
                                          sec_ids=[],
                                          date=last_day,
                                          log=False)
            except Exception:
                traceback.print_exc()
                Logger.warn("无法提取 {} 上个记录日的数据".format(factor))
                raise ValueError
            value = df_today.merge(df_last,
                                   how="left",
                                   left_on='sec_id',
                                   right_index=True)
            return value
    else:
        data_raw = func(context)
        # post-processing: split out missing values, winsorize, standardize
        data_final = statistical_process(data=data_raw,
                                         var=factor,
                                         winsor_LB=WINSORIZE_LB,
                                         winsor_UB=WINSORIZE_UB)
        return data_final
def get_index_contents_from_sql(index_code, date="", log=False):
    """Read the constituent sec_ids of an index on one date from sqlite.

    @index_code (str): index table name.
    @date (%Y-%m-%d): single date (required; the year picks the DB file).
    @log (bool): forwarded to the sqlite proxy.
    :return: list of sec_ids; [] when the date has no rows.
    :raises ValueError: on empty date or query failure.
    """
    # FIX(consistency): an empty date previously produced a bogus '.db'
    # path and a silent empty result; fail fast like get_index_weights.
    if not date:
        Logger.error("Empty date")
        raise ValueError
    path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=log) as proxy:
        proxy.connect(path)
        query = "SELECT sec_id FROM [{}] WHERE date = '{}'".format(
            index_code, date)
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                index_code, date))
            traceback.print_exc()
            raise ValueError
        if len(df) == 0:
            Logger.warn("Empty result when reading {} at {}".format(
                index_code, date))
            return []
        return df["sec_id"].tolist()
def update_index_std(index, cp=3, log=False):
    """Refresh the ``index_std`` table for one index.

    No trading_days argument is needed: every date present in the
    ``index`` table but absent from ``index_std`` is computed and written.

    @index <str>: base index name (NOT the *_std name).
    @cp <int>: winsorize cutoff forwarded to process_ts_index.
    """
    source_dates = get_unique_datelist_from_table("index", index)
    done_dates = get_unique_datelist_from_table("index_std",
                                                "{}_std".format(index))
    # pending = dates in the source table that index_std lacks
    pending = sorted(set(source_dates) - set(done_dates))
    if not pending:
        Logger.warn("All given dates has existed. No need to update!!")
        return
    output = process_ts_index(index, pending, cp)
    if len(output) == 0:
        Logger.error("Fail to process {} on given dates".format(index))
    df2mysql(USER, PASSWORD, "index_std", index + '_std', output)
    # free the (potentially large) frames eagerly
    del output, source_dates, pending
    gc.collect()
    Logger.info("Updated successfully!!")
def get_secs_name(sec_ids=None):
    """Fetch latest-date stock names, dispatching A-shares vs H-shares.

    @sec_ids (list): securities to look up; None/empty means all latest
        A-share and H-share stocks.
    :return: {sec_id: sec_name}; unrecognized ids are warned and skipped.
    """
    # FIX: was a mutable default argument (sec_ids=[]).
    sec_ids = [] if sec_ids is None else sec_ids
    classifier = classify_equity(sec_ids)
    output = {}
    if classifier["A股"]:
        # A-shares live in sqlite
        output.update(get_secs_name_from_sql(classifier["A股"], "A"))
    if classifier["港股"]:
        # HK shares live in CSV
        output.update(get_secs_name_from_csv(classifier["港股"], "H"))
    if classifier["其他"]:
        Logger.warn("Unrecognized sec_ids: {}".format(classifier["其他"]))
    return output
def get_secs_industry(industry_code, sec_ids=None, date=""):
    """Fetch the industry classification of given stocks on one date.

    @industry_code (str): sub-database name, currently one of
        ["A_SWL1", "H_SWL1", "H_GICSL1"].
    @sec_ids (list): securities to look up (required, non-empty).
    @date (%Y-%m-%d): single date (used by the SQL-backed databases).
    :return: {sec_id: industry}; missing securities are omitted.
    :raises ValueError: on an unrecognized industry code.
    """
    # FIX: was a mutable default argument (sec_ids=[]).
    sec_ids = [] if sec_ids is None else sec_ids
    if len(sec_ids) == 0:
        Logger.warn("Empty sec_ids when reading {} on {}!".format(industry_code,
                                                                  date))
        return {}
    if industry_code in INDUSTRY_AS_SQL:
        output = get_secs_industry_from_sql(industry_code, sec_ids, date)
    elif industry_code in INDUSTRY_AS_JSON:
        # JSON-backed classifications are not dated
        output = get_secs_industry_from_json(industry_code, sec_ids)
    else:
        Logger.error("Unrecognized industry code: {}".format(industry_code))
        raise ValueError
    return output
def get_secs_industry_SWL1(sec_ids=None, date=""):
    """Fetch SW level-1 industries, dispatching A-shares vs H-shares.

    @sec_ids (list): securities to look up (required, non-empty).
    @date (%Y-%m-%d): single date.
    :return: {sec_id: industry}.
    """
    # FIX: was a mutable default argument (sec_ids=[]).
    sec_ids = [] if sec_ids is None else sec_ids
    if len(sec_ids) == 0:
        Logger.warn("Empty sec_ids when reading SWL1 on {}!".format(date))
        return {}
    classfier = classify_equity(sec_ids)
    output_A = {}
    if classfier['A股']:
        output_A = get_secs_industry(industry_code="A_SWL1",
                                     sec_ids=classfier['A股'],
                                     date=date)
    output_H = {}
    if classfier['港股']:
        output_H = get_secs_industry(industry_code="H_SWL1",
                                     sec_ids=classfier['港股'],
                                     date=date)
    # merge the two markets into one mapping
    output_A.update(output_H)
    return output_A
def update_single_indicator(indicator,
                            sec_ids=None,
                            trading_days=None,
                            override=False,
                            log=False):
    """Update one indicator for the given dates (one sqlite DB per year).

    @indicator (str): indicator name (must exist in the schema).
    @sec_ids (list): securities to update; None/empty means all.
    @trading_days ([%Y-%m-%d]): dates to update (must be non-empty).
    @override (bool): overwrite existing records (default False).
    @log (bool): emit progress logs.
    :raises ValueError: on bad arguments or any load/write failure.
    """
    # FIX: were mutable default arguments ([] shared across calls).
    sec_ids = [] if sec_ids is None else sec_ids
    trading_days = [] if trading_days is None else trading_days
    if log:
        Logger.info("Updating indicator {}".format(indicator), "green")
    if indicator not in get_schema('indicator'):
        Logger.error("Unrecognized indicator: {}".format(indicator))
        raise ValueError
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)
        for year, date_list in date_classfier.items():
            path = os.path.join(DB_INDICATOR, '{}.db'.format(year))
            proxy.connect(path)
            if indicator not in proxy.list_tables:
                create_table(proxy, "indicator", indicator)
            # collect the dates already stored (single date special-cased
            # because a 1-tuple's trailing comma breaks the IN clause)
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    indicator, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    indicator, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())
            for date in date_list:
                if date in lookup and not override:
                    # date already stored and overwrite disabled: skip
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            indicator, date))
                    continue
                try:
                    df = load_single_indicator_on_single_day_from_wind(
                        indicator=indicator, sec_ids=sec_ids, date=date)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        indicator, date))
                    raise ValueError
                if df is not None:  # download from Wind succeeded
                    if date in lookup and override:
                        # FIX: these branches were if/if/else, so with
                        # len(sec_ids) == 0 BOTH the delete-all branch and
                        # the trailing else ("sec_id in ()") executed;
                        # chain them with elif.
                        if len(sec_ids) == 0:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    indicator, date))
                        elif len(sec_ids) == 1:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}' and sec_id = '{}'"
                                .format(indicator, date, sec_ids[0]))
                        else:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}' and sec_id in {}"
                                .format(indicator, date, tuple(sec_ids)))
                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, indicator)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                indicator, date))
                        traceback.print_exc()
                        raise ValueError
                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            indicator, date))
                else:  # fetch from Wind failed
                    Logger.error("Fail to fetch {} data on {}".format(
                        indicator, date))
                    raise ValueError
    update_schema(db_name="indicator", sub_name=indicator)
    if log:
        Logger.info("indicator {} is updated.".format(indicator),
                    color="green")
        Logger.info("------------------------------------------")
def update_single_factor(factor, trading_days=[], override=False, log=False):
    """Update one factor's values for the given dates (one sqlite DB per year).

    @factor (str): factor name (must exist in the schema).
    @trading_days ([%Y-%m-%d]): dates to update (must be non-empty).
    @override (Bool): overwrite existing records (default False).
    @log (Bool): emit progress logs.
    :raises ValueError: on bad arguments or a write failure; a *load*
        failure for one date only skips that date (see below).
    """
    Logger.info("Updating factor {}".format(factor), "green")
    # counts dates actually written this run, for the summary log
    _n_updated_date = 0
    if factor not in get_schema('factor'):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError
    if not trading_days:
        Logger.error("Empty date")
        raise ValueError
    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)
        for year, date_list in date_classfier.items():
            path = os.path.join(DB_FACTOR, '{}.db'.format(year))
            proxy.connect(path)
            if factor not in proxy.list_tables:
                create_table(proxy, "factor", factor)
            # collect dates already stored; the single-date case avoids the
            # trailing comma a 1-tuple would put inside the IN clause
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    factor, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    factor, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())
            for date in date_list:
                if date in lookup and not override:
                    # date already stored and overwrite disabled: skip
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            factor, date))
                    continue
                try:
                    df = load_single_factor_on_single_day(factor=factor,
                                                          date=date)
                except Exception:
                    # NOTE: unlike update_single_indicator, a load failure
                    # here skips the date rather than aborting the run
                    Logger.error("Error occurred when loading {} on {}".format(
                        factor, date))
                    traceback.print_exc()
                    continue
                if df is not None:  # factor values obtained successfully
                    if date in lookup and override:
                        # overwriting: remove the stale rows first
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}'".format(
                                factor, date))
                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, factor)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                factor, date))
                        traceback.print_exc()
                        raise ValueError
                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            factor, date))
                    _n_updated_date += 1
                else:  # fetch failed (loader returned None)
                    Logger.error("Fail to fetch {} data on {}".format(
                        factor, date))
                    raise ValueError
    update_schema(db_name="factor", sub_name=factor)
    if log:
        _n_all_date = len(trading_days)
        _n_existed_date = _n_all_date - _n_updated_date
        Logger.info("传入日期数:{} 已经存在个数:{} 实际写入次数:{}".format(
            _n_all_date, _n_existed_date, _n_updated_date))
        Logger.info("factor {} is updated.".format(factor), color="green")
        Logger.info("------------------------------------------")
def update_single_factor_return(factor_return,
                                trading_days=None,
                                group_num=10,
                                log=True):
    """Update factor_return CSV data over the given trading days.

    Computes grouped returns for the requested dates and merges them into
    the existing per-factor CSV file (left / in-range / right / full
    update depending on how the new dates overlap the stored range).

    @factor_return (<str>): factor name.
    @trading_days (<[%Y-%m-%d]>): dates to update.
    @group_num (<int>): number of groups.
    @log (Bool): emit progress logs.
    """
    # FIX: was a mutable default argument (trading_days=[]).
    trading_days = [] if trading_days is None else trading_days
    if log:
        Logger.info("Updating factor_return {}...".format(factor_return))
    if factor_return not in get_schema("factor_return"):
        Logger.error("Unrecognized factor_return: {}".format(factor_return))
        return
    factor_path = DB_PATH_LIB['factor']
    factor_exist_dates = get_date_lists_in_table(factor_path, factor_return)
    # dates with no factor data cannot have a return: drop them
    not_found_date = list(set(trading_days) - set(factor_exist_dates))
    if len(not_found_date) != 0 and log:
        Logger.warn(
            "Fail to update these factor returns on following dates due to lack factor:{}"
            .format(not_found_date))
    trading_days = list(set(trading_days) - set(not_found_date))
    # FIX: returns need at least two days (computed from the second day
    # on); previously a single surviving day crashed on trading_days[1].
    if len(trading_days) < 2:
        Logger.error("No valid date to update")
        return
    trading_days = sorted(trading_days)
    db_factor_return_path = os.path.join(DB_PATH, "factor_return")
    filepath = os.path.join(db_factor_return_path,
                            '{}.csv'.format(factor_return))
    df_new = load_single_factor_return_on_multidays(factor_return,
                                                    trading_days, group_num)
    _n_updated_date = len(df_new)
    if not os.path.exists(filepath):  # no prior records for this factor
        Logger.info("首次更新 {}数据".format(factor_return))
        output = df_new.copy()
        output.to_csv(filepath, encoding="utf-8")
    else:
        df_old = pd.read_csv(filepath, encoding="utf-8")  # existing returns
        min_exist_date = normalize(df_old['date'].min(), "%Y-%m-%d")
        max_exist_date = normalize(df_old['date'].max(), "%Y-%m-%d")
        max_update_date = trading_days[-1]
        min_update_date = trading_days[1]  # returns start at the 2nd day
        # reject update windows disjoint from the stored window
        if (max(min_update_date, max_update_date) < min_exist_date) or \
                (min(min_update_date, max_update_date) > max_exist_date):
            Logger.error("非法更新:待更新时间段孤立于现有的时间段")
            Logger.error("开始更新日期:{} 结束更新日期:{}".format(min_update_date,
                                                       max_update_date))
            Logger.error("原有开始日期:{} 原有结束日期:{}".format(min_exist_date,
                                                       max_exist_date))
            return
        if (min_update_date < min_exist_date) and \
                (max_update_date <= max_exist_date) and \
                (max_update_date >= min_exist_date):
            Logger.info("左更新:更新之前记录")
        if (min_update_date >= min_exist_date) and (max_update_date <=
                                                   max_exist_date):
            Logger.info("存量更新:更新当前已经有的记录")
        if (min_update_date >= min_exist_date) and \
                (min_update_date <= max_exist_date) and \
                (max_update_date > max_exist_date):
            Logger.info("右更新:更新未来的记录")
        if (min_update_date < min_exist_date) and \
                (max_update_date > max_exist_date):
            Logger.info("全更新:当前已经存在的日期是待更新日期的子集")
        df_old['date'] = df_old['date'].apply(
            lambda x: normalize(x, "%Y-%m-%d"))
        df_new['date'] = df_new['date'].apply(
            lambda x: normalize(x, "%Y-%m-%d"))
        # keep old rows whose dates are NOT being recomputed this run
        bool_list = df_old['date'].isin(df_new['date']).apply(
            lambda x: not x)
        df_old = df_old[bool_list]
        # FIX: DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent supported API.
        output = pd.concat([df_old, df_new]).sort_values(by=['date'])
        output = output.set_index(['date'])
        format_var_name_list = [
            'group{:0>2}'.format(i) for i in range(1, group_num + 1)
        ]
        format_var_name_list.append('{}'.format(factor_return))
        output = output.reindex(columns=format_var_name_list)
        output.to_csv(filepath, encoding="utf-8")
    update_factor_return_schema(factor_return)
    if log:
        _n_all_date = len(output)
        _n_existed_date = _n_all_date - _n_updated_date
        Logger.info("传入日期数:{} 已经存在个数:{} 实际写入次数:{}".format(
            _n_all_date, _n_existed_date, _n_updated_date))
        Logger.info("factor_return {} is updated.".format(factor_return),
                    color="green")
        Logger.info("------------------------------------------")