def analysis_exposure(port, benchmark_port, date):
    """
    Analyse the portfolio's factor exposure relative to a benchmark.

    Parameter
    ---------
    port: pandas.Series
        Portfolio holdings; index is the stock code, value is the weight
    benchmark_port: pandas.Series
        Benchmark holdings in the same format; None means no benchmark
    date: datetime like
        Date on which the exposure is computed

    Return
    ------
    exposure: pandas.Series
    """
    # Restrict the universe to stocks flagged valid by BARRA_VSF
    barra_vsf = fmanager.query('BARRA_VSF', date).iloc[0]
    mask = barra_vsf == 1
    port = port.loc[mask]
    if benchmark_port is not None:
        benchmark_port = benchmark_port.loc[mask]
        # Re-normalize benchmark weights after filtering
        benchmark_port = benchmark_port / benchmark_port.sum()
    factors = [f for f in fmanager.list_allfactor() if f.startswith('BARRA_RF')]
    factors += ['ZX_IND']
    analysor = ExposureAnalysor({f: DataView(fmanager.generate_getter(f),
                                             get_calendar('stock.sse'))
                                 for f in factors},
                                'ZX_IND')
    # BUG FIX: the docstring allows benchmark_port to be None, but the
    # original unconditionally called benchmark_port.to_dict(), raising
    # AttributeError when no benchmark is given. Pass None through instead.
    # NOTE(review): assumes calculate_exposure accepts None — confirm.
    benchmark_dict = benchmark_port.to_dict() if benchmark_port is not None else None
    exposure = analysor.calculate_exposure(date, port.to_dict(),
                                           benchmark_dict, True)
    return exposure
def main_test(fund_symbol, benchmark_name):
    """
    Compute a fund's factor exposure against a benchmark as of the
    2018-06-30 report date, dropping the placeholder 'NaS' entry.
    """
    rpt_date = '2018-06-30'
    calc_date = get_calendar('stock.sse').latest_tradingday(rpt_date, 'FUTURE')
    bmk_weights = fmanager.query(benchmark_name, calc_date).iloc[0].dropna()
    holdings = get_fund_detail_position(fund_symbol, rpt_date)
    result = analysis_exposure(holdings, bmk_weights, calc_date)
    return result.drop('NaS')
def query_compare():
    """
    Compare data read back from the HDF5 store against the factor database.

    Rows before `threshold_loc` are compared only on the initially-inserted
    columns; the remaining rows are compared on the full column set.
    Prints two booleans, one per segment.
    """
    old_db_data = query(test_factor, (start_time, second_end_time))
    threshold_loc = old_db_data.index.get_loc(first_end_time) + 1
    new_db_data = HDF5Engine.query(
        ParamsParser.from_dict(
            db_path, {
                "rel_path": 'test',
                'start_time': start_time,
                'end_time': second_end_time,
                "store_fmt": (DataClassification.STRUCTURED,
                              DataValueCategory.NUMERIC,
                              DataFormatCategory.PANEL)
            }))
    # Fill NaN with a sentinel so NaN entries compare equal
    old_db_data = old_db_data.fillna(0)
    new_db_data = new_db_data.fillna(0)
    columns1 = new_db_data.columns[:initial_size]
    columns2 = new_db_data.columns
    # BUG FIX: DataFrame.ix is deprecated and removed in pandas >= 1.0.
    # Replace it with positional row slicing (iloc) combined with
    # label-based column selection (loc), which is what .ix resolved to
    # here (integer row slice + label columns).
    is_close1 = np.isclose(old_db_data.iloc[:threshold_loc].loc[:, columns1],
                           new_db_data.iloc[:threshold_loc].loc[:, columns1])
    is_close2 = np.isclose(old_db_data.iloc[threshold_loc:].loc[:, columns2],
                           new_db_data.iloc[threshold_loc:])
    print(np.all(is_close1))
    print(np.all(is_close2))
def load_data(self):
    '''
    Lazily load this factor's data from the factor database; no-op if the
    data has already been loaded.
    '''
    if self.loaded:
        return
    self._data = query(self._factor_name, (self._start_time, self._end_time))
    self.loaded = True
def stock_filter(date):
    '''
    Select the stock codes falling into quantile group `group_id` of the
    factor on the given date, after filtering out non-tradeable stocks,
    ST stocks, stocks outside the pool, and (optionally) stocks without an
    industry classification. Quantiles are computed within each industry.
    '''
    frame = pd.DataFrame({
        'st_data': query('ST_TAG', date).iloc[0],
        'trade_data': query('TRADEABLE', date).iloc[0],
        'factor': query(factor_name, date).iloc[0],
    })
    # Stock-pool membership: all ones when no pool is configured
    if stock_pool is None:
        frame = frame.assign(stock_pool=[1] * len(frame))
    else:
        frame = frame.assign(stock_pool=query(stock_pool, date).iloc[0])
    # Industry tags: placeholder NaS when no classification is configured;
    # otherwise drop stocks whose industry is the NaS placeholder
    if industry_cls is None:
        frame = frame.assign(industry=[NaS] * len(frame))
    else:
        frame = frame.assign(industry=query(industry_cls, date).iloc[0])
        frame = frame.loc[frame.industry != NaS]
    tradeable = (frame.trade_data == 1) & (frame.st_data == 0) & (frame.stock_pool == 1)
    frame = frame.loc[tradeable, :].dropna(subset=['factor'], axis=0)
    # Assign each stock a within-industry quantile group id
    per_industry = frame.groupby('industry')
    frame = frame.assign(datag=per_industry.factor.transform(
        lambda x: pd.qcut(x, group_num, labels=range(group_num))))
    return frame.groupby('datag').get_group(group_id).index.tolist()
def second_insert():
    """Query the extended sample period and insert it into the HDF5 store."""
    payload = query(test_factor, (start_time, second_end_time))
    params = ParamsParser.from_dict(
        db_path, {
            'rel_path': rel_path,
            'store_fmt': (DataClassification.STRUCTURED,
                          DataValueCategory.NUMERIC,
                          DataFormatCategory.PANEL),
            'dtype': np_dtype('float64'),
        })
    HDF5Engine.insert(payload, params)
def __init__(self, port_data, standardlize=True):
    '''
    Initialize the monitor from an (already refreshed) portfolio holding.

    Parameter
    ---------
    port_data: portmonitor.manager.PortfolioData
        Portfolio data whose holdings have already been updated
    standardlize: boolean, default True
        Whether to normalize the portfolio value; when normalized, every
        refresh returns the change of the portfolio's net asset value
    '''
    if standardlize:
        self._port_basevalue = port_data.last_asset_value
    else:
        self._port_basevalue = 1
    # Holdings keyed by stock code with the exchange suffix stripped
    self._port_holding = pd.Series(
        {drop_suffix(c): n for c, n in port_data.curholding.items()})
    self.id_ = port_data.id
    today = dt.datetime.now()
    # Trading day shifted back by 2 — presumably the last settled close;
    # NOTE(review): confirm tds_pshift semantics against its definition
    self._last_td = tds_pshift(today, 2)
    self._data = []
    # set_trace()
    if 'ts_data' not in self._share_data:
        # First instance initializes yesterday's close from tushare
        ts_data = get_today_all().set_index('code').settlement.apply(
            np.float)
        # Drop duplicated entries in the index
        ts_data = ts_data.loc[~ts_data.index.duplicated()]
        self._share_data['ts_data'] = ts_data
    if 'local_lastclose_data' not in self._share_data:
        # First instance initializes yesterday's close from the local database;
        # trim codes to 6 chars to match the tushare code format
        lastclose_data = query('CLOSE', self._last_td).iloc[0]
        lastclose_data.index = lastclose_data.index.str.slice(stop=6)
        self._share_data['local_lastclose_data'] = lastclose_data
    if 'adj_ratio' not in self._share_data:
        # First instance initializes the holding adjustment ratio
        ts_data = self._share_data['ts_data']
        lastclose_data = self._share_data['local_lastclose_data']
        diff = np.round(ts_data - lastclose_data, 2)
        ratio = lastclose_data / ts_data
        # set_trace()
        isclose2zero = diff == 0
        # Ratio is 1 where both closes agree, else local/tushare; stocks with
        # no current data get ratio 0, which is normally fine because such
        # stocks are either delisted or not held
        adj_ratio = pd.Series(np.where(isclose2zero, 1, ratio),
                              index=ratio.index).fillna(0)
        adj_ratio[CASH] = 1
        self._share_data['adj_ratio'] = adj_ratio
    self._adj_rtnum()
def __init__(self, bt, benchmark=None, riskfree_rate=0.04):
    '''
    Parameter
    ---------
    bt: BackTest
        The back-test instance to be analysed
    benchmark: pd.Series, default None
        Benchmark NAV data whose start date must equal bt's; when omitted,
        the SSE Composite close (SSEC_CLOSE) over the same period is used
    riskfree_rate: float, default 0.04
        Risk-free rate
    '''
    super().__init__(bt)
    series = (query('SSEC_CLOSE', (bt.start_date, bt.end_date)).iloc[:, 0]
              if benchmark is None else benchmark)
    self._benchmark = price2nav(series)
    self._riskfree_rate = riskfree_rate
    self._result_cache = None
def first_insert():
    """Insert the initial sample (first `initial_size` columns, shuffled
    column order) into the HDF5 store."""
    sample = query(test_factor, (start_time, first_end_time)).iloc[:, :initial_size]
    shuffled_cols = list(sample.columns)
    shuffle(shuffled_cols)
    sample = sample.loc[:, shuffled_cols]
    params = ParamsParser.from_dict(
        db_path, {
            'rel_path': rel_path,
            'store_fmt': (DataClassification.STRUCTURED,
                          DataValueCategory.NUMERIC,
                          DataFormatCategory.PANEL),
            'dtype': np_dtype('float64'),
        })
    HDF5Engine.insert(sample, params)
def fv_correlation(factors, start_time, end_time, freq=MONTHLY, average=True, method='pearson'):
    '''
    Compute the correlation matrices between the values of different factors.

    Parameter
    ---------
    factors: list like
        Factor names; each must be found in fmanager.list_allfactor()
    start_time: datetime like
        Start of the correlation computation period
    end_time: datetime like
        End of the correlation computation period
    freq: str, default const.MONTHLY
        Computation frequency; only weekly (WEEKLY) and monthly (MONTHLY)
        are currently supported
    average: boolean, default True
        Whether to return the average of the per-date correlation matrices
    method: string, default pearson
        Correlation method; one of ['pearson', 'spearman', 'kendall']

    Return
    ------
    out: pd.DataFrame or OrderDict
        A pd.DataFrame when average is True; otherwise an OrderDict mapping
        each computation date to its correlation matrix
    '''
    reb_calculator = load_rebcalculator(freq, start_time, end_time)
    # One panel per factor, sampled at the rebalance dates
    sampled = [query(name, (start_time, end_time)).reindex(reb_calculator.reb_points)
               for name in factors]
    panel = convert_data(sampled, factors)
    grouped = panel.groupby(level=0)
    out = OrderedDict()
    for timestamp in grouped.groups:
        cross_section = grouped.get_group(timestamp).reset_index(level=0, drop=True)
        out[timestamp] = cross_section.T.corr(method=method)
    if not average:
        return out
    return reduce(lambda acc, mat: acc + mat, out.values()) / len(out)
def _prepare_contextualfactor(self): ''' 对条件因子进行回测,用于获取一些各个情境下的基准表现 ''' # 条件因子回测 contextualfactor_test = FactortestTemplate( self.context_factor, self._start_time, self._end_time, group_num=self._context_num, reb_method=self._reb_type, show_progress=self.show_progress) contextualfactor_bt = contextualfactor_test.run_test() # 获取条件因子各组换手率、行业分布、净值的数据 # 净值分析数据 benchmark = query('SSEC_CLOSE', (self._start_time, self._end_time)).iloc[:, 0] benchmark = benchmark / benchmark.iloc[0] contextf_navanalysor = NavAnalysor(contextualfactor_bt, benchmark) contextf_navres = contextf_navanalysor.analysis_result # 换手率分析数据 contextf_toanalysor = TOAnalysor(contextualfactor_bt) contextf_tores = contextf_toanalysor.analysis_result # 行业分布分析数据 contextf_indanalysor = IndustryAnalysor(contextualfactor_bt, self._ind_cls) contextf_indres = contextf_indanalysor.analysis_result # 存储中间数据 self.context_factor_btres = FactorGroupTestRes( navs=contextualfactor_bt.navpd, nav_analysis=contextf_navres, to_analysis=contextf_tores, ind_analysis=contextf_indres, IC=None, Rank_IC=None) self._contextbt = contextualfactor_bt
def query_data_bydate(date, data_msg):
    '''
    Helper: fetch data for a given date from the database and assemble it
    into a single pd.DataFrame.

    Parameter
    ---------
    date: datetime like
        Date of the data
    data_msg: dict
        Query specification in the form {factor_name: col_name}; each
        factor_name must be found in fmanager.list_allfactor(), and
        col_name is the column name the data gets in the returned
        pd.DataFrame. Both factor_name and col_name must be strings.

    Return
    ------
    out: pd.DataFrame
        The queried data
    '''
    columns = {col_name: query(factor_name, date).iloc[0]
               for factor_name, col_name in data_msg.items()}
    return pd.DataFrame(columns)
def factor_purify(tobe_purified, other_factors, start_time, end_time, normalize=True, winsorize_threshold=0.01, universe=None): ''' 使用回归的方法剔除其他因子对目标因子的影响,即使用目标因子对其他因子做横截面上的回归, 然后取残差,作为新的因子值 Parameter --------- tobe_purified: str 需要被纯化的因子名称 other_factors: list like 作为自变量的因子,格式为[factor1, factor2, ...] start_time: datetime like 纯化因子数据的开始时间 end_time: datetime like 纯化因子数据的结束时间 normalize: boolean, default True 是否在回归前对异常值进行winsorize处理,并将各个因子的数据转换为z-score winsorize_threshold: float, default 0.01 在winsorize处理时传入的参数,共2*n*winsorize_threshold个数据将会被进行winsorize处理 universe: iterable, default None 股票的universe,默认None表示从fmanager.get_universe中获取 Return ------ out: pd.DataFrame 经过纯化后的因子数据,index为时间,columns为universe中的股票代码 ''' # 加载数据 raw_data = query(tobe_purified, (start_time, end_time)) factors_data = list() factors_data.append(raw_data) for f in other_factors: tmp_data = query(f, (start_time, end_time)) factors_data.append(tmp_data) factors_tag = [tobe_purified] + list(other_factors) # universe获取 if universe is None: universe = get_universe() # 对数据进行正则化处理 if normalize: new_data = list() for data in factors_data: tmp = data.apply(lambda x: standardlize( winsorize(x, (winsorize_threshold, 1 - winsorize_threshold))), axis=1) tmp = tmp.loc[:, sorted(universe)] new_data.append(tmp) factors_data = new_data data = convert_data(factors_data, factors_tag) by_time = data.groupby(level=0) def calc_resid(x): raw_index = x.columns x = x.reset_index(level=0, drop=True).T.dropna(axis=0, how='any') res = extract_factor_OLS(x, factor_col=tobe_purified, x_cols=other_factors, standardlization=False) # pdb.set_trace() res = res.reindex(raw_index) return res out = by_time.apply(calc_resid) return out
Created: 2018/2/27
"""
from numpy import all as np_all
from numpy import isclose as np_isclose

from database.db import Database
from database.const import DataClassification, DataFormatCategory, DataValueCategory
from fmanager import query

# Test fixture: insert numeric, character and unstructured data into a
# Database instance rooted at db_path
db_path = r'C:\Users\c\Desktop\test\db_test'
start_time = '2017-01-01'
end_time = '2018-02-01'
db = Database(db_path)
num_data = query('CLOSE', (start_time, end_time))
char_data = query('ZX_IND', (start_time, end_time))
unstruct_data = list(range(1000))
db.insert(num_data, 'num_test',
          (DataClassification.STRUCTURED, DataValueCategory.NUMERIC,
           DataFormatCategory.PANEL), 'float64')
db.insert(char_data, 'char_test',
          (DataClassification.STRUCTURED, DataValueCategory.CHAR,
           DataFormatCategory.PANEL))
db.insert(unstruct_data, 'unstruct_data.test',
          (DataClassification.UNSTRUCTURED, ))
# Period used by the query checks that follow
query_start = '2017-05-01'
query_end = '2017-12-06'
from os import remove from os.path import exists, join from numpy import dtype as np_dtype from database.hdf5Engine.dbcore import HDF5Engine from database.const import DataFormatCategory, DataValueCategory, DataClassification from database.db import ParamsParser from fmanager import query TEST_FACTOR = 'CLOSE' start_time = '2017-01-01' end_time = '2018-01-15' new_end = '2018-02-01' sample_df = query(TEST_FACTOR, (start_time, end_time)) new_data = query(TEST_FACTOR, (end_time, new_end)) db_path = r'C:\Users\c\Desktop\test' # file_path = join(db_path, 'test.h5') # if exists(file_path): # remove(file_path) HDF5Engine.insert( new_data, ParamsParser.from_dict( db_path, { "rel_path": 'test', "store_fmt": (DataClassification.STRUCTURED, DataValueCategory.NUMERIC, DataFormatCategory.PANEL), "dtype":
    plt.setp(ax.xaxis.get_minorticklabels(), rotation=rotation)
    for xlabel in ax.xaxis.get_majorticklabels():  # hide major tick labels
        xlabel.set_visible(False)
    for xlabel in ax.xaxis.get_minorticklabels():  # show minor tick labels
        xlabel.set_visible(True)
    xmin, _ = plt.xlim()
    # extend the left x-limit by one unit
    plt.xlim(xmin=xmin - 1)
    plt.show()


if __name__ == '__main__':
    import pandas as pd
    import fmanager
    start_time = '2016-01-01'
    end_time = '2017-02-01'
    # Build an OHLC frame from the first stock column of each quote factor
    open_data = fmanager.query('OPEN', (start_time, end_time)).iloc[:, 0]
    close_data = fmanager.query('CLOSE', (start_time, end_time)).iloc[:, 0]
    high_data = fmanager.query('HIGH', (start_time, end_time)).iloc[:, 0]
    low_data = fmanager.query('LOW', (start_time, end_time)).iloc[:, 0]
    data = pd.DataFrame({
        'open': open_data,
        'close': close_data,
        'high': high_data,
        'low': low_data
    })
    plot_candle(data.reset_index(), time_col='index', time_index=False,
                rotation=45)
# @Version : $Id$ import numpy as np from database.hdf5Engine.dbcore import HDF5Engine from database.db import ParamsParser from database.const import DataClassification, DataValueCategory, DataFormatCategory from fmanager import query start_time = '2017-01-01' end_time = '2018-01-01' db_path = r'C:\Users\c\Desktop\test' data = HDF5Engine.query( ParamsParser.from_dict( db_path, { "rel_path": 'test_series', 'start_time': start_time, 'end_time': end_time, "store_fmt": (DataClassification.STRUCTURED, DataValueCategory.NUMERIC, DataFormatCategory.TIME_SERIES) })) fm_data = query('CLOSE', (start_time, end_time)).iloc[:, 0] data = data.fillna(-10000) fm_data = fm_data.fillna(-10000) print(np.all(data == fm_data))
from time import time

import numpy as np

from database.jsonEngine.dbcore import JSONEngine
from database.const import DataClassification, DataFormatCategory, DataValueCategory
from database.db import ParamsParser
from fmanager import query

# Test fixture: rebuild the JSON store folder and insert a character panel
sample_start_time = '2017-01-01'
sample_end_time = '2018-02-01'
query_start_time = '2017-05-01'
query_end_time = '2017-12-04'
sample_data = query('ZX_IND', (sample_start_time, sample_end_time))
db_path = r'C:\Users\c\Desktop\test'
json_db = 'df_query_test'
# NOTE(review): `join`/`exists`/`rmtree` imports are outside this view
folder_path = join(db_path, json_db)
if exists(folder_path):
    rmtree(folder_path)
JSONEngine.insert(
    sample_data,
    ParamsParser.from_dict(
        db_path, {
            'rel_path': json_db,
            'store_fmt': (DataClassification.STRUCTURED, DataValueCategory.CHAR,
                          DataFormatCategory.PANEL)
        }))
def get_df(start_time, end_time):
    """Fetch the CLOSE factor panel for the given period."""
    period = (start_time, end_time)
    return query('CLOSE', period)
from os.path import exists, join from numpy import dtype as np_dtype from database.hdf5Engine.dbcore import HDF5Engine from database.const import DataFormatCategory, DataValueCategory, DataClassification from database.db import ParamsParser from fmanager import query TEST_FACTOR = 'CLOSE' start_time = '2017-01-01' end_time = '2017-12-30' new_end = '2018-01-15' second_insert_test = True sample_series = query(TEST_FACTOR, (start_time, end_time)).iloc[:, 0] new_sample_series = query(TEST_FACTOR, (start_time, new_end)).iloc[:, 0] db_path = r'C:\Users\c\Desktop\test' if not second_insert_test: file_path = join(db_path, 'test_series.h5') if exists(file_path): remove(file_path) HDF5Engine.insert( sample_series, ParamsParser.from_dict( db_path, { "rel_path": 'test_series', "store_fmt": (DataClassification.STRUCTURED, DataValueCategory.NUMERIC,
# @Link : https://github.com/SAmmer0 # @Version : $Id$ import numpy as np from database.hdf5Engine.dbcore import HDF5Engine from database.db import ParamsParser from database.const import DataClassification, DataValueCategory, DataFormatCategory from fmanager import query start_time = '2017-01-01' end_time = '2018-02-01' db_path = r'C:\Users\c\Desktop\test' data = HDF5Engine.query( ParamsParser.from_dict( db_path, { "rel_path": 'test', 'start_time': start_time, 'end_time': end_time, "store_fmt": (DataClassification.STRUCTURED, DataValueCategory.NUMERIC, DataFormatCategory.PANEL) })) fm_data = query('CLOSE', (start_time, end_time)) data = data.fillna(-10000) fm_data = fm_data.fillna(-10000) print(np.all(np.all(data == fm_data, axis=1)))
# -*- coding:utf-8 """ Author: Hao Li Email: [email protected] Github: https://github.com/SAmmer0 Created: 2018/3/28 """ import numpy as np from fmanager import query from tdtools import get_calendar from datautils.datacache.cachecore import DataView test_dates = [('2015-04-16', '2015-05-07'), ('2016-06-01', '2017-01-01'), ('2017-11-01', '2018-03-01'), ('2014-01-01', '2014-05-01')] def get_df(start_time, end_time): return query('CLOSE', (start_time, end_time)) dv_df = DataView(get_df, get_calendar('stock.sse')) for date in test_dates: tmp = dv_df.get_tsdata(*date).fillna(-1000) data_cpr = query('CLOSE', date).fillna(-1000) assert np.all(np.all(np.isclose(tmp, data_cpr), axis=1)) print(dv_df._cache_start, dv_df._cache_end, dv_df._extendable) assert len(dv_df._data_cache) == get_calendar('stock.sse').count( dv_df._cache_start, dv_df._cache_end)
from database.jsonEngine.dbcore import JSONEngine

# Test inserting two time series into the JSON store, either overlapping or
# contiguous in time depending on OVERLAP_INSERT_FLAG
OVERLAP_INSERT_FLAG = False  # whether the inserted data overlaps in time
TEST_SECOND_FLAG = True  # whether to perform the second insert
first_start = '2017-01-01'
first_end = '2017-06-01'
if OVERLAP_INSERT_FLAG:
    second_start = '2017-04-01'
    second_end = '2018-02-01'
else:
    second_start = '2017-06-02'
    second_end = '2018-02-01'
first_sample = query('ZX_IND', (first_start, first_end)).iloc[:, 0]
second_sample = query('ZX_IND', (second_start, second_end)).iloc[:, 0]
db_path = r'C:\Users\c\Desktop\test'
json_db = 'sjson_test'
# NOTE(review): `join`/`exists`/`rmtree`/`query` imports are outside this view
folder_path = join(db_path, json_db)
if exists(folder_path):
    rmtree(folder_path)
print(JSONEngine.insert(first_sample,
                        ParamsParser.from_dict(db_path,
                                               {'rel_path': json_db,
                                                'store_fmt': (DataClassification.STRUCTURED,
                                                              DataValueCategory.CHAR,
                                                              DataFormatCategory.TIME_SERIES)})))
if TEST_SECOND_FLAG:
    print(JSONEngine.insert(second_sample,
                            ParamsParser.from_dict(db_path,
                                                   {'rel_path': json_db,
                                                    'store_fmt': (DataClassification.STRUCTURED,
                                                                  DataValueCategory.CHAR,
                                                                  DataFormatCategory.TIME_SERIES)})))
from shutil import rmtree from os.path import exists from database.db import Database from database.const import DataClassification, DataFormatCategory, DataValueCategory from fmanager import query db_path = r'C:\Users\c\Desktop\test\db_test' if exists(db_path): rmtree(db_path) start_time = '2017-01-01' end_time = '2018-02-01' db = Database(db_path) num_data = query('CLOSE', (start_time, end_time)) char_data = query('ZX_IND', (start_time, end_time)) third_data = query('BETA', (start_time, end_time)) unstruct_data = list(range(1000)) print( db.insert(num_data, 'num_test', (DataClassification.STRUCTURED, DataValueCategory.NUMERIC, DataFormatCategory.PANEL), 'float64')) print( db.insert(char_data, 'char_test', (DataClassification.STRUCTURED, DataValueCategory.CHAR, DataFormatCategory.PANEL))) print( db.insert(third_data, 'factor.beta', (DataClassification.STRUCTURED, DataValueCategory.NUMERIC,
# -*- encoding: utf-8 ''' 对dbcore中DataWrapper的功能进行测试 ''' import json from fmanager import query from database.jsonEngine.dbcore import DataWrapper sample_data = query('ZX_IND', ('2017-12-01', '2018-02-01')) sample_data2 = query('ZX_IND', ('2017-11-01', '2018-01-01')) # 从pandas数据初始化 data = DataWrapper.init_from_pd(sample_data) data2 = DataWrapper.init_from_pd(sample_data2) print('Done!')
from datautils.datacache.cachecore import DataView
from fmanager import query
from tdtools import get_calendar

# Cross-section retrieval test: the DataView cache must match a direct
# query for dates inside and beyond the initial cache window


def get_df(start_time, end_time):
    # Data getter used by the DataView cache
    return query('CLOSE', (start_time, end_time))


dv_df = DataView(get_df, get_calendar('stock.sse'))
# Cross-section data retrieval test
# NOTE(review): the `np` import is outside this view
date1 = '2016-03-25'
tmp = dv_df.get_csdata(date1).fillna(-1000)
data_cpr = query('CLOSE', date1).iloc[0, :].fillna(-1000)
assert np.all(np.isclose(tmp, data_cpr))
print(dv_df._cache_start, dv_df._cache_end, dv_df._extendable)
date2 = '2017-01-04'
tmp = dv_df.get_csdata(date2).fillna(-1000)
data_cpr = query('CLOSE', date2).iloc[0, :].fillna(-1000)
assert np.all(np.isclose(tmp, data_cpr))
print(dv_df._cache_start, dv_df._cache_end, dv_df._extendable)
date3 = '2018-01-04'
tmp = dv_df.get_csdata(date3).fillna(-1000)
data_cpr = query('CLOSE', date3).iloc[0, :].fillna(-1000)
assert np.all(np.isclose(tmp, data_cpr))
print(dv_df._cache_start, dv_df._cache_end, dv_df._extendable)
'dtype': 'float64', 'rel_path': 'basicfactor.beta' }] start_time = '2014-01-01' end_time = '2018-03-01' db_path = r'C:\Users\c\Desktop\test\db_test' if os_path.exists(db_path): rmtree(db_path) rmtree(r'C:\Users\c\Documents\DatabaseMetadata') db = Database(db_path) for factor in test_factors: factor_data = fmanager.query(factor['name'], (start_time, end_time)) result = db.insert(factor_data, factor['rel_path'], factor['store format'], factor['dtype']) print(result) unstruct_data = list(range(1000)) print( db.insert(unstruct_data, 'unstruct_data.test', (DataClassification.UNSTRUCTURED, ))) db.print_collections() pprint(db.find_data('beta')) print(db.find_collection('quote')) db.remove_data('ind.zx', test_factors[0]['store format'])