def _inner(universe, start_time, end_time):
    '''
    SSE composite index close series, broadcast across the whole stock
    universe (one identical column per stock code).
    '''
    query_sql = sql_tmplate.replace('code', index_code).replace('price_type', price_type)
    raw = fdgetter.get_db_data(query_sql, cols=('close', 'time'), start_time=start_time,
                               end_time=end_time, add_stockcode=False)
    codes = sorted(universe)
    # Repeat the single index close column once per stock code.
    broadcast = np.repeat([raw.close.values], len(codes), axis=0).T
    out = pd.DataFrame(broadcast, index=raw.time, columns=codes)
    checkdata_completeness(out, start_time, end_time)
    check_indexorder(out)
    return out
def get_adjfactor(universe, start_time, end_time):
    '''
    Ex-dividend adjustment factor per stock, forward-filled onto the
    trading-day calendar. Stocks without any dividend record (mostly new
    listings) default to a factor of 1.
    '''
    sql = '''
        SELECT A.ExDiviDate, A.RatioAdjustingFactor, M.SecuCode
        FROM QT_AdjustingFactor A, SecuMain M
        WHERE
            A.InnerCode = M.InnerCode AND
            M.secuMarket in (83, 90) AND
            M.SECUCATEGORY = 1
        ORDER BY M.SecuCode ASC, A.ExDiviDate ASC
        '''
    raw = fdgetter.get_db_data(sql, cols=('time', 'data', 'code'), add_stockcode=False)
    raw['code'] = raw.code.apply(datatoolkits.add_suffix)
    tds = dateshandle.get_tds(start_time, end_time)
    # Forward-fill each stock's factor over the trading days.
    mapped = raw.groupby('code').apply(datatoolkits.map_data, days=tds, fromNowOn=True,
                                       fillna={'code': lambda x: x.code.iloc[0],
                                               'data': lambda x: 1})
    mapped = mapped.reset_index(drop=True)
    panel = mapped.pivot_table('data', index='time', columns='code')
    # New listings usually have no dividend record yet -> factor 1.
    panel = panel.loc[:, sorted(universe)].fillna(1)
    assert check_indexorder(panel), 'Error, data order is mixed!'
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel
def get_tradeable(universe, start_time, end_time):
    '''
    Get the trading status of each stock.

    A day is treated as untradeable when volume is zero or the high price
    equals the low price. Returns 1 for a normal trading day, 0 for an
    untradeable day, and NaN for days before the stock was listed.
    '''
    sql = '''
        SELECT S.TradingDay, S.TurnoverVolume, S.HighPrice, S.LowPrice, M.Secucode
        FROM QT_DailyQuote S, SecuMain M
        WHERE
            S.InnerCode = M.InnerCode AND
            M.SecuMarket in (83, 90) AND
            S.TradingDay <= CAST(\'{end_time}\' as datetime) AND
            S.TradingDay >= CAST(\'{start_time}\' as datetime) AND
            M.SecuCategory = 1
        ORDER BY S.TradingDay ASC, M.Secucode ASC
        '''
    data = fdgetter.get_db_data(sql, cols=('time', 'vol', 'high', 'low', 'code'),
                                start_time=start_time, end_time=end_time,
                                add_stockcode=False)
    data['code'] = data.code.apply(datatoolkits.add_suffix)
    # Order matters here: first mark every positive-volume day with 1, then
    # zero out the ones whose high equals low. Zero-volume days keep their
    # original 0.
    data.loc[data.vol > 0, 'vol'] = 1
    data.loc[(data.vol > 0) & (data.high == data.low), 'vol'] = 0
    data = data.pivot_table('vol', index='time', columns='code')
    data = data.loc[:, sorted(universe)]
    # Consistency fix: every sibling loader verifies chronological index
    # order; this one was missing the check.
    assert check_indexorder(data), 'Error, data order is mixed!'
    assert checkdata_completeness(data, start_time, end_time), "Error, data missed!"
    return data
def get_ctargetprice(universe, start_time, end_time):
    '''
    Consensus target price per stock, pivoted to a time-by-code panel.
    '''
    sql = '''
        SELECT TARGET_PRICE, CON_DATE, STOCK_CODE
        FROM CON_FORECAST_SCHEDULE
        WHERE
            CON_DATE >= CAST(\'{start_time}\' AS datetime) AND
            CON_DATE <= CAST(\'{end_time}\' AS datetime)
        '''
    raw = fdgetter.get_db_data(sql, start_time=start_time, end_time=end_time,
                               cols=('data', 'time', 'code'), db=fdgetter.zyyx,
                               add_stockcode=False)
    raw['code'] = raw.code.apply(datatoolkits.add_suffix)
    panel = raw.pivot_table('data', index='time', columns='code')
    panel = panel.loc[:, sorted(universe)]
    assert check_indexorder(panel), 'Error, data order is mixed!'
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel
def get_lnfloatmktv(universe, start_time, end_time):
    '''
    Natural log of the free-float market value.
    '''
    float_mktv = query('FLOAT_MKTVALUE', (start_time, end_time))
    result = np.log(float_mktv).loc[:, sorted(universe)]
    assert checkdata_completeness(result, start_time, end_time), "Error, data missed!"
    return result
def get_lntotalmktv(universe, start_time, end_time):
    '''
    Natural log of the total market value.
    '''
    total_mktv = query('TOTAL_MKTVALUE', (start_time, end_time))
    result = np.log(total_mktv).loc[:, sorted(universe)]
    assert checkdata_completeness(result, start_time, end_time), "Error, data missed!"
    return result
def get_torate(universe, start_time, end_time):
    '''
    Daily turnover rate, computed as trading volume / floating shares.
    '''
    vol = query('TO_VOLUME', (start_time, end_time))
    shares = query('FLOAT_SHARE', (start_time, end_time))
    torate = (vol / shares).loc[:, sorted(universe)]
    assert checkdata_completeness(torate, start_time, end_time), "Error, data missed!"
    return torate
def inner(universe, start_time, end_time):
    '''
    Adjusted price panel: raw price (`price_type` from the enclosing scope)
    multiplied element-wise by the adjustment factor.
    '''
    adj_factor = query('ADJ_FACTOR', (start_time, end_time))
    price = query(price_type, (start_time, end_time))
    # Both panels must cover the same set of days for the product to be valid.
    assert len(adj_factor) == len(price), "Error, basic data length does not match! " + \
        "adj_factor data = {sd_len}, while close data = {cd_len}".format(
            sd_len=len(adj_factor), cd_len=len(price))
    adj_price = (adj_factor * price).loc[:, sorted(universe)]
    assert checkdata_completeness(adj_price, start_time, end_time), "Error, data missed!"
    return adj_price
def _inner(universe, start_time, end_time):
    '''
    Market-value-style panel: share count (`share_factor_name` from the
    enclosing scope) multiplied element-wise by the close price.
    '''
    shares = query(share_factor_name, (start_time, end_time))
    closes = query('CLOSE', (start_time, end_time))
    # Both panels must cover the same set of days for the product to be valid.
    assert len(shares) == len(closes), "Error, basic data length does not match! " + \
        "share data = {sd_len}, while close data = {cd_len}".format(
            sd_len=len(shares), cd_len=len(closes))
    result = (shares * closes).loc[:, sorted(universe)]
    assert checkdata_completeness(result, start_time, end_time), "Error, data missed!"
    return result
def get_dailyret(universe, start_time, end_time):
    '''
    Daily returns computed from the backward-adjusted close price.
    '''
    # Pull 30 extra calendar days so the first requested day has a
    # previous close available for its return.
    lookback_start = pd.to_datetime(start_time) - pd.Timedelta('30 day')
    adj_close = query('ADJ_CLOSE', (lookback_start, end_time))
    rets = adj_close.pct_change()
    in_range = rets.index >= start_time
    rets = rets.loc[in_range, sorted(universe)]
    assert checkdata_completeness(rets, start_time, end_time), "Error, data missed!"
    return rets
def _inner(universe, start_time, end_time):
    '''
    Load a quote column (`transed_sql`/`cols` from the enclosing scope) and
    pivot it into a time-by-code panel. When a previous-close column is
    present, zero prices are replaced by it.
    '''
    data = fdgetter.get_db_data(transed_sql, cols=cols, start_time=start_time,
                                end_time=end_time, add_stockcode=False)
    data['code'] = data.code.apply(datatoolkits.add_suffix)
    if len(data.columns) == 4:
        # Zero price presumably marks a non-traded day — fall back to the
        # previous close, then drop the helper column.
        data.loc[data.data == 0, 'data'] = data['prevclose']
        data.drop('prevclose', inplace=True, axis=1)
    panel = data.pivot_table('data', index='time', columns='code')
    panel = panel.loc[:, sorted(universe)]
    assert check_indexorder(panel), 'Error, data order is mixed!'
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel
def get_avgtorate(universe, start_time, end_time):
    '''
    Mean turnover rate over the trailing 20 trading days.
    '''
    start_time = pd.to_datetime(start_time)
    # 60 calendar days of warm-up data to cover the 20-trading-day window.
    warmup_start = start_time - pd.Timedelta('60 day')
    daily = query('TO_RATE', (warmup_start, end_time))
    avg = daily.rolling(20, min_periods=20).mean().dropna(how='all')
    in_range = (avg.index >= start_time) & (avg.index <= end_time)
    avg = avg.loc[in_range, sorted(universe)]
    # The very first update starts at START_TIME and necessarily misses
    # history, so the completeness check is skipped in that case.
    if start_time > pd.to_datetime(START_TIME):
        assert checkdata_completeness(avg, start_time, end_time), "Error, data missed!"
    return avg
def inner(universe, start_time, end_time):
    '''
    Log of the cumulated turnover rate over the trailing `offset` trading
    days, divided by `month_num` (`offset` and `month_num` are closure
    variables from the enclosing factory — not visible in this block).
    '''
    start_time = pd.to_datetime(start_time)
    # NOTE(review): 10e-6 equals 1e-5 — confirm whether 1e-6 was intended.
    threshold = 10e-6
    # Shift back `offset` trading days so the first rolling window is full.
    new_start = dateshandle.tds_shift(start_time, offset)
    daily_torate = query('TO_RATE', (new_start, end_time))
    data = daily_torate.rolling(offset, min_periods=offset).sum().dropna(how='all')
    # Near-zero cumulated turnover is treated as missing to avoid log(0)
    # / log of noise below.
    data[data <= threshold] = np.NaN
    data = data / month_num
    data = np.log(data)
    # Trim the warm-up period back to the requested window.
    mask = (data.index >= start_time) & (data.index <= end_time)
    data = data.loc[mask, sorted(universe)]
    # The first update starts from START_TIME and necessarily misses data,
    # so the completeness check is skipped for it.
    if start_time > pd.to_datetime(START_TIME):
        assert checkdata_completeness(data, start_time, end_time), "Error, data missed!"
    return data
def _inner(universe, start_time, end_time):
    '''
    Load a report item shifted back `n` years (`sql` and `n` come from the
    enclosing factory) and align it onto trading days.
    '''
    # Lookback of 366 * n + 180 calendar days — presumably n report years
    # plus publication lag; inherited from the original implementation.
    lookback = 366 * n + 180
    new_start = pd.to_datetime(start_time) - pd.Timedelta('%d day' % lookback)
    raw = fdgetter.get_db_data(sql, start_time=new_start, end_time=end_time,
                               cols=('update_time', 'rpt_date', 'code', 'data'),
                               add_stockcode=False)
    raw['code'] = raw.code.apply(datatoolkits.add_suffix)
    panel = _handle_dbdata(raw, start_time, end_time, fdmutils.cal_yr,
                           col_name='data', offset=n)
    panel = panel.loc[:, sorted(universe)]
    assert check_indexorder(panel), 'Error, data order is mixed!'
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel
def _inner(universe, start_time, end_time):
    '''
    Map share-count records (`transed_sql` from the enclosing scope) onto
    the trading-day calendar, one column per stock.
    '''
    records = fdgetter.get_db_data(transed_sql, cols=('data', 'time', 'code'),
                                   add_stockcode=False)
    records['code'] = records.code.apply(datatoolkits.add_suffix)
    # Assumes records with the same timestamp carry the same share count,
    # so dropping duplicates is safe.
    records = records.drop_duplicates().sort_values(['code', 'time'])
    tds = dateshandle.get_tds(start_time, end_time)
    mapped = records.groupby('code').apply(datatoolkits.map_data, days=tds,
                                           fromNowOn=True,
                                           fillna={'code': lambda x: x.code.iloc[0]})
    mapped = mapped.reset_index(drop=True)
    panel = mapped.pivot_table('data', index='time', columns='code')
    panel = panel.loc[:, sorted(universe)]
    assert check_indexorder(panel), 'Error, data order is mixed!'
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel
def get_zxind(universe, start_time, end_time):
    '''
    CITIC (Zhongxin) industry classification, mapped onto every trading day.
    Days/stocks without a classification get the NaS placeholder.
    '''
    raw = fdgetter.get_db_data(fdgetter.BASIC_SQLs['ZX_IND'],
                               cols=('ind', 'time', 'code'), add_stockcode=False)
    raw['ind'] = raw.ind.map(ZXIND_TRANS_DICT)
    raw['code'] = raw.code.apply(datatoolkits.add_suffix)
    tds = dateshandle.get_tds(start_time, end_time)
    mapped = raw.groupby('code').apply(datatoolkits.map_data, days=tds,
                                       fillna={'ind': lambda x: NaS,
                                               'code': lambda x: x.code.iloc[0]})
    mapped = mapped.reset_index(drop=True).set_index(['time', 'code'])
    panel = mapped.loc[:, 'ind'].unstack()
    panel = panel.loc[:, sorted(universe)].dropna(axis=0, how='all').fillna(NaS)
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel
def _inner(universe, start_time, end_time):
    '''
    Index-membership flags (1 = constituent of `index_code`), forward-filled
    onto the trading-day calendar.
    '''
    start_time = pd.to_datetime(start_time)
    # Pad 60 days so membership established before start_time can be
    # forward-filled into the requested window.
    padded_start = start_time - pd.Timedelta('60 days')
    flags = fdgetter.get_db_data(fdgetter.BASIC_SQLs['INDEX_CONSTITUENTS'],
                                 code=index_code, cols=('code', 'time'),
                                 add_stockcode=False, start_time=padded_start,
                                 end_time=end_time)
    flags = flags.assign(is_constituent=1)
    flags['code'] = flags.code.apply(datatoolkits.add_suffix)
    flags = flags.pivot_table('is_constituent', index='time', columns='code')
    tds = dateshandle.get_tds(padded_start, end_time)
    flags = flags.loc[:, sorted(universe)].reset_index()
    flags = datatoolkits.map_data(flags, days=tds, fromNowOn=True)
    flags = flags.set_index('time').dropna(axis=0, how='all')
    # Trim the padding back to the requested window.
    flags = flags.loc[(flags.index >= start_time) & (flags.index <= end_time)]
    # The first update (from START_TIME) legitimately lacks history, so
    # the completeness check is skipped for it.
    if start_time > pd.to_datetime(START_TIME):
        assert checkdata_completeness(flags, start_time, end_time), "Error, data missed!"
    return flags
def get_nonlinearmktv(universe, start_time, end_time):
    '''
    Non-linear size factor: the cube of the log total market value.

    Note: no orthogonalization against the size factor is performed here —
    the raw cubed value is returned. (An earlier OLS-residual variant was
    left commented out in the source; it has been removed as dead code.)
    '''
    lnmktv = query('LN_TMKV', (start_time, end_time))
    data = np.power(lnmktv, 3)
    data = data.loc[:, sorted(universe)]
    assert checkdata_completeness(data, start_time, end_time), "Error, data missed!"
    return data
def get_st(universe, start_time, end_time):
    '''
    Special-treatment (ST) status of each stock per trading day.

    Tag values (higher = riskier): 0 = normal, 1 = ST, 2 = *ST,
    3 = delisting transition period, 4 = high-risk warning, 5 = PT.
    Stocks never flagged default to 0.
    '''
    st_data = fdgetter.get_db_data(fdgetter.BASIC_SQLs['ST_TAG'],
                                   cols=('time', 'abbr', 'ms', 'code'),
                                   add_stockcode=False)

    def _assign_st(row):
        # Map the announcement type to a numeric risk tag.
        map_dict = {'ST': 1, 'PT': 5, '撤销ST': 0, '*ST': 2, '撤消*ST并实行ST': 1,
                    '从ST变为*ST': 2, '撤销*ST': 0, '退市整理期': 3, '高风险警示': 4}
        if row.ms in map_dict:
            return map_dict[row.ms]
        else:
            assert row.ms == '撤销PT', "Error, cannot handle tag '{tag}'".format(tag=row.ms)
            # BUGFIX: test the more specific '*ST' before 'ST'. Since 'ST'
            # is a substring of '*ST', the original order made the *ST
            # branch unreachable.
            if '*ST' in row.abbr:
                return 2
            elif 'ST' in row.abbr:
                return 1
            else:
                return 0

    st_data = st_data.assign(tag=lambda x: x.apply(_assign_st, axis=1))
    st_data['code'] = st_data.code.apply(datatoolkits.add_suffix)
    # When a (code, time) pair is duplicated, keep only the largest tag,
    # i.e. the riskiest status.
    st_data = st_data.sort_values(['code', 'time', 'tag'])
    by_codentime = st_data.groupby(['code', 'time'])
    st_data = by_codentime.apply(lambda x: x.tail(1).iloc[0])
    st_data = st_data.reset_index(drop=True)
    tds = dateshandle.get_tds(start_time, end_time)
    # Forward-fill each stock's status onto the trading-day calendar.
    by_code = st_data.groupby('code')
    st_data = by_code.apply(datatoolkits.map_data, days=tds,
                            fillna={'code': lambda x: x.code.iloc[0]},
                            fromNowOn=True)
    st_data = st_data.reset_index(drop=True)
    st_data = st_data.pivot_table('tag', index='time', columns='code').dropna(axis=0,
                                                                              how='all')
    # Stocks with no ST record at all are considered normal (0).
    st_data = st_data.loc[:, sorted(universe)].fillna(0)
    assert checkdata_completeness(st_data, start_time, end_time), "Error, data missed!"
    return st_data
def _inner(universe, start_time, end_time):
    '''
    Trailing-twelve-month (TTM) value of a report item (`sql_template` is a
    closure variable from the enclosing factory), aligned onto trading days.
    '''
    # 540 calendar days of lookback so enough quarterly reports are
    # available for the first TTM value.
    new_start = pd.to_datetime(start_time) - pd.Timedelta('540 day')
    data = fdgetter.get_db_data(sql_template, start_time=new_start, end_time=end_time,
                                cols=('update_time', 'rpt_date', 'code', 'data'),
                                add_stockcode=False)
    data['code'] = data.code.apply(datatoolkits.add_suffix)
    # Per stock, expand reports into the series observable at each point
    # in time (adds the 'obs_time' column used below).
    by_code = data.groupby('code')
    data = by_code.apply(fdmutils.get_observabel_data).reset_index(drop=True)
    # Compute the TTM aggregate for each (code, observation time) pair.
    by_cno = data.groupby(['code', 'obs_time'])
    data = by_cno.apply(fdmutils.cal_ttm, col_name='data').reset_index()\
        .rename(columns={'obs_time': 'time'})
    tds = dateshandle.get_tds(start_time, end_time)
    # Map the observation-time series onto the trading-day calendar.
    data = data.groupby('code').apply(datatoolkits.map_data, days=tds,
                                      fillna={'code': lambda x: x.code.iloc[0]})
    data = data.reset_index(drop=True)
    data = data.pivot_table('data', index='time', columns='code')
    data = data.loc[:, sorted(universe)]
    assert check_indexorder(data), 'Error, data order is mixed!'
    assert checkdata_completeness(data, start_time, end_time), "Error, data missed!"
    return data
def get_liststatus(universe, start_time, end_time):
    '''
    Listing status of each stock, forward-filled onto trading days.

    Output codes: 1 = listed, 2 = suspended, 3 = delisting transition
    period, 4 = delisted.
    '''
    raw = fdgetter.get_db_data(fdgetter.BASIC_SQLs['LIST_STATUS'],
                               cols=('code', 'list_status', 'time'),
                               add_stockcode=False)
    # Source encoding: 1 listed, 2 suspended, 3 relisted, 4 delisted,
    # 6 delisting transition period; 1 and 3 both collapse to "listed".
    ls_map = {1: 1, 2: 2, 3: 1, 4: 4, 6: 3}
    raw['list_status'] = raw.list_status.map(ls_map)
    raw['code'] = raw.code.apply(datatoolkits.add_suffix)
    tds = dateshandle.get_tds(start_time, end_time)
    mapped = raw.groupby('code').apply(datatoolkits.map_data, days=tds,
                                       fillna={'code': lambda x: x.code.iloc[0]},
                                       fromNowOn=True)
    mapped = mapped.reset_index(drop=True)
    panel = mapped.pivot_table('list_status', index='time',
                               columns='code').dropna(axis=0, how='all')
    panel = panel.loc[:, sorted(universe)]
    assert checkdata_completeness(panel, start_time, end_time), "Error, data missed!"
    return panel