async def eval_fetch_timestamps(self, entity, ref_record, http_session):
    """Decide the next fetch window for a daily-level recorder.

    Looks up the latest already-stored timestamp (from ``ref_record``, falling
    back to ``entity.timestamp``) and derives the ``(start, end, size,
    timestamps)`` tuple the fetch loop expects.

    :param entity:      entity being recorded; its ``timestamp`` is the fallback start
    :param ref_record:  DataFrame of previously stored rows (may be invalid/empty)
    :param http_session: unused here, kept for the recorder interface
    :return: (start_timestamp, end_timestamp, size, timestamps)
    """
    latest_timestamp = None
    try:
        if pd_valid(ref_record):
            time_field = self.get_evaluated_time_field()
            latest_timestamp = ref_record[time_field].max(axis=0)
    except Exception as e:
        # lazy %-style args instead of eager .format(); consistent with the
        # intraday sibling implementation
        self.logger.warning("get ref_record failed with error: %s", e)

    if not latest_timestamp:
        latest_timestamp = entity.timestamp

    # nothing stored at all: fall back to the configured defaults
    if not latest_timestamp:
        return self.start_timestamp, self.end_timestamp, self.default_size, None

    # already up to date as of today -> nothing to fetch
    if latest_timestamp.date() >= now_pd_timestamp(self.region).date():
        return latest_timestamp, None, 0, None

    # already at (or past) the most recent known trade day -> nothing to fetch
    if len(self.trade_day) > 0 and \
            latest_timestamp.date() >= self.trade_day[0].date():
        return latest_timestamp, None, 0, None

    if self.start_timestamp:
        latest_timestamp = max(latest_timestamp, self.start_timestamp)

    if self.end_timestamp and latest_timestamp > self.end_timestamp:
        size = 0
    else:
        size = self.default_size

    return latest_timestamp, self.end_timestamp, size, None
async def download_sh_etf_component(self, df: pd.DataFrame, http_session, db_session):
    """Download Shanghai (SSE) ETF constituent stocks and persist them.

    ETF_CLASS => 1. single-market ETF  2. cross-market ETF  3. cross-border ETF
                 5. bond ETF           6. gold ETF
    Only classes '1' and '2' are fetched here.

    :param df: ETF list data
    :return: None
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                'isPagination=false&type={}&etfClass={}'

    etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
    etf_df = self.populate_sh_etf_type(etf_df, http_session)

    for _, etf in etf_df.iterrows():
        url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])

        text = sync_get(http_session, url, headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
        if text is None:
            continue

        try:
            response_dict = demjson.decode(text)
        except Exception as e:
            self.logger.error(
                f'decode {url} failed with text: {text}, error as: {e}')
            continue

        response_df = pd.DataFrame(response_dict.get('result', []))
        # robustness fix: an empty 'result' payload yields a column-less frame
        # and the [['instrumentId', 'instrumentName']] selection below would
        # raise KeyError — skip such responses instead
        if response_df.empty:
            continue

        etf_code = etf['FUND_ID']
        etf_id = f'etf_sh_{etf_code}'

        response_df = response_df[['instrumentId', 'instrumentName']].copy()
        response_df.rename(columns={
            'instrumentId': 'stock_code',
            'instrumentName': 'stock_name'
        }, inplace=True)

        response_df['entity_id'] = etf_id
        response_df['entity_type'] = EntityType.ETF.value
        response_df['exchange'] = ChnExchange.SSE.value
        response_df['code'] = etf_code
        response_df['name'] = etf['FUND_NAME']
        response_df['timestamp'] = now_pd_timestamp(self.region)

        response_df['stock_id'] = response_df['stock_code'].apply(
            lambda code: china_stock_code_to_id(code))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{etf_id}_{x}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=response_df)

        self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')
async def on_finish(self):
    """Back-fill ``rights_raising_fund`` on DividendFinancing rows after a run.

    For every entity whose DividendFinancing row (up to last year) still has a
    NULL rights_raising_fund, sum the recorder's own data_schema rows for that
    entity's year and write the sum back, publishing per-item progress to
    kafka. Finally commits (or rolls back) the DividendFinancing session.
    """
    last_year = str(now_pd_timestamp(self.region).year)
    codes = [item.code for item in self.entities]

    db_session = get_db_session(self.region, self.provider, DividendFinancing)

    # rows still missing rights_raising_fund, limited to this run's codes
    need_filleds, column_names = DividendFinancing.query_data(
        region=self.region,
        provider=self.provider,
        db_session=db_session,
        codes=codes,
        end_timestamp=last_year,
        filters=[DividendFinancing.rights_raising_fund.is_(None)])

    if need_filleds:
        desc = self.data_schema.__name__ + ": update relevant table"

        # separate session: queries run against self.data_schema, not
        # DividendFinancing
        db_session_1 = get_db_session(self.region, self.provider, self.data_schema)
        kafka_producer = connect_kafka_producer(findy_config['kafka'])

        for item in need_filleds:
            # SUM of rights_raising_fund over the item's calendar year
            result, column_names = self.data_schema.query_data(
                region=self.region,
                provider=self.provider,
                db_session=db_session_1,
                entity_id=item.entity_id,
                start_timestamp=item.timestamp,
                end_timestamp=f"{item.timestamp.year}-12-31",
                func=func.sum(self.data_schema.rights_raising_fund))

            # NOTE(review): presumably query_data returns the scalar SUM when
            # func= is given — only written back when it is a plain number
            if isinstance(result, (int, float)):
                item.rights_raising_fund = result

            # per-item progress message for the kafka progress topic
            data = {
                "task": 'rig',
                "total": len(need_filleds),
                "desc": desc,
                "leave": True,
                "update": 1
            }
            publish_message(kafka_producer, progress_topic,
                            bytes(progress_key, encoding='utf-8'),
                            bytes(json.dumps(data), encoding='utf-8'))

    # flush the in-place mutations of need_filleds; roll back on failure
    try:
        db_session.commit()
    except Exception as e:
        self.logger.error(f'{self.__class__.__name__}, error: {e}')
        db_session.rollback()

    await super().on_finish()
def format(self, entity, df):
    """Attach the standard entity/meta columns to ``df`` and return it."""
    meta = {
        'timestamp': now_pd_timestamp(Region.CHN),
        'entity_id': entity.id,
        'provider': self.provider.value,
        'code': entity.code,
        'name': entity.name,
        'level': self.level.value,
        'exchange': entity.exchange,
        'entity_type': EntityType.Block.value,
    }
    # same insertion order as assigning the columns one by one
    for column, value in meta.items():
        df[column] = value

    df['id'] = self.generate_domain_id(entity, df)
    return df
async def fetch_cni_index_component(self, df: pd.DataFrame, http_session, db_session):
    """Fetch CNI (国证) index constituent stocks and persist them.

    :param df: index list with at least 'code' and 'name' columns
    """
    query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

    for _, index in df.iterrows():
        index_code = index['code']
        url = query_url.format(index_code)

        content = sync_get(http_session, url, return_type='content')
        if content is None:
            continue

        response_df = pd.read_excel(io.BytesIO(content), dtype='str')

        index_id = f'index_cn_{index_code}'

        # the sheet uses either header depending on the index
        try:
            response_df = response_df[['样本股代码']]
        except KeyError:
            response_df = response_df[['证券代码']]

        # BUG FIX: rename the single constituent-code column BEFORE adding the
        # meta columns — assigning a 1-element column list to a frame that
        # already has the extra columns raises "Length mismatch"
        response_df.columns = ['stock_code']

        response_df['entity_id'] = index_id
        response_df['entity_type'] = EntityType.Index.value
        response_df['exchange'] = 'cn'
        response_df['code'] = index_code
        response_df['name'] = index['name']
        response_df['timestamp'] = now_pd_timestamp(Region.CHN)

        response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{index_id}_{x}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=response_df)

        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        await self.sleep()
async def persist(self, entity, http_session, db_session, para):
    """Persist one fetched batch and decide whether this entity is finished.

    :param para: tuple (ref_record, df_record) — reference rows already in db
                 and the freshly fetched rows to save
    :return: (is_finished, elapsed_seconds, saved_counts)
    """
    start_point = time.time()

    (ref_record, df_record) = para

    saved_counts = 0
    is_finished = False

    if pd_valid(df_record):
        assert 'id' in df_record.columns
        saved_counts = await df_to_db(region=self.region,
                                      provider=self.provider,
                                      data_schema=self.data_schema,
                                      db_session=db_session,
                                      df=df_record,
                                      ref_df=ref_record,
                                      fix_duplicate_way=self.fix_duplicate_way)
        # everything in the batch was already stored -> no new data upstream
        if saved_counts == 0:
            is_finished = True

    # could not get more data
    else:
        # not realtime
        if not self.real_time:
            is_finished = True

        # realtime and to the close time
        elif (self.close_hour is not None) and (self.close_minute is not None):
            now = now_pd_timestamp(self.region)
            if now.hour >= self.close_hour:
                # 5-minute grace period past the configured close minute
                if now.minute - self.close_minute >= 5:
                    self.logger.info(f'{entity.id} now is the close time: {now}')
                    is_finished = True

    # k-data recorders fetch one full window per call, so one pass completes
    # the entity
    if isinstance(self, KDataRecorder):
        is_finished = True

    # NOTE(review): assumes df_record is always a DataFrame with a 'timestamp'
    # column even when pd_valid() is False (empty frame -> NaT bounds) —
    # confirm against the caller
    start_timestamp = to_time_str(df_record['timestamp'].min(axis=0))
    end_timestamp = to_time_str(df_record['timestamp'].max(axis=0))

    self.result = [saved_counts, start_timestamp, end_timestamp]

    return is_finished, time.time() - start_point, saved_counts
async def fetch_csi_index_component(self, df: pd.DataFrame, http_session, db_session):
    """Fetch SSE / CSI (上证、中证) index constituent stocks and persist them."""
    base_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'

    for _, index in df.iterrows():
        index_code = index['code']
        url = base_url.format(index_code)

        content = sync_get(http_session, url, return_type='content')
        if content is None:
            self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误')
            continue

        constituents = pd.read_excel(io.BytesIO(content))
        constituents = constituents[['成分券代码Constituent Code', '成分券名称Constituent Name']].rename(
            columns={'成分券代码Constituent Code': 'stock_code',
                     '成分券名称Constituent Name': 'stock_name'})

        index_id = f'index_cn_{index_code}'

        meta = {
            'entity_id': index_id,
            'entity_type': EntityType.Index.value,
            'exchange': 'cn',
            'code': index_code,
            'name': index['name'],
            'timestamp': now_pd_timestamp(self.region),
        }
        for column, value in meta.items():
            constituents[column] = value

        constituents['stock_id'] = constituents['stock_code'].apply(
            lambda code: china_stock_code_to_id(str(code)))
        constituents['id'] = constituents['stock_id'].apply(
            lambda stock_id: f'{index_id}_{stock_id}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=constituents)

        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        await self.sleep()
async def record(self, entity, http_session, db_session, para):
    """Collect trade days from the exchange calendar since the last stored one."""
    begin = time.time()

    calendar = mcal.get_calendar(entity.upper())

    # most recent trade day already stored, if any
    trade_day, column_names = StockTradeDay.query_data(
        region=self.region,
        provider=self.provider,
        db_session=db_session,
        func=func.max(StockTradeDay.timestamp))

    start = to_time_str(trade_day) if trade_day else "1980-01-01"

    schedule = calendar.schedule(start_date=start,
                                 end_date=to_time_str(now_pd_timestamp(Region.US)))
    dates = schedule.index.to_list()
    self.logger.info(f'add dates:{dates}')

    if dates:
        df = pd.DataFrame(dates, columns=['timestamp'])
        return False, time.time() - begin, self.format(entity, df)

    return True, time.time() - begin, None
async def fetch_szse_index_component(self, df: pd.DataFrame, http_session, db_session):
    """Fetch SZSE (深证) index constituent stocks and persist them."""
    base_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

    for _, index in df.iterrows():
        index_code = index['code']
        url = base_url.format(index_code)

        content = sync_get(http_session, url, return_type='content')
        if content is None:
            continue

        constituents = pd.read_excel(io.BytesIO(content), dtype='str')

        index_id = f'index_cn_{index_code}'

        meta = {
            'entity_id': index_id,
            'entity_type': EntityType.Index.value,
            'exchange': 'cn',
            'code': index_code,
            'name': index['name'],
            'timestamp': now_pd_timestamp(self.region),
        }
        for column, value in meta.items():
            constituents[column] = value

        constituents.rename(columns={'证券代码': 'stock_code',
                                     '证券简称': 'stock_name'}, inplace=True)

        constituents['stock_id'] = constituents['stock_code'].apply(
            lambda code: china_stock_code_to_id(str(code)))
        constituents['id'] = constituents['stock_id'].apply(
            lambda stock_id: f'{index_id}_{stock_id}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=constituents)

        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        await self.sleep()
async def eval_fetch_timestamps(self, entity, ref_record, http_session):
    """Work out the next (start, end, size, timestamps) fetch window.

    Starts from the day after the latest stored timestamp and ends at the
    most recent completed trade day (or now when no trade days are known).
    """
    latest_timestamp = None
    try:
        if pd_valid(ref_record):
            time_field = self.get_evaluated_time_field()
            latest_timestamp = ref_record[time_field].max(axis=0)
    except Exception as e:
        self.logger.warning(f'get ref record failed with error: {e}')

    if not latest_timestamp:
        latest_timestamp = entity.timestamp

    # no history at all -> fall back to configured defaults
    if not latest_timestamp:
        return self.start_timestamp, self.end_timestamp, self.default_size, None

    now = now_pd_timestamp(self.region)
    session_close = now.replace(hour=18, minute=0, second=0)

    if len(self.trade_day) > 0:
        # before 18:00 on the current trade day the day is incomplete,
        # so stop at the previous trade day instead
        use_previous = is_same_date(self.trade_day[0], now) and now < session_close
        end = self.trade_day[1] if use_previous else self.trade_day[0]
    else:
        end = now

    window_start = next_dates(latest_timestamp)
    if self.start_timestamp:
        start = max(self.start_timestamp, window_start)
    else:
        start = window_start

    if start >= end:
        return start, end, 0, None

    size = self.eval_size_of_timestamp(start_timestamp=start,
                                       end_timestamp=end,
                                       level=self.level,
                                       one_day_trading_minutes=self.one_day_trading_minutes)

    return start, end, size, None
async def download_sz_etf_component(self, df: pd.DataFrame, http_session, db_session):
    """Download Shenzhen ETF constituents via each ETF's underlying index page."""
    base_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

    self.parse_sz_etf_underlying_index(df)

    for _, etf in df.iterrows():
        underlying_index = etf['拟合指数']
        etf_code = etf['证券代码']

        # ETFs without a domestic underlying index cannot be resolved here
        if len(underlying_index) == 0:
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
            continue

        url = base_url.format(underlying_index)
        text = sync_get(http_session, url, encoding='gbk', return_type='text')
        if text is None:
            continue

        try:
            tables = pd.read_html(text, header=1)
        except ValueError as error:
            self.logger.error(
                f'HTML parse error: {error}, response: {text}')
            continue

        # the constituent table is the 4th table on the page
        if len(tables) < 4:
            continue

        constituents = tables[3].copy()
        constituents = constituents.dropna(axis=1, how='any')
        # zero-pad codes back to 6 digits
        constituents['品种代码'] = constituents['品种代码'].apply(
            lambda x: f'{x:06d}')

        etf_id = f'etf_sz_{etf_code}'

        constituents = constituents[['品种代码', '品种名称']].copy()
        constituents.rename(columns={'品种代码': 'stock_code',
                                     '品种名称': 'stock_name'}, inplace=True)

        meta = {
            'entity_id': etf_id,
            'entity_type': EntityType.ETF.value,
            'exchange': ChnExchange.SZSE.value,
            'code': etf_code,
            'name': etf['证券简称'],
            'timestamp': now_pd_timestamp(self.region),
        }
        for column, value in meta.items():
            constituents[column] = value

        constituents['stock_id'] = constituents['stock_code'].apply(china_stock_code_to_id)
        constituents['id'] = constituents['stock_id'].apply(
            lambda stock_id: f'{etf_id}_{stock_id}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=constituents)

        self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')
def __init__(self,
             region: Region,
             data_schema: Type[Mixin],
             entity_schema: Type[EntityMixin],
             provider: Provider = None,
             entity_ids: List[str] = None,
             exchanges: List[str] = None,
             codes: List[str] = None,
             the_timestamp: Union[str, pd.Timestamp] = None,
             start_timestamp: Union[str, pd.Timestamp] = None,
             end_timestamp: Union[str, pd.Timestamp] = None,
             columns: List = None,
             filters: List = None,
             order: object = None,
             limit: int = None,
             level: IntervalLevel = None,
             category_field: str = 'entity_id',
             time_field: str = 'timestamp',
             computing_window: int = None) -> None:
    """Set up the query window, entity filters and column selection.

    :param the_timestamp: when given, both start and end collapse to this instant
    :param codes: list of codes, or a string ('a,b' or '["a","b"]') that is parsed
    :param columns: schema columns, or their string names
    """
    self.logger = logging.getLogger(self.__class__.__name__)

    self.data_schema = data_schema
    self.entity_schema = entity_schema
    self.region = region
    self.provider = provider

    # default the query window to "up to now"
    if end_timestamp is None:
        end_timestamp = now_pd_timestamp(self.region)

    self.the_timestamp = the_timestamp
    if the_timestamp:
        self.start_timestamp = the_timestamp
        self.end_timestamp = the_timestamp
    else:
        self.start_timestamp = start_timestamp
        self.end_timestamp = end_timestamp

    self.start_timestamp = to_pd_timestamp(self.start_timestamp)
    self.end_timestamp = to_pd_timestamp(self.end_timestamp)

    self.exchanges = exchanges

    if codes:
        # accept either a JSON-style list string or a comma-separated string
        if isinstance(codes, str):
            codes = codes.replace(' ', '')
            if codes.startswith('[') and codes.endswith(']'):
                codes = json.loads(codes)
            else:
                codes = codes.split(',')

    self.codes = codes
    self.entity_ids = entity_ids
    self.filters = filters
    self.order = order
    self.limit = limit

    if level:
        self.level = IntervalLevel(level)
    else:
        self.level = level

    self.category_field = category_field
    self.time_field = time_field
    self.computing_window = computing_window

    # getattr instead of eval(): same attribute lookup without executing
    # arbitrary strings
    self.category_col = getattr(self.data_schema, self.category_field)
    self.time_col = getattr(self.data_schema, self.time_field)

    self.columns = columns

    # we store the data in a multiple index(category_column,timestamp) Dataframe
    if self.columns:
        # support column names given as plain strings
        if isinstance(columns[0], str):
            self.columns = [getattr(data_schema, col) for col in columns]

        # always add category_column and time_field for normalizing
        self.columns = list(
            set(self.columns) | {self.category_col, self.time_col})

    self.data_listeners: List[DataListener] = []

    self.data_df: pd.DataFrame = None
def init_timestamps(self, entity, http_session):
    """Business days from the entity's first timestamp up to now (CHN)."""
    business_days = pd.date_range(start=entity.timestamp,
                                  end=now_pd_timestamp(Region.CHN),
                                  freq='B')
    return business_days.tolist()
class NormalEntityMixin(EntityMixin):
    # the record created time in db.
    # BUG FIX: pass a callable so SQLAlchemy evaluates it per INSERT; the
    # original `default=now_pd_timestamp(...)` was evaluated once at import
    # time, stamping every row with the module-load timestamp
    created_timestamp = Column(DateTime, default=lambda: now_pd_timestamp(Region.CHN))
    # the record updated time in db, some recorder would check it for whether need to refresh
    updated_timestamp = Column(DateTime)