示例#1
0
    async def eval_fetch_timestamps(self, entity, ref_record, http_session):
        """
        Work out the time window and batch size for the next fetch.

        The newest known timestamp is the maximum of the evaluated time field
        in ``ref_record``; when that is unavailable it falls back to
        ``entity.timestamp``.

        :param entity: object whose ``timestamp`` attribute is the fallback
        :param ref_record: previously stored records, checked with pd_valid
        :param http_session: not used by this method
        :return: tuple ``(start, end, size, None)``; ``size`` is 0 when there
                 is nothing left to fetch
        """
        newest = None
        try:
            if pd_valid(ref_record):
                newest = ref_record[self.get_evaluated_time_field()].max(axis=0)
        except Exception as err:
            # best effort: a broken ref_record only costs us the shortcut
            self.logger.warning("get ref_record failed with error: {}".format(err))

        newest = newest or entity.timestamp

        # nothing known at all: fetch the configured default window
        if not newest:
            return self.start_timestamp, self.end_timestamp, self.default_size, None

        newest_date = newest.date()
        nothing_to_fetch = (newest, None, 0, None)

        # already at (or past) today's date
        if newest_date >= now_pd_timestamp(self.region).date():
            return nothing_to_fetch

        # already at (or past) the first entry of self.trade_day
        if len(self.trade_day) > 0 and newest_date >= self.trade_day[0].date():
            return nothing_to_fetch

        if self.start_timestamp:
            newest = max(newest, self.start_timestamp)

        beyond_end = self.end_timestamp and newest > self.end_timestamp
        size = 0 if beyond_end else self.default_size

        return newest, self.end_timestamp, size, None
    async def download_sh_etf_component(self, df: pd.DataFrame, http_session,
                                        db_session):
        """
        Download the constituent stocks of the ETFs in ``df`` and store them.

        ETF_CLASS => 1. single-market ETF 2. cross-market ETF 3. cross-border ETF
                        5. bond ETF 6. gold ETF

        Only rows whose ETF_CLASS is '1' or '2' are queried. An ETF is skipped
        when the request returns nothing, when the response cannot be decoded,
        or when the decoded response carries no constituents.

        :param df: ETF list data
        :param http_session: HTTP session passed through to the request helpers
        :param db_session: database session passed through to df_to_db
        :return: None
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                    'isPagination=false&type={}&etfClass={}'

        etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
        etf_df = self.populate_sh_etf_type(etf_df, http_session)

        for _, etf in etf_df.iterrows():
            url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
            text = sync_get(http_session,
                            url,
                            headers=DEFAULT_SH_ETF_LIST_HEADER,
                            return_type='text')
            if text is None:
                continue
            try:
                response_dict = demjson.decode(text)
            except Exception as e:
                self.logger.error(
                    f'decode {url} failed with text: {text}, error as: {e}')
                continue

            response_df = pd.DataFrame(response_dict.get('result', []))
            etf_code = etf['FUND_ID']

            # An empty result produces a frame without any columns, so the
            # column selection below would raise KeyError and abort the
            # remaining ETFs. Skip this ETF instead.
            if response_df.empty:
                self.logger.warning(
                    f'{etf["FUND_NAME"]} - {etf_code} returned no constituents, skipped')
                continue

            etf_id = f'etf_sh_{etf_code}'
            response_df = response_df[['instrumentId',
                                       'instrumentName']].copy()
            response_df.rename(columns={
                'instrumentId': 'stock_code',
                'instrumentName': 'stock_name'
            },
                               inplace=True)

            response_df['entity_id'] = etf_id
            response_df['entity_type'] = EntityType.ETF.value
            response_df['exchange'] = ChnExchange.SSE.value
            response_df['code'] = etf_code
            response_df['name'] = etf['FUND_NAME']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            response_df['stock_id'] = response_df['stock_code'].apply(
                china_stock_code_to_id)
            # row id = ETF id joined with the constituent's stock id
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{etf_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')
    async def on_finish(self):
        """
        Back-fill ``rights_raising_fund`` on DividendFinancing rows, then
        delegate to the parent ``on_finish``.

        Rows of DividendFinancing for this recorder's entity codes whose
        ``rights_raising_fund`` is NULL are looked up; for each one the sum of
        ``self.data_schema.rights_raising_fund`` between the row's timestamp
        and the end of that year is queried and written back when it is a
        number. One progress message is published to Kafka per row.
        """
        year_text = str(now_pd_timestamp(self.region).year)
        entity_codes = [entity.code for entity in self.entities]

        db_session = get_db_session(self.region, self.provider,
                                    DividendFinancing)

        pending_rows, _ = DividendFinancing.query_data(
            region=self.region,
            provider=self.provider,
            db_session=db_session,
            codes=entity_codes,
            end_timestamp=year_text,
            filters=[DividendFinancing.rights_raising_fund.is_(None)])

        if pending_rows:
            desc = self.data_schema.__name__ + ": update relevant table"

            schema_session = get_db_session(self.region, self.provider,
                                            self.data_schema)
            kafka_producer = connect_kafka_producer(findy_config['kafka'])

            for row in pending_rows:
                year_end = f"{row.timestamp.year}-12-31"
                total_raised, _ = self.data_schema.query_data(
                    region=self.region,
                    provider=self.provider,
                    db_session=schema_session,
                    entity_id=row.entity_id,
                    start_timestamp=row.timestamp,
                    end_timestamp=year_end,
                    func=func.sum(self.data_schema.rights_raising_fund))

                # only a numeric aggregate is written back to the row
                if isinstance(total_raised, (int, float)):
                    row.rights_raising_fund = total_raised

                # one progress tick per processed row
                progress = dict(task='rig',
                                total=len(pending_rows),
                                desc=desc,
                                leave=True,
                                update=1)
                message_key = bytes(progress_key, encoding='utf-8')
                message_value = bytes(json.dumps(progress), encoding='utf-8')
                publish_message(kafka_producer, progress_topic,
                                message_key, message_value)

            try:
                db_session.commit()
            except Exception as err:
                self.logger.error(f'{self.__class__.__name__}, error: {err}')
                db_session.rollback()

        await super().on_finish()
    def format(self, entity, df):
        """
        Fill ``df`` in place with entity/recorder metadata columns and an id.

        :param entity: object providing ``id``, ``code``, ``name``, ``exchange``
        :param df: frame to annotate; it is modified and also returned
        :return: the same ``df`` object
        """
        constant_columns = (
            ('timestamp', now_pd_timestamp(Region.CHN)),
            ('entity_id', entity.id),
            ('provider', self.provider.value),
            ('code', entity.code),
            ('name', entity.name),
            ('level', self.level.value),
            ('exchange', entity.exchange),
            ('entity_type', EntityType.Block.value),
        )
        for column, value in constant_columns:
            df[column] = value

        # assigned last: generate_domain_id is handed the frame with the
        # columns above already present
        df['id'] = self.generate_domain_id(entity, df)
        return df
    async def fetch_cni_index_component(self, df: pd.DataFrame, http_session, db_session):
        """
        Fetch the constituent stocks of CNI (国证) indices and store them.

        For every row of ``df`` the constituent spreadsheet is downloaded,
        reduced to its stock-code column, annotated with the index metadata
        and written with df_to_db. Indices whose download returns nothing are
        skipped.

        :param df: index list; the ``code`` and ``name`` fields of each row are read
        :param http_session: HTTP session passed through to sync_get
        :param db_session: database session passed through to df_to_db
        :return: None
        """
        query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)
            content = sync_get(http_session, url, return_type='content')
            if content is None:
                continue

            response_df = pd.read_excel(io.BytesIO(content), dtype='str')

            index_id = f'index_cn_{index_code}'

            # the code column appears under one of two headers; take a copy so
            # the column assignments below do not write into a slice
            try:
                response_df = response_df[['样本股代码']].copy()
            except KeyError:
                response_df = response_df[['证券代码']].copy()

            # Rename while the code column is the only column. Assigning a
            # one-element column list after the metadata columns were added
            # raises a length-mismatch ValueError.
            response_df.columns = ['stock_code']

            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(Region.CHN)

            response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
            # row id = index id joined with the constituent's stock id
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            await self.sleep()
示例#6
0
    async def persist(self, entity, http_session, db_session, para):
        """
        Store a fetched batch and decide whether recording of ``entity`` is done.

        ``self.result`` is set to ``[saved_counts, start, end]`` where start and
        end are the formatted minimum and maximum of the batch's ``timestamp``
        column, or ``None`` when there was no batch to store.

        :param entity: entity being recorded; only its ``id`` is used, for logging
        :param http_session: not used by this method
        :param db_session: database session passed through to df_to_db
        :param para: pair ``(ref_record, df_record)``
        :return: tuple ``(is_finished, elapsed_seconds, saved_counts)``
        """
        start_point = time.time()

        (ref_record, df_record) = para
        saved_counts = 0
        is_finished = False
        # stay None when there is no batch; reading df_record['timestamp'] on
        # a missing or empty frame would raise
        start_timestamp = None
        end_timestamp = None

        if pd_valid(df_record):
            assert 'id' in df_record.columns
            saved_counts = await df_to_db(region=self.region,
                                          provider=self.provider,
                                          data_schema=self.data_schema,
                                          db_session=db_session,
                                          df=df_record,
                                          ref_df=ref_record,
                                          fix_duplicate_way=self.fix_duplicate_way)
            if saved_counts == 0:
                is_finished = True

            start_timestamp = to_time_str(df_record['timestamp'].min(axis=0))
            end_timestamp = to_time_str(df_record['timestamp'].max(axis=0))

        # could not get more data
        else:
            # not realtime
            if not self.real_time:
                is_finished = True

            # realtime and to the close time
            elif (self.close_hour is not None) and (self.close_minute is not None):
                now = now_pd_timestamp(self.region)
                # Compare in whole minutes so that a later hour with a smaller
                # minute value (e.g. 16:02 against a 15:00 close) still counts
                # as past the close.
                minutes_past_close = ((now.hour - self.close_hour) * 60
                                      + (now.minute - self.close_minute))
                if minutes_past_close >= 5:
                    self.logger.info(f'{entity.id} now is the close time: {now}')
                    is_finished = True

        # KDataRecorder instances always finish after one pass
        if isinstance(self, KDataRecorder):
            is_finished = True

        self.result = [saved_counts, start_timestamp, end_timestamp]

        return is_finished, time.time() - start_point, saved_counts
    async def fetch_csi_index_component(self, df: pd.DataFrame, http_session, db_session):
        """
        Fetch the constituent stocks of SSE / CSI (上证、中证) indices and store them.

        For every row of ``df`` the constituent spreadsheet is downloaded,
        reduced to its code and name columns, annotated with the index
        metadata and written with df_to_db. A failed download is logged and
        the index is skipped.

        :param df: index list; the ``code`` and ``name`` fields of each row are read
        :param http_session: HTTP session passed through to sync_get
        :param db_session: database session passed through to df_to_db
        :return: None
        """
        query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'

        for _, index in df.iterrows():
            index_code = index['code']
            url = query_url.format(index_code)
            content = sync_get(http_session, url, return_type='content')
            if content is None:
                self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误')
                continue

            # Read every cell as text, like the other index fetchers do, so
            # numeric code cells keep their leading zeros in stock_code.
            response_df = pd.read_excel(io.BytesIO(content), dtype='str')

            response_df = response_df[['成分券代码Constituent Code', '成分券名称Constituent Name']].rename(
                columns={'成分券代码Constituent Code': 'stock_code',
                         '成分券名称Constituent Name': 'stock_name'})

            index_id = f'index_cn_{index_code}'
            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
            # row id = index id joined with the constituent's stock id
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            await self.sleep()
示例#8
0
    async def record(self, entity, http_session, db_session, para):
        """
        Collect calendar dates from the latest stored trade day up to now.

        :param entity: calendar name; upper-cased and passed to mcal.get_calendar
        :param http_session: not used by this method
        :param db_session: database session used to query StockTradeDay
        :param para: not used by this method
        :return: ``(False, elapsed_seconds, formatted_frame)`` when dates were
                 found, otherwise ``(True, elapsed_seconds, None)``
        """
        started_at = time.time()

        calendar = mcal.get_calendar(entity.upper())

        # newest timestamp already stored in StockTradeDay
        latest_stored, _ = StockTradeDay.query_data(
            region=self.region,
            provider=self.provider,
            db_session=db_session,
            func=func.max(StockTradeDay.timestamp))

        if latest_stored:
            start = to_time_str(latest_stored)
        else:
            start = "1980-01-01"

        until = to_time_str(now_pd_timestamp(Region.US))
        schedule = calendar.schedule(start_date=start, end_date=until)
        dates = schedule.index.to_list()
        self.logger.info(f'add dates:{dates}')

        if not dates:
            return True, time.time() - started_at, None

        frame = pd.DataFrame(dates, columns=['timestamp'])
        return False, time.time() - started_at, self.format(entity, frame)
    async def fetch_szse_index_component(self, df: pd.DataFrame, http_session, db_session):
        """
        Fetch the constituent stocks of SZSE (深证) indices and store them.

        :param df: index list; the ``code`` and ``name`` fields of each row are read
        :param http_session: HTTP session passed through to sync_get
        :param db_session: database session passed through to df_to_db
        :return: None
        """
        url_template = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

        for _, index in df.iterrows():
            index_code = index['code']

            content = sync_get(http_session,
                               url_template.format(index_code),
                               return_type='content')
            if content is None:
                continue

            response_df = pd.read_excel(io.BytesIO(content), dtype='str')

            index_id = f'index_cn_{index_code}'

            # metadata shared by every constituent row of this index
            shared_columns = (
                ('entity_id', index_id),
                ('entity_type', EntityType.Index.value),
                ('exchange', 'cn'),
                ('code', index_code),
                ('name', index['name']),
                ('timestamp', now_pd_timestamp(self.region)),
            )
            for column, value in shared_columns:
                response_df[column] = value

            response_df = response_df.rename(
                columns={'证券代码': 'stock_code', '证券简称': 'stock_name'})

            stock_ids = response_df['stock_code'].apply(
                lambda code: china_stock_code_to_id(str(code)))
            response_df['stock_id'] = stock_ids

            # row id = index id joined with the constituent's stock id
            response_df['id'] = response_df['stock_id'].apply(
                lambda stock_id: f'{index_id}_{stock_id}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            await self.sleep()
示例#10
0
    async def eval_fetch_timestamps(self, entity, ref_record, http_session):
        """
        Work out the start, end and size of the next fetch.

        The newest known timestamp is the maximum of the evaluated time field
        in ``ref_record``, falling back to ``entity.timestamp``. The fetch
        starts at ``next_dates`` of that timestamp (not earlier than
        ``self.start_timestamp`` when set) and ends at an entry of
        ``self.trade_day``, or now when ``self.trade_day`` is empty.

        :param entity: object whose ``timestamp`` attribute is the fallback
        :param ref_record: previously stored records, checked with pd_valid
        :param http_session: not used by this method
        :return: tuple ``(start, end, size, None)``; ``size`` is 0 when
                 ``start`` is not before ``end``
        """
        newest = None
        try:
            if pd_valid(ref_record):
                newest = ref_record[self.get_evaluated_time_field()].max(axis=0)
        except Exception as err:
            self.logger.warning(f'get ref record failed with error: {err}')

        newest = newest or entity.timestamp

        # nothing known at all: fetch the configured default window
        if not newest:
            return self.start_timestamp, self.end_timestamp, self.default_size, None

        now = now_pd_timestamp(self.region)
        cutoff = now.replace(hour=18, minute=0, second=0)

        if len(self.trade_day) > 0:
            # before the 18:00 cutoff on a day that matches the first entry,
            # the second entry of self.trade_day is used as the end instead
            first_is_today = is_same_date(self.trade_day[0], now) and now < cutoff
            end = self.trade_day[1 if first_is_today else 0]
        else:
            end = now

        start = next_dates(newest)
        if self.start_timestamp:
            start = max(self.start_timestamp, start)

        if start >= end:
            return start, end, 0, None

        size = self.eval_size_of_timestamp(
            start_timestamp=start,
            end_timestamp=end,
            level=self.level,
            one_day_trading_minutes=self.one_day_trading_minutes)

        return start, end, size, None
    async def download_sz_etf_component(self, df: pd.DataFrame, http_session,
                                        db_session):
        """
        Download the constituent stocks of the ETFs in ``df`` and store them.

        The constituents are read from the fourth HTML table of the page for
        each ETF's underlying index. ETFs with an empty underlying index, a
        failed request, an unparsable page or fewer than four tables are
        skipped.

        :param df: ETF list data; it is first passed to
                   ``parse_sz_etf_underlying_index``
        :param http_session: HTTP session passed through to sync_get
        :param db_session: database session passed through to df_to_db
        :return: None
        """
        url_template = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

        self.parse_sz_etf_underlying_index(df)
        for _, etf in df.iterrows():
            underlying_index = etf['拟合指数']
            etf_code = etf['证券代码']

            if len(underlying_index) == 0:
                self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
                continue

            text = sync_get(http_session,
                            url_template.format(underlying_index),
                            encoding='gbk',
                            return_type='text')
            if text is None:
                continue

            try:
                tables = pd.read_html(text, header=1)
            except ValueError as error:
                self.logger.error(
                    f'HTML parse error: {error}, response: {text}')
                continue

            # the constituent list is the fourth table on the page
            if len(tables) < 4:
                continue

            constituents = tables[3].copy().dropna(axis=1, how='any')
            # zero-pad the codes to six digits
            constituents['品种代码'] = constituents['品种代码'].apply(
                lambda raw_code: f'{raw_code:06d}')

            etf_id = f'etf_sz_{etf_code}'
            response_df = constituents[['品种代码', '品种名称']].rename(
                columns={'品种代码': 'stock_code', '品种名称': 'stock_name'})

            response_df['entity_id'] = etf_id
            response_df['entity_type'] = EntityType.ETF.value
            response_df['exchange'] = ChnExchange.SZSE.value
            response_df['code'] = etf_code
            response_df['name'] = etf['证券简称']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            stock_ids = response_df['stock_code'].apply(
                lambda stock_code: china_stock_code_to_id(stock_code))
            response_df['stock_id'] = stock_ids
            # row id = ETF id joined with the constituent's stock id
            response_df['id'] = response_df['stock_id'].apply(
                lambda stock_id: f'{etf_id}_{stock_id}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')
示例#12
0
    def __init__(self,
                 region: Region,
                 data_schema: Type[Mixin],
                 entity_schema: Type[EntityMixin],
                 provider: Provider = None,
                 entity_ids: List[str] = None,
                 exchanges: List[str] = None,
                 codes: List[str] = None,
                 the_timestamp: Union[str, pd.Timestamp] = None,
                 start_timestamp: Union[str, pd.Timestamp] = None,
                 end_timestamp: Union[str, pd.Timestamp] = None,
                 columns: List = None,
                 filters: List = None,
                 order: object = None,
                 limit: int = None,
                 level: IntervalLevel = None,
                 category_field: str = 'entity_id',
                 time_field: str = 'timestamp',
                 computing_window: int = None) -> None:
        """
        Store the query parameters and resolve the schema columns to read.

        :param region: region stored on the instance; also used for the default end time
        :param data_schema: schema whose attributes are resolved as columns
        :param entity_schema: entity schema, stored as given
        :param provider: data provider, stored as given
        :param entity_ids: entity ids, stored as given
        :param exchanges: exchanges, stored as given
        :param codes: list of codes, or a string that is either a JSON list
                      (``'["a","b"]'``) or comma separated (``'a,b'``)
        :param the_timestamp: when set, used as both start and end timestamp
        :param start_timestamp: start of the range, used when ``the_timestamp`` is unset
        :param end_timestamp: end of the range; defaults to now in ``region``
        :param columns: schema columns or their attribute names; when given,
                        the category and time columns are always added
        :param filters: stored as given
        :param order: stored as given
        :param limit: stored as given
        :param level: converted to IntervalLevel when truthy
        :param category_field: attribute name of the category column on ``data_schema``
        :param time_field: attribute name of the time column on ``data_schema``
        :param computing_window: stored as given
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.data_schema = data_schema
        self.entity_schema = entity_schema

        self.region = region
        self.provider = provider

        if end_timestamp is None:
            end_timestamp = now_pd_timestamp(self.region)

        self.the_timestamp = the_timestamp
        if the_timestamp:
            self.start_timestamp = the_timestamp
            self.end_timestamp = the_timestamp
        else:
            self.start_timestamp = start_timestamp
            self.end_timestamp = end_timestamp

        self.start_timestamp = to_pd_timestamp(self.start_timestamp)
        self.end_timestamp = to_pd_timestamp(self.end_timestamp)

        self.exchanges = exchanges

        # a non-empty string is parsed into a list; an empty string or a list
        # is kept as given
        if codes and isinstance(codes, str):
            codes = codes.replace(' ', '')
            if codes.startswith('[') and codes.endswith(']'):
                codes = json.loads(codes)
            else:
                codes = codes.split(',')

        self.codes = codes
        self.entity_ids = entity_ids
        self.filters = filters
        self.order = order
        self.limit = limit

        self.level = IntervalLevel(level) if level else level

        self.category_field = category_field
        self.time_field = time_field
        self.computing_window = computing_window

        # plain attribute lookup; no need to evaluate caller-supplied text as code
        self.category_col = getattr(self.data_schema, self.category_field)
        self.time_col = getattr(self.data_schema, self.time_field)

        self.columns = columns

        # we store the data in a multiple index(category_column,timestamp) Dataframe
        if self.columns:
            # support str: the type of the first element decides for the whole list
            if isinstance(columns[0], str):
                self.columns = [getattr(data_schema, col) for col in columns]

            # always add category_column and time_field for normalizing
            self.columns = list(
                set(self.columns) | {self.category_col, self.time_col})

        self.data_listeners: List[DataListener] = []

        self.data_df: pd.DataFrame = None
示例#13
0
 def init_timestamps(self, entity, http_session):
     """
     List the business-day timestamps from ``entity.timestamp`` up to now.

     :param entity: object whose ``timestamp`` is the start of the range
     :param http_session: not used by this method
     :return: list built from a pandas date range with frequency ``'B'``
     """
     business_days = pd.date_range(start=entity.timestamp,
                                   end=now_pd_timestamp(Region.CHN),
                                   freq='B')
     return business_days.tolist()
示例#14
0
class NormalEntityMixin(EntityMixin):
    """Entity mixin that adds created/updated bookkeeping timestamp columns."""

    # the record created time in db
    # A callable is passed as the default so the time is taken when a row is
    # inserted; a plain value would be computed once, when the class is defined.
    created_timestamp = Column(DateTime, default=lambda: now_pd_timestamp(Region.CHN))
    # the record updated time in db, some recorder would check it for whether need to refresh
    updated_timestamp = Column(DateTime)