Example #1
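Both snippets are excerpts from a larger project. A minimal set of the imports they rely on is sketched below as an assumption; PyMysqlPoolBase, utils, logger and the connection constants (SPIDER_MYSQL_*, R_SPIDER_MYSQL_*, JUY_*) come from project modules/settings that are not shown here.

import datetime
import json
import time

import requests
from retrying import retry  # provides the @retry decorator used in Example #2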
class SourceAnnouncementBaseV1(object):
    """将两个爬虫表合并生成公告基础表"""
    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.his_table = 'juchao_ant'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000

        self._r_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )

    def daily_update(self, deadline: datetime.datetime = None):
        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(load_sql)

        items = []
        datas = self._r_spider_conn.select_all(load_sql)
        logger.info(len(datas))
        for data in datas:
            data = utils.process_secucode(data)
            if data:
                items.append(data)
        self._spider_conn._batch_save(items, self.merge_table_name, [
            'SecuCode', 'SecuAbbr', 'PubDatetime1', 'InsertDatetime1', 'Title1'
        ])

        update_sql = '''
        select A.* from juchao_kuaixun A, juchao_ant B where B.UPDATETIMEJZ > '{}' and A.code = B.SecuCode \
and A.link = B.AntDoc and A.type = '公告';  '''.format(deadline)
        datas = self._r_spider_conn.select_all(update_sql)
        for data in datas:
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item,
                                           'PDFLink', data.get("link"))
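A minimal invocation sketch for Example #1 (an assumption, not part of the original code): the class is intended as a daily job, with the deadline defaulting to one day ago when omitted.

if __name__ == '__main__':
    merger = SourceAnnouncementBaseV1()
    merger.daily_update()                                   # default: changes since yesterday
    # merger.daily_update(datetime.datetime(2020, 8, 1))    # or re-run from an explicit point in time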
Example #2
class JuchaoLiveNewsSpider(object):
    """巨潮快讯爬虫"""
    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """
        从聚源数据库获取证券代码对应的中文简称
        :param code: 无前缀的证券代码
        :return:
        """
        sql = f'''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.select_one(sql, code).get("SecuAbbr")
        return name

    def get_redit_link(self, link):
        """获取最终重定向后的网址"""
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[len(redit_list) - 1].headers["location"]
        except IndexError:
            redit_link = link
        except:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        # Retry when the request times out or gets blocked/banned.
        logger.debug(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def parse(self, url: str, day_string: str):
        items = []
        resp = self.my_get(url)
        if resp and resp.status_code == 200:
            text = resp.text
            datas = json.loads(text)
            if not datas:
                print("{} 无公告数据".format(url))
            else:
                for data in datas:
                    item = {}
                    # Fields to save: publish time, type, title, link, security code and name.
                    announcementTime = time.localtime(int(data.get("announcementTime") / 1000))
                    announcementTime = time.strftime("%Y-%m-%d %H:%M:%S", announcementTime)
                    item['pub_date'] = announcementTime
                    item['type'] = data.get("type")
                    item['title'] = data.get("title")
                    page_path = data.get("pagePath")
                    if page_path is None:
                        link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                            data.get("textId"), day_string)
                    else:
                        try:
                            link = self.get_redit_link(page_path)
                        except Exception:
                            link = None
                    if not link:
                        continue
                    item['link'] = link
                    code = data.get("code")
                    if code:
                        item['code'] = code  # security code without exchange prefix
                        item['name'] = self.get_secu_abbr(code)
                        items.append(item)
        return items

    def start(self, start_dt: datetime.datetime = None, end_dt: datetime.datetime = None):
        """启动入口"""
        # 使用传入的结束时间否则置为今天
        if end_dt is not None:
            end_dt = datetime.datetime.combine(end_dt, datetime.time.min)
        else:
            end_dt = datetime.datetime.combine(datetime.datetime.now(), datetime.time.min)

        # Use the given start date, otherwise default to 3 days before the end date.
        if start_dt is not None:
            start_dt = datetime.datetime.combine(start_dt, datetime.time.min)
        else:
            start_dt = end_dt - datetime.timedelta(days=3)

        # Walk backwards day by day from the end date to the start date (both inclusive).
        this_day = end_dt
        while this_day >= start_dt:
            day_string = this_day.strftime("%Y-%m-%d")
            items = self.parse(self.api_url.format(day_string), day_string)
            logger.info(f"{day_string}, {len(items)}")
            self._spider_conn._batch_save(items, self.table_name, self.fields)
            this_day -= datetime.timedelta(days=1)
            time.sleep(3)
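A minimal run sketch (assumed usage, not in the original source): with no arguments the spider re-crawls the last three days up to today.

if __name__ == '__main__':
    spider = JuchaoLiveNewsSpider()
    spider.start()                                   # default window: the past 3 days
    # spider.start(datetime.datetime(2020, 8, 1),
    #              datetime.datetime(2020, 8, 13))   # or backfill an explicit date range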
class AnnSecuRef(object):
    """Maintain the announcement-to-security reference table an_announcement_secu_ref."""
    def __init__(self):
        self.read_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )

        self.spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )

        self.juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )

        self.batch_count = 10000
        self.codes_notfound = set()

    def fetch_bas_secumain(self) -> dict:
        # Load all of bas_secumain into memory as a secu_code -> id map.
        bas_sql = '''select id, secu_code from bas_secumain where secu_category in (1, 41); '''
        bas_datas = self.read_spider_conn.select_all(bas_sql)
        bas_map = {}
        for data in bas_datas:
            bas_map[data['secu_code']] = data['id']
        return bas_map

    def update_rename_codes(self, bas_map: dict) -> dict:
        # Extend the map with the old codes of securities that have been renamed.
        sql_fetch_rename_secucodes = '''
                   SELECT A.SecuCode 'old_code', B.SecuCode 'new_code',
                   A.*
                   FROM gildata.LC_CodeChange A 
                   JOIN gildata.SecuMain B ON A.InnerCode = B.InnerCode AND B.SecuMarket IN (83,90) AND B.SecuCategory = 1
                   WHERE A.CodeDefine = 1 
                   AND A.SecuCode <> B.SecuCode
                   ORDER BY A.StopDate; 
               '''
        rename_datas = self.juyuan_conn.select_all(sql_fetch_rename_secucodes)
        for rename_data in rename_datas:
            new_code = rename_data['new_code']
            secu_id = bas_map.get(new_code)
            if secu_id is None:
                # The current code is missing from bas_secumain; skip this rename record.
                continue
            old_code = rename_data['old_code']
            bas_map.update({old_code: secu_id})
        return bas_map

    def get_max_spyid(self):
        # Get the current maximum id in the spider table.
        sql_get_maxid = '''select max(id) from spy_announcement_data; '''
        max_id = self.read_spider_conn.select_one(sql_get_maxid).get("max(id)")
        return max_id

    def process_spy_datas(self, origin_ann_datas: list, bas_map: dict):
        # Transform spider rows into reference-table rows and insert them in batches.
        items = []
        for origin_data in origin_ann_datas:
            item = dict()
            item['ann_id'] = origin_data['id']
            secu_codes = origin_data['secu_codes']    # For SH/SZ announcements there is currently only one code per row.
            secu_id = bas_map.get(secu_codes)
            if secu_id is None:
                logger.warning(secu_codes)
                self.codes_notfound.add(secu_codes)
                continue
            item['secu_id'] = secu_id
            item['create_by'] = 0
            item['update_by'] = 0
            # print(item)
            items.append(item)
            if len(items) > self.batch_count:
                count = self.spider_conn._batch_save(items, 'an_announcement_secu_ref', ['secu_id', ])
                logger.debug(count)
                items = []
        self.spider_conn._batch_save(items, 'an_announcement_secu_ref', ['secu_id', ])
        logger.warning(f'Unmatched security codes: {self.codes_notfound}')

    def diff_ids(self):
        '''Compare the ids of the two tables and process only the difference.
        Run this when the two tables are out of sync.
        '''

        sql = '''select id from spy_announcement_data ; '''
        ret = self.read_spider_conn.select_all(sql)
        spy_ids = set([r.get("id") for r in ret])
        print(len(spy_ids))

        sql = '''select ann_id from an_announcement_secu_ref; '''
        ret = self.spider_conn.select_all(sql)
        ref_ids = set([r.get("ann_id") for r in ret])
        print(len(ref_ids))

        diff_ids = tuple(spy_ids - ref_ids)
        logger.info(len(diff_ids))
        if not diff_ids:
            # Nothing to backfill.
            return

        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)

        # Build the IN clause manually so a single id does not leave a trailing comma in the SQL.
        id_list = ','.join(str(i) for i in diff_ids)
        sql = f'''select * from spy_announcement_data where id in ({id_list})'''
        spy_datas = self.read_spider_conn.select_all(sql)
        logger.info(len(spy_datas))
        self.process_spy_datas(spy_datas, bas_map)

    def init_load(self):
        # Initial full load, to be run when the reference table is still empty.
        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        max_id = self.get_max_spyid()
        logger.info(f'Now max(id) of spy_announcement_data is {max_id}')
        for i in range(int(max_id / self.batch_count) + 1):
            _start = i * self.batch_count
            _end = i * self.batch_count + self.batch_count
            sql = f'''select id, secu_codes from spy_announcement_data where id >= {_start} and id < {_end}; '''
            origin_ann_datas = self.read_spider_conn.select_all(sql)
            self.process_spy_datas(origin_ann_datas, bas_map)

    def daily_sync(self, start_dt: datetime.datetime = None):
        # Daily sync of the reference table for new rows in the raw spider table.
        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        if start_dt is None:
            # Default the start time to the beginning of the current day.
            start_dt = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min)
        sql = f'''select id, secu_codes from spy_announcement_data where update_time >= '{start_dt}'; '''
        origin_ann_datas = self.read_spider_conn.select_all(sql)
        self.process_spy_datas(origin_ann_datas, bas_map)
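A minimal driver sketch (assumed usage): init_load is meant for a one-off full import into an empty reference table, daily_sync for the routine incremental run, and diff_ids as a repair pass when the two tables drift apart.

if __name__ == '__main__':
    ref = AnnSecuRef()
    # ref.init_load()     # one-off full load when an_announcement_secu_ref is empty
    ref.daily_sync()      # routine sync: rows updated since today 00:00
    # ref.diff_ids()      # repair pass when the two tables have drifted apart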