def fetch_A_secucode_innercode_map(juyuan_conn: sql_base.Connection):
    """
    Given any SecuCode -- listed or delisted, before or after a code change --
    find its unique InnerCode here.
    """
    sql = """
    (
        select A.SecuCode, A.InnerCode, A.SecuAbbr, A.ListedDate, B.ChangeDate '退市日期'
        from gildata.SecuMain A
        left join gildata.LC_ListStatus B
          on A.InnerCode = B.InnerCode and B.ChangeType = 4
        where A.SecuMarket in (83, 90)
          and A.SecuCategory in (1, 41)
          and A.ListedSector in (1, 2, 6, 7)
          and A.ListedDate <= CURDATE()
    )
    UNION
    (
        SELECT A.SecuCode, A.InnerCode, B.SecuAbbr, A.BeginDate '启用日期', A.StopDate '停用日期'
        FROM gildata.LC_CodeChange A
        JOIN gildata.SecuMain B
          ON A.InnerCode = B.InnerCode
         AND B.SecuMarket IN (83, 90)
         AND B.SecuCategory in (1, 41)
        WHERE A.CodeDefine = 1
          AND A.SecuCode <> B.SecuCode
    );
    """
    exec_res = juyuan_conn.query(sql)
    map1 = {}
    for one in exec_res:
        map1[one['SecuCode']] = one['InnerCode']

    sql = f'''select InnerCode, SecuMarket from secumain where InnerCode in {tuple(map1.values())};'''
    res = juyuan_conn.query(sql)
    map2 = {}
    for r in res:
        map2[r['InnerCode']] = r['SecuMarket']

    # Prefix each bare code with its exchange: 83 -> SH, 90 -> SZ.
    info = {}
    for k, v in map1.items():
        if map2[v] == 83:
            info["SH" + k] = v
        elif map2[v] == 90:
            info["SZ" + k] = v
        else:
            raise ValueError(f"unexpected SecuMarket {map2[v]} for InnerCode {v}")
    return info
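
# Usage sketch (not part of the original module; assumes the JUY_* settings
# imported elsewhere in this file): build the prefixed-code -> InnerCode map
# once, then resolve codes in memory.
def _demo_secucode_lookup():
    juyuan_conn = Connection(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        database=JUY_DB,
    )
    code_map = fetch_A_secucode_innercode_map(juyuan_conn)
    print(len(code_map))
    print(code_map.get("SZ000001"))  # InnerCode, or None if the code is unknown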
def send_crawl_overview():
    conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    # table name -> the datetime field used to count the daily increment
    spiders_info = {
        'juchao_ant2': 'AntTime',
        'juchao_ant_finance': 'AntTime',
        'juchao_kuaixun': 'pub_date',
        'announcement_base2': 'PubDatetime1',

        # TODO: v1-era tables
        'juchao_ant': 'AntTime',
        'announcement_base': 'PubDatetime1',
    }
    msg = ''
    for table_name, field in spiders_info.items():
        count = get_inc_num(conn, table_name, field)
        msg += f'{table_name} added {count} rows since yesterday\n'
    print(msg)
    ding_msg(msg)
    return msg
def get_inc_num(conn: sql_base.Connection, table_name: str, field: str):
    query_sql = '''
    select count(id) as inc_count from {} where {} >= date_sub(CURDATE(), interval 1 day);
    '''
    query_sql = query_sql.format(table_name, field)
    inc_count = conn.get(query_sql).get("inc_count")
    return inc_count
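
# Usage sketch: check one table's one-day increment ad hoc, without sending
# the DingTalk summary.
def _demo_single_table_inc():
    conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    print(get_inc_num(conn, 'juchao_kuaixun', 'pub_date'))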
class SourceAnnouncementBaseV1(object):
    """Merge the two spider tables into the announcement base table (v1)."""

    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.his_table = 'juchao_ant'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000
        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )

    def daily_update(self, deadline: datetime.datetime = None):
        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(load_sql)

        items = []
        datas = self._r_spider_conn.query(load_sql)
        logger.info(len(datas))
        for data in datas:
            data = utils.process_secucode(data)
            if data:
                items.append(data)
        self._spider_conn.batch_insert(
            items, self.merge_table_name,
            ['SecuCode', 'SecuAbbr', 'PubDatetime1', 'InsertDatetime1', 'Title1'])

        # Backfill the live (快讯) publish time and title for announcements
        # whose PDF link also appears in the history table.
        update_sql = '''select A.* from juchao_kuaixun A, juchao_ant B \
where B.UPDATETIMEJZ > '{}' and A.code = B.SecuCode and A.link = B.AntDoc \
and A.type = '公告'; '''.format(deadline)
        datas = self._r_spider_conn.query(update_sql)
        for data in datas:
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item,
                                           'PDFLink', data.get("link"))
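
# Usage sketch: run once per day from a scheduler; pass an explicit deadline
# to re-merge a longer window.
def _demo_daily_update_v1():
    worker = SourceAnnouncementBaseV1()
    worker.daily_update()  # default window: rows updated since now - 1 day
    # worker.daily_update(datetime.datetime(2020, 8, 1))  # backfill from a date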
class JuchaoCounter(object):
    def __init__(self):
        self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://www.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )

    @property
    def codes_map(self):
        codes_map = {}
        sql = '''select code, OrgId from juchao_codemap; '''
        res = self._spider_conn.query(sql)
        for r in res:
            codes_map[r.get('OrgId')] = r.get("code")
        return codes_map

    def launch(self, org_id: str):
        # Resume from the given org_id; codes are processed in sorted org_id order.
        codes_map = self.codes_map
        org_id_lst = sorted(list(codes_map.keys()))
        position = org_id_lst.index(org_id)
        print("position", position)
        for org_id in org_id_lst[position:]:
            code = codes_map.get(org_id)
            stock_str = ','.join([code, org_id])
            print(stock_str)
            self.get_count(stock_str)

    @retry(stop_max_attempt_number=3)
    def get_count(self, stock_str: str):
        time.sleep(random.randint(1, 3) / 10)
        post_data = {
            'pageNum': 1,
            'pageSize': 30,
            'column': 'szse',
            'tabName': 'fulltext',
            'plate': '',
            'stock': stock_str,
            'searchkey': '',
            'secid': '',
            'category': '',
            'trade': '',
            'seDate': '',
            'sortName': '',
            'sortType': '',
            'isHLtitle': True,
        }
        resp = requests.post(self.api, headers=self.headers, data=post_data, timeout=3)
        if resp.status_code == 200:
            text = resp.text
            py_datas = json.loads(text)
            total_ann = py_datas.get("totalAnnouncement")
            total_rec = py_datas.get("totalRecordNum")
            record = {
                "SecuCode": stock_str.split(',')[0],
                "TotalAnn": total_ann,
                "TotalRec": total_rec,
            }
            # Append this code's totals; file_name is a module-level output path.
            with open(file_name, "a") as f:
                f.write(json.dumps(record) + '\n')

    def check_count_bydate(self):
        """Count announcements per day (stub; not implemented yet)."""
        sql = '''select AntTime, count(*) from juchao_ant2 group by AntTime ; '''

    def check_count(self):
        # Compare per-code counts in the local table with the totals the site reported.
        sql = '''select SecuCode, count(*) from juchao_ant2 group by SecuCode ; '''
        ret = self._r_spider_conn.query(sql)
        exist_map = {}
        for r in ret:
            exist_map[r.get('SecuCode')] = r.get("count(*)")

        web_map = {}
        with open(file_name, "r") as f:
            lines = f.readlines()
        for line in lines:
            r = json.loads(line)
            web_map[r.get("SecuCode")] = r.get("TotalAnn")

        no_lst = []
        big_delta_lst = []
        small_delta_lst = []
        for code in web_map:
            # _map (module-level) maps a code to its pre-change code, so rows
            # stored under the old code are merged into the comparison.
            _count = 0
            if code in _map:
                _code = _map.get(code)
                _count = exist_map.get(_code) or 0
            exist_num = (exist_map.get(code) or 0) + _count
            web_num = web_map.get(code)
            if not exist_num:
                no_lst.append(code)
            elif exist_num != web_num:
                delta = web_num - exist_num
                if delta > 0:
                    big_delta_lst.append((code, delta))
                else:
                    small_delta_lst.append(code)
        # print(no_lst)
        # print(len(no_lst))
        print(big_delta_lst)
        print(len(big_delta_lst))
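
# Sketch of the audit flow (assumes file_name, the JSON-lines output path used
# by get_count, is configured at module level): harvest the site's per-code
# totals, then diff them against the local table.
def _demo_count_audit():
    counter = JuchaoCounter()
    first_org_id = sorted(counter.codes_map.keys())[0]
    counter.launch(first_org_id)  # walk every code from the first org_id onward
    counter.check_count()         # report codes whose local count lags the site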
class UpdateInnerCode(object):
    def __init__(self):
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.batch_num = 10000

    def max_id(self):
        sql = '''select max(id) as max_id from announcement_base2; '''
        return self._spider_conn.get(sql).get("max_id")

    def load_innercode_map(self):
        sql = '''select secucode, innercode from secumain; '''
        _map = dict()
        ret = self._juyuan_conn.query(sql)
        for r in ret:
            _map[r.get("secucode")] = r.get("innercode")
        return _map

    def get_old_inner_code(self, secucode: str):
        # Look up the pre-change InnerCode for a code that has been replaced.
        sql = '''select InnerCode from LC_CodeChange where secucode = '{}';'''.format(
            secucode)
        r = self._juyuan_conn.get(sql)
        if r:
            inner_code = r.get("InnerCode")
            return inner_code

    def start(self, begin: int, end: int):
        inner_map = self.load_innercode_map()  # secucode here has no exchange prefix
        # max_id = self.max_id()
        # print(max_id)
        base_sql = '''select id, secucode from announcement_base2 \
where id between {} and {} and InnerCode = 0; '''
        for i in range(begin, end):
            sql = base_sql.format(i * self.batch_num, i * self.batch_num + self.batch_num)
            print(sql)
            datas = self._spider_conn.query(sql)
            print(len(datas))
            for data in datas:
                secucode = data.get('secucode')[2:]  # strip the SH/SZ prefix
                inner_code = inner_map.get(secucode)
                if inner_code is None:
                    # Fall back to the pre-change code table; update only if found.
                    inner_code = self.get_old_inner_code(secucode)
                    if inner_code is not None:
                        inner_map.update({secucode: inner_code})
                        self._spider_conn.table_update(
                            'announcement_base2', {'InnerCode': inner_code},
                            "id", data.get("id"))
                else:
                    self._spider_conn.table_update(
                        'announcement_base2', {'InnerCode': inner_code},
                        "id", data.get("id"))
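
# Usage sketch: backfill InnerCode for rows still at the default 0. begin/end
# index 10k-row id batches, not row ids themselves.
def _demo_backfill_innercode():
    updater = UpdateInnerCode()
    last_batch = int(updater.max_id() / updater.batch_num) + 1
    updater.start(0, last_batch)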
class SourceAnnouncementBase(object):
    """Merge the two spider tables into the announcement base table."""

    def __init__(self):
        self.merge_table_name = 'announcement_base2'
        self.his_table = 'juchao_ant2'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000
        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )

    def category_code_map(self):
        return {
            'category_bcgz_szsh': ('补充更正', 19),
            'category_bndbg_szsh': ('半年报', 2),
            'category_cqdq_szsh': ('澄清致歉', 12),
            'category_dshgg_szsh': ('董事会', 18),
            'category_fxts_szsh': ('风险提示', 21),
            'category_gddh_szsh': ('股东大会', 15),
            'category_gqbd_szsh': ('股权变动', 16),
            'category_gqjl_szsh': ('股权激励', 17),
            'category_gszl_szsh': ('公司治理', 24),
            'category_gszq_szsh': ('公司债', 25),
            'category_jj_szsh': ('解禁', 9),
            'category_jshgg_szsh': ('监事会', 14),
            'category_kzzq_szsh': ('可转债', 22),
            'category_ndbg_szsh': ('年报', 4),
            'category_pg_szsh': ('配股', 7),
            'category_qtrz_szsh': ('其他融资', 23),
            'category_qyfpxzcs_szsh': ('权益分派', 11),
            'category_rcjy_szsh': ('日常经营', 10),
            'category_sf_szsh': ('首发', 8),
            'category_sjdbg_szsh': ('三季报', 3),
            'category_tbclts_szsh': ('特别处理和退市', 13),
            'category_tszlq_szsh': ('退市整理期', 20),
            'category_yjdbg_szsh': ('一季报', 1),
            'category_yjygjxz_szsh': ('业绩预告', 5),
            'category_zf_szsh': ('增发', 6),
            'category_zj_szsh': ('中介报告', 26),
            'category_others': ('其他', 27),
        }

    def load_innercode_map(self):
        sql = '''select secucode, innercode from secumain; '''
        _map = dict()
        ret = self._juyuan_conn.query(sql)
        for r in ret:
            _map[r.get("secucode")] = r.get("innercode")
        return _map

    def get_old_inner_code(self, secucode: str):
        # Look up the pre-change InnerCode for a code that has been replaced.
        sql = '''select InnerCode from LC_CodeChange where secucode = '{}';'''.format(secucode)
        r = self._juyuan_conn.get(sql)
        if r:
            inner_code = r.get("InnerCode")
            return inner_code

    def daily_update(self, deadline: datetime.datetime = None):
        inner_map = self.load_innercode_map()
        category_code_map = self.category_code_map()
        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(f"his load sql is {load_sql}")
        datas = self._r_spider_conn.query(load_sql)
        logger.info(f"load count is {len(datas)} from his table.")

        items = []
        for data in datas:
            # Unknown categories fall back to 其他 (27).
            data['CategoryCode'] = category_code_map.get(
                data.get("CategoryCode"), ('其他', 27))[1]
            inner_code = inner_map.get(data.get("SecuCode"))  # secucode in the spider DB has no prefix
            if inner_code:
                data['InnerCode'] = inner_code
            else:
                inner_code = self.get_old_inner_code(data.get("SecuCode"))
                if inner_code:
                    inner_map.update({data.get("SecuCode"): inner_code})
                    data['InnerCode'] = inner_code
                else:
                    continue
            data = utils.process_secucode(data)  # for now keep only codes starting with 0/3/6
            if data:
                items.append(data)
        self._spider_conn.batch_insert(
            items, self.merge_table_name,
            ['SecuCode', 'SecuAbbr', 'CategoryCode', 'PubDatetime1',
             'InsertDatetime1', 'Title1', 'InnerCode'])

        # Backfill publish time and title from the 巨潮快讯 table.
        update_sql = '''select A.* from {} A, {} B where B.UPDATETIMEJZ > '{}' \
and A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告'; '''.format(
            self.live_table, self.his_table, deadline)
        logger.info(f"live sql is {update_sql}")
        datas = self._r_spider_conn.query(update_sql)
        logger.info(f'load count {len(datas)} from live table.')
        for data in datas:
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item,
                                           'PDFLink', data.get("link"))
class JuchaoLiveNewsSpider(object):
    """巨潮快讯 spider."""

    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """
        Fetch the Chinese short name for a security from the juyuan database.
        :param code: security code without exchange prefix
        """
        sql = '''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.get(sql, code).get("SecuAbbr")
        return name

    def get_redit_link(self, link):
        """Follow redirects and return the final URL."""
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[-1].headers["location"]
        except IndexError:
            redit_link = link
        except Exception:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        # Retry on timeouts and bans.
        logger.debug(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def parse(self, url: str, day_string: str):
        items = []
        resp = self.my_get(url)
        if resp and resp.status_code == 200:
            text = resp.text
            datas = json.loads(text)
            if not datas:
                print("{} has no announcement data".format(url))
            else:
                for data in datas:
                    item = {}
                    # Fields to save: publish time, type, title, link, code and name.
                    announcementTime = time.localtime(int(data.get("announcementTime") / 1000))
                    announcementTime = time.strftime("%Y-%m-%d %H:%M:%S", announcementTime)
                    item['pub_date'] = announcementTime
                    item['type'] = data.get("type")
                    item['title'] = data.get("title")
                    page_path = data.get("pagePath")
                    if page_path is None:
                        link = ('http://www.cninfo.com.cn/new/disclosure/detail'
                                '?stock=&announcementId={}&announcementTime={}').format(
                                    data.get("textId"), day_string)
                    else:
                        try:
                            link = self.get_redit_link(page_path)
                        except Exception:
                            link = None
                    if not link:
                        continue
                    item['link'] = link
                    code = data.get("code")
                    if code:
                        item['code'] = code  # security code without prefix
                        item['name'] = self.get_secu_abbr(code)
                    items.append(item)
        return items

    def start(self, start_dt: datetime.datetime = None, end_dt: datetime.datetime = None):
        """Entry point."""
        # Use the given end date, else today.
        if end_dt is not None:
            end_dt = datetime.datetime.combine(end_dt, datetime.time.min)
        else:
            end_dt = datetime.datetime.combine(datetime.datetime.now(), datetime.time.min)
        # Use the given start date, else 3 days before the end date.
        if start_dt is not None:
            start_dt = datetime.datetime.combine(start_dt, datetime.time.min)
        else:
            start_dt = end_dt - datetime.timedelta(days=3)
        # Walk day by day from the end date back to the start date, inclusive.
        this_day = end_dt
        while this_day >= start_dt:
            day_string = this_day.strftime("%Y-%m-%d")
            items = self.parse(self.api_url.format(day_string), day_string)
            logger.info(f"{day_string}, {len(items)}")
            self._spider_conn.batch_insert(items, self.table_name, self.fields)
            this_day -= datetime.timedelta(days=1)
            time.sleep(3)
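
# Usage sketch: nightly incremental run (default window: the last 3 days), or
# pass explicit dates to backfill a range.
def _demo_live_spider():
    spider = JuchaoLiveNewsSpider()
    spider.start()  # today back through 3 days ago
    # spider.start(datetime.datetime(2020, 8, 1), datetime.datetime(2020, 8, 13))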
class SpyAnnBase(object):
    spider_conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    r_spider_conn = Connection(
        host=R_SPIDER_MYSQL_HOST,
        port=R_SPIDER_MYSQL_PORT,
        user=R_SPIDER_MYSQL_USER,
        password=R_SPIDER_MYSQL_PASSWORD,
        database=R_SPIDER_MYSQL_DB,
    )

    def start(self):
        # Schema of the source table, kept here for reference (overwritten below,
        # never executed).
        sql = '''
        CREATE TABLE `announcement_base2` (
          `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
          `SecuCode` varchar(100) COLLATE utf8_bin NOT NULL COMMENT '股票代码',
          `InnerCode` int(11) NOT NULL DEFAULT '0',
          `SecuAbbr` varchar(100) COLLATE utf8_bin NOT NULL COMMENT '股票简称',
          `CategoryCode` tinyint(4) NOT NULL COMMENT '巨潮网站分类',
          `PDFLink` varchar(1000) COLLATE utf8_bin DEFAULT NULL COMMENT '公告pdf地址',
          `PubDatetime1` datetime NOT NULL COMMENT '公告发布时间(巨潮公告速递栏目中的时间)',
          `InsertDatetime1` datetime NOT NULL COMMENT '爬虫入库时间(巨潮公告速递栏目)',
          `Title1` varchar(1000) COLLATE utf8_bin NOT NULL COMMENT '巨潮公告速递栏目中的标题',
          `PubDatetime2` datetime DEFAULT NULL COMMENT '公告发布时间(巨潮快讯栏目中的时间)',
          `InsertDatetime2` datetime DEFAULT NULL COMMENT '爬虫入库时间(巨潮快递栏目)',
          `Title2` varchar(1000) COLLATE utf8_bin DEFAULT NULL COMMENT '巨潮快讯栏目中的标题(没有则留空)',
          `CreateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
          `UpdateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
          PRIMARY KEY (`id`),
          UNIQUE KEY `un1` (`PDFLink`),
          KEY `k1` (`SecuCode`,`CategoryCode`,`PubDatetime1`,`PubDatetime2`,`UpdateTime`),
          KEY `innercode` (`InnerCode`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC COMMENT='公告基础表' ;
        '''

        # Sample row in bas_secumain:
        #   id: 19, secu_code: 000010, secu_abbr: 美丽生态, jy_inner_code: 31,
        #   secu_type: 1, industry_code: IX880032, secu_market: 90
        secu_id_map = {}
        sql = '''select id, secu_code from bas_secumain ; '''
        ret = self.spider_conn.query(sql)
        for r in ret:
            secu_id_map[r['secu_code']] = r['id']
        print(secu_id_map)

        sql = '''
        CREATE TABLE IF NOT EXISTS `spy_announcement_data` (
          `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '主键',
          `secu_id` int(20) NOT NULL COMMENT '证券ID',
          `category_code` tinyint(4) NOT NULL COMMENT '巨潮网站分类',
          `pdf_link` varchar(1000) COLLATE utf8_bin DEFAULT NULL COMMENT '公告pdf地址',
          `title` varchar(1000) COLLATE utf8_bin NOT NULL COMMENT '公告标题',
          `pub_date` datetime NOT NULL COMMENT '公告发布时间(巨潮公告速递栏目中的时间)',
          `create_by` int(11) NOT NULL DEFAULT 0 COMMENT '创建人',
          `update_by` int(11) NOT NULL DEFAULT 0 COMMENT '更新人',
          `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
          `update_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
          PRIMARY KEY (`id`),
          UNIQUE KEY `ix_spy_announcement_data` (`pdf_link`),
          KEY `pub_date` (`pub_date`),
          KEY `update_time` (`update_time`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC COMMENT='公告采集原始表' ;
        '''
        self.spider_conn.execute(sql)

        base_sql = '''
        select id, SecuCode, CategoryCode as category_code, PDFLink as pdf_link,
               Title1 as title, PubDatetime1 as pub_date
        from announcement_base2
        where id >= {} and id < {} ;
        '''
        maxid_sql = '''select max(id) from announcement_base2; '''
        max_id = self.r_spider_conn.get(maxid_sql).get("max(id)")
        print(max_id)  # e.g. 123787519

        # pdf_link included: it is the unique key of spy_announcement_data.
        fields = ['secu_id', 'category_code', 'pdf_link', 'title', 'pub_date']
        for i in range(int(max_id / 10000) + 1):
            start = i * 10000
            end = i * 10000 + 10000
            sql = base_sql.format(start, end)
            print(sql)
            datas = self.r_spider_conn.query(sql)
            for data in datas:
                data['secu_id'] = secu_id_map[data['SecuCode'][2:]]  # strip SH/SZ prefix
                data.pop('SecuCode')
            save_count = self.spider_conn.batch_insert(
                datas, 'spy_announcement_data', fields)
            print(len(datas), ">>>", save_count)
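
# Usage sketch: one-off migration of announcement_base2 into
# spy_announcement_data, keyed by bas_secumain ids.
def _demo_spy_ann_migration():
    SpyAnnBase().start()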
class BasSecuMain(object):
    spider_conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    juyuan_conn = Connection(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        database=JUY_DB,
    )
    theme_conn = Connection(
        host=THE_HOST,
        port=THE_PORT,
        user=THE_USER,
        password=THE_PASSWD,
        database=THE_DB,
    )
    fields = [
        'secu_code',
        'secu_abbr',
        'jy_inner_code',
        'industry_code',
        'secu_type',
    ]
    table_name = 'bas_secumain'

    def start(self):
        sql = '''
        CREATE TABLE IF NOT EXISTS `bas_secumain` (
          `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键',
          `secu_code` varchar(100) NOT NULL COMMENT '股票代码',
          `secu_abbr` varchar(100) NOT NULL COMMENT '股票简称',
          `jy_inner_code` int(11) NOT NULL COMMENT '聚源内部编码',
          `secu_type` int(11) NOT NULL COMMENT '证券代码分类(1:A股 2:B股 3:基金)',
          `industry_code` varchar(20) DEFAULT NULL COMMENT '经传行业版块代码(仅限个股)',
          `secu_market` int(11) DEFAULT NULL COMMENT '证券市场',
          `is_delete` tinyint(1) DEFAULT 0 COMMENT '逻辑删除标志',
          UNIQUE KEY `pk_bas_secumain` (`ID`),
          UNIQUE KEY `ix_bas_secumain` (`jy_inner_code`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='证券主表';
        '''
        self.spider_conn.execute(sql)

        # Map each stock code to its industry-block code from the theme database.
        industry_map = {}
        sql = '''
        select A.code as IndustryCode, A.name as IndustryName,
               B.code as SecuCode, B.name as SecuAbbr
        from block A, block_code B
        where A.type = 1 and A.id = B.bid ;
        '''
        ret = self.theme_conn.query(sql)
        for r in ret:
            industry_map[r['SecuCode']] = r['IndustryCode']
        print(industry_map)
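
# Sketch (an assumption, not in the original: start() above stops after printing
# industry_map; this shows how the declared `fields` could then be filled from
# the juyuan security master plus that map).
def _demo_fill_bas_secumain(industry_map: dict):
    worker = BasSecuMain()
    sql = '''select SecuCode, SecuAbbr, InnerCode, SecuMarket from secumain \
where SecuMarket in (83, 90) and SecuCategory = 1; '''
    items = []
    for r in worker.juyuan_conn.query(sql):
        items.append({
            'secu_code': r['SecuCode'],
            'secu_abbr': r['SecuAbbr'],
            'jy_inner_code': r['InnerCode'],
            'secu_type': 1,  # A-share
            'industry_code': industry_map.get(r['SecuCode']),
        })
    worker.spider_conn.batch_insert(items, worker.table_name, worker.fields)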
if __name__ == '__main__':
    # ding_msg('just test')
    # send_crawl_overview()
    juyuanconn = Connection(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        database=JUY_DB,
    )
    ret = fetch_A_secucode_innercode_map(juyuanconn)
    print(ret)
class SourceAnnouncementBase(object):
    """Merge the two spider tables into the announcement base table."""

    def __init__(self):
        self.merge_table_name = 'announcement_base2'
        self.his_table = 'juchao_ant2'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000
        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )

    def category_code_map(self):
        return {
            'category_bcgz_szsh': ('补充更正', 19),
            'category_bndbg_szsh': ('半年报', 2),
            'category_cqdq_szsh': ('澄清致歉', 12),
            'category_dshgg_szsh': ('董事会', 18),
            'category_fxts_szsh': ('风险提示', 21),
            'category_gddh_szsh': ('股东大会', 15),
            'category_gqbd_szsh': ('股权变动', 16),
            'category_gqjl_szsh': ('股权激励', 17),
            'category_gszl_szsh': ('公司治理', 24),
            'category_gszq_szsh': ('公司债', 25),
            'category_jj_szsh': ('解禁', 9),
            'category_jshgg_szsh': ('监事会', 14),
            'category_kzzq_szsh': ('可转债', 22),
            'category_ndbg_szsh': ('年报', 4),
            'category_pg_szsh': ('配股', 7),
            'category_qtrz_szsh': ('其他融资', 23),
            'category_qyfpxzcs_szsh': ('权益分派', 11),
            'category_rcjy_szsh': ('日常经营', 10),
            'category_sf_szsh': ('首发', 8),
            'category_sjdbg_szsh': ('三季报', 3),
            'category_tbclts_szsh': ('特别处理和退市', 13),
            'category_tszlq_szsh': ('退市整理期', 20),
            'category_yjdbg_szsh': ('一季报', 1),
            'category_yjygjxz_szsh': ('业绩预告', 5),
            'category_zf_szsh': ('增发', 6),
            'category_zj_szsh': ('中介报告', 26),
            'category_others': ('其他', 27),
        }

    def spider_max_id(self, table_name: str):
        sql = f'''select max(id) as max_id from {table_name}; '''
        max_id = self._r_spider_conn.get(sql).get("max_id")
        return max_id

    def check_ids(self):
        # Walk the id space batch by batch; wherever the merge table has fewer
        # rows than the history table, collect the missing ids.
        _delta = []
        max_id = self.spider_max_id(self.his_table)
        for i in range(int(max_id / self.batch_number) + 1):
            begin_id = i * self.batch_number
            end_id = i * self.batch_number + self.batch_number
            print()
            print(begin_id, end_id)
            sql1 = '''select count(id) from {} where id between {} and {}; '''.format(
                self.his_table, begin_id, end_id)
            sql2 = '''select count(id) from {} where id between {} and {}; '''.format(
                self.merge_table_name, begin_id, end_id)
            count1 = self._r_spider_conn.get(sql1).get("count(id)")
            count2 = self._r_spider_conn.get(sql2).get("count(id)")
            print(count1, count2)
            if count1 != count2:
                sl1 = '''select id from {} where id between {} and {};'''.format(
                    self.his_table, begin_id, end_id)
                sl2 = '''select id from {} where id between {} and {};'''.format(
                    self.merge_table_name, begin_id, end_id)
                rs1 = self._r_spider_conn.query(sl1)
                ids1 = set([r.get("id") for r in rs1])
                rs2 = self._r_spider_conn.query(sl2)
                ids2 = set([r.get("id") for r in rs2])
                print(list(ids1 - ids2))
                _delta.extend(list(ids1 - ids2))
        print(_delta)
        return _delta

    def process_diff_ids(self, diff_lst):
        diff_lst = tuple(diff_lst)
        category_code_map = self.category_code_map()
        _sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} \
where id in {}; '''.format(self.his_table, diff_lst)
        datas = self._r_spider_conn.query(_sql)
        print(len(datas))
        for data in datas:
            data['CategoryCode'] = category_code_map.get(data.get("CategoryCode"))[1]
            data = utils.process_secucode(data)  # for now keep only codes starting with 0/3/6
            if data:
                print(data)
                ret = self._spider_conn.table_insert(
                    self.merge_table_name, data,
                    ['SecuCode', 'SecuAbbr', 'CategoryCode',
                     'PubDatetime1', 'InsertDatetime1', 'Title1'])
                print(ret)

    def first_load(self):
        inner_map = self.load_innercode_map()
        category_code_map = self.category_code_map()
        load_sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
id >= {} and id < {}; '''
        max_id = self.spider_max_id(self.his_table)
        for i in range(int(max_id / self.batch_number) + 1):
            begin_id = i * self.batch_number
            end_id = i * self.batch_number + self.batch_number
            print(begin_id, end_id)
            _sql = load_sql.format(self.his_table, begin_id, end_id)
            print(_sql)
            datas = self._r_spider_conn.query(_sql)
            for data in datas:
                inner_code = inner_map.get(data.get("SecuCode"))  # secucode in the spider DB has no prefix
                if inner_code:
                    data['InnerCode'] = inner_code
                else:
                    inner_code = self.get_old_inner_code(data.get("SecuCode"))
                    if inner_code:
                        data['InnerCode'] = inner_code
                        inner_map.update({data.get("SecuCode"): inner_code})
                    else:
                        continue
                data['CategoryCode'] = category_code_map.get(data.get("CategoryCode"))[1]
                data = utils.process_secucode(data)  # for now keep only codes starting with 0/3/6
                if data:
                    self._spider_conn.table_insert(
                        self.merge_table_name, data,
                        ['SecuCode', 'SecuAbbr', 'CategoryCode', 'PubDatetime1',
                         'InsertDatetime1', 'Title1', 'InnerCode'])

        # Backfill publish time and title from the 巨潮快讯 table.
        update_sql = '''select A.* from {} A, {} B \
where A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告' \
and A.id between {} and {}; '''
        max_id = self.spider_max_id(self.live_table)
        for j in range(int(max_id / self.batch_number) + 1):
            begin_id = j * self.batch_number
            end_id = j * self.batch_number + self.batch_number
            print(begin_id, end_id)
            _sql = update_sql.format(self.live_table, self.his_table, begin_id, end_id)
            print(_sql)
            datas = self._r_spider_conn.query(_sql)
            for data in datas:
                item = {
                    'PubDatetime2': data.get("pub_date"),
                    'InsertDatetime2': data.get("CREATETIMEJZ"),
                    'Title2': data.get("title"),
                }
                self._spider_conn.table_update(self.merge_table_name, item,
                                               'PDFLink', data.get("link"))

    def load_innercode_map(self):
        sql = '''select secucode, innercode from secumain; '''
        _map = dict()
        ret = self._juyuan_conn.query(sql)
        for r in ret:
            _map[r.get("secucode")] = r.get("innercode")
        return _map

    def get_old_inner_code(self, secucode: str):
        # Look up the pre-change InnerCode for a code that has been replaced.
        sql = '''select InnerCode from LC_CodeChange where secucode = '{}';'''.format(
            secucode)
        r = self._juyuan_conn.get(sql)
        if r:
            inner_code = r.get("InnerCode")
            return inner_code
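
# Usage sketch: one-off bulk load, then periodic consistency repair.
def _demo_announcement_base_flow():
    worker = SourceAnnouncementBase()
    worker.first_load()                   # initial merge of juchao_ant2 + 快讯 backfill
    missing = worker.check_ids()          # ids in his_table missing from the merge table
    if missing:
        worker.process_diff_ids(missing)  # re-insert just those rows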
class JuChaoCodeMap(object):
    def __init__(self):
        self.fields = ['code', 'orgId', 'category', 'pinyin', 'zwjc']
        self.tool_table_name = 'juchao_codemap'
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )

    def create_tools_table(self):
        sql = '''
        CREATE TABLE IF NOT EXISTS `juchao_codemap` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `code` varchar(8) NOT NULL COMMENT '证券代码',
          `orgId` varchar(16) NOT NULL COMMENT '证券编码',
          `category` varchar(8) NOT NULL COMMENT '证券分类',
          `pinyin` varchar(10) NOT NULL COMMENT '证券中文名拼音',
          `zwjc` varchar(20) NOT NULL COMMENT '证券中文名',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `orgId_code` (`orgId`, `code`),
          KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨潮证券编码';
        '''
        self._spider_conn.execute(sql)

    def get_stock_json(self):
        api = 'http://www.cninfo.com.cn/new/data/szse_a_stock.json?_={}'.format(
            int(time.time() * 1000))
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://uc.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://uc.cninfo.com.cn/user/optionalConfig?groupId=88937',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        resp = requests.get(api, headers=headers)
        if resp and resp.status_code == 200:
            text = resp.text
            try:
                py_data = json.loads(text).get("stockList")
            except Exception:
                print(text)
                raise Exception("resp parse error.")
            self._spider_conn.batch_insert(py_data, self.tool_table_name, self.fields)

    def start(self):
        self.create_tools_table()
        self.get_stock_json()
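
# Usage sketch: refresh the code map before a crawl so newly listed codes resolve.
def _demo_refresh_codemap():
    JuChaoCodeMap().start()  # creates the table if needed, then upserts the full list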
# The beginning of sql1 (CREATE TABLE `sf_const_announcement` and its leading
# event/statistics columns) is truncated in the source; the surviving tail of
# the DDL is kept below.
sql1 = '''
    `Desc` varchar(1000) COLLATE utf8_bin DEFAULT NULL COMMENT '事件描述',
    `IfShow` tinyint(4) NOT NULL DEFAULT '1' COMMENT '软件是否展示:0-不展示,1-展示',
    `FirstLevelShowRank` int(10) DEFAULT NULL COMMENT '公告智选前端展示排序-一级事件(对应EventMainTypeCode)(一个一级事件下面对应的多个二级事件每条数据需要保证这个字段一致)',
    `SecondLevelShowRank` int(10) DEFAULT NULL COMMENT '公告智选前端展示排序-二级事件(对应EventCode)',
    `CreateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `UpdateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (`id`) USING BTREE,
    UNIQUE KEY `u1` (`EventName`) USING BTREE,
    UNIQUE KEY `u2` (`EventCode`) USING BTREE,
    UNIQUE KEY `u3` (`EventMainTypeCode`,`EventCode`,`SecondLevelShowRank`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='常量表-公告事件';
'''

sql2 = '''
INSERT INTO sf_const_announcement (EventMainTypeCode, EventMainTypeName, EventCode, EventName, \
EventOneWordLabel, Sentiment, Level, EventDayChgPerc, EventDayWinRatio, NextDayWinRatio, NextDayChgPerc, \
ThreeDayChgPerc, FiveDayChgPerc, IfShow, FirstLevelShowRank, SecondLevelShowRank) VALUES \
('A000', '其他', 'A000000', '其他类别', '其', 0,0, 0,0,0,0,0,0, 1, 7,1);
'''

conn = Connection(
    host=TEST_MYSQL_HOST,
    port=TEST_MYSQL_PORT,
    user=TEST_MYSQL_USER,
    password=TEST_MYSQL_PASSWORD,
    database=TEST_MYSQL_DB,
)
conn.insert(sql1)
conn.insert(sql2)