def crawl_holders(data):
    """Crawl every holder company listed in *data* from qichacha.

    data: dict with 'companyName' and 'holders'; each holder is a dict
    with at least 'name' and 'url'.

    Relies on (and mutates) module-level state:
      crawled   -- list of company names already processed
      has_error -- list of (name, url) pairs that failed to crawl

    Raises NeedValidationError unchanged so the caller can handle the
    site's captcha/validation demand.
    """
    logger.info('----crawling {} holders----'.format(data['companyName']))
    for i in data['holders']:
        # Only follow real company pages (firm_<id>.html) we have not seen.
        # Fix: the '.' before 'html' is escaped — previously it matched any
        # character (e.g. 'firm_xhtml' would have passed).
        if re.search(r'firm_(\w+)\.html', i['url']) and i['name'] not in crawled:
            try:
                q = crawl_from_qichacha(i['name'], i['url'], {})
            except NeedValidationError:
                # Fix: bare `raise` re-raises with the original traceback
                # intact (`raise e` would rewrite it from here).
                raise
            except Exception:
                logger.info('error: {}, {}'.format(i['name'], i['url']))
                has_error.append((i['name'], i['url']))
                # Mark as crawled even on failure so it is not retried.
                crawled.append(i['name'])
            else:
                # Listed company: also fetch its stock information.
                if q['overview']['stock_code']:
                    crawl_stock(q['overview']['stock_code'])
                logger.info('crawl: {}'.format(i['name']))
                crawled.append(i['name'])
    logger.info('----crawling {} holders end----'.format(data['companyName']))
def run(self):
    """Worker-thread loop: consume (unique, name, level) items from
    wait_crawl_q, crawl each company from qichacha, persist the result to
    MongoDB, and fan follow-up work (holders / investments) out to
    wait_write_q.

    Mutates module globals:
      single_time_crawled -- count of items handled this session
      need_validate       -- set True when the site demands validation

    The thread exits when wait_crawl_q yields nothing for 15 seconds.
    """
    global single_time_crawled, need_validate
    while 1:
        try:
            # Blocks up to 15s; queue.Empty ends the thread (handler below).
            unique, name, level = self.wait_crawl_q.get(timeout=15)
            logger1.info('+++++{} crawling ({})'.format(
                self.thread_name, name))
            url = 'https://www.qichacha.com/firm_' + unique + '.html'
            # Proxies are disabled for now.
            proxy = None
            # Throttle: random delay between requests.
            time.sleep(random.uniform(1, 2))
            try:
                qichacha, html = crawl_from_qichacha(name, url, proxy)
            # The site asked for (captcha) validation.
            except NeedValidationError as e:
                # Wait two seconds, then retry once.
                time.sleep(2)
                try:
                    # NOTE(review): the first call unpacks two values
                    # (qichacha, html) but this retry binds the raw return
                    # to `qichacha` alone, and a successful retry skips the
                    # `else` branch below so the result is never stored —
                    # this looks like a bug; confirm intended behavior.
                    qichacha = crawl_from_qichacha(name, url, proxy)
                except NeedValidationError as e:
                    # Validation still required: do NOT queue this company
                    # for writing.
                    self.wait_crawl_q.task_done()
                    need_validate = True
                    # Drain the whole crawl queue so the workers wind down.
                    logger1.error(
                        '===!!{} get Need Validation Error, clearing wait_crawl_q'
                        .format(self.thread_name))
                    while not self.wait_crawl_q.empty():
                        try:
                            self.wait_crawl_q.get_nowait()
                            self.wait_crawl_q.task_done()
                        except queue.Empty:
                            logger1.error(
                                '!!!!!{} get Empty Error when clear wait_crawl_q'
                                .format(self.thread_name))
                    logger1.error(
                        '===!!{} clear wait_crawl_q finished'.format(
                            self.thread_name))
                    continue
            # Any other crawler failure.
            except Exception as e:
                logger1.error('!!!!!{} crawl ({}, {}) error!!!!!'.format(
                    self.thread_name, name, url))
                logger1.exception(e)
                # Items put on wait_write_q come in two kinds:
                # flag = 0 -> update that record's crawled_date / has_error
                # flag = 1 -> insert a new record (unique, name, level)
                # Here: record the failed attempt for this company.
                flag = 0
                crawled_date = datetime.date.today()
                has_error = 1
                wait_write_q_item = (flag, unique, name, level, crawled_date,
                                     has_error)
                self.wait_write_q.put(wait_write_q_item)
                self.wait_crawl_q.task_done()
                single_time_crawled += 1
                continue
            # Crawl succeeded.
            else:
                logger1.info('+++++{} crawled ({})'.format(
                    self.thread_name, name))
                flag = 0
                crawled_date = datetime.date.today()
                has_error = 0
                eastmoney, cninfo = '', ''
                # Listed company: also crawl its stock information.
                if qichacha['overview']['stock_code']:
                    try:
                        eastmoney, cninfo = crawl_stock(
                            qichacha['overview']['stock_code'])
                    except Exception as e:
                        # has_error = 2 marks "stock info crawl failed".
                        has_error = 2
                        logger1.exception(e)
                        logger1.error('crawl stock {} error'.format(
                            qichacha['overview']['stock_code']))
                # Update this company's crawled_date and has_error.
                wait_write_q_item = (flag, unique, name, level, crawled_date,
                                     has_error)
                self.wait_write_q.put(wait_write_q_item)
                # Persist the crawl result to MongoDB.
                document = {
                    'unique': unique,
                    'company': qichacha['companyName'],
                    'html': html,
                    'qichacha': qichacha,
                    'eastmoney': eastmoney,
                    'cninfo': cninfo,
                    'crawl_time': str(datetime.date.today()),
                    'store_time': ''
                }
                self.mongo_collection.insert_one(document)
                # Fan out follow-up work according to this company's level.
                # level = 0 (root): enqueue both holders and investments.
                if level == 0:
                    for holder in qichacha['holders']:
                        new_unique = re.search(r'firm_(\w+).html',
                                               holder['url'])
                        if not new_unique:
                            continue
                        # Insert a holder record (flag = 1).
                        flag = 1
                        new_unique = new_unique.group(1)
                        new_name = holder['name']
                        new_level = level - 1
                        new_crawled_date = '2000-01-01'
                        new_has_error = ''
                        wait_write_q_item = (flag, new_unique, new_name,
                                             new_level, new_crawled_date,
                                             new_has_error)
                        self.wait_write_q.put(wait_write_q_item)
                    for investment in qichacha['investments']:
                        new_unique = re.search(r'firm_(\w+).html',
                                               investment['url'])
                        if not new_unique:
                            continue
                        # Insert an investment record (flag = 1).
                        flag = 1
                        new_unique = new_unique.group(1)
                        new_name = investment['name']
                        new_level = level + 1
                        new_crawled_date = '2000-01-01'
                        new_has_error = ''
                        wait_write_q_item = (flag, new_unique, new_name,
                                             new_level, new_crawled_date,
                                             new_has_error)
                        self.wait_write_q.put(wait_write_q_item)
                # -2 < level < 0: walking up the holder chain — holders only.
                elif -2 < level < 0:
                    for holder in qichacha['holders']:
                        new_unique = re.search(r'firm_(\w+).html',
                                               holder['url'])
                        if not new_unique:
                            continue
                        # Insert a holder record (flag = 1).
                        flag = 1
                        new_unique = new_unique.group(1)
                        new_name = holder['name']
                        new_level = level - 1
                        new_crawled_date = '2000-01-01'
                        new_has_error = ''
                        wait_write_q_item = (flag, new_unique, new_name,
                                             new_level, new_crawled_date,
                                             new_has_error)
                        self.wait_write_q.put(wait_write_q_item)
                # 0 < level < 6: walking down investments — investments only.
                elif 0 < level < 6:
                    for investment in qichacha['investments']:
                        new_unique = re.search(r'firm_(\w+).html',
                                               investment['url'])
                        if not new_unique:
                            continue
                        # Insert an investment record (flag = 1).
                        flag = 1
                        new_unique = new_unique.group(1)
                        new_name = investment['name']
                        new_level = level + 1
                        new_crawled_date = '2000-01-01'
                        new_has_error = ''
                        wait_write_q_item = (flag, new_unique, new_name,
                                             new_level, new_crawled_date,
                                             new_has_error)
                        self.wait_write_q.put(wait_write_q_item)
                # level = -2 or level = 6: depth limit reached — only the
                # company itself was queued above; nothing more to enqueue.
                elif level == -2 or level == 6:
                    # holders / investments intentionally not processed.
                    pass
            # NOTE(review): collapsed source made the indentation of these
            # two lines ambiguous (outer-try level vs inside `else`);
            # reconstructed at outer-try level — confirm against history.
            self.wait_crawl_q.task_done()
            single_time_crawled += 1
        except queue.Empty:
            logger1.info('+++++{}: No data in wait_crawl_q'.format(
                self.thread_name))
            logger1.info('+++++{} end+++++'.format(self.thread_name))
            return
def run(self):
    """Worker-thread loop: consume (name, unique) items from wait_crawl_q,
    crawl each company from qichacha (searching by name when `unique` is
    empty), and push bookkeeping tuples onto wait_write_q.

    wait_write_q items are 4-tuples (name, unique, X, Y); from usage the
    third field appears to mean "pending / needs crawl" and the fourth
    "has error" — TODO confirm against the writer thread.

    Mutates module globals:
      single_time_crawled -- count of items handled this session
      need_validate       -- set True when the site demands validation

    The thread exits when wait_crawl_q yields nothing for 15 seconds.
    """
    global single_time_crawled, need_validate
    while 1:
        try:
            # Wait up to 15s.
            # queue.Empty is raised if the crawl queue stays empty.
            name, unique = self.wait_crawl_q.get(timeout=15)
            # logger.info('+++++{} get ({}, {}) from wait_crawl_q'.format(self.name, name, unique))
            if unique:
                url = 'https://www.qichacha.com/firm_' + unique + '.html'
            else:
                # No unique id yet: crawl_from_qichacha will search by name.
                url = ''
            # Proxies are disabled for now.
            proxy = None
            # Throttle: random delay between requests.
            time.sleep(random.uniform(2, 3))
            try:
                qichacha = crawl_from_qichacha(name, url, proxy)
            except NeedValidationError as e:
                # Re-queue this company as still pending (1, 0), then shut
                # the crawl down: flag validation and drain the queue.
                wait_write_q_item = (name, unique, 1, 0)
                self.wait_write_q.put(wait_write_q_item)
                logger1.info(
                    '+++++{} put ({}, {}, {}, {}) into wait_write_q, remain: {}'
                    .format(self.name, *wait_write_q_item,
                            self.wait_crawl_q.qsize()))
                self.wait_crawl_q.task_done()
                need_validate = True
                logger1.error(
                    '===!!{} get Need Validation Error, clearing q, qsize: {}, unfinished: {}'
                    .format(self.name, self.wait_crawl_q.qsize(),
                            self.wait_crawl_q.unfinished_tasks))
                while not self.wait_crawl_q.empty():
                    try:
                        self.wait_crawl_q.get_nowait()
                        self.wait_crawl_q.task_done()
                    except queue.Empty:
                        logger1.error(
                            '!!!!!{} get Empty Error when clear q'.format(
                                self.name))
                logger1.error(
                    '===!!{} clear q finished, qsize: {}, unfinished: {}'.
                    format(self.name, self.wait_crawl_q.qsize(),
                           self.wait_crawl_q.unfinished_tasks))
                continue
            except Exception as e:
                # Any other crawl failure: record it as errored (0, 1).
                logger1.exception(e)
                logger1.error('!!!!!{} crawl ({}, {}) error!!!!!'.format(
                    self.name, name, url))
                wait_write_q_item = (name, unique, 0, 1)
                self.wait_write_q.put(wait_write_q_item)
                logger1.info(
                    '+++++{} put ({}, {}, {}, {}) into wait_write_q, remain: {}'
                    .format(self.name, *wait_write_q_item,
                            self.wait_crawl_q.qsize()))
                self.wait_crawl_q.task_done()
                single_time_crawled += 1
                continue
            else:
                # Crawl succeeded.
                if not unique:
                    # Name-based crawl: recover the unique id from the URL
                    # the crawler actually landed on.
                    url = qichacha['url']
                    unique = re.search(r'firm_(\w+).html', url).group(1)
                logger1.info('+++++{} crawled ({})'.format(
                    self.name, name))
                # Mark this company as done, no error (0, 0).
                wait_write_q_item = (name, unique, 0, 0)
                self.wait_write_q.put(wait_write_q_item)
                logger1.info(
                    '+++++{} put ({}, {}, {}, {}) into wait_write_q, remain: {}'
                    .format(self.name, *wait_write_q_item,
                            self.wait_crawl_q.qsize()))
                # Listed company: fetch stock info in a separate thread so
                # this worker is not blocked.
                if qichacha['overview']['stock_code']:
                    crawl_stock_thread = threading.Thread(
                        target=crawl_stock,
                        args=(qichacha['overview']['stock_code'], ),
                        name='crawl-stock-thread')
                    crawl_stock_thread.start()
                # Enqueue each holder as pending (1, 0).
                # NOTE(review): these loops rebind the outer `unique`/`name`
                # variables — harmless here (both already consumed) but
                # easy to trip over when editing.
                for holder in qichacha['holders']:
                    unique = re.search(r'firm_(\w+).html', holder['url'])
                    if not unique:
                        continue
                    unique = unique.group(1)
                    name = holder['name']
                    wait_write_q_item = (name, unique, 1, 0)
                    self.wait_write_q.put(wait_write_q_item)
                # Enqueue each investment as pending (1, 0).
                for investment in qichacha['investments']:
                    unique = re.search(r'firm_(\w+).html',
                                       investment['url'])
                    if not unique:
                        continue
                    unique = unique.group(1)
                    # NOTE(review): uses 'company_name' here whereas the
                    # other worker reads investment['name'] — confirm the
                    # crawler's investment schema.
                    name = investment['company_name']
                    wait_write_q_item = (name, unique, 1, 0)
                    self.wait_write_q.put(wait_write_q_item)
            # NOTE(review): collapsed source made the indentation of these
            # two lines ambiguous (outer-try level vs inside `else`);
            # reconstructed at outer-try level — behavior is identical
            # either way since both except branches `continue`.
            self.wait_crawl_q.task_done()
            single_time_crawled += 1
        except queue.Empty:
            logger1.info('+++++{}: No data in wait_crawl_q'.format(
                self.name))
            logger1.info('+++++{} end+++++'.format(self.name))
            return
'贵州汇生林业开发有限公司', 'url': 'https://www.qichacha.com/firm_bc79a2ed616358340df33a6155d399c1.html' }, ] with open('crawled.json', 'r', encoding='utf-8') as f: crawled = json.load(f) with open('error.json', 'r', encoding='utf-8') as f: has_error = json.load(f) for g in group_list: if not os.path.isfile('json/qichacha/' + g['name'] + '.json'): # 公司本身 q = crawl_from_qichacha(g['name'], '', {}) if q['overview']['stock_code']: crawl_stock(q['overview']['stock_code']) crawled.append(g['name']) unique = re.search(r'_(\w+).html', g['url']).group(1) with open('json/qichacha/' + unique + '.json', 'r', encoding='utf-8') as f: root_data = json.load(f) try: # 一级母公司 crawl_holders(root_data) # 一级子公司 crawl_investments(root_data)
if stop: finish() unique, name, level = item logger1.info('Crawling (%s)' % name) url = 'https://www.qichacha.com/firm_' + unique + '.html' # 暂时不使用代理 proxy = None # 加入延时 time.sleep(random.uniform(crawl_delay, crawl_delay + 3)) try: qichacha, html = crawl_from_qichacha(name, url, proxy) # 出现未登录错误 except NotLoginError as e: logger1.error('Not Login, Please reset cookie') finish() # 出现验证错误 except NeedValidationError as e: # 等待两秒后重试 time.sleep(2) try: qichacha = crawl_from_qichacha(name, url, proxy) except NeedValidationError as e: