def get_pagination(self, key):
    min_page = 0
    max_page = 5
    if not key:
        return min_page, max_page

    if API_MODE == 'tyc':
        return min_page, max_page
    elif API_MODE == 'pro':
        url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, '0', parse.quote(key))
        is_ok, search_resp = api_get(url=url, headers=self.headers,
                                     data={}, resptype='text')
        soup = BeautifulSoup(search_resp, 'lxml')
        search_pagination = soup.find_all('div', class_='search-pagination')

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers,
                                                 data={}, resptype='text')
            return sub_is_ok, sub_search_resp

        # Manual verification: keep retrying until the pagination bar is present
        if len(search_pagination) == 0 or not is_ok:
            while 1:
                if is_ok and len(search_pagination) > 0:
                    break
                else:
                    LOG.critical('验证############### %s ###############' % url)
                    random_sleep(20, 25)
                    is_ok, search_resp = while_req(url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    search_pagination = soup.find_all(
                        'div', class_='search-pagination')

        # The last-but-one <a> in the pagination bar holds the final page number
        l = len(search_pagination[0].find_all('a'))
        for index_a, a in enumerate(search_pagination[0].find_all('a')):
            if index_a == (l - 2):
                max_page = a.string.strip()
                if max_page.find('...') > -1:
                    max_page = max_page.split('...')[1]
                if isinstance(max_page, str):
                    max_page = int(max_page)
                break

    LOG.info('[%s] pagination max: %s' % (key, max_page))
    return min_page, max_page
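# The "manual verification" retry block above is repeated almost verbatim in the
# other methods of this module (work_by_key, detail_by_url, ...). A minimal sketch
# of how it could be shared, assuming the project's existing api_get / random_sleep /
# cookies_get / LOG helpers; the method name wait_for_manual_verify and the parse_fn
# callback are hypothetical, and the cookie refresh mirrors the work_by_key variant.
def wait_for_manual_verify(self, url, parse_fn):
    """Re-request url until parse_fn extracts a non-empty result from the page."""
    while True:
        is_ok, resp = api_get(url=url, headers=self.headers, data={}, resptype='text')
        if is_ok:
            result = parse_fn(resp)
            if result:
                return resp, result
        LOG.critical('验证############### %s ###############' % url)
        random_sleep(20, 25)                  # leave time to solve the captcha by hand
        self.headers['Cookie'] = cookies_get()

# Usage sketch, replacing the `while 1` block in get_pagination above:
# resp, pagination = self.wait_for_manual_verify(
#     url,
#     lambda html: BeautifulSoup(html, 'lxml').find_all('div', class_='search-pagination'))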
def work_by_key(self, key): print(key, '@' * 100) ret_res = list() if not key: LOG.error("【%s】key is null, no work." % RUN_MODE) return ret_res # page is_page = False for ct in range(9): url = '%s/p%s?key=%s' % (TYC_SEARCH_API, 1, parse.quote(key)) is_ok, search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') self.headers['Cookie'] = cookies_get() if is_ok: is_page = True break page_vlas = 200 if not is_page: page_vlas = 200 else: et_late = etree.HTML(search_resp) page_num = [ i.xpath('./li/a/text()')[-2] for i in et_late.xpath( '//div[@class="result-footer"]/div[@class=" search-pager"]/ul' ) ] if page_num: page_vlas = str(page_num[0]).replace('.', '') LOG.critical(f'搜索关键词为:{key}, 总页面:{page_vlas}------------------------') print(f'搜索关键词为:{key}, 总页面:{page_vlas}------------------------') # 公司列表 for page in range(1, int(page_vlas), 1): self.headers['Cookie'] = cookies_get() url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page, parse.quote(key)) print(url, 'Q' * 80) is_ok, search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') if not is_ok: continue soup = BeautifulSoup(search_resp, 'lxml') tags = soup.find_all( 'a', attrs={"tyc-event-ch": "CompanySearch.Company"}) def while_req(url): self.headers['Cookie'] = cookies_get() sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp HTNL = etree.HTML(search_resp) print( HTNL.xpath( '//*[@id="web-content"]/div/div[1]/div[3]/div[2]/div[1]/div/div[3]/div[1]/a/text()' ), 'A' * 80) # 添加手动验证功能 if len(tags) == 0: while 1: if is_ok and len(tags) > 0: break else: print(url) LOG.critical('验证############### %s ###############' % url) random_sleep(20, 25) self.headers['Cookie'] = cookies_get() is_ok, search_resp = while_req(url) soup = BeautifulSoup(search_resp, 'lxml') tags = soup.find_all( 'a', attrs={"tyc-event-ch": "CompanySearch.Company"}) eto = etree.HTML(search_resp) user_name = eto.xpath('//div[@nav-type="user"]/a/text()') is_success = False for i in range(9): if not ''.join(user_name): self.headers['Cookie'] = cookies_get() is_ok, search_resp = while_req(url) soup = BeautifulSoup(search_resp, 'lxml') tags = soup.find_all( 'a', attrs={"tyc-event-ch": "CompanySearch.Company"}) is_success = True break if is_success: for tag in tags: if not tag or not tag.attrs.get('href'): continue res_dict = dict() res_dict['tyt_url'] = tag.get('href').strip() res_dict['name'] = tag.get_text().strip() res_dict['company_id'] = str( tag.get('href')).split('/')[-1] res_dict['label_index'] = str(key) res_dict['rquest_url'] = url res_dict['source'] = '天眼查' res_dict['created_time'] = str(datetime.now()) result = _insert(res_dict) if result.get('status', False): c_id = str(result.get('_id')) try: # detail_res = self.detail_by_url(res_dict.get('tyt_url')) self.detail_by_url(res_dict.get('tyt_url'), c_id) except: try: self.detail_by_url(res_dict.get('tyt_url'), c_id) except: pass ret_res.append(res_dict) random_sleep(1, 2.5) # break # break return ret_res
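# Sketch of the page-count parsing used in work_by_key above: the last-but-one <a>
# in the result-footer pager holds the final page number, sometimes rendered as
# "...123", so the dots are stripped before converting. The XPath is the one used
# above; total_pages is a hypothetical helper name and 200 is the same fallback
# the method uses when no pager is found.
from lxml import etree

def total_pages(html_text, default=200):
    tree = etree.HTML(html_text)
    nums = [ul.xpath('./li/a/text()')[-2]
            for ul in tree.xpath('//div[@class="result-footer"]'
                                 '/div[@class=" search-pager"]/ul')]
    if not nums:
        return default
    return int(str(nums[0]).replace('.', ''))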
def detail_by_url(self, comp_url: str, obj_id: str): print(self.count, comp_url, obj_id, '$' * 80) detail_res = dict() if not comp_url: return detail_res is_ok, search_resp = api_get(url=comp_url, headers=self.headers, data={}, resptype='text') if not is_ok: return detail_res soup = BeautifulSoup(search_resp, 'lxml') # header: 详情页 公司名称 title_list = soup.find_all('div', class_="header") et2 = etree.HTML(search_resp) # if not title_list: # return -1 try: company_name = (title_list[0].find_all( 'h1', class_="name"))[0].get_text() except: name = et2.xpath( '//*[@id="company_web_top"]/div[2]/div[3]/div[1]/h1/text()') company_name = ''.join(name) detail_res['company_name'] = company_name # 电话 更多联系方式 # print(et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()'), 'OK '*80) origin_phone = et2.xpath( '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()' ) # 邮箱 更多邮箱 # print(et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()'), 'EMAIL '*80) origin_email = et2.xpath( '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()' ) if origin_phone and origin_email: year_list = [i.get('showSource') for i in eval(origin_phone[0])] phone_item_vals = [ i.get('phoneNumber') for i in eval(origin_phone[0]) ] email_list = eval(origin_email[0]) contact_item = {} for contact in zip(year_list, phone_item_vals, email_list): contact_item['c_id'] = obj_id contact_item['company_name'] = detail_res.get( 'company_name', '') contact_item['report_year'] = contact[0] contact_item['phone'] = contact[1] contact_item['email'] = contact[-1] contact_item['date_time'] = datetime.now() bixao_phone_emial.find_one_and_update({'c_id': obj_id}, {'$set': contact_item}, upsert=True) # detail: 电话 邮箱 公司官网 地址 简介 detail_div = soup.find_all('div', class_="detail") def while_req(url): sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp # 添加手动验证功能 if not detail_div: while 1: if is_ok and detail_div: break else: LOG.critical('验证############### %s ###############' % comp_url) random_sleep(20, 25) self.headers['Cookie'] = cookies_get() is_ok, search_resp = while_req(comp_url) soup = BeautifulSoup(search_resp, 'lxml') detail_div = soup.find_all('div', class_="detail") for div in detail_div[0].find_all('div'): if not div: continue # f0 电话 && 邮箱 if div.get('class') == ['f0']: for big_index, big_child in enumerate(div): if big_index == 0: for index, child in enumerate(big_child.children): if index == 1: detail_res['phone'] = child.get_text().strip( ) or '-' break elif big_index == 1: for index, child in enumerate(big_child.children): if index == 1: detail_res['email'] = child.get_text().strip( ) or '-' break else: break # 公司官网 && 地址 elif div.get('class') == ['f0', 'clearfix']: for big_index, big_child in enumerate(div): if big_index == 0: for index, child in enumerate(big_child.children): if index == 1: detail_res['company_url'] = child.get_text( ).strip() or '-' break elif big_index == 1: for index, child in enumerate(big_child.children): if index == 1: for small_index, small_child in enumerate( child.children): if small_index == 0: detail_res[ 'address'] = small_child.get_text( ).strip() or '-' break break else: break # 简介 elif div.get('class') == ['summary']: for big_index, big_child in enumerate(div): if big_index == 0: resume = big_child.string if resume: resume = resume.strip() detail_res['resume'] = resume or '-' break else: 
break else: continue # detail-list: detail_list_div = soup.find_all('div', class_="detail-list") if not detail_list_div: return detail_res detail_res['c_id'] = obj_id etc = etree.HTML(search_resp) for div in detail_list_div[0].find_all('div'): if not div: continue if div.get('tyc-event-ch' ) == 'CompangyDetail.gongshangxinxin': # 工商信息 for index_1, child_1 in enumerate( div.find_all('div', recursive=False)): if index_1 == 1: for index_1_1, child_1_1 in enumerate(child_1): if index_1_1 == 2: for index_tr, tr in enumerate( child_1_1.find_all('tr')): if index_tr == 0: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 注册资本 detail_res[ 'register_funds'] = td.get_text( ).strip() or '-' elif index_td == 3: # 实缴资金 detail_res[ 'paidin_funds'] = td.get_text( ).strip() or '-' elif index_tr == 1: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 成立日期 detail_res[ 'establish_date'] = td.get_text( ).strip() or '-' elif index_td == 3: # 经营状态 detail_res[ 'status'] = td.get_text( ).strip() or '-' elif index_tr == 2: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 统一社会信用代码 detail_res[ 'credit_code'] = td.get_text( ).strip() or '-' elif index_td == 3: # 工商注册号 detail_res[ 'registration_number'] = td.get_text( ).strip() or '-' elif index_tr == 3: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 纳税人识别号 detail_res[ 'identification_number'] = td.get_text( ).strip() or '-' elif index_td == 3: # 组织机构代码 detail_res[ 'organization_code'] = td.get_text( ).strip() or '-' elif index_tr == 4: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 公司类型 detail_res[ 'company_type'] = td.get_text( ).strip() or '-' elif index_td == 3: # 行业 detail_res[ 'industry'] = td.get_text( ).strip() or '-' elif index_tr == 6: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 营业期限 detail_res[ 'business_term'] = td.get_text( ).strip() or '-' elif index_td == 3: # 纳税人资质 detail_res[ 'taxpayer_qualification'] = td.get_text( ).strip() or '-' elif index_tr == 7: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 人员规模 detail_res[ 'personnel_size'] = td.get_text( ).strip() or '-' elif index_td == 3: # 参保人数 detail_res[ 'insured_num'] = td.get_text( ).strip() or '-' elif index_tr == 9: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 注册地址 detail_res[ 'registered_address'] = td.get_text( ).strip() or '-' elif index_tr == 10: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 经营范围 detail_res[ 'business_scope'] = td.get_text( ).strip() or '-' break continue elif div.get( 'tyc-event-ch') == 'CompangyDetail.zhuyaorenyuan': # 主要人员 people_item = {} people_item['c_id'] = obj_id people_item['company_name'] = detail_res.get( 'company_name', '') # 姓名 people_item['name'] = etc.xpath( '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()' )[0] # 职位 people_item['position'] = etc.xpath( '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[3]/span/text()' )[0] bixiao_people.find_one_and_update({'c_id': obj_id}, {'$set': people_item}, upsert=True) print(people_item) for people_vals in people_item: if not people_item[people_vals]: LOG.info(f'主要人员数据匹配异常:{people_item}, 请求地址:{comp_url}') elif div.get( 'tyc-event-ch') == 'CompangyDetail.gudongxinxi': # 股东信息 capital_item = {} capital_item['c_id'] = obj_id capital_item['company_name'] = detail_res.get( 'company_name', '') # 股东名称 title = etc.xpath( 
'//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()' ) # 标签 label = etc.xpath( '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/div/span/text()' ) # 持股比例 has_rates = etc.xpath( '//*[@id="_container_holder"]/table/tbody/tr[1]/td[3]/div/div/span/text()' ) # 认缴出资额 subscribed_capital = etc.xpath( '//*[@id="_container_holder"]/table/tbody/tr[1]/td[4]/div/span/text()' ) capital_item['title'] = ''.join(title) capital_item['label'] = ''.join(label) capital_item['has_rates'] = ''.join(has_rates) capital_item['subscribed_capital'] = ''.join( subscribed_capital) bixiao_shareholder.find_one_and_update({'c_id': obj_id}, {'$set': capital_item}, upsert=True) print(capital_item, 'C' * 80) elif div.get( 'tyc-event-ch') == 'CompangyDetail.findNewsCount': # 新闻舆情 news_item = {} news_item['c_id'] = obj_id news_item['company_name'] = detail_res.get('company_name', '') # 标题 news_item['title'] = etc.xpath( '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/text()' )[0] # 内容地址 news_item['info_url'] = etc.xpath( '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/@href' )[0] # 来源 news_item['source'] = etc.xpath( '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[1]/text()' )[0] # 发布时间 news_item['date_doc'] = etc.xpath( '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[2]/text()' )[0] print(news_item) bixiao_news.update({'c_id': obj_id}, {'$set': news_item}, upsert=True) for news_vals in news_item: if not news_item[news_vals]: LOG.info(f'新闻舆情数据匹配异常:{news_item}, 请求地址:{comp_url}') elif div.get('tyc-event-ch') == 'CompangyDetail.chanpin': # 产品信息 product_item = {} product_item['c_id'] = obj_id product_item['company_name'] = detail_res.get( 'company_name', '') # 产品名称 product_item['name'] = etc.xpath( '//*[@id="_container_product"]/table/tbody/tr[1]/td[2]/table' '/tbody/tr/td[2]/span/text()')[0] # 产品简称 product_item['short_name'] = etc.xpath( '//*[@id="_container_product"]/table/tbody/tr[1]/td[3]' '/span/text()')[0] # 产品分类 product_item['type'] = etc.xpath( '//*[@id="_container_product"]/table/tbody/tr[1]/td[4]/span' '/text()')[0] # 领域 product_item['domain'] = etc.xpath( '//*[@id="_container_product"]/table/tbody/tr[1]/td[5]' '/span/text()')[0] print(product_item) bixiao_product.find_one_and_update({'c_id': obj_id}, {'$set': product_item}, upsert=True) for product_vals in product_item: if not product_item[product_vals]: LOG.info(f'产品信息数据匹配异常:{product_item}, 请求地址:{comp_url}') elif div.get('tyc-event-ch') == 'CompangyDetail.zhaopin': # 招聘信息 recruit_item = {} recruit_item['c_id'] = obj_id recruit_item['company_name'] = detail_res.get( 'company_name', '') recruit_item['opd_date'] = etc.xpath( '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[2]' '/text()')[0] recruit_item['position_'] = etc.xpath( '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[3]' '/text()')[0] recruit_item['month_salary'] = etc.xpath( '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[4]' '/text()')[0] recruit_item['education'] = etc.xpath( '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[5]' '/text()')[0] recruit_item['work_experience'] = etc.xpath( '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[6]' '/text()')[0] recruit_item['address'] = etc.xpath( '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[7]' '/text()')[0] print(recruit_item, 'P' * 80) bixiao_recruit.find_one_and_update({'c_id': obj_id}, {'$set': recruit_item}, upsert=True) for recruit_vals in recruit_item: if not recruit_item[recruit_vals]: 
LOG.info(f'招聘信息数据匹配异常:{recruit_item}, 请求地址:{comp_url}') elif div.get('tyc-event-ch' ) == 'CompangyDetail.lishiwangzhanbeian': # ICP备案 record_item = {} record_item['c_id'] = obj_id record_item['company_name'] = detail_res.get( 'company_name', '') record_item['opd_date'] = etc.xpath( '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[2]' '/span/text()')[0] record_item['web_name'] = etc.xpath( '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[3]' '/span/text()')[0] record_item['index_url'] = etc.xpath( '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[4]/div/' 'a/@href')[0] record_item['domain_name'] = etc.xpath( '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[5]' '/text()')[0] record_item['website_filing'] = etc.xpath( '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[6]/' 'span/text()')[0] print(record_item, 'M' * 80) bixiao_record_icp.find_one_and_update({'c_id': obj_id}, {'$set': record_item}, upsert=True) for record_vals in record_item: if not record_item[record_vals]: LOG.info(f'ICP备案数据匹配异常:{record_item}, 请求地址:{comp_url}') print(detail_res, '%' * 80) bixiao_business.find_one_and_update({'c_id': obj_id}, {'$set': detail_res}, upsert=True) return detail_res
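# Sketch of the MongoDB write pattern the bixiao_* collections above rely on:
# find_one_and_update with $set and upsert=True inserts the document when the
# filter matches nothing, otherwise overwrites only the listed fields. The
# connection string and collection name below are hypothetical stand-ins for the
# project's own client and bixiao_* collections. Note that Collection.update
# (used for bixiao_news above) is removed in PyMongo 4.x; update_one is the
# non-deprecated equivalent.
from datetime import datetime
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')   # assumed connection string
demo_collection = client['tyc']['bixiao_demo']       # hypothetical collection

item = {'c_id': 'example-object-id', 'company_name': 'demo', 'date_time': datetime.now()}
demo_collection.find_one_and_update({'c_id': item['c_id']},
                                    {'$set': item},
                                    upsert=True)
# Equivalent with update_one:
demo_collection.update_one({'c_id': item['c_id']}, {'$set': item}, upsert=True)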
def work_by_key(self, key):
    ret_res = list()
    if not key:
        LOG.error("【%s】key is null, no work." % RUN_MODE)
        return ret_res

    # page
    for page in range(1, self.MAX_PAGE, 1):
        url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page, parse.quote(key))
        print(url)
        print(cookies_get())
        self.headers['Cookie'] = cookies_get()
        is_ok, search_resp = api_get(url=url, headers=self.headers,
                                     data={}, resptype='text')
        if not is_ok:
            continue
        # Dump the raw list page for debugging
        with open('company_list.html', 'w', encoding='utf-8') as wf:
            wf.write(search_resp)
        soup = BeautifulSoup(search_resp, 'lxml')
        tags = soup.find_all(
            'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers,
                                                 data={}, resptype='text')
            return sub_is_ok, sub_search_resp

        # Manual verification: retry until company links show up again
        if len(tags) == 0:
            while 1:
                if is_ok and len(tags) > 0:
                    break
                else:
                    LOG.critical('验证############### %s ###############' % url)
                    random_sleep(20, 25)
                    self.headers['Cookie'] = cookies_get()
                    is_ok, search_resp = while_req(url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    tags = soup.find_all(
                        'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

        for tag in tags:
            if not tag or not tag.attrs.get('href'):
                continue
            res_dict = dict()
            res_dict['tyt_url'] = tag.get('href').strip()
            res_dict['name'] = tag.get_text().strip()
            self.save_list(
                tag.get('href').strip() + '-' + tag.get_text().strip())
            # print(res_dict['name'], res_dict['tyt_url'], str(True if res_dict else False))
            print(res_dict)
            ret_res.append(res_dict)
        random_sleep(1, 2.5)
    return ret_res
def detail_by_url(self, comp_url: str): detail_res = dict() if not comp_url: return detail_res search_resp = comp_url soup = BeautifulSoup(search_resp, 'lxml') # header: 详情页 公司名称 title_list = soup.find_all('div', class_="header") et2 = etree.HTML(search_resp) # if not title_list: # return -1 try: company_name = (title_list[0].find_all('h1', class_="name"))[0].get_text() except: name = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[1]/h1/text()') company_name = ''.join(name) detail_res['company_name'] = company_name # 电话 更多联系方式 # print(et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()'), 'OK '*80) origin_phone = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()') # 邮箱 更多邮箱 # print(et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()'), 'EMAIL '*80) origin_email = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()') if origin_phone and origin_email: year_list = [i.get('showSource') for i in eval(origin_phone[0])] phone_item_vals = [i.get('phoneNumber') for i in eval(origin_phone[0])] email_list = eval(origin_email[0]) contact_item = {} for contact in zip(year_list, phone_item_vals, email_list): contact_item['company_name'] = detail_res.get('company_name', '') contact_item['report_year'] = contact[0] contact_item['phone'] = contact[1] contact_item['email'] = contact[-1] contact_item['date_time'] = self.timestamp_to_strftime(time.time()) print(contact_item, '@'*80) reslut = email_phone_insert(contact_item) if reslut.get('status', False): print('插入成功') else: print(reslut.get('msg')) # detail: 电话 邮箱 公司官网 地址 简介 detail_div = soup.find_all('div', class_="detail") def while_req(url): sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp # 添加手动验证功能 if not detail_div: while 1: if detail_div: break else: LOG.critical('验证############### %s ###############' % comp_url) random_sleep(20, 25) self.headers['Cookie'] = cookies_get() is_ok, search_resp = while_req(comp_url) soup = BeautifulSoup(search_resp, 'lxml') detail_div = soup.find_all('div', class_="detail") for div in detail_div[0].find_all('div'): if not div: continue # f0 电话 && 邮箱 if div.get('class') == ['f0']: for big_index, big_child in enumerate(div): if big_index == 0: for index, child in enumerate(big_child.children): if index == 1: detail_res['phone'] = child.get_text().strip() or '-' break elif big_index == 1: for index, child in enumerate(big_child.children): if index == 1: detail_res['email'] = child.get_text().strip() or '-' break else: break # 公司官网 && 地址 elif div.get('class') == ['f0', 'clearfix']: for big_index, big_child in enumerate(div): if big_index == 0: for index, child in enumerate(big_child.children): if index == 1: detail_res['company_url'] = child.get_text().strip() or '-' break elif big_index == 1: for index, child in enumerate(big_child.children): if index == 1: for small_index, small_child in enumerate(child.children): if small_index == 0: detail_res['address'] = small_child.get_text().strip() or '-' break break else: break # 简介 elif div.get('class') == ['summary']: for big_index, big_child in enumerate(div): if big_index == 0: resume = big_child.string if resume: resume = resume.strip() detail_res['resume'] = resume or '-' break else: break else: continue # detail-list: detail_list_div = soup.find_all('div', class_="detail-list") if not detail_list_div: return detail_res 
etc = etree.HTML(search_resp) for div in detail_list_div[0].find_all('div'): if not div: continue # detail_res['source'] = '天眼查' # detail_res['created_time'] = self.timestamp_to_strftime(time.time()) if div.get('tyc-event-ch') == 'CompangyDetail.gongshangxinxin': # 工商信息 registration_item = dict() for index_1, child_1 in enumerate(div.find_all('div', recursive=False)): if index_1 == 1: for index_1_1, child_1_1 in enumerate(child_1): if index_1_1 == 2: for index_tr, tr in enumerate(child_1_1.find_all('tr')): if index_tr == 0: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 注册资本 detail_res['register_funds'] = td.get_text().strip() or '-' elif index_td == 3: # 实缴资金 detail_res['paidin_funds'] = td.get_text().strip() or '-' elif index_tr == 1: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 成立日期 detail_res['establish_date'] = td.get_text().strip() or '-' elif index_td == 3: # 经营状态 detail_res['status'] = td.get_text().strip() or '-' elif index_tr == 2: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 统一社会信用代码 detail_res['credit_code'] = td.get_text().strip() or '-' elif index_td == 3: # 工商注册号 detail_res['registration_number'] = td.get_text().strip() or '-' elif index_tr == 3: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 纳税人识别号 detail_res['identification_number'] = td.get_text().strip() or '-' elif index_td == 3: # 组织机构代码 detail_res['organization_code'] = td.get_text().strip() or '-' elif index_tr == 4: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 公司类型 detail_res['company_type'] = td.get_text().strip() or '-' elif index_td == 3: # 行业 detail_res['industry'] = td.get_text().strip() or '-' elif index_tr == 6: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 营业期限 detail_res['business_term'] = td.get_text().strip() or '-' elif index_td == 3: # 纳税人资质 detail_res['taxpayer_qualification'] = td.get_text().strip() or '-' elif index_tr == 7: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 人员规模 detail_res['personnel_size'] = td.get_text().strip() or '-' elif index_td == 3: # 参保人数 detail_res['insured_num'] = td.get_text().strip() or '-' elif index_tr == 9: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 注册地址 detail_res['registered_address'] = td.get_text().strip() or '-' elif index_tr == 10: for index_td, td in enumerate(tr.find_all('td')): if index_td == 1: # 经营范围 detail_res['business_scope'] = td.get_text().strip() or '-' break continue elif div.get('tyc-event-ch') == 'CompangyDetail.zhuyaorenyuan': # 主要人员 people_item = {} people_item['company_name'] = detail_res.get('company_name', '') # 姓名 name = etc.xpath('//*[@id="_container_staff"]/div/table/tbody/tr/td[2]/table/tbody/tr/td[2]/a/text()') # 职位 position = etc.xpath('//*[@id="_container_staff"]/div/table/tbody/tr/td[3]/span/text()') # 详情地址 doc_url = etc.xpath( '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[2]/table/tbody/tr/td[3]/a/@href') for people in zip(name, position, doc_url): people_item['name'] = people[0] people_item['position'] = people[1] people_item['doc_url'] = people[2] people_item['created_time'] = self.timestamp_to_strftime(time.time()) result = people_insert(people_item) if result.get('status', False): print(result) else: LOG.debug(f'') bixiao_people.find_one_and_update({'doc_url': detail_res.get('doc_url', '')}, {'$set': people_item}, upsert=True) print(people_item) elif div.get('tyc-event-ch') == 'CompangyDetail.gudongxinxi': # 股东信息 capital_item = {} 
capital_item['company_name'] = detail_res.get('company_name', '') # 股东名称 title = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[2]/table/tbody/tr/td[2]/a/text()') # 标签 label = etc.xpath( '//*[@id="_container_holder"]/table/tbody/tr/td[2]/table/tbody/tr/td[2]/div/span/text()') # 持股比例 has_rates = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[3]/div/div/span/text()') # 认缴出资额 subscribed_capital = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[4]/div/span/text()') # 详情地址 doc_url = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[2]/table/tbody/tr/td[3]/a/@href') for capital in zip(title, label, has_rates, subscribed_capital, doc_url): capital_item['title'] = ''.join(capital[0]) capital_item['label'] = ''.join(capital[1]) capital_item['has_rates'] = ''.join(capital[2]) capital_item['subscribed_capital'] = ''.join(capital[3]) capital_item['doc_url'] = capital[4] capital_item['created_time'] = self.timestamp_to_strftime(time.time()) bixiao_shareholder.find_one_and_update({'doc_url': detail_res.get('doc_url', '')}, {'$set': capital_item}, upsert=True) print(capital_item, 'C' * 80) elif div.get('tyc-event-ch') == 'CompangyDetail.findNewsCount': # 新闻舆情 news_item = {} news_item['company_name'] = detail_res.get('company_name', '') # 标题 title = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[1]/a/text()') # 内容地址 info_url = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[1]/a/@href') # 来源 source = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[3]/span[1]/text()') # 发布时间 date_doc = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[3]/span[2]/text()') for news_datas in zip(title, info_url, source, date_doc): news_item['title'] = news_datas[0] news_item['info_url'] = news_datas[1] news_item['source'] = news_datas[2] news_item['date_doc'] = news_datas[3] news_item['content'] = self.request_doing(url=news_datas[1], headers=self.headers, params={}) news_item['created_time'] = self.timestamp_to_strftime(time.time()) print(news_item) bixiao_news.update({'info_url': detail_res.get('info_url', '')}, {'$set': news_item}, upsert=True) elif div.get('tyc-event-ch') == 'CompangyDetail.chanpin': # 产品信息 product_item = {} product_item['company_name'] = detail_res.get('company_name', '') # 产品名称 name = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[2]/table' '/tbody/tr/td[2]/span/text()') # 产品简称 short_name = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[3]' '/span/text()') # 产品分类 type = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[4]/span' '/text()') # 领域 domain = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[5]' '/span/text()') # 详情地址 doc_url = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[6]/a/@href') for product in zip(name, short_name, type, domain, doc_url): product_item['name'] = product[0] product_item['short_name'] = product[1] product_item['type'] = product[2] product_item['domain'] = product[3] product_item['doc_url'] = product[4] product_item['doc_info'] = self.request_doing(url=product[4], headers=self.headers, params={}) product_item['created_time'] = self.timestamp_to_strftime(time.time()) print(product_item) bixiao_product.find_one_and_update({'doc_url': detail_res.get('doc_url', '')}, {'$set': product_item}, upsert=True) elif div.get('tyc-event-ch') == 'CompangyDetail.zhaopin': # 招聘信息 recruit_item = {} recruit_item['company_name'] = detail_res.get('company_name', '') opd_date = 
etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[2]' '/text()') position_ = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[3]' '/text()') month_salary = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[4]' '/text()') education = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[5]' '/text()') work_experience = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[6]' '/text()') address = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[7]' '/text()') opd_url = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[8]/a/@href') for recruit in zip(opd_date, position_, month_salary, education, work_experience, address, opd_url): recruit_item['opd_date'] = recruit[0] recruit_item['position_'] = recruit[1] recruit_item['month_salary'] = recruit[2] recruit_item['education'] = recruit[3] recruit_item['work_experience'] = recruit[4] recruit_item['address'] = recruit[5] recruit_item['opd_url'] = recruit[6] recruit_item['created_time'] = self.timestamp_to_strftime(time.time()) print(recruit_item, 'P' * 80) bixiao_recruit.find_one_and_update({'opd_url': detail_res.get('opd_url', '')}, {'$set': recruit_item}, upsert=True) elif div.get('tyc-event-ch') == 'CompangyDetail.lishiwangzhanbeian': # ICP备案 record_item = {} record_item['company_name'] = detail_res.get('company_name', '') # 审核日期 opd_date = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[2]' '/span/text()') # 网站名称 web_name = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[3]' '/span/text()') # 网站首页 index_url = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[4]/div/' 'a/@href') # 域名 domain_name = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[5]' '/text()') # 网站备案/许可证号 website_filing = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[6]/' 'span/text()') for record in zip(opd_date, web_name, index_url, domain_name, website_filing): record_item['opd_date'] = record[0] record_item['web_name'] = record[1] record_item['index_url'] = record[2] record_item['domain_name'] = record[3] record_item['website_filing'] = record[4] record_item['created_time'] = self.timestamp_to_strftime(time.time()) res = record_icp_insert(record_item) if res.get('status', False): print(res) else: LOG.debug(f'企业年报入库异常: {res.get("msg")}...') # print(record_item, 'M' * 80) # bixiao_record_icp.find_one_and_update({'index_url': detail_res.get('index_url', '')}, # {'$set': record_item}, upsert=True) elif div.get('tyc-event-ch') == 'CompangyDetail.rongzilishi': # 融资历程 financing_item = dict() financing_item['company_name'] = detail_res.get('company_name', '') # 披露日期 opd_date = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[2]/text()') # 交易金额 change_money = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[3]/text()') # 融资轮次 financing_round = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[4]/div[1]/text()') # 估值 valuation = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[5]/text()') # 比例 proportion = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[6]/text()') # 投资方 investor = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[7]/div/a/text()') # 新闻来源 news_source = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[8]/div/text()') for financing in zip(opd_date, change_money, financing_round, valuation, proportion, investor, news_source): financing_item['opd_date'] = financing[0] financing_item['change_money'] = financing[1] financing_item['financing_round'] = financing[2] 
financing_item['valuation'] = financing[3] financing_item['proportion'] = financing[4] financing_item['investor'] = financing[5] financing_item['news_source'] = financing[6] financing_item['created_time'] = self.timestamp_to_strftime(time.time()) print(financing_item, 'F'*80) res = financing_insert(financing_item) if res.get('status', False): print(res) else: LOG.debug(f'融资历程入库异常: {res.get("msg")}...') elif div.get('tyc-event-ch') == 'CompangyDetail.nianbao': # 企业年报 reports_item = dict() reports_item['company_name'] = detail_res.get('company_name', '') # 年报 reports = etc.xpath('//*[@id="web-content"]/div/div/div[5]/div[1]/div/div[2]/div[1]/div[15]/div[2]' '/div/table/tbody/tr/td[2]/text()') # 详情地址 operation = etc.xpath('//*[@id="web-content"]/div/div/div[5]/div[1]/div/div[2]/div[1]/div[15]/div[2]' '/div/table/tbody/tr/td[3]/a/@href') for annual in zip(reports, operation): reports_item['reports'] = annual[0] reports_item['operation'] = annual[1] reports_item['reports_info'] = self.request_doing(url=operation, headers=self.headers, params={}) reports_item['created_time'] = self.timestamp_to_strftime(time.time()) print(reports_item, '?'*80) res = reports_insert(reports_item) if res.get('status', False): print(res) else: LOG.debug(f'企业年报入库异常: {res.get("msg")}...') print(detail_res, '%' * 80) bixiao_business.find_one_and_update({'company_name': detail_res.get('company_name', '')}, {'$set': detail_res}, upsert=True)
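# Sketch of the row-extraction pattern used throughout detail_by_url above: each
# detail-page table is read as parallel column lists via XPath, and zip() regroups
# the columns into rows. Building a fresh dict per row (instead of mutating one
# shared dict) and keying the upsert filter on a field the row actually carries
# keeps rows from overwriting each other on repeated runs. The XPaths and the
# bixiao_shareholder collection come from the code above; extract_shareholders is
# a hypothetical helper name.
from lxml import etree

def extract_shareholders(html_text):
    etc = etree.HTML(html_text)
    titles = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[2]'
                       '/table/tbody/tr/td[2]/a/text()')
    rates = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[3]'
                      '/div/div/span/text()')
    urls = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[2]'
                     '/table/tbody/tr/td[3]/a/@href')
    rows = []
    for title, rate, url in zip(titles, rates, urls):
        rows.append({'title': title.strip(), 'has_rates': rate.strip(), 'doc_url': url})
    return rows

# for row in extract_shareholders(search_resp):
#     bixiao_shareholder.find_one_and_update({'doc_url': row['doc_url']},
#                                            {'$set': row}, upsert=True)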
def get_pagination(self, key, _type='default', city_id=None, sub_city_id=None, cityes=None, sub_city_info=None): min_page = self.MIN_PAGE max_page = self.MAX_PAGE if (max_page - min_page) / PAGINATION == (max_page - min_page) // PAGINATION: max_range = (max_page - min_page) // PAGINATION else: max_range = (max_page - min_page) // PAGINATION + 1 if API_MODE not in ['tyc', 'pro']: return min_page, max_page, max_page, max_range if API_MODE == 'tyc' and _type == 'city': city_info = cityes.get(city_id) url = '%s?key=%s&base=%s' % (TYC_SEARCH_API, parse.quote(key), city_info.get('name')) elif API_MODE == 'tyc' and _type == 'sub_city': city_info = cityes.get(city_id) if city_id in ZXS_CITY_IDS: url = '%s?key=%s&base=%s&areaCode=%s&baseArea=%s' \ % (TYC_SEARCH_API, parse.quote(key), city_info.get('name'), sub_city_info.get('code'), parse.quote(sub_city_info.get('name'))) else: url = '%s?key=%s&base=%s' % (TYC_SEARCH_API, parse.quote(key), sub_city_info.get('name')) elif API_MODE == 'tyc': url = '%s?key=%s' % (TYC_SEARCH_API, parse.quote(key)) elif API_MODE == 'pro' and _type == 'city': city_info = cityes.get(city_id) url = '%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, parse.quote(key), city_info.get('name')) elif API_MODE == 'pro' and _type == 'sub_city': city_info = cityes.get(city_id) if city_id in ZXS_CITY_IDS: url = '%s?key=%s&base=%s&areaCode=%s&baseArea=%s' \ % (TYC_PRO_SEARCH_API, parse.quote(key), city_info.get('name'), sub_city_info.get('code'), parse.quote(sub_city_info.get('name'))) else: url = '%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, parse.quote(key), sub_city_info.get('name')) elif API_MODE == 'pro': url = '%s?key=%s' % (TYC_PRO_SEARCH_API, parse.quote(key)) self.headers['Referer'] = url is_ok, search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') soup = BeautifulSoup(search_resp, 'lxml') search_pagination = soup.find_all('div', class_='search-pagination') # 仅一页 if is_ok and not search_pagination: return 0, 1, 1, 1 def while_req(url): sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp # 添加手动验证功能 if len(search_pagination) == 0 or not is_ok: while 1: if is_ok and len(search_pagination) > 0: break else: LOG.critical('验证############### %s ###############' % url) random_sleep(20, 25) is_ok, search_resp = while_req(url) soup = BeautifulSoup(search_resp, 'lxml') search_pagination = soup.find_all( 'div', class_='search-pagination') l = len(search_pagination[0].find_all('a')) for index_a, a in enumerate(search_pagination[0].find_all('a')): if index_a == (l - 2): max_page = a.string.strip() if max_page.find('...') > -1: max_page = max_page.split('...')[1] if isinstance(max_page, str): max_page = int(max_page) break max_pagination = max_page if MIN_PAGE: min_page = int(MIN_PAGE) if MAX_PAGE: max_page = int(MAX_PAGE) if (int(MAX_PAGE) < int(max_pagination)) \ else int(max_pagination) if min_page == max_page: max_range = 1 elif min_page > max_page: LOG.critical('Page min and max is error: min[%s] max[%s]' % (min_page, max_page)) sys.exit() else: if (max_page - min_page) / PAGINATION == (max_page - min_page) // PAGINATION: max_range = (max_page - min_page) // PAGINATION else: max_range = (max_page - min_page) // PAGINATION + 1 return min_page, max_page, max_pagination, max_range
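# The max_range arithmetic above (comparing "/" against "//") is an integer
# ceiling: the number of PAGINATION-sized chunks needed to cover the page span.
# A minimal equivalent sketch, with PAGINATION taken from the same config as the
# method above; page_chunks is a hypothetical name. Note the method additionally
# forces max_range to 1 when min_page == max_page.
import math

def page_chunks(min_page, max_page, pagination=PAGINATION):
    """Number of pagination-sized chunks needed to cover the page span."""
    return math.ceil((max_page - min_page) / pagination)

# page_chunks(0, 5, 2) -> 3;  page_chunks(0, 4, 2) -> 2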
def detail_pro_by_url(self, comp_url: str): detail_res = dict() if not comp_url: return detail_res self.headers['Referer'] = comp_url is_ok, search_resp = api_get(url=comp_url, headers=self.headers, data={}, resptype='text') if not is_ok: print('X-' * 100) print(comp_url) return detail_res soup = BeautifulSoup(search_resp, 'lxml') # detail: 电话 邮箱 公司官网 地址 简介 detail_div = soup.find_all('div', class_="ie9Style") def while_req(url): sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp # 添加手动验证功能 if not detail_div: while 1: if is_ok and detail_div: break else: LOG.critical('验证############### %s ###############' % comp_url) random_sleep(20, 25) is_ok, search_resp = while_req(comp_url) soup = BeautifulSoup(search_resp, 'lxml') detail_div = soup.find_all('div', class_="ie9Style") # 0 企业缩略图 1 基础信息 2 下载 for index, div in enumerate(detail_div[1].find_all('div', recursive=False)): if not div: continue # 电话 && 邮箱 if index == 1: for big_index, big_child in enumerate(div): if big_index == 0: for sub_index, child in enumerate(big_child.children): if sub_index == 1: detail_res['phone'] = child.get_text().strip( ) or '-' elif big_index == 1: for sub_index, child in enumerate(big_child.children): if sub_index == 1: detail_res['email'] = child.get_text().strip( ) or '-' # 公司官网 && 地址 elif index == 2: for big_index, big_child in enumerate(div): if big_index == 0: for sub_index, child in enumerate(big_child.children): if sub_index == 1: detail_res['company_url'] = child.get_text( ).strip() or '-' elif big_index == 1: for sub_index, child in enumerate(big_child.children): if sub_index == 1: detail_res['address'] = child.get_text().strip( ) or '-' break # 简介 elif index == 3: for big_index, big_child in enumerate(div): if big_index == 0: for sub_index, sub_child in enumerate(big_child): if sub_index == 1: resume = sub_child.string if resume: resume = resume.strip() detail_res['resume'] = resume or '-' break break else: continue # detail-list: 信用代码 公司类型 所属行业 营业期限 实缴资本 经营范围 detail_list_div = soup.find_all('div', class_='base0910') if not detail_list_div: return detail_res for index_tr, tr in enumerate(detail_list_div[0].find_all('tr')): if not tr: continue if index_tr == 1: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 1: # 信用代码 detail_res['credit_code'] = td.get_text().strip( ) or '-' elif index_td == 3: # 公司类型 detail_res['company_type'] = td.get_text().strip( ) or '-' elif index_tr == 2: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 3: # 所属行业 detail_res['industry'] = td.get_text().strip() or '-' elif index_tr == 3: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 1: # 营业期限 detail_res['business_term'] = td.get_text().strip( ) or '-' elif index_tr == 4: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 3: # 实缴资本 detail_res['paidin_funds'] = td.get_text().strip( ) or '-' elif index_tr == 8: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 1: # 经营范围 detail_res['business_scope'] = td.get_text().strip( ) or '-' # detail-list: 注册资金 注册日期 经营状态 detail_list_div_1 = soup.find_all('div', class_='baseInfo_model2017') if not detail_list_div: return detail_res for index_table, table in enumerate( detail_list_div_1[0].find_all('table')): if not table: continue if index_table == 1: for index_tr, tr in enumerate(table.find_all('tr')): if index_tr == 1: for index_td, td in enumerate( 
tr.find_all('td', recursive=False)): if index_td == 1: for index_td_span, td_span in enumerate( td.find_all('span')): if index_td_span == 1: detail_res[ 'register_funds'] = td_span.get_text( ).strip() or '-' elif index_tr == 2: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 0: for index_td_span, td_span in enumerate( td.find_all('span')): if index_td_span == 1: detail_res[ 'establish_date'] = td_span.get_text( ).strip() or '-' elif index_tr == 3: for index_td, td in enumerate( tr.find_all('td', recursive=False)): if index_td == 0: for index_td_div, td_div in enumerate( td.find_all('div', recursive=False)): if index_td_div == 0: for index_td_div_span, td_div_span in enumerate( td_div.find_all( 'span', recursive=False)): if index_td_div_span == 1: detail_res[ 'status'] = td_div_span.get_text( ).strip() or '-' return detail_res
def detail_by_url(self, comp_url: str): detail_res = dict() if not comp_url: return detail_res self.headers['Referer'] = comp_url is_ok, search_resp = api_get(url=comp_url, headers=self.headers, data={}, resptype='text') if not is_ok: return detail_res soup = BeautifulSoup(search_resp, 'lxml') # detail: 电话 邮箱 公司官网 地址 简介 detail_div = soup.find_all('div', class_="detail") def while_req(url): sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp # 添加手动验证功能 if not detail_div: while 1: if is_ok and detail_div: break else: LOG.critical('验证############### %s ###############' % comp_url) random_sleep(20, 25) is_ok, search_resp = while_req(comp_url) soup = BeautifulSoup(search_resp, 'lxml') detail_div = soup.find_all('div', class_="detail") for div in detail_div[0].find_all('div'): if not div: continue # f0 电话 && 邮箱 if div.get('class') == ['f0']: for big_index, big_child in enumerate(div): if big_index == 0: for index, child in enumerate(big_child.children): if index == 1: detail_res['phone'] = child.get_text().strip( ) or '-' break elif big_index == 1: for index, child in enumerate(big_child.children): if index == 1: detail_res['email'] = child.get_text().strip( ) or '-' break else: break # 公司官网 && 地址 elif div.get('class') == ['f0', 'clearfix']: for big_index, big_child in enumerate(div): if big_index == 0: for index, child in enumerate(big_child.children): if index == 1: detail_res['company_url'] = child.get_text( ).strip() or '-' break elif big_index == 1: for index, child in enumerate(big_child.children): if index == 1: for small_index, small_child in enumerate( child.children): if small_index == 0: detail_res[ 'address'] = small_child.get_text( ).strip() or '-' break break else: break # 简介 elif div.get('class') == ['summary']: for big_index, big_child in enumerate(div): if big_index == 0: resume = big_child.string if resume: resume = resume.strip() detail_res['resume'] = resume or '-' break else: break else: continue # detail-list: detail_list_div = soup.find_all('div', class_="detail-list") if not detail_list_div: return detail_res for div in detail_list_div[0].find_all('div'): if not div: continue if div.get('tyc-event-ch') == 'CompangyDetail.gongshangxinxin': for index_1, child_1 in enumerate( div.find_all('div', recursive=False)): if index_1 == 1: for index_1_1, child_1_1 in enumerate(child_1): if index_1_1 == 2: for index_tr, tr in enumerate( child_1_1.find_all('tr')): if index_tr == 0: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 注册资本 detail_res[ 'register_funds'] = td.get_text( ).strip() or '-' elif index_td == 3: # 实缴资金 detail_res[ 'paidin_funds'] = td.get_text( ).strip() or '-' elif index_tr == 1: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 成立日期 detail_res[ 'establish_date'] = td.get_text( ).strip() or '-' elif index_td == 3: # 经营状态 detail_res[ 'status'] = td.get_text( ).strip() or '-' elif index_tr == 2: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 信用代码 detail_res[ 'credit_code'] = td.get_text( ).strip() or '-' elif index_tr == 4: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 公司类型 detail_res[ 'company_type'] = td.get_text( ).strip() or '-' elif index_td == 3: # 行业 detail_res[ 'industry'] = td.get_text( ).strip() or '-' elif index_tr == 6: for index_td, td in enumerate( tr.find_all('td')): if index_td == 1: # 营业期限 detail_res[ 'business_term'] = td.get_text( ).strip() or '-' elif index_tr == 10: for index_td, 
td in enumerate( tr.find_all('td')): if index_td == 1: # 经营范围 detail_res[ 'business_scope'] = td.get_text( ).strip() or '-' break break return detail_res
def work_by_key(self, key, min_page, max_page, type='default', queue=None, cid=None, sub_cid=None, city_info=None, sub_city_info=None): ret_res = list() if not key: LOG.error("【%s】key is null, no work." % RUN_MODE) return ret_res # page for page in range(min_page, max_page + 1, 1): if API_MODE == 'tyc' and type == 'default': url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page, parse.quote(key)) elif API_MODE == 'tyc' and type == 'city': url = '%s/p%s?key=%s&base=%s' % (TYC_SEARCH_API, page, parse.quote(key), city_info.get('name')) elif API_MODE == 'tyc' and type == 'sub_city': if cid in ZXS_CITY_IDS: url = '%s/p%s?key=%s&base=%s&areaCode=%s' % ( TYC_SEARCH_API, page, parse.quote(key), sub_city_info.get('name'), sub_city_info.get('code')) else: url = '%s/p%s?key=%s&base=%s' % (TYC_SEARCH_API, page, parse.quote(key), sub_city_info.get('name')) elif API_MODE == 'pro' and type == 'default': url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, page, parse.quote(key)) elif API_MODE == 'pro' and type == 'city': url = '%s/p%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, page, parse.quote(key), city_info.get('name')) elif API_MODE == 'pro' and type == 'sub_city': if cid in ZXS_CITY_IDS: url = '%s/p%s?key=%s&base=%s&areaCode=%s&baseArea=%s' \ % (TYC_PRO_SEARCH_API, page, parse.quote(key), city_info.get('name'), sub_city_info.get('code'), parse.quote(sub_city_info.get('name'))) else: url = '%s/p%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, page, parse.quote(key), sub_city_info.get('name')) else: LOG.critical('====== API_MODE is not in [tyc, pro] ======') sys.exit(1) LOG.info('%s[%s]%s' % (key, API_MODE, url)) self.headers['Referer'] = url is_ok, search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') if not is_ok: continue if self.check_no(url, _type='page'): continue soup = BeautifulSoup(search_resp, 'lxml') tags = soup.find_all( 'a', attrs={"tyc-event-ch": "CompanySearch.Company"}) def while_req(url): sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text') return sub_is_ok, sub_search_resp # 添加手动验证功能 if len(tags) == 0: while 1: if is_ok and len(tags) > 0: break else: LOG.critical('验证############### %s ###############' % url) random_sleep(20, 25) is_ok, search_resp = while_req(url) soup = BeautifulSoup(search_resp, 'lxml') tags = soup.find_all( 'a', attrs={"tyc-event-ch": "CompanySearch.Company"}) for tag in tags: if not tag or not tag.attrs.get('href'): continue res_dict = dict() if API_MODE == 'tyc': tyc_url = tag.get('href').strip() elif API_MODE == 'pro': tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API, tag.get('href').strip()) else: tyc_url = '' res_dict['tyc_url'] = tyc_url res_dict['name'] = tag.get_text().strip() res_dict['key'] = key res_dict['is_send_email'] = False res_dict['city'] = city_info.get( 'full_name') if city_info else '-' res_dict['sub_city'] = sub_city_info.get( 'full_name') if sub_city_info else '-' detail_res = list() if API_MODE == 'tyc': detail_res = self.detail_by_url(res_dict.get('tyc_url')) elif API_MODE == 'pro': detail_res = self.detail_pro_by_url( res_dict.get('tyc_url')) res_dict.update(detail_res) print('%s[%s] %s' % (res_dict['name'], str(True if res_dict else False), res_dict['tyc_url'])) ret_res.append(res_dict) if queue: queue.put(res_dict) random_sleep(3.5, 4.5) if IS_TEST_BREAK: break if IS_TEST_BREAK: break return ret_res
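# The API_MODE / type branching at the top of work_by_key above is, at heart, one
# URL-building step. A simplified sketch under the same assumptions
# (TYC_SEARCH_API, TYC_PRO_SEARCH_API from the existing config); build_search_url
# is a hypothetical name, and the municipality (ZXS_CITY_IDS) handling, which
# differs between the tyc and pro modes above, is intentionally omitted here.
from urllib import parse

def build_search_url(page, key, mode='tyc', _type='default',
                     city_info=None, sub_city_info=None):
    base = TYC_SEARCH_API if mode == 'tyc' else TYC_PRO_SEARCH_API
    url = '%s/p%s?key=%s' % (base, page, parse.quote(key))
    if _type == 'city' and city_info:
        url += '&base=%s' % city_info.get('name')
    elif _type == 'sub_city' and sub_city_info:
        url += '&base=%s' % sub_city_info.get('name')
    return url

# build_search_url(1, '软件', mode='pro', _type='city', city_info={'name': '北京'})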
------------------------------------------------
"""

import sys

from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from deploy.config import DB_LINK
from deploy.utils.logger import logger as LOG


DBSession = None

if not DB_LINK:
    LOG.critical('DB configuration is unavailable')
    sys.exit(1)
db_link = DB_LINK

ModelBase = declarative_base()


def init_database_engine():
    return create_engine(db_link, echo=False, pool_recycle=800, pool_size=100)


def get_session():
    global DBSession
    if not DBSession:
        dbengine_databus = init_database_engine()
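        # Assumed completion (standard SQLAlchemy lazy-initialization pattern; the
        # original text is cut off at this point): build the session factory once,
        # then hand out a new session per call.
        DBSession = sessionmaker(bind=dbengine_databus)
    return DBSession()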
def work_by_key(self, key, min_page=0, max_page=5, queue=None):
    ret_res = list()
    if not key:
        LOG.error("【%s】key is null, no work." % RUN_MODE)
        return ret_res
    if not min_page:
        min_page = self.MIN_PAGE
    if not max_page:
        max_page = self.MAX_PAGE
    LOG.info('%s[%s ~ %s]' % (key, min_page, max_page))

    # page
    for page in range(min_page, max_page, 1):
        if API_MODE == 'tyc':
            url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page, parse.quote(key))
        elif API_MODE == 'pro':
            url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, page, parse.quote(key))
        else:
            LOG.critical('====== API_MODE is not in [tyc, pro] ======')
            sys.exit(1)
        LOG.info('%s[%s]%s' % (key, API_MODE, url))
        is_ok, search_resp = api_get(url=url, headers=self.headers,
                                     data={}, resptype='text')
        if not is_ok:
            continue
        soup = BeautifulSoup(search_resp, 'lxml')
        tags = soup.find_all(
            'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers,
                                                 data={}, resptype='text')
            return sub_is_ok, sub_search_resp

        # Manual verification: retry until company links show up again
        if len(tags) == 0:
            while 1:
                if is_ok and len(tags) > 0:
                    break
                else:
                    LOG.critical('验证############### %s ###############' % url)
                    random_sleep(20, 25)
                    is_ok, search_resp = while_req(url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    tags = soup.find_all(
                        'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

        for tag in tags:
            if not tag or not tag.attrs.get('href'):
                continue
            res_dict = dict()
            if API_MODE == 'tyc':
                tyc_url = tag.get('href').strip()
            elif API_MODE == 'pro':
                tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API,
                                               tag.get('href').strip())
            else:
                tyc_url = ''
            res_dict['tyc_url'] = tyc_url
            res_dict['name'] = tag.get_text().strip()
            res_dict['key'] = key

            detail_res = list()
            if API_MODE == 'tyc':
                detail_res = self.detail_by_url(res_dict.get('tyc_url'))
            elif API_MODE == 'pro':
                detail_res = self.detail_pro_by_url(res_dict.get('tyc_url'))
            res_dict.update(detail_res)
            print('%s[%s] %s' % (res_dict['name'],
                                 str(True if res_dict else False),
                                 res_dict['tyc_url']))
            ret_res.append(res_dict)
            if queue:
                queue.put(res_dict)
            random_sleep(3.2, 4.5)
            if IS_TEST_BREAK:
                break
        if IS_TEST_BREAK:
            break
    return ret_res
def _die(self, message: str = None):
    if message:
        LOG.critical(message)
    os._exit(0)