# Assumes these module-level imports elsewhere in the file:
# requests, re, time, random, lxml.html, pytesseract, PIL.Image,
# plus the local helpers proxypool and useragentutil.
def catch_offer_lists(self, html_content):
    """Extract the offer details (company, title, contact, phone) from one offer page."""
    offer_parser = lxml.html.etree.HTML(html_content)  # build the DOM tree
    offer_item = []

    # Company name
    cp_name = offer_parser.xpath(
        "//div[@id='contact']//div[@class='about_info iconboxAll']/h3/text()"
    )[0]
    offer_item.append(cp_name.strip())

    # Offer title
    s_name = offer_parser.xpath("//div[@class='black f16']/h1/text()")[0]
    offer_item.append(s_name.strip())

    # Contact name; a purely numeric value means no name was provided
    name = offer_parser.xpath(
        "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/div/text()"
    )[1]
    name = name.strip().split("(")[0]
    if name.isdigit():
        offer_item.append("未备注姓名")  # "no contact name provided"
    else:
        offer_item.append(name)

    # The phone number is rendered as an image; pick the matching <img> src
    p = offer_parser.xpath(
        "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/p")
    img_srcs = offer_parser.xpath(
        "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/p/img/@src")
    if len(p) == 2:
        phone = img_srcs[1]
    elif len(p) == 1:
        phone = img_srcs[0]
    else:
        phone = ('http://www.cnxhyp.com/api/image.png.php'
                 '?auth=8e59p7CdwkARbg9ckaSIZnCmcsmvFULBV1JKM3BJKqD8xqbcDfWwafFQ5w')

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.90 Safari/537.36',
        'Referer': 'http://www.cnxhyp.com/sell/show-2702.html'
    }
    # Download the phone-number image and OCR it
    req = requests.get(phone, headers=headers, proxies=proxypool.get_proxy())
    with open("./tupian.png", 'wb') as f:
        f.write(req.content)
    image = Image.open('./tupian.png')
    content = pytesseract.image_to_string(image)  # OCR the image
    offer_item.append(content)

    print(offer_item)
    self.offer_datas.append(offer_item)
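# A minimal optional sketch (not part of the original flow): the same OCR step
# could be done in memory with io.BytesIO instead of writing ./tupian.png to
# disk. The method name _ocr_phone_image and its parameters are hypothetical;
# it only assumes the requests, PIL.Image, pytesseract and proxypool imports
# listed above.
def _ocr_phone_image(self, img_url, headers):
    """Download a phone-number image and return its OCR'd text (sketch)."""
    import io  # local import to keep the sketch self-contained
    resp = requests.get(img_url, headers=headers, proxies=proxypool.get_proxy())
    image = Image.open(io.BytesIO(resp.content))  # decode the image bytes in memory
    return pytesseract.image_to_string(image)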
def parse_offer_url(self, temp_url):
    """Fetch the full HTML content of one offer page."""
    offer_response = requests.get(temp_url,
                                  headers=useragentutil.get_headers(),
                                  proxies=proxypool.get_proxy())
    offer_html_content = offer_response.content.decode("utf-8")
    # Throttle: wait a random number of seconds before the next request
    wait_time = random.randint(0, 5)
    print("Rate limiting: resuming the crawl in %ds ..." % wait_time)
    time.sleep(wait_time)
    return offer_html_content
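# A minimal usage sketch, assuming both methods above belong to the same
# spider object: fetch one offer page and feed the HTML into
# catch_offer_lists. The method name crawl_single_offer is hypothetical.
def crawl_single_offer(self, temp_url):
    """Fetch one offer URL and parse it in a single step (sketch)."""
    html_content = self.parse_offer_url(temp_url)   # download + throttle
    self.catch_offer_lists(html_content)            # parse and store the row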
def catch_work_info(self, temp_url):
    """Extract the job-description text from a detail page."""
    try:
        work_response = requests.get(temp_url,
                                     headers=useragentutil.get_headers(),
                                     proxies=proxypool.get_proxy())
        work_html_content = work_response.content.decode("gbk")
        work_parser = lxml.html.etree.HTML(work_html_content)
        # Join all text nodes and strip whitespace to clean the data
        work_infos = "".join(
            work_parser.xpath("//div[@class='bmsg job_msg inbox']//text()")
        ).strip().replace(" ", "")
        # print("Job description:", work_infos)
    except Exception:
        work_infos = "暂无数据"  # "no data available"
    return work_infos
def get_offer_pages(self):
    """Dynamically read the total number of result pages; returns an int."""
    offer_page_response = requests.get(self.offer_index_url,
                                       headers=useragentutil.get_headers(),
                                       proxies=proxypool.get_proxy())
    # Fetch and decode the page source
    page_html_content = offer_page_response.content.decode("gbk")
    # Parse the pagination widget and read the "共N页" ("N pages in total") label
    page_parser = lxml.html.etree.HTML(page_html_content)
    pages_content = page_parser.xpath(
        "//div[@class='dw_page']//span[@class='td']/text()")[0]
    pages = int(re.search(r"共(\d+)页", pages_content).group(1))
    return pages
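# A minimal sketch of turning the page count into one URL per listing page.
# The "?page=N" query pattern is an assumption for illustration only; the real
# pagination scheme of the target site is not shown in the original code.
def build_page_urls(self):
    """Return a hypothetical list of per-page listing URLs (sketch)."""
    pages = self.get_offer_pages()
    return ["%s?page=%d" % (self.offer_index_url, page)
            for page in range(1, pages + 1)]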
def catch_company_info(self, temp_url):
    """Extract the company-profile text from a detail page."""
    try:
        company_response = requests.get(temp_url,
                                        headers=useragentutil.get_headers(),
                                        proxies=proxypool.get_proxy())
        company_html_content = company_response.content.decode("gbk")
        # Join all text nodes and strip whitespace
        company_parser = lxml.html.etree.HTML(company_html_content)
        company_infos = "".join(
            company_parser.xpath("//div[@class='con_txt']//text()")
        ).strip().replace(" ", "")
    except Exception:
        company_infos = "暂无数据"  # "no data available"
    return company_infos
# Note: if this definition lives in the same class as the earlier
# catch_work_info, it shadows that method and probably deserves its own name.
def catch_work_info(self, temp_url):
    """Collect the detail-page links from a job-listing page."""
    urls = []
    try:
        work_response = requests.get(temp_url,
                                     headers=useragentutil.get_headers(),
                                     proxies=proxypool.get_proxy())
        work_html_content = work_response.content.decode("utf-8")
        work_parser = lxml.html.etree.HTML(work_html_content)
        work_infos = work_parser.xpath(
            "//div[@class='tabs_box pllist active']//ul[@class='clearfix']/li")
        for li in work_infos:
            # extend, not append: xpath() returns a list of href strings
            urls.extend(li.xpath("./a/@href"))
    except Exception:
        pass  # on failure, return an empty list instead of an undefined name
    return urls
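# A minimal end-to-end sketch combining the pieces above: pull the detail-page
# links from one listing page, then scrape each link. The method name
# crawl_listing_page is hypothetical; it assumes the link-collecting
# catch_work_info defined directly above.
def crawl_listing_page(self, list_url):
    """Scrape every offer linked from a single listing page (sketch)."""
    for detail_url in self.catch_work_info(list_url):    # collected hrefs
        html_content = self.parse_offer_url(detail_url)  # download + throttle
        self.catch_offer_lists(html_content)             # parse and store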