def total_pages(industry_url):
    """Return the total number of result pages for an industry listing URL."""
    resp = requests_get(industry_url)
    tree = etree.HTML(resp.content)
    # The page counter is the <i class="total_page"> element inside the
    # "num_operate" span; take the first (only) text node.
    count_text = tree.xpath(
        "//span[@class='num_operate']/i[@class='total_page']/text()")[0]
    return int(count_text)
def industry_by_area(industry_url):
    """Scrape the per-district filter links from a listing page into industry_db."""
    resp = requests_get(industry_url)
    tree = etree.HTML(resp.content)
    names = tree.xpath(
        '//div[@class="filter_item filter_area"]/ul[@class="filter_items clearfix"]/li/a/text()'
    )
    links = tree.xpath(
        '//div[@class="filter_item filter_area"]/ul[@class="filter_items clearfix"]/li/a/@href'
    )
    # Skip the first entry of each list (the catch-all filter link).
    frame = pd.DataFrame(list(zip(names[1:], links[1:])),
                         columns=['行政区', '行业网址'])
    industry_db.insert_many(frame.to_dict('records'))
def crawl_industry(city_url):
    """Collect the industry filter names/links of one city page into industry_db."""
    resp = requests_get(city_url)
    tree = etree.HTML(resp.content)
    names = tree.xpath("//ul[@class='indcatelist']/li/a/text()")
    # hrefs are relative; prefix them with the city URL minus its last
    # 6 characters (the original's fixed-width suffix convention).
    links = [
        city_url[:-6] + href
        for href in tree.xpath("//ul[@class='indcatelist']/li/a/@href")
    ]
    frame = pd.DataFrame(list(zip(names, links)), columns=['行业', '行业网址'])
    # Drop the "不限" (no-limit / all industries) pseudo-entry.
    frame = frame.query('行业 != "不限"').reset_index(drop=True)
    frame['city_url'] = city_url
    industry_db.insert_many(frame.to_dict('records'))
def crawl_jobs(page_url):
    """Scrape one job-listing page, store the job-detail URLs in ``jobs_urls``,
    and return the total number of pages for this listing.

    Bug fix: ``html.xpath(...)`` returns a *list* of text nodes, so the
    original ``int(html.xpath(...))`` always raised ``TypeError``.  We now
    take the first match, falling back to 1 when the marker is absent
    (single-page listings).
    """
    # page_url = 'https://sh.58.com/job/pn1/pve_5363_253_pve_5358_0/'
    response = requests_get(page_url)
    html = etree.HTML(response.content)
    total_page_nodes = html.xpath('//span[@class="total_page"]/text()')
    total_page = int(total_page_nodes[0]) if total_page_nodes else 1
    jobs_url = html.xpath("//div[@class='job_name clearfix']/a/@href")
    # Drop ad-redirect links and strip query strings from the real ones.
    jobs_url = [
        job_url.split('?')[0] for job_url in jobs_url
        if not job_url.startswith('https://legoclick')
    ]
    jobs_df = pd.DataFrame(jobs_url, columns=["job_url"])
    jobs_df['page_url'] = page_url
    jobs_urls.insert_many(jobs_df.to_dict('records'))
    print(page_url, '完成爬取...')
    return total_page
def parse_detail(job_url):
    """Fetch one job-detail page and return its parsed fields as a dict.

    The returned dict always contains ``job_url``, the HTTP ``status_code``
    and the ``scrape_date``; the detail fields are added only when the page
    was fetched with status 200.

    NOTE(review): ``findall`` and ``xpath`` here are project helpers (note
    the argument order ``xpath(path, html, ...)``), not lxml's own — confirm
    their exact return semantics (single value vs. list) against their
    definitions elsewhere in the project.
    """
    response = requests_get(job_url, headers=HEADERS)
    code = response.status_code
    # Base record, built even when the request failed, so failed URLs are
    # still recorded with their status code.
    job_detail_dict = {
        "job_url": job_url,
        "status_code": code,
        "scrape_date": datetime.today().strftime('%Y/%m/%d')
    }
    if code == 200:
        doc = response.text
        html = etree.HTML(response.text)
        # Timestamps and coordinates are embedded in inline JS/JSON, so they
        # are pulled out with regexes over the raw document, not XPath.
        pub_date = findall('"pubDate":(.*?)"upDate"', doc)
        lontitude = findall('"lon":"(.*?)"', doc)
        latitude = findall('"lat":"(.*?)"}', doc)
        update_date = findall('"upDate":(.*?)}', doc)
        # Position (job posting) fields.
        pos_title = xpath('//span[@class="pos_title"]/text()', html)
        pos_name = xpath('//span[@class="pos_name"]/text()', html)
        pos_salary = xpath('//span[@class="pos_salary"]/text()', html)
        # Multi-valued fields are joined into single strings ('、' / '-').
        pos_welfare = '、'.join(
            xpath('//span[@class="pos_welfare_item"]/text()', html,
                  first=False))
        pos_condition = '、'.join(
            xpath('//div[@class="pos_base_condition"]/span/text()', html,
                  first=False))
        pos_area = '-'.join(
            xpath('//span[@class="pos_area_item"]/text()', html, first=False))
        pos_address = xpath('//div[@class="pos-area"]/span[2]/text()', html)
        pos_description = xpath('//div[@class="des"]', html, first=False,
                                child=True)
        # Company fields.
        company_name = xpath('//div[@class="baseInfo_link"]/a/text()', html)
        company_url = xpath('//div[@class="baseInfo_link"]/a/@href', html)
        title_sign = xpath('//span[@class="baseInfo_sign"]/i/@title', html)
        company_industry = xpath('//p[@class="comp_baseInfo_belong"]/a/text()',
                                 html)
        company_scale = xpath('//p[@class="comp_baseInfo_scale"]/text()', html)
        job_offers = xpath('//a[@class="look_pos"]/@href', html)
        job_detail_dict.update({
            "pub_date": pub_date,
            "update_date": update_date,
            # "lontitude" misspelling kept: the key is part of the stored
            # data schema and downstream readers may depend on it.
            "lontitude": lontitude,
            "latitude": latitude,
            "pos_title": pos_title,
            "pos_name": pos_name,
            "pos_salary": pos_salary,
            "pos_welfare": pos_welfare,
            "pos_condition": pos_condition,
            "pos_area": pos_area,
            "pos_address": pos_address,
            "pos_description": pos_description,
            "company_name": company_name,
            "company_url": company_url,
            "title_sign": title_sign,
            "company_industry": company_industry,
            "company_scale": company_scale,
            "job_offers": job_offers,
        })
    return job_detail_dict