# -*- coding: utf-8 -*-
import copy
import random
import time

import gevent
from lxml import etree

import dbmysql
import request  # project-local wrapper over requests, assumed to return None on failure
# print_log (used in update_data) is a project-local helper whose import is not shown


def get_position_detail(self, url, position, cookies=None):
    response = request.get(url, cookies=cookies)
    self.request_count += 1
    if response:
        html = etree.HTML(response.content)
        self.logger.info(
            html.xpath('//title/text()')[0]
            if html.xpath('//title/text()') else 'title error')
        # education and work_year are already filled in from the list page
        job_nature = html.xpath(
            "//dd[@class='job_request']/p[1]/span[5]/text()")
        job_nature = job_nature[0].strip() if job_nature else ''
        job_detail = html.xpath("//dd[@class='job_bt']/div//text()")
        job_detail = [item.strip() for item in job_detail if item.strip()]
        job_detail = '\n'.join(job_detail).strip()
        job_address = html.xpath("//div[@class='work_addr']//text()")
        job_address = [item.strip() for item in job_address]
        # the last two text nodes are the "查看地图" map-link labels
        job_address = ''.join(job_address[:-2])
        district = html.xpath("//div[@class='work_addr']/a[2]/text()")
        district = district[0].strip() if district else ''
        position['job_nature'] = job_nature
        position['job_detail'] = job_detail
        position['job_address'] = job_address
        position['district'] = district
    else:
        self.except_count += 1
    self.save_infos(position)
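# ---------------------------------------------------------------------------
# Sketch, not in the original source: the project-local `request` helper
# imported above. Judging by the calls in this file it wraps `requests`,
# retries failed requests, and returns None on failure (the falsy checks and
# except_count bookkeeping rely on that). Retry budget, timeout and back-off
# below are assumptions.
# ---------------------------------------------------------------------------
# request.py
import random
import time

import requests


def get(url, cookies=None, **kwargs):
    """GET with retries; return the Response on HTTP 200, else None."""
    for _ in range(3):  # assumed retry budget
        try:
            resp = requests.get(url, cookies=cookies, timeout=10, **kwargs)
            if resp.status_code == 200:
                return resp  # callers use .content, .cookies and .url
        except requests.RequestException:
            pass
        time.sleep(random.randint(1, 3))  # polite back-off between attempts
    return None


def post(url, data=None, headers=None, cookies=None, **kwargs):
    """POST counterpart with the same retry/None-on-failure contract."""
    for _ in range(3):
        try:
            resp = requests.post(url, data=data, headers=headers,
                                 cookies=cookies, timeout=10, **kwargs)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        time.sleep(random.randint(1, 3))
    return None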
def get_positons_list(self, url, item, cookies):
    self.request_count += 1
    response = request.get(url, cookies=cookies)
    if response:
        cookies = response.cookies
    else:
        self.except_count += 1
    self.get_new_list(response, copy.deepcopy(item), cookies)
def get_positions_urls(self, list_url, item, cookies=None):
    self.logger.debug(type(cookies))
    response = request.get(list_url, cookies=cookies)
    self.request_count += 1
    if response:
        cookies = response.cookies
        html = etree.HTML(response.content)
        self.logger.info(
            html.xpath('//title/text()')[0]
            if html.xpath('//title/text()') else 'title error')
        item_list = html.xpath("//ul[@class='item_con_list']/li")
        for position in item_list:
            publish_date = position.xpath(
                ".//span[@class='format-time']/text()")[0]
            publish_date = self.switch_publish_date(publish_date)
            url = position.xpath(".//a[@class='position_link']/@href")[0]
            # skip urls that were already queued or stored
            if url not in self.urls and not self.lagou_db.isexist_url(url):
                self.urls.append(url)
                position_name = position.xpath("@data-positionname")[0]
                salary = position.xpath("@data-salary")[0]
                # e.g. "经验3-5年 / 本科": work_year before the slash
                # (minus the two-char "经验" prefix), education after it
                other = position.xpath(
                    ".//div[@class='li_b_l']/text()")[2].strip()
                add = position.xpath(".//span[@class='add']/em/text()")[0]
                city = add.split('·')[0]
                company_name = position.xpath("@data-company")[0]
                item['position_name'] = position_name.strip()
                item['publish_date'] = publish_date
                item['salary'] = salary.strip()
                item['education'] = other.split('/')[1].strip()
                item['work_year'] = other.split('/')[0][2:].strip()
                item['city'] = city.strip()
                item['company_name'] = company_name.strip()
                item['url'] = url.strip()
                item['job_nature'] = ''
                item['job_detail'] = ''
                item['job_address'] = ''
                item['district'] = ''
                g = gevent.spawn(self.get_position_detail, url,
                                 copy.deepcopy(item), cookies=cookies)
                self.pool.add(g)
            else:
                self.logger.debug('url %s already exists!' % url)
    else:
        self.except_count += 1
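# Sketch, not in the original source: switch_publish_date, used above to
# normalise the list page's publish-time strings. The exact input formats
# ("N天前发布", "HH:MM发布" for same-day posts, or a plain date) are
# assumptions about Lagou's markup.
import datetime


def switch_publish_date(self, publish_date):
    today = datetime.date.today()
    if '天前' in publish_date:  # e.g. "3天前发布" -> three days ago
        days = int(publish_date.split('天前')[0])
        return str(today - datetime.timedelta(days=days))
    if ':' in publish_date:  # e.g. "14:30发布" -> posted today
        return str(today)
    return publish_date.strip()  # already an absolute date string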
def update_data():
    db = dbmysql.DB()
    sql = "select * from positions where publish_date > '2018-04-06'"
    positions = db.fetchall(sql)
    for position in positions:
        try:
            city, district = position['job_address'].split('-')[0:2]
        except Exception:
            city = ''
            district = ''
        # re-crawl only rows whose stored address disagrees with city/district
        if city == position['city'] and district == position['district']:
            continue
        response = request.get(position['url'])
        if not response:
            continue
        html = etree.HTML(response.content)
        job_detail = html.xpath("//dd[@class='job_bt']/div//text()")
        job_detail = [item.strip() for item in job_detail if item.strip()]
        job_detail = '\n'.join(job_detail).strip()
        job_address = html.xpath("//div[@class='work_addr']//text()")
        job_address = [item.strip() for item in job_address]
        job_address = ''.join(job_address[:-2])
        position = dict(position)
        position['publish_date'] = str(position['publish_date'])
        print_log(position['url'], position)
        sql = ('update positions set job_address=:job_address,'
               'job_detail=:job_detail where url=:url')
        db.edit(
            sql, {
                'job_detail': job_detail,
                'job_address': job_address,
                'url': position['url']
            })
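# Sketch, not in the original source: the dbmysql.DB interface assumed by
# update_data. The :name placeholders suggest an SQLAlchemy-style backend;
# the `records` library is one fit (its rows support position['col'] and
# dict(position) as used above). Connection string and class body are
# assumptions.
import records


class DB:
    def __init__(self, db_url='mysql+pymysql://user:pass@localhost/lagou'):
        self.db = records.Database(db_url)  # assumed connection string

    def fetchall(self, sql, **params):
        # rows behave like dicts, matching position['job_address'] above
        return self.db.query(sql, **params).all()

    def edit(self, sql, params):
        # execute an UPDATE/INSERT with a dict of named parameters
        self.db.query(sql, **params)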
def get_new_list(self, response, item, cookies):
    if response:
        new_url = self.second_url % (item['first_type'])
        response = request.get(url=new_url, cookies=cookies)
        self.request_count += 1
        if response:
            referer = response.url
            html = etree.HTML(response.content)
            page_num = html.xpath("//span[@class='span totalNum']/text()")
            page_num = int(page_num[0]) if page_num else 1
            for num in range(1, page_num + 1):
                # get_positions_list POSTs the json api for each page;
                # see the sketch after this function
                g = gevent.spawn(self.get_positions_list, num,
                                 copy.deepcopy(item), referer, cookies)
                self.pool.add(g)
        else:
            self.except_count += 1
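# Sketch, not in the original source: get_positions_list as spawned above,
# reconstructed from the commented-out POST logic this function used to
# carry inline. post_url, the form fields and the five-attempt retry follow
# that dead code; note the get_positions_urls it hands the json result to is
# that older json-consuming variant, not the HTML-parsing one defined above.
def get_positions_list(self, num, item, referer, cookies):
    form_data = {
        'first': 'false',
        'pn': str(num),            # page number
        'kd': item['first_type'],  # search keyword: the position name
    }
    headers = {'Referer': referer}
    for _ in range(5):  # re-request on failure, as the old code did
        time.sleep(random.randint(1, 3))
        response = request.post(url=self.post_url, data=form_data,
                                headers=headers, cookies=cookies)
        self.request_count += 1
        try:
            result = response.json()
        except Exception as e:
            self.logger.error(e)
            continue
        if result.get('success'):
            self.get_positions_urls(result, item, cookies=response.cookies)
            return
        self.logger.error('%s %s %s' % (self.post_url, form_data,
                                        response.text))
    self.except_count += 1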
def get_position_detail(self, url, position, cookies=None):
    response = request.get(url=url, cookies=cookies)
    self.request_count += 1
    if response:
        html = etree.HTML(response.content)
        self.logger.info(
            html.xpath('//title/text()')[0]
            if html.xpath('//title/text()') else 'title error')
        job_detail = html.xpath("//dd[@class='job_bt']/div//text()")
        job_detail = [item.strip() for item in job_detail if item.strip()]
        job_detail = '\n'.join(job_detail).strip()
        job_address = html.xpath("//div[@class='work_addr']//text()")
        job_address = [item.strip() for item in job_address]
        # drop the trailing "查看地图" map-link text nodes
        job_address = ''.join(job_address[:-2])
        position['job_detail'] = job_detail
        position['job_address'] = job_address
    else:
        self.except_count += 1
    self.save_infos(position)
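# Sketch, not in the original source: the save_infos sink both
# get_position_detail variants finish with. The insert_position call is an
# assumption modelled on the isexist_url call seen in get_positions_urls.
def save_infos(self, position):
    try:
        self.lagou_db.insert_position(position)  # assumed persistence call
        self.count += 1
    except Exception as e:
        self.error_count += 1
        self.logger.error('save failed for %s: %s' % (position.get('url'), e))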
def start_spider(self):
    """Spider entry point: collect the urls of all "技术" (tech) positions."""
    self.count = 0
    self.request_count = 0
    self.except_count = 0
    self.error_count = 0
    self.urls = []
    start_time = time.time()
    response = request.get(self.start_url)
    self.request_count += 1
    if response:
        cookies = response.cookies
        html = etree.HTML(response.content)
        self.logger.info(html.xpath("//title/text()")[0])
        menu = html.xpath("//div[@class='menu_sub dn']")[0]
        positions_dict = {}
        types = menu.xpath("dl")
        for item in types:
            type_name = item.xpath("dt/span/text()")[0]
            positions_dict[type_name] = {}
            positions = item.xpath("dd/a")
            # only the first position of each category is crawled here
            for position in positions[0:1]:
                position_name = position.text
                position_url = position.xpath('@href')[0]
                positions_dict[type_name][position_name] = position_url
                position_data = {'first_type': position_name,
                                 'second_type': type_name}
                g = gevent.spawn(self.get_positons_list, position_url,
                                 position_data, cookies)
                self.pool.add(g)
    else:
        self.except_count += 1
    self.pool.join()
    self.send_email(start_time)
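# Sketch, not in the original source: a minimal entry point, assuming the
# methods above live on a LagouSpider class. monkey.patch_all() must run
# before requests touches the socket module for gevent to be effective; the
# pool size is an arbitrary choice.
from gevent import monkey; monkey.patch_all()  # first line of the real entry point
from gevent.pool import Pool

if __name__ == '__main__':
    spider = LagouSpider()  # assumed class owning the methods above
    spider.pool = Pool(10)  # bound concurrency so Lagou is not hammered
    spider.start_spider()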
def get_positons_list(self, url, item, cookies):
    response = request.get(url, cookies=cookies)
    self.request_count += 1
    if response:
        cookies = response.cookies
        html = etree.HTML(response.content)
        title = html.xpath('//title/text()')
        # the generic search title means we were redirected or blocked
        if not title or title[0] == '找工作-互联网招聘求职网-拉勾网':
            self.logger.error(url + ' error')
            return
        page_num = html.xpath("//span[@class='span totalNum']/text()")
        page_num = int(page_num[0]) if page_num else 1
        for num in range(1, page_num + 1):
            list_url = '%s%d/' % (url, num)
            g = gevent.spawn(self.get_positions_urls, list_url,
                             copy.deepcopy(item), cookies=cookies)
            self.pool.add(g)
    else:
        self.except_count += 1
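# Sketch, not in the original source: the send_email summary start_spider
# ends with. Transport, sender, recipient and subject are assumptions; only
# the counters it reports come from the code above.
import smtplib
from email.mime.text import MIMEText


def send_email(self, start_time):
    body = ('requests: %d, saved: %d, exceptions: %d, errors: %d, '
            'elapsed: %.1fs' % (self.request_count, self.count,
                                self.except_count, self.error_count,
                                time.time() - start_time))
    msg = MIMEText(body)
    msg['Subject'] = 'lagou spider report'   # assumed subject
    msg['From'] = 'spider@example.com'       # assumed sender
    msg['To'] = 'me@example.com'             # assumed recipient
    with smtplib.SMTP('localhost') as smtp:  # assumed local mail relay
        smtp.send_message(msg)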