def parse(self, response):
    next_page_url = response.xpath('//li[@class="bk"][2]/a/@href')
    job_list = response.xpath('//*[@id="resultList"]/div[@class="el"]')
    # print(job_list.extract())
    for each_job in job_list:
        job_info = JobsItem()
        job_info['job_title'] = each_job.xpath('.//p[contains(@class,"t1")]/span/a/text()')
        job_info['company'] = each_job.xpath('.//span[contains(@class,"t2")]/a/text()')
        job_info['job_href'] = each_job.xpath('.//span[contains(@class,"t2")]/a/@href')
        job_info['location'] = each_job.xpath('.//span[contains(@class,"t3")]/text()')
        job_info['salary'] = each_job.xpath('.//span[contains(@class,"t4")]/text()')
        job_info['post_date'] = each_job.xpath('.//span[contains(@class,"t5")]/text()')  # mm-dd
        # Each field is still a SelectorList at this point; unwrap it to text,
        # falling back to 'unknown' when the node is absent.
        for k, v in job_info.items():
            if v:
                job_info[k] = v.extract_first().strip()
            else:
                job_info[k] = 'unknown'
        # print(job_info)
        yield job_info
    # xpath() returns a (possibly empty) SelectorList, never None, so truth-test
    # it rather than comparing against None.
    if next_page_url:
        abs_url = next_page_url.extract_first().strip()
        print('*' * 30)
        # time.sleep(1)
        yield response.follow(abs_url, callback=self.parse)
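# For reference, a minimal items.py declaration consistent with the fields the
# spider above fills in (a sketch; the project's actual JobsItem is not shown):
import scrapy

class JobsItem(scrapy.Item):
    job_title = scrapy.Field()
    company = scrapy.Field()
    job_href = scrapy.Field()
    location = scrapy.Field()
    salary = scrapy.Field()
    post_date = scrapy.Field()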
def parse_totalUrl(self, response):
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    }
    res = json.loads(response.body)
    if res.get('success'):
        if res.get('content').get('positionResult').get('resultSize') != 0:
            results = res.get('content').get('positionResult').get('result')
            for result in results:
                # A fresh item per result, so each queued request carries its
                # own copy instead of all sharing one mutated instance.
                item = JobsItem()
                item['url'] = 'https://www.lagou.com/jobs/{}.html'.format(result['positionId'])
                item['pubdate'] = result['createTime']
                item['company'] = result['companyFullName']
                item['longitude'] = result['longitude']
                item['latitude'] = result['latitude']
                item['city'] = result['city']
                item['jobtitle'] = result['positionName']
                item['workyear'] = result['workYear']
                item['salary'] = result['salary']
                yield Request(url=item['url'],
                              meta={'item_1': item},
                              headers=headers,
                              callback=self.parse_detail)
def mycontent(self, response):
    item = JobsItem()
    need_list = []
    content_list = []
    item['job'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()').extract_first()
    item['money'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').extract_first()
    item['addr'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/span/text()').extract_first()
    item['company'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a/@title').extract_first()
    needs = response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span')
    for need in needs:
        need_list.append(need.xpath('text()').extract_first())
    item['needs'] = need_list
    contents = response.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p')
    for content in contents:
        content1 = content.xpath('text()').extract_first()
        if not content1:
            content1 = content.xpath('span/text()').extract_first()
        if not content1:
            content1 = '暂无信息'  # "no information available"
        content_list.append(content1)
    item['content'] = content_list
    data = response.xpath('/html/body/div[3]/div[2]/div[3]/div[3]/div/p')
    # The <p> tag here contains further nested tags, so text() cannot pull out
    # all of the text; use string(.) instead.
    work_addr = data.xpath('string(.)').extract_first()
    # Strip every \t, \n and \r with a regex; the r prefix marks a raw string.
    item['work_addr'] = re.sub(r'\t|\n|\r', '', work_addr)
    yield item
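# A minimal illustration of the string(.) trick used above, on an assumed
# snippet of markup (demonstration only):
from scrapy.selector import Selector

sel = Selector(text='<p>Address: <b>Building 3,</b> Main St.</p>')
sel.xpath('//p/text()').extract()         # ['Address: ', ' Main St.'] -- nested <b> text is lost
sel.xpath('string(//p)').extract_first()  # 'Address: Building 3, Main St.'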
def parse(self, response):
    jdict = json.loads(response.body)
    jcontent = jdict['content']
    jposresult = jcontent['positionResult']
    jresult = jposresult['result']
    # Integer page arithmetic: floor-divide (a bare "/" yields a float under
    # Python 3) and add one page for any remainder.
    self.totalPageCount = int(jposresult['totalCount']) // int(jcontent['pageSize']) + 1
    for entry in jresult:
        if len(entry) < 10:
            continue
        item = JobsItem()
        item['pid'] = str(entry['positionId']) + '_' + str(entry['publisherId'])
        item['keyword'] = self.keyword
        for fn in self.item_fns:
            item[fn] = entry[fn]
        yield item
    if self.curPage <= self.totalPageCount:
        self.curPage += 1
        yield self.next_request()
    elif self.kd_cur < len(self.keywords) - 1:
        self.curPage = 1
        self.totalPageCount = 0
        self.kd_cur += 1
        self.keyword = self.keywords[self.kd_cur]
        yield self.next_request()
def parse(self, response):
    jdict = json.loads(response.body)
    jcontent = jdict['content']
    jposresult = jcontent['positionResult']
    jresult = jposresult['result']
    resultSize = int(jposresult['resultSize'])
    pageSize = int(jcontent['pageSize'])
    print('[lagou][%s]resultSize: %d, pageNo: %d, pageSize: %d' %
          (self.keyword, resultSize, self.curPage, pageSize))
    for entry in jresult:
        if len(entry) < 10:
            continue
        item = JobsItem()
        item['keyword'] = self.keyword
        item['pid'] = self.keyword + '_' + str(entry['positionId']) + '_' + str(entry['publisherId'])
        for fn in self.item_fns:
            item[fn] = entry[fn]
        yield item
    if pageSize <= resultSize:
        self.curPage += 1
        yield self.next_request()
    elif self.kd_cur < len(self.keywords) - 1:
        self.curPage = 1
        self.kd_cur += 1
        self.keyword = self.keywords[self.kd_cur]
        yield self.next_request()
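# next_request() is referenced in the two callbacks above but not shown. A
# minimal sketch, assuming Lagou's positionAjax.json search endpoint and its
# first/pn/kd form fields:
def next_request(self):
    return scrapy.FormRequest(
        'https://www.lagou.com/jobs/positionAjax.json',
        formdata={'first': 'false',
                  'pn': str(self.curPage),
                  'kd': self.keyword},
        callback=self.parse,
        dont_filter=True)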
def getDetail(self, response):
    print('got', response.url, response)
    title = response.xpath('//span[@class="pos_title"]/text()')[0].extract()
    try:
        salary = response.xpath('//span[@class="pos_salary"]/text()')[0].extract()
    except IndexError:
        salary = '0'  # keep it a string so salary.strip() below succeeds
    num = response.xpath('//span[@class="item_condition pad_left_none"]/text()')[0].extract()
    edu = response.xpath('//span[@class="item_condition"]/text()')[0].extract()
    exp = response.xpath('//span[@class="item_condition border_right_None"]/text()')[0].extract()
    area = response.xpath('//span[@class="pos_area_span pos_address"]//a/text()').extract()
    # print(title, num, edu, area, exp, salary)
    item = JobsItem()
    item['title'] = title.strip()
    item['salary'] = salary.strip()
    item['num'] = num.strip('招人 ')  # strip the characters "招" ("hiring"), "人" ("people") and spaces
    item['edu'] = edu.strip()
    item['exp'] = exp.strip()
    item['area'] = area
    yield item
def parse(self, response):
    # response.xpath() replaces the long-deprecated HtmlXPathSelector/.select().
    titles = response.xpath("//span[@class='article-title']")
    items = []
    for title in titles:
        item = JobsItem()
        item["title"] = title.xpath("a/text()").extract()
        # item["link"] = title.xpath("a/@href").extract()
        items.append(item)
    return items
def vacansy_parse(self, response: HtmlResponse):
    name_job = response.xpath('//h1/text()').extract_first()
    salary_job = response.css(
        'div._3MVeX span._1OuF_.ZON4b span._3mfro._2Wp8I.PlM3e._2JVkc::text').extract()
    link = response.url
    source_link = response.css('a.icMQ_._2JivQ::attr(href)').extract_first()
    yield JobsItem(name=name_job,
                   salary=salary_job,
                   link=link,
                   source_link=source_link)
def parse_list(self, response):
    for i in range(0, 20):
        joblist = self.driver.find_element_by_xpath(
            "//div[@id='uxItemLink_" + str(i + 1 + JobspiderSpider.get_jobcount()) +
            "']/table/tbody/tr/td[2]")
        joblist.click()
        jobselems = joblist.find_elements_by_tag_name("span")
        job = JobsItem()
        job['vacant'] = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxNoOfVac").text
        job['company'] = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxCompany").text
        job['name'] = jobselems[0].text
        job['job_id'] = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxOrdNo").text
        details = self.driver.find_element_by_xpath(
            "//div[@id='ctl00_ContentPlaceHolder1_uxJobCard_uxJcard']/table[2]/tbody/tr[5]/td").text
        if details:
            # Collapse all runs of whitespace to single spaces.
            job['detail'] = ' '.join(details.split())
        saldig = jobselems[1].text.find("月薪")  # "monthly salary"
        if saldig > 0:
            job['salary'] = int(jobselems[1].text[1:7].replace(',', ''))
        else:
            job['salary'] = int(jobselems[1].text[1:3].replace(',', ''))
        job['area'] = jobselems[2].text
        pdate = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxPostedDate").text
        self.logger.debug(pdate)
        # Posted date arrives as dd/mm/yyyy.
        job['date_posted'] = date(int(pdate.split('/')[2]),
                                  int(pdate.split('/')[1]),
                                  int(pdate.split('/')[0]))
        # self.logger.info(job)
        yield job
    JobspiderSpider.update_jobcount(JobspiderSpider.get_jobcount() + 20)
    nexturl = self.driver.find_element_by_xpath(
        "//span[@id='ctl00_ContentPlaceHolder1_uxPageNum']/a[6]").get_attribute('href')
    self.logger.debug(nexturl + ": expected partial")
    # Drop any existing query string before appending our own.
    pos = nexturl.find('?')
    if pos > -1:
        nexturl = nexturl[:pos]
    self.logger.debug(nexturl + ": processed")
    qsdict = {'SearchFor': 'simple',
              'ID': '',
              'SortBy': '',
              'from': '',
              'start': str(JobspiderSpider.get_jobcount())}
    qs = urlencode(qsdict)
    yield Request(nexturl + '?' + qs, dont_filter=False, callback=self.parse_list)
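# get_jobcount() / update_jobcount() are not defined in the excerpt above; they
# evidently keep a class-level offset into the paged result list. A minimal
# sketch under that assumption (the attribute name is assumed):
jobcount = 0

@classmethod
def get_jobcount(cls):
    return cls.jobcount

@classmethod
def update_jobcount(cls, count):
    cls.jobcount = count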
def parse_detail_page(self, response):
    jobTitle = response.css('.jobtitle > font::text').extract()[0].strip()
    company = response.css('.company::text').extract()[0]
    location = response.css('.location::text').extract()[0]
    jobSummary = response.css('.summary *::text').extract()
    print('Got item...', response.url)
    item = JobsItem()
    item['jobTitle'] = jobTitle
    item['company'] = company
    item['location'] = location
    item['jobSummary'] = jobSummary
    item['url'] = response.url
    yield item
def parse_jobs(self, response):
    item = JobsItem()
    item['title'] = response.css('h1[itemprop="name"]::text').extract()
    item['location'] = response.css('#post-location span[itemprop="name"]::text').extract()
    item['post_url'] = response.url
    item['organization'] = response.css('strong[itemprop="name"]::text').extract()
    item['organization_url'] = response.css('#post-location + a::text').extract()
    item['job_type'] = response.css('a[href^="/type"]::text').extract()
    item['job_category'] = response.css('a[href^="/category"]::text').extract()
    item['salary'] = ''
    yield item
def parse(self, response):
    data = json.loads(response.text)
    if len(data['data']['results']) > 0:
        self.page_count += 1
        for each_group in data['data']['results']:
            item = JobsItem()
            item['job_title'] = each_group['jobName']
            item['salary'] = each_group['salary']
            item['experience'] = each_group['workingExp']['name']
            item['location'] = each_group['city']['items'][0]['name']
            item['detail_url'] = each_group['positionURL']
            item['update_date'] = each_group['updateDate'].split()[0]
            item['welfare'] = ",".join(each_group['welfare'])
            item['key_word'] = self.query_list[self.query_count]
            item['company_title'] = each_group['company']['name']
            item['company_scale'] = each_group['company']['size']['name']
            item['company_nature'] = each_group['company']['type']['name']
            item['crawl_date'] = self.date_time
            item['crawl_url'] = response.request.url
            yield item
        # Request the next 90-result page for the current city/keyword.
        url = ("https://fe-api.zhaopin.com/c/i/sou?start={0}&pageSize=90&cityId={1}"
               "&workExperience=-1&education=-1&companyType=-1&employmentType=-1"
               "&jobWelfareTag=-1&kw={2}&kt=3&_v=0.34053159"
               "&x-zp-page-request-id=d1dd66ee655347939b69acf87870ccc1-1548983897497-953047"
               ).format(self.page_count * 90,
                        self.city_list[self.city_count],
                        self.query_list[self.query_count])
        yield scrapy.Request(url=url, callback=self.parse)
    else:
        # No more results: advance to the next keyword, then the next city;
        # stop once both lists are exhausted.
        if self.query_count < len(self.query_list) - 1:
            self.query_count += 1
        elif self.city_count < len(self.city_list) - 1:
            self.query_count = 0
            self.city_count += 1
        else:
            return
        url = ("https://fe-api.zhaopin.com/c/i/sou?start={0}&pageSize=90&cityId={1}"
               "&workExperience=-1&education=-1&companyType=-1&employmentType=-1"
               "&jobWelfareTag=-1&kw={2}&kt=3&_v=0.34053159"
               "&x-zp-page-request-id=d1dd66ee655347939b69acf87870ccc1-1548983897497-953047"
               ).format(0,
                        self.city_list[self.city_count],
                        self.query_list[self.query_count])
        self.page_count = 0
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }
        time.sleep(5)
        item = JobsItem()
        works = BeautifulSoup(response.text, 'lxml').find("div", class_="infolist").find_all("dl")
        for work in works:
            time.sleep(1)
            job_url = work.find('dt').find('a')
            url = job_url['href']
            # Fetch each detail page with urllib.request (Python 3's
            # replacement for urllib2); note this shadowed the Scrapy
            # response, so use a separate name for the detail reply.
            request = urllib.request.Request(url=url, headers=headers)
            detail = urllib.request.urlopen(request)
            job_html = detail.read().decode('utf-8')
            html = BeautifulSoup(job_html, 'lxml')
            item['id'] = uuid.uuid4()
            item['title'] = html.find('div', class_='item_con pos_info').find(
                'span', class_='pos_name').get_text()
            item['salary'] = html.find('div', class_='pos_base_info').find(
                'span', class_='pos_salary').get_text()
            item['company'] = html.find('div', class_='subitem_con company_baseInfo').find(
                'p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
            item['scale'] = html.find('div', class_='subitem_con company_baseInfo').find(
                'p', class_='comp_baseInfo_scale').get_text()
            item['address'] = html.find('div', class_='subitem_con work_adress').find(
                'p', class_='detail_adress').get_text()
            item['city'] = '成都'
            yield item
    except Exception as e:
        print(e)
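# The blocking urllib round-trips above stall Scrapy's event loop; the same
# flow can instead let Scrapy fetch each detail page asynchronously. A sketch
# (the parse_detail callback name is hypothetical, not from the source):
def parse(self, response):
    works = BeautifulSoup(response.text, 'lxml').find('div', class_='infolist').find_all('dl')
    for work in works:
        url = work.find('dt').find('a')['href']
        yield scrapy.Request(url, callback=self.parse_detail)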
def parse_info(self, response):
    # Instantiate the item
    job_item = JobsItem()
    selector = scrapy.Selector(response)
    item_cn = selector.xpath(
        '/html/body/div[@class="tCompanyPage"]/div[@class="tCompany_center clearfix"]'
        '/div[@class="tHeader tHjob"]/div/div[@class="cn"]')
    # 1. Job title
    job_name = item_cn.xpath('h1/@title').extract()[0]
    job_item.job_name = job_name
    # 2. Company name
    company_name = item_cn.xpath('p[@class="cname"]/a[1]/@title').extract()[0]
    job_item.company_name = company_name
    # 3. Benefits
    welfares = item_cn.xpath('div/div/span/text()').extract()
    # 4. Monthly pay; formats: x-y 元/天, 千/月, 万/月, 万以上/月, 万/年, 万以上/年
    monthly_pay = item_cn.xpath('strong/text()').extract()[0]
    # Multipliers that convert the numeric part of each unit to yuan per month;
    # the yearly 万 units need the 10,000 factor divided over twelve months.
    tmp_dict = {
        '元/天': 30,
        '千/月': 1000,
        '千以上/月': 1000,
        '千以下/月': 1000,
        '万/月': 10000,
        '万以上/月': 10000,
        '万/年': 10000 / 12,
        '万以上/年': 10000 / 12
    }
    money_min = 0
    money_max = 0
    for key, value in tmp_dict.items():
        if monthly_pay.strip() == '':
            money_min = money_max = 0
            break
        elif key in monthly_pay:
            temp_money = monthly_pay.strip(key)
            temp_money_list = temp_money.split('-')
            if len(temp_money_list) == 2:
                money_min = float(temp_money_list[0]) * value
                money_max = float(temp_money_list[1]) * value
            else:
                money_min = money_max = float(temp_money) * value
            break
    job_item.monthly_pay_min = money_min
    job_item.monthly_pay_max = money_max
    # 5. Job requirements, a "|"-separated string such as "城市-区 | N年经验 | 学历 | 招N人";
    # normalise the non-breaking spaces first
    requirement = item_cn.xpath('p[2]/@title').extract()[0].replace(
        u'\xa0', u' ').replace(' ', '').split('|')
    # Work location
    addrStr = requirement[0]
    addrList = addrStr.split('-')
    city = addrList[0]
    area = ''
    if len(addrList) == 2:
        area = addrList[1]
    job_item.area = area
    # Experience (default: no experience required)
    experience = '无工作经验'
    # Education levels, lowest to highest
    educationArr = ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']
    education = '所有'
    # Headcount
    count = 0
    for item in requirement:
        if '年经验' in item:
            experience = item[:-3]
        elif item in educationArr:
            education = item
        elif re.match(r'招.*?人', item):
            count_str = item[1:-1]
            if count_str == '若干':  # "several", i.e. an unspecified headcount
                count = 0
            else:
                count = int(count_str)
    job_item.education = education
    job_item.experience = experience
    item_detail = selector.xpath(
        '/html/body/div[@class="tCompanyPage"]/div[@class="tCompany_center clearfix"]/div[3]')
    # 6. Job description; string(.) keeps the text of nested tags
    # (the result may print truncated in the terminal)
    job_details = item_detail.xpath('string(div[1]/div[1])').extract()[0]
    com = re.compile(
        '(岗位职责|工作职责)[::]?(.*?)(任职资格|任职要求)[::]?(.*?)(职能类别)[::]?(.*?)(关键字)[::]?')
    re_list = re.findall(com, job_details)
    if re_list:
        print(re_list[0][0].strip())
    # 7. Work address
    job_address = item_detail.xpath('div[2]/div/p/text()').extract()
    job_item.address = job_address
    # 8. Map position, parsed out of the anchor's onclick attribute
    job_positions = self.sub_string(
        item_detail.xpath('div[2]/div/a/@onclick').extract()[0])
    job_map = ''
    if job_positions:
        job_map = job_positions[0]
    item_company = selector.xpath('/html/body/div[3]/div[2]/div[4]/div[1]')
    # 9. Company name
    company_name = item_company.xpath('div[1]/a/p/@title').extract()[0]
    # 10. Company link
    company_link = ''
    company_links = item_company.xpath('div[1]/a/@href').extract()
    if company_links:
        company_link = company_links[0]
    job_item.company_link = company_link
    # 11. Company type
    company_nature = item_company.xpath('div[2]/p[1]/text()').extract()[0]
    # 12. Company size (employees)
    company_people = item_company.xpath('div[2]/p[2]/text()').extract()[0]
    # 13. Industry
    company_industry = item_company.xpath('div[2]/p[3]/a/text()').extract()
    # 14. Company info
    company_info = item_detail.xpath('string(div[3]/div)').extract()[0]
    job_item.company_info = company_info
    return job_item  # hand the item to the pipeline
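# sub_string() is not shown in the source; it evidently extracts coordinate-like
# values from the anchor's onclick JavaScript. A hedged sketch, assuming the
# values appear as a quoted number pair inside the handler text:
def sub_string(self, onclick_text):
    # e.g. "showMapIframe('121.48,31.22')" -> ['121.48,31.22'] (format assumed)
    return re.findall(r"'([\d.,]+)'", onclick_text)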
def parse_item(self, response):
    item = JobsItem()
    item["post_url"] = response.url
    item["post_name"] = response.xpath("//h1//text()").extract_first()
    # Parse the salary string, e.g. "1-1.5万/月" or "8-10千/月".
    salary = response.xpath("//div[@class='cn']//strong//text()").extract_first() or ''
    avg_salary = 'N'
    if salary[-3:] == '万/月':
        bounds = re.findall('(.*?)-(.*?)万', salary)
        if bounds:
            min_salary = float(bounds[0][0]) * 10000
            max_salary = float(bounds[0][1]) * 10000
            avg_salary = (min_salary + max_salary) / 2
    elif salary[-3:] == '千/月':
        bounds = re.findall('(.*?)-(.*?)千', salary)
        if bounds:
            min_salary = float(bounds[0][0]) * 1000
            max_salary = float(bounds[0][1]) * 1000
            avg_salary = (min_salary + max_salary) / 2
    # Only convert when a range was actually parsed; float('N') would raise.
    item["post_salary"] = float(avg_salary) if avg_salary != 'N' else avg_salary
    temp = re.sub(
        r'\xa0', '',
        response.xpath("//*[@class='msg ltype']/@title").extract_first()).split("|")
    if len(temp) >= 5:
        item["post_city"] = temp[0]
        item["post_experience"] = temp[1]
        item["post_education"] = temp[2]
        number = re.findall('招(.*?)人', temp[3])
        if len(number) == 0:
            item["post_number"] = "N"
        else:
            item["post_number"] = int(number[0])
        item["post_release_time"] = temp[4]
    item["post_information"] = ''.join(
        response.xpath("//div[@class='bmsg job_msg inbox']//p//text()").extract()
    ).strip("\n").strip('\r').strip('\t')
    item["post_category"] = ','.join(
        response.xpath("//div[@class='mt10']/p[1]//a//text()").extract())
    item["post_keywords"] = ','.join(
        response.xpath("//div[@class='mt10']/p[2]//a//text()").extract())
    item["company_url"] = response.xpath("//div[@class='com_msg']//a/@href").extract_first()
    item["company_name"] = response.xpath("//div[@class='com_msg']//a//text()").extract_first()
    item["company_nature"] = response.xpath("//div[@class='com_tag']/p[1]//text()").extract_first()
    item["company_scale"] = response.xpath("//div[@class='com_tag']/p[2]//text()").extract_first()
    item["company_category"] = re.sub(
        r'[\r\n\s]', '',
        ','.join(response.xpath("//div[@class='com_tag']/p[3]//a//text()").extract()))
    item["crawl_date"] = datetime.datetime.now().strftime('%Y-%m-%d')
    yield item
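# Quick sanity check of the range regex used above (illustrative only):
import re
assert re.findall('(.*?)-(.*?)万', '1.5-2万/月') == [('1.5', '2')]
assert (1.5 * 10000 + 2 * 10000) / 2 == 17500.0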
def parse(self, response):
    self.page_count += 1
    self.total_page_count += 1
    print("lnkyzhang", self.total_page_count)
    if response.url.strip():
        groups = response.xpath('//div[@class="job-list"]/ul/li')
        for each_group in groups:
            item = JobsItem()
            item['job_title'] = each_group.xpath(
                './/div[@class="job-title"]/text()').extract()[0]
            item['salary'] = each_group.xpath(
                './/span[@class="red"]/text()').extract()[0]
            item['experience'] = each_group.xpath(
                './/div[@class="info-primary"]/p/text()[2]').extract()[0]
            item['location'] = each_group.xpath(
                './/div[@class="info-primary"]/p/text()[1]').extract()[0].split()[0]
            item['detail_url'] = each_group.xpath(
                './/h3[@class="name"]/a/@href').extract()[0].split()[0]
            item['update_date'] = each_group.xpath(
                './/div[@class="info-publis"]/p/text()').extract()[0]
            # item['welfare'] = ",".join(each_group['welfare'])
            item['key_word'] = self.query_list[self.query_count]
            item['company_title'] = each_group.xpath(
                './/div[@class="company-text"]/h3[@class="name"]/a/text()').extract()[0]
            item['crawl_date'] = self.date_time
            item['crawl_url'] = response.request.url
            item['crawl_pages'] = self.total_page_count
            try:
                item['company_scale'] = each_group.xpath(
                    './/div[@class="company-text"]/p/text()[3]').extract()[0]
            except (TypeError, IndexError):
                pass
            item['company_nature'] = each_group.xpath(
                './/div[@class="company-text"]/p/text()[2]').extract()[0]
            yield item
        # If the "next page" link is missing or no longer carries the bare
        # "next" class, this keyword is exhausted: advance keyword, then city.
        next_cls = response.xpath(
            '//div[@class="page"]/a[@ka="page-next"]/@class').extract_first()
        if next_cls != 'next':
            self.page_count = 0
            if self.query_count < len(self.query_list) - 1:
                self.query_count += 1
            elif self.city_count < len(self.city_list) - 1:
                self.query_count = 0
                self.city_count += 1
        url = 'https://www.zhipin.com/{}/?query={}&page={}'.format(
            self.city_list[self.city_count],
            self.query_list[self.query_count],
            self.page_count)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    self.page_count += 1
    self.total_page_count += 1
    print("lnkyzhang", self.total_page_count)
    if response.url.strip():
        groups = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
        for each_group in groups:
            item = JobsItem()
            item['job_title'] = each_group.xpath(
                './/a[@target="_blank"]/@title').extract()[0]
            try:
                item['salary'] = each_group.xpath(
                    './/span[@class="t4"]/text()').extract()[0]
            except (TypeError, IndexError):
                pass
            item['location'] = each_group.xpath(
                './/span[@class="t3"]/text()').extract()[0].split()[0]
            item['detail_url'] = each_group.xpath(
                './/a[@target="_blank"]/@href').extract()[0].split()[0]
            item['update_date'] = each_group.xpath(
                './/span[@class="t5"]/text()').extract()[0]
            item['key_word'] = self.query_list[self.query_count]
            item['company_title'] = each_group.xpath(
                './/span[@class="t2"]/a/@title').extract()[0]
            item['crawl_date'] = self.date_time
            item['crawl_url'] = response.request.url
            item['crawl_pages'] = self.total_page_count
            yield item
        # When the pager renders "下一页" ("next page") as a plain <span>, the
        # link is disabled: reset paging and advance keyword, then city.
        if response.xpath('//li[@class="bk"][2]/span/text()').extract_first() == '下一页':
            self.page_count = 1
            if self.query_count < len(self.query_list) - 1:
                self.query_count += 1
            elif self.city_count < len(self.city_list) - 1:
                self.query_count = 0
                self.city_count += 1
        url = 'https://search.51job.com/list/{0},000000,0000,00,9,99,{1},2,{2}.html'.format(
            self.city_list[self.city_count],
            urllib.parse.quote(self.query_list[self.query_count]),
            self.page_count)
        yield scrapy.Request(url=url, callback=self.parse)
def parse_page(self, response):
    res = response.xpath(
        '//div[contains(@class, "jobsearch-SerpJobCard unifiedRow row result")]')
    # Note: iteration starts at 1, so the first card (index 0) is skipped.
    for i in range(1, len(res)):
        item = JobsItem()
        title = res[i].xpath(
            'div[@class="title"]/a/text() | div[@class="title"]/a/b/text()').extract()
        item['title'] = ''.join(title).replace('\n', '')
        company = res[i].xpath(
            'div/div/span[@class="company"]/a/text() | '
            'div/div/span[@class="company"]/text()').extract()
        item['company'] = ''.join(company).replace('\n', '')
        rating = res[i].xpath(
            'div/div/span[@class="ratingsDisplay"]/a/span/text() | '
            'div/div/a/span[@class="ratings"]/@aria-label').extract()
        item['rating'] = ''.join(rating).replace('\n', '').replace(' out of 5 star rating', '')
        location = res[i].xpath(
            'div/div[contains(@class, "location")]/text() | '
            'div/span[contains(@class, "location")]/text()').extract()
        item['location'] = ''.join(location).replace('\n', '')
        salary = res[i].xpath('div/span/span[@class="salaryText"]/text()').extract()
        salary = ''.join(salary).replace('\n', '')
        if salary == '':
            continue
        # Record the pay period, then strip its wording from the figure.
        if 'year' in salary:
            item['salary_unit'] = 'year'
            salary = salary.replace('a year', '')
        elif 'hour' in salary:
            item['salary_unit'] = 'hour'
            salary = salary.replace('an hour', '')
        elif 'month' in salary:
            item['salary_unit'] = 'month'
            salary = salary.replace('a month', '')
        elif 'week' in salary:
            item['salary_unit'] = 'week'
            salary = salary.replace('a week', '')
        elif 'day' in salary:
            item['salary_unit'] = 'day'
            salary = salary.replace('a day', '')
        else:
            print('Error Text: ' + salary)
            raise ValueError('Cannot parse salary text.')
        salary = salary.replace(',', '').replace(' ', '').replace('$', '') \
                       .replace('++', '').replace('From', '').replace('Upto', '')
        if '-' in salary:
            salary_all = salary.split('-')
            item['salary_from'] = salary_all[0]
            item['salary_to'] = salary_all[1]
        else:
            item['salary_from'] = salary
            item['salary_to'] = salary
        item['level'] = response.meta['level']
        item['state'] = response.meta['state']
        item['skill'] = response.meta['skill']
        yield item
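# Worked example of the normalisation above (illustrative input):
#   '$50,000 - $70,000 a year'
#     -> salary_unit='year', salary_from='50000', salary_to='70000'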