def page1_parse(self, response):
    position = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
    if position:  # xpath() never returns None; test the SelectorList itself
        for posi in position:
            posi_url = posi.xpath('./p//a/@href').extract()[0]
            print '===========>', posi_url
            posi_url = self.rule_key + crate_params(
                '1', (self.get_job(posi_url), ))
            yield scrapy.Request(url=posi_url,
                                 callback=self.detail_parse,
                                 priority=3)
    # total page count, taken from the "x/y" pager text
    page = int(
        response.xpath('//div[@class="rt"][2]/text()').extract()[1]
        .split('/')[1].strip())
    if page != 1 and MORE_PAGE:
        for p in range(2, page + 1):
            # next_url = response.url.replace('1.html', str(p) + '.html')
            # yield scrapy.Request(url=next_url, callback=self.pages_parse)
            qs = get_qs(response.url)
            eduid = qs['degreefrom']
            cityid = get_num(response.url.split(',')[0])
            salaryid = response.url.split(',')[5]
            full_url = self.rule_key + crate_params(
                '0', (cityid, salaryid, '%2B', str(p), eduid, '%2C'))
            yield scrapy.Request(url=full_url,
                                 callback=self.pages_parse,
                                 priority=1)
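# The helpers used above and throughout these spiders (get_qs, get_num,
# getVal, crate_params) come from the project's utility module, which is not
# shown in this section. A minimal sketch of the behaviour the call sites
# assume -- names kept, bodies are illustrative guesses, not the real code:

import re
import urlparse  # Python 2; on Python 3 this is urllib.parse


def get_qs(url):
    # parse a URL's query string into a flat {key: value} dict
    return dict(urlparse.parse_qsl(urlparse.urlparse(url).query))


def get_num(text):
    # first run of digits in a URL or path, '' if there is none
    m = re.search(r'\d+', text)
    return m.group(0) if m else ''


def getVal(x):
    # normalise a falsy extract() result (None, [], '') to ''
    return x if x else ''

# crate_params(index, args) appears to fill the URL template selected by
# `index` with the values in `args`; the templates themselves live in the
# project config and are not reproduced here.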
def second_parse(self, response):
    # list of job detail links
    posi_list = response.xpath(
        '//div[@class="job-info"]/h3/a/@href').extract()
    if posi_list:
        for posi in posi_list:
            posi = self.rule_key + crate_params('2', (get_num(posi), ))
            yield scrapy.Request(posi,
                                 callback=self.detail_parse,
                                 priority=3)
    else:
        print "=====>i didnt get any next page"
def first_parse(self, response):
    # list of job detail links
    # posi_list = response.xpath('//div[@class="sojob-result "]//div[@class="job-info"]/h3/a/@href').extract()
    posi_list = response.xpath(
        '//div[@class="job-info"]/h3/a/@href').extract()
    if posi_list:
        for posi in posi_list:
            posi = self.rule_key + crate_params('2', (get_num(posi), ))
            yield scrapy.Request(posi,
                                 callback=self.detail_parse,
                                 priority=3)

    # Option 1: follow the "next page" link and recurse page by page
    # next_page = response.xpath('//div[@class="sojob-result "]//div[@class="pagerbar"]/a[last()-1]/@href').extract()
    # if next_page and 'javascript:;' not in next_page:
    #     nextPage = 'https://www.liepin.com' + next_page[0]
    #     print 'next:' + nextPage
    #     yield scrapy.Request(nextPage, callback=self.first_parse)

    # Option 2: read the last page number from the pager and generate a
    # request for every page directly
    # last_page = response.xpath('//div[@class="sojob-result "]//div[@class="pagerbar"]/a[last()]/@href').extract()
    last_page = response.xpath(
        '//div[@class="pagerbar"]/a[last()]/@href').extract()
    # guard against an empty extract() before indexing into it
    if last_page and 'javascript:;' not in last_page and MORE_PAGE:
        # the GET parameter "industries" is optional, so branch on it
        qs = get_qs(response.url)
        pageNum = int(last_page[0].split('=').pop())
        for num in range(1, pageNum + 1):
            cityId = qs['dqs']
            salaId = qs['salary']
            sclaId = qs['compscale']
            if 'industries' in qs:
                indusId = qs['industries']
                full_url = self.rule_key + crate_params(
                    '0', (cityId, indusId, salaId, sclaId, str(num)))
            else:
                full_url = self.rule_key + crate_params(
                    '1', (cityId, salaId, sclaId, str(num)))
            yield scrapy.Request(full_url,
                                 callback=self.second_parse,
                                 priority=1)
    else:
        print "=====>i didnt get any first page"
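# Quick illustration of the last-page arithmetic above, using a made-up
# pager href of the shape this code expects (real liepin URLs differ):

last_page = ['/zhaopin/?dqs=010&salary=10$20&curPage=9']
pageNum = int(last_page[0].split('=').pop())
assert pageNum == 9  # one request is then built for every page 1..pageNum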
def parse_list(self, response):
    # list of job detail links
    posi_list = response.xpath(
        '//div[@class="info-primary"]/h3/a/@href').extract()
    if posi_list:
        print "=====>i get page list"
        for posi in posi_list:
            posi = self.rule_key + crate_params('2', (get_num(posi), ))
            yield scrapy.Request(posi,
                                 callback=self.parse_item,
                                 priority=3)
    else:
        print "=====>i didnt get any page list"
def parse_list(self, response):
    # collect the detail-page URLs on the current listing page
    recruit_url = response.xpath(
        '//div[@class="newlist_list_content"]//td[@class="zwmc"]//a/@href'
    ).extract()
    for url in recruit_url:
        # filter out campus-recruitment URLs
        if 'jobs.zhaopin.com' in url:
            # print 'detail page link:', url
            # request the detail page
            posi = self.rule_key + crate_params('1', (get_num(url), ))
            # yield scrapy.Request(url, callback=self.parse_detail, priority=2)
            yield scrapy.Request(posi,
                                 callback=self.parse_detail,
                                 priority=3)
    # check whether a "next page" link exists; if it does, feed it back into
    # parse_list and parse again (recurse until there is no next page)
    next_page = response.xpath(
        '//div[@class="pagesDown"]//li/a[@class="next-page"]/@href'
    ).extract()
    print 'next page==============>', next_page
    if next_page and MORE_PAGE:
        qs = get_qs(str(next_page[0]))
        city = unicode(qs['jl'], "utf-8")
        ct = qs['ct']
        et = qs['et']
        el = qs['el']
        p = qs['p']
        full_url = self.rule_key + crate_params(
            '0', (city, str(ct), str(et), str(el), str(p)))
        yield scrapy.Request(url=full_url,
                             callback=self.parse_list,
                             priority=1)
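# The next-page href is round-tripped through its query string: get_qs pulls
# out the search parameters and crate_params rebuilds a canonical list URL
# from them. With the get_qs sketch above and an illustrative href (parameter
# names match the code; the URL itself is not from a live page):

qs = get_qs('http://sou.zhaopin.com/jobs/searchresult.ashx?jl=beijing&ct=0&et=0&el=4&p=2')
print qs['jl'], qs['p']  # -> beijing 2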
def parse_item(self, response):
    item = LagouItem()
    htmls = response.body
    # lagou renders '已下线' ("taken offline") on dead postings; skip those
    if '已下线' not in htmls:
        # company name
        co_name = response.xpath(
            "//div[@class='company']/text()").extract()[0].encode('utf-8')
        # job title
        name = response.xpath(
            "//div[@class='job-name']/span/text()").extract()[0].encode('utf-8')
        # salary
        salary = response.xpath(
            "//dd[@class='job_request']//span[1]/text()"
        ).extract()[0].encode('utf-8')
        # area
        area = response.xpath(
            "//dd[@class='job_request']//span[2]/text()"
        ).extract()[0].encode('utf-8').replace('/', '')
        # years of experience
        exp = response.xpath(
            "//dd[@class='job_request']//span[3]/text()"
        ).extract()[0].encode('utf-8').replace('/', '')
        # education
        edu = response.xpath(
            "//dd[@class='job_request']//span[4]/text()"
        ).extract()[0].encode('utf-8').replace('/', '')
        # publish time
        time = response.xpath(
            "//p[@class='publish_time']/text()").extract()[0].split(' ')[0]
        time = getVal(time)
        # job description
        info = response.xpath("//dd[@class='job_bt']//p/text()").extract()
        info = getVal(info)
        if info != '':
            info = '\n'.join(info).encode('utf-8')
        # work address
        local = ','.join(
            response.xpath("//div[@class='work_addr']/a/text()").extract()[:-1]
        ).encode('utf-8')
        # benefits
        welfare = response.xpath(
            '//dd[@class="job-advantage"]//p/text()'
        ).extract()[0].encode('utf-8')
        # company website
        co_url = response.xpath(
            '//dl[@id="job_company"]//li/a/@href').extract()[0].encode('utf-8')
        # headcount (not published on the page)
        num = '0'
        # company type
        co_type = response.xpath(
            '//dl[@id="job_company"]//li[1]/text()'
        ).extract()[1].encode('utf-8')
        # print name, co_name, area, salary, exp, edu, num, time, welfare, info, local, co_url, co_type
        item['name'] = name
        item['co_name'] = co_name
        item['area'] = area
        item['salary'] = salary
        item['exp'] = exp
        item['edu'] = edu
        item['num'] = num
        item['time'] = time
        item['welfare'] = welfare
        item['info'] = info
        item['local'] = local
        item['co_url'] = co_url
        item['co_type'] = co_type
        item['spider_name'] = 'lagou'
        item['otherq'] = ''
        # numeric id that identifies the posting
        try:
            item['target_id'] = get_num(response.url)
        except Exception:
            item['target_id'] = ''
        yield item
def parse_item(self, response):
    item = BossItem()
    # company name
    co_name = response.xpath(
        "//div[@class='job-sec']/div[@class='name']/text()"
    ).extract()[0].encode('utf-8')
    # job title
    name = response.xpath(
        "//div[@class='name'][1]/h1/text()").extract()[0].encode('utf-8')
    # salary
    salary = response.xpath(
        "//div[@class='name'][1]/span/text()").extract()[0].encode('utf-8')
    subTitle = response.xpath(
        "//div[@class='job-primary detail-box']/div[2]/p/text()").extract()
    # area
    area = subTitle[0].split(u':')[1].encode('utf-8')
    # years of experience
    exp = subTitle[1].split(u':')[1].encode('utf-8')
    # education
    edu = subTitle[2].split(u':')[1].encode('utf-8')
    # publish time; strip the '发布于' ("published on") prefix
    time = response.xpath(
        "//span[@class='time'][1]/text()").extract()[0].replace(u'发布于', '')
    time = getVal(time)
    # job description
    info = response.xpath(
        "//div[@class='job-sec']/div[@class='text']/text()").extract()
    info = getVal(info)
    if info != '':
        info = '\n'.join(info).encode('utf-8')
    # work address
    local = response.xpath(
        "//div[@class='location-address']/text()").extract()[0].encode('utf-8')
    # benefits (not published on the page)
    welfare = ''
    # company website
    co_url = response.xpath(
        "//a[@ka='job-cominfo']/@href").extract()[0].encode('utf-8')
    # headcount (not published on the page)
    num = '0'
    # company type
    co_type = response.xpath(
        "//li[@class='company-type']/text()").extract()[0].encode('utf-8')
    # numeric id that identifies the posting
    try:
        target_id = get_num(
            response.xpath("//link[@rel='alternate']/@href").extract()[0])
    except Exception:
        target_id = ''
    # print name, co_name, area, salary, exp, edu, num, time, welfare, info, local, co_url, co_type
    item['name'] = name
    item['co_name'] = co_name
    item['area'] = area
    item['salary'] = salary
    item['exp'] = exp
    item['edu'] = edu
    item['num'] = num
    item['time'] = time
    item['welfare'] = welfare
    item['info'] = info
    item['local'] = local
    item['co_url'] = co_url
    item['co_type'] = co_type
    item['spider_name'] = 'boss'
    item['otherq'] = ''
    item['target_id'] = target_id
    yield item
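# The posting id used as target_id is recovered from the <link rel="alternate">
# href with get_num. A quick check against an illustrative href (path is
# made up, not a real listing URL):

href = 'https://www.zhipin.com/job_detail/1234567.html'
print get_num(href)  # -> '1234567' with the get_num sketch shown earlier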
def parse_detail(self, response):
    item = ZhilianItem()
    # html = response.body.decode('utf-8')
    # print html
    # job title, from the breadcrumb
    name = response.xpath(
        '//div[@class="bread_crumbs"]//a[3]/strong/text()'
    )[0].extract().encode('utf-8')
    # benefits
    welfare_list = response.xpath(
        '//div[@class="fixed-inner-box"]//span/text()').extract()
    welfare = " ".join(welfare_list).encode("utf-8")
    left_info = response.xpath('//div[@class="terminalpage-left"]')
    for i in left_info:
        # salary
        salary = i.xpath('.//ul/li[1]/strong/text()')[0].extract().encode('utf-8')
        # work area: city
        area_city = i.xpath('./ul/li[2]/strong/a/text()').extract()[0].encode('utf-8')
        # work area: district
        area_qu = i.xpath('./ul/li[2]/strong/text()').extract()
        area_qu = getVal(area_qu)
        if area_qu != "":
            area_qu = area_qu[0].encode("utf-8")
        # full work area
        area = area_city + area_qu
        # experience
        exp = i.xpath('.//ul/li[5]/strong/text()')[0].extract().encode('utf-8')
        # education
        edu = i.xpath('.//ul/li[6]/strong/text()')[0].extract().encode('utf-8')
        # headcount
        num = i.xpath('.//ul/li[7]/strong/text()')[0].extract().encode('utf-8')
        # publish date
        time = i.xpath('.//ul/li[3]/strong/span/text()')[0].extract().encode('utf-8')
    # work address, with newlines stripped
    local = response.xpath(
        '//div[@class="tab-inner-cont"]//h2/text()')[0].extract().encode('utf-8')
    local = local.replace('\n', '').strip()
    # job description
    info = response.xpath(
        '//div[@class="tab-inner-cont"]//p/text()').extract()
    # info = '\n'.join(info).encode('utf-8').replace('\n', '').strip()  # already utf-8
    info = '\n'.join(info).replace('\n', '').strip()
    # print info
    # company name
    co_name = response.xpath(
        '//div[@class="terminalpage-right"]//p[@class="company-name-t"]//a/text()'
    )[0].extract().encode('utf-8')
    # company type (ownership)
    co_type = response.xpath(
        '//div[@class="terminalpage-right"]//ul/li[2]/strong/text()'
    )[0].extract().encode('utf-8')
    '''
    li_count = response.xpath('//div[@class="terminalpage-right"]//ul/li')[0].extract()
    # co_url = response.xpath('//div[@class="terminalpage-right"]//div[@class="company-box"]//ul/li[4]/strong/a/@href').extract()[0]  # five <li> elements means the company has a website
    co_url = ""
    i = len(li_count)
    if i == 5:
        # print response.body.decode('utf-8')
        # the company link's href is empty
        co_url = response.xpath('//ul[@class="terminal-ul clearfix terminal-company mt20"]/li[4]/strong/a/text()')[0].extract()
    else:
        co_url = "this company has no website"
    '''
    # the company's profile URL on zhilian
    co_url = response.xpath(
        '//div[@class="terminalpage-right"]//p[@class="company-name-t"]//a/@href'
    )[0].extract().encode('utf-8')
    # print co_url
    # populate the item
    item['name'] = name
    item['welfare'] = welfare
    item['salary'] = salary
    item['area'] = area
    item['exp'] = exp
    item['edu'] = edu
    item['num'] = num
    item['time'] = time
    item['local'] = local
    item['info'] = info
    item['co_name'] = co_name
    item['co_type'] = co_type
    item['co_url'] = co_url
    item['spider_name'] = 'zhilian'
    item['otherq'] = ''
    # numeric id that identifies the posting
    try:
        item['target_id'] = get_num(
            response.xpath("//link[@rel='alternate']/@href").extract()[0])
    except Exception:
        item['target_id'] = ''
    return item
def detail_parse(self, response):
    # return the first element if it exists, otherwise ''
    ifexists = lambda x: x[0] if x else ''
    job = Job51Item()
    # job title
    job['name'] = response.xpath(
        '//div[@class="tHeader tHjob"]//h1//text()').extract()[0]
    # company name
    job['co_name'] = response.xpath(
        '//p[@class="cname"]/a//text()').extract()[0]
    # area
    job['area'] = response.xpath(
        '//div[@class="tHeader tHjob"]//span/text()').extract()[0]
    # salary
    job['salary'] = ifexists(
        response.xpath(
            '//div[@class="tHeader tHjob"]//strong/text()').extract())
    # all requirements; anything unrecognised is collected into otherq
    otherq = ''
    all_require = response.xpath(
        '//div[@class="tBorderTop_box bt"]//div[@class="t1"]/span/text()'
    ).extract()
    for require in all_require:
        if '经验'.decode('utf8') in require:    # "experience"
            job['exp'] = require
        elif require in self.edu_type:
            job['edu'] = require
        elif '人'.decode('utf8') in require:    # "people" -> headcount
            job['num'] = require
        elif '发布'.decode('utf8') in require:  # "published" -> date
            job['time'] = require
        else:
            otherq = otherq + require + ' '
    job['otherq'] = otherq
    # benefits
    welfare = ' '
    fuli = response.xpath(
        '//div[@class="tBorderTop_box bt"]//p[@class="t2"]/span/text()'
    ).extract()
    for f in fuli:
        welfare = welfare + f + ' '
    job['welfare'] = welfare
    # job description: drop boilerplate phrases and strip whitespace.
    # Filter with a comprehension -- calling remove() while iterating over
    # the same list skips elements, and a bare i.strip() discards its result.
    posi_info = response.xpath(
        '//div[@class="tBorderTop_box"][1]//div[@class="bmsg job_msg inbox"]//text()'
    ).extract()
    posi_info = [i.strip() for i in posi_info if i not in self.unrequire]
    job['info'] = ' '.join(posi_info)
    # work address
    job['local'] = ifexists(
        response.xpath(
            '//div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]//p/text()[2]'
        ).extract())
    # company website
    job['co_url'] = response.xpath(
        '//div[@class="tHeader tHjob"]//p[@class="cname"]/a/@href'
    ).extract()[0]
    # company type: re-join the '|'-separated segments with whitespace trimmed
    str1 = response.xpath(
        '//div[@class="tHeader tHjob"]//p[@class="msg ltype"]/text()'
    ).extract()[0]
    strtotal = ''
    strlist = str1.split('|')
    for s in strlist:
        strtotal = strtotal + s.strip() + '|'
    job['co_type'] = strtotal
    job['spider_name'] = 'job51'
    # numeric id that identifies the posting ('51job' is stripped first so
    # its digits don't pollute the match)
    try:
        job['target_id'] = get_num(response.url.replace('51job', ''))
    except Exception:
        job['target_id'] = ''
    yield job
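# Why the description filter above uses a comprehension instead of calling
# remove() inside the loop: removing while iterating shifts the remaining
# elements, so their neighbours get skipped. Illustration:

items = ['a', 'x', 'x', 'b']
for i in items:
    if i == 'x':
        items.remove(i)
print items  # -> ['a', 'x', 'b']: the second 'x' was skipped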
def detail_parse(self, response):
    # panduan ("check"): return the first element if it exists, else ''
    panduan = lambda x: x[0] if x else ''
    job = LiepinItem()
    # '/a/' style detail pages
    if '/a/' in response.url:
        # job title
        job['name'] = response.xpath(
            '//div[@class="title-info"]/h1/text() | //div[@class="title-info "]/h1/text()'
        ).extract()[0]
        # company name
        job['co_name'] = response.xpath(
            '//div[@class="title-info"]/h3/text() | //div[@class="title-info "]/h3/text()'
        ).extract()[0].strip()
        # area
        job['area'] = response.xpath(
            '//div[@class="title"]//p[@class="basic-infor"]/span/text()'
        ).extract()[0]
        # salary
        job['salary'] = response.xpath(
            '//div[@class="title"]//p[@class="job-main-title"]/text()'
        ).extract()[0].strip()
        # experience
        job['exp'] = response.xpath(
            '//div[@class="resume clearfix"]/span[2]/text()').extract()[0]
        # education
        job['edu'] = response.xpath(
            '//div[@class="resume clearfix"]/span[1]/text()').extract()[0]
        # headcount (not published on this page type)
        job['num'] = '0'
        # publish time
        job['time'] = response.xpath(
            '//div[@class="job-title-left"]/p/time/text()').extract()[0].strip()
        # other requirements
        otherqlist = response.xpath(
            '//div[@class="resume clearfix"]/span[position()>2]/text()'
        ).extract()
        job['otherq'] = ','.join(otherqlist)
        # benefits
        fulis = []
        fuliList = response.xpath(
            '//div[@class="job-main main-message"][3]//ul/li')
        for fuli in fuliList:
            fulis.append(fuli.xpath('./span/text()').extract()[0] + ':' +
                         fuli.xpath('./text()').extract()[0])
        job['welfare'] = ','.join(fulis)
        # job description
        infolist = response.xpath(
            '//div[@class="job-main main-message"][1]/div[@class="content content-word"]/text()'
        ).extract()
        job['info'] = ' '.join(infolist)
        # work address (not published on this page type)
        job['local'] = ''
        # company website (not published on this page type)
        job['co_url'] = ''
        # company type
        job['co_type'] = response.xpath(
            '//div[@class="job-main main-message"][2]//ul/li[5]/text()'
        ).extract()[0]
    # '/job/' style detail pages
    elif '/job/' in response.url:
        # job title
        job['name'] = response.xpath(
            '//div[@class="title-info"]/h1/text()').extract()[0]
        # company name
        job['co_name'] = response.xpath(
            '//div[@class="title-info"]/h3/a/text()').extract()[0].strip()
        # area
        job['area'] = response.xpath(
            '//div[@class="job-item"]//p[@class="basic-infor"]/span/a/text()'
        ).extract()[0]
        # salary
        job['salary'] = response.xpath(
            '//div[@class="job-item"]//p[@class="job-item-title"]//text()'
        ).extract()[0].strip()
        # experience
        job['exp'] = response.xpath(
            '//div[@class="job-qualifications"]/span[2]/text()').extract()[0]
        # education
        job['edu'] = response.xpath(
            '//div[@class="job-qualifications"]/span[1]/text()').extract()[0]
        # headcount (not published on this page type)
        job['num'] = ''
        # publish time
        job['time'] = response.xpath(
            '//div[@class="job-title-left"]/p/time/text()').extract()[0].strip()
        # other requirements
        otherqlist = response.xpath(
            '//div[@class="job-qualifications"]/span[position()>2]/text()'
        ).extract()
        job['otherq'] = ','.join(otherqlist)
        # benefits
        welist = response.xpath(
            '//div[@class="tag-list"]/span/text()').extract()
        job['welfare'] = ','.join(welist)
        # job description
        infolist = response.xpath(
            '//div[@class="content content-word"]//text()').extract()
        job['info'] = ' '.join(infolist)
        # work address: keep the text after the colon
        job['local'] = response.xpath(
            '//div[@class="company-infor"]//ul[@class="new-compintro"]//li[3]//text()'
        ).extract()[0].split(':'.decode('utf8')).pop()
        # company website
        job['co_url'] = response.xpath(
            '//div[@class="company-infor"]//div[@class="company-logo"]//p/a/@href'
        ).extract()[0]
        # company type: the value is sometimes a link, sometimes plain text
        if response.xpath(
                '//ul[@class="new-compintro"]/li[1]/a/text()').extract():
            job['co_type'] = response.xpath(
                '//ul[@class="new-compintro"]/li[1]/a/text()').extract()[0]
        else:
            job['co_type'] = response.xpath(
                '//ul[@class="new-compintro"]/li[1]/text()').extract()[0]
    # '/cjob/' style detail pages
    else:
        # job title
        job['name'] = response.xpath(
            '//div[@class="job-title"]/h1/text()').extract()[0]
        # company name
        job['co_name'] = response.xpath(
            '//div[@class="job-title"]/h2/text()').extract()[0]
        # area
        job['area'] = response.xpath(
            '//div[@class="job-main"]/p[@class="job-main-tip"]/span[1]/text()[2]'
        ).extract()[0]
        # salary
        job['salary'] = response.xpath(
            '//div[@class="job-main"]/div[@class="job-main-title"]/strong/text()'
        ).extract()[0]
        # experience
        job['exp'] = panduan(response.xpath(
            '//div[@class="job-main"]/p[@class="job-qualifications"]/span[2]/text()'
        ).extract())
        # education
        job['edu'] = panduan(response.xpath(
            '//div[@class="job-main"]/p[@class="job-qualifications"]/span[1]/text()'
        ).extract())
        # headcount (not published on this page type)
        job['num'] = ''
        # publish time
        job['time'] = response.xpath(
            '//p[@class="job-main-tip"]/span[2]/text()').extract()[0].strip()
        # other requirements (not published on this page type)
        job['otherq'] = ''
        # benefits: join the whole extracted list (wrapping it in panduan
        # would hand ','.join a single string and comma-separate its characters)
        wellist = response.xpath(
            '//p[@class="job-labels"]/span/text()').extract()
        job['welfare'] = ','.join(wellist)
        # job description
        job['info'] = response.xpath(
            '//div[@class="job-info"]//div[@class="job-info-content"]/text()'
        ).extract()[0].strip()
        # work address
        job['local'] = response.xpath(
            '//div[@class="side-box right-post-map"]/div[@class="side-content"]/p/text()'
        ).extract()[0]
        # company website (not published on this page type)
        job['co_url'] = ''
        # company type (not published on this page type)
        job['co_type'] = ''
    # spider name
    job['spider_name'] = 'liepin'
    # numeric id that identifies the posting
    try:
        job['target_id'] = get_num(
            response.xpath("//link[@rel='alternate']/@href").extract()[0])
    except Exception:
        job['target_id'] = ''
    yield job