# Example 1
    def page1_parse(self, response):
        position = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
        if position is not None:
            for posi in position:
                posi_url = posi.xpath('./p//a/@href').extract()[0]
                print '===========>', posi_url
                posi_url = self.rule_key + crate_params(
                    '1', (self.get_job(posi_url), ))
                yield scrapy.Request(url=posi_url,
                                     callback=self.detail_parse,
                                     priority=3)

            page = int(
                response.xpath('//div[@class="rt"][2]/text()').extract()
                [1].split('/')[1].strip())
            if page != 1 and MORE_PAGE:
                for p in range(2, page + 1):
                    #next_url = response.url.replace('1.html', str(p) + '.html')
                    #yield scrapy.Request(url=next_url,callback=self.pages_parse)
                    qs = get_qs(response.url)
                    eduid = qs['degreefrom']
                    cityid = get_num(response.url.split(',')[0])
                    salaryid = response.url.split(',')[5]
                    full_url = self.rule_key + crate_params(
                        '0', (cityid, salaryid, '%2B', str(p), eduid, '%2C'))
                    yield scrapy.Request(url=full_url,
                                         callback=self.pages_parse,
                                         priority=1)
 def second_parse(self, response):
     #职位链接列表
     posi_list = response.xpath(
         '//div[@class="job-info"]/h3/a/@href').extract()
     if posi_list:
         for posi in posi_list:
             posi = self.rule_key + crate_params('2', (get_num(posi), ))
             yield scrapy.Request(posi,
                                  callback=self.detail_parse,
                                  priority=3)
     else:
         print "=====>i didnt get any next page"
    def first_parse(self, response):
        #职位链接列表

        #posi_list = response.xpath('//div[@class="sojob-result "]//div[@class="job-info"]/h3/a/@href').extract()
        posi_list = response.xpath(
            '//div[@class="job-info"]/h3/a/@href').extract()
        if posi_list:

            for posi in posi_list:
                posi = self.rule_key + crate_params('2', (get_num(posi), ))
                yield scrapy.Request(posi,
                                     callback=self.detail_parse,
                                     priority=3)

            # 第一种:通过寻找下一页链接,循环每一页
            # next_page = response.xpath('//div[@class="sojob-result "]//div[@class="pagerbar"]/a[last()-1]/@href').extract()
            # if next_page and 'javascript:;' not in next_page:
            #     nextPage = 'https://www.liepin.com' + next_page[0]
            #     print 'next:' + nextPage
            #     yield scrapy.Request(nextPage,callback=self.first_parse)

            # 第二种:通过寻找尾页页码,循环此页码生成每一页请求
            #last_page = response.xpath('//div[@class="sojob-result "]//div[@class="pagerbar"]/a[last()]/@href').extract()

            last_page = response.xpath(
                '//div[@class="pagerbar"]/a[last()]/@href').extract()

            if 'javascript:;' not in last_page and MORE_PAGE:
                #需要判断get参数 industries
                qs = get_qs(response.url)

                pageNum = int(last_page[0].split('=').pop())
                for num in range(1, pageNum + 1):
                    cityId = qs['dqs']
                    salaId = qs['salary']
                    sclaId = qs['compscale']

                    if qs.has_key('industries'):
                        indusId = qs['industries']
                        full_url = self.rule_key + crate_params(
                            '0', (cityId, indusId, salaId, sclaId, str(num)))
                        yield scrapy.Request(full_url,
                                             callback=self.second_parse,
                                             priority=1)
                    else:
                        full_url = self.rule_key + crate_params(
                            '1', (cityId, salaId, sclaId, str(num)))
                        yield scrapy.Request(full_url,
                                             callback=self.second_parse,
                                             priority=1)

        else:
            print "=====>i didnt get any  first page"
# Example 4
    def parse_list(self, response):
        #职位链接列表
        posi_list = response.xpath(
            '//div[@class="info-primary"]/h3/a/@href').extract()

        if posi_list:
            print "=====>i get page list"
            for posi in posi_list:
                posi = self.rule_key + crate_params('2', (get_num(posi), ))
                yield scrapy.Request(posi,
                                     callback=self.parse_item,
                                     priority=3)
        else:
            print "=====>i didnt get any page list"
# Example 5
    def parse_list(self, response):

        # 获取当前招聘详情的url列表
        recruit_url = response.xpath(
            '//div[@class="newlist_list_content"]//td[@class="zwmc"]//a/@href'
        ).extract()
        for url in recruit_url:
            # 筛选排除校园招聘 的url
            if 'jobs.zhaopin.com' in url:
                # print '详情招聘页链接:',url
                # 请求详情页信息
                posi = self.rule_key + crate_params('1', (get_num(url), ))
                #yield scrapy.Request(url,callback=self.parse_detail,priority=2)
                yield scrapy.Request(posi,
                                     callback=self.parse_detail,
                                     priority=3)

        # 获取下一页是否存在url
        next_page = response.xpath(
            '//div[@class="pagesDown"]//li/a[@class="next-page"]/@href'
        ).extract()
        # 有下一页分页,再次请求当前parse_list方法请求,再次解析(递归,知道 没有下一页为止)
        print 'nex page==============>', next_page
        if next_page and MORE_PAGE:
            qs = get_qs(str(next_page[0]))
            city = unicode(qs['jl'], "utf-8")
            ct = qs['ct']
            et = qs['et']
            el = qs['el']
            p = qs['p']

            full_url = self.rule_key + crate_params(
                '0', (city, str(ct), str(et), str(el), str(p)))
            yield scrapy.Request(url=full_url,
                                 callback=self.parse_list,
                                 priority=1)
# Example 6
    def parse_item(self, response):
        """Extract a lagou.com job posting page into a LagouItem.

        Yields one populated item, or nothing when the page carries the
        offline marker ('已下线') in its body.
        """
        item = LagouItem()
        htmls = response.body
        if '已下线' not in htmls:
            # Company name
            co_name = response.xpath(
                "//div[@class='company']/text()").extract()[0].encode('utf-8')
            # Job title
            name = response.xpath("//div[@class='job-name']/span/text()"
                                  ).extract()[0].encode('utf-8')
            # Salary
            salary = response.xpath(
                "//dd[@class='job_request']//span[1]/text()").extract(
                )[0].encode('utf-8')
            # Area / district, '/' separators stripped
            area = response.xpath("//dd[@class='job_request']//span[2]/text()"
                                  ).extract()[0].encode('utf-8').replace(
                                      '/', '')
            # Required years of experience
            exp = response.xpath("//dd[@class='job_request']//span[3]/text()"
                                 ).extract()[0].encode('utf-8').replace(
                                     '/', '')
            # Required education
            edu = response.xpath("//dd[@class='job_request']//span[4]/text()"
                                 ).extract()[0].encode('utf-8').replace(
                                     '/', '')
            # Publish time: first token before the space
            time = response.xpath(
                "//p[@class='publish_time']/text()").extract()[0].split(' ')[0]
            time = getVal(time)
            # Job description; getVal presumably normalizes an empty result
            # to '' (the check below relies on that) — TODO confirm against
            # its definition.
            info = response.xpath("//dd[@class='job_bt']//p/text()").extract()
            info = getVal(info)
            if info != '':
                info = '\n'.join(info).encode('utf-8')

            # Work address: breadcrumb parts joined, last element dropped
            local = ','.join(
                response.xpath("//div[@class='work_addr']/a/text()").extract()
                [:-1]).encode('utf-8')
            # Company benefits
            welfare = response.xpath('//dd[@class="job-advantage"]//p/text()'
                                     ).extract()[0].encode('utf-8')
            # Company website
            co_url = response.xpath('//dl[@id="job_company"]//li/a/@href'
                                    ).extract()[0].encode('utf-8')
            # Number of openings: not published by lagou, fixed at '0'
            num = '0'
            # Company category
            co_type = response.xpath('//dl[@id="job_company"]//li[1]/text()'
                                     ).extract()[1].encode('utf-8')
            item['name'] = name
            item['co_name'] = co_name
            item['area'] = area
            item['salary'] = salary
            item['exp'] = exp
            item['edu'] = edu
            item['num'] = num
            item['time'] = time
            item['welfare'] = welfare
            item['info'] = info
            item['local'] = local
            item['co_url'] = co_url
            item['co_type'] = co_type
            item['spider_name'] = 'lagou'
            item['otherq'] = ''

            # Numeric posting id parsed from the URL; '' when parsing fails.
            try:
                item['target_id'] = get_num(response.url)
            except:
                item['target_id'] = ''

            yield item
# Example 7
    def parse_item(self, response):
        """Extract a BOSS zhipin job posting page into a BossItem.

        Yields one populated BossItem.
        """
        item = BossItem()

        # Company name
        co_name = response.xpath(
            "//div[@class='job-sec']/div[@class='name']/text()").extract(
            )[0].encode('utf-8')

        # Job title
        name = response.xpath(
            "//div[@class='name'][1]/h1/text()").extract()[0].encode('utf-8')
        # Salary
        salary = response.xpath(
            "//div[@class='name'][1]/span/text()").extract()[0].encode('utf-8')

        # Sub-title lines, each a 'label：value' pair split on the fullwidth
        # colon. Assumes the page always renders at least three entries in
        # area/experience/education order — TODO confirm.
        subTitle = response.xpath(
            "//div[@class='job-primary detail-box']/div[2]/p/text()").extract(
            )
        # Area
        area = subTitle[0].split(u'：')[1].encode('utf-8')
        # Years of experience
        exp = subTitle[1].split(u'：')[1].encode('utf-8')
        # Education
        edu = subTitle[2].split(u'：')[1].encode('utf-8')

        # Publish time, '发布于' ("published on") prefix stripped
        time = response.xpath(
            "//span[@class='time'][1]/text()").extract()[0].replace(
                u'发布于', '')
        time = getVal(time)

        # Job description; getVal presumably normalizes an empty result to
        # '' (the check below relies on that) — TODO confirm.
        info = response.xpath(
            "//div[@class='job-sec']/div[@class='text']/text()").extract()
        info = getVal(info)
        if info != '':
            info = '\n'.join(info).encode('utf-8')

        # Work address
        local = response.xpath("//div[@class='location-address']/text()"
                               ).extract()[0].encode('utf-8')

        # Benefits: not available on this layout
        welfare = ''
        # Company page link
        co_url = response.xpath(
            "//a[@ka='job-cominfo']/@href").extract()[0].encode('utf-8')
        # Number of openings: not published, fixed at '0'
        num = '0'
        # Company category
        co_type = response.xpath(
            "//li[@class='company-type']/text()").extract()[0].encode('utf-8')

        # Numeric posting id from the alternate link; '' when parsing fails.
        try:
            target_id = get_num(
                response.xpath("//link[@rel='alternate']/@href").extract()[0])
        except:
            target_id = ''

        item['name'] = name
        item['co_name'] = co_name
        item['area'] = area
        item['salary'] = salary
        item['exp'] = exp
        item['edu'] = edu
        item['num'] = num
        item['time'] = time
        item['welfare'] = welfare
        item['info'] = info
        item['local'] = local
        item['co_url'] = co_url
        item['co_type'] = co_type
        item['spider_name'] = 'boss'
        item['otherq'] = ''
        item['target_id'] = target_id

        yield item
# Example 8
    def parse_detail(self, response):
        """Extract a zhilian (zhaopin) posting detail page into a ZhilianItem.

        Returns one populated ZhilianItem.
        """
        item = ZhilianItem()
        # Job title taken from the breadcrumb trail
        name = response.xpath(
            '//div[@class="bread_crumbs"]//a[3]/strong/text()')[0].extract(
            ).encode('utf-8')
        # Company benefits: all tag spans joined with spaces
        welfare_list = response.xpath(
            '//div[@class="fixed-inner-box"]//span/text()').extract()
        welfare = " ".join(welfare_list).encode("utf-8")

        # Left info panel. NOTE(review): the variables below are bound only
        # inside this loop — if the panel is ever missing, the item
        # assignments further down raise NameError; confirm the panel always
        # exists on these pages.
        left_info = response.xpath('//div[@class="terminalpage-left"]')
        for i in left_info:
            salary = i.xpath('.//ul/li[1]/strong/text()')[0].extract().encode(
                'utf-8')  # salary

            area_city = i.xpath(
                './ul/li[2]/strong/a/text()').extract()[0].encode(
                    'utf-8')  # work area: city
            area_qu = i.xpath('./ul/li[2]/strong/text()').extract()  # work area: district
            area_qu = getVal(area_qu)
            if area_qu != "":
                area_qu = area_qu[0].encode("utf-8")
            area = area_city + area_qu  # full work area

            exp = i.xpath('.//ul/li[5]/strong/text()')[0].extract().encode(
                'utf-8')  # experience
            edu = i.xpath('.//ul/li[6]/strong/text()')[0].extract().encode(
                'utf-8')  # headcount of education requirement
            num = i.xpath('.//ul/li[7]/strong/text()')[0].extract().encode(
                'utf-8')  # number of openings
            time = i.xpath(
                './/ul/li[3]/strong/span/text()')[0].extract().encode(
                    'utf-8')  # publish date

        # Work address, newlines stripped below
        local = response.xpath(
            '//div[@class="tab-inner-cont"]//h2/text()')[0].extract().encode(
                'utf-8')
        local = local.replace('\n', '').strip()

        # Job description paragraphs, flattened to a single line
        info = response.xpath(
            '//div[@class="tab-inner-cont"]//p/text()').extract()
        info = '\n'.join(info).replace('\n', '').strip()
        co_name = response.xpath(
            '//div[@class="terminalpage-right"]//p[@class="company-name-t"]//a/text()'
        )[0].extract().encode('utf-8')  # company name
        co_type = response.xpath(
            '//div[@class="terminalpage-right"]//ul/li[2]/strong/text()'
        )[0].extract().encode('utf-8')  # company category (company nature)
        '''
        li_count = response.xpath('//div[@class="terminalpage-right"]//ul/li')[0].extract()
        # co_url = response.xpath('//div[@class="terminalpage-right"]//div[@class="company-box"]//ul/li[4]/strong/a/@href').extract()[0]
        # 有五个li,有网址
        co_url = ""
        i = len(li_count)
        if i == 5:
            # print response.body.decode('utf-8')
            # 公司链接 href为空
            co_url = response.xpath('//ul[@class="terminal-ul clearfix terminal-company mt20"]/li[4]/strong/a/text()')[0].extract()
        else:
            co_url = "该公司没有网站"
        '''
        co_url = response.xpath(
            '//div[@class="terminalpage-right"]//p[@class="company-name-t"]//a/@href'
        )[0].extract().encode('utf-8')  # company profile URL on zhilian
        # Populate the item
        item['name'] = name
        item['welfare'] = welfare
        item['salary'] = salary
        item['area'] = area
        item['exp'] = exp
        item['edu'] = edu
        item['num'] = num
        item['time'] = time
        item['local'] = local
        item['info'] = info
        item['co_name'] = co_name
        item['co_type'] = co_type
        item['co_url'] = co_url
        # spider name tag for downstream pipelines
        item['spider_name'] = 'zhilian'
        item['otherq'] = ''
        # Numeric posting id from the alternate link; '' when parsing fails
        try:
            item['target_id'] = get_num(
                response.xpath("//link[@rel='alternate']/@href").extract()[0])
        except:
            item['target_id'] = ''
        return item
# Example 9
    def detail_parse(self, response):
        """Extract a 51job posting page into a Job51Item.

        Yields one populated Job51Item.
        """
        # First matched element, or '' when the xpath found nothing.
        ifexists = lambda x: x[0] if x else ''
        job = Job51Item()
        # Job title
        job['name'] = response.xpath(
            '//div[@class="tHeader tHjob"]//h1//text()').extract()[0]
        # Company name
        job['co_name'] = response.xpath(
            '//p[@class="cname"]/a//text()').extract()[0]
        # Area
        job['area'] = response.xpath(
            '//div[@class="tHeader tHjob"]//span/text()').extract()[0]
        # Salary (missing on some postings)
        job['salary'] = ifexists(
            response.xpath(
                '//div[@class="tHeader tHjob"]//strong/text()').extract())
        # Classify each requirement tag as experience / education /
        # headcount / publish date; everything else accumulates in 'otherq'.
        otherq = ''
        all_require = response.xpath(
            '//div[@class="tBorderTop_box bt"]//div[@class="t1"]/span/text()'
        ).extract()
        for require in all_require:
            if '经验'.decode('utf8') in require:
                job['exp'] = require
            elif require in self.edu_type:
                job['edu'] = require
            elif '人'.decode('utf8') in require:
                job['num'] = require
            elif '发布'.decode('utf8') in require:
                job['time'] = require
            else:
                otherq = otherq + require + ' '
        job['otherq'] = otherq
        # Benefits (leading/trailing space layout kept as before)
        welfare = ' '
        fuli = response.xpath(
            '//div[@class="tBorderTop_box bt"]//p[@class="t2"]/span/text()'
        ).extract()
        for f in fuli:
            welfare = welfare + f + ' '
        job['welfare'] = welfare
        # Job description. Fix: the original removed elements from posi_info
        # while iterating the same list (which skips entries) and discarded
        # the result of i.strip(); build a filtered, stripped copy instead.
        posi_info = response.xpath(
            '//div[@class="tBorderTop_box"][1]//div[@class="bmsg job_msg inbox"]//text()'
        ).extract()
        posi_info = [i.strip() for i in posi_info if i not in self.unrequire]
        job['info'] = ' '.join(posi_info)
        # Work address
        job['local'] = ifexists(
            response.xpath(
                '//div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]//p/text()[2]'
            ).extract())
        # Company website
        job['co_url'] = response.xpath(
            '//div[@class="tHeader tHjob"]//p[@class="cname"]/a/@href'
        ).extract()[0]
        # Company type: '|'-separated segments re-joined with segments
        # stripped (trailing '|' kept, as before).
        str1 = response.xpath(
            '//div[@class="tHeader tHjob"]//p[@class="msg ltype"]/text()'
        ).extract()[0]
        strtotal = ''
        for s in str1.split('|'):
            strtotal = strtotal + s.strip() + '|'
        job['co_type'] = strtotal
        job['spider_name'] = 'job51'
        # Numeric posting id from the URL; '51job' removed first so get_num
        # does not pick up the digits in the domain. '' when parsing fails.
        try:
            job['target_id'] = get_num(response.url.replace('51job', ''))
        except Exception:
            job['target_id'] = ''

        yield job
    def detail_parse(self, response):
        """Extract a liepin.com posting into a LiepinItem.

        Liepin serves three page layouts ('/a/', '/job/', otherwise
        '/cjob/'); each branch fills the same item fields from its own
        selectors. Yields one populated LiepinItem.
        """
        # First matched element, or '' when the xpath found nothing.
        panduan = lambda x: x[0] if x else ''
        job = LiepinItem()
        # '/a/'-style detail page
        if '/a/' in response.url:
            # Job title
            job['name'] = response.xpath(
                '//div[@class="title-info"]/h1/text() | //div[@class="title-info "]/h1/text()'
            ).extract()[0]
            # Company name
            job['co_name'] = response.xpath(
                '//div[@class="title-info"]/h3/text() | //div[@class="title-info "]/h3/text()'
            ).extract()[0].strip()
            # Area
            job['area'] = response.xpath(
                '//div[@class="title"]//p[@class="basic-infor"]/span/text()'
            ).extract()[0]
            # Salary
            job['salary'] = response.xpath(
                '//div[@class="title"]//p[@class="job-main-title"]/text()'
            ).extract()[0].strip()
            # Experience
            job['exp'] = response.xpath(
                '//div[@class="resume clearfix"]/span[2]/text()').extract()[0]
            # Education
            job['edu'] = response.xpath(
                '//div[@class="resume clearfix"]/span[1]/text()').extract()[0]
            # Number of openings: not published on this layout
            job['num'] = '0'
            # Publish time
            job['time'] = response.xpath(
                '//div[@class="job-title-left"]/p/time/text()').extract(
                )[0].strip()
            # Other requirements
            otherqlist = response.xpath(
                '//div[@class="resume clearfix"]/span[position()>2]/text()'
            ).extract()
            job['otherq'] = ','.join(otherqlist)
            # Benefits as 'label:value' pairs
            fulis = []
            fuliList = response.xpath(
                '//div[@class="job-main main-message"][3]//ul/li')
            for fuli in fuliList:
                fulis.append(
                    fuli.xpath('./span/text()').extract()[0] + ':' +
                    fuli.xpath('./text()').extract()[0])
            job['welfare'] = ','.join(fulis)
            # Job description
            infolist = response.xpath(
                '//div[@class="job-main main-message"][1]/div[@class="content content-word"]/text()'
            ).extract()
            job['info'] = ' '.join(infolist)
            # Work address: not available on this layout
            job['local'] = ''
            # Company website: not available on this layout
            job['co_url'] = ''
            # Company type
            job['co_type'] = response.xpath(
                '//div[@class="job-main main-message"][2]//ul/li[5]/text()'
            ).extract()[0]
        # '/job/'-style detail page
        elif '/job/' in response.url:
            # Job title
            job['name'] = response.xpath(
                '//div[@class="title-info"]/h1/text()').extract()[0]
            # Company name
            job['co_name'] = response.xpath(
                '//div[@class="title-info"]/h3/a/text()').extract()[0].strip()
            # Area
            job['area'] = response.xpath(
                '//div[@class="job-item"]//p[@class="basic-infor"]/span/a/text()'
            ).extract()[0]
            # Salary
            job['salary'] = response.xpath(
                '//div[@class="job-item"]//p[@class="job-item-title"]//text()'
            ).extract()[0].strip()
            # Experience
            job['exp'] = response.xpath(
                '//div[@class="job-qualifications"]/span[2]/text()').extract(
                )[0]
            # Education
            job['edu'] = response.xpath(
                '//div[@class="job-qualifications"]/span[1]/text()').extract(
                )[0]
            # Number of openings: not published on this layout
            job['num'] = ''
            # Publish time
            job['time'] = response.xpath(
                '//div[@class="job-title-left"]/p/time/text()').extract(
                )[0].strip()
            # Other requirements
            otherqlist = response.xpath(
                '//div[@class="job-qualifications"]/span[position()>2]/text()'
            ).extract()
            job['otherq'] = ','.join(otherqlist)
            # Benefits
            welist = response.xpath(
                '//div[@class="tag-list"]/span/text()').extract()
            job['welfare'] = ','.join(welist)
            # Job description
            infolist = response.xpath(
                '//div[@class="content content-word"]//text()').extract()
            job['info'] = ' '.join(infolist)
            # Work address: text after the fullwidth colon
            job['local'] = response.xpath(
                '//div[@class="company-infor"]//ul[@class="new-compintro"]//li[3]//text()'
            ).extract()[0].split('：'.decode('utf8')).pop()
            # Company website
            job['co_url'] = response.xpath(
                '//div[@class="company-infor"]//div[@class="company-logo"]//p/a/@href'
            ).extract()[0]
            # Company type: prefer the linked form, fall back to plain text
            if response.xpath(
                    '//ul[@class="new-compintro"]/li[1]/a/text()').extract():
                job['co_type'] = response.xpath(
                    '//ul[@class="new-compintro"]/li[1]/a/text()').extract()[0]
            else:
                job['co_type'] = response.xpath(
                    '//ul[@class="new-compintro"]/li[1]/text()').extract()[0]
        # otherwise a '/cjob/'-style page
        else:
            # Job title
            job['name'] = response.xpath(
                '//div[@class="job-title"]/h1/text()').extract()[0]
            # Company name
            job['co_name'] = response.xpath(
                '//div[@class="job-title"]/h2/text()').extract()[0]
            # Area
            job['area'] = response.xpath(
                '//div[@class="job-main"]/p[@class="job-main-tip"]/span[1]/text()[2]'
            ).extract()[0]
            # Salary
            job['salary'] = response.xpath(
                '//div[@class="job-main"]/div[@class="job-main-title"]/strong/text()'
            ).extract()[0]
            # Experience
            job['exp'] = panduan(
                response.xpath(
                    '//div[@class="job-main"]/p[@class="job-qualifications"]/span[2]/text()'
                ).extract())
            # Education
            job['edu'] = panduan(
                response.xpath(
                    '//div[@class="job-main"]/p[@class="job-qualifications"]/span[1]/text()'
                ).extract())
            # Number of openings: not published on this layout
            job['num'] = ''
            # Publish time
            job['time'] = response.xpath(
                '//p[@class="job-main-tip"]/span[2]/text()').extract(
                )[0].strip()
            # Other requirements: not available on this layout
            job['otherq'] = ''
            # Benefits. Fix: the original applied panduan() first and then
            # ','.join() over the resulting single string, which comma-
            # separated its characters; join the extracted list directly,
            # matching the '/job/' branch.
            job['welfare'] = ','.join(
                response.xpath(
                    '//p[@class="job-labels"]/span/text()').extract())
            # Job description
            job['info'] = response.xpath(
                '//div[@class="job-info"]//div[@class="job-info-content"]/text()'
            ).extract()[0].strip()
            # Work address
            job['local'] = response.xpath(
                '//div[@class="side-box right-post-map"]/div[@class="side-content"]/p/text()'
            ).extract()[0]
            # Company website: not available on this layout
            job['co_url'] = ''
            # Company type: not available on this layout
            job['co_type'] = ''

        # Spider name tag for downstream pipelines
        job['spider_name'] = 'liepin'

        # Numeric posting id from the alternate link; '' when parsing fails
        try:
            job['target_id'] = get_num(
                response.xpath("//link[@rel='alternate']/@href").extract()[0])
        except Exception:
            job['target_id'] = ''

        yield job