Example #1
    def page_parse(self, response):
        # Pull the pagination state forwarded from the listing page
        page = response.meta['meta_0']
        offset = response.meta['meta_2']
        items = []
        for j in range(0, len(response.xpath("//div[@class='newlist_detail newlist']").extract())):
            item = ZhilianItem()

            # Company name
            item['companyName'] = response.xpath('//form//ul/li[2]/a[1]/text()').extract()[j].replace('\'', '\"')

            # Company type (ownership)
            item['companyNature'] = response.xpath('//form//ul/li[3]/span[2]/text()').extract()[j][5:].replace('\'', '\"')

            # Company size
            item['companyScale'] = response.xpath("//li[@class='newlist_deatil_two']/span[3]/text()").extract()[j][5:].replace('\'', '\"')

            # Link to the detail page
            item['subUrl'] = response.xpath('//form//ul/li[1]/div//@href').extract()[j]

            items.append(item)

        for item in items:
            # Request the detail page, pass the item along in meta,
            # and handle the response with detail_parse()
            print("request: " + item['subUrl'])
            yield scrapy.Request(url=item['subUrl'], meta={'meta_1': item}, callback=self.detail_parse)

        offset += 1
        if offset <= page:
            pos = response.url.rfind('=')
            yield scrapy.Request(url=response.url[0:pos + 1] + str(offset),
                                 meta={'meta_0': page, 'meta_2': offset},
                                 callback=self.page_parse)
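
Every example on this page populates a ZhilianItem, whose definition (normally in the project's items.py) is not shown here. A minimal sketch of what it could look like for Example #1 — the field names are assumptions taken from the keys assigned above, and the real project may declare more:

    import scrapy

    class ZhilianItem(scrapy.Item):
        companyName = scrapy.Field()
        companyNature = scrapy.Field()
        companyScale = scrapy.Field()
        subUrl = scrapy.Field()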
Example #2
    def detail_parse(self, response):
        item = ZhilianItem()
        job = response.css(
            '.top-fixed-box .fixed-inner-box .fl h1::text').extract_first()
        company = response.css(
            '.top-fixed-box .fixed-inner-box .fl h2 a::text').extract_first()
        salary = response.css(
            '.terminalpage.clearfix .terminalpage-left > ul > li:nth-child(1) > strong::text'
        ).extract_first()
        educational = response.css(
            '.terminalpage.clearfix .terminalpage-left > ul > li:nth-child(6) > strong::text'
        ).extract_first()
        experience = response.css(
            '.terminalpage.clearfix .terminalpage-left > ul > li:nth-child(5) > strong::text'
        ).extract_first()

        requirement = ''
        for terminalpage in response.css(
                '.terminalpage-main .tab-cont-box .tab-inner-cont > p::text'
        ).extract():
            requirement += terminalpage.replace("\n", "").strip()
        item['job'] = job
        item['company'] = company
        item['salary'] = salary
        item['educational'] = educational
        item['experience'] = experience
        item['requirement'] = requirement

        yield item
Example #3
 def parse_detail(self, response):
     zhilian = ZhilianItem()
     zls = Selector(response)
     zhilian['bussiessname'] = zls.xpath(
         '//div[@class="inner-left fl"]/h2/a/text()').extract()
     zhilian['bussiessurl'] = zls.xpath(
         '//div[@class="inner-left fl"]/h2/a/@href').extract()
     zhilian['jobname'] = zls.xpath(
         '//div[@class="inner-left fl"]/h1/text()').extract()
     bussiessaddress = zls.xpath(
         '//div[contains(@class, "tab-inner-cont")]/h2/text()').extract()
     zhilian['bussiessaddress'] = ''.join(bussiessaddress).strip()
     zhilian['workaddress'] = zls.xpath(
         '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[2]/strong/a/text()'
     ).extract()
     zhilian['money'] = zls.xpath(
         '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[1]/strong/text()'
     ).extract()
     zhilian['education'] = zls.xpath(
         '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[6]/strong/text()'
     ).extract()
     zhilian['releasetime'] = zls.xpath(
         '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[5]/strong/text()'
     ).extract()
     zhilian['num'] = zls.xpath(
         '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[7]/strong/text()'
     ).extract()
     description = zls.xpath(
         '//div[contains(@class, "tab-inner-cont")]/p/text()').extract()
     descriptionStr = ''
     for des in description:
         currentdes = des.strip()
         descriptionStr += currentdes
     zhilian['description'] = descriptionStr
     yield zhilian
Example #4
    def parse(self, response):
        # ---------- Main-page data handling ----------
        print('url=', self.start_urls)
        titles = response.xpath('//a[@name="itemlist-title"]/@title').extract()
        prices = response.xpath('//span[@class ="price_n"]/text()').extract()
        rows = []
        for i in range(0, len(titles)):
            row = {}
            row['title'] = titles[i]
            row['price'] = prices[i]
            rows.append(row)

        print('list=', len(rows))
        print('current request URL', response.request.url)
        print('meta passed from the previous request=', response.meta)
        # Page body, if needed:
        # print('page=', response.body_as_unicode())
        str_url = response.request.url
        str_url = str_url[str_url.index('com/') + 4:str_url.index('-')]
        print('trimmed URL str1=', str_url)

        zlitem = ZhilianItem()
        # Do NOT assign via attribute (zlitem.list = rows), or nothing ends
        # up in the yielded item -- see the sketch after this example
        zlitem['list'] = rows
        # Save one JSON file per page
        zlitem['page'] = str_url + '_data.json'
        zlitem['type'] = '主页'  # type tag: "main page"

        # Save the main-page data:
        # yield hands the item to the engine, which forwards it to pipelines.py;
        # it only queues the work, nothing is written synchronously here
        yield zlitem
        # ------ Fetch sub-pages; there are many, so only take 2 here ------
        # Link to each item's sub-page on the main page
        a_list = response.xpath('//a[contains(@name,"itemlist-picture") and contains(@target,"_blank") and contains(@class,"pic") and contains(@dd_name,"单品图片")]/@href').getall()
        print('sub-page links=', len(a_list))
        a_list = a_list[0:2]
        for i in range(0, len(a_list)):
            # callback=self.childPage: the callback that parses the sub-page
            # meta: data forwarded to that next page
            yield scrapy.Request(url=a_list[i], callback=self.childPage,
                                 meta={'info': '子页面', 'url': a_list[i], 'jsonName': str_url + '子页面'})

        print('sub-links=', a_list)
        # ------ Next page ------
        # e.g. <a href="/pg3-cid4008154.html" title="下一页">下一页</a>  (下一页 = "next page")
        # get() returns e.g. /pg3-cid4008154.html
        url_str = response.xpath('//a[@title="下一页"]/@href').get()
        if not url_str:
            # No next page: stop here
            return
        # Build the URL only after the None check; concatenating None
        # would raise TypeError on the last page
        next_url = 'http://category.dangdang.com' + url_str
        print('next_url=', next_url)
        # Queue the request; the next listing page has the same layout,
        # so the current parse method is reused as the callback.
        # allowed_domains must cover start_urls, otherwise scrapy.Request is filtered out
        yield scrapy.Request(url=next_url, callback=self.parse)
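
The "Do NOT assign via attribute" warning above comes from how scrapy.Item stores data: field values live in an internal dict and must be set with item['field'] syntax. In current Scrapy versions the mistake does not even fail silently; attribute assignment raises AttributeError outright. A quick illustration (the three fields are assumptions matching Example #4):

    import scrapy

    class ZhilianItem(scrapy.Item):
        # assumed field set for this example
        list = scrapy.Field()
        page = scrapy.Field()
        type = scrapy.Field()

    zlitem = ZhilianItem()
    try:
        zlitem.list = []            # attribute assignment is rejected
    except AttributeError as e:
        print(e)                    # Scrapy points you to item['list'] = ... instead
    zlitem['list'] = []             # dict-style assignment is the supported way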
Example #5
 def parse_list(self, response):
     # Query the response selector directly; response.text is a plain
     # string and has no xpath() method. The original quoting also left
     # the class attributes unterminated.
     title = response.xpath(
         '//div[@class="contentpile__content__wrapper__item clearfix"]'
         '//span[@class="contentpile__content__wrapper__item__info__box__jobname__title"]/@title').extract()
     company = response.xpath(
         '//div[@class="contentpile__content__wrapper__item clearfix"]'
         '//a[@class="contentpile__content__wrapper__item__info__box__cname__title company_title"]/text()').extract()
     item = ZhilianItem()
     item['title'] = title
     item['company'] = company
     yield item
Example #6
    def parse_job_info(self, response):
        """
            Parse the job postings on one search-result page.
        :param response:
        :return:
        """
        zl_table_list = response.xpath(
            "//div[@id='newlist_list_content_table']/table[@class='newlist']")
        for zl_table in zl_table_list[1:]:
            # tbody is injected by the browser and absent from the raw HTML,
            # so keep it out of the XPath (right-click -> view source to check)
            # zl_td_list = zl_table.xpath("tr[1]/td")
            # Problem: the td count is not always 5, so positional indexing
            # can raise IndexError:
            # td1 = zl_table_list[0]
            # td2 = zl_table_list[1]
            # td3 = zl_table_list[2]
            # td4 = zl_table_list[3]
            # td5 = zl_table_list[4]

            # Prefer locating elements by XPath attributes over list indices,
            # since indices can run out of range; keep try/except for cases
            # where the failure mode is genuinely unclear.
            # //text() grabs every text node under a tag.
            # extract() turns each selector in the list into text (still a list).
            # extract_first('default') takes the first element as text and
            # returns 'default' when nothing matches.
            td1 = zl_table.xpath(
                "tr/td[@class='zwmc']/div/a//text()").extract()
            # In Python 2 map() returns a list; in Python 3 it returns an
            # iterator, which join() consumes just as well
            td1 = map(str.strip, td1)
            job_name = "".join(td1).replace(",", "/")
            # strip() only trims whitespace at the two ends.
            # The default strings below are Chinese placeholders, e.g.
            # '没有反馈率' = "no feedback rate", '面议' = "negotiable"
            fan_kui_lv = zl_table.xpath("tr/td[@class='fk_lv']/span/text()"
                                        ).extract_first('没有反馈率').strip()
            job_company_name = zl_table.xpath(
                "tr/td[@class='gsmc']/a[1]/text()").extract_first(
                    '没有公司名称').strip()
            job_salary = zl_table.xpath(
                "tr/td[@class='zwyx']/text()").extract_first('面议').strip()
            job_place = zl_table.xpath(
                "tr/td[@class='gzdd']/text()").extract_first('没有工作地点').strip()
            print(job_name, fan_kui_lv, job_company_name, job_salary,
                  job_place)
            item = ZhilianItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = "没有时间"  # placeholder: "no time"
            item['job_type'] = "智联招聘"  # source tag: Zhilian Zhaopin
            item['fan_kui_lv'] = fan_kui_lv
            yield item
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True,
        )
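
The comment block in Example #6 contrasts extract() and extract_first(); a minimal standalone demonstration (the HTML snippet is made up for the demo):

    from scrapy.selector import Selector

    sel = Selector(text='<td class="zwyx">8000-12000</td>')
    print(sel.xpath('//td[@class="zwyx"]/text()').extract())        # ['8000-12000'] -- always a list
    print(sel.xpath('//td[@class="zwyx"]/text()').extract_first())  # '8000-12000'  -- first element as text
    print(sel.xpath('//td[@class="nope"]/text()').extract_first('面议'))  # no match -> the default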
Example #7
 def index_parse(self, response):
     detail_urls = re.findall(r'<td class="zwmc".*?href="(.*?)"',
                              response.text, re.S)
     for detail_url in detail_urls:
         item = ZhilianItem()
         item['detail_url'] = detail_url
         print("detail URL:", detail_url)
         yield item
         yield Request(url=item['detail_url'],
                       callback=self.detail_parse,
                       dont_filter=True)
Example #8
    def parse(self, response):
        title = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[1]/a/text()'
        ).extract()  # job title
        address = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[2]/span[1]/span/em/text()'
        ).extract()  # address
        numbers = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[2]/span[2]/span/em/text()'
        ).extract()  # headcount
        industry = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[3]/span/em/text()'
        ).extract()  # industry
        introduction = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[6]'
        ).extract()  # company introduction, still raw HTML at this point
        company_type = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[1]/span/em/text()'
        ).extract()  # company type
        scale = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[2]/span/em/text()'
        ).extract()  # company size
        job_type = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[4]/span/em/text()'
        ).extract()  # job category
        company = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[2]/span/text()'
        ).extract()  # company name
        time = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[3]/span/span/text()[2]'
        ).extract()  # posting time

        for a, b, c, d, e, f, g, h, i, j in zip(title, address, numbers,
                                                industry, introduction,
                                                company_type, company, scale,
                                                job_type, time):
            # Create a fresh item per row; yielding one shared instance
            # would hand the same mutated object to the pipeline repeatedly
            item = ZhilianItem()
            item['title'] = a
            item['time'] = j
            item['address'] = b
            item['numbers'] = c
            item['industry'] = d
            item['job_type'] = i
            item['company'] = g
            item['company_type'] = f
            item['scale'] = h
            item['introduction'] = re.findall(r'<span>.+?</span>', e)
            yield item
Example #9
 def parse(self, response):
     web_data = response.text
     soup = BeautifulSoup(web_data, 'lxml')
     job_name = soup.select(
         "table.newlist > tr > td.zwmc > div > a:nth-of-type(1)")
     salary = soup.find_all('td', {'class': 'zwyx'})
     for name, salary in zip(job_name, salary):
         item = ZhilianItem()
         item["job_name"] = name.get_text()
         url = name.get('href')
         item["salary"] = salary.get_text()
         yield Request(url=url,
                       meta={"item": item},
                       callback=self.parse_moive,
                       dont_filter=True)
Example #10
 def parse_item(self, response):
     item = ZhilianItem()
     response = Selector(response)
     item['职位月薪'] = response.re('职位月.*>([\u4e00-\u9fa5/\d-]+)')    # monthly salary
     item['发布日期'] = response.re('发布日期.*>([\d: -]+)')             # posting date
     item['工作地点'] = response.re('工作地点.*?>-([\u4e00-\u9fa5]+)')   # work location
     item['工作性质'] = response.re('工作性质.*?>([\u4e00-\u9fa5]+)')    # job nature
     item['工作经验'] = response.re('工作经验.*?>([\u4e00-\u9fa5]+)')    # work experience
     item['最低学历'] = response.re('最低学历.*?>([\u4e00-\u9fa5]+)')    # minimum education
     item['招聘人数'] = response.re('招聘人数.*?>([\u4e00-\u9fa5\d]+)')  # number of openings
     item['职位类别'] = response.re('职位类别.*?>([\u4e00-\u9fa5]+)')    # job category
     #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     #i['name'] = response.xpath('//div[@id="name"]').extract()
     #i['description'] = response.xpath('//div[@id="description"]').extract()
     return item
Example #11
 def childPage(self, response):
     print('------ sub-page start ------')
     title = response.xpath('//div[contains(@class,"name_info") and contains(@ddt-area,"001")]/h1/@title').get()
     price = response.xpath('//p[@id="dd-price"]/text()').get()
     meta = response.meta
     print('sub-page meta=', meta)
     data = {'title': title, 'price': price}
     print('sub-page data=', data)
     zlitem = ZhilianItem()
     zlitem['type'] = '子页'  # type tag: "sub-page"
     zlitem['list'] = data
     zlitem['page'] = meta['jsonName']
     print('sub-page item=', zlitem)
     yield zlitem
     print('------ sub-page end ------')
Example #12
    def parse_zhilian(self, response):
        info = response.xpath('//table[@class="newlist"]')
        print(info)
        for each in info[1:]:
            item = ZhilianItem()
            item["post_name"] = each.xpath(
                './/td[@class="zwmc"]//a[@style="font-weight: bold"]/text()'
            ).extract()[0]
            item["company_name"] = each.xpath(
                './/td[@class="gsmc"]/a/text()').extract()[0]
            item["pay"] = each.xpath(
                './/td[@class="zwyx"]/text()').extract()[0]
            item["location"] = each.xpath(
                './/td[@class="gzdd"]/text()').extract()[0]

            yield item
Example #13
File: zl.py  Project: ZeqiWang/Scrapy-job
    def get_info(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        infos = soup.find('ul', class_='terminal-ul').find_all('li')
        item = ZhilianItem()
        item['zwyx'] = infos[0].find("strong").get_text().encode('utf-8')  # monthly salary
        item['gzdd'] = infos[1].find("strong").find('a').get_text().encode('utf-8')  # work location
        item['gzxz'] = infos[3].find("strong").get_text().encode('utf-8')  # job nature
        item['gzjy'] = infos[4].find("strong").get_text().encode('utf-8')  # work experience
        item['zdxl'] = infos[5].find("strong").get_text().encode('utf-8')  # minimum education
        item['zprs'] = infos[6].find("strong").get_text().encode('utf-8')  # number of openings
        item['zwlb'] = infos[7].find("strong").find('a').get_text().encode('utf-8')  # job category
        item['zwmc'] = soup.find('div', class_='top-fixed-box').find('h1').get_text().encode('utf-8')  # job title
        # fldy_list = soup.find('div', class_='welfare-tab-box').find_all('span')

        # print(item)
        return item
Example #14
    def parse(self, response):
        js = json.loads(response.text)

        results = js["data"]["results"]
        for r in results:
            zhilian = ZhilianItem()
            zhilian["number"] = r["number"]
            zhilian["jobname"] = r["jobName"]
            zhilian["company"] = r["company"]["name"]
            zhilian["position"] = r["city"]["display"]
            zhilian["size"] = r["company"]["size"]["name"]
            zhilian["edulevel"] = r["eduLevel"]["name"]
            zhilian["salary"] = r["salary"]
            zhilian["workexp"] = r["workingExp"]["name"]
            workUrl = "https://jobs.zhaopin.com/" + zhilian["number"] + ".htm"
            # Deep-copy the item into meta so a later mutation cannot leak
            # into requests still sitting in the scheduler
            yield scrapy.Request(workUrl, meta={"zzll": copy.deepcopy(zhilian)}, callback=self.getResult)
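
The deep copy above guards against a classic Scrapy pitfall: meta stores only a reference, so if the same item object were mutated after the request was scheduled, every pending request would see the mutated value. A small standalone demonstration of the difference, using plain dicts:

    import copy

    item = {"jobname": "engineer"}
    shared = [{"zzll": item} for _ in range(2)]        # two 'requests' sharing one reference
    item["jobname"] = "changed"
    print(shared[0]["zzll"]["jobname"])                # 'changed' -- the mutation leaked

    item = {"jobname": "engineer"}
    copied = [{"zzll": copy.deepcopy(item)} for _ in range(2)]
    item["jobname"] = "changed"
    print(copied[0]["zzll"]["jobname"])                # 'engineer' -- copies are isolated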
Example #15
    def parse(self, response):
        table_list = response.xpath('//table[@class="newlist"]')
        # print(table_list)
        for table in table_list[2:]:
            item = ZhilianItem()
            position = table.xpath('.//td[@class="zwmc"]/div/a[1]/text()').extract_first()
            item["position"] = position.strip() if position else None

            url = table.xpath('.//td[@class="zwmc"]/div/a[1]/@href').extract_first()
            item["link"] = url.strip() if url else None
            if url:
                yield scrapy.Request(url=url, meta={"item": item}, callback=self.parse_link, dont_filter=True)
        next_page = response.xpath('//li[@class="pagesDown-pos"]/a/@href').extract_first()
        if next_page:
            next_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
Example #16
    def content_parse(self, response):

        item = ZhilianItem()

        city_name = self.city

        jobname = response.xpath(
            '//div[@class="main1 cl main1-stat"]/div/ul/li/h1/text()'
        )[0].extract()

        salary = response.xpath(
            '//div[@class="main1 cl main1-stat"]/div/ul/li/div/strong/text()'
        )[0].extract()

        position = response.xpath(
            '//div[@class="info-three l"]/span/a/text()')[0].extract()

        span = response.xpath(
            '//div[@class="info-three l"]/span/text()').extract()
        if len(span) > 3:
            workingExp = span[1]
            eduLevel = span[2]
        else:
            workingExp = response.xpath(
                '//div[@class="info-three l"]/span/text()')[0].extract()
            eduLevel = response.xpath(
                '//div[@class="info-three l"]/span/text()')[1].extract()

        company_name = response.xpath(
            '//div[@class="company l"]/a/text()')[0].extract()

        update_time = response.meta['update_time']

        require = response.xpath('//div[@class="pos-ul"]/p/text()').extract()
        if require == []:
            require = response.xpath(
                '//div[@class="pos-ul"]/p/span/text()').extract()
        job_require = require
        sha1 = hashlib.sha1()
        string = (company_name + '' + update_time)
        stri = string.encode('utf8')
        sha1.update(stri)
        hash_id = sha1.hexdigest()

        # Fill every declared field from the local variable of the same name
        for field in item.fields.keys():
            item[field] = eval(field)
        yield item
Example #17
    def parse_xiangqin(self, response):

        item = ZhilianItem()
        # Company name
        company = response.xpath(
            '//h2//a[@target="_blank"]/text()').extract()[0]
        # Salary
        pay = response.xpath(
            '//ul[@class="terminal-ul clearfix"]//strong/text()').extract()[0]
        # Work location
        site = response.xpath(
            '//ul[@class="terminal-ul clearfix"]//strong//a/text()').extract()[0]
        # Posting time
        times = response.xpath(
            '//ul[@class="terminal-ul clearfix"]//strong//span/text()').extract()[0]
        # Job title
        gangwei = response.xpath(
            '//ul[@class="terminal-ul clearfix"]//strong//a/text()').extract()[1]
        # URL
        url = response.url
        # Job description
        miaoshu = response.xpath(
            './/div[@class="tab-inner-cont"]/p/text() | .//div[@class="tab-inner-cont"]//span/text() | .//div[@class="tab-inner-cont"]//div/text()'
        ).extract()
        # Fold all description fragments into miaoshu[0]; start at 1,
        # otherwise the first fragment gets appended to itself
        num = 1
        while num < len(miaoshu):
            miaoshu[0] += miaoshu[num].strip()
            num += 1

        # print(miaoshu[0])
        # print(company, pay, site, times, gangwei, miaoshu)
        item['company'] = company
        item['pay'] = pay
        item['site'] = site
        item['times'] = times
        item['gangwei'] = gangwei
        item['url'] = self.md5(url)
        if miaoshu:
            item['miaoshu'] = miaoshu[0]
        else:
            item['miaoshu'] = ''
        print('item generated')
        yield item
Example #18
    def parse_item(self, response):

        job = ZhilianItem()
        node = response.xpath('//div[@class="terminalpage-left"]')

        # Company name
        job["company_name"] = response.xpath(
            '//div[@class="fixed-inner-box"]//h2/a/text()').extract()[0].strip()
        # Job title
        job["job_name"] = response.xpath(
            '//div[contains(@class,"f")]/h1/text()').extract()[0].strip()
        # Salary
        job["wages"] = node.xpath(
            './/li[1]/strong/text()').extract()[0].strip()
        # Work location
        fulllocal = node.xpath('.//li[2]')
        job["local"] = fulllocal.xpath('string(.)').extract()[0].strip()
        # Posting date
        job["push_date"] = node.xpath(
            './/li[3]/strong//text()').extract()[0].strip()
        # Work experience
        job["experience"] = node.xpath(
            './/li[5]/strong/text()').extract()[0].strip()
        # Education
        job["education"] = node.xpath(
            './/li[6]/strong/text()').extract()[0].strip()
        # Number of openings
        job["quantity"] = node.xpath(
            './/li[7]/strong/text()').extract()[0].strip()
        # Job requirements: the text sits in sibling <p> tags, so match every
        # <p> under the node first, then slice off the trailing info lines
        rlist = node.xpath(
            './/div[@class="tab-inner-cont"][1]/p/text()').extract()[:-4]
        job["require"] = " ".join(rlist)

        # print("company_name: " + job["company_name"])
        # print("job_name: " + job["job_name"])
        # print("wages: " + job["wages"])
        # print("local: " + job["local"])
        # print("push_date: " + job["push_date"])
        # print("experience: " + job["experience"])
        # print("education: " + job["education"])
        # print("quantity: " + job["quantity"])
        # print("require: " + job["require"])

        return job
Example #19
 def parse(self, response):  # default callback for the downloaded response
     result = json.loads(response.text)
     results = result['data']['results']
     for record in results:
         # Fresh item per record, so every yield carries its own data
         item = ZhilianItem()
         item['jobName'] = record['jobName']
         item['salary'] = record['salary']
         item['welfare'] = record['welfare']
         item['url'] = record['positionURL']
         item['city_display'] = record['city']['display']
         item['company'] = record['company']['name']
         item['company_size'] = record['company']['size']['name']
         item['company_type'] = record['company']['type']['name']
         item['emplType'] = record['emplType']
         item['eduLevel'] = record['eduLevel']['name']
         item['updateDate'] = record['updateDate']
         item['workingExp'] = record['workingExp']['name']
         yield item
Example #20
File: code.py  Project: watchxu/Spiders
 def parse(self, response):
     result = json.loads(response.text)
     if result.get("data"):
         results = result["data"]["results"]
         for i in results:
             item = ZhilianItem()
             item["city"] = i["city"]
             item["companyLogo"] = i["companyLogo"]
             item["createDate"] = i["createDate"]
             item["eduLevel"] = i["eduLevel"]
             item["emplType"] = i["emplType"]
             item["endDate"] = i["endDate"]
             item["positionURL"] = i["positionURL"]
             item["salary"] = i["salary"]
             item["updateDate"] = i["updateDate"]
             item["welfare"] = i["welfare"]
             item["workingExp"] = i["workingExp"]
             yield item
Example #21
 def parse(self, response):
     result = json.loads(response.text)
     # Iterate over the result dicts directly rather than a hard-coded
     # range(0, 60), which raises IndexError on pages with fewer records
     for record in result['data']['results']:
         item = ZhilianItem()  # fresh item per record
         item['jobName'] = record['jobName']
         item['createDate'] = record['createDate']
         item['company'] = record['company']['name']
         item['eduLevel'] = record['eduLevel']['name']
         item['salary'] = record.get('salary')
         item['workingExp'] = record['workingExp']['name']
         item['city'] = record['city']['display']
         yield item
Example #22
 def parse(self, response):
     # Iterate over the row selectors themselves: extract_first() returns a
     # string, which cannot be queried further with xpath()
     for x in response.xpath("//div[@id='listContent']/div"):
         # Relative paths (./) keep each query scoped to the current row
         name = x.xpath(".//span[@title]/text()").extract_first()
         salary = x.xpath(".//p[@class='job_saray']/text()").extract_first()
         company = x.xpath(
             ".//*[@class='company_title']/text()").extract_first()
         city = x.xpath(
             ".//ul/li[@class='demand_item']/text()").extract_first()
         welfare = x.xpath(".//div[@class='welfare_item']/text()").extract()
         url = x.xpath(".//div[@class='commpanyName']/a/@href").extract()
         item = ZhilianItem()
         item['name'] = name
         item['salary'] = salary
         item['company'] = company
         item['city'] = city
         item['welfare'] = welfare
         item['url'] = url
         yield item
Example #23
 def parse(self, response):
     soup = BeautifulSoup(response.body.decode('utf-8'),'lxml')
     for site in soup.find_all('table',class_='newlist'):
         item = ZhilianItem()
         try:
             item['name'] = site.find('td',class_='zwmc').get_text().strip()
             #print(item['name'])
             item['rate'] = site.find('td',class_='fk_lv').get_text()
             #print(item['rate'])
             item['compy'] = site.find('td',class_='gsmc').get_text()
             #print(item['compy'])
             item['money'] = site.find('td',class_='zwyx').get_text()
             #print(item['money'])
             item['place'] = site.find('td',class_='gzdd').get_text()
             #print(item['place'])
             #print(item)
             yield item
         except AttributeError:
             # Some listings miss a cell; skip those rows
             pass
Example #24
 def parse_urls(self, response):
     job_list = json.loads(response.text)['data']['results']
     for j in range(0, len(job_list)):
         items = ZhilianItem()
         items['date'] = job_list[j]['updateDate']
         items['job_name'] = job_list[j]['jobName']
         items['job_url'] = job_list[j]['positionURL']
         items['company_id'] = job_list[j]['company']['number']
         items['city'] = job_list[j]['city']['display']
         items['company'] = job_list[j]['company']['name']
         items['company_type'] = job_list[j]['company']['type']['name']
         items['salary'] = job_list[j]['salary']
         items['experience'] = job_list[j]['workingExp']['name']
         items['education'] = job_list[j]['eduLevel']['name']
         items['address'] = job_list[j]['businessArea']
         job_detail_url = job_list[j]['positionURL']
         yield scrapy.Request(job_detail_url,
                              meta={'items': items},
                              callback=self.parse_job_detail_url,
                              dont_filter=True)
Example #25
    def parse(self, response):
        titles = response.xpath('//a[@name="itemlist-title"]/@title').extract()
        prices = response.xpath('//span[@class ="price_n"]/text()').extract()
        rows = []
        for i in range(0, len(titles)):
            row = {}
            row['title'] = titles[i]
            row['price'] = prices[i]
            rows.append(row)

        print('list=', len(rows))

        zlitem = ZhilianItem()
        # Do NOT assign via attribute (zlitem.list = rows): scrapy.Item
        # fields must be set dict-style, or nothing reaches the pipeline
        # (see the sketch after Example #4)
        zlitem['price'] = prices
        zlitem['title'] = titles
        zlitem['list'] = rows
        zlitem['data'] = rows
        # yield hands the item to the engine, which forwards it to pipelines.py
        yield zlitem
Example #26
    def work_parse(self, response):
        print(response.url)
        print(response.status)
        logger.info('url:' + response.url)
        logger.info('status:' + str(response.status))
        item = ZhilianItem()
        item['JobTitle'] = response.xpath(
            "//div[@class='fixed-inner-box']/div[1]/h1/text()").extract()[0]
        item['company'] = response.xpath(
            "//div[@class='fixed-inner-box']/div[1]/h2/a/text()").extract()[0]
        item['JobTag'] = response.xpath(
            "//div[@class='fixed-inner-box']/div[1]/div[1]/span/text()"
        ).extract()
        item['MonthSalanry'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[1]/strong/text()"
        ).extract()[0]
        item['WorkPlace'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()"
        ).extract()[0]
        item['ReleaseData'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[3]/strong/span/text()"
        ).extract()[0]
        item['WorkNature'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[4]/strong/text()"
        ).extract()[0]
        item['WorkExperience'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[5]/strong/text()"
        ).extract()[0]
        item['MinDegree'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[6]/strong/text()"
        ).extract()[0]
        item['RecruitingNumbers'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[7]/strong/text()"
        ).extract()[0]
        item['JobCategory'] = response.xpath(
            "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()"
        ).extract()[0]

        yield item
Example #27
 def job_parse(self, response):
     item = ZhilianItem()
     item['link'] = response.meta['link']
     item['company'] = response.meta['company']
     description = {}
     description["payment"] = re.compile(u'职位月薪:')
     description["place"] = re.compile(u'工作地点:')
     description["date"] = re.compile(u'发布日期:')
     description["prop"] = re.compile(u'工作性质:')
     description["exp"] = re.compile(u'工作经验:')
     description["academic"] = re.compile(u'最低学历:')
     description["num"] = re.compile(u'招聘人数:')
     description["job_type"] = re.compile(u'职位类别:')
     prefix = "//div[@class='terminalpage clearfix']/div[@class='terminalpage-left']/"
     for li in response.xpath(prefix +
                              "ul[@class='terminal-ul clearfix']/li"):
         text = self.encode(li.xpath('span/text()').extract())
         for desc in description:
             if description[desc].search(text.decode('utf-8')):
                 text = ''
                 for childnode in li.xpath(
                         'strong/descendant-or-self::node()'):
                     t = self.encode(childnode.xpath('text()').extract())
                     text += t
                 item[desc] = text
                 break
     text = ''
     xpath = prefix
     xpath += "div[@class='terminalpage-main clearfix']/div[@class='tab-cont-box']/div[@class='tab-inner-cont']"
     xpath += "/p/descendant-or-self::node()"
     for t in response.xpath(xpath):
         t = self.encode(t.xpath("text()").extract())
         if len(t.strip()) > 9:
             text += t + '\n  '.encode('utf-8')
     item['description'] = text
     return item
Example #28
    def parse(self, response):
        totalCount = response.xpath(
            "/html/body/div[3]/div[3]/div[2]/span[1]/em/text()").extract()[0]

        pages = min(int(int(totalCount) / 60), 100)

        for result in response.xpath(
                '//*[@id="newlist_list_content_table"]/table')[1:]:
            item = ZhilianItem()
            try:
                item['jobName'] = result.xpath(
                    './/tr[1]/td[1]/div/a/text()').extract()[0]
            except Exception as e:
                item['jobName'] = ''

            try:
                item['jobUrl'] = result.xpath(
                    './/tr[1]/td[1]/div/a/@href').extract()[0]
            except Exception as e:
                item['jobUrl'] = ''

            try:
                item['companyName'] = result.xpath(
                    './/tr[1]/td[3]/a[1]/text()').extract()[0]
            except Exception as e:
                item['companyName'] = ''

            try:
                item['companyUrl'] = result.xpath(
                    './/tr[1]/td[3]/a[1]/@href').extract()[0]
            except Exception as e:
                item['companyUrl'] = ''
            try:
                item['companySize'] = (result.xpath(
                    u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"公司规模")]/text()'
                ).extract()[0]).replace(u'公司规模:', '')
            except Exception as e:
                item['companySize'] = ''
            try:
                item['companyNature'] = (result.xpath(
                    u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"公司性质")]/text()'
                ).extract()[0]).replace(u'公司性质:', '')
            except Exception as e:
                item['companyNature'] = ''

            try:
                item['salary'] = result.xpath(
                    './/tr[1]/td[4]/text()').extract()[0]
                salaryList = re.split('-', item['salary'])
                item['minSalary'] = salaryList[0]
                item['maxSalary'] = salaryList[1]
            except Exception as e:
                item['salary'] = ''
                item['minSalary'] = ''
                item['maxSalary'] = ''

            try:
                item['location'] = result.xpath(
                    './/tr[1]/td[5]/text()').extract()[0]
            except Exception as e:
                item['location'] = ''

            try:
                item['deatil'] = result.xpath(
                    './/tr[2]/td/div/div/ul/li[2]/text()').extract()[0]
            except Exception as e:
                item['deatil'] = ''

            try:
                item['edu'] = (result.xpath(
                    u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"学历")]/text()'
                ).extract()[0]).replace(u'学历:', '')
            except Exception as e:
                item['edu'] = ''

            try:
                item['workYear'] = (result.xpath(
                    u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"经验")]/text()'
                ).extract()[0]).replace(u'经验:', '')
            except Exception as e:
                item['workYear'] = ''
            yield item

        p = int(response.meta.get('p')) + 1
        if p <= pages:
            yield scrapy.Request(self.url + str(p),
                                 method='GET',
                                 meta={'p': p},
                                 headers=self.headers,
                                 callback=self.parse)
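
Since parse() reads response.meta.get('p') even on the first page, the initial request must already carry p in its meta. A minimal sketch of how this spider might seed that (self.url, self.headers and the starting page number are assumptions consistent with the code above):

    def start_requests(self):
        # Seed the first listing page with p=1 so parse() can keep incrementing it
        yield scrapy.Request(self.url + '1',
                             method='GET',
                             meta={'p': 1},
                             headers=self.headers,
                             callback=self.parse)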
Example #29
    def parse_item(self, response):
        # Campus-recruitment pages come back as a 301 redirect
        if response.status == 301:
            newurl = response.headers['Location'].decode('utf8')
            yield scrapy.Request(
                newurl,
                callback=self.parse_item,
                dont_filter=True,
                priority=3,
                meta={'post_time': response.meta['post_time']})

        # Page parsing
        elif response.status == 200:
            try:
                selector = Selector(response)
                link = response.url
                place = 'Null'

                if '//xiaoyuan' in link:
                    # Parse a campus-recruitment detail page
                    job_name = selector.xpath(
                        '//div[@class="cJobDetailInforWrap"]/h1/text()'
                    ).extract_first()
                    if job_name:
                        _id = ''.join(re.findall('com/job/(.*)', link))

                        job_data = selector.xpath(
                            '//ul[@class="cJobDetailInforBotWrap clearfix c3"]/li[@class="cJobDetailInforWd2 marb"]/text()'
                        ).extract()
                        place, job_kind, job_number, post_time, job_nature, education = \
                            job_data[0], job_data[1], job_data[2], job_data[3], job_data[4], job_data[5]
                        place = place.split('-')[0]
                        job_content = selector.xpath(
                            '//div[@class="cJob_Detail f14"]/p//text()'
                        ).extract()
                        company_name = selector.xpath(
                            '//li[@id="jobCompany"]/a/text()').extract_first()
                        company_industry = selector.xpath(
                            '//ul[@class="cJobDetailInforTopWrap clearfix c3"]/li[4]/@title'
                        ).extract_first()
                        company_size = selector.xpath(
                            '//ul[@class="cJobDetailInforTopWrap clearfix c3"]/li[6]/text()'
                        ).extract_first()
                        company_nature = selector.xpath(
                            '//ul[@class="cJobDetailInforTopWrap clearfix c3"]/li[8]/text()'
                        ).extract_first()
                        job_place = selector.xpath(
                            '//div[@class="clearfix p20"]/p[@class="c9"]/text()'
                        ).extract_first()
                        company_address = selector.xpath(
                            '//div[@class="clearfix p20"]/p[@class="c9"]/text()'
                        ).extract_first()
                        company_homepage = selector.xpath(
                            '//div[@class="clearfix p20"]/p[@class="c9 mt5"]/a/@href'
                        ).extract_first()
                        item = XiaozhaoItem()
                    else:
                        return

                else:
                    # Parse a regular (social) recruitment detail page
                    job_name = selector.xpath(
                        '//h3[@class="summary-plane__title"]/text()'
                    ).extract_first()
                    if job_name:
                        _id = ''.join(re.findall('com/(.*?).htm', link))
                        salary = selector.xpath(
                            '//span[@class="summary-plane__salary"]/text()'
                        ).extract_first()
                        place = selector.xpath(
                            '//ul[@class="summary-plane__info"]/li[1]/a/text()'
                        ).extract_first()
                        experience = selector.xpath(
                            '//ul[@class="summary-plane__info"]/li[2]/text()'
                        ).extract_first()
                        education = selector.xpath(
                            '//ul[@class="summary-plane__info"]/li[3]/text()'
                        ).extract_first()
                        job_number = selector.xpath(
                            '//ul[@class="summary-plane__info"]/li[4]/text()'
                        ).extract_first()
                        job_kind = selector.xpath(
                            '//span[@class="pos-name"]/a/text()'
                        ).extract_first()  # miss
                        job_content = selector.xpath(
                            '//div[@class="describtion__detail-content"]//text()'
                        ).extract()
                        job_place = selector.xpath(
                            '//span[@class="job-address__content-text"]/text()'
                        ).extract_first()
                        company_name = selector.xpath(
                            '//a[@class="company__title"]/text()'
                        ).extract_first()
                        advantage = selector.xpath(
                            '//div[@class="highlights__content"]//text()'
                        ).extract()
                        company_industry = selector.xpath(
                            '//button[@class="company__industry"]//text()'
                        ).extract_first()
                        company_nature = selector.xpath(
                            '//ul[@class="promulgator-ul cl"]/li[2]/strong/text()'
                        ).extract_first()  # miss
                        company_size = selector.xpath(
                            '//button[@class="company__size"]/text()'
                        ).extract_first()
                        company_homepage = selector.xpath(
                            '//a[@class="company__home-page"]/@href'
                        ).extract_first()
                        company_address = selector.xpath(
                            '//ul[@class="promulgator-ul cl"]/li[5]/strong/text()'
                        ).extract_first()  # miss
                        job_content = ''.join(job_content).strip()
                        post_time = response.meta['post_time']
                        post_time = '20' + post_time if post_time else 'NULL'
                        job_nature = "全职"  # "full-time"
                        salary = salary_formater(
                            redundancy_cleaner(salary, '元/月'))
                        advantage = '/'.join(advantage)

                        item = ZhilianItem()
                        item["salary"] = salary if salary else "NULL"
                        item["experience"] = experience if experience else "NULL"
                        item["advantage"] = advantage if advantage else "NULL"

                # Clean the fields shared by both branches
                place = redundancy_cleaner(place)
                job_content = content_cleaner(job_content)
                job_name = redundancy_cleaner(job_name)
                job_place = redundancy_cleaner(job_place)
                job_number = get_num(job_number)

                item["_id"] = _id
                item["job_name"] = job_name if job_name else "NULL"
                item["link"] = link if link else "NULL"
                item["place"] = place if place else "NULL"
                item["post_time"] = post_time
                item["job_number"] = job_number if job_number else "NULL"
                item["education"] = education if education else "NULL"
                item["job_nature"] = job_nature if job_nature else "NULL"
                item["job_kind"] = job_kind if job_kind else "NULL"
                item["job_place"] = job_place if job_place else "NULL"
                item["job_content"] = job_content if job_content else "NULL"
                item["company_size"] = company_size if company_size else "NULL"
                item["company_nature"] = company_nature if company_nature else "NULL"
                item["company_industry"] = company_industry if company_industry else "NULL"
                item["company_name"] = company_name if company_name else "NULL"
                item["company_homepage"] = company_homepage if company_homepage else "NULL"
                item["company_address"] = company_address.strip() if company_address else "NULL"

                yield item

            # except ValueError as ve:
            #     log.msg("Error: " + _id + ' ' + str(ve), level=log.ERROR)

            except Exception as ex:
                log.msg("Error when parsing {0}".format(response.url),
                        level=log.ERROR)
                raise ex

        else:
            log.msg("Page Not Found, reason: {0}".format(response.status),
                    level=log.WARNING)
Example #30
 def filter_Career(self, response):
     url = response.url
     print(url)
     company_url = response.xpath(
         '//div[@class="inner-left fl"]/h2/a/@href').extract_first()
     name = response.xpath(
         '//div[@class="inner-left fl"]/h1/text()').extract_first()
     company = response.xpath(
         '//div[@class="inner-left fl"]/h2/a/text()').extract_first()
     welfare = ','.join(
         response.xpath(
             '//div[@class="welfare-tab-box"]/span/text()').extract())
     ul_xpath = '//ul[@class="terminal-ul clearfix"]/li'
     salary = response.xpath(ul_xpath + '/strong/text()').extract_first()
     location = response.xpath(ul_xpath +
                               '/strong/a/text()').extract_first()
     job = response.xpath(ul_xpath + '/strong/a/text()').extract()[1]
     job_kind = response.xpath(ul_xpath +
                               '[4]/strong/text()').extract_first()
     job_time = response.xpath(ul_xpath +
                               '[5]/strong/text()').extract_first()
     xueli = response.xpath(ul_xpath + '[6]/strong/text()').extract_first()
     push_time = response.xpath(
         '//*[@id="span4freshdate"]/text()').extract_first()
     people_num = response.xpath(ul_xpath +
                                 '[7]/strong/text()').extract_first()
     bs4 = BeautifulSoup(response.text, 'lxml')
     for terminalpage in bs4.find_all('div',
                                      class_='terminalpage-main clearfix'):
         for box in terminalpage.find_all('div', class_='tab-cont-box'):
             cont = box.find_all('div', class_='tab-inner-cont')[0]
             ms = cont.contents
              fragments = []
              for i in ms:
                  try:
                      if i.text.split() == []:
                          pass
                      else:
                          job_ms = ''.join(i.text.split())
                          fragments.append(job_ms)
                  except AttributeError:
                      # Plain strings between tags have no .text; skip them
                      pass
              # Split on '工作地址:' ("work address:"): the address comes
              # after it, the job description before it
              jog_location = ''.join(fragments).split('工作地址:').pop()
              job_mss = ''.join(fragments).split('工作地址:')[0]
             if job_mss == '':
                 job_ms = bs4.find('div', class_="tab-inner-cont")
                 job_mss = ''.join(job_ms.text.split()).split('工作地址')[0]
             loader = ItemLoader(item=ZhilianItem(), response=response)
             loader.add_value('url', url)
             loader.add_value('name', name)
             loader.add_value('company', company)
             loader.add_value('company_url', company_url)
             loader.add_value('welfare', welfare)
             loader.add_value('salary', salary)
             loader.add_value('location', location)
             loader.add_value('job', job)
             loader.add_value('job_kind', job_kind)
             loader.add_value('job_time', job_time)
             loader.add_value('xueli', xueli)
             loader.add_value('push_time', push_time)
             loader.add_value('people_num', people_num)
             loader.add_value('jog_location', jog_location)
             loader.add_value('job_mss', job_mss)
             print(loader.load_item())
             yield loader.load_item()
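
One caveat with the ItemLoader in Example #30: add_value() accumulates values into lists, so load_item() returns list-valued fields unless the item declares output processors. A minimal sketch of field declarations that would make the loaded fields plain strings (TakeFirst lives in itemloaders.processors in current Scrapy; older versions import it from scrapy.loader.processors):

    import scrapy
    from itemloaders.processors import TakeFirst

    class ZhilianItem(scrapy.Item):
        # only a few of Example #30's fields shown; the rest follow the same pattern
        url = scrapy.Field(output_processor=TakeFirst())
        name = scrapy.Field(output_processor=TakeFirst())
        salary = scrapy.Field(output_processor=TakeFirst())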