Python ScrapyItem示例，zhaopin.items.ScrapyItem Python示例

示例#1

0

显示文件

 def parse_item(self, response):
     """Build one ``ScrapyItem`` from a Zhaopin (智联招聘) job-detail response.

     The title and company are read from ``div.fixed-inner-box``, the summary
     fields from the ``<strong>`` cells under ``div.terminalpage-left`` and
     the description from the ``<p>`` texts inside ``div.tab-inner-cont``.

     Args:
         response: Response for the job-detail page; only ``url``, ``css``
             and ``xpath`` are used.

     Yields:
         The populated item.

     Raises:
         IndexError: If the title or company selector matches nothing.
         ValueError: If the summary list has neither 8 nor 9 entries.
     """
     item = ScrapyItem()
     link = response.url
     title = response.css('div.fixed-inner-box h1::text').extract()[0]
     company = response.css('div.fixed-inner-box h2 a::text').extract()[0]
     info = response.xpath(
         '//div[@class="terminalpage-left"]/ul/li/strong//text()').extract()
     paragraphs = response.xpath(
         '//div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]/p/text()'
     ).extract()
     # str.replace('\n\r\t\xa0', '') would only match that exact
     # four-character run; translate() drops each character wherever it occurs.
     drop_chars = str.maketrans('', '', '\n\r\t\xa0')
     content = ''.join(
         text.translate(drop_chars).strip() for text in paragraphs)

     # The 9-entry layout has one extra cell (stored as 'addr') right after
     # the location; the last two cells are ignored in both layouts.
     if len(info) == 8:
         (salary, item['location'], item['date_time'], item['job_type'],
          exp, item['degree'], _, _) = info
     elif len(info) == 9:
         (salary, item['location'], item['addr'], item['date_time'],
          item['job_type'], exp, item['degree'], _, _) = info
     else:
         # Fail with a diagnosable error instead of a TypeError caused by
         # passing None to the salary/experience parsing below.
         raise ValueError(
             'unexpected job summary layout ({} fields) at {}'.format(
                 len(info), link))

     # toInt is defined elsewhere on the spider; it must return a
     # (low, high) pair for this unpacking to work.
     item['salary_l'], item['salary_h'] = self.toInt(salary)

     # First run of digits in the experience text, or 0 when there is none.
     years = re.search(r'(\d+)', exp)
     item['exp'] = int(years.group(1)) if years else 0

     item['addr'] = (item.get('addr') or '').strip('-')
     # Keep only the part before the first space of the publication time.
     item['date_time'] = item['date_time'].split(' ')[0]
     item['aid'] = self.trmd5(link)
     item['content'] = content
     item['title'] = title
     item['company'] = company
     item['link'] = link
     item['referer'] = '智联招聘'
     yield item

示例#2

0

显示文件

文件： lagou.py 项目： lxc198/zhaopin_scrapy

 def parse_item(self, response):
     """Start building a ``ScrapyItem`` from a job-detail response (lagou.py).

     NOTE(review): the visible excerpt stops after the address is cleaned.
     The collected locals are never stored on the item and nothing is
     yielded here; the remainder of the method is not shown and must be
     restored from the original project before this is usable.

     Args:
         response: Response for the job-detail page; only ``url``, ``css``
             and ``xpath`` are used.
     """
     item = ScrapyItem()
     title = response.css('span.name::text').extract()[0]
     link = response.url
     info = [
         text.strip('/').strip()
         for text in response.css('dd.job_request p')[0].css(
             'span::text').extract()
     ]
     date_time = response.css('p.publish_time::text').extract()[0].split()[0]
     company = response.css('dl#job_company dt img::attr(alt)').extract()[0]
     date_time = self.Strfdate(date_time)
     aid = self.trmd5(link)
     # Exactly five summary spans are required by this unpacking.
     salary, item['location'], exp, item['degree'], item['job_type'] = info
     # Salary bounds are written like "10k-20k"; convert each to an integer.
     salary_l, salary_h = [
         int(bound.strip('kK')) * 1000 for bound in salary.split('-')
     ]
     addr = response.xpath('//div[@class="work_addr"]//text()').extract()
     # The original line had an unbalanced ')' (SyntaxError) and used
     # str.replace with the whole '\n\r\t\xa0' run, which never matches;
     # translate() removes each of those characters individually.
     drop_chars = str.maketrans('', '', '\n\r\t\xa0')
     addr = ''.join(
         [text.translate(drop_chars).strip('查看地图- ') for text in addr])

示例#3

0

显示文件

文件： 51job.py 项目： lxc198/zhaopin_scrapy

    def parse_item(self, response):
        """Build one ``ScrapyItem`` from a 51Job job-detail response.

        Header fields are read with CSS selectors under ``div.cn``, the
        ``em.i1`` .. ``em.i4`` tags are looked up with BeautifulSoup, and the
        description is taken from ``div.bmsg.job_msg.inbox``.

        Args:
            response: Response for the job-detail page; ``url``, ``text``,
                ``css`` and ``xpath`` are used.

        Yields:
            The populated item. ``exp`` is always stored as an ``int``.
        """
        item = ScrapyItem()
        html = BeautifulSoup(response.text, 'html.parser')
        link = response.url
        title = response.css('div.cn h1::text').extract()
        adinfo = response.css('div.cn span.lname::text').extract()
        salary = response.css('div.cn strong::text').extract()
        company = response.css('div.cn p.cname a::text').extract()
        # NOTE(review): default_value is defined elsewhere; presumably it
        # collapses each selector result to a single string - confirm.
        title, adinfo, company = self.default_value([title, adinfo, company])
        if salary:
            item['salary_l'], item['salary_h'] = self.toInt(salary[0])
        else:
            item['salary_l'] = item['salary_h'] = 0

        info = [
            html.find('em', {'class': 'i{}'.format(i)}) for i in range(1, 5)
        ]
        # default_info is defined elsewhere; it must return four values, of
        # which the third is unused here.
        exp, degree, _, date_time = self.default_info(info)
        # First run of digits in the experience text, if any.
        years = re.search(r'(\d+)', exp)
        date_time = self.Strfdate(date_time)

        fragments = response.xpath(
            '//div[@class="bmsg job_msg inbox"]//text()').extract()
        # str.replace('\n\t\r\xa0', '') would only match that exact
        # four-character run; translate() drops each character individually.
        drop_chars = str.maketrans('', '', '\n\t\r\xa0')
        content = ''.join(
            text.translate(drop_chars).strip() for text in fragments)

        if '-' in adinfo:
            # Split on the first hyphen only so a hyphen inside the address
            # part no longer breaks the two-value unpacking.
            location, addr = adinfo.split('-', 1)
            item['addr'] = addr
        else:
            location = adinfo
            item['addr'] = ''
        item['job_type'] = '全职'
        item['link'] = link
        item['title'] = title
        item['company'] = company
        item['location'] = location
        item['content'] = content
        # Always an int, matching the other spiders (was str when matched).
        item['exp'] = int(years.group(1)) if years else 0
        item['degree'] = degree
        item['date_time'] = date_time
        item['aid'] = self.trmd5(link)
        item['referer'] = '51Job'
        yield item

示例#4

0

显示文件

文件： dajie_2.py 项目： lxc198/zhaopin_scrapy

    def parse_item(self, response):
        """Build one ``ScrapyItem`` from a Dajie (大街网) job-detail response.

        Title, job type and salary are required (first selector match);
        the remaining fields go through ``self.default_value`` before use.

        Args:
            response: Response for the job-detail page; only ``url`` and
                ``css`` are used.

        Yields:
            The populated item.

        Raises:
            IndexError: If the title, job-type or salary selector matches
                nothing.
        """
        item = ScrapyItem()
        link = response.url
        title = response.css(
            '.job-msg-top-text span.job-name::text').extract()[0]
        # Drop the full-width parentheses that wrap the job type.
        job_type = response.css(
            '.job-msg-top-text span.blue-icon::text').extract()[0].strip('（）')
        salary = response.css('.job-msg-top span.job-money::text').extract()[0]
        salary_l, salary_h = self.toInt(salary)
        location = response.css('li.ads span::text').extract()
        exp = response.css('li.exp span::text').extract()
        degree = response.css('li.edu span::text').extract()
        content = response.css('div#jp_maskit pre::text').extract()
        date_time = response.css('.job-msg-bottom .date::text').extract()
        addr = response.css('div.ads-msg span::text').extract()
        company = response.css('div.p-side-right p.title a::text').extract()
        # NOTE(review): default_value is defined elsewhere; presumably it
        # collapses each selector result to a single value - confirm.
        location, exp, degree, content, date_time, addr, company = self.default_value(
            [location, exp, degree, content, date_time, addr, company])
        # str.replace('\n\r\t\xa0', '') would only match that exact
        # four-character run; translate() drops each character individually.
        drop_chars = str.maketrans('', '', '\n\r\t\xa0')
        content = ''.join(
            piece.translate(drop_chars).strip() for piece in content if piece)
        # First run of digits in the experience text, or 0 when there is none.
        years = re.search(r'(\d+)', exp)
        exp = int(years.group(1)) if years else 0
        # strip() treats its argument as a character set: any of these three
        # characters is removed from both ends of the date text.
        date_time = date_time.strip('发布于')
        aid = self.trmd5(link)

        item['aid'] = aid
        item['job_type'] = job_type
        item['salary_l'] = salary_l
        item['salary_h'] = salary_h
        item['location'] = location
        item['exp'] = exp
        item['degree'] = degree
        item['content'] = content
        item['date_time'] = date_time
        item['title'] = title
        item['link'] = link
        item['addr'] = addr
        item['company'] = company
        item['referer'] = '大街网'
        yield item

示例#5

0

显示文件

文件： chinahr.py 项目： lxc198/zhaopin_scrapy

 def parse_item(self, response):
     """Build one ``ScrapyItem`` from a ChinaHR (中华英才网) job-detail response.

     The page is parsed with BeautifulSoup. Job type and degree are reached
     from the ``span.job_loc`` tag by stepping two siblings forward each time.

     Args:
         response: Response for the job-detail page; ``url`` and ``text``
             are used.

     Yields:
         The populated item.
     """
     item = ScrapyItem()
     link = response.url
     aid = self.trmd5(link)
     soup = BeautifulSoup(response.text, 'html.parser')

     title = soup.find('span', {'class': 'job_name'}).text
     price_text = soup.find('span', {'class': 'job_price'}).text
     if '面议' not in price_text:
         # Numeric range written as "low-high".
         salary_l, salary_h = tuple(
             int(bound) for bound in price_text.split('-'))
     else:
         # Negotiable salary: both bounds are stored as 0.
         salary_l = salary_h = 0

     loc_tag = soup.find('span', {'class': 'job_loc'})
     type_tag = loc_tag.find_next_sibling().find_next_sibling()
     degree = type_tag.find_next_sibling().find_next_sibling().text
     exp_text = soup.find('span', {'class': 'job_exp'}).text
     company = soup.find('div', {'class': 'job-company'}).find('h4').text

     updated = soup.find('p', {'class': 'updatetime'}).text.strip('更新')
     date_time = self.Strfdate(updated)

     intro_tag = soup.find('div', {'class': 'job_intro_info'})
     content = intro_tag.text.replace('\xa0', '').strip()

     # First run of digits in the experience text, or 0 when there is none.
     years = re.compile(r'(\d+)').search(exp_text)
     exp = int(years.group(1)) if years else 0

     # The location text must consist of exactly two space-separated parts.
     location, addr = loc_tag.text.split(' ')

     item['aid'] = aid
     item['title'] = title
     item['link'] = link
     item['salary_l'] = salary_l
     item['salary_h'] = salary_h
     item['location'] = location
     item['addr'] = addr
     item['job_type'] = type_tag.text
     item['degree'] = degree
     item['exp'] = exp
     item['date_time'] = date_time
     item['content'] = content
     item['company'] = company
     item['referer'] = '中华英才网'
     yield item

示例#6

0

显示文件

文件： tc58.py 项目： lxc198/zhaopin_scrapy

    def parse_item(self, response):
        """Build one ``ScrapyItem`` from a 58同城 job-detail response.

        Args:
            response: Response for the job-detail page; only ``url``, ``css``
                and ``xpath`` are used.

        Yields:
            The populated item; ``job_type`` is always set to '全职'.

        Raises:
            IndexError: If a required selector (title, salary, company,
                update time) matches nothing or the area list is too short.
            ValueError: If the condition list does not have exactly three
                entries.
        """
        item = ScrapyItem()
        link = response.url
        title = response.css('div.pos_base_info .pos_title::text').extract()[0]
        salary = response.css(
            'div.pos_base_info .pos_salary::text').extract()[0]
        info = response.css('div.pos_base_condition span::text').extract()
        adinfo = response.xpath(
            '//div[@class="pos-area"]/span//text()').extract()
        company = response.css(
            'div.comp_baseInfo_title .baseInfo_link a::text').extract()[0]
        fragments = response.css('div.des::text').extract()
        # str.replace('\n\r\t\xa0', '') would only match that exact
        # four-character run; translate() drops each character individually.
        drop_chars = str.maketrans('', '', '\n\r\t\xa0')
        content = ''.join(
            text.translate(drop_chars).strip() for text in fragments if text)
        location = adinfo[1]
        # The address is the concatenation of area entries 3 and 5.
        addr = ''.join(adinfo[3:6:2])
        date_time = response.css(
            'span.pos_base_update span::text').extract()[0]
        date_time = self.Strfdate(date_time)
        # The first condition entry is not used.
        _, degree, exp = info
        # First run of digits in the experience text, or 0 when there is none.
        years = re.search(r'(\d+)', exp)
        exp = int(years.group(1)) if years else 0

        item['location'] = location
        item['addr'] = addr
        item['job_type'] = '全职'
        item['date_time'] = date_time
        item['exp'] = exp
        item['degree'] = degree
        item['salary_l'], item['salary_h'] = self.toInt(salary)
        item['title'] = title
        item['company'] = company
        item['aid'] = self.trmd5(link)
        item['link'] = link
        item['content'] = content
        item['referer'] = '58同城'

        yield item

示例#7

0

显示文件

文件： boss.py 项目： lxc198/zhaopin_scrapy

    def parse_item(self, response):
        """Build one ``ScrapyItem`` from a BOSS直聘 job-detail response.

        Args:
            response: Response for the job-detail page; only ``url`` and
                ``css`` are used.

        Yields:
            The populated item; ``job_type`` is always set to '全职'.

        Raises:
            IndexError: If a required selector (title, salary, time, company,
                address) matches nothing.
            ValueError: If fewer than three banner texts are found.
        """
        item = ScrapyItem()
        link = response.url
        title = response.css('div.job-banner .name::text').extract()[0]
        salary = response.css(
            'div.job-banner .name .salary::text').extract()[0]
        # The first three banner paragraph texts are location, experience
        # and degree, in that order. (Leftover debug print removed.)
        location, exp, degree = response.css(
            'div.job-banner p::text').extract()[:3]
        # First run of digits in the experience text, or 0 when there is none.
        years = re.search(r'(\d+)', exp)
        exp = int(years.group(1)) if years else 0
        date_time = response.css('div.job-tags span.time::text').extract()[0]
        # strip() treats its argument as a character set: any of these three
        # characters is removed from both ends of the date text.
        date_time = date_time.strip('发布于')
        date_time = self.Strfdate(date_time)
        company = response.css('div.info-primary .name::text').extract()[0]
        fragments = response.css(
            'div.detail-content .job-sec .text::text').extract()
        # str.replace('\n\r\t\xa0', '') would only match that exact
        # four-character run; translate() drops each character individually.
        drop_chars = str.maketrans('', '', '\n\r\t\xa0')
        content = ''.join(
            text.translate(drop_chars).strip() for text in fragments)
        addr = response.css(
            'div.job-location .location-address::text').extract()[0]

        item['location'] = location
        item['addr'] = addr
        item['job_type'] = '全职'
        item['date_time'] = date_time
        item['exp'] = exp
        item['degree'] = degree
        item['salary_l'], item['salary_h'] = self.toInt(salary)
        item['title'] = title
        item['company'] = company
        item['aid'] = self.trmd5(link)
        item['link'] = link
        item['content'] = content
        item['referer'] = 'boss直聘'
        yield item