Пример #1
0
    def parse(self, response):
        """Turn each listing entry into a JobItem and request its detail page.

        Every ``<li>`` under ``div#project_list`` contributes one item whose
        title, url, checksum, price and categories are read from the listing
        itself. The item is handed to ``self.parse_contents`` through the
        request's ``meta['item']``.

        NOTE(review): text fields are UTF-8 encoded and then split with plain
        string literals, which only works where ``str`` is a byte string
        (Python 2) -- confirm the target interpreter.
        """
        for entry in response.xpath('//div[@id="project_list"]/ul/li'):
            job = JobItem()
            job['platform'] = '大鲲网'
            job['roles'] = ''
            # No expire_date is collected from the listing.
            job['expire_date'] = ''

            heading = entry.xpath('a/h3/text()').extract()[0]
            job['title'] = heading.encode("utf-8")

            link = entry.xpath('a/@href').extract()[0]
            job['url'] = link

            # Checksum suffix: last path segment of the link, cut at its
            # first dot.
            slug = link.split('/')[-1].split('.')[0]
            job['checksum'] = '{}_{}'.format('pro_lagou', slug)

            # Keep the text before '元', then only the part before the
            # first '-'.
            price_text = entry.xpath(
                'a/div[2]/span/text()').extract()[0].encode("utf-8")
            job['price'] = price_text.split('元')[0].split('-')[0]
            print(link)

            # View count is not collected; selector kept for reference:
            # job['view_count'] = entry.xpath('a/div[4]/div[3]/div[2]/strong/text()').extract()[0].encode("utf-8")

            job['categories'] = [
                tag.xpath('text()').extract()[0].encode("utf-8")
                for tag in entry.xpath('a/div[@class="category_list"]/span')
            ]

            yield scrapy.Request(
                link, callback=self.parse_contents, meta={'item': job})
Пример #2
0
    def parse_detail(self, response):
        """Build a JobItem from a project detail page and yield it.

        Fields populated:

        * ``platform`` -- fixed label.
        * ``title``, ``body`` -- stripped text from the page.
        * ``url`` -- ``response.url``.
        * ``checksum`` -- ``'zb_oschina_net_'`` plus the last URL segment.
        * ``price`` -- first run of digits in the reward text (commas
          removed) as ``int``; ``0`` when the text has no digits or the
          reward node is missing.
        * ``categories`` -- the two category lists concatenated.
        * ``skills`` -- single-element list holding the skills text.
        * ``release_date`` -- result of ``self.utc_rfc3339_string``.
        * ``city``, ``expire_date`` -- empty string; ``roles`` -- empty list.

        Raises ``IndexError`` / ``AttributeError`` if the publish-date or
        title nodes are absent (unchanged from the original behaviour).
        """
        item = JobItem()

        categories_part_1 = response.xpath(
            '//div[@class="box-aw"]/div[1]/a['
            '@class="bn bn-icon bn-small light-green"]/text()').extract()
        categories_part_2 = response.xpath(
            '//div[@class="box-aw"]/div[1]/span['
            '@class="text-gray"]/text()').extract()

        # Scan the attribute lines for the "技能要求" label and keep the text
        # that follows its colon. The last matching line wins.
        # NOTE(review): encoding to bytes and then splitting/comparing with
        # plain string literals only works where ``str`` is a byte string
        # (Python 2) -- confirm the target interpreter.
        skills = ''
        skills_block = response.xpath(
            '//div[@class="project-attr"]/text()').extract()
        for skill in skills_block:
            value = skill.replace(u'\xa0', '').encode("UTF8").split(':')
            if value[0].strip() == '技能要求':
                skills = value[1].strip()

        # The second matching text node carries "<label>:<date>".
        release_date = response.xpath(
            '//span[@class="publish-date text-gray"]/text()').extract(
            )[1].split(":")[1].strip()

        item['platform'] = u'开源中国'
        item['title'] = response.xpath(
            '//h2[@class="wrap"]/text()').extract_first().strip()
        item['url'] = response.url
        item['body'] = ''.join(
            response.xpath('//div[@class="simditor-body wrap"]//text()').
            extract()).strip()
        item['checksum'] = '{}_{}'.format('zb_oschina_net',
                                          response.url.split('/')[-1])
        item['city'] = ''

        # A missing reward node is treated like text without digits, so it
        # takes the same 0 fallback instead of raising on None.strip().
        raw_price = response.xpath(
            '//div['
            '@class="box box-fr column justify reward-amount"'
            ']/div[1]/span/text()').extract_first() or ''
        # Search once and reuse the match object.
        price_match = re.search(r'\d+', raw_price.strip().replace(',', ''))
        item['price'] = int(price_match.group()) if price_match else 0

        item['categories'] = categories_part_1 + categories_part_2
        item['roles'] = []
        # NOTE(review): this is [''] when no skills line was found --
        # confirm consumers expect that rather than an empty list.
        item['skills'] = [skills]
        item['release_date'] = self.utc_rfc3339_string(release_date,
                                                       format='%Y-%m-%d')
        item['expire_date'] = ''

        yield item
Пример #3
0
 def parse_detail(self, response):
     """Scrape a single job detail page and yield the resulting JobItem.

     Title, body, city, price, categories, roles and skills are read with
     XPath queries. The release and expiry texts are passed through
     ``self.str2date`` before being stored.
     """
     item = JobItem()

     # Node sets that more than one field below draws from.
     breadcrumb = response.xpath('//ol[@class="breadcrumb jobs md-no-padding"]/li/a/text()')
     info_spans = response.xpath('//section[@class="info clearfix"]/dl/dd/span/text()')

     item['platform'] = u'实现网'

     title = response.xpath('//article[@class="job-show"]/h1[@class="title"]/text()').extract_first()
     item['title'] = title.strip()
     item['url'] = response.url

     paragraphs = response.xpath('//div[@class="content"]/p//text()').extract()
     item['body'] = ''.join(paragraphs).strip()

     last_segment = response.url.split('/')[-1]
     item['checksum'] = '{}_{}'.format('shixian_com', last_segment)

     item['city'] = breadcrumb.extract_first().strip()

     price_digits = response.xpath('//strong[@class="price"]/text()').re(r'\d+')
     item['price'] = int(price_digits[0])

     # Per the original note, the release time is relative text such as
     # "published about 19 hours ago", "published 1 day ago",
     # "published about 1 month ago" or "published about 1 year ago".
     released = response.xpath('//small[@class="time"]/text()').extract_first().strip()
     item['release_date'] = self.str2date(released)

     # Fourth info span holds the expiry text.
     expires = info_spans[3].extract().strip()
     item['expire_date'] = self.str2date(expires)

     # View count is not collected; selector kept for reference:
     # item['view_count'] = int(response.xpath('//div[@class="pull-right text-muted"]/text()').re('\d+')[0])

     # First info span is a '/'-separated category list.
     item['categories'] = info_spans[0].extract().strip().split('/')
     # Second breadcrumb link is stored as the only role.
     item['roles'] = [breadcrumb[1].extract().strip()]
     item['skills'] = response.xpath('//section[@class="skill-tags clearfix"]/dl/dd/text()').extract()
     yield item