def parse(self, response):
    """Parse the 大鲲网 (pro.lagou) project listing page.

    For each project entry, build a partially-filled JobItem from the
    listing markup, then yield a Request for the detail page with the
    item attached in ``request.meta['item']`` so ``parse_contents``
    can finish filling it in.

    :param response: listing-page response containing ``#project_list``.
    :yields: scrapy.Request per project, callback ``self.parse_contents``.
    """
    for sel in response.xpath('//div[@id="project_list"]/ul/li'):
        item = JobItem()
        item['platform'] = '大鲲网'
        item['title'] = sel.xpath('a/h3/text()').extract()[0].encode(
            "utf-8")
        url = sel.xpath('a/@href').extract()[0]
        item['url'] = url
        # Unique id = last path segment without its extension,
        # e.g. ".../projects/1234.html" -> "pro_lagou_1234".
        item['checksum'] = '{}_{}'.format(
            'pro_lagou', url.split('/')[-1].split('.')[0])
        # Price text looks like "1000-2000元"; keep only the lower bound.
        item['price'] = sel.xpath('a/div[2]/span/text()').extract(
            )[0].encode("utf-8").split('元')[0].split('-')[0]
        # Debug trace; parenthesized form works on both Py2 and Py3.
        print(item['url'])
        # view_count and expire_date are not available on the listing
        # page; expire_date is left empty for the pipeline.
        item['categories'] = [
            cat.xpath('text()').extract()[0].encode("utf-8")
            for cat in sel.xpath('a/div[@class="category_list"]/span')
        ]
        item['roles'] = ''
        item['expire_date'] = ''
        request = scrapy.Request(url, callback=self.parse_contents)
        request.meta['item'] = item
        yield request
def parse_detail(self, response):
    """Parse a zb.oschina.net (开源中国) project detail page into a JobItem.

    :param response: project detail page response.
    :yields: a fully populated JobItem.
    """
    item = JobItem()

    # Category tags live in two sibling element kinds inside the header box.
    categories_part_1 = response.xpath(
        '//div[@class="box-aw"]/div[1]/a['
        '@class="bn bn-icon bn-small light-green"]/text()').extract()
    categories_part_2 = response.xpath(
        '//div[@class="box-aw"]/div[1]/span['
        '@class="text-gray"]/text()').extract()

    # Attribute lines look like "技能要求: python, mysql"; keep the
    # value of the skills entry (non-breaking spaces stripped first).
    skills = ''
    skills_block = response.xpath(
        '//div[@class="project-attr"]/text()').extract()
    for skill in skills_block:
        parts = skill.replace(u'\xa0', '').encode("UTF8").split(':')
        if parts[0].strip() == '技能要求':
            skills = parts[1].strip()

    # Second publish-date span holds "发布时间: YYYY-MM-DD".
    release_date = response.xpath(
        '//span[@class="publish-date text-gray"]/text()').extract(
        )[1].split(":")[1].strip()

    item['platform'] = u'开源中国'
    item['title'] = response.xpath(
        '//h2[@class="wrap"]/text()').extract_first().strip()
    item['url'] = response.url
    item['body'] = ''.join(
        response.xpath('//div[@class="simditor-body wrap"]//text()').
        extract()).strip()
    item['checksum'] = '{}_{}'.format('zb_oschina_net',
                                      response.url.split('/')[-1])
    item['city'] = ''

    # Reward amount, e.g. "¥ 5,000": drop thousands separators, then
    # take the first run of digits. Search once and reuse the match
    # (the original ran re.search twice on the same text).
    price_text = response.xpath(
        '//div['
        '@class="box box-fr column justify reward-amount"'
        ']/div[1]/span/text()').extract_first().strip().replace(',', '')
    match = re.search(r'\d+', price_text)
    item['price'] = int(match.group()) if match else 0

    item['categories'] = categories_part_1 + categories_part_2
    item['roles'] = []
    item['skills'] = [skills]
    item['release_date'] = self.utc_rfc3339_string(release_date,
                                                   format='%Y-%m-%d')
    item['expire_date'] = ''
    yield item
def parse_detail(self, response):
    """Parse a shixian.com (实现网) job detail page into a JobItem.

    Release/expire dates on the page are relative strings such as
    "大约 19 小时前发布", "1 天前发布", "大约 1 个月前发布",
    "大约 1 年前发布" — converted to dates by ``self.str2date``.

    :param response: job detail page response.
    :yields: a fully populated JobItem.
    """
    item = JobItem()

    # These two selectors are each used twice below; query them once.
    breadcrumbs = response.xpath(
        '//ol[@class="breadcrumb jobs md-no-padding"]/li/a/text()')
    info_spans = response.xpath(
        '//section[@class="info clearfix"]/dl/dd/span/text()')

    item['platform'] = u'实现网'
    item['title'] = response.xpath(
        '//article[@class="job-show"]/h1[@class="title"]/text()'
    ).extract_first().strip()
    item['url'] = response.url
    item['body'] = ''.join(
        response.xpath('//div[@class="content"]/p//text()').extract()
    ).strip()
    item['checksum'] = '{}_{}'.format(
        'shixian_com', response.url.split('/')[-1]
    )
    # First breadcrumb link is the city.
    item['city'] = breadcrumbs.extract_first().strip()
    item['price'] = int(
        response.xpath('//strong[@class="price"]/text()').re(r'\d+')[0])

    release_date = response.xpath(
        '//small[@class="time"]/text()').extract_first().strip()
    item['release_date'] = self.str2date(release_date)
    # Fourth info span holds the expiry; first holds "cat1/cat2" categories.
    item['expire_date'] = self.str2date(info_spans[3].extract().strip())
    # view_count intentionally not scraped here.
    item['categories'] = info_spans[0].extract().strip().split('/')
    # Second breadcrumb link is the role/position.
    item['roles'] = [breadcrumbs[1].extract().strip()]
    item['skills'] = response.xpath(
        '//section[@class="skill-tags clearfix"]/dl/dd/text()').extract()
    yield item