def parse(self, response):
    super(SimplyhiredSpider, self).parse(response)

    # # with scrapy selector
    # # for sel in response.xpath('//div[@class="job"]'):
    # for sel in response.css('div.job'):
    #     item = self.init_item(response)
    #     item['keyword'] = response.meta['keyword']
    #     item['date_search'] = current_datetime()
    #     # this works, but returns a different url
    #     # item['item_url'] = sel.css('a.title::attr(href)').extract()[0]
    #     item['item_url'] = [tool.css('a::attr(href)')[0].extract() for tool in sel.css('div.tools')][0]
    #     item['title'] = sel.css('a.title::text').extract()[0].strip()

    # with bs4
    soup = bs4.BeautifulSoup(response.body, 'html.parser')
    soupitems = soup.select('div.job')
    if len(soupitems) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return

    for soupitem in soupitems:
        item = self.init_item(response)
        item['item_url'] = [a.attrs.get('href')
                            for a in soupitem.select('div.tools > a')
                            if a.attrs.get('href')][0]
        item['title'] = soupitem.select('h2')[0].text.strip()
        try:
            item['company'] = soupitem.h4.text
        except AttributeError:
            pass  # logger.debug('item: %s has no h4 tag' % i)
        if soupitem.find('span', itemprop="addressLocality"):
            item['locality'] = soupitem.find('span', itemprop="addressLocality").text
        if soupitem.find('span', itemprop="addressRegion"):
            item['region'] = soupitem.find('span', itemprop="addressRegion").text
        item['short_description'] = soupitem.find('p', itemprop="description").text
        item['published'] = timeago2datetimestr(item['date_search'],
                                                soupitem.select('span.ago')[0].text)
        # data not available on this website
        item['salary'] = ''
        item['clearance'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})

    # for url in response.xpath('//link[@rel="next"]/@href').extract()[0]:
    next_page = response.css('a.next::attr(href)').extract()
    if next_page:
        self.logger.debug('next url: %s' % next_page[0])
        yield Request(
            # self.base_url + next_page[0]['href'],  # with soup
            next_page[0],
            callback=self.parse,
            meta={'keyword': response.meta['keyword'],
                  'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
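# Hypothetical sketch, not the project's actual helper: every spider in this section
# calls timeago2datetimestr(date_search, ago_text) to turn a relative "N hours/days ago"
# label into an absolute datetime string. Assuming date_search uses the
# '%Y-%m-%d %H:%M:%S' format, a minimal version could look like this.
import re
from datetime import datetime, timedelta

def timeago2datetimestr(date_search, ago_text, fmt='%Y-%m-%d %H:%M:%S'):
    """Convert e.g. '3 days ago' into a datetime string relative to date_search."""
    base = datetime.strptime(date_search, fmt)
    match = re.search(r'(\d+)\s*(minute|hour|day|week|month)', ago_text)
    if not match:
        # 'today', '30+ days ago', etc. fall back to the search datetime itself
        return date_search
    count, unit = int(match.group(1)), match.group(2)
    delta = {'minute': timedelta(minutes=count),
             'hour': timedelta(hours=count),
             'day': timedelta(days=count),
             'week': timedelta(weeks=count),
             'month': timedelta(days=30 * count)}[unit]
    return (base - delta).strftime(fmt)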
def parse(self, response):
    super(GlassdoorSpider, self).parse(response)

    # listings = response.css('.standardJobListings')
    listings = response.css('.jlGrid')
    rows = listings.xpath('.//div[@itemtype="http://schema.org/JobPosting"]')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return

    for row in rows:
        item = self.init_item(response)
        url = row.xpath('.//h3[@itemprop="title"]/a/@href').extract()[0]
        item['item_url'] = self.base_url + url
        # the anchor text can have a span tag inside, so use string()
        item['title'] = row.xpath('string(.//h3[@itemprop="title"]/a)').extract()[0].strip()
        item['company'] = row.xpath('string(.//span[@class="employerName"])').extract()[0].strip()
        published = row.css('.logo').css('.minor::text').extract()[0].strip()
        item['published'] = timeago2datetimestr(item['date_search'], published)
        item['short_description'] = row.xpath('string(.//p[@itemprop="description"])').extract()[0].strip()
        loc = row.xpath('string(.//span[@itemprop="addressLocality"])').extract()[0].strip()
        try:
            item['locality'], item['region'] = loc.split(', ')
        except ValueError:
            item['locality'] = loc
            item['region'] = ''
        # data not available on this website
        item['salary'] = ''
        item['clearance'] = ''
        item['department'] = ''
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})

    next_page = response.css('.next a::attr(href)').extract()
    if next_page:
        self.logger.debug('next url: %s' % (self.base_url + next_page[0]))
        yield Request(
            # self.base_url + next_page[0]['href'],  # with soup
            self.base_url + next_page[0],
            callback=self.parse,
            meta={'keyword': response.meta['keyword'],
                  'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
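# Hypothetical sketch of the shared base class, which is not shown in this section:
# the spiders above call super(...).parse(response) and self.init_item(response).
# Judging from the older bs4 spider below, init_item seeds each item with the search
# keyword and the search timestamp. The class name and import paths are assumptions.
import scrapy

from scrapyscrappers.items import ScrapyscrappersItem  # assumed project import
from scrapyscrappers.utils import current_datetime     # assumed project import

class BaseJobSpider(scrapy.Spider):
    """Assumed shared parent of SimplyhiredSpider, GlassdoorSpider, CareerbuilderSpider."""

    def parse(self, response):
        # shared bookkeeping for every result page
        self.logger.debug('parsing %s' % response.url)

    def init_item(self, response):
        # pre-fill the fields every job board shares
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        return item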
def parse(self, response):
    self.logger.debug('in parse')

    # # with scrapy selector
    # # for sel in response.xpath('//div[@class="job"]'):
    # for sel in response.css('div.job'):
    #     self.logger.debug('parsing')
    #     item = ScrapyscrappersItem()
    #     item['keyword'] = response.meta['keyword']
    #     item['date_search'] = current_datetime()
    #     # item['item_url'] = sel.xpath('.//div[@class="tools"]/@href').extract()[0]
    #     # item['item_url'] = sel.css('a.title::attr(href)').extract()[0]
    #     item['item_url'] = [tool.css('a::attr(href)')[0].extract() for tool in sel.css('div.tools')][0]
    #     # item['title'] = sel.xpath('.//h2')
    #     item['title'] = sel.css('a.title::text').extract()[0].strip()

    # with bs4
    soup = bs4.BeautifulSoup(response.body, 'html.parser')
    soupitems = soup.select('div.job')
    for soupitem in soupitems:
        self.logger.debug('parsing')
        item = ScrapyscrappersItem()
        item['keyword'] = response.meta['keyword']
        item['date_search'] = current_datetime()
        item['item_url'] = [a.attrs.get('href')
                            for a in soupitem.select('div.tools > a')
                            if a.attrs.get('href')][0]
        item['title'] = soupitem.select('h2')[0].text.strip()
        try:
            item['company'] = soupitem.h4.text
        except AttributeError:
            pass  # logger.debug('item: %s has no h4 tag' % i)
        if soupitem.find('span', itemprop="addressLocality"):
            item['locality'] = soupitem.find('span', itemprop="addressLocality").text
        if soupitem.find('span', itemprop="addressRegion"):
            item['region'] = soupitem.find('span', itemprop="addressRegion").text
        item['short_description'] = soupitem.find('p', itemprop="description").text
        item['published'] = timeago2datetimestr(item['date_search'],
                                                soupitem.select('span.ago')[0].text)
        # salary
        # clearance
        # department
        # description
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})
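# Hypothetical sketch of the item definition: the real scrapyscrappers items module
# is not part of this section, but the fields assigned in the spiders above imply at
# least the following declaration (a 'description' field is assumed for parse_item).
import scrapy

class ScrapyscrappersItem(scrapy.Item):
    keyword = scrapy.Field()
    date_search = scrapy.Field()
    item_url = scrapy.Field()
    title = scrapy.Field()
    company = scrapy.Field()
    locality = scrapy.Field()
    region = scrapy.Field()
    published = scrapy.Field()
    short_description = scrapy.Field()
    description = scrapy.Field()
    salary = scrapy.Field()
    clearance = scrapy.Field()
    department = scrapy.Field()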
def parse(self, response):
    super(CareerbuilderSpider, self).parse(response)

    rows = response.css('.gs-job-result-abstract')
    if len(rows) < 1:
        append(self.fail_url_path, 'no data:' + response.url)
        return

    for row in rows:
        item = self.init_item(response)
        item['item_url'] = row.css('.jt').xpath('@href').extract()[0]
        item['title'] = row.css('.jt::text').extract()[0]
        # item['short_description'] = row.css('span[itemprop="description"]::text').extract()[0]
        # the same with xpath
        item['short_description'] = row.xpath('.//span[@itemprop="description"]/text()').extract()[0]
        item['company'] = row.xpath('.//a/@companyname').extract()[0]
        # or
        # try:
        #     item['company'] = row.xpath('.//td[@itemprop="hiringOrganization"]/a/text()').extract()[0]
        # except IndexError:
        #     self.logger.debug(row.xpath('.//td[@itemprop="hiringOrganization"]/a/text()').extract())
        location = row.xpath('.//div[@itemprop="jobLocation"]/span/text()').extract()[0]
        try:
            item['region'], item['locality'] = location.split(' - ')
        except ValueError:
            item['locality'] = location
        teaser = [i.strip() for i in
                  row.xpath('.//div[contains(@id, "pnlTeaser")]/p/text()').extract()[0].split('|')]
        if len(teaser) == 2:
            item['salary'] = teaser[1].split(':')[1]
        else:
            item['salary'] = ''
        item['department'] = ''
        ago = row.css('.jl_rslt_posted_cell span::text').extract()[0]
        item['published'] = timeago2datetimestr(item['date_search'], ago)
        self.logger.debug('title %s' % item['title'])
        yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})

    next_page = response.css('.JL_MXDLPagination2_next').xpath('@href').extract()
    if next_page:
        self.logger.debug('next url: %s' % next_page[0])
        yield Request(
            next_page[0],
            callback=self.parse,
            meta={'keyword': response.meta['keyword'],
                  'location': response.meta['location']})
    else:
        self.logger.debug('no next url')
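# Hypothetical sketch of the append() helper used for failure bookkeeping: the real
# implementation is not shown here, but it is called as
# append(self.fail_url_path, 'no data:' + response.url), i.e. it records URLs that
# yielded no results. A minimal version could simply append one line to a log file.
def append(path, line):
    """Append a single line to the file at path, creating the file if needed."""
    with open(path, 'a') as f:
        f.write(line + '\n')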