示例#1
0
 def parse_detail_pages(self, response):
     """Scrape a Lever posting page into a list of ``JobItem`` objects.

     One item is appended for every ``div`` whose class attribute contains
     ``"content"``.  Fields that are not scraped are set to ``'n/a'``.

     NOTE(review): every field XPath below starts with ``//``, so it is
     presumably evaluated against the whole document rather than relative
     to the matched ``div`` -- confirm this is intended before changing it.
     """
     footer_link = (
         '//div[contains(@class, "main-footer-text page-centered")]/p/a')
     placeholder = str('n/a')
     results = []
     for container in Selector(response).xpath(
             '//div[contains(@class, "content")]'):
         entry = JobItem()

         headline = container.xpath(
             '//div[contains(@class, "posting-headline")]/h2/text()')
         entry["title"] = headline.extract_first()

         # Company name and link both come from the page footer anchor.
         entry["company"] = container.xpath(footer_link + '/text()').extract()
         entry["company_url"] = container.xpath(
             footer_link + '/@href').extract()

         sections = container.xpath(
             '//div[contains(@class, "section page-centered")]')
         entry["body"] = sections.extract()

         category = container.xpath(
             '//div[contains(@class, "sort-by-time posting-category medium-category-label")]')
         entry["location"] = category.extract_first()

         entry["url"] = response.request.url
         entry["pub_date"] = placeholder
         entry["email"] = placeholder
         entry["salary"] = placeholder
         entry["scrape_date"] = timezone.now()
         entry["job_board"] = "Lever"
         entry["board_url"] = "lever.co"
         results.append(entry)
     return results
示例#2
0
 def parse_node(self, response, node):
     """Convert one feed node into a ``JobItem`` for the Remote Python board.

     ``title``, ``body`` and ``url`` are read from the node's ``title``,
     ``description`` and ``link`` children; every other field is a fixed
     value.  ``response`` is not used by this method.
     """
     board_name = 'Remote Python'
     missing = 'n/a'
     item = JobItem()
     # Pairs are listed (and therefore assigned) in a fixed order.
     field_values = (
         ('title', node.xpath('title/text()').extract_first()),
         ('company', board_name),
         ('body', node.xpath('description/text()').extract()),
         ('pub_date', missing),
         ('url', node.xpath('link/text()').extract_first()),
         ('scrape_date', timezone.now()),
         ('job_board', board_name),
         ('board_url', 'www.remotepython.com'),
         ('email', missing),
         ('salary', missing),
         ('location', missing),
     )
     for key, value in field_values:
         item[key] = value
     return item
示例#3
0
 def parse_node(self, response, node):
     """Build the ``JobItem`` for a single Indeed feed node.

     Title, company, publication date and URL are taken from the first text
     node of the ``title``, ``source``, ``pubDate`` and ``link`` children;
     the body keeps every ``description`` text node as a list.  Email,
     salary and location are not scraped and are set to ``'n/a'``.
     ``response`` is not used by this method.
     """
     def first_text(tag):
         # First text node of the named child element of ``node``.
         return node.xpath(tag + '/text()').extract_first()

     not_available = str('n/a')
     entry = JobItem()
     entry['title'] = first_text('title')
     entry['company'] = first_text('source')
     entry['body'] = node.xpath('description/text()').extract()
     entry['pub_date'] = first_text('pubDate')
     entry['url'] = first_text('link')
     entry["scrape_date"] = timezone.now()
     entry["job_board"] = "Indeed"
     entry["board_url"] = "www.indeed.com"
     entry["email"] = not_available
     entry["salary"] = not_available
     entry['location'] = not_available
     return entry
示例#4
0
 def parse_item(self, response):
     """Yield one ``JobItem`` scraped from a job detail page.

     All scraped fields are stored as the full list of matches
     (``extract()``).  Publication date, organisation title and
     organisation e-mail are all read from the ``strong`` element of the
     second ``/html/body/p`` paragraph.  Board title and URL come from the
     spider's own ``board_title`` / ``board_url`` attributes.
     """
     # Common prefix of the three XPaths that read the header paragraph.
     header = '(/html/body/p)[2]/strong'

     posting = JobItem()
     posting['title'] = response.xpath('/html/head/title/text()').extract()
     posting['body'] = response.xpath('/html/body/blockquote/font').extract()
     posting['url'] = response.url

     published = response.xpath(header + '/br/following-sibling::text()')
     posting['pub_date'] = published.extract()

     posting['scrape_date'] = timezone.now()
     posting['board_title'] = self.board_title
     posting['board_url'] = self.board_url

     organisation = response.xpath(header + '/big/text()')
     posting['org_title'] = organisation.extract()

     contact = response.xpath(header + '/a/text()')
     posting['org_email'] = contact.extract()

     yield posting
示例#5
0
 def parse_node(self, response, node):
     """Turn a single Django Gigs feed node into a ``JobItem``.

     Title, publication date and URL are the first text node of the
     ``title``, ``pubDate`` and ``link`` children; the body keeps every
     ``description`` text node as a list.  Company and board are fixed to
     Django Gigs, and email, salary and location are set to ``'n/a'``.
     ``response`` is not used by this method.
     """
     unknown = str('n/a')
     title_sel = node.xpath('title/text()')
     body_sel = node.xpath('description/text()')
     date_sel = node.xpath('pubDate/text()')
     link_sel = node.xpath('link/text()')

     gig = JobItem()
     assignments = [
         ('title', title_sel.extract_first()),
         ('company', str('Django Gigs')),
         ('body', body_sel.extract()),
         ('pub_date', date_sel.extract_first()),
         ('url', link_sel.extract_first()),
         ('scrape_date', timezone.now()),
         ('job_board', "Django Gigs"),
         ('board_url', "www.djangogigs.com"),
         ('email', unknown),
         ('salary', unknown),
         ('location', unknown),
     ]
     for field, value in assignments:
         gig[field] = value
     return gig
示例#6
0
 def parse_node(self, response, node):
     """Convert one Stack Overflow feed node into a ``JobItem``.

     Args:
         response: The response being parsed; not used by this method.
         node: Selector for a single feed entry, queried with relative
             XPaths.

     Returns:
         The populated ``JobItem``.  ``title`` is the text before the first
         ``(`` in the node's title, or ``None`` when the node has no title
         text.  ``location`` falls back to ``'n/a'`` when the node has no
         ``location`` text.
     """
     item = JobItem()

     raw_title = node.xpath('title/text()').extract_first()
     # extract_first() returns None when nothing matches; calling .split()
     # on that would raise AttributeError, so leave the title as None then.
     if raw_title is not None:
         raw_title = raw_title.split('(', 1)[0]
     item['title'] = raw_title

     item['company'] = node.xpath('name/text()').extract_first()
     item['body'] = node.xpath('description/text()').extract()
     item['pub_date'] = node.xpath('pubDate/text()').extract_first()
     item['url'] = node.xpath('link/text()').extract_first()
     item["scrape_date"] = timezone.now()
     item["job_board"] = "Stack Overflow"
     item["board_url"] = "www.stackoverflow.com"
     item["email"] = str('n/a')
     item["salary"] = str('n/a')
     # Evaluate the location XPath once and let extract_first() supply the
     # placeholder when there is no match.
     item['location'] = node.xpath('location/text()').extract_first(
         default=str('n/a'))
     return item
示例#7
0
 def parse_detail_pages(self, response):
     """Scrape a Workable posting page into a list of ``JobItem`` objects.

     One item is appended for every ``main`` element whose class contains
     ``"stacked"``.  Only body, location and URL are scraped; title,
     company, publication date, email and salary are set to ``'n/a'``.

     NOTE(review): the body and location XPaths start with ``//``, so they
     are presumably evaluated against the whole document rather than
     relative to the matched element -- confirm this is intended.
     """
     main_xpath = '//main[contains(@class, "stacked")]'
     blank = str('n/a')
     collected = []
     for posting in Selector(response).xpath(main_xpath):
         body_html = posting.xpath(main_xpath).extract()
         meta_html = posting.xpath(
             '//p[contains(@class, "meta")]').extract_first()

         record = JobItem()
         record["title"] = blank
         record["company"] = blank
         record["body"] = body_html
         record["location"] = meta_html
         record["url"] = response.request.url
         record["pub_date"] = blank
         record["email"] = blank
         record["salary"] = blank
         record["scrape_date"] = timezone.now()
         record["job_board"] = "Workable"
         record["board_url"] = "www.workable.com"
         collected.append(record)
     return collected
示例#8
0
 def parse_detail_pages(self, response):
     """Scrape a Recruiter Box posting page into a list of ``JobItem`` objects.

     One item is appended for every ``div`` whose id contains ``"content"``.
     Title is the first matching text node; body and location keep every
     match as a list.  Company, publication date, email and salary are set
     to ``'n/a'``.

     NOTE(review): the field XPaths start with ``//``, so they are
     presumably evaluated against the whole document rather than relative
     to the matched ``div`` -- confirm this is intended.
     """
     not_scraped = str('n/a')
     page = Selector(response)
     found = []
     for wrapper in page.xpath('//div[contains(@id, "content")]'):
         title_sel = wrapper.xpath(
             '//h1[contains(@class, "jobtitle")]/text()')
         # The class name spelling below is kept exactly as in the query.
         description_sel = wrapper.xpath(
             '//div[contains(@class, "jobdesciption")]')
         city_sel = wrapper.xpath(
             '//span[contains(@class, "meta-job-location-city")]')

         posting = JobItem()
         posting["title"] = title_sel.extract_first()
         posting["company"] = not_scraped
         posting["body"] = description_sel.extract()
         posting["location"] = city_sel.extract()
         posting["url"] = response.request.url
         posting["pub_date"] = not_scraped
         posting["email"] = not_scraped
         posting["salary"] = not_scraped
         posting["scrape_date"] = timezone.now()
         posting["job_board"] = "Recruiter Box"
         posting["board_url"] = "www.recruiterbox.com"
         found.append(posting)
     return found
示例#9
0
 def parse_detail_pages(self, response):
     """Scrape a Greenhouse posting page into a list of ``JobItem`` objects.

     One item is appended for every ``div`` whose id contains
     ``"app_body"``.  Title, company and location take the first match;
     body keeps every match as a list.  Publication date, email and salary
     are set to ``'n/a'``.

     NOTE(review): the field XPaths start with ``//``, so they are
     presumably evaluated against the whole document rather than relative
     to the matched ``div`` -- confirm this is intended.
     """
     filler = str('n/a')
     output = []
     app_bodies = Selector(response).xpath(
         '//div[contains(@id, "app_body")]')
     for app_body in app_bodies:
         job_item = JobItem()
         scraped = (
             ("title", app_body.xpath(
                 '//h1[contains(@class, "app-title")]/text()'
             ).extract_first()),
             ("company", app_body.xpath(
                 '//span[contains(@class, "company-name")]/text()'
             ).extract_first()),
             ("body", app_body.xpath(
                 '//div[contains(@id, "content")]').extract()),
             ("location", app_body.xpath(
                 '//div[contains(@class, "location")]').extract_first()),
             ("url", response.request.url),
             ("pub_date", filler),
             ("email", filler),
             ("salary", filler),
             ("scrape_date", timezone.now()),
             ("job_board", "Greenhouse"),
             ("board_url", "www.greenhouse.io"),
         )
         for name, value in scraped:
             job_item[name] = value
         output.append(job_item)
     return output