Exemplo n.º 1
0
 def parse_job_detail(self, response):
     """Parse a job-ad detail page and yield one fully loaded JobItem.

     Pages missing either a title or a company name are skipped silently.
     """
     hxs = HtmlXPathSelector(response)

     title = hxs.select("//div[@id='businessmyaccountjobadpreviewmain']/div/font/text()").extract_unquoted()
     company = hxs.select("//li[@class='propertiesleft' and contains(text(),'Podjetje:')]/following-sibling::li/text()").extract_unquoted()

     if title and company:
         city = hxs.select("//li[@class='propertiesleft' and contains(text(),'Regija in kraj dela:')]/following-sibling::li/text()").extract_unquoted()
         # FIX: pass the XPath as a unicode string. Encoding it to UTF-8
         # bytes hands lxml a non-ASCII byte string (the expression contains
         # 'Področje'), which lxml rejects with
         # "ValueError: All strings must be XML compatible".
         category = hxs.select(u"//li[@class='propertiesleft2' and contains(text(),'Področje dela:')]/following-sibling::li/text()").extract_unquoted()
         images_url = hxs.select("//img[@id='mainimage']/@src").extract()
         item = JobItem()

         if images_url:
             item.load_image(self.get_base_url(response, images_url[0]))

         loader = JobLoader(item)
         loader.add_value('title', title)
         loader.add_value('company', company)
         loader.add_value('category', category)
         loader.add_value('city', city)
         loader.add_value('details_url', url_query_cleaner(response.url))
         # The dates list renders as "<label>:  <date>"; capture the date part.
         loader.add_value('published_date', hxs.select("//li[@class='dates']/text()").re(r".*:\s+(.*)"))
         loader.add_value('id', self.generate_id(response.url))
         loader.add_value('content', response.body_as_unicode())
         loader.add_value('source', self.name)
         loader.add_value('source_label', self.label)

         yield loader.load_item()
Exemplo n.º 2
0
 def parse_job_detail(self, response):
     """Extract a single job posting from its detail page.

     Yields one JobItem when both the title and the company could be
     located on the page; otherwise yields nothing.
     """
     selector = HtmlXPathSelector(response)

     title = selector.select("//span[@class='header']/text()").extract_unquoted()
     company = selector.select("//span[@class='fontblacklarge']/b/text()").extract_unquoted()

     # Bail out early on pages that lack the two mandatory fields.
     if not (title and company):
         return

     city = selector.select("//span[@class='fontblacklarge']/b[2]/text()").extract_unquoted()
     category = response.request.meta['category']
     published = selector.select("//span[@class='fontblacklarge']/../../following-sibling::tr[2]/td/b/text()").extract_unquoted()

     item = JobItem()
     logo_urls = selector.select("//span[@class='fontblacklarge']/../following-sibling::td[2]/img/@src").extract()
     if logo_urls:
         item.load_image(self.get_base_url(response, logo_urls[0]))

     loader = JobLoader(item)
     for field, value in (
         ('title', title),
         ('company', company),
         ('category', category),
         ('city', city),
         ('details_url', url_query_cleaner(response.url, ('najdi', 'id'))),
         ('published_date', published),
         ('id', self.generate_id(response.url, ('najdi', 'id'))),
         ('content', response.body_as_unicode()),
         ('source', self.name),
         ('source_label', self.label),
     ):
         loader.add_value(field, value)

     yield loader.load_item()
Exemplo n.º 3
0
    def parse_job(self, response):
        """Walk one paginated listing page: queue the next results page,
        then emit a detail-page Request for every job entry found."""
        selector = HtmlXPathSelector(response)

        next_links = selector.select("//ul[@class='pagination']/li[@class='selected']//following-sibling::li[1]/a/@href").extract()
        if next_links:
            next_url = self.get_base_url(response, next_links[0])
            # Guard against pagination pointing back at the current page.
            if next_url != self.get_base_url(response, response.request.url):
                yield Request(url=next_url, callback=self.parse_job,
                              meta={'category': response.request.meta['category']})

        for entry in selector.select("//ul[@id='newJobs']/li"):
            job_title = entry.select("p[@class='jobTitle']/a/text()").extract_unquoted()
            employer = entry.select("strong/text()").extract_unquoted()
            if not (job_title and employer):
                continue

            link = self.get_base_url(response, entry.select("p[@class='jobTitle']/a/@href").extract()[0])
            if not link:
                continue

            item = JobItem()
            item['title'] = job_title
            item['company'] = employer
            item['category'] = response.request.meta['category']
            item['summary'] = entry.select("p[2]/text()").extract_unquoted()
            item['details_url'] = url_query_cleaner(link)
            item['published_date'] = entry.select("span[1]/text()").re(r".*:\s(.*)")

            logos = entry.select("div[@class='jobImgDiv']/img/@src").extract()
            if logos:
                item.load_image(self.get_base_url(response, logos[0]))

            yield Request(url=link, callback=self.parse_job_detail, meta={'item': item})
Exemplo n.º 4
0
 def parse_detail_pages(self, response):
     """Scrape a Lever job detail page into JobItem instances.

     Returns a list with one item per matched content block.
     """
     hxs = Selector(response)
     jobs = hxs.xpath('//div[contains(@class, "content")]')
     items = []
     for job in jobs:
         item = JobItem()
         # FIX: use relative './/' selectors. The original '//...' form
         # searched the whole document on every iteration instead of the
         # current job node, so multiple matches would all carry the same
         # document-level values.
         # NOTE(review): assumes the footer/location nodes live inside the
         # .content block — confirm against a live Lever page.
         item["title"] = job.xpath(
             './/div[contains(@class, "posting-headline")]/h2/text()'
         ).extract_first()
         item["company"] = job.xpath(
             './/div[contains(@class, "main-footer-text page-centered")]/p/a/text()'
         ).extract()
         item["company_url"] = job.xpath(
             './/div[contains(@class, "main-footer-text page-centered")]/p/a/@href'
         ).extract()
         item["body"] = job.xpath(
             './/div[contains(@class, "section page-centered")]').extract()
         item["location"] = job.xpath(
             './/div[contains(@class, "sort-by-time posting-category medium-category-label")]'
         ).extract_first()
         item["url"] = response.request.url
         # The page exposes no structured values for these fields.
         item["pub_date"] = 'n/a'
         item["email"] = 'n/a'
         item["salary"] = 'n/a'
         item["scrape_date"] = timezone.now()
         item["job_board"] = "Lever"
         item["board_url"] = "lever.co"
         items.append(item)
     return items
Exemplo n.º 5
0
 def parse_node(self, response, node):
     """Map one Remote Python RSS feed node onto a JobItem."""
     item = JobItem()
     # Values read from the feed node itself.
     item['title'] = node.xpath('title/text()').extract_first()
     item['url'] = node.xpath('link/text()').extract_first()
     item['body'] = node.xpath('description/text()').extract()
     # Board metadata is fixed for this spider.
     item['company'] = 'Remote Python'
     item['job_board'] = "Remote Python"
     item['board_url'] = "www.remotepython.com"
     item['scrape_date'] = timezone.now()
     # Placeholders for fields the feed does not provide.
     item['pub_date'] = 'n/a'
     item['email'] = 'n/a'
     item['salary'] = 'n/a'
     item['location'] = 'n/a'
     return item
Exemplo n.º 6
0
 def parse_node(self, response, node):
     """Translate a single Indeed RSS <item> node into a JobItem."""
     entry = JobItem()
     # Values taken straight from the feed node.
     entry['title'] = node.xpath('title/text()').extract_first()
     entry['company'] = node.xpath('source/text()').extract_first()
     entry['body'] = node.xpath('description/text()').extract()
     entry['pub_date'] = node.xpath('pubDate/text()').extract_first()
     entry['url'] = node.xpath('link/text()').extract_first()
     # Board metadata plus placeholders for fields Indeed does not supply.
     entry["scrape_date"] = timezone.now()
     entry["job_board"] = "Indeed"
     entry["board_url"] = "www.indeed.com"
     entry["email"] = 'n/a'
     entry["salary"] = 'n/a'
     entry['location'] = 'n/a'
     return entry
Exemplo n.º 7
0
    def parse_job(self, response):
        """Process one listing page: follow pagination, read the selected
        category filter, and queue a detail-page Request (with a
        pre-filled JobItem) for every job row."""
        hxs = HtmlXPathSelector(response)

        pager = hxs.select("//div[@class='PagedList-pager']//ul/li[contains(@class, 'PagedList-currentPage')]/following-sibling::li[1]/a/@href").extract()
        if pager:
            next_url = self.get_base_url(response, pager[0])
            # Stop when the "next" link loops back onto the current page.
            if next_url != self.get_base_url(response, response.request.url):
                yield Request(url=next_url, callback=self.parse_job)

        category = hxs.select("//form[@id='searchForm']//select[@name='wfid']/option[@selected='selected']/text()").extract()

        for row in hxs.select("//table[@class='job-add-listing']//tr//div[@class='job-add-item-inner']"):
            job_title = row.select("h2/a/text()").extract()
            employer = row.select("p[3]/strong/text()").extract()
            if not (job_title and employer):
                continue

            link = self.get_base_url(response, row.select("h2/a/@href").extract()[0])
            if not link:
                continue

            item = JobItem()
            item['title'] = job_title
            item['company'] = employer
            item['category'] = category
            item['summary'] = row.select("p[2]/text()").extract()
            item['city'] = row.select("p[1]/text()").extract()
            item['details_url'] = url_query_cleaner(link)
            item['published_date'] = row.select("div[contains(@class,'city-logo')]/div/text()").extract()

            logos = row.select("div[contains(@class,'city-logo')]/img/@src").extract()
            if logos:
                item.load_image(self.get_base_url(response, logos[0]))

            yield Request(url=link, callback=self.parse_job_detail, meta={'item': item})
Exemplo n.º 8
0
    def parse_job(self, response):
        """Crawl one results page: schedule the next page, capture the
        checked sector filter as the category, and emit a detail-page
        Request per listing row."""
        hxs = HtmlXPathSelector(response)

        next_links = hxs.select("//span[@class='stevilke']/a[contains(@class, 'active')]/following-sibling::a[1]/@href").extract()
        if next_links:
            next_url = self.get_base_url(response, next_links[0])
            # Skip self-referencing pagination links.
            if next_url != self.get_base_url(response, response.request.url):
                yield Request(url=next_url, callback=self.parse_job)

        category = hxs.select(
            "//input[@type='checkbox' and contains(@class, 'iskalnik_kriteriji_tip_sektor') and @checked]"
            "//following-sibling::label[1]/text()"
        ).extract_unquoted()

        for row in hxs.select("//tr[@class='bg_oglas_dm']"):
            job_title = row.select("td[@class='ena']/div/a/b/text()").extract_unquoted()
            employer = row.select("td[@class='dva']/a/text()").extract_unquoted()
            if not (job_title and employer):
                continue

            link = self.get_base_url(response, row.select("td[@class='ena']/div/a/@href").extract()[0])
            if not link:
                continue

            item = JobItem()
            item['title'] = job_title
            item['company'] = employer
            item['category'] = category
            item['city'] = row.select("td[@class='tri']/a/text()").extract_unquoted()
            item['details_url'] = url_query_cleaner(link)
            # Dates render as dd.mm.yyyy inside the second div of the cell.
            item['published_date'] = row.select("td[@class='stiri']//div[2]/text()").re(r"\s+(\d{2}.\d{2}.\d{4})\s+")

            logos = row.select("td[@class='stiri']//img/@src").extract()
            if logos:
                item.load_image(self.get_base_url(response, logos[0]))

            yield Request(url=link, callback=self.parse_job_detail, meta={'item': item})
Exemplo n.º 9
0
 def parse_item(self, response):
     """Yield a JobItem scraped from a plain-HTML job posting page."""
     job = JobItem()
     # Content pulled from fixed positions in the page layout.
     job['title'] = response.xpath('/html/head/title/text()').extract()
     job['body'] = response.xpath('/html/body/blockquote/font').extract()
     job['pub_date'] = response.xpath('(/html/body/p)[2]/strong/br/following-sibling::text()').extract()
     job['org_title'] = response.xpath('(/html/body/p)[2]/strong/big/text()').extract()
     job['org_email'] = response.xpath('(/html/body/p)[2]/strong/a/text()').extract()
     # Bookkeeping fields.
     job['url'] = response.url
     job['scrape_date'] = timezone.now()
     job['board_title'] = self.board_title
     job['board_url'] = self.board_url
     yield job
Exemplo n.º 10
0
 def parse_node(self, response, node):
     """Build a JobItem from one Django Gigs RSS feed node.

     Fields the feed does not expose (email, salary, location) are
     filled with the literal 'n/a' placeholder.
     """
     item = JobItem()
     item['title'] = node.xpath('title/text()').extract_first()
     item['company'] = 'Django Gigs'
     item['body'] = node.xpath('description/text()').extract()
     item['pub_date'] = node.xpath('pubDate/text()').extract_first()
     item['url'] = node.xpath('link/text()').extract_first()
     item['scrape_date'] = timezone.now()
     item['job_board'] = "Django Gigs"
     item['board_url'] = "www.djangogigs.com"
     # Cleanup: removed a stale commented-out debug assignment and the
     # redundant str() wrappers around string literals.
     item['email'] = 'n/a'
     item['salary'] = 'n/a'
     item['location'] = 'n/a'
     return item
Exemplo n.º 11
0
 def parse_node(self, response, node):
     """Build a JobItem from one Stack Overflow jobs RSS node.

     The feed title has extra detail appended in parentheses, so only
     the part before the first '(' is kept as the job title.
     """
     item = JobItem()
     # FIX: guard against a missing <title>. extract_first() returns
     # None in that case, and calling .split() on None raised
     # AttributeError in the original.
     raw_title = node.xpath('title/text()').extract_first()
     item['title'] = raw_title.split('(', 1)[0] if raw_title else raw_title
     item['company'] = node.xpath('name/text()').extract_first()
     item['body'] = node.xpath('description/text()').extract()
     item['pub_date'] = node.xpath('pubDate/text()').extract_first()
     item['url'] = node.xpath('link/text()').extract_first()
     item["scrape_date"] = timezone.now()
     item["job_board"] = "Stack Overflow"
     item["board_url"] = "www.stackoverflow.com"
     item["email"] = 'n/a'
     item["salary"] = 'n/a'
     # Evaluate the location XPath once instead of twice.
     location = node.xpath('location/text()').extract_first()
     item['location'] = location if location is not None else 'n/a'
     return item
Exemplo n.º 12
0
 def parse_detail_pages(self, response):
     """Scrape a Workable job detail page into JobItem instances.

     Returns a list with one item per <main class="stacked"> block.
     """
     hxs = Selector(response)
     jobs = hxs.xpath('//main[contains(@class, "stacked")]')
     items = []
     for job in jobs:
         item = JobItem()
         # Workable detail pages expose no structured title/company.
         item["title"] = 'n/a'
         item["company"] = 'n/a'
         # FIX: scope queries to the current job node. The original
         # '//main[...]' re-searched the whole document each iteration;
         # the body IS the loop node, so serialize it via xpath('.').
         item["body"] = job.xpath('.').extract()
         item["location"] = job.xpath(
             './/p[contains(@class, "meta")]').extract_first()
         item["url"] = response.request.url
         item["pub_date"] = 'n/a'
         item["email"] = 'n/a'
         item["salary"] = 'n/a'
         item["scrape_date"] = timezone.now()
         item["job_board"] = "Workable"
         item["board_url"] = "www.workable.com"
         items.append(item)
     return items
Exemplo n.º 13
0
 def parse_detail_pages(self, response):
     """Scrape a Recruiter Box job detail page into JobItem instances.

     Returns a list with one item per matched content container.
     """
     hxs = Selector(response)
     jobs = hxs.xpath('//div[contains(@id, "content")]')
     items = []
     for job in jobs:
         item = JobItem()
         # FIX: use relative './/' selectors so each query is scoped to
         # the current job node; the original '//...' form searched the
         # whole document on every iteration.
         item["title"] = job.xpath(
             './/h1[contains(@class, "jobtitle")]/text()').extract_first()
         item["company"] = 'n/a'
         # NOTE: 'jobdesciption' matches the site's own misspelled class.
         item["body"] = job.xpath(
             './/div[contains(@class, "jobdesciption")]').extract()
         item["location"] = job.xpath(
             './/span[contains(@class, "meta-job-location-city")]').extract()
         item["url"] = response.request.url
         item["pub_date"] = 'n/a'
         item["email"] = 'n/a'
         item["salary"] = 'n/a'
         item["scrape_date"] = timezone.now()
         item["job_board"] = "Recruiter Box"
         item["board_url"] = "www.recruiterbox.com"
         items.append(item)
     return items
Exemplo n.º 14
0
 def parse_detail_pages(self, response):
     """Scrape a Greenhouse job detail page into JobItem instances.

     Returns a list with one item per matched app_body container.
     """
     hxs = Selector(response)
     jobs = hxs.xpath('//div[contains(@id, "app_body")]')
     items = []
     for job in jobs:
         item = JobItem()
         # FIX: use relative './/' selectors so each query is scoped to
         # the current job node; the original '//...' form searched the
         # whole document on every iteration.
         item["title"] = job.xpath(
             './/h1[contains(@class, "app-title")]/text()').extract_first()
         item["company"] = job.xpath(
             './/span[contains(@class, "company-name")]/text()'
         ).extract_first()
         item["body"] = job.xpath(
             './/div[contains(@id, "content")]').extract()
         item["location"] = job.xpath(
             './/div[contains(@class, "location")]').extract_first()
         item["url"] = response.request.url
         item["pub_date"] = 'n/a'
         item["email"] = 'n/a'
         item["salary"] = 'n/a'
         item["scrape_date"] = timezone.now()
         item["job_board"] = "Greenhouse"
         item["board_url"] = "www.greenhouse.io"
         items.append(item)
     return items