# Tail of the LesJeudisSpider crawl configuration (the opening of the
# _crawl_parameters dict lies outside this view).
'from_page_enabled': True,
'from_list__next_page__css': 'ul.pagination li.arrow a::attr(href)',
'from_list__jobs_lists__css': 'body',
'from_list__jobs__css': 'div[itemtype="http://schema.org/JobPosting"]',
'from_list__url__css': 'div#job-title h2 a::attr(href)',
'from_list__title__css': 'div#job-title h2 a::text',
'from_list__publication_datetime__css': 'span[itemprop="datePosted"]',
'from_list__tags__css': 'p[itemprop="skills"] a::text',
'from_list__address__css': 'span[itemprop="jobLocation"]::text',
'from_list__company__css': 'div[itemprop="hiringOrganization"] a::text',
# 'from_list__company_url__css': 'div[itemprop="hiringOrganization"] a::attr(href)',
'from_page__description__css': 'div.job-content',
'from_page__publication_datetime__css': 'p.info span:nth-child(2)',
}

def _get_from_list__publication_datetime(self, node):
    # The list page exposes no reliable date: fall back to crawl time.
    return datetime.now()

def _get_from_page__publication_datetime(self, node):
    # Parse the publication date shown on the job page.
    raw_date = self._extract_first(node, 'from_page__publication_datetime')
    if raw_date:
        # The date looks like "24 août 2015" (French month name);
        # translate the month to English so strptime can parse it.
        raw_date_english = self._month_french_to_english(raw_date)
        # Extract the date from that text.
        return datetime.strptime(raw_date_english, '%d %B %Y')
    # No date found on the page: default to crawl time.
    return datetime.now()

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(LesJeudisSpider)
# Fragment of the RemixJobsSpider item-building code: the enclosing parse
# method and the assignments feeding url/company/address begin outside
# this view.
company_info.css('.website > a::attr(href)').extract())

# description = job_node.css('div.job-description').extract()

# Tags are optional on the page; only set the field when some were found.
tags_html = self._extract_all(job_node, 'from_page__tags', required=False)
if tags_html:
    item['tags'] = [Tag(tag, 1) for tag in tags_html]

# 19 mars 2015,
#
# thedate_xpath = './div[@id="content-core"]/div[@id="content-core"]/div[@class="discreet"]/text()'
# # print 'DATE IS', job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()
# thedate = job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()[0].strip().replace(u'Créé le ', '')
# # Now date is formatted as "14/10/2015 13:17"
# thedatetime = datetime.strptime(thedate, '%d/%m/%Y %H:%M')
# publication_datetime = thedatetime

item['address'] = address
item['url'] = url  # used as uid
item['source'] = self.name
item['company'] = company
item['company_url'] = company_url
item['initial_crawl_datetime'] = datetime.now()
item['status'] = JobItem.CrawlStatus.COMPLETED

yield item

# Registers the spider as a crawlable job source.
source = JobSource.from_job_spider(RemixJobsSpider)
# Tail of the LinuxJobsSpider crawl configuration (the opening of the
# _crawl_parameters dict lies outside this view).
),
'from_page__address__css': 'div.container div.row:nth-child(2) .col-md-9 h4:nth-child(4)::text',
'from_page__description__css': 'div.container div.row:nth-child(4) div.col-md-9',
'from_page__tags__css': 'div.container div.row:nth-child(4) div.col-md-9'
}

def _get_from_list__jobs(self, node):
    # List entries are in ascending (oldest-first) order; reverse so the
    # newest offers are processed first.
    jobs = super(LinuxJobsSpider, self)._get_from_list__jobs(node)
    if jobs:
        return jobs[::-1]  # Reverse jobs list (they are in asc order)
    return jobs

def _get_from_list__url(self, jobs_node):
    # A node without an <h4> is not a job entry: abort crawling it.
    if len(jobs_node.css('h4')) < 1:  # If no h4, then, this is not a job
        raise NotCrawlable()
    return super(LinuxJobsSpider, self)._get_from_list__url(jobs_node)

def _get_from_page__publication_datetime(self, job_container):
    # The page shows "Ajoutée le <date>" with a French month name:
    # strip the prefix, translate the month, then parse.
    publication_datetime_str = self._extract_first(job_container, 'from_page__publication_datetime')
    publication_datetime_str = publication_datetime_str.replace(u'Ajout\xe9e le', '')
    publication_datetime_str_english = self._month_french_to_english(publication_datetime_str)
    # NOTE(review): assumes _month_french_to_english yields a string that
    # strptime('%d %B %Y') accepts (no stray leading whitespace) — confirm.
    return datetime.strptime(publication_datetime_str_english, '%d %B %Y')

def _get_from_page__address(self, job_container):
    address = super(LinuxJobsSpider, self)._get_from_page__address(job_container)
    if address:
        # Drop any parenthesised suffix, e.g. "Paris (programmeurs)" -> "Paris".
        return re.sub(r'\([^)]*\)', '', address).strip()  # address is like Paris (programmeurs)
    return None

# Registers the spider as a crawlable job source.
source = JobSource.from_job_spider(LinuxJobsSpider)
from pyjobs_crawlers.spiders import JobSpider, JobSource


class UrbanLinkerSpider(JobSpider):
    """Crawler for Python job offers published on urbanlinker.com."""

    name = 'urbanlinker'
    start_urls = ['http://www.urbanlinker.com/offresdemploi/motcle/python/']
    label = 'Urban Linker'
    url = 'http://www.urbanlinker.com/'
    logo_url = 'http://www.urbanlinker.com/wp-content/themes/urbanlinker/images/logo-new.jpg'

    # CSS selectors driving the generic list/page crawl machinery.
    _crawl_parameters = {
        'from_page_enabled': True,

        'from_list__jobs_lists__css': '#contentoffres',
        'from_list__jobs__css': 'article.post',
        'from_list__url__css': 'h2.title-article a::attr(href)',
        'from_list__next_page__css': 'ul.bottomnav-content li.last a::attr(href)',
        'from_list__title__css': 'h2.title-article h2 a::text',
        'from_list__publication_datetime__css': '.post-info time::attr(datetime)',

        'from_page__container__css': 'article.post',
        'from_page__title__css': 'h1.title-job::text',
        'from_page__description__css': 'div.post-content',
        'from_page__address__css': 'header h1 + span::text',
    }

    def _get_from_list__publication_datetime(self, node):
        """Return the raw ``datetime`` attribute of the list entry (or None)."""
        raw_value = self._extract_first(
            node, 'from_list__publication_datetime', required=False)
        return raw_value


# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(UrbanLinkerSpider)
# Interior of the UrbanLinkerSpider class (the class header lies outside
# this view).
name = 'urbanlinker'
start_urls = ['http://www.urbanlinker.com/offresdemploi/motcle/python/']
label = 'Urban Linker'
url = 'http://www.urbanlinker.com/'
logo_url = 'http://www.urbanlinker.com/wp-content/themes/urbanlinker/images/logo-new.jpg'

# CSS selectors driving the generic list/page crawl machinery.
_crawl_parameters = {
    'from_page_enabled': True,
    'from_list__jobs_lists__css': '#contentoffres',
    'from_list__jobs__css': 'article.post',
    'from_list__url__css': 'h2.title-article a::attr(href)',
    'from_list__next_page__css': 'ul.bottomnav-content li.last a::attr(href)',
    'from_list__title__css': 'h2.title-article h2 a::text',
    'from_list__publication_datetime__css': '.post-info time::attr(datetime)',
    'from_page__container__css': 'article.post',
    'from_page__title__css': 'h1.title-job::text',
    'from_page__description__css': 'div.post-content',
    'from_page__address__css': 'header h1 + span::text',
}

def _get_from_list__publication_datetime(self, node):
    # Return the raw "datetime" attribute text; with required=False this
    # may be None when the attribute is absent.
    return self._extract_first(node, 'from_list__publication_datetime', required=False)

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(UrbanLinkerSpider)
satisfying = super(LolixJobSpider, self)._item_satisfying(item) if satisfying: # fixme if item['title'].lower().find('python') <= 0: return False return satisfying def address_forbidden_content(self): return [ u'Tél', u'offre', u'Administration', u'BTP', u'Enseignement', u'Industrie', u'Informatique', u'Recherche', u'Editeur', u'Internet', u'SSII' ] def match_str(self, string, forbidden_string_items): for forbidden_item in forbidden_string_items: if string.find(forbidden_item) >= 0: return True return False source = JobSource.from_job_spider(LolixJobSpider)
# Fragment of LinuxJobsSpider._get_from_list__jobs (its def line lies
# outside this view): list entries are in ascending (oldest-first) order,
# so reverse them to process the newest offers first.
jobs = super(LinuxJobsSpider, self)._get_from_list__jobs(node)
if jobs:
    return jobs[::-1]  # Reverse jobs list (they are in asc order)
return jobs

def _get_from_list__url(self, jobs_node):
    # A node without an <h4> is not a job entry: abort crawling it.
    if len(jobs_node.css('h4')) < 1:  # If no h4, then, this is not a job
        raise NotCrawlable()
    return super(LinuxJobsSpider, self)._get_from_list__url(jobs_node)

def _get_from_page__publication_datetime(self, job_container):
    # The page shows "Ajoutée le <date>" with a French month name:
    # strip the prefix, translate the month, then parse.
    publication_datetime_str = self._extract_first(
        job_container, 'from_page__publication_datetime')
    publication_datetime_str = publication_datetime_str.replace(
        u'Ajout\xe9e le', '')
    publication_datetime_str_english = self._month_french_to_english(
        publication_datetime_str)
    # NOTE(review): assumes the translated text matches '%d %B %Y'
    # exactly (no stray leading whitespace) — confirm.
    return datetime.strptime(publication_datetime_str_english, '%d %B %Y')

def _get_from_page__address(self, job_container):
    address = super(LinuxJobsSpider, self)._get_from_page__address(job_container)
    if address:
        # Drop any parenthesised suffix, e.g. "Paris (programmeurs)" -> "Paris".
        return re.sub(
            r'\([^)]*\)', '',
            address).strip()  # address is like Paris (programmeurs)
    return None

# Registers the spider as a crawlable job source.
source = JobSource.from_job_spider(LinuxJobsSpider)
# company_url = job_infos.xpath('./li[1]/a/@href').extract_first().strip() address = job_infos.xpath('./li[4]/text()').extract_first().strip().rstrip(',') # description = job_node.css('div.job-description').extract() tags_html = self._extract_first(job_node, 'from_page__tags', required=False) if tags_html: item['tags'] = self.extract_tags(tags_html) # 19 mars 2015, # # thedate_xpath = './div[@id="content-core"]/div[@id="content-core"]/div[@class="discreet"]/text()' # # print 'DATE IS', job_node.xpath(thedate_xpath)[0].extract().strip().splitlines() # thedate = job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()[0].strip().replace(u'Créé le ', '') # # Now date is formatted as "14/10/2015 13:17" # thedatetime = datetime.strptime(thedate, '%d/%m/%Y %H:%M') # publication_datetime = thedatetime item['address'] = address item['url'] = url # used as uid item['source'] = self.name item['company'] = company # item['company_url'] = company_url item['initial_crawl_datetime'] = datetime.now() item['status'] = JobItem.CrawlStatus.COMPLETED yield item source = JobSource.from_job_spider(RemixJobsSpider)
# Interior of the PoleEmploiSpider class (the class header and earlier
# attributes lie outside this view).
logo_url = u'http://www.pole-emploi.fr/accueil/image/site/logo/logo-pole-emploi_region.png'

# CSS selectors driving the generic list/page crawl machinery.
_crawl_parameters = {
    'from_page_enabled': True,
    'from_list__jobs_lists__css': 'div#offrescartezone div.result-page table.definition-table',
    'from_list__jobs__css': 'tr[itemtype="http://schema.org/JobPosting"]',
    'from_list__url__css': 'a::attr(href)',
    'from_list__title__css': 'a.title::text',
    'from_list__company__css': 'span.company span[itemprop=name]::text',
    'from_list__next_page__css': None,
    # FIXME - D.A. - 2016-02-19 - next page is protected by javascript.
    # This is not a problem for us (we crawl every 15 minutes).

    'from_page__container__css': '#offre-body',
    'from_page__title__css': 'h4[itemprop=title]',
    'from_page__publication_datetime__css': 'span[itemprop=datePosted]::text',
    'from_page__company__css': '#second h3.nom::text',
    'from_page__address__css': 'li[itemprop=addressRegion]::text',
    'from_page__description__css': '#offre-body div',
    'from_page__tags__css': 'p[itemprop=description]::text',
}

def _get_from_page__publication_datetime(self, job_node):
    # The page shows the posting date as "dd/mm/YYYY"; parse it,
    # otherwise defer to the parent class default.
    date_text = self._extract_first(job_node, 'from_page__publication_datetime')
    if date_text:
        return datetime.strptime(date_text, '%d/%m/%Y')
    return super(PoleEmploiSpider, self)._get_from_page__publication_datetime(job_node)

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(PoleEmploiSpider)
'from_page__address__xpath': './/h4[1]/following-sibling::div[@class="row"]/text()', 'from_page__description__css': '#content', 'from_page__tags__xpath': './div[@id="content-core"]/div[@id="content-core"]' } def _get_from_page__publication_datetime(self, job_container): try: publication_date_text = self._extract_first( job_container, 'from_page__publication_datetime') if publication_date_text: publication_date_text_clean = publication_date_text.replace( u'Créé le ', '') return datetime.strptime(publication_date_text_clean, '%d/%m/%Y %H:%M') return super( AfpyJobSpider, self)._get_from_page__publication_datetime(job_container) except Exception, exc: self.get_connector().log( self.name, self.ACTION_CRAWL_ERROR, "Error during publication date extraction: %s" % str(exc)) return super( AfpyJobSpider, self)._get_from_page__publication_datetime(job_container) source = JobSource.from_job_spider(AfpyJobSpider)
# Interior of the HumanCodersSpider class (the class header lies outside
# this view).
name = 'human'
start_urls = ['http://jobs.humancoders.com/python']
label = 'Human coders'
url = 'http://jobs.humancoders.com/'
logo_url = 'http://jobs.humancoders.com/assets/logo-b2ddc104507a3e9f623788cf9278ba0e.png'

# CSS selectors driving the generic list/page crawl machinery.
_crawl_parameters = {
    'from_page_enabled': True,
    'from_list__jobs_lists__css': 'body',
    'from_list__jobs__css': 'li.job',
    'from_list__url__css': 'div.job_title h2 a::attr(href)',
    'from_list__title__css': 'div.job_title h2 a::text',
    'from_list__publication_datetime__css': 'div.date::text',
    'from_list__tags__css': 'ul.tags li p::text',
    'from_list__company__css': 'div.company span.company_name::text',
    'from_list__address__css': 'div.location::text',
    'from_page__container__css': 'body',
    'from_page__company_url__css': 'div.company_url a::attr(href)',
    'from_page__description__css': '#description'
}

def _get_from_list__publication_datetime(self, node):
    # Parse the publication date shown in the list entry.
    raw_date = self._extract_first(node, 'from_list__publication_datetime')
    if raw_date:
        # The date looks like "24 août 2015" (French month name);
        # translate the month to English so strptime can parse it.
        raw_date_english = self._month_french_to_english(raw_date)
        # Extract the date from that text.
        return datetime.strptime(raw_date_english, '%d %B %Y')
    # NOTE(review): implicitly returns None when no date is found —
    # confirm the framework tolerates a missing datetime here.

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(HumanCodersSpider)
def _get_from_list__url(self, node):
    """Return the absolute URL of a job extracted from its list node."""
    extracted_url = self._extract_first(node, 'from_list__url', required=True)
    return self._get_absolute_url(extracted_url.encode('utf-8'))

def _get_from_list__tags(self, node):
    """
    Tags are hidden in img/alt ('alt="recrutement développeur python"')
    TODO : Must find another way to create tag instead of creating obj
    """
    raw_tags = self._extract_all(node, 'from_list__tags')
    if raw_tags:
        # Keep only the last word of each alt text (e.g. "python").
        return [Tag(tag.split()[-1]) for tag in raw_tags]
    # Fixed: previously returned True when no tag was found, which is not
    # a valid tag collection; return an empty list instead.
    return []

def _get_from_list__publication_datetime(self, node):
    """
    The datetime is humanized/natural (ex: "2 days ago")
    Our goal here is to parse it to a datetime object.
    """
    raw_date = self._extract_first(node, 'from_list__publication_datetime')
    if raw_date:
        cal = parsedatetime.Calendar()
        time_struct, parse_status = cal.parse(raw_date)
        # Build a datetime from (year, month, day, hour, minute, second).
        return datetime(*time_struct[:6])

# Registers the spider as a crawlable job source.
source = JobSource.from_job_spider(BlueCodersSpider)
and not self.match_str(content, self.address_forbidden_content()): address += content + ', ' return address def _item_satisfying(self, item): satisfying = super(LolixJobSpider, self)._item_satisfying(item) if satisfying: # fixme if item['title'].lower().find('python') <= 0: return False return satisfying def address_forbidden_content(self): return [ u'Tél', u'offre', u'Administration', u'BTP', u'Enseignement', u'Industrie', u'Informatique', u'Recherche', u'Editeur', u'Internet', u'SSII' ] def match_str(self, string, forbidden_string_items): for forbidden_item in forbidden_string_items: if string.find(forbidden_item) >= 0: return True return False source = JobSource.from_job_spider(LolixJobSpider)
# Tail of the AlsaCreationsSpider crawl configuration (the opening of the
# _crawl_parameters dict lies outside this view).
'from_page__container__css': 'div.fiche',
'from_page__title__css': '#premier h2[itemprop=title]::text',
'from_page__publication_datetime__css': 'p.navinfo time::attr(datetime)',
'from_page__company__css': '#second h3.nom::text',
'from_page__company_url__css': '#second a[itemprop=url]::attr(href)',
'from_page__address__css': '#premier b[itemprop=jobLocation]::text',
'from_page__description__css': '#premier p[itemprop=description]',
'from_page__tags__css': '#premier p[itemprop=skills] b::text',
}

def _get_from_list__publication_datetime(self, job_node):
    # The list page exposes no usable date: fall back to crawl time.
    return datetime.now()

def _get_from_page__publication_datetime(self, job_node):
    date_text = self._extract_first(job_node, 'from_page__publication_datetime')
    if date_text:
        # NOTE(review): returns the raw <time datetime="..."> attribute
        # string rather than a parsed datetime object — confirm downstream
        # accepts that.
        return date_text
    # NOTE(review): falls back to the *list* datetime getter (crawl time),
    # not the page one — looks intentional but verify.
    return super(AlsaCreationsSpider, self)._get_from_list__publication_datetime(job_node)

# def _get_from_page__tags(self, job_node):
#     # TODO - 2016-02-18 - D.A. - Make tags import ok
#     # Use the standard tags methods to extract tags (according to base list
#     tags = self._extract_all(job_node, 'from_page__tags')
#     if tags:
#         return tags
#     return super(AlsaCreationsSpider, self)._get_from_page__tags(job_node)

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(AlsaCreationsSpider)
'from_page__title__xpath': './h1[@id="parent-fieldname-title"]/text()', 'from_page__company__xpath': ('.//h4/a/text()', './/h4/text()'), 'from_page__company_url__xpath': './div[@id="content-core"]/div[@id="content-core"]/h4/a/@href', 'from_page__address__xpath': './/h4[1]/following-sibling::div[@class="row"]/text()', 'from_page__description__css': '#content', 'from_page__tags__xpath': './div[@id="content-core"]/div[@id="content-core"]' } def _get_from_list__publication_datetime(self, job_container): try: publication_date_text = self._extract_first(job_container, 'from_list__publication_datetime') if publication_date_text: publication_date_text_clean = publication_date_text.replace(u'Créé le ', '').replace(u' par', '') return datetime.strptime(publication_date_text_clean, '%d/%m/%Y %H:%M') return super(AfpyJobSpider, self)._get_from_page__publication_datetime(job_container) except Exception, exc: self.get_connector().log( self.name, self.ACTION_CRAWL_ERROR, "Error during publication date extraction: %s" % str(exc) ) return super(AfpyJobSpider, self)._get_from_page__publication_datetime(job_container) def _get_from_page__description(self, node): description = super(AfpyJobSpider, self)._get_from_page__description(node) if description: return re.sub('<h1[^>]*?>.*?</h1>', '', description) return description source = JobSource.from_job_spider(AfpyJobSpider)
# Interior of the HumanCodersSpider class (the class header and the name /
# start_urls attributes lie outside this view).
label = 'Human coders'
url = 'http://jobs.humancoders.com/'
logo_url = 'http://jobs.humancoders.com/assets/logo-b2ddc104507a3e9f623788cf9278ba0e.png'

# CSS selectors driving the generic list/page crawl machinery.
_crawl_parameters = {
    'from_page_enabled': True,
    'from_list__jobs_lists__css': 'body',
    'from_list__jobs__css': 'li.job',
    'from_list__url__css': 'div.job_title h2 a::attr(href)',
    'from_list__title__css': 'div.job_title h2 a::text',
    'from_list__publication_datetime__css': 'div.date::text',
    'from_list__tags__css': 'ul.tags li p::text',
    'from_list__company__css': 'div.company span.company_name::text',
    'from_list__address__css': 'div.location::text',
    'from_page__container__css': 'body',
    'from_page__company_url__css': 'div.company_url a::attr(href)',
    'from_page__description__css': '#description'
}

def _get_from_list__publication_datetime(self, node):
    # Parse the publication date shown in the list entry.
    raw_date = self._extract_first(node, 'from_list__publication_datetime')
    if raw_date:
        # The date looks like "24 août 2015" (French month name);
        # translate the month to English so strptime can parse it.
        raw_date_english = self._month_french_to_english(
            raw_date)
        # Extract the date from that text.
        return datetime.strptime(
            raw_date_english, '%d %B %Y')
    # NOTE(review): implicitly returns None when no date is found —
    # confirm the framework tolerates a missing datetime here.

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(HumanCodersSpider)
# Tail of the PoleEmploiSpider crawl configuration (the opening of the
# _crawl_parameters dict lies outside this view).
'from_list__jobs_lists__css': 'div#offrescartezone div.result-page table.definition-table',
'from_list__jobs__css': 'tr[itemtype="http://schema.org/JobPosting"]',
'from_list__url__css': 'a::attr(href)',
'from_list__title__css': 'a.title::text',
'from_list__company__css': 'span.company span[itemprop=name]::text',
'from_list__next_page__css': None,
# FIXME - D.A. - 2016-02-19 - next page is protected by javascript.
# This is not a problem for us (we crawl every 15 minutes).

'from_page__container__css': '#offre-body',
'from_page__title__css': 'h4[itemprop=title]',
'from_page__publication_datetime__css': 'span[itemprop=datePosted]::text',
'from_page__company__css': '#second h3.nom::text',
'from_page__address__css': 'li[itemprop=addressRegion]::text',
'from_page__description__css': '#offre-body p[itemprop=description]',
'from_page__tags__css': 'p[itemprop=description]::text',
}

def _get_from_page__publication_datetime(self, job_node):
    # The page shows the posting date as "dd/mm/YYYY"; parse it,
    # otherwise defer to the parent class default.
    date_text = self._extract_first(job_node, 'from_page__publication_datetime')
    if date_text:
        return datetime.strptime(date_text, '%d/%m/%Y')
    return super(PoleEmploiSpider, self)._get_from_page__publication_datetime(job_node)

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(PoleEmploiSpider)
# Tail of the AlsaCreationsSpider crawl configuration: this fragment opens
# with the value of a dict entry whose key lies outside this view.
'p.navinfo time::attr(datetime)',
'from_page__company__css': '#second h3.nom::text',
'from_page__company_url__css': '#second a[itemprop=url]::attr(href)',
'from_page__address__css': '#premier b[itemprop=jobLocation]::text',
'from_page__description__css': '#premier p[itemprop=description]',
'from_page__tags__css': '#premier p[itemprop=skills] b::text',
}

def _get_from_list__publication_datetime(self, job_node):
    # The list page exposes no usable date: fall back to crawl time.
    return datetime.now()

def _get_from_page__publication_datetime(self, job_node):
    date_text = self._extract_first(job_node, 'from_page__publication_datetime')
    if date_text:
        # NOTE(review): returns the raw <time datetime="..."> attribute
        # string rather than a parsed datetime object — confirm downstream
        # accepts that.
        return date_text
    # NOTE(review): falls back to the *list* datetime getter (crawl time),
    # not the page one — looks intentional but verify.
    return super(AlsaCreationsSpider, self)._get_from_list__publication_datetime(job_node)

# def _get_from_page__tags(self, job_node):
#     # TODO - 2016-02-18 - D.A. - Make tags import ok
#     # Use the standard tags methods to extract tags (according to base list
#     tags = self._extract_all(job_node, 'from_page__tags')
#     if tags:
#         return tags
#     return super(AlsaCreationsSpider, self)._get_from_page__tags(job_node)

# Do not forget this line (registers the spider as a crawlable job source)
source = JobSource.from_job_spider(AlsaCreationsSpider)