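# Shared imports for the provider classes below. The standard-library and
# third-party imports are exactly what the code uses; the final import is an
# assumption about where this project keeps its own helpers (JobsList, the
# Abstract*Provider base classes, the shared requests `session`, and
# `country_mapping`) -- the real module path in the repo may differ.
from datetime import datetime
from urllib.parse import urljoin, urlparse

import pytz
from dateutil.parser import parse
from lxml.html import fromstring
from requests.models import PreparedRequest

from .base import (AbstractHTMLProvider, AbstractTokenProvider,  # assumed path
                   JobsList, country_mapping, session)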
class JobberManProvider(AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobberman.com.gh'
    name = 'Jobber Man'

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs
class BrighterMondayProvider(AbstractTokenProvider):
    name = "Brighter Monday"
    pagination = 'page'
    host = [
        'brightermonday.co.ke',
        'brightermonday.co.ug',
        'brightermonday.co.tz'
    ]
    timezone = pytz.timezone("Africa/Nairobi")

    def __init__(self):
        self.jobs = JobsList()

    def fetch_page(self, page_url):
        buffer = []
        for job_link in self.get_jobs_list(page_url):
            try:
                buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        return buffer

    def fetch(self, entry_url: str) -> JobsList:
        page_buffer = self.fetch_page(entry_url)
        self.jobs = JobsList()
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            loop_url = f'{entry_url}?{self.pagination}={page}'
            page_buffer = self.fetch_page(loop_url)
            page += 1
        return self.jobs
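# A minimal sketch of how one of these providers might be driven, assuming
# JobsList behaves like a list; the entry URL below is a placeholder for
# illustration, not one taken from this repo.
if __name__ == '__main__':
    provider = BrighterMondayProvider()
    jobs = provider.fetch('https://www.brightermonday.co.ke/jobs')  # hypothetical entry URL
    print("Fetched %d jobs" % len(jobs))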
class PigiaMeProvider(AbstractTokenProvider):
    name = "Pigiame"
    pagination = 'page'
    host = 'pigiame.co.ke'
    timezone = pytz.timezone('Africa/Nairobi')

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        scheme_host = urlparse(entry_url)
        scheme_host = scheme_host.scheme + '://' + scheme_host.netloc
        initial_page_links = list(self.get_jobs_list(entry_url))
        for job_link in initial_page_links:
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current_page_links = list(self.get_jobs_list(loop_url))
            # The site keeps serving the same listing for out-of-range page
            # numbers, so stop when a page repeats the previous one.
            if current_page_links == initial_page_links:
                break
            for job_link in current_page_links:
                job_path = urlparse(job_link).path
                job_link = urljoin(scheme_host, job_path)
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            initial_page_links = current_page_links
            page += 1
        return self.jobs

    def post_process(self, job):
        # Standardize fields: dates arrive as '2018-12-11' and need to become
        # timezone-aware timestamps.
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        posted = datetime.strptime(job["valid_through"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["valid_through"] = str(posted)
        return job
class FuzuProvider(AbstractTokenProvider):
    name = "Fuzu"
    pagination = 'page'
    host = 'fuzu.com'
    timezone = pytz.timezone("Africa/Nairobi")

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        scheme_host = urlparse(entry_url)
        scheme_host = scheme_host.scheme + '://' + scheme_host.netloc
        initial_page_links = list(self.get_jobs_list(entry_url))
        for job_link in initial_page_links:
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current_page_links = list(self.get_jobs_list(loop_url))
            # Stop when the site serves the same listing again for an
            # out-of-range page number.
            if current_page_links == initial_page_links:
                break
            for job_link in current_page_links:
                job_path = urlparse(job_link).path
                job_link = urljoin(scheme_host, job_path)
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            initial_page_links = current_page_links
            page += 1
        return self.jobs

    def post_process(self, job):
        # Standardize fields across job objects: Fuzu's valid_through
        # timestamp is already usable, but date_posted is not.
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        return job
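# PigiaMeProvider and FuzuProvider share the fetch() above verbatim. A sketch
# of what extracting the duplicate-page stop condition could look like,
# assuming subclasses provide get_jobs_list() and a `pagination` attribute;
# no such mixin exists in the repo.
class DuplicatePageStopMixin:
    def iter_pages(self, entry_url):
        # Yield each page's list of job links until the site starts
        # repeating itself for out-of-range page numbers.
        previous = list(self.get_jobs_list(entry_url))
        yield previous
        page = 2
        while True:
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current = list(self.get_jobs_list(loop_url))
            if current == previous:
                return
            yield current
            previous = current
            page += 1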
class CareerPointProvider(AbstractHTMLProvider, AbstractTokenProvider):
    name = "Career Point"
    pagination = 'page'
    host = 'careerpointkenya.co.ke'
    properties = None
    urls_xpath = '//header/h2/a'
    timezone = pytz.timezone("Africa/Nairobi")

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(
                    AbstractTokenProvider.get_job(self, job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(
                        AbstractTokenProvider.get_job(self, job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs

    def post_process(self, job):
        # Standardize fields across job objects: date_posted arrives as
        # '2018-12-11' and needs to become a timezone-aware timestamp.
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        return job
class GlassDoorProvider(AbstractTokenProvider):
    name = "Glassdoor"
    pagination_xpath = "//li[@class='next']//a"
    host = 'glassdoor.com'
    timezone = pytz.timezone('Africa/Nairobi')
    headers = {
        "User-Agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"
            " Ubuntu Chromium/70.0.3538.77 Chrome/70.0.3538.77 Safari/537.36"
    }

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        next_url = entry_url
        while next_url:
            content = session.get(next_url, headers=self.headers).content
            for job_url in self.get_urls_from_content(content):
                print(job_url)
                try:
                    self.jobs.append(self.get_job(job_url))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_url, e))
            urls = fromstring(content.decode()).xpath(self.pagination_xpath)
            if urls:
                next_url_element, = urls
                next_url = next_url_element.attrib['href']
                next_url = urljoin(entry_url, next_url)
            else:
                next_url = None
        return self.jobs

    def post_process(self, job):
        # you can do field standardization here
        # Glassdoor gives date in this format '2018-12-11', we need a timestamp
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        posted = datetime.strptime(job["valid_through"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["valid_through"] = str(posted)
        return job
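# A quick illustration of the conversion post_process performs (pytz's
# Africa/Nairobi is UTC+03:00 with no DST):
#
#   >>> tz = pytz.timezone('Africa/Nairobi')
#   >>> str(tz.localize(datetime.strptime('2018-12-11', '%Y-%m-%d')))
#   '2018-12-11 00:00:00+03:00'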
class RwandaJobProvider(AbstractHTMLProvider, AbstractTokenProvider):
    name = "RwandaJob"
    host = 'rwandajob.com'
    urls_xpath = "//div[@class='job-search-result ']/div/div/h5/a"
    properties = {}

    def get_job(self, job_link):
        return AbstractTokenProvider.get_job(self, job_link)

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s " % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            prep_url = PreparedRequest()
            prep_url.prepare(url=entry_url, params={'page': page})
            next_page_url = prep_url.url
            for job_link in self.get_jobs_list(next_page_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s " % (job_link, e))
            print("Scraped page %s" % page)
            page += 1
        return self.jobs
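# PreparedRequest is used above purely as a URL builder: it merges `params`
# into whatever query string entry_url already carries, so this provider
# avoids the '?' / '&' branching seen in the others. Illustrative URL:
#
#   >>> prep = PreparedRequest()
#   >>> prep.prepare(url='https://www.rwandajob.com/job-vacancies-search', params={'page': 2})
#   >>> prep.url
#   'https://www.rwandajob.com/job-vacancies-search?page=2'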
class JobMailProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobmail.co.za'
    name = 'Job Mail'
    urls_xpath = "//a[contains(@class, 'btnView')]"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        print(entry_url)
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            print(job_link)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url.rsplit('/', 1)[0] + f'/page{page}'
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs
class JobWebProvider(AbstractHTMLProvider, AbstractTokenProvider):
    name = "Job Web"
    host = ['jobwebkenya.com', 'jobwebghana.com', 'jobwebrwanda.com']
    urls_xpath = '//ol/li/div[2]/strong/a'
    properties = {}

    def get_job(self, job_link):
        return AbstractTokenProvider.get_job(self, job_link)

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            print(loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s" % (job_link, e))
            print("Scraped page %s" % page)
            page += 1
        return self.jobs
class BestJobsProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'bestjobs.co.za'
    name = 'Best Jobs'
    urls_xpath = "//a[@class='js-o-link']"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url + (f'&p={page}' if '?' in entry_url else f'?p={page}')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs
class EmergeProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'e-merge.co.za'
    name = 'Emerge'
    urls_xpath = "//div[contains(@class, 'et_pb_text_inner')]//a[@class='wpjb-job_title wpjb-title']"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url + 'page/', f'{page}/')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs
class GigaJobProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'en-za.gigajob.com'
    name = 'Giga Job'
    urls_xpath = "//section/div/div/h3/a"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            print("Loop url", loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs
class Careers24Provider(AbstractHTMLProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'careers24.com'
    name = 'Careers 24'
    urls_xpath = "//a[@data-trigger='jobalertmodal']"
    properties = {
        'job_title': "//meta[@property='og:title']/@content",
        'hiring_organization': "//span[@class='posted']/span/span",
        'city': "//a[@id='ctl00_contentPrimaryPlaceHolder_NewJobDetail_NewJobSummary_hlLocation']",
        'employment_type': "//ul[@class='job-detail-summary']/li[6]/span/span",
        'date_posted': "//span[@class='posted']/text()",
        'valid_through': "//span[contains(text(), 'Apply before')]/span[1]",
        'description': "//div[@class='job_detail_container']",
        'instructions': "//div[@id='ctl00_contentPrimaryPlaceHolder__ctrl_0_divCandReq']",
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": "//span[contains(text(),'Sectors:')]/following-sibling::a",
        "skills": "//div[@id='ctl00_contentPrimaryPlaceHolder__ctrl_0_divCandSkills']",
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None,
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            print("Next page", loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs

    def post_process(self, job_dict):
        # Careers24 renders date_posted either as 'on <date>' or as
        # 'Posted on <date>'; try both formats.
        try:
            posted = datetime.strptime(job_dict['date_posted'], 'on %A, %B %d, %Y')
        except ValueError:
            posted = datetime.strptime(job_dict['date_posted'], 'Posted on %A, %B %d, %Y')
        posted = self.timezone.localize(posted)
        job_dict['date_posted'] = str(posted)
        valid_through = datetime.strptime(job_dict['valid_through'], '%A, %B %d, %Y')
        valid_through = self.timezone.localize(valid_through)
        job_dict['valid_through'] = str(valid_through)
        # Drop the query string from the job URL.
        job_dict['url'] = job_dict['url'].split('?')[0]
        return job_dict
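# The try/except in post_process accepts both date strings the site renders,
# e.g.:
#
#   >>> datetime.strptime('on Tuesday, December 11, 2018', 'on %A, %B %d, %Y')
#   datetime.datetime(2018, 12, 11, 0, 0)
#   >>> datetime.strptime('Posted on Tuesday, December 11, 2018', 'Posted on %A, %B %d, %Y')
#   datetime.datetime(2018, 12, 11, 0, 0)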
class IHubProvider(AbstractHTMLProvider):
    name = "Ihub"
    host = 'ihub.co.ke'
    urls_xpath = '//h3/a'
    timezone = pytz.timezone('Africa/Nairobi')
    properties = {
        'job_title': "//div[@class='container-fluid job-article-header']/div/h1",
        'hiring_organization': "//div[@class='container-fluid job-article-header']/div[1]/ul/li/a[1]",
        'city': "//div[@class='city-location']",
        'employment_type': "//div[@class='container-fluid job-article-header']/div[1]/ul/li/a[2]",
        'date_posted': "//div[@class='container-fluid job-article-header']/div[1]/ul/li/span[1]",
        'valid_through': "//div[@class='container-fluid job-article-header']/div[1]/ul/li/span[2]",
        'description': "//div[@class='vacancy-description']",
        'instructions': "//div[@class='how-to-apply']",
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": None,
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None,
    }

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url + '/', f'{page}')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs

    def post_process(self, job):
        # you can do field standardization here
        # Ihub gives date in this format '01 Dec, 2018', we need a timestamp
        posted = datetime.strptime(job["date_posted"], "%d %b, %Y")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        posted = datetime.strptime(job["valid_through"], "%d %b, %Y")
        posted = self.timezone.localize(posted)
        job["valid_through"] = str(posted)
        job["city"], job["country"] = job["city"].rsplit(", ")[-2:]
        job["country"] = country_mapping.get(job["country"], job["country"])
        job.update({
            "value_period": None,
            "education_requirements": None,
            "qualifications": None,
            "experience_requirement": None,
            "industry": None,
            "skills": None,
            "responsibilities": None,
            "value_currency": None,
            "min_value": None,
            "max_value": None,
        })
        return job
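# post_process splits Ihub's combined location string into city and country,
# then maps the country name through the project's country_mapping dict.
# With illustrative values:
#
#   >>> "Nairobi, Kenya".rsplit(", ")[-2:]
#   ['Nairobi', 'Kenya']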
class PNetProvider(AbstractHTMLProvider):
    host = 'pnet.co.za'
    name = 'P Net'
    urls_xpath = "//a[contains(@class, 'job-element__url')]"
    properties = {
        'job_title': "//h1[contains(@class, 'listing__job-title')]",
        'hiring_organization': "//h6[contains(@class, 'listing__company-name')]",
        'city': "//li[contains(@class, 'at-listing__list-icons_location')]/a/span[2]",
        'employment_type': "//li[contains(@class, 'at-listing__list-icons_work-type')]/text()[2]",
        'date_posted': "//span[contains(@class, 'date-time-ago')]/@data-date",
        'valid_through': None,
        'description': "//main[contains(@class, 'offer__content')][section]",
        'instructions': None,
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": None,
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None,
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            # PNet paginates by result offset, 100 results per page.
            offset = page * 100
            loop_url = entry_url + (f'&of={offset}' if '?' in entry_url else f'?of={offset}')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs

    def post_process(self, job_dict):
        # date_posted comes from the @data-date attribute; dateutil parses it
        # directly into a timestamp.
        posted = parse(job_dict['date_posted'])
        job_dict['date_posted'] = str(posted)
        return job_dict
class JobVineProvider(AbstractHTMLProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobvine.co.za'
    name = 'Job Vine'
    urls_xpath = "//p[contains(@class, 'job-title')]/a"
    properties = {
        'job_title': "//h1[@class='job-title']",
        'hiring_organization': "//span[text()='Recruiter:']/following-sibling::strong",
        'city': "//span[text()='Location:']/following-sibling::strong",
        'employment_type': "//span[text()='Job Type:']/following-sibling::strong",
        'date_posted': "//span[text()='Date added:']/following-sibling::strong",
        'valid_through': None,
        'description': "//div[@class='job-item premium']/following-sibling::div[1]",
        'instructions': None,
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": None,
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs

    def post_process(self, job_dict):
        # date_posted arrives like '16 September 2019'; localize it.
        posted = datetime.strptime(job_dict['date_posted'], '%d %B %Y')
        posted = self.timezone.localize(posted)
        job_dict['date_posted'] = str(posted)
        return job_dict
class JobsInGhanaProvider(AbstractHTMLProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobsinghana.com'
    name = 'Jobs Ghana'
    urls_xpath = "//a[@property='title']"
    properties = {
        'job_title': "//div[@class='jobdetailtitle']/h3",
        'hiring_organization': "//td[text()='Company']/following-sibling::td/@title",
        'city': "//td[text()='Location']/following-sibling::td/@title",
        'employment_type': "//td[text()='Job Status']/following-sibling::td/@title",
        'date_posted': None,
        'valid_through': "//td[text()='Job Expires']/following-sibling::td/@title",
        'description': "//td[@class='job_desc']",
        'instructions': None,
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": "//td[text()='Experience']/following-sibling::td/@title",
        "industry": "//td[text()='Industry']/following-sibling::td/@title",
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []
        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1
        return self.jobs

    def post_process(self, job_dict):
        # valid_through arrives like 'Sep 16, 2019'; localize it.
        posted = datetime.strptime(job_dict['valid_through'], '%b %d, %Y')
        posted = self.timezone.localize(posted)
        job_dict['valid_through'] = str(posted)
        return job_dict