Example #1
    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url + 'page/', f'{page}/')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
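
The pattern above builds WordPress-style page URLs by joining 'page/<n>/' onto the listing URL; the trailing-slash guard matters because urljoin drops the last path segment of a base URL that does not end in '/'. A minimal sketch of the same composition (build_page_url is an illustrative name, not part of the original code):

from urllib.parse import urljoin

def build_page_url(entry_url: str, page: int) -> str:
    # ensure a trailing slash so urljoin treats the last segment as a directory
    entry_url += '' if entry_url.endswith('/') else '/'
    return urljoin(entry_url + 'page/', f'{page}/')

print(build_page_url('https://example.com/jobs', 3))
# https://example.com/jobs/page/3/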
    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&p={page}'
                                    if '?' in entry_url else f'?p={page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
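
This variant appends '?p=<n>' or '&p=<n>' depending on whether the entry URL already carries a query string. That check is enough here, but it can duplicate an existing 'p' parameter; a sketch of a stricter version that rebuilds the query string (with_page_param is a hypothetical helper):

from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

def with_page_param(url: str, page: int, key: str = 'p') -> str:
    # replace (rather than append) the page parameter in the query string
    parts = urlparse(url)
    query = dict(parse_qsl(parts.query))
    query[key] = str(page)
    return urlunparse(parts._replace(query=urlencode(query)))

print(with_page_param('https://example.com/jobs?p=2', 3))
# https://example.com/jobs?p=3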
class JobberManProvider(AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobberman.com.gh'
    name = 'Jobber Man'

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
    def fetch(self, entry_url: str):
        print(entry_url)
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            print(job_link)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url.rsplit('/', 1)[0] + f'/page{page}'

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        scheme_host = urlparse(entry_url)
        scheme_host = scheme_host.scheme + '://' + scheme_host.netloc

        initial_page_links = list(self.get_jobs_list(entry_url))
        for job_link in initial_page_links:
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))

        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current_page_links = list(self.get_jobs_list(loop_url))
            if current_page_links == initial_page_links:
                break
            for job_link in current_page_links:
                job_path = urlparse(job_link).path
                job_link = urljoin(scheme_host, job_path)
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            initial_page_links = current_page_links
            page += 1
        return self.jobs
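
Unlike the length-based loops above, this variant also stops when a page returns exactly the same links as the previous one, which guards against sites that serve the last page again for out-of-range page numbers. The core loop can be distilled into a generator like the following (a sketch; paginate is a hypothetical name):

def paginate(self, entry_url):
    # yield each page's job links until a page is empty or repeats its predecessor
    previous, page = None, 1
    while True:
        url = entry_url if page == 1 else f'{entry_url}?{self.pagination}={page}'
        links = list(self.get_jobs_list(url))
        if not links or links == previous:
            return
        yield links
        previous, page = links, page + 1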
Example #6
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s" % (job_link, e))

        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            print(loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s" % (job_link, e))

            print("Scraped page %s" % page)
            page += 1

        return self.jobs
Example #7
class BrighterMondayProvider(AbstractTokenProvider):
    name = "Brighter Monday"
    pagination = 'page'
    host = [
        'brightermonday.co.ke', 'brightermonday.co.ug', 'brightermonday.co.tz'
    ]
    timezone = pytz.timezone("Africa/Nairobi")

    def __init__(self):
        self.jobs = JobsList()

    def fetch_page(self, page_url):
        buffer = []

        for job_link in self.get_jobs_list(page_url):
            try:
                buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))

        return buffer

    def fetch(self, entry_url: str) -> JobsList:
        page_buffer = self.fetch_page(entry_url)
        self.jobs = JobsList()
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            loop_url = f'{entry_url}?{self.pagination}={page}'
            page_buffer = self.fetch_page(loop_url)
            page += 1

        return self.jobs
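
Factoring the per-page scrape into fetch_page keeps the pagination loop to a few lines and removes the duplicated try/except blocks the other examples carry; the loop terminates naturally when fetch_page returns an empty list.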
Example #8
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(
                    AbstractTokenProvider.get_job(self, job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(
                        AbstractTokenProvider.get_job(self, job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
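
Calling AbstractTokenProvider.get_job(self, job_link) through the class rather than self.get_job(job_link) matters under multiple inheritance: providers such as CareerPointProvider below inherit from both AbstractHTMLProvider and AbstractTokenProvider, and the MRO would otherwise resolve get_job to the HTML variant. A minimal model of the same dispatch choice:

class A:
    def get(self):
        return 'html'

class B:
    def get(self):
        return 'token'

class C(A, B):
    def fetch(self):
        return B.get(self)  # bypass the MRO and force B's implementation

print(C().fetch())  # token  (self.get() would have returned 'html')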
Example #9
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s " % (job_link, e))

        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            prep_url = PreparedRequest()
            prep_url.prepare(url=entry_url, params={'page': page})
            next_page_url = prep_url.url

            for job_link in self.get_jobs_list(next_page_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s " % (job_link, e))

            print("Scraped page %s" % page)
            page += 1

        return self.jobs
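
PreparedRequest merges params into whatever query string the URL already carries, so it avoids the '?page=' vs '&page=' branching seen in other examples. An illustration (the example.com URL is a placeholder):

from requests.models import PreparedRequest

req = PreparedRequest()
req.prepare(url='https://example.com/jobs?q=python', params={'page': 2})
print(req.url)  # https://example.com/jobs?q=python&page=2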
class PigiaMeProvider(AbstractTokenProvider):
    name = "Pigiame"
    pagination = 'page'
    host = 'pigiame.co.ke'
    timezone = pytz.timezone('Africa/Nairobi')

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        scheme_host = urlparse(entry_url)
        scheme_host = scheme_host.scheme + '://' + scheme_host.netloc

        initial_page_links = [
            job_link for job_link in self.get_jobs_list(entry_url)
        ]
        for job_link in initial_page_links:
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))

        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current_page_links = [
                job_link for job_link in self.get_jobs_list(loop_url)
            ]
            if current_page_links == initial_page_links:
                break
            for job_link in current_page_links:
                job_path = urlparse(job_link).path
                job_link = urljoin(scheme_host, job_path)
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            initial_page_links = current_page_links
            page += 1
        return self.jobs

    def post_process(self, job):
        # you can do field standardization here
        # PigiaMe gives dates in the format '2018-12-11'; we need a timestamp
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        posted = datetime.strptime(job["valid_through"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["valid_through"] = str(posted)
        return job
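
The localize() call is the important detail here: constructing datetime(..., tzinfo=pytz.timezone('Africa/Nairobi')) would attach the zone's historical LMT offset, while localize() applies the correct current offset. For example:

from datetime import datetime
import pytz

tz = pytz.timezone('Africa/Nairobi')
naive = datetime.strptime('2018-12-11', '%Y-%m-%d')
print(tz.localize(naive))  # 2018-12-11 00:00:00+03:00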
Example #11
    def fetch(self, entry_url: str) -> JobsList:
        page_buffer = self.fetch_page(entry_url)
        self.jobs = JobsList()
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            loop_url = f'{entry_url}?{self.pagination}={page}'
            page_buffer = self.fetch_page(loop_url)
            page += 1

        return self.jobs
class FuzuProvider(AbstractTokenProvider):
    name = "Fuzu"
    pagination = 'page'
    host = 'fuzu.com'
    timezone = pytz.timezone("Africa/Nairobi")

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        scheme_host = urlparse(entry_url)
        scheme_host = scheme_host.scheme + '://' + scheme_host.netloc

        initial_page_links = list(self.get_jobs_list(entry_url))
        for job_link in initial_page_links:
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))

        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current_page_links = list(self.get_jobs_list(loop_url))
            if current_page_links == initial_page_links:
                break
            for job_link in current_page_links:
                job_path = urlparse(job_link).path
                job_link = urljoin(scheme_host, job_path)
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            initial_page_links = current_page_links
            page += 1
        return self.jobs

    def post_process(self, job):
        # I am in charge of standardizing fields across job objects
        # Fuzu's valid_through timestamp is fine, but date_posted is not
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        return job
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        next_url = entry_url
        while next_url:
            content = session.get(next_url, headers=self.headers).content
            for job_url in self.get_urls_from_content(content):
                print(job_url)
                try:
                    self.jobs.append(self.get_job(job_url))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_url, e))
            urls = fromstring(content.decode()).xpath(self.pagination_xpath)
            if urls:
                next_url_element, = urls
                next_url = next_url_element.attrib['href']
                next_url = urljoin(entry_url, next_url)
            else:
                next_url = None
        return self.jobs
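
This fetch (repeated in GlassDoorProvider below) paginates by following the page's own 'next' link instead of constructing URLs, so it works regardless of the site's URL scheme. A self-contained sketch of the same idea, with follow_next as an illustrative name:

from urllib.parse import urljoin
from lxml.html import fromstring
import requests

def follow_next(entry_url, xpath="//li[@class='next']//a"):
    # yield each page URL, following the single 'next' anchor until it disappears
    next_url = entry_url
    while next_url:
        html = requests.get(next_url).text
        yield next_url
        links = fromstring(html).xpath(xpath)
        next_url = urljoin(entry_url, links[0].attrib['href']) if links else None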
Example #14
class CareerPointProvider(AbstractHTMLProvider, AbstractTokenProvider):
    name = "Career Point"
    pagination = 'page'
    host = 'careerpointkenya.co.ke'
    properties = None
    urls_xpath = '//header/h2/a'
    timezone = pytz.timezone("Africa/Nairobi")

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(
                    AbstractTokenProvider.get_job(self, job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(
                        AbstractTokenProvider.get_job(self, job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs

    def post_process(self, job):
        # I am in charge of standardizing fields across job objects
        # Career Point's date_posted is a bare '%Y-%m-%d' date; we need a timestamp
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        return job
class GlassDoorProvider(AbstractTokenProvider):
    name = "Glassdoor"
    pagination_xpath = "//li[@class='next']//a"
    host = 'glassdoor.com'
    timezone = pytz.timezone('Africa/Nairobi')
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"
                      " Ubuntu Chromium/70.0.3538.77 Chrome/70.0.3538.77 Safari/537.36"
    }

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        next_url = entry_url
        while next_url:
            content = session.get(next_url, headers=self.headers).content
            for job_url in self.get_urls_from_content(content):
                print(job_url)
                try:
                    self.jobs.append(self.get_job(job_url))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_url, e))
            urls = fromstring(content.decode()).xpath(self.pagination_xpath)
            if urls:
                next_url_element, = urls
                next_url = next_url_element.attrib['href']
                next_url = urljoin(entry_url, next_url)
            else:
                next_url = None
        return self.jobs

    def post_process(self, job):
        # you can do field standardization here
        # Glassdoor gives date in this format '2018-12-11', we need a timestamp
        posted = datetime.strptime(job["date_posted"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        posted = datetime.strptime(job["valid_through"], "%Y-%m-%d")
        posted = self.timezone.localize(posted)
        job["valid_through"] = str(posted)
        return job
Example #16
class RwandaJobProvider(AbstractHTMLProvider, AbstractTokenProvider):
    name = "RwandaJob"
    host = 'rwandajob.com'
    urls_xpath = "//div[@class='job-search-result  ']/div/div/h5/a"
    properties = {}

    def get_job(self, job_link):
        return AbstractTokenProvider.get_job(self, job_link)

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s " % (job_link, e))

        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            prep_url = PreparedRequest()
            prep_url.prepare(url=entry_url, params={'page': page})
            next_page_url = prep_url.url

            for job_link in self.get_jobs_list(next_page_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s " % (job_link, e))

            print("Scraped page %s" % page)
            page += 1

        return self.jobs
class JobMailProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobmail.co.za'
    name = 'Job Mail'
    urls_xpath = "//a[contains(@class, 'btnView')]"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        print(entry_url)
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            print(job_link)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url.rsplit('/', 1)[0] + f'/page{page}'

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
Example #18
class JobWebProvider(AbstractHTMLProvider, AbstractTokenProvider):
    name = "Job Web"
    host = ['jobwebkenya.com', 'jobwebghana.com', 'jobwebrwanda.com']
    urls_xpath = '//ol/li/div[2]/strong/a'
    properties = {}

    def get_job(self, job_link):
        return AbstractTokenProvider.get_job(self, job_link)

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s" % (job_link, e))

        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            print(loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s" % (job_link, e))

            print("Scraped page %s" % page)
            page += 1

        return self.jobs
class BestJobsProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'bestjobs.co.za'
    name = 'Best Jobs'
    urls_xpath = "//a[@class='js-o-link']"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&p={page}'
                                    if '?' in entry_url else f'?p={page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
Example #20
class EmergeProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'e-merge.co.za'
    name = 'Emerge'
    urls_xpath = "//div[contains(@class, 'et_pb_text_inner')]//a[@class='wpjb-job_title wpjb-title']"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url + 'page/', f'{page}/')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
class GigaJobProvider(AbstractHTMLProvider, AbstractTokenProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'en-za.gigajob.com'
    name = 'Giga Job'
    urls_xpath = "//section/div/div/h3/a"

    def __init__(self):
        self.jobs = JobsList()

    def get_job(self, job_url: str):
        return AbstractTokenProvider.get_job(self, job_url)

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            print("Loop url", loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
class Careers24Provider(AbstractHTMLProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'careers24.com'
    name = 'Careers 24'
    urls_xpath = "//a[@data-trigger='jobalertmodal']"
    properties = {
        'job_title': "//meta[@property='og:title']/@content",
        'hiring_organization': "//span[@class='posted']/span/span",
        'city': "//a[@id='ctl00_contentPrimaryPlaceHolder_NewJobDetail_NewJobSummary_hlLocation']",
        'employment_type': "//ul[@class='job-detail-summary']/li[6]/span/span",
        'date_posted': "//span[@class='posted']/text()",
        'valid_through': "//span[contains(text(), 'Apply before')]/span[1]",

        'description': "//div[@class='job_detail_container']",

        'instructions': "//div[@id='ctl00_contentPrimaryPlaceHolder__ctrl_0_divCandReq']",
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": "//span[contains(text(),'Sectors:')]/following-sibling::a",
        "skills": "//div[@id='ctl00_contentPrimaryPlaceHolder__ctrl_0_divCandSkills']",
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None,
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&page={page}' if '?' in entry_url else f'?page={page}')
            print("Next page", loop_url)

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs

    def post_process(self, job_dict):
        # parse the posted and apply-before dates into timezone-aware timestamps
        try:
            posted = datetime.strptime(job_dict['date_posted'], 'on %A, %B %d, %Y')
        except ValueError:
            posted = datetime.strptime(job_dict['date_posted'], 'Posted on %A, %B %d, %Y')
        posted = self.timezone.localize(posted)
        job_dict['date_posted'] = str(posted)
        valid_through = datetime.strptime(job_dict['valid_through'], '%A, %B %d, %Y')
        valid_through = self.timezone.localize(valid_through)
        job_dict['valid_through'] = str(valid_through)
        job_dict['url'] = job_dict['url'].split('?')[0]
        return job_dict
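
The try/except above handles two variants of the posted string ('on Monday, ...' and 'Posted on Monday, ...'). With more than two formats, a small helper keeps this readable (parse_any is a hypothetical name):

from datetime import datetime

def parse_any(text, formats):
    # return the first successful strptime parse, else raise
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError(f'no format matched {text!r}')

posted = parse_any('Posted on Monday, December 3, 2018',
                   ['on %A, %B %d, %Y', 'Posted on %A, %B %d, %Y'])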
class IHubProvider(AbstractHTMLProvider):
    name = "Ihub"
    host = 'ihub.co.ke'
    urls_xpath = '//h3/a'
    timezone = pytz.timezone('Africa/Nairobi')
    properties = {
        'job_title':
        "//div[@class='container-fluid job-article-header']/div/h1",
        'hiring_organization':
        "//div[@class='container-fluid job-article-header']/div[1]/ul/li/a[1]",
        'city': "//div[@class='city-location']",
        'employment_type':
        "//div[@class='container-fluid job-article-header']/div[1]/ul/li/a[2]",
        'date_posted':
        "//div[@class='container-fluid job-article-header']/div[1]/ul/li/span[1]",
        'valid_through':
        "//div[@class='container-fluid job-article-header']/div[1]/ul/li/span[2]",
        'description': "//div[@class='vacancy-description']",
        'instructions': "//div[@class='how-to-apply']",
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": None,
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "instructions": None,
        "source": None,
    }

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'{page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))

            page += 1

        return self.jobs

    def post_process(self, job):
        # you can do field standardization here
        # Ihub gives date in this format '01 Dec, 2018', we need a timestamp
        posted = datetime.strptime(job["date_posted"], "%d %b, %Y")
        posted = self.timezone.localize(posted)
        job["date_posted"] = str(posted)
        posted = datetime.strptime(job["valid_through"], "%d %b, %Y")
        posted = self.timezone.localize(posted)
        job["valid_through"] = str(posted)
        job["city"], job["country"] = job["city"].rsplit(", ")[-2:]
        job["country"] = country_mapping.get(job["country"], job["country"])
        job.update({
            "value_period": None,
            "education_requirements": None,
            "qualifications": None,
            "experience_requirement": None,
            "industry": None,
            "skills": None,
            "responsibilities": None,
            "value_currency": None,
            "min_value": None,
            "max_value": None,
        })
        return job
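
The rsplit(", ")[-2:] at the end takes the last two comma-separated tokens of the location string, so a value like 'Westlands, Nairobi, Kenya' yields city 'Nairobi' and country 'Kenya' (a single-token value would make the unpacking fail):

parts = 'Westlands, Nairobi, Kenya'.rsplit(', ')[-2:]
print(parts)  # ['Nairobi', 'Kenya'] -> city, country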
    def __init__(self):
        self.jobs = JobsList()
Example #25
class PNetProvider(AbstractHTMLProvider):
    host = 'pnet.co.za'
    name = 'P Net'
    urls_xpath = "//a[contains(@class, 'job-element__url')]"
    properties = {
        'job_title': "//h1[contains(@class, 'listing__job-title')]",
        'hiring_organization': "//h6[contains(@class, 'listing__company-name')]",
        'city': "//li[contains(@class, 'at-listing__list-icons_location')]/a/span[2]",

        'employment_type': "//li[contains(@class, 'at-listing__list-icons_work-type')]/text()[2]",
        'date_posted': "//span[contains(@class, 'date-time-ago')]/@data-date",
        'valid_through': None,

        'description': "//main[contains(@class, 'offer__content')][section]",

        'instructions': None,
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": None,
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None,
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            offset = page * 100
            loop_url = entry_url + (f'&of={offset}' if '?' in entry_url else f'?of={offset}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs

    def post_process(self, job_dict):
        # normalize the posted date string into a timestamp
        posted = parse(job_dict['date_posted'])
        job_dict['date_posted'] = str(posted)
        return job_dict
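
parse here is presumably dateutil.parser.parse, which handles the ISO-style timestamps a data-date attribute typically carries without an explicit format string:

from dateutil.parser import parse

print(parse('2019-09-16T08:30:00+02:00'))  # 2019-09-16 08:30:00+02:00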
Example #26
class JobVineProvider(AbstractHTMLProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobvine.co.za'
    name = 'Job Vine'
    urls_xpath = "//p[contains(@class, 'job-title')]/a"
    properties = {
        'job_title': "//h1[@class='job-title']",
        'hiring_organization':
        "//span[text()='Recruiter:']/following-sibling::strong",
        'city': "//span[text()='Location:']/following-sibling::strong",
        'employment_type':
        "//span[text()='Job Type:']/following-sibling::strong",
        'date_posted':
        "//span[text()='Date added:']/following-sibling::strong",
        'valid_through': None,
        'description':
        "//div[@class='job-item premium']/following-sibling::div[1]",
        'instructions': None,
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement": None,
        "industry": None,
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&page={page}'
                                    if '?' in entry_url else f'?page={page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs

    def post_process(self, job_dict):
        # parse date_posted (e.g. '16 September 2019') into a timezone-aware timestamp
        posted = datetime.strptime(job_dict['date_posted'], '%d %B %Y')
        posted = self.timezone.localize(posted)
        job_dict['date_posted'] = str(posted)
        return job_dict
class JobsInGhanaProvider(AbstractHTMLProvider):
    timezone = pytz.timezone('Africa/Nairobi')
    host = 'jobsinghana.com'
    name = 'Jobs Ghana'
    urls_xpath = "//a[@property='title']"
    properties = {
        'job_title': "//div[@class='jobdetailtitle']/h3",
        'hiring_organization':
        "//td[text()='Company']/following-sibling::td/@title",
        'city': "//td[text()='Location']/following-sibling::td/@title",
        'employment_type':
        "//td[text()='Job Status']/following-sibling::td/@title",
        'date_posted': None,
        'valid_through':
        "//td[text()='Job Expires']/following-sibling::td/@title",
        'description': "//td[@class='job_desc']",
        'instructions': None,
        "country": None,
        "education_requirements": None,
        "qualifications": None,
        "experience_requirement":
        "//td[text()='Experience']/following-sibling::td/@title",
        "industry": "//td[text()='Industry']/following-sibling::td/@title",
        "skills": None,
        "responsibilities": None,
        "value_currency": None,
        "min_value": None,
        "max_value": None,
        "url": None,
        "value_period": None,
        "source": None
    }

    def __init__(self):
        self.jobs = JobsList()

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 1
        while len(page_buffer) > 0:
            self.jobs.extend(page_buffer)
            page_buffer = []

            loop_url = entry_url + (f'&page={page}'
                                    if '?' in entry_url else f'?page={page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs

    def post_process(self, job_dict):
        # parse valid_through (e.g. 'Sep 16, 2019') into a timezone-aware timestamp
        posted = datetime.strptime(job_dict['valid_through'], '%b %d, %Y')
        posted = self.timezone.localize(posted)
        job_dict['valid_through'] = str(posted)
        return job_dict
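
The '%b %d, %Y' format string corresponds to the 'Sep 16, 2019' style mentioned in the comment:

from datetime import datetime

print(datetime.strptime('Sep 16, 2019', '%b %d, %Y'))  # 2019-09-16 00:00:00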