Пример #1
0
class KarieraJobOffersParser(BaseJobOfferParser):

    def __init__(self):
        super().__init__()

        self.logger = Logger(self.__class__.__name__)
        self.weburl = 'https://kariera.sk'

    def parse(self, url, verbose=False):

        # Get the parsed HTML content of the input URL
        parsed = self.content(url)

        # Get the job offers
        offers = self.offers(parsed)

        if verbose: pprint(offers)
        return offers

    def content(self, url):
        return self.parser.parse(url)

    def offers(self, parsed_html):

        # Validate the input data
        if not parsed_html:
            return None

        # Parse the HTML for job offers
        offers = parsed_html.find_all('div', class_='column2 offer-list-info')
        if not offers:
            self.logger.warning('No <div> tags with job offers found. Returning None')
            return None

        # Parse the job offers
        result = []
        for offer in offers:
            if offer.find('h2'):
                header = offer.find('h2').find('a', href=True, text=True)
                result.append(
                    {'url': header['href'],
                     'txt': header.text,
                     'emp': offer.find('a', class_='employer', href=True, text=True).text,
                     'loc': offer.find('span', class_='place').text}
                )

        return result
Пример #2
0
class ProfesiaJobOffersParser(BaseJobOfferParser):
    def __init__(self):
        super().__init__()

        self.logger = Logger(self.__class__.__name__)
        self.weburl = 'https://www.profesia.sk'

    def parse(self, url, verbose=False):

        # Get the parsed HTML content of the input URL
        parsed = self.content(url)

        # Get the job offers
        offers = self.offers(parsed)

        if verbose: pprint(offers)
        return offers

    def content(self, url):
        return self.parser.parse(url)

    def offers(self, parsed_html):

        # Validate the input data
        if not parsed_html:
            return None

        # Parse the HTML for job offers
        offers = parsed_html.find_all('li', class_='list-row')
        if not offers:
            self.logger.warning(
                'No <li> tags with job offers found. Returning None')
            return None

        # Parse the job offers
        result = []
        for offer in offers:
            if offer.find('h2'):
                header = offer.find('h2').find('a', href=True, text=True)
                result.append({
                    'url': '/'.join((self.weburl, header['href'])),
                    'txt': header.text,
                    'emp': offer.find('span', class_='employer'),
                    'loc': offer.find('span', class_='job-location')
                })

        return result
Пример #3
0
class BaseWebContentParser(object):

    HTML_PARSER_TYPE = ''
    HTML_PARSER_NAME = ''

    BASE_URL_STRING = ''
    REQUEST_TIMEOUT = 10
    REQUEST_HEADERS = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding':
        'none',
        'Accept-Language':
        'en-US,en;q=0.8',
        'Connection':
        'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 '
        'Safari/537.36'
    }

    def __init__(self):
        super().__init__()

        self.parser = None
        self.logger = Logger(self.__class__.__name__)

    def parse(self, url):
        return self.get_valid_parsed_object(
            self.get_valid_response(self.get_valid_url(url)))

    def get_valid_url(self, url=None):
        if not url:
            return None
        else:
            url = self.get_url(url)
            if not self.is_valid_url(url):
                self.logger.warning(
                    f'Invalid URL {url} encountered. Returning None')
                return None
            return url

    def get_valid_response(self, url=None):
        if not url:
            return None
        else:
            response = self.get_response(url)
            if not self.is_valid_response(response):
                self.logger.warning(
                    f'Status code {response.status_code}. Returning None')
                return None
            return response

    def get_valid_parsed_object(self, response):
        if not response or not self.is_valid_response(response):
            return None
        else:
            parsed_object = self.get_parsed_object(response)
            if not self.is_valid_parsed_object(parsed_object):
                self.logger.warning(
                    'HTML parser was not initialized. Returning None')
                return None
            return parsed_object

    def get_url(self, url):
        return str(url) if url else self.BASE_URL_STRING

    def get_response(self, url):
        return requests.get(url,
                            headers=self.REQUEST_HEADERS,
                            timeout=self.REQUEST_TIMEOUT)

    def get_parsed_object(self, response):
        return self.parser(response.content,
                           self.HTML_PARSER_TYPE) if self.parser else None

    @staticmethod
    def is_valid_url(url):
        return url and (validators.url(url) is True)

    @staticmethod
    def is_valid_response(response):
        return response.status_code == requests.codes.ok

    @staticmethod
    def is_valid_parsed_object(parsed_object):
        return parsed_object if parsed_object else False