Пример #1
0
class ResponseCacheKeyCache(object):
    #
    # The memory impact of having a large number of items in this cache is
    # really low, both the keys and the values are short strings (the result of
    # quick_hash)
    #
    MAX_SIZE = 2000

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.MAX_SIZE)

    def get_response_cache_key(self,
                               http_response,
                               clean_response=None,
                               headers=None):

        # When the clean response is available, use that body to calculate the
        # cache key. It has been cleaned (removed request paths and QS parameters)
        # so it has a higher chance of being equal to other responses / being
        # already in the cache
        if clean_response is not None:
            body = clean_response.body
        else:
            body = http_response.body

        cache_key = '%s%s' % (smart_str_ignore(body), headers)
        cache_key = quick_hash(cache_key)

        result = self._cache.get(cache_key, None)

        if result is not None:
            return result

        result = get_response_cache_key(http_response,
                                        clean_response=clean_response,
                                        headers=headers)

        self._cache[cache_key] = result
        return result

    def clear_cache(self):
        self._cache.clear()
Пример #2
0
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[can_parse] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response never again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException, e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
Пример #3
0
class ParserCache(object):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    LRU_LENGTH = 40
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    PARSER_TIMEOUT = 60 # in seconds
    DEBUG = False
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.LRU_LENGTH)
        self._pool = None
        self._processes = None
        self._parser_finished_events = {}
        self._start_lock = threading.RLock()

        # These are here for debugging:
        self._archive = set()
        self._from_LRU = 0.0
        self._calculated_more_than_once = 0.0
        self._total = 0.0

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # Keep track of which pid is processing which http response
                # pylint: disable=E1101
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=25,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None
            self._processes = None

        # We don't need this data anymore
        self._cache.clear()

        if self.DEBUG:
            re_calc_rate = (self._calculated_more_than_once / self._total)
            print('parser_cache LRU rate: %s' % (self._from_LRU / self._total))
            print('parser_cache re-calculation rate: %s' % re_calc_rate)
            print('parser_cache size: %s' % self.LRU_LENGTH)

    def get_cache_key(self, http_response):
        """
        Before I used md5, but I realized that it was unnecessary. I
        experimented a little bit with python's hash functions and the builtin
        hash was the fastest.

        At first I thought that the built-in hash wasn't good enough, as it
        could create collisions... but... given that the LRU has only 40
        positions, the real probability of a collision is too low.

        :return: The key to be used in the cache for storing this http_response
        """
        # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the
        #       body before hashing
        uri_str = http_response.get_uri().url_string.encode('utf-8')

        body_str = http_response.body
        if isinstance(body_str, unicode):
            body_str = body_str.encode('utf-8', 'replace')

        _to_hash = body_str + uri_str

        # Added adler32 after finding some hash() collisions in builds
        hash_string = str(hash(_to_hash))
        hash_string += str(zlib.adler32(_to_hash))
        return hash_string

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def _test_parse_http_response(self, http_response, *args):
        """
        Left here for testing!
        """
        return DocumentParser(http_response)

    def _parse_http_response_in_worker(self, http_response, hash_string):
        """
        This parses the http_response in a pool worker. This has two features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :return: The DocumentParser instance
        """
        event = multiprocessing.Event()
        self._parser_finished_events[hash_string] = event

        # Start the worker processes if needed
        self.start_workers()

        apply_args = (ProcessDocumentParser,
                      http_response,
                      self._processes,
                      hash_string)

        # Push the task to the workers
        result = self._pool.apply_async(apply_with_return_error, (apply_args,))

        try:
            parser_output = result.get(timeout=self.PARSER_TIMEOUT)
        except multiprocessing.TimeoutError:
            # Near the timeout error, so we make sure that the pid is still
            # running our "buggy" input
            pid = self._processes.pop(hash_string, None)
            if pid is not None:
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError, ose:
                    msg = 'An error occurred while killing the parser' \
                          ' process: "%s"'
                    om.out.debug(msg % ose)

            msg = '[timeout] The parser took more than %s seconds'\
                  ' to complete parsing of "%s", killed it!'

            om.out.debug(msg % (self.PARSER_TIMEOUT,
                                http_response.get_url()))

            # Act just like when there is no parser
            msg = 'There is no parser for "%s".' % http_response.get_url()
            raise BaseFrameworkException(msg)
        else:
Пример #4
0
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            try:
                parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser
Пример #5
0
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[can_parse] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response never again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            try:
                parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException, e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
Пример #6
0
class ParserCache(object):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    LRU_LENGTH = 40
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    PARSER_TIMEOUT = 60  # in seconds
    DEBUG = False
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.LRU_LENGTH)
        self._pool = None
        self._processes = None
        self._parser_finished_events = {}
        self._start_lock = threading.RLock()

        # These are here for debugging:
        self._archive = set()
        self._from_LRU = 0.0
        self._calculated_more_than_once = 0.0
        self._total = 0.0

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # Keep track of which pid is processing which http response
                # pylint: disable=E1101
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=25,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None
            self._processes = None

        # We don't need this data anymore
        self._cache.clear()

        if self.DEBUG:
            re_calc_rate = (self._calculated_more_than_once / self._total)
            print('parser_cache LRU rate: %s' % (self._from_LRU / self._total))
            print('parser_cache re-calculation rate: %s' % re_calc_rate)
            print('parser_cache size: %s' % self.LRU_LENGTH)

    def get_cache_key(self, http_response):
        """
        Before I used md5, but I realized that it was unnecessary. I
        experimented a little bit with python's hash functions and the builtin
        hash was the fastest.

        At first I thought that the built-in hash wasn't good enough, as it
        could create collisions... but... given that the LRU has only 40
        positions, the real probability of a collision is too low.

        :return: The key to be used in the cache for storing this http_response
        """
        # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the
        #       body before hashing
        uri_str = http_response.get_uri().url_string.encode('utf-8')

        body_str = http_response.body
        if isinstance(body_str, unicode):
            body_str = body_str.encode('utf-8', 'replace')

        _to_hash = body_str + uri_str

        # Added adler32 after finding some hash() collisions in builds
        hash_string = str(hash(_to_hash))
        hash_string += str(zlib.adler32(_to_hash))
        return hash_string

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def _test_parse_http_response(self, http_response, *args):
        """
        Left here for testing!
        """
        return DocumentParser(http_response)

    def _parse_http_response_in_worker(self, http_response, hash_string):
        """
        This parses the http_response in a pool worker. This has two features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :return: The DocumentParser instance
        """
        event = multiprocessing.Event()
        self._parser_finished_events[hash_string] = event

        # Start the worker processes if needed
        self.start_workers()

        apply_args = (ProcessDocumentParser, http_response, self._processes,
                      hash_string)

        # Push the task to the workers
        result = self._pool.apply_async(apply_with_return_error,
                                        (apply_args, ))

        try:
            parser_output = result.get(timeout=self.PARSER_TIMEOUT)
        except multiprocessing.TimeoutError:
            # Near the timeout error, so we make sure that the pid is still
            # running our "buggy" input
            pid = self._processes.pop(hash_string, None)
            if pid is not None:
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError, ose:
                    msg = 'An error occurred while killing the parser' \
                          ' process: "%s"'
                    om.out.debug(msg % ose)

            msg = '[timeout] The parser took more than %s seconds'\
                  ' to complete parsing of "%s", killed it!'

            om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url()))

            # Act just like when there is no parser
            msg = 'There is no parser for "%s".' % http_response.get_url()
            raise BaseFrameworkException(msg)
        else:
Пример #7
0
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(),
                                                     default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[can_parse] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response never again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
                timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url(
                )
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser

    def _log_return_empty(self, http_response, detail):
        msg = 'Returning empty list in get_tags_by_filter("%s"). '
        msg += detail
        om.out.debug(msg % http_response.get_uri())

    def get_tags_by_filter(self,
                           http_response,
                           tags,
                           yield_text=False,
                           cache=True):
        """
        Get specific tags from http_response using the cache if possible

        :param http_response: The http response instance
        :param tags: List of tags to get, or None if all tags should be returned
        :param yield_text: Include the tag text (<a>text</a>)
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # This is a performance hack that should reduce the time consumed by
        # this method without impacting its results. Note that in HTML this is
        # valid:
        #
        #   <script
        #
        # And this is invalid:
        #
        #   < script
        #
        # We use that in order to speed-up this function
        #
        if tags is not None:
            body_lower = http_response.get_body().lower()

            for tag in tags:
                lt_tag = '<%s' % tag
                if lt_tag in body_lower:
                    break
            else:
                # No tag was found in the HTML
                return []

        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            self._log_return_empty(http_response, 'No parser available')
            return []

        args = '%r%r' % (tags, yield_text)
        hash_string = get_body_unique_id(http_response, prepend=args)

        if hash_string in self._parser_blacklist:
            self._log_return_empty(http_response,
                                   'HTTP response is blacklisted')
            return []

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
                timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Timeout waiting for response')
                return []

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                tags = mp_doc_parser.get_tags_by_filter(http_response,
                                                        tags,
                                                        yield_text=yield_text)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(
                    http_response, 'Timeout waiting for get_tags_by_filter()')
                return []
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Reached memory usage limit')
                return []
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except Exception as e:
                # Act just like when there is no parser
                msg = 'Unhandled exception running get_tags_by_filter("%s"): %s'
                args = (http_response.get_url(), e)
                raise BaseFrameworkException(msg % args)
            else:
                if cache:
                    self._cache[hash_string] = tags
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return tags
Пример #8
0
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def parser_warpper(func):
        @functools.wraps(func)
        def inner(self, *args, **kwargs):
            if not hasattr(self, 'disk_cache'):
                self.disk_cache = {'key_set': set(), 'disk_cache': DiskDict('rsp_parser')}
            return func(self, *args, **kwargs)
        return inner

    @parser_warpper
    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        if http_response.is_image():
            # Act just like when there is no parser
            msg = 'There is no parser for image("%s")' % (http_response.get_url())
            raise BaseFrameworkException(msg)

        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            try:
                parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser:
            self._handle_cache_hit(hash_string)
            # om.out.debug('[parser cache][memory] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
        else:
            # om.out.debug('[parser cache][memory] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event
            
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            try:
                if hash_string in self.disk_cache['key_set']:
                    parser = self.disk_cache['disk_cache'][hash_string]
                    # om.out.debug('[parser cache][disk] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                else:
                    # om.out.debug('[parser cache][disk] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                    try:
                        parser = mp_doc_parser.get_document_parser_for(http_response)
                    except Exception as e:
                        # Act just like when there is no parser
                        msg = 'There is no parser for "%s".e=%s' % (http_response.get_url(), e)
                        raise BaseFrameworkException(msg)
                    else:
                        self.disk_cache['disk_cache'][hash_string] = parser
                        self.disk_cache['key_set'].add(hash_string)

                        save_to_cache = self.should_cache(http_response) and cache
                        if save_to_cache:
                            self._cache[hash_string] = parser
                        else:
                            self._handle_no_cache(hash_string)
            finally:
                self._parser_finished_events.pop(hash_string, None)
                event.set()

        return parser