Example No. 1
0
    def get_document_parser_for(self, http_response):
        """
        Return a DocumentParser for http_response, serving it from the
        LRU cache when an identical (body, uri) pair was seen before.

        :param http_response: The HTTP response to parse
        :return: A DocumentParser instance (possibly shared via the cache)
        """
        # The cache key uses Python's built-in hash() instead of md5/sha1.
        # Micro-benchmarks (timeit, ~3.7KB input) showed:
        #
        #   zlib.crc32            ~6.03  usec/loop
        #   zlib.adler32          ~3.87  usec/loop
        #   hashlib.sha1          ~16.6  usec/loop
        #   hashlib.md5           ~12.9  usec/loop
        #   hash() built-in       ~0.117 usec/loop
        #
        # hash() could in theory collide, but with an LRU of only 30
        # positions the real probability of a collision is negligible.
        self._total += 1

        # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the
        #       body before hashing
        uri_bytes = http_response.get_uri().url_string.encode('utf-8')

        body_bytes = http_response.body
        if isinstance(body_bytes, unicode):
            body_bytes = body_bytes.encode('utf-8', 'replace')

        cache_key = hash(body_bytes + uri_bytes)

        with self._LRULock:
            try:
                # Cache hit: reuse the previously built parser
                parser = self._cache[cache_key]
            except KeyError:
                # Cache miss: build a new parser and remember it
                parser = DocumentParser.DocumentParser(http_response)
                self._cache[cache_key] = parser
                self._debug_not_in_cache(cache_key)
            else:
                self._debug_in_cache(cache_key)
            return parser
Example No. 2
0
    def _get_images(self, fuzzable_request):
        """
        Get all img tags and retrieve the src.

        :param fuzzable_request: The request to modify
        :return: A list with tuples containing (img_src, image_hash, http_response)
        """
        res = []

        try:
            response = self._uri_opener.GET(fuzzable_request.get_uri(),
                                            cache=False)
        # FIX: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; narrow to Exception so the framework can still
        # be interrupted while keeping the best-effort behavior.
        except Exception:
            om.out.debug('Failed to retrieve the page for finding captchas.')
            return res

        # Do not use parser_cache here, it's not good since CAPTCHA implementations
        # *might* change the image name for each request of the HTML
        #
        # dp = parser_cache.dpc.get_document_parser_for( response )
        #
        try:
            document_parser = DocumentParser.DocumentParser(response)
        except BaseFrameworkException:
            # The response couldn't be parsed; no images to report
            return []

        image_path_list = document_parser.get_references_of_tag('img')

        # Hoist lookups out of the retrieval loop (hot path)
        GET = self._uri_opener.GET
        sha1 = hashlib.sha1

        # Fetch the images concurrently through the worker pool
        result_iter = self.worker_pool.imap_unordered(GET, image_path_list)

        for image_response in result_iter:
            if image_response.is_image():
                img_src = image_response.get_uri()
                img_hash = sha1(image_response.get_body()).hexdigest()
                # NOTE: the tuple intentionally carries the original page
                # response, not the image response (see docstring)
                res.append((img_src, img_hash, response))

        return res