def getDocumentParserFor( self, httpResponse, normalizeMarkup=True ):
    '''
    Get a document parser for an HTTP response, reusing a cached one when possible.

    @parameter httpResponse: The response to parse; its body is part of the cache key.
    @parameter normalizeMarkup: Passed through to documentParser.documentParser; it is
                                also part of the cache key, so a parser built with one
                                setting is never returned for the other.
    @return: The cached (or newly created and cached) document parser.
    '''
    # The built-in hash() is used as the cache key instead of a digest such as md5,
    # because it is much faster. Timings recorded by the original author with
    # "python -m timeit -n 100000" on s = "aaa" * 1234:
    #
    #   zlib.crc32(s)                  6.03  usec per loop
    #   zlib.adler32(s)                3.87  usec per loop
    #   hashlib.sha1(s).hexdigest()   16.6   usec per loop
    #   hashlib.md5(s).hexdigest()    12.9   usec per loop
    #   hash(s)                        0.117 usec per loop
    #
    # The built-in hash can collide, but per the original note the LRU only has
    # 30 positions, so the real probability of a collision is considered too low
    # to matter.
    #
    # NOTE(review): the key does not include the response URL. If the parser
    # resolves relative references against it, identical bodies served from
    # different URLs would share a parser -- confirm whether that matters.
    cache_key = hash( ( httpResponse.getBody(), normalizeMarkup ) )

    # The lock is held while parsing too, so a given key is only ever parsed once.
    with self._LRULock:
        if cache_key in self._cache:
            return self._cache[ cache_key ]

        # Create a new instance of dp, add it to the cache
        parser = documentParser.documentParser( httpResponse, normalizeMarkup )
        self._cache[ cache_key ] = parser
        return parser
def _get_images( self, fuzzable_request ):
    '''
    Get all img tags of a page, retrieve each src and hash the image contents.

    @parameter fuzzable_request: The request whose URI is fetched (with useCache=False).
    @return: A map with the img src as a key, and the SHA-1 hex digest of the image
             contents as the value. The map is empty when the page can not be
             retrieved or parsed; images that fail to download, or whose response
             is not an image, are left out.
    '''
    # Local import: hashlib.sha1 produces the same digest as the deprecated
    # sha.new, without depending on the file's top-level imports.
    import hashlib

    res = {}

    try:
        response = self._urlOpener.GET( fuzzable_request.getURI(), useCache=False )
    except Exception:
        # Best effort: any retrieval error just means "no images found".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        om.out.debug('Failed to retrieve the page for finding captchas.')
        return res

    # Do not use dpCache here, it's no good.
    try:
        document_parser = documentParser.documentParser( response )
    except w3afException:
        om.out.debug('Failed to parse the page for finding captchas.')
        return res

    image_list = [ urlParser.uri2url(i)
                   for i in document_parser.getReferencesOfTag('img') ]

    for img_src in image_list:
        # TODO: Use self._tm.startFunction
        try:
            image_response = self._urlOpener.GET( img_src, useCache=False )
        except Exception:
            om.out.debug('Failed to retrieve the image for finding captchas.')
            continue

        if image_response.is_image():
            res[ img_src ] = hashlib.sha1( image_response.getBody() ).hexdigest()

    return res