class VariantDB(object):
    """
    Decide whether the framework still needs more variants of a request.

    See the notes on PARAMS_MAX_VARIANTS and PATH_MAX_VARIANTS (documented
    outside of this class) for the reasoning behind the limits.

    Three stores are kept:

        * _variants_eq: hashes of every request handed to append(), used to
          recognize requests which are exactly the same as a previous one.

        * _variants: counters keyed by clean_fuzzable_request(), limited by
          params_max_variants or path_max_variants.

        * _variants_form: counters keyed by clean_fuzzable_request_form(),
          limited by max_equal_form_variants.

    Because the hash of each request is remembered, appending the same
    request twice answers True the first time and False the second time.
    """
    # Headers which are ignored when calculating the request hash
    HASH_IGNORE_HEADERS = ('referer',)

    TAG = '[variant_db]'

    # Passed as max_in_memory to each CachedDiskDict
    MAX_IN_MEMORY = 50

    def __init__(self):
        self._variants = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                        table_prefix='variant_db')
        self._variants_eq = ScalableBloomFilter()
        self._variants_form = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                             table_prefix='variant_db_form')

        # The limits are read once, when the DB is created
        self.params_max_variants = cf.cf.get('params_max_variants')
        self.path_max_variants = cf.cf.get('path_max_variants')
        self.max_equal_form_variants = cf.cf.get('max_equal_form_variants')

        self._db_lock = threading.RLock()

    def cleanup(self):
        """
        Call cleanup() on both disk backed dicts.
        """
        self._variants.cleanup()
        self._variants_form.cleanup()

    def append(self, fuzzable_request):
        """
        :param fuzzable_request: The fuzzable request to register
        :return: True if we added a new fuzzable request variant to the DB,
                 False if NO more variants are required for this fuzzable
                 request.
        """
        with self._db_lock:
            if self._seen_exactly_the_same(fuzzable_request):
                return False

            if self._has_form(fuzzable_request):
                if not self._need_more_variants_for_form(fuzzable_request):
                    return False

            if not self._need_more_variants_for_uri(fuzzable_request):
                return False

            # Yes, please give me more variants of fuzzable_request
            return True

    def _log_return_false(self, fuzzable_request, reason):
        """
        Write a debug message explaining why append() is answering False.

        :param fuzzable_request: The rejected fuzzable request
        :param reason: Short string naming the check which rejected it
        """
        args = (reason, fuzzable_request)
        msg = 'VariantDB is returning False because of "%s" for "%s"'
        om.out.debug(msg % args)

    def _need_more_variants_for_uri(self, fuzzable_request):
        """
        Do we need more variants for the fuzzable request? (similar match)

        Uses the limits configured in PARAMS_MAX_VARIANTS and
        PATH_MAX_VARIANTS.

        :return: True if the request was counted as a new variant, False if
                 the limit for its pattern was already reached
        """
        clean_dict_key = clean_fuzzable_request(fuzzable_request)
        count = self._variants.get(clean_dict_key, None)

        if count is None:
            self._variants[clean_dict_key] = 1
            return True

        # We've seen at least one fuzzable request with this pattern...
        url = fuzzable_request.get_uri()
        has_params = url.has_query_string() or fuzzable_request.get_raw_data()

        # Choose which max_variants to use
        if has_params:
            max_variants = self.params_max_variants
            max_variants_type = 'params'
        else:
            max_variants = self.path_max_variants
            max_variants_type = 'path'

        if count >= max_variants:
            _type = 'need_more_variants_for_uri(%s)' % max_variants_type
            self._log_return_false(fuzzable_request, _type)
            return False

        self._variants[clean_dict_key] = count + 1
        return True

    def _seen_exactly_the_same(self, fuzzable_request):
        """
        Is the fuzzable request already known to us? (exactly the same)

        The request hash is stored the first time it is seen in order to
        avoid duplicated fuzzable requests in our framework.

        :return: True if the same request hash was stored by a previous call
        """
        request_hash = fuzzable_request.get_request_hash(
            self.HASH_IGNORE_HEADERS)

        if request_hash in self._variants_eq:
            # This is the branch which makes append() return False, thus it
            # is the one which needs to be explained in the debug log
            self._log_return_false(fuzzable_request, 'seen_exactly_the_same')
            return True

        self._variants_eq.add(request_hash)
        return False

    def _has_form(self, fuzzable_request):
        """
        :return: True if the request has raw data with two or more parameter
                 names
        """
        raw_data = fuzzable_request.get_raw_data()
        return bool(raw_data) and len(raw_data.get_param_names()) >= 2

    def _need_more_variants_for_form(self, fuzzable_request):
        """
        Do we need more variants for this form? (similar match)

        Uses the limit configured in MAX_EQUAL_FORM_VARIANTS.

        :return: True if the form was counted as a new variant, False if the
                 limit for its pattern was already reached
        """
        clean_dict_key_form = clean_fuzzable_request_form(fuzzable_request)
        count = self._variants_form.get(clean_dict_key_form, None)

        if count is None:
            self._variants_form[clean_dict_key_form] = 1
            return True

        if count >= self.max_equal_form_variants:
            self._log_return_false(fuzzable_request,
                                   'need_more_variants_for_form')
            return False

        self._variants_form[clean_dict_key_form] = count + 1
        return True
class VariantDB(object):
    """
    Keep track of the fuzzable requests which were already accepted, in
    order to answer if more variants of a given request are required.

    See the notes on PARAMS_MAX_VARIANTS and PATH_MAX_VARIANTS (documented
    outside of this class) for details about the limits.

    Every request passed to append() leaves its hash in the DB, which is
    why a second append() call with exactly the same request returns False
    even if the first call returned True.
    """
    # Header names ignored by get_request_hash() when detecting duplicates
    HASH_IGNORE_HEADERS = ('referer',)

    TAG = '[variant_db]'

    # max_in_memory setting used for the disk backed counters
    MAX_IN_MEMORY = 50

    def __init__(self):
        # Counters for "similar" requests, keyed by the cleaned request
        self._variants = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                        table_prefix='variant_db')

        # Hashes of the requests which are exactly the same
        self._variants_eq = ScalableBloomFilter()

        # Counters for "similar" forms, keyed by the cleaned form
        self._variants_form = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                             table_prefix='variant_db_form')

        self.params_max_variants = cf.cf.get('params_max_variants')
        self.path_max_variants = cf.cf.get('path_max_variants')
        self.max_equal_form_variants = cf.cf.get('max_equal_form_variants')

        self._db_lock = threading.RLock()

    def cleanup(self):
        """
        Release the resources held by the two CachedDiskDict instances by
        calling their cleanup() methods.
        """
        self._variants.cleanup()
        self._variants_form.cleanup()

    def append(self, fuzzable_request):
        """
        Run the three checks (exact duplicate, similar form, similar URI)
        while holding the DB lock.

        :param fuzzable_request: The fuzzable request to register
        :return: True if we added a new fuzzable request variant to the DB,
                 False if NO more variants are required for this fuzzable
                 request.
        """
        with self._db_lock:
            if self._seen_exactly_the_same(fuzzable_request):
                return False

            is_form = self._has_form(fuzzable_request)
            if is_form and not self._need_more_variants_for_form(fuzzable_request):
                return False

            if not self._need_more_variants_for_uri(fuzzable_request):
                return False

            # Yes, please give me more variants of fuzzable_request
            return True

    def _log_return_false(self, fuzzable_request, reason):
        """
        Send the reason why the request is being rejected to the debug log.

        :param fuzzable_request: The rejected fuzzable request
        :param reason: Name of the check which rejected the request
        """
        msg = 'VariantDB is returning False because of "%s" for "%s"'
        om.out.debug(msg % (reason, fuzzable_request))

    def _need_more_variants_for_uri(self, fuzzable_request):
        """
        Do we need more variants for the fuzzable request? (similar match)

        The limit is params_max_variants when the request has a query
        string or raw data, and path_max_variants otherwise.

        :return: True when the request is accepted (and counted), False
                 when there are enough variants with the same pattern
        """
        clean_dict_key = clean_fuzzable_request(fuzzable_request)
        count = self._variants.get(clean_dict_key, None)

        if count is None:
            # First request with this pattern
            self._variants[clean_dict_key] = 1
            return True

        # We've seen at least one fuzzable request with this pattern...
        url = fuzzable_request.get_uri()
        has_params = url.has_query_string() or fuzzable_request.get_raw_data()

        # Choose which max_variants to use
        if has_params:
            max_variants, max_variants_type = self.params_max_variants, 'params'
        else:
            max_variants, max_variants_type = self.path_max_variants, 'path'

        if count >= max_variants:
            _type = 'need_more_variants_for_uri(%s)' % max_variants_type
            self._log_return_false(fuzzable_request, _type)
            return False

        self._variants[clean_dict_key] = count + 1
        return True

    def _seen_exactly_the_same(self, fuzzable_request):
        """
        Is the fuzzable request already known to us? (exactly the same)

        :return: True if the request hash is already stored. When it is not,
                 the hash is stored to avoid duplicated fuzzable requests in
                 our framework and False is returned.
        """
        request_hash = fuzzable_request.get_request_hash(self.HASH_IGNORE_HEADERS)

        if request_hash not in self._variants_eq:
            self._variants_eq.add(request_hash)
            return False

        # Only the duplicate makes append() return False, so the debug
        # message belongs to this branch and not to the "new request" one
        self._log_return_false(fuzzable_request, 'seen_exactly_the_same')
        return True

    def _has_form(self, fuzzable_request):
        """
        :return: True if the raw data of the request is not empty and holds
                 at least two parameter names
        """
        raw_data = fuzzable_request.get_raw_data()

        if not raw_data:
            return False

        return len(raw_data.get_param_names()) >= 2

    def _need_more_variants_for_form(self, fuzzable_request):
        """
        Do we need more variants for this form? (similar match)

        The limit is max_equal_form_variants (MAX_EQUAL_FORM_VARIANTS).

        :return: True when the form is accepted (and counted), False when
                 there are enough variants with the same pattern
        """
        clean_dict_key_form = clean_fuzzable_request_form(fuzzable_request)
        count = self._variants_form.get(clean_dict_key_form, None)

        if count is None:
            # First form with this pattern
            self._variants_form[clean_dict_key_form] = 1
            return True

        if count >= self.max_equal_form_variants:
            self._log_return_false(fuzzable_request,
                                   'need_more_variants_for_form')
            return False

        self._variants_form[clean_dict_key_form] = count + 1
        return True
class Fingerprint404(object):
    """
    Read the 404 page(s) returned by the server.

    :author: Andres Riancho ([email protected])
    """
    _instance = None

    def __init__(self):
        #
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies. The real
        #   opener is provided later using set_url_opener()
        #
        self._uri_opener = None

        #
        #   Store the 404 responses in a dict which has normalized paths
        #   as keys and (serialized) 404 data as values.
        #
        #   The most commonly used keys for this dict are stored in memory
        #   while the least commonly used are stored in SQLite
        #
        self._404_responses = CachedDiskDict(max_in_memory=MAX_404_IN_MEMORY,
                                             table_prefix='is_404')

    def is_404(self, http_response):
        """
        Decide if an HTTP response is a 404, keeping the detection as simple
        as possible and sending as few HTTP requests as possible:

            1- Handle the most common case of all using no HTTP requests,
               this is implemented in _is_404_basic()

            2- Handle common cases using only 1 HTTP request, this is
               implemented in _is_404_complex()

            3- Handle rare cases with 2 HTTP requests, this is implemented
               in _handle_large_http_responses()

            4- Perform extensive caching in LRUCache404 to store the results
               generated by 1, 2, 3. Caching is performed by URL and
               response body. Use these results in the next calls to
               is_404()

            5- Give the users the power to configure the 404 detection by
               setting a string that identifies the 404 response (in case
               the other cases are not being able to handle this web
               application)

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        if self._is_404_basic(http_response):
            return True

        # _is_404_basic() returns False both when the user configured the
        # path in never_404 and when it was not able to decide. Only the
        # second case is allowed to reach the comparison based detection,
        # otherwise the user configuration would be overridden.
        if self._is_never_404(http_response):
            return False

        if self._is_404_complex(http_response):
            return True

        return False

    def _is_never_404(self, http_response):
        """
        :param http_response: The HTTP response
        :return: True if the domain path of the response URL is in the user
                 configured never_404 setting
        """
        domain_path = http_response.get_url().get_domain_path()
        return domain_path in cf.cf.get('never_404')

    def _is_404_basic(self, http_response):
        """
        Verifies if the response is a 404 by checking the user's
        configuration and applying very basic algorithms.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        domain_path = http_response.get_url().get_domain_path()

        #
        #   First we handle the user configured exceptions:
        #
        if domain_path in cf.cf.get('always_404'):
            return True

        if domain_path in cf.cf.get('never_404'):
            return False

        #
        #   The user configured setting. "If this string is in the response,
        #   then it is a 404"
        #
        string_match_404 = cf.cf.get('string_match_404')
        if string_match_404 and string_match_404 in http_response:
            return True

        #
        #   This is the most simple case, we don't even have to think about
        #   this.
        #
        #   If there is some custom website that always returns 404 codes,
        #   then we are screwed, but this is open source, and the pentester
        #   working on that site can modify these lines.
        #
        if http_response.get_code() == 404:
            return True

        #
        #   This is an edge case. Let me explain...
        #
        #   Doing try/except in all plugins that send HTTP requests was hard
        #   (tm) so plugins don't use ExtendedUrllib directly, instead they
        #   use the UrlOpenerProxy (defined in plugin.py). This proxy
        #   catches any exceptions and returns a 204 response.
        #
        #   In most cases that works perfectly, because it will allow the
        #   plugin to keep working without caring much about the
        #   exceptions. In some edge cases someone will call
        #   is_404(204_response_generated_by_w3af) and that will most likely
        #   return False, because the 204 response we generate doesn't look
        #   like anything w3af has in the 404 DB.
        #
        #   The following check handles that edge case
        #
        if is_no_content_response(http_response):
            return True

        return False

    @PreventMultipleThreads
    def _is_404_complex(self, http_response):
        """
        Normalize the response and run the comparison based detection.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        # 404_body stored in the DB was cleaned when creating the
        # FourOhFourResponse class.
        #
        # Clean the body received as parameter in order to have a fair
        # comparison
        query = FourOhFourResponse.from_http_response(http_response)

        return self._is_404_complex_impl(http_response, query)

    @LRUCache404
    def _is_404_complex_impl(self, http_response, query):
        """
        Verifies if the response is a 404 by comparing it with other
        responses which are known to be 404s, potentially sends HTTP
        requests to the server.

        :param http_response: The HTTP response
        :param query: The HTTP response in FourOhFourResponse form
                      (normalized URL, clean body, etc.)
        :return: True if the HTTP response is a 404
        """
        response_did = http_response.get_debugging_id()
        debugging_id = response_did if response_did is not None else rand_alnum(8)

        #
        #   Compare query with a known 404 from the DB (or a generated one
        #   if there is none with the same path in the DB)
        #
        known_404 = self._get_404_response(http_response, query, debugging_id)

        # Arguments shared by all the debug messages written below
        response_args = (http_response.get_url(),
                         http_response.id,
                         http_response.get_code(),
                         len(http_response.get_body()),
                         debugging_id)

        # Trivial performance improvement that prevents running fuzzy_equal
        if query.code in NOT_404_RESPONSE_CODES and known_404.code == 404:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [known 404 with ID %s uses 404 code]')
            om.out.debug(msg % (response_args + (known_404.id,)))
            return False

        # Since the fuzzy_equal function is CPU-intensive we want to
        # avoid calling it for cases where we know it won't match, for
        # example in comparing an image and an html
        if query.content_type != known_404.content_type:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [document type mismatch with known 404 with ID %s]')
            om.out.debug(msg % (response_args + (known_404.id,)))
            return False

        # This is the simplest case. If they are 100% equal, no matter how
        # large or complex the responses are, then query is a 404
        if known_404.body == query.body:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [string equals with 404 DB entry with ID %s]')
            om.out.debug(msg % (response_args + (known_404.id,)))
            return True

        is_fuzzy_equal = fuzzy_equal(known_404.body, query.body, IS_EQUAL_RATIO)

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with known 404 with ID %s]')
            om.out.debug(msg % (response_args + (IS_EQUAL_RATIO, known_404.id)))
            return False

        if len(query.body) < MAX_FUZZY_LENGTH:
            # The response bodies are fuzzy-equal, and the length is less
            # than MAX_FUZZY_LENGTH. This is good, it means that they are
            # equal and long headers / footers in HTTP response bodies are
            # not interfering with fuzzy-equals.
            #
            # Some sites have really large headers and footers which they
            # include for all pages, including 404s. When that happens one
            # page might look like:
            #
            #   {header-4000bytes}
            #   Hello world
            #   {footer-4000bytes}
            #
            # The header might contain large CSS and the footer might
            # include JQuery or some other large JS. Then, the 404 might
            # look like:
            #
            #   {header-4000bytes}
            #   Not found
            #   {footer-4000bytes}
            #
            # A user with a browser might only see the text, and clearly
            # identify one as a valid page and another as a 404, but the
            # fuzzy_equal() function will return True, indicating that they
            # are equal because 99% of the bytes are the same.
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [similarity_ratio > %s with 404 DB entry with ID %s]')
            om.out.debug(msg % (response_args + (IS_EQUAL_RATIO, known_404.id)))
            return True

        # See the large comment above on why we need to check for
        # MAX_FUZZY_LENGTH.
        #
        # The way to handle this case is to send an extra HTTP
        # request that will act as a tie-breaker.
        return self._handle_large_http_responses(http_response,
                                                 query,
                                                 known_404,
                                                 debugging_id)

    def _handle_large_http_responses(self, http_response, query, known_404,
                                     debugging_id):
        """
        When HTTP response bodies are large the fuzzy_equal() will generate
        404 false positives. This is explained in a comment in
        _is_404_complex_impl() (search for "{header-4000bytes}").

        This method will handle that case by using three HTTP responses
        instead of two (which is the most common case). The three HTTP
        responses used by this method are:

            * known_404: The forced 404 generated by this class
            * query: The HTTP response we want to know if it is a 404
            * Another forced 404 generated by this method

        The method will diff the two 404 responses, and one 404 response
        with the query response, then compare using fuzzy_equal_for_diff()
        to determine if the query is a 404.

        :param http_response: The HTTP response
        :param query: The HTTP response in FourOhFourResponse form
        :param known_404: The known 404 to compare against
        :param debugging_id: ID included in the debug messages and sent
                             with the forced 404 request
        :return: True if the query response is a 404!
        """
        # Make the algorithm easier to read
        known_404_1 = known_404

        if known_404_1.diff is None:
            # Need to send the second request and calculate the diff, there
            # is no previous knowledge that we can use.
            #
            # When the diff is not None we already sent the HTTP request at
            # some point during this scan and saved the diff as an
            # attribute, which prevents more HTTP requests from being sent
            # to the server and reduces CPU usage.
            #
            # Send exclude=[known_404_1.url] to prevent the function from
            # sending an HTTP request to the same forced 404 URL
            known_404_2 = send_request_generate_404(self._uri_opener,
                                                    http_response,
                                                    debugging_id,
                                                    exclude=[known_404_1.url])

            known_404_1.diff, _ = chunked_diff(known_404_1.body,
                                               known_404_2.body)
            known_404_1.diff_with_id = known_404_2.id
            self._404_responses[query.normalized_path] = known_404_1.dumps()

        diff_x = known_404_1.diff
        _, diff_y = chunked_diff(known_404_1.body, query.body)

        is_fuzzy_equal = fuzzy_equal_for_diff(diff_x, diff_y, IS_EQUAL_RATIO)

        request_ids = ', '.join([str(http_response.id),
                                 str(known_404_1.id),
                                 str(known_404_1.diff_with_id)])

        args = (http_response.get_url(),
                http_response.id,
                http_response.get_code(),
                len(http_response.get_body()),
                debugging_id,
                IS_EQUAL_RATIO,
                request_ids)

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with diff of 404]'
                   ' [Request IDs: %s]')
            om.out.debug(msg % args)
            return False

        msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
               ' [similarity_ratio > %s with diff of 404]'
               ' [Request IDs: %s]')
        om.out.debug(msg % args)
        return True

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The opener used to send the forced 404 requests
        """
        self._uri_opener = urlopener

    def _get_404_response(self, http_response, query, debugging_id):
        """
        :return: A FourOhFourResponse instance.
                    * First try to get the response from the 404 DB

                    * If the data is not there then send an HTTP request
                      with a randomly generated path or name to force a 404,
                      save the data to the DB and then return it.
        """
        serialized_known_404 = self._404_responses.get(query.normalized_path,
                                                       None)

        if serialized_known_404 is not None:
            return FourOhFourResponse.loads(serialized_known_404)

        known_404 = send_request_generate_404(self._uri_opener,
                                              http_response,
                                              debugging_id)

        # The DB stores the serialized form of the response
        self._404_responses[query.normalized_path] = known_404.dumps()
        return known_404
class Fingerprint404(object):
    """
    Read the 404 page(s) returned by the server.

    :author: Andres Riancho ([email protected])
    """
    _instance = None

    def __init__(self):
        #
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies. The real
        #   opener is provided later using set_url_opener()
        #
        self._uri_opener = None

        #
        #   Store the 404 responses in a dict which has normalized paths
        #   as keys and 404 data as values.
        #
        #   The most commonly used keys for this dict are stored in memory
        #   while the least commonly used are stored in SQLite
        #
        self._404_responses = CachedDiskDict(max_in_memory=MAX_404_IN_MEMORY,
                                             table_prefix='is_404')

    @PreventMultipleThreads
    @LRUCache404
    def is_404(self, http_response):
        """
        Decide if an HTTP response is a 404, keeping the detection as simple
        as possible and sending as few HTTP requests as possible:

            1- Handle the most common case of all using only 1 HTTP request

            2- Handle rare cases with 2 HTTP requests

            3- Give the users the power to configure the 404 detection by
               setting a string that identifies the 404 response (in case we
               are missing it for some reason in cases #1 and #2)

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        if self._is_404_basic(http_response):
            return True

        # A False from _is_404_basic() means either "the user listed this
        # path in never_404" or "unable to decide". The comparison based
        # detection must only run for the latter, if not it could override
        # the user configuration.
        if self._is_never_404(http_response):
            return False

        if self._is_404_complex(http_response):
            return True

        return False

    def _is_never_404(self, http_response):
        """
        :param http_response: The HTTP response
        :return: True if the domain path of the response URL is in the user
                 configured never_404 setting
        """
        domain_path = http_response.get_url().get_domain_path()
        return domain_path in cf.cf.get('never_404')

    def _is_404_basic(self, http_response):
        """
        Verifies if the response is a 404 by checking the user's
        configuration and applying very basic algorithms.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        domain_path = http_response.get_url().get_domain_path()

        #
        #   First we handle the user configured exceptions:
        #
        if domain_path in cf.cf.get('always_404'):
            return True

        if domain_path in cf.cf.get('never_404'):
            return False

        #
        #   The user configured setting. "If this string is in the response,
        #   then it is a 404". The setting is read only once.
        #
        string_match_404 = cf.cf.get('string_match_404')
        if string_match_404 and string_match_404 in http_response:
            return True

        #
        #   This is the most simple case, we don't even have to think about
        #   this.
        #
        #   If there is some custom website that always returns 404 codes,
        #   then we are screwed, but this is open source, and the pentester
        #   working on that site can modify these lines.
        #
        if http_response.get_code() == 404:
            return True

        #
        #   This is an edge case. Let me explain...
        #
        #   Doing try/except in all plugins that send HTTP requests was hard
        #   (tm) so plugins don't use ExtendedUrllib directly, instead they
        #   use the UrlOpenerProxy (defined in plugin.py). This proxy
        #   catches any exceptions and returns a 204 response.
        #
        #   In most cases that works perfectly, because it will allow the
        #   plugin to keep working without caring much about the
        #   exceptions. In some edge cases someone will call
        #   is_404(204_response_generated_by_w3af) and that will most likely
        #   return False, because the 204 response we generate doesn't look
        #   like anything w3af has in the 404 DB.
        #
        #   The following check handles that edge case: code 204, the
        #   NO_CONTENT_MSG message and empty headers
        #
        if (http_response.get_code() == 204
                and http_response.get_msg() == NO_CONTENT_MSG
                and http_response.get_headers() == Headers()):
            return True

        return False

    def _is_404_complex(self, http_response):
        """
        Verifies if the response is a 404 by comparing it with other
        responses which are known to be 404s, potentially sends HTTP
        requests to the server.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        response_did = http_response.get_debugging_id()
        debugging_id = response_did if response_did is not None else rand_alnum(8)

        # 404_body stored in the DB was cleaned when creating the
        # FourOhFourResponse class.
        #
        # Clean the body received as parameter in order to have a fair
        # comparison
        query = FourOhFourResponse(http_response)

        #
        #   Compare query with a known 404 from the DB (or a generated one
        #   if there is none with the same path in the DB)
        #
        known_404 = self._get_404_response(http_response, query, debugging_id)

        # Arguments shared by all the debug messages written below
        response_args = (http_response.get_url(),
                         http_response.id,
                         http_response.get_code(),
                         len(http_response.get_body()),
                         debugging_id)

        # Trivial performance improvement that prevents running fuzzy_equal
        if query.code in NOT_404_RESPONSE_CODES and known_404.code == 404:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [known 404 with ID %s uses 404 code]')
            om.out.debug(msg % (response_args + (known_404.id,)))
            return False

        # Since the fuzzy_equal function is CPU-intensive we want to
        # avoid calling it for cases where we know it won't match, for
        # example in comparing an image and an html
        if query.doc_type != known_404.doc_type:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [document type mismatch with known 404 with ID %s]')
            om.out.debug(msg % (response_args + (known_404.id,)))
            return False

        # This is the simplest case. If they are 100% equal, no matter how
        # large or complex the responses are, then query is a 404
        if known_404.body == query.body:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [string equals with 404 DB entry with ID %s]')
            om.out.debug(msg % (response_args + (known_404.id,)))
            return True

        is_fuzzy_equal = fuzzy_equal(known_404.body, query.body, IS_EQUAL_RATIO)

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with known 404 with ID %s]')
            om.out.debug(msg % (response_args + (IS_EQUAL_RATIO, known_404.id)))
            return False

        if len(query.body) < MAX_FUZZY_LENGTH:
            # The response bodies are fuzzy-equal, and the length is less
            # than MAX_FUZZY_LENGTH. This is good, it means that they are
            # equal and long headers / footers in HTTP response bodies are
            # not interfering with fuzzy-equals.
            #
            # Some sites have really large headers and footers which they
            # include for all pages, including 404s. When that happens one
            # page might look like:
            #
            #   {header-4000bytes}
            #   Hello world
            #   {footer-4000bytes}
            #
            # The header might contain large CSS and the footer might
            # include JQuery or some other large JS. Then, the 404 might
            # look like:
            #
            #   {header-4000bytes}
            #   Not found
            #   {footer-4000bytes}
            #
            # A user with a browser might only see the text, and clearly
            # identify one as a valid page and another as a 404, but the
            # fuzzy_equal() function will return True, indicating that they
            # are equal because 99% of the bytes are the same.
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [similarity_ratio > %s with 404 DB entry with ID %s]')
            om.out.debug(msg % (response_args + (IS_EQUAL_RATIO, known_404.id)))
            return True

        # See the large comment above on why we need to check for
        # MAX_FUZZY_LENGTH.
        #
        # The way to handle this case is to send an extra HTTP
        # request that will act as a tie-breaker.
        return self._handle_large_http_responses(http_response,
                                                 query,
                                                 known_404,
                                                 debugging_id)

    def _handle_large_http_responses(self, http_response, query, known_404,
                                     debugging_id):
        """
        When HTTP response bodies are large the fuzzy_equal() will generate
        404 false positives. This is explained in a comment in
        _is_404_complex() (search for "{header-4000bytes}").

        This method will handle that case by using three HTTP responses
        instead of two (which is the most common case). The three HTTP
        responses used by this method are:

            * known_404: The forced 404 generated by this class
            * query: The HTTP response we want to know if it is a 404
            * Another forced 404 generated by this method

        The method will diff the two 404 responses, and one 404 response
        with the query response, then compare using fuzzy_equal_for_diff()
        to determine if the query is a 404.

        :param http_response: The HTTP response
        :param query: The HTTP response in FourOhFourResponse form
        :param known_404: The known 404 to compare against
        :param debugging_id: ID included in the debug messages and sent
                             with the forced 404 request
        :return: True if the query response is a 404!
        """
        # Make the algorithm easier to read
        known_404_1 = known_404

        if known_404_1.diff is None:
            # Need to send the second request and calculate the diff, there
            # is no previous knowledge that we can use.
            #
            # When the diff is not None we already sent the HTTP request at
            # some point during this scan and saved the diff as an
            # attribute, which prevents more HTTP requests from being sent
            # to the server and reduces CPU usage.
            #
            # Send exclude=[known_404_1.url] to prevent the function from
            # sending an HTTP request to the same forced 404 URL
            known_404_2 = send_request_generate_404(self._uri_opener,
                                                    http_response,
                                                    debugging_id,
                                                    exclude=[known_404_1.url])

            known_404_1.diff, _ = diff(known_404_1.body, known_404_2.body)
            known_404_1.diff_with_id = known_404_2.id
            self._404_responses[query.normalized_path] = known_404_1

        diff_x = known_404_1.diff
        _, diff_y = diff(known_404_1.body, query.body)

        is_fuzzy_equal = fuzzy_equal_for_diff(diff_x, diff_y, IS_EQUAL_RATIO)

        request_ids = ', '.join([str(http_response.id),
                                 str(known_404_1.id),
                                 str(known_404_1.diff_with_id)])

        args = (http_response.get_url(),
                http_response.id,
                http_response.get_code(),
                len(http_response.get_body()),
                debugging_id,
                IS_EQUAL_RATIO,
                request_ids)

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with diff of 404]'
                   ' [Request IDs: %s]')
            om.out.debug(msg % args)
            return False

        msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
               ' [similarity_ratio > %s with diff of 404]'
               ' [Request IDs: %s]')
        om.out.debug(msg % args)
        return True

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The opener used to send the forced 404 requests
        """
        self._uri_opener = urlopener

    def _get_404_response(self, http_response, query, debugging_id):
        """
        :return: A FourOhFourResponse instance.
                    * First try to get the response from the 404 DB

                    * If the data is not there then send an HTTP request
                      with a randomly generated path or name to force a 404,
                      save the data to the DB and then return it.
        """
        known_404 = self._404_responses.get(query.normalized_path, None)

        if known_404 is not None:
            return known_404

        known_404 = send_request_generate_404(self._uri_opener,
                                              http_response,
                                              debugging_id)

        self._404_responses[query.normalized_path] = known_404
        return known_404