예제 #1
0
class VariantDB(object):
    """
    Keeps track of the fuzzable requests which were already accepted and
    decides if yet another variant of a request is worth keeping.

    append() enforces three independent limits:

        * Exact duplicates, identified by the request hash (calculated
          ignoring the headers listed in HASH_IGNORE_HEADERS), are always
          rejected.

        * Requests for which _has_form() is true are limited to
          max_equal_form_variants per "clean" form key.

        * All requests are limited per "clean" URI key: params_max_variants
          applies when the request has a query string or raw data,
          path_max_variants applies otherwise.

    Example starting with an empty DB (illustrative: append() receives a
    fuzzable request object, not a string):

        >> append('http://foo.com/abc?id=32')
        True

        >> append('http://foo.com/abc?id=32')
        False

    """
    HASH_IGNORE_HEADERS = ('referer', )
    TAG = '[variant_db]'

    # Passed as max_in_memory to the CachedDiskDict instances
    MAX_IN_MEMORY = 50

    def __init__(self):
        # Clean URI key -> number of accepted variants
        self._variants = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                        table_prefix='variant_db')

        # Hashes of every request we've seen.
        # NOTE(review): presumably a probabilistic structure, so a lookup
        # might report a request as seen when it was not -- confirm.
        self._variants_eq = ScalableBloomFilter()

        # Clean form key -> number of accepted variants
        self._variants_form = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                             table_prefix='variant_db_form')

        # The limits are read only once, when the instance is created
        self.params_max_variants = cf.cf.get('params_max_variants')
        self.path_max_variants = cf.cf.get('path_max_variants')
        self.max_equal_form_variants = cf.cf.get('max_equal_form_variants')

        # Serializes append() so the "read counter, then write counter"
        # sequences below are not interleaved between threads
        self._db_lock = threading.RLock()

    def cleanup(self):
        """
        Call cleanup() on both CachedDiskDict instances.
        """
        self._variants.cleanup()
        self._variants_form.cleanup()

    def append(self, fuzzable_request):
        """
        :param fuzzable_request: The fuzzable request to (maybe) add
        :return: True if we added a new fuzzable request variant to the DB,
                 False if NO more variants are required for this fuzzable
                 request.
        """
        with self._db_lock:
            if self._seen_exactly_the_same(fuzzable_request):
                return False

            # NOTE(review): the form check runs first and increments its
            # counter, so a request which is later rejected by the URI check
            # still consumes one form variant slot. Kept as-is.
            if self._has_form(fuzzable_request):
                if not self._need_more_variants_for_form(fuzzable_request):
                    return False

            if not self._need_more_variants_for_uri(fuzzable_request):
                return False

            # Yes, please give me more variants of fuzzable_request
            return True

    def _log_return_false(self, fuzzable_request, reason):
        """
        Write a debug message explaining why append() is rejecting the
        fuzzable request.

        :param fuzzable_request: The rejected fuzzable request
        :param reason: Short string naming the check that rejected it
        """
        args = (reason, fuzzable_request)
        msg = 'VariantDB is returning False because of "%s" for "%s"'
        om.out.debug(msg % args)

    def _need_more_variants_for_uri(self, fuzzable_request):
        """
        Do we need more variants for the fuzzable request? (similar match)

        Increments the counter stored for the clean URI key when the answer
        is True.

        :return: True while the number of accepted variants for the clean
                 URI key is below the applicable limit.
        """
        clean_dict_key = clean_fuzzable_request(fuzzable_request)
        count = self._variants.get(clean_dict_key, None)

        if count is None:
            # First time we see this pattern, always accept it
            self._variants[clean_dict_key] = 1
            return True

        # We've seen at least one fuzzable request with this pattern...
        url = fuzzable_request.get_uri()
        has_params = url.has_query_string() or fuzzable_request.get_raw_data()

        # Choose which max_variants to use
        if has_params:
            max_variants = self.params_max_variants
            max_variants_type = 'params'
        else:
            max_variants = self.path_max_variants
            max_variants_type = 'path'

        if count >= max_variants:
            _type = 'need_more_variants_for_uri(%s)' % max_variants_type
            self._log_return_false(fuzzable_request, _type)
            return False

        self._variants[clean_dict_key] = count + 1
        return True

    def _seen_exactly_the_same(self, fuzzable_request):
        """
        Is the fuzzable request already known to us? (exactly the same)

        The request hash is stored the first time it is seen, thus a second
        call with an equal request returns True.

        :return: True if a request with the same hash was seen before.
        """
        request_hash = fuzzable_request.get_request_hash(
            self.HASH_IGNORE_HEADERS)

        if request_hash in self._variants_eq:
            # This is the branch that makes append() return False, so this
            # is where the rejection has to be logged
            self._log_return_false(fuzzable_request, 'seen_exactly_the_same')
            return True

        # Store it to avoid duplicated fuzzable requests in our framework
        self._variants_eq.add(request_hash)
        return False

    def _has_form(self, fuzzable_request):
        """
        :return: True if the request has raw data with two or more
                 parameter names.
        """
        raw_data = fuzzable_request.get_raw_data()
        return bool(raw_data) and len(raw_data.get_param_names()) >= 2

    def _need_more_variants_for_form(self, fuzzable_request):
        """
        Do we need more variants for this form? (similar match)

        Increments the counter stored for the clean form key when the answer
        is True.

        :return: True while the number of accepted variants for the clean
                 form key is below max_equal_form_variants.
        """
        clean_dict_key_form = clean_fuzzable_request_form(fuzzable_request)
        count = self._variants_form.get(clean_dict_key_form, None)

        if count is None:
            # First time we see this form, always accept it
            self._variants_form[clean_dict_key_form] = 1
            return True

        if count >= self.max_equal_form_variants:
            self._log_return_false(fuzzable_request,
                                   'need_more_variants_for_form')
            return False

        self._variants_form[clean_dict_key_form] = count + 1
        return True
예제 #2
0
class VariantDB(object):
    """
    Database of accepted fuzzable request variants.

    append() answers the question "do we still want this fuzzable request?"
    by applying, in this order:

        1- An exact duplicate check based on the request hash, which is
           calculated ignoring the headers in HASH_IGNORE_HEADERS.

        2- A per-form limit (max_equal_form_variants), only for requests
           where _has_form() is true.

        3- A per-URI limit: params_max_variants when the request has a
           query string or raw data, path_max_variants otherwise.

    Example starting with an empty DB (illustrative: append() receives a
    fuzzable request object, not a string):

        >> append('http://foo.com/abc?id=32')
        True

        >> append('http://foo.com/abc?id=32')
        False

    """
    HASH_IGNORE_HEADERS = ('referer',)
    TAG = '[variant_db]'

    # Passed as max_in_memory to the CachedDiskDict instances
    MAX_IN_MEMORY = 50

    def __init__(self):
        # Counters: clean URI key -> accepted variants
        self._variants = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                        table_prefix='variant_db')

        # Request hashes seen so far.
        # NOTE(review): presumably probabilistic (bloom filter), confirm
        # that false positives are acceptable for callers.
        self._variants_eq = ScalableBloomFilter()

        # Counters: clean form key -> accepted variants
        self._variants_form = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,
                                             table_prefix='variant_db_form')

        # Limits are read from the configuration once, at creation time
        self.params_max_variants = cf.cf.get('params_max_variants')
        self.path_max_variants = cf.cf.get('path_max_variants')
        self.max_equal_form_variants = cf.cf.get('max_equal_form_variants')

        # Held during append() to keep counter read/write pairs atomic
        self._db_lock = threading.RLock()

    def cleanup(self):
        """
        Call cleanup() on both CachedDiskDict instances.
        """
        self._variants.cleanup()
        self._variants_form.cleanup()

    def append(self, fuzzable_request):
        """
        :param fuzzable_request: The fuzzable request to (maybe) add
        :return: True if we added a new fuzzable request variant to the DB,
                 False if NO more variants are required for this fuzzable
                 request.
        """
        with self._db_lock:
            if self._seen_exactly_the_same(fuzzable_request):
                return False

            # NOTE(review): the form counter is incremented before the URI
            # check runs, so a request rejected by the URI limit still uses
            # one form variant slot. Kept as-is.
            if self._has_form(fuzzable_request):
                if not self._need_more_variants_for_form(fuzzable_request):
                    return False

            if not self._need_more_variants_for_uri(fuzzable_request):
                return False

            # Yes, please give me more variants of fuzzable_request
            return True

    def _log_return_false(self, fuzzable_request, reason):
        """
        Write a debug message explaining why append() is rejecting the
        fuzzable request.

        :param fuzzable_request: The rejected fuzzable request
        :param reason: Short string naming the check that rejected it
        """
        args = (reason, fuzzable_request)
        msg = 'VariantDB is returning False because of "%s" for "%s"'
        om.out.debug(msg % args)

    def _need_more_variants_for_uri(self, fuzzable_request):
        """
        Do we need more variants for the fuzzable request? (similar match)

        When the answer is True the counter for the clean URI key is
        incremented as a side effect.

        :return: False once the counter reached the applicable limit.
        """
        clean_dict_key = clean_fuzzable_request(fuzzable_request)
        count = self._variants.get(clean_dict_key, None)

        if count is None:
            # Unknown pattern: accept it and start counting
            self._variants[clean_dict_key] = 1
            return True

        # We've seen at least one fuzzable request with this pattern...
        url = fuzzable_request.get_uri()
        has_params = url.has_query_string() or fuzzable_request.get_raw_data()

        # Choose which max_variants to use
        if has_params:
            max_variants = self.params_max_variants
            max_variants_type = 'params'
        else:
            max_variants = self.path_max_variants
            max_variants_type = 'path'

        if count >= max_variants:
            _type = 'need_more_variants_for_uri(%s)' % max_variants_type
            self._log_return_false(fuzzable_request, _type)
            return False

        self._variants[clean_dict_key] = count + 1
        return True

    def _seen_exactly_the_same(self, fuzzable_request):
        """
        Is the fuzzable request already known to us? (exactly the same)

        Unknown request hashes are stored before returning, so the next
        call with an equal request returns True.

        :return: True if a request with the same hash was seen before.
        """
        ignore = self.HASH_IGNORE_HEADERS
        request_hash = fuzzable_request.get_request_hash(ignore)

        if request_hash in self._variants_eq:
            # append() returns False because of this answer: log it here
            # and not in the "new request" branch below
            self._log_return_false(fuzzable_request, 'seen_exactly_the_same')
            return True

        # Store it to avoid duplicated fuzzable requests in our framework
        self._variants_eq.add(request_hash)
        return False

    def _has_form(self, fuzzable_request):
        """
        :return: True if the request has raw data with two or more
                 parameter names.
        """
        raw_data = fuzzable_request.get_raw_data()
        return bool(raw_data) and len(raw_data.get_param_names()) >= 2

    def _need_more_variants_for_form(self, fuzzable_request):
        """
        Do we need more variants for this form? (similar match)

        When the answer is True the counter for the clean form key is
        incremented as a side effect.

        :return: False once the counter reached max_equal_form_variants.
        """
        clean_dict_key_form = clean_fuzzable_request_form(fuzzable_request)
        count = self._variants_form.get(clean_dict_key_form, None)

        if count is None:
            # Unknown form: accept it and start counting
            self._variants_form[clean_dict_key_form] = 1
            return True

        if count >= self.max_equal_form_variants:
            self._log_return_false(fuzzable_request,
                                   'need_more_variants_for_form')
            return False

        self._variants_form[clean_dict_key_form] = count + 1
        return True
예제 #3
0
class Fingerprint404(object):
    """
    Read the 404 page(s) returned by the server.

    Decides if an HTTP response is a "not found" page using the user's
    configuration, the response code and, when that is not enough, a
    comparison against forced 404 responses which are kept in
    self._404_responses.

    :author: Andres Riancho ([email protected])
    """

    _instance = None

    def __init__(self):
        #
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies.
        #
        #   It is None until set_url_opener() is called.
        #
        self._uri_opener = None

        #
        #   Store the 404 responses in a dict which has normalized paths
        #   as keys and 404 data as values.
        #
        #   The most commonly used keys for this dict are stored in memory
        #   while the least commonly used are stored in SQLite
        #
        self._404_responses = CachedDiskDict(max_in_memory=MAX_404_IN_MEMORY,
                                             table_prefix='is_404')

    def is_404(self, http_response):
        """
        All of my previous versions of is_404 were very complex and tried to
        struggle with all possible cases. The truth is that in most "strange"
        cases w3af was failing miserably, so now I changed w3af's 404 detection
        once again, but keeping it as simple as possible.

        Also, and because I was trying to cover ALL CASES, I was performing a
        lot of requests in order to cover them, which in most situations was
        unnecessary.

        So now I go for a much simpler approach:
            1- Handle the most common case of all using no HTTP requests, this
               is implemented in _is_404_basic()

            2- Honor the user configured never_404 list before doing any
               expensive work

            3- Handle common cases by comparing the response with a known
               404, this is implemented in _is_404_complex()

            4- Handle rare cases with an extra forced 404, this is
               implemented in _handle_large_http_responses()

            5- _is_404_complex_impl() is wrapped with LRUCache404 (defined
               elsewhere) in order to re-use previous results

            6- Give the users the power to configure the 404 detection by
               setting a string that identifies the 404 response (in case the
               other cases are not being able to handle this web application)

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        if self._is_404_basic(http_response):
            return True

        #
        # _is_404_basic() returns False for "no cheap evidence of a 404" and
        # also for "the user said this path is never a 404". Without this
        # check the second case would continue to _is_404_complex(), which
        # might send HTTP requests and even answer True, ignoring the user's
        # configuration.
        #
        if self._is_never_404(http_response):
            return False

        if self._is_404_complex(http_response):
            return True

        return False

    def _is_never_404(self, http_response):
        """
        :param http_response: The HTTP response
        :return: True if the domain path of the response URL is in the user
                 configured never_404 list.
        """
        domain_path = http_response.get_url().get_domain_path()
        return domain_path in cf.cf.get('never_404')

    def _is_404_basic(self, http_response):
        """
        Verifies if the response is a 404 by checking the user's configuration
        and applying very basic algorithms.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        domain_path = http_response.get_url().get_domain_path()

        #
        # First we handle the user configured exceptions, always_404 wins
        # when the same path is in both lists
        #
        if domain_path in cf.cf.get('always_404'):
            return True

        if domain_path in cf.cf.get('never_404'):
            return False

        #
        # The user configured setting. "If this string is in the response,
        # then it is a 404"
        #
        string_match_404 = cf.cf.get('string_match_404')

        if string_match_404:
            if string_match_404 in http_response:
                return True

        #
        # This is the most simple case, we don't even have to think about this
        #
        # If there is some custom website that always returns 404 codes, then
        # we are screwed, but this is open source, and the pentester working
        # on that site can modify these lines.
        #
        if http_response.get_code() == 404:
            return True

        #
        # This is an edge case. Let me explain...
        #
        # Doing try/except in all plugins that send HTTP requests was hard (tm)
        # so plugins don't use ExtendedUrllib directly, instead they use the
        # UrlOpenerProxy (defined in plugin.py). This proxy catches any
        # exceptions and returns a 204 response.
        #
        # In most cases that works perfectly, because it will allow the plugin
        # to keep working without caring much about the exceptions. In some
        # edge cases someone will call is_404(204_response_generated_by_w3af)
        # and that will most likely return False, because the 204 response we
        # generate doesn't look like anything w3af has in the 404 DB.
        #
        # The following if handles that case
        #
        if is_no_content_response(http_response):
            return True

        return False

    @PreventMultipleThreads
    def _is_404_complex(self, http_response):
        """
        Normalize the HTTP response and delegate to _is_404_complex_impl().

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        # 404_body stored in the DB was cleaned when creating the
        # FourOhFourResponse class.
        #
        # Clean the body received as parameter in order to have a fair
        # comparison
        query = FourOhFourResponse.from_http_response(http_response)

        return self._is_404_complex_impl(http_response, query)

    def _debug_verdict(self, msg, http_response, debugging_id, *extra_args):
        """
        Write the result of a 404 check to the debug log.

        :param msg: Format string. The first five placeholders receive the
                    response URL, id, code, body length and the debugging
                    id, the remaining ones are filled with extra_args.
        :param http_response: The HTTP response which was analyzed
        :param debugging_id: The debugging id used for this analysis
        :param extra_args: Values for the remaining placeholders in msg
        """
        args = (http_response.get_url(),
                http_response.id,
                http_response.get_code(),
                len(http_response.get_body()),
                debugging_id) + extra_args
        om.out.debug(msg % args)

    @LRUCache404
    def _is_404_complex_impl(self, http_response, query):
        """
        Verifies if the response is a 404 by comparing it with other responses
        which are known to be 404s, potentially sends HTTP requests to the
        server.

        :param http_response: The HTTP response
        :param query: The HTTP response in FourOhFourResponse form (normalized
                      URL, clean body, etc.)
        :return: True if the HTTP response is a 404
        """
        # Re-use the debugging id of the response when there is one
        debugging_id = http_response.get_debugging_id()
        if debugging_id is None:
            debugging_id = rand_alnum(8)

        #
        # Compare query with a known 404 from the DB (or a generated one
        # if there is none with the same path in the DB)
        #
        known_404 = self._get_404_response(http_response, query, debugging_id)

        # Trivial performance improvement that prevents running fuzzy_equal
        if query.code in NOT_404_RESPONSE_CODES and known_404.code == 404:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [known 404 with ID %s uses 404 code]')
            self._debug_verdict(msg, http_response, debugging_id,
                                known_404.id)
            return False

        # Since the fuzzy_equal function is CPU-intensive we want to
        # avoid calling it for cases where we know it won't match, for
        # example in comparing an image and an html
        if query.content_type != known_404.content_type:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [document type mismatch with known 404 with ID %s]')
            self._debug_verdict(msg, http_response, debugging_id,
                                known_404.id)
            return False

        # This is the simplest case. If they are 100% equal, no matter how
        # large or complex the responses are, then query is a 404
        if known_404.body == query.body:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [string equals with 404 DB entry with ID %s]')
            self._debug_verdict(msg, http_response, debugging_id,
                                known_404.id)
            return True

        is_fuzzy_equal = fuzzy_equal(known_404.body, query.body,
                                     IS_EQUAL_RATIO)

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with known 404 with ID %s]')
            self._debug_verdict(msg, http_response, debugging_id,
                                IS_EQUAL_RATIO, known_404.id)
            return False

        if len(query.body) < MAX_FUZZY_LENGTH:
            # The response bodies are fuzzy-equal, and the length is less than
            # MAX_FUZZY_LENGTH. This is good, it means that they are equal and
            # long headers / footers in HTTP response bodies are not
            # interfering with fuzzy-equals.
            #
            # Some sites have really large headers and footers which they
            # include for all pages, including 404s. When that happens one page
            # might look like:
            #
            #   {header-4000bytes}
            #   Hello world
            #   {footer-4000bytes}
            #
            # The header might contain large CSS and the footer might include
            # JQuery or some other large JS. Then, the 404 might look like:
            #
            #   {header-4000bytes}
            #   Not found
            #   {footer-4000bytes}
            #
            # A user with a browser might only see the text, and clearly
            # identify one as a valid page and another as a 404, but the
            # fuzzy_equal() function will return True, indicating that they
            # are equal because 99% of the bytes are the same.
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [similarity_ratio > %s with 404 DB entry with ID %s]')
            self._debug_verdict(msg, http_response, debugging_id,
                                IS_EQUAL_RATIO, known_404.id)
            return True

        # See the large comment above on why we need to check for
        # MAX_FUZZY_LENGTH.
        #
        # The way to handle this case is to send an extra HTTP
        # request that will act as a tie-breaker.
        return self._handle_large_http_responses(http_response, query,
                                                 known_404, debugging_id)

    def _handle_large_http_responses(self, http_response, query, known_404,
                                     debugging_id):
        """
        When HTTP response bodies are large the fuzzy_equal() will generate
        404 false positives. This is explained in a comment above,
        (search for "{header-4000bytes}").

        This method will handle that case by using three HTTP responses instead
        of two (which is the most common case). The three HTTP responses used
        by this method are:

            * known_404: The forced 404 generated by this class
            * query:  The HTTP response we want to know if it is a 404
            * Another forced 404 generated by this method

        The method will diff the two 404 responses, and one 404 response with
        the query response, then compare both diffs using
        fuzzy_equal_for_diff() to determine if the query is a 404.

        The diff between the two forced 404 responses is saved in known_404
        (diff and diff_with_id attributes) and written to the 404 DB, in order
        to re-use it the next time.

        :param http_response: The HTTP response
        :param query: The HTTP response in FourOhFourResponse form
        :param known_404: The known 404 to compare against
        :param debugging_id: The debugging id used for this analysis
        :return: True if the query response is a 404!
        """
        # Make the algorithm easier to read
        known_404_1 = known_404

        if known_404_1.diff is None:
            # Need to send the second request and calculate the diff, there is
            # no previous knowledge that we can use
            #
            # Send exclude=[known_404_1.url] to prevent the function from
            # sending an HTTP request to the same forced 404 URL
            known_404_2 = send_request_generate_404(self._uri_opener,
                                                    http_response,
                                                    debugging_id,
                                                    exclude=[known_404_1.url])

            known_404_1.diff, _ = chunked_diff(known_404_1.body,
                                               known_404_2.body)
            known_404_1.diff_with_id = known_404_2.id
            self._404_responses[query.normalized_path] = known_404_1.dumps()

        # else: at some point during the execution of this scan we already
        # sent an HTTP request to use in this process and calculated the diff.
        # In order to prevent more HTTP requests from being sent to the
        # server, and also to reduce CPU usage, we saved the diff as an
        # attribute.

        diff_x = known_404_1.diff
        _, diff_y = chunked_diff(known_404_1.body, query.body)

        is_fuzzy_equal = fuzzy_equal_for_diff(diff_x, diff_y, IS_EQUAL_RATIO)

        # The IDs of the three HTTP responses involved in the decision
        request_ids = ', '.join([str(http_response.id),
                                 str(known_404_1.id),
                                 str(known_404_1.diff_with_id)])

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with diff of 404]'
                   ' [Request IDs: %s]')
            self._debug_verdict(msg, http_response, debugging_id,
                                IS_EQUAL_RATIO, request_ids)
            return False

        msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
               ' [similarity_ratio > %s with diff of 404]'
               ' [Request IDs: %s]')
        self._debug_verdict(msg, http_response, debugging_id,
                            IS_EQUAL_RATIO, request_ids)
        return True

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The opener used to send the requests which force
                          404 responses.
        """
        self._uri_opener = urlopener

    def _get_404_response(self, http_response, query, debugging_id):
        """
        :param http_response: The HTTP response
        :param query: The HTTP response in FourOhFourResponse form, its
                      normalized_path is the key used in the 404 DB
        :param debugging_id: The debugging id used for this analysis
        :return: A FourOhFourResponse instance.
                    * First try to get the response from the 404 DB

                    * If the data is not there then send an HTTP request
                    with a randomly generated path or name to force a 404,
                    save the data to the DB and then return it.
        """
        serialized_known_404 = self._404_responses.get(query.normalized_path,
                                                       None)

        if serialized_known_404 is not None:
            return FourOhFourResponse.loads(serialized_known_404)

        known_404 = send_request_generate_404(self._uri_opener, http_response,
                                              debugging_id)

        self._404_responses[query.normalized_path] = known_404.dumps()
        return known_404
예제 #4
0
class Fingerprint404(object):
    """
    Read the 404 page(s) returned by the server.

    :author: Andres Riancho ([email protected])
    """

    _instance = None

    def __init__(self):
        """
        Create the instance without an opener and with an empty 404 DB.
        """
        # The opener is needed to send requests which teach us how the
        # server's 404 response bodies look like. It starts unset.
        self._uri_opener = None

        # The 404 DB maps normalized paths (keys) to 404 data (values).
        #
        # The most commonly used keys are kept in memory, the least
        # commonly used ones are stored in SQLite.
        cache_settings = {'table_prefix': 'is_404',
                          'max_in_memory': MAX_404_IN_MEMORY}
        self._404_responses = CachedDiskDict(**cache_settings)

    @PreventMultipleThreads
    @LRUCache404
    def is_404(self, http_response):
        """
        All of my previous versions of is_404 were very complex and tried to
        struggle with all possible cases. The truth is that in most "strange"
        cases I was failing miserably, so now I changed my 404 detection once
        again, but keeping it as simple as possible.

        Also, and because I was trying to cover ALL CASES, I was performing a
        lot of requests in order to cover them, which in most situations was
        unnecessary.

        So now I go for a much simpler approach:
            1- Handle the most common case of all using only 1 HTTP request

            2- Handle rare cases with 2 HTTP requests

            3- Give the users the power to configure the 404 detection by
               setting a string that identifies the 404 response (in case we
               are missing it for some reason in cases #1 and #2)

        Paths listed by the user in never_404 are never reported as 404 and
        never reach the checks which compare responses.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        if self._is_404_basic(http_response):
            return True

        #
        # _is_404_basic() returns False for "no cheap evidence of a 404" and
        # also for "the user said this path is never a 404". Without this
        # check the second case would continue to _is_404_complex(), which
        # might answer True and ignore the user's configuration.
        #
        domain_path = http_response.get_url().get_domain_path()

        if domain_path in cf.cf.get('never_404'):
            return False

        if self._is_404_complex(http_response):
            return True

        return False

    def _is_404_basic(self, http_response):
        """
        Verifies if the response is a 404 by checking the user's configuration
        and applying very basic algorithms.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        domain_path = http_response.get_url().get_domain_path()

        #
        # First we handle the user configured exceptions, always_404 wins
        # when the same path is in both lists
        #
        if domain_path in cf.cf.get('always_404'):
            return True

        if domain_path in cf.cf.get('never_404'):
            return False

        #
        # The user configured setting. "If this string is in the response,
        # then it is a 404"
        #
        # Read the setting only once: the value which is checked for
        # emptiness is then the same value searched in the response.
        #
        string_match_404 = cf.cf.get('string_match_404')

        if string_match_404 and string_match_404 in http_response:
            return True

        #
        # This is the most simple case, we don't even have to think about this
        #
        # If there is some custom website that always returns 404 codes, then
        # we are screwed, but this is open source, and the pentester working
        # on that site can modify these lines.
        #
        response_code = http_response.get_code()

        if response_code == 404:
            return True

        #
        # This is an edge case. Let me explain...
        #
        # Doing try/except in all plugins that send HTTP requests was hard (tm)
        # so plugins don't use ExtendedUrllib directly, instead they use the
        # UrlOpenerProxy (defined in plugin.py). This proxy catches any
        # exceptions and returns a 204 response.
        #
        # In most cases that works perfectly, because it will allow the plugin
        # to keep working without caring much about the exceptions. In some
        # edge cases someone will call is_404(204_response_generated_by_w3af)
        # and that will most likely return False, because the 204 response we
        # generate doesn't look like anything w3af has in the 404 DB.
        #
        # The following if handles that case: a 204 with the NO_CONTENT_MSG
        # message and no headers is treated as a 404
        #
        if (response_code == 204
                and http_response.get_msg() == NO_CONTENT_MSG
                and http_response.get_headers() == Headers()):
            return True

        return False

    def _is_404_complex(self, http_response):
        """
        Verifies if the response is a 404 by comparing it with other responses
        which are known to be 404s, potentially sends HTTP requests to the
        server.

        :param http_response: The HTTP response
        :return: True if the HTTP response is a 404
        """
        response_did = http_response.get_debugging_id()
        debugging_id = response_did if response_did is not None else rand_alnum(8)

        # 404_body stored in the DB was cleaned when creating the
        # FourOhFourResponse class.
        #
        # Clean the body received as parameter in order to have a fair
        # comparison
        query = FourOhFourResponse(http_response)

        #
        # Compare query with a known 404 from the DB (or a generated one
        # if there is none with the same path in the DB)
        #
        known_404 = self._get_404_response(http_response, query, debugging_id)

        # Trivial performance improvement that prevents running fuzzy_equal
        if query.code in NOT_404_RESPONSE_CODES and known_404.code == 404:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [known 404 with ID %s uses 404 code]')
            args = (http_response.get_url(),
                    http_response.id,
                    http_response.get_code(),
                    len(http_response.get_body()),
                    debugging_id,
                    known_404.id)
            om.out.debug(msg % args)
            return False

        # Since the fuzzy_equal function is CPU-intensive we want to
        # avoid calling it for cases where we know it won't match, for
        # example in comparing an image and an html
        if query.doc_type != known_404.doc_type:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [document type mismatch with known 404 with ID %s]')
            args = (http_response.get_url(),
                    http_response.id,
                    http_response.get_code(),
                    len(http_response.get_body()),
                    debugging_id,
                    known_404.id)
            om.out.debug(msg % args)
            return False

        # This is the simplest case. If they are 100% equal, no matter how
        # large or complex the responses are, then query is a 404
        if known_404.body == query.body:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [string equals with 404 DB entry with ID %s]')
            args = (http_response.get_url(),
                    http_response.id,
                    http_response.get_code(),
                    len(http_response.get_body()),
                    debugging_id,
                    known_404.id)
            om.out.debug(msg % args)
            return True

        is_fuzzy_equal = fuzzy_equal(known_404.body, query.body, IS_EQUAL_RATIO)

        if not is_fuzzy_equal:
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with known 404 with ID %s]')
            args = (http_response.get_url(),
                    http_response.id,
                    http_response.get_code(),
                    len(http_response.get_body()),
                    debugging_id,
                    IS_EQUAL_RATIO,
                    known_404.id)
            om.out.debug(msg % args)
            return False

        if len(query.body) < MAX_FUZZY_LENGTH:
            # The response bodies are fuzzy-equal, and the length is less than
            # MAX_FUZZY_LENGTH. This is good, it means that they are equal and
            # long headers / footers in HTTP response bodies are not
            # interfering with fuzzy-equals.
            #
            # Some sites have really large headers and footers which they
            # include for all pages, including 404s. When that happens one page
            # might look like:
            #
            #   {header-4000bytes}
            #   Hello world
            #   {footer-4000bytes}
            #
            # The header might contain large CSS and the footer might include
            # JQuery or some other large JS. Then, the 404 might look like:
            #
            #   {header-4000bytes}
            #   Not found
            #   {footer-4000bytes}
            #
            # A user with a browser might only see the text, and clearly
            # identify one as a valid page and another as a 404, but the
            # fuzzy_equal() function will return True, indicating that they
            # are equal because 99% of the bytes are the same.
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [similarity_ratio > %s with 404 DB entry with ID %s]')
            args = (http_response.get_url(),
                    http_response.id,
                    http_response.get_code(),
                    len(http_response.get_body()),
                    debugging_id,
                    IS_EQUAL_RATIO,
                    known_404.id)
            om.out.debug(msg % args)
            return True

        else:
            # See the large comment above on why we need to check for
            # MAX_FUZZY_LENGTH.
            #
            # The way to handle this case is to send an extra HTTP
            # request that will act as a tie-breaker.
            return self._handle_large_http_responses(http_response,
                                                     query,
                                                     known_404,
                                                     debugging_id)

    def _handle_large_http_responses(self, http_response, query, known_404, debugging_id):
        """
        Tie-breaker used when the response bodies are large enough for a
        plain fuzzy_equal() to produce 404 false positives. The reason is
        explained in a comment above (search for "{header-4000bytes}").

        Three HTTP responses take part in the decision instead of the usual
        two:

            * known_404: The forced 404 that was already generated
            * query: The HTTP response we want to classify
            * A second forced 404, requested here only when known_404 has no
              stored diff yet

        The diff between the two forced 404s is compared, using
        fuzzy_equal_for_diff() and IS_EQUAL_RATIO, against the diff between
        known_404 and the query response.

        :param http_response: The HTTP response being analyzed, used for the
                              debug message and to build the second forced 404
        :param query: Object holding the body and normalized_path of the
                      response being classified
        :param known_404: The known 404 entry; its diff and diff_with_id
                          attributes are set here when diff is None
        :param debugging_id: Identifier included in the debug messages and
                             forwarded to send_request_generate_404()
        :return: True if the query response is a 404!
        """
        # Local alias: this is the first of the (up to) two forced 404s
        first_404 = known_404

        if first_404.diff is None:
            # There is no previous knowledge to reuse, so a second forced 404
            # is requested and the diff between both 404s is calculated.
            #
            # exclude=[first_404.url] prevents the function from sending an
            # HTTP request to the same forced 404 URL
            second_404 = send_request_generate_404(self._uri_opener,
                                                   http_response,
                                                   debugging_id,
                                                   exclude=[first_404.url])

            first_404.diff, _ = diff(first_404.body, second_404.body)
            first_404.diff_with_id = second_404.id

            # Store the entry again so the diff is available the next time
            # this path is analyzed: no extra HTTP request, less CPU usage
            self._404_responses[query.normalized_path] = first_404

        # When first_404.diff was already set, an earlier call did the work
        # above and the saved diff is simply reused
        diff_of_404s = first_404.diff
        _, diff_of_query = diff(first_404.body, query.body)

        looks_like_404 = fuzzy_equal_for_diff(diff_of_404s,
                                              diff_of_query,
                                              IS_EQUAL_RATIO)

        if looks_like_404:
            verdict = True
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                   ' [similarity_ratio > %s with diff of 404]'
                   ' [Request IDs: %s]')
        else:
            verdict = False
            msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                   ' [similarity_ratio < %s with diff of 404]'
                   ' [Request IDs: %s]')

        # Both outcomes log the same fields, only the message text differs
        involved_ids = (http_response.id,
                        first_404.id,
                        first_404.diff_with_id)

        args = (http_response.get_url(),
                http_response.id,
                http_response.get_code(),
                len(http_response.get_body()),
                debugging_id,
                IS_EQUAL_RATIO,
                ', '.join(map(str, involved_ids)))

        om.out.debug(msg % args)
        return verdict

    def set_url_opener(self, urlopener):
        """
        Store the URL opener in self._uri_opener.

        The stored object is the one handed to send_request_generate_404()
        whenever a forced 404 needs to be requested.

        :param urlopener: The URL opener instance to keep
        """
        self._uri_opener = urlopener

    def _get_404_response(self, http_response, query, debugging_id):
        """
        :return: A FourOhFourResponse instance.
                    * First try to get the response from the 404 DB

                    * If the data is not there then send an HTTP request
                    with a randomly generated path or name to force a 404,
                    save the data to the DB and then return it.
        """
        known_404 = self._404_responses.get(query.normalized_path, None)
        if known_404 is not None:
            return known_404

        known_404 = send_request_generate_404(self._uri_opener,
                                              http_response,
                                              debugging_id)

        self._404_responses[query.normalized_path] = known_404
        return known_404