Exemplo n.º 1
0
    def _get_404_response(self, http_response, query, debugging_id):
        """
        :return: A FourOhFourResponse instance.
                    * First try to get the response from the 404 DB

                    * If the data is not there then send an HTTP request
                    with a randomly generated path or name to force a 404,
                    save the data to the DB and then return it.
        """
        known_404 = self._404_responses.get(query.normalized_path, None)
        if known_404 is not None:
            return known_404

        known_404 = send_request_generate_404(self._uri_opener, http_response,
                                              debugging_id)

        self._404_responses[query.normalized_path] = known_404
        return known_404
Exemplo n.º 2
0
    def _get_404_response(self, http_response, query, debugging_id):
        """
        :return: A FourOhFourResponse instance.
                    * First try to get the response from the 404 DB

                    * If the data is not there then send an HTTP request
                    with a randomly generated path or name to force a 404,
                    save the data to the DB and then return it.
        """
        known_404 = self._404_responses.get(query.normalized_path, None)
        if known_404 is not None:
            return known_404

        known_404 = send_request_generate_404(self._uri_opener,
                                              http_response,
                                              debugging_id)

        self._404_responses[query.normalized_path] = known_404
        return known_404
Exemplo n.º 3
0
    def _handle_large_http_responses(self, http_response, query, known_404,
                                     debugging_id):
        """
        Decide if a large HTTP response is a 404 by comparing diffs.

        With large bodies fuzzy_equal() yields 404 false positives (see the
        comment tagged "{header-4000bytes}" elsewhere in this file), so
        three responses are used here instead of the usual two:

            * known_404: The forced 404 generated by this class
            * query: The HTTP response we want to classify
            * A second forced 404, requested by this method when needed

        The diff between the two forced 404s is compared, using
        fuzzy_equal_for_diff(), against the diff between known_404 and the
        query.

        The 404-vs-404 diff is kept in known_404.diff. It is only computed
        (and a second forced 404 only requested) when that attribute is
        None; in that case known_404 is updated in place and its dumps()
        output is written to the 404 DB.

        :param http_response: Response used for the debug message and
                              forwarded to send_request_generate_404()
        :param query: Exposes the body and normalized_path being checked
        :param known_404: Forced 404 with body, url, id and diff attributes
        :param debugging_id: Identifier included in the debug message
        :return: True if the query response is a 404!
        """
        first_404 = known_404

        if first_404.diff is None:
            # No diff was stored by a previous call, so request a second
            # forced 404 to compare against.
            #
            # exclude=[first_404.url] keeps the helper from requesting the
            # same forced 404 URL again
            second_404 = send_request_generate_404(self._uri_opener,
                                                   http_response,
                                                   debugging_id,
                                                   exclude=[first_404.url])

            first_404.diff, _ = chunked_diff(first_404.body, second_404.body)
            first_404.diff_with_id = second_404.id
            self._404_responses[query.normalized_path] = first_404.dumps()

        diff_between_404s = first_404.diff
        _, diff_with_query = chunked_diff(first_404.body, query.body)

        query_is_404 = bool(fuzzy_equal_for_diff(diff_between_404s,
                                                 diff_with_query,
                                                 IS_EQUAL_RATIO))

        # Both outcomes log the same fields, only the wording differs
        if query_is_404:
            template = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                        ' [similarity_ratio > %s with diff of 404]'
                        ' [Request IDs: %s]')
        else:
            template = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                        ' [similarity_ratio < %s with diff of 404]'
                        ' [Request IDs: %s]')

        involved_ids = (http_response.id,
                        first_404.id,
                        first_404.diff_with_id)

        fields = (http_response.get_url(),
                  http_response.id,
                  http_response.get_code(),
                  len(http_response.get_body()),
                  debugging_id,
                  IS_EQUAL_RATIO,
                  ', '.join(str(request_id) for request_id in involved_ids))

        om.out.debug(template % fields)
        return query_is_404
Exemplo n.º 4
0
    def _handle_large_http_responses(self, http_response, query, known_404, debugging_id):
        """
        Classify a large HTTP response as 404 / not 404 using three bodies.

        fuzzy_equal() produces 404 false positives when response bodies are
        large (search this file for "{header-4000bytes}" for the details).
        To work around that, this method relies on:

            * known_404: The forced 404 generated by this class
            * query: The HTTP response we want to know if it is a 404
            * Another forced 404, generated here only when required

        It diffs the two forced 404 responses, diffs known_404 against the
        query, and hands both diffs to fuzzy_equal_for_diff().

        The diff of the two forced 404s is stored in known_404.diff, so the
        extra HTTP request and the diff calculation only happen when that
        attribute is still None. When that happens the updated known_404 is
        also saved in the 404 DB under query.normalized_path.

        :param http_response: Used to build the debug message and forwarded
                              to send_request_generate_404()
        :param query: Exposes the body and normalized_path being checked
        :param known_404: Forced 404 with body, url, id and diff attributes
        :param debugging_id: Identifier included in the debug message
        :return: True if the query response is a 404!
        """
        base_404 = known_404

        if base_404.diff is None:
            # No previous knowledge to reuse: send the extra request, and
            # pass exclude=[base_404.url] so the same forced 404 URL is not
            # requested twice
            extra_404 = send_request_generate_404(self._uri_opener,
                                                  http_response,
                                                  debugging_id,
                                                  exclude=[base_404.url])

            base_404.diff, _ = diff(base_404.body, extra_404.body)
            base_404.diff_with_id = extra_404.id
            self._404_responses[query.normalized_path] = base_404

        reference_diff = base_404.diff
        _, candidate_diff = diff(base_404.body, query.body)

        matches_404 = bool(fuzzy_equal_for_diff(reference_diff,
                                                candidate_diff,
                                                IS_EQUAL_RATIO))

        not_404_msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                       ' [similarity_ratio < %s with diff of 404]'
                       ' [Request IDs: %s]')
        is_404_msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                      ' [similarity_ratio > %s with diff of 404]'
                      ' [Request IDs: %s]')
        msg = is_404_msg if matches_404 else not_404_msg

        # IDs of every HTTP response that took part in the decision
        id_strings = [str(http_response.id),
                      str(base_404.id),
                      str(base_404.diff_with_id)]

        args = (http_response.get_url(),
                http_response.id,
                http_response.get_code(),
                len(http_response.get_body()),
                debugging_id,
                IS_EQUAL_RATIO,
                ', '.join(id_strings))

        om.out.debug(msg % args)
        return matches_404