Пример #1
    def test_namedtuple(self):
        """
        Entries read back from a DiskDeque still expose their fields by name.

        A FourOhFourResponse is appended and every item yielded by the
        iteration must report 'image' as its doc_type.
        """
        disk_deque = DiskDeque(maxsize=2)
        stored_response = FourOhFourResponse('body', 'image', '/')
        disk_deque.append(stored_response)

        for read_back in disk_deque:
            self.assertEqual(read_back.doc_type, 'image')
Пример #2
    def test_namedtuple(self):
        """
        Entries read back from a DiskDeque still expose their fields by name.

        A two-field FourOhFourResponse is appended and every item yielded by
        the iteration must report 'image' as its doc_type.
        """
        disk_deque = DiskDeque(maxsize=2)
        stored_response = FourOhFourResponse('body', 'image')
        disk_deque.append(stored_response)

        for read_back in disk_deque:
            self.assertEqual(read_back.doc_type, 'image')
Пример #3
    def test_namedtuple(self):
        """
        Every item yielded by the DiskDeque must report 'image' as its
        content_type.
        """
        disk_deque = DiskDeque(maxsize=2)

        # NOTE(review): nothing is appended to the deque in this version of
        # the test, so the loop body presumably never runs and the assertion
        # is never exercised. An append of a FourOhFourResponse looks like it
        # is missing here -- confirm against the upstream test.
        for read_back in disk_deque:
            self.assertEqual(read_back.content_type, 'image')
Пример #4
    def test_iter(self):
        """
        Iterating over a DiskDeque yields the appended items in the order in
        which they were appended.
        """
        disk_deque = DiskDeque(maxsize=2)

        # Fill the deque up to its maxsize, these are the values the final
        # assertion expects to read back
        disk_deque.append(1)
        disk_deque.append(2)

        contents = []
        for i in disk_deque:
            contents.append(i)

        self.assertEqual(contents, [1, 2])
Пример #5
    def test_iter(self):
        """
        Iterating over a DiskDeque yields the appended items in the order in
        which they were appended.
        """
        disk_deque = DiskDeque(maxsize=2)

        # Fill the deque up to its maxsize, these are the values the final
        # assertion expects to read back
        disk_deque.append(1)
        disk_deque.append(2)

        contents = []
        for i in disk_deque:
            contents.append(i)

        self.assertEqual(contents, [1, 2])
Пример #6
    def test_int(self):
        """
        Membership tests work for ints, and appending beyond maxsize drops
        the oldest item.
        """
        disk_deque = DiskDeque(maxsize=2)

        disk_deque.append(1)
        disk_deque.append(2)

        self.assertIn(1, disk_deque)
        self.assertIn(2, disk_deque)

        # The deque is full (maxsize=2): this third append is expected to
        # push the oldest item (1) out
        disk_deque.append(3)

        self.assertNotIn(1, disk_deque)
        self.assertIn(2, disk_deque)
        self.assertIn(3, disk_deque)
Пример #7
    def __init__(self):
        """
        Create the instance with no URL opener and no worker pool set, and
        with empty internal storage.
        """
        #   Both collaborators start unset. The opener is required to perform
        #   some tests and gain the knowledge about the server's 404 response
        #   bodies.
        self._worker_pool = None
        self._uri_opener = None

        #   Internal variables
        self._lock = thread.allocate_lock()
        self._already_analyzed = False
        self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
        self._directory_uses_404_codes = ScalableBloomFilter()

        # Keeping 250 entries here is cheap: only path+filename is stored as
        # the key, and a bool as the value.
        self.is_404_LRU = SynchronizedLRUDict(250)
Пример #8
    def __init__(self):
        """
        Create the instance with no URL opener and no worker pool set, and
        with empty internal storage.
        """
        #   Neither collaborator is available yet. The opener is required to
        #   perform some tests and gain the knowledge about the server's 404
        #   response bodies.
        self._uri_opener = self._worker_pool = None

        #   Internal variables
        self._lock = thread.allocate_lock()
        self._already_analyzed = False
        self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
        self._directory_uses_404_codes = ScalableBloomFilter()

        # Keeping 250 entries here is cheap: only path+filename is stored as
        # the key, and a bool as the value.
        self.is_404_LRU = SynchronizedLRUDict(250)
Пример #9
    def test_int(self):
        """
        Membership tests work for ints, and appending beyond maxsize drops
        the oldest item.
        """
        disk_deque = DiskDeque(maxsize=2)

        disk_deque.append(1)
        disk_deque.append(2)

        self.assertIn(1, disk_deque)
        self.assertIn(2, disk_deque)

        # The deque is full (maxsize=2): this third append is expected to
        # push the oldest item (1) out
        disk_deque.append(3)

        self.assertNotIn(1, disk_deque)
        self.assertIn(2, disk_deque)
        self.assertIn(3, disk_deque)
Пример #10
class fingerprint_404(object):
    """
    Read the 404 page(s) returned by the server.

    The fingerprints collected by generate_404_knowledge() are stored in
    ``self._404_responses``, a DiskDeque created with
    ``maxsize=MAX_404_RESPONSES``.

    :author: Andres Riancho ([email protected])
    """

    # NOTE(review): presumably holds a shared instance managed by code
    # outside this class -- confirm against the callers.
    _instance = None

    def __init__(self):
        """
        Create an unconfigured instance.

        The URL opener and the worker pool start as None, they are provided
        later through set_url_opener() and set_worker_pool().
        """
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies.
        self._uri_opener = None
        self._worker_pool = None

        #   Internal variables
        self._already_analyzed = False
        self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
        self._lock = thread.allocate_lock()
        self._directory_uses_404_codes = ScalableBloomFilter()

        # It is OK to keep 250 entries here, I'm only storing path+filename
        # as the key, and bool as the value.
        self.is_404_LRU = SynchronizedLRUDict(250)

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The object used by _send_404() to perform the GET
                          requests.
        """
        self._uri_opener = urlopener

    def set_worker_pool(self, worker_pool):
        """
        :param worker_pool: The pool whose imap_unordered() is used to send
                            the 404 requests.
        """
        self._worker_pool = worker_pool

    def generate_404_knowledge(self, url):
        """
        Based on a URL, request something that we know is going to be a 404.
        Afterwards analyze the 404's and summarise them.

        One random file name (rand_alnum(8) plus an extension) is requested
        in the domain path of ``url`` for each of the common handler
        extensions, plus the extension of ``url`` itself when it has one.
        Responses whose body is fuzzy-equal to an already kept response are
        discarded, the remaining ones are converted with
        FourOhFourResponseFactory and appended to ``self._404_responses``.

        :param url: The URL object to work on, get_extension() and
                    get_domain_path() are called on it.
        :return: None, the result is stored in ``self._404_responses``.
        :raises RuntimeError: When no URL opener was set.
        """
        #    This is the case when nobody has properly configured
        #    the object in order to use it.
        if self._uri_opener is None:
            msg = ('404 fingerprint database was incorrectly initialized.'
                   ' URL opener is None.')
            raise RuntimeError(msg)

        # Get the filename extension and create a 404 for it
        extension = url.get_extension()
        domain_path = url.get_domain_path()

        #   This is a list of the most common handlers, in some configurations,
        #   the 404 depends on the handler, so I want to make sure that I catch
        #   the 404 for each one
        handlers = {'py', 'php', 'asp', 'aspx', 'do', 'jsp', 'rb',
                    'gif', 'htm', 'pl', 'cgi', 'xhtml', 'htmls', 'foobar'}
        if extension:
            handlers.add(extension)

        test_urls = []

        for handler_extension in handlers:
            rand_alnum_file = rand_alnum(8) + '.' + handler_extension
            url404 = domain_path.url_join(rand_alnum_file)
            test_urls.append(url404)

        imap_unordered = self._worker_pool.imap_unordered
        not_exist_resp_lst = list(imap_unordered(self._send_404, test_urls))

        # I have the 404 responses in not_exist_resp_lst, but maybe they
        # all look the same, so I'll filter the ones that look alike.
        # The first one is always kept, since there is nothing to compare it
        # with and that makes it "unique". Each one of the others is kept
        # only when it is not equal to any of the responses kept so far.
        unique_responses = []

        for candidate in not_exist_resp_lst:
            for known in unique_responses:
                if fuzzy_equal(candidate.body, known.body, IS_EQUAL_RATIO):
                    # They are equal, just ignore it
                    break
            else:
                # It is not equal to any of the kept responses, this means
                # that we'll have to add this one to the 404 responses
                unique_responses.append(candidate)
                four_oh_data = FourOhFourResponseFactory(candidate)
                self._404_responses.append(four_oh_data)

        # Log how many responses are stored
        msg_fmt = 'The 404 body result database has a length of %s.'
        om.out.debug(msg_fmt % len(self._404_responses))

    @retry(tries=2, delay=0.5, backoff=2)
    def _send_404(self, url404):
        """
        Sends a GET request to url404.

        :param url404: The URL to request.
        :return: The HTTP response returned by the URL opener.
        :raises FourOhFourDetectionException: When the URL opener raises
                                              HTTPRequestException.
        """
        # I don't use the cache, because the URLs are random and the only thing
        # that cache does is to fill up disk space
        try:
            response = self._uri_opener.GET(url404, cache=False, grep=False)
        except HTTPRequestException as hre:
            message = 'Exception found while detecting 404: "%s"'
            raise FourOhFourDetectionException(message % hre)

        return response
Пример #11
    def test_len(self):
        """
        len() is 0 for a new DiskDeque and grows to 1 after one append.
        """
        disk_deque = DiskDeque(maxsize=2)
        self.assertEqual(len(disk_deque), 0)

        disk_deque.append(1)
        self.assertEqual(len(disk_deque), 1)
Пример #12
class fingerprint_404(object):
    """
    Read the 404 page(s) returned by the server.

    The fingerprints collected by generate_404_knowledge() are stored in
    ``self._404_responses``, a DiskDeque created with
    ``maxsize=MAX_404_RESPONSES``.

    :author: Andres Riancho ([email protected])
    """

    # NOTE(review): presumably holds a shared instance managed by code
    # outside this class -- confirm against the callers.
    _instance = None

    def __init__(self):
        """
        Create an unconfigured instance.

        The URL opener and the worker pool start as None, they are provided
        later through set_url_opener() and set_worker_pool().
        """
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies.
        self._uri_opener = None
        self._worker_pool = None

        #   Internal variables
        self._already_analyzed = False
        self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
        self._lock = thread.allocate_lock()
        self._directory_uses_404_codes = ScalableBloomFilter()

        # It is OK to keep 250 entries here, I'm only storing path+filename
        # as the key, and bool as the value.
        self.is_404_LRU = SynchronizedLRUDict(250)

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The object used by _send_404() to perform the GET
                          requests.
        """
        self._uri_opener = urlopener

    def set_worker_pool(self, worker_pool):
        """
        :param worker_pool: The pool whose imap_unordered() is used to send
                            the 404 requests.
        """
        self._worker_pool = worker_pool

    def generate_404_knowledge(self, url):
        """
        Based on a URL, request something that we know is going to be a 404.
        Afterwards analyze the 404's and summarise them.

        One random file name (rand_alnum(8) plus an extension) is requested
        in the domain path of ``url`` for each of the common handler
        extensions, plus the extension of ``url`` itself when it has one.
        Responses whose body is fuzzy-equal to an already kept response are
        discarded, the remaining ones are converted with
        FourOhFourResponseFactory and appended to ``self._404_responses``.

        :param url: The URL object to work on, get_extension() and
                    get_domain_path() are called on it.
        :return: None, the result is stored in ``self._404_responses``.
        :raises RuntimeError: When no URL opener was set.
        """
        #    This is the case when nobody has properly configured
        #    the object in order to use it.
        if self._uri_opener is None:
            msg = ('404 fingerprint database was incorrectly initialized.'
                   ' URL opener is None.')
            raise RuntimeError(msg)

        # Get the filename extension and create a 404 for it
        extension = url.get_extension()
        domain_path = url.get_domain_path()

        #   This is a list of the most common handlers, in some configurations,
        #   the 404 depends on the handler, so I want to make sure that I catch
        #   the 404 for each one
        handlers = {
            'py', 'php', 'asp', 'aspx', 'do', 'jsp', 'rb', 'gif', 'htm',
            'pl', 'cgi', 'xhtml', 'htmls', 'foobar'
        }
        if extension:
            handlers.add(extension)

        test_urls = []

        for handler_extension in handlers:
            rand_alnum_file = rand_alnum(8) + '.' + handler_extension
            url404 = domain_path.url_join(rand_alnum_file)
            test_urls.append(url404)

        imap_unordered = self._worker_pool.imap_unordered
        not_exist_resp_lst = list(imap_unordered(self._send_404, test_urls))

        # I have the 404 responses in not_exist_resp_lst, but maybe they
        # all look the same, so I'll filter the ones that look alike.
        # The first one is always kept, since there is nothing to compare it
        # with and that makes it "unique". Each one of the others is kept
        # only when it is not equal to any of the responses kept so far.
        unique_responses = []

        for candidate in not_exist_resp_lst:
            for known in unique_responses:
                if fuzzy_equal(candidate.body, known.body, IS_EQUAL_RATIO):
                    # They are equal, just ignore it
                    break
            else:
                # It is not equal to any of the kept responses, this means
                # that we'll have to add this one to the 404 responses
                unique_responses.append(candidate)
                four_oh_data = FourOhFourResponseFactory(candidate)
                self._404_responses.append(four_oh_data)

        # Log how many responses are stored
        msg_fmt = 'The 404 body result database has a length of %s.'
        om.out.debug(msg_fmt % len(self._404_responses))

    @retry(tries=2, delay=0.5, backoff=2)
    def _send_404(self, url404):
        """
        Sends a GET request to url404.

        :param url404: The URL to request.
        :return: The HTTP response returned by the URL opener.
        :raises FourOhFourDetectionException: When the URL opener raises
                                              HTTPRequestException.
        """
        # I don't use the cache, because the URLs are random and the only thing
        # that cache does is to fill up disk space
        try:
            response = self._uri_opener.GET(url404, cache=False, grep=False)
        except HTTPRequestException as hre:
            message = 'Exception found while detecting 404: "%s"'
            raise FourOhFourDetectionException(message % hre)

        return response
Пример #13
    def test_len(self):
        """
        len() is 0 for a new DiskDeque and grows to 1 after one append.
        """
        disk_deque = DiskDeque(maxsize=2)
        self.assertEqual(len(disk_deque), 0)

        disk_deque.append(1)
        self.assertEqual(len(disk_deque), 1)