def test_namedtuple(self):
    """
    Store a FourOhFourResponse built from three positional arguments in a
    DiskDeque and verify that every item yielded by iteration still exposes
    the expected ``doc_type`` attribute.
    """
    disk_deque = DiskDeque(maxsize=2)

    response = FourOhFourResponse('body', 'image', '/')
    disk_deque.append(response)

    # Whatever comes back from the deque must keep named attribute access
    for stored in disk_deque:
        self.assertEqual(stored.doc_type, 'image')
def test_namedtuple(self):
    """
    Store a FourOhFourResponse built from two positional arguments in a
    DiskDeque and verify that every item yielded by iteration still exposes
    the expected ``doc_type`` attribute.
    """
    disk_deque = DiskDeque(maxsize=2)

    response = FourOhFourResponse('body', 'image')
    disk_deque.append(response)

    # Whatever comes back from the deque must keep named attribute access
    for stored in disk_deque:
        self.assertEqual(stored.doc_type, 'image')
def test_namedtuple(self):
    """
    Store a FourOhFourResponse built with keyword arguments in a DiskDeque
    and verify that every item yielded by iteration still exposes the
    expected ``content_type`` attribute.
    """
    disk_deque = DiskDeque(maxsize=2)

    response = FourOhFourResponse(clean_body='body',
                                  content_type='image',
                                  url='/')
    disk_deque.append(response)

    # Whatever comes back from the deque must keep named attribute access
    for stored in disk_deque:
        self.assertEqual(stored.content_type, 'image')
def test_iter(self):
    """
    Iterating a DiskDeque must yield the appended items in insertion order.
    """
    disk_deque = DiskDeque(maxsize=2)

    for value in (1, 2):
        disk_deque.append(value)

    # Collect through plain iteration only, that is the protocol under test
    contents = [item for item in disk_deque]

    self.assertEqual(contents, [1, 2])
def test_int(self):
    """
    Membership checks on a DiskDeque holding integers: once a third item is
    appended to a deque created with maxsize=2, the first item is expected
    to be gone while the two most recent ones remain.
    """
    disk_deque = DiskDeque(maxsize=2)

    for value in (1, 2):
        disk_deque.append(value)

    for value in (1, 2):
        self.assertIn(value, disk_deque)

    # One more item than maxsize allows
    disk_deque.append(3)

    self.assertNotIn(1, disk_deque)

    for value in (2, 3):
        self.assertIn(value, disk_deque)
def __init__(self):
    """
    Create an unconfigured instance: the URL opener and the worker pool
    start as None and are expected to be provided later by the owner of
    this object.
    """
    # Collaborators used to send the HTTP requests; not available yet
    self._uri_opener = None
    self._worker_pool = None

    # Internal state
    self._already_analyzed = False
    self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
    self._lock = thread.allocate_lock()
    self._directory_uses_404_codes = ScalableBloomFilter()

    # LRU created with a size of 250. According to the original author's
    # note the entries are small: path+filename as the key and a bool as
    # the value.
    self.is_404_LRU = SynchronizedLRUDict(250)
class fingerprint_404(object):
    """
    Read the 404 page(s) returned by the server.

    :author: Andres Riancho ([email protected])
    """

    _instance = None

    def __init__(self):
        """
        Create an unconfigured instance; set_url_opener() and
        set_worker_pool() must be called before generate_404_knowledge().
        """
        #
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies.
        #
        self._uri_opener = None
        self._worker_pool = None

        #
        #   Internal variables
        #
        self._already_analyzed = False
        self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
        self._lock = thread.allocate_lock()
        self._directory_uses_404_codes = ScalableBloomFilter()

        # LRU created with a size of 250. According to the original author's
        # note the entries are small: path+filename as the key and a bool as
        # the value.
        self.is_404_LRU = SynchronizedLRUDict(250)

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The object used by _send_404() to perform GET
                          requests.
        """
        self._uri_opener = urlopener

    def set_worker_pool(self, worker_pool):
        """
        :param worker_pool: The pool whose imap_unordered() is used to send
                            the 404 probes.
        """
        self._worker_pool = worker_pool

    def generate_404_knowledge(self, url):
        """
        Based on a URL, request something that we know is going to be a 404.
        Afterwards analyze the 404's and summarise them.

        One random, non-existent filename is requested for each well known
        handler extension (plus the extension of `url`, if any). Out of all
        the responses only one representative per group of look-alike
        bodies is stored in self._404_responses.

        :param url: The URL used to extract the domain path and extension.
        :return: None, the knowledge is stored in self._404_responses.
        :raise RuntimeError: When the URL opener or the worker pool were
                             not configured.
        """
        #
        #    This is the case when nobody has properly configured
        #    the object in order to use it.
        #
        if self._uri_opener is None:
            msg = ('404 fingerprint database was incorrectly initialized.'
                   ' URL opener is None.')
            raise RuntimeError(msg)

        if self._worker_pool is None:
            msg = ('404 fingerprint database was incorrectly initialized.'
                   ' Worker pool is None.')
            raise RuntimeError(msg)

        # Get the filename extension and create a 404 for it
        extension = url.get_extension()
        domain_path = url.get_domain_path()

        #
        #   This is a list of the most common handlers, in some
        #   configurations, the 404 depends on the handler, so I want to
        #   make sure that I catch the 404 for each one
        #
        handlers = {'py', 'php', 'asp', 'aspx', 'do', 'jsp', 'rb', 'gif',
                    'htm', 'pl', 'cgi', 'xhtml', 'htmls', 'foobar'}
        if extension:
            handlers.add(extension)

        test_urls = []

        for handler_extension in handlers:
            rand_alnum_file = rand_alnum(8) + '.' + handler_extension
            test_urls.append(domain_path.url_join(rand_alnum_file))

        imap_unordered = self._worker_pool.imap_unordered
        not_exist_resp_lst = list(imap_unordered(self._send_404, test_urls))

        #
        #   I have the 404 responses in not_exist_resp_lst, but maybe they
        #   all look the same, so I'll filter the ones that look alike.
        #
        #   A response is stored only when its body is not fuzzy-equal to
        #   the body of any response which was already kept. The first
        #   response is always kept, since there is nothing to compare it
        #   with. Each distinct body is stored exactly once, so duplicates
        #   never push real fingerprints out of the bounded deque.
        #
        unique_responses = []

        for http_response in not_exist_resp_lst:
            for kept_response in unique_responses:
                if fuzzy_equal(http_response.body, kept_response.body,
                               IS_EQUAL_RATIO):
                    # Looks like one we already have, just ignore it
                    break
            else:
                unique_responses.append(http_response)

                four_oh_data = FourOhFourResponseFactory(http_response)
                self._404_responses.append(four_oh_data)

        msg_fmt = 'The 404 body result database has a length of %s.'
        om.out.debug(msg_fmt % len(self._404_responses))

    @retry(tries=2, delay=0.5, backoff=2)
    def _send_404(self, url404):
        """
        Sends a GET request to url404.

        :param url404: The URL to request.
        :return: The HTTP response object returned by the URL opener.
        :raise FourOhFourDetectionException: When the URL opener raises
                                             HTTPRequestException.
        """
        # I don't use the cache, because the URLs are random and the only
        # thing that cache does is to fill up disk space
        try:
            response = self._uri_opener.GET(url404, cache=False, grep=False)
        except HTTPRequestException as hre:
            message = 'Exception found while detecting 404: "%s"'
            raise FourOhFourDetectionException(message % hre)

        return response
def test_len(self):
    """
    len() of a DiskDeque is zero right after creation and one after a
    single append.
    """
    queue = DiskDeque(maxsize=2)
    self.assertEqual(len(queue), 0)

    queue.append(5)
    self.assertEqual(len(queue), 1)
class fingerprint_404(object):
    """
    Read the 404 page(s) returned by the server.

    :author: Andres Riancho ([email protected])
    """

    _instance = None

    def __init__(self):
        """
        Create an unconfigured instance; set_url_opener() and
        set_worker_pool() must be called before generate_404_knowledge().
        """
        #
        #   Set the opener, I need it to perform some tests and gain
        #   the knowledge about the server's 404 response bodies.
        #
        self._uri_opener = None
        self._worker_pool = None

        #
        #   Internal variables
        #
        self._already_analyzed = False
        self._404_responses = DiskDeque(maxsize=MAX_404_RESPONSES)
        self._lock = thread.allocate_lock()
        self._directory_uses_404_codes = ScalableBloomFilter()

        # LRU created with a size of 250. According to the original author's
        # note the entries are small: path+filename as the key and a bool as
        # the value.
        self.is_404_LRU = SynchronizedLRUDict(250)

    def set_url_opener(self, urlopener):
        """
        :param urlopener: The object used by _send_404() to perform GET
                          requests.
        """
        self._uri_opener = urlopener

    def set_worker_pool(self, worker_pool):
        """
        :param worker_pool: The pool whose imap_unordered() is used to send
                            the 404 probes.
        """
        self._worker_pool = worker_pool

    def generate_404_knowledge(self, url):
        """
        Based on a URL, request something that we know is going to be a 404.
        Afterwards analyze the 404's and summarise them.

        One random, non-existent filename is requested for each well known
        handler extension (plus the extension of `url`, if any). Out of all
        the responses only one representative per group of look-alike
        bodies is stored in self._404_responses.

        :param url: The URL used to extract the domain path and extension.
        :return: None, the knowledge is stored in self._404_responses.
        :raise RuntimeError: When the URL opener or the worker pool were
                             not configured.
        """
        #
        #    This is the case when nobody has properly configured
        #    the object in order to use it.
        #
        if self._uri_opener is None:
            msg = ('404 fingerprint database was incorrectly initialized.'
                   ' URL opener is None.')
            raise RuntimeError(msg)

        if self._worker_pool is None:
            msg = ('404 fingerprint database was incorrectly initialized.'
                   ' Worker pool is None.')
            raise RuntimeError(msg)

        # Get the filename extension and create a 404 for it
        extension = url.get_extension()
        domain_path = url.get_domain_path()

        #
        #   This is a list of the most common handlers, in some
        #   configurations, the 404 depends on the handler, so I want to
        #   make sure that I catch the 404 for each one
        #
        handlers = {
            'py', 'php', 'asp', 'aspx', 'do', 'jsp', 'rb', 'gif',
            'htm', 'pl', 'cgi', 'xhtml', 'htmls', 'foobar'
        }
        if extension:
            handlers.add(extension)

        test_urls = []

        for handler_extension in handlers:
            rand_alnum_file = rand_alnum(8) + '.' + handler_extension
            test_urls.append(domain_path.url_join(rand_alnum_file))

        imap_unordered = self._worker_pool.imap_unordered
        not_exist_resp_lst = list(imap_unordered(self._send_404, test_urls))

        #
        #   I have the 404 responses in not_exist_resp_lst, but maybe they
        #   all look the same, so I'll filter the ones that look alike.
        #
        #   A response is stored only when its body is not fuzzy-equal to
        #   the body of any response which was already kept. The first
        #   response is always kept, since there is nothing to compare it
        #   with. Each distinct body is stored exactly once, so duplicates
        #   never push real fingerprints out of the bounded deque.
        #
        unique_responses = []

        for http_response in not_exist_resp_lst:
            for kept_response in unique_responses:
                if fuzzy_equal(http_response.body, kept_response.body,
                               IS_EQUAL_RATIO):
                    # Looks like one we already have, just ignore it
                    break
            else:
                unique_responses.append(http_response)

                four_oh_data = FourOhFourResponseFactory(http_response)
                self._404_responses.append(four_oh_data)

        msg_fmt = 'The 404 body result database has a length of %s.'
        om.out.debug(msg_fmt % len(self._404_responses))

    @retry(tries=2, delay=0.5, backoff=2)
    def _send_404(self, url404):
        """
        Sends a GET request to url404.

        :param url404: The URL to request.
        :return: The HTTP response object returned by the URL opener.
        :raise FourOhFourDetectionException: When the URL opener raises
                                             HTTPRequestException.
        """
        # I don't use the cache, because the URLs are random and the only
        # thing that cache does is to fill up disk space
        try:
            response = self._uri_opener.GET(url404, cache=False, grep=False)
        except HTTPRequestException as hre:
            message = 'Exception found while detecting 404: "%s"'
            raise FourOhFourDetectionException(message % hre)

        return response