def _inner_loop(self, url):
    """Process one url: record the visit, fetch the page and classify it.

    Note:
        This runs inside the threading manager, once the url has been
        retrieved from the queue.

    Args:
        url (str): the url being requested.
    """
    self.visited_urls.add(url)

    # Ask the manager to stop every worker once the request budget is
    # exceeded; the current url is still processed below.
    if len(self.visited_urls) > self.req_limit:
        self.manager.stop_all_workers()

    # Periodic progress report, emitted every `log_frequency` visits.
    if len(self.visited_urls) % self.log_frequency == 0:
        progress_template = (
            "Visited pages: {}; Targets found {}; Running visits {}")
        msg = progress_template.format(
            len(self.visited_urls),
            len(self.target_pages),
            len(self.unvisited))
        self.logger.debug(msg)

    try:
        page = wp.WebPage(url, timeout=self._timeout)

        # File the page under targets or others, keyed by its own url.
        is_target = self.identify_target(page)
        bucket = self.target_pages if is_target else self.other_pages
        bucket[page.url] = page

        # Collect the links found on the page.
        self.inner_urls |= set(page.child_urls)

        if not self.greedy:
            # Non-greedy runs free the page's memory after parsing.
            page.free()
    except (urllib.error.HTTPError, wp.TimeoutException) as error:
        # HTTPError is matched here, ahead of the broader URLError below.
        self._on_timeout_exception(url, error)
    except urllib.error.URLError:
        self.invalid_urls.add(url)
def _buildPage(self, title, tabs, content):
    """Assemble a complete wiki page from a title, tabs and body content.

    The title is coerced with ``unicode``; an empty title is shown as
    ``'(Home)'``. The page HTML comes from ``DEFAULT_TEMPLATE`` unless a
    template path is configured and points at an existing file, in which
    case that file's contents are used instead.

    Args:
        title: page title (converted with ``unicode``).
        tabs: value handed to ``self.buildTabs``.
        content: value substituted for the template's ``content`` key.

    Returns:
        A ``webpage.WebPage`` built from the header and rendered content.
    """
    page_title = unicode(title)
    if page_title == '':
        page_title = '(Home)'

    # Header: "<wiki name> >>> <title>" plus the stylesheet link.
    header = webpage.WebPageHeader(self.__wiki_name + ' >>> ' + page_title)
    header.append(
        '<link rel="stylesheet" type="text/css" href="/?getcss" />')

    rendered_tabs = self.buildTabs(tabs)
    rendered_menu = self.buildMenu()

    # Start from the default template; override it only when a template
    # path is configured and the file exists.
    template = self.DEFAULT_TEMPLATE
    if self.__template is not None and os.path.isfile(self.__template):
        with open(self.__template, 'r') as template_file:
            template = template_file.read()

    substitutions = {
        'wiki_name': self.__wiki_name,
        'title': page_title,
        'tabs': rendered_tabs,
        'content': content,
        'copy': self.__copyright,
        'menu': rendered_menu
    }
    body = webpage.WebPageContent(template % substitutions)
    return webpage.WebPage(header, body)
def test_invalid_target_page(self):
    """Check that the "/ganhe-brindes" route is not reported as a target.

    The page must still expose a title and a domain, while
    ``valid_target`` is falsy and ``target_name`` equals the page's
    ``_INVALID_TARGET`` marker.
    """
    page = wp.WebPage(helpers.get_url(self.domain, "/ganhe-brindes"))

    # Generic page attributes are populated...
    self.assertTrue(page.title)
    self.assertTrue(page.domain)

    # ...but the page is flagged as an invalid target.
    self.assertFalse(page.valid_target)
    self.assertEqual(page.target_name, page._INVALID_TARGET)
def test_valid_target_page(self):
    """Check that a target route produces a page flagged as a valid target.

    The page must expose a title, a domain equal to ``self.domain`` and a
    truthy ``valid_target``.
    """
    target_route = "/hypnose-eau-de-toilette-lancome-perfume-feminino/p"
    page = wp.WebPage(helpers.get_url(self.domain, target_route))

    self.assertTrue(page.title)
    self.assertTrue(page.domain)
    self.assertTrue(page.valid_target)

    # The page's domain must match the domain the url was built from.
    self.assertEqual(page.domain, self.domain)
def __init__(
        self, domain, req_limit=1e4, greedy=False,
        indentify_target=lambda page: True):
    """Set up the crawl state and parameters.

    Args:
        domain: value passed to ``wp.WebPage`` to build the root page.
        req_limit: request limit; also determines the logging frequency
            (``req_limit / 10``).
        greedy: stored as ``self.greedy``.
        indentify_target: callable taking a page, stored as
            ``self.identify_target``; the default accepts every page.
            (The parameter name keeps its original spelling.)
    """
    # The root page is created first, directly from the given domain.
    self.root_page = wp.WebPage(domain)

    # Page stores, keyed containers for targets and everything else.
    self.target_pages, self.other_pages = {}, {}

    # Url bookkeeping sets.
    self.visited_urls = set()
    self.invalid_urls = set()
    self.inner_urls = set()
    self.unvisited = set()

    # No manager is attached at construction time.
    self.manager = None

    # Crawl parameters
    self.req_limit = req_limit
    self.greedy = greedy
    self.identify_target = indentify_target
    self.log_frequency = req_limit / 10

    self.logger = helpers.get_debug_logger("mylogger")
def test_invalid_attribute(self):
    """Check that building a WebPage from the integer 1 raises AttributeError."""
    not_a_url = 1
    with self.assertRaises(AttributeError):
        wp.WebPage(not_a_url)
def test_invalid_url(self):
    """Check that building a WebPage from the string "invalid" raises URLError."""
    expected_error = urllib.error.URLError
    with self.assertRaises(expected_error):
        wp.WebPage("invalid")
def __init__(self, *args, **kwargs):
    """Initialise the base class, then load the domain page and its child urls.

    All positional and keyword arguments are forwarded unchanged to the
    parent initialiser.
    """
    super().__init__(*args, **kwargs)

    site = "www.epocacosmeticos.com.br"
    self.domain = site

    # Build the page for the domain once and keep its child urls handy.
    landing_page = wp.WebPage(site)
    self.domain_page = landing_page
    self.domain_urls = landing_page.child_urls
def setUp(self):
    """Create the ``self.wp`` WebPage fixture from a query url and its base url."""
    site_root = 'https://www.unrealengine.com'
    search_path = '/marketplace/assets?lang=&q=test'
    self.wp = webpage.WebPage(site_root + search_path, site_root)
def buildErrorPage(self, title, message):
    """Build a page whose body is ``message`` wrapped in a ``<pre>`` element.

    Args:
        title: passed to ``webpage.WebPageHeader``.
        message: interpolated as-is (no escaping) into the ``<pre>`` block.

    Returns:
        A ``webpage.WebPage`` made of the header and the formatted content.
    """
    page_header = webpage.WebPageHeader(title)
    preformatted = "<pre>%s</pre>" % message
    page_body = webpage.WebPageContent(preformatted)
    return webpage.WebPage(page_header, page_body)