Exemplo n.º 1
0
    def _inner_loop(self, url):
        """Visit a single url and file the resulting page away.

        Note:
            This runs inside the threading manager, once the url has been
            retrieved from the queue.

        Args:
            url (str): the url being requested.
        """
        self.visited_urls.add(url)

        # Over the request limit: tell the manager to stop all workers.
        if len(self.visited_urls) > self.req_limit:
            self.manager.stop_all_workers()

        # Emit a progress line whenever the visit count is a multiple of the
        # logging frequency.
        if len(self.visited_urls) % self.log_frequency == 0:
            progress = "Visited pages: {}; Targets found {}; Running visits {}"
            self.logger.debug(progress.format(
                len(self.visited_urls),
                len(self.target_pages),
                len(self.unvisited),
            ))

        try:
            page = wp.WebPage(url, timeout=self._timeout)
            is_target = self.identify_target(page)
            bucket = self.target_pages if is_target else self.other_pages
            bucket[page.url] = page
            self.inner_urls |= set(page.child_urls)
            if not self.greedy:
                # Free the page's memory now that it has been parsed.
                page.free()
        except (urllib.error.HTTPError, wp.TimeoutException) as exc:
            # HTTPError subclasses URLError, so this clause must come first.
            self._on_timeout_exception(url, exc)
        except urllib.error.URLError:
            self.invalid_urls.add(url)
Exemplo n.º 2
0
    def _buildPage(self, title, tabs, content):
        """Assemble a complete wiki page from its title, tabs and body.

        The page HTML is produced by filling a ``%``-style template with the
        wiki name, title, rendered tabs, content, copyright and menu. The
        template is read from the configured template path when one is set
        and names an existing file; otherwise ``DEFAULT_TEMPLATE`` is used.

        Args:
            title: page title; converted with ``unicode()``. An empty title
                is displayed as ``(Home)``.
            tabs: passed to ``self.buildTabs`` to render the tab markup.
            content: inserted as-is into the template's ``content`` slot.

        Returns:
            A ``webpage.WebPage`` built from the header and the HTML content.
        """
        title = unicode(title)
        if title == '':
            title = '(Home)'
        web_header = webpage.WebPageHeader(self.__wiki_name + ' >>> ' + title)
        web_header.append(
            '<link rel="stylesheet" type="text/css" href="/?getcss" />')

        tabs_content = self.buildTabs(tabs)
        menu_content = self.buildMenu()

        template = self.DEFAULT_TEMPLATE
        if self.__template is not None and os.path.isfile(self.__template):
            # The with-block closes the file even if read() raises.
            with open(self.__template, 'r') as template_file:
                template = template_file.read()

        html_content = template % {
            'wiki_name': self.__wiki_name,
            'title': title,
            'tabs': tabs_content,
            'content': content,
            'copy': self.__copyright,
            'menu': menu_content
        }

        web_content = webpage.WebPageContent(html_content)
        return webpage.WebPage(web_header, web_content)
Exemplo n.º 3
0
    def test_invalid_target_page(self):
        """Checks that a non-target url produces an invalid target page."""
        target_url = helpers.get_url(self.domain, "/ganhe-brindes")
        fetched = wp.WebPage(target_url)

        # Title and domain must be truthy ...
        self.assertTrue(fetched.title)
        self.assertTrue(fetched.domain)
        # ... while the page must not be flagged as a valid target.
        self.assertFalse(fetched.valid_target)
        self.assertEqual(fetched.target_name, fetched._INVALID_TARGET)
Exemplo n.º 4
0
    def test_valid_target_page(self):
        """Checks that a valid target url produces a full target page."""
        target_url = helpers.get_url(
            self.domain,
            "/hypnose-eau-de-toilette-lancome-perfume-feminino/p",
        )
        fetched = wp.WebPage(target_url)

        # Title and domain must be truthy and the page flagged as a target.
        self.assertTrue(fetched.title)
        self.assertTrue(fetched.domain)
        self.assertTrue(fetched.valid_target)
        # The page's domain must match the domain under test.
        self.assertEqual(fetched.domain, self.domain)
Exemplo n.º 5
0
    def __init__(
            self, domain, req_limit=1e4, greedy=False,
            indentify_target=lambda page: True):
        """Set up the crawl state and parameters.

        Args:
            domain: passed to ``wp.WebPage`` to build the root page.
            req_limit: request limit stored as ``self.req_limit``; the
                logging frequency is derived as one tenth of this value.
            greedy: stored as ``self.greedy``.
            indentify_target: callable taking a page, stored as
                ``self.identify_target``; the default accepts every page.
                The parameter's spelling is kept as-is so existing keyword
                callers keep working.
        """

        # Crawl state
        self.root_page = wp.WebPage(domain)
        self.target_pages = {}
        self.other_pages = {}
        self.visited_urls = set()
        self.invalid_urls = set()
        self.inner_urls = set()
        self.unvisited = set()
        self.manager = None

        # Crawl parameters
        self.req_limit = req_limit
        self.greedy = greedy
        self.identify_target = indentify_target
        # A zero frequency would raise ZeroDivisionError when used as a
        # modulo divisor, so fall back to 1 when req_limit / 10 is zero.
        self.log_frequency = req_limit / 10 or 1
        self.logger = helpers.get_debug_logger("mylogger")
Exemplo n.º 6
0
 def test_invalid_attribute(self):
     """Checks that building a WebPage from the integer 1 raises AttributeError."""
     self.assertRaises(AttributeError, wp.WebPage, 1)
Exemplo n.º 7
0
 def test_invalid_url(self):
     """Checks that building a WebPage from an invalid url raises URLError."""
     self.assertRaises(urllib.error.URLError, wp.WebPage, "invalid")
Exemplo n.º 8
0
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent, then load the domain page."""
        super().__init__(*args, **kwargs)

        self.domain = "www.epocacosmeticos.com.br"
        # Build the page for the domain once and keep its child urls at hand.
        landing_page = wp.WebPage(self.domain)
        self.domain_page = landing_page
        self.domain_urls = landing_page.child_urls
Exemplo n.º 9
0
 def setUp(self):
     """Create the WebPage under test from a marketplace search url."""
     site_root = 'https://www.unrealengine.com'
     search_path = '/marketplace/assets?lang=&q=test'
     self.wp = webpage.WebPage(site_root + search_path, site_root)
Exemplo n.º 10
0
 def buildErrorPage(self, title, message):
     """Build a page that shows an error message as preformatted text.

     Args:
         title: passed to ``webpage.WebPageHeader`` as the page title.
         message: value rendered with ``%s`` inside a ``<pre>`` element.

     Returns:
         A ``webpage.WebPage`` made from the header and the message content.
     """
     header = webpage.WebPageHeader(title)
     # Wrap message in a 1-tuple: a bare tuple on the right of % would be
     # unpacked as the format arguments instead of being rendered whole.
     # NOTE(review): message is inserted unescaped; confirm callers never
     # pass untrusted text here.
     content = webpage.WebPageContent("<pre>%s</pre>" % (message,))
     return webpage.WebPage(header, content)