예제 #1
0
    def crawl(self):
        """Drain the work queue, classifying each URL and following inner links.

        Every queue item is a ``(url, attempt, depth)`` tuple. Outer URLs are
        only counted, subdomain URLs only have their subdomain name recorded,
        and inner URLs are counted, fetched (while ``depth`` is below
        ``self.max_depth``) and parsed for further links, which are queued
        at ``depth + 1``.

        A fetch that yields no content is re-queued with ``attempt + 1``
        until ``attempt`` reaches ``self.max_attempts``; after that the page
        is counted in ``self.error_counter``. A link queued here (starting at
        attempt 0) is therefore fetched at most ``max_attempts + 1`` times.

        A summary of the collected statistics is printed once the queue is
        empty.
        """
        while not self.queue.empty():
            url, attempt, depth = self.queue.get()

            print((url, attempt, depth))

            if self.is_outer_url(url):
                self.outer_link_counter += 1
                continue
            if self.is_subdomain_url(url):
                self.subdomain_set.add(self.get_subdomain_name(url))
                continue

            # Count an inner link only the first time it is dequeued; a retry
            # of the same URL must not inflate the total.
            # NOTE(review): assumes first-time entries are queued with
            # attempt == 0, as this method does for discovered links.
            if attempt == 0:
                self.inner_link_counter += 1
            if depth >= self.max_depth:
                continue

            content = self.get_page(url)
            if not content:
                if attempt >= self.max_attempts:
                    self.error_counter += 1
                else:
                    self.queue.put((url, attempt + 1, depth))
                continue

            for link in HtmlParser(content).get_links():
                if not link:
                    continue
                link = self.make_full_link(link)
                # reached_urls doubles as the visited set, so each distinct
                # link is queued only once.
                if link not in self.reached_urls:
                    self.reached_urls[link] = depth + 1
                    self.queue.put((link, 0, depth + 1))

        print(self.reached_urls)
        print(self.queue.qsize())
        print("Subdomains:", self.subdomain_set)
        print("Inner links count:", self.inner_link_counter)
        print("Outer links count:", self.outer_link_counter)
        print("Unavailable pages count:", self.error_counter)
예제 #2
0
 def test_empty_page(self):
     # An empty document must produce no links and an empty text body.
     page = HtmlParser("")
     found_links = page.get_links()
     self.assertEqual(found_links, [])
     page_text = page.get_text()
     self.assertEqual(page_text, '')
예제 #3
0
 def test_get_several_links(self):
     # Three anchors, the last written with an upper-case tag and attribute;
     # all hrefs are expected back in document order.
     markup = "<div><a href='http://abc.abc/'>text</a></div><a href='http://def.def/'></a><div><A HREF='q.html'>text</a></div>"
     expected_links = ['http://abc.abc/', 'http://def.def/', 'q.html']
     page = HtmlParser(markup)
     self.assertEqual(page.get_links(), expected_links)
예제 #4
0
 def test_one_link(self):
     # A document with a single anchor yields exactly that anchor's href.
     markup = "<a href='http://abc.abc/'>text</a>"
     page = HtmlParser(markup)
     extracted = page.get_links()
     self.assertEqual(extracted, ['http://abc.abc/'])