Example #1
 def discover(self, start_url: str, limit: int) -> List[str]:
     """
     Fetch the url provided and retrieve links, subsequently fetching
     the pages at those links until reaching limit (or running out of links).

     :param start_url: url to start from
     :param limit: maximum number of urls to return in the list
     :return: list of urls discovered, in discovery order, with at most
         ``limit`` entries; empty when ``limit`` is not positive
     """
     # Guard: a non-positive limit means "return no urls".  Without this
     # check the start url itself would be returned even when limit == 0.
     if limit <= 0:
         return []
     frontier = [start_url]
     # Dict rather than set: it doubles as an ordered record of discovery
     # (insertion order is guaranteed), so the return value keeps the
     # order urls were first seen.  len(seen) replaces the old counter.
     seen = {start_url: True}
     while frontier and len(seen) < limit:
         url = frontier.pop()  # LIFO pop: depth-first traversal, as before
         contents = self.content_fetcher.retrieve_page(url)
         for new_url in extract_urls(url, contents):
             if new_url in seen:
                 continue  # already discovered via an earlier page
             if len(seen) == limit:
                 break  # budget exhausted; stop recording links
             frontier.append(new_url)
             seen[new_url] = True
     return list(seen)
Example #2
 def test_extract_links_finds_all_links_in_atags(self):
     """Every href inside an <a> tag in the fixture is returned, in order."""
     found = extract_urls(self.origin_url, self.html_both_links_in_tags)
     expected = ["https://crawler-test.com",
                 "https://crawler-test.com/hello.txt"]
     self.assertEqual(found, expected)
Example #3
 def test_extract_links_finds_absolute_link_in_atag(self):
     """An absolute href inside an <a> tag is returned unchanged."""
     found = extract_urls(self.origin_url, self.html_absolute_link_in_tag)
     self.assertEqual(found, ["https://crawler-test.com"])
Example #4
 def test_extract_links_resolves_relative_link_in_atag(self):
     """A relative href is resolved against the origin url before returning."""
     found = extract_urls(self.origin_url, self.html_relative_link_in_tag)
     self.assertEqual(found, ["https://crawler-test.com/hello.txt"])
Example #5
 def test_extract_links_does_not_find_link_outside_atags(self):
     """A url appearing in plain text (outside any <a> tag) is ignored."""
     found = extract_urls(self.origin_url, self.html_link_in_text)
     self.assertEqual(found, [])
Example #6
 def test_extract_links_finds_no_links_when_not_present(self):
     """Html containing no links at all yields an empty list."""
     found = extract_urls(self.origin_url, self.html_no_link)
     self.assertEqual(found, [])