def discover(self, start_url: str, limit: int) -> List[str]:
    """
    Fetch the URL provided and retrieve links, subsequently fetching
    the pages at those links until reaching limit (or running out of links).

    :param start_url: URL to start from
    :param limit: maximum number of URLs to return in the list
    :return: list of URLs discovered
    """
    urls = [start_url]
    seen = {start_url: True}
    count = 1
    while len(urls) > 0 and count < limit:
        url = urls.pop()
        contents = self.content_fetcher.retrieve_page(url)
        # Keep only links we have not already queued or visited
        new_urls = filter(lambda x: x not in seen, extract_urls(url, contents))
        for new_url in new_urls:
            if count == limit:
                break
            urls.append(new_url)
            seen[new_url] = True
            count += 1
    return list(seen.keys())
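# discover depends on an extract_urls helper whose expected behaviour the
# tests below pin down: it finds links only inside <a> tags and resolves
# relative hrefs against the page's URL. A minimal sketch of such a helper,
# assuming BeautifulSoup with the "html.parser" backend (the project's
# actual implementation is not shown in this section):
from typing import List
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def extract_urls(origin_url: str, html: str) -> List[str]:
    """Return an absolute URL for every <a href=...> in the given HTML."""
    soup = BeautifulSoup(html, "html.parser")
    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # (e.g. "/hello.txt") against origin_url
    return [urljoin(origin_url, a["href"]) for a in soup.find_all("a", href=True)]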
def test_extract_links_finds_all_links_in_atags(self):
    self.assertEqual(
        extract_urls(self.origin_url, self.html_both_links_in_tags),
        ["https://crawler-test.com", "https://crawler-test.com/hello.txt"])

def test_extract_links_finds_absolute_link_in_atag(self):
    self.assertEqual(
        extract_urls(self.origin_url, self.html_absolute_link_in_tag),
        ["https://crawler-test.com"])

def test_extract_links_resolves_relative_link_in_atag(self):
    self.assertEqual(
        extract_urls(self.origin_url, self.html_relative_link_in_tag),
        ["https://crawler-test.com/hello.txt"])

def test_extract_links_does_not_find_link_outside_atags(self):
    self.assertEqual(extract_urls(self.origin_url, self.html_link_in_text), [])

def test_extract_links_finds_no_links_when_not_present(self):
    self.assertEqual(extract_urls(self.origin_url, self.html_no_link), [])
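# The fixtures referenced above (origin_url and the html_* strings) are
# defined elsewhere in the test class. A plausible setUp, with values
# inferred from the assertions; the exact markup used by the project is
# an assumption:
import unittest


class ExtractUrlsTest(unittest.TestCase):
    def setUp(self):
        self.origin_url = "https://crawler-test.com"
        self.html_absolute_link_in_tag = '<a href="https://crawler-test.com">home</a>'
        self.html_relative_link_in_tag = '<a href="/hello.txt">hello</a>'
        self.html_both_links_in_tags = (
            self.html_absolute_link_in_tag + self.html_relative_link_in_tag
        )
        # A bare URL in text, not wrapped in an <a> tag
        self.html_link_in_text = "<p>https://crawler-test.com</p>"
        self.html_no_link = "<p>nothing to see here</p>"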