def test_excludes_non_anchor():
    """Only the real web link survives; fragment and empty hrefs are dropped.

    The fixture holds one absolute URL, one fragment-only link
    (``#about-us``) and one tag whose ``href`` is empty, and the test
    expects ``extract_links`` to return just the absolute URL.
    """
    # NOTE(review): the unterminated ``onclick`` quote in the last tag is
    # kept exactly as found -- presumably deliberate malformed markup; confirm.
    page = """
    <a href="http://example.com/test">click me</a>
    <a href="#about-us">about</a>
    <a onclick="something() href="">check</a>
    """
    urls = extract_links(page)
    assert len(urls) == 1
    # The expected URL must be the one that is actually present in the page.
    assert "http://example.com/test" in urls
def test_extract_urls_from_href():
    """Absolute and host-relative ``href`` values are both extracted."""
    html = """
    <a href="http://example.com/test">click me</a>
    <a href="/about-us">about</a>
    """
    expected = ("http://example.com/test", "/about-us")

    found = extract_links(html)

    # Exactly the expected links, nothing more.
    assert len(found) == len(expected)
    for url in expected:
        assert url in found
def test_excludes_non_web():
    """Links with non-web schemes are ignored; only the http link remains.

    The fixture mixes one ``http`` URL with ``mailto:``, ``skype:``,
    ``tel:`` and ``javascript:`` hrefs, and the test expects
    ``extract_links`` to return just the ``http`` URL.
    """
    # NOTE(review): the ``skype:`` and ``tel:`` attributes are missing their
    # closing quote and the mailto address looks redacted; the fixture is
    # kept exactly as found -- confirm whether that is intentional.
    page = """
    <a href="http://example.com/test">click me</a>
    <a href="mailto:[email protected]">mail us</a>
    <a href="skype:someone>call us</a>
    <a href="tel:+4733378901>call us here to</a>
    <a href="javascript:alert('Hello World!');">click</a>
    """
    urls = extract_links(page)
    assert len(urls) == 1
    # The expected URL must be the one that is actually present in the page.
    assert "http://example.com/test" in urls
def test_extract_urls_from_plain_text():
    """URLs written as plain text are extracted alongside ``href`` values.

    The expected count of four covers the three plain-text URLs and the one
    ``href``; the ``http://`` glued to the end of a word and the bare
    ``amazon.com`` domain are therefore expected not to be reported.
    """
    html = """
    <p>Visit http://example.com/test2 to know more or https://ya.ru/search</p>
    <p>Also http://your.shop/cart for more!</p>
    <p> thisisnothttp://ya.ru/even it look like one</p>
    <p> but amazon.com is shurele a link</a>
    <a href="/about-us">about</a>
    """
    expected = (
        "http://example.com/test2",
        "/about-us",
        "https://ya.ru/search",
        "http://your.shop/cart",
    )

    found = extract_links(html)

    assert len(found) == len(expected)
    for url in expected:
        assert url in found
def _lookup_links(self, page: str):
    """Queue the links found in ``page`` that still have to be visited.

    Every link returned by ``utils.extract_links`` is made absolute when it
    is host-relative (``/path``) or scheme-relative (``//host/path``). It is
    then added to ``self._to_visit`` only if it

    * has the same network location as ``self.start_url``,
    * has a path that starts with the path of ``self.start_url``, and
    * is not already in ``self._visited_urls``.

    Args:
        page: Page content to scan for links.
    """
    origin = self.start_url
    for link in utils.extract_links(page):
        if link.startswith('//'):
            # Scheme-relative reference: it names its own host, so only the
            # scheme is inherited. Prefixing scheme *and* host here would
            # turn the foreign host into a path segment of the start site
            # and let it slip past the netloc check below.
            link = origin.scheme + ":" + link
        elif link.startswith('/'):
            # Host-relative reference: resolve against the start URL's origin.
            link = origin.scheme + "://" + origin.netloc + link

        parsed_link = urlparse(link)

        # Stay on the site the crawl started from ...
        if parsed_link.netloc != origin.netloc:
            continue
        # ... and below the path the crawl started at.
        if not parsed_link.path.startswith(origin.path):
            continue
        if link in self._visited_urls:
            continue

        self._to_visit.add(link)