async def test_extract_disconnect_urls():
    """Crawling a mocked page must yield every link whose path looks like a
    logout / sign-off action, including the relative non-ASCII one
    ('déconnexion') resolved against the page URL, while ignoring normal links.
    """
    target_url = "http://perdu.com/"
    # Mocked page body: one regular link plus six disconnect-style links.
    respx.get(target_url).mock(return_value=httpx.Response(
        200,
        text="<html><head><title>Vous Etes Perdu ?</title></head><body><h1>Perdu sur l'Internet ?</h1> \
            <h2>Pas de panique, on va vous aider</h2> \
            <strong><pre>    * <----- vous êtes ici</pre></strong><a href='http://perdu.com/foobar/'></a> \
            <a href='http://perdu.com/foobar/logout'></a> \
            <a href='http://perdu.com/foobar/logoff'></a> \
            <a href='http://perdu.com/foobar/signout'></a> \
            <a href='http://perdu.com/foobar/signoff'></a> \
            <a href='http://perdu.com/foobar/disconnect'></a> \
            <a href='../../foobar/déconnexion'></a> \
            </div></body></html>"))

    crawler = AsyncCrawler(Request(target_url), timeout=1)
    try:
        page = await crawler.async_get(Request(target_url))
        disconnect_urls = crawler._extract_disconnect_urls(page)
    finally:
        # The crawler owns an httpx client session: always release it,
        # even if the crawl or extraction raises.
        await crawler.close()

    expected_disconnect_urls = [
        "http://perdu.com/foobar/logout",
        "http://perdu.com/foobar/logoff",
        "http://perdu.com/foobar/signout",
        "http://perdu.com/foobar/signoff",
        "http://perdu.com/foobar/disconnect",
        "http://perdu.com/foobar/déconnexion",
    ]
    # Exact same set of URLs: same cardinality and every expected URL present.
    assert len(disconnect_urls) == len(expected_disconnect_urls)
    assert all(url in disconnect_urls for url in expected_disconnect_urls)
def test_extract_disconnect_urls_no_url():
    """A page whose links contain no logout-like keyword must produce an
    empty disconnect-URL list."""
    target_url = "http://perdu.com/"
    # Mocked page body: only ordinary links, nothing resembling a logout action.
    respx.get(target_url).mock(return_value=httpx.Response(
        200,
        text="<html><head><title>Vous Etes Perdu ?</title></head><body><h1>Perdu sur l'Internet ?</h1> \
            <h2>Pas de panique, on va vous aider</h2> \
            <strong><pre>    * <----- vous êtes ici</pre></strong><a href='http://perdu.com/foobar/'></a> \
            <a href='http://perdu.com/foobar/foobar'></a></body></html>"))

    response = httpx.get(target_url, follow_redirects=False)
    page = Page(response)
    crawler = AsyncCrawler(Request(target_url), timeout=1)

    # No link matches a disconnect keyword, so nothing should be extracted.
    assert len(crawler._extract_disconnect_urls(page)) == 0