# Example 1
async def test_extract_disconnect_urls():
    """Check that _extract_disconnect_urls() finds every logout-like link on a page.

    The mocked page contains one neutral link plus six links whose path ends in a
    disconnect keyword (logout, logoff, signout, signoff, disconnect, déconnexion);
    all six — and only those six — must be extracted, with relative URLs resolved
    against the page URL.
    """
    # NOTE(review): async test — presumably runs under @pytest.mark.asyncio and
    # @respx.mock decorators defined just above this block; confirm in full file.
    target_url = "http://perdu.com/"
    respx.get(target_url).mock(return_value=httpx.Response(
        200,
        text=
        "<html><head><title>Vous Etes Perdu ?</title></head><body><h1>Perdu sur l'Internet ?</h1> \
            <h2>Pas de panique, on va vous aider</h2> \
            <strong><pre>    * <----- vous &ecirc;tes ici</pre></strong><a href='http://perdu.com/foobar/'></a> \
            <a href='http://perdu.com/foobar/logout'></a> \
            <a href='http://perdu.com/foobar/logoff'></a> \
            <a href='http://perdu.com/foobar/signout'></a> \
            <a href='http://perdu.com/foobar/signoff'></a> \
            <a href='http://perdu.com/foobar/disconnect'></a> \
            <a href='../../foobar/déconnexion'></a> \
            </div></body></html>"))

    crawler = AsyncCrawler(Request(target_url), timeout=1)

    page = await crawler.async_get(Request(target_url))

    disconnect_urls = crawler._extract_disconnect_urls(page)

    # The relative '../../foobar/déconnexion' link must have been resolved
    # to an absolute URL rooted at target_url.
    test_disconnect_urls = [
        "http://perdu.com/foobar/logout", "http://perdu.com/foobar/logoff",
        "http://perdu.com/foobar/signout", "http://perdu.com/foobar/signoff",
        "http://perdu.com/foobar/disconnect",
        "http://perdu.com/foobar/déconnexion"
    ]

    # Same length + full membership == same contents (order-insensitive).
    # `is True` removed: all() already returns a bool, so the identity
    # comparison was redundant (and an is-literal anti-pattern).
    assert len(disconnect_urls) == len(test_disconnect_urls)
    assert all(url in disconnect_urls for url in test_disconnect_urls)
# Example 2
def test_extract_disconnect_urls_no_url():
    """A page whose links contain no logout-like keyword must yield no disconnect URL."""
    target_url = "http://perdu.com/"

    # Mock the target with a page holding only neutral links.
    html_payload = "<html><head><title>Vous Etes Perdu ?</title></head><body><h1>Perdu sur l'Internet ?</h1> \
            <h2>Pas de panique, on va vous aider</h2> \
            <strong><pre>    * <----- vous &ecirc;tes ici</pre></strong><a href='http://perdu.com/foobar/'></a> \
            <a href='http://perdu.com/foobar/foobar'></a></body></html>"
    respx.get(target_url).mock(return_value=httpx.Response(200, text=html_payload))

    crawler = AsyncCrawler(Request(target_url), timeout=1)

    # Fetch the mocked page synchronously and wrap it for extraction.
    response = httpx.get(target_url, follow_redirects=False)
    page = Page(response)

    # No link matches a disconnect keyword, so nothing must be extracted.
    assert len(crawler._extract_disconnect_urls(page)) == 0