Exemplo n.º 1
0
def test_get_hyperlinks():
    """Check hyperlink extraction from a parsed HTML document.

    The fixture holds seven anchors and is parsed against the base URL
    ``http://example.com/page.html``. The test verifies that:

    * ``get_external_hyperlinks()`` yields three entries whose ``href``
      objects expose the expected ``.url`` and whose ``text`` matches;
    * ``get_internal_hyperlinks()`` yields three entries carrying a plain
      ``path`` string plus ``text``;
    * ``get_hyperlinks()`` yields six entries, the third of which has the
      relative ``page3`` link resolved against the base URL.
    """
    markup = """<html><head></head><body>
        before
        <a href="http://example.com/page1">link text</a>
        after

        <a href="/page2">relative2</a>
        <a href="page3?q=1#d">relative3</a>
        <a href="http://other.example.com/page4">absolute4</a>
        <a href="//other.example.com/page5?q=1#d">absolute5</a>
        <a href="https://other.example.com/page6?q=1#d">absolute6</a>
        <a href="javascript:func()">js1</a>

        </body></html>"""
    document = HTMLDocument(markup, url="http://example.com/page.html").parse()

    # Links pointing at another host: (expected URL, expected anchor text).
    expected_external = [
        ("http://other.example.com/page4", "absolute4"),
        ("http://other.example.com/page5?q=1#d", "absolute5"),
        ("https://other.example.com/page6?q=1#d", "absolute6"),
    ]
    external = document.get_external_hyperlinks()
    assert len(external) == 3
    for position, (expected_url, expected_text) in enumerate(expected_external):
        assert external[position]["href"].url == expected_url
        assert external[position]["text"] == expected_text

    # Same-site links are exposed as plain path strings rather than URL
    # objects: (expected path, expected anchor text).
    expected_internal = [
        ("/page1", "link text"),
        ("/page2", "relative2"),
        ("page3?q=1#d", "relative3"),
    ]
    internal = document.get_internal_hyperlinks()
    assert len(internal) == 3
    for position, (expected_path, expected_text) in enumerate(expected_internal):
        assert internal[position]["path"] == expected_path
        assert internal[position]["text"] == expected_text

    # The combined listing: six entries, with the relative "page3" link
    # expected in absolute form.
    everything = document.get_hyperlinks()
    assert len(everything) == 6
    assert everything[2]["href"].url == "http://example.com/page3?q=1#d"