Пример #1
0
def test_html_page_should_be_able_to_scrap_rel_links():
    """
    Check that ``scraped_rel_links`` yields the URL found in an anchor's href.

    The markup holds a "Home Page:" label followed by a single ``<a>``
    element; exactly one link carrying that element's href is expected.
    """
    markup = """
        <!-- The <th> elements below are a terrible terrible hack for setuptools -->
        <li>
        <strong>Home Page:</strong>
        <!-- <th>Home Page -->
        <a href="http://supervisord.org/">http://supervisord.org/</a>
        </li>"""
    page = HTMLPage(markup, "supervisor")

    scraped = list(page.scraped_rel_links())
    assert len(scraped) == 1
    only_link = scraped[0]
    assert only_link.url == 'http://supervisord.org/'
Пример #2
0
def test_html_page_should_be_able_to_scrap_rel_links():
    """
    Scrape a page fragment and verify the href of its only anchor is found.

    The fragment contains a "Home Page:" label and one ``<a>`` element, so
    ``scraped_rel_links`` is expected to produce a single link whose ``url``
    equals that anchor's href.
    """
    expected_url = 'http://supervisord.org/'
    page_source = """
        <!-- The <th> elements below are a terrible terrible hack for setuptools -->
        <li>
        <strong>Home Page:</strong>
        <!-- <th>Home Page -->
        <a href="http://supervisord.org/">http://supervisord.org/</a>
        </li>"""
    page = HTMLPage(page_source, "supervisor")

    # Materialize the links so both the count and the first item can be checked.
    found = list(page.scraped_rel_links())
    assert len(found) == 1
    assert found[0].url == expected_url
Пример #3
0
def test_html_page_should_be_able_to_filter_links_by_rel():
    """
    Check that ``rel_links`` selects anchors by their ``rel`` attribute.

    The page has three anchors: one without ``rel``, one with
    ``rel="download"`` and one with ``rel="homepage"``. Called without
    arguments, ``rel_links`` is expected to return the two anchors that have
    a ``rel``; called with a one-element tuple, only the matching anchor.
    """
    archive_url = 'http://example.com/archive-1.2.3.tar.gz'
    home_url = 'http://example.com/home.html'
    markup = """
        <a href="http://example.com/page.html">Some page</a>
        <a href="http://example.com/archive-1.2.3.tar.gz" rel="download">Download URL</a>
        <a href="http://example.com/home.html" rel="homepage">Homepage</a>
        """
    page = HTMLPage(markup, "archive")

    # Collect every result up front, in the same order as the checks below.
    all_links = list(page.rel_links())
    all_urls = [link.url for link in all_links]
    homepage_links = list(page.rel_links(('homepage',)))
    download_links = list(page.rel_links(('download',)))

    # Unfiltered call: both rel-carrying anchors, and nothing else.
    assert len(all_links) == 2
    assert archive_url in all_urls
    assert home_url in all_urls

    # Filtering on "homepage" keeps only the homepage anchor.
    assert len(homepage_links) == 1
    assert homepage_links[0].url == home_url

    # Filtering on "download" keeps only the archive anchor.
    assert len(download_links) == 1
    assert download_links[0].url == archive_url
Пример #4
0
def test_base_url(html, url, expected):
    """
    Check that an ``HTMLPage`` built from ``html`` and ``url`` reports
    ``expected`` as its ``base_url``.

    NOTE(review): the three arguments are presumably supplied by a
    parametrize decorator or fixtures that are not visible here -- confirm.
    """
    page = HTMLPage(html, url)
    assert page.base_url == expected