import pytest

# Names used by these tests. The exception classes are assumed to be
# importable from the package root (adjust to deadlinks.exceptions if your
# layout differs); Page is a local test helper that serves canned pages.
from deadlinks import (Crawler, Settings, DeadlinksIgnoredURL, DeadlinksSettingsBase)
from .helpers import Page


def test_mailto(server):
    """ mailto links are ignored, not reported as failed. """
    MAILTO = "mailto:[email protected]"
    CONTENT = """ <a href="{}">mail link</a>""".format(MAILTO)

    address = server.router({
        '^/$': Page(CONTENT).exists(),
    })

    c = Crawler(Settings(address, check_external_urls=True))
    c.start()

    assert len(c.ignored) == 1
    assert MAILTO in c.ignored
    assert len(c.failed) == 0
    assert len(c.index) == 2

def test_double_start(simple_site):
    c = Crawler(Settings(simple_site, threads=10))
    c.start()
    # A repeated start() should be a no-op and return immediately,
    # not crawl the site a second time.
    c.start()

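# A hedged sketch (not in the original suite) of how the "no-op second
# start" behaviour could be asserted directly: time both calls and expect
# the repeat to be cheaper. Timing-based assertions can be flaky, so this
# is an illustration of the idea rather than a definitive test.
def test_double_start_is_fast(simple_site):
    from time import monotonic

    c = Crawler(Settings(simple_site, threads=10))

    begin = monotonic()
    c.start()
    first = monotonic() - begin

    begin = monotonic()
    c.start()
    second = monotonic() - begin

    # The second run performs no crawling, so it should not cost more
    # than the first one did.
    assert second <= first
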
def test_within_site_root(server):
    """ URLs without a trailing slash still count as staying within the
    crawl path and must not be ignored (stay_within_path=True). """
    CONTENT = """
        <a href="http://{0}:{1}">link</a>
        <a href="http://{0}:{1}/">link</a>
    """.format(*server.sa)
    # Rewrite both links to point at /docs/ and /docs (no trailing slash).
    CONTENT_DOCS = CONTENT.replace('">', '/docs/">').replace('//docs/', '/docs')

    address = server.router({
        '^/$': Page(CONTENT).exists(),
        '^/docs/?$': Page(CONTENT_DOCS).exists(),
    })

    for base in {address.rstrip("/") + "/", address.rstrip("/") + "/docs/"}:
        settings = Settings(base, stay_within_path=True)
        c = Crawler(settings)
        c.start()
        assert len(c.ignored) == 0

def test_no_index_page(server):
    """ An empty index page means no other routes are ever discovered. """
    from random import sample

    pages = list(range(1, 51))
    format_link = lambda x: "<a href='/link-%s'>link</a>" % x

    # Root page is empty; 50 interlinked pages exist but are unreachable.
    routes = {
        '^/$': Page("").exists(),
    }
    for step in pages:
        route_key = '^/link-%s/$' % step
        route_contents = Page(" / ".join(map(format_link, sample(pages, 4)))).exists()
        routes.update({route_key: route_contents})

    address = server.router(routes)

    settings = Settings(address, threads=10)
    c = Crawler(settings)
    c.start()

    assert len(c.index) == 1

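# A hedged counterpart sketch (not in the original suite): if the index
# page does link into the graph, the crawler should discover more than the
# root alone. The expected count assumes the index includes the root plus
# each discovered URL, as test_mailto above suggests.
def test_linked_index_page(server):
    address = server.router({
        '^/$': Page("<a href='/link-1/'>link</a>").exists(),
        '^/link-1/$': Page("").exists(),
    })

    c = Crawler(Settings(address, threads=10))
    c.start()

    # Root plus the one discovered page.
    assert len(c.index) == 2
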
def test_base_url_ignored(server):
    """ Crawling fails when the starting URL belongs to an ignored
    domain (ip:port pair). """
    address = server.router({'^/$': Page('ok').exists()})

    with pytest.raises(DeadlinksIgnoredURL):
        Crawler(Settings(address, ignore_domains=[address.split("//")[1]]))

def test_base_url_badurl(url):
    """ A malformed starting URL is rejected by Settings. """
    with pytest.raises(DeadlinksSettingsBase):
        Crawler(Settings(url))

def test_gobyexample():
    """ Special case: AWS serves a substitute robots.txt for this site. """
    with pytest.raises(DeadlinksIgnoredURL):
        c = Crawler(Settings("https://gobyexample.com"))
        c.start()