import pytest

# Names used by these tests. The exception classes are assumed to be
# importable from the package root (adjust to deadlinks.exceptions if your
# layout differs); Page is a local test helper that serves canned pages.
from deadlinks import (Crawler, Settings, DeadlinksIgnoredURL, DeadlinksSettingsBase)
from .helpers import Page


def test_mailto(server):
    """ mailto links are ignored, not reported as failed. """
    MAILTO = "mailto:[email protected]"
    CONTENT = """ <a href="{}">mail link</a>""".format(MAILTO)

    address = server.router({
        '^/$': Page(CONTENT).exists(),
    })

    c = Crawler(Settings(address, check_external_urls=True))
    c.start()

    assert len(c.ignored) == 1
    assert MAILTO in c.ignored
    assert len(c.failed) == 0
    assert len(c.index) == 2

def test_double_start(simple_site):
    c = Crawler(Settings(simple_site, threads=10))
    c.start()
    # A repeated start() should be a no-op and return immediately,
    # not crawl the site a second time.
    c.start()

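# A hedged sketch (not in the original suite) of how the "no-op second
# start" behaviour could be asserted directly: time both calls and expect
# the repeat to be cheaper. Timing-based assertions can be flaky, so this
# is an illustration of the idea rather than a definitive test.
def test_double_start_is_fast(simple_site):
    from time import monotonic

    c = Crawler(Settings(simple_site, threads=10))

    begin = monotonic()
    c.start()
    first = monotonic() - begin

    begin = monotonic()
    c.start()
    second = monotonic() - begin

    # The second run performs no crawling, so it should not cost more
    # than the first one did.
    assert second <= first
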
def test_within_site_root(server):
    """ URLs without a trailing slash still count as staying within the
    crawl path and must not be ignored (stay_within_path=True). """
    CONTENT = """
        <a href="http://{0}:{1}">link</a>
        <a href="http://{0}:{1}/">link</a>
    """.format(*server.sa)
    # Rewrite both links to point at /docs/ and /docs (no trailing slash).
    CONTENT_DOCS = CONTENT.replace('">', '/docs/">').replace('//docs/', '/docs')

    address = server.router({
        '^/$': Page(CONTENT).exists(),
        '^/docs/?$': Page(CONTENT_DOCS).exists(),
    })

    for base in {address.rstrip("/") + "/", address.rstrip("/") + "/docs/"}:
        settings = Settings(base, stay_within_path=True)
        c = Crawler(settings)
        c.start()
        assert len(c.ignored) == 0

def test_no_index_page(server):
    """ An empty index page means no other routes are ever discovered. """
    from random import sample

    pages = list(range(1, 51))
    format_link = lambda x: "<a href='/link-%s'>link</a>" % x

    # Root page is empty; 50 interlinked pages exist but are unreachable.
    routes = {
        '^/$': Page("").exists(),
    }
    for step in pages:
        route_key = '^/link-%s/$' % step
        route_contents = Page(" / ".join(map(format_link, sample(pages, 4)))).exists()
        routes.update({route_key: route_contents})

    address = server.router(routes)

    settings = Settings(address, threads=10)
    c = Crawler(settings)
    c.start()

    assert len(c.index) == 1

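# A hedged counterpart sketch (not in the original suite): if the index
# page does link into the graph, the crawler should discover more than the
# root alone. The expected count assumes the index includes the root plus
# each discovered URL, as test_mailto above suggests.
def test_linked_index_page(server):
    address = server.router({
        '^/$': Page("<a href='/link-1/'>link</a>").exists(),
        '^/link-1/$': Page("").exists(),
    })

    c = Crawler(Settings(address, threads=10))
    c.start()

    # Root plus the one discovered page.
    assert len(c.index) == 2
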
def test_base_url_ignored(server):
    """ Crawling fails when the starting URL belongs to an ignored
    domain (ip:port pair). """
    address = server.router({'^/$': Page('ok').exists()})

    with pytest.raises(DeadlinksIgnoredURL):
        Crawler(Settings(address, ignore_domains=[address.split("//")[1]]))

def test_base_url_badurl(url):
    """ A malformed starting URL is rejected by Settings. """
    with pytest.raises(DeadlinksSettingsBase):
        Crawler(Settings(url))

def test_gobyexample():
    """ Special case: AWS serves a substitute robots.txt for this site. """
    with pytest.raises(DeadlinksIgnoredURL):
        c = Crawler(Settings("https://gobyexample.com"))
        c.start()