def test_crawl_all(monkeypatch):
    path = tutorial_mysite.__path__[0]
    with monkeypatch.context() as patch:
        patch.chdir(path)
        patch.syspath_prepend(path)

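        # point Django at the tutorial_mysite settings, run migrations and load the polls fixtures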
        patch.setenv('DJANGO_SETTINGS_MODULE', 'mysite.settings')
        django.setup()
        from django.core import management
        management.call_command("migrate")
        management.call_command("loaddata", "polls/fixtures/fixtures.json")

        TestRunner = get_runner(settings)
        test_runner = TestRunner()
        client = Client()

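        # follow all elements, submit GET and POST forms, and skip the csrfmiddlewaretoken field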
        crawler = Crawler(
            client=client,
            initial_paths=['/', '/polls'],
            rules=(
                PERMISSIVE_ALL_ELEMENTS_RULE_SET +
                SUBMIT_GET_FORMS_RULE_SET +
                SUBMIT_POST_FORMS_RULE_SET
            ),
            ignore_form_fields={'csrfmiddlewaretoken'},
            capture_exceptions=False,
        )
        crawler.crawl()

        # check urls
        for i in range(1, 4):
            assert f"/polls/{i}/" in crawler.graph.visited_paths
            assert f"/polls/{i}/vote" in crawler.graph.visited_paths
def test_50x_trapped_but_allowed(app, client):
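    # 400 and 500 responses are explicitly allowed, so the crawl completes without raising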
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=[
                          Rule(ANCHOR, ".*", GET, Request()),
                          Rule(ANCHOR, ".*", GET, Allow([400, 500])),
                      ])
    crawler.crawl()
def test_exclusion(app, client):
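    # an Ignore rule keeps /page-d out of the crawl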
    unwanted_urls = {'/page-d'}
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET +
                      [Rule(ANCHOR, '^/page-d$', GET, Ignore())])
    crawler.crawl()
    assert crawler.graph.visited_paths == DIRECTLY_ACCESSIBLE_URLS - unwanted_urls
def test_ignore_anchor_by_class(app, client):
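    # anchors matching the CSS selector are skipped, so /page-d is never visited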
    unwanted_urls = {'/page-d'}
    selectors_to_ignore = ['a.menu-link-page-d-class']
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
                      ignore_css_selectors=selectors_to_ignore)
    crawler.crawl()
    assert crawler.graph.visited_paths == DIRECTLY_ACCESSIBLE_URLS - unwanted_urls
def test_extract_srcs(app, client):
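    # including SRC in path_attrs makes the crawler follow src attributes such as <img src>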
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_ALL_ELEMENTS_RULE_SET,
                      path_attrs=(HREF, SRC))
    crawler.crawl()
    img_nodes = crawler.graph.get_nodes_by_source("img")
    assert len(img_nodes) == 1
    assert img_nodes[0].path == '/image.png'
    assert img_nodes[0].status_code == 200
def test_check_response_handler_positive_case(app, client):
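    # only /page-a must contain the word 'recommend'; every other path passes the check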
    def only_page_a_recommends(node, response):
        return ('recommend' in get_response_content(response)
                or node.path != '/page-a')

    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
                      check_response_handlers=[only_page_a_recommends])
    crawler.crawl()
def test_crawl_fails_after_too_many_requests(app, client):
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
        max_requests=10,
    )

    with pytest.raises(TooManyRequestsError):
        crawler.crawl()
def test_inclusion(app, client):
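    # only '/' and '/page-a' have Request rules, so nothing else is visited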
    wanted_urls = {'/', '/page-a'}
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=[
                          Rule(ANCHOR, '^/$', GET, Request()),
                          Rule(ANCHOR, '^/page-a$', GET, Request())
                      ])
    crawler.crawl()
    assert crawler.graph.visited_paths == wanted_urls
def test_check_response_handler_negative_case(app, client):
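    # a check_response_handler returning False causes the crawl to raise UnexpectedResponseError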
    def insist_on_cheshire_cat(node, response):
        return "grin" in get_response_content(response)

    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
                      capture_exceptions=False,
                      check_response_handlers=[insist_on_cheshire_cat])
    with pytest.raises(UnexpectedResponseError):
        crawler.crawl()
def test_50x_raising_exception(app, client):
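    # only 400 is allowed here, so the page responding with 500 raises HttpStatusError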
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      capture_exceptions=False,
                      rules=[
                          Rule(ANCHOR, ".*", GET, Request()),
                          Rule(ANCHOR, ".*", GET, Allow([400]))
                      ])
    with pytest.raises(HttpStatusError) as excinfo:
        crawler.crawl()
    assert excinfo.value.status_code == 500
def test_should_process_handlers(app, client):
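    # a should_process handler returning False keeps the crawler from requesting /page-d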
    unwanted_urls = {'/page-d'}

    def should_not_process_page_d(node):
        return node.path != '/page-d'

    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
                      should_process_handlers=[should_not_process_page_d])
    crawler.crawl()
    assert crawler.graph.visited_paths == DIRECTLY_ACCESSIBLE_URLS - unwanted_urls
def test_extract_link_hrefs(app, client):
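    # REQUEST_EXTERNAL_RESOURCE_LINKS_RULE_SET makes <link> hrefs such as the stylesheet get requested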
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET +
            REQUEST_EXTERNAL_RESOURCE_LINKS_RULE_SET
    )
    crawler.crawl()
    link_nodes = crawler.graph.get_nodes_by_source("link")
    assert len(link_nodes) == 1
    assert link_nodes[0].path == '/style.css'
    assert link_nodes[0].status_code == 200
def test_capture_exceptions(app, client, capfd):
    failure_paths = {'/page-c', '/page-d'}
    app.config['FAILURE_PATHS'] = failure_paths
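    # with capture_exceptions=True the crawl continues past failing pages and fails once at the end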
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
                      capture_exceptions=True)
    with pytest.raises(AssertionError):
        crawler.crawl()
    out, err = capfd.readouterr()
    for path in failure_paths:
        assert f"Exception: Instructed to fail at {path}" in out
def test_ignore_form_by_name(app, client):
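    # the GET form is matched by name and ignored, so no form is ever submitted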
    selectors_to_ignore = ['form[name=form-get-name]']
    crawler = Crawler(client=client,
                      initial_paths=['/'],
                      rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET +
                      SUBMIT_GET_FORMS_RULE_SET,
                      ignore_css_selectors=selectors_to_ignore)
    crawler.crawl()
    submitted_forms = [
        form for form in crawler.graph.get_nodes_by_source(FORM)
        if form.requested
    ]
    assert len(submitted_forms) == 0
def _crawl(client):
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=(
            ALL_ELEMENTS_RULE_SET + SUBMIT_GET_FORMS_RULE_SET +
            SUBMIT_POST_FORMS_RULE_SET + [
                # don't logout
                Rule(".*", r"/auth/logout", GET, Ignore()),
                # allow 400 on create and update
                Rule(".*", r"/create", POST, Allow([400])),
                Rule(".*", r"/\d+/update", POST, Allow([400])),
            ]),
    )
    crawler.crawl()
    return crawler
def test_crawl_all(app, client):
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET,
    )
    crawler.crawl()

    # check everything visited
    assert crawler.graph.visited_paths == DIRECTLY_ACCESSIBLE_URLS

    # check status codes recorded
    expected_status_codes = {path: 200 for path in DIRECTLY_ACCESSIBLE_URLS}
    expected_status_codes['/redirect/with/301'] = 301
    expected_status_codes['/redirect/with/302'] = 302
    expected_status_codes['/abort/with/400'] = 400
    expected_status_codes['/abort/with/500'] = 500
    for path, status_code in expected_status_codes.items():
        assert crawler.graph.get_nodes_by_path(
            path)[0].status_code == status_code
def test_submit_forms_with_extra_data(app, client):
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=(PERMISSIVE_HYPERLINKS_ONLY_RULE_SET +
               SUBMIT_POST_FORMS_RULE_SET + [
                   Rule(FORM, ".*", GET, Request(params={'extra': 'extra'})),
                   Rule(FORM, ".*", POST, Request(params={'extra': 'extra'})),
               ]))
    crawler.crawl()

    # check we always submitted extra data when we submitted any
    submitted_forms = [
        form for form in crawler.graph.get_nodes_by_source(FORM)
        if form.requested
    ]
    assert len(submitted_forms) > 1
    for form in submitted_forms:
        entries = lookup_requests(app, form.path, method=form.method)
        for entry in entries:
            if entry.params:
                assert 'extra' in {key for key, val in entry.params}
def test_submit_forms_by_post(app, client):
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=PERMISSIVE_HYPERLINKS_ONLY_RULE_SET + SUBMIT_POST_FORMS_RULE_SET,
    )
    crawler.crawl()

    # check we found and submitted the form
    submitted_forms = [
        form for form in crawler.graph.get_nodes_by_source(FORM)
        if form.requested
    ]
    assert len(submitted_forms) == 1
    form = submitted_forms[0]
    submissions = [
        entry
        for entry in lookup_requests(app, path=form.path, method=form.method)
        if entry.params
    ]
    assert len(submissions) == 1

    # check we visited the onward link
    assert '/form-submitted-by-post-onward-link' in crawler.graph.visited_paths
def test_other_crawl(client, auth):
    auth.login()
    crawler = Crawler(
        client=client,
        initial_paths=['/'],
        rules=(
            ALL_ELEMENTS_RULE_SET + SUBMIT_GET_FORMS_RULE_SET +
            SUBMIT_POST_FORMS_RULE_SET + [
                # don't logout
                Rule(".*", r"/auth/logout", GET, Ignore()),

                # submit some data to create
                Rule(".*", r"/create", POST,
                     Request(params={
                         "title": "A Title",
                         "body": "body text"
                     })),

                # add the missing body when updating
                Rule(".*", r"/\d+/update", POST,
                     Request(params={"body": "updated body"})),
            ]),
    )
    crawler.crawl()