Example #1
def test_assertion_error_propagation(
    task_manager_creator, default_params, testing, expectation
):
    """Test that assertion errors bubble up through the TaskManager when running tests"""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager_params.testing = testing
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    cs = CommandSequence("http://example.com", blocking=True)
    cs.append_command(CrashingAssertionCommand())
    with expectation:
        with manager:
            manager.execute_command_sequence(cs)
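CrashingAssertionCommand is not shown above; as a hypothetical sketch (the BaseCommand base class and the execute() signature are assumed from openwpm.commands.types, not taken from the test), a command that simply fails an assertion is enough to drive this test:

# Hypothetical sketch only: a command whose execute() fails an assertion.
# BaseCommand and the execute() signature are assumptions, not shown in the test.
from openwpm.commands.types import BaseCommand


class CrashingAssertionCommand(BaseCommand):
    def __repr__(self) -> str:
        return "CrashingAssertionCommand"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        # With manager_params.testing enabled, the TaskManager is expected to
        # re-raise this AssertionError rather than swallow it.
        assert False, "deliberate failure for the propagation test"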
Example #2
    def test_local_callbacks(self):
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)

        def callback(argument: List[int], success: bool):
            argument.extend([1, 2, 3])

        my_list = []
        sequence = CommandSequence(
            TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
        )
        sequence.get()

        manager.execute_command_sequence(sequence)
        manager.close()
        assert my_list == [1, 2, 3]
Example #3
def test_local_callbacks(default_params, task_manager_creator):
    """Test the storage controller as well as the entire callback machinery
    to see if all callbacks get correctly called"""
    manager, _ = task_manager_creator(default_params)
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"

    def callback(argument: List[int], success: bool) -> None:
        argument.extend([1, 2, 3])

    my_list: List[int] = []
    sequence = CommandSequence(
        TEST_SITE, blocking=True, callback=partial(callback, my_list)
    )
    sequence.get()

    manager.execute_command_sequence(sequence)
    manager.close()
    assert my_list == [1, 2, 3]
Example #4
    def test_s3_callbacks(self):
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager_params, browser_params = self.get_config()
        dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
        manager = task_manager.TaskManager(manager_params, browser_params)
        queue = Queue()

        def ensure_site_in_s3(success: bool):
            # Ensure http table is created
            queue.put(
                TEST_SITE in dataset.load_table("http_requests").top_level_url.unique()
            )

        sequence = CommandSequence(
            TEST_SITE, reset=True, blocking=True, callback=ensure_site_in_s3
        )
        sequence.get()
        manager.execute_command_sequence(sequence)
        manager.close()

        assert queue.get()
Example #5
def test_seed_persistence(default_params, task_manager_creator):
    manager_params, browser_params = default_params
    p = Path("profile.tar.gz")
    for browser_param in browser_params:
        browser_param.seed_tar = p
    manager, db = task_manager_creator(default_params)

    command_sequences = []
    for _ in range(2):
        cs = CommandSequence(url=BASE_TEST_URL)
        cs.get()
        cs.append_command(AssertConfigSetCommand("test_pref", True))
        command_sequences.append(cs)

    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()
    query_result = db_utils.query_db(
        db,
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
Example #6
def test_profile_recovery(monkeypatch, default_params, task_manager_creator,
                          testcase, stateful, seed_tar):
    """Test browser profile recovery in various scenarios."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].seed_tar = seed_tar
    manager, db = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL, reset=not stateful)

    if testcase == "normal_operation":
        pass
    elif testcase == "on_crash":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
    elif testcase == "on_crash_during_launch":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
        # This will cause browser restarts to fail
        monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")

        # Let the launch succeed after some failed launch attempts
        def undo_monkeypatch():
            time.sleep(5)  # This should be smaller than _SPAWN_TIMEOUT
            monkeypatch.undo()

        Thread(target=undo_monkeypatch).start()
    elif testcase == "on_timeout":
        # Set a very low timeout to cause a restart
        manager.get("about:config", reset=not stateful, timeout=0.1)

    cs = CommandSequence("about:config", reset=not stateful)
    expected_value = True if seed_tar else False
    cs.append_command(AssertConfigSetCommand("test_pref", expected_value))
    tar_directory = manager_params.data_directory / "browser_profile"
    tar_path = tar_directory / "profile.tar.gz"
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that a consistent profile is used for stateful crawls but
    # not for stateless crawls
    with tarfile.open(tar_path) as tar:
        tar.extractall(tar_directory)
    ff_db = tar_directory / "places.sqlite"
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    places = [url for (url, ) in rows]
    if stateful:
        assert BASE_TEST_URL in places
    else:
        assert BASE_TEST_URL not in places

    # Check if seed_tar was loaded on restart
    rows = db_utils.query_db(
        db,
        "SELECT command_status FROM crawl_history WHERE"
        " command='AssertConfigSetCommand'",
    )
    assert rows[0][0] == "ok"
Example #7
def test_dump_profile_command(default_params, task_manager_creator):
    """Test saving the browser profile using a command."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    cs = CommandSequence(url=BASE_TEST_URL)
    cs.get()
    tar_path = manager_params.data_directory / "profile.tar.gz"
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()
    assert tar_path.is_file()
Example #8
    def test_display_shutdown(self):
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)
        port = manager.browsers[0].display_port

        sequence = CommandSequence(TEST_SITE)
        sequence.get()
        sequence.append_command(ExceptionCommand)
        manager.execute_command_sequence(sequence)
        manager.close()
        assert not os.path.exists("/tmp/.X%s-lock" % port)
Example #9
def test_display_shutdown(task_manager_creator, default_params):
    """Test the XVFB display option to see if it runs and deletes the lockfile upon shutdown"""
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.display_mode = "xvfb"
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    port = manager.browsers[0].display_port

    sequence = CommandSequence(TEST_SITE)
    sequence.get()
    sequence.append_command(ExceptionCommand())
    manager.execute_command_sequence(sequence)
    manager.close()
    assert not os.path.exists("/tmp/.X%s-lock" % port)
Example #10
    def test_seed_persistance(self):
        manager_params, browser_params = self.get_test_config(num_browsers=1)
        browser_params[0].seed_tar = "."
        command_sequences = []
        for _ in range(2):
            cs = CommandSequence(url="https://example.com", reset=True)
            cs.get()
            cs.append_command(TestConfigSetCommand("test_pref", True))
            command_sequences.append(cs)
        manager = TaskManager(manager_params, browser_params)
        for cs in command_sequences:
            manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params.database_name,
            "SELECT * FROM crawl_history;",
        )
        assert len(query_result) > 0
        for row in query_result:
            assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
Example #11
    def test_seed_persistance(self):
        def test_config_is_set(*args, **kwargs):
            driver = kwargs["driver"]
            driver.get("about:config")
            result = driver.execute_script("""
                var prefs = Components
                            .classes["@mozilla.org/preferences-service;1"]
                            .getService(Components.interfaces.nsIPrefBranch);
                try {
                    return prefs.getBoolPref("test_pref")
                } catch (e) {
                    return false;
                }
            """)
            assert result

        manager_params, browser_params = self.get_test_config(num_browsers=1)
        browser_params[0]["seed_tar"] = "."
        command_sequences = []
        for _ in range(2):
            cs = CommandSequence(url="https://example.com", reset=True)
            cs.get()
            cs.run_custom_function(test_config_is_set)
            command_sequences.append(cs)
        manager = task_manager.TaskManager(manager_params, browser_params)
        for cs in command_sequences:
            manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params["db"],
            "SELECT * FROM crawl_history;",
        )
        assert len(query_result) > 0
        for row in query_result:
            assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
Example #12
# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:

    # Parallelize sites over all number of browsers set above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print(
            "CommandSequence {} done".format(val)
        ),
    )

    # Start by visiting the page
    command_sequence.append_command(GetCommand(url=site, sleep=10), timeout=60)
    # Have a look at custom_command.py to see how to implement your own command
    command_sequence.append_command(LinkCountingCommand())

    # Run commands across the three browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #13
def processSite(site):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )
            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())
            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
    return None
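custom_command.py, referenced in the comments above, is not included here; a minimal sketch of a LinkCountingCommand (assuming OpenWPM's BaseCommand interface and the Selenium driver handed to execute()) might look like:

# Sketch of a custom command; BaseCommand and the execute() signature are
# assumed from openwpm.commands.types rather than shown in this example.
import logging

from selenium.webdriver.common.by import By

from openwpm.commands.types import BaseCommand


class LinkCountingCommand(BaseCommand):
    """Log how many links the currently loaded page contains."""

    def __init__(self) -> None:
        self.logger = logging.getLogger("openwpm")

    def __repr__(self) -> str:
        return "LinkCountingCommand"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        link_count = len(webdriver.find_elements(By.TAG_NAME, "a"))
        self.logger.info("There are %d links on %s", link_count, webdriver.current_url)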
Example #14
def test_cache_hits_recorded(http_params, task_manager_creator):
    """Verify all http responses are recorded, including cached responses

    Note that we expect to see all of the same requests and responses
    during the second visit (even if cached) except for images. Cached
    images do not trigger Observer Notification events.
    See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073

    The test page includes an image which does several permanent redirects
    before returning a 404. We expect to see new requests and responses
    for this image when the page is reloaded. Additionally, the redirects
    should be cached.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    # ensuring that we only spawn one browser
    manager_params.num_browsers = 1
    manager, db = task_manager_creator((manager_params, [browser_params[0]]))
    for i in range(2):
        cs = CommandSequence(test_url, site_rank=i)
        cs.get(sleep=5)
        manager.execute_command_sequence(cs)

    manager.close()

    request_id_to_url = dict()

    # HTTP Requests
    rows = db_utils.query_db(
        db,
        """
        SELECT hr.*
        FROM http_requests as hr
        JOIN site_visits sv ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add((
            row["url"].split("?")[0],
            row["top_level_url"],
            row["triggering_origin"],
            row["loading_origin"],
            row["loading_href"],
            row["is_XHR"],
            row["is_third_party_channel"],
            row["is_third_party_to_top_window"],
            row["resource_type"],
        ))
        request_id_to_url[row["request_id"]] = row["url"]
    assert observed_records == HTTP_CACHED_REQUESTS

    # HTTP Responses
    rows = db_utils.query_db(
        db,
        """
         SELECT hp.*
         FROM http_responses as hp
         JOIN site_visits sv ON sv.visit_id = hp.visit_id and sv.browser_id = hp.browser_id
         WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add((
            row["url"].split("?")[0],
            # TODO: referrer isn't available yet in the
            # webext instrumentation | row['referrer'],
            row["is_cached"],
        ))
        assert row["request_id"] in request_id_to_url
        assert request_id_to_url[row["request_id"]] == row["url"]
    assert HTTP_CACHED_RESPONSES == observed_records

    # HTTP Redirects
    rows = db_utils.query_db(
        db,
        """
         SELECT hr.*
         FROM http_redirects as hr
         JOIN site_visits sv ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
         WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # TODO: new_request_id isn't supported yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row["old_request_url"].split("?")[0]
        dst = row["new_request_url"].split("?")[0]
        observed_records.add((src, dst))
    assert HTTP_CACHED_REDIRECTS == observed_records
Example #15
                          timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(",")
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(manager.logger, unsaved_jobs_lock,
                                           job_queue, job)
    command_sequence = CommandSequence(
        site,
        blocking=True,
        reset=True,
        retry_number=retry_number,
        callback=callback,
        site_rank=int(site_rank),
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
else:
    manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
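get_job_completion_callback is defined elsewhere in the worker script; a hypothetical sketch (the work queue's complete() method and the module-level unsaved_jobs list are assumptions) could be:

# Hypothetical sketch of the completion callback used above; the job queue's
# complete() method and the module-level unsaved_jobs list are assumptions.
import logging
import threading
from typing import Callable, List

unsaved_jobs: List[bytes] = []


def get_job_completion_callback(
    logger: logging.Logger,
    unsaved_jobs_lock: threading.Lock,
    job_queue,  # assumed to expose complete(job)
    job: bytes,
) -> Callable[[bool], None]:
    def callback(success: bool) -> None:
        with unsaved_jobs_lock:
            if success:
                logger.info("Job %r is done", job)
                job_queue.complete(job)  # assumed work-queue API
            else:
                logger.warning("Job %r got interrupted", job)
            unsaved_jobs.remove(job)

    return callback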
Example #16
# Commands time out by default after 60 seconds
with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
) as manager:
    # Visits the sites
    for index, site in enumerate(sites):

        def callback(success: bool, val: str = site) -> None:
            print(
                f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
            )

        # Parallelize sites over all number of browsers set above.
        command_sequence = CommandSequence(
            site,
            site_rank=index,
            callback=callback,
        )

        # Start by visiting the page
        command_sequence.append_command(GetCommand(url=site, sleep=3),
                                        timeout=60)
        # Have a look at custom_command.py to see how to implement your own command
        command_sequence.append_command(LinkCountingCommand())

        # Run commands across all browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)
Example #17
def crawl(sites, db_filename):
    '''
    sites is the list of sites that we wish to crawl
    db_filename is the file name of the output database
    '''

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(
                Path("./datadir/{}.sqlite".format(db_filename))),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
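After a crawl like this one, the SQLiteStorageProvider has written its tables into the .sqlite file; a quick sketch of inspecting it with the standard library (table names match the tests above, while the site_url column is an assumption about the schema):

# Quick inspection sketch using the standard sqlite3 module; table names match
# the tests above, while the site_url column is an assumed schema detail.
import sqlite3
from pathlib import Path

db_path = Path("./datadir/crawl-data.sqlite")  # adjust to the db_filename used
connection = sqlite3.connect(db_path)
connection.row_factory = sqlite3.Row
rows = connection.execute(
    """
    SELECT sv.site_url, COUNT(hr.visit_id) AS n_requests
    FROM site_visits sv
    LEFT JOIN http_requests hr
        ON hr.visit_id = sv.visit_id AND hr.browser_id = sv.browser_id
    GROUP BY sv.site_url
    """
).fetchall()
for row in rows:
    print(row["site_url"], row["n_requests"])
connection.close()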
Example #18
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:

    # Parallelize sites over all number of browsers set above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print(
            "CommandSequence {} done".format(val)
        ),
    )

    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=60)

    # Run commands across the three browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()