Example #1
def test_seed_persistence(default_params, task_manager_creator):
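    """Test that a seeded profile (seed_tar) is loaded into every browser
    and that the seeded preference persists across consecutive visits."""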
    manager_params, browser_params = default_params
    p = Path("profile.tar.gz")
    for browser_param in browser_params:
        browser_param.seed_tar = p
    manager, db = task_manager_creator(default_params)

    command_sequences = []
    for _ in range(2):
        cs = CommandSequence(url=BASE_TEST_URL)
        cs.get()
        cs.append_command(AssertConfigSetCommand("test_pref", True))
        command_sequences.append(cs)

    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()
    query_result = db_utils.query_db(
        db,
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
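The AssertConfigSetCommand used above is not defined in this example. A
minimal sketch, assuming OpenWPM's BaseCommand interface and reusing the
about:config check from Example #9 (the execute() signature below is an
assumption, not confirmed by this document):

from openwpm.commands.types import BaseCommand


class AssertConfigSetCommand(BaseCommand):
    """Assert that a boolean pref is set, e.g. by a seeded profile."""

    def __init__(self, pref_name: str, expected_value: bool) -> None:
        self.pref_name = pref_name
        self.expected_value = expected_value

    def execute(self, webdriver, browser_params, manager_params, extension_socket):
        # Read the pref through the preferences service (see Example #9)
        webdriver.get("about:config")
        result = webdriver.execute_script(
            """
            var prefs = Components
                        .classes["@mozilla.org/preferences-service;1"]
                        .getService(Components.interfaces.nsIPrefBranch);
            try {
                return prefs.getBoolPref(arguments[0]);
            } catch (e) {
                return false;
            }
            """,
            self.pref_name,
        )
        assert result == self.expected_value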
Example #2
def test_dump_profile_command(default_params, task_manager_creator):
    """Test saving the browser profile using a command."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    cs = CommandSequence(url=BASE_TEST_URL)
    cs.get()
    tar_path = manager_params.data_directory / "profile.tar.gz"
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()
    assert tar_path.is_file()
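To sanity-check the dumped archive outside the test, it can be inspected
with the standard library (illustrative only; the exact file names depend
on the Firefox profile contents):

import tarfile

with tarfile.open(tar_path, "r:gz") as tar:
    # A Firefox profile typically contains prefs.js, places.sqlite, etc.
    print(tar.getnames()[:10])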
Example #3
    def test_display_shutdown(self):
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)
        port = manager.browsers[0].display_port

        sequence = CommandSequence(TEST_SITE)
        sequence.get()
        sequence.append_command(ExceptionCommand)
        manager.execute_command_sequence(sequence)
        manager.close()
        assert not os.path.exists("/tmp/.X%s-lock" % port)
Example #4
def test_display_shutdown(task_manager_creator, default_params):
    """Test the XVFB display option to see if it runs and deletes the lockfile upon shutdown"""
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.display_mode = "xvfb"
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    port = manager.browsers[0].display_port

    sequence = CommandSequence(TEST_SITE)
    sequence.get()
    sequence.append_command(ExceptionCommand())
    manager.execute_command_sequence(sequence)
    manager.close()
    assert not os.path.exists("/tmp/.X%s-lock" % port)
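Both versions rely on an ExceptionCommand that is not shown; its only job
is to fail so the test can verify that the platform still shuts down
cleanly and removes the X lockfile. A minimal sketch, under the same
BaseCommand assumption as in Example #1's note:

from openwpm.commands.types import BaseCommand


class ExceptionCommand(BaseCommand):
    """A command that always raises, used to exercise error handling."""

    def execute(self, webdriver, browser_params, manager_params, extension_socket):
        raise RuntimeError("intentional failure")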
Example #5
    def test_local_callbacks(self):
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)

        def callback(argument: List[int], success: bool):
            argument.extend([1, 2, 3])

        my_list = []
        sequence = CommandSequence(
            TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
        )
        sequence.get()

        manager.execute_command_sequence(sequence)
        manager.close()
        assert my_list == [1, 2, 3]
Example #6
def test_local_callbacks(default_params, task_manager_creator):
    """Test the storage controller as well as the entire callback machinery
    to see if all callbacks get correctly called"""
    manager, _ = task_manager_creator(default_params)
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"

    def callback(argument: List[int], success: bool) -> None:
        argument.extend([1, 2, 3])

    my_list: List[int] = []
    sequence = CommandSequence(
        TEST_SITE, blocking=True, callback=partial(callback, my_list)
    )
    sequence.get()

    manager.execute_command_sequence(sequence)
    manager.close()
    assert my_list == [1, 2, 3]
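The partial(callback, my_list) pre-binds the list argument, so the task
manager only has to supply success when it invokes the callback. A
standalone illustration with functools:

from functools import partial

def callback(argument, success):
    argument.extend([1, 2, 3])

my_list = []
bound = partial(callback, my_list)
bound(True)  # equivalent to callback(my_list, True)
assert my_list == [1, 2, 3]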
Example #7
    def test_s3_callbacks(self):
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager_params, browser_params = self.get_config()
        dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
        manager = task_manager.TaskManager(manager_params, browser_params)
        queue = Queue()

        def ensure_site_in_s3(success: bool):
            # Check that the visited site shows up in the http_requests table
            queue.put(
                TEST_SITE in dataset.load_table("http_requests").top_level_url.unique()
            )

        sequence = CommandSequence(
            TEST_SITE, reset=True, blocking=True, callback=ensure_site_in_s3
        )
        sequence.get()
        manager.execute_command_sequence(sequence)
        manager.close()

        assert queue.get()
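Note the Queue: the callback runs on a different thread than the test, so
a bare assert inside it would not fail the test. Handing the boolean back
to the main thread through the queue makes the final assert effective.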
Example #8
    def test_seed_persistance(self):
        manager_params, browser_params = self.get_test_config(num_browsers=1)
        browser_params[0].seed_tar = "."
        command_sequences = []
        for _ in range(2):
            cs = CommandSequence(url="https://example.com", reset=True)
            cs.get()
            cs.append_command(TestConfigSetCommand("test_pref", True))
            command_sequences.append(cs)
        manager = TaskManager(manager_params, browser_params)
        for cs in command_sequences:
            manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params.database_name,
            "SELECT * FROM crawl_history;",
        )
        assert len(query_result) > 0
        for row in query_result:
            assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
Example #9
    def test_seed_persistance(self):
        def test_config_is_set(*args, **kwargs):
            driver = kwargs["driver"]
            driver.get("about:config")
            result = driver.execute_script("""
                var prefs = Components
                            .classes["@mozilla.org/preferences-service;1"]
                            .getService(Components.interfaces.nsIPrefBranch);
                try {
                    return prefs.getBoolPref("test_pref")
                } catch (e) {
                    return false;
                }
            """)
            assert result

        manager_params, browser_params = self.get_test_config(num_browsers=1)
        browser_params[0]["seed_tar"] = "."
        command_sequences = []
        for _ in range(2):
            cs = CommandSequence(url="https://example.com", reset=True)
            cs.get()
            cs.run_custom_function(test_config_is_set)
            command_sequences.append(cs)
        manager = task_manager.TaskManager(manager_params, browser_params)
        for cs in command_sequences:
            manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params["db"],
            "SELECT * FROM crawl_history;",
        )
        assert len(query_result) > 0
        for row in query_result:
            assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
Example #10
def test_cache_hits_recorded(http_params, task_manager_creator):
    """Verify all http responses are recorded, including cached responses

    Note that we expect to see all of the same requests and responses
    during the second visit (even if cached) except for images. Cached
    images do not trigger Observer Notification events.
    See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073

    The test page includes an image which does several permanent redirects
    before returning a 404. We expect to see new requests and responses
    for this image when the page is reloaded. Additionally, the redirects
    should be cached.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    # ensuring that we only spawn one browser
    manager_params.num_browsers = 1
    manager, db = task_manager_creator((manager_params, [browser_params[0]]))
    for i in range(2):
        cs = CommandSequence(test_url, site_rank=i)
        cs.get(sleep=5)
        manager.execute_command_sequence(cs)

    manager.close()

    request_id_to_url = dict()

    # HTTP Requests
    rows = db_utils.query_db(
        db,
        """
        SELECT hr.*
        FROM http_requests as hr
        JOIN site_visits sv ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add((
            row["url"].split("?")[0],
            row["top_level_url"],
            row["triggering_origin"],
            row["loading_origin"],
            row["loading_href"],
            row["is_XHR"],
            row["is_third_party_channel"],
            row["is_third_party_to_top_window"],
            row["resource_type"],
        ))
        request_id_to_url[row["request_id"]] = row["url"]
    assert observed_records == HTTP_CACHED_REQUESTS

    # HTTP Responses
    rows = db_utils.query_db(
        db,
        """
         SELECT hp.*
         FROM http_responses as hp
         JOIN site_visits sv ON sv.visit_id = hp.visit_id and sv.browser_id = hp.browser_id
         WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add((
            row["url"].split("?")[0],
            # TODO: referrer isn't available yet in the
            # webext instrumentation | row['referrer'],
            row["is_cached"],
        ))
        assert row["request_id"] in request_id_to_url
        assert request_id_to_url[row["request_id"]] == row["url"]
    assert HTTP_CACHED_RESPONSES == observed_records

    # HTTP Redirects
    rows = db_utils.query_db(
        db,
        """
         SELECT hr.*
         FROM http_redirects as hr
         JOIN site_visits sv ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
         WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # TODO: new_request_id isn't supported yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row["old_request_url"].split("?")[0]
        dst = row["new_request_url"].split("?")[0]
        observed_records.add((src, dst))
    assert HTTP_CACHED_REDIRECTS == observed_records
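To eyeball the cache behaviour on the second visit, the cached and
uncached responses can be counted directly with the same db_utils.query_db
helper (an illustrative query, not part of the original test):

rows = db_utils.query_db(
    db,
    """
    SELECT hp.is_cached, COUNT(*) AS n
    FROM http_responses AS hp
    JOIN site_visits sv ON sv.visit_id = hp.visit_id
        AND sv.browser_id = hp.browser_id
    WHERE sv.site_rank = 1
    GROUP BY hp.is_cached
    """,
)
for row in rows:
    print(row["is_cached"], row["n"])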
Example #11
    job = job_queue.lease(..., timeout=5)  # leading arguments truncated in the source
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(",")
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(manager.logger, unsaved_jobs_lock,
                                           job_queue, job)
    command_sequence = CommandSequence(
        site,
        blocking=True,
        reset=True,
        retry_number=retry_number,
        callback=callback,
        site_rank=int(site_rank),
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
else:
    manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
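The get_job_completion_callback helper is not shown. Since the sequence is
created with blocking=True, the callback fires once the visit's data has
been persisted, which is the safe point to acknowledge the job. A sketch
under the assumption that job_queue is a rediswq-style work queue exposing
complete(), and that unsaved_jobs is the module-level list appended to in
the loop above:

def get_job_completion_callback(logger, unsaved_jobs_lock, job_queue, job):
    def callback(success: bool) -> None:
        with unsaved_jobs_lock:
            if success:
                logger.info("Job %r is done", job)
                job_queue.complete(job)  # assumed rediswq-style API
            else:
                logger.warning("Job %r got interrupted", job)
            unsaved_jobs.remove(job)  # module-level list from the loop above
    return callback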
Example #12
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:

    # Parallelize the sites across the number of browsers set above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print(f"CommandSequence {val} done"),
    )

    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=60)

    # Run commands across the three browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
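The snippet starts after the configuration step. A typical setup for this
version of the API (names follow OpenWPM's demo script; the exact sites
and values are illustrative):

from openwpm.command_sequence import CommandSequence
from openwpm.config import BrowserParams, ManagerParams
from openwpm.task_manager import TaskManager

NUM_BROWSERS = 3
sites = [
    "http://www.example.com",
    "http://www.princeton.edu",
]

# One BrowserParams per browser; headless keeps the demo lightweight
manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
browser_params = [
    BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
]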