def test_assertion_error_propagation(
    task_manager_creator, default_params, testing, expectation
):
    """Test that assertion errors bubble up through the TaskManager when running tests"""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager_params.testing = testing
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    cs = CommandSequence("http://example.com", blocking=True)
    cs.append_command(CrashingAssertionCommand())
    with expectation:
        with manager:
            manager.execute_command_sequence(cs)
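# `CrashingAssertionCommand` is not defined in this section. A minimal sketch,
# assuming OpenWPM's BaseCommand interface (the import path and execute()
# signature here are assumptions based on that API), could look like this:
from openwpm.commands.types import BaseCommand


class CrashingAssertionCommand(BaseCommand):
    """Hypothetical helper: fails an assertion inside the command so the test
    above can verify that the error propagates through the TaskManager."""

    def __repr__(self) -> str:
        return "CrashingAssertionCommand"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        # Deliberately fail so the surrounding `with expectation:` block
        # in the test can assert on the propagated AssertionError.
        assert False, "deliberate failure for test_assertion_error_propagation"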
def test_local_callbacks(self):
    manager_params, browser_params = self.get_config()
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager = TaskManager(manager_params, browser_params)

    def callback(argument: List[int], success: bool):
        argument.extend([1, 2, 3])

    my_list = []
    sequence = CommandSequence(
        TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()
    assert my_list == [1, 2, 3]
def test_local_callbacks(default_params, task_manager_creator):
    """Test the storage controller as well as the entire callback machinery
    to see if all callbacks get correctly called"""
    manager, _ = task_manager_creator(default_params)
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"

    def callback(argument: List[int], success: bool) -> None:
        argument.extend([1, 2, 3])

    my_list: List[int] = []
    sequence = CommandSequence(
        TEST_SITE, blocking=True, callback=partial(callback, my_list)
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()
    assert my_list == [1, 2, 3]
def test_s3_callbacks(self):
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager_params, browser_params = self.get_config()
    dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
    manager = task_manager.TaskManager(manager_params, browser_params)
    queue = Queue()

    def ensure_site_in_s3(success: bool):
        # Ensure http table is created
        queue.put(
            TEST_SITE in dataset.load_table("http_requests").top_level_url.unique()
        )

    sequence = CommandSequence(
        TEST_SITE, reset=True, blocking=True, callback=ensure_site_in_s3
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()
    assert queue.get()
def test_seed_persistence(default_params, task_manager_creator):
    manager_params, browser_params = default_params
    p = Path("profile.tar.gz")
    for browser_param in browser_params:
        browser_param.seed_tar = p
    manager, db = task_manager_creator(default_params)

    command_sequences = []
    for _ in range(2):
        cs = CommandSequence(url=BASE_TEST_URL)
        cs.get()
        cs.append_command(AssertConfigSetCommand("test_pref", True))
        command_sequences.append(cs)
    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        db,
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
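# `AssertConfigSetCommand` is used here and in test_profile_recovery below but
# not defined in this section. A minimal sketch, assuming OpenWPM's BaseCommand
# interface and reusing the privileged about:config preference check from the
# legacy test_seed_persistence variant further down (the execute() signature
# is an assumption):
from openwpm.commands.types import BaseCommand


class AssertConfigSetCommand(BaseCommand):
    """Hypothetical helper: asserts a boolean Firefox pref has the expected value."""

    def __init__(self, pref_name: str, expected_value: bool) -> None:
        self.pref_name = pref_name
        self.expected_value = expected_value

    def __repr__(self) -> str:
        return f"AssertConfigSetCommand({self.pref_name!r}, {self.expected_value})"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        # about:config allows privileged script access to the pref service
        webdriver.get("about:config")
        result = webdriver.execute_script(
            """
            var prefs = Components
                .classes["@mozilla.org/preferences-service;1"]
                .getService(Components.interfaces.nsIPrefBranch);
            try { return prefs.getBoolPref(arguments[0]); }
            catch (e) { return false; }
            """,
            self.pref_name,
        )
        assert result == self.expected_value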
def test_profile_recovery(
    monkeypatch, default_params, task_manager_creator, testcase, stateful, seed_tar
):
    """Test browser profile recovery in various scenarios."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].seed_tar = seed_tar
    manager, db = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL, reset=not stateful)

    if testcase == "normal_operation":
        pass
    elif testcase == "on_crash":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
    elif testcase == "on_crash_during_launch":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
        # This will cause browser restarts to fail
        monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")

        # Let the launch succeed after some failed launch attempts
        def undo_monkeypatch():
            time.sleep(5)  # This should be smaller than _SPAWN_TIMEOUT
            monkeypatch.undo()

        Thread(target=undo_monkeypatch).start()
    elif testcase == "on_timeout":
        # Set a very low timeout to cause a restart
        manager.get("about:config", reset=not stateful, timeout=0.1)

    cs = CommandSequence("about:config", reset=not stateful)
    expected_value = True if seed_tar else False
    cs.append_command(AssertConfigSetCommand("test_pref", expected_value))
    tar_directory = manager_params.data_directory / "browser_profile"
    tar_path = tar_directory / "profile.tar.gz"
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that a consistent profile is used for stateful crawls but
    # not for stateless crawls
    with tarfile.open(tar_path) as tar:
        tar.extractall(tar_directory)
    ff_db = tar_directory / "places.sqlite"
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    places = [url for (url,) in rows]
    if stateful:
        assert BASE_TEST_URL in places
    else:
        assert BASE_TEST_URL not in places

    # Check if seed_tar was loaded on restart
    rows = db_utils.query_db(
        db,
        "SELECT command_status FROM crawl_history WHERE"
        " command='AssertConfigSetCommand'",
    )
    assert rows[0][0] == "ok"
def test_dump_profile_command(default_params, task_manager_creator):
    """Test saving the browser profile using a command."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    cs = CommandSequence(url=BASE_TEST_URL)
    cs.get()
    tar_path = manager_params.data_directory / "profile.tar.gz"
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()
    assert tar_path.is_file()
def test_display_shutdown(self):
    manager_params, browser_params = self.get_config()
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager = TaskManager(manager_params, browser_params)
    port = manager.browsers[0].display_port

    sequence = CommandSequence(TEST_SITE)
    sequence.get()
    sequence.append_command(ExceptionCommand)
    manager.execute_command_sequence(sequence)
    manager.close()
    assert not os.path.exists("/tmp/.X%s-lock" % port)
def test_display_shutdown(task_manager_creator, default_params):
    """Test the XVFB display option to see if it runs and deletes the lockfile upon shutdown"""
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.display_mode = "xvfb"
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    port = manager.browsers[0].display_port

    sequence = CommandSequence(TEST_SITE)
    sequence.get()
    sequence.append_command(ExceptionCommand())
    manager.execute_command_sequence(sequence)
    manager.close()
    assert not os.path.exists("/tmp/.X%s-lock" % port)
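# `ExceptionCommand` is likewise not defined here; a plausible sketch under the
# same assumed BaseCommand interface is simply a command that raises:
from openwpm.commands.types import BaseCommand


class ExceptionCommand(BaseCommand):
    """Hypothetical helper: raises an unhandled exception during execution."""

    def __repr__(self) -> str:
        return "ExceptionCommand"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        raise RuntimeError("deliberate failure for test_display_shutdown")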
def test_seed_persistence(self):
    manager_params, browser_params = self.get_test_config(num_browsers=1)
    browser_params[0].seed_tar = "."
    command_sequences = []
    for _ in range(2):
        cs = CommandSequence(url="https://example.com", reset=True)
        cs.get()
        cs.append_command(TestConfigSetCommand("test_pref", True))
        command_sequences.append(cs)
    manager = TaskManager(manager_params, browser_params)
    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        manager_params.database_name,
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_seed_persistence(self):
    def test_config_is_set(*args, **kwargs):
        driver = kwargs["driver"]
        driver.get("about:config")
        result = driver.execute_script("""
            var prefs = Components
                .classes["@mozilla.org/preferences-service;1"]
                .getService(Components.interfaces.nsIPrefBranch);
            try {
                return prefs.getBoolPref("test_pref")
            } catch (e) {
                return false;
            }
        """)
        assert result

    manager_params, browser_params = self.get_test_config(num_browsers=1)
    browser_params[0]["seed_tar"] = "."
    command_sequences = []
    for _ in range(2):
        cs = CommandSequence(url="https://example.com", reset=True)
        cs.get()
        cs.run_custom_function(test_config_is_set)
        command_sequences.append(cs)
    manager = task_manager.TaskManager(manager_params, browser_params)
    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        manager_params["db"],
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visit the sites
for site in sites:
    # Parallelize sites over the browsers set above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print("CommandSequence {} done".format(val)),
    )

    # Start by visiting the page
    command_sequence.append_command(GetCommand(url=site, sleep=10), timeout=60)
    # Have a look at custom_command.py to see how to implement your own command
    command_sequence.append_command(LinkCountingCommand())

    # Run commands across all browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
def processSite(site):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
    ) as manager:
        # Visit the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran "
                    f"{'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over the browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
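# Several snippets above and below append a LinkCountingCommand and point to
# custom_command.py. As a reference, a minimal custom-command sketch, assuming
# OpenWPM's BaseCommand interface (the imports and execute() signature are
# assumptions based on that API):
import logging

from selenium.webdriver.common.by import By

from openwpm.commands.types import BaseCommand


class LinkCountingCommand(BaseCommand):
    """Logs how many links are present on the currently loaded page."""

    def __init__(self) -> None:
        self.logger = logging.getLogger("openwpm")

    def __repr__(self) -> str:
        return "LinkCountingCommand"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        # The command receives a live Selenium webdriver attached to the
        # crawl's browser, so it can inspect the page directly.
        current_url = webdriver.current_url
        link_count = len(webdriver.find_elements(By.TAG_NAME, "a"))
        self.logger.info("There are %d links on %s", link_count, current_url)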
def test_cache_hits_recorded(http_params, task_manager_creator):
    """Verify all http responses are recorded, including cached responses

    Note that we expect to see all of the same requests and responses
    during the second visit (even if cached) except for images. Cached
    images do not trigger Observer Notification events.
    See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073

    The test page includes an image which does several permanent redirects
    before returning a 404. We expect to see new requests and responses
    for this image when the page is reloaded. Additionally, the redirects
    should be cached.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    # ensuring that we only spawn one browser
    manager_params.num_browsers = 1
    manager, db = task_manager_creator((manager_params, [browser_params[0]]))
    for i in range(2):
        cs = CommandSequence(test_url, site_rank=i)
        cs.get(sleep=5)
        manager.execute_command_sequence(cs)
    manager.close()

    request_id_to_url = dict()

    # HTTP Requests
    rows = db_utils.query_db(
        db,
        """
        SELECT hr.*
        FROM http_requests as hr
        JOIN site_visits sv
            ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add(
            (
                row["url"].split("?")[0],
                row["top_level_url"],
                row["triggering_origin"],
                row["loading_origin"],
                row["loading_href"],
                row["is_XHR"],
                row["is_third_party_channel"],
                row["is_third_party_to_top_window"],
                row["resource_type"],
            )
        )
        request_id_to_url[row["request_id"]] = row["url"]
    assert observed_records == HTTP_CACHED_REQUESTS

    # HTTP Responses
    rows = db_utils.query_db(
        db,
        """
        SELECT hp.*
        FROM http_responses as hp
        JOIN site_visits sv
            ON sv.visit_id = hp.visit_id and sv.browser_id = hp.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add(
            (
                row["url"].split("?")[0],
                # TODO: referrer isn't available yet in the
                # webext instrumentation | row['referrer'],
                row["is_cached"],
            )
        )
        assert row["request_id"] in request_id_to_url
        assert request_id_to_url[row["request_id"]] == row["url"]
    assert HTTP_CACHED_RESPONSES == observed_records

    # HTTP Redirects
    rows = db_utils.query_db(
        db,
        """
        SELECT hr.*
        FROM http_redirects as hr
        JOIN site_visits sv
            ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # TODO: new_request_id isn't supported yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row["old_request_url"].split("?")[0]
        dst = row["new_request_url"].split("?")[0]
        observed_records.add((src, dst))
    assert HTTP_CACHED_REDIRECTS == observed_records
        timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(",")
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(
        manager.logger, unsaved_jobs_lock, job_queue, job
    )
    command_sequence = CommandSequence(
        site,
        blocking=True,
        reset=True,
        retry_number=retry_number,
        callback=callback,
        site_rank=int(site_rank),
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
else:
    manager.logger.info("Job queue finished, exiting.")

manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
# Commands time out by default after 60 seconds
with TaskManager(
    manager_params,
    browser_params,
    SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
    None,
) as manager:
    # Visit the sites
    for index, site in enumerate(sites):

        def callback(success: bool, val: str = site) -> None:
            print(
                f"CommandSequence for {val} ran "
                f"{'successfully' if success else 'unsuccessfully'}"
            )

        # Parallelize sites over the browsers set above.
        command_sequence = CommandSequence(
            site,
            site_rank=index,
            callback=callback,
        )

        # Start by visiting the page
        command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
        # Have a look at custom_command.py to see how to implement your own command
        command_sequence.append_command(LinkCountingCommand())

        # Run commands across all browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)
def crawl(sites, db_filename):
    """
    sites is the list of sites that we wish to crawl
    db_filename is the file name of the output database
    """
    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True
        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/{}.sqlite".format(db_filename))),
        None,
    ) as manager:
        # Visit the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran "
                    f"{'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over the browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
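# Illustrative invocation of crawl(); the site list and database name below
# are made up for this example.
if __name__ == "__main__":
    crawl(["http://www.example.com", "http://www.princeton.edu"], "example-crawl")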
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visit the sites
for site in sites:
    # Parallelize sites over the browsers set above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print("CommandSequence {} done".format(val)),
    )

    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=60)

    # Run commands across all browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()