def get_test_config(
    self, data_dir="", num_browsers=NUM_BROWSERS, display_mode="headless"
):
    """Load and return the default test parameters."""
    if not data_dir:
        data_dir = self.tmpdir
    manager_params = ManagerParams(num_browsers=num_browsers)
    browser_params = [BrowserParams() for _ in range(num_browsers)]
    manager_params.data_directory = data_dir
    manager_params.log_directory = data_dir
    manager_params.num_browsers = num_browsers
    for i in range(num_browsers):
        browser_params[i].display_mode = display_mode
    manager_params.database_name = join(
        manager_params.data_directory, manager_params.database_name
    )
    return manager_params, browser_params
def default_params(
    tmp_path: Path, num_browsers: int = NUM_BROWSERS
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Just a simple wrapper around task_manager.load_default_params"""
    manager_params = ManagerParams(
        num_browsers=NUM_BROWSERS
    )  # num_browsers is necessary to let TaskManager know how many browsers to spawn
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]
    manager_params.data_directory = tmp_path
    manager_params.log_path = tmp_path / "openwpm.log"
    for i in range(num_browsers):
        browser_params[i].display_mode = "headless"
    return manager_params, browser_params
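# A minimal usage sketch for default_params, combining it with the
# SQLite-backed TaskManager pattern shown in the later examples of this
# section; the temporary directory and the example URL are illustrative
# assumptions, not part of the original helper:
manager_params, browser_params = default_params(Path("/tmp/openwpm-test"))
with TaskManager(
    manager_params,
    browser_params,
    SQLiteStorageProvider(manager_params.data_directory / "crawl-data.sqlite"),
    None,
) as manager:
    command_sequence = CommandSequence("http://www.example.com")
    command_sequence.append_command(
        GetCommand(url="http://www.example.com", sleep=1), timeout=60
    )
    manager.execute_command_sequence(command_sequence)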
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
    browser_params[i].js_instrument = True
    # Record the callstack of all WebRequests made
    browser_params[i].callstack_instrument = True
    # Record DNS resolution
    browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = "~/Desktop/"
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:
    # Parallelize sites over all number of browsers set above.
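    # NOTE: the original snippet is truncated at this point. A minimal sketch of
    # the loop body, assuming the older CommandSequence.get() API that matches
    # the direct TaskManager construction above; the sleep/timeout values are
    # illustrative:
    command_sequence = CommandSequence(site, reset=True)
    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=60)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()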
from pathlib import Path

from custom_command import LinkCountingCommand
from openwpm.command_sequence import CommandSequence
from openwpm.commands.browser_commands import GetCommand
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.task_manager import TaskManager


def processSite(site):
    NUM_BROWSERS = 1
    # The list of sites that we wish to crawl
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)

            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)

    return None
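# A minimal usage sketch for processSite; the two example URLs below are
# illustrative assumptions, not part of the original script:
if __name__ == "__main__":
    for s in ["http://www.example.com", "http://www.example.org"]:
        processSite(s)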
browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
browser_params[i].js_instrument = JS_INSTRUMENT
browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS

if SAVE_CONTENT == "1":
    browser_params[i].save_content = True
elif SAVE_CONTENT == "0":
    browser_params[i].save_content = False
else:
    browser_params[i].save_content = SAVE_CONTENT

if PREFS:
    browser_params[i].prefs = json.loads(PREFS)

# Manager configuration
manager_params.data_directory = Path("~/Desktop/") / CRAWL_DIRECTORY
manager_params.log_path = Path("~/Desktop/") / CRAWL_DIRECTORY / "openwpm.log"

structured = GcsStructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY,
    token=AUTH_TOKEN,
)
unstructured = GcsUnstructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY + "/data",
    token=AUTH_TOKEN,
)
# Instantiates the measurement platform
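# A minimal sketch of how the two GCS providers above are typically handed to
# the measurement platform, assuming the same positional-provider TaskManager
# call used in the SQLite examples of this section; `sites` is assumed to be
# the crawl's site list, and the sleep/timeout values are illustrative:
with TaskManager(
    manager_params,
    browser_params,
    structured,
    unstructured,
) as manager:
    for index, site in enumerate(sites):
        command_sequence = CommandSequence(site, site_rank=index)
        command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
        manager.execute_command_sequence(command_sequence)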
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
    browser_params[i].js_instrument = True
    # Record the callstack of all WebRequests made
    browser_params[i].callstack_instrument = True
    # Record DNS resolution
    browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = Path("./datadir/")
manager_params.log_directory = Path("./datadir/")

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Commands time out by default after 60 seconds
with TaskManager(
    manager_params,
    browser_params,
    SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
    None,
) as manager:
    # Visits the sites
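    # NOTE: the original snippet is truncated at this point. A minimal sketch of
    # the per-site loop, mirroring the complete examples elsewhere in this
    # section; sleep/timeout values are illustrative:
    for index, site in enumerate(sites):
        command_sequence = CommandSequence(site, site_rank=index)
        # Start by visiting the page
        command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
        manager.execute_command_sequence(command_sequence)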
browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
browser_params[i].js_instrument = JS_INSTRUMENT
browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS

if SAVE_CONTENT == "1":
    browser_params[i].save_content = True
elif SAVE_CONTENT == "0":
    browser_params[i].save_content = False
else:
    browser_params[i].save_content = SAVE_CONTENT

if PREFS:
    browser_params[i].prefs = json.loads(PREFS)

# Manager configuration
manager_params.data_directory = "~/Desktop/%s/" % CRAWL_DIRECTORY
manager_params.log_directory = "~/Desktop/%s/" % CRAWL_DIRECTORY
manager_params.output_format = "s3"
manager_params.s3_bucket = S3_BUCKET
manager_params.s3_directory = CRAWL_DIRECTORY

# Allow the use of localstack's mock s3 service
S3_ENDPOINT = os.getenv("S3_ENDPOINT")
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params.s3_bucket = local_s3_bucket(boto3.resource("s3"), name=S3_BUCKET)

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params, logger_kwargs=LOGGER_SETTINGS)
from pathlib import Path

from openwpm.command_sequence import CommandSequence
from openwpm.commands.browser_commands import GetCommand
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.task_manager import TaskManager


def crawl(sites, db_filename):
    """
    sites is the list of sites that we wish to crawl
    db_filename is the file name of the output database
    """
    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True
        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/{}.sqlite".format(db_filename))),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
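# A minimal usage sketch for crawl(); the site list and database name below
# are illustrative assumptions, not part of the original script:
if __name__ == "__main__":
    crawl(["http://www.example.com", "http://www.example.org"], "example_crawl")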