def __init__(self, manager_params, browser_params):
    """Initialise the bookkeeping state for one browser.

    Only attributes are set and the logging client is constructed here;
    every process/queue related attribute starts out as ``None``.
    """
    # Spawn tunables
    self._SPAWN_TIMEOUT = 120  # seconds
    self._UNSUCCESSFUL_SPAWN_LIMIT = 1

    # Parameters taken from the manager / browser configuration
    self.current_profile_path = None
    self.db_socket_address = manager_params['aggregator_address']
    self.logger_address = manager_params['logger_address']
    self.crawl_id = browser_params['crawl_id']
    self.curr_visit_id = None
    self.browser_params = browser_params
    self.manager_params = manager_params

    # Communication channels with the BrowserManager:
    #   command_thread - thread to run commands issued from TaskManager
    #   command_queue  - queue for passing command tuples to BrowserManager
    #   status_queue   - queue for receiving command execution status
    self.command_thread = self.command_queue = self.status_queue = None

    # Process information:
    #   browser_pid  - pid of the browser instance controlled by
    #                  BrowserManager
    #   display_pid  - pid of the display for the headless browser (if any)
    #   display_port - port of the display for the headless browser (if any)
    self.browser_pid = self.display_pid = self.display_port = None

    # Restart bookkeeping
    self.is_fresh = True  # is the BrowserManager new (optimizes restarts)
    self.restart_required = False  # should the browser be restarted

    #   current_timeout  - timeout of the current command
    #   browser_settings - dict of additional browser profile settings
    #                      (e.g. screen_res)
    #   browser_manager  - process that controls the browser
    self.current_timeout = None
    self.browser_settings = None
    self.browser_manager = None

    # Connection to the logging server
    self.logger = loggingclient(*self.logger_address)
def __init__(self, manager_params, browser_params):
    """Initialise the bookkeeping state for one browser.

    Only attributes are set and the logging client is constructed here;
    every process/queue related attribute starts out as ``None``.
    """
    # Spawn tunables
    self._SPAWN_TIMEOUT = 120  # seconds
    self._UNSUCCESSFUL_SPAWN_LIMIT = 4

    # Parameters taken from the manager / browser configuration
    self.current_profile_path = None
    self.db_socket_address = manager_params['aggregator_address']
    self.logger_address = manager_params['logger_address']
    self.crawl_id = browser_params['crawl_id']
    self.curr_visit_id = None
    self.browser_params = browser_params
    self.manager_params = manager_params

    # Communication channels with the BrowserManager:
    #   command_thread - thread to run commands issued from TaskManager
    #   command_queue  - queue for passing command tuples to BrowserManager
    #   status_queue   - queue for receiving command execution status
    self.command_thread = self.command_queue = self.status_queue = None

    # Process information:
    #   browser_pid  - pid of the browser instance controlled by
    #                  BrowserManager
    #   display_pid  - pid of the display for the headless browser (if any)
    #   display_port - port of the display for the headless browser (if any)
    self.browser_pid = self.display_pid = self.display_port = None

    # Restart bookkeeping
    self.is_fresh = True  # is the BrowserManager new (optimizes restarts)
    self.restart_required = False  # should the browser be restarted

    #   current_timeout  - timeout of the current command
    #   browser_settings - dict of additional browser profile settings
    #                      (e.g. screen_res)
    #   browser_manager  - process that controls the browser
    self.current_timeout = None
    self.browser_settings = None
    self.browser_manager = None

    # Connection to the logging server
    self.logger = loggingclient(*self.logger_address)
def __init__(self, status_queue, shutdown_queue, manager_params):
    """Store the communication queues and connect to the logging server."""
    # Queues supplied by the caller
    self.status_queue = status_queue
    self.shutdown_queue = shutdown_queue

    logger_address = manager_params['logger_address']
    self.logger = loggingclient(*logger_address)

    self._shutdown_flag = False
    # Time of the last status update
    self._last_update = time.time()
    # Initialized on `startup`
    self.record_queue = None
def load_profile(browser_profile_folder, manager_params, browser_params,
                 tar_location, load_flash=False):
    """Load a tarred browser profile into <browser_profile_folder>.

    The archive is looked up in <tar_location>: `profile.tar.gz` is used
    when that file exists, otherwise `profile.tar`. The archive is copied
    into the profile folder, extracted there and the copy is removed.

    :param browser_profile_folder: destination folder for the profile
    :param manager_params: dict providing 'logger_address'
    :param browser_params: dict providing 'crawl_id' (used in log messages)
    :param tar_location: folder containing the profile archive
    :param load_flash: if True, also call `load_flash_files`
    :returns: the value returned by `load_browser_settings(tar_location)`
    :raises ProfileLoadError: if any step of the load fails
    """
    # Connect to the logger before entering the try block so the error
    # handler below can always rely on `logger` being bound.
    logger = loggingclient(*manager_params['logger_address'])

    try:
        # ensures that folder paths end with slashes
        # (indexing is kept on purpose: an empty path fails here instead of
        # silently turning into '/')
        if browser_profile_folder[-1] != '/':
            browser_profile_folder = browser_profile_folder + "/"
        if tar_location[-1] != '/':
            tar_location = tar_location + "/"

        if os.path.isfile(tar_location + 'profile.tar.gz'):
            tar_name, tar_mode = 'profile.tar.gz', 'r:gz'
        else:
            tar_name, tar_mode = 'profile.tar', 'r'

        # Copy and untar the loaded profile
        logger.debug("BROWSER %i: Copying profile tar from %s to %s" %
                     (browser_params['crawl_id'], tar_location + tar_name,
                      browser_profile_folder))
        shutil.copy(tar_location + tar_name, browser_profile_folder)
        copied_tar = browser_profile_folder + tar_name
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only load profile archives from trusted sources.
        with tarfile.open(copied_tar, tar_mode, errorlevel=1) as archive:
            archive.extractall(browser_profile_folder)
        os.remove(copied_tar)
        logger.debug("BROWSER %i: Tarfile extracted" %
                     browser_params['crawl_id'])

        # clear and load flash cookies
        if load_flash:
            load_flash_files(logger, browser_params, tar_location)

        # load the browser settings
        browser_settings = load_browser_settings(tar_location)
    except Exception as ex:
        logger.critical(
            "BROWSER %i: Error: %s while attempting to load profile" %
            (browser_params['crawl_id'], str(ex)))
        raise ProfileLoadError('Profile Load not successful')

    return browser_settings
def __init__(self, manager_params, browser_params):
    """Keep the parameter dicts, connect to the logger and create the
    status/shutdown queues. The listener itself is not started here."""
    self.manager_params = manager_params
    self.browser_params = browser_params

    logger_address = manager_params['logger_address']
    self.logger = loggingclient(*logger_address)

    # Filled in once a listener exists
    self.listener_address = self.listener_process = None

    # Two independent queues
    self.status_queue = Queue()
    self.shutdown_queue = Queue()

    # Most recent status and the time it was received
    self._last_status = None
    self._last_status_received = 0.0
def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
                         suffix=''):
    """Save viewport-sized screenshots while scrolling down the page.

    Each part is written to `<screenshot_path>/parts/` with a name built
    from the visit id, the md5 of the current URL, the optional suffix,
    the part number and the scroll offset. Afterwards
    `_stitch_screenshot_parts` is called and its result is returned.

    :param visit_id: id of the current visit (used in the file names)
    :param crawl_id: id of the browser (used in log messages)
    :param driver: the webdriver instance to screenshot
    :param manager_params: dict providing 'logger_address' and
        'screenshot_path'
    :param suffix: optional string appended (after a '-') to the name
    """
    logger = loggingclient(*manager_params['logger_address'])

    outdir = os.path.join(manager_params['screenshot_path'], 'parts')
    # Several browser processes may get here at the same time, so create
    # the directory unconditionally and tolerate it already existing
    # instead of doing a racy isdir()/mkdir() check.
    try:
        os.makedirs(outdir)
    except OSError:
        if not os.path.isdir(outdir):
            raise

    if suffix != '':
        suffix = '-' + suffix
    urlhash = md5(driver.current_url.encode('utf-8')).hexdigest()
    # The doubled %% leaves two %i placeholders (part number and scroll
    # offset) to be filled in for every part.
    outname = os.path.join(
        outdir, '%i-%s%s-part-%%i-%%i.png' % (visit_id, urlhash, suffix))

    try:
        part = 0
        max_height = execute_script_with_retry(
            driver, 'return document.body.scrollHeight;')
        inner_height = execute_script_with_retry(
            driver, 'return window.innerHeight;')
        curr_scrollY = execute_script_with_retry(
            driver, 'return window.scrollY;')
        prev_scrollY = -1
        driver.save_screenshot(outname % (part, curr_scrollY))

        # Stop when the bottom is reached or scrolling no longer moves
        # the page.
        while ((curr_scrollY + inner_height) < max_height and
               curr_scrollY != prev_scrollY):
            # Scroll down to bottom of previous viewport
            try:
                driver.execute_script(
                    'window.scrollBy(0, window.innerHeight)')
            except WebDriverException:
                logger.info("BROWSER %i: WebDriverException while scrolling, "
                            "screenshot may be misaligned!" % crawl_id)

            # Update control variables
            part += 1
            prev_scrollY = curr_scrollY
            curr_scrollY = execute_script_with_retry(
                driver, 'return window.scrollY;')

            # Save screenshot
            driver.save_screenshot(outname % (part, curr_scrollY))
    except WebDriverException:
        excp = traceback.format_exception(*sys.exc_info())
        logger.error(
            "BROWSER %i: Exception while taking full page screenshot \n %s" %
            (crawl_id, ''.join(excp)))

    # NOTE(review): stitching also runs after a WebDriverException, using
    # whatever parts were saved before the failure -- confirm this is the
    # intended behaviour.
    return _stitch_screenshot_parts(visit_id, crawl_id, logger,
                                    manager_params)
def BrowserManager(command_queue, status_queue, browser_params,
                   crash_recovery):
    """Deploy a browser and execute commands from <command_queue>.

    Puts 'READY' on <status_queue> once the browser is set up, then "OK"
    after every successful command. The first failing command puts
    "FAILED" on the queue and ends the function.
    """
    logger = loggingclient(*browser_params['logger_address'])

    # Optional proxy; the queue passes the current site down to it
    proxy_site_queue = None
    if browser_params['proxy']:
        proxy_port, proxy_site_queue = deploy_mitm_proxy.init_proxy(
            browser_params['aggregator_address'],
            browser_params['logger_address'],
            browser_params['crawl_id'])
        browser_params['proxy'] = proxy_port

    # Virtual display (if necessary), webdriver and browser
    driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
        status_queue, browser_params, crash_recovery)

    # Extension port -- only read when the extension is enabled
    # TODO: This needs to be cleaner
    extension_socket = None
    if (browser_params['browser'] == 'firefox' and
            browser_params['extension']['enabled']):
        logger.debug("BROWSER %i: Looking for extension port information "
                     "in %s" % (browser_params['crawl_id'], prof_folder))
        port_file_path = prof_folder + 'extension_port.txt'
        # Waits (without a timeout) for the port file to show up
        while not os.path.isfile(port_file_path):
            time.sleep(0.1)
        time.sleep(0.5)
        with open(port_file_path, 'r') as port_file:
            ext_port = port_file.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(ext_port))

    # Signal the TaskManager that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # Accept commands until one of them fails
    while True:
        # Poll with a short sleep while there is nothing to do
        while command_queue.empty():
            time.sleep(0.001)

        # Command tuple of the form (command, arg0, arg1, ..., argN)
        command = command_queue.get()
        logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                    (browser_params['crawl_id'], str(command)))

        # Report OK on success; on any failure tell the TaskManager so it
        # can kill and restart its worker processes.
        try:
            command_executor.execute_command(
                command, driver, proxy_site_queue, browser_settings,
                browser_params, extension_socket)
            status_queue.put("OK")
        except Exception as e:
            logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s" %
                        (browser_params['crawl_id'], str(type(e)), str(e)))
            status_queue.put("FAILED")
            return
def browse_website(url, num_links, sleep, visit_id, webdriver,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links
    visited.

    Visiting a link is best effort: any exception raised while clicking,
    waiting or navigating back is logged and the loop moves on to the next
    link. The loop stops early when no displayed intra-site link is found.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, browser_params,
                extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        links = [
            x for x in get_intra_links(webdriver, url)
            if is_displayed(x) is True
        ]
        if not links:
            break

        # Index computation left unchanged so that seeded runs keep
        # drawing from the random stream in the same way.
        link = links[int(random.random() * len(links))]
        logger.info("BROWSER %i: visiting internal link %s" %
                    (browser_params['crawl_id'], link.get_attribute("href")))

        try:
            link.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception as e:
            # Still best effort, but leave a trace instead of silently
            # swallowing the failure.
            logger.info(
                "BROWSER %i: Ignoring error while visiting internal link: %s"
                % (browser_params['crawl_id'], repr(e)))
def BrowserManager(command_queue, status_queue, browser_params,
                   manager_params, crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.

    Messages put on <status_queue>:
      * ('STATUS', 'Proxy Ready', 'READY') once the proxy is started
      * ('STATUS', 'Browser Ready', 'READY') once the browser is set up
      * "OK" after every successfully executed command
      * ('CRITICAL', <pickled exc_info>) on ProfileLoadError,
        BrowserConfigError or AssertionError
      * ('FAILED', None) on any other exception
    The function returns after sending 'CRITICAL' or 'FAILED'.
    """
    # Connect to the logger before entering the try block: both exception
    # handlers below log through it, so it has to be bound even when the
    # very first statement inside the try fails.
    logger = loggingclient(*manager_params['logger_address'])

    try:
        # Start the proxy
        proxy_site_queue = None  # used to pass the current site down to the proxy
        if browser_params['proxy']:
            (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(
                browser_params, manager_params, status_queue)
            browser_params['proxy'] = local_port
        status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtualdisplay (if necessary), webdriver, and browser
        (driver, prof_folder,
         browser_settings) = deploy_browser.deploy_browser(
            status_queue, browser_params, manager_params, crash_recovery)

        # Read the extension port -- if extension is enabled
        # TODO: This needs to be cleaner
        if (browser_params['browser'] == 'firefox' and
                browser_params['extension_enabled']):
            logger.debug(
                "BROWSER %i: Looking for extension port information in %s" %
                (browser_params['crawl_id'], prof_folder))
            # NOTE: this wait has no timeout of its own
            while not os.path.isfile(prof_folder + 'extension_port.txt'):
                time.sleep(0.1)
            time.sleep(0.5)
            with open(prof_folder + 'extension_port.txt', 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        # now, the TaskManager knows that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep briefly before polling again
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form
            # (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))

            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, the handlers below tell
            # the TaskManager to kill and restart its worker processes
            command_executor.execute_command(
                command, driver, proxy_site_queue, browser_settings,
                browser_params, manager_params, extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', cPickle.dumps(err_info)))
        return

    except Exception as e:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info(
            "BROWSER %i: Crash in driver, restarting browser manager \n %s" %
            (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
def dump_profile(browser_profile_folder, manager_params, browser_params,
                 tar_location, close_webdriver, webdriver=None,
                 browser_settings=None, save_flash=False, compress=False):
    """
    dumps a browser profile currently stored in <browser_profile_folder> to
    <tar_location> in which both folders are absolute paths.

    Only a fixed list of storage files and directories is archived (see
    `storage_vector_files` / `storage_vector_dirs` below). Entries that are
    not present in the profile folder are skipped.

    :param browser_profile_folder: folder holding the live profile
    :param manager_params: dict providing 'logger_address'
    :param browser_params: dict providing 'crawl_id' (used in log messages)
    :param tar_location: folder the archive is written to (created if
        missing); an existing archive of the same name is replaced
    :param close_webdriver: if True, <webdriver> is closed and
        `sleep_until_sqlite_checkpoint` is called before archiving
    :param webdriver: the webdriver to close; must be given when
        <close_webdriver> is True
    :param browser_settings: if not None, saved via `save_browser_settings`
    :param save_flash: specifies whether to dump flash files
    :param compress: write `profile.tar.gz` instead of `profile.tar`
    """
    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # ensures that folder paths end with slashes
    if browser_profile_folder[-1] != '/':
        browser_profile_folder = browser_profile_folder + "/"
    if tar_location[-1] != '/':
        tar_location = tar_location + "/"

    if not os.path.exists(tar_location):
        os.makedirs(tar_location)

    if compress:
        tar_name, tar_mode = 'profile.tar.gz', 'w:gz'
    else:
        tar_name, tar_mode = 'profile.tar', 'w'

    # see if this file exists first
    # if it does, delete it before we try to save the current session
    if os.path.isfile(tar_location + tar_name):
        os.remove(tar_location + tar_name)

    # if this is a dump on close, close the webdriver and wait for checkpoint
    if close_webdriver:
        webdriver.close()
        sleep_until_sqlite_checkpoint(browser_profile_folder)

    storage_vector_files = [
        'cookies.sqlite',  # cookies
        'cookies.sqlite-shm',
        'cookies.sqlite-wal',
        'places.sqlite',  # history
        'places.sqlite-shm',
        'places.sqlite-wal',
        'webappsstore.sqlite',  # localStorage
        'webappsstore.sqlite-shm',
        'webappsstore.sqlite-wal',
    ]
    storage_vector_dirs = [
        'webapps',  # related to localStorage?
        'storage'  # directory for IndexedDB
    ]

    # backup and tar profile
    # The context manager closes the archive even if adding a member fails.
    with tarfile.open(tar_location + tar_name, tar_mode,
                      errorlevel=1) as tar:
        logger.debug("BROWSER %i: Backing up full profile from %s to %s" %
                     (browser_params['crawl_id'], browser_profile_folder,
                      tar_location + tar_name))

        for item in storage_vector_files:
            full_path = os.path.join(browser_profile_folder, item)
            if not os.path.isfile(full_path):
                # shm/wal files are just checkpoint files, so their absence
                # is not worth reporting.
                if not item.endswith(('shm', 'wal')):
                    logger.critical(
                        "BROWSER %i: %s NOT FOUND IN profile folder, "
                        "skipping." %
                        (browser_params['crawl_id'], full_path))
                # Actually skip the entry: adding a missing path to the
                # archive would raise.
                continue
            tar.add(full_path, arcname=item)

        for item in storage_vector_dirs:
            full_path = os.path.join(browser_profile_folder, item)
            if not os.path.isdir(full_path):
                logger.warning(
                    "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                    (browser_params['crawl_id'], full_path))
                continue
            tar.add(full_path, arcname=item)

    # save flash cookies
    if save_flash:
        save_flash_files(logger, browser_params, tar_location)

    # save the browser settings
    if browser_settings is not None:
        save_browser_settings(tar_location, browser_settings)
def BrowserManager(command_queue, status_queue, browser_params,
                   manager_params, crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.

    Messages put on <status_queue>:
      * ('STATUS', 'Proxy Ready', 'READY') once the proxy is started
      * ('STATUS', 'Browser Ready', 'READY') once the browser is set up
      * "OK" after every successfully executed command
      * ('CRITICAL', <pickled exc_info>) on ProfileLoadError,
        BrowserConfigError or AssertionError
      * ('FAILED', None) on any other exception
    The function returns after sending 'CRITICAL' or 'FAILED'.
    """
    # The logger is created outside the try block because both exception
    # handlers use it; creating it inside would leave `logger` unbound in
    # the handlers if the connection itself failed.
    logger = loggingclient(*manager_params['logger_address'])

    try:
        # Start the proxy
        proxy_site_queue = None  # used to pass the current site down to the proxy
        if browser_params['proxy']:
            (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(
                browser_params, manager_params, status_queue)
            browser_params['proxy'] = local_port
        status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtualdisplay (if necessary), webdriver, and browser
        (driver, prof_folder,
         browser_settings) = deploy_browser.deploy_browser(
            status_queue, browser_params, manager_params, crash_recovery)

        # Read the extension port -- if extension is enabled
        # TODO: This needs to be cleaner
        if (browser_params['browser'] == 'firefox' and
                browser_params['extension_enabled']):
            logger.debug(
                "BROWSER %i: Looking for extension port information in %s" %
                (browser_params['crawl_id'], prof_folder))
            # NOTE: this wait has no timeout of its own
            while not os.path.isfile(prof_folder + 'extension_port.txt'):
                time.sleep(0.1)
            time.sleep(0.5)
            with open(prof_folder + 'extension_port.txt', 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        # now, the TaskManager knows that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep briefly before polling again
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form
            # (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))

            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, the handlers below tell
            # the TaskManager to kill and restart its worker processes
            command_executor.execute_command(
                command, driver, proxy_site_queue, browser_settings,
                browser_params, manager_params, extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', cPickle.dumps(err_info)))
        return

    except Exception as e:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s" %
                    (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
def deploy_firefox(status_queue, browser_params, manager_params,
                   crash_recovery):
    """
    launches a firefox instance with parameters set by the input dictionary

    Progress is reported on <status_queue> as ('STATUS', <step>, <payload>)
    tuples, in this order: 'Profile Created', 'Profile Tar', 'Display',
    'Launch Attempted', 'Browser Launched'.

    Returns a tuple of (driver, profile directory reported by the driver's
    "moz:profile" capability, profile settings dict).
    """
    firefox_binary_path = get_firefox_binary_path()
    geckodriver_executable_path = get_geckodriver_exec_path()

    here = os.path.dirname(__file__)  # directory of this file
    logger = loggingclient(*manager_params['logger_address'])

    # Stay None unless a virtual display is started below
    display_pid = None
    display_port = None

    profile = FirefoxProfile()
    browser_profile_path = profile.path + '/'
    status_queue.put(('STATUS', 'Profile Created', browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    options = Options()

    # Imported browser settings
    profile_settings = None
    if browser_params['profile_tar']:
        if not crash_recovery:
            logger.debug(
                "BROWSER %i: Loading initial browser profile from: %s" %
                (browser_params['crawl_id'], browser_params['profile_tar']))
            load_flash = browser_params['disable_flash'] is False
            profile_settings = load_profile(
                browser_profile_path, manager_params, browser_params,
                browser_params['profile_tar'], load_flash=load_flash)
        else:
            logger.debug(
                "BROWSER %i: Loading recovered browser profile from: %s" %
                (browser_params['crawl_id'], browser_params['profile_tar']))
            profile_settings = load_profile(
                browser_profile_path, manager_params, browser_params,
                browser_params['profile_tar'])
    status_queue.put(('STATUS', 'Profile Tar', None))

    if browser_params['random_attributes'] and profile_settings is None:
        logger.debug("BROWSER %i: Loading random attributes for browser" %
                     browser_params['crawl_id'])

        # choose a random screen-res from list
        res_path = os.path.join(here, 'screen_resolutions.txt')
        with open(res_path, 'r') as res_file:
            resolutions = [tuple(entry.strip().split(','))
                           for entry in res_file]
        screen_res = random.choice(resolutions)

        # set a random user agent from list
        ua_path = os.path.join(here, 'user_agent_strings.txt')
        with open(ua_path, 'r') as ua_file:
            ua_strings = [entry.strip() for entry in ua_file]
        ua_string = random.choice(ua_strings)

        profile_settings = {'screen_res': screen_res,
                            'ua_string': ua_string}

    # If profile settings still not set - set defaults
    if profile_settings is None:
        profile_settings = {'screen_res': DEFAULT_SCREEN_RES,
                            'ua_string': None}

    if profile_settings['ua_string'] is not None:
        logger.debug(
            "BROWSER %i: Overriding user agent string to '%s'" %
            (browser_params['crawl_id'], profile_settings['ua_string']))
        options.set_preference("general.useragent.override",
                               profile_settings['ua_string'])

    if browser_params['headless']:
        display = Display(visible=0, size=profile_settings['screen_res'])
        display.start()
        display_pid = display.pid
        display_port = display.cmd_param[-1][1:]
    status_queue.put(('STATUS', 'Display', (display_pid, display_port)))

    if browser_params['extension_enabled']:
        # Write config file: the browser params plus a few manager params
        extension_config = dict(browser_params)
        extension_config['logger_address'] = manager_params['logger_address']
        extension_config['aggregator_address'] = \
            manager_params['aggregator_address']
        # None when the manager has no 'ldb_address'
        extension_config['leveldb_address'] = \
            manager_params.get('ldb_address')
        extension_config['testing'] = manager_params['testing']

        ext_config_file = browser_profile_path + 'browser_params.json'
        with open(ext_config_file, 'w') as config_file:
            json.dump(extension_config, config_file)
        logger.debug("BROWSER %i: Saved extension config file to: %s" %
                     (browser_params['crawl_id'], ext_config_file))

        # TODO restore detailed logging

    # Disable flash
    if browser_params['disable_flash']:
        options.set_preference('plugin.state.flash', 0)
    else:
        options.set_preference('plugin.state.flash', 2)
        options.set_preference('plugins.click_to_play', False)

    # Configure privacy settings
    configure_firefox.privacy(browser_params, profile, options, here,
                              browser_profile_path)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(options)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger. This will also inform us where the real profile
    # directory is hiding.
    interceptor = FirefoxLogInterceptor(browser_params['crawl_id'], logger,
                                        browser_profile_path)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for pref_name, pref_value in browser_params['prefs'].items():
        logger.info("BROWSER %i: Setting custom preference: %s = %s" %
                    (browser_params['crawl_id'], pref_name, pref_value))
        options.set_preference(pref_name, pref_value)

    # Launch the webdriver
    status_queue.put(('STATUS', 'Launch Attempted', None))
    binary = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(
        firefox_profile=profile,
        firefox_binary=binary,
        executable_path=geckodriver_executable_path,
        firefox_options=options,
        log_path=interceptor.fifo)

    # Add extension
    if browser_params['extension_enabled']:
        # Install extension
        ext_loc = os.path.normpath(
            os.path.join(here, '../Extension/firefox/openwpm.xpi'))
        driver.install_addon(ext_loc, temporary=True)
        logger.debug("BROWSER %i: OpenWPM Firefox extension loaded" %
                     browser_params['crawl_id'])

    # set window size
    driver.set_window_size(*profile_settings['screen_res'])

    # Get browser process pid
    if hasattr(driver, 'service') and hasattr(driver.service, 'process'):
        pid = driver.service.process.pid
    elif hasattr(driver, 'binary') and hasattr(driver.binary, 'process'):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(
        ('STATUS', 'Browser Launched', (int(pid), profile_settings)))

    return driver, driver.capabilities["moz:profile"], profile_settings
def BrowserManager(command_queue, status_queue, browser_params,
                   manager_params, crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.

    Messages put on <status_queue>:
      * ('STATUS', 'Browser Ready', (prof_folder, 'READY')) once the
        browser is set up
      * "OK" after every successfully executed command
      * ('CRITICAL', <pickled exc_info>) on ProfileLoadError,
        BrowserConfigError or AssertionError
      * ('FAILED', None) on any other exception
    The function returns after sending 'CRITICAL' or 'FAILED'.
    """
    # Connect to the logger before entering the try block: both exception
    # handlers below log through it, so it has to be bound even when the
    # very first statement inside the try fails.
    logger = loggingclient(*manager_params['logger_address'])

    try:
        # Start the virtualdisplay (if necessary), webdriver, and browser
        driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
            status_queue, browser_params, manager_params, crash_recovery)
        if prof_folder[-1] != '/':
            prof_folder += '/'

        # Read the extension port -- if extension is enabled
        # TODO: Initial communication from extension to TM should use sockets
        if (browser_params['browser'] == 'firefox' and
                browser_params['extension_enabled']):
            logger.debug("BROWSER %i: Looking for extension port information "
                         "in %s" % (browser_params['crawl_id'], prof_folder))
            elapsed = 0
            port = None
            ep_filename = os.path.join(prof_folder, 'extension_port.txt')
            # Retry for roughly 5 seconds (the counter only accounts for
            # the sleeps).
            while elapsed < 5:
                try:
                    with open(ep_filename, 'rt') as f:
                        port = int(f.read().strip())
                    break
                except IOError as e:
                    # A missing file just means "not yet"; anything else
                    # is a real error.
                    if e.errno != errno.ENOENT:
                        raise
                except ValueError:
                    # The file exists but does not hold a parseable port
                    # (e.g. it is still empty) -- retry as well.
                    pass
                time.sleep(0.1)
                elapsed += 0.1
            if port is None:
                # try one last time, allowing all exceptions to propagate
                with open(ep_filename, 'rt') as f:
                    port = int(f.read().strip())

            logger.debug("BROWSER %i: Connecting to extension on port %i" %
                         (browser_params['crawl_id'], port))
            extension_socket = clientsocket(serialization='json')
            # `port` is already an int at this point
            extension_socket.connect('127.0.0.1', port)
        else:
            extension_socket = None

        logger.debug("BROWSER %i: BrowserManager ready." %
                     browser_params['crawl_id'])

        # passes the profile folder back to the
        # TaskManager to signal a successful startup
        status_queue.put(('STATUS', 'Browser Ready', (prof_folder, 'READY')))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep briefly before polling again
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form:
            # (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))

            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, the handlers below tell
            # the TaskManager to kill and restart its worker processes
            command_executor.execute_command(
                command, driver, browser_settings, browser_params,
                manager_params, extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', pickle.dumps(err_info)))
        return

    except Exception:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info("BROWSER %i: Crash in driver, restarting browser manager "
                    "\n %s" % (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
def BrowserManager(command_queue, status_queue, browser_params,
                   crash_recovery):
    """Set up a browser, then serve commands taken from <command_queue>.

    'READY' is put on <status_queue> after setup, "OK" after each command
    that succeeds, and "FAILED" when a command raises -- which also ends
    the function.
    """
    logger = loggingclient(*browser_params['logger_address'])

    # Start the proxy (if requested). The site queue is used to pass the
    # current site down to the proxy.
    proxy_site_queue = None
    if browser_params['proxy']:
        proxy_info = deploy_mitm_proxy.init_proxy(
            browser_params['aggregator_address'],
            browser_params['logger_address'],
            browser_params['crawl_id'])
        (listen_port, proxy_site_queue) = proxy_info
        browser_params['proxy'] = listen_port

    # Start the virtualdisplay (if necessary), webdriver, and browser
    deployed = deploy_browser.deploy_browser(status_queue, browser_params,
                                             crash_recovery)
    (driver, prof_folder, browser_settings) = deployed

    # Connect to the extension when it is enabled
    # TODO: This needs to be cleaner
    extension_socket = None
    if (browser_params['browser'] == 'firefox' and
            browser_params['extension']['enabled']):
        logger.debug(
            "BROWSER %i: Looking for extension port information in %s" %
            (browser_params['crawl_id'], prof_folder))
        port_path = prof_folder + 'extension_port.txt'
        # No timeout here: keeps polling until the file exists
        while not os.path.isfile(port_path):
            time.sleep(0.1)
        time.sleep(0.5)
        with open(port_path, 'r') as handle:
            port_text = handle.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port_text))

    # now, the TaskManager knows that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # Serve commands until one of them fails
    while True:
        # Nothing queued -> poll again after a short sleep
        while command_queue.empty():
            time.sleep(0.001)

        # (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                    (browser_params['crawl_id'], str(command)))

        # Try the command and acknowledge it; on failure notify the
        # TaskManager so it can kill and restart its worker processes.
        try:
            command_executor.execute_command(
                command, driver, proxy_site_queue, browser_settings,
                browser_params, extension_socket)
            status_queue.put("OK")
        except Exception as e:
            logger.info(
                "BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s" %
                (browser_params['crawl_id'], str(type(e)), str(e)))
            status_queue.put("FAILED")
            return