def __init__(self, db_path, browser_params, num_browsers, task_description=None):
    # sets up the information needed to write to the database
    self.desc = task_description
    self.db_path = db_path

    # sets up the crawl data database
    self.db = sqlite3.connect(db_path)
    with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
        self.db.executescript(f.read())

    # prepares browser settings
    self.num_browsers = num_browsers
    # special case: for a singleton dictionary, perform deep copies so that
    # the number of dicts is <num_browsers>
    if type(browser_params) is not list:
        browser_params = [copy.deepcopy(browser_params) for i in xrange(0, num_browsers)]
    if len(browser_params) != num_browsers:
        raise Exception("Number of browser parameter dictionaries is not the same as <num_browsers>")

    # sets up the DataAggregator + associated queues
    self.aggregator_status_queue = None  # queue used for sending a graceful KILL command to the DataAggregator
    self.data_aggregator = self.launch_data_aggregator()
    self.aggregator_address = self.aggregator_status_queue.get()  # socket location: (address, port)

    # open client socket
    self.sock = clientsocket()
    self.sock.connect(self.aggregator_address[0], self.aggregator_address[1])

    # update task table
    cur = self.db.cursor()
    cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc,))
    self.db.commit()
    self.task_id = cur.lastrowid

    # sets up the BrowserManager(s) + associated queues
    self.browsers = self.initialize_browsers(browser_params)  # list of the Browser(s)
def dump_profile_cookies(start_time, visit_id, webdriver, browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to the database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` command
    that creates these changes.

    Note that the extension's cookieInstrument is preferred to this
    approach, as this is likely to miss changes still present in the
    sqlite `wal` files. This will likely be removed in a future version.
    """
    # Set up a connection to the DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Cookies
    rows = get_cookies(browser_params['profile_path'], start_time)
    if rows is not None:
        for row in rows:
            data = dict(row)
            data["crawl_id"] = browser_params['crawl_id']
            data["visit_id"] = visit_id
            sock.send(("profile_cookies", data))

    # Close connection to the aggregator
    sock.close()
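
# A minimal sketch of what a helper like `get_cookies` might do, assuming
# Firefox's standard `moz_cookies` table and its microsecond `lastAccessed`
# column; the real helper in this codebase may differ.
import os
import sqlite3

def get_cookies_sketch(profile_path, start_time):
    """Return cookie rows touched at or after `start_time` (epoch seconds)."""
    db_path = os.path.join(profile_path, 'cookies.sqlite')
    if not os.path.isfile(db_path):
        return None
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row  # rows behave like mappings, matching dict(row) above
    try:
        # lastAccessed is stored in microseconds since the epoch
        return db.execute(
            "SELECT * FROM moz_cookies WHERE lastAccessed >= ?",
            (int(start_time * 1e6),)).fetchall()
    finally:
        db.close()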
def __init__(self, db_path, browser_params, num_browsers, log_file='~/openwpm.log',
             process_watchdog=False, task_description=None):
    # Flow control
    self.closing = False
    self.failure_flag = False
    self.threadlock = threading.Lock()
    self.failurecount = 0

    # sets up the information needed to write to the database
    self.desc = task_description
    self.db_path = db_path
    self.log_file = log_file
    self.process_watchdog = process_watchdog

    # sets up the crawl data database
    self.db = sqlite3.connect(db_path)
    with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
        self.db.executescript(f.read())

    # prepares browser settings
    self.num_browsers = num_browsers
    # special case: for a singleton dictionary, perform deep copies so that
    # the number of dicts is <num_browsers>
    if type(browser_params) is not list:
        browser_params = [copy.deepcopy(browser_params) for i in xrange(0, num_browsers)]
    if len(browser_params) != num_browsers:
        raise Exception("Number of browser parameter dictionaries is not the same as <num_browsers>")

    # sets up logging server + connect a client
    self.logging_status_queue = None
    self.loggingserver = self._launch_loggingserver()
    self.logger_address = self.logging_status_queue.get()  # socket location: (address, port)
    self.logger = MPLogger.loggingclient(*self.logger_address)

    # sets up the DataAggregator + associated queues
    self.aggregator_status_queue = None  # queue used for sending a graceful KILL command to the DataAggregator
    self.data_aggregator = self._launch_data_aggregator()
    self.aggregator_address = self.aggregator_status_queue.get()  # socket location: (address, port)

    # open client socket
    self.sock = clientsocket()
    self.sock.connect(self.aggregator_address[0], self.aggregator_address[1])

    # update task table
    cur = self.db.cursor()
    cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc,))
    self.db.commit()
    self.task_id = cur.lastrowid

    # sets up the BrowserManager(s) + associated queues
    self.browsers = self._initialize_browsers(browser_params)  # list of the Browser(s)
    self._launch_browsers()

    # start the manager watchdog
    thread = threading.Thread(target=self._manager_watchdog, args=())
    thread.daemon = True
    thread.start()
def BrowserManager(command_queue, status_queue, browser_params, crash_recovery):
    logger = loggingclient(*browser_params['logger_address'])

    # Start the proxy
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(browser_params['aggregator_address'],
                                                                      browser_params['logger_address'],
                                                                      browser_params['crawl_id'])
        browser_params['proxy'] = local_port

    # Start the virtual display (if necessary), webdriver, and browser
    (driver, prof_folder, browser_settings) = deploy_browser.deploy_browser(status_queue,
                                                                            browser_params,
                                                                            crash_recovery)

    # Read the extension port -- if the extension is enabled
    # TODO: This needs to be cleaner
    if browser_params['browser'] == 'firefox' and browser_params['extension']['enabled']:
        logger.debug("BROWSER %i: Looking for extension port information in %s"
                     % (browser_params['crawl_id'], prof_folder))
        while not os.path.isfile(prof_folder + 'extension_port.txt'):
            time.sleep(0.1)
        time.sleep(0.5)
        with open(prof_folder + 'extension_port.txt', 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))
    else:
        extension_socket = None

    # signals to the TaskManager that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        # no command for now -> sleep to avoid pegging CPU on blocking get
        if command_queue.empty():
            time.sleep(0.001)
            continue

        # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        logger.info("BROWSER %i: EXECUTING COMMAND: %s"
                    % (browser_params['crawl_id'], str(command)))

        # attempts to perform an action and return an OK signal
        # if the command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
        try:
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params, extension_socket)
            status_queue.put("OK")
        except Exception as e:
            logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s"
                        % (browser_params['crawl_id'], str(type(e)), str(e)))
            status_queue.put("FAILED")
            break
def BrowserManager(command_queue, status_queue, browser_params, crash_recovery):
    # sets up the proxy (for now, mitmproxy) if necessary
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(browser_params['aggregator_address'],
                                                                      browser_params['crawl_id'])
        browser_params['proxy'] = local_port

    # Gets the WebDriver, profile folder (i.e. where history/cookies are stored)
    # and display pid (None if not headless)
    (driver, prof_folder, display_pid, browser_settings) = deploy_browser.deploy_browser(browser_params,
                                                                                         crash_recovery)

    # Read the extension port -- if the extension is enabled
    # TODO: This needs to be cleaner
    if browser_params['browser'] == 'firefox' and browser_params['extension']['enabled']:
        while not os.path.isfile(prof_folder + 'extension_port.txt'):
            time.sleep(0.01)
        with open(prof_folder + 'extension_port.txt', 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))
    else:
        extension_socket = None

    # passes the profile folder, WebDriver pid and display pid back to the TaskManager
    # now the TaskManager knows that the browser is successfully set up
    status_queue.put((prof_folder, int(driver.binary.process.pid), display_pid, browser_settings))
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        # no command for now -> sleep to avoid pegging CPU on blocking get
        if command_queue.empty():
            time.sleep(0.001)
            continue

        # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        print "EXECUTING COMMAND: " + str(command)

        # attempts to perform an action and return an OK signal
        # if the command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
        try:
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params, extension_socket)
            status_queue.put("OK")
        except Exception as ex:
            print "CRASH IN DRIVER ORACLE:" + str(ex) + " RESTARTING BROWSER MANAGER"
            status_queue.put("FAILED")
            break
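
# A minimal sketch (assumed names, not this codebase's actual TaskManager side)
# of the parent half of the loop above: push one command tuple onto
# `command_queue`, then block on `status_queue` for the "OK"/"FAILED" reply.
def dispatch_command_sketch(command_queue, status_queue, command, timeout=60):
    """Send one command tuple and wait for the browser's status reply."""
    command_queue.put(command)  # e.g. ('GET', 'http://example.com')
    try:
        status = status_queue.get(timeout=timeout)  # raises queue.Empty on timeout
    except Exception:
        return 'hang'  # a silent browser is treated as hung and restarted
    return 'ok' if status == "OK" else 'restart'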
def _launch_aggregators(self):
    """Launch the necessary data aggregators"""
    if self.manager_params["output_format"] == "local":
        self.data_aggregator = LocalAggregator.LocalAggregator(
            self.manager_params, self.browser_params)
    elif self.manager_params["output_format"] == "s3":
        self.data_aggregator = S3Aggregator.S3Aggregator(
            self.manager_params, self.browser_params)
    else:
        raise Exception("Unrecognized output format: %s" %
                        self.manager_params["output_format"])
    self.data_aggregator.launch()
    self.manager_params['aggregator_address'] = \
        self.data_aggregator.listener_address

    # open connection to the aggregator for saving crawl details
    self.sock = clientsocket(serialization='dill')
    self.sock.connect(*self.manager_params['aggregator_address'])
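
# The same dispatch, sketched as a lookup table. This assumes the aggregator
# classes share the (manager_params, browser_params) constructor shown above;
# `AGGREGATOR_CLASSES` and `make_aggregator` are illustrative names.
AGGREGATOR_CLASSES = {
    "local": LocalAggregator.LocalAggregator,
    "s3": S3Aggregator.S3Aggregator,
}

def make_aggregator(manager_params, browser_params):
    try:
        cls = AGGREGATOR_CLASSES[manager_params["output_format"]]
    except KeyError:
        raise Exception("Unrecognized output format: %s" %
                        manager_params["output_format"])
    return cls(manager_params, browser_params)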
def collect_links(table_name, scheme, **kwargs):
    """ Collect links with `scheme` and save them in table `table_name` """
    driver = kwargs['driver']
    manager_params = kwargs['manager_params']
    # guard against anchors without an href, for which get_attribute returns None
    link_urls = [
        x for x in (element.get_attribute("href")
                    for element in driver.find_elements_by_tag_name('a'))
        if x and x.startswith(scheme + '://')
    ]
    current_url = driver.current_url

    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    query = ("CREATE TABLE IF NOT EXISTS %s ("
             "top_url TEXT, link TEXT);" % table_name)
    sock.send(("create_table", query))

    for link in link_urls:
        query = (table_name, {"top_url": current_url, "link": link})
        sock.send(query)
    sock.close()
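
# A hedged usage sketch: the **kwargs access above suggests this is invoked by
# a custom-command hook that supplies `driver` and `manager_params` as keyword
# arguments. The table name and scheme here are illustrative.
collect_links('page_links', 'http',
              driver=driver, manager_params=manager_params)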
def dump_flash_cookies(start_time, visit_id, webdriver, browser_params, manager_params):
    """ Save newly changed Flash LSOs to the database

    We determine which LSOs to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` command
    that creates these changes.
    """
    # Set up a connection to the DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Flash cookies
    flash_cookies = get_flash_cookies(start_time)
    for cookie in flash_cookies:
        data = cookie._asdict()
        data["crawl_id"] = browser_params["crawl_id"]
        data["visit_id"] = visit_id
        sock.send(("flash_cookies", data))

    # Close connection to the aggregator
    sock.close()
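
# A minimal sketch of what a helper like `get_flash_cookies` might do on Linux,
# assuming the standard ~/.macromedia LSO location; parsing of the .sol payload
# is omitted and the FlashCookie fields are assumptions.
import os
from collections import namedtuple

FlashCookie = namedtuple('FlashCookie', ['local_path', 'key', 'content'])

def get_flash_cookies_sketch(mod_since):
    """Yield .sol files modified at or after `mod_since` (epoch seconds)."""
    lso_root = os.path.expanduser('~/.macromedia/Flash_Player/#SharedObjects')
    for dirpath, _, filenames in os.walk(lso_root):
        for name in filenames:
            if not name.endswith('.sol'):
                continue
            path = os.path.join(dirpath, name)
            if os.path.getmtime(path) >= mod_since:
                with open(path, 'rb') as f:
                    yield FlashCookie(path, name[:-4], f.read())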
def BrowserManager(command_queue, status_queue, browser_params, manager_params, crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the TaskManager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.
    """
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the proxy
        proxy_site_queue = None  # used to pass the current site down to the proxy
        if browser_params['proxy']:
            (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(browser_params,
                                                                          manager_params,
                                                                          status_queue)
            browser_params['proxy'] = local_port
            status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtual display (if necessary), webdriver, and browser
        (driver, prof_folder, browser_settings) = deploy_browser.deploy_browser(status_queue,
                                                                                browser_params,
                                                                                manager_params,
                                                                                crash_recovery)

        # Read the extension port -- if the extension is enabled
        # TODO: This needs to be cleaner
        if browser_params['browser'] == 'firefox' and browser_params['extension_enabled']:
            logger.debug("BROWSER %i: Looking for extension port information in %s"
                         % (browser_params['crawl_id'], prof_folder))
            while not os.path.isfile(prof_folder + 'extension_port.txt'):
                time.sleep(0.1)
            time.sleep(0.5)
            with open(prof_folder + 'extension_port.txt', 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        # signals to the TaskManager that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep to avoid pegging CPU on blocking get
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s"
                        % (browser_params['crawl_id'], str(command)))

            # attempts to perform an action and return an OK signal
            # if the command fails for whatever reason, tell the TaskManager to kill
            # and restart its worker processes
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params,
                                             manager_params, extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising"
                    % (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', cPickle.dumps(err_info)))
        return
    except Exception:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s"
                    % (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
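
# A hedged sketch of how the parent process might consume this status protocol.
# The handler name and return values are assumptions; the message shapes
# (('STATUS', ...), ('CRITICAL', pickled exc_info), ('FAILED', None), "OK")
# all appear in the function above.
def handle_status_sketch(status_queue):
    status = status_queue.get()
    if status == "OK":
        return 'ok'        # command completed successfully
    tag = status[0]
    if tag == 'STATUS':
        return 'starting'  # startup progress, e.g. ('STATUS', 'Browser Ready', 'READY')
    if tag == 'CRITICAL':
        return 'raise'     # payload is pickled sys.exc_info(); surface it to the caller
    if tag == 'FAILED':
        return 'restart'   # recoverable crash: relaunch the browser process
    raise ValueError("unexpected status message: %r" % (status,))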
def __init__(self, manager_params, browser_params, process_watchdog=False, task_description=None):
    # Make paths absolute in manager_params
    manager_params['data_directory'] = os.path.expanduser(manager_params['data_directory'])
    manager_params['log_directory'] = os.path.expanduser(manager_params['log_directory'])
    manager_params['database_name'] = os.path.join(manager_params['data_directory'],
                                                   manager_params['database_name'])
    manager_params['log_file'] = os.path.join(manager_params['log_directory'],
                                              manager_params['log_file'])
    self.manager_params = manager_params

    # Flow control
    self.closing = False
    self.failure_flag = False
    self.threadlock = threading.Lock()
    self.failurecount = 0

    self.desc = task_description
    self.process_watchdog = process_watchdog

    # sets up the crawl data database
    db_path = manager_params['database_name']
    if not os.path.exists(manager_params['data_directory']):
        os.mkdir(manager_params['data_directory'])
    self.db = sqlite3.connect(db_path)
    with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
        self.db.executescript(f.read())

    # check size of parameter dictionary
    self.num_browsers = manager_params['num_browsers']
    if len(browser_params) != self.num_browsers:
        raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

    # sets up logging server + connect a client
    self.logging_status_queue = None
    self.loggingserver = self._launch_loggingserver()
    # socket location: (address, port)
    self.manager_params['logger_address'] = self.logging_status_queue.get()
    self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

    # Mark if LDBAggregator is needed (if js is enabled on any browser)
    self.ldb_enabled = False
    for params in browser_params:
        if params['save_javascript']:
            self.ldb_enabled = True
            break

    # Initialize the data aggregators
    self._launch_aggregators()

    # open client socket
    self.sock = clientsocket()
    self.sock.connect(*self.manager_params['aggregator_address'])

    # update task table
    cur = self.db.cursor()
    cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc,))
    self.db.commit()
    self.task_id = cur.lastrowid

    # sets up the BrowserManager(s) + associated queues
    self.browsers = self._initialize_browsers(browser_params)  # list of the Browser(s)
    self._launch_browsers()

    # start the manager watchdog
    thread = threading.Thread(target=self._manager_watchdog, args=())
    thread.daemon = True
    thread.start()
def __init__(self, manager_params, browser_params, process_watchdog=False):
    # Make paths absolute in manager_params
    for path in ['data_directory', 'log_directory']:
        if manager_params[path] is not None:
            manager_params[path] = os.path.expanduser(manager_params[path])
    manager_params['database_name'] = os.path.join(manager_params['data_directory'],
                                                   manager_params['database_name'])
    manager_params['log_file'] = os.path.join(manager_params['log_directory'],
                                              manager_params['log_file'])
    manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'],
                                                     'screenshots')
    manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'],
                                                      'sources')
    self.manager_params = manager_params

    # Create data directories if they do not exist
    if not os.path.exists(manager_params['screenshot_path']):
        os.makedirs(manager_params['screenshot_path'])
    if not os.path.exists(manager_params['source_dump_path']):
        os.makedirs(manager_params['source_dump_path'])

    # check size of parameter dictionary
    self.num_browsers = manager_params['num_browsers']
    if len(browser_params) != self.num_browsers:
        raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

    # Flow control
    self.closing = False
    self.failure_status = None
    self.threadlock = threading.Lock()
    self.failurecount = 0
    if manager_params['failure_limit'] is not None:
        self.failure_limit = manager_params['failure_limit']
    else:
        self.failure_limit = self.num_browsers * 2 + 10
    self.process_watchdog = process_watchdog

    # sets up the crawl data database
    db_path = manager_params['database_name']
    if not os.path.exists(manager_params['data_directory']):
        os.mkdir(manager_params['data_directory'])
    self.db = sqlite3.connect(db_path)
    with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
        self.db.executescript(f.read())
    self.db.commit()

    # sets up logging server + connect a client
    self.logging_status_queue = None
    self.loggingserver = self._launch_loggingserver()
    # socket location: (address, port)
    self.manager_params['logger_address'] = self.logging_status_queue.get()
    self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

    # Mark if LDBAggregator is needed (if js is enabled on any browser)
    self.ldb_enabled = False
    for params in browser_params:
        if params['save_javascript'] or params['save_javascript_proxy']:
            self.ldb_enabled = True
            break

    # Initialize the data aggregators
    self._launch_aggregators()

    # open client socket
    self.sock = clientsocket(serialization='dill')
    self.sock.connect(*self.manager_params['aggregator_address'])

    self._save_configuration(browser_params)

    # read the last used site visit id
    cur = self.db.cursor()
    cur.execute("SELECT MAX(visit_id) FROM site_visits")
    last_visit_id = cur.fetchone()[0]
    if last_visit_id is None:
        last_visit_id = 0
    self.next_visit_id = last_visit_id + 1

    # sets up the BrowserManager(s) + associated queues
    self.browsers = self._initialize_browsers(browser_params)  # list of the Browser(s)
    self._launch_browsers()

    # start the manager watchdog
    thread = threading.Thread(target=self._manager_watchdog, args=())
    thread.daemon = True
    thread.start()
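
# A hedged usage sketch for the constructor above. `TaskManager` is the class
# name implied by the surrounding comments; only the keys this __init__
# actually reads are shown, and every value here is illustrative.
manager_params = {
    'data_directory': '~/openwpm_data',
    'log_directory': '~/openwpm_data',
    'database_name': 'crawl-data.sqlite',
    'log_file': 'openwpm.log',
    'num_browsers': 1,
    'failure_limit': None,  # None -> defaults to num_browsers * 2 + 10
}
browser_params = [{'save_javascript': False,
                   'save_javascript_proxy': False}]  # one dict per browser
manager = TaskManager(manager_params, browser_params)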
def BrowserManager(command_queue, status_queue, browser_params, manager_params, crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the TaskManager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.
    """
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the virtual display (if necessary), webdriver, and browser
        driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
            status_queue, browser_params, manager_params, crash_recovery)
        if prof_folder[-1] != '/':
            prof_folder += '/'

        # Read the extension port -- if the extension is enabled
        # TODO: Initial communication from extension to TM should use sockets
        if (browser_params['browser'] == 'firefox'
                and browser_params['extension_enabled']):
            logger.debug("BROWSER %i: Looking for extension port information "
                         "in %s" % (browser_params['crawl_id'], prof_folder))
            elapsed = 0
            port = None
            ep_filename = os.path.join(prof_folder, 'extension_port.txt')
            while elapsed < 5:
                try:
                    with open(ep_filename, 'rt') as f:
                        port = int(f.read().strip())
                    break
                except IOError as e:
                    if e.errno != errno.ENOENT:
                        raise
                time.sleep(0.1)
                elapsed += 0.1
            if port is None:
                # try one last time, allowing all exceptions to propagate
                with open(ep_filename, 'rt') as f:
                    port = int(f.read().strip())

            logger.debug("BROWSER %i: Connecting to extension on port %i" %
                         (browser_params['crawl_id'], port))
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        logger.debug("BROWSER %i: BrowserManager ready." %
                     browser_params['crawl_id'])

        # passes the profile folder back to the TaskManager
        # to signal a successful startup
        status_queue.put(('STATUS', 'Browser Ready', (prof_folder, 'READY')))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep to avoid pegging CPU on blocking get
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form:
            # (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))

            # attempts to perform an action and return an OK signal
            # if the command fails for whatever reason, tell the TaskManager to
            # kill and restart its worker processes
            command_executor.execute_command(command, driver, browser_settings,
                                             browser_params, manager_params,
                                             extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', pickle.dumps(err_info)))
        return
    except Exception:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info("BROWSER %i: Crash in driver, restarting browser manager "
                    "\n %s" % (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
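
# The extension-port wait above follows a reusable pattern: poll with a
# deadline, retrying only on "file not yet written", then make one final
# unguarded attempt so the real error propagates. A minimal generic sketch
# of that pattern (function name and defaults are illustrative):
import errno
import time

def read_when_ready(path, timeout=5.0, interval=0.1):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with open(path, 'rt') as f:
                return f.read()
        except IOError as e:
            if e.errno != errno.ENOENT:  # only a missing file is retryable
                raise
        time.sleep(interval)
    with open(path, 'rt') as f:  # final attempt; let any exception surface
        return f.read()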