Пример #1
0
    def __init__(self, manager_params, browser_params):
        """
        Initialise the bookkeeping attributes kept for one browser.

        <manager_params> must provide 'aggregator_address' and
        'logger_address'; <browser_params> must provide 'crawl_id'.
        Both dicts are also stored on the instance unchanged.
        """
        # Constants
        self._SPAWN_TIMEOUT = 120  # seconds
        self._UNSUCCESSFUL_SPAWN_LIMIT = 1

        # manager parameters
        self.current_profile_path = None
        self.db_socket_address = manager_params['aggregator_address']
        self.logger_address = manager_params['logger_address']
        self.crawl_id = browser_params['crawl_id']
        self.curr_visit_id = None
        self.browser_params = browser_params
        self.manager_params = manager_params

        # Queues and process IDs for BrowserManager, all unset for now:
        #   command_thread - thread to run commands issued from TaskManager
        #   command_queue  - queue for passing command tuples to
        #                    BrowserManager
        #   status_queue   - queue for receiving command execution status
        #                    from BrowserManager
        self.command_thread = self.command_queue = self.status_queue = None
        #   browser_pid    - pid for browser instance controlled by
        #                    BrowserManager
        #   display_pid    - pid of the display for the headless browser
        #                    (if it exists)
        #   display_port   - port of the display for the headless browser
        #                    (if it exists)
        self.browser_pid = self.display_pid = self.display_port = None

        # True while the BrowserManager is new (used to optimize restarts)
        self.is_fresh = True
        # True when the browser should be restarted
        self.restart_required = False

        #   current_timeout  - timeout of the current command
        #   browser_settings - dict of additional browser profile settings
        #                      (e.g. screen_res)
        #   browser_manager  - process that controls browser
        self.current_timeout = None
        self.browser_settings = None
        self.browser_manager = None

        # connection to loggingserver
        logger_address = self.logger_address
        self.logger = loggingclient(*logger_address)
Пример #2
0
    def __init__(self, manager_params, browser_params):
        """
        Record the parameter dicts and reset all per-browser state.

        Reads 'aggregator_address' and 'logger_address' from
        <manager_params> and 'crawl_id' from <browser_params>.
        """
        # Constants
        self._SPAWN_TIMEOUT = 120  # seconds
        self._UNSUCCESSFUL_SPAWN_LIMIT = 4

        # manager parameters
        self.current_profile_path = None
        self.db_socket_address = manager_params['aggregator_address']
        self.logger_address = manager_params['logger_address']
        self.crawl_id = browser_params['crawl_id']
        self.curr_visit_id = None
        self.browser_params = browser_params
        self.manager_params = manager_params

        # Queues and process IDs for BrowserManager

        # thread to run commands issued from TaskManager
        self.command_thread = None
        # queue for passing command tuples to BrowserManager
        self.command_queue = None
        # queue for receiving command execution status from BrowserManager
        self.status_queue = None
        # pid for browser instance controlled by BrowserManager
        self.browser_pid = None
        # pid / port of the display for the headless browser (if it exists)
        self.display_pid = self.display_port = None

        # is_fresh: whether the BrowserManager is new (used to optimize
        # restarts); restart_required: whether the browser should be
        # restarted
        self.is_fresh, self.restart_required = True, False

        # timeout of the current command
        self.current_timeout = None
        # dict of additional browser profile settings (e.g. screen_res)
        self.browser_settings = None
        # process that controls browser
        self.browser_manager = None
        # connection to loggingserver
        address = self.logger_address
        self.logger = loggingclient(*address)
Пример #3
0
 def __init__(self, status_queue, shutdown_queue, manager_params):
     """
     Keep the two queues, connect a logger using
     <manager_params>['logger_address'] and reset the internal state.
     """
     self.status_queue, self.shutdown_queue = status_queue, shutdown_queue

     logger_address = manager_params['logger_address']
     self.logger = loggingclient(*logger_address)

     self._shutdown_flag = False
     # last status update time
     self._last_update = time.time()
     # Initialized on `startup`
     self.record_queue = None
Пример #4
0
def load_profile(browser_profile_folder,
                 manager_params,
                 browser_params,
                 tar_location,
                 load_flash=False):
    """
    Loads a tarred cookie-based profile stored in <tar_location> and
    extracts it into <browser_profile_folder>.

    The archive used is <tar_location>/profile.tar.gz when that file exists,
    otherwise <tar_location>/profile.tar. The archive is copied into the
    profile folder, extracted there, and the copy is then removed.

    <manager_params> must provide 'logger_address' and <browser_params>
    must provide 'crawl_id' (used in log messages).
    If <load_flash> is true, load_flash_files() is called as well.

    Returns whatever load_browser_settings(<tar_location>) returns.

    Raises ProfileLoadError if any step fails; the underlying error is
    logged at critical level when a logger connection is available.
    """
    # Bound before the `try` so the handler below can always test it, even
    # when connecting to the logger is the step that failed.
    logger = None
    try:
        # Connect to logger
        logger = loggingclient(*manager_params['logger_address'])

        # ensures that folder paths end with slashes
        # (indexing an empty path raises and is reported as a load error)
        if browser_profile_folder[-1] != '/':
            browser_profile_folder = browser_profile_folder + "/"
        if tar_location[-1] != '/':
            tar_location = tar_location + "/"

        if os.path.isfile(tar_location + 'profile.tar.gz'):
            tar_name = 'profile.tar.gz'
            tar_mode = 'r:gz'
        else:
            tar_name = 'profile.tar'
            tar_mode = 'r'

        # Copy and untar the loaded profile
        logger.debug("BROWSER %i: Copying profile tar from %s to %s" %
                     (browser_params['crawl_id'], tar_location + tar_name,
                      browser_profile_folder))
        shutil.copy(tar_location + tar_name, browser_profile_folder)

        # The context manager closes the archive even if extraction fails
        with tarfile.open(browser_profile_folder + tar_name,
                          tar_mode,
                          errorlevel=1) as f:
            f.extractall(browser_profile_folder)
        os.remove(browser_profile_folder + tar_name)
        logger.debug("BROWSER %i: Tarfile extracted" %
                     browser_params['crawl_id'])

        # clear and load flash cookies
        if load_flash:
            load_flash_files(logger, browser_params, tar_location)

        # load the browser settings
        browser_settings = load_browser_settings(tar_location)
    except Exception as ex:
        if logger is not None:
            logger.critical(
                "BROWSER %i: Error: %s while attempting to load profile" %
                (browser_params['crawl_id'], str(ex)))
        raise ProfileLoadError('Profile Load not successful')

    return browser_settings
Пример #5
0
 def __init__(self, manager_params, browser_params):
     """
     Store both parameter dicts, connect a logger using
     <manager_params>['logger_address'] and create fresh status and
     shutdown queues. The listener attributes start out unset.
     """
     self.manager_params, self.browser_params = (manager_params,
                                                 browser_params)

     logger_address = manager_params['logger_address']
     self.logger = loggingclient(*logger_address)

     # Nothing is known about the listener yet
     self.listener_address = self.listener_process = None

     self.status_queue = Queue()
     self.shutdown_queue = Queue()

     # No status has been received so far
     self._last_status = None
     self._last_status_received = 0.0
Пример #6
0
def screenshot_full_page(visit_id,
                         crawl_id,
                         driver,
                         manager_params,
                         suffix=''):
    """
    Capture the current page as a series of viewport-sized screenshots and
    hand them to _stitch_screenshot_parts().

    The parts are written to <manager_params>['screenshot_path']/parts as
    '<visit_id>-<md5 of current url><-suffix>-part-<n>-<scrollY>.png'.
    The page is scrolled one viewport at a time until the bottom is reached
    or scrolling stops changing window.scrollY.

    A WebDriverException outside of the scroll call itself is logged and
    ends the function without stitching.
    """
    logger = loggingclient(*manager_params['logger_address'])

    parts_dir = os.path.join(manager_params['screenshot_path'], 'parts')
    if not os.path.isdir(parts_dir):
        os.mkdir(parts_dir)

    if suffix != '':
        suffix = '-' + suffix
    url_digest = md5(driver.current_url.encode('utf-8')).hexdigest()
    # The two escaped %%i placeholders are filled in per part below
    name_template = '%i-%s%s-part-%%i-%%i.png' % (visit_id, url_digest,
                                                  suffix)
    part_path = os.path.join(parts_dir, name_template)

    try:
        index = 0
        page_height = execute_script_with_retry(
            driver, 'return document.body.scrollHeight;')
        viewport_height = execute_script_with_retry(
            driver, 'return window.innerHeight;')
        offset = execute_script_with_retry(driver, 'return window.scrollY;')
        last_offset = -1
        driver.save_screenshot(part_path % (index, offset))

        while True:
            # Stop once the viewport reaches the bottom of the page ...
            if not ((offset + viewport_height) < page_height):
                break
            # ... or once scrolling no longer moves the page
            if offset == last_offset:
                break

            # Scroll down to bottom of previous viewport
            try:
                driver.execute_script('window.scrollBy(0, window.innerHeight)')
            except WebDriverException:
                logger.info("BROWSER %i: WebDriverException while scrolling, "
                            "screenshot may be misaligned!" % crawl_id)

            # Update control variables
            index += 1
            last_offset = offset
            offset = execute_script_with_retry(driver,
                                               'return window.scrollY;')

            # Save screenshot
            driver.save_screenshot(part_path % (index, offset))
    except WebDriverException:
        trace_lines = traceback.format_exception(*sys.exc_info())
        logger.error(
            "BROWSER %i: Exception while taking full page screenshot \n %s" %
            (crawl_id, ''.join(trace_lines)))
        return

    _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params)
Пример #7
0
def BrowserManager(command_queue, status_queue, browser_params, crash_recovery):
    """
    Bring up the optional proxy and the browser, report 'READY' on
    <status_queue>, then execute command tuples from <command_queue>.

    Every successfully executed command is answered with "OK" on
    <status_queue>. The first command that raises is answered with
    "FAILED" and ends the function.
    """
    logger = loggingclient(*browser_params['logger_address'])

    # Start the proxy
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        proxy_info = deploy_mitm_proxy.init_proxy(
            browser_params['aggregator_address'],
            browser_params['logger_address'],
            browser_params['crawl_id'])
        (local_port, proxy_site_queue) = proxy_info
        browser_params['proxy'] = local_port

    # Start the virtualdisplay (if necessary), webdriver, and browser
    deployed = deploy_browser.deploy_browser(status_queue, browser_params,
                                             crash_recovery)
    (driver, prof_folder, browser_settings) = deployed

    # Read the extension port -- if extension is enabled
    # TODO: This needs to be cleaner
    extension_socket = None
    if (browser_params['browser'] == 'firefox'
            and browser_params['extension']['enabled']):
        logger.debug("BROWSER %i: Looking for extension port information in %s"
                     % (browser_params['crawl_id'], prof_folder))
        port_file = prof_folder + 'extension_port.txt'
        # Poll until the port file shows up in the profile folder
        while True:
            if os.path.isfile(port_file):
                break
            time.sleep(0.1)
        time.sleep(0.5)
        with open(port_file, 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))

    # Signal the TaskManager that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        if not command_queue.empty():
            # command tuple of form (command, arg0, arg1, ..., argN)
            # where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))

            # Perform the action and answer with an OK signal. If it fails
            # for whatever reason, tell the TaskManager so it can kill and
            # restart its worker processes.
            try:
                command_executor.execute_command(command, driver,
                                                 proxy_site_queue,
                                                 browser_settings,
                                                 browser_params,
                                                 extension_socket)
                status_queue.put("OK")
            except Exception as e:
                logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s" % (browser_params['crawl_id'], str(type(e)), str(e)))
                status_queue.put("FAILED")
                return
        else:
            # no command for now -> sleep briefly before checking again
            time.sleep(0.001)
Пример #8
0
def browse_website(url, num_links, sleep, visit_id, webdriver, browser_params,
                   manager_params, extension_socket):
    """Load <url> via get_website, then click up to <num_links> links on it.

    For every round a displayed intra-site link is picked at random,
    clicked, and the browser navigates back afterwards. Errors raised while
    clicking or navigating back are ignored.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, browser_params,
                extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # Collect the intra-site links that are currently displayed
        candidates = []
        for element in get_intra_links(webdriver, url):
            if is_displayed(element) is True:
                candidates.append(element)
        if not candidates:
            return

        choice = int(random.random() * len(candidates))
        target = candidates[choice]
        logger.info(
            "BROWSER %i: visiting internal link %s" %
            (browser_params['crawl_id'], target.get_attribute("href")))

        try:
            target.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Best effort: a failed subpage visit does not stop the loop
            pass
Пример #9
0
def BrowserManager(command_queue, status_queue, browser_params, manager_params,
                   crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.

    Messages put on <status_queue>:
      ('STATUS', 'Proxy Ready', 'READY')   after the optional proxy setup
      ('STATUS', 'Browser Ready', 'READY') once the browser is deployed
      "OK"                                 after each executed command
      ('CRITICAL', <pickled exc_info>)     on ProfileLoadError,
                                           BrowserConfigError, AssertionError
      ('FAILED', None)                     on any other exception

    The function returns after reporting CRITICAL or FAILED; it does not
    re-raise.
    """
    # Bound before the `try` so both handlers can test it: if connecting to
    # the logger is what failed, the parent must still be told via
    # <status_queue> instead of the handler dying on an unbound name.
    logger = None
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the proxy
        proxy_site_queue = None  # used to pass the current site down to the proxy
        if browser_params['proxy']:
            (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(
                browser_params, manager_params, status_queue)
            browser_params['proxy'] = local_port
        status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtualdisplay (if necessary), webdriver, and browser
        (driver, prof_folder,
         browser_settings) = deploy_browser.deploy_browser(
             status_queue, browser_params, manager_params, crash_recovery)

        # Read the extension port -- if extension is enabled
        # TODO: This needs to be cleaner
        if browser_params['browser'] == 'firefox' and browser_params[
                'extension_enabled']:
            logger.debug(
                "BROWSER %i: Looking for extension port information in %s" %
                (browser_params['crawl_id'], prof_folder))
            # NOTE: this wait has no upper bound
            while not os.path.isfile(prof_folder + 'extension_port.txt'):
                time.sleep(0.1)
            time.sleep(0.5)
            with open(prof_folder + 'extension_port.txt', 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        # tell the TaskManager that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep briefly before checking again
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))
            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, the handlers below tell
            # the TaskManager to kill and restart its worker processes
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params,
                                             manager_params, extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        if logger is not None:
            logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                        (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', cPickle.dumps(err_info)))
        return
    except Exception as e:
        excp = traceback.format_exception(*sys.exc_info())
        if logger is not None:
            logger.info(
                "BROWSER %i: Crash in driver, restarting browser manager \n %s"
                % (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
Пример #10
0
def dump_profile(browser_profile_folder,
                 manager_params,
                 browser_params,
                 tar_location,
                 close_webdriver,
                 webdriver=None,
                 browser_settings=None,
                 save_flash=False,
                 compress=False):
    """
    dumps a browser profile currently stored in <browser_profile_folder> to
    <tar_location> in which both folders are absolute paths.

    The archive is written to <tar_location>/profile.tar.gz when <compress>
    is true and to <tar_location>/profile.tar otherwise; an existing archive
    of that name is deleted first. <tar_location> is created if missing.

    if <close_webdriver> is true, <webdriver> is closed and the function
    waits for the sqlite checkpoint before archiving
    if <browser_settings> exists they are also saved
    <save_flash> specifies whether to dump flash files

    Storage files or directories missing from the profile folder are logged
    and left out of the archive.
    """
    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # ensures that folder paths end with slashes
    if browser_profile_folder[-1] != '/':
        browser_profile_folder = browser_profile_folder + "/"
    if tar_location[-1] != '/':
        tar_location = tar_location + "/"

    if not os.path.exists(tar_location):
        os.makedirs(tar_location)

    if compress:
        tar_name = 'profile.tar.gz'
        tar_mode = 'w:gz'
    else:
        tar_name = 'profile.tar'
        tar_mode = 'w'
    tar_path = tar_location + tar_name

    # see if this file exists first
    # if it does, delete it before we try to save the current session
    if os.path.isfile(tar_path):
        os.remove(tar_path)

    # if this is a dump on close, close the webdriver and wait for checkpoint
    if close_webdriver:
        webdriver.close()
        sleep_until_sqlite_checkpoint(browser_profile_folder)

    storage_vector_files = [
        'cookies.sqlite',  # cookies
        'cookies.sqlite-shm',
        'cookies.sqlite-wal',
        'places.sqlite',  # history
        'places.sqlite-shm',
        'places.sqlite-wal',
        'webappsstore.sqlite',  # localStorage
        'webappsstore.sqlite-shm',
        'webappsstore.sqlite-wal',
    ]
    storage_vector_dirs = [
        'webapps',  # related to localStorage?
        'storage'  # directory for IndexedDB
    ]

    # backup and tar profile
    # The context manager closes the archive even if adding a member fails
    with tarfile.open(tar_path, tar_mode, errorlevel=1) as tar:
        logger.debug("BROWSER %i: Backing up full profile from %s to %s" %
                     (browser_params['crawl_id'], browser_profile_folder,
                      tar_path))
        for item in storage_vector_files:
            full_path = os.path.join(browser_profile_folder, item)
            if not os.path.isfile(full_path):
                # -shm / -wal are just checkpoint files and may be absent;
                # any other missing file is reported. Either way the file
                # is skipped: tar.add() on a missing path would raise.
                if not item.endswith(('shm', 'wal')):
                    logger.critical(
                        "BROWSER %i: %s NOT FOUND IN profile folder, "
                        "skipping." % (browser_params['crawl_id'], full_path))
                continue
            tar.add(full_path, arcname=item)
        for item in storage_vector_dirs:
            full_path = os.path.join(browser_profile_folder, item)
            if not os.path.isdir(full_path):
                logger.warning(
                    "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                    (browser_params['crawl_id'], full_path))
                continue
            tar.add(full_path, arcname=item)

    # save flash cookies
    if save_flash:
        save_flash_files(logger, browser_params, tar_location)

    # save the browser settings
    if browser_settings is not None:
        save_browser_settings(tar_location, browser_settings)
Пример #11
0
def BrowserManager(command_queue, status_queue, browser_params, manager_params, crash_recovery):
    """
    Body of a browser process: set up the optional proxy and the browser,
    then execute the command tuples arriving on <command_queue> and report
    the outcome of each one on <status_queue>.

    ProfileLoadError, BrowserConfigError and AssertionError are reported to
    the parent as ('CRITICAL', <pickled exc_info>); any other exception is
    reported as ('FAILED', None). In both cases the function then returns.
    """
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the proxy; the queue passes the current site down to it
        proxy_site_queue = None
        if browser_params['proxy']:
            proxy_info = deploy_mitm_proxy.init_proxy(browser_params,
                                                      manager_params,
                                                      status_queue)
            (local_port, proxy_site_queue) = proxy_info
            browser_params['proxy'] = local_port
        status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtualdisplay (if necessary), webdriver, and browser
        deployed = deploy_browser.deploy_browser(status_queue,
                                                 browser_params,
                                                 manager_params,
                                                 crash_recovery)
        (driver, prof_folder, browser_settings) = deployed

        # Read the extension port -- if extension is enabled
        # TODO: This needs to be cleaner
        extension_socket = None
        if (browser_params['browser'] == 'firefox'
                and browser_params['extension_enabled']):
            logger.debug(
                "BROWSER %i: Looking for extension port information in %s"
                % (browser_params['crawl_id'], prof_folder))
            port_file = prof_folder + 'extension_port.txt'
            # Poll until the port file shows up in the profile folder
            while True:
                if os.path.isfile(port_file):
                    break
                time.sleep(0.1)
            time.sleep(0.5)
            with open(port_file, 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))

        # tell the TaskManager that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            if not command_queue.empty():
                # command tuple of form (command, arg0, arg1, ..., argN)
                # where N is variable
                command = command_queue.get()
                logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                            (browser_params['crawl_id'], str(command)))
                # Perform the action and answer with an OK signal; a failure
                # is reported by the handlers below.
                command_executor.execute_command(
                    command, driver, proxy_site_queue, browser_settings,
                    browser_params, manager_params, extension_socket)
                status_queue.put("OK")
            else:
                # no command for now -> sleep briefly before checking again
                time.sleep(0.001)

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        error_name = e.__class__.__name__
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], error_name))
        pickled_info = cPickle.dumps(sys.exc_info())
        status_queue.put(('CRITICAL', pickled_info))
        return
    except Exception as e:
        trace_text = ''.join(traceback.format_exception(*sys.exc_info()))
        logger.info(
            "BROWSER %i: Crash in driver, restarting browser manager \n %s"
            % (browser_params['crawl_id'], trace_text))
        status_queue.put(('FAILED', None))
        return
Пример #12
0
def deploy_firefox(status_queue, browser_params, manager_params,
                   crash_recovery):
    """
    launches a firefox instance with parameters set by the input dictionary

    Progress is reported on <status_queue> as ('STATUS', <step>, <data>)
    tuples, in this order: 'Profile Created' (profile path), 'Profile Tar'
    (None), 'Display' ((display_pid, display_port)), 'Launch Attempted'
    (None) and 'Browser Launched' ((pid, profile_settings)).

    <browser_params> keys read here: 'crawl_id', 'profile_tar',
    'disable_flash', 'random_attributes', 'headless', 'extension_enabled'
    and 'prefs'. <manager_params> keys read here: 'logger_address',
    'aggregator_address', 'testing' and, optionally, 'ldb_address'.

    <crash_recovery> selects which log message is used when a profile tar
    is loaded, and whether flash files are loaded along with it.

    Returns the tuple (driver, driver.capabilities["moz:profile"],
    profile_settings).

    Raises RuntimeError if no process id can be found on the driver.
    """
    firefox_binary_path = get_firefox_binary_path()
    geckodriver_executable_path = get_geckodriver_exec_path()

    root_dir = os.path.dirname(__file__)  # directory of this file
    logger = loggingclient(*manager_params['logger_address'])

    display_pid = None
    display_port = None
    fp = FirefoxProfile()
    browser_profile_path = fp.path + '/'
    status_queue.put(('STATUS', 'Profile Created', browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    fo = Options()

    profile_settings = None  # Imported browser settings
    if browser_params['profile_tar'] and not crash_recovery:
        logger.debug(
            "BROWSER %i: Loading initial browser profile from: %s" %
            (browser_params['crawl_id'], browser_params['profile_tar']))
        # Flash files are only loaded when flash is explicitly not disabled
        load_flash = browser_params['disable_flash'] is False
        profile_settings = load_profile(browser_profile_path,
                                        manager_params,
                                        browser_params,
                                        browser_params['profile_tar'],
                                        load_flash=load_flash)
    elif browser_params['profile_tar']:
        # crash_recovery is set: load the profile without flash files
        logger.debug(
            "BROWSER %i: Loading recovered browser profile from: %s" %
            (browser_params['crawl_id'], browser_params['profile_tar']))
        profile_settings = load_profile(browser_profile_path, manager_params,
                                        browser_params,
                                        browser_params['profile_tar'])
    status_queue.put(('STATUS', 'Profile Tar', None))

    # Random attributes are only drawn when no settings came from a profile
    if browser_params['random_attributes'] and profile_settings is None:
        logger.debug("BROWSER %i: Loading random attributes for browser" %
                     browser_params['crawl_id'])
        profile_settings = dict()

        # choose a random screen-res from list
        # (each line is split on ',' so the entries are tuples of strings)
        resolutions = list()
        with open(os.path.join(root_dir, 'screen_resolutions.txt'), 'r') as f:
            for line in f:
                resolutions.append(tuple(line.strip().split(',')))
        profile_settings['screen_res'] = random.choice(resolutions)

        # set a random user agent from list
        ua_strings = list()
        with open(os.path.join(root_dir, 'user_agent_strings.txt'), 'r') as f:
            for line in f:
                ua_strings.append(line.strip())
        profile_settings['ua_string'] = random.choice(ua_strings)

    # If profile settings still not set - set defaults
    if profile_settings is None:
        profile_settings = dict()
        profile_settings['screen_res'] = DEFAULT_SCREEN_RES
        profile_settings['ua_string'] = None

    if profile_settings['ua_string'] is not None:
        logger.debug(
            "BROWSER %i: Overriding user agent string to '%s'" %
            (browser_params['crawl_id'], profile_settings['ua_string']))
        fo.set_preference("general.useragent.override",
                          profile_settings['ua_string'])

    # Start a display for headless runs; its pid and port are reported below
    # (both stay None when not headless)
    if browser_params['headless']:
        display = Display(visible=0, size=profile_settings['screen_res'])
        display.start()
        display_pid = display.pid
        display_port = display.cmd_param[-1][1:]
    status_queue.put(('STATUS', 'Display', (display_pid, display_port)))

    if browser_params['extension_enabled']:
        # Write config file
        # (browser_params plus selected manager_params, dumped as JSON into
        # the profile folder)
        extension_config = dict()
        extension_config.update(browser_params)
        extension_config['logger_address'] = manager_params['logger_address']
        extension_config['aggregator_address'] = manager_params[
            'aggregator_address']
        if 'ldb_address' in manager_params:
            extension_config['leveldb_address'] = manager_params['ldb_address']
        else:
            extension_config['leveldb_address'] = None
        extension_config['testing'] = manager_params['testing']
        ext_config_file = browser_profile_path + 'browser_params.json'
        with open(ext_config_file, 'w') as f:
            json.dump(extension_config, f)
        logger.debug("BROWSER %i: Saved extension config file to: %s" %
                     (browser_params['crawl_id'], ext_config_file))

        # TODO restore detailed logging
        # fo.set_preference("*****@*****.**", "all")

    # Disable flash
    if browser_params['disable_flash']:
        fo.set_preference('plugin.state.flash', 0)
    else:
        fo.set_preference('plugin.state.flash', 2)
        fo.set_preference('plugins.click_to_play', False)

    # Configure privacy settings
    configure_firefox.privacy(browser_params, fp, fo, root_dir,
                              browser_profile_path)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(fo)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger.  This will also inform us where the real profile
    # directory is hiding.
    interceptor = FirefoxLogInterceptor(browser_params['crawl_id'], logger,
                                        browser_profile_path)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for name, value in browser_params['prefs'].items():
        logger.info("BROWSER %i: Setting custom preference: %s = %s" %
                    (browser_params['crawl_id'], name, value))
        fo.set_preference(name, value)

    # Launch the webdriver
    # (the status message is sent before the launch call itself)
    status_queue.put(('STATUS', 'Launch Attempted', None))
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(firefox_profile=fp,
                               firefox_binary=fb,
                               executable_path=geckodriver_executable_path,
                               firefox_options=fo,
                               log_path=interceptor.fifo)

    # Add extension
    if browser_params['extension_enabled']:

        # Install extension
        ext_loc = os.path.join(root_dir, '../Extension/firefox/openwpm.xpi')
        ext_loc = os.path.normpath(ext_loc)
        driver.install_addon(ext_loc, temporary=True)
        logger.debug("BROWSER %i: OpenWPM Firefox extension loaded" %
                     browser_params['crawl_id'])

    # set window size
    driver.set_window_size(*profile_settings['screen_res'])

    # Get browser process pid
    # (taken from driver.service.process if present, else from
    # driver.binary.process)
    if hasattr(driver, 'service') and hasattr(driver.service, 'process'):
        pid = driver.service.process.pid
    elif hasattr(driver, 'binary') and hasattr(driver.binary, 'process'):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(
        ('STATUS', 'Browser Launched', (int(pid), profile_settings)))

    return driver, driver.capabilities["moz:profile"], profile_settings
Пример #13
0
def BrowserManager(command_queue, status_queue, browser_params, manager_params,
                   crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.

    Arguments:
    command_queue -- queue polled for command tuples of the form
        (command, arg0, arg1, ..., argN)
    status_queue -- queue on which status messages are put:
        ('STATUS', 'Browser Ready', (prof_folder, 'READY')) once startup
        has finished, "OK" after every executed command,
        ('CRITICAL', <pickled sys.exc_info()>) for ProfileLoadError,
        BrowserConfigError and AssertionError, and ('FAILED', None) for
        any other exception
    browser_params -- dict; the keys 'browser', 'extension_enabled' and
        'crawl_id' are read here, and 'profile_path' is set to the profile
        folder once the browser is ready
    manager_params -- dict; 'logger_address' is read here, the dict is also
        forwarded to deploy_browser and to the command executor
    crash_recovery -- forwarded unchanged to deploy_browser.deploy_browser

    The function only returns after an exception; while commands succeed
    it loops forever.
    """
    # Bound before the try block so that the exception handlers below can
    # always refer to it. If creating the logging client itself fails, the
    # handlers must still be able to report the failure on status_queue
    # instead of dying on an unbound name.
    logger = None
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the virtualdisplay (if necessary), webdriver, and browser
        driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
            status_queue, browser_params, manager_params, crash_recovery)
        if prof_folder[-1] != '/':
            prof_folder += '/'

        # Read the extension port -- if extension is enabled
        # TODO: Initial communication from extension to TM should use sockets
        if (browser_params['browser'] == 'firefox'
                and browser_params['extension_enabled']):
            logger.debug("BROWSER %i: Looking for extension port information "
                         "in %s" % (browser_params['crawl_id'], prof_folder))
            elapsed = 0
            port = None
            ep_filename = os.path.join(prof_folder, 'extension_port.txt')
            # Poll for the port file for roughly 5 seconds. Only a missing
            # file (ENOENT) is retried; any other error is raised at once.
            while elapsed < 5:
                try:
                    with open(ep_filename, 'rt') as f:
                        port = int(f.read().strip())
                        break
                except IOError as e:
                    if e.errno != errno.ENOENT:
                        raise
                time.sleep(0.1)
                elapsed += 0.1
            if port is None:
                # try one last time, allowing all exceptions to propagate
                with open(ep_filename, 'rt') as f:
                    port = int(f.read().strip())

            logger.debug("BROWSER %i: Connecting to extension on port %i" %
                         (browser_params['crawl_id'], port))
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        logger.debug("BROWSER %i: BrowserManager ready." %
                     browser_params['crawl_id'])

        # passes the profile folder back to the TaskManager to signal a
        # successful startup
        status_queue.put(('STATUS', 'Browser Ready', (prof_folder, 'READY')))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep to avoid pegging CPU on blocking get
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form:
            # (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))
            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, the exception reaches the
            # handlers below, which tell the TaskManager about the failure
            command_executor.execute_command(command, driver, browser_settings,
                                             browser_params, manager_params,
                                             extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        if logger is not None:
            logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                        (browser_params['crawl_id'], e.__class__.__name__))
        # NOTE(review): exc_info contains a traceback object; pickling it
        # presumably relies on traceback pickling support being installed
        # elsewhere in the project -- confirm.
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', pickle.dumps(err_info)))
        return
    except Exception:
        excp = traceback.format_exception(*sys.exc_info())
        if logger is not None:
            logger.info("BROWSER %i: Crash in driver, restarting browser "
                        "manager \n %s" % (browser_params['crawl_id'],
                                           ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
Пример #14
0
def BrowserManager(command_queue, status_queue, browser_params,
                   crash_recovery):
    """
    Set up one browser and execute commands for it until one fails.

    Optionally starts the proxy, deploys the browser, connects to the
    Firefox extension when it is enabled, reports 'READY' on status_queue
    and then executes every command read from command_queue.

    Arguments:
    command_queue -- queue polled for command tuples of the form
        (command, arg0, arg1, ..., argN)
    status_queue -- queue on which 'READY' is put after startup, "OK" after
        every successful command and "FAILED" when a command raises
    browser_params -- dict; the keys 'logger_address', 'proxy',
        'aggregator_address', 'crawl_id', 'browser' and
        'extension'['enabled'] are read here. 'proxy' is overwritten with
        the local port returned by init_proxy when the proxy is started,
        and 'profile_path' is set to the profile folder
    crash_recovery -- forwarded unchanged to deploy_browser.deploy_browser

    The function returns after the first command that raises an exception.
    """
    logger = loggingclient(*browser_params['logger_address'])

    # Start the proxy
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(
            browser_params['aggregator_address'],
            browser_params['logger_address'], browser_params['crawl_id'])
        browser_params['proxy'] = local_port

    # Start the virtualdisplay (if necessary), webdriver, and browser
    (driver, prof_folder,
     browser_settings) = deploy_browser.deploy_browser(status_queue,
                                                       browser_params,
                                                       crash_recovery)

    # Read the extension port -- if extension is enabled
    # TODO: This needs to be cleaner
    if browser_params['browser'] == 'firefox' and browser_params['extension'][
            'enabled']:
        logger.debug(
            "BROWSER %i: Looking for extension port information in %s" %
            (browser_params['crawl_id'], prof_folder))
        # Build the path once with os.path.join so that it is correct
        # whether or not prof_folder ends with a path separator.
        port_file = os.path.join(prof_folder, 'extension_port.txt')
        # NOTE(review): this wait has no timeout of its own; presumably the
        # parent process enforces a spawn timeout -- confirm.
        while not os.path.isfile(port_file):
            time.sleep(0.1)
        # presumably gives the extension time to finish writing the file
        # after it has been created -- TODO confirm
        time.sleep(0.5)
        with open(port_file, 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))
    else:
        extension_socket = None

    # signals the TaskManager that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        # no command for now -> sleep to avoid pegging CPU on blocking get
        if command_queue.empty():
            time.sleep(0.001)
            continue

        # reads in the command tuple of form
        # (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                    (browser_params['crawl_id'], str(command)))

        # attempts to perform an action and return an OK signal
        # if command fails for whatever reason, tell the TaskMaster to kill
        # and restart its worker processes
        try:
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params,
                                             extension_socket)
            status_queue.put("OK")
        except Exception as e:
            logger.info(
                "BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s"
                % (browser_params['crawl_id'], str(type(e)), str(e)))
            status_queue.put("FAILED")
            break