def test_saving(self):
    """Verify the browser profile is archived to disk on manager close."""
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    manager.get('http://example.com')
    manager.close()
    archive_path = join(browser_params[0]['profile_archive_dir'],
                        'profile.tar.gz')
    assert isfile(archive_path)
def test_extension_gets_correct_visit_id(self):
    """Check javascript rows are keyed to the correct site visit."""
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    url_a = utilities.BASE_TEST_URL + '/simple_a.html'
    url_b = utilities.BASE_TEST_URL + '/simple_b.html'
    manager.get(url_a)
    manager.get(url_b)
    manager.close()

    rows = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")
    # Map each site_url to the visit_id it was assigned.
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    simple_a_visit_id = db_utils.query_db(
        manager_params['db'], "SELECT visit_id FROM javascript WHERE "
        "symbol=?", ("window.navigator.userAgent", ))
    simple_b_visit_id = db_utils.query_db(
        manager_params['db'], "SELECT visit_id FROM javascript WHERE "
        "symbol=?", ("window.navigator.platform", ))

    assert visit_ids[url_a] == simple_a_visit_id[0][0]
    assert visit_ids[url_b] == simple_b_visit_id[0][0]
def test_get_site_visits_table_valid(self):
    """Check that get works and populates db correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Queue and execute one get command per URL, in order
    for target in (url_a, url_b):
        sequence = CommandSequence.CommandSequence(target)
        sequence.get(sleep=1)
        manager.execute_command_sequence(sequence)
    manager.close()

    rows = db_utils.query_db(manager_params['db'],
                             "SELECT site_url FROM site_visits")

    # We had two separate page visits, recorded in visit order
    assert len(rows) == 2
    assert rows[0][0] == url_a
    assert rows[1][0] == url_b
def test_save_screenshot_valid(self, tmpdir):
    """Check that 'save_screenshot' and 'screenshot_full_page' write
    non-blank images to the screenshots directory.
    """
    def is_blank(image_path):
        # An all-white image has extrema (255, 255) in every band.
        im = Image.open(image_path)
        return all(band.getextrema() == (255, 255) for band in im.split())

    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(url_a)
    cs.get(sleep=1)
    cs.save_screenshot('test')
    cs.screenshot_full_page('test_full')
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that neither the viewport image nor the full page
    # screenshot is blank
    for suffix in ('test', 'test_full'):
        pattern = os.path.join(str(tmpdir), 'screenshots',
                               '1-*-%s.png' % suffix)
        screenshot = glob.glob(pattern)[0]
        assert not is_blank(screenshot)
def test_crash(self):
    """With failure_limit=0, a failed command must raise immediately."""
    manager_params, browser_params = self.get_config()
    manager_params['failure_limit'] = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)

    with pytest.raises(CommandExecutionError):
        manager.get('http://example.com')  # So we have a profile
        manager.get('example.com')  # Selenium requires scheme prefix
        manager.get('example.com')  # Requires two commands to shut down
def visit(self, page_url, data_dir="", sleep_after=0):
    """Visit a test page with the given parameters."""
    manager_params, browser_params = self.get_config(data_dir)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # Relative paths are resolved against the local test server.
    if not page_url.startswith("http"):
        page_url = utilities.BASE_TEST_URL + page_url
    manager.get(url=page_url, sleep=sleep_after)
    manager.close()
    return manager_params['db']
def __init__(self):
    """Build every combination of the configurable parameters and hand
    them to `setup` together with default manager/browser params.
    """
    self.manager = None
    param_list = []
    # Record each parameter's position so `setup` can index into the
    # combination tuples later.
    for index, key in enumerate(self.parameters):
        param_list.append(self.parameters[key]['combinations'])
        self.parameters[key]['index'] = index
    combinations = list(itertools.product(*param_list))
    manager_params, browser_params = TaskManager.load_default_params(
        len(combinations))
    self.setup(combinations, manager_params, browser_params)
def test_crash_profile(self):
    """The profile must still be archived after repeated command failures."""
    manager_params, browser_params = self.get_config()
    manager_params['failure_limit'] = 2
    manager = TaskManager.TaskManager(manager_params, browser_params)
    try:
        manager.get('http://example.com')  # So we have a profile
        # Selenium requires a scheme prefix; each of these fails
        for _ in range(3):
            manager.get('example.com')
        manager.get('example.com')  # Requires two commands to shut down
    except CommandExecutionError:
        pass
    archive = join(browser_params[0]['profile_archive_dir'],
                   'profile.tar.gz')
    assert isfile(archive)
def setup(self, combinations, manager_params, browser_params):
    """Apply one parameter combination to each browser's params and
    start the TaskManager.
    """
    manager_params['data_directory'] = '~/Documents/openWpmDb/'
    manager_params['log_directory'] = '~/Desktop/'
    # The per-browser settings driven by the combination tuples; each
    # tuple is ordered by the index recorded in self.parameters.
    keys = ('http_instrument', 'js_instrument', 'disable_flash',
            'tp_cookies', 'ghostery', 'https-everywhere')
    for i in range(len(combinations)):
        for key in keys:
            browser_params[i][key] = combinations[i][
                self.parameters[key]['index']]
    self.manager = TaskManager.TaskManager(manager_params, browser_params)
def test_flash_cookies(self):
    """ Check that some Flash LSOs are saved and are properly keyed in db."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    browser_params[0]['disable_flash'] = False
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the LSO-setting page twice, each time with a fresh value
    lso_value_a = utilities.rand_str(8)
    expected_lso_content_a[5] = lso_value_a  # expected to be present
    lso_value_b = utilities.rand_str(8)
    expected_lso_content_b[5] = lso_value_b  # expected to be present
    for lso_value in (lso_value_a, lso_value_b):
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value)
        test_url = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
        cs = CommandSequence.CommandSequence(test_url)
        cs.get(sleep=3, timeout=120)
        cs.dump_flash_cookies()
        manager.execute_command_sequence(cs)
    manager.close()

    # Check that both visits produced a flash cookie row
    rows = db_utils.query_db(manager_params['db'],
                             "SELECT * FROM flash_cookies",
                             as_tuple=True)
    assert len(rows) == 2

    # Drop the first two columns and the randomly generated LSO
    # directory name, e.g.
    # TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
    observed = []
    for row in rows:
        content = list(row[2:])
        content[3] = content[3].split("/", 1)[-1]  # rm LSO dirname
        observed.append(content)
    assert observed[0] == expected_lso_content_a
    assert observed[1] == expected_lso_content_b
def test_profile_cookies(self):
    """ Check that some profile cookies are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # TODO update this to local test site
    url = 'http://www.yahoo.com'
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # At least one profile cookie should have been recorded
    rows = db_utils.query_db(manager_params['db'],
                             "SELECT COUNT(*) FROM profile_cookies")
    assert rows[0][0] > 0
def test_get_http_tables_valid(self):
    """Check that get works and populates http tables correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # Set up two sequential get commands to two URLS
    cs_a = CommandSequence.CommandSequence(url_a)
    cs_a.get(sleep=1)
    cs_b = CommandSequence.CommandSequence(url_b)
    cs_b.get(sleep=1)
    manager.execute_command_sequence(cs_a)
    manager.execute_command_sequence(cs_b)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    # Each URL's requests and responses must carry that URL's visit_id
    for table in ('http_requests', 'http_responses'):
        for url in (url_a, url_b):
            qry_res = db_utils.query_db(
                manager_params['db'],
                "SELECT visit_id FROM %s WHERE url = ?" % table,
                (url,))
            assert qry_res[0][0] == visit_ids[url]
def test_js_profile_cookies(self):
    """ Check that profile cookies set by JS are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    url = utilities.BASE_TEST_URL + "/js_cookie.html"
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that the JS cookie we stored is recorded
    rows = db_utils.query_db(manager_params['db'],
                             "SELECT * FROM profile_cookies",
                             as_tuple=True)
    assert len(rows) == 1  # we store only one cookie
    first_cookie = rows[0]
    # compare URL, domain, name, value, origin, path
    assert first_cookie[2:8] == expected_js_cookie
def test_recursive_dump_page_source_valid(self, tmpdir):
    """Check that 'recursive_dump_page_source' works.

    Walks the dumped frame tree, checking each frame's recorded source
    against the file served from disk and recording the parent chain.
    """
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(NESTED_FRAMES_URL)
    cs.get(sleep=1)
    cs.recursive_dump_page_source()
    manager.execute_command_sequence(cs)
    manager.close()

    outfile = os.path.join(str(tmpdir), 'sources', '1-*.json.gz')
    src_file = glob.glob(outfile)[0]
    with gzip.GzipFile(src_file, 'rb') as f:
        visit_source = json.loads(f.read().decode('utf-8'))

    observed_parents = dict()

    def normalize_source(source):
        # Lowercase, drop all whitespace, and strip a leading doctype
        # so served and on-disk copies compare equal.
        source = re.sub(r'\s', '', source.lower())
        if source.startswith('<!doctypehtml>'):
            source = source[len('<!doctypehtml>'):]
        return source

    def verify_frame(frame, parent_frames=None):
        # Default to a fresh list (avoid a shared mutable default).
        if parent_frames is None:
            parent_frames = []
        # Verify structure
        observed_parents[frame['doc_url']] = list(parent_frames)  # copy

        # Verify source against the on-disk copy of the page
        path = urlparse(frame['doc_url']).path
        with open('.' + path, 'r') as f:
            expected_source = normalize_source(f.read())
        assert normalize_source(frame['source']) == expected_source

        # Verify children
        parent_frames.append(frame['doc_url'])
        for key, child_frame in frame['iframes'].items():
            verify_frame(child_frame, parent_frames)
        parent_frames.pop()

    verify_frame(visit_source)
    assert EXPECTED_PARENTS == observed_parents
def test_custom_function(self):
    """ Test `custom_function` with an inline func that collects links """
    from src.open_wpm.automation import clientsocket

    def collect_links(table_name, scheme, **kwargs):
        """ Collect links with `scheme` and save in table `table_name` """
        driver = kwargs['driver']
        manager_params = kwargs['manager_params']
        prefix = scheme + '://'
        link_urls = []
        for anchor in driver.find_elements_by_tag_name('a'):
            href = anchor.get_attribute("href")
            if href.startswith(prefix):
                link_urls.append(href)
        current_url = driver.current_url

        # Persist the links via the data aggregator socket
        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])
        query = ("CREATE TABLE IF NOT EXISTS %s ("
                 "top_url TEXT, link TEXT);" % table_name)
        sock.send((query, ()))
        for link in link_urls:
            query = ("INSERT INTO %s (top_url, link) "
                     "VALUES (?, ?)" % table_name)
            sock.send((query, (current_url, link)))
        sock.close()

    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.run_custom_function(collect_links, ('page_links', 'http'))
    manager.execute_command_sequence(cs)
    manager.close()
    query_result = db_utils.query_db(
        manager_params['db'],
        "SELECT top_url, link FROM page_links;",
        as_tuple=True)
    assert PAGE_LINKS == set(query_result)
def test_dump_page_source_valid(self, tmpdir):
    """Check that 'dump_page_source' works and source is saved properly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(url_a)
    cs.get(sleep=1)
    cs.dump_page_source(suffix='test')
    manager.execute_command_sequence(cs)
    manager.close()

    # Source filename is of the follow structure:
    # `sources/<visit_id>-<md5_of_url>(-suffix).html`
    # thus for this test we expect `sources/1-<md5_of_test_url>-test.html`.
    pattern = os.path.join(str(tmpdir), 'sources', '1-*-test.html')
    source_file = glob.glob(pattern)[0]
    with open(source_file, 'rb') as f:
        actual_source = f.read()
    with open('./test_pages/expected_source.html', 'rb') as f:
        expected_source = f.read()

    assert actual_source == expected_source
def test_profile_saved_when_launch_crashes(self):
    """Profile must be archived even if the browser fails to relaunch."""
    manager_params, browser_params = self.get_config()
    browser_params[0]['proxy'] = True
    browser_params[0]['save_javascript'] = True
    manager = TaskManager.TaskManager(manager_params, browser_params)
    manager.get('http://example.com')

    # Kill the LevelDBAggregator
    # This will cause the proxy launch to crash
    manager.ldb_status_queue.put("DIE")
    manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
    manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # same here
    # Cause a selenium crash to force browser to restart
    manager.get('example.com')

    # The browser will fail to launch due to the proxy crashes
    try:
        manager.get('http://example.com')
    except CommandExecutionError:
        pass
    manager.close()
    archive = join(browser_params[0]['profile_archive_dir'],
                   'profile.tar.gz')
    assert isfile(archive)
def test_browse_wrapper_http_table_valid(self):
    """Check that TaskManager.browse() wrapper works and populates
    http tables correctly.

    NOTE: Since the browse command is choosing links randomly, there is a
          (very small -- 2*0.5^20) chance this test will fail with valid
          code.
    """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # Set up two sequential browse commands to two URLS
    manager.browse(url_a, num_links=20, sleep=1)
    manager.browse(url_b, num_links=1, sleep=1)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    # Each top URL's requests and responses carry that URL's visit_id
    for table in ('http_requests', 'http_responses'):
        for url in (url_a, url_b):
            qry_res = db_utils.query_db(
                manager_params['db'],
                "SELECT visit_id FROM %s WHERE url = ?" % table,
                (url,))
            assert qry_res[0][0] == visit_ids[url]

    # Page simple_a.html has five links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localtest.me
    # We should see page visits for 1 and 2, but not 3-5.
    # Pages browsed from simple_a belong to simple_a's visit.
    for url in (url_c, url_d):
        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id FROM http_responses WHERE url = ?",
            (url,))
        assert qry_res[0][0] == visit_ids[url_a]

    # We expect 4 urls: a,c,d and a favicon request
    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT COUNT(DISTINCT url) FROM http_responses"
        " WHERE visit_id = ?", (visit_ids[url_a],))
    assert qry_res[0][0] == 4
def test_profile_error(self):
    """Loading a nonexistent profile tarball raises ProfileLoadError."""
    manager_params, browser_params = self.get_config()
    browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
    with pytest.raises(ProfileLoadError):
        TaskManager.TaskManager(manager_params, browser_params)  # noqa
def test_browser_profile_coverage(self, tmpdir):
    """ Test the coverage of the browser's profile

    This verifies that Firefox's places.sqlite database contains
    all visited sites (with a few exceptions). If it does not, it
    is likely the profile is lost at some point during the crawl.
    """
    # Run the test crawl
    data_dir = os.path.join(str(tmpdir), 'data_dir')
    manager_params, browser_params = self.get_config(data_dir)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in TEST_SITES:
        manager.get(site)
    ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                             'profile.tar.gz')
    manager.close()

    # Extract crawl profile
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0]['profile_archive_dir'])

    # Output databases
    ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                         'places.sqlite')
    crawl_db = manager_params['db']

    # Grab urls from crawl database
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for url, in rows:
        req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

    hist_ps = set()  # visited domains from CrawlHistory Table
    successes = dict()
    rows = db_utils.query_db(
        crawl_db, "SELECT arguments, bool_success "
        "FROM CrawlHistory WHERE command='GET'")
    for url, success in rows:
        ps = psl.get_public_suffix(urlparse(url).hostname)
        hist_ps.add(ps)
        successes[ps] = success

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for host, in rows:
        try:
            profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
        except AttributeError:
            pass

    # We expect urls to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        if successes[url] == 0 or successes[url] == -1:
            continue

        # Get the visit id for the url
        rows = db_utils.query_db(
            crawl_db, "SELECT visit_id FROM site_visits "
            "WHERE site_url = ?", ('http://' + url, ))
        # BUGFIX: query_db returns a list of row tuples; take the
        # scalar visit_id, not the whole first row.
        visit_id = rows[0][0]

        rows = db_utils.query_db(
            crawl_db, "SELECT COUNT(*) FROM http_responses "
            "WHERE visit_id = ?", (visit_id, ))
        # BUGFIX: compare the count (rows[0][0]), not the row tuple --
        # a tuple-vs-int comparison raises TypeError on Python 3.
        if rows[0][0] > 1:
            continue

        rows = db_utils.query_db(
            crawl_db, "SELECT response_status, location FROM "
            "http_responses WHERE visit_id = ?", (visit_id, ))
        response_status, location = rows[0]
        if response_status == 204:
            continue
        if location == 'http://':  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0