def test_cache_hits_recorded(self, tmpdir):
    """Verify all http responses are recorded, including cached responses

    Note that we expect to see all of the same requests and responses
    during the second visit (even if cached) except for images. Cached
    images do not trigger Observer Notification events.
    See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073
    """
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # Visit the page twice; the second visit should be served from cache
    manager.get(test_url, sleep=3)
    manager.get(test_url, sleep=3)
    manager.close()
    db = manager_params['db']

    # HTTP Requests
    # NOTE: a comma was missing between triggering_origin and
    # loading_origin in the implicitly-concatenated SQL string, which made
    # SQLite treat loading_origin as an alias of triggering_origin and
    # silently drop a column from each row.
    rows = utilities.query_db(db, (
        "SELECT url, top_level_url, is_XHR, is_frame_load, is_full_page, "
        "is_third_party_channel, is_third_party_window, triggering_origin, "
        "loading_origin, loading_href, content_policy_type "
        "FROM http_requests WHERE visit_id = 2"))
    observed_records = set(rows)
    assert expected.http_cached_requests == observed_records

    # HTTP Responses
    rows = utilities.query_db(
        db, ("SELECT url, referrer, is_cached FROM http_responses "
             "WHERE visit_id = 2"))
    observed_records = set(rows)
    assert expected.http_cached_responses == observed_records
def test_extension_gets_correct_visit_id(self, tmpdir):
    """Javascript records must be keyed to the visit that produced them."""
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    url_a = utilities.BASE_TEST_URL + '/simple_a.html'
    url_b = utilities.BASE_TEST_URL + '/simple_b.html'
    manager.get(url_a)
    manager.get(url_b)
    manager.close(post_process=False)

    rows = utilities.query_db(manager_params['db'],
                              "SELECT visit_id, site_url FROM site_visits")
    # Map each site_url to the visit_id it was assigned
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    # Each page probes a distinct navigator symbol, so the symbol
    # identifies which visit a javascript row belongs to
    simple_a_visit_id = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM javascript WHERE "
        "symbol=?", ("window.navigator.userAgent",))
    simple_b_visit_id = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM javascript WHERE "
        "symbol=?", ("window.navigator.platform",))

    assert visit_ids[url_a] == simple_a_visit_id[0][0]
    assert visit_ids[url_b] == simple_b_visit_id[0][0]
def test_flash_cookies(self, tmpdir):
    """ Check that some Flash LSOs are saved """
    manager_params, browser_params = self.get_config(str(tmpdir))
    browser_params[0]['disable_flash'] = False
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit a page known to set a Flash LSO, with a random payload value
    lso_value = utilities.rand_str(8)
    expected.lso_content[5] = lso_value  # we'll expect this to be present
    expected.lso_content[0] = expected.lso_content[0].replace(
        "REPLACEME", lso_value)
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key", lso_value)
    test_url = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
    start_time = time.time()
    manager.get(test_url, 120)
    time.sleep(5)
    manager.dump_flash_cookies(test_url, start_time)
    manager.close(post_process=False)

    # Exactly one flash cookie should have been recorded
    rows = utilities.query_db(manager_params['db'],
                              "SELECT * FROM flash_cookies")
    assert len(rows) == 1
    lso_content = list(rows[0][2:])  # drop the first two columns
    # remove randomly generated LSO directory name
    # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
    lso_content[3] = lso_content[3].split("/", 1)[-1]
    assert lso_content == expected.lso_content
def test_get_site_visits_table_valid(self, tmpdir):
    """Check that get works and populates db correctly."""
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Queue and run a get command for each of the two URLs, in order
    for site in (url_a, url_b):
        sequence = CommandSequence.CommandSequence(site)
        sequence.get(sleep=1)
        manager.execute_command_sequence(sequence)
    manager.close(post_process=False)

    rows = utilities.query_db(manager_params['db'],
                              "SELECT site_url FROM site_visits")

    # Both visits must be present, in the order they were issued
    assert len(rows) == 2
    assert rows[0][0] == url_a
    assert rows[1][0] == url_b
def test_http_stacktrace_nonjs_loads(self, tmpdir):
    # Requests that were NOT triggered by scripts must carry an empty
    # call stack
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    db = self.visit(test_url, str(tmpdir), sleep_after=3)
    rows = utilities.query_db(
        db, ("SELECT url, req_call_stack FROM http_requests"))
    for _, call_stack in rows:
        assert call_stack == ""
def test_property_enumeration(self, tmpdir):
    """Every logged symbol must come from the test page's own script."""
    test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
    db = self.visit(test_url, str(tmpdir))
    rows = utilities.query_db(db,
                              "SELECT script_url, symbol FROM javascript")
    symbols_seen = set()
    for source_url, prop in rows:
        assert source_url == test_url
        symbols_seen.add(prop)
    assert expected.properties == symbols_seen
def test_get_http_tables_valid(self, tmpdir):
    """Check that get works and populates http tables correctly."""
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Queue and run a get command for each of the two URLs, in order
    for site in (url_a, url_b):
        sequence = CommandSequence.CommandSequence(site)
        sequence.get(sleep=1)
        manager.execute_command_sequence(sequence)
    manager.close(post_process=False)

    db = manager_params['db']
    rows = utilities.query_db(
        db, "SELECT visit_id, site_url FROM site_visits")
    # Map each site_url to the visit_id it was assigned
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    # Requests and responses for each page must be keyed to its visit
    for table in ("http_requests", "http_responses"):
        for site in (url_a, url_b):
            rows = utilities.query_db(
                db, "SELECT visit_id FROM " + table + " WHERE url = ?",
                (site,))
            assert rows[0][0] == visit_ids[site]
def test_get_http_tables_valid(self, tmpdir):
    """Check that get works and populates http tables correctly."""
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Two sequential get commands, one per URL
    cs_first = CommandSequence.CommandSequence(url_a)
    cs_first.get(sleep=1)
    cs_second = CommandSequence.CommandSequence(url_b)
    cs_second.get(sleep=1)
    manager.execute_command_sequence(cs_first)
    manager.execute_command_sequence(cs_second)
    manager.close(post_process=False)

    db = manager_params['db']
    visit_ids = {}
    for visit_id, site_url in utilities.query_db(
            db, "SELECT visit_id, site_url FROM site_visits"):
        visit_ids[site_url] = visit_id

    def first_visit_id(table, site):
        # Visit id of the first row recorded for `site` in `table`
        res = utilities.query_db(
            db, "SELECT visit_id FROM " + table + " WHERE url = ?",
            (site,))
        return res[0][0]

    assert first_visit_id("http_requests", url_a) == visit_ids[url_a]
    assert first_visit_id("http_requests", url_b) == visit_ids[url_b]
    assert first_visit_id("http_responses", url_a) == visit_ids[url_a]
    assert first_visit_id("http_responses", url_b) == visit_ids[url_b]
def test_page_visit(self, tmpdir):
    """All requests and responses of a single page visit are recorded."""
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    db = self.visit(test_url, str(tmpdir))

    # HTTP Requests
    # NOTE: a comma was missing between triggering_origin and
    # loading_origin in the implicitly-concatenated SQL string, which made
    # SQLite treat loading_origin as an alias of triggering_origin and
    # silently drop a column from each row.
    rows = utilities.query_db(db, (
        "SELECT url, top_level_url, is_XHR, is_frame_load, is_full_page, "
        "is_third_party_channel, is_third_party_window, triggering_origin, "
        "loading_origin, loading_href, content_policy_type "
        "FROM http_requests"))
    observed_records = set(rows)
    assert expected.http_requests == observed_records

    # HTTP Responses
    rows = utilities.query_db(
        db, "SELECT url, referrer, location FROM http_responses")
    observed_records = set(rows)
    assert expected.http_responses == observed_records
def test_http_stacktrace(self, tmpdir):
    """Script-triggered requests should carry the expected call stacks."""
    test_url = utilities.BASE_TEST_URL + '/http_stacktrace.html'
    db = self.visit(test_url, str(tmpdir), sleep_after=3)
    rows = utilities.query_db(
        db, ("SELECT url, req_call_stack FROM http_requests"))
    # Only the resources injected by the test page's script are checked
    script_loaded = ("inject_pixel.js", "test_image.png", "Blank.gif")
    stacks = {
        call_stack for request_url, call_stack in rows
        if request_url.endswith(script_loaded)
    }
    assert stacks == expected.http_stacktraces
def test_flash_cookies(self, tmpdir):
    """ Check that some Flash LSOs are saved and are properly keyed in db."""
    manager_params, browser_params = self.get_config(str(tmpdir))
    browser_params[0]['disable_flash'] = False
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the LSO-setting page twice, each time with a fresh random value
    lso_value_a = utilities.rand_str(8)
    expected_lso_content_a[5] = lso_value_a  # we'll expect this to be present
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                      lso_value_a)
    test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
    sequence = CommandSequence.CommandSequence(test_url_a)
    sequence.get(sleep=3, timeout=120)
    sequence.dump_flash_cookies()
    manager.execute_command_sequence(sequence)

    lso_value_b = utilities.rand_str(8)
    expected_lso_content_b[5] = lso_value_b  # we'll expect this to be present
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                      lso_value_b)
    test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
    sequence = CommandSequence.CommandSequence(test_url_b)
    sequence.get(sleep=3, timeout=120)
    sequence.dump_flash_cookies()
    manager.execute_command_sequence(sequence)
    manager.close()

    # One LSO per visit should have been recorded
    rows = utilities.query_db(manager_params['db'],
                              "SELECT * FROM flash_cookies")
    assert len(rows) == 2
    recorded = []
    for row in rows:
        content = list(row[2:])  # drop the first two columns
        # remove randomly generated LSO directory name
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        content[3] = content[3].split("/", 1)[-1]
        recorded.append(content)
    assert recorded[0] == expected_lso_content_a
    assert recorded[1] == expected_lso_content_b
def test_js_profile_cookies(self, tmpdir):
    """ Check that profile cookies set by JS are saved """
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    url = utilities.BASE_TEST_URL + "/js_cookie.html"
    start_time = time.time()
    manager.get(url)
    time.sleep(5)
    manager.dump_profile_cookies(url, start_time)
    manager.close(post_process=False)

    # A single cookie -- the one set via JS -- should have been stored
    rows = utilities.query_db(manager_params['db'],
                              "SELECT * FROM profile_cookies")
    assert len(rows) == 1  # we store only one cookie
    # compare URL, domain, name, value, origin, path
    assert rows[0][2:8] == expected.js_cookie
def test_js_profile_cookies(self, tmpdir):
    """ Check that profile cookies set by JS are saved """
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    url = utilities.BASE_TEST_URL + "/js_cookie.html"
    sequence = CommandSequence.CommandSequence(url)
    sequence.get(sleep=3, timeout=120)
    sequence.dump_profile_cookies()
    manager.execute_command_sequence(sequence)
    manager.close(post_process=False)

    # A single cookie -- the one set via JS -- should have been stored
    rows = utilities.query_db(manager_params['db'],
                              "SELECT * FROM profile_cookies")
    assert len(rows) == 1  # we store only one cookie
    # compare URL, domain, name, value, origin, path
    assert rows[0][2:8] == expected_js_cookie
def test_profile_cookies(self, tmpdir):
    """ Check that some profile cookies are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # TODO update this to local test site
    url = 'http://www.yahoo.com'
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that some profile cookies are recorded
    qry_res = utilities.query_db(manager_params['db'],
                                 "SELECT COUNT(*) FROM profile_cookies")
    # query_db returns a list of row tuples; unwrap the scalar count.
    # Comparing the raw tuple to an int is always True on Python 2
    # (a vacuous assert) and a TypeError on Python 3.
    prof_cookie_count = qry_res[0][0]
    assert prof_cookie_count > 0
def test_profile_cookies(self, tmpdir):
    """ Check that some profile cookies are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # TODO update this to local test site
    url = 'http://www.yahoo.com'
    start_time = time.time()
    manager.get(url)
    time.sleep(5)
    manager.dump_profile_cookies(url, start_time, timeout=90)
    manager.close(post_process=False)

    # Check that some profile cookies are recorded
    qry_res = utilities.query_db(manager_params['db'],
                                 "SELECT COUNT(*) FROM profile_cookies")
    # query_db returns a list of row tuples; unwrap the scalar count.
    # Comparing the raw tuple to an int is always True on Python 2
    # (a vacuous assert) and a TypeError on Python 3.
    prof_cookie_count = qry_res[0][0]
    assert prof_cookie_count > 0
def test_js_profile_cookies(self, tmpdir):
    """ Check that profile cookies set by JS are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    page_url = utilities.BASE_TEST_URL + "/js_cookie.html"
    command_seq = CommandSequence.CommandSequence(page_url)
    command_seq.get(sleep=3, timeout=120)
    command_seq.dump_profile_cookies()
    manager.execute_command_sequence(command_seq)
    manager.close()

    # Exactly one cookie -- the one set via JS -- should be recorded
    results = utilities.query_db(manager_params['db'],
                                 "SELECT * FROM profile_cookies")
    assert len(results) == 1  # we store only one cookie
    stored_cookie = results[0]  # take the first cookie
    # compare URL, domain, name, value, origin, path
    assert stored_cookie[2:8] == expected_js_cookie
def test_profile_cookies(self, tmpdir):
    """ Check that some profile cookies are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # TODO update this to local test site
    url = 'http://www.yahoo.com'
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close(post_process=False)

    # Check that some profile cookies are recorded
    qry_res = utilities.query_db(manager_params['db'],
                                 "SELECT COUNT(*) FROM profile_cookies")
    # query_db returns a list of row tuples; unwrap the scalar count.
    # Comparing the raw tuple to an int is always True on Python 2
    # (a vacuous assert) and a TypeError on Python 3.
    prof_cookie_count = qry_res[0][0]
    assert prof_cookie_count > 0
def test_flash_cookies(self, tmpdir):
    """ Check that some Flash LSOs are saved and are properly keyed in db."""
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    browser_params[0]['disable_flash'] = False
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the LSO-setting page twice, each time with a fresh random value
    lso_value_a = utilities.rand_str(8)
    expected_lso_content_a[5] = lso_value_a  # we'll expect this to be present
    params = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                     lso_value_a)
    test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + params
    seq = CommandSequence.CommandSequence(test_url_a)
    seq.get(sleep=3, timeout=120)
    seq.dump_flash_cookies()
    manager.execute_command_sequence(seq)

    lso_value_b = utilities.rand_str(8)
    expected_lso_content_b[5] = lso_value_b  # we'll expect this to be present
    params = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                     lso_value_b)
    test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + params
    seq = CommandSequence.CommandSequence(test_url_b)
    seq.get(sleep=3, timeout=120)
    seq.dump_flash_cookies()
    manager.execute_command_sequence(seq)
    manager.close(post_process=False)

    # One LSO per visit should have been recorded
    rows = utilities.query_db(manager_params['db'],
                              "SELECT * FROM flash_cookies")
    assert len(rows) == 2
    observed = []
    for row in rows:
        content = list(row[2:])  # drop the first two columns
        # remove randomly generated LSO directory name
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        content[3] = content[3].split("/", 1)[-1]
        observed.append(content)
    assert observed[0] == expected_lso_content_a
    assert observed[1] == expected_lso_content_b
def test_browse_http_table_valid(self, tmpdir):
    """Check that 'browse' works and populates http tables correctly."""
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Queue and run a browse command for each of the two URLs, in order
    for site in (url_a, url_b):
        sequence = CommandSequence.CommandSequence(site)
        sequence.browse(num_links=1, sleep=1)
        manager.execute_command_sequence(sequence)
    manager.close(post_process=False)

    db = manager_params['db']
    rows = utilities.query_db(db,
                              "SELECT visit_id, site_url FROM site_visits")
    # Map each site_url to the visit_id it was assigned
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    # Requests and responses for each page must be keyed to its visit
    for table in ("http_requests", "http_responses"):
        for site in (url_a, url_b):
            rows = utilities.query_db(
                db, "SELECT visit_id FROM " + table + " WHERE url = ?",
                (site,))
            assert rows[0][0] == visit_ids[site]

    # Page simple_a.html has a link to simple_c.html. This request should
    # be keyed to the site visit for simple_a.html
    rows = utilities.query_db(db,
                              "SELECT visit_id FROM http_responses"
                              " WHERE url = ?", (url_c,))
    assert len(rows) == 1
    assert rows[0][0] == visit_ids[url_a]
def test_browse_http_table_valid(self, tmpdir):
    """Check that 'browse' works and populates http tables correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Two sequential browse commands, one per URL
    first = CommandSequence.CommandSequence(url_a)
    first.browse(num_links=1, sleep=1)
    second = CommandSequence.CommandSequence(url_b)
    second.browse(num_links=1, sleep=1)
    manager.execute_command_sequence(first)
    manager.execute_command_sequence(second)
    manager.close(post_process=False)

    db = manager_params['db']
    visit_ids = {}
    for visit_id, site_url in utilities.query_db(
            db, "SELECT visit_id, site_url FROM site_visits"):
        visit_ids[site_url] = visit_id

    def visits_for(table, site):
        # All visit_id rows recorded for `site` in `table`
        return utilities.query_db(
            db, "SELECT visit_id FROM " + table + " WHERE url = ?",
            (site,))

    assert visits_for("http_requests", url_a)[0][0] == visit_ids[url_a]
    assert visits_for("http_requests", url_b)[0][0] == visit_ids[url_b]
    assert visits_for("http_responses", url_a)[0][0] == visit_ids[url_a]
    assert visits_for("http_responses", url_b)[0][0] == visit_ids[url_b]

    # Page simple_a.html has a link to simple_c.html. This request should
    # be keyed to the site visit for simple_a.html
    c_rows = visits_for("http_responses", url_c)
    assert len(c_rows) == 1
    assert c_rows[0][0] == visit_ids[url_a]
def test_blocks_includes(self, tmpdir):
    data_dir = str(tmpdir)
    list_loc = os.path.join(data_dir, 'adblock_plus')
    manager_params, browser_params = self.get_config(data_dir)
    fetch_adblockplus_list(list_loc)
    browser_params[0]['adblock-plus_list_location'] = list_loc
    manager = TaskManager.TaskManager(manager_params, browser_params)
    manager.get(utilities.BASE_TEST_URL + '/abp/adblock_plus_test.html')
    manager.close(post_process=False)
    db = os.path.join(data_dir, manager_params['database_name'])

    rows = utilities.query_db(db, "SELECT url FROM http_requests")
    # exclude requests to safebrowsing and tracking protection backends
    urls = {
        url for url, in rows
        if psl.get_public_suffix(urlparse(url).hostname)
        not in ("mozilla.com", "mozilla.net")
    }
    assert urls == expected.adblockplus
def test_blocks_includes(self, tmpdir):
    data_dir = str(tmpdir)
    list_loc = os.path.join(data_dir, 'adblock_plus')
    manager_params, browser_params = self.get_config(data_dir)
    fetch_adblockplus_list(list_loc)
    browser_params[0]['adblock-plus_list_location'] = list_loc
    manager = TaskManager.TaskManager(manager_params, browser_params)
    manager.get(utilities.BASE_TEST_URL + '/abp/adblock_plus_test.html')
    manager.close()
    db = os.path.join(data_dir, manager_params['database_name'])

    rows = utilities.query_db(db, "SELECT url FROM http_requests")
    observed_urls = set()
    for request_url, in rows:
        etld = psl.get_public_suffix(urlparse(request_url).hostname)
        # exclude requests to safebrowsing and tracking protection backends
        if etld == "mozilla.com" or etld == "mozilla.net":
            continue
        observed_urls.add(request_url)
    assert observed_urls == expected.adblockplus
def test_custom_function(self, tmpdir):
    """ Test `custom_function` with an inline function that collects links """
    from ..automation.SocketInterface import clientsocket

    def collect_links(table_name, scheme, **kwargs):
        """ Collect links with matching `scheme` and save in table `table_name` """
        # `driver` and `manager_params` are injected into kwargs by the
        # custom-function command runner
        driver = kwargs['driver']
        manager_params = kwargs['manager_params']
        link_elements = driver.find_elements_by_tag_name('a')
        link_urls = [
            element.get_attribute("href") for element in link_elements
        ]
        # Keep only links whose href starts with the requested scheme
        link_urls = filter(lambda x: x.startswith(scheme + '://'), link_urls)
        current_url = driver.current_url

        # Write the results straight to the data aggregator's socket as
        # (query, params) tuples
        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])
        query = ("CREATE TABLE IF NOT EXISTS %s ("
                 "top_url TEXT, link TEXT);" % table_name)
        sock.send((query, ()))
        for link in link_urls:
            query = ("INSERT INTO %s (top_url, link) "
                     "VALUES (?, ?)" % table_name)
            sock.send((query, (current_url, link)))
        sock.close()

    # Run a crawl that executes the inline function after the page load
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.run_custom_function(collect_links, ('page_links', 'http'))
    manager.execute_command_sequence(cs)
    manager.close()

    # The links collected by the custom function should now be queryable
    query_result = utilities.query_db(
        manager_params['db'],
        "SELECT top_url, link FROM page_links;")
    assert expected.page_links == set(query_result)
def test_browser_profile_coverage(self, tmpdir):
    """ Test the coverage of the browser's profile

    This verifies that Firefox's places.sqlite database contains
    all visited sites (with a few exceptions). If it does not,
    it is likely the profile is lost at some point during the crawl
    """
    # Run the test crawl
    data_dir = os.path.join(str(tmpdir), 'data_dir')
    manager_params, browser_params = self.get_config(data_dir)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in TEST_SITES:
        manager.get(site)
    ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                             'profile.tar.gz')
    manager.close()

    # Extract crawl profile
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0]['profile_archive_dir'])

    # Output databases
    ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                         'places.sqlite')
    crawl_db = manager_params['db']

    # Grab urls from crawl database
    rows = utilities.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for url, in rows:
        req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

    hist_ps = set()  # visited domains from CrawlHistory Table
    successes = dict()
    rows = utilities.query_db(crawl_db, "SELECT arguments, bool_success "
                              "FROM CrawlHistory WHERE command='GET'")
    for url, success in rows:
        ps = psl.get_public_suffix(urlparse(url).hostname)
        hist_ps.add(ps)
        successes[ps] = success

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = utilities.query_db(ff_db, "SELECT url FROM moz_places")
    for host, in rows:
        try:
            profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
        except AttributeError:
            # moz_places rows without a hostname (e.g. non-http schemes)
            pass

    # We expect urls to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        if successes[url] == 0 or successes[url] == -1:
            continue

        # Get the visit id for the url
        rows = utilities.query_db(crawl_db,
                                  "SELECT visit_id FROM site_visits "
                                  "WHERE site_url = ?",
                                  ('http://' + url,))
        # query_db returns row tuples; the previous code bound the whole
        # tuple as the SQL parameter instead of the scalar visit_id
        visit_id = rows[0][0]

        rows = utilities.query_db(crawl_db,
                                  "SELECT COUNT(*) FROM http_responses "
                                  "WHERE visit_id = ?", (visit_id,))
        # unwrap the scalar count -- comparing the raw tuple to an int is
        # vacuously True on Python 2 and a TypeError on Python 3
        if rows[0][0] > 1:
            continue

        rows = utilities.query_db(crawl_db,
                                  "SELECT response_status, location FROM "
                                  "http_responses WHERE visit_id = ?",
                                  (visit_id,))
        response_status, location = rows[0]
        if response_status == 204:
            continue
        if location == 'http://':  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0
def test_browse_http_table_valid(self, tmpdir):
    """Check that 'browse' works and populates http tables correctly.

    NOTE: Since the browse command is choosing links randomly, there is a
    (very small -- 2*0.5^20) chance this test will fail with valid code.
    """
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up two sequential browse commands to two URLS
    cs_a = CommandSequence.CommandSequence(url_a)
    cs_a.browse(num_links=20, sleep=1)
    cs_b = CommandSequence.CommandSequence(url_b)
    cs_b.browse(num_links=1, sleep=1)
    manager.execute_command_sequence(cs_a)
    manager.execute_command_sequence(cs_b)
    manager.close(post_process=False)

    qry_res = utilities.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    # Requests and responses for each page must be keyed to its visit
    qry_res = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM http_requests"
        " WHERE url = ?", (url_a, ))
    assert qry_res[0][0] == visit_ids[url_a]
    qry_res = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM http_requests"
        " WHERE url = ?", (url_b, ))
    assert qry_res[0][0] == visit_ids[url_b]
    qry_res = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM http_responses"
        " WHERE url = ?", (url_a, ))
    assert qry_res[0][0] == visit_ids[url_a]
    qry_res = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM http_responses"
        " WHERE url = ?", (url_b, ))
    assert qry_res[0][0] == visit_ids[url_b]

    # Page simple_a.html has five links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localtest.me
    # We should see page visits for 1 and 2, but not 3-5.
    qry_res = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM http_responses"
        " WHERE url = ?", (url_c, ))
    assert qry_res[0][0] == visit_ids[url_a]
    qry_res = utilities.query_db(
        manager_params['db'], "SELECT visit_id FROM http_responses"
        " WHERE url = ?", (url_d, ))
    assert qry_res[0][0] == visit_ids[url_a]

    # We expect 4 urls: a,c,d and a favicon request
    qry_res = utilities.query_db(
        manager_params['db'],
        "SELECT COUNT(DISTINCT url) FROM http_responses"
        " WHERE visit_id = ?", (visit_ids[url_a], ))
    assert qry_res[0][0] == 4
def test_browser_profile_coverage(self, tmpdir):
    """ Test the coverage of the browser's profile

    This verifies that Firefox's places.sqlite database contains
    all visited sites (with a few exceptions). If it does not,
    it is likely the profile is lost at some point during the crawl
    """
    # Run the test crawl
    data_dir = os.path.join(str(tmpdir), 'data_dir')
    manager_params, browser_params = self.get_config(data_dir)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in TEST_SITES:
        manager.get(site)
    ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                             'profile.tar.gz')
    manager.close(post_process=False)

    # Extract crawl profile
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0]['profile_archive_dir'])

    # Output databases
    ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                         'places.sqlite')
    crawl_db = manager_params['db']

    # Grab urls from crawl database
    rows = utilities.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for url, in rows:
        req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

    hist_ps = set()  # visited domains from CrawlHistory Table
    successes = dict()
    rows = utilities.query_db(crawl_db, "SELECT arguments, bool_success "
                              "FROM CrawlHistory WHERE command='GET'")
    for url, success in rows:
        ps = psl.get_public_suffix(urlparse(url).hostname)
        hist_ps.add(ps)
        successes[ps] = success

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = utilities.query_db(ff_db, "SELECT url FROM moz_places")
    for host, in rows:
        try:
            profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
        except AttributeError:
            # moz_places rows without a hostname (e.g. non-http schemes)
            pass

    # We expect urls to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        if successes[url] == 0 or successes[url] == -1:
            continue

        rows = utilities.query_db(crawl_db,
                                  "SELECT COUNT(*) FROM http_responses "
                                  "WHERE top_url = ?", ('http://' + url,))
        # unwrap the scalar count -- comparing the raw row tuple to an int
        # is vacuously True on Python 2 and a TypeError on Python 3
        if rows[0][0] > 1:
            continue

        rows = utilities.query_db(crawl_db,
                                  "SELECT response_status, location FROM "
                                  "http_responses WHERE top_url = ?",
                                  ('http://' + url,))
        response_status, location = rows[0]
        if response_status == 204:
            continue
        if location == 'http://':  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0