def test_cache_hits_recorded(self, tmpdir):
    """Verify all http responses are recorded, including cached responses

    Note that we expect to see all of the same requests and responses
    during the second visit (even if cached) except for images. Cached
    images do not trigger Observer Notification events.
    See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073
    """
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # Visit the page twice; the second visit (visit_id = 2) should be
    # served, at least partially, from the cache.
    manager.get(test_url, sleep=3)
    manager.get(test_url, sleep=3)
    manager.close()
    db = manager_params['db']

    # HTTP Requests
    # BUGFIX: the original query was missing a comma after
    # `triggering_origin`, which made SQLite treat `loading_origin` as a
    # column alias (`triggering_origin AS loading_origin`) and silently
    # dropped one column from every returned row.
    rows = utilities.query_db(db, (
        "SELECT url, top_level_url, is_XHR, is_frame_load, is_full_page, "
        "is_third_party_channel, is_third_party_window, triggering_origin, "
        "loading_origin, loading_href, content_policy_type "
        "FROM http_requests WHERE visit_id = 2"))
    observed_records = set()
    for row in rows:
        observed_records.add(row)
    assert expected.http_cached_requests == observed_records

    # HTTP Responses
    rows = utilities.query_db(
        db, ("SELECT url, referrer, is_cached FROM http_responses "
             "WHERE visit_id = 2"))
    observed_records = set()
    for row in rows:
        observed_records.add(row)
    assert expected.http_cached_responses == observed_records
예제 #2
0
    def test_extension_gets_correct_visit_id(self, tmpdir):
        """Instrumentation rows must carry the visit_id of the page that produced them."""
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        url_a = utilities.BASE_TEST_URL + '/simple_a.html'
        url_b = utilities.BASE_TEST_URL + '/simple_b.html'

        manager.get(url_a)
        manager.get(url_b)
        manager.close(post_process=False)

        # Map each visited site_url to the visit_id recorded for it.
        site_visits = utilities.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits")
        visit_ids = {site_url: visit_id for visit_id, site_url in site_visits}

        # simple_a.html accesses navigator.userAgent; simple_b.html accesses
        # navigator.platform — so the symbol identifies the source page.
        simple_a_visit_id = utilities.query_db(
            manager_params['db'],
            "SELECT visit_id FROM javascript WHERE symbol=?",
            ("window.navigator.userAgent",))
        simple_b_visit_id = utilities.query_db(
            manager_params['db'],
            "SELECT visit_id FROM javascript WHERE symbol=?",
            ("window.navigator.platform",))

        assert visit_ids[url_a] == simple_a_visit_id[0][0]
        assert visit_ids[url_b] == simple_b_visit_id[0][0]
예제 #3
0
    def test_extension_gets_correct_visit_id(self, tmpdir):
        """The JS instrumentation must key each record to the correct visit."""
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        url_a = utilities.BASE_TEST_URL + '/simple_a.html'
        url_b = utilities.BASE_TEST_URL + '/simple_b.html'

        manager.get(url_a)
        manager.get(url_b)
        manager.close(post_process=False)
        db_path = manager_params['db']

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for visit_id, site_url in utilities.query_db(
                db_path, "SELECT visit_id, site_url FROM site_visits"):
            visit_ids[site_url] = visit_id

        # Each page touches a distinct navigator property, so the symbol
        # tells us which visit produced the row.
        rows_a = utilities.query_db(
            db_path,
            "SELECT visit_id FROM javascript WHERE symbol=?",
            ("window.navigator.userAgent",))
        rows_b = utilities.query_db(
            db_path,
            "SELECT visit_id FROM javascript WHERE symbol=?",
            ("window.navigator.platform",))

        assert visit_ids[url_a] == rows_a[0][0]
        assert visit_ids[url_b] == rows_b[0][0]
예제 #4
0
    def test_flash_cookies(self, tmpdir):
        """ Check that some Flash LSOs are saved """
        # Crawl with Flash enabled so the LSO can actually be written.
        manager_params, browser_params = self.get_config(str(tmpdir))
        browser_params[0]['disable_flash'] = False
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Visit a page known to set a Flash cookie with a random value.
        lso_value = utilities.rand_str(8)
        expected.lso_content[5] = lso_value  # we'll expect this to be present
        expected.lso_content[0] = expected.lso_content[0].replace("REPLACEME",
                                                                  lso_value)
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value)
        test_url = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
        start_time = time.time()
        manager.get(test_url, 120)
        time.sleep(5)  # allow the plugin time to flush the LSO to disk
        manager.dump_flash_cookies(test_url, start_time)
        manager.close(post_process=False)

        # Exactly one LSO should have been recorded.
        stored = utilities.query_db(manager_params['db'],
                                    "SELECT * FROM flash_cookies")
        assert len(stored) == 1
        lso_content = list(stored[0][2:])  # drop the two key columns
        # remove randomly generated LSO directory name
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        lso_content[3] = lso_content[3].split("/", 1)[-1]
        assert lso_content == expected.lso_content
예제 #5
0
    def test_get_site_visits_table_valid(self, tmpdir):
        """Check that get works and populates db correctly."""
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Queue one get command per URL and run them in order.
        for site in (url_a, url_b):
            sequence = CommandSequence.CommandSequence(site)
            sequence.get(sleep=1)
            manager.execute_command_sequence(sequence)
        manager.close(post_process=False)

        rows = utilities.query_db(manager_params['db'],
                                  "SELECT site_url FROM site_visits")

        # Exactly the two pages we visited, in visit order.
        assert [row[0] for row in rows] == [url_a, url_b]
예제 #6
0
    def test_get_site_visits_table_valid(self, tmpdir):
        """Check that get works and populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Build both command sequences up front, then execute sequentially.
        first = CommandSequence.CommandSequence(url_a)
        first.get(sleep=1)
        second = CommandSequence.CommandSequence(url_b)
        second.get(sleep=1)
        manager.execute_command_sequence(first)
        manager.execute_command_sequence(second)
        manager.close(post_process=False)

        visited = utilities.query_db(manager_params['db'],
                                     "SELECT site_url FROM site_visits")

        # Two visits, recorded in the order they were executed.
        assert len(visited) == 2
        assert visited[0][0] == url_a
        assert visited[1][0] == url_b
 def test_http_stacktrace_nonjs_loads(self, tmpdir):
     """Requests not triggered by scripts must carry an empty call stack."""
     test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
     db = self.visit(test_url, str(tmpdir), sleep_after=3)
     rows = utilities.query_db(
         db, ("SELECT url, req_call_stack FROM http_requests"))
     # No request on this page originates from a script.
     assert all(stacktrace == "" for _, stacktrace in rows)
예제 #8
0
 def test_property_enumeration(self, tmpdir):
     """All symbols enumerated by the test page must be instrumented."""
     test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
     db = self.visit(test_url, str(tmpdir))
     rows = utilities.query_db(db,
                               "SELECT script_url, symbol FROM javascript")
     # Every access must originate from the page under test.
     for script_url, _ in rows:
         assert script_url == test_url
     observed_symbols = {symbol for _, symbol in rows}
     assert expected.properties == observed_symbols
예제 #9
0
    def test_get_http_tables_valid(self, tmpdir):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # One sequential get command per URL.
        for target in (url_a, url_b):
            sequence = CommandSequence.CommandSequence(target)
            sequence.get(sleep=1)
            manager.execute_command_sequence(sequence)
        manager.close(post_process=False)

        rows = utilities.query_db(
            manager_params['db'], "SELECT visit_id, site_url FROM site_visits")
        # Map each site_url to the visit_id assigned to it.
        visit_ids = {site_url: visit_id for visit_id, site_url in rows}

        def recorded_visit_id(table, url):
            # First visit_id recorded in `table` for `url`.
            res = utilities.query_db(
                manager_params['db'],
                "SELECT visit_id FROM " + table + " WHERE url = ?", (url,))
            return res[0][0]

        # Requests and responses must be keyed to the originating visit.
        assert recorded_visit_id("http_requests", url_a) == visit_ids[url_a]
        assert recorded_visit_id("http_requests", url_b) == visit_ids[url_b]
        assert recorded_visit_id("http_responses", url_a) == visit_ids[url_a]
        assert recorded_visit_id("http_responses", url_b) == visit_ids[url_b]
예제 #10
0
 def test_property_enumeration(self, tmpdir):
     """Each instrumented symbol on the enumeration page must be logged."""
     test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
     db = self.visit(test_url, str(tmpdir))
     rows = utilities.query_db(db,
                               "SELECT script_url, symbol FROM javascript")
     # All accesses must come from the page under test.
     assert all(script_url == test_url for script_url, _ in rows)
     observed_symbols = set(symbol for _, symbol in rows)
     assert expected.properties == observed_symbols
예제 #11
0
    def test_get_http_tables_valid(self, tmpdir):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        seq_a = CommandSequence.CommandSequence(url_a)
        seq_a.get(sleep=1)
        seq_b = CommandSequence.CommandSequence(url_b)
        seq_b.get(sleep=1)
        manager.execute_command_sequence(seq_a)
        manager.execute_command_sequence(seq_b)
        manager.close(post_process=False)

        db_path = manager_params['db']
        visits = utilities.query_db(
            db_path, "SELECT visit_id, site_url FROM site_visits")
        # Map each site_url to the visit_id assigned to it.
        visit_ids = {site_url: visit_id for visit_id, site_url in visits}

        # Rows in both http tables must be keyed to the originating visit.
        for table in ("http_requests", "http_responses"):
            for url in (url_a, url_b):
                rows = utilities.query_db(
                    db_path,
                    "SELECT visit_id FROM " + table + " WHERE url = ?",
                    (url,))
                assert rows[0][0] == visit_ids[url]
    def test_page_visit(self, tmpdir):
        """Visit the http test page and verify request/response logging."""
        test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
        db = self.visit(test_url, str(tmpdir))

        # HTTP Requests
        # BUGFIX: a comma was missing after `triggering_origin`, which made
        # SQLite treat `loading_origin` as an alias for `triggering_origin`
        # and silently dropped one column from every returned row.
        rows = utilities.query_db(db, (
            "SELECT url, top_level_url, is_XHR, is_frame_load, is_full_page, "
            "is_third_party_channel, is_third_party_window, triggering_origin, "
            "loading_origin, loading_href, content_policy_type FROM http_requests"
        ))
        observed_records = set()
        for row in rows:
            observed_records.add(row)
        assert expected.http_requests == observed_records

        # HTTP Responses
        rows = utilities.query_db(
            db, "SELECT url, referrer, location FROM http_responses")
        observed_records = set()
        for row in rows:
            observed_records.add(row)
        assert expected.http_responses == observed_records
 def test_http_stacktrace(self, tmpdir):
     """Script-triggered requests must record the expected call stacks."""
     test_url = utilities.BASE_TEST_URL + '/http_stacktrace.html'
     db = self.visit(test_url, str(tmpdir), sleep_after=3)
     rows = utilities.query_db(
         db, ("SELECT url, req_call_stack FROM http_requests"))
     # Only the three script-injected resources carry interesting stacks.
     script_loaded = ("inject_pixel.js", "test_image.png", "Blank.gif")
     observed_records = {
         stacktrace for url, stacktrace in rows
         if url.endswith(script_loaded)
     }
     assert observed_records == expected.http_stacktraces
예제 #14
0
    def test_flash_cookies(self, tmpdir):
        """ Check that some Flash LSOs are saved and
        are properly keyed in db."""
        # Run the test crawl with Flash enabled.
        manager_params, browser_params = self.get_config(str(tmpdir))
        browser_params[0]['disable_flash'] = False
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Two visits to the LSO-setting page, each with its own random value.
        lso_value_a = utilities.rand_str(8)
        lso_value_b = utilities.rand_str(8)
        for value, expected_content in ((lso_value_a, expected_lso_content_a),
                                        (lso_value_b, expected_lso_content_b)):
            expected_content[5] = value  # we'll expect this to be present
            qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                              value)
            page = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
            cs = CommandSequence.CommandSequence(page)
            cs.get(sleep=3, timeout=120)
            cs.dump_flash_cookies()
            manager.execute_command_sequence(cs)

        manager.close()

        # One stored LSO per visit, in visit order.
        qry_res = utilities.query_db(manager_params['db'],
                                     "SELECT * FROM flash_cookies")
        assert len(qry_res) == 2
        observed = []
        for row in qry_res:
            content = list(row[2:])  # drop the first two key columns
            # remove randomly generated LSO directory name
            # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
            content[3] = content[3].split("/", 1)[-1]
            observed.append(content)
        assert observed[0] == expected_lso_content_a
        assert observed[1] == expected_lso_content_b
예제 #15
0
 def test_js_profile_cookies(self, tmpdir):
     """ Check that profile cookies set by JS are saved """
     # Run the test crawl
     manager_params, browser_params = self.get_config(str(tmpdir))
     manager = TaskManager.TaskManager(manager_params, browser_params)
     url = utilities.BASE_TEST_URL + "/js_cookie.html"
     visit_start = time.time()
     manager.get(url)
     time.sleep(5)  # give the browser time to persist the cookie
     manager.dump_profile_cookies(url, visit_start)
     manager.close(post_process=False)

     # Exactly one cookie row should exist, matching the JS-set cookie.
     rows = utilities.query_db(manager_params['db'],
                               "SELECT * FROM profile_cookies")
     assert len(rows) == 1
     # compare URL, domain, name, value, origin, path
     assert rows[0][2:8] == expected.js_cookie
예제 #16
0
 def test_js_profile_cookies(self, tmpdir):
     """Verify a cookie set via JavaScript ends up in profile_cookies."""
     manager_params, browser_params = self.get_config(str(tmpdir))
     manager = TaskManager.TaskManager(manager_params, browser_params)
     cookie_page = utilities.BASE_TEST_URL + "/js_cookie.html"
     sequence = CommandSequence.CommandSequence(cookie_page)
     sequence.get(sleep=3, timeout=120)
     sequence.dump_profile_cookies()
     manager.execute_command_sequence(sequence)
     manager.close(post_process=False)
     # The page sets exactly one cookie.
     stored = utilities.query_db(manager_params['db'], "SELECT * FROM profile_cookies")
     assert len(stored) == 1
     # columns 2..7 hold URL, domain, name, value, origin, path
     assert stored[0][2:8] == expected_js_cookie
예제 #17
0
    def test_profile_cookies(self, tmpdir):
        """ Check that some profile cookies are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # TODO update this to local test site
        url = 'http://www.yahoo.com'
        cs = CommandSequence.CommandSequence(url)
        cs.get(sleep=3, timeout=120)
        cs.dump_profile_cookies()
        manager.execute_command_sequence(cs)
        manager.close()

        # Check that some profile cookies are recorded
        qry_res = utilities.query_db(manager_params['db'],
                                     "SELECT COUNT(*) FROM profile_cookies")
        # BUGFIX: query_db returns rows; COUNT(*) yields one single-element
        # row. The original bound the row *tuple* and compared it to an int,
        # which raises TypeError on Python 3 — extract the scalar first.
        prof_cookie_count = qry_res[0][0]
        assert prof_cookie_count > 0
예제 #18
0
    def test_profile_cookies(self, tmpdir):
        """ Check that some profile cookies are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # TODO update this to local test site
        url = 'http://www.yahoo.com'
        start_time = time.time()
        manager.get(url)
        time.sleep(5)  # allow cookies to be written before dumping
        manager.dump_profile_cookies(url, start_time, timeout=90)
        manager.close(post_process=False)

        # Check that some profile cookies are recorded
        qry_res = utilities.query_db(manager_params['db'],
                                     "SELECT COUNT(*) FROM profile_cookies")
        # BUGFIX: COUNT(*) returns one single-element row; the original
        # compared the row tuple itself to an int, which raises TypeError
        # on Python 3 — index out the scalar count.
        prof_cookie_count = qry_res[0][0]
        assert prof_cookie_count > 0
예제 #19
0
 def test_js_profile_cookies(self, tmpdir):
     """ Check that profile cookies set by JS are saved """
     # Run the test crawl
     manager_params, browser_params = self.get_config(str(tmpdir))
     manager = TaskManager.TaskManager(manager_params, browser_params)
     target = utilities.BASE_TEST_URL + "/js_cookie.html"
     commands = CommandSequence.CommandSequence(target)
     commands.get(sleep=3, timeout=120)
     commands.dump_profile_cookies()
     manager.execute_command_sequence(commands)
     manager.close()
     # The page stores exactly one cookie via JavaScript.
     saved = utilities.query_db(manager_params['db'],
                                "SELECT * FROM profile_cookies")
     assert len(saved) == 1
     first_cookie = saved[0]
     # compare URL, domain, name, value, origin, path
     assert first_cookie[2:8] == expected_js_cookie
예제 #20
0
    def test_profile_cookies(self, tmpdir):
        """ Check that some profile cookies are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # TODO update this to local test site
        url = 'http://www.yahoo.com'
        cs = CommandSequence.CommandSequence(url)
        cs.get(sleep=3, timeout=120)
        cs.dump_profile_cookies()
        manager.execute_command_sequence(cs)
        manager.close(post_process=False)

        # Check that some profile cookies are recorded
        qry_res = utilities.query_db(manager_params['db'],
                                     "SELECT COUNT(*) FROM profile_cookies")
        # BUGFIX: qry_res[0] is the (count,) row tuple; comparing a tuple
        # to an int raises TypeError on Python 3 — index out the scalar.
        prof_cookie_count = qry_res[0][0]
        assert prof_cookie_count > 0
예제 #21
0
    def test_flash_cookies(self, tmpdir):
        """ Check that some Flash LSOs are saved and
        are properly keyed in db."""
        # Run the test crawl with Flash enabled.
        manager_params, browser_params = self.get_config(str(tmpdir))
        browser_params[0]['disable_flash'] = False
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Visit the LSO-setting page twice, once per random value.
        lso_value_a = utilities.rand_str(8)
        lso_value_b = utilities.rand_str(8)
        expected_lso_content_a[5] = lso_value_a  # expected to be present
        expected_lso_content_b[5] = lso_value_b  # expected to be present
        for lso_value in (lso_value_a, lso_value_b):
            qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                              lso_value)
            page = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
            sequence = CommandSequence.CommandSequence(page)
            sequence.get(sleep=3, timeout=120)
            sequence.dump_flash_cookies()
            manager.execute_command_sequence(sequence)

        manager.close(post_process=False)

        # One LSO per visit should have been recorded, in visit order.
        stored = utilities.query_db(manager_params['db'],
                                    "SELECT * FROM flash_cookies")
        assert len(stored) == 2
        contents = []
        for row in stored:
            content = list(row[2:])  # drop the first two key columns
            # remove randomly generated LSO directory name
            # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
            content[3] = content[3].split("/", 1)[-1]
            contents.append(content)
        assert contents[0] == expected_lso_content_a
        assert contents[1] == expected_lso_content_b
예제 #22
0
    def test_browse_http_table_valid(self, tmpdir):
        """Check that 'browse' works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        cs_a = CommandSequence.CommandSequence(url_a)
        cs_a.browse(num_links=1, sleep=1)
        cs_b = CommandSequence.CommandSequence(url_b)
        cs_b.browse(num_links=1, sleep=1)
        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close(post_process=False)

        def query_visit_ids(table, url):
            """Rows of visit_ids recorded in `table` for `url`."""
            return utilities.query_db(
                manager_params['db'],
                "SELECT visit_id FROM " + table + " WHERE url = ?", (url,))

        # Construct dict mapping site_url to visit_id
        site_rows = utilities.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits")
        visit_ids = {site_url: visit_id for visit_id, site_url in site_rows}

        assert query_visit_ids("http_requests", url_a)[0][0] == visit_ids[url_a]
        assert query_visit_ids("http_requests", url_b)[0][0] == visit_ids[url_b]
        assert query_visit_ids("http_responses", url_a)[0][0] == visit_ids[url_a]
        assert query_visit_ids("http_responses", url_b)[0][0] == visit_ids[url_b]

        # Page simple_a.html has a link to simple_c.html. This request should
        # be keyed to the site visit for simple_a.html
        linked = query_visit_ids("http_responses", url_c)
        assert len(linked) == 1
        assert linked[0][0] == visit_ids[url_a]
예제 #23
0
    def test_browse_http_table_valid(self, tmpdir):
        """Check that 'browse' works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # One browse command per URL, executed sequentially.
        for site in (url_a, url_b):
            sequence = CommandSequence.CommandSequence(site)
            sequence.browse(num_links=1, sleep=1)
            manager.execute_command_sequence(sequence)
        manager.close(post_process=False)

        db_path = manager_params['db']
        # Map each site_url to the visit_id assigned to it.
        visit_ids = {
            site_url: visit_id
            for visit_id, site_url in utilities.query_db(
                db_path, "SELECT visit_id, site_url FROM site_visits")
        }

        # Rows in both http tables must be keyed to the originating visit.
        for table in ("http_requests", "http_responses"):
            for url in (url_a, url_b):
                rows = utilities.query_db(
                    db_path,
                    "SELECT visit_id FROM " + table + " WHERE url = ?",
                    (url,))
                assert rows[0][0] == visit_ids[url]

        # Page simple_a.html has a link to simple_c.html. This request should
        # be keyed to the site visit for simple_a.html
        linked = utilities.query_db(
            db_path,
            "SELECT visit_id FROM http_responses WHERE url = ?", (url_c,))
        assert len(linked) == 1
        assert linked[0][0] == visit_ids[url_a]
예제 #24
0
    def test_blocks_includes(self, tmpdir):
        """Crawl with an AdBlock Plus list and verify the surviving requests."""
        data_dir = str(tmpdir)
        list_loc = os.path.join(data_dir, 'adblock_plus')
        manager_params, browser_params = self.get_config(data_dir)
        fetch_adblockplus_list(list_loc)
        browser_params[0]['adblock-plus_list_location'] = list_loc
        manager = TaskManager.TaskManager(manager_params, browser_params)
        manager.get(utilities.BASE_TEST_URL + '/abp/adblock_plus_test.html')
        manager.close(post_process=False)

        db = os.path.join(data_dir, manager_params['database_name'])
        rows = utilities.query_db(db, "SELECT url FROM http_requests")
        # exclude requests to safebrowsing and tracking protection backends
        backend_domains = ("mozilla.com", "mozilla.net")
        urls = {
            url for url, in rows
            if psl.get_public_suffix(urlparse(url).hostname)
            not in backend_domains
        }
        assert urls == expected.adblockplus
예제 #25
0
    def test_blocks_includes(self, tmpdir):
        """Requests on the ABP test page must be filtered by the block list."""
        data_dir = str(tmpdir)
        list_loc = os.path.join(data_dir, 'adblock_plus')
        manager_params, browser_params = self.get_config(data_dir)
        fetch_adblockplus_list(list_loc)
        browser_params[0]['adblock-plus_list_location'] = list_loc
        manager = TaskManager.TaskManager(manager_params, browser_params)
        manager.get(utilities.BASE_TEST_URL + '/abp/adblock_plus_test.html')
        manager.close()

        db = os.path.join(data_dir, manager_params['database_name'])
        urls = set()
        for url, in utilities.query_db(db, "SELECT url FROM http_requests"):
            ps1 = psl.get_public_suffix(urlparse(url).hostname)
            # exclude requests to safebrowsing and tracking protection backends
            if ps1 != "mozilla.com" and ps1 != "mozilla.net":
                urls.add(url)
        assert urls == expected.adblockplus
예제 #26
0
    def test_custom_function(self, tmpdir):
        """ Test `custom_function` with an inline function that collects links """

        from ..automation.SocketInterface import clientsocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with matching `scheme` and save in table `table_name` """
            # `driver` and `manager_params` are injected via kwargs by the
            # task manager when it invokes the custom command.
            driver = kwargs['driver']
            manager_params = kwargs['manager_params']
            link_elements = driver.find_elements_by_tag_name('a')
            link_urls = [
                element.get_attribute("href") for element in link_elements
            ]
            # Keep only links whose href uses the requested scheme.
            link_urls = filter(lambda x: x.startswith(scheme + '://'),
                               link_urls)
            current_url = driver.current_url

            # Results are reported to the data aggregator over its socket;
            # each message is a (query, params) tuple.
            sock = clientsocket()
            sock.connect(*manager_params['aggregator_address'])

            # NOTE(review): table_name is interpolated directly into the SQL
            # string — acceptable for this test's fixed value, but not
            # injection-safe for untrusted input.
            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT);" % table_name)
            sock.send((query, ()))

            # One insert per collected link, keyed to the page it came from.
            for link in link_urls:
                query = ("INSERT INTO %s (top_url, link) "
                         "VALUES (?, ?)" % table_name)
                sock.send((query, (current_url, link)))
            sock.close()

        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        cs = CommandSequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        # Run the custom function after page load, then verify its table.
        cs.run_custom_function(collect_links, ('page_links', 'http'))
        manager.execute_command_sequence(cs)
        manager.close()
        query_result = utilities.query_db(
            manager_params['db'], "SELECT top_url, link FROM page_links;")
        assert expected.page_links == set(query_result)
# Example #27
    def test_browser_profile_coverage(self, tmpdir):
        """ Test the coverage of the browser's profile

        This verifies that Firefox's places.sqlite database contains
        all visited sites (with a few exceptions). If it does not,
        it is likely the profile is lost at some point during the crawl.
        """
        # Run the test crawl
        data_dir = os.path.join(str(tmpdir), 'data_dir')
        manager_params, browser_params = self.get_config(data_dir)
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in TEST_SITES:
            manager.get(site)
        # Read the archive location before close() tears the manager down.
        ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                                 'profile.tar.gz')
        manager.close()

        # Extract crawl profile
        with tarfile.open(ff_db_tar) as tar:
            tar.extractall(browser_params[0]['profile_archive_dir'])

        # Output databases
        ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                             'places.sqlite')
        crawl_db = manager_params['db']

        # Grab urls from crawl database
        rows = utilities.query_db(crawl_db, "SELECT url FROM http_requests")
        req_ps = set()  # visited domains from http_requests table
        for url, in rows:
            req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

        hist_ps = set()  # visited domains from CrawlHistory Table
        successes = dict()
        rows = utilities.query_db(crawl_db, "SELECT arguments, bool_success "
                                  "FROM CrawlHistory WHERE command='GET'")
        for url, success in rows:
            ps = psl.get_public_suffix(urlparse(url).hostname)
            hist_ps.add(ps)
            successes[ps] = success

        # Grab urls from Firefox database
        profile_ps = set()  # visited domains from firefox profile
        rows = utilities.query_db(ff_db, "SELECT url FROM moz_places")
        for host, in rows:
            try:
                profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
            except AttributeError:
                # Some moz_places URLs (e.g. `place:` URIs) have no hostname.
                pass

        # We expect urls to be in the Firefox profile if:
        # 1. We've made requests to it
        # 2. The url is a top_url we entered into the address bar
        # 3. The url successfully loaded (see: Issue #40)
        # 4. The site does not respond to the initial request with a 204 (won't show in FF DB)
        missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
        unexpected_missing_urls = set()
        for url in missing_urls:
            if successes[url] == 0 or successes[url] == -1:
                continue

            # Get the visit id for the url.
            # BUG FIX: query_db returns a list of row tuples, so index into
            # the first row for the scalar id (previously `rows[0]` passed a
            # whole tuple on as the query parameter).
            rows = utilities.query_db(crawl_db,
                                      "SELECT visit_id FROM site_visits "
                                      "WHERE site_url = ?",
                                      ('http://' + url,))
            visit_id = rows[0][0]

            # More than one response means the visit loaded real content; the
            # single-response redirect/204 special cases are handled below.
            # BUG FIX: compare the scalar count, not the row tuple
            # (`rows[0] > 1` was a tuple-vs-int comparison).
            rows = utilities.query_db(crawl_db,
                                      "SELECT COUNT(*) FROM http_responses "
                                      "WHERE visit_id = ?",
                                      (visit_id,))
            if rows[0][0] > 1:
                continue

            rows = utilities.query_db(crawl_db,
                                      "SELECT response_status, location FROM "
                                      "http_responses WHERE visit_id = ?",
                                      (visit_id,))
            response_status, location = rows[0]
            if response_status == 204:  # no-content responses never hit FF DB
                continue
            if location == 'http://':  # site returned a blank redirect
                continue
            unexpected_missing_urls.add(url)

        assert len(unexpected_missing_urls) == 0
# Example #28
    def test_browse_http_table_valid(self, tmpdir):
        """Check that 'browse' works and populates http tables correctly.

        Verifies that requests/responses triggered while the browse command
        randomly clicks links are attributed to the visit_id of the top-level
        site, not to a separate visit.

        NOTE: Since the browse command is choosing links randomly, there is a
              (very small -- 2*0.5^20) chance this test will fail with valid
              code.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        cs_a = CommandSequence.CommandSequence(url_a)
        cs_a.browse(num_links=20, sleep=1)
        cs_b = CommandSequence.CommandSequence(url_b)
        cs_b.browse(num_links=1, sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close(post_process=False)

        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id, site_url FROM site_visits")

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        # Top-level requests/responses for each site carry that site's visit_id
        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id FROM http_requests"
            " WHERE url = ?", (url_a, ))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id FROM http_requests"
            " WHERE url = ?", (url_b, ))
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id FROM http_responses"
            " WHERE url = ?", (url_a, ))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id FROM http_responses"
            " WHERE url = ?", (url_b, ))
        assert qry_res[0][0] == visit_ids[url_b]

        # Page simple_a.html has five links:
        # 1) An absolute link to simple_c.html
        # 2) A relative link to simple_d.html
        # 3) A javascript: link
        # 4) A link to www.google.com
        # 5) A link to example.com?localtest.me
        # We should see page visits for 1 and 2, but not 3-5.
        # Responses for the browsed-to pages inherit url_a's visit_id.
        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id FROM http_responses"
            " WHERE url = ?", (url_c, ))
        assert qry_res[0][0] == visit_ids[url_a]
        qry_res = utilities.query_db(
            manager_params['db'], "SELECT visit_id FROM http_responses"
            " WHERE url = ?", (url_d, ))
        assert qry_res[0][0] == visit_ids[url_a]

        # We expect 4 urls: a,c,d and a favicon request
        qry_res = utilities.query_db(
            manager_params['db'],
            "SELECT COUNT(DISTINCT url) FROM http_responses"
            " WHERE visit_id = ?", (visit_ids[url_a], ))
        assert qry_res[0][0] == 4
# Example #29
    def test_browser_profile_coverage(self, tmpdir):
        """ Test the coverage of the browser's profile

        This verifies that Firefox's places.sqlite database contains
        all visited sites (with a few exceptions). If it does not,
        it is likely the profile is lost at some point during the crawl.

        NOTE(review): this method name duplicates an earlier test in this
        file; if both live in the same class, this definition shadows the
        first -- confirm which copy is intended to run.
        """
        # Run the test crawl
        data_dir = os.path.join(str(tmpdir), 'data_dir')
        manager_params, browser_params = self.get_config(data_dir)
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in TEST_SITES:
            manager.get(site)
        # Read the archive location before close() tears the manager down.
        ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                                 'profile.tar.gz')
        manager.close(post_process=False)

        # Extract crawl profile
        with tarfile.open(ff_db_tar) as tar:
            tar.extractall(browser_params[0]['profile_archive_dir'])

        # Output databases
        ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                             'places.sqlite')
        crawl_db = manager_params['db']

        # Grab urls from crawl database
        rows = utilities.query_db(crawl_db, "SELECT url FROM http_requests")
        req_ps = set()  # visited domains from http_requests table
        for url, in rows:
            req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

        hist_ps = set()  # visited domains from CrawlHistory table
        successes = dict()
        rows = utilities.query_db(crawl_db, "SELECT arguments, bool_success "
                                  "FROM CrawlHistory WHERE command='GET'")
        for url, success in rows:
            ps = psl.get_public_suffix(urlparse(url).hostname)
            hist_ps.add(ps)
            successes[ps] = success

        # Grab urls from Firefox database
        profile_ps = set()  # visited domains from firefox profile
        rows = utilities.query_db(ff_db, "SELECT url FROM moz_places")
        for host, in rows:
            try:
                profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
            except AttributeError:
                # Some moz_places URLs (e.g. `place:` URIs) have no hostname.
                pass

        # We expect urls to be in the Firefox profile if:
        # 1. We've made requests to it
        # 2. The url is a top_url we entered into the address bar
        # 3. The url successfully loaded (see: Issue #40)
        # 4. The site does not respond to the initial request with a 204 (won't show in FF DB)
        missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
        unexpected_missing_urls = set()
        for url in missing_urls:
            if successes[url] == 0 or successes[url] == -1:
                continue

            # More than one response means the visit loaded real content; the
            # single-response redirect/204 special cases are handled below.
            # BUG FIX: compare the scalar count, not the row tuple
            # (`rows[0] > 1` was a tuple-vs-int comparison).
            rows = utilities.query_db(crawl_db,
                                      "SELECT COUNT(*) FROM http_responses "
                                      "WHERE top_url = ?",
                                      ('http://' + url,))
            if rows[0][0] > 1:
                continue
            rows = utilities.query_db(crawl_db,
                                      "SELECT response_status, location FROM "
                                      "http_responses WHERE top_url = ?",
                                      ('http://' + url,))
            response_status, location = rows[0]
            if response_status == 204:  # no-content responses never hit FF DB
                continue
            if location == 'http://':  # site returned a blank redirect
                continue
            unexpected_missing_urls.add(url)

        assert len(unexpected_missing_urls) == 0