示例#1
0
 def test_saving(self):
     """A compressed browser profile archive must exist after the crawl."""
     manager_params, browser_params = self.get_config()
     manager = TaskManager.TaskManager(manager_params, browser_params)
     manager.get('http://example.com')
     manager.close()
     archive_path = join(
         browser_params[0]['profile_archive_dir'], 'profile.tar.gz')
     assert isfile(archive_path)
示例#2
0
    def test_extension_gets_correct_visit_id(self):
        """Instrumented JS records must be keyed by the correct visit id."""
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        url_a = utilities.BASE_TEST_URL + '/simple_a.html'
        url_b = utilities.BASE_TEST_URL + '/simple_b.html'

        manager.get(url_a)
        manager.get(url_b)
        manager.close()
        rows = db_utils.query_db(
            manager_params['db'], "SELECT visit_id, site_url FROM site_visits")

        # Map each visited site_url to the visit_id assigned to it
        visit_ids = {site_url: visit_id for visit_id, site_url in rows}

        simple_a_visit_id = db_utils.query_db(
            manager_params['db'], "SELECT visit_id FROM javascript WHERE "
            "symbol=?", ("window.navigator.userAgent", ))

        simple_b_visit_id = db_utils.query_db(
            manager_params['db'], "SELECT visit_id FROM javascript WHERE "
            "symbol=?", ("window.navigator.platform", ))

        # Each page accesses a distinct symbol, so the javascript rows
        # must carry the visit id of the page that touched that symbol.
        assert simple_a_visit_id[0][0] == visit_ids[url_a]
        assert simple_b_visit_id[0][0] == visit_ids[url_b]
示例#3
0
    def test_get_site_visits_table_valid(self):
        """Check that get works and populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Queue a get() command sequence for each of the two URLs
        cs_first = CommandSequence.CommandSequence(url_a)
        cs_first.get(sleep=1)
        cs_second = CommandSequence.CommandSequence(url_b)
        cs_second.get(sleep=1)

        # Execute them back to back, then shut down the manager
        manager.execute_command_sequence(cs_first)
        manager.execute_command_sequence(cs_second)
        manager.close()

        rows = db_utils.query_db(manager_params['db'],
                                 "SELECT site_url FROM site_visits")

        # Two visits, recorded in the order they were issued
        assert len(rows) == 2
        assert rows[0][0] == url_a
        assert rows[1][0] == url_b
示例#4
0
    def test_save_screenshot_valid(self, tmpdir):
        """Check that 'save_screenshot' works

        Both the viewport screenshot and the full-page screenshot must be
        written under `screenshots/` and contain at least one non-white
        pixel. The duplicated blank-image check from the original is
        factored into a local helper.
        """
        def assert_screenshot_not_blank(suffix):
            """Open the screenshot named with *suffix* and assert it is
            not a uniformly white (blank) image."""
            pattern = os.path.join(
                str(tmpdir), 'screenshots', '1-*-%s.png' % suffix)
            screenshot = glob.glob(pattern)[0]
            im = Image.open(screenshot)
            # A band whose extrema are (255, 255) is uniformly white.
            is_blank = all(
                band.getextrema() == (255, 255) for band in im.split())
            assert not is_blank

        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        cs = CommandSequence.CommandSequence(url_a)
        cs.get(sleep=1)
        cs.save_screenshot('test')
        cs.screenshot_full_page('test_full')
        manager.execute_command_sequence(cs)
        manager.close()

        # Check that the viewport and full-page images are not blank
        assert_screenshot_not_blank('test')
        assert_screenshot_not_blank('test_full')
示例#5
0
 def test_crash(self):
     """With failure_limit set to 0, repeated browser failures must
     surface as a CommandExecutionError from the TaskManager."""
     manager_params, browser_params = self.get_config()
     manager_params['failure_limit'] = 0
     manager = TaskManager.TaskManager(manager_params, browser_params)
     with pytest.raises(CommandExecutionError):
         manager.get('http://example.com')  # So we have a profile
         manager.get('example.com')  # Selenium requires scheme prefix
         manager.get('example.com')  # Requires two commands to shut down
示例#6
0
 def visit(self, page_url, data_dir="", sleep_after=0):
     """Visit a test page and return the path of the crawl database."""
     manager_params, browser_params = self.get_config(data_dir)
     manager = TaskManager.TaskManager(manager_params, browser_params)
     # Treat anything without a scheme as a path on the local test server.
     full_url = (page_url if page_url.startswith("http")
                 else utilities.BASE_TEST_URL + page_url)
     manager.get(url=full_url, sleep=sleep_after)
     manager.close()
     return manager_params['db']
示例#7
0
 def __init__(self):
     """Build every combination of the configured browser parameters and
     launch one browser per combination via setup().

     For each parameter key the position it occupies inside a combination
     tuple is recorded in self.parameters[key]['index'] so that setup()
     can index into the tuples later.
     """
     self.manager = None
     param_list = []
     # enumerate() replaces the original hand-rolled index counter.
     for index, key in enumerate(self.parameters):
         param_list.append(self.parameters[key]['combinations'])
         self.parameters[key]['index'] = index
     # list(...) replaces the redundant `[p for p in ...]` wrapper.
     combinations = list(itertools.product(*param_list))
     manager_params, browser_params = TaskManager.load_default_params(
         len(combinations))
     self.setup(combinations, manager_params, browser_params)
示例#8
0
 def test_crash_profile(self):
     """Even when the crawl aborts because the failure limit is exceeded,
     the browser profile must still be archived to disk."""
     manager_params, browser_params = self.get_config()
     manager_params['failure_limit'] = 2
     manager = TaskManager.TaskManager(manager_params, browser_params)
     try:
         manager.get('http://example.com')  # So we have a profile
         manager.get('example.com')  # Selenium requires scheme prefix
         manager.get('example.com')  # Selenium requires scheme prefix
         manager.get('example.com')  # Selenium requires scheme prefix
         manager.get('example.com')  # Requires two commands to shut down
     except CommandExecutionError:
         pass  # expected: the failure limit was deliberately exceeded
     assert isfile(
         join(browser_params[0]['profile_archive_dir'], 'profile.tar.gz'))
示例#9
0
 def setup(self, combinations, manager_params, browser_params):
     """Apply the i-th parameter combination to the i-th browser and
     start the TaskManager.

     Each entry of `combinations` is a tuple whose positions were
     recorded in self.parameters[key]['index'] by __init__.
     """
     manager_params['data_directory'] = '~/Documents/openWpmDb/'
     manager_params['log_directory'] = '~/Desktop/'
     # The same six flags are copied for every browser; iterating over
     # the key names removes the original's six copy-pasted assignments.
     keys = ('http_instrument', 'js_instrument', 'disable_flash',
             'tp_cookies', 'ghostery', 'https-everywhere')
     for i, combination in enumerate(combinations):
         for key in keys:
             browser_params[i][key] = combination[
                 self.parameters[key]['index']]
     self.manager = TaskManager.TaskManager(manager_params, browser_params)
示例#10
0
    def test_flash_cookies(self):
        """ Check that some Flash LSOs are saved and
        are properly keyed in db."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        browser_params[0]['disable_flash'] = False  # LSOs require Flash
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Get a site we know sets Flash cookies and visit it twice
        # NOTE(review): expected_lso_content_a/_b look like module-level
        # templates mutated in place here -- confirm no other test depends
        # on their prior contents.
        lso_value_a = utilities.rand_str(8)
        expected_lso_content_a[5] = lso_value_a  # expected to be present
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value_a)
        test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
        cs = CommandSequence.CommandSequence(test_url_a)
        cs.get(sleep=3, timeout=120)
        cs.dump_flash_cookies()
        manager.execute_command_sequence(cs)

        # Second visit with a fresh random LSO value
        lso_value_b = utilities.rand_str(8)
        expected_lso_content_b[5] = lso_value_b  # expected to be present
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value_b)
        test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
        cs = CommandSequence.CommandSequence(test_url_b)
        cs.get(sleep=3, timeout=120)
        cs.dump_flash_cookies()
        manager.execute_command_sequence(cs)

        manager.close()

        #  Check that some flash cookies are recorded
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT * FROM flash_cookies",
                                    as_tuple=True)
        lso_count = len(qry_res)
        assert lso_count == 2  # one LSO per visit
        lso_content_a = list(qry_res[0][2:])  # Remove first two items
        lso_content_b = list(qry_res[1][2:])  # Remove first two items
        # remove randomly generated LSO directory name
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        lso_content_a[3] = lso_content_a[3].split("/", 1)[-1]  # rm LSO dirname
        lso_content_b[3] = lso_content_b[3].split("/", 1)[-1]  # rm LSO dirname
        assert lso_content_a == expected_lso_content_a
        assert lso_content_b == expected_lso_content_b
示例#11
0
    def test_profile_cookies(self):
        """ Check that some profile cookies are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # TODO update this to local test site
        url = 'http://www.yahoo.com'
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(sleep=3, timeout=120)
        sequence.dump_profile_cookies()
        manager.execute_command_sequence(sequence)
        manager.close()

        # At least one profile cookie must have been recorded
        rows = db_utils.query_db(manager_params['db'],
                                 "SELECT COUNT(*) FROM profile_cookies")
        assert rows[0][0] > 0
示例#12
0
    def test_get_http_tables_valid(self):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = CommandSequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = CommandSequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        rows = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits")

        # Map each site_url to the visit_id assigned to it
        visit_ids = {site_url: visit_id for visit_id, site_url in rows}

        # For both http tables, the row logged for each page's URL must
        # carry the visit_id that page was assigned in site_visits.
        for table in ('http_requests', 'http_responses'):
            for url in (url_a, url_b):
                rows = db_utils.query_db(
                    manager_params['db'],
                    "SELECT visit_id FROM %s WHERE url = ?" % table,
                    (url,))
                assert rows[0][0] == visit_ids[url]
示例#13
0
 def test_js_profile_cookies(self):
     """ Check that profile cookies set by JS are saved """
     # Run the test crawl
     manager_params, browser_params = self.get_config()
     manager = TaskManager.TaskManager(manager_params, browser_params)
     sequence = CommandSequence.CommandSequence(
         utilities.BASE_TEST_URL + "/js_cookie.html")
     sequence.get(sleep=3, timeout=120)
     sequence.dump_profile_cookies()
     manager.execute_command_sequence(sequence)
     manager.close()
     # Check that the JS cookie we stored is recorded
     rows = db_utils.query_db(manager_params['db'],
                              "SELECT * FROM profile_cookies",
                              as_tuple=True)
     assert len(rows) == 1  # we store only one cookie
     # compare URL, domain, name, value, origin, path (columns 2-7)
     assert rows[0][2:8] == expected_js_cookie
示例#14
0
    def test_recursive_dump_page_source_valid(self, tmpdir):
        """Check that 'recursive_dump_page_source' works

        Verifies both the frame tree structure (each frame's chain of
        parent URLs) and that each frame's captured source matches the
        file served for it.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        cs = CommandSequence.CommandSequence(NESTED_FRAMES_URL)
        cs.get(sleep=1)
        cs.recursive_dump_page_source()
        manager.execute_command_sequence(cs)
        manager.close()

        outfile = os.path.join(str(tmpdir), 'sources', '1-*.json.gz')
        src_file = glob.glob(outfile)[0]
        with gzip.GzipFile(src_file, 'rb') as f:
            visit_source = json.loads(f.read().decode('utf-8'))

        observed_parents = dict()

        def normalize(source):
            """Strip all whitespace and a leading doctype so page sources
            can be compared independent of formatting."""
            # Raw string fixes the original's invalid '\s' escape sequence.
            normalized = re.sub(r'\s', '', source.lower())
            if normalized.startswith('<!doctypehtml>'):
                normalized = normalized[len('<!doctypehtml>'):]
            return normalized

        def verify_frame(frame, parent_frames=None):
            # None default fixes the original's mutable default argument.
            if parent_frames is None:
                parent_frames = []
            # Verify structure
            observed_parents[frame['doc_url']] = list(parent_frames)  # copy

            # Verify source against the file served from the test directory
            path = urlparse(frame['doc_url']).path
            with open('.' + path, 'r') as f:
                expected_source = normalize(f.read())
            assert normalize(frame['source']) == expected_source

            # Verify children
            parent_frames.append(frame['doc_url'])
            for key, child_frame in frame['iframes'].items():
                verify_frame(child_frame, parent_frames)
            parent_frames.pop()

        verify_frame(visit_source)
        assert EXPECTED_PARENTS == observed_parents
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        from src.open_wpm.automation import clientsocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name` """
            driver = kwargs['driver']
            manager_params = kwargs['manager_params']
            # Keep only anchor hrefs that start with the requested scheme
            link_urls = [
                x
                for x in (element.get_attribute("href")
                          for element in driver.find_elements_by_tag_name('a'))
                if x.startswith(scheme + '://')
            ]
            current_url = driver.current_url

            # Write the links through the data aggregator's socket interface
            sock = clientsocket()
            sock.connect(*manager_params['aggregator_address'])

            # NOTE(review): table_name is interpolated into the SQL string;
            # acceptable here since it is a test-supplied constant, but this
            # pattern must not be used with untrusted input.
            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT);" % table_name)
            sock.send((query, ()))

            for link in link_urls:
                query = ("INSERT INTO %s (top_url, link) "
                         "VALUES (?, ?)" % table_name)
                sock.send((query, (current_url, link)))
            sock.close()

        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        cs = CommandSequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(collect_links, ('page_links', 'http'))
        manager.execute_command_sequence(cs)
        manager.close()
        # All collected (top_url, link) pairs must match the expected set
        query_result = db_utils.query_db(
            manager_params['db'],
            "SELECT top_url, link FROM page_links;",
            as_tuple=True)
        assert PAGE_LINKS == set(query_result)
示例#16
0
    def test_dump_page_source_valid(self, tmpdir):
        """Check that 'dump_page_source' works and source is saved properly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)
        sequence = CommandSequence.CommandSequence(url_a)
        sequence.get(sleep=1)
        sequence.dump_page_source(suffix='test')
        manager.execute_command_sequence(sequence)
        manager.close()

        # Source filename is of the follow structure:
        # `sources/<visit_id>-<md5_of_url>(-suffix).html`
        # thus for this test we expect `sources/1-<md5_of_test_url>-test.html`.
        pattern = os.path.join(str(tmpdir), 'sources', '1-*-test.html')
        source_file = glob.glob(pattern)[0]
        with open(source_file, 'rb') as f:
            actual_source = f.read()
        with open('./test_pages/expected_source.html', 'rb') as f:
            expected_source = f.read()

        # The dumped source must match the served page byte-for-byte
        assert actual_source == expected_source
示例#17
0
    def test_profile_saved_when_launch_crashes(self):
        """The browser profile must still be archived when the browser
        fails to relaunch (induced here by killing the LevelDBAggregator,
        which makes subsequent proxy launches crash)."""
        manager_params, browser_params = self.get_config()
        browser_params[0]['proxy'] = True
        browser_params[0]['save_javascript'] = True
        manager = TaskManager.TaskManager(manager_params, browser_params)
        manager.get('http://example.com')

        # Kill the LevelDBAggregator
        # This will cause the proxy launch to crash
        manager.ldb_status_queue.put("DIE")
        manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
        manager.browsers[
            0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Have timeout occur quickly
        manager.get('example.com'
                    )  # Cause a selenium crash to force browser to restart

        # The browser will fail to launch due to the proxy crashes
        try:
            manager.get('http://example.com')
        except CommandExecutionError:
            pass  # expected: relaunch failed, crawl aborts
        manager.close()
        assert isfile(
            join(browser_params[0]['profile_archive_dir'], 'profile.tar.gz'))
示例#18
0
    def test_browse_wrapper_http_table_valid(self):
        """Check that TaskManager.browse() wrapper works and populates
        http tables correctly.

        NOTE: Since the browse command is choosing links randomly, there is a
              (very small -- 2*0.5^20) chance this test will fail with valid
              code.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        manager.browse(url_a, num_links=20, sleep=1)
        manager.browse(url_b, num_links=1, sleep=1)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits"
        )

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        # Page simple_a.html has five links:
        # 1) An absolute link to simple_c.html
        # 2) A relative link to simple_d.html
        # 3) A javascript: link
        # 4) A link to www.google.com
        # 5) A link to example.com?localtest.me
        # We should see page visits for 1 and 2, but not 3-5.
        # Responses for the browsed-to pages keep the parent page's visit_id.
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_c,))
        assert qry_res[0][0] == visit_ids[url_a]
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_d,))
        assert qry_res[0][0] == visit_ids[url_a]

        # We expect 4 urls: a,c,d and a favicon request
        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT COUNT(DISTINCT url) FROM http_responses"
            " WHERE visit_id = ?", (visit_ids[url_a],))
        assert qry_res[0][0] == 4
示例#19
0
 def test_profile_error(self):
     """Pointing profile_tar at a nonexistent archive must raise
     ProfileLoadError at TaskManager construction time."""
     manager_params, browser_params = self.get_config()
     browser_params[0]['profile_tar'] = '/tmp/NOTREAL'
     with pytest.raises(ProfileLoadError):
         TaskManager.TaskManager(manager_params, browser_params)  # noqa
示例#20
0
    def test_browser_profile_coverage(self, tmpdir):
        """ Test the coverage of the browser's profile

        This verifies that Firefox's places.sqlite database contains
        all visited sites (with a few exceptions). If it does not,
        it is likely the profile is lost at some point during the crawl
        """
        # Run the test crawl
        data_dir = os.path.join(str(tmpdir), 'data_dir')
        manager_params, browser_params = self.get_config(data_dir)
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in TEST_SITES:
            manager.get(site)
        ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                                 'profile.tar.gz')
        manager.close()

        # Extract crawl profile
        with tarfile.open(ff_db_tar) as tar:
            tar.extractall(browser_params[0]['profile_archive_dir'])

        # Output databases
        ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                             'places.sqlite')
        crawl_db = manager_params['db']

        # Grab urls from crawl database
        rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
        req_ps = set()  # visited domains from http_requests table
        for url, in rows:
            req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

        hist_ps = set()  # visited domains from CrawlHistory Table
        successes = dict()
        rows = db_utils.query_db(
            crawl_db, "SELECT arguments, bool_success "
            "FROM CrawlHistory WHERE command='GET'")
        for url, success in rows:
            ps = psl.get_public_suffix(urlparse(url).hostname)
            hist_ps.add(ps)
            successes[ps] = success

        # Grab urls from Firefox database
        profile_ps = set()  # visited domains from firefox profile
        rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
        for host, in rows:
            try:
                profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
            except AttributeError:
                pass

        # We expect urls to be in the Firefox profile if:
        # 1. We've made requests to it
        # 2. The url is a top_url we entered into the address bar
        # 3. The url successfully loaded (see: Issue #40)
        # 4. The site does not respond to the initial request with a 204
        #    (won't show in FF DB)
        missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
        unexpected_missing_urls = set()
        for url in missing_urls:
            if successes[url] == 0 or successes[url] == -1:
                continue

            # Get the visit id for the url
            rows = db_utils.query_db(
                crawl_db, "SELECT visit_id FROM site_visits "
                "WHERE site_url = ?", ('http://' + url, ))
            # BUG FIX: rows[0] is a row tuple; take the scalar visit_id out
            # of it rather than binding the whole tuple as a SQL parameter.
            visit_id = rows[0][0]

            rows = db_utils.query_db(
                crawl_db, "SELECT COUNT(*) FROM http_responses "
                "WHERE visit_id = ?", (visit_id, ))
            # BUG FIX: compare the count itself, not the row tuple (tuple
            # vs. int comparison raises TypeError on Python 3).
            if rows[0][0] > 1:
                continue

            rows = db_utils.query_db(
                crawl_db, "SELECT response_status, location FROM "
                "http_responses WHERE visit_id = ?", (visit_id, ))
            response_status, location = rows[0]
            if response_status == 204:
                continue
            if location == 'http://':  # site returned a blank redirect
                continue
            unexpected_missing_urls.add(url)

        assert len(unexpected_missing_urls) == 0