def test_website_download(self):
    with TempDir() as temp_dir:
        levels = LevelsCreator(temp_dir.get_path()).create()
        address = "file:" + Resources.path(
            __file__,
            "data/original_site-without_broken_links/issues_1.html",
            convert_to_url=True)
        navigator = HTMLMultipageNavigator(address, levels)
        crawler = SimpleDFSCrawler(navigator)
        crawler.run()
        expected_dir = Resources.path(
            __file__, "data/expected_download-without_broken_links")
        actual_dir = temp_dir.get_path()
        self.assert_(are_dir_trees_equal(
            expected_dir, actual_dir, ignore=[".gitignore"]))
def test_simple_browsing(self):
    navigator = HTMLMultipageNavigator(
        "file:" + Resources.path(
            __file__, "../../test/data/original_site/issues_1.html",
            convert_to_url=True),
        LevelsCreator(None).create())
    navigator.start_in_root()
    root_name = navigator.get_path()[0]
    children1 = navigator.get_children()
    self.assertEqual(
        ["2011-07-12", "2011-07-13", "2011-07-14", "2011-07-16",
         "2011-07-16-repetition_1", "2011-07-17"], children1)
    navigator.move_to_child(children1[0])
    self.assertEqual([root_name, "2011-07-12"], navigator.get_path())
    children2 = navigator.get_children()
    self.assertEqual(
        ["01", "02", "03", "04", "05", "06", "07", "08"], children2)
    navigator.move_to_child("05")
    self.assertEqual([root_name, "2011-07-12", "05"], navigator.get_path())
    navigator.move_to_parent()
    self.assertEqual([root_name, "2011-07-12"], navigator.get_path())
    navigator.move_to_parent()
    self.assertEqual([root_name], navigator.get_path())
def __check(self, dir_name, should_be_equal):
    """Compare the "01" and "02" directory trees under
    data/dir_tree_comparer/<dir_name> and check that the comparison
    result matches should_be_equal."""
    path = Resources.path(
        __file__, os.path.join("data/dir_tree_comparer", dir_name))
    ret = are_dir_trees_equal(
        os.path.join(path, "01"), os.path.join(path, "02"),
        ignore=[".gitignore"])
    if should_be_equal:
        self.assertTrue(ret)
    else:
        self.assertFalse(ret)
def test_single_threaded_download_without_manager(self):
    # temp_dir = TempDir(os.path.expanduser("~/tmp"), prefix="dfs_crawler-")
    # try:
    with TempDir() as temp_dir:
        levels = LevelsCreator(temp_dir.get_path()).create()
        address = "file:" + Resources.path(
            __file__, "data/original_site/issues_1.html",
            convert_to_url=True)
        tree = TreeAccessor(_StandardNodeExtended())
        navigator = HTMLMultipageNavigator(address, levels)
        navigator_wrapper = _NavigatorTreeWrapperExtended(navigator, tree)
        crawler = CrawlerThread(navigator_wrapper, tree)
        crawler.run()
        expected_dir = Resources.path(__file__, "data/expected_download")
        actual_dir = temp_dir.get_path()
        self.assert_(are_dir_trees_equal(
            expected_dir, actual_dir, ignore=[".gitignore"]))
        self.__check_tree_final_state(tree.get_root())
        self.__check_if_each_node_is_processed_once(
            tree.get_root(), {"/root/2011-07-16/06": 0})
def test_throttled_download(self):
    """Downloading web_pages_no pages with the rate capped at
    max_page_opens_per_second should take at least
    web_pages_no / max_page_opens_per_second seconds."""
    # Logger.start(logging_level=logging.DEBUG)
    address = "file:" + Resources.path(
        __file__, "data/original_site/issues_1.html", convert_to_url=True)
    web_pages_no = 34
    max_page_opens_per_second = 15
    min_seconds_taken = float(web_pages_no) / max_page_opens_per_second
    for threads_no in [1, 3]:
        seconds_taken = self.__check_download(
            threads_no, address, max_page_opens_per_second)
        # print >>sys.stderr, "seconds_taken={}".format(seconds_taken)
        self.assertGreaterEqual(seconds_taken, min_seconds_taken)
def test_throttled_download_with_HTTP_server(self):
    # Logger.start(logging_level=logging.DEBUG)
    with DelayedHTTPFilesServer(
            Resources.path(__file__, "data/original_site"), 0) as server:
        (address, ip_number) = server.start()
        root_address = "http://{}:{}/issues_1.html".format(
            address, ip_number)
        web_pages_no = 34
        max_page_opens_per_second = 15
        min_seconds_taken = float(web_pages_no) / max_page_opens_per_second
        for threads_no in [1, 3]:
            seconds_taken = self.__check_download(
                threads_no, root_address, max_page_opens_per_second)
            # print >>sys.stderr, "seconds_taken={}".format(seconds_taken)
            self.assertGreaterEqual(seconds_taken, min_seconds_taken)
def __check_download(self, threads_no, address,
        max_page_opens_per_second=None):
    """@return: run time in seconds"""
    # temp_dir = TempDir(os.path.expanduser("~/tmp"), prefix="dfs_crawler-")
    # try:
    with TempDir() as temp_dir:
        token_filler = None
        browser_creator = None
        if max_page_opens_per_second is not None:
            token_bucket = StandardTokenBucket(max_page_opens_per_second)
            token_filler = TokenBucketFiller(
                token_bucket, 1, max_page_opens_per_second)
            token_filler.start()
            browser_creator = ThrottledWebBrowserCreator(
                MechanizeBrowserCreator(), token_bucket)
        else:
            browser_creator = MechanizeBrowserCreator()
        navigators = []
        for _ in xrange(threads_no):
            navigators.append(HTMLMultipageNavigator(
                address, LevelsCreator(temp_dir.get_path()).create(),
                browser_creator))
        sentinel = _StandardNodeExtended()
        crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
        start = time.time()
        crawler.run()
        end = time.time()
        expected_dir = Resources.path(__file__, "data/expected_download")
        actual_dir = temp_dir.get_path()
        self.assert_(are_dir_trees_equal(
            expected_dir, actual_dir, ignore=[".gitignore"]))
        self.__check_tree_final_state(sentinel.get_child("root"))
        self.__check_if_each_node_is_processed_once(
            sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
        if max_page_opens_per_second is not None:
            token_filler.stop()
        return end - start
def test_multithreaded_download_speedup_with_slow_HTTP_server(self):
    # Logger.start(logging_level=logging.DEBUG)
    with DelayedHTTPFilesServer(
            Resources.path(__file__, "data/original_site"), 0.1) as server:
        (address, ip_number) = server.start()
        root_address = "http://{}:{}/issues_1.html".format(
            address, ip_number)
        time_taken = []
        threads_no_list = [1, 4]
        for threads_no in threads_no_list:
            run_time = self.__check_download(threads_no, root_address)
            time_taken.append(run_time)
        assert_str = "{} threads time taken: {}s while "\
            "{} threads time taken: {}s".format(
                threads_no_list[0], time_taken[0],
                threads_no_list[1], time_taken[1])
        min_speedup = 1
        ## We're expecting at least some speedup. The speedup is not fully
        ## deterministic and depends e.g. on processor load.
        self.assert_(time_taken[0] > min_speedup * time_taken[1], assert_str)
def test_multithreaded_download(self):
    address = "file:" + Resources.path(
        __file__, "data/original_site/issues_1.html", convert_to_url=True)
    for threads_no in [1, 2, 3, 4, 50]:
        self.__check_download(threads_no, address)