def __check(self, dir_name, should_be_equal):
    path = Resources.path(
        __file__, os.path.join("data/dir_tree_comparer", dir_name))
    ret = are_dir_trees_equal(
        os.path.join(path, "01"), os.path.join(path, "02"),
        ignore=[".gitignore"])
    if should_be_equal:
        self.assertTrue(ret)
    else:
        self.assertFalse(ret)
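# Hypothetical usage of the helper above (not part of the original suite):
# each test points __check at a single fixture directory under
# data/dir_tree_comparer that holds the two trees, "01" and "02", to be
# compared. The fixture names "same_trees" and "different_file_content"
# are illustrative assumptions, not names taken from the repository.
def test_same_trees(self):
    self.__check("same_trees", should_be_equal=True)

def test_different_file_content(self):
    self.__check("different_file_content", should_be_equal=False)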
def test_website_download(self):
    with TempDir() as temp_dir:
        levels = LevelsCreator(temp_dir.get_path()).create()
        address = "file:" + Resources.path(
            __file__,
            "data/original_site-without_broken_links/issues_1.html",
            convert_to_url=True)
        navigator = HTMLMultipageNavigator(address, levels)
        crawler = SimpleDFSCrawler(navigator)
        crawler.run()
        expected_dir = Resources.path(
            __file__, "data/expected_download-without_broken_links")
        actual_dir = temp_dir.get_path()
        self.assert_(are_dir_trees_equal(
            expected_dir, actual_dir, ignore=[".gitignore"]))
def __check_download(self, threads_no, address,
        max_page_opens_per_second=None):
    """@return: run time in seconds"""
    # temp_dir = TempDir(os.path.expanduser("~/tmp"), prefix="dfs_crawler-")
    # try:
    with TempDir() as temp_dir:
        token_filler = None
        if max_page_opens_per_second is not None:
            # Throttle page opens with a token bucket that is topped up
            # at the requested rate.
            token_bucket = StandardTokenBucket(max_page_opens_per_second)
            token_filler = TokenBucketFiller(
                token_bucket, 1, max_page_opens_per_second)
            token_filler.start()
            browser_creator = ThrottledWebBrowserCreator(
                MechanizeBrowserCreator(), token_bucket)
        else:
            browser_creator = MechanizeBrowserCreator()
        # One navigator per crawler thread, all writing to the same
        # temporary directory.
        navigators = []
        for _ in xrange(threads_no):
            navigators.append(
                HTMLMultipageNavigator(
                    address, LevelsCreator(temp_dir.get_path()).create(),
                    browser_creator))
        sentinel = _StandardNodeExtended()
        crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
        start = time.time()
        crawler.run()
        end = time.time()
        expected_dir = Resources.path(__file__, "data/expected_download")
        actual_dir = temp_dir.get_path()
        self.assert_(are_dir_trees_equal(
            expected_dir, actual_dir, ignore=[".gitignore"]))
        self.__check_tree_final_state(sentinel.get_child("root"))
        # Each node should be processed exactly once, except for the
        # listed path, which is expected to be processed zero times.
        self.__check_if_each_node_is_processed_once(
            sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
        if max_page_opens_per_second is not None:
            token_filler.stop()
        return end - start
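# Hypothetical usage of __check_download (illustrative values only): run
# the same crawl with and without a page-open cap and sanity-check that
# throttling does not speed the crawl up. The thread count of 4, the limit
# of 2 page opens per second, and the existence of such a test are
# assumptions for this sketch; the small tolerance absorbs timing noise.
def test_throttled_download_is_not_faster(self):
    address = "file:" + Resources.path(
        __file__, "data/original_site/issues_1.html",
        convert_to_url=True)
    unthrottled_time = self.__check_download(4, address)
    throttled_time = self.__check_download(
        4, address, max_page_opens_per_second=2)
    self.assert_(throttled_time + 0.5 >= unthrottled_time)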
def test_single_threaded_download_without_manager(self):
    # temp_dir = TempDir(os.path.expanduser("~/tmp"), prefix="dfs_crawler-")
    # try:
    with TempDir() as temp_dir:
        levels = LevelsCreator(temp_dir.get_path()).create()
        address = "file:" + Resources.path(
            __file__, "data/original_site/issues_1.html",
            convert_to_url=True)
        tree = TreeAccessor(_StandardNodeExtended())
        navigator = HTMLMultipageNavigator(address, levels)
        navigator_wrapper = _NavigatorTreeWrapperExtended(navigator, tree)
        crawler = CrawlerThread(navigator_wrapper, tree)
        crawler.run()
        expected_dir = Resources.path(__file__, "data/expected_download")
        actual_dir = temp_dir.get_path()
        self.assert_(are_dir_trees_equal(
            expected_dir, actual_dir, ignore=[".gitignore"]))
        self.__check_tree_final_state(tree.get_root())
        self.__check_if_each_node_is_processed_once(
            tree.get_root(), {"/root/2011-07-16/06": 0})