	def __check(self, dir_name, should_be_equal):
		path = Resources.path(__file__, 
			os.path.join("data/dir_tree_comparer", dir_name))
		ret = are_dir_trees_equal(
			os.path.join(path, "01"), 
			os.path.join(path, "02"), ignore=[".gitignore"])
		if should_be_equal:
			self.assertTrue(ret)
		else:
			self.assertFalse(ret)

	def test_website_download(self):
		with TempDir() as temp_dir:
			levels = LevelsCreator(temp_dir.get_path()).create()
			address = "file:" + Resources.path(__file__,
				"data/original_site-without_broken_links/issues_1.html",
				convert_to_url=True)
			navigator = HTMLMultipageNavigator(address, levels)
			crawler = SimpleDFSCrawler(navigator)
			crawler.run()
			expected_dir = Resources.path(__file__, 
				"data/expected_download-without_broken_links")
			actual_dir = temp_dir.get_path()
			self.assertTrue(are_dir_trees_equal(expected_dir, actual_dir,
				ignore=[".gitignore"]))
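
These tests all lean on are_dir_trees_equal, which is defined elsewhere in the project. A minimal sketch of such a helper, assuming it walks both trees with filecmp and skips the names listed in ignore (the project's real implementation may differ):

import filecmp
import os

def are_dir_trees_equal(dir1, dir2, ignore=None):
    # Hypothetical stand-in for the project's helper: True when both
    # trees hold the same entries and matching files, skipping the
    # names listed in `ignore` (e.g. ".gitignore" placeholders).
    cmp_result = filecmp.dircmp(dir1, dir2, ignore=ignore or [])
    if (cmp_result.left_only or cmp_result.right_only
            or cmp_result.diff_files or cmp_result.funny_files):
        return False
    # Note: dircmp compares files shallowly (via os.stat signatures); a
    # stricter helper would re-check candidates with filecmp shallow=False.
    for subdir in cmp_result.common_dirs:
        # Recurse into subdirectories present on both sides.
        if not are_dir_trees_equal(os.path.join(dir1, subdir),
                os.path.join(dir2, subdir), ignore=ignore):
            return False
    return True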
Example #4
    def __check_download(self,
                         threads_no,
                         address,
                         max_page_opens_per_second=None):
        """@return: run time in seconds"""
        with TempDir() as temp_dir:
            token_filler = None
            browser_creator = None
            if max_page_opens_per_second is not None:
                token_bucket = StandardTokenBucket(max_page_opens_per_second)
                token_filler = TokenBucketFiller(token_bucket, 1,
                                                 max_page_opens_per_second)
                token_filler.start()
                browser_creator = ThrottledWebBrowserCreator(
                    MechanizeBrowserCreator(), token_bucket)
            else:
                browser_creator = MechanizeBrowserCreator()

            navigators = []
            for _ in xrange(threads_no):
                navigators.append(
                    HTMLMultipageNavigator(
                        address,
                        LevelsCreator(temp_dir.get_path()).create(),
                        browser_creator))
            sentinel = _StandardNodeExtended()
            crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
            start = time.time()
            crawler.run()
            end = time.time()
            expected_dir = Resources.path(__file__, "data/expected_download")
            actual_dir = temp_dir.get_path()
            self.assertTrue(
                are_dir_trees_equal(expected_dir,
                                    actual_dir,
                                    ignore=[".gitignore"]))
            self.__check_tree_final_state(sentinel.get_child("root"))
            self.__check_if_each_node_is_processed_once(
                sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
            if max_page_opens_per_second is not None:
                token_filler.stop()
            return end - start
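
StandardTokenBucket, TokenBucketFiller and ThrottledWebBrowserCreator come from the crawler package and are not shown on this page. The pattern is page-open throttling: every browser takes a token before opening a page, and a filler thread replenishes tokens on a fixed schedule. A minimal sketch under assumed semantics, reading TokenBucketFiller(token_bucket, 1, max_page_opens_per_second) as "add max_page_opens_per_second tokens every 1 second":

import threading

class StandardTokenBucket(object):
    # Illustrative bucket: get() blocks until a token is available;
    # fill() tops the bucket up, never beyond its capacity.
    def __init__(self, capacity):
        self.__capacity = capacity
        self.__tokens = capacity
        self.__cond = threading.Condition()

    def get(self):
        with self.__cond:
            while self.__tokens == 0:
                self.__cond.wait()
            self.__tokens -= 1

    def fill(self, count):
        with self.__cond:
            self.__tokens = min(self.__capacity, self.__tokens + count)
            self.__cond.notify_all()

class TokenBucketFiller(threading.Thread):
    # Adds `tokens_per_period` tokens every `period` seconds until stopped.
    def __init__(self, bucket, period, tokens_per_period):
        threading.Thread.__init__(self)
        self.daemon = True
        self.__bucket = bucket
        self.__period = period
        self.__count = tokens_per_period
        self.__stopped = threading.Event()

    def run(self):
        while not self.__stopped.wait(self.__period):
            self.__bucket.fill(self.__count)

    def stop(self):
        self.__stopped.set()

A throttled browser creator would then call get() on the shared bucket before each page open, capping the crawl at roughly max_page_opens_per_second across all worker threads.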
Example #5
	def test_single_threaded_download_without_manager(self):
		with TempDir() as temp_dir:
			levels = LevelsCreator(temp_dir.get_path()).create()
			address = "file:" + Resources.path(
				__file__, "data/original_site/issues_1.html",
				convert_to_url=True)
			tree = TreeAccessor(_StandardNodeExtended())
			navigator = HTMLMultipageNavigator(address, levels)
			navigator_wrapper = _NavigatorTreeWrapperExtended(navigator, tree)
			crawler = CrawlerThread(navigator_wrapper, tree)
			crawler.run()
			expected_dir = Resources.path(__file__, "data/expected_download")
			actual_dir = temp_dir.get_path()
			self.assertTrue(are_dir_trees_equal(expected_dir, actual_dir,
					ignore=[".gitignore"]))
			self.__check_tree_final_state(tree.get_root())
			self.__check_if_each_node_is_processed_once(
				tree.get_root(), {"/root/2011-07-16/06": 0})
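
Each test runs inside "with TempDir() as temp_dir:", so downloads land in a scratch directory that is removed when the block exits. TempDir itself is defined elsewhere; a minimal stand-in matching the usage above (a context manager exposing get_path(), built on tempfile and shutil):

import shutil
import tempfile

class TempDir(object):
    # Hypothetical stand-in: creates a scratch directory on construction
    # and deletes it, with everything downloaded into it, on exit.
    def __init__(self, dir=None, prefix="tmp"):
        # The dir/prefix keywords are an assumption; the tests above
        # only exercise the no-argument form plus get_path().
        self.__path = tempfile.mkdtemp(dir=dir, prefix=prefix)

    def get_path(self):
        return self.__path

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        shutil.rmtree(self.__path)
        return False  # let any test failure propagate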