Example #1
def __get_browser_creator_and_start_token_filler(self, max_pages_per_second):
    self.__token_filler = None
    browser_creator = None
    if max_pages_per_second is not None:
        # Throttle page opens: a background filler thread keeps topping up the
        # bucket so that at most max_pages_per_second tokens per second are
        # available to the throttled browser creator.
        token_bucket = StandardTokenBucket(max_pages_per_second)
        browser_creator = ThrottledWebBrowserCreator(
            self._create_browser_creator(), token_bucket)
        self.__token_filler = TokenBucketFiller(token_bucket, 1,
                                                max_pages_per_second)
        self.__token_filler.daemon = True
        self.__token_filler.start()
    else:
        # No rate limit requested: use an unthrottled browser creator.
        browser_creator = self._create_browser_creator()
    return browser_creator
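
Both examples rely on the project's StandardTokenBucket, TokenBucketFiller and ThrottledWebBrowserCreator classes. As a rough illustration of the underlying pattern only (not the project's actual implementation), a minimal token bucket and filler thread might look like the sketch below; the SimpleTokenBucket and SimpleTokenBucketFiller names are made up for this example.

import threading
import time


class SimpleTokenBucket(object):
    """Illustrative token bucket: get() blocks until a token is available."""

    def __init__(self, capacity):
        self._capacity = capacity
        self._tokens = 0
        self._condition = threading.Condition()

    def fill(self, count):
        """Add tokens, never exceeding the bucket's capacity."""
        with self._condition:
            self._tokens = min(self._capacity, self._tokens + count)
            self._condition.notify_all()

    def get(self):
        """Take one token, waiting until one becomes available."""
        with self._condition:
            while self._tokens == 0:
                self._condition.wait()
            self._tokens -= 1


class SimpleTokenBucketFiller(threading.Thread):
    """Adds tokens_per_fill tokens, fills_per_second times a second."""

    def __init__(self, bucket, tokens_per_fill, fills_per_second):
        threading.Thread.__init__(self)
        self._bucket = bucket
        self._tokens_per_fill = tokens_per_fill
        self._sleep_time = 1.0 / fills_per_second
        self._stop_event = threading.Event()

    def run(self):
        while not self._stop_event.is_set():
            self._bucket.fill(self._tokens_per_fill)
            time.sleep(self._sleep_time)

    def stop(self):
        self._stop_event.set()
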
Example #2
    def __check_download(self,
                         threads_no,
                         address,
                         max_page_opens_per_second=None):
        """@return: run time in seconds"""
        with TempDir() as temp_dir:
            token_filler = None
            browser_creator = None
            if max_page_opens_per_second is not None:
                # Throttle page opens: the filler thread keeps topping up the
                # bucket so that at most max_page_opens_per_second tokens per
                # second are available to the throttled browser creator.
                token_bucket = StandardTokenBucket(max_page_opens_per_second)
                token_filler = TokenBucketFiller(token_bucket, 1,
                                                 max_page_opens_per_second)
                token_filler.start()
                browser_creator = ThrottledWebBrowserCreator(
                    MechanizeBrowserCreator(), token_bucket)
            else:
                browser_creator = MechanizeBrowserCreator()

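            # One page navigator per crawler thread; all navigators share the
            # same (possibly throttled) browser creator.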
            navigators = []
            for _ in xrange(threads_no):
                navigators.append(
                    HTMLMultipageNavigator(
                        address,
                        LevelsCreator(temp_dir.get_path()).create(),
                        browser_creator))
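            # Crawl the whole site with the given number of threads and
            # measure the total run time.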
            sentinel = _StandardNodeExtended()
            crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
            start = time.time()
            crawler.run()
            end = time.time()
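            # The downloaded directory tree must match the reference data
            # (ignoring .gitignore files).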
            expected_dir = Resources.path(__file__, "data/expected_download")
            actual_dir = temp_dir.get_path()
            self.assert_(
                are_dir_trees_equal(expected_dir,
                                    actual_dir,
                                    ignore=[".gitignore"]))
            self.__check_tree_final_state(sentinel.get_child("root"))
            self.__check_if_each_node_is_processed_once(
                sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
            if max_page_opens_per_second is not None:
                token_filler.stop()
            return end - start
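
Since __check_download returns the crawl time in seconds, a sibling test method in the same class could compare a throttled run against an unthrottled one. The sketch below is a hypothetical usage example, not part of the original test suite; the test name, thread count and address are assumptions.

    def test_throttling_slows_download(self):
        # Hypothetical usage sketch: the address and the expected relation
        # between the two timings are assumptions.
        address = "http://localhost:8000/root"
        unthrottled_time = self.__check_download(2, address)
        throttled_time = self.__check_download(
            2, address, max_page_opens_per_second=2)
        self.assert_(throttled_time > unthrottled_time)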