class CmdLnNavigatorsCreator(AbstractCmdLnNavigatorsCreator):
    """Creates HTML multipage navigators configured from command-line
    arguments, optionally throttling page downloads with a token bucket.
    """

    def __init__(self, levels_creator):
        """@type levels_creator: L{AbstractCmdLnLevelsCreator}"""
        self.__token_filler = None
        self.__levels_creator = levels_creator

    def fill_parser(self, parser):
        """Register this creator's command-line options on C{parser}."""
        parser.add_argument("source_address",
            help="the address of the web site to crawl.")
        parser.add_argument("--max_pages_per_second", type=float,
            help="Maximal number of web pages downloads per second "
                 "(a real number). By default no limit is imposed.")
        self.__levels_creator.fill_parser(parser)

    def create(self, args, navigators_count):
        """Build C{navigators_count} navigators sharing one browser creator."""
        creator = self.__get_browser_creator_and_start_token_filler(
            args.max_pages_per_second)
        return [
            HTMLMultipageNavigator(
                args.source_address, self.__levels_creator.create(args),
                creator)
            for _ in range(navigators_count)]

    def __get_browser_creator_and_start_token_filler(self,
            max_pages_per_second):
        # No throttling requested: hand out the plain browser creator.
        self.__token_filler = None
        if max_pages_per_second is None:
            return self._create_browser_creator()
        bucket = StandardTokenBucket(max_pages_per_second)
        throttled = ThrottledWebBrowserCreator(
            self._create_browser_creator(), bucket)
        filler = TokenBucketFiller(bucket, 1, max_pages_per_second)
        # Daemon thread so it does not block interpreter exit.
        filler.daemon = True
        filler.start()
        self.__token_filler = filler
        return throttled

    def _create_browser_creator(self):
        """
        It is possible to override this function to use a different
        C{AbstractWebBrowserCreator}.

        @rtype: C{AbstractWebBrowserCreator}
        """
        return MechanizeBrowserCreator()

    def on_exit(self):
        """Stop the filler thread (if any) and notify the levels creator."""
        if self.__token_filler is not None:
            self.__token_filler.stop()
        self.__levels_creator.on_exit()
class CmdLnNavigatorsCreator(AbstractCmdLnNavigatorsCreator):
    """Creates command-line-configured HTML multipage navigators.

    When C{--max_pages_per_second} is given, page downloads are throttled
    through a token bucket refilled by a background C{TokenBucketFiller}
    thread.
    """

    def __init__(self, levels_creator):
        """@type levels_creator: L{AbstractCmdLnLevelsCreator}"""
        # Background thread refilling the token bucket; created lazily in
        # create() when throttling is requested.
        self.__token_filler = None
        self.__levels_creator = levels_creator

    def fill_parser(self, parser):
        """Register this creator's command-line options on C{parser}."""
        parser.add_argument("source_address",
            help="the address of the web site to crawl.")
        parser.add_argument("--max_pages_per_second", type=float,
            help="Maximal number of web pages downloads per second "
                 "(a real number). By default no limit is imposed.")
        self.__levels_creator.fill_parser(parser)

    def create(self, args, navigators_count):
        """Create C{navigators_count} navigators sharing one browser creator.

        @return: list of C{HTMLMultipageNavigator}
        """
        browser_creator = self.__get_browser_creator_and_start_token_filler(
            args.max_pages_per_second)
        navigators = []
        # xrange: consistent with the rest of this (Python 2) code base.
        for _ in xrange(navigators_count):
            navigators.append(
                HTMLMultipageNavigator(args.source_address,
                    self.__levels_creator.create(args), browser_creator))
        return navigators

    def __get_browser_creator_and_start_token_filler(self,
            max_pages_per_second):
        # Stop a filler left over from a previous create() call before
        # discarding the reference; the original leaked that thread.
        if self.__token_filler is not None:
            self.__token_filler.stop()
        self.__token_filler = None
        if max_pages_per_second is not None:
            token_bucket = StandardTokenBucket(max_pages_per_second)
            browser_creator = ThrottledWebBrowserCreator(
                self._create_browser_creator(), token_bucket)
            self.__token_filler = TokenBucketFiller(
                token_bucket, 1, max_pages_per_second)
            # Daemon thread: must not block interpreter exit.
            self.__token_filler.daemon = True
            self.__token_filler.start()
        else:
            browser_creator = self._create_browser_creator()
        return browser_creator

    def _create_browser_creator(self):
        """
        It is possible to override this function to use a different
        C{AbstractWebBrowserCreator}.

        @rtype: C{AbstractWebBrowserCreator}
        """
        return MechanizeBrowserCreator()

    def on_exit(self):
        """Release resources: stop the filler thread, notify levels creator."""
        if self.__token_filler is not None:
            self.__token_filler.stop()
        self.__levels_creator.on_exit()
def __get_browser_creator_and_start_token_filler(self, max_pages_per_second):
    """Build the browser creator; start a bucket-filler thread if throttled.

    @return: a plain creator when no rate limit is given, otherwise a
        C{ThrottledWebBrowserCreator} backed by a freshly started filler.
    """
    self.__token_filler = None
    # Guard clause: no limit means no throttling machinery at all.
    if max_pages_per_second is None:
        return self._create_browser_creator()
    bucket = StandardTokenBucket(max_pages_per_second)
    creator = ThrottledWebBrowserCreator(
        self._create_browser_creator(), bucket)
    filler = TokenBucketFiller(bucket, 1, max_pages_per_second)
    filler.daemon = True
    filler.start()
    self.__token_filler = filler
    return creator
def test_get(self):
    """Two consumer threads draw tokens for ~3 seconds while the filler
    emits 2 tokens per tick at 3 ticks/second; together the consumers are
    expected to obtain exactly 8 tokens.
    """
    bucket = StandardTokenBucket(1000)
    filler = TokenBucketFiller(bucket, 2, 3)
    threads_no = 2
    workers = [_Incrementor(bucket) for _ in xrange(threads_no)]
    for worker in workers:
        worker.start()
    filler.start()
    time.sleep(3)
    for worker in workers:
        worker.order_stop()
    for worker in workers:
        worker.join()
    filler.stop()
    # 'total' instead of the original local named 'sum', which shadowed
    # the builtin; use the builtin to accumulate the results.
    total = sum(worker.get_result() for worker in workers)
    self.assertEqual(8, total)
def __check_download(self, threads_no, address, max_page_opens_per_second=None):
    """Download the test site with C{threads_no} crawler threads into a
    temporary directory and verify the result against the expected tree.

    @param max_page_opens_per_second: optional throttling rate; when given,
        a token bucket plus filler thread limits page opens.
    @return: run time in seconds
    """
    with TempDir() as temp_dir:
        token_filler = None
        if max_page_opens_per_second is not None:
            token_bucket = StandardTokenBucket(max_page_opens_per_second)
            token_filler = TokenBucketFiller(
                token_bucket, 1, max_page_opens_per_second)
            token_filler.start()
            browser_creator = ThrottledWebBrowserCreator(
                MechanizeBrowserCreator(), token_bucket)
        else:
            browser_creator = MechanizeBrowserCreator()
        try:
            navigators = []
            for _ in xrange(threads_no):
                navigators.append(HTMLMultipageNavigator(address,
                    LevelsCreator(temp_dir.get_path()).create(),
                    browser_creator))
            sentinel = _StandardNodeExtended()
            crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
            start = time.time()
            crawler.run()
            end = time.time()
            expected_dir = Resources.path(__file__, "data/expected_download")
            actual_dir = temp_dir.get_path()
            self.assert_(are_dir_trees_equal(expected_dir, actual_dir,
                ignore=[".gitignore"]))
            self.__check_tree_final_state(sentinel.get_child("root"))
            self.__check_if_each_node_is_processed_once(
                sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
            return end - start
        finally:
            # Stop the filler thread even when the crawler or an assertion
            # raises; the original skipped stop() on every error path.
            if token_filler is not None:
                token_filler.stop()
def __get_browser_creator_and_start_token_filler(self, max_pages_per_second):
    """Return the browser creator to use; when a rate limit is given, wrap
    it in a throttled creator and launch the token-filler daemon thread.
    """
    self.__token_filler = None
    if max_pages_per_second is None:
        return self._create_browser_creator()
    token_bucket = StandardTokenBucket(max_pages_per_second)
    result = ThrottledWebBrowserCreator(
        self._create_browser_creator(), token_bucket)
    self.__token_filler = TokenBucketFiller(
        token_bucket, 1, max_pages_per_second)
    # Daemon so the filler never keeps the process alive on shutdown.
    self.__token_filler.daemon = True
    self.__token_filler.start()
    return result
def __check_download(self, threads_no, address, max_page_opens_per_second=None):
    """Crawl C{address} with C{threads_no} threads into a temporary
    directory and compare the downloaded tree with the expected one.

    @param max_page_opens_per_second: optional download rate limit
    @return: run time in seconds
    """
    with TempDir() as temp_dir:
        token_filler = None
        if max_page_opens_per_second is None:
            browser_creator = MechanizeBrowserCreator()
        else:
            token_bucket = StandardTokenBucket(max_page_opens_per_second)
            token_filler = TokenBucketFiller(
                token_bucket, 1, max_page_opens_per_second)
            token_filler.start()
            browser_creator = ThrottledWebBrowserCreator(
                MechanizeBrowserCreator(), token_bucket)
        try:
            navigators = [
                HTMLMultipageNavigator(
                    address, LevelsCreator(temp_dir.get_path()).create(),
                    browser_creator)
                for _ in xrange(threads_no)]
            sentinel = _StandardNodeExtended()
            crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
            start = time.time()
            crawler.run()
            end = time.time()
            expected_dir = Resources.path(__file__, "data/expected_download")
            self.assert_(
                are_dir_trees_equal(expected_dir, temp_dir.get_path(),
                    ignore=[".gitignore"]))
            self.__check_tree_final_state(sentinel.get_child("root"))
            self.__check_if_each_node_is_processed_once(
                sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
            return end - start
        finally:
            # Ensure the filler thread is stopped even on failure; the
            # original leaked it whenever the crawl or an assertion raised.
            if token_filler is not None:
                token_filler.stop()