Example #1
    def _create_browser_creator(self):
        """
        It is possible to override this method to use a different
        C{AbstractWebBrowserCreator}.

        @rtype: C{AbstractWebBrowserCreator}
        """
        return MechanizeBrowserCreator()
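
The hook above is the extension point for swapping browser implementations. Below is a minimal sketch of such an override, assuming a crawler class that defines _create_browser_creator (called DFSCrawler here only for illustration) and a hypothetical CustomBrowserCreator subclass of AbstractWebBrowserCreator:

class CustomBrowserCreator(AbstractWebBrowserCreator):
    """Hypothetical creator returning a custom browser implementation."""

    def create(self):
        # Return whatever browser object the rest of the crawler expects;
        # CustomBrowser is a placeholder, not a class from the library.
        return CustomBrowser()


class CustomCrawler(DFSCrawler):  # DFSCrawler stands in for the owning class
    def _create_browser_creator(self):
        """Use the custom creator instead of C{MechanizeBrowserCreator}."""
        return CustomBrowserCreator()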
Example #2
    def __check_download(self,
                         threads_no,
                         address,
                         max_page_opens_per_second=None):
        """@return: run time in seconds"""
        with TempDir() as temp_dir:
            token_filler = None
            browser_creator = None
            if max_page_opens_per_second is not None:
                token_bucket = StandardTokenBucket(max_page_opens_per_second)
                token_filler = TokenBucketFiller(token_bucket, 1,
                                                 max_page_opens_per_second)
                token_filler.start()
                browser_creator = ThrottledWebBrowserCreator(
                    MechanizeBrowserCreator(), token_bucket)
            else:
                browser_creator = MechanizeBrowserCreator()

            navigators = []
            for _ in xrange(threads_no):
                navigators.append(
                    HTMLMultipageNavigator(
                        address,
                        LevelsCreator(temp_dir.get_path()).create(),
                        browser_creator))
            sentinel = _StandardNodeExtended()
            crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
            start = time.time()
            crawler.run()
            end = time.time()
            expected_dir = Resources.path(__file__, "data/expected_download")
            actual_dir = temp_dir.get_path()
            self.assertTrue(
                are_dir_trees_equal(expected_dir,
                                    actual_dir,
                                    ignore=[".gitignore"]))
            self.__check_tree_final_state(sentinel.get_child("root"))
            self.__check_if_each_node_is_processed_once(
                sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
            if max_page_opens_per_second is not None:
                token_filler.stop()
            return end - start
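
The test above caps the crawl rate by wiring a StandardTokenBucket, a TokenBucketFiller and a ThrottledWebBrowserCreator together. A minimal sketch of just that wiring outside the test harness (the classes are the ones used above; how exactly tokens are consumed is an assumption inferred from the test):

max_page_opens_per_second = 2

token_bucket = StandardTokenBucket(max_page_opens_per_second)
token_filler = TokenBucketFiller(token_bucket, 1, max_page_opens_per_second)
token_filler.start()  # start()/stop() suggest a background refilling thread
try:
    # Browsers created here presumably wait for a token before each page
    # open (assumption based on the test above), which caps the crawl rate.
    browser_creator = ThrottledWebBrowserCreator(
        MechanizeBrowserCreator(), token_bucket)
    browser = browser_creator.create()
    browser.open("http://example.com")  # placeholder address
finally:
    token_filler.stop()  # always stop the filler when done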
Example #3
    def __init__(self, address, levels, browser_creator=None):
        """
		@param browser_creator: a creator of browsers that will be used
			while crawling the web site. The default browser used here 
			is L{MechanizeBrowser}.
		@type browser_creator: L{AbstractWebBrowserCreator}
		@param levels: list of L{Level} objects. The first element is a level 
			corresponding to the root node, the last one corresponds to
			leafs level.
		@param address: URL address string
		"""
        self.__address = address
        self.__browser_creator = browser_creator
        if browser_creator is None:
            self.__browser_creator = MechanizeBrowserCreator()
        self.__br = None
        self.__levels = levels
        self.__path = None
        self.__children_history = None
        self.__current_children = None
        """
Example #4
class HTMLMultipageNavigator(AbstractTreeNavigator):
    """
	A web site tree navigator. 
	
	It is assumed that all web pages corresponding to the nodes of the tree on 
	the given level have the same basic	characteristics and are analyzed 
	in the same way, namely by the same object inheriting from  
	L{AbstractPageAnalyzer}. In particular, all of the leaf web pages are 
	placed on the same level of the tree. Some of the parts of the tree might
	be missing, which results in marking certain nodes of the tree as ERROR.
	"""

    __repetition_suffix_template = "-repetition_{}"
    __generate_new_name_max_repetitions = int(10e4)

    def __init__(self, address, levels, browser_creator=None):
        """
		@param browser_creator: a creator of browsers that will be used
			while crawling the web site. The default browser used here 
			is L{MechanizeBrowser}.
		@type browser_creator: L{AbstractWebBrowserCreator}
		@param levels: list of L{Level} objects. The first element is a level 
			corresponding to the root node, the last one corresponds to
			leafs level.
		@param address: URL address string
		"""
        self.__address = address
        self.__browser_creator = browser_creator
        if browser_creator is None:
            self.__browser_creator = MechanizeBrowserCreator()
        self.__br = None
        self.__levels = levels
        self.__path = None
        self.__children_history = None
        self.__current_children = None
        """
		Info about children on current level of tree structure.
		L{OrderedDictionary} with the key as child name and the value 
		as a link to child web page.
		"""

    def start_in_root(self):
        self.__br = self.__browser_creator.create()
        self.__br.open(self.__address)
        self.__path = [self.__levels[0].name]
        self.__children_history = _ChildrenHistory()
        self.__current_children = self.__get_current_children()

    def get_path(self):
        """
		@return: path to the tree node the navigator is currently in i.e.
			subsequent node names from the tree root to the current node
		"""
        return self.__path

    def get_children(self):
        return self.__current_children.keys()

    def __get_current_children(self):
        children = OrderedDict()
        page_index = 0
        child_links_retrieved_so_far = 0
        current_level = self.__levels[self.__get_current_level()]
        while True:
            page_links = current_level.page_analyzer.get_links(
                self.__br.response(), child_links_retrieved_so_far)
            for (name, link) in page_links.children:
                if name in children:
                    new_name = self.__generate_new_name(name, children)
                    if new_name is None:
                        logging.error(
                            "Unable to generate a new name "
                            "for a repeating child name \"{}\" "
                            "(link=\"{}\") in node \"{}\": "
                            "all of the proposed new name variants are "
                            "already in use".format(name, link,
                                                    "/".join(self.__path)))
                        continue
                    children[new_name] = link
                else:
                    children[name] = link
            next_page_link = page_links.next_page_link
            if next_page_link is None:
                break
            self.__br.open(next_page_link)
            page_index += 1
            child_links_retrieved_so_far += len(page_links.children)
        if page_index > 0:
            self.__br.back(page_index)  # go back to the first page
        return children

    @staticmethod
    def __generate_new_name(original_name, children_dict):
        max_rep = HTMLMultipageNavigator.__generate_new_name_max_repetitions
        for i in xrange(max_rep):
            name = original_name + \
                HTMLMultipageNavigator.__repetition_suffix_template.format(i + 1)
            if name not in children_dict:
                return name
        return None

    def __get_current_level(self):
        return len(self.get_path()) - 1

    def __is_on_leafs_level(self):
        return len(self.__path) == len(self.__levels)

    def move_to_child(self, child_name):
        assert not self.__is_on_leafs_level()
        try:
            self.__br.open(self.__current_children[child_name])
            self.__path.append(child_name)
            self.__children_history.push(self.__current_children)
            self.__current_children = self.__get_current_children()
        except Exception as ex:
            raise NavigationException(ex)

    def move_to_parent(self):
        assert self.__get_current_level() > 0
        try:
            self.__br.back()
            self.__path = self.__path[:-1]
            self.__current_children = self.__children_history.pop()
        except Exception as ex:
            raise NavigationException(ex)

    def process_node_and_check_if_is_leaf(self):
        try:
            response = self.__br.response()
            analyzer = self.__levels[self.__get_current_level()].page_analyzer
            analyzer.process(self.__path, response)
            if len(self.__current_children) > 0:
                return False
            return True
        except Exception as ex:
            raise NavigationException(ex)
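
The navigator above is normally driven by the crawler (see Example #2), which uses several worker threads. The single-threaded sketch below only illustrates the call order of the public API, assuming a navigator constructed as in the earlier example:

def crawl(navigator):
    navigator.start_in_root()
    _visit(navigator)

def _visit(navigator):
    # process_node_and_check_if_is_leaf() returns True for leaf nodes.
    if navigator.process_node_and_check_if_is_leaf():
        return
    for child_name in list(navigator.get_children()):
        navigator.move_to_child(child_name)  # may raise NavigationException
        _visit(navigator)
        navigator.move_to_parent()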