Example #1
import abc
from functools import wraps

import requests as req       # used as req.get(...) below
from lxml import html        # used as html.fromstring(...) below

# SetQueue (a bounded set/queue hybrid) is not defined in this example;
# a minimal sketch follows the class.


class Crawler (abc.ABC):

    def __init__ (self, RESTART_LIMIT=100, VISIT_MEMORY=100):
        """
        RESTART_LIMIT: how many pages to visit before doing a random restart.
        VISIT_MEMORY: max pages to keep in a tabu list, i.e. a list of
            pages not to visit again. Prevents cycles.
        """
    
        self.current_url    = None
        self.current_html   = None
        self.pages_crawled  = 0
        
        self.next_urls  = None
        self.visited    = SetQueue(VISIT_MEMORY)
        
        self.RESTART_LIMIT = RESTART_LIMIT
    
    def next_page (self):
        """Advance the crawl: draw candidate URLs (from self.next_urls,
        or via a random restart) until one is found that is not in the
        tabu list, then make it the current page."""
        if self.current_url is not None and self.next_urls is None:
            self._fetch_next_urls_from_current_page()
    
        found_page = False
        
        while not found_page:
            if (self.current_url is None or
                    self._restart_required() or
                    self._no_new_pages_available_from_current()):
                new_page = self._random_restart()
            else:
                new_page = self.next_urls.pop()
            
            if new_page not in self.visited:
                found_page = True
                
                self.current_url = new_page
                self.current_html = None
                self.next_urls = None
                self.visited.add(new_page)
                   
        self.pages_crawled += 1
    
    @abc.abstractmethod
    def _fetch_next_urls_from_current_page (self):
        """Scrape current page for possible URLs to crawl next.
        
        Should form a list of possible URLs to be crawled next, using the
        HTML tree of the current page. Then assign self.next_urls to hold this
        list. Shuffle it as well, if randomness is desired.
        
        Returns:
            None
        """
        return
    
    @staticmethod
    def _fetch_page_content (f):
        """Decorator - ensures HTML tree retrieval before function body."""
        
        @wraps(f)
        def wrapped (inst, *args, **kwargs):
            if inst.current_html is None:
                page = req.get(inst.current_url)
                inst.current_html = html.fromstring(page.content)
                
            return f(inst, *args, **kwargs)
        
        return wrapped
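    # A subclass would typically stack this decorator on any method that
    # reads self.current_html, e.g. (hypothetical method body):
    #
    #     @Crawler._fetch_page_content
    #     def scrape_page (self):
    #         return self.current_html.xpath('//title/text()')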
    
    @abc.abstractmethod
    def _random_restart (self):
        """Return link to a 'random' page.
        
        Returns:
            Link to a new page to be crawled.
        """
        return
    
    @abc.abstractmethod
    def scrape_page (self):
        """Scrape data from the current page.
        Note: maybe shouldn't require this implementation?
        
        Returns:
            Data from the current page, in whatever format.
        """
        return
        
    def _restart_required (self):
        return self.pages_crawled % self.RESTART_LIMIT == 0
        
    def _no_new_pages_available_from_current (self):
        return self.next_urls == []
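
SetQueue is referenced above but never defined in the example. Below is a
minimal sketch of the behavior the docstring implies (a fixed-capacity tabu
list that forgets its oldest entries); the class name matches the usage
above, but the implementation details are assumptions:

import collections

class SetQueue (object):
    """Bounded FIFO with O(1) membership tests. When full, adding a new
    item evicts the oldest, so old pages become visitable again."""

    def __init__ (self, maxlen):
        self._queue = collections.deque(maxlen=maxlen)
        self._items = set()

    def add (self, item):
        if item in self._items:
            return                               # already tracked
        if len(self._queue) == self._queue.maxlen:
            self._items.discard(self._queue[0])  # evicted by append below
        self._queue.append(item)
        self._items.add(item)

    def __contains__ (self, item):
        return item in self._items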
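
To make the abstract interface concrete, here is a hypothetical subclass
that crawls Wikipedia articles; the site, URLs, and XPath expressions are
illustrative assumptions, not part of the original code:

import random

class WikipediaCrawler (Crawler):

    @Crawler._fetch_page_content
    def _fetch_next_urls_from_current_page (self):
        # Collect internal article links and shuffle for randomness.
        links = self.current_html.xpath('//a/@href')
        urls = ['https://en.wikipedia.org' + l
                for l in links if l.startswith('/wiki/')]
        random.shuffle(urls)
        self.next_urls = urls

    def _random_restart (self):
        # Special:Random redirects to a random article; returning the
        # resolved URL keeps repeated restarts distinct.
        return req.get('https://en.wikipedia.org/wiki/Special:Random').url

    @Crawler._fetch_page_content
    def scrape_page (self):
        return self.current_html.xpath('//title/text()')

A short driving loop, assuming the subclass above:

crawler = WikipediaCrawler(RESTART_LIMIT=50)
for _ in range(10):
    crawler.next_page()
    print(crawler.current_url, crawler.scrape_page())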