class BaseCrawler(object):
    """
        User's crawlers must inherit from this class. They may override some
        methods and define the start_urls list, the scrapers and the maximum
        crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed to be crawled """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    max_concurrency_level = None
    """ The maximum coroutines concurrency level """

    headers = {}
    """ The default request headers """

    requests_delay = config.REQUEST_DELAY
    """ The average delay time between requests """

    requests_deviation = config.REQUEST_DEVIATION
    """ The requests deviation time """

    extractor = None
    """ The extractor class. Defaults to XPathExtractor """

    post_urls = []
    """
        The POST data for the urls. A list of tuples containing (url, data_dict).
        Example: ("http://www.mypage.com/post_url", {'page': '1', 'color': 'blue'})
    """

    login = None
    """
        The login data. A tuple of (url, login_dict).
        Example: ("http://www.mypage.com/login", {'user': 'myuser', 'pass': 'mypassword'})
    """

    search_all_urls = True
    """
        If the user doesn't define the get_urls method in the scrapers, the
        crawler will search for urls in the current page itself, depending on
        the [search_all_urls] attribute.
    """

    search_hidden_urls = False
    """ Search for hidden urls in the whole html """

    def __init__(self, sessions=None, settings=None):
        """
            Initializes the crawler

            params:
                sessions: Database or Documents persistent sessions
                settings: Crawler settings module; SHOW_DEBUG_INFO controls
                          whether the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)

        self.request_manager = RequestManager(settings=settings,
                                              headers=self.headers,
                                              delay=self.requests_delay,
                                              deviation=self.requests_deviation)

        self._initialize_scrapers()

    def _initialize_scrapers(self):
        """
            Instantiates all the scraper classes
        """

        self.scrapers = [scraper_class(settings=self.settings)
                         for scraper_class in self.scrapers]

    def _make_request(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if this param is present it makes a POST.
        """

        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if this param is present it makes a POST.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def request(self, url, data=None):
        return self._get_response(url, data=data)

    def _manage_scrapers(self, response):
        """
            Checks if some scraper is suited for data extraction on the
            current url. If so, gets the extractor object and delegates the
            scraping task to the scraper object.
        """

        scraped_urls = []

        for scraper in self.scrapers:
            urls = scraper.try_scrape(response)

            if urls is not None:
                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
            Makes a commit in all the sessions
        """

        for session in self.sessions:
            session.commit()

    def _search_in_urls_list(self, urls_list, url, default=True):
        """
            Searches for a url in a list of url patterns
        """

        if not urls_list:
            return default

        for pattern in urls_list:
            if url_matcher(url, pattern):
                return True

        return False

    def _validate_url(self, url):
        """
            Validates that the url is in the crawler's [allowed_urls] list
            and not in [black_list].
        """

        return self._search_in_urls_list(self.allowed_urls, url) and \
               not self._search_in_urls_list(self.black_list, url, default=False)

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            params:
                url: The url to crawl
                depth_level: The current recursion depth
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "-" * 80
            print "crawling -> %s" % url

        try:
            response = self._get_response(url)
        except Exception, ex:
            self.on_request_error(url, ex)
            return

        if self.debug:
            print "-" * 80

        urls = self._manage_scrapers(response)

        if not urls:
            if self.search_all_urls:
                urls = self.get_urls(response)
            else:
                return

        for new_url in urls:
            if depth_level >= self.max_depth and self.max_depth != -1:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)
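# Usage sketch (illustrative; not part of the sources above): a minimal
# user-defined crawler built on BaseCrawler. MyScraper, the urls and the
# allowed_urls/black_list entries are hypothetical placeholders, and the
# exact pattern syntax accepted by url_matcher is not shown in this excerpt.
# The scraper interface is inferred from _initialize_scrapers
# (scraper_class(settings=...)) and _manage_scrapers (try_scrape(response)).

class MyScraper(object):
    """ Hypothetical scraper used only to illustrate the expected interface """

    def __init__(self, settings=None):
        self.settings = settings

    def try_scrape(self, response):
        # Return a list of urls to crawl next, or None if this scraper
        # does not apply to the given response.
        return None


class MyCrawler(BaseCrawler):
    """ Hypothetical crawler subclass exercising the documented attributes """

    start_urls = ["http://www.mypage.com/"]
    allowed_urls = ["http://www.mypage.com"]
    black_list = ["http://www.mypage.com/logout"]
    scrapers = [MyScraper]
    max_depth = 2
    post_urls = [("http://www.mypage.com/post_url",
                  {'page': '1', 'color': 'blue'})]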
# Variant of BaseCrawler that builds a GreenPool directly and takes a plain
# debug flag instead of a settings object.

class BaseCrawler(object):
    """
        User's crawlers must inherit from this class. They may override some
        methods and define the start_urls list, the scrapers and the maximum
        crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed to be crawled """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    extractor = None
    """ The extractor class. Defaults to XPathExtractor """

    post_urls = []
    """
        The POST data for the urls. A list of tuples containing (url, data_dict).
        Example: ("http://www.mypage.com/post_url", {'page': '1', 'color': 'blue'})
    """

    login = None
    """
        The login data. A tuple of (url, login_dict).
        Example: ("http://www.mypage.com/login", {'user': 'myuser', 'pass': 'mypassword'})
    """

    search_all_urls = True
    """
        If the user doesn't define the get_urls method in the scrapers, the
        crawler will search for urls in the current page itself, depending on
        the [search_all_urls] attribute.
    """

    search_hidden_urls = False
    """ Search for hidden urls in the whole html """

    # Note: Python's re module does not support POSIX classes, so [:punct:]
    # below is parsed as a plain character set rather than as punctuation.
    _url_regex = re_compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

    def __init__(self, sessions=None, debug=False):
        """
            Initializes the crawler

            params:
                sessions: Database or Documents persistent sessions
                debug: indicates if the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = debug

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        self.pool = GreenPool()
        self.request_manager = RequestManager()

        self._initialize_scrapers()

    def _initialize_scrapers(self):
        """
            Instantiates all the scraper classes
        """

        self.scrapers = [scraper_class(debug=self.debug)
                         for scraper_class in self.scrapers]

    def _make_request(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if this param is present it makes a POST.
        """

        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if this param is present it makes a POST.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def _manage_scrapers(self, response):
        """
            Checks if some scraper is suited for data extraction on the
            current url. If so, gets the extractor object and delegates the
            scraping task to the scraper object.
        """

        scraped_urls = []

        for scraper in self.scrapers:
            urls = scraper.try_scrape(response)

            if urls is not None:
                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
            Makes a commit in all the sessions
        """

        for session in self.sessions:
            session.commit()

    def _search_in_urls_list(self, urls_list, url, default=True):
        """
            Searches for a url in a list of url patterns
        """

        if not urls_list:
            return default

        for pattern in urls_list:
            if url_matcher(url, pattern):
                return True

        return False

    def _validate_url(self, url):
        """
            Validates that the url is in the crawler's [allowed_urls] list
            and not in [black_list].
        """

        return self._search_in_urls_list(self.allowed_urls, url) and \
               not self._search_in_urls_list(self.black_list, url, default=False)

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            params:
                url: The url to crawl
                depth_level: The current recursion depth
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "-" * 80
            print "crawling -> %s" % url

        try:
            response = self._get_response(url)
        except Exception, e:
            if self.debug:
                print "Request to %s returned error: %s" % (url, e)
            return

        if self.debug:
            print "-" * 80

        urls = self._manage_scrapers(response)

        if not urls:
            if self.search_all_urls:
                urls = self.get_urls(response)
            else:
                return

        for new_url in urls:
            if depth_level >= self.max_depth and self.max_depth != -1:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)
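# Illustrative sketch (an assumption, not part of the sources above): one way
# the hidden-url search suggested by [search_hidden_urls] could use the
# _url_regex class attribute defined above. The raw_html argument is a
# placeholder; this excerpt does not show how a response exposes its html.

def find_hidden_urls(crawler, raw_html):
    """ Returns every url-looking substring of raw_html matched by _url_regex """

    return [match.group(0) for match in crawler._url_regex.finditer(raw_html)]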