class Crawler(object):
    def __init__(self, max_connections, input_is_plain):
        self.max_connections = max_connections
        self.input_is_plain = input_is_plain
        self.queue = Queue(1)
        self.closed = False
        self._handler_pool = GreenPool(self.max_connections)
        # Per-authority cache of robots.txt checkers (built by `get_robots_checker`).
        self._robots_cache = PoolMap(self.get_robots_checker,
                                     pool_max_size=1, timeout=600)

        # Start the IO worker and die if it does.
        self.io_worker = io.Worker(lambda: self.closed)
        t = spawn(self.io_worker.run_loop)
        t.link(reraise_errors, greenthread.getcurrent())

        log.debug(u"Crawler started. Max connections: %d.", self.max_connections)

    def crawl(self, forever=True):
        # TODO: do something special about signals?
        if forever:
            self.start_queue_updater()

        while not self.closed:
            # `get_nowait` will only work together with `sleep(0)` here,
            # because we need a greenlet switch to reraise exceptions from `do_process`.
            sleep()
            try:
                item = self.queue.get_nowait()
            except Empty:
                if not forever:
                    self.graceful_stop()
                sleep(0.01)
                continue
            t = self._handler_pool.spawn(self.do_process, item)
            t.link(reraise_errors, greenthread.getcurrent())

    def stop(self):
        self.closed = True

    def graceful_stop(self, timeout=None):
        """Stops the crawler and waits for all already started crawling
        requests to finish.

        If `timeout` is supplied, waits for at most `timeout` seconds and
        returns True if the allocated time was enough, False otherwise.
        """
        self.closed = True
        if timeout is not None:
            with eventlet.Timeout(timeout, False):
                if hasattr(self, "_queue_updater_thread"):
                    self._queue_updater_thread.kill()
                self._handler_pool.waitall()
                return True
            return False
        else:
            if hasattr(self, "_queue_updater_thread"):
                self._queue_updater_thread.kill()
            self._handler_pool.waitall()

    def start_queue_updater(self):
        self._queue_updater_thread = spawn(self.queue_updater)
        self._queue_updater_thread.link(reraise_errors, greenthread.getcurrent())

    def queue_updater(self):
        log.debug("Waiting for crawl jobs on stdin.")
        for line in sys.stdin:
            if self.closed:
                break
            line = line.strip()

            if self.input_is_plain:
                job = {"url": line}
            else:
                try:
                    job = json.loads(line)
                except ValueError:
                    log.error(u"Failed to decode input line: %s", line)
                    continue

            # Extend the worker queue:
            # 1. skip duplicate URLs;
            for queue_item in self.queue.queue:
                if queue_item["url"] == job["url"]:  # compare URLs
                    break
            else:
                # 2. extend the queue with new items.
                # May block here when the queue is full. This is a feature.
                self.queue.put(job)

        # Stdin exhausted -> stop.
        while not self.queue.empty():
            sleep(0.01)
        sleep(2)  # FIXME: hack to prevent stopping too early.
        self.graceful_stop()

    def get_robots_checker(self, scheme, authority):
        """PoolMap func :: scheme, authority -> (agent, uri -> bool)."""
        robots_uri = "%s://%s/robots.txt" % (scheme, authority)

        fetch_result = self.io_worker.fetch(robots_uri)
        # Graceful stop thing.
        if fetch_result is None:
            return None

        if fetch_result["success"]:
            # TODO: set expiration time from headers,
            # but this must be done after `self._robots_cache.put` or somehow else...
            if 200 <= fetch_result["status_code"] < 300:
                parser = robotparser.RobotFileParser()
                content_lines = fetch_result["content"].splitlines()
                try:
                    parser.parse(content_lines)
                except KeyError:
                    raise RobotsError(u"Known robotparser bug: KeyError at urllib.quote(path).")
                return parser.can_fetch
            # Authorization Required and Forbidden are considered Disallow all.
            elif fetch_result["status_code"] in (401, 403):
                return lambda _agent, _uri: False
            # /robots.txt Not Found is considered Allow all.
            elif fetch_result["status_code"] == 404:
                return lambda _agent, _uri: True
            # FIXME: this is an optimistic rule and probably should be refined
            # with more specific checks.
            elif fetch_result["status_code"] >= 400:
                return lambda _agent, _uri: True
            # What other cases are left? 100 and redirects. Consider them Disallow all.
            else:
                return lambda _agent, _uri: False
        else:
            raise FetchError(u"/robots.txt fetch problem: %s" % (fetch_result["result"],))

    def ask_robots(self, uri, scheme, authority):
        key = scheme + ":" + authority
        with self._robots_cache.getc(key, scheme, authority) as checker:
            try:
                # Graceful stop thing.
                if checker is None:
                    return None
                return checker(settings.identity["name"], uri)
            except Exception, e:
                log.exception(u"Get rid of this. ask_robots @ %s", uri)
                raise RobotsError(u"Error checking robots.txt permissions for URI '%s': %s"
                                  % (uri, unicode(e)))