class RobotCheck(Step):

    # Shared cache of compiled robots.txt matchers, keyed by robots.txt URL.
    cache = LRUCache(1000)

    @classmethod
    def post_save_clear(cls, sender, document, **kwargs):
        """Invalidate cached matchers when a site's robot rules change."""
        logging.debug("Removing robots.txt cache information for: %s" % document.site)
        for scheme in ('http', 'https'):
            url = "%s://%s/robots.txt" % (scheme, document.site)
            if url in cls.cache:
                del cls.cache[url]

    def __init__(self, settings, **kwargs):
        """Initialization"""
        self.settings = settings
        self.fetch = Fetch(settings)
        self.store = StoreResponse(settings)

    @gen.engine
    def process(self, task, callback=None, **kwargs):
        url = "%s://%s/robots.txt" % (task.url_scheme, task.url_host)
        if url in self.cache:
            matcher = self.cache[url]
        else:
            matcher = self.cache[url] = yield gen.Task(self.build_matcher, url)
        # TODO - Get Crawl Delay ``matcher.get_crawl_delay()``
        if matcher.is_allowed_path(task.url_path):
            callback((Step.CONTINUE, task))
        else:
            callback((Step.STOP, task))

    @gen.engine
    def build_matcher(self, url, callback):
        task = Task(url)
        # Per-site overrides stored alongside the crawl data, applied in
        # their saved order.
        extra_rules = []
        for rule in RobotRule.objects(site=task.url_host):
            extra_rules.append(('allow' if rule.flag else 'deny', rule.path, rule.order))
        extra_rules = sorted(extra_rules, key=lambda x: x[2])
        try:
            parser = RobotParser(useragent=self.settings.USER_AGENT,
                                 extra_rules=extra_rules)
        except Exception:
            logging.exception("Exception building robot parser")
            raise
        v, t = yield gen.Task(self.fetch.process, task)
        # Save the robots.txt
        yield gen.Task(self.store.process, task)
        if task.content:
            parser.parse(task.content)
        matcher = parser.matcher(self.settings.ROBOT_NAME)
        callback(matcher)
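
# The post_save_clear hook above is designed as a MongoEngine post_save
# receiver. A minimal wiring sketch, assuming RobotRule is the document whose
# saves should invalidate the cache (signals.post_save.connect is real
# MongoEngine API; the choice of sender here is an assumption):
from mongoengine import signals

signals.post_save.connect(RobotCheck.post_save_clear, sender=RobotRule)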
# These imports belong at module level rather than inside __init__.
import os
import socket


def __init__(self, redis):
    super(RedisQueue, self).__init__()
    self.redis = redis
    self._owned = {}
    self._seen_cache = LRUCache(10000)  # bounded LRU of recently seen items
    # A unique id for this worker across hosts and processes.
    self.guid = "%s:%s" % (socket.gethostname(), os.getpid())
    self._known_buckets = set()
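
# A hypothetical sketch of how the guid might be used: claiming exclusive
# ownership of a queue bucket via Redis. `claim_bucket` and the key layout
# are assumptions for illustration, not part of the original class; setnx
# and get are standard redis-py calls.
import time

def claim_bucket(self, bucket):
    key = "bucket-owner:%s" % bucket
    if self.redis.setnx(key, self.guid):      # first guid to SETNX wins
        self._owned[bucket] = time.time()
        self._known_buckets.add(bucket)
        return True
    return self.redis.get(key) == self.guid   # already ours from earlier?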
class DNSHandler(object):
    """Cache DNS lookups; plugged into the Tornado async HTTP client."""

    def __init__(self):
        self.cache = LRUCache(1000)

    def get(self, host, default=None):
        """Mimic dict.get(), resolving cache misses through getaddrinfo.

        `default` is accepted for dict.get() compatibility but unused;
        a failed lookup raises socket.gaierror instead.
        """
        addr = self.cache.get(host, None)
        if addr:
            return addr
        # Resolve, then cache the first returned address for this host.
        addrinfo = socket.getaddrinfo(host, 80, 0, 0, socket.SOL_TCP)
        af, socktype, proto, canonname, sockaddr = addrinfo[0]
        self.cache[host] = sockaddr[0]
        return sockaddr[0]
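
# A minimal usage sketch (example.com is just a placeholder host). The lookup
# itself is a blocking getaddrinfo call, so only the first call per host pays
# the resolution cost; repeats are served from the LRU cache.
resolver = DNSHandler()
print(resolver.get("example.com"))  # resolves via getaddrinfo, then caches
print(resolver.get("example.com"))  # second call hits the cache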
def __init__(self):
    self.cache = LRUCache(1000)
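
# LRUCache is used throughout these snippets but never defined here. Below is
# a minimal stand-in sketch with the interface the snippets rely on (get,
# __contains__, __getitem__, __setitem__, __delitem__); its semantics are an
# assumption, not the project's actual implementation.
import collections

class LRUCache(object):
    def __init__(self, size):
        self.size = size
        self._data = collections.OrderedDict()

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        # Note: membership tests do not refresh recency in this sketch.
        return key in self._data

    def __getitem__(self, key):
        value = self._data.pop(key)  # KeyError propagates on a miss
        self._data[key] = value      # re-insert as most recently used
        return value

    def __setitem__(self, key, value):
        if key in self._data:
            self._data.pop(key)
        elif len(self._data) >= self.size:
            self._data.popitem(last=False)  # evict least recently used
        self._data[key] = value

    def __delitem__(self, key):
        del self._data[key]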
class Fetch(Step):

    default_delay = 10
    # Per-host timestamps of the next permitted fetch, bounded by LRU.
    cache = LRUCache(1000)

    def __init__(self, settings, user_settings=None, **kwargs):
        self.client = httpclient.AsyncHTTPClient()
        self.use_gzip = settings.USE_GZIP
        self.user_agent = settings.USER_AGENT
        self.ioloop = ioloop.IOLoop.instance()
        if user_settings:
            self.post_save(None, user_settings)

    def process(self, task, callback=None, **kwargs):
        task.request = httpclient.HTTPRequest(task.url, use_gzip=self.use_gzip,
                                              user_agent=self.user_agent)
        tnow = time.time()
        tv = self.cache.get(task.url_host, tnow)
        if tv > tnow:
            # Host is rate-limited: schedule the fetch for its next free slot.
            tnext = tv + self.delay
            self.cache[task.url_host] = tnext
            logging.debug("Fetching on timer in %.2f seconds" % (tnext - tnow))
            self.ioloop.add_timeout(tnext, partial(self.fetch, task, callback))
        else:
            logging.debug("Fetcher not busy %r" % (tv,))
            self.cache[task.url_host] = tnow + self.delay
            self.fetch(task, callback)

    @property
    def delay(self):
        return self.__class__.default_delay

    @gen.engine
    def fetch(self, task, callback):
        logging.debug("Starting fetch of url=%s" % (task.url,))
        task.response = yield gen.Task(self.client.fetch, task.request)
        blen = len(task.response.body) if task.response.body else 0
        try:
            raw_len = int(task.response.headers.get('content-length', blen))
        except (TypeError, ValueError):
            raw_len = blen
        logging.debug("Fetched code=%d len_raw=%d len=%d url=%s" %
                      (task.response.code, raw_len, blen, task.url))
        PageStats.crawled(task.response.code, raw_len)
        if task.response.code == 200:
            task.content = task.content_from_response()
        elif task.response.code in (301, 302):
            logging.error("Unhandled redirect code=%d url=%s" %
                          (task.response.code, task.url))
            task.content = None  # redirects are not followed yet
        else:
            task.content = None
        callback((Step.CONTINUE, task))

    @classmethod
    def post_save(cls, sender, document, **kwargs):
        # Signal receiver: pick up crawl-delay changes from saved settings.
        cls.default_delay = document.crawl_delay
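
# A hedged usage sketch of the per-host politeness delay: the second request
# to the same host is pushed out by `default_delay` seconds via add_timeout
# instead of going straight to fetch. `settings` and `Task` stand in for this
# project's real objects and are assumptions here.
from tornado import ioloop

def report(result):
    state, task = result
    print(task.url, task.response.code)

fetcher = Fetch(settings)
fetcher.process(Task("http://example.com/a"), callback=report)
fetcher.process(Task("http://example.com/b"), callback=report)  # ~10s later
ioloop.IOLoop.instance().start()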