def process(self, task, callback=None, **kwargs):
    """Fetch ``task.url`` and attach the request, response and content to ``task``.

    This is a generator that yields a single ``gen.Task`` wrapping
    ``self.client.fetch``; it is presumably driven by a Tornado generator
    decorator applied where the method is defined (not visible here --
    TODO confirm).

    Args:
        task: Object exposing ``url`` and ``content_from_response()``.
            ``request``, ``response`` and (depending on the status code)
            ``content`` are assigned on it.
        callback: Optional callable invoked once with the tuple
            ``(Step.CONTINUE, task)`` after the response has been handled.
            When ``None`` no callback is made.
        **kwargs: Accepted for interface compatibility; unused here.
    """
    task.request = httpclient.HTTPRequest(
        task.url, use_gzip=self.use_gzip, user_agent=self.user_agent)
    task.response = yield gen.Task(self.client.fetch, task.request)
    response = task.response

    body = response.body
    blen = len(body) if body else 0

    # Prefer the Content-Length header as the "raw" size; fall back to the
    # body length when the header is absent or cannot be parsed as an int.
    try:
        raw_len = int(response.headers.get('content-length', blen))
    except (AttributeError, TypeError, ValueError):
        raw_len = blen

    # Pass arguments lazily so the message is only formatted when DEBUG
    # logging is actually enabled.
    logging.debug("Fetched code=%d len_raw=%d len=%d url=%s",
                  response.code, raw_len, blen, task.url)
    PageStats.crawled(response.code, raw_len)

    if response.code == 200:
        task.content = task.content_from_response()
    elif response.code in (301, 302):
        # NOTE(review): task.content is deliberately left untouched on a
        # redirect, matching the previous behaviour -- confirm callers do
        # not rely on it being reset here.
        logging.error("Unhandled Redirect code=%d url=%s",
                      response.code, task.url)
    else:
        task.content = None

    # callback defaults to None, so calling it unconditionally would raise
    # TypeError for callers that omit it.
    if callback is not None:
        callback((Step.CONTINUE, task))
def fetch(self, task, callback):
    """Fetch the already-built ``task.request`` and record the outcome on ``task``.

    This is a generator that yields a single ``gen.Task`` wrapping
    ``self.client.fetch``; it is presumably driven by a Tornado generator
    decorator applied where the method is defined (not visible here --
    TODO confirm).

    Args:
        task: Object exposing ``url``, ``request`` and
            ``content_from_response()``. ``response`` and (depending on the
            status code) ``content`` are assigned on it.
        callback: Callable invoked once with the tuple
            ``(Step.CONTINUE, task)`` after the response has been handled.
    """
    # Lazy logging arguments; this also avoids the ``"..." % (task.url)``
    # pitfall where a tuple-valued url would be unpacked as format args.
    logging.debug("Starting fetch of url=%s", task.url)

    task.response = yield gen.Task(self.client.fetch, task.request)
    response = task.response

    body = response.body
    blen = len(body) if body else 0

    # Prefer the Content-Length header as the "raw" size; fall back to the
    # body length when the header is absent or cannot be parsed as an int.
    try:
        raw_len = int(response.headers.get('content-length', blen))
    except (AttributeError, TypeError, ValueError):
        raw_len = blen

    logging.debug("Fetched code=%d len_raw=%d len=%d url=%s",
                  response.code, raw_len, blen, task.url)
    PageStats.crawled(response.code, raw_len)

    if response.code == 200:
        task.content = task.content_from_response()
    elif response.code in (301, 302):
        # NOTE(review): task.content is deliberately left untouched on a
        # redirect, matching the previous behaviour -- confirm callers do
        # not rely on it being reset here.
        logging.error("Unhandled Redirect code=%d url=%s",
                      response.code, task.url)
    else:
        task.content = None

    callback((Step.CONTINUE, task))
def get(self):
    """Finish the request with whatever ``PageStats.stats()`` returns.

    Returns:
        The return value of ``self.finish``.
    """
    stats = PageStats.stats()
    return self.finish(stats)