class RedFetcher(RedState):
    """
    Fetches the given URI (with the provided method, headers and body) and calls:
      - status_cb as it progresses, and
      - every function in the body_procs list with each chunk of the body, and
      - done_cb when all tasks are done.

    If provided, type indicates the type of the request, and is used to
    help set notes and status_cb appropriately.

    The done() method is called when the response is done, NOT when all
    tasks are done. It can add tasks by calling add_task().
    """
    client = RedHttpClient()

    def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
                 status_cb=None, body_procs=None, name=None):
        RedState.__init__(self, name)
        self.request = HttpRequest(self.notes, self.name)
        self.request.method = method
        self.request.set_iri(iri)
        self.request.headers = req_hdrs or []
        self.request.payload = req_body
        self.response = HttpResponse(self.notes, self.name)
        self.response.is_head_response = (method == "HEAD")
        self.response.base_uri = self.request.uri
        self.response.set_decoded_procs(body_procs or [])
        self.exchange = None
        self.status_cb = status_cb
        self.done_cb = None
        self.outstanding_tasks = 0
        self._st = []  # FIXME: this is temporary, for debugging thor

    def __getstate__(self):
        state = self.__dict__.copy()
        del state['exchange']
        del state['status_cb']
        del state['done_cb']
        return state

    def add_task(self, task, *args):
        "Remember that we've started a task."
        self.outstanding_tasks += 1
        self._st.append('add_task(%s)' % str(task))
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        "Note that we've finished a task, and see if we're done."
        self.outstanding_tasks -= 1
        self._st.append('finish_task()')
        assert self.outstanding_tasks >= 0, self._st
        if self.outstanding_tasks == 0:
            if self.done_cb:
                self.done_cb()
                self.done_cb = None
            # clean up potentially cyclic references
            self.status_cb = None

    def done(self):
        "Callback for when the response is complete and analysed."
        raise NotImplementedError

    def preflight(self):
        """
        Callback to check to see if we should bother running.
        Return True if so; False if not.
        """
        return True

    def run(self, done_cb=None):
        """
        Make an asynchronous HTTP request to uri, calling status_cb as it's
        updated and done_cb when it's done. Reason is used to explain what
        the request is in the status callback.
        """
        self.outstanding_tasks += 1
        self._st.append('run(%s)' % str(done_cb))
        self.done_cb = done_cb
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self.finish_task()
            return
        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", u"RED/%s (http://redbot.org/)" % __version__))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (self.request.uri, self.name))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs)
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
        self.exchange.request_done([])

    def _response_start(self, status, phrase, res_headers):
        "Process the response start-line and headers."
        self._st.append('_response_start(%s, %s)' % (status, phrase))
        self.response.start_time = thor.time()
        self.response.version = self.exchange.res_version
        self.response.status_code = status.decode('iso-8859-1', 'replace')
        self.response.status_phrase = phrase.decode('iso-8859-1', 'replace')
        self.response.set_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk):
        "Process a chunk of the response body."
        self.response.feed_body(chunk)

    def _response_done(self, trailers):
        "Finish analysing the response, handling any parse errors."
        self._st.append('_response_done()')
        self.response.complete_time = thor.time()
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        if self.status_cb and self.name:
            self.status_cb("fetched %s (%s)" % (self.request.uri, self.name))
        self.done()
        self.finish_task()

    def _response_error(self, error):
        "Handle an error encountered while fetching the response."
        self._st.append('_response_error(%s)' % (str(error)))
        self.response.complete_time = thor.time()
        self.response.http_error = error
        if isinstance(error, httperr.BodyForbiddenError):
            self.add_note('header-none', rs.BODY_NOT_ALLOWED)
        # elif isinstance(error, httperr.ExtraDataErr):
        #     res.payload_len += len(err.get('detail', ''))
        elif isinstance(error, httperr.ChunkError):
            err_msg = error.detail[:20] or ""
            self.add_note('header-transfer-encoding', rs.BAD_CHUNK,
                          chunk_sample=err_msg.encode('string_escape'))
        self.done()
        self.finish_task()
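# The add_task()/finish_task() pair above is a simple outstanding-task counter:
# each subtask bumps the count when it starts and decrements it when it calls
# back, and done_cb fires only once the count returns to zero. A minimal
# standalone sketch of that pattern follows; the names here (TaskGroup,
# slow_task) are illustrative only, not part of REDbot's API.

class TaskGroup:
    "Count outstanding tasks; fire done_cb when the last one finishes."

    def __init__(self, done_cb):
        self.outstanding_tasks = 0
        self.done_cb = done_cb

    def add_task(self, task, *args):
        self.outstanding_tasks += 1
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        self.outstanding_tasks -= 1
        assert self.outstanding_tasks >= 0
        if self.outstanding_tasks == 0 and self.done_cb:
            self.done_cb()
            self.done_cb = None  # break potentially cyclic references


pending = []

def slow_task(label, done_cb):
    pending.append((label, done_cb))  # simulate async work; finish it later

group = TaskGroup(lambda: print("all tasks done"))
group.add_task(slow_task, "a")
group.add_task(slow_task, "b")
for label, done_cb in pending:
    done_cb()  # "all tasks done" prints only after the second callback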
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' and 'debug' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    If provided, 'name' indicates the type of the request, and is used to
    help set notes and status events appropriately.
    """

    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    client.idle_timeout = 5
    robot_emitter = thor.events.EventEmitter()

    def __init__(self, config: SectionProxy) -> None:
        thor.events.EventEmitter.__init__(self)
        self.config = config
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        state = thor.events.EventEmitter.__getstate__(self)
        del state["exchange"]
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if "response" not in kw:
            kw["response"] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True if so;
        False if not. Can be overridden.
        """
        return True

    def set_request(
        self,
        iri: str,
        method: str = "GET",
        req_hdrs: StrHeaderListType = None,
        req_body: bytes = None,
    ) -> None:
        """
        Set the resource's request. All values are strings.
        """
        self.request.method = method
        self.response.is_head_response = method == "HEAD"  # type: ignore
        try:
            self.request.set_iri(iri)
        except httperr.UrlError as why:
            self.response.http_error = why
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore  # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done. Reason is used to explain
        what the request is in the status callback.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return
        self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return
        self.fetch_started = True
        if "user-agent" not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on("response_nonfinal", self._response_nonfinal)
        self.exchange.once("response_start", self._response_start)
        self.exchange.on("response_body", self._response_body)
        self.exchange.once("response_done", self._response_done)
        self.exchange.on("error", self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        self.emit("debug", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [
            (k.encode("ascii", "replace"), v.encode("ascii", "replace"))
            for (k, v) in self.request.headers
        ]  # FIXME: should complain
        self.exchange.request_start(
            self.request.method.encode("ascii"),
            self.request.uri.encode("ascii"),
            req_hdrs,
        )
        self.request.start_time = thor.time()
        if not self.fetch_done:  # the request could have immediately failed.
            if self.request.payload is not None:
                self.exchange.request_body(self.request.payload)
                self.transfer_out += len(self.request.payload)
        if not self.fetch_done:  # the request could have immediately failed.
            self.exchange.request_done([])

    def _response_nonfinal(
        self, status: bytes, phrase: bytes, res_headers: RawHeaderListType
    ) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(
        self, status: bytes, phrase: bytes, res_headers: RawHeaderListType
    ) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("debug", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        "Handle an error encountered while fetching the response."
        self.emit(
            "debug",
            "fetch error %s (%s) - %s"
            % (self.request.uri, self.check_name, error.desc),
        )
        err_sample = error.detail[:40] or ""
        if isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note("body", BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note("body", EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note("header-transfer-encoding", BAD_CHUNK, chunk_sample=err_sample)
        elif isinstance(error, httperr.HeaderSpaceError):
            subject = "header-%s" % (error.detail.lower().strip())
            self.add_note(subject, HEADER_NAME_SPACE, header_name=error.detail)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        if not self.fetch_done:
            self.fetch_done = True
            self.exchange = None
            self.emit("fetch_done")
def checkCaching(response: HttpResponse, request: HttpRequest = None) -> None:
    "Examine HTTP caching characteristics."

    # get header values
    lm_hdr = response.parsed_headers.get("last-modified", None)
    date_hdr = response.parsed_headers.get("date", None)
    expires_hdr = response.parsed_headers.get("expires", None)
    etag_hdr = response.parsed_headers.get("etag", None)
    age_hdr = response.parsed_headers.get("age", None)
    cc_set = response.parsed_headers.get("cache-control", [])
    cc_list = [k for (k, v) in cc_set]
    cc_dict = dict(cc_set)
    cc_keys = list(cc_dict.keys())

    # Last-Modified
    if lm_hdr:
        serv_date = date_hdr or response.start_time
        if lm_hdr > serv_date:
            response.add_note("header-last-modified", LM_FUTURE)
        else:
            response.add_note(
                "header-last-modified",
                LM_PRESENT,
                last_modified_string=relative_time(lm_hdr, serv_date),
            )

    # known Cache-Control directives that don't allow duplicates
    known_cc = [
        "max-age",
        "no-store",
        "s-maxage",
        "public",
        "private",
        "pre-check",
        "post-check",
        "stale-while-revalidate",
        "stale-if-error",
    ]

    # check for mis-capitalised directives /
    # assure there aren't any dup directives with different values
    for cc in cc_keys:
        if cc.lower() in known_cc and cc != cc.lower():
            response.add_note("header-cache-control", CC_MISCAP, cc_lower=cc.lower(), cc=cc)
        if cc in known_cc and cc_list.count(cc) > 1:
            response.add_note("header-cache-control", CC_DUP, cc=cc)

    # Who can store this?
    if request and request.method not in cacheable_methods:
        response.store_shared = response.store_private = False
        request.add_note("method", METHOD_UNCACHEABLE, method=request.method)
        return  # bail; nothing else to see here
    if "no-store" in cc_keys:
        response.store_shared = response.store_private = False
        response.add_note("header-cache-control", NO_STORE)
        return  # bail; nothing else to see here
    if "private" in cc_keys:
        response.store_shared = False
        response.store_private = True
        response.add_note("header-cache-control", PRIVATE_CC)
    elif (
        request
        and "authorization" in [k.lower() for k, v in request.headers]
        and "public" not in cc_keys
    ):
        response.store_shared = False
        response.store_private = True
        response.add_note("header-cache-control", PRIVATE_AUTH)
    else:
        response.store_shared = response.store_private = True
        response.add_note("header-cache-control", STOREABLE)

    # no-cache?
    if "no-cache" in cc_keys:
        if lm_hdr is None and etag_hdr is None:
            response.add_note("header-cache-control", NO_CACHE_NO_VALIDATOR)
        else:
            response.add_note("header-cache-control", NO_CACHE)
        return

    # pre-check / post-check
    if "pre-check" in cc_keys or "post-check" in cc_keys:
        if "pre-check" not in cc_keys or "post-check" not in cc_keys:
            response.add_note("header-cache-control", CHECK_SINGLE)
        else:
            pre_check = post_check = None
            try:
                pre_check = int(cc_dict["pre-check"])
                post_check = int(cc_dict["post-check"])
            except ValueError:
                response.add_note("header-cache-control", CHECK_NOT_INTEGER)
            if pre_check is not None and post_check is not None:
                if pre_check == 0 and post_check == 0:
                    response.add_note("header-cache-control", CHECK_ALL_ZERO)
                elif post_check > pre_check:
                    response.add_note("header-cache-control", CHECK_POST_BIGGER)
                    post_check = pre_check
                elif post_check == 0:
                    response.add_note("header-cache-control", CHECK_POST_ZERO)
                else:
                    response.add_note(
                        "header-cache-control",
                        CHECK_POST_PRE,
                        pre_check=pre_check,
                        post_check=post_check,
                    )

    # vary?
    vary = response.parsed_headers.get("vary", set())
    if "*" in vary:
        response.add_note("header-vary", VARY_ASTERISK)
        return  # bail; nothing else to see here
    if len(vary) > 3:
        response.add_note("header-vary", VARY_COMPLEX, vary_count=f_num(len(vary)))
    else:
        if "user-agent" in vary:
            response.add_note("header-vary", VARY_USER_AGENT)
        if "host" in vary:
            response.add_note("header-vary", VARY_HOST)

    # calculate age
    response.age = age_hdr or 0
    age_str = relative_time(response.age, 0, 0)
    if date_hdr and date_hdr > 0:
        apparent_age = max(0, int(response.start_time - date_hdr))
    else:
        apparent_age = 0
    current_age = max(apparent_age, response.age)
    current_age_str = relative_time(current_age, 0, 0)
    if response.age >= 1:
        response.add_note("header-age header-date", CURRENT_AGE, age=age_str)

    # Check for clock skew and dateless origin server.
    if not date_hdr:
        response.add_note("", DATE_CLOCKLESS)
        if expires_hdr or lm_hdr:
            response.add_note("header-expires header-last-modified", DATE_CLOCKLESS_BAD_HDR)
    else:
        skew = date_hdr - response.start_time + (response.age)
        if response.age > max_clock_skew and (current_age - skew) < max_clock_skew:
            response.add_note("header-date header-age", AGE_PENALTY)
        elif abs(skew) > max_clock_skew:
            response.add_note(
                "header-date",
                DATE_INCORRECT,
                clock_skew_string=relative_time(skew, 0, 2),
            )
        else:
            response.add_note("header-date", DATE_CORRECT)

    # calculate freshness
    freshness_lifetime = 0
    has_explicit_freshness = False
    has_cc_freshness = False
    freshness_hdrs = ["header-date"]
    if "s-maxage" in cc_keys:
        freshness_lifetime = cc_dict["s-maxage"]
        freshness_hdrs.append("header-cache-control")
        has_explicit_freshness = True
        has_cc_freshness = True
    elif "max-age" in cc_keys:
        freshness_lifetime = cc_dict["max-age"]
        freshness_hdrs.append("header-cache-control")
        has_explicit_freshness = True
        has_cc_freshness = True
    elif "expires" in response.parsed_headers:
        # An invalid Expires header means it's automatically stale
        has_explicit_freshness = True
        freshness_hdrs.append("header-expires")
        freshness_lifetime = (expires_hdr or 0) - (date_hdr or int(response.start_time))

    freshness_left = freshness_lifetime - current_age
    freshness_left_str = relative_time(abs(int(freshness_left)), 0, 0)
    freshness_lifetime_str = relative_time(int(freshness_lifetime), 0, 0)
    response.freshness_lifetime = freshness_lifetime
    fresh = freshness_left > 0

    if has_explicit_freshness:
        if fresh:
            response.add_note(
                " ".join(freshness_hdrs),
                FRESHNESS_FRESH,
                freshness_lifetime=freshness_lifetime_str,
                freshness_left=freshness_left_str,
                current_age=current_age_str,
            )
        elif has_cc_freshness and response.age > freshness_lifetime:
            response.add_note(
                " ".join(freshness_hdrs),
                FRESHNESS_STALE_CACHE,
                freshness_lifetime=freshness_lifetime_str,
                freshness_left=freshness_left_str,
                current_age=current_age_str,
            )
        else:
            response.add_note(
                " ".join(freshness_hdrs),
                FRESHNESS_STALE_ALREADY,
                freshness_lifetime=freshness_lifetime_str,
                freshness_left=freshness_left_str,
                current_age=current_age_str,
            )
    # can heuristic freshness be used?
    elif response.status_code in heuristic_cacheable_status:
        response.add_note("header-last-modified", FRESHNESS_HEURISTIC)
    else:
        response.add_note("", FRESHNESS_NONE)

    # can stale responses be served?
    if "must-revalidate" in cc_keys:
        if fresh:
            response.add_note("header-cache-control", FRESH_MUST_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note("header-cache-control", STALE_MUST_REVALIDATE)
    elif "proxy-revalidate" in cc_keys or "s-maxage" in cc_keys:
        if fresh:
            response.add_note("header-cache-control", FRESH_PROXY_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note("header-cache-control", STALE_PROXY_REVALIDATE)
    else:
        if fresh:
            response.add_note("header-cache-control", FRESH_SERVABLE)
        elif has_explicit_freshness:
            response.add_note("header-cache-control", STALE_SERVABLE)

    # public?
    if "public" in cc_keys:
        # TODO: check for authentication in request
        response.add_note("header-cache-control", PUBLIC)
class RedFetcher(RedState):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and calls:
      - status_cb as it progresses, and
      - every function in the body_procs list with each chunk of the body, and
      - done_cb when all tasks are done.

    If provided, type indicates the type of the request, and is used to
    help set notes and status_cb appropriately.

    The done() method is called when the response is done, NOT when all
    tasks are done. It can add tasks by calling add_task().
    """
    client = RedHttpClient()
    robot_files = {}  # cache of robots.txt
    robot_cache_dir = None
    robot_lookups = {}

    def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
                 status_cb=None, body_procs=None, name=None):
        RedState.__init__(self, name)
        self.request = HttpRequest(self.notes, self.name)
        self.request.method = method
        self.request.set_iri(iri)
        self.request.headers = req_hdrs or []
        self.request.payload = req_body
        self.response = HttpResponse(self.notes, self.name)
        self.response.is_head_response = (method == "HEAD")
        self.response.base_uri = self.request.uri
        self.response.set_decoded_procs(body_procs or [])
        self.exchange = None
        self.status_cb = status_cb
        self.done_cb = None  # really should be "all tasks done"
        self.outstanding_tasks = 0
        self.follow_robots_txt = True  # Should we pay attention to robots file?
        self._st = []  # FIXME: this is temporary, for debugging thor

    def __getstate__(self):
        state = self.__dict__.copy()
        del state['exchange']
        del state['status_cb']
        del state['done_cb']
        return state

    def add_task(self, task, *args):
        "Remember that we've started a task."
        self.outstanding_tasks += 1
        self._st.append('add_task(%s)' % str(task))
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        "Note that we've finished a task, and see if we're done."
        self.outstanding_tasks -= 1
        self._st.append('finish_task()')
        assert self.outstanding_tasks >= 0, self._st
        if self.outstanding_tasks == 0:
            if self.done_cb:
                self.done_cb()
                self.done_cb = None
            # clean up potentially cyclic references
            self.status_cb = None

    def done(self):
        "Callback for when the response is complete and analysed."
        raise NotImplementedError

    def preflight(self):
        """
        Callback to check to see if we should bother running.
        Return True if so; False if not.
        """
        return True

    def fetch_robots_txt(self, url, cb, network=True):
        """
        Fetch the robots.txt URL and then feed the response to cb.
        If the status code is not 200, send a blank doc back.

        If network is False, we won't use the network, will return the result
        immediately if cached, and will assume it's OK if we don't have a
        cached file.
        """
        origin = url_to_origin(self.request.uri)
        if origin is None:
            cb("")
            return ""
        origin_hash = hashlib.sha1(origin).hexdigest()

        if self.robot_files.has_key(origin):
            # FIXME: freshness lifetime
            cb(self.robot_files[origin])
            return self.robot_files[origin]

        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                cb(cached_robots_txt)
                return cached_robots_txt

        if not network:
            cb("")
            return ""

        if self.robot_lookups.has_key(origin):
            self.robot_lookups[origin].append(cb)
        else:
            self.robot_lookups[origin] = [cb]
            exchange = self.client.exchange()

            @thor.on(exchange)
            def response_start(status, phrase, headers):
                exchange.status = status
                exchange.res_body = ""

            @thor.on(exchange)
            def response_body(chunk):
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers):
                if not exchange.status.startswith("2"):
                    robots_txt = ""
                else:
                    robots_txt = exchange.res_body
                self.robot_files[origin] = robots_txt
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, 60 * 30)
                for _cb in self.robot_lookups[origin]:
                    _cb(robots_txt)
                del self.robot_lookups[origin]

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start("GET", robots_url, [('User-Agent', UA_STRING)])
            exchange.request_done([])

    def run(self, done_cb=None):
        """
        Make an asynchronous HTTP request to uri, calling status_cb as it's
        updated and done_cb when it's done. Reason is used to explain what
        the request is in the status callback.
        """
        self.outstanding_tasks += 1
        self._st.append('run(%s)' % str(done_cb))
        self.done_cb = done_cb
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self.finish_task()
            return
        if self.follow_robots_txt:
            self.fetch_robots_txt(self.request.uri, self.run_continue)
        else:
            self.run_continue("")

    def run_continue(self, robots_txt):
        """
        Continue after getting the robots file.
        TODO: refactor callback style into events.
        """
        if robots_txt == "":  # empty or non-200
            pass
        else:
            checker = RobotFileParser()
            checker.parse(
                robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
            if not checker.can_fetch(UA_STRING, self.request.uri):
                self.response.http_error = RobotsTxtError()
                self.finish_task()
                return  # TODO: show error?

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append((u"User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (self.request.uri, self.name))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs)
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_start(self, status, phrase, res_headers):
        "Process the response start-line and headers."
        self._st.append('_response_start(%s, %s)' % (status, phrase))
        self.response.start_time = thor.time()
        self.response.version = self.exchange.res_version
        self.response.status_code = status.decode('iso-8859-1', 'replace')
        self.response.status_phrase = phrase.decode('iso-8859-1', 'replace')
        self.response.set_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk):
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers):
        "Finish analysing the response, handling any parse errors."
        self._st.append('_response_done()')
        self.response.complete_time = thor.time()
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        if self.status_cb and self.name:
            self.status_cb("fetched %s (%s)" % (self.request.uri, self.name))
        self.done()
        self.finish_task()

    def _response_error(self, error):
        "Handle an error encountered while fetching the response."
        self._st.append('_response_error(%s)' % (str(error)))
        self.response.complete_time = thor.time()
        self.response.http_error = error
        if isinstance(error, httperr.BodyForbiddenError):
            self.add_note('header-none', rs.BODY_NOT_ALLOWED)
        # elif isinstance(error, httperr.ExtraDataErr):
        #     res.payload_len += len(err.get('detail', ''))
        elif isinstance(error, httperr.ChunkError):
            err_msg = error.detail[:20] or ""
            self.add_note('header-transfer-encoding', rs.BAD_CHUNK,
                          chunk_sample=err_msg.encode('string_escape'))
        self.done()
        self.finish_task()
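# run_continue() above feeds the previously fetched robots.txt body into the
# standard-library RobotFileParser. Below is a standalone sketch of that check
# using the Python 3 stdlib; the user agent string and URLs are placeholders.

from urllib.robotparser import RobotFileParser

robots_txt = "User-agent: *\nDisallow: /private/\n"
checker = RobotFileParser()
checker.parse(robots_txt.splitlines())

print(checker.can_fetch("RED/1.0", "http://example.com/index.html"))  # True
print(checker.can_fetch("RED/1.0", "http://example.com/private/x"))   # False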
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    If provided, 'name' indicates the type of the request, and is used to
    help set notes and status events appropriately.
    """
    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    robot_fetcher = RobotFetcher()

    def __init__(self) -> None:
        thor.events.EventEmitter.__init__(self)
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.follow_robots_txt = True  # Should we pay attention to robots file?
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        state = thor.events.EventEmitter.__getstate__(self)
        del state['exchange']
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if 'response' not in kw:
            kw['response'] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True if so;
        False if not. Can be overridden.
        """
        return True

    def set_request(self, iri: str, method: str = "GET",
                    req_hdrs: StrHeaderListType = None,
                    req_body: bytes = None) -> None:
        """
        Set the resource's request. All values are strings.
        """
        self.request.method = method
        self.response.is_head_response = (method == "HEAD")  # type: ignore
        self.request.set_iri(iri)
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore  # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done. Reason is used to explain
        what the request is in the status callback.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return
        if self.follow_robots_txt:
            self.robot_fetcher.once("robot-%s" % self.request.uri, self.run_continue)
            self.robot_fetcher.check_robots(self.request.uri)
        else:
            self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return
        self.fetch_started = True
        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_nonfinal', self._response_nonfinal)
        self.exchange.once('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.once('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [(k.encode('ascii'), v.encode('ascii'))
                    for (k, v) in self.request.headers]
        self.exchange.request_start(
            self.request.method.encode('ascii'),
            self.request.uri.encode('ascii'),
            req_hdrs)
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes,
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("status", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        "Handle an error encountered while fetching the response."
        self.emit("status", "fetch error %s (%s) - %s" % (
            self.request.uri, self.check_name, error.desc))
        err_sample = error.detail[:40] or ""
        if error.client_recoverable:
            pass  # we'll get to this later.
        elif isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note('body', BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note('body', EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note('header-transfer-encoding', BAD_CHUNK, chunk_sample=err_sample)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        if not self.fetch_done:
            self.fetch_done = True
            self.emit("fetch_done")
def checkCaching(response: HttpResponse, request: HttpRequest = None) -> None:
    "Examine HTTP caching characteristics."

    # get header values
    lm_hdr = response.parsed_headers.get('last-modified', None)
    date_hdr = response.parsed_headers.get('date', None)
    expires_hdr = response.parsed_headers.get('expires', None)
    etag_hdr = response.parsed_headers.get('etag', None)
    age_hdr = response.parsed_headers.get('age', None)
    cc_set = response.parsed_headers.get('cache-control', [])
    cc_list = [k for (k, v) in cc_set]
    cc_dict = dict(cc_set)
    cc_keys = list(cc_dict.keys())

    # Last-Modified
    if lm_hdr:
        serv_date = date_hdr or response.start_time
        if lm_hdr > serv_date:
            response.add_note('header-last-modified', LM_FUTURE)
        else:
            response.add_note('header-last-modified', LM_PRESENT,
                              last_modified_string=relative_time(lm_hdr, serv_date))

    # known Cache-Control directives that don't allow duplicates
    known_cc = ["max-age", "no-store", "s-maxage", "public", "private",
                "pre-check", "post-check", "stale-while-revalidate", "stale-if-error"]

    # check for mis-capitalised directives /
    # assure there aren't any dup directives with different values
    for cc in cc_keys:
        if cc.lower() in known_cc and cc != cc.lower():
            response.add_note('header-cache-control', CC_MISCAP,
                              cc_lower=cc.lower(), cc=cc)
        if cc in known_cc and cc_list.count(cc) > 1:
            response.add_note('header-cache-control', CC_DUP, cc=cc)

    # Who can store this?
    if request and request.method not in cacheable_methods:
        response.store_shared = response.store_private = False
        request.add_note('method', METHOD_UNCACHEABLE, method=request.method)
        return  # bail; nothing else to see here
    elif 'no-store' in cc_keys:
        response.store_shared = response.store_private = False
        response.add_note('header-cache-control', NO_STORE)
        return  # bail; nothing else to see here
    elif 'private' in cc_keys:
        response.store_shared = False
        response.store_private = True
        response.add_note('header-cache-control', PRIVATE_CC)
    elif request and 'authorization' in [k.lower() for k, v in request.headers] \
            and 'public' not in cc_keys:
        response.store_shared = False
        response.store_private = True
        response.add_note('header-cache-control', PRIVATE_AUTH)
    else:
        response.store_shared = response.store_private = True
        response.add_note('header-cache-control', STOREABLE)

    # no-cache?
    if 'no-cache' in cc_keys:
        if lm_hdr is None and etag_hdr is None:
            response.add_note('header-cache-control', NO_CACHE_NO_VALIDATOR)
        else:
            response.add_note('header-cache-control', NO_CACHE)
        return

    # pre-check / post-check
    if 'pre-check' in cc_keys or 'post-check' in cc_keys:
        if 'pre-check' not in cc_keys or 'post-check' not in cc_keys:
            response.add_note('header-cache-control', CHECK_SINGLE)
        else:
            pre_check = post_check = None
            try:
                pre_check = int(cc_dict['pre-check'])
                post_check = int(cc_dict['post-check'])
            except ValueError:
                response.add_note('header-cache-control', CHECK_NOT_INTEGER)
            if pre_check is not None and post_check is not None:
                if pre_check == 0 and post_check == 0:
                    response.add_note('header-cache-control', CHECK_ALL_ZERO)
                elif post_check > pre_check:
                    response.add_note('header-cache-control', CHECK_POST_BIGGER)
                    post_check = pre_check
                elif post_check == 0:
                    response.add_note('header-cache-control', CHECK_POST_ZERO)
                else:
                    response.add_note('header-cache-control', CHECK_POST_PRE,
                                      pre_check=pre_check, post_check=post_check)

    # vary?
    vary = response.parsed_headers.get('vary', set())
    if "*" in vary:
        response.add_note('header-vary', VARY_ASTERISK)
        return  # bail; nothing else to see here
    elif len(vary) > 3:
        response.add_note('header-vary', VARY_COMPLEX, vary_count=f_num(len(vary)))
    else:
        if "user-agent" in vary:
            response.add_note('header-vary', VARY_USER_AGENT)
        if "host" in vary:
            response.add_note('header-vary', VARY_HOST)
        # TODO: enumerate the axes in a message

    # calculate age
    response.age = age_hdr or 0
    age_str = relative_time(response.age, 0, 0)
    if date_hdr and date_hdr > 0:
        apparent_age = max(0, int(response.start_time - date_hdr))
    else:
        apparent_age = 0
    current_age = max(apparent_age, response.age)
    current_age_str = relative_time(current_age, 0, 0)
    if response.age >= 1:
        response.add_note('header-age header-date', CURRENT_AGE, age=age_str)

    # Check for clock skew and dateless origin server.
    if not date_hdr:
        response.add_note('', DATE_CLOCKLESS)
        if expires_hdr or lm_hdr:
            response.add_note('header-expires header-last-modified',
                              DATE_CLOCKLESS_BAD_HDR)
    else:
        skew = date_hdr - response.start_time + (response.age)
        if response.age > max_clock_skew and (current_age - skew) < max_clock_skew:
            response.add_note('header-date header-age', AGE_PENALTY)
        elif abs(skew) > max_clock_skew:
            response.add_note('header-date', DATE_INCORRECT,
                              clock_skew_string=relative_time(skew, 0, 2))
        else:
            response.add_note('header-date', DATE_CORRECT)

    # calculate freshness
    freshness_lifetime = 0
    has_explicit_freshness = False
    has_cc_freshness = False
    freshness_hdrs = ['header-date']
    if 's-maxage' in cc_keys:
        freshness_lifetime = cc_dict['s-maxage']
        freshness_hdrs.append('header-cache-control')
        has_explicit_freshness = True
        has_cc_freshness = True
    elif 'max-age' in cc_keys:
        freshness_lifetime = cc_dict['max-age']
        freshness_hdrs.append('header-cache-control')
        has_explicit_freshness = True
        has_cc_freshness = True
    elif 'expires' in response.parsed_headers:
        # An invalid Expires header means it's automatically stale
        has_explicit_freshness = True
        freshness_hdrs.append('header-expires')
        freshness_lifetime = (expires_hdr or 0) - (date_hdr or response.start_time)

    freshness_left = freshness_lifetime - current_age
    freshness_left_str = relative_time(abs(int(freshness_left)), 0, 0)
    freshness_lifetime_str = relative_time(int(freshness_lifetime), 0, 0)
    response.freshness_lifetime = freshness_lifetime
    fresh = freshness_left > 0

    if has_explicit_freshness:
        if fresh:
            response.add_note(" ".join(freshness_hdrs), FRESHNESS_FRESH,
                              freshness_lifetime=freshness_lifetime_str,
                              freshness_left=freshness_left_str,
                              current_age=current_age_str)
            # FIXME: response.age = None
        elif has_cc_freshness and response.age > freshness_lifetime:
            response.add_note(" ".join(freshness_hdrs), FRESHNESS_STALE_CACHE,
                              freshness_lifetime=freshness_lifetime_str,
                              freshness_left=freshness_left_str,
                              current_age=current_age_str)
        else:
            response.add_note(" ".join(freshness_hdrs), FRESHNESS_STALE_ALREADY,
                              freshness_lifetime=freshness_lifetime_str,
                              freshness_left=freshness_left_str,
                              current_age=current_age_str)
    # can heuristic freshness be used?
    elif response.status_code in heuristic_cacheable_status:
        response.add_note('header-last-modified', FRESHNESS_HEURISTIC)
    else:
        response.add_note('', FRESHNESS_NONE)

    # can stale responses be served?
    if 'must-revalidate' in cc_keys:
        if fresh:
            response.add_note('header-cache-control', FRESH_MUST_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note('header-cache-control', STALE_MUST_REVALIDATE)
    elif 'proxy-revalidate' in cc_keys or 's-maxage' in cc_keys:
        if fresh:
            response.add_note('header-cache-control', FRESH_PROXY_REVALIDATE)
        elif has_explicit_freshness:
            response.add_note('header-cache-control', STALE_PROXY_REVALIDATE)
    else:
        if fresh:
            response.add_note('header-cache-control', FRESH_SERVABLE)
        elif has_explicit_freshness:
            response.add_note('header-cache-control', STALE_SERVABLE)

    # public?
    if 'public' in cc_keys:
        # TODO: check for authentication in request
        response.add_note('header-cache-control', PUBLIC)