def request(self, request: Request) -> Response:
    """Execute a domain-throttled version of mediawords.util.web.user_agent.UserAgent.request().

    Before executing the request, check whether a request has been made for this domain within
    the last self.domain_timeout seconds. If so, raise McThrottledDomainException. Otherwise,
    mark the time for this domain request in a postgres table and then execute
    UserAgent.request().

    Throttling is applied only to the first request made through this object, so redirects and
    other follow-up requests succeed; to ensure proper throttling, create a new object for each
    top-level request. Accelerated domains and shortened links (e.g. http://bit.ly/EFGDfrTg)
    get their timeout divided by _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
    """
    if not self._use_throttling:
        log.debug("domain lock obtained for %s: skipped" % str(request.url()))
    else:
        request_domain = mediawords.util.url.get_url_distinctive_domain(request.url())

        effective_timeout = self.domain_timeout
        if effective_timeout > 1 and (is_shortened_url(request.url()) or request_domain in _ACCELERATED_DOMAINS):
            effective_timeout = max(1, int(self.domain_timeout / _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

        # this postgres function returns true if we are allowed to make the request and false otherwise. this
        # function does not use a table lock, so some extra requests might sneak through, but that's better
        # than dealing with a lock. we use a postgres function to make the race condition as rare as possible.
        lock_acquired = self.db.query(
            "select get_domain_web_requests_lock(%s, %s)",
            (request_domain, effective_timeout),
        ).flat()[0]

        log.debug("domain lock obtained for %s: %s" % (str(request.url()), str(lock_acquired)))

        if not lock_acquired:
            raise McThrottledDomainException("domain " + str(request_domain) + " is locked.")

    # Only the very first request through this object is throttled
    self._use_throttling = False

    return super().request(request)
def __blacklist_request_if_needed(request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    # Pattern is optional in the configuration
    blacklist_regex = None
    if 'blacklist_url_pattern' in config['mediawords']:
        blacklist_regex = config['mediawords']['blacklist_url_pattern']

    if blacklist_regex:
        match = re.search(pattern=blacklist_regex, string=url, flags=re.IGNORECASE | re.UNICODE)
        if match:
            # Redirect the request at a placeholder host so the original URL is never fetched
            request.set_url("http://blacklistedsite.localhost/%s" % url)

    return request
def __blacklist_request_if_needed(self, request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    pattern = self._user_agent_config.blacklist_url_pattern()
    if pattern:
        # MC_REWRITE_TO_PYTHON: a string might be coming from Perl
        if isinstance(pattern, bytes):
            pattern = decode_object_from_bytes_if_needed(pattern)

        # Plain strings get compiled here; anything else is assumed to be usable by re.search() as-is
        if isinstance(pattern, str):
            pattern = re.compile(pattern, flags=re.IGNORECASE | re.UNICODE)

        match = re.search(pattern=pattern, string=url)
        if match is not None:
            # Redirect the request at an unroutable host so the original URL is never fetched
            request.set_url("http://0.0.0.1/%s" % url)

    return request
def request(self, request: Request) -> Response:
    """Domain-throttled wrapper around mediawords.util.web.user_agent.UserAgent.request().

    If a request for this URL's domain was already made within the last self.domain_timeout
    seconds, raise McThrottledDomainException instead of fetching. Otherwise record the time of
    this domain request in a postgres table and delegate to UserAgent.request().

    Only the first request through this object is throttled so that redirects and follow-up
    requests succeed; create a new object for each top-level request. Accelerated domains and
    shortened links (e.g. http://bit.ly/EFGDfrTg) have their timeout divided by
    _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
    """
    if self._use_throttling:
        domain = mediawords.util.url.get_url_distinctive_domain(request.url())

        timeout = self.domain_timeout
        accelerated = timeout > 1 and (is_shortened_url(request.url()) or domain in _ACCELERATED_DOMAINS)
        if accelerated:
            timeout = max(1, int(self.domain_timeout / _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

        # this postgres function returns true if we are allowed to make the request and false otherwise. this
        # function does not use a table lock, so some extra requests might sneak through, but that's better
        # than dealing with a lock. we use a postgres function to make the race condition as rare as possible.
        have_lock = self.db.query("select get_domain_web_requests_lock(%s, %s)", (domain, timeout)).flat()[0]

        log.debug("domain lock obtained for %s: %s" % (str(request.url()), str(have_lock)))

        if not have_lock:
            raise McThrottledDomainException("domain " + str(domain) + " is locked.")

    else:
        log.debug("domain lock obtained for %s: skipped" % str(request.url()))

    # Throttle only the first request made through this object
    self._use_throttling = False

    return super().request(request)
def __prepare_request(self, request: Request) -> requests.PreparedRequest:
    """Create PreparedRequest from UserAgent's Request. Raises if one or more parameters are invalid."""
    method = request.method()
    if method is None:
        raise McRequestException("Request's method is None.")

    url = request.url()
    if url is None:
        raise McRequestException("Request's URL is None.")

    headers = request.headers()
    if headers is None:
        raise McRequestException("Request's headers is None.")

    username = request.auth_username()
    password = request.auth_password()

    # Credentials must be either both set or both unset
    if (username is None) != (password is None):
        raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

    basic_auth = None
    if username is not None and password is not None:
        # ...and, when set, either both empty or both non-empty
        if (len(username) == 0) != (len(password) == 0):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")
        basic_auth = HTTPBasicAuth(username, password)

    body = request.content()

    try:
        prepared = self.__session.prepare_request(
            requests.Request(
                method=method,
                url=url,
                data=body,
                headers=headers,
                auth=basic_auth,
            )
        )
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    return prepared
def __log_request(request: Request) -> None:
    """Log an HTTP request to the shared http_request.log file.

    Appends a "<timestamp> <URL>" line to <data_dir>/logs/http_request.log, serializing
    writers from multiple processes with an advisory flock() and chmodding the file so that
    other users' processes (web service, workers, ...) can write to it too.

    :param request: request to log; must be not None with a non-empty URL
    :raises McRequestException: if the request or its URL is None / empty
    """
    # FIXME use Python's logging facilities
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()
    http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

    # Explicit encoding instead of the platform default, for URLs with non-ASCII characters
    # (matches the utf-8 variant of this helper elsewhere in the file)
    with open(http_request_log_path, mode='a', encoding='utf-8') as f:
        # Spin with a non-blocking flock() until we hold the exclusive lock
        while True:
            try:
                fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except IOError as e:
                # raise on unrelated IOErrors
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    log.warning("Waiting for HTTP request log lock...")
                    time.sleep(0.1)

        f.write("%s %s\n" % (sql_now(), url,))
        # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

        fcntl.flock(f, fcntl.LOCK_UN)

    # Processes from various users (web service, workers, ...) will want to write to the same file
    try:
        os.chmod(http_request_log_path, 0o666)
    except PermissionError as ex:
        # Web server process might attempt at chmodding the file without the appropriate permissions
        log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
def __log_request(request: Request) -> None:
    """Write a debug log entry for an outgoing HTTP request."""
    # FIXME use Python's logging facilities
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if not len(url):
        raise McRequestException("URL is empty.")

    log.debug("HTTP request: %s %s\n" % (sql_now(), url,))
def __prepare_request(self, request: Request) -> requests.PreparedRequest:
    """Build a requests.PreparedRequest out of UserAgent's Request.

    Raises McRequestException if one or more of the request's parameters are invalid.
    """
    request_method = request.method()
    if request_method is None:
        raise McRequestException("Request's method is None.")

    request_url = request.url()
    if request_url is None:
        raise McRequestException("Request's URL is None.")

    request_headers = request.headers()
    if request_headers is None:
        raise McRequestException("Request's headers is None.")

    user = request.auth_username()
    secret = request.auth_password()

    # HTTP auth credentials must be either both set or both unset...
    if ((user is None and secret is not None) or (user is not None and secret is None)):
        raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

    http_auth = None
    if not (user is None or secret is None):
        # ...and, when set, either both empty or both non-empty
        user_empty = (len(user) == 0)
        secret_empty = (len(secret) == 0)
        if (user_empty and not secret_empty) or (not user_empty and secret_empty):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")
        http_auth = HTTPBasicAuth(user, secret)

    payload = request.content()

    try:
        unprepared = requests.Request(
            method=request_method,
            url=request_url,
            data=payload,
            headers=request_headers,
            auth=http_auth,
        )
        prepared = self.__session.prepare_request(unprepared)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    return prepared
def __log_request(request: Request) -> None:
    """Append the HTTP request to the shared http_request.log file."""
    # FIXME use Python's logging facilities
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()
    log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

    with open(log_path, encoding='utf-8', mode='a') as log_file:
        # Spin with a non-blocking flock() until we hold the exclusive lock
        locked = False
        while not locked:
            try:
                fcntl.flock(log_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
                locked = True
            except IOError as e:
                # raise on unrelated IOErrors
                if e.errno != errno.EAGAIN:
                    raise
                log.warning("Waiting for HTTP request log lock...")
                time.sleep(0.1)

        log_file.write("%s %s\n" % (sql_now(), url,))
        # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

        fcntl.flock(log_file, fcntl.LOCK_UN)

    # Processes from various users (web service, workers, ...) will want to write to the same file
    try:
        os.chmod(log_path, 0o666)
    except PermissionError as ex:
        # Web server process might attempt at chmodding the file without the appropriate permissions
        log.debug("Failed to chmod %s: %s" % (log_path, str(ex),))
def request(self, request: Request) -> Response:
    """Execute a domain-throttled version of mediawords.util.web.user_agent.UserAgent.request().

    Before executing the request, check whether a request has been made for this domain within
    the last self.domain_timeout seconds. If so, raise McThrottledUserAgentTimeoutException.
    Otherwise, mark the time for this domain request in a postgres table and then execute
    UserAgent.request().
    """
    target_domain = mediawords.util.url.get_url_distinctive_domain(request.url())

    # this postgres function returns true if we are allowed to make the request and false otherwise.
    # this function does not use a table lock, so some extra requests might sneak through, but that's better
    # than dealing with a lock. we use a postgres function to make the race condition as rare as possible.
    allowed = self.db.query(
        "select get_domain_web_requests_lock(%s, %s)",
        (target_domain, self.domain_timeout),
    ).flat()[0]

    if not allowed:
        raise McThrottledUserAgentTimeoutException("domain " + str(target_domain) + " is locked.")

    return super(ThrottledUserAgent, self).request(request)
def __blacklist_request_if_needed(request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    # Pattern is optional in the configuration
    blacklist_regex = None
    if 'blacklist_url_pattern' in config['mediawords']:
        blacklist_regex = config['mediawords']['blacklist_url_pattern']

    if blacklist_regex is not None and len(blacklist_regex) > 0:
        blacklisted = re.search(pattern=blacklist_regex, string=url, flags=re.IGNORECASE | re.UNICODE) is not None
        if blacklisted:
            # Redirect the request at an unroutable host so the original URL is never fetched
            request.set_url("http://0.0.0.1/%s" % url)

    return request
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size, callbacks,
    blacklisted URLs etc.

    :param request: request to execute; must be not None and carry a method, URL and headers
    :raises McRequestException: if the request parameters are invalid, or if no response / response data
        could be produced
    """
    if request is None:
        raise McRequestException("Request is None.")

    # Rewrite blacklisted URLs and record the request before doing any network work
    request = self.__blacklist_request_if_needed(request=request)
    self.__log_request(request=request)

    method = request.method()
    if method is None:
        raise McRequestException("Request's method is None.")
    url = request.url()
    if url is None:
        raise McRequestException("Request's URL is None.")
    headers = request.headers()
    if headers is None:
        raise McRequestException("Request's headers is None.")

    # HTTP auth credentials must be either both set or both unset
    auth_username = request.auth_username()
    auth_password = request.auth_password()
    if ((auth_username is None and auth_password is not None) or
            (auth_username is not None and auth_password is None)):
        raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")
    auth = None
    if auth_username is not None and auth_password is not None:
        # ...and, when set, either both empty or both non-empty
        if ((len(auth_username) == 0 and len(auth_password) > 0) or
                (len(auth_username) > 0 and len(auth_password) == 0)):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")
        auth = HTTPBasicAuth(auth_username, auth_password)

    data = request.content()

    try:
        requests_request = requests.Request(
            method=method,
            url=url,
            data=data,
            headers=headers,
            auth=auth,
        )
        requests_prepared_request = self.__session.prepare_request(requests_request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    error_is_client_side = False

    try:
        requests_response = self.__session.send(
            request=requests_prepared_request,
            timeout=self.timeout(),
            # To be able to enforce max_size
            stream=True,
        )

    except requests.TooManyRedirects as ex:
        # On too many redirects, return the last fetched page (just like LWP::UserAgent does)
        log.warning("Exceeded max. redirects for URL %s" % request.url())
        requests_response = ex.response
        response_data = str(ex)

    except requests.Timeout as ex:
        log.warning("Timeout for URL %s" % request.url())

        # We treat timeouts as client-side errors too because we can retry on them
        error_is_client_side = True

        # Synthesize a 408 response since there is no real one to return
        requests_response = requests.Response()
        requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
        requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
        requests_response.request = requests_prepared_request
        requests_response.history = []

        response_data = str(ex)

    except Exception as ex:
        # Client-side error
        log.warning("Client-side error while processing request %s: %s" % (str(request), str(ex),))

        error_is_client_side = True

        # Synthesize a 400 response since there is no real one to return
        requests_response = requests.Response()
        requests_response.status_code = HTTPStatus.BAD_REQUEST.value
        requests_response.reason = "Client-side error"
        requests_response.request = requests_prepared_request

        # Previous request / response chain is not built for client-side errored requests
        requests_response.history = []

        requests_response.headers = {
            # LWP::UserAgent compatibility
            'Client-Warning': 'Client-side error',
        }

        response_data = str(ex)

    else:
        try:
            max_size = self.max_size()

            response_data = ""
            read_response_data = True

            # First line of defense against oversized responses: the declared Content-Length header
            if max_size is not None:
                content_length = requests_response.headers.get('Content-Length', None)
                if content_length is not None:
                    content_length = int(content_length)
                    if content_length > max_size:
                        log.warning("Content-Length exceeds %d for URL %s" % (max_size, url,))

                        # Release the response to return connection back to the pool
                        # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                        requests_response.close()

                        read_response_data = False

            if read_response_data:

                # Pick a text encoding before decoding the streamed body
                if requests_response.encoding is None:
                    if requests_response.apparent_encoding is None:
                        # If encoding is not in HTTP headers nor can be determined from content itself, assume
                        # that it's UTF-8
                        requests_response.encoding = 'UTF-8'
                    else:
                        # Test the encoding guesser's opinion, just like browsers do
                        requests_response.encoding = requests_response.apparent_encoding
                else:
                    # If "Content-Type" HTTP header contains a string "text" and doesn't have "charset"
                    # property, "requests" falls back to setting the encoding to ISO-8859-1, which is probably
                    # not right (encoding might have been defined in the HTML content itself via <meta> tag),
                    # so we use the "apparent encoding" instead
                    if requests_response.encoding.lower() == 'iso-8859-1':
                        if requests_response.apparent_encoding is not None:
                            requests_response.encoding = requests_response.apparent_encoding

                # Some pages report some funky encoding; in that case, fallback to UTF-8
                try:
                    codecs.lookup(requests_response.encoding)
                except LookupError:
                    log.warning("Invalid encoding %s for URL %s" % (requests_response.encoding,
                                                                    requests_response.url))
                    requests_response.encoding = 'UTF-8'

                # Stream the body, enforcing max_size on the actual bytes received
                response_data_size = 0
                for chunk in requests_response.iter_content(chunk_size=None, decode_unicode=True):
                    response_data += chunk
                    response_data_size += len(chunk)

                    # Content-Length might be missing / lying, so we measure size while fetching the data too
                    if max_size is not None:
                        if response_data_size > max_size:
                            log.warning("Data size exceeds %d for URL %s" % (max_size, url,))

                            # Release the response to return connection back to the pool
                            # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                            requests_response.close()

                            break

        except requests.RequestException as ex:
            log.warning("Error reading data for URL %s" % request.url())

            # We treat timeouts as client-side errors too because we can retry on them
            error_is_client_side = True

            # Synthesize a 408 response since the real one could not be read
            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            requests_response.request = requests_prepared_request
            requests_response.history = []

            response_data = str(ex)

    if requests_response is None:
        raise McRequestException("Response from 'requests' is None.")
    if response_data is None:
        # Probably a programming error
        raise McRequestException("Response data is None.")

    response = Response.from_requests_response(
        requests_response=requests_response,
        data=response_data,
    )

    if error_is_client_side:
        response.set_error_is_client_side(error_is_client_side=error_is_client_side)

    # Build the previous request / response chain from the redirects
    current_response = response
    for previous_rq_response in reversed(requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)
        previous_response = Response.from_requests_response(requests_response=previous_rq_response)
        previous_response.set_request(request=previous_response_request)
        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=requests_response.request)
    response.set_request(response_request)

    return response
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size, callbacks,
    blacklisted URLs etc.

    :param request: request to execute; must be not None
    :raises McRequestException: if the request could not be prepared / executed, or no response / response
        data was produced
    """
    if request is None:
        raise McRequestException("Request is None.")

    # Rewrite blacklisted URLs and record the request before doing any network work
    request = self.__blacklist_request_if_needed(request=request)
    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        raise McRequestException("Unable to execute request %s: %s" % (str(requests_prepared_request),
                                                                       str(ex),))

    try:
        response_data = self.__read_response_data(user_agent_response.requests_response)
    except Exception as ex:
        log.warning("Error reading data for URL %s" % request.url())

        # Reading the body failed: replace the response with a synthesized 408 one
        user_agent_response.requests_response = requests.Response()
        user_agent_response.requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
        user_agent_response.requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
        user_agent_response.requests_response.request = requests_prepared_request
        user_agent_response.requests_response.history = []

        # We treat timeouts as client-side errors too because we can retry on them
        user_agent_response.error_is_client_side = True

        response_data = str(ex)

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")
    if response_data is None:
        # Probably a programming error
        raise McRequestException("Response data is None.")

    response = Response.from_requests_response(
        requests_response=user_agent_response.requests_response,
        data=response_data,
    )

    if user_agent_response.error_is_client_side is True:
        response.set_error_is_client_side(error_is_client_side=user_agent_response.error_is_client_side)

    # Build the previous request / response chain from the redirects
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        #     AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            previous_rq_response_data = previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            previous_rq_response_data = ''

        previous_response = Response.from_requests_response(requests_response=previous_rq_response,
                                                            data=previous_rq_response_data)
        previous_response.set_request(request=previous_response_request)
        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request)
    response.set_request(response_request)

    return response