def get_seeded_content(db: DatabaseHandler, topic_fetch_url: dict) -> typing.Optional[Response]:
    """Return content for this url and topic in topic_seed_urls.

    BUG FIX: the return annotation previously claimed ``typing.Optional[str]``,
    but the function builds and returns a dummy ``Response`` object (or None).

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_url dict from db

    Returns:
    dummy Response object carrying the seeded content, or None if no seeded
    content exists for this topic / url pair
    """
    r = db.query(
        "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null",
        {
            'a': topic_fetch_url['topics_id'],
            'b': topic_fetch_url['url'],
        }).flat()

    # No seeded content stored for this topic / url combination
    if len(r) == 0:
        return None

    # Fake a successful fetch so downstream code can treat seeded content
    # exactly like a real HTTP response
    response = Response(code=200, message='OK', headers={}, data=r[0])
    response.set_request(Request('GET', topic_fetch_url['url']))

    return response
def __get_follow_http_html_redirects_redirects_exhausted(self, response_: Response) -> Union[Response, None]:
    """Handle the case where the HTTP redirect budget was exhausted.

    Walks the chain of previous responses looking for a URL whose URL-encoded form
    appears inside another redirect URL (a common paywall pattern); if found, that
    earlier response is returned as the "real" one.

    :param response_: final response of an exhausted redirect chain; must not be None
    :return: the previous Response matched by the paywall heuristic, or None if no match
    :raises McGetFollowHTTPHTMLRedirectsException: if response_ is None
    """
    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    # If one of the URLs that we've been redirected to contains another encoded URL, assume
    # that we're hitting a paywall and the URLencoded URL is the right one
    urls_redirected_to = []

    # Bounded by max_redirect() + 1 so a cyclic previous() chain can't loop forever
    for x in range(self.max_redirect() + 1):
        previous = response_.previous()
        # NOTE(review): `response_` is never advanced along the previous() chain inside this
        # loop, so every iteration re-reads the same `previous` response — verify whether a
        # `response_ = previous` step was lost here.
        if previous is None:
            break

        url_redirected_to = previous.request().url()
        encoded_url_redirected_to = quote(url_redirected_to)

        # Does the encoded form of this URL appear inside any URL seen so far?
        for redir_url in urls_redirected_to:
            if re.search(pattern=re.escape(encoded_url_redirected_to),
                         string=redir_url,
                         flags=re.IGNORECASE | re.UNICODE):
                log.debug("""
                    Encoded URL %(encoded_url_redirected_to)s is a substring of another URL %(matched_url)s, so I'll
                    assume that %(url_redirected_to)s is the correct one.
                """ % {
                    'encoded_url_redirected_to': encoded_url_redirected_to,
                    'matched_url': redir_url,
                    'url_redirected_to': url_redirected_to,
                })
                return previous

        urls_redirected_to.append(url_redirected_to)

    # Return the original URL (unless we find a URL being a substring of another URL, see below)
    return None
def fetch_url(
        db: DatabaseHandler,
        url: str,
        network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
        network_down_port: str = DEFAULT_NETWORK_DOWN_PORT,
        network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
        domain_timeout: typing.Optional[int] = None) -> typing.Optional[Response]:
    """Fetch a url and return the content.

    If fetching the url results in a 400 error, check whether the network_down_host is accessible.
    If so, return the errored response. Otherwise, wait network_down_timeout seconds and try again.

    This function catches McGetException and returns a dummy 400 Response object.

    BUG FIX: the return annotation previously claimed ``typing.Optional[Request]``,
    but the function returns a ``Response`` (as the docstring already stated).

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object
    """
    while True:
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)

        try:
            response = ua.get_follow_http_html_redirects(url)
        except mediawords.util.web.user_agent.McGetFollowHTTPHTMLRedirectsException:
            # Bad / non-HTTP url — fabricate a client-side-error response
            response = Response(400, 'bad url', {}, 'not a http url')

        if response.is_success():
            return response

        # A 400 combined with an unreachable canary host suggests *our* network is
        # down rather than the remote site being broken — wait and retry
        if response.code() == 400 and _network_is_down(network_down_host, network_down_port):
            log.warning(
                "Response failed with %s and network is down. Waiting to retry ..." % (url,))
            time.sleep(network_down_timeout)
        else:
            return response
def from_useragent_response(cls, url: str, response: Response):
    """Build an instance of this class out of a UserAgent Response object.

    :param url: URL that was originally requested
    :param response: UserAgent Response to copy status / content fields from
    :return: new instance populated from the response
    """
    # Response might have no request attached (e.g. a fabricated error response)
    source_request = response.request()
    last_requested_url = source_request.url() if source_request else None

    return cls(
        url=url,
        is_success=response.is_success(),
        code=response.code(),
        message=response.message(),
        content=response.decoded_content(),
        last_requested_url=last_requested_url,
    )
def _make_dummy_bypassed_response(url: str) -> Response:
    """Given a url, make and return a response object with that url and empty content."""
    dummy = Response(code=200, message='OK', headers={}, data='')
    dummy.set_request(Request('GET', url))
    return dummy
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size,
    callbacks, blacklisted URLs etc.

    :param request: Request to execute; must not be None
    :return: Response with its previous() chain populated from any HTTP redirects
    :raises McRequestException: if the request is None, cannot be prepared / executed,
        or the underlying 'requests' response is None
    """
    if request is None:
        raise McRequestException("Request is None.")

    # May swap the request out for a dummy one if its URL is blacklisted
    request = self.__blacklist_request_if_needed(request=request)

    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        raise McRequestException("Unable to execute request %s: %s" % (str(requests_prepared_request), str(ex),))

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    response = Response(
        requests_response=user_agent_response.requests_response,
        max_size=self.max_size(),
        error_is_client_side=user_agent_response.error_is_client_side,
    )

    # Build the previous request / response chain from the redirects.
    # 'history' is ordered oldest-first, so iterate it reversed to link each
    # hop as the previous() of the one after it.
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request
        )

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        #     AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            previous_rq_response.raw_data = io.StringIO('')

        previous_response = Response(requests_response=previous_rq_response, max_size=self.max_size())
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request
    )
    response.set_request(response_request)

    return response
def __get_follow_http_html_redirects_follow_redirects(self,
                                                      response_: Response,
                                                      meta_redirects_left: int) -> Union[Response, None]:
    """Recursively follow HTML-level redirects (<meta refresh>, archive sites) in a response.

    :param response_: response whose content may contain an HTML redirect; must not be None
    :param meta_redirects_left: remaining HTML-redirect budget, decremented on each hop
    :return: the final Response after all HTML redirects, or None if the request failed
    :raises McGetFollowHTTPHTMLRedirectsException: if response_ is None or the start of a
        redirect chain cannot be located
    """
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    if response_.is_success():
        base_url = get_base_url(response_.request().url())

        # Each extractor returns a Request for the redirect target, or None if the
        # page content does not match that site's redirect pattern
        html_redirect_functions = [
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        ]
        for html_redirect_function in html_redirect_functions:
            request_after_meta_redirect = html_redirect_function(
                content=response_.decoded_content(),
                archive_site_url=base_url,
            )
            if request_after_meta_redirect is not None:
                log.warning(
                    "meta redirect from %s: %s" % (html_redirect_function, request_after_meta_redirect.url()))
                # Only follow the redirect if it actually points somewhere new
                if not urls_are_equal(url1=response_.request().url(), url2=request_after_meta_redirect.url()):
                    log.debug("URL after HTML redirects: %s" % request_after_meta_redirect.url())

                    orig_redirect_response = self.request(request=request_after_meta_redirect)
                    redirect_response = orig_redirect_response

                    # Response might have its previous() already set due to HTTP redirects,
                    # so we have to find the initial response first
                    previous = None
                    for x in range(self.max_redirect() + 1):
                        previous = redirect_response.previous()
                        if previous is None:
                            break
                        redirect_response = previous

                    # Loop exited without finding the chain start within the redirect budget
                    if previous is not None:
                        raise McGetFollowHTTPHTMLRedirectsException(
                            "Can't find the initial redirected response; URL: %s" %
                            request_after_meta_redirect.url()
                        )

                    log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
                        'url': redirect_response.request().url(),
                        'previous_url': response_.request().url(),
                    })
                    # Splice the current response in front of the new redirect chain
                    redirect_response.set_previous(response_)

                    meta_redirects_left = meta_redirects_left - 1

                    # Recurse: the redirect target may itself contain another HTML redirect
                    return self.__get_follow_http_html_redirects(
                        response_=orig_redirect_response,
                        meta_redirects_left=meta_redirects_left,
                    )

        # No <meta /> refresh, the current URL is the final one
        return response_

    else:
        log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))

        # Return the original URL and give up
        return None
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size,
    callbacks, blacklisted URLs etc.

    :param request: Request to execute; must not be None
    :return: Response with its previous() chain populated from any HTTP redirects
    :raises McRequestException: on None / malformed request parameters, preparation
        failure, or a None response / response data
    """
    if request is None:
        raise McRequestException("Request is None.")

    # May swap the request out for a dummy one if its URL is blacklisted
    request = self.__blacklist_request_if_needed(request=request)

    self.__log_request(request=request)

    method = request.method()
    if method is None:
        raise McRequestException("Request's method is None.")

    url = request.url()
    if url is None:
        raise McRequestException("Request's URL is None.")

    headers = request.headers()
    if headers is None:
        raise McRequestException("Request's headers is None.")

    auth_username = request.auth_username()
    auth_password = request.auth_password()
    # Credentials must be supplied either both or not at all
    if ((auth_username is None and auth_password is not None) or
            (auth_username is not None and auth_password is None)):
        raise McRequestException(
            "Either both or none of HTTP authentication credentials must be not None."
        )

    auth = None
    if auth_username is not None and auth_password is not None:
        # Same rule for empty (as opposed to None) credentials
        if ((len(auth_username) == 0 and len(auth_password) > 0) or
                (len(auth_username) > 0 and len(auth_password) == 0)):
            raise McRequestException(
                "Either both or none of HTTP authentication credentials must be not Empty."
            )
        auth = HTTPBasicAuth(auth_username, auth_password)

    data = request.content()

    try:
        requests_request = requests.Request(
            method=method,
            url=url,
            data=data,
            headers=headers,
            auth=auth,
        )
        requests_prepared_request = self.__session.prepare_request(requests_request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (
            str(request),
            str(ex),
        ))

    error_is_client_side = False

    try:
        requests_response = self.__session.send(
            request=requests_prepared_request,
            timeout=self.timeout(),

            # To be able to enforce max_size
            stream=True,
        )

    except requests.TooManyRedirects as ex:
        # On too many redirects, return the last fetched page (just like LWP::UserAgent does)
        log.warning("Exceeded max. redirects for URL %s" % request.url())
        requests_response = ex.response
        response_data = str(ex)

    except requests.Timeout as ex:
        log.warning("Timeout for URL %s" % request.url())

        # We treat timeouts as client-side errors too because we can retry on them
        error_is_client_side = True

        # Fabricate a 408 response since 'requests' gave us nothing to return
        requests_response = requests.Response()
        requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
        requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
        requests_response.request = requests_prepared_request

        requests_response.history = []

        response_data = str(ex)

    except Exception as ex:
        # Client-side error
        log.warning("Client-side error while processing request %s: %s" % (
            str(request),
            str(ex),
        ))

        error_is_client_side = True

        requests_response = requests.Response()
        requests_response.status_code = HTTPStatus.BAD_REQUEST.value
        requests_response.reason = "Client-side error"
        requests_response.request = requests_prepared_request

        # Previous request / response chain is not built for client-side errored requests
        requests_response.history = []

        requests_response.headers = {
            # LWP::UserAgent compatibility
            'Client-Warning': 'Client-side error',
        }

        response_data = str(ex)

    else:
        # Request succeeded at the transport level; now read the body, enforcing max_size
        try:
            max_size = self.max_size()

            response_data = ""
            read_response_data = True

            if max_size is not None:
                content_length = requests_response.headers.get('Content-Length', None)

                if content_length is not None:
                    content_length = int(content_length)

                    if content_length > max_size:
                        log.warning("Content-Length exceeds %d for URL %s" % (
                            max_size,
                            url,
                        ))

                        # Release the response to return connection back to the pool
                        # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                        requests_response.close()

                        read_response_data = False

            if read_response_data:

                if requests_response.encoding is None:

                    if requests_response.apparent_encoding is None:
                        # If encoding is not in HTTP headers nor can be determined from content itself, assume that
                        # it's UTF-8
                        requests_response.encoding = 'UTF-8'

                    else:
                        # Test the encoding guesser's opinion, just like browsers do
                        requests_response.encoding = requests_response.apparent_encoding

                else:

                    # If "Content-Type" HTTP header contains a string "text" and doesn't have "charset" property,
                    # "requests" falls back to setting the encoding to ISO-8859-1, which is probably not right
                    # (encoding might have been defined in the HTML content itself via <meta> tag), so we use the
                    # "apparent encoding" instead
                    if requests_response.encoding.lower() == 'iso-8859-1':
                        if requests_response.apparent_encoding is not None:
                            requests_response.encoding = requests_response.apparent_encoding

                # Some pages report some funky encoding; in that case, fallback to UTF-8
                try:
                    codecs.lookup(requests_response.encoding)
                except LookupError:
                    log.warning("Invalid encoding %s for URL %s" % (
                        requests_response.encoding,
                        requests_response.url,
                    ))
                    requests_response.encoding = 'UTF-8'

                response_data_size = 0
                for chunk in requests_response.iter_content(chunk_size=None, decode_unicode=True):
                    response_data += chunk
                    response_data_size += len(chunk)

                    # Content-Length might be missing / lying, so we measure size while fetching the data too
                    if max_size is not None:
                        if response_data_size > max_size:
                            log.warning("Data size exceeds %d for URL %s" % (
                                max_size,
                                url,
                            ))

                            # Release the response to return connection back to the pool
                            # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                            requests_response.close()

                            break

        except requests.RequestException as ex:
            log.warning("Error reading data for URL %s" % request.url())

            # We treat timeouts as client-side errors too because we can retry on them
            error_is_client_side = True

            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            requests_response.request = requests_prepared_request

            requests_response.history = []

            response_data = str(ex)

    if requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    if response_data is None:
        # Probably a programming error
        raise McRequestException("Response data is None.")

    response = Response.from_requests_response(
        requests_response=requests_response,
        data=response_data,
    )

    if error_is_client_side:
        response.set_error_is_client_side(error_is_client_side=error_is_client_side)

    # Build the previous request / response chain from the redirects
    current_response = response
    for previous_rq_response in reversed(requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        previous_response = Response.from_requests_response(requests_response=previous_rq_response)
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=requests_response.request)
    response.set_request(response_request)

    return response
def __inner_follow_redirects(response_: Response, meta_redirects_left: int) -> Union[Response, None]:
    """Recursively follow HTML-level redirects (<meta refresh>, archive sites) in a response.

    BUG FIX: the recursive call previously targeted ``__inner``, a name that does not
    exist after this helper was (apparently) renamed — it now recurses into itself
    by its actual name, ``__inner_follow_redirects``.

    :param response_: response whose content may contain an HTML redirect; must not be None
    :param meta_redirects_left: remaining HTML-redirect budget, decremented on each hop
    :return: the final Response after all HTML redirects, or None if the request failed
    :raises McGetFollowHTTPHTMLRedirectsException: if response_ is None or the start of a
        redirect chain cannot be located
    """
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    if response_.is_success():
        base_url = get_base_url(response_.request().url())

        # Each extractor returns a Request for the redirect target, or None if the
        # page content does not match that site's redirect pattern
        html_redirect_functions = [
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        ]
        for html_redirect_function in html_redirect_functions:
            request_after_meta_redirect = html_redirect_function(
                content=response_.decoded_content(),
                archive_site_url=base_url,
            )
            if request_after_meta_redirect is not None:
                # Only follow the redirect if it actually points somewhere new
                if not urls_are_equal(url1=response_.request().url(), url2=request_after_meta_redirect.url()):

                    log.debug("URL after HTML redirects: %s" % request_after_meta_redirect.url())

                    orig_redirect_response = self.request(request=request_after_meta_redirect)
                    redirect_response = orig_redirect_response

                    # Response might have its previous() already set due to HTTP redirects,
                    # so we have to find the initial response first
                    previous = None
                    for x in range(self.max_redirect() + 1):
                        previous = redirect_response.previous()
                        if previous is None:
                            break
                        redirect_response = previous

                    # Loop exited without finding the chain start within the redirect budget
                    if previous is not None:
                        raise McGetFollowHTTPHTMLRedirectsException(
                            "Can't find the initial redirected response; URL: %s" %
                            request_after_meta_redirect.url())

                    log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
                        'url': redirect_response.request().url(),
                        'previous_url': response_.request().url(),
                    })
                    # Splice the current response in front of the new redirect chain
                    redirect_response.set_previous(response_)

                    meta_redirects_left = meta_redirects_left - 1

                    # Recurse: the redirect target may itself contain another HTML redirect
                    return __inner_follow_redirects(
                        response_=orig_redirect_response,
                        meta_redirects_left=meta_redirects_left,
                    )

        # No <meta /> refresh, the current URL is the final one
        return response_

    else:
        log.debug("Request to %s was unsuccessful: %s" % (
            response_.request().url(),
            response_.status_line(),
        ))

        # Return the original URL and give up
        return None
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size,
    callbacks, blacklisted URLs etc.

    BUG FIX: the exception handler around __execute_request() contained a stray
    ``raise ex`` before the wrapping ``raise McRequestException(...)``, which made the
    wrapping unreachable dead code and leaked raw exception types to callers that
    expect McRequestException (all sibling request() implementations wrap).

    :param request: Request to execute; must not be None
    :return: Response with its previous() chain populated from any HTTP redirects
    :raises McRequestException: if the request is None, cannot be prepared / executed,
        or the underlying 'requests' response is None
    """
    if request is None:
        raise McRequestException("Request is None.")

    # May swap the request out for a dummy one if its URL is blacklisted
    request = self.__blacklist_request_if_needed(request=request)

    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (
            str(request),
            str(ex),
        ))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        raise McRequestException("Unable to execute request %s: %s" % (
            str(requests_prepared_request),
            str(ex),
        ))

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    response = Response(
        requests_response=user_agent_response.requests_response,
        max_size=self.max_size(),
        error_is_client_side=user_agent_response.error_is_client_side,
    )

    # Build the previous request / response chain from the redirects.
    # 'history' is ordered oldest-first, so iterate it reversed to link each
    # hop as the previous() of the one after it.
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        #     AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            previous_rq_response.raw_data = io.StringIO('')

        previous_response = Response(requests_response=previous_rq_response, max_size=self.max_size())
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request)
    response.set_request(response_request)

    return response
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size,
    callbacks, blacklisted URLs etc.

    :param request: Request to execute; must not be None
    :return: Response with its previous() chain populated from any HTTP redirects
    :raises McRequestException: if the request is None, cannot be prepared / executed,
        or the underlying 'requests' response / response data is None
    """
    if request is None:
        raise McRequestException("Request is None.")

    # May swap the request out for a dummy one if its URL is blacklisted
    request = self.__blacklist_request_if_needed(request=request)

    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (
            str(request),
            str(ex),
        ))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        raise McRequestException("Unable to execute request %s: %s" % (
            str(requests_prepared_request),
            str(ex),
        ))

    try:
        response_data = self.__read_response_data(user_agent_response.requests_response)
    except Exception as ex:
        log.warning("Error reading data for URL %s" % request.url())

        # Reading the body failed — replace the response with a fabricated 408
        user_agent_response.requests_response = requests.Response()
        user_agent_response.requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
        user_agent_response.requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
        user_agent_response.requests_response.request = requests_prepared_request

        user_agent_response.requests_response.history = []

        # We treat timeouts as client-side errors too because we can retry on them
        user_agent_response.error_is_client_side = True

        response_data = str(ex)

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    if response_data is None:
        # Probably a programming error
        raise McRequestException("Response data is None.")

    response = Response.from_requests_response(
        requests_response=user_agent_response.requests_response,
        data=response_data,
    )

    if user_agent_response.error_is_client_side is True:
        response.set_error_is_client_side(error_is_client_side=user_agent_response.error_is_client_side)

    # Build the previous request / response chain from the redirects.
    # 'history' is ordered oldest-first, so iterate it reversed to link each
    # hop as the previous() of the one after it.
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        #     AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            previous_rq_response_data = previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            previous_rq_response_data = ''

        previous_response = Response.from_requests_response(
            requests_response=previous_rq_response,
            data=previous_rq_response_data)
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request)
    response.set_request(response_request)

    return response