def from_useragent_response(cls, url: str, response: Response):
    """Build a ``cls`` instance from a URL and the response fetched for it.

    The response is read through its accessor methods and the values are
    passed to ``cls`` as keyword arguments.

    Arguments:
    url - URL to store on the new object (passed through as is)
    response - response object providing is_success(), code(), message(),
               decoded_content() and request()

    Returns:
    Whatever ``cls(...)`` returns. ``last_requested_url`` is the URL of the
    response's request, or None when the response has no (truthy) request.
    """
    succeeded = response.is_success()
    status_code = response.code()
    status_message = response.message()
    body = response.decoded_content()

    # A response without a request attached has no "last requested URL".
    if response.request():
        final_url = response.request().url()
    else:
        final_url = None

    return cls(
        url=url,
        is_success=succeeded,
        code=status_code,
        message=status_message,
        content=body,
        last_requested_url=final_url,
    )
def fetch_url(
        db: DatabaseHandler,
        url: str,
        network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
        network_down_port: str = DEFAULT_NETWORK_DOWN_PORT,
        network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
        domain_timeout: typing.Optional[int] = None
) -> Response:
    """Fetch a URL and return the response, retrying while the network is down.

    If fetching the URL results in a 400 response, check whether the network
    is down (using network_down_host / network_down_port). If it is, wait
    network_down_timeout seconds and try again; otherwise return the errored
    response. Any other unsuccessful response is returned immediately.

    This function catches McGetFollowHTTPHTMLRedirectsException and substitutes
    a dummy 400 Response object for it, so such a failure goes through the same
    "is the network down?" check as a real 400 response.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object (never None); the loop only exits by returning a response
    """
    while True:
        # A fresh user agent is created for every attempt.
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)

        try:
            response = ua.get_follow_http_html_redirects(url)
        except mediawords.util.web.user_agent.McGetFollowHTTPHTMLRedirectsException:
            response = Response(400, 'bad url', {}, 'not a http url')

        # Only a failed 400 response while the network is down is worth retrying;
        # the network check is skipped entirely for successful responses.
        should_retry = (
            not response.is_success()
            and response.code() == 400
            and _network_is_down(network_down_host, network_down_port)
        )
        if not should_retry:
            return response

        log.warning("Response failed with %s and network is down.  Waiting to retry ..." % (url, ))
        time.sleep(network_down_timeout)
def __get_follow_http_html_redirects_follow_redirects(self,
                                                      response_: Response,
                                                      meta_redirects_left: int) -> Union[Response, None]:
    """Follow one HTML-level redirect found in a response, if there is any.

    Each redirect helper is given the decoded content and the base URL of the
    response; the first helper that returns a request pointing to a URL
    different from the current one wins. That request is fetched, the original
    response is attached as the "previous" of the earliest response in the new
    chain, and processing continues via __get_follow_http_html_redirects()
    with one less meta redirect left.

    Arguments:
    response_ - response to inspect for HTML redirects
    meta_redirects_left - remaining redirect budget; only decremented here

    Returns:
    - result of the recursive call when a redirect was followed
    - response_ itself when it is successful and no helper yields a new URL
    - None when response_ is not successful

    Raises McGetFollowHTTPHTMLRedirectsException if response_ is None or if
    the start of the fetched response's "previous" chain can't be reached
    within max_redirect() + 1 steps.
    """
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    if not response_.is_success():
        log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))

        # Return the original URL and give up
        return None

    base_url = get_base_url(response_.request().url())

    # Helpers are tried in this order
    redirect_finders = (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    for find_target in redirect_finders:
        target_request = find_target(
            content=response_.decoded_content(),
            archive_site_url=base_url,
        )
        if target_request is None:
            continue

        log.warning("meta redirect from %s: %s" % (find_target, target_request.url()))

        # A "redirect" to the very same URL is ignored; try the next helper
        if urls_are_equal(url1=response_.request().url(), url2=target_request.url()):
            continue

        log.debug("URL after HTML redirects: %s" % target_request.url())

        fetched_response = self.request(request=target_request)

        # Response might have its previous() already set due to HTTP redirects,
        # so we have to find the initial response first
        chain_start = fetched_response
        older = None
        for _ in range(self.max_redirect() + 1):
            older = chain_start.previous()
            if older is None:
                break
            chain_start = older

        # Still not at the start of the chain after the allowed number of steps
        if older is not None:
            raise McGetFollowHTTPHTMLRedirectsException(
                "Can't find the initial redirected response; URL: %s" % target_request.url()
            )

        log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
            'url': chain_start.request().url(),
            'previous_url': response_.request().url(),
        })
        chain_start.set_previous(response_)

        return self.__get_follow_http_html_redirects(
            response_=fetched_response,
            meta_redirects_left=meta_redirects_left - 1,
        )

    # No <meta /> refresh, the current URL is the final one
    return response_
def __inner_follow_redirects(
        response_: Response,
        meta_redirects_left: int) -> Union[Response, None]:
    """Follow one HTML-level redirect found in a response, if there is any.

    Uses `self` and `__inner` from the enclosing scope. Each redirect helper
    receives the decoded content and the base URL of the response; the first
    one returning a request for a URL different from the current one is
    fetched with self.request(). The original response then becomes the
    "previous" of the earliest response in the fetched chain, and __inner()
    is called with the fetched response and one less meta redirect left.

    Arguments:
    response_ - response to inspect for HTML redirects
    meta_redirects_left - remaining redirect budget; only decremented here

    Returns:
    - result of __inner() when a redirect was followed
    - response_ itself when it is successful and no helper yields a new URL
    - None when response_ is not successful

    Raises McGetFollowHTTPHTMLRedirectsException if response_ is None or if
    the start of the fetched response's "previous" chain can't be reached
    within self.max_redirect() + 1 steps.
    """
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException(
            "Response is None.")

    if not response_.is_success():
        log.debug("Request to %s was unsuccessful: %s" % (
            response_.request().url(),
            response_.status_line(),
        ))

        # Return the original URL and give up
        return None

    base_url = get_base_url(response_.request().url())

    # Helpers are tried in this order
    candidate_extractors = (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    for extract_target in candidate_extractors:
        next_request = extract_target(
            content=response_.decoded_content(),
            archive_site_url=base_url,
        )
        if next_request is None:
            continue

        # A "redirect" to the very same URL is ignored; try the next helper
        if urls_are_equal(
                url1=response_.request().url(),
                url2=next_request.url()):
            continue

        log.debug("URL after HTML redirects: %s" % next_request.url())

        followed_response = self.request(request=next_request)

        # Response might have its previous() already set due to HTTP redirects,
        # so we have to find the initial response first
        earliest = followed_response
        parent = None
        for _ in range(self.max_redirect() + 1):
            parent = earliest.previous()
            if parent is None:
                break
            earliest = parent

        # Still not at the start of the chain after the allowed number of steps
        if parent is not None:
            raise McGetFollowHTTPHTMLRedirectsException(
                "Can't find the initial redirected response; URL: %s" % next_request.url())

        log.debug(
            "Setting previous of URL %(url)s to %(previous_url)s" % {
                'url': earliest.request().url(),
                'previous_url': response_.request().url(),
            })
        earliest.set_previous(response_)

        return __inner(
            response_=followed_response,
            meta_redirects_left=meta_redirects_left - 1,
        )

    # No <meta /> refresh, the current URL is the final one
    return response_