def from_useragent_response(cls, url: str, response: Response):
    """Build a ``cls`` instance from a URL and the response fetched for it.

    The first parameter is named ``cls`` and is called as a constructor, so
    this is presumably a classmethod (the decorator, if any, is not visible
    in this excerpt).

    :param url: URL stored verbatim as the ``url`` keyword argument.
    :param response: Response object; its ``is_success()``, ``code()``,
        ``message()``, ``decoded_content()`` and ``request()`` accessors
        are read.
    :return: Whatever ``cls(...)`` returns when called with the keyword
        arguments ``url``, ``is_success``, ``code``, ``message``,
        ``content`` and ``last_requested_url``.
    """
    succeeded = response.is_success()
    status_code = response.code()
    status_message = response.message()
    body = response.decoded_content()

    # A response without an attached (truthy) request has no "last
    # requested URL" to report.
    if response.request():
        final_url = response.request().url()
    else:
        final_url = None

    return cls(
        url=url,
        is_success=succeeded,
        code=status_code,
        message=status_message,
        content=body,
        last_requested_url=final_url,
    )
def __get_follow_http_html_redirects_follow_redirects(self,
                                                      response_: Response,
                                                      meta_redirects_left: int) -> Union[Response, None]:
    """Follow one HTML-level redirect found in a successful response, if any.

    Each HTML redirect detector is tried in turn on the response body. For
    the first detector that yields a request whose URL differs from the
    current one, that request is fetched, the new response chain is linked
    back to ``response_`` via ``set_previous()``, and processing continues
    through ``self.__get_follow_http_html_redirects()`` with one fewer meta
    redirect left.

    :param response_: Response to inspect for HTML redirects.
    :param meta_redirects_left: Remaining HTML redirect budget; passed on
        decremented by one when a redirect is followed.
    :return: ``response_`` itself when no redirect to a different URL is
        found, the result of the follow-up call when one is followed, or
        ``None`` when ``response_`` is not successful.
    :raises McGetFollowHTTPHTMLRedirectsException: if ``response_`` is
        ``None``, or if the first response of the fetched redirect chain
        cannot be reached within ``self.max_redirect() + 1`` steps.
    """
    # Imported at call time, as in the original implementation.
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

    if not response_.is_success():
        log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))

        # Return the original URL and give up
        return None

    base_url = get_base_url(response_.request().url())

    redirect_detectors = [
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    ]

    for detect_redirect in redirect_detectors:
        redirect_request = detect_redirect(
            content=response_.decoded_content(),
            archive_site_url=base_url,
        )
        if redirect_request is None:
            continue

        log.warning("meta redirect from %s: %s" % (detect_redirect, redirect_request.url()))

        # A "redirect" that points back at the current URL is not followed.
        if urls_are_equal(url1=response_.request().url(), url2=redirect_request.url()):
            continue

        log.debug("URL after HTML redirects: %s" % redirect_request.url())

        fetched_response = self.request(request=redirect_request)

        # The fetched response might already have its previous() set due to
        # HTTP redirects, so walk back to the initial response of that chain.
        chain_start = fetched_response
        earlier = None
        for _ in range(self.max_redirect() + 1):
            earlier = chain_start.previous()
            if earlier is None:
                break
            chain_start = earlier

        # Still having a predecessor here means the walk ran out of steps.
        if earlier is not None:
            raise McGetFollowHTTPHTMLRedirectsException(
                "Can't find the initial redirected response; URL: %s" % redirect_request.url()
            )

        log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
            'url': chain_start.request().url(),
            'previous_url': response_.request().url(),
        })
        chain_start.set_previous(response_)

        return self.__get_follow_http_html_redirects(
            response_=fetched_response,
            meta_redirects_left=meta_redirects_left - 1,
        )

    # No <meta /> refresh, the current URL is the final one
    return response_
def __inner_follow_redirects(response_: Response, meta_redirects_left: int) -> Union[Response, None]:
    """Follow one HTML-level redirect found in a successful response, if any.

    NOTE(review): ``self`` and ``__inner`` are not parameters; they are
    presumably supplied by an enclosing scope that is not visible in this
    excerpt -- confirm against the surrounding function.

    :param response_: Response to inspect for HTML redirects.
    :param meta_redirects_left: Remaining HTML redirect budget; passed to
        ``__inner()`` decremented by one when a redirect is followed.
    :return: ``response_`` itself when no redirect to a different URL is
        found, the result of ``__inner()`` when one is followed, or
        ``None`` when ``response_`` is not successful.
    :raises McGetFollowHTTPHTMLRedirectsException: if ``response_`` is
        ``None``, or if the first response of the fetched redirect chain
        cannot be reached within ``self.max_redirect() + 1`` steps.
    """
    # Imported at call time, as in the original implementation.
    from mediawords.util.web.user_agent.html_redirects import (
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    )

    if response_ is None:
        raise McGetFollowHTTPHTMLRedirectsException(
            "Response is None.")

    if not response_.is_success():
        log.debug("Request to %s was unsuccessful: %s" % (
            response_.request().url(),
            response_.status_line(),
        ))

        # Return the original URL and give up
        return None

    site_base_url = get_base_url(response_.request().url())

    candidate_finders = [
        target_request_from_meta_refresh_url,
        target_request_from_archive_org_url,
        target_request_from_archive_is_url,
        target_request_from_linkis_com_url,
        target_request_from_alarabiya_url,
    ]

    for find_target in candidate_finders:
        target_request = find_target(
            content=response_.decoded_content(),
            archive_site_url=site_base_url,
        )

        # Skip finders that found nothing.
        if target_request is None:
            continue

        # Skip targets that point back at the current URL.
        if urls_are_equal(url1=response_.request().url(),
                          url2=target_request.url()):
            continue

        log.debug("URL after HTML redirects: %s" % target_request.url())

        newest_response = self.request(request=target_request)

        # The new response might have its previous() already set due to HTTP
        # redirects, so find the initial response of that chain first.
        oldest_response = newest_response
        predecessor = None
        for _ in range(self.max_redirect() + 1):
            predecessor = oldest_response.previous()
            if predecessor is None:
                break
            oldest_response = predecessor

        # A remaining predecessor means the start of the chain was not
        # reached within the allowed number of steps.
        if predecessor is not None:
            raise McGetFollowHTTPHTMLRedirectsException(
                "Can't find the initial redirected response; URL: %s" % target_request.url())

        log.debug(
            "Setting previous of URL %(url)s to %(previous_url)s" % {
                'url': oldest_response.request().url(),
                'previous_url': response_.request().url(),
            })
        oldest_response.set_previous(response_)

        return __inner(
            response_=newest_response,
            meta_redirects_left=meta_redirects_left - 1,
        )

    # No <meta /> refresh, the current URL is the final one
    return response_