示例#1
0
 def from_useragent_response(cls, url: str, response: Response):
     """Build a ``cls`` object out of a user agent response.

     :param url: URL passed through unchanged as the ``url`` field.
     :param response: Response whose success flag, code, message, decoded content and request are read.
     :return: Result of calling ``cls`` with the collected keyword arguments.
     """
     succeeded = response.is_success()
     status_code = response.code()
     status_message = response.message()
     body = response.decoded_content()

     # Without a (truthy) request there is no "last requested" URL to report.
     if response.request():
         final_url = response.request().url()
     else:
         final_url = None

     return cls(
         url=url,
         is_success=succeeded,
         code=status_code,
         message=status_message,
         content=body,
         last_requested_url=final_url,
     )
示例#2
0
 def from_useragent_response(cls, url: str, response: Response):
     """Create a ``cls`` object from the fields of a user agent response.

     :param url: URL stored as the ``url`` field without modification.
     :param response: Response providing the success flag, code, message, decoded content and request.
     :return: Result of calling ``cls`` with those fields as keyword arguments.
     """
     # Collected in the same order in which they are handed to cls() as keyword arguments.
     fields = {
         'url': url,
         'is_success': response.is_success(),
         'code': response.code(),
         'message': response.message(),
         'content': response.decoded_content(),
         # No (truthy) request on the response means there is no last requested URL.
         'last_requested_url': None if not response.request() else response.request().url(),
     }
     return cls(**fields)
示例#3
0
    def __get_follow_http_html_redirects_follow_redirects(self,
                                                          response_: Response,
                                                          meta_redirects_left: int) -> Union[Response, None]:
        """Follow a single HTML-level redirect found in a successful response, if there is one.

        Each redirect detector is tried in turn against the decoded content of the response. For the
        first detector that yields a request to a URL different from the current one, that request is
        fetched, the current response is attached as the "previous" of the earliest response in the
        fetched chain, and processing continues via self.__get_follow_http_html_redirects() with one
        fewer redirect left.

        :param response_: Response to inspect for HTML redirects.
        :param meta_redirects_left: Remaining redirect budget; passed on decreased by one.
        :return: None if the response was unsuccessful; the response itself if no detector produced a
                 request to a different URL; otherwise whatever the continued redirect handling returns.
        :raises McGetFollowHTTPHTMLRedirectsException: if the response is None, or if the start of the
                 fetched response chain is not reached within max_redirect() + 1 steps.
        """

        from mediawords.util.web.user_agent.html_redirects import (
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        )

        if response_ is None:
            raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

        if not response_.is_success():
            log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))

            # Nothing to follow on a failed request; give up
            return None

        base_url = get_base_url(response_.request().url())

        redirect_finders = (
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        )

        for find_redirect in redirect_finders:
            target_request = find_redirect(
                content=response_.decoded_content(),
                archive_site_url=base_url,
            )
            if target_request is None:
                continue

            log.warning("meta redirect from %s: %s" % (find_redirect, target_request.url()))

            if urls_are_equal(url1=response_.request().url(), url2=target_request.url()):
                # Redirect target is the page we are already on; try the next detector
                continue

            log.debug("URL after HTML redirects: %s" % target_request.url())

            fetched_response = self.request(request=target_request)

            # The fetched response might already have previous() set due to HTTP redirects,
            # so walk back to the first response of that chain
            chain_start = fetched_response
            earlier = None
            for _ in range(self.max_redirect() + 1):
                earlier = chain_start.previous()
                if earlier is None:
                    break
                chain_start = earlier

            # Still having a predecessor here means the walk ran out of steps
            if earlier is not None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Can't find the initial redirected response; URL: %s" % target_request.url()
                )

            log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
                'url': chain_start.request().url(),
                'previous_url': response_.request().url(),
            })
            chain_start.set_previous(response_)

            return self.__get_follow_http_html_redirects(
                response_=fetched_response,
                meta_redirects_left=meta_redirects_left - 1,
            )

        # No <meta /> refresh, the current URL is the final one
        return response_
示例#4
0
        def __inner_follow_redirects(
                response_: Response,
                meta_redirects_left: int) -> Union[Response, None]:
            """Follow a single HTML-level redirect found in a successful response, if there is one.

            Uses `self` and `__inner` from the enclosing scope (not part of this function). The redirect
            detectors are tried in order against the decoded content; for the first one that yields a
            request to a URL different from the current one, that request is fetched, the current
            response is attached as the "previous" of the earliest response in the fetched chain, and
            processing continues through `__inner` with one fewer redirect left.

            :param response_: Response to inspect for HTML redirects.
            :param meta_redirects_left: Remaining redirect budget; passed on decreased by one.
            :return: None if the response was unsuccessful; the response itself if no detector produced
                     a request to a different URL; otherwise whatever `__inner` returns.
            :raises McGetFollowHTTPHTMLRedirectsException: if the response is None, or if the start of
                     the fetched response chain is not reached within max_redirect() + 1 steps.
            """

            from mediawords.util.web.user_agent.html_redirects import (
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            )

            if response_ is None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Response is None.")

            if not response_.is_success():
                log.debug("Request to %s was unsuccessful: %s" % (
                    response_.request().url(),
                    response_.status_line(),
                ))

                # Nothing to follow on a failed request; give up
                return None

            base_url = get_base_url(response_.request().url())

            detectors = (
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            )

            for detect in detectors:
                next_request = detect(
                    content=response_.decoded_content(),
                    archive_site_url=base_url,
                )
                if next_request is None:
                    continue

                same_url = urls_are_equal(
                    url1=response_.request().url(),
                    url2=next_request.url())
                if same_url:
                    # Redirect target is the page we are already on; try the next detector
                    continue

                log.debug("URL after HTML redirects: %s" %
                          next_request.url())

                fetched = self.request(request=next_request)

                # The fetched response might already have previous() set due to HTTP redirects,
                # so walk back to the first response of that chain
                oldest = fetched
                predecessor = None
                for _ in range(self.max_redirect() + 1):
                    predecessor = oldest.previous()
                    if predecessor is None:
                        break
                    oldest = predecessor

                # Still having a predecessor here means the walk ran out of steps
                if predecessor is not None:
                    raise McGetFollowHTTPHTMLRedirectsException(
                        "Can't find the initial redirected response; URL: %s"
                        % next_request.url())

                log.debug(
                    "Setting previous of URL %(url)s to %(previous_url)s"
                    % {
                        'url': oldest.request().url(),
                        'previous_url': response_.request().url(),
                    })
                oldest.set_previous(response_)

                return __inner(
                    response_=fetched,
                    meta_redirects_left=meta_redirects_left - 1,
                )

            # No <meta /> refresh, the current URL is the final one
            return response_