예제 #1
0
 def from_useragent_response(cls, url: str, response: Response):
     """Build an instance of ``cls`` from a user agent response.

     Arguments:
     url - URL stored on the new instance as ``url``
     response - response object whose status, message, decoded content and
         originating request are read

     Returns:
     Whatever ``cls(...)`` returns when called with the collected fields.
     """
     # Read the fields in the same order the constructor receives them.
     succeeded = response.is_success()
     status_code = response.code()
     status_message = response.message()
     body = response.decoded_content()

     # A response without a request has no "last requested URL".
     if response.request():
         final_url = response.request().url()
     else:
         final_url = None

     return cls(
         url=url,
         is_success=succeeded,
         code=status_code,
         message=status_message,
         content=body,
         last_requested_url=final_url,
     )
예제 #2
0
 def from_useragent_response(cls, url: str, response: Response):
     """Create a ``cls`` instance populated from a user agent response.

     Arguments:
     url - URL passed through to the constructor as ``url``
     response - response object providing success flag, code, message,
         decoded content and (optionally) the request that produced it

     Returns:
     The result of calling ``cls`` with those fields as keyword arguments.
     """
     # Gather the values in the order the constructor arguments are listed.
     success_flag = response.is_success()
     response_code = response.code()
     response_message = response.message()
     decoded_body = response.decoded_content()

     # Only a response that carries a request can report a requested URL.
     if response.request():
         requested_url = response.request().url()
     else:
         requested_url = None

     return cls(
         url=url,
         is_success=success_flag,
         code=response_code,
         message=response_message,
         content=decoded_body,
         last_requested_url=requested_url,
     )
예제 #3
0
def fetch_url(
        db: DatabaseHandler,
        url: str,
        network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
        network_down_port: str = DEFAULT_NETWORK_DOWN_PORT,
        network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
        domain_timeout: typing.Optional[int] = None
) -> Response:
    """Fetch a url and return the response.

    If fetching the url results in a 400 error, check whether the network is down (using network_down_host and
    network_down_port).  If the network is not down, return the errored response.  Otherwise, wait
    network_down_timeout seconds and try again; this repeats for as long as the network is reported down.

    This function catches McGetFollowHTTPHTMLRedirectsException and substitutes a dummy 400 Response object
    for it.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object (either the successful response or the last failed one)
    """
    while True:
        # A new user agent is constructed for every attempt.
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)

        try:
            response = ua.get_follow_http_html_redirects(url)
        except mediawords.util.web.user_agent.McGetFollowHTTPHTMLRedirectsException:
            # Represent the failure as a 400 response so it flows through the same checks below.
            response = Response(400, 'bad url', {}, 'not a http url')

        if response.is_success():
            return response

        # Only a 400 combined with a network outage is retried; every other failure is returned as-is.
        if response.code() == 400 and _network_is_down(network_down_host,
                                                       network_down_port):
            log.warning(
                "Response failed with %s and network is down.  Waiting to retry ..."
                % (url, ))
            time.sleep(network_down_timeout)
        else:
            return response
예제 #4
0
    def __get_follow_http_html_redirects_follow_redirects(self,
                                                          response_: Response,
                                                          meta_redirects_left: int) -> Union[Response, None]:
        """Follow the first HTML-level redirect detected in a successful response.

        Every redirect detector is run against the decoded content of ``response_``. For the first detector
        that yields a request whose URL differs from the current one, that request is fetched, the new
        response chain is linked back to ``response_`` via ``set_previous()``, and processing continues in
        ``__get_follow_http_html_redirects()`` with one fewer meta redirect left.

        Arguments:
        response_ - response to inspect for HTML redirects
        meta_redirects_left - remaining budget of HTML redirects; passed on decremented by one

        Returns:
        ``response_`` itself if no detector yields a different URL, the result of the follow-up call if a
        redirect was followed, or None if ``response_`` was not successful.

        Raises:
        McGetFollowHTTPHTMLRedirectsException if ``response_`` is None, or if the start of the redirected
        response chain can't be found within ``max_redirect() + 1`` steps.
        """

        from mediawords.util.web.user_agent.html_redirects import (
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        )

        if response_ is None:
            raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

        if not response_.is_success():
            log.debug("Request to %s was unsuccessful: %s" % (response_.request().url(), response_.status_line(),))

            # Unsuccessful responses are not inspected for redirects; give up
            return None

        site_url = get_base_url(response_.request().url())

        # Detectors are tried in this order; the first one producing a different URL wins
        redirect_detectors = (
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        )

        for detector in redirect_detectors:
            target_request = detector(
                content=response_.decoded_content(),
                archive_site_url=site_url,
            )
            if target_request is None:
                continue

            log.warning("meta redirect from %s: %s" % (detector, target_request.url()))

            if urls_are_equal(url1=response_.request().url(), url2=target_request.url()):
                # Redirect target is the page itself; move on to the next detector
                continue

            log.debug("URL after HTML redirects: %s" % target_request.url())

            fetched_response = self.request(request=target_request)

            # The fetched response may already have previous() set due to HTTP redirects,
            # so walk back to the first response of that chain
            chain_start = fetched_response
            earlier = None
            for _ in range(self.max_redirect() + 1):
                earlier = chain_start.previous()
                if earlier is None:
                    break
                chain_start = earlier

            # Still having a predecessor here means the walk ran out of steps
            if earlier is not None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Can't find the initial redirected response; URL: %s" % target_request.url()
                )

            log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
                'url': chain_start.request().url(),
                'previous_url': response_.request().url(),
            })
            chain_start.set_previous(response_)

            return self.__get_follow_http_html_redirects(
                response_=fetched_response,
                meta_redirects_left=meta_redirects_left - 1,
            )

        # No detector produced a different URL, so the current response is the final one
        return response_
예제 #5
0
        def __inner_follow_redirects(
                response_: Response,
                meta_redirects_left: int) -> Union[Response, None]:
            """Follow the first HTML-level redirect detected in a successful response.

            Each redirect detector is applied to the decoded content of ``response_``. For the first one that
            yields a request whose URL differs from the current one, the request is fetched, the start of
            the new response chain is linked to ``response_`` with ``set_previous()``, and the result of
            ``__inner()`` (called with one fewer meta redirect left) is returned.

            Returns ``response_`` when no detector yields a different URL, and None when ``response_`` was
            not successful. Raises McGetFollowHTTPHTMLRedirectsException when ``response_`` is None or when
            the start of the redirected chain can't be found within ``max_redirect() + 1`` steps.
            """

            from mediawords.util.web.user_agent.html_redirects import (
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            )

            if response_ is None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Response is None.")

            if not response_.is_success():
                log.debug("Request to %s was unsuccessful: %s" % (
                    response_.request().url(),
                    response_.status_line(),
                ))

                # Unsuccessful responses are not inspected for redirects; give up
                return None

            site_url = get_base_url(response_.request().url())

            # Detectors are tried in this order; the first one producing a different URL wins
            redirect_detectors = (
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            )

            for detector in redirect_detectors:
                target_request = detector(
                    content=response_.decoded_content(),
                    archive_site_url=site_url,
                )
                if target_request is None:
                    continue

                same_url = urls_are_equal(
                    url1=response_.request().url(),
                    url2=target_request.url())
                if same_url:
                    # Redirect target is the page itself; move on to the next detector
                    continue

                log.debug("URL after HTML redirects: %s" %
                          target_request.url())

                fetched_response = self.request(request=target_request)

                # The fetched response may already have previous() set due to HTTP redirects,
                # so walk back to the first response of that chain
                chain_start = fetched_response
                earlier = None
                for _ in range(self.max_redirect() + 1):
                    earlier = chain_start.previous()
                    if earlier is None:
                        break
                    chain_start = earlier

                # Still having a predecessor here means the walk ran out of steps
                if earlier is not None:
                    raise McGetFollowHTTPHTMLRedirectsException(
                        "Can't find the initial redirected response; URL: %s"
                        % target_request.url())

                log.debug(
                    "Setting previous of URL %(url)s to %(previous_url)s"
                    % {
                        'url': chain_start.request().url(),
                        'previous_url': response_.request().url(),
                    })
                chain_start.set_previous(response_)

                return __inner(
                    response_=fetched_response,
                    meta_redirects_left=meta_redirects_left - 1,
                )

            # No detector produced a different URL, so the current response is the final one
            return response_