Example #1
    def __blacklist_request_if_needed(self, request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        blacklist_url_pattern = self._user_agent_config.blacklist_url_pattern()

        if blacklist_url_pattern:

            # MC_REWRITE_TO_PYTHON: a string might be coming from Perl
            if isinstance(blacklist_url_pattern, bytes):
                blacklist_url_pattern = decode_object_from_bytes_if_needed(
                    blacklist_url_pattern)
            if isinstance(blacklist_url_pattern, str):
                blacklist_url_pattern = re.compile(blacklist_url_pattern,
                                                   flags=re.IGNORECASE
                                                   | re.UNICODE)

            if re.search(pattern=blacklist_url_pattern,
                         string=url) is not None:
                request.set_url("http://0.0.0.1/%s" % url)

        return request
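
Example 1 compiles the pattern together with its flags and then calls re.search() without a flags argument; in Python, passing flags alongside an already-compiled pattern raises ValueError, so the omission is deliberate. A minimal sketch of the same compile-then-match technique (pattern and URL invented for illustration):

import re

blacklist_url_pattern = re.compile(r'\.example\.com/', flags=re.IGNORECASE | re.UNICODE)

url = 'http://tracker.example.com/pixel.gif'
if re.search(pattern=blacklist_url_pattern, string=url) is not None:
    # Rewriting to an unroutable host makes the fetch fail fast while keeping the original URL visible
    url = "http://0.0.0.1/%s" % url

print(url)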
Example #2
    def __blacklist_request_if_needed(request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        blacklist_url_pattern = None
        if 'blacklist_url_pattern' in config['mediawords']:
            blacklist_url_pattern = config['mediawords'][
                'blacklist_url_pattern']

        if blacklist_url_pattern is not None and len(
                blacklist_url_pattern) > 0:
            if re.search(pattern=blacklist_url_pattern,
                         string=url,
                         flags=re.IGNORECASE | re.UNICODE):
                request.set_url("http://blacklistedsite.localhost/%s" % url)

        return request
Example #3
def target_request_from_alarabiya_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """alarabiya uses an interstitial that requires JavaScript. If the download URL matches alarabiya and returns the
    'requires JavaScript' page, manually parse out the necessary cookie and add it to the $ua so that the request will
    work."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if not is_http_url(archive_site_url):
        log.error("Archive site URL is not HTTP(s): %s" % archive_site_url)
        return None

    if content is None:
        return None

    if not re.search(
            pattern='alarabiya', string=archive_site_url, flags=re.IGNORECASE):
        return None

    if not re.search(
            pattern='This site requires JavaScript and Cookies to be enabled',
            string=content,
            flags=re.IGNORECASE):
        return None

    matches = re.search(
        pattern=
        r"setCookie\('(?P<cookie_name>[^']+)', '(?P<cookie_value>[^']+)'",
        string=content,
        flags=re.IGNORECASE)
    if matches:
        cookie_name = matches.group('cookie_name')
        cookie_value = matches.group('cookie_value')

        request = Request(method='GET', url=archive_site_url)
        request.set_header(name='Cookie',
                           value="%s=%s" % (
                               cookie_name,
                               cookie_value,
                           ))
        return request

    else:
        log.warning("Unable to parse cookie from alarabiya URL %s: %s" % (
            archive_site_url,
            content,
        ))

    return None
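
The cookie-extraction regex can be exercised on its own; the interstitial snippet below is fabricated for illustration:

import re

content = "<script>setCookie('YPF8827340282Jdskjhfiw', '78.140.1.1');</script>"

matches = re.search(
    pattern=r"setCookie\('(?P<cookie_name>[^']+)', '(?P<cookie_value>[^']+)'",
    string=content,
    flags=re.IGNORECASE)
if matches:
    # This "name=value" string would become the request's Cookie header
    print("%s=%s" % (matches.group('cookie_name'), matches.group('cookie_value')))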
Example #4
    def request(self, request: Request) -> Response:
        """
        Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

        Before executing the request, the method will check whether a request has been made for this domain within the
        last self.domain_timeout seconds.  If so, the call will raise a McThrottledDomainException.
        Otherwise, the method will mark the time for this domain request in a postgres table and then execute
        UserAgent.request().

        The throttling routine will not be applied after the first successful request, to allow redirects and
        other followup requests to succeed.  To ensure proper throttling, a new object should be created for each
        top level request.

        Accelerated domains and shortened links (e.g. http://bit.ly/EFGDfrTg) get their timeout divided by
        _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
        """
        if self._use_throttling:
            domain = mediawords.util.url.get_url_distinctive_domain(
                request.url())

            domain_timeout = self.domain_timeout
            if domain_timeout > 1 and (is_shortened_url(request.url())
                                       or domain in _ACCELERATED_DOMAINS):
                domain_timeout = max(
                    1,
                    int(self.domain_timeout /
                        _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

            # this postgres function returns true if we are allowed to make the request and false otherwise. this
            # function does not use a table lock, so some extra requests might sneak through, but that's better than
            # dealing with a lock.  we use a postgres function to make the race condition as rare as possible.
            got_domain_lock = self.db.query(
                "select get_domain_web_requests_lock(%s, %s)",
                (domain, domain_timeout)).flat()[0]

            log.debug("domain lock obtained for %s: %s" %
                      (str(request.url()), str(got_domain_lock)))

            if not got_domain_lock:
                raise McThrottledDomainException("domain " + str(domain) +
                                                 " is locked.")
        else:
            log.debug("domain lock obtained for %s: skipped" %
                      str(request.url()))

        self._use_throttling = False

        return super().request(request)
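
The Postgres function keeps throttle state shared across processes; a single-process, in-memory analogue of the same check (all names assumed) is a dictionary of per-domain timestamps:

import time

_last_request_time = {}  # domain -> epoch seconds of the last permitted request


def try_get_domain_lock(domain: str, domain_timeout: int) -> bool:
    """Return True if domain_timeout seconds have passed since the last permitted request."""
    now = time.time()
    last = _last_request_time.get(domain)
    if last is not None and now - last < domain_timeout:
        return False
    _last_request_time[domain] = now
    return True

Unlike the database function, this dictionary is neither shared nor race-free across processes, which is presumably why the original delegates the check to Postgres.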
Example #5
def get_seeded_content(db: DatabaseHandler,
                       topic_fetch_url: dict) -> typing.Optional[Response]:
    """Return a dummy response with the stored content for this url and topic in topic_seed_urls.

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_url dict from db

    Returns:
    dummy response object

    """
    r = db.query(
        "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null",
        {
            'a': topic_fetch_url['topics_id'],
            'b': topic_fetch_url['url']
        }).flat()

    if len(r) == 0:
        return None

    response = Response(code=200, message='OK', headers={}, data=r[0])
    response.set_request(Request('GET', topic_fetch_url['url']))

    return response
Example #6
    def request(self, request: Request) -> Response:
        """
        Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

        Before executing the request, the method will check whether a request has been made for this domain within the
        last self.domain_timeout seconds.  If so, the call will raise a McThrottledDomainException.
        Otherwise, the method will mark the time for this domain request in a postgres table and then execute
        UserAgent.request().

        The throttling routine will not be applied after the first successful request, to allow redirects and
        other followup requests to succeed.  To ensure proper throttling, a new object should be created for each
        top level request.

        Accelerated domains and shortened links (e.g. http://bit.ly/EFGDfrTg) get their timeout divided by
        _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
        """
        if self._use_throttling:
            domain = mediawords.util.url.get_url_distinctive_domain(request.url())

            domain_timeout = self.domain_timeout
            if domain_timeout > 1 and (is_shortened_url(request.url()) or domain in _ACCELERATED_DOMAINS):
                domain_timeout = max(1, int(self.domain_timeout / _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

            # this postgres function returns true if we are allowed to make the request and false otherwise. this
            # function does not use a table lock, so some extra requests might sneak through, but that's better than
        # dealing with a lock.  we use a postgres function to make the race condition as rare as possible.
            got_domain_lock = self.db.query(
                "select get_domain_web_requests_lock(%s, %s)",
                (domain, domain_timeout)).flat()[0]

            log.debug("domain lock obtained for %s: %s" % (str(request.url()), str(got_domain_lock)))

            if not got_domain_lock:
                raise McThrottledDomainException("domain " + str(domain) + " is locked.")
        else:
            log.debug("domain lock obtained for %s: skipped" % str(request.url()))

        self._use_throttling = False

        return super().request(request)
Example #7
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'],
                                             'logs', 'http_request.log')

        with open(http_request_log_path, 'a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (
                sql_now(),
                url,
            ))

            # Doesn't write "invalidating blacklist url <...> because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt to chmod the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (
                http_request_log_path,
                str(ex),
            ))
Example #8
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        log.debug("HTTP request: %s %s\n" % (sql_now(), url,))
Example #9
    def __blacklist_request_if_needed(request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        blacklist_url_pattern = None
        if 'blacklist_url_pattern' in config['mediawords']:
            blacklist_url_pattern = config['mediawords']['blacklist_url_pattern']

        if blacklist_url_pattern is not None and len(blacklist_url_pattern) > 0:
            if re.search(pattern=blacklist_url_pattern, string=url, flags=re.IGNORECASE | re.UNICODE) is not None:
                request.set_url("http://0.0.0.1/%s" % url)

        return request
Example #10
def target_request_from_meta_refresh_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from website with META refresh, return a request for the original URL."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    target_url = meta_refresh_url_from_html(html=content,
                                            base_url=archive_site_url)
    if target_url is None:
        return None

    return Request(method='GET', url=target_url)
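
meta_refresh_url_from_html() is a Media Cloud helper not shown here; a simplified regex-based stand-in for the common absolute-URL case (not the real implementation, and deliberately naive about attribute order) could look like:

import re
from typing import Optional


def simple_meta_refresh_url(html: str) -> Optional[str]:
    """Extract an absolute URL from a META refresh tag; relative URLs are ignored."""
    matches = re.search(
        pattern=r'<meta[^>]+http-equiv=["\']?refresh["\']?[^>]+url=(?P<url>https?://[^"\'>\s]+)',
        string=html,
        flags=re.IGNORECASE)
    return matches.group('url') if matches else None


print(simple_meta_refresh_url(
    '<meta http-equiv="refresh" content="0; url=http://example.com/target">'))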
Example #11
    def __prepare_request(self, request: Request) -> requests.PreparedRequest:
        """Create PreparedRequest from UserAgent's Request. Raises if one or more parameters are invalid."""
        method = request.method()
        if method is None:
            raise McRequestException("Request's method is None.")

        url = request.url()
        if url is None:
            raise McRequestException("Request's URL is None.")

        headers = request.headers()
        if headers is None:
            raise McRequestException("Request's headers is None.")

        auth_username = request.auth_username()
        auth_password = request.auth_password()
        if ((auth_username is None and auth_password is not None)
                or (auth_username is not None and auth_password is None)):
            raise McRequestException(
                "Both HTTP authentication credentials must be set, or neither."
            )

        auth = None
        if auth_username is not None and auth_password is not None:
            if ((len(auth_username) == 0 and len(auth_password) > 0)
                    or (len(auth_username) > 0 and len(auth_password) == 0)):
                raise McRequestException(
                    "Both HTTP authentication credentials must be non-empty, or both empty."
                )

            auth = HTTPBasicAuth(auth_username, auth_password)

        data = request.content()

        try:
            requests_request = requests.Request(
                method=method,
                url=url,
                data=data,
                headers=headers,
                auth=auth,
            )
            requests_prepared_request = self.__session.prepare_request(
                requests_request)

        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        return requests_prepared_request
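
The Request-to-PreparedRequest conversion above wraps the standard requests API; the equivalent bare flow in requests itself (URL and credentials are placeholders) is:

import requests
from requests.auth import HTTPBasicAuth

session = requests.Session()

raw_request = requests.Request(
    method='GET',
    url='https://example.com/',  # placeholder URL
    headers={'User-Agent': 'mediawords bot'},
    auth=HTTPBasicAuth('username', 'password'),  # placeholder credentials
)

# prepare_request() merges in session state (cookies etc.) and validates parameters
prepared = session.prepare_request(raw_request)
response = session.send(prepared, timeout=30)
print(response.status_code)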
Example #12
    def get(self, url: str) -> Response:
        """GET an URL."""
        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetException("URL is not HTTP(s): %s" % url)

        # Add HTTP authentication
        url = self.__url_with_http_auth(url=url)

        request = Request(method='GET', url=url)

        return self.request(request)
Example #13
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

        with open(http_request_log_path, encoding='utf-8', mode='a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (sql_now(), url,))

            # Doesn't write "invalidating blacklist url <...> because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt to chmod the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
Example #14
def target_request_from_archive_is_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.is, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    if re.match(pattern=r'^https?://archive\.is/(.+?)$',
                string=archive_site_url,
                flags=re.IGNORECASE):
        canonical_link = link_canonical_url_from_html(html=content)
        if canonical_link is not None:
            matches = re.match(
                pattern=
                r'^https?://archive\.is/\d+?/(?P<target_url>https?://.+?)$',
                string=canonical_link,
                flags=re.IGNORECASE)
            if matches:
                target_url = matches.group('target_url')

                if is_http_url(target_url):
                    return Request(method='GET', url=target_url)
                else:
                    log.error("URL matched, but is not HTTP(s): %s" %
                              target_url)

            else:
                log.error(
                    "Unable to parse original URL from archive.is response '%s': %s"
                    % (
                        archive_site_url,
                        canonical_link,
                    ))
        else:
            log.error(
                "Unable to parse original URL from archive.is response '%s'" %
                archive_site_url)

    return None
Example #15
def target_request_from_archive_org_url(
        content: Union[str,
                       None], archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.org, return a request for the original URL."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    matches = re.match(
        pattern=
        r'^https?://web\.archive\.org/web/(?P<date>\d+?/)?(?P<target_url>https?://.+?)$',
        string=archive_site_url,
        flags=re.IGNORECASE)
    if matches:
        target_url = matches.group('target_url')

        if is_http_url(target_url):
            return Request(method='GET', url=target_url)
        else:
            log.error("URL matched, but is not HTTP(s): %s" % target_url)

    return None
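
The Wayback Machine pattern can be checked in isolation; the archive URL below is illustrative:

import re

archive_site_url = 'https://web.archive.org/web/20170101000000/https://example.com/page'

matches = re.match(
    pattern=r'^https?://web\.archive\.org/web/(?P<date>\d+?/)?(?P<target_url>https?://.+?)$',
    string=archive_site_url,
    flags=re.IGNORECASE)
if matches:
    print(matches.group('target_url'))  # https://example.com/page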
Example #16
    def request(self, request: Request) -> Response:
        """
        Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

        Before executing the request, the method will check whether a request has been made for this domain within the
        last self.domain_timeout seconds.  If so, the call will raise a McThrottledUserAgentTimeoutException.
        Otherwise, the method will mark the time for this domain request in a postgres table and then execute
        UserAgent.request().
        """
        domain = mediawords.util.url.get_url_distinctive_domain(request.url())

        # this postgres function returns true if we are allowed to make the request and false otherwise.
        # this function does not use a table lock, so some extra requests might sneak through, but that's better than
        # dealing with a lock.  we use a postgres function to make the race condition as rare as possible.
        got_domain_lock = self.db.query(
            "select get_domain_web_requests_lock(%s, %s)",
            (domain, self.domain_timeout)).flat()[0]

        if not got_domain_lock:
            raise McThrottledUserAgentTimeoutException("domain " + str(domain) + " is locked.")

        return super().request(request)
Example #17
    def __prepare_request(self, request: Request) -> requests.PreparedRequest:
        """Create PreparedRequest from UserAgent's Request. Raises if one or more parameters are invalid."""
        method = request.method()
        if method is None:
            raise McRequestException("Request's method is None.")

        url = request.url()
        if url is None:
            raise McRequestException("Request's URL is None.")

        headers = request.headers()
        if headers is None:
            raise McRequestException("Request's headers is None.")

        auth_username = request.auth_username()
        auth_password = request.auth_password()
        if ((auth_username is None and auth_password is not None) or (
                auth_username is not None and auth_password is None)):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

        auth = None
        if auth_username is not None and auth_password is not None:
            if ((len(auth_username) == 0 and len(auth_password) > 0) or (
                    len(auth_username) > 0 and len(auth_password) == 0)):
                raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")

            auth = HTTPBasicAuth(auth_username, auth_password)

        data = request.content()

        try:
            requests_request = requests.Request(
                method=method,
                url=url,
                data=data,
                headers=headers,
                auth=auth,
            )
            requests_prepared_request = self.__session.prepare_request(requests_request)

        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

        return requests_prepared_request
Example #18
def _get_api_key() -> str:
    """Fetch the bw api key or use the cached one.

    To get a bw api key, you have to make an api call with the user and password, but the api key only lasts for
    a year, so we just get it and then cache it in a static variable, assuming that each running process will restart
    at least once a year.
    """
    if hasattr(_get_api_key, "api_key"):
        return _get_api_key.api_key

    user = env_value('MC_BRANDWATCH_USER')
    password = env_value('MC_BRANDWATCH_PASSWORD')

    log.debug(f"user: {user}")
    log.debug(f"passwod: {password}")

    ua = _get_user_agent()

    url = (
        "https://api.brandwatch.com/oauth/token?username=%s&grant_type=api-password&client_id=brandwatch-api-client"
        % (quote(user)))

    request = Request(method='POST', url=url)
    request.set_content_type(
        'application/x-www-form-urlencoded; charset=utf-8')
    request.set_content({'password': password})

    response = ua.request(request)

    if not response.is_success():
        raise McPostsBWTwitterDataException("error fetching posts: " +
                                            response.decoded_content())

    json = response.decoded_content()

    data = dict(decode_json(json))

    try:
        _get_api_key.api_key = data['access_token']
    except KeyError:
        raise McPostsBWTwitterDataException(
            "error parsing oauth response: '%s'" % json)

    return _get_api_key.api_key
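
Caching on a function attribute, as _get_api_key() does, is a lightweight alternative to a module-level global; the general pattern with the Brandwatch specifics stripped out:

def get_cached_token() -> str:
    """Fetch a token on first call, then serve it from a function attribute."""
    if hasattr(get_cached_token, "token"):
        return get_cached_token.token

    # Stand-in for the real authentication call
    get_cached_token.token = "fetched-token-value"
    return get_cached_token.token


print(get_cached_token())  # performs the "fetch"
print(get_cached_token())  # returned from the cache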
Example #19
    def request(self, request: Request) -> Response:
        """Execute a request, return a response.

        All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
        URLs etc."""

        if request is None:
            raise McRequestException("Request is None.")

        request = self.__blacklist_request_if_needed(request=request)

        self.__log_request(request=request)

        method = request.method()
        if method is None:
            raise McRequestException("Request's method is None.")

        url = request.url()
        if url is None:
            raise McRequestException("Request's URL is None.")

        headers = request.headers()
        if headers is None:
            raise McRequestException("Request's headers is None.")

        auth_username = request.auth_username()
        auth_password = request.auth_password()
        if ((auth_username is None and auth_password is not None)
                or (auth_username is not None and auth_password is None)):
            raise McRequestException(
                "Both HTTP authentication credentials must be set, or neither."
            )

        auth = None
        if auth_username is not None and auth_password is not None:
            if ((len(auth_username) == 0 and len(auth_password) > 0)
                    or (len(auth_username) > 0 and len(auth_password) == 0)):
                raise McRequestException(
                    "Both HTTP authentication credentials must be non-empty, or both empty."
                )

            auth = HTTPBasicAuth(auth_username, auth_password)

        data = request.content()

        try:
            requests_request = requests.Request(
                method=method,
                url=url,
                data=data,
                headers=headers,
                auth=auth,
            )
            requests_prepared_request = self.__session.prepare_request(
                requests_request)

        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        error_is_client_side = False

        try:
            requests_response = self.__session.send(
                request=requests_prepared_request,
                timeout=self.timeout(),

                # To be able to enforce max_size
                stream=True,
            )

        except requests.TooManyRedirects as ex:

            # On too many redirects, return the last fetched page (just like LWP::UserAgent does)
            log.warning("Exceeded max. redirects for URL %s" % request.url())
            requests_response = ex.response
            response_data = str(ex)

        except requests.Timeout as ex:

            log.warning("Timeout for URL %s" % request.url())

            # We treat timeouts as client-side errors too because we can retry on them
            error_is_client_side = True

            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            requests_response.request = requests_prepared_request

            requests_response.history = []

            response_data = str(ex)

        except Exception as ex:

            # Client-side error
            log.warning("Client-side error while processing request %s: %s" % (
                str(request),
                str(ex),
            ))

            error_is_client_side = True

            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.BAD_REQUEST.value
            requests_response.reason = "Client-side error"
            requests_response.request = requests_prepared_request

            # Previous request / response chain is not built for client-side errored requests
            requests_response.history = []

            requests_response.headers = {
                # LWP::UserAgent compatibility
                'Client-Warning': 'Client-side error',
            }

            response_data = str(ex)

        else:

            try:

                max_size = self.max_size()

                response_data = ""
                read_response_data = True

                if max_size is not None:
                    content_length = requests_response.headers.get(
                        'Content-Length', None)

                    if content_length is not None:
                        content_length = int(content_length)
                        if content_length > max_size:
                            log.warning(
                                "Content-Length exceeds %d for URL %s" % (
                                    max_size,
                                    url,
                                ))

                            # Release the response to return connection back to the pool
                            # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                            requests_response.close()

                            read_response_data = False

                if read_response_data:

                    if requests_response.encoding is None:

                        if requests_response.apparent_encoding is None:
                            # If encoding is not in HTTP headers nor can be determined from content itself, assume that
                            # it's UTF-8
                            requests_response.encoding = 'UTF-8'

                        else:
                            # Test the encoding guesser's opinion, just like browsers do
                            requests_response.encoding = requests_response.apparent_encoding

                    else:

                        # If "Content-Type" HTTP header contains a string "text" and doesn't have "charset" property,
                        # "requests" falls back to setting the encoding to ISO-8859-1, which is probably not right
                        # (encoding might have been defined in the HTML content itself via <meta> tag), so we use the
                        # "apparent encoding" instead
                        if requests_response.encoding.lower() == 'iso-8859-1':
                            if requests_response.apparent_encoding is not None:
                                requests_response.encoding = requests_response.apparent_encoding

                    # Some pages report some funky encoding; in that case, fallback to UTF-8
                    try:
                        codecs.lookup(requests_response.encoding)
                    except LookupError:
                        log.warning("Invalid encoding %s for URL %s" %
                                    (requests_response.encoding,
                                     requests_response.url))
                        requests_response.encoding = 'UTF-8'

                    response_data_size = 0
                    for chunk in requests_response.iter_content(
                            chunk_size=None, decode_unicode=True):
                        response_data += chunk
                        response_data_size += len(chunk)

                        # Content-Length might be missing / lying, so we measure size while fetching the data too
                        if max_size is not None:
                            if response_data_size > max_size:
                                log.warning("Data size exceeds %d for URL %s" %
                                            (
                                                max_size,
                                                url,
                                            ))

                                # Release the response to return connection back to the pool
                                # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                                requests_response.close()

                                break

            except requests.RequestException as ex:

                log.warning("Error reading data for URL %s" % request.url())

                # We treat timeouts as client-side errors too because we can retry on them
                error_is_client_side = True

                requests_response = requests.Response()
                requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
                requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
                requests_response.request = requests_prepared_request

                requests_response.history = []

                response_data = str(ex)

        if requests_response is None:
            raise McRequestException("Response from 'requests' is None.")

        if response_data is None:
            # Probably a programming error
            raise McRequestException("Response data is None.")

        response = Response.from_requests_response(
            requests_response=requests_response,
            data=response_data,
        )

        if error_is_client_side:
            response.set_error_is_client_side(
                error_is_client_side=error_is_client_side)

        # Build the previous request / response chain from the redirects
        current_response = response
        for previous_rq_response in reversed(requests_response.history):
            previous_rq_request = previous_rq_response.request
            previous_response_request = Request.from_requests_prepared_request(
                requests_prepared_request=previous_rq_request)

            previous_response = Response.from_requests_response(
                requests_response=previous_rq_response)
            previous_response.set_request(request=previous_response_request)

            current_response.set_previous(previous=previous_response)
            current_response = previous_response

        # Redirects might have happened, so we have to recreate the request object from the latest page that was
        # redirected to
        response_request = Request.from_requests_prepared_request(
            requests_prepared_request=requests_response.request)
        response.set_request(response_request)

        return response
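
The encoding decision above can be condensed into a standalone helper; this sketch (hypothetical name) mirrors the same three checks:

import codecs


def resolve_encoding(header_encoding, apparent_encoding) -> str:
    """Pick a response encoding, falling back to UTF-8 whenever in doubt."""
    if header_encoding is None:
        # Not in HTTP headers; trust the guesser, else assume UTF-8
        encoding = apparent_encoding or 'UTF-8'
    elif header_encoding.lower() == 'iso-8859-1' and apparent_encoding is not None:
        # requests defaults "text/*" without a charset to ISO-8859-1; prefer the guess
        encoding = apparent_encoding
    else:
        encoding = header_encoding

    # Some pages report a funky encoding; verify it actually exists
    try:
        codecs.lookup(encoding)
    except LookupError:
        encoding = 'UTF-8'

    return encoding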
Example #20
    def request(self, request: Request) -> Response:
        """Execute a request, return a response.

        All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
        URLs etc."""

        if request is None:
            raise McRequestException("Request is None.")

        request = self.__blacklist_request_if_needed(request=request)

        self.__log_request(request=request)

        try:
            requests_prepared_request = self.__prepare_request(request)
        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        try:
            user_agent_response = self.__execute_request(
                requests_prepared_request)
        except Exception as ex:
            raise McRequestException("Unable to execute request %s: %s" % (
                str(requests_prepared_request),
                str(ex),
            ))

        try:
            response_data = self.__read_response_data(
                user_agent_response.requests_response)
        except Exception as ex:
            log.warning("Error reading data for URL %s" % request.url())

            user_agent_response.requests_response = requests.Response()
            user_agent_response.requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            user_agent_response.requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            user_agent_response.requests_response.request = requests_prepared_request

            user_agent_response.requests_response.history = []

            # We treat timeouts as client-side errors too because we can retry on them
            user_agent_response.error_is_client_side = True

            response_data = str(ex)

        if user_agent_response.requests_response is None:
            raise McRequestException("Response from 'requests' is None.")

        if response_data is None:
            # Probably a programming error
            raise McRequestException("Response data is None.")

        response = Response.from_requests_response(
            requests_response=user_agent_response.requests_response,
            data=response_data,
        )

        if user_agent_response.error_is_client_side is True:
            response.set_error_is_client_side(
                error_is_client_side=user_agent_response.error_is_client_side)

        # Build the previous request / response chain from the redirects
        current_response = response
        for previous_rq_response in reversed(
                user_agent_response.requests_response.history):
            previous_rq_request = previous_rq_response.request
            previous_response_request = Request.from_requests_prepared_request(
                requests_prepared_request=previous_rq_request)

            # Sometimes reading the (chunked?) previous response's data fails with:
            #
            #      AttributeError: 'NoneType' object has no attribute 'readline'
            #
            # Previous response's data is not that important, so fail rather silently.
            try:
                previous_rq_response_data = previous_rq_response.text
            except Exception as ex:
                log.warning("Reading previous response's data failed: %s" %
                            str(ex))
                previous_rq_response_data = ''

            previous_response = Response.from_requests_response(
                requests_response=previous_rq_response,
                data=previous_rq_response_data)
            previous_response.set_request(request=previous_response_request)

            current_response.set_previous(previous=previous_response)
            current_response = previous_response

        # Redirects might have happened, so we have to recreate the request object from the latest page that was
        # redirected to
        response_request = Request.from_requests_prepared_request(
            requests_prepared_request=user_agent_response.requests_response.request)
        response.set_request(response_request)

        return response
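
__read_response_data() is not shown here, but judging from the inline version in Example 19, the size cap depends on streaming the body; in bare requests the pattern is (URL and limit are placeholders):

import requests

max_size = 1024 * 1024  # placeholder cap of 1 MB

response = requests.get('https://example.com/', stream=True, timeout=30)  # placeholder URL

response_data = ""
response_data_size = 0
for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
    response_data += chunk
    response_data_size += len(chunk)

    # Content-Length might be missing / lying, so measure while fetching too
    if response_data_size > max_size:
        response.close()  # release the connection back to the pool
        break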
Example #21
    def request(self, request: Request) -> Response:
        """Execute a request, return a response.

        All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
        URLs etc."""

        if request is None:
            raise McRequestException("Request is None.")

        request = self.__blacklist_request_if_needed(request=request)

        self.__log_request(request=request)

        try:
            requests_prepared_request = self.__prepare_request(request)
        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

        try:
            user_agent_response = self.__execute_request(requests_prepared_request)
        except Exception as ex:
            raise McRequestException("Unable to execute request %s: %s" % (str(requests_prepared_request), str(ex),))

        if user_agent_response.requests_response is None:
            raise McRequestException("Response from 'requests' is None.")

        response = Response(
            requests_response=user_agent_response.requests_response,
            max_size=self.max_size(),
            error_is_client_side=user_agent_response.error_is_client_side,
        )

        # Build the previous request / response chain from the redirects
        current_response = response
        for previous_rq_response in reversed(user_agent_response.requests_response.history):
            previous_rq_request = previous_rq_response.request
            previous_response_request = Request.from_requests_prepared_request(
                requests_prepared_request=previous_rq_request
            )

            # Sometimes reading the (chunked?) previous response's data fails with:
            #
            #      AttributeError: 'NoneType' object has no attribute 'readline'
            #
            # Previous response's data is not that important, so fail rather silently.
            try:
                previous_rq_response.text
            except Exception as ex:
                log.warning("Reading previous response's data failed: %s" % str(ex))
                previous_rq_response.raw_data = io.StringIO('')

            previous_response = Response(requests_response=previous_rq_response, max_size=self.max_size())
            previous_response.set_request(request=previous_response_request)

            current_response.set_previous(previous=previous_response)
            current_response = previous_response

        # Redirects might have happened, so we have to recreate the request object from the latest page that was
        # redirected to
        response_request = Request.from_requests_prepared_request(
            requests_prepared_request=user_agent_response.requests_response.request
        )
        response.set_request(response_request)

        return response
Example #22
def target_request_from_linkis_com_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """Given the content of a linkis.com web page, find the original URL in the content, which may be in one of sereral
    places in the DOM, and return a request for said URL."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    if not re.match(pattern=r'^https?://[^/]*linkis\.com/',
                    string=archive_site_url,
                    flags=re.IGNORECASE):
        return None

    # list of dom search patterns to find nodes with a url and the
    # attributes to use from those nodes as the url.
    #
    # for instance the first item matches:
    #
    #     <meta property="og:url" content="http://foo.bar">
    #
    try:
        html_parser = etree.HTMLParser()
        html_tree = etree.parse(StringIO(content), html_parser)

        dom_maps = [
            ('//meta[@property="og:url"]', 'content'),
            ('//a[@class="js-youtube-ln-event"]', 'href'),
            ('//iframe[@id="source_site"]', 'src'),
        ]

        for xpath, url_attribute in dom_maps:
            nodes = html_tree.xpath(xpath)

            if len(nodes) > 0:
                first_node = nodes[0]
                matched_url = first_node.get(url_attribute)
                if matched_url is not None:
                    if not re.match(pattern=r'^https?://linkis\.com',
                                    string=matched_url,
                                    flags=re.IGNORECASE):

                        if is_http_url(matched_url):
                            return Request(method='GET', url=matched_url)
                        else:
                            log.error("URL matched, but is not HTTP(s): %s" %
                                      matched_url)

    except Exception as ex:
        log.warning("Unable to parse HTML for URL %s: %s" % (
            archive_site_url,
            str(ex),
        ))

    # As a last resort, look for the longUrl key in a JavaScript array
    matches = re.search(pattern=r'"longUrl":\s*"(?P<target_url>[^"]+)"',
                        string=content,
                        flags=re.IGNORECASE)
    if matches:
        target_url = matches.group('target_url')

        # kludge to de-escape \'d characters in javascript -- 99% of urls
        # are captured by the dom stuff above, we shouldn't get to this
        # point often
        target_url = target_url.replace('\\', '')

        if not re.match(pattern=r'^https?://linkis\.com',
                        string=target_url,
                        flags=re.IGNORECASE):
            if is_http_url(target_url):
                return Request(method='GET', url=target_url)
            else:
                log.error("URL matched, but is not HTTP(s): %s" % target_url)

    log.warning("No URL found for linkis URL: %s" % archive_site_url)

    return None
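
The DOM-map approach is easy to test in isolation; a minimal runnable version against a fabricated page:

from io import StringIO

from lxml import etree

content = '<html><head><meta property="og:url" content="http://foo.bar/article"></head></html>'

html_tree = etree.parse(StringIO(content), etree.HTMLParser())

nodes = html_tree.xpath('//meta[@property="og:url"]')
if len(nodes) > 0:
    print(nodes[0].get('content'))  # http://foo.bar/article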
Example #23
def _make_dummy_bypassed_response(url: str) -> Response:
    """Given a url, make and return a response object with that url and empty content."""
    response = Response(code=200, message='OK', headers={}, data='')
    response.set_request(Request('GET', url))

    return response
Example #24
    def request(self, request: Request) -> Response:
        """Execute a request, return a response.

        All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
        URLs etc."""

        if request is None:
            raise McRequestException("Request is None.")

        request = self.__blacklist_request_if_needed(request=request)

        self.__log_request(request=request)

        try:
            requests_prepared_request = self.__prepare_request(request)
        except Exception as ex:
            raise McRequestException("Unable to prepare request %s: %s" % (
                str(request),
                str(ex),
            ))

        try:
            user_agent_response = self.__execute_request(
                requests_prepared_request)
        except Exception as ex:
            raise McRequestException("Unable to execute request %s: %s" % (
                str(requests_prepared_request),
                str(ex),
            ))

        if user_agent_response.requests_response is None:
            raise McRequestException("Response from 'requests' is None.")

        response = Response(
            requests_response=user_agent_response.requests_response,
            max_size=self.max_size(),
            error_is_client_side=user_agent_response.error_is_client_side,
        )

        # Build the previous request / response chain from the redirects
        current_response = response
        for previous_rq_response in reversed(
                user_agent_response.requests_response.history):
            previous_rq_request = previous_rq_response.request
            previous_response_request = Request.from_requests_prepared_request(
                requests_prepared_request=previous_rq_request)

            # Sometimes reading the (chunked?) previous response's data fails with:
            #
            #      AttributeError: 'NoneType' object has no attribute 'readline'
            #
            # Previous response's data is not that important, so fail rather silently.
            try:
                previous_rq_response.text
            except Exception as ex:
                log.warning("Reading previous response's data failed: %s" %
                            str(ex))
                previous_rq_response.raw_data = io.StringIO('')

            previous_response = Response(
                requests_response=previous_rq_response,
                max_size=self.max_size())
            previous_response.set_request(request=previous_response_request)

            current_response.set_previous(previous=previous_response)
            current_response = previous_response

        # Redirects might have happened, so we have to recreate the request object from the latest page that was
        # redirected to
        response_request = Request.from_requests_prepared_request(
            requests_prepared_request=user_agent_response.requests_response.request)
        response.set_request(response_request)

        return response
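
The chain rebuilding relies on requests keeping every intermediate redirect response in response.history, ordered oldest first; for reference (placeholder URL):

import requests

response = requests.get('http://example.com/redirecting-url')  # placeholder URL

# Oldest redirect first; the code above walks this list in reverse to attach
# each response to its predecessor
for previous in response.history:
    print(previous.status_code, previous.url)
print(response.status_code, response.url)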