Example #1
    def _get_content_from_api(self, query: str, start_date: datetime,
                              end_date: datetime) -> str:
        """Fetch the posts data from thw ch api and return the http response content."""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        return response.decoded_content()
Example #2
class _SitemapWebClient(AbstractWebClient):
    # Some webservers might be generating huge sitemaps on the fly, which is why this timeout is rather big.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__ua',
    ]

    def __init__(self):
        self.__ua = UserAgent()
        self.__ua.set_timeout(self.__HTTP_REQUEST_TIMEOUT)

    def set_max_response_data_length(self,
                                     max_response_data_length: int) -> None:
        self.__ua.set_max_size(max_response_data_length)

    def get(self, url: str) -> AbstractWebClientResponse:
        ua_response = self.__ua.get(url)

        if ua_response.is_success():
            return _SitemapWebClientResponse(ua_response=ua_response)
        else:
            return WebClientErrorResponse(
                message=ua_response.status_line(),
                retryable=ua_response.code() in RETRYABLE_HTTP_STATUS_CODES,
            )
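
The retryable flag above relies on a module-level RETRYABLE_HTTP_STATUS_CODES constant that this snippet doesn't show. A minimal sketch of a plausible definition; the exact set membership is an assumption, not taken from this code:

# Assumed definition; the real constant lives elsewhere in the sitemap module.
RETRYABLE_HTTP_STATUS_CODES = {
    429,  # Too Many Requests
    500,  # Internal Server Error
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
}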
Example #3
def _get_user_agent() -> UserAgent:
    """Get a properly configured user agent."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    return ua
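
Judging from the doubling sequence, set_timing() appears to take a list of retry backoff delays in seconds; treat that as an inference rather than documented behavior. A hypothetical caller, mirroring the response API used throughout these examples:

# Assumed usage; 'log' is the module-level logger these snippets rely on.
ua = _get_user_agent()
response = ua.get("https://example.com/api/resource")
if response.is_success():
    body = response.decoded_content()
else:
    log.warning("fetch failed: " + response.status_line())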
Example #4
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
    """Wait for Solr to start and collections to become available, if needed."""

    # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

    connected = False

    for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1):

        if retry > 0:
            log.debug(f"Retrying Solr connection ({retry})...")

        try:

            ua = UserAgent()
            ua.set_timeout(1)
            response = ua.get(sample_select_url)

            if not response.is_success():
                raise Exception(f"Unable to connect: {response.status_line()}")

            if not response.decoded_content():
                raise Exception("Response is empty.")

            try:
                result = response.decoded_json()
            except Exception as ex:
                raise Exception(f"Unable to decode response: {ex}")

            if not isinstance(result, dict):
                raise Exception(
                    f"Result is not a dictionary: {response.decoded_content()}"
                )

            if 'response' not in result:
                raise Exception(
                    f"Response doesn't have 'response' key: {response.decoded_content()}"
                )

        except Exception as ex:

            log.warning(f"Solr is down, will retry: {ex}")
            time.sleep(1)

        else:
            log.debug("Solr is up!")
            connected = True
            break

    if not connected:
        raise McSolrRequestDidNotStartInTimeException(
            f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up"
        )
Example #5
    def fetch_posts(self, query: dict, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch tweets from archive.org that match the given query for the given day."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        enc_query = urlencode({
            'q': query,
            'date_from': start_arg,
            'date_to': end_arg
        })

        url = "https://searchtweets.archivelab.org/export?" + enc_query

        log.debug("archive.org url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsArchiveTwitterDataException(
                "error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        # sometimes we get null characters, which choke the csv module
        decoded_content = decoded_content.replace('\x00', '')

        meta_tweets = []
        fields = 'user_name user_screen_name lang text timestamp_ms url'.split(' ')

        # Skip the header row.
        lines = decoded_content.splitlines()[1:]
        for row in csv.reader(lines, delimiter="\t"):
            meta_tweet = {}
            for i, field in enumerate(fields):
                meta_tweet[field] = row[i] if i < len(row) else ''

            if 'url' not in meta_tweet or meta_tweet['url'] == '':
                log.warning("meta_tweet '%s' does not have a url" % str(row))
                continue

            meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

            meta_tweets.append(meta_tweet)

        add_tweets_to_meta_tweets(meta_tweets)

        return meta_tweets
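
get_tweet_id_from_url() is imported from elsewhere; given that it's applied to tweet status URLs, it presumably extracts the trailing numeric status ID. A sketch under that assumption:

import re

def get_tweet_id_from_url(url: str) -> int:
    """Return the numeric status ID from a tweet URL, e.g.
    https://twitter.com/someuser/status/1234567890 -> 1234567890."""
    match = re.search(r'/status(?:es)?/(\d+)', url)
    if match is None:
        raise ValueError("not a tweet status URL: " + url)
    return int(match.group(1))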
Example #6
def fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException(
            "no key in mediawords.yml at //crimson_hexagon/key.")

    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
        % (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " +
                                              response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " +
                                              str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets
Example #7
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on ch api using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
            raise McFetchTopicTweetsConfigException(
                "no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " +
                                                  response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " +
                                                  str(data))

        return data
Example #8
    def test_api_request(self):
        """Make an API request, see if it succeeds."""

        credentials = self.univision_credentials()

        handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())
        api_request_url = handler._api_request_url_with_signature_from_config(api_url=credentials.url)
        assert api_request_url, 'API request URL is not empty'

        ua = UserAgent()
        ua.set_timeout(30)

        response = ua.get(api_request_url)
        assert response.is_success(), 'API request was successful'

        json_string = response.decoded_content()
        assert json_string, 'JSON response is not empty'

        json = response.decoded_json()
        assert json.get('status', None) == 'success', "JSON response was successful"
        assert 'data' in json, 'JSON response has "data" key'
Example #9
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on ch api using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
            raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
               (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

        return data
Example #10
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info("Annotating %d characters of text..." % len(text))

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it."
                    % (
                        text_length,
                        self.__TEXT_LENGTH_LIMIT,
                    ))
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                "Unable to create annotator request for text '%s': %s" % (
                    text,
                    str(ex),
                ))

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {url}"
        assert port, f"API URL port is not set for URL {url}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                "Annotator service at {url} didn't come up in {timeout} seconds, exiting..."
                .format(
                    url=request.url(),
                    timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
                ))

        log.debug("Sending request to %s..." % request.url())
        response = ua.request(request)
        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning("Request failed: %s" % response.decoded_content())

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    "The request timed out, giving up; text length: %d; text: %s"
                    % (
                        len(text),
                        text,
                    ))

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error("User agent error: %s: %s" % (
                    response.status_line(),
                    results_string,
                ))

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error('%s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        'Annotator service was unable to process the download: %s'
                        % results_string)

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error('Unknown HTTP response: %s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                "Annotator returned nothing for text: %s" % text)

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error("Unable to parse JSON response: %s\nJSON string: %s" %
                        (
                            str(ex),
                            results_string,
                        ))
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                "Unable to determine whether response is valid: %s\nJSON string: %s"
                % (str(ex), results_string))
        if not response_is_valid:
            fatal_error("Annotator response is invalid for JSON string: %s" %
                        results_string)

        log.info("Done annotating %d characters of text." % len(text))

        return results
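
The _request_for_text() hook used above is supplied by a subclass. A minimal hypothetical implementation, reusing the Request API seen in the other examples here; the endpoint URL and payload shape are assumptions:

    # Hypothetical subclass hook; posts the text as JSON to an annotator endpoint.
    def _request_for_text(self, text: str) -> Request:
        request = Request(method='POST', url='http://annotator:8080/annotate')
        request.set_content_type('application/json; charset=utf-8')
        request.set_content(encode_json({'text': text}))
        return request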
Example #11
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass if
    something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values may be either strings or lists of strings if
                   multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Parameters should be a dictionary.")

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):

        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # some stories consistently return empty content, so just return a soft error and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = f"API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- return it
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")
    return data
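
For illustration, a caller fetching engagement stats for a URL object might look like the snippet below; the node and params are assumptions about how the Graph API gets used here, not taken from this code:

# Hypothetical usage; 'config' is a FacebookConfig instance.
# An empty node plus an 'id' parameter queries the Graph API about a URL.
data = _api_request(
    node='',
    params={'id': 'https://example.com/story.html', 'fields': 'engagement'},
    config=config,
)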
Example #12
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(
                f"Params is not a dictionary: {params}")

    if content:
        if not (isinstance(content, str) or isinstance(content, dict)):
            raise McSolrRequestInvalidParamsException(
                f"Content is not a string not a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException(
            "XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(
                f"Content-Type is not set; falling back to '{fallback_content_type}'"
            )
            content_type = fallback_content_type

        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length',
                           value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(
            f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
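
A typical call, assuming Solr is reachable through the default config; the query values are illustrative:

# Hypothetical usage: solr_request() returns the raw body as a string,
# so the caller decodes the JSON itself.
import json

raw = solr_request(path='select', params={'q': 'media_id:1', 'rows': '10', 'wt': 'json'})
result = json.loads(raw)
num_found = result['response']['numFound']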
Example #13
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info(f"Annotating {len(text)} characters of text...")

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    f"Text length ({text_length}) has exceeded the request text length limit"
                    f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                f"Unable to create annotator request for text '{text}': {ex}")

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {request.url()}"
        assert port, f"API URL port is not set for URL {request.url()}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
                f"exiting...")

        log.debug(f"Sending request to {request.url()}...")

        # Try requesting a few times because sometimes it throws a connection error, e.g.:
        #
        #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
        #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
        #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
        #   'Connection reset by peer'))
        #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
        #   ConnectionResetError(104, 'Connection reset by peer'))
        response = None
        retries = 60
        sleep_between_retries = 1
        for retry in range(1, retries + 1):

            if retry > 1:
                log.warning(f"Retrying ({retry} / {retries})...")

            response = ua.request(request)

            if response.is_success():
                break
            else:
                if response.error_is_client_side():
                    log.error(
                        f"Request failed on the client side: {response.decoded_content()}"
                    )
                    time.sleep(sleep_between_retries)
                else:
                    break

        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning(f"Request failed: {response.decoded_content()}")

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    f"The request timed out, giving up; text length: {len(text)}; text: {text}"
                )

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error(
                    f"User agent error: {response.status_line()}: {results_string}"
                )

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error(f'{response.status_line()}: {results_string}')

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        f'Annotator service was unable to process the download: {results_string}'
                    )

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error(
                        f'Unknown HTTP response: {response.status_line()}: {results_string}'
                    )

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                f"Annotator returned nothing for text: {text}")

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error(
                f"Unable to parse JSON response: {ex}\nJSON string: {results_string}"
            )
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}"
            )
        if not response_is_valid:
            fatal_error(
                f"Annotator response is invalid for JSON string: {results_string}"
            )

        log.info(f"Done annotating {len(text)} characters of text.")

        return results
Example #14
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.
    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)
    assert api_url_hostname, f"API URL hostname is not set for URL {api_url}"
    assert api_url_port, f"API URL port is not set for URL {api_url}"

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):

        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break
        else:
            log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response
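
Illustrative usage only; the page HTML below is a stand-in:

page_html = "<html><body><article><p>Story text.</p></article></body></html>"
result = extract_article_html_from_page_html(content=page_html)
article_html = result['extracted_html']
extractor_version = result['extractor_version']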
Example #15
    def fetch_posts(self, query: str, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch day of tweets from crimson hexagon"""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.warning("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                post = {
                    'post_id': mt['tweet_id'],
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': mt['tweet']['created_at'],
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
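
Assuming the enclosing class is instantiated as 'fetcher', a one-week fetch might look like this; the monitor ID is a placeholder:

import datetime

# Hypothetical usage; '12345' stands in for a real Crimson Hexagon monitor ID.
posts = fetcher.fetch_posts(
    query='12345',
    start_date=datetime.datetime(2019, 1, 1),
    end_date=datetime.datetime(2019, 1, 7),
)
for post in posts:
    print(post['post_id'], post['author'], post['publish_date'])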