def _request_for_text(self, text: str) -> Request:

        text = decode_object_from_bytes_if_needed(text)

        url = self.__tagger_config.annotator_url()
        if url is None:
            raise McTagsFromJSONAnnotationException("Unable to determine NYTLabels annotator URL to use.")

        # Create JSON request
        log.debug("Converting text to JSON request...")
        try:
            text_json = encode_json({'text': text, 'models': [self._ENABLED_MODEL]})
        except Exception as ex:
            # Not critical, might happen to some stories, no need to shut down the annotator
            raise McTagsFromJSONAnnotationException(
                "Unable to encode text to a JSON request: %(exception)s\nText: %(text)s" % {
                    'exception': str(ex),
                    'text': text,
                }
            )
        log.debug("Done converting text to JSON request.")

        request = Request(method='POST', url=url)
        request.set_content_type('application/json; charset=utf-8')
        request.set_content(text_json)

        return request
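
A minimal sketch of the JSON body this method produces, assuming `encode_json` behaves like `json.dumps`; the sample text and model name below are illustrative, not taken from the tagger:

import json

# Hypothetical values; the real text and model name come from the tagger instance.
sample_text = 'The mayor spoke in Boston on Tuesday.'
enabled_model = 'descriptors600'  # assumed model name, for illustration only

# Mirrors encode_json({'text': text, 'models': [self._ENABLED_MODEL]}) above.
body = json.dumps({'text': sample_text, 'models': [enabled_model]})
print(body)
# {"text": "The mayor spoke in Boston on Tuesday.", "models": ["descriptors600"]}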
Example #2
    def _request_for_text(self, text: str) -> Request:

        text = decode_object_from_bytes_if_needed(text)

        # NYTLabels annotator URL
        config = py_get_config()
        url = config.get('nytlabels', {}).get('annotator_url', None)
        if url is None:
            raise McNYTLabelsAnnotatorException(
                "Unable to determine NYTLabels annotator URL to use.")

        # Create JSON request
        log.debug("Converting text to JSON request...")
        try:
            text_json = encode_json({'text': text})
        except Exception as ex:
            # Not critical, might happen to some stories, no need to shut down the annotator
            raise McNYTLabelsAnnotatorException(
                "Unable to encode text to a JSON request: %(exception)s\nText: %(text)s"
                % {
                    'exception': str(ex),
                    'text': text,
                })
        log.debug("Done converting text to JSON request.")

        request = Request(method='POST', url=url)
        request.set_content_type('application/json; charset=utf-8')
        request.set_content(text_json)

        return request
Example #3
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

        ua = UserAgent()
        url_with_credentials = self._api_request_url_with_signature_from_config(api_url=download['url'])
        request = Request(method='GET', url=url_with_credentials)
        response = ua.request(request)

        return response
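
`_api_request_url_with_signature_from_config()` is not shown in this snippet; the sketch below is purely hypothetical, illustrating one common way such a helper could sign a request URL (HMAC-SHA256 over the URL, appended as a query parameter). All names here are assumptions:

import hashlib
import hmac

from furl import furl

def api_request_url_with_signature(api_url: str, secret_key: str) -> str:
    # Hypothetical: sign the full request URL with HMAC-SHA256 and append
    # the hex digest as a query parameter.
    digest = hmac.new(
        key=secret_key.encode('utf-8'),
        msg=api_url.encode('utf-8'),
        digestmod=hashlib.sha256,
    ).hexdigest()
    return str(furl(api_url).add({'signature': digest}))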
Example #4
    def _request_for_text(self, text: str) -> Request:

        text = decode_object_from_bytes_if_needed(text)

        # CLIFF annotator URL
        url = self.__tagger_config.annotator_url()
        if url is None:
            raise McTagsFromJSONAnnotationException("Unable to determine CLIFF annotator URL to use.")

        request = Request(method='POST', url=url)
        request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
        request.set_content({'q': text})

        return request
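
Unlike the NYTLabels examples above, CLIFF takes a form-encoded body rather than JSON. A small illustration of the resulting payload, assuming `set_content()` URL-encodes a dict the same way `urllib.parse.urlencode` does (the story text is illustrative):

from urllib.parse import urlencode

# Illustrative story text only; the real value is the text passed in.
payload = urlencode({'q': 'Protests erupted in Cairo and Paris.'})
print(payload)  # q=Protests+erupted+in+Cairo+and+Paris.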
Example #5
    def _request_for_text(self, text: str) -> Request:

        text = decode_object_from_bytes_if_needed(text)

        # CLIFF annotator URL
        config = py_get_config()
        url = config.get('cliff', {}).get('annotator_url', None)
        if url is None:
            raise McCLIFFAnnotatorException(
                "Unable to determine CLIFF annotator URL to use.")

        request = Request(method='POST', url=url)
        request.set_content_type(
            'application/x-www-form-urlencoded; charset=utf-8')
        request.set_content({'q': text})

        return request
Example #6
def solr_request(path: str,
                 params: Optional[SolrParams] = None,
                 content: Optional[Union[str, SolrParams]] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(
                f"Params is not a dictionary: {params}")

    if content:
        if not (isinstance(content, str) or isinstance(content, dict)):
            raise McSolrRequestInvalidParamsException(
                f"Content is not a string not a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629
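    # (the 'xmlparser' query parser in affected Solr versions could be abused
    # for XXE / remote code execution, so reject any query mentioning it)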
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException(
            "XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(
                f"Content-Type is not set; falling back to '{fallback_content_type}'"
            )
            content_type = fallback_content_type

        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length',
                           value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(
            f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
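
A hedged usage sketch of `solr_request()`. The `/mediacloud/` collection path is hardcoded in the function above; the query fields and row counts below are illustrative:

import json

# GET request: params are appended to the path's query string.
raw = solr_request(path='select', params={'q': 'title:climate', 'rows': 10, 'wt': 'json'})
results = json.loads(raw)

# POST request: a dict body is form-encoded automatically by the function.
raw = solr_request(
    path='select',
    content={'q': '*:*', 'rows': 0, 'wt': 'json'},
    content_type='application/x-www-form-urlencoded; charset=utf-8',
)
print(json.loads(raw)['response']['numFound'])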
Example #7
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.
    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to open, as the service might still be starting up somewhere
    api_uri = furl(api_url)
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)
    assert api_url_hostname, f"API URL hostname is not set for URL {api_url}"
    assert api_url_port, f"API URL port is not set for URL {api_url}"

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application,
        # because there's no point in continuing to run it whatsoever:
        #
        # 1) If the extractor service didn't come up in the given time, it
        #    won't suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and we were to throw an exception instead of doing
        #    exit(1), the caller might treat the exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making the problem hard
        #    to spot and time-consuming to fix later, e.g. there would be a
        #    need to manually re-extract a million stories)
        #
        # A better solution than exit(1) might be to throw different kinds of
        # exceptions and handle them appropriately in the caller, but with the
        # Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() only retries on retryable HTTP status codes and, by default, doesn't retry on
    # connection errors, as such retries might have side effects (e.g. an API getting called multiple times).
    # So we retry extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):

        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break
        else:
            log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response
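
A short usage sketch, assuming the extractor service is up; the sample HTML is illustrative:

page_html = '<html><body><article><p>Storm hits the coast.</p></article></body></html>'
result = extract_article_html_from_page_html(content=page_html)
print(result['extracted_html'])     # article HTML as returned by the extractor
print(result['extractor_version'])  # extractor version tag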