def _request_for_text(self, text: str) -> Request:
    """Return a POST request for annotating ``text`` with the NYTLabels service.

    :param text: Story text to annotate.
    :return: Request with a JSON body of the form {'text': ..., 'models': [...]}.
    :raises McTagsFromJSONAnnotationException: if the annotator URL is not configured
        or the text cannot be encoded to JSON.
    """
    text = decode_object_from_bytes_if_needed(text)

    url = self.__tagger_config.annotator_url()
    if url is None:
        # FIX: message previously read "NYLabels"; the service is named "NYTLabels"
        # everywhere else in this codebase (config key, sibling exception classes).
        raise McTagsFromJSONAnnotationException("Unable to determine NYTLabels annotator URL to use.")

    # Create JSON request
    log.debug("Converting text to JSON request...")

    try:
        text_json = encode_json({'text': text, 'models': [self._ENABLED_MODEL]})
    except Exception as ex:
        # Not critical, might happen to some stories, no need to shut down the annotator
        raise McTagsFromJSONAnnotationException(
            "Unable to encode text to a JSON request: %(exception)s\nText: %(text)s" % {
                'exception': str(ex),
                'text': text,
            }
        )

    log.debug("Done converting text to JSON request.")

    request = Request(method='POST', url=url)
    request.set_content_type('application/json; charset=utf-8')
    request.set_content(text_json)

    return request
def _request_for_text(self, text: str) -> Request:
    """Build a JSON POST request for annotating ``text`` with the NYTLabels service."""
    text = decode_object_from_bytes_if_needed(text)

    # NYTLabels annotator URL comes from the global configuration
    annotator_url = py_get_config().get('nytlabels', {}).get('annotator_url', None)
    if annotator_url is None:
        raise McNYTLabelsAnnotatorException("Unable to determine NYTLabels annotator URL to use.")

    # Create JSON request
    log.debug("Converting text to JSON request...")
    try:
        text_json = encode_json({'text': text})
    except Exception as ex:
        # Not critical, might happen to some stories, no need to shut down the annotator
        error_message = "Unable to encode text to a JSON request: %(exception)s\nText: %(text)s" % {
            'exception': str(ex),
            'text': text,
        }
        raise McNYTLabelsAnnotatorException(error_message)
    log.debug("Done converting text to JSON request.")

    annotator_request = Request(method='POST', url=annotator_url)
    annotator_request.set_content_type('application/json; charset=utf-8')
    annotator_request.set_content(text_json)

    return annotator_request
def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    """Mark the download row as being fetched, then GET its URL with API credentials.

    :param db: Database handler used to update the download's state.
    :param download: Download row dict; must contain 'downloads_id' and 'url'.
    :return: HTTP response from fetching the signed API URL.
    """
    download = decode_object_from_bytes_if_needed(download)

    # Record that the fetch is starting before issuing the request
    download['download_time'] = sql_now()
    download['state'] = 'fetching'
    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    # Sign the API URL from configuration and fetch it
    signed_url = self._api_request_url_with_signature_from_config(api_url=download['url'])
    user_agent = UserAgent()
    return user_agent.request(Request(method='GET', url=signed_url))
def _request_for_text(self, text: str) -> Request:
    """Build a form-encoded POST request for annotating ``text`` with the CLIFF service."""
    text = decode_object_from_bytes_if_needed(text)

    # CLIFF annotator URL
    annotator_url = self.__tagger_config.annotator_url()
    if annotator_url is None:
        raise McTagsFromJSONAnnotationException("Unable to determine CLIFF annotator URL to use.")

    cliff_request = Request(method='POST', url=annotator_url)
    cliff_request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
    cliff_request.set_content({'q': text})

    return cliff_request
def _request_for_text(self, text: str) -> Request:
    """Build a form-encoded POST request for annotating ``text`` with the CLIFF service."""
    text = decode_object_from_bytes_if_needed(text)

    # CLIFF annotator URL comes from the global configuration
    cliff_config = py_get_config().get('cliff', {})
    annotator_url = cliff_config.get('annotator_url', None)
    if annotator_url is None:
        raise McCLIFFAnnotatorException("Unable to determine CLIFF annotator URL to use.")

    cliff_request = Request(method='POST', url=annotator_url)
    cliff_request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
    cliff_request.set_content({'q': text})

    return cliff_request
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(f"Params is not a dictionary: {params}")

    if content:
        if not (isinstance(content, str) or isinstance(content, dict)):
            # FIX: message previously read "not a string not a dictionary"
            raise McSolrRequestInvalidParamsException(f"Content is not a string nor a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629: reject queries that try to invoke Solr's XML query parser
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException("XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:
        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(f"Content-Type is not set; falling back to '{fallback_content_type}'")
            content_type = fallback_content_type

        if isinstance(content, dict):
            # doseq=True encodes repeated keys for list values the way Solr expects
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length', value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:
        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.

    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    :raises McExtractArticleFromPageException: if all extraction attempts fail.
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)
    assert api_url_hostname, f"API URL hostname is not set for URL {api_url}"
    assert api_url_port, f"API URL port is not set for URL {api_url}"

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors
    # by default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):
        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break
        else:
            log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        # FIX: message previously read "Extraction of N characters; failed;" (stray semicolon
        # split the subject from the verb)
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response