def _get_content_from_api(self, query: str, start_date: datetime, end_date: datetime) -> str:
    """Fetch the posts data from the CH API and return the HTTP response content."""
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    return response.decoded_content()
class _SitemapWebClient(AbstractWebClient):
    # Some webservers might be generating huge sitemaps on the fly, which is why the timeout is rather big.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__ua',
    ]

    def __init__(self):
        self.__ua = UserAgent()
        self.__ua.set_timeout(self.__HTTP_REQUEST_TIMEOUT)

    def set_max_response_data_length(self, max_response_data_length: int) -> None:
        self.__ua.set_max_size(max_response_data_length)

    def get(self, url: str) -> AbstractWebClientResponse:
        ua_response = self.__ua.get(url)

        if ua_response.is_success():
            return _SitemapWebClientResponse(ua_response=ua_response)
        else:
            return WebClientErrorResponse(
                message=ua_response.status_line(),
                retryable=ua_response.code() in RETRYABLE_HTTP_STATUS_CODES,
            )
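# Hedged usage sketch (hypothetical; not part of the original module). Assumes the sitemap
# fetcher in this package accepts a custom web client, as the AbstractWebClient interface
# implies.
def _example_fetch_sitemap(url: str) -> AbstractWebClientResponse:
    """Fetch a single sitemap URL with the client above, capping the response at 100 MB."""
    web_client = _SitemapWebClient()
    web_client.set_max_response_data_length(100 * 1024 * 1024)
    # Success yields a _SitemapWebClientResponse; failures come back as a WebClientErrorResponse
    # whose 'retryable' flag is derived from RETRYABLE_HTTP_STATUS_CODES.
    return web_client.get(url)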
def _get_user_agent() -> UserAgent:
    """Get a properly configured user agent."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])
    return ua
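# Hedged usage sketch (hypothetical): _get_user_agent() centralizes the UserAgent settings that
# the fetchers in this section otherwise repeat inline (100 MB max response, 90 s timeout,
# exponential backoff over up to ten retries). The exception below is illustrative only; the
# real fetchers raise their own module-specific classes.
def _example_fetch_with_shared_ua(url: str) -> str:
    """Fetch a URL with the shared user agent and return the decoded body."""
    ua = _get_user_agent()
    response = ua.get(url)
    if not response.is_success():
        raise Exception("error fetching posts: " + response.decoded_content())
    return response.decoded_content()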
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
    """Wait for Solr to start and collections to become available, if needed."""

    # Search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

    connected = False

    for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1):

        if retry > 0:
            log.debug(f"Retrying Solr connection ({retry})...")

        try:

            ua = UserAgent()
            ua.set_timeout(1)
            response = ua.get(sample_select_url)

            if not response.is_success():
                raise Exception(f"Unable to connect: {response.status_line()}")

            if not response.decoded_content():
                raise Exception("Response is empty.")

            try:
                result = response.decoded_json()
            except Exception as ex:
                raise Exception(f"Unable to decode response: {ex}")

            if not isinstance(result, dict):
                raise Exception(f"Result is not a dictionary: {response.decoded_content()}")

            if 'response' not in result:
                raise Exception(f"Response doesn't have 'response' key: {response.decoded_content()}")

        except Exception as ex:
            log.warning(f"Solr is down, will retry: {ex}")
            time.sleep(1)

        else:
            log.debug("Solr is up!")
            connected = True
            break

    if not connected:
        raise McSolrRequestDidNotStartInTimeException(
            f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up")
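# Note on the loop above: each attempt uses a fresh UserAgent with a 1-second timeout and sleeps
# 1 second on failure, so the total wait is bounded at roughly 2 * __SOLR_STARTUP_TIMEOUT seconds.
# Hedged sketch (hypothetical) of the same "poll until healthy" shape for any HTTP service;
# assumes UserAgent, log, and time from this module's imports.
def _example_wait_for_http_service(health_url: str, retries: int) -> None:
    """Poll a health endpoint once per second until it responds successfully."""
    for _ in range(retries):
        ua = UserAgent()
        ua.set_timeout(1)
        try:
            if ua.get(health_url).is_success():
                return
        except Exception as ex:
            log.warning(f"Service is down, will retry: {ex}")
        time.sleep(1)
    raise Exception(f"Service at {health_url} didn't come up in {retries} retries")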
def fetch_posts(self, query: dict, start_date: datetime, end_date: datetime) -> list:
    """Fetch tweets from archive.org that match the given query in the given date range."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    enc_query = urlencode({'q': query, 'date_from': start_arg, 'date_to': end_arg})

    url = "https://searchtweets.archivelab.org/export?" + enc_query

    log.debug("archive.org url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsArchiveTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    # Sometimes we get null characters, which choke the csv module
    decoded_content = decoded_content.replace('\x00', '')

    meta_tweets = []
    lines = decoded_content.splitlines()[1:]
    fields = 'user_name user_screen_name lang text timestamp_ms url'.split(' ')
    for row in csv.reader(lines, delimiter="\t"):
        meta_tweet = {}
        for i, field in enumerate(fields):
            meta_tweet[field] = row[i] if i < len(row) else ''

        if 'url' not in meta_tweet or meta_tweet['url'] == '':
            log.warning("meta_tweet '%s' does not have a url" % str(row))
            continue

        meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

        meta_tweets.append(meta_tweet)

    add_tweets_to_meta_tweets(meta_tweets)

    return meta_tweets
def fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")
    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets
def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
    """Implement fetch_posts on the CH API using the config data from mediawords.yml."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")
    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

    return data
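# Hedged usage sketch (hypothetical monitor ID and date): fetch one day of posts for a Crimson
# Hexagon monitor and log how many came back, relying on the 'posts' key that the monitor/posts
# endpoint returns alongside 'status'.
def _example_fetch_one_day(ch_monitor_id: int) -> int:
    """Fetch posts for 2019-01-01 and return the number of posts."""
    data = fetch_posts(ch_monitor_id=ch_monitor_id, day=datetime.datetime(2019, 1, 1))
    posts = data.get('posts', [])
    log.info("fetched %d posts" % len(posts))
    return len(posts)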
def test_api_request(self):
    """Make an API request, see if it succeeds."""
    credentials = self.univision_credentials()

    handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())
    api_request_url = handler._api_request_url_with_signature_from_config(api_url=credentials.url)
    assert api_request_url, 'API request URL is not empty'

    ua = UserAgent()
    ua.set_timeout(30)

    response = ua.get(api_request_url)
    assert response.is_success(), 'API request should succeed'

    json_string = response.decoded_content()
    assert json_string, 'JSON response is not empty'

    json = response.decoded_json()
    assert json.get('status', None) == 'success', "JSON response should report success"
    assert 'data' in json, 'JSON response has "data" key'
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list."""

    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info("Annotating %d characters of text..." % len(text))

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it." % (
                    text_length,
                    self.__TEXT_LENGTH_LIMIT,
                ))
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error("Unable to create annotator request for text '%s': %s" % (
            text,
            str(ex),
        ))

    # Wait for the service's HTTP port to become open as the service might be still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            "Annotator service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=request.url(),
                timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
            ))

    log.debug("Sending request to %s..." % request.url())
    response = ua.request(request)
    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed request, or is it an extraction error
        log.warning("Request failed: %s" % response.decoded_content())

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                "The request timed out, giving up; text length: %d; text: %s" % (
                    len(text),
                    text,
                ))

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error("User agent error: %s: %s" % (
                response.status_line(),
                results_string,
            ))

        else:
            # Error was generated by the server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error('%s: %s' % (
                    response.status_line(),
                    results_string,
                ))

            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    'Annotator service was unable to process the download: %s' % results_string)

            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error('Unknown HTTP response: %s: %s' % (
                    response.status_line(),
                    results_string,
                ))

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException("Annotator returned nothing for text: %s" % text)

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
        # fatal_error() here
        fatal_error("Unable to parse JSON response: %s\nJSON string: %s" % (
            str(ex),
            results_string,
        ))
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(
            "Unable to determine whether response is valid: %s\nJSON string: %s" % (str(ex), results_string))
    if not response_is_valid:
        fatal_error("Annotator response is invalid for JSON string: %s" % results_string)

    log.info("Done annotating %d characters of text." % len(text))

    return results
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make a Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass if
    something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values might be either strings or lists of strings if
                   multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):
        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # Some stories consistently return empty content, so just return a soft error and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = "API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- break from the retry loop
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message="No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")

    return data
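# Hedged usage sketch (hypothetical node and parameters; the real callers live elsewhere in this
# module): a URL-metadata style Graph API lookup. The empty node targets the API root, and the
# 'id'/'fields' parameters here are illustrative assumptions, not a documented contract of this
# codebase.
def _example_url_lookup(url: str, config: FacebookConfig) -> Union[dict, list]:
    """Look up Graph API metadata for a URL via _api_request()."""
    return _api_request(
        node='',
        params={'id': url, 'fields': 'engagement'},
        config=config,
    )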
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object.
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(f"Params is not a dictionary: {params}")

    if content:
        if not (isinstance(content, str) or isinstance(content, dict)):
            raise McSolrRequestInvalidParamsException(f"Content is neither a string nor a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException("XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(f"Content-Type is not set; falling back to '{fallback_content_type}'")
            content_type = fallback_content_type

        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length', value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")
    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
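# Hedged usage sketch (hypothetical query; assumes a decode_json helper like the one used by the
# other fetchers in this section is importable here): the GET branch above handles plain selects,
# while the POST branch handles bodies with an explicit Content-Type.
def _example_count_stories_for_media(media_id: int) -> int:
    """Run a rows=0 select and return Solr's numFound for a single media source."""
    response = solr_request(
        path='select',
        params={'q': f'media_id:{media_id}', 'rows': '0', 'wt': 'json'},
    )
    return dict(decode_json(response))['response']['numFound']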
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list."""

    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info(f"Annotating {len(text)} characters of text...")

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                f"Text length ({text_length}) has exceeded the request text length limit "
                f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error(f"Unable to create annotator request for text '{text}': {ex}")

    # Wait for the service's HTTP port to become open as the service might be still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
            f"exiting...")

    log.debug(f"Sending request to {request.url()}...")

    # Try requesting a few times because sometimes it throws a connection error, e.g.:
    #
    #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
    #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
    #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
    #   'Connection reset by peer'))
    #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
    #   ConnectionResetError(104, 'Connection reset by peer'))
    response = None
    retries = 60
    sleep_between_retries = 1

    for retry in range(1, retries + 1):
        if retry > 1:
            log.warning(f"Retrying ({retry} / {retries})...")

        response = ua.request(request)

        if response.is_success():
            break
        else:
            if response.error_is_client_side():
                log.error(f"Request failed on the client side: {response.decoded_content()}")
                time.sleep(sleep_between_retries)
            else:
                break

    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed request, or is it an extraction error
        log.warning(f"Request failed: {response.decoded_content()}")

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                f"The request timed out, giving up; text length: {len(text)}; text: {text}")

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error(f"User agent error: {response.status_line()}: {results_string}")

        else:
            # Error was generated by the server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error(f'{response.status_line()}: {results_string}')

            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    f'Annotator service was unable to process the download: {results_string}')

            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error(f'Unknown HTTP response: {response.status_line()}: {results_string}')

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException(f"Annotator returned nothing for text: {text}")

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
        # fatal_error() here
        fatal_error(f"Unable to parse JSON response: {ex}\nJSON string: {results_string}")
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}")
    if not response_is_valid:
        fatal_error(f"Annotator response is invalid for JSON string: {results_string}")

    log.info(f"Done annotating {len(text)} characters of text.")

    return results
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.

    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)
    assert api_url_hostname, f"API URL hostname is not set for URL {api_url}"
    assert api_url_port, f"API URL port is not set for URL {api_url}"

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job and should fail one way or another;
        #    exit(1) is just one of the ways how it can fail
        # 3) If it's some production code that needs something to get extracted, and if we were to throw an exception
        #    instead of doing exit(1), the caller might treat this exception as a failure to extract this one specific
        #    input HTML file, and so it might mis-extract a bunch of stories that way (making it hard for us to spot
        #    the problem and time-consuming to fix it later (e.g. there would be a need to manually re-extract a
        #    million of stories))
        #
        # A better solution instead of exit(1) might be to throw different kinds of exceptions and handle them
        # appropriately in the caller, but with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors
    # by default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):
        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break
        else:
            log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response
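# Hedged usage sketch (hypothetical HTML): callers get both the extracted fragment and the
# extractor version tag, so the two can be stored side by side.
def _example_extract() -> str:
    """Extract article HTML from a tiny test page and return the fragment."""
    page_html = '<html><body><article><p>Hello!</p></article></body></html>'
    result = extract_article_html_from_page_html(content=page_html)
    log.info(f"extracted with {result['extractor_version']}")
    return result['extracted_html']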
def fetch_posts(self, query: str, start_date: datetime, end_date: datetime) -> list:
    """Fetch tweets that match the query from Crimson Hexagon for the given date range."""
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McPostsCHTwitterDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        log.warning("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            post = {
                'post_id': mt['tweet_id'],
                'data': mt,
                'content': mt['tweet']['text'],
                'publish_date': mt['tweet']['created_at'],
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url'],
            }
            posts.append(post)

    return posts
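# Hedged usage sketch (hypothetical dates and monitor ID; the fetcher instance is whatever class
# defines fetch_posts() above): the method returns normalized post dictionaries, so a caller can
# iterate them uniformly regardless of the upstream source.
def _example_iterate_posts(fetcher) -> None:
    """Fetch a week of posts from a fetcher instance exposing fetch_posts() and log them."""
    posts = fetcher.fetch_posts(
        query='12345',
        start_date=datetime.datetime(2019, 1, 1),
        end_date=datetime.datetime(2019, 1, 7),
    )
    for post in posts:
        log.debug("%s: %s" % (post['post_id'], post['content']))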