def _get_content_from_api(self, query: str, start_date: datetime, end_date: datetime) -> str:
    """Fetch the posts data from the CH API and return the HTTP response content."""
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
        (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    return response.decoded_content()

def _get_user_agent() -> UserAgent:
    """Get a properly configured user agent."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    return ua

def fetch_posts(self, query: dict, start_date: datetime, end_date: datetime) -> list:
    """Fetch tweets from archive.org that match the given query for the given date range."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    enc_query = urlencode({'q': query, 'date_from': start_arg, 'date_to': end_arg})

    url = "https://searchtweets.archivelab.org/export?" + enc_query

    log.debug("archive.org url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsArchiveTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    # sometimes we get null characters, which choke the csv module
    decoded_content = decoded_content.replace('\x00', '')

    meta_tweets = []
    lines = decoded_content.splitlines()[1:]
    fields = 'user_name user_screen_name lang text timestamp_ms url'.split(' ')
    for row in csv.reader(lines, delimiter="\t"):
        meta_tweet = {}
        for i, field in enumerate(fields):
            meta_tweet[field] = row[i] if i < len(row) else ''

        if 'url' not in meta_tweet or meta_tweet['url'] == '':
            log.warning("meta_tweet '%s' does not have a url" % str(row))
            continue

        meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

        meta_tweets.append(meta_tweet)

    add_tweets_to_meta_tweets(meta_tweets)

    return meta_tweets

def fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")
    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
        (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets

def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
    """Implement fetch_posts on the CH API using the config data from mediawords.yml."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")
    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
        (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

    return data

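# A minimal, hypothetical usage sketch for the fetch_posts() variant directly above
# (not part of the original module). The monitor id and date are placeholder values;
# the call assumes mediawords.yml carries the //crimson_hexagon/key entry that the
# function itself checks for, and `log` is the module-level logger used throughout.
def _example_fetch_ch_day():
    import datetime

    day = datetime.datetime(2020, 1, 1)
    data = fetch_posts(ch_monitor_id=12345, day=day)

    # The CH response nests the individual tweets under 'posts'.
    for post in data['posts']:
        log.debug("post url: %s" % post['url'])
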
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list."""

    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info("Annotating %d characters of text..." % len(text))

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it." % (
                    text_length,
                    self.__TEXT_LENGTH_LIMIT,
                ))
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error("Unable to create annotator request for text '%s': %s" % (
            text,
            str(ex),
        ))

    # Wait for the service's HTTP port to become open as the service might be
    # still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            "Annotator service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=request.url(),
                timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
            ))

    log.debug("Sending request to %s..." % request.url())
    response = ua.request(request)
    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always
    # return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed
        # request, or is it an extraction error
        log.warning("Request failed: %s" % response.decoded_content())

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                "The request timed out, giving up; text length: %d; text: %s" % (
                    len(text),
                    text,
                ))

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error("User agent error: %s: %s" % (
                response.status_line(),
                results_string,
            ))
        else:
            # Error was generated by server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error('%s: %s' % (
                    response.status_line(),
                    results_string,
                ))
            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    'Annotator service was unable to process the download: %s' % results_string)
            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error('Unknown HTTP response: %s: %s' % (
                    response.status_line(),
                    results_string,
                ))

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException("Annotator returned nothing for text: %s" % text)

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
        # fatal_error() here
        fatal_error("Unable to parse JSON response: %s\nJSON string: %s" % (
            str(ex),
            results_string,
        ))
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error("Unable to determine whether response is valid: %s\nJSON string: %s" % (str(ex), results_string))
    if not response_is_valid:
        fatal_error("Annotator response is invalid for JSON string: %s" % results_string)

    log.info("Done annotating %d characters of text." % len(text))

    return results

class AssociatedPressAPI:
    """Object used to interface with the Associated Press API and to return data from various API endpoints."""

    def __init__(self, ap_config: Optional[APCrawlerConfig] = None):
        self.api_key = None
        self.api_version = '1.1'
        self.retry_limit = 5
        self.ratelimit_info = dict()
        self.ua = UserAgent()
        self.ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256])

        if not ap_config:
            ap_config = APCrawlerConfig()

        self.api_key = ap_config.api_key()

        if not self.api_key:
            raise McAPMissingAPIKey("API key configuration data missing for associated_press.")

    def feed(self, **kwargs) -> dict:
        """Feed API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Search-and-Feed/#feed)

        METHOD: GET

        ENDPOINT PARAMETERS:

        q: Query Expression

        include, exclude: Parameters used to customize the fields returned in the response.

        text_links: Specifies the format of the text renditions (stories, captions, scripts and shotlists) to return
        in the response. For stories, the valid value is nitf (NITF) or anpa (ANPA 1312). For captions, scripts and
        shotlists, the valid value is nitf (NITF). The value of all returns all available formats (this is the
        default).

        page_size: The maximum number of items to return per page. The default is 10 items with a maximum of 100 per
        page.

        versions: Specifies whether to return all available versions of the content item and all ANPA filings or only
        the latest (the same story in the ANPA format may be filed multiple times; for example, with a different
        category code).

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.
        """
        url = 'https://api.ap.org/media/v/content/feed'
        api_method = 'feed'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        feed_data = self._make_request(url, params)
        return json.loads(feed_data)['data']

    def search(self, **kwargs) -> dict:
        """Search API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Search-and-Feed/#search)

        METHOD: GET

        ENDPOINT PARAMETERS:

        q: Query Expression

        include, exclude: Parameters used to customize the fields returned in the response.

        text_links: Specifies the format of the text renditions (stories, captions, scripts and shotlists) to return
        in the response. For stories, the valid value is nitf (NITF) or anpa (ANPA 1312). For captions, scripts and
        shotlists, the valid value is nitf (NITF). The value of all returns all available formats (this is the
        default).

        sort: The sort order of the returned results. By default, the results are sorted by relevance (meta.score),
        the most relevant items first, regardless of the time period. Valid options are:
            versioncreated: desc. The latest items first (reverse chronological order).
            versioncreated: asc. The oldest items first (chronological order).

        page_size: The maximum number of items to return per page. The default is 10 items with a maximum of 100 per
        page.

        page: The requested page number within the set of search results. Page numbers start at 1.

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.
        """
        url = 'https://api.ap.org/media/v/content/search'
        api_method = 'search'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        search_data = self._make_request(url, params)
        return json.loads(search_data)['data']

    def content(self, path, **kwargs) -> Optional[str]:
        """Content API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Content-Item/)

        Example: https://api.ap.org/media/v[{version}]/content/{item_id}?apikey={apikey}[{optional_parameters}]

        METHOD: GET

        ENDPOINT PARAMETERS:

        qt: Unknown. They are present in the feed response but don't appear to be in the documentation.

        et: Unknown. Same as above.

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.
        """
        url = 'https://api.ap.org/media/v/content/{}'.format(path)
        api_method = 'item'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        content_data = self._make_request(url, params)
        return content_data

    def _make_request(self, url: str, params: dict = None) -> Optional[str]:
        """Internal method for making API requests."""
        retries = self.retry_limit

        # Begin making request and retry up to retry limit
        while retries:
            log.debug("Making request to {} with parameters {}".format(url, params))
            try:
                response = requests.get(url, params=params, timeout=30)
            except Exception as e:
                log.warning("Encountered an exception while making request to {}. Exception info: {}".format(url, e))
            else:
                if response.status_code == 200:
                    log.debug("Successfully retrieved {}".format(url))
                    self._update_ratelimit_info(response.headers)
                    return response.content
                elif response.status_code == 403:
                    log.warning("Received a 403 (forbidden) response for {} -- skipping.".format(url))
                    return None
                else:
                    log.warning("Received HTTP status code {} when fetching {}: {}".format(
                        response.status_code, url, response.content))

            retries -= 1

            if retries == 0:
                raise McAPFetchError("Could not fetch {} after {} attempts. Giving up.".format(url, self.retry_limit))

            wait_time = (self.retry_limit - retries) ** 2
            log.info("Exponentially backing off for {} seconds.".format(wait_time))
            time.sleep(wait_time)

        return None

    def _check_ratelimit(self, api_method: str) -> None:
        """Check the endpoint rate limit before making an API call to that endpoint and wait if necessary."""
        if api_method in self.ratelimit_info:
            current_window_remaining = float(self.ratelimit_info[api_method].get('current_window_remaining', 0))
            next_window = float(self.ratelimit_info[api_method].get('next_window', 0))
            if current_window_remaining < 1 and next_window > time.time():
                wait_time = math.ceil(next_window - time.time())
                if wait_time > 0:
                    log.info('Rate limit for {}. Sleeping {} before next API call'.format(api_method, wait_time))
                    time.sleep(wait_time)

    def _update_ratelimit_info(self, headers):
        """Internal method to update rate limit information for an API endpoint."""
        api_method = headers['x-mediaapi-Q-name']
        calls_used, window_limit = [int(x) for x in headers['x-mediaapi-Q-used'].split('/')]

        if api_method not in self.ratelimit_info:
            self.ratelimit_info[api_method] = dict()

        self.ratelimit_info[api_method]['next_window'] = math.ceil(
            int(headers['x-mediaapi-Q-secondsLeft']) + time.time())
        self.ratelimit_info[api_method]['current_window_limit'] = window_limit
        self.ratelimit_info[api_method]['current_window_remaining'] = window_limit - calls_used

def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list."""

    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info(f"Annotating {len(text)} characters of text...")

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                f"Text length ({text_length}) has exceeded the request text length limit "
                f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it."
            )
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error(f"Unable to create annotator request for text '{text}': {ex}")

    # Wait for the service's HTTP port to become open as the service might be
    # still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
            f"exiting..."
        )

    log.debug(f"Sending request to {request.url()}...")

    # Try requesting a few times because sometimes it throws a connection error, e.g.:
    #
    #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
    #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
    #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
    #   'Connection reset by peer'))
    #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
    #   ConnectionResetError(104, 'Connection reset by peer'))
    response = None
    retries = 60
    sleep_between_retries = 1
    for retry in range(1, retries + 1):

        if retry > 1:
            log.warning(f"Retrying ({retry} / {retries})...")

        response = ua.request(request)

        if response.is_success():
            break
        else:
            if response.error_is_client_side():
                log.error(f"Request failed on the client side: {response.decoded_content()}")
                time.sleep(sleep_between_retries)
            else:
                break

    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always
    # return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed
        # request, or is it an extraction error
        log.warning(f"Request failed: {response.decoded_content()}")

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                f"The request timed out, giving up; text length: {len(text)}; text: {text}"
            )

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error(f"User agent error: {response.status_line()}: {results_string}")
        else:
            # Error was generated by server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error(f'{response.status_line()}: {results_string}')
            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    f'Annotator service was unable to process the download: {results_string}'
                )
            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error(f'Unknown HTTP response: {response.status_line()}: {results_string}')

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException(f"Annotator returned nothing for text: {text}")

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
        # fatal_error() here
        fatal_error(f"Unable to parse JSON response: {ex}\nJSON string: {results_string}")
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}")
    if not response_is_valid:
        fatal_error(f"Annotator response is invalid for JSON string: {results_string}")

    log.info(f"Done annotating {len(text)} characters of text.")

    return results

def fetch_posts(self, query: str, start_date: datetime, end_date: datetime) -> list:
    """Fetch tweets from Crimson Hexagon for the given date range."""
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
        (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McPostsCHTwitterDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        log.warning("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            post = {
                'post_id': mt['tweet_id'],
                'data': mt,
                'content': mt['tweet']['text'],
                'publish_date': mt['tweet']['created_at'],
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url']
            }
            posts.append(post)

    return posts

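# A hypothetical illustration (not part of the original module) of the shape of each
# dict returned by fetch_posts() above, reconstructed from the assignments in its
# final loop. All field values here are placeholders.
EXAMPLE_POST = {
    'post_id': 1234567890,                              # mt['tweet_id'], parsed from the CH url
    'data': {'...': '...'},                             # the full meta_tweet dict
    'content': 'example tweet text',                    # mt['tweet']['text']
    'publish_date': 'Wed Jan 01 00:00:00 +0000 2020',   # mt['tweet']['created_at']
    'author': 'example_screen_name',                    # mt['tweet']['user']['screen_name']
    'channel': 'example_screen_name',                   # same screen_name is used for the channel
    'url': 'https://twitter.com/example_screen_name/status/1234567890',  # mt['url']
}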