def validate_topic_tweet_urls(db: DatabaseHandler, topic: dict) -> None: """Validate that topic_tweet_urls match what's in the tweet JSON data as saved in topic_tweets.""" topic_tweets = db.query( """ select * from topic_tweets tt join topic_tweet_days ttd using (topic_tweet_days_id) where ttd.topics_id = %(a)s """, {'a': topic['topics_id']}).hashes() expected_num_urls = 0 for topic_tweet in topic_tweets: data = dict(decode_json(topic_tweet['data'])) expected_num_urls += len(data['tweet']['entities']['urls']) # first sanity check to make sure we got some urls num_urls = db.query("select count(*) from topic_tweet_urls").flat()[0] assert num_urls == expected_num_urls total_json_urls = 0 for topic_tweet in topic_tweets: ch_post = dict(decode_json(topic_tweet['data'])) expected_urls = [x['expanded_url'] for x in ch_post['tweet']['entities']['urls']] total_json_urls += len(expected_urls) for expected_url in expected_urls: got_url = db.query("select * from topic_tweet_urls where url = %(a)s", {'a': expected_url}).hash() assert got_url is not None assert total_json_urls == num_urls
def test_encode_decode_json(): test_obj = [ 'foo', { 'bar': 'baz' }, ['xyz', 'zyx'], 'moo', 'ąčęėįšųūž', 42, 3.14, True, False, None ] expected_json = '["foo",{"bar":"baz"},["xyz","zyx"],"moo","ąčęėįšųūž",42,3.14,true,false,null]' encoded_json = encode_json(json_obj=test_obj, pretty=False) assert encoded_json == expected_json decoded_json = decode_json(json_string=encoded_json) assert decoded_json == test_obj # Encoding errors with pytest.raises(McEncodeJSONException): # noinspection PyTypeChecker encode_json(None) with pytest.raises(McEncodeJSONException): # noinspection PyTypeChecker encode_json("strings can't be encoded") with pytest.raises(McDecodeJSONException): # noinspection PyTypeChecker decode_json(None) with pytest.raises(McDecodeJSONException): # noinspection PyTypeChecker decode_json('not JSON')
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None: """Reparse the tweet json for a given topic and try to reinsert all tweet urls.""" topic_posts_ids = db.query( """ SELECT topic_posts.topic_posts_id FROM topic_posts INNER JOIN topic_post_days ON topic_posts.topics_id = topic_post_days.topics_id AND topic_posts.topic_post_days_id = topic_post_days.topic_post_days_id INNER JOIN topic_seed_queries ON topic_post_days.topics_id = topic_seed_queries.topics_id AND topic_post_days.topic_seed_queries_id = topic_seed_queries.topic_seed_queries_id WHERE topics_id = %(topics_id)s """, { 'topics_id': topic['topics_id'], } ).flat() for (i, topic_posts_id) in enumerate(topic_posts_ids): if i % 1000 == 0: log.info('regenerate tweet urls: %d/%d' % (i, len(topic_posts_ids))) topic_post = db.require_by_id('topic_posts', topic_posts_id) data = decode_json(topic_post['data']) urls = get_tweet_urls(data['data']['tweet']) _insert_post_urls(db, topic_post, urls)
def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
    """
    Return a mock ch response to the posts end point.

    Generate the mock response by sending back data from a consistent but semirandom selection of
    ch-posts-2016-01-0[12345].json.
    """
    assert MOCK_TWEETS_PER_DAY <= MAX_MOCK_TWEETS_PER_DAY

    test_path = mediawords.util.paths.mc_root_path() + '/mediacloud/test-data/ch/'
    filename = test_path + "ch-posts-" + day.strftime('%Y-%m-%d') + '.json'
    with open(filename, 'r', encoding='utf-8') as fh:
        json = fh.read()

    data = dict(decode_json(json))

    assert 'posts' in data
    assert len(data['posts']) >= MOCK_TWEETS_PER_DAY

    data['posts'] = data['posts'][0:MOCK_TWEETS_PER_DAY]

    # replace the status id in each tweet url with a sequential index so that we can infer the date of each
    # tweet in tweet_urler_lookup below
    i = 0
    for ch_post in data['posts']:
        ch_post['url'] = re.sub(r'status/(\d+)/', '/status/' + str(i), ch_post['url'])
        i += 1

    return data
def fetch_100_users(screen_names: list) -> list:
    """Fetch data for up to 100 users."""
    if len(screen_names) > 100:
        raise McFetchTweetsException('tried to fetch more than 100 users')

    # tweepy returns a 404 if none of the screen names exist, and that 404 is indistinguishable from a 404
    # indicating that tweepy can't connect to the twitter api. in the latter case, we want to let tweepy use its
    # retry mechanism, but not the former. so we add a dummy account that we know exists to every request
    # to make sure any 404 we get back is an actual 404.
    dummy_account = 'cyberhalroberts'
    dummy_account_appended = False

    if 'cyberhalroberts' not in screen_names:
        screen_names.append('cyberhalroberts')
        dummy_account_appended = True

    users_json = _get_tweepy_api().lookup_users(screen_names=screen_names, include_entities=False)

    users = list(decode_json(users_json))

    # if we added the dummy account, remove it from the results
    if dummy_account_appended:
        users = list(filter(lambda u: u['screen_name'] != dummy_account, users))

    # return simple list so that this can be mocked. relies on RawParser() in _get_tweepy_api()
    return users
def __update_table_state(self, db: DatabaseHandler, job_state: Dict[str, Any]) -> None: """ Update the state and message fields in the given table for the row whose '<table>_id' field matches that field in the job args. """ job_state = decode_object_from_bytes_if_needed(job_state) try: args = decode_json(job_state.get('args', '')) except Exception as ex: log.error(f"Unable to decode args from job state {job_state}: {ex}") return extra_table = self.__state_config.extra_table() if extra_table: id_field = extra_table.table_name() + '_id' id_value = args.get(id_field, None) if not id_value: # Sometimes there is not a relevant <table>_id until some of the code in run() has run, for instance # SnapshotTopic needs to create the snapshot. log.warning(f"Unable to get ID value for field '{id_field}' from job state {job_state}") return None update = { extra_table.state_column(): job_state.get('state', None), extra_table.message_column(): job_state.get('message', None), } db.update_by_id(table=extra_table.table_name(), object_id=id_value, update_hash=update) else: log.debug("Extra table for storing state is not configured.")
def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None: """Update the args field for the current "job_states" row.""" args = decode_object_from_bytes_if_needed(args) job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id) try: # job_states.args got changed from JSON to JSONB while sharding the # database, and there's no way to disable decoding JSONB (as # opposed to JSON) in psycopg2, so "args" might be a JSON string or # a pre-decoded dictionary maybe_json_db_args = job_state.get('args', '') if isinstance(maybe_json_db_args, dict): db_args = maybe_json_db_args else: db_args = decode_json(maybe_json_db_args) except Exception as ex: log.error( f"Unable to decode args from job state {job_state}: {ex}") db_args = {} db_args = {**db_args, **args} args_json = encode_json(db_args) db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={ 'args': args_json, })
async def fetch_store_transcript(self, stories_id: int) -> None: log.info(f"Fetching and storing transcript for story {stories_id}...") with tempfile.TemporaryDirectory( prefix='fetch_store_transcript') as temp_dir: transcript_json_path = os.path.join(temp_dir, 'transcript.json') gcs = GCSStore(bucket_config=self.config.transcripts()) gcs.download_object(object_id=str(stories_id), local_file_path=transcript_json_path) with open(transcript_json_path, 'r') as f: transcript_json = f.read() transcript = Transcript.from_dict(decode_json(transcript_json)) db = connect_to_db_or_raise() story = db.find_by_id(table='stories', object_id=stories_id) feed = db.query( """ SELECT * FROM feeds WHERE feeds_id = ( SELECT feeds_id FROM feeds_stories_map WHERE stories_id = %(stories_id)s ) """, { 'stories_id': stories_id, }).hash() # Just like create_download_for_new_story(), it creates a new download except that it tests if such a download # exists first download = db.find_or_create( table='downloads', insert_hash={ 'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id'], 'url': story['url'], 'host': get_url_host(story['url']), 'type': 'content', 'sequence': 1, 'state': 'success', 'path': 'content:pending', 'priority': 1, 'extracted': 'f' }, ) text = transcript.download_text_from_transcript() # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later store_content(db=db, download=download, content=text) log.info( f"Done fetching and storing transcript for story {stories_id}")
def fetch_test_data(basename: str, subdirectory: str = '') -> dict:
    """Fetch the given data from disk."""
    basename = decode_object_from_bytes_if_needed(basename)
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    file_path = _get_data_file(basename=basename, subdirectory=subdirectory)
    with open(file_path, mode='r', encoding='utf-8') as f:
        return decode_json(f.read())
def fetch_annotation_for_story(self, db: DatabaseHandler, stories_id: int) -> Union[dict, list, None]: """Fetch the annotation from key-value store for the story, or None if story is not annotated.""" if not self.annotator_is_enabled(): fatal_error("Annotator is not enabled in the configuration.") # MC_REWRITE_TO_PYTHON: remove after rewrite to Python if isinstance(stories_id, bytes): stories_id = decode_object_from_bytes_if_needed(stories_id) stories_id = int(stories_id) if not self.story_is_annotated(db=db, stories_id=stories_id): log.warning("Story %d is not annotated." % stories_id) return None json = self.__postgresql_store.fetch_content(db=db, object_id=stories_id) if json is None: raise McJSONAnnotatorException( "Fetched annotation is undefined or empty for story %d." % stories_id) json = json.decode('utf-8') try: annotation = decode_json(json) if annotation is None: raise McJSONAnnotatorException( "Annotation is None after decoding from JSON.") except Exception as ex: raise McJSONAnnotatorException( "Unable to parse annotation JSON for story %d: %s\nString JSON: %s" % ( stories_id, str(ex), json, )) try: annotation = self._preprocess_stored_annotation(annotation) if annotation is None: raise McJSONAnnotatorException( "Annotation is None after preprocessing.") except Exception as ex: fatal_error( "Unable to preprocess stored annotation for story %d: %s\nString JSON: %s" % ( stories_id, str(ex), json, )) return annotation
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None: """Wait for Solr to start and collections to become available, if needed.""" # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json" connected = False for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1): if retry > 0: log.debug(f"Retrying Solr connection ({retry})...") try: ua = UserAgent() ua.set_timeout(1) response = ua.get(sample_select_url) if not response.is_success(): raise Exception(f"Unable to connect: {response.status_line()}") if not response.decoded_content(): raise Exception("Response is empty.") try: result = decode_json(response.decoded_content()) except Exception as ex: raise Exception(f"Unable to decode response: {ex}") if not isinstance(result, dict): raise Exception( f"Result is not a dictionary: {response.decoded_content()}" ) if 'response' not in result: raise Exception( f"Response doesn't have 'response' key: {response.decoded_content()}" ) except Exception as ex: log.warning(f"Solr is down, will retry: {ex}") time.sleep(1) else: log.debug("Solr is up!") connected = True break if not connected: raise McSolrRequestDidNotStartInTimeException( f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up" )
def fetch_posts_from_api(self, query: str, start_date: datetime, end_date: datetime) -> list: """Return posts from a csv that are within the given date range.""" if self.mock_enabled: googler_json = self._get_mock_json(start_date, end_date) else: # disabling this for now because googler seems not to return results any more log.warning('google support disabled') return [] global _last_google_request_epoch now = time.time() if now - _last_google_request_epoch < GOOGLE_REQUEST_DELAY: delay = GOOGLE_REQUEST_DELAY - (now - _last_google_request_epoch) log.info("waiting %d seconds to make google request..." % delay) time.sleep(delay) _last_google_request_epoch = time.time() start_query = "after:" + start_date.strftime("%Y-%m-%d") end_query = "before:" + ( end_date + datetime.timedelta(days=1)).strftime("%Y-%m-%d") full_query = "%s %s %s" % (query, start_query, end_query) googler_json = subprocess.check_output( ["googler", "--json", "-n 100", full_query]) links = decode_json(googler_json) posts = [] for link in links: publish_date = start_date.strftime('%Y-%m-%d') domain = mediawords.util.url.get_url_distinctive_domain( link['url']) posts.append({ 'post_id': link['url'], 'content': "%s %s %s" % (link['title'], link['abstract'], link['url']), 'author': domain, 'channel': domain, 'publish_date': publish_date, 'data': link }) return posts
def decoded_json(self) -> Union[dict, list]:
    """Return content as JSON object(s), assuming it's formatted appropriately."""
    if self.content_type() and 'json' not in self.content_type():
        log.warning(
            f"Content-Type header ({self.content_type()}) is not JSON for {self.__requests_response.url}"
        )

    try:
        return decode_json(self.decoded_content())
    except Exception as ex:
        raise McUserAgentResponseException(
            f"Response from {self.__requests_response.url} not parseable to JSON: {str(ex)}"
        )
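# --- Hedged usage sketch (not part of the original source) ---
# Shows how the decoded_json() helper above might be called on a UserAgent response. The URL is a
# placeholder, and UserAgent/Response are assumed to be the same classes used throughout this listing.
def _example_decoded_json_usage() -> None:
    ua = UserAgent()
    response = ua.get('https://example.com/api/data.json')  # placeholder URL
    if response.is_success():
        # Raises McUserAgentResponseException if the body is not parseable as JSON
        data = response.decoded_json()
        log.debug(f"Decoded {type(data).__name__} with {len(data)} top-level items")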
def _authenticated_domains_from_json(value: Optional[str]) -> List[AuthenticatedDomain]:
    """Parse the string and return a list of authenticated domains."""
    if value is None:
        return []

    value = value.strip()
    if not value:
        return []

    try:
        entries = decode_json(value)
    except McDecodeJSONException as ex:
        # Don't leak JSON errors into the exception as they might possibly end up in a public error message
        message = "Unable to decode authenticated domains."
        log.error(f"{message}: {ex}")
        raise McConfigAuthenticatedDomainsException(message)

    domains = []

    # collections.abc.Iterable is the supported Python 3 spelling (collections.Iterable was removed in 3.10)
    if not isinstance(entries, collections.abc.Iterable):
        message = "Invalid JSON configuration"
        log.error(f"{message}: root is not an iterable (a list)")
        raise McConfigAuthenticatedDomainsException(message)

    for entry in entries:
        if not callable(getattr(entry, "get", None)):
            message = "Invalid JSON configuration"
            log.error(f"{message}: one of the items does not have get() (is not a dictionary)")
            raise McConfigAuthenticatedDomainsException(message)

        domain = entry.get('domain', None)
        username = entry.get('username', None)
        password = entry.get('password', None)

        if not (domain and username and password):
            raise McConfigAuthenticatedDomainsException("Incomplete authentication credentials.")

        domains.append(AuthenticatedDomain(domain=domain, username=username, password=password))

    return domains
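# --- Hedged usage sketch (not part of the original source) ---
# Illustrates the JSON shape that _authenticated_domains_from_json() above expects: a list of objects with
# "domain", "username" and "password" keys. The literal below is invented for illustration only.
def _example_authenticated_domains_usage() -> None:
    config_json = '''
        [
            {"domain": "example.com", "username": "user", "password": "secret"}
        ]
    '''
    domains = _authenticated_domains_from_json(config_json)
    assert len(domains) == 1  # one AuthenticatedDomain parsed from the config string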
def _fetch_posts_from_api_single_page(self, query: str, start_date: datetime, end_date: datetime,
                                      next_cursor: str) -> dict:
    """Fetch a single page of posts data from the Brandwatch API and return the decoded response."""
    try:
        (project_id, query_id) = query.split('-')
        project_id = int(project_id)
        query_id = int(query_id)
    except Exception:
        raise McPostsBWTwitterQueryException(
            f"Unable to parse query '{query}': it should be in the form 123-456, "
            f"where 123 is the project id and 456 is the query id."
        )

    log.debug("brandwatch_twitter.fetch_posts")

    ua = _get_user_agent()

    api_key = _get_api_key()

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    cursor = next_cursor if next_cursor is not None else ''

    url = (
        f"https://api.brandwatch.com/projects/{project_id}/data/mentions?"
        f"queryId={query_id}&startDate={start_arg}&endDate={end_arg}&"
        f"pageSize={PAGE_SIZE}&orderBy=date&orderDirection=asc&"
        f"access_token={api_key}&cursor={cursor}")

    log.debug("brandwatch url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsBWTwitterDataException(f"error fetching posts: {response.code()} {response.status_line()}")

    json = response.decoded_content()

    data = dict(decode_json(json))

    if 'results' not in data:
        raise McPostsBWTwitterDataException("error parsing response: " + json)

    return data
def __update_table_state(self, db: DatabaseHandler, job_state: Dict[str, Any]) -> None: """ Update the state and message fields in the given table for the row whose '<table>_id' field matches that field in the job args. """ job_state = decode_object_from_bytes_if_needed(job_state) try: # job_states.args got changed from JSON to JSONB while sharding the # database, and there's no way to disable decoding JSONB (as # opposed to JSON) in psycopg2, so "args" might be a JSON string or # a pre-decoded dictionary maybe_json_args = job_state.get('args', '') if isinstance(maybe_json_args, dict): args = maybe_json_args else: args = decode_json(maybe_json_args) except Exception as ex: log.error( f"Unable to decode args from job state {job_state}: {ex}") return extra_table = self.__state_config.extra_table() if extra_table: id_field = extra_table.table_name() + '_id' id_value = args.get(id_field, None) if not id_value: # Sometimes there is not a relevant <table>_id until some of the code in run() has run, for instance # SnapshotTopic needs to create the snapshot. log.warning( f"Unable to get ID value for field '{id_field}' from job state {job_state}" ) return None update = { extra_table.state_column(): job_state.get('state', None), extra_table.message_column(): job_state.get('message', None), } db.update_by_id(table=extra_table.table_name(), object_id=id_value, update_hash=update) else: log.debug("Extra table for storing state is not configured.")
def fetch_posts_from_api( self, query: str, start_date: datetime, end_date: datetime, sample: Optional[int] = None, page_size: Optional[int] = None, ) -> list: """Fetch day of tweets from crimson hexagon and twitter.""" decoded_content = self._get_content_from_api(query, start_date, end_date) assert sample is None, "Sampling is not implemented." assert page_size is None, "Page size limiting is not supported." data = dict(decode_json(decoded_content)) if 'status' not in data or not data['status'] == 'success': raise McPostsCHTwitterDataException("Unknown response status: " + str(data)) meta_tweets = data['posts'] for mt in meta_tweets: mt['tweet_id'] = get_tweet_id_from_url(mt['url']) add_tweets_to_meta_tweets(meta_tweets) posts = [] for mt in meta_tweets: log.debug("mt: %d" % mt['tweet_id']) if 'tweet' in mt: publish_date = dateutil.parser.parse( mt['tweet']['created_at']).isoformat() post = { 'post_id': str(mt['tweet_id']), 'data': mt, 'content': mt['tweet']['text'], 'publish_date': publish_date, 'author': mt['tweet']['user']['screen_name'], 'channel': mt['tweet']['user']['screen_name'], 'url': mt['url'] } posts.append(post) return posts
def __solr_error_message_from_response(response: Response) -> str:
    """Parse out the Solr error message from the response."""
    if response.error_is_client_side():
        # UserAgent error (UserAgent wasn't able to connect to the server or something like that)
        error_message = f'UserAgent error: {response.decoded_content()}'

    else:
        status_code_str = str(response.code())

        if status_code_str.startswith('4'):
            # Client error - set default message
            error_message = f'Client error: {response.status_line()} {response.decoded_content()}'

            # Parse out the Solr error message if there is one
            solr_response_maybe_json = response.decoded_content()
            if solr_response_maybe_json:
                solr_response_json = {}

                try:
                    solr_response_json = decode_json(solr_response_maybe_json)
                except Exception as ex:
                    log.debug(
                        f"Unable to parse Solr error response: {ex}; raw response: {solr_response_maybe_json}"
                    )

                # Keep the Solr-reported message in its own variable so that the default "Client error" message
                # above is not clobbered when the JSON carries no error message
                solr_error_message = solr_response_json.get('error', {}).get('msg', None)
                request_params = solr_response_json.get('responseHeader', {}).get('params', {})

                if solr_error_message and request_params:
                    request_params_json = encode_json(request_params)

                    # If we were able to decode the Solr error message, overwrite the default error message with it
                    error_message = f'Solr error: "{solr_error_message}", params: {request_params_json}'

        elif status_code_str.startswith('5'):
            # Server error or some other error
            error_message = f'Server error: {response.status_line()} {response.decoded_content()}'

        else:
            # Some weird stuff
            error_message = f'Other error: {response.status_line()} {response.decoded_content()}'

    return error_message
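# --- Hedged usage sketch (not part of the original source) ---
# A minimal illustration of the Solr error payload shape that __solr_error_message_from_response() above
# looks for: an "error"/"msg" pair plus the echoed request params under "responseHeader"/"params". The
# literal is invented; real Solr error responses carry more fields.
def _example_solr_error_payload() -> None:
    raw = '{"responseHeader": {"params": {"q": "text:foo AND"}}, "error": {"msg": "syntax error in query"}}'
    parsed = decode_json(raw)
    solr_error = parsed.get('error', {}).get('msg', None)
    params = parsed.get('responseHeader', {}).get('params', {})
    log.debug(f'Solr error: "{solr_error}", params: {encode_json(params)}')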
def fetch_100_tweets(tweet_ids: list) -> list:
    """Fetch data for up to 100 tweets."""
    if len(tweet_ids) > 100:
        raise McFetchTweetsException('tried to fetch more than 100 tweets')

    if len(tweet_ids) == 0:
        return []

    tweets = _get_tweepy_api().statuses_lookup(tweet_ids, include_entities=True, trim_user=False,
                                               tweet_mode='extended')

    # return simple list so that this can be mocked. relies on RawParser() in _get_tweepy_api()
    tweets = list(decode_json(tweets))

    for tweet in tweets:
        if 'full_text' in tweet:
            tweet['text'] = tweet['full_text']

    return tweets
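# --- Hedged usage sketch (not part of the original source) ---
# fetch_100_tweets() above refuses to fetch more than 100 tweets per call, so callers are expected to chunk
# their ID lists; this is one way such chunking might look. The helper name is hypothetical.
def _example_fetch_tweets_in_batches(tweet_ids: list) -> list:
    tweets = []
    for i in range(0, len(tweet_ids), 100):
        tweets.extend(fetch_100_tweets(tweet_ids[i:i + 100]))
    return tweets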
def _get_api_key() -> str:
    """Fetch the Brandwatch API key or use the cached one.

    To get a Brandwatch API key, you have to make an API call with the user and password, but the API key only
    lasts for a year, so we just get it and then cache it in a static variable, assuming that each run time will
    restart at least once a year.
    """
    if hasattr(_get_api_key, "api_key"):
        return _get_api_key.api_key

    user = env_value('MC_BRANDWATCH_USER')
    password = env_value('MC_BRANDWATCH_PASSWORD')

    log.debug(f"user: {user}")
    log.debug(f"password: {password}")

    ua = _get_user_agent()

    url = (
        "https://api.brandwatch.com/oauth/token?username=%s&grant_type=api-password&client_id=brandwatch-api-client" %
        (quote(user)))

    request = Request(method='POST', url=url)
    request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
    request.set_content({'password': password})

    response = ua.request(request)

    if not response.is_success():
        raise McPostsBWTwitterDataException("error fetching posts: " + response.decoded_content())

    json = response.decoded_content()

    data = dict(decode_json(json))

    try:
        _get_api_key.api_key = data['access_token']
    except KeyError:
        raise McPostsBWTwitterDataException("error parsing oauth response: '%s'" % json)

    return _get_api_key.api_key
def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None: """Update the args field for the current "job_states" row.""" args = decode_object_from_bytes_if_needed(args) job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id) try: db_args = decode_json(job_state.get('args', '{}')) except Exception as ex: log.error(f"Unable to decode args from job state {job_state}: {ex}") db_args = {} db_args = {**db_args, **args} args_json = encode_json(db_args) db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={ 'args': args_json, })
def validate_topic_tweets(db: DatabaseHandler, topic_tweet_day: dict) -> None:
    """Validate that the topic tweets belonging to the given topic_tweet_day have all of the current data."""
    topic_tweets = db.query(
        "select * from topic_tweets where topic_tweet_days_id = %(a)s",
        {'a': topic_tweet_day['topic_tweet_days_id']}
    ).hashes()

    # fetch_topic_tweets should have set num_ch_tweets to the total number of tweets
    assert len(topic_tweets) > 0
    assert len(topic_tweets) == topic_tweet_day['num_ch_tweets']

    for topic_tweet in topic_tweets:
        tweet_data = dict(decode_json(topic_tweet['data']))

        # random field that should be coming from twitter
        assert 'assignedCategoryId' in tweet_data

        expected_date = datetime.datetime.strptime(tweet_data['tweet']['created_at'], '%Y-%m-%d')
        got_date = datetime.datetime.strptime(topic_tweet['publish_date'], '%Y-%m-%d 00:00:00')
        assert got_date == expected_date

        assert topic_tweet['content'] == tweet_data['tweet']['text']
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls."""
    topic_posts_ids = db.query(
        """
        select tt.topic_posts_id
            from topic_posts tt
                join topic_post_days ttd using ( topic_post_days_id )
            where topics_id = %(a)s
        """,
        {'a': topic['topics_id']}
    ).flat()

    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' % (i, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
def test_api_request(self): """Make an API request, see if it succeeds.""" credentials = self.univision_credentials() handler = DownloadFeedUnivisionHandler( crawler_config=self._mock_crawler_config()) api_request_url = handler._api_request_url_with_signature_from_config( api_url=credentials.url) assert api_request_url, 'API request URL is not empty' ua = UserAgent() ua.set_timeout(30) response = ua.get(api_request_url) assert response.is_success(), 'API request was successful' json_string = response.decoded_content() assert json_string, 'JSON response is not empty' json = decode_json(json_string) assert json.get('status', None) == 'success', "JSON response was successful" assert 'data' in json, 'JSON response has "data" key'
def __post(self) -> bytes: uri = urlparse(self.path) if uri.path != self._API_ENDPOINT_PATH: return self.__error_response( status=HTTPStatus.NOT_FOUND.value, message=f"Only {self._API_ENDPOINT_PATH} is implemented.", ) content_length = int(self.headers.get('Content-Length', 0)) log.info(f"Received extraction request ({content_length} bytes)...") if not content_length: return self.__error_response( status=HTTPStatus.LENGTH_REQUIRED.value, message="Content-Length header is not set.", ) if content_length > _MAX_REQUEST_LENGTH: return self.__error_response( status=HTTPStatus.REQUEST_ENTITY_TOO_LARGE.value, message=f"Request is larger than {_MAX_REQUEST_LENGTH} bytes.") encoded_body = self.rfile.read(content_length) try: json_body = encoded_body.decode('utf-8', errors='replace') except Exception as ex: return self.__error_response( status=HTTPStatus.BAD_REQUEST.value, message=f"Unable to decode request body: {ex}", ) try: body = decode_json(json_body) except Exception as ex: return self.__error_response( status=HTTPStatus.BAD_REQUEST.value, message=f"Unable to decode request JSON: {ex}", ) if "html" not in body: return self.__error_response( status=HTTPStatus.BAD_REQUEST.value, message="Request JSON doesn't have 'html' key.", ) html = body["html"] try: extracted_html = extract_article_from_page(html) except Exception as ex: return self.__error_response( status=HTTPStatus.BAD_REQUEST.value, message=f"Unable to extract article HTML from page HTML: {ex}") response = { 'extracted_html': extracted_html, 'extractor_version': extractor_name(), } return self.__success_response( status=HTTPStatus.OK.value, response=response, )
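# --- Hedged usage sketch (not part of the original source) ---
# Shows the request/response contract implemented by __post() above: the client POSTs a JSON body with an
# "html" key and gets back "extracted_html" and "extractor_version" on success. The host, port and endpoint
# path below are placeholders, not values taken from the original source.
def _example_extractor_request() -> dict:
    import http.client
    import json as _json

    body = _json.dumps({'html': '<html><body><p>Some article text.</p></body></html>'})
    conn = http.client.HTTPConnection('localhost', 8080)  # placeholder host/port
    conn.request('POST', '/extract', body=body, headers={'Content-Type': 'application/json'})  # placeholder path
    response = _json.loads(conn.getresponse().read().decode('utf-8'))
    conn.close()

    # Expected keys on success: 'extracted_html' and 'extractor_version'
    return response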
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list."""

    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info(f"Annotating {len(text)} characters of text...")

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                f"Text length ({text_length}) has exceeded the request text length limit "
                f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it."
            )
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error(f"Unable to create annotator request for text '{text}': {ex}")

    # Wait for the service's HTTP port to become open as the service might be
    # still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
            f"exiting..."
        )

    log.debug(f"Sending request to {request.url()}...")

    # Try requesting a few times because sometimes it throws a connection error, e.g.:
    #
    #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
    #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
    #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
    #   'Connection reset by peer'))
    #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
    #   ConnectionResetError(104, 'Connection reset by peer'))
    response = None
    retries = 60
    sleep_between_retries = 1
    for retry in range(1, retries + 1):

        if retry > 1:
            log.warning(f"Retrying ({retry} / {retries})...")

        response = ua.request(request)

        if response.is_success():
            break
        else:
            if response.error_is_client_side():
                log.error(f"Request failed on the client side: {response.decoded_content()}")
                time.sleep(sleep_between_retries)
            else:
                break

    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always
    # return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed
        # request, or is it an extraction error
        log.warning(f"Request failed: {response.decoded_content()}")

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                f"The request timed out, giving up; text length: {len(text)}; text: {text}"
            )

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error(f"User agent error: {response.status_line()}: {results_string}")

        else:
            # Error was generated by server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error(f'{response.status_line()}: {results_string}')

            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    f'Annotator service was unable to process the download: {results_string}'
                )

            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error(f'Unknown HTTP response: {response.status_line()}: {results_string}')

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException(f"Annotator returned nothing for text: {text}")

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
        # fatal_error() here
        fatal_error(f"Unable to parse JSON response: {ex}\nJSON string: {results_string}")
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}")
    if not response_is_valid:
        fatal_error(f"Annotator response is invalid for JSON string: {results_string}")

    log.info(f"Done annotating {len(text)} characters of text.")

    return results
def __annotate_text(self, text: str) -> Union[dict, list]:
    """Fetch JSON annotation for text, decode it into dictionary / list."""

    text = decode_object_from_bytes_if_needed(text)

    if text is None:
        fatal_error("Text is None.")

    if len(text) == 0:
        # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
        raise McJSONAnnotationFetcherException("Text is empty.")

    log.info("Annotating %d characters of text..." % len(text))

    # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
    # without making a request to the annotator at all
    text = text.strip()

    if self.__TEXT_LENGTH_LIMIT > 0:
        text_length = len(text)
        if text_length > self.__TEXT_LENGTH_LIMIT:
            log.warning(
                "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it." % (
                    text_length,
                    self.__TEXT_LENGTH_LIMIT,
                ))
            text = text[:self.__TEXT_LENGTH_LIMIT]

    # Make a request
    ua = UserAgent()
    ua.set_timing([1, 2, 4, 8])
    ua.set_timeout(self.__HTTP_TIMEOUT)
    ua.set_max_size(None)

    request = None
    try:
        request = self._request_for_text(text=text)
        if request is None:
            raise McJSONAnnotationFetcherException("Returned request is None.")
    except Exception as ex:
        # Assume that this is some sort of a programming error too
        fatal_error("Unable to create annotator request for text '%s': %s" % (
            text,
            str(ex),
        ))

    # Wait for the service's HTTP port to become open as the service might be
    # still starting up somewhere
    uri = furl(request.url())
    hostname = str(uri.host)
    port = int(uri.port)
    assert hostname, f"URL hostname is not set for URL {request.url()}"
    assert port, f"API URL port is not set for URL {request.url()}"

    if not wait_for_tcp_port_to_open(
            port=port,
            hostname=hostname,
            retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever.
        fatal_error(
            "Annotator service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=request.url(),
                timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
            ))

    log.debug("Sending request to %s..." % request.url())

    response = ua.request(request)

    log.debug("Response received.")

    # Force UTF-8 encoding on the response because the server might not always
    # return correct "Content-Type"
    results_string = response.decoded_utf8_content()

    if not response.is_success():
        # Error; determine whether we should be blamed for making a malformed
        # request, or is it an extraction error
        log.warning("Request failed: %s" % response.decoded_content())

        if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
            # Raise on request timeouts without retrying anything because those usually mean that we posted
            # something funky to the annotator service and it got stuck
            raise McJSONAnnotationFetcherException(
                "The request timed out, giving up; text length: %d; text: %s" % (
                    len(text),
                    text,
                ))

        if response.error_is_client_side():
            # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
            # unresponsive host, etc.)
            fatal_error("User agent error: %s: %s" % (
                response.status_line(),
                results_string,
            ))

        else:
            # Error was generated by server
            http_status_code = response.code()

            if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                    or http_status_code == HTTPStatus.BAD_REQUEST.value:
                # Not POST, empty POST
                fatal_error('%s: %s' % (
                    response.status_line(),
                    results_string,
                ))

            elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                # Processing error -- raise so that the error gets caught and logged into a database
                raise McJSONAnnotationFetcherException(
                    'Annotator service was unable to process the download: %s' % results_string
                )

            else:
                # Shutdown the extractor on unconfigured responses
                fatal_error('Unknown HTTP response: %s: %s' % (
                    response.status_line(),
                    results_string,
                ))

    if results_string is None or len(results_string) == 0:
        raise McJSONAnnotationFetcherException("Annotator returned nothing for text: %s" % text)

    log.debug("Parsing response's JSON...")
    results = None
    try:
        results = decode_json(results_string)
        if results is None:
            raise McJSONAnnotationFetcherException("Returned JSON is None.")
    except Exception as ex:
        # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
        # fatal_error() here
        fatal_error("Unable to parse JSON response: %s\nJSON string: %s" % (
            str(ex),
            results_string,
        ))
    log.debug("Done parsing response's JSON.")

    response_is_valid = False
    try:
        response_is_valid = self._fetched_annotation_is_valid(results)
    except Exception as ex:
        fatal_error(
            "Unable to determine whether response is valid: %s\nJSON string: %s" % (str(ex), results_string)
        )
    if not response_is_valid:
        fatal_error("Annotator response is invalid for JSON string: %s" % results_string)

    log.info("Done annotating %d characters of text." % len(text))

    return results
def _get_stories_from_univision_feed(cls, content: str, media_id: int) -> List[Dict[str, Any]]: """Parse the feed. Return a (non-db-backed) story dict for each story found in the feed.""" content = decode_object_from_bytes_if_needed(content) if isinstance(media_id, bytes): media_id = decode_object_from_bytes_if_needed(media_id) media_id = int(media_id) if not content: raise McCrawlerFetcherSoftError("Feed content is empty or undefined.") try: feed_json = decode_json(content) except Exception as ex: raise McCrawlerFetcherSoftError(f"Unable to decode Univision feed JSON: {ex}") try: # Intentionally raise exception on KeyError: if not feed_json['status'] == 'success': raise McCrawlerFetcherSoftError(f"Univision feed response is not 'success': {content}") except Exception as ex: raise McCrawlerFetcherSoftError(f"Unable to verify Univision feed status: {ex}") try: # Intentionally raise exception on KeyError: feed_items = feed_json.get('data', None).get('items', None) except Exception as ex: raise McCrawlerFetcherSoftError(f"Univision feed response does not have 'data'/'items' key: {ex}") stories = [] for item in feed_items: url = item.get('url', None) if not url: # Some items in the feed don't have their URLs set log.warning(f"'url' for item is not set: {item}") continue # sic -- we take "uid" (without "g") and call it "guid" (with "g") guid = item.get('uid', None) if not guid: raise McCrawlerFetcherSoftError(f"Item does not have its 'uid' set: {item}") title = item.get('title', '(no title)') description = item.get('description', '') try: # Intentionally raise exception on KeyError: str_publish_date = item['publishDate'] publish_timestamp = str2time_21st_century(str_publish_date) publish_date = get_sql_date_from_epoch(publish_timestamp) except Exception as ex: # Die for good because Univision's dates should be pretty predictable raise McCrawlerFetcherSoftError(f"Unable to parse item's {item} publish date: {ex}") log.debug(f"Story found in Univision feed: URL '{url}', title '{title}', publish date '{publish_date}'") stories.append({ 'url': url, 'guid': guid, 'media_id': media_id, 'publish_date': publish_date, 'title': title, 'description': description, }) return stories
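# --- Hedged usage sketch (not part of the original source) ---
# Minimal example of the Univision feed shape that _get_stories_from_univision_feed() above consumes:
# a top-level "status" plus "data"/"items", where each item carries "url", "uid", "title", "description"
# and "publishDate". The values and the media_id are invented, the publishDate format is only assumed to
# be acceptable to str2time_21st_century(), and calling via DownloadFeedUnivisionHandler assumes the
# method is the classmethod shown above.
def _example_univision_feed_parsing() -> list:
    feed_json = encode_json({
        'status': 'success',
        'data': {
            'items': [
                {
                    'url': 'https://www.univision.com/example-story',
                    'uid': '00000000-0000-0000-0000-000000000000',
                    'title': 'Example story',
                    'description': 'Example description.',
                    'publishDate': '2016-01-01T00:00:00-05:00',
                },
            ],
        },
    })
    return DownloadFeedUnivisionHandler._get_stories_from_univision_feed(content=feed_json, media_id=1)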
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass
    if something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values might be either strings or lists of strings
                   if multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Parameters should be a dictionary.")

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):
        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # some stories consistently return empty content, so just return a soft error and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = "API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- break from the retry loop
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message="No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")

    return data
def query_solr(db: DatabaseHandler, params: SolrParams) -> Dict[str, Any]: """ Execute a query on the Solr server using the given parameters. Return a maximum of 1 million sentences. The "params" argument is a dictionary of query parameters to Solr, detailed here: https://lucene.apache.org/solr/guide/6_6/common-query-parameters.html. The query ("params['q']") is transformed: lower case boolean operators are made uppercase to make Solr recognize them as boolean queries. Return decoded response in the format described here: https://lucene.apache.org/solr/guide/6_6/response-writers.html#ResponseWriters-JSONResponseWriter """ params = decode_object_from_bytes_if_needed(params) # Avoid editing the dictionary itself params = copy.deepcopy(params) if not params: raise McQuerySolrInternalErrorException('Parameters must be set.') if not isinstance(params, dict): raise McQuerySolrInternalErrorException( 'Parameters must be a dictionary.') params['wt'] = 'json' if 'rows' in params: params['rows'] = int(params['rows']) else: params['rows'] = 1000 if 'df' not in params: params['df'] = 'text' params['rows'] = min(params['rows'], 10_000_000) if 'q' not in params: params['q'] = '' # "fq" might be nonexistent or None if not params.get('fq', None): params['fq'] = [] if not isinstance(params['fq'], list): params['fq'] = [params['fq']] if ':[' in params['q']: raise McQuerySolrRangeQueryException( "Range queries are not allowed in the main query. Please use a filter query instead for range queries." ) # if params['q']: # params['q'] = f"{{!complexphrase inOrder=false}} {params['q']}" params['q'] = _uppercase_boolean_operators(params['q']) params['fq'] = _uppercase_boolean_operators(params['fq']) params['q'] = _replace_smart_quotes(params['q']) params['fq'] = _replace_smart_quotes(params['fq']) if params['q']: params['q'] = _insert_collection_media_ids(db=db, q=params['q']) if params['fq']: params['fq'] = [ _insert_collection_media_ids(db=db, q=_) for _ in params['fq'] ] response_json = solr_request( path='select', params={}, content=params, content_type='application/x-www-form-urlencoded; charset=utf-8', ) try: response = decode_json(response_json) except Exception as ex: raise McQuerySolrInternalErrorException( f"Error parsing Solr JSON: {ex}\nJSON: {response_json}") if 'error' in response: raise McQuerySolrInvalidQueryException( f"Error received from Solr: {response_json}") return response
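# --- Hedged usage sketch (not part of the original source) ---
# How query_solr() above might be called: "q" and "fq" use normal Solr query syntax, "rows" caps the number
# of returned documents, and the decoded result follows Solr's standard JSON response writer layout
# (e.g. result['response']['docs']). The query text and field names below are placeholders; as the function
# itself advises, range queries go into "fq" rather than "q".
def _example_query_solr(db: DatabaseHandler) -> list:
    params = {
        'q': 'text:climate and media_id:1',  # lowercase "and" gets uppercased by query_solr()
        'fq': 'publish_date:[2016-01-01T00:00:00Z TO 2016-02-01T00:00:00Z]',
        'rows': 100,
    }
    result = query_solr(db=db, params=params)
    return result['response']['docs']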