def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - passed through to _try_fetch_topic_url

    Returns:
    None

    Raises:
    McThrottledDomainException - re-raised unchanged; in that case the topic_fetch_urls row is not updated here
    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        # .get() treats a row without a 'stories_id' key the same as a null stories_id
        if topic_fetch_url.get('stories_id') is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']

            # only add the story to the topic if it is not already part of it and it matches the topic
            if _is_not_topic_story(db, topic_fetch_url) and _story_matches_topic(
                    db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                _add_to_topic_stories(db, story, topic)

        # same tolerant lookup as above, so a missing key cannot turn into a spurious FETCH_STATE_PYTHON_ERROR
        if topic_fetch_url['topic_links_id'] and topic_fetch_url.get('stories_id'):
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException:
        # throttling is handled by the caller; bare raise keeps the original traceback intact
        raise

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    # persist whatever state / code / message / stories_id ended up in the row dict
    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store the posts of a single day that match the topic.

    The posts are filtered with content_matches_topic() against the topic of the day's topic_seed_queries row;
    the surviving posts are stored and the topic_post_days row is marked as fetched, all between db.begin()
    and db.commit().

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    seed_query = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', seed_query['topics_id'])

    matching_posts = [post for post in posts if content_matches_topic(post['content'], topic)]

    # NOTE(review): this is counted after filtering, so it always equals num_posts_stored below -- confirm intended
    num_posts_fetched = len(matching_posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    db.query("SET LOCAL citus.multi_shard_modify_mode TO 'sequential'")

    log.debug("inserting into topic_posts ...")

    for matching_post in matching_posts:
        _store_post_and_urls(db, topic_post_day, matching_post)

    update_params = {
        'num_posts_stored': len(matching_posts),
        'num_posts_fetched': num_posts_fetched,
        'topics_id': topic_post_day['topics_id'],
        'topic_post_days_id': topic_post_day['topic_post_days_id'],
    }
    db.query(
        """
        UPDATE topic_post_days SET
            posts_fetched = true,
            num_posts_stored = %(num_posts_stored)s,
            num_posts_fetched = %(num_posts_fetched)s
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s
        """,
        update_params
    )

    db.commit()

    log.debug("done inserting into topic_posts")
def get_default_size_attribute(db: DatabaseHandler, timespans_id: int) -> str:
    """Return the default size attribute for a timespan.

    The result is 'author_count' when the timespan has a focus whose focal set uses the 'URL Sharing' focal
    technique, and 'media_inlink_count' in every other case (including a timespan without a focus).

    Arguments:
    db - db handle
    timespans_id - id of the timespans row

    Returns:
    'author_count' or 'media_inlink_count'
    """
    foci_id = db.require_by_id('timespans', timespans_id)['foci_id']

    if foci_id is not None:
        focus = db.require_by_id('foci', foci_id)
        focal_technique = db.require_by_id('focal_sets', focus['focal_sets_id'])['focal_technique']
        if focal_technique == 'URL Sharing':
            return 'author_count'

    return 'media_inlink_count'
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store the posts of a single day that match the topic.

    Posts that do not pass content_matches_topic() for the topic of the day's topic_seed_queries row are
    dropped; the rest are stored and the topic_post_days row is flagged as fetched, all between db.begin()
    and db.commit().

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    seed_query = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', seed_query['topics_id'])

    kept_posts = []
    for candidate in posts:
        if content_matches_topic(candidate['content'], topic):
            kept_posts.append(candidate)

    # NOTE(review): counted after filtering, so it always equals the stored count below -- confirm intended
    num_posts_fetched = len(kept_posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    log.debug("inserting into topic_posts ...")

    for kept_post in kept_posts:
        _store_post_and_urls(db, topic_post_day, kept_post)

    update_params = {
        'a': len(kept_posts),
        'b': num_posts_fetched,
        'c': topic_post_day['topic_post_days_id'],
    }
    db.query(
        """
        update topic_post_days set posts_fetched = true, num_posts_stored = %(a)s, num_posts_fetched = %(b)s
            where topic_post_days_id = %(c)s
        """,
        update_params)

    db.commit()

    log.debug("done inserting into topic_posts")
def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
    """Merge args into the "args" field of the current "job_states" row.

    The stored args are read from the row, overlaid with the given args (given keys win) and written back
    JSON-encoded. If the stored args cannot be decoded, the error is logged and they are treated as empty.

    Arguments:
    db - db handle
    args - args to merge into the stored ones
    """
    new_args = decode_object_from_bytes_if_needed(args)

    job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

    try:
        # job_states.args got changed from JSON to JSONB while sharding the
        # database, and there's no way to disable decoding JSONB (as
        # opposed to JSON) in psycopg2, so "args" might be a JSON string or
        # a pre-decoded dictionary
        raw_args = job_state.get('args', '')
        stored_args = raw_args if isinstance(raw_args, dict) else decode_json(raw_args)
    except Exception as ex:
        log.error(f"Unable to decode args from job state {job_state}: {ex}")
        stored_args = {}

    merged_args = {**stored_args, **new_args}

    db.update_by_id(
        table='job_states',
        object_id=self.__job_states_id,
        update_hash={'args': encode_json(merged_args)},
    )
def _get_deduped_medium(db: DatabaseHandler, media_id: int) -> dict:
    """Return the medium with the given id, or the medium it is ultimately marked as a duplicate of.

    The dup_media_id reference is followed recursively until a medium without one is reached.

    Arguments:
    db - db handle
    media_id - id of the media row to start from

    Returns:
    media row dict whose dup_media_id is None
    """
    medium = db.require_by_id('media', media_id)

    dup_media_id = medium['dup_media_id']
    if dup_media_id is not None:
        # NOTE(review): nothing here guards against a cycle of dup_media_id references
        return _get_deduped_medium(db, dup_media_id)

    return medium
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls.

    Every topic_posts row of the topic is loaded, its 'data' JSON is decoded, the urls are extracted with
    get_tweet_urls() and handed to _insert_post_urls().

    Arguments:
    db - db handle
    topic - topic dict; only topic['topics_id'] is read

    Return:
    None
    """
    # topics_id is exposed by all three joined tables, so the WHERE clause has to name the table explicitly;
    # an unqualified reference is ambiguous
    topic_posts_ids = db.query(
        """
        SELECT topic_posts.topic_posts_id
        FROM topic_posts
            INNER JOIN topic_post_days ON
                topic_posts.topics_id = topic_post_days.topics_id AND
                topic_posts.topic_post_days_id = topic_post_days.topic_post_days_id
            INNER JOIN topic_seed_queries ON
                topic_post_days.topics_id = topic_seed_queries.topics_id AND
                topic_post_days.topic_seed_queries_id = topic_seed_queries.topic_seed_queries_id
        WHERE topic_posts.topics_id = %(topics_id)s
        """,
        {
            'topics_id': topic['topics_id'],
        }
    ).flat()

    num_topic_posts = len(topic_posts_ids)

    for i, topic_posts_id in enumerate(topic_posts_ids):
        # progress report every 1000 posts
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' % (i, num_topic_posts))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
def get_story_date_tag(db: DatabaseHandler, story: dict) -> Optional[tuple]:
    """Return the (tag, tag_set) pair attached to the story from the date guess method / invalid tag sets.

    Arguments:
    db - db handle
    story - story dict; only story['stories_id'] is read

    Returns:
    (tag dict, tag_sets dict) if the story has exactly one such tag, (None, None) if it has none

    More than one matching tag trips an assertion.
    """
    query_params = {
        'a': [GUESS_METHOD_TAG_SET, INVALID_TAG_SET],
        'b': story['stories_id'],
    }
    tags = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
                join stories_tags_map stm using ( tags_id )
            where
                ts.name = any(%(a)s) and
                stm.stories_id = %(b)s
        """,
        query_params).hashes()

    assert len(tags) < 2

    if len(tags) != 1:
        return None, None

    tag = tags[0]
    tag_set = db.require_by_id('tag_sets', tag['tag_sets_id'])

    return tag, tag_set
def fetch_topic_posts(db: DatabaseHandler, topic_seed_query: dict) -> None:
    """For each day within the topic dates, fetch and store posts returned by the topic_seed_query.

    The date range of the topic (start_date through end_date, inclusive) is walked one day at a time. For each
    day that _topic_post_day_fetched() does not report as already fetched, the posts are fetched with
    fetch_posts(), a topic_post_day is added with _add_topic_post_single_day() and the posts are stored with
    _store_posts_for_day().

    Arguments:
    db - database handle
    topic_seed_query - topic_seed_queries dict; its topics_id identifies the topic

    Return:
    None
    """
    topic = db.require_by_id('topics', topic_seed_query['topics_id'])

    start_date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')
    end_date = datetime.datetime.strptime(topic['end_date'], '%Y-%m-%d')

    # both dates are parsed at midnight, so the difference is a whole number of days
    for day_offset in range((end_date - start_date).days + 1):
        date = start_date + datetime.timedelta(days=day_offset)

        log.debug("fetching posts for %s" % date)

        if _topic_post_day_fetched(db, topic_seed_query, date):
            continue

        posts = fetch_posts(topic_seed_query, date)
        topic_post_day = _add_topic_post_single_day(db, topic_seed_query, len(posts), date)
        _store_posts_for_day(db, topic_post_day, posts)
def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
    """
    Set the message field (and last_updated) of the current "job_states" row.

    This is a public method that is intended to be used by code run anywhere above the stack from run() to
    publish messages updating the progress of a long running job.

    Arguments:
    db - db handle
    message - new message for the job state
    """
    decoded_message = decode_object_from_bytes_if_needed(message)
    job_states_id = self.__job_states_id

    # Verify that it exists I guess?
    db.require_by_id(table='job_states', object_id=job_states_id)

    changes = {
        'message': decoded_message,
        'last_updated': sql_now(),
    }
    updated_job_state = db.update_by_id(table='job_states', object_id=job_states_id, update_hash=changes)

    self.__update_table_state(db=db, job_state=updated_job_state)
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        topic_tweet_day: dict,
        meta_tweets: list,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch and store the tweets for a single day.

    The meta tweets are truncated to max_tweets (if given), passed to _add_tweets_to_meta_tweets() in chunks of
    100, filtered with _tweet_matches_pattern() against the topic, and the remaining ones are stored. The
    topic_tweet_days row is then marked as fetched with the number of stored tweets.

    Arguments:
    db - db handle
    topic_tweet_day - topic_tweet_day dict; its 'num_tweets' key is set to the number of stored tweets
    meta_tweets - list of meta tweets found for day
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if max_tweets is not None:
        meta_tweets = meta_tweets[:max_tweets]

    topics_id = topic_tweet_day['topics_id']
    log.info("adding %d tweets for topic %s, day %s" % (len(meta_tweets), topics_id, topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    chunk_size = 100
    for chunk_start in range(0, len(meta_tweets), chunk_size):
        _add_tweets_to_meta_tweets(meta_tweets[chunk_start:chunk_start + chunk_size])

    topic = db.require_by_id('topics', topic_tweet_day['topics_id'])
    matching_tweets = [meta_tweet for meta_tweet in meta_tweets if _tweet_matches_pattern(topic, meta_tweet)]

    log.info("%d tweets remaining after match" % (len(matching_tweets)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    for matching_tweet in matching_tweets:
        _store_tweet_and_urls(db, topic_tweet_day, matching_tweet)

    topic_tweet_day['num_tweets'] = len(matching_tweets)

    update_params = {
        'a': topic_tweet_day['num_tweets'],
        'b': topic_tweet_day['topic_tweet_days_id'],
    }
    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        update_params)

    db.commit()

    log.debug("done inserting into topic_tweets")
def _get_dup_story_groups(db: DatabaseHandler, topic: dict) -> list:
    """Return a list of duplicate story groups.

    Find all stories within a topic that share media_id, normalized title hash and publish day. Return a list
    of story lists; each story list holds stories that are duplicates of each other.

    Arguments:
    db - db handle
    topic - topic dict; only topic['topics_id'] is read

    Returns:
    list of lists of story dicts
    """
    story_pairs = db.query(
        """
        SELECT
            a.stories_id AS stories_id_a,
            b.stories_id AS stories_id_b
        FROM
            snap.live_stories AS a,
            snap.live_stories AS b
        WHERE
            a.topics_id = %(topics_id)s AND
            a.topics_id = b.topics_id AND
            a.stories_id < b.stories_id AND
            a.media_id = b.media_id AND
            a.normalized_title_hash = b.normalized_title_hash AND
            date_trunc('day', a.publish_date) = date_trunc('day', b.publish_date)
        ORDER BY
            stories_id_a,
            stories_id_b
        """,
        {
            'topics_id': topic['topics_id'],
        }
    ).hashes()

    groups_by_first_story = {}
    already_grouped = set()

    for pair in story_pairs:
        # a story that already joined a group as the "b" side is not added a second time
        if pair['stories_id_b'] in already_grouped:
            continue

        story_a = db.require_by_id('stories', pair['stories_id_a'])
        story_b = db.require_by_id('stories', pair['stories_id_b'])

        # the group is keyed by, and starts with, the lower-id story of the pair
        groups_by_first_story.setdefault(story_a['stories_id'], [story_a]).append(story_b)

        already_grouped.add(story_b['stories_id'])

    return list(groups_by_first_story.values())
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        max_tweets_per_day: typing.Optional[int] = None) -> None:
    """For each day within the topic dates, fetch and store the tweets.

    The date range of the topic (start_date through end_date, inclusive) is walked one day at a time. For each
    day that _topic_tweet_day_fetched() does not report as already fetched, the meta tweets are fetched with
    fetch_meta_tweets(), a topic_tweet_day is added with _add_topic_tweet_single_day() and the tweets are
    fetched and stored with _fetch_tweets_for_day(). A McFetchTopicTweetDateFetchedException raised while
    handling a day is ignored and the loop moves on to the next day.

    Arguments:
    db - database handle
    topics_id - topic id
    max_tweets_per_day - max tweets to fetch each day

    Return:
    None

    Raises:
    McFetchTopicTweetsDataException - if the topic platform is not 'twitter'
    """
    topic = db.require_by_id('topics', topics_id)

    if topic['platform'] != 'twitter':
        raise McFetchTopicTweetsDataException("Topic platform is not 'twitter'")

    start_date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')
    end_date = datetime.datetime.strptime(topic['end_date'], '%Y-%m-%d')

    # both dates are parsed at midnight, so the difference is a whole number of days
    for day_offset in range((end_date - start_date).days + 1):
        date = start_date + datetime.timedelta(days=day_offset)
        try:
            log.info("fetching tweets for %s" % date)
            if not _topic_tweet_day_fetched(db, topic, date):
                meta_tweets = fetch_meta_tweets(db, topic, date)
                topic_tweet_day = _add_topic_tweet_single_day(db, topic, len(meta_tweets), date)
                _fetch_tweets_for_day(db, topic_tweet_day, meta_tweets, max_tweets_per_day)
        except McFetchTopicTweetDateFetchedException:
            pass
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        twitter_class: typing.Type[AbstractTwitter] = Twitter,
        ch_class: typing.Type[AbstractCrimsonHexagon] = CrimsonHexagon
) -> None:
    """
    Fetch list of tweets within a Crimson Hexagon monitor based on the ch_monitor_id of the given topic.

    The fetching itself is delegated to _add_topic_tweet_days(); the description below is the documented
    contract of that call.

    Crimson Hexagon returns up to 10k randomly sampled tweets per posts fetch, and each posts fetch can be
    restricted down to a single day. This call fetches tweets from CH day by day, up to a total of 1 million
    tweets for a single topic for the whole date range combined. The call normalizes the number of tweets
    returned for each day so that each day has the same percentage of all tweets found on that day. So if
    there were 20,000 tweets found on the busiest day, each day will use at most 50% of the returned tweets
    for the day.

    One call to this function takes care of both fetching the list of all tweets from CH and fetching each of
    those tweets from twitter (CH does not provide the tweet content, only the url). Each day's worth of tweets
    will be recorded in topic_tweet_days, and subsequent calls to the function will not refetch a given day for
    a given topic, but each call will fetch any days newly included in the date range of the topic given a
    topic dates change.

    If there is no ch_monitor_id for the topic, do nothing.

    Arguments:
    db - db handle
    topics_id - topic id
    twitter_class - optional implementation of AbstractTwitter class; defaults to Twitter
    ch_class - optional implementation of AbstractCrimsonHexagon class; defaults to CrimsonHexagon

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)

    if topic['ch_monitor_id'] is None:
        # report which topic was skipped instead of the literal text "topics_id"
        log.debug("returning after noop because topic %s has a null ch_monitor_id" % topics_id)
        return

    _add_topic_tweet_days(db, topic, twitter_class, ch_class)
def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
    """Merge args into the "args" field of the current "job_states" row.

    The stored JSON args are decoded, overlaid with the given args (given keys win) and written back
    JSON-encoded. If the stored args cannot be decoded, the error is logged and they are treated as empty.

    Arguments:
    db - db handle
    args - args to merge into the stored ones
    """
    new_args = decode_object_from_bytes_if_needed(args)

    job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

    try:
        stored_json = job_state.get('args', '{}')
        stored_args = decode_json(stored_json)
    except Exception as ex:
        log.error(f"Unable to decode args from job state {job_state}: {ex}")
        stored_args = {}

    merged_args = {**stored_args, **new_args}

    db.update_by_id(
        table='job_states',
        object_id=self.__job_states_id,
        update_hash={'args': encode_json(merged_args)},
    )
def fetch_topic_posts(db: DatabaseHandler, topics_id: int) -> None:
    """For each day within the topic dates, fetch and store posts returned by the topic_seed_query.

    The topic must be in 'url_sharing' mode and have exactly one topic_seed_queries row. The date range of the
    topic (start_date through end_date, inclusive) is walked one day at a time. For each day that
    _topic_post_day_fetched() does not report as already fetched, the posts are fetched with fetch_posts(), a
    topic_post_day is added with _add_topic_post_single_day() and the posts are stored with
    _store_posts_for_day().

    Arguments:
    db - database handle
    topics_id - topic id

    Return:
    None

    Raises:
    McFetchTopicPostsDataException - if the topic mode is not 'url_sharing' or the topic does not have exactly
        one topic_seed_queries row
    """
    topic = db.require_by_id('topics', topics_id)

    if topic['mode'] != 'url_sharing':
        # NOTE(review): the message says 'sharing' while the check is against 'url_sharing'
        raise McFetchTopicPostsDataException("Topic mode is not 'sharing'")

    topic_seed_queries = db.query(
        "select * from topic_seed_queries where topics_id = %(a)s",
        {'a': topics_id}).hashes()

    if len(topic_seed_queries) != 1:
        raise McFetchTopicPostsDataException("Topic must have exactly one topic_seed_queries row")

    topic_seed_query = topic_seed_queries[0]

    start_date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')
    end_date = datetime.datetime.strptime(topic['end_date'], '%Y-%m-%d')

    log.warning("%s - %s" % (str(start_date), str(end_date)))

    # both dates are parsed at midnight, so the difference is a whole number of days
    for day_offset in range((end_date - start_date).days + 1):
        date = start_date + datetime.timedelta(days=day_offset)

        log.debug("fetching posts for %s" % date)

        if _topic_post_day_fetched(db, topic, date):
            continue

        posts = fetch_posts(topic_seed_query, date)
        topic_post_day = _add_topic_post_single_day(db, topic, len(posts), date)
        _store_posts_for_day(db, topic_post_day, posts)
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        twitter_class: typing.Type[AbstractTwitter] = Twitter,
        ch_class: typing.Type[AbstractCrimsonHexagon] = CrimsonHexagon) -> None:
    """
    Fetch list of tweets within a Crimson Hexagon monitor based on the ch_monitor_id of the given topic.

    The actual work is done by _add_topic_tweet_days(); what follows is the documented contract of that call.

    Crimson Hexagon returns up to 10k randomly sampled tweets per posts fetch, and each posts fetch can be
    restricted down to a single day. This call fetches tweets from CH day by day, up to a total of 1 million
    tweets for a single topic for the whole date range combined. The call normalizes the number of tweets
    returned for each day so that each day has the same percentage of all tweets found on that day. So if
    there were 20,000 tweets found on the busiest day, each day will use at most 50% of the returned tweets
    for the day.

    One call to this function takes care of both fetching the list of all tweets from CH and fetching each of
    those tweets from twitter (CH does not provide the tweet content, only the url). Each day's worth of tweets
    will be recorded in topic_tweet_days, and subsequent calls to the function will not refetch a given day for
    a given topic, but each call will fetch any days newly included in the date range of the topic given a
    topic dates change.

    If there is no ch_monitor_id for the topic, do nothing.

    Arguments:
    db - db handle
    topics_id - topic id
    twitter_class - optional implementation of AbstractTwitter class; defaults to Twitter
    ch_class - optional implementation of AbstractCrimsonHexagon class; defaults to CrimsonHexagon

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)
    ch_monitor_id = topic['ch_monitor_id']

    if ch_monitor_id is None:
        # name the skipped topic in the log line rather than printing the literal text "topics_id"
        log.debug("returning after noop because topic %s has a null ch_monitor_id" % topics_id)
        return

    _add_topic_tweet_days(db, topic, twitter_class, ch_class)
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls.

    Every topic_posts row selected for the topic is loaded, its 'data' JSON is decoded, the urls are extracted
    with get_tweet_urls() and handed to _insert_post_urls().

    Arguments:
    db - db handle
    topic - topic dict; only topic['topics_id'] is read

    Return:
    None
    """
    query_params = {'a': topic['topics_id']}
    topic_posts_ids = db.query(
        """
        select tt.topic_posts_id
            from topic_posts tt
                join topic_post_days ttd using ( topic_post_days_id )
            where
                topics_id = %(a)s
        """,
        query_params).flat()

    for position, topic_posts_id in enumerate(topic_posts_ids):
        # progress report every 1000 posts
        if position % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' % (position, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        post_data = decode_json(topic_post['data'])
        tweet_urls = get_tweet_urls(post_data['data']['tweet'])
        _insert_post_urls(db, topic_post, tweet_urls)
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update.

    The outcome is reported by mutating the topic_fetch_url dict in place ('state', 'code', 'message',
    'fetch_date', 'stories_id'); this function does not write those fields to the database itself. Progress is
    reported through _update_tfu_message() before each step.

    The steps are tried in order and the function returns at the first one that settles the state:

    1. state is not pending / requeued -> return untouched
    2. url matches the ignore pattern -> FETCH_STATE_IGNORED, code 403
    3. url is already known as failed -> copy state / code / message of the failed url
    4. self linked domain is to be skipped -> FETCH_STATE_SKIPPED, code 403
    5. (pending only) url matches an existing story -> FETCH_STATE_STORY_MATCH, code 200
    6. _get_pending_state() returns a state -> that state
    7. get the content (seeded or fetched) and set the final state from the response

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_urls dict, updated in place
    domain_timeout - passed through to _fetch_url

    Raises:
    McTMFetchLinkException - if story generation hits a duplicate and no matching story can be found afterwards
    """

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    # reuse the result of an earlier failed attempt for the same topic and url
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if mediawords.tm.domains.skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    # the response came from a different url than the one requested: apply the ignore pattern and the story
    # match to that url as well
    # NOTE(review): response_url is treated as possibly None further down; confirm _ignore_link_pattern copes
    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    # final state, in order of precedence: failed request, existing story, content does not match the topic,
    # new story generated
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            # prefer the url the response actually came from
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db, content=content, url=url)
            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except mediawords.tm.stories.McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException("Unable to find matching story after unique constraint error.")
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update.

    The outcome is reported by mutating the topic_fetch_url dict in place ('state', 'code', 'message',
    'fetch_date', 'stories_id'); this function does not write those fields to the database itself. Progress is
    reported through _update_tfu_message() before each step.

    The steps are tried in order and the function returns at the first one that settles the state:

    1. state is not pending / requeued -> return untouched
    2. url matches the ignore pattern -> FETCH_STATE_IGNORE, code 403
    3. url is already known as failed -> copy state / code / message of the failed url
    4. self linked domain is to be skipped -> FETCH_STATE_SKIPPED, code 403
    5. (pending only) url matches an existing story -> FETCH_STATE_STORY_MATCH, code 200
    6. get the content (seeded or fetched) and set the final state from the response

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_urls dict, updated in place
    domain_timeout - passed through to fetch_url

    Raises:
    McTMFetchLinkException - if story generation hits a duplicate and no matching story can be found afterwards
    """

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORE
        topic_fetch_url['code'] = 403
        return

    # reuse the result of an earlier failed attempt for the same topic and url
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if mediawords.tm.domains.skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code(), topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.decoded_content()

    fetched_url = topic_fetch_url['url']
    # a response without a request has no url of its own
    response_url = response.request().url() if response.request() else None

    # the response url differs from the requested one: apply the ignore pattern and the story match to it too
    # NOTE(review): response_url can be None here (see above); confirm _ignore_link_pattern accepts None
    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORE
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code()

    assume_match = topic_fetch_url['assume_match']

    # final state, in order of precedence: failed request, existing story, content does not match the topic,
    # new story generated
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success():
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message()
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not _content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            # prefer the url the response actually came from
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db, content=content, url=url)
            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except mediawords.tm.stories.McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException("Unable to find matching story after unique constraint error.")
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
def _try_fetch_topic_url(db: DatabaseHandler, topic_fetch_url: dict, domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_urls update.

    topic_fetch_url is modified in place: depending on the outcome, 'state', 'code', 'message', 'fetch_date' and
    'stories_id' are set on the dict. This function does not itself save the final row; _update_tfu_message() is
    only called to record progress (presumably in topic_fetch_urls.message -- confirm against that helper).

    Processing stops at the first step that applies:

    1. state is neither FETCH_STATE_PENDING nor FETCH_STATE_REQUEUED: nothing is changed.
    2. the url matches the ignore link pattern: FETCH_STATE_IGNORED, code 403.
    3. _get_failed_url() returns a row: its state, code and message are copied.
    4. skip_self_linked_domain() is true: FETCH_STATE_SKIPPED, code 403.
    5. (pending urls only) get_story_match() finds a story for the url: FETCH_STATE_STORY_MATCH, code 200.
    6. _get_pending_state() returns a state: that state is assigned and fetching is left to another job.
    7. content is taken from _get_seeded_content() or, failing that, from _fetch_url(). If the response url
       differs from the requested url, it is checked against the ignore pattern and matched against existing
       stories. The result is then one of FETCH_STATE_REQUEST_FAILED, FETCH_STATE_STORY_MATCH,
       FETCH_STATE_CONTENT_MATCH_FAILED or FETCH_STATE_STORY_ADDED.

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_urls row as a dict
    domain_timeout - passed through to _fetch_url()

    Raises:
    McTMFetchLinkException - if story generation reports a duplicate but no matching story can be found

    Returns:
    None
    """
    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        log.info(f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching")
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        # reuse the outcome of the earlier failed fetch instead of fetching again
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            log.info(f"Matched story {story_match['stories_id']}, returning")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info(f"Fetching is delayed for another job (state '{pending_state}'), returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        log.info("Seeded content not found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(
            f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern"
        )
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        # the url we ended up at may belong to a story that the requested url did not match
        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        log.info("Content does not match topic")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        log.info("No existing story matched and content matches topic, generating story...")
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url

            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:
            log.info("Duplicate story found, checking for story match on unique constraint error...")

            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)

            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
def extract_links_for_topic_story(
        db: DatabaseHandler,
        stories_id: int,
        topics_id: int,
        test_throw_exception: bool = False,
) -> None:
    """
    Mine a story for links and store them in topic_links for the given topic.

    The story and topic rows are loaded by id, _get_links_from_story() is called on the story, and every link
    that skip_self_linked_domain_url() does not reject is inserted into topic_links and passed to
    increment_domain_links().

    Whatever happens during mining, the matching topic_stories row is marked link_mined = 't' afterwards.
    Almost all errors are caught: the traceback is saved in topic_stories.link_mine_error, which is set to an
    empty string when mining succeeded.

    Arguments:
    db - db handle
    stories_id - id of the story to mine
    topics_id - id of the topic the links belong to
    test_throw_exception - if true, raise a test exception inside the guarded section to exercise error logging

    Returns:
    None
    """
    story = db.require_by_id(table='stories', object_id=stories_id)
    topic = db.require_by_id(table='topics', object_id=topics_id)

    try:
        if test_throw_exception:
            raise McExtractLinksForTopicStoryTestException("Testing whether errors get logged.")

        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))

        for story_link in _get_links_from_story(db, story):
            if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], story_link):
                log.debug("skipping self linked domain url...")
            else:
                new_topic_link = dict(
                    topics_id=topic['topics_id'],
                    stories_id=story['stories_id'],
                    url=story_link,
                )
                db.create('topic_links', new_topic_link)
                increment_domain_links(db, new_topic_link)

    except Exception as ex:
        log.error(f"Link mining error: {ex}")
        link_mine_error = traceback.format_exc()

    else:
        # an empty error string marks a successful mining run
        link_mine_error = ''

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {
            'a': story['stories_id'],
            'b': topic['topics_id'],
            'c': link_mine_error,
        })
def update(db: DatabaseHandler, media_id: int, client: SimilarWebClient):
    """Updates a media_id in the database, along with the summary table.

    One similarweb_metrics row is upserted per entry of the 'visits' list returned by the client (keyed by
    domain and month), and the average over those entries is upserted into similarweb_media_metrics for the
    media_id. Entries whose visit count is None are stored as NULL and counted as 0 in the average.

    Parameters
    ----------
    db : DatabaseHandler
        Connection to the database
    media_id : int
        Media id to fetch audience data for
    client : SimilarWebClient
        client to use when querying SimilarWeb

    Raises
    ------
    ValueError
        If there is no 'media' row with the given id
    SimilarWebException
        If the response carries no 'visits' data (with the API's own error message when it provides one)
    """
    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(media_id, bytes):
        media_id = decode_object_from_bytes_if_needed(media_id)

    media_id = int(media_id)

    try:
        media_data = db.require_by_id('media', media_id)
    except McRequireByIDException as ex:
        raise ValueError('No media found with id {}'.format(media_id)) from ex

    url = media_data['url']
    similarweb_data = client.get(url)
    meta = similarweb_data['meta']
    domain = meta['request']['domain']

    is_domain_exact_match = check_if_is_domain_exact_match(url, domain)

    if 'visits' in similarweb_data:
        visits = []
        for row in similarweb_data['visits']:
            raw_visits = row['visits']
            visits.append(raw_visits)

            month_visits = int(raw_visits) if raw_visits is not None else None

            # On conflict, refresh the stored visit count. Re-assigning only the conflict keys (domain, month)
            # would leave a stale 'visits' value in place when the same month is fetched again.
            db.query(
                """
                INSERT INTO similarweb_metrics (domain, month, visits)
                VALUES (%(domain)s, %(month)s, %(visits)s)
                ON CONFLICT (domain, month) DO UPDATE
                SET visits = %(visits)s
                """,
                {
                    'domain': domain,
                    'month': row['date'],
                    'visits': month_visits,
                })

        if not visits:
            monthly_audience = 0
        else:
            # careful of None values
            monthly_audience = int(sum(j if j else 0 for j in visits) / len(visits))

        db.query(
            """
            INSERT INTO similarweb_media_metrics
                (similarweb_domain, domain_exact_match, monthly_audience, media_id)
            VALUES
                (%(similarweb_domain)s, %(domain_exact_match)s, %(monthly_audience)s, %(media_id)s)
            ON CONFLICT (media_id) DO UPDATE
            SET similarweb_domain = %(similarweb_domain)s,
                domain_exact_match = %(domain_exact_match)s,
                monthly_audience = %(monthly_audience)s
            """,
            {
                'similarweb_domain': domain,
                'domain_exact_match': is_domain_exact_match,
                'monthly_audience': monthly_audience,
                'media_id': media_id,
            })

    elif 'error_message' in meta:
        raise SimilarWebException(meta['error_message'])

    else:
        raise SimilarWebException(
            'Was not able to fetch SimilarWeb data for {} for unknown reason'.format(url))
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state
    of FETCH_STATE_PYTHON_ERROR. McThrottledDomainException is the exception: it is re-raised to the caller, and
    in that case the topic_fetch_urls row is not updated here.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - passed through to _try_fetch_topic_url

    Returns:
    None
    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

        if topic_fetch_url.get('stories_id') is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])

            # only stories not yet in the topic are tested against it and added on a match
            if _is_not_topic_story(db, topic_fetch_url) and _story_matches_topic(
                    db,
                    story,
                    topic,
                    redirect_url=topic_fetch_url['url'],
                    assume_match=topic_fetch_url['assume_match'],
            ):
                mediawords.tm.stories.add_to_topic_stories(db, story, topic)

        # NOTE(review): the same link update as above is attempted a second time once the story may have been
        # added to the topic -- presumably the first attempt can fail before that; confirm before removing either.
        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException:
        # throttling is the caller's business; do not record it as a python error
        raise

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)