def test_content_matches_topic():
    """Test content_matches_topic()."""
    foo_topic = {'topics_id': 1, 'pattern': 'foo'}

    # matching is case insensitive
    assert content_matches_topic('foo', foo_topic)
    assert content_matches_topic('FOO', foo_topic)

    # whitespace around the pattern is ignored
    assert content_matches_topic('FOO', {'topics_id': 1, 'pattern': ' foo '})

    # content that does not match the pattern fails ...
    assert not content_matches_topic('foo', {'topics_id': 1, 'pattern': 'bar'})

    # ... unless assume_match forces a positive result
    assert content_matches_topic('foo', {'topics_id': 1, 'pattern': 'bar'}, assume_match=True)
def _try_fetch_tweets_chunk(db: DatabaseHandler, topic: Dict[str, Any], topic_fetch_urls: List[Dict[str, Any]]) -> None:
    """Fetch up to URLS_CHUNK_SIZE topic_fetch_urls from twitter api as statuses and add them as topic stories.

    Throw any errors up the stack.

    Arguments:
    db - database handle
    topic - topic dict
    topic_fetch_urls - list of topic_fetch_url dicts whose urls reference tweet statuses

    Return: None
    """
    # group the fetch urls by the status id parsed from each url so that one
    # api call can serve every url that points at the same tweet
    status_lookup: Dict[str, List[Dict[str, Any]]] = {}
    for topic_fetch_url in topic_fetch_urls:
        status_id = parse_status_id_from_url(topic_fetch_url['url'])
        status_lookup.setdefault(status_id, []).append(topic_fetch_url)

    status_ids = list(status_lookup.keys())

    log.info(f"fetching tweets for {len(status_ids)} status_ids ...")
    tweets = fetch_100_tweets(status_ids)

    for tweet in tweets:
        try:
            # pop so that whatever remains in status_lookup afterwards is the
            # set of status ids the api did not return
            matched_urls = status_lookup.pop(str(tweet['id']))
        except KeyError as ex:
            raise KeyError(f"can't find tweet '{tweet['id']}' in ids: {status_ids}") from ex

        if content_matches_topic(tweet['text'], topic):
            _add_tweet_story(db, topic, tweet, matched_urls)
        else:
            # plain loop instead of a side-effect-only list comprehension
            for topic_fetch_url in matched_urls:
                _log_content_match_failed(db, topic_fetch_url)

    # any status ids left over were not returned by the api at all
    for missing_urls in status_lookup.values():
        for topic_fetch_url in missing_urls:
            _log_tweet_missing(db, topic_fetch_url)
def _try_fetch_users_chunk(db: DatabaseHandler, topic: Dict[str, Any], topic_fetch_urls: List[Dict[str, Any]]) -> None:
    """Fetch up to URLS_CHUNK_SIZE topic_fetch_urls from twitter api as users and add them as topic stories.

    Throw any errors up the stack.

    Arguments:
    db - database handle
    topic - topic dict
    topic_fetch_urls - list of topic_fetch_url dicts whose urls reference twitter user pages

    Return: None
    """
    # group the fetch urls by the lowercased screen name parsed from each url
    # so that one api call can serve every url that points at the same user
    url_lookup: Dict[str, List[Dict[str, Any]]] = {}
    for topic_fetch_url in topic_fetch_urls:
        screen_name = parse_screen_name_from_user_url(topic_fetch_url['url']).lower()
        url_lookup.setdefault(screen_name, []).append(topic_fetch_url)

    screen_names = list(url_lookup.keys())

    log.info(f"fetching users for {len(screen_names)} screen_names ...")
    users = fetch_100_users(screen_names)

    for user in users:
        screen_name = user['screen_name'].lower()
        try:
            # pop so that whatever remains in url_lookup afterwards is the set
            # of screen names the api did not return
            matched_urls = url_lookup.pop(screen_name)
        except KeyError as ex:
            raise KeyError(f"can't find user '{user['screen_name']}' in urls: {screen_names}") from ex

        content = f"{user['name']} {user['screen_name']} {user['description']}"
        if content_matches_topic(content, topic):
            _add_user_story(db, topic, user, matched_urls)
        else:
            # plain loop instead of a side-effect-only list comprehension
            for topic_fetch_url in matched_urls:
                _log_content_match_failed(db, topic_fetch_url)

    # any screen names left over were not returned by the api at all
    for missing_urls in url_lookup.values():
        for topic_fetch_url in missing_urls:
            _log_tweet_missing(db, topic_fetch_url)
def _story_matches_topic(db: DatabaseHandler, story: dict, topic: dict, assume_match: bool = False, redirect_url: str = None) -> bool:
    """Test whether the story sentences or metadata of the story match the topic['pattern'] regex.

    Arguments:
    db - database handle
    story - story to match against topic pattern
    topic - topic to match against
    assume_match - if True, treat the story as matching without testing
    redirect_url - alternate url for story

    Return:
    True if the story matches the topic pattern, False otherwise
    """
    if assume_match:
        return True

    # cheap metadata checks first, before touching story_sentences
    for field in ['title', 'description', 'url']:
        if content_matches_topic(story[field], topic):
            return True

    if redirect_url and content_matches_topic(redirect_url, topic):
        return True

    # BUGFIX: string_agg(expression, delimiter) -- the original call had the
    # arguments reversed as string_agg(' ', sentence), which aggregated the
    # literal ' ' delimited by the sentences, so the full text match tested a
    # useless string of spaces instead of the story text.
    # NOTE(review): the join to topics only guards that the topic row exists --
    # confirm it is intentional before removing it.
    row = db.query(
        """
        select string_agg(sentence, ' ') as text
            from story_sentences ss
                join topics c on ( c.topics_id = %(a)s )
            where
                ss.stories_id = %(b)s
        """,
        {'a': topic['topics_id'], 'b': story['stories_id']}).hash()

    if content_matches_topic(row['text'], topic):
        return True

    # explicit False instead of an implicit None so callers get a real bool
    return False
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return: None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])

    # only keep posts whose content matches the topic pattern
    posts = [post for post in posts if content_matches_topic(post['content'], topic)]

    # NOTE(review): this is counted after the filter, so it always equals
    # num_posts_stored below -- confirm whether the pre-filter count was meant
    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    # citus: force multi-shard modifications to run sequentially within this transaction
    db.query("SET LOCAL citus.multi_shard_modify_mode TO 'sequential'")

    log.debug("inserting into topic_posts ...")

    # plain loop instead of a side-effect-only list comprehension
    for post in posts:
        _store_post_and_urls(db, topic_post_day, post)

    db.query(
        """
        UPDATE topic_post_days SET
            posts_fetched = true,
            num_posts_stored = %(num_posts_stored)s,
            num_posts_fetched = %(num_posts_fetched)s
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s
        """,
        {
            'num_posts_stored': len(posts),
            'num_posts_fetched': num_posts_fetched,
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
        }
    )

    db.commit()

    log.debug("done inserting into topic_posts")
def _story_matches_topic(
        db: DatabaseHandler,
        story: dict,
        topic: dict,
        assume_match: bool = False,
        redirect_url: str = None) -> bool:
    """Test whether the story sentences or metadata of the story match the topic['pattern'] regex.

    Arguments:
    db - database handle
    story - story to match against topic pattern
    topic - topic to match against
    assume_match - if True, treat the story as matching without testing
    redirect_url - alternate url for story

    Return:
    True if the story matches the topic pattern, False otherwise
    """
    if assume_match:
        return True

    # cheap metadata checks first, before touching story_sentences
    for field in ['title', 'description', 'url']:
        if content_matches_topic(story[field], topic):
            return True

    if redirect_url and content_matches_topic(redirect_url, topic):
        return True

    # fall back to the full text of the story sentences
    sentences = db.query(
        "select sentence from story_sentences where stories_id = %(a)s",
        {'a': story['stories_id']}).flat()

    text = ' '.join(sentences)

    if content_matches_topic(text, topic):
        return True

    # explicit False instead of an implicit None so callers get a real bool
    return False
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return: None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])

    # only keep posts whose content matches the topic pattern
    posts = [post for post in posts if content_matches_topic(post['content'], topic)]

    # NOTE(review): this is counted after the filter, so it always equals
    # num_posts_stored below -- confirm whether the pre-filter count was meant
    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    log.debug("inserting into topic_posts ...")

    # plain loop instead of a side-effect-only list comprehension
    for post in posts:
        _store_post_and_urls(db, topic_post_day, post)

    db.query(
        """
        update topic_post_days set
            posts_fetched = true,
            num_posts_stored = %(a)s,
            num_posts_fetched = %(b)s
        where topic_post_days_id = %(c)s
        """,
        {
            'a': len(posts),
            'b': num_posts_fetched,
            'c': topic_post_day['topic_post_days_id'],
        })

    db.commit()

    log.debug("done inserting into topic_posts")
def _try_fetch_topic_url(db: DatabaseHandler, topic_fetch_url: dict, domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update.

    Mutates topic_fetch_url in place (state, code, message, stories_id, fetch_date)
    rather than returning a value; the caller is responsible for persisting it.

    Arguments:
    db - database handle
    topic_fetch_url - topic_fetch_url dict to process and update in place
    domain_timeout - optional timeout passed through to _fetch_url

    Return: None
    """
    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        log.info(f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching")
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        # reuse the outcome of the earlier failed fetch instead of retrying
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

    # try to match the story before doing the expensive fetch
    if story_match is not None:
        log.info(f"Matched story {story_match['stories_id']}, returning")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['code'] = 200
        topic_fetch_url['stories_id'] = story_match['stories_id']
        return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info("URL is in pending state, returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        # BUGFIX: this branch runs when NO seeded content was found, but the
        # old message claimed the opposite
        log.info("Seeded content not found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(
            f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern"
        )
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        # BUGFIX: this branch means the content did NOT match, but the old
        # message said "Content matched"
        log.info("Content match failed")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        # content matched and no existing story matched: add a new story
        log.info("Content matched but no story matched, generating story...")
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED
        except McTMStoriesDuplicateException:
            log.info("Duplicate story found, checking for story match on unique constraint error...")
            # may get a unique constraint error for the story addition within the media source. that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)
            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
def _try_fetch_topic_url(db: DatabaseHandler, topic_fetch_url: dict, domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""
    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # only urls still waiting for a (re)fetch get processed; anything else has
    # already been handled
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    prior_failure = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if prior_failure:
        # reuse the result of the earlier failed fetch instead of retrying
        topic_fetch_url['state'] = prior_failure['state']
        topic_fetch_url['code'] = prior_failure['code']
        topic_fetch_url['message'] = prior_failure['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    delayed_state = _get_pending_state(topic_fetch_url)
    if delayed_state:
        topic_fetch_url['state'] = delayed_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is not None:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])
    else:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url']))

    content = response.content
    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    # a redirect happened: re-check ignore rules and retry the story match
    # against both urls
    if response_url != fetched_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = fetched_url if response_url is None else response_url
            story = generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED
        except McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source. that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException("Unable to find matching story after unique constraint error.")
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")