def test_get_dup_story_groups():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    # create 9 stories whose titles fall into 3 groups that differ only by case
    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'], {'title': 'Title ' + str(divi)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    # every story in a group should share the same case-insensitive title
    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)

    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story
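# A minimal sketch of the `user` dict shape that _add_user_story() consumes, inferred from
# the fields the function reads above ('name', 'screen_name', 'description'). The values
# here are hypothetical; a real Twitter API user object carries many more fields.
_EXAMPLE_TWITTER_USER = {
    'name': 'Example Account',
    'screen_name': 'example_account',
    'description': 'An example bio used only for illustration.',
}
# e.g.: story = _add_user_story(db, topic, _EXAMPLE_TWITTER_USER, topic_fetch_urls=[])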
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = f"{screen_name}: {content}"
    tweet_date = tweet['created_at']
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)

    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = get_tweet_urls(tweet)
    for url in urls:
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url,
        }

        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
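# A minimal sketch of the `tweet` dict shape that _add_tweet_story() consumes, inferred
# from the fields the function reads above ('user.screen_name', 'text', 'created_at',
# 'id'). The values are hypothetical; a real Twitter API status object carries many more
# fields, and get_tweet_urls() would normally extract links from the tweet entities.
_EXAMPLE_TWEET = {
    'id': 123456789,
    'text': 'Example tweet text linking to https://example.com/article',
    'created_at': '2020-01-01 00:00:00',
    'user': {'screen_name': 'example_account'},
}
# e.g.: story = _add_tweet_story(db, topic, _EXAMPLE_TWEET, topic_fetch_urls=[])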
def test_copy_story_to_new_medium_with_download_error():
    """Test copy_story_to_new_medium with an associated download error."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)

    db.query("update downloads set state = 'error' where stories_id = %(a)s", {'a': old_story['stories_id']})

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    new_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': new_story['stories_id']}).hash()
    assert new_download is not None
    assert new_download['state'] == 'error'
def test_add_missing_normalized_title_hashes():
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories
        SET normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    for field in 'title url guid publish_date'.split():
        assert old_story[field] == new_story[field]

    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }).hash()
    assert topic_story_exists is not None

    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hashes()
    assert len(story_sentences) > 0
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    # give story i exactly i sentences, so each successive story has more sentences
    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                    INSERT INTO story_sentences (
                        stories_id,
                        sentence_number,
                        sentence,
                        media_id,
                        publish_date
                    )
                        SELECT
                            stories_id,
                            %(sentence_number)s AS sentence_number,
                            'foo bar' AS sentence,
                            media_id,
                            publish_date
                        FROM stories
                        WHERE stories_id = %(stories_id)s
                """,
                {
                    'stories_id': story['stories_id'],
                    'sentence_number': j,
                })

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]

    # all stories should be merged into the last story, which has the most sentences
    merged_stories = db.query(
        """
            SELECT stories_id
            FROM topic_stories
            WHERE
                topics_id = %(topics_id)s AND
                stories_id = ANY(%(stories_ids)s)
        """,
        {
            'topics_id': topic['topics_id'],
            'stories_ids': stories_ids,
        }).flat()

    assert merged_stories == [stories_ids[-1]]
def test_find_and_merge_dup_stories():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'], {'title': 'Title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        """
            SELECT COUNT(*)
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
        """,
        {
            'topics_id': topic['topics_id'],
        }).flat()[0]
    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        """
            SELECT COUNT(DISTINCT normalized_title_hash)
            FROM snap.live_stories
            WHERE topics_id = %(topics_id)s
        """,
        {
            'topics_id': topic['topics_id'],
        }).flat()[0]
    assert num_distinct_titles == 3
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        """
            WITH found_topic_stories AS (
                SELECT stories_id
                FROM topic_stories
                WHERE topics_id = %(topics_id)s
            )
            SELECT *
            FROM stories
            WHERE stories_id IN (
                SELECT stories_id
                FROM found_topic_stories
            )
        """,
        {
            'topics_id': topic['topics_id']
        }).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story
def test_find_and_merge_dup_stories():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'], {'title': 'Title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        "select count(*) from topic_stories where topics_id = %(a)s",
        {'a': topic['topics_id']}).flat()[0]
    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        "select count(distinct normalized_title_hash) from snap.live_stories where topics_id = %(a)s",
        {'a': topic['topics_id']}).flat()[0]
    assert num_distinct_titles == 3
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                    insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date)
                        select stories_id, %(b)s, 'foo bar', media_id, publish_date
                        from stories
                        where stories_id = %(a)s
                """,
                {'a': story['stories_id'], 'b': j})

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]

    merged_stories = db.query(
        "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
        {'a': topic['topics_id'], 'b': stories_ids}).flat()

    assert merged_stories == [stories_ids[-1]]
def test_add_missing_normalized_title_hashes():
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query("alter table stories disable trigger stories_add_normalized_title")

    # noinspection SqlWithoutWhere
    db.query("update stories set normalized_title_hash = null")

    db.query("alter table stories enable trigger stories_add_normalized_title")

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0
def test_merge_dup_story():
    """Test _merge_dup_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    old_story = create_test_story(db=db, label='merge old', feed=feed)
    new_story = create_test_story(db=db, label='merge new', feed=feed)
    linked_story = create_test_story(db=db, label='linked', feed=feed)
    linking_story = create_test_story(db=db, label='linking', feed=feed)

    for story in (old_story, new_story, linked_story, linking_story):
        add_to_topic_stories(db, story, topic)

    db.create(
        'topic_links',
        {
            'topics_id': topic['topics_id'],
            'stories_id': old_story['stories_id'],
            'url': old_story['url'],
            'ref_stories_id': linked_story['stories_id'],
        })
    db.create(
        'topic_links',
        {
            'topics_id': topic['topics_id'],
            'stories_id': linking_story['stories_id'],
            'url': old_story['url'],
            'ref_stories_id': old_story['stories_id'],
        })
    db.create('topic_seed_urls', {
        'topics_id': topic['topics_id'],
        'stories_id': old_story['stories_id'],
    })

    _merge_dup_story(db, topic, old_story, new_story)

    old_topic_links = db.query("""
        SELECT *
        FROM topic_links
        WHERE
            topics_id = %(topics_id)s AND
            %(stories_id)s IN (stories_id, ref_stories_id)
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': old_story['stories_id'],
    }).hashes()
    assert len(old_topic_links) == 0

    new_topic_links_linked = db.query("""
        SELECT *
        FROM topic_links
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s AND
            ref_stories_id = %(ref_stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
        'ref_stories_id': linked_story['stories_id'],
    }).hashes()
    assert len(new_topic_links_linked) == 1

    new_topic_links_linking = db.query("""
        SELECT *
        FROM topic_links
        WHERE
            topics_id = %(topics_id)s AND
            ref_stories_id = %(ref_stories_id)s AND
            stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'ref_stories_id': new_story['stories_id'],
        'stories_id': linking_story['stories_id'],
    }).hashes()
    assert len(new_topic_links_linking) == 1

    old_topic_stories = db.query("""
        SELECT *
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': old_story['stories_id'],
    }).hashes()
    assert len(old_topic_stories) == 0

    topic_merged_stories_maps = db.query("""
        SELECT *
        FROM topic_merged_stories_map
        WHERE
            target_stories_id = %(target_stories_id)s AND
            source_stories_id = %(source_stories_id)s
    """, {
        'target_stories_id': new_story['stories_id'],
        'source_stories_id': old_story['stories_id'],
    }).hashes()
    assert len(topic_merged_stories_maps) == 1
def test_merge_dup_story():
    """Test _merge_dup_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    old_story = create_test_story(db=db, label='merge old', feed=feed)
    new_story = create_test_story(db=db, label='merge new', feed=feed)
    linked_story = create_test_story(db=db, label='linked', feed=feed)
    linking_story = create_test_story(db=db, label='linking', feed=feed)

    for story in (old_story, new_story, linked_story, linking_story):
        add_to_topic_stories(db, story, topic)

    db.create(
        'topic_links',
        {
            'topics_id': topic['topics_id'],
            'stories_id': old_story['stories_id'],
            'url': old_story['url'],
            'ref_stories_id': linked_story['stories_id']
        })
    db.create(
        'topic_links',
        {
            'topics_id': topic['topics_id'],
            'stories_id': linking_story['stories_id'],
            'url': old_story['url'],
            'ref_stories_id': old_story['stories_id']
        })
    db.create('topic_seed_urls', {
        'topics_id': topic['topics_id'],
        'stories_id': old_story['stories_id']
    })

    _merge_dup_story(db, topic, old_story, new_story)

    old_topic_links = db.query(
        "select * from topic_links where topics_id = %(a)s and %(b)s in ( stories_id, ref_stories_id )",
        {'a': topic['topics_id'], 'b': old_story['stories_id']}).hashes()
    assert len(old_topic_links) == 0

    new_topic_links_linked = db.query(
        "select * from topic_links where topics_id = %(a)s and stories_id = %(b)s and ref_stories_id = %(c)s",
        {'a': topic['topics_id'], 'b': new_story['stories_id'], 'c': linked_story['stories_id']}).hashes()
    assert len(new_topic_links_linked) == 1

    new_topic_links_linking = db.query(
        "select * from topic_links where topics_id = %(a)s and ref_stories_id = %(b)s and stories_id = %(c)s",
        {'a': topic['topics_id'], 'b': new_story['stories_id'], 'c': linking_story['stories_id']}).hashes()
    assert len(new_topic_links_linking) == 1

    old_topic_stories = db.query(
        "select * from topic_stories where topics_id = %(a)s and stories_id = %(b)s",
        {'a': topic['topics_id'], 'b': old_story['stories_id']}).hashes()
    assert len(old_topic_stories) == 0

    topic_merged_stories_maps = db.query(
        "select * from topic_merged_stories_map where target_stories_id = %(a)s and source_stories_id = %(b)s",
        {'a': new_story['stories_id'], 'b': old_story['stories_id']}).hashes()
    assert len(topic_merged_stories_maps) == 1
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (e.g. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story was created

    If topic_links_id is present in the topic_fetch_url and a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    add_to_topic_stories(db, story, topic)

            # add redirect_url as a lookup url for the story, if it is different from the story url
            if not redirect_url == topic_fetch_url['url']:
                insert_story_urls(db, story, redirect_url)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        # let domain throttling exceptions propagate so that the caller can requeue the fetch
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
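# A minimal usage sketch for fetch_topic_url(), assuming a pending topic_fetch_urls row
# already exists. The row id below is hypothetical, and the queueing/worker machinery
# that normally invokes this function is omitted.
if __name__ == '__main__':
    db = connect_to_db()
    try:
        # fetch a single pending URL, relying on the default per-domain timeout
        fetch_topic_url(db=db, topic_fetch_urls_id=42, domain_timeout=None)
    except McThrottledDomainException:
        # the domain is being fetched too frequently; a real worker would requeue the job
        pass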