def test_get_dup_story_groups():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'], {'title': 'Title ' + str(divi)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()

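# Illustrative sketch, not the actual implementation: the test above implies
# that _get_dup_story_groups() buckets a topic's stories by case-insensitive
# title and returns the buckets of duplicates. The exact query, and whether
# singleton buckets are filtered out, are assumptions here.
def _get_dup_story_groups_sketch(db, topic):
    from collections import defaultdict

    stories = db.query(
        """
        select s.*
        from stories s
            join topic_stories ts using (stories_id)
        where ts.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    buckets = defaultdict(list)
    for story in stories:
        buckets[story['title'].lower()].append(story)

    # assumption: only groups with more than one story count as duplicates
    return [group for group in buckets.values() if len(group) > 1]
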
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']

def validate_remote_integration(db: DatabaseHandler, source: str, query: str, day: str) -> None:
    """Run sanity test on remote APIs."""
    topic = create_test_topic(db, "test_remote_integration")

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'twitter',
        'source': source,
        'query': query
    }
    db.create('topic_seed_queries', tsq)

    topic['platform'] = 'twitter'
    topic['pattern'] = '.*'
    topic['start_date'] = day
    topic['end_date'] = day
    topic['mode'] = 'url_sharing'
    db.update_by_id('topics', topic['topics_id'], topic)

    fetch_topic_posts(db, topic['topics_id'])

    got_tts = db.query("select * from topic_posts").hashes()

    # for old ch monitors, lots of the posts may be deleted
    assert len(got_tts) > 20

    assert len(got_tts[0]['content']) > MIN_TEST_POST_LENGTH
    assert len(got_tts[0]['author']) > MIN_TEST_AUTHOR_LENGTH

def test_get_failed_url():
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')
    topics_id = topic['topics_id']

    tfus = [
        ['http://story.added', FETCH_STATE_STORY_ADDED],
        ['http://story.matched', FETCH_STATE_STORY_MATCH],
        ['http://request.failed', FETCH_STATE_REQUEST_FAILED],
        ['http://content.match.failed', FETCH_STATE_CONTENT_MATCH_FAILED],
    ]
    for tfu in tfus:
        db.create('topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': tfu[0],
            'state': tfu[1]
        })

    request_failed_tfu = _get_failed_url(db, topics_id, 'http://request.failed')
    assert request_failed_tfu is not None
    assert request_failed_tfu['url'] == 'http://request.failed'

    content_failed_tfu = _get_failed_url(db, topics_id, 'http://content.match.failed')
    assert content_failed_tfu is not None
    assert content_failed_tfu['url'] == 'http://content.match.failed'

    assert _get_failed_url(db, topics_id, 'http://story.added') is None
    assert _get_failed_url(db, topics_id, 'http://bogus.url') is None
    assert _get_failed_url(db, 0, 'http://request.failed') is None

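# Illustrative sketch, not the actual implementation: the assertions above
# imply that _get_failed_url() looks up a topic_fetch_urls row by topic and
# URL, restricted to the failed fetch states. The exact query, and that
# .hash() returns None for an empty result, are assumptions.
def _get_failed_url_sketch(db, topics_id, url):
    failed_states = [FETCH_STATE_REQUEST_FAILED, FETCH_STATE_CONTENT_MATCH_FAILED]
    return db.query(
        """
        select *
        from topic_fetch_urls
        where topics_id = %(topics_id)s
          and url = %(url)s
          and state = any(%(states)s)
        """,
        {'topics_id': topics_id, 'url': url, 'states': failed_states}).hash()
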
def test_fetch_topic_posts() -> None:
    """Run fetch_topic_post tests."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    topic['pattern'] = '.*'
    topic['platform'] = 'generic_post'
    topic['mode'] = 'url_sharing'
    topic['start_date'] = datetime.datetime.strptime(MOCK_START_DATE, '%Y-%m-%d')
    topic['end_date'] = topic['start_date'] + datetime.timedelta(days=MOCK_DAYS - 1)
    db.update_by_id('topics', topic['topics_id'], topic)

    mock_posts = _get_mock_posts()
    mock_posts_csv = CSVStaticPostFetcher()._get_csv_string_from_dicts(mock_posts)

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'generic_post',
        'source': 'csv',
        'ignore_pattern': 'ignore',
        'query': mock_posts_csv
    }
    tsq = db.create('topic_seed_queries', tsq)

    db.update_by_id('topics', topic['topics_id'], {'platform': 'generic_post'})

    fetch_topic_posts(db, tsq)

    topic_post_days = db.query("SELECT * FROM topic_post_days").hashes()
    assert len(topic_post_days) == MOCK_DAYS

    start_date = topic['start_date']
    test_days = [start_date + datetime.timedelta(days=x) for x in range(0, MOCK_DAYS)]
    for d in test_days:
        topic_post_day = db.query(
            """
            SELECT *
            FROM topic_post_days
            WHERE topics_id = %(topics_id)s
              AND topic_seed_queries_id = %(topic_seed_queries_id)s
              AND day = %(day)s
            """,
            {
                'topics_id': tsq['topics_id'],
                'topic_seed_queries_id': tsq['topic_seed_queries_id'],
                'day': d,
            }).hash()
        assert topic_post_day is not None

    _validate_topic_posts(db, topic, mock_posts)
    _validate_topic_post_urls(db, mock_posts)

def _add_timespans_to_stories(db: DatabaseHandler, stories: List[Dict[str, Any]]) -> None:
    """Add timespans to stories for solr indexing."""
    stories = decode_object_from_bytes_if_needed(stories)

    topic = create_test_topic(db=db, label="solr dump test")

    snapshot = db.create(table='snapshots', insert_hash={
        'topics_id': topic['topics_id'],
        'snapshot_date': '2018-01-01',
        'start_date': '2018-01-01',
        'end_date': '2018-01-01',
    })

    timespans = []
    for i in range(1, 5 + 1):
        timespan = db.create(table='timespans', insert_hash={
            'topics_id': topic['topics_id'],
            'snapshots_id': snapshot['snapshots_id'],
            'start_date': '2018-01-01',
            'end_date': '2018-01-01',
            'story_count': 1,
            'story_link_count': 1,
            'medium_count': 1,
            'medium_link_count': 1,
            'post_count': 1,
            'period': 'overall',
        })
        timespans.append(timespan)

    for story in stories:
        assert isinstance(story, dict)

        # rotate the timespan list so that stories get spread across the timespans
        timespan = timespans.pop()
        timespans.insert(0, timespan)

        db.query(
            """
            INSERT INTO snap.story_link_counts (
                topics_id,
                timespans_id,
                stories_id,
                media_inlink_count,
                inlink_count,
                outlink_count
            ) VALUES (
                %(topics_id)s,
                %(timespans_id)s,
                %(stories_id)s,
                1,
                1,
                1
            )
            """,
            {
                'topics_id': timespan['topics_id'],
                'timespans_id': timespan['timespans_id'],
                'stories_id': story['stories_id'],
            })

def test_copy_story_to_new_medium_with_download_error():
    """Test copy_story_to_new_medium with an associated download error."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)

    db.query(
        "update downloads set state = 'error' where stories_id = %(a)s",
        {'a': old_story['stories_id']})

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    new_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': new_story['stories_id']}).hash()

    assert new_download is not None
    assert new_download['state'] == 'error'

def test_skip_self_links(self):
    """Test that self links are skipped within extract_links_for_topic_story"""
    story_domain = get_url_distinctive_domain(self.test_story['url'])

    topic = create_test_topic(self.db, 'links')
    self.db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': self.test_story['stories_id']
    })

    num_links = MAX_SELF_LINKS * 2
    content = ''
    for i in range(num_links):
        plain_text = "Sample sentence to make sure the links get extracted" * 10
        url = "http://%s/%d" % (story_domain, i)
        paragraph = "<p>%s <a href='%s'>link</a></p>\n\n" % (plain_text, url)
        content = content + paragraph

    store_content(self.db, self.test_download, content)

    extract_links_for_topic_story(
        db=self.db,
        stories_id=self.test_story['stories_id'],
        topics_id=topic['topics_id'])

    topic_links = self.db.query(
        "select * from topic_links where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    assert len(topic_links) == MAX_SELF_LINKS

def test_merge_dup_media_story():
    """Test merge_dup_media_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    old_story = create_test_story(db=db, label='merge old', feed=feed)

    new_medium = create_test_medium(db, 'merge new')

    db.update_by_id('media', medium['media_id'], {'dup_media_id': new_medium['media_id']})

    cloned_story = merge_dup_media_story(db, topic, old_story)

    for field in 'url guid publish_date title'.split():
        assert cloned_story[field] == old_story[field]

    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': cloned_story['stories_id'], 'b': topic['topics_id']}).hash()
    assert topic_story is not None

    merged_story = merge_dup_media_story(db, topic, old_story)
    assert merged_story['stories_id'] == cloned_story['stories_id']

def test_try_update_topic_link_ref_stories_id():
    """Test try_update_topic_link_ref_stories_id()."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)

    topic = create_test_topic(db, 'foo')
    db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id']})

    # first update should work
    topic_link_a = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_a = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])
    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail
    topic_link_b = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_b = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_b['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])
    assert topic_link_b['ref_stories_id'] is None

    # now generate a non-unique error and make sure we get an error
    bogus_tfu = {'topic_links_id': 0, 'topics_id': 'nan', 'stories_id': 'nan'}
    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)

def test_add_missing_normalized_title_hashes():
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories
        SET normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    # re-enable the trigger now that the hashes have been nulled out
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0

def test_add_tweet_story():
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {'topics_id': topics_id, 'stories_id': source_story['stories_id']})

    topic_link = {'topics_id': topics_id, 'url': 'u', 'stories_id': source_story['stories_id']}
    topic_link = db.create('topic_links', topic_link)

    tfu = {'topics_id': topics_id, 'url': 'u', 'state': 'pending', 'topic_links_id': topic_link['topic_links_id']}
    tfu = db.create('topic_fetch_urls', tfu)

    tweet = {
        'id': 123,
        'text': 'add tweet story tweet text',
        'user': {'screen_name': 'tweet screen name'},
        'created_at': 'Mon Dec 13 23:21:48 +0000 2010',
        'entities': {'urls': [{'expanded_url': 'http://direct.entity'}]},
        'retweeted_status': {'entities': {'urls': [{'expanded_url': 'http://retweeted.entity'}]}},
        'quoted_status': {'entities': {'urls': [{'expanded_url': 'http://quoted.entity'}]}}
    }

    story = _add_tweet_story(db, topic, tweet, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    assert got_story['title'] == "%s: %s" % (tweet['user']['screen_name'], tweet['text'])
    assert got_story['publish_date'][0:10] == '2010-12-13'
    assert got_story['url'] == 'https://twitter.com/%s/status/%s' % (tweet['user']['screen_name'], tweet['id'])
    assert got_story['guid'] == story['url']

    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    assert get_content_for_first_download(db, story) == tweet['text']

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']}).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    # noinspection PyTypeChecker
    for url in [
            tweet['entities']['urls'][0]['expanded_url'],
            tweet['retweeted_status']['entities']['urls'][0]['expanded_url'],
            tweet['quoted_status']['entities']['urls'][0]['expanded_url']]:
        got_topic_link = db.query(
            "select * from topic_links where topics_id = %(a)s and url = %(b)s",
            {'a': topic['topics_id'], 'b': url}).hash()
        assert got_topic_link is not None

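# Illustrative note, not the actual implementation: the assertions above pin
# down the story fields that _add_tweet_story() derives from a tweet dict;
# this hypothetical helper just restates that mapping.
def _tweet_story_fields_sketch(tweet):
    return {
        'title': "%s: %s" % (tweet['user']['screen_name'], tweet['text']),
        'url': 'https://twitter.com/%s/status/%s' % (tweet['user']['screen_name'], tweet['id']),
        'content': tweet['text'],
    }
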
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    for field in 'title url guid publish_date'.split():
        assert old_story[field] == new_story[field]

    topic_story_exists = db.query(
        """
        SELECT *
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = %(stories_id)s
        """,
        {
            'topics_id': topic['topics_id'],
            'stories_id': new_story['stories_id'],
        }).hash()
    assert topic_story_exists is not None

    new_download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        """,
        {'stories_id': new_story['stories_id']}).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    story_sentences = db.query(
        """
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
        """,
        {'stories_id': new_story['stories_id']}).hashes()
    assert len(story_sentences) > 0

def setUp(self):
    super().setUp()

    self.db = connect_to_db()

    medium = create_test_medium(db=self.db, label='test')
    feed = create_test_feed(db=self.db, label='feed', medium=medium)

    for story_num in range(self.TEST_STORY_COUNT):
        story = create_test_story(db=self.db, label='story-%d' % story_num, feed=feed)
        for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
            self.db.create(table='story_sentences', insert_hash={
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'publish_date': story['publish_date'],
                'sentence_number': sentence_number,
                'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
            })

    # Test topic
    topic = create_test_topic(db=self.db, label='test')
    self.topics_id = topic['topics_id']

    self.db.query(
        """
        INSERT INTO topic_stories (topics_id, stories_id)
        SELECT %(topics_id)s, stories_id
        FROM stories
        """,
        {'topics_id': self.topics_id})

    # Test snapshot
    self.snapshots_id = self.db.query(
        """
        INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
        VALUES (%(topics_id)s, NOW(), NOW(), NOW())
        RETURNING snapshots_id
        """,
        {'topics_id': self.topics_id}).flat()[0]

    self.db.query(
        """
        INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
        SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date
        FROM stories
        """,
        {'snapshots_id': self.snapshots_id})

def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                INSERT INTO story_sentences (
                    stories_id,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date
                )
                SELECT
                    stories_id,
                    %(sentence_number)s AS sentence_number,
                    'foo bar' AS sentence,
                    media_id,
                    publish_date
                FROM stories
                WHERE stories_id = %(stories_id)s
                """,
                {
                    'stories_id': story['stories_id'],
                    'sentence_number': j,
                })

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]

    merged_stories = db.query(
        """
        SELECT stories_id
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = ANY(%(stories_ids)s)
        """,
        {
            'topics_id': topic['topics_id'],
            'stories_ids': stories_ids,
        }).flat()

    assert merged_stories == [stories_ids[-1]]

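# Illustrative note, not the actual implementation: in the test above, story i
# is given i sentences, and after _merge_dup_stories() only the story with the
# most sentences is left in topic_stories. A hypothetical sketch of that
# keeper choice (tie-breaking and the actual merge mechanics are not modeled):
def _choose_keeper_story_sketch(db, stories):
    def _sentence_count(story):
        return db.query(
            "select count(*) from story_sentences where stories_id = %(a)s",
            {'a': story['stories_id']}).flat()[0]

    # assumption: the story with the most sentences is kept, the rest merged into it
    return max(stories, key=_sentence_count)
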
def test_find_and_merge_dup_stories():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'], {'title': 'Title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        """
        SELECT COUNT(*)
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
        """,
        {'topics_id': topic['topics_id']}).flat()[0]
    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        """
        SELECT COUNT(DISTINCT normalized_title_hash)
        FROM snap.live_stories
        WHERE topics_id = %(topics_id)s
        """,
        {'topics_id': topic['topics_id']}).flat()[0]
    assert num_distinct_titles == 3

def test_try_fetch_tweets_chunk_multiple():
    def _try_fetch_tweets_chunk_threaded(topic_: dict, tfus_: list) -> None:
        """Call ftu._try_fetch_tweets_chunk with a newly created db handle for thread safety."""
        db_ = connect_to_db()
        with requests_mock.Mocker() as m:
            m.get("https://api.twitter.com/1.1/statuses/lookup.json", text=mock_statuses_lookup)
            _try_fetch_tweets_chunk(db_, topic_, tfus_)

    num_threads = 20

    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    topics_id = topic['topics_id']

    num_urls_per_thread = 100

    threads = []
    for j in range(num_threads):
        tfus = []
        for i in range(num_urls_per_thread):
            url = 'https://twitter.com/foo/status/%d' % i
            tfu = db.create('topic_fetch_urls', {
                'topics_id': topics_id,
                'url': url,
                'state': 'pending'
            })
            tfus.append(tfu)

        random.shuffle(tfus)

        t = threading.Thread(target=_try_fetch_tweets_chunk_threaded, args=(topic, tfus))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    [num_topic_stories] = db.query(
        "select count(*) from topic_stories where topics_id = %(a)s",
        {'a': topics_id}).flat()

    assert num_urls_per_thread == num_topic_stories

def test_try_fetch_users_chunk_multiple():
    """Test fetch_100_users using mock. Run in parallel to test for race conditions."""
    def _try_fetch_users_chunk_parallel(topic_: dict, tfus_: list) -> None:
        db_ = connect_to_db()
        with requests_mock.Mocker() as m:
            m.post("https://api.twitter.com/1.1/users/lookup.json", text=mock_users_lookup)
            _try_fetch_users_chunk(db_, topic_, tfus_)

    num_jobs = 20

    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    topics_id = topic['topics_id']

    num_urls_per_job = 100

    jobs = []
    for j in range(num_jobs):
        tfus = []
        for i in range(num_urls_per_job):
            url = f'https://twitter.com/test_user_{i}'
            tfu = db.create('topic_fetch_urls', {
                'topics_id': topics_id,
                'url': url,
                'state': 'pending'
            })
            tfus.append(tfu)

        random.shuffle(tfus)

        job = multiprocessing.Process(target=_try_fetch_users_chunk_parallel, args=(topic, tfus))
        job.start()
        jobs.append(job)

    for job in jobs:
        job.join()

    [num_topic_stories] = db.query(
        "SELECT COUNT(*) FROM topic_stories WHERE topics_id = %(topics_id)s",
        {'topics_id': topics_id}).flat()

    assert num_urls_per_job == num_topic_stories

def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')

    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        """
        WITH found_topic_stories AS (
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
        )
        SELECT *
        FROM stories
        WHERE stories_id IN (
            SELECT stories_id
            FROM found_topic_stories
        )
        """,
        {'topics_id': topic['topics_id']}).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']

def test_call_function_on_url_chunk():
    """Test _call_function_on_url_chunks()."""
    _chunk_collector = []

    # noinspection PyUnusedLocal
    def _test_function(db_, topic_, urls_):
        _chunk_collector.append(urls_)

    # noinspection PyUnusedLocal
    def _error_function(db_, topic_, urls_):
        raise Exception('chunk exception')

    db = connect_to_db()

    topic = create_test_topic(db, 'test')

    urls = list(range(URLS_CHUNK_SIZE * 2))

    _call_function_on_url_chunks(db, topic, urls, _test_function)

    assert _chunk_collector == [urls[0:URLS_CHUNK_SIZE], urls[URLS_CHUNK_SIZE:]]

    for i in range(URLS_CHUNK_SIZE * 2):
        db.create('topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': 'foo',
            'state': 'pending'
        })

    topic_fetch_urls = db.query("select * from topic_fetch_urls").hashes()

    _call_function_on_url_chunks(db, topic, topic_fetch_urls, _error_function)

    [error_count] = db.query(
        "select count(*) from topic_fetch_urls where state = %(a)s",
        {'a': FETCH_STATE_PYTHON_ERROR}).flat()

    assert error_count == URLS_CHUNK_SIZE * 2

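# Illustrative sketch, not the actual implementation: the test above implies
# that _call_function_on_url_chunks() processes the url list in chunks of
# URLS_CHUNK_SIZE and, when the callback raises, marks every topic_fetch_url
# in the failed chunk as FETCH_STATE_PYTHON_ERROR instead of propagating.
# The 'message' column and the per-chunk error handling are assumptions.
def _call_function_on_url_chunks_sketch(db, topic, urls, func):
    for i in range(0, len(urls), URLS_CHUNK_SIZE):
        chunk = urls[i:i + URLS_CHUNK_SIZE]
        try:
            func(db, topic, chunk)
        except Exception as ex:
            # broad except is deliberate here: one bad chunk must not stop the rest
            ids = [u['topic_fetch_urls_id'] for u in chunk]
            db.query(
                """
                update topic_fetch_urls
                set state = %(state)s,
                    message = %(message)s
                where topic_fetch_urls_id = any(%(ids)s)
                """,
                {'state': FETCH_STATE_PYTHON_ERROR, 'message': str(ex), 'ids': ids})
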
def test_find_and_merge_dup_stories():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'], {'title': 'Title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        "select count(*) from topic_stories where topics_id = %(a)s",
        {'a': topic['topics_id']}).flat()[0]
    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        "select count(distinct normalized_title_hash) from snap.live_stories where topics_id = %(a)s",
        {'a': topic['topics_id']}).flat()[0]
    assert num_distinct_titles == 3

def test_get_seeded_content():
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')
    tfu = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://0.0.0.1/foo',
        'assume_match': True,
        'state': FETCH_STATE_PENDING
    })

    assert _get_seeded_content(db, tfu) is None

    tsu_content = '<title>seeded content</title>'
    db.create('topic_seed_urls', {
        'topics_id': topic['topics_id'],
        'url': tfu['url'],
        'content': tsu_content
    })

    response = _get_seeded_content(db, tfu)

    assert response.content == tsu_content
    assert response.code == 200
    assert response.last_requested_url == tfu['url']

    fetch_topic_url(db, tfu['topic_fetch_urls_id'], domain_timeout=0)

    tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id'])

    assert tfu['state'] == FETCH_STATE_STORY_ADDED
    assert tfu['code'] == 200
    assert tfu['stories_id'] is not None

    story = db.require_by_id('stories', tfu['stories_id'])

    assert story['title'] == 'seeded content'

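# Illustrative sketch, not the actual implementation: the assertions above
# imply that _get_seeded_content() returns None unless a topic_seed_urls row
# with content exists for the fetch URL, in which case it fakes a successful
# HTTP response. The response class and query below are assumptions that
# mirror only the asserted fields (content, code, last_requested_url).
class _FakeSeededResponseSketch:
    def __init__(self, content, url):
        self.content = content
        self.code = 200
        self.last_requested_url = url

def _get_seeded_content_sketch(db, topic_fetch_url):
    seed = db.query(
        """
        select content
        from topic_seed_urls
        where topics_id = %(topics_id)s
          and url = %(url)s
          and content is not null
        """,
        {
            'topics_id': topic_fetch_url['topics_id'],
            'url': topic_fetch_url['url'],
        }).hash()

    if seed is None:
        return None

    return _FakeSeededResponseSketch(seed['content'], topic_fetch_url['url'])
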
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date)
                    select stories_id, %(b)s, 'foo bar', media_id, publish_date
                    from stories
                    where stories_id = %(a)s
                """,
                {'a': story['stories_id'], 'b': j})

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]

    merged_stories = db.query(
        "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
        {'a': topic['topics_id'], 'b': stories_ids}).flat()

    assert merged_stories == [stories_ids[-1]]

def setUp(self):
    super().setUp()

    medium = create_test_medium(db=self.db(), label='test')
    feed = create_test_feed(db=self.db(), label='feed', medium=medium)

    for story_num in range(self.TEST_STORY_COUNT):
        story = create_test_story(db=self.db(), label='story-%d' % story_num, feed=feed)
        for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
            self.db().create(table='story_sentences', insert_hash={
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'publish_date': story['publish_date'],
                'sentence_number': sentence_number,
                'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
            })

    # Test topic
    topic = create_test_topic(db=self.db(), label='test')
    self.topics_id = topic['topics_id']

    self.db().query("""
        INSERT INTO topic_stories (topics_id, stories_id)
        SELECT %(topics_id)s, stories_id
        FROM stories
    """, {'topics_id': self.topics_id})

    # Test snapshot
    self.snapshots_id = self.db().query("""
        INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
        VALUES (%(topics_id)s, NOW(), NOW(), NOW())
        RETURNING snapshots_id
    """, {'topics_id': self.topics_id}).flat()[0]

    self.db().query("""
        INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
        SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date
        FROM stories
    """, {'snapshots_id': self.snapshots_id})

def test_add_missing_normalized_title_hashes():
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query("alter table stories disable trigger stories_add_normalized_title")
    # noinspection SqlWithoutWhere
    db.query("update stories set normalized_title_hash = null")
    db.query("alter table stories enable trigger stories_add_normalized_title")

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0

def test_merge_foreign_rss_stories():
    """Test merge_foreign_rss_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')

    medium = create_test_medium(db, 'norss')
    feed = create_test_feed(db=db, label='norss', medium=medium)
    num_stories = 10
    stories = [
        create_test_story(db=db, label=str(i), feed=feed)
        for i in range(num_stories)
    ]

    rss_medium = create_test_medium(db, 'rss')
    rss_medium = db.query(
        """
        UPDATE media
        SET foreign_rss_links = 't'
        WHERE media_id = %(media_id)s
        RETURNING *
        """,
        {'media_id': rss_medium['media_id']}).hash()
    rss_feed = create_test_feed(db=db, label='rss', medium=rss_medium)
    num_rss_stories = 10
    rss_stories = []
    for i in range(num_rss_stories):
        story = create_test_story(db=db, label=str(i), feed=rss_feed)
        download = db.create('downloads', {
            'stories_id': story['stories_id'],
            'feeds_id': rss_feed['feeds_id'],
            'url': story['url'],
            'host': 'foo',
            'type': 'content',
            'state': 'success',
            'priority': 0,
            'sequence': 0,
            'path': 'postgresql'
        })
        store_content(db, download, story['title'])
        rss_stories.append(story)

    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO topic_stories (stories_id, topics_id)
        SELECT stories_id, %(topics_id)s AS topics_id
        FROM stories
        """,
        {'topics_id': int(topic['topics_id'])})

    assert db.query("SELECT COUNT(*) FROM topic_stories").flat()[0] == num_stories + num_rss_stories

    merge_foreign_rss_stories(db, topic)

    assert db.query("SELECT COUNT(*) FROM topic_stories").flat()[0] == num_stories
    assert db.query("SELECT COUNT(*) FROM topic_seed_urls").flat()[0] == num_rss_stories

    got_topic_stories_ids = db.query("SELECT stories_id FROM topic_stories").flat()
    expected_topic_stories_ids = [s['stories_id'] for s in stories]
    assert sorted(got_topic_stories_ids) == sorted(expected_topic_stories_ids)

    got_seed_urls = db.query(
        """
        SELECT topics_id, url, content
        FROM topic_seed_urls
        WHERE topics_id = %(topics_id)s
        """,
        {'topics_id': topic['topics_id']}).hashes()
    expected_seed_urls = [
        {'url': s['url'], 'topics_id': topic['topics_id'], 'content': s['title']}
        for s in rss_stories
    ]

    assert sorted(got_seed_urls, key=itemgetter('url')) == sorted(expected_seed_urls, key=itemgetter('url'))

def test_extract_links_for_topic_story(self) -> None:
    """Test extract_links_for_topic_story()."""
    self.test_story['description'] = 'http://foo.com http://bar.com'
    self.db.update_by_id('stories', self.test_story['stories_id'], self.test_story)

    topic = create_test_topic(self.db, 'links')
    self.db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': self.test_story['stories_id']
    })

    extract_links_for_topic_story(
        db=self.db,
        stories_id=self.test_story['stories_id'],
        topics_id=topic['topics_id'],
    )

    got_topic_links = self.db.query(
        """
        SELECT topics_id, stories_id, url
        FROM topic_links
        WHERE topics_id = %(topics_id)s
        ORDER BY url
        """,
        {'topics_id': topic['topics_id']}).hashes()
    expected_topic_links = [
        {
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
            'url': 'http://bar.com'
        },
        {
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
            'url': 'http://foo.com'
        },
    ]
    assert got_topic_links == expected_topic_links

    got_topic_story = self.db.query(
        """
        SELECT topics_id, stories_id, link_mined
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = %(stories_id)s
        """,
        {
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
        }).hash()
    expected_topic_story = {
        'topics_id': topic['topics_id'],
        'stories_id': self.test_story['stories_id'],
        'link_mined': True,
    }
    assert got_topic_story == expected_topic_story

    # generate an error and make sure that it gets saved to topic_stories
    del self.test_story['url']
    extract_links_for_topic_story(
        db=self.db,
        stories_id=self.test_story['stories_id'],
        topics_id=topic['topics_id'],
        test_throw_exception=True,
    )

    got_topic_story = self.db.query(
        """
        SELECT topics_id, stories_id, link_mined, link_mine_error
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = %(stories_id)s
        """,
        {
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
        }).hash()
    assert "McExtractLinksForTopicStoryTestException" in got_topic_story['link_mine_error']
    assert got_topic_story['link_mined']

def test_get_topic_url_variants(self):
    media = create_test_story_stack(db=self.db(), data={
        'A': {
            'B': [1, 2, 3],
            'C': [4, 5, 6],
        },
        'D': {
            'E': [7, 8, 9],
        }
    })

    story_1 = media['A']['feeds']['B']['stories']['1']
    story_2 = media['A']['feeds']['B']['stories']['2']
    story_3 = media['A']['feeds']['B']['stories']['3']
    story_4 = media['A']['feeds']['C']['stories']['4']

    self.db().query(
        """
        INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
        VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """,
        {
            'source_stories_id': story_2['stories_id'],
            'target_stories_id': story_1['stories_id'],
        })
    self.db().query(
        """
        INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
        VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """,
        {
            'source_stories_id': story_3['stories_id'],
            'target_stories_id': story_2['stories_id'],
        })

    self.db().create(
        table='tag_sets',
        insert_hash={'name': 'foo'},
    )

    topic = create_test_topic(db=self.db(), label='foo')

    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_1['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_1['stories_id'],
        'url': story_1['url'],
        'redirect_url': story_1['url'] + "/redirect_url",
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_2['stories_id'],
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_2['stories_id'],
        'url': story_2['url'],
        'redirect_url': story_2['url'] + "/redirect_url",
    })
    self.db().create(table='topic_stories', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_3['stories_id']
    })
    self.db().create(table='topic_links', insert_hash={
        'topics_id': topic['topics_id'],
        'stories_id': story_4['stories_id'],
        'ref_stories_id': story_3['stories_id'],
        'url': story_3['url'] + '/alternate',
    })

    test_url = story_1['url'] + self.CRUFT

    expected_urls = {
        story_1['url'],
        story_1['url'] + self.CRUFT,
        story_2['url'],
        story_1['url'] + "/redirect_url",
        story_2['url'] + "/redirect_url",
        story_3['url'],
        story_3['url'] + "/alternate",
    }

    url_variants = all_url_variants(db=self.db(), url=test_url)

    assert len(expected_urls) == len(url_variants)

    sorted_expected_urls = sorted(expected_urls)
    sorted_url_variants = sorted(url_variants)
    for i in range(len(sorted_expected_urls)):
        assert urls_are_equal(url1=sorted_expected_urls[i], url2=sorted_url_variants[i])

def setUp(self):
    self.db = connect_to_db()

    db = self.db

    self.connected_media = []
    for i in range(self.__NUM_CONNECTED_MEDIA):
        self.connected_media.append(create_test_medium(db, 'connected %d' % i))

    self.disconnected_media = []
    for i in range(self.__NUM_DISCONNECTED_MEDIA):
        self.disconnected_media.append(create_test_medium(db, 'disconnected %d' % i))

    self.all_media = self.connected_media + self.disconnected_media

    self.topic = create_test_topic(db, 'foo')
    self.timespan = create_test_timespan(db, self.topic)

    # link every other connected medium to the first one
    center_medium = self.connected_media[0]
    for medium in self.connected_media[1:]:
        db.query(
            """
            INSERT INTO snap.medium_links (
                topics_id,
                timespans_id,
                source_media_id,
                ref_media_id,
                link_count
            ) VALUES (
                %(topics_id)s,
                %(timespans_id)s,
                %(source_media_id)s,
                %(ref_media_id)s,
                1
            )
            """,
            {
                'topics_id': self.topic['topics_id'],
                'timespans_id': self.timespan['timespans_id'],
                'source_media_id': medium['media_id'],
                'ref_media_id': center_medium['media_id'],
            })

    db.query("""
        INSERT INTO snap.medium_link_counts (
            topics_id,
            timespans_id,
            media_id,
            media_inlink_count,
            outlink_count,
            story_count,
            inlink_count,
            sum_media_inlink_count
        )
        SELECT
            topics_id,
            timespans_id,
            media_id,
            media_id,
            1,
            1,
            1,
            1
        FROM timespans AS t
            CROSS JOIN media AS m
    """)

    tag_set = db.find_or_create('tag_sets', {'name': 'retweet_partisanship_2016_count_10'})
    tag = db.find_or_create('tags', {
        'tag_sets_id': tag_set['tag_sets_id'],
        'tag': 'right'
    })

    db.find_or_create('color_sets', {
        'color': 'bb0404',
        'color_set': 'partisan_retweet',
        'id': 'right'
    })
    db.find_or_create('color_sets', {
        'color': '',
        'color_set': 'partisan_retweet',
        'id': 'right'
    })

    db.query(
        "INSERT INTO media_tags_map (media_id, tags_id) SELECT media_id, %(a)s FROM media",
        {'a': tag['tags_id']})

def test_get_topic_url_variants(self):
    media = create_test_story_stack(
        db=self.db(),
        data={
            'A': {
                'B': [1, 2, 3],
                'C': [4, 5, 6],
            },
            'D': {
                'E': [7, 8, 9],
            }
        }
    )

    story_1 = media['A']['feeds']['B']['stories']['1']
    story_2 = media['A']['feeds']['B']['stories']['2']
    story_3 = media['A']['feeds']['B']['stories']['3']
    story_4 = media['A']['feeds']['C']['stories']['4']

    self.db().query("""
        INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
        VALUES (%(source_stories_id)s, %(target_stories_id)s)
    """, {
        'source_stories_id': story_2['stories_id'],
        'target_stories_id': story_1['stories_id'],
    })

    self.db().query("""
        INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
        VALUES (%(source_stories_id)s, %(target_stories_id)s)
    """, {
        'source_stories_id': story_3['stories_id'],
        'target_stories_id': story_2['stories_id'],
    })

    self.db().create(
        table='tag_sets',
        insert_hash={'name': 'foo'},
    )

    topic = create_test_topic(db=self.db(), label='foo')

    self.db().create(
        table='topic_stories',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_4['stories_id'],
        }
    )

    self.db().create(
        table='topic_stories',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_1['stories_id'],
        }
    )

    self.db().create(
        table='topic_links',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_4['stories_id'],
            'ref_stories_id': story_1['stories_id'],
            'url': story_1['url'],
            'redirect_url': story_1['url'] + "/redirect_url",
        }
    )

    self.db().create(
        table='topic_stories',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_2['stories_id'],
        }
    )

    self.db().create(
        table='topic_links',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_4['stories_id'],
            'ref_stories_id': story_2['stories_id'],
            'url': story_2['url'],
            'redirect_url': story_2['url'] + "/redirect_url",
        }
    )

    self.db().create(
        table='topic_stories',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_3['stories_id']
        }
    )

    self.db().create(
        table='topic_links',
        insert_hash={
            'topics_id': topic['topics_id'],
            'stories_id': story_4['stories_id'],
            'ref_stories_id': story_3['stories_id'],
            'url': story_3['url'] + '/alternate',
        }
    )

    test_url = story_1['url'] + self.CRUFT

    expected_urls = {
        story_1['url'],
        story_1['url'] + self.CRUFT,
        story_2['url'],
        story_1['url'] + "/redirect_url",
        story_2['url'] + "/redirect_url",
        story_3['url'],
        story_3['url'] + "/alternate",
    }

    url_variants = all_url_variants(db=self.db(), url=test_url)

    assert len(expected_urls) == len(url_variants)

    sorted_expected_urls = sorted(expected_urls)
    sorted_url_variants = sorted(url_variants)
    for i in range(len(sorted_expected_urls)):
        assert urls_are_equal(url1=sorted_expected_urls[i], url2=sorted_url_variants[i])

def test_add_user_story():
    """Test _add_user_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {
        'topics_id': topics_id,
        'stories_id': source_story['stories_id']
    })

    topic_link = db.create('topic_links', {
        'topics_id': topics_id,
        'url': 'u',
        'stories_id': source_story['stories_id'],
    })

    tfu = db.create('topic_fetch_urls', {
        'topics_id': topics_id,
        'url': 'u',
        'state': 'pending',
        'topic_links_id': topic_link['topic_links_id'],
    })

    user = {
        'id': 123,
        'screen_name': 'test_screen_name',
        'name': 'test screen name',
        'description': 'test user description'
    }

    story = _add_user_story(db, topic, user, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    assert got_story['title'] == f"{user['name']} ({user['screen_name']}) | Twitter"
    assert got_story['url'] == f"https://twitter.com/{user['screen_name']}"

    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    assert get_content_for_first_download(db, story) == content

    got_topic_story = db.query(
        """
        SELECT *
        FROM topic_stories
        WHERE stories_id = %(stories_id)s
          AND topics_id = %(topics_id)s
        """,
        {
            'stories_id': story['stories_id'],
            'topics_id': topic['topics_id'],
        }).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    got_undateable_tag = db.query(
        """
        SELECT *
        FROM stories_tags_map AS stm
            INNER JOIN tags AS t USING (tags_id)
            INNER JOIN tag_sets USING (tag_sets_id)
        WHERE stories_id = %(stories_id)s
          AND tag = 'undateable'
          AND name = 'date_invalid'
        """,
        {'stories_id': got_story['stories_id']}).hash()
    assert got_undateable_tag

def test_add_user_story():
    """Test _add_user_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {
        'topics_id': topics_id,
        'stories_id': source_story['stories_id']
    })

    topic_link = {
        'topics_id': topics_id,
        'url': 'u',
        'stories_id': source_story['stories_id']
    }
    topic_link = db.create('topic_links', topic_link)

    tfu = {
        'topics_id': topics_id,
        'url': 'u',
        'state': 'pending',
        'topic_links_id': topic_link['topic_links_id']
    }
    tfu = db.create('topic_fetch_urls', tfu)

    user = {
        'id': 123,
        'screen_name': 'test_screen_name',
        'name': 'test screen name',
        'description': 'test user description'
    }

    story = _add_user_story(db, topic, user, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    assert got_story['title'] == "%s (%s) | Twitter" % (user['name'], user['screen_name'])
    assert got_story['url'] == 'https://twitter.com/%s' % user['screen_name']

    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    assert get_content_for_first_download(db, story) == content

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']}).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    got_undateable_tag = db.query(
        """
        select *
        from stories_tags_map stm
            join tags t using (tags_id)
            join tag_sets using (tag_sets_id)
        where stories_id = %(a)s
          and tag = 'undateable'
          and name = 'date_invalid'
        """,
        {'a': got_story['stories_id']}).hash()
    assert got_undateable_tag

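# Illustrative note, not the actual implementation: the assertions in the two
# tests above pin down the story fields that _add_user_story() derives from a
# Twitter user dict; this hypothetical helper just restates that mapping.
def _user_story_fields_sketch(user):
    return {
        'title': "%s (%s) | Twitter" % (user['name'], user['screen_name']),
        'url': 'https://twitter.com/%s' % user['screen_name'],
        'content': '%s (%s): %s' % (user['name'], user['screen_name'], user['description']),
    }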