def test_get_story_with_most_sentences(self) -> None:
    """Test _get_story_with_most_sentences()."""
    # NOTE(review): a test with this exact name is defined again later in this
    # file; the later definition shadows this one — consider deduplicating.
    db = self.db()

    # Use the mediawords.test.db.create module path for consistency with the
    # other tests in this file (the bare mediawords.test.db path was stale).
    medium = mediawords.test.db.create.create_test_medium(db, "foo")
    feed = mediawords.test.db.create.create_test_feed(db=db, label="foo", medium=medium)

    num_filled_stories = 5
    stories = []
    for i in range(num_filled_stories):
        story = mediawords.test.db.create.create_test_story(db=db, label="foo" + str(i), feed=feed)
        stories.append(story)
        # Story i gets i sentences (story 0 gets none), so the last created
        # story always has the most sentences.
        for n in range(1, i + 1):
            db.create('story_sentences', {
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'sentence': 'foo',
                'sentence_number': n,
                'publish_date': story['publish_date']})

    empty_stories = []
    for i in range(2):
        story = mediawords.test.db.create.create_test_story(db=db, label="foo empty" + str(i), feed=feed)
        empty_stories.append(story)
        stories.append(story)

    # The story with the most sentences wins.
    assert mediawords.tm.stories._get_story_with_most_sentences(db, stories) == stories[num_filled_stories - 1]

    # With no sentences anywhere, the first story passed in is returned.
    assert mediawords.tm.stories._get_story_with_most_sentences(db, [empty_stories[0]]) == empty_stories[0]
    assert mediawords.tm.stories._get_story_with_most_sentences(db, empty_stories) == empty_stories[0]
def test_find_and_merge_dup_stories(self) -> None:
    """Test find_and_merge_dup_stories()."""
    db = self.db()

    topic = mediawords.test.db.create.create_test_topic(db, 'merge')
    medium = mediawords.test.db.create.create_test_medium(db, 'merge')
    feed = mediawords.test.db.create.create_test_feed(db, 'merge', medium=medium)

    num_stories = 10

    # Create num_stories stories that all share one duplicate title; the
    # story created on iteration k is given k sentences.
    stories = []
    for story_num in range(num_stories):
        story = mediawords.test.db.create.create_test_story(db, "merge " + str(story_num), feed=feed)
        db.update_by_id('stories', story['stories_id'], {'title': "long dup title foo bar baz"})
        mediawords.tm.stories.add_to_topic_stories(db, story, topic)
        stories.append(story)

        for sentence_num in range(story_num):
            db.query(
                """
                insert into story_sentences
                    (stories_id, sentence_number, sentence, media_id, publish_date)
                    select stories_id, %(b)s, 'foo bar', media_id, publish_date
                        from stories
                        where stories_id = %(a)s
                """,
                {'a': story['stories_id'], 'b': sentence_num})

    mediawords.tm.stories.find_and_merge_dup_stories(db, topic)

    stories_ids = [s['stories_id'] for s in stories]

    # After merging, only one of the duplicate stories should remain in the
    # topic: the last one, which has the most sentences.
    merged_stories = db.query(
        "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
        {'a': topic['topics_id'], 'b': stories_ids}).flat()

    assert merged_stories == [stories_ids[-1]]
def test_get_story_with_most_sentences(self) -> None:
    """Test _get_story_with_most_sentences()."""
    # (Docstring typo "_senences" fixed.)
    db = self.db()

    medium = mediawords.test.db.create.create_test_medium(db, "foo")
    feed = mediawords.test.db.create.create_test_feed(db=db, label="foo", medium=medium)

    num_filled_stories = 5
    stories = []
    for i in range(num_filled_stories):
        story = mediawords.test.db.create.create_test_story(db=db, label="foo" + str(i), feed=feed)
        stories.append(story)
        # Story i gets i sentences (story 0 gets none), so the last created
        # story always has the most sentences.
        for n in range(1, i + 1):
            db.create('story_sentences', {
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'sentence': 'foo',
                'sentence_number': n,
                'publish_date': story['publish_date']})

    empty_stories = []
    for i in range(2):
        story = mediawords.test.db.create.create_test_story(db=db, label="foo empty" + str(i), feed=feed)
        empty_stories.append(story)
        stories.append(story)

    # The story with the most sentences wins.
    assert mediawords.tm.stories._get_story_with_most_sentences(db, stories) == stories[num_filled_stories - 1]

    # With no sentences anywhere, the first story passed in is returned.
    assert mediawords.tm.stories._get_story_with_most_sentences(db, [empty_stories[0]]) == empty_stories[0]
    assert mediawords.tm.stories._get_story_with_most_sentences(db, empty_stories) == empty_stories[0]
def test_get_story_match(self) -> None:
    """Test get_story_match()."""
    db = self.db()

    medium = mediawords.test.db.create.create_test_medium(db, 'foo')

    num_stories = 10
    stories = []
    for i in range(num_stories):
        row = {
            'media_id': medium['media_id'],
            'url': ('http://stories-%d.com/foo/bar' % i),
            'guid': ('http://stories-%d.com/foo/bar/guid' % i),
            'title': ('story %d' % i),
            'publish_date': '2017-01-01'
        }
        stories.append(db.create('stories', row))

    # no match at all returns None
    assert mediawords.tm.stories.get_story_match(db, 'http://foo.com') is None

    # straight and normalized versions of url and redirect_url
    assert mediawords.tm.stories.get_story_match(db, stories[0]['url']) == stories[0]
    assert mediawords.tm.stories.get_story_match(db, 'http://foo.com', stories[1]['url']) == stories[1]
    assert mediawords.tm.stories.get_story_match(db, stories[2]['url'] + '#foo') == stories[2]
    assert mediawords.tm.stories.get_story_match(db, 'http://foo.com', stories[3]['url'] + '#foo') == stories[3]

    # get_preferred_story - give only one story a sentence, then point every
    # story at the same url so that story is the preferred match
    db.query(
        """
        insert into story_sentences ( stories_id, media_id, publish_date, sentence, sentence_number )
            select stories_id, media_id, publish_date, 'foo', 1
                from stories
                where stories_id = %(a)s
        """,
        {'a': stories[4]['stories_id']})

    stories = db.query("update stories set url = 'http://stories.com/' returning *").hashes()

    assert mediawords.tm.stories.get_story_match(db, 'http://stories.com/') == stories[4]
def test_find_and_merge_dup_stories(self) -> None:
    """Test find_and_merge_dup_stories()."""
    db = self.db()

    topic = mediawords.test.db.create.create_test_topic(db, 'merge')
    medium = mediawords.test.db.create.create_test_medium(db, 'merge')
    feed = mediawords.test.db.create.create_test_feed(db, 'merge', medium=medium)

    num_stories = 10

    stories = []
    for i in range(num_stories):
        # every story shares the same long duplicate title
        story = mediawords.test.db.create.create_test_story(db, "merge " + str(i), feed=feed)
        db.update_by_id('stories', story['stories_id'], {'title': "long dup title foo bar baz"})
        mediawords.tm.stories.add_to_topic_stories(db, story, topic)
        stories.append(story)

        # give the i-th story i sentences, so sentence counts strictly increase
        for j in range(i):
            db.query(
                """
                insert into story_sentences
                    (stories_id, sentence_number, sentence, media_id, publish_date)
                    select stories_id, %(b)s, 'foo bar', media_id, publish_date
                        from stories
                        where stories_id = %(a)s
                """,
                {'a': story['stories_id'], 'b': j})

    mediawords.tm.stories.find_and_merge_dup_stories(db, topic)

    stories_ids = [s['stories_id'] for s in stories]

    merged_stories = db.query(
        "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
        {'a': topic['topics_id'], 'b': stories_ids}).flat()

    # only the story with the most sentences survives the merge
    assert merged_stories == [stories_ids[-1]]
def test_get_story_match(self) -> None:
    """Test get_story_match()."""
    db = self.db()

    medium = mediawords.test.db.create.create_test_medium(db, 'foo')

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = db.create(
            'stories',
            {
                'media_id': medium['media_id'],
                'url': ('http://stories-%d.com/foo/bar' % i),
                'guid': ('http://stories-%d.com/foo/bar/guid' % i),
                'title': ('story %d' % i),
                'publish_date': '2017-01-01'
            })
        stories.append(story)

    # unmatched url returns None
    assert mediawords.tm.stories.get_story_match(db, 'http://foo.com') is None

    # straight and normalized versions of url and redirect_url all match
    assert mediawords.tm.stories.get_story_match(db, stories[0]['url']) == stories[0]
    assert mediawords.tm.stories.get_story_match(db, 'http://foo.com', stories[1]['url']) == stories[1]
    assert mediawords.tm.stories.get_story_match(db, stories[2]['url'] + '#foo') == stories[2]
    assert mediawords.tm.stories.get_story_match(db, 'http://foo.com', stories[3]['url'] + '#foo') == stories[3]

    # get_preferred_story - only story 4 gets a sentence; once all stories
    # share a url, it should be the one returned
    db.query(
        """
        insert into story_sentences ( stories_id, media_id, publish_date, sentence, sentence_number )
            select stories_id, media_id, publish_date, 'foo', 1
                from stories
                where stories_id = %(a)s
        """,
        {'a': stories[4]['stories_id']})

    stories = db.query("update stories set url = 'http://stories.com/' returning *").hashes()

    assert mediawords.tm.stories.get_story_match(db, 'http://stories.com/') == stories[4]