def test_get_story_with_most_sentences():
    """Test _get_story_with_most_sentences()."""
    db = connect_to_db()
    medium = create_test_medium(db, "foo")
    feed = create_test_feed(db=db, label="foo", medium=medium)

    num_filled_stories = 5
    stories = []
    for i in range(num_filled_stories):
        story = create_test_story(db=db, label="foo" + str(i), feed=feed)
        stories.append(story)
        # story i gets i sentences (story 0 gets none), so the last filled
        # story ends up with the most sentences
        for n in range(1, i + 1):
            db.create(
                'story_sentences',
                {
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'sentence': 'foo',
                    'sentence_number': n,
                    'publish_date': story['publish_date']
                })

    # a couple of stories with no sentences at all
    empty_stories = []
    for i in range(2):
        story = create_test_story(db=db, label="foo empty" + str(i), feed=feed)
        empty_stories.append(story)
        stories.append(story)

    # the story with the most sentences wins
    assert _get_story_with_most_sentences(db, stories) == stories[num_filled_stories - 1]

    # with no sentences anywhere, the first story is returned
    assert _get_story_with_most_sentences(db, [empty_stories[0]]) == empty_stories[0]
    assert _get_story_with_most_sentences(db, empty_stories) == empty_stories[0]
def test_try_update_topic_link_ref_stories_id():
    """Test try_update_topic_link_ref_stories_id().

    Verifies that the first update of a topic link's ref_stories_id succeeds,
    that a second update which would create a duplicate (source, ref) pair is
    silently skipped, and that a genuinely bogus fetch url raises.
    """
    db = connect_to_db()
    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)
    topic = create_test_topic(db, 'foo')
    db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id']})

    # first update should work
    topic_link_a = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_a = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])

    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail
    topic_link_b = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    # BUG FIX: point the second fetch url at topic_link_b.  The original
    # passed topic_link_a's id here, so the update re-targeted topic_link_a
    # (a no-op) and the assertion below passed vacuously without ever
    # exercising the silent-failure path.
    topic_fetch_url_b = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_b['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])

    # updating topic_link_b to the same (source, ref) pair as topic_link_a
    # should have been silently skipped, leaving ref_stories_id unset
    assert topic_link_b['ref_stories_id'] is None

    # now generate an non-unique error and make sure we get an error
    bogus_tfu = {'topic_links_id': 0, 'topics_id': 'nan', 'stories_id': 'nan'}

    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)
def test_multiple_audio_enclosures(self):
    """With several audio enclosures, a supported audio enclosure is picked."""
    story = create_test_story(
        db=self._DB,
        label='multiple audio enclosures',
        feed=self._TEST_FEED,
    )
    fixture = TestStoryAndEnclosure(story=story)

    aac_enclosure = self._DB.insert(table='story_enclosures', insert_hash={
        'stories_id': fixture.stories_id,
        'url': 'http://www.example.com/test.aac',
        'mime_type': 'audio/aac',
        'length': 100000,
    })
    mp3_enclosure = self._DB.insert(table='story_enclosures', insert_hash={
        'stories_id': fixture.stories_id,
        'url': 'http://www.example.com/test.mp3',
        'mime_type': 'audio/mpeg',
        'length': 100000,
    })
    fixture.enclosures.append(aac_enclosure)
    fixture.enclosures.append(mp3_enclosure)

    chosen = podcast_viable_enclosure_for_story(
        db=self._DB,
        stories_id=fixture.stories_id,
    )
    # The MP3 enclosure (second one inserted) is the expected pick.
    assert chosen == StoryEnclosure.from_db_row(mp3_enclosure), (
        "Story with multiple audio enclosures should return a supported audio enclosure."
    )
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()
    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for story_num in range(num_stories):
        topic_story = create_test_story(db, "merge " + str(story_num), feed=feed)
        add_to_topic_stories(db, topic_story, topic)

    # mark the old medium as a duplicate of the new one, then merge
    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})
    merge_dup_media_stories(db, topic)

    merged_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']},
    ).hashes()

    assert len(merged_stories) == num_stories
    # every topic story should now belong to the new medium
    assert all(s['media_id'] == new_medium['media_id'] for s in merged_stories)
def test_get_preferred_story():
    """Test get_preferred_story().

    Walks through the preference order: dup_media_id target, medium without a
    dup_media_id, story url matching the medium domain, then lowest media_id.
    """
    db = connect_to_db()

    num_media = 5
    media = []
    for i in range(num_media):
        medium = create_test_medium(db, "foo " + str(i))
        feed = create_test_feed(db=db, label="foo", medium=medium)
        story = create_test_story(db=db, label="foo", feed=feed)
        medium['story'] = story
        media.append(medium)

    # first prefer medium pointed to by dup_media_id of another story
    preferred_medium = media[1]
    db.query(
        "update media set dup_media_id = %(a)s where media_id = %(b)s",
        {'a': preferred_medium['media_id'], 'b': media[0]['media_id']})

    stories = [m['story'] for m in media]
    assert get_preferred_story(db, stories) == preferred_medium['story']

    # next prefer any medium without a dup_media_id
    preferred_medium = media[num_media - 1]

    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    db.query(
        "update media set dup_media_id = %(a)s where media_id != %(a)s",
        {'a': media[0]['media_id']})
    db.query(
        "update media set dup_media_id = null where media_id = %(a)s",
        {'a': preferred_medium['media_id']})
    # skip media[0] so that the dup_media_id preference doesn't apply
    stories = [m['story'] for m in media[1:]]
    assert get_preferred_story(db, stories) == preferred_medium['story']

    # next prefer the medium whose story url matches the medium domain
    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    # noinspection SqlWithoutWhere
    db.query("update media set url='http://media-'||media_id||'.com'")
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")
    preferred_medium = media[2]
    db.query(
        "update stories set url = 'http://media-'||media_id||'.com' where media_id = %(a)s",
        {'a': preferred_medium['media_id']})

    stories = db.query("select * from stories").hashes()
    preferred_story = db.query(
        "select * from stories where media_id = %(a)s",
        {'a': preferred_medium['media_id']}).hash()

    assert get_preferred_story(db, stories) == preferred_story

    # next prefer lowest media_id
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")
    stories = db.query("select * from stories").hashes()
    assert get_preferred_story(db, stories)['stories_id'] == media[0]['story']['stories_id']
def setUp(self) -> None:
    """Create a test medium, feed, and story, plus a download for the story."""
    super().setUp()
    self.test_medium = create_test_medium(db=self.db(), label='downloads test')
    self.test_feed = create_test_feed(db=self.db(), label='downloads test', medium=self.test_medium)
    # NOTE(review): 'downloads est' looks like a typo for 'downloads test', but
    # it is only a fixture label, so it is preserved verbatim.
    self.test_story = create_test_story(self.db(), label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(
        self.db(),
        feed=self.test_feed,
        story=self.test_story,
    )
def test_merge_dup_media_story():
    """Test merge_dup_media_story()."""
    db = connect_to_db()
    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    old_story = create_test_story(db=db, label='merge old', feed=feed)

    new_medium = create_test_medium(db, 'merge new')

    # mark the old story's medium as a duplicate of the new medium
    db.update_by_id('media', medium['media_id'], {'dup_media_id': new_medium['media_id']})

    cloned_story = merge_dup_media_story(db, topic, old_story)

    # the clone should preserve these fields of the original story
    for field in 'url guid publish_date title'.split():
        assert cloned_story[field] == old_story[field]

    # the clone should have been added to the topic
    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': cloned_story['stories_id'], 'b': topic['topics_id']}).hash()
    assert topic_story is not None

    # merging again should return the existing clone rather than create another
    merged_story = merge_dup_media_story(db, topic, old_story)
    assert merged_story['stories_id'] == cloned_story['stories_id']
def test_only_video_enclosures(self):
    """With only video enclosures present, the first video enclosure wins."""
    story = create_test_story(
        db=self._DB,
        label='only video enclosures',
        feed=self._TEST_FEED,
    )
    fixture = TestStoryAndEnclosure(story=story)

    mkv_enclosure = self._DB.insert(table='story_enclosures', insert_hash={
        'stories_id': fixture.stories_id,
        'url': 'http://www.example.com/test.mkv',
        'mime_type': 'video/x-matroska',
        'length': 100000,
    })
    mp4_enclosure = self._DB.insert(table='story_enclosures', insert_hash={
        'stories_id': fixture.stories_id,
        'url': 'http://www.example.com/test.mp4',
        'mime_type': 'video/mp4',
        'length': 100000,
    })
    fixture.enclosures.append(mkv_enclosure)
    fixture.enclosures.append(mp4_enclosure)

    chosen = podcast_viable_enclosure_for_story(
        db=self._DB,
        stories_id=fixture.stories_id,
    )
    # The first inserted (MKV) enclosure is the expected pick.
    assert chosen == StoryEnclosure.from_db_row(mkv_enclosure), (
        "Story with only video enclosures should return the first video enclosure."
    )
def setUp(self) -> None:
    """Create the shared test medium, feed, and story fixtures."""
    super().setUp()
    self.test_medium = create_test_medium(self.db(), self.TEST_MEDIUM_NAME)
    self.test_feed = create_test_feed(
        self.db(),
        self.TEST_FEED_NAME,
        self.test_medium,
    )
    self.test_story = create_test_story(
        self.db(),
        label=self.TEST_STORY_NAME,
        feed=self.test_feed,
    )
def test_audio_and_video_enclosures(self):
    # Story carries both a video (index 0) and an audio (index 1) enclosure;
    # the audio enclosure is the expected pick.
    audio_and_video_enclosures = TestStoryAndEnclosure(
        story=create_test_story(
            db=self._DB,
            label='audio and video enclosures',
            feed=self._TEST_FEED,
        ))
    audio_and_video_enclosures.enclosures.extend([
        self._DB.insert(table='story_enclosures', insert_hash={
            'stories_id': audio_and_video_enclosures.stories_id,
            'url': 'http://www.example.com/test.mkv',
            'mime_type': 'video/x-matroska',
            'length': 100000,
        }),
        self._DB.insert(table='story_enclosures', insert_hash={
            'stories_id': audio_and_video_enclosures.stories_id,
            'url': 'http://www.example.com/test.aac',
            'mime_type': 'audio/aac',
            'length': 100000,
        }),
    ])
    assert podcast_viable_enclosure_for_story(
        db=self._DB,
        stories_id=audio_and_video_enclosures.stories_id,
    ) == StoryEnclosure.from_db_row(
        audio_and_video_enclosures.enclosures[1]
    ), (
        "Story with audio and video enclosures should return an audio enclosure."
    )
def test_copy_story_to_new_medium_with_download_error():
    """Test copy_story_to_new_medium with an associated download error."""
    db = connect_to_db()
    topic = create_test_topic(db, 'copy foo')
    new_medium = create_test_medium(db, 'copy new')
    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)

    # force the original story's download into the 'error' state
    db.query(
        "update downloads set state = 'error' where stories_id = %(a)s",
        {'a': old_story['stories_id']})

    add_to_topic_stories(db, old_story, topic)
    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    # the copied story's download should carry over the 'error' state
    new_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': new_story['stories_id']}).hash()
    assert new_download is not None
    assert new_download['state'] == 'error'
def test_single_mp3_without_mime_enclosure(self):
    # Single MP3 enclosure with an empty MIME type; presumably viability is
    # inferred from the '.mp3' URL extension — confirm against
    # podcast_viable_enclosure_for_story().
    single_mp3_without_mime_enclosure = TestStoryAndEnclosure(
        story=create_test_story(
            db=self._DB,
            label='single MP3 enclosure without MIME type set',
            feed=self._TEST_FEED,
        ))
    single_mp3_without_mime_enclosure.enclosures.append(
        self._DB.insert(table='story_enclosures', insert_hash={
            'stories_id': single_mp3_without_mime_enclosure.stories_id,
            'url': 'http://www.example.com/test.mp3',
            'mime_type': '',
            'length': 100000,
        }))
    assert podcast_viable_enclosure_for_story(
        db=self._DB,
        stories_id=single_mp3_without_mime_enclosure.stories_id,
    ) == StoryEnclosure.from_db_row(
        single_mp3_without_mime_enclosure.enclosures[0]
    ), (
        "Story with a single MP3 enclosure without MIME type set should return that enclosure."
    )
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    self.config = mediawords.util.config.get_config()

    self.test_medium = create_test_medium(self.db(), 'downloads test')
    self.test_feed = create_test_feed(self.db(), 'downloads test', self.test_medium)
    self.test_story = create_test_story(self.db(), label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(self.db(), feed=self.test_feed, story=self.test_story)

    # mark the download as successfully stored in PostgreSQL and tie it to the story
    self.test_download['path'] = 'postgresql:foo'
    self.test_download['state'] = 'success'
    self.test_download['stories_id'] = self.test_story['stories_id']
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)

    mediawords.dbi.downloads.store_content(self.db(), self.test_download, self.__TEST_CONTENT)

    # keep a pristine copy of the config — presumably so tests can restore it;
    # confirm against tearDown()
    self.save_config = copy.deepcopy(self.config)
def test_get_dup_story_groups():
    """Test _get_dup_story_groups().

    Creates 9 stories whose titles form 3 case-insensitive duplicate groups
    ('TITLE n' / 'title n') and checks that the grouping matches.
    """
    db = connect_to_db()
    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            # BUG FIX: the original used the key 'Title' here; the stories
            # table has a lowercase 'title' column, so the capitalized key
            # targeted a non-existent column.
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    # every story in a group shares the same title, ignoring case
    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
def test_add_missing_normalized_title_hashes():
    """Test _add_missing_normalized_title_hashes()."""
    db = connect_to_db()
    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    # clear every story's normalized_title_hash
    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories
        SET normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    # re-enable the trigger before exercising the code under test
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    # all stories should now be missing their hashes...
    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    # ...and all should be backfilled afterwards
    assert __count_null_title_stories(db=db, topic=topic) == 0
def test_add_tweet_story():
    """Test _add_tweet_story()."""
    db = connect_to_db()
    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {'topics_id': topics_id, 'stories_id': source_story['stories_id']})

    topic_link = {'topics_id': topics_id, 'url': 'u', 'stories_id': source_story['stories_id']}
    topic_link = db.create('topic_links', topic_link)

    tfu = {'topics_id': topics_id, 'url': 'u', 'state': 'pending', 'topic_links_id': topic_link['topic_links_id']}
    tfu = db.create('topic_fetch_urls', tfu)

    # minimal tweet payload with a direct, a retweeted, and a quoted entity url
    tweet = {
        'id': 123,
        'text': 'add tweet story tweet text',
        'user': {'screen_name': 'tweet screen name'},
        'created_at': 'Mon Dec 13 23:21:48 +0000 2010',
        'entities': {'urls': [{'expanded_url': 'http://direct.entity'}]},
        'retweeted_status': {'entities': {'urls': [{'expanded_url': 'http://retweeted.entity'}]}},
        'quoted_status': {'entities': {'urls': [{'expanded_url': 'http://quoted.entity'}]}}
    }

    story = _add_tweet_story(db, topic, tweet, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    # story fields should be derived from the tweet
    assert got_story['title'] == "%s: %s" % (tweet['user']['screen_name'], tweet['text'])
    assert got_story['publish_date'][0:10] == '2010-12-13'
    assert got_story['url'] == 'https://twitter.com/%s/status/%s' % (tweet['user']['screen_name'], tweet['id'])
    assert got_story['guid'] == story['url']

    # the pending topic link should now point at the new story
    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    # the tweet text becomes the story content
    assert get_content_for_first_download(db, story) == tweet['text']

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']}).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    # every url mentioned by the tweet (direct, retweeted, quoted) should
    # produce a topic link
    # noinspection PyTypeChecker
    for url in [tweet['entities']['urls'][0]['expanded_url'],
                tweet['retweeted_status']['entities']['urls'][0]['expanded_url'],
                tweet['quoted_status']['entities']['urls'][0]['expanded_url']]:
        got_topic_link = db.query(
            "select * from topic_links where topics_id = %(a)s and url = %(b)s",
            {'a': topic['topics_id'], 'b': url}).hash()
        assert got_topic_link is not None
def setUp(self):
    """Create a story with an enclosure, a podcast episode, and a queued transcript fetch."""
    self.db = connect_to_db()

    self.test_medium = create_test_medium(db=self.db, label='test')
    self.test_feed = create_test_feed(db=self.db, label='test', medium=self.test_medium)
    self.story = create_test_story(db=self.db, label='test', feed=self.test_feed)

    stories_id = self.story['stories_id']

    enclosure = self.db.insert(
        table='story_enclosures',
        insert_hash={
            'stories_id': stories_id,
            # URL doesn't really matter as we won't be fetching it
            'url': 'http://example.com/',
            'mime_type': 'audio/mpeg',
            'length': 100000,
        })

    episode = self.db.insert(table='podcast_episodes', insert_hash={
        'stories_id': stories_id,
        'story_enclosures_id': enclosure['story_enclosures_id'],
        'gcs_uri': 'gs://whatever',
        'duration': 1,
        'codec': 'MP3',
        'sample_rate': 44100,
        'bcp47_language_code': 'en-US',
        'speech_operation_id': 'foo',
    })

    # queue a transcript fetch for the episode, due immediately
    self.db.query(
        """
            INSERT INTO podcast_episode_transcript_fetches (
                podcast_episodes_id,
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW()
            )
        """,
        {
            'podcast_episodes_id': episode['podcast_episodes_id'],
        })
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()
    topic = create_test_topic(db, 'copy foo')
    new_medium = create_test_medium(db, 'copy new')
    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)
    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    # core story fields should carry over to the copy
    for field in 'title url guid publish_date'.split():
        assert old_story[field] == new_story[field]

    # the copy should be added to the topic...
    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }).hash()
    assert topic_story_exists is not None

    # ...with its own download whose content is fetchable...
    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    # ...and its own copy of the story sentences
    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hashes()
    assert len(story_sentences) > 0
def setUp(self):
    """Create stories with sentences, a topic containing them all, and a snapshot."""
    super().setUp()

    self.db = connect_to_db()

    medium = create_test_medium(db=self.db, label='test')
    feed = create_test_feed(db=self.db, label='feed', medium=medium)

    for story_num in range(self.TEST_STORY_COUNT):
        story = create_test_story(db=self.db, label='story-%d' % story_num, feed=feed)
        for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
            self.db.create(table='story_sentences', insert_hash={
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'publish_date': story['publish_date'],
                'sentence_number': sentence_number,
                'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
            })

    # Test topic
    topic = create_test_topic(db=self.db, label='test')
    self.topics_id = topic['topics_id']

    # add every story to the topic
    self.db.query(
        """
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id
            FROM stories
        """,
        {'topics_id': self.topics_id})

    # Test snapshot
    self.snapshots_id = self.db.query(
        """
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """,
        {
            'topics_id': self.topics_id
        }).flat()[0]

    # snapshot every story
    self.db.query(
        """
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date
            FROM stories
        """,
        {'snapshots_id': self.snapshots_id})
def setUp(self):
    """Add AP medium and some content so that we can find dup sentences."""
    super().setUp()
    medium = create_test_medium(db=self.db(), label=get_ap_medium_name())
    ap_feed = create_test_feed(db=self.db(), label='feed', medium=medium)
    ap_story = create_test_story(db=self.db(), label='story', feed=ap_feed)

    # Fill the story body with the known AP sentences.
    ap_story['content'] = "\n".join(self.__get_ap_sentences())
    add_content_to_test_story(db=self.db(), story=ap_story, feed=ap_feed)
def test_no_enclosures(self):
    """A story without any enclosures should yield no viable enclosure."""
    story = create_test_story(
        db=self._DB,
        label='no enclosures',
        feed=self._TEST_FEED,
    )
    fixture = TestStoryAndEnclosure(story=story)

    enclosure = podcast_viable_enclosure_for_story(
        db=self._DB,
        stories_id=fixture.stories_id,
    )
    assert enclosure is None, "Story with no enclosures."
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()
    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        # story i gets i sentences, so the last story has the most sentences
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                    INSERT INTO story_sentences (
                        stories_id,
                        sentence_number,
                        sentence,
                        media_id,
                        publish_date
                    )
                        SELECT
                            stories_id,
                            %(sentence_number)s AS sentence_number,
                            'foo bar' AS sentence,
                            media_id,
                            publish_date
                        FROM stories
                        WHERE stories_id = %(stories_id)s
                """,
                {
                    'stories_id': story['stories_id'],
                    'sentence_number': j,
                })

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]

    # after merging, only the story with the most sentences (the last one)
    # should remain in the topic
    merged_stories = db.query(
        """
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
              AND stories_id = ANY(%(stories_ids)s)
        """,
        {
            'topics_id': topic['topics_id'],
            'stories_ids': stories_ids,
        }).flat()

    assert merged_stories == [stories_ids[-1]]
def setUp(self) -> None:
    """Create fixtures and point a successful feed download at the test story."""
    super().setUp()
    self.test_medium = create_test_medium(db=self.db(), label='downloads test')
    self.test_feed = create_test_feed(db=self.db(), label='downloads test', medium=self.test_medium)
    self.test_download = create_download_for_feed(self.db(), self.test_feed)
    self.test_story = create_test_story(self.db(), label='downloads est', feed=self.test_feed)

    # Mark the download as successfully stored and attach it to the story.
    self.test_download.update({
        'path': 'postgresql:foo',
        'state': 'success',
        'stories_id': self.test_story['stories_id'],
    })
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)
def __is_syndicated(db: DatabaseHandler, content: str) -> bool:
    """Build a throwaway medium/feed/story around *content* and run is_syndicated() on it."""
    # Use (at most) the first 64 characters of the content as the fixture label.
    test_label = content[:64]
    test_medium = create_test_medium(db=db, label=test_label)
    test_feed = create_test_feed(db=db, label=test_label, medium=test_medium)
    test_story = create_test_story(db=db, label=test_label, feed=test_feed)

    test_story['content'] = content
    test_story = add_content_to_test_story(db=db, story=test_story, feed=test_feed)

    return is_syndicated(db=db, story_title=test_story['title'], story_text=content)
def test_update_extractor_version_tag(self):
    """update_extractor_version_tag() should attach exactly one extractor version tag."""
    medium = create_test_medium(db=self.db(), label='test medium')
    feed = create_test_feed(db=self.db(), label='test feed', medium=medium)
    story = create_test_story(db=self.db(), label='test story', feed=feed)

    # No extractor tags before the update...
    tags_before = self.__story_extractor_tags(stories_id=story['stories_id'])
    assert len(tags_before) == 0

    update_extractor_version_tag(db=self.db(), story=story)

    # ...and exactly one afterwards.
    tags_after = self.__story_extractor_tags(stories_id=story['stories_id'])
    assert len(tags_after) == 1
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()
    self.test_medium = create_test_medium(self.db(), self.TEST_MEDIUM_NAME)
    self.test_feed = create_test_feed(self.db(), self.TEST_FEED_NAME, self.test_medium)
    self.test_download = create_download_for_feed(self.db(), self.test_feed)
    self.test_story = create_test_story(
        self.db(),
        label=self.TEST_STORY_NAME,
        feed=self.test_feed,
    )

    # Mark the download as successfully stored and attach it to the story.
    self.test_download.update({
        'path': 'postgresql:foo',
        'state': 'success',
        'stories_id': self.test_story['stories_id'],
    })
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)
def __is_syndicated(self, content: str) -> bool:
    """Build a throwaway medium/feed/story around *content* and run is_syndicated() on it."""
    # Use (at most) the first 64 characters of the content as the fixture label.
    test_label = content[:64]
    test_medium = create_test_medium(db=self.db(), label=test_label)
    test_feed = create_test_feed(db=self.db(), label=test_label, medium=test_medium)
    test_story = create_test_story(db=self.db(), label=test_label, feed=test_feed)

    test_story['content'] = content
    test_story = add_content_to_test_story(db=self.db(), story=test_story, feed=test_feed)

    return is_syndicated(db=self.db(), story_title=test_story['title'], story_text=content)
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()
    self.db = connect_to_db()

    db = self.db
    self.test_medium = create_test_medium(db, 'downloads test')
    self.test_feed = create_test_feed(db, 'downloads test', medium=self.test_medium)
    self.test_download_feed = create_download_for_feed(db, self.test_feed)
    self.test_story = create_test_story(db, label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(db, feed=self.test_feed, story=self.test_story)

    # Store known content for the download so tests can read it back.
    store_content(db=db, download=self.test_download, content=self.__TEST_CONTENT)
def test_mark_as_processed():
    """mark_as_processed() should add exactly one row to processed_stories."""
    db = connect_to_db()
    medium = create_test_medium(db=db, label=TEST_MEDIUM_NAME)
    feed = create_test_feed(db=db, label=TEST_FEED_NAME, medium=medium)
    story = create_test_story(db=db, label=TEST_STORY_NAME, feed=feed)

    # Nothing is processed yet.
    rows = db.query("SELECT * FROM processed_stories").hashes()
    assert len(rows) == 0

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Exactly one row, pointing at our story.
    rows = db.query("SELECT * FROM processed_stories").hashes()
    assert len(rows) == 1
    assert rows[0]['stories_id'] == story['stories_id']
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()
    self.db = connect_to_db()

    db = self.db
    self.test_medium = create_test_medium(db, self.TEST_MEDIUM_NAME)
    self.test_feed = create_test_feed(db, self.TEST_FEED_NAME, self.test_medium)
    self.test_story = create_test_story(db, label=self.TEST_STORY_NAME, feed=self.test_feed)
    self.test_download = create_download_for_story(db, feed=self.test_feed, story=self.test_story)
def setUp(self) -> None:
    """Create a story, enclosure, podcast episode, and queued transcript fetch."""
    super().setUp()

    self.db = connect_to_db()

    test_medium = create_test_medium(db=self.db, label='test')
    test_feed = create_test_feed(db=self.db, label='test', medium=test_medium)
    test_story = create_test_story(db=self.db, feed=test_feed, label='test')

    self.enclosure = self.db.insert(table='story_enclosures', insert_hash={
        'stories_id': test_story['stories_id'],
        'url': 'foo',
        'mime_type': 'foo',
        'length': 3,
    })

    # episode references the mocked speech operation id used by this test case
    self.episode = self.db.insert(
        table='podcast_episodes',
        insert_hash={
            'stories_id': test_story['stories_id'],
            'story_enclosures_id': self.enclosure['story_enclosures_id'],
            'gcs_uri': 'gs://test',
            'duration': 3,
            'codec': 'FLAC',
            'sample_rate': 44100,
            'bcp47_language_code': 'en-US',
            'speech_operation_id': self.MOCK_SPEECH_OPERATION_ID,
        })

    # queue a transcript fetch for the episode, due immediately
    self.transcript_fetch = self.db.query(
        """
            INSERT INTO podcast_episode_transcript_fetches (podcast_episodes_id, add_to_queue_at)
            VALUES (%(podcast_episodes_id)s, NOW())
            RETURNING *
        """,
        {
            'podcast_episodes_id': self.episode['podcast_episodes_id'],
        }).hash()

    self.podcast_episode_transcript_fetches_id = self.transcript_fetch[
        'podcast_episode_transcript_fetches_id']
def test_find_and_merge_dup_stories():
    """Test find_and_merge_dup_stories().

    Creates 9 stories whose titles form 3 case-insensitive duplicate groups
    and checks that merging collapses the topic down to 3 stories / 3 distinct
    normalized title hashes.
    """
    db = connect_to_db()
    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            # BUG FIX: the original used the key 'Title' here; the stories
            # table has a lowercase 'title' column, so the capitalized key
            # targeted a non-existent column.
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    # each group of 3 duplicates should collapse into a single topic story
    num_topic_stories = db.query(
        """
            SELECT COUNT(*)
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
        """,
        {
            'topics_id': topic['topics_id'],
        }).flat()[0]

    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        """
            SELECT COUNT(DISTINCT normalized_title_hash)
            FROM snap.live_stories
            WHERE topics_id = %(topics_id)s
        """,
        {
            'topics_id': topic['topics_id'],
        }).flat()[0]

    assert num_distinct_titles == 3
def setUp(self):
    """Create stories with sentences, a topic containing them all, and a snapshot."""
    super().setUp()

    medium = create_test_medium(db=self.db(), label='test')
    feed = create_test_feed(db=self.db(), label='feed', medium=medium)

    for story_num in range(self.TEST_STORY_COUNT):
        story = create_test_story(db=self.db(), label='story-%d' % story_num, feed=feed)
        for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
            self.db().create(table='story_sentences', insert_hash={
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'publish_date': story['publish_date'],
                'sentence_number': sentence_number,
                'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
            })

    # Test topic
    topic = create_test_topic(db=self.db(), label='test')
    self.topics_id = topic['topics_id']

    # add every story to the topic
    self.db().query("""
        INSERT INTO topic_stories (topics_id, stories_id)
        SELECT %(topics_id)s, stories_id
        FROM stories
    """, {'topics_id': self.topics_id})

    # Test snapshot
    self.snapshots_id = self.db().query("""
        INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
        VALUES (%(topics_id)s, NOW(), NOW(), NOW())
        RETURNING snapshots_id
    """, {'topics_id': self.topics_id}).flat()[0]

    # snapshot every story
    self.db().query("""
        INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
        SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date
        FROM stories
    """, {'snapshots_id': self.snapshots_id})
def test_insert_story_sentences(self):
    """Test _insert_story_sentences(): in-story dedup, language detection, is_dup marking."""
    sentences = [
        # Single quotes
        "It's toasted!",

        # Duplicate sentence within story
        "It's toasted!",

        # Non-English language
        'Įlinkdama fechtuotojo špaga sublykčiojusi pragręžė apvalų arbūzą.',
    ]

    inserted_sentences = _insert_story_sentences(
        db=self.db(),
        story=self.test_story,
        sentences=sentences,
    )

    assert len(inserted_sentences) == 2  # Minus the duplicate sentence
    assert inserted_sentences[0] == sentences[0]
    assert inserted_sentences[1] == sentences[2]

    db_sentences = self.db().query("""
        SELECT *
        FROM story_sentences
        ORDER BY sentence_number
    """).hashes()
    assert len(db_sentences) == 2

    # first stored sentence: English, not (yet) a duplicate
    assert db_sentences[0]['media_id'] == self.test_medium['media_id']
    assert db_sentences[0]['stories_id'] == self.test_story['stories_id']
    assert db_sentences[0]['sentence_number'] == 0
    assert db_sentences[0]['sentence'] == sentences[0]
    assert db_sentences[0]['publish_date'] == self.test_story['publish_date']
    assert db_sentences[0]['language'] == 'en'
    assert db_sentences[0]['is_dup'] is None

    # second stored sentence: detected as Lithuanian
    assert db_sentences[1]['media_id'] == self.test_medium['media_id']
    assert db_sentences[1]['stories_id'] == self.test_story['stories_id']
    assert db_sentences[1]['sentence_number'] == 1
    assert db_sentences[1]['sentence'] == sentences[2]
    assert db_sentences[1]['publish_date'] == self.test_story['publish_date']
    assert db_sentences[1]['language'] == 'lt'
    assert db_sentences[1]['is_dup'] is None

    test_story_2 = create_test_story(self.db(), label='test story 1', feed=self.test_feed)

    # Try inserting same sentences again, see if is_dup gets set
    inserted_sentences = _insert_story_sentences(
        db=self.db(),
        story=test_story_2,
        sentences=sentences,
    )
    assert len(inserted_sentences) == 0

    db_sentences = self.db().query("""
        SELECT *
        FROM story_sentences
        ORDER BY sentence_number
    """).hashes()
    assert len(db_sentences) == 2
    assert db_sentences[0]['is_dup'] is True
    assert db_sentences[1]['is_dup'] is True

    # Make sure no_dedup_sentences works
    inserted_sentences = _insert_story_sentences(
        db=self.db(),
        story=test_story_2,
        sentences=sentences,
        no_dedup_sentences=True,
    )
    assert len(inserted_sentences) == len(sentences)

    db_sentences = self.db().query("""
        SELECT *
        FROM story_sentences
        ORDER BY stories_id, sentence_number
    """).hashes()

    # Two sentences with no_dedup_sentences=False, plus three sentences with no_dedup_sentences=True
    assert len(db_sentences) == 5