def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Attach 'download' and 'content' fields to a story in the test story stack.

    Stores the content in the download store. Uses story['content'] when the
    caller supplied it, otherwise generates filler text via _get_test_content().

    Raises McAddContentToTestStoryException if no download_text row exists
    after extraction.
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    # Prefer caller-supplied content; fall back to generated test text.
    content = story['content'] if 'content' in story else _get_test_content()

    # Force full_text_rss off so the extractor actually runs on the download.
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': get_url_host(feed['url']),
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        },
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """,
        {'downloads_id': download['downloads_id']},
    ).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Add a 'download' and a 'content' field to a test story.

    The content is persisted to the download store. If the story already
    carries a 'content' key that text is used; otherwise test content is
    generated with _get_test_content().

    Raises McAddContentToTestStoryException when the extracted download_text
    row cannot be found afterwards.
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    # Clear full_text_rss (both in memory and in the DB) so that extraction
    # operates on the stored download rather than the RSS body.
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    feed_host = get_url_host(feed['url'])

    new_download = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': feed_host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': False,
        'stories_id': story['stories_id'],
    }
    download = db.create(table='downloads', insert_hash=new_download)
    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    download_text = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """,
        {'downloads_id': download['downloads_id']},
    ).hash()
    story['download_text'] = download_text

    if not download_text:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def create_test_topic_stories(
        db: DatabaseHandler,
        topic: dict,
        num_media: int = 10,
        num_stories_per_medium: int = 10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium.

    Creates num_media test media, one feed per medium, and
    num_stories_per_medium stories per feed; every story is dated to the
    topic's start_date and added to topic_stories. Finally seeds topic_links
    with cross-media story pairs, capped at num_media * num_stories_per_medium.
    """
    for medium_index in range(num_media):
        test_medium = create_test_medium(db, f'medium {medium_index}')
        test_feed = create_test_feed(db, f'feed {medium_index}', test_medium)

        for story_index in range(num_stories_per_medium):
            test_story = create_test_story(db, f'story {medium_index} {story_index}', test_feed)

            # Pin the publish date inside the topic's date range.
            db.update_by_id(
                'stories',
                test_story['stories_id'],
                {'publish_date': topic['start_date']},
            )
            db.create('topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': test_story['stories_id'],
            })

    # One bulk insert of cross-media links: any pair of stories from
    # different media qualifies, limited to the total story count.
    db.query(
        """
        insert into topic_links (topics_id, stories_id, url, ref_stories_id)
            select %(a)s, a.stories_id, b.url, b.stories_id
                from stories a
                    join stories b on (a.media_id <> b.media_id)
                limit %(b)s
        """,
        {'a': topic['topics_id'], 'b': num_media * num_stories_per_medium},
    )
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story['content'] field
    if present (detecting its language), or otherwise generates the content
    using _get_test_content() and defaults the language to 'en'.

    Also writes a download_texts row with the HTML-stripped content, splits it
    into sentences stored in story_sentences, and marks the story as processed.

    Raises McAddContentToTestStoryException if no language object exists for
    the detected language code, or if the download_text row cannot be found
    afterwards.
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        # NOTE(review): 'language' is only persisted when full_text_rss was
        # set — preserved as-is; confirm whether it should always be written.
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    # 'extracted' is True here because this helper writes download_texts
    # itself rather than running the extractor.
    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        """,
        {
            'downloads_id': download['downloads_id'],
            'download_text': extracted_content,
        })

    lang = LanguageFactory.language_for_code(content_language_code)
    # Explicit raise instead of assert: asserts are stripped under "python -O",
    # and the module already defines a dedicated exception type.
    if not lang:
        raise McAddContentToTestStoryException(
            f"Language is None for code {content_language_code}"
        )

    sentences = lang.split_text_to_sentences(extracted_content)

    # enumerate() replaces the manual sentence_number counter.
    for sentence_number, sentence in enumerate(sentences, start=1):
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })

    mark_as_processed(db=db, stories_id=story['stories_id'])

    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """,
        {
            'downloads_id': download['downloads_id']
        }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def create_test_topic_stories(db: DatabaseHandler,
                              topic: dict,
                              num_media: int = 10,
                              num_stories_per_medium: int = 10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium.

    Creates num_media test media, one feed per medium, and
    num_stories_per_medium stories per feed; each story is dated to the
    topic's start_date and registered in topic_stories. Then seeds
    topic_links with cross-media story pairs, one row at a time.
    """
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)

        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            db.update_by_id('stories', story['stories_id'], {'publish_date': topic['start_date']})
            db.create('topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id']
            })

    # BUG FIX: the 'topic_link_count' parameter was bound but never referenced
    # in the SQL — the limit was hard-coded to 50, silently truncating the
    # intended num_media * num_stories_per_medium links (100 by default).
    test_topic_links = db.query(
        """
        WITH src_stories AS (
            SELECT *
            FROM stories
        ),

        test_topic_links AS (
            SELECT
                src_stories.stories_id,
                ref_stories.url,
                ref_stories.stories_id AS ref_stories_id
            FROM src_stories
                INNER JOIN stories AS ref_stories
                    ON src_stories.media_id != ref_stories.media_id
            LIMIT %(topic_link_count)s
        )

        SELECT
            stories_id,
            url,
            ref_stories_id
        FROM test_topic_links
        """, {
            'topic_link_count': num_media * num_stories_per_medium,
        }).hashes()

    # Inserting individually because otherwise it complains with:
    # ERROR: cannot handle complex subqueries when the router executor is disabled
    for test_topic_link in test_topic_links:
        db.query(
            """
            INSERT INTO topic_links (
                topics_id,
                stories_id,
                url,
                ref_stories_id
            ) VALUES (
                %(topics_id)s,
                %(stories_id)s,
                %(url)s,
                %(ref_stories_id)s
            )
            """, {
                'topics_id': topic['topics_id'],
                'stories_id': test_topic_link['stories_id'],
                'url': test_topic_link['url'],
                'ref_stories_id': test_topic_link['ref_stories_id'],
            })