def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int:
    """Save a transcript's text as a raw download for its story.

    A new download row is created for the story's feed and the transcript text
    is stored as that download's raw content; the "extract-and-vector" app
    extracts the stored text later.

    Returns the downloads_id of the stored download.
    """
    story = db.find_by_id(table='stories', object_id=transcript.stories_id)

    # Feed that the story belongs to (resolved via feeds_stories_map); it
    # supplies the feeds_id for the new download.
    story_feed = db.query(
        """
        SELECT *
        FROM feeds
        WHERE feeds_id = (
            SELECT feeds_id
            FROM feeds_stories_map
            WHERE stories_id = %(stories_id)s
        )
        """,
        {
            'stories_id': transcript.stories_id,
        },
    ).hash()

    transcript_text = cls._download_text_from_transcript(transcript=transcript)

    new_download = create_download_for_new_story(db=db, story=story, feed=story_feed)

    # Store as a raw download and then let "extract-and-vector" app "extract"
    # the stored text later.
    store_content(db=db, download=new_download, content=transcript_text)

    return new_download['downloads_id']
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Insert a story returned by get_new_stories() into the database."""
    # The dedicated AP medium must already exist.
    medium = db.query(
        """
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
        """,
        {
            'medium_name': AP_MEDIUM_NAME,
        },
    ).hash()

    feed = db.find_or_create('feeds', {
        'media_id': medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com',
    })

    story = add_story(
        db,
        {
            'guid': ap_story['guid'],
            'url': ap_story['url'],
            'publish_date': ap_story['publish_date'],
            'title': ap_story['title'],
            'description': ap_story['description'],
            'media_id': medium['media_id'],
        },
        feed['feeds_id'],
    )
    if not story:
        # add_story() returned nothing (e.g. the story was not inserted);
        # nothing further to do.
        return

    download = create_download_for_new_story(db, story, feed)

    story_text = ap_story['text']
    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
        """,
        {
            'downloads_id': download['downloads_id'],
            'download_text': story_text,
            'download_text_length': len(story_text),
        },
    )

    # Send to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
def test_create_download_for_new_story():
    """Test create_download_for_new_story()."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db=db, label='foo', medium=medium)
    story = create_test_story(db=db, label='foo', feed=feed)

    download = create_download_for_new_story(db, story, feed)
    assert download is not None

    stored_row = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': story['stories_id']},
    ).hash()
    assert stored_row is not None

    # The stored row should mirror what the helper returned.
    assert stored_row['downloads_id'] == download['downloads_id']
    assert stored_row['feeds_id'] == feed['feeds_id']
    assert stored_row['url'] == story['url']

    # New-story downloads are created as already-fetched content that has not
    # yet been extracted.
    assert stored_row['state'] == 'success'
    assert stored_row['type'] == 'content'
    assert not stored_row['extracted']
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the
    download, extracted text, and so on.

    Return the new story.
    """
    # New story row: same content metadata as the old story, new media_id.
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }
    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy over the old story's tags.
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id
            from stories_tags_map stm
            where stm.stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # Oldest download of the original story (if any) supplies the raw content.
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # Content could not be fetched/stored; mirror the old download's
            # state onto the new download instead of failing the whole copy.
            download_update = {f: old_download[f] for f in ('state', 'error_message', 'download_time')}
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the WHERE filter use
        # the NEW download's id, so this copies download_texts keyed on the new
        # download rather than the old one — confirm this is intended.
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                from download_texts dt
                where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    # Copy the sentences so the new story does not need to be re-extracted.
    # Parameterized instead of f-string interpolation so no value is ever spliced
    # into the SQL text (consistent with the rest of this function's queries).
    db.query(
        """
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select %(a)s, sentence_number, sentence, media_id, publish_date, language
            from story_sentences
            where stories_id = %(b)s
        """,
        {'a': int(story['stories_id']), 'b': old_story['stories_id']})

    return story
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: Optional[str] = None,
                   publish_date: Optional[str] = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from
    the url and content.

    If inserting the story results in a unique constraint error based on
    media_id and url, return the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; parsed from the content's HTML when not given
    publish_date - story publish date; guessed from url/content when not given
    fallback_date - fallback to this date if the date guesser fails to find a date

    Returns the story dict (possibly a pre-existing story).
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    # Truncate overlong URLs rather than failing the insert.
    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        # Guess the date; fall back to the caller's fallback_date, and finally
        # to "now" so publish_date is never left NULL.
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    # Mark the story as spidered; the NOT EXISTS guard makes this a no-op if
    # the tag is already attached.
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """,
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    # Only store content and extract for stories that add_story() actually
    # inserted (as opposed to returning a pre-existing story).
    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
def test_create():
    db = connect_to_db()

    medium = create_test_medium(db, 'downloads test')
    feed = create_test_feed(db, 'downloads test', medium)
    story = create_test_story(db=db, feed=feed, label='test_story')

    download = create_download_for_new_story(db=db, story=story, feed=feed)
    download['path'] = 'postgresql:foo'
    download['state'] = 'success'
    db.update_by_id('downloads', download['downloads_id'], download)

    downloads_id = download['downloads_id']

    # Before create(): no extracted text rows, download not marked extracted.
    assert len(db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """,
        {'downloads_id': downloads_id},
    ).hashes()) == 0
    assert len(db.query(
        """
        SELECT *
        FROM downloads
        WHERE downloads_id = %(downloads_id)s
          AND extracted = 't'
        """,
        {'downloads_id': downloads_id},
    ).hashes()) == 0

    extract = {
        'extracted_text': 'Hello!',
    }

    download_text = create(db=db, download=download, extract=extract)
    assert download_text
    assert download_text['downloads_id'] == downloads_id

    # Exactly one download_texts row should now exist, mirroring the extract.
    text_rows = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """,
        {'downloads_id': downloads_id},
    ).hashes()
    assert len(text_rows) == 1

    text_row = text_rows[0]
    assert text_row
    assert text_row['downloads_id'] == downloads_id
    assert text_row['download_text'] == extract['extracted_text']
    assert text_row['download_text_length'] == len(extract['extracted_text'])
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: Optional[str] = None,
                   publish_date: Optional[str] = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from
    the url and content.

    If inserting the story results in a unique constraint error based on
    media_id and url, return the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; parsed from the content's HTML when not given
    publish_date - story publish date; guessed from url/content when not given
    fallback_date - fallback to this date if the date guesser fails to find a date

    Returns the story dict (possibly a pre-existing story).
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    # Truncate overlong URLs rather than failing the insert.
    url = url[0:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {medium}")

    log.debug(f"Getting spider feed for medium {medium}...")
    feed = get_spider_feed(db, medium)
    log.debug(f"Done getting spider feed for medium {medium}: {feed}")

    log.debug(f"Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug(f"Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        # Guess the date; unlike fallback handling elsewhere, a failed guess
        # leaves publish_date as None here (fallback_date is only consulted by
        # assign_date_guess_tag() below).
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")
        story['publish_date'] = date_guess.date if date_guess.found else None
    else:
        story['publish_date'] = publish_date

    log.debug(f"Adding story {story}...")
    story = add_story(db, story, feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    # Mark the story as spidered; the NOT EXISTS guard makes this a no-op if
    # the tag is already attached.
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """,
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        log.debug(f"Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    # Only store content and extract for stories that add_story() actually
    # inserted (as opposed to returning a pre-existing story).
    if story.get('is_new', False):
        log.debug("Story is new, creating download...")
        download = create_download_for_new_story(db, story, feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")
    else:
        log.debug("Story is not new, skipping download storage and extraction")

    log.debug(f"Done generating story from URL {url}")

    return story
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the
    download, extracted text, and so on.

    Return the new story.
    """
    # New story row: same content metadata as the old story, new media_id.
    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy the old story's tags one by one, skipping any that already exist.
    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # Oldest download of the original story (if any) supplies the raw content.
    old_download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # Content could not be fetched/stored; mirror the old download's
            # state onto the new download instead of failing the whole copy.
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the WHERE filter use
        # the NEW download's id, so this copies download_texts keyed on the new
        # download rather than the old one — confirm this is intended.
        db.query(
            """
            INSERT INTO download_texts (
                downloads_id,
                download_text,
                download_text_length
            )
                SELECT
                    %(downloads_id)s,
                    dt.download_text,
                    dt.download_text_length
                FROM download_texts AS dt
                WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # Copy the sentences so the new story does not need to be re-extracted.
    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO story_sentences (
            stories_id,
            sentence_number,
            sentence,
            media_id,
            publish_date,
            language
        )
            SELECT
                %(new_stories_id)s,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            FROM story_sentences
            WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story