def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api.

    Builds a story from the tweet's author and text, adds it to the topic,
    records the tweet against each given topic_fetch_url, and creates a
    topic_link for every URL mentioned in the tweet. Returns the new story.
    """
    author = tweet['user']['screen_name']
    text = tweet['text']

    story = mediawords.tm.stories.generate_story(
        db=db,
        url='https://twitter.com/%s/status/%s' % (author, tweet['id']),
        content=text,
        title="%s: %s" % (author, text),
        publish_date=tweet['created_at'])
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # Mark each fetch URL as satisfied by this story.
    for tfu in topic_fetch_urls:
        tfu = _log_tweet_added(db, tfu, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, tfu)

    # Turn every URL mentioned in the tweet into a topic link.
    for linked_url in mediawords.util.twitter.get_tweet_urls(tweet):
        self_linked = mediawords.tm.domains.skip_self_linked_domain_url(
            db, topic['topics_id'], story['url'], linked_url)
        if self_linked:
            log.info("skipping self linked domain url...")
        else:
            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': linked_url,
            }
            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""
    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    # Deterministic fixture values derived from the label.
    story_row = {
        'media_id': int(feed['media_id']),
        'url': "http://story.test/%s" % label,
        'guid': "guid://story.test/%s" % label,
        'title': "story %s" % label,
        'description': "description %s" % label,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        'full_text_rss': True,
    }
    story = db.create(table='stories', insert_hash=story_row)

    # Attach the new story to the given feed.
    feed_story_row = {
        'feeds_id': int(feed['feeds_id']),
        'stories_id': int(story['stories_id']),
    }
    db.create(table='feeds_stories_map', insert_hash=feed_story_row)

    return story
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""
    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    # Create the story itself; URL / GUID / title / description are all
    # label-derived so each fixture is unique and easy to recognize.
    story = db.create(table='stories', insert_hash={
        'media_id': int(feed['media_id']),
        'url': "http://story.test/%s" % label,
        'guid': "guid://story.test/%s" % label,
        'title': "story %s" % label,
        'description': "description %s" % label,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        'full_text_rss': True,
    })

    # Map the story into the feed it belongs to.
    db.create(table='feeds_stories_map', insert_hash={
        'feeds_id': int(feed['feeds_id']),
        'stories_id': int(story['stories_id']),
    })

    return story
def _add_tweet_story(db: DatabaseHandler, topic: Dict[str, Any], tweet: dict, topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api.

    Creates a story out of the tweet's text, attaches it to the topic, logs the
    tweet against each topic_fetch_url, and adds a topic_link for every URL the
    tweet contains (other than self-linked domain URLs). Returns the story.
    """
    user = tweet['user']['screen_name']
    body = tweet['text']

    story = generate_story(
        db=db,
        url=f"https://twitter.com/{user}/status/{tweet['id']}",
        content=body,
        title=f"{user}: {body}",
        publish_date=tweet['created_at'],
    )
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # Record the tweet on each fetch URL and resolve its ref story.
    for fetch_url in topic_fetch_urls:
        fetch_url = _log_tweet_added(db, fetch_url, story)
        try_update_topic_link_ref_stories_id(db, fetch_url)

    for tweet_url in get_tweet_urls(tweet):
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], tweet_url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': tweet_url,
        }
        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
def create_test_timespan(db: DatabaseHandler, topic: dict = None, snapshot: dict = None) -> dict:
    """Create simple timespans for testing.

    Must pass either topic or snapshot or both. If a snapshot is not passed, create one.
    """
    assert topic is not None or snapshot is not None

    # Derive a snapshot from the topic when the caller did not supply one.
    if not snapshot:
        snapshot = create_test_snapshot(db, topic)

    # A single 'overall' timespan covering the snapshot's full date range,
    # with all counts zeroed.
    return db.create(table='timespans', insert_hash={
        'topics_id': snapshot['topics_id'],
        'snapshots_id': snapshot['snapshots_id'],
        'start_date': snapshot['start_date'],
        'end_date': snapshot['end_date'],
        'period': 'overall',
        'story_count': 0,
        'story_link_count': 0,
        'medium_count': 0,
        'medium_link_count': 0,
        'post_count': 0
    })
def create_test_snapshot(db: DatabaseHandler, topic: dict) -> dict:
    """Create simple snapshot for testing."""
    # Snapshot date range mirrors the topic's; the snapshot is dated at the
    # topic's end date.
    snapshot_row = {
        'topics_id': topic['topics_id'],
        'snapshot_date': topic['end_date'],
        'start_date': topic['start_date'],
        'end_date': topic['end_date'],
    }
    return db.create(table='snapshots', insert_hash=snapshot_row)
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field
    if present or otherwise generates the content using _get_test_content().
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    # Prefer caller-supplied content; fall back to generated test content.
    content = story['content'] if 'content' in story else _get_test_content()

    # The story is about to get its own extracted text, so it can no longer
    # claim to be full-text RSS.
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    download_row = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': get_url_host(feed['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': False,
        'stories_id': story['stories_id'],
    }
    download = db.create(table='downloads', insert_hash=download_row)
    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    # Fetch the extraction result so callers can assert against it.
    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()
    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or
    otherwise generates the content using _get_test_content().
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    # Use caller-supplied content when present, otherwise generate some.
    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    # The story gets real extracted content below, so clear the full-text-RSS
    # flag both on the dict and in the database.
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    # Create a 'content' download in 'fetching' state, then store the content
    # for it in the download store.
    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    # Run the normal extraction pipeline over the stored content.
    extract_and_process_story(db=db, story=story)

    # Surface the extraction output on the returned story dict.
    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api.

    Builds the story URL/title from the tweet's screen name and text, adds the
    story to the topic, logs the tweet against each topic_fetch_url, and adds a
    topic_link for each URL found in the tweet. Returns the created story.
    """
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = "%s: %s" % (screen_name, content)
    tweet_date = tweet['created_at']
    url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])

    story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # Record the new story against each fetch URL and try to resolve the
    # corresponding topic link's ref story.
    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # Create a topic_link for each URL mentioned in the tweet, except
    # self-linked domain URLs.
    urls = mediawords.util.twitter.get_tweet_urls(tweet)
    for url in urls:
        if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.info("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        }
        db.create('topic_links', topic_link)
        mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""
    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    # Feed name and URL are both derived from the label.
    feed_row = {
        'name': label,
        'url': "http://feed.test/%s" % label,
        'media_id': int(medium['media_id']),
    }
    return db.create(table='feeds', insert_hash=feed_row)
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    # Name, URL and notes are all label-derived fixture values.
    medium_row = {
        'name': label,
        'url': "http://media.test/%s" % (label,),
        'is_monitored': True,
        'public_notes': "%s public notes" % (label,),
        'editor_notes': "%s editor notes" % (label,),
    }
    return db.create(table='media', insert_hash=medium_row)
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    url = "http://media.test/%s" % (label,)
    public_notes = "%s public notes" % (label,)
    editor_notes = "%s editor notes" % (label,)

    # The created medium is monitored by default.
    return db.create(table='media', insert_hash={
        'name': label,
        'url': url,
        'is_monitored': True,
        'public_notes': public_notes,
        'editor_notes': editor_notes,
    })
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""
    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    # The feed URL is derived from the label so fixtures stay unique and
    # recognizable in test output.
    feed = db.create(table='feeds', insert_hash={
        'name': label,
        'url': "http://feed.test/%s" % label,
        'media_id': int(medium['media_id']),
    })
    return feed
def create_test_topic_stories(
        db: DatabaseHandler,
        topic: dict,
        num_media: int=10,
        num_stories_per_medium: int=10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium.

    Creates num_media test media, each with one feed and num_stories_per_medium
    stories, attaches every story to the topic, then bulk-inserts cross-media
    topic_links directly in SQL.
    """
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)
        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            # Pin every story's publish date to the topic start date so all
            # stories fall inside the topic's date range.
            db.update_by_id('stories', story['stories_id'], {'publish_date': topic['start_date']})
            db.create('topic_stories', {'topics_id': topic['topics_id'], 'stories_id': story['stories_id']})

    # Insert one cross-media link per story: pair stories from different media
    # and cap the total at num_media * num_stories_per_medium.
    # NOTE(review): the join is against all of `stories`, not just this
    # topic's stories — presumably fine for a fresh test database; confirm.
    db.query(
        """
        insert into topic_links ( topics_id, stories_id, url, ref_stories_id )
            select %(a)s, a.stories_id, b.url, b.stories_id
                from stories a
                    join stories b on ( a.media_id <> b.media_id )
                limit %(b)s
        """,
        {'a': topic['topics_id'], 'b': num_media * num_stories_per_medium})
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    # The label doubles as name, description, pattern and seed query.
    topic_row = {
        'name': label,
        'description': label,
        'pattern': label,
        'solr_seed_query': label,
        'solr_seed_query_run': True,
        'start_date': '2016-01-01',
        'end_date': '2016-03-01',
        'job_queue': 'mc',
        'max_stories': 100000,
    }
    return db.create(table='topics', insert_hash=topic_row)
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label.

    The label is reused for the topic's name, description, pattern and Solr
    seed query; the date range is fixed to early 2016.
    """
    label = decode_object_from_bytes_if_needed(label)

    topic = db.create(table='topics', insert_hash={
        'name': label,
        'description': label,
        'pattern': label,
        'solr_seed_query': label,
        'solr_seed_query_run': True,
        'start_date': '2016-01-01',
        'end_date': '2016-03-01',
        'job_queue': 'mc',
        'max_stories': 100000,
    })
    return topic
def create_download_for_story(db: DatabaseHandler, feed: dict, story: dict) -> dict:
    """Create a successful 'content' download row for the given story and feed."""
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    download_row = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': get_url_host(url=feed['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'priority': 1,
        'extracted': False,
        'path': 'postgresql:foo',
        'stories_id': story['stories_id'],
    }
    return db.create(table='downloads', insert_hash=download_row)
def create_download_for_story(db: DatabaseHandler, feed: dict, story: dict) -> dict:
    """Create a 'content' download in 'success' state pointing at the story's URL."""
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    # Host is taken from the feed URL, not the story URL.
    host = get_url_host(url=feed['url'])

    insert_hash = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'priority': 1,
        'extracted': False,
        'path': 'postgresql:foo',
        'stories_id': story['stories_id'],
    }
    download = db.create(table='downloads', insert_hash=insert_hash)
    return download
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create a pending 'feed' download for the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    # Feeds that have never been attempted get a higher priority.
    priority = 10 if 'last_attempted_download_time' not in feed else 0

    return db.create(table='downloads', insert_hash={
        'feeds_id': int(feed['feeds_id']),
        'url': feed['url'],
        'host': get_url_host(url=feed['url']),
        'type': 'feed',
        'sequence': 1,
        'state': 'pending',
        'priority': priority,
        'download_time': 'NOW()',
        'extracted': False,
    })
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create a 'feed' download in 'pending' state for the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    # A feed with no recorded download attempt is prioritized.
    if 'last_attempted_download_time' in feed:
        priority = 0
    else:
        priority = 10

    host = get_url_host(url=feed['url'])

    download_row = {
        'feeds_id': int(feed['feeds_id']),
        'url': feed['url'],
        'host': host,
        'type': 'feed',
        'sequence': 1,
        'state': 'pending',
        'priority': priority,
        'download_time': 'NOW()',
        'extracted': False,
    }
    return db.create(table='downloads', insert_hash=download_row)
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or
    otherwise generates the content using _get_test_content(). Also writes download_texts and
    story_sentences rows and marks the story as processed.
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    # NOTE(review): the detected language is only persisted to 'stories' when
    # full_text_rss was set — confirm that's intended for the other branch.
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    # Create a 'content' download already flagged as extracted, then store the
    # raw content for it.
    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })

    download = store_content(db=db, download=download, content=content)

    # Strip the HTML by hand rather than running the extractor pipeline.
    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    # Write the download_texts row directly.
    db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extracted_content,
    })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    sentences = lang.split_text_to_sentences(extracted_content)

    # Insert one story_sentences row per sentence, numbered from 1, with a
    # per-sentence language detection that falls back to 'en'.
    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Read back the download_texts row so callers can assert against it.
    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
        'downloads_id': download['downloads_id']
    }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def create_test_topic_posts(
        db: DatabaseHandler,
        topic: dict,
        num_posts_per_day: int=10) -> int:
    """Fill topic with topic_posts. Return the number of posts created.

    Creates one 'generic_post' topic_seed_query, then for every day in the
    topic's date range creates a topic_post_days row with num_posts_per_day
    topic_posts, each linked via topic_post_urls / topic_seed_urls to an
    existing live story of the topic (round-robin).
    """
    date = dateutil.parser.parse(topic['start_date'])
    end_date = dateutil.parser.parse(topic['end_date'])

    # One seed query that all generated post days hang off of.
    tsq = {
        'topics_id': topic['topics_id'],
        'source': 'csv',
        'platform': 'generic_post',
        'query': 'foo'
    }
    tsq = db.create('topic_seed_queries', tsq)

    # NOTE(review): assumes the topic already has at least one live story —
    # stories[num_posts % len(stories)] below divides by len(stories).
    stories = db.query(
        "select * from snap.live_stories where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    num_posts = 0
    # Loop over every day in [start_date, end_date).
    while date < end_date:
        tpd = {
            'topic_seed_queries_id': tsq['topic_seed_queries_id'],
            'day': date.strftime('%Y-%m-%d'),
            'num_posts_stored': num_posts_per_day,
            'num_posts_fetched': num_posts_per_day,
        }
        tpd = db.create('topic_post_days', tpd)

        for i in range(num_posts_per_day):
            author_num = i
            # NOTE(review): i < num_posts_per_day, so i % num_posts_per_day
            # is always i — channel_num always equals author_num here.
            channel_num = i % num_posts_per_day
            topic_post = {
                'topic_post_days_id': tpd['topic_post_days_id'],
                'post_id': i,
                'content': f'content {i}',
                'author': f'author {author_num}',
                'channel': f'channel {channel_num}',
                'publish_date': date.strftime('%Y-%m-%d'),
                'data': '{}'
            }
            topic_post = db.create('topic_posts', topic_post)

            # Round-robin each post onto one of the topic's live stories.
            post_story = stories[num_posts % len(stories)]

            tpu = {
                'topic_posts_id': topic_post['topic_posts_id'],
                'url': post_story['url']
            }
            tpu = db.create('topic_post_urls', tpu)

            tsu = {
                'topics_id': topic['topics_id'],
                'url': post_story['url'],
                'stories_id': post_story['stories_id'],
                'topic_seed_queries_id': tsq['topic_seed_queries_id'],
                'topic_post_urls_id': tpu['topic_post_urls_id']
            }
            tsu = db.create('topic_seed_urls', tsu)

            num_posts += 1

        date = date + datetime.timedelta(days=1)

    return num_posts
def create_test_topic_stories(db: DatabaseHandler,
                              topic: dict,
                              num_media: int = 10,
                              num_stories_per_medium: int = 10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium.

    Creates num_media test media, each with one feed and num_stories_per_medium
    stories attached to the topic, then inserts cross-media topic_links capped
    at num_media * num_stories_per_medium.
    """
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)

        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            # Pin publish dates to the topic start date so every story falls
            # inside the topic's date range.
            db.update_by_id('stories', story['stories_id'], {'publish_date': topic['start_date']})
            db.create('topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id']
            })

    # Select cross-media story pairs to act as topic links.
    # FIX: the LIMIT used to be hardcoded to 50 even though the intended cap
    # (num_media * num_stories_per_medium) was already computed and passed in
    # as 'topic_link_count'; the sibling implementation of this helper applies
    # that computed limit, so use the parameter here too.
    test_topic_links = db.query(
        """
        WITH src_stories AS (
            SELECT *
            FROM stories
        ),

        test_topic_links AS (
            SELECT
                src_stories.stories_id,
                ref_stories.url,
                ref_stories.stories_id AS ref_stories_id
            FROM src_stories
                INNER JOIN stories AS ref_stories
                    ON src_stories.media_id != ref_stories.media_id
            LIMIT %(topic_link_count)s
        )

        SELECT
            stories_id,
            url,
            ref_stories_id
        FROM test_topic_links
        """, {
            'topic_link_count': num_media * num_stories_per_medium,
        }).hashes()

    # Inserting individually because otherwise it complains with:
    # ERROR: cannot handle complex subqueries when the router executor is disabled
    for test_topic_link in test_topic_links:
        db.query(
            """
            INSERT INTO topic_links (
                topics_id,
                stories_id,
                url,
                ref_stories_id
            ) VALUES (
                %(topics_id)s,
                %(stories_id)s,
                %(url)s,
                %(ref_stories_id)s
            )
            """,
            {
                'topics_id': topic['topics_id'],
                'stories_id': test_topic_link['stories_id'],
                'url': test_topic_link['url'],
                'ref_stories_id': test_topic_link['ref_stories_id'],
            })