def _add_missing_normalized_title_hashes(db: DatabaseHandler, topic: dict) -> None:
    """Add a normalized_title_hash field for every stories row that is missing it for the given topic."""
    db.begin()
    db.query(
        """
        DECLARE c CURSOR FOR
            SELECT stories_id
            FROM snap.live_stories
            WHERE topics_id = %(topics_id)s
              AND normalized_title_hash IS NULL
        """,
        {'topics_id': topic['topics_id']}
    )

    log.info('adding normalized story titles ...')

    # break this up into chunks instead of doing all topic stories at once via a simple sql query because we don't
    # want to do a single giant transaction with millions of stories
    while True:
        stories_ids = db.query("fetch 100 from c").flat()
        if len(stories_ids) < 1:
            break

        db.query("""
            UPDATE stories
            SET normalized_title_hash = md5(get_normalized_title(title, media_id))::UUID
            WHERE stories_id = ANY(%(story_ids)s)
        """, {'story_ids': stories_ids})

    db.commit()
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """
    story = decode_object_from_bytes_if_needed(story)

    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None
        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
def _add_missing_normalized_title_hashes(db: DatabaseHandler, topic: dict) -> None:
    """Add a normalized_title_hash field for every stories row that is missing it for the given topic."""
    db.begin()
    db.query(
        """
        declare c cursor for
            select stories_id
            from snap.live_stories
            where topics_id = %(a)s
              and normalized_title_hash is null
        """,
        {'a': topic['topics_id']})

    log.info('adding normalized story titles ...')

    # break this up into chunks instead of doing all topic stories at once via a simple sql query because we don't
    # want to do a single giant transaction with millions of stories
    while True:
        stories_ids = db.query("fetch 100 from c").flat()
        if len(stories_ids) < 1:
            break

        db.query(
            """
            update stories
            set normalized_title_hash = md5(get_normalized_title(title, media_id))::uuid
            where stories_id = any(%(a)s)
            """,
            {'a': stories_ids})

    db.commit()
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        twitter_class: typing.Type[AbstractTwitter],
        topic: dict,
        topic_tweet_day: dict,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']

    ch_posts = ch_posts_data['posts']

    if max_tweets is not None:
        ch_posts = ch_posts[0:max_tweets]

    log.debug("adding %d tweets for topic %s, day %s" % (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    db.begin()

    log.debug("inserting into topic_tweets ...")

    for ch_post in ch_posts:
        if 'tweet' in ch_post:
            _store_tweet_and_urls(db, topic_tweet_day, ch_post)

    num_deleted_tweets = len(list(filter(lambda x: 'tweet' not in x, ch_posts)))
    topic_tweet_day['num_ch_tweets'] -= num_deleted_tweets

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {'a': topic_tweet_day['num_ch_tweets'], 'b': topic_tweet_day['topic_tweet_days_id']})

    db.commit()

    log.debug("done inserting into topic_tweets")
def extract_and_process_story(db: DatabaseHandler,
                              story: dict,
                              extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
def import_feed_downloads(db: DatabaseHandler, csv_file: str) -> None:
    log.info(f"Importing downloads from {csv_file}...")

    db.begin()

    with open(csv_file, mode='r', encoding='utf-8') as f:

        # Guess dialect
        sample = f.read(1024)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        f.seek(0)

        input_csv = csv.DictReader(f, dialect=dialect)

        n = 1
        for download in input_csv:
            log.info(f"Importing download {n}...")
            n += 1

            # Pop the raw content off the row before inserting it as a downloads row
            raw_download_content = download.get('_raw_download_content', None)
            if raw_download_content:
                del download['_raw_download_content']

            # Cast some columns
            download['feeds_id'] = int(download['feeds_id']) if 'feeds_id' in download else None  # NULL
            download['stories_id'] = int(download['stories_id']) if 'stories_id' in download else None  # NULL
            download['parent'] = int(download['parent']) if 'parent' in download else None  # NULL
            download['priority'] = int(download['priority']) if 'priority' in download else 0  # NOT NULL
            download['sequence'] = int(download['sequence']) if 'sequence' in download else 0  # NOT NULL
            download['extracted'] = 't' if download.get('extracted', False) else 'f'

            # Will be rewritten by handle_download()
            download['path'] = ''

            download = db.create(table='downloads', insert_hash=download)

            # Create mock response to import it
            response = FakeResponse(content=raw_download_content)
            handler = handler_for_download(db=db, download=download)
            handler.store_response(db=db, download=download, response=response)

    log.info("Committing...")
    db.commit()

    log.info(f"Done importing downloads from {csv_file}")
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        twitter_class: typing.Type[AbstractTwitter],
        topic: dict,
        topic_tweet_day: dict,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']

    ch_posts = ch_posts_data['posts']

    if max_tweets is not None:
        ch_posts = ch_posts[0:max_tweets]

    log.info("adding %d tweets for topic %s, day %s" % (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    ch_posts = list(filter(lambda p: _post_matches_pattern(topic, p), ch_posts))

    log.info("%d tweets remaining after match" % len(ch_posts))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    for ch_post in ch_posts:
        _store_tweet_and_urls(db, topic_tweet_day, ch_post)

    topic_tweet_day['num_ch_tweets'] = len(ch_posts)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {'a': topic_tweet_day['num_ch_tweets'], 'b': topic_tweet_day['topic_tweet_days_id']})

    db.commit()

    log.debug("done inserting into topic_tweets")
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        topic_tweet_day: dict,
        meta_tweets: list,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    topic_tweet_day - topic_tweet_day dict
    meta_tweets - list of meta tweets found for day
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if max_tweets is not None:
        meta_tweets = meta_tweets[0:max_tweets]

    topics_id = topic_tweet_day['topics_id']

    log.info("adding %d tweets for topic %s, day %s" % (len(meta_tweets), topics_id, topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(meta_tweets), 100):
        _add_tweets_to_meta_tweets(meta_tweets[i:i + 100])

    topic = db.require_by_id('topics', topic_tweet_day['topics_id'])
    meta_tweets = list(filter(lambda p: _tweet_matches_pattern(topic, p), meta_tweets))

    log.info("%d tweets remaining after match" % len(meta_tweets))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    for meta_tweet in meta_tweets:
        _store_tweet_and_urls(db, topic_tweet_day, meta_tweet)

    topic_tweet_day['num_tweets'] = len(meta_tweets)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {'a': topic_tweet_day['num_tweets'], 'b': topic_tweet_day['topic_tweet_days_id']})

    db.commit()

    log.debug("done inserting into topic_tweets")
def print_long_running_job_states(db: DatabaseHandler, limit: int):
    # NOTE: despite the name, this groups media by distinctive domain and fills the media_dups scratch table
    media = db.query("""
        select m.*, mh.*
        from media m
            join media_health mh using ( media_id )
        where dup_media_id is null
        order by m.media_id asc
        limit %(a)s
    """, {'a': limit}).hashes()

    media_groups = {}
    num_media = len(media)
    for i, medium in enumerate(media):
        domain = get_url_distinctive_domain(medium['url'])
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        if domain not in media_groups:
            media_groups[domain] = []

        media_groups[domain].append(medium)

        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {'a': medium['media_id']}
        ).hashes()

        media_groups[domain].extend(dup_media)

        for dup_medium in dup_media:
            dup_domain = get_url_distinctive_domain(dup_medium['url'])
            dup_medium['medium_domain'] = dup_domain
            dup_medium['dup_domain_matches'] = domain == dup_domain

    db.query("DROP TABLE IF EXISTS media_dups")
    db.query("""
        CREATE TABLE media_dups (
            domain TEXT,
            media_id BIGINT
        )
    """)

    db.begin()
    for i, domain in enumerate(media_groups.keys()):
        log.warning("domain %s [%d/%d]" % (domain, i, len(media_groups.keys())))
        media = media_groups[domain]
        if len(media) > 1:
            for m in media:
                db.query("""
                    insert into media_dups (domain, media_id)
                    values (%(a)s, %(b)s)
                """, {'a': domain, 'b': m['media_id']})
    db.commit()
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys."""
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all IP-limited API keys
    db.query(
        """
        DELETE FROM auth_user_api_keys
        WHERE ip_address IS NOT NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
        """,
        {'email': email})

    # Regenerate non-IP limited API key
    db.query(
        """
        UPDATE auth_user_api_keys

        -- DEFAULT points to a generation function
        SET api_key = DEFAULT

        WHERE ip_address IS NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
        """,
        {'email': email})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException("Unable to send email about reset API key.")

    db.commit()
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    db.query("SET LOCAL citus.multi_shard_modify_mode TO 'sequential'")

    log.debug("inserting into topic_posts ...")

    for post in posts:
        _store_post_and_urls(db, topic_post_day, post)

    db.query(
        """
        UPDATE topic_post_days SET
            posts_fetched = true,
            num_posts_stored = %(num_posts_stored)s,
            num_posts_fetched = %(num_posts_fetched)s
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s
        """,
        {
            'num_posts_stored': len(posts),
            'num_posts_fetched': num_posts_fetched,
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
        }
    )

    db.commit()

    log.debug("done inserting into topic_posts")
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
        from stories s, topic_stories ts, media m
        where
            s.stories_id = ts.stories_id
            and s.media_id = m.media_id
            and m.foreign_rss_links = true
            and ts.topics_id = %(a)s
            and not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(f"Unable to fetch content for download {download['downloads_id']}: {ex}")

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- removes all existing API keys and creates a new non-IP limited API key."""
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all API keys
    db.query(
        """
        DELETE FROM auth_user_api_keys
        WHERE auth_users_id = %(auth_users_id)s
        """,
        {'auth_users_id': user.user_id()})

    # Regenerate non-IP limited API key
    db.query(
        """
        INSERT INTO auth_user_api_keys (auth_users_id, api_key, ip_address)
        VALUES (
            %(auth_users_id)s,

            -- DEFAULT points to a generation function
            DEFAULT,

            NULL
        )
        """,
        {'auth_users_id': user.user_id()})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException("Unable to send email about reset API key.")

    db.commit()
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    log.debug("inserting into topic_posts ...")

    for post in posts:
        _store_post_and_urls(db, topic_post_day, post)

    db.query(
        """
        update topic_post_days set
            posts_fetched = true,
            num_posts_stored = %(a)s,
            num_posts_fetched = %(b)s
        where topic_post_days_id = %(c)s
        """,
        {
            'a': len(posts),
            'b': num_posts_fetched,
            'c': topic_post_day['topic_post_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_posts")
def queue_all_stories(db: DatabaseHandler) -> None:
    db.begin()

    db.query("TRUNCATE TABLE solr_import_stories")

    # "SELECT FROM processed_stories" because only processed stories should get imported. "ORDER BY" so that the
    # import is more efficient when pulling blocks of stories out.
    db.query("""
        INSERT INTO solr_import_stories (stories_id)
        SELECT stories_id
        FROM processed_stories
        GROUP BY stories_id
        ORDER BY stories_id
    """)

    db.commit()
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
        from stories s, topic_stories ts, media m
        where
            s.stories_id = ts.stories_id
            and s.media_id = m.media_id
            and m.foreign_rss_links = true
            and ts.topics_id = %(a)s
            and not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process so that
    only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query("select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" % (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'], {'normalized_url': normalized_url})

    db.commit()
def queue_all_stories(db: DatabaseHandler, stories_queue_table: str = 'solr_import_stories') -> None:
    stories_queue_table = decode_object_from_bytes_if_needed(stories_queue_table)

    db.begin()

    db.query(f"TRUNCATE TABLE {stories_queue_table}")

    # "SELECT FROM processed_stories" because only processed stories should get imported. "ORDER BY" so that the
    # import is more efficient when pulling blocks of stories out.
    db.query(f"""
        INSERT INTO {stories_queue_table}
        SELECT stories_id
        FROM processed_stories
        GROUP BY stories_id
        ORDER BY stories_id
    """)

    db.commit()
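# Note that stories_queue_table above is interpolated straight into the SQL, so callers must pass a trusted
# identifier. A minimal guard sketch one could call before interpolating; the allow-list below is an
# assumption for illustration, not part of the original code:
ALLOWED_STORIES_QUEUE_TABLES = {'solr_import_stories', 'solr_import_extra_stories'}


def _assert_stories_queue_table(stories_queue_table: str) -> None:
    """Raise if the caller passes a table name outside the known queue tables."""
    if stories_queue_table not in ALLOWED_STORIES_QUEUE_TABLES:
        raise ValueError(f"Refusing to TRUNCATE unexpected table: {stories_queue_table}")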
def extract_and_process_story(db: DatabaseHandler,
                              story: dict,
                              extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
          AND state = 'success'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
def activate_user_via_token(db: DatabaseHandler, email: str, activation_token: str) -> None:
    """Activate a user with an activation token sent by email."""
    email = decode_object_from_bytes_if_needed(email)
    activation_token = decode_object_from_bytes_if_needed(activation_token)

    if not email:
        raise McAuthRegisterException("Email is empty.")
    if not activation_token:
        raise McAuthRegisterException('Activation token is empty.')

    # Validate the token once more (was pre-validated in controller)
    if not password_reset_token_is_valid(db=db, email=email, password_reset_token=activation_token):
        raise McAuthRegisterException('Activation token is invalid.')

    db.begin()

    # Mark the user as active
    db.query(
        """
        UPDATE auth_users
        SET active = TRUE
        WHERE email = %(email)s
        """,
        {'email': email})

    # Unset the password reset token
    db.query(
        """
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
        """,
        {'email': email})

    user = user_info(db=db, email=email)

    message = AuthActivatedMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthRegisterException("Unable to send email about an activated user.")

    db.commit()
def _store_map(db: DatabaseHandler,
               topics_id: int,
               timespans_id: int,
               content: bytes,
               graph_format: str,
               color_by: str) -> None:
    """Create a timespans_map row."""
    db.begin()

    options = {'color_by': color_by}
    options_json = encode_json(options)

    db.query(
        """
        DELETE FROM timespan_maps
        WHERE timespans_id = %(a)s
          AND format = %(b)s
          AND options = %(c)s
        """,
        {'a': timespans_id, 'b': graph_format, 'c': options_json})

    timespan_map = {
        'topics_id': topics_id,
        'timespans_id': timespans_id,
        'options': options_json,
        'format': graph_format
    }
    timespan_map = db.create('timespan_maps', timespan_map)

    db.commit()

    content_types = {
        'svg': 'image/svg+xml',
        'gexf': 'xml/gexf'
    }
    content_type = content_types[graph_format]

    store_content(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'], content, content_type)

    url = get_content_url(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'])

    db.update_by_id('timespan_maps', timespan_map['timespan_maps_id'], {'url': url})
def all_users(db: DatabaseHandler) -> List[CurrentUser]:
    """Fetch and return a list of users and their roles."""
    # Start a transaction so that the list of users doesn't change while we run separate queries with user_info()
    db.begin()

    user_emails = db.query("""
        SELECT email
        FROM auth_users
        ORDER BY auth_users_id
    """).flat()

    users = []
    for email in user_emails:
        users.append(user_info(db=db, email=email))

    db.commit()

    return users
def run_provider(db: DatabaseHandler, daemon: bool = True) -> None:
    """Run the provider daemon to periodically add crawler_fetcher jobs by querying for pending downloads.

    Poll forever as a daemon.  Every QUEUE_INTERVAL seconds, check whether queued_downloads has less than
    MAX_QUEUE_SIZE jobs.  If it does, call provide_download_ids and queue a fetcher job for each provided
    download_id.

    When run as a daemon, this function effectively throttles each host to no more than one download every
    QUEUE_INTERVAL seconds because provide_download_ids only provides one downloads_id for each host.
    """
    while True:
        queue_size = db.query(
            "select count(*) from ( select 1 from queued_downloads limit %(a)s ) q",
            {'a': MAX_QUEUE_SIZE * 10}).flat()[0]

        log.warning("queue_size: %d" % queue_size)

        if queue_size < MAX_QUEUE_SIZE:
            downloads_ids = provide_download_ids(db)
            log.warning("adding downloads to queue: %d" % len(downloads_ids))

            db.begin()
            for i in downloads_ids:
                db.query(
                    "insert into queued_downloads(downloads_id) values(%(a)s) on conflict (downloads_id) do nothing",
                    {'a': i})
            db.commit()

            if daemon:
                time.sleep(QUEUE_INTERVAL)
        elif daemon:
            # queue is already full, so wait longer before polling again
            time.sleep(QUEUE_INTERVAL * 10)

        if not daemon:
            break
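# Usage sketch for run_provider(): a single non-daemon pass queues at most one pending download per host
# (per the docstring above) and then returns, which is convenient in tests. connect_to_db() is assumed to
# come from the surrounding codebase.
def _example_run_provider_once() -> None:
    db = connect_to_db()
    run_provider(db, daemon=False)
    queued = db.query("select count(*) from queued_downloads").flat()[0]
    log.info("queued downloads after one pass: %d" % queued)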
def update_user(db: DatabaseHandler, user_updates: ModifyUser) -> None:
    """Update an existing user."""
    if not user_updates:
        raise McAuthProfileException("Existing user is undefined.")

    # Check if user exists
    try:
        user = user_info(db=db, email=user_updates.email())
    except Exception:
        raise McAuthProfileException('User with email address "%s" does not exist.' % user_updates.email())

    db.begin()

    if user_updates.full_name() is not None:
        db.query(
            """
            UPDATE auth_users
            SET full_name = %(full_name)s
            WHERE email = %(email)s
            """,
            {
                'full_name': user_updates.full_name(),
                'email': user_updates.email(),
            })

    if user_updates.notes() is not None:
        db.query(
            """
            UPDATE auth_users
            SET notes = %(notes)s
            WHERE email = %(email)s
            """,
            {
                'notes': user_updates.notes(),
                'email': user_updates.email(),
            })

    if user_updates.active() is not None:
        db.query(
            """
            UPDATE auth_users
            SET active = %(active)s
            WHERE email = %(email)s
            """,
            {
                'active': bool(int(user_updates.active())),
                'email': user_updates.email(),
            })

    if user_updates.password() is not None:
        try:
            change_password(
                db=db,
                email=user_updates.email(),
                new_password=user_updates.password(),
                new_password_repeat=user_updates.password_repeat(),
                do_not_inform_via_email=True,
            )
        except Exception as ex:
            db.rollback()
            raise McAuthProfileException("Unable to change password: %s" % str(ex))

    if user_updates.weekly_requests_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
            """,
            {
                'weekly_requests_limit': user_updates.weekly_requests_limit(),
                'auth_users_id': user.user_id(),
            })

    if user_updates.weekly_requested_items_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
            """,
            {
                'weekly_requested_items_limit': user_updates.weekly_requested_items_limit(),
                'auth_users_id': user.user_id(),
            })

    if user_updates.role_ids() is not None:
        db.query(
            """
            DELETE FROM auth_users_roles_map
            WHERE auth_users_id = %(auth_users_id)s
            """,
            {'auth_users_id': user.user_id()})

        for auth_roles_id in user_updates.role_ids():
            db.insert(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })

    db.commit()
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query("""
        WITH topic_stories_from_topic AS (
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
              AND (NOT valid_foreign_rss_story)
        )

        SELECT stories.*
        FROM stories
            INNER JOIN media
                ON stories.media_id = media.media_id
               AND media.foreign_rss_links
        WHERE stories.stories_id IN (
            SELECT stories_id
            FROM topic_stories_from_topic
        )
    """, {
        'topics_id': topic['topics_id'],
    }).hashes()

    for story in stories:
        download = db.query(
            """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
            """,
            {
                'stories_id': story['stories_id'],
            }
        ).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(f"Unable to fetch content for download {download['downloads_id']}: {ex}")

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()

        db.create(
            'topic_seed_urls',
            {
                'topics_id': topic['topics_id'],
                'url': story['url'],
                'source': 'merge_foreign_rss_stories',
                'content': content,
            },
        )

        db.query(
            """
            UPDATE topic_links SET
                ref_stories_id = NULL,
                link_spidered = 'f'
            WHERE topics_id = %(topics_id)s
              AND ref_stories_id = %(ref_stories_id)s
            """,
            {
                'ref_stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.query(
            """
            DELETE FROM topic_stories
            WHERE stories_id = %(stories_id)s
              AND topics_id = %(topics_id)s
            """,
            {
                'stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.commit()
def update_tags_for_story(self, db: DatabaseHandler, stories_id: int) -> None:
    """Add version, country and story tags for story."""
    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    annotation = self.__annotation_store.fetch_annotation_for_story(db=db, stories_id=stories_id)
    if annotation is None:
        raise McJSONAnnotationTaggerException("Unable to fetch annotation for story %d" % stories_id)

    tags = None
    try:
        tags = self._tags_for_annotation(annotation)
    except Exception as ex:
        # Programming error (should at least return an empty list)
        fatal_error("Unable to fetch tags for story %d: %s" % (stories_id, str(ex),))

    if tags is None:
        raise McJSONAnnotationTaggerException("Returned tags is None for story %d." % stories_id)

    log.debug("Tags for story %d: %s" % (stories_id, str(tags),))

    db.begin()

    unique_tag_sets_names = set()
    for tag in tags:
        tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
        unique_tag_sets_names.add(tag_sets_name)

    # Delete old tags the story might have under a given tag set
    db.query(
        """
        DELETE FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
          AND tags_id IN (
            SELECT tags_id
            FROM tags
            WHERE tag_sets_id IN (
                SELECT tag_sets_id
                FROM tag_sets
                WHERE name = ANY(%(tag_sets_names)s)
            )
          )
        """,
        {'stories_id': stories_id, 'tag_sets_names': list(unique_tag_sets_names)})

    for tag in tags:
        tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
        tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

        # Not using find_or_create() because tag set / tag might already exist
        # with slightly different label / description

        # Find or create a tag set
        db_tag_set = db.select(table='tag_sets', what_to_select='*', condition_hash={'name': tag_sets_name}).hash()
        if db_tag_set is None:
            db.query(
                """
                INSERT INTO tag_sets (name, label, description)
                VALUES (%(name)s, %(label)s, %(description)s)
                ON CONFLICT (name) DO NOTHING
                """,
                {
                    'name': tag_sets_name,
                    'label': tag.tag_sets_label,
                    'description': tag.tag_sets_description
                })
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash={'name': tag_sets_name}).hash()
        tag_sets_id = int(db_tag_set['tag_sets_id'])

        # Find or create tag
        db_tag = db.select(table='tags', what_to_select='*', condition_hash={
            'tag_sets_id': tag_sets_id,
            'tag': tags_name,
        }).hash()
        if db_tag is None:
            db.query(
                """
                INSERT INTO tags (tag_sets_id, tag, label, description)
                VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """,
                {
                    'tag_sets_id': tag_sets_id,
                    'tag': tags_name,
                    'label': tag.tags_label,
                    'description': tag.tags_description,
                })
            db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
            }).hash()
        tags_id = int(db_tag['tags_id'])

        # Assign story to tag (if no such mapping exists yet)
        #
        # (partitioned table's INSERT trigger will take care of conflicts)
        #
        # Not using db.create() because it tests last_inserted_id, and on duplicates there would be no such
        # "last_inserted_id" set.
        db.query(
            """
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            """,
            {
                'stories_id': stories_id,
                'tags_id': tags_id,
            })

    db.commit()
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user."""
    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query(
        """
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
        """,
        {'email': new_user.email()}).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." % new_user.email())

    # Hash + validate the password
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception:
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(table='auth_users', insert_hash={
        'email': new_user.email(),
        'password_hash': password_hash,
        'full_name': new_user.full_name(),
        'notes': new_user.notes(),
        'active': bool(int(new_user.active())),
    })

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("I've attempted to create the user but it doesn't exist: %s" % str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })
    except Exception as ex:
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
            """,
            {
                'auth_users_id': user.user_id(),
                'weekly_requests_limit': new_user.weekly_requests_limit(),
            })

    if new_user.weekly_requested_items_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
            """,
            {
                'auth_users_id': user.user_id(),
                'weekly_requested_items_limit': new_user.weekly_requested_items_limit(),
            })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter', insert_hash={'auth_users_id': user.user_id()})

    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()
def update_tags_for_story(self, db: DatabaseHandler, stories_id: int) -> None:
    """Add version, country and story tags for story."""
    if not self.annotator_is_enabled():
        fatal_error("Annotator is not enabled in the configuration.")

    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    annotation = self.fetch_annotation_for_story(db=db, stories_id=stories_id)
    if annotation is None:
        raise McJSONAnnotatorException("Unable to fetch annotation for story %d" % stories_id)

    tags = None
    try:
        tags = self._tags_for_annotation(annotation)
    except Exception as ex:
        # Programming error (should at least return an empty list)
        fatal_error("Unable to fetch tags for story %d: %s" % (stories_id, str(ex),))

    if tags is None:
        raise McJSONAnnotatorException("Returned tags is None for story %d." % stories_id)

    log.debug("Tags for story %d: %s" % (stories_id, str(tags),))

    db.begin()

    # Delete old tags the story might have under a given tag set
    for tag in tags:
        tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
        db.query(
            """
            DELETE FROM stories_tags_map
                USING tags, tag_sets
            WHERE stories_tags_map.tags_id = tags.tags_id
              AND tags.tag_sets_id = tag_sets.tag_sets_id
              AND stories_tags_map.stories_id = %(stories_id)s
              AND tag_sets.name = %(tag_sets_name)s
            """,
            {'stories_id': stories_id, 'tag_sets_name': tag_sets_name})

    for tag in tags:
        tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
        tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

        # Not using find_or_create() because tag set / tag might already exist
        # with slightly different label / description

        # Create tag set
        db_tag_set = db.select(table='tag_sets', what_to_select='*', condition_hash={'name': tag_sets_name}).hash()
        if db_tag_set is None:
            db.query(
                """
                INSERT INTO tag_sets (name, label, description)
                VALUES (%(name)s, %(label)s, %(description)s)
                ON CONFLICT (name) DO NOTHING
                """,
                {
                    'name': tag_sets_name,
                    'label': tag.tag_sets_label,
                    'description': tag.tag_sets_description
                })
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash={'name': tag_sets_name}).hash()
        tag_sets_id = int(db_tag_set['tag_sets_id'])

        # Create tag
        db_tag = db.select(table='tags', what_to_select='*', condition_hash={
            'tag_sets_id': tag_sets_id,
            'tag': tags_name,
        }).hash()
        if db_tag is None:
            db.query(
                """
                INSERT INTO tags (tag_sets_id, tag, label, description)
                VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """,
                {
                    'tag_sets_id': tag_sets_id,
                    'tag': tags_name,
                    'label': tag.tags_label,
                    'description': tag.tags_description,
                })
            db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
            }).hash()
        tags_id = int(db_tag['tags_id'])

        # Assign story to tag (if no such mapping exists yet)
        db.query(
            """
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
            """,
            {
                'stories_id': stories_id,
                'tags_id': tags_id,
            })

    db.commit()
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user."""
    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query("""
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': new_user.email()}).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." % new_user.email())

    # Hash + validate the password
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception as ex:
        log.error("Unable to hash a new password: {}".format(ex))
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(
        table='auth_users',
        insert_hash={
            'email': new_user.email(),
            'password_hash': password_hash,
            'full_name': new_user.full_name(),
            'notes': new_user.notes(),
            'active': bool(int(new_user.active())),
        }
    )

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("I've attempted to create the user but it doesn't exist: %s" % str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })
    except Exception as ex:
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requests_limit': new_user.weekly_requests_limit(),
        })

    if new_user.weekly_requested_items_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requested_items_limit': new_user.weekly_requested_items_limit(),
        })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter', insert_hash={'auth_users_id': user.user_id()})

    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()
def add_story(db: DatabaseHandler, story: dict, feeds_id: int) -> Optional[dict]:
    """Return an existing dup story if it matches the url, guid, or title; otherwise, add a new story and return it.

    Returns found or created story. Adds an is_new = True story if the story was created by the call.
    """
    story = decode_object_from_bytes_if_needed(story)

    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    db_story = find_dup_story(db, story)
    if db_story:
        log.debug("found existing dup story: %s [%s]" % (story['title'], story['url']))
        db.commit()
        return db_story

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None
        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    story['is_new'] = True

    for url in (story['url'], story['guid']):
        insert_story_urls(db, story, url)

    # "on conflict" does not work with partitioned feeds_stories_map
    db.query(
        """
        insert into feeds_stories_map_p ( feeds_id, stories_id )
            select %(a)s, %(b)s
            where not exists (
                select 1
                from feeds_stories_map
                where feeds_id = %(a)s
                  and stories_id = %(b)s
            )
        """,
        {'a': feeds_id, 'b': story['stories_id']})

    db.commit()

    log.debug("added story: %s" % story['url'])

    return story
def update_tags_for_story(self, db: DatabaseHandler, stories_id: int) -> None:
    """Add version, country and story tags for story."""
    if not self.annotator_is_enabled():
        fatal_error("Annotator is not enabled in the configuration.")

    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    annotation = self.fetch_annotation_for_story(db=db, stories_id=stories_id)
    if annotation is None:
        raise McJSONAnnotatorException("Unable to fetch annotation for story %d" % stories_id)

    tags = None
    try:
        tags = self._tags_for_annotation(annotation)
    except Exception as ex:
        # Programming error (should at least return an empty list)
        fatal_error("Unable to fetch tags for story %d: %s" % (stories_id, str(ex),))

    if tags is None:
        raise McJSONAnnotatorException("Returned tags is None for story %d." % stories_id)

    log.debug("Tags for story %d: %s" % (stories_id, str(tags),))

    db.begin()

    unique_tag_sets_names = set()
    for tag in tags:
        tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
        unique_tag_sets_names.add(tag_sets_name)

    # Delete old tags the story might have under a given tag set
    db.query("""
        DELETE FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
          AND tags_id IN (
            SELECT tags_id
            FROM tags
            WHERE tag_sets_id IN (
                SELECT tag_sets_id
                FROM tag_sets
                WHERE name = ANY(%(tag_sets_names)s)
            )
          )
    """, {'stories_id': stories_id, 'tag_sets_names': list(unique_tag_sets_names)})

    for tag in tags:
        tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
        tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

        # Not using find_or_create() because tag set / tag might already exist
        # with slightly different label / description

        # Find or create a tag set
        db_tag_set = db.select(table='tag_sets', what_to_select='*', condition_hash={'name': tag_sets_name}).hash()
        if db_tag_set is None:
            db.query("""
                INSERT INTO tag_sets (name, label, description)
                VALUES (%(name)s, %(label)s, %(description)s)
                ON CONFLICT (name) DO NOTHING
            """, {
                'name': tag_sets_name,
                'label': tag.tag_sets_label,
                'description': tag.tag_sets_description
            })
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash={'name': tag_sets_name}).hash()
        tag_sets_id = int(db_tag_set['tag_sets_id'])

        # Find or create tag
        db_tag = db.select(table='tags', what_to_select='*', condition_hash={
            'tag_sets_id': tag_sets_id,
            'tag': tags_name,
        }).hash()
        if db_tag is None:
            db.query("""
                INSERT INTO tags (tag_sets_id, tag, label, description)
                VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                ON CONFLICT (tag, tag_sets_id) DO NOTHING
            """, {
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
                'label': tag.tags_label,
                'description': tag.tags_description,
            })
            db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
            }).hash()
        tags_id = int(db_tag['tags_id'])

        # Assign story to tag (if no such mapping exists yet)
        # (partitioned table's INSERT trigger will take care of conflicts)
        #
        # db.create() can't be used here because:
        #
        # 1) Master table for partitioned table might not have a primary key itself, only the partitions do --
        #    FIXME maybe master tables should have primary keys? Or let's wait for when we move to PostgreSQL 10+.
        #
        # 2) Partitioned table's INSERT trigger doesn't return last_inserted_id which db.create() requires --
        #    FIXME there might be a way for it to return the inserted row
        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    db.commit()
def update_story_sentences_and_language(db: DatabaseHandler,
                                        story: dict,
                                        extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Update story vectors for the given story, updating "story_sentences".

    If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating
    new ones (useful for optimization if you are very sure no story vectors exist for this story).

    If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are
    reprocessing a small set of stories).
    """
    story = decode_object_from_bytes_if_needed(story)

    use_transaction = not db.in_transaction()

    if use_transaction:
        db.begin()

    stories_id = story['stories_id']

    if not extractor_args.no_delete():
        _delete_story_sentences(db=db, story=story)

    story_text = story.get('story_text', None)
    if not story_text:
        story_text = get_text_for_word_counts(db=db, story=story)
        if not story_text:
            story_text = ''

    story_lang = language_code_for_text(text=story_text)

    sentences = _get_sentences_from_story_text(story_text=story_text, story_lang=story_lang)

    if (not story.get('language', None)) or story.get('language', None) != story_lang:
        db.query(
            """
            UPDATE stories
            SET language = %(story_lang)s
            WHERE stories_id = %(stories_id)s
            """,
            {'stories_id': stories_id, 'story_lang': story_lang})
        story['language'] = story_lang

    if sentences is None:
        raise McUpdateStorySentencesAndLanguageException("Sentences for story {} are undefined.".format(stories_id))

    if len(sentences) == 0:
        log.debug("Story {} doesn't have any sentences.".format(stories_id))
        # don't leave the transaction open on the early return
        if use_transaction:
            db.commit()
        return

    sentences = _clean_sentences(sentences)

    _insert_story_sentences(
        db=db,
        story=story,
        sentences=sentences,
        no_dedup_sentences=extractor_args.no_dedup_sentences(),
    )

    story['ap_syndicated'] = _update_ap_syndicated(
        db=db,
        stories_id=stories_id,
        story_title=story['title'],
        story_text=story_text,
        story_language=story_lang,
    )

    if use_transaction:
        db.commit()
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep media_normalized_urls table up to date.

    This function compares the media and versions in media_normalized_urls against the version returned by
    mediawords.util.url.normalize_url_lossy_version() and updates or inserts rows for any media that do not
    have up to date versions.
    """
    if not _normalized_urls_out_of_date(db):
        return

    # put a lock on this because the process of generating all media urls will take around 30 seconds, and we don't
    # want all workers to do the work
    db.begin()
    db.query("lock media_normalized_urls in access exclusive mode")

    if not _normalized_urls_out_of_date(db):
        db.commit()
        return

    log.warning("updating media_normalized_urls ...")

    version = mediawords.util.url.normalize_url_lossy_version()

    media = db.query(
        """
        select m.*
        from media m
            left join media_normalized_urls u on
                ( m.media_id = u.media_id and u.normalize_url_lossy_version = %(a)s)
        where
            u.normalized_url is null
            or u.db_row_last_updated < m.db_row_last_updated
        """,
        {'a': version}).hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" % (i, total, medium['name'], normalized_url))

        db.query(
            "delete from media_normalized_urls where media_id = %(a)s and normalize_url_lossy_version = %(b)s",
            {'a': medium['media_id'], 'b': version})
        db.create(
            'media_normalized_urls',
            {
                'media_id': medium['media_id'],
                'normalized_url': normalized_url,
                'normalize_url_lossy_version': version
            })

    db.commit()
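# Usage sketch tying several of the topic helpers above together. connect_to_db() and the topics_id value
# are assumptions from the surrounding codebase, not part of the functions themselves.
def _example_topic_maintenance(topics_id: int) -> None:
    db = connect_to_db()
    topic = db.require_by_id('topics', topics_id)

    # Each helper opens and commits its own transaction(s), so they can be chained freely.
    _add_missing_normalized_title_hashes(db, topic)
    merge_foreign_rss_stories(db, topic)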