def process_download_for_extractor(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None: """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be extracted for the story, call process_extracted_story() on the parent story.""" download = decode_object_from_bytes_if_needed(download) stories_id = download['stories_id'] log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url'])) extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args) has_remaining_download = db.query(""" SELECT downloads_id FROM downloads WHERE stories_id = %(stories_id)s AND extracted = 'f' AND type = 'content' """, {'stories_id': stories_id}).hash() # MC_REWRITE_TO_PYTHON: Perlism if has_remaining_download is None: has_remaining_download = {} if len(has_remaining_download) > 0: log.info("Pending more downloads...") else: story = db.find_by_id(table='stories', object_id=stories_id) process_extracted_story(db=db, story=story, extractor_args=extractor_args)
def get_links_from_story_text(db: DatabaseHandler, story: dict) -> typing.List[str]: """Get all urls that appear in the text or description of the story using a simple regex.""" download_ids = db.query(""" SELECT downloads_id FROM downloads WHERE stories_id = %(stories_id)s """, {'stories_id': story['stories_id']} ).flat() download_texts = db.query(""" SELECT * FROM download_texts WHERE downloads_id = ANY(%(download_ids)s) ORDER BY download_texts_id """, {'download_ids': download_ids} ).hashes() story_text = ' '.join([dt['download_text'] for dt in download_texts]) story_text = story_text + ' ' + str(story['title']) if story['title'] is not None else story_text story_text = story_text + ' ' + str(story['description']) if story['description'] is not None else story_text links = [] for url in re.findall(r'https?://[^\s\")]+', story_text): url = re.sub(r'\W+$', '', url) links.append(url) return links
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict: """Store the content for the download.""" # feed_error state indicates that the download was successful but that there was a problem # parsing the feed afterward. so we want to keep the feed_error state even if we redownload # the content download = decode_object_from_bytes_if_needed(download) content = decode_object_from_bytes_if_needed(content) new_state = 'success' if download['state'] != 'feed_error' else 'feed_error' try: path = _get_store_for_writing().store_content(db, download['downloads_id'], content) except Exception as ex: raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], ex)) if new_state == 'success': download['error_message'] = '' db.update_by_id( table='downloads', object_id=download['downloads_id'], update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']}, ) download = db.find_by_id('downloads', download['downloads_id']) return download
def create(db: DatabaseHandler, download: dict, extract: dict) -> dict: """Create a download_text hash and insert it into the database. Delete any existing download_text row for the download.""" # FIXME don't pass freeform "extract" dict, we need just the "extracted_text" download = decode_object_from_bytes_if_needed(download) extract = decode_object_from_bytes_if_needed(extract) db.query(""" DELETE FROM download_texts WHERE downloads_id = %(downloads_id)s """, {'downloads_id': download['downloads_id']}) download_text = db.query(""" INSERT INTO download_texts (downloads_id, download_text, download_text_length) VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s)) RETURNING * """, { 'downloads_id': download['downloads_id'], 'download_text': extract['extracted_text'], }).hash() db.query(""" UPDATE downloads SET extracted = 't' WHERE downloads_id = %(downloads_id)s """, {'downloads_id': download['downloads_id']}) return download_text
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None: """Create a pending download for the story's URL.""" story = decode_object_from_bytes_if_needed(story) parent_download = decode_object_from_bytes_if_needed(parent_download) download = { 'feeds_id': parent_download['feeds_id'], 'stories_id': story['stories_id'], 'parent': parent_download['downloads_id'], 'url': story['url'], 'host': get_url_host(story['url']), 'type': 'content', 'sequence': 1, 'state': 'pending', 'priority': parent_download['priority'], 'extracted': False, } content_delay = db.query(""" SELECT content_delay FROM media WHERE media_id = %(media_id)s """, {'media_id': story['media_id']}).flat()[0] if content_delay: # Delay download of content this many hours. his is useful for sources that are likely to significantly change # content in the hours after it is first published. now = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) download_at_timestamp = now + (content_delay * 60 * 60) download['download_time'] = get_sql_date_from_epoch(download_at_timestamp) db.create(table='downloads', insert_hash=download)
def lookup_medium(db: DatabaseHandler, url: str, name: str) -> typing.Optional[dict]: """Lookup a media source by normalized url and then name. Uses mediawords.util.url.normalize_url_lossy to normalize urls. Returns the parent media for duplicate media sources and returns no media that are marked foreign_rss_links. This function queries the media.normalized_url field to find the matching urls. Because the normalization function is in python, we have to keep that denormalized_url field current from within python. This function is responsible for keeping the table up to date by filling the field for any media for which it is null. Arguments: db - db handle url - url to lookup name - name to lookup Returns: a media source dict or None """ _update_media_normalized_urls(db) nu = _normalize_url(url) lookup_query = \ """ select m.* from media m where m.normalized_url = %(a)s and foreign_rss_links = 'f' order by dup_media_id asc nulls last, media_id asc """ medium = db.query(lookup_query, {'a': nu}).hash() if medium is None: medium = db.query( "select m.* from media m where lower(m.name) = lower(%(a)s) and m.foreign_rss_links = false", {'a': name}).hash() if medium is None: return None if medium['dup_media_id'] is not None: media_cycle_lookup = dict() # type: dict while medium['dup_media_id'] is not None: if medium['media_id'] in media_cycle_lookup: raise McTopicMediaException('Cycle found in duplicate media path: ' + str(media_cycle_lookup.keys())) media_cycle_lookup[medium['media_id']] = True medium = db.query("select * from media where media_id = %(a)s", {'a': medium['dup_media_id']}).hash() if medium['foreign_rss_links']: raise McTopicMediaException('Parent duplicate media source %d has foreign_rss_links' % medium['media_id']) return medium
def _insert_tweet_urls(db: DatabaseHandler, topic_tweet: dict, urls: typing.List) -> typing.List: """Insert list of urls into topic_tweet_urls.""" for url in urls: db.query( """ insert into topic_tweet_urls( topic_tweets_id, url ) values( %(a)s, %(b)s ) on conflict do nothing """, {'a': topic_tweet['topic_tweets_id'], 'b': url})
def change_password(db: DatabaseHandler, email: str, new_password: str, new_password_repeat: str, do_not_inform_via_email: bool = False) -> None: """Change user's password.""" email = decode_object_from_bytes_if_needed(email) new_password = decode_object_from_bytes_if_needed(new_password) new_password_repeat = decode_object_from_bytes_if_needed(new_password_repeat) if isinstance(do_not_inform_via_email, bytes): do_not_inform_via_email = decode_object_from_bytes_if_needed(do_not_inform_via_email) do_not_inform_via_email = bool(int(do_not_inform_via_email)) # Check if user exists try: user = user_info(db=db, email=email) except Exception: raise McAuthChangePasswordException('User with email address "%s" does not exist.' % email) password_validation_message = validate_new_password(email=email, password=new_password, password_repeat=new_password_repeat) if password_validation_message: raise McAuthChangePasswordException("Unable to change password: %s" % password_validation_message) # Hash + validate the password try: password_new_hash = generate_secure_hash(password=new_password) except Exception as ex: raise McAuthChangePasswordException("Unable to hash a new password: %s" % str(ex)) if not password_new_hash: raise McAuthChangePasswordException("Generated password hash is empty.") # Set the password hash db.query(""" UPDATE auth_users SET password_hash = %(password_hash)s, active = TRUE WHERE email = %(email)s """, { 'email': email, 'password_hash': password_new_hash, }) if not do_not_inform_via_email: message = AuthPasswordChangedMessage(to=email, full_name=user.full_name()) if not send_email(message): raise McAuthChangePasswordException( 'The password has been changed, but I was unable to send an email notifying you about the change.' )
def __remove_object_from_cache(self, db: DatabaseHandler, object_id: int) -> None: """Attempt to remove object from cache. Raise if removal fails because after removal we'd expect the object to be gone for good.""" object_id = self._prepare_object_id(object_id) sql = "DELETE FROM %s " % self.__cache_table # interpolated by Python sql += "WHERE object_id = %(object_id)s" # interpolated by psycopg2 db.query(sql, {'object_id': object_id})
def __get_topic_url_variants(db: DatabaseHandler, urls: List[str]) -> List[str]: """Get any alternative urls for the given url from topic_merged_stories or topic_links.""" urls = decode_object_from_bytes_if_needed(urls) # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples stories_ids_sql = "SELECT stories_id " stories_ids_sql += "FROM stories " stories_ids_sql += "WHERE url = ANY(?)" stories_ids = db.query(stories_ids_sql, urls).flat() # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result # sets, a scalar on a single item and arrayref on many items if stories_ids is None: stories_ids = [] elif isinstance(stories_ids, int): stories_ids = [stories_ids] stories_ids = [int(x) for x in stories_ids] all_stories_ids = __get_merged_stories_ids(db=db, stories_ids=stories_ids) if len(all_stories_ids) == 0: return urls all_urls = db.query(""" SELECT DISTINCT url FROM ( SELECT redirect_url AS url FROM topic_links WHERE ref_stories_id = ANY(?) UNION SELECT url FROM topic_links WHERE ref_stories_id = ANY(?) UNION SELECT url FROM stories WHERE stories_id = ANY(?) ) AS q WHERE q IS NOT NULL """, all_stories_ids, all_stories_ids, all_stories_ids).flat() # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result # sets, a scalar on a single item and arrayref on many items if all_urls is None: all_urls = [] elif isinstance(all_urls, str): all_urls = [all_urls] return all_urls
def create_password_reset_token(db: DatabaseHandler, email: str) -> Optional[str]: """Generate password reset token used for both activating newly registered users and resetting passwords. Returns non-hashed password reset token or None if user was not found. """ email = decode_object_from_bytes_if_needed(email) if not email: raise McAuthProfileException('Email address is empty.') # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a # "success" message. That way the adversary would not be able to find out which email addresses are active users. # # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing # attacks) user_exists = db.query(""" SELECT auth_users_id, email FROM auth_users WHERE email = %(email)s LIMIT 1 """, {'email': email}).hash() if user_exists is None or len(user_exists) == 0: # User was not found, so set the email address to an empty string, but don't return just now and continue with a # rather slowish process of generating a activation token (in order to reduce the risk of timing attacks) email = '' # Generate the activation token password_reset_token = random_string(length=64) if len(password_reset_token) == 0: raise McAuthProfileException('Unable to generate an activation token.') # Hash + validate the activation token password_reset_token_hash = generate_secure_hash(password=password_reset_token) if not password_reset_token_hash: raise McAuthProfileException("Unable to hash an activation token.") # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing) db.query(""" UPDATE auth_users SET password_reset_token_hash = %(password_reset_token_hash)s WHERE email = %(email)s AND email != '' """, { 'email': email, 'password_reset_token_hash': password_reset_token_hash, }) return password_reset_token
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str: """Return the same hex color (e.g. "ff0000" for the same set / ID combination every time this function is called.""" item_set = decode_object_from_bytes_if_needed(item_set) item_id = decode_object_from_bytes_if_needed(item_id) # Always return grey for null or not typed values if item_id.lower() in {'null', 'not typed'}: return '999999' color = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""", { 'item_set': item_set, 'item_id': item_id, }).flat() if color is not None and len(color): if isinstance(color, list): color = color[0] return color set_colors = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s""", { 'item_set': item_set, }).flat() if set_colors is not None: if not isinstance(set_colors, list): set_colors = [set_colors] existing_colors = set() if set_colors is not None: for color in set_colors: existing_colors.add(color) # Use the hard coded palette of 25 colors if possible new_color = None for color in __MC_COLORS: if color not in existing_colors: new_color = color break # Otherwise, just generate a random color if new_color is None: colors = analogous_color(color='0000ff', return_slices=256, split_slices=255) new_color = random.choice(colors) db.create(table='color_sets', insert_hash={ 'color_set': item_set, 'id': item_id, 'color': new_color, }) return new_color
def _add_topic_tweet_single_day( db: DatabaseHandler, topic: dict, day: datetime.datetime, ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict: """ Add a row to topic_tweet_day if it does not already exist. fetch data for new row from CH. Arguments: db - database handle topic - topic dict day - date to fetch eg '2017-12-30' ch_class - AbstractCrimsonHexagon class Return: None """ # the perl-python layer was segfaulting until I added the str() around day below -hal topic_tweet_day = db.query( "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s", {'a': topic['topics_id'], 'b': str(day)}).hash() if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']: raise McFetchTopicTweetDateFetchedException("tweets already fetched for day " + str(day)) # if we have a ttd but had not finished fetching tweets, delete it and start over if topic_tweet_day is not None: db.delete_by_id('topic_tweet_days', topic_tweet_day['topic_tweet_days_id']) ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day) tweet_count = ch_posts['totalPostsAvailable'] num_ch_tweets = len(ch_posts['posts']) topic_tweet_day = db.create( 'topic_tweet_days', { 'topics_id': topic['topics_id'], 'day': day, 'tweet_count': tweet_count, 'num_ch_tweets': num_ch_tweets, 'tweets_fetched': False }) topic_tweet_day['ch_posts'] = ch_posts return topic_tweet_day
def regenerate_api_key(db: DatabaseHandler, email: str) -> None: """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys.""" email = decode_object_from_bytes_if_needed(email) if not email: raise McAuthProfileException('Email address is empty.') # Check if user exists try: user = user_info(db=db, email=email) except Exception: raise McAuthProfileException("User with email address '%s' does not exist." % email) db.begin() # Purge all IP-limited API keys db.query(""" DELETE FROM auth_user_api_keys WHERE ip_address IS NOT NULL AND auth_users_id = ( SELECT auth_users_id FROM auth_users WHERE email = %(email)s ) """, {'email': email}) # Regenerate non-IP limited API key db.query(""" UPDATE auth_user_api_keys -- DEFAULT points to a generation function SET api_key = DEFAULT WHERE ip_address IS NULL AND auth_users_id = ( SELECT auth_users_id FROM auth_users WHERE email = %(email)s ) """, {'email': email}) message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name()) if not send_email(message): db.rollback() raise McAuthProfileException("Unable to send email about reset API key.") db.commit()
def _get_ap_dup_sentence_lengths(db: DatabaseHandler, story_text: str) -> List[int]: story_text = decode_object_from_bytes_if_needed(story_text) ap_media_id = _get_ap_media_id(db=db) if ap_media_id is None: return [] sentences = _get_sentences_from_content(story_text=story_text) md5s = [] for sentence in sentences: md5_hash = hashlib.md5(sentence.encode('utf-8')).hexdigest() md5s.append(md5_hash) sentence_lengths = db.query(""" SELECT length(sentence) AS len FROM story_sentences WHERE media_id = %(ap_media_id)s -- FIXME this probably never worked because the index is half_md5(), not md5() AND md5(sentence) = ANY(%(md5s)s) """, { 'ap_media_id': ap_media_id, 'md5s': md5s, }).flat() # MC_REWRITE_TO_PYTHON: Perlism if sentence_lengths is None: sentence_lengths = [] return sentence_lengths
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict: """Find or create the 'Spider Feed' feed for the media source.""" feed = db.query( "select * from feeds where media_id = %(a)s and name = %(b)s", {'a': medium['media_id'], 'b': SPIDER_FEED_NAME}).hash() if feed is not None: return feed return db.find_or_create('feeds', { 'media_id': medium['media_id'], 'url': medium['url'] + '#spiderfeed', 'name': SPIDER_FEED_NAME, 'active': False, })
def _get_deduped_medium(db: DatabaseHandler, media_id: int) -> dict: """Get either the referenced medium or the deduped version of the medium by recursively following dup_media_id.""" medium = db.require_by_id('media', media_id) if medium['dup_media_id'] is None: return medium else: return _get_deduped_medium(db, medium['dup_media_id'])
def _get_story_with_most_sentences(db: DatabaseHandler, stories: list) -> dict: """Given a list of stories, return the story with the most sentences.""" assert len(stories) > 0 if len(stories) == 1: return stories[0] story = db.query( """ select s.* from stories s where stories_id in ( select stories_id from story_sentences where stories_id = any (%(a)s) group by stories_id order by count(*) desc limit 1 ) """, {'a': [s['stories_id'] for s in stories]}).hash() if story is not None: return story else: return stories[0]
def password_reset_token_is_valid(db: DatabaseHandler, email: str, password_reset_token: str) -> bool: """Validate password reset token (used for both user activation and password reset).""" email = decode_object_from_bytes_if_needed(email) password_reset_token = decode_object_from_bytes_if_needed(password_reset_token) if not (email and password_reset_token): log.error("Email and / or password reset token is empty.") return False # Fetch readonly information about the user password_reset_token_hash = db.query(""" SELECT auth_users_id, email, password_reset_token_hash FROM auth_users WHERE email = %(email)s LIMIT 1 """, {'email': email}).hash() if password_reset_token_hash is None or 'auth_users_id' not in password_reset_token_hash: log.error("Unable to find user %s in the database." % email) return False password_reset_token_hash = password_reset_token_hash['password_reset_token_hash'] if password_hash_is_valid(password_hash=password_reset_token_hash, password=password_reset_token): return True else: return False
def _get_seeded_content(db: DatabaseHandler, topic_fetch_url: dict) -> typing.Optional[FetchLinkResponse]: """Return content for this url and topic in topic_seed_urls. Arguments: db - db handle topic_fetch_url - topic_fetch_url dict from db Returns: dummy response object """ r = db.query( "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null", {'a': topic_fetch_url['topics_id'], 'b': topic_fetch_url['url']}).flat() if len(r) == 0: return None return FetchLinkResponse( url=topic_fetch_url['url'], is_success=True, code=HTTPStatus.OK.value, message=HTTPStatus.OK.phrase, content=r[0], last_requested_url=topic_fetch_url['url'], )
def lookup_tag(db: DatabaseHandler, tag_name: str) -> Optional[Dict[str, Any]]: """Lookup the tag given the tag_set:tag format.""" tag_name = decode_object_from_bytes_if_needed(tag_name) if not tag_name: log.warning("Tag name is empty.") return None if not re.match(pattern='^([^:]*):([^:]*)$', string=tag_name): log.warning("Unable to parse tag name '{}'.".format(tag_name)) return None tag_set_name, tag = tag_name.split(':') found_tag = db.query(""" SELECT t.* FROM tags AS t, tag_sets AS ts WHERE t.tag_sets_id = ts.tag_sets_id AND t.tag = %(tag)s AND ts.name = %(tag_set_name)s """, {'tag': tag, 'tag_set_name': tag_set_name}).hash() # MC_REWRITE_TO_PYTHON: Perlism if found_tag is None: found_tag = {} return found_tag
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None: """ Extract links from a story and insert them into the topic_links table for the given topic. After the story is processed, set topic_stories.spidered to true for that story. Calls get_links_from_story on each story. Almost all errors are caught by this function saved in topic_stories.link_mine_error. In the case of an error topic_stories.link_mined is also set to true. Arguments: db - db handle story - story dict from db topic - topic dict from db Returns: None """ try: log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name'])) links = get_links_from_story(db, story) for link in links: if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link): log.info("skipping self linked domain url...") continue topic_link = { 'topics_id': topic['topics_id'], 'stories_id': story['stories_id'], 'url': link } db.create('topic_links', topic_link) mediawords.tm.domains.increment_domain_links(db, topic_link) link_mine_error = '' except Exception: link_mine_error = traceback.format_exc() db.query( """ update topic_stories set link_mined = 't', link_mine_error = %(c)s where stories_id = %(a)s and topics_id = %(b)s """, {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
def guess_medium(db: DatabaseHandler, story_url: str) -> dict: """Guess the media source for a story with the given url. The guess is based on a normalized version of the host part of the url. The guess takes into account the duplicate media relationships included in the postgres database through the media.dup_media_id fields. If no appropriate media source exists, this function will create a new one and return it. """ (medium_url, medium_name) = generate_medium_url_and_name_from_url(story_url) medium = lookup_medium(db, medium_url, medium_name) if medium is not None: return medium normalized_medium_url = _normalize_url(medium_url) normalized_story_url = _normalize_url(story_url) all_urls = [normalized_medium_url, medium_url, normalized_story_url, story_url] # avoid conflicts with existing media names and urls that are missed # by the above query b/c of dups feeds or foreign_rss_links medium_name = get_unique_medium_name(db, [medium_name] + all_urls) medium_url = get_unique_medium_url(db, all_urls) # a race condition with another thread can cause this to fail sometimes, but after the medium in the # other process has been created, all should be fine for i in range(_GUESS_MEDIUM_RETRIES): medium_data = {'name': medium_name, 'url': medium_url, 'normalized_url': normalized_medium_url} medium = db.find_or_create('media', medium_data) if medium is not None: break else: time.sleep(1) if medium is None: raise McTopicMediaUniqueException( "Unable to find or create medium for %s / %s" % (medium_name, medium_url)) log.info("add medium: %s / %s / %d" % (medium_name, medium_url, medium['media_id'])) spidered_tag = get_spidered_tag(db) db.find_or_create('media_tags_map', {'media_id': medium['media_id'], 'tags_id': spidered_tag['tags_id']}) return medium
def get_unique_medium_name(db: DatabaseHandler, names: list) -> str: """Return the first name in the names list that does not yet exist for a media source, or None.""" for name in names: name = name[0:MAX_NAME_LENGTH] name_exists = db.query("select 1 from media where lower(name) = lower(%(a)s)", {'a': name}).hash() if name_exists is None: return name raise McTopicMediaUniqueException("Unable to find unique name among names: " + str(names))
def get_spidered_tag(db: DatabaseHandler) -> dict: """Return the spidered:spidered tag dict.""" spidered_tag = db.query( """ select t.* from tags t join tag_sets ts using ( tag_sets_id ) where t.tag = %(a)s and ts.name = %(b)s """, {'a': SPIDERED_TAG_TAG, 'b': SPIDERED_TAG_SET}).hash() if spidered_tag is None: tag_set = db.find_or_create('tag_sets', {'name': SPIDERED_TAG_SET}) spidered_tag = db.find_or_create('tags', {'tag': SPIDERED_TAG_TAG, 'tag_sets_id': tag_set['tag_sets_id']}) return spidered_tag
def get_media_id(db: DatabaseHandler, download: dict) -> int: """Convenience method to get the media_id for the download.""" download = decode_object_from_bytes_if_needed(download) return db.query(""" SELECT media_id FROM feeds WHERE feeds_id = %(feeds_id)s """, {'feeds_id': download['feeds_id']}).hash()['media_id']
def mark_as_processed(db: DatabaseHandler, stories_id: int) -> bool: """Mark the story as processed by inserting an entry into 'processed_stories'. Return True on success.""" # FIXME upsert instead of inserting a potential duplicate if isinstance(stories_id, bytes): stories_id = decode_object_from_bytes_if_needed(stories_id) stories_id = int(stories_id) log.debug("Marking story ID %d as processed..." % stories_id) try: db.insert(table='processed_stories', insert_hash={'stories_id': stories_id}) except Exception as ex: log.warning("Unable to insert story ID %d into 'processed_stories': %s" % (stories_id, str(ex),)) return False else: return True
def is_new(db: DatabaseHandler, story: dict) -> bool: """Return true if this story should be considered new for the given media source. This is used to determine whether to add a new story for a feed item URL. A story is new if no story with the same URL or GUID exists in the same media source and if no story exists with the same title in the same media source in the same calendar day. """ story = decode_object_from_bytes_if_needed(story) if story['title'] == '(no title)': return False db_story = db.query(""" SELECT * FROM stories WHERE guid = %(guid)s AND media_id = %(media_id)s """, { 'guid': story['guid'], 'media_id': story['media_id'], }).hash() if db_story: return False db_story = db.query(""" SELECT 1 FROM stories WHERE md5(title) = md5(%(title)s) AND media_id = %(media_id)s -- We do the goofy " + interval '1 second'" to force postgres to use the stories_title_hash index AND date_trunc('day', publish_date) + interval '1 second' = date_trunc('day', %(publish_date)s::date) + interval '1 second' """, { 'title': story['title'], 'media_id': story['media_id'], 'publish_date': story['publish_date'], }).hash() if db_story: return False return True
def __get_merged_stories_ids(db: DatabaseHandler, stories_ids: List[int], n: int = 0) -> List[int]: """Gor a given set of stories, get all the stories that are source or target merged stories in topic_merged_stories_map. Repeat recursively up to 10 times, or until no new stories are found.""" stories_ids = decode_object_from_bytes_if_needed(stories_ids) # "The crazy load was from a query to our topic_merged_stories_ids to get # url variants. It looks like we have some case of many, many merged story # pairs that are causing that query to make postgres sit on a cpu for a # super long time. There's no good reason to query for ridiculous numbers # of merged stories, so I just arbitrarily capped the number of merged story # pairs to 20 to prevent this query from running away in the future." max_stories = 20 # MC_REWRITE_TO_PYTHON: cast strings to ints # noinspection PyTypeChecker stories_ids = [int(x) for x in stories_ids] if len(stories_ids) == 0: return [] if len(stories_ids) >= max_stories: return stories_ids[0:max_stories - 1] # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples stories_ids_list = ', '.join(str(x) for x in stories_ids) merged_stories_ids = db.query(""" SELECT DISTINCT target_stories_id, source_stories_id FROM topic_merged_stories_map WHERE target_stories_id IN (%(stories_ids_list)s) OR source_stories_id IN (%(stories_ids_list)s) LIMIT %(max_stories)s """ % { 'stories_ids_list': stories_ids_list, 'max_stories': int(max_stories), }).flat() # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result # sets, a scalar on a single item and arrayref on many items if merged_stories_ids is None: merged_stories_ids = [] elif isinstance(merged_stories_ids, int): merged_stories_ids = [merged_stories_ids] merged_stories_ids = [int(x) for x in merged_stories_ids] all_stories_ids = list(set(stories_ids + merged_stories_ids)) if n > 10 or len(stories_ids) == len(all_stories_ids) or len(stories_ids) >= max_stories: return all_stories_ids else: return __get_merged_stories_ids(db=db, stories_ids=all_stories_ids, n=n + 1)
def delete_user(db: DatabaseHandler, email: str) -> None: """Delete user.""" email = decode_object_from_bytes_if_needed(email) if not email: raise McAuthProfileException('Email address is empty.') # Check if user exists try: user_info(db=db, email=email) except Exception: raise McAuthProfileException("User with email address '%s' does not exist." % email) # Delete the user (PostgreSQL's relation will take care of 'auth_users_roles_map') db.query(""" DELETE FROM auth_users WHERE email = %(email)s """, {'email': email})
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict: """Copy story to new medium. Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on. Return the new story. """ story = { 'url': old_story['url'], 'media_id': new_medium['media_id'], 'guid': old_story['guid'], 'publish_date': old_story['publish_date'], 'collect_date': sql_now(), 'description': old_story['description'], 'title': old_story['title'] } story = db.create('stories', story) add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True) db.query( """ insert into stories_tags_map (stories_id, tags_id) select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s """, { 'a': story['stories_id'], 'b': old_story['stories_id'] }) feed = get_spider_feed(db, new_medium) db.create('feeds_stories_map', { 'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id'] }) old_download = db.query( "select * from downloads where stories_id = %(a)s order by downloads_id limit 1", { 'a': old_story['stories_id'] }).hash() download = create_download_for_new_story(db, story, feed) if old_download is not None: try: content = fetch_content(db, old_download) download = store_content(db, download, content) except (McDBIDownloadsException, McAmazonS3StoreException): download_update = dict([ (f, old_download[f]) for f in ['state', 'error_message', 'download_time'] ]) db.update_by_id('downloads', download['downloads_id'], download_update) db.query( """ insert into download_texts (downloads_id, download_text, download_text_length) select %(a)s, dt.download_text, dt.download_text_length from download_texts dt where dt.downloads_id = %(a)s """, {'a': download['downloads_id']}) # noinspection SqlInsertValues db.query( f""" insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language) select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language from story_sentences where stories_id = %(b)s """, {'b': old_story['stories_id']}) return story
def __get_merged_stories_ids(db: DatabaseHandler, stories_ids: List[int], n: int = 0) -> List[int]: """Gor a given set of stories, get all the stories that are source or target merged stories in topic_merged_stories_map. Repeat recursively up to 10 times, or until no new stories are found.""" stories_ids = decode_object_from_bytes_if_needed(stories_ids) # "The crazy load was from a query to our topic_merged_stories_ids to get # url variants. It looks like we have some case of many, many merged story # pairs that are causing that query to make postgres sit on a cpu for a # super long time. There's no good reason to query for ridiculous numbers # of merged stories, so I just arbitrarily capped the number of merged story # pairs to 20 to prevent this query from running away in the future." max_stories = 20 # MC_REWRITE_TO_PYTHON: cast strings to ints # noinspection PyTypeChecker stories_ids = [int(x) for x in stories_ids] if len(stories_ids) == 0: return [] if len(stories_ids) >= max_stories: return stories_ids[0:max_stories - 1] # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples stories_ids_list = ', '.join(str(x) for x in stories_ids) merged_stories_ids = db.query(""" SELECT DISTINCT target_stories_id, source_stories_id FROM topic_merged_stories_map WHERE target_stories_id IN (%(stories_ids_list)s) OR source_stories_id IN (%(stories_ids_list)s) LIMIT %(max_stories)s """ % { 'stories_ids_list': stories_ids_list, 'max_stories': int(max_stories), }).flat() # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result # sets, a scalar on a single item and arrayref on many items if merged_stories_ids is None: merged_stories_ids = [] elif isinstance(merged_stories_ids, int): merged_stories_ids = [merged_stories_ids] merged_stories_ids = [int(x) for x in merged_stories_ids] all_stories_ids = list(set(stories_ids + merged_stories_ids)) if n > 10 or len(stories_ids) == len( all_stories_ids) or len(stories_ids) >= max_stories: return all_stories_ids else: return __get_merged_stories_ids(db=db, stories_ids=all_stories_ids, n=n + 1)
def generate_story(db: DatabaseHandler, url: str, content: str, title: str = None, publish_date: str = None, fallback_date: Optional[str] = None) -> dict: """Add a new story to the database by guessing metadata using the given url and content. This function guesses the medium, feed, title, and date of the story from the url and content. If inserting the story results in a unique constraint error based on media_id and url, return the existing story instead. Arguments: db - db handle url - story url content - story content fallback_date - fallback to this date if the date guesser fails to find a date """ if len(url) < 1: raise McTMStoriesException("url must not be an empty string") url = url[0:MAX_URL_LENGTH] medium = guess_medium(db, url) feed = get_spider_feed(db, medium) spidered_tag = get_spidered_tag(db) if title is None: title = html_title(content, url, MAX_TITLE_LENGTH) story = { 'url': url, 'guid': url, 'media_id': medium['media_id'], 'title': title, 'description': '' } # postgres refuses to insert text values with the null character for field in ('url', 'guid', 'title'): story[field] = re2.sub('\x00', '', story[field]) date_guess = None if publish_date is None: date_guess = guess_date(url, content) story[ 'publish_date'] = date_guess.date if date_guess.found else fallback_date if story['publish_date'] is None: story['publish_date'] = datetime.datetime.now().isoformat() else: story['publish_date'] = publish_date story = add_story(db, story, feed['feeds_id']) db.query( """ insert into stories_tags_map (stories_id, tags_id) select %(a)s, %(b)s where not exists ( select 1 from stories_tags_map where stories_id = %(a)s and tags_id = %(b)s ) """, { 'a': story['stories_id'], 'b': spidered_tag['tags_id'] }) if publish_date is None: assign_date_guess_tag(db, story, date_guess, fallback_date) log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id'])) if story.get('is_new', False): download = create_download_for_new_story(db, story, feed) store_content(db, download, content) _extract_story(story) return story
def _update_tfu_message(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> None: """Update the topic_fetch_url.message field in the database.""" if _USE_TFU_DEBUG_MESSAGES: db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], {'message': message})
def _add_stale_feeds(db: DatabaseHandler) -> None: """Add pending downloads for all stale feeds.""" vars(_add_stale_feeds).setdefault('last_check', 0) if _add_stale_feeds.last_check > time.time() - STALE_FEED_CHECK_INTERVAL: return _add_stale_feeds.last_check = time.time() # If the table doesn't exist, PostgreSQL sends a NOTICE which breaks the "no warnings" unit test db.query('SET client_min_messages=WARNING') db.query('DROP TABLE IF EXISTS feeds_to_queue') db.query('SET client_min_messages=NOTICE') db.query( """ CREATE TEMPORARY TABLE feeds_to_queue AS SELECT feeds_id, url FROM feeds WHERE active = 't' AND url ~ 'https?://' AND ( -- Never attempted last_attempted_download_time IS NULL -- Feed was downloaded more than stale_feed_interval seconds ago OR (last_attempted_download_time < (NOW() - (%(a)s || ' seconds')::interval)) -- (Probably) if a new story comes in every "n" seconds, refetch feed every "n" + 5 minutes OR ( (NOW() > last_attempted_download_time + (last_attempted_download_time - last_new_story_time) + interval '5 minutes') -- "web_page" feeds are to be downloaded only once a week, -- independently from when the last new story comes in from the -- feed (because every "web_page" feed download provides a -- single story) AND type != 'web_page' ) ) """, {'a': STALE_FEED_INTERVAL}) db.query( """ UPDATE feeds SET last_attempted_download_time = NOW() WHERE feeds_id IN (SELECT feeds_id FROM feeds_to_queue) """) downloads = db.query( """ WITH inserted_downloads as ( INSERT INTO downloads (feeds_id, url, host, type, sequence, state, priority, download_time, extracted) SELECT feeds_id, url, LOWER(SUBSTRING(url from '.*://([^/]*)' )), 'feed', 1, 'pending', 0, NOW(), false FROM feeds_to_queue RETURNING * ) select d.*, f.media_id as _media_id from inserted_downloads d join feeds f using (feeds_id) """).hashes() db.query("drop table feeds_to_queue") log.info("added stale feeds: %d" % len(downloads))
def get_and_store_story_stats(db: DatabaseHandler, story: dict) -> FacebookURLStats: """ Get Facebook statistics for story URL, store them in a database. Return statistics object on success, throw exception on failure (updates the stats in the database in any case). :param db: Database handler. :param story: Story dictionary. :return Statistics object. """ story = decode_object_from_bytes_if_needed(story) story_url = story['url'] stats = None thrown_exception = None story_stats = db.query("select * from story_statistics where stories_id = %(a)s", {'a': story['stories_id']}).hash() try: if len(story_stats.get('facebook_api_error', '')) > 0: message ='ignore story %d with error: %s' % (story['stories_id'], story_stats['facebook_api_error']) raise McFacebookSoftFailureException(message) except Exception: pass try: stats = _get_url_stats(url=story_url) except Exception as ex: log.error(f"Statistics can't be fetched for URL '{story_url}': {ex}") thrown_exception = ex db.query(""" INSERT INTO story_statistics ( stories_id, facebook_share_count, facebook_comment_count, facebook_reaction_count, facebook_api_collect_date, facebook_api_error ) VALUES ( %(stories_id)s, %(share_count)s, %(comment_count)s, %(reaction_count)s, NOW(), %(facebook_error)s ) ON CONFLICT (stories_id) DO UPDATE SET facebook_share_count = %(share_count)s, facebook_comment_count = %(comment_count)s, facebook_reaction_count = %(reaction_count)s, facebook_api_collect_date = NOW(), facebook_api_error = %(facebook_error)s """, { 'stories_id': story['stories_id'], 'share_count': stats.share_count if stats else None, 'comment_count': stats.comment_count if stats else None, 'reaction_count': stats.reaction_count if stats else None, 'facebook_error': str(thrown_exception) if thrown_exception else None, }) if thrown_exception: raise thrown_exception else: return stats
def login_with_api_key(db: DatabaseHandler, api_key: str, ip_address: str) -> CurrentUser: """Fetch user object for the API key. Only active users are fetched.""" api_key = decode_object_from_bytes_if_needed(api_key) ip_address = decode_object_from_bytes_if_needed(ip_address) if not api_key: raise McAuthLoginException("API key is undefined.") if not ip_address: # Even if provided API key is the global one, we want the IP address raise McAuthLoginException("IP address is undefined.") api_key_user = db.query(""" SELECT auth_users.email FROM auth_users INNER JOIN auth_user_api_keys ON auth_users.auth_users_id = auth_user_api_keys.auth_users_id WHERE ( auth_user_api_keys.api_key = %(api_key)s AND ( auth_user_api_keys.ip_address IS NULL OR auth_user_api_keys.ip_address = %(ip_address)s ) ) GROUP BY auth_users.auth_users_id, auth_users.email ORDER BY auth_users.auth_users_id LIMIT 1 """, { 'api_key': api_key, 'ip_address': ip_address, }).hash() if api_key_user is None or len(api_key_user) == 0: raise McAuthLoginException("Unable to find user for API key '%s' and IP address '%s'" % (api_key, ip_address,)) email = api_key_user['email'] # Check if user has tried to log in unsuccessfully before and now is trying again too fast if __user_is_trying_to_login_too_soon(db=db, email=email): raise McAuthLoginException( "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email ) user = user_info(db=db, email=email) # Reset password reset token (if any) db.query(""" UPDATE auth_users SET password_reset_token_hash = NULL WHERE email = %(email)s AND password_reset_token_hash IS NOT NULL """, {'email': email}) if not user.active(): raise McAuthLoginException("User '%s' for API key '%s' is not active." % (email, api_key,)) return user
def store_download(self, db: DatabaseHandler, download: dict, content: str) -> List[int]: download = decode_object_from_bytes_if_needed(download) content = decode_object_from_bytes_if_needed(content) downloads_id = download['downloads_id'] log.info(f"Processing feed download {downloads_id}...") try: added_story_ids = self.add_stories_from_feed(db=db, download=download, content=content) story_ids_to_extract = self.add_stories_from_feed( db=db, download=download, content=content) except Exception as ex: error_message = f"Error processing feed for download {downloads_id}: {ex}" log.error(error_message) db.query( """ UPDATE downloads SET state = 'feed_error', error_message = %(error_message)s WHERE downloads_id = %(downloads_id)s """, { 'error_message': error_message, 'downloads_id': downloads_id, }) # On non-soft errors (explicitly hard errors or unknown errors), pass the exception up if not isinstance(ex, McCrawlerFetcherSoftError): raise ex story_ids_to_extract = [] else: if len(added_story_ids): last_new_story_time_sql = 'last_new_story_time = last_attempted_download_time, ' else: last_new_story_time_sql = '' db.query( f""" UPDATE feeds SET {last_new_story_time_sql} last_successful_download_time = GREATEST(last_successful_download_time, %(download_time)s) WHERE feeds_id = %(feeds_id)s """, { 'download_time': download['download_time'], 'feeds_id': download['feeds_id'], }) # If no new stories, just store "(redundant feed)" to save storage space if len(added_story_ids) == 0: content = '(redundant feed)' # Reread the possibly updated download download = db.find_by_id(table='downloads', object_id=downloads_id) # Store the feed in any case store_content(db=db, download=download, content=content) log.info(f"Done processing feed download {downloads_id}") return story_ids_to_extract
def find_dup_story(db: DatabaseHandler, story: dict) -> bool: """Return existing duplicate story within the same media source. Search for a story that is a duplicate of the given story. A story is a duplicate if it shares the same media source and: * has the same normalized title and has a publish_date within the same calendar week * has a normalized guid or url that is the same as the normalized guid or url If a dup story is found, insert the url and guid into the story_urls table. Return the found story or None if no story is found. """ story = decode_object_from_bytes_if_needed(story) if story['title'] == '(no title)': return None urls = _get_story_url_variants(story) db_story = db.query( """ SELECT s.* FROM stories s WHERE (s.guid = any( %(urls)s ) or s.url = any( %(urls)s)) and media_id = %(media_id)s """, { 'urls': urls, 'media_id': story['media_id'], }).hash() if db_story: return db_story # make sure that postgres uses the story_urls_url index db.query( "create temporary table _u as select stories_id from story_urls where url = any( %(a)s )", {'a': urls}) db_story = db.query( "select * from stories s join _u u using ( stories_id ) where media_id = %(a)s order by stories_id limit 1", { 'a': story['media_id'] }).hash() db.query("drop table _u") if db_story: return db_story db_story = db.query( """ SELECT * FROM stories WHERE (md5(title) = md5(%(title)s) OR normalized_title_hash = md5( get_normalized_title( %(title)s, %(media_id)s ) )::uuid) AND media_id = %(media_id)s -- We do the goofy " + interval '1 second'" to force postgres to use the stories_title_hash index AND date_trunc('day', publish_date) + interval '1 second' = date_trunc('day', %(publish_date)s::date) + interval '1 second' """, { 'title': story['title'], 'media_id': story['media_id'], 'publish_date': story['publish_date'], }).hash() if db_story: [ insert_story_urls(db, db_story, u) for u in (story['url'], story['guid']) ] return db_story return None
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: Optional[int] = None) -> None: """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern. Update the following fields in the topic_fetch_urls row: code - the status code of the http response fetch_date - the current time state - one of the FETCH_STATE_* constatnts message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED) stories_id - the id of the story generated from the fetched content, or null if no story created' If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting topic_fetch_urls.stories_id to topic_links.ref_stories_id. If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything. If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of fetching the url. This function catches almost all possible exceptions and stashes them topic_fetch_urls along with a state of FETCH_STATE_PYTHON_ERROR Arguments: db - db handle topic_fetch_urls_id - id of topic_fetch_urls row domain_timeout - pass through to fetch_link Returns: None """ topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id) try: log.info("fetch_link: %s" % topic_fetch_url['url']) _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout) if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']: try_update_topic_link_ref_stories_id(db, topic_fetch_url) if 'stories_id' in topic_fetch_url and topic_fetch_url[ 'stories_id'] is not None: story = db.require_by_id('stories', topic_fetch_url['stories_id']) topic = db.require_by_id('topics', topic_fetch_url['topics_id']) redirect_url = topic_fetch_url['url'] assume_match = topic_fetch_url['assume_match'] if _is_not_topic_story(db, topic_fetch_url): if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match): add_to_topic_stories(db, story, topic) # add redirect_url as a lookup url for the story, if it is different from the story url if not redirect_url == topic_fetch_url['url']: mediawords.dbi.stories.stories.insert_story_urls( db, story, redirect_url) if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']: try_update_topic_link_ref_stories_id(db, topic_fetch_url) except McThrottledDomainException as ex: raise ex except Exception as ex: log.error("Error while fetching URL {}: {}".format( topic_fetch_url, ex)) topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR topic_fetch_url['message'] = traceback.format_exc() log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message'])) db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
def update_user(db: DatabaseHandler, user_updates: ModifyUser) -> None: """Update an existing user.""" if not user_updates: raise McAuthProfileException("Existing user is undefined.") # Check if user exists try: user = user_info(db=db, email=user_updates.email()) except Exception: raise McAuthProfileException('User with email address "%s" does not exist.' % user_updates.email()) db.begin() if user_updates.full_name() is not None: db.query(""" UPDATE auth_users SET full_name = %(full_name)s WHERE email = %(email)s """, { 'full_name': user_updates.full_name(), 'email': user_updates.email(), }) if user_updates.notes() is not None: db.query(""" UPDATE auth_users SET notes = %(notes)s WHERE email = %(email)s """, { 'notes': user_updates.notes(), 'email': user_updates.email(), }) if user_updates.active() is not None: db.query(""" UPDATE auth_users SET active = %(active)s WHERE email = %(email)s """, { 'active': bool(int(user_updates.active())), 'email': user_updates.email(), }) if user_updates.has_consented() is not None: db.query(""" UPDATE auth_users SET has_consented = %(has_consented)s WHERE email = %(email)s """, { 'has_consented': bool(int(user_updates.has_consented())), 'email': user_updates.email(), }) if user_updates.password() is not None: try: change_password( db=db, email=user_updates.email(), new_password=user_updates.password(), new_password_repeat=user_updates.password_repeat(), do_not_inform_via_email=True, ) except Exception as ex: db.rollback() raise McAuthProfileException("Unable to change password: %s" % str(ex)) resource_limits = user_updates.resource_limits() if resource_limits: if resource_limits.weekly_requests() is not None: db.query(""" UPDATE auth_user_limits SET weekly_requests_limit = %(weekly_requests_limit)s WHERE auth_users_id = %(auth_users_id)s """, { 'weekly_requests_limit': resource_limits.weekly_requests(), 'auth_users_id': user.user_id(), }) if resource_limits.weekly_requested_items() is not None: db.query(""" UPDATE auth_user_limits SET weekly_requested_items_limit = %(weekly_requested_items_limit)s WHERE auth_users_id = %(auth_users_id)s """, { 'weekly_requested_items_limit': resource_limits.weekly_requested_items(), 'auth_users_id': user.user_id(), }) if resource_limits.max_topic_stories() is not None: db.query(""" UPDATE auth_user_limits SET max_topic_stories = %(max_topic_stories)s WHERE auth_users_id = %(auth_users_id)s """, { 'max_topic_stories': resource_limits.max_topic_stories(), 'auth_users_id': user.user_id(), }) if user_updates.role_ids() is not None: db.query(""" DELETE FROM auth_users_roles_map WHERE auth_users_id = %(auth_users_id)s """, {'auth_users_id': user.user_id()}) for auth_roles_id in user_updates.role_ids(): db.insert(table='auth_users_roles_map', insert_hash={ 'auth_users_id': user.user_id(), 'auth_roles_id': auth_roles_id, }) db.commit()
def _try_fetch_topic_url(db: DatabaseHandler, topic_fetch_url: dict, domain_timeout: Optional[int] = None) -> None: """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update.""" log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url']) # don't reprocess already processed urls if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED): return _update_tfu_message(db, topic_fetch_url, "checking ignore links") if _ignore_link_pattern(topic_fetch_url['url']): topic_fetch_url['state'] = FETCH_STATE_IGNORED topic_fetch_url['code'] = 403 return _update_tfu_message(db, topic_fetch_url, "checking failed url") failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url']) if failed_url: topic_fetch_url['state'] = failed_url['state'] topic_fetch_url['code'] = failed_url['code'] topic_fetch_url['message'] = failed_url['message'] return _update_tfu_message(db, topic_fetch_url, "checking self linked domain") if skip_self_linked_domain(db, topic_fetch_url): topic_fetch_url['state'] = FETCH_STATE_SKIPPED topic_fetch_url['code'] = 403 return topic = db.require_by_id('topics', topic_fetch_url['topics_id']) topic_fetch_url['fetch_date'] = datetime.datetime.now() story_match = None # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially # spammy 'requeued' requests _update_tfu_message(db, topic_fetch_url, "checking story match") if topic_fetch_url['state'] == FETCH_STATE_PENDING: story_match = get_story_match(db=db, url=topic_fetch_url['url']) # try to match the story before doing the expensive fetch if story_match is not None: topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH topic_fetch_url['code'] = 200 topic_fetch_url['stories_id'] = story_match['stories_id'] return # check whether we want to delay fetching for another job, eg. fetch_twitter_urls pending_state = _get_pending_state(topic_fetch_url) if pending_state: topic_fetch_url['state'] = pending_state return # get content from either the seed or by fetching it _update_tfu_message(db, topic_fetch_url, "checking seeded content") response = _get_seeded_content(db, topic_fetch_url) if response is None: _update_tfu_message(db, topic_fetch_url, "fetching content") response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout) log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url'])) else: log.debug("seeded content found for url: %s" % topic_fetch_url['url']) content = response.content fetched_url = topic_fetch_url['url'] response_url = response.last_requested_url if fetched_url != response_url: if _ignore_link_pattern(response_url): topic_fetch_url['state'] = FETCH_STATE_IGNORED topic_fetch_url['code'] = 403 return _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url") story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url) topic_fetch_url['code'] = response.code assume_match = topic_fetch_url['assume_match'] _update_tfu_message(db, topic_fetch_url, "checking content match") if not response.is_success: topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED topic_fetch_url['message'] = response.message elif story_match is not None: topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH topic_fetch_url['stories_id'] = story_match['stories_id'] elif not content_matches_topic( content=content, topic=topic, assume_match=assume_match): topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED else: try: _update_tfu_message(db, topic_fetch_url, "generating story") url = response_url if response_url is not None else fetched_url story = generate_story(db=db, content=content, url=url) topic_fetch_url['stories_id'] = story['stories_id'] topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED except McTMStoriesDuplicateException: # may get a unique constraint error for the story addition within the media source. that's fine # because it means the story is already in the database and we just need to match it again. _update_tfu_message( db, topic_fetch_url, "checking for story match on unique constraint error") topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url) if story_match is None: raise McTMFetchLinkException( "Unable to find matching story after unique constraint error." ) topic_fetch_url['stories_id'] = story_match['stories_id'] _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
def generate_story( db: DatabaseHandler, url: str, content: str, fallback_date: typing.Optional[datetime.datetime]=None) -> dict: """Add a new story to the database by guessing metadata using the given url and content. This function guesses the medium, feed, title, and date of the story from the url and content. Arguments: db - db handle url - story url content - story content fallback_date - fallback to this date if the date guesser fails to find a date """ if len(url) < 1: raise McTMStoriesException("url must not be an empty string") url = url[0:_MAX_URL_LENGTH] medium = mediawords.tm.media.guess_medium(db, url) feed = get_spider_feed(db, medium) spidered_tag = mediawords.tm.media.get_spidered_tag(db) title = mediawords.util.html.html_title(content, url, _MAX_TITLE_LENGTH) story = { 'url': url, 'guid': url, 'media_id': medium['media_id'], 'title': title, 'description': '' } # postgres refuses to insert text values with the null character for field in ('url', 'guid', 'title'): story[field] = re.sub('\x00', '', story[field]) date_guess = guess_date(url, content) story['publish_date'] = date_guess.date if date_guess.found else fallback_date if story['publish_date'] is None: story['publish_date'] = datetime.datetime.now().isoformat() try: story = db.create('stories', story) except mediawords.db.exceptions.handler.McUniqueConstraintException as e: raise McTMStoriesDuplicateException("Attempt to insert duplicate story url %s" % url) except Exception as e: raise McTMStoriesException("Error adding story: %s" % traceback.format_exc()) db.query( "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)", {'a': story['stories_id'], 'b': spidered_tag['tags_id']}) assign_date_guess_tag(db, story, date_guess, fallback_date) log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id'])) db.create('feeds_stories_map', {'stories_id': story['stories_id'], 'feeds_id': feed['feeds_id']}) download = create_download_for_new_story(db, story, feed) download = mediawords.dbi.downloads.store_content(db, download, content) return story
def _try_fetch_topic_url( db: DatabaseHandler, topic_fetch_url: dict, domain_timeout: Optional[int] = None) -> None: """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update.""" log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...") # don't reprocess already processed urls if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED): log.info(f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching") return log.info("Checking ignore links...") _update_tfu_message(db, topic_fetch_url, "checking ignore links") if _ignore_link_pattern(topic_fetch_url['url']): log.info("Link is to be ignored, returning") topic_fetch_url['state'] = FETCH_STATE_IGNORED topic_fetch_url['code'] = 403 return log.info("Checking failed URL...") _update_tfu_message(db, topic_fetch_url, "checking failed url") failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url']) if failed_url: log.info("URL is failed, returning") topic_fetch_url['state'] = failed_url['state'] topic_fetch_url['code'] = failed_url['code'] topic_fetch_url['message'] = failed_url['message'] return log.info("Checking self-linked domain...") _update_tfu_message(db, topic_fetch_url, "checking self linked domain") if skip_self_linked_domain(db, topic_fetch_url): log.info("Link is self-linked domain, returning") topic_fetch_url['state'] = FETCH_STATE_SKIPPED topic_fetch_url['code'] = 403 return log.info(f"Fetching topic {topic_fetch_url['topics_id']}...") topic = db.require_by_id('topics', topic_fetch_url['topics_id']) topic_fetch_url['fetch_date'] = datetime.datetime.now() story_match = None # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially # spammy 'requeued' requests log.info("Checking story match...") _update_tfu_message(db, topic_fetch_url, "checking story match") if topic_fetch_url['state'] == FETCH_STATE_PENDING: log.info("URL is in pending state, getting story match...") story_match = get_story_match(db=db, url=topic_fetch_url['url']) # try to match the story before doing the expensive fetch if story_match is not None: log.info(f"Matched story {story_match['stories_id']}, returning") topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH topic_fetch_url['code'] = 200 topic_fetch_url['stories_id'] = story_match['stories_id'] return # check whether we want to delay fetching for another job, eg. fetch_twitter_urls log.info("Checking for pending state...") pending_state = _get_pending_state(topic_fetch_url) if pending_state: log.info("URL is in pending state, returning") topic_fetch_url['state'] = pending_state return # get content from either the seed or by fetching it log.info("Checking seeded content...") _update_tfu_message(db, topic_fetch_url, "checking seeded content") response = _get_seeded_content(db, topic_fetch_url) if response is None: log.info("Seeded content found, fetching URL...") _update_tfu_message(db, topic_fetch_url, "fetching content") response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout) log.info(f"{response.code} response returned") else: log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}") content = response.content fetched_url = topic_fetch_url['url'] response_url = response.last_requested_url if fetched_url != response_url: log.info( f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern" ) if _ignore_link_pattern(response_url): log.info("Ignore link pattern matched, returning") topic_fetch_url['state'] = FETCH_STATE_IGNORED topic_fetch_url['code'] = 403 return log.info("Checking story match for redirect URL...") _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url") story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url) topic_fetch_url['code'] = response.code assume_match = topic_fetch_url['assume_match'] log.info("Checking content match...") _update_tfu_message(db, topic_fetch_url, "checking content match") if not response.is_success: log.info("Request failed") topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED topic_fetch_url['message'] = response.message elif story_match is not None: log.info(f"Story {story_match['stories_id']} matched") topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH topic_fetch_url['stories_id'] = story_match['stories_id'] elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match): log.info("Content matched") topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED else: log.info("Nothing matched, generating story...") try: _update_tfu_message(db, topic_fetch_url, "generating story") url = response_url if response_url is not None else fetched_url log.info("Creating story...") story = generate_story(db=db, content=content, url=url) log.info(f"Created story {story['stories_id']}") topic_fetch_url['stories_id'] = story['stories_id'] topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED except McTMStoriesDuplicateException: log.info("Duplicate story found, checking for story match on unique constraint error...") # may get a unique constraint error for the story addition within the media source. that's fine # because it means the story is already in the database and we just need to match it again. _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error") topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url) if story_match is None: message = "Unable to find matching story after unique constraint error." log.error(message) raise McTMFetchLinkException(message) log.info(f"Matched story {story_match['stories_id']}") topic_fetch_url['stories_id'] = story_match['stories_id'] log.info("Done generating story") _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done") log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
def copy_stories_to_topic(db: DatabaseHandler, source_topics_id: int, target_topics_id: int) -> None: """Copy stories from source_topics_id into seed_urls for target_topics_id.""" message = "copy_stories_to_topic: %s -> %s [%s]" % ( source_topics_id, target_topics_id, datetime.datetime.now()) log.info("querying novel urls from source topic...") db.query("set work_mem = '8GB'") db.query( """ create temporary table _stories as select distinct stories_id from topic_seed_urls where topics_id = %(a)s and stories_id is not null; create temporary table _urls as select distinct url from topic_seed_urls where topics_id = %(a)s; """, {'a': target_topics_id}) # noinspection SqlResolve db.query( """ create temporary table _tsu as select %(target)s topics_id, url, stories_id, %(message)s source from snap.live_stories s where s.topics_id = %(source)s and s.stories_id not in ( select stories_id from _stories ) and s.url not in ( select url from _urls ) """, { 'target': target_topics_id, 'source': source_topics_id, 'message': message }) # noinspection SqlResolve (num_inserted, ) = db.query("select count(*) from _tsu").flat() log.info("inserting %d urls ..." % num_inserted) # noinspection SqlInsertValues,SqlResolve db.query( "insert into topic_seed_urls ( topics_id, url, stories_id, source ) select * from _tsu" ) # noinspection SqlResolve db.query("drop table _stories; drop table _urls; drop table _tsu;")
def add_story(db: DatabaseHandler, story: dict, feeds_id: int) -> Optional[dict]: """Return an existing dup story if it matches the url, guid, or title; otherwise, add a new story and return it. Returns found or created story. Adds an is_new = True story if the story was created by the call. """ story = decode_object_from_bytes_if_needed(story) if isinstance(feeds_id, bytes): feeds_id = decode_object_from_bytes_if_needed(feeds_id) feeds_id = int(feeds_id) if db.in_transaction(): raise McAddStoryException( "add_story() can't be run from within transaction.") db.begin() db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE") db_story = find_dup_story(db, story) if db_story: log.debug("found existing dup story: %s [%s]" % (story['title'], story['url'])) db.commit() return db_story medium = db.find_by_id(table='media', object_id=story['media_id']) if story.get('full_text_rss', None) is None: story['full_text_rss'] = medium.get('full_text_rss', False) or False if len(story.get('description', '')) == 0: story['full_text_rss'] = False try: story = db.create(table='stories', insert_hash=story) except Exception as ex: db.rollback() # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint if 'unique constraint \"stories_guid' in str(ex): log.warning( "Failed to add story for '{}' to GUID conflict (guid = '{}')". format(story['url'], story['guid'])) return None else: raise McAddStoryException( "Error adding story: {}\nStory: {}".format( str(ex), str(story))) story['is_new'] = True [insert_story_urls(db, story, u) for u in (story['url'], story['guid'])] # on conflict does not work with partitioned feeds_stories_map db.query( """ insert into feeds_stories_map_p ( feeds_id, stories_id ) select %(a)s, %(b)s where not exists ( select 1 from feeds_stories_map where feeds_id = %(a)s and stories_id = %(b)s ) """, { 'a': feeds_id, 'b': story['stories_id'] }) db.commit() log.debug("added story: %s" % story['url']) return story
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]: """ Parse the feed content; create a story dict for each parsed story; check for a new URL since the last feed download; if there is a new URL, check whether each story is new, and if so add it to the database and add a pending download for it. Return new stories that were found in the feed. """ download = decode_object_from_bytes_if_needed(download) content = decode_object_from_bytes_if_needed(content) media_id = get_media_id(db=db, download=download) download_time = download['download_time'] try: stories = self._get_stories_from_syndicated_feed( content=content, media_id=media_id, download_time=download_time, ) except Exception as ex: raise McCrawlerFetcherSoftError( f"Error processing feed for {download['url']}: {ex}") if stories_checksum_matches_feed(db=db, feeds_id=download['feeds_id'], stories=stories): return [] new_story_ids = [] for story in stories: # FIXME None of the helpers like keys they don't know about story_without_enclosures = story.copy() story_without_enclosures.pop('enclosures') if self._add_content_download_for_new_stories(): added_story = add_story_and_content_download( db=db, story=story_without_enclosures, parent_download=download, ) else: added_story = add_story( db=db, story=story_without_enclosures, feeds_id=download['feeds_id'], ) # We might have received None due to a GUID conflict if added_story: stories_id = added_story['stories_id'] story_is_new = added_story.get('is_new', False) if story_is_new: # Add all of the enclosures for enclosure in story['enclosures']: # ...provided that the URL is set if enclosure['url']: db.query( """ INSERT INTO story_enclosures (stories_id, url, mime_type, length) VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s) -- Some stories have multiple enclosures pointing to the same URL ON CONFLICT (stories_id, url) DO NOTHING """, { 'stories_id': stories_id, 'url': enclosure['url'], 'mime_type': enclosure['mime_type'], 'length': enclosure['length'], }) # Append to the list of newly added storyes new_story_ids.append(stories_id) log.info( f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}" ) return new_story_ids
def get_story_match( db: DatabaseHandler, url: str, redirect_url: typing.Optional[str] = None) -> typing.Optional[dict]: """Search for any story within the database that matches the given url. Searches for any story whose guid or url matches either the url or redirect_url or the mediawords.util.url.normalize_url_lossy() version of either. If multiple stories are found, use get_preferred_story() to decide which story to return. Only mach the first _MAX_URL_LENGTH characters of the url / redirect_url. Arguments: db - db handle url - story url redirect_url - optional url to which the story url redirects Returns: the matched story or None """ u = url[0:_MAX_URL_LENGTH] ru = '' if not ignore_redirect(db, url, redirect_url): ru = redirect_url[0:_MAX_URL_LENGTH] if redirect_url is not None else u nu = mediawords.util.url.normalize_url_lossy(u) nru = mediawords.util.url.normalize_url_lossy(ru) urls = list({u, ru, nu, nru}) # for some reason some rare urls trigger a seq scan on the below query db.query("set enable_seqscan=off") # look for matching stories, ignore those in foreign_rss_links media, only get last # 100 to avoid hanging job trying to handle potentially thousands of matches stories = db.query( """ with matching_stories as ( select distinct(s.*) from stories s join media m on s.media_id = m.media_id where ( ( s.url = any( %(a)s ) ) or ( s.guid = any ( %(a)s ) ) ) and m.foreign_rss_links = false union select distinct(s.*) from stories s join media m on s.media_id = m.media_id join topic_seed_urls csu on s.stories_id = csu.stories_id where csu.url = any ( %(a)s ) and m.foreign_rss_links = false ) select distinct(ms.*) from matching_stories ms order by collect_date desc limit 100 """, { 'a': urls }).hashes() db.query("set enable_seqscan=on") if len(stories) == 0: return None story = get_preferred_story(db, stories) return story
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]: """If the story is new, add story to the database with the feed of the download as story feed. Returns created story or None if story wasn't created. """ story = decode_object_from_bytes_if_needed(story) if isinstance(feeds_id, bytes): feeds_id = decode_object_from_bytes_if_needed(feeds_id) feeds_id = int(feeds_id) if isinstance(skip_checking_if_new, bytes): skip_checking_if_new = decode_object_from_bytes_if_needed( skip_checking_if_new) skip_checking_if_new = bool(int(skip_checking_if_new)) if db.in_transaction(): raise McAddStoryException( "add_story() can't be run from within transaction.") db.begin() db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE") if not skip_checking_if_new: if not is_new(db=db, story=story): log.debug("Story '{}' is not new.".format(story['url'])) db.commit() return None medium = db.find_by_id(table='media', object_id=story['media_id']) if story.get('full_text_rss', None) is None: story['full_text_rss'] = medium.get('full_text_rss', False) or False if len(story.get('description', '')) == 0: story['full_text_rss'] = False try: story = db.create(table='stories', insert_hash=story) except Exception as ex: db.rollback() # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint if 'unique constraint \"stories_guid' in str(ex): log.warning( "Failed to add story for '{}' to GUID conflict (guid = '{}')". format(story['url'], story['guid'])) return None else: raise McAddStoryException( "Error adding story: {}\nStory: {}".format( str(ex), str(story))) db.find_or_create(table='feeds_stories_map', insert_hash={ 'stories_id': story['stories_id'], 'feeds_id': feeds_id, }) db.commit() return story
def lookup_medium(db: DatabaseHandler, url: str, name: str) -> typing.Optional[dict]: """Lookup a media source by normalized url and then name. Uses normalize_url_lossy() to normalize urls. Returns the parent media for duplicate media sources and returns no media that are marked foreign_rss_links. This function queries the media.normalized_url field to find the matching urls. Because the normalization function is in python, we have to keep that denormalized_url field current from within python. This function is responsible for keeping the table up to date by filling the field for any media for which it is null. Arguments: db - db handle url - url to lookup name - name to lookup Returns: a media source dict or None """ _update_media_normalized_urls(db) normalized_url = _normalize_url(url) medium = db.query( """ SELECT * FROM media where normalized_url = %(a)s AND foreign_rss_links = 'f' ORDER BY dup_media_id NULLS LAST, media_id """, { 'a': normalized_url }).hash() if medium is None: medium = db.query( """ SELECT * FROM media WHERE lower(name) = lower(%(a)s) AND foreign_rss_links = false """, { 'a': name }).hash() if medium is None: return None if medium['dup_media_id'] is not None: media_cycle_lookup = dict() # type: dict while medium['dup_media_id'] is not None: if medium['media_id'] in media_cycle_lookup: raise McTopicMediaException( 'Cycle found in duplicate media path: ' + str(media_cycle_lookup.keys())) media_cycle_lookup[medium['media_id']] = True medium = db.query("select * from media where media_id = %(a)s", { 'a': medium['dup_media_id'] }).hash() if medium['foreign_rss_links']: raise McTopicMediaException( 'Parent duplicate media source %d has foreign_rss_links' % medium['media_id']) return medium
def login_with_email_password(db: DatabaseHandler, email: str, password: str, ip_address: str = None) -> CurrentUser: """Log in with username and password; raise on unsuccessful login.""" email = decode_object_from_bytes_if_needed(email) password = decode_object_from_bytes_if_needed(password) if not (email and password): raise McAuthLoginException("Email and password must be defined.") # Try-except block because we don't want to reveal the specific reason why the login has failed try: user = user_info(db=db, email=email) # Check if user has tried to log in unsuccessfully before and now is trying # again too fast if __user_is_trying_to_login_too_soon(db=db, email=email): raise McAuthLoginException( "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email ) if not password_hash_is_valid(password_hash=user.password_hash(), password=password): raise McAuthLoginException("Password for user '%s' is invalid." % email) except Exception as ex: log.info( "Login failed for %(email)s, will delay any successive login attempt for %(delay)d seconds: %(exc)s" % { 'email': email, 'delay': __POST_UNSUCCESSFUL_LOGIN_DELAY, 'exc': str(ex), } ) # Set the unsuccessful login timestamp # (TIMESTAMP 'now' returns "current transaction's start time", so using LOCALTIMESTAMP instead) db.query(""" UPDATE auth_users SET last_unsuccessful_login_attempt = LOCALTIMESTAMP WHERE email = %(email)s """, {'email': email}) # It might make sense to time.sleep() here for the duration of $POST_UNSUCCESSFUL_LOGIN_DELAY seconds to prevent # legitimate users from trying to log in too fast. However, when being actually brute-forced through multiple # HTTP connections, this approach might end up creating a lot of processes that would time.sleep() and take up # memory. # # So, let's return the error page ASAP and hope that a legitimate user won't be able to reenter his / her # password before the $POST_UNSUCCESSFUL_LOGIN_DELAY amount of seconds pass. # Don't give out a specific reason for the user to not be able to find # out which user emails are registered raise McAuthLoginException("User '%s' was not found or password is incorrect." % email) if not user.active(): raise McAuthLoginException("User with email '%s' is not active." % email) # Reset password reset token (if any) db.query(""" UPDATE auth_users SET password_reset_token_hash = NULL WHERE email = %(email)s AND password_reset_token_hash IS NOT NULL """, {'email': email}) if ip_address: if not user.api_key_for_ip_address(ip_address): db.create( table='auth_user_api_keys', insert_hash={ 'auth_users_id': user.user_id(), 'ip_address': ip_address, }) # Fetch user again user = user_info(db=db, email=email) if not user.api_key_for_ip_address(ip_address): raise McAuthLoginException("Unable to create per-IP API key for IP %s" % ip_address) return user
def add_user(db: DatabaseHandler, new_user: NewUser) -> None: """Add new user.""" if not new_user: raise McAuthRegisterException("New user is undefined.") # Check if user already exists user_exists = db.query( """ SELECT auth_users_id FROM auth_users WHERE email = %(email)s LIMIT 1 """, { 'email': new_user.email() }).hash() if user_exists is not None and 'auth_users_id' in user_exists: raise McAuthRegisterException("User with email '%s' already exists." % new_user.email()) # Hash + validate the password try: password_hash = generate_secure_hash(password=new_user.password()) if not password_hash: raise McAuthRegisterException("Password hash is empty.") except Exception as _: raise McAuthRegisterException('Unable to hash a new password.') db.begin() # Create the user db.create(table='auth_users', insert_hash={ 'email': new_user.email(), 'password_hash': password_hash, 'full_name': new_user.full_name(), 'notes': new_user.notes(), 'active': bool(int(new_user.active())), }) # Fetch the user's ID try: user = user_info(db=db, email=new_user.email()) except Exception as ex: db.rollback() raise McAuthRegisterException( "I've attempted to create the user but it doesn't exist: %s" % str(ex)) # Create roles try: for auth_roles_id in new_user.role_ids(): db.create(table='auth_users_roles_map', insert_hash={ 'auth_users_id': user.user_id(), 'auth_roles_id': auth_roles_id, }) except Exception as ex: raise McAuthRegisterException("Unable to create roles: %s" % str(ex)) # Update limits (if they're defined) if new_user.weekly_requests_limit() is not None: db.query( """ UPDATE auth_user_limits SET weekly_requests_limit = %(weekly_requests_limit)s WHERE auth_users_id = %(auth_users_id)s """, { 'auth_users_id': user.user_id(), 'weekly_requests_limit': new_user.weekly_requests_limit(), }) if new_user.weekly_requested_items_limit() is not None: db.query( """ UPDATE auth_user_limits SET weekly_requested_items_limit = %(weekly_requested_items_limit)s WHERE auth_users_id = %(auth_users_id)s """, { 'auth_users_id': user.user_id(), 'weekly_requested_items_limit': new_user.weekly_requested_items_limit(), }) # Subscribe to newsletter if new_user.subscribe_to_newsletter(): db.create(table='auth_users_subscribe_to_newsletter', insert_hash={'auth_users_id': user.user_id()}) if not new_user.active(): send_user_activation_token( db=db, email=new_user.email(), activation_link=new_user.activation_url(), subscribe_to_newsletter=new_user.subscribe_to_newsletter(), ) db.commit()
def get_preferred_story(db: DatabaseHandler, stories: list) -> dict: """Given a set of possible story matches, find the story that is likely the best to include in the topic. The best story is the one that belongs to the media source that sorts first according to the following criteria, in descending order of importance: * pointed to by some dup_media_id * without a dup_media_id * url domain matches that of the story * lower media_id Within a media source, the preferred story is the one with the most sentences. Arguments: db - db handle url - url of matched story redirect_url - redirect_url of matched story stories - list of stories from which to choose Returns: a single preferred story """ assert len(stories) > 0 if len(stories) == 1: return stories[0] log.debug("get_preferred_story: %d stories" % len(stories)) media = db.query( """ select *, exists ( select 1 from media d where d.dup_media_id = m.media_id ) as is_dup_target from media m where media_id = any(%(a)s) """, { 'a': [s['media_id'] for s in stories] }).hashes() story_urls = [s['url'] for s in stories] for medium in media: # is_dup_target defined in query above medium['is_dup_target'] = 0 if medium['is_dup_target'] else 1 medium['is_not_dup_source'] = 1 if medium['dup_media_id'] else 0 medium['matches_domain'] = 0 if _url_domain_matches_medium( medium, story_urls) else 1 medium['stories'] = list( filter(lambda s: s['media_id'] == medium['media_id'], stories)) sorted_media = sorted(media, key=operator.itemgetter('is_dup_target', 'is_not_dup_source', 'matches_domain', 'media_id')) preferred_story = _get_story_with_most_sentences( db, sorted_media[0]['stories']) return preferred_story
def _update_media_normalized_urls(db: DatabaseHandler) -> None: """Keep media_normalized_urls table up to date. This function compares the media and versions in media_normalized_urls against the version returned by mediawords.util.url.normalize_url_lossy_version() and updates or inserts rows for any media that do not have up to date versions. """ if not _normalized_urls_out_of_date(db): return # put a lock on this because the process of generating all media urls will take around 30 seconds, and we don't # want all workers to do the work db.begin() db.query("lock media_normalized_urls in access exclusive mode") if not _normalized_urls_out_of_date(db): db.commit() return log.warning("updating media_normalized_urls ...") version = mediawords.util.url.normalize_url_lossy_version() media = db.query( """ select m.* from media m left join media_normalized_urls u on ( m.media_id = u.media_id and u.normalize_url_lossy_version = %(a)s) where u.normalized_url is null or u.db_row_last_updated < m.db_row_last_updated """, { 'a': version }).hashes() i = 0 total = len(media) for medium in media: i += 1 normalized_url = mediawords.util.url.normalize_url_lossy(medium['url']) if normalized_url is None: normalized_url = medium['url'] log.info("[%d/%d] adding %s (%s)" % (i, total, medium['name'], normalized_url)) db.query( "delete from media_normalized_urls where media_id = %(a)s and normalize_url_lossy_version = %(b)s", { 'a': medium['media_id'], 'b': version }) db.create( 'media_normalized_urls', { 'media_id': medium['media_id'], 'normalized_url': normalized_url, 'normalize_url_lossy_version': version }) db.commit()
def lookup_medium(db: DatabaseHandler, url: str, name: str) -> typing.Optional[dict]: """Lookup a media source by normalized url and then name. Uses mediawords.util.url.normalize_url_lossy to normalize urls. Returns the parent media for duplicate media sources and returns no media that are marked foreign_rss_links. This function queries the media_normalized_urls table to find the matching urls. Because the normalization function is in python, we have to keep that denormalized table current from within python. This function is responsible for keeping the table up to date by comparing the normalize_url_lossy_version values in the table with the current return value of mediawords.util.url.normalize_url_lossy_version(). Arguments: db - db handle url - url to lookup name - name to lookup Returns: a media source dict or None """ _update_media_normalized_urls(db) nu = _normalize_url(url) version = mediawords.util.url.normalize_url_lossy_version() lookup_query = \ """ select m.* from media m join media_normalized_urls u using ( media_id ) where u.normalized_url = %(a)s and u.normalize_url_lossy_version = %(b)s and foreign_rss_links = 'f' order by dup_media_id asc nulls last, media_id asc """ medium = db.query(lookup_query, {'a': nu, 'b': version}).hash() if medium is None: medium = db.query( "select m.* from media m where lower(m.name) = lower(%(a)s) and m.foreign_rss_links = false", { 'a': name }).hash() if medium is None: return None if medium['dup_media_id'] is not None: media_cycle_lookup = dict() # type: dict while medium['dup_media_id'] is not None: if medium['media_id'] in media_cycle_lookup: raise McTopicMediaException( 'Cycle found in duplicate media path: ' + str(media_cycle_lookup.keys())) media_cycle_lookup[medium['media_id']] = True medium = db.query("select * from media where media_id = %(a)s", { 'a': medium['dup_media_id'] }).hash() if medium['foreign_rss_links']: raise McTopicMediaException( 'Parent duplicate media source %d has foreign_rss_links' % medium['media_id']) return medium
def _insert_story_sentences( db: DatabaseHandler, story: dict, sentences: List[str], no_dedup_sentences: bool = False, ) -> List[str]: """Insert the story sentences into story_sentences, optionally skipping duplicate sentences by setting is_dup = 't' to the found duplicates that are already in the table. Returns list of sentences that were inserted into the table. """ story = decode_object_from_bytes_if_needed(story) sentences = decode_object_from_bytes_if_needed(sentences) if isinstance(no_dedup_sentences, bytes): no_dedup_sentences = decode_object_from_bytes_if_needed( no_dedup_sentences) no_dedup_sentences = bool(int(no_dedup_sentences)) stories_id = story['stories_id'] media_id = story['media_id'] # Story's publish date is the same for all the sentences, so we might as well pass it as a constant escaped_story_publish_date = db.quote_date(story['publish_date']) if len(sentences) == 0: log.warning(f"Story sentences are empty for story {stories_id}") return [] if no_dedup_sentences: log.debug( f"Won't de-duplicate sentences for story {stories_id} because 'no_dedup_sentences' is set" ) dedup_sentences_statement = """ -- Nothing to deduplicate, return empty list SELECT NULL WHERE 1 = 0 """ else: # Limit to unique sentences within a story sentences = _get_unique_sentences_in_story(sentences) # Set is_dup = 't' to sentences already in the table, return those to be later skipped on INSERT of new # sentences dedup_sentences_statement = f""" -- noinspection SqlResolve UPDATE story_sentences SET is_dup = 't' FROM new_sentences WHERE public.half_md5(story_sentences.sentence) = public.half_md5(new_sentences.sentence) AND public.week_start_date(story_sentences.publish_date::date) = public.week_start_date({escaped_story_publish_date}) AND story_sentences.media_id = new_sentences.media_id RETURNING story_sentences.sentence """ # Convert to list of dicts (values escaped for insertion into database) sentence_dicts = _get_db_escaped_story_sentence_dicts(db=db, story=story, sentences=sentences) # Ordered list of columns story_sentences_columns = sorted(sentence_dicts[0].keys()) str_story_sentences_columns = ', '.join(story_sentences_columns) # List of sentences (in predefined column order) new_sentences_sql = [] for sentence_dict in sentence_dicts: new_sentence_sql = [] for column in story_sentences_columns: new_sentence_sql.append(sentence_dict[column]) new_sentences_sql.append(f"({', '.join(new_sentence_sql)})") str_new_sentences_sql = "\n{}".format(",\n".join(new_sentences_sql)) # sometimes the big story_sentences query below deadlocks sticks in an idle state, holding this lock so we set a # short idle timeout for postgres just while we do this query. the timeout should not kick in while the # big story_sentences query is actively processing, so we can set it pretty short. we usually set this timeout # to 0 globally, but just to be safe store and reset the pre-existing value. idle_timeout = db.query( "SHOW idle_in_transaction_session_timeout").flat()[0] db.query("SET idle_in_transaction_session_timeout = 5000") db.query('SET citus.max_adaptive_executor_pool_size TO 64') sql = f""" -- noinspection SqlType,SqlResolve WITH new_sentences ({str_story_sentences_columns}) AS (VALUES -- New sentences to potentially insert {str_new_sentences_sql} ) -- Either list of duplicate sentences already found in the table or return an empty list if deduplication is -- disabled -- -- The query assumes that there are no existing sentences for this story in the "story_sentences" table, so -- if you are reextracting a story, DELETE its sentences from "story_sentences" before running this query. {dedup_sentences_statement} """ log.debug(f"Running 'UPDATE story_sentences SET is_dup' query:\n{sql}") duplicate_sentences = db.query(sql).flat() duplicate_sentences = [ db.quote_varchar(sentence) for sentence in duplicate_sentences ] sql = f""" -- noinspection SqlType,SqlResolve WITH new_sentences ({str_story_sentences_columns}) AS (VALUES {str_new_sentences_sql} ), duplicate_sentences AS ( SELECT unnest(ARRAY[{', '.join(duplicate_sentences)}]::TEXT[]) AS sentence ) INSERT INTO story_sentences (language, media_id, publish_date, sentence, sentence_number, stories_id) SELECT language, media_id, publish_date, sentence, sentence_number, stories_id FROM new_sentences WHERE sentence NOT IN ( -- Skip the ones for which we've just set is_dup = 't' SELECT sentence FROM duplicate_sentences ) RETURNING story_sentences.sentence """ log.debug(f"Running 'INSERT INTO story_sentences' query:\n{sql}") inserted_sentences = db.query(sql).flat() db.query("SET idle_in_transaction_session_timeout = %(a)s", {'a': idle_timeout}) return inserted_sentences
def _try_fetch_topic_url(db: DatabaseHandler, topic_fetch_url: dict, domain_timeout: typing.Optional[int] = None) -> None: """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update.""" log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url']) # don't reprocess already processed urls if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED): return if re.search(mediawords.tm.extract_story_links.IGNORE_LINK_PATTERN, topic_fetch_url['url'], flags=re.I) is not None: topic_fetch_url['state'] = FETCH_STATE_IGNORE topic_fetch_url['code'] = 403 return log.warning("PASSED IGNORE") failed_url = get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url']) if failed_url: topic_fetch_url['state'] = failed_url['state'] topic_fetch_url['code'] = failed_url['code'] topic_fetch_url['message'] = failed_url['message'] return topic = db.require_by_id('topics', topic_fetch_url['topics_id']) topic_fetch_url['fetch_date'] = datetime.datetime.now() story_match = None # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially # spammy 'requeued' requests if topic_fetch_url['state'] == FETCH_STATE_PENDING: story_match = mediawords.tm.stories.get_story_match( db=db, url=topic_fetch_url['url']) # try to match the story before doing the expensive fetch if story_match is not None: topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH topic_fetch_url['code'] = 200 topic_fetch_url['stories_id'] = story_match['stories_id'] return # get content from either the seed or by fetching it response = get_seeded_content(db, topic_fetch_url) if response is None: response = fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout) log.debug("%d response returned for url: %s" % (response.code(), topic_fetch_url['url'])) else: log.debug("seeded content found for url: %s" % topic_fetch_url['url']) content = response.decoded_content() fetched_url = topic_fetch_url['url'] response_url = response.request().url() if response.request() else None if fetched_url != response_url: story_match = mediawords.tm.stories.get_story_match( db=db, url=fetched_url, redirect_url=response_url) topic_fetch_url['code'] = response.code() if not response.is_success(): topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED topic_fetch_url['message'] = response.message() elif story_match is not None: topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH topic_fetch_url['stories_id'] = story_match['stories_id'] elif not content_matches_topic( content=content, topic=topic, assume_match=topic_fetch_url['assume_match']): topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED else: try: url = response_url if response_url is not None else fetched_url story = mediawords.tm.stories.generate_story(db=db, content=content, url=url) topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED topic_fetch_url['stories_id'] = story['stories_id'] except mediawords.tm.stories.McTMStoriesDuplicateException: # may get a unique constraint error for the story addition within the media source. that's fine # because it means the story is already in the database and we just need to match it again. topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH story_match = mediawords.tm.stories.get_story_match( db=db, url=fetched_url, redirect_url=response_url) if story_match is None: raise McTMFetchLinkException( "Unable to find matching story after unique constraint error." ) topic_fetch_url['stories_id'] = story_match['stories_id']
def update_story_sentences_and_language( db: DatabaseHandler, story: dict, extractor_args: PyExtractorArguments = PyExtractorArguments() ) -> None: """Update story vectors for the given story, updating "story_sentences". If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating new ones (useful for optimization if you are very sure no story vectors exist for this story). If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are reprocessing a small set of stories). """ story = decode_object_from_bytes_if_needed(story) use_transaction = not db.in_transaction() if use_transaction: db.begin() stories_id = story['stories_id'] if not extractor_args.no_delete(): _delete_story_sentences(db=db, story=story) story_text = story.get('story_text', None) if not story_text: story_text = get_text_for_word_counts(db=db, story=story) if not story_text: story_text = '' story_lang = language_code_for_text(text=story_text) sentences = _get_sentences_from_story_text(story_text=story_text, story_lang=story_lang) if (not story.get('language', None)) or story.get('language', None) != story_lang: db.query( """ UPDATE stories SET language = %(story_lang)s WHERE stories_id = %(stories_id)s """, { 'stories_id': stories_id, 'story_lang': story_lang }) story['language'] = story_lang if sentences is None: raise McUpdateStorySentencesAndLanguageException( "Sentences for story {} are undefined.".format(stories_id)) if len(sentences) == 0: log.debug("Story {} doesn't have any sentences.".format(stories_id)) return sentences = _clean_sentences(sentences) _insert_story_sentences( db=db, story=story, sentences=sentences, no_dedup_sentences=extractor_args.no_dedup_sentences(), ) story['ap_syndicated'] = _update_ap_syndicated( db=db, stories_id=stories_id, story_title=story['title'], story_text=story_text, story_language=story_lang, ) if use_transaction: db.commit()
def _fetch_tweets_for_day(db: DatabaseHandler, twitter_class: typing.Type[AbstractTwitter], topic: dict, topic_tweet_day: dict, max_tweets: typing.Optional[int] = None) -> None: """ Fetch tweets for a single day. If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying the list of tweets from CH and then fetching each tweet from twitter. Arguments: db - db handle twitter_class - AbstractTwitter class topic - topic dict topic_tweet_day - topic_tweet_day dict max_tweets - max tweets to fetch for a single day Return: None """ if topic_tweet_day['tweets_fetched']: return ch_posts_data = topic_tweet_day['ch_posts'] ch_posts = ch_posts_data['posts'] if (max_tweets is not None): ch_posts = ch_posts[0:max_tweets] log.info("adding %d tweets for topic %s, day %s" % (len(ch_posts), topic['topics_id'], topic_tweet_day['day'])) # we can only get 100 posts at a time from twitter for i in range(0, len(ch_posts), 100): _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100]) ch_posts = list(filter(lambda p: _post_matches_pattern(topic, p), ch_posts)) log.info("%d tweets remaining after match" % (len(ch_posts))) db.begin() log.debug("inserting into topic_tweets ...") [ _store_tweet_and_urls(db, topic_tweet_day, ch_post) for ch_post in ch_posts ] topic_tweet_day['num_ch_tweets'] = len(ch_posts) db.query( "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s", { 'a': topic_tweet_day['num_ch_tweets'], 'b': topic_tweet_day['topic_tweet_days_id'] }) db.commit() log.debug("done inserting into topic_tweets")
def user_info(db: DatabaseHandler, email: str) -> CurrentUser: """Fetch user information (email, full name, notes, API keys, password hash). Raises on error if user is not found. """ email = decode_object_from_bytes_if_needed(email) if not email: raise McAuthInfoException("User email is not defined.") # Fetch read-only information about the user user = db.query( """ SELECT auth_users.auth_users_id, auth_users.email, auth_users.full_name, auth_users.notes, EXTRACT(EPOCH FROM NOW())::BIGINT AS created_timestamp, auth_users.active, auth_users.password_hash, auth_user_api_keys.api_key, auth_user_api_keys.ip_address, weekly_requests_sum, weekly_requested_items_sum, auth_user_limits.weekly_requests_limit, auth_user_limits.weekly_requested_items_limit, auth_roles.auth_roles_id, auth_roles.role FROM auth_users INNER JOIN auth_user_api_keys ON auth_users.auth_users_id = auth_user_api_keys.auth_users_id INNER JOIN auth_user_limits ON auth_users.auth_users_id = auth_user_limits.auth_users_id LEFT JOIN auth_users_roles_map ON auth_users.auth_users_id = auth_users_roles_map.auth_users_id LEFT JOIN auth_roles ON auth_users_roles_map.auth_roles_id = auth_roles.auth_roles_id, auth_user_limits_weekly_usage( %(email)s ) WHERE auth_users.email = %(email)s """, { 'email': email }).hashes() if user is None or len(user) == 0: raise McAuthInfoException("User with email '%s' was not found." % email) unique_api_keys = dict() unique_roles = dict() for row in user: # Should have at least one API key unique_api_keys[row['api_key']] = row['ip_address'] # Might have some roles if row['auth_roles_id'] is not None: unique_roles[row['auth_roles_id']] = row['role'] api_keys = [] for api_key in sorted(unique_api_keys.keys()): api_keys.append( APIKey(api_key=api_key, ip_address=unique_api_keys[api_key])) roles = [] for role_id in sorted(unique_roles.keys()): roles.append(Role(role_id=role_id, role_name=unique_roles[role_id])) first_row = user[0] return CurrentUser( user_id=first_row['auth_users_id'], email=email, full_name=first_row['full_name'], notes=first_row['notes'], created_timestamp=first_row['created_timestamp'], active=bool(int(first_row['active'])), password_hash=first_row['password_hash'], roles=roles, api_keys=api_keys, weekly_requests_limit=first_row['weekly_requests_limit'], weekly_requested_items_limit=first_row['weekly_requested_items_limit'], weekly_requests_sum=first_row['weekly_requests_sum'], weekly_requested_items_sum=first_row['weekly_requested_items_sum'], )