def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Runs in its own transaction (raises if already inside one) and takes a ROW EXCLUSIVE lock on
    "stories" so the is_new() duplicate check and the INSERT are atomic with respect to other
    story adders.

    :param db: database handle
    :param story: story dict (expects at least 'url', 'guid', 'media_id', 'title')
    :param feeds_id: feed the story was downloaded from; recorded in feeds_stories_map
    :param skip_checking_if_new: if True, skip the is_new() duplicate check
    :return: created story dict, or None if story wasn't created (duplicate / GUID conflict)
    :raises McAddStoryException: when called inside a transaction or on unexpected insert errors
    """
    story = decode_object_from_bytes_if_needed(story)

    # MC_REWRITE_TO_PYTHON-style coercions: arguments may arrive as bytes from the Perl bridge
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    # We manage our own transaction below, so a pre-existing one would break commit/rollback logic
    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    # Serialize concurrent add_story() calls against the duplicate check below
    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    # Default the story's full_text_rss flag from its medium; force it off for empty descriptions
    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    # Record which feed this story came from
    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
def create(db: DatabaseHandler, download: dict, extract: dict) -> dict:
    """Create a download_text hash and insert it into the database.

    Delete any existing download_text row for the download.
    """
    # FIXME don't pass freeform "extract" dict, we need just the "extracted_text"

    download = decode_object_from_bytes_if_needed(download)
    extract = decode_object_from_bytes_if_needed(extract)

    downloads_id = download['downloads_id']

    # Drop any stale text for this download before inserting the fresh one
    db.query("""
        DELETE FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': downloads_id})

    # Insert the extracted text; length is computed server-side
    download_text = db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        RETURNING *
    """, {
        'downloads_id': downloads_id,
        'download_text': extract['extracted_text'],
    }).hash()

    # Mark the parent download as extracted
    db.query("""
        UPDATE downloads
        SET extracted = 't'
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': downloads_id})

    return download_text
def update_extractor_version_tag(db: DatabaseHandler, story: dict) -> None:
    """Add extractor version tag to the story.

    Removes any previous tag from the extractor-version tag set for this story, then attaches a
    tag named after the current extractor.
    """
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will
    # not be recreated. Purging such a cache manually is very error-prone.

    story = decode_object_from_bytes_if_needed(story)

    tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})
    tag_sets_id = tag_set['tag_sets_id']
    stories_id = story['stories_id']

    # Clear any existing extractor-version tag for this story
    db.query("""
        DELETE FROM stories_tags_map AS stm
            USING tags AS t
                JOIN tag_sets AS ts
                    ON ts.tag_sets_id = t.tag_sets_id
        WHERE t.tags_id = stm.tags_id
          AND ts.tag_sets_id = %(tag_sets_id)s
          AND stm.stories_id = %(stories_id)s
    """, {
        'tag_sets_id': tag_sets_id,
        'stories_id': stories_id,
    })

    # Attach a tag named after the current extractor version
    extractor_version = extractor_name()
    version_tag = db.find_or_create(
        table='tags',
        insert_hash={'tag': extractor_version, 'tag_sets_id': tag_sets_id},
    )

    db.query("""
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
    """, {'stories_id': stories_id, 'tags_id': version_tag['tags_id']})
def get_links_from_story_text(db: DatabaseHandler, story: dict) -> typing.List[str]:
    """Get all urls that appear in the text or description of the story using a simple regex."""
    download_ids = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': story['stories_id']}
    ).flat()

    download_texts = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = ANY(%(download_ids)s)
        ORDER BY download_texts_id
    """, {'download_ids': download_ids}
    ).hashes()

    # Concatenate all extracted text plus title and description (when present)
    story_text = ' '.join(dt['download_text'] for dt in download_texts)
    if story['title'] is not None:
        story_text += ' ' + str(story['title'])
    if story['description'] is not None:
        story_text += ' ' + str(story['description'])

    # Pull out anything that looks like an http(s) url, trimming trailing punctuation
    return [re.sub(r'\W+$', '', match) for match in re.findall(r'https?://[^\s\")]+', story_text)]
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        twitter_class: typing.Type[AbstractTwitter],
        topic: dict,
        topic_tweet_day: dict,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given
    day by querying the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']
    ch_posts = ch_posts_data['posts']

    if max_tweets is not None:
        ch_posts = ch_posts[0:max_tweets]

    log.info("adding %d tweets for topic %s, day %s" % (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    # Keep only the posts that match the topic's pattern
    ch_posts = list(filter(lambda p: _post_matches_pattern(topic, p), ch_posts))

    log.info("%d tweets remaining after match" % (len(ch_posts)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    # Fixed: was a list comprehension executed purely for its side effects; a plain loop
    # expresses the intent without building a throwaway list.
    for ch_post in ch_posts:
        _store_tweet_and_urls(db, topic_tweet_day, ch_post)

    topic_tweet_day['num_ch_tweets'] = len(ch_posts)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {'a': topic_tweet_day['num_ch_tweets'], 'b': topic_tweet_day['topic_tweet_days_id']})

    db.commit()

    log.debug("done inserting into topic_tweets")
def lookup_medium(db: DatabaseHandler, url: str, name: str) -> typing.Optional[dict]:
    """Lookup a media source by normalized url and then name.

    Uses mediawords.util.url.normalize_url_lossy to normalize urls.  Returns the parent media for
    duplicate media sources and returns no media that are marked foreign_rss_links.

    This function queries the media.normalized_url field to find the matching urls.  Because the
    normalization function is in python, we have to keep that denormalized_url field current from
    within python.  This function is responsible for keeping the table up to date by filling the
    field for any media for which it is null.

    Arguments:
    db - db handle
    url - url to lookup
    name - name to lookup

    Returns:
    a media source dict or None

    Raises:
    McTopicMediaException - on a cycle in the dup_media_id chain, or when the resolved parent
    medium itself has foreign_rss_links set
    """
    # Backfill media.normalized_url for any rows where it is still null
    _update_media_normalized_urls(db)

    nu = _normalize_url(url)

    # Prefer non-duplicate media (dup_media_id null sorts last), tie-break on lowest media_id
    lookup_query = \
        """
        select m.*
            from media m
            where m.normalized_url = %(a)s and foreign_rss_links = 'f'
            order by dup_media_id asc nulls last, media_id asc
        """

    medium = db.query(lookup_query, {'a': nu}).hash()

    # Fall back to a case-insensitive name match
    if medium is None:
        medium = db.query(
            "select m.* from media m where lower(m.name) = lower(%(a)s) and m.foreign_rss_links = false",
            {'a': name}).hash()

    if medium is None:
        return None

    # Follow the dup_media_id chain up to the parent medium, guarding against cycles
    if medium['dup_media_id'] is not None:

        media_cycle_lookup = dict()  # type: dict
        while medium['dup_media_id'] is not None:
            if medium['media_id'] in media_cycle_lookup:
                raise McTopicMediaException('Cycle found in duplicate media path: ' + str(media_cycle_lookup.keys()))
            media_cycle_lookup[medium['media_id']] = True

            medium = db.query("select * from media where media_id = %(a)s", {'a': medium['dup_media_id']}).hash()

    if medium['foreign_rss_links']:
        raise McTopicMediaException('Parent duplicate media source %d has foreign_rss_links' % medium['media_id'])

    return medium
def _insert_tweet_urls(db: DatabaseHandler, topic_tweet: dict, urls: typing.List) -> None:
    """Insert list of urls into topic_tweet_urls, ignoring duplicates.

    Fixed: the return annotation previously claimed ``typing.List`` even though the function has
    no return statement and always returned None; the annotation now matches the behavior (a
    runtime-compatible change, since annotations are not enforced).

    Arguments:
    db - db handle
    topic_tweet - topic_tweet dict whose topic_tweets_id the urls belong to
    urls - list of urls to insert
    """
    for url in urls:
        db.query(
            """
            insert into topic_tweet_urls( topic_tweets_id, url )
                values( %(a)s, %(b)s )
                on conflict do nothing
            """,
            {'a': topic_tweet['topic_tweets_id'], 'b': url})
def change_password(db: DatabaseHandler,
                    email: str,
                    new_password: str,
                    new_password_repeat: str,
                    do_not_inform_via_email: bool = False) -> None:
    """Change user's password.

    Validates the new password, stores its hash, activates the user, and (unless suppressed)
    emails the user about the change.

    Raises McAuthChangePasswordException on any failure.
    """
    email = decode_object_from_bytes_if_needed(email)
    new_password = decode_object_from_bytes_if_needed(new_password)
    new_password_repeat = decode_object_from_bytes_if_needed(new_password_repeat)

    if isinstance(do_not_inform_via_email, bytes):
        do_not_inform_via_email = decode_object_from_bytes_if_needed(do_not_inform_via_email)
    do_not_inform_via_email = bool(int(do_not_inform_via_email))

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthChangePasswordException('User with email address "%s" does not exist.' % email)

    validation_error = validate_new_password(email=email,
                                             password=new_password,
                                             password_repeat=new_password_repeat)
    if validation_error:
        raise McAuthChangePasswordException("Unable to change password: %s" % validation_error)

    # Hash + validate the password
    try:
        new_password_hash = generate_secure_hash(password=new_password)
    except Exception as ex:
        raise McAuthChangePasswordException("Unable to hash a new password: %s" % str(ex))
    if not new_password_hash:
        raise McAuthChangePasswordException("Generated password hash is empty.")

    # Set the password hash
    db.query("""
        UPDATE auth_users
        SET password_hash = %(password_hash)s,
            active = TRUE
        WHERE email = %(email)s
    """, {
        'email': email,
        'password_hash': new_password_hash,
    })

    if not do_not_inform_via_email:
        message = AuthPasswordChangedMessage(to=email, full_name=user.full_name())
        if not send_email(message):
            raise McAuthChangePasswordException(
                'The password has been changed, but I was unable to send an email notifying you about the change.'
            )
def __remove_object_from_cache(self, db: DatabaseHandler, object_id: int) -> None:
    """Attempt to remove object from cache.

    Raise if removal fails because after removal we'd expect the object to be gone for good.
    """
    object_id = self._prepare_object_id(object_id)

    # Table name must be interpolated by Python (identifiers can't be bound parameters);
    # the object ID is interpolated by psycopg2.
    delete_sql = ("DELETE FROM %s " % self.__cache_table) + "WHERE object_id = %(object_id)s"

    db.query(delete_sql, {'object_id': object_id})
def __get_topic_url_variants(db: DatabaseHandler, urls: List[str]) -> List[str]:
    """Get any alternative urls for the given url from topic_merged_stories or topic_links.

    Looks up the stories matching the given urls, expands them through the merged-stories map,
    and returns all distinct urls (original, redirect and story urls) associated with the
    expanded story set.  Returns the input urls unchanged when no matching stories are found.

    NOTE: uses "?" placeholders and positional parameters because it goes through the Perl
    database handler proxy, not psycopg2-style named parameters.
    """
    urls = decode_object_from_bytes_if_needed(urls)

    # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples
    stories_ids_sql = "SELECT stories_id "
    stories_ids_sql += "FROM stories "
    stories_ids_sql += "WHERE url = ANY(?)"
    stories_ids = db.query(stories_ids_sql, urls).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if stories_ids is None:
        stories_ids = []
    elif isinstance(stories_ids, int):
        stories_ids = [stories_ids]

    stories_ids = [int(x) for x in stories_ids]

    # Expand the story set through topic_merged_stories_map (recursive, capped)
    all_stories_ids = __get_merged_stories_ids(db=db, stories_ids=stories_ids)
    if len(all_stories_ids) == 0:
        return urls

    # Collect every url variant known for the expanded story set
    all_urls = db.query("""
        SELECT DISTINCT url
        FROM (
            SELECT redirect_url AS url
            FROM topic_links
            WHERE ref_stories_id = ANY(?)

            UNION

            SELECT url
            FROM topic_links
            WHERE ref_stories_id = ANY(?)

            UNION

            SELECT url
            FROM stories
            WHERE stories_id = ANY(?)
        ) AS q
        WHERE q IS NOT NULL
    """, all_stories_ids, all_stories_ids, all_stories_ids).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if all_urls is None:
        all_urls = []
    elif isinstance(all_urls, str):
        all_urls = [all_urls]

    return all_urls
def create_password_reset_token(db: DatabaseHandler, email: str) -> Optional[str]:
    """Generate password reset token used for both activating newly registered users and resetting passwords.

    Returns non-hashed password reset token or None if user was not found.
    """
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a
    # "success" message. That way the adversary would not be able to find out which email addresses are active users.
    #
    # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing
    # attacks)
    matching_user = db.query("""
        SELECT auth_users_id,
               email
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()

    if matching_user is None or len(matching_user) == 0:
        # User was not found, so set the email address to an empty string, but don't return just now and continue with a
        # rather slowish process of generating a activation token (in order to reduce the risk of timing attacks)
        email = ''

    # Generate the activation token
    password_reset_token = random_string(length=64)
    if len(password_reset_token) == 0:
        raise McAuthProfileException('Unable to generate an activation token.')

    # Hash + validate the activation token
    password_reset_token_hash = generate_secure_hash(password=password_reset_token)
    if not password_reset_token_hash:
        raise McAuthProfileException("Unable to hash an activation token.")

    # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = %(password_reset_token_hash)s
        WHERE email = %(email)s
          AND email != ''
    """, {
        'email': email,
        'password_reset_token_hash': password_reset_token_hash,
    })

    return password_reset_token
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000" for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    # Reuse a previously assigned color, if any
    assigned = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""", {
        'item_set': item_set,
        'item_id': item_id,
    }).flat()
    if assigned is not None and len(assigned):
        return assigned[0] if isinstance(assigned, list) else assigned

    # Collect the colors already taken within this set
    taken = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s""", {
        'item_set': item_set,
    }).flat()
    if taken is not None and not isinstance(taken, list):
        taken = [taken]
    existing_colors = set(taken) if taken is not None else set()

    # Use the hard coded palette of 25 colors if possible
    new_color = next((candidate for candidate in __MC_COLORS if candidate not in existing_colors), None)

    # Otherwise, just generate a random color
    if new_color is None:
        palette = analogous_color(color='0000ff', return_slices=256, split_slices=255)
        new_color = random.choice(palette)

    # Remember the assignment so the same set / ID always maps to the same color
    db.create(table='color_sets', insert_hash={
        'color_set': item_set,
        'id': item_id,
        'color': new_color,
    })

    return new_color
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys.

    Sends the user an email about the reset; rolls the whole change back if the email fails.
    """
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all IP-limited API keys
    db.query("""
        DELETE FROM auth_user_api_keys
        WHERE ip_address IS NOT NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    # Regenerate non-IP limited API key
    db.query("""
        UPDATE auth_user_api_keys

        -- DEFAULT points to a generation function
        SET api_key = DEFAULT

        WHERE ip_address IS NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    # Notify the user; a failed notification aborts the whole reset
    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException("Unable to send email about reset API key.")

    db.commit()
def _get_ap_dup_sentence_lengths(db: DatabaseHandler, story_text: str) -> List[int]:
    """Return lengths of AP story sentences whose md5 matches a sentence in the given text.

    Returns an empty list when no AP medium is configured.
    """
    story_text = decode_object_from_bytes_if_needed(story_text)

    ap_media_id = _get_ap_media_id(db=db)
    if ap_media_id is None:
        return []

    # md5 each sentence of the candidate text for comparison against story_sentences
    md5s = [
        hashlib.md5(sentence.encode('utf-8')).hexdigest()
        for sentence in _get_sentences_from_content(story_text=story_text)
    ]

    sentence_lengths = db.query("""
        SELECT length(sentence) AS len
        FROM story_sentences
        WHERE media_id = %(ap_media_id)s

          -- FIXME this probably never worked because the index is half_md5(), not md5()
          AND md5(sentence) = ANY(%(md5s)s)
    """, {
        'ap_media_id': ap_media_id,
        'md5s': md5s,
    }).flat()

    # MC_REWRITE_TO_PYTHON: Perlism
    return [] if sentence_lengths is None else sentence_lengths
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry.

    If there are no remaining downloads to be extracted for the story, call
    process_extracted_story() on the parent story.
    """
    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    # Any unextracted content downloads left for this story?
    pending = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if pending is None:
        pending = {}

    if len(pending) > 0:
        log.info("Pending more downloads...")
        return

    story = db.find_by_id(table='stories', object_id=stories_id)
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)
def _get_story_with_most_sentences(db: DatabaseHandler, stories: list) -> dict:
    """Given a list of stories, return the story with the most sentences.

    Falls back to the first story when the query finds no match.
    """
    assert len(stories) > 0

    # Single candidate: nothing to compare
    if len(stories) == 1:
        return stories[0]

    top_story = db.query(
        """
        select s.*
            from stories s
            where stories_id in (
                select stories_id
                    from story_sentences
                    where stories_id = any (%(a)s)
                    group by stories_id
                    order by count(*) desc
                    limit 1
            )
        """,
        {'a': [s['stories_id'] for s in stories]}).hash()

    return top_story if top_story is not None else stories[0]
def _get_seeded_content(db: DatabaseHandler, topic_fetch_url: dict) -> typing.Optional[FetchLinkResponse]:
    """Return content for this url and topic in topic_seed_urls.

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_url dict from db

    Returns:
    dummy response object

    """
    rows = db.query(
        "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null",
        {'a': topic_fetch_url['topics_id'], 'b': topic_fetch_url['url']}).flat()

    if not rows:
        return None

    # Wrap the stored content in a synthetic 200 OK response
    return FetchLinkResponse(
        url=topic_fetch_url['url'],
        is_success=True,
        code=HTTPStatus.OK.value,
        message=HTTPStatus.OK.phrase,
        content=rows[0],
        last_requested_url=topic_fetch_url['url'],
    )
def password_reset_token_is_valid(db: DatabaseHandler, email: str, password_reset_token: str) -> bool:
    """Validate password reset token (used for both user activation and password reset)."""
    email = decode_object_from_bytes_if_needed(email)
    password_reset_token = decode_object_from_bytes_if_needed(password_reset_token)

    if not (email and password_reset_token):
        log.error("Email and / or password reset token is empty.")
        return False

    # Fetch readonly information about the user
    user_row = db.query("""
        SELECT auth_users_id,
               email,
               password_reset_token_hash
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()

    if user_row is None or 'auth_users_id' not in user_row:
        log.error("Unable to find user %s in the database." % email)
        return False

    # Compare the supplied token against the stored hash
    stored_hash = user_row['password_reset_token_hash']
    return password_hash_is_valid(password_hash=stored_hash, password=password_reset_token)
def extract_and_process_story(db: DatabaseHandler,
                              story: dict,
                              extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    # Wrap the whole extraction in a transaction unless the caller already started one
    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
def lookup_tag(db: DatabaseHandler, tag_name: str) -> Optional[Dict[str, Any]]:
    """Lookup the tag given the tag_set:tag format."""
    tag_name = decode_object_from_bytes_if_needed(tag_name)

    if not tag_name:
        log.warning("Tag name is empty.")
        return None

    # Require exactly one colon separating set name from tag name
    if not re.match(pattern='^([^:]*):([^:]*)$', string=tag_name):
        log.warning("Unable to parse tag name '{}'.".format(tag_name))
        return None

    tag_set_name, tag = tag_name.split(':')

    found_tag = db.query("""
        SELECT t.*
        FROM tags AS t,
             tag_sets AS ts
        WHERE t.tag_sets_id = ts.tag_sets_id
          AND t.tag = %(tag)s
          AND ts.name = %(tag_set_name)s
    """, {'tag': tag, 'tag_set_name': tag_set_name}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism -- callers expect a dict, not None, when nothing was found
    return {} if found_tag is None else found_tag
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL.

    Honors the medium's content_delay setting by scheduling the download into the future.
    """
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    story_url = story['url']

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story_url,
        'host': get_url_host(story_url),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]

    if content_delay:
        # Delay download of content this many hours. This is useful for sources that are likely
        # to significantly change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.spidered to true for that story.  Calls
    get_links_from_story on each story.

    Almost all errors are caught by this function saved in topic_stories.link_mine_error.  In the
    case of an error topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))

        links = get_links_from_story(db, story)

        for link in links:
            # Skip links that point back into the story's own domain
            if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        # Only reached if the whole story was mined without error
        link_mine_error = ''
    except Exception:
        # Record the traceback instead of raising; link_mined is still set below
        link_mine_error = traceback.format_exc()

    # Mark the story as mined regardless of success, storing any error encountered
    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: typing.Optional[str]) -> None:
    """Assign a guess method tag to the story based on the date_guess result.

    If date_guess found a result, assign a date_guess_method:guess_by_url, guess_by_tag_*, or
    guess_by_unknown tag.  Otherwise if there is a fallback_date, assign
    date_guess_method:fallback_date.  Else assign date_invalid:date_invalid.

    Arguments:
    db - db handle
    story - story dict from db
    date_guess - GuessDateResult from guess_date() call
    fallback_date - date to fall back to when the guess failed

    Returns:
    None
    """
    # Decide which tag set / tag describes how (or whether) the date was guessed
    if date_guess.found:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        guess_method = date_guess.guess_method
        if guess_method.startswith('Extracted from url'):
            tag = 'guess_by_url'
        elif guess_method.startswith('Extracted from tag'):
            match = re2.search(r'\<(\w+)', guess_method)
            html_tag = match.group(1) if match is not None else 'unknown'
            tag = 'guess_by_tag_' + str(html_tag)
        else:
            tag = 'guess_by_unknown'
    elif fallback_date is not None:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        tag = 'fallback_date'
    else:
        tag_set = mediawords.tm.guess_date.INVALID_TAG_SET
        tag = mediawords.tm.guess_date.INVALID_TAG

    ts = db.find_or_create('tag_sets', {'name': tag_set})
    t = db.find_or_create('tags', {'tag': tag, 'tag_sets_id': ts['tag_sets_id']})

    # Replace any existing tags on the story with the single guess-method tag
    db.query("delete from stories_tags_map where stories_id = %(a)s", {'a': story['stories_id']})
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': t['tags_id']})
def get_media_id(db: DatabaseHandler, download: dict) -> int:
    """Convenience method to get the media_id for the download.

    Resolves the medium through the download's feed.
    """
    download = decode_object_from_bytes_if_needed(download)

    feed = db.query("""
        SELECT media_id
        FROM feeds
        WHERE feeds_id = %(feeds_id)s
    """, {'feeds_id': download['feeds_id']}).hash()

    return feed['media_id']
def get_unique_medium_name(db: DatabaseHandler, names: list) -> str:
    """Return the first name in the names list that does not yet exist for a media source.

    Names are truncated to MAX_NAME_LENGTH before the case-insensitive existence check.
    Raises McTopicMediaUniqueException when every candidate name is already taken.
    """
    for candidate in names:
        candidate = candidate[0:MAX_NAME_LENGTH]
        taken = db.query("select 1 from media where lower(name) = lower(%(a)s)", {'a': candidate}).hash()
        if taken is None:
            return candidate

    raise McTopicMediaUniqueException("Unable to find unique name among names: " + str(names))
def is_new(db: DatabaseHandler, story: dict) -> bool:
    """Return true if this story should be considered new for the given media source.

    This is used to determine whether to add a new story for a feed item URL.

    A story is new if no story with the same URL or GUID exists in the same media source and if
    no story exists with the same title in the same media source in the same calendar day.
    """
    story = decode_object_from_bytes_if_needed(story)

    # Placeholder titles cannot be meaningfully deduplicated
    if story['title'] == '(no title)':
        return False

    # Duplicate GUID within the same media source?
    guid_dup = db.query("""
        SELECT *
        FROM stories
        WHERE guid = %(guid)s
          AND media_id = %(media_id)s
    """, {
        'guid': story['guid'],
        'media_id': story['media_id'],
    }).hash()
    if guid_dup:
        return False

    # Duplicate title on the same calendar day within the same media source?
    title_dup = db.query("""
        SELECT 1
        FROM stories
        WHERE md5(title) = md5(%(title)s)
          AND media_id = %(media_id)s

          -- We do the goofy " + interval '1 second'" to force postgres to use the
          -- stories_title_hash index
          AND date_trunc('day', publish_date) + interval '1 second'
            = date_trunc('day', %(publish_date)s::date) + interval '1 second'
    """, {
        'title': story['title'],
        'media_id': story['media_id'],
        'publish_date': story['publish_date'],
    }).hash()

    return not title_dup
def __get_merged_stories_ids(db: DatabaseHandler, stories_ids: List[int], n: int = 0) -> List[int]:
    """Gor a given set of stories, get all the stories that are source or target merged stories in
    topic_merged_stories_map.  Repeat recursively up to 10 times, or until no new stories are found.

    The input set is hard-capped at 20 stories (see comment below); the recursion also stops as
    soon as an iteration adds no new story IDs.
    """
    stories_ids = decode_object_from_bytes_if_needed(stories_ids)

    # "The crazy load was from a query to our topic_merged_stories_ids to get
    # url variants.  It looks like we have some case of many, many merged story
    # pairs that are causing that query to make postgres sit on a cpu for a
    # super long time.  There's no good reason to query for ridiculous numbers
    # of merged stories, so I just arbitrarily capped the number of merged story
    # pairs to 20 to prevent this query from running away in the future."
    max_stories = 20

    # MC_REWRITE_TO_PYTHON: cast strings to ints
    # noinspection PyTypeChecker
    stories_ids = [int(x) for x in stories_ids]

    if len(stories_ids) == 0:
        return []

    if len(stories_ids) >= max_stories:
        return stories_ids[0:max_stories - 1]

    # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples
    # NOTE: values are interpolated into the SQL with Python "%"; safe only because they are ints
    stories_ids_list = ', '.join(str(x) for x in stories_ids)

    merged_stories_ids = db.query("""
        SELECT DISTINCT
            target_stories_id,
            source_stories_id
        FROM topic_merged_stories_map
        WHERE target_stories_id IN (%(stories_ids_list)s)
           OR source_stories_id IN (%(stories_ids_list)s)
        LIMIT %(max_stories)s
    """ % {
        'stories_ids_list': stories_ids_list,
        'max_stories': int(max_stories),
    }).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if merged_stories_ids is None:
        merged_stories_ids = []
    elif isinstance(merged_stories_ids, int):
        merged_stories_ids = [merged_stories_ids]

    merged_stories_ids = [int(x) for x in merged_stories_ids]

    all_stories_ids = list(set(stories_ids + merged_stories_ids))

    # Stop when the recursion budget is spent, a fixed point is reached, or the cap is hit
    if n > 10 or len(stories_ids) == len(all_stories_ids) or len(stories_ids) >= max_stories:
        return all_stories_ids

    else:
        return __get_merged_stories_ids(db=db, stories_ids=all_stories_ids, n=n + 1)
def add_to_topic_stories(
        db: DatabaseHandler,
        story: dict,
        topic: dict,
        link_mined: bool = False,
        valid_foreign_rss_story: bool = False,
        iteration: int = None) -> None:
    """Add story to topic_stories table.

    Query topic_stories and topic_links to find the linking story with the smallest iteration and
    use that iteration + 1 for the new topic_stories row.
    """
    if iteration is None:
        # Find the linking story with the smallest iteration, if any
        linking_story = db.query(
            """
            select ts.*
                from topic_stories ts
                    join topic_links tl on ( ts.stories_id = tl.stories_id and ts.topics_id = tl.topics_id )
                where
                    tl.ref_stories_id = %(a)s and
                    tl.topics_id = %(b)s
                order by ts.iteration asc
                limit 1
            """,
            {'a': story['stories_id'], 'b': topic['topics_id']}).hash()

        iteration = (linking_story['iteration'] + 1) if linking_story else 0

    db.query(
        """
        insert into topic_stories
            ( topics_id, stories_id, iteration, redirect_url, link_mined, valid_foreign_rss_story )
            values ( %(a)s, %(b)s, %(c)s, %(d)s, %(e)s, %(f)s )
            on conflict do nothing
        """,
        {
            'a': topic['topics_id'],
            'b': story['stories_id'],
            'c': iteration,
            'd': story['url'],
            'e': link_mined,
            'f': valid_foreign_rss_story
        })
def delete_user(db: DatabaseHandler, email: str) -> None:
    """Delete user.

    Raises McAuthProfileException when the email is empty or no such user exists.
    """
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    # Delete the user (PostgreSQL's relation will take care of 'auth_users_roles_map')
    db.query("""
        DELETE FROM auth_users
        WHERE email = %(email)s
    """, {'email': email})
def _is_not_topic_story(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Return True if the story is not in topic_stories for the given topic."""
    # A fetch url without a story cannot be in topic_stories
    if 'stories_id' not in topic_fetch_url:
        return True

    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': topic_fetch_url['stories_id'], 'b': topic_fetch_url['topics_id']},
    ).hash()

    return topic_story is None
def __count_null_title_stories(db: DatabaseHandler, topic: dict) -> int:
    """Count the stories in the topic with a null normalized_title_hash."""
    (null_count,) = db.query(
        """
        select count(*)
            from stories s
                join topic_stories ts using ( stories_id )
            where
                ts.topics_id = %(a)s and
                s.normalized_title_hash is null
        """,
        {'a': topic['topics_id']},
    ).flat()

    return null_count
def __try_storing_object_in_cache(self, db: DatabaseHandler, object_id: int, content: bytes) -> None:
    """Attempt to store object to cache, don't worry too much if it fails."""
    object_id = self._prepare_object_id(object_id)

    if content is None:
        raise McCachedAmazonS3StoreException("Content to store is None for object ID %d." % object_id)

    # Normalize to bytes before compression
    if isinstance(content, str):
        content = content.encode('utf-8')

    try:
        compressed = self._compress_data_for_method(
            data=content,
            compression_method=self.__cache_compression_method,
        )

        # Upsert so that re-caching an object simply overwrites the old blob
        sql = "INSERT INTO %s " % self.__cache_table  # interpolated by Python
        sql += "(object_id, raw_data) "
        sql += "VALUES (%(object_id)s, %(raw_data)s) "  # interpolated by psycopg2
        sql += "ON CONFLICT (object_id) DO UPDATE "
        sql += " SET raw_data = EXCLUDED.raw_data"

        db.query(sql, {'object_id': object_id, 'raw_data': compressed})

    except Exception as ex:
        # Caching is best-effort; never let a cache failure propagate
        log.warning("Unable to cache object ID %d: %s" % (object_id, str(ex),))
def _id_exists_in_db(db: DatabaseHandler, guid: str) -> bool:
    """Internal method to check if item exists in the database.

    Returns True when a story with this GUID already exists for the AP medium.
    """
    guid_exists = db.query(
        "select 1 from stories s join media m using (media_id) where m.name = %(b)s and s.guid = %(a)s",
        {'a': guid, 'b': AP_MEDIUM_NAME}).hash()
    if guid_exists:
        # FIX: the original log message had a '{}' placeholder but never called .format(guid)
        log.debug('Story with guid: {} is already in the database -- skipping story.'.format(guid))
        return True
    return False
def content_exists(self, db: DatabaseHandler, object_id: int, object_path: str = None) -> bool:
    """Test if object exists in PostgreSQL table."""
    object_id = self._prepare_object_id(object_id)

    # FIX: the original concatenated "SELECT 1" + "FROM ...", producing the
    # invalid SQL "SELECT 1FROM ..."; a trailing space is required.
    sql = "SELECT 1 "
    sql += "FROM %s " % self.__table  # interpolated by Python
    sql += "WHERE object_id = %(object_id)s"  # interpolated by psycopg2

    object_exists = db.query(sql, {'object_id': object_id}).hash()

    return object_exists is not None and len(object_exists) > 0
def default_weekly_requested_items_limit(db: DatabaseHandler) -> int:
    """Get default weekly requested items limit."""
    # Read the column's declared DEFAULT straight out of the information schema
    rows = db.query("""
        SELECT column_default AS default_weekly_requested_items_limit
        FROM information_schema.columns
        WHERE (table_schema, table_name) = ('public', 'auth_user_limits')
          AND column_name = 'weekly_requested_items_limit'
    """).flat()

    if not rows:
        raise McAuthLimitsException(
            "Unable to fetch default weekly requested items limit.")

    return rows[0]
def _delete_story_sentences(db: DatabaseHandler, story: dict) -> None:
    """Delete any existing stories for the given story and also update media_stats to adjust for the deletion."""
    story = decode_object_from_bytes_if_needed(story)

    num_deleted = db.query("""
        DELETE FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': story['stories_id']}).rows()

    if not num_deleted:
        return

    # Keep the per-day sentence count in media_stats consistent with the deletion
    db.query("""
        UPDATE media_stats
        SET num_sentences = num_sentences - %(num_deleted)s
        WHERE media_id = %(media_id)s
          AND stat_date = %(publish_date)s::date
    """, {
        'num_deleted': num_deleted,
        'media_id': story['media_id'],
        'publish_date': story['publish_date'],
    })
def _validate_topic_posts(db: DatabaseHandler, topic: dict, mock_posts: list) -> None:
    """Validate that the topic_posts match the mock_posts."""
    got_posts = db.query(
        """
        select *
            from topic_posts tp
                join topic_post_days tpd using ( topic_post_days_id )
            where
                topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    assert len(got_posts) == len(mock_posts)

    for mock_post in sorted(mock_posts, key=lambda p: p['post_id']):
        got_post = db.query(
            "select * from topic_posts where post_id = %(a)s::text",
            {'a': mock_post['post_id']}).hash()

        assert got_post

        # Compare stringified values because db rows may use different types than the mocks
        for field in POST_FIELDS:
            assert str(got_post.get(field, None)) == str(mock_post.get(field, None))
def default_max_topic_stories_limit(db: DatabaseHandler) -> int:
    """Get default max. topic stories limit."""
    # FIX: the SQL alias was copy-pasted from default_weekly_requested_items_limit();
    # renamed for clarity (results are fetched positionally via flat(), so behavior is unchanged)
    # noinspection SqlResolve
    limit = db.query("""
        SELECT column_default AS default_max_topic_stories
        FROM information_schema.columns
        WHERE (table_schema, table_name) = ('public', 'auth_user_limits')
          AND column_name = 'max_topic_stories'
    """).flat()
    if not limit:
        raise McAuthLimitsException("Unable to fetch default max. topic stories limit.")
    return limit[0]
def _get_extracted_text(db: DatabaseHandler, story: dict) -> str:
    """Return the concatenated download_texts associated with the story."""
    story = decode_object_from_bytes_if_needed(story)

    texts = db.query("""
        SELECT dt.download_text
        FROM downloads AS d,
             download_texts AS dt
        WHERE dt.downloads_id = d.downloads_id
          AND d.stories_id = %(stories_id)s
        ORDER BY d.downloads_id
    """, {'stories_id': story['stories_id']}).flat()

    # Join download texts in downloads_id order, separated by sentence breaks
    return ".\n\n".join(texts)
def _add_topic_tweet_single_day(
        db: DatabaseHandler,
        topic: dict,
        day: datetime.datetime,
        ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict:
    """Add a row to topic_tweet_day if it does not already exist.  fetch data for new row from CH.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    ch_class - AbstractCrimsonHexagon class

    Return:
    None
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    existing_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {'a': topic['topics_id'], 'b': str(day)}).hash()

    if existing_day is not None:
        if existing_day['tweets_fetched']:
            raise McFetchTopicTweetDateFetchedException("tweets already fetched for day " + str(day))

        # if we have a ttd but had not finished fetching tweets, delete it and start over
        db.delete_by_id('topic_tweet_days', existing_day['topic_tweet_days_id'])

    ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day)

    topic_tweet_day = db.create(
        'topic_tweet_days',
        {
            'topics_id': topic['topics_id'],
            'day': day,
            'tweet_count': ch_posts['totalPostsAvailable'],
            'num_ch_tweets': len(ch_posts['posts']),
            'tweets_fetched': False,
        })

    # Attach the raw CH response so the caller can process the posts
    topic_tweet_day['ch_posts'] = ch_posts

    return topic_tweet_day
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict, meta_tweet: dict) -> None:
    """Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in meta_tweet.

    Arguments:
    db - database handler
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    meta_tweet - meta_tweet dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(meta_tweet)

    log.debug("encode json")
    data_json = mediawords.util.parse_json.encode_json(meta_tweet)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    tweet = meta_tweet['tweet']

    row = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': tweet['text'],
        'tweet_id': meta_tweet['tweet_id'],
        'publish_date': tweet['created_at'],
        'twitter_user': tweet['user']['screen_name'],
    }

    log.debug("insert topic tweet")
    topic_tweet = db.query(
        """
        insert into topic_tweets
            ( topic_tweet_days_id, data, content, tweet_id, publish_date, twitter_user )
            values
            ( %(topic_tweet_days_id)s, %(data)s, %(content)s, %(tweet_id)s, %(publish_date)s, %(twitter_user)s )
            returning *
        """,
        row).hash()

    log.debug("get tweet urls")
    urls = mediawords.util.twitter.get_tweet_urls(tweet)

    log.debug("insert tweet urls")
    _insert_tweet_urls(db, topic_tweet, urls)

    log.debug("done")
def get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Get the extracted html for the story.

    We don't store the extracted html of a story, so we have to get the first download associated
    with the story and run the extractor on it.
    """
    first_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': story['stories_id']}).hash()

    extractor_results = mediawords.dbi.downloads.extract(
        db, first_download, PyExtractorArguments(use_cache=True))

    return extractor_results['extracted_html']
def create_test_story_stack_for_indexing(db: DatabaseHandler, data: dict) -> dict:
    """Create a test story stack with content, tags and timespans so that it can be indexed."""
    data = decode_object_from_bytes_if_needed(data)

    story_stack = create_test_story_stack(db=db, data=data)
    media = add_content_to_test_story_stack(db=db, story_stack=story_stack)

    # Order by md5 of the id to get a deterministic but effectively shuffled story order
    test_stories = db.query("SELECT * FROM stories ORDER BY md5(stories_id::text)").hashes()

    # Add ancillary data so that it can be queried in Solr
    _add_story_tags_to_stories(db=db, stories=test_stories)
    _add_timespans_to_stories(db=db, stories=test_stories)

    return media
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Find or create the 'Spider Feed' feed for the media source."""
    existing_feed = db.query(
        "select * from feeds where media_id = %(a)s and name = %(b)s",
        {'a': medium['media_id'], 'b': SPIDER_FEED_NAME}).hash()

    if existing_feed is not None:
        return existing_feed

    # No feed yet -- create an inactive placeholder feed for spidered stories
    return db.find_or_create('feeds', {
        'media_id': medium['media_id'],
        'url': medium['url'] + '#spiderfeed',
        'name': SPIDER_FEED_NAME,
        'feed_status': 'inactive',
    })
def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
    """Parse youtube embedded video urls out of the full html of the story.

    This function looks for youtube embed links anywhere in the html of the story content, rather
    than just in the extracted html.  It aims to return a superset of all youtube embed links by
    returning every iframe src= attribute that includes the string 'youtube'.

    Arguments:
    db - db handle
    story - story dict from db

    Returns:
    list of string urls
    """
    # FIX: order by downloads_id (the original ordered by stories_id, which the WHERE clause pins
    # to a constant, making the "first" download nondeterministic); this matches get_extracted_html()
    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
        """, {
            'stories_id': story['stories_id'],
        }).hash()

    html = fetch_content(db, download)

    soup = BeautifulSoup(html, 'lxml')

    links = []
    for tag in soup.find_all('iframe', src=True):
        url = tag['src']

        if 'youtube' not in url:
            continue

        # Protocol-relative src attributes need a scheme prepended
        if not url.lower().startswith('http'):
            url = 'http:' + url

        url = url.strip()

        url = url.replace('youtube-embed', 'youtube')

        links.append(url)

    return links
def _add_tag_to_snapshot_media(
        db: DatabaseHandler,
        topics_id: int,
        timespans_id: int,
        media: List[Dict[str, Any]],
        tag_set_name: str,
        field_name: str) -> None:
    """Add field to media based on tag association from the given tag set.

    Use snapshotted data from snap.media_tags_map.
    """
    tag_rows = db.query(
        """
        WITH timespan_tags AS (
            SELECT snap.media_tags_map.*
            FROM snap.media_tags_map
                INNER JOIN timespans ON
                    snap.media_tags_map.topics_id = timespans.topics_id AND
                    snap.media_tags_map.snapshots_id = timespans.snapshots_id
            WHERE
                timespans.topics_id = %(topics_id)s AND
                timespans.timespans_id = %(timespans_id)s
        )
        SELECT timespan_tags.*, tags.tag
        FROM timespan_tags
            INNER JOIN tags ON timespan_tags.tags_id = tags.tags_id
            INNER JOIN tag_sets ON tag_sets.tag_sets_id = tags.tag_sets_id
        WHERE tag_sets.name = %(tag_set_name)s
        """,
        {
            'topics_id': topics_id,
            'timespans_id': timespans_id,
            'tag_set_name': tag_set_name,
        }
    ).hashes()

    tag_by_media_id = {row['media_id']: row['tag'] for row in tag_rows}

    # Media without a tag in this set get the literal string 'null'
    for medium in media:
        medium[field_name] = tag_by_media_id.get(medium['media_id'], 'null')
def _store_post_and_urls(db: DatabaseHandler, topic_post_day: dict, post: dict) -> None:
    """Store the tweet in topic_posts and its urls in topic_post_urls, using the data in post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_post_day - topic_post_day dict
    post - post dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(post)

    log.debug("encode json")
    data_json = encode_json(post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    row = {field: post.get(field, None) for field in POST_FIELDS}
    row['topic_post_days_id'] = topic_post_day['topic_post_days_id']
    row['data'] = data_json

    topic_post = db.query(
        "select * from topic_posts where topic_post_days_id = %(a)s and post_id = %(b)s::text",
        {
            'a': topic_post_day['topic_post_days_id'],
            'b': row['post_id'],
        }).hash()

    if not topic_post:
        log.debug("insert topic post")
        topic_post = db.create('topic_posts', row)

    log.debug("insert tweet urls")
    _insert_post_urls(db, topic_post, post['urls'])

    log.debug("done")
def default_role_ids(db: DatabaseHandler) -> List[int]:
    """List of role IDs to apply to new users.

    Raises McRoleIDForRoleException if no default roles can be found.
    """
    default_roles = db.query("""
        SELECT auth_roles_id
        FROM auth_roles
        WHERE role = %(role)s
    """, {'role': UserRoles.tm_readonly()}).flat()

    # FIX: the original checked "(not default_roles) or (not len(default_roles))", which would
    # TypeError on the single-int Perl-proxy case it later tried to normalize, and its
    # "if default_roles is None" branch was dead code (None always raised above)
    if not default_roles:
        raise McRoleIDForRoleException('Unable to find default role IDs.')

    # MC_REWRITE_TO_PYTHON: if only a single item is to be returned, Perl doesn't bother to make it into a list
    if isinstance(default_roles, int):
        default_roles = [default_roles]

    return default_roles
def extract_and_create_download_text(
        db: DatabaseHandler,
        download: dict,
        extractor_args: PyExtractorArguments) -> dict:
    """Extract the download and create a download_text from the extracted download."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    log.debug("Extracting download {}...".format(downloads_id))
    extract_result = extract(db=db, download=download, extractor_args=extractor_args)
    log.debug("Done extracting download {}.".format(downloads_id))

    # When allowed, reuse a previously created download_texts row
    download_text = None
    if extractor_args.use_existing():
        log.debug("Fetching download text for download {}...".format(downloads_id))
        download_text = db.query("""
            SELECT *
            FROM download_texts
            WHERE downloads_id = %(downloads_id)s
        """, {'downloads_id': downloads_id}).hash()

    if download_text is None:
        log.debug("Creating download text for download {}...".format(downloads_id))
        download_text = create(db=db, download=download, extract=extract_result)

    assert 'stories_id' in download, "It is expected to have 'stories_id' set for a download at this point."

    if not extractor_args.no_tag_extractor_version():
        log.debug("Updating extractor version tag for story {}...".format(download['stories_id']))
        update_extractor_version_tag(
            db=db,
            stories_id=download['stories_id'],
            extractor_version=extract_result['extractor_version'],
        )

    return download_text
def _get_merged_iteration(db: DatabaseHandler, topic: dict, delete_story: dict, keep_story: dict) -> int:
    """Get the smaller iteration of two stories"""
    iterations = db.query(
        """
        select iteration
            from topic_stories
            where
                topics_id = %(a)s and
                stories_id in ( %(b)s, %(c)s ) and
                iteration is not null
        """,
        {'a': topic['topics_id'], 'b': delete_story['stories_id'], 'c': keep_story['stories_id']},
    ).flat()

    # No non-null iterations for either story falls back to 0
    return min(iterations, default=0)
def _get_topic_stories_by_medium(db: DatabaseHandler, topic: dict) -> dict:
    """Return hash of { $media_id: stories } for the topic."""
    stories = db.query(
        """
        select s.stories_id, s.media_id, s.title, s.url, s.publish_date
            from snap.live_stories s
            where s.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    # Group stories by their media_id
    media_lookup = {}
    for story in stories:
        media_lookup.setdefault(story['media_id'], []).append(story)

    return media_lookup
def __try_retrieving_object_from_cache(self, db: DatabaseHandler, object_id: int) -> Union[bytes, None]:
    """Attempt to retrieve object from cache, don't worry too much if it fails.

    Returns the uncompressed cached bytes, or None on any failure (cache miss, malformed data,
    uncompression error) -- the caller is expected to fall back to the primary store.
    """
    object_id = self._prepare_object_id(object_id)

    try:
        sql = "SELECT raw_data "
        sql += "FROM %s " % self.__cache_table  # interpolated by Python
        sql += "WHERE object_id = %(object_id)s"  # interpolated by psycopg2

        content = db.query(sql, {'object_id': object_id}).hash()
        # Missing row counts as a miss; raise so the outer handler returns None
        if content is None or len(content) == 0:
            raise McCachedAmazonS3StoreException("Object with ID %d was not found." % object_id)

        content = content['raw_data']

        # MC_REWRITE_TO_PYTHON: Perl database handler returns value as array of bytes
        if isinstance(content, list):
            content = b''.join(content)

        # Binary columns may come back as memoryview; normalize to bytes
        if isinstance(content, memoryview):
            content = content.tobytes()

        if not isinstance(content, bytes):
            raise McCachedAmazonS3StoreException("Content is not bytes for object %d." % object_id)

        try:
            content = self._uncompress_data_for_method(data=content, compression_method=self.__cache_compression_method)
        except Exception as ex:
            raise McCachedAmazonS3StoreException(
                "Unable to uncompress data for object ID %d: %s" % (object_id, str(ex),))

        # Sanity-check the uncompressed payload before handing it back
        if content is None:
            raise McCachedAmazonS3StoreException("Content is None after uncompression for object ID %d" % object_id)
        if not isinstance(content, bytes):
            raise McCachedAmazonS3StoreException(
                "Content is not bytes after uncompression for object ID %d" % object_id)

    except Exception as ex:
        # Cache read is best-effort: log at debug level and report a miss
        log.debug("Unable to retrieve object ID %d from cache: %s" % (object_id, str(ex),))
        return None
    else:
        return content
def _get_cached_extractor_results(db: DatabaseHandler, download: dict) -> typing.Optional[dict]:
    """Get extractor results from cache.

    Return: None if there is a miss or a dict in the form of extract_content() if there is a hit.
    """
    cached = db.query(
        "select extracted_html, extracted_text from cached_extractor_results where downloads_id = %(a)s",
        {'a': download['downloads_id']}).hash()

    if cached is None:
        log.debug("EXTRACTOR CACHE MISS")
    else:
        log.debug("EXTRACTOR CACHE HIT")

    return cached
def _get_extractor_results_cache(db: DatabaseHandler, download: dict) -> Optional[dict]:
    """Get extractor results from cache.

    Return: None if there is a miss or a dict in the form of extract_content() if there is a hit.
    """
    download = decode_object_from_bytes_if_needed(download)

    cached = db.query("""
        SELECT extracted_html, extracted_text
        FROM cache.extractor_results_cache
        WHERE downloads_id = %(a)s
    """, {'a': download['downloads_id']}).hash()

    if cached is None:
        log.debug("EXTRACTOR CACHE MISS")
    else:
        log.debug("EXTRACTOR CACHE HIT")

    return cached
def _get_dup_story_groups(db: DatabaseHandler, topic: dict) -> list:
    """Return a list of duplicate story groups.

    Find all stories within a topic that have duplicate normalized titles with a given day and
    media_id.

    Return a list of story lists.  Each story list is a list of stories that are duplicated of each
    other.
    """
    story_pairs = db.query(
        """
        SELECT
            a.stories_id AS stories_id_a,
            b.stories_id AS stories_id_b
        FROM snap.live_stories AS a,
             snap.live_stories AS b
        WHERE
            a.topics_id = %(topics_id)s AND
            a.topics_id = b.topics_id AND
            a.stories_id < b.stories_id AND
            a.media_id = b.media_id AND
            a.normalized_title_hash = b.normalized_title_hash AND
            date_trunc('day', a.publish_date) = date_trunc('day', b.publish_date)
        ORDER BY stories_id_a, stories_id_b
        """,
        {
            'topics_id': topic['topics_id'],
        }
    ).hashes()

    groups_by_lead = {}
    already_grouped = set()

    for pair in story_pairs:
        # A story already assigned to a group never starts or joins another one
        if pair['stories_id_b'] in already_grouped:
            continue

        lead_story = db.require_by_id('stories', pair['stories_id_a'])
        dup_story = db.require_by_id('stories', pair['stories_id_b'])

        groups_by_lead.setdefault(lead_story['stories_id'], [lead_story]).append(dup_story)
        already_grouped.add(dup_story['stories_id'])

    return list(groups_by_lead.values())
def _update_media_normalized_urls(db: DatabaseHandler) -> None: """Keep normalized_url field in media table up to date. Set the normalized_url field of any row in media for which it is null. Take care to lock the process so that only one process is doing this work at a time. """ # put a lock on this because the process of generating all media urls will take a couple hours, and we don't # want all workers to do the work locked = False while not locked: if not _normalized_urls_out_of_date(db): return db.begin() # poll instead of block so that we can releae the transaction and see whether someone else has already # updated all of the media locked = get_session_lock( db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False) if not locked: db.commit() log.info("sleeping for media_normalized_urls lock...") time.sleep(1) log.warning("updating media_normalized_urls ...") media = db.query( "select * from media where normalized_url is null").hashes() i = 0 total = len(media) for medium in media: i += 1 normalized_url = mediawords.util.url.normalize_url_lossy(medium['url']) if normalized_url is None: normalized_url = medium['url'] log.info("[%d/%d] adding %s (%s)" % (i, total, medium['name'], normalized_url)) db.update_by_id('media', medium['media_id'], {'normalized_url': normalized_url}) db.commit()
def all_user_roles(db: DatabaseHandler) -> List[dict]:
    """Fetch a list of available user roles."""
    # MC_REWRITE_TO_PYTHON: could return a set after Python rewrite
    roles = db.query("""
        SELECT auth_roles_id, role, description
        FROM auth_roles
        ORDER BY auth_roles_id
    """).hashes()

    if roles is None:
        return []

    # MC_REWRITE_TO_PYTHON: if only a single item is to be returned, Perl doesn't bother to make it into a list
    if isinstance(roles, dict):
        return [roles]

    return roles
def get_unique_medium_url(db: DatabaseHandler, urls: list) -> str:
    """Return the first url in the list that does not yet exist for a media source, or None.

    If no unique urls are found, trying appending '#spider' to each of the urls.
    """
    # Try the raw urls first, then the '#spider'-suffixed variants
    candidates = urls + [u + URL_SPIDERED_SUFFIX for u in urls]

    for candidate in candidates:
        candidate = candidate[0:MAX_URL_LENGTH]
        existing = db.query("select 1 from media where url = %(a)s", {'a': candidate}).hash()
        if existing is None:
            return candidate

    raise McTopicMediaUniqueException(
        "Unable to find unique url among urls: " + str(urls))
def __story_extractor_tags(db: DatabaseHandler, stories_id: int) -> List[Dict[str, Any]]:
    """Return the extractor-version tag rows attached to the given story."""
    query_params = {
        'stories_id': stories_id,
        'tag_sets_name': extractor_version_tag_sets_name(),
    }
    return db.query(
        """
        SELECT
            stories_tags_map.stories_id,
            tags.tag AS tags_name,
            tag_sets.name AS tag_sets_name
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
          AND tag_sets.name = %(tag_sets_name)s
        """,
        query_params).hashes()
def ignore_redirect(db: DatabaseHandler, url: str, redirect_url: Optional[str]) -> bool:
    """Return true if we should ignore redirects to the target media source.

    This is usually to avoid redirects to domain resellers for previously valid and important but
    now dead links.
    """
    # No redirect (or a self-redirect) is never ignored
    if redirect_url is None or url == redirect_url:
        return False

    medium_url = generate_medium_url_and_name_from_url(redirect_url)[0]
    normalized_medium_url = normalize_url_lossy(medium_url)

    hit = db.query(
        "select 1 from topic_ignore_redirects where url = %(a)s",
        {'a': normalized_medium_url}).hash()

    return hit is not None