def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool: """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain. Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url. Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict. """ if 'topic_links_id' not in topic_fetch_url or topic_fetch_url[ 'topic_links_id'] is None: return False topic_link = db.query( """ SELECT * FROM topic_links WHERE topics_id = %(topics_id)s AND topic_links_id = %(topic_links_id)s """, { 'topics_id': topic_fetch_url['topics_id'], 'topic_links_id': topic_fetch_url['topic_links_id'], }).hash() if not topic_link: raise Exception( f"Topic link ID {topic_fetch_url['topic_links_id']} was not found." ) story = db.require_by_id('stories', topic_link['stories_id']) url = topic_link.get('redirect_url', topic_link['url']) return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'], story['url'], url)
def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = "%s: %s" % (screen_name, content)
    tweet_date = tweet['created_at']
    url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])

    story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = mediawords.util.twitter.get_tweet_urls(tweet)
    for url in urls:
        if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.info("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        }
        db.create('topic_links', topic_link)
        mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""
    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(table='stories', insert_hash={
        'media_id': int(feed['media_id']),
        'url': "http://story.test/%s" % label,
        'guid': "guid://story.test/%s" % label,
        'title': "story %s" % label,
        'description': "description %s" % label,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        'full_text_rss': True,
    })

    db.create(table='feeds_stories_map', insert_hash={
        'feeds_id': int(feed['feeds_id']),
        'stories_id': int(story['stories_id']),
    })

    return story
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = f"{screen_name}: {content}"
    tweet_date = tweet['created_at']
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = get_tweet_urls(tweet)
    for url in urls:
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url,
        }
        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict: """Generate a story based on the given user, as returned by the twitter api.""" content = f"{user['name']} ({user['screen_name']}): {user['description']}" title = f"{user['name']} ({user['screen_name']}) | Twitter" tweet_date = sql_now() url = f"https://twitter.com/{user['screen_name']}" story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date) add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True) for topic_fetch_url in topic_fetch_urls: topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story) try_update_topic_link_ref_stories_id(db, topic_fetch_url) # twitter user pages are undateable because there is never a consistent version of the page undateable_tag = _get_undateable_tag(db) stories_id = story['stories_id'] tags_id = undateable_tag['tags_id'] db.query(""" INSERT INTO public.stories_tags_map (stories_id, tags_id) VALUES (%(stories_id)s, %(tags_id)s) ON CONFLICT (stories_id, tags_id) DO NOTHING """, { 'stories_id': stories_id, 'tags_id': tags_id, }) return story
def _call_function_on_url_chunks(db: DatabaseHandler,
                                 topic: Dict[str, Any],
                                 urls: List[Dict[str, Any]],
                                 chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning(f"error fetching twitter data: {ex}")

            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in urls]

            db.query("""
                UPDATE topic_fetch_urls SET
                    state = %(state)s,
                    message = %(message)s
                WHERE
                    topics_id = %(topics_id)s AND
                    topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
            """, {
                'state': FETCH_STATE_PYTHON_ERROR,
                'message': str(ex),
                'topics_id': topic['topics_id'],
                'topic_fetch_urls_ids': topic_fetch_urls_ids,
            })

        i += URLS_CHUNK_SIZE
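# A minimal sketch of the chunk_function contract assumed by _call_function_on_url_chunks():
# it receives the database handler, the topic row and one chunk of topic_fetch_urls, and may
# raise -- the caller above then records the exception on the rows' state/message columns.
# The body below is purely illustrative; it is not the real _try_fetch_users_chunk() or
# _try_fetch_tweets_chunk() implementation.
def _example_chunk_function(db: DatabaseHandler, topic: dict, chunk_urls: list) -> None:
    log.debug(f"processing {len(chunk_urls)} urls for topic {topic['topics_id']}")
    for topic_fetch_url in chunk_urls:
        # ...fetch and store data for topic_fetch_url here (hypothetical placeholder)...
        pass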
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""
    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(
        table='stories',
        insert_hash={
            'media_id': int(feed['media_id']),
            'url': "http://story.test/%s" % label,
            'guid': "guid://story.test/%s" % label,
            'title': "story %s" % label,
            'description': "description %s" % label,
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
    )

    db.create(
        table='feeds_stories_map',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'stories_id': int(story['stories_id']),
        }
    )

    return story
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None: """Given a topic link, increment the self_links and all_links counts in the corresponding topic_domains row. Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of topic_links.url or topic_links.redirect_url. Always increment all_links. """ story = db.require_by_id('stories', topic_link['stories_id']) story_domain = mediawords.util.url.get_url_distinctive_domain(story['url']) url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url']) redirect_url = topic_link.get('redirect_url', topic_link['url']) redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url) self_link = 1 if story_domain in (url_domain, redirect_url_domain) else 0 db.query( """ insert into topic_domains (topics_id, domain, self_links, all_links) values(%(topics_id)s, %(domain)s, %(self_link)s, 1) on conflict (topics_id, md5(domain)) do update set self_links = topic_domains.self_links + %(self_link)s, all_links = topic_domains.all_links + 1 """, {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain, 'self_link': self_link})
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict: """Generate a story based on the given user, as returned by the twitter api.""" content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description']) title = '%s (%s) | Twitter' % (user['name'], user['screen_name']) tweet_date = mediawords.util.sql.sql_now() url = 'https://twitter.com/%s' % user['screen_name'] story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date) mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True) for topic_fetch_url in topic_fetch_urls: topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story) mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id( db, topic_fetch_url) # twitter user pages are undateable because there is never a consistent version of the page undateable_tag = _get_undateable_tag(db) db.query( "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)", { 'a': story['stories_id'], 'b': undateable_tag['tags_id'] }) return story
def _get_undateable_tag(db: DatabaseHandler) -> dict:
    """Return the date_invalid:undateable tag."""
    tag_name = 'undateable'
    tag_set_name = 'date_invalid'

    invalid_tag = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """,
        {
            'a': tag_name,
            'b': tag_set_name
        }).hash()

    if invalid_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': tag_set_name})
        invalid_tag = db.find_or_create('tags', {
            'tag': tag_name,
            'tag_sets_id': tag_set['tag_sets_id']
        })

    return invalid_tag
def __print_table_csv_to_stdout(db: DatabaseHandler, table: str) -> None: """Print table dump to STDOUT.""" column_names = db.query("SELECT * FROM %s LIMIT 0" % table).columns() primary_key_column = db.primary_key_column(table=table) print(""" -- -- Table '%(table)s' -- """ % {'table': table}) # Python's "csv" module doesn't bother to differentiate between empty strings and "None" values: # # http://stackoverflow.com/a/11379550/200603 # # ...so we're exporting the table in "TEXT" format with a cumbersome "\\N" (two-backslashes-N) mark for NULL values. print( "COPY %(table)s (%(column_names)s) FROM STDIN WITH (FORMAT TEXT, NULL '\\\\N');" % { 'table': table, 'column_names': ', '.join(column_names), }) csv_writer = csv.writer(sys.stdout, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE) res = db.query("SELECT * FROM %(table)s ORDER BY %(primary_key_column)s" % { 'table': table, 'primary_key_column': primary_key_column, }) postgresql_null_value = '\\N' postgresql_end_of_data = '\.' while True: row = res.array() if row is None: break else: csv_writer.writerow( [postgresql_null_value if val is None else val for val in row]) print(postgresql_end_of_data) print(""" -- Update sequence head SELECT setval( pg_get_serial_sequence('%(table)s', '%(primary_key_column)s'), (SELECT max(%(primary_key_column)s)+1 FROM %(table)s) ); """ % { 'table': table, 'primary_key_column': primary_key_column, })
def _call_function_on_url_chunks(db: DatabaseHandler, topic: dict, urls: List, chunk_function: Callable) -> None: """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time. Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk. """ i = 0 while i < len(urls): chunk_urls = urls[i:i + URLS_CHUNK_SIZE] try: chunk_function(db, topic, chunk_urls) except Exception as ex: log.warning("error fetching twitter data: {}".format(ex)) topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in urls] db.query( "update topic_fetch_urls set state = %(a)s, message = %(b)s where topic_fetch_urls_id = any(%(c)s)", { 'a': FETCH_STATE_PYTHON_ERROR, 'b': traceback.format_exc(), 'c': topic_fetch_urls_ids }) i += URLS_CHUNK_SIZE
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> dict: """Update topic_fetch_url state to tweet missing.""" db.query( "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s", { 'a': FETCH_STATE_TWEET_MISSING, 'b': topic_fetch_url['topic_fetch_urls_id'] })
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> dict: """Update topic_fetch_url state to content match failed.""" db.query( "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s", { 'a': FETCH_STATE_CONTENT_MATCH_FAILED, 'b': topic_fetch_url['topic_fetch_urls_id'] })
def __kill_connections_to_database(db: DatabaseHandler, database_name: str) -> None: """Kill all active connections to the database.""" # If multiple Python test files get run in a sequence and one of them fails, the test apparently doesn't call # tearDown() and the the connection to the test database persists (apparently) db.query(""" SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_catalog.pg_stat_activity WHERE datname = %(template_db_name)s AND pid != pg_backend_pid() """, {'template_db_name': database_name})
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or otherwise generates
    the content using _get_test_content().
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def __print_table_csv_to_stdout(db: DatabaseHandler, table: str) -> None: """Print table dump to STDOUT.""" column_names = db.query("SELECT * FROM %s LIMIT 0" % table).columns() primary_key_column = db.primary_key_column(table) print(""" -- -- Table '%(table)s' -- """ % {'table': table}) # Python's "csv" module doesn't bother to differentiate between empty strings and "None" values: # # http://stackoverflow.com/a/11379550/200603 # # ...so we're exporting the table in "TEXT" format with a cumbersome "\\N" (two-backslashes-N) mark for NULL values. print("COPY %(table)s (%(column_names)s) FROM STDIN WITH (FORMAT TEXT, NULL '\\\\N');" % { 'table': table, 'column_names': ', '.join(column_names), }) csv_writer = csv.writer(sys.stdout, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE) res = db.query("SELECT * FROM %(table)s ORDER BY %(primary_key_column)s" % { 'table': table, 'primary_key_column': primary_key_column, }) postgresql_null_value = '\\N' postgresql_end_of_data = r'\.' while True: row = res.array() if row is None: break else: csv_writer.writerow([postgresql_null_value if val is None else val for val in row]) print(postgresql_end_of_data) print(""" -- Update sequence head SELECT setval( pg_get_serial_sequence('%(table)s', '%(primary_key_column)s'), (SELECT max(%(primary_key_column)s)+1 FROM %(table)s) ); """ % { 'table': table, 'primary_key_column': primary_key_column, })
def __kill_connections_to_database(db: DatabaseHandler, database_name: str) -> None: """Kill all active connections to the database.""" # If multiple Python test files get run in a sequence and one of them fails, the test apparently doesn't call # tearDown() and the the connection to the test database persists (apparently) db.query( """ SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_catalog.pg_stat_activity WHERE datname = %(template_db_name)s AND pid != pg_backend_pid() """, {'template_db_name': database_name})
def fetch_twitter_urls_update_state(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Try fetch_twitter_urls(), update state."""
    try:
        fetch_twitter_urls(db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
    except Exception as ex:
        log.error("Error while fetching URLs with IDs {}: {}".format(topic_fetch_urls_ids, str(ex)))
        db.query("""
            update topic_fetch_urls set
                state = %(a)s,
                message = %(b)s,
                fetch_date = now()
            where topic_fetch_urls_id = any(%(c)s)
        """, {
            'a': FETCH_STATE_PYTHON_ERROR,
            'b': traceback.format_exc(),
            'c': topic_fetch_urls_ids
        })
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> None: """Update topic_fetch_url state to content match failed.""" db.query(""" UPDATE topic_fetch_urls SET state = %(state)s, fetch_date = NOW() WHERE topics_id = %(topics_id)s AND topic_fetch_urls_id = %(topic_fetch_urls_id)s """, { 'state': FETCH_STATE_CONTENT_MATCH_FAILED, 'topics_id': topic_fetch_url['topics_id'], 'topic_fetch_urls_id': topic_fetch_url['topic_fetch_urls_id'], })
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None: """Update topic_fetch_url state to tweet missing.""" db.query(""" UPDATE topic_fetch_urls SET state = %(state)s, fetch_date = NOW() WHERE topics_id = %(topics_id)s AND topic_fetch_urls_id = %(topic_fetch_urls_id)s """, { 'state': FETCH_STATE_TWEET_MISSING, 'topics_id': topic_fetch_url['topics_id'], 'topic_fetch_urls_id': topic_fetch_url['topic_fetch_urls_id'], })
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
def create_test_timespan(db: DatabaseHandler, topic: dict = None, snapshot: dict = None) -> dict:
    """Create simple timespans for testing.

    Must pass either topic or snapshot or both. If a snapshot is not passed, create one.
    """
    assert topic is not None or snapshot is not None

    if not snapshot:
        snapshot = create_test_snapshot(db, topic)

    return db.create(table='timespans', insert_hash={
        'topics_id': snapshot['topics_id'],
        'snapshots_id': snapshot['snapshots_id'],
        'start_date': snapshot['start_date'],
        'end_date': snapshot['end_date'],
        'period': 'overall',
        'story_count': 0,
        'story_link_count': 0,
        'medium_count': 0,
        'medium_link_count': 0,
        'post_count': 0
    })
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int, source_url: str, ref_url: str) -> bool: """Return true if the url should be skipped because it is a self linked domain within the topic. Return true if the domain of the ref_url is the same as the domain of the story_url and one of the following is true: * topic.domains.self_links value for the domain is greater than MAX_SELF_LINKS or * ref_url matches SKIP_SELF_LINK_RE. """ source_domain = get_url_distinctive_domain(source_url) ref_domain = get_url_distinctive_domain(ref_url) if source_domain != ref_domain: return False if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I): return True topic_domain = db.query( "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)", { 'a': topics_id, 'b': ref_domain }).hash() if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS: return True return False
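# Hedged usage sketch: how skip_self_linked_domain_url() is combined with
# increment_domain_links() when deciding whether to store a new topic_link,
# mirroring the url loop in _add_tweet_story(). The helper name below is
# hypothetical; `db`, `topic`, `story` and `url` are assumed to already exist.
def _example_maybe_create_topic_link(db: DatabaseHandler, topic: dict, story: dict, url: str) -> None:
    if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
        log.debug("skipping self linked domain url...")
        return

    topic_link = {
        'topics_id': topic['topics_id'],
        'stories_id': story['stories_id'],
        'url': url,
    }
    db.create('topic_links', topic_link)
    increment_domain_links(db, topic_link)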
def print_long_running_job_states(db: DatabaseHandler):
    """Print a count of job_states that have been running or queued for longer than a day."""
    counts = db.query("""
        select
            class,
            count(*) as count,
            min(last_updated::date) as min_date,
            max(last_updated::date) as max_date
        from (
            select
                *,
                rank() over (
                    partition by class, (args->>'media_id')::int
                    order by last_updated desc
                ) as media_rank,
                args->>'media_id' as media_id
            from job_states
        ) as q
        where
            media_rank = 1 and
            state in ('queued', 'running') and
            last_updated < now() - interval '1 day'
        group by class
        order by class
    """).hashes()

    if len(counts) > 0:
        print("Long Running Jobs:\n")
        for count in counts:
            print("%s: %d (%s - %s)" % (count['class'], count['count'], count['min_date'], count['max_date']))
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int, source_url: str, ref_url: str) -> bool: """Return true if the url should be skipped because it is a self linked domain within the topic. Return true if the domain of the ref_url is the same as the domain of the story_url and one of the following is true: * topic.domains.self_links value for the domain is greater than MAX_SELF_LINKS or * ref_url matches SKIP_SELF_LINK_RE. """ source_domain = mediawords.util.url.get_url_distinctive_domain(source_url) ref_domain = mediawords.util.url.get_url_distinctive_domain(ref_url) if source_domain != ref_domain: return False if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I): return True topic_domain = db.query( "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)", {'a': topics_id, 'b': ref_domain}).hash() if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS: return True return False
def connect_to_db() -> DatabaseHandler:
    """Connect to PostgreSQL."""

    db_config = CommonConfig.database()
    retries_config = db_config.retries()

    assert retries_config.max_attempts() > 0, "max_attempts must be positive."

    db = None

    for attempt in range(1, retries_config.max_attempts() + 1):
        try:
            log.debug("Connecting to PostgreSQL...")

            db = DatabaseHandler(
                host=db_config.hostname(),
                port=db_config.port(),
                username=db_config.username(),
                password=db_config.password(),
                database=db_config.database_name(),
            )
            if not db:
                raise ValueError("Returned value is None.")

            # Return the database handler upon successful connection
            break

        except Exception as ex:
            error_message = "Unable to connect to %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': db_config.username(),
                'host': db_config.hostname(),
                'port': db_config.port(),
                'database': db_config.database_name(),
                'exception': str(ex),
            }

            log.error(error_message)

            if attempt < retries_config.max_attempts():
                log.info(f"Will retry for #{attempt} time in {retries_config.sleep_between_attempts()} seconds...")
                time.sleep(retries_config.sleep_between_attempts())

            else:
                log.info("Out of retries, giving up and exiting...")

                # Don't throw any exceptions because they might be caught by the try-catch block, and so the caller
                # will just assume that there was something wrong with the input data and proceed with processing the
                # next item in the job queue (e.g. the next story). Instead, just quit and wait for someone to restart
                # the whole app that requires database access.
                fatal_error(error_message)

    return db
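# Illustrative only: the retrying connect_to_db() above either returns a usable
# DatabaseHandler or calls fatal_error(), so callers can use the handle directly.
# The helper name and the query below are hypothetical examples, not part of the
# original module.
def _example_connect_and_query() -> None:
    db = connect_to_db()
    topics = db.query("SELECT topics_id, name FROM topics").hashes()
    log.info(f"found {len(topics)} topics")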
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None: """Given a topic link, increment the self_links count is necessary n the corresponding topic_domains row. Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of topic_links.url or topic_links.redirect_url. """ story = db.require_by_id('stories', topic_link['stories_id']) story_domain = mediawords.util.url.get_url_distinctive_domain(story['url']) url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url']) redirect_url = topic_link.get('redirect_url', topic_link['url']) redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url) if story_domain not in (url_domain, redirect_url_domain): return topic_domain = db.query( """ insert into topic_domains (topics_id, domain, self_links) values(%(topics_id)s, %(domain)s, 1) on conflict (topics_id, md5(domain)) do nothing returning * """, { 'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain } ).hash() # do this update separately instead of as an upsert because the upsert was occasionally deadlocking if not topic_domain: db.query( """ update topic_domains set self_links = topic_domains.self_links + 1 where topics_id = %(topics_id)s and domain = %(domain)s """, { 'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain } )
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict, story: dict) -> dict: """Update topic_fetch_url stat to tweet added.""" return db.query( """ update topic_fetch_urls set state=%(a)s, stories_id=%(b)s, fetch_date=now() where topic_fetch_urls_id=%(c)s returning * """, {'a': FETCH_STATE_TWEET_ADDED, 'b': story['stories_id'], 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
def _log_python_error(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> dict:
    """Update topic_fetch_url state to python error."""
    return db.query(
        """
        update topic_fetch_urls
            set state=%(a)s, fetch_date=now(), message = %(b)s
            where topic_fetch_urls_id=%(c)s
            returning *
        """,
        {
            'a': FETCH_STATE_PYTHON_ERROR,
            'b': message,
            'c': topic_fetch_url['topic_fetch_urls_id']
        }).hash()
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool: """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain. Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url. Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict. """ if 'topic_links_id' not in topic_fetch_url or topic_fetch_url['topic_links_id'] is None: return False topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id']) story = db.require_by_id('stories', topic_link['stories_id']) url = topic_link.get('redirect_url', topic_link['url']) return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'], story['url'], url)
def create_test_snapshot(db: DatabaseHandler, topic: dict) -> dict:
    """Create simple snapshot for testing."""
    return db.create(table='snapshots', insert_hash={
        'topics_id': topic['topics_id'],
        'snapshot_date': topic['end_date'],
        'start_date': topic['start_date'],
        'end_date': topic['end_date']
    })
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool: """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain. Return true if the domain of the linked url is the same as the domain of the linking story and one of the following is true: * topic.domains.self_links value for the domain is greater than MAX_SELF_LINKS or * the linked url matches SKIP_SELF_LINK_RE. Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict. """ if 'topic_links_id' not in topic_fetch_url or topic_fetch_url['topic_links_id'] is None: return False topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id']) story = db.require_by_id('stories', topic_link['stories_id']) story_domain = mediawords.util.url.get_url_distinctive_domain(story['url']) url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url']) redirect_url = topic_link.get('redirect_url', topic_link['url']) if redirect_url is None: redirect_url = topic_link['url'] redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url) link_domain = redirect_url_domain if redirect_url_domain else url_domain if story_domain not in (url_domain, redirect_url_domain): return False for url in (topic_link['url'], redirect_url): if re.search(SKIP_SELF_LINK_RE, url, flags=re.I): return True topic_domain = db.query( "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)", {'a': topic_fetch_url['topics_id'], 'b': link_domain}).hash() if topic_domain and topic_domain['self_links'] > MAX_SELF_LINKS: return True return False
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict, story: dict) -> dict: """Update topic_fetch_url stat to tweet added.""" return db.query(""" UPDATE topic_fetch_urls SET state = %(a)s, stories_id = %(b)s, fetch_date = NOW() WHERE topic_fetch_urls_id = %(c)s RETURNING * """, {'a': FETCH_STATE_TWEET_ADDED, 'b': story['stories_id'], 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
def _call_function_on_url_chunks(db: DatabaseHandler, topic: dict, urls: List, chunk_function: Callable) -> None: """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time. Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk. """ i = 0 while i < len(urls): chunk_urls = urls[i:i + URLS_CHUNK_SIZE] try: chunk_function(db, topic, chunk_urls) except Exception as ex: log.warning("error fetching twitter data: {}".format(ex)) topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in urls] db.query( "update topic_fetch_urls set state = %(a)s, message = %(b)s where topic_fetch_urls_id = any(%(c)s)", {'a': FETCH_STATE_PYTHON_ERROR, 'b': traceback.format_exc(), 'c': topic_fetch_urls_ids}) i += URLS_CHUNK_SIZE
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None: """Given a topic link, increment the self_links count is necessary n the corresponding topic_domains row. Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of topic_links.url or topic_links.redirect_url. """ story = db.require_by_id('stories', topic_link['stories_id']) story_domain = get_url_distinctive_domain(story['url']) url_domain = get_url_distinctive_domain(topic_link['url']) redirect_url = topic_link.get('redirect_url', topic_link['url']) redirect_url_domain = get_url_distinctive_domain(redirect_url) if story_domain not in (url_domain, redirect_url_domain): return topic_domain = db.query( """ INSERT INTO topic_domains (topics_id, domain, self_links) VALUES (%(topics_id)s, %(domain)s, 1) ON CONFLICT (topics_id, md5(domain)) DO NOTHING RETURNING * """, { 'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain }).hash() # do this update separately instead of as an upsert because the upsert was occasionally deadlocking if not topic_domain: db.query( """ UPDATE topic_domains SET self_links = topic_domains.self_links + 1 WHERE topics_id = %(topics_id)s AND domain = %(domain)s """, { 'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain })
def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict, topic_fetch_urls: list) -> dict: """Generate a story based on the given tweet, as returned by the twitter api.""" screen_name = tweet['user']['screen_name'] content = tweet['text'] title = "%s: %s" % (screen_name, content) tweet_date = tweet['created_at'] url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id']) story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date) mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True) for topic_fetch_url in topic_fetch_urls: topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story) mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id( db, topic_fetch_url) urls = mediawords.util.twitter.get_tweet_urls(tweet) for url in urls: if mediawords.tm.domains.skip_self_linked_domain_url( db, topic['topics_id'], story['url'], url): log.info("skipping self linked domain url...") continue topic_link = { 'topics_id': topic['topics_id'], 'stories_id': story['stories_id'], 'url': url } db.create('topic_links', topic_link) mediawords.tm.domains.increment_domain_links(db, topic_link) return story
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict: """Generate a story based on the given user, as returned by the twitter api.""" content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description']) title = '%s (%s) | Twitter' % (user['name'], user['screen_name']) tweet_date = mediawords.util.sql.sql_now() url = 'https://twitter.com/%s' % user['screen_name'] story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date) mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True) for topic_fetch_url in topic_fetch_urls: topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story) mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url) # twitter user pages are undateable because there is never a consistent version of the page undateable_tag = _get_undateable_tag(db) db.query( "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)", {'a': story['stories_id'], 'b': undateable_tag['tags_id']}) return story
def _get_undateable_tag(db: DatabaseHandler) -> dict: """Return the date_invalid:undateable tag.""" tag_name = 'undateable' tag_set_name = 'date_invalid' invalid_tag = db.query( """ select t.* from tags t join tag_sets ts using ( tag_sets_id ) where t.tag = %(a)s and ts.name = %(b)s """, {'a': tag_name, 'b': tag_set_name}).hash() if invalid_tag is None: tag_set = db.find_or_create('tag_sets', {'name': tag_set_name}) invalid_tag = db.find_or_create('tags', {'tag': tag_name, 'tag_sets_id': tag_set['tag_sets_id']}) return invalid_tag
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict: """Create test feed with a simple label belonging to medium.""" label = decode_object_from_bytes_if_needed(label) medium = decode_object_from_bytes_if_needed(medium) return db.create(table='feeds', insert_hash={ 'name': label, 'url': "http://feed.test/%s" % label, 'media_id': int(medium['media_id']), })
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='media',
        insert_hash={
            'name': label,
            'url': "http://media.test/%s" % (label,),
            'is_monitored': True,
            'public_notes': "%s public notes" % (label,),
            'editor_notes': "%s editor notes" % (label,),
        })
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict: """Create test feed with a simple label belonging to medium.""" label = decode_object_from_bytes_if_needed(label) medium = decode_object_from_bytes_if_needed(medium) return db.create( table='feeds', insert_hash={ 'name': label, 'url': "http://feed.test/%s" % label, 'media_id': int(medium['media_id']), } )
def reset_all_schemas(db_: DatabaseHandler) -> None:
    """Recreate all schemas."""
    schemas = db_.query("""
        SELECT schema_name
        FROM information_schema.schemata
        WHERE
            schema_name NOT LIKE %(schema_pattern)s AND
            schema_name != 'information_schema'
        ORDER BY schema_name
    """, {'schema_pattern': 'pg_%'}).flat()

    # When dropping schemas, PostgreSQL spits out a lot of notices which break "no warnings" unit test
    db_.query('SET client_min_messages=WARNING')

    for schema in schemas:
        db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

    db_.query('SET client_min_messages=NOTICE')
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='topics',
        insert_hash={
            'name': label,
            'description': label,
            'pattern': label,
            'solr_seed_query': label,
            'solr_seed_query_run': True,
            'start_date': '2016-01-01',
            'end_date': '2016-03-01',
            'job_queue': 'mc',
            'max_stories': 100000,
        }
    )
def create_download_for_story(db: DatabaseHandler, feed: dict, story: dict) -> dict:
    """Create a test 'content' download for the given story belonging to the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'success',
            'priority': 1,
            'extracted': False,
            'path': 'postgresql:foo',
            'stories_id': story['stories_id'],
        }
    )
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create a pending test 'feed' download for the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'url': feed['url'],
            'host': host,
            'type': 'feed',
            'sequence': 1,
            'state': 'pending',
            'priority': priority,
            'download_time': 'NOW()',
            'extracted': False,
        })
def __validate_table_foreign_keys(db: DatabaseHandler, table: str) -> None:
    """Validate all table's foreign keys; raise McValidateTableForeignKeysException if any of the keys are invalid.

    If table's constraints aren't right, SQL would be pretty much invalid.
    """
    foreign_keys = db.query("""
        SELECT
            tc.constraint_name,
            tc.table_schema,
            tc.table_name,
            kcu.column_name,
            ccu.table_schema AS foreign_table_schema,
            ccu.table_name AS foreign_table_name,
            ccu.column_name AS foreign_column_name
        FROM information_schema.table_constraints AS tc
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
            JOIN information_schema.constraint_column_usage AS ccu
                ON ccu.constraint_name = tc.constraint_name
        WHERE constraint_type = 'FOREIGN KEY'
          AND tc.table_schema = 'public'
          AND tc.table_name = %(table)s
    """, {'table': table}).hashes()

    foreign_key_errors = []

    for foreign_key in foreign_keys:
        constraint_name = foreign_key['constraint_name']

        log.info("Validating foreign key '%s' for table '%s'..." % (constraint_name, table))

        sql = """
            SELECT DISTINCT a.%(column_name)s
            FROM %(table_schema)s.%(table_name)s AS a
                LEFT JOIN %(foreign_table_schema)s.%(foreign_table_name)s AS b
                    ON a.%(column_name)s = b.%(foreign_column_name)s
            WHERE a.%(column_name)s IS NOT NULL
              AND b.%(foreign_column_name)s IS NULL
            ORDER BY a.%(column_name)s
        """ % {
            'column_name': foreign_key['column_name'],
            'table_schema': foreign_key['table_schema'],
            'table_name': table,
            'foreign_table_schema': foreign_key['foreign_table_schema'],
            'foreign_table_name': foreign_key['foreign_table_name'],
            'foreign_column_name': foreign_key['foreign_column_name'],
        }

        unreferenced_rows = db.query(sql).flat()

        if len(unreferenced_rows) > 0:
            error = """
                Table '%(table)s' has unreferenced rows for constraint '%(constraint_name)s':
                %(unreferenced_rows)s; SQL: %(sql)s
            """ % {
                'table': table,
                'constraint_name': constraint_name,
                'unreferenced_rows': ', '.join(unreferenced_rows),
                'sql': sql,
            }
            foreign_key_errors.append(error)
            log.warning(error)
        else:
            log.info("Foreign key '%s' for table '%s' looks fine." % (constraint_name, table))

    if len(foreign_key_errors) > 0:
        raise McValidateTableForeignKeysException(
            "One or more foreign key checks failed for table '%(table)s': %(foreign_key_errors)s" % {
                'table': table,
                'foreign_key_errors': "\n".join(foreign_key_errors)
            }
        )
def print_exported_tables_to_backup_crawler(db: DatabaseHandler) -> None:
    """Export tables by printing their SQL dump to STDOUT."""

    # Tables to export
    tables = ['tag_sets', 'media', 'feeds', 'tags', 'media_tags_map', 'feeds_tags_map']

    db.begin()

    log.info("Validating foreign keys...")
    foreign_key_errors = []

    for table in tables:
        log.info("Validating foreign keys for table '%s'..." % table)

        # Aggregate errors into an array to be able to print one huge complaint
        try:
            __validate_table_foreign_keys(db=db, table=table)
        except McValidateTableForeignKeysException as ex:
            error = str(ex)
            log.warning("Validating foreign key for table '%s' failed: %s" % (table, error))
            foreign_key_errors.append(error)

    if len(foreign_key_errors):
        raise McPrintExportedTablesToBackupCrawlerException(
            "One or more foreign key checks failed, won't continue as resulting SQL would be invalid:\n\n%s" %
            str(foreign_key_errors)
        )

    log.info("Done validating foreign keys.")

    print("""
--
-- This is a dataset needed for running a backup crawler.
--
-- Import this dump into the backup crawler's PostgreSQL instance.
--

BEGIN;

--
-- Die if schema has not been initialized
--
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM information_schema.tables
        WHERE
            table_schema = CURRENT_SCHEMA() AND
            table_name = 'media'
    ) THEN
        RAISE EXCEPTION 'Table "media" does not exist, please initialize schema.';
    END IF;
END$$;

--
-- Die if something's already in the database
--
DO $$
BEGIN
    IF EXISTS (SELECT 1 FROM media) THEN
        RAISE EXCEPTION 'Table "media" already contains data, please purge the database.';
    END IF;
END$$;

--
-- Temporarily disable constraints to speed up import
--
SET CONSTRAINTS ALL DEFERRED;

--
-- Truncate "tag_sets" table (might already have something)
--
TRUNCATE tag_sets CASCADE;
""")

    log.info("Exporting tables...")
    for table in tables:
        log.info("Exporting table '%s'..." % table)
        __print_table_csv_to_stdout(db=db, table=table)
    log.info("Done exporting tables.")

    db.commit()

    print("""
--
-- Reenable constraints
--
SET CONSTRAINTS ALL IMMEDIATE;

COMMIT;
""")
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> dict: """Update topic_fetch_url state to content match failed.""" db.query( "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s", {'a': FETCH_STATE_CONTENT_MATCH_FAILED, 'b': topic_fetch_url['topic_fetch_urls_id']})
def connect_to_db(
        label: typing.Optional[str] = None,
        do_not_check_schema_version: bool = False,
        is_template: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL.

    Arguments:
    label - db config section label for mediawords.yml
    do_not_check_schema_version - if false, throw an error if the versions in mediawords.yml and the db do not match
    is_template - if true, connect to a db called <db_name>_template instead of <db_name>
    """
    label = decode_str_from_bytes_if_needed(label)

    # If this is a Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException("No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException("No default connection settings found.")
        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException("Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    if is_template:
        database = database + "_template"

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex)
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' % {'db_statement_timeout': db_statement_timeout})

    return ret
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> dict: """Update topic_fetch_url state to tweet missing.""" db.query( "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s", {'a': FETCH_STATE_TWEET_MISSING, 'b': topic_fetch_url['topic_fetch_urls_id']})