def find_or_create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
    """Select a single row from the database matching the hash, or insert a row with the hash values and return the
    inserted row as a hash."""

    # FIXME probably do this in a serialized transaction?

    table = decode_object_from_bytes_if_needed(table)
    insert_hash = decode_object_from_bytes_if_needed(insert_hash)

    insert_hash = insert_hash.copy()  # To be able to safely modify it

    if len(insert_hash) == 0:
        raise McFindOrCreateException("Hash to INSERT or SELECT is empty")

    # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
    if "submit" in insert_hash:
        del insert_hash["submit"]

    row = self.select(table=table, what_to_select='*', condition_hash=insert_hash)
    if row is not None and row.rows() > 0:
        return row.hash()
    else:
        # Try to create it, but if some other process has created it because we don't have a lock, just use that one
        try:
            return self.create(table=table, insert_hash=insert_hash)
        except McUniqueConstraintException:
            return self.select(table=table, what_to_select='*', condition_hash=insert_hash).hash()
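# Illustrative usage sketch (not part of the original module; the table and values are hypothetical).
# find_or_create() is effectively a non-atomic "SELECT, else INSERT", which is why a create() that
# loses the race to another process falls back to a second SELECT.
def _example_find_or_create(db) -> None:
    # Returns the existing 'collections' tag set, or inserts and returns a new row
    tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': 'collections'})
    assert tag_set['name'] == 'collections'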
def __init__(self,
             no_dedup_sentences: bool = False,
             no_delete: bool = False,
             no_tag_extractor_version: bool = False,
             use_cache: bool = False,
             use_existing: bool = False):
    """Constructor."""
    if isinstance(no_dedup_sentences, bytes):
        no_dedup_sentences = decode_object_from_bytes_if_needed(no_dedup_sentences)
    if isinstance(no_delete, bytes):
        no_delete = decode_object_from_bytes_if_needed(no_delete)
    if isinstance(no_tag_extractor_version, bytes):
        no_tag_extractor_version = decode_object_from_bytes_if_needed(no_tag_extractor_version)
    if isinstance(use_cache, bytes):
        use_cache = decode_object_from_bytes_if_needed(use_cache)
    if isinstance(use_existing, bytes):
        use_existing = decode_object_from_bytes_if_needed(use_existing)

    # MC_REWRITE_TO_PYTHON: remove weird casts after Python rewrite
    no_dedup_sentences = bool(int(no_dedup_sentences))
    no_delete = bool(int(no_delete))
    no_tag_extractor_version = bool(int(no_tag_extractor_version))
    use_cache = bool(int(use_cache))
    use_existing = bool(int(use_existing))

    self.__no_dedup_sentences = no_dedup_sentences
    self.__no_delete = no_delete
    self.__no_tag_extractor_version = no_tag_extractor_version
    self.__use_cache = use_cache
    self.__use_existing = use_existing
def create(db: DatabaseHandler, download: dict, extract: dict) -> dict:
    """Create a download_text hash and insert it into the database. Delete any existing download_text row for the
    download."""

    # FIXME don't pass freeform "extract" dict, we need just the "extracted_text"

    download = decode_object_from_bytes_if_needed(download)
    extract = decode_object_from_bytes_if_needed(extract)

    db.query("""
        DELETE FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    download_text = db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        RETURNING *
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extract['extracted_text'],
    }).hash()

    db.query("""
        UPDATE downloads
        SET extracted = 't'
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    return download_text
def __init__(self,
             email: str,
             full_name: str = None,
             notes: str = None,
             active: bool = None,
             weekly_requests_limit: int = None,
             weekly_requested_items_limit: int = None,
             password: str = None,
             password_repeat: str = None,
             role_ids: List[int] = None):
    super().__init__(
        email=email,
        full_name=full_name,
        notes=notes,
        active=active,
        weekly_requests_limit=weekly_requests_limit,
        weekly_requested_items_limit=weekly_requested_items_limit,
    )

    password = decode_object_from_bytes_if_needed(password)
    password_repeat = decode_object_from_bytes_if_needed(password_repeat)

    if password is not None and password_repeat is not None:
        password_validation_message = validate_new_password(
            email=self.email(),
            password=password,
            password_repeat=password_repeat,
        )
        if password_validation_message:
            raise McAuthUserException("Password is invalid: %s" % password_validation_message)

    self.__password = password
    self.__password_repeat = password_repeat
    self.__role_ids = role_ids
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add the story to the database with the feed of the download as the story feed.

    Returns the created story, or None if the story wasn't created.
    """
    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within a transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None
        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
def __init__(self,
             host: str,
             port: int,
             username: str,
             password: str,
             database: str,
             do_not_check_schema_version: bool = False):
    """Database handler constructor; connects to PostgreSQL too."""

    host = decode_object_from_bytes_if_needed(host)
    # noinspection PyTypeChecker
    port = int(decode_object_from_bytes_if_needed(port))
    username = decode_object_from_bytes_if_needed(username)
    password = decode_object_from_bytes_if_needed(password)
    database = decode_object_from_bytes_if_needed(database)

    self.__primary_key_columns = {}
    self.__schema_version_check_pids = {}
    self.__print_warnings = True
    self.__in_manual_transaction = False
    self.__conn = None
    self.__db = None

    self.__connect(
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
        do_not_check_schema_version=do_not_check_schema_version,
    )
def run_job(cls, stories_id: int, topics_id: int) -> None:
    """Run the extract_story_links job, using mediawords.tm.extract_story_links for the logic."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    if stories_id is None:
        raise McExtractStoryLinksJobException("'stories_id' is None.")

    if isinstance(topics_id, bytes):
        topics_id = decode_object_from_bytes_if_needed(topics_id)
    if topics_id is None:
        raise McExtractStoryLinksJobException("'topics_id' is None.")

    stories_id = int(stories_id)
    topics_id = int(topics_id)

    log.info("Starting to extract links for stories_id %d, topics_id %d" % (stories_id, topics_id))

    try:
        db = connect_to_db()
        story = db.require_by_id(table='stories', object_id=stories_id)
        topic = db.require_by_id(table='topics', object_id=topics_id)
        mediawords.tm.extract_story_links.extract_links_for_topic_story(db, story, topic)

    except Exception as ex:
        log.error("Error while processing story {}: {}".format(stories_id, ex))
        raise McExtractStoryLinksJobException(
            "Unable to process story {}: {}".format(stories_id, traceback.format_exc())
        )

    log.info("Finished extracting links for stories_id %d, topics_id %d" % (stories_id, topics_id))
def send_password_reset_token(db: DatabaseHandler, email: str, password_reset_link: str) -> None:
    """Prepare for password reset by emailing the password reset token."""

    email = decode_object_from_bytes_if_needed(email)
    password_reset_link = decode_object_from_bytes_if_needed(password_reset_link)

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
        full_name = user.full_name()
    except Exception as ex:
        log.warning("Unable to fetch user profile for user '%s': %s" % (email, str(ex),))
        full_name = 'Nonexistent user'

    # If the user was not found, send an email to a random address anyway to avoid a timing attack
    full_password_reset_link = _generate_password_reset_token(
        db=db,
        email=email,
        password_reset_link=password_reset_link,
    )
    if not full_password_reset_link:
        log.warning("Unable to generate full password reset link for email '%s'" % email)
        email = '*****@*****.**'
        full_password_reset_link = 'password reset link'

    message = AuthResetPasswordMessage(to=email, full_name=full_name, password_reset_url=full_password_reset_link)
    if not send_email(message):
        raise McAuthResetPasswordException('Unable to send password reset email.')
def password_reset_token_is_valid(db: DatabaseHandler, email: str, password_reset_token: str) -> bool:
    """Validate password reset token (used for both user activation and password reset)."""
    email = decode_object_from_bytes_if_needed(email)
    password_reset_token = decode_object_from_bytes_if_needed(password_reset_token)

    if not (email and password_reset_token):
        log.error("Email and / or password reset token is empty.")
        return False

    # Fetch readonly information about the user
    password_reset_token_hash = db.query("""
        SELECT auth_users_id, email, password_reset_token_hash
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()

    if password_reset_token_hash is None or 'auth_users_id' not in password_reset_token_hash:
        log.error("Unable to find user %s in the database." % email)
        return False

    password_reset_token_hash = password_reset_token_hash['password_reset_token_hash']

    return password_hash_is_valid(password_hash=password_reset_token_hash, password=password_reset_token)
def validate_new_password(email: str, password: str, password_repeat: str) -> str:
    """Check if the password complies with the strength requirements.

    Returns an empty string on a valid password, an error message on an invalid one."""
    email = decode_object_from_bytes_if_needed(email)
    password = decode_object_from_bytes_if_needed(password)
    password_repeat = decode_object_from_bytes_if_needed(password_repeat)

    if not email:
        return 'Email address is empty.'

    if not (password and password_repeat):
        return 'To set the password, please repeat the new password twice.'

    if password != password_repeat:
        return 'Passwords do not match.'

    if len(password) < __MIN_PASSWORD_LENGTH or len(password) > __MAX_PASSWORD_LENGTH:
        return 'Password must be between %d and %d characters in length.' % (
            __MIN_PASSWORD_LENGTH,
            __MAX_PASSWORD_LENGTH,
        )

    if password == email:
        return "New password is your email address; don't cheat!"

    return ''
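# Illustrative sketch (hypothetical inputs): validate_new_password() signals success with an
# empty string and failure with a human-readable message, so callers test its truthiness.
def _example_validate_new_password() -> None:
    error = validate_new_password(
        email='user@example.com',
        password='correct horse battery staple',
        password_repeat='correct horse battery staple',
    )
    if error:
        raise McAuthUserException("Password is invalid: %s" % error)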
def extract_tarball_to_directory(archive_file: str, dest_directory: str, strip_root: bool = False) -> None:
    """Extract Tar archive (.tar, .tar.gz or .tgz) to destination directory, optionally stripping the root directory
    first."""

    archive_file = decode_object_from_bytes_if_needed(archive_file)
    dest_directory = decode_object_from_bytes_if_needed(dest_directory)

    if not os.path.isfile(archive_file):
        raise McExtractTarballToDirectoryException("Archive at '%s' does not exist" % archive_file)

    archive_file_extension = file_extension(archive_file)
    if archive_file_extension in [".gz", ".tgz"]:
        tar_args = "-zxf"
    elif archive_file_extension in [".tar"]:
        tar_args = "-xf"
    else:
        raise McExtractTarballToDirectoryException(
            "Unsupported archive '%s' with extension '%s'" % (archive_file, archive_file_extension)
        )

    args = ["tar", tar_args, archive_file, "-C", dest_directory]
    if strip_root:
        args += ['--strip', '1']

    try:
        run_command_in_foreground(args)
    except McRunCommandInForegroundException as ex:
        raise McExtractTarballToDirectoryException(
            "Error while extracting archive '%s': %s" % (archive_file, str(ex))
        )
def link_canonical_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            match = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if match:
                url = match.group(1)
                if not is_http_url(url):
                    # Maybe it's an absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        log.debug(
                            "HTML <link rel='canonical'/> found, but the new URL '%s' doesn't seem to be valid." % url
                        )
                else:
                    # Looks like a URL, so return it
                    return url

    return None
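# Illustrative sketch (hypothetical HTML and URLs): a relative canonical href is resolved
# against base_url, while an absolute URL is returned as-is.
def _example_link_canonical_url_from_html() -> None:
    html = '<link rel="canonical" href="/canonical-page" />'
    url = link_canonical_url_from_html(html=html, base_url='http://www.example.com/some/page')
    assert url == 'http://www.example.com/canonical-page'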
def select(self, table: str, what_to_select: str, condition_hash: dict = None) -> DatabaseResult:
    """SELECT chosen columns from the table that match given conditions."""
    table = decode_object_from_bytes_if_needed(table)
    what_to_select = decode_object_from_bytes_if_needed(what_to_select)
    condition_hash = decode_object_from_bytes_if_needed(condition_hash)

    if condition_hash is None:
        condition_hash = {}

    condition_hash = condition_hash.copy()  # To be able to safely modify it

    # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
    if "submit" in condition_hash:
        del condition_hash["submit"]

    sql_conditions = []
    for key, value in condition_hash.items():
        condition = key
        condition += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
        sql_conditions.append(condition)

        # Cast Inline::Python's booleans to Python's booleans
        # MC_REWRITE_TO_PYTHON: remove after porting
        if type(value).__name__ == '_perl_obj':
            value = bool(value)
            condition_hash[key] = value

    sql = "SELECT %s " % what_to_select
    sql += "FROM %s " % table
    if len(sql_conditions) > 0:
        sql += "WHERE %s" % " AND ".join(sql_conditions)

    return self.query(sql, condition_hash)
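# Illustrative sketch (hypothetical table and columns): each condition_hash key becomes a
# "key = %(key)s" pair joined with AND, so the call below runs
# "SELECT * FROM downloads WHERE state = %(state)s AND type = %(type)s".
def _example_select(db) -> None:
    pending_feed_downloads = db.select(
        table='downloads',
        what_to_select='*',
        condition_hash={'state': 'pending', 'type': 'feed'},
    ).hashes()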
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(
        table='stories',
        insert_hash={
            'media_id': int(feed['media_id']),
            'url': "http://story.test/%s" % label,
            'guid': "guid://story.test/%s" % label,
            'title': "story %s" % label,
            'description': "description %s" % label,
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
    )

    db.create(
        table='feeds_stories_map',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'stories_id': int(story['stories_id']),
        }
    )

    return story
def find_by_id(self, table: str, object_id: int) -> Union[Dict[str, Any], None]:
    """Do an ID lookup on the table and return a single row match if found."""

    # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
    # noinspection PyTypeChecker
    object_id = decode_object_from_bytes_if_needed(object_id)
    object_id = int(object_id)

    table = decode_object_from_bytes_if_needed(table)

    primary_key_column = self.primary_key_column(table)
    if not primary_key_column:
        raise McFindByIDException("Primary key for table '%s' was not found" % table)

    # Python substitution
    find_by_id_query = "SELECT * FROM %(table)s WHERE %(id_column)s" % {
        "table": table,
        "id_column": primary_key_column,
    }

    # psycopg2 substitution
    result = self.query(find_by_id_query + " = %(id_value)s", {'id_value': object_id})
    if result.rows() > 1:
        raise McFindByIDException("More than one row was found for ID '%d' from table '%s'" % (object_id, table))
    elif result.rows() == 1:
        return result.hash()
    else:
        return None
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay the download of content by this many hours. This is useful for sources that are likely to
        # significantly change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
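# Worked example for the scheduling arithmetic above (hypothetical numbers): with
# media.content_delay = 3 (hours) and "now" at epoch 1,000,000, the child download gets
# download_time = 1,000,000 + 3 * 60 * 60 = 1,010,800, i.e. three hours in the future.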
def store_test_data_to_individual_files(basename: str, data: list) -> None:
    """Write the given data to disk under the given basename; split the data (list) into individual files."""
    basename = decode_object_from_bytes_if_needed(basename)
    data = decode_object_from_bytes_if_needed(data)

    data_dict = {}
    for story in data:
        stories_id = story.get('stories_id', None)
        if not stories_id:
            raise McStoreTestDataToIndividualFilesException("Story ID is unset for story: {}".format(story))
        if stories_id in data_dict:
            raise McStoreTestDataToIndividualFilesException(
                "Story ID is not unique (such story already exists in a dict) for story: {}".format(story)
            )
        data_dict[stories_id] = story

    # Remove all files before overwriting them (in case the new unit test contains *fewer* stories, we don't want
    # old files lying around)
    old_data_files = __test_data_files(basename=basename)
    log.info("Will remove old data files at path '{}': {}".format(basename, old_data_files))
    for path in old_data_files:
        os.unlink(path)

    # Write dict to files
    for index in data_dict.keys():
        store_test_data(basename=str(index), data=data_dict[index], subdirectory=basename)
def get_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str, lock_id: int, wait: bool = False) -> bool:
    """Get a postgres advisory lock with the lock_type and lock_id as the two keys.

    Arguments:
    db - db handle
    lock_type - must be in LOCK_TYPES dict above
    lock_id - id for the particular lock within the type
    wait - if True, block while waiting for the lock, else return False if the lock is not available

    Returns:
    True if the lock is available
    """
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))
    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)
    if isinstance(wait, bytes):
        wait = decode_object_from_bytes_if_needed(wait)
    wait = bool(wait)

    log.debug("trying for lock: %s, %d" % (lock_type, lock_id))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    if wait:
        db.query("select pg_advisory_lock(%(a)s, %(b)s)", {'a': lock_type_id, 'b': lock_id})
        return True
    else:
        r = db.query("select pg_try_advisory_lock(%(a)s, %(b)s) as locked", {'a': lock_type_id, 'b': lock_id}).hash()
        return r['locked']
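# Illustrative sketch (assumes a connected db handle; the 'test-a' lock type is hypothetical
# and would have to exist in LOCK_TYPES). With wait=False, pg_try_advisory_lock() returns
# immediately, letting a caller skip work that another session is already doing.
def _example_get_session_lock(db: mediawords.db.DatabaseHandler) -> None:
    if get_session_lock(db, lock_type='test-a', lock_id=1, wait=False):
        pass  # do the work guarded by the lock
    else:
        pass  # another session holds the lock; skip or retry later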
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""

    # The "feed_error" state indicates that the download was successful but that there was a problem parsing the feed
    # afterwards, so we want to keep the "feed_error" state even if we redownload the content.

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" % (download['downloads_id'], ex)
        )

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
def _set_extractor_results_cache(db, download: dict, results: dict) -> None:
    """Store results in extractor cache and manage size of cache."""

    # This cache is used as a backhanded way of extracting stories asynchronously in the topic spider. Instead of
    # submitting extractor jobs and then directly checking whether a given story has been extracted, we just throw
    # extraction jobs in chunks into the extractor job and cache the results. Then if we re-extract the same story
    # shortly after, this cache will hit and the cost will be trivial.

    download = decode_object_from_bytes_if_needed(download)
    results = decode_object_from_bytes_if_needed(results)

    # Upsert cache entry
    db.query("""
        INSERT INTO cache.extractor_results_cache (extracted_html, extracted_text, downloads_id)
        VALUES (%(extracted_html)s, %(extracted_text)s, %(downloads_id)s)
        ON CONFLICT (downloads_id) DO UPDATE SET
            extracted_html = EXCLUDED.extracted_html,
            extracted_text = EXCLUDED.extracted_text
    """, {
        'extracted_html': results['extracted_html'],
        'extracted_text': results['extracted_text'],
        'downloads_id': int(download['downloads_id']),
    })
def fetch_test_data(basename: str, subdirectory: str = '') -> dict:
    """Fetch the given data from disk."""
    basename = decode_object_from_bytes_if_needed(basename)
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    file_path = _get_data_file(basename=basename, subdirectory=subdirectory)
    with open(file_path, mode='r', encoding='utf-8') as f:
        return decode_json(f.read())
def __init__(self, api_key: str, ip_address: str = None):
    api_key = decode_object_from_bytes_if_needed(api_key)
    ip_address = decode_object_from_bytes_if_needed(ip_address)

    if not api_key:
        raise McAuthUserException("API key is unset.")

    self.__api_key = api_key
    self.__ip_address = ip_address
def _get_all_string_match_positions(haystack: str, needle: str) -> List[int]:
    """Return the start positions of all (regex) matches of "needle" in "haystack"."""
    haystack = decode_object_from_bytes_if_needed(haystack)
    needle = decode_object_from_bytes_if_needed(needle)

    positions = []
    for match in re.finditer(pattern=needle, string=haystack):
        positions.append(match.start())
    return positions
def store_test_data(basename: str, data: dict, subdirectory: str = '') -> None:
    """Write the given data to disk under the given basename."""
    basename = decode_object_from_bytes_if_needed(basename)
    data = decode_object_from_bytes_if_needed(data)
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    file_path = _get_data_file(basename=basename, subdirectory=subdirectory)
    with open(file_path, mode='w', encoding='utf-8') as f:
        f.write(encode_json(json_obj=data, pretty=True))
def change_password(db: DatabaseHandler,
                    email: str,
                    new_password: str,
                    new_password_repeat: str,
                    do_not_inform_via_email: bool = False) -> None:
    """Change user's password."""
    email = decode_object_from_bytes_if_needed(email)
    new_password = decode_object_from_bytes_if_needed(new_password)
    new_password_repeat = decode_object_from_bytes_if_needed(new_password_repeat)
    if isinstance(do_not_inform_via_email, bytes):
        do_not_inform_via_email = decode_object_from_bytes_if_needed(do_not_inform_via_email)
    do_not_inform_via_email = bool(int(do_not_inform_via_email))

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthChangePasswordException('User with email address "%s" does not exist.' % email)

    password_validation_message = validate_new_password(
        email=email,
        password=new_password,
        password_repeat=new_password_repeat,
    )
    if password_validation_message:
        raise McAuthChangePasswordException("Unable to change password: %s" % password_validation_message)

    # Hash + validate the password
    try:
        password_new_hash = generate_secure_hash(password=new_password)
    except Exception as ex:
        raise McAuthChangePasswordException("Unable to hash a new password: %s" % str(ex))

    if not password_new_hash:
        raise McAuthChangePasswordException("Generated password hash is empty.")

    # Set the password hash
    db.query("""
        UPDATE auth_users
        SET password_hash = %(password_hash)s,
            active = TRUE
        WHERE email = %(email)s
    """, {
        'email': email,
        'password_hash': password_new_hash,
    })

    if not do_not_inform_via_email:
        message = AuthPasswordChangedMessage(to=email, full_name=user.full_name())
        if not send_email(message):
            raise McAuthChangePasswordException(
                'The password has been changed, but I was unable to send an email notifying you about the change.'
            )
def update_by_id(self, table: str, object_id: int, update_hash: dict) -> Union[Dict[str, Any], None]:
    """Update the row in the table with the given ID. Ignore any fields that start with '_'."""

    # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
    # noinspection PyTypeChecker
    object_id = decode_object_from_bytes_if_needed(object_id)
    object_id = int(object_id)

    table = decode_object_from_bytes_if_needed(table)
    update_hash = decode_object_from_bytes_if_needed(update_hash)

    update_hash = update_hash.copy()  # To be able to safely modify it

    # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
    if "submit" in update_hash:
        del update_hash["submit"]

    update_hash = {k: v for k, v in update_hash.items() if not k.startswith("_")}

    if len(update_hash) == 0:
        raise McUpdateByIDException("Hash to UPDATE is empty.")

    primary_key_column = self.primary_key_column(table)
    if not primary_key_column:
        raise McUpdateByIDException("Primary key for table '%s' was not found" % table)

    keys = []
    for key, value in update_hash.items():
        key_value = key

        # Cast Inline::Python's booleans to Python's booleans
        # MC_REWRITE_TO_PYTHON: remove after porting
        if type(value).__name__ == '_perl_obj':
            value = bool(value)
            update_hash[key] = value

        key_value += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
        keys.append(key_value)

    update_hash['__object_id'] = object_id

    sql = "UPDATE %s " % table
    sql += "SET %s " % ", ".join(keys)
    sql += "WHERE %s = " % primary_key_column
    sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

    try:
        self.query(sql, update_hash)
    except Exception as ex:
        raise McUpdateByIDException("Unable to UPDATE with hash '%s': %s" % (str(update_hash), str(ex)))

    updated_row = self.find_by_id(table=table, object_id=object_id)

    return updated_row
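# Illustrative sketch (hypothetical ID and fields): keys starting with "_" are stripped before
# the UPDATE statement is built, so only "state" gets written here.
def _example_update_by_id(db) -> None:
    db.update_by_id(
        table='downloads',
        object_id=123,
        update_hash={'state': 'success', '_not_persisted': True},
    )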
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Add a 'download' and a 'content' field to each story in the test story stack. Store the content in the download
    store. Use the story['content'] field if present, otherwise generate the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def add_story_and_content_download(db: DatabaseHandler, story: dict, parent_download: dict) -> Optional[dict]:
    """If the story is new, add it to the database and also add a pending download for the story content."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    story = add_story(db=db, story=story, feeds_id=parent_download['feeds_id'])

    if story is not None:
        _create_child_download_for_story(db=db, story=story, parent_download=parent_download)

    return story
def __init__(self,
             email: str,
             full_name: str = None,
             notes: str = None,
             active: bool = None,
             weekly_requests_limit: int = None,
             weekly_requested_items_limit: int = None,
             password: str = None,
             password_repeat: str = None,
             role_ids: List[int] = None,
             subscribe_to_newsletter: bool = None,
             activation_url: str = None):
    if not full_name:
        raise McAuthUserException("User full name is unset.")
    if notes is None:
        raise McAuthUserException("User notes are undefined (should be at least an empty string).")
    if not isinstance(role_ids, list):
        raise McAuthUserException("List of role IDs is not an array.")
    if not password:
        raise McAuthUserException("Password is unset.")
    if not password_repeat:
        raise McAuthUserException("Password repeat is unset.")

    # Password will be verified by ::NewOrModifyUser

    # Either activate the user right away, or make it inactive and send out an email with an activation link
    if (active and activation_url) or (not active and not activation_url):
        raise McAuthUserException("Either make the user active or set the activation URL.")

    super().__init__(
        email=email,
        full_name=full_name,
        notes=notes,
        active=active,
        weekly_requests_limit=weekly_requests_limit,
        weekly_requested_items_limit=weekly_requested_items_limit,
        password=password,
        password_repeat=password_repeat,
        role_ids=role_ids,
    )

    if isinstance(subscribe_to_newsletter, bytes):
        subscribe_to_newsletter = decode_object_from_bytes_if_needed(subscribe_to_newsletter)
    subscribe_to_newsletter = bool(int(subscribe_to_newsletter or 0))

    activation_url = decode_object_from_bytes_if_needed(activation_url)

    self.__subscribe_to_newsletter = subscribe_to_newsletter
    self.__activation_url = activation_url
def __init__(self, role_id: int, role_name: str):
    if isinstance(role_id, bytes):
        role_id = decode_object_from_bytes_if_needed(role_id)
    role_name = decode_object_from_bytes_if_needed(role_name)

    if not role_id:
        raise McAuthUserException("Role ID is unset.")
    if not role_name:
        raise McAuthUserException("Role name is unset.")

    self.__role_id = role_id
    self.__role_name = role_name
def fetch_annotation_for_story(self, db: DatabaseHandler, stories_id: int) -> Union[dict, list, None]:
    """Fetch the annotation from key-value store for the story, or None if the story is not annotated."""

    if not self.annotator_is_enabled():
        fatal_error("Annotator is not enabled in the configuration.")

    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    if not self.story_is_annotated(db=db, stories_id=stories_id):
        log.warning("Story %d is not annotated." % stories_id)
        return None

    json = self.__postgresql_store.fetch_content(db=db, object_id=stories_id)
    if json is None:
        raise McJSONAnnotatorException("Fetched annotation is undefined or empty for story %d." % stories_id)

    json = json.decode('utf-8')

    try:
        annotation = decode_json(json)
        if annotation is None:
            raise McJSONAnnotatorException("Annotation is None after decoding from JSON.")
    except Exception as ex:
        raise McJSONAnnotatorException(
            "Unable to parse annotation JSON for story %d: %s\nString JSON: %s" % (stories_id, str(ex), json,)
        )

    try:
        annotation = self._preprocess_stored_annotation(annotation)
        if annotation is None:
            raise McJSONAnnotatorException("Annotation is None after preprocessing.")
    except Exception as ex:
        fatal_error(
            "Unable to preprocess stored annotation for story %d: %s\nString JSON: %s" % (stories_id, str(ex), json,)
        )

    return annotation
def stem_words(self, words: List[str]) -> List[str]:
    """Stem list of words with PyStemmer."""
    language_code = self.language_code()
    words = decode_object_from_bytes_if_needed(words)

    # Normalize apostrophes so that "it’s" and "it's" get treated identically (it's being done in
    # _tokenize_with_spaces() too, but let's not assume that all tokens that are to be stemmed go through sentence
    # tokenization first)
    words = [word.replace("’", "'") for word in words]

    if language_code is None:
        raise McLanguageException("Language code is None.")

    if words is None:
        raise McLanguageException("Words to stem is None.")

    # (Re-)initialize stemmer if needed
    if self.__pystemmer is None:
        try:
            self.__pystemmer = PyStemmer(language_code)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
            )

    stems = self.__pystemmer.stemWords(words)

    if len(words) != len(stems):
        log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

    # Perl's Snowball implementation used to return lowercase stems
    stems = [stem.lower() for stem in stems]

    return stems
def tokenize_sentence_to_words(self, sentence: str) -> list:
    """Tokenize Japanese sentence into words.

    Removes punctuation and words that don't belong to the part-of-speech whitelist."""

    sentence = decode_object_from_bytes_if_needed(sentence)

    if sentence is None:
        log.warning("Sentence to tokenize into words is None.")
        return []

    sentence = sentence.strip()

    if len(sentence) == 0:
        return []

    parsed_text = self.__mecab.parse(sentence).strip()
    parsed_tokens = parsed_text.split("\n")

    allowed_pos_ids = self._mecab_allowed_pos_ids()

    words = []
    for parsed_token_line in parsed_tokens:
        if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:
            primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR)

            primary_form = primary_form_and_pos_number[0]
            pos_number = primary_form_and_pos_number[1]

            if pos_number.isdigit():
                pos_number = int(pos_number)

                if pos_number in allowed_pos_ids:
                    words.append(primary_form)
        else:
            # Ignore all the "EOS" stuff
            pass

    return words
def recreate_db(label: str = None) -> None:
    """(Re)create database schema."""

    def reset_all_schemas(db_: DatabaseHandler) -> None:
        """Recreate all schemas."""
        schemas = db_.query("""
            SELECT schema_name
            FROM information_schema.schemata
            WHERE schema_name NOT LIKE %(schema_pattern)s
              AND schema_name != 'information_schema'
            ORDER BY schema_name
        """, {'schema_pattern': 'pg_%'}).flat()

        # When dropping schemas, PostgreSQL spits out a lot of notices which break the "no warnings" unit test
        db_.query('SET client_min_messages=WARNING')

        for schema in schemas:
            db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

        db_.query('SET client_min_messages=NOTICE')

    # ---

    label = decode_object_from_bytes_if_needed(label)

    db = connect_to_db(label=label, do_not_check_schema_version=True)

    log.info("Resetting all schemas...")
    reset_all_schemas(db_=db)

    db.set_show_error_statement(True)

    mediawords_sql_path = mc_sql_schema_path()
    log.info("Importing from %s..." % mediawords_sql_path)
    with open(mediawords_sql_path, 'r') as mediawords_sql_f:
        mediawords_sql = mediawords_sql_f.read()
        db.query(mediawords_sql)

    log.info("Done.")
def is_homepage_url(url: str) -> bool:
    """Returns True if the URL is a homepage (e.g. http://www.wired.com/) and not a child page (e.g.
    http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/)."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        log.debug("URL is None.")
        return False

    if len(url) == 0:
        log.debug("URL is empty.")
        return False

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        log.debug("URL '%s' is invalid." % url)
        return False

    # Remove cruft from the URL first
    try:
        url = normalize_url(url)
    except McNormalizeURLException as ex:
        log.debug("Unable to normalize URL '%s' before checking if it's a homepage: %s" % (url, ex))
        return False

    # The shortened URL may lead to a homepage URL, but the shortened URL itself is not a homepage URL
    if is_shortened_url(url):
        return False

    # If we still have something for a query of the URL after the normalization, always assume that the URL is *not*
    # a homepage
    uri = furl(url)
    if len(str(uri.query)) > 0:
        return False

    for homepage_url_path_regex in __HOMEPAGE_URL_PATH_REGEXES:
        if re.search(homepage_url_path_regex, str(uri.path)):
            return True

    return False
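# Illustrative sketch reusing the docstring's own examples: a bare host with no path or query
# counts as a homepage, while a deep article path does not.
def _example_is_homepage_url() -> None:
    assert is_homepage_url('http://www.wired.com/') is True
    assert is_homepage_url('http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/') is False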
def get_url_distinctive_domain(url: str) -> str:
    """Return a truncated form of the URL's host (domain) that distinguishes it from others, e.g.:

    * www.whitehouse.gov => whitehouse.gov
    * www.blogspot.com => blogspot.com
    * kardashian.blogspot.com => kardashian.blogspot.com

    Return the original URL if unable to process the URL."""
    try:
        url = decode_object_from_bytes_if_needed(url)

        host = get_url_host(url)
        if host is None:
            return url

        name_parts = host.split('.')
        n = len(name_parts) - 1

        if re.search(r'\.(gov|org|com?)\...$', host, re.I):
            # Return e.g. "foo.co.uk" instead of just "co.uk"
            parts = [str(name_parts[n - 2]), str(name_parts[n - 1]), str(name_parts[n])]
            domain = '.'.join(parts)
        elif re.search(
                r'\.go\.com|\.wordpress\.com|\.blogspot\.|\.livejournal\.com|\.privet\.ru|\.wikia\.com'
                r'|\.feedburner\.com|\.24open\.ru|\.patch\.com|\.tumblr\.com',
                host, re.I
        ):
            # Identify sites in these domains by the whole host name (abcnews.go.com instead of go.com)
            domain = host
        else:
            parts = [str(name_parts[n - 1] or ''), str(name_parts[n] or '')]
            domain = '.'.join(parts)

        return domain.lower()

    except Exception as ex:
        log.debug("get_url_distinctive_domain falling back to url: " + str(ex))
        return str(url).lower()
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")
    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
def extract_article_from_html(html: str) -> str:
    """Extract article HTML from a full HTML file."""

    # FIXME move HTML stripping here too

    html = decode_object_from_bytes_if_needed(html)

    if html is None or html == '':
        return ''

    try:
        doc = readability.readability.Document(html)

        doc_title = doc.short_title().strip()
        doc_summary = doc.summary().strip()

        extracted_text = "%s\n\n%s" % (doc_title, doc_summary)

    except Exception as ex:
        l.error('Exception raised while extracting HTML: %s' % str(ex))
        extracted_text = ''

    return extracted_text
def fetch_content(self, db: DatabaseHandler, object_id: int, object_path: str = None) -> bytes:
    """Read object from PostgreSQL's 'path' row."""
    object_id = self._prepare_object_id(object_id)
    object_path = decode_object_from_bytes_if_needed(object_path)

    if object_path is None:
        raise McDatabaseInlineStoreException("Object path for object ID %d is None." % object_id)

    if not object_path.startswith(self.__CONTENT_PREFIX):
        raise McDatabaseInlineStoreException(
            "Object path for object ID %d is invalid: %s" % (object_id, object_path,)
        )

    object_path = object_path[len(self.__CONTENT_PREFIX):]
    content = object_path.encode('utf-8')

    return content
def decode_json(json_string: str) -> Union[dict, list]:
    """Decode JSON to dictionary or list."""

    json_string = decode_object_from_bytes_if_needed(json_string)

    if json_string is None:
        raise McDecodeJSONException("JSON string is None.")

    if len(json_string) == 0:
        raise McDecodeJSONException("JSON string is empty.")

    try:
        json_obj = json.loads(json_string)
    except Exception as ex:
        raise McDecodeJSONException("Unable to decode string %s from JSON: %s" % (str(json_string), str(ex)))

    if json_obj is None:
        raise McDecodeJSONException("Resulting JSON object is None for string: %s" % (str(json_string),))

    return json_obj
def _delete_story_sentences(db: DatabaseHandler, story: dict) -> None:
    """Delete any existing story sentences for the given story and also update media_stats to adjust for the
    deletion."""
    story = decode_object_from_bytes_if_needed(story)

    num_deleted = db.query("""
        DELETE FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': story['stories_id']}).rows()

    if num_deleted > 0:
        db.query("""
            UPDATE media_stats
            SET num_sentences = num_sentences - %(num_deleted)s
            WHERE media_id = %(media_id)s
              AND stat_date = %(publish_date)s::date
        """, {
            'num_deleted': num_deleted,
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
def fix_common_url_mistakes(url: str) -> Optional[str]:
    """Fixes common URL mistakes (mistypes, etc.)."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        return None

    # Fix broken URLs that look like this: http://http://www.al-monitor.com/pulse
    url = re.sub(r'(https?://)https?:?//', r"\1", url, flags=re.I)

    # Fix URLs with only one slash after "http" ("http:/www.")
    url = re.sub(r'(https?:/)(www)', r"\1/\2", url, flags=re.I)

    # Replace backslashes with forward slashes
    url = re.sub(r'\\', r'/', url)

    # http://newsmachete.com?page=2 -> http://newsmachete.com/?page=2
    url = re.sub(r'(https?://[^/]+)\?', r"\1/?", url)

    return url
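# Illustrative sketch (hypothetical URLs) showing each rewrite performed above:
def _example_fix_common_url_mistakes() -> None:
    assert fix_common_url_mistakes('http://http://www.example.com/') == 'http://www.example.com/'
    assert fix_common_url_mistakes('http:/www.example.com/') == 'http://www.example.com/'
    assert fix_common_url_mistakes('http://example.com?page=2') == 'http://example.com/?page=2'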
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='topics',
        insert_hash={
            'name': label,
            'description': label,
            'pattern': label,
            'solr_seed_query': label,
            'solr_seed_query_run': True,
            'start_date': '2016-01-01',
            'end_date': '2016-03-01',
            'job_queue': 'mc',
            'max_stories': 100000,
            'platform': 'web',
        }
    )
def run_topics_fetch_link(topic_fetch_urls_id: int, domain_timeout: Optional[int] = None) -> None:
    """Fetch a link for a topic and either match it to an existing story or generate a story from it.

    Almost all of the interesting functionality here happens in fetch_topic_url(). The code here just deals with
    routing, including requeueing responses throttled by mediawords.util.web.user_agent.throttled."""
    global _consecutive_requeues

    if isinstance(topic_fetch_urls_id, bytes):
        topic_fetch_urls_id = decode_object_from_bytes_if_needed(topic_fetch_urls_id)
    if topic_fetch_urls_id is None:
        raise McFetchLinkJobException("'topic_fetch_urls_id' is None.")
    topic_fetch_urls_id = int(topic_fetch_urls_id)

    log.info("Start fetch for topic_fetch_url %d" % topic_fetch_urls_id)

    db = connect_to_db()

    try:
        if not fetch_topic_url_update_state(db=db,
                                            topic_fetch_urls_id=topic_fetch_urls_id,
                                            domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("sleeping after %d consecutive requeues ..." % _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so we only need to reset the
        # "consecutive requeues" counter here
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info("Finished fetch for topic_fetch_url %d" % topic_fetch_urls_id)
def update_extractor_version_tag(db: DatabaseHandler, story: dict) -> None:
    """Add extractor version tag to the story."""

    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    story = decode_object_from_bytes_if_needed(story)

    tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})

    db.query("""
        DELETE FROM stories_tags_map AS stm
            USING tags AS t
                JOIN tag_sets AS ts
                    ON ts.tag_sets_id = t.tag_sets_id
        WHERE t.tags_id = stm.tags_id
          AND ts.tag_sets_id = %(tag_sets_id)s
          AND stm.stories_id = %(stories_id)s
    """, {
        'tag_sets_id': tag_set['tag_sets_id'],
        'stories_id': story['stories_id'],
    })

    extractor_version = extractor_name()
    tag = db.find_or_create(
        table='tags',
        insert_hash={'tag': extractor_version, 'tag_sets_id': tag_set['tag_sets_id']},
    )
    tags_id = tag['tags_id']

    db.query("""
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
    """, {
        'stories_id': story['stories_id'],
        'tags_id': tags_id,
    })
def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Generate word2vec model for a given snapshot."""

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException("'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    # FIXME might be more efficient to pass topics_id as a parameter
    topics_id = db.query("""
        SELECT topics_id
        FROM snapshots
        WHERE snapshots_id = %(snapshots_id)s
    """, {'snapshots_id': snapshots_id}).flat()[0]

    log.info(f"Generating word2vec model for topic {topics_id}, snapshot {snapshots_id}...")

    sentence_iterator = SnapshotSentenceIterator(db=db, topics_id=topics_id, snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db, topics_id=topics_id, snapshots_id=snapshots_id)

    train_word2vec_model(sentence_iterator=sentence_iterator, model_store=model_store)

    log.info(f"Finished generating word2vec model for topic {topics_id}, snapshot {snapshots_id}.")
def role_id_for_role(db: DatabaseHandler, role: str) -> int:
    """Fetch a user role's ID for a role; raise if no such role was found."""
    role = decode_object_from_bytes_if_needed(role)

    if not role:
        raise McRoleIDForRoleException("Role is empty.")

    auth_roles_id = db.query("""
        SELECT auth_roles_id
        FROM auth_roles
        WHERE role = %(role)s
        LIMIT 1
    """, {'role': role}).flat()

    if (not auth_roles_id) or (not len(auth_roles_id)):
        raise McRoleIDForRoleException("Role '%s' was not found." % role)

    return int(auth_roles_id[0])
def get_url_host(url: str) -> str:
    """Return the hostname of a URL. If we can't parse out the host name, just return the URL."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McGetURLHostException("URL is None")
    if len(url) == 0:
        raise McGetURLHostException("URL is empty")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        return url

    uri = furl(url)

    host = uri.host
    if host is not None and len(host) > 0:
        return host
    else:
        return url
def delete_user(db: DatabaseHandler, email: str) -> None:
    """Delete user."""
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user_info(db=db, email=email)
    except Exception as _:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    # Delete the user (PostgreSQL's foreign key cascade will take care of 'auth_users_roles_map')
    db.query("""
        DELETE FROM auth_users
        WHERE email = %(email)s
    """, {'email': email})
def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
    """Update the message field for the current "job_states" row.

    This is a public method that is intended to be used by code run anywhere above the stack from run() to publish
    messages updating the progress of a long-running job.
    """
    message = decode_object_from_bytes_if_needed(message)

    # Verify that the "job_states" row still exists
    db.require_by_id(table='job_states', object_id=self.__job_states_id)

    job_state = db.update_by_id(
        table='job_states',
        object_id=self.__job_states_id,
        update_hash={
            'message': message,
            'last_updated': sql_now(),
        },
    )

    self.__update_table_state(db=db, job_state=job_state)
def story_is_annotatable(self, db: DatabaseHandler, stories_id: int) -> bool:
    """Check if story can be annotated."""

    if not self.annotator_is_enabled():
        raise McJSONAnnotatorException("Annotator is not enabled in the configuration.")

    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    story = db.query("""
        SELECT story_is_english_and_has_sentences
        FROM story_is_english_and_has_sentences(%(stories_id)s)
    """, {'stories_id': stories_id}).hash()

    return story is not None and int(story['story_is_english_and_has_sentences']) == 1
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create a pending download for the feed's URL."""
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'url': feed['url'],
            'host': host,
            'type': 'feed',
            'sequence': 1,
            'state': 'pending',
            'priority': priority,
            'download_time': 'NOW()',
            'extracted': False,
        }
    )
def _add_story_tags_to_stories(db: DatabaseHandler, stories: List[Dict[str, Any]]) -> None:
    """Add story tags to stories for Solr indexing."""
    stories = decode_object_from_bytes_if_needed(stories)

    tags = []
    num_tags = 5

    for i in range(1, num_tags + 1):
        tags.append(lookup_or_create_tag(db=db, tag_name=f"test:test_{i}"))

    for story in stories:
        assert isinstance(story, dict)

        # Rotate the tag list so that consecutive stories get different tags
        tag = tags.pop()
        tags.insert(0, tag)

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
        """, {
            'stories_id': story['stories_id'],
            'tags_id': tag['tags_id'],
        })
def _get_first_download(db: DatabaseHandler, story: dict) -> dict:
    """Get the first download linking to this story."""
    story = decode_object_from_bytes_if_needed(story)

    first_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY sequence ASC
        LIMIT 1
    """, {'stories_id': story['stories_id']}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if first_download is None:
        first_download = {}

    return first_download
def run_command_in_foreground(command: List[str]) -> None:
    """Run command in foreground, raise McRunCommandInForegroundException if it fails."""
    l.debug("Running command: %s" % ' '.join(command))

    command = decode_object_from_bytes_if_needed(command)

    # Add some more PATHs to look into
    env_path = os.environ.copy()
    env_path['PATH'] = '/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin:' + env_path['PATH']

    # noinspection PyBroadException
    try:
        if sys.platform.lower() == 'darwin':
            # OS X -- requires some crazy STDOUT / STDERR buffering
            line_buffered = 1
            process = subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=line_buffered,
                env=env_path,
            )
            while True:
                output = process.stdout.readline()
                if len(output) == 0 and process.poll() is not None:
                    break
                l.info(output.strip())
            rc = process.poll()
            if rc > 0:
                raise McRunCommandInForegroundException("Process returned non-zero exit code %d" % rc)
        else:
            # Assume Ubuntu
            subprocess.check_call(command, env=env_path)
    except subprocess.CalledProcessError as ex:
        raise McRunCommandInForegroundException("Process returned non-zero exit code %d" % ex.returncode)
    except Exception as ex:
        raise McRunCommandInForegroundException("Error while running command: %s" % str(ex))
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
        from stories s, topic_stories ts, media m
        where
            s.stories_id = ts.stories_id and
            s.media_id = m.media_id and
            m.foreign_rss_links = true and
            ts.topics_id = %(a)s and
            not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create('topic_seed_urls', {
            'url': story['url'],
            'topics_id': topic['topics_id'],
            'source': 'merge_foreign_rss_stories',
            'content': content,
        })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def language_code_for_text(text: str):
    """Returns an ISO 639-1 language code for the plain text passed as a parameter.

    :param text: Text that should be identified
    :return: ISO 639-1 language code (e.g. 'en') on successful identification, empty string ('') on failure
    """
    text = decode_object_from_bytes_if_needed(text)

    if not text:
        return ''

    if len(text) > __MAX_TEXT_LENGTH:
        log.warning("Text is longer than %d, trimming..." % __MAX_TEXT_LENGTH)
        text = text[:__MAX_TEXT_LENGTH]

    # We need to verify that the text can cleanly encode and decode because CLD can segfault on bad UTF-8
    text = __recode_utf8_string(text)

    try:
        is_reliable, text_bytes_found, details = cld2.detect(utf8Bytes=text, useFullLangTables=True)
    except Exception as ex:
        log.error("Error while detecting language: %s" % str(ex))
        return ''

    if not details:
        return ''

    best_match = details[0]
    language_name = best_match.language_name.lower()
    language_code = best_match.language_code.lower()

    if language_name in {'unknown', 'tg_unknown_language'} or language_code == 'un':
        return ''

    if not language_is_supported(language_code):
        return ''

    return language_code
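# Illustrative sketch: detection quality improves with input length, so short strings may come
# back empty; the repetition below only serves to give CLD enough bytes to be confident.
def _example_language_code_for_text() -> None:
    code = language_code_for_text('The quick brown fox jumps over the lazy dog. ' * 20)
    # Expected to be 'en'; an empty string means detection failed or the language is unsupported.
    log.info("Detected language: %s" % code)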
def __execute(self,
              cursor: DictCursor,
              query: str,
              page: int,
              rows_per_page: int,
              double_percentage_sign_marker: str) -> None:
    query = decode_object_from_bytes_if_needed(query)

    if page < 1:
        raise McQueryPagedHashesException('Page must be 1 or bigger.')

    offset = (page - 1) * rows_per_page

    query = "%(original_query)s LIMIT ( %(rows_per_page)d + 1 ) OFFSET %(offset)s" % {
        'original_query': query,
        'rows_per_page': rows_per_page,
        'offset': offset,
    }

    query_args = [query]
    query_args = convert_dbd_pg_arguments_to_psycopg2_format(*query_args)

    # Query
    rs = DatabaseResult(
        cursor=cursor,
        query_args=query_args,
        double_percentage_sign_marker=double_percentage_sign_marker,
    )

    hashes = rs.hashes()

    # Truncate the extra row (if any) which was fetched only to detect whether there is at least one more page
    one_more_page = False
    if len(hashes) > rows_per_page:
        one_more_page = True
        del hashes[rows_per_page:]

    hashes_size = offset + len(hashes)
    if one_more_page:
        hashes_size += 1

    pager = Pages(total_entries=hashes_size, entries_per_page=rows_per_page, current_page=page)

    self.__list = hashes
    self.__pager = pager
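# Worked example for the pagination arithmetic above (hypothetical numbers): page = 3 with
# rows_per_page = 20 yields offset = (3 - 1) * 20 = 40, and the query fetches up to 21 rows
# (LIMIT 20 + 1); a 21st row is discarded but proves that at least one more page exists.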
def wait_for_tcp_port_to_open(port: int,
                              hostname: str = 'localhost',
                              retries: int = 60,
                              delay: Union[int, float] = 1) -> bool:
    """Try connecting to TCP port until it opens (or not); return True if managed to connect."""
    hostname = decode_object_from_bytes_if_needed(hostname)

    port_is_open = False
    for retry in range(retries):
        if retry == 0:
            log.debug("Trying to connect to %s:%d" % (hostname, port))
        else:
            log.debug("Trying to connect to %s:%d, retry %d" % (hostname, port, retry))

        if tcp_port_is_open(port, hostname):
            port_is_open = True
            break
        else:
            time.sleep(delay)

    return port_is_open
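# Illustrative sketch (hypothetical service and port): block for up to 60 seconds while
# waiting for a local PostgreSQL to start accepting TCP connections.
def _example_wait_for_postgresql() -> None:
    if not wait_for_tcp_port_to_open(port=5432, hostname='localhost', retries=60, delay=1):
        raise RuntimeError("PostgreSQL did not start accepting connections in time.")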
def tcp_port_is_open(port: int, hostname: str = 'localhost') -> bool:
    """Test if TCP port is open."""
    hostname = decode_object_from_bytes_if_needed(hostname)

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(2)

    try:
        result = sock.connect_ex((hostname, port))
    except socket.gaierror as ex:
        log.warning(f"Unable to resolve {hostname}: {ex}")
        return False

    if result == 0:
        try:
            sock.shutdown(socket.SHUT_RDWR)
        except OSError as ex:
            # Quiet down "OSError: [Errno 57] Socket is not connected"
            log.warning("Error while shutting down socket: %s" % str(ex))

    sock.close()

    return result == 0