def test_file_with_env_value():
    random_env_name = random_string(length=16)
    random_env_value = random_string(length=16)
    os.environ[random_env_name] = random_env_value

    env_file = file_with_env_value(name=random_env_name)
    assert os.path.exists(env_file)
    with open(env_file, mode='r') as f:
        assert random_env_value == f.read()

    env_file_2 = file_with_env_value(name=random_env_name)
    assert env_file == env_file_2, "Helper shouldn't recreate the file on an identical value."

    # Try changing the value
    random_env_value = random_string(length=16)
    os.environ[random_env_name] = random_env_value

    env_file_3 = file_with_env_value(name=random_env_name)
    with open(env_file_3, mode='r') as f:
        assert random_env_value == f.read()
    assert env_file != env_file_3, "Helper should recreate the file on a different value."
def test_env_value():
    random_env_name = random_string(length=16)
    random_env_value = random_string(length=16)
    os.environ[random_env_name] = random_env_value

    assert env_value(name=random_env_name) == random_env_value
def get_temporary_ids_table(self, ids: List[int], ordered: bool = False) -> str:
    """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

    The database connection must be within a transaction. The temporary table is set up to be dropped at the end of
    the current transaction. If "ordered" is True, include an "<...>_id SERIAL PRIMARY KEY" field in the table."""
    table_name = '_tmp_ids_%s' % random_string(length=16)
    log.debug("Temporary IDs table: %s" % table_name)

    primary_key_clause = ""
    if ordered:
        primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

    sql = "CREATE TEMPORARY TABLE %s (" % table_name
    sql += primary_key_clause
    sql += "id BIGINT)"
    self.query(sql)

    copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
    for single_id in ids:
        copy.put_line("%d\n" % int(single_id))
    copy.end()

    self.query("ANALYZE %s" % table_name)

    return table_name
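# Hedged usage sketch for get_temporary_ids_table() (not part of the original module): the helper requires an
# open transaction because the temporary table is dropped when the transaction ends. The "stories" table and
# its "stories_id" column are illustrative assumptions; "db" is a connected DatabaseHandler.
def _example_temporary_ids_table(db: DatabaseHandler) -> None:
    db.begin()
    ids_table = db.get_temporary_ids_table(ids=[1, 2, 3])
    rows = db.query(
        "SELECT stories.* FROM stories INNER JOIN {} AS ids ON stories.stories_id = ids.id".format(ids_table)
    ).hashes()
    db.commit()  # the temporary table is dropped together with the transaction
    assert isinstance(rows, list)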
def test_env_value_required():
    nonexistent_env_name = random_string(length=16)

    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=nonexistent_env_name)

    assert env_value(name=nonexistent_env_name, required=False) is None
def test_validate_new_password():
    # noinspection PyTypeChecker
    assert len(validate_new_password(email=None, password=None, password_repeat=None)) > 0
    assert len(validate_new_password(email='', password='', password_repeat='')) > 0
    assert len(validate_new_password(email='*****@*****.**', password='', password_repeat='')) > 0

    # Passwords do not match
    assert len(validate_new_password(email='*****@*****.**', password='******', password_repeat='abcdefghX')) > 0

    # Too short
    assert len(validate_new_password(email='*****@*****.**', password='******', password_repeat='abc')) > 0

    # Too long
    too_long_password = random_string(length=200)
    assert len(validate_new_password(email='*****@*****.**',
                                     password=too_long_password,
                                     password_repeat=too_long_password)) > 0

    # Email == password
    email = '*****@*****.**'
    assert len(validate_new_password(email=email, password=email, password_repeat=email)) > 0

    # All good
    password = '******'
    assert len(validate_new_password(email='*****@*****.**', password=password, password_repeat=password)) == 0
def get_test_s3_credentials() -> Union[dict, None]:
    """Return test Amazon S3 credentials as a dictionary, or None if credentials are not configured."""
    config = py_get_config()

    credentials = None

    # Environment variables
    if os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID') is not None:
        credentials = {
            'access_key_id': os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID', None),
            'secret_access_key': os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY', None),
            'bucket_name': os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME', None),
            'directory_name': os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME', None),
        }

    # mediawords.yml
    elif 'amazon_s3' in config and 'test' in config['amazon_s3']:
        credentials = copy.deepcopy(config['amazon_s3']['test'])

    # We want to be able to run S3 tests in parallel
    if credentials is not None:
        credentials['directory_name'] = credentials['directory_name'] + '-' + random_string(64)

    return credentials
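# Hedged usage sketch (an assumption, not from the original module): tests can call get_test_s3_credentials()
# and skip themselves when S3 is not configured; pytest is already in use elsewhere in this section.
def test_s3_credentials_example():
    credentials = get_test_s3_credentials()
    if credentials is None:
        pytest.skip("Amazon S3 test credentials are not configured.")
    assert credentials['bucket_name']
    # The random suffix appended above lets parallel test runs write to distinct directories
    assert '-' in credentials['directory_name']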
def _initialize_store(self) -> CachedAmazonS3Store:
    return CachedAmazonS3Store(
        access_key_id=test_credentials.access_key_id(),
        secret_access_key=test_credentials.secret_access_key(),
        bucket_name=test_credentials.bucket_name(),
        directory_name=test_credentials.directory_name() + '/' + random_string(16),
        cache_table='cache.s3_raw_downloads_cache',
    )
def test_env_value_empty_string():
    empty_env_name = random_string(length=16)
    os.environ[empty_env_name] = ''

    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=empty_env_name)

    assert env_value(name=empty_env_name, allow_empty_string=True) == ''
def create_password_reset_token(db: DatabaseHandler, email: str) -> Optional[str]:
    """Generate a password reset token used for both activating newly registered users and resetting passwords.

    Returns the non-hashed password reset token, or None if the user was not found.
    """
    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a
    # "success" message. That way the adversary would not be able to find out which email addresses are active users.
    #
    # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing
    # attacks)
    user_exists = db.query("""
        SELECT auth_users_id, email
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()

    if user_exists is None or len(user_exists) == 0:
        # User was not found, so set the email address to an empty string, but don't return just yet; continue with
        # the rather slowish process of generating an activation token (in order to reduce the risk of timing attacks)
        email = ''

    # Generate the activation token
    password_reset_token = random_string(length=64)
    if len(password_reset_token) == 0:
        raise McAuthProfileException('Unable to generate an activation token.')

    # Hash + validate the activation token
    password_reset_token_hash = generate_secure_hash(password=password_reset_token)
    if not password_reset_token_hash:
        raise McAuthProfileException("Unable to hash an activation token.")

    # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = %(password_reset_token_hash)s
        WHERE email = %(email)s
          AND email != ''
    """, {
        'email': email,
        'password_reset_token_hash': password_reset_token_hash,
    })

    return password_reset_token
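# Hedged usage sketch (an assumption, not from the original module): a caller would generate the token and mail
# out a reset link. Only the token's hash is stored in auth_users, so the non-hashed token must travel in the
# link itself; the URL format below is purely illustrative.
def _example_send_password_reset(db: DatabaseHandler, email: str) -> None:
    token = create_password_reset_token(db=db, email=email)
    if token:
        # Hand "reset_link" off to whatever mailer the application uses
        reset_link = 'https://example.com/reset?email={}&token={}'.format(email, token)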
def test_file_with_env_value_base64():
    random_env_name = random_string(length=16)
    random_env_value = secrets.token_bytes(16)
    random_env_value_b64 = base64.b64encode(random_env_value).decode('utf-8')
    os.environ[random_env_name] = random_env_value_b64

    env_file = file_with_env_value(name=random_env_name, encoded_with_base64=True)
    assert os.path.exists(env_file)
    with open(env_file, mode='rb') as f:
        assert random_env_value == f.read()
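# Minimal sketch of what a file_with_env_value() helper could look like (an assumption -- the real
# implementation lives elsewhere): write the variable's value to a temporary file whose name is derived from
# the value, so an unchanged value yields the same path and a changed value yields a new one, matching the
# behavior the tests above assert. The hashlib and tempfile imports are assumed.
def _sketch_file_with_env_value(name: str, encoded_with_base64: bool = False) -> str:
    value = os.environ[name]
    data = base64.b64decode(value) if encoded_with_base64 else value.encode('utf-8')
    digest = hashlib.sha256(data).hexdigest()
    path = os.path.join(tempfile.gettempdir(), '{}-{}'.format(name, digest))
    if not os.path.exists(path):
        with open(path, mode='wb') as f:
            f.write(data)
    return path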
def __init__(self, db: DatabaseHandler, snapshots_id: int):
    super().__init__()

    snapshots_id = int(snapshots_id)

    # Verify that the snapshot exists
    if db.find_by_id(table='snapshots', object_id=snapshots_id) is None:
        raise McWord2vecException("Snapshot with ID %d does not exist." % snapshots_id)

    self.__snapshots_id = snapshots_id
    self.__sentence_counter = 0

    # Subselect such as:
    #
    #     SELECT sentence
    #     FROM story_sentences
    #     WHERE stories_id IN (
    #         SELECT stories_id
    #         FROM snap.stories
    #         WHERE snapshots_id = ...
    #     )
    #
    # or its variants (e.g. INNER JOIN) makes the query planner decide on a sequential scan on "story_sentences",
    # so we create a temporary table with the snapshot's "stories_id" first.
    log.info("Creating a temporary table with snapshot's stories_id...")
    snapshots_stories_id_temp_table_name = 'snapshot_stories_ids_{}'.format(random_string(32))
    db.query("""
        CREATE TEMPORARY TABLE {} AS
            SELECT stories_id
            FROM snap.stories
            WHERE snapshots_id = %(snapshots_id)s
    """.format(snapshots_stories_id_temp_table_name), {'snapshots_id': snapshots_id})

    # "INNER JOIN" instead of "WHERE stories_id IN (SELECT ...)" here because then the database doesn't have to
    # compute distinct "stories_id" to SELECT sentence FROM story_sentences against, i.e. it doesn't have to
    # Group + HashAggregate on the temporary table.
    log.info("Creating COPY TO object...")
    self.__copy_to = db.copy_to("""
        COPY (
            SELECT story_sentences.sentence
            FROM {} AS snapshot_stories_ids
                INNER JOIN story_sentences
                    ON snapshot_stories_ids.stories_id = story_sentences.stories_id
        ) TO STDOUT WITH CSV
    """.format(snapshots_stories_id_temp_table_name))
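# Hedged usage sketch (an assumption, not from the original class): the CopyTo handle built above would be
# consumed one line at a time, e.g. by the iterator's __next__(), so sentences stream out of PostgreSQL without
# ever being held in memory all at once. get_line() is assumed to mirror put_line() on the CopyFrom helper used
# elsewhere in this section.
#
#     line = self.__copy_to.get_line()
#     if not line:
#         self.__copy_to.end()
#         raise StopIteration
#     self.__sentence_counter += 1
#     return line.strip()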
def test_random_string():
    with pytest.raises(McRandomStringException):
        random_string(0)
    with pytest.raises(McRandomStringException):
        random_string(-1)

    length = 16
    string_1 = random_string(length=length)
    string_2 = random_string(length=length)

    assert string_1 != string_2
    assert len(string_1) == length
    assert len(string_2) == length
    assert string_1.isalnum()
    assert string_2.isalnum()
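# Minimal sketch of a random_string() implementation that would satisfy the test above (an assumption -- the
# real helper lives elsewhere): alphanumeric output, cryptographically sourced, raising McRandomStringException
# for non-positive lengths. The "string" module import is assumed; "secrets" is already used in this section.
def _sketch_random_string(length: int) -> str:
    if length < 1:
        raise McRandomStringException("Length must be positive.")
    alphabet = string.ascii_letters + string.digits
    return ''.join(secrets.choice(alphabet) for _ in range(length))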
def test_stories_checksum_matches_feed():
    db = connect_to_db()

    rand = random_string(length=8)
    medium = db.create(table='media', insert_hash={
        'name': f"test feed checksum {rand}",
        'url': f"url://test/feed/checksum/{rand}",
    })
    feed = db.create(table='feeds', insert_hash={
        'name': 'feed',
        'url': medium['url'],
        'media_id': medium['media_id'],
    })
    feeds_id = feed['feeds_id']

    urls_a = [
        "http://www.bzf.ro/rezultate-liga-a-v-a-zona-fagaras-20.html",
        "http://www.mehrnews.com/detail/News/2027821",
        "http://www.chip.de/news/Parallels-Zwei-Android-Systeme-auf-einem-Handy_61383826.html",
        "http://www.inn.co.il/News/Flash.aspx/401095",
        ("http://www.moheet.com/2013/04/07/%d9%85%d8%ad%d8%b3%d9%88%d8%a8-%d8%a3%d8%ad%d8%af%d8%a7%d8%ab-%d8%a7%d9"
         "%84%d9%83%d8%a7%d8%aa%d8%af%d8%b1%d8.%a7%d8%a6%d9%8a%d8%a9-%d9%88%d8%a7%d8%ad%d8%af%d8%a9-%d9%85%d9%86-%d9"
         "%85%d9%88%d8%b1%d9%88/"),
        "http://twitter.com/radiationn/statuses/320948496549154816",
        "http://news.chinatimes.com/realtime/110105/112013040700840.html",
        "http://www.northkoreannews.net/index.php/sid/213669147/scat/08aysdf7tga9s7f7",
        "http://twitter.com/NastyaaPatrick/statuses/320956956149948417",
        "http://life.chinatimes.com/life/11051801/112013040800054.html",
        "http://www.enet.gr/?i=news.el.article&id=355553",
        ("http://www.ibtimes.co.uk/articles/454410/20130407/portugal-government-sticks-to-bailout-goals-despite-"
         "court-ruling.htm"),
        "http://www.egynews.net:80/wps/portal/news?params=223267",
        ("http://www.merkur-online.de:80/sport/fussball/hannover-trostlose-nullnummer-gegen-stuttgart-zr-"
         "2838522.html?cmp=defrss"),
        "http://www.farsnews.com/newstext.php?nn=13920118001322",
    ]
    urls_b = [
        "http://www.guardian.co.uk/football/blog/2013/apr/07/sunderland-chelsea-tactics-match",
        ("http://www.nicematin.com/monde/egypte-un-mort-dans-des-violences-apres-les-funerailles-de-coptes-tues."
         "1206791.html"),
        ("http://www.mercurynews.com/breaking-news/ci_22965002/immigration-talks-between-california-farm-groups-"
         "hit-impasse?source=rss_emailed"),
        "http://www.belfasttelegraph.co.uk/sport/racing/cut-too-sharp-for-gladness-rivals-29179755.html",
        "http://www.vz.ru/news/2013/4/7/627732.html",
        "http://www.thehindu.com/sport/ipl2013/fleming-unhappy-with-csk-batsmen/article4591746.ece",
        ("http://www.dallasnews.com/entertainment/music/headlines/20130407-academy-of-country-music-awards-7-p.m.-"
         "burleson-s-kelly-clarkson-set-to-perform.ece"),
        "http://feedproxy.google.com/~r/OTB/~3/TNKm_R0dEKo/",
        ("http://rss.feedsportal.com/c/266/f/3492/s/2a6f8876/l/0L0Sindependent0O0Cnews0Cworld0Cmiddle0Eeast0Cisraels"
         "0Enew0Estrategic0Eaffairs0Eminister0Ewest0Emust0Ethreaten0Eiran0Eover0Enuclear0Eplans0E85635150Bhtml/"
         "story01.htm"),
        "http://news.chinatimes.com/focus/11050105/112013040800090.html",
        "http://blogi.newsweek.pl/Tekst/naluzie/669783,marzenie-przyziemne.html#comment-168169",
        "http://jamaica-gleaner.com/gleaner/20130407/ent/ent6.html",
        "http://www.wboc.com/story/21901967/timeline-of-the-whereabouts-of-suspected-strangler",
        "http://www.cadenaser.com/internacional/articulo/feminismo-islamico-femen/csrcsrpor/20130407csrcsrint_6/Tes",
        "http://thehimalayantimes.com/rssReference.php?id=MzcyMDQw",
        ("http://au.ibtimes.com/articles/454410/20130408/portugal-government-sticks-to-bailout-goals-despite-court-"
         "ruling.htm"),
        "http://www.ziar.com/articol-din-ziar?id_syndic_article=5566035",
        "http://www.bellinghamherald.com/2013/04/07/2955579/hardwood-to-trading-floor-stocks.html#storylink=rss",
    ]

    stories_a = [{'url': url} for url in urls_a]
    stories_b = [{'url': url} for url in urls_b]

    # First check should fail since feed checksum should be empty
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_a) is False, "Empty checksum."

    # Next check with the same stories should be a match
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_a) is True, "Match 1."

    # And another match
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_a) is True, "Match 2."

    # And now try with a different set of stories
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_b) is False, "Fail 1."

    # And now with the same B stories
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_b) is True, "Match 3."

    # And now add one story
    stories_b.append({'url': 'http://foo.bar.com'})
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_b) is False, "Fail 2."
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_b) is True, "Match 4."

    # And now with no stories
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=[]) is False, "Fail 3."

    # And now with A again
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_a) is False, "Fail 4."
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id, stories=stories_a) is True, "Match 5."
class DatabaseHandler(object):
    """PostgreSQL middleware (imitates DBIx::Simple's interface)."""

    # Min. "deadlock_timeout" to not cause problems under load (in seconds)
    __MIN_DEADLOCK_TIMEOUT = 5

    # "Double percentage sign" marker (see handler's quote() for explanation)
    __DOUBLE_PERCENTAGE_SIGN_MARKER = "<DOUBLE PERCENTAGE SIGN: " + random_string(length=16) + ">"

    # Whether or not "deadlock_timeout" was checked
    # * lowercase because it's not a constant
    # * class variable because we don't need to do it on every connect_to_db()
    __deadlock_timeout_checked = False

    __slots__ = [
        # Cache of table primary key columns ([schema][table])
        '__primary_key_columns',

        # Whether or not to print PostgreSQL warnings
        '__print_warnings',

        # Debugging variable to test whether we're in a transaction
        '__in_manual_transaction',

        # psycopg2 connection and cursor
        '__conn',
        '__db',
    ]

    def __init__(self, host: str, port: int, username: str, password: str, database: str):
        """Database handler constructor; connects to PostgreSQL too."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        self.__primary_key_columns = {}
        self.__print_warnings = True
        self.__in_manual_transaction = False
        self.__conn = None
        self.__db = None

        self.__connect(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
        )

    def __connect(self, host: str, port: int, username: str, password: str, database: str) -> None:
        """Connect to PostgreSQL."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        if not (host and username and password and database):
            raise McConnectException("Database connection credentials are not set.")

        if not port:
            port = 5432

        application_name = '%s %d' % (socket.gethostname(), os.getpid())

        self.__conn = psycopg2.connect(
            host=host,
            port=port,
            user=username,
            password=password,
            database=database,
            application_name=application_name
        )

        # Magic bits for psycopg2 to start supporting UTF-8
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, self.__conn)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY, self.__conn)
        self.__conn.set_client_encoding(psycopg2.extensions.encodings['UTF8'])

        # Don't automatically decode JSON, just like DBD::Pg doesn't
        # MC_REWRITE_TO_PYTHON: (probably) remove after porting
        psycopg2.extras.register_default_json(loads=lambda x: x)

        # psycopg2.extras.DictCursor factory enables server-side query prepares so all result data does not get
        # fetched at once
        cursor_factory = psycopg2.extras.DictCursor
        self.__db = self.__conn.cursor(cursor_factory=cursor_factory)

        # Queries to have immediate effect by default
        self.__conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

        # Check deadlock_timeout
        if not DatabaseHandler.__deadlock_timeout_checked:
            (deadlock_timeout,) = self.query("SHOW deadlock_timeout").flat()
            deadlock_timeout = re.sub(r'\s*s$', '', deadlock_timeout, flags=re.I)
            deadlock_timeout = int(deadlock_timeout)
            if deadlock_timeout == 0:
                raise McConnectException("'deadlock_timeout' is 0, probably unable to read it")
            if deadlock_timeout < DatabaseHandler.__MIN_DEADLOCK_TIMEOUT:
                log.warning(
                    '"deadlock_timeout" is less than "{}", expect deadlocks on high extractor load.'.format(
                        DatabaseHandler.__MIN_DEADLOCK_TIMEOUT
                    )
                )

            DatabaseHandler.__deadlock_timeout_checked = True

    def disconnect(self) -> None:
        """Disconnect from the database."""
        self.__db.close()
        self.__db = None

        self.__conn.close()
        self.__conn = None

    # noinspection PyMethodMayBeStatic
    def dbh(self) -> None:
        raise McDatabaseHandlerException("Please don't use internal database handler directly")

    def query(self, *query_params) -> DatabaseResult:
        """Run the query, return instance of DatabaseResult for accessing the result.

        Accepts either (preferred) psycopg2-style query and parameters:

            # Dictionary parameters (preferred):
            db.query('SELECT * FROM foo WHERE bar = %(bar)s AND baz = %(baz)s', {'bar': bar, 'baz': baz})

            # Dictionary parameters with tuple:
            db.query('SELECT * FROM foo WHERE bar IN %(bar)s', {'bar': tuple(['a', 'b', 'c'])})

            # Tuple parameters:
            db.query('SELECT * FROM foo WHERE bar = %s AND baz = %s', (bar, baz,))

        ...or DBD::Pg (DBIx::Simple) form of query and parameters:

            db.query('SELECT * FROM foo WHERE bar = ? AND baz = ?', bar, baz)
        """

        # MC_REWRITE_TO_PYTHON: remove after porting queries to named parameter style
        query_params = convert_dbd_pg_arguments_to_psycopg2_format(*query_params)

        if len(query_params) == 0:
            raise McQueryException("Query is unset.")
        if len(query_params) > 2:
            raise McQueryException("psycopg2's execute() accepts at most 2 parameters.")

        return DatabaseResult(cursor=self.__db,
                              query_args=query_params,
                              double_percentage_sign_marker=DatabaseHandler.__DOUBLE_PERCENTAGE_SIGN_MARKER,
                              print_warnings=self.__print_warnings)

    def primary_key_column(self, object_name: str) -> str:
        """Get INT / BIGINT primary key column name for a table or a view.

        If the table has a composite primary key, return the first INT / BIGINT column name.
        """
        object_name = decode_object_from_bytes_if_needed(object_name)

        if '.' in object_name:
            schema_name, object_name = object_name.split('.', maxsplit=1)
        else:
            schema_name = 'public'

        if schema_name not in self.__primary_key_columns:
            self.__primary_key_columns[schema_name] = {}

        if object_name not in self.__primary_key_columns[schema_name]:
            # noinspection SpellCheckingInspection,SqlResolve
            columns = self.query("""
                SELECT
                    n.nspname AS schema_name,
                    c.relname AS object_name,
                    c.relkind AS object_type,
                    a.attname AS column_name,
                    i.indisprimary AS is_primary_index,
                    t.typname AS column_type,
                    t.typcategory AS column_type_category
                FROM pg_namespace AS n
                    INNER JOIN pg_class AS c
                        ON n.oid = c.relnamespace
                    INNER JOIN pg_attribute AS a
                        ON a.attrelid = c.oid
                       AND NOT a.attisdropped
                    INNER JOIN pg_type AS t
                        ON a.atttypid = t.oid

                    -- Object might be a view, so LEFT JOIN
                    LEFT JOIN pg_index AS i
                        ON c.oid = i.indrelid
                       AND a.attnum = ANY(i.indkey)

                WHERE

                    -- No xid, cid, ...
                    a.attnum > 0

                    -- Live column
                    AND NOT a.attisdropped

                    -- Numeric (INT or BIGINT)
                    AND t.typcategory = 'N'

                    AND n.nspname = %(schema_name)s
                    AND c.relname = %(object_name)s

                -- In case of a composite PK, select the first numeric column
                ORDER BY a.attnum
            """, {
                'schema_name': schema_name,
                'object_name': object_name,
            }).hashes()
            if not columns:
                raise McPrimaryKeyColumnException(
                    "Object '{}' in schema '{}' was not found.".format(object_name, schema_name)
                )

            primary_key_column = None

            for column in columns:
                column_name = column['column_name']

                if column['object_type'] in ['r', 'p']:
                    # Table
                    if column['is_primary_index']:
                        primary_key_column = column_name
                        break

                elif column['object_type'] in ['v', 'm']:
                    # (Materialized) view
                    if column['column_name'] == 'id' or column['column_name'] == '{}_id'.format(object_name):
                        primary_key_column = column_name
                        break

            if not primary_key_column:
                raise McPrimaryKeyColumnException(
                    "Primary key for schema '%s', object '%s' was not found" % (schema_name, object_name,)
                )

            self.__primary_key_columns[schema_name][object_name] = primary_key_column

        return self.__primary_key_columns[schema_name][object_name]

    def find_by_id(self, table: str, object_id: int) -> Union[Dict[str, Any], None]:
        """Do an ID lookup on the table and return a single row match if found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McFindByIDException("Primary key for table '%s' was not found" % table)

        # Python substitution
        find_by_id_query = "SELECT * FROM %(table)s WHERE %(id_column)s" % {
            "table": table,
            "id_column": primary_key_column,
        }

        # psycopg2 substitution
        result = self.query(find_by_id_query + " = %(id_value)s", {'id_value': object_id})
        if result.rows() > 1:
            raise McFindByIDException("More than one row was found for ID '%d' from table '%s'" % (object_id, table))
        elif result.rows() == 1:
            return result.hash()
        else:
            return None

    def require_by_id(self, table: str, object_id: int) -> Dict[str, Any]:
        """find_by_id() or raise exception if not found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        row = self.find_by_id(table, object_id)
        if row is None:
            raise McRequireByIDException("Unable to find ID '%d' in table '%s'" % (object_id, table))
        return row

    def update_by_id(self, table: str, object_id: int, update_hash: dict) -> Union[Dict[str, Any], None]:
        """Update the row in the table with the given ID.
        Ignore any fields that start with '_'."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)
        update_hash = decode_object_from_bytes_if_needed(update_hash)

        update_hash = update_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in update_hash:
            del update_hash["submit"]

        update_hash = {k: v for k, v in update_hash.items() if not k.startswith("_")}

        if len(update_hash) == 0:
            raise McUpdateByIDException("Hash to UPDATE is empty.")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McUpdateByIDException("Primary key for table '%s' was not found" % table)

        keys = []
        for key, value in update_hash.items():
            key_value = key

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                update_hash[key] = value

            key_value += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python

            keys.append(key_value)

        update_hash['__object_id'] = object_id

        sql = "UPDATE %s " % table
        sql += "SET %s " % ", ".join(keys)
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        self.query(sql, update_hash)

        updated_row = self.find_by_id(table=table, object_id=object_id)

        return updated_row

    def delete_by_id(self, table: str, object_id: int) -> None:
        """Delete the row in the table with the given ID."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McDeleteByIDException("Primary key for table '%s' was not found" % table)

        # noinspection SqlWithoutWhere
        sql = "DELETE FROM %s " % table
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        self.query(sql, {"__object_id": object_id})

    def insert(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Alias for create()."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)
        return self.create(table=table, insert_hash=insert_hash)

    def create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Insert a row into the database for the given table with the given hash values and return the created row."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        if len(insert_hash) == 0:
            raise McCreateException("Hash to INSERT is empty")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McCreateException("Primary key for table '%s' was not found" % table)

        keys = []
        values = []
        for key, value in insert_hash.items():
            keys.append(key)
            values.append("%(" + key + ")s")  # "%(key)s" to be resolved by psycopg2, not Python

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                insert_hash[key] = value

        sql = "INSERT INTO %s " % table
        sql += "(%s) " % ", ".join(keys)
        sql += "VALUES (%s) " % ", ".join(values)
        sql += "RETURNING %s" % primary_key_column

        try:
            last_inserted_id = self.query(sql, insert_hash).flat()
        except Exception as ex:
            if 'duplicate key value violates unique constraint' in str(ex):
                raise McUniqueConstraintException(
                    "Unable to INSERT into '%(table)s' data '%(data)s': %(exception)s" % {
                        'table': table,
                        'data': str(insert_hash),
                        'exception': str(ex),
                    })
            else:
                raise ex

        if last_inserted_id is None or len(last_inserted_id) == 0:
            raise McCreateException("Last inserted ID was not found")
        last_inserted_id = last_inserted_id[0]

        inserted_row = self.find_by_id(table=table, object_id=last_inserted_id)
        if inserted_row is None:
            raise McCreateException("Could not find new ID %d in table '%s'" % (last_inserted_id, table))

        return inserted_row

    def select(self, table: str, what_to_select: str, condition_hash: dict = None) -> DatabaseResult:
        """SELECT chosen columns from the table that match given conditions."""
        table = decode_object_from_bytes_if_needed(table)
        what_to_select = decode_object_from_bytes_if_needed(what_to_select)
        condition_hash = decode_object_from_bytes_if_needed(condition_hash)

        if condition_hash is None:
            condition_hash = {}

        condition_hash = condition_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in condition_hash:
            del condition_hash["submit"]

        sql_conditions = []
        for key, value in condition_hash.items():
            condition = key
            condition += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
            sql_conditions.append(condition)

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                condition_hash[key] = value

        sql = "SELECT %s " % what_to_select
        sql += "FROM %s " % table
        if len(sql_conditions) > 0:
            sql += "WHERE %s" % " AND ".join(sql_conditions)

        return self.query(sql, condition_hash)

    def find_or_create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Select a single row from the database matching the hash, or insert a row with the hash values and return
        the inserted row as a hash."""

        # FIXME probably do this in a serialized transaction?

        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        if len(insert_hash) == 0:
            raise McFindOrCreateException("Hash to INSERT or SELECT is empty")

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        row = self.select(table=table, what_to_select='*', condition_hash=insert_hash)
        if row is not None and row.rows() > 0:
            return row.hash()
        else:
            # Try to create it, but if some other process has created it because we don't have a lock, just use that
            # one
            try:
                return self.create(table=table, insert_hash=insert_hash)
            except McUniqueConstraintException:
                return self.select(table=table, what_to_select='*', condition_hash=insert_hash).hash()

    # noinspection PyMethodMayBeStatic
    def show_error_statement(self) -> bool:
        """Return whether failed SQL statement will be included into thrown exception."""
        # FIXME I suppose psycopg2 always returns failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        return True

    # noinspection PyMethodMayBeStatic
    def set_show_error_statement(self, show_error_statement: bool) -> None:
        """Set whether failed SQL statement will be included into thrown exception."""
        # FIXME I suppose psycopg2 always returns failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        pass

    def print_warn(self) -> bool:
        """Return whether PostgreSQL warnings will be printed."""
        return self.__print_warnings

    def set_print_warn(self, print_warn: bool) -> None:
        """Set whether PostgreSQL warnings will be printed."""
        self.__print_warnings = print_warn

    def in_transaction(self) -> bool:
        """Return True if we're within a manually started transaction."""
        return self.__in_manual_transaction

    def __set_in_transaction(self, in_transaction: bool) -> None:
        if self.__in_manual_transaction == in_transaction:
            log.warning("Setting self.__in_manual_transaction to the same value (%s)" % str(in_transaction))
        self.__in_manual_transaction = in_transaction

    def begin(self) -> None:
        """Begin a transaction."""
        if self.in_transaction():
            raise McBeginException("Already in transaction, can't BEGIN.")

        self.query('BEGIN')
        self.__set_in_transaction(True)

    def begin_work(self) -> None:
        """Begin a transaction."""
        return self.begin()

    def commit(self) -> None:
        """Commit a transaction."""
        if not self.in_transaction():
            log.debug("Not in transaction, nothing to COMMIT.")
        else:
            self.query('COMMIT')
            self.__set_in_transaction(False)

    def rollback(self) -> None:
        """Rollback a transaction."""
        if not self.in_transaction():
            log.warning("Not in transaction, nothing to ROLLBACK.")
        else:
            self.query('ROLLBACK')
            self.__set_in_transaction(False)

    # noinspection PyMethodMayBeStatic
    def quote(self, value: Union[bool, int, float, str, None]) -> str:
        """Quote a string for being passed as a literal in a query.

        Also, replace all cases of a percentage sign ('%') with a random string shared within database handler's
        instance which will be later replaced back into double percentage sign ('%%') when executing the query."""
        value = decode_object_from_bytes_if_needed(value)

        quoted_obj = None
        try:
            # Docs say that: "While the original adapt() takes 3 arguments, psycopg2's one only takes 1: the bound
            # variable to be adapted", so:
            #
            # noinspection PyArgumentList
            quoted_obj = psycopg2_adapt(value)

            if hasattr(quoted_obj, 'encoding'):  # integer adaptors don't support encoding, for example
                # Otherwise string gets treated as Latin-1:
                quoted_obj.encoding = psycopg2.extensions.encodings['UTF8']

        except Exception as ex:
            raise McQuoteException("psycopg2_adapt() failed while quoting '%s': %s" % (quoted_obj, str(ex)))
        if quoted_obj is None:
            raise McQuoteException("psycopg2_adapt() returned None while quoting '%s'" % quoted_obj)

        try:
            quoted_value = quoted_obj.getquoted()
        except Exception as ex:
            raise McQuoteException("getquoted() failed while quoting '%s': %s" % (quoted_obj, str(ex)))
        if quoted_value is None:
            raise McQuoteException("getquoted() returned None while quoting '%s'" % quoted_obj)

        if isinstance(quoted_value, bytes):
            quoted_value = quoted_value.decode(encoding='utf-8', errors='replace')

        if not isinstance(quoted_value, str):
            # Maybe overly paranoid, but better than returning random stuff for a string that will go into the
            # database
            raise McQuoteException("Quoted value is not 'str' after quoting '%s'" % quoted_obj)

        # Replace percentage signs with a randomly generated marker that will be replaced back into '%%' when
        # executing the query.
        quoted_value = quoted_value.replace('%', DatabaseHandler.__DOUBLE_PERCENTAGE_SIGN_MARKER)

        return quoted_value

    def quote_bool(self, value: bool) -> str:
        """Quote a boolean value for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's boolean type everywhere
        if isinstance(value, bool):
            pass
        elif isinstance(value, int):
            if value == 0:
                value = False
            elif value == 1:
                value = True
            else:
                raise McQuoteException("Value '%s' is neither 0 nor 1" % str(value))
        elif isinstance(value, str) or isinstance(value, bytes):
            value = decode_object_from_bytes_if_needed(value)
            if value.lower() in ['t', 'true', 'y', 'yes', 'on', '1']:
                value = True
            elif value.lower() in ['f', 'false', 'n', 'no', 'off', '0']:
                value = False
            else:
                raise McQuoteException("Value '%s' is a string but none of the supported values" % str(value))
        else:
            raise McQuoteException("Value '%s' is unsupported" % str(value))

        return self.quote(value=value)

    def quote_varchar(self, value: str) -> str:
        """Quote VARCHAR for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's string type everywhere
        value = decode_object_from_bytes_if_needed(value)

        return self.quote(value=value)

    def quote_date(self, value: str) -> str:
        """Quote DATE for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::date' % self.quote(value=value)

    def quote_timestamp(self, value: str) -> str:
        """Quote TIMESTAMP for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::timestamp' % self.quote(value=value)

    def copy_from(self, sql: str) -> CopyFrom:
        """Return COPY FROM helper object."""
        sql = decode_object_from_bytes_if_needed(sql)
        return CopyFrom(cursor=self.__db, sql=sql)

    def copy_to(self, sql: str) -> CopyTo:
        """Return COPY TO helper object."""
        sql = decode_object_from_bytes_if_needed(sql)
        return CopyTo(cursor=self.__db, sql=sql)

    def get_temporary_ids_table(self, ids: List[int], ordered: bool = False) -> str:
        """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

        The database connection must be within a transaction. The temporary table is set up to be dropped at the end
        of the current transaction. If "ordered" is True, include an "<...>_id SERIAL PRIMARY KEY" field in the
        table."""
        table_name = '_tmp_ids_%s' % random_string(length=16)
        log.debug("Temporary IDs table: %s" % table_name)

        primary_key_clause = ""
        if ordered:
            primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

        sql = "CREATE TEMPORARY TABLE %s (" % table_name
        sql += primary_key_clause
        sql += "id BIGINT)"
        self.query(sql)

        copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
        for single_id in ids:
            copy.put_line("%d\n" % int(single_id))
        copy.end()

        self.query("ANALYZE %s" % table_name)

        return table_name

    def attach_child_query(self,
                           data: List[Dict[str, Any]],
                           child_query: str,
                           child_field: str,
                           id_column: str,
                           single: bool = False) -> List[Dict[str, Any]]:
        """For each row in "data", attach all results in the child query that match a JOIN with the "id_column" field
        in each row of "data".

        Then, attach to "row[child_field]":

        * If "single" is True, the "child_field" column in the corresponding row in "data";
        * If "single" is False, a list of values for each row in "data".
        For an example on how this works, see test_attach_child_query() in test_handler.py."""

        # FIXME get rid of this hard to understand reimplementation of JOIN which is here due to the sole reason that
        # _add_nested_data() is hard to refactor out and no one bothered to do it.

        # HMR: the point of this thing is to be able to add nested data in only a single query, which vastly increases
        # performance over performing one query per row for the nested data

        data = decode_object_from_bytes_if_needed(data)
        if not isinstance(data, list):
            raise McDecodeObjectFromBytesIfNeededException(
                "'data' is not a list anymore after converting: %s" % str(data)
            )
        data = list(data)  # get rid of return type warning by enforcing that 'data' is still a list

        child_query = decode_object_from_bytes_if_needed(child_query)
        child_field = decode_object_from_bytes_if_needed(child_field)
        id_column = decode_object_from_bytes_if_needed(id_column)

        parent_lookup = {}
        ids = []
        for parent in data:
            parent_id = parent[id_column]

            parent_lookup[parent_id] = parent
            ids.append(parent_id)

        ids_table = self.get_temporary_ids_table(ids=ids)
        sql = """
            -- noinspection SqlResolve
            SELECT q.*
            FROM ( %(child_query)s ) AS q

                -- Limit rows returned by "child_query" to only IDs from "ids"
                INNER JOIN %(ids_table)s AS ids ON q.%(id_column)s = ids.id
        """ % {
            'child_query': child_query,
            'ids_table': ids_table,
            'id_column': id_column,
        }
        children = self.query(sql).hashes()

        # If we're appending lists, make sure each parent row has an empty list
        if not single:
            for parent in data:
                if child_field not in parent:
                    parent[child_field] = []

        for child in children:
            child_id = child[id_column]
            parent = parent_lookup[child_id]

            if single:
                parent[child_field] = child[child_field]
            else:
                parent[child_field].append(child)

        return data
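# Hedged usage sketch for attach_child_query() (the "media" / "feeds" table and column names are illustrative
# assumptions): attach all of a medium's feeds to each media row with a single child query instead of one query
# per row. get_temporary_ids_table(), used internally, requires an open transaction.
def _example_attach_child_query(db: DatabaseHandler) -> None:
    db.begin()
    media = db.query("SELECT * FROM media").hashes()
    media = db.attach_child_query(
        data=media,
        child_query="SELECT media_id, feeds_id, name FROM feeds",
        child_field='feeds',
        id_column='media_id',
    )
    db.commit()
    for medium in media:
        log.debug("Medium %d has %d feeds" % (medium['media_id'], len(medium['feeds'])))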
def directory_name():
    return '{}-{}'.format(AmazonS3DownloadsConfig.directory_name(), random_string(64))
class DatabaseHandler(object):
    """PostgreSQL middleware (imitates DBIx::Simple's interface)."""

    # Environment variable which, when set, will make us ignore the schema version
    __IGNORE_SCHEMA_VERSION_ENV_VARIABLE = 'MEDIACLOUD_IGNORE_DB_SCHEMA_VERSION'

    # Min. "deadlock_timeout" to not cause problems under load (in seconds)
    __MIN_DEADLOCK_TIMEOUT = 5

    # Cache of table primary key columns
    __primary_key_columns = {}

    # PIDs for which the schema version has been checked
    __schema_version_check_pids = {}

    # Whether or not to print PostgreSQL warnings
    __print_warnings = True

    # "Double percentage sign" marker (see handler's quote() for explanation)
    __double_percentage_sign_marker = "<DOUBLE PERCENTAGE SIGN: " + random_string(length=16) + ">"

    # Debugging variable to test whether we're in a transaction
    __in_manual_transaction = False

    # psycopg2 connection and cursor
    __conn = None
    __db = None

    def __init__(self,
                 host: str,
                 port: int,
                 username: str,
                 password: str,
                 database: str,
                 do_not_check_schema_version: bool = False):
        """Database handler constructor; connects to PostgreSQL too."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        self.__connect(host=host,
                       port=port,
                       username=username,
                       password=password,
                       database=database,
                       do_not_check_schema_version=do_not_check_schema_version)

    def __connect(self,
                  host: str,
                  port: int,
                  username: str,
                  password: str,
                  database: str,
                  do_not_check_schema_version: bool = False) -> None:
        """Connect to PostgreSQL."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        # If the user didn't clearly (via 'true' or 'false') state whether or not to check the schema version,
        # check it once per PID
        pid = os.getpid()

        if not (host and username and password and database):
            raise McConnectException("Database connection credentials are not set.")

        if not port:
            port = 5432

        if not do_not_check_schema_version:
            if pid in self.__schema_version_check_pids:
                do_not_check_schema_version = True
            else:
                do_not_check_schema_version = False

        self.__conn = psycopg2.connect(host=host, port=port, user=username, password=password, database=database)

        # Magic bits for psycopg2 to start supporting UTF-8
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, self.__conn)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY, self.__conn)
        self.__conn.set_client_encoding(psycopg2.extensions.encodings['UTF8'])

        # Don't automatically decode JSON, just like DBD::Pg doesn't
        # MC_REWRITE_TO_PYTHON: (probably) remove after porting
        psycopg2.extras.register_default_json(loads=lambda x: x)

        # psycopg2.extras.DictCursor factory enables server-side query prepares so all result data does not get
        # fetched at once
        cursor_factory = psycopg2.extras.DictCursor
        self.__db = self.__conn.cursor(cursor_factory=cursor_factory)

        # Queries to have immediate effect by default
        self.__conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

        if not do_not_check_schema_version:
            if not self.schema_is_up_to_date():
                # It would make sense to check the MEDIACLOUD_IGNORE_DB_SCHEMA_VERSION environment variable at this
                # particular
                # point too, but schema_is_up_to_date() warns the user about the schema being too old on every run,
                # and that's supposedly a good thing.
                raise McConnectException("Database schema is not up-to-date.")

        # If the schema is not up-to-date, connect() dies and we don't get to set the PID here
        self.__schema_version_check_pids[pid] = True

        # Check deadlock_timeout
        (deadlock_timeout,) = self.query("SHOW deadlock_timeout").flat()
        deadlock_timeout = re.sub(r'\s*s$', '', deadlock_timeout, flags=re.I)
        deadlock_timeout = int(deadlock_timeout)
        if deadlock_timeout == 0:
            raise McConnectException("'deadlock_timeout' is 0, probably unable to read it")
        if deadlock_timeout < self.__MIN_DEADLOCK_TIMEOUT:
            l.warning('"deadlock_timeout" is less than "%ds", expect deadlocks on high extractor load' %
                      self.__MIN_DEADLOCK_TIMEOUT)

    def disconnect(self) -> None:
        """Disconnect from the database."""
        self.__db.close()
        self.__db = None

        self.__conn.close()
        self.__conn = None

    # noinspection PyMethodMayBeStatic
    def dbh(self) -> None:
        raise McDatabaseHandlerException("Please don't use internal database handler directly")

    def __should_continue_with_outdated_schema(self,
                                               current_schema_version: int,
                                               target_schema_version: int) -> bool:
        """Schema is outdated / too new; return True if MC should continue nevertheless, False otherwise."""
        config = py_get_config()

        config_ignore_schema_version = False
        if 'ignore_schema_version' in config['mediawords']:
            config_ignore_schema_version = config["mediawords"]["ignore_schema_version"]

        if config_ignore_schema_version and self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
            l.warning("""
                The current Media Cloud database schema is older than the schema present in mediawords.sql,
                but %s is set so continuing anyway.
            """ % self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
            return True
        else:
            l.warning("""
                ################################

                The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

                The database schema currently running in the database is %(current_schema_version)s,
                and the schema version in mediawords.sql is %(target_schema_version)s.

                Please run:

                    ./script/mediawords_upgrade_db.py --import

                to automatically upgrade the database schema to the latest version.

                If you want to connect to the Media Cloud database anyway (ignoring the schema version),
                set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

                ################################
            """ % {
                "current_schema_version": current_schema_version,
                "target_schema_version": target_schema_version,
                "IGNORE_SCHEMA_VERSION_ENV_VARIABLE": self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
            })
            return False

    def schema_is_up_to_date(self) -> bool:
        """Check if the database schema is up-to-date."""
        root_dir = mc_root_path()

        # Check if the database is empty
        db_vars_table_exists = len(self.query("""
            -- noinspection SqlResolve
            SELECT *
            FROM information_schema.tables
            WHERE table_name = 'database_variables'
        """).flat()) > 0
        if not db_vars_table_exists:
            l.info("Database table 'database_variables' does not exist, probably the database is empty at this point.")
            return True

        # Current schema version
        (current_schema_version,) = self.query("""
            SELECT value AS schema_version
            FROM database_variables
            WHERE name = 'database-schema-version'
            LIMIT 1
        """).flat()
        current_schema_version = int(current_schema_version)
        if current_schema_version == 0:
            raise McSchemaIsUpToDateException("Current schema version is 0")

        # Target schema version
        sql = open(os.path.join(root_dir, 'schema', 'mediawords.sql'), 'r').read()
        target_schema_version = schema_version_from_lines(sql)
        if not target_schema_version:
            raise McSchemaIsUpToDateException("Invalid target schema version.")

        # Check if the current schema is up-to-date
        if current_schema_version != target_schema_version:
            return self.__should_continue_with_outdated_schema(current_schema_version, target_schema_version)
        else:
            # Things are fine at this point.
            return True

    def query(self, *query_params) -> DatabaseResult:
        """Run the query, return instance of DatabaseResult for accessing the result.

        Accepts either (preferred) psycopg2-style query and parameters:

            # Dictionary parameters (preferred):
            db.query('SELECT * FROM foo WHERE bar = %(bar)s AND baz = %(baz)s', {'bar': bar, 'baz': baz})

            # Dictionary parameters with tuple:
            db.query('SELECT * FROM foo WHERE bar IN %(bar)s', {'bar': tuple(['a', 'b', 'c'])})

            # Tuple parameters:
            db.query('SELECT * FROM foo WHERE bar = %s AND baz = %s', (bar, baz,))

        ...or DBD::Pg (DBIx::Simple) form of query and parameters:

            db.query('SELECT * FROM foo WHERE bar = ? AND baz = ?', bar, baz)
        """

        # MC_REWRITE_TO_PYTHON: remove after porting queries to named parameter style
        query_params = convert_dbd_pg_arguments_to_psycopg2_format(*query_params)

        if len(query_params) == 0:
            raise McQueryException("Query is unset.")
        if len(query_params) > 2:
            raise McQueryException("psycopg2's execute() accepts at most 2 parameters.")

        return DatabaseResult(cursor=self.__db,
                              query_args=query_params,
                              double_percentage_sign_marker=self.__double_percentage_sign_marker,
                              print_warnings=self.__print_warnings)

    def prepare(self, sql: str) -> DatabaseStatement:
        """Return a prepared statement."""

        # MC_REWRITE_TO_PYTHON: get rid of it because it was useful only for writing BYTEA cells; psycopg2 can just
        # use 'bytes' arguments
        sql = decode_object_from_bytes_if_needed(sql)

        return DatabaseStatement(cursor=self.__db,
                                 sql=sql,
                                 double_percentage_sign_marker=self.__double_percentage_sign_marker)

    def __get_current_work_mem(self) -> str:
        current_work_mem = self.query("SHOW work_mem").flat()[0]
        return current_work_mem

    def __get_large_work_mem(self) -> str:
        config = py_get_config()
        if 'large_work_mem' in config['mediawords']:
            work_mem = config['mediawords']['large_work_mem']
        else:
            work_mem = self.__get_current_work_mem()
        return work_mem

    def __set_work_mem(self, new_work_mem: str) -> None:
        new_work_mem = decode_object_from_bytes_if_needed(new_work_mem)
        self.query("SET work_mem TO %s", (new_work_mem,))

    def execute_with_large_work_mem(self, *query_args) -> None:
        """Execute query with large 'work_mem' setting; does *not* return a result of any kind."""

        def __execute_with_large_work_mem_subquery():
            self.query(*query_args)

        exception = None
        try:
            self.run_block_with_large_work_mem(__execute_with_large_work_mem_subquery)
        except Exception as ex:
            l.error("Error while running query with large work memory: %s" % str(ex))
            exception = ex

        if exception is not None:
            raise exception  # pass further

    def run_block_with_large_work_mem(self, block: Callable[[], None]) -> None:
        """Run a block (function) with a large 'work_mem' setting set; does *not* return a result
        of any kind."""
        l.debug("starting run_block_with_large_work_mem")

        large_work_mem = self.__get_large_work_mem()
        old_work_mem = self.__get_current_work_mem()

        if large_work_mem is not None:
            self.__set_work_mem(large_work_mem)
        else:
            l.warning("Large work memory is unset, using default 'work_mem'")

        exception = None
        try:
            block()
        except Exception as ex:
            l.error("Error while running block with large work memory: %s" % str(ex))
            exception = ex

        self.__set_work_mem(old_work_mem)

        l.debug("exiting run_block_with_large_work_mem")

        if exception is not None:
            raise exception  # pass further

    def primary_key_column(self, table: str) -> str:
        """Get the primary key column for the table."""
        table = decode_object_from_bytes_if_needed(table)

        if table not in self.__primary_key_columns:
            # noinspection SqlResolve,SqlCheckUsingColumns
            primary_key_column = self.query("""
                SELECT column_name
                FROM information_schema.table_constraints
                    JOIN information_schema.key_column_usage
                        USING (constraint_catalog, constraint_schema, constraint_name,
                               table_catalog, table_schema, table_name)
                WHERE constraint_type = 'PRIMARY KEY'
                  AND table_name = %(table_name)s
                ORDER BY ordinal_position
            """, {'table_name': table}).flat()
            if primary_key_column is None or len(primary_key_column) == 0:
                raise McPrimaryKeyColumnException("Primary key for table '%s' was not found" % table)
            if len(primary_key_column) > 1:
                raise McPrimaryKeyColumnException(
                    "More than one primary key column was found for table '%(table)s': %(primary_key_columns)s" % {
                        'table': table,
                        'primary_key_columns': str(primary_key_column)
                    })
            primary_key_column = primary_key_column[0]

            self.__primary_key_columns[table] = primary_key_column

        return self.__primary_key_columns[table]

    def find_by_id(self, table: str, object_id: int) -> Union[Dict[str, Any], None]:
        """Do an ID lookup on the table and return a single row match if found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McFindByIDException("Primary key for table '%s' was not found" % table)

        # Python substitution
        find_by_id_query = "SELECT * FROM %(table)s WHERE %(id_column)s" % {
            "table": table,
            "id_column": primary_key_column,
        }

        # psycopg2 substitution
        result = self.query(find_by_id_query + " = %(id_value)s", {'id_value': object_id})
        if result.rows() > 1:
            raise McFindByIDException("More than one row was found for ID '%d' from table '%s'" % (object_id, table))
        elif result.rows() == 1:
            return result.hash()
        else:
            return None

    def require_by_id(self, table: str, object_id: int) -> Dict[str, Any]:
        """find_by_id() or raise exception if not found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        row = self.find_by_id(table, object_id)
        if row is None:
            raise McRequireByIDException("Unable to find ID '%d' in table '%s'" % (object_id, table))
        return row

    def update_by_id(self, table: str, object_id: int, update_hash: dict) -> Union[Dict[str, Any], None]:
        """Update the row in the table with the given ID.
        Ignore any fields that start with '_'."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)
        update_hash = decode_object_from_bytes_if_needed(update_hash)

        update_hash = update_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in update_hash:
            del update_hash["submit"]

        update_hash = {k: v for k, v in update_hash.items() if not k.startswith("_")}

        if len(update_hash) == 0:
            raise McUpdateByIDException("Hash to UPDATE is empty.")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McUpdateByIDException("Primary key for table '%s' was not found" % table)

        keys = []
        for key, value in update_hash.items():
            key_value = key

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                update_hash[key] = value

            key_value += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python

            keys.append(key_value)

        update_hash['__object_id'] = object_id

        sql = "UPDATE %s " % table
        sql += "SET %s " % ", ".join(keys)
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        try:
            self.query(sql, update_hash)
        except Exception as ex:
            raise McUpdateByIDException("Unable to UPDATE with hash '%s': %s" % (str(update_hash), str(ex)))

        updated_row = self.find_by_id(table=table, object_id=object_id)

        return updated_row

    def delete_by_id(self, table: str, object_id: int) -> None:
        """Delete the row in the table with the given ID."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McDeleteByIDException("Primary key for table '%s' was not found" % table)

        sql = "DELETE FROM %s " % table
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        self.query(sql, {"__object_id": object_id})

    def insert(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Alias for create()."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)
        return self.create(table=table, insert_hash=insert_hash)

    def create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Insert a row into the database for the given table with the given hash values and return the created row."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        if len(insert_hash) == 0:
            raise McCreateException("Hash to INSERT is empty")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McCreateException("Primary key for table '%s' was not found" % table)

        keys = []
        values = []
        for key, value in insert_hash.items():
            keys.append(key)
            values.append("%(" + key + ")s")  # "%(key)s" to be resolved by psycopg2, not Python
    def insert(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Alias for create()."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        return self.create(table=table, insert_hash=insert_hash)

    def create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Insert a row into the database for the given table with the given hash values and return the created row."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        if len(insert_hash) == 0:
            raise McCreateException("Hash to INSERT is empty")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McCreateException("Primary key for table '%s' was not found" % table)

        keys = []
        values = []
        for key, value in insert_hash.items():
            keys.append(key)
            values.append("%(" + key + ")s")  # "%(key)s" to be resolved by psycopg2, not Python

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                insert_hash[key] = value

        sql = "INSERT INTO %s " % table
        sql += "(%s) " % ", ".join(keys)
        sql += "VALUES (%s) " % ", ".join(values)
        sql += "RETURNING %s" % primary_key_column

        try:
            last_inserted_id = self.query(sql, insert_hash).flat()
        except Exception as ex:
            raise McCreateException("Unable to INSERT into '%(table)s' data '%(data)s': %(exception)s" % {
                'table': table,
                'data': str(insert_hash),
                'exception': str(ex),
            })

        if last_inserted_id is None or len(last_inserted_id) == 0:
            raise McCreateException("Last inserted ID was not found")
        last_inserted_id = last_inserted_id[0]

        inserted_row = self.find_by_id(table=table, object_id=last_inserted_id)
        if inserted_row is None:
            raise McCreateException("Could not find new ID %d in table '%s'" % (last_inserted_id, table))

        return inserted_row

    def select(self, table: str, what_to_select: str, condition_hash: dict = None) -> DatabaseResult:
        """SELECT chosen columns from the table that match given conditions."""
        table = decode_object_from_bytes_if_needed(table)
        what_to_select = decode_object_from_bytes_if_needed(what_to_select)
        condition_hash = decode_object_from_bytes_if_needed(condition_hash)

        if condition_hash is None:
            condition_hash = {}

        condition_hash = condition_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in condition_hash:
            del condition_hash["submit"]

        sql_conditions = []
        for key, value in condition_hash.items():
            condition = key
            condition += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
            sql_conditions.append(condition)

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                condition_hash[key] = value

        sql = "SELECT %s " % what_to_select
        sql += "FROM %s " % table
        if len(sql_conditions) > 0:
            sql += "WHERE %s" % " AND ".join(sql_conditions)

        return self.query(sql, condition_hash)

    def find_or_create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Select a single row from the database matching the hash or insert a row with the hash values and return the
        inserted row as a hash."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        if len(insert_hash) == 0:
            raise McFindOrCreateException("Hash to INSERT or SELECT is empty")

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        row = self.select(table=table, what_to_select='*', condition_hash=insert_hash)
        if row is not None and row.rows() > 0:
            return row.hash()
        else:
            return self.create(table=table, insert_hash=insert_hash)
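    # Usage sketch for create() / select() / find_or_create(), kept as comments with
    # illustrative table, column and value names:
    #
    #   # INSERT ... RETURNING <pkey>, then re-fetch and return the full row:
    #   medium = db.create(table='media', insert_hash={'name': 'Example', 'url': 'http://example.com/'})
    #
    #   # SELECT with all conditions ANDed together:
    #   rows = db.select(table='media', what_to_select='*', condition_hash={'name': 'Example'}).hashes()
    #
    #   # Return the existing row if one matches every hash value, INSERT otherwise:
    #   medium = db.find_or_create(table='media', insert_hash={'name': 'Example', 'url': 'http://example.com/'})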
    # noinspection PyMethodMayBeStatic
    def show_error_statement(self) -> bool:
        """Return whether the failed SQL statement will be included in the thrown exception."""
        # FIXME I suppose psycopg2 always returns the failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        return True

    # noinspection PyMethodMayBeStatic
    def set_show_error_statement(self, show_error_statement: bool) -> None:
        """Set whether the failed SQL statement will be included in the thrown exception."""
        # FIXME I suppose psycopg2 always returns the failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        pass

    def print_warn(self) -> bool:
        """Return whether PostgreSQL warnings will be printed."""
        return self.__print_warnings

    def set_print_warn(self, print_warn: bool) -> None:
        """Set whether PostgreSQL warnings will be printed."""
        self.__print_warnings = print_warn

    def in_transaction(self) -> bool:
        """Return True if we're within a manually started transaction."""
        return self.__in_manual_transaction

    def __set_in_transaction(self, in_transaction: bool) -> None:
        if self.__in_manual_transaction == in_transaction:
            l.warning("Setting self.__in_manual_transaction to the same value (%s)" % str(in_transaction))
        self.__in_manual_transaction = in_transaction

    def begin(self) -> None:
        """Begin a transaction."""
        if self.in_transaction():
            raise McBeginException("Already in transaction, can't BEGIN.")

        self.query('BEGIN')
        self.__set_in_transaction(True)

    def begin_work(self) -> None:
        """Begin a transaction."""
        return self.begin()

    def commit(self) -> None:
        """Commit a transaction."""
        if not self.in_transaction():
            l.debug("Not in transaction, nothing to COMMIT.")
        else:
            self.query('COMMIT')
            self.__set_in_transaction(False)

    def rollback(self) -> None:
        """Roll back a transaction."""
        if not self.in_transaction():
            l.warning("Not in transaction, nothing to ROLLBACK.")
        else:
            self.query('ROLLBACK')
            self.__set_in_transaction(False)
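    # Sketch of the intended transaction pattern with the helpers above (comments only;
    # "db" is a connected handler, the query is illustrative):
    #
    #   db.begin()
    #   try:
    #       db.query("UPDATE media SET name = %(name)s", {'name': 'New name'})
    #       db.commit()
    #   except Exception:
    #       db.rollback()
    #       raise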
    def quote(self, value: Union[bool, int, float, str, None]) -> str:
        """Quote a value so that it can be passed as a literal in a query.

        Also, replace all percentage signs ('%') with a random marker string shared within the database handler's
        instance; the marker will be replaced back with a double percentage sign ('%%') when executing the query."""
        value = decode_object_from_bytes_if_needed(value)

        quoted_obj = None
        try:
            # Docs say that: "While the original adapt() takes 3 arguments, psycopg2's one only takes 1: the bound
            # variable to be adapted", so:
            #
            # noinspection PyArgumentList
            quoted_obj = psycopg2_adapt(value)

            if hasattr(quoted_obj, 'encoding'):  # integer adaptors don't support encoding, for example
                # Otherwise the string gets treated as Latin-1:
                quoted_obj.encoding = psycopg2.extensions.encodings['UTF8']

        except Exception as ex:
            raise McQuoteException("psycopg2_adapt() failed while quoting '%s': %s" % (quoted_obj, str(ex)))
        if quoted_obj is None:
            raise McQuoteException("psycopg2_adapt() returned None while quoting '%s'" % quoted_obj)

        try:
            quoted_value = quoted_obj.getquoted()
        except Exception as ex:
            raise McQuoteException("getquoted() failed while quoting '%s': %s" % (quoted_obj, str(ex)))
        if quoted_value is None:
            raise McQuoteException("getquoted() returned None while quoting '%s'" % quoted_obj)

        if isinstance(quoted_value, bytes):
            quoted_value = quoted_value.decode(encoding='utf-8', errors='replace')

        if not isinstance(quoted_value, str):
            # Maybe overly paranoid, but better than returning random stuff for a string that will go into the database
            raise McQuoteException("Quoted value is not 'str' after quoting '%s'" % quoted_obj)

        # Replace percentage signs with a randomly generated marker that will be replaced back into '%%' when executing
        # the query.
        quoted_value = quoted_value.replace('%', self.__double_percentage_sign_marker)

        return quoted_value

    def quote_bool(self, value: bool) -> str:
        """Quote a boolean value for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's boolean type everywhere
        if isinstance(value, bool):
            pass
        elif isinstance(value, int):
            if value == 0:
                value = False
            elif value == 1:
                value = True
            else:
                raise McQuoteException("Value '%s' is neither 0 nor 1" % str(value))
        elif isinstance(value, str) or isinstance(value, bytes):
            value = decode_object_from_bytes_if_needed(value)
            if value.lower() in ['t', 'true', 'y', 'yes', 'on', '1']:
                value = True
            elif value.lower() in ['f', 'false', 'n', 'no', 'off', '0']:
                value = False
            else:
                raise McQuoteException("Value '%s' is a string but not one of the supported values" % str(value))
        else:
            raise McQuoteException("Value '%s' is unsupported" % str(value))

        return self.quote(value=value)

    def quote_varchar(self, value: str) -> str:
        """Quote VARCHAR for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's string type everywhere
        value = decode_object_from_bytes_if_needed(value)

        return self.quote(value=value)

    def quote_date(self, value: str) -> str:
        """Quote DATE for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::date' % self.quote(value=value)

    def quote_timestamp(self, value: str) -> str:
        """Quote TIMESTAMP for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::timestamp' % self.quote(value=value)

    def copy_from(self, sql: str) -> CopyFrom:
        """Return a COPY FROM helper object."""
        sql = decode_object_from_bytes_if_needed(sql)

        return CopyFrom(cursor=self.__db, sql=sql)

    def copy_to(self, sql: str) -> CopyTo:
        """Return a COPY TO helper object."""
        sql = decode_object_from_bytes_if_needed(sql)

        return CopyTo(cursor=self.__db, sql=sql)
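    # Quoting sketch, kept as comments with illustrative values. quote() swaps '%' for an
    # instance-specific marker, so its output is meant to be interpolated into SQL that is
    # then run through query(), which turns the marker back into '%%':
    #
    #   sql = "SELECT * FROM stories WHERE publish_date >= %s" % db.quote_date('2016-10-11')
    #   rows = db.query(sql).hashes()
    #
    #   flag = db.quote_bool('yes')  # 0 / 1 and "true"-ish / "false"-ish strings are accepted too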
    def get_temporary_ids_table(self, ids: List[int], ordered: bool = False) -> str:
        """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

        The database connection must be within a transaction. The temporary table is set up to be dropped at the end
        of the current transaction. If "ordered" is True, also include a "<table>_pkey SERIAL PRIMARY KEY" field in
        the table."""
        table_name = '_tmp_ids_%s' % random_string(length=16)
        l.debug("Temporary IDs table: %s" % table_name)

        primary_key_clause = ""
        if ordered:
            primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

        sql = """CREATE TEMPORARY TABLE %s (""" % table_name
        sql += primary_key_clause
        sql += "id BIGINT)"
        self.query(sql)

        copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
        for single_id in ids:
            copy.put_line("%d\n" % single_id)
        copy.end()

        self.query("ANALYZE %s" % table_name)

        return table_name

    def attach_child_query(self,
                           data: List[Dict[str, Any]],
                           child_query: str,
                           child_field: str,
                           id_column: str,
                           single: bool = False) -> List[Dict[str, Any]]:
        """For each row in "data", attach all results in the child query that match a JOIN with the "id_column" field
        in each row of "data".

        Attach to "row[child_field]":

        * if "single" is True, the value of the "child_field" column of the matching child row;
        * if "single" is False, a list of all matching child rows.

        For an example of how this works, see test_attach_child_query() in test_handler.py."""
        # FIXME get rid of this hard to understand reimplementation of JOIN which is here due to the sole reason that
        # _add_nested_data() is hard to refactor out and no one bothered to do it.

        data = decode_object_from_bytes_if_needed(data)
        if not isinstance(data, list):
            raise McDecodeObjectFromBytesIfNeededException(
                "'data' is not a list anymore after converting: %s" % str(data)
            )
        data = list(data)  # get rid of return type warning by enforcing that 'data' is still a list

        child_query = decode_object_from_bytes_if_needed(child_query)
        child_field = decode_object_from_bytes_if_needed(child_field)
        id_column = decode_object_from_bytes_if_needed(id_column)

        parent_lookup = {}
        ids = []
        for parent in data:
            parent_id = parent[id_column]

            parent_lookup[parent_id] = parent
            ids.append(parent_id)

        ids_table = self.get_temporary_ids_table(ids=ids)
        sql = """
            -- noinspection SqlResolve
            SELECT q.*
            FROM ( %(child_query)s ) AS q
                -- Limit rows returned by "child_query" to only IDs from "ids"
                INNER JOIN %(ids_table)s AS ids ON q.%(id_column)s = ids.id
        """ % {
            'child_query': child_query,
            'ids_table': ids_table,
            'id_column': id_column,
        }
        children = self.query(sql).hashes()

        for child in children:
            child_id = child[id_column]
            parent = parent_lookup[child_id]

            if single:
                parent[child_field] = child[child_field]
            else:
                if child_field not in parent:
                    parent[child_field] = []
                parent[child_field].append(child)

        return data

    def query_paged_hashes(self, query: str, page: int, rows_per_page: int) -> DatabasePages:
        """Execute the query and return a single page of result hashes, as selected by "page" and "rows_per_page"."""
        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        page = decode_object_from_bytes_if_needed(page)
        page = int(page)

        query = decode_object_from_bytes_if_needed(query)

        return DatabasePages(cursor=self.__db,
                             query=query,
                             page=page,
                             rows_per_page=rows_per_page,
                             double_percentage_sign_marker=self.__double_percentage_sign_marker)
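# A self-contained sketch of how attach_child_query() might be called. The "media" and
# "feeds" tables and their columns are illustrative names, not taken from this module, and
# "db" is assumed to be a connected DatabaseHandler. get_temporary_ids_table() requires a
# transaction, so the call is wrapped in begin() / commit().
def _example_attach_feeds_to_media(db: DatabaseHandler) -> List[Dict[str, Any]]:
    media = db.query("SELECT * FROM media").hashes()

    db.begin()

    # After this, every row in "media" has a "feeds" key with a list of its feed rows
    media = db.attach_child_query(
        data=media,
        child_query="SELECT media_id, name FROM feeds",
        child_field='feeds',
        id_column='media_id',
        single=False,
    )

    db.commit()

    return media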
def _test_story(db: DatabaseHandler, story: dict, num: int) -> None:
    assert _find_dup_stories(
        db=db,
        story=story,
    ) == [story], f"{num} identical"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'media_id': story['media_id'] + 1,
        }},
    ) == [], f"{num} media_id diff"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'guid': random_string(16),
        }},
    ) == [story], f"{num} URL + GUID diff, title same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'title': random_string(16),
        }},
    ) == [story], f"{num} title + URL diff, GUID same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'guid': random_string(16),
            'title': random_string(16),
        }},
    ) == [story], f"{num} title + GUID diff, URL same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': story['url'].upper(),
            'guid': random_string(16),
            'title': random_string(16),
        }},
    ) == [story], f"{num} title + GUID diff, normalized url same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'guid': random_string(16),
            'publish_date': increment_day(date=story['publish_date'], days=2),
        }},
    ) == [], f"{num} date + 2 days"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'guid': random_string(16),
            'publish_date': increment_day(date=story['publish_date'], days=-2),
        }},
    ) == [], f"{num} date - 2 days"

    # Verify that we can find a dup story by the URL or GUID of a previously dup'd story
    dup_url = random_string(16)
    dup_guid = random_string(16)
    nondup_url = random_string(16)
    nondup_guid = 'bogus unique guid'
    nondup_title = 'bogus unique title'

    dup_stories = _find_dup_stories(db, {**story, **{'url': dup_url, 'guid': dup_guid}})
    assert dup_stories == [story]

    assert _find_dup_stories(db, {**story, **{'url': dup_url, 'title': nondup_title}}) == [story]
    assert _find_dup_stories(db, {**story, **{'guid': dup_guid, 'title': nondup_title}}) == [story]

    nondup_story = {**story, **{'url': nondup_url, 'guid': nondup_guid, 'title': nondup_title}}
    assert _find_dup_stories(db, nondup_story) == []
class JapaneseLanguage(StopWordsFromFileMixIn):
    """Japanese language support module."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [
        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    __slots__ = [
        # MeCab instance
        '__mecab',

        # Text -> sentence tokenizer for Japanese text
        '__japanese_sentence_tokenizer',

        # English language instance for tokenizing non-Japanese (e.g. English) text
        '__english_language',
    ]

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""

        mecab_dictionary_path = None
        candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McLanguageException(
                "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths)
            )

        return mecab_dictionary_path

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself; they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes."""
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, surname
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun in general
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }
    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and the Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)

    @staticmethod
    def language_code() -> str:
        return "ja"

    @staticmethod
    def sample_sentence() -> str:
        return "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす(ん)。"

    # noinspection PyMethodMayBeStatic
    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # MeCab's sentence -> word tokenizer already returns the "base form" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to the part-of-speech whitelist."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
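# A short usage sketch for JapaneseLanguage. It assumes mecab-ipadic-neologd is installed in
# one of the paths listed above; the sample text and expected tokens are illustrative.
def _example_japanese_language_usage() -> None:
    lang = JapaneseLanguage()

    # Splits on the Japanese full stop '。' -> two sentences
    sentences = lang.split_text_to_sentences('東京は日本の首都です。大阪は大きな街です。')

    # Particles and the copula are filtered out by the POS whitelist, leaving only nouns,
    # e.g. something like ['東京', '日本', '首都']
    words = lang.split_sentence_to_words(sentences[0])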
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [
        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""
        mecab_dictionary_path = McJapaneseTokenizer._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException("Unable to initialize MeCab: %s" % str(ex))

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""

        mecab_dictionary_path = None
        candidate_paths = McJapaneseTokenizer.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McJapaneseTokenizerException(
                "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths)
            )

        return mecab_dictionary_path

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences
""" return { 36: '名詞,サ変接続,*,*', # noun-verbal 38: '名詞,一般,*,*', # noun 40: '名詞,形容動詞語幹,*,*', # adjectival nouns or quasi-adjectives 41: '名詞,固有名詞,一般,*', # proper nouns 42: '名詞,固有名詞,人名,一般', # proper noun, names of people 43: '名詞,固有名詞,人名,姓', # proper noun, first name 44: '名詞,固有名詞,人名,名', # proper noun, last name 45: '名詞,固有名詞,組織,*', # proper noun, organization 46: '名詞,固有名詞,地域,一般', # proper noun in general 47: '名詞,固有名詞,地域,国', # proper noun, country name } def tokenize_sentence_to_words(self, sentence: str) -> list: """Tokenize Japanese sentence into words. Removes punctuation and words that don't belong to part-of-speech whitelist.""" sentence = decode_object_from_bytes_if_needed(sentence) if sentence is None: log.warning("Sentence to tokenize into words is None.") return [] sentence = sentence.strip() if len(sentence) == 0: return [] parsed_text = self.__mecab.parse(sentence).strip() parsed_tokens = parsed_text.split("\n") allowed_pos_ids = self._mecab_allowed_pos_ids() words = [] for parsed_token_line in parsed_tokens: if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line: primary_form_and_pos_number = parsed_token_line.split( self.__MECAB_TOKEN_POS_SEPARATOR) primary_form = primary_form_and_pos_number[0] pos_number = primary_form_and_pos_number[1] if pos_number.isdigit(): pos_number = int(pos_number) if pos_number in allowed_pos_ids: words.append(primary_form) else: # Ignore all the "EOS" stuff pass return words