from pynonymizer.strategy.exceptions import UnknownColumnStrategyError, UnknownTableStrategyError, ConfigSyntaxError
from pynonymizer.log import get_logger
from pynonymizer.strategy.table import UpdateColumnsTableStrategy, TruncateTableStrategy, DeleteTableStrategy, TableStrategyTypes
from pynonymizer.strategy.update_column import (
    UpdateColumnStrategyTypes,
    EmptyUpdateColumnStrategy,
    UniqueEmailUpdateColumnStrategy,
    UniqueLoginUpdateColumnStrategy,
    FakeUpdateColumnStrategy,
    LiteralUpdateColumnStrategy
)
from pynonymizer.strategy.database import DatabaseStrategy
from copy import deepcopy

logger = get_logger(__name__)


class StrategyParser:
    def __init__(self, fake_seeder):
        self.fake_seeder = fake_seeder

    @staticmethod
    def __normalize_table_config(table_config):
        # If the table config doesn't have a "type" specified, it needs to be determined automatically.
        if "type" not in table_config:
            if "columns" in table_config:
                return {
                    **table_config,
                    "type": TableStrategyTypes.UPDATE_COLUMNS.value,
                }
            elif table_config == "truncate":
                return {
                    "type": TableStrategyTypes.TRUNCATE.value,
                }
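# A minimal sketch of the table-config shapes the branches above handle. These
# literal values are illustrative assumptions inferred from the conditionals,
# not an authoritative strategyfile specification.
implicit_update_columns = {"columns": {"email": "unique_email"}}  # no "type": normalized to update_columns
implicit_truncate = "truncate"                                    # bare string: normalized to truncate
explicit = {"type": "truncate"}                                   # "type" present: handled past this truncated excerpt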
class MySqlProvider(DatabaseProvider):
    """
    A command-line based mysql provider. Uses `mysql` and `mysqldump`,
    because piping large amounts of SQL into the command-line client is efficient.
    Unfortunately, this implementation provides limited feedback when things go wrong.
    """
    __CHUNK_SIZE = 8192
    __DUMPSIZE_ESTIMATE_INFLATION = 1.15
    logger = log.get_logger(__name__)

    def __init__(self, db_host, db_user, db_pass, db_name, db_port=None, seed_rows=None, cmd_opts=None, dump_opts=None):
        if db_host is None:
            db_host = "127.0.0.1"
        if db_port is None:
            db_port = "3306"
        if cmd_opts is None:
            cmd_opts = ""
        if dump_opts is None:
            dump_opts = ""

        super().__init__(db_host=db_host, db_user=db_user, db_pass=db_pass, db_name=db_name, db_port=db_port, seed_rows=seed_rows)
        self.__runner = execution.MySqlCmdRunner(db_host, db_user, db_pass, db_name, db_port, additional_opts=cmd_opts)
        self.__dumper = execution.MySqlDumpRunner(db_host, db_user, db_pass, db_name, db_port, additional_opts=dump_opts)

    def __seed(self, qualifier_map):
        """
        'Seed' the database with a bunch of pre-generated random records so updates can be performed in batches
        """
        for i in tqdm(range(0, self.seed_rows), desc="Inserting seed data", unit="rows"):
            self.logger.debug(f"Inserting seed row {i}")
            self.__runner.db_execute(query_factory.get_insert_seed_row(SEED_TABLE_NAME, qualifier_map))

    def __estimate_dumpsize(self):
        """
        Estimate the dump size using internal database metrics
        :return: A value in bytes, or None (unknown)
        """
        statement = query_factory.get_dumpsize_estimate(self.db_name)
        process_output = self.__runner.get_single_result(statement)

        try:
            return int(process_output) * self.__DUMPSIZE_ESTIMATE_INFLATION
        except ValueError:
            # Value unparsable, likely NULL
            return None

    def __read_until_empty_byte(self, data):
        return iter(lambda: data.read(self.__CHUNK_SIZE), b'')

    def __run_scripts(self, script_list, title=""):
        for i, script in enumerate(script_list):
            self.logger.info(f"Running {title} script #{i} \"{script[:50]}\"")
            self.logger.info(self.__runner.db_execute(script))

    def create_database(self):
        """Create the working database"""
        self.__runner.execute(query_factory.get_create_database(self.db_name))

    def drop_database(self):
        """Drop the working database"""
        self.__runner.execute(query_factory.get_drop_database(self.db_name))

    def anonymize_database(self, database_strategy):
        """
        Anonymize a restored database using the passed database strategy
        :param database_strategy: a strategy.DatabaseStrategy configuration
        :return:
        """
        qualifier_map = database_strategy.fake_update_qualifier_map

        if len(qualifier_map) > 0:
            self.logger.info("creating seed table with %d columns", len(qualifier_map))
            create_seed_table_sql = query_factory.get_create_seed_table(SEED_TABLE_NAME, qualifier_map)
            self.__runner.db_execute(create_seed_table_sql)

            self.logger.info("Inserting seed data")
            self.__seed(qualifier_map)

        self.__run_scripts(database_strategy.before_scripts, "before")

        table_strategies = database_strategy.table_strategies
        self.logger.info("Anonymizing %d tables", len(table_strategies))

        with tqdm(desc="Anonymizing database", total=len(table_strategies)) as progressbar:
            for table_strategy in table_strategies:
                if table_strategy.schema is not None:
                    self.logger.warning("%s: MySQL provider does not support table schema. This option will be ignored.", table_strategy.table_name)

                if table_strategy.strategy_type == TableStrategyTypes.TRUNCATE:
                    progressbar.set_description("Truncating {}".format(table_strategy.table_name))
                    self.__runner.db_execute(query_factory.get_truncate_table(table_strategy.table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.DELETE:
                    progressbar.set_description("Deleting {}".format(table_strategy.table_name))
                    self.__runner.db_execute(query_factory.get_delete_table(table_strategy.table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.UPDATE_COLUMNS:
                    progressbar.set_description("Anonymizing {}".format(table_strategy.table_name))
                    statements = query_factory.get_update_table(SEED_TABLE_NAME, table_strategy)
                    self.__runner.db_execute(statements)

                else:
                    raise UnsupportedTableStrategyError(table_strategy)

                progressbar.update()

        self.__run_scripts(database_strategy.after_scripts, "after")

        self.logger.info("dropping seed table")
        self.__runner.db_execute(query_factory.get_drop_seed_table(SEED_TABLE_NAME))

        # Wait an arbitrary amount of time here to prevent this step from interacting with
        # transactional dump operations
        self.logger.debug("Waiting for trailing operations to complete...")
        sleep(0.2)

    def restore_database(self, input_path):
        """
        Feed a mysqldump dumpfile to the mysql binary on stdin.
        :param input_path:
        :return:
        """
        input_obj = resolve_input(input_path)
        dumpsize = input_obj.get_size()

        batch_processor = self.__runner.open_batch_processor()
        with input_obj.open() as dumpfile_data:
            with tqdm(desc="Restoring", total=dumpsize, unit='B', unit_scale=True, unit_divisor=1000) as bar:
                for chunk in self.__read_until_empty_byte(dumpfile_data):
                    batch_processor.write(chunk)
                    batch_processor.flush()
                    bar.update(len(chunk))

    def dump_database(self, output_path):
        """
        Feed an output with stdout from the mysqldump binary
        :param output_path:
        :return:
        """
        output_obj = resolve_output(output_path)
        dumpsize_estimate = self.__estimate_dumpsize()

        dump_process = self.__dumper.open_dumper()
        with output_obj.open() as output_file:
            with tqdm(desc="Dumping", total=dumpsize_estimate, unit='B', unit_scale=True, unit_divisor=1000) as bar:
                for chunk in self.__read_until_empty_byte(dump_process):
                    output_file.write(chunk)
                    bar.update(len(chunk))
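# A minimal usage sketch for MySqlProvider, assuming it is driven directly
# rather than through the pynonymizer CLI. `strategy` stands in for a
# DatabaseStrategy that would normally come from StrategyParser; the
# credentials and file names here are hypothetical.
def example_mysql_run(strategy):
    provider = MySqlProvider(
        db_host=None,        # falls back to 127.0.0.1
        db_user="root",
        db_pass="secret",
        db_name="anon_working",
        seed_rows=150,
    )
    provider.create_database()
    provider.restore_database("backup.sql")    # pipes the dump into `mysql`
    provider.anonymize_database(strategy)
    provider.dump_database("anonymized.sql")   # streams `mysqldump` output
    provider.drop_database()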
class MsSqlProvider(DatabaseProvider):
    """
    A pyodbc-based MSSQL provider.
    """
    logger = get_logger(__name__)

    # stats value for restore/backup command: Report progress every X percent
    # A lower value means MORE result sets / more frequent updates from the backup command.
    # Values lower than 5 often yield unreliable results on smaller backups
    __STATS = 5

    def __init__(self, db_host, db_user, db_pass, db_name, db_port=None, seed_rows=None, backup_compression=False):
        # import here so a missing pyodbc fails fast
        import pyodbc

        if db_host is not None:
            raise DependencyError(
                "db_host",
                "MsSqlProvider does not support remote servers due to backup file "
                "location requirements. You must omit db_host from your configuration "
                "and run pynonymizer on the same server as the database.")

        # TODO: The odbc port syntax doesn't seem to work with (local),1433 or port=1433
        # TODO: This needs attention, but as it's backwards compatible, we're going to disallow it here.
        if db_port is not None:
            raise DependencyError(
                "db_port",
                "MsSqlProvider does not support custom ports. You must omit db_port "
                "from your configuration to continue.")

        db_host = "(local)"

        super().__init__(db_host=db_host, db_user=db_user, db_pass=db_pass, db_name=db_name, db_port=db_port, seed_rows=seed_rows)
        self.__conn = None
        self.__db_conn = None
        self.__backup_compression = backup_compression

    def __connection(self):
        """a lazy-evaluated connection"""
        import pyodbc

        if self.__conn is None:
            self.__conn = pyodbc.connect(
                driver="{SQL Server Native Client 11.0}",
                server=self.db_host,
                uid=self.db_user,
                pwd=self.db_pass,
                autocommit=True)

        return self.__conn

    def __db_connection(self):
        """a lazy-evaluated db-specific connection"""
        import pyodbc

        if self.__db_conn is None:
            self.__db_conn = pyodbc.connect(
                driver="{SQL Server Native Client 11.0}",
                database=self.db_name,
                server=self.db_host,
                uid=self.db_user,
                pwd=self.db_pass,
                autocommit=True)

        return self.__db_conn

    def __execute(self, *args, **kwargs):
        return self.__connection().execute(*args, **kwargs)

    def __db_execute(self, *args, **kwargs):
        return self.__db_connection().execute(*args, **kwargs)

    def __get_path(self, filepath):
        if "\\" in filepath:
            return PureWindowsPath(filepath)
        else:
            return PurePosixPath(filepath)

    def __get_default_datafolder(self):
        """
        Locate the default data folder using the `model` database location
        It's possible that the model database is not the currently set default,
        i.e. if it's been changed after install
        The solution to this would be to make a new database and then perform the below check on that instead.
        See https://blogs.technet.microsoft.com/sqlman/2009/07/19/tsql-script-determining-default-database-file-log-path/

        However, this seems like a heavyweight solution for what is essentially a tsql-writeable tempfolder,
        so checking the model db seems like a good 'boring' solution
        :return: Default data directory e.g. "C:\\DATA"
        """
        datafile = self.__execute("""
        SELECT physical_name
        FROM sys.master_files mf
        INNER JOIN sys.[databases] d ON mf.[database_id] = d.[database_id]
        WHERE d.[name] = 'model' AND type = 0
        """).fetchone()[0]

        return self.__get_path(datafile).parent

    def __get_default_logfolder(self):
        """
        Locate the default log folder using the `model` database location
        __get_default_datafolder: see for more info
        :return:
        """
        logfile = self.__execute("""
        SELECT physical_name
        FROM sys.master_files mf
        INNER JOIN sys.[databases] d ON mf.[database_id] = d.[database_id]
        WHERE d.[name] = 'model' AND type = 1
        """).fetchone()[0]

        return self.__get_path(logfile).parent

    def __get_file_moves(self, input_path):
        """
        Using RESTORE FILELISTONLY, get all the files in the backup that need to be moved to the local system for restore
        :return: a dict of file name: new destination
        """
        datadir = self.__get_default_datafolder()
        logdir = self.__get_default_logfolder()

        filelist = self.__execute("RESTORE FILELISTONLY FROM DISK = ?;", input_path).fetchall()

        move_file_map = {}
        for file in filelist:
            name = file[0]
            file_type = file[2].upper()
            filepath = self.__get_path(file[1])

            # log files can go into the default log directory, everything else can go into the data directory
            if file_type == "L":
                target_path = str(logdir.joinpath(f"{self.db_name}_{filepath.name}"))
            else:
                target_path = str(datadir.joinpath(f"{self.db_name}_{filepath.name}"))

            move_file_map[name] = target_path

        return move_file_map

    def __async_operation_progress(self, desc, cursor):
        # With STATS=x, we should receive 100/x result sets, provided the backup is slow enough.
        # With some databases, it will jump from y% to 100, so we'll only get <x nextset calls.
        # Even SSMS doesn't get informed (read: it's not my fault, blame microsoft)
        with tqdm(desc=desc, total=math.floor(100 / self.__STATS)) as progressbar:
            while cursor.nextset():
                progressbar.update()

            # finish the progress - less confusing than a dangling 40% progressbar
            progressbar.update(progressbar.total - progressbar.n)

    def __run_scripts(self, script_list, title=""):
        import pyodbc
        for i, script in enumerate(script_list):
            self.logger.info(f"Running {title} script #{i} \"{script[:50]}\"")
            cursor = self.__db_execute(script)
            results = None
            try:
                results = cursor.fetchall()
            except pyodbc.Error:
                pass
            self.logger.info(results)

    def __create_seed_table(self, qualifier_map):
        seed_column_lines = ["[{}] {}".format(name, _FAKE_COLUMN_TYPES[col.data_type]) for name, col in qualifier_map.items()]
        create_statement = "CREATE TABLE [{}]({});".format(SEED_TABLE_NAME, ",".join(seed_column_lines))

        self.__db_execute(create_statement)

    def __drop_seed_table(self):
        self.__db_execute("DROP TABLE IF EXISTS [{}];".format(SEED_TABLE_NAME))

    def __insert_seed_row(self, qualifier_map):
        column_list = ",".join(["[{}]".format(qualifier) for qualifier in qualifier_map])
        substitution_list = ",".join(["?"] * len(qualifier_map))
        value_list = [column.value for qualifier, column in qualifier_map.items()]

        statement = "INSERT INTO [{}]({}) VALUES ({});".format(SEED_TABLE_NAME, column_list, substitution_list)
        self.__db_execute(statement, value_list)

    def __seed(self, qualifier_map):
        for i in tqdm(range(0, self.seed_rows), desc="Inserting seed data", unit="rows"):
            self.__insert_seed_row(qualifier_map)

    def __get_column_subquery(self, column_strategy):
        if column_strategy.strategy_type == UpdateColumnStrategyTypes.EMPTY:
            return "('')"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.UNIQUE_EMAIL:
            return "( SELECT CONCAT(NEWID(), '@', NEWID(), '.com') )"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.UNIQUE_LOGIN:
            return "( SELECT NEWID() )"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.FAKE_UPDATE:
            return f"( SELECT TOP 1 [{column_strategy.qualifier}] FROM [{SEED_TABLE_NAME}] ORDER BY NEWID())"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.LITERAL:
            return column_strategy.value
        else:
            raise UnsupportedColumnStrategyError(column_strategy)

    def create_database(self):
        self.logger.warning("MSSQL: create_database ignored, database will be created when restore_db is run")

    def drop_database(self):
        # force connection close so we can always drop the db: sometimes timing makes a normal drop impossible.
        self.__execute(f"ALTER DATABASE [{self.db_name}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE")
        self.__execute(f"DROP DATABASE IF EXISTS [{self.db_name}];")

    def anonymize_database(self, database_strategy):
        qualifier_map = database_strategy.fake_update_qualifier_map

        if len(qualifier_map) > 0:
            self.logger.info("creating seed table with %d columns", len(qualifier_map))
            self.__create_seed_table(qualifier_map)

            self.logger.info("Inserting seed data")
            self.__seed(qualifier_map)

        self.__run_scripts(database_strategy.before_scripts, "before")

        table_strategies = database_strategy.table_strategies
        self.logger.info("Anonymizing %d tables", len(table_strategies))

        with tqdm(desc="Anonymizing database", total=len(table_strategies)) as progressbar:
            for table_strategy in table_strategies:
                table_name = table_strategy.table_name
                schema_prefix = f"[{table_strategy.schema}]." if table_strategy.schema else ""

                if table_strategy.strategy_type == TableStrategyTypes.TRUNCATE:
                    progressbar.set_description("Truncating {}".format(table_name))
                    self.__db_execute("TRUNCATE TABLE {}[{}];".format(schema_prefix, table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.DELETE:
                    progressbar.set_description("Deleting {}".format(table_name))
                    self.__db_execute("DELETE FROM {}[{}];".format(schema_prefix, table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.UPDATE_COLUMNS:
                    progressbar.set_description("Anonymizing {}".format(table_name))
                    where_grouping = table_strategy.group_by_where()
                    total_wheres = len(where_grouping)

                    for i, (where, column_map) in enumerate(where_grouping.items()):
                        column_assignments = ",".join(["[{}] = {}".format(name, self.__get_column_subquery(column)) for name, column in column_map.items()])
                        where_clause = f" WHERE {where}" if where else ""
                        progressbar.set_description("Anonymizing {}: w[{}/{}]".format(table_name, i + 1, total_wheres))
                        self.__db_execute("UPDATE {}[{}] SET {}{};".format(schema_prefix, table_name, column_assignments, where_clause))

                else:
                    raise UnsupportedTableStrategyError(table_strategy)

                progressbar.update()

        self.__run_scripts(database_strategy.after_scripts, "after")

        self.logger.info("Dropping seed table")
        self.__drop_seed_table()

    def restore_database(self, input_path):
        move_files = self.__get_file_moves(input_path)

        self.logger.info("Found %d files in %s", len(move_files), input_path)
        self.logger.debug(move_files)

        # get move statements and flatten pairs out so we can do the 2-param substitution
        move_clauses = ", ".join(["MOVE ? TO ?"] * len(move_files))
        move_clause_params = [item for pair in move_files.items() for item in pair]

        restore_cursor = self.__execute(
            f"RESTORE DATABASE ? FROM DISK = ? WITH {move_clauses}, STATS = ?;",
            [self.db_name, input_path, *move_clause_params, self.__STATS])

        self.__async_operation_progress("Restoring Database", restore_cursor)

    def dump_database(self, output_path):
        with_options = []
        if self.__backup_compression:
            with_options.append("COMPRESSION")

        with_options_str = ",".join(with_options) + ", " if len(with_options) > 0 else ""

        dump_cursor = self.__execute(
            f"BACKUP DATABASE ? TO DISK = ? WITH {with_options_str}STATS = ?;",
            [self.db_name, output_path, self.__STATS])

        self.__async_operation_progress("Dumping Database", dump_cursor)
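# Self-contained illustration of the MOVE-clause flattening used by
# restore_database above: dict items are interleaved into one flat parameter
# list so each "MOVE ? TO ?" consumes a (logical_name, target) pair. The file
# names here are hypothetical.
example_moves = {
    "db_data": "C:\\DATA\\anon_db_data.mdf",
    "db_log": "C:\\LOGS\\anon_db_log.ldf",
}
clauses = ", ".join(["MOVE ? TO ?"] * len(example_moves))
params = [item for pair in example_moves.items() for item in pair]
assert clauses == "MOVE ? TO ?, MOVE ? TO ?"
assert params == ["db_data", "C:\\DATA\\anon_db_data.mdf", "db_log", "C:\\LOGS\\anon_db_log.ldf"]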
def test_no_logger_name_str(get_logger_mock): with pytest.raises(ValueError): logger = get_logger("")
def test_no_logger_name_array(get_logger_mock): with pytest.raises(ValueError): logger = get_logger([])
def test_multipart_name(get_logger_mock): logger = get_logger(["a", "b", "cd", "ef", "ghi"]) get_logger_mock.assert_called_once_with("pynonymizer.a.b.cd.ef.ghi")
def test_simple_name(get_logger_mock): logger = get_logger("test") get_logger_mock.assert_called_once_with("pynonymizer.test")
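# A minimal sketch of a get_logger implementation consistent with the tests
# above, assuming it wraps the stdlib logging module. This illustrates the
# expected behaviour; it is not pynonymizer's actual source.
import logging

def get_logger(name_parts):
    if not name_parts:
        raise ValueError("Logger name must not be empty")
    if isinstance(name_parts, str):
        name_parts = [name_parts]
    return logging.getLogger(".".join(["pynonymizer", *name_parts]))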