def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
  """ Do some basic fuzz testing: create a copy of an existing table with randomly
  corrupted files and make sure that we don't crash or behave in an unexpected way.
  'unique_database' is used for the table, so it will be cleaned up automatically.
  If 'num_copies' is set, create that many corrupted copies of each input file.
  SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
  input files are the same).
  SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
  """
  # Create and seed a new random number generator for reproducibility.
  rng = random.Random()
  # Normalize the seed to an int up front: SCANNER_FUZZ_SEED arrives as a string and
  # time.time() is a float, so logging the raw value with %d (or seeding with the
  # py2-only long()) was broken for env-provided seeds. int(float(...)) accepts both
  # forms and truncates the same way long() did.
  random_seed = int(float(os.environ.get("SCANNER_FUZZ_SEED") or time.time()))
  LOG.info("Using random seed %d", random_seed)
  rng.seed(random_seed)
  table_format = vector.get_value('table_format')
  self.change_database(self.client, table_format)
  # Stage the corrupted copy under $IMPALA_HOME/testdata so it is on a filesystem the
  # test harness can reach; the directory is removed below unless the caller asked to
  # keep it via SCANNER_FUZZ_KEEP_FILES.
  tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
      dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
  self.execute_query("create table %s.%s like %s" % (unique_database, table, table))
  fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
      unique_database, table))
  LOG.info("Generating corrupted version of %s in %s. Local working directory is %s",
      table, unique_database, tmp_table_dir)
  # Find the location of the existing table and get the full table directory structure.
  table_loc = self._get_table_location(table, vector)
  check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])
  partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
  for partition in partitions:
    # Register any partitions found in the source data on the new fuzz table.
    self.execute_query('alter table {0}.{1} add partition ({2})'.format(
        unique_database, table, ','.join(partition)))
  # Copy all of the local files and directories to hdfs.
  to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
             for file_or_dir in os.listdir(tmp_table_dir)]
  check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy + [fuzz_table_location])
  if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
    shutil.rmtree(tmp_table_dir)
  # Querying the corrupted files should not DCHECK or crash.
  self.execute_query("refresh %s.%s" % (unique_database, table))
  # Execute a query that tries to read all the columns and rows in the file.
  # Also execute a count(*) that materializes no columns, since different code
  # paths are exercised.
  queries = [
      'select count(*) from (select distinct * from {0}.{1}) q'.format(
          unique_database, table),
      'select count(*) from {0}.{1} q'.format(unique_database, table)]
  for query, batch_size, disable_codegen in \
      itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
    query_options = copy(vector.get_value('exec_option'))
    query_options['batch_size'] = batch_size
    query_options['disable_codegen'] = disable_codegen
    try:
      result = self.execute_query(query, query_options=query_options)
      LOG.info('\n'.join(result.log))
    except Exception as e:
      if 'memory limit exceeded' in str(e).lower():
        # Memory limit error should fail query.
        continue
      msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
      LOG.error(msg)
      # Parquet and compressed text can fail the query for some parse errors.
      # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
      # (IMPALA-4013).
      if table_format.file_format != 'parquet' \
          and not (table_format.file_format == 'text' and
                   table_format.compression_codec != 'none'):
        raise
def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table,
                  num_copies=1, custom_queries=None):
  """ Do some basic fuzz testing: create a copy of an existing table with randomly
  corrupted files and make sure that we don't crash or behave in an unexpected way.
  The corrupted copy of 'src_db'.'src_table' is created as 'fuzz_db'.'fuzz_table';
  'fuzz_db' should be a unique database so the table is cleaned up automatically.
  If 'num_copies' is set, create that many corrupted copies of each input file.
  If 'custom_queries' is set, each entry is a query template formatted with
  (fuzz_db, fuzz_table) and run in addition to the default scan queries.
  SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
  input files are the same).
  SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
  """
  # Create and seed a new random number generator for reproducibility.
  rng = random.Random()
  # Normalize the seed to an int up front: SCANNER_FUZZ_SEED arrives as a string and
  # time.time() is a float, so logging the raw value with %d (or seeding with the
  # py2-only long()) was broken for env-provided seeds. int(float(...)) accepts both
  # forms and truncates the same way long() did.
  random_seed = int(float(os.environ.get("SCANNER_FUZZ_SEED") or time.time()))
  LOG.info("Using random seed %d", random_seed)
  rng.seed(random_seed)
  # Stage the corrupted copy under $IMPALA_HOME/testdata; removed below unless the
  # caller asked to keep it via SCANNER_FUZZ_KEEP_FILES.
  tmp_table_dir = tempfile.mkdtemp(
      prefix="tmp-scanner-fuzz-%s" % fuzz_table,
      dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
  self.execute_query("create table %s.%s like %s.%s" % (fuzz_db, fuzz_table,
      src_db, src_table))
  fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
      fuzz_db, fuzz_table))
  LOG.info(
      "Generating corrupted version of %s in %s. Local working directory is %s",
      fuzz_table, fuzz_db, tmp_table_dir)
  # Find the location of the existing table and get the full table directory structure.
  fq_table_name = src_db + "." + src_table
  table_loc = self._get_table_location(fq_table_name, vector)
  check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])
  partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
  for partition in partitions:
    # Register any partitions found in the source data on the new fuzz table.
    self.execute_query('alter table {0}.{1} add partition ({2})'.format(
        fuzz_db, fuzz_table, ','.join(partition)))
  # Copy all of the local files and directories to hdfs.
  to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
             for file_or_dir in os.listdir(tmp_table_dir)]
  self.filesystem_client.copy_from_local(to_copy, fuzz_table_location)
  if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
    shutil.rmtree(tmp_table_dir)
  # Querying the corrupted files should not DCHECK or crash.
  self.execute_query("refresh %s.%s" % (fuzz_db, fuzz_table))
  # Execute a query that tries to read all the columns and rows in the file.
  # Also execute a count(*) that materializes no columns, since different code
  # paths are exercised.
  queries = [
      'select count(*) from (select distinct * from {0}.{1}) q'.format(
          fuzz_db, fuzz_table),
      'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)]
  if custom_queries is not None:
    queries = queries + [s.format(fuzz_db, fuzz_table) for s in custom_queries]
  for query, batch_size, disable_codegen in \
      itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
    query_options = copy(vector.get_value('exec_option'))
    query_options['batch_size'] = batch_size
    query_options['disable_codegen'] = disable_codegen
    # Force the disable_codegen setting to take effect regardless of row count.
    query_options['disable_codegen_rows_threshold'] = 0
    try:
      result = self.execute_query(query, query_options=query_options)
      LOG.info('\n'.join(result.log))
    except Exception as e:
      if 'memory limit exceeded' in str(e).lower():
        # Memory limit error should fail query.
        continue
      msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
      LOG.error(msg)
      # Parquet and compressed text can fail the query for some parse errors.
      # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
      # (IMPALA-4013).
      table_format = vector.get_value('table_format')
      if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \
          and not (table_format.file_format == 'text' and
                   table_format.compression_codec != 'none'):
        raise
def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
  """ Do some basic fuzz testing: create a copy of an existing table with randomly
  corrupted files and make sure that we don't crash or behave in an unexpected way.
  'unique_database' is used for the table, so it will be cleaned up automatically.
  If 'num_copies' is set, create that many corrupted copies of each input file.
  SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
  input files are the same).
  SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
  """
  # Create and seed a new random number generator for reproducibility.
  rng = random.Random()
  # Normalize the seed to an int up front: SCANNER_FUZZ_SEED arrives as a string and
  # time.time() is a float, so logging the raw value with %d (or seeding with the
  # py2-only long()) was broken for env-provided seeds. int(float(...)) accepts both
  # forms and truncates the same way long() did.
  random_seed = int(float(os.environ.get("SCANNER_FUZZ_SEED") or time.time()))
  LOG.info("Using random seed %d", random_seed)
  rng.seed(random_seed)
  table_format = vector.get_value('table_format')
  self.change_database(self.client, table_format)
  # Stage the corrupted copy under $IMPALA_HOME/testdata; removed below unless the
  # caller asked to keep it via SCANNER_FUZZ_KEEP_FILES.
  tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
      dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
  self.execute_query("create table %s.%s like %s" % (unique_database, table, table))
  fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
      unique_database, table))
  LOG.info(
      "Generating corrupted version of %s in %s. Local working directory is %s",
      table, unique_database, tmp_table_dir)
  # Find the location of the existing table and get the full table directory structure.
  table_loc = self._get_table_location(table, vector)
  check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])
  partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
  for partition in partitions:
    # Register any partitions found in the source data on the new fuzz table.
    self.execute_query('alter table {0}.{1} add partition ({2})'.format(
        unique_database, table, ','.join(partition)))
  # Copy all of the local files and directories to hdfs.
  to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
             for file_or_dir in os.listdir(tmp_table_dir)]
  check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy + [fuzz_table_location])
  if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
    shutil.rmtree(tmp_table_dir)
  # Querying the corrupted files should not DCHECK or crash.
  self.execute_query("refresh %s.%s" % (unique_database, table))
  # Execute a query that tries to read all the columns and rows in the file.
  # Also execute a count(*) that materializes no columns, since different code
  # paths are exercised.
  # Use abort_on_error=0 to ensure we scan all the files.
  queries = [
      'select count(*) from (select distinct * from {0}.{1}) q'.format(
          unique_database, table),
      'select count(*) from {0}.{1} q'.format(unique_database, table)]
  xfail_msgs = []
  for query in queries:
    for batch_size in self.BATCH_SIZES:
      query_options = {'abort_on_error': '0', 'batch_size': batch_size}
      try:
        result = self.execute_query(query, query_options=query_options)
        LOG.info('\n'.join(result.log))
      except Exception as e:
        if 'memory limit exceeded' in str(e).lower():
          # Memory limit error should fail query.
          continue
        msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
        LOG.error(msg)
        # Parquet and compressed text can fail the query for some parse errors.
        # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
        # (IMPALA-4013).
        if table_format.file_format == 'parquet' or \
            (table_format.file_format == 'text' and
             table_format.compression_codec != 'none'):
          # Known-acceptable failure: record it and mark the test xfail at the end.
          xfail_msgs.append(msg)
        else:
          raise
  if len(xfail_msgs) != 0:
    pytest.xfail('\n'.join(xfail_msgs))