def test_failpoints(self, vector):
  query = vector.get_value('query')
  action = vector.get_value('action')
  location = vector.get_value('location')
  vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')

  try:
    plan_node_ids = self.__parse_plan_nodes_from_explain(query, vector)
  except ImpalaBeeswaxException as e:
    if "MT_DOP not supported" in str(e):
      pytest.xfail(reason="MT_DOP not supported.")
    else:
      raise e

  for node_id in plan_node_ids:
    debug_action = '%d:%s:%s' % (node_id, location, FAILPOINT_ACTION_MAP[action])
    # IMPALA-7046: add jitter to backend startup to exercise various failure paths.
    debug_action += '|COORD_BEFORE_EXEC_RPC:JITTER@[email protected]'
    LOG.info('Current debug action: SET DEBUG_ACTION=%s' % debug_action)
    vector.get_value('exec_option')['debug_action'] = debug_action

    if action == 'CANCEL':
      self.__execute_cancel_action(query, vector)
    elif action == 'FAIL' or action == 'MEM_LIMIT_EXCEEDED':
      self.__execute_fail_action(query, vector)
    else:
      assert 0, 'Unknown action: %s' % action

  # We should be able to execute the same query successfully when no failures are
  # injected.
  del vector.get_value('exec_option')['debug_action']
  self.execute_query(query, vector.get_value('exec_option'))
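# A minimal sketch (illustrative, not part of the test suite) of how the loop above
# composes a debug_action string. FAILPOINT_ACTION_MAP_EXAMPLE is an assumed stand-in
# for the test's FAILPOINT_ACTION_MAP; the node id, location and action are sample
# values.
FAILPOINT_ACTION_MAP_EXAMPLE = {'FAIL': 'FAIL'}  # assumed mapping for 'FAIL'
node_id, location, action = 1, 'GETNEXT', 'FAIL'
debug_action = '%d:%s:%s' % (node_id, location, FAILPOINT_ACTION_MAP_EXAMPLE[action])
# The IMPALA-7046 jitter is appended with '|', which chains multiple debug actions.
debug_action += '|COORD_BEFORE_EXEC_RPC:JITTER@[email protected]'
assert debug_action == '1:GETNEXT:FAIL|COORD_BEFORE_EXEC_RPC:JITTER@[email protected]'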
def corrupt_file(self, path, rng):
  """ Corrupt the file at 'path' in the local file system in a randomised way using the
  random number generator 'rng'. Rewrites the file in-place.
  Logs a message to describe how the file was corrupted, so the error is reproducible.
  """
  with open(path, "rb") as f:
    data = bytearray(f.read())

  num_corruptions = rng.randint(0, int(math.log(len(data))))
  for _ in xrange(num_corruptions):
    flip_offset = rng.randint(0, len(data) - 1)
    flip_val = rng.randint(0, 255)
    LOG.info("corrupt file: Flip byte in {0} at {1} from {2} to {3}".format(
        path, flip_offset, data[flip_offset], flip_val))
    data[flip_offset] = flip_val

  if rng.random() < 0.4:
    truncation = rng.randint(0, len(data))
    LOG.info("corrupt file: Truncate {0} to {1}".format(path, truncation))
    data = data[:truncation]

  with open(path, "wb") as f:
    f.write(data)
def test_failpoints(self, vector):
  query = vector.get_value('query')
  action = vector.get_value('action')
  location = vector.get_value('location')
  vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')

  plan_node_ids = self.__parse_plan_nodes_from_explain(query, vector)
  for node_id in plan_node_ids:
    debug_action = '%d:%s:%s' % (node_id, location, FAILPOINT_ACTION_MAP[action])
    # IMPALA-7046: add jitter to backend startup to exercise various failure paths.
    debug_action += '|COORD_BEFORE_EXEC_RPC:JITTER@[email protected]'
    LOG.info('Current debug action: SET DEBUG_ACTION=%s' % debug_action)
    vector.get_value('exec_option')['debug_action'] = debug_action

    if action == 'CANCEL':
      self.__execute_cancel_action(query, vector)
    elif action == 'FAIL' or action == 'MEM_LIMIT_EXCEEDED':
      self.__execute_fail_action(query, vector)
    else:
      assert 0, 'Unknown action: %s' % action

  # We should be able to execute the same query successfully when no failures are
  # injected.
  del vector.get_value('exec_option')['debug_action']
  self.execute_query(query, vector.get_value('exec_option'))

  # Detect any hung fragments left from this test.
  for impalad in ImpalaCluster.get_e2e_test_cluster().impalads:
    verifier = MetricVerifier(impalad.service)
    verifier.wait_for_metric("impala-server.num-fragments-in-flight", 0)
def test_failpoints(self, vector):
  query = vector.get_value('query')
  action = vector.get_value('action')
  location = vector.get_value('location')
  vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')

  if action == "CANCEL" and location == "PREPARE":
    pytest.xfail(reason="IMPALA-5202 leads to a hang.")

  try:
    plan_node_ids = self.__parse_plan_nodes_from_explain(query, vector)
  except ImpalaBeeswaxException as e:
    if "MT_DOP not supported" in str(e):
      pytest.xfail(reason="MT_DOP not supported.")
    else:
      raise e

  for node_id in plan_node_ids:
    debug_action = '%d:%s:%s' % (node_id, location, FAILPOINT_ACTION_MAP[action])
    LOG.info('Current debug action: SET DEBUG_ACTION=%s' % debug_action)
    vector.get_value('exec_option')['debug_action'] = debug_action

    if action == 'CANCEL':
      self.__execute_cancel_action(query, vector)
    elif action == 'FAIL' or action == 'MEM_LIMIT_EXCEEDED':
      self.__execute_fail_action(query, vector)
    else:
      assert 0, 'Unknown action: %s' % action

  # We should be able to execute the same query successfully when no failures are
  # injected.
  del vector.get_value('exec_option')['debug_action']
  self.execute_query(query, vector.get_value('exec_option'))
def test_strings_utf8(self, vector, unique_database):
  # Create table
  table_name = "ice_str_utf8"
  qualified_table_name = "%s.%s" % (unique_database, table_name)
  query = 'create table %s (a string) stored as iceberg' % qualified_table_name
  self.client.execute(query)

  # Inserted string data should have UTF8 annotation regardless of query options.
  query = 'insert into %s values ("impala")' % qualified_table_name
  self.execute_query(query, {'parquet_annotate_strings_utf8': False})

  # Copy the created file to the local filesystem and parse metadata
  local_file = '/tmp/iceberg_utf8_test_%s.parq' % random.randint(0, 10000)
  LOG.info("test_strings_utf8 local file name: " + local_file)
  hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/data/*.parq'
      % (unique_database, table_name))
  check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
  metadata = get_parquet_metadata(local_file)

  # Extract SchemaElements corresponding to the table column
  a_schema_element = metadata.schema[1]
  assert a_schema_element.name == 'a'

  # Check that the schema uses the UTF8 annotation
  assert a_schema_element.converted_type == ConvertedType.UTF8

  os.remove(local_file)
def __execute_fail_action(self, query, vector):
  try:
    self.execute_query(query, vector.get_value('exec_option'),
                       table_format=vector.get_value('table_format'))
    # The injected failure should prevent the query from succeeding. A bare non-empty
    # string assert is always true, so fail explicitly if we get here.
    assert False, 'Expected Failure'
  except ImpalaBeeswaxException as e:
    LOG.debug(e)
def corrupt_file(self, path, rng):
  """ Corrupt the file at 'path' in the local file system in a randomised way using the
  random number generator 'rng'. Rewrites the file in-place.
  Logs a message to describe how the file was corrupted, so the error is reproducible.
  """
  with open(path, "rb") as f:
    data = bytearray(f.read())

  num_corruptions = rng.randint(0, int(math.log(len(data))))
  for _ in xrange(num_corruptions):
    flip_offset = rng.randint(0, len(data) - 1)
    flip_val = rng.randint(0, 255)
    LOG.info("corrupt file: Flip byte in {0} at {1} from {2} to {3}".format(
        path, flip_offset, data[flip_offset], flip_val))
    data[flip_offset] = flip_val

  if rng.random() < 0.4:
    # Delete a random part of the file.
    beg = rng.randint(0, len(data) - 1)
    end = rng.randint(beg, len(data))
    LOG.info("corrupt file: Remove range [{0}, {1}) in {2}".format(beg, end, path))
    with open(path, "wb") as f:
      f.write(data[:beg])
      f.write(data[end:])
  else:
    with open(path, "wb") as f:
      f.write(data)
def __execute_fail_action(self, query, vector):
  try:
    self.execute_query(query, vector.get_value('exec_option'),
                       table_format=vector.get_value('table_format'))
    # The injected failure should prevent the query from succeeding. A bare non-empty
    # string assert is always true, so fail explicitly if we get here.
    assert False, 'Expected Failure'
  except ImpalaBeeswaxException as e:
    LOG.debug(e)
    # IMPALA-5197: None of the debug actions should trigger corrupted file message
    assert 'Corrupt Parquet file' not in str(e)
def __execute_cancel_action(self, query, vector):
  LOG.info('Starting async query execution')
  handle = self.execute_query_async(query, vector.get_value('exec_option'),
                                    table_format=vector.get_value('table_format'))
  LOG.info('Sleeping')
  sleep(3)
  cancel_result = self.client.cancel(handle)
  self.client.close_query(handle)
  assert cancel_result.status_code == 0,\
      'Unexpected status code from cancel request: %s' % cancel_result
def test_create_alter_bulk_partition(self, vector, unique_database):
  # Change the scale depending on the exploration strategy: with 50 partitions this
  # test runs for a few minutes, with 10 partitions it takes ~50s for two
  # configurations.
  num_parts = 50 if self.exploration_strategy() == 'exhaustive' else 10
  fq_tbl_name = unique_database + ".part_test_tbl"

  self.client.execute("create table {0}(i int) partitioned by(j int, s string) "
      "location '{1}/{0}'".format(fq_tbl_name, WAREHOUSE))

  # Add some partitions (first batch of two)
  for i in xrange(num_parts / 5):
    start = time.time()
    self.client.execute(
        "alter table {0} add partition(j={1}, s='{1}')".format(fq_tbl_name, i))
    LOG.info('ADD PARTITION #%d exec time: %s' % (i, time.time() - start))

  # Modify one of the partitions
  self.client.execute("alter table {0} partition(j=1, s='1')"
      " set fileformat parquetfile".format(fq_tbl_name))

  # Alter one partition to a non-existent location twice (IMPALA-741)
  self.filesystem_client.delete_file_dir("tmp/dont_exist1/", recursive=True)
  self.filesystem_client.delete_file_dir("tmp/dont_exist2/", recursive=True)

  self.execute_query_expect_success(self.client,
      "alter table {0} partition(j=1,s='1') set location '{1}/tmp/dont_exist1'"
      .format(fq_tbl_name, WAREHOUSE))
  self.execute_query_expect_success(self.client,
      "alter table {0} partition(j=1,s='1') set location '{1}/tmp/dont_exist2'"
      .format(fq_tbl_name, WAREHOUSE))

  # Add some more partitions
  for i in xrange(num_parts / 5, num_parts):
    start = time.time()
    self.client.execute(
        "alter table {0} add partition(j={1},s='{1}')".format(fq_tbl_name, i))
    LOG.info('ADD PARTITION #%d exec time: %s' % (i, time.time() - start))

  # Insert data and verify it shows up.
  self.client.execute(
      "insert into table {0} partition(j=1, s='1') select 1".format(fq_tbl_name))
  assert '1' == self.execute_scalar("select count(*) from {0}".format(fq_tbl_name))
def test_create_alter_bulk_partition(self, vector):
  TBL_NAME = 'foo_part'
  # Change the scale depending on the exploration strategy: with 50 partitions this
  # takes a few minutes to run, with 10 partitions it takes ~50s for two
  # configurations.
  num_parts = 50 if self.exploration_strategy() == 'exhaustive' else 10

  self.client.execute("use default")
  self.client.execute("drop table if exists {0}".format(TBL_NAME))
  self.client.execute("""create table {0}(i int) partitioned by(j int, s string)
      location '{1}/{0}'""".format(TBL_NAME, WAREHOUSE))

  # Add some partitions (first batch of two)
  for i in xrange(num_parts / 5):
    start = time.time()
    self.client.execute(
        "alter table {0} add partition(j={1}, s='{1}')".format(TBL_NAME, i))
    LOG.info('ADD PARTITION #%d exec time: %s' % (i, time.time() - start))

  # Modify one of the partitions
  self.client.execute("alter table %s partition(j=1, s='1')"
      " set fileformat parquetfile" % TBL_NAME)

  # Alter one partition to a non-existent location twice (IMPALA-741)
  self.filesystem_client.delete_file_dir("tmp/dont_exist1/", recursive=True)
  self.filesystem_client.delete_file_dir("tmp/dont_exist2/", recursive=True)

  self.execute_query_expect_success(self.client,
      "alter table {0} partition(j=1,s='1') set location '{1}/tmp/dont_exist1'"
      .format(TBL_NAME, WAREHOUSE))
  self.execute_query_expect_success(self.client,
      "alter table {0} partition(j=1,s='1') set location '{1}/tmp/dont_exist2'"
      .format(TBL_NAME, WAREHOUSE))

  # Add some more partitions
  for i in xrange(num_parts / 5, num_parts):
    start = time.time()
    self.client.execute(
        "alter table {0} add partition(j={1},s='{1}')".format(TBL_NAME, i))
    LOG.info('ADD PARTITION #%d exec time: %s' % (i, time.time() - start))

  # Insert data and verify it shows up.
  self.client.execute(
      "insert into table {0} partition(j=1, s='1') select 1".format(TBL_NAME))
  assert '1' == self.execute_scalar("select count(*) from {0}".format(TBL_NAME))
  self.client.execute("drop table {0}".format(TBL_NAME))
def execute_query_expect_debug_action_failure(impala_test_suite, query, vector):
  """Executes the given query with the configured debug_action and asserts that the
  query fails. Removes the debug_action from the exec options, re-runs the query, and
  asserts that it succeeds."""
  assert 'debug_action' in vector.get_value('exec_option')
  # Run the query with the given debug_action and assert that the query fails.
  # execute_query_expect_failure either returns the client exception thrown when
  # executing the query, or the result of the query if it failed but the client did
  # not throw an exception. Either way, log the result.
  LOG.debug(ImpalaTestSuite.execute_query_expect_failure(
      impala_test_suite.client, query, vector.get_value('exec_option')))

  # Assert that the query can be run without the debug_action.
  del vector.get_value('exec_option')['debug_action']
  result = impala_test_suite.execute_query(query, vector.get_value('exec_option'))
  assert result.success, "Failed to run {0} without debug action".format(query)
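# Hypothetical usage sketch for execute_query_expect_debug_action_failure(); the suite
# instance, query and vector below are placeholders, not fixtures from this file. The
# vector's exec_option must already carry a debug_action, e.g. one failing GETNEXT on
# the first plan node:
#
#   vector.get_value('exec_option')['debug_action'] = '0:GETNEXT:FAIL'
#   execute_query_expect_debug_action_failure(
#       self, 'select count(*) from functional.alltypes', vector)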
def get_schema_elements():
  # Copy the created file to the local filesystem and parse metadata
  local_file = '/tmp/utf8_test_%s.parq' % random.randint(0, 10000)
  LOG.info("test_annotate_utf8_option local file name: " + local_file)
  hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
      % (unique_database, TABLE_NAME))
  check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
  metadata = get_parquet_metadata(local_file)

  # Extract SchemaElements corresponding to the table columns
  a_schema_element = metadata.schema[1]
  assert a_schema_element.name == 'a'
  b_schema_element = metadata.schema[2]
  assert b_schema_element.name == 'b'
  c_schema_element = metadata.schema[3]
  assert c_schema_element.name == 'c'
  d_schema_element = metadata.schema[4]
  assert d_schema_element.name == 'd'

  os.remove(local_file)
  return a_schema_element, b_schema_element, c_schema_element, d_schema_element
def corrupt_file(self, path, rng):
  """ Corrupt the file at 'path' in the local file system in a randomised way using the
  random number generator 'rng'. Rewrites the file in-place.
  Logs a message to describe how the file was corrupted, so the error is reproducible.
  """
  with open(path, "rb") as f:
    data = bytearray(f.read())

  if rng.random() < 0.5:
    flip_offset = rng.randint(0, len(data) - 1)
    flip_val = rng.randint(0, 255)
    LOG.info("corrupt_file: Flip byte in %s at %d from %d to %d", path, flip_offset,
             data[flip_offset], flip_val)
    data[flip_offset] = flip_val
  else:
    truncation = rng.randint(0, len(data))
    LOG.info("corrupt_file: Truncate %s to %d", path, truncation)
    data = data[:truncation]

  with open(path, "wb") as f:
    f.write(data)
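# A self-contained sketch (assumed demo harness, not part of the suite) of why
# corrupt_file() takes an externally seeded rng: replaying the same seed reproduces
# the exact same corruption, which is what makes a fuzz failure debuggable.
import random

def demo_reproducible_flip(data, seed):
  # Mirror the 'flip one byte' branch of corrupt_file with a freshly seeded RNG.
  rng = random.Random(seed)
  data = bytearray(data)
  flip_offset = rng.randint(0, len(data) - 1)
  data[flip_offset] = rng.randint(0, 255)
  return bytes(data)

# The same seed always yields the same corrupted bytes.
assert demo_reproducible_flip(b"parquet bytes", 42) == \
    demo_reproducible_flip(b"parquet bytes", 42)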
def get_num_cache_requests():
  """Returns the number of outstanding cache requests. Due to race conditions in the
  way cache requests are added/dropped/reported (see IMPALA-3040), this function tries
  to return a stable result by making several attempts to stabilize it within a
  reasonable timeout."""
  def get_num_cache_requests_util():
    rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
    assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
    return len(stdout.split('\n'))

  # IMPALA-3040: This can take time, especially under slow builds like ASAN.
  wait_time_in_sec = build_flavor_timeout(5, slow_build_timeout=20)
  num_stabilization_attempts = 0
  max_num_stabilization_attempts = 10
  new_requests = None
  num_requests = None
  LOG.info("{0} Entered get_num_cache_requests()".format(time.time()))
  while num_stabilization_attempts < max_num_stabilization_attempts:
    new_requests = get_num_cache_requests_util()
    if new_requests == num_requests: break
    LOG.info("{0} Waiting to stabilise: num_requests={1} new_requests={2}".format(
        time.time(), num_requests, new_requests))
    num_requests = new_requests
    num_stabilization_attempts = num_stabilization_attempts + 1
    time.sleep(wait_time_in_sec)
  LOG.info("{0} Final num requests: {1}".format(time.time(), num_requests))
  return num_requests
def get_num_cache_requests():
  """Returns the number of outstanding cache requests. Due to race conditions in the
  way cache requests are added/dropped/reported (see IMPALA-3040), this function tries
  to return a stable result by making several attempts to stabilize it within a
  reasonable timeout."""
  def get_num_cache_requests_util():
    rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
    assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
    return len(stdout.split('\n'))

  # IMPALA-3040: This can take time, especially under slow builds like ASAN.
  wait_time_in_sec = specific_build_type_timeout(5, slow_build_timeout=20)
  num_stabilization_attempts = 0
  max_num_stabilization_attempts = 10
  new_requests = None
  num_requests = None
  LOG.info("{0} Entered get_num_cache_requests()".format(time.time()))
  while num_stabilization_attempts < max_num_stabilization_attempts:
    new_requests = get_num_cache_requests_util()
    if new_requests == num_requests: break
    LOG.info("{0} Waiting to stabilise: num_requests={1} new_requests={2}".format(
        time.time(), num_requests, new_requests))
    num_requests = new_requests
    num_stabilization_attempts = num_stabilization_attempts + 1
    time.sleep(wait_time_in_sec)
  LOG.info("{0} Final num requests: {1}".format(time.time(), num_requests))
  return num_requests
def test_failpoints(self, vector):
  query = QUERY
  node_type, node_ids = vector.get_value('target_node')
  action = vector.get_value('action')
  location = vector.get_value('location')

  for node_id in node_ids:
    debug_action = '%d:%s:%s' % (node_id, location, FAILPOINT_ACTION_MAP[action])
    LOG.info('Current debug action: SET DEBUG_ACTION=%s' % debug_action)
    vector.get_value('exec_option')['debug_action'] = debug_action

    if action == 'CANCEL':
      self.__execute_cancel_action(query, vector)
    elif action == 'FAIL' or action == 'MEM_LIMIT_EXCEEDED':
      self.__execute_fail_action(query, vector)
    else:
      assert 0, 'Unknown action: %s' % action

  # We should be able to execute the same query successfully when no failures are
  # injected.
  del vector.get_value('exec_option')['debug_action']
  self.execute_query(query, vector.get_value('exec_option'))
def corrupt_file(self, path, rng):
  """ Corrupt the file at 'path' in the local file system in a randomised way using the
  random number generator 'rng'. Rewrites the file in-place.
  Logs a message to describe how the file was corrupted, so the error is reproducible.
  """
  with open(path, "rb") as f:
    data = bytearray(f.read())

  num_corruptions = rng.randint(0, int(math.log(len(data))))
  for _ in xrange(num_corruptions):
    flip_offset = rng.randint(0, len(data) - 1)
    flip_val = rng.randint(0, 255)
    LOG.info("corrupt file: Flip byte in {0} at {1} from {2} to {3}".format(
        path, flip_offset, data[flip_offset], flip_val))
    data[flip_offset] = flip_val

  if rng.random() < 0.4:
    truncation = rng.randint(0, len(data))
    LOG.info("corrupt file: Truncate {0} to {1}".format(path, truncation))
    data = data[:truncation]

  with open(path, "wb") as f:
    f.write(data)
def get_num_cache_requests():
  """Returns the number of outstanding cache requests. Due to race conditions in the
  way cache requests are added/dropped/reported (see IMPALA-3040), this function tries
  to return a stable result by making several attempts to stabilize it within a
  reasonable timeout."""
  def get_num_cache_requests_util():
    rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
    assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
    # Remove blank lines from the output before counting.
    lines = [line for line in stdout.split('\n') if line.strip()]
    count = None
    for line in lines:
      if line.startswith("Found "):
        # The line should say "Found <int> entries"; if we find it, parse the number
        # of entries from it.
        count = int(re.search(r'\d+', line).group())
        break
    # Return the parsed count if available; otherwise fall back to the total number
    # of output lines.
    if count is not None:
      return count
    else:
      return len(stdout.split('\n'))

  # IMPALA-3040: This can take time, especially under slow builds like ASAN.
  wait_time_in_sec = build_flavor_timeout(5, slow_build_timeout=20)
  num_stabilization_attempts = 0
  max_num_stabilization_attempts = 10
  num_requests = None
  LOG.info("{0} Entered get_num_cache_requests()".format(time.time()))
  while num_stabilization_attempts < max_num_stabilization_attempts:
    new_requests = get_num_cache_requests_util()
    if new_requests == num_requests: break
    LOG.info("{0} Waiting to stabilise: num_requests={1} new_requests={2}".format(
        time.time(), num_requests, new_requests))
    num_requests = new_requests
    num_stabilization_attempts = num_stabilization_attempts + 1
    time.sleep(wait_time_in_sec)
  LOG.info("{0} Final num requests: {1}".format(time.time(), num_requests))
  return num_requests
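# The stabilization loop above is an instance of a generic poll-until-stable pattern.
# A minimal standalone sketch (names are illustrative, not from the test utilities):
import time

def poll_until_stable(sample_fn, max_attempts=10, wait_sec=5):
  """Call sample_fn() until two consecutive samples agree or the attempts run out,
  then return the last sample."""
  prev = None
  for _ in range(max_attempts):
    cur = sample_fn()
    if cur == prev:
      break
    prev = cur
    time.sleep(wait_sec)
  return prev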
# They should both succeed.
threads = [QuerySubmitThread(COORDINATOR_QUERY, self.cluster.impalads[i])
           for i in xrange(2)]
for t in threads: t.start()
for t in threads:
  t.join()
  assert t.error is None

# Create two threads to submit COORDINATOR_QUERY to one coordinator and
# SYMMETRIC_QUERY to another coordinator. One of the queries should fail because
# memory would be overcommitted on daemon 0.
threads = [QuerySubmitThread(COORDINATOR_QUERY, self.cluster.impalads[0]),
           QuerySubmitThread(SYMMETRIC_QUERY, self.cluster.impalads[1])]
for t in threads: t.start()
num_errors = 0
for t in threads:
  t.join()
  if t.error is not None:
    assert "Failed to get minimum memory reservation" in t.error
    LOG.info("Query failed with error: %s", t.error)
    LOG.info(t.query)
    num_errors += 1
assert num_errors == 1

# Check that free buffers are released over time. We set the memory maintenance sleep
# time very low above so this should happen quickly.
verifiers = [MetricVerifier(i.service) for i in self.cluster.impalads]
for v in verifiers:
  v.wait_for_metric("buffer-pool.free-buffers", 0, timeout=60)
  v.wait_for_metric("buffer-pool.free-buffer-bytes", 0, timeout=60)
def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
  """ Do some basic fuzz testing: create a copy of an existing table with randomly
  corrupted files and make sure that we don't crash or behave in an unexpected way.
  'unique_database' is used for the table, so it will be cleaned up automatically.
  If 'num_copies' is set, create that many corrupted copies of each input file.
  SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming
  that input files are the same).
  SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
  """
  # Create and seed a new random number generator for reproducibility.
  rng = random.Random()
  random_seed = os.environ.get("SCANNER_FUZZ_SEED") or time.time()
  LOG.info("Using random seed %d", random_seed)
  rng.seed(long(random_seed))

  table_format = vector.get_value('table_format')
  self.change_database(self.client, table_format)

  tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
      dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))

  self.execute_query("create table %s.%s like %s" % (unique_database, table, table))
  fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
      unique_database, table))

  LOG.info("Generating corrupted version of %s in %s. Local working directory is %s",
      table, unique_database, tmp_table_dir)

  # Find the location of the existing table and get the full table directory
  # structure.
  table_loc = self._get_table_location(table, vector)
  check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])

  partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
  for partition in partitions:
    self.execute_query('alter table {0}.{1} add partition ({2})'.format(
        unique_database, table, ','.join(partition)))

  # Copy all of the local files and directories to hdfs.
  to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
             for file_or_dir in os.listdir(tmp_table_dir)]
  check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy + [fuzz_table_location])

  if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
    shutil.rmtree(tmp_table_dir)

  # Querying the corrupted files should not DCHECK or crash.
  self.execute_query("refresh %s.%s" % (unique_database, table))
  # Execute a query that tries to read all the columns and rows in the file.
  # Also execute a count(*) that materializes no columns, since different code
  # paths are exercised.
  # Use abort_on_error=0 to ensure we scan all the files.
  queries = [
      'select count(*) from (select distinct * from {0}.{1}) q'.format(
          unique_database, table),
      'select count(*) from {0}.{1} q'.format(unique_database, table)]

  xfail_msgs = []
  for query in queries:
    for batch_size in self.BATCH_SIZES:
      query_options = {'abort_on_error': '0', 'batch_size': batch_size}
      try:
        result = self.execute_query(query, query_options=query_options)
        LOG.info('\n'.join(result.log))
      except Exception as e:
        if 'memory limit exceeded' in str(e).lower():
          # Memory limit error should fail query.
          continue
        msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
        LOG.error(msg)
        # Parquet and compressed text can fail the query for some parse errors.
        # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
        # (IMPALA-4013).
        if table_format.file_format == 'parquet' or \
           (table_format.file_format == 'text' and
            table_format.compression_codec != 'none'):
          xfail_msgs.append(msg)
        else:
          raise
  if len(xfail_msgs) != 0:
    pytest.xfail('\n'.join(xfail_msgs))
def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, num_copies=1,
                  custom_queries=None):
  """ Do some basic fuzz testing: create a copy of an existing table with randomly
  corrupted files and make sure that we don't crash or behave in an unexpected way.
  'fuzz_db' is used for the table, so it will be cleaned up automatically.
  If 'num_copies' is set, create that many corrupted copies of each input file.
  SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming
  that input files are the same).
  SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
  """
  # Create and seed a new random number generator for reproducibility.
  rng = random.Random()
  random_seed = os.environ.get("SCANNER_FUZZ_SEED") or time.time()
  LOG.info("Using random seed %d", random_seed)
  rng.seed(long(random_seed))

  tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % fuzz_table,
      dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))

  self.execute_query("create table %s.%s like %s.%s" % (fuzz_db, fuzz_table,
      src_db, src_table))
  fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
      fuzz_db, fuzz_table))

  LOG.info("Generating corrupted version of %s in %s. Local working directory is %s",
      fuzz_table, fuzz_db, tmp_table_dir)

  # Find the location of the existing table and get the full table directory
  # structure.
  fq_table_name = src_db + "." + src_table
  table_loc = self._get_table_location(fq_table_name, vector)
  check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])

  partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
  for partition in partitions:
    self.execute_query('alter table {0}.{1} add partition ({2})'.format(
        fuzz_db, fuzz_table, ','.join(partition)))

  # Copy all of the local files and directories to hdfs.
  to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
             for file_or_dir in os.listdir(tmp_table_dir)]
  self.filesystem_client.copy_from_local(to_copy, fuzz_table_location)

  if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
    shutil.rmtree(tmp_table_dir)

  # Querying the corrupted files should not DCHECK or crash.
  self.execute_query("refresh %s.%s" % (fuzz_db, fuzz_table))
  # Execute a query that tries to read all the columns and rows in the file.
  # Also execute a count(*) that materializes no columns, since different code
  # paths are exercised.
  queries = [
      'select count(*) from (select distinct * from {0}.{1}) q'.format(
          fuzz_db, fuzz_table),
      'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)]
  if custom_queries is not None:
    queries = queries + [s.format(fuzz_db, fuzz_table) for s in custom_queries]

  for query, batch_size, disable_codegen in \
      itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
    query_options = copy(vector.get_value('exec_option'))
    query_options['batch_size'] = batch_size
    query_options['disable_codegen'] = disable_codegen
    query_options['disable_codegen_rows_threshold'] = 0
    try:
      result = self.execute_query(query, query_options=query_options)
      LOG.info('\n'.join(result.log))
    except Exception as e:
      if 'memory limit exceeded' in str(e).lower():
        # Memory limit error should fail query.
        continue
      msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
      LOG.error(msg)
      # Parquet and compressed text can fail the query for some parse errors.
      # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
      # (IMPALA-4013).
      table_format = vector.get_value('table_format')
      if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \
          and not (table_format.file_format == 'text' and
                   table_format.compression_codec != 'none'):
        raise
def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
  """ Do some basic fuzz testing: create a copy of an existing table with randomly
  corrupted files and make sure that we don't crash or behave in an unexpected way.
  'unique_database' is used for the table, so it will be cleaned up automatically.
  If 'num_copies' is set, create that many corrupted copies of each input file.
  SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming
  that input files are the same).
  SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
  """
  # Create and seed a new random number generator for reproducibility.
  rng = random.Random()
  random_seed = os.environ.get("SCANNER_FUZZ_SEED") or time.time()
  LOG.info("Using random seed %d", random_seed)
  rng.seed(long(random_seed))

  table_format = vector.get_value('table_format')
  self.change_database(self.client, table_format)

  tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
      dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))

  self.execute_query("create table %s.%s like %s" % (unique_database, table, table))
  fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
      unique_database, table))

  LOG.info("Generating corrupted version of %s in %s. Local working directory is %s",
      table, unique_database, tmp_table_dir)

  # Find the location of the existing table and get the full table directory
  # structure.
  table_loc = self._get_table_location(table, vector)
  check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])

  partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
  for partition in partitions:
    self.execute_query('alter table {0}.{1} add partition ({2})'.format(
        unique_database, table, ','.join(partition)))

  # Copy all of the local files and directories to hdfs.
  to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
             for file_or_dir in os.listdir(tmp_table_dir)]
  check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy + [fuzz_table_location])

  if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
    shutil.rmtree(tmp_table_dir)

  # Querying the corrupted files should not DCHECK or crash.
  self.execute_query("refresh %s.%s" % (unique_database, table))
  # Execute a query that tries to read all the columns and rows in the file.
  # Also execute a count(*) that materializes no columns, since different code
  # paths are exercised.
  queries = [
      'select count(*) from (select distinct * from {0}.{1}) q'.format(
          unique_database, table),
      'select count(*) from {0}.{1} q'.format(unique_database, table)]

  for query, batch_size, disable_codegen in \
      itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
    query_options = copy(vector.get_value('exec_option'))
    query_options['batch_size'] = batch_size
    query_options['disable_codegen'] = disable_codegen
    try:
      result = self.execute_query(query, query_options=query_options)
      LOG.info('\n'.join(result.log))
    except Exception as e:
      if 'memory limit exceeded' in str(e).lower():
        # Memory limit error should fail query.
        continue
      msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
      LOG.error(msg)
      # Parquet and compressed text can fail the query for some parse errors.
      # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
      # (IMPALA-4013).
      if table_format.file_format != 'parquet' \
          and not (table_format.file_format == 'text' and
                   table_format.compression_codec != 'none'):
        raise
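# Illustration (values assumed, not taken from the suite) of the option matrix the
# itertools.product() loop above walks: each query runs once per
# (batch_size, disable_codegen) combination.
import itertools

BATCH_SIZES_EXAMPLE = [0, 1, 16]
DISABLE_CODEGEN_VALUES_EXAMPLE = [True, False]
combos = list(itertools.product(BATCH_SIZES_EXAMPLE, DISABLE_CODEGEN_VALUES_EXAMPLE))
assert len(combos) == 6  # 3 batch sizes x 2 codegen settings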