def test_strings_utf8(self, vector, unique_database):
  # Create table
  table_name = "ice_str_utf8"
  qualified_table_name = "%s.%s" % (unique_database, table_name)
  query = 'create table %s (a string) stored as iceberg' % qualified_table_name
  self.client.execute(query)

  # Inserted string data should have UTF8 annotation regardless of query options.
  query = 'insert into %s values ("impala")' % qualified_table_name
  self.execute_query(query, {'parquet_annotate_strings_utf8': False})

  # Copy the created file to the local filesystem and parse metadata
  local_file = '/tmp/iceberg_utf8_test_%s.parq' % random.randint(0, 10000)
  LOG.info("test_strings_utf8 local file name: " + local_file)
  hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/data/*.parq'
      % (unique_database, table_name))
  check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
  metadata = get_parquet_metadata(local_file)

  # Extract the SchemaElement corresponding to the table column
  a_schema_element = metadata.schema[1]
  assert a_schema_element.name == 'a'

  # Check that the schema uses the UTF8 annotation
  assert a_schema_element.converted_type == ConvertedType.UTF8

  os.remove(local_file)
def test_set_column_orders(self, vector, unique_database, tmpdir):
  """Tests that the Parquet writers set FileMetaData::column_orders."""
  source_table = "functional_parquet.alltypessmall"
  target_table = "test_set_column_orders"
  qualified_target_table = "{0}.{1}".format(unique_database, target_table)
  hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
      unique_database, target_table))

  # Create table
  query = "create table {0} like {1} stored as parquet".format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Insert data
  query = ("insert into {0} partition(year, month) select * from {1}").format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Download hdfs files and verify column orders
  check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

  # alltypessmall has 11 non-partition columns, each with a type-defined order.
  expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

  for root, subdirs, files in os.walk(tmpdir.strpath):
    for f in files:
      parquet_file = os.path.join(root, str(f))
      file_meta_data = get_parquet_metadata(parquet_file)
      assert file_meta_data.column_orders == expected_col_orders
def test_sorting_columns(self, vector, unique_database, tmpdir):
  """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY
  columns."""
  source_table = "functional_parquet.alltypessmall"
  target_table = "test_write_sorting_columns"
  qualified_target_table = "{0}.{1}".format(unique_database, target_table)
  hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
      unique_database, target_table))

  # Create table
  query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Insert data
  query = ("insert into {0} partition(year, month) select * from {1}").format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Download hdfs files and extract rowgroup metadata
  row_groups = []
  check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

  for root, subdirs, files in os.walk(tmpdir.strpath):
    for f in files:
      parquet_file = os.path.join(root, str(f))
      file_meta_data = get_parquet_metadata(parquet_file)
      row_groups.extend(file_meta_data.row_groups)

  # Verify that the files have sorting_columns set. SortingColumn fields are
  # (column_idx, descending, nulls_first); int_col is column 4 and id is column 0.
  expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
  for row_group in row_groups:
    assert row_group.sorting_columns == expected
def test_sorting_columns_hint(self, vector, unique_database, tmpdir):
  """Tests that RowGroup::sorting_columns gets populated when specifying a sortby()
  insert hint."""
  source_table = "functional_parquet.alltypessmall"
  target_table = "test_write_sorting_columns"
  qualified_target_table = "{0}.{1}".format(unique_database, target_table)
  hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
      unique_database, target_table))

  # Create table
  # TODO: Simplify once IMPALA-4167 (insert hints in CTAS) has been fixed.
  query = "create table {0} like {1} stored as parquet".format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Insert data
  query = ("insert into {0} partition(year, month) /* +sortby(int_col, id) */ "
      "select * from {1}").format(qualified_target_table, source_table)
  self.execute_query(query)

  # Download hdfs files and extract rowgroup metadata
  row_groups = []
  check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

  for root, subdirs, files in os.walk(tmpdir.strpath):
    for f in files:
      parquet_file = os.path.join(root, str(f))
      file_meta_data = get_parquet_metadata(parquet_file)
      row_groups.extend(file_meta_data.row_groups)

  # Verify that the files have sorting_columns set
  expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
  for row_group in row_groups:
    assert row_group.sorting_columns == expected
def _get_row_group_stats_from_file(self, parquet_file):
  """Returns a list of statistics for each row group in file 'parquet_file'. The
  result is a two-dimensional list, containing stats by row group and column."""
  file_meta_data = get_parquet_metadata(parquet_file)
  # We only support flat schemas, the additional element is the root element.
  schemas = file_meta_data.schema[1:]
  file_stats = []
  for row_group in file_meta_data.row_groups:
    num_columns = len(row_group.columns)
    assert num_columns == len(schemas)
    column_stats = [c.meta_data.statistics for c in row_group.columns]
    file_stats.append(self._decode_row_group_stats(schemas, column_stats))
  return file_stats
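# A minimal sketch of the '_decode_row_group_stats' helper called above, which is not
# shown in this section. It assumes statistics carry plain-encoded, little-endian
# 'min_value'/'max_value' byte strings; the type mapping and struct formats below are
# illustrative assumptions, not the actual implementation.
def _decode_row_group_stats_sketch(schemas, column_stats):
  import struct
  decoded = []
  for schema, stats in zip(schemas, column_stats):
    if stats is None:
      decoded.append(None)
      continue
    # Physical types from the parquet.thrift Type enum: INT32 == 1, INT64 == 2.
    if schema.type == 1:
      decode = lambda b: struct.unpack('<i', b)[0]
    elif schema.type == 2:
      decode = lambda b: struct.unpack('<q', b)[0]
    else:
      decode = lambda b: b  # leave other types as raw bytes
    decoded.append((decode(stats.min_value), decode(stats.max_value)))
  return decoded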
def _get_row_group_from_file(self, parquet_file):
  """Returns namedtuples that contain the schema, stats, offset_index, column_index,
  and page_headers for each column in the first row group in file 'parquet_file'.
  Fails if the file contains multiple row groups.
  """
  ColumnInfo = namedtuple('ColumnInfo', ['schema', 'stats', 'offset_index',
      'column_index', 'page_headers'])

  file_meta_data = get_parquet_metadata(parquet_file)
  assert len(file_meta_data.row_groups) == 1
  # We only support flat schemas, the additional element is the root element.
  schemas = file_meta_data.schema[1:]
  row_group = file_meta_data.row_groups[0]
  assert len(schemas) == len(row_group.columns)
  row_group_index = []
  # Open in binary mode, since we read serialized thrift objects from the file.
  with open(parquet_file, 'rb') as file_handle:
    for column, schema in zip(row_group.columns, schemas):
      column_index_offset = column.column_index_offset
      column_index_length = column.column_index_length
      column_index = None
      if column_index_offset and column_index_length:
        column_index = read_serialized_object(ColumnIndex, file_handle,
            column_index_offset, column_index_length)
      column_meta_data = column.meta_data
      stats = None
      if column_meta_data:
        stats = column_meta_data.statistics

      offset_index_offset = column.offset_index_offset
      offset_index_length = column.offset_index_length
      offset_index = None
      page_headers = []
      if offset_index_offset and offset_index_length:
        offset_index = read_serialized_object(OffsetIndex, file_handle,
            offset_index_offset, offset_index_length)
        for page_loc in offset_index.page_locations:
          page_header = read_serialized_object(PageHeader, file_handle,
              page_loc.offset, page_loc.compressed_page_size)
          page_headers.append(page_header)

      column_info = ColumnInfo(schema, stats, offset_index, column_index,
          page_headers)
      row_group_index.append(column_info)
  return row_group_index
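# A hypothetical usage sketch for '_get_row_group_from_file': sanity-checking that
# the page-level bounds in each column index stay within the row-group statistics.
# The comparisons below operate on the raw encoded bytes, which only matches the
# logical ordering for some types; this is an illustration, not a test from the
# actual suite.
def _check_column_index_bounds_sketch(self, parquet_file):
  for col in self._get_row_group_from_file(parquet_file):
    if col.column_index is None or col.stats is None:
      continue
    # Every page-level minimum must be >= the row-group minimum, and every
    # page-level maximum must be <= the row-group maximum.
    for page_min in col.column_index.min_values:
      assert page_min >= col.stats.min_value
    for page_max in col.column_index.max_values:
      assert page_max <= col.stats.max_value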
def _get_first_row_group_bloom_filters(self, parquet_file):
  # Other helpers take a filename relative to $IMPALA_HOME and prepend the path of
  # $IMPALA_HOME themselves, but this one does not, so we have to prepend it here.
  filename = os.path.join(os.environ['IMPALA_HOME'], parquet_file)
  file_meta_data = get_parquet_metadata(filename)
  # We only support flat schemas, the additional element is the root element.
  schemas = file_meta_data.schema[1:]
  # We are only interested in the first row group.
  row_group = file_meta_data.row_groups[0]
  assert len(schemas) == len(row_group.columns)
  col_to_bloom_filter = dict()
  # Open in binary mode, since we read serialized thrift objects from the file.
  with open(filename, 'rb') as file_handle:
    for i, column in enumerate(row_group.columns):
      column_meta_data = column.meta_data
      if column_meta_data and column_meta_data.bloom_filter_offset:
        bloom_filter = self._try_read_bloom_filter(file_handle,
            column_meta_data.bloom_filter_offset)
        if bloom_filter:
          col_to_bloom_filter[i] = bloom_filter
  return col_to_bloom_filter
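# A minimal sketch of the '_try_read_bloom_filter' helper used above. It assumes the
# standard Parquet layout: a thrift-compact-serialized BloomFilterHeader at
# 'bloom_filter_offset', immediately followed by 'numBytes' bytes of bitset. The use
# of TCompactProtocol and the 'parquet.ttypes' import are assumptions about the test
# environment, not necessarily how the real helper is written.
def _try_read_bloom_filter_sketch(self, file_handle, bloom_filter_offset):
  from parquet.ttypes import BloomFilterHeader
  from thrift.protocol import TCompactProtocol
  from thrift.transport import TTransport
  file_handle.seek(bloom_filter_offset)
  # The serialized header is only a handful of bytes, so a small read is enough.
  buf = file_handle.read(1024)
  transport = TTransport.TMemoryBuffer(buf)
  protocol = TCompactProtocol.TCompactProtocol(transport)
  header = BloomFilterHeader()
  try:
    header.read(protocol)
  except Exception:
    return None
  # The bitset starts where the thrift parser stopped reading.
  header_size = transport.cstringio_buf.tell()
  file_handle.seek(bloom_filter_offset + header_size)
  bitset = file_handle.read(header.numBytes)
  return (header, bitset) if len(bitset) == header.numBytes else None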
def get_schema_elements():
  # Copy the created file to the local filesystem and parse metadata
  local_file = '/tmp/utf8_test_%s.parq' % random.randint(0, 10000)
  LOG.info("test_annotate_utf8_option local file name: " + local_file)
  hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
      % (unique_database, TABLE_NAME))
  check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
  metadata = get_parquet_metadata(local_file)

  # Extract SchemaElements corresponding to the table columns
  a_schema_element = metadata.schema[1]
  assert a_schema_element.name == 'a'
  b_schema_element = metadata.schema[2]
  assert b_schema_element.name == 'b'
  c_schema_element = metadata.schema[3]
  assert c_schema_element.name == 'c'
  d_schema_element = metadata.schema[4]
  assert d_schema_element.name == 'd'

  os.remove(local_file)
  return a_schema_element, b_schema_element, c_schema_element, d_schema_element