def _ctas_and_get_metadata(self, vector, unique_database, tmp_dir, source_table,
                           table_name="test_hdfs_parquet_table_writer"):
  """CTAS 'source_table' into a Parquet table and returns its Parquet metadata."""
  qualified_table_name = "{0}.{1}".format(unique_database, table_name)
  hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(
      unique_database, table_name))

  # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
  # resulting in a single parquet file being written.
  query = ("create table {0} stored as parquet as select * from {1}").format(
      qualified_table_name, source_table)
  vector.get_value('exec_option')['num_nodes'] = 1
  self.execute_query_expect_success(self.client, query,
      vector.get_value('exec_option'))

  file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmp_dir)
  assert len(file_metadata_list) == 1
  assert file_metadata_list[0] is not None
  return file_metadata_list[0]
def test_set_column_orders(self, vector, unique_database, tmpdir):
  """Tests that the Parquet writers set FileMetaData::column_orders."""
  source_table = "functional_parquet.alltypessmall"
  target_table = "test_set_column_orders"
  qualified_target_table = "{0}.{1}".format(unique_database, target_table)
  hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
      unique_database, target_table))

  # Create table
  query = "create table {0} like {1} stored as parquet".format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Insert data
  query = ("insert into {0} partition(year, month) select * from {1}").format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Download hdfs files and verify column orders
  file_metadata_list = get_parquet_metadata_from_hdfs_folder(
      hdfs_path, tmpdir.strpath)

  expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11
  for file_metadata in file_metadata_list:
    assert file_metadata.column_orders == expected_col_orders
def test_sorting_columns(self, vector, unique_database, tmpdir):
  """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY
  columns."""
  source_table = "functional_parquet.alltypessmall"
  target_table = "test_write_sorting_columns"
  qualified_target_table = "{0}.{1}".format(unique_database, target_table)
  hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
      unique_database, target_table))

  # Create table
  query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Insert data
  query = ("insert into {0} partition(year, month) select * from {1}").format(
      qualified_target_table, source_table)
  self.execute_query(query)

  # Download hdfs files and extract rowgroup metadata
  file_metadata_list = get_parquet_metadata_from_hdfs_folder(
      hdfs_path, tmpdir.strpath)
  row_groups = []

  for file_metadata in file_metadata_list:
    row_groups.extend(file_metadata.row_groups)

  # Verify that the files have the sorted_columns set
  expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
  for row_group in row_groups:
    assert row_group.sorting_columns == expected
def _get_row_group_stats_from_hdfs_folder(self, hdfs_path, tmp_dir):
  """Returns a list of statistics for each row group in all parquet files in
  'hdfs_path'. 'tmp_dir' needs to be supplied by the caller and will be used to
  store temporary files. The caller is responsible for cleaning up 'tmp_dir'. The
  result is a two-dimensional list, containing stats by row group and column."""
  row_group_stats = []

  file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmp_dir)
  for file_metadata in file_metadata_list:
    row_group_stats.extend(
        self._get_row_group_stats_from_file_metadata(file_metadata))

  return row_group_stats