def _get_row_group_from_file(self, parquet_file):
    """Collect per-column metadata for the single row group in 'parquet_file'.

    Returns a list with one ColumnInfo namedtuple per column of the first row
    group, in column order. Each tuple has the fields:
      schema       - the schema element paired with the column
      stats        - column_meta_data.statistics, or None if the column has no
                     meta data
      offset_index - the deserialized OffsetIndex, or None if the column chunk
                     records no offset index offset/length
      column_index - the deserialized ColumnIndex, or None if the column chunk
                     records no column index offset/length
      page_headers - one PageHeader per page location in the offset index
                     (empty when there is no offset index)

    Fails with an AssertionError if the file does not contain exactly one row
    group, or if the number of non-root schema elements differs from the number
    of columns in the row group.
    """
    ColumnInfo = namedtuple('ColumnInfo', [
        'schema', 'stats', 'offset_index', 'column_index', 'page_headers'
    ])

    file_meta_data = get_parquet_metadata(parquet_file)
    assert len(file_meta_data.row_groups) == 1
    # We only support flat schemas, the additional element is the root element.
    schemas = file_meta_data.schema[1:]
    row_group = file_meta_data.row_groups[0]
    assert len(schemas) == len(row_group.columns)

    row_group_index = []
    # The file is read by byte offset and length, so it must be opened in
    # binary mode: text mode would decode the bytes (and translate newlines),
    # which breaks deserialization on Python 3.
    with open(parquet_file, 'rb') as file_handle:
        for column, schema in zip(row_group.columns, schemas):
            column_index_offset = column.column_index_offset
            column_index_length = column.column_index_length
            column_index = None
            # An unset (None) or zero offset/length means there is no column
            # index to read for this column chunk.
            if column_index_offset and column_index_length:
                column_index = read_serialized_object(
                    ColumnIndex, file_handle, column_index_offset,
                    column_index_length)

            column_meta_data = column.meta_data
            stats = column_meta_data.statistics if column_meta_data else None

            offset_index_offset = column.offset_index_offset
            offset_index_length = column.offset_index_length
            offset_index = None
            page_headers = []
            if offset_index_offset and offset_index_length:
                offset_index = read_serialized_object(
                    OffsetIndex, file_handle, offset_index_offset,
                    offset_index_length)
                # Each page location gives the offset of a page; its header is
                # read from there, using the compressed page size as the read
                # length.
                for page_loc in offset_index.page_locations:
                    page_header = read_serialized_object(
                        PageHeader, file_handle, page_loc.offset,
                        page_loc.compressed_page_size)
                    page_headers.append(page_header)

            column_info = ColumnInfo(schema, stats, offset_index, column_index,
                                     page_headers)
            row_group_index.append(column_info)
    return row_group_index
def _try_read_bloom_filter_header(self, file_handle, bloom_filter_offset):
    """Probe for a Bloom filter header at 'bloom_filter_offset'.

    The header length is not known up front, so the read is attempted with a
    window of 8 bytes that is doubled every time read_serialized_object raises
    EOFError, up to and including a window of 1024 bytes.

    Returns a (header, size) tuple. On success 'header' is the deserialized
    BloomFilterHeader and 'size' is the read window that succeeded. If no
    attempt succeeds, 'header' is None and 'size' is unspecified.
    """
    probe_size = 8
    while probe_size <= 1024:
        try:
            candidate = read_serialized_object(
                BloomFilterHeader, file_handle, bloom_filter_offset, probe_size)
        except EOFError:
            # The window was too small to hold the whole header; widen it.
            probe_size *= 2
            continue
        if candidate is not None:
            return (candidate, probe_size)
    return (None, probe_size)