Exemplo n.º 1
0
def read_data_page(fo, schema_helper, page_header, column_metadata,
                   dictionary):
    """Read one data page from ``fo`` and return its values as a list.

    The page bytes are obtained through ``_read_page`` and then decoded in
    order: definition levels (only when the column is not required),
    repetition levels (only when the column path has more than one
    element), and finally the values themselves.

    Args:
        fo: file-like object handed to ``_read_page``.
        schema_helper: object answering ``is_required``,
            ``max_definition_level`` and ``max_repetition_level``.
        page_header: page header whose ``data_page_header`` describes the
            encodings and the number of values in the page.
        column_metadata: column metadata providing ``path_in_schema``,
            ``type`` and (for PLAIN pages) ``width``.
        dictionary: lookup used to translate decoded indices into values
            for PLAIN_DICTIONARY pages; unused for PLAIN pages.

    Returns:
        A list with the decoded values of the page.

    Raises:
        ParquetFormatException: if the page uses an encoding other than
            PLAIN or PLAIN_DICTIONARY, or if a dictionary-encoded page
            yields no more indices before ``num_values`` have been read.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = io.BytesIO(raw_bytes)
    vals = []
    # Definition levels are skipped if data is required. They still have to
    # be consumed here so that the stream is positioned at the next section.
    if not schema_helper.is_required(column_metadata.path_in_schema[-1]):
        max_definition_level = schema_helper.max_definition_level(
            column_metadata.path_in_schema)
        bit_width = encoding.width_from_max_int(max_definition_level)
        if bit_width == 0:
            definition_levels = [0] * daph.num_values
        else:
            definition_levels = _read_data(io_obj,
                                           daph.definition_level_encoding,
                                           daph.num_values, bit_width)

    # Repetition levels are skipped if data is at the first level.
    if len(column_metadata.path_in_schema) > 1:
        max_repetition_level = schema_helper.max_repetition_level(
            column_metadata.path_in_schema)
        bit_width = encoding.width_from_max_int(max_repetition_level)
        repetition_levels = _read_data(io_obj, daph.repetition_level_encoding,
                                       daph.num_values, bit_width)

    # TODO Actually use the definition and repetition levels.

    if daph.encoding == Encoding.PLAIN:
        width = column_metadata.width
        vals = [encoding.read_plain(io_obj, column_metadata.type, width)
                for _ in range(daph.num_values)]
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = io.BytesIO(dict_values_bytes)
        # TODO jcrobak -- not sure that this loop is needed?
        while total_seen < daph.num_values:
            values = encoding.read_rle_bit_packed_hybrid(
                dict_values_io_obj, bit_width, len(dict_values_bytes))
            if len(values) == 0:
                # Without this guard an empty batch would never advance
                # total_seen and the loop would spin forever.
                raise ParquetFormatException(
                    "Dictionary-encoded page ended after %d of %d values"
                    % (total_seen, daph.num_values))
            # The decoder may return more indices than the page declares;
            # keep only what is still missing.
            if len(values) + total_seen > daph.num_values:
                values = values[0:daph.num_values - total_seen]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    else:
        # Interpolate explicitly: exception constructors do not apply
        # %-formatting to their arguments.
        raise ParquetFormatException(
            "Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    return vals
Exemplo n.º 2
0
 def _read_repetitions(self, io_obj, daph, schema_helper, column_metadata):
     if len(column_metadata.path_in_schema) > 1:
         max_repetition_level = schema_helper.max_repetition_level(
             column_metadata.path_in_schema)
         bit_width = encoding.width_from_max_int(max_repetition_level)
         repetition_levels = self._read_data(io_obj,
                                             daph.repetition_level_encoding,
                                             daph.num_values, bit_width)
         return repetition_levels
     return None
Exemplo n.º 3
0
 def _read_repetitions(self, io_obj, daph, schema_helper, column_metadata):
     if len(column_metadata.path_in_schema) > 1:
         max_repetition_level = schema_helper.max_repetition_level(
             column_metadata.path_in_schema)
         bit_width = encoding.width_from_max_int(max_repetition_level)
         repetition_levels = self._read_data(io_obj,
                                             daph.repetition_level_encoding,
                                             daph.num_values, bit_width)
         return repetition_levels
     return None
Exemplo n.º 4
0
 def _read_definitions(self, io_obj, daph, schema_helper, column_metadata):
     # definition levels are skipped if data is required.
     if not schema_helper.is_required(column_metadata.path_in_schema[-1]):
         max_definition_level = schema_helper.max_definition_level(
             column_metadata.path_in_schema)
         bit_width = encoding.width_from_max_int(max_definition_level)
         if bit_width == 0:
             definition_levels = [0] * daph.num_values
         else:
             definition_levels = self._read_data(
                 io_obj, daph.definition_level_encoding, daph.num_values,
                 bit_width)
         return definition_levels
     return None
Exemplo n.º 5
0
 def _read_definitions(self, io_obj, daph, schema_helper, column_metadata):
     # definition levels are skipped if data is required.
     if not schema_helper.is_required(column_metadata.path_in_schema[-1]):
         max_definition_level = schema_helper.max_definition_level(
             column_metadata.path_in_schema)
         bit_width = encoding.width_from_max_int(max_definition_level)
         if bit_width == 0:
             definition_levels = [0] * daph.num_values
         else:
             definition_levels = self._read_data(io_obj,
                                                 daph.definition_level_encoding,
                                                 daph.num_values,
                                                 bit_width)
         return definition_levels
     return None