예제 #1
0
def test_rle_bp():
    """Round-trip random int32 data through the RLE/bit-packed encoder and decoder."""
    for _trial in range(10):
        # Draw the sample length first, then the samples themselves.
        n_items = np.random.randint(10, 100)
        values = np.random.randint(0, 15000, size=n_items, dtype=np.int32)

        # Destination for the decoded integers, with a little slack at the end.
        decoded = np.empty(values.size + 5, dtype=np.int32)
        sink = cencoding.NumpyIO(decoded.view('uint8'))

        # Scratch buffer that receives the encoded bytes.
        scratch = np.zeros(900, dtype=np.uint8)
        stream = cencoding.NumpyIO(scratch)
        bit_width = cencoding.width_from_max_int(values.max())

        # Encode without a length prefix, remembering how many bytes were written.
        cencoding.encode_rle_bp(values, bit_width, stream)
        n_encoded = stream.tell()
        stream.seek(0)

        cencoding.read_rle_bit_packed_hybrid(stream, bit_width, length=n_encoded, o=sink)
        assert np.array_equal(decoded[:values.size], values)
예제 #2
0
def test_length():
    """Check that ``write_int`` stores each value so it reads back as an int32."""
    samples = np.random.randint(0, 15000, size=100)
    scratch = np.zeros(900, dtype=np.uint8)
    writer = cencoding.NumpyIO(scratch)
    # A view shares memory with ``scratch``, so it always reflects the latest write.
    as_int32 = scratch.view('int32')
    for expected in samples:
        writer.seek(0)
        writer.write_int(expected)
        writer.seek(0)
        assert expected == as_int32[0]
예제 #3
0
def test_uvarint():
    """Round-trip random non-negative integers through the unsigned varint codec."""
    samples = np.random.randint(0, 15000, size=100)
    scratch = np.zeros(30, dtype=np.uint8)
    stream = cencoding.NumpyIO(scratch)
    for expected in samples:
        # Rewind before writing and again before reading back.
        stream.seek(0)
        cencoding.encode_unsigned_varint(expected, stream)
        stream.seek(0)
        decoded = cencoding.read_unsigned_var_int(stream)
        assert expected == decoded
예제 #4
0
def read_data(fobj, coding, count, bit_width, out=None):
    """Read definition or repetition levels into a uint8 array.

    Decodes with the RLE/bit-packed hybrid reader, calling it repeatedly
    (with a length argument of 0) until ``count`` bytes of output have been
    written.

    fobj: input stream handed to ``encoding.read_rle_bit_packed_hybrid``
    coding: page encoding; only ``parquet_thrift.Encoding.RLE`` is supported
    count: number of level values to decode
    bit_width: bit width of each encoded level
    out: optionally provide a uint8 array of length ``count`` to reuse;
        a new array is allocated when this is None

    Returns the array holding the decoded levels (``out`` itself when given).

    Raises NotImplementedError for any encoding other than RLE.
    """
    # ``out or ...`` would evaluate the truth value of the array, which raises
    # ValueError for arrays with more than one element and wrongly discards
    # empty / single-zero arrays; test identity against None instead.
    if out is None:
        out = np.empty(count, dtype=np.uint8)
    o = encoding.NumpyIO(out)
    if coding == parquet_thrift.Encoding.RLE:
        # Each call decodes one chunk; loop until the output is full.
        while o.tell() < count:
            encoding.read_rle_bit_packed_hybrid(fobj, bit_width, 0, o, itemsize=1)
    else:
        raise NotImplementedError('Encoding %s' % coding)
    return out
예제 #5
0
def read_data_page_v2(infile, schema_helper, se, data_header2, cmd,
                      dic, assign, num, use_cat, file_offset, ph, idx=None,
                      selfmade=False):
    """Decode one V2 data page from ``infile`` into the output array ``assign``.

    Repetition levels are read first (if the column has any), then definition
    levels (if the page contains nulls), and finally the value bytes, which
    are decoded according to ``data_header2.encoding``.

    :param infile: open file; reading starts at its current position
    :param schema_helper: supplies max_repetition_level / max_definition_level
        for ``cmd.path_in_schema``
    :param se: schema element
    :param data_header2: page header struct
    :param cmd: column metadata
    :param dic: any dictionary labels encountered
    :param assign: output array (all of it)
    :param num: offset, rows so far
    :param use_cat: output is categorical?
    :param file_offset: not referenced in this function body
    :param ph: page header providing compressed_page_size and
        uncompressed_page_size
    :param idx: mutable sequence; idx[0] is the row offset used, and advanced
        by ``data_header2.num_rows``, when the column has repetition levels
    :param selfmade: when true, allows the direct byte-copy fastpath for
        dictionary indices of width 8, 16 or 32
    :return: ``data_header2.num_values``
    :raises NotImplementedError: if the page encoding is not handled here

    test data "/Users/mdurant/Downloads/datapage_v2.snappy.parquet"
          a  b    c      d          e
    0   abc  1  2.0   True  [1, 2, 3]
    1   abc  2  3.0   True       None
    2   abc  3  4.0   True       None
    3  None  4  5.0  False  [1, 2, 3]
    4   abc  5  2.0   True     [1, 2]

    b is delta encoded; c is dict encoded

    """
    # Reject unsupported encodings up front, before touching the file.
    if data_header2.encoding not in [parquet_thrift.Encoding.PLAIN_DICTIONARY,
                                     parquet_thrift.Encoding.RLE_DICTIONARY,
                                     parquet_thrift.Encoding.RLE,
                                     parquet_thrift.Encoding.PLAIN,
                                     parquet_thrift.Encoding.DELTA_BINARY_PACKED
                                     ]:
        raise NotImplementedError
    # Number of value bytes on disk: the page size minus both level sections.
    size = (ph.compressed_page_size - data_header2.repetition_levels_byte_length -
            data_header2.definition_levels_byte_length)
    # Absolute file position at which the value bytes start.
    data = infile.tell() + data_header2.definition_levels_byte_length + data_header2.repetition_levels_byte_length
    # Count of non-null values actually stored in the page.
    n_values = data_header2.num_values - data_header2.num_nulls

    max_rep = schema_helper.max_repetition_level(cmd.path_in_schema)
    if max_rep:
        # TODO: probably not functional
        bit_width = encoding.width_from_max_int(max_rep)
        io_obj = encoding.NumpyIO(infile.read(data_header2.repetition_levels_byte_length))
        repi = np.empty(data_header2.num_values, dtype="uint8")
        encoding.read_rle_bit_packed_hybrid(io_obj, bit_width, data_header2.num_values,
                                            encoding.NumpyIO(repi), itemsize=1)

    max_def = schema_helper.max_definition_level(cmd.path_in_schema)

    # True when the output uses a pandas masked (nullable) dtype.
    nullable = isinstance(assign.dtype, pd.core.arrays.masked.BaseMaskedDtype)
    if max_def and data_header2.num_nulls:
        bit_width = encoding.width_from_max_int(max_def)
        # not the same as read_data(), because we know the length
        io_obj = encoding.NumpyIO(infile.read(data_header2.definition_levels_byte_length))
        if nullable:
            # Decode definition levels straight into the output's mask buffer.
            # NOTE(review): this is the whole mask, not a num-offset slice —
            # confirm behaviour for columns spanning several pages.
            defi = assign._mask
        else:
            # TODO: in tabular data, nulls arrays could be reused for each column
            defi = np.empty(data_header2.num_values, dtype=np.uint8)
        encoding.read_rle_bit_packed_hybrid(io_obj, bit_width, data_header2.num_values,
                                            encoding.NumpyIO(defi), itemsize=1)
        if max_rep:
            # assemble_objects needs both arrays
            nulls = defi != max_def
        else:
            # Overwrite defi in place with the comparison result, then
            # reinterpret the same memory as a boolean null mask.
            np.not_equal(defi.view("uint8"), max_def, out=defi)
            nulls = defi.view(np.bool_)
    # Position the file at the value bytes regardless of which level
    # sections were actually read above.
    infile.seek(data)

    # input and output element sizes match
    see = se.type_length == assign.dtype.itemsize * 8 or simple.get(se.type).itemsize == assign.dtype.itemsize
    # can read-into
    into0 = ((use_cat or converts_inplace(se) and see)
             and data_header2.num_nulls == 0
             and max_rep == 0 and assign.dtype.kind != "O")
    # can decompress-into
    into = (data_header2.is_compressed and rev_map[cmd.codec] in decom_into
            and into0)
    if nullable:
        # From here on, write into the underlying data buffer of the masked array.
        assign = assign._data

    # Size of the value section once decompressed (levels excluded).
    uncompressed_page_size = (ph.uncompressed_page_size - data_header2.definition_levels_byte_length -
                              data_header2.repetition_levels_byte_length)
    if into0 and data_header2.encoding == parquet_thrift.Encoding.PLAIN and (
            not data_header2.is_compressed or cmd.codec == parquet_thrift.CompressionCodec.UNCOMPRESSED
    ):
        # PLAIN read directly into output (a copy for remote files)
        infile.readinto(assign[num:num+n_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif into and data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN decompress directly into output
        decomp = decom_into[rev_map[cmd.codec]]
        decomp(infile.read(size), assign[num:num+data_header2.num_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN, but with nulls or not in-place conversion
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size),
                                    uncompressed_page_size, codec)
        values = read_plain(raw_bytes,
                            cmd.type,
                            n_values,
                            width=se.type_length,
                            utf=se.converted_type == 0)
        if data_header2.num_nulls:
            if nullable:
                # The mask already marks the nulls; only fill the valid slots.
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
            else:
                assign[num:num+data_header2.num_values][nulls] = None  # or nan or nat
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
        else:
            assign[num:num+data_header2.num_values] = convert(values, se)
    elif (use_cat and data_header2.encoding in [
        parquet_thrift.Encoding.PLAIN_DICTIONARY,
        parquet_thrift.Encoding.RLE_DICTIONARY,
    ]) or (data_header2.encoding == parquet_thrift.Encoding.RLE):
        # DICTIONARY or BOOL direct decode RLE into output (no nulls)
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = np.empty(size, dtype='uint8')
        # TODO: small improvement possible by file.readinto and decompress_into if we
        #  don't first read raw_bytes but seek in the open file
        infile.readinto(raw_bytes)
        raw_bytes = decompress_data(raw_bytes, uncompressed_page_size, codec)
        pagefile = encoding.NumpyIO(raw_bytes)
        if data_header2.encoding != parquet_thrift.Encoding.RLE:
            # TODO: check this bit; is the varint read only row byte-exact fastpath?
            # Dictionary pages store the bit width in the first byte; the
            # varint that follows is read only to advance past it.
            bit_width = pagefile.read_byte()
            encoding.read_unsigned_var_int(pagefile)
        else:
            bit_width = 1
            # presumably skips a 4-byte length prefix (seek relative to the
            # current position) — confirm against NumpyIO.seek semantics
            pagefile.seek(4, 1)
        if bit_width in [8, 16, 32] and selfmade:
            # special fastpath for cats
            outbytes = raw_bytes[pagefile.tell():]
            if len(outbytes) == assign[num:num+data_header2.num_values].nbytes:
                # Remaining bytes exactly fill the output slice: raw byte copy.
                assign[num:num+data_header2.num_values].view('uint8')[:] = outbytes
            else:
                if data_header2.num_nulls == 0:
                    assign[num:num+data_header2.num_values][:] = outbytes
                else:
                    assign[num:num+data_header2.num_values][~nulls] = outbytes
                    # -1 is written at the null positions.
                    assign[num:num+data_header2.num_values][nulls] = -1
        else:
            if data_header2.num_nulls == 0:
                # No nulls: decode straight into the output slice.
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8')),
                    itemsize=bit_width
                )
            else:
                # With nulls: decode into a temporary, then scatter into the
                # non-null positions of the output.
                temp = np.empty(data_header2.num_values, assign.dtype)
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(temp.view('uint8')),
                    itemsize=bit_width
                )
                if not nullable:
                    assign[num:num+data_header2.num_values][nulls] = None
                assign[num:num+data_header2.num_values][~nulls] = temp

    elif data_header2.encoding in [
        parquet_thrift.Encoding.PLAIN_DICTIONARY,
        parquet_thrift.Encoding.RLE_DICTIONARY
    ]:
        # DICTIONARY to be de-referenced, with or without nulls
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        compressed_bytes = infile.read(size)
        raw_bytes = decompress_data(compressed_bytes, uncompressed_page_size, codec)
        # Dictionary indices for the non-null values, one byte each.
        out = np.empty(n_values, dtype='uint8')
        pagefile = encoding.NumpyIO(raw_bytes)
        bit_width = pagefile.read_byte()
        encoding.read_rle_bit_packed_hybrid(
            pagefile,
            bit_width,
            uncompressed_page_size,
            encoding.NumpyIO(out),
            itemsize=1
        )
        if max_rep:
            # num_rows got filled, but consumed num_values data entries
            encoding._assemble_objects(
                assign[idx[0]:idx[0]+data_header2.num_rows], defi, repi, out, dic, d=True,
                null=True, null_val=False, max_defi=max_def, prev_i=0
            )
            idx[0] += data_header2.num_rows
        elif data_header2.num_nulls:
            if not nullable and assign.dtype != "O":
                assign[num:num+data_header2.num_values][nulls] = None  # may be unnecessary
            # Look the indices up in the dictionary and fill the valid slots.
            assign[num:num+data_header2.num_values][~nulls] = dic[out]
        else:
            assign[num:num+data_header2.num_values] = dic[out]
    elif data_header2.encoding == parquet_thrift.Encoding.DELTA_BINARY_PACKED:
        assert data_header2.num_nulls == 0, "null delta-int not implemented"
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size),
                                    uncompressed_page_size, codec)
        if converts_inplace(se):
            # Unpack directly into the output slice, then convert in place.
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes),
                encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8'))
            )
            convert(assign[num:num+data_header2.num_values], se)
        else:
            # Unpack into a temporary int32 array and assign the converted result.
            out = np.empty(data_header2.num_values, dtype='int32')
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes), encoding.NumpyIO(out.view('uint8'))
            )
            assign[num:num+data_header2.num_values] = convert(out, se)
    else:
        # Reached for an allowed encoding that no branch above handles
        # (e.g. a dictionary encoding combination not covered).
        # codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        # raw_bytes = decompress_data(infile.read(size),
        #                             ph.uncompressed_page_size, codec)
        raise NotImplementedError
    return data_header2.num_values
예제 #6
0
def read_data_page(f,
                   helper,
                   header,
                   metadata,
                   skip_nulls=False,
                   selfmade=False):
    """Decode a (V1) data page into its three components.

    The page body is read in order: repetition levels, definition levels,
    then the values. Only the values are guaranteed to be present, e.g. for
    a required top-level field.

    Returns a ``(definition_levels, repetition_levels, values)`` tuple, with
    ``values`` truncated to the number of non-null entries.
    """
    page = header.data_page_header
    stream = encoding.NumpyIO(_read_page(f, header, metadata))

    repetition_levels = read_rep(stream, page, helper, metadata)

    if skip_nulls and not helper.is_required(metadata.path_in_schema):
        # Caller does not want null information: step over the level bytes.
        num_nulls = 0
        definition_levels = None
        skip_definition_bytes(stream, page.num_values)
    else:
        definition_levels, num_nulls = read_def(stream, page, helper, metadata)

    nval = page.num_values - num_nulls
    se = helper.schema_element(metadata.path_in_schema)

    if page.encoding == parquet_thrift.Encoding.PLAIN:
        width = helper.schema_element(metadata.path_in_schema).type_length
        values = read_plain(stream.read(),
                            metadata.type,
                            int(nval),
                            width=width,
                            utf=se.converted_type == 0)
        return definition_levels, repetition_levels, values[:nval]

    if page.encoding not in [
            parquet_thrift.Encoding.PLAIN_DICTIONARY,
            parquet_thrift.Encoding.RLE_DICTIONARY, parquet_thrift.Encoding.RLE
    ]:
        raise NotImplementedError('Encoding %s' % page.encoding)

    # For RLE the width comes from the schema element; for dictionary
    # encodings it is stored as a single leading byte.
    if page.encoding == parquet_thrift.Encoding.RLE:
        bit_width = se.type_length
    else:
        bit_width = stream.read_byte()

    if bit_width in [8, 16, 32] and selfmade:
        # Whole-byte widths written by this library: reinterpret the bytes.
        n_items = (encoding.read_unsigned_var_int(stream) >> 1) * 8
        payload = stream.read(n_items * bit_width // 8)
        values = np.frombuffer(payload, dtype='int%i' % bit_width)
    elif bit_width:
        # Wider indices are decoded as 4-byte ints, narrow ones as bytes.
        if bit_width > 8:
            out_dtype, out_itemsize = np.int32, 4
        else:
            out_dtype, out_itemsize = np.uint8, 1
        decoded = np.empty(nval, dtype=out_dtype)
        sink = encoding.NumpyIO(decoded.view('uint8'))
        encoding.read_rle_bit_packed_hybrid(stream,
                                            bit_width,
                                            stream.len - stream.tell(),
                                            o=sink,
                                            itemsize=out_itemsize)
        values = decoded.data[:nval]
    else:
        # Zero bit width: every value is index 0.
        values = np.zeros(nval, dtype=np.int8)

    return definition_levels, repetition_levels, values[:nval]