def read_bytes_decimal(data, writer_schema=None, reader_schema=None):
    """
    Decimal is encoded as bytes; the unscaled integer occupies the whole
    byte string, so the size is simply the length of the encoded value.

    based on https://github.com/apache/avro/pull/82/
    """
    size = len(data)
    return _read_decimal(data, size, writer_schema)
def _read_decimal(data, size, writer_schema):
    """
    based on https://github.com/apache/avro/pull/82/
    """
    scale = writer_schema['scale']
    precision = writer_schema['precision']

    datum_byte = str2ints(data)

    unscaled_datum = 0
    msb = fstint(data)
    leftmost_bit = (msb >> 7) & 1
    if leftmost_bit == 1:
        # Negative value: clear the sign bit, accumulate the remaining
        # big-endian magnitude, then add -2**(size * 8 - 1) to undo the
        # two's complement representation.
        modified_first_byte = datum_byte[0] ^ (1 << 7)
        datum_byte = [modified_first_byte] + datum_byte[1:]
        for offset in xrange(size):
            unscaled_datum <<= 8
            unscaled_datum += datum_byte[offset]
        unscaled_datum += pow(-2, (size * 8) - 1)
    else:
        # Positive value: accumulate the big-endian bytes directly.
        for offset in xrange(size):
            unscaled_datum <<= 8
            unscaled_datum += datum_byte[offset]

    with localcontext() as ctx:
        ctx.prec = precision
        scaled_datum = Decimal(unscaled_datum).scaleb(-scale)
    return scaled_datum
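

# A minimal, self-contained sketch of the decoding performed above (not used
# by the reader; the helper name below is illustrative only).  Avro stores the
# unscaled decimal integer as big-endian two's complement, so decoding
# b'\x30\x39' (0x3039 == 12345) with scale=2 and precision=5 should yield
# Decimal('123.45').
def _example_decode_decimal(raw, scale, precision):
    from decimal import Decimal, localcontext
    unscaled = 0
    for byte in bytearray(raw):
        unscaled = (unscaled << 8) | byte
    if bytearray(raw)[0] & 0x80:
        # Sign bit set: undo two's complement.
        unscaled -= 1 << (len(raw) * 8)
    with localcontext() as ctx:
        ctx.prec = precision
        return Decimal(unscaled).scaleb(-scale)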
def read_map(fo, writer_schema, reader_schema=None):
    """Maps are encoded as a series of blocks.

    Each block consists of a long count value, followed by that many key/value
    pairs.  A block with count zero indicates the end of the map.  Each item is
    encoded per the map's value schema.

    If a block's count is negative, then the count is followed immediately by a
    long block size, indicating the number of bytes in the block.  The actual
    count in this case is the absolute value of the count written.
    """
    if reader_schema:
        def item_reader(fo, w_schema, r_schema):
            return read_data(fo, w_schema['values'], r_schema['values'])
    else:
        def item_reader(fo, w_schema, _):
            return read_data(fo, w_schema['values'])

    read_items = {}
    block_count = read_long(fo)
    while block_count != 0:
        if block_count < 0:
            block_count = -block_count
            # Read block size, unused
            read_long(fo)

        for i in xrange(block_count):
            key = read_utf8(fo)
            read_items[key] = item_reader(fo, writer_schema, reader_schema)
        block_count = read_long(fo)
    return read_items
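

# Hedged usage sketch for read_map: decoding the map {"a": 1, "b": 2} with
# "int" values from an in-memory buffer.  The payload is hand-encoded per the
# Avro spec (block count 2, the key/value pairs, then a zero block count as
# terminator).  The helper name is illustrative only and assumes the module's
# primitive readers follow the spec.
def _example_read_map():
    from io import BytesIO
    payload = b'\x04\x02a\x02\x02b\x04\x00'
    schema = {'type': 'map', 'values': 'int'}
    return read_map(BytesIO(payload), schema)   # expected: {'a': 1, 'b': 2}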
def read_array(fo, writer_schema, reader_schema=None):
    """Arrays are encoded as a series of blocks.

    Each block consists of a long count value, followed by that many array
    items.  A block with count zero indicates the end of the array.  Each item
    is encoded per the array's item schema.

    If a block's count is negative, then the count is followed immediately by a
    long block size, indicating the number of bytes in the block.  The actual
    count in this case is the absolute value of the count written.
    """
    if reader_schema:
        def item_reader(fo, w_schema, r_schema):
            return read_data(fo, w_schema['items'], r_schema['items'])
    else:
        def item_reader(fo, w_schema, _):
            return read_data(fo, w_schema['items'])

    read_items = []
    block_count = read_long(fo)
    while block_count != 0:
        if block_count < 0:
            block_count = -block_count
            # Read block size, unused
            read_long(fo)

        for i in xrange(block_count):
            read_items.append(item_reader(fo, writer_schema, reader_schema))
        block_count = read_long(fo)
    return read_items
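

# Hedged usage sketch for read_array: decoding the "int" array [7, 8] from an
# in-memory buffer.  The payload is hand-encoded per the Avro spec (block
# count 2, zig-zag encoded items 7 and 8, then a zero terminator); the helper
# name is illustrative only.
def _example_read_array():
    from io import BytesIO
    payload = b'\x04\x0e\x10\x00'
    schema = {'type': 'array', 'items': 'int'}
    return read_array(BytesIO(payload), schema)   # expected: [7, 8]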
def _iter_avro(fo, header, codec, writer_schema, reader_schema):
    """Return iterator over avro records."""
    sync_marker = header['sync']
    # Value in schema is bytes

    read_block = BLOCK_READERS.get(codec)
    if not read_block:
        raise ValueError('Unrecognized codec: %r' % codec)

    block_count = 0
    while True:
        block_count = read_long(fo, None)
        block_fo = read_block(fo)

        for i in xrange(block_count):
            yield read_data(block_fo, writer_schema, reader_schema)

        skip_sync(fo, sync_marker)
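

# Hedged sketch of what a BLOCK_READERS entry could look like: each reader
# takes the container file object positioned just after a block's record
# count and returns a file-like object over the decoded block data.  The
# deflate-style reader below is illustrative only and assumes the module
# exposes a read_bytes primitive (a long length followed by that many bytes),
# as the other readers here suggest.
def _example_deflate_read_block(fo):
    import zlib
    from io import BytesIO
    data = read_bytes(fo)
    # Avro's deflate codec omits the zlib header, hence the -15 window bits.
    return BytesIO(zlib.decompress(data, -15))

# BLOCK_READERS['deflate'] = _example_deflate_read_block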