def __init__(self, fo, schema, codec='null',
             sync_interval=1000 * SYNC_SIZE, metadata=None, validator=None):
    self.fo = fo
    self.schema = schema
    # validator=True selects the module-level validate function; any other
    # callable is used as-is (None disables validation)
    self.validate_fn = validate if validator is True else validator
    self.sync_marker = urandom(SYNC_SIZE)
    self.io = MemoryIO()
    self.block_count = 0
    self.metadata = metadata or {}
    self.metadata['avro.codec'] = codec
    self.metadata['avro.schema'] = json.dumps(schema)
    self.sync_interval = sync_interval
    try:
        self.block_writer = BLOCK_WRITERS[codec]
    except KeyError:
        raise ValueError('unrecognized codec: %r' % codec)
    write_header(self.fo, self.metadata, self.sync_marker)
    acquaint_schema(self.schema)
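
# Hedged usage sketch, assuming this __init__ belongs to the Writer class
# this module exposes (the class statement is outside this fragment):
#
#     >>> w = Writer(open('weather.avro', 'wb'), schema, codec='deflate',
#     >>>            validator=True)  # True selects the module's validate()
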
def prepare_fixed_decimal(data, schema):
    if not isinstance(data, decimal.Decimal):
        return data
    scale = schema['scale']
    size = schema['size']

    # based on https://github.com/apache/avro/pull/82/
    sign, digits, exp = data.as_tuple()
    if -exp > scale:
        raise ValueError('Scale provided in schema does not match the decimal')

    delta = exp + scale
    if delta > 0:
        digits = digits + (0,) * delta

    unscaled_datum = 0
    for digit in digits:
        unscaled_datum = (unscaled_datum * 10) + digit

    # 2.6 support
    if not hasattr(unscaled_datum, 'bit_length'):
        bits_req = len(bin(abs(unscaled_datum))) - 2
    else:
        bits_req = unscaled_datum.bit_length() + 1

    size_in_bits = size * 8
    offset_bits = size_in_bits - bits_req

    # mask selects the sign-extension bits above the bits_req low bits
    mask = 2 ** size_in_bits - 1
    bit = 1
    for i in range(bits_req):
        mask ^= bit
        bit <<= 1

    if bits_req < 8:
        bytes_req = 1
    else:
        bytes_req = bits_req // 8
        if bits_req % 8 != 0:
            bytes_req += 1

    tmp = MemoryIO()

    if sign:
        # negative: two's complement, sign-extended across the full size
        unscaled_datum = (1 << bits_req) - unscaled_datum
        unscaled_datum = mask | unscaled_datum
        for index in range(size - 1, -1, -1):
            bits_to_write = unscaled_datum >> (8 * index)
            tmp.write(mk_bits(bits_to_write & 0xff))
    else:
        # positive: zero-pad to the full size, then the magnitude bytes
        for i in range(offset_bits // 8):
            tmp.write(mk_bits(0))
        for index in range(bytes_req - 1, -1, -1):
            bits_to_write = unscaled_datum >> (8 * index)
            tmp.write(mk_bits(bits_to_write & 0xff))

    return tmp.getvalue()
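
# Hedged worked example for prepare_fixed_decimal: with an illustrative
# schema {'scale': 2, 'size': 4}, Decimal('3.14') has unscaled value 314
# (0x13A), emitted as 4 big-endian two's-complement bytes; a negative value
# is sign-extended across the full fixed size:
#
#     >>> import decimal
#     >>> prepare_fixed_decimal(decimal.Decimal('3.14'),
#     ...                       {'scale': 2, 'size': 4})
#     b'\x00\x00\x01:'
#     >>> prepare_fixed_decimal(decimal.Decimal('-3.14'),
#     ...                       {'scale': 2, 'size': 4})
#     b'\xff\xff\xfe\xc6'
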
def prepare_bytes_decimal(data, schema):
    if not isinstance(data, decimal.Decimal):
        return data
    scale = schema['scale']

    # based on https://github.com/apache/avro/pull/82/
    sign, digits, exp = data.as_tuple()
    if -exp > scale:
        raise ValueError('Scale provided in schema does not match the decimal')

    delta = exp + scale
    if delta > 0:
        digits = digits + (0,) * delta

    unscaled_datum = 0
    for digit in digits:
        unscaled_datum = (unscaled_datum * 10) + digit

    # 2.6 support
    if not hasattr(unscaled_datum, 'bit_length'):
        bits_req = len(bin(abs(unscaled_datum))) - 2
    else:
        bits_req = unscaled_datum.bit_length() + 1

    if sign:
        unscaled_datum = (1 << bits_req) - unscaled_datum

    bytes_req = bits_req // 8
    padding_bits = ~((1 << bits_req) - 1) if sign else 0
    packed_bits = padding_bits | unscaled_datum

    # round up to a whole byte if the bit count is not a multiple of 8
    bytes_req += 1 if (bytes_req << 3) < bits_req else 0

    tmp = MemoryIO()
    for index in range(bytes_req - 1, -1, -1):
        bits_to_write = packed_bits >> (8 * index)
        tmp.write(mk_bits(bits_to_write & 0xff))

    return tmp.getvalue()
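
# Hedged worked example for prepare_bytes_decimal: unlike the fixed variant
# above, the bytes variant emits only as many bytes as the two's-complement
# value needs (the schema shown is illustrative):
#
#     >>> import decimal
#     >>> prepare_bytes_decimal(decimal.Decimal('3.14'), {'scale': 2})
#     b'\x01:'
#     >>> prepare_bytes_decimal(decimal.Decimal('-3.14'), {'scale': 2})
#     b'\xfe\xc6'
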
def snappy_read_block(fo):
    length = read_long(fo, None)
    data = fo.read(length - 4)
    fo.read(4)  # CRC checksum of the uncompressed data; not verified here
    return MemoryIO(snappy.decompress(data))
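
# Hedged sketch: per the Avro spec, the 4 trailing bytes are a big-endian
# CRC-32 of the *uncompressed* data. snappy_read_block above skips them; a
# stricter reader could verify (illustrative, not the module's behavior):
#
#     import struct
#     from zlib import crc32
#     decompressed = snappy.decompress(data)
#     (expected,) = struct.unpack('>I', fo.read(4))
#     assert crc32(decompressed) & 0xffffffff == expected
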
def deflate_read_block(fo):
    """Read block in "deflate" codec."""
    data = read_bytes(fo, None)
    # -15 is the log of the window size; negative indicates "raw" (no
    # zlib headers) decompression. See zlib.h.
    return MemoryIO(decompress(data, -15))
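
# Hedged counterpart sketch: a matching "deflate" block writer would store a
# raw DEFLATE stream, i.e. drop the zlib framing (2-byte header, 4-byte
# Adler-32 trailer). One way, using a raw-mode compressor (illustrative; the
# module's actual BLOCK_WRITERS entry may differ):
#
#     from zlib import compressobj, DEFLATED
#     co = compressobj(6, DEFLATED, -15)  # -15 => raw deflate, no headers
#     raw = co.compress(block_bytes) + co.flush()
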
def writer(fo, schema, records, codec='null',
           sync_interval=1000 * SYNC_SIZE, metadata=None):
    """Write records to fo (stream) according to schema

    Parameters
    ----------
    fo: file-like
        Output stream
    schema: dict
        Avro schema the records are written against
    records: iterable
        Records to write
    codec: string, optional
        Compression codec, can be 'null', 'deflate' or 'snappy' (if installed)
    sync_interval: int, optional
        Size of sync interval
    metadata: dict, optional
        Header metadata

    Example
    -------

    >>> from fastavro import writer

    >>> schema = {
    >>>     'doc': 'A weather reading.',
    >>>     'name': 'Weather',
    >>>     'namespace': 'test',
    >>>     'type': 'record',
    >>>     'fields': [
    >>>         {'name': 'station', 'type': 'string'},
    >>>         {'name': 'time', 'type': 'long'},
    >>>         {'name': 'temp', 'type': 'int'},
    >>>     ],
    >>> }

    >>> records = [
    >>>     {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},
    >>>     {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
    >>>     {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
    >>>     {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
    >>> ]

    >>> with open('weather.avro', 'wb') as out:
    >>>     writer(out, schema, records)
    """
    sync_marker = urandom(SYNC_SIZE)
    io = MemoryIO()
    block_count = 0
    metadata = metadata or {}
    metadata['avro.codec'] = codec
    metadata['avro.schema'] = json.dumps(schema)

    try:
        block_writer = BLOCK_WRITERS[codec]
    except KeyError:
        raise ValueError('unrecognized codec: %r' % codec)

    def dump():
        # flush the buffered block: record count, encoded block, sync marker
        write_long(fo, block_count)
        block_writer(fo, io.getvalue())
        fo.write(sync_marker)
        io.truncate(0)
        io.seek(0, SEEK_SET)

    write_header(fo, metadata, sync_marker)
    acquaint_schema(schema)

    for record in records:
        write_data(io, record, schema)
        block_count += 1
        if io.tell() >= sync_interval:
            dump()
            block_count = 0

    if io.tell() or block_count > 0:
        dump()

    fo.flush()
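
# Hedged round-trip note: files produced by writer() can be read back with
# the package's reader iterator (assuming the standard fastavro reader API):
#
#     >>> from fastavro import reader
#     >>> with open('weather.avro', 'rb') as fp:
#     >>>     for record in reader(fp):
#     >>>         print(record)
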