def run(self, n):
    """Benchmark Avro binary serialization/deserialization over n iterations.

    Logs the serialized payload size and the average per-iteration
    serialize/deserialize times (in nanoseconds).
    """
    total_ser = 0
    total_deser = 0
    bytes_len = 0
    for _ in range(n):
        # --- serialize self.movies_data to an in-memory buffer ---
        writer = DatumWriter(self.movies_schema)
        out_buf = io.BytesIO()
        encoder = BinaryEncoder(out_buf)
        start = timeit.default_timer()
        writer.write(self.movies_data, encoder)
        total_ser = total_ser + (timeit.default_timer() - start)
        payload = out_buf.getvalue()
        bytes_len = len(payload)
        # --- deserialize the payload back ---
        decoder = BinaryDecoder(io.BytesIO(payload))
        reader = DatumReader(self.movies_schema)
        start = timeit.default_timer()
        movies = reader.read(decoder)
        total_deser = total_deser + (timeit.default_timer() - start)
    self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
    avg_ser = (total_ser * (10 ** 9)) / n
    avg_deser = (total_deser * (10 ** 9)) / n
    self.logger.log(logging.INFO, "Serialization time: \n%s", avg_ser)
    self.logger.log(logging.INFO, "De-serialization time: \n%s", avg_deser)
def read_value(schema: TypedSchema, bio: io.BytesIO):
    """Deserialize one value from *bio* according to the schema's type.

    Avro payloads are binary-decoded; JSON-schema payloads are loaded and
    validated. Raises InvalidPayload on validation failure and ValueError
    for unsupported schema types.
    """
    if schema.schema_type is SchemaType.AVRO:
        return DatumReader(schema.schema).read(BinaryDecoder(bio))
    if schema.schema_type is SchemaType.JSONSCHEMA:
        value = load(bio)
        try:
            schema.schema.validate(value)
        except ValidationError as e:
            # Surface validation problems through the project's own error type.
            raise InvalidPayload from e
        return value
    raise ValueError("Unknown schema type")
def deserialize_avro(binary_data, schema):
    """
    Function used to deserialize an avro binary data

    :param binary_data: event data in binary encoded (bytes)
    :param schema: avro schema of binary data
    :return: deserialized data and corresponding schema
    """
    decoder = BinaryDecoder(io.BytesIO(binary_data))
    record = DatumReader(schema).read(decoder)
    return record, schema
def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually forwards- and
    backwards-compatible
    """
    # First schema/record pair ("base" schema).
    fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
    fst_writer = DatumWriter(writers_schema=fst_schema)
    fst_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": "aaaaaaaaaaaaaaaa"
    }
    # Second schema/record pair ("good" schema).
    sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
    sec_writer = DatumWriter(writers_schema=sec_schema)
    sec_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": "bbbbbbbbbbbbbbbb"
    }
    # Binary-encode the first record with its own schema.
    fst_buf = StringIO.StringIO()
    fst_writer.write(fst_record, BinaryEncoder(fst_buf))
    fst_data = fst_buf.getvalue()
    # Binary-encode the second record with its own schema.
    sec_buf = StringIO.StringIO()
    sec_writer.write(sec_record, BinaryEncoder(sec_buf))
    sec_data = sec_buf.getvalue()
    # Forwards: data written with fst must be readable through sec.
    sec_reader = DatumReader(writers_schema=fst_schema,
                             readers_schema=sec_schema)
    sec_from_fst = sec_reader.read(BinaryDecoder(StringIO.StringIO(fst_data)))
    # no exception -> good
    # Backwards: data written with sec must be readable through fst.
    fst_reader = DatumReader(writers_schema=sec_schema,
                             readers_schema=fst_schema)
    fst_from_sec = fst_reader.read(BinaryDecoder(StringIO.StringIO(sec_data)))
    # no exception -> good
def deserialize(flight_info_bytes):
    """Deserialize Avro-encoded flight-info bytes.

    :param flight_info_bytes: Avro binary payload, or None
    :return: a JSON string (currently a hard-coded id — see note), or None
             when the input is None
    """
    if flight_info_bytes is None:
        return None
    decoder = BinaryDecoder(BytesIO(flight_info_bytes))
    # Fix: close the schema file instead of leaking the handle.
    with open(dir_path + "/flight-info.schema.avsc", "rb") as schema_file:
        schema_flight_info = Parse(schema_file.read())
    reader = DatumReader(schema_flight_info)
    flight_info = reader.read(decoder)
    # NOTE(review): the deserialized record is discarded and a hard-coded id
    # is returned — looks like stub/debug leftover; confirm whether this
    # should be `json.dumps(flight_info)`.
    return json.dumps([{"id": 907955534287978496}])
def decode_avro(self, payload):
    """Decode a Confluent-framed Avro payload.

    Returns (decoded record, schema name); raises ValueError when the
    leading magic byte is missing.
    """
    magic, schema_id = struct.unpack('>bi', payload[:5])
    # no magic bytes, something is wrong
    if magic != MAGIC_BYTES:
        raise ValueError
    # Avro value format: look the writer schema up in the registry.
    schema = self.register_client.get_by_id(schema_id)
    decoder = BinaryDecoder(io.BytesIO(payload[5:]))
    decoded = DatumReader(schema).read(decoder)
    return decoded, schema.name
def _decode_avro(self, avro_data):
    """
    Decode Avro Message.

    Args:
        avro_data (binary): Avro data to decode.

    Returns:
        dict
    """
    decoder = BinaryDecoder(io.BytesIO(avro_data.value))
    return DatumReader(self.schema).read(decoder)
def __init__(self, callback, service_name, param_schema, result_schema,
             version=0):
    """Bind a service callback to Avro (de)serializers for its parameter
    and result schemas.

    :param callback: callable invoked to service requests
    :param service_name: logical name of the service
    :param param_schema: JSON data describing the parameter schema
    :param result_schema: JSON data describing the result schema
    :param version: service version number (default 0)
    """
    self.callback = callback
    self.service_name = service_name
    self.version = version
    # Parse the JSON schema definitions once, up front.
    self.param_schema = SchemaFromJSONData(param_schema, Names())
    self.result_schema = SchemaFromJSONData(result_schema, Names())
    # Reader/writer pairs for both directions of a call.
    self._param_writer = DatumWriter(self.param_schema)
    self._param_reader = DatumReader(self.param_schema)
    self._result_writer = DatumWriter(self.result_schema)
    self._result_reader = DatumReader(self.result_schema)
def unpack(self, payload):
    """Unpack a Kafka message: Avro-decode Confluent-framed values,
    otherwise treat the payload as a string key with a trailing timestamp."""
    MAGIC_BYTES = 0
    magic, schema_id = struct.unpack('>bi', payload[:5])
    # Avro value format: fetch the writer schema from the registry.
    if magic == MAGIC_BYTES:
        schema = self.register_client.get_by_id(schema_id)
        decoder = BinaryDecoder(io.BytesIO(payload[5:]))
        return DatumReader(schema).read(decoder)
    # String key: the timestamp is inside the key's last 8 bytes — strip it.
    return payload[:-8].decode()
def parse_avro_msg(msg, avro_schema):
    """
    Parses an avro record using a specified avro schema

    Args:
        :msg: the avro message to parse
        :avro_schema: the avro schema

    Returns:
        The parsed/decoded message
    """
    decoder = BinaryDecoder(BytesIO(msg))
    return DatumReader(avro_schema).read(decoder)
def test_seekable(self):
    """Exercise SeekableDataFileReader.align_after against recorded
    (file offset, record) pairs from a freshly written avro file."""
    fn = self.write_avro_file(avro_user_record, 500, 1024)
    with open(fn, 'rb') as f:
        sreader = SeekableDataFileReader(f, DatumReader())
        # Pair each record with the file offset observed just before it
        # was read.
        res = [t for t in czip(cmap(
            lambda _: f.tell(), it.repeat(1)
        ), sreader)]
        # Aligning after the last recorded offset must exhaust the reader.
        sreader.align_after(res[-1][0])
        with self.assertRaises(StopIteration):
            r = next(sreader)
        # Aligning after 0 must rewind to the first record.
        sreader.align_after(0)
        r = next(sreader)
        self.assertEqual(r, res[0][1])

        def offset_iterator():
            # Yield (offset, record) pairs, skipping offsets that align to
            # a position already visited (s tracks the previous position).
            s = -1
            for o, r in res:
                sreader.align_after(o)
                t = f.tell()
                if t == s:
                    continue
                s = t
                x = next(sreader)
                yield (t, x)

        # Each record obtained via align_after must match the first recorded
        # record whose offset is >= the aligned offset.
        i = 0
        for xo, x in offset_iterator():
            sreader.align_after(xo)
            for o, r in res[i:]:
                if o >= xo:
                    self.assertEqual(x, r)
                    break
                i += 1
class Deserializer(object):
    """Avro deserializer bound to a single schema string."""

    def __init__(self, schema_str):
        # Parse the schema once; the DatumReader is reused for every record.
        self.reader = DatumReader(parse(schema_str))

    def deserialize(self, rec_bytes):
        """Decode one Avro-encoded record from a byte string."""
        decoder = BinaryDecoder(StringIO(rec_bytes))
        return self.reader.read(decoder)
def _from_avro_generic(avro_container_uri: str, ):
    """Load exactly one datum from an AVRO container file.

    Raises EmeraldMessageDeserializationError when the file holds more
    than one datum or no datum at all.
    """
    datum_counter = 0
    datum_to_return = None
    # DET TODO add other exception handling around the double with clause
    with open(avro_container_uri, "rb") as avro_fp:
        with DataFileReader(avro_fp, DatumReader()) as reader:
            #
            # This static method can only initialize one datum in the file -
            # scan through and raise error if more than one found.
            # Not sure if there is lazy access to the datum - if so returning
            # the datum to caller for subsequent loading would be problematic
            #
            for datum_counter, datum in enumerate(reader, start=1):
                print('Reading datum #' + str(datum_counter))
                print('The message datum = ' + str(datum))
                if datum_counter == 1:
                    datum_to_return = datum
                if datum_counter > 1:
                    # A second datum is a hard error, raised immediately.
                    raise EmeraldMessageDeserializationError(
                        'Unable to deserialize from AVRO container "' +
                        avro_container_uri +
                        '" - this deserializer can only have one datum per file' +
                        os.linesep +
                        'Total element count in this file = ' +
                        str(datum_counter))
    if datum_to_return is None:
        raise EmeraldMessageDeserializationError(
            'Data could not be loaded from AVRO file "' +
            str(avro_container_uri) + '" using schema ' +
            AbstractContainer.get_avro_schema_record().avro_schema_name)
    # NOTE(review): despite saying "Length", this prints the datum itself —
    # confirm whether str(len(datum_to_return)) was intended.
    print('Length of datum to return = ' + str(datum_to_return))
    print('Type of data to return = ' + str(type(datum_to_return)))
    return datum_to_return
def _unpack(self, payload):
    """Decode a message value: Avro for Confluent-framed payloads,
    plain UTF-8 otherwise."""
    magic, schema_id = struct.unpack('>bi', payload[:5])
    # Avro value format: look the writer schema up in the registry.
    if magic == self.MAGIC_BYTES:
        schema = self.register_client.get_by_id(schema_id)
        decoder = BinaryDecoder(io.BytesIO(payload[5:]))
        return DatumReader(schema).read(decoder)
    # If KSQL payload, exclude timestamp which is inside the key:
    # payload[:-8].decode()
    return payload.decode()
def decode(self, encoded_event: Any) -> Dict[str, Union[BaseModel, BaseStoreRecord, BaseHandler, BaseStoreRecordHandler]]:
    """Decode an Avro-container event and resolve its event/handler classes.

    The fully-qualified schema name (namespace.name) from the container
    metadata is matched against the registered event and handler patterns.

    Raises:
        AvroDecodeError: when Avro decoding fails.
        MissingEventClass: when no registered event pattern matches.
        MissingHandlerClass: when no registered handler pattern matches.
    """
    try:
        reader = DataFileReader(BytesIO(encoded_event), DatumReader())
        # The writer schema travels in the container's metadata.
        schema = json.loads(reader.meta.get('avro.schema').decode('utf-8'))
        schema_name = schema['namespace'] + '.' + schema['name']
        # assumes one datum per container — TODO confirm
        event_data = next(reader)
    except AvroTypeException as err:
        self.logger.exception(f'{err.__str__()}')
        raise AvroDecodeError
    # Finds a matching event name (for/else: raise when no pattern matches)
    for e_name, event in self._events.items():
        if e_name.match(schema_name):  # type: ignore
            event_class = event
            break
    else:
        raise MissingEventClass
    # Finds a matching handler name
    for e_name, handler in self._handlers.items():
        if e_name.match(schema_name):  # type: ignore
            handler_class = handler
            break
    else:
        raise MissingHandlerClass
    return {'event_class': event_class.from_data(event_data=event_data),
            'handler_class': handler_class}
def get_messages():
    """Consume all available Avro messages from Kafka, deserialize them,
    and return them sorted by timestamp."""
    # Build a reader for the Avro schema.
    schema = avro.schema.Parse(open(SCHEMA_PATH,'r').read())
    reader = DatumReader(schema)
    # Create the consumer.
    consumer = KafkaConsumer(
        TOPIC_NAME,
        group_id=GROUP_ID,
        bootstrap_servers=BOOTSTRAP_SERVERS,
        api_version=(0, 10),
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        consumer_timeout_ms=15000
    )
    # Read the pending messages (stops after consumer_timeout_ms of silence).
    messages = []
    for message in consumer:
        messages.append(avro_deserializer(message.value,reader))
    consumer.close()
    # Sort by timestamp.
    messages = sorted(messages, key=lambda k: k['timestamp'])
    return messages
def test_avro_reader(self):
    """AvroReader over an input split must cooperate with a plain
    SeekableDataFileReader and cover all N records across two splits."""
    N = 500
    fn = self.write_avro_file(avro_user_record, N, 1024)
    url = hdfs.path.abspath(fn, local=True)

    class FunkyCtx(object):
        # Minimal context stub exposing only the input_split attribute.
        def __init__(self, isplit):
            self.input_split = isplit

    def get_areader(offset, length):
        # Build an AvroReader over the byte range [offset, offset+length).
        isplit = InputSplit(InputSplit.to_string(url, offset, length))
        ctx = FunkyCtx(isplit)
        return AvroReader(ctx)

    # A split ending inside the header must yield no records.
    areader = get_areader(0, 14)
    file_length = areader.reader.file_length
    with self.assertRaises(StopIteration):
        next(areader)
    # A full-file split must agree record-for-record with a direct reader.
    areader = get_areader(0, file_length)
    with SeekableDataFileReader(open(fn, 'rb'), DatumReader()) as sreader:
        for (o, a), s in czip(areader, sreader):
            self.assertEqual(a, s)
    # Two adjacent splits must partition the records exactly.
    mid_len = int(file_length / 2)
    lows = [x for x in get_areader(0, mid_len)]
    highs = [x for x in get_areader(mid_len, file_length)]
    self.assertEqual(N, len(lows) + len(highs))
def _read_avro_file(self) -> List[dict]:
    """Read every record from all Avro files under the Avro directory.

    Returns:
        A flat list of decoded records, in glob order.
    """
    records: List[dict] = []
    for path in glob.glob(os.path.join(self.avro_dir_name, '**/*')):
        # Fix: close both the file handle and the reader (the original
        # leaked them); also drop the redundant nested os.path.join.
        with open(path, 'rb') as fp:
            with DataFileReader(fp, DatumReader()) as reader:
                records.extend(reader)
    return records
def read_then_to_json(client, file_names, bucket, error_keys_table):
    """Download Avro files from S3 and collect their unique records.

    Files that fail to download or open are recorded in error_keys_table
    under 'aws'/'open' and skipped (best-effort behaviour preserved).

    :param client: boto3-style S3 client
    :param file_names: iterable of S3 keys to fetch
    :param bucket: S3 bucket name
    :param error_keys_table: dict collecting per-category failed keys
    :return: list of unique records across all readable files
    """
    temp_json_output = []
    for file in file_names:
        filename = "/tmp/temp.avro"
        try:
            client.download_file(Bucket=bucket, Key=file, Filename=filename)
        except Exception as e:
            # files which could not be downloaded
            print("File could not be downloaded: " + file)
            error_keys_table['aws']['files'].append(file)
            continue
        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
        except Exception as e:
            # files that couldn't be opened
            print("File could not be opened: " + file)
            error_keys_table['open']['files'].append(file)
            continue
        try:
            for user in reader:
                # Linear membership test keeps only unique records, as before.
                if user not in temp_json_output:
                    temp_json_output.append(user)
        finally:
            # Fix: the original never closed the reader / its file handle.
            reader.close()
    return temp_json_output
def runEngine(self, engine):
    """Run a PFA engine over every record in the exoplanets Avro fixture.

    For emit-method engines, emit is replaced with a pass-through so
    action() can be driven record by record.
    """
    if engine.config.method == "emit":
        engine.emit = lambda x: x
    # Fix: Avro containers are binary — open in "rb" (text mode breaks on
    # Python 3 and corrupts data on Windows). Also close the reader/file.
    with DataFileReader(open("test/prettypfa/exoplanets.avro", "rb"),
                        DatumReader()) as reader:
        for record in reader:
            engine.action(record)
def print_all_events(path, limit=10):
    """example stepping through all the data files and parsing them

    1. iterate through all data files
    2. open files with avro
    3. parse event JSON
    4. pretty-print events

    :param path: root directory to walk for avro data files
    :param limit: maximum number of files to print before stopping
    """
    printed = 0
    for parent, dirs, files in os.walk(path):
        for fname in sorted(files):
            # Fix: check the limit *before* processing, so exactly `limit`
            # files are printed (the original stopped after limit - 1 and
            # skipped the file that tripped the counter).
            if printed >= limit:
                print("...")
                return
            printed += 1
            with open(os.path.join(parent, fname), 'rb') as f:
                # this is how you open an avro file
                reader = DataFileReader(f, DatumReader())
                # an avro file provides an iterable of events
                for reading in reader:
                    # the uuid we want to use is
                    # reading.SystemProperties.connectionDeviceId
                    print(
                        f"uuid={reading['SystemProperties']['connectionDeviceId']}"
                    )
                    # the actual payload from the app is the json body
                    # (as a bytestring)
                    try:
                        # parse it out so it looks nicer when we print:
                        reading['Body'] = json.loads(
                            reading['Body'].decode('utf8'))
                    except ValueError:
                        # leave not json as bytes. This shouldn't happen!
                        pass
                    pprint.pprint(reading)
def handle_file(path):
    """Read every record from the Avro file at *path* and process the
    protobuf CollectionSet carried in each record's Body field."""
    print("Reading file from: " + path)
    # Fix: close the reader (and its underlying file) — the original leaked
    # the handle on every call.
    with DataFileReader(open(path, "rb"), DatumReader()) as reader:
        for record in reader:
            cset = CollectionSet()
            cset.ParseFromString(record['Body'])
            handle_collection_set(cset)
def lambda_handler(event, context):
    """S3-triggered Lambda: copy the uploaded Avro object to the
    "-processed" bucket, delete the original, and forward each record
    to Tealium."""
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote(event['Records'][0]['s3']['object']['key'])
    # Bucket name is split on '.'; assumes "<account>.<profile>[...]" —
    # TODO confirm the naming convention.
    splitStr = source_bucket.split(".")
    account = splitStr[0]
    profile = splitStr[1]
    processed_bucket = source_bucket + "-processed"
    processed_key = key
    stream = get_object(source_bucket, key)
    success = copy_object(source_bucket, key, processed_bucket, processed_key)
    if success:
        # NOTE(review): the source object is deleted before `stream` is read
        # below — confirm get_object returns a body that survives deletion.
        s3.delete_object(Bucket=source_bucket, Key=key)
    if stream is not None:
        raw_bytes = stream.read()
        avro_bytes = io.BytesIO(raw_bytes)
        reader = DataFileReader(avro_bytes, DatumReader())
        for line in reader:
            send_to_tealium(line, account, profile)
    return ""
def test_avro_reader(self):
    """AvroReader over input splits must agree with a direct seekable
    reader and two adjacent splits must partition all N records.
    (Python 2 code: uses .next() and itertools.izip.)"""
    class FunkyCtx(object):
        # Minimal context stub exposing only the input_split attribute.
        def __init__(self, isplit):
            self.input_split = isplit

    this_directory = os.path.abspath(os.path.dirname(__file__))
    url = '/'.join(['file://', this_directory, AVRO_DATA])

    def get_areader(offset, length):
        # Build an AvroReader over the byte range [offset, offset+length).
        isplit = InputSplit(InputSplit.to_string(url, offset, length))
        ctx = FunkyCtx(isplit)
        return AvroReader(ctx)

    N = 500
    with open(AVRO_DATA, 'wb') as f:
        self.write_avro_file(f, AVRO_USER_SCHEMA, avro_user_record, N, 1024)
    # A split ending inside the header must yield no records.
    areader = get_areader(0, 14)
    file_length = areader.reader.file_length
    with self.assertRaises(StopIteration):
        areader.next()
    # A full-file split must agree record-for-record with a direct reader.
    areader = get_areader(0, file_length)
    sreader = SeekableDataFileReader(open(AVRO_DATA), DatumReader())
    for (o, a), s in it.izip(areader, sreader):
        self.assertEqual(a, s)
    # Two adjacent splits must partition the records exactly.
    mid_len = int(file_length / 2)
    lows = [x for x in get_areader(0, mid_len)]
    highs = [x for x in get_areader(mid_len, file_length)]
    self.assertEqual(N, len(lows) + len(highs))
def generic_dataframe(self, df, avro_schema, assert_fns=None):
    """Generic test running function for arbitrary avro schemas.

    Writes a dataframe containing the records to avro.
    Reads back and compares with the original — once via cyavro and once
    via the reference python-avro implementation. `assert_fns` may map a
    column name to a custom comparison function (defaults to
    np.testing.assert_array_equal).
    """
    print(avro_schema)
    cyavro.write_avro_file_from_dataframe(df, self.filename,
                                          json.dumps(avro_schema),
                                          codec='null')
    if assert_fns is None:
        assert_fns = {}
    df_read = cyavro.read_avro_file_as_dataframe(self.filename)
    # Reference read using the pure-python avro implementation.
    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter
    with open(self.filename, 'rb') as fo:
        reader = DataFileReader(fo, DatumReader())
        records = []
        for user in reader:
            records.append(user)
        df_reference = pd.DataFrame(records)
        reader.close()
    # Compare every schema field in both read paths; collect all failures
    # before asserting so the report shows every broken column.
    success = True
    for col in avro_schema["fields"]:
        colname = col['name']
        assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

        def print_fail_header(s):
            print('#' * len(s))
            print("FAIL: Column {}".format(col))
            print('#' * len(s))
            print(s)

        try:
            assert_fn(df_read[colname], df[colname])
        except AssertionError:
            print_fail_header(
                "Failed for cyavro read comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False
        try:
            assert_fn(df_reference[colname], df[colname])
        except AssertionError:
            print_fail_header(
                "Failed for cyavro write comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False
    assert success
class Deserializer(object):
    """Avro deserializer bound to one schema string."""

    def __init__(self, schema_str):
        # Parse once and hold a single reusable DatumReader.
        self.reader = DatumReader(avro.schema.parse(schema_str))

    def deserialize(self, rec_bytes):
        """Decode a single Avro-encoded record from a byte string."""
        decoder = BinaryDecoder(StringIO(rec_bytes))
        return self.reader.read(decoder)
def main(args):
    """Read an Avro file and publish each row, base64-encoded, to AMS.

    Logs to syslog; rows are re-serialized individually with the file's
    writer schema before publishing.
    """
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)
    # Fix: Avro containers are binary — open in "rb" (text mode breaks on
    # Python 3).
    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())
    schema = reader.datum_reader.writers_schema
    # The writer depends only on the schema; create it once outside the loop.
    writer = DatumWriter(schema)
    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        # Fix: b64encode returns bytes on Python 3; decode so the message is
        # JSON-serializable.
        b64enc = base64.b64encode(raw_bytes).decode("ascii")
        msg = {"messages": [{"data": b64enc}]}
        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)
def get_data_in_batches(
    self,
    bucket_name,
    prefix=None,
    data_after=None,
    data_until=None,
    batch_size=10000,
):
    """Yield lists of decoded Avro records (augmented with blob metadata)
    in batches of at most *batch_size* rows.

    Blobs are enumerated via self.generate_blob_list; each row gains
    'blob_name' and 'blob_modified_at' keys. The final partial batch is
    yielded after the last blob.
    """
    rows = []
    for blob in self.generate_blob_list(bucket_name, prefix, data_after,
                                        data_until):
        # download file content as bytes, read via avro
        blob_meta = {
            "blob_name": blob.name,
            "blob_modified_at": blob.updated,
        }
        bytes_data = blob.download_as_string()
        bytes_object = BytesIO(bytes_data)
        # need to "fake" the mode attribute because avro checks the mode of
        # the file given for some reason, fails otherwise
        bytes_object.mode = "rb+"
        reader = DataFileReader(bytes_object, DatumReader())
        for row in reader:
            # add blob-level metadata
            row.update(blob_meta)
            rows.append(row)
            if len(rows) >= batch_size:
                yield rows
                rows = []
    if rows:
        # return any data that was left after the last iteration
        yield rows
def build_cars_in_time(files):
    """Group per-car samples from a list of Avro files, keyed by the car's
    registration number.

    :param files: iterable of Avro file paths
    :return: dict mapping reg number -> list of sample dicts
    """
    cars = {}
    for idx, file in enumerate(files):
        print_progress(idx, len(files))
        reader = None
        try:
            reader = DataFileReader(open(file, "rb"), DatumReader())
            for car in reader:
                car_reg_number = select_reg_number(car)
                # Fix: `not in` replaces the Python-2-only dict.has_key().
                if car_reg_number not in cars:
                    cars[car_reg_number] = []
                cars[car_reg_number].append({
                    'timestamp': select_timestamp(car),
                    'distanceAccumulated': select_distance_accumulated(car),
                    'regNumber': select_reg_number(car)
                })
        except TypeError:
            print("Error reading file {0}".format(file))
        finally:
            # Fix: guard against `reader` being unbound when open() or the
            # DataFileReader constructor raised (original hit NameError here).
            if reader is not None:
                reader.close()
    return cars
def respond(self, call_request):
    """Process one Avro RPC call request and return the framed response
    bytes (handshake, response metadata, payload or error)."""
    buffer_reader = io.BytesIO(call_request)
    buffer_decoder = BinaryDecoder(buffer_reader)
    buffer_writer = io.BytesIO()
    buffer_encoder = BinaryEncoder(buffer_writer)
    error = None
    response_metadata = {}
    try:
        remote_protocol = self.process_handshake(buffer_decoder,
                                                 buffer_encoder)
        # Handshake incomplete/failed: return whatever was written so far.
        if remote_protocol is None or self.local_protocol is None:
            return buffer_writer.getvalue()
        # Read (and discard) the request metadata map.
        DatumReader(schema.parse(
            '{"type": "map", "values": "bytes"}')).read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()
        remote_message = remote_protocol.messages.get(remote_message_name)
        if remote_message is None:
            fail_msg = 'Unknown remote message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.messages.get(
            remote_message_name)
        if local_message is None:
            fail_msg = 'Unknown local message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        # Resolve the request against writer (remote) and reader (local)
        # schemas, then invoke the local implementation.
        writers_schema = remote_message.request
        readers_schema = local_message.request
        request = self.read_request(writers_schema, readers_schema,
                                    buffer_decoder)
        response = None
        try:
            response = self.invoke(self.local_protocol, local_message,
                                   request)
        except AvroRemoteException as e:
            error = e
        except Exception as e:
            error = AvroRemoteException(str(e))
        # Frame: empty response metadata, then a boolean error flag, then
        # either the response payload or the error.
        DatumWriter(
            schema.parse('{"type": "map", "values": "bytes"}')).write(
                response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(error is not None)
        if error is None:
            writers_schema = local_message.response
            self.write_response(writers_schema, response, buffer_encoder)
        else:
            writers_schema = local_message.errors
            self.write_error(writers_schema, error, buffer_encoder)
    except schema.AvroException as e:
        # Protocol-level failure: rebuild the response from scratch as a
        # framed error in a fresh buffer.
        error = AvroRemoteException(str(e))
        buffer_encoder = BinaryEncoder(io.BytesIO())
        DatumWriter(
            schema.parse('{"type": "map", "values": "bytes"}')).write(
                response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(True)
        self.write_error(schema.parse('["string"]'), error, buffer_encoder)
        # NOTE(review): this returns the fresh error buffer's contents, not
        # buffer_writer — confirm dropping the handshake prefix is intended.
        return buffer_encoder.writer.getvalue()
    return buffer_writer.getvalue()
def read(self, format):
    """Read the previously written output file in the given *format* and
    return the elapsed wall-clock time in seconds.

    :param format: one of 'json', 'jsch', 'avro', 'protobuf', 'gzjson'
    :return: elapsed time (float seconds)
    """
    time_start = time.time()
    # Fix: the original mixed a standalone `if` into the elif chain; the
    # behaviour is unchanged but the dispatch is now a single chain.
    if format == 'json':
        with open('./output/output.json') as file:
            json.loads(file.read())
    elif format == 'jsch':
        with open('./output/output.json') as file:
            validate(json.loads(file.read()), self._schema_json)
    elif format == 'avro':
        # Fix: Avro containers are binary — open in 'rb' (text mode breaks
        # on Python 3).
        reader = DataFileReader(open('./output/output.avro', 'rb'),
                                DatumReader())
        for user in reader:
            pass
        reader.close()
    elif format == 'protobuf':
        with open('./output/output.pb', 'rb') as file:
            addressbook_pb2.AddressBook().ParseFromString(file.read())
    elif format == 'gzjson':
        with gzip.open('./output/output.jsz', 'rb') as file:
            json.loads(file.read())
    time_end = time.time()
    return time_end - time_start
def get_flowrecords_from_flowdata_file(filename_path_input):
    """
    Create a Python generator to read the csv/txt/avro file returning the
    records to processing.
    *Important: when considering CSV/TXT files remember to use files without
    header/statistics as input files*
    :param filename_path_input: exported csv/txt/avro flow input file from
     the original nfpcap file via NFDUMP
    :return: generator to records from file
    """
    # CSV/TXT path: parse each line into a flow record.
    if filename_path_input.lower().endswith(('.csv', '.txt')):
        with open(filename_path_input) as csvfile:
            reader = csv.reader(csvfile)
            for line in reader:
                yield create_flow_record_from_csv(line)
    # >> default extension Apache AVRO <<
    else:
        # prepare to read binary
        # NOTE(review): this reader is never closed — the file handle leaks
        # if the consumer abandons the generator.
        flowsrecords_reader = DataFileReader(open(filename_path_input, "rb"),
                                             DatumReader())
        try:
            for flow in flowsrecords_reader:
                yield flow
        except zlib.error as ze:
            # Python 2 print statement; `.message` is Python-2-only too.
            print ze.message
            pass
        except IOError as io:
            # NOTE(review): `io` shadows the io module name in this scope —
            # rename if the module is ever needed here.
            print io.message
def callback(message):
    """Pub/Sub subscriber callback: decode the message according to its
    schema-encoding attribute, print the result, and acknowledge it."""
    # Get the message serialization type.
    encoding = message.attributes.get("googclient_schemaencoding")
    # Deserialize the message data accordingly.
    if encoding == "BINARY":
        decoder = BinaryDecoder(io.BytesIO(message.data))
        message_data = DatumReader(avro_schema).read(decoder)
        print(f"Received a binary-encoded message:\n{message_data}")
    elif encoding == "JSON":
        message_data = json.loads(message.data)
        print(f"Received a JSON-encoded message:\n{message_data}")
    else:
        print(f"Received a message with no encoding:\n{message}")
    message.ack()
class Deserializer(object):
    """Schema-bound Avro deserializer that works on Python 2 and 3."""

    def __init__(self, schema_str):
        # avro renamed schema.parse -> schema.Parse for its Python 3 release.
        if sys.version_info >= (3,):
            parsed = avro.schema.Parse(schema_str)
        else:
            parsed = avro.schema.parse(schema_str)
        self.reader = DatumReader(parsed)

    def deserialize(self, rec_bytes):
        """Decode one Avro-encoded record from a byte string."""
        decoder = BinaryDecoder(string_io(rec_bytes))
        return self.reader.read(decoder)
class AvroCodec(object):
    """Encode/decode objects against a fixed Avro schema.

    Writing goes through fastavro; reading uses the reference avro
    implementation's DatumReader.
    """

    def __init__(self, schema):
        self._raw_schema = schema
        self._avro_schema = avro.schema.parse(json.dumps(schema))
        self._reader = DatumReader(self._avro_schema)

    def dump(self, obj, fp):
        """
        Serializes obj as an avro-format byte stream to the provided fp
        file-like object stream.
        """
        # Validate with the reference implementation before writing.
        if not validate(obj, self._raw_schema):
            raise AvroTypeException(self._avro_schema, obj)
        fastavro_write_data(fp, obj, self._raw_schema)

    def dumps(self, obj):
        """
        Serializes obj to an avro-format byte array and returns it.
        """
        buffer = BytesIO()
        try:
            self.dump(obj, buffer)
            return buffer.getvalue()
        finally:
            buffer.close()

    def load(self, fp):
        """
        Deserializes the byte stream contents of the given file-like object
        into an object and returns it.
        """
        return self._reader.read(BinaryDecoder(fp))

    def loads(self, data):
        """
        Deserializes the given byte array into an object and returns it.
        """
        stream = BytesIO(data)
        try:
            return self.load(stream)
        finally:
            stream.close()
def __init__(self, schema_str):
    """Parse the schema string and build a reusable DatumReader."""
    parsed_schema = avro.schema.parse(schema_str)
    self.reader = DatumReader(parsed_schema)
def fromKey(self, key, avroType):
    """Decode a base64-encoded Avro key into a datum of *avroType*.

    :param key: base64 string holding the Avro-encoded key bytes
    :param avroType: wrapper exposing the Avro schema via .schema
    :return: the deserialized datum
    """
    # Fix: renamed the local from `bytes` to avoid shadowing the builtin.
    buf = io.BytesIO(base64.b64decode(key))
    reader = DatumReader(avroType.schema)
    return reader.read(BinaryDecoder(buf))
def __init__(self, schema_str):
    """Parse *schema_str* with whichever API this avro version exposes
    (schema.Parse on Python 3, schema.parse on Python 2)."""
    if sys.version_info >= (3,):
        parsed = avro.schema.Parse(schema_str)
    else:
        parsed = avro.schema.parse(schema_str)
    self.reader = DatumReader(parsed)
def __init__(self, schema):
    """Keep both the raw (dict) schema and its parsed Avro form, plus a
    reusable DatumReader."""
    self._raw_schema = schema
    parsed = avro.schema.parse(json.dumps(schema))
    self._avro_schema = parsed
    self._reader = DatumReader(parsed)
# encoding: utf-8 """Python avro official implementation decoding benchmark.""" from io import BytesIO from itertools import repeat from time import time from avro.datafile import DataFileReader, DataFileWriter from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder import sys LOOPS = 1 with open(sys.argv[1]) as reader: datum_reader = DatumReader() file_reader = DataFileReader(reader, datum_reader) SCHEMA = datum_reader.writers_schema BUFS = [] datum_writer = DatumWriter(SCHEMA) for record in file_reader: buf = BytesIO() encoder = BinaryEncoder(buf) datum_writer.write(record, encoder) BUFS.append(buf) datum_reader = DatumReader(SCHEMA) start = time() n = 0 for _ in repeat(None, LOOPS): for buf in BUFS: