def test_with_hearder_reader(self):
    """Read avro data that lacks its header block, supplying the header
    through a separate ``header_reader`` stream.

    Note: only when the data stream doesn't have a header do we need a
    header stream to help.
    """
    file_path = os.path.join(AvroReaderTests._samples_dir_root, 'changeFeed.avro')

    # This data stream has the header.
    full_data_stream = BytesIO()
    with open(file_path, 'rb') as reader:
        full_data = reader.read()
        full_data_stream.write(full_data)

    # This initialization helps find the position after the first sync_marker.
    DataFileReader(full_data_stream, DatumReader())
    position_after_sync_marker = full_data_stream.tell()

    # Construct the partial data stream which doesn't have the header.
    partial_data_stream = _HeaderStream()
    with open(file_path, 'rb') as reader:
        reader.seek(position_after_sync_marker)
        partial_data_stream.write(reader.read())

    # A full copy of the file serves as the header source.
    header_stream = _HeaderStream()
    with open(file_path, 'rb') as reader:
        header_data = reader.read()
        header_stream.write(header_data)

    df_reader = DataFileReader(partial_data_stream, DatumReader(), header_reader=header_stream)
    records = list(df_reader)
    self.assertEqual(CHANGE_FEED_RECORD, records[0])
    # BUG FIX: the original used assertIsNot(..., 0), an *identity* check
    # that effectively never fails for a computed integer and so asserted
    # nothing. assertNotEqual verifies the value actually advanced past 0.
    self.assertNotEqual(partial_data_stream.event_position, 0)
def test_change_feed(self):
    """The changeFeed.avro sample decodes to exactly one expected record."""
    sample_path = os.path.join(AvroReaderTests._samples_dir_root, 'changeFeed.avro')
    with open(sample_path, 'rb') as stream:
        with DataFileReader(stream, DatumReader()) as file_reader:
            records = [record for record in file_reader]
    self.assertEqual(1, len(records))
    self.assertEqual(CHANGE_FEED_RECORD, records[0])
def _initialize(self, chunk_cursor=None):
    """Open an avro reader over this chunk's blob and skip events the
    cursor marks as already consumed, so iteration resumes where it left off.
    """
    blob_client = self.client.get_blob_client(self.chunk_path)

    file_offset = chunk_cursor.get("BlockOffset") if chunk_cursor else 0
    event_index = chunk_cursor.get("EventIndex") if chunk_cursor else 0

    # A non-zero offset means the avro data stream starts mid-file and
    # carries no avro header, so a second stream must supply the header.
    if file_offset:
        header_stream = ChangeFeedStreamer(blob_client)
    else:
        header_stream = None

    self._data_stream = ChangeFeedStreamer(blob_client, chunk_file_start=file_offset)
    self.file_reader = DataFileReader(self._data_stream, DatumReader(), header_reader=header_stream)

    # Fast-forward past events already processed according to the cursor.
    for _ in range(event_index):
        next(self.file_reader)
def test_reader(self):
    """Every (schema, datum) sample file round-trips under every codec."""
    items_per_file = 10
    matched = 0
    for sample_index, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            sample_path = os.path.join(
                AvroReaderTests._samples_dir_root,
                'test_' + codec + '_' + str(sample_index) + '.avro')
            with open(sample_path, 'rb') as stream:
                with DataFileReader(stream, DatumReader()) as file_reader:
                    round_trip_data = list(file_reader)
            if round_trip_data == [datum] * items_per_file:
                matched += 1
    # Every codec/schema combination must have round-tripped exactly.
    self.assertEqual(
        matched,
        len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def _initialize(self, chunk_cursor=None):
    """Open an avro reader over this chunk's blob, resuming from the
    position/block_count recorded in the cursor when one is provided.
    """
    blob_client = self.client.get_blob_client(self.chunk_path)

    file_offset = chunk_cursor.get('position') if chunk_cursor else 0
    block_count = chunk_cursor.get('block_count') if chunk_cursor else 0

    # A non-zero offset means the avro data stream starts mid-file and
    # carries no avro header, so a second stream must supply the header.
    header_stream = None
    if file_offset:
        header_stream = ChangeFeedStreamer(blob_client)

    self._data_stream = ChangeFeedStreamer(
        blob_client, chunk_file_start=file_offset, block_count=block_count)
    self.file_reader = DataFileReader(
        self._data_stream, DatumReader(), header_reader=header_stream)

    # DataFileReader consumes the header part during construction, so the
    # stream cursor now sits at the start of the data (first event) — record it.
    self._data_stream.event_position = self._data_stream.tell()