def test__unpack_message_set_compressed_v1(fetcher):
    fetcher.config['check_crcs'] = False
    tp = TopicPartition('foo', 0)
    messages = [
        (0, None, Message(b'a')),
        (1, None, Message(b'b')),
        (2, None, Message(b'c')),
    ]
    message_bytes = []
    for offset, _, m in messages:
        encoded = m.encode()
        message_bytes.append(
            Int64.encode(offset) + Int32.encode(len(encoded)) + encoded)
    compressed_bytes = gzip_encode(b''.join(message_bytes))
    compressed_base_offset = 10
    compressed_msgs = [
        (compressed_base_offset, None,
         Message(compressed_bytes, magic=1, attributes=Message.CODEC_GZIP))]

    records = list(fetcher._unpack_message_set(tp, compressed_msgs))
    assert len(records) == 3
    assert all(map(lambda x: isinstance(x, ConsumerRecord), records))
    assert records[0].value == b'a'
    assert records[1].value == b'b'
    assert records[2].value == b'c'
    assert records[0].offset == 8
    assert records[1].offset == 9
    assert records[2].offset == 10
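# Worked sketch (plain Python, no kafka imports) of the offset arithmetic the
# assertions above rely on: with magic v1, the compressed wrapper message
# carries the absolute offset of the *last* inner message, and the inner
# messages carry offsets relative to the batch start. The helper name is
# illustrative, not part of the library.
def absolute_offsets(wrapper_offset, relative_offsets):
    # base offset of the batch = wrapper offset - last relative offset
    base = wrapper_offset - relative_offsets[-1]
    return [base + rel for rel in relative_offsets]

assert absolute_offsets(10, [0, 1, 2]) == [8, 9, 10]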
def decode(cls, data, bytes_to_read=None):
    """Compressed messages should pass in bytes_to_read (via message size);
    otherwise we decode the size from data as an Int32.
    """
    if isinstance(data, bytes):
        data = io.BytesIO(data)
    if bytes_to_read is None:
        bytes_to_read = Int32.decode(data)

    # if FetchRequest max_bytes is smaller than the available message set
    # the server returns partial data for the final message,
    # so create an internal buffer to avoid over-reading
    raw = io.BytesIO(data.read(bytes_to_read))

    items = []
    while bytes_to_read:
        try:
            offset = Int64.decode(raw)
            msg_bytes = Bytes.decode(raw)
            bytes_to_read -= 8 + 4 + len(msg_bytes)
            items.append(
                (offset, len(msg_bytes), Message.decode(msg_bytes)))
        except ValueError:
            # PartialMessage to signal that max_bytes may be too small
            items.append((None, None, PartialMessage()))
            break
    return items
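# Stand-alone sketch of the wire layout decode() walks, with struct standing
# in for the Int64/Int32/Bytes codecs (an assumption for illustration only):
# a message set is a run of [offset: int64][size: int32][message bytes].
import io
import struct

def frame(offset, payload):
    return struct.pack('>qi', offset, len(payload)) + payload

stream = frame(0, b'msg-a') + frame(1, b'msg-b')
buf = io.BytesIO(stream)
offset, size = struct.unpack('>qi', buf.read(12))
assert (offset, size, buf.read(size)) == (0, 5, b'msg-a')
# A truncated final frame would make the bytes read fail, which is the
# ValueError path that yields the PartialMessage sentinel above.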
def drain_ready(self):
    """Compress batch to be ready for send"""
    memview = self._buffer.getbuffer()
    self._drain_waiter.set_result(None)
    if self._compression_type:
        _, compressor, attrs = self._COMPRESSORS[self._compression_type]
        msg = Message(compressor(memview[4:].tobytes()),
                      attributes=attrs, magic=self._version_id)
        encoded = msg.encode()
        # if the compressed message is longer than the original
        # we should send it as is (not compressed)
        header_size = 16  # 4(all size) + 8(offset) + 4(compressed size)
        if len(encoded) + header_size < len(memview):
            # write compressed message set (with header) to buffer
            # through the memory view (to avoid copying memory)
            memview[:4] = Int32.encode(len(encoded) + 12)
            memview[4:12] = Int64.encode(0)  # offset 0
            memview[12:16] = Int32.encode(len(encoded))
            memview[16:16 + len(encoded)] = encoded
            self._buffer.seek(0)
            return

    # update batch size (first 4 bytes of buffer)
    memview[:4] = Int32.encode(self._buffer.tell() - 4)
    self._buffer.seek(0)
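# Minimal sketch of the 16-byte header drain_ready() patches in, again with
# struct standing in for the Int32/Int64 codecs (illustrative only):
# [total size: int32][offset: int64][compressed message size: int32].
import struct

encoded = b'<compressed wrapper message bytes>'  # hypothetical payload
header = (struct.pack('>i', len(encoded) + 12)   # everything after the size field
          + struct.pack('>q', 0)                 # placeholder offset 0
          + struct.pack('>i', len(encoded)))     # wrapper message size
assert len(header) == 16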
def _build(self):
    if self._closed:
        self._buffer.seek(0)
        return self._buffer

    self._closed = True
    memview = self._buffer.getbuffer()
    if self._compression_type:
        _, compressor, attrs = self._COMPRESSORS[self._compression_type]
        msg = Message(compressor(memview[4:].tobytes()),
                      attributes=attrs, magic=self._magic)
        encoded = msg.encode()
        # if the compressed message is longer than the original
        # we should send it as is (not compressed)
        header_size = 16  # 4(all size) + 8(offset) + 4(compressed size)
        if len(encoded) + header_size < len(memview):
            # write compressed message set (with header) to buffer
            # through the memory view (to avoid copying memory)
            memview[:4] = Int32.encode(len(encoded) + 12)
            memview[4:12] = Int64.encode(0)  # offset 0
            memview[12:16] = Int32.encode(len(encoded))
            memview[16:16 + len(encoded)] = encoded
            memview.release()
            self._buffer.seek(16 + len(encoded))
            self._buffer.truncate()
            self._buffer.seek(0)
            return self._buffer

    # update batch size (first 4 bytes of buffer)
    memview[:4] = Int32.encode(self._buffer.tell() - 4)
    self._buffer.seek(0)
    return self._buffer
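# Sketch of the memoryview trick _build() depends on: patch a BytesIO in
# place via getbuffer(), then release() the view before truncate(), because
# BytesIO refuses to resize while a buffer export is alive. Plain stdlib,
# values are illustrative.
import io

buf = io.BytesIO(b'\x00\x00\x00\x00payload')
view = buf.getbuffer()
view[:4] = (len(buf.getvalue()) - 4).to_bytes(4, 'big')  # patch size prefix
view.release()                                           # allow resizing again
buf.truncate(4 + len(b'payload'))
assert buf.getvalue() == b'\x00\x00\x00\x07payload'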
def append(self, key, value, timestamp_ms):
    """Append message (key and value) to batch

    Returns:
        None if the batch is full, or an asyncio.Future that will be
        resolved when the message is delivered
    """
    if self._is_full(key, value):
        return None

    # `.encode()` is a weak method for some reason, so we need to save
    # a reference before calling it.
    if self._version_id == 0:
        msg_inst = Message(value, key=key, magic=self._version_id)
    else:
        msg_inst = Message(value, key=key, magic=self._version_id,
                           timestamp=timestamp_ms)

    encoded = msg_inst.encode()
    msg = Int64.encode(self._relative_offset) + Int32.encode(len(encoded))
    msg += encoded
    self._buffer.write(msg)

    future = asyncio.Future(loop=self._loop)
    self._msg_futures.append(future)
    self._relative_offset += 1
    return future
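# Hedged sketch of the future-per-message pattern append() sets up: each
# appended message gets its own Future, and the producer resolves them once
# the broker acknowledges the batch. The demo() coroutine is hypothetical.
import asyncio

async def demo():
    loop = asyncio.get_running_loop()
    futures = [loop.create_future() for _ in range(3)]
    # ...batch handed to the sender task; on the broker ack it would do:
    for fut in futures:
        fut.set_result('delivered')
    assert all(f.result() == 'delivered' for f in futures)

asyncio.run(demo())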
def test_decode_fetch_response_partial():
    encoded = b''.join([
        Int32.encode(1),                # Num Topics (Array)
        String('utf-8').encode('foobar'),
        Int32.encode(2),                # Num Partitions (Array)
        Int32.encode(0),                # Partition id
        Int16.encode(0),                # Error Code
        Int64.encode(1234),             # Highwater offset
        Int32.encode(52),               # MessageSet size
        Int64.encode(0),                # Msg Offset
        Int32.encode(18),               # Msg Size
        struct.pack('>i', 1474775406),  # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k1',                          # Key
        struct.pack('>i', 2),           # Length of value
        b'v1',                          # Value

        Int64.encode(1),                # Msg Offset
        struct.pack('>i', 24),          # Msg Size (larger than remaining MsgSet size)
        struct.pack('>i', -16383415),   # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k2',                          # Key
        struct.pack('>i', 8),           # Length of value
        b'ar',                          # Value (truncated)

        Int32.encode(1),
        Int16.encode(0),
        Int64.encode(2345),
        Int32.encode(52),               # MessageSet size
        Int64.encode(0),                # Msg Offset
        Int32.encode(18),               # Msg Size
        struct.pack('>i', 1474775406),  # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k1',                          # Key
        struct.pack('>i', 2),           # Length of value
        b'v1',                          # Value

        Int64.encode(1),                # Msg Offset
        struct.pack('>i', 24),          # Msg Size (larger than remaining MsgSet size)
        struct.pack('>i', -16383415),   # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k2',                          # Key
        struct.pack('>i', 8),           # Length of value
        b'ar',                          # Value (truncated)
    ])

    resp = FetchResponse[0].decode(io.BytesIO(encoded))
    assert len(resp.topics) == 1
    topic, partitions = resp.topics[0]
    assert topic == 'foobar'
    assert len(partitions) == 2

    m1 = MessageSet.decode(
        partitions[0][3], bytes_to_read=len(partitions[0][3]))
    assert len(m1) == 2
    assert m1[1] == (None, None, PartialMessage())
def test_decode_fetch_response_partial():
    encoded = b''.join([
        Int32.encode(1),                # Num Topics (Array)
        String('utf-8').encode('foobar'),
        Int32.encode(2),                # Num Partitions (Array)
        Int32.encode(0),                # Partition id
        Int16.encode(0),                # Error Code
        Int64.encode(1234),             # Highwater offset
        Int32.encode(52),               # MessageSet size
        Int64.encode(0),                # Msg Offset
        Int32.encode(18),               # Msg Size
        struct.pack('>i', 1474775406),  # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k1',                          # Key
        struct.pack('>i', 2),           # Length of value
        b'v1',                          # Value

        Int64.encode(1),                # Msg Offset
        struct.pack('>i', 24),          # Msg Size (larger than remaining MsgSet size)
        struct.pack('>i', -16383415),   # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k2',                          # Key
        struct.pack('>i', 8),           # Length of value
        b'ar',                          # Value (truncated)

        Int32.encode(1),
        Int16.encode(0),
        Int64.encode(2345),
        Int32.encode(52),               # MessageSet size
        Int64.encode(0),                # Msg Offset
        Int32.encode(18),               # Msg Size
        struct.pack('>i', 1474775406),  # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k1',                          # Key
        struct.pack('>i', 2),           # Length of value
        b'v1',                          # Value

        Int64.encode(1),                # Msg Offset
        struct.pack('>i', 24),          # Msg Size (larger than remaining MsgSet size)
        struct.pack('>i', -16383415),   # CRC
        struct.pack('>bb', 0, 0),       # Magic, flags
        struct.pack('>i', 2),           # Length of key
        b'k2',                          # Key
        struct.pack('>i', 8),           # Length of value
        b'ar',                          # Value (truncated)
    ])

    resp = FetchResponse[0].decode(io.BytesIO(encoded))
    assert len(resp.topics) == 1
    topic, partitions = resp.topics[0]
    assert topic == 'foobar'
    assert len(partitions) == 2

    m1 = partitions[0][3]
    assert len(m1) == 2
    assert m1[1] == (None, None, PartialMessage())
def encode(cls, items, prepend_size=True):
    # RecordAccumulator encodes messagesets internally
    if isinstance(items, (io.BytesIO, KafkaBytes)):
        size = Int32.decode(items)
        if prepend_size:
            # rewind and return all the bytes
            items.seek(items.tell() - 4)
            size += 4
        return items.read(size)

    encoded_values = []
    for (offset, message) in items:
        encoded_values.append(Int64.encode(offset))
        encoded_values.append(Bytes.encode(message))
    encoded = b''.join(encoded_values)
    if prepend_size:
        return Bytes.encode(encoded)
    else:
        return encoded
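# Sketch of what prepend_size toggles, with struct standing in for the
# Int32/Bytes codecs (illustrative only): a 4-byte big-endian length prefix
# in front of the encoded message set.
import struct

payload = b'\x07' * 10
with_prefix = struct.pack('>i', len(payload)) + payload  # prepend_size=True
without_prefix = payload                                 # prepend_size=False
assert with_prefix[4:] == without_prefix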
def append(self, key, value):
    """Append message (key and value) to batch

    Returns:
        None if the batch is full, or an asyncio.Future that will be
        resolved when the message is delivered
    """
    if self._is_full(key, value):
        return None

    encoded = Message(value, key=key).encode()
    msg = Int64.encode(self._relative_offset) + Int32.encode(len(encoded))
    msg += encoded
    self._buffer.write(msg)

    future = asyncio.Future(loop=self._loop)
    self._msg_futures.append(future)
    self._relative_offset += 1
    return future
def append(self, *, timestamp, key, value):
    if not self._has_room_for(key, value):
        return 0

    # `.encode()` is a weak method for some reason, so we need to save
    # a reference before calling it.
    if self._magic == 0:
        msg_inst = Message(value, key=key, magic=self._magic)
    else:
        msg_inst = Message(value, key=key, magic=self._magic,
                           timestamp=timestamp)

    encoded = msg_inst.encode()
    msg = Int64.encode(self._relative_offset) + Int32.encode(len(encoded))
    msg += encoded
    actual_size = self._buffer.write(msg)
    self._relative_offset += 1
    return actual_size
def drain_ready(self):
    """Compress batch to be ready for send"""
    memview = self._buffer.getbuffer()
    self._drain_waiter.set_result(None)
    if self._compression_type:
        _, compressor, attrs = self._COMPRESSORS[self._compression_type]
        msg = Message(compressor(memview[4:].tobytes()), attributes=attrs)
        encoded = msg.encode()
        # if the compressed message is longer than the original
        # we should send it as is (not compressed)
        header_size = 16  # 4(all size) + 8(offset) + 4(compressed size)
        if len(encoded) + header_size < len(memview):
            # write compressed message set (with header) to buffer
            # through the memory view (to avoid copying memory)
            memview[:4] = Int32.encode(len(encoded) + 12)
            memview[4:12] = Int64.encode(0)  # offset 0
            memview[12:16] = Int32.encode(len(encoded))
            memview[16:16 + len(encoded)] = encoded
            self._buffer.seek(0)
            return

    # update batch size (first 4 bytes of buffer)
    memview[:4] = Int32.encode(self._buffer.tell() - 4)
    self._buffer.seek(0)