def __next__(self):
    if self._next_record_index >= self._num_records:
        if self._pos != len(self._buffer):
            raise CorruptRecordException(
                "{} unconsumed bytes after all records consumed".format(
                    len(self._buffer) - self._pos))
        raise StopIteration
    try:
        msg = self._read_msg()
    except (ValueError, IndexError) as err:
        raise CorruptRecordException(
            "Found invalid record structure: {!r}".format(err))
    else:
        self._next_record_index += 1
    return msg
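
# Hedged usage sketch (not part of the class): __next__ makes a batch
# consumable via the ordinary iterator protocol, so callers loop instead of
# indexing records. `batch` is assumed to be any such iterable record batch;
# the attribute names mirror the DefaultRecord fields built by _read_msg.
def dump_batch(batch):
    for record in batch:  # each step calls __next__ above
        print(record.offset, record.timestamp, record.key, record.value)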
def _decompress(self, key_offset):
    # Copy of `_read_key_value`, but uses memoryview
    pos = key_offset
    key_size = struct.unpack_from(">i", self._buffer, pos)[0]
    pos += self.KEY_LENGTH
    if key_size != -1:
        pos += key_size
    value_size = struct.unpack_from(">i", self._buffer, pos)[0]
    pos += self.VALUE_LENGTH
    if value_size == -1:
        raise CorruptRecordException("Value of compressed message is None")
    else:
        data = self._buffer[pos:pos + value_size]

    compression_type = self.compression_type
    self._assert_has_codec(compression_type)
    if compression_type == self.CODEC_GZIP:
        uncompressed = gzip_decode(data)
    elif compression_type == self.CODEC_SNAPPY:
        uncompressed = snappy_decode(data.tobytes())
    elif compression_type == self.CODEC_LZ4:
        if self._magic == 0:
            uncompressed = lz4_decode_old_kafka(data.tobytes())
        else:
            uncompressed = lz4_decode(data.tobytes())
    return uncompressed
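
# Minimal standalone sketch of the size-prefixed layout _decompress walks:
# a 4-byte big-endian key size (-1 means null key), the key bytes, then a
# 4-byte value size and the compressed value bytes. The buffer below is
# hand-built for illustration only; 4 stands in for KEY_LENGTH/VALUE_LENGTH.
import struct

buf = memoryview(struct.pack(">i", -1) + struct.pack(">i", 5) + b"hello")
pos = 0
key_size = struct.unpack_from(">i", buf, pos)[0]   # -1 -> null key
pos += 4
if key_size != -1:
    pos += key_size                                # skip over the key bytes
value_size = struct.unpack_from(">i", buf, pos)[0]
pos += 4
data = buf[pos:pos + value_size]                   # still a memoryview, no copy
assert data.tobytes() == b"hello"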
def next_batch(self, _min_slice=MIN_SLICE,
               _magic_offset=MAGIC_OFFSET):
    next_slice = self._next_slice
    if next_slice is None:
        return None
    if len(next_slice) < _min_slice:
        raise CorruptRecordException(
            "Record size is less than the minimum record overhead "
            "({})".format(_min_slice - self.LOG_OVERHEAD))
    self._cache_next()
    magic, = struct.unpack_from(">b", next_slice, _magic_offset)
    if magic <= 1:
        return LegacyRecordBatch(next_slice, magic)
    else:
        return DefaultRecordBatch(next_slice)
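
# Sketch of the magic-byte dispatch above, under the assumption that
# MAGIC_OFFSET is 16: in both wire formats the magic byte follows the
# 8-byte base offset, 4-byte length, and 4 more header bytes (CRC for
# legacy, partition leader epoch for v2). The header below is fabricated
# purely to show where the byte sits.
import struct

ASSUMED_MAGIC_OFFSET = 16
fake_batch = struct.pack(">qii", 0, 0, 0) + struct.pack(">b", 2)
magic, = struct.unpack_from(">b", fake_batch, ASSUMED_MAGIC_OFFSET)
assert magic == 2   # 0 or 1 -> LegacyRecordBatch; 2 -> DefaultRecordBatch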
def _read_msg(
        self,
        decode_varint=decode_varint):
    # Record =>
    #   Length => Varint
    #   Attributes => Int8
    #   TimestampDelta => Varlong
    #   OffsetDelta => Varint
    #   Key => Bytes
    #   Value => Bytes
    #   Headers => [HeaderKey HeaderValue]
    #     HeaderKey => String
    #     HeaderValue => Bytes

    buffer = self._buffer
    pos = self._pos
    length, pos = decode_varint(buffer, pos)
    start_pos = pos
    _, pos = decode_varint(buffer, pos)  # attrs can be skipped for now

    ts_delta, pos = decode_varint(buffer, pos)
    if self.timestamp_type == self.LOG_APPEND_TIME:
        timestamp = self.max_timestamp
    else:
        timestamp = self.first_timestamp + ts_delta

    offset_delta, pos = decode_varint(buffer, pos)
    offset = self.base_offset + offset_delta

    key_len, pos = decode_varint(buffer, pos)
    if key_len >= 0:
        key = bytes(buffer[pos: pos + key_len])
        pos += key_len
    else:
        key = None

    value_len, pos = decode_varint(buffer, pos)
    if value_len >= 0:
        value = bytes(buffer[pos: pos + value_len])
        pos += value_len
    else:
        value = None

    header_count, pos = decode_varint(buffer, pos)
    if header_count < 0:
        raise CorruptRecordException("Found invalid number of record "
                                     "headers {}".format(header_count))
    headers = []
    while header_count:
        # Header key is of type String, that can't be None
        h_key_len, pos = decode_varint(buffer, pos)
        if h_key_len < 0:
            raise CorruptRecordException(
                "Invalid negative header key size {}".format(h_key_len))
        h_key = buffer[pos: pos + h_key_len].decode("utf-8")
        pos += h_key_len

        # Value is of type NULLABLE_BYTES, so it can be None
        h_value_len, pos = decode_varint(buffer, pos)
        if h_value_len >= 0:
            h_value = bytes(buffer[pos: pos + h_value_len])
            pos += h_value_len
        else:
            h_value = None

        headers.append((h_key, h_value))
        header_count -= 1

    # validate whether we have read all header bytes in the current record
    if pos - start_pos != length:
        raise CorruptRecordException(
            "Invalid record size: expected to read {} bytes in record "
            "payload, but instead read {}".format(length, pos - start_pos))
    self._pos = pos

    return DefaultRecord(
        offset, timestamp, self.timestamp_type, key, value, headers)
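
# Hedged sketch of the varint decoding _read_msg leans on, assuming
# decode_varint follows the Kafka/protobuf convention: 7 payload bits per
# byte with the MSB as a continuation flag, then a zig-zag mapping to
# recover the sign. The real decode_varint may differ in details; this is
# illustration only.
def decode_varint_sketch(buffer, pos):
    result = 0
    shift = 0
    while True:
        b = buffer[pos]
        pos += 1
        result |= (b & 0x7F) << shift
        if not (b & 0x80):      # MSB clear -> last byte of the varint
            break
        shift += 7
    # Undo zig-zag encoding: 0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, ...
    return (result >> 1) ^ -(result & 1), pos

assert decode_varint_sketch(b"\x01", 0) == (-1, 1)     # -1 marks a null key/value
assert decode_varint_sketch(b"\x96\x01", 0) == (75, 2)  # multi-byte positive value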