def test_kafka_consumer_offsets_search_many_partitions(kafka_consumer, kafka_producer, topic):
    """Check timestamp, beginning and end offset lookups over two partitions.

    One message carrying the same timestamp is produced to partitions 0 and 1
    of ``topic``. The consumer's ``offsets_for_times``, ``beginning_offsets``
    and ``end_offsets`` answers are then compared with the offsets reported
    back by the producer.
    """
    partitions = [TopicPartition(topic, number) for number in (0, 1)]
    stamp_ms = int(time.time() * 1000)
    wait_secs = 10

    # Produce one partition at a time, blocking on each result before the
    # next send.
    delivered = {}
    for number, tp in enumerate(partitions):
        pending = kafka_producer.send(
            topic, partition=number, value=b"XXX", timestamp_ms=stamp_ms)
        delivered[tp] = pending.get(wait_secs)

    by_time = kafka_consumer.offsets_for_times({tp: stamp_ms for tp in partitions})
    assert by_time == {
        tp: OffsetAndTimestamp(record.offset, stamp_ms)
        for tp, record in delivered.items()
    }

    # The assertion below expects the message just sent to sit at the
    # beginning offset of its partition.
    earliest = kafka_consumer.beginning_offsets(partitions)
    assert earliest == {tp: record.offset for tp, record in delivered.items()}

    # The end offset is expected to be one past the message just sent.
    latest = kafka_consumer.end_offsets(partitions)
    assert latest == {tp: record.offset + 1 for tp, record in delivered.items()}
def test_kafka_consumer_offsets_search_many_partitions(self):
    """Check timestamp, beginning and end offset lookups over two partitions.

    One message carrying the same timestamp is produced to partitions 0 and 1
    of ``self.topic``. A consumer created afterwards is then queried with
    ``offsets_for_times``, ``beginning_offsets`` and ``end_offsets`` and its
    answers are compared with the offsets reported back by the producer.
    """
    partitions = [TopicPartition(self.topic, number) for number in (0, 1)]
    producer = self.kafka_producer()
    stamp_ms = int(time.time() * 1000)

    # Produce one partition at a time, blocking on each result before the
    # next send.
    delivered = {}
    for number, tp in enumerate(partitions):
        pending = producer.send(
            self.topic, partition=number, value=b"XXX", timestamp_ms=stamp_ms)
        delivered[tp] = pending.get()

    # The consumer is only created once both messages have been sent.
    consumer = self.kafka_consumer()

    by_time = consumer.offsets_for_times({tp: stamp_ms for tp in partitions})
    self.assertEqual(by_time, {
        tp: OffsetAndTimestamp(record.offset, stamp_ms)
        for tp, record in delivered.items()
    })

    earliest = consumer.beginning_offsets(partitions)
    self.assertEqual(
        earliest, {tp: record.offset for tp, record in delivered.items()})

    # The end offset is expected to be one past the message just sent.
    latest = consumer.end_offsets(partitions)
    self.assertEqual(
        latest, {tp: record.offset + 1 for tp, record in delivered.items()})
def _setup_consumer(self):
    """
    Work out the first and last offset to read for every partition of the topic.

    Results are stored in ``self.start_p_offsets`` and ``self.end_p_offsets``
    as ``{TopicPartition: OffsetAndTimestamp}`` mappings.

    - With neither start nor end params the full range currently held by each
      partition is used.
    - With both set (datetimes only) the offsets are looked up by timestamp.

    Raises:
        ValueError: only one of start/end params is set, or a partition has no
            offset for the start timestamp.
        AssertionError: the topic isn't listed by the client, or the params
            aren't datetimes.
    """
    # <WTF> https://github.com/dpkp/kafka-python/issues/601
    self.available_topics = self.client.topics()
    # </WTF>

    # the listing doubles as a check that the topic exists
    assert self.topic in self.available_topics

    start_unset = self.start_params is None
    end_unset = self.end_params is None
    if start_unset != end_unset:
        raise ValueError("Both start and end params must be set or both must be None")

    if start_unset:
        # No range requested: read through everything in each partition.
        # TODO not checked with multiple partitions since inheriting from foxglove
        # Fixed offsets make repeatability (via a locking file) possible later
        # on and make it easier to terminate the fetch loop.
        partition_ids = self.client.partitions_for_topic(self.topic)
        whole_topic = [
            TopicPartition(topic=self.topic, partition=pid) for pid in partition_ids
        ]
        first_offsets = self.client.beginning_offsets(whole_topic)
        upcoming_offsets = self.client.end_offsets(whole_topic)

        self.start_p_offsets = {
            tp: OffsetAndTimestamp(offset=first, timestamp=None)
            for tp, first in first_offsets.items()
        }
        # One is subtracted from each reported end offset to get the last
        # offset to read.
        self.end_p_offsets = {
            tp: OffsetAndTimestamp(offset=upcoming - 1, timestamp=None)
            for tp, upcoming in upcoming_offsets.items()
        }
        return

    # TODO - this code was inherited from Foxglove and hasn't been checked through
    # Only datetime start/end params are implemented.
    assert isinstance(self.start_params, datetime) and isinstance(self.end_params, datetime)

    # Converted to integer milliseconds for the offsets_for_times lookups.
    start_ms = int(self.start_params.timestamp() * 1000)
    end_ms = int(self.end_params.timestamp() * 1000)

    partition_ids = self.client.partitions_for_topic(self.topic)
    topic_partitions = [
        TopicPartition(topic=self.topic, partition=pid) for pid in partition_ids
    ]

    self.start_p_offsets = self.client.offsets_for_times(
        dict.fromkeys(topic_partitions, start_ms))
    # A timestamp after the last record yields None for that partition.
    if any(details is None for details in self.start_p_offsets.values()):
        raise ValueError("Start date outside of available messages")

    self.end_p_offsets = self.client.offsets_for_times(
        dict.fromkeys(topic_partitions, end_ms))
    # Same out-of-range case for the end: fall back to the last message.
    # (Original author's caveat: not 100% sure this is correct.)
    beyond_last = [
        tp for tp, details in self.end_p_offsets.items() if details is None
    ]
    for tp in beyond_last:
        last_offset = self.client.end_offsets([tp])[tp] - 1
        self.end_p_offsets[tp] = OffsetAndTimestamp(offset=last_offset, timestamp=None)
def test_commit_for_times_atomic(self, mock_kconsumer):
    """With ``atomic=True`` and one partition mapped to ``None``, no commit is made."""
    unresolved = TopicPartition("topic1", 0)
    resolved = TopicPartition("topic2", 0)
    offsets_by_partition = {
        unresolved: None,
        resolved: OffsetAndTimestamp(123, 123),
    }

    consumer_commit_for_times(mock_kconsumer, offsets_by_partition, atomic=True)

    commit_calls = mock_kconsumer.commit.call_count
    assert commit_calls == 0
def get_offsets_by_times(self, timestamps, timeout_ms):
    """Look up offsets for the requested partitions and normalise the result.

    Arguments:
        timestamps: mapping whose keys are the partitions to look up; it is
            passed unchanged to ``self._retrieve_offsets``.
        timeout_ms: passed unchanged to ``self._retrieve_offsets``.

    Returns:
        The mapping returned by ``self._retrieve_offsets``, updated in place:
        every requested partition maps to an ``OffsetAndTimestamp`` built
        from its ``(offset, timestamp)`` pair, or to ``None`` when no entry
        was retrieved for it. Entries for partitions that were not requested
        are left untouched.
    """
    retrieved = self._retrieve_offsets(timestamps, timeout_ms)
    for partition in timestamps:
        if partition in retrieved:
            found_offset, found_timestamp = retrieved[partition]
            retrieved[partition] = OffsetAndTimestamp(found_offset, found_timestamp)
        else:
            retrieved[partition] = None
    return retrieved
def test_commit_for_times(self, mock_kconsumer):
    """Offsets resolved for every partition are committed in a single call.

    Each of the six topic/partition combinations maps to offset 42; the
    consumer is expected to receive exactly one ``commit`` call with the same
    partitions mapped to ``OffsetAndMetadata(42, metadata=None)``.
    """
    timestamp = 123
    every_partition = [
        TopicPartition(name, number)
        for name in ["topic1", "topic2", "topic3"]
        for number in [0, 1]
    ]
    offsets_by_partition = {
        tp: OffsetAndTimestamp(42, timestamp) for tp in every_partition
    }
    wanted_commit = {
        tp: OffsetAndMetadata(42, metadata=None) for tp in every_partition
    }

    consumer_commit_for_times(mock_kconsumer, offsets_by_partition)

    mock_kconsumer.commit.assert_called_once_with(wanted_commit)
def connect(self):
    """Create the Kafka consumer on first use and resolve the offsets to read.

    Does nothing when ``self.client`` is already set. Otherwise the engine
    url is decoded, a ``KafkaConsumer`` is created and, for every partition
    of the topic, the offsets matching ``self.start_params`` and
    ``self.end_params`` are stored in ``self.start_p_offsets`` and
    ``self.end_p_offsets``.

    Raises:
        ValueError: a partition has no offset for the start timestamp.
        AssertionError: the topic isn't listed by the client, or the
            start/end params aren't datetimes.
    """
    # NOTE(review): the whole setup is assumed to be guarded by the
    # "client is None" check - confirm against the original layout.
    if self.client is not None:
        return

    decoded = self._decode_engine_url()
    self.bootstrap_server, self.topic, self.start_params, self.end_params = decoded
    self.client = KafkaConsumer(bootstrap_servers=self.bootstrap_server)

    # <WTF> https://github.com/dpkp/kafka-python/issues/601
    self.available_topics = self.client.topics()
    # </WTF>

    # the listing doubles as a check that the topic exists
    assert self.topic in self.available_topics

    # Only datetime start/end params are implemented.
    assert isinstance(self.start_params, datetime) and isinstance(self.end_params, datetime)

    # Converted to integer milliseconds for the offsets_for_times lookups.
    start_ms = int(self.start_params.timestamp() * 1000)
    end_ms = int(self.end_params.timestamp() * 1000)

    partition_ids = self.client.partitions_for_topic(self.topic)
    topic_partitions = [
        TopicPartition(topic=self.topic, partition=pid) for pid in partition_ids
    ]

    self.start_p_offsets = self.client.offsets_for_times(
        dict.fromkeys(topic_partitions, start_ms))
    # A timestamp after the last record yields None for that partition.
    if any(details is None for details in self.start_p_offsets.values()):
        raise ValueError("Start date outside of available messages")

    self.end_p_offsets = self.client.offsets_for_times(
        dict.fromkeys(topic_partitions, end_ms))
    # Same out-of-range case for the end: fall back to the last message.
    # (Original author's caveat: not 100% sure this is correct.)
    beyond_last = [
        tp for tp, details in self.end_p_offsets.items() if details is None
    ]
    for tp in beyond_last:
        last_offset = self.client.end_offsets([tp])[tp] - 1
        self.end_p_offsets[tp] = OffsetAndTimestamp(offset=last_offset, timestamp=None)
def test_offset_for_times(mocker):
    """``client.offsets_for_times`` uses the timed offset, else the position.

    The consumer double first reports offset 42 for the partition, which must
    be returned as-is. Once its answer for that partition is ``None`` the
    consumer's position (747) is expected instead.
    """
    only_tp = kafka.TopicPartition('ut_topic', 0)
    partitions = [only_tp]
    timed_answer = {tp: OffsetAndTimestamp(42, -1) for tp in partitions}
    current_positions = dict.fromkeys(partitions, 747)

    def position_of(tp):
        return current_positions.get(tp, 0)

    consumer = mocker.Mock()
    # The same dict object is handed out on every call, so mutating it below
    # changes what the double answers.
    consumer.offsets_for_times.return_value = timed_answer
    consumer.position.side_effect = position_of

    def resolve_only_partition():
        resolved = client.offsets_for_times(consumer, partitions, 987654321)
        assert len(resolved) == len(partitions)
        assert all(tp in resolved for tp in partitions)
        return resolved[only_tp]

    # Uses returned offset for time when provided
    assert resolve_only_partition() == 42

    # When offsets_for_times returns None returns position at end
    timed_answer[only_tp] = None
    assert resolve_only_partition() == 747
def _answer_offsets_for_times(timestamps): return { tp: OffsetAndTimestamp(ts - 100, ts) for tp, ts in timestamps.items() }