def offsets_for_time(self, partitions_time: list, timestamp: int = -1):
    """
    Look up, for each partition, the offset the consumer reports for a timestamp.

    :param partitions_time: list of ``(topic, partition)`` tuples when
        ``timestamp > 0``; otherwise list of ``(topic, partition, timestamp)``
        tuples, each carrying its own timestamp.
    :param timestamp: query timestamp applied to every partition when it is
        positive. Any other value (default ``-1``) means every tuple supplies
        its own timestamp.
    :return: the result of ``self.consumer.offsets_for_times``.
    :raises ActionError: ``KafkaErr.NotSupport`` when the broker API version
        does not support the lookup; ``KafkaErr.GetOffsetFailed`` on a
        ``ValueError`` or a Kafka timeout.
    """
    if timestamp > 0:
        _partitions = {
            TopicPartition(_tuple[0], _tuple[1]): timestamp
            for _tuple in partitions_time
        }
    else:
        _partitions = {
            TopicPartition(_tuple[0], _tuple[1]): _tuple[2]
            for _tuple in partitions_time
        }

    # The original wrote ``except A or B or C``, which evaluates to ``A`` only,
    # so ValueError / KafkaTimeoutError escaped unhandled. Separate clauses
    # catch each one and keep the per-error log messages.
    try:
        return self.consumer.offsets_for_times(_partitions)
    except UnsupportedVersionError as e:
        log.tag_error(KafkaInfo.KafkaConsumer,
                      "API VERSION ERROR, DO NOT SUPPORT")
        raise ActionError(KafkaErr.NotSupport) from e
    except ValueError as e:
        log.tag_error(KafkaInfo.KafkaConsumer,
                      "Value Error: Target Timestamp is negative")
        raise ActionError(KafkaErr.GetOffsetFailed) from e
    except KafkaTimeoutError as e:
        log.tag_error(KafkaInfo.KafkaConsumer,
                      "Get offset by timestamp failed, Time out")
        raise ActionError(KafkaErr.GetOffsetFailed) from e
def __init__(self):
    """Connect to Kafka, rewind each topic by up to 100 messages, start the reader thread.

    For every entry in ``topics`` a bounded deque (100 items) is created in
    ``self.dumps`` and partition 0 is positioned 100 offsets before its end
    (or at offset 0 when the end offset is 100 or less).
    """
    logging.info(
        'Going to initialize KafkaHandler for kafka at endpont %s',
        kafka_endpoint)
    self.consumer = KafkaConsumer(bootstrap_servers=kafka_endpoint)
    self.dumps = {}

    # First pass: discover the end offset of partition 0 of each topic.
    end_offset = {}
    for topic in topics:
        self.dumps[topic] = collections.deque(maxlen=100)
        tp = TopicPartition(topic, 0)
        self.consumer.assign([tp])
        self.consumer.seek_to_end()
        # Clamp to at least 100 so the subtraction below never goes negative.
        end_offset[topic] = max(self.consumer.position(tp), 100)

    # Second pass: assign everything at once and rewind each partition.
    self.consumer.assign([TopicPartition(topic, 0) for topic in topics])
    for topic in topics:
        rewind_to = end_offset[topic] - 100
        self.consumer.seek(TopicPartition(topic, 0), rewind_to)

    worker = threading.Thread(target=self.run, args=())
    worker.daemon = True  # daemon thread: does not block interpreter exit
    self.thread = worker
    worker.start()
def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id):
    """Create the Kafka consumer and bind it to one partition or to the whole topic.

    :param location: value passed as ``bootstrap_servers``.
    :param enable_ssl: when truthy, SSL keyword arguments are built from
        ``cert_path`` via ``_prepare_kafka_ssl_kwargs``.
    :param cert_path: argument for ``_prepare_kafka_ssl_kwargs``.
    :param topic: topic to read.
    :param group: consumer group id.
    :param partition_id: partition to assign explicitly, or ``None`` to
        subscribe to the topic.
    """
    self._location = location
    self._group = group
    self._topic = topic

    ssl_kwargs = {}
    if enable_ssl:
        ssl_kwargs = _prepare_kafka_ssl_kwargs(cert_path)

    client_suffix = "all" if partition_id is None else str(partition_id)
    self._consumer = KafkaConsumer(
        bootstrap_servers=self._location,
        group_id=self._group,
        max_partition_fetch_bytes=10485760,
        consumer_timeout_ms=100,
        client_id="%s-%s" % (self._topic, client_suffix),
        request_timeout_ms=120 * 1000,
        heartbeat_interval_ms=10000,
        **ssl_kwargs)

    # Explicitly make the consumer bootstrap the cluster metadata.
    self._consumer.topics()

    if partition_id is None:
        known_ids = self._consumer.partitions_for_topic(self._topic)
        self._partitions = [
            TopicPartition(self._topic, pid) for pid in known_ids
        ]
        self._consumer.subscribe(topics=[self._topic])
        return

    self._partitions = [TopicPartition(self._topic, partition_id)]
    self._consumer.assign(self._partitions)
def and_we_read_from_initial_offset(step):
    """Read records from the scratch topic starting at the scenario's stored offset.

    The received-record count is stored in ``test_context.num_received_records``
    and every record collected by the relay is appended to
    ``test_context.consumed_raw_record_list``.

    :param step: step object supplied by the test framework (unused here).
    """
    test_context = world.test_environment.load_context(SEND_RECEIVE_SCENARIO)
    knodes = world.pipeline_config.cluster.node_array
    topic = world.pipeline_config.get_user_topic('scratch_topic')
    consumer_group = world.pipeline_config.get_user_defined_consumer_group(
        'scratch_group_1')
    kreader = telegraf.KafkaIngestRecordReader(topic, knodes, consumer_group)

    # Partition ids this topic spans.
    metadata = kreader.consumer.partitions_for_topic(topic)

    # Explicit offsets require a manual assignment, and seek() only accepts a
    # partition that is currently assigned. The original assigned partition 0
    # but sought list(metadata)[0]; use one TopicPartition for both so the two
    # can never disagree.
    topic_partition = TopicPartition(topic, list(metadata)[0])
    kreader.consumer.assign([topic_partition])
    kreader.consumer.seek(topic_partition, test_context.offset)

    world_relay = WorldRelay(record_type='direct_sales_record',
                             stream_id='test_stream_id',
                             asset_id='test_asset_id')
    test_context.num_received_records = kreader.read(world_relay, world.log)
    for rec in world_relay.read_list:
        test_context.consumed_raw_record_list.append(rec)
def store_offset_records(self):
    """Record the committed and end offset of every partition of ``self.topic``.

    Partition metadata is requested up to five times, sleeping 10 seconds
    after each empty answer. Results are written to ``self.offset_records``
    keyed by partition id as ``{'curr_offset': ..., 'end_offset': ...}``.
    """
    consumer = self._getconsumer()
    partition_set = consumer.partitions_for_topic(self.topic)
    for _attempt in range(5):
        partition_set = consumer.partitions_for_topic(self.topic)
        if partition_set:
            break
        time.sleep(10)

    partitions = [TopicPartition(self.topic, pid) for pid in partition_set]

    # Committed offsets are fetched one partition at a time, end offsets in bulk.
    curr_offsets = {tp.partition: consumer.committed(tp) for tp in partitions}
    end_offsets = consumer.end_offsets(partitions)

    for pid, committed in curr_offsets.items():
        key = TopicPartition(topic=self.topic, partition=pid)
        self.offset_records[pid] = {
            'curr_offset': committed,
            'end_offset': end_offsets[key],
        }
def when_we_read_from_initial_offset(step, checkpoint_freq):
    """Read from the scratch topic with checkpointing, starting at the stored offset.

    Stores on the scenario context: the checkpoint frequency, the number of
    received records, and the relay's checkpoint success / error counts.

    :param step: step object supplied by the test framework (unused here).
    :param checkpoint_freq: checkpoint frequency; converted with ``int()``.
    """
    test_context = world.test_environment.load_context(DATA_COMMIT_SCENARIO)
    test_context.checkpoint_frequency = int(checkpoint_freq)
    knodes = world.pipeline_config.cluster.node_array
    topic = world.pipeline_config.get_user_topic('scratch_topic')
    consumer_group = world.pipeline_config.get_user_defined_consumer_group(
        'scratch_group_1')
    kreader = telegraf.KafkaIngestRecordReader(topic, knodes, consumer_group)

    # Partition ids this topic spans.
    metadata = kreader.consumer.partitions_for_topic(topic)

    # Explicit offsets require a manual assignment, and seek() only accepts a
    # partition that is currently assigned. The original assigned partition 0
    # but sought list(metadata)[0]; use one TopicPartition for both so the two
    # can never disagree.
    topic_partition = TopicPartition(topic, list(metadata)[0])
    kreader.consumer.assign([topic_partition])
    kreader.consumer.seek(topic_partition, test_context.offset)

    world_relay = WorldRelay(record_type='direct_sales_record',
                             stream_id='test_stream_id',
                             asset_id='test_asset_id')
    test_context.num_received_records = kreader.read(
        world_relay,
        world.log,
        checkpoint_frequency=test_context.checkpoint_frequency,
        checkpoint_interval=10)
    test_context.num_successful_checkpoints = world_relay.checkpoint_successes
    test_context.num_checkpoint_errors = len(world_relay.checkpoint_errors)
def consume_data(self, offset=None):
    """
    Yield messages from every partition of ``self.topic``, starting at one offset.

    :param offset: offset every partition is sought to before consuming.
        When ``None``, the value returned by ``self.get_last_position()``
        is used instead.
    :return: generator yielding each message produced by ``self.consumer``.
    """
    # Take all partitions of the topic for this consumer. To use assign(),
    # the topic must not have been given when the KafkaConsumer was built.
    topic_partitions = [
        TopicPartition(self.topic, pid)
        for pid in self.consumer.partitions_for_topic(self.topic)
    ]
    start_offset = self.get_last_position() if offset is None else offset
    self.consumer.assign(topic_partitions)

    # The same offset is applied to every partition (a single partition
    # could be sought instead).
    for pid in self.consumer.partitions_for_topic(self.topic):
        self.consumer.seek(TopicPartition(self.topic, pid), start_offset)

    for message in self.consumer:
        yield message
def start(*args, **kwargs):
    """Show the end offset of the selected topic partition in the text widget.

    Topic, server host, partition and group are read from the module-level
    input objects; ``:9092`` is appended to the host. The dict returned by
    ``end_offsets`` is inserted into ``t`` at position ``1.0``.
    """
    topic_name = topic.get()
    server_host = server.get() + ':9092'
    print(server_host)
    t_partition = partition.get() or 0  # default to partition 0 when empty
    group = group_id.get()

    consumerx = KafkaConsumer(
        topic_name,
        bootstrap_servers=[server_host],
        auto_offset_reset='earliest',
        group_id=group,
    )
    # Replace the topic subscription with a manual assignment of partition 0.
    consumerx.unsubscribe()
    consumerx.assign([TopicPartition(topic=topic_name, partition=0)])

    queried = TopicPartition(topic=topic_name, partition=t_partition)
    record_num = consumerx.end_offsets([queried])
    t.insert(1.0, record_num)
def start_consumer():
    """Consume fund records from ``my_topic`` partition 0, tag them, and republish.

    Reading starts at offset 345. Each message value is decoded as JSON, its
    ``description`` is run through the keyword service, and the texts labelled
    ``indu`` / ``tech`` are stored under ``industries`` / ``technology``
    before the record is sent back with key ``import_raw``.

    NOTE(review): the enriched record is produced to ``my_topic`` partition 0,
    the same partition this loop consumes, so it will be read again by this
    loop. Confirm that this is intended.
    """
    consumer = KafkaConsumer(bootstrap_servers=brokers)
    producer = KafkaProducer(bootstrap_servers=brokers,
                             key_serializer=str.encode,
                             value_serializer=str.encode)
    source = TopicPartition(topic=my_topic, partition=0)
    consumer.assign([source])
    consumer.seek(partition=source, offset=345)

    for msg in consumer:
        payload = msg.value.decode()  # bytes to string, decoded once
        print(msg)
        print("topic = %s" % msg.topic)  # topic default is string
        # The original printed msg.offset under the "partition" label;
        # report each field under its own name.
        print("partition = %d" % msg.partition)
        print("offset = %d" % msg.offset)
        print("value = %s" % payload)
        print("timestamp = %d" % msg.timestamp)
        print("time = ",
              time.strftime("%Y-%m-%d %H:%M:%S",
                            time.localtime(msg.timestamp / 1000)))

        fund_content = json.loads(payload)
        keywords = service.GetKeywords()
        results = keywords.operate(fund_content['description'])
        fund_content['industries'] = [
            r['text'] for r in results if r['label'] == 'indu'
        ]
        fund_content['technology'] = [
            r['text'] for r in results if r['label'] == 'tech'
        ]

        future = producer.send(my_topic,
                               key='import_raw',
                               value=json.dumps(fund_content),
                               partition=0)
        # Block until the broker acknowledges the record (or 10 s elapse).
        future.get(timeout=10)
def __init__(self, location, topic, group, partition_id):
    """Create the Kafka consumer, bind it to the topic, and start the looping call.

    :param location: value passed as ``bootstrap_servers``.
    :param topic: topic to read.
    :param group: consumer group id.
    :param partition_id: partition to assign explicitly, or ``None`` to
        subscribe to the whole topic.
    """
    self._location = location
    self._group = group
    self._topic = topic

    client_suffix = "all" if partition_id is None else str(partition_id)
    self._consumer = KafkaConsumer(
        bootstrap_servers=self._location,
        group_id=self._group,
        max_partition_fetch_bytes=10485760,
        consumer_timeout_ms=100,
        client_id="%s-%s" % (self._topic, client_suffix),
        request_timeout_ms=120 * 1000,
    )

    if partition_id is None:
        known_ids = self._consumer.partitions_for_topic(self._topic)
        self._partition_ids = [
            TopicPartition(self._topic, pid) for pid in known_ids
        ]
        self._consumer.subscribe(topics=[self._topic])
    else:
        self._partition_ids = [TopicPartition(self._topic, partition_id)]
        self._consumer.assign(self._partition_ids)

    # Uses private consumer/coordinator methods to join the group and set
    # fetch positions before the looping call starts.
    coordinator_needed = self._consumer._use_consumer_group()
    if coordinator_needed:
        self._consumer._coordinator.ensure_coordinator_known()
        self._consumer._coordinator.ensure_active_group()

    self._consumer._update_fetch_positions(self._partition_ids)
    self._start_looping_call()
def next_requests(self):
    """Assign all configured partitions, seek each to its stored offset, start consuming.

    :return: whatever ``self.start_consumer()`` returns.
    """
    assignment = [
        TopicPartition(topic=self.topic, partition=pid)
        for pid in self.topic_partitions
    ]
    self.consumer.assign(assignment)

    for pid in self.topic_partitions:
        resume_at = self.get_partition_offset(pid)
        target = TopicPartition(topic=self.topic, partition=pid)
        self.consumer.seek(target, resume_at)

    return self.start_consumer()
def test_producing_with_batched_records(self):
    """
    Same scenario as the unbatched producing test, but with producer batching
    (a high 'linger.ms' value), so a single request reaching a Kafka broker
    may carry more than one record, for different partitions.
    """
    messages_to_send = 100
    partition1 = TopicPartition('apricots', 0)
    partition2 = TopicPartition('berries', 0)

    # High linger_ms lets records for both partitions share a request.
    producer = KafkaProducer(
        bootstrap_servers=IntegrationTest.kafka_envoy_address(),
        api_version=(1, 0, 0),
        linger_ms=1000,
        batch_size=100)

    # Fire all sends first; futures are resolved afterwards so batching can occur.
    future_to_message1 = {}
    future_to_message2 = {}
    targets = ((partition1, future_to_message1),
               (partition2, future_to_message2))
    for _ in range(messages_to_send):
        for partition, pending in targets:
            message = Message()
            future = producer.send(key=message.key,
                                   value=message.value,
                                   headers=message.headers,
                                   topic=partition.topic,
                                   partition=partition.partition)
            pending[future] = message

    offset_to_message1 = {}
    offset_to_message2 = {}
    resolutions = ((future_to_message1, offset_to_message1),
                   (future_to_message2, offset_to_message2))
    for pending, by_offset in resolutions:
        for future, message in pending.items():
            offset = future.get().offset
            by_offset[offset] = message
            self.assertTrue(offset >= 0)

    self.assertTrue(len(offset_to_message1) == messages_to_send)
    self.assertTrue(len(offset_to_message2) == messages_to_send)
    producer.close()

    # Check the target clusters.
    self.__verify_target_kafka_cluster(
        IntegrationTest.kafka_cluster1_address(), partition1,
        offset_to_message1, partition2)
    self.__verify_target_kafka_cluster(
        IntegrationTest.kafka_cluster2_address(), partition2,
        offset_to_message2, partition1)

    # Check if requests have been received.
    self.metrics.collect_final_metrics()
    self.metrics.assert_metric_increase('produce', 1)
def _setup_consumer(self):
    """Compute the per-partition start and end offsets for reading ``self.topic``.

    With no start/end params, the range is the partition's beginning offset
    to its end offset minus one. With datetime params, offsets come from
    ``offsets_for_times``; a start with no matching offset raises, an end
    with no matching offset falls back to the end offset minus one.

    :raises ValueError: when only one of start/end is set, or when the start
        date has no offset in some partition.
    """
    # Fetching the topic list first works around
    # https://github.com/dpkp/kafka-python/issues/601; the result also
    # serves as an existence check.
    self.available_topics = self.client.topics()
    assert self.topic in self.available_topics

    has_start = self.start_params is not None
    has_end = self.end_params is not None
    if has_start != has_end:
        raise ValueError("Both start and end params must be set or both must be None")

    if not has_start:
        # Whole-topic read. Fixed offsets make repeatable reads possible later
        # and give the fetch loop a clear end.
        # TODO not checked with multiple partitions since inheriting from foxglove
        partition_ids = self.client.partitions_for_topic(self.topic)
        topic_partitions = [
            TopicPartition(topic=self.topic, partition=pid)
            for pid in list(partition_ids)
        ]
        first_offsets = self.client.beginning_offsets(topic_partitions)
        next_offsets = self.client.end_offsets(topic_partitions)
        self.start_p_offsets = {
            tp: OffsetAndTimestamp(offset=first, timestamp=None)
            for tp, first in first_offsets.items()
        }
        self.end_p_offsets = {
            tp: OffsetAndTimestamp(offset=nxt - 1, timestamp=None)
            for tp, nxt in next_offsets.items()
        }
        return

    # TODO - this code was inherited from Foxglove and hasn't been checked through
    # Only datetime start/end values are implemented.
    assert isinstance(self.start_params, datetime) and isinstance(self.end_params, datetime)
    start_ms = int(self.start_params.timestamp() * 1000)
    end_ms = int(self.end_params.timestamp() * 1000)
    partition_ids = self.client.partitions_for_topic(self.topic)

    start_query = {
        TopicPartition(topic=self.topic, partition=pid): start_ms
        for pid in list(partition_ids)
    }
    self.start_p_offsets = self.client.offsets_for_times(start_query)
    # A timestamp after the last record comes back as None.
    if any(details is None for details in self.start_p_offsets.values()):
        raise ValueError("Start date outside of available messages")

    end_query = {
        TopicPartition(topic=self.topic, partition=pid): end_ms
        for pid in list(partition_ids)
    }
    self.end_p_offsets = self.client.offsets_for_times(end_query)
    # Out-of-range end: fall back to the last message. The original author
    # was not fully sure this is correct.
    unresolved = [
        tp for tp, details in self.end_p_offsets.items() if details is None
    ]
    for tp in unresolved:
        last_offset = self.client.end_offsets([tp])[tp] - 1
        self.end_p_offsets[tp] = OffsetAndTimestamp(offset=last_offset,
                                                    timestamp=None)
def assign_and_seek(self, partoffs):
    """Assign the given partitions and seek those that carry a positive offset.

    :param partoffs: iterable of objects exposing ``topic``, ``partition``
        and ``offset``. Entries whose offset is not greater than zero are
        assigned but not sought.
    """
    super().assign(
        [TopicPartition(entry.topic, entry.partition) for entry in partoffs])

    for entry in partoffs:
        if not entry.offset > 0:
            continue
        target = TopicPartition(entry.topic, entry.partition)
        super().seek(target, entry.offset)
def test_convert_partition_offsets_translates_partition_offsets_to_committable_topic_offsets():
    """``{partition: offset}`` becomes ``{TopicPartition: OffsetAndMetadata}`` with empty metadata."""
    expected = {
        TopicPartition(topic='foo', partition=partition):
            OffsetAndMetadata(offset=offset, metadata='')
        for partition, offset in ((0, 100), (1, 200))
    }

    actual = convert_partition_offsets('foo', {0: 100, 1: 200})

    assert actual == expected
def test_producing(self):
    """
    Verify that a producer can send messages through the mesh filter.

    Messages go to two topics, 'apples' and 'bananas'. The mesh filter is
    configured to forward topics starting with 'a' to the first cluster and
    topics starting with 'b' to the second one. Each send is awaited before
    the next, so records are not batched by the producer and the filter
    receives them one by one. Afterwards the consumers read from the Kafka
    clusters directly to make sure that nothing was lost.
    """
    messages_to_send = 100
    partition1 = TopicPartition('apples', 0)
    partition2 = TopicPartition('bananas', 0)

    producer = KafkaProducer(
        bootstrap_servers=IntegrationTest.kafka_envoy_address(),
        api_version=(1, 0, 0))

    offset_to_message1 = {}
    offset_to_message2 = {}
    targets = ((partition1, offset_to_message1),
               (partition2, offset_to_message2))
    for _ in range(messages_to_send):
        # The same message goes to both topics, one awaited send at a time.
        message = Message()
        for partition, by_offset in targets:
            future = producer.send(key=message.key,
                                   value=message.value,
                                   headers=message.headers,
                                   topic=partition.topic,
                                   partition=partition.partition)
            offset = future.get().offset
            self.assertTrue(offset >= 0)
            by_offset[offset] = message

    self.assertTrue(len(offset_to_message1) == messages_to_send)
    self.assertTrue(len(offset_to_message2) == messages_to_send)
    producer.close()

    # Check the target clusters.
    self.__verify_target_kafka_cluster(
        IntegrationTest.kafka_cluster1_address(), partition1,
        offset_to_message1, partition2)
    self.__verify_target_kafka_cluster(
        IntegrationTest.kafka_cluster2_address(), partition2,
        offset_to_message2, partition1)

    # Check if requests have been received.
    self.metrics.collect_final_metrics()
    self.metrics.assert_metric_increase('produce', 200)
def find_with_offset(offset_low=0, offset_high=-1):
    """Return the messages of the configured partition within an offset range.

    :param offset_low: first offset to read (the consumer seeks here).
    :param offset_high: last offset to include; ``-1`` (default) means no
        upper bound.
    :return: the result of ``filter(None, ...)`` over the consumer, bounded
        by ``offset_high`` when one is given.
    """
    from itertools import takewhile

    consumer = get_consumer()
    partition = TopicPartition(topic=TOPIC, partition=PARTITION_NUMBER)
    consumer.assign([partition])
    consumer.seek(partition, offset_low)

    messages = consumer
    if offset_high != -1:
        # Only one partition is assigned and iteration starts at offset_low,
        # so the first message past offset_high ends the range. The original
        # filter() predicate dropped later messages but kept draining the
        # consumer and never finished on its own.
        messages = takewhile(lambda msg: msg.offset <= offset_high, consumer)

    # filter(None, ...) keeps the original return type for both branches.
    return filter(None, messages)
def seek_messages_by_timestamp(self, input_dt):
    """Collect messages between a timestamp's offset and the last committed offset.

    All partitions of ``self.topics`` are assigned manually. For each
    partition whose committed offset is greater than the offset found for the
    timestamp, the consumer seeks to that offset and reads until a message
    offset reaches ``committed - 1``. The consumer is closed when the read
    phase finishes or fails.

    :param input_dt: date string in the format ``"%Y-%m-%d %H:%M:%S"``.
    :return: list of ``(offset, decoded value, key)`` tuples.
    :raises ValueError: on a badly formatted date, a Kafka timeout, an
        assertion/value error from the consumer, or when no committed offset
        (or timestamp offset) is available for a partition.
    """
    # The original wrapped strptime() in an assert. strptime raises ValueError
    # itself, so the format hint could never be shown, and the assert is
    # removed under -O. Parse once and raise the hint explicitly.
    try:
        parsed_dt = datetime.datetime.strptime(input_dt, "%Y-%m-%d %H:%M:%S")
    except ValueError as e:
        raise ValueError(
            'Please provide date input in the format "%Y-%m-%d %H:%M:%S"'
        ) from e

    try:
        # Loop-invariant: epoch milliseconds of the requested timestamp.
        offset_time = int(parsed_dt.timestamp() * 1000)

        # Topic partitions are assigned manually to read the messages from.
        assignments = []
        self.kc.unsubscribe()
        self.kc.topics()
        for topic in self.topics:
            partitions = self.kc.partitions_for_topic(topic)
            for p in partitions:
                assignments.append(TopicPartition(topic, p))
        self.kc.assign(assignments)

        messages = []
        for topic in self.topics:
            partitions = self.kc.partitions_for_topic(topic)
            for p in partitions:
                dc = {TopicPartition(topic, p): offset_time}
                last_commit_for_partition = self.kc.committed(
                    TopicPartition(topic, p))
                # Indexing a missing (None) result raises TypeError, which is
                # reported below.
                offset = [
                    x[0] for x in self.kc.offsets_for_times(dc).values()
                ][0]
                print("Topic: " + topic + " Partition: " + str(p) +
                      " Offset: " + str(offset))
                if last_commit_for_partition > offset:
                    self.kc.seek(TopicPartition(topic, p), offset)
                    for msg in self.kc:
                        messages.append(
                            (msg.offset, msg.value.decode('utf-8'), msg.key))
                        if msg.offset >= (last_commit_for_partition - 1):
                            break
        return messages
    except (errors.KafkaTimeoutError, AssertionError, ValueError) as e:
        raise ValueError('%r' % e)
    except TypeError:
        raise ValueError('No last commit available for the given Topics')
    finally:
        self.kc.close()
def get_consumer(topic, offset=-1):
    """Build a consumer for ``topic``.

    :param topic: topic name.
    :param offset: ``-1`` (default) returns a consumer subscribed to the
        topic in group ``MovieLog1``. Any other value is passed as the
        partition argument of ``TopicPartition``; that partition is assigned
        to a group-less consumer and sought to its beginning.
    :return: the configured ``KafkaConsumer``.
    """
    if offset != -1:
        consumer = KafkaConsumer(consumer_timeout_ms=KAFKA_TIMEOUT)
        consumer.assign([TopicPartition(topic, offset)])
        consumer.seek_to_beginning(TopicPartition(topic, offset))
        return consumer

    return KafkaConsumer(topic,
                         group_id='MovieLog1',
                         consumer_timeout_ms=KAFKA_TIMEOUT)
def get_tweets_for_top_10_accounts(consumer):
    """Store the latest tweets of the ten accounts most active in the last 3 hours.

    Every account is a topic whose partition 0 holds its tweets. Activity is
    the number of offsets between the first message of the last three hours
    and the end of the partition. For the ten most active accounts, up to ten
    of the most recent messages are polled and stored in
    ``data['top10_producing_account_latest_tweets']`` as
    ``{'user_id': ..., 'latest_tweets': [...]}``.

    :param consumer: Kafka consumer; it is re-subscribed via ``resubscribe``.
    """
    resubscribe(consumer)
    all_topics = get_all_acounts(consumer)

    # offsets_for_times expects integer epoch milliseconds; the original
    # passed float seconds.
    window_start = datetime.datetime.now() - datetime.timedelta(hours=3)
    start_ms = int(window_start.timestamp() * 1000)

    accounts = []
    for topic in all_topics:
        tp = TopicPartition(topic, 0)
        consumer.seek_to_end(tp)
        last_offset = int(consumer.position(tp))
        found = consumer.offsets_for_times({tp: start_ms}).get(tp)
        # No entry means no message at or after the window start, i.e. zero
        # recent tweets, so the window starts at the end of the partition.
        start_offset = int(found.offset) if found else last_offset
        accounts.append({
            'user_id': topic,
            'amount': last_offset - start_offset,
            'length': last_offset,
        })

    # The original called sorted() and discarded the result, leaving the list
    # unordered. Sort in place, most active first, then keep ten.
    accounts.sort(key=lambda account: account['amount'], reverse=True)
    top10 = accounts[:10]

    data['top10_producing_account_latest_tweets'] = []
    for account in top10:
        tp = TopicPartition(account['user_id'], 0)
        if account['length'] >= 10:
            consumer.seek(tp, account['length'] - 10)
        else:
            consumer.seek_to_beginning(tp)

        polled = consumer.poll(2000, 10)
        # Keep every record polled for this account's partition; the original
        # kept only the first record of each returned partition.
        tweets = [
            record.value.decode("utf-8") for record in polled.get(tp, [])
        ]
        print(tweets)
        data['top10_producing_account_latest_tweets'].append({
            'user_id': account['user_id'],
            'latest_tweets': tweets,
        })
def seek_last_n_messages(self, last_n_offset):
    """Print offset diagnostics for every partition of ``self.topics``.

    All partitions are assigned manually; for each topic the largest
    committed offset is printed, then each partition's committed offset,
    position and high-water mark. No messages are read, so the returned list
    is always empty. The consumer is closed in all cases.

    :param last_n_offset: subtracted from the largest committed offset (the
        result is not used further).
    :return: an empty list.
    :raises ValueError: on a Kafka timeout, assertion/value error, or when a
        committed offset is missing (reported from the ``TypeError``).
    """
    assignments = []
    try:
        self.kc.unsubscribe()
        self.kc.topics()
        for topic in self.topics:
            assignments.extend(
                TopicPartition(topic, pid)
                for pid in self.kc.partitions_for_topic(topic))
        self.kc.assign(assignments)

        messages = []
        for topic in self.topics:
            partitions = self.kc.partitions_for_topic(topic)
            last_offset = max(
                [self.kc.committed(TopicPartition(topic, pid))
                 for pid in partitions])
            # Computed as in the original; a missing commit fails here.
            _start_offset = max(last_offset - last_n_offset, 0)
            print("Max offset - " + str(last_offset))

            for pid in partitions:
                tp = TopicPartition(topic, pid)
                committed = self.kc.committed(tp)
                print("Topic: " + topic + " Partition: " + str(pid))
                report = "LastOffset: {} ** Position: {} ** HighWaterMark: {}".format(
                    str(committed),
                    str(self.kc.position(tp)),
                    str(self.kc.highwater(tp)))
                print(report)
        return messages
    except (errors.KafkaTimeoutError, AssertionError, ValueError) as e:
        raise ValueError('%r' % e)
    except TypeError:
        raise ValueError('No last commit available for the given Topics')
    finally:
        self.kc.close()
def run(self):
    """Consume ``self.topic`` and print each message until the stop event is set.

    Exits the process with status 2 when the topic is unknown. Reads either
    one partition (``self.partition``) or all of them. When ``self.offset``
    is set every partition is sought to it; otherwise, when ``self.inicio``
    is truthy, partitions are sought to their beginning. Values are parsed as
    JSON (optionally reduced to their ``words`` entry) and fall back to UTF-8
    text when JSON parsing raises ``ValueError``.
    """
    consumer = KafkaConsumer(
        bootstrap_servers=self.server,
        auto_offset_reset='earliest',
        group_id=self.groupid)

    if consumer.partitions_for_topic(self.topic) is None:
        print("El tópico %s no existe!" % self.topic)
        sys.exit(2)

    if self.partition is not None:
        partitions = [TopicPartition(self.topic, int(self.partition))]
    else:
        partitions = [
            TopicPartition(self.topic, pid)
            for pid in consumer.partitions_for_topic(self.topic)
        ]
    consumer.assign(partitions)

    if self.offset is not None:
        for tp in partitions:
            consumer.seek(tp, int(self.offset))
    elif self.inicio:
        for tp in partitions:
            consumer.seek_to_beginning(tp)

    while not self.stop_event.is_set():
        try:
            for message in consumer:
                logging.info(message)
                try:
                    valor = json.loads(message.value)
                    if self.words:
                        valor = valor['words']
                except (ValueError):
                    valor = message.value.decode('utf-8')

                fields = (message.topic, message.partition, message.offset,
                          valor)
                print("Recibiendo Mensaje (%s/%d/%d) %s" % fields)

                if self.stop_event.is_set():
                    break
        except IndexError:
            # IndexError raised while iterating is ignored and the outer loop
            # retries.
            pass

    consumer.close()
def when_we_read_and_transform_the_records(step):
    """Read raw records from Kafka through a record transformer.

    A transformer is built from the pipeline's transform map with a
    SQL Server persistence manager; records read from the raw topic are
    relayed through it and appended to
    ``test_context.consumed_raw_into_sst_record_list``.

    :param step: step object supplied by the test framework (unused here).
    """
    test_context = world.test_environment.load_context(
        EXTRACT_TRANSFORM_CONSUME_SCENARIO)

    mssql_db = sqldbx.SQLServerDatabase('', 'Legacy')
    db_username = world.mssql_username
    db_password = world.mssql_password
    mssql_db.login(db_username, db_password, schema='mercury')
    pmgr = sqldbx.PersistenceManager(mssql_db)

    transform_map_filename = world.pipeline_config.transform_map
    map_file_path = os.path.join(world.data_dir, transform_map_filename)
    transformer_builder = dmap.RecordTransformerBuilder(map_file_path,
                                                        persistence_mgr=pmgr)
    tfmr = transformer_builder.build()

    knodes = world.pipeline_config.cluster.node_array
    # A kafka group is a numbered context shared by some number of consumers.
    group = world.pipeline_config.get_user_defined_consumer_group(
        'scratch_group_2')
    topic = world.pipeline_config.raw_topic
    kreader = telegraf.KafkaIngestRecordReader(topic, knodes, group)

    # Show how many partitions this topic spans. A single parenthesised
    # argument prints the same text under Python 2 and Python 3.
    metadata = kreader.consumer.partitions_for_topic(topic)
    print('### partitions for topic %s:\n%s' % (topic, '\n'.join(
        [str(p) for p in metadata])))

    # Explicit offsets require a manual assignment, and seek() only accepts a
    # partition that is currently assigned. The original assigned partition 0
    # but sought list(metadata)[0]; use one TopicPartition for both so the two
    # can never disagree.
    topic_partition = TopicPartition(topic, list(metadata)[0])
    kreader.consumer.assign([topic_partition])
    offset = get_offset(topic)
    kreader.consumer.seek(topic_partition, offset)

    world_relay = WorldRelay(transformer=tfmr)
    kreader.read(world_relay, log)
    for rec in world_relay.read_list:
        test_context.consumed_raw_into_sst_record_list.append(rec)
def updateDeviceConfig(brokerArray, producer, consumer):
    """Request device configurations and collect the replies for known devices.

    A ``get`` request is sent on ``admin_devices_topic``; replies with
    ``cmd == 'config-simulation'`` whose name is in the device list are
    recorded and removed from the list. Polling stops when the list is empty
    or ``DEFAULT_TIMEOUT_S`` seconds have passed since the request.

    :param brokerArray: forwarded to ``getAgentConfig``.
    :param producer: Kafka producer used for the request.
    :param consumer: Kafka consumer used for the replies.
    :return: list of dicts with ``name``, ``type``, ``tagCount``,
        ``replication`` and ``scantime``.
    """
    log("Starting Device Config Update")
    deviceList = getAgentConfig(brokerArray, producer, consumer)
    log("Got Device List")

    tp1 = TopicPartition(admin_devices_topic, 0)
    tp2 = TopicPartition(admin_topic, 0)
    consumer.assign([tp1, tp2])
    log("Assigned Topics")
    consumer.seek_to_end()

    producer.send(admin_devices_topic, genericMessage("get", name, {}))
    log("Sent Devices Config Request")

    deviceDatabase = []
    tic = time.time()
    while (len(deviceList) > 0) and (time.time() < (DEFAULT_TIMEOUT_S + tic)):
        polled = consumer.poll()
        if tp1 not in polled:
            continue
        for record in polled[tp1]:
            m = json.loads(record.value)
            if m['cmd'] != 'config-simulation':
                continue
            log("Passed Config Message")
            if m['name'] not in deviceList:
                continue
            deviceList.remove(m['name'])
            details = m['message']
            deviceDatabase.append({
                'name': m['name'],
                'type': details['type'],
                'tagCount': len(details['tags']),
                'replication': details['replication'],
                'scantime': details['scantime']
            })
    return deviceDatabase
def amount_search(): #array = [] consumer.assign([TopicPartition('json_test', 0)]) i = consumer.position(TopicPartition('json_test', 0)) amount = raw_input( "\nPlease input an amount of messages before the current message offset to querry: \n(the latest offset is " + str(i - 1) + ")\n=>") consumer.seek(TopicPartition('json_test', 0), i - int(amount)) for message in consumer: json_message = json.loads(message.value) print 'offset = ', message.offset, '\n', json_message, '\n' if message.offset == i - 1: time.sleep(5) break
def time_search(): times = raw_input( "\nPlease input a time period (in seconds) for searching the message to querry: \n(1 day = 86400 s, 1 hr = 3600 s)\n=>" ) consumer.assign([TopicPartition('json_test', 0)]) consumer.seek(TopicPartition('json_test', 0), 0) t = datetime.datetime.now() print '\n\ntime period: ', datetime.timedelta(seconds=int(times)), '\n' for message in consumer: json_message = json.loads(message.value) json_message_time = datetime.datetime.strptime(json_message['time'], "%Y-%m-%d %H:%M:%S") if t - json_message_time < datetime.timedelta(seconds=int(times)): print 'offset = ', message.offset, '\n', json_message, '\n'
def wait_for_kafka_topic(hostport, topic, timeout=60):
    """Block until ``topic`` has metadata and every partition has a broker.

    :param hostport: address passed to ``SimpleClient``.
    :param topic: topic to wait for.
    :param timeout: total seconds allowed for both waiting phases.
    :raises Exception: when the timeout elapses in either phase.
    """
    # Delay import to facilitate module use in limited virtualenvs.
    from kafka import SimpleClient, TopicPartition

    start = time.time()
    client = SimpleClient(hostport, client_id=b'dummy', timeout=1)

    def retry_or_fail(message):
        # One polling step: enforce the deadline, pause, refresh metadata.
        if time.time() - start > timeout:
            raise Exception(message)
        time.sleep(0.1)
        client.load_metadata_for_topics()

    # Phase 1: the topic must appear in the client's metadata.
    while not client.has_metadata_for_topic(topic):
        retry_or_fail('timeout reached waiting for topic')

    # Phase 2: all partitions in that topic must have a leader.
    while True:
        tps = [
            TopicPartition(topic, pid)
            for pid in client.topic_partitions.get(topic, [])
        ]
        if tps and all(client.topics_to_brokers.get(tp) for tp in tps):
            return
        retry_or_fail('timeout reached waiting for topic brokers')
async def consume():
    """Fetch a single message from ``my_topic`` and log its offset bookkeeping.

    Logs the message, its offset and value, checks that the consumer position
    is the message offset plus one, and logs the committed offset. The
    consumer is always stopped afterwards.
    """
    consumer = AIOKafkaConsumer(
        'my_topic',
        loop=loop,
        bootstrap_servers='localhost:9092',
        group_id="my-group")
    # Get cluster layout and join group `my-group`.
    await consumer.start()
    try:
        record = await consumer.getone()
        logger.info(record)
        # The offset is a unique auto-increment id within the topic-partition.
        logger.info(f'msg.offset = {record.offset}')
        logger.info(f'msg.value = {record.value}')

        tp = TopicPartition(record.topic, record.partition)

        # Position is the next offset that will be fetched.
        next_offset = await consumer.position(tp)
        assert next_offset == record.offset + 1

        committed = await consumer.committed(tp)
        logger.info(f'committed = {committed}')
    finally:
        # Will leave consumer group; perform autocommit if enabled.
        await consumer.stop()
def employeeportal():
    """Render the employee portal with the location from one crashed-device message.

    Reads a single message from ``crashed-devices``, commits, closes the
    consumer and renders ``employeeportal.html`` with the collected latitude
    and longitude lists.

    NOTE(review): the map API key is hard-coded below; consider moving it to
    configuration.
    """
    tp = TopicPartition('crashed-devices', 0)
    brokers = [
        'ec2-52-203-135-135.compute-1.amazonaws.com:9092',
        'ec2-52-70-111-222.compute-1.amazonaws.com:9092',
        'ec2-34-193-78-218.compute-1.amazonaws.com:9092'
    ]
    consumer = KafkaConsumer(
        'crashed-devices',
        bootstrap_servers=brokers,
        enable_auto_commit=True,
        group_id='my-group',
        auto_offset_reset='earliest',
        value_deserializer=lambda x: loads(x.decode('utf-8')))

    # The lookup is kept from the original; its result is not used afterwards.
    consumer.beginning_offsets([tp])[tp]

    latitudes = []
    longitudes = []
    # Only the first message is processed: the loop always breaks after it.
    for message in consumer:
        location = message.value
        latitudes.append(location['latitude'])
        longitudes.append(location['longitude'])
        print(latitudes, longitudes)
        print("GOT HERE")
        consumer.commit()
        break

    consumer.close()
    return render_template("employeeportal.html",
                           APIkey='AIzaSyD9e3Rdo8fGQq6hzaXkdsdQzv9Hy0rTolE',
                           latitudes=latitudes,
                           longitudes=longitudes)
def debug(self, topic):
    """Yield every message of ``topic`` from the beginning, decoded as UTF-8.

    After each poll an empty string is yielded as well, so the caller regains
    control even when no messages arrived. The generator never finishes on
    its own; the consumer is closed when the generator is closed or fails.

    :param topic: topic whose partitions are all assigned and read.
    :raises Exception: when the topic has no known partitions.
    """
    c = KafkaConsumer(bootstrap_servers=KAFKA_HOSTS,
                      client_id=self._client_id,
                      group_id=None,
                      api_version=(0, 10))
    # The original called c.close() after the infinite loop, where it could
    # never run. try/finally releases the consumer when the caller closes the
    # generator and on the "topic not exist" error path.
    try:
        partitions = c.partitions_for_topic(topic)
        if not partitions:
            raise Exception("Topic " + topic + " not exist")
        c.assign([TopicPartition(topic, p) for p in partitions])
        c.seek_to_beginning()

        while True:
            polled = c.poll(100)
            for records in polled.values():
                for msg in records:
                    yield msg.value.decode('utf-8')
            yield ""
    finally:
        c.close()