def _get_topic_offsets(topics, latest):
    """
    :param topics: list of topics
    :param latest: True to fetch latest offsets, False to fetch earliest available
    :return: dict: { (topic, partition): offset, ... }
    """
    # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetRequest
    # https://cfchou.github.io/blog/2015/04/23/a-closer-look-at-kafka-offsetrequest/
    assert set(topics) <= set(ALL)
    with get_simple_kafka_client() as client:
        partition_meta = client.topic_partitions

        # ask for a single offset per partition: the log-end offset (time=-1)
        # or the earliest available offset (time=-2)
        num_offsets = 1
        time_value = -1 if latest else -2

        offsets = {}
        offset_requests = []
        for topic in topics:
            partitions = list(partition_meta.get(topic, {}))
            for partition in partitions:
                offsets[(topic, partition)] = None
                offset_requests.append(
                    OffsetRequestPayload(topic, partition, time_value, num_offsets))

        responses = client.send_offset_request(offset_requests)
        for r in responses:
            offsets[(r.topic, r.partition)] = r.offsets[0]

        return offsets
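# The SimpleClient pattern above recurs throughout these snippets. A minimal
# standalone sketch, assuming a reachable broker at localhost:9092 and a
# placeholder topic name (both are assumptions, not values from the snippets):
from kafka import SimpleClient
from kafka.common import OffsetRequestPayload

client = SimpleClient('localhost:9092')
topic = 'my-topic'
partitions = client.topic_partitions[topic]

# time=-2 asks for the earliest available offset, time=-1 for the log-end offset
for label, time_value in (('earliest', -2), ('latest', -1)):
    reqs = [OffsetRequestPayload(topic, p, time_value, 1) for p in partitions]
    for resp in client.send_offset_request(reqs):
        print("%s %s/%d = %d" % (label, resp.topic, resp.partition, resp.offsets[0]))

client.close()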
def get_tail_offsets(self):
    request = [OffsetRequestPayload(self.topic, p, -1, 1)
               for p in self.partitions.keys()]
    response = self.client.send_offset_request(request)
    offsets = {r.partition: r.offsets[0] for r in response}  # build dictionary
    return offsets
def current_offset(self, topic, partition):
    try:
        offsets, = self.client.send_offset_request(
            [OffsetRequestPayload(topic, partition, -1, 1)])
    except Exception:
        # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
        self.zk.child.dump_logs()
        self.server.child.dump_logs()
        raise
    else:
        return offsets.offsets[0]
def process(spouts):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    results = []
    total_depth = 0
    total_delta = 0
    brokers = []
    for s in spouts:
        for p in s.partitions:
            try:
                k = SimpleClient([p['broker']['host'] + ":" + str(p['broker']['port'])])
            except socket.gaierror as e:
                raise ProcessorError('Failed to contact Kafka broker %s (%s)' %
                                     (p['broker']['host'], str(e)))

            earliest_off = OffsetRequestPayload(p['topic'], p['partition'], -2, 1)
            latest_off = OffsetRequestPayload(p['topic'], p['partition'], -1, 1)

            earliest = k.send_offset_request([earliest_off])[0].offsets[0]
            latest = k.send_offset_request([latest_off])[0].offsets[0]
            current = p['offset']

            brokers.append(p['broker']['host'])
            total_depth = total_depth + (latest - earliest)
            total_delta = total_delta + (latest - current)

            results.append(PartitionState._make([
                p['broker']['host'],
                p['topic'],
                p['partition'],
                earliest,
                latest,
                latest - earliest,
                s.id,
                current,
                latest - current]))
    return PartitionsSummary(total_depth=total_depth,
                             total_delta=total_delta,
                             num_partitions=len(results),
                             num_brokers=len(set(brokers)),
                             partitions=tuple(results))
def _python_kafka_partitionoffset(self, topic):
    """
    Print the latest offset of each partition of the topic
    """
    self.topic = topic
    client = SimpleClient(self.brokers)
    partitions = client.topic_partitions[topic]
    offset_requests = [OffsetRequestPayload(topic, p, -1, 1)
                       for p in partitions.keys()]
    offsets_responses = client.send_offset_request(offset_requests)
    for r in offsets_responses:
        print("partition = %s, offset = %s" % (r.partition, r.offsets[0]))
def reset_partition_offset(self, partition):
    """Update offsets using auto_offset_reset policy (smallest|largest)

    Arguments:
        partition (int): the partition for which offsets should be updated

    Returns: Updated offset on success, None on failure
    """
    LATEST = -1
    EARLIEST = -2
    if self.auto_offset_reset == 'largest':
        reqs = [OffsetRequestPayload(self.topic, partition, LATEST, 1)]
    elif self.auto_offset_reset == 'smallest':
        reqs = [OffsetRequestPayload(self.topic, partition, EARLIEST, 1)]
    else:
        # Let's raise a reasonable exception type if user calls
        # outside of an exception context
        if sys.exc_info() == (None, None, None):
            raise OffsetOutOfRangeError('Cannot reset partition offsets without a '
                                        'valid auto_offset_reset setting '
                                        '(largest|smallest)')
        # Otherwise we should re-raise the upstream exception
        # b/c it typically includes additional data about
        # the request that triggered it, and we do not want to drop that
        raise  # pylint: disable=E0704

    # send_offset_request
    log.info('Resetting topic-partition offset to %s for %s:%d',
             self.auto_offset_reset, self.topic, partition)
    try:
        (resp, ) = self.client.send_offset_request(reqs)
    except KafkaError as e:
        log.error('%s sending offset request for %s:%d',
                  e.__class__.__name__, self.topic, partition)
    else:
        self.offsets[partition] = resp.offsets[0]
        self.fetch_offsets[partition] = resp.offsets[0]
        return resp.offsets[0]
def getoffset(self, topic):
    '''Not currently used.'''
    from kafka import SimpleClient
    from kafka.common import OffsetRequestPayload

    client = SimpleClient(self.server)
    partitions = client.topic_partitions[topic]
    offset_requests = [OffsetRequestPayload(topic, p, -1, 1)
                       for p in partitions.keys()]
    offsets_responses = client.send_offset_request(offset_requests)
    for r in offsets_responses:
        print("partition = %s, offset = %s" % (r.partition, r.offsets[0]))
def _python_kafka_offsetcount(self, topic):
    """
    Count the total number of offsets across all partitions of the topic
    """
    client = SimpleClient(self.brokers)
    self.topic = topic
    partitions = client.topic_partitions[self.topic]
    offset_requests = [OffsetRequestPayload(topic, p, -1, 1)
                       for p in partitions.keys()]
    offsets_responses = client.send_offset_request(offset_requests)
    totaloffset = 0
    for r in offsets_responses:
        totaloffset = totaloffset + r.offsets[0]
    return totaloffset
def get_topic_max(topic, k_client):
    """Return the max offset of a kafka topic (partition 0 only)

    Args:
        topic (str): Name of kafka topic
        k_client (obj): Kafka client object

    Returns:
        int: Max offset of partition 0
    """
    partitions = k_client.topic_partitions[topic]
    offset_requests = [OffsetRequestPayload(topic, p, -1, 1)
                       for p in partitions.keys()]
    offsets_responses = k_client.send_offset_request(offset_requests)
    for r in offsets_responses:
        if r.partition == 0:
            return r.offsets[0]
def _update_produced_offsets(self):
    """
    The offset request asks for all messages before a certain time (ms).
    There are two special values: -1 asks for the latest offset (i.e. the
    offset of the next coming message) and -2 asks for the earliest
    available offset. Note that because offsets are pulled in descending
    order, asking for the earliest offset will always return you a single
    element.
    """
    for partition in self._client.get_partition_ids_for_topic(self._topic):
        reqs = [OffsetRequestPayload(self._topic, partition, -1, 1)]

        (resp, ) = self._client.send_offset_request(reqs)

        check_error(resp)
        assert resp.topic == self._topic
        assert resp.partition == partition
        self._offsets.produced[partition] = resp.offsets[0]
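# The docstring's note about descending order can be seen by asking for more
# than one offset: the broker answers with log-segment boundary offsets,
# newest first. An illustrative sketch only; `client` and 'my-topic' are
# placeholders, and the printed list is invented for illustration:
reqs = [OffsetRequestPayload('my-topic', 0, -1, 5)]  # ask for up to 5 offsets
(resp,) = client.send_offset_request(reqs)
print(resp.offsets)  # e.g. [1000, 800, 400, 0] -- newest boundary first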
def getOffsets(self, topic, partitions, group):
    """ Return committed offsets for the given topic, partitions and group. """
    try:
        # Try the zookeeper-storage API first; if no offsets are stored for
        # the given group, UnknownTopicOrPartitionError is raised.
        tp = self.client.send_offset_fetch_request(
            group, [OffsetRequestPayload(topic, p, -1, 1) for p in partitions])
        offsets = {p.partition: p.offset for p in tp}
    except UnknownTopicOrPartitionError:
        # On failure, fall back to the kafka-storage API to fetch the offsets.
        consumer = KafkaConsumer(group_id=group,
                                 bootstrap_servers=self.broker,
                                 enable_auto_commit=False)
        tp = [TopicPartition(topic, p) for p in partitions]
        consumer.assign(tp)
        offsets = {p.partition: consumer.position(p) for p in tp}
    return offsets
def count_kafka_mssg(topic, server):
    """Returns the total number of messages (sum of all partitions) in given kafka topic"""
    client = SimpleClient(server)
    partitions = client.topic_partitions[topic]
    offset_requests = [OffsetRequestPayload(topic, p, -1, 1)
                       for p in partitions.keys()]
    offsets_responses = client.send_offset_request(offset_requests)

    total_mssg = 0
    for r in offsets_responses:
        logging.info("partition = {}, offset = {}".format(r.partition, r.offsets[0]))
        total_mssg += int(r.offsets[0])
    return total_mssg
def handler(self):
    """
    Collect the current logsize of every partition of the configured topics
    in the given Kafka clusters and write it to LevelDB. After each
    collection pass, entries older than retention_hour are deleted.
    """
    clusters = base.config["collector"]["clusters"]
    for cluster, metric in clusters.items():
        client = KafkaClient(metric["brokers"], timeout=3)
        for topic in metric["topics"]:
            partitions = client.get_partition_ids_for_topic(topic)
            payload = [OffsetRequestPayload(topic, p, -1, 1) for p in partitions]
            logsize = {p.partition: p.offsets[0]
                       for p in client.send_offset_request(payload)}
            if logsize:
                key = str(int(time.time())).encode("utf-8")
                value = json.dumps(logsize).encode("utf-8")
                db = base.init_leveldb(cluster=cluster, topic=topic)
                db.Put(key, value)
                deadline = base.config["collector"]["clusters"][cluster]["retention_hour"] * 3600
                for key, _ in db.RangeIter():
                    if time.time() - int(key) > deadline:
                        db.Delete(key)
                    else:
                        break
        client.close()
def pending(self, partitions=None):
    """
    Gets the pending message count

    Keyword Arguments:
        partitions (list): list of partitions to check for, default is to check all
    """
    if partitions is None:
        partitions = self.offsets.keys()

    total = 0
    reqs = []

    for partition in partitions:
        reqs.append(OffsetRequestPayload(self.topic, partition, -1, 1))

    resps = self.client.send_offset_request(reqs)
    for resp in resps:
        partition = resp.partition
        pending = resp.offsets[0]
        offset = self.offsets[partition]
        total += pending - offset

    return total
def spoorer(self):
    # Connect to kafka and fetch the topic list
    try:
        kafka_client = SimpleClient(self.kafka_hosts, timeout=self.timeout)
    except Exception:
        print("Error, cannot connect kafka broker.")
        sys.exit(1)
    else:
        kafka_topics = kafka_client.topics
        kafka_client.close()

    # Connect to zookeeper and fetch the current consumer offsets
    try:
        zookeeper_client = KazooClient(hosts=self.zookeeper_hosts, read_only=True,
                                       timeout=self.timeout)
        zookeeper_client.start()
    except Exception:
        print("Error, cannot connect zookeeper server.")
        sys.exit(1)

    try:
        groups = map(str, zookeeper_client.get_children(self.zookeeper_url + 'consumers'))
    except NoNodeError:
        print("Error, invalid zookeeper url.")
        zookeeper_client.stop()
        sys.exit(2)
    else:
        for group in groups:
            if 'offsets' not in zookeeper_client.get_children(self.zookeeper_url + 'consumers/%s' % group):
                continue
            topic_path = 'consumers/%s/offsets' % (group)
            topics = list(map(str, zookeeper_client.get_children(self.zookeeper_url + topic_path)))
            if len(topics) == 0:
                continue

            for topic in topics:
                if topic not in self.white_topic_group.keys():
                    continue

                partition_path = 'consumers/%s/offsets/%s' % (group, topic)
                partitions = map(int, zookeeper_client.get_children(self.zookeeper_url + partition_path))

                for partition in partitions:
                    base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topic, partition)
                    owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
                    offset = zookeeper_client.get(self.zookeeper_url + offset_path)[0]
                    try:
                        owner = zookeeper_client.get(self.zookeeper_url + owner_path)[0]
                    except NoNodeError:
                        owner = 'null'

                    # Store the consumer progress in the metric dict
                    metric = {'datetime': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                              'topic': topic,
                              'group': group,
                              'partition': int(partition),
                              'logsize': None,
                              'offset': int(offset),
                              'lag': None,
                              'owner': owner}
                    self.result.append(metric)
    finally:
        zookeeper_client.stop()

    # Fetch the logsize of every partition
    try:
        client = SimpleClient(self.kafka_hosts)
    except Exception:
        print("Error, cannot connect kafka broker.")
        sys.exit(1)
    else:
        for kafka_topic in kafka_topics:
            self.kafka_logsize[kafka_topic] = {}
            partitions = client.topic_partitions[kafka_topic]
            offset_requests = [OffsetRequestPayload(kafka_topic, p, -1, 1)
                               for p in partitions.keys()]
            offsets_responses = client.send_offset_request(offset_requests)
            for r in offsets_responses:
                self.kafka_logsize[kafka_topic][r.partition] = r.offsets[0]

        # lag equals logsize minus the current offset
        f1 = open(self.log_file, 'w')
        for metric in self.result:
            logsize = self.kafka_logsize[metric['topic']][metric['partition']]
            metric['logsize'] = int(logsize)
            metric['lag'] = int(logsize) - int(metric['offset'])
            f1.write(json.dumps(metric, sort_keys=True) + '\n')
            f1.flush()
        f1.close()
        client.close()
def getLogsize(self, topic, partitions):
    """ Return logsize data for the given topic and partition list. """
    tp = self.client.send_offset_request(
        [OffsetRequestPayload(topic, p, -1, 1) for p in partitions])
    return {p.partition: p.offsets[0] for p in tp}
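# Together with getOffsets above, this helper yields a per-partition lag
# figure (logsize minus committed offset), the same arithmetic the larger
# monitoring snippets in this collection use. A hedged usage sketch; `mon`,
# the topic, partition list and group name are all placeholders:
partitions = [0, 1, 2]
logsize = mon.getLogsize('my-topic', partitions)              # partition -> log-end offset
offsets = mon.getOffsets('my-topic', partitions, 'my-group')  # partition -> committed offset
lag = {p: logsize[p] - offsets[p] for p in partitions}
print(lag)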
def monitor(self):
    try:
        kafka_client = KafkaClient(KAFKA_HOSTS, timeout=self.timeout)
    except Exception:
        print("Error, cannot connect kafka broker.")
        sys.exit(1)

    try:
        zookeeper_client = KazooClient(hosts=ZOO_HOSTS, read_only=True, timeout=self.timeout)
        zookeeper_client.start()
    except Exception:
        print("Error, cannot connect zookeeper server.")
        sys.exit(1)

    for group in CONSUMER_GROUPS:
        for topic in TOPIC_LIST:
            try:
                partition_path = 'consumers/%s/offsets/%s' % (group, topic)
                partitions = map(int, zookeeper_client.get_children(self.zookeeper_url + partition_path))
                for partition in partitions:
                    offset_path = 'consumers/%s/offsets/%s/%s' % (group, topic, partition)
                    offset = zookeeper_client.get(self.zookeeper_url + offset_path)[0]
                    if offset is None:
                        continue
                    obj = {
                        'timestamp': self.timestamp,
                        'group': group,
                        'topic': topic,
                        'partition': int(partition),
                        'metric': 'consumerlag:%s' % group,
                        'tags': 'topic=%s,partition=%s' % (topic, partition),
                        'offset': int(offset)
                    }
                    self.result.append(obj)
            except NoNodeError:
                print("Error, failed to get offset for group[%s], topic[%s]" % (group, topic))
                continue
    zookeeper_client.stop()

    for kafka_topic in TOPIC_LIST:
        self.kafka_logsize[kafka_topic] = {}
        try:
            partitions = kafka_client.topic_partitions[kafka_topic]
            logsize_requests = [OffsetRequestPayload(kafka_topic, p, -1, 1)
                                for p in partitions.keys()]
            logsize_responses = kafka_client.send_offset_request(logsize_requests)
            for r in logsize_responses:
                self.kafka_logsize[kafka_topic][r.partition] = r.offsets[0]
        except Exception:
            print("Error, failed to get logsize for topic: %s" % kafka_topic)
    kafka_client.close()

    payload = []
    for obj in self.result:
        try:
            logsize = self.kafka_logsize[obj['topic']][obj['partition']]
            lag = int(logsize) - int(obj['offset'])
            item = {}
            item['endpoint'] = ENDPOINT
            item['metric'] = obj['metric']
            item['tags'] = obj['tags']
            item['timestamp'] = obj['timestamp']
            item['step'] = STEP
            item['value'] = lag
            item['counterType'] = 'GAUGE'
            payload.append(item)
        except Exception:
            print("Error, failed to compute (%s/%s/%s) lag value" %
                  (obj['group'], obj['topic'], obj['partition']))

    # 1. Print lag details
    print("log-lag details:")
    print(payload)

    # 2. Report to falcon-agent
    if len(payload) > 0:
        requests.post(FALCON_AGENT_URL, data=json.dumps(payload), timeout=10)
def seek(self, offset, whence=None, partition=None):
    """
    Alter the current offset in the consumer, similar to fseek

    Arguments:
        offset: how much to modify the offset
        whence: where to modify it from, default is None

            * None is an absolute offset
            * 0    is relative to the earliest available offset (head)
            * 1    is relative to the current offset
            * 2    is relative to the latest known offset (tail)

        partition: modify which partition, default is None.
            If partition is None, would modify all partitions.
    """
    if whence is None:  # set an absolute offset
        if partition is None:
            for tmp_partition in self.offsets:
                self.offsets[tmp_partition] = offset
        else:
            self.offsets[partition] = offset
    elif whence == 1:  # relative to current position
        if partition is None:
            for tmp_partition, _offset in self.offsets.items():
                self.offsets[tmp_partition] = _offset + offset
        else:
            self.offsets[partition] += offset
    elif whence in (0, 2):  # relative to beginning or end
        reqs = []
        deltas = {}
        if partition is None:
            # divide the request offset by number of partitions,
            # distribute the remainder evenly
            (delta, rem) = divmod(offset, len(self.offsets))
            for tmp_partition, r in izip_longest(self.offsets.keys(),
                                                 repeat(1, rem), fillvalue=0):
                deltas[tmp_partition] = delta + r

            for tmp_partition in self.offsets.keys():
                if whence == 0:
                    reqs.append(OffsetRequestPayload(self.topic, tmp_partition, -2, 1))
                elif whence == 2:
                    reqs.append(OffsetRequestPayload(self.topic, tmp_partition, -1, 1))
                else:
                    pass
        else:
            deltas[partition] = offset
            if whence == 0:
                reqs.append(OffsetRequestPayload(self.topic, partition, -2, 1))
            elif whence == 2:
                reqs.append(OffsetRequestPayload(self.topic, partition, -1, 1))
            else:
                pass

        resps = self.client.send_offset_request(reqs)
        for resp in resps:
            self.offsets[resp.partition] = resp.offsets[0] + deltas[resp.partition]
    else:
        raise ValueError('Unexpected value for `whence`, %d' % whence)

    # Reset queue and fetch offsets since they are invalid
    self.fetch_offsets = self.offsets.copy()
    self.count_since_commit += 1
    if self.auto_commit:
        self.commit()
    self.queue = queue.Queue()
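# The whence semantics are easiest to see in use. A usage sketch, assuming a
# SimpleConsumer-style instance named `consumer` (a placeholder); note that a
# negative offset with whence=2 positions the consumer before the tail:
consumer.seek(0, 0)    # jump to the earliest available offset on every partition
consumer.seek(0, 2)    # jump to the tail, skipping all pending messages
consumer.seek(-10, 2)  # position 10 messages before the tail, spread across partitions
consumer.seek(5, 1)    # move 5 messages forward from the current position
consumer.seek(100, None, partition=0)  # absolute offset 100 on partition 0 only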
import sys

from kafka import SimpleClient
from kafka.common import OffsetRequestPayload
from kazoo.client import KazooClient

DBNAME = 'kafka_monitor'
host = ''
port = 8086
# kafka cluster name, measurement in influxdb
kafka_name = ""
zk_path = '/consumers/'
consumer_group = ''

kafka_brokers = sys.argv[2]
zk_clusters = sys.argv[4]
topic = sys.argv[6]

client = SimpleClient(kafka_brokers)
partitions = client.topic_partitions[topic]
offset_requests = [OffsetRequestPayload(topic, p, -1, 1) for p in partitions.keys()]
offsets_responses = client.send_offset_request(offset_requests)

zk = KazooClient(hosts=zk_clusters, read_only=True)
zk.start()
zk_path = zk_path + consumer_group
if zk.exists(zk_path):
    data, stat = zk.get(zk_path + "/offsets/" + topic + "/1")
    sum_lag = 0
    sum_offset = 0
    for r in offsets_responses:
        consumer_offset, stat = zk.get(zk_path + "/offsets/" + topic + "/" + str(r.partition))
        producer_offset = r.offsets[0]
        lag_partition = producer_offset - int(consumer_offset)
import os
import sys

import kafka
from kafka import SimpleClient
from kafka.common import OffsetRequestPayload

topic = sys.argv[1]
variables.setVariables()
topic_prefix = 'my_topic_'
table = []
sum_total = []

broker = "%s:%s" % (str(os.environ['KAFKA_BROKER_ADDR']),
                    str(os.environ['KAFKA_BROKER_PORT']))
consumer = kafka.KafkaConsumer(group_id='count_check', bootstrap_servers=[broker])
client = SimpleClient(broker)

for tpc in table:
    partitions = client.topic_partitions[tpc]
    offset_requests = [OffsetRequestPayload(tpc, p, -1, 1) for p in partitions.keys()]
    offset_responses = client.send_offset_request(offset_requests)
    my_list = []
    for r in offset_responses:
        my_list.append(r.offsets[0])
    sum_total.append(sum(my_list))
    my_list = []

print("%s, %s" % (topic, sum_total))
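# All of the snippets above use the legacy SimpleClient / OffsetRequestPayload
# API, which newer kafka-python releases have removed. A minimal sketch of the
# same logsize count with the KafkaConsumer API, assuming kafka-python >= 1.3;
# the broker address and topic name are placeholders:
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers='localhost:9092')
topic = 'my-topic'

tps = [TopicPartition(topic, p) for p in consumer.partitions_for_topic(topic)]
earliest = consumer.beginning_offsets(tps)  # TopicPartition -> first available offset
latest = consumer.end_offsets(tps)          # TopicPartition -> log-end offset

total = sum(latest[tp] - earliest[tp] for tp in tps)
print("messages in %s: %d" % (topic, total))
consumer.close()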