import logging
import pickle

from kafka import KafkaProducer, SimpleClient

log = logging.getLogger(__name__)


class DFProducer:

    def __init__(self, bootstrap_servers):
        self.kafka_client = SimpleClient(bootstrap_servers)
        self.producer = KafkaProducer(
            bootstrap_servers=bootstrap_servers,
            api_version=(0, 10),
            retries=3,
            max_block_ms=60 * 1000,
            value_serializer=lambda m: pickle.dumps(m))

    def produce(self, reader):
        data_df = reader.read_data()
        if data_df.empty:
            log.info('This collection run produced 0 rows...')
            return
        key = reader.get_table_profile().get_key()
        bkey = bytes(key, encoding="utf8")
        tpc = reader.get_table_profile().get_pub_topic()
        tp_part = len(self.kafka_client.get_partition_ids_for_topic(tpc))
        for idx, row in data_df.iterrows():
            # Spread rows round-robin across the topic's partitions.
            part = idx % tp_part
            self.producer.send(tpc, value=row, key=bkey, partition=part).add_errback(
                self.on_send_error, key, tpc)
        self.producer.flush()
        log.info('Successfully sent data for table key [{}] ({} rows) to kafka...'.format(
            key, data_df.shape[0]))

    def on_send_error(self, key, topic, ex):
        # kafka-python binds extra errback args via functools.partial before
        # the exception, so the exception arrives last, not first.
        logging.error('Failed to send data for table [{}] to kafka topic [{}]: {}'.format(
            key, topic, ex))
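A minimal sketch of driving DFProducer.produce(); the reader and table profile below are hypothetical stand-ins that only mirror the read_data()/get_table_profile() interface used above, and the broker address is a placeholder:

import pandas as pd


class FakeProfile:
    """Hypothetical table profile matching get_key()/get_pub_topic()."""

    def get_key(self):
        return 'demo_table'

    def get_pub_topic(self):
        return 'demo_topic'


class FakeReader:
    """Hypothetical reader matching read_data()/get_table_profile()."""

    def read_data(self):
        return pd.DataFrame({'a': [1, 2, 3]})

    def get_table_profile(self):
        return FakeProfile()


producer = DFProducer('localhost:9092')  # placeholder broker
producer.produce(FakeReader())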
from kafka import SimpleClient
from kafka.structs import OffsetRequestPayload


def kafka_get_topics_offsets(host, topic, port=9092):
    """Return available partitions and their offsets for the given topic.

    Args:
        host (str): Kafka host.
        topic (str): Kafka topic.
        port (int): Kafka port.

    Returns:
        [(int, int, int)]: [(partition, start_offset, end_offset)].
    """
    brokers = ['{}:{}'.format(host, port)]
    client = SimpleClient(brokers)
    offsets = []
    partitions = client.get_partition_ids_for_topic(topic)
    # Offset request time -1 asks for the latest (end) offset,
    # -2 for the earliest (start) offset.
    offsets_responses_end = client.send_offset_request([
        OffsetRequestPayload(topic, partition, -1, 1)
        for partition in partitions
    ])
    offsets_responses_start = client.send_offset_request([
        OffsetRequestPayload(topic, partition, -2, 1)
        for partition in partitions
    ])
    for start_offset, end_offset in zip(offsets_responses_start,
                                        offsets_responses_end):
        offsets.append((start_offset.partition, start_offset.offsets[0],
                        end_offset.offsets[0]))
    return offsets
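A minimal usage sketch for the helper above; 'localhost' and 'events' are placeholders for a real broker host and an existing topic:

if __name__ == '__main__':
    # Each tuple is (partition, start_offset, end_offset); the difference
    # between end and start is the number of retained messages.
    for partition, start, end in kafka_get_topics_offsets('localhost', 'events'):
        print('partition=%d start=%d end=%d backlog=%d'
              % (partition, start, end, end - start))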
def _get_partition_ids(self, topic, bootstrap_server):
    """Get the number of partitions for a specific topic.

    :param topic: (string) topic name
    :param bootstrap_server: (string) single bootstrap server 'host:port'
    :return: (int) number of partitions
    """
    client = SimpleClient(bootstrap_server)
    topic_partition_ids = client.get_partition_ids_for_topic(topic.encode('utf-8'))
    return len(topic_partition_ids)
import hashlib
import logging

from confluent_kafka import Producer
from kafka import KafkaClient


class KafkaWriter:

    def __init__(self, servers, topic):
        self._servers = servers
        self._topic = topic
        self._client = None
        self._partitions_count = 0

    def open(self):
        self._boot_topic()
        self._producer = Producer({'bootstrap.servers': self._servers})

    def write(self, event):
        self._producer.poll(0)
        # Asynchronously produce a message; the delivery report callback
        # will be triggered (from poll or flush) when the message has been
        # successfully delivered or has failed permanently.
        self._producer.produce(self._topic,
                               event.to_bytes(),
                               partition=self.partition_for_key(
                                   event.get_thread_id()),
                               callback=KafkaWriter.delivery_report)

    def close(self):
        self._producer.flush()
        self._client.close()

    def partition_for_key(self, thread_id):
        # Hash the key so events for the same thread always land on the
        # same partition.
        return int(hashlib.sha512(thread_id).hexdigest(), 16) % self._partitions_count

    def _boot_topic(self):
        self._client = KafkaClient(self._servers)

        if not self._client.has_metadata_for_topic(self._topic):
            raise IOError('Kafka topic was not found.')

        self._partitions_count = len(
            self._client.get_partition_ids_for_topic(self._topic))
        if self._partitions_count == 0:
            raise IOError('Kafka topic does not have any partition.')

    @staticmethod
    def delivery_report(err, msg):
        if err is not None:
            logging.error('Event delivery failed: {}'.format(err))
        elif logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            logging.debug('Event delivered to {} [{}]'.format(
                msg.topic(), msg.partition()))
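A sketch of how this writer might be driven; the Event class here is hypothetical and only mirrors the to_bytes()/get_thread_id() interface that write() expects, and the broker/topic are placeholders:

class Event:
    """Hypothetical event satisfying the interface used by write()."""

    def __init__(self, thread_id, payload):
        self._thread_id = thread_id
        self._payload = payload

    def get_thread_id(self):
        # partition_for_key() hashes raw bytes, so return bytes here.
        return self._thread_id.encode('utf-8')

    def to_bytes(self):
        return self._payload.encode('utf-8')


writer = KafkaWriter('localhost:9092', 'events')  # placeholder broker/topic
writer.open()
writer.write(Event('thread-1', 'hello'))
writer.close()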
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
            self.topic = topic

        if self.create_client:
            self.client = SimpleClient('%s:%d' % (self.server.host, self.server.port))

        timeout = time.time() + 30
        while time.time() < timeout:
            try:
                self.client.load_metadata_for_topics(self.topic, ignore_leadernotavailable=False)
                if self.client.has_metadata_for_topic(topic):
                    break
            except (LeaderNotAvailableError, InvalidTopicError):
                time.sleep(1)
        else:
            raise KafkaTimeoutError('Timeout loading topic metadata!')

        # Ensure topic partitions have been created on all brokers to avoid UnknownPartitionErrors
        # TODO: It might be a good idea to move this to self.client.ensure_topic_exists
        for partition in self.client.get_partition_ids_for_topic(self.topic):
            while True:
                try:
                    req = OffsetRequestPayload(self.topic, partition, -1, 100)
                    self.client.send_offset_request([req])
                    break
                except (NotLeaderForPartitionError, UnknownTopicOrPartitionError, FailedPayloadsError):
                    if time.time() > timeout:
                        raise KafkaTimeoutError('Timeout loading topic metadata!')
                    time.sleep(.1)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request([OffsetRequestPayload(topic, partition, -1, 1)])
        except Exception:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))
        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
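A sketch of a concrete test built on this fixture; it assumes the surrounding harness has already assigned self.server (and self.zk) before setUp runs, as the class attributes above imply:

class OffsetSmokeTest(KafkaIntegrationTestCase):

    def test_fresh_topic_starts_at_zero(self):
        # current_offset() asks for the latest offset (-1); a freshly
        # created topic should report offset 0 on partition 0.
        self.assertEqual(self.current_offset(self.topic, 0), 0)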
def spoorer(self):
    try:
        kafka_client = SimpleClient(self.kafka_hosts, timeout=self.timeout)
    except Exception:
        print("Error, cannot connect to kafka broker.")
        sys.exit(1)
    else:
        # Keep the client open: it is needed again below to look up the
        # partition ids per topic. (Closing it here, as the original code
        # did in a finally block, broke those later calls.)
        kafka_topics = kafka_client.topics

    try:
        zookeeper_client = KazooClient(hosts=self.zookeeper_hosts, read_only=True, timeout=self.timeout)
        zookeeper_client.start()
    except Exception:
        print("Error, cannot connect to zookeeper server.")
        sys.exit(1)

    try:
        groups = list(map(str, zookeeper_client.get_children(self.zookeeper_url + 'consumers')))
    except NoNodeError:
        print("Error, invalid zookeeper url.")
        zookeeper_client.stop()
        sys.exit(2)
    else:
        for group in groups:
            if 'offsets' not in zookeeper_client.get_children(self.zookeeper_url + 'consumers/%s' % group):
                continue
            topic_path = 'consumers/%s/offsets' % (group)
            topics = list(map(str, zookeeper_client.get_children(self.zookeeper_url + topic_path)))
            if not topics:
                continue

            for topic in topics:
                if topic not in self.white_topic_group:
                    continue
                elif group not in self.white_topic_group[topic].replace(' ', '').split(','):
                    continue
                partition_path = 'consumers/%s/offsets/%s' % (group, topic)
                partitions = list(map(int, zookeeper_client.get_children(self.zookeeper_url + partition_path)))

                for partition in partitions:
                    base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topic, partition)
                    owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
                    # kazoo returns node values as bytes; decode before use.
                    offset = zookeeper_client.get(self.zookeeper_url + offset_path)[0].decode('utf-8')
                    try:
                        owner = zookeeper_client.get(self.zookeeper_url + owner_path)[0].decode('utf-8')
                    except NoNodeError:
                        owner = 'null'

                    metric = {'datetime': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                              'topic': topic,
                              'group': group,
                              'partition': int(partition),
                              'logsize': None,
                              'offset': int(offset),
                              'lag': None,
                              'owner': owner}
                    self.result.append(metric)
    finally:
        zookeeper_client.stop()

    try:
        kafka_consumer = KafkaConsumer(bootstrap_servers=self.kafka_hosts)
    except Exception:
        print("Error, cannot connect to kafka broker.")
        sys.exit(1)
    else:
        for kafka_topic in kafka_topics:
            self.kafka_logsize[kafka_topic] = {}
            partitions = kafka_client.get_partition_ids_for_topic(kafka_topic)
            for partition in partitions:
                offset = kafka_consumer.get_partition_offsets(kafka_topic, partition, -1, 1)[0]
                self.kafka_logsize[kafka_topic][partition] = offset

        with open(self.log_file, 'w') as f1, open(self.log_day_file, 'a') as f2:
            for metric in self.result:
                logsize = self.kafka_logsize[metric['topic']][metric['partition']]
                metric['logsize'] = int(logsize)
                metric['lag'] = int(logsize) - int(metric['offset'])
                f1.write(json.dumps(metric, sort_keys=True) + '\n')
                f1.flush()
                f2.write(json.dumps(metric, sort_keys=True) + '\n')
                f2.flush()
    finally:
        kafka_consumer.close()
        kafka_client.close()

    return ''