# kafka-python: send a few messages and close the producer
# (producer setup added here; assumes a broker on localhost:9092)
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')
for i in range(3):
    msg = "msg %d" % i
    print(msg)
    producer.send('test', msg.encode('utf-8'))  # kafka-python expects bytes values
producer.close()

# Produce data with pykafka
from pykafka import KafkaClient

host = 'IP:9092,IP:9092,IP:9092'
client = KafkaClient(hosts=host)
# Producer
topicdocu = client.topics['my-topic']
producer = topicdocu.get_producer()
for i in range(100):
    print(i)
    producer.produce(('test message ' + str(i ** 2)).encode('utf-8'))  # pykafka expects bytes
producer.stop()

# List all topics known to the local broker
from pykafka import KafkaClient

client = KafkaClient(hosts="127.0.0.1:9092")
for topic in client.topics:
    print(topic)

# Inspect broker information
from pykafka import KafkaClient

client = KafkaClient(hosts="127.0.0.1:9092")
print(client.brokers)
for n in client.brokers:
    host = client.brokers[n].host
    print(host)
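
# A minimal sketch (assuming a local broker at 127.0.0.1:9092 and an existing
# 'my-topic' topic, both placeholders): pykafka's get_sync_producer() blocks on
# each produce() call until the broker acknowledges the message, which is handy
# for quickly verifying connectivity before switching to the asynchronous
# producer used above.
from pykafka import KafkaClient

client = KafkaClient(hosts="127.0.0.1:9092")
topic = client.topics['my-topic']
with topic.get_sync_producer() as producer:
    for i in range(3):
        # pykafka expects bytes payloads
        producer.produce(('sync message %d' % i).encode('utf-8'))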
message_value = event.encode('utf-8') if raw else \
    json.dumps(event, sort_keys=True).encode('utf-8')

# When an event is passed to "produce" it will end up in a local
# buffer (controlled by librdkafka) first, and only later will it
# be delivered to Kafka.
# That buffer might be full, so the following logic implements a
# simple retry before giving up.
event_enqueued = False
enqueue_retries = 0
while (not event_enqueued and
        enqueue_retries < KAFKA_CONFLUENT_RETRY_BUFFER_FULL):
    try:
        # Produce the message.
        enqueue_retries += 1
        kafka_producer.produce(message_topic, message_value, message_key)
        event_enqueued = True
    except BufferError:
        if enqueue_retries < KAFKA_CONFLUENT_RETRY_BUFFER_FULL:
            logging.warning(
                'Local produce queue full, waiting for '
                'events to be delivered.')
            kafka_producer.poll(0.5)
        else:
            logging.error(
                'Failed to enqueue an event to the local kafka '
                'producer queue after %d retries.' % enqueue_retries)
            raise

# If not async, flush the Kafka produce buffer now and block
# until we are done.
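
# A minimal sketch (broker address and topic name are assumptions) of the
# produce/poll/flush pattern the snippet above relies on with confluent-kafka:
# produce() only enqueues into the local librdkafka buffer, poll() serves
# delivery callbacks while that buffer drains, and flush() blocks until every
# queued message has been delivered or has failed.
from confluent_kafka import Producer

producer = Producer({'bootstrap.servers': 'localhost:9092'})


def on_delivery(err, msg):
    # Invoked from poll()/flush() once librdkafka knows the message's fate.
    if err is not None:
        print('Delivery failed: %s' % err)
    else:
        print('Delivered to %s [%d] at offset %d'
              % (msg.topic(), msg.partition(), msg.offset()))


for i in range(10):
    producer.produce('test', value=('event %d' % i).encode('utf-8'),
                     callback=on_delivery)
    producer.poll(0)  # serve any pending delivery callbacks without blocking

producer.flush()  # block until the local buffer is fully drained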
def run(argv):
    old_client = False
    # All six positional arguments are required:
    # test_duration msg_batch msg_requested_rate topic_name acks linger_ms
    if len(argv) > 6:
        test_duration = argv[1]
        msg_batch = argv[2]
        msg_requested_rate = argv[3]
        topic_name = argv[4]
        acks = argv[5]
        linger_ms = argv[6]
        msg_batch = int(msg_batch)
        msg_requested_rate = float(msg_requested_rate)
        test_duration = float(test_duration)
        topic_name = str(topic_name)
        acks = int(acks)
        linger_ms = int(linger_ms)

    # Initialize Kafka PUB Server
    l.info("Starting Kafka Publisher (producer)")

    # Estimate the average message size to compute batch_size in bytes,
    # as requested by Kafka
    min_message_size = len(str(0) + ' msg' + str(0))
    max_message_size = len(str(msg_requested_rate) + ' msg' +
                           str(msg_requested_rate))
    average_message_size = (min_message_size + max_message_size) / 2
    batch_estimated_size = int(average_message_size * msg_batch)
    l.info("Message average size is: [%s]. Kafka batch size in bytes set to: [%s]"
           % (average_message_size, batch_estimated_size))

    if old_client:
        producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                 batch_size=batch_estimated_size,
                                 linger_ms=linger_ms,
                                 acks=acks)
    else:
        client = KafkaClient(hosts='localhost:9092')
        topic = client.topics[topic_name]
        producer = topic.get_producer(min_queued_messages=batch_estimated_size,
                                      linger_ms=linger_ms,
                                      required_acks=acks)

    # Initialize a simple REP server; this is used to listen
    # for the signal to start sending data
    pub_rep_port = os.environ.get('PORT0')
    l.info("STARTING KAFKA REP server at port [%s].", pub_rep_port)
    run_data = {'start': False,
                'stats': {'rate': 0, 'msg_cnt': 0},
                'test_status': 'stopped'}
    pub_metrics = {'test_duration': test_duration,
                   'msg_batch': msg_batch,
                   'msg_requested_rate': msg_requested_rate}
    hd = HDKafkapRepSrv(pub_rep_port, run_data, pub_metrics)
    hd.run()

    while True:
        # Wait for the 'signal' to start sending messages to the Kafka broker
        if not run_data['start']:
            l.debug("KAFKA PUB WAITING FOR SIGNAL...")
            time.sleep(1)
            continue

        l.info('PUB server initiating... Test duration [%f] secs. Message batches [%d] '
               'and requested msg rate [%f]'
               % (hd.test_duration, hd.msg_batch, hd.msg_requested_rate))
        cnt = 0
        msg_cnt = 0
        start_time = time.time()

        # Start publishing messages to the broker
        while True:
            # Build the 'message'
            messagedata = "msg%d" % msg_cnt
            message = "%d %s" % (msg_cnt, messagedata)
            try:
                # Publish the message to the Kafka cluster
                # topic: specifies the 'topic' where the message will be published
                if old_client:
                    producer.send(topic=topic_name, value=message)
                else:
                    producer.produce(message)
            except KafkaTimeoutError as e:
                l.error("Unable to publish message to the Kafka cluster. "
                        "ERROR: %s" % e)

            # Insert a 'delay' if the tx rate between batches outperforms the
            # expected (minimum) rate needed to achieve the requested tx rate
            cnt += 1
            msg_cnt += 1
            if cnt >= hd.msg_batch:
                # Compute the delay
                duration = time.time() - start_time
                expected_time = msg_cnt / hd.msg_requested_rate
                delay = 0.0
                if expected_time > duration:
                    delay = expected_time - duration
                    if delay > 1:
                        delay = 1
                time.sleep(delay)
                cnt = 0

            elapsed_time = time.time() - start_time
            if elapsed_time >= hd.test_duration:
                break

        # Report 'stats' back to 'hd' (HDaemon)
        run_data['stats']['time:end'] = json.dumps(time.time())
        run_data['stats']['rate'] = msg_cnt / elapsed_time
        run_data['stats']['msg_cnt'] = msg_cnt
        process = psutil.Process()
        run_data['stats']['net:end'] = json.dumps(psutil.net_io_counters())
        run_data['stats']['cpu:end'] = json.dumps(process.cpu_times())
        run_data['stats']['mem:end'] = json.dumps(process.memory_info())
        run_data['test_status'] = 'stopping'

        # Go back to waiting for the next test
        run_data['start'] = False
        continue

    producer.close()
    l.info("PUB Server stopping after sending %d messages, elapsed time %f "
           "and message rate %f"
           % (msg_cnt, elapsed_time, run_data['stats']['rate']))
if not raw and key:
    try:
        message_key = key.format(**event)
    # If we failed to get the key, log and skip the event.
    except KeyError as e:
        logging.error(
            'Could not get message key from event. KeyError: %s. '
            'Skipping event.' % e
        )
        continue

message_value = event.encode('utf-8') if raw else \
    json.dumps(event, sort_keys=True).encode('utf-8')

# Produce the message.
kafka_producer.produce(message_topic, message_value, message_key)

# If not async, flush the Kafka produce buffer now and block
# until we are done.
if not async:
    kafka_producer.flush()


@writes('mysql', 'sqlite')
def sql_writer(
    uri,
    replace=False,
    statsd_host='',
    batch_size=3000,
    batch_time=300
):
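
# A hypothetical illustration of the key templating used above: the `key`
# argument is treated as a str.format() template and filled in from the event
# dict, so any field it references must be present or the event is skipped
# with a KeyError. The event and template below are made up.
event = {'user_id': 42, 'action': 'login'}
key_template = '{user_id}'
message_key = key_template.format(**event)  # -> '42'
print(message_key)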
import json
import random

from kafka import KafkaProducer

# kafka-python's producer API is send(), not produce(); serialize dicts to JSON
etl_producer = KafkaProducer(
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))


def create_features(data):
    for rec in data:
        feat_y = rec['field_y'] + random.randint(0, 3)
        feat_z = random.choice(['D', 'E', 'F'])
        etl_producer.send('features', {
            'id': rec['id'],
            'feat_y': feat_y,
            'feat_z': feat_z
        })


if __name__ == '__main__':
    data = [{
        'id': random.randint(500, 600),
        'field_x': random.choice(['A', 'B', 'C']),
        'field_y': random.choice([1, 2, 3])
    } for _ in range(10)]
    etl_producer.send('raw_fields', data)
    create_features(data)
    etl_producer.flush()  # send() is asynchronous; block until delivery
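
# A minimal sketch (topic names and broker address assumed from the ETL
# example above) of reading the produced records back with kafka-python's
# KafkaConsumer, mirroring the JSON serialization used by the producer.
import json

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'raw_fields', 'features',
    bootstrap_servers='localhost:9092',
    auto_offset_reset='earliest',
    consumer_timeout_ms=5000,  # stop iterating after 5s of inactivity
    value_deserializer=lambda v: json.loads(v.decode('utf-8')))

for record in consumer:
    print(record.topic, record.value)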