def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")

    while True:
        # get up to 100 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            # no messages yet; poll again immediately
            continue
        for message in messages:
            tempfile.write(message.message.value)
        if tempfile.tell() > 120000000:  # file size > 120 MB
            print "Note: file is large enough to write to HDFS. Writing now..."
            flush_to_hdfs(output_dir, topic)
            kafka_consumer.commit()  # inform ZooKeeper of our position in the Kafka queue
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")

    while True:
        # get up to 100 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            # no messages yet; poll again immediately
            continue
        for message in messages:
            print(message.message.value)  # debug: echo each message
            tempfile.write(message.message.value)
            tempfile.write("\n")
        if tempfile.tell() > 120000000:  # file size > 120 MB
            print "Note: file is large enough to write to HDFS. Writing now..."
            flush_to_hdfs(output_dir, topic)
            kafka_consumer.commit()  # inform ZooKeeper of our position in the Kafka queue
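Both variants above busy-wait on `continue` when the topic is idle, pinning a CPU core. Below is a minimal sketch of the same poll loop with a short sleep between empty polls; it assumes the kafka-python 0.9.x-era `SimpleConsumer` API used throughout these listings, and the broker address, group, and topic names are placeholders.

import time

from kafka import KafkaClient, SimpleConsumer

kafka = KafkaClient("localhost:9092")  # placeholder broker
consumer = SimpleConsumer(kafka, "my-group", "my-topic")

while True:
    # non-blocking fetch of up to 100 messages
    messages = consumer.get_messages(count=100, block=False)
    if not messages:
        time.sleep(1)  # back off briefly instead of spinning
        continue
    for message in messages:
        print message.message.value
    consumer.commit()  # record our position after each batch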
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    # get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_stockTwits_%s_%s_%s_%s.dat" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  # did we log at least one entry?

    while True:
        # get up to 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            os.system("sleep 300s")  # sleep 5 minutes
            continue
        for message in messages:
            # OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  # file size > 10 MB
            flush_to_hdfs(output_dir, topic)
            kafka_consumer.commit()  # save position in the kafka queue

    # exit loop (unreachable as written: the loop above never breaks)
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    # get timestamp
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    while True:
        # get up to 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            os.system("sleep 30s")
            continue
        for message in messages:
            # OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            print message
        kafka_consumer.commit()  # save position in the kafka queue

    # exit loop (unreachable as written: the loop above never breaks;
    # log_has_at_least_one is also never assigned in this variant)
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    # get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  # did we log at least one entry?

    while True:
        # get up to 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            break
        for message in messages:
            # OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  # file size > 10 MB
            flush_to_hdfs(output_dir, topic)
            kafka_consumer.commit()

    # exited loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
def consume_topic(callback_url, consumer_group, topic):
    # (as written, `self` must come from an enclosing scope)
    consumer = None
    try:
        consumer = SimpleConsumer(self.kafka, consumer_group, topic, auto_commit=False)
        messages_read = 0

        # we can't read messages indefinitely here as we have a lot of
        # topics/subscribers (many more than the threadpool size)
        while messages_read < self.max_read_messages_per_cycle:
            # get one message and monitor the time
            start = monitoring.start_time_measure()
            message = consumer.get_message(block=False)
            ms_elapsed = monitoring.stop_time_measure(start)
            self.metrics['kafka_read'].add({'topic': topic}, ms_elapsed)

            # if there are no messages for this topic/subscriber, quit and give others a chance
            if message is None:
                logging.info('No messages for topic: %s and callback: %s, quitting the thread',
                             topic, callback_url)
                break

            try:
                event = json.loads(message.message.value.decode('utf-8'))
                response_status = self.forward_event(callback_url, event, topic)
                # if the status is a success, mark the message as consumed by this subscriber
                if 200 <= response_status < 300:
                    consumer.commit()
                else:
                    logging.info('Received error response from consumer: %s', response_status)
            except:
                logging.error("Exception while sending event to consumer")
                logging.error(traceback.format_exc())
            finally:
                messages_read += 1

        return messages_read
    except UnknownTopicOrPartitionError:
        logging.error('Adding %s to skip list', topic)
    except:
        logging.exception('failed to create kafka client')
    finally:
        if consumer is not None:
            consumer.stop()
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from the Friendsquare topic, save them to a temporary
    file under temp_dir, then transfer the file to HDFS.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # Create a temp file to store messages
    self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)
    # Create an hdfs directory to store output files
    os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

    while self.count < self.max_count:
        # Get up to 1000 messages each time
        messages = kafka_receiver.get_messages(count=1000, block=False)
        if not messages:
            continue
        # Write the messages to the file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')
        # Cap each file at about 20 MB
        if temp_file.tell() > 20000000:
            temp_file.close()
            # Put the file on hdfs
            hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path, hdfs_path))
            # remove the old file
            os.remove(self.temp_file_path)
            # Create a new temp file to store messages
            self.count += 1
            self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
            temp_file = open(self.temp_file_path, 'w')
            # Inform zookeeper of the position in the kafka queue
            kafka_receiver.commit()
    temp_file.close()
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from Kafka, save them to a temporary file first,
    then transfer the file to HDFS.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    self.timestamp = self.getTimestamp()
    # Create a temp file to store messages
    self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    while self.count < self.max_count:
        # Get up to 100 messages each time
        messages = kafka_receiver.get_messages(count=100, block=False)
        if not messages:
            continue
        # Write the messages to the file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')
        # For structured streaming, files need to be small at this point; cap them at about 2 MB
        if temp_file.tell() > 2000000:
            temp_file.close()
            # Copy the file to hdfs
            output_dir = "%s/%s" % (self.hdfs_dir, topic)
            os.system("hdfs dfs -mkdir %s" % output_dir)
            hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path, hdfs_path))
            # remove the old file
            os.remove(self.temp_file_path)
            # Create a new temp file to store messages
            self.count += 1
            self.timestamp = self.getTimestamp()
            self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp, str(self.count))
            temp_file = open(self.temp_file_path, 'w')
            # Inform zookeeper of the position in the kafka queue
            kafka_receiver.commit()
    temp_file.close()
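Several of the listings above call a `flush_to_hdfs(output_dir, topic)` helper without showing it. Based on the inline `hdfs dfs -put` sequence in the two methods just above, a plausible reconstruction follows; the module-level globals and the file-naming scheme are assumptions, not the original code.

import os

def flush_to_hdfs(output_dir, topic):
    """Hypothetical sketch: close the current temp file, ship it to HDFS,
    and open a fresh one. Mirrors the inline pattern used above."""
    global tempfile, tempfile_path, batch_counter
    tempfile.close()
    hdfs_dir = "%s/%s" % (output_dir, topic)
    os.system("hdfs dfs -mkdir -p %s" % hdfs_dir)
    os.system("hdfs dfs -put -f %s %s/%s.txt" % (tempfile_path, hdfs_dir, batch_counter))
    os.remove(tempfile_path)
    batch_counter += 1
    tempfile_path = "/tmp/kafka_%s_%s.txt" % (topic, batch_counter)
    tempfile = open(tempfile_path, "w")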
def consume_save(group, topic):
    tmp_save = open(tmp_file_path, "w")
    kafka_consumer = SimpleConsumer(kafka, group, topic)
    messages = kafka_consumer.get_messages(count=1000, block=False)
    if not messages:
        print "Consumer didn't read any messages"
    for message in messages:
        tmp_save.write(message.message.value + "\n")
    kafka_consumer.commit()  # inform zookeeper of the position in the kafka queue
    print ".... ... .. .."
    print "Message from topic \"%s\" consumed \n" % topic
def consume_save(group, topic):
    tmp_save = open(tmp_file_path, "w")
    # construct the consumer once, outside the loop (the original
    # recreated it on every iteration, which is wasteful)
    kafka_consumer = SimpleConsumer(kafka, group, topic)
    while True:
        messages = kafka_consumer.get_messages(count=1000, block=False)
        for message in messages:
            tmp_save.write(message.message.value + "\n")
            print message.message.value + "\n"
        if tmp_save.tell() > 20000000:  # file size > 20 MB
            push_to_hdfs(tmp_file_path)
        kafka_consumer.commit()  # inform zookeeper of the position in the kafka queue
def test_simple_consumer_commit_does_not_raise(self):
    client = MagicMock()
    client.get_partition_ids_for_topic.return_value = [0, 1]

    def mock_offset_fetch_request(group, payloads, **kwargs):
        return [OffsetFetchResponsePayload(p.topic, p.partition, 0, b'', 0)
                for p in payloads]

    client.send_offset_fetch_request.side_effect = mock_offset_fetch_request

    def mock_offset_commit_request(group, payloads, **kwargs):
        raise FailedPayloadsError(payloads[0])

    client.send_offset_commit_request.side_effect = mock_offset_commit_request

    consumer = SimpleConsumer(client, group='foobar',
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    # Mock internal commit check
    consumer.count_since_commit = 10

    # This should not raise an exception
    self.assertFalse(consumer.commit(partitions=[0, 1]))
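The test verifies that `SimpleConsumer.commit()` swallows `FailedPayloadsError` and returns `False` rather than raising; it also hints that `commit()` short-circuits when `count_since_commit` is zero, which is why the test sets it explicitly. Callers that care about durability should therefore check the return value. A minimal sketch (the helper name is ours):

def commit_with_retry(consumer, attempts=3):
    # commit() reports failure via its return value rather than an
    # exception, so retry explicitly instead of relying on try/except.
    for _ in range(attempts):
        if consumer.commit():
            return True
    return False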
class KafkaSpout(Spout):

    def initialize(self, stormconf, context):
        self.kafka = KafkaClient("cloud.soumet.com:9092")
        self.consumer = SimpleConsumer(self.kafka, "storm", "realtime",
                                       max_buffer_size=1310720000)

    def next_tuple(self):
        for message in self.consumer.get_messages(count=500, block=False):
            self.emit([message.message.value])
        self.consumer.commit()
class QueueKafka(QueueBase.QueueBase):

    @QueueBase.catch
    def __init__(self, name, host='web14', port=51092, **kwargs):
        QueueBase.QueueBase.__init__(self, name, host, port)
        self.__queue = []
        self.__kafka = KafkaClient('%s:%d' % (host, port))
        self.__producer = SimpleProducer(self.__kafka, async=kwargs.get('async', False))
        self.__producer.client.ensure_topic_exists(self.name)
        self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer', self.name,
                                         auto_commit_every_n=1)

    def __del__(self):
        if self.__kafka:
            [self.put(x.message.value) for x in self.__queue]
            self.__kafka.close()

    @QueueBase.catch
    def put(self, value, *args, **kwargs):
        if isinstance(value, dict) or isinstance(value, list):
            self.__producer.send_messages(self.name, json.dumps(value))
        else:
            self.__producer.send_messages(
                self.name, value.encode('utf-8') if isinstance(value, unicode) else value)

    @QueueBase.catch
    def get(self, *args, **kwargs):
        if not self.__queue:
            self.__consumer._fetch()
            kq = self.__consumer.queue
            while not kq.empty():
                partition, result = kq.get_nowait()
                self.__queue.append(result)
                self.__consumer.offsets[partition] += 1
                self.__consumer.count_since_commit += 1
            self.__consumer.queue = Queue()
            self.__consumer.commit()
        return self.__queue.pop().message.value if self.__queue else None

    @QueueBase.catch
    def size(self, *args, **kwargs):
        count = 0
        for k, v in self.__consumer.offsets.items():
            reqs = [common.OffsetRequest(self.name, k, -1, 1)]
            (resp, ) = self.__consumer.client.send_offset_request(reqs)
            count += (resp.offsets[0] - v)
        return count + len(self.__queue)
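A hedged usage sketch for the queue wrapper above; the queue name and payload are placeholders, and `put`/`get`/`size` behave as defined in the class:

q = QueueKafka('events')                 # topic 'events' is created if missing
q.put({'user': 42, 'action': 'login'})   # dicts/lists are JSON-encoded
print q.size()                           # broker offset lag plus local buffer
print q.get()                            # drains one fetched message, or None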
def _run(self):
    pcount = 0
    while True:
        try:
            self._logger.info("New KafkaClient %d" % self._partition)
            kafka = KafkaClient(self._brokers, str(os.getpid()))
            try:
                consumer = SimpleConsumer(kafka, self._group, self._topic,
                                          buffer_size=4096 * 4,
                                          max_buffer_size=4096 * 32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.info("%s" % messag)
                raise gevent.GreenletExit

            self._logger.info("Starting %d" % self._partition)

            # Find the offset of the last message that has been queued
            consumer.seek(0, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %d is %s" % \
                (self._partition, str(mi)))

            # start reading from the last previously processed message
            consumer.seek(0, 1)

            if mi is not None:
                count = 0
                self._logger.info("Catching Up %d" % self._partition)
                loff = mi.offset
                coff = 0
                while True:
                    try:
                        mm = consumer.get_message(timeout=None)
                        count += 1
                        if not self.msg_handler(mm):
                            self._logger.info("%d could not process %s" % (self._partition, str(mm)))
                            raise gevent.GreenletExit
                        consumer.commit()
                        coff = mm.offset
                        self._logger.info("Syncing offset %d" % coff)
                        if coff == loff:
                            break
                    except Exception as ex:
                        self._logger.info("Sync Error %s" % str(ex))
                        break
                if coff != loff:
                    self._logger.info("Sync Failed for %d count %d" % (self._partition, count))
                    continue
                else:
                    self._logger.info("Sync Completed for %d count %d" % (self._partition, count))

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mm = consumer.get_message(timeout=None)
                    if mm is None:
                        continue
                    consumer.commit()
                    pcount += 1
                    if not self.msg_handler(mm):
                        self._logger.info("%d could not handle %s" % (self._partition, str(mm)))
                        raise gevent.GreenletExit
                except TypeError:
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.info("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.info("%s" % messag)
            gevent.sleep(1)

    self._logger.info("Stopping %d pcount %d" % (self._partition, pcount))
    return self._partoffset, self._partdb
from __future__ import absolute_import, print_function

from kafka import KafkaClient, SimpleConsumer

kafka = KafkaClient("cloud.soumet.com:9092")
kafka_consumer = SimpleConsumer(kafka, "storm", "realtime",
                                max_buffer_size=1310720000)

for message in kafka_consumer.get_messages(count=5000, block=False):
    print(message.message.value)

kafka_consumer.commit()
def _run(self):
    pcount = 0
    while True:
        try:
            self._logger.error("New KafkaClient %d" % self._partition)
            self._kfk = KafkaClient(self._brokers, str(os.getpid()))
            try:
                consumer = SimpleConsumer(self._kfk, self._group, self._topic,
                                          buffer_size=4096 * 4,
                                          max_buffer_size=4096 * 32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.info("%s" % messag)
                raise RuntimeError(messag)

            self._logger.error("Starting %d" % self._partition)

            # Find the offset of the last message that has been queued
            consumer.seek(0, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
                consumer.commit()
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %d is %s" % \
                (self._partition, str(mi)))

            # start reading from the last previously processed message
            if mi is not None:
                consumer.seek(0, 1)
            else:
                consumer.seek(0, 0)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    self.resource_check()
                    mlist = consumer.get_messages(10, timeout=0.2)
                    for mm in mlist:
                        if mm is None:
                            continue
                        self._logger.debug("%d Reading offset %d" % \
                            (self._partition, mm.offset))
                        consumer.commit()
                        pcount += 1
                        if not self.msg_handler(mm):
                            self._logger.info("%d could not handle %s" % (self._partition, str(mm)))
                            raise gevent.GreenletExit
                except TypeError as ex:
                    self._logger.error("Type Error: %s trace %s" % \
                        (str(ex.args), traceback.format_exc()))
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.error("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except AssertionError as ex:
            self._partoffset = ex
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s" % \
                (messag, traceback.format_exc()))
            self.stop_partition()
            gevent.sleep(2)

    partdb = {}
    for coll in self._uvedb.keys():
        partdb[coll] = {}
        for gen in self._uvedb[coll].keys():
            partdb[coll][gen] = {}
            for tab in self._uvedb[coll][gen].keys():
                for rkey in self._uvedb[coll][gen][tab].keys():
                    uk = tab + ":" + rkey
                    partdb[coll][gen][uk] = \
                        set(self._uvedb[coll][gen][tab][rkey].keys())

    self._logger.error("Stopping %d pcount %d" % (self._partition, pcount))
    self.stop_partition()
    return self._partoffset, partdb
                ph.start()
                workers[int(mm.key)] = ph
            elif mm.value == "stop":
                if int(mm.key) in workers:
                    ph = workers[int(mm.key)]
                    gevent.kill(ph)
                    res, db = ph.get()
                    print "Returned " + str(res)
                    print "State :"
                    for k, v in db.iteritems():
                        print "%s -> %s" % (k, str(v))
                    del workers[int(mm.key)]
            else:
                end_ready = True
                cons.commit()
                gevent.sleep(2)
                break
        except TypeError:
            gevent.sleep(0.1)
        except common.FailedPayloadsError as ex:
            print "Payload Error: " + str(ex.args)
            gevent.sleep(0.1)

    lw = []
    for key, value in workers.iteritems():
        gevent.kill(value)
        lw.append(value)
    gevent.joinall(lw)
    print "Ending Consumers"
def _run(self):
    pcount = 0
    pause = False
    while True:
        try:
            if pause:
                gevent.sleep(2)
                pause = False
            self._logger.error("New KafkaClient %s" % self._topic)
            self._kfk = KafkaClient(self._brokers, "kc-" + self._topic)
            try:
                consumer = SimpleConsumer(self._kfk, self._group, self._topic,
                                          buffer_size=4096 * 4,
                                          max_buffer_size=4096 * 32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("Error: %s trace %s" % \
                    (messag, traceback.format_exc()))
                raise RuntimeError(messag)

            self._logger.error("Starting %s" % self._topic)

            # Find the offset of the last message that has been queued
            consumer.seek(-1, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
                consumer.commit()
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %s is %s" % \
                (self._topic, str(mi)))

            # start reading from the last previously processed message
            if mi is not None:
                consumer.seek(-1, 1)
            else:
                consumer.seek(0, 0)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mlist = consumer.get_messages(10, timeout=0.5)
                    if not self.msg_handler(mlist):
                        raise gevent.GreenletExit
                    consumer.commit()
                    pcount += len(mlist)
                except TypeError as ex:
                    self._logger.error("Type Error: %s trace %s" % \
                        (str(ex.args), traceback.format_exc()))
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.error("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except AssertionError as ex:
            self._partoffset = ex
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s" % \
                (messag, traceback.format_exc()))
            self.stop_partition()
            pause = True

    self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
    partdb = self.stop_partition()
    return self._partoffset, partdb
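The `_run` variants above differ in how they position the consumer on startup: some call `seek(0, 2)`, others `seek(-1, 2)`. `SimpleConsumer.seek(offset, whence)` follows file-seek semantics, which the following annotated calls summarize (our annotations, same API):

consumer.seek(0, 0)   # whence=0: absolute -- earliest available offset
consumer.seek(0, 1)   # whence=1: relative to the current offset (a no-op here)
consumer.seek(0, 2)   # whence=2: tail -- the next message yet to arrive
consumer.seek(-1, 2)  # one before the tail -- re-read the last queued message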
class KafkaSimpleConsumer(object):
    """ Base class for consuming from kafka.

    Implements the logic to connect to kafka and consume messages.
    KafkaSimpleConsumer is a wrapper around kafka-python SimpleConsumer and
    relies on it to consume messages from kafka. KafkaSimpleConsumer does
    not catch exceptions raised by kafka-python.

    An instance of this class can be used as an iterator to consume
    messages from kafka.

    .. warning:: This class is considered deprecated in favor of
       :py:class:`yelp_kafka.consumer_group.KafkaConsumerGroup`.

    :param topic: topic to consume from.
    :type topic: string.
    :param config: consumer configuration.
    :type config: dict.
    :param partitions: topic partitions to consume from.
    :type partitions: list.
    """

    def __init__(self, topic, config, partitions=None):
        self.log = logging.getLogger(self.__class__.__name__)
        if not isinstance(topic, six.string_types):
            raise TypeError("Topic must be a string")
        self.topic = kafka_bytestring(topic)
        if partitions and not isinstance(partitions, list):
            raise TypeError("Partitions must be a list")
        self.partitions = partitions
        self.kafka_consumer = None
        self.config = config

    def connect(self):
        """ Connect to kafka and create a consumer.

        It uses config parameters to create a kafka-python
        KafkaClient and SimpleConsumer.
        """
        # Instantiate a kafka client connected to kafka.
        self.client = KafkaClient(
            self.config.broker_list,
            client_id=self.config.client_id
        )

        # Create a kafka SimpleConsumer.
        self.kafka_consumer = SimpleConsumer(
            client=self.client, topic=self.topic, partitions=self.partitions,
            **self.config.get_simple_consumer_args()
        )
        self.log.debug(
            "Connected to kafka. Topic %s, partitions %s, %s",
            self.topic,
            self.partitions,
            ','.join(['{0} {1}'.format(k, v) for k, v in
                      six.iteritems(self.config.get_simple_consumer_args())])
        )
        self.kafka_consumer.provide_partition_info()

    def __iter__(self):
        for partition, kafka_message in self.kafka_consumer:
            yield Message(
                partition=partition,
                offset=kafka_message[0],
                key=kafka_message[1].key,
                value=kafka_message[1].value,
            )

    def __enter__(self):
        self.connect()
        return self  # return the consumer so "with ... as" bindings work

    def __exit__(self, type, value, tb):
        self.close()

    def close(self):
        """Disconnect from kafka.

        If auto_commit is enabled, commit offsets before disconnecting.
        """
        if self.kafka_consumer.auto_commit is True:
            try:
                self.commit()
            except:
                self.log.exception("Commit error. "
                                   "Offsets may not have been committed")
        # Close all the connections to kafka brokers. KafkaClient opens
        # connections to all the partition leaders.
        self.client.close()

    def get_message(self, block=True, timeout=0.1):
        """Get message from kafka. It supports the same arguments as
        get_message in kafka-python SimpleConsumer.

        :param block: If True, the API will block till at least a message is fetched.
        :type block: boolean
        :param timeout: If block is True, the function will block for the
            specified time (in seconds). If None, it will block forever.

        :returns: a Kafka message
        :rtype: Message namedtuple, which consists of: partition number,
            offset, key, and message value
        """
        fetched_message = self.kafka_consumer.get_message(block, timeout)
        if fetched_message is None:
            # get_message timed out
            return None
        else:
            partition, kafka_message = fetched_message
            return Message(
                partition=partition,
                offset=kafka_message[0],
                key=kafka_message[1].key,
                value=kafka_message[1].value,
            )

    def commit(self, partitions=None):
        """Commit offset for this consumer group.

        :param partitions: list of partitions to commit; defaults to
            committing all partitions.
        :return: True on success, False on failure.
        """
        if partitions:
            return self.kafka_consumer.commit(partitions)
        else:
            return self.kafka_consumer.commit()

    def commit_message(self, message):
        """Commit the message offset for this consumer group.

        This function does not take care of the consumer offset tracking.
        It should only be used if auto_commit is disabled and the commit
        function is never called.

        .. note:: all the messages received before message itself will be
           committed as a consequence.

        :param message: message to commit.
        :type message: Message namedtuple, which consists of: partition
            number, offset, key, and message value
        :return: True on success, False on failure.
        """
        reqs = [
            OffsetCommitRequest(
                self.topic,
                message.partition,
                message.offset,
                None,
            )
        ]
        try:
            if self.config.offset_storage in [None, 'zookeeper', 'dual']:
                self.client.send_offset_commit_request(self.config.group_id, reqs)
            if self.config.offset_storage in ['kafka', 'dual']:
                self.client.send_offset_commit_request_kafka(self.config.group_id, reqs)
        except KafkaError as e:
            self.log.error("%s saving offsets: %s", e.__class__.__name__, e)
            return False
        else:
            return True
def _run(self):
    pcount = 0
    pause = False
    while True:
        try:
            if pause:
                gevent.sleep(2)
                pause = False
            self._logger.error("New KafkaClient %s" % self._topic)
            self._kfk = KafkaClient(self._brokers, "kc-" + self._topic)
            self._failed = False
            try:
                consumer = SimpleConsumer(self._kfk, self._group, self._topic,
                                          buffer_size=4096 * 4,
                                          max_buffer_size=4096 * 32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("Error: %s trace %s" % \
                    (messag, traceback.format_exc()))
                self._failed = True
                raise RuntimeError(messag)

            self._logger.error("Starting %s" % self._topic)

            # Find the offset of the last message that has been queued
            consumer.seek(-1, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
                consumer.commit()
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %s is %s" % \
                (self._topic, str(mi)))

            # start reading from the last previously processed message
            if mi is not None:
                consumer.seek(-1, 1)
            else:
                consumer.seek(0, 0)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mlist = consumer.get_messages(10, timeout=0.5)
                    if not self.msg_handler(mlist):
                        raise gevent.GreenletExit
                    consumer.commit()
                    pcount += len(mlist)
                except TypeError as ex:
                    self._logger.error("Type Error: %s trace %s" % \
                        (str(ex.args), traceback.format_exc()))
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.error("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except AssertionError as ex:
            self._partoffset = ex
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s" % \
                (messag, traceback.format_exc()))
            self.stop_partition()
            self._failed = True
            pause = True

    self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
    partdb = self.stop_partition()
    return self._partoffset, partdb
def _run(self):
    pcount = 0
    while True:
        try:
            self._logger.info("New KafkaClient %d" % self._partition)
            kafka = KafkaClient(self._brokers, str(os.getpid()))
            try:
                consumer = SimpleConsumer(kafka, self._group, self._topic,
                                          buffer_size=4096 * 4,
                                          max_buffer_size=4096 * 32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.info("%s" % messag)
                raise RuntimeError(messag)

            self._logger.info("Starting %d" % self._partition)

            # Find the offset of the last message that has been queued
            consumer.seek(0, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
                consumer.commit()
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %d is %s" % \
                (self._partition, str(mi)))
            self.start_partition()

            # start reading from the last previously processed message
            consumer.seek(0, 1)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mm = consumer.get_message(timeout=None)
                    if mm is None:
                        continue
                    self._logger.debug("%d Reading offset %d" % (self._partition, mm.offset))
                    consumer.commit()
                    pcount += 1
                    if not self.msg_handler(mm):
                        self._logger.info("%d could not handle %s" % (self._partition, str(mm)))
                        raise gevent.GreenletExit
                except TypeError:
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.info("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.info("%s : traceback %s" % \
                (messag, traceback.format_exc()))
            self.stop_partition()
            gevent.sleep(2)

    self._logger.info("Stopping %d pcount %d" % (self._partition, pcount))
    return self._partoffset, self._partdb
from kafka import SimpleClient, SimpleConsumer

kafka_conn = SimpleClient("192.168.208.2:9092")
consumer = SimpleConsumer(kafka_conn, "sample_check", "test-topic",
                          auto_commit=True)

for message in consumer.get_messages(count=10):
    print message.offset

consumer.commit()
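The listing above enables `auto_commit` and then also commits manually. If the intent is manual offset management, the usual arrangement is `auto_commit=False` with an explicit `commit()` after processing; note, though, the caveat in the persister listing below, which keeps `auto_commit=True` purely so offsets are initialized correctly on restart. A minimal sketch against the same broker address:

from kafka import SimpleClient, SimpleConsumer

kafka_conn = SimpleClient("192.168.208.2:9092")
consumer = SimpleConsumer(kafka_conn, "sample_check", "test-topic",
                          auto_commit=False)  # commit explicitly instead

for message in consumer.get_messages(count=10):
    print message.offset

consumer.commit()  # persist the position once the batch is processed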
class AbstractPersister(threading.Thread):

    def __init__(self, kafka_conf, influxdb_conf):
        super(AbstractPersister, self).__init__()

        kafka = KafkaClient(kafka_conf.uri)
        self._consumer = SimpleConsumer(
            kafka,
            kafka_conf.group_id,
            kafka_conf.topic,
            # Set to True even though we actually do the commits manually.
            # Needed to initialize offsets correctly.
            auto_commit=True,
            # Make these values None so that the manual commit will do the
            # actual commit. Needed so that offsets are initialized
            # correctly. If not done, then restarts will reread messages
            # from the beginning of the queue.
            auto_commit_every_n=None,
            auto_commit_every_t=None,
            iter_timeout=1)

        self._influxdb_client = InfluxDBClient(influxdb_conf.ip_address,
                                               influxdb_conf.port,
                                               influxdb_conf.user,
                                               influxdb_conf.password,
                                               influxdb_conf.database_name)

        self._max_wait_time_secs = kafka_conf.max_wait_time_seconds
        self._batch_size = kafka_conf.batch_size
        self._kafka_topic = kafka_conf.topic

        self._json_body = []
        self._last_flush = datetime.now()

    @abc.abstractmethod
    def process_message(self, message):
        pass

    def _flush(self):
        if self._json_body:
            self._influxdb_client.write_points(self._json_body)
            self._consumer.commit()
            LOG.info("processed {} messages from topic '{}'".format(
                len(self._json_body), self._kafka_topic))
            self._json_body = []
        self._last_flush = datetime.now()

    def run(self):
        try:
            while True:
                delta_time = datetime.now() - self._last_flush
                if delta_time.seconds > self._max_wait_time_secs:
                    self._flush()
                for message in self._consumer:
                    try:
                        self._json_body.append(self.process_message(message))
                    except Exception:
                        LOG.exception('Error processing message. Message is '
                                      'being dropped. {}'.format(message))
                    if len(self._json_body) >= self._batch_size:
                        self._flush()
        except:
            LOG.exception('Persister encountered fatal exception processing '
                          'messages. Shutting down all threads and exiting')
            os._exit(1)
class kafka:

    def __init__(self, host, port, table_name, **args):
        """
        :param host
        :param port
        :param table_name
        :return: kafka
        """
        self.queue = []
        self.queue_name = table_name.replace(":", "_")
        self.kafka = KafkaClient(hosts=host, client_id=self.queue_name)
        self.producer = SimpleProducer(self.kafka,
                                       async=args['async'] if 'async' in args else False)
        self.producer.client.ensure_topic_exists(self.queue_name)
        self.consumer = SimpleConsumer(self.kafka, self.queue_name + "_consumer",
                                       self.queue_name,
                                       auto_commit_every_n=1,
                                       max_buffer_size=None)
        print 'success init kafka connection'

    def __del__(self):
        if self.kafka:
            [self.save(x.message.value) for x in self.queue]
            self.kafka.close()

    def save(self, data, **args):
        try:
            if isinstance(data, dict) or isinstance(data, list):
                self.producer.send_messages(self.queue_name, json.dumps(data))
            elif isinstance(data, unicode):
                self.producer.send_messages(self.queue_name, data.encode('utf-8'))
            else:
                self.producer.send_messages(self.queue_name, data)
        except Exception as e:
            print e
            time.sleep(60)

    def get(self, **args):
        if not self.queue:
            try:
                self.consumer._fetch()
            except Exception as e:
                print e
            kq = self.consumer.queue
            while not kq.empty():
                partition, result = kq.get_nowait()
                self.queue.append(result)
                self.consumer.offsets[partition] += 1
                self.consumer.count_since_commit += 1
            self.consumer.queue = Queue()
            self.consumer.commit()
        if self.queue:
            return self.queue.pop().message.value
        else:
            return None

    def size(self, **args):
        count = 0
        for k, v in self.consumer.offsets.items():
            reqs = [common.OffsetRequest(self.queue_name, k, -1, 1)]
            (resp, ) = self.consumer.client.send_offset_request(reqs)
            count += (resp.offsets[0] - v)
        return count + len(self.queue)

    # switch to a different queue
    def select_queue(self, name):
        self.queue_name = name.replace(":", "_")
        self.consumer = SimpleConsumer(self.kafka, self.queue_name + "_consumer",
                                       self.queue_name,
                                       max_buffer_size=None)