def assert_message_count(self, topic, check_count, timeout=10,
                         partitions=None, at_least=False):
    hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                      for broker in self.brokers])

    client = KafkaClient(hosts)
    consumer = SimpleConsumer(client, None, topic,
                              partitions=partitions,
                              auto_commit=False,
                              iter_timeout=timeout)

    started_at = time.time()
    pending = consumer.pending(partitions)

    # Keep checking if it isn't immediately correct, subject to timeout
    while pending < check_count and (time.time() - started_at < timeout):
        pending = consumer.pending(partitions)
        time.sleep(0.5)

    consumer.stop()
    client.close()

    if pending < check_count:
        self.fail('Too few pending messages: found %d, expected %d' %
                  (pending, check_count))
    elif pending > check_count and not at_least:
        self.fail('Too many pending messages: found %d, expected %d' %
                  (pending, check_count))
    return True
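# A minimal usage sketch for the assertion helper above; the test name,
# topic name, and produce_messages helper are hypothetical, not taken from
# the snippets in this file:
#
#   def test_all_messages_arrive(self):
#       self.produce_messages('my-topic', 100)   # assumed helper
#       self.assert_message_count('my-topic', 100, timeout=15)
#
# Passing at_least=True relaxes the check so that more than check_count
# pending messages also passes.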
def test_simple_consumer_commit_does_not_raise(self):
    client = MagicMock()
    client.get_partition_ids_for_topic.return_value = [0, 1]

    def mock_offset_fetch_request(group, payloads, **kwargs):
        return [OffsetFetchResponsePayload(p.topic, p.partition, 0, b'', 0)
                for p in payloads]

    client.send_offset_fetch_request.side_effect = mock_offset_fetch_request

    def mock_offset_commit_request(group, payloads, **kwargs):
        raise FailedPayloadsError(payloads[0])

    client.send_offset_commit_request.side_effect = mock_offset_commit_request

    consumer = SimpleConsumer(client, group='foobar',
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    # Mock internal commit check
    consumer.count_since_commit = 10

    # This should not raise an exception
    self.assertFalse(consumer.commit(partitions=[0, 1]))
def __init__(self, name, host='web14', port=51092, **kwargs):
    QueueBase.QueueBase.__init__(self, name, host, port)
    self.__queue = []
    self.__kafka = KafkaClient('%s:%d' % (host, port))
    # NB: 'async' is the old kafka-python keyword argument; it is a
    # reserved word from Python 3.7 on, so this snippet is Python 2 only.
    self.__producer = SimpleProducer(self.__kafka,
                                     async=kwargs.get('async', False))
    self.__producer.client.ensure_topic_exists(self.name)
    self.__consumer = SimpleConsumer(self.__kafka, self.name + '_consumer',
                                     self.name, auto_commit_every_n=1)
def assert_message_count(self, topic, check_count, timeout=10,
                         partitions=None):
    hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                      for broker in self.brokers])

    client = KafkaClient(hosts)
    group = random_string(10)
    consumer = SimpleConsumer(client, group, topic,
                              partitions=partitions,
                              auto_commit=False,
                              iter_timeout=timeout)

    started_at = time.time()
    pending = consumer.pending(partitions)

    # Keep checking if it isn't immediately correct, subject to timeout
    while pending != check_count and (time.time() - started_at < timeout):
        pending = consumer.pending(partitions)

    consumer.stop()
    client.close()

    self.assertEqual(pending, check_count)
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)

    # get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")

    log_has_at_least_one = False  # did we log at least one entry?
    while True:
        # get 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            break
        for message in messages:
            # OffsetAndMessage(offset=43, message=Message(magic=0,
            #     attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            # print(message.message.value)
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  # file size > 10MB
            flush_to_hdfs(output_dir, topic)
            kafka_consumer.commit()

    # exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
def test_switch_leader_simple_consumer(self):
    producer = Producer(self.client, async=False)
    consumer = SimpleConsumer(self.client, None, self.topic,
                              partitions=None,
                              auto_commit=False,
                              iter_timeout=10)
    self._send_random_messages(producer, self.topic, 0, 2)
    consumer.get_messages()
    self._kill_leader(self.topic, 0)
    consumer.get_messages()
def createConsumer(self):
    self.consumer = SimpleConsumer(self.client,
                                   topic=self.config["topic"],
                                   group=self.config["consumerGroup"],
                                   auto_commit=True,
                                   max_buffer_size=3000000,
                                   iter_timeout=5)
def setUp(self):
    self.redis_monitor = RedisMonitor("localsettings.py")
    self.redis_monitor.settings = self.redis_monitor.wrapper.load(
        "localsettings.py")
    self.redis_monitor.logger = MagicMock()
    self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
    self.redis_monitor.settings['STATS_TOTAL'] = False
    self.redis_monitor.settings['STATS_PLUGINS'] = False
    self.redis_monitor.settings['PLUGINS'] = {
        'plugins.info_monitor.InfoMonitor': None,
        'plugins.stop_monitor.StopMonitor': None,
        'plugins.expire_monitor.ExpireMonitor': None,
        'tests.tests_online.CustomMonitor': 100,
    }
    self.redis_monitor.redis_conn = redis.Redis(
        host=self.redis_monitor.settings['REDIS_HOST'],
        port=self.redis_monitor.settings['REDIS_PORT'])
    self.redis_monitor._load_plugins()
    self.redis_monitor.stats_dict = {}

    self.kafka_conn = KafkaClient(
        self.redis_monitor.settings['KAFKA_HOSTS'])
    self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

    self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                   "demo_test.outbound_firehose")
def assert_message_count(self, topic, check_count, timeout=10,
                         partitions=None, at_least=False):
    hosts = ','.join(['%s:%d' % (broker.host, broker.port)
                      for broker in self.brokers])

    client = SimpleClient(hosts, timeout=2)
    consumer = SimpleConsumer(client, None, topic,
                              partitions=partitions,
                              auto_commit=False,
                              iter_timeout=timeout)

    started_at = time.time()
    pending = -1
    while pending < check_count and (time.time() - started_at < timeout):
        try:
            pending = consumer.pending(partitions)
        except FailedPayloadsError:
            pass
        time.sleep(0.5)

    consumer.stop()
    client.close()

    if pending < check_count:
        self.fail('Too few pending messages: found %d, expected %d' %
                  (pending, check_count))
    elif pending > check_count and not at_least:
        self.fail('Too many pending messages: found %d, expected %d' %
                  (pending, check_count))
    return True
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)

    kafka_consumer = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    log_has_at_least_one = False  # did we log at least one entry?
    while True:
        # get 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            time.sleep(30)  # nothing to read yet; poll again in 30s
            continue
        for message in messages:
            # OffsetAndMessage(offset=43, message=Message(magic=0,
            #     attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            print message
        kafka_consumer.commit()  # save position in the kafka queue

    # NOTE: unreachable unless the loop above is changed to break
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer loading topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)

    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")

    while True:
        # get 100 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            # If no messages are received, wait until there are more
            continue
        for message in messages:
            print(message.message.value)
            tempfile.write(message.message.value)
            tempfile.write("\n")
        if tempfile.tell() > 12000:  # file size > 12 KB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
            # inform zookeeper of position in the kafka queue
            kafka_consumer.commit()
def setup_kafka(self):
    """Setup kafka connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name
    # hard-coded override of the topic name computed above
    self.topic = 'general-starturls'

    _server = self.settings.get("KAFKA_LOCATION", 'localhost:9092')
    _partition_id = int(self.settings.get('SPIDER_PARTITION_ID', 0))
    _group = self.settings.get("GROUP", "scrapy-crawler")
    _conn = KafkaClient(_server)
    self.topic1 = self.settings.get('TOPIC', 'frontier-todo')

    mongo_server = self.settings.get("MONGODB_SERVER", 'localhost')
    # default was the literal string 'MONGODB_PORT'; use the standard port
    mongo_port = self.settings.get("MONGODB_PORT", 27017)
    self.mng_client = MongoClient(mongo_server, mongo_port)

    self.consumer = SimpleConsumer(_conn, _group, self.topic1,
                                   partitions=[_partition_id],
                                   buffer_size=131072,
                                   max_buffer_size=1048576)
    self.producer = KafkaProducer(bootstrap_servers=[_server])

    self.MONGODB_DB = self.settings.get("MONGODB_DB")
    self.MONGODB_COLLECTION = "shop"
    self.SPIDER_NAME = self.settings.get("SPIDER_NAME")
    self.JOB_NAME = self.settings.get("JOB_NAME")
    self.LOCALE = self.settings.get("LOCALE", 'us')
    self.MONGODB_DB_INPUT = self.settings.get("MONGODB_DB_INPUT", "scr")
    self.NUM_REPETE = self.settings.get("NUMBER_REPETE_SCRAPE", 7)
    self.JOB_INPUT_COLLECTION = self.settings.get("JOB_INPUT_COLLECTION",
                                                  "job_input3")
    self.ITEM_INPUT_COLLECTION = self.settings.get("ITEM_INPUT_COLLECTION",
                                                   'scrap_input4')

    self.crawler.signals.connect(self.spider_idle,
                                 signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped,
                                 signal=signals.item_scraped)
    # was self.kafka_topic, which is never defined on this class
    self.log("Reading URLs from kafka topic '%s'" % self.topic1)
def __init__(self, settings, strategy_module):
    kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)

    partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id is None or not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")
    self._in_consumer = SimpleConsumer(kafka,
                                       settings.get('SCORING_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760,
                                       partitions=[partition_id])

    self._manager = FrontierManager.from_settings(settings)
    self._decoder = Decoder(self._manager.request_model,
                            self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('SCORING_TOPIC')
    self.strategy = strategy_module.CrawlStrategy()
    self.backend = self._manager.backend
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
def setUp(self):
    self.settings = get_project_settings()
    self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")

    # set up redis
    self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                  port=self.settings['REDIS_PORT'])
    try:
        self.redis_conn.info()
    except ConnectionError:
        print "Could not connect to Redis"
        # plugin is essential to functionality
        sys.exit(1)

    # clear out older test keys if any
    keys = self.redis_conn.keys("test-spider:*")
    for key in keys:
        self.redis_conn.delete(key)

    # set up kafka to consume potential results
    self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
    self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
    self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                   "demo_test.crawled_firehose",
                                   buffer_size=1024 * 100,
                                   fetch_size_bytes=1024 * 100,
                                   max_buffer_size=None)
    # move cursor to end of kafka topic
    self.consumer.seek(0, 2)
def __init__(self, topic, kafka_broker, consumer_group):
    self.kafka = KafkaClient(kafka_broker)
    self.consumer = SimpleConsumer(self.kafka, consumer_group, topic,
                                   fetch_size_bytes=self.__max_buffer_size,
                                   buffer_size=self.__max_buffer_size,
                                   max_buffer_size=self.__max_buffer_size)
def connection_time():
    # returns the elapsed time until the first message is consumed
    cnt = 0
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "test", "twitter")
    start_time = time.time()
    for msg in consumer:
        cnt += 1
        if cnt > 0:
            return time.time() - start_time
def consume_topic(callback_url, consumer_group, topic):
    consumer = None
    try:
        consumer = SimpleConsumer(self.kafka, consumer_group, topic,
                                  auto_commit=False)
        messages_read = 0

        # we can't read messages infinitely here as we have
        # a lot of topics/subscribers (much more than threadpool size)
        while messages_read < self.max_read_messages_per_cycle:
            # get one message and monitor the time
            start = monitoring.start_time_measure()
            message = consumer.get_message(block=False)
            ms_elapsed = monitoring.stop_time_measure(start)
            self.metrics['kafka_read'].add({'topic': topic}, ms_elapsed)

            # if we don't have messages for this topic/subscriber -
            # quit and give a chance to others
            if message is None:
                logging.info('No messages for topic: %s and callback: %s, '
                             'quitting the thread', topic, callback_url)
                break

            try:
                event = json.loads(message.message.value.decode('utf-8'))
                response_status = self.forward_event(callback_url, event,
                                                     topic)
                # if status is success - mark message as consumed
                # by this subscriber
                if 200 <= response_status < 300:
                    consumer.commit()
                else:
                    logging.info('Received error response from consumer: %s',
                                 response_status)
            except:
                logging.error("Exception while sending event to consumer")
                logging.error(traceback.format_exc())
            finally:
                messages_read += 1

        return messages_read
    except UnknownTopicOrPartitionError:
        logging.error('Adding %s to skip list', topic)
    except:
        logging.exception('failed to create kafka client')
    finally:
        if consumer is not None:
            consumer.stop()
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from a Friendsquare topic, save them to a temporary
    file under temp_dir, then transfer the file to HDFS.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    # Create a temp file to store messages
    self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)
    # Create a hdfs directory to store output files
    os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

    while self.count < self.max_count:
        # Get 1000 messages each time
        messages = kafka_receiver.get_messages(count=1000, block=False)
        if not messages:
            continue

        # Write the messages to a file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')

        # Cap each file at about 20 MB
        if temp_file.tell() > 20000000:
            temp_file.close()

            # Put the file to hdfs
            hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path,
                                                  hdfs_path))

            # remove the old file
            os.remove(self.temp_file_path)

            # Create a new temp file to store messages
            self.count += 1
            self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
            temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

    temp_file.close()
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from Kafka, save them to a temporary file first,
    then transfer the file to HDFS.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic,
                                    max_buffer_size=1310720000)

    self.timestamp = self.getTimestamp()
    # Create a temp file to store messages
    self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp,
                                            str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    while self.count < self.max_count:
        # Get 100 messages each time
        messages = kafka_receiver.get_messages(count=100, block=False)
        if not messages:
            continue

        # Write the messages to a file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')

        # For structured streaming, files need to be small at this point;
        # cap the size at 2 MB
        if temp_file.tell() > 2000000:
            temp_file.close()

            # Copy the file to hdfs
            output_dir = "%s/%s" % (self.hdfs_dir, topic)
            os.system("hdfs dfs -mkdir %s" % output_dir)
            hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp,
                                          self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path,
                                                  hdfs_path))

            # remove the old file
            os.remove(self.temp_file_path)

            # Create a new temp file to store messages
            self.count += 1
            self.timestamp = self.getTimestamp()
            self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp,
                                                    str(self.count))
            temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

    temp_file.close()
def consumer(seconds):
    time.sleep(5)
    cnt = 0
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "test", "twitter")
    start_time = time.time()
    for msg in consumer:
        if time.time() >= start_time + seconds:
            return cnt
        cnt += 1
    return cnt
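# Hedged usage sketch for the two timing helpers above (connection_time and
# consumer); both assume a broker on localhost:9092 with a populated
# 'twitter' topic:
#
#   latency = connection_time()   # seconds until the first message arrives
#   throughput = consumer(10)     # messages consumed in roughly 10 seconds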
def get_max_offsets(consumer_group, topic_name):
    kfk_ins = kafka_ins()
    kafka = KafkaClient(kfk_ins.BROKERS_LIST, kfk_ins.CLIENT_ID,
                        kfk_ins.TIME_OUT)
    topic_offset = {}
    consumer = SimpleConsumer(kafka, consumer_group, topic_name,
                              api_version='0.8.2')
    topic_offset[topic_name] = consumer.offsets
    return topic_offset
def test_simple_consumer_failed_payloads(self):
    client = MagicMock()
    consumer = SimpleConsumer(client, group=None,
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    def failed_payloads(payload):
        return FailedPayloadsError(payload)

    client.send_fetch_request.side_effect = self.fail_requests_factory(
        failed_payloads)

    # This should not raise an exception
    consumer.get_messages(5)
def __init__(self, kafka_addr, topic, vin, web_url):
    threading.Thread.__init__(self)
    self.kafka = KafkaClient(kafka_addr)
    self.cons = SimpleConsumer(self.kafka, None, topic)
    self.cons.seek(0, 2)  # skip to the end of the topic
    self.vin = vin
    self.web_url = web_url
    self.flag = True
    self.count = 0
    self.sleep_count = 0
    self.headers = {'Content-Type': 'application/json'}
def run(self):
    consumer = SimpleConsumer(self.kafka, self.group_name, self.topic_name,
                              iter_timeout=self.timeout,
                              buffer_size=4096 * 8,
                              max_buffer_size=None)
    print "setting max_buffer_size=None"
    try:
        for message in consumer:
            parsed_msg = json.loads(message.message.value)
            self.handle_msg(parsed_msg)
            # print parsed_msg
            self.logger.info('OK%s' % str(consumer.offsets))
    except Exception as e:
        self.logger.warning("Exception %s offset, %s" %
                            (self.group_name, consumer.offsets))
        self.logger.warning(str(e))
        raise
def kafka_stream():
    topic = request.args.get('topic')
    print topic
    kafka = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(kafka, "python", topic)
    topic = None

    def gen():
        for message in consumer:
            yield 'data: %s\n\n' % str(message.message.value)

    print "DEBUG: Kafka Stream Connected"
    return Response(gen(), mimetype="text/event-stream")
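# A hedged client-side sketch for the SSE endpoint above; the route path and
# port are assumptions, since the snippet does not show the Flask route
# decorator:
#
#   curl -N 'http://localhost:5000/kafka_stream?topic=twitter'
#
# Each consumed Kafka message is delivered as one 'data: ...' server-sent
# event on the long-lived response.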
def test_simple_consumer_reset_partition_offset(self):
    client = MagicMock()

    def mock_offset_request(payloads, **kwargs):
        raise FailedPayloadsError(payloads[0])

    client.send_offset_request.side_effect = mock_offset_request

    consumer = SimpleConsumer(client, group='foobar',
                              topic='topic', partitions=[0, 1],
                              auto_commit=False)

    # This should not raise an exception
    self.assertEqual(consumer.reset_partition_offset(0), None)
def _connect_consumer(self):
    if self._cons is None:
        try:
            self._cons = SimpleConsumer(self._conn,
                                        self._group,
                                        self._topic,
                                        partitions=self._partition_ids,
                                        buffer_size=1048576,
                                        max_buffer_size=10485760)
        except BrokerResponseError:
            self._cons = None
            logger.warning("Could not connect consumer to Kafka server")
            return False
    return True
def __init__(self, settings, no_batches, no_scoring, no_incoming):
    self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self._producer = KeyedProducer(self._kafka,
                                   partitioner=Crc32NamePartitioner,
                                   codec=CODEC_SNAPPY)

    self._in_consumer = SimpleConsumer(self._kafka,
                                       settings.get('FRONTIER_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760)
    if not no_scoring:
        self._scoring_consumer = SimpleConsumer(
            self._kafka,
            settings.get('FRONTIER_GROUP'),
            settings.get('SCORING_TOPIC'),
            buffer_size=262144,
            max_buffer_size=1048576)

    self._offset_fetcher = Fetcher(self._kafka,
                                   settings.get('OUTGOING_TOPIC'),
                                   settings.get('FRONTIER_GROUP'))
    self._manager = FrontierManager.from_settings(settings)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model,
                            self._manager.response_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
    self.outgoing_topic = settings.get('OUTGOING_TOPIC')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming,
                     self.consume_scoring, no_batches, no_scoring,
                     settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
    self.job_id = 0
    self.stats = {}
def kafka_consumer(kafka_hosts, schema_host, schema_port, topic,
                   consumer_group="python"):
    """
    Consume the records of a Kafka topic (non-real-time consumption).

    :param kafka_hosts:
    :param schema_host:
    :param schema_port:
    :param topic:
    :param consumer_group:
    :return:
    """
    # Fetch the latest schema for the topic
    topic_schema, topic_schema_id, schema_version = get_latest_schema_info(
        schema_host, schema_port, topic)

    # Consume kafka records
    client = KafkaClient(hosts=kafka_hosts)
    simple_consumer = SimpleConsumer(client, consumer_group, topic,
                                     auto_offset_reset="smallest")

    collect_logs = []  # holds each record's partition, offset and value
    msg_exist = True
    while msg_exist:
        msg = simple_consumer.get_message(get_partition_info=True)
        # print "kafka log:", msg
        # Stop consuming once a fetch returns None
        if msg is None:
            msg_exist = False
        else:
            msg_partition = msg[0]
            msg_offset = msg[1].offset
            msg_value = msg[1].message.value

            # Decode a single record (skip the 5-byte wire-format header)
            bytes_msg = io.BytesIO(msg_value[5:])
            decode_msg = avro.io.BinaryDecoder(bytes_msg)
            recode_msg = avro.io.DatumReader(
                avro.schema.parse(topic_schema)).read(decode_msg)

            # Collect the record's partition, offset and decoded value
            msg_collect = [msg_partition, msg_offset, recode_msg]
            collect_logs.append(msg_collect)

    collect_logs.sort(key=lambda x: x[0])  # sort by partition id
    print "+++++++Topic: %s+++++++" % topic
    for index, log in enumerate(collect_logs):
        print index, log
    print "Successfully received."
    return collect_logs
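# Hedged usage sketch for kafka_consumer() above; the broker list, registry
# host and topic are placeholders, and port 8081 is only the common schema
# registry default:
#
#   logs = kafka_consumer("kafka1:9092,kafka2:9092", "registry-host", 8081,
#                         "my-avro-topic", consumer_group="python")
#   for partition, offset, record in logs:
#       print partition, offset, record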
def validate_samza_job():
    """
    Validates that negate-number negated all messages, and sent the output
    to samza-test-topic-output.
    """
    logger.info('Running validate_samza_job')
    kafka = _get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True,
                                     timeout=60)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, \
        'Expected {0} lines, but found {1}'.format(NUM_MESSAGES,
                                                   message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, \
            'Expected negative integer but received {0}'.format(message)
    kafka.close()