def run(self):
    client = KafkaClient("10.206.216.13:19092,10.206.212.14:19092,10.206.209.25:19092")
    consumer = SimpleConsumer(client, "test-group", "jiketest",
                              auto_commit=False, partitions=self.part)
    # Start from the earliest available offset
    consumer.seek(0, 0)
    while True:
        # Block for up to 60 seconds waiting for a message
        message = consumer.get_message(True, 60)
        self.__offset = message.offset
        print message.message.value
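# The run() method above is only a fragment: it reads self.part and stores
# self.__offset, which suggests it is a method of a thread class. Below is a
# minimal sketch of such a wrapper; the class name, constructor signature,
# and usage are assumptions, not part of the original snippet.
import threading


class PartitionConsumer(threading.Thread):  # hypothetical name

    def __init__(self, partitions):
        super(PartitionConsumer, self).__init__()
        self.part = partitions   # e.g. [0]; consumed by run() above
        self.__offset = None

    # run() as defined above goes here; the thread would be started with:
    #   t = PartitionConsumer([0])
    #   t.start()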
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
            self,
            zk_hosts,
            group,
            topic,
            nodes,
            zk_handler=None,
            logger=None,
            identifier=None,
            **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.

        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError(
                "Kafka support requires cs.eyrie to be installed with the "
                "Kafka extra: install_requires=['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning(
                            "Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info(
                            'Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        # Call join_group; the original lambda returned the
                        # bound method without invoking it.
                        rel_greenlet.link_value(
                            lambda greenlet: self.zkp.join_group())
                if not self.nodes:
                    self.logger.info(
                        'Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn(
                    'Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info(
                    'Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s",
                         self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
            self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
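# A minimal usage sketch for ZKConsumer. The ZooKeeper hosts, group, and
# topic are placeholders, and it assumes a kazoo handler compatible with the
# greenlet-spawning calls above (e.g. kazoo's gevent handler) is passed or
# defaulted; extra keyword arguments are forwarded to SimpleConsumer.
consumer = ZKConsumer(
    zk_hosts='zk1:2181,zk2:2181',
    group='my-group',
    topic='my-topic',
    nodes=[],            # empty list: fall back to the dynamic ZKPartitioner
    auto_commit=False,   # forwarded to SimpleConsumer
)
try:
    # provide_partition_info() is called in init_consumer, so each item
    # should be a (partition, OffsetAndMessage) pair
    for partition, message in consumer:
        print message.message.value
        consumer.commit()
finally:
    consumer.stop()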
class KafkaSpiderMixin(object):

    """
    Mixin class to implement reading urls from a kafka queue.

    :type kafka_topic: str
    """

    kafka_topic = None

    def process_kafka_message(self, message):
        """
        Tell this spider how to extract urls from a kafka message

        :param message: A Kafka message object
        :type message: kafka.common.OffsetAndMessage
        :rtype: str or None
        """
        if not message:
            return None
        return message.message.value

    def setup_kafka(self, settings):
        """Setup kafka connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                      'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)

        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        # log self.topic (the original logged self.kafka_topic, which is
        # None unless set separately)
        self.log("Reading URLs from kafka topic '%s'" % self.topic)

    def next_request(self):
        """
        Returns a request to be scheduled.

        :rtype: str or None
        """
        message = self.consumer.get_message(True)
        url = self.process_kafka_message(message)
        if not url:
            return None
        return self.make_requests_from_url(url)

    def schedule_next_request(self):
        """Schedules a request if available"""
        req = self.next_request()
        if req:
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_request()
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to idle before scheduling the
        next request"""
        self.schedule_next_request()
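# A minimal sketch of a spider built on KafkaSpiderMixin. The spider name is
# a placeholder; the key detail is that setup_kafka() must run after the
# crawler object is attached, hence the from_crawler hook.
from scrapy.spiders import Spider


class DemoKafkaSpider(KafkaSpiderMixin, Spider):  # hypothetical spider
    name = 'demo_kafka'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(DemoKafkaSpider, cls).from_crawler(
            crawler, *args, **kwargs)
        spider.setup_kafka(crawler.settings)
        return spider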
# NOTE: this snippet begins mid-script; the argument-count guard below is
# reconstructed from the usage message and the else branch. Messages were
# translated from Chinese.
if len(sys.argv) < 4:
    print 'Function: move the consumer offset to the specified time'
    print 'Kafka server: xxxxxxx:9092 (same as 15/25:9092)'
    print 'Usage: .py [topic] [group] [date]'
else:
    topic = sys.argv[1]
    group = sys.argv[2]
    date = sys.argv[3]
    server = 'xxxxxxx:9092'
    print 'On %s, moving topic %s consumer group %s to time %s...' % (server, topic, group, date)
    client = KafkaClient(server)
    consumer = SimpleConsumer(client, group, topic)
    step = 10000
    consumer.seek(step, 0)
    cnt = 0
    while step > 1:
        cnt = cnt + 1
        message = consumer.get_message()
        msg = json.loads(message.message.value)
        if 'up_time' in msg:
            if cnt % 2 == 0:
                print 'Processed %s to date %s' % (cnt, msg['up_time'])
            if msg['up_time'] > date:
                # Overshot the target time: shrink the step and seek back
                step = int(step * 2 / 3)
                consumer.seek(-step, 1)
            elif msg['up_time'] == date:
                break
            else:
                # Still before the target time: keep seeking forward
                consumer.seek(step, 1)
        else:
            break
class zk_client:

    def __init__(self, topics, zk_hosts='127.0.0.1:2181',
                 consumer_group=kafka_consts.CONSUMER_GROUP):
        self.zk_hosts = zk_hosts
        self.kafka_client = None
        self.consumer_group = consumer_group
        self.lock = Lock()
        self.zk_st_watcher = zk_states_watcher()
        self.consumer_id = uuid1().hex
        self.consumer_ids = [self.consumer_id]
        self.consumer_id_path = '{}/{}/{}'.format(kafka_consts.CONSUMER_PATH,
                                                  self.consumer_group, 'ids')
        try:
            self.zoo_cl = KazooClient(self.zk_hosts)
            self.zoo_cl.add_listener(self.zk_st_watcher)
            self.broker_details = {}
            self.zoo_cl.start()
            sleep(1)
            self._init(topics)
        except Exception as e:
            logging.exception(e)

    def register(self):
        ret = False
        while not ret:
            ret = self.create_ephemeralpath(self.consumer_id_path + '/' +
                                            self.consumer_id)
            if not ret:
                sleep(1)

    def _init(self, topics):
        ret = False
        while not ret:
            ret = self.create_newpath(kafka_consts.CONSUMER_PATH + '/' +
                                      self.consumer_group)
            if not ret:
                sleep(1)
        ret = False
        while not ret:
            ret = self.create_newpath(kafka_consts.CONSUMER_PATH + '/' +
                                      self.consumer_group + '/ids')
            if not ret:
                sleep(1)
        self.register()
        self.get_consumer_list()
        self.populate_broker_info()

        # De-duplicate and drop empty topic names
        temptopics = [x.strip() for x in topics]
        self.topics = []
        for t in temptopics:
            if t != '' and t not in self.topics:
                self.topics.append(t)
        if not self.topics:
            raise ValueError('no topics passed')

        broker_ports = []
        with self.lock:
            for brid in self.broker_details:
                broker_port = self.broker_details[brid]
                broker_ports.append('{}:{}'.format(broker_port['host'],
                                                   broker_port['port']))
        self.kafka_client = nsclient(broker_ports)

        self.topic_part_ids = {}
        # Iterate the cleaned list (the original iterated the raw `topics`
        # argument, which may contain blanks)
        for topic in self.topics:
            pids = self.kafka_client.get_partition_ids_for_topic(topic)
            self.topic_part_ids[topic] = pids

        self.consumed = {}
        self.rebalance_consumers()
        try:
            self.kconsumer = SimpleConsumer(
                self.kafka_client, self.consumer_group, None,
                topic_partitions=self.consumed.copy())
        except Exception as e:
            logging.exception(e)
            sys.exit(1)

    def get_message(self):
        try:
            return self.kconsumer.get_message(timeout=1,
                                              get_partition_info=True)
        except Exception as e:
            logging.exception(e)
            return None

    @synchronized
    def populate_broker_info(self):
        brokers = self.get_brokerids()
        self.broker_details.clear()
        for brid in brokers:
            try:
                brdetails = self.get_data(kafka_consts.BROKER_ID_PATH +
                                          '/' + brid)
                if brdetails is None:
                    continue
                brjson = json.loads(brdetails[0])
                self.broker_details[brid] = brjson
            except Exception as e:
                logging.exception(e)

    def create_newpath(self, path):
        '''Create the znode path if it is not existing already'''
        try:
            if not self.zoo_cl.exists(path):
                self.zoo_cl.ensure_path(path)
        except Exception as e:
            logging.exception(e)
            return False
        return True

    def get_children(self, parentpath):
        try:
            children = self.zoo_cl.get_children(parentpath, watch=self)
            return children
        except Exception as e:
            logging.error(e)
            return None

    def get_brokerids(self):
        return self.get_children(kafka_consts.BROKER_ID_PATH)

    @synchronized
    def get_consumer_list(self):
        while True:
            print self.consumer_id_path
            cids = self.get_children(self.consumer_id_path)
            if cids is None:
                sleep(1)
                continue
            self.consumer_ids = cids
            break
        self.consumer_ids.sort()

    def get_data(self, path):
        try:
            return self.zoo_cl.get(path)
        except Exception as e:
            logging.exception(e)
            return None

    def create_ephemeralpath(self, path):
        '''Create the znode ephemeral path if it is not existing'''
        try:
            if self.zoo_cl.exists(path):
                return True
            self.zoo_cl.create(path, ephemeral=True)
        except Exception as e:
            logging.exception(e)
            return False
        return True

    @synchronized
    def rebalance_consumers(self):
        num_consumers = len(self.consumer_ids)
        consumer_pos = self.consumer_ids.index(self.consumer_id)
        consumed_parts = {}
        for topic in self.topic_part_ids:
            # Each consumer takes the partitions whose id maps to its
            # position in the sorted consumer list. The original filtered on
            # `x % num_consumer == 0`, which would give every consumer the
            # same partitions and left `cinsumerpos` unused; using the
            # consumer's own position is the intended round-robin assignment.
            partitions = filter(lambda x: x % num_consumers == consumer_pos,
                                self.topic_part_ids[topic])
            consumed_parts[topic] = partitions
        self.consumed = consumed_parts

    @synchronized
    def print_brokers(self):
        print(self.broker_details)

    def __call__(self, event):
        if event.path == kafka_consts.BROKER_ID_PATH:
            self.populate_broker_info()
        elif event.path == self.consumer_id_path:
            self.rebalance_consumers()
            self.get_consumer_list()
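# A minimal consumption loop over zk_client. The topic name and ZooKeeper
# address are placeholders; it assumes kafka_consts supplies CONSUMER_GROUP
# and that the brokers registered in ZooKeeper are reachable.
client = zk_client(['my-topic'], zk_hosts='127.0.0.1:2181')
while True:
    result = client.get_message()
    if result is None:              # timeout or transient error
        continue
    partition, message = result     # get_partition_info=True yields a pair
    print message.message.value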
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer
from kafka.producer import SimpleProducer, KeyedProducer

import logging

logging.basicConfig(
    format='%(asctime)s.%(msecs)s:%(name)s:%(thread)d:%(levelname)s:%(process)d:%(message)s',
    level=logging.DEBUG
)

kafka = KafkaClient("localhost:9092")
kafka.send_offset_fetch_request('test-group')

# To consume messages
consumer = SimpleConsumer(kafka, "test-group", "test-topic",
                          auto_commit_every_n=1, iter_timeout=10)

# while True:
#     for message in consumer:
#         print(message)
#     consumer.commit()

print consumer.get_message()
# consumer.commit(partitions=[0])

kafka.close()
def main():
    """kafkadump: Kafka topic dump utility for debugging.

    Usage:
        kafkadump list --host=<host>
        kafkadump dump <topic> --host=<host> [--consumer=<consumer>]

    Examples:

        List all the topics on your local Kafka instance:

            python kafkadump.py list --host=<kafkahost>:9092

        Dump the contents of a single topic starting from offset 0:

            python kafkadump.py dump test.crawled_firehose --host=<kafkahost>:9092

        Use CTRL+C (SIGINT, KeyboardInterrupt) to stop it from polling Kafka.
        It will end by printing the total records serviced and the raw output
        of the most recent record.

    Options:
        -h --host <host>            Kafka host name where Kafka cluster will be resolved
        -c --consumer <consumer>    Consumer group ID to use for reading messages
    """
    args = docopt(main.__doc__)
    host = args["--host"]

    logging.basicConfig()

    print "=> Connecting to {0}...".format(host)
    kafka = KafkaClient(host)
    print "=> Connected."

    if args["list"]:
        for topic in kafka.topic_partitions.keys():
            print topic
        return 0
    elif args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "default"
        consumer = SimpleConsumer(kafka, consumer_id, topic,
                                  buffer_size=1024 * 100,       # 100kb
                                  fetch_size_bytes=1024 * 100,  # 100kb
                                  max_buffer_size=None          # eliminate big message errors
                                  )
        consumer.seek(0, 0)
        num_records = 0
        total_bytes = 0
        item = None
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                item = json.loads(val)
                # Measure the raw payload; the original used len(item),
                # which counts keys of the parsed object, not bytes
                body_bytes = len(val)
                print item
                num_records = num_records + 1
                total_bytes = total_bytes + body_bytes
            except:
                # Bare except is deliberate: CTRL+C (KeyboardInterrupt)
                # also ends the loop so the totals below get printed
                traceback.print_exc()
                break
        total_mbs = float(total_bytes) / (1024 * 1024)
        print
        if item is not None:
            print json.dumps(item, indent=4)
        if num_records == 0:
            num_records = 1
        print num_records, "records", total_mbs, "megabytes", \
            (float(total_bytes) / num_records / 1024), "kb per msg"
        kafka.close()
        return 0
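# The snippet defines main() but omits the entry point; a conventional
# docopt-script footer would be:
if __name__ == '__main__':
    sys.exit(main())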
class ListeningKafkaSpider(Spider):
    """
    Spider that reads urls from a kafka topic when idle.

    This spider will exit only if stopped, otherwise it keeps
    listening to messages on the given topic.

    Specify the topic to listen to by setting the spider's `kafka_topic`.
    Messages are assumed to be URLS, one by message.

    To do custom processing of kafka messages, override the spider's
    `process_kafka_message` method.

    :type kafka_topic: str
    """

    kafka_topic = None

    def process_kafka_message(self, message):
        """
        Tell this spider how to extract urls from a kafka message

        :param message: A Kafka message object
        :type message: kafka.common.OffsetAndMessage
        :rtype: str or None
        """
        if not message:
            return None
        return message.message.value

    def setup_kafka(self, settings):
        """Setup kafka connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                      'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)

        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % self.topic)

    def next_request(self):
        """
        Returns a request to be scheduled.

        :rtype: str or None
        """
        message = self.consumer.get_message(True)
        url = self.process_kafka_message(message)
        if not url:
            return None
        return self.make_requests_from_url(url)

    def schedule_next_request(self):
        """Schedules a request if available"""
        req = self.next_request()
        if req:
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_request()
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to idle before scheduling the
        next request"""
        self.schedule_next_request()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ListeningKafkaSpider, cls).from_crawler(
            crawler, *args, **kwargs)
        if not hasattr(spider, 'topic') or not spider.topic:
            spider.topic = '%s-starturls' % spider.name
        hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = crawler.settings.get(
            'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        spider.consumer = SimpleConsumer(_kafka, consumer_group, spider.topic,
                                         auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        crawler.signals.connect(spider.spider_idle,
                                signal=signals.spider_idle)
        crawler.signals.connect(spider.item_scraped,
                                signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % spider.topic)
        return spider