def worker():
    global consumers
    consumer = Consumer({'bootstrap.servers': bootstrap_servers,
                         'group.id': consumer_group,
                         'client.id': client_id,
                         'default.topic.config': {'auto.offset.reset': 'earliest'},
                         'enable.auto.offset.store': False,
                         'session.timeout.ms': session_timeout_ms})
    consumers.append(consumer)
    consumer.subscribe([topic])
    while True:
        msg = consumer.poll(0)
        thread_name = threading.current_thread().name
        if msg is None:
            continue
        if not msg.error():
            msg_timestamp = datetime.fromtimestamp(msg.timestamp()[1] / 1000.0)
            keep_alive_counter = 0
            now = datetime.now()
            # loop/sleep to delay the message
            while now < msg_timestamp + delay_timedelta:
                keep_alive_counter += 1
                msg_timestamp_with_delta = msg_timestamp + delay_timedelta
                diff1 = msg_timestamp_with_delta - now
                diff_seconds = diff1.total_seconds()
                if keep_alive_counter <= 1:
                    logging.info("[%s] %s | received message on partition=%d, delaying for %fs" % (
                        thread_name, now.isoformat(), msg.partition(), diff_seconds))
                # sleep for {min_sleep_seconds}s...{kafka_keep_alive_seconds}s
                sleep_seconds = min(kafka_keep_alive_seconds, max(min_sleep_seconds, diff_seconds))
                # use as 'keep alive' feedback for low (no) traffic periods... to avoid connections
                # getting dropped by brokers - resulting in a group rebalance
                logging.debug("[%s] %s | kafka keep alive commit partition=%d" % (
                    thread_name, now.isoformat(), msg.partition()))
                consumer.commit(offsets=[TopicPartition(topic=msg.topic(),
                                                        partition=msg.partition(),
                                                        offset=OFFSET_STORED)])
                # go to sleep
                logging.debug("[%s] %s | going to sleep for %fs / lag: %fs" % (
                    thread_name, now.isoformat(), sleep_seconds, diff_seconds))
                sleep(sleep_seconds)
                now = datetime.now()
            process(thread_name, msg)
            consumer.store_offsets(msg)
        elif msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            logging.error("kafka consumer error: %s" % msg.error())
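# ---------------------------------------------------------------------------
# A minimal driver sketch for worker() above. It only shows the module-level
# globals that function reads (bootstrap_servers, topic, delay_timedelta, the
# sleep/keep-alive bounds, the shared `consumers` list) plus how the threads
# might be started; the imports cover what worker() itself uses. All concrete
# values and the process() stub are assumptions for illustration, not taken
# from the original code.
# ---------------------------------------------------------------------------
import logging
import threading
from datetime import datetime, timedelta
from time import sleep

from confluent_kafka import OFFSET_STORED, Consumer, KafkaError, TopicPartition

bootstrap_servers = "localhost:9092"     # assumed broker address
topic = "delayed-topic"                  # assumed topic name
consumer_group = "delay-consumer-group"  # assumed group id
client_id = "delay-consumer"
session_timeout_ms = 30000
delay_timedelta = timedelta(minutes=5)   # how long each message is held back
kafka_keep_alive_seconds = 30            # max sleep per iteration, keeps the group alive
min_sleep_seconds = 1                    # min sleep per iteration
consumers = []                           # worker() appends its Consumer here


def process(thread_name, msg):
    # placeholder for the real message handler
    logging.info("[%s] processed offset %d", thread_name, msg.offset())


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    threads = [threading.Thread(target=worker, daemon=True) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()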
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.store_offsets(offsets=[TopicPartition("test", 0, 42)])
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer already closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer already closed' == str(ex.value)
def test_store_offsets():
    """ Basic store_offsets() tests """

    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])

    try:
        c.store_offsets(offsets=[TopicPartition("test", 0, 42)])
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._UNKNOWN_PARTITION

    c.unsubscribe()
    c.close()
def test_calling_store_offsets_after_close_throws_error():
    """ Calling store_offsets() after close() should throw a RuntimeError """

    c = Consumer({
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100
    })

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.store_offsets(offsets=[TopicPartition("test", 0, 42)])
    assert 'Consumer closed' == str(ex.value)
def test_calling_store_offsets_after_close_throws_error():
    """ Calling store_offsets() after close() should throw a RuntimeError """

    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.store_offsets(offsets=[TopicPartition("test", 0, 42)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.offsets_for_times([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)
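# ---------------------------------------------------------------------------
# Usage sketch for the configuration the tests above exercise: auto-commit is
# left on, but enable.auto.offset.store=False means an offset only becomes
# eligible for commit once store_offsets() has been called after the message
# was processed (at-least-once delivery). Broker address, topic, group id, and
# the handle() stub are placeholders, not taken from the tests.
# ---------------------------------------------------------------------------
from confluent_kafka import Consumer, KafkaError, KafkaException


def handle(msg):
    # placeholder processing step
    print(msg.topic(), msg.partition(), msg.offset())


c = Consumer({
    'bootstrap.servers': 'localhost:9092',  # assumed
    'group.id': 'example-group',            # assumed
    'enable.auto.commit': True,             # background commit of *stored* offsets
    'enable.auto.offset.store': False,      # we decide when an offset is stored
})
c.subscribe(['example-topic'])              # assumed topic

try:
    while True:
        msg = c.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                continue
            raise KafkaException(msg.error())
        handle(msg)
        c.store_offsets(message=msg)        # store only after successful processing
finally:
    c.close()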
def run(self):
    ac = ApiClient()

    def fail_fast(err, partitions):
        if err is not None:
            print("Kafka consumer commit error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        # print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer, partitions):
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print("Kafka partitions rebalanced: {} / {}".format(
            consumer, partitions))

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update({
        'group.id': self.consumer_group,
        'on_commit': fail_fast,
        # messages don't have offset marked as stored until pushed to
        # elastic, but we do auto-commit stored offsets to broker
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        # user code timeout; if no poll after this long, assume user code
        # hung and rebalance (default: 5min)
        'max.poll.interval.ms': 60000,
        'default.topic.config': {
            'auto.offset.reset': 'latest',
        },
    })
    consumer = Consumer(consumer_conf)
    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )

    while True:
        batch = consumer.consume(
            num_messages=self.batch_size,
            timeout=self.poll_interval)
        if not batch:
            if not consumer.assignment():
                print("... no Kafka consumer partitions assigned yet")
            print("... nothing new from kafka, try again (interval: {})".format(
                self.poll_interval))
            continue
        print("... got {} kafka messages".format(len(batch)))
        # first check errors on entire batch...
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())
        # ... then process
        bulk_actions = []
        for msg in batch:
            json_str = msg.value().decode('utf-8')
            # HACK: work around a bug where container entities got published to
            # release_v03 topic
            if self.elasticsearch_document_name == "release":
                entity_dict = json.loads(json_str)
                if entity_dict.get('name') and not entity_dict.get('title'):
                    continue
            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
            # TODO: handle deletions from index
            bulk_actions.append(json.dumps({
                "index": {
                    "_id": entity.ident,
                },
            }))
            bulk_actions.append(json.dumps(self.transform_func(entity)))
        print("Upserting, eg, {} (of {} releases in elasticsearch)".format(
            entity.ident, len(batch)))
        elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
            self.elasticsearch_backend,
            self.elasticsearch_index,
            self.elasticsearch_document_name)
        resp = requests.post(
            elasticsearch_endpoint,
            headers={"Content-Type": "application/x-ndjson"},
            data="\n".join(bulk_actions) + "\n")
        resp.raise_for_status()
        if resp.json()['errors']:
            desc = "Elasticsearch errors from post to {}:".format(
                elasticsearch_endpoint)
            print(desc)
            print(resp.content)
            raise Exception(desc)
        for msg in batch:
            # offsets are *committed* (to brokers) automatically, but need
            # to be marked as processed here
            consumer.store_offsets(message=msg)
class VerifiableConsumer(VerifiableClient):
    """
    confluent-kafka-python backed VerifiableConsumer class for use with
    Kafka's kafkatests client tests.
    """

    def __init__(self, conf):
        """
        conf is a config dict passed to confluent_kafka.Consumer()
        """
        super(VerifiableConsumer, self).__init__(conf)
        self.conf["on_commit"] = self.on_commit
        self.consumer = Consumer(**conf)
        self.consumed_msgs = 0
        self.consumed_msgs_last_reported = 0
        self.consumed_msgs_at_last_commit = 0
        self.use_auto_commit = False
        self.use_async_commit = False
        self.max_msgs = -1
        self.assignment = []
        self.assignment_dict = dict()

    def find_assignment(self, topic, partition):
        """Find and return existing assignment based on topic and partition,
        or None on miss."""
        skey = "%s %d" % (topic, partition)
        return self.assignment_dict.get(skey)

    def send_records_consumed(self, immediate=False):
        """Send records_consumed, every 100 messages, on timeout, or if
        immediate is set."""
        if self.consumed_msgs <= self.consumed_msgs_last_reported + (
                0 if immediate else 100):
            return

        if len(self.assignment) == 0:
            return

        d = {
            "name": "records_consumed",
            "count": self.consumed_msgs - self.consumed_msgs_last_reported,
            "partitions": [],
        }

        for a in self.assignment:
            if a.min_offset == -1:
                # Skip partitions that haven't had any messages since last time.
                # This is to circumvent some minOffset checks in kafkatest.
                continue
            d["partitions"].append(a.to_dict())
            a.min_offset = -1

        self.send(d)
        self.consumed_msgs_last_reported = self.consumed_msgs

    def send_assignment(self, evtype, partitions):
        """ Send assignment update, evtype is either 'assigned' or 'revoked' """
        d = {
            "name": "partitions_" + evtype,
            "partitions": [{
                "topic": x.topic,
                "partition": x.partition
            } for x in partitions],
        }
        self.send(d)

    def on_assign(self, consumer, partitions):
        """ Rebalance on_assign callback """
        old_assignment = self.assignment
        self.assignment = [
            AssignedPartition(p.topic, p.partition) for p in partitions
        ]
        # Move over our last seen offsets so that we can report a proper
        # minOffset even after a rebalance loop.
        for a in old_assignment:
            b = self.find_assignment(a.topic, a.partition)
            b.min_offset = a.min_offset

        self.assignment_dict = {a.skey: a for a in self.assignment}
        self.send_assignment("assigned", partitions)

    def on_revoke(self, consumer, partitions):
        """ Rebalance on_revoke callback """
        # Send final consumed records prior to rebalancing to make sure
        # latest consumed is on par with what is going to be committed.
        self.send_records_consumed(immediate=True)
        self.do_commit(immediate=True, asynchronous=False)
        self.assignment = list()
        self.assignment_dict = dict()
        self.send_assignment("revoked", partitions)

    def on_commit(self, err, partitions):
        """ Offsets Committed callback """
        if err is not None and err.code() == KafkaError._NO_OFFSET:
            self.dbg("on_commit(): no offsets to commit")
            return

        # Report consumed messages to make sure consumed position >= committed position
        self.send_records_consumed(immediate=True)

        d = {"name": "offsets_committed", "offsets": []}

        if err is not None:
            d["success"] = False
            d["error"] = str(err)
        else:
            d["success"] = True
            d["error"] = ""

        for p in partitions:
            pd = {
                "topic": p.topic,
                "partition": p.partition,
                "offset": p.offset
            }
            if p.error is not None:
                pd["error"] = str(p.error)
            d["offsets"].append(pd)

        if len(self.assignment) == 0:
            self.dbg(
                "Not sending offsets_committed: No current assignment: would be: %s"
                % d)
            return

        self.send(d)

    def do_commit(self, immediate=False, asynchronous=None):
        """Commit every 1000 messages or whenever there is a consume timeout
        or immediate."""
        if (self.use_auto_commit
                or self.consumed_msgs_at_last_commit + (0 if immediate else 1000) >
                self.consumed_msgs):
            return

        # Make sure we report consumption before commit,
        # otherwise tests may fail because of commit > consumed
        if self.consumed_msgs_at_last_commit < self.consumed_msgs:
            self.send_records_consumed(immediate=True)

        if asynchronous is None:
            async_mode = self.use_async_commit
        else:
            async_mode = asynchronous

        self.dbg("Committing %d messages (Async=%s)" %
                 (self.consumed_msgs - self.consumed_msgs_at_last_commit,
                  async_mode))

        retries = 3
        while True:
            try:
                self.dbg("Commit")
                offsets = self.consumer.commit(asynchronous=async_mode)
                self.dbg("Commit done: offsets %s" % offsets)

                if not async_mode:
                    self.on_commit(None, offsets)

                break

            except KafkaException as e:
                if e.args[0].code() == KafkaError._NO_OFFSET:
                    self.dbg("No offsets to commit")
                    break
                elif e.args[0].code() in (
                        KafkaError.REQUEST_TIMED_OUT,
                        KafkaError.NOT_COORDINATOR,
                        KafkaError._WAIT_COORD,
                ):
                    self.dbg("Commit failed: %s (%d retries)" % (str(e), retries))
                    if retries <= 0:
                        raise
                    retries -= 1
                    time.sleep(1)
                    continue
                else:
                    raise

        self.consumed_msgs_at_last_commit = self.consumed_msgs

    def msg_consume(self, msg):
        """ Handle consumed message (or error event) """
        if msg.error():
            self.err("Consume failed: %s" % msg.error(), term=False)
            return

        if self.verbose:
            self.send({
                "name": "record_data",
                "topic": msg.topic(),
                "partition": msg.partition(),
                "key": msg.key(),
                "value": msg.value(),
                "offset": msg.offset(),
            })

        if self.max_msgs >= 0 and self.consumed_msgs >= self.max_msgs:
            return  # ignore extra messages

        # Find assignment.
        a = self.find_assignment(msg.topic(), msg.partition())
        if a is None:
            self.err(
                "Received message on unassigned partition %s [%d] @ %d" %
                (msg.topic(), msg.partition(), msg.offset()),
                term=True,
            )

        a.consumed_msgs += 1
        if a.min_offset == -1:
            a.min_offset = msg.offset()
        if a.max_offset < msg.offset():
            a.max_offset = msg.offset()

        self.consumed_msgs += 1

        self.consumer.store_offsets(message=msg)
        self.send_records_consumed(immediate=False)
        self.do_commit(immediate=False)
def run(self):

    def fail_fast(err, msg):
        if err is not None:
            print("Kafka producer delivery error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

    def on_commit(err, partitions):
        if err is not None:
            print("Kafka consumer commit error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            print(p)
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer, partitions):
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print("Kafka partitions rebalanced: {} / {}".format(
            consumer, partitions))

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update({
        'group.id': self.consumer_group,
        'on_commit': fail_fast,
        # messages don't have offset marked as stored until pushed to
        # elastic, but we do auto-commit stored offsets to broker
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        # user code timeout; if no poll after this long, assume user code
        # hung and rebalance (default: 5min)
        'max.poll.interval.ms': 180000,
        'default.topic.config': {
            'auto.offset.reset': 'latest',
        },
    })
    consumer = Consumer(consumer_conf)

    producer_conf = self.kafka_config.copy()
    producer_conf.update({
        'delivery.report.only.error': True,
        'default.topic.config': {
            'request.required.acks': -1,  # all brokers must confirm
        },
    })
    producer = Producer(producer_conf)

    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )
    print("Kafka consuming {}".format(self.consume_topic))

    while True:
        msg = consumer.poll(self.poll_interval)
        if not msg:
            print("nothing new from kafka (poll_interval: {} sec)".format(
                self.poll_interval))
            continue
        if msg.error():
            raise KafkaException(msg.error())

        cle = json.loads(msg.value().decode('utf-8'))
        # print(cle)
        print("processing changelog index {}".format(cle['index']))
        release_ids = []
        new_release_ids = []
        file_ids = []
        container_ids = []
        work_ids = []
        release_edits = cle['editgroup']['edits']['releases']
        for re in release_edits:
            release_ids.append(re['ident'])
            # filter to direct release edits which are not updates
            if not re.get('prev_revision') and not re.get('redirect_ident'):
                new_release_ids.append(re['ident'])
        file_edits = cle['editgroup']['edits']['files']
        for e in file_edits:
            file_ids.append(e['ident'])
        container_edits = cle['editgroup']['edits']['containers']
        for e in container_edits:
            container_ids.append(e['ident'])
        work_edits = cle['editgroup']['edits']['works']
        for e in work_edits:
            work_ids.append(e['ident'])

        # TODO: do these fetches in parallel using a thread pool?
        for ident in set(file_ids):
            file_entity = self.api.get_file(ident, expand=None)
            # update release when a file changes
            # TODO: fetch old revision as well, and only update
            # releases for which list changed
            release_ids.extend(file_entity.release_ids or [])
            file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
            producer.produce(
                self.file_topic,
                json.dumps(file_dict).encode('utf-8'),
                key=ident.encode('utf-8'),
                on_delivery=fail_fast,
            )

        for ident in set(container_ids):
            container = self.api.get_container(ident)
            container_dict = self.api.api_client.sanitize_for_serialization(container)
            producer.produce(
                self.container_topic,
                json.dumps(container_dict).encode('utf-8'),
                key=ident.encode('utf-8'),
                on_delivery=fail_fast,
            )

        for ident in set(release_ids):
            release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
            work_ids.append(release.work_id)
            release_dict = self.api.api_client.sanitize_for_serialization(release)
            producer.produce(
                self.release_topic,
                json.dumps(release_dict).encode('utf-8'),
                key=ident.encode('utf-8'),
                on_delivery=fail_fast,
            )
            # filter to "new" active releases with no matched files
            if release.ident in new_release_ids:
                ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
                if ir and not release.files and self.want_live_ingest(release, ir):
                    producer.produce(
                        self.ingest_file_request_topic,
                        json.dumps(ir).encode('utf-8'),
                        # key=None,
                        on_delivery=fail_fast,
                    )

        producer.flush()
        # TODO: publish updated 'work' entities to a topic
        consumer.store_offsets(message=msg)
class KafkaConsumer(BaseKafkaConsumer):
    def __init__(self, config):
        self._config = config["consumer"]
        self.assign_offset_end = self._config.get("assign_offset_end", False)
        conf = self._config["conf"]
        conf.setdefault("group.id", str(uuid.uuid1()))
        self.autocommit_enabled = conf.get("enable.auto.commit", True)
        internal_log_path = self._config.get("internal_log_path")
        conf["error_cb"] = self._error_callback
        if internal_log_path:
            debug_logger = logging.getLogger("debug_consumer")
            timestamp = time.strftime("_%d%m%Y_")
            debug_logger.addHandler(
                logging.FileHandler("{}/kafka_consumer_debug{}{}.log".format(
                    internal_log_path, timestamp, os.getpid())))
            conf["logger"] = debug_logger
        self._consumer = Consumer(**conf)

    @staticmethod
    def on_assign_offset_end(consumer, partitions):
        for p in partitions:
            p.offset = OFFSET_END
        KafkaConsumer.on_assign_log(consumer, partitions)
        consumer.assign(partitions)

    @staticmethod
    def on_coop_assign_offset_end(consumer, partitions):
        for p in partitions:
            p.offset = OFFSET_END
        KafkaConsumer.on_assign_log(consumer, partitions)
        consumer.incremental_assign(partitions)

    @staticmethod
    def on_assign_log(consumer, partitions):
        log_level = "WARNING"
        for p in partitions:
            if p.error:
                log_level = "ERROR"
        params = {
            "partitions": str(list([str(partition) for partition in partitions or []])),
            log_const.KEY_NAME: log_const.KAFKA_ON_ASSIGN_VALUE,
            "log_level": log_level
        }
        log("KafkaConsumer.subscribe<on_assign>: assign %(partitions)s %(log_level)s",
            params=params, level=log_level)

    def subscribe(self, topics=None):
        topics = topics or list(self._config["topics"].values())
        self._consumer.subscribe(
            topics,
            on_assign=self.get_on_assign_callback() if self.assign_offset_end
            else KafkaConsumer.on_assign_log)

    def get_on_assign_callback(self):
        if "cooperative" in self._config["conf"].get("partition.assignment.strategy", ""):
            callback = KafkaConsumer.on_coop_assign_offset_end
        else:
            callback = KafkaConsumer.on_assign_offset_end
        return callback

    def unsubscribe(self):
        self._consumer.unsubscribe()

    def poll(self):
        msg = self._consumer.poll(self._config["poll_timeout"])
        if msg is not None:
            return self._process_message(msg)

    def consume(self, num_messages: int = 1):
        messages = self._consumer.consume(num_messages=num_messages,
                                          timeout=self._config["poll_timeout"])
        for msg in messages:
            yield self._process_message(msg)

    def commit_offset(self, msg):
        if msg is not None:
            if self.autocommit_enabled:
                self._consumer.store_offsets(msg)
            else:
                self._consumer.commit(msg, **{"async": True})

    def get_msg_create_time(self, mq_message):
        timestamp_type, timestamp = mq_message.timestamp()
        return timestamp if timestamp_type is not TIMESTAMP_NOT_AVAILABLE else None

    def _error_callback(self, err):
        params = {
            "error": str(err),
            log_const.KEY_NAME: log_const.EXCEPTION_VALUE
        }
        log("KafkaConsumer: Error: %(error)s", params=params, level="WARNING")
        monitoring.got_counter("kafka_consumer_exception")

    # noinspection PyMethodMayBeStatic
    def _process_message(self, msg: KafkaMessage):
        err = msg.error()
        if err:
            if err.code() == KafkaError._PARTITION_EOF:
                return None
            else:
                monitoring.got_counter("kafka_consumer_exception")
                params = {
                    "code": err.code(),
                    "pid": os.getpid(),
                    "topic": msg.topic(),
                    "partition": msg.partition(),
                    "offset": msg.offset(),
                    log_const.KEY_NAME: log_const.EXCEPTION_VALUE
                }
                log(
                    "KafkaConsumer Error %(code)s at pid %(pid)s: topic=%(topic)s partition=[%(partition)s] "
                    "reached end at offset %(offset)s\n",
                    params=params, level="WARNING")
                raise KafkaException(err)
        if msg.value():
            if msg.headers() is None:
                msg.set_headers([])
            return msg

    def close(self):
        self._consumer.close()
        log(f"consumer to topics {self._config['topics']} closed.")
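# ---------------------------------------------------------------------------
# Hypothetical usage of the KafkaConsumer wrapper above. The config layout
# mirrors the keys the class actually reads (config["consumer"]["conf"],
# ["topics"], and ["poll_timeout"]); the concrete values are assumptions for
# illustration only.
# ---------------------------------------------------------------------------
config = {
    "consumer": {
        "conf": {
            "bootstrap.servers": "localhost:9092",  # assumed broker
            "group.id": "example-group",            # assumed group id
            "enable.auto.commit": True,
        },
        "topics": {"main": "example-topic"},        # assumed topic mapping
        "poll_timeout": 1.0,
    }
}

consumer = KafkaConsumer(config)
consumer.subscribe()
try:
    while True:
        msg = consumer.poll()           # returns a processed message or None
        if msg is None:
            continue
        print(msg.value())
        consumer.commit_offset(msg)     # store_offsets() or explicit commit, per config
finally:
    consumer.close()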
def run(self) -> None:
    ac = ApiClient()
    api = public_api(self.api_host)

    # only used by container indexing query_stats code path
    es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend)

    def fail_fast(err: Any, partitions: List[Any]) -> None:
        if err is not None:
            print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
            print("Bailing out...", file=sys.stderr)
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        # print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print(
            "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
            file=sys.stderr,
        )

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update(
        {
            "group.id": self.consumer_group,
            "on_commit": fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            "enable.auto.commit": True,
            "enable.auto.offset.store": False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            "max.poll.interval.ms": 60000,
            "default.topic.config": {
                "auto.offset.reset": "latest",
            },
        }
    )
    consumer = Consumer(consumer_conf)
    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )

    while True:
        batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
        if not batch:
            if not consumer.assignment():
                print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
            print(
                "... nothing new from kafka, try again (interval: {})".format(
                    self.poll_interval
                ),
                file=sys.stderr,
            )
            continue
        print("... got {} kafka messages".format(len(batch)), file=sys.stderr)
        # first check errors on entire batch...
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())
        # ... then process
        bulk_actions = []
        for msg in batch:
            json_str = msg.value().decode("utf-8")
            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
            assert isinstance(entity, self.entity_type)
            if self.entity_type == ChangelogEntry:
                key = entity.index
                # might need to fetch from API
                if not (
                    entity.editgroup  # pylint: disable=no-member # (TODO)
                    and entity.editgroup.editor  # pylint: disable=no-member # (TODO)
                ):
                    entity = api.get_changelog_entry(entity.index)
            else:
                key = entity.ident  # pylint: disable=no-member # (TODO)
            if self.entity_type != ChangelogEntry and entity.state == "wip":
                print(
                    f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
                    file=sys.stderr,
                )
                continue
            if self.entity_type == ContainerEntity and self.query_stats:
                stats = query_es_container_stats(
                    entity.ident,
                    es_client=es_client,
                    es_index=self.elasticsearch_release_index,
                    merge_shadows=True,
                )
                doc_dict = container_to_elasticsearch(entity, stats=stats)
            else:
                doc_dict = self.transform_func(entity)
            # TODO: handle deletions from index
            bulk_actions.append(
                json.dumps(
                    {
                        "index": {
                            "_id": key,
                        },
                    }
                )
            )
            bulk_actions.append(json.dumps(doc_dict))

        # if only WIP entities, then skip
        if not bulk_actions:
            for msg in batch:
                consumer.store_offsets(message=msg)
            continue

        print(
            "Upserting, eg, {} (of {} {} in elasticsearch)".format(
                key, len(batch), self.entity_type.__name__
            ),
            file=sys.stderr,
        )
        elasticsearch_endpoint = "{}/{}/_bulk".format(
            self.elasticsearch_backend, self.elasticsearch_index
        )
        resp = requests.post(
            elasticsearch_endpoint,
            headers={"Content-Type": "application/x-ndjson"},
            data="\n".join(bulk_actions) + "\n",
        )
        resp.raise_for_status()
        if resp.json()["errors"]:
            desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
            print(desc, file=sys.stderr)
            print(resp.content, file=sys.stderr)
            raise Exception(desc)
        for msg in batch:
            # offsets are *committed* (to brokers) automatically, but need
            # to be marked as processed here
            consumer.store_offsets(message=msg)
class VerifiableConsumer(VerifiableClient):
    """
    confluent-kafka-python backed VerifiableConsumer class for use with
    Kafka's kafkatests client tests.
    """
    def __init__(self, conf):
        """
        conf is a config dict passed to confluent_kafka.Consumer()
        """
        super(VerifiableConsumer, self).__init__(conf)
        self.conf['on_commit'] = self.on_commit
        self.consumer = Consumer(**conf)
        self.consumed_msgs = 0
        self.consumed_msgs_last_reported = 0
        self.consumed_msgs_at_last_commit = 0
        self.use_auto_commit = False
        self.use_async_commit = False
        self.max_msgs = -1
        self.assignment = []
        self.assignment_dict = dict()

    def find_assignment(self, topic, partition):
        """ Find and return existing assignment based on topic and partition,
        or None on miss. """
        skey = '%s %d' % (topic, partition)
        return self.assignment_dict.get(skey)

    def send_records_consumed(self, immediate=False):
        """ Send records_consumed, every 100 messages, on timeout,
        or if immediate is set. """
        if self.consumed_msgs <= self.consumed_msgs_last_reported + (0 if immediate else 100):
            return

        if len(self.assignment) == 0:
            return

        d = {'name': 'records_consumed',
             'count': self.consumed_msgs - self.consumed_msgs_last_reported,
             'partitions': []}

        for a in self.assignment:
            if a.min_offset == -1:
                # Skip partitions that haven't had any messages since last time.
                # This is to circumvent some minOffset checks in kafkatest.
                continue
            d['partitions'].append(a.to_dict())
            a.min_offset = -1

        self.send(d)
        self.consumed_msgs_last_reported = self.consumed_msgs

    def send_assignment(self, evtype, partitions):
        """ Send assignment update, evtype is either 'assigned' or 'revoked' """
        d = {'name': 'partitions_' + evtype,
             'partitions': [{'topic': x.topic, 'partition': x.partition} for x in partitions]}
        self.send(d)

    def on_assign(self, consumer, partitions):
        """ Rebalance on_assign callback """
        old_assignment = self.assignment
        self.assignment = [AssignedPartition(p.topic, p.partition) for p in partitions]
        # Move over our last seen offsets so that we can report a proper
        # minOffset even after a rebalance loop.
        for a in old_assignment:
            b = self.find_assignment(a.topic, a.partition)
            b.min_offset = a.min_offset

        self.assignment_dict = {a.skey: a for a in self.assignment}
        self.send_assignment('assigned', partitions)

    def on_revoke(self, consumer, partitions):
        """ Rebalance on_revoke callback """
        # Send final consumed records prior to rebalancing to make sure
        # latest consumed is on par with what is going to be committed.
        self.send_records_consumed(immediate=True)
        self.do_commit(immediate=True, asynchronous=False)
        self.assignment = list()
        self.assignment_dict = dict()
        self.send_assignment('revoked', partitions)

    def on_commit(self, err, partitions):
        """ Offsets Committed callback """
        if err is not None and err.code() == KafkaError._NO_OFFSET:
            self.dbg('on_commit(): no offsets to commit')
            return

        # Report consumed messages to make sure consumed position >= committed position
        self.send_records_consumed(immediate=True)

        d = {'name': 'offsets_committed',
             'offsets': []}

        if err is not None:
            d['success'] = False
            d['error'] = str(err)
        else:
            d['success'] = True
            d['error'] = ''

        for p in partitions:
            pd = {'topic': p.topic, 'partition': p.partition, 'offset': p.offset}
            if p.error is not None:
                pd['error'] = str(p.error)
            d['offsets'].append(pd)

        if len(self.assignment) == 0:
            self.dbg('Not sending offsets_committed: No current assignment: would be: %s' % d)
            return

        self.send(d)

    def do_commit(self, immediate=False, asynchronous=None):
        """ Commit every 1000 messages or whenever there is a consume timeout
        or immediate. """
        if (self.use_auto_commit
                or self.consumed_msgs_at_last_commit + (0 if immediate else 1000) > self.consumed_msgs):
            return

        # Make sure we report consumption before commit,
        # otherwise tests may fail because of commit > consumed
        if self.consumed_msgs_at_last_commit < self.consumed_msgs:
            self.send_records_consumed(immediate=True)

        if asynchronous is None:
            async_mode = self.use_async_commit
        else:
            async_mode = asynchronous

        self.dbg('Committing %d messages (Async=%s)' %
                 (self.consumed_msgs - self.consumed_msgs_at_last_commit, async_mode))

        retries = 3
        while True:
            try:
                self.dbg('Commit')
                offsets = self.consumer.commit(asynchronous=async_mode)
                self.dbg('Commit done: offsets %s' % offsets)

                if not async_mode:
                    self.on_commit(None, offsets)

                break

            except KafkaException as e:
                if e.args[0].code() == KafkaError._NO_OFFSET:
                    self.dbg('No offsets to commit')
                    break
                elif e.args[0].code() in (KafkaError.REQUEST_TIMED_OUT,
                                          KafkaError.NOT_COORDINATOR_FOR_GROUP,
                                          KafkaError._WAIT_COORD):
                    self.dbg('Commit failed: %s (%d retries)' % (str(e), retries))
                    if retries <= 0:
                        raise
                    retries -= 1
                    time.sleep(1)
                    continue
                else:
                    raise

        self.consumed_msgs_at_last_commit = self.consumed_msgs

    def msg_consume(self, msg):
        """ Handle consumed message (or error event) """
        if msg.error():
            self.err('Consume failed: %s' % msg.error(), term=False)
            return

        if False:
            self.dbg('Read msg from %s [%d] @ %d' %
                     (msg.topic(), msg.partition(), msg.offset()))

        if self.max_msgs >= 0 and self.consumed_msgs >= self.max_msgs:
            return  # ignore extra messages

        # Find assignment.
        a = self.find_assignment(msg.topic(), msg.partition())
        if a is None:
            self.err('Received message on unassigned partition %s [%d] @ %d' %
                     (msg.topic(), msg.partition(), msg.offset()), term=True)

        a.consumed_msgs += 1
        if a.min_offset == -1:
            a.min_offset = msg.offset()
        if a.max_offset < msg.offset():
            a.max_offset = msg.offset()

        self.consumed_msgs += 1

        self.consumer.store_offsets(message=msg)
        self.send_records_consumed(immediate=False)
        self.do_commit(immediate=False)
def run(self) -> None:

    def fail_fast(err: Any, _msg: Any) -> None:
        if err is not None:
            print("Kafka producer delivery error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

    def on_commit(err: Any, partitions: List[Any]) -> None:
        if err is not None:
            print("Kafka consumer commit error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            print(p)
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions))

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update({
        "group.id": self.consumer_group,
        "on_commit": fail_fast,
        # messages don't have offset marked as stored until pushed to
        # elastic, but we do auto-commit stored offsets to broker
        "enable.auto.commit": True,
        "enable.auto.offset.store": False,
        # user code timeout; if no poll after this long, assume user code
        # hung and rebalance (default: 5min)
        "max.poll.interval.ms": 180000,
        "default.topic.config": {
            "auto.offset.reset": "latest",
        },
    })
    consumer = Consumer(consumer_conf)

    producer_conf = self.kafka_config.copy()
    producer_conf.update({
        "delivery.report.only.error": True,
        "default.topic.config": {
            "request.required.acks": -1,  # all brokers must confirm
        },
    })
    producer = Producer(producer_conf)

    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )
    print("Kafka consuming {}".format(self.consume_topic))

    while True:
        msg = consumer.poll(self.poll_interval)
        if not msg:
            print("nothing new from kafka (poll_interval: {} sec)".format(
                self.poll_interval))
            continue
        if msg.error():
            raise KafkaException(msg.error())

        cle = json.loads(msg.value().decode("utf-8"))
        # print(cle)
        print("processing changelog index {}".format(cle["index"]))
        release_ids = []
        new_release_ids = []
        file_ids = []
        fileset_ids = []
        webcapture_ids = []
        container_ids = []
        work_ids = []
        release_edits = cle["editgroup"]["edits"]["releases"]
        for re in release_edits:
            release_ids.append(re["ident"])
            # filter to direct release edits which are not updates
            if not re.get("prev_revision") and not re.get("redirect_ident"):
                new_release_ids.append(re["ident"])
        file_edits = cle["editgroup"]["edits"]["files"]
        for e in file_edits:
            file_ids.append(e["ident"])
        fileset_edits = cle["editgroup"]["edits"]["filesets"]
        for e in fileset_edits:
            fileset_ids.append(e["ident"])
        webcapture_edits = cle["editgroup"]["edits"]["webcaptures"]
        for e in webcapture_edits:
            webcapture_ids.append(e["ident"])
        container_edits = cle["editgroup"]["edits"]["containers"]
        for e in container_edits:
            container_ids.append(e["ident"])
        work_edits = cle["editgroup"]["edits"]["works"]
        for e in work_edits:
            work_ids.append(e["ident"])

        # TODO: do these fetches in parallel using a thread pool?
        for ident in set(file_ids):
            file_entity = self.api.get_file(ident, expand=None)
            # update release when a file changes
            # TODO: also fetch old version of file and update any *removed*
            # release idents (and same for filesets, webcapture updates)
            release_ids.extend(file_entity.release_ids or [])
            file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
            producer.produce(
                self.file_topic,
                json.dumps(file_dict).encode("utf-8"),
                key=ident.encode("utf-8"),
                on_delivery=fail_fast,
            )

        # TODO: topic for fileset updates
        for ident in set(fileset_ids):
            fileset_entity = self.api.get_fileset(ident, expand=None)
            # update release when a fileset changes
            release_ids.extend(fileset_entity.release_ids or [])

        # TODO: topic for webcapture updates
        for ident in set(webcapture_ids):
            webcapture_entity = self.api.get_webcapture(ident, expand=None)
            # update release when a webcapture changes
            release_ids.extend(webcapture_entity.release_ids or [])

        for ident in set(container_ids):
            container = self.api.get_container(ident)
            container_dict = self.api.api_client.sanitize_for_serialization(container)
            producer.produce(
                self.container_topic,
                json.dumps(container_dict).encode("utf-8"),
                key=ident.encode("utf-8"),
                on_delivery=fail_fast,
            )

        for ident in set(release_ids):
            release = self.api.get_release(
                ident, expand="files,filesets,webcaptures,container,creators")
            if release.work_id:
                work_ids.append(release.work_id)
            release_dict = self.api.api_client.sanitize_for_serialization(release)
            producer.produce(
                self.release_topic,
                json.dumps(release_dict).encode("utf-8"),
                key=ident.encode("utf-8"),
                on_delivery=fail_fast,
            )
            # for ingest requests, filter to "new" active releases with no matched files
            if release.ident in new_release_ids:
                ir = release_ingest_request(
                    release, ingest_request_source="fatcat-changelog")
                if ir and not release.files and self.want_live_ingest(release, ir):
                    producer.produce(
                        self.ingest_file_request_topic,
                        json.dumps(ir).encode("utf-8"),
                        # key=None,
                        on_delivery=fail_fast,
                    )

        # send work updates (just ident and changelog metadata) to scholar for re-indexing
        for ident in set(work_ids):
            assert ident
            key = f"work_{ident}"
            work_ident_dict = dict(
                key=key,
                type="fatcat_work",
                work_ident=ident,
                updated=cle["timestamp"],
                fatcat_changelog_index=cle["index"],
            )
            producer.produce(
                self.work_ident_topic,
                json.dumps(work_ident_dict).encode("utf-8"),
                key=key.encode("utf-8"),
                on_delivery=fail_fast,
            )

        producer.flush()
        # TODO: publish updated 'work' entities to a topic
        consumer.store_offsets(message=msg)