示例#1
0
def worker():
    """Consume ``topic`` forever, handing each message to ``process`` after a delay.

    A consumer is built from the module-level settings, appended to the
    global ``consumers`` and subscribed to ``topic``.  Each message is held
    back until ``delay_timedelta`` has elapsed since the message's own
    timestamp; only then is it passed to ``process`` and its offset stored.
    While waiting, the stored offset of the message's partition is committed
    on every wake-up as a keep-alive.

    This function never returns.
    """
    global consumers

    consumer = Consumer({
        'bootstrap.servers': bootstrap_servers,
        'group.id': consumer_group,
        'client.id': client_id,
        'default.topic.config': {'auto.offset.reset': 'earliest'},
        'enable.auto.offset.store': False,
        'session.timeout.ms': session_timeout_ms,
    })
    consumers.append(consumer)

    consumer.subscribe([topic])

    # The worker runs on a single thread, so the name is looked up once.
    thread_name = threading.current_thread().name

    while True:
        msg = consumer.poll(0)

        # `not msg` is kept from the original check.
        # NOTE(review): if Message defines __len__, this also skips messages
        # with a zero-length payload -- confirm that is intended.
        if msg is None or not msg:
            continue

        err = msg.error()
        if err:
            # End-of-partition events are routine; anything else is logged.
            if err.code() != KafkaError._PARTITION_EOF:
                logging.error("kafka consumer error: %s", err)
            continue

        msg_timestamp = datetime.fromtimestamp(msg.timestamp()[1] / 1000.0)
        # Earliest moment at which this message may be processed.
        release_at = msg_timestamp + delay_timedelta

        first_wait = True
        now = datetime.now()
        # loop/sleep to delay the message
        while now < release_at:
            diff_seconds = (release_at - now).total_seconds()

            # Announce the delay only once per message.
            if first_wait:
                logging.info(
                    "[%s] %s | received message on partition=%d, delaying for %fs",
                    thread_name, now.isoformat(), msg.partition(), diff_seconds)
                first_wait = False

            # sleep for {min_sleep_seconds}s...{kafka_keep_alive_seconds}s
            sleep_seconds = min(kafka_keep_alive_seconds,
                                max(min_sleep_seconds, diff_seconds))

            # use as 'keep alive' feedback for low (no) traffic periods... to
            # avoid connections getting dropped by brokers - resulting in a
            # group rebalance
            logging.debug(
                "[%s] %s | kafka keep alive commit partition=%d",
                thread_name, now.isoformat(), msg.partition())
            consumer.commit(offsets=[
                TopicPartition(topic=msg.topic(),
                               partition=msg.partition(),
                               offset=OFFSET_STORED)
            ])

            # go to sleep
            logging.debug(
                "[%s] %s | going to sleep for %fs / lag: %fs",
                thread_name, now.isoformat(), sleep_seconds, diff_seconds)
            sleep(sleep_seconds)
            now = datetime.now()

        process(thread_name, msg)
        consumer.store_offsets(msg)
示例#2
0
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    # One zero-argument call per consumer method under test; the label is
    # only used to identify the failing method in the assertion message.
    closed_calls = [
        ('subscribe', lambda: c.subscribe(['test'])),
        ('unsubscribe', lambda: c.unsubscribe()),
        ('poll', lambda: c.poll()),
        ('assign', lambda: c.assign([TopicPartition('test', 0)])),
        ('unassign', lambda: c.unassign()),
        ('assignment', lambda: c.assignment()),
        ('store_offsets',
         lambda: c.store_offsets(offsets=[TopicPartition("test", 0, 42)])),
        ('commit', lambda: c.commit()),
        ('committed', lambda: c.committed([TopicPartition("test", 0)])),
        ('position', lambda: c.position([TopicPartition("test", 0)])),
        ('get_watermark_offsets',
         lambda: c.get_watermark_offsets(TopicPartition("test", 0))),
    ]

    for name, call in closed_calls:
        with pytest.raises(RuntimeError) as ex:
            call()
        # The message check has to live outside the `with` block: statements
        # placed after the raising call inside the block are never executed.
        assert 'Consumer closed' == str(ex.value), name
示例#3
0
def test_store_offsets():
    """ Smoke test for store_offsets() on a subscribed consumer.

    A call that does not raise is accepted; if KafkaException is raised, its
    error code must be _UNKNOWN_PARTITION.
    """
    conf = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    c = Consumer(conf)
    c.subscribe(["test"])

    to_store = [TopicPartition("test", 0, 42)]
    try:
        c.store_offsets(offsets=to_store)
    except KafkaException as exc:
        kafka_error = exc.args[0]
        assert kafka_error.code() == KafkaError._UNKNOWN_PARTITION

    c.unsubscribe()
    c.close()
def test_calling_store_offsets_after_close_throws_erro():
    """ store_offsets() on a closed consumer must raise RuntimeError """
    conf = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    c = Consumer(conf)

    # Go through a full subscribe/unsubscribe cycle before closing.
    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    to_store = [TopicPartition("test", 0, 42)]
    with pytest.raises(RuntimeError) as ex:
        c.store_offsets(offsets=to_store)
    assert str(ex.value) == 'Consumer closed'
def test_store_offsets():
    """ Exercise store_offsets() once on a freshly subscribed consumer.

    Either the call succeeds, or the KafkaException it raises must carry the
    _UNKNOWN_PARTITION error code.
    """
    settings = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    c = Consumer(settings)
    c.subscribe(["test"])

    offsets = [TopicPartition("test", 0, 42)]
    try:
        c.store_offsets(offsets=offsets)
    except KafkaException as raised:
        code = raised.args[0].code()
        assert code == KafkaError._UNKNOWN_PARTITION

    c.unsubscribe()
    c.close()
def test_calling_store_offsets_after_close_throws_erro():
    """ Offset calls on a closed consumer must raise RuntimeError('Consumer closed') """
    settings = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    c = Consumer(settings)

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    # Checked in order: store_offsets() first, then offsets_for_times().
    closed_calls = (
        lambda: c.store_offsets(offsets=[TopicPartition("test", 0, 42)]),
        lambda: c.offsets_for_times([TopicPartition("test", 0)]),
    )
    for call in closed_calls:
        with pytest.raises(RuntimeError) as ex:
            call()
        assert str(ex.value) == 'Consumer closed'
示例#7
0
    def run(self):
        """Consume entity JSON from Kafka in batches and bulk-index it into Elasticsearch.

        Loops forever.  Each non-empty batch is checked for Kafka errors,
        converted into Elasticsearch bulk "index" actions, posted to the
        ``_bulk`` endpoint and, once the post succeeded, has its offsets
        marked as stored (they are committed to the brokers by auto-commit).

        Raises:
            KafkaException: on a consumed-message error, a commit error or a
                rebalance error.
            Exception: when the Elasticsearch bulk response reports errors
                (HTTP-level failures are raised by ``raise_for_status()``).
        """
        ac = ApiClient()

        def fail_fast(err, partitions):
            # Commit callback: abort on a whole-commit or per-partition error.
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    # `err` is None on this path, so the partition's own
                    # error is the one that has to be raised.
                    raise KafkaException(p.error)
            #print("Kafka consumer commit successful")

        def on_rebalance(consumer, partitions):
            # Shared on_assign/on_revoke callback: abort on partition errors.
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update({
            'group.id': self.consumer_group,
            'on_commit': fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            'enable.auto.commit': True,
            'enable.auto.offset.store': False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            'max.poll.interval.ms': 60000,
            'default.topic.config': {
                'auto.offset.reset': 'latest',
            },
        })
        consumer = Consumer(consumer_conf)
        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )

        while True:
            batch = consumer.consume(num_messages=self.batch_size,
                                     timeout=self.poll_interval)
            if not batch:
                if not consumer.assignment():
                    print("... no Kafka consumer partitions assigned yet")
                print("... nothing new from kafka, try again (interval: {}".
                      format(self.poll_interval))
                continue
            print("... got {} kafka messages".format(len(batch)))
            # first check errors on entire batch...
            for msg in batch:
                if msg.error():
                    raise KafkaException(msg.error())
            # ... then process
            bulk_actions = []
            # Last entity that was turned into a bulk action (for the log line).
            entity = None
            for msg in batch:
                json_str = msg.value().decode('utf-8')
                # HACK: work around a bug where container entities got published to
                # release_v03 topic
                if self.elasticsearch_document_name == "release":
                    entity_dict = json.loads(json_str)
                    if entity_dict.get(
                            'name') and not entity_dict.get('title'):
                        continue
                entity = entity_from_json(json_str,
                                          self.entity_type,
                                          api_client=ac)
                # TODO: handle deletions from index
                bulk_actions.append(
                    json.dumps({
                        "index": {
                            "_id": entity.ident,
                        },
                    }))
                bulk_actions.append(json.dumps(self.transform_func(entity)))
            # Every message may have been filtered out above; in that case
            # there is nothing to post (and no entity to report), but the
            # offsets below are still stored, as they are for skipped
            # messages in a mixed batch.
            if bulk_actions:
                print("Upserting, eg, {} (of {} releases in elasticsearch)".format(
                    entity.ident, len(batch)))
                elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
                    self.elasticsearch_backend, self.elasticsearch_index,
                    self.elasticsearch_document_name)
                resp = requests.post(
                    elasticsearch_endpoint,
                    headers={"Content-Type": "application/x-ndjson"},
                    data="\n".join(bulk_actions) + "\n")
                resp.raise_for_status()
                if resp.json()['errors']:
                    desc = "Elasticsearch errors from post to {}:".format(
                        elasticsearch_endpoint)
                    print(desc)
                    print(resp.content)
                    raise Exception(desc)
            for msg in batch:
                # offsets are *committed* (to brokers) automatically, but need
                # to be marked as processed here
                consumer.store_offsets(message=msg)
class VerifiableConsumer(VerifiableClient):
    """
    confluent-kafka-python backed VerifiableConsumer class for use with
    Kafka's kafkatests client tests.

    Tracks per-partition consumption state and reports consumed records,
    assignment changes and committed offsets through ``self.send()``.
    """
    def __init__(self, conf):
        """
        conf is a config dict passed to confluent_kafka.Consumer()
        """
        super(VerifiableConsumer, self).__init__(conf)
        self.conf["on_commit"] = self.on_commit
        self.consumer = Consumer(**conf)
        # Total messages consumed, and the totals at the time of the last
        # records_consumed report / last commit.
        self.consumed_msgs = 0
        self.consumed_msgs_last_reported = 0
        self.consumed_msgs_at_last_commit = 0
        self.use_auto_commit = False
        self.use_async_commit = False
        # A negative value means "no limit" (see msg_consume()).
        self.max_msgs = -1
        # Current assignment as a list, plus the same objects indexed by
        # their skey for lookup in find_assignment().
        self.assignment = []
        self.assignment_dict = dict()

    def find_assignment(self, topic, partition):
        """Find and return existing assignment based on topic and partition,
        or None on miss."""
        skey = "%s %d" % (topic, partition)
        return self.assignment_dict.get(skey)

    def send_records_consumed(self, immediate=False):
        """Send records_consumed, every 100 messages, on timeout,
        or if immediate is set."""
        if self.consumed_msgs <= self.consumed_msgs_last_reported + (
                0 if immediate else 100):
            return

        if len(self.assignment) == 0:
            return

        d = {
            "name": "records_consumed",
            "count": self.consumed_msgs - self.consumed_msgs_last_reported,
            "partitions": [],
        }

        for a in self.assignment:
            if a.min_offset == -1:
                # Skip partitions that havent had any messages since last time.
                # This is to circumvent some minOffset checks in kafkatest.
                continue
            d["partitions"].append(a.to_dict())
            # Reset so the next report only covers messages seen after this one.
            a.min_offset = -1

        self.send(d)
        self.consumed_msgs_last_reported = self.consumed_msgs

    def send_assignment(self, evtype, partitions):
        """ Send assignment update, evtype is either 'assigned' or 'revoked' """
        d = {
            "name":
            "partitions_" + evtype,
            "partitions": [{
                "topic": x.topic,
                "partition": x.partition
            } for x in partitions],
        }
        self.send(d)

    def on_assign(self, consumer, partitions):
        """ Rebalance on_assign callback """
        old_assignment = self.assignment
        self.assignment = [
            AssignedPartition(p.topic, p.partition) for p in partitions
        ]
        # Index the NEW assignment before the carry-over below, so that
        # find_assignment() resolves against it rather than the previous one.
        # Assumes AssignedPartition.skey uses the same "<topic> <partition>"
        # format that find_assignment() builds -- TODO confirm.
        self.assignment_dict = {a.skey: a for a in self.assignment}

        # Move over our last seen offsets so that we can report a proper
        # minOffset even after a rebalance loop.
        for a in old_assignment:
            b = self.find_assignment(a.topic, a.partition)
            # A previously assigned partition may be absent from the new set.
            if b is not None:
                b.min_offset = a.min_offset

        self.send_assignment("assigned", partitions)

    def on_revoke(self, consumer, partitions):
        """ Rebalance on_revoke callback """
        # Send final consumed records prior to rebalancing to make sure
        # latest consumed is in par with what is going to be committed.
        self.send_records_consumed(immediate=True)
        self.do_commit(immediate=True, asynchronous=False)
        self.assignment = list()
        self.assignment_dict = dict()
        self.send_assignment("revoked", partitions)

    def on_commit(self, err, partitions):
        """ Offsets Committed callback """
        if err is not None and err.code() == KafkaError._NO_OFFSET:
            self.dbg("on_commit(): no offsets to commit")
            return

        # Report consumed messages to make sure consumed position >= committed position
        self.send_records_consumed(immediate=True)

        d = {"name": "offsets_committed", "offsets": []}

        if err is not None:
            d["success"] = False
            d["error"] = str(err)
        else:
            d["success"] = True
            d["error"] = ""

        for p in partitions:
            pd = {
                "topic": p.topic,
                "partition": p.partition,
                "offset": p.offset
            }
            if p.error is not None:
                pd["error"] = str(p.error)
            d["offsets"].append(pd)

        if len(self.assignment) == 0:
            self.dbg(
                "Not sending offsets_committed: No current assignment: would be: %s"
                % d)
            return

        self.send(d)

    def do_commit(self, immediate=False, asynchronous=None):
        """Commit every 1000 messages or whenever there is a consume timeout
        or immediate.

        Does nothing when auto commit is in use.  ``asynchronous=None`` falls
        back to ``self.use_async_commit``.  Commit failures with a timeout or
        coordinator error code are retried (sleeping 1s between attempts)
        before the exception is re-raised; other KafkaExceptions are re-raised
        immediately, except _NO_OFFSET which is ignored.
        """
        if (self.use_auto_commit or self.consumed_msgs_at_last_commit +
            (0 if immediate else 1000) > self.consumed_msgs):
            return

        # Make sure we report consumption before commit,
        # otherwise tests may fail because of commit > consumed
        if self.consumed_msgs_at_last_commit < self.consumed_msgs:
            self.send_records_consumed(immediate=True)

        if asynchronous is None:
            async_mode = self.use_async_commit
        else:
            async_mode = asynchronous

        self.dbg("Committing %d messages (Async=%s)" %
                 (self.consumed_msgs - self.consumed_msgs_at_last_commit,
                  async_mode))

        retries = 3
        while True:
            try:
                self.dbg("Commit")
                offsets = self.consumer.commit(asynchronous=async_mode)
                self.dbg("Commit done: offsets %s" % offsets)

                # A synchronous commit is reported here by calling
                # on_commit() directly with the returned offsets.
                if not async_mode:
                    self.on_commit(None, offsets)

                break

            except KafkaException as e:
                if e.args[0].code() == KafkaError._NO_OFFSET:
                    self.dbg("No offsets to commit")
                    break
                elif e.args[0].code() in (
                        KafkaError.REQUEST_TIMED_OUT,
                        KafkaError.NOT_COORDINATOR,
                        KafkaError._WAIT_COORD,
                ):
                    self.dbg("Commit failed: %s (%d retries)" %
                             (str(e), retries))
                    if retries <= 0:
                        raise
                    retries -= 1
                    time.sleep(1)
                    continue
                else:
                    raise

        self.consumed_msgs_at_last_commit = self.consumed_msgs

    def msg_consume(self, msg):
        """ Handle consumed message (or error event) """
        if msg.error():
            self.err("Consume failed: %s" % msg.error(), term=False)
            return

        if self.verbose:
            self.send({
                "name": "record_data",
                "topic": msg.topic(),
                "partition": msg.partition(),
                "key": msg.key(),
                "value": msg.value(),
                "offset": msg.offset(),
            })

        if self.max_msgs >= 0 and self.consumed_msgs >= self.max_msgs:
            return  # ignore extra messages

        # Find assignment.
        a = self.find_assignment(msg.topic(), msg.partition())
        if a is None:
            self.err(
                "Received message on unassigned partition %s [%d] @ %d" %
                (msg.topic(), msg.partition(), msg.offset()),
                term=True,
            )
            # err(term=True) presumably terminates the client; should it ever
            # return, there is no assignment to update, so stop here instead
            # of dereferencing None below.
            return

        a.consumed_msgs += 1
        if a.min_offset == -1:
            a.min_offset = msg.offset()
        if a.max_offset < msg.offset():
            a.max_offset = msg.offset()

        self.consumed_msgs += 1

        self.consumer.store_offsets(message=msg)
        self.send_records_consumed(immediate=False)
        self.do_commit(immediate=False)
示例#9
0
    def run(self):
        """Fan changelog entries out to the per-entity Kafka topics.

        Polls ``self.consume_topic`` forever.  For every changelog entry the
        edited files, containers and releases (including releases referenced
        by edited files) are fetched through ``self.api`` and re-published to
        their topics; newly created releases without files may additionally
        get an ingest request.  The consumed offset is stored only after the
        producer has been flushed.

        Raises:
            KafkaException: on consume, delivery or rebalance errors.
        """

        def fail_fast(err, msg):
            if err is None:
                return
            print("Kafka producer delivery error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

        # NOTE(review): on_commit is defined but never registered; the
        # consumer's 'on_commit' setting below points at fail_fast instead.
        def on_commit(err, partitions):
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for tp in partitions:
                # check for partition-specific commit errors
                print(tp)
                if tp.error:
                    print("Kafka consumer commit error: {}".format(tp.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(tp.error)
            print("Kafka consumer commit successful")

        def on_rebalance(consumer, partitions):
            for tp in partitions:
                if tp.error:
                    raise KafkaException(tp.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        consumer_overrides = {
            'group.id': self.consumer_group,
            'on_commit': fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            'enable.auto.commit': True,
            'enable.auto.offset.store': False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            'max.poll.interval.ms': 180000,
            'default.topic.config': {
                'auto.offset.reset': 'latest',
            },
        }
        consumer_conf = self.kafka_config.copy()
        consumer_conf.update(consumer_overrides)
        consumer = Consumer(consumer_conf)

        producer_overrides = {
            'delivery.report.only.error': True,
            'default.topic.config': {
                'request.required.acks': -1, # all brokers must confirm
            },
        }
        producer_conf = self.kafka_config.copy()
        producer_conf.update(producer_overrides)
        producer = Producer(producer_conf)

        def publish(topic, obj, key=None):
            # JSON-encode obj and hand it to the producer; the key argument
            # is only passed along when one was given.
            payload = json.dumps(obj).encode('utf-8')
            extra = {}
            if key is not None:
                extra['key'] = key.encode('utf-8')
            producer.produce(topic, payload, on_delivery=fail_fast, **extra)

        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )
        print("Kafka consuming {}".format(self.consume_topic))

        while True:
            msg = consumer.poll(self.poll_interval)
            if not msg:
                print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval))
                continue
            kafka_err = msg.error()
            if kafka_err:
                raise KafkaException(kafka_err)

            cle = json.loads(msg.value().decode('utf-8'))
            #print(cle)
            print("processing changelog index {}".format(cle['index']))

            edits = cle['editgroup']['edits']
            release_edits = edits['releases']
            release_ids = [edit['ident'] for edit in release_edits]
            # filter to direct release edits which are not updates
            new_release_ids = [
                edit['ident'] for edit in release_edits
                if not (edit.get('prev_revision') or edit.get('redirect_ident'))
            ]
            file_ids = [edit['ident'] for edit in edits['files']]
            container_ids = [edit['ident'] for edit in edits['containers']]
            work_ids = [edit['ident'] for edit in edits['works']]

            # TODO: do these fetches in parallel using a thread pool?
            for ident in set(file_ids):
                file_entity = self.api.get_file(ident, expand=None)
                # update release when a file changes
                # TODO: fetch old revision as well, and only update
                # releases for which list changed
                release_ids.extend(file_entity.release_ids or [])
                publish(
                    self.file_topic,
                    self.api.api_client.sanitize_for_serialization(file_entity),
                    key=ident,
                )

            for ident in set(container_ids):
                container = self.api.get_container(ident)
                publish(
                    self.container_topic,
                    self.api.api_client.sanitize_for_serialization(container),
                    key=ident,
                )

            for ident in set(release_ids):
                release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
                work_ids.append(release.work_id)
                publish(
                    self.release_topic,
                    self.api.api_client.sanitize_for_serialization(release),
                    key=ident,
                )
                # filter to "new" active releases with no matched files
                if release.ident not in new_release_ids:
                    continue
                ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
                if ir and not release.files and self.want_live_ingest(release, ir):
                    # ingest requests are published without a key
                    publish(self.ingest_file_request_topic, ir)

            producer.flush()
            # TODO: publish updated 'work' entities to a topic
            consumer.store_offsets(message=msg)
class KafkaConsumer(BaseKafkaConsumer):
    """Wrapper around a confluent_kafka ``Consumer`` driven by a config dict.

    The ``config["consumer"]`` section is read for: ``conf`` (passed to the
    underlying Consumer), ``topics``, ``poll_timeout`` and the optional
    ``assign_offset_end`` and ``internal_log_path`` settings.
    """

    def __init__(self, config):
        """Build the underlying consumer from ``config["consumer"]``.

        Note that the ``conf`` dict inside ``config`` is modified in place:
        a ``group.id`` default, the ``error_cb`` callback and (optionally) a
        ``logger`` are written into it.
        """
        self._config = config["consumer"]
        self.assign_offset_end = self._config.get("assign_offset_end", False)
        conf = self._config["conf"]
        # Without an explicit group, use a fresh one per instance.
        conf.setdefault("group.id", str(uuid.uuid1()))
        self.autocommit_enabled = conf.get("enable.auto.commit", True)
        internal_log_path = self._config.get("internal_log_path")
        conf["error_cb"] = self._error_callback
        if internal_log_path:
            # Route the client's internal log into a per-process file.
            debug_logger = logging.getLogger("debug_consumer")
            timestamp = time.strftime("_%d%m%Y_")
            debug_logger.addHandler(
                logging.FileHandler("{}/kafka_consumer_debug{}{}.log".format(
                    internal_log_path, timestamp, os.getpid())))
            conf["logger"] = debug_logger
        self._consumer = Consumer(**conf)

    @staticmethod
    def on_assign_offset_end(consumer, partitions):
        """on_assign callback: log, then assign every partition at OFFSET_END."""
        for p in partitions:
            p.offset = OFFSET_END
        KafkaConsumer.on_assign_log(consumer, partitions)
        consumer.assign(partitions)

    @staticmethod
    def on_coop_assign_offset_end(consumer, partitions):
        """Same as on_assign_offset_end, but using incremental_assign()."""
        for p in partitions:
            p.offset = OFFSET_END
        KafkaConsumer.on_assign_log(consumer, partitions)
        consumer.incremental_assign(partitions)

    @staticmethod
    def on_assign_log(consumer, partitions):
        """Log the assigned partitions; level is ERROR if any carries an error."""
        # Normalise once so a missing partition list is handled the same way
        # in the error scan and in the log parameters.
        partitions = partitions or []
        log_level = "ERROR" if any(p.error for p in partitions) else "WARNING"
        params = {
            "partitions": str([str(partition) for partition in partitions]),
            log_const.KEY_NAME: log_const.KAFKA_ON_ASSIGN_VALUE,
            "log_level": log_level,
        }
        log("KafkaConsumer.subscribe<on_assign>: assign %(partitions)s %(log_level)s",
            params=params,
            level=log_level)

    def subscribe(self, topics=None):
        """Subscribe to ``topics``, defaulting to the configured topic values."""
        topics = topics or list(self._config["topics"].values())

        self._consumer.subscribe(
            topics,
            on_assign=self.get_on_assign_callback()
            if self.assign_offset_end else KafkaConsumer.on_assign_log)

    def get_on_assign_callback(self):
        """Return the OFFSET_END assign callback matching the assignment strategy."""
        if "cooperative" in self._config["conf"].get(
                "partition.assignment.strategy", ""):
            callback = KafkaConsumer.on_coop_assign_offset_end
        else:
            callback = KafkaConsumer.on_assign_offset_end
        return callback

    def unsubscribe(self):
        """Unsubscribe the underlying consumer."""
        self._consumer.unsubscribe()

    def poll(self):
        """Poll once; return the processed message, or None if there is none."""
        msg = self._consumer.poll(self._config["poll_timeout"])
        if msg is None:
            return None
        return self._process_message(msg)

    def consume(self, num_messages: int = 1):
        """Yield the processed result for up to ``num_messages`` messages.

        Yielded items can be None (see _process_message()).
        """
        messages = self._consumer.consume(num_messages=num_messages,
                                          timeout=self._config["poll_timeout"])
        for msg in messages:
            yield self._process_message(msg)

    def commit_offset(self, msg):
        """Mark ``msg`` as handled; does nothing when ``msg`` is None.

        With auto-commit enabled the offset is only stored; otherwise it is
        committed asynchronously.
        """
        if msg is None:
            return
        if self.autocommit_enabled:
            self._consumer.store_offsets(msg)
        else:
            self._consumer.commit(msg, asynchronous=True)

    def get_msg_create_time(self, mq_message):
        """Return the message timestamp, or None when it is not available."""
        timestamp_type, timestamp = mq_message.timestamp()
        # Compare by value: identity checks against a constant are unreliable.
        if timestamp_type != TIMESTAMP_NOT_AVAILABLE:
            return timestamp
        return None

    def _error_callback(self, err):
        """error_cb for the underlying consumer: log and count the error."""
        params = {
            "error": str(err),
            log_const.KEY_NAME: log_const.EXCEPTION_VALUE
        }
        log("KafkaConsumer: Error: %(error)s", params=params, level="WARNING")
        monitoring.got_counter("kafka_consumer_exception")

    # noinspection PyMethodMayBeStatic
    def _process_message(self, msg: KafkaMessage):
        """Validate a raw message.

        Returns the message (with headers defaulted to an empty list) when it
        has a non-empty value, and None for end-of-partition events or
        messages with an empty value.

        Raises:
            KafkaException: for any other message error, after logging and
                counting it.
        """
        err = msg.error()
        if err:
            if err.code() == KafkaError._PARTITION_EOF:
                return None
            monitoring.got_counter("kafka_consumer_exception")
            params = {
                "code": err.code(),
                "pid": os.getpid(),
                "topic": msg.topic(),
                "partition": msg.partition(),
                "offset": msg.offset(),
                log_const.KEY_NAME: log_const.EXCEPTION_VALUE
            }
            log(
                "KafkaConsumer Error %(code)s at pid %(pid)s: topic=%(topic)s partition=[%(partition)s] "
                "reached end at offset %(offset)s\n",
                params=params,
                level="WARNING")
            raise KafkaException(err)

        if not msg.value():
            return None
        if msg.headers() is None:
            msg.set_headers([])
        return msg

    def close(self):
        """Close the underlying consumer and log it."""
        self._consumer.close()
        log(f"consumer to topics {self._config['topics']} closed.")
示例#11
0
    def run(self) -> None:
        """
        Consume entity JSON from Kafka in batches and bulk-index it into
        Elasticsearch. Loops forever; the only way out is an exception.

        Per batch:

        1. fetch up to ``self.batch_size`` messages from ``self.consume_topic``
        2. abort if any message in the batch carries a Kafka error
        3. parse each message into ``self.entity_type``, skip "wip" entities,
           and transform the rest into Elasticsearch documents
        4. POST all documents to the ``_bulk`` endpoint of
           ``self.elasticsearch_index``
        5. mark the offsets of the whole batch as stored

        Offsets are only *stored* after a successful bulk request
        ("enable.auto.offset.store" is off), while stored offsets are committed
        to the brokers automatically ("enable.auto.commit" is on).

        Raises:
            KafkaException: on commit errors, partition errors reported during
                a rebalance, or an error attached to a consumed message.
            requests.HTTPError: from ``raise_for_status()`` when the bulk
                request returns an error status.
            Exception: when the bulk response body reports ``errors``.
        """
        ac = ApiClient()
        api = public_api(self.api_host)

        # only used by container indexing query_stats code path
        es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend)

        def fail_fast(err: Any, partitions: List[Any]) -> None:
            """Commit callback: raise on any global or per-partition commit error."""
            if err is not None:
                print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
                    print("Bailing out...", file=sys.stderr)
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)

        def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
            """Assign/revoke callback: raise on partition errors, else log the change."""
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print(
                "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
                file=sys.stderr,
            )

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update(
            {
                "group.id": self.consumer_group,
                "on_commit": fail_fast,
                # messages don't have offset marked as stored until pushed to
                # elastic, but we do auto-commit stored offsets to broker
                "enable.auto.commit": True,
                "enable.auto.offset.store": False,
                # user code timeout; if no poll after this long, assume user code
                # hung and rebalance (default: 5min)
                "max.poll.interval.ms": 60000,
                "default.topic.config": {
                    "auto.offset.reset": "latest",
                },
            }
        )
        consumer = Consumer(consumer_conf)
        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )

        while True:
            batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
            if not batch:
                if not consumer.assignment():
                    print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
                print(
                    "... nothing new from kafka, try again (interval: {})".format(
                        self.poll_interval
                    ),
                    file=sys.stderr,
                )
                continue
            print("... got {} kafka messages".format(len(batch)), file=sys.stderr)
            # first check errors on entire batch...
            for msg in batch:
                if msg.error():
                    raise KafkaException(msg.error())
            # ... then process
            # bulk_actions alternates an "index" action line and a document
            # line, as one JSON string each (newline-joined for the POST below)
            bulk_actions = []
            for msg in batch:
                json_str = msg.value().decode("utf-8")
                entity = entity_from_json(json_str, self.entity_type, api_client=ac)
                assert isinstance(entity, self.entity_type)
                if self.entity_type == ChangelogEntry:
                    key = entity.index
                    # might need to fetch from API
                    if not (
                        entity.editgroup  # pylint: disable=no-member # (TODO)
                        and entity.editgroup.editor  # pylint: disable=no-member # (TODO)
                    ):
                        entity = api.get_changelog_entry(entity.index)
                else:
                    key = entity.ident  # pylint: disable=no-member # (TODO)

                if self.entity_type != ChangelogEntry and entity.state == "wip":
                    print(
                        f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
                        file=sys.stderr,
                    )
                    continue

                if self.entity_type == ContainerEntity and self.query_stats:
                    stats = query_es_container_stats(
                        entity.ident,
                        es_client=es_client,
                        es_index=self.elasticsearch_release_index,
                        merge_shadows=True,
                    )
                    doc_dict = container_to_elasticsearch(entity, stats=stats)
                else:
                    doc_dict = self.transform_func(entity)

                # TODO: handle deletions from index
                bulk_actions.append(
                    json.dumps(
                        {
                            "index": {
                                "_id": key,
                            },
                        }
                    )
                )
                bulk_actions.append(json.dumps(doc_dict))

            # if only WIP entities, then skip
            if not bulk_actions:
                # nothing to index, but the batch still counts as processed
                for msg in batch:
                    consumer.store_offsets(message=msg)
                continue

            # `key` is whatever the last indexed entity of the batch used; it
            # is only printed as an example
            print(
                "Upserting, eg, {} (of {} {} in elasticsearch)".format(
                    key, len(batch), self.entity_type.__name__
                ),
                file=sys.stderr,
            )
            elasticsearch_endpoint = "{}/{}/_bulk".format(
                self.elasticsearch_backend, self.elasticsearch_index
            )
            resp = requests.post(
                elasticsearch_endpoint,
                headers={"Content-Type": "application/x-ndjson"},
                data="\n".join(bulk_actions) + "\n",
            )
            resp.raise_for_status()
            # the HTTP status can be fine while the response body still
            # reports errors, so check the "errors" field as well
            if resp.json()["errors"]:
                desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
                print(desc, file=sys.stderr)
                print(resp.content, file=sys.stderr)
                raise Exception(desc)
            for msg in batch:
                # offsets are *committed* (to brokers) automatically, but need
                # to be marked as processed here
                consumer.store_offsets(message=msg)
class VerifiableConsumer(VerifiableClient):
    """
    confluent-kafka-python backed VerifiableConsumer class for use with
    Kafka's kafkatests client tests.

    Tracks the current partition assignment and per-partition consumption,
    and reports events (records_consumed, partitions_assigned/revoked,
    offsets_committed) through ``self.send()``.
    """
    def __init__(self, conf):
        """
        conf is a config dict passed to confluent_kafka.Consumer()
        """
        super(VerifiableConsumer, self).__init__(conf)
        # Route commit results through on_commit() so they get reported.
        self.conf['on_commit'] = self.on_commit
        self.consumer = Consumer(**conf)
        # Message counters: total, at last records_consumed report, at last commit.
        self.consumed_msgs = 0
        self.consumed_msgs_last_reported = 0
        self.consumed_msgs_at_last_commit = 0
        self.use_auto_commit = False
        self.use_async_commit = False
        # Negative means "no limit" (see msg_consume()).
        self.max_msgs = -1
        # Current assignment as a list, plus the same objects indexed by skey.
        self.assignment = []
        self.assignment_dict = dict()

    def find_assignment(self, topic, partition):
        """ Find and return existing assignment based on topic and partition,
        or None on miss. """
        skey = '%s %d' % (topic, partition)
        return self.assignment_dict.get(skey)

    def send_records_consumed(self, immediate=False):
        """ Send records_consumed, every 100 messages, on timeout,
            or if immediate is set.

            Nothing is sent when there is nothing new to report or when
            there is no current assignment. """
        if self.consumed_msgs <= self.consumed_msgs_last_reported + (0 if immediate else 100):
            return

        if not self.assignment:
            return

        d = {'name': 'records_consumed',
             'count': self.consumed_msgs - self.consumed_msgs_last_reported,
             'partitions': []}

        for a in self.assignment:
            if a.min_offset == -1:
                # Skip partitions that haven't had any messages since last time.
                # This is to circumvent some minOffset checks in kafkatest.
                continue
            d['partitions'].append(a.to_dict())
            # Reset so the next report only covers messages seen from now on.
            a.min_offset = -1

        self.send(d)
        self.consumed_msgs_last_reported = self.consumed_msgs

    def send_assignment(self, evtype, partitions):
        """ Send assignment update, evtype is either 'assigned' or 'revoked' """
        d = {'name': 'partitions_' + evtype,
             'partitions': [{'topic': x.topic, 'partition': x.partition} for x in partitions]}
        self.send(d)

    def on_assign(self, consumer, partitions):
        """ Rebalance on_assign callback.

        Replaces the tracked assignment with `partitions`, carrying over the
        min_offset of any partition that was already assigned before. """
        old_assignment = self.assignment
        self.assignment = [AssignedPartition(p.topic, p.partition) for p in partitions]
        # Index the new assignment before any lookup: find_assignment() reads
        # assignment_dict, which would otherwise still describe the old one.
        self.assignment_dict = {a.skey: a for a in self.assignment}

        # Move over our last seen offsets so that we can report a proper
        # minOffset even after a rebalance loop.
        for a in old_assignment:
            b = self.find_assignment(a.topic, a.partition)
            if b is not None:
                # Partitions that were not re-assigned have nothing to carry to.
                b.min_offset = a.min_offset

        self.send_assignment('assigned', partitions)

    def on_revoke(self, consumer, partitions):
        """ Rebalance on_revoke callback """
        # Send final consumed records prior to rebalancing to make sure
        # latest consumed is in par with what is going to be committed.
        self.send_records_consumed(immediate=True)
        self.do_commit(immediate=True, asynchronous=False)
        self.assignment = list()
        self.assignment_dict = dict()
        self.send_assignment('revoked', partitions)

    def on_commit(self, err, partitions):
        """ Offsets Committed callback.

        Reports the commit result as an offsets_committed event, unless the
        commit had no offsets or there is no current assignment. """
        if err is not None and err.code() == KafkaError._NO_OFFSET:
            self.dbg('on_commit(): no offsets to commit')
            return

        # Report consumed messages to make sure consumed position >= committed position
        self.send_records_consumed(immediate=True)

        d = {'name': 'offsets_committed',
             'offsets': []}

        if err is not None:
            d['success'] = False
            d['error'] = str(err)
        else:
            d['success'] = True
            d['error'] = ''

        for p in partitions:
            pd = {'topic': p.topic, 'partition': p.partition, 'offset': p.offset}
            if p.error is not None:
                pd['error'] = str(p.error)
            d['offsets'].append(pd)

        if not self.assignment:
            self.dbg('Not sending offsets_committed: No current assignment: would be: %s' % d)
            return

        self.send(d)

    def do_commit(self, immediate=False, asynchronous=None):
        """ Commit every 1000 messages or whenever there is a consume timeout
            or immediate.

            asynchronous=None falls back to self.use_async_commit.
            Does nothing when auto commit is in use. Retriable commit
            failures are retried up to 3 times, one second apart; any other
            KafkaException (except "no offset") is re-raised. """
        if (self.use_auto_commit
                or self.consumed_msgs_at_last_commit + (0 if immediate else 1000) >
                self.consumed_msgs):
            return

        # Make sure we report consumption before commit,
        # otherwise tests may fail because of commit > consumed
        if self.consumed_msgs_at_last_commit < self.consumed_msgs:
            self.send_records_consumed(immediate=True)

        if asynchronous is None:
            async_mode = self.use_async_commit
        else:
            async_mode = asynchronous

        self.dbg('Committing %d messages (Async=%s)' %
                 (self.consumed_msgs - self.consumed_msgs_at_last_commit,
                  async_mode))

        retries = 3
        while True:
            try:
                self.dbg('Commit')
                offsets = self.consumer.commit(asynchronous=async_mode)
                self.dbg('Commit done: offsets %s' % offsets)

                if not async_mode:
                    # A synchronous commit returns the offsets directly, so
                    # report them through the same path as the callback.
                    self.on_commit(None, offsets)

                break

            except KafkaException as e:
                code = e.args[0].code()
                if code == KafkaError._NO_OFFSET:
                    self.dbg('No offsets to commit')
                    break
                elif code in (KafkaError.REQUEST_TIMED_OUT,
                              KafkaError.NOT_COORDINATOR_FOR_GROUP,
                              KafkaError._WAIT_COORD):
                    self.dbg('Commit failed: %s (%d retries)' % (str(e), retries))
                    if retries <= 0:
                        raise
                    retries -= 1
                    time.sleep(1)
                    continue
                else:
                    raise

        self.consumed_msgs_at_last_commit = self.consumed_msgs

    def msg_consume(self, msg):
        """ Handle consumed message (or error event).

        Updates the per-partition and global counters, stores the message
        offset, and triggers the periodic report and commit. """
        if msg.error():
            self.err('Consume failed: %s' % msg.error(), term=False)
            return

        if self.max_msgs >= 0 and self.consumed_msgs >= self.max_msgs:
            return  # ignore extra messages

        # Find assignment.
        a = self.find_assignment(msg.topic(), msg.partition())
        if a is None:
            self.err('Received message on unassigned partition %s [%d] @ %d' %
                     (msg.topic(), msg.partition(), msg.offset()), term=True)
            # err(term=True) is presumably fatal (TODO confirm); if it does
            # return, there is no assignment object to update, so stop here.
            return

        a.consumed_msgs += 1
        if a.min_offset == -1:
            a.min_offset = msg.offset()
        if a.max_offset < msg.offset():
            a.max_offset = msg.offset()

        self.consumed_msgs += 1

        self.consumer.store_offsets(message=msg)
        self.send_records_consumed(immediate=False)
        self.do_commit(immediate=False)
示例#13
0
    def run(self) -> None:
        """
        Consume changelog entries from Kafka and fan out entity updates.

        Loops forever; the only way out is an exception. For every changelog
        entry read from ``self.consume_topic`` this:

        - collects the idents of all edited releases, files, filesets,
          webcaptures, containers and works
        - re-fetches files and containers from the API and publishes them to
          ``self.file_topic`` / ``self.container_topic``
        - adds the releases referenced by edited files, filesets and
          webcaptures to the set of releases to update
        - re-fetches every affected release and publishes it to
          ``self.release_topic``; for releases newly created by this entry
          an ingest request may additionally go to
          ``self.ingest_file_request_topic``
        - publishes one small work-ident record per affected work to
          ``self.work_ident_topic``
        - flushes the producer and only then marks the consumer offset as
          stored, so the entry is not marked processed before its updates
          are flushed

        Raises:
            KafkaException: on producer delivery errors, consumer commit
                errors (global or per-partition), partition errors during a
                rebalance, or an error attached to a consumed message.
        """
        def fail_fast(err: Any, _msg: Any) -> None:
            """Producer delivery callback: raise on any delivery error."""
            if err is not None:
                print("Kafka producer delivery error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        def on_commit(err: Any, partitions: List[Any]) -> None:
            """Consumer commit callback: raise on global or per-partition errors.

            Silent on success, so successful commits stay quiet.
            """
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)

        def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
            """Assign/revoke callback: raise on partition errors, else log the change."""
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update({
            "group.id": self.consumer_group,
            # Use the commit-specific handler: the producer delivery callback
            # would mislabel commit errors and never look at the per-partition
            # error fields.
            "on_commit": on_commit,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            "enable.auto.commit": True,
            "enable.auto.offset.store": False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            "max.poll.interval.ms": 180000,
            "default.topic.config": {
                "auto.offset.reset": "latest",
            },
        })
        consumer = Consumer(consumer_conf)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            "delivery.report.only.error": True,
            "default.topic.config": {
                "request.required.acks": -1,  # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )
        print("Kafka consuming {}".format(self.consume_topic))

        while True:
            msg = consumer.poll(self.poll_interval)
            if not msg:
                print("nothing new from kafka (poll_interval: {} sec)".format(
                    self.poll_interval))
                continue
            if msg.error():
                raise KafkaException(msg.error())

            cle = json.loads(msg.value().decode("utf-8"))
            # print(cle)
            print("processing changelog index {}".format(cle["index"]))

            release_edits = cle["editgroup"]["edits"]["releases"]
            release_ids = [edit["ident"] for edit in release_edits]
            # filter to direct release edits which are not updates
            new_release_ids = {
                edit["ident"]
                for edit in release_edits
                if not edit.get("prev_revision") and not edit.get("redirect_ident")
            }
            file_ids = [e["ident"] for e in cle["editgroup"]["edits"]["files"]]
            fileset_ids = [e["ident"] for e in cle["editgroup"]["edits"]["filesets"]]
            webcapture_ids = [e["ident"] for e in cle["editgroup"]["edits"]["webcaptures"]]
            container_ids = [e["ident"] for e in cle["editgroup"]["edits"]["containers"]]
            work_ids = [e["ident"] for e in cle["editgroup"]["edits"]["works"]]

            # TODO: do these fetches in parallel using a thread pool?
            for ident in set(file_ids):
                file_entity = self.api.get_file(ident, expand=None)
                # update release when a file changes
                # TODO: also fetch old version of file and update any *removed*
                # release idents (and same for filesets, webcapture updates)
                release_ids.extend(file_entity.release_ids or [])
                file_dict = self.api.api_client.sanitize_for_serialization(
                    file_entity)
                producer.produce(
                    self.file_topic,
                    json.dumps(file_dict).encode("utf-8"),
                    key=ident.encode("utf-8"),
                    on_delivery=fail_fast,
                )

            # TODO: topic for fileset updates
            for ident in set(fileset_ids):
                fileset_entity = self.api.get_fileset(ident, expand=None)
                # update release when a fileset changes
                release_ids.extend(fileset_entity.release_ids or [])

            # TODO: topic for webcapture updates
            for ident in set(webcapture_ids):
                webcapture_entity = self.api.get_webcapture(ident, expand=None)
                # update release when a webcapture changes
                release_ids.extend(webcapture_entity.release_ids or [])

            for ident in set(container_ids):
                container = self.api.get_container(ident)
                container_dict = self.api.api_client.sanitize_for_serialization(
                    container)
                producer.produce(
                    self.container_topic,
                    json.dumps(container_dict).encode("utf-8"),
                    key=ident.encode("utf-8"),
                    on_delivery=fail_fast,
                )

            # release_ids now holds directly edited releases plus those
            # referenced by edited files, filesets and webcaptures
            for ident in set(release_ids):
                release = self.api.get_release(
                    ident,
                    expand="files,filesets,webcaptures,container,creators")
                if release.work_id:
                    work_ids.append(release.work_id)
                release_dict = self.api.api_client.sanitize_for_serialization(
                    release)
                producer.produce(
                    self.release_topic,
                    json.dumps(release_dict).encode("utf-8"),
                    key=ident.encode("utf-8"),
                    on_delivery=fail_fast,
                )
                # for ingest requests, filter to "new" active releases with no matched files
                if release.ident in new_release_ids:
                    ir = release_ingest_request(
                        release, ingest_request_source="fatcat-changelog")
                    if ir and not release.files and self.want_live_ingest(
                            release, ir):
                        producer.produce(
                            self.ingest_file_request_topic,
                            json.dumps(ir).encode("utf-8"),
                            # key=None,
                            on_delivery=fail_fast,
                        )

            # send work updates (just ident and changelog metadata) to scholar for re-indexing
            for ident in set(work_ids):
                assert ident
                key = f"work_{ident}"
                work_ident_dict = dict(
                    key=key,
                    type="fatcat_work",
                    work_ident=ident,
                    updated=cle["timestamp"],
                    fatcat_changelog_index=cle["index"],
                )
                producer.produce(
                    self.work_ident_topic,
                    json.dumps(work_ident_dict).encode("utf-8"),
                    key=key.encode("utf-8"),
                    on_delivery=fail_fast,
                )

            # flush before storing the offset, so the entry is not marked
            # processed until the flush call has returned
            producer.flush()
            # TODO: publish updated 'work' entities to a topic
            consumer.store_offsets(message=msg)