Exemplo n.º 1
0
def analytics_internet3_logs():
    """Maintain a per-minute access counter in Redis from the 'haproxy_logs' topic.

    Every record whose decoded text splits into at least 17
    whitespace-separated fields increments the Redis key
    ``internet_access_minute_<YYYYmmddHHMM>`` (local time) and sets its TTL
    to one hour. Processing errors for a single record are logged and the
    loop moves on; any other exception ends the loop. The consumer is always
    closed on the way out.
    """
    settings = {
        'bootstrap.servers': kafka_hosts,
        'group.id': 'Internet3_logs_%s' % dt,
        'default.topic.config': {'auto.offset.reset': 'latest', 'auto.commit.enable': 'true'},
    }
    consumer = Consumer(settings)
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            record = consumer.poll()
            failure = record.error()
            if failure:
                # End-of-partition events are routine; everything else is logged.
                if failure.code() != KafkaError._PARTITION_EOF:
                    logging.error(failure)
                continue

            # Decoding happens outside the per-record guard, so a payload that
            # is not valid UTF-8 terminates the loop via the outer handler.
            line = record.value().decode('utf-8').strip()
            try:
                minute_stamp = time.strftime('%Y%m%d%H%M', time.localtime())
                if len(line.split()) < 17:
                    continue
                counter_key = 'internet_access_minute_%s' % minute_stamp
                RC.incr(counter_key)
                RC.expire(counter_key, 3600)
            except Exception as exc:
                logging.error(exc)
    except Exception as exc:
        logging.error(exc)
    finally:
        consumer.close()
def test_any_method_after_close_throws_exception():
    """Calling any consumer method after close() should throw a RuntimeError."""
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    # Each entry performs one API call on the closed consumer. Arguments are
    # built lazily inside the lambda so they are evaluated within the
    # pytest.raises context, exactly where the call itself happens.
    calls_on_closed_consumer = (
        lambda: c.subscribe(['test']),
        lambda: c.unsubscribe(),
        lambda: c.poll(),
        lambda: c.consume(),
        lambda: c.assign([TopicPartition('test', 0)]),
        lambda: c.unassign(),
        lambda: c.assignment(),
        lambda: c.commit(),
        lambda: c.committed([TopicPartition("test", 0)]),
        lambda: c.position([TopicPartition("test", 0)]),
        lambda: c.seek([TopicPartition("test", 0, 0)]),
        lambda: c.get_watermark_offsets(TopicPartition("test", 0)),
    )

    for invoke in calls_on_closed_consumer:
        with pytest.raises(RuntimeError) as ex:
            invoke()
        assert 'Consumer closed' == str(ex.value)
def test_basic_api():
    """Basic API tests; these won't really do anything since there is no
    broker configured.

    Exercises construction, subscribe/unsubscribe (with and without
    rebalance callbacks), poll, assign/unassign, commit, position and
    committed, accepting the timeout-style errors expected without a broker.
    """

    # Constructing a Consumer without a configuration dict must be rejected.
    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    partitions = [TopicPartition("test", p) for p in range(0, 100, 3)]
    kc.assign(partitions)

    kc.unassign()

    # `async` is a reserved word since Python 3.7; the keyword argument is
    # spelled `asynchronous`, as the other tests in this file already do.
    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == -1001]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
Exemplo n.º 4
0
def consume():
    """Print every message received on ``KAFKA_TOPIC`` until interrupted.

    Polls forever; messages that carry an error are skipped without output
    (as before). The consumer is closed when the loop is left through an
    exception such as ``KeyboardInterrupt`` -- previously the ``close()``
    call sat after the infinite loop and could never run.
    """
    c = Consumer({'bootstrap.servers': KAFKA_SERVER, 'group.id': 'mygroup',
                  'default.topic.config': {'auto.offset.reset': 'smallest'}})
    c.subscribe([KAFKA_TOPIC])
    try:
        while True:
            msg = c.poll()
            # poll() yields None when nothing was delivered; guard before
            # touching the message API.
            if msg is None or msg.error():
                continue
            print('Received message: %s' % msg.value().decode('utf-8'))
    finally:
        c.close()
def subscribe():
    """Print messages from the 'neuronraindata' topic until interrupted.

    Non-empty, error-free messages are decoded as UTF-8 and printed; for
    anything else the message's error (possibly ``None``) is printed. The
    consumer is closed when the loop exits through an exception.
    """
    c = Consumer({'bootstrap.servers': '0', 'group.id': 'test-consumer-group',
                  'default.topic.config': {'auto.offset.reset': 'smallest'}})
    c.subscribe(['neuronraindata'])
    try:
        while True:
            msg = c.poll()
            if msg is None:
                continue
            if not msg.error() and msg.value():
                # The payload is bytes: it has to be decoded, not encoded.
                print('Received message: ' , msg.value().decode("utf-8"))
            else:
                print(msg.error())
    finally:
        # Previously unreachable because it followed the infinite loop.
        c.close()
Exemplo n.º 6
0
class KafkaWorkflowResultsReceiver(object):
    """Consumes workflow result messages from Kafka and re-emits them as events."""

    _requires = ['confluent-kafka']

    def __init__(self, message_converter=ProtobufWorkflowResultsConverter, current_app=None):
        """Create the Kafka consumer and resolve the Flask application to use.

        Args:
            message_converter: Object providing ``to_event_callback(bytes)``,
                which returns an ``(event, sender, data)`` triple.
            current_app: Flask application whose app context is entered while
                dispatching events. When ``None`` a standalone app configured
                from ``walkoff.config.Config`` is created.
        """
        # Imported for its side effects only (flagged as required by the
        # original author); the name itself is not used here.
        import walkoff.server.workflowresults  # Need this import

        self.thread_exit = False  # set to True to stop receive_results()

        kafka_config = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_CONFIG
        self.receiver = Consumer(kafka_config)
        self.topic = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_TOPIC
        self.message_converter = message_converter
        self.workflows_executed = 0

        if current_app is None:
            self.current_app = Flask(__name__)
            self.current_app.config.from_object(walkoff.config.Config)
            self.current_app.running_context = context.Context(init_all=False)
        else:
            self.current_app = current_app

    def receive_results(self):
        """Constantly receives data from the Kafka Consumer and handles it accordingly.

        Runs until ``thread_exit`` becomes true. The consumer is closed when
        the loop ends, including when dispatching a message raises.
        """
        logger.info('Starting Kafka workflow results receiver')
        self.receiver.subscribe(['{}.*'.format(self.topic)])
        try:
            while not self.thread_exit:
                raw_message = self.receiver.poll(1.0)
                if raw_message is None:
                    gevent.sleep(0.1)
                    continue
                error = raw_message.error()
                if error:
                    # Reaching the end of a partition is routine; anything
                    # else is logged. Either way, yield briefly and retry.
                    if error.code() != KafkaError._PARTITION_EOF:
                        logger.error('Received an error in Kafka receiver: {}'.format(error))
                    gevent.sleep(0.1)
                    continue
                with self.current_app.app_context():
                    self._send_callback(raw_message.value())
        finally:
            self.receiver.close()

    def _send_callback(self, message_bytes):
        """Convert a raw payload to an event and send it inside the app context.

        Workflow shutdown/abort events additionally bump the executed-workflow
        counter. Payloads that convert to a ``None`` event or sender are dropped.
        """
        event, sender, data = self.message_converter.to_event_callback(message_bytes)

        if sender is not None and event is not None:
            with self.current_app.app_context():
                event.send(sender, data=data)
            if event in [WalkoffEvent.WorkflowShutdown, WalkoffEvent.WorkflowAborted]:
                self._increment_execution_count()

    def _increment_execution_count(self):
        """Record that one more workflow has finished executing."""
        self.workflows_executed += 1
def test_offsets_for_times():
    """offsets_for_times() without a broker may only fail with a timeout-style error."""
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})
    try:
        # Query broker for timestamps for partition
        try:
            test_topic_partition = TopicPartition("test", 0, 100)
            c.offsets_for_times([test_topic_partition], timeout=0.1)
        except KafkaException as e:
            # e.args is a tuple: index it. Calling it (as before) raised a
            # TypeError that masked the real assertion failure message.
            assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD,
                                        KafkaError.LEADER_NOT_AVAILABLE), \
                str(e.args[0])
    finally:
        # Close the consumer even when the assertion above fails.
        c.close()
Exemplo n.º 8
0
def analytics_intranet_logs():
    """Aggregate 'haproxy2_logs' records into Redis statistics.

    For every record with at least 18 whitespace-separated fields, a
    ``{...}`` section, and a host containing '.baihe.com', this updates:

    * hourly and per-minute sorted sets of request URLs,
    * a daily set of seen hosts,
    * a daily per-host request counter,
    * a daily per-host list of response times (zero values are not stored).

    All touched keys get a one-hour TTL. Errors while processing a single
    record are logged and the loop continues; any other exception ends the
    loop. The consumer is always closed on exit.
    """
    consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Intranet_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy2_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    # Take one clock snapshot so the day/hour/minute keys
                    # cannot disagree across a boundary.
                    now = time.localtime()
                    tt = time.strftime('%Y%m%d', now)
                    th = time.strftime('%Y%m%d%H', now)
                    tm = time.strftime('%Y%m%d%H%M', now)
                    H_key = 'haproxy2_topic_%s' % tt
                    top2_url_hour = 'top2_url_hour_%s' % th
                    top2_url_minute = 'top2_url_minute_%s' % tm
                    fields = Msg.split()
                    # Field 17 is read below, so at least 18 fields are
                    # required (the old `>= 17` check allowed an IndexError).
                    if len(fields) >= 18:
                        val = Msg.split('{')
                        if len(val) >= 2:
                            Topic = val[1].split('}')[0]
                            # 9th field is '/'-separated; its 5th component is
                            # stored as the response time.
                            Rtime = int(val[0].split()[8].split('/')[4])
                            if ':' in Topic:
                                Topic = str(Topic.split(':')[0])
                            if '|' in Topic:
                                Topic = str(Topic.split('|')[0])
                            if '.baihe.com' in Topic:
                                Key = 'haproxy2_logs_%s_%s' % (tt, Topic)
                                Rt_Key = 'Rtime2_%s_%s' % (tt, Topic)
                                # Endpoint: request path without the query string.
                                PATH = str(fields[17]).split('?')[0]
                                URL = 'http://%s%s' % (Topic, PATH)
                                # NOTE(review): argument order (name, value, amount)
                                # matches redis-py 2.x; confirm against the
                                # installed client version.
                                RC.zincrby(top2_url_hour, URL, 1)
                                RC.zincrby(top2_url_minute, URL, 1)
                                RC.sadd(H_key, Topic)
                                RC.incr(Key)
                                if Rtime:
                                    RC.lpush(Rt_Key, Rtime)
                                # Set TTLs only after the writes: EXPIRE on a
                                # key that does not exist yet is a no-op, which
                                # left keys written only once without a TTL.
                                for KEY in (H_key, Key, Rt_Key, top2_url_hour, top2_url_minute):
                                    RC.expire(KEY, 3600)
                except Exception as e:
                    logging.error(e)
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
Exemplo n.º 9
0
class KafkaWorkflowCommunicationReceiver(object):
    """Receives communication via Kafka and sends it to the executing workflow"""
    _requires = ['confluent-kafka']

    def __init__(self, message_converter=ProtobufWorkflowCommunicationConverter):
        """Create the Kafka consumer from the workflow-communication settings.

        Args:
            message_converter: Object providing ``to_received_message(bytes)``.
        """
        self._ready = False

        kafka_config = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_CONFIG
        self.receiver = Consumer(kafka_config)
        self.topic = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_TOPIC
        self.message_converter = message_converter
        self.exit = False  # set by shutdown() to stop receive_communications()

        if self.check_status():
            self._ready = True

    def shutdown(self):
        """Ask the receive loop to stop and close the Kafka consumer."""
        self.exit = True
        self.receiver.close()

    def receive_communications(self):
        """Constantly receives data from the Kafka and handles it accordingly.

        Yields:
            Each converted message. The generator finishes when ``shutdown()``
            has been called or when the converter returns ``None``.
        """
        logger.info('Starting workflow communication receiver')
        while not self.exit:
            raw_message = self.receiver.poll(1.0)
            if raw_message is None:
                continue
            error = raw_message.error()
            if error:
                # End-of-partition is routine; other errors are logged and skipped.
                if error.code() != KafkaError._PARTITION_EOF:
                    logger.error('Received an error in Kafka receiver: {}'.format(error))
                continue

            message = self.message_converter.to_received_message(raw_message.value())
            if message is None:
                break
            yield message

        # Finish by returning: an explicit `raise StopIteration` inside a
        # generator is turned into RuntimeError on Python 3.7+ (PEP 479).
        return

    def is_ready(self):
        """Return True when the receiver was constructed successfully."""
        return self._ready

    def check_status(self):
        """Return True when a Kafka consumer object exists."""
        return self.receiver is not None
Exemplo n.º 10
0
def analytics_internet_logs():
    """Aggregate 'haproxy_logs' records into daily/hourly Redis statistics.

    Every non-empty record increments the daily page-view counter. Records
    with at least 17 whitespace-separated fields additionally update the
    daily host set, the per-host counter, and the per-IP, per-IP-endpoint
    and per-endpoint sorted sets. All touched keys get a one-hour TTL.

    Errors while processing a single record are logged and the loop
    continues; any other exception ends the loop. The consumer is always
    closed on exit.
    """
    consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Internet_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    # One clock snapshot keeps the day and hour keys consistent.
                    now = time.localtime()
                    tt = time.strftime('%Y%m%d', now)
                    th = time.strftime('%Y%m%d%H', now)
                    pv_key = 'baihe_pv_%s' % tt
                    if Msg:
                        Msg = Msg.split()
                        RC.incr(pv_key)
                        RC.expire(pv_key, 3600)
                        if len(Msg) >= 17:
                            Topic = str(Msg[14]).split('|')[0].replace('{', '').strip()
                            IP = str(Msg[5])
                            H_key = 'haproxy_topic_%s' % tt
                            top_ip = 'top_ip_%s' % tt
                            top_ip_hour = 'top_ip_%s' % th
                            top_url_hour = 'top_url_%s' % th
                            PATH = str(Msg[16]).split('?')[0]
                            URL = 'http://%s%s' % (Topic, PATH)
                            Ha_Key = 'haproxy_logs_%s_%s' % (tt, Topic)
                            top_ip_domain = 'top_%s_domain_%s' % (IP, tt)
                            top_ip_domain_hour = 'top_%s_domain_%s' % (IP, th)
                            RC.sadd(H_key, Topic)
                            RC.incr(Ha_Key)
                            # NOTE(review): zincrby argument order (name, value,
                            # amount) matches redis-py 2.x; confirm against the
                            # installed client version.
                            # Per client IP
                            RC.zincrby(top_ip, IP, 1)
                            RC.zincrby(top_ip_hour, IP, 1)
                            # Per client IP: requested endpoints
                            RC.zincrby(top_ip_domain, URL, 1)
                            RC.zincrby(top_ip_domain_hour, URL, 1)
                            # Per endpoint
                            RC.zincrby(top_url_hour, URL, 1)
                            # Set TTLs only after the writes: EXPIRE on a key
                            # that does not exist yet is a no-op, which left
                            # keys written only once (e.g. rare IPs) without a TTL.
                            for KEY in (H_key, top_ip, top_url_hour, top_ip_hour,
                                        Ha_Key, top_ip_domain, top_ip_domain_hour):
                                RC.expire(KEY, 3600)
                except Exception as e:
                    # Was a bare `except: continue`, which hid every failure
                    # (and also caught KeyboardInterrupt/SystemExit).
                    logging.error(e)
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def test_multiple_close_throw_exception():
    """A second Consumer.close() must raise RuntimeError('Consumer already closed')."""
    settings = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    consumer = Consumer(settings)

    consumer.subscribe(["test"])
    consumer.unsubscribe()
    consumer.close()  # first close succeeds

    with pytest.raises(RuntimeError) as second_close:
        consumer.close()
    assert str(second_close.value) == 'Consumer already closed'
Exemplo n.º 12
0
def WAF_logs():
    """Feed per-minute top-IP / top-URL sorted sets in Redis from 'haproxy_logs'.

    Only records with at least 17 whitespace-separated fields whose 10th
    field is one of the accepted status codes are counted. Each of the four
    sorted sets is incremented and then given a 300-second TTL. Errors while
    processing a single record are logged and skipped; any other exception
    ends the loop, and the consumer is always closed.
    """
    accepted_codes = ('200', '206', '301', '302', '304', '404')
    consumer = Consumer({
        'bootstrap.servers': kafka_hosts,
        'group.id': 'Waf_logs_%s' % dt,
        'default.topic.config': {'auto.offset.reset': 'latest', 'auto.commit.enable': 'true'},
    })
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            record = consumer.poll()
            failure = record.error()
            if failure:
                # End-of-partition events are routine; everything else is logged.
                if failure.code() != KafkaError._PARTITION_EOF:
                    logging.error(failure)
                continue

            # Decoded outside the per-record guard: a bad payload ends the loop.
            line = record.value().decode('utf-8').strip()
            try:
                minute = time.strftime('%Y%m%d%H%M', time.localtime())
                fields = line.split()
                if len(fields) < 17 or fields[9] not in accepted_codes:
                    continue

                host = str(fields[14]).split('|')[0].replace('{', '').strip()
                client_ip = str(fields[5])
                path = str(fields[16]).split('?')[0]
                url = 'http://%s%s' % (host, path)

                # (sorted-set key, member) pairs, in the order they are written:
                # per IP, per IP -> endpoint, per endpoint, per endpoint -> IP.
                updates = (
                    ('top_ip_%s' % minute, client_ip),
                    ('top_%s_domain_%s' % (client_ip, minute), url),
                    ('top_url_%s' % minute, url),
                    ('top_%s_ip_%s' % (url, minute), client_ip),
                )
                for key, member in updates:
                    RC.zincrby(key, member, 1)
                    RC.expire(key, 300)
            except Exception as exc:
                logging.error(exc)
    except Exception as exc:
        logging.error(exc)
    finally:
        consumer.close()
def test_store_offsets():
    """Basic store_offsets() check: the only accepted failure is _UNKNOWN_PARTITION."""
    settings = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    consumer = Consumer(settings)
    consumer.subscribe(["test"])

    target = [TopicPartition("test", 0, 42)]
    try:
        consumer.store_offsets(offsets=target)
    except KafkaException as exc:
        kafka_error = exc.args[0]
        assert kafka_error.code() == KafkaError._UNKNOWN_PARTITION

    consumer.unsubscribe()
    consumer.close()
def test_calling_store_offsets_after_close_throws_erro():
    """store_offsets() and offsets_for_times() on a closed consumer raise RuntimeError."""
    settings = {
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
    }
    consumer = Consumer(settings)
    consumer.subscribe(["test"])
    consumer.unsubscribe()
    consumer.close()

    # Arguments are built inside the lambdas so they are evaluated within the
    # pytest.raises context, alongside the call itself.
    calls_on_closed_consumer = (
        lambda: consumer.store_offsets(offsets=[TopicPartition("test", 0, 42)]),
        lambda: consumer.offsets_for_times([TopicPartition("test", 0)]),
    )
    for invoke in calls_on_closed_consumer:
        with pytest.raises(RuntimeError) as raised:
            invoke()
        assert str(raised.value) == 'Consumer closed'
Exemplo n.º 15
0
async def consume_events(topic, group, brokers, callback, schema=None, registry=None, delay=0.01, **kwargs):
    """
    Connect to the Kafka endpoint and start consuming
    messages from the given `topic`.
    The given callback is applied on each
    message.

    Args:
        topic: Topic to subscribe to; only one consumer per topic is allowed.
        group: Kafka consumer group id.
        brokers: Value for ``bootstrap.servers``.
        callback: Coroutine function awaited with each message payload.
        schema: Unused by this function.
        registry: When truthy, payloads are decoded with a registry
            serializer before being handed to ``callback``.
        delay: Seconds to sleep after a poll that returned nothing.
        **kwargs: Accepted for compatibility; unused.

    Raises:
        RuntimeError: If a consumer is already registered for ``topic``.
    """
    global consumer
    if topic in consumers:
        raise RuntimeError("A consumer already exists for topic: %s" % topic)

    serializer = None
    if registry:
        if not registry_serializer or not registry_client:
            _, serializer = create_registry_client(registry)
        else:
            # Previously `serializer` stayed unbound on this path and decoding
            # failed with UnboundLocalError.
            # NOTE(review): assumes the module-level `registry_serializer` is
            # the serializer to reuse -- confirm.
            serializer = registry_serializer

    # Work on a local reference: the module-level `consumer` can be rebound by
    # another concurrent consume_events() call while this coroutine is
    # suspended. The global is still assigned for backward compatibility.
    kafka_consumer = Consumer({'bootstrap.servers': brokers, 'group.id': group,
                               'default.topic.config': {'auto.offset.reset': 'largest'}})
    consumer = kafka_consumer
    kafka_consumer.subscribe([topic])
    consumers[topic] = kafka_consumer

    try:
        while True:
            # NOTE(review): poll() blocks for up to one second without
            # yielding to the event loop; unchanged from the original.
            message = kafka_consumer.poll(1)
            # Compare against None explicitly: truth-testing a message uses
            # its payload length, so empty payloads looked like timeouts.
            if message is None:
                await asyncio.sleep(delay)
                continue
            if message.error():
                # Error events are skipped without committing, as before.
                continue

            if registry:
                payload = serializer.decode_message(message.value())
            else:
                payload = message.value()

            await callback(payload)
            kafka_consumer.commit()
    except KafkaException:
        # Kafka-level failures end consumption quietly, as before.
        pass
    finally:
        consumers.pop(topic, None)
        # The old `else: consumer.close()` could never run because the loop
        # has no normal exit; close here so the consumer is always released.
        kafka_consumer.close()
def test_on_commit():
    """Check that on_commit fires only once per commit() call (issue #71)."""

    class CommitState(object):
        """Expected topic/partition plus a flag cleared by the first callback."""

        def __init__(self, topic, partition):
            self.once = True
            self.partition = partition
            self.topic = topic

    state = CommitState('test', 2)

    def on_commit(err, ps):
        # Closes over `state`; a second invocation trips the `once` assertion.
        print('on_commit: err %s, partitions %s' % (err, ps))
        assert state.once is True
        assert err == KafkaError._NO_OFFSET
        assert len(ps) == 1
        reported = ps[0]
        assert reported.topic == state.topic
        assert reported.partition == state.partition
        state.once = False

    settings = {
        'group.id': 'x',
        'enable.auto.commit': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
        'on_commit': on_commit,
    }
    c = Consumer(settings)
    c.assign([TopicPartition(state.topic, state.partition)])

    for _ in range(2):
        c.poll(0.1)

        if not state.once:
            continue

        # Try commit once
        try:
            c.commit(asynchronous=False)
        except KafkaException as e:
            print('commit failed with %s (expected)' % e)
            assert e.args[0].code() == KafkaError._NO_OFFSET

    c.close()
Exemplo n.º 17
0
def consumer(args, poll_timeout=3.0):
    """ Consumes packets from a Kafka topic.

    Depending on ``args.pretty_print`` the packets are either written to
    stdout as a libpcap stream (value 0) or summarised on every N-th packet
    (value N > 0).

    Args:
        args: Parsed options; reads ``kafka_offset``, ``kafka_configs``,
            ``kafka_topic``, ``pretty_print`` and ``max_packets``.
        poll_timeout: Seconds to wait in each Kafka poll.

    Raises:
        KafkaException: For any Kafka error other than end-of-partition.
    """

    # setup the signal handler
    signal.signal(signal.SIGINT, signal_handler)

    # where to start consuming messages from
    kafka_offset_options = {
        "begin": seek_to_begin,
        "end": seek_to_end,
        "stored": seek_to_stored
    }
    on_assign_cb = kafka_offset_options[args.kafka_offset]

    # connect to kafka
    logging.debug("Connecting to Kafka; %s", args.kafka_configs)
    kafka_consumer = Consumer(args.kafka_configs)
    kafka_consumer.subscribe([args.kafka_topic], on_assign=on_assign_cb)

    # if 'pretty-print' not set, write libpcap global header
    # NOTE(review): raw packet data is written through sys.stdout.write();
    # this presumably relies on Python 2 byte-string stdout -- confirm before
    # running on Python 3.
    if args.pretty_print == 0:
        sys.stdout.write(global_header(args))
        sys.stdout.flush()

    try:
        pkts_in = 0
        # a non-positive max_packets means "no limit"
        while not finished.is_set() and (args.max_packets <= 0 or pkts_in < args.max_packets):

            # consume a message from kafka
            msg = kafka_consumer.poll(timeout=poll_timeout)
            if msg is None:
                # no message received
                continue

            elif msg.error():

                if msg.error().code() == KafkaError._PARTITION_EOF:
                    if args.pretty_print > 0:
                        # Single-argument print(...) parses identically on
                        # Python 2 and 3 (was a Python 2 print statement).
                        print("Reached end of topar: topic=%s, partition=%d, offset=%s" % (
                            msg.topic(), msg.partition(), msg.offset()))
                else:
                    raise KafkaException(msg.error())

            else:
                pkts_in += 1
                logging.debug("Packet received: pkts_in=%d", pkts_in)

                if args.pretty_print == 0:

                    # write the packet header and packet
                    sys.stdout.write(packet_header(msg))
                    sys.stdout.write(msg.value())
                    sys.stdout.flush()

                elif pkts_in % args.pretty_print == 0:

                    # pretty print
                    print('Packet[%s]: date=%s topic=%s partition=%s offset=%s len=%s' % (
                        pkts_in, to_date(unpack_ts(msg.key())), args.kafka_topic,
                        msg.partition(), msg.offset(), len(msg.value())))

    finally:
        sys.stdout.close()
        kafka_consumer.close()
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use.

        Args:
            topic_name_pattern: Topic name (or pattern) passed to subscribe().
            message_handler: Callable invoked with every error-free message.
            is_avro: When exactly ``True`` an ``AvroConsumer`` (with schema
                registry URL) is created, otherwise a plain ``Consumer``.
            offset_earliest: When exactly ``True`` every assigned partition is
                rewound to its beginning.
            sleep_secs: Pause between polling rounds in ``consume()``.
            consume_timeout: Seconds each poll waits for a message.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "group.id": "0",
            "bootstrap.servers": "PLAINTEXT://localhost:9092",
            "auto.offset.reset": "earliest"
        }

        if is_avro is True:
            self.broker_properties["schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place.

        Rewinds each partition to the beginning when ``offset_earliest`` is
        set, then hands the (possibly modified) partitions back to the
        consumer.
        """
        logger.info("on_assign completed")
        for partition in partitions:
            if self.offset_earliest is True:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic.

        Drains messages until a poll comes back empty (or with an error),
        sleeps ``sleep_secs``, and repeats forever.
        """
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        # Use the configured timeout; it was stored but never applied.
        message = self.consumer.poll(self.consume_timeout)
        # poll() returns None on timeout. The previous `message is 0` check
        # never matched, so a timeout crashed on `None.error()`.
        if message is None:
            print("no message received")
            return 0

        error = message.error()
        if error is not None:
            print(f"message error {error}")
            return 0

        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        logger.debug("closing consumer")
        self.consumer.close()
Exemplo n.º 19
0
    'default.topic.config': {
        'auto.offset.reset': 'earliest'
    }
})

topics = ['orders']

orderConsumer.subscribe(topics)

# Poll the 'orders' topic until interrupted, echoing each message's metadata
# to stderr and its payload to stdout.
try:
    while True:
        msg = orderConsumer.poll(timeout=1.0)
        if msg is None:
            # Nothing arrived within the timeout.
            continue
        if msg.error():
            # Report the actual error; this branch used to print the
            # misleading 'No Messages', which hid real consumer failures.
            sys.stderr.write('%% Consumer error: %s\n' % msg.error())
            continue
        else:
            # Proper message
            sys.stderr.write(
                '%% %s [%d] at offset %d with key %s:\n' %
                (msg.topic(), msg.partition(), msg.offset(), str(msg.key())))
            print(msg.value())

except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')

finally:
    # Close down consumer to commit final offsets.
    orderConsumer.close()
Exemplo n.º 20
0
class KafkaConnector(object):
    """Simple wrapper class to configure a simple kafka consumer
    and producer pair, so that they can be used to perform simple
    filter() and map() operations over the received tweets"""

    def __init__(
        self,
        group_id=None,
        consumer_topic='consumer_limbo',
        producer_topic='consumer_limbo',
        logging_topic='minteressa_stats',
        bootstrap_servers='kafka:9092'
    ):
        """Store the connection settings; no connection is made until connect().

        Args:
            group_id: Kafka group id used for the consumer and producer config.
            consumer_topic: Topic that listen() reads from.
            producer_topic: Default topic for send().
            logging_topic: Default topic for log().
            bootstrap_servers: Value for ``bootstrap.servers``.
        """
        self.group_id = group_id
        self.bootstrap_servers = bootstrap_servers
        self.consumer_topic = consumer_topic
        self.producer_topic = producer_topic
        self.logging_topic = logging_topic

        # Created by connect().
        self.consumer = None
        self.producer = None

    def listen(self):
        """Yield every proper message from the consumer topic, forever.

        End-of-partition events are reported on stderr; a one-line summary of
        each proper message is written to stdout before it is yielded.

        Raises:
            RuntimeError: If connect() has not been called yet.
            KafkaException: For any Kafka error other than end-of-partition.
        """
        if self.consumer is None:
            raise RuntimeError('connect() must be called before listen()')

        while True:
            msg = self.consumer.poll()
            if msg is None:
                continue
            if msg.error():
                # Error or event
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    sys.stderr.write(
                        '%% %s [%d] reached end at offset %d\n' % (
                            msg.topic(),
                            msg.partition(),
                            msg.offset()
                        )
                    )
                else:
                    # Error
                    raise KafkaException(msg.error())
            else:
                # Proper message
                sys.stdout.write(
                    '%s [partition-%d] at offset %d with key %s:\n' %
                    (
                        msg.topic(),
                        msg.partition(),
                        msg.offset(),
                        str(msg.key())
                    )
                )
                yield msg

    def connect(self):
        """Create the consumer (subscribed to ``consumer_topic``) and the producer."""
        self.consumer = Consumer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.group_id,
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            }
        })
        print("subscribing to %s" % self.consumer_topic)
        self.consumer.subscribe([
            self.consumer_topic
        ])
        print("Subscribed to topic %s " % self.consumer_topic)

        self.producer = Producer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.group_id
        })

    def send(self, message, producer_topic=None):
        """Queue ``message`` on ``producer_topic`` (default: the configured one).

        The message is only enqueued; nothing is flushed here.
        """
        if producer_topic is None:
            producer_topic = self.producer_topic

        self.producer.produce(
            producer_topic,
            message
        )

    def log(self, message, logging_topic=None):
        """Send ``message`` to the logging topic and flush the producer."""
        if logging_topic is None:
            logging_topic = self.logging_topic

        self.producer.produce(logging_topic, message)
        self.producer.flush()

    def close(self):
        """Close the consumer and flush the producer; safe before connect()."""
        if self.consumer is not None:
            self.consumer.close()
        if self.producer is not None:
            # confluent_kafka.Producer has no close(); flushing delivers any
            # messages still queued (send() does not flush on its own).
            self.producer.flush()
Exemplo n.º 21
0
from confluent_kafka import Consumer, KafkaError

# Connection settings; replace every "TODO" before running.
app_settings = {
    "bootstrap.servers": "TODO",
    "group.id": "TODO",
    "topic": "TODO",
}

c = Consumer({
    "bootstrap.servers": app_settings["bootstrap.servers"],
    "group.id": app_settings["group.id"],
    "auto.offset.reset":
    "latest",  # smallest, earliest, beginning, largest, latest, end, error
})

c.subscribe([app_settings["topic"]])

# Poll until interrupted. close() used to sit after the infinite loop where
# it could never run; the finally clause now guarantees it is called.
try:
    while True:
        msg = c.poll(0.1)

        if msg is None:
            print("No Data")
            continue
        if msg.error():
            print(f"Consumer error: {msg.error()}")
            continue

        print(f"Received message: {msg.value().decode('utf-8')}")
except KeyboardInterrupt:
    # Ctrl-C is the expected way to stop this script.
    pass
finally:
    c.close()
Exemplo n.º 22
0
                        raise KafkaException(record.error())
                else:
                    recrods_pulled = True
                    # ** 在這裡進行商業邏輯與訊息處理 **
                    # 取出相關的metadata
                    topic = record.topic()
                    partition = record.partition()
                    offset = record.offset()
                    timestamp = record.timestamp()
                    # 取出msgKey與msgValue
                    msgKey = try_decode_utf8(record.key())
                    msgValue = try_decode_utf8(record.value())

                    # 秀出metadata與msgKey & msgValue訊息
                    print('%s-%d-%d : (%s , %s)' %
                          (topic, partition, offset, msgKey, msgValue))

            # 同步地執行commit (Sync commit)
            if (recrods_pulled):
                offsets = consumer.commit(asynchronous=False)
                print_sync_commit_result(offsets)

    except KeyboardInterrupt as e:
        sys.stderr.write('Aborted by user\n')
    except Exception as e:
        sys.stderr.write(str(e))

    finally:
        # 步驟6.關掉Consumer實例的連線
        consumer.close()
Exemplo n.º 23
0
class TransformerConsumer:
    """Poll Kafka and hand every polled message to a user-supplied callback."""

    def __init__(
        self,
        broker=None,
        topics=None,
        group_id=None,
        offset_start=-1,
        process_event=None,
        manage_error=None,
    ):
        """
        Store the settings and build the underlying consumer object
        :param broker: value used for the ``bootstrap.servers`` setting
        :param topics: topic name, or list of topic names, to subscribe to
        :param group_id: value used for the ``group.id`` setting
        :param offset_start: integer; stored on the instance but not read by
            any method of this class
        :param process_event: callable invoked with each polled message that
            carries no error
        :param manage_error: callable invoked with each polled message that
            carries an error
        """
        # A bare topic name is wrapped so subscribe() always receives a list.
        self.topics = [topics] if isinstance(topics, str) else topics
        self.broker = broker
        self.group_id = group_id
        self.offset_start = offset_start
        self.partition = 0  # One partition for now
        self.process_event = process_event
        self.manage_error = manage_error

        # The configuration reads only broker and group_id, both set above.
        self.consumer = Consumer(self._generate_config())

    def _generate_config(self):
        """
        Build the settings dictionary passed to the consumer constructor
        :return: dict of consumer settings
        """
        return {
            "bootstrap.servers": self.broker,
            "group.id": self.group_id,
            "session.timeout.ms": 6000,
            "auto.offset.reset": "earliest",
        }

    def consume_event(self):
        """
        Poll forever, dispatching each message to the matching callback
        :return: never returns normally; the loop ends only when an exception
            propagates
        """
        while True:
            event = self.consumer.poll(timeout=1.0)
            if event is None:
                # Nothing arrived within the timeout; poll again.
                continue

            # Messages carrying an error go to manage_error, the rest to
            # process_event.
            callback = self.manage_error if event.error() else self.process_event
            callback(event)

    def run_consumer(self):
        """
        Subscribe to the configured topics and consume until an exception ends
        the loop; the consumer is closed on the way out
        :return:
        """
        logger.info(self.topics)
        self.consumer.subscribe(self.topics)
        try:
            self.consume_event()
        except (KafkaException, KafkaError):
            raise
        finally:
            # Close down consumer to commit final offsets.
            self.consumer.close()
Exemplo n.º 24
0
class BrokerConsumer(object):
    """Route messages from subscribed Kafka topics to registered handlers.

    Handlers are registered per topic with the ``handle`` decorator. ``start``
    subscribes to every topic that has a handler and polls until one of the
    registered signals arrives or an error is raised.
    """

    def __init__(self, *args, **kwargs):
        """Build the consumer from the CLOUDKARAFKA_* environment variables.

        Raises:
            KeyError: if a required environment variable is not set.
        """
        # Consumer configuration
        # See
        # https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        config = {
            'bootstrap.servers': os.environ['CLOUDKARAFKA_BROKERS'],
            'group.id': "%s-consumer" % os.environ['CLOUDKARAFKA_USERNAME'],
            'session.timeout.ms': 6000,
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            },
            'security.protocol': 'SASL_SSL',
            'sasl.mechanisms': 'SCRAM-SHA-256',
            'sasl.username': os.environ['CLOUDKARAFKA_USERNAME'],
            'sasl.password': os.environ['CLOUDKARAFKA_PASSWORD']
        }

        self.topic_prefix = os.environ['CLOUDKARAFKA_TOPIC_PREFIX']
        self.consumer = Consumer(**config)
        self.handlers = {}
        self.logger = get_logger()
        self.loop = asyncio.get_event_loop()
        # Cleared by the signal handler so the poll loop can finish its current
        # iteration and fall through to consumer.close().
        self._running = True
        super().__init__(*args, **kwargs)

    def _add_handler(self, topic, handler):
        """Append ``handler`` to the list of handlers registered for ``topic``."""
        self.handlers.setdefault(topic, []).append(handler)

    def handle(self, topic):
        """Return a decorator that registers the decorated function for ``topic``.

        The decorated function is returned unchanged.
        """
        def decorator(f):
            self._add_handler(topic, f)
            return f

        return decorator

    def _run_handlers(self, msg):
        """Call every handler registered for the message's topic, then commit.

        Any exception raised by a handler (or by the commit) is logged at
        critical level and the process exits through ``sys.exit``.
        """
        try:
            for handler in self.handlers[msg.topic()]:
                handler(msg)
            self.consumer.commit()
        except Exception as e:
            self.logger.critical(str(e), exc_info=1)
            sys.exit("Exited due to exception")

    def _signal_term_handler(self, signal, frame):
        """Signal handler: ask the poll loop to stop.

        The consumer is not closed here. The loop in ``_consume`` sees the
        cleared flag within one poll timeout and closes it in its ``finally``.
        """
        info("closing consumer")
        self._running = False

    def start(self):
        """Subscribe to all topics that have handlers and poll until stopped."""
        topics = list(self.handlers)
        self.consumer.subscribe(topics=topics)
        info("starting consumer...registered signterm")

        for signum in (signal.SIGTERM, signal.SIGINT,
                       signal.SIGQUIT, signal.SIGHUP):
            signal.signal(signum, self._signal_term_handler)

        self.loop.run_until_complete(self._consume())

    async def _consume(self):
        """Poll until the stop flag is cleared, dispatching each message.

        Raises:
            KafkaException: for any error event other than partition EOF.
        """
        try:
            while self._running:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue

                error = msg.error()
                if not error:
                    self._run_handlers(msg)
                elif error.code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    info("Reached end at offset {0}\n".format(msg.offset()))
                else:
                    raise KafkaException(error)
        finally:
            # Close down consumer to commit final offsets.
            self.consumer.close()
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: the single entry of the subscription list.
            message_handler: callable invoked with every message that carries
                no error.
            is_avro: when True an AvroConsumer is created (and the schema
                registry URL is added to the properties); otherwise a Consumer.
            offset_earliest: when true, ``auto.offset.reset`` is set to
                "earliest" and assigned partitions start at OFFSET_BEGINNING.
            sleep_secs: how long ``consume`` sleeps once a poll yields nothing.
            consume_timeout: timeout passed to every ``poll`` call.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest
        # Set to True by on_assign once partitions have been assigned.
        self.subscribed = False

        # Configure the broker properties below. Make sure to reference the project README
        # and use the Host URL for Kafka and Schema Registry!
        self.broker_properties = {
            'bootstrap.servers':
            "PLAINTEXT://kafka0:19092,PLAINTEXT://kafka1:19092,PLAINTEXT://kafka2:19092",
            "group.id": "cta-1",
            "session.timeout.ms": 20000,
            "heartbeat.interval.ms": 1500,
            "max.poll.interval.ms": 60000,
        }
        if self.offset_earliest:
            self.broker_properties['auto.offset.reset'] = "earliest"

        # Create the Consumer, using the appropriate type.
        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://schema-registry:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        # on_assign is passed as a callback so the starting offsets can be
        # adjusted before the partitions are assigned.
        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place

        Marks the instance as subscribed, rewinds every partition to
        OFFSET_BEGINNING when ``offset_earliest`` is set, then assigns the
        partitions on the given consumer.
        """
        self.subscribed = True
        logger.info("in on_assing, offset is " + str(self.offset_earliest))
        if self.offset_earliest:
            logger.info("set offset to beginning...")
            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the event
            # loop for sleep_secs before polling again.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise

        Raises:
            Exception: when the polled message carries an error.
        """
        logger.debug("consuming message")
        message = self.consumer.poll(timeout=self.consume_timeout)
        if message is None:
            logger.debug("no message received!")
            return 0
        if message.error() is not None:
            raise Exception("error during consumer poll" +
                            message.error().str())
        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        logger.info("closing consumer...")
        self.consumer.close()
Exemplo n.º 26
0
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: the single entry of the subscription list.
            message_handler: callable invoked with every message that carries
                no error.
            is_avro: when True an AvroConsumer is created; otherwise a Consumer.
            offset_earliest: when true, ``auto.offset.reset`` is set to
                "earliest" and assigned partitions start at OFFSET_BEGINNING.
            sleep_secs: how long ``consume`` sleeps once a poll yields nothing.
            consume_timeout: timeout passed to every ``poll`` call.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        # init broker properties
        self.broker_properties = {
            "bootstrap.servers": f"PLAINTEXT://{BOOTSTRAP_SERVERS_URL}",
            "group.id": "cta.consumer",
        }

        # in case we're starting from earliest
        if self.offset_earliest:
            self.broker_properties["auto.offset.reset"] = "earliest"

        # create consumer, avro and regular
        if is_avro is True:
            self.consumer = AvroConsumer(
                self.broker_properties,
                schema_registry=CachedSchemaRegistryClient(
                    SCHEMA_REGISTRY_URL))
        else:
            self.consumer = Consumer(self.broker_properties)

        # subscribe to topic (s pattern)
        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place

        When ``offset_earliest`` is set, every partition is rewound to
        OFFSET_BEGINNING before being assigned. The starting offset is set on
        the partition objects rather than via seek(), because the partitions
        are not yet assigned when this callback runs.
        """
        if self.offset_earliest:
            # Imported locally so this block does not depend on the file-level
            # import list naming OFFSET_BEGINNING.
            from confluent_kafka import OFFSET_BEGINNING

            for partition in partitions:
                partition.offset = OFFSET_BEGINNING
        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the event
            # loop for sleep_secs before polling again.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        message = self.consumer.poll(self.consume_timeout)

        # no message received
        if message is None:
            print("no message received by consumer")
            return 0

        # An error event is not a received message: returning 0 lets consume()
        # reach its sleep instead of re-polling in a tight loop.
        if message.error() is not None:
            print(f"error from consumer {message.error()}")
            return 0

        # invoke message handler
        print(
            f"received message, calling handler: {self.message_handler.__name__}"
        )
        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
Exemplo n.º 27
0
def main():
    """Consume tick messages from Kafka and queue each one on the tick worker.

    The broker and topic come from the KAFKA_BROKER and KAFKA_TOPIC environment
    variables (defaults 'kafka:9092' and 'ticks'). Auto-commit is disabled: a
    message's offset is committed only after the message has been queued with
    ``task.delay``. The loop runs while the module-level ``keep_running`` flag
    is true and stops early on any consumer error other than partition EOF.

    The function ends the process with ``sys.exit(0)`` after a clean stop; if
    processing raises, the consumer is still closed before the exception
    propagates.
    """
    # Only read in this function; presumably cleared by stop_running when
    # SIGINT arrives -- confirm against its definition.
    global keep_running

    signal.signal(signal.SIGINT, stop_running)

    app.conf.task_routes = {'tick_worker.process_ticks': {'queue': 'tick_processors'}}
    task = app.signature('tick_worker.process_ticks')

    broker = os.environ.get('KAFKA_BROKER', 'kafka:9092')
    topic = os.environ.get('KAFKA_TOPIC', 'ticks')

    consumer = Consumer({'bootstrap.servers': broker,
                         'group.id': 'tick_consumer',
                         'enable.auto.commit': False,
                         'default.topic.config': {
                             'auto.offset.reset': 'smallest'}
                         })

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    try:
        consumer.subscribe([topic])

        while keep_running:
            print("Waiting...")
            sys.stdout.flush()
            msg = consumer.poll(5)
            print("Checking the hook...")
            sys.stdout.flush()

            if msg is None:
                print("We got nuttin")
                sys.stdout.flush()
                continue

            if msg.error():
                print("Oh, we got an error")
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    print("Found partition EOF")
                    print(msg.value())
                    sys.stdout.flush()
                    continue
                print("And it's a bad error")
                print(msg.error())
                sys.stdout.flush()
                break

            print("We got a message!")
            sys.stdout.flush()
            message = json.loads(msg.value())
            message_id = message['message_id']

            task.delay(message)
            print('Successfully handled message {}'.format(message_id))
            # Commit only after the task was queued, so a failure above leaves
            # the offset uncommitted.
            consumer.commit(msg)
            print("")
            sys.stdout.flush()
            sleep(1)
    finally:
        # Runs on clean exit, on break, and when processing raises.
        consumer.close()

    print("Exiting cleanly")
    sys.exit(0)
Exemplo n.º 28
0
class QuerySubscriptionConsumer(object):
    """
    Kafka consumer for query subscription update messages. Every message names
    a subscription id and carries the latest values for the subscribed query;
    those values are handed to the callback registered for the subscription's
    type.
    """

    def __init__(self,
                 group_id,
                 topic=None,
                 commit_batch_size=100,
                 initial_offset_reset="earliest"):
        self.group_id = group_id
        # Fall back to the configured topic when none (or a falsy one) is given.
        self.topic = topic or settings.KAFKA_SNUBA_QUERY_SUBSCRIPTIONS
        cluster = settings.KAFKA_CLUSTERS[
            settings.KAFKA_TOPICS[self.topic]["cluster"]]
        self.bootstrap_servers = cluster["bootstrap.servers"]
        self.commit_batch_size = commit_batch_size
        self.initial_offset_reset = initial_offset_reset
        # partition -> next offset to commit for that partition
        self.offsets = {}
        self.consumer = None

    @staticmethod
    def _message_context(message):
        """Return the offset/partition/value dict attached to log records."""
        return {
            "offset": message.offset(),
            "partition": message.partition(),
            "value": message.value(),
        }

    def run(self):
        """
        Create the consumer, subscribe and process messages until interrupted.
        Offsets are committed every ``commit_batch_size`` handled messages, on
        partition revocation, and once more during shutdown after a
        KeyboardInterrupt. A message carrying an error raises KafkaException.
        """
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()

        def commit_before_revoke(consumer, partitions):
            self.commit_offsets()

        reset_policy = self.initial_offset_reset
        self.consumer = Consumer({
            "bootstrap.servers": self.bootstrap_servers,
            "group.id": self.group_id,
            "session.timeout.ms": 6000,
            "auto.offset.reset": reset_policy,
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "false",
            "default.topic.config": {
                "auto.offset.reset": reset_policy
            },
        })
        self.consumer.subscribe([self.topic], on_revoke=commit_before_revoke)

        handled = 0
        try:
            while True:
                message = self.consumer.poll(0.1)
                if message is None:
                    continue

                error = message.error()
                if error is not None:
                    raise KafkaException(error)

                handled += 1
                self.handle_message(message)

                # Remember the position after this message so the next commit
                # (batch, revoke or shutdown) covers it.
                self.offsets[message.partition()] = message.offset() + 1

                if handled % self.commit_batch_size == 0:
                    logger.debug("Committing offsets")
                    self.commit_offsets()
        except KeyboardInterrupt:
            pass

        self.shutdown()

    def commit_offsets(self):
        """Commit all tracked offsets and forget them; a no-op when nothing is
        tracked or no consumer exists."""
        if not (self.offsets and self.consumer):
            return
        pending = [
            TopicPartition(self.topic, partition, offset)
            for partition, offset in self.offsets.items()
        ]
        self.consumer.commit(offsets=pending)
        self.offsets.clear()

    def shutdown(self):
        """Commit any tracked offsets, then close the consumer."""
        logger.debug("Committing offsets and closing consumer")
        self.commit_offsets()
        self.consumer.close()

    def handle_message(self, message):
        """
        Parse the Kafka message value and, when it is valid, pass the payload
        to the callback registered for the subscription's type. An unparsable
        message, a subscription that does not exist, or a subscription type
        with no registered callback is only logged (with a metric for the last
        two) and the message is skipped.
        :param message: the polled Kafka message
        :return: None
        """
        with sentry_sdk.push_scope() as scope:
            try:
                contents = self.parse_message_value(message.value())
            except InvalidMessageError:
                # Badly formatted message: log it and move on.
                logger.exception(
                    "Subscription update could not be parsed",
                    extra=self._message_context(message),
                )
                return
            subscription_id = contents["subscription_id"]
            scope.set_tag("query_subscription_id", subscription_id)

            try:
                subscription = QuerySubscription.objects.get_from_cache(
                    subscription_id=subscription_id)
            except QuerySubscription.DoesNotExist:
                metrics.incr(
                    "snuba_query_subscriber.subscription_doesnt_exist")
                logger.error(
                    "Received subscription update, but subscription does not exist",
                    extra=self._message_context(message),
                )
                return

            if subscription.type not in subscriber_registry:
                metrics.incr(
                    "snuba_query_subscriber.subscription_type_not_registered")
                logger.error(
                    "Received subscription update, but no subscription handler registered",
                    extra=self._message_context(message),
                )
                return

            details = {
                "timestamp": contents["timestamp"],
                "query_subscription_id": contents["subscription_id"],
                "contents": contents,
            }
            details.update(self._message_context(message))
            logger.info("query-subscription-consumer.handle_message",
                        extra=details)

            callback = subscriber_registry[subscription.type]
            with sentry_sdk.start_span(
                    op="process_message",
                    transaction="query_subscription_consumer_process_message"
            ) as span:
                with metrics.timer(
                        "snuba_query_subscriber.callback.duration",
                        instance=subscription.type):
                    span.set_data("payload", contents)
                    callback(contents, subscription)

    def parse_message_value(self, value):
        """
        Decode the value received from Kafka and check it against the wrapper
        schema and the payload schema for its declared version.
        :param value: A json formatted string
        :return: A dict with the parsed payload; its "timestamp" entry is
            replaced by a parsed date with tzinfo set to UTC
        """
        wrapper = loads(value)
        try:
            jsonschema.validate(wrapper, SUBSCRIPTION_WRAPPER_SCHEMA)
        except jsonschema.ValidationError:
            metrics.incr("snuba_query_subscriber.message_wrapper_invalid")
            raise InvalidSchemaError("Message wrapper does not match schema")

        version = wrapper["version"]
        if version not in SUBSCRIPTION_PAYLOAD_VERSIONS:
            metrics.incr(
                "snuba_query_subscriber.message_wrapper_invalid_version")
            raise InvalidMessageError(
                "Version specified in wrapper has no schema")

        payload = wrapper["payload"]
        try:
            jsonschema.validate(payload, SUBSCRIPTION_PAYLOAD_VERSIONS[version])
        except jsonschema.ValidationError:
            metrics.incr("snuba_query_subscriber.message_payload_invalid")
            raise InvalidSchemaError("Message payload does not match schema")

        parsed_timestamp = parse_date(payload["timestamp"])
        payload["timestamp"] = parsed_timestamp.replace(tzinfo=pytz.utc)
        return payload
Exemplo n.º 29
0
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    # Constructing without a configuration must fail; pytest.raises also
    # fails the test when no exception is raised at all.
    with pytest.raises(TypeError) as ex:
        Consumer()
    assert str(ex.value) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"],
                 on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    # Out-of-range batch sizes are rejected on both ends.
    for bad_count in (-100, 1000001):
        with pytest.raises(ValueError) as ex:
            kc.consume(bad_count)
        assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = [TopicPartition("test", part) for part in range(0, 100, 3)]
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0],
                                          timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        # e.args[0] is the underlying KafkaError; it is indexed, not called.
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    # `async` is a reserved word since Python 3.7; the keyword argument is
    # spelled `asynchronous`.
    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert all(p.offset == OFFSET_INVALID for p in partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
Exemplo n.º 30
0
def test_any_method_after_close_throws_exception():
    """ Every consumer method called after close() should raise a
    RuntimeError whose message is 'Consumer closed'
    """
    closed = Consumer({
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100
    })

    closed.subscribe(["test"])
    closed.unsubscribe()
    closed.close()

    # One zero-argument callable per API call, tried in this order.
    calls_after_close = (
        lambda: closed.subscribe(['test']),
        lambda: closed.unsubscribe(),
        lambda: closed.poll(),
        lambda: closed.consume(),
        lambda: closed.assign([TopicPartition('test', 0)]),
        lambda: closed.unassign(),
        lambda: closed.assignment(),
        lambda: closed.commit(),
        lambda: closed.committed([TopicPartition("test", 0)]),
        lambda: closed.position([TopicPartition("test", 0)]),
        lambda: closed.seek([TopicPartition("test", 0, 0)]),
        lambda: closed.get_watermark_offsets(TopicPartition("test", 0)),
    )

    for call in calls_after_close:
        with pytest.raises(RuntimeError) as ex:
            call()
        assert 'Consumer closed' == str(ex.value)
Exemplo n.º 31
0
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: the single entry of the subscription list; also
                used as the consumer group id.
            message_handler: callable invoked with every message that carries
                no error.
            is_avro: when True an AvroConsumer is created (and the schema
                registry URL is added to the properties); otherwise a Consumer.
            offset_earliest: when true, assigned partitions start at
                OFFSET_BEGINNING.
            sleep_secs: how long ``consume`` sleeps once a poll yields nothing.
            consume_timeout: timeout passed to every ``poll`` call.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "bootstrap.servers": "PLAINTEXT://localhost:9092",
            'group.id': topic_name_pattern,
            "default.topic.config": {
                "auto.offset.reset": "earliest"
            }
        }

        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([topic_name_pattern], on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place

        When ``offset_earliest`` is set, every partition is rewound to
        OFFSET_BEGINNING before being assigned. The starting offset is set on
        the partition objects rather than via seek(), because the partitions
        are not yet assigned when this callback runs.
        """
        if self.offset_earliest:
            # Imported locally so this block does not depend on the file-level
            # import list naming OFFSET_BEGINNING.
            from confluent_kafka import OFFSET_BEGINNING

            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the event
            # loop for sleep_secs before polling again.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise

        A poll timeout (None), an error event and a SerializerError all count
        as "no message", so ``consume`` gets to sleep instead of looping
        forever, and the handler only ever sees real messages.
        """
        try:
            message = self.consumer.poll(self.consume_timeout)
            if message is None:
                return 0
            if message.error() is not None:
                logger.error("Error while consuming data: %s", message.error())
                return 0
            self.message_handler(message)
            return 1
        except SerializerError as err:
            logger.error(f"Error while consuming data: {err.message}")
            return 0

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
Exemplo n.º 32
0
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    BROKER_URL = 'PLAINTEXT://localhost:9092'
    SCHEMA_REGISTRY_URL = 'http://localhost:8081'

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: the single entry of the subscription list.
            message_handler: callable invoked with every message that carries
                no error.
            is_avro: when True an AvroConsumer is created (and the schema
                registry URL is added to the properties); otherwise a Consumer.
            offset_earliest: selects "earliest" instead of "latest" for
                ``auto.offset.reset`` and rewinds assigned partitions to
                OFFSET_BEGINNING.
            sleep_secs: how long ``consume`` sleeps once a poll yields nothing.
            consume_timeout: timeout passed to every ``poll`` call.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        offset_reset = "earliest" if offset_earliest else "latest"

        self.broker_properties = {
            "bootstrap.servers": self.BROKER_URL,
            "group.id": "org.cta.consumer",
            "auto.offset.reset": offset_reset
        }

        if is_avro is True:
            self.broker_properties["schema.registry.url"] = (
                self.SCHEMA_REGISTRY_URL)
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([topic_name_pattern], on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place

        Partitions are rewound to OFFSET_BEGINNING only when
        ``offset_earliest`` is set; the log line and the assignment happen in
        either case.
        """
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the event
            # loop for sleep_secs before polling again.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message.

        Returns 1 if a message was received, 0 otherwise
        """
        try:
            msg = self.consumer.poll(self.consume_timeout)
        except SerializerError as e:
            # No message object exists when poll() itself raises, so only the
            # exception is logged.
            logger.error("Message deserialization failed: {}".format(e))
            return 0

        if msg is None:
            return 0

        if msg.error():
            logger.error("Consumer error: {}".format(msg.error()))
            return 0

        self.message_handler(msg)
        return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
Exemplo n.º 33
0
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use.

        Args:
            topic_name_pattern: Topic name (or pattern) to subscribe to; also
                used as the consumer ``group.id``.
            message_handler: Callable invoked with every successfully polled
                message.
            is_avro: When True an ``AvroConsumer`` is created and the schema
                registry URL is added to the broker properties; otherwise a
                plain ``Consumer`` is used.
            offset_earliest: When True, assigned partitions are rewound to
                offset 0 in ``on_assign``.
            sleep_secs: Pause between polling cycles in ``consume``.
            consume_timeout: Timeout in seconds passed to ``poll`` in
                ``_consume``.
        """
        print("Init for topic", topic_name_pattern)
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "bootstrap.servers": "localhost:9092",
            "group.id": self.topic_name_pattern,
            'auto.offset.reset': 'earliest'
        }

        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)
        print(f"Init complete for:{self.topic_name_pattern}")

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place.

        Rewinds every partition to offset 0 when ``self.offset_earliest`` is
        set, then assigns the partitions on ``consumer``.
        """
        print("on_assign is Running")
        for partition in partitions:
            if self.offset_earliest:
                # Offset 0 is used as "the beginning" here.
                # NOTE(review): a symbolic OFFSET_BEGINNING constant would be
                # safer if old segments may have been deleted - confirm.
                partition.offset = 0
            print("partition", partition)

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic.

        Drains messages with ``_consume`` until none is returned, then sleeps
        ``self.sleep_secs`` seconds and repeats forever.
        """
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message.

        Returns:
            1 if a message was received and handed to ``message_handler``,
            0 if the poll timed out, deserialization failed, or the message
            carried an error.
        """
        try:
            # Honour the configured timeout instead of a hard-coded 1.0s so a
            # quiet topic does not stall the caller longer than requested.
            message = self.consumer.poll(self.consume_timeout)
        except SerializerError as e:
            # No message object exists when poll() raises; report the
            # exception only (the old code referenced an undefined `msg`).
            print("Message deserialization failed: {}".format(e))
            return 0

        print(f"message in _consume (): {message}")
        if message is None:
            return 0

        if message.error() is not None:
            # The placeholder is required: without it logging cannot merge
            # the argument into the message.
            logger.error("Error caused due to : %s", message.error())
            return 0

        print("message_handler called()")
        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers.

        Commits, unassigns and unsubscribes, and always closes the consumer
        afterwards even when one of those steps raises. Any exception from
        the earlier steps still propagates to the caller.
        """
        try:
            self.consumer.commit()
            self.consumer.unassign()
            self.consumer.unsubscribe()
        finally:
            self.consumer.close()
Exemplo n.º 34
0
            # commit the transaction every TRANSACTION_TIME
            cur_time = time.time()
            if cur_time >= last_transaction_time + TRANSACTION_TIME:
                last_transaction_time = cur_time
                commit_transaction(verbose=VERBOSE,
                                   commit_time=last_transaction_time)

    except KeyboardInterrupt:
        print("Gracefully stopping")
    finally:
        stop_time = time.time()

        # commit processed message offsets to the transaction
        kafka_producer.send_offsets_to_transaction(
            kafka_consumer.position(kafka_consumer.assignment()),
            kafka_consumer.consumer_group_metadata())
        # commit transaction
        kafka_producer.commit_transaction()
        # Leave group and commit offsets
        kafka_consumer.close()

        print(
            f"\nRecords in |{TARGET_SYSTEM}| = {stream_buffer.get_join_counter()}, "
            f"|left buffer| = {stream_buffer.get_left_counter()}, "
            f"|right buffer| = {stream_buffer.get_right_counter()}.")
    if start_time != stop_time:
        print(
            f"Joined time-series {stop_time - start_time:.6f} s long, "
            f"that are {stream_buffer.get_join_counter() / (stop_time - start_time):.2f} joins per second."
        )
Exemplo n.º 35
0
class Manager:
    """Consumes messages from the sensor-manager topic on a background thread
    and hands each decoded payload to a ``MessageExec`` instance."""

    def _kafkaAssign(self):
        """Subscribe to the manager topic and block until partitions are assigned.

        The rebalance callback moves every assigned partition to
        ``OFFSET_END`` before assigning it. ``poll`` is called repeatedly
        (1 second each) until the callback has run.
        """
        consumer_assigned = False

        def flush(consumer, partition):
            nonlocal consumer_assigned
            for p in partition:
                p.offset = OFFSET_END
            consumer.assign(partition)
            consumer_assigned = True

        self.kafka_consumer.subscribe([self.manager_topic], on_assign=flush)
        while not consumer_assigned:
            self.kafka_consumer.poll(1)

    def _configuire_topic(self, configs):
        """Request the configured partition count for the manager topic.

        Args:
            configs: Parsed configuration; reads ``kafka_host``,
                ``sensor_manager_topic`` and
                ``sensor_manager_topic_partitions``.
        """
        admin_client = AdminClient(
            {'bootstrap.servers': configs['kafka_host']})
        sensor_topic_partitions = NewPartitions(
            configs['sensor_manager_topic'],
            int(configs['sensor_manager_topic_partitions']))
        # NOTE(review): the return value is discarded, so the outcome of the
        # request is never checked here - confirm this is intentional.
        admin_client.create_partitions([sensor_topic_partitions])

    def _get_data(self):
        """Poll once and return the decoded payload.

        Returns:
            The message value decoded to ``str``, or ``None`` when the poll
            timed out or the message carried an error.
        """
        message = self.kafka_consumer.poll(self._response_timeout)
        if message is not None and not message.error():
            return message.value().decode()
        return None

    def _manager_consumer_thread(self):
        """Thread body: forward every received payload to the message executor
        until ``self._status`` is cleared."""
        while self._status:
            message = self._get_data()
            if message is not None:
                self.message_executor.exec(message)

    def __init__(self, config="Configs/platform_configs.json"):
        """Load the configuration, configure the topic and subscribe.

        Args:
            config: Path to the JSON platform configuration file.
        """
        # Initialised up front so stop() is safe even if start() never ran.
        self._status = False
        self.manager_thread = None

        self.message_executor = MessageExec(config)
        # Only the JSON parse needs the file handle; release it before the
        # (potentially slow) Kafka setup.
        with open(config, 'r') as fp:
            configs = json.load(fp)

        self.manager_topic = configs["sensor_manager_topic"]
        self._configuire_topic(configs)
        self.kafka_consumer = Consumer({
            "bootstrap.servers":
            configs['kafka_host'],
            "group.id":
            "sensor_manager",
            "auto.offset.reset":
            'latest'
        })
        self._kafkaAssign()
        self._response_timeout = configs["sensor_manager_response_timeout"]
        print("Subscribed to sensor_manager")

    def start(self):
        """Start the background polling thread."""
        self._status = True
        self.manager_thread = threading.Thread(
            target=self._manager_consumer_thread)
        self.manager_thread.start()
        print("Manager polling thread started")

    def stop(self):
        """Stop the polling thread (if one was started) and close the consumer."""
        self._status = False
        if self.manager_thread is not None:
            self.manager_thread.join()
        self.kafka_consumer.close()
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use.

        Args:
            topic_name_pattern: Topic name (or pattern) to subscribe to. Its
                alphabetic characters are joined to form the ``group.id``.
            message_handler: Callable invoked with every successfully polled
                message.
            is_avro: When True an ``AvroConsumer`` is created and the schema
                registry URL is added to the broker properties; otherwise a
                plain ``Consumer`` is used.
            offset_earliest: When True, assigned partitions are rewound to
                ``OFFSET_BEGINNING`` in ``on_assign``.
            sleep_secs: Pause between polling cycles in ``consume``.
            consume_timeout: Timeout in seconds passed to ``poll``.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        # Only the letters of the topic pattern are kept for the group id.
        self.group_id = "".join(re.findall("[a-zA-Z]+", topic_name_pattern))

        print(self.group_id)
        self.broker_properties = {
            "group.id": self.group_id,
            "bootstrap.servers": "PLAINTEXT://localhost:9092",
            "auto.offset.reset": "earliest",
        }

        logger.info('%s', json.dumps(self.broker_properties))

        if is_avro is True:
            logger.info("Creating AvroConsumer with group.id %s", self.broker_properties['group.id'])
            self.broker_properties["schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
            logger.info("Successfully AvroConsumer with group.id %s", self.broker_properties['group.id'])
        else:
            logger.info("Creating Consumer with group.id %s", self.broker_properties['group.id'])
            self.consumer = Consumer(self.broker_properties)
            logger.info("Created Consumer with group.id %s", self.broker_properties['group.id'])

        self.consumer.subscribe(
            [self.topic_name_pattern],
            on_assign=self.on_assign
        )

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place.

        When ``self.offset_earliest`` is set, every partition is rewound to
        ``OFFSET_BEGINNING`` and the list is assigned explicitly. Otherwise
        the partitions are left untouched and no explicit assign is made.
        """
        logger.debug("on_assign called")

        if self.offset_earliest:
            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

            logger.info("partitions assigned for %s", self.topic_name_pattern)
            consumer.assign(partitions)
        else:
            # The previous message claimed nothing was assigned, which is not
            # what this branch establishes; it only skips the offset rewrite.
            logger.debug(
                "offset_earliest is disabled; partition offsets left unchanged for %s",
                self.topic_name_pattern,
            )

    async def consume(self):
        """Asynchronously consumes data from kafka topic.

        Drains messages with ``_consume`` until none is returned, then sleeps
        ``self.sleep_secs`` seconds and repeats forever.
        """
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message.

        Returns:
            1 if a message was received and handed to ``message_handler``,
            0 if the poll timed out or the message carried an error.
        """
        # The attribute is `topic_name_pattern`; the former `topic_pattern`
        # did not exist and raised AttributeError on every call.
        logger.debug("consuming from %s", self.topic_name_pattern)

        message = self.consumer.poll(timeout=self.consume_timeout)

        if message is None:
            logger.debug("Message was None in topic %s", self.topic_name_pattern)
            return 0

        if message.error() is not None:
            # Not inside an exception handler, so error() rather than
            # exception(); return 0 so consume() can compare the result.
            logger.error("Message could not be consumed: %s", message.error())
            return 0

        logger.debug("Message processed, key: %s , val: %s", message.key(), message.value())
        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers.

        Failures while closing are logged and swallowed; success is only
        logged when ``close()`` actually completed.
        """
        try:
            self.consumer.close()
        except Exception:
            logger.exception("Cannot close consumer: ")
        else:
            logger.debug('Consumer object closed successfully')
class ConfluentKafkaReader(object):
    """Buffers messages read from a Kafka topic with a confluent-kafka Consumer."""

    def __init__(self, host, port, group, topic, buffer_size, reconnect_wait_time=2):
        """
        Initialize Kafka reader

        Args:
            host: Broker host name.
            port: Broker port (stored as a string).
            group: Consumer group id.
            topic: Single topic name to subscribe to.
            buffer_size: Number of messages to collect per ``read()`` call.
            reconnect_wait_time: Seconds to sleep before a reconnect attempt.
        """
        logging.info("Initializing Confluent Kafka Consumer")
        self.host = host
        self.port = str(port)
        self.group = group
        self.topic = [topic]
        self.buffer_size = buffer_size
        self.reconnect_wait_time = reconnect_wait_time
        self.reconnect_retries = 0
        self.max_reconnect_retries = 10  # TODO: implement config parameter
        self.buffer = []

        # Initialized on read
        self.consumer = None

    def on_assign(self, consumer, partitions):
        """Rebalance callback; only logs the partitions that were assigned."""
        logging.debug('on_assignment callback...')
        # A %s placeholder is required for the argument to be rendered.
        logging.info('Assignment: %s', partitions)

    def _connect(self):
        """Create a new Consumer and subscribe it to the configured topic."""
        connection = {'bootstrap.servers': self.host + ":" + self.port,
                      'group.id': self.group,
                      'session.timeout.ms': 6000,
                      'default.topic.config': {'auto.offset.reset': 'largest'}}
        logging.info("Connecting to Kafka at %s...", connection)
        self.consumer = Consumer(**connection)
        self.consumer.subscribe(self.topic, on_assign=self.on_assign)

    def read(self):
        """
        Read from Kafka. Reconnect on error.

        Polls until ``buffer_size`` messages were appended to ``self.buffer``,
        a partition EOF event is seen, or the reconnect limit is reached.
        The consumer is always closed before returning.

        Returns:
            ``self.buffer``. Messages are appended to it, so its content
            accumulates across calls unless the caller clears it.
        """
        try:
            self._connect()
            msgcn = 0
            while True:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue

                if msg.error():
                    # Error or event
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        logging.debug('Catching KafkaError._PARTITION_EOF')
                        logging.error('%s [%d] reached end at offset %d\n',
                                      msg.topic(), msg.partition(), msg.offset())
                        logging.error('%s [%d] at offset %d with key %s:\n',
                                      msg.topic(), msg.partition(), msg.offset(),
                                      str(msg.key()))
                        break

                    # Any other error: wait, then reconnect up to the limit.
                    logging.debug('Catching other errors...')
                    logging.error("Kafka error: %s.", msg.error())
                    logging.error("Trying to reconnect to %s:%s", self.host, self.port)
                    self.reconnect_retries += 1
                    time.sleep(self.reconnect_wait_time)
                    if self.reconnect_retries >= self.max_reconnect_retries:
                        logging.error("Max reconnection attempt limit reached (%d). Aborting",
                                      self.max_reconnect_retries)
                        break
                    self.consumer.close()
                    self._connect()
                else:
                    # Proper message
                    logging.error('%s [%d] at offset %d with key %s:\n',
                                  msg.topic(), msg.partition(), msg.offset(),
                                  str(msg.key()))
                    value = msg.value()
                    # Strip the trailing newline with a separator of the same
                    # type as the payload (bytes.rstrip rejects a str
                    # argument); otherwise the writer adds an extra \n.
                    newline = b'\n' if isinstance(value, bytes) else u'\n'
                    self.buffer.append(value.rstrip(newline))
                    msgcn += 1
                    if msgcn >= self.buffer_size:
                        logging.debug("Read buffer [%d] reached.", self.buffer_size)
                        break
        except KeyboardInterrupt:
            logging.info('Aborted by user\n')
        finally:
            # Close down consumer to commit final offsets. It is still None
            # when the very first connection attempt failed.
            if self.consumer is not None:
                self.consumer.close()
        return self.buffer
Exemplo n.º 38
0
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use.

        Args:
            topic_name_pattern: Topic name (or pattern) to subscribe to; also
                used as the consumer ``group.id``.
            message_handler: Callable invoked with every successfully polled
                message.
            is_avro: When True an ``AvroConsumer`` is created and the schema
                registry URL is added to the broker properties; otherwise a
                plain ``Consumer`` is used.
            offset_earliest: When True, assigned partitions are rewound to
                the beginning in ``on_assign``.
            sleep_secs: Pause between polling cycles in ``consume``.
            consume_timeout: Timeout in seconds passed to ``poll``.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            # The property is plural; "bootstrap.server" is not a valid key.
            "bootstrap.servers": BROKER_URL,
            "group.id": self.topic_name_pattern,
            "default.topic.config": {
                "auto.offset.reset": "earliest"
            }
        }

        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place.

        Rewinds every partition to the beginning when
        ``self.offset_earliest`` is set, then assigns the partitions.
        """
        # The flag lives on this object, not on the partition objects.
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic.

        Drains messages with ``_consume`` until none is returned, then sleeps
        ``self.sleep_secs`` seconds and repeats forever.
        """
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message.

        Returns:
            1 if a message was received and handed to ``message_handler``,
            0 if polling raised, timed out, or the message carried an error.
        """
        try:
            # Use the instance's consumer (the bare name `consumer` was
            # undefined) and bound the wait so the caller is never blocked
            # indefinitely.
            message = self.consumer.poll(self.consume_timeout)
        except Exception as e:
            logger.error(e)
            return 0

        if message is None:
            logger.info("The message was empty.")
            return 0

        if message.error():
            # logger.error() requires a message argument.
            logger.error("Consumer error: %s", message.error())
            return 0

        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
Exemplo n.º 39
0
class KRPCClient:
    """RPC client that sends requests to ``krpc_<topic>_server`` topics and
    receives responses from ``krpc_<topic>_client`` topics.

    Any attribute that is not found on the instance is treated as the name of
    a remote method (see ``__getattr__``).
    """

    def __init__(self, *addresses, topic_name: Union[str, list],
                 max_polling_timeout: float = 0.001, **kwargs):
        """
        Init Kafka RPCClient.

        Not like the most of the RPC protocols,
        Only one KRPCClient can run on a single Kafka topic.

        If you insist using multiple KRPCClient instances,
        redis must be used, pass argument use_redis=True.

        Args:
            addresses: kafka broker host, port, for examples: '192.168.1.117:9092'
            topic_name: kafka topic_name(s), if topic exists,
                        the existing topic will be used,
                        create a new topic otherwise.
            max_polling_timeout: maximum time(seconds) to block waiting for message, event or callback.
            encrypt: default None, if not None, will encrypt the message with the given password. It will slow down performance.
            verify: default False, if True, will verify the message against the checksum from the headers.
            verification: default 'crc32', either 'crc32' or a callable that maps the payload bytes to a checksum.
            use_redis: default False, if True, use redis as cache, built-in QueueDict otherwise.
            redis_host, redis_port, redis_db, redis_password: redis connection settings, defaults 'localhost', 6379, 0, None.
            expire_time: default 600, expiry passed to the result cache.
            ack: default False, if True, server will confirm the message status. Disable ack will double the speed, but not exactly safe.
            use_gevent: default True, if True, use gevent instead of asyncio. If gevent version is lower than 1.5, krpc will not run on windows.
            compression_codec: default 'none', check https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md compression.codec. 'zstd' is bugged. Check https://github.com/confluentinc/confluent-kafka-python/issues/589
            use_compression: default False, custom compression using zstd.
            max_queue_len: int, default 1024, if use_redis is False, a QueueDict will cache results with the length of max_queue_len. This should be as low as it can be, otherwise OOM.
            handshake: default True, if True, send a handshake to every server topic on subscribe.
            message_max_bytes, queue_buffering_max_kbytes, queue_buffering_max_messages:
                producer tuning values, defaults 1048576, 1048576, 100000.
        """

        bootstrap_servers = ','.join(addresses)

        assert isinstance(topic_name, str) or isinstance(topic_name, list)
        # Copy a caller-supplied list: subscribe() appends to topic_names and
        # must not mutate the caller's object.
        self.topic_names = [topic_name] if isinstance(topic_name, str) else list(topic_name)
        self.client_topics = ['krpc_{}_client'.format(name) for name in self.topic_names]

        # set max_polling_timeout
        assert max_polling_timeout > 0, 'max_polling_timeout must be greater than 0'
        self.max_polling_timeout = max_polling_timeout

        self.consumer = Consumer({
            'bootstrap.servers': bootstrap_servers,
            'group.id': 'krpc',
            'auto.offset.reset': 'earliest',
            'auto.commit.interval.ms': 1000,
            'compression.codec': kwargs.get('compression_codec', 'none')
        })

        message_max_bytes = kwargs.get('message_max_bytes', 1048576)
        queue_buffering_max_kbytes = kwargs.get('queue_buffering_max_kbytes', 1048576)
        queue_buffering_max_messages = kwargs.get('queue_buffering_max_messages', 100000)

        if message_max_bytes > 1048576:
            logger.warning('message_max_bytes is greater than 1048576, '
                           'message.max.bytes and replica.fetch.max.bytes of '
                           'brokers\' config should be greater than this')

        self.producer = Producer({
            'bootstrap.servers': bootstrap_servers,
            'on_delivery': self.delivery_report,

            # custom parameters
            'message.max.bytes': message_max_bytes,
            'queue.buffering.max.kbytes': queue_buffering_max_kbytes,
            'queue.buffering.max.messages': queue_buffering_max_messages,
            'compression.codec': kwargs.get('compression_codec', 'none')
        })

        # add redis cache, for temporarily storage of returned data
        self.use_redis = kwargs.get('use_redis', False)
        self.expire_time = kwargs.get('expire_time', 600)
        if self.use_redis:
            import redis
            redis_host = kwargs.get('redis_host', 'localhost')
            redis_port = kwargs.get('redis_port', 6379)
            redis_db = kwargs.get('redis_db', 0)
            redis_password = kwargs.get('redis_password', None)
            self.cache = redis.Redis(redis_host, redis_port, redis_db, redis_password)
            self.cache_channel = self.cache.pubsub()
        else:
            self.cache = QueueDict(maxlen=kwargs.get('max_queue_len', 1024), expire=self.expire_time)

        self.verify = kwargs.get('verify', False)
        self.verification_method = kwargs.get('verification', 'crc32')
        if self.verification_method == 'crc32':
            self.verification_method = lambda x: hex(zlib.crc32(x)).encode()
        elif not callable(self.verification_method):
            raise AssertionError('not supported verification function.')

        self.encrypt = kwargs.get('encrypt', None)
        if self.encrypt is not None:
            self.encrypt = AESEncryption(self.encrypt, encrypt_length=16)

        self.use_compression = kwargs.get('use_compression', False)

        self.is_closed = False

        # Handshake bookkeeping must exist before the polling thread starts
        # and regardless of whether handshaking is enabled: __getattr__ turns
        # every missing attribute into an RPC stub, so a hasattr() probe on
        # `handshaked` could never report "disabled".
        self.handshaked = {}
        self._handshake_enabled = bool(kwargs.get('handshake', True))

        # coroutine pool
        use_gevent = kwargs.get('use_gevent', True)
        if use_gevent:
            from gevent.threadpool import ThreadPoolExecutor as gThreadPoolExecutor
            self.pool = gThreadPoolExecutor(1)
        else:
            from aplex import ThreadAsyncPoolExecutor
            self.pool = ThreadAsyncPoolExecutor(pool_size=1)
        self.pool.submit(self.wait_forever)

        # handshake, if's ok not to handshake, but the first rpc would be slow.
        self.subscribe(*self.topic_names)

        # acknowledge, disable ack will double the speed, but not exactly safe.
        self.ack = kwargs.get('ack', False)

    def subscribe(self, *topic_names):
        """Subscribe to the client topics of ``topic_names`` and, when
        handshaking is enabled, handshake with each matching server topic.

        Topics that are already known are not added a second time.
        """
        if not topic_names:
            return

        for topic_name in topic_names:
            client_topic = 'krpc_{}_client'.format(topic_name)

            # __init__ pre-populates both lists and then calls subscribe()
            # with the same names; skip duplicates.
            if topic_name not in self.topic_names:
                self.topic_names.append(topic_name)
            if client_topic not in self.client_topics:
                self.client_topics.append(client_topic)

        self.consumer.subscribe(self.client_topics)
        logger.info('adding consumer subscription of: {}'.format(topic_names))

        if self._handshake_enabled:
            for topic_name in topic_names:
                self.handshaked[topic_name] = False
                server_topic = 'krpc_{}_server'.format(topic_name)
                self.producer.produce(server_topic, b'handshake', b'handshake',
                                      headers={
                                          'checksum': None
                                      })
                self.producer.poll(0.0)
                logger.info('sending handshake to {}'.format(server_topic))
                # Wait up to 15 * 2 seconds for wait_forever() to flag the reply.
                for i in range(15):
                    if self.handshaked[topic_name]:
                        logger.info('handshake of {} succeeded.'.format(topic_name))
                        break
                    time.sleep(2)
                else:
                    logger.error('failed to handshake with {}'.format(server_topic))

    @staticmethod
    def delivery_report(err, msg):
        """Producer delivery callback: log failure or the destination."""
        if err is not None:
            logger.error('request failed: {}'.format(err))
        else:
            logger.info('request sent to {} [{}]'.format(msg.topic(), msg.partition()))

    @staticmethod
    def parse_response(msg_value):
        """Unpack a msgpack payload; returns ``None`` (and logs) on failure."""
        try:
            res = msgpack.unpackb(msg_value, use_list=False, raw=False)
        except Exception as e:
            logger.exception(e)

            res = None
        return res

    def call(self, method_name, *args, **kwargs):
        """Invoke ``method_name`` remotely and wait for its response.

        Keyword arguments ``timeout`` (default 10) and ``topic_name``
        (default: first subscribed topic) are consumed here and not forwarded
        to the remote method.

        Returns:
            dict with ``ret``, ``tact_time``, ``tact_time_server``,
            ``server_id``, ``flight_time_request`` and
            ``flight_time_response``.

        Raises:
            TimeoutError: no response arrived within ``timeout``.
        """
        # rpc call timeout
        # WARNING: if the rpc method has an argument named timeout, it will be not be passed.
        timeout = kwargs.pop('timeout', 10)

        # get topic_name
        topic_name = kwargs.pop('topic_name', self.topic_names[0])
        server_topic = 'krpc_{}_server'.format(topic_name)

        start_time = time.time()

        # send request back to server
        req = {
            'method_name': method_name,
            'args': args,
            'kwargs': kwargs
        }

        req = msgpack.packb(req, use_bin_type=True)

        if self.use_compression:
            req = zstd.compress(req)

        if self.encrypt:
            req = self.encrypt.encrypt(req)

        if self.verify:
            checksum = self.verification_method(req)
        else:
            checksum = None

        task_id = uuid.uuid4().hex

        self.producer.produce(server_topic, req, task_id,
                              headers={
                                  'checksum': checksum
                              })

        # waiting for response from server sync/async
        res, flight_time_response = self.poll_result_from_cache(task_id, timeout)

        if self.ack:
            self.producer.poll(0.0)

        # do something to the response
        ret = res['ret']
        tact_time_server = res['tact_time']
        flight_time_request = res['flight_time_request']
        server_id = res['server_id']
        exception = res['exception']
        tb = res['traceback']

        if exception is not None:
            # SECURITY NOTE(review): unpickling data received over the network
            # can execute arbitrary code; only safe with fully trusted servers.
            exception = pickle.loads(exception)
            # Not inside an exception handler, so use error() rather than
            # exception() (which would append a bogus "NoneType: None").
            logger.error(exception)
            if tb is not None:
                logger.error(tb)

        end_time = time.time()

        return {
            'ret': ret,
            'tact_time': end_time - start_time,
            'tact_time_server': tact_time_server,
            'server_id': server_id,
            'flight_time_request': flight_time_request,
            'flight_time_response': flight_time_response
        }

    def wait_forever(self):
        """Polling loop run on the pool thread until ``is_closed`` is set.

        Handshake replies flag ``self.handshaked``; every other message is
        (optionally) checksum-verified and stored in the cache under its key.
        """
        while True:
            if self.is_closed:
                logger.info('user exit')
                break

            try:
                msg = self.consumer.poll(self.max_polling_timeout)

                if msg is None:
                    continue
                if msg.error():
                    logger.error("consumer error: {}".format(msg.error()))
                    continue

                task_id = msg.key()  # an uuid, the only id that pairs the request and the response
                topic_name = msg.topic()

                if task_id == b'handshake':
                    try:
                        # 'krpc_<name>_client' -> '<name>'
                        real_topic_name = '_'.join(topic_name.split('_')[1:-1])
                    except Exception:
                        logger.error('invalid topic name {}'.format(topic_name))
                        continue
                    self.handshaked[real_topic_name] = True
                    continue

                res = msg.value()
                headers = msg.headers()
                timestamp = msg.timestamp()
                checksum = headers[0][1]

                if self.verify:
                    signature = self.verification_method(res)
                    if checksum != signature:
                        logger.error('checksum mismatch of task {}'.format(task_id))
                        continue

                if self.use_redis:
                    self.cache.hset(task_id, b'result', res)
                    self.cache.hset(task_id, b'flight_time_response', time.time() - timestamp[1] / 1000)
                    self.cache.expire(task_id, self.expire_time)
                else:
                    self.cache[task_id] = res, time.time() - timestamp[1] / 1000

                # send signal for polling to search for result
                ...

            except Exception as e:
                logger.exception(e)

    def poll_result_from_cache(self, task_id, timeout=10):
        """
        Poll the cache until the response for ``task_id`` shows up.

        The cache is checked every ``max_polling_timeout`` seconds until
        ``timeout`` seconds have elapsed. Both cache backends now wait between
        checks; the redis branch previously looped without sleeping and could
        give up long before ``timeout``.

        Args:
            task_id: request id (str) used as the cache key after encoding.
            timeout: maximum number of seconds to wait.

        Returns:
            tuple of (parsed response, flight_time_response).

        Raises:
            TimeoutError: no response was cached within ``timeout`` seconds.
        """
        task_id = task_id.encode()
        deadline = time.monotonic() + timeout

        while True:
            if self.use_redis:
                if self.cache.hexists(task_id, 'result'):
                    res = self.cache.hget(task_id, b'result')
                    flight_time_response = self.cache.hget(task_id, b'flight_time_response')
                    break
            else:
                try:
                    res, flight_time_response = self.cache[task_id]
                    break
                except Exception:
                    # Presumably "not cached yet" - the exact exception raised
                    # by QueueDict is not visible here, so stay broad but let
                    # KeyboardInterrupt/SystemExit through.
                    pass

            if time.monotonic() >= deadline:
                raise TimeoutError
            time.sleep(self.max_polling_timeout)

        if self.encrypt:
            res = self.encrypt.decrypt(res)

        if self.use_compression:
            res = zstd.decompress(res)

        res = self.parse_response(res)

        return res, flight_time_response

    def __getattr__(self, method_name):
        """Turn unknown attributes into remote-call stubs.

        Dunder names are excluded so protocol probes such as
        ``hasattr(obj, '__deepcopy__')`` are not mistaken for RPC methods.
        """
        if method_name.startswith('__') and method_name.endswith('__'):
            raise AttributeError(method_name)
        return lambda *args, **kwargs: self.call(method_name, *args, **kwargs)

    def close(self):
        """Stop the polling loop and release redis, consumer, producer and pool."""
        self.is_closed = True
        if self.use_redis:
            self.cache_channel.close()
            self.cache.close()
        self.consumer.close()
        self.producer.flush()
        self.pool.shutdown()
Exemplo n.º 40
0
class SynchronizedConsumer(KafkaConsumerFacade):
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to both
    commit log topic, as well as the topic(s) that we are actually interested
    in consuming messages from. The messages received from the commit log topic
    control whether or not consumption from partitions belonging to the main
    topic is paused, resumed, or allowed to continue in its current state
    without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic for
    that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)

    This class implements the KafkaConsumerFacade so that it can be used with
    BatchingKafkaConsumer.
    """

    # Maps the ``initial_offset_reset`` constructor argument to the function
    # used to pick a starting offset when no committed offset exists. Each
    # function is called as ``fn(consumer, topic, partition)``.
    initial_offset_reset_strategies = {
        "earliest": get_earliest_offset,
        "latest": get_latest_offset
    }

    def __init__(
        self,
        cluster_name,
        consumer_group,
        commit_log_topic,
        synchronize_commit_group,
        initial_offset_reset="latest",
        on_commit=None,
    ):
        """
        Start the commit log consumer and create the main consumer.

        Args:
            cluster_name: passed to ``kafka_config`` to build the consumer
                configuration, and to the commit log consumer.
            consumer_group: ``group.id`` of the main consumer; also used as
                the prefix of the commit log consumer's group name.
            commit_log_topic: topic the followed group reports offsets to.
            synchronize_commit_group: the consumer group being followed.
            initial_offset_reset: key into
                ``initial_offset_reset_strategies``; an unknown key raises
                ``KeyError``.
            on_commit: optional callable invoked as
                ``on_commit(error, partitions)`` from the commit callback.
        """
        self.cluster_name = cluster_name
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[
            initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        # The commit log consumer is started before ``self.__consumer``
        # exists; ``__on_partition_state_change`` returns early while no
        # local offset is known, before it would touch ``self.__consumer``.
        (
            self.__commit_log_consumer,
            self.__commit_log_consumer_stop_request,
        ) = self.__start_commit_log_consumer()

        # (topic, partition) -> next offset to consume, maintained by
        # ``subscribe`` callbacks and ``poll``.
        self.__positions = {}

        def commit_callback(error, partitions):
            # Forward commit results to the user callback, if one was given.
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = kafka_config.get_kafka_consumer_cluster_options(
            cluster_name,
            override_params={
                "group.id": self.consumer_group,
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "true",
                "enable.partition.eof": "false",
                "default.topic.config": {
                    "auto.offset.reset": "error"
                },
                "on_commit": commit_callback,
            },
        )

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.

        ``run_commit_log_consumer`` is handed to ``execute`` with a unique
        consumer group name, then this method blocks on ``start_event`` for up
        to ``timeout`` seconds (indefinitely when ``None``).

        Returns:
            ``(result, stop_request_event)`` where ``result`` is whatever
            ``execute`` returned (used elsewhere in this class through
            ``running()`` and ``result()``) and ``stop_request_event`` is the
            ``threading.Event`` set by ``close`` to request shutdown.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                cluster_name=self.cluster_name,
                consumer_group=f"{self.consumer_group}:sync:{uuid.uuid1().hex}",
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ))
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        """
        Raise if the commit log consumer is no longer running.

        ``result(timeout=0)`` is called first so that an exception raised by
        the commit log consumer itself propagates instead of the generic one
        below.
        """
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            # NOTE(review): ``TimeoutError`` is whichever name is in module
            # scope here - confirm it is the exception type that
            # ``result(timeout=0)`` actually raises.
            except TimeoutError:
                pass  # not helpful

            raise Exception("Commit log consumer unexpectedly exit!")

    def __on_partition_state_change(self, topic, partition,
                                    previous_state_and_offsets,
                                    current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.

        Pauses the partition for the UNKNOWN, SYNCHRONIZED and REMOTE_BEHIND
        states and resumes it for LOCAL_BEHIND. Does nothing while the local
        offset is unknown. Any other state raises ``NotImplementedError``.
        """
        logger.debug(
            "State change for %r: %r to %r",
            (topic, partition),
            previous_state_and_offsets,
            current_state_and_offsets,
        )

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (
                SynchronizedPartitionState.UNKNOWN,
                SynchronizedPartitionState.SYNCHRONIZED,
                SynchronizedPartitionState.REMOTE_BEHIND,
        ):
            self.__consumer.pause(
                [TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume(
                [TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError(
                f"Unexpected partition state: {current_state}")

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.

        Args:
            topics: topics passed through to the underlying consumer.
            on_assign: optional callable invoked as
                ``on_assign(self, partitions)`` after the assignment has been
                applied.
            on_revoke: optional callable invoked as
                ``on_revoke(self, assignment)`` after positions for the
                revoked partitions have been dropped.

        Raises:
            Exception: the commit log consumer is no longer running.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            # Start from the positions this instance already tracks; partitions
            # without one map to ``None`` and are resolved below.
            assignment = {(i.topic, i.partition): self.__positions.get(
                (i.topic, i.partition))
                          for i in assignment}

            # For partitions without a tracked position, use the committed
            # offset when there is one (> -1), else the configured strategy.
            for i in self.__consumer.committed([
                    TopicPartition(topic, partition)
                    for (topic, partition), offset in assignment.items()
                    if offset is None
            ]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(
                        consumer, i.topic, i.partition)

            self.__consumer.assign([
                TopicPartition(topic, partition, offset)
                for (topic, partition), offset in assignment.items()
            ])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(
                    topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(
                    self,
                    [
                        TopicPartition(topic, partition)
                        for topic, partition in assignment.keys()
                    ],
                )

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                # ``pop`` without a default: a partition that was never
                # tracked raises ``KeyError`` here.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def poll(self, timeout):
        """
        Poll the main consumer once.

        Returns ``None`` when no message arrived. A message carrying an error
        is returned as-is, without touching any offsets. Otherwise the message
        is validated against the partition state manager and both the state
        manager and the tracked position advance to ``offset + 1`` before the
        message is returned.

        Raises:
            Exception: the commit log consumer is no longer running.
        """
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(),
            message.offset() + 1)
        self.__positions[(message.topic(),
                          message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        """
        Forward all arguments to the underlying consumer's ``commit`` and
        return its result.

        Raises:
            Exception: the commit log consumer is no longer running.
        """
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        """
        Request the commit log consumer to stop, close the main consumer and
        wait for the commit log consumer to finish.

        Raises:
            Exception: the commit log consumer is no longer running; this is
                checked first, so nothing is closed in that case.
        """
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            # Wait for the commit log consumer even if closing the main
            # consumer failed.
            self.__commit_log_consumer.result()
class KafkaConsumer:
    """Defines the base kafka consumer class"""
    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: topic name (or pattern) to subscribe to.
            message_handler: callable invoked with every successfully
                received message.
            is_avro: when exactly ``True`` an ``AvroConsumer`` is created,
                otherwise a plain ``Consumer``.
            offset_earliest: when true, every assigned partition is rewound
                to ``OFFSET_BEGINNING``.
            sleep_secs: pause between polling rounds in ``consume``.
            consume_timeout: timeout in seconds for a single poll.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            'bootstrap.servers': config['broker']['bootstrap.servers'],
            "group.id": f"{config['broker']['group.id']}"
        }

        if is_avro is True:
            self.broker_properties["schema.registry.url"] = config['broker'][
                'schema.registry.url']
            self.consumer = AvroConsumer(config=self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the loop.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        # Honour the configured timeout: this runs synchronously inside the
        # ``consume`` coroutine, so a hard-coded 1.0s poll would stall the
        # event loop and ignore the ``consume_timeout`` constructor argument.
        message = self.consumer.poll(self.consume_timeout)
        if message is None:
            logger.info("no message received by consumer")
        elif message.error() is not None:
            logger.error("error from consumer: %s", message.error())
        else:
            logger.info("message received by consumer")
            self.message_handler(message)
            return 1

        return 0

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
        logger.info("Shutting down consumer.")
class KafkaConsumer(BaseKafkaConsumer):
    """Thin wrapper around a Kafka ``Consumer`` driven by a config dict.

    The ``config["consumer"]`` section is read for ``conf`` (passed to the
    ``Consumer``), ``topics``, ``poll_timeout`` and the optional
    ``assign_offset_end`` / ``internal_log_path`` keys.
    """

    def __init__(self, config):
        """Build the underlying consumer from ``config["consumer"]``.

        Note that ``config["consumer"]["conf"]`` is modified in place: a
        random ``group.id`` is filled in when missing and ``error_cb`` (plus
        ``logger`` when ``internal_log_path`` is set) is added.
        """
        self._config = config["consumer"]
        self.assign_offset_end = self._config.get("assign_offset_end", False)
        conf = self._config["conf"]
        conf.setdefault("group.id", str(uuid.uuid1()))
        self.autocommit_enabled = conf.get("enable.auto.commit", True)
        internal_log_path = self._config.get("internal_log_path")
        conf["error_cb"] = self._error_callback
        if internal_log_path:
            # Route the client's internal log into a per-process file.
            debug_logger = logging.getLogger("debug_consumer")
            timestamp = time.strftime("_%d%m%Y_")
            debug_logger.addHandler(
                logging.FileHandler("{}/kafka_consumer_debug{}{}.log".format(
                    internal_log_path, timestamp, os.getpid())))
            conf["logger"] = debug_logger
        self._consumer = Consumer(**conf)

    @staticmethod
    def on_assign_offset_end(consumer, partitions):
        """Assignment callback that moves every partition to ``OFFSET_END``."""
        for p in partitions:
            p.offset = OFFSET_END
        KafkaConsumer.on_assign_log(consumer, partitions)
        consumer.assign(partitions)

    @staticmethod
    def on_assign_log(consumer, partitions):
        """Assignment callback that only logs the assigned partitions.

        Logged at ERROR level when any partition carries an error, otherwise
        at WARNING level.
        """
        log_level = "WARNING"
        for p in partitions:
            if p.error:
                log_level = "ERROR"
        params = {
            "partitions": partitions,
            log_const.KEY_NAME: log_const.KAFKA_ON_ASSIGN_VALUE,
            "log_level": log_level
        }
        log("KafkaConsumer.subscribe<on_assign>: assign %(partitions)s %(log_level)s",
            params=params,
            level=log_level)

    def subscribe(self, topics=None):
        """Subscribe to ``topics``, defaulting to the configured topic values."""
        topics = topics or list(self._config["topics"].values())
        self._consumer.subscribe(
            topics,
            on_assign=KafkaConsumer.on_assign_offset_end
            if self.assign_offset_end else KafkaConsumer.on_assign_log)

    def unsubscribe(self):
        """Drop the current subscription."""
        self._consumer.unsubscribe()

    def poll(self):
        """Poll once; returns the processed message or ``None``."""
        msg = self._consumer.poll(self._config["poll_timeout"])
        if msg is not None:
            return self._process_message(msg)

    def consume(self, num_messages: int = 1):
        """Yield up to ``num_messages`` processed messages from one batch."""
        messages = self._consumer.consume(num_messages=num_messages,
                                          timeout=self._config["poll_timeout"])
        for msg in messages:
            yield self._process_message(msg)

    def commit_offset(self, msg):
        """Record ``msg`` as handled.

        With auto-commit enabled the offset is only stored; otherwise it is
        committed synchronously.
        """
        if msg is not None:
            if self.autocommit_enabled:
                self._consumer.store_offsets(msg)
            else:
                # ``async`` is a reserved word, hence the dict unpacking.
                self._consumer.commit(msg, **{"async": False})

    def get_msg_create_time(self, mq_message):
        """Return the message timestamp, or ``None`` when it is unavailable."""
        timestamp_type, timestamp = mq_message.timestamp()
        # Compare by value: identity (``is not``) on an integer constant only
        # works by accident of the interpreter's small-int caching.
        return timestamp if timestamp_type != TIMESTAMP_NOT_AVAILABLE else None

    def _error_callback(self, err):
        """Log a client-level error and bump the exception counter."""
        params = {
            "error": str(err),
            log_const.KEY_NAME: log_const.EXCEPTION_VALUE
        }
        log("KafkaConsumer: Error: %(error)s", params=params, level="WARNING")
        monitoring.got_counter("kafka_consumer_exception")

    # noinspection PyMethodMayBeStatic
    def _process_message(self, msg: KafkaMessage):
        """Validate a raw message.

        Returns ``None`` for partition-EOF events and for messages without a
        value; returns the message (with headers defaulted to ``[]``)
        otherwise.

        Raises:
            KafkaException: the message carries any other error.
        """
        err = msg.error()
        if err:
            if err.code() == KafkaError._PARTITION_EOF:
                return None
            else:
                monitoring.got_counter("kafka_consumer_exception")
                params = {
                    "code": err.code(),
                    "pid": os.getpid(),
                    "topic": msg.topic(),
                    "partition": msg.partition(),
                    "offset": msg.offset(),
                    log_const.KEY_NAME: log_const.EXCEPTION_VALUE
                }
                log(
                    "KafkaConsumer Error %(code)s at pid %(pid)s: topic=%(topic)s partition=[%(partition)s] "
                    "reached end at offset %(offset)s\n",
                    params=params,
                    level="WARNING")
                raise KafkaException(err)

        if msg.value():
            if msg.headers() is None:
                msg.set_headers([])
            return msg

    def close(self):
        """Close the underlying consumer."""
        self._consumer.close()
Exemplo n.º 43
0
class KafkaConsumer:
    """Defines the base kafka consumer class"""
    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: topic name (or pattern) to subscribe to; it is
                also used as the consumer ``group.id``.
            message_handler: callable invoked with every successfully
                received message.
            is_avro: when exactly ``True`` an ``AvroConsumer`` is created,
                otherwise a plain ``Consumer``.
            offset_earliest: when true, every assigned partition is rewound
                to ``OFFSET_BEGINNING``.
            sleep_secs: pause between polling rounds in ``consume``.
            consume_timeout: timeout in seconds for a single poll.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "bootstrap.servers": BROKER_URL,
            "group.id": topic_name_pattern
        }

        # Create the Consumer, using the appropriate type.
        if is_avro is True:
            self.broker_properties["schema.registry.url"] = SCHEMA_REGISTRY_URL
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        # ``on_assign`` is where the optional rewind to the earliest offset
        # happens, so it is always registered.
        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        # If the topic is configured to use `offset_earliest` set the partition offset to
        # the beginning or earliest
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the loop.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        message = self.consumer.poll(self.consume_timeout)
        if message is None:
            return 0

        error = message.error()
        if error is not None:
            # Surface consumer errors instead of dropping them silently; the
            # message is still counted as "nothing received".
            logger.error("error from consumer: %s", error)
            return 0

        self.message_handler(message)
        return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
Exemplo n.º 44
0
class KafkaConsumerWorker(BaseWorker):
    """Worker that consumes one Kafka topic and hands decoded messages to
    ``consume_message``.

    Subclasses set ``topic_name`` and ``consumer_name`` and override
    ``consume_message``; the remaining class attributes tune polling,
    committing and field coercion.
    """

    topic_name = None
    consumer_name = None
    # Per-subclass overrides merged over the defaults in
    # ``get_consumer_settings``.
    consumer_settings = {}
    commit_on_complete = True
    async_commit = True
    poll_timeout = 0.01
    sleep_time = 0.05
    # Message fields coerced by ``parse_message``.
    timestamp_fields = ['timestamp']
    decimal_fields = []
    boolean_fields = []

    def setup(self):
        """Create the consumer and serializer, then subscribe to the topic."""
        self.consumer = Consumer(**self.get_consumer_settings())
        self.serializer = self.get_message_serializer()
        self.set_topic()

    def teardown(self):
        """Close the consumer."""
        self.consumer.close()

    def get_topic_name(self):
        """Return ``topic_name``; raises ``NotImplementedError`` when unset."""
        if self.topic_name is None:
            raise NotImplementedError
        return self.topic_name

    def get_consumer_name(self):
        """Return ``consumer_name``; raises ``NotImplementedError`` when unset."""
        if self.consumer_name is None:
            raise NotImplementedError
        return self.consumer_name

    def get_broker_url(self):
        """Return ``settings.BROKER_URL``; raises ``NotImplementedError`` when unset."""
        broker_url = settings.BROKER_URL
        if broker_url is None:
            raise NotImplementedError
        return broker_url

    def get_zookeeper_url(self):
        """Return ``settings.ZOOKEEPER_URL``; raises ``NotImplementedError`` when unset."""
        zookeeper_url = settings.ZOOKEEPER_URL
        if zookeeper_url is None:
            raise NotImplementedError
        return zookeeper_url

    def get_consumer_settings(self):
        """Build the consumer configuration from defaults plus
        ``consumer_settings``."""
        broker_url = self.get_broker_url()
        logger.debug('connecting to kafka: ' + broker_url)

        consumer_name = self.get_consumer_name()
        logger.debug('using group id: ' + consumer_name)

        initial_settings = {
            'api.version.request': True,
            'broker.version.fallback': '0.9.0',
            'client.id': 'JanglConsumer',
            'bootstrap.servers': broker_url,
            'group.id': consumer_name,
            'default.topic.config': {'auto.offset.reset': 'earliest'},
            'enable.auto.commit': False,
            'on_commit': self.on_commit,
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
        }
        return generate_client_settings(initial_settings, self.consumer_settings)

    def get_message_serializer(self):
        """Create a ``MessageSerializer`` backed by the schema registry."""
        schema_registry_url = self.get_schema_registry_url()
        logger.debug('loading schema registry: ' + schema_registry_url)
        schema_client = CachedSchemaRegistryClient(url=schema_registry_url)
        return MessageSerializer(schema_client)

    def get_schema_registry_url(self):
        """Resolve the schema registry URL.

        ``settings.SCHEMA_MICROSERVICE`` takes precedence (resolved through
        ``get_service_url``); otherwise ``settings.SCHEMA_REGISTRY_URL`` is
        used. Raises ``NotImplementedError`` when neither yields a URL.
        """
        schema_microservice = settings.SCHEMA_MICROSERVICE
        if schema_microservice:
            schema_registry_url = get_service_url(schema_microservice)
        else:
            schema_registry_url = settings.SCHEMA_REGISTRY_URL
        if schema_registry_url is None:
            raise NotImplementedError
        return schema_registry_url

    def set_topic(self):
        """Subscribe the consumer to the configured topic."""
        topic_name = self.get_topic_name()
        logger.debug('set kafka topic: ' + topic_name)
        self.consumer.subscribe([topic_name], on_assign=self.on_assign, on_revoke=self.on_revoke)

    def on_assign(self, consumer, partitions):
        """Rebalance callback: accept the assigned partitions."""
        logger.debug('partitions assigned: {}'.format(partitions))
        consumer.assign(partitions)

    def on_revoke(self, consumer, partitions):
        """Rebalance callback: commit synchronously (best effort), then
        unassign."""
        logger.debug('partitions revoked: {}'.format(partitions))
        try:
            # ``async`` became a reserved word in Python 3.7, so writing it
            # as a literal keyword argument is a SyntaxError; pass it through
            # dict unpacking instead.
            consumer.commit(**{'async': False})
        except KafkaException:
            # Nothing to commit / commit failed: the partitions are going
            # away regardless.
            pass
        consumer.unassign()

    def on_commit(self, err, partitions):
        """Commit callback: log the outcome."""
        if err is None:
            logger.debug('commit done: {}'.format(partitions))
        else:
            logger.error('commit error: {} - {}'.format(err, partitions))

    def handle(self):
        """Poll once and process the result.

        A real message is decoded, parsed, passed to ``consume_message`` and
        optionally committed. Partition-EOF events are logged; any other
        message error raises ``KafkaException``. ``done()`` is called after a
        message or EOF event, ``wait()`` when the poll returned nothing.
        """
        message = self.consumer.poll(timeout=self.poll_timeout)

        if message is not None:
            if message.error():
                if message.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    logger.info('%% %s [%d] reached end at offset %d\n' %
                                (message.topic(), message.partition(), message.offset()))
                else:
                    raise KafkaException(message.error())
            else:
                message = DecodedMessage(self.serializer, message)
                message = self.parse_message(message)

                self.consume_message(message)

                if self.commit_on_complete:
                    self.commit()
            self.done()
        else:
            self.wait()

    def parse_message(self, message):
        """Coerce the configured fields of ``message`` in place and return it.

        Timestamps are first read as seconds and, if that is out of range,
        as milliseconds. Values that cannot be converted are left untouched.
        """
        for field in self.timestamp_fields:
            if field in message:
                try:
                    message[field] = datetime.fromtimestamp(message[field], utc)
                except ValueError:
                    # Out of range for seconds: retry as milliseconds.
                    try:
                        message[field] = datetime.fromtimestamp(message[field]/1000, utc)
                    except TypeError:
                        pass
                except TypeError:
                    pass
        for field in self.decimal_fields:
            if field in message:
                try:
                    message[field] = decimal.Decimal(message[field])
                except (TypeError, decimal.InvalidOperation):
                    pass
        for field in self.boolean_fields:
            if field in message:
                try:
                    message[field] = bool(message[field])
                except TypeError:
                    pass
        return message

    def commit(self):
        """Commit the current offsets unless the subclass enabled auto-commit."""
        if not self.consumer_settings.get('enable.auto.commit'):
            # See ``on_revoke`` for why ``async`` is passed via a dict.
            self.consumer.commit(**{'async': self.async_commit})

    def consume_message(self, message):
        """Hook for subclasses; called with every parsed message."""
        pass
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    BROKER_URL = "PLAINTEXT://localhost:9092"
    NAMESPACE = "cta.trains.monitor"

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use

        Args:
            topic_name_pattern: topic name (or pattern) to subscribe to; its
                string form is also used as the consumer ``group.id``.
            message_handler: callable invoked with every successfully
                received message.
            is_avro: when exactly ``True`` an ``AvroConsumer`` is created,
                otherwise a plain ``Consumer``.
            offset_earliest: when true, ``on_assign`` is registered and
                rewinds every assigned partition to the beginning.
            sleep_secs: pause between polling rounds in ``consume``.
            consume_timeout: timeout in seconds for a single poll.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest
        self.broker_properties = {
            "bootstrap.servers": KafkaConsumer.BROKER_URL,
            "group.id": f"{self.topic_name_pattern}"
        }

        # Instantiate AvroConsumer/Consumer
        # Use AvroConsumer for Avro formatted stream
        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(config=self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        # Topic subscription with conditional earliest offset
        if self.offset_earliest:
            self.consumer.subscribe(topics=[self.topic_name_pattern],
                                    on_assign=self.on_assign)
        else:
            self.consumer.subscribe(topics=[self.topic_name_pattern])

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        # Imported locally so this class does not depend on the constant
        # being imported at module level.
        from confluent_kafka import OFFSET_BEGINNING

        for partition in partitions:
            # Use the logical "beginning" marker rather than the absolute
            # offset 0, which is out of range once old segments have been
            # deleted by retention.
            partition.offset = OFFSET_BEGINNING
            logger.info(f"Parition offset set")

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            # Drain everything currently available, then yield to the loop.
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        logger.info("Processing poll")

        # Honour the configured timeout: this runs synchronously inside the
        # ``consume`` coroutine, so a hard-coded 1.0s poll would stall the
        # event loop and ignore the ``consume_timeout`` constructor argument.
        message = self.consumer.poll(timeout=self.consume_timeout)
        if message is None:
            return 0
        elif message.error() is not None:
            logger.debug(f"message error: {message.error()}")
            return 0
        else:
            self.message_handler(message)
            return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
class StreamAbsGen(object):
    """Iterable abstraction over several streaming data stores.

    ``data_storage`` selects the backend; ``__init__`` opens the matching
    file/connection and ``__iter__`` yields the stream elements. The syntax
    used here (single-argument ``print(...)``) runs unchanged on Python 2 and
    Python 3.

    Files opened in ``__init__`` stay open for the lifetime of the object;
    callers that care should close ``inputfile`` / ``partition_stream``
    themselves.
    """

    def __init__(self, data_storage, data_source):
        """Open the backend selected by ``data_storage``.

        Args:
            data_storage: one of "file", "hive", "hbase", "cassandra",
                "USBWWAN_stream", "KingCobra", "Spark_Parquet",
                "AsFer_Encoded_Strings", "Kafka", "Socket_Streaming",
                "OperatingSystem", "TextHistogramPartition",
                "DictionaryHistogramPartition".
            data_source: backend specific: a file path for "file" and
                "DictionaryHistogramPartition", a host name for
                "Socket_Streaming", an iterable of paths for
                "TextHistogramPartition", otherwise a label such as "RZF",
                "movielens", "USBWWAN", "KingCobra", "Spark_Streaming",
                "NeuronRain" or "SchedulerRunQueue".
        """
        # For Apache Cassandra, HBase and Hive, code from HivePythonClient.py
        # for HiveServer2, HBasePythonClient.py and CassandraPythonClient.py
        # has been replicated in __iter__().
        self.data_storage = data_storage
        self.data_source = data_source

        if self.data_storage == "KingCobra":
            self.inputfile = open("/var/log/kingcobra/REQUEST_REPLY.queue")

        if self.data_storage == "AsFer_Encoded_Strings":
            self.inputfile = open("../cpp-src/asfer.enterprise.encstr")

        if self.data_storage == "file":
            self.inputfile = open(data_source, "r")

        if self.data_storage == "USBWWAN_stream":
            self.inputfile = open("../../usb-md-github-code/usb_wwan_modified/testlogs/kern.log.print_buffer_byte")

        if self.data_storage == "hbase":
            self.hbase_connection = happybase.Connection(host='localhost', port=9090, transport='buffered')
            self.hbase_table = self.hbase_connection.table('stream_data')
            print("StreamAbsGen:__init__():connected to HBase table")

        if self.data_storage == "hive":
            # pyhs2 client - requires SASL
            self.hive_conn = pyhs2.connect(host='localhost',
                                           port=10000,
                                           authMechanism="PLAIN",
                                           user='******',
                                           password='******',
                                           database='default')
            self.hive_cur = self.hive_conn.cursor()
            # Show databases
            print(self.hive_cur.getDatabases())

            # Execute query
            self.hive_cur.execute("CREATE TABLE stream_data (alphanum STRING)")
            self.hive_cur.execute("select * from stream_data")

            # Return column info from query
            print(self.hive_cur.getSchema())
            print("StreamAbsGen:__init__():connected to Hive table")

        if self.data_storage == "cassandra":
            self.cl = Cluster()
            self.session = self.cl.connect('cassandrakeyspace')
            # ``with`` closes the fixture file once it has been loaded.
            with open('movielens_stream2.data') as inputf:
                for line in inputf:
                    linetoks = line.split(' ')
                    # NOTE(review): the statement is assembled by string
                    # concatenation; acceptable for the bundled fixture file,
                    # but switch to bound parameters before feeding it any
                    # external data.
                    query = 'INSERT INTO stream_data(row_id,alphanum) VALUES (\'' + linetoks[0] + '\',\'' + linetoks[1] + '\');'
                    print(query)
                    # Was a bare ``session`` (NameError): the session lives
                    # on the instance.
                    self.session.execute(query)
            self.query = 'SELECT * FROM stream_data'
            self.resultrows = self.session.execute(self.query)
            print("StreamAbsGen:__init__(): connected to Cassandra")

        if self.data_storage == "Kafka":
            self.c = Consumer({'bootstrap.servers': '0', 'group.id': 'test-consumer-group', 'default.topic.config': {'auto.offset.reset': 'smallest'}})
            self.c.subscribe(['neuronraindata'])
        if self.data_storage == "Socket_Streaming":
            self.streaming_host = self.data_source
            self.streaming_port = 64001
        if self.data_storage == "OperatingSystem":
            self.streaming_host = "localhost"
        if self.data_storage == "TextHistogramPartition":
            self.partition_stream = []
            for ds in data_source:
                self.partition_stream.append(open(ds, "r"))
        if self.data_storage == "DictionaryHistogramPartition":
            self.partition_stream = open(data_source, "r")

    def __iter__(self):
        """Yield the elements of the stream selected by ``data_storage``."""
        if self.data_storage == "Spark_Parquet":
            # Tokens filtered out of the parquet word stream.
            stopwords = [' ','or','and','who','he','she','whom','well','is','was','were','are','there','where','when','may', 'The', 'the', 'In','in','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',' ','.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';','\'s','1','2','3','4','5','6','7','8','9','0']
            self.spark = SparkSession.builder.getOrCreate()
            spark_stream_parquet = self.spark.read.parquet("../java-src/bigdata_analytics/spark_streaming/word.parquet")
            spark_stream_parquet_DS = spark_stream_parquet.rdd.filter(lambda row: row.word not in stopwords)
            for r in spark_stream_parquet_DS.collect():
                print("StreamiAbsGen(Spark Parquet): iterator yielding %s" % r.word.encode("UTF-8"))
                yield r.word.encode("UTF-8")
        if self.data_storage == "KingCobra":
            for i in self.inputfile:
                print("StreamAbsGen(file storage): iterator yielding %s" % i)
                yield i
        if self.data_storage == "hbase":
            for key, value in self.hbase_table.scan():
                # The original formatted an undefined name ``i`` here.
                print("StreamAbsGen(HBase storage): iterator yielding %s" % value['cf:alphanum'])
                yield value['cf:alphanum']
        if self.data_storage == "AsFer_Encoded_Strings":
            for i in self.inputfile:
                print("StreamAbsGen(file storage): iterator yielding %s" % i)
                yield i
        if self.data_storage == "file":
            for i in self.inputfile:
                for word in i.split():
                    print("StreamAbsGen(file storage): iterator yielding %s" % word.strip())
                    yield word.strip()
        if self.data_storage == "hive":
            # Fetch table results
            for i in self.hive_cur.fetch():
                print("StreamAbsGen(Hive storage): iterator yielding %s" % i[0])
                yield i[0]
        if self.data_storage == "cassandra":
            for row in self.resultrows:
                print("StreamAbsGen(Cassandra storage): iterator yielding %s" % row.alphanum)
                yield row.alphanum
        if self.data_storage == "USBWWAN_stream":
            for i in self.inputfile:
                yield i
        if self.data_storage == "Kafka":
            # try/finally so the consumer is closed when the generator is
            # closed or fails; the close() after ``while True`` was
            # unreachable before.
            try:
                while True:
                    print("Polling Kafka topic to receive message ...")
                    msg = self.c.poll()
                    if msg is None:
                        continue
                    if not msg.error() and msg.value():
                        # The payload is bytes: decode it for display
                        # (``encode`` does not exist on bytes in Python 3).
                        print('Received message: %s' % msg.value().decode("utf-8"))
                        yield msg
                    else:
                        print(msg.error())
            finally:
                self.c.close()
        if self.data_storage == "Socket_Streaming":
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                s.connect((self.streaming_host, self.streaming_port))
                print("socket_streaming_client(): host =  %s ; post= %s" % (self.streaming_host, self.streaming_port))
                while True:
                    data = s.recv(100)
                    # recv() returns an empty value (never None) once the
                    # peer has closed; stop instead of yielding it forever.
                    if not data:
                        break
                    yield data
            finally:
                s.close()
        if self.data_storage == "OperatingSystem" and self.data_source == "SchedulerRunQueue":
            from DeepLearning_SchedulerAnalytics import sched_debug_runqueue
            while True:
                schedrunqueue = sched_debug_runqueue()
                yield schedrunqueue
        if self.data_storage == "TextHistogramPartition":
            self.sc = SparkContext()
            for ps in self.partition_stream:
                partition_stream_DS = self.sc.parallelize(ps.readlines()).flatMap(lambda line: line.split(" ")).map(lambda word: (word, [1])).reduceByKey(lambda v1, v2: v1 + v2).groupByKey().mapValues(list)
                partition = partition_stream_DS.collect()
                print("partition: %s" % (partition,))
                # Guard against an empty partition and compare by value
                # (``is not ''`` tested object identity).
                if partition and partition[0] != '':
                    print("StreamAbsGen(Spark Parquet): iterator yielding labelled partition: %s" % (partition,))
                    yield partition
        if self.data_storage == "DictionaryHistogramPartition":
            dict_stream = ast.literal_eval(self.partition_stream.read())
            for d in dict_stream:
                yield d
class KafkaConsumer:
    """Base Kafka consumer that polls one topic and forwards messages to a handler.

    The consumer is created and subscribed in ``__init__``. ``consume`` then
    repeatedly drains available messages through ``_consume`` and sleeps
    between drain passes.
    """

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Create the underlying consumer and subscribe it to the topic.

        Args:
            topic_name_pattern: Topic name (or pattern) passed to ``subscribe``.
            message_handler: Callable invoked with every successfully polled
                message.
            is_avro: When exactly ``True`` an ``AvroConsumer`` is created
                (with a schema registry URL); otherwise a plain ``Consumer``.
            offset_earliest: When truthy, ``auto.offset.reset`` is set to
                "earliest" and assigned partitions are rewound to the
                beginning in ``on_assign``.
            sleep_secs: Seconds to sleep between drain passes in ``consume``.
            consume_timeout: Timeout in seconds handed to ``poll``.
        """
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "bootstrap.servers": BROKER_URL,
            "group.id": "0-PublicTransport",
            "auto.offset.reset": "earliest" if offset_earliest else "latest"
        }

        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place.

        Args:
            consumer: The consumer receiving the assignment; ``assign`` is
                called on it with the (possibly rewound) partitions.
            partitions: The partitions being assigned.
        """
        if self.offset_earliest:
            # TopicPartition.offset is an integer field, so the rewind must use
            # the logical OFFSET_BEGINNING constant rather than a string such
            # as "earliest".
            from confluent_kafka import OFFSET_BEGINNING

            for partition in partitions:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic.

        Runs forever: drains messages until ``_consume`` reports none left,
        then sleeps for ``sleep_secs`` before the next drain pass.
        """
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Poll for a single message.

        Returns:
            1 if a message was received and handled, 0 otherwise (poll
            timeout, consumer error, or a ``KeyError`` raised by the handler).
        """
        message = self.consumer.poll(self.consume_timeout)
        if message is None:
            # poll() timed out without delivering anything.
            print("No message recieved by consumer")
        elif message.error() is not None:
            print(f"error from consumer {message.error()}")
        else:
            try:
                print(f"message recieved: {message.value()}")
                self.message_handler(message)
                return 1
            except KeyError as e:
                print(f"Failed to unpack message {e}")

        return 0

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured.

        Exercises construction, subscribe/unsubscribe, poll/consume argument
        validation, assignment, pause/resume, watermark lookups, commits,
        position/committed queries and topic listing, then closes the
        consumer. """

    # A Consumer built without a configuration dict must be rejected.
    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    # num_messages outside the accepted range must be rejected.
    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        # e.args is a tuple: index it (not call it) so that a failing assertion
        # reports the underlying error instead of raising TypeError.
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()
Exemplo n.º 49
0
    def run(self):
        """Consume JSON log messages from Kafka and store them as Parquet batches.

        Messages whose "@timestamp" lies in ``[self.dt_start, self.dt_end)``
        are collected as ``(host, timestamp, message)`` rows. Every 100000
        rows a Parquet file is written; rows left over when the first message
        at or past ``self.dt_end`` is seen are written as a final file.

        Raises:
            KafkaException: If a polled message carries a consumer error.
        """
        conf = {
            'bootstrap.servers': self.kafka_broker,
            'group.id': self.group,
            'session.timeout.ms': 6000,
            'auto.offset.reset': 'earliest'
        }

        logger = logging.getLogger('consumer_' + str(self.pid))
        logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
        logger.addHandler(handler)
        logger.info("Config: " + str(conf))
        logger.info("Creating Consumer")
        c = Consumer(conf, logger=logger)

        def print_assignment(consumer, partitions):
            print('Assignment:', partitions)

        def write_batch(rows, batch_id):
            """Write one batch of rows to its own snappy-compressed Parquet file."""
            pq_fn = "./" + self.topic + "_" + str(
                self.pid) + "_" + str(batch_id) + ".parquet"
            df = pd.DataFrame(rows, columns=["host", "timestamp", "message"])
            logger.info("Writing Parq file: " + pq_fn)
            df.to_parquet(pq_fn, compression='snappy')
            logger.info("Written count: " + str(len(rows)))

        l_val = []
        msg_count = 0
        pq_id = 0
        pq_limit = 100000

        try:
            # subscribe() requires a list of topic names; self.topic is a
            # single name (it is also used to build the output file names).
            topics = self.topic if isinstance(self.topic, list) else [self.topic]
            c.subscribe(topics, on_assign=print_assignment)

            # Read messages from Kafka
            while True:
                msg = c.poll(timeout=1.0)

                if msg is None:
                    continue

                if msg.error():
                    raise KafkaException(msg.error())

                # Proper message: the JSON document is the message payload,
                # not the Message object itself.
                msg_dict = json.loads(msg.value())
                timestamp = msg_dict["@timestamp"]
                dt_ts = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
                if dt_ts >= self.dt_end:
                    break
                elif dt_ts < self.dt_start:
                    continue

                host = msg_dict["host"]["name"]
                kmsg = msg_dict["message"]
                l_val.append((host, dt_ts, kmsg))
                if len(l_val) == pq_limit:
                    pq_id += 1
                    write_batch(l_val, pq_id)
                    msg_count += len(l_val)
                    logger.info("Processed count: " + str(msg_count))
                    l_val = []

            # Flush the final, partially filled batch.
            if l_val:
                pq_id += 1
                write_batch(l_val, pq_id)
                msg_count += len(l_val)
                logger.info("Total Processed count: " + str(msg_count))
        finally:
            # Always release the consumer, including when an error propagates.
            c.close()
Exemplo n.º 50
0
class QuerySubscriptionConsumer(object):
    """
    Kafka consumer for query subscription update messages.

    Every message carries a subscription id plus the latest values for the subscribed
    query. Once validated, the payload is handed to the callback registered for the
    subscription's type.
    """

    # Maps a results topic to the dataset handed to `_delete_from_snuba` when an update
    # arrives for a subscription that does not exist.
    topic_to_dataset = {
        settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS: QueryDatasets.EVENTS,
        settings.KAFKA_TRANSACTIONS_SUBSCRIPTIONS_RESULTS: QueryDatasets.TRANSACTIONS,
    }

    def __init__(
        self, group_id, topic=None, commit_batch_size=100, initial_offset_reset="earliest"
    ):
        """
        Store the consumer settings; the Kafka consumer itself is created in `run`.

        :param group_id: Value used for the consumer's "group.id".
        :param topic: Topic to subscribe to; a falsy value selects
            `settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS`.
        :param commit_batch_size: Offsets are committed each time the number of handled
            messages is a multiple of this value.
        :param initial_offset_reset: Value used for "auto.offset.reset".
        """
        self.group_id = group_id
        self.topic = topic if topic else settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS
        kafka_cluster = settings.KAFKA_TOPICS[self.topic]["cluster"]
        self.bootstrap_servers = settings.KAFKA_CLUSTERS[kafka_cluster]["bootstrap.servers"]
        self.commit_batch_size = commit_batch_size
        self.initial_offset_reset = initial_offset_reset
        # partition number -> next offset to commit (None when not known yet)
        self.offsets = {}
        self.consumer = None

    def run(self):
        """
        Poll the topic until interrupted, handling each message and committing offsets
        every `commit_batch_size` handled messages. A `KeyboardInterrupt` ends the loop
        and leads to `shutdown`; a message carrying an error raises `KafkaException`.
        """
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()

        def on_assign(consumer, partitions):
            # Remember where each assigned partition starts; an invalid offset is kept
            # as None so that `commit_offsets` skips it.
            for assigned in partitions:
                self.offsets[assigned.partition] = (
                    None if assigned.offset == OFFSET_INVALID else assigned.offset
                )
            logger.info("query-subscription-consumer.on_assign", extra={"offsets": self.offsets})

        def on_revoke(consumer, partitions):
            # Commit progress for the partitions being taken away, then forget them.
            revoked = [revoked_partition.partition for revoked_partition in partitions]
            self.commit_offsets(revoked)
            for number in revoked:
                self.offsets.pop(number, None)
            logger.info("query-subscription-consumer.on_revoke", extra={"offsets": self.offsets})

        offset_reset = self.initial_offset_reset
        self.consumer = Consumer(
            {
                "bootstrap.servers": self.bootstrap_servers,
                "group.id": self.group_id,
                "session.timeout.ms": 6000,
                "auto.offset.reset": offset_reset,
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "false",
                "enable.partition.eof": "false",
                "default.topic.config": {"auto.offset.reset": offset_reset},
            }
        )
        self.consumer.subscribe([self.topic], on_assign=on_assign, on_revoke=on_revoke)

        try:
            handled = 0
            while True:
                message = self.consumer.poll(0.1)
                if message is None:
                    continue

                error = message.error()
                if error is not None:
                    raise KafkaException(error)

                handled += 1

                span = Span(
                    op="handle_message",
                    transaction="query_subscription_consumer_process_message",
                    sampled=True,
                )
                with sentry_sdk.start_span(span), metrics.timer(
                    "snuba_query_subscriber.handle_message"
                ):
                    self.handle_message(message)

                # Track latest completed message here, for use in `shutdown` handler.
                self.offsets[message.partition()] = message.offset() + 1

                if handled % self.commit_batch_size == 0:
                    logger.debug("Committing offsets")
                    self.commit_offsets()
        except KeyboardInterrupt:
            pass

        self.shutdown()

    def commit_offsets(self, partitions=None):
        """
        Commit the tracked offsets to Kafka.

        :param partitions: Partition numbers to commit; `None` commits every tracked
            partition. Partitions without a known offset are skipped.
        """
        logger.info(
            "query-subscription-consumer.commit_offsets",
            extra={"offsets": self.offsets, "partitions": partitions},
        )

        if not (self.offsets and self.consumer):
            return

        if partitions is None:
            partitions = self.offsets.keys()

        tracked = ((number, self.offsets.get(number)) for number in partitions)
        to_commit = [
            TopicPartition(self.topic, number, offset)
            for number, offset in tracked
            if offset is not None
        ]

        self.consumer.commit(offsets=to_commit)

    def shutdown(self):
        """Commit the tracked offsets and close the Kafka consumer."""
        logger.debug("Committing offsets and closing consumer")
        self.commit_offsets()
        self.consumer.close()

    def handle_message(self, message):
        """
        Parse a Kafka message and, when it is valid, pass its payload to the callback
        registered for the subscription's type. Unparseable messages, inactive or
        missing subscriptions and unregistered subscription types are only recorded
        via metrics/logs; the method then returns without invoking a callback.
        :param message:
        :return:
        """

        def message_details():
            # Position and raw value of the message, attached to log records.
            return {
                "offset": message.offset(),
                "partition": message.partition(),
                "value": message.value(),
            }

        with sentry_sdk.push_scope() as scope:
            try:
                with metrics.timer("snuba_query_subscriber.parse_message_value"):
                    contents = self.parse_message_value(message.value())
            except InvalidMessageError:
                # Badly formatted message: log it and move on.
                logger.exception(
                    "Subscription update could not be parsed", extra=message_details()
                )
                return

            subscription_id = contents["subscription_id"]
            scope.set_tag("query_subscription_id", subscription_id)

            try:
                with metrics.timer("snuba_query_subscriber.fetch_subscription"):
                    subscription = QuerySubscription.objects.get_from_cache(
                        subscription_id=subscription_id
                    )
                    if subscription.status != QuerySubscription.Status.ACTIVE.value:
                        metrics.incr("snuba_query_subscriber.subscription_inactive")
                        return
            except QuerySubscription.DoesNotExist:
                metrics.incr("snuba_query_subscriber.subscription_doesnt_exist")
                logger.error(
                    "Received subscription update, but subscription does not exist",
                    extra=message_details(),
                )
                # Best effort removal of the orphaned subscription on the snuba side.
                try:
                    _delete_from_snuba(self.topic_to_dataset[message.topic()], subscription_id)
                except Exception:
                    logger.exception("Failed to delete unused subscription from snuba.")
                return

            if subscription.type not in subscriber_registry:
                metrics.incr("snuba_query_subscriber.subscription_type_not_registered")
                logger.error(
                    "Received subscription update, but no subscription handler registered",
                    extra=message_details(),
                )
                return

            log_extra = {
                "timestamp": contents["timestamp"],
                "query_subscription_id": subscription_id,
                "project_id": subscription.project_id,
                "subscription_dataset": subscription.snuba_query.dataset,
                "subscription_query": subscription.snuba_query.query,
                "subscription_aggregation": subscription.snuba_query.aggregate,
                "subscription_time_window": subscription.snuba_query.time_window,
                "subscription_resolution": subscription.snuba_query.resolution,
            }
            log_extra.update(message_details())
            logger.info("query-subscription-consumer.handle_message", extra=log_extra)

            callback = subscriber_registry[subscription.type]
            with sentry_sdk.start_span(op="process_message") as span:
                with metrics.timer(
                    "snuba_query_subscriber.callback.duration", instance=subscription.type
                ):
                    span.set_data("payload", contents)
                    callback(contents, subscription)

    def parse_message_value(self, value):
        """
        Decode the value received from Kafka and check it against the wrapper schema
        and the payload schema of the declared version.
        :param value: A json formatted string
        :return: A dict with the parsed message
        """
        with metrics.timer("snuba_query_subscriber.parse_message_value.json_parse"):
            envelope = loads(value)

        with metrics.timer("snuba_query_subscriber.parse_message_value.json_validate_wrapper"):
            try:
                jsonschema.validate(envelope, SUBSCRIPTION_WRAPPER_SCHEMA)
            except jsonschema.ValidationError:
                metrics.incr("snuba_query_subscriber.message_wrapper_invalid")
                raise InvalidSchemaError("Message wrapper does not match schema")

        version = envelope["version"]
        if version not in SUBSCRIPTION_PAYLOAD_VERSIONS:
            metrics.incr("snuba_query_subscriber.message_wrapper_invalid_version")
            raise InvalidMessageError("Version specified in wrapper has no schema")

        payload = envelope["payload"]
        with metrics.timer("snuba_query_subscriber.parse_message_value.json_validate_payload"):
            try:
                jsonschema.validate(payload, SUBSCRIPTION_PAYLOAD_VERSIONS[version])
            except jsonschema.ValidationError:
                metrics.incr("snuba_query_subscriber.message_payload_invalid")
                raise InvalidSchemaError("Message payload does not match schema")

        # XXX: The raw dict is returned as-is, so payload changes will break consumers
        # of it. Converting the payload into a class would be better; until there is
        # time for that refactor, "values" is filled in from "result" when missing.
        if "values" not in payload:
            payload["values"] = payload.get("result")

        parsed_timestamp = parse_date(payload["timestamp"])
        payload["timestamp"] = parsed_timestamp.replace(tzinfo=pytz.utc)
        return payload
Exemplo n.º 51
0
class QuerySubscriptionConsumer:
    """
    Kafka consumer for query subscription update messages.

    Every message carries a subscription id plus the latest values for the subscribed
    query. Once validated, the payload is handed to the callback registered for the
    subscription's type.
    """

    # Maps a results topic to the dataset handed to `_delete_from_snuba` when an update
    # arrives for a subscription that does not exist.
    topic_to_dataset: Dict[str, QueryDatasets] = {
        settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS: QueryDatasets.EVENTS,
        settings.KAFKA_TRANSACTIONS_SUBSCRIPTIONS_RESULTS: QueryDatasets.TRANSACTIONS,
    }

    def __init__(
        self,
        group_id: str,
        topic: Optional[str] = None,
        commit_batch_size: int = 100,
        initial_offset_reset: str = "earliest",
        force_offset_reset: Optional[str] = None,
    ):
        """
        Store the consumer settings; the Kafka consumer itself is created in `run`.

        :param group_id: Value used for the consumer's "group.id".
        :param topic: Topic to subscribe to; a falsy value selects
            `settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS`.
        :param commit_batch_size: Offsets are committed each time the number of handled
            messages is a multiple of this value.
        :param initial_offset_reset: Value used for "auto.offset.reset".
        :param force_offset_reset: Optional name ("smallest"/"earliest"/"beginning" or
            "largest"/"latest"/"end") selecting a watermark that assigned partitions
            are moved to; any other value disables the forced reset.
        """
        self.group_id = group_id
        if not topic:
            # TODO(typing): Need a way to get the actual value of settings to avoid this
            topic = cast(str, settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS)

        self.topic = topic
        cluster_name: str = settings.KAFKA_TOPICS[topic]["cluster"]
        self.commit_batch_size = commit_batch_size
        self.initial_offset_reset = initial_offset_reset
        # partition number -> next offset to commit (None when not known yet)
        self.offsets: Dict[int, Optional[int]] = {}
        self.consumer: Consumer = None

        consumer_overrides = {
            "group.id": self.group_id,
            "session.timeout.ms": 6000,
            "auto.offset.reset": self.initial_offset_reset,
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "false",
            "default.topic.config": {"auto.offset.reset": self.initial_offset_reset},
        }
        self.cluster_options = kafka_config.get_kafka_consumer_cluster_options(
            cluster_name, consumer_overrides
        )
        self.admin_cluster_options = kafka_config.get_kafka_admin_cluster_options(
            cluster_name, {"allow.auto.create.topics": "true"}
        )
        self.resolve_partition_force_offset = self.offset_reset_name_to_func(force_offset_reset)
        self.__shutdown_requested = False

    def offset_reset_name_to_func(
        self, offset_reset: Optional[str]
    ) -> Optional[Callable[[TopicPartition], TopicPartition]]:
        """
        Translate an offset reset name into the matching partition resolver.

        :param offset_reset: Offset reset name, or `None`.
        :return: The resolver for the earliest or latest watermark, or `None` when the
            name is not one of the recognised aliases.
        """
        if offset_reset in {"smallest", "earliest", "beginning"}:
            return self.resolve_partition_offset_earliest
        if offset_reset in {"largest", "latest", "end"}:
            return self.resolve_partition_offset_latest
        return None

    def resolve_partition_offset_earliest(self, partition: TopicPartition) -> TopicPartition:
        """Return a copy of `partition` positioned at its low watermark."""
        low_watermark, _high_watermark = self.consumer.get_watermark_offsets(partition)
        return TopicPartition(partition.topic, partition.partition, low_watermark)

    def resolve_partition_offset_latest(self, partition: TopicPartition) -> TopicPartition:
        """Return a copy of `partition` positioned at its high watermark."""
        _low_watermark, high_watermark = self.consumer.get_watermark_offsets(partition)
        return TopicPartition(partition.topic, partition.partition, high_watermark)

    def run(self) -> None:
        """
        Poll the topic until `shutdown` is requested, handling each message and
        committing offsets every `commit_batch_size` handled messages. When the loop
        ends the offsets are committed and the consumer is closed. A message carrying
        an error raises `KafkaException`.
        """
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()

        def on_assign(consumer: Consumer, partitions: List[TopicPartition]) -> None:
            # With a forced reset every partition is replaced by its resolved
            # counterpart and the consumer is re-assigned to those afterwards.
            forced: List[TopicPartition] = []
            for assigned in partitions:
                if self.resolve_partition_force_offset:
                    assigned = self.resolve_partition_force_offset(assigned)
                    forced.append(assigned)

                # An invalid offset is kept as None so that `commit_offsets` skips it.
                self.offsets[assigned.partition] = (
                    None if assigned.offset == OFFSET_INVALID else assigned.offset
                )
            if forced:
                self.consumer.assign(forced)
            logger.info(
                "query-subscription-consumer.on_assign",
                extra={
                    "offsets": str(self.offsets),
                    "partitions": str(partitions),
                },
            )

        def on_revoke(consumer: Consumer, partitions: List[TopicPartition]) -> None:
            # Commit progress for the partitions being taken away, then forget them.
            revoked = [revoked_partition.partition for revoked_partition in partitions]
            self.commit_offsets(revoked)
            for number in revoked:
                self.offsets.pop(number, None)
            logger.info(
                "query-subscription-consumer.on_revoke",
                extra={
                    "offsets": str(self.offsets),
                    "partitions": str(partitions),
                },
            )

        self.consumer = Consumer(self.cluster_options)
        self.__shutdown_requested = False

        if settings.KAFKA_CONSUMER_AUTO_CREATE_TOPICS:
            # This is required for confluent-kafka>=1.5.0, otherwise the topics will
            # not be automatically created.
            wait_for_topics(AdminClient(self.admin_cluster_options), [self.topic])

        self.consumer.subscribe([self.topic], on_assign=on_assign, on_revoke=on_revoke)

        handled = 0
        while not self.__shutdown_requested:
            message = self.consumer.poll(0.1)
            if message is None:
                continue

            error = message.error()
            if error is not None:
                raise KafkaException(error)

            handled += 1

            with sentry_sdk.start_transaction(
                op="handle_message",
                name="query_subscription_consumer_process_message",
                sampled=True,
            ):
                with metrics.timer("snuba_query_subscriber.handle_message"):
                    self.handle_message(message)

            # Track latest completed message here, for use in `shutdown` handler.
            self.offsets[message.partition()] = message.offset() + 1

            if handled % self.commit_batch_size == 0:
                logger.debug("Committing offsets")
                self.commit_offsets()

        logger.debug("Committing offsets and closing consumer")
        self.commit_offsets()
        self.consumer.close()

    def commit_offsets(self, partitions: Optional[Iterable[int]] = None) -> None:
        """
        Commit the tracked offsets to Kafka.

        :param partitions: Partition numbers to commit; `None` commits every tracked
            partition. Partitions without a known offset are skipped.
        """
        logger.info(
            "query-subscription-consumer.commit_offsets",
            extra={"offsets": str(self.offsets), "partitions": str(partitions)},
        )

        if not (self.offsets and self.consumer):
            return

        if partitions is None:
            partitions = self.offsets.keys()

        tracked = ((number, self.offsets.get(number)) for number in partitions)
        to_commit = [
            TopicPartition(self.topic, number, offset)
            for number, offset in tracked
            if offset is not None
        ]

        self.consumer.commit(offsets=to_commit)

    def shutdown(self) -> None:
        """Ask the polling loop in `run` to stop after its current iteration."""
        self.__shutdown_requested = True

    def handle_message(self, message: Message) -> None:
        """
        Parse a Kafka message and, when it is valid, pass its payload to the callback
        registered for the subscription's type. Unparseable messages, inactive or
        missing subscriptions and unregistered subscription types are only recorded
        via metrics/logs; the method then returns without invoking a callback.
        :param message:
        :return:
        """

        def message_details() -> Dict[str, Any]:
            # Position and raw value of the message, attached to log records.
            return {
                "offset": message.offset(),
                "partition": message.partition(),
                "value": message.value(),
            }

        with sentry_sdk.push_scope() as scope:
            try:
                with metrics.timer("snuba_query_subscriber.parse_message_value"):
                    contents = self.parse_message_value(message.value())
            except InvalidMessageError:
                # Badly formatted message: log it and move on.
                logger.exception(
                    "Subscription update could not be parsed", extra=message_details()
                )
                return

            subscription_id = contents["subscription_id"]
            scope.set_tag("query_subscription_id", subscription_id)

            try:
                with metrics.timer("snuba_query_subscriber.fetch_subscription"):
                    subscription: QuerySubscription = QuerySubscription.objects.get_from_cache(
                        subscription_id=subscription_id
                    )
                    if subscription.status != QuerySubscription.Status.ACTIVE.value:
                        metrics.incr("snuba_query_subscriber.subscription_inactive")
                        return
            except QuerySubscription.DoesNotExist:
                metrics.incr("snuba_query_subscriber.subscription_doesnt_exist")
                logger.error(
                    "Received subscription update, but subscription does not exist",
                    extra=message_details(),
                )
                # Best effort removal of the orphaned subscription on the snuba side.
                try:
                    _delete_from_snuba(self.topic_to_dataset[message.topic()], subscription_id)
                except Exception:
                    logger.exception("Failed to delete unused subscription from snuba.")
                return

            if subscription.type not in subscriber_registry:
                metrics.incr("snuba_query_subscriber.subscription_type_not_registered")
                logger.error(
                    "Received subscription update, but no subscription handler registered",
                    extra=message_details(),
                )
                return

            sentry_sdk.set_tag("project_id", subscription.project_id)
            sentry_sdk.set_tag("query_subscription_id", subscription_id)

            callback = subscriber_registry[subscription.type]
            with sentry_sdk.start_span(op="process_message") as span:
                with metrics.timer(
                    "snuba_query_subscriber.callback.duration", instance=subscription.type
                ):
                    span.set_data("payload", contents)
                    snuba_query = subscription.snuba_query
                    span.set_data("subscription_dataset", snuba_query.dataset)
                    span.set_data("subscription_query", snuba_query.query)
                    span.set_data("subscription_aggregation", snuba_query.aggregate)
                    span.set_data("subscription_time_window", snuba_query.time_window)
                    span.set_data("subscription_resolution", snuba_query.resolution)
                    span.set_data("message_offset", message.offset())
                    span.set_data("message_partition", message.partition())
                    span.set_data("message_value", message.value())

                    callback(contents, subscription)

    def parse_message_value(self, value: str) -> Dict[str, Any]:
        """
        Decode the value received from Kafka and check it against the wrapper schema
        and the payload schema of the declared version.
        :param value: A json formatted string
        :return: A dict with the parsed message
        """
        with metrics.timer("snuba_query_subscriber.parse_message_value.json_parse"):
            envelope: Dict[str, Any] = json.loads(value)

        with metrics.timer("snuba_query_subscriber.parse_message_value.json_validate_wrapper"):
            try:
                jsonschema.validate(envelope, SUBSCRIPTION_WRAPPER_SCHEMA)
            except jsonschema.ValidationError:
                metrics.incr("snuba_query_subscriber.message_wrapper_invalid")
                raise InvalidSchemaError("Message wrapper does not match schema")

        version: int = envelope["version"]
        if version not in SUBSCRIPTION_PAYLOAD_VERSIONS:
            metrics.incr("snuba_query_subscriber.message_wrapper_invalid_version")
            raise InvalidMessageError("Version specified in wrapper has no schema")

        payload: Dict[str, Any] = envelope["payload"]
        with metrics.timer("snuba_query_subscriber.parse_message_value.json_validate_payload"):
            try:
                jsonschema.validate(payload, SUBSCRIPTION_PAYLOAD_VERSIONS[version])
            except jsonschema.ValidationError:
                metrics.incr("snuba_query_subscriber.message_payload_invalid")
                raise InvalidSchemaError("Message payload does not match schema")

        # XXX: The raw dict is returned as-is, so payload changes will break consumers
        # of it. Converting the payload into a class would be better; until there is
        # time for that refactor, "values" is filled in from "result" when missing.
        if "values" not in payload:
            payload["values"] = payload.get("result")

        parsed_timestamp = parse_date(payload["timestamp"])
        payload["timestamp"] = parsed_timestamp.replace(tzinfo=pytz.utc)
        return payload
Exemplo n.º 52
0
    # Create the consumer from the prepared configuration and logger.
    # Hint: try debug='fetch' to generate some log messages
    c = Consumer(conf, logger=logger)

    def print_assignment(consumer, partitions):
        """Print the partitions handed to the consumer on assignment."""
        print('Assignment:', partitions)

    # Subscribe to topics
    c.subscribe(topics, on_assign=print_assignment)

    # Read messages from Kafka, print to stdout
    try:
        while True:
            msg = c.poll(timeout=1.0)
            # Nothing arrived within the poll timeout: poll again.
            if msg is None:
                continue
            # A message carrying an error aborts the loop with KafkaException.
            if msg.error():
                raise KafkaException(msg.error())
            else:
                # Proper message: metadata goes to stderr, the payload to stdout.
                sys.stderr.write('%% %s [%d] at offset %d with key %s:\n' %
                                 (msg.topic(), msg.partition(), msg.offset(),
                                  str(msg.key())))
                print(msg.value())

    except KeyboardInterrupt:
        sys.stderr.write('%% Aborted by user\n')

    finally:
        # Close down consumer to commit final offsets.
        c.close()
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=True,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use.

        Args:
            topic_name_pattern: Topic name (or pattern) to subscribe to. It is
                also used verbatim as the consumer ``group.id``.
            message_handler: Callable invoked with every successfully polled
                message.
            is_avro: When exactly ``True`` an ``AvroConsumer`` (with the local
                schema registry URL) is created, otherwise a plain
                ``Consumer``.
            offset_earliest: When truthy, every partition handed to
                ``on_assign`` is rewound to the beginning before assignment.
            sleep_secs: Seconds ``consume`` sleeps once no more messages are
                immediately available.
            consume_timeout: Seconds a single poll in ``_consume`` may block.
        """
        logger.info("intialising kafka consumet topic name handler : %s",
                    topic_name_pattern)
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        # Broker endpoints are fixed to the three local listeners.
        self.broker_properties = {
            "bootstrap.servers":
            "PLAINTEXT://localhost:9092,PLAINTEXT://localhost:9093,PLAINTEXT://localhost:9094",
            "group.id": f"{self.topic_name_pattern}",
            "auto.offset.reset": "earliest",
        }

        # Create the Consumer, using the appropriate type.
        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place.

        Args:
            consumer: The consumer the partitions are being assigned to.
            partitions: Partition objects about to be assigned; their
                ``offset`` attribute is overwritten in place when
                ``offset_earliest`` is set.
        """
        # If the topic is configured to use `offset_earliest` set the partition
        # offset to the beginning or earliest.
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic.

        Drains messages back-to-back while ``_consume`` keeps reporting one,
        then yields to the event loop for ``sleep_secs`` seconds. Never returns.
        """
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise.

        A poll that times out, a message carrying an error, and any exception
        raised while polling or handling all count as "no message" and yield 0,
        so the ``num_results > 0`` comparison in ``consume`` always receives an
        integer.
        """
        try:
            # Honour the configured timeout so the caller's event loop is not
            # blocked for longer than requested.
            message = self.consumer.poll(self.consume_timeout)
            if message is None:
                logger.info("no message received by consumer")
                return 0
            if message.error() is not None:
                logger.error("error from consumer %s", message.error())
                return 0

            logger.info("consumed message")
            logger.info("key: %s", message.key())
            logger.info("value: %s", message.value())
            self.message_handler(message)
            return 1
        except Exception as e:
            # Keep the consume loop alive: record the traceback and report
            # that nothing was consumed instead of implicitly returning None.
            logger.exception("an excpetion occured : %s ", e)
            return 0

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
Exemplo n.º 54
0
class SynchronizedConsumer(object):
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to both
    commit log topic, as well as the topic(s) that we are actually interested
    in consuming messages from. The messages received from the commit log topic
    control whether or not consumption from partitions belonging to the main
    topic is paused, resumed, or allowed to continue in its current state
    without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic for
    that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)
    """
    # Maps the accepted ``initial_offset_reset`` names to the function used to
    # pick a starting offset for a partition that has no committed offset.
    initial_offset_reset_strategies = {
        'earliest': get_earliest_offset,
        'latest': get_latest_offset,
    }

    def __init__(self, bootstrap_servers, consumer_group, commit_log_topic,
                 synchronize_commit_group, initial_offset_reset='latest', on_commit=None):
        """
        Start the commit log consumer and create the main consumer.

        :param bootstrap_servers: value used for ``bootstrap.servers`` of both
            the main consumer and the commit log consumer.
        :param consumer_group: ``group.id`` of the main consumer; also used as
            the prefix of the commit log consumer's group name.
        :param commit_log_topic: topic passed to the commit log consumer.
        :param synchronize_commit_group: the consumer group being followed.
        :param initial_offset_reset: key into
            ``initial_offset_reset_strategies``; an unknown key raises
            ``KeyError``.
        :param on_commit: optional callable invoked with ``(error, partitions)``
            from the main consumer's commit callback.
        """
        self.bootstrap_servers = bootstrap_servers
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset]

        # The state manager reports every partition state transition back to
        # ``__on_partition_state_change``.
        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        # NOTE(review): the commit log consumer is started before
        # ``self.__consumer`` exists below; the state-change callback returns
        # early while no local offset is known, which appears to be what makes
        # this ordering safe -- confirm.
        self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer()

        # (topic, partition) -> next offset to consume, maintained by the
        # assignment/revocation callbacks and by ``poll``.
        self.__positions = {}

        def commit_callback(error, partitions):
            """Forward commit results to ``on_commit`` when one was supplied."""
            if on_commit is not None:
                return on_commit(error, partitions)

        # Auto commit is disabled, and ``auto.offset.reset`` is ``error`` so
        # that a missing offset is never resolved implicitly (the assignment
        # callback in ``subscribe`` chooses the starting offset instead).
        consumer_configuration = {
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.consumer_group,
            'enable.auto.commit': 'false',
            'enable.auto.offset.store': 'true',
            'enable.partition.eof': 'false',
            'default.topic.config': {
                'auto.offset.reset': 'error',
            },
            'on_commit': commit_callback,
        }

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.

        ``run_commit_log_consumer`` is handed to ``execute`` with a consumer
        group name made unique by a fresh ``uuid1`` suffix. This method then
        waits on the start event, for at most ``timeout`` seconds when one is
        given, before returning.

        :return: a ``(result, stop_request_event)`` tuple: the object returned
            by ``execute`` and the event that ``close`` sets to request a stop.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                bootstrap_servers=self.bootstrap_servers,
                consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex),
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ),
        )
        # The return value of ``wait`` is ignored, so a timeout here is not
        # reported to the caller.
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        """
        Raise if the commit log consumer is no longer running.

        When it has stopped, its result is requested first; only
        ``TimeoutError`` is suppressed there, so any other exception raised by
        ``result`` propagates in place of the generic ``Exception`` below.
        """
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception('Commit log consumer unexpectedly exit!')

    def __on_partition_state_change(
            self, topic, partition, previous_state_and_offsets, current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.

        Pauses the partition on the main consumer for the ``UNKNOWN``,
        ``SYNCHRONIZED`` and ``REMOTE_BEHIND`` states and resumes it for
        ``LOCAL_BEHIND``. Any other state raises ``NotImplementedError``.
        Nothing is done while the local offset is ``None``.
        """
        logger.debug('State change for %r: %r to %r', (topic, partition),
                     previous_state_and_offsets, current_state_and_offsets)

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED,
                             SynchronizedPartitionState.REMOTE_BEHIND):
            self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError('Unexpected partition state: %s' % (current_state,))

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.

        :param topics: passed unchanged to the main consumer's ``subscribe``.
        :param on_assign: optional callable invoked as
            ``on_assign(self, partitions)`` after the assignment is applied;
            the partitions are rebuilt without offsets.
        :param on_revoke: optional callable invoked as
            ``on_revoke(self, assignment)`` after positions are dropped.

        Raises if the commit log consumer is no longer running.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            """Resolve a starting offset for each assigned partition and assign."""
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            # Start from the positions already tracked locally (``None`` when
            # a partition has not been seen before).
            assignment = {
                (i.topic, i.partition): self.__positions.get((i.topic, i.partition)) for i in assignment
            }

            # For partitions without a tracked position, use the committed
            # offset when there is one (> -1), else the configured strategy.
            for i in self.__consumer.committed([TopicPartition(topic, partition) for (
                    topic, partition), offset in assignment.items() if offset is None]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition)

            self.__consumer.assign([TopicPartition(topic, partition, offset)
                                    for (topic, partition), offset in assignment.items()])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                # The caller's hook receives this wrapper, not the raw consumer.
                on_assign(self, [TopicPartition(topic, partition)
                                 for topic, partition in assignment.keys()])

        def revocation_callback(consumer, assignment):
            """Forget tracked positions for revoked partitions."""
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                # ``pop`` has no default, so an untracked partition raises KeyError.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(
            topics,
            on_assign=assignment_callback,
            on_revoke=revocation_callback)

    def poll(self, timeout):
        """
        Poll the main consumer once.

        :param timeout: passed unchanged to the main consumer's ``poll``.
        :return: ``None`` when nothing was received; otherwise the message.
            Messages carrying an error are returned without any offset
            bookkeeping.

        Raises if the commit log consumer is no longer running.
        """
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        # Validate the message against the partition state, then record the
        # next offset (this message's offset + 1) as the new local position.
        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(), message.offset() + 1)
        self.__positions[(message.topic(), message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        """
        Commit through the main consumer, forwarding all arguments unchanged.

        Raises if the commit log consumer is no longer running.
        """
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        """
        Request the commit log consumer to stop and close the main consumer.

        The running check comes first, so if the commit log consumer has
        already exited this raises before anything is closed. Otherwise the
        commit log consumer's result is awaited even when closing the main
        consumer raises.
        """
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
Exemplo n.º 55
-1
def httpry_logs():
    """Consume the ``httpry_logs`` topic and tally one field per record in Redis.

    Each non-empty payload is split on whitespace. When it has exactly 11
    fields and field 6 is not ``'-'``, that field's score is incremented by one
    in the sorted set ``httpry_domain.<YYYYmmddHHMM>``, and the set is given a
    600-second expiry. Kafka errors other than partition EOF are logged. Any
    exception escaping the loop is logged and the consumer is closed.
    """
    settings = {
        'bootstrap.servers': kafka_hosts,
        'group.id': 'Httpry_logs_%s' % dt,
        'default.topic.config': {'auto.offset.reset': 'latest', 'auto.commit.enable': 'true'},
    }
    consumer = Consumer(settings)
    consumer.subscribe(['httpry_logs'])
    try:
        while True:
            record = consumer.poll()
            if not record:
                continue

            failure = record.error()
            if failure:
                # Reaching the end of a partition is routine and not reported.
                if failure.code() != KafkaError._PARTITION_EOF:
                    logging.error(failure)
                continue

            line = record.value().decode('utf-8').strip()
            try:
                minute_stamp = time.strftime('%Y%m%d%H%M')
                minute_key = 'httpry_domain.%s' % minute_stamp
                if line:
                    fields = line.split()
                    if len(fields) == 11 and fields[6] != '-':
                        RC.zincrby(minute_key, fields[6], 1)
                        RC.expire(minute_key, 600)
            except Exception as exc:
                # A bad record or a Redis failure must not stop consumption.
                logging.error(exc)
    except Exception as exc:
        logging.error(exc)
    finally:
        consumer.close()
Exemplo n.º 56
-1
def analytics_internet2_logs():
    """Consume the ``haproxy_logs`` topic and aggregate traffic statistics in Redis.

    Each payload is split on whitespace; records with fewer than 17 fields are
    ignored. For the remaining records:

    * ``traffic.cli.url_<HH:MM>`` / ``traffic.ser.url_<HH:MM>`` accumulate
      fields 10 / 11 per URL (``http://<topic><path>``),
    * ``traffic.cli.<YYYYmmddHHMM>`` / ``traffic.ser.<YYYYmmddHHMM>``
      accumulate the same values per topic, with a 300-second expiry,
    * when the last ``/``-separated part of field 8 is a non-zero integer it is
      pushed onto ``Rtime_<YYYYmmdd>_<topic>``, and field 5 is added to the set
      ``baihe_uv_<YYYYmmdd>``.

    The url, ``Rtime`` and ``baihe_uv`` keys get a 3600-second expiry. Traffic
    counters are only updated when both traffic fields are all digits.

    A poll that yields no message is skipped rather than dereferenced. Kafka
    errors other than partition EOF are logged. Any exception escaping the loop
    is logged and the consumer is closed.
    """
    consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Internet2_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if msg is None:
                # Nothing was delivered; calling msg.error() on None would
                # raise and terminate the whole consumer loop.
                continue
            if not msg.error():
                line = msg.value().decode('utf-8').strip()
                try:
                    # One clock reading for every key, so the day/minute
                    # suffixes cannot disagree across a minute boundary.
                    now = time.localtime()
                    day = time.strftime('%Y%m%d', now)
                    minute = time.strftime('%Y%m%d%H%M', now)
                    clock = time.strftime('%H:%M', now)
                    tra_ser_minute_key = 'traffic.ser.%s' % minute
                    tra_cli_minute_key = 'traffic.cli.%s' % minute

                    fields = line.split() if line else []
                    if len(fields) < 17:
                        continue

                    traffic_cli = fields[10]
                    traffic_ser = fields[11]
                    topic = str(fields[14]).split('|')[0].replace('{', '').strip()
                    ip = str(fields[5])
                    last_timer = fields[8].split('/')[-1]
                    rtime = int(last_timer) if last_timer.isdigit() else 0
                    uv_key = 'baihe_uv_%s' % day
                    rt_key = 'Rtime_%s_%s' % (day, topic)
                    path = str(fields[16]).split('?')[0]
                    url = 'http://%s%s' % (topic, path)
                    tra_ser_url_minute_key = 'traffic.ser.url_%s' % clock
                    tra_cli_url_minute_key = 'traffic.cli.url_%s' % clock

                    for key in (uv_key, rt_key, tra_ser_url_minute_key, tra_cli_url_minute_key):
                        RC.expire(key, 3600)

                    # Traffic
                    if traffic_ser.isdigit() and traffic_cli.isdigit():
                        RC.zincrby(tra_cli_url_minute_key, url, int(traffic_cli))
                        RC.zincrby(tra_ser_url_minute_key, url, int(traffic_ser))
                        # Real-time traffic
                        RC.zincrby(tra_cli_minute_key, topic, int(traffic_cli))
                        RC.expire(tra_cli_minute_key, 300)
                        RC.zincrby(tra_ser_minute_key, topic, int(traffic_ser))
                        RC.expire(tra_ser_minute_key, 300)

                    if rtime:
                        RC.lpush(rt_key, rtime)
                        RC.sadd(uv_key, ip)
                except Exception as e:
                    # A bad record or a Redis failure must not stop consumption.
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()