def __seek_from_to_offsets(self, partition, start_offset, end_offset, fft):
        self.log.info(
            f'Start : __seek_from_to_offsets({partition}, {start_offset}, {end_offset})'
        )

        consumer = AvroConsumer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.group_id,
            'schema.registry.url': self.schema_registry_url
        })

        topic_partition = TopicPartition(self.topic, partition)
        topic_partition.offset = start_offset
        consumer.assign([topic_partition])

        messages = []

        while True:
            message = consumer.poll(10)
            if message is None or message.error():
                continue

            if fft:
                dasfft = DasFft()
                message.value()['fft'] = dasfft.amplitudes_fft(
                    message.value()['amplitudes'])

            messages.append(message)
            if message.offset() >= end_offset:
                consumer.close()
                self.log.info(
                    f'End : __seek_from_to_offsets({partition}, {start_offset}, {end_offset})'
                )
                return messages

    def __get_message(self, partition, offset, fft):
        self.log.info(f'Start : __get_message({partition},{offset})')

        consumer = AvroConsumer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.group_id,
            'schema.registry.url': self.schema_registry_url
        })

        topic_partition = TopicPartition(self.topic, partition)
        topic_partition.offset = offset
        consumer.assign([topic_partition])

        message = consumer.poll(10)

        consumer.close()

        if fft and message is not None:
            dasfft = DasFft()
            message.value()['fft'] = dasfft.amplitudes_fft(
                message.value()['amplitudes'])

        self.log.info(f'End : __get_message({partition},{offset})')

        return message
Example #3
class AvroConsumerFacade:
    def __init__(self, name, emit_datum, broker, schema_registry_url, topic):
        self.name = name
        self.emit_datum = emit_datum
        self.consumer = AvroConsumer({
            'bootstrap.servers': broker,
            'group.id': name,
            'schema.registry.url': schema_registry_url,
            **get_sr_config_from_environment(),
            **get_kafka_config_from_environment(),
        })

        # Subscribe to topics/partitions, and seek to end. Following that we need
        # to poll until the topics have actually been assigned.
        def on_assign(consumer, partitions):
            for p in partitions:
                p.offset = OFFSET_END
            self.consumer.assign(partitions)

        self.consumer.subscribe([topic], on_assign=on_assign)
        self.consumer.poll(10)

    def consume_one(self, poll_wait=0):
        consumed_message = self.consumer.poll(poll_wait)
        if consumed_message is not None:
            self.emit_datum(Datum(good_count=1))
        else:
            self.emit_datum(Datum(bad_count=1))

    def close(self):
        self.consumer.commit()
        self.consumer.close()
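A minimal wiring sketch for the facade above (not from the original): the broker, registry URL, topic name, and the print-based emit_datum callback are placeholders; in the real project emit_datum receives Datum objects from the surrounding metrics code.

# Hypothetical wiring for AvroConsumerFacade; all endpoints and names are placeholders.
def emit_datum(datum):
    print(datum)  # the real callback would record the good/bad counts

facade = AvroConsumerFacade(
    name='heartbeat-checker',
    emit_datum=emit_datum,
    broker='localhost:9092',
    schema_registry_url='http://localhost:8081',
    topic='heartbeat',
)
facade.consume_one(poll_wait=5)  # emits Datum(good_count=1) or Datum(bad_count=1)
facade.close()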
Example #4
def target_topic_avro_consumer(unittest_config: Config, target_topic: Tuple[str, int]) -> AvroConsumer:
    consumer = AvroConsumer(
        {
            "group.id": "asdf",
            "enable.auto.commit": False,
            "enable.partition.eof": False,
            **unittest_config.create_confluent_config(include_schema_registry=True),
        }
    )
    consumer.assign([TopicPartition(topic=target_topic[0], partition=i, offset=0) for i in range(target_topic[1])])
    yield consumer
    consumer.close()
Example #5
    def read_from_offset(self, offset=1000):
        c = AvroConsumer(
            dict(
                self.base_config, **{
                    'group.id': 'groupid-1',
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))

        c.assign([TopicPartition(self.topic, partition=0, offset=offset)])
        return self.run_loop(c, return_message=True, file_object=False)
Example #6
    def read_from_start(self, persist=False, path='/'):
        c = AvroConsumer(
            dict(
                self.base_config, **{
                    'group.id': 'groupid',
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))

        c.assign([
            TopicPartition(self.topic,
                           partition=0,
                           offset=confluent_kafka.OFFSET_BEGINNING)
        ])

        if persist:
            with open(os.path.join(path, self.topic + '.txt'), 'w') as out:
                self.run_loop(c, file_object=out)
        else:
            self.run_loop(c)
Example #7
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()

        if message is None:
            self.wait()

        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)

            else:
                raise KafkaException(message.error())

        else:
            self._consume(message)

            if self.commit_on_complete:
                self.commit()

        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
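KafkaWorker is a base class: a subclass picks a topic and overrides consume_message (and optionally partition_eof). A minimal sketch with a placeholder topic name; BaseWorker, MessageValue and utils are assumed to come from the surrounding project.

# Hypothetical subclass; the topic name is a placeholder.
class PrintingWorker(KafkaWorker):
    topic_name = 'events'
    commit_on_complete = True

    def consume_message(self, message):
        # message is a MessageValue wrapping the decoded Avro payload
        print(message)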
Example #8
from confluent_kafka import KafkaError
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError
from confluent_kafka import TopicPartition


c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'messages-average',
    'schema.registry.url': 'http://0.0.0.0:8081',
    })

    
Partition = TopicPartition('ten-messages-average4', 0)
c.assign([Partition])
# c.seek(Partition)
# print(dir(c))
# # msg = c.poll(10)
# # print(msg.value(), msg.key(), msg.offset())


while True:
    try:
        msg = c.poll(10)

    except SerializerError as e:
        print("Message deserialization failed for {}: {}".format(msg, e))
        break

    if msg is None:
        continue
Example #9
from confluent_kafka import KafkaError
from confluent_kafka import TopicPartition
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

tp = TopicPartition('pure_project_xml', 0, 0)
c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'pure_project_output_generator',
    'schema.registry.url': 'http://localhost:8081',
})
c.assign([tp])
assignment = c.assignment()

# Need a timeout here due to this bug: https://github.com/confluentinc/confluent-kafka-python/issues/196
(first_offset, next_offset_to_create) = c.get_watermark_offsets(tp,
                                                                timeout=1,
                                                                cached=False)
last_offset = next_offset_to_create - 1

f = open('pure_project.xml', 'w')
f.write(
    '<?xml version="1.0"?>' + "\n" +
    '<project:upmprojects xmlns:common="v3.commons.pure.atira.dk" xmlns:project="v1.upmproject.pure.atira.dk">'
    + "\n")

# range values explained: We read the topic backwards, starting with the
# last offset. We use `first_offset - 1` because Python's range will stop
# before it reaches that value. So the last offset used will actually be
# the first offset. The last argument is the step, for which we pass -1,
# because we're reading backwards.
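The loop that the comment describes is cut off in this snippet; a sketch of what it plausibly looks like, assuming the message value carries the XML fragment under an 'xml' field (the field name is an assumption, not from the original):

# Sketch of the backwards read described above (not part of the original snippet).
for offset in range(last_offset, first_offset - 1, -1):
    c.seek(TopicPartition('pure_project_xml', 0, offset))
    msg = c.poll(10)
    if msg is None or msg.error():
        continue
    f.write(msg.value()['xml'] + "\n")  # 'xml' field name is assumed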
Example #10
    }

# no local avro schema setup when we use repository
consumer = AvroConsumer(config)

if (env == "local"):
    topicPartitionData = [
        TopicPartition(apiDataTopic, p) for p in range(0, mytopicpartitions)
    ]
    topicPartitionException = [
        TopicPartition(apiExceptionTopic, p)
        for p in range(0, mytopicpartitions)
    ]
    subscribed_topics.extend(topicPartitionException)
    subscribed_topics.extend(topicPartitionData)
    consumer.assign(subscribed_topics)
else:
    subscribed_topics.append(apiDataTopic)
    subscribed_topics.append(apiExceptionTopic)
    consumer.subscribe(subscribed_topics)

log.info("Consumer is listening for messages coming to topics: " +
         str(subscribed_topics))
badRecords = list()
# Read all messages from 2 topics
while True:
    try:
        msg = consumer.poll(2)
        badRecords.clear()
        if msg is None:
            continue
Example #11
class AvroAsync(object):
    def __init__(self, topic=None, ip='localhost'):
        self.topic = topic
        self.ip = ip  # os.environ['KAFKA_SERVER_IP']
        self.base_config = {
            'bootstrap.servers': self.ip + ':9092',
            'schema.registry.url': 'http://' + self.ip + ':8081'
        }

        self.avro_consumer = AvroConsumer(
            dict(self.base_config, **{'group.id': 'groupid'}))

        self.avro_consumer.assign([TopicPartition(self.topic, 0)])
        self.key_schema = avro.load(os.path.join(SCHEMAS, 'keyschema.avsc'))
        self.value_schema = avro.load(
            os.path.join(SCHEMAS, self.topic + '.avsc'))

    def producer(self):
        return AvroProducer(
            {
                'bootstrap.servers': self.ip + ':9092',
                'schema.registry.url': 'http://' + self.ip + ':8081'
            },
            default_key_schema=self.key_schema,
            default_value_schema=self.value_schema)

    def read_new(self, accumulate=False, n_messages=8, unique=True):

        self.avro_consumer.subscribe([self.topic])
        running = True

        cache = []
        while running:
            msg = self.avro_consumer.poll()
            if not msg.error():
                print(msg.value())
                if accumulate:
                    if len(cache) >= n_messages:
                        self.avro_consumer.close()
                        return cache
                    if unique:
                        if msg.value() not in cache:
                            cache.append(msg.value())
                    else:
                        cache.append(msg.value())

            elif msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                running = False
        self.avro_consumer.close()

    def read_from_start(self, persist=False, return_msgs=True, path='/'):
        _logger.debug('Reading data from Kafka from start...')
        c = AvroConsumer(
            dict(
                self.base_config, **{
                    'group.id': 'groupid',
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))

        c.assign([
            TopicPartition(self.topic,
                           partition=0,
                           offset=confluent_kafka.OFFSET_BEGINNING)
        ])

        if persist:
            with open(os.path.join(path, self.topic + '.txt'), 'w') as out:
                self.run_loop(c, file_object=out)
        else:
            return self.run_loop(c, return_message=return_msgs)

    def read_from_offset(self, offset=1000):
        c = AvroConsumer(
            dict(
                self.base_config, **{
                    'group.id': 'groupid-1',
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))

        c.assign([TopicPartition(self.topic, partition=0, offset=offset)])
        return self.run_loop(c, return_message=True, file_object=False)

    @staticmethod
    def run_loop(consumer, file_object=None, return_message=False):
        _logger.debug('Kafka consumer initialized, looping through data...')
        counter = 0
        msg_stack = []
        last_import = time.time() - 60
        while True:
            if counter % 10000 == 0:
                _logger.debug('Read {} messages from Kafka'.format(counter))
            counter += 1
            msg = consumer.poll(timeout=3)
            if msg is None:
                continue
            if file_object or return_message:
                try:
                    msg_stack.append(msg.value())
                except TypeError:
                    print(msg.value())

                if msg.timestamp()[1] / 1000 > last_import:
                    break
            else:
                print(msg)

        if file_object:
            for item in msg_stack:
                file_object.write(json.dumps(item) + '\n')

        if return_message:
            return msg_stack

        print(counter)
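A hypothetical usage sketch for the class above; it assumes key and value .avsc schemas exist under SCHEMAS for the chosen topic, as the constructor requires, and the topic name and output path are placeholders.

# Hypothetical usage; topic name and output path are placeholders.
client = AvroAsync(topic='sensor-readings', ip='localhost')
recent = client.read_from_offset(offset=5000)       # list of decoded message values
client.read_from_start(persist=True, path='/tmp')   # also writes /tmp/sensor-readings.txt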
Example #12
def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    if 'username' in request.cookies:
        username = request.cookies['username']
        print(f"Ok, {username}, let's fetch the latest tweets!")
        c = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': username,
            'schema.registry.url': SCHEMA_REGISTRY_URL,
            #'isolation.level': 'read_committed'
        })
        c.assign([TopicPartition(TOPIC, 0, 0)])
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        #print(f"the latest offset is {high_offset}, the low is {low_offset}")

        # move consumer to offset=high_offset-WINDOW_LEN (only if > 0)
        if high_offset - WINDOW_LEN > 0:
            new_offset = high_offset - WINDOW_LEN
        else:
            new_offset = low_offset
        c.seek(TopicPartition(TOPIC, 0, new_offset))

        msgs = []  # to store the messages to be returned
        pos = c.position([TopicPartition(TOPIC, 0, new_offset)])
        while pos[0].offset < high_offset:
            try:
                msg = c.poll(0)

            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(
                    msg, e))
                break

            if msg is None:
                continue

            if msg.error():
                print("AvroConsumer error: {}".format(msg.error()))
                continue

            author = msg.value()['author']
            content = msg.value()['content']
            #kafka_timestamp = datetime.datetime.fromtimestamp(float(msg.timestamp()[1]/1000)).strftime('%H:%M:%S, %d-%m-%Y')
            timestamp = datetime.datetime.fromtimestamp(
                float(msg.value()['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
            message_ts = float(msg.value()['timestamp'])
            location = msg.value()['location']
            tags = [h[1:] for h in content.split() if h.startswith('#')]
            mentions = [h[1:] for h in content.split() if h.startswith('@')]
            display_message = f"[{author}] {content} ({location} - {timestamp})"
            print(f"[{author}] {content} ({location} - {timestamp})")
            #print(f"consumer position: {c.position([TopicPartition(TOPIC, 0, new_offset)])}")
            pos = c.position([TopicPartition(TOPIC, 0, new_offset)])

            if cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                if (location.lower() == cityfilter) and (
                        mentionfilter.lower()
                        in mentions) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                if (mentionfilter.lower() in mentions) and (tagfilter.lower()
                                                            in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                if (location.lower() == cityfilter) and (tagfilter.lower()
                                                         in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                if (location.lower() == cityfilter) and (mentionfilter.lower()
                                                         in mentions):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter == 'ALL':
                if (location.lower() == cityfilter):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                if (mentionfilter.lower() in mentions):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                if (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            else:
                msgs.append((display_message, message_ts))
        c.close()
        # finally return a dictionary of messages
        msgs = list(set(msgs))  # ensure no duplicates of a message are shown in the timeline
        msgs = sorted(msgs, key=lambda x: x[1])
        msgs = [m[0] for m in msgs]
        print(msgs)
        return {"results": msgs}
    else:
        return {"results": ['Oooops, you are not logged in...']}
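The eight-branch filter chain above (repeated in the streaming variant below) reduces to three independent checks, since each filter only constrains the message when it is not 'ALL'; an equivalent condensed sketch (not from the original):

# Condensed, equivalent form of the filter chain.
city_ok = cityfilter == 'ALL' or location.lower() == cityfilter
mention_ok = mentionfilter == 'ALL' or mentionfilter.lower() in mentions
tag_ok = tagfilter == 'ALL' or tagfilter.lower() in tags
if city_ok and mention_ok and tag_ok:
    msgs.append((display_message, message_ts))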
Example #13
def streaming_filtering():
    cityfilter = request.form['cityfilter']
    mentionfilter = request.form['mentionfilter']
    tagfilter = request.form['tagfilter']
    print(f'cityfilter: {cityfilter}')
    print(f'mentionfilter: {mentionfilter}')
    print(f'tagfilter: {tagfilter}')

    if 'username' in request.cookies:
        username = request.cookies['username']
        print(f"Ok, {username}, let's stream the latest tweets!")
        c = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': username,
            'schema.registry.url': SCHEMA_REGISTRY_URL
        })
        c.assign([TopicPartition(TOPIC, 0, 0)])
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        print(f"the latest offset is {high_offset}, the low is {low_offset}")
        print(f"consumer position: {c.position([TopicPartition(TOPIC, 0)])}")

        # move consumer to top
        c.seek(TopicPartition(TOPIC, 0, high_offset))

        msgs = []
        pos = c.position([TopicPartition(TOPIC, 0, high_offset)])

        def gen(msgs):  # generator function for streaming
            print('ciao')
            while True:
                try:
                    msg = c.poll(1)

                except SerializerError as e:
                    print("Message deserialization failed for {}: {}".format(
                        msg, e))
                    break

                if msg is None:
                    current_ts = time.time()
                    msgs = [
                        m for m in msgs
                        if (float(current_ts) -
                            float(m[1])) < STREAMING_WINDOW_SECONDS
                    ]
                    ret_msgs = [m[0] for m in msgs]
                    yield f' `{json.dumps(ret_msgs)}` '
                    continue

                if msg.error():
                    current_ts = time.time()
                    msgs = [
                        m for m in msgs
                        if (float(current_ts) -
                            float(m[1])) < STREAMING_WINDOW_SECONDS
                    ]
                    ret_msgs = [m[0] for m in msgs]
                    yield f' `{json.dumps(ret_msgs)}` '
                    print("AvroConsumer error: {}".format(msg.error()))
                    continue

                # get message fields
                author = msg.value()['author']
                content = msg.value()['content']
                #kafka_timestamp = datetime.datetime.fromtimestamp(float(msg.timestamp()[1]/1000)).strftime('%H:%M:%S, %d-%m-%Y')
                timestamp = datetime.datetime.fromtimestamp(
                    float(msg.value()['timestamp'])).strftime(
                        '%H:%M:%S, %d-%m-%Y')
                location = msg.value()['location']
                tags = [h[1:] for h in content.split() if h.startswith('#')]
                mentions = [
                    h[1:] for h in content.split() if h.startswith('@')
                ]
                # create display_message
                display_message = f"[{author}] {content} ({location} - {timestamp})"
                display_message = display_message.replace(
                    "`", "'")  # needed so the streamed output can be read
                message_ts = float(msg.value()['timestamp'])
                print(f"{display_message}")
                print(
                    f"consumer position: {c.position([TopicPartition(TOPIC, 0, high_offset)])}"
                )
                pos = c.position([TopicPartition(TOPIC, 0, high_offset)])
                print('prima')
                print(f'cityfilter: {cityfilter}')
                print(f'mentionfilter: {mentionfilter}')
                print(f'tagfilter: {tagfilter}')

                if cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                    if (location.lower() == cityfilter) and (
                            mentionfilter.lower()
                            in mentions) and (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                    if (mentionfilter.lower()
                            in mentions) and (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                    if (location.lower() == cityfilter) and (tagfilter.lower()
                                                             in tags):
                        msgs.append((display_message, message_ts))
                elif cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                    if (location.lower()
                            == cityfilter) and (mentionfilter.lower()
                                                in mentions):
                        msgs.append((display_message, message_ts))
                elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter == 'ALL':
                    if (location.lower() == cityfilter):
                        msgs.append((display_message, message_ts))
                elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                    if (mentionfilter.lower() in mentions):
                        msgs.append((display_message, message_ts))
                elif cityfilter == 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                    if (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                else:
                    msgs.append((display_message, message_ts))

                # remove old messages
                current_ts = time.time()
                msgs = [
                    m for m in msgs if (float(current_ts) -
                                        float(m[1])) < STREAMING_WINDOW_SECONDS
                ]
                #msgs = list(set(msgs))
                msgs = sorted(msgs, key=lambda x: x[1])
                ret_msgs = [m[0] for m in msgs]
                yield f' `{json.dumps(ret_msgs)}` '

        return Response(stream_with_context(gen(msgs)))
    else:
        return {"results": ['Oooops, you are not logged in...']}
Example #14
students_average = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'students_average',
    'schema.registry.url': 'http://0.0.0.0:8081',
    'auto.offset.reset': 'earliest'
})

p0 = TopicPartition('students_result_source', 0)
p1 = TopicPartition('students_result_source', 1)
p2 = TopicPartition('students_result_source', 2)

# c.assign([Partition])

c.subscribe(['students_result_source'])
students_average.subscribe(['STUDENTS_AVERAGE'])
c_partition0.assign([p0])
c_partition1.assign([p1])
c_partition2.assign([p2])

searcher=[c_partition0, c_partition1, c_partition2]

key_schema_str = """
{
    "name": "average_key",
    "type": "int"
}
"""

value_schema_str = """
{
    "name": "average_value",
Example #15
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()

        if message is None:
            self.wait()

        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)

            else:
                raise KafkaException(message.error())

        else:
            self._consume(message)

            if self.commit_on_complete:
                self.commit()

        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
Example #16
class Consumer:
    def __init__(self,
                 broker,
                 schema_registry,
                 topic=None,
                 logging_enabled=False,
                 group_id=None,
                 auto_commit=True):
        """
        Initialiser for a Confluent consumer using AvroConsumer.
        Each consumer can only be subscribed to one topic.

        Parameters
        ----------
        broker: str
            The URL of the broker (example: 'localhost:9092')
        schema_registry: str
            The URL of the Confluent Schema Registry endpoint (example: 'http://localhost:8081')
        topic: str, Optional
            The topic to subscribe to
        logging_enabled: bool, Optional
            If True, messages are logged via a module-level logger
        group_id: str, Optional
            An optional group id used to load balance consumers; a random hash is generated when none is given
        auto_commit: bool, Optional
            Whether offsets are committed automatically (default True)
        """
        if group_id is None:
            new_hash = hashlib.sha1()
            new_hash.update(str(time.time()).encode("utf-8"))
            group_id = new_hash.hexdigest()

        self.__consumer = AvroConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id,
            "schema.registry.url": schema_registry,
            "enable.auto.commit": auto_commit
        })
        self.__consumer_non_avro = KafkaConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id + "0",
            "enable.auto.commit": auto_commit
        })
        self.auto_commit = auto_commit
        if not auto_commit:
            self.consumed_messages = PriorityQueue()
        if topic is not None:
            self.subscribe_to_topic(topic)
        else:
            self.topic = None
        if logging_enabled:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = None

    def consume(self, timeout=1):
        """
        Method to consume and return message if exists and can be deserialized
        Returns
        -------
        str
            The received message payload as a string
        None
            No message has been received or an error has occurred
        """
        if self.topic is not None:
            msg = None
            non_avro = False
            try:
                msg = self.__consumer.poll(timeout)
            except SerializerError as e:
                try:
                    msg = self.__consumer_non_avro.poll(timeout)
                    non_avro = True
                except Exception as e:
                    self.__log_msg(
                        "Message deserialization has failed {}: {}".format(
                            msg, e),
                        "See the following stack trace",
                        f"{traceback.format_exc()}",
                        delimeter="\n",
                        level="ERROR")
            except RuntimeError as e:
                self.__log_msg(
                    "The consumer has been closed and cannot receive messages",
                    level="ERROR")
            except Exception as e:
                self.__log_msg("An unknown error has occurred {}".format(e),
                               "See the following stack trace",
                               f"{traceback.format_exc()}",
                               delimeter="\n",
                               level="ERROR")

            if msg is not None:
                if msg.error():
                    self.__log_msg("AvroConsumer error: {}".format(
                        msg.error()),
                                   level="ERROR")
                else:
                    if not self.auto_commit:
                        self.consumed_messages.put_nowait(msg)
                    if non_avro:
                        data_to_be_returned = json.loads(msg.value().decode())
                    else:
                        data_to_be_returned = msg.value()
                    return data_to_be_returned
        else:
            raise ValueError("Consumer is currently not subscribed to a topic")

    def __enter__(self):
        return self.__consumer

    def __exit__(self, *args):
        self.close()

    def __log_msg(
        self,
        *messages,
        level="NOTSET",
        delimeter=" ",
    ):
        levels = {
            "CRITICAL": logging.CRITICAL,
            "ERROR": logging.ERROR,
            "WARNING": logging.WARNING,
            "INFO": logging.INFO,
            "DEBUG": logging.DEBUG,
            "NOTSET": logging.NOTSET
        }
        msg = delimeter.join(messages)
        if self.logger is not None:
            if level not in levels:
                raise ValueError(
                    f"level {level} is not valid must be one of {list(levels.keys())}"
                )
            self.logger.log(levels[level], msg)
        else:
            if level is not None:
                print(f"LOGGED MESSAGE: {msg}")
            else:
                print(f"{level}: {msg}")

    def commit(self, asynchronous=True):
        if not self.auto_commit and not self.consumed_messages.empty():
            msg = self.consumed_messages.get_nowait()
            self.__consumer.commit(msg, asynchronous=asynchronous)

    def list_topics(self, topic=None, timeout=1):
        try:
            metadata = self.__consumer.list_topics(topic, timeout)
            topics = metadata.topics
            return list(topics.keys())
        except Exception as e:
            self.__log_msg(
                f"An unknown error has occurred when trying to list topics {e}",
                level="ERROR")
            if self.logger is not None:
                self.logger.debug(e)

    def check_if_topic_exists(self, topic, timeout=1):
        topic_list = self.list_topics(timeout=timeout)
        if topic_list is not None:
            return topic in topic_list

    def subscribe_to_topic(self, topic):
        try:
            self.__consumer_non_avro.subscribe([topic],
                                               on_assign=self.__assign)
            self.__consumer.subscribe([topic], on_assign=self.__assign)
            self.topic = topic
            return True
        except Exception as e:
            self.__log_msg(
                "An unknown error {}".format(e),
                "occurred while trying to subscribe to topic {}".format(topic),
                delimeter=" ",
                level="ERROR")
            return False

    def __assign(self, consumer, partitions):
        for p in partitions:
            p.offset = consumer.get_watermark_offsets(p)[1] - 1
        self.__consumer.assign(partitions)
        self.__consumer_non_avro.assign(partitions)

    def close(self):
        """
        Close the consumer. Once closed, this object cannot be reused.
        """
        self.__consumer.close()
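A hypothetical usage sketch for the wrapper above, assuming a local broker and schema registry; the topic name is a placeholder.

# Hypothetical usage; broker, registry and topic are placeholders.
consumer = Consumer(
    broker='localhost:9092',
    schema_registry='http://localhost:8081',
    topic='sensor-readings',
    logging_enabled=True,
    auto_commit=False,
)
payload = consumer.consume(timeout=5)
if payload is not None:
    print(payload)
consumer.commit(asynchronous=False)
consumer.close()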
Example #17
def main():

    # create an Avro consumer for Kafka
    # Note that only the schema registry URL has to be given here; Avro deserialization is handled automatically
    c = AvroConsumer({
        'bootstrap.servers': 'broker:29092',
        'group.id': 'anomalie_training',
        'schema.registry.url': 'http://schema-registry:8081'
    })

    # subscribe to topic
    c.subscribe(['anomalie_tutorial'])

    # We need to change the Kafka offset in order to consume from the beginning.
    # There is no straightforward way to achieve this, so the first message has to
    # be polled so that the assignment can be obtained. Afterwards the assignment
    # (in the form of topic partitions) is changed by setting the offset to the beginning.
    msg = c.poll(10)
    topic_partition = c.assignment()

    for partition in topic_partition:
        partition.offset = OFFSET_BEGINNING

    c.assign(topic_partition)

    # Consume messages from the topic
    messages = []
    while True:
        msg = c.poll(1)

        if msg is None:
            break

        messages.append(msg.value())

    c.close()

    # transform messages into a Pandas DataFrame and do feature engineering
    df = pd.DataFrame(messages)
    df.timestamp = pd.to_datetime(df.timestamp * 1000000)

    df['hour'] = df.timestamp.dt.hour
    df['business_hour'] = ((df.hour < 8) | (df.hour > 18)).astype("int")
    df.drop(["hour"], axis=1, inplace=True)

    # train test split
    # note that we cannot use sklearn.model_selection.train_test_split, as this is a time series and a random split is not an option!
    train_length = int(len(df) * 0.6)
    x_train = df.drop("timestamp", axis=1).iloc[:train_length, :]
    x_test = df.drop("timestamp", axis=1).iloc[train_length:, :]

    # Train Machine Learning Model, here Isolation Forests
    # contamination is an important parameter: it determines how many data points will be classified as anomalous.
    iso_forest = IsolationForest(n_estimators=100, contamination=float(.02))
    iso_forest.fit(x_train)
    dump(iso_forest, '/data/iso_forest.joblib')

    # make predictions on test set
    predictions = iso_forest.predict(x_test)

    # make plot for evaluation and save figure
    evaluate_anomalies(predictions, df, train_length)
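The poll-then-assign dance at the top of main() can also be done from the rebalance callback, the same way Example #3 seeks to OFFSET_END; a sketch of that alternative (not part of the original, OFFSET_BEGINNING being the same constant used in main() above):

# Alternative: rewind to the beginning from the on_assign callback instead of
# polling first and reassigning.
def on_assign(consumer, partitions):
    for p in partitions:
        p.offset = OFFSET_BEGINNING
    consumer.assign(partitions)

c.subscribe(['anomalie_tutorial'], on_assign=on_assign)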