예제 #1
0
def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    """Fetch the most recent WINDOW_LEN tweets from the topic and return
    the ones matching the given filters.

    Each filter is either ``'ALL'`` (disabled) or a lowercase value that
    must match the tweet's location, one of its @-mentions, or one of its
    #-hashtags respectively.

    Returns:
        dict: ``{"results": [display_message, ...]}`` sorted by tweet
        timestamp, or an error message when the user is not logged in.
    """
    if 'username' not in request.cookies:
        return {"results": ['Oooops, you are not logged in...']}

    username = request.cookies['username']
    print(f"Ok, {username}, let's fetch the latest tweets!")
    c = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': username,
        'schema.registry.url': SCHEMA_REGISTRY_URL,
        #'isolation.level': 'read_committed'
    })
    try:
        c.assign([TopicPartition(TOPIC, 0, 0)])
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))

        # Rewind to the last WINDOW_LEN messages, or to the beginning of
        # the partition when it holds fewer than WINDOW_LEN messages.
        if high_offset - WINDOW_LEN > 0:
            new_offset = high_offset - WINDOW_LEN
        else:
            new_offset = low_offset
        c.seek(TopicPartition(TOPIC, 0, new_offset))

        msgs = []  # (display_message, timestamp) tuples to be returned
        pos = c.position([TopicPartition(TOPIC, 0, new_offset)])
        msg = None  # BUG FIX: keep bound for the except handler below
        while pos[0].offset < high_offset:
            try:
                msg = c.poll(0)
            except SerializerError as e:
                # Previously `msg` could be unbound here if the very
                # first poll raised; it is now initialised above.
                print("Message deserialization failed for {}: {}".format(
                    msg, e))
                break

            if msg is None:
                continue

            if msg.error():
                print("AvroConsumer error: {}".format(msg.error()))
                continue

            value = msg.value()
            author = value['author']
            content = value['content']
            timestamp = datetime.datetime.fromtimestamp(
                float(value['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
            message_ts = float(value['timestamp'])
            location = value['location']
            tags = [w[1:] for w in content.split() if w.startswith('#')]
            mentions = [w[1:] for w in content.split() if w.startswith('@')]
            display_message = f"[{author}] {content} ({location} - {timestamp})"
            print(display_message)
            pos = c.position([TopicPartition(TOPIC, 0, new_offset)])

            # A tweet passes when every active filter matches; 'ALL'
            # disables a filter.  This single predicate replaces the
            # former 8-branch if/elif enumeration of filter combinations.
            if ((cityfilter == 'ALL' or location.lower() == cityfilter)
                    and (mentionfilter == 'ALL'
                         or mentionfilter.lower() in mentions)
                    and (tagfilter == 'ALL' or tagfilter.lower() in tags)):
                msgs.append((display_message, message_ts))
    finally:
        # Always release the consumer, even if polling/seeking raised.
        c.close()

    # De-duplicate (a message may be read twice), sort chronologically,
    # then drop the timestamps used only for ordering.
    msgs = sorted(set(msgs), key=lambda x: x[1])
    msgs = [m[0] for m in msgs]
    print(msgs)
    return {"results": msgs}
예제 #2
0
from confluent_kafka import TopicPartition
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

# Attach a consumer to partition 0 of the 'pure_project_xml' topic so the
# whole partition can be re-read regardless of committed offsets.
tp = TopicPartition('pure_project_xml', 0, 0)
c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'pure_project_output_generator',
    'schema.registry.url': 'http://localhost:8081',
})
c.assign([tp])
assignment = c.assignment()

# Need a timeout here due to this bug: https://github.com/confluentinc/confluent-kafka-python/issues/196
(first_offset, next_offset_to_create) = c.get_watermark_offsets(tp,
                                                                timeout=1,
                                                                cached=False)
# The high watermark is the offset the NEXT message will receive, so the
# last existing message sits one offset before it.
last_offset = next_offset_to_create - 1

# NOTE(review): the file is opened without a `with` block; presumably it
# is closed after the loop below (not visible in this chunk) — confirm.
f = open('pure_project.xml', 'w')
# Emit the XML prolog and the opening <project:upmprojects> root element.
f.write(
    '<?xml version="1.0"?>' + "\n" +
    '<project:upmprojects xmlns:common="v3.commons.pure.atira.dk" xmlns:project="v1.upmproject.pure.atira.dk">'
    + "\n")

# range values explained: We read the topic backwards, starting with the
# last offset. We use `first_offset - 1` because Python's range will stop
# before it reaches that value. So the last offset used will actually be
# the first offset. The last argument is the step, for which we pass -1,
# because we're reading backwards.
for offset in range(last_offset, first_offset - 1, -1):
예제 #3
0
def streaming_filtering():
    """Stream tweets that match the POSTed city / mention / tag filters.

    Reads the filters from the request form, seeks the consumer to the
    end of the topic (only new tweets matter), and returns a streaming
    Response whose generator repeatedly yields the rolling window
    (STREAMING_WINDOW_SECONDS) of matching messages as JSON, wrapped in
    back-ticks so the client can split the stream into records.
    """
    cityfilter = request.form['cityfilter']
    mentionfilter = request.form['mentionfilter']
    tagfilter = request.form['tagfilter']
    print(f'cityfilter: {cityfilter}')
    print(f'mentionfilter: {mentionfilter}')
    print(f'tagfilter: {tagfilter}')

    if 'username' not in request.cookies:
        return {"results": ['Oooops, you are not logged in...']}

    username = request.cookies['username']
    print(f"Ok, {username}, let's stream the latest tweets!")
    c = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': username,
        'schema.registry.url': SCHEMA_REGISTRY_URL
    })
    c.assign([TopicPartition(TOPIC, 0, 0)])
    low_offset, high_offset = c.get_watermark_offsets(
        TopicPartition(TOPIC, 0))
    print(f"the latest offset is {high_offset}, the low is {low_offset}")
    print(f"consumer position: {c.position([TopicPartition(TOPIC, 0)])}")

    # Start from the end of the topic: only newly produced tweets matter.
    c.seek(TopicPartition(TOPIC, 0, high_offset))

    msgs = []  # rolling window of (display_message, timestamp) tuples

    def _trim_window(window):
        # Keep only the messages newer than STREAMING_WINDOW_SECONDS.
        now = time.time()
        return [
            m for m in window
            if (float(now) - float(m[1])) < STREAMING_WINDOW_SECONDS
        ]

    def _matches(location, mentions, tags):
        # A filter set to 'ALL' is disabled; otherwise it must match.
        # (Single predicate replacing the former 8-branch if/elif
        # enumeration of filter combinations.)
        return ((cityfilter == 'ALL' or location.lower() == cityfilter)
                and (mentionfilter == 'ALL'
                     or mentionfilter.lower() in mentions)
                and (tagfilter == 'ALL' or tagfilter.lower() in tags))

    def gen(msgs):  # generator function for streaming
        msg = None  # BUG FIX: keep bound for the except handler below
        try:
            while True:
                try:
                    msg = c.poll(1)
                except SerializerError as e:
                    # Previously `msg` could be unbound here if the very
                    # first poll raised; it is now initialised above.
                    print("Message deserialization failed for {}: {}".format(
                        msg, e))
                    break

                if msg is None:
                    # No new tweet: still emit the trimmed window so the
                    # client's view expires old messages.
                    msgs = _trim_window(msgs)
                    yield f' `{json.dumps([m[0] for m in msgs])}` '
                    continue

                if msg.error():
                    msgs = _trim_window(msgs)
                    yield f' `{json.dumps([m[0] for m in msgs])}` '
                    print("AvroConsumer error: {}".format(msg.error()))
                    continue

                # get message fields
                value = msg.value()
                author = value['author']
                content = value['content']
                timestamp = datetime.datetime.fromtimestamp(
                    float(value['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
                location = value['location']
                tags = [w[1:] for w in content.split() if w.startswith('#')]
                mentions = [
                    w[1:] for w in content.split() if w.startswith('@')
                ]
                display_message = f"[{author}] {content} ({location} - {timestamp})"
                # Back-ticks delimit records in the stream, so strip them
                # from the payload itself.
                display_message = display_message.replace("`", "'")
                message_ts = float(value['timestamp'])
                print(f"{display_message}")

                if _matches(location, mentions, tags):
                    msgs.append((display_message, message_ts))

                # Drop expired messages, then emit the window oldest-first.
                msgs = _trim_window(msgs)
                msgs = sorted(msgs, key=lambda x: x[1])
                yield f' `{json.dumps([m[0] for m in msgs])}` '
        finally:
            # BUG FIX: release the consumer when the client disconnects
            # (the generator is closed) or deserialization fails — the
            # original leaked the consumer in both cases.
            c.close()

    return Response(stream_with_context(gen(msgs)))