class KafkaWorker(BaseWorker):
    """Worker that consumes Avro-encoded messages from a single Kafka topic.

    Lifecycle (driven by ``BaseWorker``): ``setup`` builds the consumer and
    subscribes, ``handle`` is invoked repeatedly to poll and dispatch one
    message, ``teardown`` closes the consumer.  Subclasses must set
    ``topic_name`` and typically override ``consume_message``.
    """

    # Topic to subscribe to; ``get_topic_name`` reports a config error when unset.
    topic_name = None
    # Consumer group id; a random one is generated when unset.
    consumer_name = None
    # Subclass overrides merged over the defaults in ``get_consumer_settings``.
    # NOTE(review): class-level mutable dict — shared by all instances; it is
    # only read here, never mutated.
    consumer_settings = {}
    # When True, ``handle`` commits offsets after each consumed message.
    commit_on_complete = False
    # Use non-blocking (asynchronous) offset commits.
    async_commit = True
    # Timeout (seconds) for each ``Consumer.poll``; 0 returns immediately.
    poll_timeout = 0
    # Start position when the group has no committed offset.
    auto_offset_reset = 'earliest'
    consumer = None      # AvroConsumer instance, created in ``setup``
    last_message = None  # most recent non-None message returned by ``poll``

    def setup(self):
        # Build the consumer and subscribe to the configured topic.
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        # Close cleanly so the group coordinator can rebalance promptly.
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        # Surface a "missing config" error when no topic is configured.
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        """Return client settings: library defaults overlaid with ``consumer_settings``."""
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            # Offsets are committed explicitly via ``commit`` below.
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        """Poll once; remember and return the message (None on timeout)."""
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        """Return assigned partitions, polling once first to trigger assignment."""
        partitions = self.consumer.assignment()
        if not partitions:
            # Partition assignment happens lazily after subscribe; one poll
            # gives the client a chance to join the group.
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        """Re-assign all currently held partitions starting at ``offset``."""
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        """Move every assigned partition to the first offset at/after ``timestamp``."""
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        # offsets_for_times expects the target timestamp in the offset field.
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        """Process one poll cycle: dispatch a message, an EOF event, or wait."""
        message = self.poll()
        if message is None:
            self.wait()
        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                # Reached the current end of a partition — not a failure.
                self.partition_eof(message)
            else:
                raise KafkaException(message.error())
        else:
            self._consume(message)
            if self.commit_on_complete:
                self.commit()
        self.done()

    def commit(self):
        # Explicit commit is only meaningful when the caller has not
        # re-enabled auto-commit via ``consumer_settings``.
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        # Wrap the raw Kafka message before handing it to subclass code.
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        # Hook for subclasses.
        pass

    def partition_eof(self, message):
        # Hook for subclasses.
        pass
def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    """Fetch the most recent tweets from Kafka and filter them.

    Reads up to ``WINDOW_LEN`` messages ending at the topic's high
    watermark, keeps those matching the city/mention/tag filters
    ('ALL' disables a filter), de-duplicates, sorts by tweet timestamp
    and returns ``{"results": [display_strings]}``.

    Requires a ``username`` cookie; returns an error message otherwise.
    """
    if 'username' not in request.cookies:
        # Typo fixed: "your are" -> "you are".
        return {"results": ['Oooops, you are not logged in...']}

    username = request.cookies['username']
    print(f"Ok, {username}, let's fetch the latest tweets!")
    c = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': username,
        'schema.registry.url': SCHEMA_REGISTRY_URL,
        # 'isolation.level': 'read_committed'
    })
    c.assign([TopicPartition(TOPIC, 0, 0)])
    low_offset, high_offset = c.get_watermark_offsets(TopicPartition(TOPIC, 0))

    # Rewind to the start of the window: the last WINDOW_LEN messages,
    # clamped to the earliest offset still available in the partition.
    if high_offset - WINDOW_LEN > 0:
        new_offset = high_offset - WINDOW_LEN
    else:
        new_offset = low_offset
    c.seek(TopicPartition(TOPIC, 0, new_offset))

    msgs = []   # (display_message, tweet_timestamp) tuples to return
    msg = None  # BUGFIX: pre-bind so the SerializerError handler cannot
                # raise UnboundLocalError when the very first poll fails
    pos = c.position([TopicPartition(TOPIC, 0, new_offset)])
    while pos[0].offset < high_offset:
        try:
            msg = c.poll(0)
        except SerializerError as e:
            print("Message deserialization failed for {}: {}".format(msg, e))
            break

        if msg is None:
            continue
        if msg.error():
            print("AvroConsumer error: {}".format(msg.error()))
            continue

        # Pull the tweet fields out of the Avro payload.
        author = msg.value()['author']
        content = msg.value()['content']
        timestamp = datetime.datetime.fromtimestamp(
            float(msg.value()['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
        message_ts = float(msg.value()['timestamp'])
        location = msg.value()['location']
        tags = [h[1:] for h in content.split() if h.startswith('#')]
        mentions = [h[1:] for h in content.split() if h.startswith('@')]
        display_message = f"[{author}] {content} ({location} - {timestamp})"
        print(f"[{author}] {content} ({location} - {timestamp})")
        pos = c.position([TopicPartition(TOPIC, 0, new_offset)])

        # A filter set to 'ALL' matches everything; otherwise compare it
        # against the message fields.  This conjunction is equivalent to the
        # original 8-branch if/elif chain (note: only mention/tag filters
        # were lowercased in the original — preserved as-is).
        if ((cityfilter == 'ALL' or location.lower() == cityfilter)
                and (mentionfilter == 'ALL' or mentionfilter.lower() in mentions)
                and (tagfilter == 'ALL' or tagfilter.lower() in tags)):
            msgs.append((display_message, message_ts))

    c.close()
    # De-duplicate (a message must appear once in the timeline), order by
    # tweet timestamp, and keep only the display strings.
    msgs = sorted(set(msgs), key=lambda x: x[1])
    msgs = [m[0] for m in msgs]
    print(msgs)
    return {"results": msgs}
class KafkaWorker(BaseWorker):
    """Worker that consumes Avro-encoded messages from a single Kafka topic.

    Lifecycle (driven by ``BaseWorker``): ``setup`` builds the consumer and
    subscribes, ``handle`` is invoked repeatedly to poll and dispatch one
    message, ``teardown`` closes the consumer.  Subclasses must set
    ``topic_name`` and typically override ``consume_message``.
    """

    # Topic to subscribe to; ``get_topic_name`` reports a config error when unset.
    topic_name = None
    # Consumer group id; a random one is generated when unset.
    consumer_name = None
    # Subclass overrides merged over the defaults in ``get_consumer_settings``.
    # NOTE(review): class-level mutable dict — shared by all instances; it is
    # only read here, never mutated.
    consumer_settings = {}
    # When True, ``handle`` commits offsets after each consumed message.
    commit_on_complete = False
    # Use non-blocking (asynchronous) offset commits.
    async_commit = True
    # Timeout (seconds) for each ``Consumer.poll``; 0 returns immediately.
    poll_timeout = 0
    # Start position when the group has no committed offset.
    auto_offset_reset = 'earliest'
    consumer = None      # AvroConsumer instance, created in ``setup``
    last_message = None  # most recent non-None message returned by ``poll``

    def setup(self):
        # Build the consumer and subscribe to the configured topic.
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        # Close cleanly so the group coordinator can rebalance promptly.
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        # Surface a "missing config" error when no topic is configured.
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        """Return client settings: library defaults overlaid with ``consumer_settings``."""
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            # Offsets are committed explicitly via ``commit`` below.
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        """Poll once; remember and return the message (None on timeout)."""
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        """Return assigned partitions, polling once first to trigger assignment."""
        partitions = self.consumer.assignment()
        if not partitions:
            # Partition assignment happens lazily after subscribe; one poll
            # gives the client a chance to join the group.
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        """Re-assign all currently held partitions starting at ``offset``."""
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        """Move every assigned partition to the first offset at/after ``timestamp``."""
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        # offsets_for_times expects the target timestamp in the offset field.
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        """Process one poll cycle: dispatch a message, an EOF event, or wait."""
        message = self.poll()
        if message is None:
            self.wait()
        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                # Reached the current end of a partition — not a failure.
                self.partition_eof(message)
            else:
                raise KafkaException(message.error())
        else:
            self._consume(message)
            if self.commit_on_complete:
                self.commit()
        self.done()

    def commit(self):
        # Explicit commit is only meaningful when auto-commit stays disabled.
        if not self.consumer_settings.get('enable.auto.commit'):
            # BUGFIX: was ``commit(async=...)`` — ``async`` is a reserved
            # keyword since Python 3.7 (SyntaxError), and confluent-kafka
            # renamed the parameter to ``asynchronous``.  Now consistent with
            # the other copy of this class in the codebase.
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        # Wrap the raw Kafka message before handing it to subclass code.
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        # Hook for subclasses.
        pass

    def partition_eof(self, message):
        # Hook for subclasses.
        pass
def streaming_filtering():
    """Stream newly-arriving tweets that match form-supplied filters.

    Reads ``cityfilter``/``mentionfilter``/``tagfilter`` from the POSTed
    form ('ALL' disables a filter), seeks the consumer to the topic's high
    watermark and returns a streaming ``Response`` whose generator yields,
    on every poll cycle, the backtick-delimited JSON list of matching
    messages seen in the last ``STREAMING_WINDOW_SECONDS``.

    Requires a ``username`` cookie; returns an error message otherwise.
    """
    cityfilter = request.form['cityfilter']
    mentionfilter = request.form['mentionfilter']
    tagfilter = request.form['tagfilter']
    print(f'cityfilter: {cityfilter}')
    print(f'mentionfilter: {mentionfilter}')
    print(f'tagfilter: {tagfilter}')

    if 'username' not in request.cookies:
        # Typo fixed: "your are" -> "you are".
        return {"results": ['Oooops, you are not logged in...']}

    username = request.cookies['username']
    print(f"Ok, {username}, let's stream the latest tweets!")
    c = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': username,
        'schema.registry.url': SCHEMA_REGISTRY_URL
    })
    c.assign([TopicPartition(TOPIC, 0, 0)])
    low_offset, high_offset = c.get_watermark_offsets(TopicPartition(TOPIC, 0))
    print(f"the latest offset is {high_offset}, the low is {low_offset}")
    print(f"consumer position: {c.position([TopicPartition(TOPIC, 0)])}")

    # Start streaming from the current end of the topic: only tweets that
    # arrive after this request are shown.
    c.seek(TopicPartition(TOPIC, 0, high_offset))
    msgs = []  # (display_message, tweet_timestamp) tuples in the window
    pos = c.position([TopicPartition(TOPIC, 0, high_offset)])

    def _prune(window):
        # Keep only messages younger than the streaming window.
        current_ts = time.time()
        return [m for m in window
                if (float(current_ts) - float(m[1])) < STREAMING_WINDOW_SECONDS]

    def gen(msgs):  # generator function for streaming
        print('ciao')
        msg = None  # BUGFIX: pre-bind so the SerializerError handler cannot
                    # raise UnboundLocalError when the very first poll fails
        try:
            while True:
                try:
                    msg = c.poll(1)
                except SerializerError as e:
                    print("Message deserialization failed for {}: {}".format(msg, e))
                    break

                if msg is None:
                    # No new message: re-emit the pruned window so the
                    # client-side timeline keeps aging out old tweets.
                    msgs = _prune(msgs)
                    ret_msgs = [m[0] for m in msgs]
                    yield f' `{json.dumps(ret_msgs)}` '
                    continue
                if msg.error():
                    # Same re-emit on consumer errors, then log and move on.
                    msgs = _prune(msgs)
                    ret_msgs = [m[0] for m in msgs]
                    yield f' `{json.dumps(ret_msgs)}` '
                    print("AvroConsumer error: {}".format(msg.error()))
                    continue

                # Pull the tweet fields out of the Avro payload.
                author = msg.value()['author']
                content = msg.value()['content']
                timestamp = datetime.datetime.fromtimestamp(
                    float(msg.value()['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
                location = msg.value()['location']
                tags = [h[1:] for h in content.split() if h.startswith('#')]
                mentions = [h[1:] for h in content.split() if h.startswith('@')]
                display_message = f"[{author}] {content} ({location} - {timestamp})"
                # Backticks delimit messages in the stream; strip them from
                # the content so the client can split on them.
                display_message = display_message.replace("`", "'")
                message_ts = float(msg.value()['timestamp'])
                print(f"{display_message}")
                print(
                    f"consumer position: {c.position([TopicPartition(TOPIC, 0, high_offset)])}"
                )
                pos = c.position([TopicPartition(TOPIC, 0, high_offset)])
                print('prima')
                print(f'cityfilter: {cityfilter}')
                print(f'mentionfilter: {mentionfilter}')
                print(f'tagfilter: {tagfilter}')

                # A filter set to 'ALL' matches everything; this conjunction
                # is equivalent to the original 8-branch if/elif chain (only
                # mention/tag filters were lowercased — preserved as-is).
                if ((cityfilter == 'ALL' or location.lower() == cityfilter)
                        and (mentionfilter == 'ALL' or mentionfilter.lower() in mentions)
                        and (tagfilter == 'ALL' or tagfilter.lower() in tags)):
                    msgs.append((display_message, message_ts))

                # Age out old messages, order by tweet timestamp, emit.
                msgs = sorted(_prune(msgs), key=lambda x: x[1])
                ret_msgs = [m[0] for m in msgs]
                yield f' `{json.dumps(ret_msgs)}` '
        finally:
            # BUGFIX (resource leak): close the consumer when the stream
            # ends or the client disconnects (GeneratorExit lands here).
            c.close()

    return Response(stream_with_context(gen(msgs)))