def __init__(self, log_level):
    self.log_level = log_level
    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
        level=self.log_level,
        datefmt='%d/%m/%Y %I:%M:%S %p'
    )
    self.logger = logging.getLogger("TextPreprocessor")

    self.producer = Producer()
    self.consumer = Consumer()
    self.consumed_msg_schema = TextPreprocessingConsumedMsgSchema()
    self.produced_msg_schema = TextEncodingProducedMsgSchema()
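A minimal launch sketch for this service. The class name TextPreprocessorService, the LOG_LEVEL environment variable, and the run() method are assumptions (the other listings suggest them, but this one does not show them):

import logging
import os

if __name__ == "__main__":
    # Hypothetical entry point: resolve the log level from the environment,
    # then block in the service's Kafka consumer loop.
    level = getattr(logging, os.environ.get("LOG_LEVEL", "DEBUG").upper(),
                    logging.DEBUG)
    service = TextPreprocessorService(level)  # assumed class name
    service.run()  # assumed to mirror the run() loops of the other services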
def __init__(self, log_level):
    self.app = Flask(__name__)
    self.api = Api(self.app)
    self.cors = CORS(
        self.app,
        resources={
            # Origin examples: http://jizt.it, https://app.jizt.it, http://jizt.it/hi
            r"*": {
                "origins": r"https?://\w*\.?jizt\.it/?.*",
                "allow_headers": ['Content-Type']
            }
        }
    )

    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
        level=log_level,
        datefmt='%d/%m/%Y %I:%M:%S %p'
    )
    self.logger = logging.getLogger("Dispatcher")
    logging.getLogger("flask_cors").level = log_level

    # PostgreSQL connection data
    pg_username = None
    pg_password = None
    with open((Path(os.environ['PG_SECRET_PATH'])
               / Path(os.environ['PG_USERNAME_FILE'])), 'r') as username:
        pg_username = username.readline().rstrip('\n')
    with open((Path(os.environ['PG_SECRET_PATH'])
               / Path(os.environ['PG_PASSWORD_FILE'])), 'r') as password:
        pg_password = password.readline().rstrip('\n')

    self.db = SummaryDAOFactory(
        os.environ['PG_HOST'],
        os.environ['PG_DBNAME'],
        pg_username,
        pg_password,
        log_level
    )

    # Create Kafka Producer and ConsumerLoop
    self.kafka_producer = Producer()
    self.kafka_consumerloop = ConsumerLoop(self.db)

    # Endpoints
    self.api.add_resource(
        PlainTextSummary,
        "/v1/summaries/plain-text",
        "/v1/summaries/plain-text/<summary_id>",
        endpoint="plain-text-summarization",
        resource_class_kwargs={'dispatcher_service': self,
                               'kafka_producer': self.kafka_producer}
    )
    self.api.add_resource(
        Health,
        "/",
        "/healthz",
        endpoint="readiness-liveness-probe",
        resource_class_kwargs={'dispatcher_service': self,
                               'kafka_producer': self.kafka_producer}
    )
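The origin pattern above can be sanity-checked in isolation; a quick illustrative sketch (not part of the service):

import re

pattern = re.compile(r"https?://\w*\.?jizt\.it/?.*")
# Matches the documented origin examples...
assert pattern.match("http://jizt.it")
assert pattern.match("https://app.jizt.it")
assert pattern.match("http://jizt.it/hi")
# ...but not an unrelated host.
assert not pattern.match("https://evil-example.com")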
def __init__(self, log_level):
    self.app = Flask(__name__)
    self.api = Api(self.app)
    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
        level=log_level,
        datefmt='%d/%m/%Y %I:%M:%S %p'
    )
    self.logger = logging.getLogger("Dispatcher")

    # PostgreSQL connection data
    pg_username = None
    pg_password = None
    with open((Path(os.environ['PG_SECRET_PATH'])
               / Path(os.environ['PG_USERNAME_FILE'])), 'r') as username:
        pg_username = username.readline().rstrip('\n')
    with open((Path(os.environ['PG_SECRET_PATH'])
               / Path(os.environ['PG_PASSWORD_FILE'])), 'r') as password:
        pg_password = password.readline().rstrip('\n')

    self.db = SummaryDAOFactory(
        os.environ['PG_HOST'],
        os.environ['PG_DBNAME'],
        pg_username,
        pg_password,
        log_level
    )
    del pg_username
    del pg_password

    # Create Kafka Producer and ConsumerLoop
    self.kafka_producer = Producer()
    self.kafka_consumerloop = ConsumerLoop(self.db)

    # Endpoints
    self.api.add_resource(
        PlainTextSummary,
        "/v1/summaries/plain-text",
        "/v1/summaries/plain-text/<summary_id>",
        endpoint="plain-text-summarization",
        resource_class_kwargs={'dispatcher_service': self,
                               'kafka_producer': self.kafka_producer}
    )
    self.api.add_resource(
        Health,
        "/",
        "/healthz",
        endpoint="readiness-liveness-probe",
        resource_class_kwargs={'dispatcher_service': self,
                               'kafka_producer': self.kafka_producer}
    )
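A hedged sketch of how the dispatcher might be started. That the class is named DispatcherService, that ConsumerLoop exposes a Thread-style start() method, and that the API listens on port 5000 are all assumptions not shown in the listing:

import logging

if __name__ == "__main__":
    dispatcher = DispatcherService(logging.DEBUG)  # assumed class name
    dispatcher.kafka_consumerloop.start()  # assumed threading.Thread-style API
    dispatcher.app.run(host="0.0.0.0", port=5000)  # Flask dev server; port assumed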
class TextPostprocessorService:
    """Text post-processing service."""

    def __init__(self, log_level):
        self.log_level = log_level
        logging.basicConfig(
            format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
            level=self.log_level,
            datefmt='%d/%m/%Y %I:%M:%S %p'
        )
        self.logger = logging.getLogger("TextPostprocessor")

        self.producer = Producer()
        self.consumer = Consumer()
        self.consumed_msg_schema = TextPostprocessingConsumedMsgSchema()
        self.produced_msg_schema = ReadyProducedMsgSchema()
        self.text_postprocessor = TextPostprocessor()

    def run(self):
        try:
            topics_to_subscribe = [KafkaTopic.TEXT_POSTPROCESSING.value]
            self.consumer.subscribe(topics_to_subscribe)
            self.logger.debug(f'Consumer subscribed to topic(s): '
                              f'{topics_to_subscribe}')

            while True:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f'{msg.topic()} in partition {msg.partition()} '
                            f'reached end at offset {msg.offset()}')
                    else:
                        self.logger.error(
                            f"Error in consumer loop: {msg.error()}")
                        raise KafkaException(msg.error())
                else:
                    self.logger.debug(f'Message consumed: [key]: {msg.key()}, '
                                      f'[value]: "{msg.value()[:500]} [...]"')

                    topic = KafkaTopic.READY.value
                    message_key = msg.key()

                    data = self.consumed_msg_schema.loads(msg.value())
                    summary = data.pop('summary')
                    postprocessed_text = self.text_postprocessor.postprocess(summary)
                    data['text_postprocessed'] = postprocessed_text
                    message_value = self.produced_msg_schema.dumps(data)

                    self._produce_message(topic, message_key, message_value)
                    self.logger.debug(f'Message produced: [topic]: "{topic}", '
                                      f'[key]: {message_key}, [value]: '
                                      f'"{message_value[:500]} [...]"')
        finally:
            self.logger.debug("Consumer loop stopped. Closing consumer...")
            self.consumer.close()  # close down consumer to commit final offsets

    def _produce_message(self,
                         topic: str,
                         message_key: str,
                         message_value: str):
        """Produce a Kafka message.

        If the local producer queue is full, the request will be aborted.

        Args:
            topic (:obj:`str`): The topic to produce the message to.
            message_key (:obj:`str`): The Kafka message key.
            message_value (:obj:`str`): The Kafka message value.
        """
        try:
            self.producer.produce(topic,
                                  key=message_key,
                                  value=message_value,
                                  on_delivery=self._kafka_delivery_callback)
        except BufferError as err:
            error_msg = (f"Local producer queue is full ({len(self.producer)} "
                         f"messages awaiting delivery)")
            self.logger.error(error_msg)
            raise Exception(error_msg) from err

        # Wait up to 1 second for events. Callbacks will
        # be invoked during this method call.
        self.producer.poll(1)

    def _kafka_delivery_callback(self, err: KafkaError, msg: Message):
        """Kafka per-message delivery callback.

        When passed to :meth:`confluent_kafka.Producer.produce` through
        the :attr:`on_delivery` attribute, this method will be triggered
        by :meth:`confluent_kafka.Producer.poll` or
        :meth:`confluent_kafka.Producer.flush` when either a message has
        been successfully delivered or the delivery failed (after the
        specified retries).

        Args:
            err (:obj:`confluent_kafka.KafkaError`): The Kafka error.
            msg (:obj:`confluent_kafka.Message`): The produced message,
                or an event.
        """
        if err:
            self.logger.debug(f'Message delivery failed: {err}')
        else:
            self.logger.debug(
                f'Message delivered successfully: [topic]: '
                f'"{msg.topic()}", [partition]: "{msg.partition()}"'
                f', [offset]: {msg.offset()}')
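The run() loop above only exits through an exception, at which point the finally block commits final offsets by closing the consumer. A common complement (an assumption here, not part of the original code) is to translate SIGTERM into SystemExit so that container-style shutdowns also unwind through that finally block:

import signal
import sys

def _graceful_exit(signum, frame):
    # SystemExit propagates out of consumer.poll(), triggering run()'s finally.
    sys.exit(0)

signal.signal(signal.SIGTERM, _graceful_exit)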
class TextSummarizerService:
    """Text summarizer service."""

    def __init__(self, log_level):
        self.log_level = log_level
        logging.basicConfig(
            format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
            level=self.log_level,
            datefmt='%d/%m/%Y %I:%M:%S %p'
        )
        self.logger = logging.getLogger("TextSummarizer")

        self.logger.debug("Loading t5-large models...")
        self.summarizer = Summarizer(TOKENIZER_PATH, MODEL_PATH)
        self.logger.debug("Models loaded!")

        self.producer = Producer()
        self.consumer = Consumer()
        self.consumed_msg_schema = TextSummarizationConsumedMsgSchema()
        self.produced_msg_schema = TextPostprocessingProducedMsgSchema()

    def run(self):
        try:
            topics_to_subscribe = [KafkaTopic.TEXT_SUMMARIZATION.value]
            self.consumer.subscribe(topics_to_subscribe)
            self.logger.debug(f'Consumer subscribed to topic(s): '
                              f'{topics_to_subscribe}')

            while True:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f'{msg.topic()} in partition {msg.partition()} '
                            f'reached end at offset {msg.offset()}')
                    else:
                        self.logger.error(
                            f"Error in consumer loop: {msg.error()}")
                        raise KafkaException(msg.error())
                else:
                    self.logger.debug(f'Message consumed: [key]: {msg.key()}, '
                                      f'[value]: "{msg.value()[:500]} [...]"')

                    topic = KafkaTopic.TEXT_POSTPROCESSING.value
                    message_key = msg.key()

                    data = self.consumed_msg_schema.loads(msg.value())
                    serialized_encoded_text = data.pop('text_encodings')
                    encoded_text = pickle.loads(serialized_encoded_text)
                    params = self._clean_up_params_and_add_defaults(data['params'])
                    data['params'] = params  # keep only the valid params
                    summarized_text = self.summarizer.summarize(encoded_text, **params)
                    data['summary'] = summarized_text
                    message_value = self.produced_msg_schema.dumps(data)

                    self._produce_message(topic, message_key, message_value)
                    self.logger.debug(f'Message produced: [topic]: "{topic}", '
                                      f'[key]: {message_key}, [value]: '
                                      f'"{message_value[:500]} [...]"')
        finally:
            self.logger.debug("Consumer loop stopped. Closing consumer...")
            self.consumer.close()  # close down consumer to commit final offsets

    def _clean_up_params_and_add_defaults(self, params: dict) -> dict:
        """Ignore invalid params and add default values.

        Up to this point, the parameters have not been checked in any
        step, so the attribute ``params`` could contain invalid
        parameters. If that is the case, only the correct ones are kept;
        the invalid ones are ignored. This method also adds default
        values for any parameter not present in :obj:`params`.

        Args:
            params (:obj:`dict`): The unchecked parameters.

        Returns:
            :obj:`dict`: The valid parameters, with defaults set if needed.
        """
        supported_params = [param.name.lower() for param in DefaultParams]
        invalid_params = {}
        for key in params:
            if key not in supported_params:
                invalid_params[key] = params[key]
        for invalid in invalid_params:  # remove invalid params
            params.pop(invalid)
        for default_param in supported_params:
            if default_param not in params:  # add params not included
                params[default_param] = DefaultParams[default_param.upper()].value
        self.logger.debug(f"Valid params: {params}")
        if invalid_params:
            self.logger.debug(f"Invalid params: {invalid_params}")
        return params

    def _produce_message(self,
                         topic: str,
                         message_key: str,
                         message_value: str):
        """Produce a Kafka message.

        If the local producer queue is full, the request will be aborted.

        Args:
            topic (:obj:`str`): The topic to produce the message to.
            message_key (:obj:`str`): The Kafka message key.
            message_value (:obj:`str`): The Kafka message value.
        """
        try:
            self.producer.produce(topic,
                                  key=message_key,
                                  value=message_value,
                                  on_delivery=self._kafka_delivery_callback)
        except BufferError as err:
            error_msg = (f"Local producer queue is full ({len(self.producer)} "
                         f"messages awaiting delivery)")
            self.logger.error(error_msg)
            raise Exception(error_msg) from err

        # Wait up to 1 second for events. Callbacks will
        # be invoked during this method call.
        self.producer.poll(1)

    def _kafka_delivery_callback(self, err: KafkaError, msg: Message):
        """Kafka per-message delivery callback.

        When passed to :meth:`confluent_kafka.Producer.produce` through
        the :attr:`on_delivery` attribute, this method will be triggered
        by :meth:`confluent_kafka.Producer.poll` or
        :meth:`confluent_kafka.Producer.flush` when either a message has
        been successfully delivered or the delivery failed (after the
        specified retries).

        Args:
            err (:obj:`confluent_kafka.KafkaError`): The Kafka error.
            msg (:obj:`confluent_kafka.Message`): The produced message,
                or an event.
        """
        if err:
            self.logger.debug(f'Message delivery failed: {err}')
        else:
            self.logger.debug(
                f'Message delivered successfully: [topic]: '
                f'"{msg.topic()}", [partition]: "{msg.partition()}"'
                f', [offset]: {msg.offset()}')
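To make the clean-up semantics concrete, here is a stand-in illustration. The member names and values of DefaultParams below are hypothetical; the real enum is not shown in these listings:

from enum import Enum

class DefaultParams(Enum):  # hypothetical members, for illustration only
    RELATIVE_MAX_LENGTH = 0.4
    RELATIVE_MIN_LENGTH = 0.1

params = {"relative_max_length": 0.6, "bogus_param": 42}
# _clean_up_params_and_add_defaults(params) would then:
#   - drop "bogus_param" (not a DefaultParams member),
#   - keep the supplied "relative_max_length" (0.6),
#   - fill in "relative_min_length" with its default (0.1),
# returning {"relative_max_length": 0.6, "relative_min_length": 0.1}.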
class TextEncoderService:
    """Text encoder service."""

    def __init__(self, log_level):
        self.log_level = log_level
        logging.basicConfig(
            format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
            level=self.log_level,
            datefmt='%d/%m/%Y %I:%M:%S %p'
        )
        self.logger = logging.getLogger("TextEncoder")

        self.logger.debug("Loading t5-large tokenizer...")
        self.text_encoder = SplitterEncoder(TOKENIZER_PATH,
                                            debug=True)  # TODO: change debug to False
        self.logger.debug("Tokenizer loaded!")

        self.producer = Producer()
        self.consumer = Consumer()
        self.consumed_msg_schema = TextEncodingsConsumedMsgSchema()
        self.produced_msg_schema = TextSumarizationProducedMsgSchema()

    def run(self):
        try:
            topics_to_subscribe = [KafkaTopic.TEXT_ENCODING.value]
            self.consumer.subscribe(topics_to_subscribe)
            self.logger.debug(f'Consumer subscribed to topic(s): '
                              f'{topics_to_subscribe}')

            while True:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f'{msg.topic()} in partition {msg.partition()} '
                            f'reached end at offset {msg.offset()}')
                    else:
                        self.logger.error(
                            f"Error in consumer loop: {msg.error()}")
                        raise KafkaException(msg.error())
                else:
                    self.logger.debug(f'Message consumed: [key]: {msg.key()}, '
                                      f'[value]: "{msg.value()[:500]} [...]"')

                    data = self.consumed_msg_schema.loads(msg.value())
                    # In the future, when more models are supported, we will
                    # have to produce to the topic of the requested model.
                    data.pop('model')
                    topic = KafkaTopic.TEXT_SUMMARIZATION.value
                    message_key = msg.key()

                    text_preprocessed = data.pop('text_preprocessed')
                    encoded_text = self.text_encoder.encode(text_preprocessed)
                    serialized_encoded_text = pickle.dumps(encoded_text)  # bytes type
                    data['text_encodings'] = serialized_encoded_text
                    message_value = self.produced_msg_schema.dumps(data)

                    self._produce_message(topic, message_key, message_value)
                    self.logger.debug(f'Message produced: [topic]: "{topic}", '
                                      f'[key]: {message_key}, [value]: '
                                      f'"{message_value[:500]} [...]"')
        finally:
            self.logger.debug("Consumer loop stopped. Closing consumer...")
            self.consumer.close()  # close down consumer to commit final offsets

    def _produce_message(self,
                         topic: str,
                         message_key: str,
                         message_value: str):
        """Produce a Kafka message.

        If the local producer queue is full, the request will be aborted.

        Args:
            topic (:obj:`str`): The topic to produce the message to.
            message_key (:obj:`str`): The Kafka message key.
            message_value (:obj:`str`): The Kafka message value.
        """
        try:
            self.producer.produce(topic,
                                  key=message_key,
                                  value=message_value,
                                  on_delivery=self._kafka_delivery_callback)
        except BufferError as err:
            error_msg = (f"Local producer queue is full ({len(self.producer)} "
                         f"messages awaiting delivery)")
            self.logger.error(error_msg)
            raise Exception(error_msg) from err

        # Wait up to 1 second for events. Callbacks will
        # be invoked during this method call.
        self.producer.poll(1)

    def _kafka_delivery_callback(self, err: KafkaError, msg: Message):
        """Kafka per-message delivery callback.

        When passed to :meth:`confluent_kafka.Producer.produce` through
        the :attr:`on_delivery` attribute, this method will be triggered
        by :meth:`confluent_kafka.Producer.poll` or
        :meth:`confluent_kafka.Producer.flush` when either a message has
        been successfully delivered or the delivery failed (after the
        specified retries).

        Args:
            err (:obj:`confluent_kafka.KafkaError`): The Kafka error.
            msg (:obj:`confluent_kafka.Message`): The produced message,
                or an event.
        """
        if err:
            self.logger.debug(f'Message delivery failed: {err}')
        else:
            self.logger.debug(
                f'Message delivered successfully: [topic]: '
                f'"{msg.topic()}", [partition]: "{msg.partition()}"'
                f', [offset]: {msg.offset()}')
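For reference, the encodings hand-off between this service and the summarizer is a plain pickle round-trip: the bytes produced here are exactly what TextSummarizerService.run() unpickles from 'text_encodings'. A self-contained sketch (the nested-list payload is an assumption; the listings never show what SplitterEncoder.encode() returns):

import pickle

encoded_text = [[101, 2023, 2003], [102]]      # stand-in for encode()'s output
payload = pickle.dumps(encoded_text)           # bytes placed in the Kafka message
assert pickle.loads(payload) == encoded_text   # recovered intact by the consumer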