def main(loop: asyncio.AbstractEventLoop) -> None:
    """Start the bot's services, share their handles and run the loop.

    The four start-up coroutines are gathered and run to completion on
    ``loop``. The resulting handles are then published as attributes on
    ``queuehandlers`` and ``messagehandlers`` so the handlers can reach
    them. The loop then runs until a ``KeyboardInterrupt`` arrives, at
    which point the matching exit coroutines are gathered and run.

    Args:
        loop: Event loop used for start-up, serving and shutdown.
    """
    startup = asyncio.gather(
        start_telethon(),
        start_pytonisadb(),
        start_rabbitmq(loop),
        start_pytonisa_file_storage(),
    )
    telethon, pytonisadb, rabbitmq, pytonisa_files = loop.run_until_complete(
        startup)

    # Only the queue handlers are given the Telegram client.
    queuehandlers.telegram = telethon
    for handlers in (queuehandlers, messagehandlers):
        handlers.rabbitmq = rabbitmq
        handlers.pytonisadb = pytonisadb
        handlers.pytonisa_files = pytonisa_files

    log.info('Bot initiated')

    try:
        loop.run_forever()
    except KeyboardInterrupt:
        shutdown = asyncio.gather(
            exit_telethon(telethon),
            exit_rabbitmq(rabbitmq),
            exit_pytonisadb(pytonisadb),
            exit_pytonisa_file_storage(pytonisa_files),
        )
        loop.run_until_complete(shutdown)
async def on_document_error(message: IncomingMessage):
    """Report a failed OCR request back to the chat that asked for it.

    The message body is decoded into the id of an OCR request, which is
    looked up in ``pytonisadb.ocr_requests``. Two Telegram messages are
    sent to the stored chat: a notice replying to the original message,
    followed by a dash-prefixed list of the recorded errors. The queue
    message is acknowledged once both have been sent.

    If the database handle has not been set yet, the handler waits ten
    seconds and negatively acknowledges the message instead.

    Args:
        message: Incoming queue message whose body is the OCR request id.
    """
    if pytonisadb is None:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the
        # supported spelling.
        log.warning(
            'on_document_error called before database is ready, sleeping 10 seconds')
        await asyncio.sleep(10)
        await message.nack()
        return

    ocr_request_id = message.body.decode()
    log.info('Sending error for document of id ' + ocr_request_id)

    document = pytonisadb.ocr_requests.get_item(ocr_request_id)
    queue_message = QueueMessage(**document)
    # ``ocr_args`` is unpacked as a mapping here; rebuild the args object.
    queue_message.ocr_args = OcrMyPdfArgs(**queue_message.ocr_args)

    await telegram.send_message(
        entity=queue_message.chat_id,
        message='Infelizmente não foi possível reconhecer seu pdf. O(s) seguinte(s) erro(s) ocorreu(ram):',
        reply_to=queue_message.message_id,
    )
    await telegram.send_message(
        entity=queue_message.chat_id,
        message='- ' + '\n- '.join(queue_message.errors),
    )
    await message.ack()
async def pdf_to_ocr(event: events.newmessage.NewMessage.Event) -> None:
    """Queue a PDF received over Telegram for OCR processing.

    Handles new messages matching the pattern '(^-)|(^$)' (empty, or
    starting with ``-``) that carry an attached pdf file. The attachment
    is downloaded and handed to the file storage, a request record is
    stored in the database, and the record id is published to the
    ``TO_PROCESS`` queue. The user is told when the file is received and
    again when it has been queued.

    Args:
        event (`events.newmessage.NewMessage.Event`):
            The new message event (from telethon)
    """
    incoming: custom.message.Message = event.message
    log.info('-' * 20 + 'pdf_to_ocr called' + '-' * 20)

    await incoming.reply('Arquivo recebido!')

    # The message text doubles as the OCR argument string.
    parsed_args = OcrMyPdfArgs(arg_string=incoming.message)
    log.info('Language set to: ' + ' '.join(parsed_args.language))

    target_path: str = os.path.join(pytonisa_files.get_valid_path(),
                                    incoming.file.name)
    # Continue with the path returned by the download, not the requested one.
    local_path = await incoming.download_media(file=target_path)
    pytonisa_files.upload_file(local_path)

    channel: Channel = rabbitmq['channel']

    request = QueueMessage(
        os.path.basename(local_path),
        incoming.chat_id,
        incoming.id,
        parsed_args,
    )
    # ``vars`` returns the instance's own attribute dict, so the nested
    # args object is swapped for its plain dict in place before storing.
    record: dict = vars(request)
    record['ocr_args'] = vars(record['ocr_args'])

    stored: dict = pytonisadb.ocr_requests.put_item(record)
    request_id: str = stored['_id']
    log.info(f'document of id {request_id} created')

    payload = bytes(request_id, 'utf-8')
    await channel.default_exchange.publish(
        Message(payload),
        routing_key=Queues.TO_PROCESS.value,
    )

    log.info('Arquivo inserido na fila para processamento')
    await incoming.respond('Arquivo inserido na fila para processamento')
    log.info('Finalizado')
async def on_document_processed(message: IncomingMessage):
    """Send a finished OCR document back to the chat that requested it.

    The message body is decoded into the id of an OCR request, which is
    looked up in ``pytonisadb.ocr_requests``. The file named by the
    record is fetched through ``pytonisa_files.download_file``. A notice
    is sent to the chat, and the file is then sent as a reply to the
    original message. The queue message is acknowledged afterwards.

    If the database handle has not been set yet, the handler waits ten
    seconds and negatively acknowledges the message instead.

    Args:
        message: Incoming queue message whose body is the OCR request id.
    """
    log.info('-' * 20 + 'on_document_processed called' + '-' * 20)

    if pytonisadb is None:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the
        # supported spelling.
        log.warning(
            'on_document_processed called before database is ready, sleeping 10 seconds')
        await asyncio.sleep(10)
        await message.nack()
        return

    ocr_request_id = message.body.decode()
    log.info('Sending processed document of id ' + ocr_request_id)

    document: dict = pytonisadb.ocr_requests.get_item(ocr_request_id)
    queue_message = QueueMessage(**document)
    # ``ocr_args`` is unpacked as a mapping here; rebuild the args object.
    queue_message.ocr_args = OcrMyPdfArgs(**queue_message.ocr_args)

    output_file = pytonisa_files.download_file(queue_message.file_name)

    await telegram.send_message(
        entity=queue_message.chat_id,
        message='OCR feito! Estamos fazendo upload do seu arquivo',
    )

    # Keep the file open only for the duration of the upload.
    with open(output_file, 'rb') as file:
        await telegram.send_message(
            entity=queue_message.chat_id,
            message='Aqui está!',
            reply_to=queue_message.message_id,
            file=file,
        )

    log.info('File sent')
    await message.ack()
def main() -> None:
    """Start the ocrmypdf worker and consume until interrupted.

    The RabbitMQ, database and file-storage handles are created and
    published as attributes on ``queuehandler``. The RabbitMQ channel
    then consumes until a ``KeyboardInterrupt`` arrives, after which the
    three services are shut down. Finally every thread in ``threads`` is
    joined before the function returns.
    """
    rabbitmq = start_rabbitmq()
    pytonisadb = start_pytonisadb()
    pytonisa_files = start_pytonisa_file_storage()

    queuehandler.rabbitmq = rabbitmq
    queuehandler.pytonisadb = pytonisadb
    queuehandler.pytonisa_files = pytonisa_files

    log.info('ocrmypdf processor initiated')

    try:
        consuming_channel: BlockingChannel = rabbitmq['channel']
        consuming_channel.start_consuming()
    except KeyboardInterrupt:
        log.info('ending ocrmypdf')
        # Shut the services down in the same order as before.
        exit_rabbitmq(rabbitmq)
        exit_pytonisadb(pytonisadb)
        exit_pytonisa_file_storage(pytonisa_files)

    # Wait for the threads tracked in ``threads`` to finish.
    for worker in threads:
        worker.join()

    log.info('ocrmypdf ended')
def on_document_to_process(channel: BlockingChannel, method: Basic.Deliver,
                           properties: BasicProperties,
                           body: Union[str, bytes]):
    """Run OCR on the document referenced by a queue message.

    The body carries the id of an OCR request stored in
    ``pytonisadb.ocr_requests``. The request's file is fetched through
    ``pytonisa_files``, processed in place with ``ocrmypdf.ocr`` and
    uploaded again. On success the record is marked as processed and an
    acknowledgement callback, routed to the ``PROCESSED`` queue, is
    scheduled on the connection via ``add_callback_threadsafe``.

    Failures are reported through ``handle_error``:

    * the request was already marked as started (treated as a repeat);
    * ``ocrmypdf.MissingDependencyError`` during OCR;
    * any other exception during OCR, including one raised by the
      forced-OCR retry that follows ``ocrmypdf.PriorOcrFoundError``.

    Args:
        channel: Channel the message was delivered on.
        method: Delivery metadata; its ``delivery_tag`` is passed on to
            the error and acknowledgement callbacks.
        properties: Message properties (unused).
        body: The OCR request id, as text or as encoded bytes.
    """
    connection: BlockingConnection = rabbitmq['connection']

    # The signature admits ``str`` bodies, which have no ``decode``.
    ocr_request_id = body if isinstance(body, str) else body.decode()

    report_error = partial(handle_error,
                           channel=channel,
                           ocr_request_id=ocr_request_id,
                           delivery_tag=method.delivery_tag)

    log.info('-' * 20 + ocr_request_id + '-' * 20)
    log.info('Processing document of id ' + ocr_request_id)

    document: dict = pytonisadb.ocr_requests.get_item(ocr_request_id)
    queue_message: QueueMessage = QueueMessage(**document)
    queue_message.ocr_args = OcrMyPdfArgs(**queue_message.ocr_args)

    if queue_message.started_processing:
        report_error(
            message=
            'Tentando processar um item repetido, provavelmente o servidor crashou no reconhecimento OCR anterior'
        )
        return

    # Record the start before doing any work so a repeat delivery is
    # caught by the check above.
    queue_message.started_processing = True
    pytonisadb.ocr_requests.update_item(ocr_request_id,
                                        {'started_processing': True})

    input_file = pytonisa_files.download_file(queue_message.file_name)
    output_file = input_file  # OCR result overwrites the downloaded file

    log.info('Iniciando processamento OCR')
    try:
        try:
            ocr_args = queue_message.ocr_args.__dict__
            ocrmypdf.ocr(input_file=input_file,
                         output_file=output_file,
                         **ocr_args)
        except ocrmypdf.PriorOcrFoundError:
            # The retry stays inside the outer ``try``: an exception
            # raised in an ``except`` clause is not seen by its sibling
            # clauses, so a failing retry would otherwise escape without
            # being reported.
            log.info('Arquivo já possui OCR')
            queue_message.ocr_args.set_force_ocr()
            pytonisadb.ocr_requests.update_item(
                ocr_request_id, {'ocr_args': queue_message.ocr_args.__dict__})
            ocr_args = queue_message.ocr_args.__dict__
            ocrmypdf.ocr(input_file=input_file,
                         output_file=output_file,
                         **ocr_args)
    except ocrmypdf.MissingDependencyError as mde:
        report_error(
            message='Não foi possível processar alguma das línguas solicitadas',
            e=mde,
        )
        return
    except Exception as e:
        report_error(
            message='Ocorreu um erro desconhecido',
            e=e,
        )
        return

    pytonisa_files.upload_file(output_file)

    queue_message.processed = True
    ocr_args = queue_message.ocr_args.__dict__
    pytonisadb.ocr_requests.update_item(ocr_request_id, {
        'processed': True,
        'ocr_args': ocr_args
    })

    log.info('Processamento OCR finalizado com sucesso!')

    # The acknowledgement is handed to the connection instead of being
    # issued directly from here.
    cb = partial(ack_message,
                 delivery_tag=method.delivery_tag,
                 routing_key=Queues.PROCESSED.value,
                 message=body)
    connection.add_callback_threadsafe(cb)