def run_document_conversion(data, _context=None):
    """Convert an uploaded document to PDF and queue it for PDF processing.

    The pubsub message is decoded with ``get_pubsub_data`` and must carry the
    keys ``doc_id``, ``slug`` and ``extension``.

    Raises:
        DocumentExtensionError: the lower-cased, stripped extension is not in
            ``SUPPORTED_DOCUMENT_EXTENSIONS``.
        DocumentSizeError: ``storage.size`` of the original file exceeds
            ``DOCUMENT_SIZE_LIMIT``; ``path.path(doc_id)`` is deleted first.
    """
    payload = get_pubsub_data(data)
    doc_id, slug, extension = payload["doc_id"], payload["slug"], payload["extension"]

    logger.info("[DOCUMENT CONVERSION] doc_id %s extension %s", doc_id, extension)

    # Only whitelisted extensions may be converted
    normalized_extension = extension.lower().strip()
    if normalized_extension not in SUPPORTED_DOCUMENT_EXTENSIONS:
        raise DocumentExtensionError()

    # Note: the path is built from the extension as received, not the
    # normalized form used for the whitelist check above
    original_file = path.original_path(doc_id, slug, extension)

    # Reject originals that are over the size limit, cleaning up first
    too_large = storage.size(original_file) > DOCUMENT_SIZE_LIMIT
    if too_large:
        storage.delete(path.path(doc_id))
        raise DocumentSizeError()

    # Convert, then discard the original upload
    convert(original_file, doc_id, slug)
    storage.delete(original_file)

    # Hand the same payload on to PDF processing (the converted output is
    # expected to be at the regular document path)
    message = encode_pubsub_data(payload)
    publisher.publish(PDF_PROCESS_TOPIC, data=message)
def trigger_processing():
    """Publish a PDF processing request for the document ``ID`` / ``SLUG``.

    The message is sent on ``PDF_PROCESS_TOPIC`` with private access.
    """
    payload = {
        "doc_id": ID,
        "slug": SLUG,
        "access": Access.private,
    }
    message = encode_pubsub_data(payload)
    publisher.publish(PDF_PROCESS_TOPIC, message)
def process_doc(request, _context=None):
    """Route a document processing request to the matching pipeline.

    The HTTP payload (decoded by ``get_http_data``) must contain ``doc_id``
    and ``method``; ``extension`` is optional and defaults to ``"pdf"``.

    Supported ``method`` values:
        * ``process_pdf`` - publish to ``PDF_PROCESS_TOPIC`` for PDFs, or to
          ``DOCUMENT_CONVERT_TOPIC`` for any other extension.
        * ``redact_doc`` - publish to ``REDACT_TOPIC``.
        * ``modify_doc`` - publish to ``MODIFY_TOPIC``.
        * ``cancel_doc_processing`` - call ``utils.clean_up`` for the doc.

    Returns:
        ``encode_response("Ok")`` for a supported method, or the bare string
        ``"Error"`` (after logging) for an unrecognized one.
    """
    payload = get_http_data(request)
    doc_id = payload["doc_id"]
    method = payload["method"]
    file_extension = payload.get("extension", "pdf").lower()

    # The processing environment is initialized for every request,
    # regardless of which method was asked for
    utils.initialize(REDIS, doc_id)

    # Cancellation is the only supported method that publishes nothing
    if method == "cancel_doc_processing":
        utils.clean_up(REDIS, doc_id)
        return encode_response("Ok")

    if method == "process_pdf":
        # Non-PDF files require conversion first
        topic = PDF_PROCESS_TOPIC if file_extension == "pdf" else DOCUMENT_CONVERT_TOPIC
    elif method == "redact_doc":
        topic = REDACT_TOPIC
    elif method == "modify_doc":
        topic = MODIFY_TOPIC
    else:
        logger.error(
            "Invalid doc processing type: %s", method, exc_info=sys.exc_info()
        )
        return "Error"

    # Every remaining method forwards the untouched payload to its topic
    publisher.publish(topic, data=encode_pubsub_data(payload))
    return encode_response("Ok")
def trigger_redacting(page_numbers):
    """Publish a redaction request for the document ``ID`` / ``SLUG``.

    Args:
        page_numbers: iterable of page numbers; each one becomes a
            ``{"page_number": ...}`` entry in the ``redactions`` list.
    """
    payload = {
        "doc_id": ID,
        "slug": SLUG,
        "access": Access.private,
    }

    # One redaction entry per requested page
    redactions = []
    for number in page_numbers:
        redactions.append({"page_number": number})
    payload["redactions"] = redactions

    publisher.publish(REDACT_TOPIC, encode_pubsub_data(payload))
def flush(queue):
    """Publish the queued entries for text position extraction, then empty the queue.

    Does nothing when ``queue`` is empty. ``doc_id``, ``slug``, ``access``,
    ``ocr_code`` and ``partial`` are not parameters: they are read from the
    surrounding scope.

    Args:
        queue: mutable list of queued entries; cleared in place after the
            message has been published.
    """
    if queue:
        # The message is encoded before the queue is cleared, so the
        # published payload carries the queued entries
        payload = {
            "paths_and_numbers": queue,
            "doc_id": doc_id,
            "slug": slug,
            "access": access,
            "ocr_code": ocr_code,
            "partial": partial,
            "in_memory": True,
        }
        publisher.publish(TEXT_POSITION_EXTRACT_TOPIC, encode_pubsub_data(payload))
        queue.clear()
def run_tesseract(data, _context=None):
    """Run OCR on the page images passed in, storing the extracted text.

    The pubsub message (decoded by ``get_pubsub_data``) must contain
    ``doc_id``, ``slug``, ``paths_and_numbers``, ``partial`` and
    ``force_ocr``. ``access`` defaults to ``access_choices.PRIVATE`` and
    ``ocr_code`` defaults to ``"eng"``.

    Each entry of ``paths_and_numbers`` is unpacked as
    ``(page_number, image_path)``. For every page the OCR text and PDF
    contents are written to Redis, the page is registered as OCR'd, and its
    page number is queued for text position extraction. The queue is
    published in batches of ``TEXT_POSITION_BATCH`` and once more at the end.

    Returns:
        * ``"Too slow, retrying"`` when ``PROFILE_CPU`` is set and the
          measured speed exceeds ``SPEED_THRESHOLD``; the job is republished
          on ``OCR_TOPIC`` first.
        * ``"Ok"`` when ``paths_and_numbers`` is empty.
        * Otherwise a JSON string with ``doc_id``, ``elapsed`` (per-page
          seconds), ``status`` and ``overall_elapsed``, plus ``speed`` and
          ``speed_after`` when ``PROFILE_CPU`` is set.
    """
    # pylint: disable=too-many-locals, too-many-statements
    overall_start = time.time()

    data = get_pubsub_data(data)
    doc_id = data["doc_id"]
    slug = data["slug"]
    access = data.get("access", access_choices.PRIVATE)
    ocr_code = data.get("ocr_code", "eng")
    paths_and_numbers = data["paths_and_numbers"]
    # Whether it is a partial update (e.g. redaction) or not
    partial = data["partial"]
    force_ocr = data["force_ocr"]

    # Forced OCR is recorded under a distinct version label
    ocr_version = f"{OCR_VERSION}_force" if force_ocr else OCR_VERSION

    logger.info(
        "[RUN TESSERACT] doc_id %s ocr_code %s ocr_version %s page_numbers %s",
        doc_id,
        ocr_code,
        ocr_version,
        ",".join(str(number[0]) for number in paths_and_numbers),
    )

    result = {}

    if PROFILE_CPU:
        # Perform speed thresholding to prevent running OCR on a slow CPU
        speed = profile_cpu(CPU_DIFFICULTY)
        if speed > SPEED_THRESHOLD:
            # Resubmit the identical job to the queue
            publisher.publish(
                OCR_TOPIC,
                data=encode_pubsub_data(
                    {
                        "paths_and_numbers": paths_and_numbers,
                        "doc_id": doc_id,
                        "slug": slug,
                        "access": access,
                        "ocr_code": ocr_code,
                        "partial": partial,
                        "force_ocr": force_ocr,
                    }
                ),
            )
            # Use the module logger, like every other message in this function
            logger.warning("Too slow (speed: %f)", speed)
            return "Too slow, retrying"

        result["speed"] = speed

    # Keep track of how long OCR takes (useful for profiling)
    elapsed_times = []

    if not paths_and_numbers:
        logger.warning("No paths/numbers")
        return "Ok"

    # Queue up text position extraction tasks
    queue = []

    def flush(queue):
        """Publish the queued page numbers for text position extraction."""
        if not queue:
            return

        # Trigger text position extraction pipeline
        publisher.publish(
            TEXT_POSITION_EXTRACT_TOPIC,
            encode_pubsub_data(
                {
                    "paths_and_numbers": queue,
                    "doc_id": doc_id,
                    "slug": slug,
                    "access": access,
                    "ocr_code": ocr_code,
                    "partial": partial,
                    "in_memory": True,
                }
            ),
        )

        queue.clear()

    def check_and_flush(queue):
        """Flush once the queue has reached the batch size."""
        if len(queue) >= TEXT_POSITION_BATCH:
            flush(queue)

    # Loop through all paths and numbers
    for page_number, image_path in paths_and_numbers:
        # The current OCR state of the page is looked up for logging only
        ocrd = utils.page_ocrd(REDIS, doc_id, page_number)
        logger.info(
            "[RUN TESSERACT] doc_id %s page_number %s ocrd %s",
            doc_id,
            page_number,
            ocrd,
        )

        text_path = path.page_text_path(doc_id, slug, page_number)

        # Benchmark OCR speed
        start_time = time.time()
        logger.info(
            "[RUN TESSERACT] doc_id %s page %s start_time %s",
            doc_id,
            page_number,
            start_time,
        )
        text, pdf_contents = ocr_page(doc_id, image_path, text_path, access, ocr_code)

        elapsed_time = time.time() - start_time
        elapsed_times.append(elapsed_time)
        logger.info(
            "[RUN TESSERACT] doc_id %s page %s elapsed_time %s",
            doc_id,
            page_number,
            elapsed_time,
        )

        # Write the output text and pdf to Redis
        utils.write_page_text(REDIS, doc_id, page_number, text, ocr_version, ocr_code)
        utils.write_page_text_pdf(REDIS, doc_id, page_number, pdf_contents)

        # Decrement the texts remaining
        utils.register_page_ocrd(REDIS, doc_id, page_number)

        # Queue text position extraction tasks
        queue.append(page_number)
        check_and_flush(queue)

    # Flush the remaining queue
    flush(queue)

    result["doc_id"] = doc_id
    result["elapsed"] = elapsed_times
    result["status"] = "Ok"
    result["overall_elapsed"] = time.time() - overall_start
    if PROFILE_CPU:
        result["speed_after"] = profile_cpu()
    return json.dumps(result)
def sidekick(request, _context=None):
    """Kick off sidekick preprocessing.

    Decodes the HTTP payload and republishes it unchanged on
    ``SIDEKICK_PREPROCESS_TOPIC``.

    Returns:
        ``encode_response("Ok")``.
    """
    payload = get_http_data(request)
    message = encode_pubsub_data(payload)
    publisher.publish(SIDEKICK_PREPROCESS_TOPIC, data=message)
    return encode_response("Ok")
def import_documents(request, _context=None):
    """Start the import process on an organization.

    Decodes the HTTP payload and republishes it unchanged on
    ``START_IMPORT_TOPIC``.

    Returns:
        ``encode_response("Ok")``, the same acknowledgement the other
        HTTP-triggered commands in this module (``sidekick``,
        ``process_doc``) return after publishing.
    """
    data = get_http_data(request)
    publisher.publish(START_IMPORT_TOPIC, data=encode_pubsub_data(data))
    # Previously this fell off the end and returned None, leaving the HTTP
    # caller without a response body
    return encode_response("Ok")