def __init__(self):
    """Initialize the WebDAV client using connection options from application settings."""
    conf = get_settings()
    options = {
        'webdav_hostname': conf.webdav_url,
        'webdav_login': conf.webdav_username,
        'webdav_password': conf.webdav_password,
        # skip the initial availability check on construction
        'disable_check': True,
    }
    super().__init__(options)
def extract_page_images(pdf_fn: str,
                        start_page: int = None,
                        end_page: int = None,
                        pdf_password: str = None,
                        timeout_sec: int = 1800,
                        dpi: int = 300) -> Generator[List[str], None, None]:
    """Render PDF pages to PNG images using the PDFBox PDFToImage CLI tool.

    Yields a single list of image file paths ordered by page number. The images
    live in a temporary directory that is removed after the caller returns from
    the yield, so the caller must consume/copy them inside the generator scope.

    :param pdf_fn: path to the source PDF file.
    :param start_page: first page to render (1-based, passed to -startPage); None = from the start.
    :param end_page: last page to render (passed to -endPage); None = to the end.
    :param pdf_password: password for encrypted PDFs, if any.
    :param timeout_sec: max seconds to wait for the Java subprocess.
    :param dpi: render resolution.
    """
    java_modules_path = get_settings().java_modules_path
    temp_dir = mkdtemp(prefix='pdf_images_')
    # base name without extension, used as the output file name prefix
    basefn = os.path.splitext(os.path.basename(pdf_fn))[0]
    try:
        args = ['java', '-cp', f'{java_modules_path}/*',
                'org.apache.pdfbox.tools.PDFToImage',
                '-format', 'png',
                '-dpi', f'{dpi}',
                '-quality', '1',
                '-prefix', f'{temp_dir}/{basefn}__']
        if pdf_password:
            args += ['-password', pdf_password]
        if start_page is not None:
            args += ['-startPage', str(start_page)]
        if end_page is not None:
            args += ['-endPage', str(end_page)]
        # input file must come last on the PDFToImage command line
        args += [pdf_fn]
        completed_process: CompletedProcess = subprocess.run(args,
                                                            check=False,
                                                            timeout=timeout_sec,
                                                            universal_newlines=True,
                                                            stderr=PIPE,
                                                            stdout=PIPE)
        # raise_from_process converts a non-zero exit into an exception with process output attached
        raise_from_process(log, completed_process,
                           process_title=lambda: f'Extract page images from {pdf_fn}')
        raise_from_pdfbox_error_messages(completed_process)

        # Output of PDFToImage is a set of files with the names generated as:
        # {prefix}+{page_num_1_based}.{ext}
        # We used "{temp_dir}/{basefn}__" as the prefix.
        # Now we need to get the page numbers from the filenames and return the list of file names
        # ordered by page number.
        page_by_num: Dict[int, str] = dict()
        for fn in os.listdir(temp_dir):
            # NOTE(review): assumes PAGE_NUM_RE always matches PDFToImage output names;
            # a non-matching name would raise AttributeError here.
            page_num = PAGE_NUM_RE.search(os.path.splitext(fn)[0]).group(0)
            page_by_num[int(page_num)] = os.path.join(temp_dir, fn)
        yield [page_by_num[key] for key in sorted(page_by_num.keys())]
    finally:
        # always remove the rendered images; errors during cleanup are ignored
        shutil.rmtree(temp_dir, ignore_errors=True)
def convert_to_pdf(src_fn: str,
                   soffice_single_process_locking: bool = True,
                   timeout_sec: int = 1800) -> Generator[str, None, None]:
    """
    Converts the specified file to pdf using Libre Office CLI.
    Libre Office allows specifying the output directory and does not allow
    specifying the output file name.
    The output file name is generated by changing the extension to ".pdf".
    To avoid file name conflicts and additional operations the output file is
    written into a temporary directory and next yielded to the caller.
    After returning from the yield the output file and the output temp directory
    are removed.

    Raster image inputs (.tiff/.jpg/.jpeg/.png) are converted with the in-house
    Java tool MakePDFFromImages instead of Libre Office.

    :param src_fn: path to the source file; must exist.
    :param soffice_single_process_locking: serialize soffice invocations via a shared lock.
    :param timeout_sec: max seconds for the conversion subprocess.
    :raises InputFileDoesNotExist: if src_fn is not a file.
    :raises OutputPDFDoesNotExistAfterConversion: if the converter exited without producing output.
    """
    if not os.path.isfile(src_fn):
        raise InputFileDoesNotExist(src_fn)
    temp_dir = tempfile.mkdtemp()
    src_fn_base = os.path.basename(src_fn)
    src_fn_base, src_ext = os.path.splitext(src_fn_base)
    # converters derive the output name by swapping the extension to .pdf
    out_fn = os.path.join(temp_dir, src_fn_base + '.pdf')
    try:
        if src_ext.lower() in {'.tiff', '.jpg', '.jpeg', '.png'}:
            # image input: use the Java image-to-PDF tool, not soffice
            java_modules_path = get_settings().java_modules_path
            args = ['java', '-cp', f'{java_modules_path}/*',
                    'com.lexpredict.textextraction.MakePDFFromImages',
                    out_fn, src_fn]
            completed_process: CompletedProcess = _run_process(args, timeout_sec)
        else:
            args = ['soffice', '--headless', '--invisible', '--nodefault', '--view',
                    '--nolockcheck', '--nologo', '--norestore', '--nofirststartwizard',
                    '--convert-to', 'pdf', src_fn, '--outdir', temp_dir]
            # We are using "soffice" (Libre Office) to "print" any document to pdf
            # and it seems not allowing running more than one copy of the process in some environments.
            # The following is a workaround mostly for in-container usage.
            # There is no guaranty that it will work on a dev machine when there is an "soffice" process
            # started by some other app/user.
            if soffice_single_process_locking:
                with get_lock('soffice_single_process',
                              wait_required_listener=lambda: log.info(
                                  'Waiting for another conversion task to finish first...')):
                    completed_process: CompletedProcess = _run_process(args, timeout_sec)
            else:
                completed_process: CompletedProcess = _run_process(args, timeout_sec)
        raise_from_process(log, completed_process, lambda: f'Converting {src_fn} to pdf.')
        # soffice may exit 0 without producing output (e.g. unsupported format) - verify explicitly
        if not os.path.isfile(out_fn):
            raise OutputPDFDoesNotExistAfterConversion(f'Unable to convert {src_fn} to pdf. '
                                                       f'Output file does not exist after conversion.\n'
                                                       + render_process_msg(completed_process))
        yield out_fn
    finally:
        # remove the single expected output file, then the (now empty) temp dir
        if os.path.isfile(out_fn):
            os.remove(out_fn)
        if os.path.isdir(temp_dir):
            os.rmdir(temp_dir)
def __init__(self, pool, max_concurrency, min_concurrency=0, worker=None,
             keepalive=AUTOSCALE_KEEPALIVE, mutex=None):
    """Autoscaler that shuts the worker down after a configured idle period.

    Signature matches the base Celery autoscaler; all positional parameters are
    forwarded to ``super().__init__`` unchanged.
    """
    super().__init__(pool, max_concurrency, min_concurrency, worker, keepalive, mutex)
    # local import, presumably to avoid a circular import at module load - TODO confirm
    from text_extraction_system.config import get_settings
    # idle period (seconds) after which the worker should shut down
    self.cool_down_period_sec = get_settings().celery_shutdown_when_no_tasks_longer_than_sec
    # timestamp of when the queue first became empty; None while tasks are present
    self.no_tasks_start: Optional[float] = None
    # Fix: the original literal contained a stray "f" before the placeholder
    # ('...more than f{...} seconds') which printed e.g. "more than f300 seconds".
    print(f'Configuring celery to shutdown when there were no tasks '
          f'for more than {self.cool_down_period_sec} seconds.')
def merge_pdf_pages(original_pdf_fn: str,
                    page_pdf_dir: str = None,
                    single_page_merge_num_file_rotate: Tuple[int, str, Optional[float]] = None,
                    original_pdf_password: str = None,
                    timeout_sec: int = 3000) -> Generator[str, None, None]:
    """Merge replacement page layers into a PDF using the Java MergeInPageLayers tool.

    Yields the path of the merged PDF located in a temporary directory; the
    directory (and the merged file) is removed after the caller returns from
    the yield, so the result must be consumed within the generator scope.

    :param original_pdf_fn: path to the original PDF.
    :param page_pdf_dir: optional directory with per-page replacement PDFs (--page-dir).
    :param single_page_merge_num_file_rotate: optional (page_num, page_pdf_fn, rotate_angle)
        triple to merge a single page; rotate_angle may be None for no rotation.
    :param original_pdf_password: password for encrypted PDFs, if any.
    :param timeout_sec: max seconds to wait for the Java subprocess.
    """
    temp_dir = mkdtemp()
    try:
        dst_pdf_fn = os.path.join(temp_dir, os.path.basename(original_pdf_fn))
        java_modules_path = get_settings().java_modules_path
        args = ['java', '-cp', f'{java_modules_path}/*',
                'com.lexpredict.textextraction.mergepdf.MergeInPageLayers',
                '--original-pdf', original_pdf_fn,
                '--dst-pdf', dst_pdf_fn]
        if page_pdf_dir:
            args += ['--page-dir', page_pdf_dir]
        if single_page_merge_num_file_rotate:
            merge_page_num, merge_page_fn, merge_page_rotate = single_page_merge_num_file_rotate
            args += [f'{merge_page_num}={merge_page_fn}']
            # Fix: the angle is Optional[float]; the original truthiness test
            # silently dropped an explicitly requested 0.0 rotation.
            if merge_page_rotate is not None:
                args += [f'rotate_{merge_page_num}={merge_page_rotate}']
        if original_pdf_password:
            args += ['--password', original_pdf_password]
        completed_process: CompletedProcess = subprocess.run(args,
                                                            check=False,
                                                            timeout=timeout_sec,
                                                            universal_newlines=True,
                                                            stderr=PIPE,
                                                            stdout=PIPE)
        # Fix: the original process title was copy-pasted from the OCR image
        # extraction routine and misdescribed this operation in error reports.
        raise_from_process(log, completed_process,
                           process_title=lambda: f'Merge page layers into {original_pdf_fn}')
        raise_from_pdfbox_error_messages(completed_process)
        yield dst_pdf_fn
    finally:
        # Fix: ignore cleanup errors, consistent with the other temp-dir cleanups
        # in this module (extract_page_images, extract_page_ocr_images, ...).
        shutil.rmtree(temp_dir, ignore_errors=True)
def re_schedule_unknown_pending_tasks(log: Logger, app) -> List[Tuple[str, str]]:
    """Re-publish tasks registered as pending in WebDAV but missing from the Redis queue.

    For each unknown/lost task id, the original message (previously pickled to
    WebDAV under the pending-tasks folder) is loaded and re-published to the
    Celery broker with its original routing key, headers and retry policy.

    :param log: logger to report progress and failures to.
    :param app: Celery app used to determine the set of unknown pending tasks.
    :return: list of (task_id, task_name) pairs that were successfully re-published.
    :raises Exception: if the configured broker is not Redis.
    """
    conf = get_settings()
    webdav_client = get_webdav_client()
    broker_url = conf.celery_broker
    if not broker_url.startswith('redis:'):
        raise Exception('Only Redis broker supported for the task health tracking.')
    restarted_tasks: List[Tuple[str, str]] = list()
    failed_to_restart_tasks: List[Tuple[str, str]] = list()
    start_time = datetime.now()
    unknown_pending_tasks = get_unknown_pending_tasks(app)
    for task_id in unknown_pending_tasks:
        # default used in log messages if the stored info cannot be read
        task_name: Optional[str] = 'unknown'
        try:
            # stored task info contains the original message body, headers and routing data
            task_info: Dict = webdav_client.unpickle(remote_path=f'{tasks_pending}/{task_id}')
            task_name = task_info['headers']['task'] or 'unknown'
            with Connection(broker_url) as conn:
                producer = conn.Producer(serializer='json')
                # delivery_mode=2 marks the message persistent
                producer.publish(task_info['body'],
                                 routing_key=task_info['routing_key'],
                                 delivery_mode=2,
                                 serializer='pickle',
                                 headers=task_info['headers'],
                                 exchange=task_info['exchange'],
                                 retry=task_info['retry_policy'] is not None,
                                 retry_policy=task_info['retry_policy'])
            restarted_tasks.append((task_id, task_name))
        except RemoteResourceNotFound:
            # pending-info file already deleted -> the task completed meanwhile
            log.warning(f'Unable to restart lost pending task '
                        f'because it has been completed already: #{task_id} - {task_name}')
        except Exception as ex:
            # best-effort: record the failure and continue with the remaining tasks
            failed_to_restart_tasks.append((task_id, task_name))
            log.error(f'Unable to restart lost pending task: #{task_id} - {task_name}', exc_info=ex)
    if unknown_pending_tasks:
        time_spent = datetime.now() - start_time
        msg = f'Found {len(unknown_pending_tasks)} and restarted {len(restarted_tasks)} unknown/lost tasks ' \
              f'registered at Webdav but not found in Redis queue.\n' \
              f'Time spent: {time_spent}\n'
        if restarted_tasks:
            msg += f'Restarted tasks:\n' + '\n'.join([' - '.join(item) for item in restarted_tasks])
        if failed_to_restart_tasks:
            msg += f'Failed to restart tasks:\n' + '\n'.join([' - '.join(item) for item in failed_to_restart_tasks])
        log.info(msg)
    return restarted_tasks
def tika_extract_xhtml(src_fn: str) -> str:
    """Run the Tika CLI on the given file and return the extracted XHTML as a string.

    Configures the LexNLP Tika fork via environment variables before launching
    the Java process; returns an empty string when the process yields no output.
    """
    conf: Settings = get_settings()
    encoding_name = 'utf-8'
    # configure the LexNLP-patched Tika parser behavior through the environment
    os.environ['LEXNLP_TIKA_PARSER_MODE'] = 'pdf_only'
    # other possible values are 'coords_embedded' and ''
    os.environ['LEXNLP_TIKA_XML_DETAIL'] = 'coords_flat'

    cmd = ['java',
           '-cp', f'{conf.tika_jar_path}/*',
           '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider',
           'org.apache.tika.cli.TikaCLI',
           f'--config={conf.tika_config}',
           '-x',
           f'-e{encoding_name}',
           src_fn]

    def on_stderr(line):
        # forward Tika's stderr lines to our log as they arrive
        log.error(f'TIKA parsing {src_fn}:\n{line}')

    output = read_output(cmd,
                         stderr_callback=on_stderr,
                         encoding=encoding_name,
                         timeout_sec=60 * 20)
    return output or ''
def setup_loggers(*args, **kwargs):
    """Configure root-logger handlers from settings and quiet noisy pdfminer loggers.

    Replaces any existing handlers with a stdout handler (plain or JSON format)
    and/or a rotating JSON file handler, depending on configuration.
    Extra args/kwargs are accepted so this can be used as a signal handler.
    """
    conf = get_settings()
    root = logging.getLogger()
    root.handlers.clear()

    if conf.log_to_stdout:
        if conf.log_to_stdout_json:
            stdout_formatter = JSONFormatter()
        else:
            stdout_formatter = logging.Formatter('%(levelname)s %(asctime)s %(message)s',
                                                 datefmt='%Y-%m-%d %H:%M:%S')
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(stdout_formatter)
        root.addHandler(stream_handler)

    if conf.log_to_file:
        from logging.handlers import RotatingFileHandler
        # rotate at 10 MB, keep 5 backups
        file_handler = RotatingFileHandler(filename=conf.log_to_file,
                                           encoding='utf-8',
                                           maxBytes=10 * 1024 * 1024,
                                           backupCount=5)
        file_handler.setFormatter(JSONFormatter())
        root.addHandler(file_handler)

    # make pdfminer a bit more silent
    from pdfminer.pdfinterp import log as pdfinterp_log
    from pdfminer.pdfpage import log as pdfpage_log
    from pdfminer.pdfdocument import log as pdfdocument_log
    from pdfminer.converter import log as converter_log
    for noisy_log in (pdfinterp_log, pdfpage_log, pdfdocument_log, converter_log):
        noisy_log.setLevel(logging.WARNING)
def extract_page_ocr_images(pdf_fn: str,
                            start_page: int = None,
                            end_page: int = None,
                            pdf_password: str = None,
                            timeout_sec: int = 1800,
                            reset_page_rotation: bool = False,
                            dpi: int = 300) -> Generator[Dict[int, str], None, None]:
    """Render OCR-ready page images (page graphics with text removed) via the Java GetOCRImages tool.

    Yields a single dict mapping 1-based page number -> image file path. Pages
    that have no images needing OCR get no entry. The image files live in a
    temporary directory removed after the caller returns from the yield.

    :param pdf_fn: path to the source PDF file.
    :param start_page: first page to process (passed to --start-page); None = from the start.
    :param end_page: last page to process (passed to --end-page); None = to the end.
    :param pdf_password: password for encrypted PDFs, if any.
    :param timeout_sec: max seconds to wait for the Java subprocess.
    :param reset_page_rotation: pass --reset-page-rotation to the tool.
    :param dpi: render resolution.
    """
    java_modules_path = get_settings().java_modules_path
    temp_dir_no_text = mkdtemp(prefix='pdf_images_')
    # base name without extension, used as the output file name prefix
    basefn = os.path.splitext(os.path.basename(pdf_fn))[0]
    try:
        args = ['java', '-cp', f'{java_modules_path}/*',
                'com.lexpredict.textextraction.getocrimages.GetOCRImages',
                pdf_fn,
                '--format', 'png',
                '--dpi', f'{dpi}',
                '--output-prefix-no-text', f'{temp_dir_no_text}/{basefn}__']
        if pdf_password:
            args += ['--password', pdf_password]
        if start_page is not None:
            args += ['--start-page', str(start_page)]
        if end_page is not None:
            args += ['--end-page', str(end_page)]
        if reset_page_rotation:
            args += ['--reset-page-rotation']
        completed_process: CompletedProcess = subprocess.run(args,
                                                            check=False,
                                                            timeout=timeout_sec,
                                                            universal_newlines=True,
                                                            stderr=PIPE,
                                                            stdout=PIPE)
        raise_from_process(log, completed_process,
                           process_title=lambda: f'Extract page images for OCR needs (with text removed) from {pdf_fn}')
        raise_from_pdfbox_error_messages(completed_process)

        # Output of GetOCRImages is a set of files with the names generated as:
        # {prefix}+{page_num_1_based}.{ext}
        # We used "{temp_dir}/{basefn}__" as the prefix.
        # Now we need to get the page numbers from the filenames and return the list of file names
        # ordered by page number.
        # For the "no-text" images: for the pages having no images which are not overlapped with
        # any text element it stores no page image.
        page_by_num_no_text: Dict[int, str] = dict()
        for fn in os.listdir(temp_dir_no_text):
            # NOTE(review): assumes PAGE_NUM_RE always matches the tool's output names;
            # a non-matching name would raise AttributeError here.
            page_num = PAGE_NUM_RE.search(os.path.splitext(fn)[0]).group(0)
            page_no_text_fn = os.path.join(temp_dir_no_text, fn)
            page_by_num_no_text[int(page_num)] = page_no_text_fn
        yield page_by_num_no_text
    finally:
        # always remove the rendered images; errors during cleanup are ignored
        shutil.rmtree(temp_dir_no_text, ignore_errors=True)
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Generator[Tuple[str, TextAndPDFCoordinates, str, Dict[int, float]], None, None]:
    """Extract plain text, text structure and character coordinates from a PDF.

    Runs the Java GetTextFromPDF tool which writes a msgpack file describing
    the text, per-character bounding boxes and per-page data; then builds
    sentence/paragraph/section/page structures on the Python side.

    Yields exactly one tuple: (text, TextAndPDFCoordinates, corrected_pdf_fn,
    page_rotate_angles). The corrected PDF (when requested) lives in a temp dir
    removed after the caller returns from the yield.
    NOTE(review): the original annotation declared a plain Tuple return, but the
    function is a generator - annotation updated to Generator accordingly.

    :param pdf_fn: path to the source PDF file.
    :param pdf_password: password for encrypted PDFs, if any.
    :param timeout_sec: max seconds to wait for the Java subprocess.
    :param language: expected document language; empty means auto-detect per segment.
    :param correct_pdf: ask the tool to write a corrected copy of the PDF.
    :param render_coords_debug: render character rectangles for debugging (implies correct_pdf).
    """
    # text, structure, corrected_pdf_fn, page_rotate_angles
    if render_coords_debug:
        correct_pdf = True
    java_modules_path = get_settings().java_modules_path

    # Convert language to language code
    lang_converter = LanguageConverter()
    language, locale_code = lang_converter.get_language_and_locale_code(language)

    temp_dir = mkdtemp(prefix='pdf_text_')
    out_fn = os.path.join(temp_dir, os.path.splitext(os.path.basename(pdf_fn))[0] + '.msgpack')
    # unless correction is requested, the "corrected" path is just the input path
    out_pdf_fn = pdf_fn
    try:
        args = ['java', '-cp', f'{java_modules_path}/*',
                'com.lexpredict.textextraction.GetTextFromPDF',
                pdf_fn, out_fn,
                '-f', 'pages_msgpack']
        if pdf_password:
            args.append('-p')
            args.append(pdf_password)
        if correct_pdf:
            out_pdf_fn = os.path.join(temp_dir, os.path.splitext(os.path.basename(pdf_fn))[0] + '_corr.pdf')
            args.append('-corrected_output')
            args.append(out_pdf_fn)
        if render_coords_debug:
            args.append('-render_char_rects')
        completed_process: CompletedProcess = subprocess.run(args,
                                                            check=False,
                                                            timeout=timeout_sec,
                                                            universal_newlines=True,
                                                            stderr=PIPE,
                                                            stdout=PIPE)
        raise_from_process(log, completed_process,
                           process_title=lambda: f'Extract text and structure from {pdf_fn}')
        raise_from_pdfbox_error_messages(completed_process)
        with open(out_fn, 'rb') as pages_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(pages_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")

        if len(text) == 0:
            # empty document: yield empty structure and stop
            pdf_coordinates = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
            text_struct = PlainTextStructure(title='',
                                             language=language or 'en',  # FastText returns English for empty strings
                                             pages=[],
                                             sentences=[],
                                             paragraphs=[],
                                             sections=[])
            yield text, \
                TextAndPDFCoordinates(text_structure=text_struct, pdf_coordinates=pdf_coordinates), \
                out_pdf_fn, \
                None
            return

        page_rotate_angles: List[float] = [pdfpage['deskewAngle'] for pdfpage in pdfbox_res['pages']]

        pages = []
        num: int = 0
        for p in pdfbox_res['pages']:
            # 'location' holds the page's [start, end) character offsets in the plain text
            p_res = PlainTextPage(number=num, start=p['location'][0], end=p['location'][1],
                                  bbox=p['bbox'])
            pages.append(p_res)
            num += 1

        sentence_spans = get_sentence_span_list(text)
        lang = get_lang_detector()
        # if a language was supplied/converted, use it; otherwise detect per segment
        sentences = [PlainTextSentence(start=start,
                                       end=end,
                                       language=language or lang.predict_lang(segment))
                     for start, end, segment in sentence_spans]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [PlainTextParagraph(start=start,
                                         end=end,
                                         language=language or lang.predict_lang(segment))
                      for segment, start, end in get_paragraphs(text, return_spans=True)]

        sections = [PlainTextSection(title=sect.title,
                                     start=sect.start,
                                     end=sect.end,
                                     title_start=sect.title_start,
                                     title_end=sect.title_end,
                                     level=sect.level,
                                     abs_level=sect.abs_level)
                    for sect in get_document_sections_with_titles(text, sentence_list=sentence_spans)]

        try:
            title = next(get_titles(text))
        except StopIteration:
            # no title candidates found
            title = None

        text_struct = PlainTextStructure(title=title,
                                         language=language or lang.predict_lang(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)
        char_bboxes = pdfbox_res['charBBoxes']
        pdf_coordinates = PDFCoordinates(char_bboxes=char_bboxes)
        yield text, \
            TextAndPDFCoordinates(text_structure=text_struct, pdf_coordinates=pdf_coordinates), \
            out_pdf_fn, \
            page_rotate_angles
        return
    finally:
        # the msgpack file and any corrected PDF live here; remove them all
        shutil.rmtree(temp_dir, ignore_errors=True)
from text_extraction_system.data_extract.tables import get_table_dtos_from_camelot_output
from text_extraction_system.file_storage import get_webdav_client, WebDavClient
from text_extraction_system.pdf.convert_to_pdf import convert_to_pdf
from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
from text_extraction_system.request_metadata import RequestCallbackInfo, RequestMetadata, \
    save_request_metadata, \
    load_request_metadata
from text_extraction_system.result_delivery.celery_client import send_task
from text_extraction_system.task_health.task_health import store_pending_task_info_in_webdav, \
    remove_pending_task_info_from_webdav, re_schedule_unknown_pending_tasks, init_task_tracking
from text_extraction_system.utils import LanguageConverter
from text_extraction_system_api.dto import OutputFormat
from text_extraction_system_api.dto import RequestStatus, STATUS_FAILURE, STATUS_PENDING, STATUS_DONE

log = logging.getLogger(__name__)

# application settings loaded once at import time; also used by CeleryConfig below
settings = get_settings()


class CeleryConfig:
    """Celery configuration object (attribute names follow Celery's lowercase setting names)."""

    # report STARTED state for tasks
    task_track_started = True
    task_serializer = 'pickle'
    result_serializer = 'pickle'
    accept_content = ['pickle', 'json']
    # ack only after the task completes, and requeue if the worker dies mid-task
    task_acks_late = True
    task_reject_on_worker_lost = True
    worker_prefetch_multiplier = 1

    @property
    def worker_autoscaler(self) -> Optional[str]:
        # Use the shutdown-when-idle autoscaler only when an idle timeout is
        # configured; returns None (implicitly) otherwise so Celery keeps its default.
        if settings.celery_shutdown_when_no_tasks_longer_than_sec:
            return 'text_extraction_system.celery_autoscaler:ShutdownWhenNoTasksAutoscaler'
def get_scheduled_tasks_from_redis() -> Set[str]:
    """Return the ids of all task messages currently queued in the Redis Celery broker.

    Reads the raw 'celery' list from Redis; each entry is a JSON-serialized
    Celery message whose task id lives under headers -> id.
    """
    conf = get_settings()
    broker_url = conf.celery_broker
    r = Redis.from_url(broker_url)
    try:
        return {json.loads(item)['headers']['id'] for item in r.lrange('celery', 0, -1)}
    finally:
        # Fix: the original never closed the client, leaking broker connections
        # (one pool per call to this function).
        r.close()