示例#1
0
    def __init__(self):
        """Build the WebDAV client from application settings.

        Connection checking is disabled so construction does not hit the server.
        """
        conf = get_settings()
        options = {
            'webdav_hostname': conf.webdav_url,
            'webdav_login': conf.webdav_username,
            'webdav_password': conf.webdav_password,
            'disable_check': True,
        }
        super().__init__(options)
示例#2
0
def extract_page_images(pdf_fn: str,
                        start_page: int = None,
                        end_page: int = None,
                        pdf_password: str = None,
                        timeout_sec: int = 1800,
                        dpi: int = 300) -> Generator[List[str], None, None]:
    """Render PDF pages to PNG files using PDFBox's PDFToImage CLI tool.

    Yields a single list of image file paths ordered by 1-based page number,
    then deletes the temporary directory holding them once the caller resumes.

    :param pdf_fn: path to the source PDF
    :param start_page: optional first page to render (1-based)
    :param end_page: optional last page to render (1-based)
    :param pdf_password: password for encrypted PDFs
    :param timeout_sec: subprocess timeout in seconds
    :param dpi: rendering resolution
    """
    java_modules_path = get_settings().java_modules_path

    temp_dir = mkdtemp(prefix='pdf_images_')
    basefn = os.path.splitext(os.path.basename(pdf_fn))[0]
    try:
        cmd = [
            'java', '-cp', f'{java_modules_path}/*',
            'org.apache.pdfbox.tools.PDFToImage',
            '-format', 'png',
            '-dpi', f'{dpi}',
            '-quality', '1',
            '-prefix', f'{temp_dir}/{basefn}__',
        ]
        if pdf_password:
            cmd.extend(['-password', pdf_password])
        if start_page is not None:
            cmd.extend(['-startPage', str(start_page)])
        if end_page is not None:
            cmd.extend(['-endPage', str(end_page)])
        cmd.append(pdf_fn)

        proc: CompletedProcess = subprocess.run(cmd,
                                                check=False,
                                                timeout=timeout_sec,
                                                universal_newlines=True,
                                                stderr=PIPE,
                                                stdout=PIPE)
        raise_from_process(
            log,
            proc,
            process_title=lambda: f'Extract page images from {pdf_fn}')

        raise_from_pdfbox_error_messages(proc)

        # PDFToImage names its output "{prefix}{page_num_1_based}.{ext}" and we
        # passed "{temp_dir}/{basefn}__" as the prefix; recover the page number
        # from each file name and return the paths sorted by page.
        images_by_page: Dict[int, str] = {}
        for name in os.listdir(temp_dir):
            num = int(PAGE_NUM_RE.search(os.path.splitext(name)[0]).group(0))
            images_by_page[num] = os.path.join(temp_dir, name)

        yield [images_by_page[num] for num in sorted(images_by_page)]

    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
def convert_to_pdf(src_fn: str,
                   soffice_single_process_locking: bool = True,
                   timeout_sec: int = 1800) -> Generator[str, None, None]:
    """
    Converts the specified file to pdf using Libre Office CLI.
    Libre Office allows specifying the output directory and does not allow specifying the output file name.
    The output file name is generated by changing the extension to ".pdf".
    To avoid file name conflicts and additional operations the output file is written into
    a temporary directory and next yielded to the caller.
    After returning from the yield the output file and the output temp directory are removed.

    :param src_fn: path of the file to convert; images (.tiff/.jpg/.jpeg/.png) are
        converted by the MakePDFFromImages Java tool, everything else by soffice
    :param soffice_single_process_locking: serialize soffice runs via a shared lock
    :param timeout_sec: subprocess timeout in seconds
    :raises InputFileDoesNotExist: if src_fn is missing
    :raises OutputPDFDoesNotExistAfterConversion: if conversion produced no output file
    """
    if not os.path.isfile(src_fn):
        raise InputFileDoesNotExist(src_fn)
    temp_dir = tempfile.mkdtemp()
    src_fn_base = os.path.basename(src_fn)
    src_fn_base, src_ext = os.path.splitext(src_fn_base)
    out_fn = os.path.join(temp_dir, src_fn_base + '.pdf')
    try:
        if src_ext.lower() in {'.tiff', '.jpg', '.jpeg', '.png'}:
            # Images are handled by our own Java tool instead of soffice.
            java_modules_path = get_settings().java_modules_path
            args = ['java', '-cp', f'{java_modules_path}/*',
                    'com.lexpredict.textextraction.MakePDFFromImages',
                    out_fn, src_fn]
            completed_process: CompletedProcess = _run_process(args, timeout_sec)
        else:
            args = ['soffice', '--headless', '--invisible', '--nodefault', '--view', '--nolockcheck',
                    '--nologo', '--norestore', '--nofirststartwizard', '--convert-to', 'pdf', src_fn,
                    '--outdir', temp_dir]

            # We are using "soffice" (Libre Office) to "print" any document to pdf
            # and it seems not allowing running more than one copy of the process in some environments.
            # The following is a workaround mostly for in-container usage.
            # There is no guaranty that it will work on a dev machine when there is an "soffice" process
            # started by some other app/user.
            if soffice_single_process_locking:
                with get_lock('soffice_single_process',
                              wait_required_listener=
                              lambda: log.info('Waiting for another conversion task to finish first...')):
                    completed_process: CompletedProcess = _run_process(args, timeout_sec)
            else:
                completed_process: CompletedProcess = _run_process(args, timeout_sec)

        raise_from_process(log, completed_process, lambda: f'Converting {src_fn} to pdf.')

        if not os.path.isfile(out_fn):
            raise OutputPDFDoesNotExistAfterConversion(f'Unable to convert {src_fn} to pdf. '
                                                       f'Output file does not exist after conversion.\n'
                                                       + render_process_msg(completed_process))
        yield out_fn

    finally:
        # Fix: the previous cleanup removed out_fn and then called os.rmdir(temp_dir),
        # which raises OSError if the converter left any extra file in the temp dir
        # (e.g. soffice lock/backup files) — and an exception raised in "finally"
        # masks the original error. Remove the whole tree best-effort, consistent
        # with the other temp-dir helpers in this module.
        shutil.rmtree(temp_dir, ignore_errors=True)
示例#4
0
 def __init__(self,
              pool,
              max_concurrency,
              min_concurrency=0,
              worker=None,
              keepalive=AUTOSCALE_KEEPALIVE,
              mutex=None):
     """Autoscaler that additionally tracks how long the worker had no tasks.

     The cool-down period comes from settings; when the worker stays idle longer
     than this period the celery process is expected to shut down.
     """
     super().__init__(pool, max_concurrency, min_concurrency, worker, keepalive, mutex)
     # Local import — presumably to avoid a circular import at module load; TODO confirm.
     from text_extraction_system.config import get_settings
     self.cool_down_period_sec = get_settings().celery_shutdown_when_no_tasks_longer_than_sec
     # Timestamp of when the task queue became empty; None while tasks exist.
     self.no_tasks_start: Optional[float] = None
     # Fix: the message used to render a stray "f" before the value
     # ("...for more than f{N} seconds.") because of a misplaced f-string prefix.
     print(f'Configuring celery to shutdown when there were no tasks '
           f'for more than {self.cool_down_period_sec} seconds.')
示例#5
0
def merge_pdf_pages(original_pdf_fn: str,
                    page_pdf_dir: str = None,
                    single_page_merge_num_file_rotate: Tuple[int, str, Optional[float]] = None,
                    original_pdf_password: str = None,
                    timeout_sec: int = 3000) \
        -> Generator[str, None, None]:
    """Merge per-page PDF layers into the original PDF via the MergeInPageLayers Java tool.

    Yields the path of the merged PDF located in a temporary directory which is
    removed after the caller resumes.

    :param original_pdf_fn: path of the PDF to merge layers into
    :param page_pdf_dir: optional directory with per-page PDF layer files
    :param single_page_merge_num_file_rotate: optional (page_num, page_pdf_fn, rotate_angle)
        for merging a single page; rotate_angle may be None
    :param original_pdf_password: password for encrypted PDFs
    :param timeout_sec: subprocess timeout in seconds
    """
    temp_dir = mkdtemp()
    try:
        dst_pdf_fn = os.path.join(temp_dir, os.path.basename(original_pdf_fn))

        java_modules_path = get_settings().java_modules_path
        args = [
            'java', '-cp', f'{java_modules_path}/*',
            'com.lexpredict.textextraction.mergepdf.MergeInPageLayers',
            '--original-pdf', original_pdf_fn, '--dst-pdf', dst_pdf_fn
        ]
        if page_pdf_dir:
            args += ['--page-dir', page_pdf_dir]

        if single_page_merge_num_file_rotate:
            merge_page_num, merge_page_fn, merge_page_rotate = single_page_merge_num_file_rotate
            args += [f'{merge_page_num}={merge_page_fn}']
            if merge_page_rotate:
                args += [f'rotate_{merge_page_num}={merge_page_rotate}']

        if original_pdf_password:
            args += ['--password', original_pdf_password]

        completed_process: CompletedProcess = subprocess.run(
            args,
            check=False,
            timeout=timeout_sec,
            universal_newlines=True,
            stderr=PIPE,
            stdout=PIPE)
        raise_from_process(
            log,
            completed_process,
            # Fix: the title previously said "Extract page images for OCR needs
            # (with text removed)" — copy-pasted from extract_page_ocr_images and
            # wrong for this operation.
            process_title=lambda: f'Merge page PDF layers into {original_pdf_fn}')

        raise_from_pdfbox_error_messages(completed_process)

        yield dst_pdf_fn
    finally:
        # Best-effort cleanup, consistent with the other temp-dir helpers in this
        # module (a cleanup failure here must not mask an in-flight exception).
        shutil.rmtree(temp_dir, ignore_errors=True)
def re_schedule_unknown_pending_tasks(log: Logger, app) -> List[Tuple[str, str]]:
    """Re-publish tasks registered as pending in WebDAV but missing from the Redis queue.

    For each "unknown pending" task id, loads the stored message from WebDAV and
    publishes it back to the Celery broker so a worker picks it up again.
    Only a Redis broker is supported.

    :param log: logger to report progress/failures to
    :param app: Celery app used to enumerate unknown pending tasks
    :return: list of (task_id, task_name) tuples that were successfully re-published
    :raises Exception: if the configured broker is not Redis
    """
    conf = get_settings()
    webdav_client = get_webdav_client()
    broker_url = conf.celery_broker
    if not broker_url.startswith('redis:'):
        raise Exception('Only Redis broker supported for the task health tracking.')
    restarted_tasks: List[Tuple[str, str]] = list()
    failed_to_restart_tasks: List[Tuple[str, str]] = list()
    start_time = datetime.now()
    unknown_pending_tasks = get_unknown_pending_tasks(app)
    for task_id in unknown_pending_tasks:
        task_name: Optional[str] = 'unknown'
        try:
            # task_info presumably mirrors the original Celery message
            # (body/headers/routing_key/exchange/retry_policy) as stored by the
            # pending-task tracking code — verify against the writer side.
            task_info: Dict = webdav_client.unpickle(remote_path=f'{tasks_pending}/{task_id}')
            task_name = task_info['headers']['task'] or 'unknown'

            with Connection(broker_url) as conn:
                # Re-publish the stored message as-is; delivery_mode=2 makes it persistent.
                producer = conn.Producer(serializer='json')
                producer.publish(task_info['body'],
                                 routing_key=task_info['routing_key'],
                                 delivery_mode=2,
                                 serializer='pickle',
                                 headers=task_info['headers'],
                                 exchange=task_info['exchange'],
                                 retry=task_info['retry_policy'] is not None,
                                 retry_policy=task_info['retry_policy'])
                restarted_tasks.append((task_id, task_name))
        except RemoteResourceNotFound:
            # The WebDAV marker disappeared between listing and reading — the task
            # completed in the meantime, so there is nothing to restart.
            log.warning(f'Unable to restart lost pending task '
                        f'because it has been completed already: #{task_id} - {task_name}')
        except Exception as ex:
            failed_to_restart_tasks.append((task_id, task_name))
            log.error(f'Unable to restart lost pending task: #{task_id} - {task_name}', exc_info=ex)
    if unknown_pending_tasks:
        time_spent = datetime.now() - start_time
        msg = f'Found {len(unknown_pending_tasks)} and restarted {len(restarted_tasks)} unknown/lost tasks ' \
              f'registered at Webdav but not found in Redis queue.\n' \
              f'Time spent: {time_spent}\n'
        if restarted_tasks:
            msg += f'Restarted tasks:\n' + '\n'.join([' - '.join(item) for item in restarted_tasks])
        if failed_to_restart_tasks:
            msg += f'Failed to restart tasks:\n' + '\n'.join([' - '.join(item) for item in failed_to_restart_tasks])
        log.info(msg)
    return restarted_tasks
示例#7
0
def tika_extract_xhtml(src_fn: str) -> str:
    """Run the Tika CLI on the given file and return its XHTML output as a string.

    Tika parser behavior is tuned through environment variables before launch;
    returns an empty string if Tika produced no output.
    """
    conf: Settings = get_settings()

    encoding_name = 'utf-8'
    os.environ['LEXNLP_TIKA_PARSER_MODE'] = 'pdf_only'
    # other possible values are 'coords_embedded' and ''
    os.environ['LEXNLP_TIKA_XML_DETAIL'] = 'coords_flat'

    tika_cmd = ['java',
                '-cp', f'{conf.tika_jar_path}/*',
                '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider',
                'org.apache.tika.cli.TikaCLI',
                f'--config={conf.tika_config}',
                '-x',
                f'-e{encoding_name}',
                src_fn]

    def log_stderr(line):
        log.error(f'TIKA parsing {src_fn}:\n{line}')

    output = read_output(tika_cmd,
                         stderr_callback=log_stderr,
                         encoding=encoding_name,
                         timeout_sec=60 * 20)
    return output or ''
def setup_loggers(*args, **kwargs):
    """Reset root-logger handlers according to settings.

    Installs a stdout handler (plain or JSON format) and/or a rotating JSON
    file handler, then silences noisy pdfminer module loggers.
    """
    conf = get_settings()

    root = logging.getLogger()
    root.handlers.clear()

    if conf.log_to_stdout:
        if conf.log_to_stdout_json:
            stdout_formatter = JSONFormatter()
        else:
            stdout_formatter = logging.Formatter(
                '%(levelname)s %(asctime)s %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S')
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(stdout_formatter)
        root.addHandler(stream_handler)

    if conf.log_to_file:
        from logging.handlers import RotatingFileHandler
        file_handler = RotatingFileHandler(filename=conf.log_to_file,
                                           encoding='utf-8',
                                           maxBytes=10 * 1024 * 1024,
                                           backupCount=5)
        file_handler.setFormatter(JSONFormatter())
        root.addHandler(file_handler)

    # make pdfminer a bit more silent
    from pdfminer.converter import log as converter_log
    from pdfminer.pdfdocument import log as pdfdocument_log
    from pdfminer.pdfinterp import log as pdfinterp_log
    from pdfminer.pdfpage import log as pdfpage_log
    for noisy_log in (pdfinterp_log, pdfpage_log, pdfdocument_log, converter_log):
        noisy_log.setLevel(logging.WARNING)
示例#9
0
def extract_page_ocr_images(
        pdf_fn: str,
        start_page: int = None,
        end_page: int = None,
        pdf_password: str = None,
        timeout_sec: int = 1800,
        reset_page_rotation: bool = False,
        dpi: int = 300) -> Generator[Dict[int, str], None, None]:
    """Render page images prepared for OCR (text elements removed) from a PDF.

    Runs the GetOCRImages Java tool and yields a single dict mapping 1-based
    page number -> image file path. Pages for which the tool stores no image
    (see note below) have no entry in the dict. The images live in a temp dir
    that is removed after the caller resumes.

    :param pdf_fn: path of the source PDF
    :param start_page: optional first page to render (1-based)
    :param end_page: optional last page to render (1-based)
    :param pdf_password: password for encrypted PDFs
    :param timeout_sec: subprocess timeout in seconds
    :param reset_page_rotation: pass --reset-page-rotation to the Java tool
    :param dpi: rendering resolution
    """
    java_modules_path = get_settings().java_modules_path

    temp_dir_no_text = mkdtemp(prefix='pdf_images_')
    basefn = os.path.splitext(os.path.basename(pdf_fn))[0]
    try:
        args = [
            'java', '-cp', f'{java_modules_path}/*',
            'com.lexpredict.textextraction.getocrimages.GetOCRImages', pdf_fn,
            '--format', 'png', '--dpi', f'{dpi}', '--output-prefix-no-text',
            f'{temp_dir_no_text}/{basefn}__'
        ]
        if pdf_password:
            args += ['--password', pdf_password]

        if start_page is not None:
            args += ['--start-page', str(start_page)]

        if end_page is not None:
            args += ['--end-page', str(end_page)]

        if reset_page_rotation:
            args += ['--reset-page-rotation']

        completed_process: CompletedProcess = subprocess.run(
            args,
            check=False,
            timeout=timeout_sec,
            universal_newlines=True,
            stderr=PIPE,
            stdout=PIPE)
        raise_from_process(
            log,
            completed_process,
            process_title=lambda:
            f'Extract page images for OCR needs (with text removed) from {pdf_fn}'
        )

        raise_from_pdfbox_error_messages(completed_process)

        # Output of GetOCRImages is a set of files with the names generated as:
        # {prefix}+{page_num_1_based}.{ext}
        # We used "{temp_dir}/{basefn}__" as the prefix.
        # Now we need to get the page numbers from the filenames and return the list of file names
        # ordered by page number.
        # For the "no-text" images: for the pages having no images which are not overlapped with
        # any text element it stores no page image.
        page_by_num_no_text: Dict[int, str] = dict()
        for fn in os.listdir(temp_dir_no_text):
            # NOTE(review): assumes every produced filename matches PAGE_NUM_RE;
            # a non-matching file would raise AttributeError here — confirm the
            # tool never writes auxiliary files into the output dir.
            page_num = PAGE_NUM_RE.search(os.path.splitext(fn)[0]).group(0)
            page_no_text_fn = os.path.join(temp_dir_no_text, fn)
            page_by_num_no_text[int(page_num)] = page_no_text_fn

        yield page_by_num_no_text

    finally:
        shutil.rmtree(temp_dir_no_text, ignore_errors=True)
示例#10
0
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Generator[Tuple[str, TextAndPDFCoordinates, str, Optional[List[float]]],
                     None, None]:
    """Extract plain text, text structure and char coordinates from a PDF.

    Runs the GetTextFromPDF Java tool which writes a msgpack file, then derives
    pages/sentences/paragraphs/sections and language info from the text.

    Although historically annotated as returning a Tuple, this is a generator:
    it yields exactly one tuple (text, TextAndPDFCoordinates, corrected_pdf_fn,
    page_rotate_angles) and removes its temp dir when the caller resumes.
    page_rotate_angles is None when the extracted text is empty.

    :param pdf_fn: path of the source PDF
    :param pdf_password: password for encrypted PDFs
    :param timeout_sec: subprocess timeout in seconds
    :param language: language code hint; detected per-segment when empty
    :param correct_pdf: also produce a corrected copy of the PDF
    :param render_coords_debug: render char rects into the corrected PDF
        (implies correct_pdf)
    """

    if render_coords_debug:
        correct_pdf = True

    java_modules_path = get_settings().java_modules_path

    # Convert language to language code
    lang_converter = LanguageConverter()
    language, locale_code = lang_converter.get_language_and_locale_code(
        language)

    temp_dir = mkdtemp(prefix='pdf_text_')
    out_fn = os.path.join(
        temp_dir,
        os.path.splitext(os.path.basename(pdf_fn))[0] + '.msgpack')
    # If no corrected PDF is requested, the "corrected" path is the original file.
    out_pdf_fn = pdf_fn
    try:
        args = [
            'java', '-cp', f'{java_modules_path}/*',
            'com.lexpredict.textextraction.GetTextFromPDF', pdf_fn, out_fn,
            '-f', 'pages_msgpack'
        ]

        if pdf_password:
            args.append('-p')
            args.append(pdf_password)

        if correct_pdf:
            out_pdf_fn = os.path.join(
                temp_dir,
                os.path.splitext(os.path.basename(pdf_fn))[0] + '_corr.pdf')
            args.append('-corrected_output')
            args.append(out_pdf_fn)

            if render_coords_debug:
                args.append('-render_char_rects')

        completed_process: CompletedProcess = subprocess.run(
            args,
            check=False,
            timeout=timeout_sec,
            universal_newlines=True,
            stderr=PIPE,
            stdout=PIPE)
        raise_from_process(
            log,
            completed_process,
            process_title=lambda: f'Extract text and structure from {pdf_fn}')

        raise_from_pdfbox_error_messages(completed_process)

        with open(out_fn, 'rb') as pages_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(pages_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")
        if len(text) == 0:
            # Empty document: yield an empty structure and stop.
            pdf_coordinates = PDFCoordinates(
                char_bboxes=pdfbox_res['charBBoxes'])
            text_struct = PlainTextStructure(
                title='',
                language=language
                or 'en',  # FastText returns English for empty strings
                pages=[],
                sentences=[],
                paragraphs=[],
                sections=[])
            yield text, \
                  TextAndPDFCoordinates(text_structure=text_struct, pdf_coordinates=pdf_coordinates), \
                  out_pdf_fn, \
                  None

            return

        page_rotate_angles: List[float] = [
            pdfpage['deskewAngle'] for pdfpage in pdfbox_res['pages']
        ]

        # Pages are numbered from 0; 'location' holds [start, end) offsets into text.
        pages = []
        num: int = 0
        for p in pdfbox_res['pages']:
            p_res = PlainTextPage(number=num,
                                  start=p['location'][0],
                                  end=p['location'][1],
                                  bbox=p['bbox'])
            pages.append(p_res)
            num += 1

        sentence_spans = get_sentence_span_list(text)

        lang = get_lang_detector()

        # Per-segment language detection is only used when no language hint was given.
        sentences = [
            PlainTextSentence(start=start,
                              end=end,
                              language=language or lang.predict_lang(segment))
            for start, end, segment in sentence_spans
        ]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [
            PlainTextParagraph(start=start,
                               end=end,
                               language=language or lang.predict_lang(segment))
            for segment, start, end in get_paragraphs(text, return_spans=True)
        ]

        sections = [
            PlainTextSection(title=sect.title,
                             start=sect.start,
                             end=sect.end,
                             title_start=sect.title_start,
                             title_end=sect.title_end,
                             level=sect.level,
                             abs_level=sect.abs_level)
            for sect in get_document_sections_with_titles(
                text, sentence_list=sentence_spans)
        ]

        try:
            title = next(get_titles(text))
        except StopIteration:
            title = None

        text_struct = PlainTextStructure(title=title,
                                         language=language
                                         or lang.predict_lang(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)

        char_bboxes = pdfbox_res['charBBoxes']
        pdf_coordinates = PDFCoordinates(char_bboxes=char_bboxes)
        yield text, TextAndPDFCoordinates(
            text_structure=text_struct,
            pdf_coordinates=pdf_coordinates), out_pdf_fn, page_rotate_angles
        return

    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
示例#11
0
from text_extraction_system.data_extract.tables import get_table_dtos_from_camelot_output
from text_extraction_system.file_storage import get_webdav_client, WebDavClient
from text_extraction_system.pdf.convert_to_pdf import convert_to_pdf
from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
from text_extraction_system.request_metadata import RequestCallbackInfo, RequestMetadata, \
    save_request_metadata, \
    load_request_metadata
from text_extraction_system.result_delivery.celery_client import send_task
from text_extraction_system.task_health.task_health import store_pending_task_info_in_webdav, \
    remove_pending_task_info_from_webdav, re_schedule_unknown_pending_tasks, init_task_tracking
from text_extraction_system.utils import LanguageConverter
from text_extraction_system_api.dto import OutputFormat
from text_extraction_system_api.dto import RequestStatus, STATUS_FAILURE, STATUS_PENDING, STATUS_DONE

log = logging.getLogger(__name__)
# Settings are resolved once at import time; a settings change requires a restart.
settings = get_settings()


class CeleryConfig:
    """Celery worker configuration (attributes are read by Celery by name)."""
    task_track_started = True
    task_serializer = 'pickle'
    result_serializer = 'pickle'
    accept_content = ['pickle', 'json']
    task_acks_late = True
    task_reject_on_worker_lost = True
    worker_prefetch_multiplier = 1

    @property
    def worker_autoscaler(self) -> Optional[str]:
        """Dotted path to the shutdown-when-idle autoscaler, or None when disabled."""
        if not settings.celery_shutdown_when_no_tasks_longer_than_sec:
            return None
        return 'text_extraction_system.celery_autoscaler:ShutdownWhenNoTasksAutoscaler'
def get_scheduled_tasks_from_redis() -> Set[str]:
    """Return the ids of all messages currently queued in the 'celery' Redis list."""
    redis_client = Redis.from_url(get_settings().celery_broker)
    queued_messages = redis_client.lrange('celery', 0, -1)
    return {json.loads(raw_msg)['headers']['id'] for raw_msg in queued_messages}