def pdf2pdfa(task, input_file_path, output_file_path,
             language='eng', output_types=None, logger=None, timeout=600):
    """
    Convert scanned PDF into searchable PDF-A or optionally into hOCR or txt (see output_types).
    The PDF is first passed to pdf2img (JPEG, quality 50); the image list path it returns
    is then given to the tesseract command line together with output_file_path.
    :param task: Task, passed through to read_output
    :param input_file_path: path of the source PDF file
    :param output_file_path: output base path given to tesseract; the produced
        "<output_file_path>.<type>" file is renamed to this exact path afterwards
    :param language: value of the tesseract "-l" option
    :param output_types: List['pdf', 'hocr', 'txt']; ['pdf'] is used when omitted
    :param logger: ProcessLogger; when None, tesseract stderr lines are not logged
    :param timeout: sec
    :return: None
    """
    # None replaces the former mutable list default; materialize once because
    # the value is iterated twice below.
    output_types = ['pdf'] if output_types is None else list(output_types)

    with tempfile.TemporaryDirectory() as tmp_dir:

        pdf2image_kwargs = dict(fmt='jpg', jpegopt={'quality': 50})
        img_list_file_path = pdf2img(input_file_path, work_dir=tmp_dir, **pdf2image_kwargs)

        cmd_args = ['tesseract', '-l', str(language), img_list_file_path, output_file_path]
        for ot in output_types:
            cmd_args += ['-c', f'tessedit_create_{ot}=1']

        def err(line):
            # logger is optional: without one, stderr lines are dropped instead of
            # failing on None.info
            if logger is None:
                return
            logger.info(f'tesseract converting {img_list_file_path} '
                        f'images into {output_file_path}:\n{line}')

        # only the side effect (files written by tesseract) matters, the output is unused
        read_output(cmd_args, stderr_callback=err, timeout_sec=timeout, task=task)

        # rename file.pdf.alt.pdf into file.pdf.alt
        # NOTE(review): every output type is renamed to the same output_file_path, so
        # with more than one type the later rename targets a path that already holds
        # the previous result - confirm that callers only ever pass a single type.
        for ot in output_types:
            os.rename(output_file_path + f'.{ot}', output_file_path)
# Example #2
# 0
    def parse_file_local(self,
                         local_path: str,
                         original_file_name: str,
                         timeout: int = 60,
                         encoding_name: str = 'utf-8',
                         logger: ProcessLogger = None,
                         enable_ocr: bool = True) -> Dict:
        """
        Parses a file by running the Tika start command as a local process and
        returns whatever _parse() builds from the process output.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, used in log messages only
        :param timeout: timeout in seconds, passed to read_output
        :param encoding_name: encoding to use, is passed to Tika (-e) and to read_output
        :param logger: logger object for Tika stderr lines; when None they are not logged
        :param enable_ocr: True selects TIKA_MODE_OCR, False selects TIKA_MODE_PDF_ONLY
        :return: result of _parse() for the Tika output
        :raises Exception: when _parse() fails; the message carries up to 255
            characters of the Tika output and the original error is chained
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        # the mode is published through the process environment
        # (presumably picked up by the spawned Tika process - confirm)
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        cmd = self.tika_start_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            # logger defaults to None: drop stderr lines instead of failing on None.info
            if logger is None:
                return
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        # a falsy result from read_output is normalized to an empty string
        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout) or ''

        try:
            return _parse((200, text))
        except Exception as ex:
            # attach a short sample of the output to make the failure diagnosable
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex
# Example #3
# 0
    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
        plus extra formatting information plus metadata.
        The default Tika command is tried first; when it yields no text the LexNLP
        command is tried, and the result of that last attempt is returned even if empty.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings; when None nothing is logged
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises Exception: when parsing the Tika output fails; the message carries up
            to 255 characters of the output and the original error is chained
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def log(message):
            # logger defaults to None: skip logging instead of failing on None.info
            if logger is not None:
                logger.info(message)

        def err(line):
            log(f'TIKA parsing {original_file_name}:\n{line}')

        command_lists = [
            self.tika_default_command_list,
            self.tika_lexnlp_default_command_list
        ]
        last_index = len(command_lists) - 1

        for cmd_index, cmd_list in enumerate(command_lists):
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]

            # Decide by position: comparing the extended cmd with the base command
            # list can never match, which made an empty final attempt fall out of
            # the loop and return None.
            last_try = cmd_index == last_index
            text = read_output(cmd,
                               stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = len(output.text) if output and output.text else 0
                log(
                    f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
                )
                # nothing extracted: fall back to the next command while one is left
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                text_sample = text[:255] if text and isinstance(
                    text, str) else str(text)
                raise Exception(
                    'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                    text_sample) from ex
    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will use plain text "stripper" and transform the source document into plain text
        inside its (Java) process.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: task object, passed through to read_output
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings; when None nothing is logged
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises Exception: when _parse() fails or its result lacks 'content' / 'metadata';
            the message carries up to 255 characters of the Tika output
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
        # don't use at all TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
        os.environ[self.TIKA_PARSER_DETAIL] = ''

        # the no-OCR command is used only when OCR is explicitly disabled (False, not
        # merely falsy) and such a command is configured
        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def log(message):
            # logger defaults to None: skip logging instead of failing on None.info
            if logger is not None:
                logger.info(message)

        def err(line):
            log(f'TIKA parsing {original_file_name}:\n{line}')

        log(f'Tika (plain text) args: {", ".join(cmd)}')

        # a falsy result from read_output is normalized to an empty string
        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''

        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'],
                                meta=ptr_val['metadata'])
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex
def pdf_has_images(file_path, task, logger=None, timeout=600):
    """
    Check whether PDF file has images by running "pdfimages -list" on it and
    handing the command output to process_image_list.
    :param file_path: str
    :param task: celery task, passed through to read_output
    :param logger: ProcessLogger; when None, pdfimages stderr lines are not logged
    :param timeout: timeout sec
    :return: bool (whatever process_image_list derives from the listing)
    """
    def err(line):
        # logger is optional: drop stderr lines instead of failing on None.info
        if logger is None:
            return
        logger.info(f'pdfimages parsing {file_path}:\n{line}')

    cmd = ['pdfimages', '-list', file_path]
    # a falsy result from read_output is normalized to an empty listing
    out = read_output(cmd, stderr_callback=err, timeout_sec=timeout,
                      task=task) or ''

    return process_image_list(out)
# Example #6
# 0
    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               task: Any,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
        plus extra formatting information plus metadata.
        Two commands are tried in order: the LexNLP command (or the no-OCR command when
        OCR is disabled and one is configured), then the default Tika command. The next
        command is tried only when the previous one yielded no text; the result of the
        last attempt is returned even if empty.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: task object, passed through to read_output
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings; when None nothing is logged
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises ValueError: when a RAM limit is set and the command has no 'java' element
        :raises Exception: when parsing the Tika output fails; the message carries up
            to 255 characters of the output and the original error is chained
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def log(message):
            # logger defaults to None: skip logging instead of failing on None.info
            if logger is not None:
                logger.info(message)

        def err(line):
            log(f'TIKA parsing {original_file_name}:\n{line}')

        # the no-OCR command is used only when OCR is explicitly disabled (False, not
        # merely falsy) and such a command is configured
        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list

        parse_commands = [
            tika_default_command_list, self.tika_default_command_list
        ]
        # imported at call time rather than at module level
        # (presumably to avoid an import cycle - confirm before moving)
        from apps.document.app_vars import TIKA_PROCESS_RAM_MB_LIMIT
        ram_limit = TIKA_PROCESS_RAM_MB_LIMIT.val
        last_index = len(parse_commands) - 1

        for cmd_index, cmd_list in enumerate(parse_commands):
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
            if ram_limit:
                # the JVM heap option has to follow the 'java' element directly
                java_index = cmd.index('java')
                cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit}m'
                                              ] + cmd[java_index + 1:]
            log(f'Tika (XHTML) args: {", ".join(cmd)}')

            last_try = cmd_index == last_index
            text = read_output(cmd,
                               stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout,
                               task=task) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = output.pure_text_length if output else 0
                log(
                    f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
                )
                # nothing extracted: fall back to the next command while one is left
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                text_sample = text[:255] if text and isinstance(
                    text, str) else str(text)
                raise Exception(
                    'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                    text_sample) from ex