Python TikaXhtmlParser примеры, apps.task.utils.text_extraction.tika.tika_xhtml_parser.TikaXhtmlParser Python примеры использования

Пример #1

0

Показать файл

    def test_process_inner_tags(self):
        text = """
<p>The pen employed in finishing her story, and making it what you now see
it to be, has had no little difficulty to put it into a dress fit to be
seen, and to make it speak language fit to be read. When a woman
debauched from her youth, nay, even being the offspring of debauchery
and vice, comes to give an account of all her vicious practices, and
even to descend to the particular occasions and circumstances by which
she ran through in threescore years, an author must be hard put to it
wrap it up so clean as not to give room, especially for vicious
readers, to turn it to his disadvantage. <a href="#">This page{{##PGPG##}}
ends with semibold text</a>.
It is suggested there cannot be the same life, the same brightness and</p>
<p>
beauty, in relating the penitent part as is in the criminal part. If
there is any truth in that suggestion, I must be allowed to say ’tis
because there is not the same taste and relish in the reading, and
indeed it is too true that the difference lies not in the real worth of
the subject so much as in the gust and palate of the reader.</p>
        """
        parser = TikaXhtmlParser()
        markup = parser.parse_text(text)
        markup.convert_markers_to_labels()
        proc_text = markup.text
        self.assertEqual(-1, proc_text.find('##'))

        pages = markup.labels['pages']
        self.assertEqual(1, len(pages))
        last_page_text = proc_text[pages[0][0] - 30:pages[0][0]].strip()
        self.assertTrue(last_page_text.endswith('This page'))

        paragraphs = markup.labels['paragraphs']
        p_text = proc_text[paragraphs[0][0]:paragraphs[0][1]].strip()
        self.assertTrue(p_text.endswith('the same brightness and'))

Пример #2

0

Показать файл

    def test_complex_mixed_pdf(self):
        """
        Parse the 'parsed_mixed_pdf.xhtml' resource with OCR text always
        stored and check that more than 100 page labels are produced and
        that selected pages end with the expected phrases.
        """
        parsing_settings = XhtmlParsingSettings()
        parsing_settings.ocr_sets = OcrTextStoreSettings.STORE_ALWAYS
        xhtml = load_resource_document('parsing/parsed_mixed_pdf.xhtml',
                                       encoding='utf-8')
        markup = TikaXhtmlParser(parsing_settings).parse_text(xhtml)
        markup.convert_markers_to_labels()

        # marker fragments must not leak into the resulting text
        self.assertEqual(-1, markup.text.find('##'))
        page_labels = markup.labels['pages']
        self.assertGreater(len(page_labels), 100)

        def page_tail(page_end):
            # up to 50 characters right before the (clamped) end of the page
            stop = min(page_end, len(markup.text))
            return markup.text[max(stop - 50, 0):stop]

        page_endings = [page_tail(end) for _start, end in page_labels]

        self.assertTrue('See “RATINGS” herein.' in page_endings[0])
        self.assertTrue(
            'optional redemption date of November 15, 2027.' in page_endings[1])
        self.assertTrue('by the IRS.' in page_endings[54])

Пример #3

0

Показать файл

    def test_parse_recursive_tables(self):
        """
        Parse a table that contains another table in one of its cells and
        check that both tables are detected and serialized into dataframes
        with the expected cell values.
        """
        nested_tables_xhtml = """
        <table>
           <tr><td><p>Cell 1.1</p></td><td><p>Cell 1.2</p></td></tr>
           <tr>
              <td><p>Cell 2.1</p></td>
              <td>
               <p>
                  <table>
                     <tr><th><p>InCell 1.1</p></th><th><p>InCell 1.2</p></th></tr>
                     <tr><td><p>InCell 2.1</p></td><td><p></p></td></tr>
                  </table>
                </p>     
              </td>
           </tr>
           <tr><td><p>Cell 3.1</p></td><td><p>Cell 3.2</p></td></tr>
        </table>
        """
        markup = TikaXhtmlParser().parse_text(nested_tables_xhtml)
        markup.convert_markers_to_labels()
        self.assertEqual(2, len(markup.tables))

        # order tables by start offset so that indexes below are stable
        markup.tables.sort(key=lambda table: table.start)

        # the table with the greater start offset is expected to be the nested one
        inner_df = markup.tables[1].serialize_in_dataframe(markup.text)
        self.assertEqual('InCell 1.1', inner_df.loc[0, 0].strip())
        self.assertEqual('InCell 2.1', inner_df.loc[1, 0].strip())
        self.assertTrue(not inner_df.loc[1, 1].strip())

        outer_df = markup.tables[0].serialize_in_dataframe(markup.text)
        self.assertEqual('Cell 1.1', outer_df.loc[0, 0].strip())
        self.assertEqual('Cell 3.2', outer_df.loc[2, 1].strip())

Пример #4

0

Показать файл

    def test_parse_table(self):
        """
        Parse an XHTML document with two tables (table detection enabled) and
        check the text length, the paragraph labels, the number of tables and
        the contents of the cells serialized into dataframes.
        """
        tika_xhtml = """        
        <?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml">
        <head>
        <meta name="dc:publisher" content=""/>
        <title/>
        </head>
        <body><p>What is Lorem Ipsum?</p>
        <p><b>Lorem Ipsum</b> is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry.</p>
        <p/>
        <table><tbody><tr>	<td><p>Row 1, column 1</p>
        </td>	<td><p>Row 1, column 2</p>
        </td>	<td><p>Row 1, column 3</p>
        </td></tr>
        <tr>	<td><p>Row 2, column 1</p>
        </td>	<td><p>Row 2, column 2</p>
        </td>	<td><p>Row 2, column 3</p>
        </td></tr>
        <tr>	<td><p>Row 3, column 1</p>
        </td>	<td><p>Row 3, column 2</p>
        </td>	<td><p>Row 3, column 3</p>
        </td></tr>
        <tr>	<td><p>Row 4, column 1</p>
        </td>	<td><p>Row 4, column 2</p>
        </td>	<td><p>Row 4, column 3</p>
        </td></tr>
        </tbody></table>
        <p/>
        <h2>Where does it come from?</h2>
        <p class="normal_(Web)">Contrary to popular belief, Lorem Ipsum is not simply random text.</p>
        <p/>
        <table><tbody><tr>	<td><p>r1c1: Contrary to popular belief, Lorem Ipsum is not simply random text.</p>
        </td>	<td><p/>
        </td>	<td><p/>
        </td></tr>
        <tr>	<td><p/>
        </td>	<td><p/>
        </td>	<td><p class="normal_(Web)"><a name="_GoBack"/>r2c3: The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.</p>
        </td></tr>
        </tbody></table>
        <p/>
        </body></html>
        """
        markup = TikaXhtmlParser().parse_text(tika_xhtml, detect_tables=True)
        self.assertGreater(len(markup.text), 100)
        self.assertGreater(len(markup.labels['paragraphs']), 1)

        self.assertEqual(2, len(markup.tables))

        # every cell of the first table is named after its 1-based row / column
        first_df = markup.tables[0].serialize_in_dataframe(markup.text)
        for row_idx, row in first_df.iterrows():
            for col_idx in range(len(row)):
                expected = f'Row {row_idx + 1}, column {col_idx + 1}'
                self.assertEqual(expected, row[col_idx])

        # the named anchor is expected to be rendered as '{_GoBack}' in the cell text
        second_df = markup.tables[1].serialize_in_dataframe(markup.text)
        expected_cell = ('{_GoBack} r2c3: The first line of Lorem Ipsum, '
                         '"Lorem ipsum dolor sit amet..", '
                         'comes from a line in section 1.10.32.\n\n')
        self.assertEqual(expected_cell, second_df.loc[1, 2])

Пример #5

0

Показать файл

 def test_parse_vector_pdf(self):
     """
     Parse the 'xhtml_pdf.xhtml' resource and check that some text,
     more than one page label and more than five paragraph labels are found.
     """
     xhtml = load_resource_document('parsing/xhtml_pdf.xhtml', encoding='utf-8')
     markup = TikaXhtmlParser().parse_text(xhtml)
     self.assertGreater(len(markup.text), 100)
     for label_name, min_count in (('pages', 1), ('paragraphs', 5)):
         self.assertGreater(len(markup.labels[label_name]), min_count)

Пример #6

0

Показать файл

Файл: test_tika_xhtml_images.py Проект: fagan2888/lexpredict-contraxsuite

 def test_ocr_empty_images(self):
     """
     Parse the 'xhtml_ocr_emptyimages.xhtml' resource in STORE_IF_MORE_TEXT
     mode and check that the length of the resulting text equals
     markers_extra_text_length reported by the parser's output.
     """
     xhtml = load_resource_document('parsing/xhtml_ocr_emptyimages.xhtml',
                                    encoding='utf-8')
     ocr_settings = XhtmlParsingSettings(
         ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
         ocr_vector_text_min_length=100)
     markup = TikaXhtmlParser(pars_settings=ocr_settings).parse_text(xhtml)
     self.assertEqual(len(markup.text), markup.markers_extra_text_length)

Пример #7

0

Показать файл

Файл: test_tika_xhtml_images.py Проект: fagan2888/lexpredict-contraxsuite

    def test_ocr_if_less(self):
        """
        Parse the 'xhtml_ocr_mixed.xhtml' resource with three OCR storing
        modes: STORE_ALWAYS must yield two image labels, while
        STORE_IF_NO_OTHER_TEXT and NEVER_STORE must yield none.
        """
        def parse_with(ocr_mode, xhtml):
            # fresh parser per mode, same minimal vector text length everywhere
            mode_settings = XhtmlParsingSettings(
                ocr_sets=ocr_mode,
                ocr_vector_text_min_length=100)
            return TikaXhtmlParser(pars_settings=mode_settings).parse_text(xhtml)

        def has_no_images(markup):
            return ('images' not in markup.labels
                    or len(markup.labels['images']) == 0)

        xhtml = load_resource_document('parsing/xhtml_ocr_mixed.xhtml',
                                       encoding='utf-8')
        markup = parse_with(OcrTextStoreSettings.STORE_ALWAYS, xhtml)
        self.assertGreater(len(markup.text), 100)
        self.assertEqual(2, len(markup.labels['images']))

        xhtml = load_resource_document('parsing/xhtml_ocr_mixed.xhtml',
                                       encoding='utf-8')
        markup = parse_with(OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT, xhtml)
        self.assertGreater(len(markup.text), 100)
        self.assertTrue(has_no_images(markup))

        markup = parse_with(OcrTextStoreSettings.NEVER_STORE, xhtml)
        self.assertTrue(has_no_images(markup))

Пример #8

0

Показать файл

 def test_list_parsing(self):
     """
     Parse a single-page XHTML document that mixes bulleted / numbered
     lines with ordinary multi-line paragraphs and check that some text
     and more than one paragraph label are produced.
     """
     page_xhtml = """
     <?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml">
     <head>
     <meta name="pdf:PDFVersion" content="1.4"/>
     <title>Sample Docx with Image.docx</title>
     </head>
     <body><div class="page"><p/>
     <p>Explore XHTML Tika’s output
     </p>
     <p>JIRA ticket: https://lexpredict.atlassian.net/browse/CS-3966
     </p>
     <p>Here (Improve text segmentation (section / page / paragraph / sentence), section 1.1 Use 
     markup from document parser) I described Tika’s output in XHTML. In short:
     </p>
     <p>● Tika uses PdfBox for “vector” files, MS Word and OpenOffice files 
     ● and Tesseract for scanned files
     ● in both cases Tika returns valid XHTML
     ● XHTML contains almost all information on document’s structure that Tika can get
     </p>
     <p>see the aforementioned document, section 1.2 Verdict on using Tika markup for segmenting 
     text.
     </p>
     <p>I’ve implemented a parser that reads Tika’s output in XHTML and extracts:
     1. plain text with or without extra line breaks inside paragraphs, with hyperlinks 
     </p>
     <p>
     This paragraph contains text with extra line breaks that should have been deleted 
     because the text is not formatted as a list. This paragraph contains text with extra 
     line breaks that should have been deleted because the text is not formatted as a list.
     This paragraph contains text with extra line breaks that should have been deleted because
     the text is not formatted as a list. This paragraph contains text with extra line breaks
     that should have been deleted because the text is not formatted as a list.
     </p>
     <p>formatted
     2. paragraphs’ coordinates
     3. pages’ coordinates
     4. headings
     5. tables as Pandas dataframes + anchors to the source text
     </p>
     </div>
     </body></html>
     """
     markup = TikaXhtmlParser().parse_text(page_xhtml)
     markup.convert_markers_to_labels()
     self.assertGreater(len(markup.text), 100)
     paragraph_labels = markup.labels['paragraphs']
     self.assertGreater(len(paragraph_labels), 1)

Пример #9

0

Показать файл

    def __init__(self):
        """
        Create the XHTML parser and prepare the command lines used to start
        Tika as a local Java process: the default one, the LexNLP one and,
        when configured, the one with OCR switched off.
        """
        parser_settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_ALWAYS,
            remove_extra_newlines=False)
        self.xhtml_parser = TikaXhtmlParser(pars_settings=parser_settings)
        temp_dir = tempfile.gettempdir()
        self.tika_files_path = temp_dir
        self.tika_jar_path = temp_dir

        from django.conf import settings
        jars_dir = settings.JAR_BASE_PATH

        def config_option(config_name):
            # Tika's "--config" argument pointing to a file inside the JAR folder
            return f'--config={os.path.join(jars_dir, config_name)}'

        classpath = ':'.join(
            os.path.join(jars_dir, jar_name) for jar_name in settings.TIKA_JARS)
        base_command = [
            'java', '-cp', classpath,
            '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider',
            'org.apache.tika.cli.TikaCLI',
        ]
        self.tika_default_command_list = base_command
        # independent copy: the LexNLP command gets its own config option below
        self.tika_lexnlp_default_command_list = list(base_command)
        from apps.task.app_vars import TIKA_CUSTOM_CONFIG, TIKA_NOOCR_CUSTOM_CONFIG, TIKA_LEXNLP_CUSTOM_CONFIG

        # the "no OCR" command is derived from the default command BEFORE
        # the default command receives its own "--config" option
        noocr_config = TIKA_NOOCR_CUSTOM_CONFIG.val
        self.tika_noocr_default_command_list = None
        if noocr_config:
            self.tika_noocr_default_command_list = \
                self.tika_default_command_list + [config_option(noocr_config)]

        default_config = TIKA_CUSTOM_CONFIG.val
        if default_config:
            self.tika_default_command_list.append(config_option(default_config))

        # LexNLP (plugin) Tika config path
        lexnlp_config = TIKA_LEXNLP_CUSTOM_CONFIG.val
        if lexnlp_config:
            self.tika_lexnlp_default_command_list.append(
                config_option(lexnlp_config))

Пример #10

0

Показать файл

Файл: tika_parsing_wrapper.py Проект: tx-anin/lexpredict-contraxsuite

    def __init__(self):
        """
        Create the XHTML parser (OCR text is stored only in
        STORE_IF_MORE_TEXT mode) and prepare the command line that starts
        Tika as a local Java process with the 'tika.config' configuration.
        """
        parser_settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
            remove_extra_newlines=False)
        self.xhtml_parser = TikaXhtmlParser(pars_settings=parser_settings)
        temp_dir = tempfile.gettempdir()
        self.tika_files_path = temp_dir
        self.tika_jar_path = temp_dir

        from django.conf import settings
        jars_dir = settings.JAR_BASE_PATH

        classpath = ':'.join(
            os.path.join(jars_dir, jar_name) for jar_name in settings.TIKA_JARS)
        config_path = os.path.join(jars_dir, 'tika.config')

        self.tika_start_command_list = [
            'java',
            '-cp',
            classpath,
            '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider',
            'org.apache.tika.cli.TikaCLI',
            f'--config={config_path}',
        ]

Пример #11

0

Показать файл

    def test_parse_headings(self):
        """
        Parse an XHTML document with <h1> / <h2> headings, hyperlinks and
        escaped entities and check heading labels, hyperlink labels,
        detected sections and that '&amp;' is unescaped in the text.
        """
        document_xhtml = """
        <?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml">
        <head>
        <meta name="date" content="2019-08-08T15:35:00Z"/>
        <title/>
        </head>
        <body><h1>1. Heading One</h1>
        <p class="list_Paragraph"/>
        <p class="list_Paragraph">Contrary to popular belief, Lorem Ipsum is not simply random text. 
        It has roots in a piece of classical Latin literature from &gt; 45 BC, making it over 2000 years old. 
        This book is a treatise on the theory of ethics, very popular during the Renaissance. 
        The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", <a href="en.wikipedia.org/%20s%20s">comes from a line</a> in section 1.10.32.</p>
        <p class="list_Paragraph"/>
        <p class="list_Paragraph">The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for</p>
        <p class="list_Paragraph"/>
        <h2>1.1 Heading One One</h2>
        <p class="list_Paragraph">Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum &amp; et Malorum" by Cicero are also</p>
        <p class="list_Paragraph"/>
        <h2>1.2 Heading one two</h2>
        <h1>2. <a name="_GoBack"/>Heading 2</h1>
        </body></html>
        """
        markup = TikaXhtmlParser().parse_text(document_xhtml)
        markup.convert_markers_to_labels()
        self.assertGreater(len(markup.text), 100)
        self.assertGreater(len(markup.labels['paragraphs']), 1)

        self.assertGreater(len(markup.labels['heading_1']), 1)
        self.assertGreater(len(markup.labels['heading_2']), 1)
        # cut the first-level headings out of the plain text by their labels
        top_headings = []
        for heading_start, heading_end in markup.labels['heading_1']:
            top_headings.append(markup.text[heading_start:heading_end])
        self.assertEqual('1. Heading One', top_headings[0].strip(' \n'))
        self.assertEqual('2. {_GoBack} Heading 2', top_headings[1].strip(' \n'))

        self.assertGreater(len(markup.labels['a']), 0)

        found_sections = markup.find_sections()
        self.assertGreater(len(found_sections), 1)

        self.assertTrue("de Finibus Bonorum & et Malorum" in markup.text)

Пример #12

0

Показать файл

Файл: test_tika_xhtml_images.py Проект: fagan2888/lexpredict-contraxsuite

    def test_ocr_little_text_scanned(self):
        """
        Parse the 'long' and the 'short' mixed OCR resources in
        STORE_IF_MORE_TEXT mode: the first one must keep two image labels,
        the second one must keep none, and the first text must be more
        than 100 characters longer than the second one.
        """
        def parse_resource(resource_path):
            # load the XHTML resource and parse it with a fresh parser
            xhtml = load_resource_document(resource_path, encoding='utf-8')
            ocr_settings = XhtmlParsingSettings(
                ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
                ocr_vector_text_min_length=100)
            return TikaXhtmlParser(pars_settings=ocr_settings).parse_text(xhtml)

        long_markup = parse_resource('parsing/xhtml_ocr_mixed_long.xhtml')
        self.assertGreater(len(long_markup.text), 100)
        self.assertEqual(2, len(long_markup.labels['images']))
        length_with_ocr = len(long_markup.text)

        short_markup = parse_resource('parsing/xhtml_ocr_mixed_short.xhtml')
        no_images = ('images' not in short_markup.labels
                     or len(short_markup.labels['images']) == 0)
        self.assertTrue(no_images)
        length_without_ocr = len(short_markup.text)

        self.assertGreater(length_with_ocr - length_without_ocr, 100)

Пример #13

0

Показать файл

class TikaParsingWrapper:
    """
    Parses file (provided by path) by Tika's local JAR file or calling
    Tika's server to parse the file.
    Can process XHTML or plain text Tika's output.
    """

    # name of the request header that tells Tika's server how to parse the file
    TIKA_URL_FLAG_MODE = 'pdf-parse'

    # the same flag as environment variable
    TIKA_ENV_VAR_FLAG_MODE = 'LEXNLP_TIKA_PARSER_MODE'

    # flag's value used when OCR is enabled (enable_ocr=True)
    TIKA_MODE_OCR = 'pdf_ocr'

    # flag's value used when OCR is disabled (enable_ocr=False)
    TIKA_MODE_PDF_ONLY = 'pdf_only'

    def __init__(self):
        """
        Create the XHTML parser and prepare the command lines used to start
        Tika as a local Java process: the default one, the LexNLP one and,
        when configured, the one with OCR switched off.
        """
        self.xhtml_parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_ALWAYS,
            remove_extra_newlines=False))
        self.tika_files_path = tempfile.gettempdir()
        self.tika_jar_path = tempfile.gettempdir()

        from django.conf import settings
        jar_base_path = settings.JAR_BASE_PATH

        tika_cls_name = 'org.apache.tika.cli.TikaCLI'
        tika_cp = ':'.join(
            [os.path.join(jar_base_path, jar) for jar in settings.TIKA_JARS])

        self.tika_default_command_list = [
            'java', '-cp', tika_cp,
            '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider',
            tika_cls_name
        ]
        # independent copy: the LexNLP command gets its own config option below
        self.tika_lexnlp_default_command_list = self.tika_default_command_list[:]
        from apps.task.app_vars import TIKA_CUSTOM_CONFIG, TIKA_NOOCR_CUSTOM_CONFIG, TIKA_LEXNLP_CUSTOM_CONFIG

        # the "no OCR" command is derived from the default command BEFORE
        # the default command receives its own "--config" option
        custom_noocr_tika_config = TIKA_NOOCR_CUSTOM_CONFIG.val
        self.tika_noocr_default_command_list = None
        if custom_noocr_tika_config:
            conf_full_path = os.path.join(jar_base_path,
                                          custom_noocr_tika_config)
            self.tika_noocr_default_command_list = self.tika_default_command_list + [
                f'--config={conf_full_path}'
            ]

        custom_tika_config = TIKA_CUSTOM_CONFIG.val
        if custom_tika_config:
            conf_full_path = os.path.join(jar_base_path, custom_tika_config)
            self.tika_default_command_list += [f'--config={conf_full_path}']

        # LexNLP (plugin) Tika config path
        custom_lexp_tika_config = TIKA_LEXNLP_CUSTOM_CONFIG.val
        if custom_lexp_tika_config:
            conf_full_path = os.path.join(jar_base_path,
                                          custom_lexp_tika_config)
            self.tika_lexnlp_default_command_list += [
                f'--config={conf_full_path}'
            ]

    @staticmethod
    def _log_info(logger, message: str) -> None:
        """
        Write the message with logger.info(); do nothing when logger is None
        (logger is an optional argument of the parsing methods).
        """
        if logger is not None:
            logger.info(message)

    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will use plain text "stripper" and transform the source document into plain text
        inside its (Java) process.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: task object, passed through to read_output()
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings, may be None
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises Exception: when Tika's output cannot be processed by _parse()
        """
        # the parsing mode is passed to the Java process via environment variable
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            self._log_info(logger,
                           f'TIKA parsing {original_file_name}:\n{line}')

        self._log_info(logger, f'Tika (plain text) args: {", ".join(cmd)}')

        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''

        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'],
                                meta=ptr_val['metadata'])
        except Exception as ex:
            # attach the beginning of Tika's output to simplify diagnostics
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex

    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               task: Any,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
        plus extra formatting information plus metadata.
        Two command lines are tried in turn: if the first one produces no text,
        the default Tika command is used as the second (and last) attempt.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: task object, passed through to read_output()
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings, may be None
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises Exception: when Tika's XHTML output cannot be processed
        """
        # the parsing mode is passed to the Java process via environment variable
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def err(line):
            self._log_info(logger,
                           f'TIKA parsing {original_file_name}:\n{line}')

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list

        parse_commands = [
            tika_default_command_list, self.tika_default_command_list
        ]
        from apps.document.app_vars import TIKA_PROCESS_RAM_MB_LIMIT
        ram_limit = TIKA_PROCESS_RAM_MB_LIMIT.val

        for cmd_index, cmd_list in enumerate(parse_commands):
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
            if ram_limit:
                # JVM options must go right after the "java" executable
                java_index = cmd.index('java')
                cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit}m'
                                              ] + cmd[java_index + 1:]
            self._log_info(logger, f'Tika (XHTML) args: {", ".join(cmd)}')

            last_try = cmd_index == len(parse_commands) - 1
            text = read_output(cmd,
                               stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout,
                               task=task) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = output.pure_text_length if output else 0
                self._log_info(
                    logger,
                    f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
                )
                # nothing was extracted: give the next command a chance
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                # attach the beginning of Tika's output to simplify diagnostics
                text_sample = text[:255] if text and isinstance(
                    text, str) else str(text)
                raise Exception(
                    'Error in parse_file_local_xhtml -> parse_text(). Text:\n' +
                    text_sample) from ex

    def parse_file_on_server(self,
                             option: str,
                             url_or_path: str,
                             server_endpoint: str = None,
                             enable_ocr: bool = True) -> Dict:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a server.
        :param option: key of the Tika service to call, see parse()
        :param url_or_path: local path (or URL) to the file being parsed
        :param server_endpoint: Tika server's URL
        :param enable_ocr: allow (True) converting images to text
        :return: the value returned by parse()
        """
        parse_mode = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        return self.parse(option,
                          url_or_path,
                          server_endpoint,
                          extra_headers={self.TIKA_URL_FLAG_MODE: parse_mode})

    def parse(self,
              option: str,
              url_or_path: str,
              server_endpoint: str = None,
              verbose: int = 0,
              tika_server_jar: str = None,
              response_mime_type: str = 'application/json',
              services: dict = None,
              raw_response: bool = False,
              extra_headers: Dict[str, str] = None) -> Dict:
        """
        The method is called from parse_file_on_server to parse the file
        calling Tika as a server.
        :param option: key in "services" that selects the server's URL path;
            unknown keys fall back to the 'all' service
        :param url_or_path: local path (or URL) to the file being parsed
        :param server_endpoint: Tika server's URL
        :param verbose: make Tika produce verbose log
        :param tika_server_jar: path to Tika's JAR file
        :param response_mime_type: response format (application/json) for plain text + metadata in JSON format;
            replaced with 'text/plain' when the '/tika' service is selected
        :param services: mapping of option to the server's URL path,
            by default 'meta', 'text' and 'all' are available
        :param raw_response: get raw response from Tika (text + metadata + warnings), False by default
        :param extra_headers: extra request headers, override the default ones
        :return: the result of _parse() applied to the server's (status, response)
        """

        services = services if services else \
            {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
        tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
        # NOTE(review): self.server_endpoint is not assigned anywhere in this
        # class - confirm it is provided elsewhere before relying on the default
        server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

        path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
        try:
            service = services.get(option, services['all'])
            if service == '/tika':
                response_mime_type = 'text/plain'
            content_path = self.make_content_disposition_header(path)

            headers = {
                'Accept': response_mime_type,
                'Content-Disposition': content_path
            }
            if extra_headers:
                headers = {**headers, **extra_headers}

            # the file is closed as soon as the request is completed
            with open(path, 'rb') as file_obj:
                status, response = callServer('put',
                                              server_endpoint,
                                              service,
                                              file_obj,
                                              headers,
                                              verbose,
                                              tika_server_jar,
                                              rawResponse=raw_response)
        finally:
            # a downloaded copy is removed even if the request has failed
            if file_type == 'remote':
                os.unlink(path)
        return _parse((status, response))

    def make_content_disposition_header(self, fn):
        """
        Build the Content-Disposition header value for the file.
        :param fn: path to the file, only its base name goes to the header
        :return: string like 'attachment; filename=<base name>'
        """
        return 'attachment; filename=%s' % os.path.basename(fn)

Python TikaXhtmlParser примеры использования