def parse_file_local(self,
                     local_path: str,
                     original_file_name: str,
                     timeout: int = 60,
                     encoding_name: str = 'utf-8',
                     logger: ProcessLogger = None,
                     enable_ocr: bool = True) -> Dict:
    """
    Parses a file by running Tika as a local process and returns the parsed result.

    The Tika mode (OCR or PDF-only) is passed to the child process through the
    environment variable named by self.TIKA_ENV_VAR_FLAG_MODE.

    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, used only in log messages
    :param timeout: timeout in seconds, passed to read_output as timeout_sec
    :param encoding_name: encoding passed to Tika ('-e<encoding>') and to read_output
    :param logger: optional logger that receives the process' stderr lines;
                   when None the stderr lines are dropped
    :param enable_ocr: True selects TIKA_MODE_OCR, False selects TIKA_MODE_PDF_ONLY
    :return: whatever _parse() builds from the process output
    :raises Exception: when _parse() fails; the message carries the first
                       255 characters of the output, the cause is chained
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    # NOTE: this mutates the environment of the current process
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    cmd = self.tika_start_command_list + [
        '-J',
        '-t',
        f'-e{encoding_name}',
        local_path
    ]

    def err(line):
        # logger is optional (defaults to None): skip logging instead of
        # failing with AttributeError on the first stderr line
        if logger is not None:
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    # an empty / None output is normalized to '' so _parse always gets a string
    text = read_output(cmd, stderr_callback=err,
                       encoding=encoding_name,
                       timeout_sec=timeout) or ''
    try:
        return _parse((200, text))
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception(
            'Error in parse_default_pdf_ocr -> _parse(). Text:\n' + text_sample) from ex
def parse(self,
          option: str,
          url_or_path: str,
          server_endpoint: str = None,
          verbose: int = 0,
          tika_server_jar: str = None,
          response_mime_type: str = 'application/json',
          services: dict = None,
          raw_response: bool = False,
          extra_headers: Dict[str, str] = None) -> Dict:
    """
    The method is called from parse_file_on_server to parse the file
    calling Tika as a server.

    :param option: command line options to send to Tika's server
    :param url_or_path: local path (or URL) to the file being parsed
    :param server_endpoint: Tika server's URL, defaults to self.server_endpoint
    :param verbose: make Tika produce verbose log
    :param tika_server_jar: path to Tika's JAR file, defaults to self.tika_jar_path
    :param response_mime_type: response format (application/json) for plain text + metadata in JSON format;
                               forced to 'text/plain' when the selected service is '/tika'
    :param services: mapping of option name to server URL path; when empty the default
                     {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'} is used.
                     An unknown option falls back to the 'all' entry
    :param raw_response: get raw response from Tika (text + metadata + warnings), False by default
    :param extra_headers: extra request headers, they override the default ones on key clash
    :return: dictionary with "content" (text) and "metadata" (another dictionary) keys
    """
    services = services if services else \
        {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
    server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

    path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
    try:
        service = services.get(option, services['all'])
        if service == '/tika':
            response_mime_type = 'text/plain'
        content_path = self.make_content_disposition_header(path)

        headers = {
            'Accept': response_mime_type,
            'Content-Disposition': content_path
        }
        if extra_headers:
            headers = {**headers, **extra_headers}

        # the context manager closes the file handle even if the request fails
        with open(path, 'rb') as file_obj:
            status, response = callServer('put',
                                          server_endpoint,
                                          service,
                                          file_obj,
                                          headers,
                                          verbose,
                                          tika_server_jar,
                                          rawResponse=raw_response)
    finally:
        # a file fetched from a URL is a temporary copy: remove it
        # whether or not the request succeeded
        if file_type == 'remote':
            os.unlink(path)
    return _parse((status, response))
def parse_file_local_plain_text(self,
                                local_path: str,
                                original_file_name: str,
                                task: Any,
                                timeout: int = 60,
                                encoding_name: str = 'utf-8',
                                logger: ProcessLogger = None,
                                enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will use plain text "stripper" and transform the source document into plain text
    inside its (Java) process.

    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: passed through to read_output as its "task" argument
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings; optional,
                   nothing is logged when it is None
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    :raises Exception: when the output cannot be parsed or converted; the message
                       carries the first 255 characters of the output
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
    # don't use at all TIKA_MODE_PDF_ONLY
    # NOTE: both assignments mutate the environment of the current process
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
    os.environ[self.TIKA_PARSER_DETAIL] = ''

    tika_default_command_list = self.tika_lexnlp_default_command_list
    # a dedicated no-OCR command line is used only when it is configured
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list
    cmd = tika_default_command_list + [
        '-J',
        '-t',
        f'-e{encoding_name}',
        local_path
    ]

    def err(line):
        # logger is optional (defaults to None): skip logging instead of
        # failing with AttributeError on the first stderr line
        if logger is not None:
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    if logger is not None:
        logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

    # an empty / None output is normalized to '' so _parse always gets a string
    text = read_output(cmd, stderr_callback=err,
                       encoding=encoding_name,
                       timeout_sec=timeout,
                       task=task) or ''
    try:
        ptr_val = _parse((200, text))
        return MarkedUpText(text=ptr_val['content'], meta=ptr_val['metadata'])
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception(
            'Error in parse_default_pdf_ocr -> _parse(). Text:\n' + text_sample) from ex
def parse(self,
          option: str,
          url_or_path: str,
          server_endpoint: str = None,
          verbose: int = 0,
          tika_server_jar: str = None,
          response_mime_type: str = 'application/json',
          services: dict = None,
          raw_response: bool = False,
          extra_headers: Dict[str, str] = None) -> Dict:
    """
    Sends a file to a Tika server with a PUT request and returns the parsed response.

    :param option: key used to pick the server URL path from "services"
    :param url_or_path: local path (or URL) to the file being parsed
    :param server_endpoint: Tika server's URL, defaults to self.server_endpoint
    :param verbose: verbosity value passed to callServer
    :param tika_server_jar: path to Tika's JAR file, defaults to self.tika_jar_path
    :param response_mime_type: value of the "Accept" header; forced to 'text/plain'
                               when the selected service is '/tika'
    :param services: mapping of option name to server URL path; when empty the default
                     {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'} is used.
                     An unknown option falls back to the 'all' entry
    :param raw_response: passed to callServer as rawResponse
    :param extra_headers: extra request headers, they override the default ones on key clash
    :return: the result of _parse() applied to the (status, response) pair
    """
    services = services if services else \
        {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
    server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

    path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
    try:
        service = services.get(option, services['all'])
        if service == '/tika':
            response_mime_type = 'text/plain'
        content_path = self.make_content_disposition_header(path)

        headers = {
            'Accept': response_mime_type,
            'Content-Disposition': content_path
        }
        if extra_headers:
            headers = {**headers, **extra_headers}

        # the context manager closes the file handle even if the request fails
        with open(path, 'rb') as file_obj:
            status, response = callServer('put',
                                          server_endpoint,
                                          service,
                                          file_obj,
                                          headers,
                                          verbose,
                                          tika_server_jar,
                                          rawResponse=raw_response)
    finally:
        # a file fetched from a URL is a temporary copy: remove it
        # whether or not the request succeeded
        if file_type == 'remote':
            os.unlink(path)
    return _parse((status, response))