def parse(self, document_path, mime_type):
    """Extract text and creation date via Tika and build a PDF archive copy.

    On success sets ``self.archive_path``, ``self.date`` and ``self.text``.
    The attributes are only assigned after the PDF conversion has run, so a
    failing conversion leaves them untouched.

    Args:
        document_path: Path of the document handed to Tika.
        mime_type: Accepted for interface compatibility; not used here.

    Raises:
        ParseError: If the request to the Tika server fails with an HTTP
            error.
    """
    self.log(
        "info", f"[TIKA_PARSE] Sending {document_path} to Tika server")

    try:
        parsed = parser.from_file(document_path)
    except requests.exceptions.HTTPError as err:
        # Chain the cause so the original HTTP failure stays in the traceback.
        raise ParseError(
            f"Could not parse {document_path} with tika server: {err}"
        ) from err

    # Text and date are best-effort: any ordinary failure falls back to a
    # default. `except Exception` (rather than a bare `except`) lets
    # KeyboardInterrupt / SystemExit propagate.
    try:
        content = parsed["content"].strip()
    except Exception:
        content = ""

    try:
        creation_date = dateutil.parser.isoparse(
            parsed["metadata"]["Creation-Date"])
    except Exception:
        creation_date = None

    archive_path = os.path.join(self.tempdir, "convert.pdf")
    convert_to_pdf(self, document_path, archive_path)

    self.archive_path = archive_path
    self.date = creation_date
    self.text = content
def extract_metadata(self, document_path, mime_type):
    """Return the Tika metadata of a document as a list of entries.

    Each entry is a dict with the keys ``namespace`` and ``prefix`` (always
    empty strings), ``key`` and ``value``.

    Args:
        document_path: Path of the document handed to Tika.
        mime_type: Accepted for interface compatibility; not used here.

    Returns:
        One entry per metadata field, or an empty list when the Tika request
        fails or no metadata is available.
    """
    tika_server = settings.PAPERLESS_TIKA_ENDPOINT

    try:
        parsed = parser.from_file(document_path, tika_server)
    except Exception as e:
        self.log(
            "warning",
            f"Error while fetching document metadata for "
            f"{document_path}: {e}")
        return []

    # The metadata entry may be absent or None (e.g. when the server returns
    # no parse output); treat that as "no metadata" instead of failing while
    # iterating.
    metadata = parsed.get("metadata") or {}

    return [
        {
            "namespace": "",
            "prefix": "",
            "key": key,
            "value": value,
        }
        for key, value in metadata.items()
    ]
def parse(self, document_path, mime_type, file_name=None):
    """Extract text and creation date via Tika and build a PDF archive copy.

    Sets ``self.text`` and ``self.archive_path``; ``self.date`` is only
    assigned when a creation date could be parsed from the metadata.

    Args:
        document_path: Path of the document handed to Tika.
        mime_type: Accepted for interface compatibility; not used here.
        file_name: Passed through to ``self.convert_to_pdf``.

    Raises:
        ParseError: If the request to the Tika server fails for any reason.
    """
    self.log("info", f"Sending {document_path} to Tika server")
    tika_server = settings.PAPERLESS_TIKA_ENDPOINT

    try:
        parsed = parser.from_file(document_path, tika_server)
    except Exception as err:
        # Chain the cause so the underlying failure stays in the traceback.
        raise ParseError(
            f"Could not parse {document_path} with tika server at "
            f"{tika_server}: {err}"
        ) from err

    # Content may be None when Tika extracts no text; fall back to an empty
    # string rather than failing on `None.strip()`.
    content = parsed["content"]
    self.text = content.strip() if content is not None else ""

    # The creation date is optional: log and carry on when it is missing or
    # cannot be parsed as ISO-8601.
    try:
        self.date = dateutil.parser.isoparse(
            parsed["metadata"]["Creation-Date"])
    except Exception as e:
        self.log(
            "warning",
            f"Unable to extract date for document "
            f"{document_path}: {e}")

    self.archive_path = self.convert_to_pdf(document_path, file_name)