def download_and_analyze_file(self, file_id: int, address_pipeline: AddressPipeline, fallback_city: str) -> bool:
    """
    Downloads and analyses a single file, i.e. extracting text, locations and persons.
    Returns False for http errors on downloading and True otherwise.

    Side effects: uploads the raw file to minio, sets mime_type/filesize/
    parsed_text/page_count on the File row, replaces its locations and
    mentioned_persons relations, and saves the row.
    """
    file = File.objects.get(id=file_id)
    url = file.get_oparl_url()
    with NamedTemporaryFile() as tmpfile:
        try:
            content, content_type = self.loader.load_file(url)
            # Server-reported content type wins over whatever the API claimed,
            # but a mismatch is worth a warning.
            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type))
            file.mime_type = content_type or file.mime_type
            # Buffer the download so it can be re-read for upload/extraction.
            tmpfile.write(content)
            tmpfile.file.seek(0)
            file.filesize = len(content)
        except HTTPError:
            # Download failures are reported to the caller instead of raised;
            # note: only HTTPError is caught here, other request errors propagate.
            logger.exception("File {}: Failed to download {}".format(
                file.id, url))
            return False
        logger.debug("File {}: Downloaded {} ({}, {})".format(
            file.id, url, file.mime_type, filesizeformat(file.filesize)))
        # Store the raw bytes in minio under the file's database id.
        minio_client().put_object(
            minio_file_bucket,
            str(file.id),
            tmpfile.file,
            file.filesize,
            content_type=file.mime_type,
        )
        # If the api has text, keep that
        if self.download_files and not file.parsed_text:
            file.parsed_text, file.page_count = extract_from_file(
                tmpfile.file, tmpfile.name, file.mime_type, file.id)
    if file.parsed_text:
        # Geocode addresses and match person names found in the text.
        locations = extract_locations(file.parsed_text,
                                      pipeline=address_pipeline,
                                      fallback_city=fallback_city)
        file.locations.set(locations)
        persons = extract_persons(file.name + "\n" + (file.parsed_text or "") + "\n")
        file.mentioned_persons.set(persons)
        logger.debug("File {}: Found {} locations and {} persons".format(
            file.id, len(locations), len(persons)))
    else:
        logger.warning("File {}: Couldn't get any text".format(file.id))
    # NOTE(review): presumably closes stale db connections after the
    # long-running extraction before saving — confirm intent.
    db.connections.close_all()
    file.save()
    return True
def test_pdf_parsing(self):
    """Extract text from a known PDF and verify its content and page count."""
    pdf_path = os.path.join(test_media_root,
                            "Donald Knuth - The Complexity of Songs.pdf")
    with open(pdf_path, "rb") as pdf_file:
        parsed_text, page_count = extract_from_file(pdf_file, pdf_path,
                                                    "application/pdf", 0)
    self.assertIn("bottles of beer", parsed_text)
    self.assertEqual(3, page_count)
def test_pdf_parsing(pytestconfig, caplog):
    """Extract text from a known PDF; no warnings or errors may be logged."""
    pdf = (pytestconfig.rootpath / test_media_root /
           "Donald Knuth - The Complexity of Songs.pdf")
    with pdf.open("rb") as handle:
        parsed_text, page_count = extract_from_file(handle, pdf,
                                                    "application/pdf", 0)
    assert caplog.messages == []
    assert "bottles of beer" in parsed_text
    assert page_count == 3
def test_pdf_parsing_oom(pytestconfig, caplog):
    """Check error handling when pdftotext tries to use more than the allowed memory"""
    pdf = (pytestconfig.rootpath / test_media_root /
           "Donald Knuth - The Complexity of Songs.pdf")
    with pdf.open("rb") as handle:
        parsed_text, page_count = extract_from_file(handle, pdf,
                                                    "application/pdf", 0)
    expected = (
        "File 0: Failed to run pdftotext: Command '['pdftotext', "
        f"PosixPath('{pdf}'), '-']' returned non-zero exit status 127."
    )
    assert caplog.messages == [expected]
    # Text extraction failed, but the page count is still readable.
    assert parsed_text is None
    assert page_count == 3
def test_pdf_as_tiff(pytestconfig, caplog, filename):
    """A tiff tagged as pdf, making PyPDF2 fail.

    https://github.com/codeformuenster/kubernetes-deployment/pull/65#issuecomment-894232803
    """
    tiff = pytestconfig.rootpath / "testdata/media" / filename
    with tiff.open("rb") as handle:
        parsed_text, page_count = extract_from_file(handle, tiff,
                                                    "application/pdf", 0)
    assert caplog.messages == [
        "File 0: Failed to run pdftotext: Command '['pdftotext', "
        f"PosixPath('{tiff}'), '-']' returned non-zero exit status 1.",
        "File 0: Pdf does not allow to read the number of pages",
    ]
    assert not parsed_text
    assert not page_count
def download_and_analyze_file(self, file_id: int, address_pipeline: AddressPipeline, fallback_city: str) -> bool:
    """
    Downloads and analyses a single file, i.e. extracting text, locations and persons.
    Returns False for http errors on downloading and True otherwise.

    Side effects: optionally uploads the raw file to minio (unless a proxy
    template is configured), sets mime_type/filesize/parsed_text/page_count on
    the File row, replaces its locations and mentioned_persons relations, and
    saves the row. Also returns False when saving fails.
    """
    file = File.objects.get(id=file_id)
    url = file.get_oparl_url()
    with NamedTemporaryFile() as tmp_file:
        try:
            content, content_type = self.loader.load_file(url)
            # Server-reported content type wins over whatever the API claimed,
            # but a mismatch is worth a warning.
            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type))
            # An html response for a document download is treated as a server
            # error page delivered with status 200, so bail out.
            if content_type and content_type.split(";")[0] == "text/html":
                logger.error(
                    f"File {file.id}: Content type was {content_type}, this seems to be a silent error"
                )
                return False
            file.mime_type = content_type or file.mime_type
            # Buffer the download so it can be re-read for upload/extraction.
            tmp_file.write(content)
            tmp_file.file.seek(0)
            file.filesize = len(content)
        except RequestException as e:
            # Normal server error
            if e.response and 400 <= e.response.status_code < 600:
                logger.error(
                    f"File {file.id}: Failed to download {url} with error {e.response.status_code}"
                )
            else:
                # No response attached (connection error etc.) — log traceback.
                logger.exception(
                    f"File {file.id}: Failed to download {url}")
            return False
        logger.debug("File {}: Downloaded {} ({}, {})".format(
            file.id, url, file.mime_type, filesizeformat(file.filesize)))
        # When files are served through a proxy template there is no need to
        # keep a copy in minio.
        if not settings.PROXY_ONLY_TEMPLATE:
            minio_client().put_object(
                minio_file_bucket,
                str(file.id),
                tmp_file.file,
                file.filesize,
                content_type=file.mime_type,
            )
        # If the api has text, keep that
        if self.download_files and not file.parsed_text:
            file.parsed_text, file.page_count = extract_from_file(
                tmp_file.file, tmp_file.name, file.mime_type, file.id)
        if file.parsed_text:
            # Geocode addresses and match person names found in the text.
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            persons = extract_persons(file.name + "\n" + (file.parsed_text or "") + "\n")
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))
        else:
            logger.warning(f"File {file.id}: Couldn't get any text")
    try:
        # NOTE(review): presumably closes stale db connections after the
        # long-running extraction before saving — confirm intent.
        db.connections.close_all()
        file.save()
    except (ElasticsearchException, DatabaseError) as e:
        logger.exception(f"File {file.id}: Failed to save: {e}")
        return False
    return True