def check(self, exc):
    """
    Detect a failed "import-object" command caused by a multi-page TIFF
    and turn it into a PreservationError.

    :param exc: Error raised by the failed command. Must provide `cmd`
                (sequence whose first item is the command name and last
                item is the file path) and `stderr` (bytes or None).
                Presumably a `subprocess.CalledProcessError`; verify
                against the caller.

    :raises PreservationError: If the command was "import-object", the
                               file has a .tif/.tiff extension and stderr
                               contains the multiple streams message.
    :returns: None if the error is not recognized by this check
    """
    if exc.cmd[0] != "import-object":
        return

    file_path = exc.cmd[-1]
    file_ext = file_path.split(".")[-1].lower()

    if file_ext not in ("tif", "tiff"):
        return

    # stderr is raw tool output: it may be missing entirely or contain
    # bytes that are not valid UTF-8 (e.g. echoed file names). Decode
    # leniently so this check never masks the original error with a
    # UnicodeDecodeError/AttributeError of its own.
    stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

    is_multipage = (
        "The file contains multiple streams which is supported only for "
        "video containers." in stderr
    )

    if is_multipage:
        raise PreservationError(
            detail=(
                f"TIFF file {exc.cmd[-1]} contains multiple pages and is "
                f"not currently allowed for preservation."
            ),
            error="Multi-page TIFF not allowed"
        )
def check_files(self):
    """
    Verify that every file in the SIP has a supported file format.

    The format is taken from each file's `suffix` attribute: the leading
    dot is dropped and the rest is lowercased before it is looked up in
    SUPPORTED_FORMATS.

    :raises PreservationError: On the first file whose format is not
                               listed in SUPPORTED_FORMATS
    """
    for sip_file in self.all_files:
        extension = sip_file.suffix[1:].lower()

        if extension in SUPPORTED_FORMATS:
            # Supported; move on to the next file
            continue

        detail = (
            f"File format {extension} in SIP {self.sip_filename} "
            "not supported for preservation."
        )
        raise PreservationError(
            detail=detail,
            error=f"Unsupported file format: {extension}"
        )
def check(self, exc):
    """
    Detect a failed "import-object" command caused by a JPEG file that
    did not pass MIME type detection and turn it into a PreservationError.

    :param exc: Error raised by the failed command. Must provide `cmd`
                (sequence whose first item is the command name and last
                item is the file path) and `stderr` (bytes or None).
                Presumably a `subprocess.CalledProcessError`; verify
                against the caller.

    :raises PreservationError: If the command was "import-object", the
                               file has a .jpg/.jpeg extension and stderr
                               contains the unsupported MIME type message.
    :returns: None if the error is not recognized by this check
    """
    if exc.cmd[0] != "import-object":
        return

    file_path = exc.cmd[-1]
    file_ext = file_path.split(".")[-1].lower()

    if file_ext not in ("jpg", "jpeg"):
        return

    # stderr is raw tool output: it may be missing entirely or contain
    # bytes that are not valid UTF-8. Decode leniently so this check
    # never masks the original error with an exception of its own.
    stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

    mime_type_detection_failed = (
        "MIME type not supported by this scraper." in stderr
    )

    if mime_type_detection_failed:
        raise PreservationError(
            detail=(
                f"JPEG file {exc.cmd[-1]} didn't pass MIME type detection"
            ),
            error="JPEG MIME type detection failed"
        )
def check(self, exc):
    """
    Detect a failed "import-object" command caused by a JPEG file with an
    unsupported file format version and turn it into a PreservationError.

    :param exc: Error raised by the failed command. Must provide `cmd`
                (sequence whose first item is the command name and last
                item is the file path) and `stderr` (bytes or None).
                Presumably a `subprocess.CalledProcessError`; verify
                against the caller.

    :raises PreservationError: If the command was "import-object", the
                               file has a .jpg/.jpeg extension and stderr
                               contains the unsupported version message.
    :returns: None if the error is not recognized by this check
    """
    if exc.cmd[0] != "import-object":
        return

    file_path = exc.cmd[-1]
    file_ext = file_path.split(".")[-1].lower()

    if file_ext not in ("jpg", "jpeg"):
        return

    # stderr is raw tool output: it may be missing entirely or contain
    # bytes that are not valid UTF-8. Decode leniently so this check
    # never masks the original error with an exception of its own.
    stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

    version_not_detected = (
        "File format version is not supported." in stderr
    )

    if version_not_detected:
        raise PreservationError(
            detail=(
                f"JPEG version not supported for {file_path}"
            ),
            error="JPEG version not supported"
        )
def check(self, exc):
    """
    Detect a failed "import-object" command caused by a TIFF file that
    failed JHOVE validation and turn it into a PreservationError.

    Unlike the extension-based checks, the file type is inferred from the
    validator output: the error must have been reported by JHOVE's
    TIFF-hul module.

    :param exc: Error raised by the failed command. Must provide `cmd`
                (sequence whose first item is the command name and last
                item is the file path) and `stderr` (bytes or None).
                Presumably a `subprocess.CalledProcessError`; verify
                against the caller.

    :raises PreservationError: If the command was "import-object" and
                               stderr contains a validator error reported
                               by the TIFF-hul module.
    :returns: None if the error is not recognized by this check
    """
    if exc.cmd[0] != "import-object":
        return

    # stderr is raw tool output: it may be missing entirely or contain
    # bytes that are not valid UTF-8. Decode leniently so this check
    # never masks the original error with an exception of its own.
    stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

    if "Validator returned error" not in stderr:
        return

    # TODO: Maybe parse the actual XML output?
    # However, dpres-siptools output may not be stable, so it could be
    # a flaky solution without much benefit for now.

    # Error was produced by JHOVE's TIFF-hul report module
    is_tiff_error = ">TIFF-hul</reportingModule>" in stderr

    if is_tiff_error:
        raise PreservationError(
            detail=(
                f"TIFF file {exc.cmd[-1]} failed JHOVE validation, and "
                f"is likely invalid."
            ),
            error="TIFF file failed JHOVE validation"
        )
def check(self, exc):
    """
    Detect a failed "import-object" command caused by an MPO image with a
    JPEG file extension and turn it into a PreservationError.

    :param exc: Error raised by the failed command. Must provide `cmd`
                (sequence whose first item is the command name and last
                item is the file path) and `stderr` (bytes or None).
                Presumably a `subprocess.CalledProcessError`; verify
                against the caller.

    :raises PreservationError: If the command was "import-object", the
                               file has a .jpg/.jpeg extension and stderr
                               reports an 'image/jpeg' vs. 'image/mpo'
                               MIME type conflict.
    :returns: None if the error is not recognized by this check
    """
    if exc.cmd[0] != "import-object":
        return

    file_path = exc.cmd[-1]
    file_ext = file_path.split(".")[-1].lower()

    if file_ext not in ("jpg", "jpeg"):
        return

    # stderr is raw tool output: it may be missing entirely or contain
    # bytes that are not valid UTF-8. Decode leniently so this check
    # never masks the original error with an exception of its own.
    stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

    mpo_found = (
        "Conflict with existing value 'image/jpeg' and new value "
        "'image/mpo'"
    ) in stderr

    if mpo_found:
        raise PreservationError(
            detail=(
                f"MPO image file {exc.cmd[-1]} is not supported"
            ),
            error="MPO JPEG files not supported"
        )
def mock_download_object(object_id, package_dir, sip_id):
    """
    Stand-in that ignores its arguments and always fails with a
    PreservationError describing an unsupported filename.
    """
    failure = PreservationError(
        detail="Mock detailed error message",
        error="Filename was not supported"
    )
    raise failure
async def download_attachment(self, item_id: int) -> MuseumAttachment:
    """
    Download an attachment if it hasn't been downloaded already and
    return a MuseumAttachment instance

    The Multimedia XML document is retrieved first (via
    `retrieve_cached_xml`) and stored as `Multimedia.xml` in the item's
    attachment directory; the attachment itself is saved next to it under
    the filename reported by the XML document.

    :param item_id: ID of the Multimedia item whose attachment to download

    :raises PreservationError: If the attachment is named "Multimedia.xml"
                               or its filename contains non-ASCII
                               characters
    :returns: MuseumAttachment created from the Multimedia XML document.
              It is returned also when the server has no attachment for
              the item (HTTP 404) or the attachment is empty; in those
              cases no attachment file is left on disk.
    """
    attachment_dir = self.attachment_dir / str(item_id)
    attachment_dir.mkdir(exist_ok=True, parents=True)

    etree = await retrieve_cached_xml(
        session=self.session,
        url=f"{MUSEUMPLUS_URL}/module/Multimedia/{item_id}",
        path=attachment_dir / "Multimedia.xml")

    attachment = MuseumAttachment(etree)
    filename = attachment.filename

    if filename == "Multimedia.xml":
        # If the attachment is named "Multimedia.xml", it would overwrite
        # the XML document we have already saved in the same directory.
        # In this case, raise a PreservationError to prevent packaging
        # and the file from being silently overwritten.
        raise PreservationError(
            detail=(
                "The attachment filename 'Multimedia.xml' conflicts with "
                "the Multimedia XML document we have already downloaded."),
            error="Filename 'Multimedia.xml' not allowed for attachment")

    # Raise PreservationError if attachment filename contains non-ASCII
    # characters, as those are not yet supported by the DPRES service.
    # See CSC ticket #402027.
    # TODO: Remove once the filename issue at DPRES service is fixed
    try:
        filename.encode("ascii")
    except UnicodeEncodeError as exc:
        raise PreservationError(
            detail=(
                f"Filename {filename} contains non-ASCII characters and "
                f"can't be uploaded into the DPRES service at this time."),
            error="Filename contains non-ASCII characters") from exc

    attachment_path = attachment_dir / filename

    # Only download the attachment if it doesn't exist already
    # This way repeated download attempts don't redownload files
    # we already have
    # TODO: Can we use anything else to determine whether the file
    # is complete? MuseumPlus doesn't seem to provide a file size
    # or a checksum.
    if os.path.exists(attachment_path):
        logger.debug(f"Skipping existing attachment {filename}")
        return attachment

    logger.info(
        f"Downloading attachment {filename} for object "
        f"{self.museum_object.object_id}")

    # Use a temporary filename during download so we don't mistake
    # half-finished downloads for complete files
    temp_path = attachment_path.with_suffix(
        f"{attachment_path.suffix}.download")

    response = await self.session.get(
        f"{MUSEUMPLUS_URL}/module/Multimedia/{item_id}/attachment",
        headers={"Accept": "application/octet-stream"})

    try:
        if response.status == 404:
            # No attachment exists for this Multimedia instance; only
            # return the metadata
            return attachment

        response.raise_for_status()

        # TODO: If we can determine the file size beforehand, we could use
        # fallocate to allocate the required disk space and then
        # download the file.
        async with aiofiles.open(temp_path, "wb") as file_:
            while True:
                chunk = await response.content.read(CHUNK_SIZE)

                if not chunk:
                    break

                await file_.write(chunk)
    finally:
        # Always give the connection back, also when we bail out without
        # reading the body (404) or when writing the file fails. Without
        # this the response stays open until it is garbage collected.
        # NOTE(review): assumes an aiohttp-style response where
        # `release()` is a plain, idempotent method; confirm against the
        # session implementation.
        response.release()

    if temp_path.stat().st_size == 0:
        # File is empty; remove it and only package the metadata XML
        logger.debug(f"Deleting empty attachment {filename}")
        temp_path.unlink()
        return attachment

    # Download finished; we can safely rename it now
    os.rename(temp_path, attachment_path)

    return attachment
def mock_create_sip(object_id, package_dir, sip_id, create_date,
                    modify_date, update):
    """
    Stand-in that ignores its arguments and always fails with a
    PreservationError describing an unsupported file format.
    """
    failure = PreservationError(
        detail="Mock error message.",
        error="Unsupported file format: wad"
    )
    raise failure