예제 #1
0
    def check(self, exc):
        """
        Raise a PreservationError if a failed "import-object" command
        rejected a TIFF file because it contains multiple pages.

        Returns without doing anything if the command is not
        "import-object", the file doesn't have a TIFF extension or the
        expected message is not found in the error output.

        :param exc: Error describing the failed command. `exc.cmd` is the
                    command as a sequence with the command name first and
                    the file path last; `exc.stderr` is the captured error
                    output as bytes.

        :raises PreservationError: If the error output reports multiple
                                   streams for a TIFF file
        """
        if exc.cmd[0] != "import-object":
            return

        file_path = exc.cmd[-1]
        file_ext = file_path.split(".")[-1].lower()

        if file_ext not in ("tif", "tiff"):
            return

        # Only a known ASCII message is searched for, so undecodable bytes
        # are replaced instead of letting a UnicodeDecodeError hide the
        # original failure. Missing output is treated as empty.
        stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

        is_multipage = (
            "The file contains multiple streams which is supported only for "
            "video containers." in stderr
        )

        if is_multipage:
            raise PreservationError(
                detail=(
                    f"TIFF file {file_path} contains multiple pages and is "
                    "not currently allowed for preservation."
                ),
                error="Multi-page TIFF not allowed"
            )
예제 #2
0
    def check_files(self):
        """
        Verify that every file in the SIP has a supported file format

        :raises PreservationError: For the first file in `self.all_files`
                                   whose lowercased extension is not
                                   listed in SUPPORTED_FORMATS
        """
        for path in self.all_files:
            # Drop the leading dot and normalize the case before the lookup
            extension = path.suffix[1:].lower()

            if extension in SUPPORTED_FORMATS:
                continue

            detail = (
                f"File format {extension} in SIP {self.sip_filename} "
                "not supported for preservation."
            )
            raise PreservationError(
                detail=detail,
                error=f"Unsupported file format: {extension}"
            )
예제 #3
0
    def check(self, exc):
        """
        Raise a PreservationError if a failed "import-object" command
        could not detect the MIME type of a JPEG file.

        Returns without doing anything if the command is not
        "import-object", the file doesn't have a JPEG extension or the
        expected message is not found in the error output.

        :param exc: Error describing the failed command. `exc.cmd` is the
                    command as a sequence with the command name first and
                    the file path last; `exc.stderr` is the captured error
                    output as bytes.

        :raises PreservationError: If the error output reports an
                                   unsupported MIME type for a JPEG file
        """
        if exc.cmd[0] != "import-object":
            return

        file_path = exc.cmd[-1]
        file_ext = file_path.split(".")[-1].lower()

        if file_ext not in ("jpg", "jpeg"):
            return

        # Only a known ASCII message is searched for, so undecodable bytes
        # are replaced instead of letting a UnicodeDecodeError hide the
        # original failure. Missing output is treated as empty.
        stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

        mime_type_detection_failed = (
            "MIME type not supported by this scraper." in stderr
        )

        if mime_type_detection_failed:
            raise PreservationError(
                detail=(
                    f"JPEG file {file_path} didn't pass MIME type detection"
                ),
                error="JPEG MIME type detection failed"
            )
예제 #4
0
    def check(self, exc):
        """
        Raise a PreservationError if a failed "import-object" command
        reported an unsupported file format version for a JPEG file.

        Returns without doing anything if the command is not
        "import-object", the file doesn't have a JPEG extension or the
        expected message is not found in the error output.

        :param exc: Error describing the failed command. `exc.cmd` is the
                    command as a sequence with the command name first and
                    the file path last; `exc.stderr` is the captured error
                    output as bytes.

        :raises PreservationError: If the error output reports an
                                   unsupported format version for a JPEG
                                   file
        """
        if exc.cmd[0] != "import-object":
            return

        file_path = exc.cmd[-1]
        file_ext = file_path.split(".")[-1].lower()

        if file_ext not in ("jpg", "jpeg"):
            return

        # Only a known ASCII message is searched for, so undecodable bytes
        # are replaced instead of letting a UnicodeDecodeError hide the
        # original failure. Missing output is treated as empty.
        stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

        version_not_detected = (
            "File format version is not supported." in stderr
        )

        if version_not_detected:
            raise PreservationError(
                detail=(
                    f"JPEG version not supported for {file_path}"
                ),
                error="JPEG version not supported"
            )
예제 #5
0
    def check(self, exc):
        """
        Raise a PreservationError if a failed "import-object" command
        contains a validation error reported by JHOVE's TIFF-hul module.

        Returns without doing anything if the command is not
        "import-object" or the expected markers are not found in the error
        output.

        :param exc: Error describing the failed command. `exc.cmd` is the
                    command as a sequence with the command name first and
                    the file path last; `exc.stderr` is the captured error
                    output as bytes.

        :raises PreservationError: If the error output contains a validator
                                   error from the TIFF-hul reporting module
        """
        if exc.cmd[0] != "import-object":
            return

        # Only known ASCII markers are searched for, so undecodable bytes
        # are replaced instead of letting a UnicodeDecodeError hide the
        # original failure. Missing output is treated as empty.
        stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

        if "Validator returned error" not in stderr:
            return

        # TODO: Maybe parse the actual XML output?
        # However, dpres-siptools output may not be stable, so it could be
        # a flaky solution without much benefit for now.

        # Error was produced by JHOVE's TIFF-hul report module
        is_tiff_error = ">TIFF-hul</reportingModule>" in stderr

        if is_tiff_error:
            raise PreservationError(
                detail=(
                    f"TIFF file {exc.cmd[-1]} failed JHOVE validation, and "
                    "is likely invalid."
                ),
                error="TIFF file failed JHOVE validation"
            )
예제 #6
0
    def check(self, exc):
        """
        Raise a PreservationError if a failed "import-object" command
        detected a file with a JPEG extension as an MPO image.

        Returns without doing anything if the command is not
        "import-object", the file doesn't have a JPEG extension or the
        expected message is not found in the error output.

        :param exc: Error describing the failed command. `exc.cmd` is the
                    command as a sequence with the command name first and
                    the file path last; `exc.stderr` is the captured error
                    output as bytes.

        :raises PreservationError: If the error output reports a conflict
                                   between the 'image/jpeg' and 'image/mpo'
                                   values
        """
        if exc.cmd[0] != "import-object":
            return

        file_path = exc.cmd[-1]
        file_ext = file_path.split(".")[-1].lower()

        if file_ext not in ("jpg", "jpeg"):
            return

        # Only a known ASCII message is searched for, so undecodable bytes
        # are replaced instead of letting a UnicodeDecodeError hide the
        # original failure. Missing output is treated as empty.
        stderr = (exc.stderr or b"").decode("utf-8", errors="replace")

        mpo_found = (
            "Conflict with existing value 'image/jpeg' and new value "
            "'image/mpo'"
        ) in stderr

        if mpo_found:
            raise PreservationError(
                detail=(
                    f"MPO image file {file_path} is not supported"
                ),
                error="MPO JPEG files not supported"
            )
예제 #7
0
 def mock_download_object(object_id, package_dir, sip_id):
     """
     Mock that ignores its arguments and always raises a PreservationError
     with a fixed detail and error message
     """
     failure = PreservationError(
         detail="Mock detailed error message",
         error="Filename was not supported")
     raise failure
예제 #8
0
    async def download_attachment(self, item_id: int) -> MuseumAttachment:
        """
        Retrieve the Multimedia XML document for the given item and download
        its attachment file unless the file already exists on disk

        :param item_id: ID of the Multimedia item. Used in the request URLs
                        and as the name of the item's directory under
                        `self.attachment_dir`.

        :raises PreservationError: If the attachment filename is
                                   'Multimedia.xml' or contains non-ASCII
                                   characters

        :returns: MuseumAttachment instance created from the XML document.
                  It is also returned when the server responds with 404 or
                  an empty file; in those cases no attachment file is left
                  on disk.
        """
        item_dir = self.attachment_dir / str(item_id)
        item_dir.mkdir(exist_ok=True, parents=True)

        multimedia_xml = await retrieve_cached_xml(
            session=self.session,
            url=f"{MUSEUMPLUS_URL}/module/Multimedia/{item_id}",
            path=item_dir / "Multimedia.xml")
        attachment = MuseumAttachment(multimedia_xml)

        filename = attachment.filename

        # An attachment named "Multimedia.xml" would overwrite the XML
        # document that was just saved into the same directory. Refuse it
        # so that packaging stops instead of the document being silently
        # replaced.
        if filename == "Multimedia.xml":
            raise PreservationError(
                detail=(
                    "The attachment filename 'Multimedia.xml' conflicts with "
                    "the Multimedia XML document we have already downloaded."),
                error="Filename 'Multimedia.xml' not allowed for attachment")

        # Non-ASCII filenames are not yet supported by the DPRES service,
        # so they are rejected here. See CSC ticket #402027.
        # TODO: Remove once the filename issue at DPRES service is fixed
        try:
            filename.encode("ascii")
        except UnicodeEncodeError:
            raise PreservationError(
                detail=(
                    f"Filename {filename} contains non-ASCII characters and "
                    f"can't be uploaded into the DPRES service at this time."),
                error="Filename contains non-ASCII characters")

        attachment_path = item_dir / attachment.filename

        # A file that already exists is never downloaded again, which keeps
        # repeated download attempts cheap
        if os.path.exists(attachment_path):
            logger.debug(f"Skipping existing attachment {attachment.filename}")
            return attachment

        # TODO: Can we use anything else to determine whether the file
        # is complete? MuseumPlus doesn't seem to provide a file size
        # or a checksum.
        logger.info(
            f"Downloading attachment {attachment.filename} for object "
            f"{self.museum_object.object_id}")

        # The data is written under a temporary name first so that a
        # half-finished download is never mistaken for a complete file
        partial_suffix = f"{attachment_path.suffix}.download"
        partial_path = attachment_path.with_suffix(partial_suffix)

        response = await self.session.get(
            f"{MUSEUMPLUS_URL}/module/Multimedia/{item_id}/attachment",
            headers={"Accept": "application/octet-stream"})

        # 404 means this Multimedia instance has no attachment; only the
        # metadata is returned
        if response.status == 404:
            return attachment

        response.raise_for_status()

        # TODO: If we can determine the file size beforehand, we could use
        # fallocate to allocate the required disk space and then
        # download the file.
        async with aiofiles.open(partial_path, "wb") as output:
            # Copy the body in CHUNK_SIZE pieces until the stream is empty
            data = await response.content.read(CHUNK_SIZE)
            while data:
                await output.write(data)
                data = await response.content.read(CHUNK_SIZE)

        downloaded_size = partial_path.stat().st_size
        if downloaded_size == 0:
            # Nothing was received; discard the empty file so that only the
            # metadata XML gets packaged
            logger.debug(
                f"Deleting empty attachment {attachment.filename}")
            partial_path.unlink()
            return attachment

        # The download is complete, so the file can take its final name
        os.rename(partial_path, attachment_path)

        return attachment
예제 #9
0
 def mock_create_sip(object_id, package_dir, sip_id, create_date,
                     modify_date, update):
     """
     Mock that ignores its arguments and always raises a PreservationError
     with a fixed detail and error message
     """
     failure = PreservationError(
         detail="Mock error message.",
         error="Unsupported file format: wad")
     raise failure