Пример #1
0
 def scrape_file(self):
     """
     Record whether the file to be scraped exists.

     A missing filename or a path that is not a regular file is recorded
     in the errors, an existing file is recorded in the messages. A
     DummyMeta stream is appended in every case.
     """
     if not self.filename:
         self._errors.append("No filename given.")
     else:
         found = os.path.isfile(self.filename)
         template = ("File {} was found." if found
                     else "File {} does not exist.")
         report = template.format(decode_path(self.filename))
         # Existing files are reported as messages, missing ones as errors
         target = self._messages if found else self._errors
         target.append(report)

     self.streams.append(DummyMeta())
Пример #2
0
    def construct_xsd(self, document_tree):
        """
        Construct one schema file for the given document tree.

        Every xsi:schemaLocation and xsi:noNamespaceSchemaLocation
        attribute found in the document is turned into an xs:import
        element that is appended to a schema built from SCHEMA_TEMPLATE.
        If at least one such attribute was found, the schema is written to
        a temporary file and self._has_constructed_schema is set to True.

        :document_tree: Parsed XML document whose schema references are
                        collected
        :returns: Path to the constructed XSD schema, or an empty list if
                  the document does not reference any schemas
        """

        xsd_exists = False

        parser = etree.XMLParser(dtd_validation=False, no_network=True)
        schema_tree = etree.XML(SCHEMA_TEMPLATE, parser)

        schema_locations = set(
            document_tree.xpath("//*/@xsi:schemaLocation",
                                namespaces={"xsi": XSI}))
        for schema_location in schema_locations:
            xsd_exists = True

            namespaces_locations = schema_location.strip().split()
            # Import all found namespace/schema location pairs. The
            # attribute value is a flat list "ns1 loc1 ns2 loc2 ...", so
            # the same iterator is zipped with itself to get the pairs.
            for namespace, location in zip(*[iter(namespaces_locations)] * 2):
                xs_import = etree.Element(XS + "import")
                xs_import.attrib["namespace"] = namespace
                xs_import.attrib["schemaLocation"] = location
                schema_tree.append(xs_import)

        schema_locations = set(
            document_tree.xpath("//*/@xsi:noNamespaceSchemaLocation",
                                namespaces={"xsi": XSI}))
        for schema_location in schema_locations:
            xsd_exists = True

            # Check if the XSD file is found next to the scraped file, and
            # prefer that local copy over the location given in the document
            local_schema_location = os.path.join(
                os.path.dirname(self.filename), encode_path(schema_location))
            if os.path.isfile(local_schema_location):
                schema_location = local_schema_location

            xs_import = etree.Element(XS + "import")
            xs_import.attrib["schemaLocation"] = decode_path(schema_location)
            schema_tree.append(xs_import)

        if not xsd_exists:
            # Callers only test the return value for truthiness
            return []

        # Construct the schema. mkstemp() hands back an already open
        # OS-level file descriptor; close it right away so that it is not
        # leaked, since the schema is written through the path instead.
        descriptor, schema = tempfile.mkstemp(prefix="file-scraper-",
                                              suffix=".tmp")
        os.close(descriptor)
        etree.ElementTree(schema_tree).write(schema)
        self._has_constructed_schema = True

        return schema
Пример #3
0
 def identify(self):
     """
     Identify the file format using the PRONOM registry.

     The locally available signature versions are written to the
     ``defaults`` mapping before identify_file() is called for the file.
     """
     pronom = get_local_pronom_versions()
     defaults["xml_pronomSignature"] = pronom.pronom_signature
     defaults["containersignature_file"] = pronom.pronom_container_signature
     defaults["xml_fidoExtensionSignature"] = pronom.fido_extension_signature
     defaults["format_files"] = [
         defaults["xml_pronomSignature"],
         defaults["xml_fidoExtensionSignature"],
     ]

     # Python's zipfile module used internally by FIDO doesn't support
     # paths that are provided as byte strings
     text_path = decode_path(self.filename)
     self.identify_file(filename=text_path, extension=False)
Пример #4
0
    def scrape_file(self):
        """
        Populate streams with supported metadata objects.

        The file is parsed with MediaInfo. For every track index, each
        class in self._supported_metadata that supports the chosen MIME
        type guess is instantiated and appended to self.streams. Problems
        are recorded in self._errors and progress in self._messages;
        nothing is returned.

        :raises: AttributeError if self._params does not contain the key
                 'mimetype_guess'
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        if "mimetype_guess" not in self._params:
            raise AttributeError("MediainfoScraper was not given a parameter "
                                 "dict containing key 'mimetype_guess'.")

        try:
            mediainfo = MediaInfo.parse(decode_path(self.filename))
        except Exception as exception:  # pylint: disable=broad-except
            # Any failure in the parser is reported as a scraping error
            # instead of being propagated to the caller.
            self._errors.append("Error in analyzing file.")
            self._errors.append(six.text_type(exception))
            self._check_supported()
            return

        if not self._tracks_ok(mediainfo):
            return
        self._messages.append("The file was analyzed successfully.")

        mime_guess = self._choose_mimetype_guess()

        tracks = mediainfo.tracks
        for index, _track in enumerate(tracks):
            for md_class in self._supported_metadata:
                if not md_class.is_supported(mime_guess):
                    continue
                md_object = md_class(tracks, index, mime_guess,
                                     self._given_mimetype,
                                     self._given_version)
                # The first track is left out when the metadata object
                # reports that there is no container.
                if index == 0 and not md_object.hascontainer():
                    continue
                self.streams.append(md_object)

        # Files scraped with SimpleMediainfoMeta will have (:unav) MIME type,
        # but for other scrapes the tests need to be performed without
        # allowing unav MIME types.
        if self.streams and isinstance(self.streams[0], SimpleMediainfoMeta):
            self._check_supported(allow_unav_mime=True,
                                  allow_unav_version=True)
            return
        self._check_supported(allow_unav_version=True, allow_unap_version=True)
    def scrape_file(self):
        """
        Populate streams with supported metadata objects.

        The file is parsed with MediaInfo and the models yielded by
        iterate_models() for each track are added to self.streams.
        Parsing problems are recorded in self._errors.
        """
        try:
            mediainfo = MediaInfo.parse(decode_path(self.filename))
        except Exception as error:  # pylint: disable=broad-except
            self._errors.append("Error in analyzing file.")
            self._errors.append(six.text_type(error))
            self._check_supported()
            return

        if not self._tracks_ok(mediainfo):
            return
        self._messages.append("The file was analyzed successfully.")

        for index, track in enumerate(mediainfo.tracks):
            # Only the first stream gets a version, see below
            version = None

            if not self.streams:
                # Use predefined mimetype/version for first stream, and
                # detected mimetype for other streams
                mimetype = self._predefined_mimetype
                version = self._predefined_version
            elif self._predefined_mimetype == 'audio/x-wav':
                # WAV is a special container format. For WAV files,
                # no distinction between container and soundtrack needs
                # to be made, as both are treated as one in the DPS.
                mimetype = 'audio/x-wav'
            elif file_scraper.mediainfo.track_mimetype(
                    mediainfo.tracks[0]) == 'audio/x-wav':
                # Same WAV special case, detected from the first track
                mimetype = 'audio/x-wav'
            else:
                mimetype = file_scraper.mediainfo.track_mimetype(track)

            # Add track as stream
            track_streams = list(
                self.iterate_models(mimetype=mimetype,
                                    version=version,
                                    tracks=mediainfo.tracks,
                                    index=index))
            self.streams += track_streams

        self._check_supported(allow_unav_version=True, allow_unap_version=True)
Пример #6
0
    def scrape_file(self):
        """
        Check XML file with Xmllint and record the results.

        Strategy for XML file check is
            1) Try to check syntax by opening file.
            2) If there's DTD specified in file check against that.
            3) If there's no DTD and we have external XSD check against
               that.
            4) If there's no external XSD read schemas used in file and do
               check against them with schema catalog.

        Nothing is returned. Successful checks are recorded in
        self._messages and failures in self._errors. Streams are added with
        self._add_streams() only when the document passed the checks.

        .. seealso:: https://wiki.csc.fi/wiki/KDK/XMLTiedostomuotojenSkeemat
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return

        # Try to check syntax by opening file in XML parser. The context
        # manager closes the file also when the parser raises an error.
        try:
            with io_open(self.filename, "rb") as file_:
                parser = etree.XMLParser(dtd_validation=False,
                                         no_network=True)
                tree = etree.parse(file_, parser=parser)
        except etree.XMLSyntaxError as exception:
            self._errors.append("Failed: document is not well-formed.")
            self._errors.append(six.text_type(exception))
            return
        except IOError as exception:
            self._errors.append("Failed: missing file.")
            self._errors.append(six.text_type(exception))
            return

        # Try check against DTD
        if tree.docinfo.doctype:
            (exitcode, stdout, stderr) = self.exec_xmllint(dtd_check=True)

        # Try check against XSD
        else:
            if not self._schema:
                self._schema = self.construct_xsd(tree)
                if not self._schema:
                    # No given schema and didn't find included schemas but
                    # XML was well formed.
                    self._messages.append("Success: Document is well-formed "
                                          "but does not contain schema.")
                    self._add_streams(tree)
                    self._check_supported()
                    return

            (exitcode, stdout, stderr) = self.exec_xmllint(schema=self._schema)

        valid = exitcode == 0
        if valid:
            self._messages.append("%s Success\n%s" %
                                  (decode_path(self.filename), stdout))
        else:
            self._errors += stderr.splitlines()

        # Clean up constructed schemas. This is done for failed checks as
        # well, otherwise the temporary schema file would be left behind.
        if self._has_constructed_schema:
            os.remove(self._schema)

        if not valid:
            return

        self._add_streams(tree)
        self._check_supported()