예제 #1
0
    def scrape_file(self):
        """Scrape A/V files.

        Probe the file with ffprobe to collect stream metadata, run an
        ffmpeg decode to /dev/null as a well-formedness check, deny
        non-linear PCM audio, and construct metadata models for the
        container (if any) and each stream.
        """
        try:
            probe_results = ffmpeg.probe(encode_path(self.filename))
            # Prepend the container-level "format" element so it is
            # processed alongside the individual streams; shift stream
            # indices by one so index 0 refers to the container.
            streams = [probe_results["format"]] + probe_results["streams"]
            for stream in streams:
                if "index" not in stream:
                    stream["index"] = 0
                else:
                    stream["index"] = stream["index"] + 1
        except ffmpeg.Error as err:
            self._errors.append("Error in analyzing file.")
            self._errors.append(ensure_text(err.stderr))
            # Probing failed, so "streams" was never bound: bail out
            # here instead of raising NameError further down.
            return

        shell = Shell([
            "ffmpeg", "-v", "error", "-i",
            encode_path(self.filename), "-f", "null", "-"
        ])

        if shell.returncode == 0:
            self._messages.append("The file was analyzed successfully.")

        if self._filter_stderr(shell.stderr):
            self._errors.append(shell.stderr)
            return

        # We deny e.g. A-law PCM, mu-law PCM, DPCM and ADPCM and allow
        # only signed/unsigned linear PCM. Note that we need this check
        # only if PCM audio is present. This should not be given e.g.
        # for video streams nor audio streams of another type (such as
        # MPEG).
        for stream in streams:
            codec_long_name = stream.get("codec_long_name", UNAV)
            if "PCM" in codec_long_name and not codec_long_name.startswith(
                    ("PCM signed", "PCM unsigned")):
                self._errors.append("%s does not seem to be LPCM format." %
                                    stream["codec_long_name"])

        container = False
        for index in range(len(streams)):
            # FFMpeg has separate "format" (relevant for containers) and
            # "streams" (relevant for all files) elements in its output.
            # We know whether we'll have streams + container or just
            # streams only after scraping the first stream, so there's a
            # risk of trying to add one too many streams. This check
            # prevents constructing more metadata models than there are
            # streams.
            if not container and index == len(streams) - 1:
                break

            self.streams += list(
                self.iterate_models(probe_results=probe_results, index=index))

            for stream in self.streams:
                if stream.hascontainer():
                    container = True

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
예제 #2
0
    def scrape_file(self):
        """Scrape A/V files.

        Probe the file with ffprobe, run an ffmpeg decode to /dev/null
        as a well-formedness check, and construct metadata models for
        the container (if any) and each stream.
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return

        try:
            probe_results = ffmpeg.probe(encode_path(self.filename))
            # Prepend the container-level "format" element so it is
            # processed alongside the individual streams; shift stream
            # indices by one so index 0 refers to the container.
            streams = [probe_results["format"]] + probe_results["streams"]
            for stream in streams:
                if "index" not in stream:
                    stream["index"] = 0
                else:
                    stream["index"] = stream["index"] + 1
        except ffmpeg.Error as err:
            self._errors.append("Error in analyzing file.")
            self._errors.append(ensure_text(err.stderr))
            # Probing failed, so "streams" was never bound: bail out
            # here instead of raising NameError further down.
            return

        shell = Shell([
            "ffmpeg", "-v", "error", "-i",
            encode_path(self.filename), "-f", "null", "-"
        ])

        if shell.returncode == 0:
            self._messages.append("The file was analyzed successfully.")

        if self._filter_stderr(shell.stderr):
            self._errors.append(shell.stderr)
            return

        container = False
        for index in range(len(streams)):
            # FFMpeg has separate "format" (relevant for containers) and
            # "streams" (relevant for all files) elements in its output. We
            # know whether we'll have streams + container or just streams only
            # after scraping the first stream, so there's a risk of trying to
            # add one too many streams. This check prevents constructing more
            # metadata models than there are streams.
            if not container and index == len(streams) - 1:
                break

            for md_class in self._supported_metadata:
                if md_class.is_supported(self._mimetype_guess):
                    stream = md_class(probe_results, index,
                                      self._given_mimetype,
                                      self._given_version)
                    self.streams.append(stream)
                    if stream.hascontainer():
                        container = True

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
예제 #3
0
    def exec_xmllint(self, dtd_check=False, schema=None):
        """
        Execute xmllint.

        :dtd_check: True, if check against DTD, false otherwise
        :schema: Schema file
        :returns: tuple including: returncode, stdout, strderr
        """
        # Assemble the argument vector; optional flags keep a fixed
        # position relative to the mandatory ones.
        command = ["xmllint"]
        if dtd_check:
            command.append("--valid")
        command.extend(["--huge", "--noout"])
        if self._no_network:
            command.append("--nonet")
        if self._catalogs:
            command.append("--catalogs")
        if schema:
            command.extend(["--schema", schema])
        command.append(encode_path(self.filename))

        # Hand the configured catalog path to xmllint via its
        # environment variable, if one was given.
        environment = None
        if self._catalog_path is not None:
            environment = {"SGML_CATALOG_FILES": self._catalog_path}

        shell = Shell(command, env=environment)
        return (shell.returncode, shell.stdout, shell.stderr)
예제 #4
0
    def scrape_file(self):
        """
        Scrape ARC file by converting to WARC.

        This is done using Warctools" arc2warc converter.
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        if os.path.getsize(self.filename) == 0:
            self._errors.append("Empty file.")
            return

        # Convert into a temporary WARC file; we only care about the
        # converter's exit status and console output, not the result.
        with tempfile.NamedTemporaryFile(prefix="scraper-warctools.") \
                as warcfile:
            shell = Shell(
                command=["arc2warc", encode_path(self.filename)],
                stdout=warcfile)
            if shell.returncode != 0:
                self._errors.append("Failed: returncode %s" % shell.returncode)
                self._errors.append(sanitize_bytestring(shell.stderr_raw))
                return
            self._messages.append("File was analyzed successfully.")
            if shell.stdout:
                self._messages.append(shell.stdout)

        for metadata_class in self._supported_metadata:
            self.streams.append(
                metadata_class(self._given_mimetype, self._given_version))
        self._check_supported(allow_unav_version=True)
예제 #5
0
    def scrape_file(self):
        """
        Scrape file.

        :raises: VeraPDFError
        """
        shell = Shell([VERAPDF_PATH, encode_path(self.filename)])
        if shell.returncode not in OK_CODES:
            raise VeraPDFError(shell.stderr)

        profile = None
        try:
            report = ET.fromstring(shell.stdout_raw)
            if report.xpath("//batchSummary")[0].get("failedToParse") == "0":
                # Cache the validation report node; it is queried twice.
                validation = report.xpath("//validationReport")[0]
                if validation.get("isCompliant") == "false":
                    self._errors.append(shell.stdout)
                else:
                    self._messages.append(shell.stdout)
                profile = validation.get("profileName")
            else:
                self._errors.append(shell.stdout)
        except ET.XMLSyntaxError:
            self._errors.append(shell.stderr)

        self.streams = list(
            self.iterate_models(well_formed=self.well_formed, profile=profile))
        self._check_supported()
예제 #6
0
 def scrape_file(self):
     """Scrape file.

     The file is converted to PDF with LibreOffice (soffice); any
     stderr output from the conversion is recorded as an error.
     """
     if not self._check_wellformed and self._only_wellformed:
         # Note the trailing space on the first fragment: without it
         # the message previously read "check notused."
         self._messages.append("Skipping scraper: Well-formed check not "
                               "used.")
         return
     temp_dir = tempfile.mkdtemp()
     try:
         env = {"HOME": temp_dir}
         shell = Shell([
             "soffice", "--convert-to", "pdf", "--outdir", temp_dir,
             encode_path(self.filename)
         ],
                       env=env)
         if shell.stderr:
             self._errors.append(shell.stderr)
         self._messages.append(shell.stdout)
     except OSError as error:
         self._errors.append("Error handling file: {}".format(error))
     finally:
         # Always remove the temporary HOME directory and construct the
         # metadata models, even when the conversion failed.
         shutil.rmtree(temp_dir)
         for md_class in self._supported_metadata:
             self.streams.append(
                 md_class(self._given_mimetype, self._given_version))
         self._check_supported(allow_unav_mime=True,
                               allow_unav_version=True)
예제 #7
0
    def scrape_file(self):
        """Scrape DPX."""

        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: "
                                  "Well-formed check not used.")
            return

        shell = Shell(["dpxv", encode_path(self.filename)])

        # A non-zero exit status means the validator itself failed.
        if shell.returncode != 0:
            raise DPXvError(shell.stderr)

        if shell.stderr:
            self._errors.extend(shell.stderr.splitlines())
        if shell.stdout:
            self._messages.extend(shell.stdout.splitlines())

        for metadata_class in self._supported_metadata:
            self.streams.append(
                metadata_class(mimetype=self._given_mimetype,
                               version=self._given_version,
                               info=self.info(),
                               filename=self.filename))

        self._check_supported()
예제 #8
0
    def scrape_file(self):
        """Scrape file with Ghostscript.

        The file is rendered to the null device; stderr output or a
        non-zero return code is treated as a well-formedness failure.
        """
        if not self._check_wellformed and self._only_wellformed:
            # Note the trailing space on the first fragment: without it
            # the message previously read "check notused."
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        shell = Shell([
            "gs", "-o", "/dev/null", "-sDEVICE=nullpage",
            encode_path(self.filename)])

        for model in self._supported_metadata:
            self.streams.append(model(self._given_mimetype,
                                      self._given_version))

        # Ghostscript may print characters which cannot be converted to UTF-8
        stdout_message = ensure_text(shell.stdout_raw, errors='replace')
        stderr_message = ensure_text(shell.stderr_raw, errors='replace')
        self._messages.append(stdout_message)

        # Ghostscript will result 0 if it can repair errors.
        # However, in those cases an error is logged to either _errors or
        # _messages. This case should be handled as well-formed failure.
        if stderr_message:
            self._errors.append(stderr_message)
        elif shell.returncode != 0:
            self._errors.append("Ghostscript returned return code: %s"
                                % shell.returncode)

        # If no errors have been logged, the file is valid.
        else:
            self._messages.append("Well-Formed and valid")

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
예제 #9
0
    def _compile_phase(self,
                       stylesheet,
                       inputfile,
                       allowed_codes,
                       outputfile=None,
                       outputfilter=False):
        """
        Compile one phase.

        :stylesheet: XSLT file to used in the conversion
        :inputfile: Input document filename
        :outputfile: Filename of the resulted document, stdout if None
        :outputfilter: Use outputfilter parameter with value only_messages
        :return: Shell instance
        """
        command = ["xsltproc"]
        if outputfile:
            command += ["-o", outputfile]
        # In verbose mode all messages are wanted, so the filter is
        # only applied when verbosity is off.
        if outputfilter and not self._verbose:
            command += ["--stringparam", "outputfilter", "only_messages"]
        command += [
            os.path.join(SCHEMATRON_DIRNAME, stylesheet),
            encode_path(inputfile)
        ]

        shell = Shell(command)
        if shell.returncode not in allowed_codes:
            raise SchematronValidatorError(
                "Error {}\nstdout:\n{}\nstderr:\n{}".format(
                    shell.returncode, shell.stdout, shell.stderr))
        return shell
def magic_analyze(magic_lib, magic_type, path):
    """Analyze file with given magic module.

    :magic_lib: Magic module
    :magic_type: Magic type to open magic library
    :path: File path to analyze
    :returns: Result from the magic module
    """
    magic_ = magic_lib.open(magic_type)
    try:
        magic_.load()
        return magic_.file(encode_path(path))
    finally:
        # Close the magic cookie even if load()/file() raises, so the
        # underlying libmagic resources are not leaked.
        magic_.close()
예제 #11
0
    def construct_xsd(self, document_tree):
        """
        Construct one schema file for the given document tree.

        Collects all xsi:schemaLocation and xsi:noNamespaceSchemaLocation
        attributes from the document and writes a wrapper schema that
        imports each referenced XSD.

        :returns: Path to the constructed XSD schema, or an empty list
                  when the document declares no schema locations
        """
        xsd_exists = False

        parser = etree.XMLParser(dtd_validation=False, no_network=True)
        schema_tree = etree.XML(SCHEMA_TEMPLATE, parser)

        schema_locations = set(
            document_tree.xpath("//*/@xsi:schemaLocation",
                                namespaces={"xsi": XSI}))
        for schema_location in schema_locations:
            xsd_exists = True

            namespaces_locations = schema_location.strip().split()
            # Import all found namespace/schema location pairs
            for namespace, location in zip(*[iter(namespaces_locations)] * 2):
                xs_import = etree.Element(XS + "import")
                xs_import.attrib["namespace"] = namespace
                xs_import.attrib["schemaLocation"] = location
                schema_tree.append(xs_import)

        schema_locations = set(
            document_tree.xpath("//*/@xsi:noNamespaceSchemaLocation",
                                namespaces={"xsi": XSI}))
        for schema_location in schema_locations:
            xsd_exists = True

            # Check if XSD file is included in SIP
            local_schema_location = os.path.join(
                os.path.dirname(self.filename), encode_path(schema_location))
            if os.path.isfile(local_schema_location):
                schema_location = local_schema_location

            xs_import = etree.Element(XS + "import")
            xs_import.attrib["schemaLocation"] = decode_path(schema_location)
            schema_tree.append(xs_import)
        if xsd_exists:
            # Construct the schema
            fd, schema = tempfile.mkstemp(prefix="file-scraper-",
                                          suffix=".tmp")
            # mkstemp returns an open file descriptor; close it so it is
            # not leaked -- the tree is written by path below.
            os.close(fd)
            elem_tree = etree.ElementTree(schema_tree)
            elem_tree.write(schema)
            self._has_constructed_schema = True

            return schema

        return []
예제 #12
0
    def _evaluate_xsd_location(self, location):
        """Determine whether or not the XSD schema is a
        local file in relation to the assigned XML file.

        If local file is found, absolute path will be returned for
        xsd-construction's import purpose. Otherwise return the location as-is.

        Absolute path is required for construct_xsd-function as the temporary
        file's location will differ a lot in related to the current
        self.filename.

        :param location: Given schema location in string.
        :return: String of the XSD location. If it's local, absolute path.
        """
        # schemaLocation or noNamespaceSchemaLocation is always either
        # direct path or relative path to the XML in question.
        base_dir = os.path.dirname(encode_path(self.filename))
        candidate = os.path.join(base_dir, encode_path(location))
        if not os.path.isfile(candidate):
            return ensure_text(location)
        return os.path.abspath(ensure_text(candidate))
    def scrape_file(self):
        """Run pngcheck on the file and collect its console output."""
        shell = Shell(["pngcheck", encode_path(self.filename)])

        if shell.returncode:
            self._errors.append("Failed: returncode %s" % shell.returncode)
            self._errors.append(shell.stderr)
        self._messages.append(shell.stdout)

        # The MIME type is unknown to this scraper, so a support check
        # on it would be pointless; just build the metadata models.
        self.streams = list(self.iterate_models())
        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def file_command(filename, parameters=None):
    """Use file command in shell.

    :filename: Filename for the file command.
    :parameters: Parameter list for the file command.
    :returns: Shell class
    """
    command = "file"
    environment = {}
    # Prefer the bundled file binary and its libraries when both exist.
    if os.path.isfile(FILECMD_PATH) and os.path.isdir(LD_LIBRARY_PATH):
        command = FILECMD_PATH
        environment = {"LD_LIBRARY_PATH": LD_LIBRARY_PATH}

    extra_args = [] if parameters is None else parameters
    return Shell([command] + extra_args + [encode_path(filename)],
                 env=environment)
예제 #15
0
 def __init__(self, filename, **kwargs):
     """Initialize scraper.
     :filename: File path
     :kwargs: Extra arguments for certain scrapers
     """
     # Encode the path up front; None is allowed and passed through.
     self.filename = None if filename is None else encode_path(filename)
     self.mimetype = None
     self.version = None
     self.streams = None
     self.well_formed = None
     self.info = None
     self._params = kwargs
     self._scraper_results = []
     self._given_mimetype = kwargs.get("mimetype", None)
     self._given_version = kwargs.get("version", None)
 def scrape_file(self):
     """Convert the file to PDF with LibreOffice and record the result."""
     workdir = tempfile.mkdtemp()
     try:
         shell = Shell(
             [SOFFICE_PATH, "--convert-to", "pdf", "--outdir", workdir,
              encode_path(self.filename)],
             env={"HOME": workdir})
         if shell.stderr:
             self._errors.append(shell.stderr)
         self._messages.append(shell.stdout)
     except OSError as error:
         self._errors.append("Error handling file: {}".format(error))
     finally:
         # Always remove the temporary HOME directory and construct the
         # metadata models, even when the conversion failed.
         shutil.rmtree(workdir)
         self.streams = list(self.iterate_models())
         self._check_supported(allow_unav_mime=True,
                               allow_unav_version=True)
예제 #17
0
    def detect(self):
        """
        Run veraPDF to find out if the file is PDF/A and possibly its version.

        If the file is not a PDF/A, the MIME type and version are left as None.
        """
        shell = Shell([VERAPDF_PATH, encode_path(self.filename)])

        # Test if the file is a PDF/A
        if shell.returncode != 0:
            self._set_info_not_pdf_a(shell)
            return

        try:
            report = ET.fromstring(shell.stdout_raw)
            if report.xpath("//batchSummary")[0].get("failedToParse") != "0":
                self._set_info_not_pdf_a(shell)
                return
            validation = report.xpath("//validationReport")[0]
            if validation.get("isCompliant") == "false":
                self._set_info_not_pdf_a()
                return
            profile = validation.get("profileName")
        except ET.XMLSyntaxError:
            self._set_info_not_pdf_a(shell)
            return

        # If we have not encountered problems, the file is PDF/A and its
        # version can be read from the profile.
        version = profile.split("PDF/A")[1].split(" validation profile")[0]
        self.version = "A{}".format(version.lower())
        self.mimetype = "application/pdf"
        self.info = {
            "class": self.__class__.__name__,
            "messages": ["PDF/A version detected by veraPDF."],
            "errors": [],
            "tools": []
        }
예제 #18
0
    def scrape_file(self):
        """Run pngcheck on the file and collect its console output."""
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        shell = Shell(["pngcheck", encode_path(self.filename)])

        if shell.returncode:
            self._errors.append("Failed: returncode %s" % shell.returncode)
            self._errors.append(shell.stderr)
        self._messages.append(shell.stdout)

        # The MIME type is unknown to this scraper, so a support check
        # on it would be pointless; just build the metadata models.
        for metadata_class in self._supported_metadata:
            self.streams.append(
                metadata_class(self._given_mimetype, self._given_version))

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
예제 #19
0
    def scrape_file(self):
        """
        Scrape file.

        Runs veraPDF, parses its XML report and, when the file is
        well-formed, constructs a metadata model per supported class.

        :raises: VeraPDFError
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        cmd = [VERAPDF_PATH, encode_path(self.filename)]

        shell = Shell(cmd)
        if shell.returncode != 0:
            raise VeraPDFError(shell.stderr)
        self._messages.append(shell.stdout)

        # Initialize so a parse failure below cannot leave "profile"
        # unbound when it is used for the metadata models.
        profile = None
        try:
            report = ET.fromstring(shell.stdout_raw)
            if report.xpath("//batchSummary")[0].get("failedToParse") == "0":
                compliant = report.xpath("//validationReport")[0].get(
                    "isCompliant")
                if compliant == "false":
                    self._errors.append(shell.stdout)
                profile = \
                    report.xpath("//validationReport")[0].get("profileName")
            else:
                self._errors.append(shell.stdout)
        except ET.XMLSyntaxError:
            self._errors.append(shell.stderr)

        if self.well_formed:
            for md_class in self._supported_metadata:
                self.streams.append(
                    md_class(profile, self._given_mimetype,
                             self._given_version))
            # Check support once, after all models are built (was
            # mistakenly indented inside the loop above).
            self._check_supported()