Пример #1
0
    def scrape_file(self):
        """
        Run veraPDF on the file and record the result.

        The tool's stdout is stored as a message and then parsed as an XML
        report. If the report says the file could not be parsed, or the file
        is not compliant, the stdout is also stored as an error. The PDF/A
        version is derived from the profile name in the validation report.

        :raises: VeraPDFError if veraPDF exits with a non-zero return code
        """
        skip_scraping = not self._check_wellformed and self._only_wellformed
        if skip_scraping:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        shell = Shell([VERAPDF_PATH, self.filename])
        if shell.returncode != 0:
            raise VeraPDFError(ensure_str(shell.stderr))
        self.messages(ensure_str(shell.stdout))

        try:
            report = ET.fromstring(shell.stdout)
            summary = report.xpath('//batchSummary')[0]
            if summary.get('failedToParse') != '0':
                # veraPDF could not parse the file at all.
                self.errors(ensure_str(shell.stdout))
            else:
                validation = report.xpath('//validationReport')[0]
                if validation.get('isCompliant') == 'false':
                    self.errors(ensure_str(shell.stdout))
                # Version is the text between "PDF/A" and
                # " validation profile" in the profile name, lower-cased
                # and prefixed with "A".
                profile_name = validation.get('profileName')
                level = profile_name.split("PDF/A")[1]
                level = level.split(" validation profile")[0]
                self.version = 'A' + level.lower()
        except ET.XMLSyntaxError:
            self.errors(ensure_str(shell.stderr))
        finally:
            self._check_supported()
            self._collect_elements()
Пример #2
0
    def scrape_file(self):
        """
        Check the file header and try converting the file with PSPP.

        The first line of the file must contain SPSS_PORTABLE_HEADER. The
        file is then converted into a temporary directory; the existence of
        the converted file decides whether a success message or an error is
        recorded. The temporary directory is always removed.
        """
        skip_scraping = not self._check_wellformed and self._only_wellformed
        if skip_scraping:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        # Header check: only the first line of the file is inspected.
        with open(self.filename, 'rb') as handle:
            header_line = handle.readline()
        if SPSS_PORTABLE_HEADER not in header_line:
            self.errors("File is not SPSS Portable format.")

        # The conversion target lives in a throw-away directory. If the
        # converter produces converted.por, the original is well-formed.
        workdir = tempfile.mkdtemp()
        converted_path = os.path.join(workdir, 'converted.por')

        try:
            shell = Shell([PSPP_PATH, self.filename, converted_path])
            self.errors(ensure_str(shell.stderr))
            self.messages(ensure_str(shell.stdout))
            if not os.path.isfile(converted_path):
                self.errors('File conversion failed.')
            else:
                self.messages('File conversion was succesful.')
        finally:
            shutil.rmtree(workdir)
            self._check_supported()
            self._collect_elements()
Пример #3
0
    def scrape_file(self):
        """
        Run JHove command and store XML output to self._report.

        A non-zero return code is recorded as an error. The XML printed to
        stdout is parsed, its "status" field is stored as a message, and
        anything other than 'Well-Formed and valid' is recorded as an error
        together with the tool's stdout and stderr.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        exec_cmd = [
            'jhove', '-h', 'XML', '-m', self._jhove_module, self.filename
        ]
        self._shell = Shell(exec_cmd)

        if self._shell.returncode != 0:
            # Convert stderr with ensure_str, as the validator error below
            # does, so the message never contains a bytes repr.
            self.errors("JHove returned error: %s\n%s" % (
                self._shell.returncode, ensure_str(self._shell.stderr)))

        self._report = lxml.etree.fromstring(self._shell.stdout)

        status = self.report_field("status")
        self.messages(status)
        if 'Well-Formed and valid' not in status:
            self.errors("Validator returned error: %s\n%s" % (ensure_str(
                self._shell.stdout), ensure_str(self._shell.stderr)))
        self._check_supported()
        self._collect_elements()
Пример #4
0
    def scrape_file(self):
        """
        Run the Schematron check on the file.

        The Schematron file is compiled to XSLT and applied to the input
        file. Return codes 0 and 6 are accepted from the final phase. The
        tool's stderr is recorded as errors and its stdout as messages;
        duplicate elements are filtered from stdout when the run succeeded
        and verbose output was not requested.
        """
        skip_scraping = not self._check_wellformed and self._only_wellformed
        if skip_scraping:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        if self._schematron_file is None:
            self.errors('Schematron file missing from parameters.')
            self._collect_elements()
            return

        stylesheet_name = self._compile_schematron()
        shell = self._compile_phase(stylesheet=stylesheet_name,
                                    inputfile=self.filename,
                                    allowed_codes=[0, 6])

        self._returncode = shell.returncode
        self.errors(ensure_str(shell.stderr))

        # Only a successful, non-verbose run gets its output de-duplicated.
        filter_output = not self._verbose and shell.returncode == 0
        output = shell.stdout
        if filter_output:
            output = self._filter_duplicate_elements(output)
        self.messages(ensure_str(output))

        self._check_supported()
        self._collect_elements()
Пример #5
0
    def _compile_phase(self, stylesheet, inputfile, allowed_codes,
                       outputfile=None, outputfilter=False):
        """
        Run one xsltproc phase.

        :stylesheet: XSLT file name, resolved against the Schematron
                     directory
        :inputfile: Input document filename
        :allowed_codes: Return codes that are not treated as a failure
        :outputfile: Filename of the resulting document, stdout if None
        :outputfilter: Pass the outputfilter parameter with the value
                       only_messages (ignored in verbose mode)
        :return: Shell instance
        :raises: SchematronValidatorError if the return code is not one of
                 the allowed codes
        """
        command = ['xsltproc']
        if outputfile:
            command.extend(['-o', outputfile])
        if outputfilter and not self._verbose:
            command.extend(
                ['--stringparam', 'outputfilter', 'only_messages'])
        command.append(os.path.join(self._schematron_dirname, stylesheet))
        command.append(inputfile)

        shell = Shell(command)
        if shell.returncode in allowed_codes:
            return shell

        details = (shell.returncode, ensure_str(shell.stdout),
                   ensure_str(shell.stderr))
        raise SchematronValidatorError(
            "Error %s\nstdout:\n%s\nstderr:\n%s" % details)
Пример #6
0
 def scrape_file(self):
     """
     Run the validator jar at VNU_PATH on the file.

     The tool's stderr is recorded as errors and its stdout as messages.
     """
     skip_scraping = not self._check_wellformed and self._only_wellformed
     if skip_scraping:
         self.messages('Skipping scraper: Well-formed check not used.')
         self._collect_elements()
         return

     command = ['java', '-jar', VNU_PATH, '--verbose', self.filename]
     shell = Shell(command)
     self.errors(ensure_str(shell.stderr))
     self.messages(ensure_str(shell.stdout))

     self._check_supported()
     self._collect_elements()
Пример #7
0
    def scrape_file(self):
        """
        Run pngcheck on the file.

        A non-zero return code is recorded as an error together with the
        tool's stderr. The tool's stdout is always recorded as a message.
        """
        skip_scraping = not self._check_wellformed and self._only_wellformed
        if skip_scraping:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        shell = Shell(['pngcheck', self.filename])
        failed = shell.returncode != 0
        if failed:
            self.errors("Failed: returncode %s" % shell.returncode)
            self.errors(ensure_str(shell.stderr))
        self.messages(ensure_str(shell.stdout))

        self._check_supported()
        self._collect_elements()
Пример #8
0
    def scrape_file(self):
        """
        Run the A/V file through ffmpeg with a null output.

        A zero return code adds a success message. The tool's stderr is
        recorded as errors and its stdout as messages in every case.
        """
        skip_scraping = not self._check_wellformed and self._only_wellformed
        if skip_scraping:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        command = [
            'ffmpeg', '-v', 'error', '-i', self.filename, '-f', 'null', '-'
        ]
        shell = Shell(command)

        succeeded = shell.returncode == 0
        if succeeded:
            self.messages('The file was analyzed successfully.')

        self.errors(ensure_str(shell.stderr))
        self.messages(ensure_str(shell.stdout))

        self._check_supported()
        self._collect_elements()
Пример #9
0
    def scrape_file(self):
        """
        Scrape an ARC file by running the arc2warc converter on it.

        The converter output goes to a temporary file. An empty input file
        and a non-zero return code are both recorded as errors; the
        converter's stderr is decoded and sanitized before it is stored.
        """
        skip_scraping = not self._check_wellformed and self._only_wellformed
        if skip_scraping:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        file_size = os.path.getsize(self.filename)
        if file_size == 0:
            self.errors('Empty file.')

        with tempfile.NamedTemporaryFile(
                prefix="scraper-warctools.") as converted:
            shell = Shell(command=['arc2warc', self.filename],
                          output_file=converted)

            if shell.returncode == 0:
                # An empty file never gets a success message.
                if file_size > 0:
                    self.messages('File was analyzed successfully.')
            else:
                self.errors("Failed: returncode %s" % shell.returncode)
                # Replace undecodable bytes, drop non-printable characters
                # and store the result as UTF-8 encoded bytes.
                decoded = shell.stderr.decode('utf8', errors='replace')
                printable = sanitize_string(decoded)
                self.errors(printable.encode('utf-8'))
            self.messages(ensure_str(shell.stdout))

        self.mimetype = 'application/x-internet-archive'
        self._check_supported()
        self._collect_elements()
Пример #10
0
    def scrape_file(self):
        """
        Scrape WARC file.

        Runs warcvalid on the file and records its output. The first line of
        the archive is then read, first as gzip and, if that fails with an
        IOError, as a plain file, to pick up the WARC version. An empty file
        and a non-zero warcvalid return code are recorded as errors.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return
        size = os.path.getsize(self.filename)
        if size == 0:
            self.errors('Empty file.')
        shell = Shell(['warcvalid', self.filename])

        if shell.returncode != 0:
            self.errors("Failed: returncode %s" % shell.returncode)
            # Filter some trash printed by warcvalid.
            filtered_errors = \
                b"\n".join([line for line in shell.stderr.split(b'\n')
                            if b'ignored line' not in line])
            self.errors(filtered_errors)

        self.messages(ensure_str(shell.stdout))

        try:
            # First assume archive is compressed. The context manager makes
            # sure the gzip handle is closed on every path.
            with gzip.open(self.filename) as warc_fd:
                line = warc_fd.readline()
        except IOError:
            # Not compressed archive
            with open(self.filename, 'rb') as warc_fd:
                line = warc_fd.readline()
        except Exception as exception:  # pylint: disable=broad-except
            # Compressed but corrupted gzip file
            self.errors(str(exception))
            self._check_supported()
            self._collect_elements()
            return

        self.mimetype = 'application/warc'
        # The version is the token that follows "WARC/" on the first line.
        _, marker, remainder = line.partition(b"WARC/")
        if marker:
            self.version = ensure_str(remainder.split(b" ")[0].strip())
        if size > 0:
            self.messages('File was analyzed successfully.')
        self._check_supported()
        self._collect_elements()
Пример #11
0
    def errors(self, error=None):
        """
        Return error messages, optionally adding a new one first.

        The new error is converted with ensure_str before it is stored, so
        callers may pass either text or bytes. None and empty values are
        ignored.

        :error: New error to add to the errors
        :returns: Stored errors concatenated with the 'ERROR: ' prefix
        """
        if error is not None:
            err_msg = ensure_str(error)
            if err_msg != "":
                # Store the converted value, not the raw argument, so the
                # list never mixes bytes and text.
                self._errors.append(err_msg)
        return concat(self._errors, 'ERROR: ')
Пример #12
0
    def _file_mimetype(self):
        """
        Detect the mimetype with the file command.

        The command is run with the options ``-be soft --mime-type`` and the
        ENV environment. Its stderr is recorded as errors.

        :returns: file mimetype, i.e. the command's stdout with surrounding
                  whitespace stripped
        """
        command = [
            FILECMD_PATH, '-be', 'soft', '--mime-type', self.filename
        ]
        shell = Shell(command, env=ENV)
        self.errors(shell.stderr)
        return ensure_str(shell.stdout).strip()
Пример #13
0
 def scrape_file(self):
     """
     Scrape data from file with ffmpeg.probe.

     The probe result is stored in self._ffmpeg. The container ("format")
     entry gets index 0 and every stream entry has its index shifted up by
     one, after which stream 0 is selected with set_tool_stream. A probe
     failure is recorded as an error together with the tool's stderr.
     """
     if not self._check_wellformed and self._only_wellformed:
         self.messages('Skipping scraper: Well-formed check not used.')
         self._collect_elements()
         return
     try:
         self._ffmpeg = ffmpeg.probe(self.filename)
         for stream in [self._ffmpeg['format']] + self._ffmpeg['streams']:
             if 'index' not in stream:
                 stream['index'] = 0
             else:
                 stream['index'] = stream['index'] + 1
         self.set_tool_stream(0)
     # The exception class lives in the ffmpeg module. self._ffmpeg holds
     # the probe result (or nothing when probe failed), so it cannot be
     # used to look the class up.
     except ffmpeg.Error as err:
         self.errors('Error in analyzing file.')
         self.errors(ensure_str(err.stderr))
     else:
         self.messages('The file was analyzed successfully.')
     finally:
         self._check_supported()
         self._collect_elements()
Пример #14
0
    def errors(self, error=None):
        """
        Filter unwanted warnings out of the error text before storing it.

        Lines containing 'this namespace was already imported' are dropped.
        A hint about the XML catalog is added after any line reporting an
        attempt to load a network entity.

        See KDKPAS-1190.

        :error: Error messages
        :returns: Result of the parent class errors() for the filtered text
        """
        if error:
            filtered_errors = []
            for line in error.splitlines():
                line = ensure_str(line)
                if 'this namespace was already imported' in line:
                    continue
                filtered_errors.append(line)
                if 'I/O error : Attempt to load network entity' in line:
                    # Trailing space matters: the two literals are joined
                    # by implicit concatenation.
                    filtered_errors.append(
                        'ERROR: Schema definition propably missing '
                        'from XML catalog')
            error = "\n".join(filtered_errors)

        return super(Xmllint, self).errors(error)
Пример #15
0
    def scrape_file(self):
        """
        Check XML file with Xmllint.

        Strategy for XML file check is
            1) Try to check syntax by opening file.
            2) If there's DTD specified in file check against that.
            3) If there's no DTD and we have external XSD check against
               that.
            4) If there's no external XSD read schemas used in file and do
               check against them with schema catalog.

        The method does not return a value; results are recorded through
        self.messages() and self.errors().

        .. seealso:: https://wiki.csc.fi/wiki/KDK/XMLTiedostomuotojenSkeemat
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return
        # Try to check syntax by opening file in XML parser. The context
        # manager closes the file even when parsing fails.
        try:
            with open(self.filename, 'rb') as file_:
                parser = etree.XMLParser(dtd_validation=False,
                                         no_network=True)
                tree = etree.parse(file_, parser=parser)
                self.version = tree.docinfo.xml_version
        except etree.XMLSyntaxError as exception:
            self.errors("Failed: document is not well-formed.")
            self.errors(str(exception))
            self._collect_elements()
            return
        except IOError as exception:
            self.errors("Failed: missing file.")
            self.errors(str(exception))
            self._collect_elements()
            return

        # Try check against DTD
        if tree.docinfo.doctype:
            (exitcode, stdout, stderr) = self.exec_xmllint(dtd_check=True)

        # Try check against XSD
        else:
            if not self._schema:
                self._schema = self.construct_xsd(tree)
                if not self._schema:
                    # No given schema and didn't find included schemas but XML
                    # was well formed.
                    self.messages("Success: Document is "
                                  "well-formed but does not contain schema.")
                    self._collect_elements()
                    return

            (exitcode, stdout, stderr) = self.exec_xmllint(schema=self._schema)
        if exitcode == 0:
            self.messages(
                "%s Success\n%s" % (self.filename, ensure_str(stdout)))
        else:
            self.errors(ensure_str(stderr))

        # Clean up constructed schemas
        if self._has_constructed_schema:
            os.remove(self._schema)

        self._check_supported()
        self._collect_elements()