def scrape_file(self): """Scrape file.""" if not self._check_wellformed and self._only_wellformed: self._messages.append("Skipping scraper: Well-formed check not" "used.") return shell = Shell([ "gs", "-o", "/dev/null", "-sDEVICE=nullpage", encode_path(self.filename)]) for model in self._supported_metadata: self.streams.append(model(self._given_mimetype, self._given_version)) # Ghostscript may print characters which cannot be converted to UTF-8 stdout_message = ensure_text(shell.stdout_raw, errors='replace') stderr_message = ensure_text(shell.stderr_raw, errors='replace') self._messages.append(stdout_message) # Ghostscript will result 0 if it can repair errors. # However, in those cases an error is logged to either _errors or # _messages. This case should be handled as well-formed failure. if stderr_message: self._errors.append(stderr_message) elif shell.returncode != 0: self._errors.append("Ghostscript returned return code: %s" % shell.returncode) # If no errors have been logged, the file is valid. else: self._messages.append("Well-Formed and valid") self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(ctx, filename, check_wellformed, tool_info, mimetype,
                version):
    """
    Identify file type, collect metadata, and optionally check
    well-formedness.

    In addition to the given options, the user can provide any extra
    options that are passed onto the scraper. These options must be in
    the long form, e.g. "--charset=UTF-8" or "--charset UTF-8".
    \f
    :ctx: Context object
    :filename: Path to the file that should be scraped
    :check_wellformed: Flag whether the scraper checks well-formedness
    :tool_info: Flag whether the scraper includes messages from
                different 3rd party tools
    :mimetype: Specified MIME type for the scraped file
    :version: Specified version for the scraped file
    """
    scraper = Scraper(filename, mimetype=mimetype, version=version,
                      **_extra_options_to_dict(ctx.args))
    scraper.scrape(check_wellformed=check_wellformed)

    results = {
        "path": ensure_text(scraper.filename),
        "MIME type": ensure_text(scraper.mimetype),
        "version": ensure_text(scraper.version),
        "metadata": scraper.streams,
        "grade": scraper.grade()
    }
    if check_wellformed:
        results["well-formed"] = scraper.well_formed
    if tool_info:
        results["tool_info"] = scraper.info

    errors = {}
    for item in scraper.info.values():
        if "ScraperNotFound" in item["class"]:
            raise click.ClickException("Proper scraper was not found. "
                                       "The file was not analyzed.")
        if item["errors"]:
            errors[item["class"]] = item["errors"]
    if errors:
        results["errors"] = errors

    click.echo(json.dumps(results, indent=4))
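The extra options are parsed from ctx.args before being passed to Scraper. The real _extra_options_to_dict is defined elsewhere in the module; the following is only a minimal sketch of what such a helper could look like, supporting both long forms mentioned in the docstring.

# Minimal sketch of the option parsing assumed above; the real
# _extra_options_to_dict may differ.
def _extra_options_to_dict(args):
    """Convert e.g. ["--charset=UTF-8"] or ["--charset", "UTF-8"]
    to {"charset": "UTF-8"}."""
    extras = {}
    key = None
    for arg in args:
        if arg.startswith("--"):
            if "=" in arg:  # long form "--charset=UTF-8"
                key, _, value = arg[2:].partition("=")
                extras[key] = value
                key = None
            else:           # long form "--charset UTF-8"
                key = arg[2:]
        elif key is not None:
            extras[key] = arg
            key = None
    return extras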
def scrape_file(self): """Do the Schematron check.""" if self._schematron_file is None: self._errors.append("Schematron file missing from parameters.") return xslt_filename = self._compile_schematron() shell = self._compile_phase(stylesheet=xslt_filename, inputfile=self.filename, allowed_codes=[0, 6]) self._returncode = shell.returncode if shell.stderr: self._errors.append(shell.stderr) if not self._verbose and shell.returncode == 0: self._messages.append( ensure_text(self._filter_duplicate_elements(shell.stdout_raw))) else: self._messages.append(shell.stdout) self.streams = list(self.iterate_models(well_formed=self.well_formed)) self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(self): """Do the Schematron check.""" if not self._check_wellformed and self._only_wellformed: self._messages.append("Skipping scraper: Well-formed check not " "used.") return if self._schematron_file is None: self._errors.append("Schematron file missing from parameters.") return xslt_filename = self._compile_schematron() shell = self._compile_phase(stylesheet=xslt_filename, inputfile=self.filename, allowed_codes=[0, 6]) self._returncode = shell.returncode if shell.stderr: self._errors.append(shell.stderr) if not self._verbose and shell.returncode == 0: self._messages.append( ensure_text(self._filter_duplicate_elements(shell.stdout_raw))) else: self._messages.append(shell.stdout) for md_class in self._supported_metadata: self.streams.append( md_class(self._given_mimetype, self._given_version)) self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(self): """Scrape A/V files.""" try: probe_results = ffmpeg.probe(encode_path(self.filename)) streams = [probe_results["format"]] + probe_results["streams"] for stream in streams: if "index" not in stream: stream["index"] = 0 else: stream["index"] = stream["index"] + 1 except ffmpeg.Error as err: self._errors.append("Error in analyzing file.") self._errors.append(ensure_text(err.stderr)) shell = Shell([ "ffmpeg", "-v", "error", "-i", encode_path(self.filename), "-f", "null", "-" ]) if shell.returncode == 0: self._messages.append("The file was analyzed successfully.") if self._filter_stderr(shell.stderr): self._errors.append(shell.stderr) return # We deny e.g. A-law PCM, mu-law PCM, DPCM and ADPCM and allow # only signed/unsigned linear PCM. Note that we need this check # only if PCM audio is present. This should not be given e.g. # for video streams nor audio streams of another type (such as # MPEG). for stream in streams: if "PCM" in stream.get("codec_long_name", UNAV) and not \ any(stream.get("codec_long_name", UNAV).startswith(x) for x in ["PCM signed", "PCM unsigned"]): self._errors.append("%s does not seem to be LPCM format." % stream["codec_long_name"]) container = False for index in range(len(streams)): # FFMpeg has separate "format" (relevant for containers) and # "streams" (relevant for all files) elements in its output. # We know whether we'll have streams + container or just # streams only after scraping the first stream, so there's a # risk of trying to add one too many streams. This check # prevents constructing more metadata models than there are # streams. if not container and index == len(streams) - 1: break self.streams += list( self.iterate_models(probe_results=probe_results, index=index)) for stream in self.streams: if stream.hascontainer(): container = True self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def version(self):
    """Return the version."""
    if self._given_mimetype and self._given_version:
        return self._given_version
    if len(self._line.split(b"WARC/", 1)) > 1:
        return ensure_text(
            self._line.split(b"WARC/", 1)[1].split(b" ")[0].strip())
    return "(:unav)"
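A WARC file begins with a header line such as "WARC/1.0". A small trace of the split logic above, assuming _line holds that header line:

# Trace of the split logic; _line is assumed to hold the first header
# line of the WARC file.
line = b"WARC/1.0\r\n"
parts = line.split(b"WARC/", 1)             # [b"", b"1.0\r\n"]
version = parts[1].split(b" ")[0].strip()   # b"1.0"
print(version.decode("utf-8"))              # -> 1.0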
def stdout(self):
    """
    Command standard output.

    :returns: Stdout as unicode string
    """
    if self.stdout_raw is None:
        return None
    return ensure_text(self.stdout_raw)
def stderr(self):
    """
    Standard error output from the command.

    :returns: Stderr as unicode string
    """
    if self.stderr_raw is None:
        return None
    return ensure_text(self.stderr_raw)
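A minimal usage sketch of Shell, based only on how it is used in this section (command list in, decoded output out); any constructor parameters beyond the command list would be assumptions.

# Usage sketch of the Shell wrapper shown above.
shell = Shell(["file", "--version"])
if shell.returncode != 0:
    print("Command failed:", shell.stderr)  # decoded text, or None
else:
    print(shell.stdout)                     # decoded from stdout_raw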
def scrape_file(self):
    """Scrape A/V files."""
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    try:
        probe_results = ffmpeg.probe(encode_path(self.filename))
        streams = [probe_results["format"]] + probe_results["streams"]
        for stream in streams:
            if "index" not in stream:
                stream["index"] = 0
            else:
                stream["index"] = stream["index"] + 1
    except ffmpeg.Error as err:
        self._errors.append("Error in analyzing file.")
        self._errors.append(ensure_text(err.stderr))
        # Without probe results there are no streams to analyze.
        return

    shell = Shell([
        "ffmpeg", "-v", "error", "-i", encode_path(self.filename),
        "-f", "null", "-"])
    if shell.returncode == 0:
        self._messages.append("The file was analyzed successfully.")
    if self._filter_stderr(shell.stderr):
        self._errors.append(shell.stderr)
        return

    container = False
    for index in range(len(streams)):
        # FFMpeg has separate "format" (relevant for containers) and
        # "streams" (relevant for all files) elements in its output. We
        # know whether we'll have streams + container or just streams
        # only after scraping the first stream, so there's a risk of
        # trying to add one too many streams. This check prevents
        # constructing more metadata models than there are streams.
        if not container and index == len(streams) - 1:
            break
        for md_class in self._supported_metadata:
            if md_class.is_supported(self._mimetype_guess):
                stream = md_class(probe_results, index,
                                  self._given_mimetype,
                                  self._given_version)
                self.streams.append(stream)
                if stream.hascontainer():
                    container = True

    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def version(self):
    """Return version."""
    if self._given_mimetype and self._given_version:
        return self._given_version
    for supported_version in self._supported["image/x-dpx"]:
        version_string = "File {}: Validated as V{}".format(
            ensure_text(self._filename), supported_version)
        if version_string in self._messages:
            return supported_version
    return "(:unav)"
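The loop builds the exact message the validator would have produced for each supported version and checks whether it was logged. A small self-contained trace with hypothetical validator output; the file name and messages are assumptions.

# Hypothetical data illustrating the version lookup above.
supported = {"image/x-dpx": ["1.0", "2.0"]}
messages = ["File /data/image.dpx: Validated as V2.0"]
for supported_version in supported["image/x-dpx"]:
    version_string = "File {}: Validated as V{}".format(
        "/data/image.dpx", supported_version)
    if version_string in messages:
        print(supported_version)            # -> 2.0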
def _evaluate_xsd_location(self, location):
    """Determine whether the XSD schema is a local file relative to
    the XML file being scraped.

    If a local file is found, its absolute path is returned so that it
    can be imported when constructing the XSD. Otherwise the location
    is returned as-is. The absolute path is required by the
    construct_xsd function because the temporary file it operates on
    is located far from the current self.filename.

    :param location: Given schema location as a string.
    :return: The XSD location as a string; an absolute path if it is a
             local file.
    """
    # schemaLocation or noNamespaceSchemaLocation is always either a
    # direct path or a path relative to the XML file in question.
    local_location = os.path.join(
        os.path.dirname(encode_path(self.filename)),
        encode_path(location)
    )
    if os.path.isfile(local_location):
        return os.path.abspath(ensure_text(local_location))
    return ensure_text(location)
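A short illustration of the path resolution with hypothetical file locations:

import os

# Hypothetical locations illustrating the resolution above.
xml_file = "/data/records.xml"              # scraped file
location = "schemas/record.xsd"             # declared schemaLocation

local_location = os.path.join(os.path.dirname(xml_file), location)
print(local_location)                       # -> /data/schemas/record.xsd
# If that file exists, its absolute path is returned; a remote
# location such as "http://www.w3.org/2001/xml.xsd" is returned
# unchanged.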
def __init__(self, filename, **kwargs):
    """Initialize scraper.

    :filename: File path
    :kwargs: Extra arguments for certain scrapers
    """
    if filename is not None:
        filename = ensure_text(filename)
    self.filename = filename
    self.mimetype = None
    self.version = None
    self.streams = None
    self.well_formed = None
    self.info = None
    self._important = {}
    self._params = kwargs
def errors(self):
    """
    Return errors without unnecessary ones. See KDKPAS-1190.

    :returns: Filtered error messages
    """
    errors_to_remove = []
    errors_to_add = []
    for error in self._errors:
        line = ensure_text(error)
        if "this namespace was already imported" in line:
            errors_to_remove.append(error)
        if "I/O error : Attempt to load network entity" in line:
            errors_to_add.append(
                "Schema definition probably missing from XML catalog")
            errors_to_remove.append(error)

    for error in errors_to_remove:
        self._errors.remove(error)
    for error in errors_to_add:
        self._errors.append(error)

    return super(XmllintScraper, self).errors()