def __init__(self, file):
    """Set up the validation context for one XML document.

    The document may be handed in as a filesystem path, a URL, a
    file-object or an already parsed etree instance. It is meant to be
    validated against the JATS Publishing tag set and the SPS Style.

    :param file: Path to the XML file, URL, file-object or etree.
    """
    # An already parsed tree is used as-is; anything else goes to the parser.
    already_parsed = isinstance(file, etree._ElementTree)
    self.lxml = file if already_parsed else etree.parse(file)

    # Validators and the style-checking pipeline used by the other methods.
    self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
    self.schematron = XMLSchematron('sps.sch')
    self.ppl = StyleCheckingPipeline()
def __init__(self, file, no_network=True, dtd=None):
    """Set up the validation context for one SPS article XML.

    The document may be handed in as a filesystem path, a URL, a
    file-object or an already parsed etree instance. It is meant to be
    validated against the JATS Publishing tag set and the SPS Style.

    :param file: Path to the XML file, URL, file-object or etree.
    :param no_network: (optional) prevent network access for external DTD.
    :param dtd: (optional) etree.DTD instance. When not provided, the
                external DTD referenced by the document is used.
    """
    if isinstance(file, etree._ElementTree):
        # Already parsed: use the tree as-is.
        tree = file
    else:
        xml_parser = etree.XMLParser(remove_blank_text=True,
                                     load_dtd=True,
                                     no_network=no_network)
        tree = etree.parse(file, xml_parser)
    self.lxml = tree

    # The explicit DTD wins; the document's own one is only looked up
    # when the caller did not supply a usable DTD.
    self.dtd = dtd if dtd else tree.docinfo.externalDTD

    self.schematron = XMLSchematron('scielo-style.sch')
    self.ppl = StyleCheckingPipeline()
class XML(object):
    """An XML document under validation (JATS Publishing schema + SPS Style)."""

    def __init__(self, file):
        """Represents an XML under validation.

        The XML can be retrieved given its filesystem path,
        an URL, a file-object or an etree instance.

        The XML is validated against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL or etree.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            self.lxml = etree.parse(file)

        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
        self.schematron = XMLSchematron('sps.sch')
        self.ppl = StyleCheckingPipeline()

    def find_element(self, tagname, lineno=None, fallback=True):
        """Find an element given a tagname and, optionally, a line number.

        Descendants of the root are searched in document order. With no
        `lineno` the first element named `tagname` is returned; otherwise
        the first one whose source line equals `lineno`.

        When no descendant matches, the root element is returned if
        `fallback` is true or if the root itself is named `tagname`.

        :param tagname: string of the tag name.
        :param lineno: int of the line it appears on the original source file.
        :param fallback: fallback to root element when `element` is not found.
        :raises ValueError: if nothing matches and `fallback` is false.
        """
        # './/' is the explicit descendant search. A leading '//' is only
        # accepted through a deprecated rewrite that emits a FutureWarning.
        for elem in self.lxml.findall('.//' + tagname):
            if lineno is None:
                return elem
            if elem.sourceline == lineno:
                logger.debug('method *find*: hit a regular element: %s.', tagname)
                return elem

        # The descendant search never yields the root, so handle it here.
        root = self.lxml.getroot()
        if fallback:
            return root
        if root.tag == tagname:
            logger.debug('method *find*: hit a root element.')
            return root
        raise ValueError("Could not find element '%s'." % tagname)

    def validate(self):
        """Validate the source XML against the JATS Publishing Schema.

        Returns a tuple comprising the validation status and the errors list.
        """
        # NOTE(review): `setdefault` is a helper defined elsewhere; presumably
        # it memoizes the computed value on the instance -- confirm.
        result = setdefault(self, '__validation_result',
                            lambda: self.xmlschema.validate(self.lxml))
        errors = setdefault(self, '__validation_errors',
                            lambda: self.xmlschema.error_log)
        return result, errors

    def _validate_sch(self):
        """Validate the source XML against the SPS Schematron.

        Returns a tuple comprising the validation status and the errors list.
        """
        def make_error_log():
            err_log = self.schematron.error_log
            return [StyleError.from_schematron_errlog(err) for err in err_log]

        # The status must be computed first: validating is what fills the
        # schematron error log read by `make_error_log`.
        result = setdefault(self, '__sch_validation_result',
                            lambda: self.schematron.validate(self.lxml))
        errors = setdefault(self, '__sch_validation_errors', make_error_log)
        return result, errors

    def validate_style(self):
        """Validate the source XML against the SPS Tagging guidelines.

        The errors are those produced by the style-checking pipeline followed
        by the Schematron ones; the status is true only when there are none.

        Returns a tuple comprising the validation status and the errors list.
        """
        def make_error_log():
            errors = next(self.ppl.run(self.lxml, rewrap=True))
            errors += self._validate_sch()[1]
            return errors

        # Each value is stored under the key that names it (the two keys
        # used to be swapped).
        errors = setdefault(self, '__style_validation_errors', make_error_log)
        result = setdefault(self, '__style_validation_result',
                            lambda: not bool(errors))
        return result, errors

    def _annotate_error(self, element, error):
        """Add an annotation prior to `element`, with `error` as the content.

        The annotation is a <SPS-ERROR> element added prior to `element`.
        If `element` is the root element, then the error is annotated as comment.

        :param element: etree instance to be annotated.
        :param error: string of the error.
        """
        notice_element = etree.Element('SPS-ERROR')
        notice_element.text = error
        try:
            element.addprevious(notice_element)
        except TypeError:
            # In case of a root element, a comment is added instead.
            element.addprevious(etree.Comment('SPS-ERROR: %s' % error))

    def annotate_errors(self):
        """Add notes on all elements that have errors.

        The errors list is generated as a result of calling both
        :meth:`validate` and :meth:`validate_style` methods. Nothing is
        annotated when both validations succeed. Errors whose message does
        not reveal an element name are logged and skipped.
        """
        v_result, v_errors = self.validate()
        s_result, s_errors = self.validate_style()

        if v_result and s_result:
            return None

        for error in itertools.chain(v_errors, s_errors):
            try:
                element_name = search_element_name(error.message)
            except ValueError:
                # could not find the element name
                logger.info('Could not locate the element name in: %s', error.message)
                continue

            # `lineno=None` already means "first element with that name".
            err_element = self.find_element(element_name, lineno=error.line)
            self._annotate_error(err_element, error.message)

    def __str__(self):
        return etree.tostring(self.lxml, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    def __unicode__(self):
        return str(self).decode('utf-8')

    def __repr__(self):
        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (
            self.lxml, self.validate()[0])

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)
class XML(object):
    """An XML document under validation (JATS Publishing schema + SPS Style)."""

    def __init__(self, file):
        """Represents an XML under validation.

        The XML can be retrieved given its filesystem path,
        an URL, a file-object or an etree instance.

        The XML is validated against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL or etree.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            self.lxml = etree.parse(file)

        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
        self.schematron = XMLSchematron('sps.sch')
        self.ppl = StyleCheckingPipeline()

    def find_element(self, tagname, lineno=None, fallback=True):
        """Find an element given a tagname and, optionally, a line number.

        Descendants of the root are searched in document order. With no
        `lineno` the first element named `tagname` is returned; otherwise
        the first one whose source line equals `lineno`.

        When no descendant matches, the root element is returned if
        `fallback` is true or if the root itself is named `tagname`.

        :param tagname: string of the tag name.
        :param lineno: int of the line it appears on the original source file.
        :param fallback: fallback to root element when `element` is not found.
        :raises ValueError: if nothing matches and `fallback` is false.
        """
        # Spell the descendant search out as './/': a path starting with
        # '/' only works through a deprecated rewrite that emits a
        # FutureWarning.
        for candidate in self.lxml.findall('.//' + tagname):
            if lineno is None:
                return candidate
            if candidate.sourceline == lineno:
                logger.debug('method *find*: hit a regular element: %s.', tagname)
                return candidate

        # The root is never part of a descendant search; deal with it here.
        root = self.lxml.getroot()
        if fallback:
            return root
        if root.tag == tagname:
            logger.debug('method *find*: hit a root element.')
            return root
        raise ValueError("Could not find element '%s'." % tagname)

    def validate(self):
        """Validate the source XML against the JATS Publishing Schema.

        Returns a tuple comprising the validation status and the errors list.
        """
        # NOTE(review): `setdefault` is a helper defined elsewhere; presumably
        # it memoizes the computed value on the instance -- confirm.
        result = setdefault(self, '__validation_result',
                            lambda: self.xmlschema.validate(self.lxml))
        errors = setdefault(self, '__validation_errors',
                            lambda: self.xmlschema.error_log)
        return result, errors

    def _validate_sch(self):
        """Validate the source XML against the SPS Schematron.

        Returns a tuple comprising the validation status and the errors list.
        """
        def make_error_log():
            err_log = self.schematron.error_log
            return [StyleError.from_schematron_errlog(err) for err in err_log]

        # Validation runs first because it is what populates the schematron
        # error log consumed by `make_error_log`.
        result = setdefault(self, '__sch_validation_result',
                            lambda: self.schematron.validate(self.lxml))
        errors = setdefault(self, '__sch_validation_errors', make_error_log)
        return result, errors

    def validate_style(self):
        """Validate the source XML against the SPS Tagging guidelines.

        The errors are those produced by the style-checking pipeline followed
        by the Schematron ones; the status is true only when there are none.

        Returns a tuple comprising the validation status and the errors list.
        """
        def make_error_log():
            errors = next(self.ppl.run(self.lxml, rewrap=True))
            errors += self._validate_sch()[1]
            return errors

        # The errors and the status are each kept under the key that names
        # them (the two keys used to be swapped).
        errors = setdefault(self, '__style_validation_errors', make_error_log)
        result = setdefault(self, '__style_validation_result',
                            lambda: not bool(errors))
        return result, errors

    def _annotate_error(self, element, error):
        """Add an annotation prior to `element`, with `error` as the content.

        The annotation is a <SPS-ERROR> element added prior to `element`.
        If `element` is the root element, then the error is annotated as comment.

        :param element: etree instance to be annotated.
        :param error: string of the error.
        """
        notice_element = etree.Element('SPS-ERROR')
        notice_element.text = error
        try:
            element.addprevious(notice_element)
        except TypeError:
            # In case of a root element, a comment is added instead.
            element.addprevious(etree.Comment('SPS-ERROR: %s' % error))

    def annotate_errors(self):
        """Add notes on all elements that have errors.

        The errors list is generated as a result of calling both
        :meth:`validate` and :meth:`validate_style` methods. Nothing is
        annotated when both validations succeed. Errors whose message does
        not reveal an element name are logged and skipped.
        """
        v_result, v_errors = self.validate()
        s_result, s_errors = self.validate_style()

        if v_result and s_result:
            return None

        for error in itertools.chain(v_errors, s_errors):
            try:
                element_name = search_element_name(error.message)
            except ValueError:
                # could not find the element name
                logger.info('Could not locate the element name in: %s', error.message)
                continue

            # Passing `lineno=None` is the same as omitting it.
            err_element = self.find_element(element_name, lineno=error.line)
            self._annotate_error(err_element, error.message)

    def __str__(self):
        return etree.tostring(self.lxml, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    def __unicode__(self):
        return str(self).decode('utf-8')

    def __repr__(self):
        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (self.lxml, self.validate()[0])

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)
class XML(object):
    """An SPS article XML validated against its DTD and the SPS Style."""

    def __init__(self, file, no_network=True, dtd=None):
        """Represents an SPS article XML.

        The XML can be retrieved given its filesystem path,
        an URL, a file-object or an etree instance.

        The XML is validated against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL or etree.
        :param no_network: (optional) prevent network access for external DTD.
        :param dtd: (optional) etree.DTD instance. if not provided, we try to guess.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            parser = etree.XMLParser(remove_blank_text=True, load_dtd=True,
                                     no_network=no_network)
            self.lxml = etree.parse(file, parser)

        # Fall back to the external DTD referenced by the document itself.
        self.dtd = dtd or self.lxml.docinfo.externalDTD
        self.schematron = XMLSchematron('scielo-style.sch')
        self.ppl = StyleCheckingPipeline()

    @cachedmethod
    def validate(self):
        """Validate the source XML against the loaded DTD.

        Returns a tuple comprising the validation status and the errors list.

        :raises TypeError: if no DTD is available (`self.dtd` is None).
        """
        if self.dtd is None:
            raise TypeError('The DTD/XSD could not be loaded.')

        # Validate first: the error log is read right after this call.
        result = self.dtd.validate(self.lxml)
        errors = [SchemaStyleError(err) for err in self.dtd.error_log]
        return result, errors

    @cachedmethod
    def _validate_sch(self):
        """Validate the source XML against the SPS Schematron.

        Returns a tuple comprising the validation status and the errors list.
        """
        # Validate first: the error log is read right after this call.
        result = self.schematron.validate(self.lxml)
        errors = [SchematronStyleError(err) for err in self.schematron.error_log]
        return result, errors

    @cachedmethod
    def validate_style(self):
        """Validate the source XML against the SPS Tagging guidelines.

        The errors are those produced by the style-checking pipeline followed
        by the Schematron ones; the status is true only when there are none.

        Returns a tuple comprising the validation status and the errors list.
        """
        errors = next(self.ppl.run(self.lxml, rewrap=True))
        errors += self._validate_sch()[1]
        return not bool(errors), errors

    def _annotate_error(self, element, error):
        """Add an annotation prior to `element`, with `error` as the content.

        The annotation is an XML comment of the form `SPS-ERROR: <error>`
        inserted right before `element`.

        :param element: etree instance to be annotated.
        :param error: string of the error.
        """
        element.addprevious(etree.Comment('SPS-ERROR: %s' % error))

    def annotate_errors(self, fail_fast=False):
        """Add notes on all elements that have errors.

        The errors list is generated as a result of calling both
        :meth:`validate` and :meth:`validate_style` methods. Errors whose
        element cannot be located are annotated on the root element.

        :param fail_fast: (optional) raise TypeError if the dtd have not been loaded.
        """
        try:
            v_result, v_errors = self.validate()
        except TypeError:
            if fail_fast:
                raise
            # Without a DTD, only the style errors get annotated.
            v_result = True
            v_errors = []

        s_result, s_errors = self.validate_style()

        if v_result and s_result:
            return None

        for error in itertools.chain(v_errors, s_errors):
            try:
                err_element = error.get_apparent_element(self.lxml)
            except ValueError:
                logger.info('Could not locate the element name in: %s', error.message)
                err_element = self.lxml.getroot()

            self._annotate_error(err_element, error.message)

    def __str__(self):
        return etree.tostring(self.lxml, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    def __unicode__(self):
        return str(self).decode('utf-8')

    def __repr__(self):
        try:
            is_valid = self.validate()[0]
        except TypeError:
            # No DTD available: validity is unknown, but repr() must not raise.
            is_valid = None
        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (self.lxml, is_valid)

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)