def _decodeDeclaration(sig, dec, permitted, loggedEvents):
    """Decode *sig* with decoder *dec* and validate its encoding declaration.

    A UnicodeError event is logged when the declaration is missing, or when
    it names an encoding outside *permitted* that is not merely an alias of
    a permitted one.  Returns whatever _encodingFromDecl found (a falsy
    value when there is no declaration).
    """
    text = _normaliseNewlines(dec(sig)[0])
    declared = _encodingFromDecl(text)

    if not declared:
        # The byte-level signature says this family of encodings requires
        # an explicit declaration, and none was found.
        _logEvent(
            loggedEvents,
            logging.UnicodeError({
                'exception': 'This XML file (apparently ' + permitted[0] +
                             ') requires an encoding declaration'
            }),
            (1, 1))
        return declared

    if permitted and declared[0].upper() not in permitted and _hasCodec(declared[0]):
        # The declared name might just be an alias of a permitted encoding;
        # compare the underlying codec entries to decide.
        target = codecs.lookup(declared[0])
        is_alias = False
        for candidate in permitted:
            if _hasCodec(candidate) and codecs.lookup(candidate)[-1] == target[-1]:
                is_alias = True
                break
        if not is_alias:
            _logEvent(
                loggedEvents,
                logging.UnicodeError({
                    'exception': 'This XML file claims an encoding of ' +
                                 declared[0] + ', but looks more like ' +
                                 permitted[0]
                }),
                declared[1])

    return declared
def _decodePostBOMDeclaration(sig, dec, permitted, loggedEvents, fallback=None):
    """Validate the encoding declaration of a document that began with a BOM.

    The BOM already fixed the encoding family (*permitted*); any in-document
    declaration must agree with it.  On conflict a UnicodeError event is
    logged and None is returned.  Otherwise returns the declaration found,
    or ``(fallback, None)`` when the document declares nothing.
    """
    text = _normaliseNewlines(dec(sig)[0])
    declared = _encodingFromDecl(text)

    # A declaration that contradicts the BOM is a hard inconsistency.
    if declared and declared[0].upper() not in permitted:
        _logEvent(
            loggedEvents,
            logging.UnicodeError({'exception': 'Document starts with ' + permitted[0] + ' BOM marker but has incompatible declaration of ' + declared[0]}),
            declared[1])
        return None

    return declared or (fallback, None)
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None):
    """validate RSS from string, returns validator object

    Feeds *aString* (already Unicode) through a namespace-aware SAX parse
    into a SAXDispatcher, carrying over any previously logged events, and
    returns the dispatcher so callers can inspect its results.
    """
    from xml.sax import make_parser, handler
    from base import SAXDispatcher
    from exceptions import UnicodeError
    from cStringIO import StringIO

    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)
    # Carry forward events logged during earlier (pre-parse) stages.
    validator.loggedEvents += loggedEvents

    # experimental RSS-Profile draft 1.06 support
    validator.setLiterals(re.findall('&(\w+);',aString))

    # Flag any XML version other than 1.0 declared in the prolog.
    xmlver = re.match("^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]",aString)
    if xmlver and xmlver.group(1)<>'1.0':
        validator.log(logging.BadXmlVersion({"version":xmlver.group(1)}))

    # Prefer an expat parser patched to tolerate an external DTD reference;
    # fall back to whatever SAX parser is available if that import fails.
    try:
        from xml.sax.expatreader import ExpatParser
        class fake_dtd_parser(ExpatParser):
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)
        parser = fake_dtd_parser()
    except:
        parser = make_parser()

    parser.setFeature(handler.feature_namespaces, 1)
    # The validator acts as content handler, error handler and entity resolver.
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})

    # Optional DTD validation pass via libxml2; installed on the validator as
    # a callable so it only runs (and only imports libxml2) if invoked.
    def xmlvalidate(log):
        import libxml2
        from StringIO import StringIO
        from random import random
        # Random prefix lets us pick our own messages out of libxml2's
        # collected error output below.
        prefix="...%s..." % str(random()).replace('0.','')
        msg=[]
        # First lambda arg is libxml2's user-data (our msg list),
        # second is the error text being appended.
        libxml2.registerErrorHandler(lambda msg,str: msg.append(str), msg)
        input = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = input.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        ret = reader.Read()
        while ret == 1: ret = reader.Read()
        msg=''.join(msg)
        for line in msg.splitlines():
            if line.startswith(prefix): log(line.split(':',4)[-1].strip())
    validator.xmlvalidator=xmlvalidate

    # Main parse: SAX errors were already logged by the validator's error
    # handler; Unicode failures are logged here explicitly.
    try:
        parser.parse(source)
    except SAXException:
        pass
    except UnicodeError:
        import sys
        exctype, value = sys.exc_info()[:2]
        validator.log(logging.UnicodeError({"exception":value}))

    # For RSS 1.0 feeds, re-parse as RDF/XML (best-effort: any failure,
    # including rdflib being absent, is silently ignored).
    if validator.getFeedType() == TYPE_RSS1:
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler
            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}
                def add(self, triple): pass
                def __init__(self, dispatcher):
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher=dispatcher
                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))
            # Rewind the same byte stream and reuse the parser for the RDF pass.
            source.getByteStream().reset()
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except:
            pass

    return validator
def decode(mediaType, charset, bs, loggedEvents, fallback=None): eo = _detect(bs, loggedEvents, fallback=None) # Check declared encodings if eo and eo[1] and _hasCodec(eo[0]): if not (isCommon(eo[0])): _logEvent(loggedEvents, ObscureEncoding({"encoding": eo[0]}), eo[1]) elif not (isStandard(eo[0])): _logEvent(loggedEvents, NonstdEncoding({"encoding": eo[0]}), eo[1]) if eo: encoding = eo[0] else: encoding = None if charset and encoding and charset.lower() != encoding.lower(): # RFC 3023 requires us to use 'charset', but a number of aggregators # ignore this recommendation, so we should warn. loggedEvents.append( logging.EncodingMismatch({ "charset": charset, "encoding": encoding })) if mediaType and mediaType.startswith("text/") and charset is None: loggedEvents.append(logging.TextXml({})) # RFC 3023 requires text/* to default to US-ASCII. Issue a warning # if this occurs, but continue validation using the detected encoding try: bs.decode("US-ASCII") except: if not encoding: try: bs.decode(fallback) encoding = fallback except: pass if encoding and encoding.lower() != 'us-ascii': loggedEvents.append( logging.EncodingMismatch({ "charset": "US-ASCII", "encoding": encoding })) enc = charset or encoding if enc is None: loggedEvents.append(logging.MissingEncoding({})) enc = fallback elif not (_hasCodec(enc)): if eo: _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}), eo[1]) else: _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc})) enc = fallback if enc is None: return enc, None dec = getdecoder(enc) try: return enc, dec(bs)[0] except UnicodeError, ue: salvage = dec(bs, 'replace')[0] if 'start' in ue.__dict__: # XXX 'start' is in bytes, not characters. This is wrong for multibyte # encodings pos = _position(salvage, ue.start) else: pos = None _logEvent(loggedEvents, logging.UnicodeError({"exception": ue}), pos) return enc, salvage