def create(self, request, user_name, vendor, name, version):
    """Attach the tags posted in ``tags_xml`` to a gadget for one user.

    The tag names are extracted from the XML document found under the
    ``'tags_xml'`` key of ``request`` by a ``TagsXMLHandler``; one ``UserTag``
    row is created (or reused) per tag.

    Args:
        request: Request object; must support ``request['tags_xml']``.
        user_name: Name passed to ``user_authentication`` to obtain the user.
        vendor, name, version: Identify the ``GadgetResource`` to tag.

    Returns:
        An ``HttpResponse`` carrying an XML prolog followed by the output of
        ``get_tags_by_resource``, or an ``HttpResponseServerError`` as soon as
        storing one of the tags fails.
    """
    user = user_authentication(user_name)

    # Get the xml containing the tags from the request
    tags_xml = request['tags_xml']

    try:
        from StringIO import StringIO
    except ImportError:
        from cStringIO import StringIO

    # Parse the xml containing the tags with our own content handler
    parser = make_parser()
    handler = TagsXMLHandler()
    parser.setContentHandler(handler)
    inpsrc = InputSource()
    inpsrc.setByteStream(StringIO(tags_xml))
    parser.parse(inpsrc)

    # Get the gadget for this vendor, name and version (404 when unknown)
    gadget = get_object_or_404(GadgetResource, short_name=name,
                               vendor=vendor, version=version)

    # Insert the tags for this resource and user in the database
    for tag in handler._tags:
        try:
            UserTag.objects.get_or_create(tag=tag, idUser=user,
                                          idResource=gadget)
        except Exception as exc:
            # Only real errors are reported as a server error; a bare
            # ``except`` would also have swallowed SystemExit and
            # KeyboardInterrupt.
            return HttpResponseServerError(
                get_xml_error(str(exc)),
                mimetype='text/xml; charset=UTF-8')

    response = '<?xml version="1.0" encoding="UTF-8" ?>\n'
    response += get_tags_by_resource(gadget, user)
    return HttpResponse(response, mimetype='text/xml; charset=UTF-8')
def countriesSource() -> InputSource:
    '''
    Builds the SAX input source for the countries XML.

    The byte stream is whatever openURI returns for the file
    'iso_3166-1_list_en.xml' located next to this module.
    '''
    xml_location = path.join(path.dirname(__file__), 'iso_3166-1_list_en.xml')
    countries = InputSource()
    countries.setByteStream(openURI(xml_location))
    return countries
def rootElement(self):
    """Parse ``self.filename`` and return the root element of the result.

    A fresh ``ConfigurationMachine`` is seeded with shallow copies of the
    registry and features of the context returned by ``getConfigContext()``,
    the file is parsed with ``MyConfigHandler``, and the handler's root
    element is marked as providing ``IRootDirective`` and parented to
    ``self`` before being returned.
    """
    # Get the context that was originally generated during startup and
    # create a new context using its registrations
    real_context = zope.app.appsetup.appsetup.getConfigContext()
    context = config.ConfigurationMachine()
    context._registry = copy.copy(real_context._registry)
    context._features = copy.copy(real_context._features)
    context.package = self.package

    # Shut up i18n domain complaints
    context.i18n_domain = 'zope'

    # Since we want to use a custom configuration handler, we need to
    # instantiate the parser object ourselves
    parser = make_parser()
    handler = MyConfigHandler(context)
    parser.setContentHandler(handler)
    parser.setFeature(feature_namespaces, True)

    # Open the file and parse it; the ``with`` block guarantees the handle
    # is released even when parsing fails.
    with open(self.filename) as config_file:
        src = InputSource(getattr(config_file, 'name', '<string>'))
        src.setByteStream(config_file)
        parser.parse(src)

    # Finally we retrieve the root element, have it provide a special root
    # directive interface and give it a location, so that we can do local
    # lookups.
    root = handler.rootElement
    directlyProvides(root, IRootDirective)
    root.__parent__ = self
    return root
def __loadxmlparts(z, manifest, doc, objectpath):
    """Feed the standard XML parts found under ``objectpath`` into ``doc``.

    For each of settings.xml, meta.xml, content.xml and styles.xml
    (prefixed with ``objectpath``) that is listed in ``manifest``, the part is
    read from ``z`` and parsed with a namespace-aware SAX parser driving
    ``LoadParser(doc)``. A ``KeyError`` raised while handling a part is
    ignored and the next part is tried.
    """
    from load import LoadParser
    from xml.sax import make_parser, handler

    part_names = ('settings.xml', 'meta.xml', 'content.xml', 'styles.xml')
    for xmlfile in [objectpath + part for part in part_names]:
        if xmlfile in manifest:
            try:
                payload = z.read(xmlfile)
                # Remember which part is being parsed while the parser runs.
                doc._parsing = xmlfile

                reader = make_parser()
                reader.setFeature(handler.feature_namespaces, 1)
                reader.setContentHandler(LoadParser(doc))
                reader.setErrorHandler(handler.ErrorHandler())

                part_source = InputSource()
                part_source.setByteStream(StringIO(payload))
                reader.parse(part_source)
                del doc._parsing
            except KeyError:
                pass
def parse(self, xml, source='string'):
    '''Parses a XML stream.

    * If ``source`` is "string", ``xml`` must be a string containing valid
      XML content.
    * Otherwise ``xml`` can be:
      - a string containing the path to the XML file on disk;
      - a file instance opened for reading. Note that in this case, this
        method will close it.

    The parser is configured with ``self`` as content and error handler and
    with external general entities disabled. Returns ``self.res``.
    '''
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    self.parser.setContentHandler(self)
    self.parser.setErrorHandler(self)
    self.parser.setFeature(feature_external_ges, False)
    inputSource = InputSource()
    if source == 'string':
        inputSource.setByteStream(StringIO(xml))
    else:
        if not isinstance(xml, file):
            xml = file(xml)
        inputSource.setByteStream(xml)
    try:
        self.parser.parse(inputSource)
    finally:
        # Close the file even when the parser raises, so that a malformed
        # document does not leak the handle.
        if isinstance(xml, file):
            xml.close()
    return self.res
def load(odffile):
    """ Load an ODF file into memory
        Returns a reference to the structure

    The archive's ``mimetype`` entry is used to create the ``OpenDocument``;
    each of settings.xml, meta.xml, content.xml and styles.xml that is listed
    in the manifest is then parsed into it with ``LoadParser``. A ``KeyError``
    raised while handling one part is ignored and the next part is tried.
    """
    from load import LoadParser
    from xml.sax import make_parser, handler
    z = zipfile.ZipFile(odffile)
    try:
        mimetype = z.read("mimetype")
        doc = OpenDocument(mimetype, add_generator=False)

        # Look in the manifest file to see which of the four files there are
        manifestpart = z.read("META-INF/manifest.xml")
        manifest = manifestlist(manifestpart)
        for xmlfile in ("settings.xml", "meta.xml", "content.xml",
                        "styles.xml"):
            if xmlfile not in manifest:
                continue
            try:
                xmlpart = z.read(xmlfile)
                doc._parsing = xmlfile
                parser = make_parser()
                parser.setFeature(handler.feature_namespaces, 1)
                parser.setContentHandler(LoadParser(doc))
                parser.setErrorHandler(handler.ErrorHandler())

                inpsrc = InputSource()
                inpsrc.setByteStream(StringIO(xmlpart))
                parser.parse(inpsrc)
                del doc._parsing
            except KeyError:
                pass
    finally:
        # Release the archive handle whether or not loading succeeded.
        z.close()
    # The docstring promises the loaded structure, so hand it back.
    return doc
def limit_featurecollection(content, limit=200):
    """
    Re-serialise a WFS FeatureCollection XML document through
    ``_XMLFilterLimit`` configured with ``limit`` (200 by default).

    ``content`` is wrapped in a ``BytesIO``; the filtered document is
    written by an ``XMLGenerator`` using 'utf-8' and returned as the
    collected output string.
    """
    source_bytes = BytesIO(content)
    sink = StringIO()

    input_source = InputSource()
    input_source.setByteStream(source_bytes)

    limiter = _XMLFilterLimit(make_parser(), XMLGenerator(sink, 'utf-8'),
                              limit=limit)
    limiter.parse(input_source)

    result = sink.getvalue()
    source_bytes.close()
    sink.close()
    return result
def create(self, request, vendor, name, version):
    """Tag a catalogue resource with the tags posted in ``tags_xml``.

    Reads ``format`` (default ``'default'``) and ``tags_xml`` from
    ``request.POST``, extracts the tags with ``TagsXMLHandler`` and applies
    each one via ``tag_resource`` for ``request.user``. Returns whatever
    ``get_tag_response`` produces for the resource, user and format.
    """
    response_format = request.POST.get('format', 'default')

    # The posted XML is UTF-8 encoded before being handed to the parser.
    encoded_tags = request.POST.get('tags_xml').encode("utf-8")

    tags_handler = TagsXMLHandler()
    parser = make_parser()
    parser.setContentHandler(tags_handler)

    xml_source = InputSource()
    xml_source.setByteStream(StringIO(encoded_tags))
    parser.parse(xml_source)

    # Resolve the resource for this vendor/name/version (404 when unknown).
    resource = get_object_or_404(CatalogueResource, short_name=name,
                                 vendor=vendor, version=version)

    # Store every parsed tag for this resource and user.
    for tag in tags_handler._tags:
        tag_resource(request.user, tag, resource)

    return get_tag_response(resource, request.user, response_format)
def Load(self):
    """Parse ``self.filename`` with ``SVGHandler`` and return ``self.object``.

    External general entities are disabled on the reader. After parsing,
    ``end_all()`` is called, the page layout (if any) is applied and the
    object is told that loading completed. Any exception is reported through
    ``warn_tb('INTERNAL')`` and re-raised.
    """
    try:
        self.document()
        self.layer()

        error_handler = ErrorHandler()
        entity_resolver = EntityResolver()
        dtd_handler = DTDHandler()

        # The original wrote ``input.close`` without calling it, so the file
        # was never closed; the ``with`` block closes it on every path.
        with open(self.filename, "r") as svg_file:
            input_source = InputSource()
            input_source.setByteStream(svg_file)
            xml_reader = xml.sax.make_parser()
            xml_reader.setContentHandler(SVGHandler(self))
            xml_reader.setErrorHandler(error_handler)
            xml_reader.setEntityResolver(entity_resolver)
            xml_reader.setDTDHandler(dtd_handler)
            xml_reader.setFeature(handler.feature_external_ges, False)
            xml_reader.parse(input_source)

        self.end_all()

        if self.page_layout:
            self.object.load_SetLayout(self.page_layout)

        self.object.load_Completed()
        return self.object
    except:
        # Log and propagate: nothing is swallowed here.
        warn_tb('INTERNAL')
        raise
def _build_model(self):
    """Parse ``content.xml`` from the presenter's doc dir into ``self.model``.

    The file is first scanned to count its lines (stored on the content
    handler as ``lines``), then parsed with ``XMLDocReader``. On success an
    OK message is emitted; on failure an ERROR message is emitted and an
    ``IOError`` is raised.
    """
    content_handler = XMLDocReader(self.presenter)
    error_handler = ErrorHandler()
    entity_resolver = EntityResolver()
    dtd_handler = DTDHandler()
    try:
        filename = os.path.join(self.presenter.doc_dir, 'content.xml')

        # First pass: count lines.
        with open(filename, 'r') as counting_file:
            lines = float(sum(1 for _line in counting_file))

        # Second pass: the actual parse. The handle is kept on ``self`` as
        # before, but is now closed on failure too.
        self.file_handler = open(filename, "r")
        try:
            input_source = InputSource()
            input_source.setByteStream(self.file_handler)
            content_handler.lines = lines
            xml_reader = xml.sax.make_parser()
            xml_reader.setContentHandler(content_handler)
            xml_reader.setErrorHandler(error_handler)
            xml_reader.setEntityResolver(entity_resolver)
            xml_reader.setDTDHandler(dtd_handler)
            xml_reader.parse(input_source)
        finally:
            self.file_handler.close()
        content_handler.file = None

    except Exception:
        errtype, value, traceback = sys.exc_info()
        # ``value`` is an exception instance; concatenating it to a string
        # directly raises TypeError and masks the real error.
        msg = _('It seems content.xml is corrupted') + '\n' + str(value)
        events.emit(events.MESSAGES, msgconst.ERROR, msg)
        raise IOError(errtype, msg, traceback)

    self.model = content_handler.model

    msg = _('Content.xml is parsed successfully')
    events.emit(events.MESSAGES, msgconst.OK, msg)
class AbstractXMLLoader(AbstractLoader, handler.ContentHandler):
    """Loader base class that is its own SAX content handler.

    ``init_load`` wires a SAX reader to ``self.fileptr`` and then calls
    ``do_load()``; ``start_parsing`` runs the reader. The SAX callbacks are
    forwarded to the ``start_element`` / ``end_element`` / ``element_data``
    hooks, which do nothing here.
    """

    # Both are created by init_load().
    xml_reader = None
    input_source = None

    def init_load(self):
        """Create the input source and reader, then call ``do_load()``."""
        self.input_source = source = InputSource()
        source.setByteStream(self.fileptr)

        self.xml_reader = reader = xml.sax.make_parser()
        reader.setContentHandler(self)
        reader.setErrorHandler(ErrorHandler())
        reader.setEntityResolver(EntityResolver())
        reader.setDTDHandler(DTDHandler())

        self.do_load()

    def start_parsing(self):
        """Run the reader over the prepared input source."""
        self.xml_reader.parse(self.input_source)

    # --- SAX ContentHandler callbacks, forwarded to the hooks below ---

    def startElement(self, name, attrs):
        self.start_element(name, attrs)

    def endElement(self, name):
        self.end_element(name)

    def characters(self, data):
        self.element_data(data)

    # --- Hooks; no-ops in this class ---

    def start_element(self, name, attrs):
        pass

    def end_element(self, name):
        pass

    def element_data(self, data):
        pass
def parse(self):
    """Parse ``self.source`` with this object as the SAX content handler.

    ``self.source`` must be a ``unicode`` string or an ``InputSource``; any
    other type raises ``Exception``. Validation and external general
    entities are switched off on the parser.
    """
    origin = self.source
    if isinstance(origin, unicode):
        # Wrap the text in an in-memory stream.
        text_stream = io.StringIO(origin)
        sax_input = InputSource(text_stream)
        sax_input.setEncoding("utf-8")
        sax_input.setCharacterStream(text_stream)
        # There is a bug in xml.sax.saxutils.prepare_input_source, so the
        # same stream is registered as the byte stream as well.
        sax_input.setByteStream(text_stream)
        sax_input.setSystemId(None)
    elif isinstance(origin, InputSource):
        sax_input = origin
    else:
        raise Exception("Parse source must be either string or InputSource")

    # Build the reader and route content events to this object.
    reader = xml.sax.make_parser()
    reader.setContentHandler(self)

    # No DTD validation, no external general entities.
    reader.setFeature(xml.sax.handler.feature_validation, False)
    reader.setFeature(xml.sax.handler.feature_external_ges, False)

    reader.parse(sax_input)
def load(odffile):
    """Load an ODF archive into an ``OpenDocument`` and return it.

    Steps, in order:
      1. create the document from the archive's ``mimetype`` entry;
      2. parse settings.xml, meta.xml, content.xml and styles.xml (those
         listed in the manifest) with ``LoadParser``;
      3. register the remaining manifest entries as pictures, thumbnail or
         opaque extra objects;
      4. expose the first child of the first ``Body`` element under an
         attribute named after the document kind (``doc.text``,
         ``doc.spreadsheet``, ...), chosen by mimetype prefix.
    """
    from load import LoadParser
    from xml.sax import make_parser, handler
    standard_parts = ('settings.xml', 'meta.xml', 'content.xml', 'styles.xml')

    z = zipfile.ZipFile(odffile)
    try:
        mimetype = z.read('mimetype')
        doc = OpenDocument(mimetype, add_generator=False)

        # Look in the manifest file to see which of the four files there are
        manifestpart = z.read('META-INF/manifest.xml')
        manifest = manifestlist(manifestpart)
        for xmlfile in standard_parts:
            if xmlfile not in manifest:
                continue
            try:
                xmlpart = z.read(xmlfile)
                doc._parsing = xmlfile
                parser = make_parser()
                parser.setFeature(handler.feature_namespaces, 1)
                parser.setContentHandler(LoadParser(doc))
                parser.setErrorHandler(handler.ErrorHandler())

                inpsrc = InputSource()
                inpsrc.setByteStream(StringIO(xmlpart))
                parser.parse(inpsrc)
                del doc._parsing
            except KeyError:
                pass

        # FIXME: Add subobjects correctly here
        for mentry, mvalue in manifest.items():
            if mentry[:9] == "Pictures/" and len(mentry) > 9:
                doc.addPicture(mvalue['full-path'], mvalue['media-type'],
                               z.read(mentry))
            elif mentry == "Thumbnails/thumbnail.png":
                doc.addThumbnail(z.read(mentry))
            elif mentry in standard_parts:
                pass
            else:
                if mvalue['full-path'][-1] == '/':
                    # Directory entry: nothing to read from the archive.
                    doc._extra.append(OpaqueObject(
                        mvalue['full-path'], mvalue['media-type'], None))
                else:
                    doc._extra.append(OpaqueObject(
                        mvalue['full-path'], mvalue['media-type'],
                        z.read(mentry)))
                # Add the SUN junk here to the struct somewhere
                # It is cached data, so it can be out-of-date
    finally:
        # Release the archive even when reading or parsing fails.
        z.close()

    b = doc.getElementsByType(Body)
    # Prefix comparison (not equality): anything after the kind name in the
    # mimetype is ignored, exactly as the original slice tests did.
    base = 'application/vnd.oasis.opendocument.'
    for kind in ('text', 'graphics', 'presentation', 'spreadsheet',
                 'chart', 'image', 'formula'):
        prefix = base + kind
        if mimetype[:len(prefix)] == prefix:
            setattr(doc, kind, b[0].firstChild)
            break
    return doc
def resolveEntity(self, publicId, systemId):
    """Resolve ``systemId`` against the directories in ``self.path``.

    Returns an ``InputSource`` whose system id is ``systemId``. When a file
    named ``systemId`` exists in one of the directories (first match wins),
    it is attached as the byte stream; otherwise the source carries no
    stream. ``publicId`` is not used.
    """
    for directory in self.path:
        candidate = os.path.join(directory, systemId)
        if os.path.exists(candidate):
            source = InputSource(systemId)
            # A byte stream must deliver bytes: open in binary mode so the
            # parser sees the raw content and applies the declared encoding.
            source.setByteStream(open(candidate, 'rb'))
            return source
    return InputSource(systemId)
def parse_request(self, soap_body, sinfo, encoding):
    """Parse a SOAP body and return the handler's ``req_dict``.

    ``soap_body`` is wrapped in a ``BytesIO`` and parsed with a
    ``SOAP11ContentHandler``. ``sinfo`` and ``encoding`` are accepted but not
    used by this implementation.
    """
    reader = make_parser()
    content_handler = SOAP11ContentHandler(reader)
    reader.setContentHandler(content_handler)

    body_source = InputSource()
    body_source.setByteStream(BytesIO(soap_body))
    reader.parse(body_source)

    return content_handler.req_dict
def test_parse_InputSource(self):
    """Data with no declared encoding parses when the encoding is set
    explicitly on the InputSource."""
    # Write the fixture as iso-8859-1 without an encoding declaration.
    make_xml_file(self.data, 'iso-8859-1', None)
    with open(TESTFN, 'rb') as stream:
        source = InputSource()
        source.setEncoding('iso-8859-1')
        source.setByteStream(stream)
        self.check_parse(source)
def test_byte_stream(self):
    """An InputSource with only a byte stream keeps using that stream."""
    # No character stream is set on the source, only a byte stream.
    source = InputSource(self.file)
    stream = self.make_byte_stream()
    source.setByteStream(stream)

    prepared = prepare_input_source(source)

    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(), b"This is a byte stream.")
def test_expat_inpsource_stream():
    """Parse test.xml through an InputSource byte stream.

    Returns True when the XMLGenerator output equals ``xml_test_out``.
    """
    parser = make_parser()
    result = StringIO()
    xmlgen = XMLGenerator(result)

    parser.setContentHandler(xmlgen)
    inpsrc = InputSource()
    # Close the fixture file deterministically instead of leaking it.
    with open(findfile("test.xml")) as stream:
        inpsrc.setByteStream(stream)
        parser.parse(inpsrc)

    return result.getvalue() == xml_test_out
def test_expat_inpsource_location():
    """Check that a parse error reports the InputSource's system id.

    Returns True when parsing the ill-formed document raises a
    ``SAXException`` whose system id equals the one set on the source, and
    False when no exception is raised at all.
    """
    parser = make_parser()
    parser.setContentHandler(ContentHandler())  # do nothing
    source = InputSource()
    source.setByteStream(StringIO("<foo bar foobar>"))  # ill-formed
    name = "a file name"
    source.setSystemId(name)
    try:
        parser.parse(source)
    except SAXException as e:
        return e.getSystemId() == name
    # Parsing unexpectedly succeeded: report failure explicitly rather than
    # falling off the end of the function.
    return False
def test_expat_inpsource_stream(self):
    """Parsing test.xml via an InputSource byte stream reproduces
    ``xml_test_out``."""
    parser = create_parser()
    result = StringIO()
    xmlgen = XMLGenerator(result)

    parser.setContentHandler(xmlgen)
    inpsrc = InputSource()
    # Close the fixture file deterministically instead of leaking it.
    with open(findfile("test.xml")) as stream:
        inpsrc.setByteStream(stream)
        parser.parse(inpsrc)

    self.assertEqual(result.getvalue(), xml_test_out)
def test_expat_inpsource_location(self):
    """A parse error carries the system id set on the InputSource."""
    parser = create_parser()
    parser.setContentHandler(ContentHandler())  # events are ignored

    name = "a file name"
    source = InputSource()
    source.setSystemId(name)
    source.setByteStream(StringIO("<foo bar foobar>"))  # ill-formed input

    try:
        parser.parse(source)
    except SAXException as e:
        self.assertEqual(e.getSystemId(), name)
    else:
        # The ill-formed document must not parse cleanly.
        self.fail()
def test_expat_inpsource_stream(self):
    """Parsing TEST_XMLFILE via an InputSource byte stream reproduces
    ``xml_test_out``."""
    parser = create_parser()
    output = StringIO()
    parser.setContentHandler(XMLGenerator(output))

    source = InputSource()
    with open(TEST_XMLFILE) as f:
        source.setByteStream(f)
        parser.parse(source)

    self.assertEqual(output.getvalue(), xml_test_out)
def manifestlist(manifestxml):
    """Parse manifest XML and return the handler's ``manifest``.

    ``manifestxml`` is the content of a manifest file; it is parsed
    namespace-aware with ``ODFManifestHandler``.
    """
    odhandler = ODFManifestHandler()
    parser = make_parser()
    parser.setFeature(handler.feature_namespaces, 1)
    # The manifest comes out of a document file that may be untrusted: do
    # not let the parser fetch external general entities or DTDs.
    parser.setFeature(handler.feature_external_ges, False)
    parser.setContentHandler(odhandler)
    parser.setErrorHandler(handler.ErrorHandler())

    inpsrc = InputSource()
    inpsrc.setByteStream(StringIO(manifestxml))
    parser.parse(inpsrc)

    return odhandler.manifest
def resolveEntity(self, publicId, systemId):
    """Return an InputSource for a known DTD.

    ``systemId`` is looked up in ``self.knownDTDs``; the matching entry is
    opened and attached as the byte stream. An unknown identifier raises
    ``process.ProcessingFailure``. ``publicId`` is not used.
    """
    try:
        dtd_location = self.knownDTDs[systemId]
    except KeyError:
        message = (
            "Invalid DTD system identifier (%r) in %s. Only "
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd "
            "is allowed." % (systemId, self.filename))
        raise process.ProcessingFailure(message)

    dtd_source = InputSource()
    dtd_source.setSystemId(systemId)
    dtd_source.setByteStream(dtd_location.open())
    return dtd_source
def manifestlist(manifestxml):
    """Parse manifest XML and return the handler's ``manifest``.

    The parser is namespace-aware and has external general entities turned
    off, so external DTDs are ignored.
    """
    manifest_handler = ODFManifestHandler()

    reader = make_parser()
    reader.setFeature(handler.feature_namespaces, 1)
    reader.setContentHandler(manifest_handler)
    reader.setErrorHandler(handler.ErrorHandler())

    manifest_source = InputSource()
    manifest_source.setByteStream(StringIO(manifestxml))

    # Changed by Kovid to ignore external DTDs
    reader.setFeature(handler.feature_external_ges, False)
    reader.parse(manifest_source)

    return manifest_handler.manifest
def listNewDatasets(config, onlyThese=None):
    """ Reads the table_of_contents.xml to determine which datasets have
        changed since the last download.

    The file ``table_of_contents.xml`` in the current working directory is
    parsed with ``ToCParser(config, onlyThese)``.

    Args:
        config: Passed through to ``ToCParser``.
        onlyThese: Passed through to ``ToCParser``; defaults to a new empty
            list on every call.

    Returns:
        The parser's ``todownload`` attribute.
    """
    # A literal ``[]`` default would be shared between calls.
    if onlyThese is None:
        onlyThese = []

    # Start XML parsing
    parser = make_parser()
    parser.setFeature(handler.feature_namespaces, 1)
    ch = ToCParser(config, onlyThese)
    parser.setContentHandler(ch)
    parser.setErrorHandler(handler.ErrorHandler())

    # Close the table of contents once parsing is over, even on error.
    with open('table_of_contents.xml') as toc:
        inpsrc = InputSource()
        inpsrc.setByteStream(toc)
        parser.parse(inpsrc)
    return ch.todownload
def processxmlfile(file, context, testing=False): """Process a configuration file See examples in tests/text_xmlconfig.py """ src = InputSource(getattr(file, 'name', '<string>')) src.setByteStream(file) parser = make_parser() parser.setContentHandler(ConfigurationHandler(context, testing=testing)) parser.setFeature(feature_namespaces, True) try: parser.parse(src) except SAXParseException: raise ZopeSAXParseException(sys.exc_info()[1]), None, sys.exc_info()[2]
def load_from_file(self, filename=None):
    """
    Loads color scheme from a well-formed xml file, then fills in defaults.

    filename - full path to xml file; when empty or None no file is read.

    After the optional load, ``name`` is reset to None and every colour
    attribute that is still None is derived from another attribute (or a
    fixed value), in the order written below. Parse errors are printed with
    ``traceback.print_exc()`` and re-raised.
    """
    if filename:
        content_handler = XMLPrefReader(pref=self)
        error_handler = ErrorHandler()
        entity_resolver = EntityResolver()
        dtd_handler = DTDHandler()
        try:
            # The original wrote ``input.close`` without calling it, so the
            # file was never closed; ``with`` closes it on every path.
            with open(filename, "r") as pref_file:
                input_source = InputSource()
                input_source.setByteStream(pref_file)
                xml_reader = xml.sax.make_parser()
                xml_reader.setContentHandler(content_handler)
                xml_reader.setErrorHandler(error_handler)
                xml_reader.setEntityResolver(entity_resolver)
                xml_reader.setDTDHandler(dtd_handler)
                xml_reader.parse(input_source)
        except:
            # Report and propagate: nothing is swallowed here.
            import traceback
            traceback.print_exc()
            raise

    self.name = None

    # Order matters: later defaults read attributes set by earlier ones.
    if self.disabledforeground is None:
        self.disabledforeground = lighter_color(self.foreground, .3)
    if self.menubackground is None:
        self.menubackground = self.bg
    if self.menuforeground is None:
        self.menuforeground = self.foreground
    if self.menuselectbackground is None:
        self.menuselectbackground = self.selectbackground
    if self.menuselectforeground is None:
        self.menuselectforeground = self.selectforeground
    if self.menudisabledforeground is None:
        self.menudisabledforeground = self.disabledforeground
    if self.menubordercolor is None:
        self.menubordercolor = self.disabledforeground
    if self.editfieldbackground is None:
        self.editfieldbackground = '#ffffff'
    if self.editfieldforeground is None:
        self.editfieldforeground = self.foreground
    if self.evencolor is None:
        self.evencolor = middle_color(self.bg, self.editfieldbackground, 0.7)
    if self.treelinescolor is None:
        self.treelinescolor = self.editfieldforeground
def __loadxmlparts(z, manifest, doc, objectpath):
    """
    Parses a document from its zipfile
    @param z an instance of zipfile.ZipFile
    @param manifest Manifest data structured in a dictionary
    @param doc instance of OpenDocument to feed in
    @param objectpath unicode string: path to an object

    For each of settings.xml, meta.xml, content.xml and styles.xml under
    objectpath that is listed in the manifest, the part is decoded as
    UTF-8, passed through __fixXmlPart and parsed into doc. A KeyError
    is ignored; a SAXParseException is reported with print and the loop
    goes on.
    """
    assert isinstance(z, zipfile.ZipFile)
    assert type(manifest) == type(dict())
    assert isinstance(doc, OpenDocument)
    assert type(objectpath) == type(u"")

    from odf.load import LoadParser
    from defusedxml.sax import make_parser
    from xml.sax import handler
    # Python 2 raises exceptions of this type on bad input; catch it to
    # help debugging.
    from xml.sax._exceptions import SAXParseException

    for part in (u'settings.xml', u'meta.xml', u'content.xml', u'styles.xml'):
        xmlfile = objectpath + part
        if xmlfile not in manifest:
            continue
        try:
            xmlpart = z.read(xmlfile).decode("utf-8")
            doc._parsing = xmlfile

            reader = make_parser()
            reader.setFeature(handler.feature_namespaces, 1)
            reader.setFeature(handler.feature_external_ges, 0)
            reader.setContentHandler(LoadParser(doc))
            reader.setErrorHandler(handler.ErrorHandler())

            part_source = InputSource()
            # A SAXParseException may be triggered by a missing xmlns
            # prefix (meta, config, ...), so such declarations are added
            # when needed (GK, 2014/10/21).
            xmlpart = __fixXmlPart(xmlpart)
            part_source.setByteStream(BytesIO(xmlpart.encode("utf-8")))
            reader.parse(part_source)
            del doc._parsing
        except KeyError:
            pass
        except SAXParseException:
            print(u"====== SAX FAILED TO PARSE ==========\n", xmlpart)
def parse(self, charSet, file):
    '''
    Wraps the content in a SAX input source and parses it.

    @param charSet: string
        The character set of the content; set as the source encoding.
    @param file: byte file
        The bytes file object providing the content; set as the byte stream.
    @return: object
        Whatever parseInputSource returns for the prepared input source.
    '''
    source = InputSource()
    source.setEncoding(charSet)
    source.setByteStream(file)
    return self.parseInputSource(source)
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None, mediaType=None):
    """validate RSS from string, returns validator object

    The text is parsed with a SAX parser whose content handler, error handler
    and entity resolver are all the ``SAXDispatcher`` created here; that
    dispatcher is the return value.

    aString -- the feed text (expected to be Unicode by the time it is
        wrapped in an InputSource below)
    firstOccurrenceOnly -- forwarded to ``setFirstOccurrenceOnly``
    loggedEvents -- events logged so far; appended to the dispatcher's own
        ``loggedEvents`` (and extended in place by the WordPress check)
    base, encoding -- forwarded to ``SAXDispatcher``
    selfURIs -- forwarded to ``SAXDispatcher``; ``[base]`` when empty/None
    mediaType -- selects the feed type for the two Atom Publishing media
        types handled below
    """
    from xml.sax import make_parser, handler
    from .base import SAXDispatcher
    from exceptions import UnicodeError
    from cStringIO import StringIO
    # Leading whitespace before the XML declaration in a feed whose
    # <generator> mentions wordpress: log it, then move the first tag to
    # the front so parsing can continue.
    if re.match("^\s+<\?xml", aString) and re.search(
            "<generator.*wordpress.*</generator>", aString):
        lt = aString.find('<')
        gt = aString.find('>')
        if lt > 0 and gt > 0 and lt < gt:
            loggedEvents.append(logging.WPBlankLine({'line': 1, 'column': 1}))
            # rearrange so that other errors can be found
            aString = aString[lt:gt + 1] + aString[0:lt] + aString[gt + 1:]
    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))
    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)
    if mediaType == 'application/atomsvc+xml':
        validator.setFeedType(TYPE_APP_SERVICE)
    elif mediaType == 'application/atomcat+xml':
        validator.setFeedType(TYPE_APP_CATEGORIES)
    validator.loggedEvents += loggedEvents
    # experimental RSS-Profile support
    # One boolean per input line: does the line contain '&#x'?
    validator.rssCharData = [s.find('&#x') >= 0 for s in aString.split('\n')]
    # Log any XML version other than 1.0 declared in the prolog.
    xmlver = re.match(
        "^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
    if xmlver and xmlver.group(1) != '1.0':
        validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))
    # Prefer an expat parser that calls UseForeignDTD(1) after every reset;
    # on any failure fall back to the default SAX parser.
    try:
        from xml.sax.expatreader import ExpatParser
        class fake_dtd_parser(ExpatParser):
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)
        parser = fake_dtd_parser()
    except:
        parser = make_parser()
    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append(
            {'http://www.w3.org/XML/1998/namespace': 'xml'})
    def xmlvalidate(log):
        # Validate aString with libxml2 and pass each collected error line
        # that starts with the random prefix to ``log``.
        import libxml2
        from StringIO import StringIO
        from random import random
        # The prefix is used as the reader's URI so that matching error
        # lines can be recognised below.
        prefix = "...%s..." % str(random()).replace('0.', '')
        msg = []
        libxml2.registerErrorHandler(lambda msg, str: msg.append(str), msg)
        input = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = input.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        # Read until the reader stops returning 1.
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()
        msg = ''.join(msg)
        for line in msg.splitlines():
            if line.startswith(prefix):
                log(line.split(':', 4)[-1].strip())
    # Stored on the validator; it is not called in this function.
    validator.xmlvalidator = xmlvalidate
    try:
        parser.parse(source)
    except SAXException:
        # Ignored here; the validator is also the parser's error handler.
        pass
    except UnicodeError:
        import sys
        exctype, value = sys.exc_info()[:2]
        validator.log(logging.UnicodeError({"exception": value}))
    # RSS 1.0 feeds get a second pass through rdflib's RDF/XML handler;
    # the whole pass is best-effort (any failure is swallowed).
    if validator.getFeedType() == TYPE_RSS1:
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler
            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}
                def add(self, triple):
                    pass
                def __init__(self, dispatcher):
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher = dispatcher
                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))
            # Fresh input source over the same text for the second parse.
            source = InputSource()
            source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except:
            pass
    return validator
elif tag == (RELAXNS, 'choice') and self.currattr is None: self.optional = self.optional - 1 self.data = [] if __name__ == "__main__": elements = {} parser = make_parser() parser.setFeature(handler.feature_namespaces, 1) parser.setContentHandler(S22RelaxParser(elements)) parser.setErrorHandler(handler.ErrorHandler()) for relaxfile in ["simple-manifest-7-22.rng","simple-schema-7-22.rng"]: content = file(relaxfile) inpsrc = InputSource() inpsrc.setByteStream(content) parser.parse(inpsrc) slist = elements.keys() slist.sort() print "required_attributes = {" for s in slist: e = elements[s] if e.ns == DBNS: continue if len(e.attrs) > 0: print "# required_attributes" print "\t(%sNS,u'%s'):" % (nsdict.get(e.ns,'unknown').upper(), e.name), print "(" for a in e.attrs.values(): print "\t\t(%sNS,u'%s')," % (nsdict.get(a.ns,'unknown').upper(), a.name)
def resolveEntity(self, publicId, systemId):
    """Resolve every entity to the fixed document ``<entity/>``.

    Both identifiers are ignored; each call returns a new ``InputSource``
    with its own in-memory byte stream.
    """
    placeholder = BytesIO(b"<entity/>")
    resolved = InputSource()
    resolved.setByteStream(placeholder)
    return resolved
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None):
    """validate RSS from string, returns validator object

    The text is parsed with a SAX parser whose content handler, error handler
    and entity resolver are all the ``SAXDispatcher`` created here; that
    dispatcher is the return value.

    aString -- the feed text (expected to be Unicode by the time it is
        wrapped in an InputSource below)
    firstOccurrenceOnly -- forwarded to ``setFirstOccurrenceOnly``
    loggedEvents -- events logged so far; appended to the dispatcher's own
        ``loggedEvents``
    base, encoding -- forwarded to ``SAXDispatcher``
    selfURIs -- forwarded to ``SAXDispatcher``; ``[base]`` when empty/None
    """
    from xml.sax import make_parser, handler
    from base import SAXDispatcher
    from exceptions import UnicodeError
    from cStringIO import StringIO
    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))
    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)
    validator.loggedEvents += loggedEvents
    # Log any XML version other than 1.0 declared in the prolog.
    xmlver = re.match(
        "^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
    if xmlver and xmlver.group(1) <> '1.0':
        validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))
    # Prefer an expat parser that calls UseForeignDTD(1) after every reset;
    # on any failure fall back to the default SAX parser.
    try:
        from xml.sax.expatreader import ExpatParser
        class fake_dtd_parser(ExpatParser):
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)
        parser = fake_dtd_parser()
    except:
        parser = make_parser()
    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append(
            {'http://www.w3.org/XML/1998/namespace': 'xml'})
    def xmlvalidate(log):
        # Validate aString with libxml2 and pass each collected error line
        # that starts with the random prefix to ``log``.
        import libxml2
        from StringIO import StringIO
        from random import random
        # The prefix is used as the reader's URI so that matching error
        # lines can be recognised below.
        prefix = "...%s..." % str(random()).replace('0.', '')
        msg = []
        libxml2.registerErrorHandler(lambda msg, str: msg.append(str), msg)
        input = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = input.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        # Read until the reader stops returning 1.
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()
        msg = ''.join(msg)
        for line in msg.splitlines():
            if line.startswith(prefix):
                log(line.split(':', 4)[-1].strip())
    # Stored on the validator; it is not called in this function.
    validator.xmlvalidator = xmlvalidate
    try:
        parser.parse(source)
    except SAXParseException:
        # Ignored here; the validator is also the parser's error handler.
        pass
    except UnicodeError:
        import sys
        exctype, value = sys.exc_info()[:2]
        validator.log(logging.UnicodeError({"exception": value}))
    # RSS 1.0 feeds get a second pass through rdflib's RDF/XML handler;
    # the whole pass is best-effort (any failure is swallowed).
    if validator.getFeedType() == TYPE_RSS1:
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler
            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}
                def add(self, triple):
                    pass
                def __init__(self, dispatcher):
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher = dispatcher
                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))
            # Rewind the existing byte stream so the same source can be
            # parsed a second time.
            source.getByteStream().reset()
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except:
            pass
    return validator
except SAXNotRecognizedException: pass except SAXException, e: raise DistutilsModuleError(e.getMessage()) handler = InclusionFilter() parser.setContentHandler(handler) if isinstance(source, (str, unicode)): try: stream = Uri.UrlOpen(source) except OSError: # Assume part of an XInclude w/fallback. return source = InputSource(source) source.setByteStream(stream) elif hasattr(source, 'read'): stream = source source = InputSource(getattr(stream, 'name', None)) source.setByteStream(stream) parser.parse(source) return INDEX_TEMPLATE = """<?xml version="1.0" encoding="ISO-8859-1"?> <!DOCTYPE article PUBLIC "-//OASIS//DTD Simplified DocBook XML V1.1//EN" "http://docbook.org/xml/simple/1.1/sdocbook.dtd"> <?ftdb-ignore-namespace http://xmlns.4suite.org/reserved?> <article> <title>%(fullname)s Document Index</title> %(sections)s
def resolveEntity(self, publicId, systemId):
    """Resolve every entity to an empty document.

    Both identifiers are ignored; each call returns a new ``InputSource``
    whose byte stream is an empty in-memory buffer.
    """
    empty_stream = StringIO("")
    resolved = InputSource()
    resolved.setByteStream(empty_stream)
    return resolved
def test_byte_stream(self):
    """prepare_input_source keeps a byte-stream-only InputSource as is."""
    original = InputSource(self.file)
    original.setByteStream(self.make_byte_stream())

    prepared = prepare_input_source(original)
    character_stream = prepared.getCharacterStream()
    byte_stream = prepared.getByteStream()

    self.assertIsNone(character_stream)
    self.checkContent(byte_stream, b'This is a byte stream.')
# regression test for SAX 2.0
class DataImporter(Digester):
    """Replay a bzip2-compressed XML replication packet against a database.

    The packet is streamed through the ``Digester`` rules registered in
    ``_add_rules``; every ``<event>`` element is turned into one INSERT,
    UPDATE or DELETE executed on a cursor of ``ictx['conn'].connection``.
    ``close()`` commits when ``load()`` completed and rolls back otherwise.
    """

    def __init__(self, ictx, file):
        """Prepare the input source, DB cursor and parsing rules.

        ictx -- import context; ``conn``, ``schema_seq`` and
            ``replication_seq`` are read from it.
        file -- open file object of the packet; its ``name`` is reopened
            through ``BZ2File`` for parsing.
        """
        Digester.__init__(self)
        self._ictx = ictx
        self._file = file
        self._input = InputSource(file.name)
        self._input.setByteStream(BZ2File(file.name, 'r'))
        self._conn = ictx['conn'].connection
        self._cursor = self._conn.cursor()
        self.success = self._closed = False
        self._add_rules()

    def _add_rules(self):
        """Register the element-path callbacks with the digester."""
        self.addOnBegin('packet', self._check_packet)
        self.addOnBeginAndEnd('packet/transaction/event', self._on_event,
                              self._on_event_end)
        self.addOnBody('packet/transaction/event/keys/column',
                       self._on_key_column)
        self.addOnBody('packet/transaction/event/values/column',
                       self._on_value_column)
        self.addOnFinish(self._on_finish)

    def _check_seq(self, attrs, key):
        """Raise when the packet's ``key`` attribute differs from the
        expected value stored under the same key in the import context."""
        actual = attrs.getValue(key)
        expected = self._ictx[key]
        if expected != int(actual):
            # Format the message for real (the values used to be passed as
            # extra Exception arguments) and report the matching expected
            # value for each sequence.
            raise Exception(
                '<packet> {0}: {1} not matched the expected seq number {2}'
                .format(key, actual, expected))

    def _check_packet(self, tag, attrs):
        """Verify schema and replication sequence numbers of ``<packet>``."""
        self._check_seq(attrs, 'schema_seq')
        self._check_seq(attrs, 'replication_seq')

    def _on_key_column(self, tag, attrs, val):
        """Record a key column on the event currently being built."""
        event = self.peek()
        event['keys'][attrs.getValue('name')] = val

    def _on_value_column(self, tag, attrs, val):
        """Record a value column; ``null="yes"`` stores None instead."""
        event = self.peek()
        isNull = attrs.getValue("null") if attrs.has_key('null') else None
        event['values'][attrs.getValue('name')] = \
            val if isNull != "yes" else None

    def _on_event(self, tag, attrs):
        """Start a new event and push it on the digester stack."""
        event = {
            'op': attrs.getValue('op'),
            'table': attrs.getValue('table'),
            'keys': OrderedDict(),    # column name -> column val
            'values': OrderedDict()   # column name -> column val
        }
        self.push(event)

    def _on_event_end(self, tag):
        """Turn the finished event into SQL and execute it.

        NOTE(review): table and column names from the packet are
        interpolated into the statement text; only the column values are
        passed as bound parameters. Packets must come from a trusted source.
        """
        event = self.pop()
        op = event['op']
        table = event['table']
        keys = event['keys']
        values = event['values']
        params = []
        if op == 'I':
            sql_columns = ', '.join(values.keys())
            sql_values = ', '.join(['%s'] * len(values))
            sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, sql_columns,
                                                       sql_values)
            # A real list, so that it can be extended on every Python.
            params = list(values.values())
        elif op == 'U':
            sql_values = ', '.join('%s=%%s' % i for i in values)
            sql = 'UPDATE %s SET %s' % (table, sql_values)
            params = list(values.values())
        elif op == 'D':
            sql = 'DELETE FROM %s' % table
        else:
            raise Exception('Invalid <event> op: %s' % op)

        if op == 'D' or op == 'U':
            # None keys are matched with "IS" instead of "=".
            sql += ' WHERE ' + ' AND '.join(
                '%s%s%%s' % (i, ' IS ' if keys[i] is None else '=')
                for i in keys.keys())
            params.extend(keys.values())

        self._cursor.execute(sql, params)

    def _on_finish(self):
        """Nothing to do at the end of the document."""
        pass

    def load(self):
        """Parse the packet; ``success`` is set only when parsing completes."""
        logger.warning('Saving dataset....')
        self.parse(self._input)
        self.success = True

    def recover(self):
        """
        This is a dirty hack to remove weird characters present in some
        replication files, using the tidy tool.

        The packet is decompressed to a temporary file, run through
        ``tidy -xml``, and loading is restarted from the fixed copy. Both
        temporary files are removed afterwards.
        """
        logger.warning('Trying to recover invalid XML...')
        originalXML = None
        fixedXML = None
        try:
            originalXML = tempfile.NamedTemporaryFile(
                suffix='.xml', delete=False)  # bunzipped tmp
            fixedXML = tempfile.NamedTemporaryFile(
                suffix='.xml', delete=False)  # fixed tmp
            fixedXML.close()

            # Fetch uncompressed file data to recover
            bzf = self._input.getByteStream()
            bzf.seek(0)
            shutil.copyfileobj(bzf, originalXML)
            originalXML.close()

            cmd = ['tidy', '-xml', '-o', fixedXML.name, originalXML.name]
            logger.warning('Running: %s', ' '.join(cmd))
            ret = subprocess.call(cmd)
            if ret:
                # A non-zero exit status is deliberately tolerated.
                pass

            # Ready to load: close the old state and start over.
            self.close()
            self._file = open(fixedXML.name, 'r')
            self._input = InputSource(fixedXML.name)
            self._input.setByteStream(self._file)
            self._cursor = self._conn.cursor()
            self.success = self._closed = False
            self.reset()
            self._add_rules()
            self.load()
        finally:
            for f in [originalXML, fixedXML]:
                if f and not f.closed:
                    f.close()
                if f and os.path.exists(f.name):
                    os.unlink(f.name)

    def close(self):
        """Commit or roll back, then release cursor and streams. Only the
        first call has any effect."""
        if self._closed:
            return
        try:
            if self.success:
                self._conn.commit()
                logger.warning('Done')
            else:
                logger.warning(
                    'Rolling back transaction. Seq number: {0}'.format(
                        self._ictx['replication_seq']))
                self._conn.rollback()
            self._cursor.close()
        finally:
            self._closed = True
            self._input.getByteStream().close()
            self._file.close()