def test_text_file(self):
    # A text file-like object handed in as the source must come back as
    # the character stream, with no byte stream attached.
    text_source = self.make_character_stream()
    prepared = prepare_input_source(text_source)
    self.assertIsNone(prepared.getByteStream())
    self.checkContent(prepared.getCharacterStream(),
                      "This is a character stream.")
def prepare_input_source(source):
    """Return an xml.sax.xmlreader.InputSource whose stream answers tell().

    Works like xml.sax.saxutils.prepare_input_source.  If calling
    ``tell()`` on the resulting stream raises AttributeError or IOError
    (for instance a URL that returns a non-seekable file), the stream is
    wrapped in a ReseekFile, installed as the byte stream, and the
    character stream is cleared.

    To turn the ReseekFile buffer off afterwards, do something like

        f = source.getCharacterStream()
        ...
        try:
            f.nobuffer()
        except AttributeError:
            pass

    or

        if isinstance(f, ReseekFile):
            f.nobuffer()
    """
    from xml.sax import saxutils

    prepared = saxutils.prepare_input_source(source)

    # Is this correct?  Don't know - don't have Unicode experience
    stream = prepared.getCharacterStream() or prepared.getByteStream()
    try:
        stream.tell()
    except (AttributeError, IOError):
        # The wrapped stream always ends up as the *byte* stream, even when
        # it was originally supplied as the character stream.
        prepared.setByteStream(ReseekFile.ReseekFile(stream))
        prepared.setCharacterStream(None)
    return prepared
def parse(self, source):
    """Parse an XML document from a URL or an InputSource.

    The source is normalised with saxutils.prepare_input_source and
    remembered on the reader before the parser state is reset; the
    actual chunked reading is delegated to IncrementalParser.parse.
    """
    self._source = source = saxutils.prepare_input_source(source)
    self.reset()
    # The locator is created after reset() so that it is built against
    # the freshly reset reader.
    self._cont_handler.setDocumentLocator(ExpatLocator(self))
    xmlreader.IncrementalParser.parse(self, source)
def test_binary_file(self):
    # A binary file-like object handed in as the source must come back as
    # the byte stream, with no character stream attached.
    binary_source = self.make_byte_stream()
    prepared = prepare_input_source(binary_source)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b"This is a byte stream.")
def external_entity_ref(self, context, base, sysid, pubid):
    """Add external entity reference to XML document.

    Resolves the entity through the entity handler, stores its content in
    the XML cache directory when it is not cached yet, and parses it with
    an external-entity sub-parser created from the current parser.

    Returns 1 when external general entities are disabled or the entity
    was parsed, and 0 when parsing the entity raised.
    """
    if not self._external_ges:
        return 1

    source = self._ent_handler.resolveEntity(pubid, sysid)
    source = saxutils.prepare_input_source(source,
                                           self._source.getSystemId() or "")

    # If an entry does not exist in the xml cache, create it.  The cache
    # key is derived from the public identifier, so entities declared
    # without one (pubid is None or empty) are simply not cached instead
    # of failing while the key is built.
    if pubid:
        filepath = os.path.join(XML_CACHE, base64.urlsafe_b64encode(pubid))
        if not os.path.isfile(filepath):
            # Read first and open the cache file afterwards: a failing
            # read must not leave behind an empty file that would later
            # be mistaken for a cached entity.
            contents = source.getByteStream().read()
            # The stream has been consumed; hand the parser a fresh one.
            source.setByteStream(StringIO(contents))
            with open(filepath, 'w') as f:
                f.write(contents)

    self._entity_stack.append((self._parser, self._source))
    self._parser = self._parser.ExternalEntityParserCreate(context)
    self._source = source

    try:
        xmlreader.IncrementalParser.parse(self, source)
    except:
        # Deliberately catches everything: the return value is the only
        # error signal available to the caller of this callback.
        return 0  # FIXME: save error info here?

    (self._parser, self._source) = self._entity_stack[-1]
    del self._entity_stack[-1]
    return 1
def parse(self, source):
    """Parse an XML document from a URL or an InputSource.

    The prepared InputSource is stored on ``self._source`` and parsing is
    delegated to the wrapped parser.  Any SAXException raised by the
    wrapped parser is re-raised as ``_exceptions.SAXException`` with an
    empty message and the original exception attached.
    """
    self._source = saxutils.prepare_input_source(source)
    try:
        # NOTE(review): the original, unprepared source is forwarded to the
        # wrapped parser, not self._source -- confirm this is intended.
        self._parser.parse(source)
    except SAXException as e:
        raise _exceptions.SAXException("", e)
def documentFromURI(uri, fragID=None, targetLocalName=None, idAttr=None):
    """Build a document from the resource identified by *uri*.

    A unicode URI is encoded to UTF-8 first; the URI is then turned into
    an input source and handed, together with the remaining arguments,
    to documentFromSource, whose result is returned.
    """
    if type(uri) is types.UnicodeType:
        uri = uri.encode('utf_8')
    # who closes the input?
    return documentFromSource(prepare_input_source(uri),
                              fragID, targetLocalName, idAttr)
def test_locator_sax(tester):
    """Parse CONTENT_PATH via an InputSource and check the locator."""
    tester.startTest("SAX InputSource")

    parser = CreateParser()
    locator_handler = LocatorTester(tester, CONTENT_PATH)
    parser.setContentHandler(locator_handler)

    input_source = prepare_input_source(CONTENT_PATH)
    parser.parse(input_source)

    verify_finished_locator(tester, parser)
    tester.testDone()
def test_character_stream(self):
    # An InputSource that already carries a character stream keeps using
    # it; no byte stream is attached.
    given = InputSource(self.file)
    given.setCharacterStream(self.make_character_stream())
    prepared = prepare_input_source(given)
    self.assertIsNone(prepared.getByteStream())
    self.checkContent(prepared.getCharacterStream(),
                      "This is a character stream.")
def test_system_id(self):
    # An InputSource with neither a character stream nor a byte stream
    # gets a byte stream opened from its system ID.
    given = InputSource(self.file)
    prepared = prepare_input_source(given)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b"This was read from a file.")
def test_xmlreader_sax(tester):
    """Parse CONTENT_PATH via an InputSource and compare the built DOM."""
    tester.startTest("SAX InputSource")

    parser = CreateParser()
    builder = DomBuilder()
    parser.setContentHandler(builder)

    input_source = prepare_input_source(CONTENT_PATH)
    parser.parse(input_source)

    tester.compare(XMLREADER_CONTENT, builder, func=compare_builder)
    tester.testDone()
def test_byte_stream(self):
    # An InputSource without a character stream but with a byte stream
    # keeps using that byte stream.
    given = InputSource(self.file)
    given.setByteStream(self.make_byte_stream())
    prepared = prepare_input_source(given)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b"This is a byte stream.")
def parse(self, source):
    """Feed *source* to the parser in chunks of ``self._bufsize``.

    The source is normalised with prepare_input_source, announced through
    prepareParser(), read chunk by chunk into feed(), and finished with
    close().

    The byte stream is preferred; the character stream is used only when
    the prepared source carries no byte stream.
    """
    source = prepare_input_source(source)
    self.prepareParser(source)

    stream = source.getByteStream()
    if stream is None:
        stream = source.getCharacterStream()

    # Test for truthiness rather than comparing with "": at end of input a
    # byte stream returns b"", which never equals "" on Python 3 and would
    # otherwise keep the loop running forever.
    chunk = stream.read(self._bufsize)
    while chunk:
        self.feed(chunk)
        chunk = stream.read(self._bufsize)
    self.close()
def _parse_to_queue(self, stream, queue):
    """Parse *stream* into *queue* and report how the parse ended.

    After parsing, exactly one marker tuple is put on the queue:
    ``('exception', sys.exc_info())`` if the parser raised, otherwise
    ``('sentinel', None)``.
    """
    rdf_parser = self.rdflib_parser()
    sink = _QueueGraph(queue)
    input_source = prepare_input_source(stream)
    try:
        rdf_parser.parse(input_source, sink,
                         *self.parser_args, **self.parser_kwargs)
    except:
        # Catch everything on purpose: the failure is forwarded through
        # the queue instead of being raised here.
        queue.put(('exception', sys.exc_info()))
    else:
        queue.put(('sentinel', None))  # Sentinel
def fromStream(self, stream, ownerDoc=None):
    """Parse *stream* and return the root node built by the handler.

    The handler state is initialised with *ownerDoc* before parsing.
    """
    self.handler.initState(ownerDoc=ownerDoc)
    self.parser.parse(saxutils.prepare_input_source(stream))
    return self.handler.getRootNode()
def parse(self, source):
    """Parse *source* with an xmlproc parser, using this driver as its sink.

    A validating parser is created when validation is enabled, a plain
    processor otherwise.  This driver is registered as application, error
    handler and DTD listener depending on which SAX handlers are set.
    While parsing, the xmlproc parser is available as ``self._parser``;
    it is cleared and the parsing flag reset when the method leaves,
    whether or not an exception was raised.
    """
    try:
        self.__parsing = 1

        # interpret source
        source = prepare_input_source(source)

        # create parser
        if self.__validate:
            parser = xmlval.XMLValidator()
        else:
            parser = xmlproc.XMLProcessor()

        # set handlers
        if self._cont_handler is not None or self._lex_handler is not None:
            # If only one of the two is set, give the other a default so
            # that both can be called unconditionally.
            if self._cont_handler is None:
                self._cont_handler = saxlib.ContentHandler()
            if self._lex_handler is None:
                self._lex_handler = saxlib.LexicalHandler()

            if self.__namespaces:
                ns_filter = NamespaceFilter(parser, self._cont_handler,
                                            self._lex_handler, self)
                parser.set_application(ns_filter)
            else:
                parser.set_application(self)

        if self._err_handler is not None:
            parser.set_error_handler(self)

        if self._decl_handler is not None or self._dtd_handler is not None:
            parser.set_dtd_listener(self)

        # FIXME: set other handlers

        bufsize = 16384
        self._parser = parser  # make it available for callbacks
        #parser.parse_resource(source.getSystemId()) # FIXME: rest!
        parser.set_sysid(source.getSystemId())

        stream = source.getByteStream()
        try:
            parser.read_from(stream, bufsize)
        finally:
            # Close the input even when reading fails, so the stream is
            # not leaked on a parse error.
            stream.close()
        parser.flush()
        parser.parseEnd()
    finally:
        self._parser = None
        self.__parsing = 0
def produce_items (input, produce):
    """Parse *input* with a namespace-aware SAX parser feeding an OnixHandler.

    When the URL_CACHE_DIR environment variable is set, a
    CachingEntityResolver using that directory is installed; either way a
    line saying which mode is in effect is written to stderr.
    """
    # NOTE(review): `produce` is not used here; the handler is given
    # `process_item` instead -- confirm which callback is intended.
    source = prepare_input_source (input)

    parser = xml.sax.make_parser ()
    parser.setFeature (xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler (OnixHandler (parser, process_item))

    url_cache_dir = os.getenv ("URL_CACHE_DIR")
    if not url_cache_dir:
        sys.stderr.write ("no url_cache_dir; XML resources will always be loaded from network\n")
    else:
        sys.stderr.write ("using url cache in %s\n" % url_cache_dir)
        resolver = CachingEntityResolver (parser, url_cache_dir)
        parser.setEntityResolver (resolver)

    parser.setErrorHandler (TestErrorHandler ())
    parser.parse (source)
def parse(self, source):
    """Parse an XML document from a URL or an InputSource.

    If anything goes wrong while resetting or parsing, the source is
    closed before the exception is re-raised.
    """
    self._source = source = saxutils.prepare_input_source(source)
    try:
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)
    except:
        # bpo-30264: Close the source on error to not leak resources:
        # xml.sax.parse() doesn't give access to the underlying parser
        # to the caller
        self._close_source()
        raise
def external_entity_ref(self, context, base, sysid, pubid):
    """Parse an external entity with a sub-parser of the current parser.

    Returns 1 when external general entities are disabled or the entity
    was parsed, and 0 when parsing the entity raised.
    """
    if not self._external_ges:
        return 1

    resolved = self._ent_handler.resolveEntity(pubid, sysid)
    entity_source = saxutils.prepare_input_source(
        resolved, self._source.getSystemId() or '')

    # Remember the enclosing parser and source so they can be restored
    # once the entity has been read.
    self._entity_stack.append((self._parser, self._source))
    self._parser = self._parser.ExternalEntityParserCreate(context)
    self._source = entity_source

    try:
        xmlreader.IncrementalParser.parse(self, entity_source)
    except:
        return 0

    self._parser, self._source = self._entity_stack.pop()
    return 1
def parse(self, source):
    """Parse an XML document from a URL or an InputSource.

    When the method leaves, any Expat parser still attached has its error
    line and column copied onto the reader before the reference to it is
    dropped.
    """
    self._source = source = saxutils.prepare_input_source(source)
    self.reset()
    self._cont_handler.setDocumentLocator(ExpatLocator(self))
    try:
        xmlreader.IncrementalParser.parse(self, source)
    finally:
        # Drop reference to Expat parser, but read potential
        # error state before that. Also, if close has completed,
        # we don't have a parser anymore, anyway.
        if self._parser:
            self._ColumnNumber = self._parser.ErrorColumnNumber
            self._LineNumber = self._parser.ErrorLineNumber
        self._parser = None
def prepare_input_source(self, source, publicID=None):
    """Return an InputSource for *source* with a public ID always set.

    An InputSource is used as is; an object with a ``read`` attribute
    (other than a Namespace) goes through the module-level
    prepare_input_source; anything else is absolutized and wrapped in a
    URLInputSource, its location serving as the default public ID.  If no
    public ID is set in the end, the empty string is used.
    """
    if isinstance(source, InputSource):
        input_source = source
    elif hasattr(source, "read") and not isinstance(source, Namespace):
        # we need to make sure it's not an instance of Namespace since
        # Namespace instances have a read attr
        input_source = prepare_input_source(source)
    else:
        location = self.absolutize(source)
        input_source = URLInputSource(location)
        publicID = publicID or location

    if publicID:
        input_source.setPublicId(publicID)

    if input_source.getPublicId() is None:
        #_logger.warning("no publicID set for source. Using '' for publicID.")
        input_source.setPublicId("")
    return input_source
def filter_svg (input, output, mode):
    """filter_svg(input, output, mode)

    Read the SVG file named by input, run it through an SVGFilter and
    write the result to output with an XMLGenerator.

    mode selects the filter objects to apply: each of 'hotspots',
    'shadows', 'slices' and 'invert' that is contained in mode adds the
    corresponding mode object, in that order.  ValueError is raised when
    none of them is present.
    """
    mode_objs = []
    if 'hotspots' in mode:
        mode_objs.append (mode_hotspots)
    if 'shadows' in mode:
        mode_objs.append (mode_shadows)
    if 'slices' in mode:
        mode_objs.append (mode_slices)
    if 'invert' in mode:
        mode_objs.append (mode_invert)
    if not mode_objs:
        raise ValueError()

    output_gen = saxutils.XMLGenerator(output)
    parser = make_parser()
    svg_filter = SVGFilter(parser, output_gen, mode_objs)
    svg_filter.setFeature(handler.feature_namespaces, False)
    svg_filter.setErrorHandler(handler.ErrorHandler())

    # This little I/O dance is here to ensure that SAX parser does not stash away
    # an open file descriptor for the input file, which would prevent us from unlinking it later
    with open (input, 'rb') as inp:
        contents = inp.read ()
    source_object = saxutils.prepare_input_source (io.BytesIO (contents))
    svg_filter.parse(source_object)

    del svg_filter
    del parser
    del output_gen
def test_path_objects(self):
    # A path-like object is treated as a system ID: the file it names is
    # opened and exposed as the byte stream.
    path_source = FakePath(self.file)
    prepared = prepare_input_source(path_source)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b"This was read from a file.")
def test_string(self):
    # A plain string is treated as a system ID: the file it names is
    # opened and exposed as the byte stream.
    prepared = prepare_input_source(self.file)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b"This was read from a file.")
def test_text_file(self):
    # The character stream made by the fixture must be used as the
    # character stream, with no byte stream attached.
    text_source = self.make_character_stream()
    prepared = prepare_input_source(text_source)
    self.assertIsNone(prepared.getByteStream())
    self.checkContent(prepared.getCharacterStream(),
                      'This is a character stream.')
def iterate(self, source, cont_handler = None):
    """parse using the URL or file handle

    The character stream of the prepared source is used when it is
    truthy, the byte stream otherwise; the result of iterateFile() is
    returned.
    """
    prepared = saxutils.prepare_input_source(source)
    handle = prepared.getCharacterStream()
    if not handle:
        handle = prepared.getByteStream()
    return self.iterateFile(handle, cont_handler)
def test_byte_stream(self):
    # An InputSource that was given a byte stream keeps using it; no
    # character stream is attached.
    given = InputSource(self.file)
    given.setByteStream(self.make_byte_stream())
    prepared = prepare_input_source(given)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b'This is a byte stream.')
def test_system_id(self):
    # An InputSource built from a system ID only gets a byte stream with
    # the file's content and no character stream.
    given = InputSource(self.file)
    prepared = prepare_input_source(given)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b'This was read from a file.')
def parse(self, source):
    """Prepare *source*, reset the reader and parse it incrementally."""
    self._source = source = saxutils.prepare_input_source(source)
    self.reset()
    # Hand the content handler a locator bound to this reader before any
    # data is fed.
    self._cont_handler.setDocumentLocator(ExpatLocator(self))
    xmlreader.IncrementalParser.parse(self, source)
return str.replace('\n','\\n').replace('\r','\\r') else: return str if __name__ == '__main__': import sys fn = sys.argv.pop() pfs = SyncPullFromSAX evc = None ns = False while len(sys.argv)>1: flag = sys.argv.pop() if flag=='-a': pfs=ASyncPullFromSAX elif flag=='-s': evc = SummaryEvent elif flag=='-e': evc=TupleEvent elif flag=='-n': ns=True else: raise Exception, "Usage: python PullFromSAX.py [-a] [-n] [-e|-s] URI" p = pfs(prepare_input_source(fn),evc, {feature_namespaces:ns}) while True: e = p.getEvent() print e if e is None: break
def iterate(self, source, cont_handler=None):
    """Prepare *source* and iterate over its stream with iterateFile().

    The character stream is used when it is truthy, the byte stream
    otherwise.
    """
    prepared = saxutils.prepare_input_source(source)
    handle = prepared.getCharacterStream()
    if not handle:
        handle = prepared.getByteStream()
    return self.iterateFile(handle, cont_handler)
def parse(self, source, no_content=False):
    """
    Parse an XML document from a URL or an InputSource and yield the
    objects the content handler collects in its ``_objs`` list.

    @param      source          a file or a stream
    @param      no_content      avoid keeping the content into memory
                                (not used by the body of this method)
    @return                     generator over the collected objects

    The input is read in chunks of ``self._bufsize``; after each chunk
    the handler's ``_objs`` list is drained and its items are yielded.
    One extra, empty chunk is passed to expat with ``isFinal`` set so that
    it checks the document as a whole.
    """
    source0 = source
    source = saxutils.prepare_input_source(source)

    self._source = source
    self.reset()
    self._cont_handler.setDocumentLocator(
        xml.sax.expatreader.ExpatLocator(self))

    # xmlreader.IncrementalParser.parse(self, source)
    self.prepareParser(source)

    stream = source.getCharacterStream()
    if stream is None:
        stream = source.getByteStream()
    if stream is None:
        # getattr: the original source is not guaranteed to have a name,
        # and building the message must not hide the real error.
        raise FileNotFoundError(
            "file is None, it should not, source={0}\n{1}".format(
                source0, getattr(source0, "name", None)))

    buffer = stream.read(self._bufsize)
    isFinal = 0
    # Truthiness instead of `buffer != ""`: a byte stream returns b"" at
    # end of input, which never equals "" on Python 3; the loop would go
    # on and call Parse() again after the final chunk.
    while buffer or isFinal == 0:
        # self.feed(buffer)
        data = buffer
        isFinal = 1 if len(buffer) == 0 else 0
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
            for o in self._cont_handler._objs:
                yield o
            del self._cont_handler._objs[:]
        except expat.error as e:
            exc = xml.sax.SAXParseException(
                expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

        buffer = stream.read(self._bufsize)

    # self.close()
    self._cont_handler.endDocument()
    self._parsing = 0
    # break cycle created by expat handlers pointing to our methods
    self._parser = None

    for o in self._cont_handler._objs:
        yield o
    del self._cont_handler._objs[:]
def test_binary_file(self):
    # The byte stream made by the fixture must be used as the byte
    # stream, with no character stream attached.
    binary_source = self.make_byte_stream()
    prepared = prepare_input_source(binary_source)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b'This is a byte stream.')
def test_string(self):
    # Passing the fixture's file name yields a byte stream with the
    # file's content and no character stream.
    prepared = prepare_input_source(self.file)
    self.assertIsNone(prepared.getCharacterStream())
    self.checkContent(prepared.getByteStream(),
                      b'This was read from a file.')
"""
def parse(self, source):
    """parse using the URL or file handle

    The character stream of the prepared source is passed to parseFile()
    when it is truthy, the byte stream otherwise.
    """
    prepared = saxutils.prepare_input_source(source)
    handle = prepared.getCharacterStream()
    if not handle:
        handle = prepared.getByteStream()
    self.parseFile(handle)
def parse(self, source):
    """Parse *source* with a libxml2 text reader and emit SAX events.

    A source that is an instance of StringTypes is opened by name with
    libxml2.newTextReaderFilename; anything else is normalised with
    saxutils.prepare_input_source and read from its byte stream.

    The reader is then advanced node by node and every node type is
    translated into calls on the content handler (and, where one is set,
    the lexical handler).  endDocument() is reported only when the reader
    reached the end of input (Read() returned 0).

    ``self.__parsing`` is 1 for the duration of the call and is reset to
    0 when the method leaves, also on error.
    """
    self.__parsing = 1
    try:
        # prepare source and create reader
        if isinstance(source, StringTypes):
            reader = libxml2.newTextReaderFilename(source)
        else:
            source = saxutils.prepare_input_source(source)
            input = libxml2.inputBuffer(source.getByteStream())
            reader = input.newTextReader(source.getSystemId())
        reader.SetErrorHandler(self._errorHandler, None)
        # configure reader
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
        else:
            attributesImpl = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping)
        prefixes = []
        # start loop
        self._cont_handler.startDocument()
        while 1:
            r = reader.Read()
            # check for errors
            # (r == 1: a node was read; r == 0: end of input; any other
            # value is treated as a fatal read failure)
            if r == 1:
                if not self.__errors is None:
                    self._reportErrors(0)
            elif r == 0:
                if not self.__errors is None:
                    self._reportErrors(0)
                break # end of parse
            else:
                if not self.__errors is None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(\
                        SAXException("Read failed (no details available)"))
                break # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    eltName = (_d(reader.NamespaceUri()),\
                               _d(reader.LocalName()))
                    eltQName = _d(reader.Name())
                    # fresh dicts are plugged into the reused attribute
                    # object for every element
                    attributesNSImpl._attrs = attrs = {}
                    attributesNSImpl._qnames = qnames = {}
                    newPrefixes = []
                    while reader.MoveToNextAttribute():
                        qname = _d(reader.Name())
                        value = _d(reader.Value())
                        # namespace declarations: "xmlns" declares the
                        # default namespace (prefix None), "xmlns:p" the
                        # prefix after the colon
                        if qname.startswith("xmlns"):
                            if len(qname) > 5:
                                newPrefix = qname[6:]
                            else:
                                newPrefix = None
                            newPrefixes.append(newPrefix)
                            self._cont_handler.startPrefixMapping(\
                                newPrefix,value)
                            if not self.__nspfx:
                                continue # don't report xmlns attribute
                        attName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        qnames[attName] = qname
                        attrs[attName] = value
                    reader.MoveToElement()
                    self._cont_handler.startElementNS( \
                        eltName,eltQName,attributesNSImpl)
                    if reader.IsEmptyElement():
                        # an empty element is closed right here, so its
                        # prefix mappings are ended immediately as well
                        self._cont_handler.endElementNS(eltName, eltQName)
                        for newPrefix in newPrefixes:
                            self._cont_handler.endPrefixMapping(newPrefix)
                    else:
                        # remembered until the matching EndElement node
                        prefixes.append(newPrefixes)
                else:
                    eltName = _d(reader.Name())
                    attributesImpl._attrs = attrs = {}
                    while reader.MoveToNextAttribute():
                        attName = _d(reader.Name())
                        attrs[attName] = _d(reader.Value())
                    reader.MoveToElement()
                    self._cont_handler.startElement( \
                        eltName,attributesImpl)
                    if reader.IsEmptyElement():
                        self._cont_handler.endElement(eltName)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS( \
                         (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                         _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text
            elif nodeType == 3:
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # SignificantWhitespace
            elif nodeType == 14:
                self._cont_handler.characters(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if not self.__lex_handler is None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if not self.__lex_handler is None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                # NOTE(review): startEntity/endEntity are called on self
                # rather than on the lexical handler -- confirm that this
                # class defines them.
                if not self.__lex_handler is None:
                    self.startEntity(_d(reader.Name()))
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if not self.__lex_handler is None:
                    self.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction( \
                    _d(reader.Name()),_d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if not self.__lex_handler is None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                #if not self.__lex_handler is None:
                #    self.__lex_handler.startDTD()
                pass # TODO (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass # TODO
            # Entity
            elif nodeType == 6:
                pass # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass # TODO
            # Attribute (never in this loop)
            #elif nodeType == 2:
            #    pass
            # Document (not exposed)
            #elif nodeType == 9:
            #    pass
            # DocumentFragment (never returned by XmlReader)
            #elif nodeType == 11:
            #    pass
            # None
            #elif nodeType == 0:
            #    pass
            # -
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()
        # NOTE(review): Close() is only reached when the loop ends without
        # an exception; the finally clause below does not close the reader.
        reader.Close()
    finally:
        self.__parsing = 0
def parse(self, source):
    """Parse *source* with a libxml2 text reader and emit SAX events.

    A source whose type is one of StringTypes is opened by name with
    libxml2.newTextReaderFilename; anything else is normalised with
    saxutils.prepare_input_source and read from its byte stream.  The
    reader always loads the DTD, applies default attributes and
    substitutes entities; validation follows ``self.__validate``.

    The reader is then advanced node by node and every node type is
    translated into calls on the content handler (and, where one is set,
    the lexical handler).  endDocument() is reported only when the reader
    reached the end of input (Read() returned 0).

    ``self.__parsing`` is 1 for the duration of the call and is reset to
    0 when the method leaves, also on error.
    """
    self.__parsing = 1
    try:
        # prepare source and create reader
        if type(source) in StringTypes:
            reader = libxml2.newTextReaderFilename(source)
        else:
            source = saxutils.prepare_input_source(source)
            input = libxml2.inputBuffer(source.getByteStream())
            reader = input.newTextReader(source.getSystemId())
        reader.SetErrorHandler(self._errorHandler,None)
        # configure reader
        reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
        reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
        reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
        reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributesNSImpl = xmlreader.AttributesNSImpl({},{})
        else:
            attributesImpl = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping)
        prefixes = []
        # start loop
        self._cont_handler.startDocument()
        while 1:
            r = reader.Read()
            # check for errors
            # (r == 1: a node was read; r == 0: end of input; any other
            # value is treated as a fatal read failure)
            if r == 1:
                if not self.__errors is None:
                    self._reportErrors(0)
            elif r == 0:
                if not self.__errors is None:
                    self._reportErrors(0)
                break # end of parse
            else:
                if not self.__errors is None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(\
                        SAXException("Read failed (no details available)"))
                break # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    eltName = (_d(reader.NamespaceUri()),\
                               _d(reader.LocalName()))
                    eltQName = _d(reader.Name())
                    # fresh dicts are plugged into the reused attribute
                    # object for every element
                    attributesNSImpl._attrs = attrs = {}
                    attributesNSImpl._qnames = qnames = {}
                    newPrefixes = []
                    while reader.MoveToNextAttribute():
                        qname = _d(reader.Name())
                        value = _d(reader.Value())
                        # namespace declarations: "xmlns" declares the
                        # default namespace (prefix None), "xmlns:p" the
                        # prefix after the colon
                        if qname.startswith("xmlns"):
                            if len(qname) > 5:
                                newPrefix = qname[6:]
                            else:
                                newPrefix = None
                            newPrefixes.append(newPrefix)
                            self._cont_handler.startPrefixMapping(\
                                newPrefix,value)
                            if not self.__nspfx:
                                continue # don't report xmlns attribute
                        attName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        qnames[attName] = qname
                        attrs[attName] = value
                    reader.MoveToElement()
                    self._cont_handler.startElementNS( \
                        eltName,eltQName,attributesNSImpl)
                    if reader.IsEmptyElement():
                        # an empty element is closed right here, so its
                        # prefix mappings are ended immediately as well
                        self._cont_handler.endElementNS(eltName,eltQName)
                        for newPrefix in newPrefixes:
                            self._cont_handler.endPrefixMapping(newPrefix)
                    else:
                        # remembered until the matching EndElement node
                        prefixes.append(newPrefixes)
                else:
                    eltName = _d(reader.Name())
                    attributesImpl._attrs = attrs = {}
                    while reader.MoveToNextAttribute():
                        attName = _d(reader.Name())
                        attrs[attName] = _d(reader.Value())
                    reader.MoveToElement()
                    self._cont_handler.startElement( \
                        eltName,attributesImpl)
                    if reader.IsEmptyElement():
                        self._cont_handler.endElement(eltName)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS( \
                         (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                         _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text
            elif nodeType == 3:
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # SignificantWhitespace
            elif nodeType == 14:
                self._cont_handler.characters(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if not self.__lex_handler is None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if not self.__lex_handler is None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                # NOTE(review): startEntity/endEntity are called on self
                # rather than on the lexical handler -- confirm that this
                # class defines them.
                if not self.__lex_handler is None:
                    self.startEntity(_d(reader.Name()))
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if not self.__lex_handler is None:
                    self.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction( \
                    _d(reader.Name()),_d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if not self.__lex_handler is None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                #if not self.__lex_handler is None:
                #    self.__lex_handler.startDTD()
                pass # TODO (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass # TODO
            # Entity
            elif nodeType == 6:
                pass # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass # TODO
            # Attribute (never in this loop)
            #elif nodeType == 2:
            #    pass
            # Document (not exposed)
            #elif nodeType == 9:
            #    pass
            # DocumentFragment (never returned by XmlReader)
            #elif nodeType == 11:
            #    pass
            # None
            #elif nodeType == 0:
            #    pass
            # -
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()
        # NOTE(review): Close() is only reached when the loop ends without
        # an exception; the finally clause below does not close the reader.
        reader.Close()
    finally:
        self.__parsing = 0