def parse(filename): # parse file and return document root try: doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(filename)) return doc.documentElement except Ft.FtException, e: raise Error(_("File '%s' has invalid XML: %s") % (filename, str(e)))
def readxml(self, uri, tmpDir='/tmp', sha1sum=False, compress=None, sign=None): uri = File.make_uri(uri) localpath = File.download(uri, tmpDir,sha1sum=sha1sum,compress=compress,sign=sign) try: self.doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(localpath)) return self.doc.documentElement except Ft.FtException, e: raise Error(_("File '%s' has invalid XML: %s") % (localpath, str(e)) )
def readxml(self, uri, tmpDir="/tmp"): uri = File.make_uri(uri) localpath = File.download(uri, tmpDir) try: self.doc = NoExtDtdReader.parseUri(Ft.Lib.Uri.OsPathToUri(localpath)) return self.doc.documentElement except Ft.FtException, e: raise Error(_("File '%s' has invalid XML: %s") % (localpath, str(e)))
def readxml(self, uri, tmpDir='/tmp', sha1sum=False, compress=None, sign=None): uri = File.make_uri(uri) localpath = File.download(uri, tmpDir, sha1sum=sha1sum, compress=compress, sign=sign) try: self.doc = NoExtDtdReader.parseUri( Ft.Lib.Uri.OsPathToUri(localpath)) return self.doc.documentElement except Ft.FtException, e: raise Error( _("File '%s' has invalid XML: %s") % (localpath, str(e)))
def cdom(name):
    """Parse the XML file at OS path ``name`` with ``NoExtDtdReader``.

    The path is converted to a URI with ``Ft.Lib.Uri.OsPathToUri`` before
    parsing. As visible here the parsed document is only bound to a local
    name and nothing is returned.

    NOTE(review): the body may be truncated in this view -- confirm whether
    the document was meant to be returned or used further.
    """
    source_uri = Ft.Lib.Uri.OsPathToUri(name)
    doc = NoExtDtdReader.parseUri(source_uri)
def load(self, webget): """ >>> g = Glean(u'http://www.w3.org/2003/g/po-doc.xml',Graph()) >>> g.load(WebMemo()) >>> g.dom.documentElement.localName u'purchaseOrder' """ if self.dom: return lastUri, (content, self.headers) = webget(self.url, (RDF_MT, XML_MT, XML_text_MT, XHTML_MT, XHTML_text_MT)) if lastUri != self.url: ##We want the retrieval URL even in the face of a redirect self.url = lastUri parsedAsRDF = False #Until we peak in for a base, use the one given or the retrieval URL initialBase = self.baseURI and self.baseURI or self.url #peek in response headers to determine content-type #NOTE we need to attempt to parse the source as RDF/XML regardless (by the base case rule): #If an information resource IR is represented by a conforming RDF/XML document[RDFX], then #the RDF graph represented by that document is a GRDDL result of IR. if self.headers['content-type'].startswith(RDF_MT): try: self.graph.parse(StringIO(content),publicID=initialBase) parsedAsRDF = True except: pass self.dom = None try: if self.DEBUG: print >>sys.stderr, "Parsing XML content WRT baseURI of %s"%(initialBase) self.dom = XMLParser.parseString(content, initialBase, processIncludes=self.useXInclude) #WG consensus is to follow XML Base. 
This bottoms out in using #the base URI of the root node once the parser has been given #the base that HTTP indicates via RFC 3986 #Note: this interpretation is based off the assumption that the #encapsulating context for a GRDDL result is the root node of the #source document #See: http://4suite.org/docs/CoreManual.xml#base_URIs if self.baseURI is None: self.baseURI = self.dom.xpath(u'/*')[0].baseURI if self.DEBUG: print >>sys.stderr,\ "Adopting the baseURI of the root node: %s"%(self.baseURI) #Note, if an XHTML Base is embedded, it needs to be respected also for htmlBase in self.dom.xpath(u'/xhtml:html/xhtml:head/xhtml:base/@href',{u'xhtml': XHTML_NS}): if self.DEBUG: print >>sys.stderr, "Found an XHTML Base: %s"%(htmlBase.value) self.baseURI = htmlBase.value #WG consensus is that we should peek into XML content for rdf:RDF #at the root, if we find it we need to attempt a parse as RDF/XML if not parsedAsRDF and self.dom.xpath(u'/rdf:RDF',{u'rdf':str(RDF.RDFNS)}): try: self.graph.parse(StringIO(content), publicID=self.baseURI) except: pass except Exception, e: #@@ narrow exception if self.DEBUG: print >>sys.stderr, "Unable to parse ", self.baseURI, repr(e) #Unable to glean. Fail gracefully.. self.dom = None
def transform(self, transformURLs, webget):
    """
    Apply each transform in ``transformURLs`` to the pre-parsed DOM of the
    GRDDL source and merge the result into ``self.graph``.

    ``transformURLs`` is a whitespace-separated list of transform URLs.
    A URL already present in ``self.appliedTransforms`` is skipped, so a
    transformation is not applied twice; every URL that is applied is
    recorded there.

    ``webget`` is called as ``webget(url, acceptedMediaTypes)`` and must
    return ``(lastUri, (content, info))``; only ``content`` is used.

    The output method declared by the stylesheet decides how the result is
    read: ``'xml'`` is parsed as RDF/XML, ``'text'`` is attempted as
    Notation 3 (a failure is reported on stderr only when ``self.DEBUG`` is
    set).

    Raises:
        Exception: for any other output method ("unsupported output type").
    """
    import warnings

    for xformURL in transformURLs.split():
        if xformURL in self.appliedTransforms:
            # Honour the documented contract: never re-apply a transform.
            if self.DEBUG:
                print >>sys.stderr, \
                    "skipping already applied transformation %s" % (xformURL)
            continue
        if self.DEBUG:
            print >>sys.stderr, "applying transformation %s" % (xformURL)
        self.appliedTransforms.append(xformURL)

        # The transform url is resolved against the source URL (to
        # accommodate relative urls)
        stylesheetLoc = Absolutize(xformURL, self.baseURI)
        lastUri, (content, info) = webget(stylesheetLoc, (XSLT_MT,))
        transform = InputSource.DefaultFactory.fromString(content,
                                                          stylesheetLoc)
        processor = Processor.Processor()
        processor.appendStylesheet(transform)
        # see: http://www.w3.org/TR/grddl/#stylepi
        # Note, for the XSLT transform, the base URI of the source document
        # is passed in, instead of the base URI of the root node
        result = processor.runNode(self.dom, self.url, ignorePis=1)

        # get output method / media-type
        # <!-- Category: top-level-element -->
        # <xsl:output
        #   method = "xml" | "html" | "text" | qname-but-not-ncname
        #   version = nmtoken
        #   encoding = string
        #   omit-xml-declaration = "yes" | "no"
        #   standalone = "yes" | "no"
        #   doctype-public = string
        #   doctype-system = string
        #   cdata-section-elements = qnames
        #   indent = "yes" | "no"
        #   media-type = string />
        # How to accommodate @media-type?
        method = processor.outputParams.method[-1]
        currLen = len(self.graph)

        if method == 'xml':
            self.graph.parse(StringIO(result), publicID=self.baseURI)
            if XMLParser.parseString(result,
                                     self.baseURI).xpath('//@xml:base'):
                warnings.warn(
                    "RDFLib 2.4.0 may not be resolving relative xml:base values")
            self._replaceEmptySubject()
            if self.DEBUG:
                print >>sys.stderr, \
                    "Parsed %s triples (using baseURI: %s) as RDF/XML" % (
                        max(0, len(self.graph) - currLen), self.baseURI)
        elif method == 'text':
            # Attempt a Notation 3 parse (covers NTriples, and Turtle)
            try:
                self.graph.parse(StringIO(result),
                                 format='n3',
                                 publicID=self.baseURI)
                self._replaceEmptySubject()
                if self.DEBUG:
                    print >>sys.stderr, \
                        "Parsed %s triples (using baseURI: %s) as Notation 3" % (
                            max(0, len(self.graph) - currLen), self.baseURI)
            except Exception:
                # Deliberately best effort; Exception (rather than a bare
                # except) lets KeyboardInterrupt and SystemExit propagate.
                if self.DEBUG:
                    print >>sys.stderr, "Unknown text-based RDF serialization"
        else:
            # HTML result - recursive GRDDL mechanism?
            raise Exception("unsupported output type")

def _replaceEmptySubject(self):
    """Move triples whose subject is ``URIRef('')`` onto ``self.baseURI``.

    @@This is mostly a workaround for RDFLib 2.4, which will force an empty
    URI string as the subject if xml:base = ''.
    """
    emptyRef = URIRef('')
    replace = [(URIRef(self.baseURI), p, o, self.graph)
               for s, p, o in self.graph.triples((emptyRef, None, None))]
    if replace:
        if self.DEBUG:
            print >>sys.stderr, \
                "Replacing empty string URI ref with %s" % (self.baseURI)
        self.graph.remove((emptyRef, None, None))
        self.graph.addN(replace)