def is_cache_full(self):
    """
    Report whether the cache file holds the expected number of documents.

    Returns False when the cache file does not exist. Otherwise the cache is
    parsed and the nodes matching ParliamentInfoParams()._xpath_content_types()
    are counted: exactly 1 is required when self.bicameral is false, exactly
    2 when it is true.
    """
    if not os.path.exists(self.cache_file):
        return False
    cache_doc = SAXReader().read(File(self.cache_file))
    cached_nodes = cache_doc.selectNodes(
        ParliamentInfoParams()._xpath_content_types()
    )
    # one node for a unicameral setup, two for a bicameral one
    required = 2 if self.bicameral else 1
    return len(cached_nodes) == required
def append_to_cache(self, input_file):
    """
    Parse input_file and append its root element to the cache document.
    """
    parsed = SAXReader().read(File(input_file))
    self.append_element_into_cache_document(parsed.getRootElement())
def getPayloadContent(self): format = self.__metadata.getField("dc_format") slash = self.__oid.rfind("/") pid = self.__oid[slash+1:] print " *** payload content, format: %s, pid: %s *** " % (format, pid) contentStr = "" if format.startswith("text"): contentStr = "<pre>" payload = self.__storage.getPayload(self.__oid, pid) str = StringWriter() IOUtils.copy(payload.getInputStream(), str) contentStr += str.toString() contentStr += "</pre>" elif format.find("vnd.ms-")>-1 or format.find("vnd.oasis.opendocument.")>-1: #get the html version if exist.... pid = pid[:pid.find(".")] + ".htm" payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader() document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//div[@class='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") return contentStr
def getPayloadContent(self): mimeType = self.__mimeType print " * single.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % ( contextPath, portalId, self.__oid, ) else: pid = self.__oid[self.__oid.rfind("/") + 1 :] payload = self.__storage.getPayload(self.__oid, pid) print " * single.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif ( mimeType == "application/pdf" or mimeType.find("vnd.ms") > -1 or mimeType.find("vnd.oasis.opendocument.") > -1 ): # get the html version if exist... pid = os.path.splitext(self.__pid)[0] + ".htm" print " * single.py: pid=%s" % pid # contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \ # (contextPath, portalId, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//*[local-name()='body']") # linkNodes = slideNode.selectNodes("//img") # contentStr = slideNode.asXML(); # encode character entities correctly slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = '<p class="error">No preview available</p>' elif mimeType.startswith("image/"): src = "%s/%s" % (self.__oid, self.__pid) contentStr = ( '<a class="image" href="%(src)s" style="max-width:98%%">' '<img src="%(src)s" style="max-width:100%%" /></a>' % {"src": self.__pid} ) return contentStr
def getPayloadContent(self): mimeType = self.__mimeType print " * detail.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType == "application/octet-stream": dcFormat = self.__json.get("response/docs/dc_format") if dcFormat is not None: dcFormat = dcFormat[1:-1] print dcFormat, mimeType if dcFormat != mimeType: return "<div><em>(File not found)</em></div>" else: return "<div><em>(Binary file)</em></div>" elif mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % \ (contextPath, portalId, self.__oid) else: pid = self.__oid[self.__oid.rfind("/")+1:] payload = self.__storage.getPayload(self.__oid, pid) #print " * detail.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find("vnd.ms")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1: # get the html version if exist... 
pid = os.path.splitext(self.__pid)[0] + ".htm" print " * detail.py: pid=%s" % pid #contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \ # (contextPath, portalId, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//*[local-name()='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = "<p class=\"error\">No preview available</p>" return contentStr
def loadConfig(): """ Load the configuration file return Document """ document = None reader = SAXReader() try: document = reader.read(CONF_FILE) except DocumentException, detail: print "Error: %s" % detail.getMessage()
def test(docname):
    """
    Exercise the functions in this module against the XML file docname.

    Shows the parsed tree, then the tree after modify_tree and again after
    modify_tree_xpath; both modifications are given the same time stamp.
    """
    doc = SAXReader().read(docname)
    show_tree(doc, 'Original tree:')

    stamp = time.ctime()
    modify_tree(doc, 'person', 'date', stamp)
    show_tree(doc, 'After walk and modify:')

    modify_tree_xpath(doc, '//people/person/name', 'date', stamp)
    show_tree(doc, 'After XPath modify:')
def append_to_cache(self, input_file, search_node):
    """
    Append input_file's root element to the cache unless already cached.

    search_node is an XPath evaluated against the parsed input; the text of
    the node it selects is the id checked with does_item_exist_in_cache().
    When the id is already present only an info message is logged.
    """
    incoming = SAXReader().read(File(input_file))
    item_id = incoming.selectSingleNode(search_node).getText()
    if self.does_item_exist_in_cache(item_id):
        LOG.info(item_id + " already exists in cache")
        return
    self.append_element_into_cache_document(incoming.getRootElement())
def append_to_cache(self, input_file):
    """
    Append input_file's root element to the cache unless that parliament
    is already cached.

    The parliament id is the text of the 'parliament_id' field under the
    'parliament' contenttype root. When it is already present only an info
    message is logged.
    """
    incoming = SAXReader().read(File(input_file))
    id_node = incoming.selectSingleNode(
        "/contenttype[@name='parliament']/field[@name='parliament_id']"
    )
    parliament_id = id_node.getText()
    if self.does_parliament_exists_in_cache(parliament_id):
        LOG.info(parliament_id + " already exists in cache")
        return
    self.append_element_into_cache_document(incoming.getRootElement())
def getPayloadContent(self): mimeType = self.__mimeType print " * single.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/download/%s"></iframe>' % \ (portalPath, self.__oid) else: pid = self.__oid[self.__oid.rfind("/") + 1:] payload = self.__storage.getPayload(self.__oid, pid) print " * single.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find( "vnd.ms") > -1 or mimeType.find( "vnd.oasis.opendocument.") > -1: # get the html version if exist... pid = os.path.splitext(self.__pid)[0] + ".htm" print " * single.py: pid=%s" % pid #contentStr = '<iframe class="iframe-preview" src="%s/download/%s/%s"></iframe>' % \ # (portalPath, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode( "//*[local-name()='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = "<p class=\"error\">No preview available</p>" elif mimeType.startswith("image/"): src = "%s/%s" % (self.__oid, self.__pid) contentStr = '<a class="image" href="%(src)s" style="max-width:98%%">' \ '<img src="%(src)s" style="max-width:100%%" /></a>' % { "src": self.__pid } return contentStr
def new_cache(self, input_file):
    """
    Start a fresh cache document and seed it with input_file.

    Calls new_cache_document() first, then parses input_file and appends
    its root element to the cache.
    """
    self.new_cache_document()
    parsed = SAXReader().read(File(input_file))
    self.append_element_into_cache_document(parsed.getRootElement())
def __activate__(self, context): self.log = context["log"] self.config = context["systemConfig"] response = context["response"] try: ## Variable prep defaultPath = FascinatorHome.getPath("alerts") self.alertsPath = self.config.getString(defaultPath, ["alerts", "path"]) self.configFile = None # We'll allocate this later... if needed self.redboxVersion = self.config.getString( "", "redbox.version.string") self.csvDialect = csv.excel self.csvDialect.skipinitialspace = True ## XML Parsing docFactory = DocumentFactory() ##docFactory.setXPathNamespaceURIs(namespaces) self.saxReader = SAXReader(docFactory) ## Do our job (success, failed) = self.__processDir() ## Send response to the client (if debugging in browser) writer = response.getPrintWriter("text/plain; charset=UTF-8") writer.println("%s successful, %s failed" % (success, failed)) writer.close() except Exception, e: response.setStatus(500) writer = response.getPrintWriter("text/plain; charset=UTF-8") writer.println("Unexpected error during script execution:\n%s" % str(e)) writer.close()
def is_cache_full(self):
    """
    Report whether the cache file holds the required number of documents.

    Returns False when the cache file does not exist. Otherwise the number
    of nodes matching self.axis_to_check_cache_full() must equal
    self.chambers_required.
    """
    if not os.path.exists(self.cache_file):
        return False
    cache_doc = SAXReader().read(File(self.cache_file))
    cached_nodes = cache_doc.selectNodes(self.axis_to_check_cache_full())
    return len(cached_nodes) == self.chambers_required
def __init__(self, file, config, baseline):
    """
    Set up the XML reader and load the xpath-to-field mapping file.

    config['xmlMap'] names the mapping file (system properties in the path
    are substituted). Raises AlertException when that file does not exist.
    """
    AlertHandler.__init__(self, file, config, baseline)
    self.saxReader = SAXReader(DocumentFactory())

    self.xmlMapFile = StrSubstitutor.replaceSystemProperties(config['xmlMap'])
    if not os.path.exists(self.xmlMapFile):
        raise AlertException(
            "Requested xmlMap file %s does not exist." % self.xmlMapFile)

    ## Make sure we can see our mappings
    # NOTE(review): this FileInputStream is never closed here - confirm
    # whether JsonSimple closes it.
    mappingJson = JsonSimple(FileInputStream(File(self.xmlMapFile)))
    self.map = mappingJson.getObject(["mappings"])
    self.exceptions = mappingJson.getObject(["exceptions"])
    self.defaultNamespace = mappingJson.getObject(["defaultNamespace"])
    self.mappedExceptionCount = 0
def __getPayloadContent(self, oid, pid): print " * combined.py: oid='%s' pid='%s'" % (oid, pid) payload = self.__storage.getPayload(oid, pid) if payload is None: return "<div>Error: No content for '%s'</div>" % oid mimeType = payload.contentType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/download/%s"></iframe>' % \ (portalPath, oid) else: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find("vnd.ms")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1: # get the html version if exist... pid = os.path.splitext(pid)[0] + ".htm" print " * combined.py: pid=%s" % pid payload = self.__storage.getPayload(oid, pid) saxReader = SAXReader(False) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//*[local-name()='body']") slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = "<p class=\"error\">No preview available</p>" elif mimeType.startswith("image/"): src = "%s/%s" % (oid, pid) contentStr = '<a class="image" href="%(src)s" style="max-width:98%%">' \ '<img src="%(src)s" style="max-width:100%%" /></a>' % { "src": pid } return contentStr
def getPayloadContent(self): mimeType = self.__mimeType print " * detail.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % \ (contextPath, portalId, self.__oid) else: pid = self.__oid[self.__oid.rfind("/")+1:] payload = self.__storage.getPayload(self.__oid, pid) print " * detail.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find("vnd")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1: # get the html version if exist... pid = os.path.splitext(self.__pid)[0] + ".htm" print " * detail.py: pid=%s" % pid #contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \ # (contextPath, portalId, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) except: traceback.print_exc() #slideNode = document.selectSingleNode("//div[@class='body']") slideNode = document.selectSingleNode("//*[local-name()='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") return contentStr
def __init__(self, file, config, baseline):
    """
    Set up the XML reader and load the xpath-to-field mapping file.

    config['xmlMap'] names the mapping file (system properties in the path
    are substituted). Raises AlertException when that file does not exist.
    """
    AlertHandler.__init__(self, file, config, baseline)
    self.saxReader = SAXReader(DocumentFactory())

    self.xmlMapFile = StrSubstitutor.replaceSystemProperties(config['xmlMap'])
    if not os.path.exists(self.xmlMapFile):
        raise AlertException(
            "Requested xmlMap file %s does not exist." % self.xmlMapFile)

    ## Make sure we can see our mappings
    # NOTE(review): this FileInputStream is never closed here - confirm
    # whether JsonSimple closes it.
    mappingJson = JsonSimple(FileInputStream(File(self.xmlMapFile)))
    self.map = mappingJson.getObject(["mappings"])
    self.exceptions = mappingJson.getObject(["exceptions"])
    self.defaultNamespace = mappingJson.getObject(["defaultNamespace"])
    self.mappedExceptionCount = 0
def test(indocname):
    """Parse the XML file indocname and walk it with a fresh Visitor."""
    doc = SAXReader().read(indocname)
    doc.accept(Visitor())
def __createEpub(self):
    """
    Stream the manifest as an EPUB (zip) attachment on the HTTP response.

    Steps, in order:
      1. set the Content-Disposition header from the manifest title and
         open a ZipOutputStream on the response
      2. copy the static /epub resources (mimetype, container.xml, epub.css)
      3. build toc.ncx and content.opf as ElementTree documents
      4. for every item in self.__orderedItem write its payloads into
         OEBPS/ and register them in the manifest, navMap and spine
      5. write content.opf and toc.ncx, then close the zip
    """
    title = self.__manifest.get("title")
    response.setHeader("Content-Disposition", "attachment; filename=%s.epub" % urllib.quote(title))
    out = response.getOutputStream("application/epub+zip")
    zipOutputStream = ZipOutputStream(out)

    # save mimetype... and the rest of the standard files in the epub
    zipOutputStream.putNextEntry(ZipEntry("mimetype"))
    epubMimetypeStream = self.__getResourceAsStream("/epub/mimetype")
    IOUtils.copy(epubMimetypeStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("META-INF/container.xml"))
    epubContainerStream = self.__getResourceAsStream("/epub/container.xml")
    IOUtils.copy(epubContainerStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("OEBPS/epub.css"))
    epubcss = self.__getResourceAsStream("/epub/epub.css")
    IOUtils.copy(epubcss, zipOutputStream)
    zipOutputStream.closeEntry()

    #### Creating toc.ncx ####
    tocXml = ElementTree.Element("ncx", {"version": "2005-1", "xml:lang":"en", "xmlns":"http://www.daisy.org/z3986/2005/ncx/"})
    headNode = ElementTree.Element("head")
    tocXml.append(headNode)

    headNode.append(ElementTree.Element("meta", {"name": "dtb:uid", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:depth", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:totalPageCount", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:maxPageNumber", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:generator", "content": "ICE v2"}))

    # docTitle
    docTitle = ElementTree.Element("docTitle")
    textNode = ElementTree.Element("text")
    textNode.text = title
    docTitle.append(textNode)
    tocXml.append(docTitle)

    # docAuthor
    docAuthor = ElementTree.Element("docAuthor")
    textNode = ElementTree.Element("text")
    textNode.text = "ICE v2"
    docAuthor.append(textNode)
    tocXml.append(docAuthor)

    # navMap - one navPoint per item is appended in the loop below
    navMap = ElementTree.Element("navMap")
    tocXml.append(navMap)

    #### Creating content.opf ####
    contentXml = ElementTree.Element("package", {"version": "2.0", "xmlns":"http://www.idpf.org/2007/opf", "unique-identifier":"BookId"})

    metadataNode = ElementTree.Element("metadata", {"xmlns:dc": "http://purl.org/dc/elements/1.1/", "xmlns:opf": "http://www.idpf.org/2007/opf"})
    contentXml.append(metadataNode)

    # metadata information
    metadata = ElementTree.Element("dc:title")
    metadata.text = title
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:language")
    metadata.text = "en-AU"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:creator", {"opf:role":"aut"})
    metadata.text = "ICE"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:publisher")
    metadata.text = "University of Southern Queensland"
    metadataNode.append(metadata)

    # the book identifier is simply the title
    metadata = ElementTree.Element("dc:identifier", {"id":"BookId"})
    metadata.text = title
    metadataNode.append(metadata)

    # manifest and spine - filled in by the loop below
    manifest = ElementTree.Element("manifest")
    contentXml.append(manifest)

    spine = ElementTree.Element("spine", {"toc":"ncx"})
    contentXml.append(spine)

    item = ElementTree.Element("item", {"id":"ncx", "href":"toc.ncx", "media-type":"text/xml"})
    manifest.append(item)
    css = ElementTree.Element("item", {"id":"style", "href":"epub.css", "media-type":"text/css"})
    manifest.append(css)

    # count supplies the playOrder of each navPoint
    count = 1
    for itemHash in self.__orderedItem:
        # NOTE: this rebinds `title` from the book title to the item title;
        # the book title is not needed after this point.
        id, title, htmlFileName, payloadDict, isImage = self.__itemRefDict[itemHash]

        for payloadId in payloadDict:
            payload, payloadType = payloadDict[payloadId]
            if isinstance(payload, Payload):
                # zip entry names are lower-cased, with spaces replaced by
                # underscores and backslashes by forward slashes
                payloadId = payloadId.lower()
                zipEntryId = payloadId.replace(" ", "_").replace("\\", "/")
                if payloadType == "application/xhtml+xml":
                    zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                    ## process the html....
                    saxReader = SAXReader(False)
                    try:
                        saxDoc = saxReader.read(payload.open())
                        payload.close()
                        # (a disabled experiment that stripped class/style
                        # attributes used to live here as commented-out code)

                        ## remove name in a tags
                        ahrefs = saxDoc.selectNodes("//*[local-name()='a' and @name!='']")
                        for a in ahrefs:
                            attr = a.attribute(QName("name"))
                            if attr:
                                a.remove(attr)

                        ## fix images src name.... replace space with underscore and all lower case
                        imgs = saxDoc.selectNodes("//*[local-name()='img' and contains(@src, '_files')]")
                        for img in imgs:
                            srcAttr = img.attribute(QName("src"))
                            if srcAttr:
                                src = srcAttr.getValue()
                                # hash the source name: the file stem becomes
                                # "node-<md5 of stem>", extension is kept
                                filepath, filename = os.path.split(src)
                                filename, ext = os.path.splitext(filename)
                                filename = hashlib.md5(filename).hexdigest()
                                src = os.path.join(filepath.lower().replace(" ", "_"), "node-%s%s" % (filename, ext))
                                img.addAttribute(QName("src"), src.replace(" ", "_"))

                        # only the div with class "body" is kept and wrapped
                        # in a minimal XHTML page
                        bodyNode = saxDoc.selectSingleNode("//*[local-name()='div' and @class='body']")
                        bodyNode.setName("div")

                        out = ByteArrayOutputStream()
                        format = OutputFormat.createPrettyPrint()
                        format.setSuppressDeclaration(True)
                        writer = XMLWriter(out, format)
                        writer.write(bodyNode)
                        writer.flush()
                        contentStr = out.toString("UTF-8")

                        htmlString = """<?xml version="1.0" encoding="UTF-8"?> <html xmlns="http://www.w3.org/1999/xhtml"><head><title>%s</title> <link rel="stylesheet" href="epub.css"/> </head><body>%s</body></html>"""
                        htmlString = htmlString % (title, contentStr)
                        self.__copyString(htmlString, zipOutputStream)
                        # NOTE(review): includeFile is assigned but never
                        # read in this function.
                        includeFile = False
                    except:
                        # best effort: a page that fails to process leaves an
                        # entry without this content and the export continues
                        # NOTE(review): no closeEntry() in this branch -
                        # confirm the next putNextEntry()/close() is relied on
                        traceback.print_exc()
                else:
                    # images....
                    zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                    IOUtils.copy(payload.open(), zipOutputStream)
                    payload.close()
                    zipOutputStream.closeEntry()
            else:
                # NOTE(review): zipEntryId is only assigned in the Payload
                # branch above, so here it is stale or unbound - confirm the
                # intended entry name.
                zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                IOUtils.copy(payload, zipOutputStream)
                zipOutputStream.closeEntry()

            # register the payload in the OPF manifest; the item's main html
            # file gets the item hash as its id so the spine can refer to it
            itemNode = ElementTree.Element("item", {"media-type":payloadType, "href": zipEntryId})
            if payloadId == htmlFileName.lower():
                itemNode.set("id", itemHash)
            else:
                itemNode.set("id", payloadId.replace("/", "_"))
            manifest.append(itemNode)

        # table-of-contents entry for this item
        if not isImage:
            navPoint = ElementTree.Element("navPoint", {"class":"chapter", "id":"%s" % itemHash, "playOrder":"%s" % count})
        else:
            navPoint = ElementTree.Element("navPoint", {"class":"chapter", "id":"%s" % htmlFileName, "playOrder":"%s" % count})
        navMap.append(navPoint)
        navLabel = ElementTree.Element("navLabel")
        navPoint.append(navLabel)
        textNode = ElementTree.Element("text")
        textNode.text = title
        navLabel.append(textNode)
        content = ElementTree.Element("content")
        navPoint.append(content)
        content.set("src", htmlFileName)
        count +=1

        # reading-order entry for this item
        itemRefNode = ElementTree.Element("itemref")
        spine.append(itemRefNode)
        itemRefNode.set("idref", itemHash)

    # saving content.opf...
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/content.opf"))
    self.__copyString(ElementTree.tostring(contentXml), zipOutputStream)
    zipOutputStream.closeEntry()

    # saving toc.ncx
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/toc.ncx"))
    self.__copyString(ElementTree.tostring(tocXml), zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.close()
def __load_xml__(self):
    """Parse the file named by self.xmlfile and keep it as self.xmldoc."""
    self.xmldoc = SAXReader().read(File(self.xmlfile))
class XMLAlertHandler(AlertHandler):
    '''Processing class for a single XML file.

    Each XML file is expected to contain only a single Collection.
    '''

    def __init__(self, file, config, baseline):
        '''Set up the XML reader and load the xpath-to-field mapping file.

        config['xmlMap'] names the mapping file (system properties in the
        path are substituted). Raises AlertException when it does not exist.
        '''
        AlertHandler.__init__(self, file, config, baseline)
        docFactory = DocumentFactory()
        self.saxReader = SAXReader(docFactory)
        self.xmlMapFile = StrSubstitutor.replaceSystemProperties(config['xmlMap'])
        if not os.path.exists(self.xmlMapFile):
            raise AlertException("Requested xmlMap file %s does not exist." % self.xmlMapFile)

        ## Make sure we can see our mappings
        # NOTE(review): this stream is never closed here - confirm whether
        # JsonSimple closes it.
        inStream = FileInputStream(File(self.xmlMapFile))
        xmlMappings = JsonSimple(inStream)
        self.map = xmlMappings.getObject(["mappings"])
        self.exceptions = xmlMappings.getObject(["exceptions"])
        self.defaultNamespace = xmlMappings.getObject(["defaultNamespace"])

        self.mappedExceptionCount = 0

    def process(self):
        '''Read the XML file and map xpath items to metadata.

        Returns a list with one JsonSimple object, or None when
        getNewJsonObject() yields None. Parse errors propagate to the
        caller after the file handles have been closed.
        '''
        jsonList = []
        data = None
        reader = None
        inStream = None
        document = None

        # Run the XML through our parser. A parse failure simply propagates
        # (the original's `except: raise` was a no-op and is dropped).
        try:
            inStream = FileInputStream(File(self.file))
            reader = InputStreamReader(inStream, "UTF-8")
            document = self.saxReader.read(reader)
        # Close our file access objects
        finally:
            if reader is not None:
                reader.close()
            if inStream is not None:
                inStream.close()

        # Now go looking for all our data
        data = self.getNewJsonObject()
        self.__mapXpathToFields(document, self.map, data)

        if data is None:
            return None

        jsonList.append(JsonSimple(data))
        return jsonList

    ## Used recursively
    def __mapXpathToFields(self, sourceData, map, responseData, index = 1):
        '''Evaluate every xpath in map against sourceData and store results.

        map values may be a JsonObject (nested xpaths, evaluated relative to
        each matched node with a 1-based index), a JSONArray of field names,
        or a single field name. Empty xpath keys are skipped.
        '''
        for xpath in map.keySet():
            field = map.get(xpath)
            if xpath != "":
                xpathobj = DefaultXPath(xpath)
                if not self.defaultNamespace is None:
                    xpathobj.setNamespaceContext(SimpleNamespaceContext(self.defaultNamespace))

                nodes = xpathobj.selectNodes(sourceData)

                if isinstance(field, JsonObject):
                    # The XPath key provides a dictionary containing sub
                    # xpath queries mapped to fields
                    i = 1
                    for node in nodes:
                        self.__mapXpathToFields(node, field, responseData, i)
                        i += 1
                else:
                    # Lists indicate we're copying the several fields
                    if isinstance(field, JSONArray):
                        for eachField in field:
                            self.__insertFieldData(nodes, eachField, responseData, index)
                    # or just one field
                    else:
                        self.__insertFieldData(nodes, field, responseData, index)

    def __insertFieldData(self, xmlNodes, field, responseData, index):
        '''Copy the trimmed text of xmlNodes into responseData under field.

        A ".0." segment in field is a counter: with several nodes it is
        replaced by 1, 2, 3... per stored value, otherwise by index. Nodes
        with empty text are skipped. Fields listed in the "fields" section
        of the exceptions config get a descriptive string instead.
        '''
        multiValue = False
        multiIndex = 1
        fieldString = ""

        if self.exceptions["fields"].containsKey(field):
            # The field is an exception
            excepted = True
            # NOTE(review): output is read but never used below - confirm
            # whether the exception text was meant to be stored under it.
            output = self.exceptions["output"]
            self.mappedExceptionCount += 1
        else:
            # Nope, just normal
            excepted = False

        # NOTE(review): the source layout was flattened; this check is
        # assumed to follow the if/else above rather than sit inside its
        # else branch - confirm against the original file.
        if ('.0.' in field and len(xmlNodes) > 1):
            # In ReDBox, a field such as
            # dc:subject.vivo:keyword.0.rdf:PlainLiteral indicates a list of
            # values, using the number as a counter. If a field contains
            # this number element we increment the counter per value;
            # otherwise the value is just overwritten.
            multiValue = True
            # we'll do the fieldString index change a little later
            fieldString = field
        else:
            fieldString = field.replace(".0.", ".%s."%index, 1)

        for node in xmlNodes:
            text = node.getTextTrim()

            if fieldString != "" and text != "":
                if excepted:
                    # Fixed: this used the undefined name `exceptions`,
                    # which raised NameError; the config is self.exceptions.
                    exceptionString = "%s: '%s' (%s)" % (self.exceptions["fields"][field], text, field)
                    responseData.put(fieldString, exceptionString)
                else:
                    if multiValue:
                        fieldString = field.replace(".0.", ".%s."%multiIndex, 1)
                        multiIndex += 1
                    responseData.put(fieldString, text)
def __doc_cache_file(self):
    """Parse the file named by self.cache_file and return the document."""
    return SAXReader().read(File(self.cache_file))
def __createEpub(self):
    """
    Stream the manifest as an EPUB (zip) attachment on the HTTP response.

    Steps, in order:
      1. set the Content-Disposition header from the manifest title and
         open a ZipOutputStream on the response
      2. copy the static /epub resources (mimetype, container.xml, epub.css)
      3. build toc.ncx and content.opf as ElementTree documents
      4. for every item in self.__orderedItem write its payloads into
         OEBPS/ and register them in the manifest, navMap and spine
      5. write content.opf and toc.ncx, then close the zip
    """
    title = self.__manifest.getString(None, "title")

    self.vc("response").setHeader(
        "Content-Disposition",
        "attachment; filename=%s.epub" % urllib.quote(title))
    out = self.vc("response").getOutputStream("application/epub+zip")
    zipOutputStream = ZipOutputStream(out)

    # save mimetype... and the rest of the standard files in the epub
    zipOutputStream.putNextEntry(ZipEntry("mimetype"))
    epubMimetypeStream = self.__getResourceAsStream("/epub/mimetype")
    IOUtils.copy(epubMimetypeStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("META-INF/container.xml"))
    epubContainerStream = self.__getResourceAsStream("/epub/container.xml")
    IOUtils.copy(epubContainerStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("OEBPS/epub.css"))
    epubcss = self.__getResourceAsStream("/epub/epub.css")
    IOUtils.copy(epubcss, zipOutputStream)
    zipOutputStream.closeEntry()

    #### Creating toc.ncx ####
    tocXml = ElementTree.Element(
        "ncx", {
            "version": "2005-1",
            "xml:lang": "en",
            "xmlns": "http://www.daisy.org/z3986/2005/ncx/"
        })
    headNode = ElementTree.Element("head")
    tocXml.append(headNode)

    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:uid",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:depth",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:totalPageCount",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:maxPageNumber",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:generator",
            "content": "ICE v2"
        }))

    # docTitle
    docTitle = ElementTree.Element("docTitle")
    textNode = ElementTree.Element("text")
    textNode.text = title
    docTitle.append(textNode)
    tocXml.append(docTitle)

    # docAuthor
    docAuthor = ElementTree.Element("docAuthor")
    textNode = ElementTree.Element("text")
    textNode.text = "ICE v2"
    docAuthor.append(textNode)
    tocXml.append(docAuthor)

    # navMap - one navPoint per item is appended in the loop below
    navMap = ElementTree.Element("navMap")
    tocXml.append(navMap)

    #### Creating content.opf ####
    contentXml = ElementTree.Element(
        "package", {
            "version": "2.0",
            "xmlns": "http://www.idpf.org/2007/opf",
            "unique-identifier": "BookId"
        })

    metadataNode = ElementTree.Element(
        "metadata", {
            "xmlns:dc": "http://purl.org/dc/elements/1.1/",
            "xmlns:opf": "http://www.idpf.org/2007/opf"
        })
    contentXml.append(metadataNode)

    # metadata information
    metadata = ElementTree.Element("dc:title")
    metadata.text = title
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:language")
    metadata.text = "en-AU"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:creator", {"opf:role": "aut"})
    metadata.text = "ICE"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:publisher")
    metadata.text = "University of Southern Queensland"
    metadataNode.append(metadata)

    # the book identifier is simply the title
    metadata = ElementTree.Element("dc:identifier", {"id": "BookId"})
    metadata.text = title
    metadataNode.append(metadata)

    # manifest and spine - filled in by the loop below
    manifest = ElementTree.Element("manifest")
    contentXml.append(manifest)

    spine = ElementTree.Element("spine", {"toc": "ncx"})
    contentXml.append(spine)

    item = ElementTree.Element("item", {
        "id": "ncx",
        "href": "toc.ncx",
        "media-type": "text/xml"
    })
    manifest.append(item)
    css = ElementTree.Element("item", {
        "id": "style",
        "href": "epub.css",
        "media-type": "text/css"
    })
    manifest.append(css)

    # count supplies the playOrder of each navPoint
    count = 1
    for itemHash in self.__orderedItem:
        # NOTE: this rebinds `title` from the book title to the item title;
        # the book title is not needed after this point.
        id, title, htmlFileName, payloadDict, isImage = self.__itemRefDict[
            itemHash]

        for payloadId in payloadDict:
            payload, payloadType = payloadDict[payloadId]
            if isinstance(payload, Payload):
                # zip entry names are lower-cased, with spaces replaced by
                # underscores and backslashes by forward slashes
                payloadId = payloadId.lower()
                zipEntryId = payloadId.replace(" ", "_").replace("\\", "/")
                if payloadType == "application/xhtml+xml":
                    zipOutputStream.putNextEntry(
                        ZipEntry("OEBPS/%s" % zipEntryId))
                    ## process the html....
                    saxReader = SAXReader(False)
                    try:
                        saxDoc = saxReader.read(payload.open())
                        payload.close()
                        # (a disabled experiment that stripped class/style
                        # attributes used to live here as commented-out code)

                        ## remove name in a tags
                        ahrefs = saxDoc.selectNodes(
                            "//*[local-name()='a' and @name!='']")
                        for a in ahrefs:
                            attr = a.attribute(QName("name"))
                            if attr:
                                a.remove(attr)

                        ## fix images src name.... replace space with underscore and all lower case
                        imgs = saxDoc.selectNodes(
                            "//*[local-name()='img' and contains(@src, '_files')]"
                        )
                        for img in imgs:
                            srcAttr = img.attribute(QName("src"))
                            if srcAttr:
                                src = srcAttr.getValue()
                                # hash the source name: the file stem becomes
                                # "node-<md5 of stem>", extension is kept
                                filepath, filename = os.path.split(src)
                                filename, ext = os.path.splitext(filename)
                                filename = hashlib.md5(
                                    filename).hexdigest()
                                src = os.path.join(
                                    filepath.lower().replace(" ", "_"),
                                    "node-%s%s" % (filename, ext))
                                img.addAttribute(QName("src"),
                                                 src.replace(" ", "_"))

                        # only the div with class "body" is kept and wrapped
                        # in a minimal XHTML page
                        bodyNode = saxDoc.selectSingleNode(
                            "//*[local-name()='div' and @class='body']")
                        bodyNode.setName("div")

                        out = ByteArrayOutputStream()
                        format = OutputFormat.createPrettyPrint()
                        format.setSuppressDeclaration(True)
                        writer = XMLWriter(out, format)
                        writer.write(bodyNode)
                        writer.flush()
                        contentStr = out.toString("UTF-8")

                        htmlString = """<?xml version="1.0" encoding="UTF-8"?> <html xmlns="http://www.w3.org/1999/xhtml"><head><title>%s</title> <link rel="stylesheet" href="epub.css"/> </head><body>%s</body></html>"""
                        htmlString = htmlString % (title, contentStr)
                        self.__copyString(htmlString, zipOutputStream)
                        # NOTE(review): includeFile is assigned but never
                        # read in this function.
                        includeFile = False
                    except:
                        # best effort: a page that fails to process leaves an
                        # entry without this content and the export continues
                        # NOTE(review): no closeEntry() in this branch -
                        # confirm the next putNextEntry()/close() is relied on
                        traceback.print_exc()
                else:
                    # images....
                    zipOutputStream.putNextEntry(
                        ZipEntry("OEBPS/%s" % zipEntryId))
                    IOUtils.copy(payload.open(), zipOutputStream)
                    payload.close()
                    zipOutputStream.closeEntry()
            else:
                # NOTE(review): zipEntryId is only assigned in the Payload
                # branch above, so here it is stale or unbound - confirm the
                # intended entry name.
                zipOutputStream.putNextEntry(
                    ZipEntry("OEBPS/%s" % zipEntryId))
                IOUtils.copy(payload, zipOutputStream)
                zipOutputStream.closeEntry()

            # register the payload in the OPF manifest; the item's main html
            # file gets the item hash as its id so the spine can refer to it
            itemNode = ElementTree.Element("item", {
                "media-type": payloadType,
                "href": zipEntryId
            })
            if payloadId == htmlFileName.lower():
                itemNode.set("id", itemHash)
            else:
                itemNode.set("id", payloadId.replace("/", "_"))
            manifest.append(itemNode)

        # table-of-contents entry for this item
        if not isImage:
            navPoint = ElementTree.Element(
                "navPoint", {
                    "class": "chapter",
                    "id": "%s" % itemHash,
                    "playOrder": "%s" % count
                })
        else:
            navPoint = ElementTree.Element(
                "navPoint", {
                    "class": "chapter",
                    "id": "%s" % htmlFileName,
                    "playOrder": "%s" % count
                })
        navMap.append(navPoint)
        navLabel = ElementTree.Element("navLabel")
        navPoint.append(navLabel)
        textNode = ElementTree.Element("text")
        textNode.text = title
        navLabel.append(textNode)
        content = ElementTree.Element("content")
        navPoint.append(content)
        content.set("src", htmlFileName)
        count += 1

        # reading-order entry for this item
        itemRefNode = ElementTree.Element("itemref")
        spine.append(itemRefNode)
        itemRefNode.set("idref", itemHash)

    # saving content.opf...
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/content.opf"))
    self.__copyString(ElementTree.tostring(contentXml), zipOutputStream)
    zipOutputStream.closeEntry()

    # saving toc.ncx
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/toc.ncx"))
    self.__copyString(ElementTree.tostring(tocXml), zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.close()
class XMLAlertHandler(AlertHandler):
    '''Processing class for a single XML file.

    Each XML file is expected to contain only a single Collection. XPath
    expressions loaded from the configured xmlMap JSON file are evaluated
    against the parsed document, and the text of the matching nodes is
    stored in a JSON object under the mapped field names.
    '''

    def __init__(self, file, config, baseline):
        '''Set up the XML reader and load the XPath-to-field mappings.

        file, config and baseline are passed straight through to
        AlertHandler.__init__. config['xmlMap'] names the JSON mapping file;
        system properties in that path are substituted before it is used.

        Raises AlertException if the resolved mapping file does not exist.
        '''
        AlertHandler.__init__(self, file, config, baseline)
        docFactory = DocumentFactory()
        self.saxReader = SAXReader(docFactory)
        self.xmlMapFile = StrSubstitutor.replaceSystemProperties(
            config['xmlMap'])
        if not os.path.exists(self.xmlMapFile):
            raise AlertException("Requested xmlMap file %s does not exist." %
                                 self.xmlMapFile)

        ## Make sure we can see our mappings
        inStream = FileInputStream(File(self.xmlMapFile))
        try:
            xmlMappings = JsonSimple(inStream)
        finally:
            # The mapping file is only needed while it is being parsed;
            # release the file handle even if parsing fails.
            inStream.close()
        self.map = xmlMappings.getObject(["mappings"])
        self.exceptions = xmlMappings.getObject(["exceptions"])
        self.defaultNamespace = xmlMappings.getObject(["defaultNamespace"])
        self.mappedExceptionCount = 0

    def process(self):
        '''Read the XML file and map xpath items to metadata.

        Returns a list with 1 JsonSimple object (at most), or None when no
        JSON object could be created to hold the data. Parse errors
        propagate to the caller after the file handles are closed.
        '''
        reader = None
        inStream = None

        # Run the XML through our parser
        try:
            # NOTE(review): self.file is presumably set by
            # AlertHandler.__init__ -- confirm against the base class.
            inStream = FileInputStream(File(self.file))
            reader = InputStreamReader(inStream, "UTF-8")
            document = self.saxReader.read(reader)
        # Close our file access objects
        finally:
            if reader is not None:
                reader.close()
            if inStream is not None:
                inStream.close()

        # Now go looking for all our data
        data = self.getNewJsonObject()
        if data is None:
            # Check before mapping: there is nothing to write the fields to.
            return None
        self.__mapXpathToFields(document, self.map, data)

        return [JsonSimple(data)]

    ## Used recursively
    def __mapXpathToFields(self, sourceData, map, responseData, index=1):
        '''Evaluate every XPath key of map against sourceData.

        sourceData   -- document or node the XPath expressions are run on
        map          -- mapping of XPath expression to a field name, a list
                        of field names, or a nested mapping of sub-queries
        responseData -- object receiving the field values via put()
        index        -- 1-based position substituted for ".0." in field
                        names when recursing over repeated parent nodes

        Entries with an empty XPath key are ignored.
        '''
        for xpath in map.keySet():
            if xpath == "":
                continue
            field = map.get(xpath)

            xpathobj = DefaultXPath(xpath)
            if self.defaultNamespace is not None:
                xpathobj.setNamespaceContext(
                    SimpleNamespaceContext(self.defaultNamespace))
            nodes = xpathobj.selectNodes(sourceData)

            if isinstance(field, JsonObject):
                # The XPath key provides a dictionary containing sub xpath
                # queries mapped to fields; run them against each matching
                # node, passing the node's 1-based position as the index.
                i = 1
                for node in nodes:
                    self.__mapXpathToFields(node, field, responseData, i)
                    i += 1
            elif isinstance(field, JSONArray):
                # Lists indicate we're copying the value to several fields
                for eachField in field:
                    self.__insertFieldData(nodes, eachField, responseData,
                                           index)
            else:
                # or just one field
                self.__insertFieldData(nodes, field, responseData, index)

    def __nodeText(self, node):
        '''Return the trimmed text of an XPath result.

        Tries getTextTrim() first, then getValue().strip(); when the result
        offers neither, it is returned unchanged.
        '''
        # Only a missing method is expected here, so AttributeError is
        # caught rather than everything.
        try:
            return node.getTextTrim()
        except AttributeError:
            try:
                return node.getValue().strip()
            except AttributeError:
                return node

    def __insertFieldData(self, xmlNodes, field, responseData, index):
        '''Store the text of each node in xmlNodes under field.

        xmlNodes     -- XPath results to read the text from
        field        -- target field name, optionally containing ".0."
        responseData -- object receiving the values via put()
        index        -- number substituted for the first ".0." when the
                        field is not expanded into a multi-value list

        Fields listed in the "fields" section of the exceptions mapping are
        counted in self.mappedExceptionCount.
        '''
        multiValue = False
        multiIndex = 1
        fieldString = ""

        if self.exceptions["fields"].containsKey(field):
            # The field is an exception
            excepted = True
            # NOTE(review): "output" is looked up but never used, and
            # fieldString stays empty for excepted fields, so the exception
            # branch in the loop below is never reached. Presumably the
            # exception text was meant to be written to this output field --
            # confirm before changing; current behaviour is kept as is.
            output = self.exceptions["output"]
            self.mappedExceptionCount += 1
        else:
            # Nope, just normal
            excepted = False

            if '.0.' in field and len(xmlNodes) > 1:
                # In ReDBox, a field such as
                # dc:subject.vivo:keyword.0.rdf:PlainLiteral indicates a list
                # of values, using the number as a counter. If a field
                # contains this number element, we can increment the counter
                # and add more and more. If there is no number, we just
                # overwrite the value.
                multiValue = True
                # we'll do the fieldString index change a little later
                fieldString = field
            else:
                fieldString = field.replace(".0.", ".%s." % index, 1)

        for node in xmlNodes:
            text = self.__nodeText(node)

            if fieldString != "" and text != "":
                if excepted:
                    # Was a bare "exceptions" name, which would raise
                    # NameError; the mapping lives on the instance.
                    exceptionString = "%s: '%s' (%s)" % (
                        self.exceptions["fields"][field], text, field)
                    responseData.put(fieldString, exceptionString)
                else:
                    if multiValue:
                        # Each non-empty value gets the next list position.
                        fieldString = field.replace(
                            ".0.", ".%s." % multiIndex, 1)
                        multiIndex += 1
                    responseData.put(fieldString, text)