def __init__(self, config_dir, data_dir):
    """Load and validate the gateway configuration.

    Loads the XML catalog ``catalog.xml`` from *data_dir*, then parses
    ``jjigw.xml`` from *config_dir* with validation enabled and fills the
    instance from the parsed document:

    - ``doc``: the parsed libxml2 document
    - ``connect``: ``ConnectConfig`` built from the first ``jjigw/connect``
      node
    - ``networks``: ``NetworkConfig`` objects keyed by ``network.jid.domain``
    - ``jid``: the ``jid`` of the first network (a later network only
      replaces it while it is still falsy); ``None`` without networks
    - ``spidentd``: ``SPIdentDConfig`` for the first ``jjigw/spidentd`` node,
      or ``None`` when there is no such node
    - ``admins``: one ``JID`` per ``jjigw/admin`` node

    Side effect: the process working directory is changed to *data_dir*.

    :param config_dir: directory containing ``jjigw.xml``
    :param data_dir: directory containing ``catalog.xml``
    :raises JJIGWFatalError: if the parser reports the document as invalid
    """
    self.doc = None
    self.config_dir = config_dir
    self.data_dir = data_dir

    # NOTE(review): presumably the chdir lets relative references from the
    # catalog resolve against data_dir -- confirm before removing.
    os.chdir(data_dir)
    libxml2.initializeCatalog()
    libxml2.loadCatalog(os.path.join(data_dir, "catalog.xml"))

    parser = libxml2.createFileParserCtxt(
        os.path.join(config_dir, "jjigw.xml"))
    parser.validate(1)
    parser.parseDocument()
    if not parser.isValid():
        # Call form instead of the Python 2-only ``raise E, "msg"`` statement.
        raise JJIGWFatalError("Invalid configuration")
    self.doc = parser.doc()

    self.connect = ConnectConfig(self.doc.xpathEval("jjigw/connect")[0])

    self.jid = None
    self.networks = {}
    for n in self.doc.xpathEval("jjigw/network"):
        network = NetworkConfig(n)
        if not self.jid:
            self.jid = network.jid
        self.networks[network.jid.domain] = network

    spidentd = self.doc.xpathEval("jjigw/spidentd")
    if spidentd:
        self.spidentd = SPIdentDConfig(spidentd[0])
    else:
        self.spidentd = None

    self.admins = [JID(n.getContent())
                   for n in self.doc.xpathEval("jjigw/admin")]
def __init__(self, config_dir, data_dir):
    """Read ``jjigw.xml`` and build the configuration objects from it.

    The XML catalog ``catalog.xml`` in *data_dir* is loaded first; then
    ``jjigw.xml`` in *config_dir* is parsed with validation turned on.

    Attributes set on the instance:

    - ``config_dir`` / ``data_dir``: the arguments, stored unchanged
    - ``doc``: the parsed libxml2 document
    - ``connect``: ``ConnectConfig`` for the first ``jjigw/connect`` node
    - ``networks``: dict mapping ``network.jid.domain`` to ``NetworkConfig``
    - ``jid``: ``jid`` of the first network (only overwritten by a later
      network while still falsy); ``None`` if there are no networks
    - ``spidentd``: ``SPIdentDConfig`` for the first ``jjigw/spidentd``
      node, or ``None`` if the document has none
    - ``admins``: list with one ``JID`` per ``jjigw/admin`` node

    Side effect: changes the process working directory to *data_dir*.

    :param config_dir: directory containing ``jjigw.xml``
    :param data_dir: directory containing ``catalog.xml``
    :raises JJIGWFatalError: if the parser reports the document as invalid
    """
    self.doc = None
    self.config_dir = config_dir
    self.data_dir = data_dir

    os.chdir(data_dir)
    libxml2.initializeCatalog()
    libxml2.loadCatalog(os.path.join(data_dir, "catalog.xml"))

    parser = libxml2.createFileParserCtxt(os.path.join(config_dir, "jjigw.xml"))
    parser.validate(1)
    parser.parseDocument()
    if not parser.isValid():
        # ``raise E, "msg"`` only parses on Python 2; the call form is
        # equivalent there and also valid on Python 3.
        raise JJIGWFatalError("Invalid configuration")
    self.doc = parser.doc()

    self.connect = ConnectConfig(self.doc.xpathEval("jjigw/connect")[0])

    self.jid = None
    self.networks = {}
    for n in self.doc.xpathEval("jjigw/network"):
        network = NetworkConfig(n)
        if not self.jid:
            self.jid = network.jid
        self.networks[network.jid.domain] = network

    spidentd = self.doc.xpathEval("jjigw/spidentd")
    if spidentd:
        self.spidentd = SPIdentDConfig(spidentd[0])
    else:
        self.spidentd = None

    self.admins = [JID(n.getContent())
                   for n in self.doc.xpathEval("jjigw/admin")]
def load(self):
    """Parse the document at ``self.filepath`` into ``self.xml``.

    Every catalog returned by ``self.get_catalogs_list()`` is registered
    with libxml2 before the file is parsed; XInclude processing is then run
    on the parsed document.

    Raises DocumentError when ``self.filepath`` is None.
    """
    if self.filepath is None:
        raise DocumentError('The document source file path is not defined.')

    # Register the catalogs before parsing the document.
    for catalog_path in self.get_catalogs_list():
        libxml2.loadCatalog(catalog_path)

    self.xml = libxml2.parseFile(self.filepath)

    # Run XInclude processing; the returned status is not inspected.
    self.xml.xincludeProcess()
def _apply_stylesheet(xsl_path, xml_text):
    """Run one XSLT pass and return the serialized result.

    Parses the stylesheet file at *xsl_path*, applies it to the document
    parsed from the string *xml_text* and returns whatever
    ``saveResultToString`` produces. The stylesheet, the source document and
    the result document are freed on every path, including when a later
    step raises.
    """
    style_doc = libxml2.parseFile(xsl_path)
    style = libxslt.parseStylesheetDoc(style_doc)
    try:
        doc = libxml2.parseDoc(xml_text)
        try:
            result = style.applyStylesheet(doc, None)
            try:
                return style.saveResultToString(result)
            finally:
                result.freeDoc()
        finally:
            doc.freeDoc()
    finally:
        # style_doc is never freed separately: per the libxslt API the
        # stylesheet keeps the document and frees it together with itself.
        style.freeStylesheet()


def xsl_transform(content, bDownloadImages):
    """Transform Google Docs HTML into the output of the two XSLT passes.

    Pipeline: ``tidy_and_premail`` -> first stylesheet
    (``GDOCS2CNXML_XSL1``) -> ``tex2mathml`` -> optional ``downloadImages``
    -> second stylesheet (``GDOCS2CNXML_XSL2``).

    :param content: HTML handed to ``tidy_and_premail``
    :param bDownloadImages: when true, ``downloadImages`` is run on the
        intermediate tree
    :returns: tuple ``(strResult2, imageObjects)``; ``imageObjects`` is an
        empty dict unless ``bDownloadImages`` is true, in which case it is
        the second value returned by ``downloadImages``
    """
    # 1 Tidy the incoming HTML
    strTidiedHtml = tidy_and_premail(content)

    # 2 Settings for libxml2 for transforming XHTML entities to valid XML
    #   (these calls change libxml2's global defaults)
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 3 First XSLT transformation
    strResult1 = _apply_stylesheet(GDOCS2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for TeX2MathML and image download
    etreeXml = etree.fromstring(strResult1)

    # 4 Convert TeX to MathML with Blahtex
    etreeXml = tex2mathml(etreeXml)

    # 5 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 6 Second transformation
    strResult2 = _apply_stylesheet(GDOCS2CNXML_XSL2, strXml)

    return strResult2, imageObjects
def _apply_stylesheet(xsl_path, xml_text):
    """Run one XSLT pass and return the serialized result.

    Parses the stylesheet file at *xsl_path*, applies it to the document
    parsed from the string *xml_text* and returns whatever
    ``saveResultToString`` produces. The stylesheet, the source document and
    the result document are freed on every path, including when a later
    step raises.
    """
    style_doc = libxml2.parseFile(xsl_path)
    style = libxslt.parseStylesheetDoc(style_doc)
    try:
        doc = libxml2.parseDoc(xml_text)
        try:
            result = style.applyStylesheet(doc, None)
            try:
                return style.saveResultToString(result)
            finally:
                result.freeDoc()
        finally:
            doc.freeDoc()
    finally:
        # style_doc is never freed separately: per the libxslt API the
        # stylesheet keeps the document and frees it together with itself.
        style.freeStylesheet()


def xsl_transform(content, bDownloadImages, base_or_source_url='.'):
    """Transform arbitrary HTML into the output of the two XSLT passes.

    Pipeline: readability title + summary -> ``tidy_and_premail`` -> first
    stylesheet (``XHTML2CNXML_XSL1``) -> optional ``downloadImages`` ->
    ``add_cnxml_title`` -> second stylesheet (``XHTML2CNXML_XSL2``).

    :param content: HTML source
    :param bDownloadImages: when true, ``downloadImages`` is run on the
        intermediate tree
    :param base_or_source_url: forwarded unchanged to ``downloadImages``
    :returns: tuple ``(strResult2, imageObjects, html_title)``;
        ``imageObjects`` is an empty dict unless ``bDownloadImages`` is
        true, and ``html_title`` is ``"Untitled"`` when title extraction
        fails
    """
    use_readability = True

    # 1 get title with readability
    html_title = "Untitled"
    try:
        html_title = Document(content).title()
    except Exception:
        # Best effort: keep the default title. Narrowed from a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        pass

    # 2 use readability to get content
    if use_readability:
        readable_article = Document(content).summary()
    else:
        readable_article = content

    # 3 tidy and premail
    strTidiedHtml = tidy_and_premail(readable_article)

    # 4 Load XHTML catalog files: Makes XHTML entities readable.
    #   (these calls change libxml2's global defaults)
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 5 XSLT transformation
    strResult1 = _apply_stylesheet(XHTML2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for TeX2MathML and image download
    etreeXml = etree.fromstring(strResult1)

    # 6 TeX to MathML conversion is not applied for XHTML input.

    # 7 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml, base_or_source_url)

    # 8 add title from html
    etreeXml = add_cnxml_title(etreeXml, html_title)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 9 Second transformation
    strResult2 = _apply_stylesheet(XHTML2CNXML_XSL2, strXml)

    return strResult2, imageObjects, html_title
def init_libxml2(xml):
    """Configure libxml2 defaults and pass *xml* through unchanged.

    Loads the ``XHTML_ENTITIES`` catalog and sets the libxml2 line-number
    and entity-substitution defaults to 1.

    Returns a tuple of *xml* and a new empty dict.
    """
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    extras = {}
    return xml, extras
def _apply_stylesheet(xsl_path, xml_text):
    """Run one XSLT pass and return the serialized result.

    Parses the stylesheet file at *xsl_path*, applies it to the document
    parsed from the string *xml_text* and returns whatever
    ``saveResultToString`` produces. The stylesheet, the source document and
    the result document are freed on every path, including when a later
    step raises.
    """
    style_doc = libxml2.parseFile(xsl_path)
    style = libxslt.parseStylesheetDoc(style_doc)
    try:
        doc = libxml2.parseDoc(xml_text)
        try:
            result = style.applyStylesheet(doc, None)
            try:
                return style.saveResultToString(result)
            finally:
                result.freeDoc()
        finally:
            doc.freeDoc()
    finally:
        # style_doc is never freed separately: per the libxslt API the
        # stylesheet keeps the document and frees it together with itself.
        style.freeStylesheet()


def xsl_transform(content, bDownloadImages, base_or_source_url='.'):
    """Transform known HTML into the output of the two XSLT passes.

    Pipeline: ``tidy_and_premail`` -> first stylesheet
    (``XHTML2CNXML_XSL1``) -> optional ``downloadImages`` ->
    ``add_cnxml_title`` -> second stylesheet (``XHTML2CNXML_XSL2``).

    :param content: HTML source, handed to ``tidy_and_premail`` as is
    :param bDownloadImages: when true, ``downloadImages`` is run on the
        intermediate tree
    :param base_or_source_url: forwarded unchanged to ``downloadImages``
    :returns: tuple ``(strResult2, imageObjects, html_title)``;
        ``imageObjects`` is an empty dict unless ``bDownloadImages`` is
        true, and ``html_title`` is always ``"Untitled"`` in this variant
    """
    # NOTE: readability-based title and content extraction (steps 1 and 2)
    # is intentionally disabled here: it only makes sense for unknown HTML.
    # See https://github.com/Connexions/rhaptos.html2cnxml
    html_title = "Untitled"

    # 3 tidy and premail
    strTidiedHtml = tidy_and_premail(content)

    # 4 Load XHTML catalog files: Makes XHTML entities readable.
    #   (these calls change libxml2's global defaults)
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 5 XSLT transformation
    strResult1 = _apply_stylesheet(XHTML2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for TeX2MathML and image download
    etreeXml = etree.fromstring(strResult1)

    # 6 TeX to MathML conversion is not applied for XHTML input.

    # 7 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml, base_or_source_url)

    # 8 add title from html
    etreeXml = add_cnxml_title(etreeXml, html_title)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 9 Second transformation
    strResult2 = _apply_stylesheet(XHTML2CNXML_XSL2, strXml)

    return strResult2, imageObjects, html_title