def _get_GraphNodeText(self, doc, domNdPage, ndBlock, ctxt=None):
    """
    Extract the text of a DOM node.

    The node is queried with self.sxpTextual (using the self.dNS namespaces):
    - exactly one match: its text is built by PageXml.makeText
    - no match: the text of the words found below the node
      (.//pc:Word/pc:TextEquiv) is stripped and joined with single spaces
    - several matches: a ValueError is raised

    doc       : the DOM (not used here)
    domNdPage : the DOM page node (not used here)
    ndBlock   : the page object DOM node
    ctxt      : optional xpath context; when given, the word lookup goes
                through ctxt.xpathEval, otherwise through ndBlock.xpath

    return a unicode string
    """
    lNdText = ndBlock.xpath(self.sxpTextual, namespaces=self.dNS)

    if len(lNdText) == 1:
        return PageXml.makeText(lNdText[0])

    if len(lNdText) > 1:
        raise ValueError(
            "More than 1 textual content for this node: %s"
            % etree.tostring(ndBlock))

    # No textual content on the node itself: let's try to get the text of
    # the words, and concatenate...
    sxpWordText = './/pc:Word/pc:TextEquiv'
    if ctxt is None:
        # Without a context we query the node directly, instead of failing
        # on None.xpathEval.
        # NOTE(review): assumes self.dNS declares the 'pc' prefix - confirm.
        lNdWordText = ndBlock.xpath(sxpWordText, namespaces=self.dNS)
    else:
        lNdWordText = ctxt.xpathEval(sxpWordText)

    # A node without direct text has .text set to None: count it as empty
    # rather than crashing on None.strip()
    lsText = [(_nd.text or "").strip() for _nd in lNdWordText]
    return " ".join(lsText)
def _get_GraphNodeText(self, doc, domNdPage, ndBlock):
    """
    Extract the text of a DOM node.

    The node is queried with self.sxpTextual (using the self.dNS namespaces)
    and exactly one match is expected; its text is built by PageXml.makeText.

    doc       : the DOM (not used here)
    domNdPage : the DOM page node (not used here)
    ndBlock   : the page object DOM node

    return a unicode string
    raise ValueError if the query matches no node, or more than one node
    """
    lTextual = ndBlock.xpath(self.sxpTextual, namespaces=self.dNS)
    nbFound = len(lTextual)

    # the normal case first
    if nbFound == 1:
        return PageXml.makeText(lTextual[0])

    # otherwise, report what went wrong together with the serialized node
    if nbFound == 0:
        sMsg = "I found no useful TextEquiv below this node... \n%s"
    else:
        sMsg = "I expected exactly one useful TextEquiv below this node. Got many... \n%s"
    raise ValueError(sMsg % etree.tostring(ndBlock))