def __init__(self, tree):
    """Set up the walker over *tree*, wrapping it first when necessary.

    An object exposing ``getroot`` is wrapped in ``Root``; a ``list`` is
    wrapped in ``FragmentRoot``; anything else is handed to the base
    walker unchanged.  An ``ihatexml.InfosetFilter`` is stored on
    ``self.filter``.
    """
    wrapped = tree
    if hasattr(tree, "getroot"):
        wrapped = Root(tree)
    elif isinstance(tree, list):
        wrapped = FragmentRoot(tree)

    _base.NonRecursiveTreeWalker.__init__(self, wrapped)
    self.filter = ihatexml.InfosetFilter()
def testSerializer(element):
    """Dump a tree, fragment or element as indented test-format text.

    *element* may be an object exposing ``getroot`` (full tree), a string
    (text inside a fragment), any other iterable without a ``tag``
    attribute (fragment), or a node with a ``tag``.  Every row except the
    ``#document`` / ``#document-fragment`` headers starts with ``|``
    followed by the indentation.  The rows are returned joined by
    newlines.
    """
    lines = []
    name_filter = ihatexml.InfosetFilter()

    def add_text(text, depth):
        # A text row is the quoted text behind the indentation.
        lines.append("|%s\"%s\"" % (' ' * depth, text))

    def serializeElement(node, indent=0):
        if hasattr(node, "tag"):
            if type(node.tag) == type(etree.Comment):
                lines.append("|%s<!-- %s -->" % (' ' * indent, node.text))
                return

            # Ordinary element: tag row, attributes, text, children, tail.
            inner = indent + 2
            lines.append(
                "|%s<%s>" % (' ' * indent, name_filter.fromXmlName(node.tag)))
            if hasattr(node, "attrib"):
                for attr_name, attr_value in node.attrib.iteritems():
                    lines.append('|%s%s="%s"' % (
                        ' ' * inner,
                        name_filter.fromXmlName(attr_name),
                        attr_value))
            if node.text:
                add_text(node.text, inner)
            for child in node.getchildren():
                serializeElement(child, inner)
            if hasattr(node, "tail") and node.tail:
                # The tail sits at the same depth as the element itself.
                add_text(node.tail, indent)
            return

        if hasattr(node, "getroot"):
            # Full tree case
            lines.append("#document")
            docinfo = node.docinfo
            if docinfo.internalDTD:
                if docinfo.public_id or docinfo.system_url:
                    dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                        docinfo.root_name,
                        docinfo.public_id,
                        docinfo.system_url)
                else:
                    dtd_str = "<!DOCTYPE %s>" % docinfo.root_name
                lines.append("|%s%s" % (' ' * (indent + 2), dtd_str))

            # Rewind to the first top-level sibling of the root, then walk
            # forward over every top-level node.
            current = node.getroot()
            before = current.getprevious()
            while before is not None:
                current = before
                before = current.getprevious()
            while current is not None:
                serializeElement(current, indent + 2)
                current = current.getnext()
            return

        if isinstance(node, basestring):
            # Text in a fragment
            add_text(node, indent)
            return

        # Fragment case
        lines.append("#document-fragment")
        for item in node:
            serializeElement(item, indent + 2)

    serializeElement(element, 0)
    return "\n".join(lines)
def tostring(element):
    """Serialize an element and its child nodes to a string.

    *element* may be an element or a tree wrapper exposing ``getroot()``.
    Nodes are recognised by their ``tag``:

    * ``"<!DOCTYPE>"`` -- emitted as a doctype built from the node text and
      its ``publicId`` / ``systemId`` attributes;
    * ``"<DOCUMENT_ROOT>"`` -- only its text and children are emitted;
    * a tag of the same type as ``ElementTree.Comment`` -- emitted as a
      comment;
    * anything else -- emitted as an ordinary element with attributes,
      text, children and an end tag.

    The tail text of every node is appended right after the node.

    Returns the concatenated markup as a single string.
    """
    rv = []
    infoset_filter = ihatexml.InfosetFilter()

    def serializeElement(element):
        # A tree wrapper is detected by its getroot() method.  Comparing
        # type(element) with type(ElementTree.ElementTree) never matched an
        # instance, so wrappers were previously not unwrapped at all.
        if hasattr(element, "getroot"):
            element = element.getroot()

        if element.tag == "<!DOCTYPE>":
            if element.get("publicId") or element.get("systemId"):
                publicId = element.get("publicId") or ""
                systemId = element.get("systemId") or ""
                rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
                          (element.text, publicId, systemId))
            else:
                rv.append("<!DOCTYPE %s>" % (element.text, ))
        elif element.tag == "<DOCUMENT_ROOT>":
            if element.text:
                rv.append(element.text)
            # The root's tail is emitted by the generic tail handling
            # below, after all of its children.
            for child in element:
                serializeElement(child)
        elif isinstance(element.tag, type(ElementTree.Comment)):
            rv.append("<!--%s-->" % (element.text, ))
        else:
            # This is assumed to be an ordinary element.  The tag is
            # un-coerced once so that start and end tags always agree.
            tag = infoset_filter.fromXmlName(element.tag)
            if not element.attrib:
                rv.append("<%s>" % (tag, ))
            else:
                attr = " ".join([
                    "%s=\"%s\"" % (infoset_filter.fromXmlName(name), value)
                    for name, value in element.attrib.items()
                ])
                rv.append("<%s %s>" % (tag, attr))
            if element.text:
                rv.append(element.text)
            for child in element:
                serializeElement(child)
            rv.append("</%s>" % (tag, ))

        if element.tail:
            rv.append(element.tail)

    serializeElement(element)
    return "".join(rv)
def testSerializer(element):
    """Dump a tree, fragment or element as indented test-format text.

    *element* may be an object exposing ``getroot`` (full tree), a
    ``str``/``bytes`` value (text inside a fragment), any other iterable
    without a ``tag`` attribute (fragment), or a node with a ``tag``.
    Names matching the ``{namespace}local`` pattern are written as
    ``prefix local`` using ``constants.prefixes``.  Attribute rows are
    emitted in sorted order.  The rows are returned joined by newlines.
    """
    lines = []
    infosetFilter = ihatexml.InfosetFilter()

    def add_text(text, depth):
        # A text row is the quoted text behind the indentation.
        lines.append("|%s\"%s\"" % (' ' * depth, text))

    def serialize_tree(tree, indent):
        # Full tree case
        lines.append("#document")
        docinfo = tree.docinfo
        if docinfo.internalDTD:
            if docinfo.public_id or docinfo.system_url:
                dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                    docinfo.root_name,
                    docinfo.public_id,
                    docinfo.system_url)
            else:
                dtd_str = "<!DOCTYPE %s>" % docinfo.root_name
            lines.append("|%s%s" % (' ' * (indent + 2), dtd_str))

        # Rewind to the first top-level sibling of the root, then walk
        # forward over every top-level node.
        current = tree.getroot()
        before = current.getprevious()
        while before is not None:
            current = before
            before = current.getprevious()
        while current is not None:
            serializeElement(current, indent + 2)
            current = current.getnext()

    def serialize_tag(node, indent):
        assert isinstance(node, etree._Element)
        inner = indent + 2

        tag_match = etree_builders.tag_regexp.match(node.tag)
        if tag_match is None:
            lines.append(
                "|%s<%s>" % (' ' * indent,
                             infosetFilter.fromXmlName(node.tag)))
        else:
            ns = tag_match.group(1)
            tag = tag_match.group(2)
            prefix = constants.prefixes[ns]
            lines.append(
                "|%s<%s %s>" % (' ' * indent, prefix,
                                infosetFilter.fromXmlName(tag)))

        if hasattr(node, "attrib"):
            rendered = []
            for raw_name, value in node.attrib.items():
                attr_match = tag_regexp.match(raw_name)
                if attr_match is None:
                    label = infosetFilter.fromXmlName(raw_name)
                else:
                    ns, local = attr_match.groups()
                    local = infosetFilter.fromXmlName(local)
                    prefix = constants.prefixes[ns]
                    label = "%s %s" % (prefix, local)
                rendered.append((label, value))
            rendered.sort()
            for label, value in rendered:
                lines.append('|%s%s="%s"' % (' ' * inner, label, value))

        if node.text:
            add_text(node.text, inner)
        for child in node.getchildren():
            serializeElement(child, inner)
        if hasattr(node, "tail") and node.tail:
            # The tail sits at the same depth as the element itself.
            add_text(node.tail, indent)

    def serializeElement(node, indent=0):
        if hasattr(node, "tag"):
            if type(node.tag) == type(etree.Comment):
                lines.append("|%s<!-- %s -->" % (' ' * indent, node.text))
                if hasattr(node, "tail") and node.tail:
                    add_text(node.tail, indent)
            else:
                serialize_tag(node, indent)
        elif hasattr(node, "getroot"):
            serialize_tree(node, indent)
        elif isinstance(node, (str, bytes)):
            # Text in a fragment
            assert isinstance(node, str) or sys.version_info.major == 2
            add_text(node, indent)
        else:
            # Fragment case
            lines.append("#document-fragment")
            for item in node:
                serializeElement(item, indent + 2)

    serializeElement(element, 0)
    return "\n".join(lines)
def __init__(self, namespaceHTMLElements, fullTree=False):
    """Create the builder and its infoset-coercing node classes.

    ``namespaceHTMLElements`` is stored on the instance and passed on to
    ``_base.TreeBuilder.__init__``.  ``fullTree`` is forwarded to
    ``etree_builders.getETreeModule``.

    Three helper classes are defined around the shared
    ``ihatexml.InfosetFilter`` (also kept on ``self.infosetFilter``):

    * ``Attributes`` -- a dict that mirrors every stored item onto the
      wrapped element's ``attrib`` under a coerced name;
    * ``Element`` -- coerces element names and inserted text;
    * ``Comment`` -- coerces comment data.
    """
    builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
    infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
    self.namespaceHTMLElements = namespaceHTMLElements

    def coerced_attribute_name(key):
        # Tuple keys carry the local name at index 1 and the namespace at
        # index 2; they become "{namespace}local" names.
        if isinstance(key, tuple):
            return "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
        return infosetFilter.coerceAttribute(key)

    class Attributes(dict):
        def __init__(self, element, value=None):
            self._element = element
            dict.__init__(self, {} if value is None else value)
            for key, item in self.items():
                self._element._element.attrib[
                    coerced_attribute_name(key)] = item

        def __setitem__(self, key, value):
            dict.__setitem__(self, key, value)
            self._element._element.attrib[coerced_attribute_name(key)] = value

    class Element(builder.Element):
        def __init__(self, name, namespace):
            name = infosetFilter.coerceElement(name)
            builder.Element.__init__(self, name, namespace=namespace)
            self._attributes = Attributes(self)

        def _setName(self, name):
            self._name = infosetFilter.coerceElement(name)
            self._element.tag = self._getETreeTag(
                self._name, self._namespace)

        def _getName(self):
            return infosetFilter.fromXmlName(self._name)

        name = property(_getName, _setName)

        def _getAttributes(self):
            return self._attributes

        def _setAttributes(self, attributes):
            self._attributes = Attributes(self, attributes)

        attributes = property(_getAttributes, _setAttributes)

        def insertText(self, data, insertBefore=None):
            data = infosetFilter.coerceCharacters(data)
            builder.Element.insertText(self, data, insertBefore)

        def appendChild(self, child):
            builder.Element.appendChild(self, child)

    class Comment(builder.Comment):
        def __init__(self, data):
            data = infosetFilter.coerceComment(data)
            builder.Comment.__init__(self, data)

        def _setData(self, data):
            data = infosetFilter.coerceComment(data)
            self._element.text = data

        def _getData(self):
            return self._element.text

        data = property(_getData, _setData)

    self.elementClass = Element
    # Install the coercing subclass defined above.  The plain
    # builder.Comment was installed before, which left the coercion in
    # Comment unused.
    self.commentClass = Comment
    #self.fragmentClass = builder.DocumentFragment
    _base.TreeBuilder.__init__(self, namespaceHTMLElements)
def getDomBuilder(DomImplementation):
    """Create the DOM tree-builder pieces bound to *DomImplementation*.

    Returns ``locals()``: a dict mapping every name defined in this
    function (``Dom``, ``infoset_filter``, ``AttrList``, ``NodeBuilder``,
    ``TreeBuilder``, ``testSerializer``, ``dom2sax`` and the
    ``DomImplementation`` argument) to its object.  The local names are
    therefore part of the returned interface and must not be renamed.
    """
    Dom = DomImplementation
    infoset_filter = ihatexml.InfosetFilter()

    class AttrList:
        """Mapping-like view over the attributes of a DOM element."""

        def __init__(self, element):
            self.element = element

        def __iter__(self):
            return self.element.attributes.items().__iter__()

        def __setitem__(self, name, value):
            self.element.setAttribute(infoset_filter.coerceAttribute(name),
                                      infoset_filter.coerceCharacters(value))

        def items(self):
            return [(infoset_filter.fromXmlName(item[0]), item[1])
                    for item in self.element.attributes.items()]

        def keys(self):
            return [
                infoset_filter.fromXmlName(item)
                for item in self.element.attributes.keys()
            ]

        def __getitem__(self, name):
            name = infoset_filter.toXmlName(name)
            return self.element.getAttribute(name)

        def __contains__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                return self.element.hasAttribute(
                    infoset_filter.toXmlName(name))

    class NodeBuilder(_base.Node):
        """Tree-builder node wrapping a DOM node in ``self.element``."""

        def __init__(self, element):
            _base.Node.__init__(self, element.localName)
            self.element = element

        namespace = property(lambda self: hasattr(self.element, "namespaceURI")
                             and self.element.namespaceURI or None)

        def appendChild(self, node):
            node.parent = self
            self.element.appendChild(node.element)

        def insertText(self, data, insertBefore=None):
            data = infoset_filter.coerceCharacters(data)
            text = self.element.ownerDocument.createTextNode(data)
            if insertBefore:
                self.element.insertBefore(text, insertBefore.element)
            else:
                self.element.appendChild(text)

        def insertBefore(self, node, refNode):
            self.element.insertBefore(node.element, refNode.element)
            node.parent = self

        def removeChild(self, node):
            if node.element.parentNode == self.element:
                self.element.removeChild(node.element)
            node.parent = None

        def reparentChildren(self, newParent):
            while self.element.hasChildNodes():
                child = self.element.firstChild
                self.element.removeChild(child)
                newParent.element.appendChild(child)
            self.childNodes = []

        def getAttributes(self):
            return AttrList(self.element)

        def setAttributes(self, attributes):
            if attributes:
                for name, value in attributes.items():
                    if isinstance(name, tuple):
                        # Tuple names hold the prefix at index 0, the
                        # local name at index 1 and the namespace at
                        # index 2.
                        if name[0] is not None:
                            qualifiedName = (
                                name[0] + ":" +
                                infoset_filter.coerceAttribute(name[1]))
                        else:
                            qualifiedName = infoset_filter.coerceAttribute(
                                name[1])
                        self.element.setAttributeNS(name[2], qualifiedName,
                                                    value)
                    else:
                        self.element.setAttribute(
                            infoset_filter.coerceAttribute(name), value)

        attributes = property(getAttributes, setAttributes)

        def cloneNode(self):
            return NodeBuilder(self.element.cloneNode(False))

        def hasContent(self):
            return self.element.hasChildNodes()

        def getNameTuple(self):
            if self.namespace is None:
                return namespaces["html"], self.name
            else:
                return self.namespace, self.name

        nameTuple = property(getNameTuple)

    class TreeBuilder(_base.TreeBuilder):
        """Tree builder that produces a DOM document held in ``self.dom``."""

        def documentClass(self):
            # The builder itself is returned as the document node; the
            # real DOM document is kept on self.dom.
            self.dom = Dom.getDOMImplementation().createDocument(
                None, None, None)
            return self

        def insertDoctype(self, token):
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]

            domimpl = Dom.getDOMImplementation()
            doctype = domimpl.createDocumentType(name, publicId, systemId)
            self.document.appendChild(NodeBuilder(doctype))
            if Dom == minidom:
                doctype.ownerDocument = self.dom

        def elementClass(self, name, namespace=None):
            if namespace is None and self.defaultNamespace is None:
                node = self.dom.createElement(name)
            else:
                node = self.dom.createElementNS(namespace, name)

            return NodeBuilder(node)

        def commentClass(self, data):
            return NodeBuilder(self.dom.createComment(data))

        def fragmentClass(self):
            return NodeBuilder(self.dom.createDocumentFragment())

        def appendChild(self, node):
            self.dom.appendChild(node.element)

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            return self.dom

        def getFragment(self):
            return _base.TreeBuilder.getFragment(self).element

        def insertText(self, data, parent=None):
            data = infoset_filter.coerceCharacters(data)
            # "!=" replaces the Python-2-only "<>" operator.
            if parent != self:
                _base.TreeBuilder.insertText(self, data, parent)
            else:
                # HACK: allow text nodes as children of the document node
                if hasattr(self.dom, '_child_node_types'):
                    if Node.TEXT_NODE not in self.dom._child_node_types:
                        self.dom._child_node_types = list(
                            self.dom._child_node_types)
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))

        # NOTE(review): assumed to be a class attribute, since
        # documentClass() returns the builder itself as the document
        # node -- confirm against the original layout.
        name = None

    def testSerializer(element):
        """Return the indented test-format dump of a DOM node."""
        element.normalize()
        rv = []

        def serializeElement(element, indent=0):
            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
                if element.name:
                    if element.publicId or element.systemId:
                        publicId = element.publicId or ""
                        systemId = element.systemId or ""
                        rv.append(
                            """|%s<!DOCTYPE %s "%s" "%s">""" %
                            (' ' * indent, element.name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" %
                                  (' ' * indent, element.name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif element.nodeType == Node.DOCUMENT_NODE:
                rv.append("#document")
            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
                rv.append("#document-fragment")
            elif element.nodeType == Node.COMMENT_NODE:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
            elif element.nodeType == Node.TEXT_NODE:
                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
            else:
                if (hasattr(element, "namespaceURI") and
                        element.namespaceURI not in
                        (None, constants.namespaces["html"])):
                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
                                      element.nodeName)
                else:
                    name = element.nodeName
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.hasAttributes():
                    i = 0
                    attr = element.attributes.item(i)
                    while attr:
                        name = infoset_filter.fromXmlName(attr.localName)
                        value = attr.value
                        ns = attr.namespaceURI
                        if ns:
                            name = "%s %s" % (constants.prefixes[ns], name)
                        i += 1
                        attr = element.attributes.item(i)

                        rv.append('|%s%s="%s"' %
                                  (' ' * (indent + 2), name, value))
            indent += 2
            for child in element.childNodes:
                serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)

    def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
        """Replay the DOM subtree at *node* as SAX events on *handler*.

        *nsmap* maps prefixes to namespace URIs.  It is copied before a
        declaration is added, so the shared default is never mutated.  A
        falsy *nsmap* selects the non-namespace ``startElement`` /
        ``endElement`` events.
        """
        if node.nodeType == Node.ELEMENT_NODE:
            if not nsmap:
                handler.startElement(node.nodeName, node.attributes)
                for child in node.childNodes:
                    dom2sax(child, handler, nsmap)
                handler.endElement(node.nodeName)
            else:
                attributes = dict(node.attributes.itemsNS())

                # gather namespace declarations
                prefixes = []
                for attrname in node.attributes.keys():
                    attr = node.getAttributeNode(attrname)
                    if (attr.namespaceURI == XMLNS_NAMESPACE or
                            (attr.namespaceURI is None and
                             attr.nodeName.startswith('xmlns'))):
                        prefix = (attr.localName != 'xmlns' and
                                  attr.localName or None)
                        handler.startPrefixMapping(prefix, attr.nodeValue)
                        prefixes.append(prefix)
                        nsmap = nsmap.copy()
                        nsmap[prefix] = attr.nodeValue
                        del attributes[(attr.namespaceURI, attr.localName)]

                # apply namespace declarations
                for attrname in node.attributes.keys():
                    attr = node.getAttributeNode(attrname)
                    if attr.namespaceURI is None and ':' in attr.nodeName:
                        prefix = attr.nodeName.split(':')[0]
                        # "in" replaces the Python-2-only dict.has_key().
                        if prefix in nsmap:
                            del attributes[(attr.namespaceURI, attr.localName)]
                            attributes[(nsmap[prefix],
                                        attr.localName)] = attr.nodeValue

                # SAX events
                ns = node.namespaceURI or nsmap.get(None, None)
                handler.startElementNS((ns, node.nodeName), node.nodeName,
                                       attributes)
                for child in node.childNodes:
                    dom2sax(child, handler, nsmap)
                handler.endElementNS((ns, node.nodeName), node.nodeName)
                for prefix in prefixes:
                    handler.endPrefixMapping(prefix)

        elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
            handler.characters(node.nodeValue)

        elif node.nodeType == Node.DOCUMENT_NODE:
            handler.startDocument()
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endDocument()

        elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)

        else:
            # ATTRIBUTE_NODE
            # ENTITY_NODE
            # PROCESSING_INSTRUCTION_NODE
            # COMMENT_NODE
            # DOCUMENT_TYPE_NODE
            # NOTATION_NODE
            pass

    return locals()
def __init__(self, fullTree=False):
    """Create the builder and its infoset-coercing node classes.

    ``fullTree`` is forwarded to ``etree_builders.getETreeModule``.

    Three helper classes are defined around the shared
    ``ihatexml.InfosetFilter`` (also kept on ``self.filter``):

    * ``Attributes`` -- a dict that mirrors every stored item onto the
      wrapped element's ``attrib`` under a coerced name;
    * ``Element`` -- remembers the requested name and gives the wrapped
      element a coerced tag; inserted text is coerced as well;
    * ``Comment`` -- coerces comment data.
    """
    builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
    # Local alias chosen so the builtin ``filter`` is not shadowed; the
    # instance attribute keeps its original name.
    infoset_filter = self.filter = ihatexml.InfosetFilter()

    class Attributes(dict):
        def __init__(self, element, value=None):
            self._element = element
            dict.__init__(self, {} if value is None else value)
            # items() is available on both Python 2 and 3, unlike
            # iteritems().
            for key, item in self.items():
                self._element._element.attrib[
                    infoset_filter.coerceAttribute(key)] = item

        def __setitem__(self, key, value):
            dict.__setitem__(self, key, value)
            self._element._element.attrib[
                infoset_filter.coerceAttribute(key)] = value

    class Element(builder.Element):
        def __init__(self, name):
            self._name = name
            builder.Element.__init__(
                self, infoset_filter.coerceElement(name))
            self._attributes = Attributes(self)

        def _setName(self, name):
            self._name = name
            self._element.tag = infoset_filter.coerceElement(name)

        def _getName(self):
            return self._name

        name = property(_getName, _setName)

        def _getAttributes(self):
            return self._attributes

        def _setAttributes(self, attributes):
            self._attributes = Attributes(self, attributes)

        attributes = property(_getAttributes, _setAttributes)

        def insertText(self, data, insertBefore=None):
            data = infoset_filter.coerceCharacters(data)
            builder.Element.insertText(self, data, insertBefore)

        def appendChild(self, child):
            builder.Element.appendChild(self, child)

    class Comment(builder.Comment):
        def __init__(self, data):
            data = infoset_filter.coerceComment(data)
            builder.Comment.__init__(self, data)

        def _setData(self, data):
            data = infoset_filter.coerceComment(data)
            self._element.text = data

        def _getData(self):
            return self._element.text

        data = property(_getData, _setData)

    self.elementClass = Element
    # Install the coercing subclass defined above.  The plain
    # builder.Comment was installed before, which left the coercion in
    # Comment unused.
    self.commentClass = Comment
    #self.fragmentClass = builder.DocumentFragment
    _base.TreeBuilder.__init__(self)