class PushParser(object):
    """Push-style XML parser that reports results through a callback.

    A ``SubTreesTreeBuilder`` is configured with ``elementPath`` and the
    ``onResultDo`` callback and installed as the target of an ``XMLParser``;
    data handed to :meth:`feed` is forwarded to that parser unchanged.
    """

    def __init__(self, elementPath, onResultDo):
        """Create the underlying parser.

        Args:
            elementPath: Passed to the builder as ``elementPath``.
            onResultDo: Passed to the builder as ``onResult``.
        """
        self._parser = XMLParser(
            target=SubTreesTreeBuilder(
                elementPath=elementPath,
                onResult=onResultDo,
            )
        )

    def feed(self, data):
        """Forward the next chunk of XML ``data`` to the wrapped parser."""
        self._parser.feed(data)
def checkfile(modelXbrl, filepath):
    """Check a file for disallowed text and strip its XML declaration.

    The file is read line by line.  Every ``docCheckPattern`` match is
    inspected: entity codes missing from ``xhtmlEntities`` are always
    reported; any other match is reported only when the disclosure system's
    validation type is "EFM" and the root element seen so far is not a
    testcase-style element.  The start of the document is also fed to a
    throw-away parser whose only job is to classify the root element.

    Args:
        modelXbrl: Model object supplying ``modelManager``, ``fileSource``
            and the ``error`` reporting method.
        filepath: Path of the file to check; also used in reported messages.

    Returns:
        A ``(stream, encoding)`` tuple: an ``io.StringIO`` holding the file
        text with the XML declaration removed, and the encoding returned by
        ``modelXbrl.fileSource.file``.
    """
    lines = []
    foundXmlDeclaration = False
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    fileObj, encoding = modelXbrl.fileSource.file(filepath)
    fileName = os.path.basename(filepath)
    parserResults = {}

    class checkFileType(object):
        """Parser target that classifies the root element only."""

        def start(self, tag, attr):
            # A single feed() can deliver several start events (for example
            # a whole document on one line).  Only the first one describes
            # the root, so later elements must not overwrite the result.
            if "rootIsTestcase" in parserResults:
                return
            parserResults["rootIsTestcase"] = tag.rpartition("}")[2] in (
                "testcases", "documentation", "testSuite", "testcase", "testSet")

        def end(self, tag):
            pass

        def data(self, data):
            pass

        def close(self):
            pass

    _parser = XMLParser(target=checkFileType())
    _isTestcase = False

    with fileObj as f:
        # readline() returning "" marks end of file, as in a manual loop.
        for lineNum, line in enumerate(iter(f.readline, ""), start=1):
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if text not in xhtmlEntities:
                        modelXbrl.error(("EFM.5.02.02.06", "GFM.1.01.02"),
                            _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, text=text, file=fileName,
                            line=lineNum, column=match.start())
                elif isEFM and not _isTestcase:
                    if len(text) == 1:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text,
                            unicodeIndex="U+{:04X}".format(ord(text)),
                            file=fileName, line=lineNum, column=match.start())
                    else:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text, file=fileName,
                            line=lineNum, column=match.start())
            if lineNum == 1:
                xmlDeclarationMatch = XMLdeclaration.search(line)
                if xmlDeclarationMatch:  # remove it for lxml
                    start, end = xmlDeclarationMatch.span()
                    line = line[0:start] + line[end:]
                    foundXmlDeclaration = True
            if _parser:  # feed line after removal of xml declaration
                _parser.feed(line.encode('utf-8', 'ignore'))
                if "rootIsTestcase" in parserResults:
                    # root XML element has been encountered
                    _isTestcase = parserResults["rootIsTestcase"]
                    _parser = None  # no point to parse past the root element
            lines.append(line)

    content = ''.join(lines)
    if not foundXmlDeclaration:  # may be multiline, try again
        xmlDeclarationMatch = XMLdeclaration.search(content)
        if xmlDeclarationMatch:  # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            content = content[0:start] + content[end:]
    return (io.StringIO(initial_value=content), encoding)
def testOnResult(self):
    """The onResult callback fires once per subtree matching the element path."""
    trees = []

    def onResult(tree):
        trees.append(tree)

    xml = """<a><b>Dit is een tag in een tag</b></a>"""
    builder = SubTreesTreeBuilder(elementPath=['r', 'a', 'b'], onResult=onResult)
    parser = XMLParser(target=builder)
    parser.feed("<r>")
    # The same fragment is fed twice, so two <b> subtrees are expected.
    parser.feed(xml)
    parser.feed(xml)
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual(2, len(trees))
    self.assertEqual('<b>Dit is een tag in een tag</b>', tostring(trees[0]))
def testIdentityTransformWithNS(self):
    """Building for the namespaced root yields one subtree equal to the input."""
    builder = SubTreesTreeBuilder(buildFor={
        'one': lambda stack: [d['tag'] for d in stack] == ['{u:ri/default#}root'],
    })
    parser = XMLParser(target=builder)
    parser.feed(XML_NS)
    parser.close()
    subtrees = list(builder.getSubtrees())
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual(1, len(subtrees))
    # Local names chosen so the builtin ``id`` is not shadowed.
    identifier, subtree = subtrees[0]
    self.assertEqual('one', identifier)
    self.assertEqualsLxml(parseString(XML_NS), subtree)
def testFilterTag(self):
    """A Target for 'mies' ends up with only the nested <mies> element as root."""
    target = Target('mies')
    p = XMLParser(target=target)
    p.feed("<aap><mies>")
    p.feed("noot")
    p.feed("</mies>")
    p.feed("</aap>")
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual("<mies>noot</mies>", lxmltostring(target.root))
def testFilterTag(self):
    """Feed <aap><mies>noot</mies></aap> in pieces; only <mies> must be kept."""
    miesTarget = Target('mies')
    parser = XMLParser(target=miesTarget)
    for chunk in ("<aap><mies>", "noot", "</mies>", "</aap>"):
        parser.feed(chunk)
    serialized = lxmltostring(miesTarget.root)
    self.assertEqual("<mies>noot</mies>", serialized)
def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Convert a dictionary-based node tree into the equivalent lxml tree.

    Each node dictionary is read for these keys:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start
            of the next: <br/>tail
        attrs -- Optional dictionary of extra attributes (defaults to empty).
        children -- An ordered list of more node-dictionaries.

    Returns the lxml element built for ``tree``.
    """
    if not node_lookup:
        from refactorlib.node import node_lookup

    from lxml.etree import Element, XMLParser

    def build_root(node, attrs):
        # Parsing a dummy document is deliberate: per the original author,
        # parser.makeelement() would always set the encoding to 'UTF8'.
        parser = XMLParser(encoding=encoding)
        parser.set_element_class_lookup(node_lookup)
        parser.feed(b'<a/>')
        element = parser.close()
        element.tag = node['name']
        element.attrib.update(attrs)
        return element

    root = None
    pending = [(tree, root)]
    while pending:
        node, parent = pending.pop()

        # Attributes are inserted in sorted key order for determinism.
        unsorted_attrs = node.get('attrs', {})
        attrs = {key: unsorted_attrs[key] for key in sorted(unsorted_attrs)}

        if parent is not None:
            element = Element(node['name'], attrib=attrs)
            parent.append(element)
        else:
            element = build_root(node, attrs)
            root = element

        element.text = node['text']
        element.tail = node['tail']

        # Pushed in reverse so children are popped in document order.
        pending.extend((child, element) for child in reversed(node['children']))

    return root
def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Convert a dictionary-based node tree into the equivalent lxml tree.

    Each node dictionary is read for these keys:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start
            of the next: <br/>tail
        attrs -- Optional dictionary of extra attributes (defaults to empty).
        children -- An ordered list of more node-dictionaries.

    Returns the lxml element built for ``tree``.
    """
    if not node_lookup:
        from node import node_lookup

    from lxml.etree import XMLParser

    parser = XMLParser(encoding=encoding)
    parser.set_element_class_lookup(node_lookup)
    make_element = parser.makeelement

    root = None
    pending = [(tree, root)]
    while pending:
        node, parent = pending.pop()

        if parent is not None:
            element = make_element(node['name'], attrib=node.get('attrs', {}))
            parent.append(element)
        else:
            # Parsing a dummy document is deliberate: per the original
            # author, makeelement() would always set the encoding to 'UTF8'.
            parser.feed('<trash></trash>')
            element = parser.close()
            element.tag = node['name']
            element.attrib.update(node.get('attrs', {}))
            root = element

        element.text = node['text']
        element.tail = node['tail']

        # Pushed in reverse so children are popped in document order.
        pending.extend((child, element) for child in reversed(node['children']))

    return root
def parseIncrementallyBy20(builder, inputXml):
    """Parse ``inputXml`` in 20-character chunks, collecting built subtrees.

    After every chunk, and once more after the parser is closed, whatever
    ``builder.getSubtrees()`` yields is appended to the result list.

    Returns:
        A ``(result, loops)`` tuple: the list of ``(id, subtree)`` pairs and
        the number of chunks that were fed.
    """
    parser = XMLParser(target=builder)
    xmlStream = StringIO(inputXml)
    result = []

    def collectSubtrees():
        for identifier, subtree in builder.getSubtrees():
            result.append((identifier, subtree))

    loops = 0
    while True:
        data = xmlStream.read(20)
        if not data:
            break
        loops += 1
        parser.feed(data)
        collectSubtrees()

    retval = parser.close()
    collectSubtrees()

    assert retval is None, 'Errr?'
    assert ceil(len(inputXml) / 20.0) == loops, 'Errr?'
    return result, loops
def start(self):
    """Stream-parse ``self._stream`` and pass each built subtree to ``self._callback``.

    The stream is read in 4096-unit chunks.  Subtrees are requested for
    elements whose stack of tags equals ``self._path``; they are handed to
    the callback after every chunk and once more after the parser is closed.
    """
    def matchesPath(stack):
        return [entry['tag'] for entry in stack] == self._path

    builder = SubTreesTreeBuilder(buildFor={'simple': matchesPath})

    def deliverSubtrees():
        for _identifier, subtree in builder.getSubtrees():
            self._callback(subtree)

    parser = XMLParser(target=builder)
    while True:
        chunk = self._stream.read(4096)
        if not chunk:
            break
        parser.feed(chunk)
        deliverSubtrees()

    parser.close()
    deliverSubtrees()
def testTwoTags(self):
    """A Target for 'aap' captures the root element fed in three pieces."""
    target = Target('aap')
    p = XMLParser(target=target)
    p.feed("<aap>")
    p.feed("noot")
    p.feed("</aap>")
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual("<aap>noot</aap>", lxmltostring(target.root))
def testTwoTags(self):
    """Feed <aap>noot</aap> in pieces; the whole element must be captured."""
    aapTarget = Target('aap')
    parser = XMLParser(target=aapTarget)
    for chunk in ("<aap>", "noot", "</aap>"):
        parser.feed(chunk)
    serialized = lxmltostring(aapTarget.root)
    self.assertEqual("<aap>noot</aap>", serialized)
def iterparse(source, events=('end',), tag=None, **kwargs):
    """
    Iteratively parse an XML source, yielding the events the target completes.

    source: Object with a ``read(size)`` method supplying the XML.
    events: Passed to ``MinimalTarget`` as ``events``.
    tag: Passed to ``MinimalTarget`` as ``tags``.
    size: (optional, 1024) The amount to read from ``source`` at a time.
    strip_namespace, ignore_namespace, debug: (optional, False) Forwarded
        to ``MinimalTarget``.
    Any remaining keyword arguments are forwarded to ``XMLParser``.
    """
    # Note: every keyword XMLParser does not understand has to be removed
    # from kwargs before the parser is constructed.
    #
    # http://lxml.de/api/lxml.etree.XMLParser-class.html
    # http://lxml.de/api/lxml.etree.iterparse-class.html
    size = kwargs.pop('size', 1024)
    target_kwargs = {
        option: kwargs.pop(option, False)
        for option in ('strip_namespace', 'ignore_namespace', 'debug')
    }
    target = MinimalTarget(events=events, tags=tag, **target_kwargs)
    parser = XMLParser(target=target, **kwargs)

    while True:
        chunk = source.read(size)
        if not chunk:
            break
        try:
            parser.feed(chunk)
        finally:
            # Note: When exceptions are raised within the parser the
            # target's close method will be called.  Completed events are
            # drained here so they are yielded even if feed() raised.
            pending = target.completed_events
            while pending:
                yield pending.pop(0)
def iterparse(source, events=('end', ), tag=None, **kwargs):
    """
    Iteratively parse an XML source, yielding the events the target completes.

    source: Object with a ``read(size)`` method supplying the XML.
    events: Passed to ``MinimalTarget`` as ``events``.
    tag: Passed to ``MinimalTarget`` as ``tags``.
    size: (optional, 1024) The amount to read from ``source`` at a time.
    strip_namespace, ignore_namespace, debug: (optional, False) Forwarded
        to ``MinimalTarget``.
    Any remaining keyword arguments are forwarded to ``XMLParser``.
    """
    # Note: every keyword XMLParser does not understand has to be removed
    # from kwargs before the parser is constructed.
    #
    # http://lxml.de/api/lxml.etree.XMLParser-class.html
    # http://lxml.de/api/lxml.etree.iterparse-class.html
    size = kwargs.pop('size', 1024)
    strip_namespace = kwargs.pop('strip_namespace', False)
    ignore_namespace = kwargs.pop('ignore_namespace', False)
    debug = kwargs.pop('debug', False)

    target = MinimalTarget(
        events=events,
        tags=tag,
        strip_namespace=strip_namespace,
        ignore_namespace=ignore_namespace,
        debug=debug,
    )
    parser = XMLParser(target=target, **kwargs)

    chunk = source.read(size)
    while chunk:
        try:
            parser.feed(chunk)
        finally:
            # Note: When exceptions are raised within the parser the
            # target's close method will be called.  Completed events are
            # drained here so they are yielded even if feed() raised.
            completed = target.completed_events
            while completed:
                yield completed.pop(0)
        chunk = source.read(size)
def run(self, xmlPath, xmlFile):
    """Parse the records of ``xmlPath`` into ``self.nodeTree`` and log the row count.

    The file's lines are wrapped in a synthetic ``<Root>`` element and fed
    one at a time to a recovering ``XMLParser`` whose target is
    ``self.nodeTree``.  A ``ParseError`` on a single record is logged and
    the loop continues; any other failure is logged and re-raised.

    Args:
        xmlPath: Path of the file to open.
        xmlFile: Name used for the file in log messages.

    Raises:
        Exception: Whatever error interrupted the run, after logging it.
    """
    logger.info(f'{self.name}, normalise start ...')
    # Bound before the try block so the outer handler can always report it;
    # previously the handler read a name that was unbound on failure.
    recnum = 0
    try:
        parser = XMLParser(target=self.nodeTree, recover=True)
        logger.info(f'parsing {xmlFile} ...')
        with open(xmlPath, 'r') as fhr:
            parser.feed('<Root>\n')
            for recnum, xmlRecord in enumerate(fhr, start=1):
                try:
                    self.nodeTree.count()
                    parser.feed(xmlRecord)
                except ParseError:
                    # logger.error() requires a message argument.
                    logger.error(
                        f'parse error, xml inputFile, recnum : {xmlFile}, {recnum}',
                        exc_info=True)
            # Proper closing tag for the synthetic root (was '<\Root>').
            parser.feed('</Root>\n')
            parser.close()
        rowcount = self.nodeTree.result()
        logger.info(f'### {xmlFile} rowcount : {rowcount}')
    except Exception:
        errMsg = f'xml inputFile, recnum : {xmlFile}, {recnum}'
        logger.error(errMsg, exc_info=True)
        raise
def checkfile(modelXbrl, filepath):
    """Check a file for disallowed text and strip its XML declaration.

    The file is read line by line.  Every ``docCheckPattern`` match is
    inspected: entity codes missing from ``xhtmlEntities`` are always
    reported; any other match is reported only when the disclosure system's
    validation type is "EFM" and the root element seen so far is not a
    testcase-style element.  The start of the document is fed to a
    throw-away parser that classifies the root element (testcase or not,
    inline or possibly inline).  Once a document is treated as inline,
    self-closed elements whose local name is not in
    ``elementsWithNoContent`` produce a warning.

    Args:
        modelXbrl: Model object supplying ``modelManager``, ``fileSource``
            and the ``error`` / ``warning`` reporting methods.
        filepath: Path of the file to check; also used in reported messages.

    Returns:
        A ``(stream, encoding)`` tuple: an ``io.StringIO`` holding the file
        text with the XML declaration removed, and the encoding returned by
        ``modelXbrl.fileSource.file``.
    """
    lines = []
    foundXmlDeclaration = False
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    fileObj, encoding = modelXbrl.fileSource.file(filepath)
    fileName = os.path.basename(filepath)
    parserResults = {}

    class checkFileType(object):
        """Parser target that classifies the root element only."""

        def start(self, tag, attr, nsmap=None):
            # A single feed() can deliver several start events (for example
            # a whole document on one line).  Only the first one describes
            # the root, so later elements must not overwrite the result.
            if "rootIsTestcase" in parserResults:
                return
            parserResults["rootIsTestcase"] = tag.rpartition("}")[2] in (
                "testcases", "documentation", "testSuite", "testcase", "testSet")
            if tag in ("{http://www.w3.org/1999/xhtml}html",
                       "{http://www.w3.org/1999/xhtml}xhtml"):
                if nsmap and any(ns in ixbrlAll for ns in nsmap.values()):
                    parserResults["isInline"] = True
                else:
                    parserResults["maybeInline"] = True

        def end(self, tag):
            pass

        def data(self, data):
            pass

        def close(self):
            pass

    _parser = XMLParser(target=checkFileType())
    _isTestcase = False
    mayBeInline = isInline = False

    with fileObj as f:
        # readline() returning "" marks end of file, as in a manual loop.
        for lineNum, line in enumerate(iter(f.readline, ""), start=1):
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if text not in xhtmlEntities:
                        modelXbrl.error(("EFM.5.02.02.06", "GFM.1.01.02"),
                            _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, text=text, file=fileName,
                            line=lineNum, column=match.start())
                elif isEFM and not _isTestcase:
                    if len(text) == 1:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text,
                            unicodeIndex="U+{:04X}".format(ord(text)),
                            file=fileName, line=lineNum, column=match.start())
                    else:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text, file=fileName,
                            line=lineNum, column=match.start())
            if lineNum == 1:
                xmlDeclarationMatch = XMLdeclaration.search(line)
                if xmlDeclarationMatch:  # remove it for lxml
                    start, end = xmlDeclarationMatch.span()
                    line = line[0:start] + line[end:]
                    foundXmlDeclaration = True
            if _parser:  # feed line after removal of xml declaration
                _parser.feed(line.encode('utf-8', 'ignore'))
                if "rootIsTestcase" in parserResults:
                    # root XML element has been encountered
                    _isTestcase = parserResults["rootIsTestcase"]
                    if "isInline" in parserResults:
                        isInline = True
                    elif "maybeInline" in parserResults:
                        mayBeInline = True
                    _parser = None  # no point to parse past the root element
            if mayBeInline and inlinePattern.search(line):
                mayBeInline = False
                isInline = True
            if isInline:
                for match in inlineSelfClosedElementPattern.finditer(line):
                    selfClosedLocalName = match.group(3)
                    if selfClosedLocalName not in elementsWithNoContent:
                        modelXbrl.warning("ixbrl:selfClosedTagWarning",
                            _("Self-closed element \"%(element)s\" may contain text or other elements and should not use self-closing tag syntax (/>) when empty; change these to end-tags in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, element=match.group(1),
                            file=fileName, line=lineNum, column=match.start())
            lines.append(line)

    content = ''.join(lines)
    if not foundXmlDeclaration:  # may be multiline, try again
        xmlDeclarationMatch = XMLdeclaration.search(content)
        if xmlDeclarationMatch:  # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            content = content[0:start] + content[end:]
    return (io.StringIO(initial_value=content), encoding)
def checkfile(modelXbrl, filepath):
    """Check a file for disallowed text and strip its XML declaration.

    The file is read line by line.  Every ``docCheckPattern`` match is
    inspected: entity codes missing from ``xhtmlEntities`` are always
    reported; any other match is reported only when the disclosure system's
    validation type is "EFM" and the root element seen so far is not a
    testcase-style element.  The start of the document is fed to a
    throw-away parser (created with ``huge_tree=True``) that classifies the
    root element (testcase or not, inline or possibly inline).  Once a
    document is treated as inline, self-closed elements whose local name is
    not in ``elementsWithNoContent`` produce a warning.

    Args:
        modelXbrl: Model object supplying ``modelManager``, ``fileSource``
            and the ``error`` / ``warning`` reporting methods.
        filepath: Path of the file to check; also used in reported messages.

    Returns:
        A ``(stream, encoding)`` tuple: an ``io.StringIO`` holding the file
        text with the XML declaration removed, and the encoding returned by
        ``modelXbrl.fileSource.file``.
    """
    lines = []
    foundXmlDeclaration = False
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    fileObj, encoding = modelXbrl.fileSource.file(filepath)
    fileName = os.path.basename(filepath)
    parserResults = {}

    class checkFileType(object):
        """Parser target that classifies the root element only."""

        def start(self, tag, attr, nsmap=None):
            # A single feed() can deliver several start events (for example
            # a whole document on one line).  Only the first one describes
            # the root, so later elements must not overwrite the result.
            if "rootIsTestcase" in parserResults:
                return
            parserResults["rootIsTestcase"] = tag.rpartition("}")[2] in (
                "testcases", "documentation", "testSuite", "testcase", "testSet")
            if tag in ("{http://www.w3.org/1999/xhtml}html",
                       "{http://www.w3.org/1999/xhtml}xhtml"):
                if nsmap and any(ns in ixbrlAll for ns in nsmap.values()):
                    parserResults["isInline"] = True
                else:
                    parserResults["maybeInline"] = True

        def end(self, tag):
            pass

        def data(self, data):
            pass

        def close(self):
            pass

    _parser = XMLParser(target=checkFileType(), huge_tree=True)
    _isTestcase = False
    mayBeInline = isInline = False

    with fileObj as f:
        # readline() returning "" marks end of file, as in a manual loop.
        for lineNum, line in enumerate(iter(f.readline, ""), start=1):
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if text not in xhtmlEntities:
                        modelXbrl.error(("EFM.5.02.02.06", "GFM.1.01.02"),
                            _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, text=text, file=fileName,
                            line=lineNum, column=match.start())
                elif isEFM and not _isTestcase:
                    if len(text) == 1:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text,
                            unicodeIndex="U+{:04X}".format(ord(text)),
                            file=fileName, line=lineNum, column=match.start())
                    else:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text, file=fileName,
                            line=lineNum, column=match.start())
            if lineNum == 1:
                xmlDeclarationMatch = XMLdeclaration.search(line)
                if xmlDeclarationMatch:  # remove it for lxml
                    start, end = xmlDeclarationMatch.span()
                    line = line[0:start] + line[end:]
                    foundXmlDeclaration = True
            if _parser:  # feed line after removal of xml declaration
                _parser.feed(line.encode('utf-8', 'ignore'))
                if "rootIsTestcase" in parserResults:
                    # root XML element has been encountered
                    _isTestcase = parserResults["rootIsTestcase"]
                    if "isInline" in parserResults:
                        isInline = True
                    elif "maybeInline" in parserResults:
                        mayBeInline = True
                    _parser = None  # no point to parse past the root element
            if mayBeInline and inlinePattern.search(line):
                mayBeInline = False
                isInline = True
            if isInline:
                for match in inlineSelfClosedElementPattern.finditer(line):
                    selfClosedLocalName = match.group(3)
                    if selfClosedLocalName not in elementsWithNoContent:
                        modelXbrl.warning("ixbrl:selfClosedTagWarning",
                            _("Self-closed element \"%(element)s\" may contain text or other elements and should not use self-closing tag syntax (/>) when empty; change these to end-tags in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, element=match.group(1),
                            file=fileName, line=lineNum, column=match.start())
            lines.append(line)

    content = ''.join(lines)
    if not foundXmlDeclaration:  # may be multiline, try again
        xmlDeclarationMatch = XMLdeclaration.search(content)
        if xmlDeclarationMatch:  # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            content = content[0:start] + content[end:]
    return (io.StringIO(initial_value=content), encoding)
self.group_name = '' def start(self, tag, attrib): if tag != 'outline': # Ignore anything not part of the outline return if not attrib.get('xmlUrl'): # Remember the current group self.group_name = attrib['text'] else: # Output a podcast entry self.writer.writerow( (self.group_name, attrib['text'], attrib['xmlUrl'], attrib.get('htmlUrl', ''))) def end(self, tag): "Ignore closing tags" def data(self, data): "Ignore data inside nodes" def close(self): "Nothing special to do here" target = PodcastListToCSV(sys.stdout) parser = XMLParser(target=target) with open('podcasts.opml', 'rt') as f: for line in f: parser.feed(line) parser.close()