def __init__(self, root_tag, root_factory, **kwargs): self.encoding = kwargs.pop('encoding', None) if not callable(root_factory): raise SerializableAPIError("Factory not callable") self.root_tag = root_tag self.root_factory = root_factory self.root = None self.stack = [] self.dcache = '' if self.encoding is None: self.parser = expat.ParserCreate() else: self.parser = expat.ParserCreate(self.encoding) self.parser.StartElementHandler = self.start_element_handler self.parser.EndElementHandler = self.end_element_handler self.parser.CharacterDataHandler = self.character_data_handler
def xml_parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', disable_entities=True, **kwargs): handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate(encoding, namespace_separator) try: parser.ordered_attributes = True except AttributeError: pass parser.StartNamespaceDeclHandler = handler.startNamespaceDecl parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True if disable_entities: try: feature = "http://apache.org/xml/features/disallow-doctype-decl" parser._reader.setFeature(feature, True) except AttributeError: parser.DefaultHandler = lambda x: None parser.ExternalEntityRefHandler = lambda *x: 1 if hasattr(xml_input, 'read'): parser.ParseFile(xml_input) else: parser.Parse(xml_input, True) return handler.item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', **kwargs): handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate(encoding, namespace_separator) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True try: parser.ParseFile(xml_input) except (TypeError, AttributeError): parser.Parse(xml_input, True) return handler.item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', disable_entities=True, **kwargs): """Parse the given XML input and convert it into a dictionary. `xml_input` can either be a `string` or a file-like object. If `xml_attribs` is `True`, element attributes are put in the dictionary among regular child elements, using `@` as a prefix to avoid collisions. If set to `False`, they are just ignored. If `item_depth` is `0`, the function returns a dictionary for the root element (default behavior). Otherwise, it calls `item_callback` every time an item at the specified depth is found and returns `None` in the end (streaming mode). The callback function receives two parameters: the `path` from the document root to the item (name-attribs pairs), and the `item` (dict). If the callback's return value is false-ish, parsing will be stopped with the :class:`ParsingInterrupted` exception. Streaming example:: The optional argument `postprocessor` is a function that takes `path`, `key` and `value` as positional arguments and returns a new `(key, value)` pair where both `key` and `value` may have changed. Usage example:: OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))]) You can pass an alternate version of `expat` (such as `defusedexpat`) by using the `expat` parameter. E.g: You can use the force_list argument to force lists to be created even when there is only a single child of a given level of hierarchy. The force_list argument is a tuple of keys. If the key for a given level of hierarchy is in the force_list argument, that level of hierarchy will have a list as a child (even if there is only one sub-element). The index_keys operation takes precendence over this. This is applied after any user-supplied postprocessor has already run. For example, given this input: <servers> <server> <name>host1</name> <os>Linux</os> <interfaces> <interface> <name>em0</name> <ip_address>10.0.0.1</ip_address> </interface> </interfaces> </server> </servers> If called with force_list=('interface',), it will produce this dictionary: {'servers': {'server': {'name': 'host1', 'os': 'Linux'}, 'interfaces': {'interface': [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } `force_list` can also be a callable that receives `path`, `key` and `value`. This is helpful in cases where the logic that decides whether a list should be forced is more complex. """ handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate( encoding, namespace_separator ) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartNamespaceDeclHandler = handler.startNamespaceDecl parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True if disable_entities: try: # Attempt to disable DTD in Jython's expat parser (Xerces-J). feature = "http://apache.org/xml/features/disallow-doctype-decl" parser._reader.setFeature(feature, True) except AttributeError: # For CPython / expat parser. # Anything not handled ends up here and entities aren't expanded. parser.DefaultHandler = lambda x: None # Expects an integer return; zero means failure -> expat.ExpatError. parser.ExternalEntityRefHandler = lambda *x: 1 if hasattr(xml_input, 'read'): parser.ParseFile(xml_input) else: parser.Parse(xml_input, True) return handler.item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', **kwargs): """ Parse the given XML input and convert it into a dictionary. `xml_input` can either be a `string` or a file-like object. If `xml_attribs` is `True`, element attributes are put in the dictionary among regular child elements, using `@` as a prefix to avoid collisions. If set to `False`, they are just ignored. Simple example:: >>> import xmltodict >>> doc = xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a> ... \"\"\") >>> doc['a']['@prop'] u'x' >>> doc['a']['b'] [u'1', u'2'] If `item_depth` is `0`, the function returns a dictionary for the root element (default behavior). Otherwise, it calls `item_callback` every time an item at the specified depth is found and returns `None` in the end (streaming mode). The callback function receives two parameters: the `path` from the document root to the item (name-attribs pairs), and the `item` (dict). If the callback's return value is false-ish, parsing will be stopped with the :class:`ParsingInterrupted` exception. Streaming example:: >>> def handle(path, item): ... print 'path:%s item:%s' % (path, item) ... return True ... >>> xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a>\"\"\", item_depth=2, item_callback=handle) path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1 path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2 The optional argument `postprocessor` is a function that takes `path`, `key` and `value` as positional arguments and returns a new `(key, value)` pair where both `key` and `value` may have changed. Usage example:: >>> def postprocessor(path, key, value): ... try: ... return key + ':int', int(value) ... except (ValueError, TypeError): ... return key, value >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>', ... postprocessor=postprocessor) OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))]) You can pass an alternate version of `expat` (such as `defusedexpat`) by using the `expat` parameter. E.g: >>> import defusedexpat >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat) OrderedDict([(u'a', u'hello')]) You can use the force_list argument to force lists to be created even when there is only a single child of a given level of hierarchy. The force_list argument is a tuple of keys. If the key for a given level of hierarchy is in the force_list argument, that level of hierarchy will have a list as a child (even if there is only one sub-element). The index_keys operation takes precendence over this. This is applied after any user-supplied postprocessor has already run. For example, given this input: <servers> <server> <name>host1</name> <os>Linux</os> <interfaces> <interface> <name>em0</name> <ip_address>10.0.0.1</ip_address> </interface> </interfaces> </server> </servers> If called with force_list=('interface',), it will produce this dictionary: {'servers': {'server': {'name': 'host1', 'os': 'Linux'}, 'interfaces': {'interface': [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } `force_list` can also be a callable that receives `path`, `key` and `value`. This is helpful in cases where the logic that decides whether a list should be forced is more complex. """ handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate(encoding, namespace_separator) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True try: parser.ParseFile(xml_input) except (TypeError, AttributeError): parser.Parse(xml_input, True) return handler.item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', disable_entities=True, ordered_mixed_children=False, **kwargs): """Parse the given XML input and convert it into a dictionary. `xml_input` can either be a `string` or a file-like object. If `xml_attribs` is `True`, element attributes are put in the dictionary among regular child elements, using `@` as a prefix to avoid collisions. If set to `False`, they are just ignored. Simple example:: >>> import xmltodict >>> doc = xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a> ... \"\"\") >>> doc['a']['@prop'] u'x' >>> doc['a']['b'] [u'1', u'2'] If `item_depth` is `0`, the function returns a dictionary for the root element (default behavior). Otherwise, it calls `item_callback` every time an item at the specified depth is found and returns `None` in the end (streaming mode). The callback function receives two parameters: the `path` from the document root to the item (name-attribs pairs), and the `item` (dict). If the callback's return value is false-ish, parsing will be stopped with the :class:`ParsingInterrupted` exception. Streaming example:: >>> def handle(path, item): ... print('path:%s item:%s' % (path, item)) ... return True ... >>> xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a>\"\"\", item_depth=2, item_callback=handle) path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1 path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2 The optional argument `postprocessor` is a function that takes `path`, `key` and `value` as positional arguments and returns a new `(key, value)` pair where both `key` and `value` may have changed. Usage example:: >>> def postprocessor(path, key, value): ... try: ... return key + ':int', int(value) ... except (ValueError, TypeError): ... return key, value >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>', ... postprocessor=postprocessor) OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))]) You can pass an alternate version of `expat` (such as `defusedexpat`) by using the `expat` parameter. E.g: >>> import defusedexpat >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat) OrderedDict([(u'a', u'hello')]) You can use the force_list argument to force lists to be created even when there is only a single child of a given level of hierarchy. The force_list argument is a tuple of keys. If the key for a given level of hierarchy is in the force_list argument, that level of hierarchy will have a list as a child (even if there is only one sub-element). The index_keys operation takes precendence over this. This is applied after any user-supplied postprocessor has already run. For example, given this input: <servers> <server> <name>host1</name> <os>Linux</os> <interfaces> <interface> <name>em0</name> <ip_address>10.0.0.1</ip_address> </interface> </interfaces> </server> </servers> If called with force_list=('interface',), it will produce this dictionary: {'servers': {'server': {'name': 'host1', 'os': 'Linux'}, 'interfaces': {'interface': [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } `force_list` can also be a callable that receives `path`, `key` and `value`. This is helpful in cases where the logic that decides whether a list should be forced is more complex. The parameter ordered_mixed_children will cause the parser to add an attribute to each element in the data with a key `@__order__`, with a value that corresponds to the element's processing order within the document. By default, mixed child elements are grouped by name and only keep their relative order. Sometimes the order does matter, but the system you're working with doesn't have any other way to indicate order than by the coincidence of order in the document. For example, this input: <a> <b>1</b> <c>2</c> <b>3</b> </a> Would normally be parsed as: {"a": {"b": [1, 3], "c": 2}} This would then be unparsed as: <a> <b>1</b> <b>3</b> <c>2</c> </a> With `ordered_mixed_children=True`, the order information is included so that the original input is produced when unparsing (the `@__order__` attribute is removed). {"a": { "@__order__": 1, "b": ({"@__order__": 2, "#text": 1}, {"@__order__": 3, "#text": 3}), "c": {"@__order__": 4, "#text": 2} } } """ handler = _DictSAXHandler(namespace_separator=namespace_separator, ordered_mixed_children=ordered_mixed_children, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate( encoding, namespace_separator ) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartNamespaceDeclHandler = handler.startNamespaceDecl parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True if disable_entities: try: # Attempt to disable DTD in Jython's expat parser (Xerces-J). feature = "http://apache.org/xml/features/disallow-doctype-decl" parser._reader.setFeature(feature, True) except AttributeError: # For CPython / expat parser. # Anything not handled ends up here and entities aren't expanded. parser.DefaultHandler = lambda x: None # Expects an integer return; zero means failure -> expat.ExpatError. parser.ExternalEntityRefHandler = lambda *x: 1 if hasattr(xml_input, 'read'): parser.ParseFile(xml_input) else: parser.Parse(xml_input, True) return handler.item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', disable_entities=True, **kwargs): force_list = [ 'domainSetInfo', 'evaluationCriteria', 'inputInfoType_model.resourceType', 'relationInfo', 'domainSetInfo.domainId', 'annotationInfo', 'validationInfo', 'textClassificationInfo', 'telephoneNumber', 'requiredLRs', 'annotationManual', 'metadataLanguageName', 'appropriatenessForDSI', 'evaluationTool', 'operatingSystem', 'metadataLanguageId', 'originalSource', 'documentation', 'segmentationLevel', 'encodingLevel', 'variant', 'fundingCountryId', 'funder', 'languageSetInfo', 'inputInfoType_model.annotationType', 'outputInfoType_model.annotationType', 'affiliation', 'fundingType', 'validator', 'identifier', 'theoreticModel', 'creationTool', 'distributionInfo', 'licenceInfo', 'evaluationLevel', 'sizePerLanguage', 'languageVarietyName', 'restrictionsOfUse', 'domainSetInfo.domain', 'contactPerson', 'evaluationReport', 'outputInfoType_model.resourceType', 'domainSetInfo.subdomainId', 'evaluationMeasure', 'keywords', 'fundingCountry', 'url', 'author', 'iprHolder', 'annotationTool', 'email', 'requiredSoftware', 'domainInfo', 'languageVarietyInfo', 'conformanceToStandardsBestPractices', 'characterEncodingInfo', 'extratextualInformation', 'textFormatInfo', 'distributionMedium', 'corpusTextInfo', 'implementationLanguage', 'publisher', 'externalRef', 'languageInfo', 'resourceCreator', 'evaluator', 'executionLocation', 'domainSetInfo.subdomain', 'samplesLocation', 'linguisticInformation', 'function', 'fundingProject', 'downloadLocation', 'sizeInfo', 'editor', 'task', 'extraTextualInformationUnit', 'metadataCreator', 'validationReport', 'outputInfoType_model.mediaType' ] handler = Parser(namespace_separator=namespace_separator, force_list=force_list, **kwargs) if isinstance(xml_input, xmltodict._unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate(encoding, namespace_separator) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartNamespaceDeclHandler = handler.startNamespaceDecl parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True if disable_entities: try: # Attempt to disable DTD in Jython's expat parser (Xerces-J). feature = "http://apache.org/xml/features/disallow-doctype-decl" parser._reader.setFeature(feature, True) except AttributeError: # For CPython / expat parser. # Anything not handled ends up here and entities aren't expanded. parser.DefaultHandler = lambda x: None # Expects an integer return; zero means failure -> expat.ExpatError. parser.ExternalEntityRefHandler = lambda *x: 1 if hasattr(xml_input, 'read'): parser.ParseFile(xml_input) else: parser.Parse(xml_input, True) return handler.item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', disable_entities=True, process_comments=False, **kwargs): """Parse the given XML input and convert it into a dictionary. `xml_input` can either be a `string`, a file-like object, or a generator of strings. If `xml_attribs` is `True`, element attributes are put in the dictionary among regular child elements, using `@` as a prefix to avoid collisions. If set to `False`, they are just ignored. Simple example:: >>> import xmltodict >>> doc = xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a> ... \"\"\") >>> doc['a']['@prop'] u'x' >>> doc['a']['b'] [u'1', u'2'] If `item_depth` is `0`, the function returns a dictionary for the root element (default behavior). Otherwise, it calls `item_callback` every time an item at the specified depth is found and returns `None` in the end (streaming mode). The callback function receives two parameters: the `path` from the document root to the item (name-attribs pairs), and the `item` (dict). If the callback's return value is false-ish, parsing will be stopped with the :class:`ParsingInterrupted` exception. Streaming example:: >>> def handle(path, item): ... print('path:%s item:%s' % (path, item)) ... return True ... >>> xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a>\"\"\", item_depth=2, item_callback=handle) path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1 path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2 The optional argument `postprocessor` is a function that takes `path`, `key` and `value` as positional arguments and returns a new `(key, value)` pair where both `key` and `value` may have changed. Usage example:: >>> def postprocessor(path, key, value): ... try: ... return key + ':int', int(value) ... except (ValueError, TypeError): ... return key, value >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>', ... postprocessor=postprocessor) {'a': {'b:int': [1, 2], 'b': 'x'}} You can pass an alternate version of `expat` (such as `defusedexpat`) by using the `expat` parameter. E.g: >>> import defusedexpat >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat) {'a': 'hello'} You can use the force_list argument to force lists to be created even when there is only a single child of a given level of hierarchy. The force_list argument is a tuple of keys. If the key for a given level of hierarchy is in the force_list argument, that level of hierarchy will have a list as a child (even if there is only one sub-element). The index_keys operation takes precedence over this. This is applied after any user-supplied postprocessor has already run. For example, given this input: <servers> <server> <name>host1</name> <os>Linux</os> <interfaces> <interface> <name>em0</name> <ip_address>10.0.0.1</ip_address> </interface> </interfaces> </server> </servers> If called with force_list=('interface',), it will produce this dictionary: {'servers': {'server': {'name': 'host1', 'os': 'Linux'}, 'interfaces': {'interface': [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } `force_list` can also be a callable that receives `path`, `key` and `value`. This is helpful in cases where the logic that decides whether a list should be forced is more complex. If `process_comment` is `True` then comment will be added with comment_key (default=`'#comment'`) to then tag which contains comment For example, given this input: <a> <b> <!-- b comment --> <c> <!-- c comment --> 1 </c> <d>2</d> </b> </a> If called with process_comment=True, it will produce this dictionary: 'a': { 'b': { '#comment': 'b comment', 'c': { '#comment': 'c comment', '#text': '1', }, 'd': '2', }, } """ handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate(encoding, namespace_separator) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartNamespaceDeclHandler = handler.startNamespaceDecl parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters if process_comments: parser.CommentHandler = handler.comments parser.buffer_text = True if disable_entities: try: # Attempt to disable DTD in Jython's expat parser (Xerces-J). feature = "http://apache.org/xml/features/disallow-doctype-decl" parser._reader.setFeature(feature, True) except AttributeError: # For CPython / expat parser. # Anything not handled ends up here and entities aren't expanded. parser.DefaultHandler = lambda x: None # Expects an integer return; zero means failure -> expat.ExpatError. parser.ExternalEntityRefHandler = lambda *x: 1 if hasattr(xml_input, 'read'): parser.ParseFile(xml_input) elif isgenerator(xml_input): for chunk in xml_input: parser.Parse(chunk, False) parser.Parse(b'', True) else: parser.Parse(xml_input, True) return handler.item