def naive_xml_to_json(xml_string):
    """Convert an XML document into a JSON object string, naively.

    Each element becomes a ``"name" : value`` member, where ``name`` is the
    element tag with any ``{namespace}`` prefix removed.  An element that has
    children becomes a nested object (its own text is dropped); a leaf
    becomes its JSON-encoded text, or ``""`` when it has no text.  Attributes
    are never read, and repeated sibling names produce repeated keys.

    A top-level ``<data>`` or ``<root>`` wrapper is skipped and only its
    first child is converted.  A wrapper without any children yields
    ``'{"success":""}'``.

    Args:
        xml_string: The XML document, as ``str`` or ``bytes``.

    Returns:
        The JSON text as a ``str``.
    """
    empty_result = '{"success":""}'

    # Normalise to bytes *before* comparing against the sentinels below:
    # a ``str`` never compares equal to a ``bytes`` literal, so textual
    # "<data/>" used to miss the fast path and crash on ``root[0]`` later.
    # ``bytes(<bytes>, "utf-8")`` raises TypeError, which leaves input that
    # is already bytes untouched.
    try:
        xml_string = bytes(xml_string, "utf-8")
    except TypeError:
        pass

    if xml_string in (b"<data/>", b"<root/>"):
        return empty_result

    def walk(xml_node):
        """Render one element (and its subtree) as a JSON member string."""
        # list(node) is the documented replacement for the deprecated
        # getchildren().
        children = list(xml_node)
        # Drop a leading "{namespace-uri}" from the tag, if present.
        name = xml_node.tag.split("}")[-1]

        if not children:
            body = xml_node.text
            if body is None:
                return '"%s" : ""' % name
            return '"%s" : %s' % (name, tornado.escape.json_encode(body))

        parts = ['"%s" : {' % name]
        for child, is_last in iterate_with_lookahead(children):
            parts.append(walk(child))
            if not is_last:
                parts.append(',')
        parts.append('}')
        return ''.join(parts)

    root_xml_node = lxml.etree.fromstring(xml_string)
    if get_xml_element_name(root_xml_node) in ("data", "root"):
        # An empty wrapper spelled differently from the sentinels above
        # (e.g. "<data></data>" or with trailing whitespace) has no child to
        # unwrap; report it the same way instead of raising IndexError.
        if len(root_xml_node) == 0:
            return empty_result
        root_xml_node = root_xml_node[0]

    return "{%s}" % walk(root_xml_node)
def prune_non_schema_xml(root_schema_node, xml_string):
    """Strip elements from an XML document that the schema does not name.

    The document's root element is matched by name against the children of
    ``root_schema_node``.  When no schema child has that name the literal
    string ``"<data/>"`` is returned.  Otherwise the XML tree is walked
    alongside the schema: child elements whose names are not among the
    schema node's child names are removed, and the remaining ones are
    processed recursively.

    Args:
        root_schema_node: Schema node whose children are candidate roots.
        xml_string: The XML document to prune (parsed with ``et.fromstring``).

    Returns:
        The pruned document, pretty-printed and decoded as UTF-8 text, or
        ``"<data/>"`` when the root element is not in the schema.
    """

    def _prune(schema_node, xml_node):
        """Remove non-schema children of ``xml_node``, then recurse."""
        grouped = collect_siblings(xml_node)
        present = set(grouped.keys())
        allowed = collect_child_names(schema_node)
        unexpected = present - allowed

        for tag, elements in grouped.items():
            if name_is_unexpected(tag, unexpected):
                for element in elements:
                    xml_node.remove(element)
                continue

            sub_schema = find_child_by_name(schema_node, tag)
            for element in elements:
                _prune(sub_schema, element)

    def name_is_unexpected(tag, unexpected):
        """Tell whether ``tag`` is one of the names slated for removal."""
        return tag in unexpected

    # Locate the schema node that corresponds to the document's root element.
    candidates = collect_children(root_schema_node)
    xml_root = et.fromstring(xml_string)
    wanted = get_xml_element_name(xml_root)

    not_found = object()
    matched = next(
        (node for node in candidates if node.get_name() == wanted),
        not_found,
    )
    if matched is not_found:
        # The root element is not described by the schema at all.
        return "<data/>"

    _prune(matched, xml_root)

    serialized = et.tostring(xml_root, pretty_print=True)
    return serialized.decode("utf-8")