def find(input, xpaths, root_tag, no_root, **kwargs):
    """
    Extracts specified portions of XML data from the input. Requires valid input.

    This command can be used to extract a data subset from within more complex data.

    Note: The 'find' command is both similar to and different than the 'strip -x' command.
    The 'find' command outputs MATCHING input, while 'strip -x' outputs NON-matching input.

    Note: The ElementTree package (Python builtin) has limited XPath support.
    Therefore, some of the examples below will only work if the lxml package is used (instead of ElementTree).

    Examples:

    \b
    Example: Find all b elements:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b' -nr
    <b><c/></b>
    <b><d><e/></d><d/></b>

    \b
    Example: Find the 1st b element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b[1]' -nr
    <b><c/></b>

    \b
    Example: Find the 2nd b element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b[2]' -nr
    <b><d><e/></d><d/></b>

    \b
    Example: Find the last b element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b[last()]' -nr
    <b><d><e/></d><d/></b>

    \b
    Example: Find all e elements that are a child of a d element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//d/e' -nr
    <e/>

    \b
    Example: Find all d elements with a child e element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//d/e/parent::*' -nr
    <d><e/></d>

    \b
    Example: Find all elements with exactly 1 inner element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(*)=1]' -nr
    <b><c/></b>
    <d><e/></d>

    \b
    Example: Find all elements with 2 or more inner elements:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(*)>=2]' -nr
    <a><b><c/></b><b><d><e/></d><d/></b></a>
    <b><d><e/></d><d/></b>

    \b
    Example: Find all elements with 1 child element:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(./*)=1]' -nr
    <b><c/></b>
    <d><e/></d>

    \b
    Example: Find all elements with 1 inner element with tag c:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(./c)=1]' -nr
    <b><c/></b>

    \b
    Example: Find all elements with 1 inner element with either the c OR e tag:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(./c|./e)=1]' -nr
    <b><c/></b>
    <d><e/></d>

    \b
    Example: Find all b elements with attribute @z=1:
    $ echo '<a><b z="1"><c/></b><b z="2"><d><e z="1"/></d><d/></b></a>' | \\
    python -mclifunzone.xmltool find -x '//b[@z="1"]' -nr
    <b z="1"><c/></b>

    \b
    Example: Find all elements with attribute @z=1:
    $ echo '<a><b z="1"><c/></b><b z="2"><d><e z="1"/></d><d/></b></a>' | \\
    python -mclifunzone.xmltool find -x '//*[@z="1"]' -nr
    <b z="1"><c/></b>
    <e z="1"/>

    \b
    Example: Find all elements with attribute @z except those with @z=2:
    $ echo '<a><b z="1"><c/></b><b z="2"><d z="1"><e z="2"/></d></b></a>' | \\
    python -mclifunzone.xmltool find -x '//*[@z and @z!="2"]' -nr
    <b z="1"><c/></b>
    <d z="1"><e z="2"/></d>

    \b
    Example: Find all elements with attribute @z=1 and a node position greater than 2:
    $ echo '<a><b z="1"><c/></b><b z="2"><d z="1"><e z="2"/></d></b></a>' | \\
    python -mclifunzone.xmltool find -x '//*[@z="1" and position()>2]' -nr
    <d z="1"><e z="2"/></d>

    \b
    Example: Find all elements with text that contains "3":
    $ echo '<z><a>1a1</a><b>2b1</b><c>3c1</c><a>4a2</a><b>5b2</b><c>6c2</c><a>7a3</a></z>' | \\
    python -mclifunzone.xmltool find -x '//*[contains(text(),"3")]' -nr
    <c>3c1</c>
    <a>7a3</a>
    """
    # NOTE(review): the docstring above uses click's '\b' no-rewrap markers, so it is presumably rendered as the
    # command's --help text; parameter notes are therefore kept in this comment instead of the docstring.
    #
    #   input    -- path of the XML document to read; a falsy value is replaced by '-' before opening.
    #   xpaths   -- iterable of XPath expressions; the matches of every expression are written out in order.
    #   root_tag -- tag name used to wrap the output in '<tag>' ... '</tag>' lines; falsy means no wrapper.
    #   no_root  -- when truthy, the wrapper is suppressed regardless of root_tag.
    #   kwargs   -- accepted and ignored.
    if not input:
        input = '-'
    with click.open_file(input, mode='rb') as f:
        tree = ET.parse(f)
    root = tree.getroot()

    # Collect the matches of each expression, one expression after another. Results are simply concatenated,
    # so whatever get_elements returns for several expressions is emitted once per expression.
    elements = []
    if xpaths:
        elements = list(itertools.chain.from_iterable(
            xml_utils.get_elements(root, xpath=xpath) for xpath in xpaths))

    # no_root takes precedence over any root_tag that was supplied.
    wrapper_tag = None if no_root else root_tag

    if wrapper_tag:
        click.echo('<%s>' % wrapper_tag)
    for element in elements:
        click.echo(ET.tostring(element))
    if wrapper_tag:
        click.echo('</%s>' % wrapper_tag)
def strip(input, whitespace, empty, xpaths, tags, attributes, attribute_values, empty_attributes, all_attributes,
          all_text, **kwargs):
    """
    Removes specified portions of XML data from the input. Requires valid input.

    This command can be used to simplify complex data (by discarding specific portions of it).
    Such simplification might be used (for example) as part of an interactive data analysis process.

    Note: The 'find' command is both similar to and different than the 'strip -x' command.
    The 'find' command outputs MATCHING input, while 'strip -x' outputs NON-matching input.

    Examples:

    \b
    Example: Remove all d tags that are direct children of b tags:
    $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool strip -x "//b/d"
    <a><b><c/></b><b/></a>
    """
    # NOTE(review): the docstring above uses click's '\b' no-rewrap marker, so it is presumably rendered as the
    # command's --help text; parameter notes are therefore kept in this comment instead of the docstring.
    #
    #   input            -- path of the XML document to read; a falsy value is replaced by '-' before opening.
    #   whitespace       -- strip whitespace (via the parser when it supports it, otherwise manually below).
    #   empty            -- repeatedly remove child elements that xml_utils.is_empty_element reports as empty.
    #   xpaths           -- XPath expressions whose matches are removed.
    #   tags             -- tag names; each is converted to the XPath '//<tag>' and removed likewise.
    #   attributes       -- attribute names to remove.
    #   attribute_values -- attribute values; attributes having such a value are removed.
    #   empty_attributes -- remove attributes whose value is empty.
    #   all_attributes   -- remove every attribute (the three attribute options above are then skipped).
    #   all_text         -- remove all text content (element text and tail text).
    #   kwargs           -- accepted and ignored.
    if not input:
        input = '-'
    with click.open_file(input, mode='rb') as f:
        parser = None
        if whitespace:
            try:
                parser = ET.XMLParser(remove_blank_text=True)
            except TypeError:
                # TypeError: __init__() got an unexpected keyword argument 'remove_blank_text'
                # The parser in use does not know this option (lxml not imported?), so the manual
                # whitespace pass further below stays enabled.
                parser = None
            else:
                # The parser takes care of the whitespace removal, so the manual pass is not needed.
                whitespace = False
        if parser is not None:
            tree = ET.parse(f, parser=parser)
        else:
            tree = ET.parse(f)
    root = tree.getroot()

    # Work on a private tuple: this tolerates xpaths=None and never extends a caller-owned sequence in place.
    xpaths = tuple(xpaths or ())
    if tags:
        # convert each tag to an xpath
        xpaths += tuple('//{tag}'.format(tag=tag) for tag in tags)
    for xpath in xpaths:
        xml_utils.remove_elements(root, xpath=xpath)

    if all_attributes:
        for element in list(root.iter()):
            if element.attrib:
                element.attrib.clear()
    else:
        if empty_attributes:
            xml_utils.remove_attributes_with_empty_value(root)
        for attrib_name in attributes or ():
            xml_utils.remove_attributes_with_name(root, attrib_name)
        for attrib_value in attribute_values or ():
            xml_utils.remove_attributes_with_value(root, attrib_value)

    # Text that follows an element's end tag is stored on that element as '.tail' (not on the parent's '.text'),
    # so both attributes have to be handled to cover all text in the document.
    if all_text:
        for element in list(root.iter()):
            if element.text:
                element.text = ''
            if element.tail:
                element.tail = None

    if whitespace:
        for element in list(root.iter()):
            if element.text:
                element.text = element.text.strip()
            if element.tail:
                element.tail = element.tail.strip()

    if empty:
        # Note: the loop repeats after any removal so that elements that become empty (as a result of removal
        # of empty children) are subsequently detected as empty and removed.
        removed_any = True
        while removed_any:
            removed_any = False  # stop unless a removal occurs
            # Snapshot the parents first: the tree is modified while they are being processed.
            parents = [element for element in root.iter() if xml_utils.is_parent_element(element)]
            for parent in parents:
                empty_children = [child for child in xml_utils.get_elements(parent, xpath='./*')
                                  if xml_utils.is_empty_element(child)]
                for child in empty_children:
                    removed_any = True
                    parent.remove(child)

    click.echo(ET.tostring(root))