def test_process_inner_child(self):
    """Markers collapsed into one paragraph, split by STARS, or wrapped
    in emphasis should still produce the chain 1 -> i -> A -> 1."""
    markup = """ <ROOT> <HD>Title</HD> <P>1. 111. i. iii</P> <STARS /> <P>A. AAA</P> <P><E T="03">1.</E> eee</P> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    first_entry = node_stack.m_stack[0][0]
    level_one = first_entry[1]
    self.assertEqual(['1'], level_one.label)
    self.assertEqual(1, len(level_one.children))

    level_two = level_one.children[0]
    self.assertEqual(['1', 'i'], level_two.label)
    self.assertEqual(level_two.text.strip(), 'i. iii')
    self.assertEqual(1, len(level_two.children))

    level_three = level_two.children[0]
    self.assertEqual(['1', 'i', 'A'], level_three.label)
    self.assertEqual(1, len(level_three.children))

    level_four = level_three.children[0]
    self.assertEqual(['1', 'i', 'A', '1'], level_four.label)
    self.assertEqual(0, len(level_four.children))
def test_process_inner_child_has_citation(self):
    """A citation inside the paragraph text ("comment 22(a)-2.i.") must
    not be mistaken for a marker: the node ends up with no children."""
    markup = """ <ROOT> <HD>Title</HD> <P>1. Something something see comment 22(a)-2.i. please</P> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    first_entry = node_stack.m_stack[0][0]
    paragraph = first_entry[1]
    self.assertEqual(0, len(paragraph.children))
def test_process_inner_child_incorrect_xml(self):
    """With "1." and "2." emphasized but "i." plain, the bottom level of
    the unwound stack is expected to hold exactly two entries."""
    markup = """ <ROOT> <HD>Title</HD> <P><E T="03">1.</E> 111</P> <P>i. iii</P> <P><E T="03">2.</E> 222 Incorrect Content</P> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    bottom_level = node_stack.m_stack[0]
    self.assertEqual(2, len(bottom_level))
def test_process_inner_child_no_marker(self):
    """A paragraph without any marker ("Howdy Howdy") is expected to be
    folded into the text of the preceding node rather than becoming a
    node of its own."""
    markup = """ <ROOT> <HD>Title</HD> <P>1. 111</P> <P>i. iii</P> <P>Howdy Howdy</P> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    first_entry = node_stack.m_stack[0][0]
    outer = first_entry[1]
    self.assertEqual(1, len(outer.children))

    inner = outer.children[0]
    self.assertEqual(0, len(inner.children))
    self.assertEqual(inner.text.strip(), "i. iii\n\nHowdy Howdy")
def test_process_inner_child_collapsed_i(self):
    """An "i." marker collapsed behind an emphasized keyterm in the "1."
    paragraph, followed by a separate "ii." paragraph, should give node 1
    two leaf children: 1-i and 1-ii."""
    markup = """ <ROOT> <HD>Title</HD> <P>1. <E T="03">Keyterm text</E> i. Content content</P> <P>ii. Other stuff</P> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    first_entry = node_stack.m_stack[0][0]
    parent = first_entry[1]
    self.assertEqual(['1'], parent.label)
    self.assertEqual(2, len(parent.children))

    first_child = parent.children[0]
    self.assertEqual(['1', 'i'], first_child.label)
    self.assertEqual(0, len(first_child.children))

    second_child = parent.children[1]
    self.assertEqual(['1', 'ii'], second_child.label)
    self.assertEqual(0, len(second_child.children))
def test_process_inner_child_space(self):
    """An "A." in the middle of a sentence ("See country A. Not that
    country") must not open a new level: node 1-i stays a leaf."""
    markup = """ <ROOT> <HD>Title</HD> <P>1. 111</P> <P>i. See country A. Not that country</P> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    first_entry = node_stack.m_stack[0][0]
    outer = first_entry[1]
    self.assertEqual(['1'], outer.label)
    self.assertEqual(1, len(outer.children))

    inner = outer.children[0]
    self.assertEqual(['1', 'i'], inner.label)
    self.assertEqual(0, len(inner.children))
def test_process_inner_child_stars_and_inline(self):
    """STARS elements around the paragraphs, plus inline "* * *" text,
    should still leave "xi." nested directly beneath "2." as a leaf."""
    markup = """ <ROOT> <HD>Title</HD> <STARS /> <P>2. Content. * * *</P> <STARS /> <P>xi. Content</P> <STARS /> </ROOT>"""
    document = etree.fromstring(markup)
    header = document.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    # Collapse everything down to the bottom level of the stack
    while node_stack.size() > 1:
        node_stack.unwind()

    first_entry = node_stack.m_stack[0][0]
    parent = first_entry[1]
    self.assertEqual(['2'], parent.label)
    self.assertEqual(1, len(parent.children))

    only_child = parent.children[0]
    self.assertEqual(['2', 'xi'], only_child.label)
    self.assertEqual(0, len(only_child.children))
def process_without_headers(cfr_part, parent_xml, amended_labels):
    """Sometimes, we only get a list of paragraphs that have changes, but no
    header indicating with which sections they are associated. Accommodate
    by trying to match up amended_labels with paragraphs.

    Labels are matched strictly in order: the next unmatched label is
    compared against each child of ``parent_xml`` in turn, and a child
    matches when its text starts with the label's last component followed
    by a period.

    :param cfr_part: used as the first label component of the root
        interpretation node.
    :param parent_xml: XML element whose children are the paragraphs; it is
        passed through ``standardize_xml`` first, and matched tails are
        removed from that standardized element as they are consumed.
    :param amended_labels: iterable of amendments; only those accepted by
        ``_is_interp_amend`` are considered, and their ``label`` attribute
        must contain ``Node.INTERP_MARK``.
    :returns: the first element of ``treeify`` applied to the root node
        followed by the per-label nodes in document order, or ``None``
        when no label could be matched to a paragraph.
    """
    parent_xml = standardize_xml(parent_xml)

    relevant_labels = [al.label for al in amended_labels
                       if _is_interp_amend(al)]

    label_indices = []
    for idx, child in enumerate(parent_xml):
        text = tree_utils.get_node_text(child)
        if len(relevant_labels) > len(label_indices):
            # Only ever look for the next label that has not been matched
            marker = relevant_labels[len(label_indices)][-1] + '.'
            if text.startswith(marker):
                label_indices.append(idx)

    # zip() returns a one-shot iterator on Python 3, which reversed()
    # rejects with a TypeError; materialise the pairs first.
    label_and_index = list(zip(relevant_labels, label_indices))

    nodes = []
    # Reverse it so we can delete from the bottom
    for label, idx in reversed(label_and_index):
        stack = tree_utils.NodeStack()
        prefix = label[:label.index(Node.INTERP_MARK) + 1]
        section = Node(node_type=Node.INTERP, label=prefix)
        stack.add(2, section)
        # The element just before the matched paragraph is handed over.
        # NOTE(review): when idx == 0 this wraps around to the last child
        # of parent_xml -- confirm whether that case can occur.
        interpretations.process_inner_children(stack, parent_xml[idx - 1])
        while stack.size() > 1:
            stack.unwind()
        nodes.append(stack.m_stack[0][0][1])

        # delete the tail, so earlier labels do not see these paragraphs
        while len(parent_xml) > idx:
            parent_xml.remove(parent_xml[idx])

    if not nodes:
        return None

    nodes.append(Node(node_type=Node.INTERP,
                      label=[cfr_part, Node.INTERP_MARK]))
    # Reverse it again into normal flow
    return treeify(list(reversed(nodes)))[0]