def test_text_to_labels(self):
    """Check the labels text_to_labels derives from interp headings.

    Every heading is parsed against a comment Label for part 1111 and
    the result is compared with the expected list of labels.
    """
    cases = [
        # Only the leading paragraph is expected as a label here; the
        # section citation after it is not
        (u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and (b)(2)",
         [['1111', '9', 'c', '2', 'iii', 'Interp']]),
        # "(b)(8)" is expected to pick up the "4" from "4(b)(7)"
        ("Paragraphs 4(b)(7) and (b)(8).",
         [['1111', '4', 'b', '7', 'Interp'],
          ['1111', '4', 'b', '8', 'Interp']]),
        ("Appendices G and H-Something",
         [['1111', 'G', 'Interp'], ['1111', 'H', 'Interp']]),
        ("Paragraph 38(l)(7)(i)(A)(2).",
         [['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']]),
    ]
    for heading, expected in cases:
        context = Label(part='1111', comment=True)
        self.assertEqual(
            expected, interpretation.text_to_labels(heading, context))
def _p_with_label_in_child(xml_node):
    """Match a paragraph whose label sits in its only child element.

    E.g. <P><E>22(a)</E>.</P>

    Returns False unless xml_node is a P tag with no text of its own and
    exactly one child whose tail is nothing but whitespace or periods.
    When all of that holds, the result of text_to_labels on the child's
    text is returned as-is.
    """
    children = xml_node.getchildren()
    if xml_node.tag.upper() != 'P':
        return False
    if (xml_node.text or '').strip():
        # The paragraph has leading text of its own
        return False
    if len(children) != 1:
        return False

    only_child = children[0]
    if (only_child.tail or '').strip(" \n\t."):
        # Something other than whitespace/periods follows the child
        return False
    return text_to_labels(only_child.text, Label(), warn=False)
def is_title(xml_node):
    """Decide whether xml_node acts as an interpretation title.

    Not all titles are created equal: besides HD tags, a title may show
    up as a paragraph tag. Three shapes count as a title:

    * an HD tag whose SOURCE attribute is anything but 'HED';
    * a P tag with no text of its own and a single child, followed only
      by whitespace or periods, whose text yields labels;
    * a childless P tag whose text does not begin with an interp marker
      and yields labels when parsing is forced to the start of the text.

    Always returns a bool.
    """
    children = xml_node.getchildren()
    tag = xml_node.tag.upper()

    if tag == 'HD':
        return xml_node.attrib['SOURCE'] != 'HED'
    if tag != 'P':
        return False

    text = xml_node.text
    if len(children) == 1:
        # Label wrapped in a single child, e.g. <P><E>22(a)</E>.</P>
        only_child = children[0]
        if text is not None and text.strip():
            return False
        tail = only_child.tail
        if tail is not None and tail.strip(" \n\t."):
            return False
        return bool(text_to_labels(only_child.text, Label(), warn=False))

    if children:
        # Two or more children never make a title
        return False

    # Plain paragraph, e.g. <P>22(a)</P>
    if not text:
        return False
    if get_first_interp_marker(text):
        return False
    return bool(
        text_to_labels(text, Label(), warn=False, force_start=True))
def _non_interp_p_with_label(xml_node):
    """Match a childless paragraph whose text starts with a label.

    E.g. <P>22(a)</P> but not <P>ii. 22(a)</P>

    The result is only meant to be read for its truthiness: it is False
    when a structural check fails, the (empty or missing) text itself
    when there is no text, and otherwise whatever text_to_labels returns.
    """
    if xml_node.tag.upper() != 'P':
        return False
    if xml_node.getchildren():
        return False

    text = xml_node.text
    if not text:
        return text
    if get_first_interp_marker(text):
        # The text opens with an interp marker, so it is not a label
        return False
    return text_to_labels(text, Label(), warn=False, force_start=True)
def _p_with_label_in_child(xml_node):
    """Detect a P tag that carries its label inside a lone child tag.

    E.g. <P><E>22(a)</E>.</P>

    The paragraph must have no text of its own and exactly one child,
    and only whitespace or periods may follow that child. If so, the
    labels parsed from the child's text are returned; otherwise False.
    """
    kids = xml_node.getchildren()
    is_bare_paragraph = (
        xml_node.tag.upper() == 'P'
        and not (xml_node.text or '').strip())
    if not is_bare_paragraph or len(kids) != 1:
        return False

    # Whitespace and periods after the child are tolerated
    trailing = (kids[0].tail or '').strip(" \n\t.")
    if trailing:
        return False
    return text_to_labels(kids[0].text, Label(), warn=False)
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing.

    Shared by whole XML parsing and notice parsing. root is the root
    interpretation node (e.g. a Node with label '1005-Interp'), while
    xml_nodes contains all XML nodes which will be relevant to the
    interpretations. Only nodes recognized as titles start a new
    interpretation node here; each one is handed to
    process_inner_children along with its own node stack. The collected
    nodes are treeified and the first element of that result is returned.
    """
    collected = [root]
    previous_label = root.label
    unlabeled_headers = 0

    for xml_child in xml_nodes:
        context = Label.from_node(
            Node(label=previous_label, node_type=Node.INTERP))
        heading = tree_utils.get_node_text(xml_child, add_spaces=True)

        # "Subpart" headers are explicitly ignored: they are inconsistent
        # and will be reconstructed as subterps client-side
        if not is_title(xml_child) or 'subpart' in heading.lower():
            continue

        found = text_to_labels(heading, context)
        if found:
            current_label = merge_labels(found)
        else:
            # Header without a label, like an Introduction, etc.
            unlabeled_headers += 1
            current_label = root.label[:2] + ['h%d' % unlabeled_headers]

        stack = tree_utils.NodeStack()
        collected.extend(missing_levels(previous_label, current_label))
        previous_label = current_label

        title_node = Node(node_type=Node.INTERP, label=current_label,
                          title=heading.strip())
        stack.add(2, title_node)
        process_inner_children(stack, xml_child, parent=title_node)

        # Unwind until a single entry is left on the stack
        while stack.size() > 1:
            stack.unwind()
        collected.append(stack.m_stack[0][0][1])

    tree = treeify(collected)

    def strip_emphasis(tree_node):
        """Drop the '<E T="03">' markup from a node's label, recursively."""
        tree_node.label = [
            part.replace('<E T="03">', '') for part in tree_node.label]
        for child in tree_node.children:
            strip_emphasis(child)

    for top_level in tree:
        strip_emphasis(top_level)

    return tree[0]
def test_text_to_labels(self):
    """text_to_labels should return one label per cited paragraph.

    All headings are parsed in the context of a comment Label for part
    1111.
    """
    def labels_in(heading):
        return interpretation.text_to_labels(
            heading, Label(part='1111', comment=True))

    # The section citation following the paragraph is not expected to
    # produce a label
    self.assertEqual(
        [['1111', '9', 'c', '2', 'iii', 'Interp']],
        labels_in(u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) "
                  u"and (b)(2)"))

    # The second citation is expected to share the first one's "4"
    self.assertEqual(
        [['1111', '4', 'b', '7', 'Interp'],
         ['1111', '4', 'b', '8', 'Interp']],
        labels_in("Paragraphs 4(b)(7) and (b)(8)."))

    self.assertEqual(
        [['1111', 'G', 'Interp'], ['1111', 'H', 'Interp']],
        labels_in("Appendices G and H-Something"))

    self.assertEqual(
        [['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']],
        labels_in("Paragraph 38(l)(7)(i)(A)(2)."))
def per_node(node):
    """Record an interpretation node in self.lookup_table.

    Nodes that are not of the INTERP type, or whose label does not end
    in the interp marker, are ignored. Any other node is stored under
    its own label (marker removed) and under each label found in its
    title, never twice under the same key.
    """
    is_marked_interp = (
        node.node_type == struct.Node.INTERP
        and node.label[-1] == struct.Node.INTERP_MARK)
    if not is_marked_interp:
        return

    # Always add a connection based on the interp's label
    own_key = tuple(node.label[:-1])
    self.lookup_table[own_key].append(node)

    # Also add connections based on the title
    title_labels = text_to_labels(
        node.title or '', Label.from_node(node), warn=False)
    for title_label in title_labels:
        key = tuple(title_label[:-1])   # Remove Interp marker
        bucket = self.lookup_table[key]
        if node not in bucket:
            bucket.append(node)
def _non_interp_p_with_label(xml_node):
    """Detect a plain paragraph that opens with a label.

    E.g. <P>22(a)</P> but not <P>ii. 22(a)</P>

    Callers should rely on truthiness only. A failed structural check
    gives False, missing or empty text is handed back unchanged, and a
    paragraph passing every check gives the labels parsed from its text.
    """
    is_childless_paragraph = (
        xml_node.tag.upper() == 'P' and not xml_node.getchildren())
    if not is_childless_paragraph:
        return False

    content = xml_node.text
    if not content:
        return content

    # Text starting with an interp marker is not a label
    starts_with_marker = get_first_interp_marker(content)
    if starts_with_marker:
        return False
    return text_to_labels(content, Label(), warn=False, force_start=True)