def process(self, xml, root):
    nodes = self.parse_nodes(xml)
    intro_node, nodes = self.separate_intro(nodes)
    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        tagged_text_list = []
        if hasattr(root, 'tagged_text'):
            tagged_text_list.append(root.tagged_text)
        if hasattr(intro_node, 'tagged_text'):
            tagged_text_list.append(intro_node.tagged_text)
        if tagged_text_list:
            root.tagged_text = ' '.join(tagged_text_list)
    if nodes:
        markers = [node.label[0] for node in nodes]
        constraints = self.additional_constraints()
        depths = derive_depths(markers, constraints)
        if not depths:
            fails_at = debug_idx(markers, constraints)
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                xml.tag, root.label_id(),
                derive_depths(markers[:fails_at],
                              constraints)[0].pretty_str(),
                markers[fails_at], markers[fails_at + 1:])
        depths = self.select_depth(depths)
        return self.build_hierarchy(root, nodes, depths)
    else:
        return root
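# A minimal sketch (not from the source) of the failure-diagnosis pattern
# used in the error branch above. It assumes only what the snippet shows:
# debug_idx(markers, constraints) returns the index of the first marker at
# which the constraint problem becomes unsolvable, and derive_depths on the
# solvable prefix yields at least one solution with a pretty_str() method.
def explain_failure(markers, constraints):
    """Return (solution for the solvable prefix, bad marker, remainder)."""
    fails_at = debug_idx(markers, constraints)
    partial = derive_depths(markers[:fails_at], constraints)[0]
    return partial, markers[fails_at], markers[fails_at + 1:]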
def test_simple_stars(self):
    results = derive_depths(['A', '1', STARS_TAG, 'd'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 2, 2], [r.depth for r in results[0]])

    results = derive_depths(['A', '1', 'a', STARS_TAG, 'd'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 2, 2, 2], [r.depth for r in results[0]])
def test_star_star(self):
    results = derive_depths(['A', STARS_TAG, STARS_TAG, 'D'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 0, 0], [r.depth for r in results[0]])

    results = derive_depths(['A', INLINE_STARS, STARS_TAG, 'D'])
    self.assertEqual(2, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 1, 2, 2] in results)
    self.assertTrue([0, 1, 0, 0] in results)
def test_inline_star(self):
    results = derive_depths(['1', STARS_TAG, '2'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 0], [r.depth for r in results[0]])

    results = derive_depths(['1', INLINE_STARS, '2'])
    self.assertEqual(2, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 0] in results)
    self.assertTrue([0, 1, 0] in results)
def test_depth_type_order(self):
    extra = rules.depth_type_order([markers.ints, markers.lower])
    results = derive_depths(['1', 'a'], [extra])
    self.assertEqual(1, len(results))
    results = derive_depths(['i', 'a'], [extra])
    self.assertEqual(0, len(results))

    extra = rules.depth_type_order([(markers.ints, markers.roman),
                                    markers.lower])
    results = derive_depths(['1', 'a'], [extra])
    self.assertEqual(1, len(results))
    results = derive_depths(['i', 'a'], [extra])
    self.assertEqual(1, len(results))
def test_mix_levels_roman_alpha(self):
    results = derive_depths(['A', '1', '2', 'i', 'ii', 'iii', 'iv', 'B',
                             '1', 'a', 'b', '2', 'a', 'b', 'i', 'ii',
                             'iii', 'c'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 2, 2, 2, 2, 0, 1, 2, 2, 1, 2, 2, 3, 3, 3, 2],
                     [r.depth for r in results[0]])
def test_alpha_roman_ambiguous(self):
    results = derive_depths(['i', 'ii', STARS_TAG, 'v', STARS_TAG, 'vii'])
    self.assertEqual(3, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 1, 1, 2, 2] in results)
    self.assertTrue([0, 0, 1, 1, 0, 0] in results)
    self.assertTrue([0, 0, 0, 0, 0, 0] in results)
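# A hypothetical helper (not part of the test suite above) illustrating why
# these markers are ambiguous: 'i' parses both as a lower-case letter and as
# a roman numeral, so derive_depths returns several candidate hierarchies.
# Only the derive_depths API already used throughout this listing is assumed.
def show_candidate_depths(markers):
    """Print every depth assignment derive_depths finds for the markers."""
    for solution in derive_depths(markers):
        print([assignment.depth for assignment in solution])

# e.g. show_candidate_depths(['h', 'i']) would print both the sibling
# reading (h, i at depth 0) and the nested reading (roman 'i' under 'h').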
def end_group(self):
    """We've hit a header (or the end of the appendix), so take the
    collected paragraphs and determine their depths and insert into the
    heap accordingly"""
    if self.nodes:
        nodes = list(reversed(self.nodes))
        markers = [n.label[-1] for n in self.nodes
                   if not AppendixProcessor.filler_regex.match(n.label[-1])]
        if markers:
            results = derive_depths(markers)
            # currently no heuristics applied
            depths = list(reversed(
                [a.depth for a in results[0].assignment]))
        else:
            depths = []
        depth_zero = None   # relative for beginning of marker depth
        self.depth += 1
        while nodes:
            node = nodes.pop()
            if AppendixProcessor.filler_regex.match(node.label[-1]):
                # Not a marker paragraph
                self.m_stack.add(self.depth, node)
            else:
                depth = depths.pop()
                # Match old behavior, placing marker paragraphs as
                # children within non-marker paragraphs above
                if depth_zero is None:
                    depth_zero = self.depth_zero_finder(node)
                self.depth = depth_zero + depth
                self.m_stack.add(self.depth, node)
        self.nodes = []
def end_group(self):
    """We've hit a header (or the end of the appendix), so take the
    collected paragraphs and determine their depths and insert into the
    heap accordingly"""
    if self.nodes:
        nodes = list(reversed(self.nodes))
        marker_list = [
            n.label[-1] for n in self.nodes
            if not AppendixProcessor.filler_regex.match(n.label[-1])
        ]
        if marker_list:
            results = derive_depths(marker_list)
            # currently no heuristics applied
            depths = list(
                reversed([a.depth for a in results[0].assignment]))
        else:
            depths = []
        depth_zero = None   # relative for beginning of marker depth
        self.depth += 1
        while nodes:
            node = nodes.pop()
            if AppendixProcessor.filler_regex.match(node.label[-1]):
                # Not a marker paragraph
                self.m_stack.add(self.depth, node)
            else:
                depth = depths.pop()
                # Match old behavior, placing marker paragraphs as
                # children within non-marker paragraphs above
                if depth_zero is None:
                    depth_zero = self.depth_zero_finder(node)
                self.depth = depth_zero + depth
                self.m_stack.add(self.depth, node)
        self.nodes = []
def add_nodes_to_stack(nodes, inner_stack):
    """Calculate most likely depth assignments to each node; add to the
    provided stack"""
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def test_double_stars(self):
    results = derive_depths(['A', '1', 'a', STARS_TAG, STARS_TAG, 'B'])
    self.assertEqual(3, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 1, 2, 2, 1, 0] in results)
    self.assertTrue([0, 1, 2, 3, 2, 0] in results)
    self.assertTrue([0, 1, 2, 3, 1, 0] in results)
def test_i_ambiguity(self):
    results = derive_depths(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])
    self.assertEqual(2, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 0, 0, 0, 0, 0, 0, 0] in results)
    self.assertTrue([0, 0, 0, 0, 0, 0, 0, 0, 1] in results)

    results = derive_depths(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                             'j'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [r.depth for r in results[0]])

    results = derive_depths(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                             'ii'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
                     [r.depth for r in results[0]])
def test_start_star(self):
    results = derive_depths([STARS_TAG, 'c', '1', STARS_TAG, 'ii', 'iii',
                             '2', 'i', 'ii', STARS_TAG, 'v', STARS_TAG,
                             'vii', 'A'])
    self.assertEqual(4, len(results))
    results = [[r.depth for r in result] for result in results]
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3] in results)
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 3, 3, 2, 2, 3] in results)
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 3, 3, 4, 4, 5] in results)
    self.assertTrue([0, 0, 1, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1, 2] in results)
def process(self, xml, root):
    nodes = self.parse_nodes(xml)
    intro_node, nodes = self.separate_intro(nodes)
    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        tagged_text_list = []
        if getattr(root, 'tagged_text', None):
            tagged_text_list.append(root.tagged_text)
        if getattr(intro_node, 'tagged_text', None):
            tagged_text_list.append(intro_node.tagged_text)
        if tagged_text_list:
            root.tagged_text = ' '.join(tagged_text_list)
    if nodes:
        markers = [node.label[0] for node in nodes]
        constraints = self.additional_constraints()
        depths = derive_depths(markers, constraints)
        if not depths:
            logging.warning("Could not derive paragraph depths."
                            " Retrying with relaxed constraints.")
            deemphasized_markers = [deemphasize(m) for m in markers]
            constraints = self.relaxed_constraints()
            depths = derive_depths(deemphasized_markers, constraints)
        if not depths:
            fails_at = debug_idx(markers, constraints)
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                xml.tag, root.label_id(),
                derive_depths(markers[:fails_at],
                              constraints)[0].pretty_str(),
                markers[fails_at], markers[fails_at + 1:])
        depths = self.select_depth(depths)
        return self.build_hierarchy(root, nodes, depths)
    else:
        return root
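# The relaxed-constraints retry above calls a deemphasize() helper that this
# listing never defines. A plausible sketch only, assuming it strips the
# <E T="03">...</E> emphasis tags that other snippets here scrub from node
# labels, so an emphasized marker can match its plain form:
def deemphasize_sketch(marker):
    """Strip FR emphasis tags from a paragraph marker (assumed behavior)."""
    return marker.replace('<E T="03">', '').replace('</E>', '')

# e.g. deemphasize_sketch('<E T="03">1</E>') == '1'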
def outline_depths(markers):
    """
    Infer an outline's structure.

    Return a list of outline depths for a given list of space-separated
    markers.
    """
    # Input is space-separated.
    marker_list = markers.split(' ')
    all_solutions = derive_depths(marker_list,
                                  [optional_rules.limit_sequence_gap(1)])
    depths = {tuple(str(a.depth) for a in s) for s in all_solutions}.pop()
    # Expected output is space-separated.
    formatted_output = ' '.join(depths)
    click.echo(formatted_output)
def end_group(self):
    """We've hit a header (or the end of the appendix), so take the
    collected paragraphs and determine their depths and insert into the
    heap accordingly"""
    if self.nodes:
        nodes = list(reversed(self.nodes))
        markers = [
            n.label[-1] for n in self.nodes
            if not AppendixProcessor.filler_regex.match(n.label[-1])
        ]
        if markers:
            results = derive_depths(markers)
            if not results:
                logging.warning(
                    'Could not derive depth from {}'.format(markers))
                depths = []
            else:
                depths = list(
                    reversed([a.depth for a in results[0].assignment]))
        else:
            depths = []
        depth_zero = None   # relative for beginning of marker depth
        self.depth += 1
        while nodes:
            node = nodes.pop()
            if (AppendixProcessor.filler_regex.match(node.label[-1])
                    or depths == []):
                # Not a marker paragraph, or a marker paragraph that isn't
                # actually part of a hierarchy (e.g. Appendix C to 1024,
                # notice 2013-28210)
                self.m_stack.add(self.depth, node)
            else:
                depth = depths.pop()
                # Match old behavior, placing marker paragraphs as
                # children within non-marker paragraphs above
                if depth_zero is None:
                    depth_zero = self.depth_zero_finder(node)
                self.depth = depth_zero + depth
                self.m_stack.add(self.depth, node)
        self.nodes = []
def get_depths(self, subsections):
    markers = [s[0] for s in subsections]
    if self.markers_are_valid(markers):
        solution = derive_depths(markers, self.additional_constraints)
        # if not solution:  # still stall out sometimes
        #     solution = derive_depths(markers, self.relaxed_constraints)
    else:
        solution = None
    if solution:
        depths = [assignment.depth for assignment in solution[0]]
    else:
        depths = [0] * len(markers)
        print("..PARSE FAILED")
        # print(markers)
        # print(subsections)
        self.section_failures += 1
    result = []
    for depth, subsection in zip(depths, subsections):
        result.append((depth, subsection[0], subsection[1]))
    return result
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()"""
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search('[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        pass

    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        elif not first_marker and not manual_hierarchy:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            if nodes:
                previous = nodes[-1]
            else:
                previous = parent
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments;
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                     (mtypes.lower, mtypes.roman,
                                      mtypes.upper),
                                     mtypes.upper, mtypes.em_ints,
                                     mtypes.em_roman])])
    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')
    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
def assert_depth_match_extra(self, markers, extra, *depths_set):
    """Verify that the set of markers resolves to the provided set of
    depths (in any order). Allows extra constraints."""
    solutions = derive_depths(markers, extra)
    results = {tuple(a.depth for a in s) for s in solutions}
    six.assertCountEqual(self, results, {tuple(s) for s in depths_set})
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def test_repeat_alpha(self):
    results = derive_depths(['A', '1', 'a', 'i', 'ii', 'a', 'b', 'c', 'b'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 2, 3, 3, 4, 4, 4, 2],
                     [r.depth for r in results[0]])
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy = []
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy = PARAGRAPH_HIERARCHY[reg_part][
            section_no_without_marker]

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        # If the child has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if ch.get("depth") is not None:
            manual_hierarchy.append(int(ch.get("depth")))

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list and manual_hierarchy:
            # is this a bunch of definitions that don't have numbers next
            # to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        elif not markers_list and not manual_hierarchy:
            # No manual hierarchy specified, append to the section.
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, spec in zip(nodes, depths):
                if isinstance(spec, int):
                    depth = spec
                elif isinstance(spec, tuple):
                    depth, marker = spec
                    node.marker = marker
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error('Manual hierarchy length does not match node '
                          'list length! ({0} nodes but {1} provided, '
                          '{2})'.format(len(nodes), len(depths),
                                        [x.label[0] for x in nodes]))
    elif nodes and not manual_hierarchy:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker,
                [node.label[0] for node in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def test_alpha_ints(self):
    results = derive_depths(['A', '1', '2', '3'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 1], [r.depth for r in results[0]])
def test_alpha_ints_jump_back(self):
    results = derive_depths(['A', '1', '2', '3', 'B', '1', '2', '3', 'C'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 1, 0, 1, 1, 1, 0],
                     [r.depth for r in results[0]])
def test_roman_alpha(self):
    results = derive_depths(['a', '1', '2', 'b', '1', '2', '3', '4', 'i',
                             'ii', 'iii', '5', 'c', 'd', '1', '2', 'e'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 1, 1, 0, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 1, 1, 0],
                     [r.depth for r in results[0]])
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy_flag = False
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers next
            # to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    # nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths(
            [n.label[0] for n in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(len(nodes),
                                                       len(depths)))
    elif nodes and not manual_hierarchy_flag:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def assert_depth_match_extra(self, markers, extra, *depths_set):
    """Verify that the set of markers resolves to the provided set of
    depths (in any order). Allows extra constraints."""
    solutions = derive_depths(markers, extra)
    results = [[a.depth for a in s] for s in solutions]
    self.assertItemsEqual(results, depths_set)
def test_ints(self):
    results = derive_depths(['1', '2', '3', '4'])
    self.assertEqual(1, len(results))
    self.assertEqual([0, 0, 0, 0], [r.depth for r in results[0]])
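# A minimal standalone sketch of the derive-then-pick-best pattern shared by
# the snippets above. The import paths are an assumption based on the
# regulations-parser layout; the rest uses only behavior shown in this
# listing: derive_depths returns candidate solutions, each iterable of
# assignments with a .depth, and each solution carries a heuristic .weight.
from regparser.tree.depth import heuristics
from regparser.tree.depth.derive import derive_depths

markers = ['a', '1', '2', 'b']
solutions = derive_depths(markers)  # every hierarchy consistent with markers
if solutions:
    # Penalize unlikely shapes, then keep the highest-weighted assignment
    solutions = heuristics.prefer_multiple_children(solutions, 0.5)
    best = max(solutions, key=lambda s: s.weight)
    print([assignment.depth for assignment in best])  # e.g. [0, 1, 1, 0]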