def test_get_paragraph_marker(self):
    """Leading paragraph markers are extracted in order.

    Only the markers at the very start of the text are expected back; a
    marker that appears later in the sentence (the trailing "(j)") is not.
    """
    cases = (
        ('(k)(2)(iii) abc (j)', ['k', '2', 'iii']),
        ('(i)(A) The minimum period payment', ['i', 'A']),
    )
    for text, expected in cases:
        found = list(tree_utils.get_paragraph_markers(text))
        self.assertEqual(expected, found)
def test_get_paragraph_marker(self):
    """Leading paragraph markers are extracted in order.

    Only the markers at the very start of the text are expected back; a
    marker that appears later in the sentence (the trailing "(j)") is not.
    """
    cases = (
        ("(k)(2)(iii) abc (j)", ["k", "2", "iii"]),
        ("(i)(A) The minimum period payment", ["i", "A"]),
    )
    for text, expected in cases:
        found = list(tree_utils.get_paragraph_markers(text))
        self.assertEqual(expected, found)
def _appendix_marker_text(elem):
    """Join an element's leading text with the tails of its children.

    ``elem.text`` is None when the element starts directly with a child
    element, so it is replaced by an empty string before joining.
    """
    return ' '.join([elem.text or ''] + [c.tail for c in elem if c.tail])


def process_appendix(m_stack, current_section, child):
    """Add appendix nodes built from the children of ``child`` to ``m_stack``.

    * An 'HD' child starts a new appendix section node at level 2. Its label
      comes from ``get_appendix_section_number``, falling back to
      ``determine_next_section`` when that returns None.
    * A 'P' child with paragraph markers becomes one node per marker, with
      the paragraph text split between them when there are several.
    * A 'P' child without markers is appended to the text of the node
      returned by ``m_stack.peek_last()``.

    Nothing is returned; ``m_stack`` is updated in place.
    """
    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)
            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)

            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)
            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)

        if ch.tag == 'P':
            markers_list = tree_utils.get_paragraph_markers(
                _appendix_marker_text(ch))
            node_text = tree_utils.get_node_text(ch)

            if len(markers_list) == 0:
                # No marker: this paragraph continues the previous node.
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
                continue

            if len(markers_list) > 1:
                actual_markers = ['(%s)' % m for m in markers_list]
                node_texts = tree_utils.split_text(node_text, actual_markers)
            else:
                node_texts = [node_text]

            for m, marker_text in zip(markers_list, node_texts):
                n = Node(
                    marker_text, label=[str(m)], node_type=Node.APPENDIX)

                last = m_stack.peek()
                node_level = determine_level(m, last[0][0])

                if m == 'i':
                    # HACK: the Roman numeral (i) and the letter (i) cannot
                    # easily be told apart, so we look ahead: if the next
                    # sibling starts with (ii), treat this one as a Roman
                    # numeral. This is not a complete solution and should
                    # be revisited.
                    following = ch.getnext()
                    if following is not None:
                        next_markers = tree_utils.get_paragraph_markers(
                            _appendix_marker_text(following))
                        # The next paragraph may carry no marker at all.
                        if len(next_markers) > 0 and next_markers[0] == 'ii':
                            node_level = 5

                tree_utils.add_to_stack(m_stack, node_level, n)
def build_section(reg_part, section_xml):
    """Build the node tree for one section element.

    Each 'P' child is split on its leading paragraph markers and turned into
    one ``Node`` per marker, placed on a ``NodeStack`` at the level given by
    ``determine_level``. Paragraphs without markers are collected and become
    part of the section node's own text.

    The section number is taken from the title (SECTNO text plus SUBJECT
    text, when present) by matching ``<reg_part>.<digits>``.

    Returns the section node with its paragraphs unwound beneath it, or None
    when the title does not contain a section number for ``reg_part``
    (non-regtext sections sometimes get mixed in).
    """
    p_level = 1
    m_stack = NodeStack()
    section_texts = []
    for ch in section_xml.getchildren():
        if ch.tag != 'P':
            continue

        # ch.text is None when the paragraph starts with a child element.
        text = ' '.join([ch.text or ''] + [c.tail for c in ch if c.tail])
        markers_list = tree_utils.get_paragraph_markers(text)
        node_text = tree_utils.get_node_text(ch)

        if len(markers_list) > 1:
            actual_markers = ['(%s)' % m for m in markers_list]
            node_texts = tree_utils.split_text(node_text, actual_markers)
        elif markers_list:
            node_texts = [node_text]
        else:
            # Does not contain paragraph markers
            section_texts.append(node_text)
            node_texts = []

        for m, marker_text in zip(markers_list, node_texts):
            n = Node(marker_text, [], [str(m)])

            new_p_level = determine_level(m, p_level)
            last = m_stack.peek()
            if len(last) == 0:
                m_stack.push_last((new_p_level, n))
            else:
                tree_utils.add_to_stack(m_stack, new_p_level, n)
            p_level = new_p_level

    section_title = section_xml.xpath('SECTNO')[0].text
    subject_text = section_xml.xpath('SUBJECT')[0].text
    if subject_text:
        section_title += " " + subject_text

    section_number_match = re.search(r'%s\.(\d+)' % reg_part, section_title)
    # Sometimes not reg text sections get mixed in
    if not section_number_match:
        return None

    section_number = section_number_match.group(1)
    # section_xml.text is None when the element has no leading text.
    section_text = ' '.join([section_xml.text or ''] + section_texts)
    sect_node = Node(
        section_text, label=[reg_part, section_number],
        title=section_title)

    m_stack.add_to_bottom((1, sect_node))

    while m_stack.size() > 1:
        tree_utils.unwind_stack(m_stack)

    return m_stack.pop()[0][1]
def overwrite_marker(origin, new_label):
    """Rewrite the first marker in ``origin.text`` to use ``new_label``.

    The node already carries a label, but during a node move it is given a
    new one, so the marker embedded in its text has to change with it.

    * REGTEXT nodes: the first paragraph marker "(x)" becomes
      "(new_label)". The text is left alone if no marker is found.
    * INTERP nodes: the first interpretation marker "x." becomes
      "new_label.".
    * Any other node type is left untouched.

    Only the first occurrence is replaced. ``origin`` is modified in place
    and also returned.
    """
    kind = origin.node_type
    old_marker = None

    if kind == Node.REGTEXT:
        found = tree_utils.get_paragraph_markers(origin.text)
        if len(found) > 0:
            old_marker = '(%s)' % found[0]
            replacement = '(%s)' % new_label
    elif kind == Node.INTERP:
        first = interpretations.get_first_interp_marker(origin.text)
        old_marker = first + '.'
        replacement = new_label + '.'

    if old_marker is not None:
        origin.text = origin.text.replace(old_marker, replacement, 1)
    return origin
def get_markers(text):
    """Return the paragraph markers in ``text``, initial then collapsed.

    When both kinds are present, a collapsed marker is kept only if at least
    one of its possible levels is greater than at least one possible level
    of the last initial marker, i.e. it could sit below that marker.
    """
    initial = tree_utils.get_paragraph_markers(text)
    collapsed = tree_utils.get_collapsed_markers(text)

    if initial and collapsed:
        base_levels = p_level_of(initial[-1])

        def could_be_deeper(candidate):
            candidate_levels = p_level_of(candidate)
            return any(c > f
                       for f in base_levels
                       for c in candidate_levels)

        collapsed = [cm for cm in collapsed if could_be_deeper(cm)]

    return list(initial) + list(collapsed)
def get_markers(text, next_marker=None):
    """Return the paragraph markers in ``text``, initial then collapsed.

    Collapsed markers are only looked for when ``next_marker`` is given.
    When both initial and collapsed markers exist, the collapsed ones are
    checked for plausibility:

    * each must be deeper than the last initial marker (``_deeper_level``);
    * trailing ones are dropped until one is continued by ``next_marker``
      (``_continues_collapsed``).
    """
    initial = tree_utils.get_paragraph_markers(text)
    collapsed = (
        [] if next_marker is None
        else tree_utils.get_collapsed_markers(text)
    )

    if initial and collapsed:
        last_initial = initial[-1]
        collapsed = [c for c in collapsed if _deeper_level(last_initial, c)]
        # Trim from the end until the tail marker leads into next_marker.
        while collapsed and not _continues_collapsed(collapsed[-1],
                                                     next_marker):
            collapsed.pop()

    return initial + collapsed
def test_get_paragraph_marker(self):
    """Only the leading run of markers is returned, not the later "(j)"."""
    text = '(k)(2)(iii) abc (j)'
    expected = ['k', '2', 'iii']
    found = list(tree_utils.get_paragraph_markers(text))
    self.assertEqual(expected, found)