def test_labels_until_sections(self):
    """We can fill in sections"""
    start = Label(cfr_title='11', part='222', section='33')
    end = Label(cfr_title='11', part='222', section='36')
    self.assertEqual(
        list(start.labels_until(end)),
        [Label(cfr_title='11', part='222', section='34'),
         Label(cfr_title='11', part='222', section='35')])

def test_determine_schema(self):
    self.assertEqual(Label.app_sect_schema,
                     Label.determine_schema({'appendix_section': '1'}))
    self.assertEqual(Label.app_schema,
                     Label.determine_schema({'appendix': 'A'}))
    self.assertEqual(Label.regtext_schema,
                     Label.determine_schema({'section': '12'}))
    self.assertEqual(None, Label.determine_schema({}))

def test_labels_until_paragraphs(self):
    """We can fill in paragraphs"""
    start = Label(cfr_title='11', part='222', section='33', p1='a', p2='2')
    end = Label(cfr_title='11', part='222', section='33', p1='a', p2='6')
    self.assertEqual(
        list(start.labels_until(end)),
        [Label(cfr_title='11', part='222', section='33', p1='a', p2='3'),
         Label(cfr_title='11', part='222', section='33', p1='a', p2='4'),
         Label(cfr_title='11', part='222', section='33', p1='a', p2='5')])

def segment_tree(text, part, parent_label):
    """Build a tree representing the interpretation of a section, paragraph,
    or appendix."""
    title, body = utils.title_body(text)
    exclude = [(pc.full_start, pc.full_end)
               for pc in internal_citations(body,
                                            Label(part=parent_label[0]))]
    label = merge_labels(text_to_labels(title,
                                        Label(part=part, comment=True)))
    return interpParser.build_tree(body, 1, exclude, label, title)

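# Usage sketch (hypothetical input, not from the original source):
# `interp_text` is assumed to begin with a header line such as
# "Section 1005.7 Disclosures" followed by the interpretation body.
#
#   segment_tree(interp_text, '1005', ['1005'])
#
# title_body splits off the header, internal citations in the body are
# excluded from paragraph-marker detection, and the header is converted
# into the merged label for the resulting tree.
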
def test_to_list(self):
    label = Label(part='222', section='11', p1='c', p2='2')
    self.assertEqual(['222', '11', 'c', '2'], label.to_list())

    label = Label(part='222', p1='d', appendix='R3')
    self.assertEqual(['222', 'R3', 'd'], label.to_list())

    label = Label(part='222', p1='d', appendix='R', appendix_section='4')
    self.assertEqual(['222', 'R', '4', 'd'], label.to_list())

def _p_with_label_in_child(xml_node):
    """E.g. <P><E>22(a)</E>.</P>"""
    children = xml_node.getchildren()
    return (xml_node.tag.upper() == 'P'
            and not (xml_node.text or '').strip()
            and len(children) == 1
            and not (children[0].tail or '').strip(" \n\t.")
            and text_to_labels(children[0].text, Label(), warn=False))

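# For instance (hypothetical XML snippets, assuming an lxml-style node):
#
#   node = etree.fromstring('<P><E>22(a)</E>.</P>')
#   _p_with_label_in_child(node)   # truthy: the label lives in the <E> child
#
#   node = etree.fromstring('<P>Some text<E>22(a)</E></P>')
#   _p_with_label_in_child(node)   # False: the <P> carries its own text
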
def node_definitions(self, node, stack=None):
    """Find defined terms in this node's text."""
    included_defs = []
    excluded_defs = []

    def add_match(n, term, pos):
        if self.is_exclusion(term, n):
            excluded_defs.append(Ref(term, n.label_id(), pos))
        else:
            included_defs.append(Ref(term, n.label_id(), pos))

    try:
        cfr_part = node.label[0]
    except IndexError:
        cfr_part = None

    if settings.INCLUDE_DEFINITIONS_IN.get(cfr_part):
        for included_term, context in settings.INCLUDE_DEFINITIONS_IN[
                cfr_part]:
            if context in node.text and included_term in node.text:
                pos_start = node.text.index(included_term)
                add_match(node, included_term.lower(),
                          (pos_start, pos_start + len(included_term)))

    if stack and self.has_parent_definitions_indicator(stack):
        for match, _, _ in grammar.smart_quotes.scanString(node.text):
            term = match.term.tokens[0].lower().strip(',.;')
            # Don't use pos_end because we are stripping some chars
            pos_start = match.term.pos[0]
            add_match(node, term, (pos_start, pos_start + len(term)))

    for match, _, _ in grammar.scope_term_type_parser.scanString(node.text):
        # Check that both scope and term look valid
        if (self.scope_of_text(match.scope, Label.from_node(node),
                               verify_prefix=False)
                and re.match("^[a-z ]+$", match.term.tokens[0])):
            term = match.term.tokens[0].strip()
            pos_start = node.text.index(term, match.term.pos[0])
            add_match(node, term, (pos_start, pos_start + len(term)))

    if hasattr(node, 'tagged_text'):
        for match, _, _ in grammar.xml_term_parser.scanString(
                node.tagged_text):
            # Position in match reflects XML tags, so it's dropped in
            # preference of new values based on node.text.
            for match in chain([match.head], match.tail):
                pos_start = self.pos_start_excluding(
                    match.term.tokens[0], node.text,
                    included_defs + excluded_defs)
                term = node.tagged_text[
                    match.term.pos[0]:match.term.pos[1]].lower()
                match_len = len(term)
                add_match(node, term, (pos_start, pos_start + match_len))

    return included_defs, excluded_defs

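# The INCLUDE_DEFINITIONS_IN lookup consumed above maps a CFR part to
# (term, context) pairs; a hypothetical settings entry might look like:
#
#   INCLUDE_DEFINITIONS_IN = {
#       '1005': [('bank', 'For purposes of this part, bank means')],
#   }
#
# A term is only recorded when both the term and its context string appear
# in the node's text.
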
def determine_scope(self, stack):
    for node in stack.lineage():
        scopes = self.scope_of_text(node.text, Label.from_node(node))
        if scopes:
            return [tuple(s) for s in scopes]
    # Couldn't determine scope; default to the entire reg
    return [tuple(node.label[:1])]

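# Example (hypothetical): if an ancestor's text carries scope language such
# as "for purposes of this section", scope_of_text returns that scope and
# the lineage walk stops there; with no match anywhere up the stack, the
# scope falls back to the whole part, e.g. [('1005',)].
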
def is_title(xml_node):
    """Not all titles are created equal. Sometimes a title appears as a
    paragraph tag, mostly to add confusion."""
    if xml_node.getchildren():
        child = xml_node.getchildren()[0]
    else:
        child = None
    return bool(
        (xml_node.tag.upper() == 'HD'
         and xml_node.attrib['SOURCE'] != 'HED')
        or (xml_node.tag.upper() == 'P'
            and (xml_node.text is None or not xml_node.text.strip())
            and len(xml_node.getchildren()) == 1
            and (child.tail is None or not child.tail.strip(" \n\t."))
            and text_to_labels(child.text, Label(), warn=False))
        or (xml_node.tag.upper() == 'P'
            and len(xml_node.getchildren()) == 0
            and xml_node.text
            and not get_first_interp_marker(xml_node.text)
            and text_to_labels(xml_node.text, Label(),
                               warn=False, force_start=True)))

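# In Federal Register XML this accepts, for example (hypothetical snippets):
#
#   <HD SOURCE="HD1">Section 1005.7 ...</HD>  (any HD that isn't the HED)
#   <P><E>22(a)</E></P>                       (label wrapped in a child tag)
#   <P>22(a) Content</P>                      (bare label, no interp marker)
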
def test_section_ref_in_appendix(self):
    text = u"""(a) Something something § 1005.7(b)(1)."""
    citations = internal_citations(
        text,
        Label(part='1005', appendix='A', appendix_section='2', p1='a'))
    self.assertEqual(citations[0].label.to_list(), ['1005', '7', 'b', '1'])

def _non_interp_p_with_label(xml_node):
    """E.g. <P>22(a)</P> but not <P>ii. 22(a)</P>"""
    return (xml_node.tag.upper() == 'P'
            and not xml_node.getchildren()
            and xml_node.text
            and not get_first_interp_marker(xml_node.text)
            and text_to_labels(xml_node.text, Label(),
                               warn=False, force_start=True))

def test_text_to_labels():
    text = u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and "
    text += "(b)(2)"
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', '9', 'c', '2', 'iii', 'Interp']]

    text = "Paragraphs 4(b)(7) and (b)(8)."
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', '4', 'b', '7', 'Interp'],
                      ['1111', '4', 'b', '8', 'Interp']]

    text = "Appendices G and H-Something"
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', 'G', 'Interp'],
                      ['1111', 'H', 'Interp']]

    text = "Paragraph 38(l)(7)(i)(A)(2)."
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']]

def test_single_match_multiple_paragraphs4(self):
    text = "Listing sections 11.55(d) and 321.11 (h)(4)"
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))

    citation = citations[0]
    self.assertEqual(['11', '55', 'd'], citation.label.to_list())
    self.assertEqual(to_text(citation, text), '11.55(d)')

    citation = citations[1]
    self.assertEqual(['321', '11', 'h', '4'], citation.label.to_list())
    self.assertEqual(to_text(citation, text), '321.11 (h)(4)')

def parse_into_labels(txt, part):
    """Find what part+section+(paragraph) (could be multiple) this text is
    related to."""
    citations = internal_citations(txt, Label(part=part))
    # odd corner case: headers shouldn't include both an appendix and regtext
    labels = [c.label for c in citations]
    if any('appendix' in l.settings for l in labels):
        labels = [l for l in labels if 'appendix' in l.settings]
    labels = ['-'.join(l.to_list()) for l in labels]
    return labels

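# Usage sketch (hypothetical header text; exact results depend on the
# citation grammar):
#
#   parse_into_labels(u'Section 1005.7(b)', '1005')
#   # => ['1005-7-b']
#
# When a header cites both an appendix and regulation text, only the
# appendix labels survive the filter above.
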
def add_spaces_to_title(title):
    """Federal Register often seems to miss spaces in the title of SxS
    sections. Make sure spaces get added if appropriate"""
    for citation in internal_citations(title, Label()):
        end = citation.end
        # Next char is an alpha and last char isn't a space
        if (end < len(title) and title[end].isalpha()
                and title[end - 1] != ' '):
            title = title[:end] + ' ' + title[end:]
            break   # Assumes there is only one paragraph in a title
    return title

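# Hedged example (hypothetical title; relies on internal_citations
# recognizing the reference):
#
#   add_spaces_to_title(u'Comments on § 1005.7(b)Disclosures')
#   # => u'Comments on § 1005.7(b) Disclosures'
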
def generate_keyterm(node):
    label_id = node.label_id()
    if label_id in real_key_terms_layer:
        layer[label_id] = real_key_terms_layer[label_id]
    else:
        node_text = key_terms.KeyTerms.process_node_text(node)
        if not node_text:
            return
        # Our Appendix parsing isn't particularly accurate -- avoid keyterms
        if node.node_type == struct.Node.APPENDIX:
            return
        exclude = [(start, end) for _, start, end in
                   exclude_parser.scanString(node_text)]
        exclude.extend((pc.full_start, pc.full_end) for pc in
                       internal_citations(node_text, Label()))
        periods = [m.start() for m in period.finditer(node_text)]
        # Remove any periods which are part of a citation (a list
        # comprehension rather than filter(), so the emptiness check
        # below also works on Python 3)
        periods = [p for p in periods
                   if all(p < start or p > end for start, end in exclude)]
        # Key terms must either have a full "sentence" or end with a hyphen
        if not periods and node_text[-1] != u'—':
            return
        if periods:
            first_p = periods[0]
            # Check for cases where the period is "inside" something;
            # include the period
            next_char = node_text[first_p + 1: first_p + 2]
            if next_char in (')', u'”'):
                first_sentence = node_text[:first_p + 2]
            else:
                first_sentence = node_text[:first_p + 1]
        else:
            first_sentence = node_text
        # Key terms can't be the entire text of a leaf node
        if first_sentence == node_text and not node.children:
            return
        words = first_sentence.split()
        if (not words[-1] == part_end
                and not first_sentence.startswith('![')):
            num_words = len(words)
            # key terms are short
            if num_words <= 15:
                layer_element = {"key_term": first_sentence,
                                 "locations": [0]}
                layer[label_id] = [layer_element]

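# Sketch of the extraction rule (hypothetical node text): for a paragraph
# reading "Official check. An official check is ...", the first sentence
# "Official check." is short enough (<= 15 words), is not the entire leaf
# text, and would be stored as the key term at location 0.
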
def test_single_match_multiple_paragraphs5(self):
    text = "See, e.g., comments 31(b)(1)(iv)-1 and 31(b)(1)(vi)-1"
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))

    citation = citations[0]
    self.assertEqual(['222', '31', 'b', '1', 'iv', 'Interp', '1'],
                     citation.label.to_list())
    self.assertEqual(to_text(citation, text), '31(b)(1)(iv)-1')

    citation = citations[1]
    self.assertEqual(['222', '31', 'b', '1', 'vi', 'Interp', '1'],
                     citation.label.to_list())
    self.assertEqual(to_text(citation, text), '31(b)(1)(vi)-1')

def test_single_match_multiple_paragraphs6(self):
    text = "comments 5(b)(3)-1 through -3"
    citations = internal_citations(text, Label(part='100', section='5'))
    self.assertEqual(2, len(citations))

    citation = citations[0]
    self.assertEqual(['100', '5', 'b', '3', 'Interp', '1'],
                     citation.label.to_list())
    self.assertEqual(to_text(citation, text), '5(b)(3)-1')

    citation = citations[1]
    self.assertEqual(['100', '5', 'b', '3', 'Interp', '3'],
                     citation.label.to_list())
    self.assertEqual(to_text(citation, text), '-3')

def test_multiple_matches(self):
    text = "Please see A-5 and Q-2(r) and Z-12(g)(2)(ii) then more text"
    citations = internal_citations(text, Label(part='102', section='1'))
    self.assertEqual(3, len(citations))

    citation = citations[0]
    self.assertEqual(citation.label.to_list(), ['102', 'A', '5'])
    self.assertEqual(to_text(citation, text), 'A-5')

    citation = citations[1]
    self.assertEqual(citation.label.to_list(), ['102', 'Q', '2(r)'])
    self.assertEqual(to_text(citation, text), 'Q-2(r)')

    citation = citations[2]
    self.assertEqual(citation.label.to_list(), ['102', 'Z', '12(g)(2)(ii)'])
    self.assertEqual(to_text(citation, text), 'Z-12(g)(2)(ii)')

    text = u"Appendices G and H—Yadda yadda"
    citations = internal_citations(text, Label(part='102'))
    self.assertEqual(2, len(citations))
    citG, citH = citations
    self.assertEqual(citG.label.to_list(), ['102', 'G'])
    self.assertEqual(citH.label.to_list(), ['102', 'H'])

def find(self, node):
    refs = []
    for match, _, _ in grammar.scope_term_type_parser.scanString(node.text):
        valid_scope = self.finder.scope_of_text(
            match.scope, Label.from_node(node), verify_prefix=False)
        valid_term = re.match("^[a-z ]+$", match.term.tokens[0])
        if valid_scope and valid_term:
            term = match.term.tokens[0].strip()
            pos_start = node.text.index(term, match.term.pos[0])
            refs.append(Ref(term, node.label_id(), pos_start))
    return refs

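# The scope_term_type grammar appears to target definition sentences of the
# form "For purposes of this section, the term widget ..." (an assumption
# based on its use here, not confirmed by this excerpt); a matched,
# all-lowercase term is recorded as a Ref at its position in node.text.
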
def test_single_match_multiple_paragraphs2(self):
    text = u'§ 1005.10(a) and (d)'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))
    citation = citations[0]
    self.assertEqual(['1005', '10', 'a'], citation.label.to_list())
    self.assertEqual(to_text(citation, text), '1005.10(a)')
    citation = citations[1]
    self.assertEqual(['1005', '10', 'd'], citation.label.to_list())
    self.assertEqual(to_text(citation, text), '(d)')

    text = u'§ 1005.7(b)(1), (2) and (3)'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(3, len(citations))
    self.assertEqual(['1005', '7', 'b', '1'], citations[0].label.to_list())
    self.assertEqual(['1005', '7', 'b', '2'], citations[1].label.to_list())
    self.assertEqual(['1005', '7', 'b', '3'], citations[2].label.to_list())

    text = u'§ 1005.15(d)(1)(i) and (ii)'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))
    self.assertEqual(['1005', '15', 'd', '1', 'i'],
                     citations[0].label.to_list())
    self.assertEqual(['1005', '15', 'd', '1', 'ii'],
                     citations[1].label.to_list())

    text = u'§ 1005.9(a)(5) (i), (ii), or (iii)'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(3, len(citations))
    self.assertEqual(['1005', '9', 'a', '5', 'i'],
                     citations[0].label.to_list())
    self.assertEqual(['1005', '9', 'a', '5', 'ii'],
                     citations[1].label.to_list())
    self.assertEqual(['1005', '9', 'a', '5', 'iii'],
                     citations[2].label.to_list())

    text = u'§ 1005.11(a)(1)(vi) or (vii).'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))
    self.assertEqual(['1005', '11', 'a', '1', 'vi'],
                     citations[0].label.to_list())
    self.assertEqual(['1005', '11', 'a', '1', 'vii'],
                     citations[1].label.to_list())

    text = u'§§ 1005.3(b)(2) and (3), 1005.10(b), (d), and (e), 1005.13, '
    text += 'and 1005.20'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(7, len(citations))

    text = 'Sections 1005.3, .4, and .5'
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(3, len(citations))
    self.assertEqual(['1005', '3'], citations[0].label.to_list())
    self.assertEqual(['1005', '4'], citations[1].label.to_list())
    self.assertEqual(['1005', '5'], citations[2].label.to_list())

def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations"""
    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        # Explicitly ignore "subpart" headers, as they are inconsistent
        # and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)
            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]

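# Usage sketch (hypothetical root and XML; names follow the docstring):
#
#   root = Node(label=['1005', 'Interp'], node_type=Node.INTERP)
#   interp_tree = parse_from_xml(root, supplement_xml_nodes)
#
# Headers become INTERP nodes (unlabeled ones get 'h1', 'h2', ... labels),
# gaps in the label sequence are backfilled via missing_levels, and the
# flat node list is treeified before being returned.
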
def test_text_to_labels(self):
    text = u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and "
    text += "(b)(2)"
    self.assertEqual([['1111', '9', 'c', '2', 'iii', 'Interp']],
                     interpretation.text_to_labels(
                         text, Label(part='1111', comment=True)))

    text = "Paragraphs 4(b)(7) and (b)(8)."
    self.assertEqual([['1111', '4', 'b', '7', 'Interp'],
                      ['1111', '4', 'b', '8', 'Interp']],
                     interpretation.text_to_labels(
                         text, Label(part='1111', comment=True)))

    text = "Appendices G and H-Something"
    self.assertEqual([['1111', 'G', 'Interp'],
                      ['1111', 'H', 'Interp']],
                     interpretation.text_to_labels(
                         text, Label(part='1111', comment=True)))

    text = "Paragraph 38(l)(7)(i)(A)(2)."
    self.assertEqual([['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']],
                     interpretation.text_to_labels(
                         text, Label(part='1111', comment=True)))

def build_section_tree(text, part):
    """Construct the tree for a whole section. Assumes the section starts
    with an identifier"""
    title, text = utils.title_body(text)
    exclude = [(pc.full_start, pc.full_end)
               for pc in internal_citations(text, Label(part=part))]
    section = re.search(r'%d\.(\d+)\b' % part, title).group(1)
    label = [str(part), section]
    p_tree = regParser.build_tree(
        text, exclude=exclude, label=label, title=title)
    return p_tree

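# Usage sketch (hypothetical section text): for part 1005 the title is
# expected to match r'1005\.(\d+)', so
#
#   build_section_tree(u'\xa7 1005.7 Disclosures.\nBody text ...', 1005)
#
# would produce a tree labeled ['1005', '7']. Note that `part` must be
# numeric here, since it is interpolated with %d.
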
def test_lt(self):
    """Comparisons between labels"""
    self.assertTrue(
        Label(part='105', section='3') < Label(part='105', section='4'))
    self.assertTrue(
        Label(part='105', section='3')
        < Label(part='105', section='3', p1='a'))
    self.assertTrue(
        Label(part='105', section='3', p1='a') < Label(part='222'))

def node_definitions(self, node, stack=None):
    """Find defined terms in this node's text. 'Act' is a special case, as
    it is also defined as an external citation."""
    included_defs = []
    excluded_defs = []

    def add_match(n, term, pos):
        if ((term == 'act' and list(uscode.scanString(n.text)))
                or self.is_exclusion(term, n)):
            excluded_defs.append(Ref(term, n.label_id(), pos))
        else:
            included_defs.append(Ref(term, n.label_id(), pos))

    if stack and self.has_parent_definitions_indicator(stack):
        for match, _, _ in grammar.smart_quotes.scanString(node.text):
            term = match.term.tokens[0].lower().strip(',.;')
            # Don't use pos_end because we are stripping some chars
            pos_start = match.term.pos[0]
            add_match(node, term, (pos_start, pos_start + len(term)))

    for match, _, _ in grammar.scope_term_type_parser.scanString(node.text):
        # Check that both scope and term look valid
        if (self.scope_of_text(match.scope, Label.from_node(node),
                               verify_prefix=False)
                and re.match("^[a-z ]+$", match.term.tokens[0])):
            term = match.term.tokens[0].strip()
            pos_start = node.text.index(term, match.term.pos[0])
            add_match(node, term, (pos_start, pos_start + len(term)))

    if hasattr(node, 'tagged_text'):
        for match, _, _ in grammar.xml_term_parser.scanString(
                node.tagged_text):
            # Position in match reflects XML tags, so it's dropped in
            # preference of new values based on node.text.
            for match in chain([match.head], match.tail):
                pos_start = self.pos_start_excluding(
                    match.term.tokens[0], node.text,
                    included_defs + excluded_defs)
                term = node.tagged_text[
                    match.term.pos[0]:match.term.pos[1]].lower()
                match_len = len(term)
                add_match(node, term, (pos_start, pos_start + match_len))

    return included_defs, excluded_defs

def per_node(node):
    if (node.node_type != struct.Node.INTERP
            or node.label[-1] != struct.Node.INTERP_MARK):
        return

    # Always add a connection based on the interp's label
    self.lookup_table[tuple(node.label[:-1])].append(node)

    # Also add connections based on the title
    for label in text_to_labels(node.title or '',
                                Label.from_node(node),
                                warn=False):
        label = tuple(label[:-1])   # Remove Interp marker
        if node not in self.lookup_table[label]:
            self.lookup_table[label].append(node)

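# E.g. (hypothetical): an interp node labeled ['1005', '7', 'b', 'Interp']
# is always reachable under the key ('1005', '7', 'b'); if its title reads
# "7(b) and 7(c)", text_to_labels adds ('1005', '7', 'c') as a second key
# in the lookup table.
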
def paragraph_tree(appendix_letter, sections, text, label, title=None):
    """Use the paragraph parser to parse through each section in this
    appendix."""
    if not sections:
        return Node(text, label=label, title=title, node_type=Node.APPENDIX)
    children = []
    for begin, end in sections:
        seg_title, section_text = utils.title_body(text[begin:end])
        sec_num = carving.get_appendix_section_number(
            seg_title, appendix_letter)
        exclude = [(pc.full_start, pc.full_end)
                   for pc in internal_citations(section_text,
                                                Label(part=label[0]))]
        child = parParser.build_tree(
            section_text, exclude=exclude, label=label + [sec_num],
            title=seg_title)
        children.append(child)
    return Node(text[:sections[0][0]], children, label, title, Node.APPENDIX)

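# Usage sketch (hypothetical appendix): `sections` holds (begin, end)
# offsets of each "A-1", "A-2", ... section within `text`, so
#
#   paragraph_tree('A', sections, appendix_text, ['1005', 'A'])
#
# yields an APPENDIX node whose children are labeled ['1005', 'A', '1'],
# ['1005', 'A', '2'], and so on. With no sections, the whole text becomes
# a single leaf node.
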
def test_interp_headers(self):
    for text, label in [
            ("Section 102.22Stuff", ['102', '22']),
            ("22(d) Content", ['101', '22', 'd']),
            ("22(d)(5) Content", ['101', '22', 'd', '5']),
            ("22(d)(5)(x) Content", ['101', '22', 'd', '5', 'x']),
            (u"§ 102.22(d)(5)(x) Content", ['102', '22', 'd', '5', 'x']),
            ("22(d)(5)(x)(Q) Content", ['101', '22', 'd', '5', 'x', 'Q']),
            ("Appendix A Heading", ['101', 'A']),
            ("Comment 21(c)-1 Heading", ['101', '21', 'c', 'Interp', '1']),
            ("Paragraph 38(l)(7)(i)(A)(2).",
             ['101', '38', 'l', '7', 'i', 'A', '2']),
            (u'Official Interpretations of § 102.33(c)(2)',
             ['102', '33', 'c', '2', 'Interp'])]:
        citations = internal_citations(text, Label(part='101'))
        self.assertEqual(1, len(citations))
        self.assertEqual(citations[0].label.to_list(), label)

def test_from_node(self):
    for lst, typ in [
            (['111'], Node.REGTEXT),
            (['111', '31', 'a', '3'], Node.REGTEXT),
            # _Very_ deeply nested, ignoring the recommended 6-level
            # paragraph limit
            (['111', '2', 'c', '4', 'v', 'F', '7', 'viii',
              'p1', 'p1', 'p1'], Node.REGTEXT),
            (['111', 'A', 'b'], Node.APPENDIX),
            (['111', 'A', '4', 'a'], Node.APPENDIX),
            (['111', '21', 'Interp'], Node.INTERP),
            (['111', '21', 'Interp', '1'], Node.INTERP),
            (['111', '21', 'r', 'Interp'], Node.INTERP),
            (['111', '21', 'r', 'Interp', '2'], Node.INTERP),
            (['111', 'G', 'Interp'], Node.INTERP),
            (['111', 'G3', 'r', 'Interp'], Node.INTERP),
            (['111', 'G', '2', 'Interp'], Node.INTERP),
            (['111', 'G3', 'r', 'Interp', '3'], Node.INTERP),
            (['111', 'G', '2', 'Interp', '5'], Node.INTERP),
            (['111', 'Subpart', 'A'], Node.SUBPART),
            (['111', 'Subpart'], Node.EMPTYPART)]:
        n = Node(label=lst, node_type=typ)
        self.assertEqual(Label.from_node(n).to_list(), lst)

def test_copy(self):
    label = Label(part='222', section='11', p1='c', p2='2')
    label = label.copy(p3='ii')
    self.assertEqual(['222', '11', 'c', '2', 'ii'], label.to_list())

    label = label.copy(p2='4', p3='iv')
    self.assertEqual(['222', '11', 'c', '4', 'iv'], label.to_list())

    label = label.copy(section='12', p1='d')
    self.assertEqual(['222', '12', 'd'], label.to_list())

    label = label.copy(appendix='D', appendix_section='4')
    self.assertEqual(['222', 'D', '4'], label.to_list())

    label = label.copy(p1='c', p2='3')
    self.assertEqual(['222', 'D', '4', 'c', '3'], label.to_list())

def process(self, node):
    citations_list = self.parse(node.text,
                                label=Label.from_node(node),
                                title=str(self.cfr_title))
    if citations_list:
        return citations_list

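# Usage sketch (hypothetical layer generator): called once per node while
# building the internal-citations layer, e.g.
#
#   layer[node.label_id()] = self.process(node)
#
# Returning None (rather than an empty list) lets callers skip nodes
# without citations.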