def generate_diff(left_xml, right_xml):
    """
    Compute the changes between two full RegML trees.

    Each input is converted with build_reg_tree(), frozen with
    FrozenNode.from_node(), and the pair is handed to regulations-parser's
    changes_between(). The result is returned as a plain dictionary in the
    style of regulations-parser.
    """
    old_root = build_reg_tree(left_xml)
    new_root = build_reg_tree(right_xml)

    frozen_old = FrozenNode.from_node(old_root)
    frozen_new = FrozenNode.from_node(new_root)

    return dict(changes_between(frozen_old, frozen_new))
def generate_diff(left_xml, right_xml):
    """
    Return a dictionary describing how ``right_xml`` differs from
    ``left_xml``.

    Both arguments are full RegML trees. They are turned into reg trees,
    frozen, and compared with regulations-parser's changes_between(),
    which this function wraps.
    """
    # Build both reg trees first, then freeze them, left before right.
    reg_trees = [build_reg_tree(left_xml), build_reg_tree(right_xml)]
    frozen_left, frozen_right = [FrozenNode.from_node(tree)
                                 for tree in reg_trees]
    changes = changes_between(frozen_left, frozen_right)
    return dict(changes)
def diff_driver(regulation_files):
    """
    Run recursive_comparison() on every unordered pair of regulation files.

    For each pair, both files are read and parsed with etree, converted to
    reg trees with build_reg_tree(), and compared.
    """
    for left_path, right_path in combinations(regulation_files, 2):
        with open(left_path, 'r') as left_file:
            left_xml = etree.fromstring(left_file.read())
        with open(right_path, 'r') as right_file:
            right_xml = etree.fromstring(right_file.read())

        recursive_comparison(build_reg_tree(left_xml),
                             build_reg_tree(right_xml))
def test_height(self):
    """ The tree built from test_xml should report a height of 5 """
    reg_tree = build_reg_tree(etree.fromstring(test_xml))
    self.assertEqual(reg_tree.height(), 5)
def ecfr_notice(title, cfr_part, notice, applies_to, act_title,
                act_section, with_version=False, without_notice=False):
    """
    Generate RegML for a single notice from eCFR XML.

    :param title: CFR title, passed to the Builder as ``cfr_title``.
    :param cfr_part: CFR part, passed to the Builder and used as the
        directory component when locating ``applies_to`` via find_file().
    :param notice: document number of the notice to build.
    :param applies_to: file name of the existing RegML document the
        notice's changes are applied to.
    :param act_title: passed, together with ``act_section``, to
        builder.generate_layers().
    :param act_section: see ``act_title``.
    :param with_version: when true, also write the regulation file for the
        newly compiled tree.
    :param without_notice: when true, skip writing the notice file.
    :return: None. Returns early (after printing a message) when the
        notice cannot be found or contains no changes.
    """
    # Get the notice the new one applies to
    with open(find_file(os.path.join(cfr_part, applies_to)), 'r') as f:
        reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)
    doc_number = xml_tree.find('.//{eregs}documentNumber').text

    # Validate the file relative to schema. The returned validator is not
    # used any further in this function, so it is not bound to a name.
    get_validator(xml_tree)

    # Get the ecfr builder
    builder = Builder(cfr_title=title,
                      cfr_part=cfr_part,
                      doc_number=doc_number,
                      checkpointer=None,
                      writer_type='XML')

    # Fetch the notices from the FR API and find the notice we're
    # looking for
    builder.fetch_notices_json()
    print([n['document_number'] for n in builder.notices_json])
    notice_json = next((n for n in builder.notices_json
                        if n['document_number'] == notice), None)
    if notice_json is None:
        # Without the default above, a missing notice would surface as a
        # bare StopIteration with no hint about what went wrong.
        print('Notice {} was not found in the fetched notices.'.format(
            notice))
        return

    # Build the notice
    built_notice = builder.build_single_notice(notice_json)[0]

    if 'changes' not in built_notice:
        print('There are no changes in this notice to apply.')
        return

    # We've successfully fetched and parsed the new notice.
    # Build the reg tree and layers for the notice it applies to.
    old_tree = build_reg_tree(xml_tree)

    # Build the new reg tree from the old_tree + notice changes
    last_version = doc_number
    version = built_notice['document_number']
    merged_changes = builder.merge_changes(version, built_notice['changes'])
    reg_tree = compile_regulation(old_tree, merged_changes)
    layer_cache = LayerCacheAggregator()
    layers = builder.generate_layers(reg_tree,
                                     [act_title, act_section],
                                     layer_cache)

    # Write the notice file
    if not without_notice:
        builder.write_notice(version,
                             old_tree=old_tree,
                             reg_tree=reg_tree,
                             layers=layers,
                             last_version=last_version)

    # Write the regulation file for the new notice. The compiled tree is
    # ``reg_tree``; the previous code referenced an undefined ``new_tree``
    # and raised NameError whenever with_version was set.
    if with_version:
        builder.write_regulation(reg_tree, layers=layers)
def test_section_callout(self):
    """ A callout-only paragraph inside a section keeps its own node """
    section_xml = etree.fromstring(""" <section label="1024-3" sectionNum="3" xmlns="eregs"> <subject>§ 1024.3 Questions or suggestions from public and copies of public guidance documents.</subject> <paragraph label="1024-3-p1" marker=""> <content> <callout type="note"> <line>Note:</line> <line>This is a test callout.</line> </callout> </content> </paragraph> </section>""")
    built_tree = build_reg_tree(section_xml)

    # The callout must NOT be treated as an intro paragraph: its content
    # has to stay in a node carrying the paragraph's label rather than
    # being smushed into the section's label.
    callout_paragraph = OrderedDict([
        (u'children', []),
        (u'label', [u'1024', u'3', u'p1']),
        (u'node_type', u'regtext'),
        (u'text', u'Note:\n This is a test callout.'),
        (u'marker', u''),
    ])
    expected_section = OrderedDict([
        (u'children', [callout_paragraph]),
        (u'label', [u'1024', u'3']),
        (u'node_type', u'regtext'),
        (u'text', u''),
        (u'title', u'\xa7 1024.3 Questions or suggestions from public and copies of public guidance documents.'),
    ])

    self.assertEqual(expected_section, built_tree.to_json())
def test_appendix_graphic(self):
    """ A graphic-only paragraph in an appendix keeps its own node """
    appendix_xml = etree.fromstring(""" <appendixSection appendixSecNum="1" label="1013-A-1" xmlns="eregs"> <subject>A-1—Model Open-End or Finance Vehicle Lease Disclosures</subject> <paragraph label="1013-A-1-p1" marker=""> <content> <graphic> <altText></altText> <text>![](ER19DE11.010)</text> <url>https://s3.amazonaws.com/images.federalregister.gov/ER19DE11.010/original.gif</url> </graphic> </content> </paragraph> </appendixSection>""")
    built_tree = build_reg_tree(appendix_xml)

    # The graphic must NOT be treated as an intro paragraph: its content
    # has to stay in a node carrying the paragraph's label rather than
    # being smushed into the appendix section's label.
    graphic_paragraph = OrderedDict([
        (u'children', []),
        (u'label', [u'1013', u'A', u'1', u'p1']),
        (u'node_type', u'appendix'),
        (u'text', '![](ER19DE11.010)'),
        (u'marker', ''),
    ])
    expected_appendix = OrderedDict([
        (u'children', [graphic_paragraph]),
        (u'label', [u'1013', u'A', u'1']),
        (u'node_type', u'appendix'),
        (u'text', u''),
        (u'title', u'A-1\u2014Model Open-End or Finance Vehicle Lease Disclosures'),
    ])

    self.assertEqual(expected_appendix, built_tree.to_json())
def test_markerless_nodes(self):
    """ Make sure marker: '' comes through in the json """
    reg_tree = build_reg_tree(etree.fromstring(test_xml))
    matches = reg_tree.find_node(
        lambda node: node.string_label == '1234-1-a')
    parent = matches[0]

    # The first two children of 1234-1-a are expected to serialize with
    # an empty-string marker.
    for position in (0, 1):
        child_json = parent.children[position].to_json()
        self.assertEqual(child_json['marker'], '')
def test_labels(self):
    """ Smoke test: labels() can be called on a tree built from test_xml """
    reg_tree = build_reg_tree(etree.fromstring(test_xml))
    reg_tree.labels()
    # NOTE(review): placeholder assertion. The return value of labels()
    # is never inspected, so this only verifies the call does not raise.
    self.assertEqual(1, 1)
def test_flatten_tree(self):
    """ Smoke test: flatten() can be called on a tree built from test_xml """
    reg_tree = build_reg_tree(etree.fromstring(test_xml))
    reg_tree.flatten()
    # NOTE(review): placeholder assertion. The return value of flatten()
    # is never inspected, so this only verifies the call does not raise.
    self.assertEqual(1, 1)
def test_find_node_single(self):
    """ find_node() with a predicate that matches exactly one node """
    reg_tree = build_reg_tree(etree.fromstring(test_xml))

    matches = reg_tree.find_node(
        lambda node: node.string_label == '1234-1-a')

    self.assertEqual(len(matches), 1)
    found = matches[0]
    self.assertEqual(found.string_label, '1234-1-a')
    self.assertEqual(found.text, "a I'm a marked paragraph")
    self.assertEqual(found.marker, "a")
    self.assertEqual(found.depth, 3)
def test_appendix_callout(self):
    """ A callout-only paragraph in an appendix keeps its own node """
    appendix_xml = etree.fromstring(""" <appendixSection appendixSecNum="6" label="1024-A-h6" xmlns="eregs"> <subject>Instructions for Completing HUD-1A</subject> <paragraph label="1024-A-h6-p92" marker=""> <content> <callout type="note"> <line>Note:</line> <line>The HUD-1A is an optional form that may be used for refinancing and subordinate-lien federally related mortgage loans, as well as for any other one-party transaction that does not involve the transfer of title to residential real property. The HUD-1 form may also be used for such transactions, by utilizing the borrower's side of the HUD-1 and following the relevant parts of the instructions as set forth above. The use of either the HUD-1 or HUD-1A is not mandatory for open-end lines of credit (home-equity plans), as long as the provisions of Regulation Z are followed.</line> </callout> </content> </paragraph> </appendixSection>""")
    built_tree = build_reg_tree(appendix_xml)

    # The callout must NOT be treated as an intro paragraph: its content
    # has to stay in a node carrying the paragraph's label rather than
    # being smushed into the appendixSection's label.
    callout_paragraph = {
        "children": [],
        "label": ["1024", "A", "h6", "p92"],
        "marker": "",
        "node_type": "appendix",
        "text": "Note:\n The HUD-1A is an optional form that may be used for refinancing and subordinate-lien federally related mortgage loans, as well as for any other one-party transaction that does not involve the transfer of title to residential real property. The HUD-1 form may also be used for such transactions, by utilizing the borrower's side of the HUD-1 and following the relevant parts of the instructions as set forth above. The use of either the HUD-1 or HUD-1A is not mandatory for open-end lines of credit (home-equity plans), as long as the provisions of Regulation Z are followed.",
    }
    expected_appendix = {
        "children": [callout_paragraph],
        "label": ["1024", "A", "h6"],
        "node_type": "appendix",
        "text": "",
        "title": "Instructions for Completing HUD-1A",
    }

    self.assertEqual(expected_appendix, built_tree.to_json())
def test_find_node_multiple(self):
    """ find_node() with a predicate that matches several nodes """
    reg_tree = build_reg_tree(etree.fromstring(test_xml))

    def mentions_marked(node):
        # str.find() returns -1 when the substring is absent.
        return node.text.find('marked') > -1

    matches = reg_tree.find_node(mentions_marked)
    self.assertEqual(len(matches), 4)

    # Only the first two matches are inspected in detail.
    expected_head = [
        ('1234-1', "I'm an unmarked paragraph", None),
        ('1234-1-a', "a I'm a marked paragraph", "a"),
    ]
    for position, (label, text, marker) in enumerate(expected_head):
        self.assertEqual(matches[position].string_label, label)
        self.assertEqual(matches[position].text, text)
        self.assertEqual(matches[position].marker, marker)
def test_build_reg_tree(self):
    """ Basic introspection of the tree built from self.root """
    root = build_reg_tree(self.root)
    root_json = root.to_json()

    self.assertEqual(root_json['title'], 'REGULATION TESTING')
    self.assertEqual(root_json['label'], ['1234'])
    self.assertEqual(len(root_json['children']), 3)
    self.assertEqual(root.depth, 0)

    # Children are expected in this order: subpart, appendix, interp.
    # Each sits one level below the root.
    for position, suffix in enumerate(('Subpart', 'A', 'Interp')):
        child_json = root_json['children'][position]
        self.assertEqual(child_json['label'], ['1234', suffix])
        self.assertEqual(root.children[position].depth, 1)
def test_build_reg_tree_intro_para(self):
    """ An unmarked intro paragraph becomes the section's own text """
    section_xml = etree.fromstring(""" <section label="foo" xmlns="eregs"> <subject>Some Subject</subject> <paragraph label="foo-p1" marker=""> <content> An unmarked intro paragraph. </content> </paragraph> <paragraph label="foo-a" marker="a"> <content>A marked paragraph</content> </paragraph> </section> """)

    # Only the marked paragraph is expected as a child; the intro
    # paragraph's text is expected on the section node itself.
    marked_paragraph = {
        'children': [],
        'label': ['foo', 'a'],
        'node_type': 'regtext',
        'text': 'a A marked paragraph',
        'marker': 'a',
    }
    expected_section = {
        'children': [marked_paragraph],
        'label': ['foo'],
        'node_type': 'regtext',
        'text': 'An unmarked intro paragraph.',
        'title': 'Some Subject',
    }

    built_tree = build_reg_tree(section_xml)
    self.assertEqual(expected_section, built_tree.to_json())
def generate_json(regulation_file, check_terms=False):
    """
    Build the regulation JSON, every layer, and the notice for one RegML
    file, and write them all out with write_layer().

    :param regulation_file: RegML file name, resolved through find_file().
        Its base name (minus '.xml') is used as the version.
    :param check_terms: when true, also run the validator's term
        reference check.
    :return: tuple ``(reg_number, notice, xml_tree)``.
    """
    with open(find_file(regulation_file), 'r') as reg_file:
        raw_xml = reg_file.read()
    xml_tree = etree.fromstring(raw_xml, etree.XMLParser(huge_tree=True))

    # Validate the file relative to schema
    validator = get_validator(xml_tree)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # Build every layer before running the layer-dependent validations.
    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the remaining validations and report whatever the validator
    # collected. Processing continues regardless of the events printed.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The file name wins over the document number stored in the XML.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), using version'.format(notice, version))
        notice = version

    # (payload, layer path) pairs, written in this exact order.
    outputs = [
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    ]
    for payload, layer_path in outputs:
        write_layer(payload, reg_number, notice, layer_path)

    return reg_number, notice, xml_tree
def parser_driver(regulation_file, check_terms=False, correct_interps=False,
                  headerize_interps=False, fix_missed_cites=False):
    """
    Validate one RegML file, build its regulation JSON, layers and notice,
    and write them all out with write_layer().

    :param regulation_file: path of the RegML file to process. Its base
        name (minus '.xml') is used as the version.
    :param check_terms: when true, also run the validator's term
        reference check.
    :param correct_interps: when true, call
        validator.insert_interp_markers() before the layers are built.
    :param headerize_interps: when true, call
        validator.headerize_interps() before the layers are built.
    :param fix_missed_cites: when true, call
        validator.fix_omitted_cites() before the layers are built.
    :return: None.
    :raises SystemExit: with a non-zero status when the file does not
        validate against the schema.
    """
    with open(regulation_file, 'r') as f:
        reg_xml = f.read()
    xml_tree = etree.fromstring(reg_xml)

    # validate relative to schema
    validator = EregsValidator(settings.XSD_FILE)
    validator.validate_reg(xml_tree)

    if not validator.is_valid:
        for event in validator.events:
            print(str(event))
        # A schema failure is an error: exit non-zero so calling scripts
        # can detect it (this previously exited with status 0).
        sys.exit(1)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # we can correct interps right away if necessary
    if correct_interps:
        validator.insert_interp_markers(xml_tree, regulation_file)
    if headerize_interps:
        validator.headerize_interps(xml_tree, regulation_file)
    if fix_missed_cites:
        validator.fix_omitted_cites(xml_tree, regulation_file)

    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the layer-dependent validations and report whatever the
    # validator collected. Processing continues after printing the events.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The file name wins over the document number stored in the XML.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), using version'.format(notice, version))
        notice = version

    # (payload, layer path) pairs, written in this exact order.
    outputs = [
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    ]
    for payload, layer_path in outputs:
        write_layer(payload, reg_number, notice, layer_path)