def test_build_toc_layer_appendix_section(self):
    """A TOC nested in an appendixSection is keyed by that section's label.

    The fixture is an appendix (label 1234-A) containing one
    appendixSection (label 1234-A-1) whose table of contents has a single
    tocAppEntry targeting 1234-A-1-A.
    """
    # Adjacent literals are joined at compile time into one single-line
    # XML document; each fragment carries its own trailing separator.
    appendix_xml = (
        ' <appendix xmlns="eregs" appendixLetter="A" label="1234-A"> '
        '<appendixTitle>Appendix A</appendixTitle> '
        '<appendixSection appendixSecNum="1" label="1234-A-1"> '
        '<subject>Section 1</subject> '
        '<tableOfContents> '
        '<tocAppEntry target="1234-A-1-A"> '
        '<appendixLetter>A-1-A</appendixLetter> '
        '<appendixSubject>Something</appendixSubject> '
        '</tocAppEntry> '
        '</tableOfContents> '
        '<paragraph label="1234-A-1-A" marker=""> '
        '<content>Something here</content> '
        '</paragraph> '
        '</appendixSection> '
        '</appendix> '
    )

    # The entry's index is the target label split into its components.
    only_entry = {
        u'index': [u'1234', u'A', u'1', 'A'],
        u'title': 'Something',
    }
    expected = {'1234-A-1': [only_entry]}

    actual = build_toc_layer(etree.fromstring(appendix_xml))
    self.assertEqual(expected, actual)
def test_build_toc_layer_subpart(self):
    """A subpart's TOC is keyed by the subpart label, entries in order.

    The fixture is a subpart (label 1234-Subpart-A) whose table of contents
    lists two tocSecEntry elements; both must appear in the layer in
    document order.
    """
    # NOTE(review): both tocSecEntry elements use target "1234-1" and
    # sectionNum 1 even though the second subject reads 1234.2 -- possibly
    # a copy/paste in the fixture. The expectation mirrors it as written.
    subpart_xml = (
        ' <subpart xmlns="eregs" subpartLetter="A" label="1234-Subpart-A"> '
        '<title>General</title> '
        '<tableOfContents label="1234-Subpart-A-TOC"> '
        '<tocSecEntry target="1234-1"> '
        '<sectionNum>1</sectionNum> '
        '<sectionSubject>§ 1234.1</sectionSubject> '
        '</tocSecEntry> '
        '<tocSecEntry target="1234-1"> '
        '<sectionNum>1</sectionNum> '
        '<sectionSubject>§ 1234.2</sectionSubject> '
        '</tocSecEntry> '
        '</tableOfContents> '
        '<content></content> '
        '</subpart> '
    )

    first_entry = {'index': [u'1234', u'1'], 'title': u'\xa7 1234.1'}
    second_entry = {'index': [u'1234', u'1'], 'title': u'\xa7 1234.2'}
    expected = {'1234-Subpart-A': [first_entry, second_entry]}

    actual = build_toc_layer(etree.fromstring(subpart_xml))
    self.assertEqual(expected, actual)
def test_build_toc_layer_appendix(self):
    """A TOC directly under an appendix is keyed by the appendix label.

    The fixture is an appendix (label 1234-A) with a single tocAppEntry
    targeting 1234-A-1.
    """
    # Adjacent literals are joined at compile time into one single-line
    # XML document; each fragment carries its own trailing separator.
    appendix_xml = (
        ' <appendix xmlns="eregs" appendixLetter="A" label="1234-A"> '
        '<appendixTitle>Appendix A</appendixTitle> '
        '<tableOfContents> '
        '<tocAppEntry target="1234-A-1"> '
        '<appendixLetter>A-1</appendixLetter> '
        '<appendixSubject>Some Subject</appendixSubject> '
        '</tocAppEntry> '
        '</tableOfContents> '
        '</appendix> '
    )

    only_entry = {
        u'index': [u'1234', u'A', u'1'],
        u'title': 'Some Subject',
    }
    expected = {'1234-A': [only_entry]}

    actual = build_toc_layer(etree.fromstring(appendix_xml))
    self.assertEqual(expected, actual)
def test_build_toc_layer_section(self):
    """A section carrying its own TOC yields a layer keyed by its label.

    The fixture is a section (label 1234-1) whose table of contents has one
    tocSecEntry targeting the paragraph 1234-1-a.
    """
    # Adjacent literals are joined at compile time into one single-line
    # XML document; each fragment carries its own trailing separator.
    section_xml = (
        ' <section xmlns="eregs" label="1234-1" sectionNum="1"> '
        '<subject>§ 1234.1</subject> '
        '<tableOfContents label="1234-Subpart-A-TOC"> '
        '<tocSecEntry target="1234-1-a"> '
        '<sectionNum>1</sectionNum> '
        '<sectionSubject>§ 1234.1(a)</sectionSubject> '
        '</tocSecEntry> '
        '</tableOfContents> '
        '<paragraph label="1234-1-a" marker="a"> '
        '<content>This is a section with its own TOC</content> '
        '</paragraph> '
        '</section> '
    )

    only_entry = {
        u'index': [u'1234', u'1', u'a'],
        u'title': u'\xa7 1234.1(a)',
    }
    expected = {'1234-1': [only_entry]}

    actual = build_toc_layer(etree.fromstring(section_xml))
    self.assertEqual(expected, actual)
def test_build_toc_layer_part(self):
    """A part-level TOC mixes section and appendix entries under one key.

    The fixture is a part (label 1234) whose table of contents holds a
    tocSecEntry followed by a tocAppEntry; both must appear, in document
    order, under the part's label.
    """
    # Adjacent literals are joined at compile time into one single-line
    # XML document; each fragment carries its own trailing separator.
    part_xml = (
        ' <part xmlns="eregs" label="1234"> '
        '<tableOfContents> '
        '<tocSecEntry target="1234-1"> '
        '<sectionNum>1</sectionNum> '
        '<sectionSubject>§ 1234.1</sectionSubject> '
        '</tocSecEntry> '
        '<tocAppEntry target="1234-A"> '
        '<appendixLetter>A</appendixLetter> '
        '<appendixSubject>Appendix</appendixSubject> '
        '</tocAppEntry> '
        '</tableOfContents> '
        '<content/> '
        '</part> '
    )

    section_entry = {'index': [u'1234', u'1'], 'title': u'\xa7 1234.1'}
    appendix_entry = {'index': [u'1234', u'A'], 'title': 'Appendix'}
    expected = {'1234': [section_entry, appendix_entry]}

    actual = build_toc_layer(etree.fromstring(part_xml))
    self.assertEqual(expected, actual)
def generate_json(regulation_file, check_terms=False):
    """Build the regulation tree and every layer from an XML file and write them.

    The file is located with ``find_file``, parsed with a ``huge_tree``
    lxml parser, turned into a regulation tree plus the individual layers,
    and each result is handed to ``write_layer``.

    Args:
        regulation_file: Name or path of the regulation XML file; it is
            resolved through ``find_file`` before being opened. Its base
            name (minus ``.xml``) is used as the version/notice identifier.
        check_terms: When true, additionally run
            ``validator.validate_term_references``.

    Returns:
        A ``(reg_number, notice, xml_tree)`` tuple: the first element of the
        regulation tree's label, the notice/version identifier used for the
        written output, and the parsed XML tree.
    """
    # Read raw bytes: lxml rejects unicode strings that carry an XML
    # encoding declaration, and bytes let the parser honour the declared
    # encoding itself.
    with open(find_file(regulation_file), 'rb') as f:
        reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)

    # Obtain the validator for this tree (presumably it validates the file
    # against the schema -- see get_validator).
    validator = get_validator(xml_tree)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the layer-dependent validations and report every recorded event.
    # Processing continues afterwards regardless of what was reported.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The file name is authoritative: if the document number inside the XML
    # disagrees with it, say so and use the file-name version instead.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), '
              'using version'.format(notice, version))
        notice = version

    # (payload, layer type) pairs, written in this order.
    outputs = [
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    ]
    for payload, layer_type in outputs:
        write_layer(payload, reg_number, notice, layer_type)

    return reg_number, notice, xml_tree
def parser_driver(regulation_file, check_terms=False, correct_interps=False,
                  headerize_interps=False, fix_missed_cites=False):
    """Validate a regulation XML file, build all layers and write them out.

    The file is parsed, validated against ``settings.XSD_FILE``, optionally
    corrected in place by the validator, converted into a regulation tree
    plus the individual layers, and each result is handed to
    ``write_layer``.

    Args:
        regulation_file: Path of the regulation XML file. Its base name
            (minus ``.xml``) is used as the version/notice identifier.
        check_terms: When true, additionally run
            ``validator.validate_term_references``.
        correct_interps: When true, call
            ``validator.insert_interp_markers`` before layers are built.
        headerize_interps: When true, call ``validator.headerize_interps``
            before layers are built.
        fix_missed_cites: When true, call ``validator.fix_omitted_cites``
            before layers are built.

    Returns:
        None. The process exits via ``sys.exit`` when schema validation
        fails.
    """
    # Read raw bytes: lxml rejects unicode strings that carry an XML
    # encoding declaration, and bytes let the parser honour the declared
    # encoding itself.
    with open(regulation_file, 'rb') as f:
        reg_xml = f.read()
    xml_tree = etree.fromstring(reg_xml)

    # Validate relative to the schema; on failure report the events and stop.
    validator = EregsValidator(settings.XSD_FILE)
    validator.validate_reg(xml_tree)

    if not validator.is_valid:
        for event in validator.events:
            print(str(event))
        # NOTE(review): exits with status 0 even though validation failed;
        # kept as-is because callers may rely on the current exit status.
        sys.exit(0)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # Optional corrections run before any layer is built, so the layers are
    # built from whatever state the validator leaves the tree in.
    if correct_interps:
        validator.insert_interp_markers(xml_tree, regulation_file)
    if headerize_interps:
        validator.headerize_interps(xml_tree, regulation_file)
    if fix_missed_cites:
        validator.fix_omitted_cites(xml_tree, regulation_file)

    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the layer-dependent validations and report every recorded event.
    # Processing continues afterwards regardless of what was reported.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The file name is authoritative: if the document number inside the XML
    # disagrees with it, say so and use the file-name version instead.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), using version'.format(notice, version))
        notice = version

    # (payload, layer type) pairs, written in this order.
    outputs = [
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    ]
    for payload, layer_type in outputs:
        write_layer(payload, reg_number, notice, layer_type)