def parse_regulation(args):
    """Parse a single regulation as described by the command-line arguments.

    Kept as a stand-alone function so that an entire run can be profiled as
    one call. Reads ``args.filename``, ``args.notice``, ``args.act_title``,
    ``args.act_section``, ``args.title`` and ``args.generate_diffs``.
    """
    with codecs.open(args.filename, 'r', 'utf-8') as source:
        raw_text = source.read()

    version_id = args.notice
    act_citation = [args.act_title, args.act_section]

    # Turn the raw text into the regulation tree
    tree = Builder.reg_tree(raw_text)
    builder = Builder(
        cfr_title=args.title,
        cfr_part=tree.label_id(),
        doc_number=version_id)
    builder.write_notices()

    # The first version is written regardless of the diff setting
    logger.info("Version %s", version_id)
    builder.write_regulation(tree)
    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_citation, cache)
    cache.replace_using(tree)

    if not args.generate_diffs:
        return
    generate_diffs(version_id, tree, act_citation, builder, cache)
def build_from(filename, title, act_title, act_section, generate_diffs,
               checkpoint, version_identifier):
    """Build all data from provided xml. Reads the provided file and builds
    all versions of the regulation, its layers, etc. that follow.

    \b
    FILENAME: XML file containing the regulation
    TITLE: CFR title
    """
    # NOTE(review): the docstring is left untouched; the ``\b`` marker
    # suggests it is rendered as command-line help -- confirm before editing.
    act_citation = [act_title, act_section]

    # Obtain the regulation tree together with the builder that writes it
    tree, builder = tree_and_builder(
        filename, title, checkpoint, version_identifier)
    builder.write_notices()

    # The first version is written regardless of the diff setting
    logger.info("Version %s", builder.doc_number)
    builder.write_regulation(tree)
    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_citation, cache)
    cache.replace_using(tree)

    # The ``generate_diffs`` parameter is a flag here; the diff routine
    # itself is reached through the ``gen_diffs`` name.
    if not generate_diffs:
        return
    gen_diffs(tree, act_citation, builder, cache)
def test_layer_cache(self, init):
    """Integration test for layer caching.

    Generates layers for a small tree several times while mutating node
    text and invalidating the cache, and checks after each run which
    labels appear as keys of the object passed to the fourth layer
    ``write`` call.

    :param init: mock replacing ``Builder.__init__`` (so ``Builder()`` can
        be constructed without arguments).
    """
    init.return_value = None
    cache = LayerCacheAggregator()
    b = Builder()   # Don't need parameters as init's been mocked out
    b.cfr_title, b.cfr_part, b.doc_number = 15, '111', '111-222'
    b.writer = Mock()
    b.checkpointer = NullCheckpointer()
    write = b.writer.layer.return_value.write
    tree = Node(label=["1234"], children=[
        Node(label=["1234", "1"], children=[
            Node("See paragraph (b)", label=["1234", "1", "a"]),
            Node("This is b", label=["1234", "1", "b"])
        ])
    ])

    # Initial run. ``call_args_list[3][0][0]`` is the first positional
    # argument of the fourth ``write`` call.
    # NOTE(review): presumably the layer holding paragraph references --
    # confirm against the layer ordering in Builder.
    # Keys are wrapped in ``list(...)`` so the comparison also holds on
    # Python 3, where ``dict.keys()`` is a view that never equals a list.
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))
    cache.replace_using(tree)

    # Paragraph b's text changes, yet the written keys are expected to be
    # unchanged (presumably because the cached result is reused).
    write.reset_mock()
    tree.children[0].children[1].text = "References paragraph (a)"
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))

    # Paragraph a's text changes; its key is still expected to be written.
    write.reset_mock()
    tree.children[0].children[0].text = "Contains no references"
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))

    # Invalidating with a notice that carries no 'changes' entry is
    # expected to leave the output as it was.
    write.reset_mock()
    notice = {'document_number': '111-222'}
    cache.invalidate_by_notice(notice)
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))

    # A notice changing paragraph b: b now shows up alongside a.
    write.reset_mock()
    notice['changes'] = {'1234-1-b': 'some change'}
    cache.invalidate_by_notice(notice)
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a', '1234-1-b'], sorted(arg.keys()))

    # A notice changing a subpart: only b is expected afterwards.
    write.reset_mock()
    notice['changes'] = {'1234-Subpart-A': 'some change'}
    cache.invalidate_by_notice(notice)
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-b'], sorted(arg.keys()))
def build_by_notice(filename, title, act_title, act_section,
                    notice_doc_numbers, doc_number=None, checkpoint=None,
                    with_diffs=None):
    """Build a regulation, its notices and its layers from explicit notices.

    Reads the UTF-8 text in ``filename``, builds the initial tree (through
    the checkpointer), builds every notice listed in ``notice_doc_numbers``
    and then writes the regulation and its layers.

    :param filename: path of the UTF-8 encoded regulation text.
    :param title: CFR title; handed to the Builder and used in the
        checkpoint suffix.
    :param act_title: first element of the act citation passed to the
        layer generation.
    :param act_section: second element of that citation.
    :param notice_doc_numbers: iterable of notice document numbers, each
        passed to ``builder.build_notice_from_doc_number``.
    :param doc_number: document number of this version; determined via
        ``Builder.determine_doc_number`` when None.
    :param checkpoint: value passed to ``Checkpointer``; when falsy a
        ``NullCheckpointer`` is used instead.
    :param with_diffs: whether to call ``generate_diffs`` at the end. When
        None (the default) the legacy module-level ``args.generate_diffs``
        is consulted if such a global exists; otherwise no diffs are made.
    :raises ValueError: if no document number was given and none could be
        determined.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        reg = f.read()
    file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

    if checkpoint:
        checkpointer = Checkpointer(checkpoint)
    else:
        checkpointer = NullCheckpointer()

    # build the initial tree
    reg_tree = checkpointer.checkpoint(
        "init-tree-" + file_digest,
        lambda: Builder.reg_tree(reg))
    title_part = reg_tree.label_id()

    if doc_number is None:
        doc_number = Builder.determine_doc_number(reg, title, title_part)
        if not doc_number:
            # Fail with a clear message rather than a TypeError from the
            # join below.
            raise ValueError("Could not determine document number")

    # Use the ``title`` parameter: this function has no ``args`` of its own,
    # so the previous ``args.title`` only worked if a module-level ``args``
    # happened to exist.
    checkpointer.suffix = ":".join(
        ["", title_part, str(title), doc_number])

    # create the builder
    builder = Builder(cfr_title=title,
                      cfr_part=title_part,
                      doc_number=doc_number,
                      checkpointer=checkpointer)
    builder.fetch_notices_json()

    for notice in notice_doc_numbers:
        builder.build_notice_from_doc_number(notice)

    builder.write_regulation(reg_tree)
    layer_cache = LayerCacheAggregator()
    act_title_and_section = [act_title, act_section]
    builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
    layer_cache.replace_using(reg_tree)

    if with_diffs is None:
        # Backward compatibility: earlier code read ``args.generate_diffs``
        # from a module global. Honour it when present, but do not raise
        # NameError when the module is imported without one.
        cli_args = globals().get('args')
        with_diffs = bool(getattr(cli_args, 'generate_diffs', False))
    if with_diffs:
        generate_diffs(reg_tree, act_title_and_section, builder, layer_cache)
def test_layer_cache(self, init):
    """Integration test for layer caching.

    Runs layer generation repeatedly over a two-paragraph tree, changing
    paragraph text and invalidating the cache between runs, and asserts
    which labels are keys of the object given to the fourth layer
    ``write`` call each time.

    :param init: mock standing in for ``Builder.__init__``; it lets
        ``Builder()`` be created without arguments.
    """
    init.return_value = None
    cache = LayerCacheAggregator()
    b = Builder()   # Don't need parameters as init's been mocked out
    b.cfr_title, b.cfr_part, b.doc_number = 15, '111', '111-222'
    b.writer = Mock()
    b.checkpointer = NullCheckpointer()
    write = b.writer.layer.return_value.write
    tree = Node(label=["1234"], children=[
        Node(label=["1234", "1"], children=[
            Node("See paragraph (b)", label=["1234", "1", "a"]),
            Node("This is b", label=["1234", "1", "b"])])])

    # First run; index [3][0][0] selects the first positional argument of
    # the fourth ``write`` call.
    # NOTE(review): which layer that is depends on Builder's layer order --
    # confirm.
    # ``list(...)`` around ``keys()`` keeps the assertion valid on Python 3,
    # where ``keys()`` returns a view that does not compare equal to a list.
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))
    cache.replace_using(tree)

    # Changing paragraph b's text must not alter the written keys
    # (presumably the cached value is served).
    write.reset_mock()
    tree.children[0].children[1].text = "References paragraph (a)"
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))

    # Changing paragraph a's text: its key is still expected.
    write.reset_mock()
    tree.children[0].children[0].text = "Contains no references"
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))

    # A notice without a 'changes' entry is expected to invalidate nothing.
    write.reset_mock()
    notice = {'document_number': '111-222'}
    cache.invalidate_by_notice(notice)
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a'], list(arg.keys()))

    # A notice that changes paragraph b: both a and b are expected.
    write.reset_mock()
    notice['changes'] = {'1234-1-b': 'some change'}
    cache.invalidate_by_notice(notice)
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-a', '1234-1-b'], sorted(arg.keys()))

    # A notice that changes a subpart: only b is expected.
    write.reset_mock()
    notice['changes'] = {'1234-Subpart-A': 'some change'}
    cache.invalidate_by_notice(notice)
    b.gen_and_write_layers(tree, [], cache, [])
    arg = write.call_args_list[3][0][0]
    self.assertEqual(['1234-1-b'], sorted(arg.keys()))
def build_by_notice(filename, title, act_title, act_section,
                    notice_doc_numbers, doc_number=None, checkpoint=None,
                    with_diffs=None):
    """Build one regulation version from an explicit list of notices.

    The UTF-8 text in ``filename`` is read and turned into the initial tree
    (via the checkpointer); each entry of ``notice_doc_numbers`` is built as
    a notice; finally the regulation and its layers are written.

    :param filename: path of the UTF-8 encoded regulation text.
    :param title: CFR title; given to the Builder and embedded in the
        checkpoint suffix.
    :param act_title: first element of the act citation used for layer
        generation.
    :param act_section: second element of that citation.
    :param notice_doc_numbers: iterable of notice document numbers, each
        handed to ``builder.build_notice_from_doc_number``.
    :param doc_number: document number of this version; looked up with
        ``Builder.determine_doc_number`` when None.
    :param checkpoint: argument for ``Checkpointer``; a ``NullCheckpointer``
        is used when it is falsy.
    :param with_diffs: whether ``generate_diffs`` is called at the end.
        None (default) falls back to a module-level ``args.generate_diffs``
        if one exists, and to False otherwise.
    :raises ValueError: if no document number was supplied and none could
        be determined.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        reg = f.read()
    file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

    if checkpoint:
        checkpointer = Checkpointer(checkpoint)
    else:
        checkpointer = NullCheckpointer()

    # build the initial tree
    reg_tree = checkpointer.checkpoint(
        "init-tree-" + file_digest,
        lambda: Builder.reg_tree(reg))
    title_part = reg_tree.label_id()

    if doc_number is None:
        doc_number = Builder.determine_doc_number(reg, title, title_part)
        if not doc_number:
            # A missing number would otherwise surface as a TypeError in the
            # join below; report it explicitly instead.
            raise ValueError("Could not determine document number")

    # ``title`` is the parameter of this function. The earlier
    # ``args.title`` relied on a module-level ``args`` that is not
    # guaranteed to exist.
    checkpointer.suffix = ":".join(
        ["", title_part, str(title), doc_number])

    # create the builder
    builder = Builder(cfr_title=title,
                      cfr_part=title_part,
                      doc_number=doc_number,
                      checkpointer=checkpointer)
    builder.fetch_notices_json()

    for notice in notice_doc_numbers:
        builder.build_notice_from_doc_number(notice)

    builder.write_regulation(reg_tree)
    layer_cache = LayerCacheAggregator()
    act_title_and_section = [act_title, act_section]
    builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
    layer_cache.replace_using(reg_tree)

    if with_diffs is None:
        # Legacy fallback: the flag used to be read from a global ``args``.
        # Keep honouring it when defined, without failing when it is not.
        cli_args = globals().get('args')
        with_diffs = bool(getattr(cli_args, 'generate_diffs', False))
    if with_diffs:
        generate_diffs(reg_tree, act_title_and_section, builder, layer_cache)
def parse_regulation(args):
    """Parse a regulation according to the command-line arguments.

    Lives in its own function so a profiler can wrap the whole run. The
    initial tree and the document number are both obtained through the
    checkpointer, keyed by a SHA-256 digest of the input text.

    Raises ValueError when no document number can be determined.
    """
    with codecs.open(args.filename, 'r', 'utf-8') as source:
        raw_text = source.read()
    digest = hashlib.sha256(raw_text.encode('utf-8')).hexdigest()

    act_citation = [args.act_title, args.act_section]

    # Without a checkpoint setting, fall back to the null implementation
    checkpointer = (Checkpointer(args.checkpoint) if args.checkpoint
                    else NullCheckpointer())

    # Regulation tree comes first
    tree = checkpointer.checkpoint(
        "init-tree-" + digest,
        lambda: Builder.reg_tree(raw_text))
    part = tree.label_id()

    version_id = checkpointer.checkpoint(
        "doc-number-" + digest,
        lambda: Builder.determine_doc_number(raw_text, args.title, part))
    if not version_id:
        raise ValueError("Could not determine document number")

    suffix_pieces = ["", part, str(args.title), version_id]
    checkpointer.suffix = ":".join(suffix_pieces)

    # Hand everything over to the Builder
    builder = Builder(
        cfr_title=args.title,
        cfr_part=part,
        doc_number=version_id,
        checkpointer=checkpointer)
    builder.write_notices()

    # The first version is written regardless of the diff setting
    logger.info("Version %s", version_id)
    builder.write_regulation(tree)
    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_citation, cache)
    cache.replace_using(tree)

    if not args.generate_diffs:
        return
    generate_diffs(version_id, tree, act_citation, builder, cache,
                   checkpointer)
def ecfr_all(title, file, act_title, act_section,
             with_all_versions=False, without_versions=False,
             without_notices=False, only_notice=None):
    """Parse eCFR into RegML.

    Writes the first version (unless ``without_versions``) and then walks
    ``builder.revision_generator``, writing later versions only when
    ``with_all_versions`` is set and notices unless ``without_notices``.
    When ``only_notice`` is given, output is limited to the version whose
    document number equals it; layers are still generated for every version.
    """
    act_citation = [act_title, act_section]

    # Tree, builder and the layers of the initial version
    tree, builder = tree_and_builder(file, title, writer_type='XML')
    cache = LayerCacheAggregator()
    layers = builder.generate_layers(tree, act_citation, cache)

    # Initial version
    previous = builder.doc_number
    print("Version {}".format(previous))
    if only_notice is None or builder.doc_number == only_notice:
        if not without_versions:
            builder.write_regulation(tree, layers=layers)

    # Every later version produced by the notices
    revisions = builder.revision_generator(tree)
    for last_notice, old, new_tree, notices in revisions:
        version = last_notice['document_number']
        print("Version {}".format(version))
        builder.doc_number = version
        layers = builder.generate_layers(
            new_tree, act_citation, cache, notices)

        if only_notice is None or version == only_notice:
            if with_all_versions:
                builder.write_regulation(new_tree, layers=layers)
            if not without_notices:
                builder.write_notice(
                    version,
                    old_tree=old,
                    reg_tree=new_tree,
                    layers=layers,
                    last_version=previous)

        cache.invalidate_by_notice(last_notice)
        cache.replace_using(new_tree)
        previous = version
        # Drop the per-revision references to free some memory
        del last_notice, old, new_tree, notices
def parse_regulation(args):
    """Parse a regulation according to the command-line arguments.

    Split out as a separate function so the run can be profiled. Reads
    ``args.filename``, ``args.title``, ``args.checkpoint_dir``,
    ``args.doc_number``, ``args.act_title``, ``args.act_section`` and
    ``args.generate_diffs``.
    """
    act_citation = [args.act_title, args.act_section]

    # Obtain the regulation tree and the builder that writes it
    tree, builder = tree_and_builder(
        args.filename,
        args.title,
        args.checkpoint_dir,
        args.doc_number)
    builder.write_notices()

    # The first version is written regardless of the diff setting
    logger.info("Version %s", builder.doc_number)
    builder.write_regulation(tree)
    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_citation, cache)
    cache.replace_using(tree)

    if not args.generate_diffs:
        return
    generate_diffs(tree, act_citation, builder, cache)
def generate_xml(filename, title, act_title, act_section, notice_doc_numbers,
                 doc_number=None, checkpoint=None):
    """Write every version of a regulation, plus its notices, as XML.

    Builds the tree and builder with ``writer_type='XML'``, writes the
    first version, then for each revision from
    ``builder.revision_generator`` regenerates the layers and writes both
    the regulation and the notice.

    :param filename: passed to ``tree_and_builder``.
    :param title: passed to ``tree_and_builder``.
    :param act_title: first element of the act citation given to the
        layer generation.
    :param act_section: second element of that citation.
    :param notice_doc_numbers: not used by this function.
    :param doc_number: only used as the key of the first entry in the
        local ``all_versions`` mapping.
    :param checkpoint: passed to ``tree_and_builder``.
    """
    act_title_and_section = [act_title, act_section]

    # First, the regulation tree
    reg_tree, builder = tree_and_builder(filename, title, checkpoint,
                                         writer_type='XML')
    layer_cache = LayerCacheAggregator()
    layers = builder.generate_layers(reg_tree, act_title_and_section,
                                     layer_cache)

    # Always do at least the first reg.
    # The "%s" placeholder is required: passing an argument without one
    # makes the logging module report a formatting error instead of
    # emitting the version.
    logger.info("Version %s", builder.doc_number)
    builder.write_regulation(reg_tree, layers=layers)

    # NOTE(review): ``all_versions`` is filled but never read in this
    # function; kept because ``FrozenNode.from_node`` may have effects that
    # are not visible from here -- confirm before removing.
    all_versions = {doc_number: FrozenNode.from_node(reg_tree)}

    for last_notice, old, new_tree, notices in builder.revision_generator(
            reg_tree):
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        layers = builder.generate_layers(new_tree, act_title_and_section,
                                         layer_cache, notices)
        builder.write_regulation(new_tree, layers=layers)
        builder.write_notice(version, old_tree=old, reg_tree=new_tree,
                             layers=layers)
        layer_cache.invalidate_by_notice(last_notice)
        layer_cache.replace_using(new_tree)
        del last_notice, old, new_tree, notices     # free some memory
def parse_regulation(args):
    """Parse a regulation according to the command-line arguments.

    Split out as a separate function so the run can be profiled. Reads
    ``args.filename``, ``args.title``, ``args.checkpoint``,
    ``args.act_title``, ``args.act_section`` and ``args.generate_diffs``.
    """
    act_citation = [args.act_title, args.act_section]

    # Obtain the regulation tree and the builder that writes it
    tree, builder = tree_and_builder(
        args.filename,
        args.title,
        args.checkpoint)
    builder.write_notices()

    # The first version is written regardless of the diff setting
    logger.info("Version %s", builder.doc_number)
    builder.write_regulation(tree)
    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_citation, cache)
    cache.replace_using(tree)

    if not args.generate_diffs:
        return
    generate_diffs(tree, act_citation, builder, cache)
cfr_part=reg_tree.label_id(), doc_number=doc_number) # Didn't include the provided version if not any(n['document_number'] == doc_number for n in builder.notices): print "Could not find notice_doc_#, %s" % doc_number exit() builder.write_notices() # Always do at least the first reg logger.info("Version %s", doc_number) builder.write_regulation(reg_tree) layer_cache = LayerCacheAggregator() builder.gen_and_write_layers(reg_tree, sys.argv[4:6], layer_cache) layer_cache.replace_using(reg_tree) if len(sys.argv) < 7 or sys.argv[6].lower() == 'true': all_versions = {doc_number: reg_tree} for last_notice, old, new_tree, notices in builder.revision_generator( reg_tree): version = last_notice['document_number'] logger.info("Version %s", version) all_versions[version] = new_tree builder.doc_number = version builder.write_regulation(new_tree) layer_cache.invalidate_by_notice(last_notice) builder.gen_and_write_layers(new_tree, sys.argv[4:6], layer_cache, notices) layer_cache.replace_using(new_tree) # now build diffs - include "empty" diffs comparing a version to itself
["", title_part, str(args.title), doc_number]) # Run Builder builder = Builder(cfr_title=args.title, cfr_part=title_part, doc_number=doc_number, checkpointer=checkpointer) builder.write_notices() # Always do at least the first reg logger.info("Version %s", doc_number) builder.write_regulation(reg_tree) layer_cache = LayerCacheAggregator() builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache) layer_cache.replace_using(reg_tree) # this used to assume implicitly that if gen-diffs was not specified it was # True; changed it to explicit check if args.generate_diffs: all_versions = {doc_number: reg_tree} for last_notice, old, new_tree, notices in builder.revision_generator( reg_tree): version = last_notice['document_number'] logger.info("Version %s", version) all_versions[version] = new_tree builder.doc_number = version builder.write_regulation(new_tree) layer_cache.invalidate_by_notice(last_notice) builder.gen_and_write_layers(new_tree, act_title_and_section,